1 Summary

This notebook results from an R markdown file that generates scripts to create a .vcf file for 1000+ Pacific herring samples (low coverage whole genomes). The markdown file can be found here

This notebook uses the R package reticulate with python to automate the tedious process of writing bash scripts for each job run on the computing cluster. These scripts were made to run on the Farm cluster which uses SLURM scheduling. Each bash script has a SLURM header.

Written by Joe McGirr, postdoc in Andrew Whitehead’s lab.

Data info: fastq, bam, and .vcf files are currently located on the Farm sever at UC Davis (/home/eoziolor/phpopg/data/align/). Raw g.vcfs were generated from .bams aligned to c. harengus fasta.

2 Load R libraries and set color palette

knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(tidyverse)
library(reticulate)
library(ggpubr)

# color-blind friendly 
# Wong, B. Points of view: Color blindness. Nat Methods (2011).
bla <- "#000000"
blu <- "#0072b2"
grb <- "#56b4e9"
lir <- "#cc79a7"
gre <- "#009e73"
red <- "#d55e00"
org <- "#e69f00"
yel <- "#f0e442"
gry<-  '#BBBBBB'

3 Load python libraries

import io
import pandas as pd
import numpy as np
import math

def sbatch_header(job,mem,tasks,hours):
    #sbatch submission script header
    script = 'script_' + job + '.sh'
    outfile = io.open(script,'w', newline='\n')    
    outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+job+'\n')
    outfile.write('#SBATCH --mem='+mem+'G \n')
    outfile.write('#SBATCH --ntasks='+tasks+' \n')
    outfile.write('#SBATCH -e '+job+'_%A_%a.err \n')
    outfile.write('#SBATCH --time='+hours+':00:00  \n')
    outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
    outfile.write('#SBATCH -p high \n\n')
    outfile.close()
    
def sbatch_header_loop(job,mem,tasks,hours,infile):
    #sbatch submission script header
    script = 'script_' + infile + job + '.sh'
    outfile = io.open(script,'w', newline='\n') 
    jobname= infile + job   
    outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+jobname+'\n')
    outfile.write('#SBATCH --mem='+mem+'G \n')
    outfile.write('#SBATCH --ntasks='+tasks+' \n')
    outfile.write('#SBATCH -e '+jobname+'_%A_%a.err \n')
    outfile.write('#SBATCH --time='+hours+':00:00 \n')
    outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
    outfile.write('#SBATCH -p high \n\n')
    outfile.close()

4 Trim, align, index

Trim .fastq, align with bwa mem, and index .bam with samtools

example script


#!/bin/bash

#SBATCH --job-name=trim_align
#SBATCH --mem=16G 
#SBATCH --ntasks=4 
#SBATCH -e trim_align_%A_%a.err 
#SBATCH --time=24:00:00  
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 


gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq.gz 
gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq.gz 

module load trim_galore 
trim_galore -q 20 --paired --illumina /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq 

module load bwa
bwa mem -aM -p -t 4 -R "@RG\tID:group1\tSM:PH-Sitka-93_S1_L008\tPL:illumina\tLB:lib1" /home/eoziolor/phpopg/data/genome_chr/c.harengus.fa /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001_val_2.fq > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam 
module load samtools
samtools view -Shu /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam
samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam
rm /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam 
samtools sort /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam -o /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam
samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam


#run: sbatch script_trim_align.sh

4.1 Generate trim and align scripts

job_name = 'trim_align_high_cov_ph'

sbatch_header(job_name,'16','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('# cp /home/eoziolor/phgenome/data/raw/PH-Sitka-93_S1_L008_R1_001.fastq.gz /home/jamcgirr/ph/data/hi_cov/ \n')
o.write('# cp /home/eoziolor/phgenome/data/raw/PH-Sitka-93_S1_L008_R2_001.fastq.gz /home/jamcgirr/ph/data/hi_cov/ \n')
o.write('#gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq.gz \n')
o.write('#gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq.gz \n\n')

o.write('# module load trim_galore \n')
o.write('# trim_galore -q 20 --paired --illumina /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq \n\n') 
o.write('# mv PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov \n')
o.write('# mv PH-Sitka-93_S1_L008_R2_001_val_2.fq /home/jamcgirr/ph/data/hi_cov \n')

o.write('# module load bwa\n')
o.write('# bwa mem -aM -p -t 4 -R "@RG\\tID:group1\\tSM:PH-Sitka-93_S1_L008\\tPL:illumina\\tLB:lib1" /home/eoziolor/phpopg/data/genome_chr/c.harengus.fa /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001_val_2.fq > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam \n')
o.write('module load samtools\n')
o.write('# samtools view -Shu /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam\n')
o.write('# samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam\n')
o.write('# rm /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam \n')
o.write('samtools sort /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam -o /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam\n')
o.write('samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam\n') 

o.write('\n\n#run: sbatch '+script)
o.close()

5 Interrogate alignment files

use picard-tools to generate .bam metrics

example script

#!/bin/bash

#SBATCH --job-name=00_CollectWgsMetrics
#SBATCH --mem=8G 
#SBATCH --ntasks=1 
#SBATCH -e 00_CollectWgsMetrics_%A_%a.err 
#SBATCH --time=24:00:00 
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

module load picardtools
for file in /home/eoziolor/phpopg/data/align/00*.bam
do
filename=$(basename $file .bam)
picard-tools CollectWgsMetrics I=$file R=/home/jamcgirr/ph/data/c_harengus/c.harengus.fa O=/home/jamcgirr/ph/familiarize/elias_qc_stats/wgsMetrics/$filename.collect_wgs_metrics.txt
done


#run: sbatch script_00_CollectWgsMetrics.sh

5.1 Generate bam metric scripts

job_name = '_CollectWgsMetrics'

infiles = ["00","01","02","03","04","05","06","07","08","09","1"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','48', infile)
    o = io.open(script,'a+', newline='\n')
    
    o.write('module load picardtools\n') 

    o.write('for file in /home/eoziolor/phpopg/data/align/'+infile+'*.bam\n')
    o.write('do\n')
    
    o.write('filename=$(basename $file .bam)\n')
    o.write('picard-tools CollectWgsMetrics I=$file R=/home/jamcgirr/ph/data/c_harengus/c.harengus.fa O=/home/jamcgirr/ph/familiarize/elias_qc_stats/wgsMetrics/$filename.collect_wgs_metrics.txt\n')
    
    o.write('done\n')

    
    o.write('\n\n#run: sbatch '+script)
    o.close()

5.2 Plot Coverage

pop_info <- read.table("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/EVOS_MasterSheet_JoeMcGirr_April2020.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")

bam_met_files <- list.files("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/multiqc_data/wgsMetrics/")
met_files_in_pop_info <- c()
for(met_file in bam_met_files){
  met_files_in_pop_info <- c(met_files_in_pop_info,strsplit(met_file,"\\.collect_wgs_metrics.txt")[[1]])
}

pop_info <- pop_info[pop_info$Sample %in% met_files_in_pop_info, ]
sample_name <- pop_info$Sample[1]
bam_met <- paste("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/multiqc_data/wgsMetrics/",sample_name, ".collect_wgs_metrics.txt", sep = "")
bam_met <- read.table(bam_met, header = TRUE, stringsAsFactors = FALSE, nrow= 1)
bam_met$Sample <- sample_name
pop_info_bams <- merge(pop_info,bam_met, by = "Sample")
  
for (sample_name in pop_info$Sample[2:(length(pop_info$Sample))])
{
  
  bam_met <- paste("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/multiqc_data/wgsMetrics/",sample_name, ".collect_wgs_metrics.txt", sep = "")
  bam_met <- read.table(bam_met, header = TRUE, stringsAsFactors = FALSE, nrow= 1)
  bam_met$Sample <- sample_name
  pop_info_bams1 <- merge(pop_info,bam_met, by = "Sample")
  pop_info_bams <- rbind(pop_info_bams,pop_info_bams1)
  
}

5.2.1 Compare by year collected

p1 <- ggplot(pop_info_bams,aes(factor(Year.Collected), MEAN_COVERAGE))+
      geom_violin()+ xlab("year collected") + ylab("mean coverage")+
      #geom_boxplot(width=0.1)
      theme_minimal()+
      geom_jitter(height = 0, width = 0.1, alpha = 0.1)+
      stat_compare_means(method = "anova", label.y = max(pop_info_bams$MEAN_COVERAGE)+(max(pop_info_bams$MEAN_COVERAGE)*.05),label.x = 2.25)      # Add global p-value
p1

an <- aov(pop_info_bams$MEAN_COVERAGE~factor(pop_info_bams$Year.Collected))
#summary(an)
TukeyHSD(an)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pop_info_bams$MEAN_COVERAGE ~ factor(pop_info_bams$Year.Collected))
## 
## $`factor(pop_info_bams$Year.Collected)`
##                  diff         lwr         upr     p adj
## 1996-1991 -0.02372375 -0.15281713  0.10536963 0.9871732
## 2006-1991  0.16428980  0.01119635  0.31738325 0.0282996
## 2007-1991  0.23988046  0.05256954  0.42719139 0.0044104
## 2017-1991  0.02771758 -0.08867635  0.14411152 0.9665524
## 2006-1996  0.18801355  0.04976938  0.32625771 0.0019777
## 2007-1996  0.26360421  0.08822112  0.43898730 0.0004127
## 2017-1996  0.05144133 -0.04458675  0.14746942 0.5865319
## 2007-2006  0.07559067 -0.11814042  0.26932175 0.8239828
## 2017-2006 -0.13657221 -0.26303919 -0.01010523 0.0267848
## 2017-2007 -0.21216288 -0.37842074 -0.04590502 0.0046109
p1 <- ggplot(pop_info_bams,aes(factor(Year.Collected), PCT_EXC_TOTAL))+
      geom_violin()+ xlab("\nyear collected") + ylab("% bases excluded due to all filters\n")+
      #geom_boxplot(width=0.1)
      theme_minimal()+
      geom_jitter(height = 0, width = 0.1, alpha = 0.1)+
      stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_EXC_TOTAL)+(max(pop_info_bams$PCT_EXC_TOTAL)*.05),label.x = 2.25)      # Add global p-value
p1

an <- aov(pop_info_bams$PCT_EXC_TOTAL~factor(pop_info_bams$Year.Collected))
#summary(an)
TukeyHSD(an)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pop_info_bams$PCT_EXC_TOTAL ~ factor(pop_info_bams$Year.Collected))
## 
## $`factor(pop_info_bams$Year.Collected)`
##                   diff          lwr           upr     p adj
## 1996-1991  0.004985309 -0.004163969  0.0141345880 0.5701218
## 2006-1991  0.007126707 -0.003723537  0.0179769513 0.3772626
## 2007-1991  0.027552319  0.014276968  0.0408276695 0.0000002
## 2017-1991 -0.001436104 -0.009685331  0.0068131223 0.9895467
## 2006-1996  0.002141398 -0.007656428  0.0119392238 0.9755369
## 2007-1996  0.022567009  0.010137024  0.0349969947 0.0000080
## 2017-1996 -0.006421414 -0.013227245  0.0003844174 0.0751570
## 2007-2006  0.020425611  0.006695243  0.0341559800 0.0004921
## 2017-2006 -0.008562812 -0.017525949  0.0004003254 0.0691750
## 2017-2007 -0.028988423 -0.040771673 -0.0172051733 0.0000000
p1 <- ggplot(pop_info_bams,aes(factor(Year.Collected), PCT_1X))+
      geom_violin()+ xlab("\nyear collected") + ylab("% bases with >= 1X coverage post-filtering\n")+
      #geom_boxplot(width=0.1)
      theme_minimal()+
      geom_jitter(height = 0, width = 0.1, alpha = 0.1)+
      stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_1X)+(max(pop_info_bams$PCT_1X)*.05),label.x = 2.25)      # Add global p-value
p1

an <- aov(pop_info_bams$PCT_1X~factor(pop_info_bams$Year.Collected))
#summary(an)
TukeyHSD(an)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pop_info_bams$PCT_1X ~ factor(pop_info_bams$Year.Collected))
## 
## $`factor(pop_info_bams$Year.Collected)`
##                   diff          lwr          upr     p adj
## 1996-1991 -0.010602557 -0.039550728  0.018345613 0.8551484
## 2006-1991  0.035997720  0.001667723  0.070327716 0.0344357
## 2007-1991  0.057955576  0.015952581  0.099958572 0.0016070
## 2017-1991  0.003695846 -0.022404574  0.029796266 0.9952600
## 2006-1996  0.046600277  0.015600115  0.077600439 0.0004117
## 2007-1996  0.068558134  0.029229861  0.107886406 0.0000212
## 2017-1996  0.014298403 -0.007235136  0.035831942 0.3658409
## 2007-2006  0.021957856 -0.021484809  0.065400521 0.6401328
## 2017-2006 -0.032301874 -0.060661095 -0.003942653 0.0162973
## 2017-2007 -0.054259730 -0.091541742 -0.016977719 0.0007074

5.2.2 Compare by location

p1 <- ggplot(pop_info_bams,aes(factor(Location), MEAN_COVERAGE))+
      geom_violin()+ xlab("population location") + ylab("mean coverage")+
      #geom_boxplot(width=0.1)
      theme_minimal()+
      geom_jitter(height = 0, width = 0.1, alpha = 0.1)#+
      #stat_compare_means(method = "anova", label.y = max(pop_info_bams$MEAN_COVERAGE)+(max(pop_info_bams$MEAN_COVERAGE)*.05),label.x = 2.25)      # Add global p-value
p1

an <- aov(pop_info_bams$MEAN_COVERAGE~factor(pop_info_bams$Location))
#summary(an)
#TukeyHSD(an)
p1 <- ggplot(pop_info_bams,aes(factor(Location), PCT_EXC_TOTAL))+
      geom_violin()+ xlab("\npopulation location") + ylab("% bases excluded due to all filters\n")+
      #geom_boxplot(width=0.1)
      theme_minimal()+
      geom_jitter(height = 0, width = 0.1, alpha = 0.1)#+
      #stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_EXC_TOTAL)+(max(pop_info_bams$PCT_EXC_TOTAL)*.05),label.x = 2.25)      # Add global p-value
p1

an <- aov(pop_info_bams$PCT_EXC_TOTAL~factor(pop_info_bams$Location))
#summary(an)
#TukeyHSD(an)
p1 <- ggplot(pop_info_bams,aes(factor(Location), PCT_1X))+
      geom_violin()+ xlab("\npopulation location") + ylab("% bases with >= 1X coverage post-filtering\n")+
      #geom_boxplot(width=0.1)
      theme_minimal()+
      geom_jitter(height = 0, width = 0.1, alpha = 0.1)#+
      #stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_1X)+(max(pop_info_bams$PCT_1X)*.05),label.x = 2.25)      # Add global p-value
p1

an <- aov(pop_info_bams$PCT_1X~factor(pop_info_bams$Location))
#summary(an)
#TukeyHSD(an)

6 Combine g.vcfs to create SNP vcf

6.1 Make a database for each of the 26 chromosomes

use gatk GenomicsDBImport

example script

#!/bin/bash

#SBATCH --job-name=chr2_make_db
#SBATCH --mem=16G 
#SBATCH --ntasks=4 
#SBATCH -e chr2_make_db_%A_%a.err 
#SBATCH --time=72:00:00 
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)

scratch_tmp=/scratch/jamcgirr/combine_gvcfs_chr2_tmp 
scratch_cp_to_home=/scratch/jamcgirr/combine_gvcfs_chr2 
home_cp=/home/jamcgirr/ph/data/combine_gvcfs/chr2 

module load R 
module load maven 
module load java 
module load GATK/4.1.4.1 

mkdir -vp $scratch_tmp 

gatk --java-options "-Xmx12g -Xms8g" GenomicsDBImport --genomicsdb-workspace-path $scratch_cp_to_home --batch-size 100 -L chr2_interval.list --sample-name-map /home/jamcgirr/ph/scripts/fastq_to_vcf/combine_gvcfs/ph.gvcfs_map --tmp-dir=$scratch_tmp --reader-threads 4 

cp -r $scratch_cp_to_home/* $home_cp 
##rm SCRATCH ONLY!! 
rm -r $scratch_tmp 
rm -r $scratch_cp_to_home 

# run next chr
sbatch script_chr2_1_genotypegvcf.sh 
sbatch script_chr2_2_genotypegvcf.sh 


#run: sbatch script_chr2_make_db.sh

6.1.1 Generate database scripts

job_name = '_make_db'

infiles =["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26"]
#infiles =["chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'16','4','72', infile)
    o = io.open(script,'a+', newline='\n')
    
    o.write('# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n')
    
    o.write('scratch_tmp=/scratch/jamcgirr/combine_gvcfs_'+infile+'_tmp \n')
    o.write('scratch_cp_to_home=/scratch/jamcgirr/combine_gvcfs_'+infile+' \n')
    o.write('home_cp=/home/jamcgirr/ph/data/combine_gvcfs/'+infile+' \n\n')
    
    o.write('module load R \n')
    o.write('module load maven \n')
    o.write('module load java \n')
    o.write('module load GATK/4.1.4.1 \n\n')

    o.write('mkdir -vp $scratch_tmp \n\n')

    o.write('gatk --java-options "-Xmx12g -Xms8g" GenomicsDBImport --genomicsdb-workspace-path $scratch_cp_to_home --batch-size 100 -L '+infile+'_interval.list --sample-name-map /home/jamcgirr/ph/scripts/fastq_to_vcf/combine_gvcfs/ph.gvcfs_map --tmp-dir=$scratch_tmp --reader-threads 4 \n\n')
    
    o.write('cp -r $scratch_cp_to_home/* $home_cp \n')

    o.write('##rm SCRATCH ONLY!! \n')
    o.write('rm -r $scratch_tmp \n')
    o.write('rm -r $scratch_cp_to_home \n\n')

    o.write('sbatch script_'+infile+'_1_genotypegvcf.sh \n')
    o.write('sbatch script_'+infile+'_2_genotypegvcf.sh \n')

    
    o.write('\n\n#run: sbatch '+script)
    o.close()

6.2 Create .vcf for each chromosome split into two intervals

use gatk GenotypeGVCFs

example script

#!/bin/bash

#SBATCH --job-name=chr1_1_genotypegvcf
#SBATCH --mem=16G 
#SBATCH --ntasks=1 
#SBATCH -e chr1_1_genotypegvcf_%A_%a.err 
#SBATCH --time=06-00:00 
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)

module load R 
module load maven 
module load java 
module load GATK/4.1.4.1 

gatk GenotypeGVCFs -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V gendb://chr1 -L chr1:1-16542129 -O raw_variants_chr1_1.vcf 


#run: sbatch script_chr1_1_genotypegvcf.sh

6.2.1 Generate database scripts

job_name = '_genotypegvcf'
infiles = ["chr1_1","chr2_1","chr3_1","chr4_1","chr5_1","chr6_1","chr7_1","chr8_1","chr9_1","chr10_1","chr11_1","chr12_1","chr13_1","chr14_1","chr15_1","chr16_1","chr17_1","chr18_1","chr19_1","chr20_1","chr21_1","chr22_1","chr23_1","chr24_1","chr25_1","chr26_1","chr1_2","chr2_2","chr3_2","chr4_2","chr5_2","chr6_2","chr7_2","chr8_2","chr9_2","chr10_2","chr11_2","chr12_2","chr13_2","chr14_2","chr15_2","chr16_2","chr17_2","chr18_2","chr19_2","chr20_2","chr21_2","chr22_2","chr23_2","chr24_2","chr25_2","chr26_2"]
chr_dict = {"chr1_1":"1-16542129","chr2_1":"1-16505160","chr3_1":"1-16263781","chr4_1":"1-16133824","chr5_1":"1-15793431","chr6_1":"1-15730777","chr7_1":"1-15495311","chr8_1":"1-15364778","chr9_1":"1-15238691","chr10_1":"1-15113866","chr11_1":"1-15048164","chr12_1":"1-15011240","chr13_1":"1-14922870","chr14_1":"1-14666386","chr15_1":"1-14356761","chr16_1":"1-13886911","chr17_1":"1-13784255","chr18_1":"1-13623647","chr19_1":"1-13565322","chr20_1":"1-13347081","chr21_1":"1-13232991","chr22_1":"1-12832026","chr23_1":"1-12646449","chr24_1":"1-10045549","chr25_1":"1-7462096","chr26_1":"1-6221605","chr1_2":"16542129-33084258","chr2_2":"16505160-33010319","chr3_2":"16263781-32527562","chr4_2":"16133824-32267647","chr5_2":"15793431-31586861","chr6_2":"15730777-31461554","chr7_2":"15495311-30990621","chr8_2":"15364778-30729556","chr9_2":"15238691-30477381","chr10_2":"15113866-30227731","chr11_2":"15048164-30096327","chr12_2":"15011240-30022480","chr13_2":"14922870-29845739","chr14_2":"14666386-29332771","chr15_2":"14356761-28713521","chr16_2":"13886911-27773822","chr17_2":"13784255-27568510","chr18_2":"13623647-27247294","chr19_2":"13565322-27130643","chr20_2":"13347081-26694162","chr21_2":"13232991-26465981","chr22_2":"12832026-25664052","chr23_2":"12646449-25292897","chr24_2":"10045549-20091098","chr25_2":"7462096-14924191","chr26_2":"6221605-12443209"}

for infile in infiles:
    chr_name = infile.split("_")[0]
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'16','4','144', infile)
    o = io.open(script,'a+', newline='\n')
    
    
    o.write('# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n')
    
    o.write('module load R \n')
    o.write('module load maven \n')
    o.write('module load java \n')
    o.write('module load GATK/4.1.4.1 \n\n')

    o.write('gatk GenotypeGVCFs -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V gendb://'+chr_name+' -L '+chr_name+':'+chr_dict[infile]+' -O raw_variants_'+infile+'.vcf \n')

    
    o.write('\n\n#run: sbatch '+script)
    o.close()

job_name = '_genotypegvcf_allsites'
infiles = ["chr1_1","chr2_1","chr3_1","chr4_1","chr5_1","chr6_1","chr7_1","chr8_1","chr9_1","chr10_1","chr11_1","chr12_1","chr13_1","chr14_1","chr15_1","chr16_1","chr17_1","chr18_1","chr19_1","chr20_1","chr21_1","chr22_1","chr23_1","chr24_1","chr25_1","chr26_1","chr1_2","chr2_2","chr3_2","chr4_2","chr5_2","chr6_2","chr7_2","chr8_2","chr9_2","chr10_2","chr11_2","chr12_2","chr13_2","chr14_2","chr15_2","chr16_2","chr17_2","chr18_2","chr19_2","chr20_2","chr21_2","chr22_2","chr23_2","chr24_2","chr25_2","chr26_2"]
chr_dict = {"chr1_1":"1-16542129","chr2_1":"1-16505160","chr3_1":"1-16263781","chr4_1":"1-16133824","chr5_1":"1-15793431","chr6_1":"1-15730777","chr7_1":"1-15495311","chr8_1":"1-15364778","chr9_1":"1-15238691","chr10_1":"1-15113866","chr11_1":"1-15048164","chr12_1":"1-15011240","chr13_1":"1-14922870","chr14_1":"1-14666386","chr15_1":"1-14356761","chr16_1":"1-13886911","chr17_1":"1-13784255","chr18_1":"1-13623647","chr19_1":"1-13565322","chr20_1":"1-13347081","chr21_1":"1-13232991","chr22_1":"1-12832026","chr23_1":"1-12646449","chr24_1":"1-10045549","chr25_1":"1-7462096","chr26_1":"1-6221605","chr1_2":"16542129-33084258","chr2_2":"16505160-33010319","chr3_2":"16263781-32527562","chr4_2":"16133824-32267647","chr5_2":"15793431-31586861","chr6_2":"15730777-31461554","chr7_2":"15495311-30990621","chr8_2":"15364778-30729556","chr9_2":"15238691-30477381","chr10_2":"15113866-30227731","chr11_2":"15048164-30096327","chr12_2":"15011240-30022480","chr13_2":"14922870-29845739","chr14_2":"14666386-29332771","chr15_2":"14356761-28713521","chr16_2":"13886911-27773822","chr17_2":"13784255-27568510","chr18_2":"13623647-27247294","chr19_2":"13565322-27130643","chr20_2":"13347081-26694162","chr21_2":"13232991-26465981","chr22_2":"12832026-25664052","chr23_2":"12646449-25292897","chr24_2":"10045549-20091098","chr25_2":"7462096-14924191","chr26_2":"6221605-12443209"}

for infile in infiles:
    chr_name = infile.split("_")[0]
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'60','4','144', infile)
    o = io.open(script,'a+', newline='\n')
    
    
    o.write('# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n')
    
    o.write('module load R \n')
    o.write('module load maven \n')
    o.write('module load java \n')
    o.write('module load GATK/4.1.4.1 \n\n')

    o.write('gatk GenotypeGVCFs -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V gendb://'+chr_name+' -L '+chr_name+':'+chr_dict[infile]+' -all-sites -O raw_variants_allsites_'+infile+'.vcf \n')

    
    o.write('\n\n#run: sbatch '+script)
    o.close()

6.3 Create .vcf with biallelic SNPs and filter by depth and quality

use gatk SelectVariants and bcftools filter

minimum DP = 600 maximum DP = 2000 minimum quality score = 20 minimum mapping quality = 30

example script

#!/bin/bash

#SBATCH --job-name=chr1_filter_gvcf
#SBATCH --mem=8G 
#SBATCH --ntasks=4 
#SBATCH -e chr1_filter_gvcf_%A_%a.err 
#SBATCH --time=24:00:00 
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

module load samtools 
module load bcftools 

bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_chr1_1.vcf | bcftools +fill-tags -- -t all,'DP=sum(DP)' | bcftools filter -Oz -i 'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz 
bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_chr1_2.vcf | bcftools +fill-tags -- -t all,'DP=sum(DP)' | bcftools filter -Oz -i 'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz 

bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz 
bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz 



#run: sbatch script_chr1_filter_gvcf.sh

6.3.1 Generate bcftools filter scripts

job_name = '_filter_gvcf'
infiles = ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','24', infile)
    o = io.open(script,'a+', newline='\n')
           
    o.write('module load samtools \n')
    o.write('module load bcftools \n\n')
    
    o.write('bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_'+infile+'_1.vcf | bcftools +fill-tags -- -t all,\'DP=sum(DP)\' | bcftools filter -Oz -i \'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000\' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_1.recode.vcf.gz \n')
    o.write('bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_'+infile+'_2.vcf | bcftools +fill-tags -- -t all,\'DP=sum(DP)\' | bcftools filter -Oz -i \'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000\' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_2.recode.vcf.gz \n\n')

    o.write('bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_1.recode.vcf.gz \n')
    o.write('bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_2.recode.vcf.gz \n\n')
   
    
    o.write('\n\n#run: sbatch '+script)
    o.close()

    #EST time ~ 1.5hr

6.4 Plot SNP stats for first half of Chr 1

1,073,808 SNPs across 16,542,120 bp

snps <- read.table("C:/Users/jmcgirr/Documents/GitHub/pac_herring/slurm_scripts/fastq_to_vcf/combine_gvcfs/chr1_1_out.INFO", header = TRUE, stringsAsFactors = FALSE, sep = "\t")

# GATK Hard filters

#  qd_x <-  "Variant Confidence/Quality by Depth"
#  qd <- 2
#  sor_x <-  "Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"
#  sor <- 3
#  fs_x <- "Phred-scaled p-value using Fisher's exact test to detect strand bias"
#  fs <- 60
#  mq_x <-  "RMS Mapping Quality"
#  mq <- 40
#  mqrs_x <-  "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"
#  mqrs <- -12.5
#  rprs_x <-  "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"
#  rprs <- -8

#filtered <- snps %>% filter(QD < qd) %>%
#                     filter(SOR > sor) %>%
#                     filter(FS > fs) %>%
#                     filter(MQ < qd) 
#
#nrow(snps)
#nrow(filtered)
#head(snps)

#  {hist(as.numeric(snps$QD), xlab = qd_x, main = paste("< ",qd, sep = ""))
#  abline(v = qd, col = red, lty = 2)}
#  
#  {hist(as.numeric(snps$SOR), xlab = sor_x, main = paste("> ",sor, sep = ""))
#  abline(v = sor, col = red, lty = 2)}
#  
#  {hist(as.numeric(snps$FS), xlab = fs_x, main = paste("> ",fs, sep = ""))
#  abline(v = fs, col = red, lty = 2)}
#  
#  {hist(as.numeric(snps$MQ), xlab = mq_x, main = paste("< ",mq, sep = ""))
#  abline(v = mq, col = red, lty = 2)}
#  
#  
#  # filter high coverage regions
#  # doi.org/10.1093/bioinformatics/btu356
#  # max depth between between d+3sqrt(d) and d+4sqrt(d) where d is mean read depth
#  #"These false positives are mostly caused by CNVs 
#  # or paralogous sequences not present in the reference genome"
#  
  {hist(as.numeric(snps$DP), xlab = "Depth",breaks = 1000,xlim = c(0,20000), main = paste("600 > Depth ","< ",round((mean(snps$DP)+(4*sqrt(mean(snps$DP)))),0), sep = ""))
  abline(v = 600, col = red, lty = 2)
  abline(v = (mean(snps$DP)+(4*sqrt(mean(snps$DP)))), col = red, lty = 2)}

  filter_DP <- snps %>% filter(DP < (mean(snps$DP)+(4*sqrt(mean(snps$DP))))) %>% filter(DP >600)
  # % filtered
  #((nrow(snps) - nrow(filter_DP))/ nrow(snps))*100

  
# define populations for SNP filtering
# put these txt files in directory for step 4

pop_info <- read.table("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/EVOS_MasterSheet_JoeMcGirr_April2020.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
aligned <- read.table("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/aligned_samples.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
pop_info <- pop_info[pop_info$Sample %in% aligned$sample,]

for (pop in unique(pop_info$Population.Year)){
pop1 <- pop_info %>% filter(Population.Year == pop) 
#(unique(pop1$Location))
#print(unique(pop1$Year.Collected))
#print(nrow(pop1))
outfile_name <- paste("population_",pop1$Population.Year[1],".txt", sep = "")
#write.table(pop1$Sample, outfile_name, col.names = FALSE, row.names = FALSE, quote = FALSE)
}

7 Concatenate chromosomes into one .vcf

use bcftools concat

example script

#!/bin/bash

#SBATCH --job-name=merge_snp_vcfs
#SBATCH --mem=16G 
#SBATCH --ntasks=4 
#SBATCH -e merge_snp_vcfs_%A_%a.err 
#SBATCH --time=06-00:00  
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

module load samtools 
module load bcftools 
bcftools concat -o /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf -O b --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_2.recode.vcf.gz 


#run: sbatch script_merge_snp_vcfs.sh

7.0.1 Generate bcftools concat script

job = 'merge_snp_vcfs'

#sbatch submission script header
script = 'script_' + job + '.sh'
outfile = io.open(script,'w', newline='\n')    
outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+job+'\n')
outfile.write('#SBATCH --mem=16G \n')
outfile.write('#SBATCH --ntasks=4 \n')
outfile.write('#SBATCH -e '+job+'_%A_%a.err \n')
outfile.write('#SBATCH --time=06-00:00  \n')
outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
outfile.write('#SBATCH -p high \n\n')


outfile.write('module load samtools \n')
outfile.write('module load bcftools \n')
outfile.write('bcftools concat -o /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf -O b --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_2.recode.vcf.gz \n') 
              
#run sbatch submission 
outfile.write('\n\n#run: sbatch '+script)
outfile.close()

8 Subset .vcf into 14 populations (grouped by unique(location:year)) and filter for 50% genotyping rate

use bcftools

example script

#!/bin/bash

#SBATCH --job-name=BC17_subset_snps_by_pop
#SBATCH --mem=8G 
#SBATCH --ntasks=4 
#SBATCH -e BC17_subset_snps_by_pop_%A_%a.err 
#SBATCH --time=24:00:00 
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

module load bcftools 
bcftools view -S /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/BC17_plates_1_through_5_rm.txt /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf | bcftools +fill-tags -- -t all,'DP=sum(DP)' | bcftools filter -Oz -i 'NS>32 ' > /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz 
bcftools index /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz 



#command to run: sbatch script_BC17_subset_snps_by_pop.sh

8.0.1 Generate subset/filter scripts

job_name = '_subset_snps_by_pop'

pop_n = {"BC17":"64","CA17":"70","PWS07":"46","PWS17":"56","PWS91":"58","PWS96":"72","SS06":"41","SS17":"64","SS96":"78","TB06":"52","TB17":"72","TB91":"74","TB96":"73","WA17":"72"}
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','24', infile)
    o = io.open(script,'a+', newline='\n')
        
    inds_with_data = math.ceil(int(pop_n[infile]) * 0.5)   
    o.write('module load bcftools \n')

    o.write('bcftools view -S /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'+infile+'_plates_1_through_5_rm.txt /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf | bcftools +fill-tags -- -t all,\'DP=sum(DP)\' | bcftools filter -Oz -i \'NS>'+str(inds_with_data)+' \' > /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_filtered_snps.vcf.gz \n')
    o.write('bcftools index /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_filtered_snps.vcf.gz \n\n')

    o.write('\n\n#command to run: sbatch '+script)
    o.close()

# EST time ~1hr

9 Intersect filtered vcfs to find common SNPs

example script

#!/bin/bash

#SBATCH --job-name=isec
#SBATCH --mem=16G 
#SBATCH --ntasks=4 
#SBATCH -e isec_%A_%a.err 
#SBATCH --time=144:00:00  
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

module load bcftools 
bcftools isec -n=14 -p isec_test --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_CA17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS07_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_WA17_filtered_snps.vcf.gz 



#run: sbatch script_isec.sh

9.0.1 Generate bcftools isec scripts

job_name = 'isec'

sbatch_header(job_name,'16','16','144')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('module load bcftools \n')
o.write('bcftools isec -n=14 -p isec_test --threads 16 /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_CA17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS07_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_WA17_filtered_snps.vcf.gz \n\n')


o.write('\n\n#run: sbatch '+script)
o.close()

10 Filter final vcf to include SNPs with minor allele frequency > 0.05

Use sites.txt generated by bcftools isec in previous step.

Final vcf named ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05.vcf.gz

example script

#!/bin/bash

#SBATCH --job-name=isec
#SBATCH --mem=16G 
#SBATCH --ntasks=4 
#SBATCH -e isec_%A_%a.err 
#SBATCH --time=144:00:00  
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high 

module load bcftools 
bcftools isec -n=14 -p isec_test --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_CA17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS07_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_WA17_filtered_snps.vcf.gz 



#run: sbatch script_isec.sh

10.0.1 Generate final maf scripts

job_name = 'final_subset_maf_0.05'

sbatch_header(job_name,'16','16','144')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('#module load bcftools \n')
o.write('##bcftools index /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf \n') 
o.write('#bcftools view -R /home/jamcgirr/ph/data/combine_gvcfs/isec_test/sites.txt /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf -Oz > /home/jamcgirr/ph/data/combine_gvcfs/ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5.vcf.gz \n')
o.write('#bcftools index /home/jamcgirr/ph/data/combine_gvcfs/ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5.vcf.gz  \n')
o.write('bcftools filter -Oz -i \'INFO/AF>0.05\' -o /home/jamcgirr/ph/data/combine_gvcfs/ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05.vcf.gz \n')

o.write('\n\n#run: sbatch '+script)
o.close()
# 5 hrs (for view)