This notebook results from an R markdown file that generates scripts to create a .vcf
file for 1000+ Pacific herring samples (low coverage whole genomes). The markdown file can be found here
This notebook uses the R package reticulate with python to automate the tedious process of writing bash scripts for each job run on the computing cluster. These scripts were made to run on the Farm cluster which uses SLURM scheduling. Each bash script has a SLURM header.
Written by Joe McGirr, postdoc in Andrew Whitehead’s lab.
Data info: fastq
, bam
, and .vcf
files are currently located on the Farm sever at UC Davis (/home/eoziolor/phpopg/data/align/). Raw g.vcfs were generated from .bam
s aligned to c. harengus fasta.
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(tidyverse)
library(reticulate)
library(ggpubr)
# color-blind friendly
# Wong, B. Points of view: Color blindness. Nat Methods (2011).
bla <- "#000000"
blu <- "#0072b2"
grb <- "#56b4e9"
lir <- "#cc79a7"
gre <- "#009e73"
red <- "#d55e00"
org <- "#e69f00"
yel <- "#f0e442"
gry<- '#BBBBBB'
import io
import pandas as pd
import numpy as np
import math
def sbatch_header(job,mem,tasks,hours):
#sbatch submission script header
script = 'script_' + job + '.sh'
outfile = io.open(script,'w', newline='\n')
outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+job+'\n')
outfile.write('#SBATCH --mem='+mem+'G \n')
outfile.write('#SBATCH --ntasks='+tasks+' \n')
outfile.write('#SBATCH -e '+job+'_%A_%a.err \n')
outfile.write('#SBATCH --time='+hours+':00:00 \n')
outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
outfile.write('#SBATCH -p high \n\n')
outfile.close()
def sbatch_header_loop(job,mem,tasks,hours,infile):
#sbatch submission script header
script = 'script_' + infile + job + '.sh'
outfile = io.open(script,'w', newline='\n')
jobname= infile + job
outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+jobname+'\n')
outfile.write('#SBATCH --mem='+mem+'G \n')
outfile.write('#SBATCH --ntasks='+tasks+' \n')
outfile.write('#SBATCH -e '+jobname+'_%A_%a.err \n')
outfile.write('#SBATCH --time='+hours+':00:00 \n')
outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
outfile.write('#SBATCH -p high \n\n')
outfile.close()
Trim .fastq
, align with bwa mem, and index .bam
with samtools
example script
#!/bin/bash
#SBATCH --job-name=trim_align
#SBATCH --mem=16G
#SBATCH --ntasks=4
#SBATCH -e trim_align_%A_%a.err
#SBATCH --time=24:00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq.gz
gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq.gz
module load trim_galore
trim_galore -q 20 --paired --illumina /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq
module load bwa
bwa mem -aM -p -t 4 -R "@RG\tID:group1\tSM:PH-Sitka-93_S1_L008\tPL:illumina\tLB:lib1" /home/eoziolor/phpopg/data/genome_chr/c.harengus.fa /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001_val_2.fq > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam
module load samtools
samtools view -Shu /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam
samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam
rm /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam
samtools sort /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam -o /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam
samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam
#run: sbatch script_trim_align.sh
job_name = 'trim_align_high_cov_ph'
sbatch_header(job_name,'16','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')
o.write('# cp /home/eoziolor/phgenome/data/raw/PH-Sitka-93_S1_L008_R1_001.fastq.gz /home/jamcgirr/ph/data/hi_cov/ \n')
o.write('# cp /home/eoziolor/phgenome/data/raw/PH-Sitka-93_S1_L008_R2_001.fastq.gz /home/jamcgirr/ph/data/hi_cov/ \n')
o.write('#gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq.gz \n')
o.write('#gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq.gz \n\n')
o.write('# module load trim_galore \n')
o.write('# trim_galore -q 20 --paired --illumina /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq \n\n')
o.write('# mv PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov \n')
o.write('# mv PH-Sitka-93_S1_L008_R2_001_val_2.fq /home/jamcgirr/ph/data/hi_cov \n')
o.write('# module load bwa\n')
o.write('# bwa mem -aM -p -t 4 -R "@RG\\tID:group1\\tSM:PH-Sitka-93_S1_L008\\tPL:illumina\\tLB:lib1" /home/eoziolor/phpopg/data/genome_chr/c.harengus.fa /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001_val_2.fq > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam \n')
o.write('module load samtools\n')
o.write('# samtools view -Shu /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam\n')
o.write('# samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam\n')
o.write('# rm /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam \n')
o.write('samtools sort /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam -o /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam\n')
o.write('samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam\n')
o.write('\n\n#run: sbatch '+script)
o.close()
use picard-tools to generate .bam
metrics
example script
#!/bin/bash
#SBATCH --job-name=00_CollectWgsMetrics
#SBATCH --mem=8G
#SBATCH --ntasks=1
#SBATCH -e 00_CollectWgsMetrics_%A_%a.err
#SBATCH --time=24:00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
module load picardtools
for file in /home/eoziolor/phpopg/data/align/00*.bam
do
filename=$(basename $file .bam)
picard-tools CollectWgsMetrics I=$file R=/home/jamcgirr/ph/data/c_harengus/c.harengus.fa O=/home/jamcgirr/ph/familiarize/elias_qc_stats/wgsMetrics/$filename.collect_wgs_metrics.txt
done
#run: sbatch script_00_CollectWgsMetrics.sh
job_name = '_CollectWgsMetrics'
infiles = ["00","01","02","03","04","05","06","07","08","09","1"]
for infile in infiles:
script = 'script_' + infile + job_name + '.sh'
sbatch_header_loop(job_name,'8','4','48', infile)
o = io.open(script,'a+', newline='\n')
o.write('module load picardtools\n')
o.write('for file in /home/eoziolor/phpopg/data/align/'+infile+'*.bam\n')
o.write('do\n')
o.write('filename=$(basename $file .bam)\n')
o.write('picard-tools CollectWgsMetrics I=$file R=/home/jamcgirr/ph/data/c_harengus/c.harengus.fa O=/home/jamcgirr/ph/familiarize/elias_qc_stats/wgsMetrics/$filename.collect_wgs_metrics.txt\n')
o.write('done\n')
o.write('\n\n#run: sbatch '+script)
o.close()
pop_info <- read.table("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/EVOS_MasterSheet_JoeMcGirr_April2020.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
bam_met_files <- list.files("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/multiqc_data/wgsMetrics/")
met_files_in_pop_info <- c()
for(met_file in bam_met_files){
met_files_in_pop_info <- c(met_files_in_pop_info,strsplit(met_file,"\\.collect_wgs_metrics.txt")[[1]])
}
pop_info <- pop_info[pop_info$Sample %in% met_files_in_pop_info, ]
sample_name <- pop_info$Sample[1]
bam_met <- paste("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/multiqc_data/wgsMetrics/",sample_name, ".collect_wgs_metrics.txt", sep = "")
bam_met <- read.table(bam_met, header = TRUE, stringsAsFactors = FALSE, nrow= 1)
bam_met$Sample <- sample_name
pop_info_bams <- merge(pop_info,bam_met, by = "Sample")
for (sample_name in pop_info$Sample[2:(length(pop_info$Sample))])
{
bam_met <- paste("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/multiqc_data/wgsMetrics/",sample_name, ".collect_wgs_metrics.txt", sep = "")
bam_met <- read.table(bam_met, header = TRUE, stringsAsFactors = FALSE, nrow= 1)
bam_met$Sample <- sample_name
pop_info_bams1 <- merge(pop_info,bam_met, by = "Sample")
pop_info_bams <- rbind(pop_info_bams,pop_info_bams1)
}
p1 <- ggplot(pop_info_bams,aes(factor(Year.Collected), MEAN_COVERAGE))+
geom_violin()+ xlab("year collected") + ylab("mean coverage")+
#geom_boxplot(width=0.1)
theme_minimal()+
geom_jitter(height = 0, width = 0.1, alpha = 0.1)+
stat_compare_means(method = "anova", label.y = max(pop_info_bams$MEAN_COVERAGE)+(max(pop_info_bams$MEAN_COVERAGE)*.05),label.x = 2.25) # Add global p-value
p1
an <- aov(pop_info_bams$MEAN_COVERAGE~factor(pop_info_bams$Year.Collected))
#summary(an)
TukeyHSD(an)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = pop_info_bams$MEAN_COVERAGE ~ factor(pop_info_bams$Year.Collected))
##
## $`factor(pop_info_bams$Year.Collected)`
## diff lwr upr p adj
## 1996-1991 -0.02372375 -0.15281713 0.10536963 0.9871732
## 2006-1991 0.16428980 0.01119635 0.31738325 0.0282996
## 2007-1991 0.23988046 0.05256954 0.42719139 0.0044104
## 2017-1991 0.02771758 -0.08867635 0.14411152 0.9665524
## 2006-1996 0.18801355 0.04976938 0.32625771 0.0019777
## 2007-1996 0.26360421 0.08822112 0.43898730 0.0004127
## 2017-1996 0.05144133 -0.04458675 0.14746942 0.5865319
## 2007-2006 0.07559067 -0.11814042 0.26932175 0.8239828
## 2017-2006 -0.13657221 -0.26303919 -0.01010523 0.0267848
## 2017-2007 -0.21216288 -0.37842074 -0.04590502 0.0046109
p1 <- ggplot(pop_info_bams,aes(factor(Year.Collected), PCT_EXC_TOTAL))+
geom_violin()+ xlab("\nyear collected") + ylab("% bases excluded due to all filters\n")+
#geom_boxplot(width=0.1)
theme_minimal()+
geom_jitter(height = 0, width = 0.1, alpha = 0.1)+
stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_EXC_TOTAL)+(max(pop_info_bams$PCT_EXC_TOTAL)*.05),label.x = 2.25) # Add global p-value
p1
an <- aov(pop_info_bams$PCT_EXC_TOTAL~factor(pop_info_bams$Year.Collected))
#summary(an)
TukeyHSD(an)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = pop_info_bams$PCT_EXC_TOTAL ~ factor(pop_info_bams$Year.Collected))
##
## $`factor(pop_info_bams$Year.Collected)`
## diff lwr upr p adj
## 1996-1991 0.004985309 -0.004163969 0.0141345880 0.5701218
## 2006-1991 0.007126707 -0.003723537 0.0179769513 0.3772626
## 2007-1991 0.027552319 0.014276968 0.0408276695 0.0000002
## 2017-1991 -0.001436104 -0.009685331 0.0068131223 0.9895467
## 2006-1996 0.002141398 -0.007656428 0.0119392238 0.9755369
## 2007-1996 0.022567009 0.010137024 0.0349969947 0.0000080
## 2017-1996 -0.006421414 -0.013227245 0.0003844174 0.0751570
## 2007-2006 0.020425611 0.006695243 0.0341559800 0.0004921
## 2017-2006 -0.008562812 -0.017525949 0.0004003254 0.0691750
## 2017-2007 -0.028988423 -0.040771673 -0.0172051733 0.0000000
p1 <- ggplot(pop_info_bams,aes(factor(Year.Collected), PCT_1X))+
geom_violin()+ xlab("\nyear collected") + ylab("% bases with >= 1X coverage post-filtering\n")+
#geom_boxplot(width=0.1)
theme_minimal()+
geom_jitter(height = 0, width = 0.1, alpha = 0.1)+
stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_1X)+(max(pop_info_bams$PCT_1X)*.05),label.x = 2.25) # Add global p-value
p1
an <- aov(pop_info_bams$PCT_1X~factor(pop_info_bams$Year.Collected))
#summary(an)
TukeyHSD(an)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = pop_info_bams$PCT_1X ~ factor(pop_info_bams$Year.Collected))
##
## $`factor(pop_info_bams$Year.Collected)`
## diff lwr upr p adj
## 1996-1991 -0.010602557 -0.039550728 0.018345613 0.8551484
## 2006-1991 0.035997720 0.001667723 0.070327716 0.0344357
## 2007-1991 0.057955576 0.015952581 0.099958572 0.0016070
## 2017-1991 0.003695846 -0.022404574 0.029796266 0.9952600
## 2006-1996 0.046600277 0.015600115 0.077600439 0.0004117
## 2007-1996 0.068558134 0.029229861 0.107886406 0.0000212
## 2017-1996 0.014298403 -0.007235136 0.035831942 0.3658409
## 2007-2006 0.021957856 -0.021484809 0.065400521 0.6401328
## 2017-2006 -0.032301874 -0.060661095 -0.003942653 0.0162973
## 2017-2007 -0.054259730 -0.091541742 -0.016977719 0.0007074
p1 <- ggplot(pop_info_bams,aes(factor(Location), MEAN_COVERAGE))+
geom_violin()+ xlab("population location") + ylab("mean coverage")+
#geom_boxplot(width=0.1)
theme_minimal()+
geom_jitter(height = 0, width = 0.1, alpha = 0.1)#+
#stat_compare_means(method = "anova", label.y = max(pop_info_bams$MEAN_COVERAGE)+(max(pop_info_bams$MEAN_COVERAGE)*.05),label.x = 2.25) # Add global p-value
p1
an <- aov(pop_info_bams$MEAN_COVERAGE~factor(pop_info_bams$Location))
#summary(an)
#TukeyHSD(an)
p1 <- ggplot(pop_info_bams,aes(factor(Location), PCT_EXC_TOTAL))+
geom_violin()+ xlab("\npopulation location") + ylab("% bases excluded due to all filters\n")+
#geom_boxplot(width=0.1)
theme_minimal()+
geom_jitter(height = 0, width = 0.1, alpha = 0.1)#+
#stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_EXC_TOTAL)+(max(pop_info_bams$PCT_EXC_TOTAL)*.05),label.x = 2.25) # Add global p-value
p1
an <- aov(pop_info_bams$PCT_EXC_TOTAL~factor(pop_info_bams$Location))
#summary(an)
#TukeyHSD(an)
p1 <- ggplot(pop_info_bams,aes(factor(Location), PCT_1X))+
geom_violin()+ xlab("\npopulation location") + ylab("% bases with >= 1X coverage post-filtering\n")+
#geom_boxplot(width=0.1)
theme_minimal()+
geom_jitter(height = 0, width = 0.1, alpha = 0.1)#+
#stat_compare_means(method = "anova", label.y = max(pop_info_bams$PCT_1X)+(max(pop_info_bams$PCT_1X)*.05),label.x = 2.25) # Add global p-value
p1
an <- aov(pop_info_bams$PCT_1X~factor(pop_info_bams$Location))
#summary(an)
#TukeyHSD(an)
g.vcfs
to create SNP vcfuse gatk GenomicsDBImport
example script
#!/bin/bash
#SBATCH --job-name=chr2_make_db
#SBATCH --mem=16G
#SBATCH --ntasks=4
#SBATCH -e chr2_make_db_%A_%a.err
#SBATCH --time=72:00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)
scratch_tmp=/scratch/jamcgirr/combine_gvcfs_chr2_tmp
scratch_cp_to_home=/scratch/jamcgirr/combine_gvcfs_chr2
home_cp=/home/jamcgirr/ph/data/combine_gvcfs/chr2
module load R
module load maven
module load java
module load GATK/4.1.4.1
mkdir -vp $scratch_tmp
gatk --java-options "-Xmx12g -Xms8g" GenomicsDBImport --genomicsdb-workspace-path $scratch_cp_to_home --batch-size 100 -L chr2_interval.list --sample-name-map /home/jamcgirr/ph/scripts/fastq_to_vcf/combine_gvcfs/ph.gvcfs_map --tmp-dir=$scratch_tmp --reader-threads 4
cp -r $scratch_cp_to_home/* $home_cp
##rm SCRATCH ONLY!!
rm -r $scratch_tmp
rm -r $scratch_cp_to_home
# run next chr
sbatch script_chr2_1_genotypegvcf.sh
sbatch script_chr2_2_genotypegvcf.sh
#run: sbatch script_chr2_make_db.sh
job_name = '_make_db'
infiles =["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26"]
#infiles =["chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26"]
for infile in infiles:
script = 'script_' + infile + job_name + '.sh'
sbatch_header_loop(job_name,'16','4','72', infile)
o = io.open(script,'a+', newline='\n')
o.write('# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n')
o.write('scratch_tmp=/scratch/jamcgirr/combine_gvcfs_'+infile+'_tmp \n')
o.write('scratch_cp_to_home=/scratch/jamcgirr/combine_gvcfs_'+infile+' \n')
o.write('home_cp=/home/jamcgirr/ph/data/combine_gvcfs/'+infile+' \n\n')
o.write('module load R \n')
o.write('module load maven \n')
o.write('module load java \n')
o.write('module load GATK/4.1.4.1 \n\n')
o.write('mkdir -vp $scratch_tmp \n\n')
o.write('gatk --java-options "-Xmx12g -Xms8g" GenomicsDBImport --genomicsdb-workspace-path $scratch_cp_to_home --batch-size 100 -L '+infile+'_interval.list --sample-name-map /home/jamcgirr/ph/scripts/fastq_to_vcf/combine_gvcfs/ph.gvcfs_map --tmp-dir=$scratch_tmp --reader-threads 4 \n\n')
o.write('cp -r $scratch_cp_to_home/* $home_cp \n')
o.write('##rm SCRATCH ONLY!! \n')
o.write('rm -r $scratch_tmp \n')
o.write('rm -r $scratch_cp_to_home \n\n')
o.write('sbatch script_'+infile+'_1_genotypegvcf.sh \n')
o.write('sbatch script_'+infile+'_2_genotypegvcf.sh \n')
o.write('\n\n#run: sbatch '+script)
o.close()
.vcf
for each chromosome split into two intervalsuse gatk GenotypeGVCFs
example script
#!/bin/bash
#SBATCH --job-name=chr1_1_genotypegvcf
#SBATCH --mem=16G
#SBATCH --ntasks=1
#SBATCH -e chr1_1_genotypegvcf_%A_%a.err
#SBATCH --time=06-00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)
module load R
module load maven
module load java
module load GATK/4.1.4.1
gatk GenotypeGVCFs -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V gendb://chr1 -L chr1:1-16542129 -O raw_variants_chr1_1.vcf
#run: sbatch script_chr1_1_genotypegvcf.sh
job_name = '_genotypegvcf'
infiles = ["chr1_1","chr2_1","chr3_1","chr4_1","chr5_1","chr6_1","chr7_1","chr8_1","chr9_1","chr10_1","chr11_1","chr12_1","chr13_1","chr14_1","chr15_1","chr16_1","chr17_1","chr18_1","chr19_1","chr20_1","chr21_1","chr22_1","chr23_1","chr24_1","chr25_1","chr26_1","chr1_2","chr2_2","chr3_2","chr4_2","chr5_2","chr6_2","chr7_2","chr8_2","chr9_2","chr10_2","chr11_2","chr12_2","chr13_2","chr14_2","chr15_2","chr16_2","chr17_2","chr18_2","chr19_2","chr20_2","chr21_2","chr22_2","chr23_2","chr24_2","chr25_2","chr26_2"]
chr_dict = {"chr1_1":"1-16542129","chr2_1":"1-16505160","chr3_1":"1-16263781","chr4_1":"1-16133824","chr5_1":"1-15793431","chr6_1":"1-15730777","chr7_1":"1-15495311","chr8_1":"1-15364778","chr9_1":"1-15238691","chr10_1":"1-15113866","chr11_1":"1-15048164","chr12_1":"1-15011240","chr13_1":"1-14922870","chr14_1":"1-14666386","chr15_1":"1-14356761","chr16_1":"1-13886911","chr17_1":"1-13784255","chr18_1":"1-13623647","chr19_1":"1-13565322","chr20_1":"1-13347081","chr21_1":"1-13232991","chr22_1":"1-12832026","chr23_1":"1-12646449","chr24_1":"1-10045549","chr25_1":"1-7462096","chr26_1":"1-6221605","chr1_2":"16542129-33084258","chr2_2":"16505160-33010319","chr3_2":"16263781-32527562","chr4_2":"16133824-32267647","chr5_2":"15793431-31586861","chr6_2":"15730777-31461554","chr7_2":"15495311-30990621","chr8_2":"15364778-30729556","chr9_2":"15238691-30477381","chr10_2":"15113866-30227731","chr11_2":"15048164-30096327","chr12_2":"15011240-30022480","chr13_2":"14922870-29845739","chr14_2":"14666386-29332771","chr15_2":"14356761-28713521","chr16_2":"13886911-27773822","chr17_2":"13784255-27568510","chr18_2":"13623647-27247294","chr19_2":"13565322-27130643","chr20_2":"13347081-26694162","chr21_2":"13232991-26465981","chr22_2":"12832026-25664052","chr23_2":"12646449-25292897","chr24_2":"10045549-20091098","chr25_2":"7462096-14924191","chr26_2":"6221605-12443209"}
for infile in infiles:
chr_name = infile.split("_")[0]
script = 'script_' + infile + job_name + '.sh'
sbatch_header_loop(job_name,'16','4','144', infile)
o = io.open(script,'a+', newline='\n')
o.write('# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n')
o.write('module load R \n')
o.write('module load maven \n')
o.write('module load java \n')
o.write('module load GATK/4.1.4.1 \n\n')
o.write('gatk GenotypeGVCFs -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V gendb://'+chr_name+' -L '+chr_name+':'+chr_dict[infile]+' -O raw_variants_'+infile+'.vcf \n')
o.write('\n\n#run: sbatch '+script)
o.close()
job_name = '_genotypegvcf_allsites'
infiles = ["chr1_1","chr2_1","chr3_1","chr4_1","chr5_1","chr6_1","chr7_1","chr8_1","chr9_1","chr10_1","chr11_1","chr12_1","chr13_1","chr14_1","chr15_1","chr16_1","chr17_1","chr18_1","chr19_1","chr20_1","chr21_1","chr22_1","chr23_1","chr24_1","chr25_1","chr26_1","chr1_2","chr2_2","chr3_2","chr4_2","chr5_2","chr6_2","chr7_2","chr8_2","chr9_2","chr10_2","chr11_2","chr12_2","chr13_2","chr14_2","chr15_2","chr16_2","chr17_2","chr18_2","chr19_2","chr20_2","chr21_2","chr22_2","chr23_2","chr24_2","chr25_2","chr26_2"]
chr_dict = {"chr1_1":"1-16542129","chr2_1":"1-16505160","chr3_1":"1-16263781","chr4_1":"1-16133824","chr5_1":"1-15793431","chr6_1":"1-15730777","chr7_1":"1-15495311","chr8_1":"1-15364778","chr9_1":"1-15238691","chr10_1":"1-15113866","chr11_1":"1-15048164","chr12_1":"1-15011240","chr13_1":"1-14922870","chr14_1":"1-14666386","chr15_1":"1-14356761","chr16_1":"1-13886911","chr17_1":"1-13784255","chr18_1":"1-13623647","chr19_1":"1-13565322","chr20_1":"1-13347081","chr21_1":"1-13232991","chr22_1":"1-12832026","chr23_1":"1-12646449","chr24_1":"1-10045549","chr25_1":"1-7462096","chr26_1":"1-6221605","chr1_2":"16542129-33084258","chr2_2":"16505160-33010319","chr3_2":"16263781-32527562","chr4_2":"16133824-32267647","chr5_2":"15793431-31586861","chr6_2":"15730777-31461554","chr7_2":"15495311-30990621","chr8_2":"15364778-30729556","chr9_2":"15238691-30477381","chr10_2":"15113866-30227731","chr11_2":"15048164-30096327","chr12_2":"15011240-30022480","chr13_2":"14922870-29845739","chr14_2":"14666386-29332771","chr15_2":"14356761-28713521","chr16_2":"13886911-27773822","chr17_2":"13784255-27568510","chr18_2":"13623647-27247294","chr19_2":"13565322-27130643","chr20_2":"13347081-26694162","chr21_2":"13232991-26465981","chr22_2":"12832026-25664052","chr23_2":"12646449-25292897","chr24_2":"10045549-20091098","chr25_2":"7462096-14924191","chr26_2":"6221605-12443209"}
for infile in infiles:
chr_name = infile.split("_")[0]
script = 'script_' + infile + job_name + '.sh'
sbatch_header_loop(job_name,'60','4','144', infile)
o = io.open(script,'a+', newline='\n')
o.write('# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n')
o.write('module load R \n')
o.write('module load maven \n')
o.write('module load java \n')
o.write('module load GATK/4.1.4.1 \n\n')
o.write('gatk GenotypeGVCFs -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V gendb://'+chr_name+' -L '+chr_name+':'+chr_dict[infile]+' -all-sites -O raw_variants_allsites_'+infile+'.vcf \n')
o.write('\n\n#run: sbatch '+script)
o.close()
.vcf
with biallelic SNPs and filter by depth and qualityuse gatk SelectVariants and bcftools filter
minimum DP = 600 maximum DP = 2000 minimum quality score = 20 minimum mapping quality = 30
example script
#!/bin/bash
#SBATCH --job-name=chr1_filter_gvcf
#SBATCH --mem=8G
#SBATCH --ntasks=4
#SBATCH -e chr1_filter_gvcf_%A_%a.err
#SBATCH --time=24:00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
module load samtools
module load bcftools
bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_chr1_1.vcf | bcftools +fill-tags -- -t all,'DP=sum(DP)' | bcftools filter -Oz -i 'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz
bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_chr1_2.vcf | bcftools +fill-tags -- -t all,'DP=sum(DP)' | bcftools filter -Oz -i 'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz
bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz
bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz
#run: sbatch script_chr1_filter_gvcf.sh
job_name = '_filter_gvcf'
infiles = ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chr23","chr24","chr25","chr26"]
for infile in infiles:
script = 'script_' + infile + job_name + '.sh'
sbatch_header_loop(job_name,'8','4','24', infile)
o = io.open(script,'a+', newline='\n')
o.write('module load samtools \n')
o.write('module load bcftools \n\n')
o.write('bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_'+infile+'_1.vcf | bcftools +fill-tags -- -t all,\'DP=sum(DP)\' | bcftools filter -Oz -i \'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000\' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_1.recode.vcf.gz \n')
o.write('bcftools view -S /home/jamcgirr/ph/data/combine_gvcfs/plates_1_through_5_rm.txt -m2 -M2 -v snps /home/jamcgirr/ph/data/combine_gvcfs/raw_variants_'+infile+'_2.vcf | bcftools +fill-tags -- -t all,\'DP=sum(DP)\' | bcftools filter -Oz -i \'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000\' -o /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_2.recode.vcf.gz \n\n')
o.write('bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_1.recode.vcf.gz \n')
o.write('bcftools index /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_'+infile+'_2.recode.vcf.gz \n\n')
o.write('\n\n#run: sbatch '+script)
o.close()
#EST time ~ 1.5hr
1,073,808 SNPs across 16,542,120 bp
snps <- read.table("C:/Users/jmcgirr/Documents/GitHub/pac_herring/slurm_scripts/fastq_to_vcf/combine_gvcfs/chr1_1_out.INFO", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
# GATK Hard filters
# qd_x <- "Variant Confidence/Quality by Depth"
# qd <- 2
# sor_x <- "Symmetric Odds Ratio of 2x2 contingency table to detect strand bias"
# sor <- 3
# fs_x <- "Phred-scaled p-value using Fisher's exact test to detect strand bias"
# fs <- 60
# mq_x <- "RMS Mapping Quality"
# mq <- 40
# mqrs_x <- "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"
# mqrs <- -12.5
# rprs_x <- "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias"
# rprs <- -8
#filtered <- snps %>% filter(QD < qd) %>%
# filter(SOR > sor) %>%
# filter(FS > fs) %>%
# filter(MQ < qd)
#
#nrow(snps)
#nrow(filtered)
#head(snps)
# {hist(as.numeric(snps$QD), xlab = qd_x, main = paste("< ",qd, sep = ""))
# abline(v = qd, col = red, lty = 2)}
#
# {hist(as.numeric(snps$SOR), xlab = sor_x, main = paste("> ",sor, sep = ""))
# abline(v = sor, col = red, lty = 2)}
#
# {hist(as.numeric(snps$FS), xlab = fs_x, main = paste("> ",fs, sep = ""))
# abline(v = fs, col = red, lty = 2)}
#
# {hist(as.numeric(snps$MQ), xlab = mq_x, main = paste("< ",mq, sep = ""))
# abline(v = mq, col = red, lty = 2)}
#
#
# # filter high coverage regions
# # doi.org/10.1093/bioinformatics/btu356
# # max depth between between d+3sqrt(d) and d+4sqrt(d) where d is mean read depth
# #"These false positives are mostly caused by CNVs
# # or paralogous sequences not present in the reference genome"
#
{hist(as.numeric(snps$DP), xlab = "Depth",breaks = 1000,xlim = c(0,20000), main = paste("600 > Depth ","< ",round((mean(snps$DP)+(4*sqrt(mean(snps$DP)))),0), sep = ""))
abline(v = 600, col = red, lty = 2)
abline(v = (mean(snps$DP)+(4*sqrt(mean(snps$DP)))), col = red, lty = 2)}
filter_DP <- snps %>% filter(DP < (mean(snps$DP)+(4*sqrt(mean(snps$DP))))) %>% filter(DP >600)
# % filtered
#((nrow(snps) - nrow(filter_DP))/ nrow(snps))*100
# define populations for SNP filtering
# put these txt files in directory for step 4
pop_info <- read.table("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/EVOS_MasterSheet_JoeMcGirr_April2020.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
aligned <- read.table("C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/familiarize/aligned_samples.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t")
pop_info <- pop_info[pop_info$Sample %in% aligned$sample,]
for (pop in unique(pop_info$Population.Year)){
pop1 <- pop_info %>% filter(Population.Year == pop)
#(unique(pop1$Location))
#print(unique(pop1$Year.Collected))
#print(nrow(pop1))
outfile_name <- paste("population_",pop1$Population.Year[1],".txt", sep = "")
#write.table(pop1$Sample, outfile_name, col.names = FALSE, row.names = FALSE, quote = FALSE)
}
.vcf
use bcftools concat
example script
#!/bin/bash
#SBATCH --job-name=merge_snp_vcfs
#SBATCH --mem=16G
#SBATCH --ntasks=4
#SBATCH -e merge_snp_vcfs_%A_%a.err
#SBATCH --time=06-00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
module load samtools
module load bcftools
bcftools concat -o /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf -O b --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_2.recode.vcf.gz
#run: sbatch script_merge_snp_vcfs.sh
job = 'merge_snp_vcfs'
#sbatch submission script header
script = 'script_' + job + '.sh'
outfile = io.open(script,'w', newline='\n')
outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+job+'\n')
outfile.write('#SBATCH --mem=16G \n')
outfile.write('#SBATCH --ntasks=4 \n')
outfile.write('#SBATCH -e '+job+'_%A_%a.err \n')
outfile.write('#SBATCH --time=06-00:00 \n')
outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
outfile.write('#SBATCH -p high \n\n')
outfile.write('module load samtools \n')
outfile.write('module load bcftools \n')
outfile.write('bcftools concat -o /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf -O b --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr1_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr2_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr3_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr4_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr5_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr6_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr7_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr8_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr9_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr10_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr11_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr12_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr13_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr14_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr15_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr16_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr17_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr18_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr19_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr20_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr21_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr22_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr23_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr24_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr25_2.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_1.recode.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/filtered_snps_chr26_2.recode.vcf.gz \n')
#run sbatch submission
outfile.write('\n\n#run: sbatch '+script)
outfile.close()
vcf
into 14 populations (grouped by unique(location:year)) and filter for 50% genotyping rateuse bcftools
example script
#!/bin/bash
#SBATCH --job-name=BC17_subset_snps_by_pop
#SBATCH --mem=8G
#SBATCH --ntasks=4
#SBATCH -e BC17_subset_snps_by_pop_%A_%a.err
#SBATCH --time=24:00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
module load bcftools
bcftools view -S /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/BC17_plates_1_through_5_rm.txt /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf | bcftools +fill-tags -- -t all,'DP=sum(DP)' | bcftools filter -Oz -i 'NS>32 ' > /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz
bcftools index /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz
#command to run: sbatch script_BC17_subset_snps_by_pop.sh
job_name = '_subset_snps_by_pop'
pop_n = {"BC17":"64","CA17":"70","PWS07":"46","PWS17":"56","PWS91":"58","PWS96":"72","SS06":"41","SS17":"64","SS96":"78","TB06":"52","TB17":"72","TB91":"74","TB96":"73","WA17":"72"}
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]
for infile in infiles:
script = 'script_' + infile + job_name + '.sh'
sbatch_header_loop(job_name,'8','4','24', infile)
o = io.open(script,'a+', newline='\n')
inds_with_data = math.ceil(int(pop_n[infile]) * 0.5)
o.write('module load bcftools \n')
o.write('bcftools view -S /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'+infile+'_plates_1_through_5_rm.txt /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf | bcftools +fill-tags -- -t all,\'DP=sum(DP)\' | bcftools filter -Oz -i \'NS>'+str(inds_with_data)+' \' > /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_filtered_snps.vcf.gz \n')
o.write('bcftools index /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_filtered_snps.vcf.gz \n\n')
o.write('\n\n#command to run: sbatch '+script)
o.close()
# EST time ~1hr
example script
#!/bin/bash
#SBATCH --job-name=isec
#SBATCH --mem=16G
#SBATCH --ntasks=4
#SBATCH -e isec_%A_%a.err
#SBATCH --time=144:00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
module load bcftools
bcftools isec -n=14 -p isec_test --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_CA17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS07_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_WA17_filtered_snps.vcf.gz
#run: sbatch script_isec.sh
job_name = 'isec'
sbatch_header(job_name,'16','16','144')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')
o.write('module load bcftools \n')
o.write('bcftools isec -n=14 -p isec_test --threads 16 /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_CA17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS07_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_WA17_filtered_snps.vcf.gz \n\n')
o.write('\n\n#run: sbatch '+script)
o.close()
Use sites.txt generated by bcftools isec in previous step.
Final vcf named ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05.vcf.gz
example script
#!/bin/bash
#SBATCH --job-name=isec
#SBATCH --mem=16G
#SBATCH --ntasks=4
#SBATCH -e isec_%A_%a.err
#SBATCH --time=144:00:00
#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc
#SBATCH --mail-type=ALL
#SBATCH -p high
module load bcftools
bcftools isec -n=14 -p isec_test --threads 4 /home/jamcgirr/ph/data/combine_gvcfs/population_BC17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_CA17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS07_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_PWS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_SS96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB06_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB17_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB91_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_TB96_filtered_snps.vcf.gz /home/jamcgirr/ph/data/combine_gvcfs/population_WA17_filtered_snps.vcf.gz
#run: sbatch script_isec.sh
job_name = 'final_subset_maf_0.05'
sbatch_header(job_name,'16','16','144')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')
o.write('#module load bcftools \n')
o.write('##bcftools index /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf \n')
o.write('#bcftools view -R /home/jamcgirr/ph/data/combine_gvcfs/isec_test/sites.txt /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf -Oz > /home/jamcgirr/ph/data/combine_gvcfs/ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5.vcf.gz \n')
o.write('#bcftools index /home/jamcgirr/ph/data/combine_gvcfs/ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5.vcf.gz \n')
o.write('bcftools filter -Oz -i \'INFO/AF>0.05\' -o /home/jamcgirr/ph/data/combine_gvcfs/ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05.vcf.gz \n')
o.write('\n\n#run: sbatch '+script)
o.close()
# 5 hrs (for view)