# gnomAD
As of 2019-08-16, the latest release of gnomAD is 2.1.1 from March 6, 2019.
Variant-level allele counts and frequencies from gnomAD's exome and genome cohorts are used to annotate somatic and germline SNVs/indels.
These files were retrieved and processed as follows:
# Exomes
The exome VCF file contains a lot of information. Most of it is pruned to reduce file size, keeping only the relevant fields. Only values from the non-cancer subset of the total population are used, since this subset excludes the normals from TCGA.
# Download the exome sites VCF and its tabix index; each file is renamed from .bgz to .gz
for ext in "" .tbi; do
  wget "https://storage.googleapis.com/gnomad-public/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.vcf.bgz${ext}"
  mv "gnomad.exomes.r2.1.1.sites.vcf.bgz${ext}" "gnomad.exomes.r2.1.1.sites.vcf.gz${ext}"
done
# Work out which INFO fields to retain.
# Step 1: collect the header lines that mention non_cancer_AC or non_cancer_AF,
# skipping the sex-specific (_male/_female) entries.
bcftools view -h gnomad.exomes.r2.1.1.sites.vcf.gz \
  | grep --extended-regexp "non_cancer_AC|non_cancer_AF" \
  | grep --invert-match --regexp="_male" --regexp="_female" \
  > gnomad.exomes.r2.1.1.sites.retained.info
# Step 2: reduce every header line to two tab-separated columns, the field ID and
# the leading part of its Description, rewriting the file through a scratch copy.
paste \
  <(grep --only-matching --perl-regexp "(?<=ID\=)[A-Za-z_]+" gnomad.exomes.r2.1.1.sites.retained.info) \
  <(grep --only-matching --perl-regexp "(?<=Description=\")[A-Za-z\(\),\-\_\ ]+" gnomad.exomes.r2.1.1.sites.retained.info) \
  > tmp \
  && mv tmp gnomad.exomes.r2.1.1.sites.retained.info
# Step 3: join the IDs (first column) into "^INFO/<id>,INFO/<id>,..." for bcftools --remove;
# the leading ^ asks bcftools to drop every INFO tag except the listed ones.
COLS=$(Rscript -e "out = paste0('^INFO/', paste(read.delim('gnomad.exomes.r2.1.1.sites.retained.info', header = F)[['V1']], collapse = ',INFO/')); cat(out)")
# Strip the unwanted INFO fields. Records are selected with FILTER~"PASS" | FILTER~"RF",
# so RF-filtered sites are retained alongside the passing ones.
bcftools annotate \
  -x "$COLS" \
  -i 'FILTER~"PASS" | FILTER~"RF"' \
  -O z \
  -o tmp.vcf.gz \
  gnomad.exomes.r2.1.1.sites.vcf.gz
# Index the intermediate file
tabix -p vcf tmp.vcf.gz
# Mark filtered sites: add the INFO flag gnomAD_FILTER to every record whose FILTER is not PASS.
# tmp.vcf.gz is both the input and the annotation source, so every record counts as
# "present" in the annotation file. --include then restricts the flag to non-PASS
# records, and --keep-sites keeps the PASS records in the output instead of dropping them.
# The annotation source has to be the already-indexed tmp.vcf.gz: the output file
# does not exist yet and cannot be read while it is being written.
bcftools annotate \
  --annotations tmp.vcf.gz \
  --include 'FILTER!="PASS"' \
  --mark-sites "+gnomAD_FILTER" \
  --keep-sites \
  --output-type z \
  --output gnomad.exomes.r2.1.1.sites.non_cancer.vcf.gz \
  tmp.vcf.gz
# Index the final exome file
tabix --preset vcf gnomad.exomes.r2.1.1.sites.non_cancer.vcf.gz
# Clean up: remove the field list and the intermediate VCF with its index
rm gnomad.exomes.r2.1.1.sites.retained.info \
  tmp.vcf.gz \
  tmp.vcf.gz.tbi
# Genomes
For genomes, there is no non-cancer subset, so values from the full genome cohort are used.
# Download one chromosome at a time; each VCF and index is renamed from .bgz to .gz
for chr in {1..22} X; do
  vcf="gnomad.genomes.r2.1.1.sites.${chr}.vcf"
  wget "https://storage.googleapis.com/gnomad-public/release/2.1.1/vcf/genomes/${vcf}.bgz"
  wget "https://storage.googleapis.com/gnomad-public/release/2.1.1/vcf/genomes/${vcf}.bgz.tbi"
  mv "${vcf}.bgz" "${vcf}.gz"
  mv "${vcf}.bgz.tbi" "${vcf}.gz.tbi"
done
# Work out which INFO fields to retain, using the chromosome 1 header.
# Header lines mentioning AC or AF are kept unless they match one of the excluded
# patterns; the ID is then cut out as the text between the second "=" and the next ",".
bcftools view -h gnomad.genomes.r2.1.1.sites.1.vcf.gz \
  | grep --extended-regexp "AC|AF" \
  | grep --invert-match \
      --regexp="_male" --regexp="_female" --regexp="controls" \
      --regexp="topmed" --regexp="neuro" --regexp="vep" \
      --regexp="raw" --regexp="=popmax" --regexp="AC0" \
  | cut -d "=" -f 3 \
  | cut -d "," -f 1 \
  > gnomad.genomes.r2.1.1.sites.retained.info
# Join the IDs into "^INFO/<id>,INFO/<id>,..." for bcftools --remove;
# the leading ^ asks bcftools to drop every INFO tag except the listed ones.
COLS=$(Rscript -e "out = paste0('^INFO/', paste(read.delim('gnomad.genomes.r2.1.1.sites.retained.info', header = F)[['V1']], collapse = ',INFO/')); cat(out)")
# Chromosomes to process
CHR=({1..22} X)
# Per chromosome: prune the INFO fields (retaining RF-filtered sites next to passing
# ones), then flag every non-PASS record with INFO/gnomAD_FILTER.
for chr in "${CHR[@]}"; do
  pruned="tmp.${chr}.vcf.gz"
  marked="gnomad.genomes.r2.1.1.sites.${chr}.minimal.vcf.gz"

  # Drop the unwanted INFO fields; records are selected by the FILTER expression
  bcftools annotate \
    -x "$COLS" \
    -i 'FILTER~"PASS" | FILTER~"RF"' \
    -O z \
    -o "$pruned" \
    "gnomad.genomes.r2.1.1.sites.${chr}.vcf.gz"
  tabix -p vcf "$pruned"

  # The pruned file annotates itself: -i limits the flag to non-PASS records,
  # -k keeps the remaining records in the output
  bcftools annotate \
    -a "$pruned" \
    -i 'FILTER!="PASS"' \
    -m "+gnomAD_FILTER" \
    -k \
    -O z \
    -o "$marked" \
    "$pruned"
  tabix -p vcf "$marked"
done
# Concatenate the per-chromosome files (1-22, then X) into one file and index it
bcftools concat \
  -O z \
  -o gnomad.genomes.r2.1.1.sites.minimal.vcf.gz \
  gnomad.genomes.r2.1.1.sites.{{1..22},X}.minimal.vcf.gz
tabix -p vcf gnomad.genomes.r2.1.1.sites.minimal.vcf.gz
# Clean up: remove the per-chromosome minimal and temporary VCFs; the trailing
# globs also match their index files
for chr in "${CHR[@]}"; do
  rm "gnomad.genomes.r2.1.1.sites.${chr}.minimal.vcf.gz"* "tmp.${chr}.vcf.gz"*
done