GenNet / preprocess /run_beagle.py
lnalinaf's picture
Upload imputation helper run_beagle.py
0efff5c
raw
history blame
4.63 kB
#!/usr/bin/env python
import re
import click
import os
import glob
"""
script for running Beagle 5.4
All kind of data for this script like human reference panel, genetic maps
and executable files for Beagle and Conform-gt can be found on the Beagle website
http://faculty.washington.edu/browning/beagle/beagle.html
"""
def bgzip_and_index(vcf, samples):
os.system(f'bcftools view {vcf} -Oz -o {vcf}.gz')
os.system(f'bcftools index {vcf}.gz')
if samples:
# for VCF with other samples
os.system(f'bcftools view {samples} -Oz -o {samples}.gz')
os.system(f'bcftools index {samples}.gz')
print('bgzip_and_index: done')
def merge(vcf, samples):
os.system(f'bcftools merge {vcf}.gz {samples}.gz -o merged.vcf')
bgzip_and_index('merged.vcf', False)
print('merge: done')
def clean_and_gzip(vcf, samples):
if samples:
vcf = 'merged.vcf'
result_file = f'{vcf.split(".vcf")[0]}_clean.vcf'
os.system(f'bcftools view -e \'ALT =="." | REF=="."\' {vcf}.gz -Oz -o temp.vcf.gz') # remove unknown alleles
os.system(f'bcftools norm -d none temp.vcf.gz >> {result_file}') # remove duplicates
os.system(f'gzip {result_file}')
os.remove('temp.vcf.gz') # remove temporal file
print('clean_and_gzip: done')
return f'{result_file}.gz'
def run_conform(conform, vcf_gz_file, ref_folder):
"""output files: checked_{chr_type}.vcf.gz
docs: http://faculty.washington.edu/browning/conform-gt.html
reference: files was downloaded from from Beagle human reference link
https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/"""
for ref_file in glob.glob(f'{ref_folder}/**/chr*.vcf.gz', recursive=True):
print('conform ', ref_file)
if re.search("chr(\d+)", ref_file):
chr_type = (re.search("chr(\d+)", ref_file))[1]
elif re.search("chrX", ref_file):
chr_type = (re.search("chrX", ref_file))[0].split('chr')[1]
os.system(f'java -jar {conform} ref={ref_file} gt={vcf_gz_file} chrom={chr_type} '
f'out=checked_{chr_type}')
print('run_conform: done')
def ensure_biallelic_ref(ref_dir):
for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
ref_biall_path = os.path.join(ref_dir, f'{ref_file.split("vcf")[0]}biallelic.vcf.gz')
print('ensure ', ref_file, ref_biall_path)
os.system(f'bcftools view -m2 -M2 -v snps -Oz -o {ref_biall_path} {ref_file}')
os.system(f'bcftools index {ref_biall_path}.gz')
os.remove(ref_file) # remove initial ref file
def run_beagle(beagle, gb, map_dir, ref_dir):
"""output files: checked_{chr_type}.vcf.gz
docs: http://faculty.washington.edu/browning/conform-gt.html"""
for checked_file in glob.glob(f'{os.getcwd()}/checked_*.vcf.gz'):
if re.search("checked_(\d+)", checked_file):
chr_type = (re.search("checked_(\d+)", checked_file))[1]
elif re.search("checked_X", checked_file):
chr_type = (re.search("checked_X", checked_file))[0].split('checked_')[1]
for ref_file in glob.glob(f'{ref_dir}/chr{chr_type}.*biallelic.vcf.gz'):
for map_file in glob.glob(f'{map_dir}/plink.chr{chr_type}.*.map'):
os.system(f'java -Xmx{gb}g -jar {beagle} gt={checked_file} ref={ref_file}'
f' out=imputed_{chr_type} map={map_file}')
@click.command()
@click.option('--vcf', help='Path to the target vcf file')
@click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF'
'contains data for at least 20 individuals', required=False)
@click.option('--conform', help='Path to conform .jar file')
@click.option('--beagle', help='Path to beagle .jar file')
@click.option('--ref', help='Path to folder with reference genome:'
' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
@click.option('--map', help='Path to folder with PLINK format genetic maps, files are expected to start with'
'"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
@click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
def main(vcf, samples, conform, beagle, ref, map, gb):
bgzip_and_index(vcf, samples)
if samples:
merge(vcf, samples)
cleaned_file = clean_and_gzip(vcf, samples) # returned cleaned file in .vcf.gz (gzip) format
ensure_biallelic_ref(ref)
run_conform(conform, cleaned_file, ref)
run_beagle(beagle, gb, map, ref)
main()