Upload imputation helper run_beagle.py

0efff5c almost 3 years ago

4.63 kB

	#!/usr/bin/env python
	import glob
	import os
	import re
	import subprocess

	import click
	"""
	script for running Beagle 5.4
	All kinds of data for this script, such as the human reference panel, genetic maps
	and the executable files for Beagle and conform-gt, can be found on the Beagle website
	http://faculty.washington.edu/browning/beagle/beagle.html
	"""


	def bgzip_and_index(vcf, samples):
	    """Compress VCF files with bcftools and index the compressed copies.

	    For every processed path ``p`` this runs ``bcftools view p -Oz -o p.gz``
	    followed by ``bcftools index p.gz``.

	    Args:
	        vcf: Path to the target VCF file.
	        samples: Path to a VCF with other samples to process the same way,
	            or a falsy value (``None``/``False``) to skip it.

	    Raises:
	        subprocess.CalledProcessError: if a bcftools command exits non-zero.
	        OSError: if the bcftools executable cannot be started.
	    """
	    # ``samples`` is optional; only the paths actually supplied are processed.
	    paths = (vcf, samples) if samples else (vcf,)
	    for path in paths:
	        gz_path = f'{path}.gz'
	        # Argument lists (no shell) keep paths containing spaces or shell
	        # metacharacters intact. check=True stops at the first failing
	        # command, so "done" is never printed after a failed step.
	        subprocess.run(['bcftools', 'view', path, '-Oz', '-o', gz_path], check=True)
	        subprocess.run(['bcftools', 'index', gz_path], check=True)
	    print('bgzip_and_index: done')


	def merge(vcf, samples):
	    """Merge the two compressed VCFs into ``merged.vcf``, then compress and index it.

	    Reads ``{vcf}.gz`` and ``{samples}.gz`` and writes ``merged.vcf`` into
	    the current working directory. The merged file is then passed to
	    ``bgzip_and_index`` so that ``merged.vcf.gz`` and its index exist.

	    Args:
	        vcf: Path to the target VCF (without the ``.gz`` suffix).
	        samples: Path to the VCF with other samples (without ``.gz``).

	    Raises:
	        subprocess.CalledProcessError: if ``bcftools merge`` exits non-zero.
	        OSError: if the bcftools executable cannot be started.
	    """
	    # No shell is involved, so paths with spaces survive; check=True keeps a
	    # failed merge from being followed by compression of a missing file.
	    subprocess.run(
	        ['bcftools', 'merge', f'{vcf}.gz', f'{samples}.gz', '-o', 'merged.vcf'],
	        check=True,
	    )
	    bgzip_and_index('merged.vcf', False)
	    print('merge: done')


	def clean_and_gzip(vcf, samples):
	    """Drop records with unknown alleles and duplicates, then gzip the result.

	    Args:
	        vcf: Path to the target VCF; ``{vcf}.gz`` is used as the input.
	        samples: Truthy when other samples were merged in. The input then
	            becomes ``merged.vcf.gz`` instead of ``{vcf}.gz``.

	    Returns:
	        Path of the gzip-compressed cleaned file, ``<name>_clean.vcf.gz``,
	        located next to the input, where ``<name>`` is the input file name
	        up to its first ``.vcf``.

	    Raises:
	        subprocess.CalledProcessError: if bcftools or gzip exits non-zero.
	        OSError: if one of the executables cannot be started.
	    """
	    if samples:
	        vcf = 'merged.vcf'
	    # Derive the stem from the file name only, so a directory whose name
	    # contains ".vcf" (e.g. "b37.vcf/") does not truncate the path.
	    folder, file_name = os.path.split(vcf)
	    result_file = os.path.join(folder, f'{file_name.split(".vcf")[0]}_clean.vcf')
	    temp_file = 'temp.vcf.gz'
	    try:
	        # remove unknown alleles
	        subprocess.run(
	            ['bcftools', 'view', '-e', 'ALT=="." | REF=="."', f'{vcf}.gz',
	             '-Oz', '-o', temp_file],
	            check=True,
	        )
	        # remove duplicates; -o replaces any result left by an earlier run
	        # instead of appending a second VCF to it
	        subprocess.run(
	            ['bcftools', 'norm', '-d', 'none', temp_file, '-o', result_file],
	            check=True,
	        )
	    finally:
	        # remove the temporary file even when one of the steps failed
	        if os.path.exists(temp_file):
	            os.remove(temp_file)
	    # -f lets a re-run overwrite an existing <result>.gz
	    subprocess.run(['gzip', '-f', result_file], check=True)
	    print('clean_and_gzip: done')
	    return f'{result_file}.gz'


	def run_conform(conform, vcf_gz_file, ref_folder):
	    """Run conform-gt once per reference chromosome file.

	    output files: conform-gt is started with out=checked_{chr_type}
	    docs: http://faculty.washington.edu/browning/conform-gt.html
	    reference: files were downloaded from the Beagle human reference link
	    https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/

	    Args:
	        conform: Path to the conform-gt .jar file.
	        vcf_gz_file: Path to the compressed target VCF passed as ``gt=``.
	        ref_folder: Folder searched (recursively) for ``chr*.vcf.gz``
	            reference files.
	    """
	    # "**" is what makes recursive=True effective; file names are expected
	    # to start with "chr1." ... "chr22.", "chrX.".
	    pattern = os.path.join(ref_folder, '**', 'chr*.vcf.gz')
	    for ref_file in glob.glob(pattern, recursive=True):
	        print('conform ', ref_file)
	        # Look at the file name only, so a "chr<N>" inside a directory name
	        # cannot be mistaken for the chromosome.
	        found = re.match(r'chr(\d+|X)', os.path.basename(ref_file))
	        if found is None:
	            # Neither a numbered chromosome nor X: skip it rather than
	            # reuse the chromosome of the previous file.
	            print('conform: skipped (unrecognised chromosome) ', ref_file)
	            continue
	        chr_type = found[1]
	        finished = subprocess.run([
	            'java', '-jar', conform,
	            f'ref={ref_file}', f'gt={vcf_gz_file}', f'chrom={chr_type}',
	            f'out=checked_{chr_type}',
	        ])
	        # Remaining chromosomes are still processed, but a failure is reported.
	        if finished.returncode != 0:
	            print(f'conform: chromosome {chr_type} failed with exit code {finished.returncode}')
	    print('run_conform: done')


	def ensure_biallelic_ref(ref_dir):
	    """Replace each reference VCF in ``ref_dir`` with a biallelic-SNP-only copy.

	    Every ``chr*.v5a.vcf.gz`` file is filtered with
	    ``bcftools view -m2 -M2 -v snps`` into ``<name>.biallelic.vcf.gz`` in the
	    same folder, the new file is indexed, and the original is deleted.

	    Args:
	        ref_dir: Folder containing the reference ``chr*.v5a.vcf.gz`` files.

	    Raises:
	        subprocess.CalledProcessError: if a bcftools command exits non-zero.
	        OSError: if the bcftools executable cannot be started.
	    """
	    suffix = '.vcf.gz'
	    for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
	        # glob already returns the path including ref_dir, so only the
	        # trailing ".vcf.gz" is swapped; the directory part is left alone.
	        ref_biall_path = f'{ref_file[:-len(suffix)]}.biallelic{suffix}'
	        print('ensure ', ref_file, ref_biall_path)
	        subprocess.run(
	            ['bcftools', 'view', '-m2', '-M2', '-v', 'snps', '-Oz',
	             '-o', ref_biall_path, ref_file],
	            check=True,
	        )
	        # The output path already ends in ".gz"; index exactly that file.
	        subprocess.run(['bcftools', 'index', ref_biall_path], check=True)
	        # Reached only when both commands succeeded, so a failed conversion
	        # never destroys the original reference file.
	        os.remove(ref_file)  # remove initial ref file


	def run_beagle(beagle, gb, map_dir, ref_dir):
	    """Run Beagle on every ``checked_*.vcf.gz`` file in the working directory.

	    input files: checked_{chr_type}.vcf.gz in the current working directory
	    output files: Beagle is started with out=imputed_{chr_type}
	    docs: http://faculty.washington.edu/browning/beagle/beagle.html

	    Args:
	        beagle: Path to the Beagle .jar file.
	        gb: Number of gigabytes given to the JVM (``-Xmx{gb}g``).
	        map_dir: Folder with genetic maps named ``plink.chr{chr_type}.*.map``.
	        ref_dir: Folder with reference files named
	            ``chr{chr_type}.*biallelic.vcf.gz``.
	    """
	    for checked_file in glob.glob(f'{os.getcwd()}/checked_*.vcf.gz'):
	        # Match on the file name only and skip anything that is neither a
	        # numbered chromosome nor X, instead of failing on an unset
	        # chr_type or reusing the one from the previous file.
	        found = re.match(r'checked_(\d+|X)', os.path.basename(checked_file))
	        if found is None:
	            print('beagle: skipped (unrecognised chromosome) ', checked_file)
	            continue
	        chr_type = found[1]
	        for ref_file in glob.glob(f'{ref_dir}/chr{chr_type}.*biallelic.vcf.gz'):
	            for map_file in glob.glob(f'{map_dir}/plink.chr{chr_type}.*.map'):
	                finished = subprocess.run([
	                    'java', f'-Xmx{gb}g', '-jar', beagle,
	                    f'gt={checked_file}', f'ref={ref_file}',
	                    f'out=imputed_{chr_type}', f'map={map_file}',
	                ])
	                # Other chromosomes are still imputed, but a failure is reported.
	                if finished.returncode != 0:
	                    print(f'beagle: chromosome {chr_type} failed with exit code '
	                          f'{finished.returncode}')


	@click.command()
	@click.option('--vcf', required=True, help='Path to the target vcf file')
	@click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF '
	              'contains data for at least 20 individuals', required=False)
	@click.option('--conform', required=True, help='Path to conform .jar file')
	@click.option('--beagle', required=True, help='Path to beagle .jar file')
	@click.option('--ref', required=True, help='Path to folder with reference genome:'
	              ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
	@click.option('--map', required=True, help='Path to folder with PLINK format genetic maps, files are expected to '
	              'start with "plink.chr1.", ..."plink.chr22.", "plink.chrX."')
	@click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
	def main(vcf, samples, conform, beagle, ref, map, gb):
	    """Prepare the target VCF, conform it to the reference and impute it with Beagle.

	    Steps, in order: compress and index the input(s); merge in the extra
	    samples when given; clean and gzip the result; reduce the reference
	    panel to biallelic SNPs; run conform-gt; run Beagle.
	    """
	    # NOTE: the ``map`` parameter shadows the builtin, but its name is
	    # derived from the ``--map`` option and is kept for CLI compatibility.
	    bgzip_and_index(vcf, samples)
	    if samples:
	        merge(vcf, samples)
	    cleaned_file = clean_and_gzip(vcf, samples)  # returned cleaned file in .vcf.gz (gzip) format
	    ensure_biallelic_ref(ref)
	    run_conform(conform, cleaned_file, ref)
	    run_beagle(beagle, gb, map, ref)


	# Run the CLI only when executed as a script, so importing this module
	# (e.g. to reuse a single step) does not start the whole pipeline.
	if __name__ == '__main__':
	    main()