File size: 7,844 Bytes
7a92197 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
"""
Variant Calling Pipeline
Process sequencing data to identify genetic variants
"""
from pathlib import Path
from typing import Dict, List, Optional
import yaml
import logging
from dataclasses import dataclass
# Configure logging at INFO level via logging.basicConfig; this call sits at
# module top level, so it runs when the module is imported.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; the classes below log through it.
logger = logging.getLogger(__name__)
@dataclass
class Variant:
    """A single genetic variant record.

    The first seven fields are required and positional; ``gene`` and
    ``consequence`` are optional annotations that default to ``None``.
    Instances are mutable, so annotations can be filled in after creation.
    """

    # Genomic location and alleles
    chromosome: str
    position: int
    reference: str
    alternate: str

    # Call metrics
    quality: float
    depth: int
    allele_frequency: float

    # Optional functional annotation
    gene: Optional[str] = None
    consequence: Optional[str] = None
class VariantCaller:
    """Call variants from sequencing data.

    Thresholds and the output directory are read from the
    ``pipeline.variant_calling`` section of a YAML configuration file.
    Variant calling itself is simulated (see ``_simulate_variant_calling``);
    the VCF writing and reading helpers operate on real files.
    """

    # Fixed VCF preamble. The FILTER line declares the ``LowQual`` value that
    # ``_write_vcf`` may put in the FILTER column, so every filter used in the
    # records is described in the header.
    _VCF_HEADER = (
        "##fileformat=VCFv4.2\n"
        "##source=CancerAtHomeVariantCaller\n"
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"
        "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">\n"
        "##INFO=<ID=GENE,Number=1,Type=String,Description=\"Gene Name\">\n"
        "##INFO=<ID=CONS,Number=1,Type=String,Description=\"Consequence\">\n"
        "##FILTER=<ID=LowQual,Description=\"Depth or allele frequency below the configured minimum\">\n"
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
    )

    def __init__(self, config_path: str = "config.yml"):
        """Load the variant-calling settings and create the output directory.

        Args:
            config_path: Path to the YAML configuration file. It must contain
                ``pipeline.variant_calling`` with the keys ``min_coverage``,
                ``min_allele_frequency`` and ``output_dir``.

        Raises:
            OSError: If the configuration file cannot be opened.
            KeyError: If a required section or key is missing from the
                loaded mapping.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)['pipeline']['variant_calling']

        self.min_coverage = self.config['min_coverage']
        self.min_allele_frequency = self.config['min_allele_frequency']
        self.output_dir = Path(self.config['output_dir'])
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def call_variants(
        self,
        alignment_file: Path,
        reference_genome: Path,
        output_vcf: Optional[Path] = None
    ) -> Path:
        """
        Call variants from aligned sequencing data

        Args:
            alignment_file: BAM/SAM alignment file. Only its name is used
                (for logging and for the default output file name).
            reference_genome: Reference genome FASTA. Not read by the
                simulated caller.
            output_vcf: Output VCF file. Defaults to
                ``<output_dir>/<alignment stem>_variants.vcf``.

        Returns:
            Path to VCF file
        """
        # Accept plain strings as well as Path objects.
        alignment_file = Path(alignment_file)

        if output_vcf is None:
            output_vcf = self.output_dir / f"{alignment_file.stem}_variants.vcf"

        logger.info("Calling variants from %s", alignment_file.name)

        # Simulate variant calling for demo
        # In production, use tools like GATK, FreeBayes, or BCFtools
        variants = self._simulate_variant_calling()

        self._write_vcf(variants, output_vcf)
        logger.info("Identified %d variants", len(variants))
        return output_vcf

    def _simulate_variant_calling(self) -> List[Variant]:
        """Return a fixed list of demo variants (no input data is read)."""
        # Common cancer-associated variants
        return [
            Variant('chr17', 7577538, 'C', 'T', 35.2, 50, 0.45, 'TP53', 'missense'),
            Variant('chr7', 140453136, 'A', 'T', 42.1, 65, 0.52, 'BRAF', 'missense'),
            Variant('chr13', 32914438, 'T', 'C', 38.7, 55, 0.48, 'BRCA2', 'missense'),
            Variant('chr17', 41244936, 'G', 'A', 40.3, 60, 0.50, 'BRCA1', 'missense'),
            Variant('chr3', 178936091, 'G', 'A', 33.5, 48, 0.43, 'PIK3CA', 'missense'),
            Variant('chr9', 133748283, 'T', 'G', 37.9, 52, 0.46, 'ABL1', 'missense'),
            Variant('chr12', 25398284, 'C', 'T', 39.4, 58, 0.49, 'KRAS', 'missense'),
        ]

    def _format_vcf_record(self, v: Variant) -> str:
        """Render one variant as a tab-separated VCF data line (with newline)."""
        info_parts = [f"DP={v.depth}", f"AF={v.allele_frequency:.3f}"]
        if v.gene:
            info_parts.append(f"GENE={v.gene}")
        if v.consequence:
            info_parts.append(f"CONS={v.consequence}")

        passes = (
            v.depth >= self.min_coverage
            and v.allele_frequency >= self.min_allele_frequency
        )
        filter_status = "PASS" if passes else "LowQual"

        columns = (
            v.chromosome,
            str(v.position),
            ".",  # ID column: no identifier is assigned
            v.reference,
            v.alternate,
            f"{v.quality:.1f}",
            filter_status,
            ";".join(info_parts),
        )
        return "\t".join(columns) + "\n"

    def _write_vcf(self, variants: List[Variant], output_file: Path):
        """Write variants to VCF format.

        Each record is marked ``PASS`` when its depth and allele frequency
        reach ``self.min_coverage`` and ``self.min_allele_frequency``,
        otherwise ``LowQual``.

        Args:
            variants: Variants to serialise, in output order.
            output_file: Destination path; overwritten if it exists.
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(self._VCF_HEADER)
            f.writelines(self._format_vcf_record(v) for v in variants)

    @staticmethod
    def _parse_info(info_field: str) -> Dict[str, str]:
        """Split a VCF INFO column into a key -> value mapping.

        Entries without ``=`` (flags) are ignored. Only the first ``=``
        separates key from value, so values containing ``=`` stay intact.
        """
        info: Dict[str, str] = {}
        for item in info_field.split(';'):
            key, sep, value = item.partition('=')
            if sep:
                info[key] = value
        return info

    def _parse_vcf_record(self, fields: List[str], min_quality: float) -> Optional[Variant]:
        """Build a Variant from split VCF columns, or None if below threshold.

        Raises:
            ValueError: If QUAL, POS, DP or AF is not numeric.
        """
        raw_quality = fields[5]
        # "." is the VCF missing-value marker; a record without a quality
        # score cannot satisfy a minimum-quality requirement.
        if raw_quality == '.':
            return None

        quality = float(raw_quality)
        if quality < min_quality:
            return None

        info = self._parse_info(fields[7])
        return Variant(
            chromosome=fields[0],
            position=int(fields[1]),
            reference=fields[3],
            alternate=fields[4],
            quality=quality,
            depth=int(info.get('DP', 0)),
            allele_frequency=float(info.get('AF', 0)),
            gene=info.get('GENE'),
            consequence=info.get('CONS')
        )

    def filter_variants(
        self,
        vcf_file: Path,
        min_quality: float = 30.0
    ) -> List[Variant]:
        """Filter variants by quality metrics.

        Reads ``vcf_file`` and keeps the records whose QUAL is at least
        ``min_quality``. Header lines, lines with fewer than eight columns
        and records with a missing QUAL are skipped. A record with a
        non-numeric field is logged and skipped on its own rather than
        discarding the whole file.

        Args:
            vcf_file: Path of the VCF file to read.
            min_quality: Minimum QUAL value (inclusive) to keep a record.

        Returns:
            The retained variants in file order, or an empty list if the
            file could not be read.
        """
        variants: List[Variant] = []

        try:
            with open(vcf_file, 'r', encoding='utf-8') as f:
                for line_number, line in enumerate(f, start=1):
                    if line.startswith('#'):
                        continue

                    fields = line.strip().split('\t')
                    if len(fields) < 8:
                        continue

                    try:
                        variant = self._parse_vcf_record(fields, min_quality)
                    except ValueError as e:
                        logger.warning(
                            "Skipping malformed VCF record at line %d: %s",
                            line_number, e,
                        )
                        continue

                    if variant is not None:
                        variants.append(variant)
        except Exception as e:
            # Kept broad on purpose: this method is best-effort and has
            # never raised to its callers.
            logger.error("Error filtering variants: %s", e)
            return []

        logger.info("Filtered to %d high-quality variants", len(variants))
        return variants

    def annotate_variants(self, variants: List[Variant]) -> List[Variant]:
        """
        Annotate variants with functional information

        Simulated annotation: a falsy ``gene`` becomes ``"UNKNOWN"`` and a
        falsy ``consequence`` becomes ``"unknown"``. The variants are
        modified in place and the same list is returned.

        In production, integrate with tools like:
        - ANNOVAR
        - VEP (Variant Effect Predictor)
        - SnpEff
        """
        for variant in variants:
            if not variant.gene:
                variant.gene = "UNKNOWN"
            if not variant.consequence:
                variant.consequence = "unknown"

        return variants
class VariantAnalyzer:
    """Analyze and interpret variants"""

    # Common cancer genes used by identify_cancer_variants.
    CANCER_GENES = frozenset({
        'TP53', 'BRCA1', 'BRCA2', 'KRAS', 'EGFR', 'BRAF',
        'PIK3CA', 'APC', 'PTEN', 'MYC', 'RB1', 'CDKN2A',
    })

    # Consequence labels counted as coding mutations for TMB.
    CODING_CONSEQUENCES = frozenset({'missense', 'nonsense', 'frameshift'})

    def __init__(self):
        """Create the analyzer together with a default-configured VariantCaller."""
        self.caller = VariantCaller()

    def identify_cancer_variants(self, variants: List[Variant]) -> List[Variant]:
        """Identify known cancer-associated variants.

        Args:
            variants: Variants to screen.

        Returns:
            The variants, in input order, whose ``gene`` is one of
            ``CANCER_GENES``. Variants without a gene are never selected.
        """
        cancer_variants = [v for v in variants if v.gene in self.CANCER_GENES]

        logger.info("Found %d cancer-associated variants", len(cancer_variants))
        return cancer_variants

    def calculate_mutation_burden(
        self,
        variants: List[Variant],
        exome_size_mb: float = 30.0
    ) -> float:
        """Calculate tumor mutation burden (TMB).

        TMB is computed as the number of variants whose ``consequence`` is
        one of ``CODING_CONSEQUENCES`` divided by the exome size. No somatic
        filtering is done here; pass only the variants that should count.

        Args:
            variants: Variants to count.
            exome_size_mb: Size of the sequenced region in megabases.
                Defaults to 30, an approximate exome size.

        Returns:
            Mutations per megabase.

        Raises:
            ValueError: If ``exome_size_mb`` is not positive.
        """
        if exome_size_mb <= 0:
            raise ValueError("exome_size_mb must be positive")

        coding_count = sum(
            1 for v in variants if v.consequence in self.CODING_CONSEQUENCES
        )
        tmb = coding_count / exome_size_mb

        logger.info("Tumor Mutation Burden: %.2f mutations/Mb", tmb)
        return tmb
|