#!/usr/bin/env python3
"""
Comprehensive validation of example variants:
1. Literature accuracy
2. Example diversity and quality
3. Track type coverage in results

Run as a script to validate every variant in ``VARIANT_REFERENCES``.
Importing the module only defines the helpers; nothing is executed.
"""

import sys

# Make the project-local ``app`` module importable when run from the repo root.
sys.path.insert(0, ".")

import pandas as pd
from collections import defaultdict
import json
import re

# Known variants from literature, keyed by (chrom, pos, ref, alt).
VARIANT_REFERENCES = {
    ("chr17", 7675088, "C", "T"): {
        "name": "TP53 R175H",
        "disease": "Cancer (tumor suppressor)",
        "frequency": "~6% of cancers",
        "literature": "Highly common hotspot, eliminates DNA binding domain",
        "impact_expected": "HIGH",
        "regions_expected": ["coding", "regulatory"],
    },
    ("chr7", 117559593, "ATCT", "A"): {
        "name": "CFTR F508del",
        "disease": "Cystic Fibrosis",
        "frequency": "~70% of CF patients",
        "literature": "Most common CF mutation, causes protein misfolding",
        "impact_expected": "HIGH",
        "regions_expected": ["coding", "structural"],
    },
    ("chr13", 32332771, "AGAGA", "AGA"): {
        "name": "BRCA2 frameshift",
        "disease": "Hereditary breast/ovarian cancer",
        "frequency": "Rare, pathogenic",
        "literature": "BRCA2 frameshift deletion (c.5946delT), causes loss of function",
        "impact_expected": "HIGH",
        "regions_expected": ["coding", "frameshift"],
    },
    ("chr11", 5227002, "T", "A"): {
        "name": "HBB E6V",
        "disease": "Sickle cell disease",
        "frequency": "Common in African populations",
        "literature": "Missense mutation (rs334) causing hemoglobin S polymerization",
        "impact_expected": "HIGH",
        "regions_expected": ["coding", "regulatory"],
    },
    ("chr17", 43092418, "T", "C"): {
        "name": "BRCA1 synonymous",
        "disease": "Benign control variant",
        "frequency": "Common",
        "literature": "Synonymous variant (c.3113A>G, rs16941), expected benign",
        "impact_expected": "LOW",
        "regions_expected": ["coding"],
    },
}

# Ordered (keywords, category) rules; the first rule with a matching keyword
# wins. BED keywords are matched against the lower-cased feature name.
_BED_RULES = (
    (("splice", "exon", "intron", "codon"), "bed_splicing"),
    (("cds", "coding"), "bed_coding"),
    (("promoter", "enhancer"), "bed_regulatory"),
    (("utr",), "bed_utr"),
)

# BigWig keywords are matched case-sensitively against the raw feature name.
_BIGWIG_RULES = (
    (("Histone",), "bw_histone"),
    (("RNA", "CAGE"), "bw_expression"),
    (("DNase", "ATAC"), "bw_accessibility"),
    (("ChIP-seq",), "bw_chipseq"),
)

# Matches a "BED Impact Score | <number>" cell in the summary text.
_BED_IMPACT_RE = re.compile(r"BED Impact Score\s*\|\s*([\d.]+)")

# A track whose |delta| reaches this value counts as a high-impact change.
HIGH_IMPACT_DELTA = 0.1


def categorize_track(feature_name, feature_type):
    """Categorize track type (BED or BigWig, and specific kind).

    Args:
        feature_name: Display name of the track.
        feature_type: ``"BED"`` or ``"BigWig"``; anything else is unknown.

    Returns:
        A ``bed_*`` or ``bw_*`` category string, ``"bed_other"`` /
        ``"bw_other"`` when no keyword matches, or ``"unknown"`` for an
        unrecognized ``feature_type``.
    """
    if feature_type == "BED":
        haystack, rules, fallback = feature_name.lower(), _BED_RULES, "bed_other"
    elif feature_type == "BigWig":
        haystack, rules, fallback = feature_name, _BIGWIG_RULES, "bw_other"
    else:
        return "unknown"

    for keywords, category in rules:
        if any(keyword in haystack for keyword in keywords):
            return category
    return fallback


def parse_bed_impact_score(summary_md):
    """Return the BED impact score found in *summary_md*, or ``None``.

    ``None`` is returned both when no score cell is present and when the
    captured text is not a valid number (the pattern also accepts stray
    dots such as ``"..."``).
    """
    match = _BED_IMPACT_RE.search(summary_md)
    if match is None:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        return None


def assess_expected_impact(expected, has_high_impacts):
    """Return the report line comparing expected and observed impact.

    Args:
        expected: Expected impact label (``"HIGH"``, ``"MODERATE"`` or
            ``"LOW"``).
        has_high_impacts: Whether any track delta reached
            ``HIGH_IMPACT_DELTA``.

    Returns:
        The line to print, or ``None`` for an unrecognized label.
    """
    if expected == "HIGH":
        if has_high_impacts:
            return " ✓ HIGH impact expected and observed"
        return " ⚠️ HIGH impact expected but weak observed"
    if expected in ("MODERATE", "LOW"):
        if has_high_impacts:
            return f" ⚠️ {expected} impact expected but strong deltas observed"
        return f" ✓ {expected} impact expected and observed (no extreme deltas)"
    return None


def _extract_gene_region(summary_md):
    """Return the text following the ``**Gene:**`` marker, or ``"N/A"``."""
    if "**Gene:**" not in summary_md:
        return "N/A"
    after_marker = summary_md.split("**Gene:**")[1]
    return after_marker.split("\n")[0].strip()


def _print_literature_context(ref_data):
    """Print the literature fields of one reference entry."""
    print("\n📚 LITERATURE CONTEXT:")
    print(f" Disease: {ref_data['disease']}")
    print(f" Frequency: {ref_data['frequency']}")
    print(f" Summary: {ref_data['literature']}")
    print(f" Expected Impact: {ref_data['impact_expected']}")


def _print_results_summary(summary_md, interpretation_md, top_table_df, ranked):
    """Print the headline numbers extracted from a prediction result."""
    impact_score = parse_bed_impact_score(summary_md)

    print("\n📊 RESULTS SUMMARY:")
    print(f" Gene/Region: {_extract_gene_region(summary_md)}")
    print(f" Top tracks: {len(top_table_df)} features")
    print(f" Total ranked: {len(ranked)} tracks")
    print(f" Interpretation panel present: {bool(interpretation_md)}")
    # Compare against None so that a genuine score of 0.0 is still shown.
    shown_score = impact_score if impact_score is not None else "N/A"
    print(f" Impact Score (BED): {shown_score}")


def _print_track_diversity(ranked):
    """Print per-category track counts and sources; return the counts."""
    print("\n🎯 TRACK DIVERSITY ANALYSIS:")

    track_categories = defaultdict(int)
    sources = set()
    for entry in ranked:
        feat = entry["display_name"]
        track_categories[categorize_track(feat, entry["track_type"])] += 1
        if "|" in feat:
            # The part before the first "|" is treated as the tissue/cell
            # source (original note: BigWig names carry tissue info).
            sources.add(feat.split("|")[0].strip())

    print("\n Track Categories:")
    for cat, count in sorted(track_categories.items(), key=lambda x: -x[1]):
        cat_display = cat.replace("bed_", "BED: ").replace("bw_", "BigWig: ")
        print(f" - {cat_display}: {count}")

    print(f"\n Tissue/Cell Sources Found: {len(sources)}")
    if sources:
        for source in sorted(sources)[:5]:  # Show first 5
            print(f" - {source}")
        if len(sources) > 5:
            print(f" ... and {len(sources) - 5} more")

    return track_categories


def _print_quality_assessment(ref_data, ranked, diversity_score):
    """Print the diversity, gain/loss and expected-impact checks."""
    print("\n✓ VARIANT QUALITY ASSESSMENT:")

    # Has diverse tracks
    if diversity_score >= 3:
        print(f" ✓ Good track diversity ({diversity_score} categories)")
    else:
        print(f" ⚠️ Limited track diversity ({diversity_score} categories)")

    # Has gains and losses
    gains = [r for r in ranked if r["delta"] > 0]
    losses = [r for r in ranked if r["delta"] < 0]
    if gains and losses:
        print(f" ✓ Shows both gains ({len(gains)}) and losses ({len(losses)})")
    elif gains:
        print(" ⚠️ Only shows gains, no losses")
    elif losses:
        print(" ⚠️ Only shows losses, no gains")
    else:
        print(" ✗ No gains or losses - may not be suitable example")

    # Expected impact matches actual
    has_high_impacts = any(abs(r["delta"]) >= HIGH_IMPACT_DELTA for r in ranked)
    verdict = assess_expected_impact(ref_data["impact_expected"], has_high_impacts)
    if verdict is not None:
        print(verdict)

    # Relevant to disease
    print(f" ✓ Disease-relevant example ({ref_data['disease']})")


def validate_example(chrom, pos, ref, alt):
    """Validate a single example variant and print a report.

    Args:
        chrom: Chromosome name, e.g. ``"chr17"``.
        pos: Position of the variant.
        ref: Reference allele.
        alt: Alternate allele.

    Returns:
        ``True`` once the report has been printed, or ``None`` when the
        variant has no entry in ``VARIANT_REFERENCES`` (nothing is printed
        and no prediction is run).

    Raises:
        ValueError: If the prediction result is not a 9-item sequence.
    """
    ref_data = VARIANT_REFERENCES.get((chrom, pos, ref, alt))
    if not ref_data:
        return None

    # Imported here rather than at module level so the categorization and
    # parsing helpers can be imported without loading the prediction app.
    from app import predict_single_variant

    print("\n" + "=" * 80)
    print(f"VARIANT: {ref_data['name']} ({chrom}:{pos} {ref}>{alt})")
    print("=" * 80)

    _print_literature_context(ref_data)

    print("\n🔬 Running prediction...")
    result = predict_single_variant(chrom, pos, ref, alt)
    # Unpack all nine items so an unexpected result shape fails loudly;
    # underscore-prefixed items are not used by this report.
    (
        summary_md,
        interpretation_md,
        top_table_df,
        _fp_fig,
        _rt_fig,
        _csv_path,
        _bed_df,
        _mlm_md,
        ranked,
    ) = result

    _print_results_summary(summary_md, interpretation_md, top_table_df, ranked)
    track_categories = _print_track_diversity(ranked)
    _print_quality_assessment(ref_data, ranked, len(track_categories))

    return True


# Validate all examples: every reference entry, in definition order.
examples = list(VARIANT_REFERENCES)


def main():
    """Validate every example variant and print a full report."""
    print("\n" + "=" * 80)
    print("COMPREHENSIVE EXAMPLE VALIDATION")
    print("=" * 80)

    for chrom, pos, ref, alt in examples:
        validate_example(chrom, pos, ref, alt)

    print("\n" + "=" * 80)
    print("VALIDATION COMPLETE")
    print("=" * 80)


if __name__ == "__main__":
    main()