Spaces:

GrimSqueaker
/

MAGI

Running

App Files Files Community

MAGI / validate_examples.py

GrimSqueaker

Initial deploy: MAGI variant interpreter (gradio_app)

30b7e77 verified about 1 month ago

Raw

History Blame Contribute Delete

7.73 kB

	#!/usr/bin/env python3
	"""
	Comprehensive validation of example variants:
	1. Literature accuracy
	2. Example diversity and quality
	3. Track type coverage in results
	"""

	import sys

	sys.path.insert(0, ".")

	from app import predict_single_variant
	import pandas as pd
	from collections import defaultdict
	import json

	# Known variants from literature.
	# Keys are (chrom, pos, ref, alt) tuples, the same shape validate_example()
	# builds for its lookup. Each value is a dict of descriptive metadata:
	#   name / disease / frequency / literature -> free text printed in the report
	#   impact_expected  -> "HIGH" or "LOW" here; compared with observed deltas
	#   regions_expected -> list of region labels (not read by the code in this file)
	# NOTE(review): the genome build for the coordinates is not stated here;
	# presumably GRCh38/hg38 -- TODO confirm.
	VARIANT_REFERENCES = {
	("chr17", 7675088, "C", "T"): {
	"name": "TP53 R175H",
	"disease": "Cancer (tumor suppressor)",
	"frequency": "~6% of cancers",
	"literature": "Highly common hotspot, eliminates DNA binding domain",
	"impact_expected": "HIGH",
	"regions_expected": ["coding", "regulatory"],
	},
	("chr7", 117559593, "ATCT", "A"): {
	"name": "CFTR F508del",
	"disease": "Cystic Fibrosis",
	"frequency": "~70% of CF patients",
	"literature": "Most common CF mutation, causes protein misfolding",
	"impact_expected": "HIGH",
	"regions_expected": ["coding", "structural"],
	},
	# NOTE(review): REF "AGAGA" -> ALT "AGA" removes two bases, while the
	# literature text below cites a single-base deletion (c.5946delT) --
	# confirm the allele representation.
	("chr13", 32332771, "AGAGA", "AGA"): {
	"name": "BRCA2 frameshift",
	"disease": "Hereditary breast/ovarian cancer",
	"frequency": "Rare, pathogenic",
	"literature": "BRCA2 frameshift deletion (c.5946delT), causes loss of function",
	"impact_expected": "HIGH",
	"regions_expected": ["coding", "frameshift"],
	},
	("chr11", 5227002, "T", "A"): {
	"name": "HBB E6V",
	"disease": "Sickle cell disease",
	"frequency": "Common in African populations",
	"literature": "Missense mutation (rs334) causing hemoglobin S polymerization",
	"impact_expected": "HIGH",
	"regions_expected": ["coding", "regulatory"],
	},
	# NOTE(review): c.3113 is the second base of a codon (3113 = 3*1037 + 2),
	# where a substitution normally changes the amino acid -- confirm that
	# "synonymous" is the right label for this benign control.
	("chr17", 43092418, "T", "C"): {
	"name": "BRCA1 synonymous",
	"disease": "Benign control variant",
	"frequency": "Common",
	"literature": "Synonymous variant (c.3113A>G, rs16941), expected benign",
	"impact_expected": "LOW",
	"regions_expected": ["coding"],
	},
	}


	def categorize_track(feature_name, feature_type):
	    """Map a track to a coarse category label.

	    Args:
	        feature_name: Display name of the track. BED names are matched
	            case-insensitively; BigWig names are matched case-sensitively.
	        feature_type: ``"BED"`` or ``"BigWig"``; any other value yields
	            ``"unknown"`` without inspecting ``feature_name``.

	    Returns:
	        One of the ``bed_*`` / ``bw_*`` labels, or ``"unknown"``. Rules are
	        tried in order and the first matching rule wins.
	    """
	    if feature_type == "BED":
	        lowered = feature_name.lower()
	        bed_rules = (
	            ("bed_splicing", ("splice", "exon", "intron", "codon")),
	            ("bed_coding", ("cds", "coding")),
	            ("bed_regulatory", ("promoter", "enhancer")),
	            ("bed_utr", ("utr", "5utr", "3utr")),
	        )
	        for label, keywords in bed_rules:
	            for keyword in keywords:
	                if keyword in lowered:
	                    return label
	        return "bed_other"

	    if feature_type == "BigWig":
	        bigwig_rules = (
	            ("bw_histone", ("Histone",)),
	            ("bw_expression", ("RNA", "CAGE")),
	            ("bw_accessibility", ("DNase", "ATAC")),
	            ("bw_chipseq", ("ChIP-seq",)),
	        )
	        for label, markers in bigwig_rules:
	            for marker in markers:
	                if marker in feature_name:
	                    return label
	        return "bw_other"

	    return "unknown"


	def _extract_bed_impact_score(summary_md):
	    """Return the BED impact score parsed from the summary markdown, or None."""
	    import re

	    # The pipe is escaped so it matches a literal "|" cell separator; an
	    # unescaped "|" is regex alternation and would match any whitespace
	    # followed by digits anywhere in the summary.
	    # NOTE(review): assumes the summary renders the score as
	    # "BED Impact Score | <number>" -- confirm against predict_single_variant.
	    match = re.search(r"BED Impact Score\s*\|\s*([\d.]+)", summary_md)
	    if match is None:
	        return None
	    try:
	        return float(match.group(1))
	    except ValueError:
	        # "[\d.]+" also accepts text such as "." or "1.2.3".
	        return None


	def _impact_verdict(expected, has_high_impacts):
	    """Return the report line comparing the expected impact with observed deltas."""
	    if expected == "HIGH":
	        if has_high_impacts:
	            return " ✓ HIGH impact expected and observed"
	        return " ⚠️ HIGH impact expected but weak observed"
	    if expected == "MODERATE":
	        if not has_high_impacts:
	            return " ✓ MODERATE impact expected and observed (no extreme deltas)"
	        return " ⚠️ MODERATE impact expected but extreme deltas observed"
	    if expected == "LOW":
	        if not has_high_impacts:
	            return " ✓ LOW impact expected and observed (no extreme deltas)"
	        return " ⚠️ LOW impact expected but strong deltas observed"
	    return f" ⚠️ Unrecognized expected impact {expected!r}; not compared"


	def validate_example(chrom, pos, ref, alt):
	    """Run the predictor on one reference variant and print a quality report.

	    Args:
	        chrom: Chromosome name, e.g. ``"chr17"``.
	        pos: Variant position (int).
	        ref: Reference allele string.
	        alt: Alternate allele string.

	    Returns:
	        ``None`` when ``(chrom, pos, ref, alt)`` has no entry in
	        ``VARIANT_REFERENCES`` (nothing is printed or predicted);
	        ``True`` once the report has been printed.

	    Side effects:
	        Calls ``predict_single_variant`` and writes the report to stdout.
	    """
	    ref_data = VARIANT_REFERENCES.get((chrom, pos, ref, alt))
	    if not ref_data:
	        return None

	    rule = "=" * 80
	    print("\n" + rule)
	    print(f"VARIANT: {ref_data['name']} ({chrom}:{pos} {ref}>{alt})")
	    print(rule)

	    # Literature context
	    print("\n📚 LITERATURE CONTEXT:")
	    print(f" Disease: {ref_data['disease']}")
	    print(f" Frequency: {ref_data['frequency']}")
	    print(f" Summary: {ref_data['literature']}")
	    print(f" Expected Impact: {ref_data['impact_expected']}")

	    # Run prediction. The result is unpacked as a 9-tuple; only the summary,
	    # interpretation, top table and ranked list are used below.
	    print("\n🔬 Running prediction...")
	    (
	        summary_md,
	        interpretation_md,
	        top_table_df,
	        _fp_fig,
	        _rt_fig,
	        _csv_path,
	        _bed_df,
	        _mlm_md,
	        ranked,
	    ) = predict_single_variant(chrom, pos, ref, alt)

	    impact_from_summary = _extract_bed_impact_score(summary_md)

	    print("\n📊 RESULTS SUMMARY:")
	    _, gene_marker, after_gene = summary_md.partition("Gene:")
	    gene_region = after_gene.split("\n", 1)[0].strip() if gene_marker else "N/A"
	    print(f" Gene/Region: {gene_region}")
	    print(f" Top tracks: {len(top_table_df)} features")
	    print(f" Total ranked: {len(ranked)} tracks")
	    print(f" Interpretation panel present: {bool(interpretation_md)}")
	    # Compare against None so that a genuine score of 0.0 is still shown.
	    score_display = "N/A" if impact_from_summary is None else impact_from_summary
	    print(f" Impact Score (BED): {score_display}")

	    # Track diversity analysis
	    print("\n🎯 TRACK DIVERSITY ANALYSIS:")

	    track_categories = defaultdict(int)
	    sources = set()

	    for r in ranked:
	        feat = r["display_name"]
	        track_categories[categorize_track(feat, r["track_type"])] += 1

	        # Names containing "|" are treated as "<tissue> | <assay>".
	        # The separator is a plain pipe; "\|" is not a valid string escape.
	        if "|" in feat:
	            sources.add(feat.split("|")[0].strip())

	    print("\n Track Categories:")
	    for cat, count in sorted(track_categories.items(), key=lambda x: -x[1]):
	        cat_display = cat.replace("bed_", "BED: ").replace("bw_", "BigWig: ")
	        print(f" - {cat_display}: {count}")

	    print(f"\n Tissue/Cell Sources Found: {len(sources)}")
	    for source in sorted(sources)[:5]:  # Show first 5
	        print(f" - {source}")
	    if len(sources) > 5:
	        print(f" ... and {len(sources) - 5} more")

	    # Quality assessment
	    print("\n✓ VARIANT QUALITY ASSESSMENT:")

	    # Has diverse tracks
	    diversity_score = len(track_categories)
	    if diversity_score >= 3:
	        print(f" ✓ Good track diversity ({diversity_score} categories)")
	    else:
	        print(f" ⚠️ Limited track diversity ({diversity_score} categories)")

	    # Has gains and losses
	    gains = [r for r in ranked if r["delta"] > 0]
	    losses = [r for r in ranked if r["delta"] < 0]
	    if gains and losses:
	        print(f" ✓ Shows both gains ({len(gains)}) and losses ({len(losses)})")
	    elif gains:
	        print(" ⚠️ Only shows gains, no losses")
	    elif losses:
	        print(" ⚠️ Only shows losses, no gains")
	    else:
	        print(" ✗ No gains or losses - may not be suitable example")

	    # Expected impact matches actual: any |delta| >= 0.1 counts as high impact.
	    has_high_impacts = any(abs(r["delta"]) >= 0.1 for r in ranked)
	    print(_impact_verdict(ref_data["impact_expected"], has_high_impacts))

	    # Relevant to disease
	    print(f" ✓ Disease-relevant example ({ref_data['disease']})")

	    return True


	# Validate all examples.
	# Each entry is a (chrom, pos, ref, alt) tuple, passed positionally to
	# validate_example() in the order listed.
	examples = [
	    ("chr17", 7675088, "C", "T"),
	    ("chr7", 117559593, "ATCT", "A"),
	    ("chr13", 32332771, "AGAGA", "AGA"),
	    ("chr11", 5227002, "T", "A"),
	    ("chr17", 43092418, "T", "C"),
	]

	# Opening banner: blank line, rule, title, rule.
	print("\n" + "=" * 80, "COMPREHENSIVE EXAMPLE VALIDATION", "=" * 80, sep="\n")

	for variant in examples:
	    validate_example(*variant)

	# Closing banner, same layout as the opening one.
	print("\n" + "=" * 80, "VALIDATION COMPLETE", "=" * 80, sep="\n")