Spaces:

HuggingFaceBio
/

carbon-demo

Running

App Files Files Community

carbon-demo / scripts /fetch_variants.py

lvwerra HF Staff

§3 VEP: replace variant set with audited 8-pick spread

41739ad 17 days ago

raw

history blame contribute delete

7.43 kB

	"""Fetch curated ClinVar variants with surrounding genomic context.

	For each variant we fetch a WINDOW-bp window (currently 4,002 bp) centered on
	the SNP from Ensembl, revcomp it onto the gene's coding strand if needed, and
	save the ref and alt alleles, the reference window, and the variant's 0-based
	offset into that window for the §2 (VEP) demo.

	Usage:
	python scripts/fetch_variants.py
	"""
	import json
	import os
	import sys
	import time
	import urllib.request
	import urllib.error

	# Base URL of the Ensembl REST API; get_json() appends the request path to it.
	ENSEMBL = "https://rest.ensembl.org"
	# Total window length (bp), centered on the variant. The eval recipe in
	# `vep_eval.py` uses 8,192 bp; we use a smaller window here to keep /score
	# fast on the live endpoint, but well above the 60 bp the demo originally
	# shipped with (which was too small to read variant signal: 3/3 pathogenic
	# variants flipped sign or vanished into noise at 60 bp).
	# 4002 = 6 * 667, so it is a multiple of 6 for the 6-mer BPE tokenizer, and
	# it is even, so 2 * HALF == WINDOW exactly.
	WINDOW = 4002 # multiple of 6 for the 6-mer BPE tokenizer; even, so WINDOW == 2 * HALF
	# fetch_variant() requests pos - HALF + 1 .. pos + HALF (1-based, inclusive),
	# which puts the variant at 0-based offset HALF - 1 of the plus-strand window.
	HALF = WINDOW // 2 # half-window used to center the variant

	# Curated demo set: well-known pathogenic, risk and benign SNVs drawn from
	# several genes/diseases. Coordinates are resolved at fetch time through the
	# Ensembl variation API. `plus_alt` is the plus-strand alternate allele: the
	# ClinVar canonical disease-associated allele for pathogenic / risk picks,
	# or the single reported alt for the benign pick.
	#
	# Spread: 6 Pathogenic + 1 Risk + 1 Benign. The two VHL rows (rs1575932011
	# and rs182781943) are deliberately paired: same gene, opposite verdicts
	# (a premature stop codon vs. a benign 3' UTR SNP).
	#
	# Each row below is (rs, gene, name, sig, blurb, plus_alt); rows are zipped
	# onto those field names so every entry is a plain dict with that key order.
	VARIANTS = [
	    dict(zip(("rs", "gene", "name", "sig", "blurb", "plus_alt"), row))
	    for row in (
	        ("rs334", "HBB", "HBB c.20A>T", "Pathogenic",
	         "sickle cell anemia · p.Glu6Val", "A"),
	        ("rs80359027", "BRCA2", "BRCA2 c.7976G>T", "Pathogenic",
	         "hereditary breast/ovarian cancer · missense", "T"),
	        ("rs1057519981", "TP53", "TP53 c.712T>A", "Pathogenic",
	         "Li-Fraumeni cancer · missense", "T"),
	        ("rs1603267420", "F9", "F9 c.1186T>A", "Pathogenic",
	         "hemophilia B · clotting factor missense", "A"),
	        ("rs112029328", "LDLR", "LDLR c.313+1G>T", "Pathogenic",
	         "familial high cholesterol · splice donor lost", "T"),
	        ("rs1575932011", "VHL", "VHL c.475A>T", "Pathogenic",
	         "Von Hippel-Lindau · premature STOP", "T"),
	        ("rs34637584", "LRRK2", "LRRK2 c.6055G>A", "Risk",
	         "Parkinson's · G2019S kinase variant", "A"),
	        ("rs182781943", "VHL", "VHL c.*820A>G", "Benign",
	         "common 3' UTR variant · same gene as row above", "G"),
	    )
	]


	def get_json(path):
	    """GET ``ENSEMBL + path`` and return the decoded JSON body.

	    Args:
	        path: request path (including any query string) appended verbatim to
	            the ENSEMBL base URL; expected to start with "/".

	    Returns:
	        The parsed JSON document (whatever ``json.loads`` yields for the body).

	    Raises:
	        urllib.error.URLError: on connection failure or timeout; HTTP error
	            statuses surface as its subclass ``urllib.error.HTTPError``.
	        ValueError: if the response body is not valid JSON.
	    """
	    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
	    # Use the response as a context manager so the underlying connection is
	    # closed deterministically instead of being left to garbage collection.
	    with urllib.request.urlopen(req, timeout=30) as resp:
	        return json.loads(resp.read())


	def revcomp(s):
	    """Return the reverse complement of the nucleotide string *s*.

	    A/C/G/T are swapped with their complements in either case, N/n map to
	    themselves, and any other character is carried through unchanged. The
	    result is the complemented sequence read back to front.
	    """
	    complement = {
	        "A": "T", "C": "G", "G": "C", "T": "A", "N": "N",
	        "a": "t", "c": "g", "g": "c", "t": "a", "n": "n",
	    }
	    return "".join(complement.get(base, base) for base in reversed(s))


	def fetch_variant(v):
	    """Resolve one curated variant to a reference window on its gene's strand.

	    Looks up the rsID's genomic mappings, keeps the first single-base mapping
	    on a primary chromosome whose alternate alleles include ``v["plus_alt"]``,
	    fetches the WINDOW-bp plus-strand sequence around that position, and
	    reverse-complements it when the gene's strand is not 1.

	    Args:
	        v: one VARIANTS entry; the keys "rs", "gene", "name", "sig", "blurb"
	            and "plus_alt" are read.

	    Returns:
	        A dict with keys "rs", "gene", "name", "sig", "blurb", "chrom", "pos"
	        (1-based), "gene_strand", "ref" and "alt" (single bases in the same
	        orientation as the window), "ref_window" (WINDOW bases) and
	        "var_offset" (0-based index of the variant inside "ref_window").

	    Raises:
	        RuntimeError: if no usable mapping contains the expected alt, if the
	            fetched sequence is not exactly WINDOW bases long, or if the base
	            at the variant position matches the reference allele in neither
	            orientation.
	    """
	    info = get_json(f"/variation/human/{v['rs']}")
	    # Ensembl returns allele_string like "T/A/C/G" with ref first. Pick the
	    # first mapping that has the expected plus-strand alt in its alt set.
	    mapping = None
	    for m in info.get("mappings", []):
	        alleles = m.get("allele_string", "").split("/")
	        # Only single-base alleles (this also drops a missing/empty allele_string).
	        if not alleles or any(len(a) != 1 or a not in "ACGTN" for a in alleles):
	            continue
	        # Skip patch/alt contigs: keep numbered chromosomes, X, Y and MT only.
	        chrom = m.get("seq_region_name", "")
	        if not (chrom.isdigit() or chrom in ("X", "Y", "MT")):
	            continue
	        if v["plus_alt"] in alleles[1:]:
	            mapping = m
	            break
	    if not mapping:
	        raise RuntimeError(f"{v['rs']}: no SNV mapping containing alt {v['plus_alt']}")
	    alleles = mapping["allele_string"].split("/")
	    ref = alleles[0]
	    alt = v["plus_alt"]
	    chrom = mapping["seq_region_name"]
	    pos = mapping["start"]  # 1-based

	    # Fetch a window centered on the variant: pos - HALF + 1 .. pos + HALF
	    win_start = pos - HALF + 1  # 1-based
	    win_end = pos + HALF  # 1-based, inclusive
	    seq_data = get_json(f"/sequence/region/human/{chrom}:{win_start}..{win_end}:1?content-type=application/json")
	    plus_seq = seq_data["seq"].upper()
	    # Explicit check rather than `assert`, so it still runs under `python -O`;
	    # every offset computed below relies on the window being exactly WINDOW long.
	    if len(plus_seq) != WINDOW:
	        raise RuntimeError(
	            f"{v['rs']}: expected a {WINDOW} bp window, got {len(plus_seq)} bp"
	        )
	    var_offset_plus = pos - win_start  # 0-based offset into plus_seq

	    # Determine gene's strand for orientation.
	    g = get_json(f"/lookup/symbol/homo_sapiens/{v['gene']}")
	    gene_strand = g["strand"]

	    if gene_strand == 1:
	        seq = plus_seq
	        var_offset = var_offset_plus
	        ref_g, alt_g = ref, alt
	    else:
	        # Put window, offset and alleles onto the gene's strand: reversing the
	        # sequence mirrors the index, and each allele is complemented.
	        seq = revcomp(plus_seq)
	        var_offset = WINDOW - 1 - var_offset_plus
	        ref_g, alt_g = revcomp(ref), revcomp(alt)

	    # Final check: seq[var_offset] should equal ref_g
	    if seq[var_offset] != ref_g:
	        # Try the other orientation
	        # (this happens when Ensembl reports allele_string on gene strand for - strand genes)
	        if seq[var_offset] == revcomp(ref_g):
	            ref_g, alt_g = revcomp(ref_g), revcomp(alt_g)
	        else:
	            raise RuntimeError(
	                f"{v['rs']}: ref mismatch — seq[{var_offset}]={seq[var_offset]} expected {ref_g} (or {revcomp(ref_g)})"
	            )

	    return {
	        "rs": v["rs"],
	        "gene": v["gene"],
	        "name": v["name"],
	        "sig": v["sig"],
	        "blurb": v["blurb"],
	        "chrom": chrom,
	        "pos": pos,
	        "gene_strand": gene_strand,
	        "ref": ref_g,
	        "alt": alt_g,
	        "ref_window": seq,  # full WINDOW bp on gene strand
	        "var_offset": var_offset,  # 0-based
	    }


	def main():
	    """Fetch every entry of VARIANTS and write the results to data/variants.json.

	    Fetching is best-effort: a variant that fails is reported on stderr and
	    left out, and the remaining variants are still written. The output path
	    is ``<parent of this script's directory>/data/variants.json``.
	    """
	    out = []
	    for v in VARIANTS:
	        print(f"fetching {v['rs']} ({v['gene']})…", flush=True)
	        try:
	            data = fetch_variant(v)
	        except Exception as e:
	            # Deliberately broad: one bad variant (network error, unexpected
	            # payload, ref mismatch) must not abort the rest of the batch.
	            print(f" ✗ FAILED — {e}", file=sys.stderr)
	        else:
	            win = data["ref_window"]
	            off = data["var_offset"]
	            # Show the variant base in brackets with up to 5 bases either side.
	            print(f" ✓ {data['ref']}>{data['alt']} at offset {off} | window: …{win[max(0, off - 5):off]}[{win[off]}]{win[off + 1:off + 6]}…")
	            out.append(data)
	        finally:
	            # Pause after every variant, failed or not, so a failing request
	            # is not immediately followed by the next burst of requests.
	            time.sleep(0.4)
	    here = os.path.dirname(os.path.abspath(__file__))
	    out_path = os.path.join(os.path.dirname(here), "data", "variants.json")
	    # Create the data directory on a fresh checkout instead of failing in open().
	    os.makedirs(os.path.dirname(out_path), exist_ok=True)
	    with open(out_path, "w", encoding="utf-8") as f:
	        json.dump(out, f, indent=2)
	    print(f"\nwrote {out_path} ({os.path.getsize(out_path)/1024:.1f} KB) — {len(out)} variants")


	if __name__ == "__main__":
	    main()