# carbon-demo/scripts/fetch_variants.py
# Commit 41739ad (lvwerra, HF Staff): §3 VEP: replace variant set with audited 8-pick spread
"""Fetch curated ClinVar variants with surrounding genomic context.

For each variant we fetch a ``WINDOW``-bp window (see the constant below; the
demo originally used 60 bp) centered on the SNP from Ensembl, revcomp it onto
the gene's coding strand if needed, and save the reference window, the
ref/alt alleles on that strand, and the 0-based variant offset for the
§2 (VEP) demo. Results are written to ``data/variants.json``.

Usage:
    python scripts/fetch_variants.py
"""
import json
import os
import sys
import time
import urllib.request
import urllib.error
ENSEMBL = "https://rest.ensembl.org"
# Window total length (bp), centered on the variant. The eval recipe in
# `vep_eval.py` uses 8,192 bp; we use a smaller window here to keep /score
# fast on the live endpoint, but well above the 60 bp the demo originally
# shipped with (which was too small to read variant signal — 3/3 pathogenic
# variants flipped sign or vanished into noise at 60 bp).
WINDOW = 4002 # multiple of 6 for the 6-mer BPE tokenizer + matches HALF math
HALF = WINDOW // 2 # variant placed at center
# Curated set: famous pathogenic + risk + benign SNVs across several
# genes/diseases. Coordinates resolved via the Ensembl variation API; the
# plus-strand alt is the ClinVar canonical disease-associated allele for
# pathogenic / risk picks, or the single reported alt for benign.
#
# Spread: 6 Pathogenic + 1 Risk + 1 Benign. The two VHL rows (rs1575932011
# and rs182781943) are paired on purpose, same gene, opposite verdicts:
# a premature stop codon vs. a benign 3' UTR SNP.
VARIANTS = [
{"rs": "rs334", "gene": "HBB", "name": "HBB c.20A>T", "sig": "Pathogenic", "blurb": "sickle cell anemia · p.Glu6Val", "plus_alt": "A"},
{"rs": "rs80359027", "gene": "BRCA2", "name": "BRCA2 c.7976G>T", "sig": "Pathogenic", "blurb": "hereditary breast/ovarian cancer · missense", "plus_alt": "T"},
{"rs": "rs1057519981", "gene": "TP53", "name": "TP53 c.712T>A", "sig": "Pathogenic", "blurb": "Li-Fraumeni cancer · missense", "plus_alt": "T"},
{"rs": "rs1603267420", "gene": "F9", "name": "F9 c.1186T>A", "sig": "Pathogenic", "blurb": "hemophilia B · clotting factor missense", "plus_alt": "A"},
{"rs": "rs112029328", "gene": "LDLR", "name": "LDLR c.313+1G>T", "sig": "Pathogenic", "blurb": "familial high cholesterol · splice donor lost","plus_alt": "T"},
{"rs": "rs1575932011", "gene": "VHL", "name": "VHL c.475A>T", "sig": "Pathogenic", "blurb": "Von Hippel-Lindau · premature STOP", "plus_alt": "T"},
{"rs": "rs34637584", "gene": "LRRK2", "name": "LRRK2 c.6055G>A", "sig": "Risk", "blurb": "Parkinson's · G2019S kinase variant", "plus_alt": "A"},
{"rs": "rs182781943", "gene": "VHL", "name": "VHL c.*820A>G", "sig": "Benign", "blurb": "common 3' UTR variant · same gene as row above","plus_alt": "G"},
]
def get_json(path):
    """GET ``ENSEMBL + path`` and return the parsed JSON body.

    Args:
        path: Request path (including any query string) appended verbatim to
            the ``ENSEMBL`` base URL.

    Returns:
        The decoded JSON document.

    Raises:
        urllib.error.URLError: on connection failures or timeouts; the
            ``HTTPError`` subclass is raised for non-2xx responses.
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the connection is closed
    # promptly instead of being left for the garbage collector.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
def revcomp(s):
    """Return the reverse complement of DNA string ``s``.

    A/C/G/T/N are complemented in either case (case is preserved); any other
    character is passed through unchanged.
    """
    complement = str.maketrans("ACGTNacgtn", "TGCANtgcan")
    # Reversing first and complementing second gives the same result as the
    # other order, because translation is per-character.
    return s[::-1].translate(complement)
def fetch_variant(v):
    """Resolve one curated variant to a reference window on its gene's strand.

    Makes three Ensembl requests via ``get_json``: the variation record for
    the rsID, the plus-strand sequence window around the variant, and the
    gene lookup that supplies the gene's strand.

    Args:
        v: One entry of ``VARIANTS``. Reads the keys ``"rs"``, ``"gene"``,
            ``"name"``, ``"sig"``, ``"blurb"`` and ``"plus_alt"``.

    Returns:
        A dict with keys ``rs``, ``gene``, ``name``, ``sig``, ``blurb``,
        ``chrom``, ``pos``, ``gene_strand``, ``ref``, ``alt``,
        ``ref_window`` (``WINDOW`` bases on the gene strand) and
        ``var_offset`` (0-based index of the variant in ``ref_window``).

    Raises:
        RuntimeError: if no suitable SNV mapping contains ``v["plus_alt"]``,
            if the window would start before coordinate 1, if Ensembl returns
            a sequence of unexpected length, or if the base at the variant
            position matches the reference allele in neither orientation.
    """
    info = get_json(f"/variation/human/{v['rs']}")

    # Ensembl returns allele_string like "T/A/C/G" with ref first. Pick the
    # first mapping that has the expected plus-strand alt in its alt set.
    mapping = None
    for m in info.get("mappings", []):
        alleles = m.get("allele_string", "").split("/")
        # Only single-base alleles (a missing allele_string yields [""] and
        # is rejected here too).
        if any(len(a) != 1 or a not in "ACGTN" for a in alleles):
            continue
        # Skip patch/alt contigs: keep only numbered chromosomes, X, Y, MT.
        chrom = m.get("seq_region_name", "")
        if not (chrom.isdigit() or chrom in ("X", "Y", "MT")):
            continue
        if v["plus_alt"] in alleles[1:]:
            mapping = m
            break
    if not mapping:
        raise RuntimeError(f"{v['rs']}: no SNV mapping containing alt {v['plus_alt']}")

    ref = mapping["allele_string"].split("/")[0]
    alt = v["plus_alt"]
    chrom = mapping["seq_region_name"]
    pos = mapping["start"]  # 1-based

    # Fetch a window centered on the variant: pos - HALF + 1 .. pos + HALF
    # (both 1-based, inclusive), i.e. exactly 2 * HALF bases.
    win_start = pos - HALF + 1
    win_end = pos + HALF
    if win_start < 1:
        raise RuntimeError(
            f"{v['rs']}: window {chrom}:{win_start}..{win_end} starts before the first base"
        )
    seq_data = get_json(
        f"/sequence/region/human/{chrom}:{win_start}..{win_end}:1?content-type=application/json"
    )
    plus_seq = seq_data["seq"].upper()
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(plus_seq) != WINDOW:
        raise RuntimeError(
            f"{v['rs']}: expected {WINDOW} bp for {chrom}:{win_start}..{win_end}, got {len(plus_seq)}"
        )
    var_offset_plus = pos - win_start  # 0-based offset into plus_seq (= HALF - 1)

    # Determine the gene's strand for orientation.
    g = get_json(f"/lookup/symbol/homo_sapiens/{v['gene']}")
    gene_strand = g["strand"]
    if gene_strand == 1:
        seq = plus_seq
        var_offset = var_offset_plus
        ref_g, alt_g = ref, alt
    else:
        # Any strand other than +1: flip the window and mirror the offset so
        # it still points at the variant base.
        seq = revcomp(plus_seq)
        var_offset = WINDOW - 1 - var_offset_plus
        ref_g, alt_g = revcomp(ref), revcomp(alt)

    # Final check: seq[var_offset] should equal ref_g. If it equals the
    # complement instead, the alleles were reported in the other orientation
    # (the original author observed this when Ensembl reports allele_string
    # on the gene strand for minus-strand genes), so complement both alleles.
    if seq[var_offset] != ref_g:
        if seq[var_offset] == revcomp(ref_g):
            ref_g, alt_g = revcomp(ref_g), revcomp(alt_g)
        else:
            raise RuntimeError(
                f"{v['rs']}: ref mismatch — seq[{var_offset}]={seq[var_offset]} expected {ref_g} (or {revcomp(ref_g)})"
            )

    return {
        "rs": v["rs"],
        "gene": v["gene"],
        "name": v["name"],
        "sig": v["sig"],
        "blurb": v["blurb"],
        "chrom": chrom,
        "pos": pos,
        "gene_strand": gene_strand,
        "ref": ref_g,
        "alt": alt_g,
        "ref_window": seq,  # full WINDOW bp on gene strand
        "var_offset": var_offset,  # 0-based
    }
def main():
    """Fetch every entry of ``VARIANTS`` and write them to ``data/variants.json``.

    The output path is ``<parent of this script's directory>/data/variants.json``.
    Fetching is best-effort: a variant that fails is reported on stderr and
    skipped, and the file is written with whatever succeeded. Failed rsIDs
    are summarised on stderr at the end.
    """
    out = []
    failed = []
    for v in VARIANTS:
        print(f"fetching {v['rs']} ({v['gene']})…", flush=True)
        try:
            data = fetch_variant(v)
        except Exception as e:
            # Deliberate top-level boundary: one bad variant must not abort
            # the whole run.
            print(f" ✗ FAILED — {e}", file=sys.stderr)
            failed.append(v["rs"])
            continue
        off = data["var_offset"]
        win = data["ref_window"]
        # Show up to 5 bases of context either side of the variant base.
        print(f" ✓ {data['ref']}>{data['alt']} at offset {off} | window: …{win[max(0, off - 5):off]}[{win[off]}]{win[off + 1:off + 6]}…")
        out.append(data)
        time.sleep(0.4)  # pause between variants to go easy on the public API

    here = os.path.dirname(os.path.abspath(__file__))
    out_dir = os.path.join(os.path.dirname(here), "data")
    # Create the directory if needed so the fetched results are not lost to a
    # FileNotFoundError at the very last step.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "variants.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    print(f"\nwrote {out_path} ({os.path.getsize(out_path)/1024:.1f} KB) — {len(out)} variants")
    if failed:
        print(f"{len(failed)} variant(s) failed and were skipped: {', '.join(failed)}", file=sys.stderr)
# Script entry point: run the fetch only when executed directly, not on import.
if __name__ == "__main__":
    main()