Spaces:
Running
Running
| """Fetch curated ClinVar variants with surrounding genomic context. | |
| For each variant we fetch a WINDOW-bp window (currently 4002 bp; see WINDOW below) centered on the SNP from Ensembl, | |
| revcomp it onto the gene's coding strand if needed, and save ref + alt | |
| window + tokenization-aligned offset for the §2 (VEP) demo. | |
| Usage: | |
| python scripts/fetch_variants.py | |
| """ | |
| import json | |
| import os | |
| import sys | |
| import time | |
| import urllib.request | |
| import urllib.error | |
# Base URL of the Ensembl REST service; get_json() prepends it to every path.
ENSEMBL = "https://rest.ensembl.org"
# Window total length (bp), centered on the variant. The eval recipe in
# `vep_eval.py` uses 8,192 bp; we use a smaller window here to keep /score
# fast on the live endpoint, but well above the 60 bp the demo originally
# shipped with (which was too small to read variant signal — 3/3 pathogenic
# variants flipped sign or vanished into noise at 60 bp).
WINDOW = 4002  # multiple of 6 (6 * 667) for the 6-mer BPE tokenizer + matches HALF math (even, so HALF is exact)
HALF = WINDOW // 2  # variant placed at center; the window spans pos - HALF + 1 .. pos + HALF
# Curated set: famous pathogenic + risk + benign SNVs across several
# genes/diseases. Coordinates resolved via the Ensembl variation API; the
# plus-strand alt is the ClinVar canonical disease-associated allele for
# pathogenic / risk picks, or the single reported alt for benign.
#
# Spread: 6 Pathogenic + 1 Risk + 1 Benign. The two VHL rows (rs1575932011
# and rs182781943) are paired on purpose, same gene, opposite verdicts:
# a premature stop codon vs. a benign 3' UTR SNP.
#
# Fields, as consumed by fetch_variant():
#   rs       - dbSNP id, looked up via /variation/human/<rs>
#   gene     - gene symbol, looked up via /lookup/symbol to get the strand
#   plus_alt - alt allele on the plus strand; used to pick the Ensembl mapping
#   name, sig, blurb - display metadata, copied unchanged into the output
VARIANTS = [
    {"rs": "rs334", "gene": "HBB", "name": "HBB c.20A>T", "sig": "Pathogenic", "blurb": "sickle cell anemia · p.Glu6Val", "plus_alt": "A"},
    {"rs": "rs80359027", "gene": "BRCA2", "name": "BRCA2 c.7976G>T", "sig": "Pathogenic", "blurb": "hereditary breast/ovarian cancer · missense", "plus_alt": "T"},
    {"rs": "rs1057519981", "gene": "TP53", "name": "TP53 c.712T>A", "sig": "Pathogenic", "blurb": "Li-Fraumeni cancer · missense", "plus_alt": "T"},
    {"rs": "rs1603267420", "gene": "F9", "name": "F9 c.1186T>A", "sig": "Pathogenic", "blurb": "hemophilia B · clotting factor missense", "plus_alt": "A"},
    {"rs": "rs112029328", "gene": "LDLR", "name": "LDLR c.313+1G>T", "sig": "Pathogenic", "blurb": "familial high cholesterol · splice donor lost","plus_alt": "T"},
    {"rs": "rs1575932011", "gene": "VHL", "name": "VHL c.475A>T", "sig": "Pathogenic", "blurb": "Von Hippel-Lindau · premature STOP", "plus_alt": "T"},
    {"rs": "rs34637584", "gene": "LRRK2", "name": "LRRK2 c.6055G>A", "sig": "Risk", "blurb": "Parkinson's · G2019S kinase variant", "plus_alt": "A"},
    {"rs": "rs182781943", "gene": "VHL", "name": "VHL c.*820A>G", "sig": "Benign", "blurb": "common 3' UTR variant · same gene as row above","plus_alt": "G"},
]
def get_json(path):
    """GET ``ENSEMBL + path`` and return the decoded JSON payload.

    Args:
        path: Request path (including any query string) appended to the
            ENSEMBL base URL.

    Returns:
        The parsed JSON body (whatever ``json.loads`` produces for it).

    Raises:
        urllib.error.URLError / urllib.error.HTTPError: on network or HTTP
            failure (including the 30 s timeout).
        json.JSONDecodeError: if the body is not valid JSON.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the underlying connection is
    # closed deterministically instead of being left to garbage collection.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())
def revcomp(s):
    """Return the reverse complement of the DNA string ``s``.

    A/C/G/T are swapped with their complements in either case, N/n and any
    other character are left as they are, and the result is reversed.
    """
    complement = dict(zip("ACGTNacgtn", "TGCANtgcan"))
    return "".join(complement.get(base, base) for base in reversed(s))
def fetch_variant(v):
    """Resolve one curated variant to a gene-strand reference window.

    Looks up the rs ID in Ensembl, picks the first primary-chromosome SNV
    mapping whose alt set contains ``v["plus_alt"]``, fetches a WINDOW-bp
    plus-strand window around it, and orients the window and alleles onto
    the gene's strand.

    Args:
        v: One entry of VARIANTS (keys ``rs``, ``gene``, ``name``, ``sig``,
            ``blurb``, ``plus_alt``).

    Returns:
        dict with the display metadata copied from ``v`` plus ``chrom``,
        ``pos`` (1-based), ``gene_strand``, ``ref``/``alt`` (gene strand),
        ``ref_window`` (WINDOW bp, gene strand) and ``var_offset`` (0-based
        index of the variant inside ``ref_window``).

    Raises:
        RuntimeError: no usable mapping, the window would start before
            position 1, Ensembl returned a window of the wrong length, or
            the reference base does not match in either orientation.
        Any exception raised by get_json() propagates unchanged.
    """
    info = get_json(f"/variation/human/{v['rs']}")

    # Ensembl returns allele_string like "T/A/C/G" with ref first. Pick the
    # first mapping that has the expected plus-strand alt in its alt set.
    mapping = None
    for m in info.get("mappings", []):
        alleles = m.get("allele_string", "").split("/")
        # Only single-base alleles (rejects indels and empty strings).
        if not alleles or any(len(a) != 1 or a not in "ACGTN" for a in alleles):
            continue
        # Skip patch/alt contigs: keep numbered chromosomes, X, Y and MT.
        chrom = m.get("seq_region_name", "")
        if not (chrom.isdigit() or chrom in ("X", "Y", "MT")):
            continue
        if v["plus_alt"] in alleles[1:]:
            mapping = m
            break
    if mapping is None:
        raise RuntimeError(f"{v['rs']}: no SNV mapping containing alt {v['plus_alt']}")

    ref = mapping["allele_string"].split("/")[0]
    alt = v["plus_alt"]
    chrom = mapping["seq_region_name"]
    pos = mapping["start"]  # 1-based

    # Fetch a window centered on the variant: pos - HALF + 1 .. pos + HALF
    win_start = pos - HALF + 1  # 1-based
    win_end = pos + HALF  # 1-based, inclusive
    if win_start < 1:
        # Coordinates are 1-based; fail with a clear message instead of
        # sending an invalid region to the server.
        raise RuntimeError(
            f"{v['rs']}: window start {win_start} precedes the start of chromosome {chrom}"
        )
    seq_data = get_json(f"/sequence/region/human/{chrom}:{win_start}..{win_end}:1?content-type=application/json")
    plus_seq = seq_data["seq"].upper()
    # Validate remote data with a real exception: `assert` is stripped
    # under `python -O`, which would let a short window through silently.
    if len(plus_seq) != WINDOW:
        raise RuntimeError(
            f"{v['rs']}: expected a {WINDOW} bp window, got {len(plus_seq)} bp"
        )
    var_offset_plus = pos - win_start  # 0-based offset into plus_seq
    sig = v["sig"]

    # Determine gene's strand for orientation.
    g = get_json(f"/lookup/symbol/homo_sapiens/{v['gene']}")
    gene_strand = g["strand"]
    if gene_strand == 1:
        seq = plus_seq
        var_offset = var_offset_plus
        ref_g, alt_g = ref, alt
    else:
        # Minus-strand gene: flip the window, mirror the offset, and
        # complement the alleles so everything reads along the gene.
        seq = revcomp(plus_seq)
        var_offset = WINDOW - 1 - var_offset_plus
        ref_g, alt_g = revcomp(ref), revcomp(alt)

    # Final check: seq[var_offset] should equal ref_g
    if seq[var_offset] != ref_g:
        # Try the other orientation
        # (this happens when Ensembl reports allele_string on gene strand for - strand genes)
        if seq[var_offset] == revcomp(ref_g):
            ref_g, alt_g = revcomp(ref_g), revcomp(alt_g)
        else:
            raise RuntimeError(
                f"{v['rs']}: ref mismatch — seq[{var_offset}]={seq[var_offset]} expected {ref_g} (or {revcomp(ref_g)})"
            )

    return {
        "rs": v["rs"],
        "gene": v["gene"],
        "name": v["name"],
        "sig": sig,
        "blurb": v["blurb"],
        "chrom": chrom,
        "pos": pos,
        "gene_strand": gene_strand,
        "ref": ref_g,
        "alt": alt_g,
        "ref_window": seq,  # full WINDOW bp on gene strand
        "var_offset": var_offset,  # 0-based
    }
def main():
    """Fetch every entry of VARIANTS and write the results to data/variants.json.

    Each variant is fetched best-effort: a failure is reported on stderr and
    the variant is skipped, so the output may hold fewer entries than
    VARIANTS. The file is written to ``<parent of this script's dir>/data``.
    """
    out = []
    for v in VARIANTS:
        print(f"fetching {v['rs']} ({v['gene']})…", flush=True)
        try:
            data = fetch_variant(v)
            # Preview: up to 5 bases either side of the variant base.
            offset = data["var_offset"]
            window = data["ref_window"]
            left = window[max(0, offset - 5):offset]
            right = window[offset + 1:offset + 6]
            print(f" ✓ {data['ref']}>{data['alt']} at offset {offset} | window: …{left}[{window[offset]}]{right}…")
            out.append(data)
            time.sleep(0.4)  # brief pause between variants to go easy on the REST service
        except Exception as e:
            # Top-level boundary: report and continue with the next variant.
            print(f" ✗ FAILED — {e}", file=sys.stderr)

    here = os.path.dirname(os.path.abspath(__file__))
    out_path = os.path.join(os.path.dirname(here), "data", "variants.json")
    # Create the data directory if needed, so a fresh checkout does not fail
    # on open() after all the fetching has already been done.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    print(f"\nwrote {out_path} ({os.path.getsize(out_path)/1024:.1f} KB) — {len(out)} variants")
# Run the fetch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()