Spaces:
Running
Running
File size: 7,433 Bytes
971b586 1d3e72f 971b586 41739ad 971b586 41739ad 971b586 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | """Fetch curated ClinVar variants with surrounding genomic context.
For each variant we fetch a WINDOW-bp window (currently 4,002 bp) centered on
the SNP from Ensembl, revcomp it onto the gene's strand if needed, and save the
reference window, the ref/alt alleles, and the variant's 0-based offset in that
window for the §2 (VEP) demo.
Usage:
python scripts/fetch_variants.py
"""
import json
import os
import sys
import time
import urllib.request
import urllib.error
# Base URL of the Ensembl REST API; get_json() appends the request path to it.
ENSEMBL = "https://rest.ensembl.org"
# Window total length (bp), centered on the variant. The eval recipe in
# `vep_eval.py` uses 8,192 bp; we use a smaller window here to keep /score
# fast on the live endpoint, but well above the 60 bp the demo originally
# shipped with (which was too small to read variant signal — 3/3 pathogenic
# variants flipped sign or vanished into noise at 60 bp).
# Must stay even so that WINDOW == 2 * HALF (fetch_variant checks that the
# fetched sequence is exactly WINDOW long).
WINDOW = 4002  # multiple of 6 for the 6-mer BPE tokenizer + matches HALF math
# fetch_variant requests pos - HALF + 1 .. pos + HALF, so on the plus strand
# the variant sits at 0-based offset HALF - 1 of the window.
HALF = WINDOW // 2  # variant placed at center
# Curated set: famous pathogenic + risk + benign SNVs across several
# genes/diseases. Coordinates resolved via the Ensembl variation API; the
# plus-strand alt is the ClinVar canonical disease-associated allele for
# pathogenic / risk picks, or the single reported alt for benign.
#
# Spread: 6 Pathogenic + 1 Risk + 1 Benign. The two VHL rows (rs1575932011
# and rs182781943) are paired on purpose, same gene, opposite verdicts:
# a premature stop codon vs. a benign 3' UTR SNP.
#
# Fields, as consumed by fetch_variant():
#   rs        identifier queried at /variation/human/{rs}
#   gene      symbol queried at /lookup/symbol/homo_sapiens/{gene} for strand
#   name      copied verbatim into the output record
#   sig       copied verbatim into the output record
#   blurb     copied verbatim into the output record
#   plus_alt  alt allele; a mapping is accepted only if this base appears
#             among the non-reference alleles of its allele_string
VARIANTS = [
{"rs": "rs334", "gene": "HBB", "name": "HBB c.20A>T", "sig": "Pathogenic", "blurb": "sickle cell anemia · p.Glu6Val", "plus_alt": "A"},
{"rs": "rs80359027", "gene": "BRCA2", "name": "BRCA2 c.7976G>T", "sig": "Pathogenic", "blurb": "hereditary breast/ovarian cancer · missense", "plus_alt": "T"},
{"rs": "rs1057519981", "gene": "TP53", "name": "TP53 c.712T>A", "sig": "Pathogenic", "blurb": "Li-Fraumeni cancer · missense", "plus_alt": "T"},
{"rs": "rs1603267420", "gene": "F9", "name": "F9 c.1186T>A", "sig": "Pathogenic", "blurb": "hemophilia B · clotting factor missense", "plus_alt": "A"},
{"rs": "rs112029328", "gene": "LDLR", "name": "LDLR c.313+1G>T", "sig": "Pathogenic", "blurb": "familial high cholesterol · splice donor lost","plus_alt": "T"},
{"rs": "rs1575932011", "gene": "VHL", "name": "VHL c.475A>T", "sig": "Pathogenic", "blurb": "Von Hippel-Lindau · premature STOP", "plus_alt": "T"},
{"rs": "rs34637584", "gene": "LRRK2", "name": "LRRK2 c.6055G>A", "sig": "Risk", "blurb": "Parkinson's · G2019S kinase variant", "plus_alt": "A"},
{"rs": "rs182781943", "gene": "VHL", "name": "VHL c.*820A>G", "sig": "Benign", "blurb": "common 3' UTR variant · same gene as row above","plus_alt": "G"},
]
def get_json(path):
    """GET ``ENSEMBL + path`` and return the decoded JSON body.

    The response object is closed deterministically, and an HTTP 429
    (rate-limited) reply is retried a bounded number of times, waiting for
    the server's ``Retry-After`` hint when one is supplied.

    Args:
        path: Request path (including any query string), starting with "/".

    Returns:
        The parsed JSON document.

    Raises:
        urllib.error.HTTPError: Non-success status (for 429, only after the
            retries are exhausted).
        urllib.error.URLError: Connection failure.
        OSError: Socket-level failure or timeout while reading.
        ValueError: The body is not valid JSON.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            # `with` closes the connection even if read()/loads() raises.
            with urllib.request.urlopen(req, timeout=30) as resp:
                return json.loads(resp.read())
        except urllib.error.HTTPError as err:
            if err.code != 429 or attempt == max_attempts:
                raise
            headers = err.headers
            raw_delay = headers.get("Retry-After") if headers is not None else None
            try:
                delay = float(raw_delay)
            except (TypeError, ValueError):
                delay = 1.0
            err.close()
            # Cap the wait so a large Retry-After cannot stall the script;
            # if the cap is too short the next attempt simply fails again.
            time.sleep(min(max(delay, 0.0), 30.0))
def revcomp(s):
    """Return the reverse complement of the DNA string *s*.

    A/C/G/T are swapped with their complements in either case, N/n map to
    themselves, and any other character is passed through unchanged.
    """
    complement_table = str.maketrans("ACGTNacgtn", "TGCANtgcan")
    reversed_seq = s[::-1]
    return reversed_seq.translate(complement_table)
def fetch_variant(v):
    """Resolve one curated variant into a reference window on the gene's strand.

    Steps:
      1. Query ``/variation/human/{rs}`` and take the first mapping that is a
         single-base variant on a primary chromosome (numeric, X, Y or MT)
         and lists ``v["plus_alt"]`` among its non-reference alleles.
      2. Fetch WINDOW bp of plus-strand sequence, ``pos - HALF + 1 .. pos + HALF``.
      3. Query ``/lookup/symbol/homo_sapiens/{gene}``; if the reported strand
         is not 1, reverse-complement the window and alleles and mirror the
         offset.
      4. Verify the base at the offset equals the reference allele, accepting
         the complemented alleles as a fallback.

    Args:
        v: One entry of VARIANTS; the keys "rs", "gene", "name", "sig",
            "blurb" and "plus_alt" are read.

    Returns:
        A dict with keys "rs", "gene", "name", "sig", "blurb", "chrom",
        "pos", "gene_strand", "ref", "alt", "ref_window" (WINDOW bases,
        upper-case, oriented as described above) and "var_offset" (0-based
        index of the variant inside "ref_window").

    Raises:
        RuntimeError: No usable mapping was found, the fetched sequence is
            not WINDOW bases long, or the window base matches neither the
            reference allele nor its complement.

    Errors raised by get_json() and KeyError for fields missing from the
    responses propagate unchanged.
    """
    info = get_json(f"/variation/human/{v['rs']}")

    # Ensembl returns allele_string like "T/A/C/G" with ref first. Pick the
    # first mapping that has the expected plus-strand alt in its alt set.
    mapping = None
    for m in info.get("mappings", []):
        alleles = m.get("allele_string", "").split("/")
        # Only single-base alleles (rejects indels and an empty allele_string).
        if not alleles or any(len(a) != 1 or a not in "ACGTN" for a in alleles):
            continue
        # Skip patch/alt contigs.
        chrom = m.get("seq_region_name", "")
        if not (chrom.isdigit() or chrom in ("X", "Y", "MT")):
            continue
        if v["plus_alt"] in alleles[1:]:
            mapping = m
            break
    if mapping is None:
        raise RuntimeError(f"{v['rs']}: no SNV mapping containing alt {v['plus_alt']}")

    ref = mapping["allele_string"].split("/")[0]
    alt = v["plus_alt"]
    chrom = mapping["seq_region_name"]
    pos = mapping["start"]  # 1-based

    # Fetch a window centered on the variant: pos - HALF + 1 .. pos + HALF
    win_start = pos - HALF + 1  # 1-based
    win_end = pos + HALF  # 1-based, inclusive
    seq_data = get_json(f"/sequence/region/human/{chrom}:{win_start}..{win_end}:1?content-type=application/json")
    plus_seq = seq_data["seq"].upper()
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(plus_seq) != WINDOW:
        raise RuntimeError(
            f"{v['rs']}: expected {WINDOW} bp for {chrom}:{win_start}..{win_end}, got {len(plus_seq)}"
        )
    var_offset_plus = pos - win_start  # 0-based offset into plus_seq

    # Determine gene's strand for orientation.
    gene_strand = get_json(f"/lookup/symbol/homo_sapiens/{v['gene']}")["strand"]
    if gene_strand == 1:
        seq = plus_seq
        var_offset = var_offset_plus
        ref_g, alt_g = ref, alt
    else:
        # Reversing the window mirrors the index of the variant.
        seq = revcomp(plus_seq)
        var_offset = WINDOW - 1 - var_offset_plus
        ref_g, alt_g = revcomp(ref), revcomp(alt)

    # Final check: seq[var_offset] should equal ref_g.
    if seq[var_offset] != ref_g:
        # Try the other orientation
        # (this happens when Ensembl reports allele_string on gene strand for - strand genes)
        if seq[var_offset] == revcomp(ref_g):
            ref_g, alt_g = revcomp(ref_g), revcomp(alt_g)
        else:
            raise RuntimeError(
                f"{v['rs']}: ref mismatch — seq[{var_offset}]={seq[var_offset]} expected {ref_g} (or {revcomp(ref_g)})"
            )

    return {
        "rs": v["rs"],
        "gene": v["gene"],
        "name": v["name"],
        "sig": v["sig"],
        "blurb": v["blurb"],
        "chrom": chrom,
        "pos": pos,
        "gene_strand": gene_strand,
        "ref": ref_g,
        "alt": alt_g,
        "ref_window": seq,  # full WINDOW bp on gene strand
        "var_offset": var_offset,  # 0-based
    }
def main():
    """Fetch every entry of VARIANTS and write the results as JSON.

    Each variant is fetched independently: a failure is reported on stderr
    and the run continues, so the output holds only the variants that
    resolved. The file is written to ``data/variants.json`` next to the
    directory containing this script; the ``data`` directory is created if
    it is missing so the fetched results are not lost at the final step.
    """
    out = []
    failed = []
    for v in VARIANTS:
        print(f"fetching {v['rs']} ({v['gene']})…", flush=True)
        try:
            data = fetch_variant(v)
        except Exception as e:  # deliberate best-effort: report and move on
            print(f" ✗ FAILED — {e}", file=sys.stderr)
            failed.append(v["rs"])
        else:
            off = data["var_offset"]
            win = data["ref_window"]
            print(f" ✓ {data['ref']}>{data['alt']} at offset {off} | window: …{win[max(0, off - 5):off]}[{win[off]}]{win[off + 1:off + 6]}…")
            out.append(data)
        # Pace requests after failures too, not only after successes.
        time.sleep(0.4)

    here = os.path.dirname(os.path.abspath(__file__))
    out_path = os.path.join(os.path.dirname(here), "data", "variants.json")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    print(f"\nwrote {out_path} ({os.path.getsize(out_path)/1024:.1f} KB) — {len(out)} variants")
    if failed:
        print(f"{len(failed)} variant(s) failed: {', '.join(failed)}", file=sys.stderr)


if __name__ == "__main__":
    main()
|