File size: 7,433 Bytes
971b586
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d3e72f
 
 
 
 
 
971b586
 
41739ad
 
 
 
 
 
 
 
971b586
41739ad
 
 
 
 
 
 
 
971b586
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""Fetch curated ClinVar variants with surrounding genomic context.

For each variant we fetch a WINDOW-bp window (currently 4,002 bp) centered
on the SNP from Ensembl, reverse-complement it onto the gene's coding strand
if needed, and save the reference window together with the ref/alt bases and
the variant's 0-based offset into that window for the §2 (VEP) demo.

Usage:
    python scripts/fetch_variants.py
"""
import json
import os
import sys
import time
import urllib.request
import urllib.error

# Base URL of the Ensembl REST API; get_json() appends the request path to it.
ENSEMBL = "https://rest.ensembl.org"
# Total window length (bp), centered on the variant. The eval recipe in
# `vep_eval.py` uses 8,192 bp; we use a smaller window here to keep /score
# fast on the live endpoint, but well above the 60 bp the demo originally
# shipped with (which was too small to read variant signal — 3/3 pathogenic
# variants flipped sign or vanished into noise at 60 bp).
#
# WINDOW must stay even so that the span fetch_variant() requests
# (pos - HALF + 1 .. pos + HALF, i.e. 2 * HALF bases) is exactly WINDOW long.
# With that span the variant sits at 0-based index HALF - 1 of the
# plus-strand window.
WINDOW = 4002             # multiple of 6 (6 x 667) for the 6-mer BPE tokenizer + matches HALF math
HALF = WINDOW // 2        # 2001 — variant placed at center

# Curated set: famous pathogenic + risk + benign SNVs across several
# genes/diseases. Coordinates are resolved at run time via the Ensembl
# variation API; the plus-strand alt is the ClinVar canonical
# disease-associated allele for pathogenic / risk picks, or the single
# reported alt for benign.
#
# Spread: 6 Pathogenic + 1 Risk + 1 Benign. The two VHL rows (rs1575932011
# and rs182781943) are paired on purpose, same gene, opposite verdicts:
# a premature stop codon vs. a benign 3' UTR SNP.
#
# Keys, as consumed by fetch_variant():
#   rs        rsID queried at /variation/human/{rs}
#   gene      gene symbol queried at /lookup/symbol/homo_sapiens/{gene} to
#             obtain the gene's strand
#   name      display label, copied to the output unchanged
#   sig       clinical-significance label, copied to the output unchanged
#   blurb     short description, copied to the output unchanged
#   plus_alt  alt allele; a mapping is accepted only if this base appears
#             among the non-reference alleles of its allele_string
VARIANTS = [
    {"rs": "rs334",         "gene": "HBB",   "name": "HBB c.20A>T",      "sig": "Pathogenic", "blurb": "sickle cell anemia · p.Glu6Val",               "plus_alt": "A"},
    {"rs": "rs80359027",    "gene": "BRCA2", "name": "BRCA2 c.7976G>T",  "sig": "Pathogenic", "blurb": "hereditary breast/ovarian cancer · missense",  "plus_alt": "T"},
    {"rs": "rs1057519981",  "gene": "TP53",  "name": "TP53 c.712T>A",    "sig": "Pathogenic", "blurb": "Li-Fraumeni cancer · missense",                "plus_alt": "T"},
    {"rs": "rs1603267420",  "gene": "F9",    "name": "F9 c.1186T>A",     "sig": "Pathogenic", "blurb": "hemophilia B · clotting factor missense",      "plus_alt": "A"},
    {"rs": "rs112029328",   "gene": "LDLR",  "name": "LDLR c.313+1G>T",  "sig": "Pathogenic", "blurb": "familial high cholesterol · splice donor lost","plus_alt": "T"},
    {"rs": "rs1575932011",  "gene": "VHL",   "name": "VHL c.475A>T",     "sig": "Pathogenic", "blurb": "Von Hippel-Lindau · premature STOP",           "plus_alt": "T"},
    {"rs": "rs34637584",    "gene": "LRRK2", "name": "LRRK2 c.6055G>A",  "sig": "Risk",       "blurb": "Parkinson's · G2019S kinase variant",          "plus_alt": "A"},
    {"rs": "rs182781943",   "gene": "VHL",   "name": "VHL c.*820A>G",    "sig": "Benign",     "blurb": "common 3' UTR variant · same gene as row above","plus_alt": "G"},
]


def get_json(path):
    """Fetch ``ENSEMBL + path`` and return the decoded JSON payload.

    Args:
        path: Request path (including any query string) appended verbatim
            to the ``ENSEMBL`` base URL.

    Returns:
        The parsed JSON response body, as produced by ``json.loads``.

    Raises:
        urllib.error.URLError: If the request fails; ``urlopen`` raises the
            ``HTTPError`` subclass for HTTP error statuses.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    req = urllib.request.Request(ENSEMBL + path, headers={"Content-Type": "application/json"})
    # Close the response deterministically rather than leaving the socket
    # open until the garbage collector gets to it.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())


def revcomp(s):
    """Return the reverse complement of the nucleotide string *s*.

    A/C/G/T are swapped with their complement and N maps to itself, in both
    upper and lower case; any other character passes through unchanged.
    """
    complement = dict(zip("ACGTNacgtn", "TGCANtgcan"))
    return "".join(complement.get(base, base) for base in reversed(s))


def fetch_variant(v):
    """Resolve one curated variant into a gene-strand reference window.

    Looks the rsID up in the Ensembl variation API, picks the first SNV
    mapping on a numbered, X, Y or MT chromosome whose non-reference alleles
    include ``v["plus_alt"]``, fetches a ``WINDOW``-bp plus-strand window
    around it, and reverse-complements window and alleles when the gene lies
    on the minus strand.

    Args:
        v: One entry of ``VARIANTS``; the keys ``rs``, ``gene``, ``name``,
            ``sig``, ``blurb`` and ``plus_alt`` are read.

    Returns:
        A dict with the pass-through fields ``rs``, ``gene``, ``name``,
        ``sig`` and ``blurb`` plus ``chrom``, ``pos`` (1-based),
        ``gene_strand``, ``ref`` / ``alt`` (bases oriented like
        ``ref_window``), ``ref_window`` (the full window on the gene's
        strand) and ``var_offset`` (0-based index of the variant in
        ``ref_window``).

    Raises:
        RuntimeError: If no usable mapping contains the expected alt, if the
            fetched sequence is not ``WINDOW`` bases long, or if the window
            base at the variant position matches neither the reference
            allele nor its complement.
    """
    info = get_json(f"/variation/human/{v['rs']}")
    # Ensembl returns allele_string like "T/A/C/G" with ref first. Pick the
    # first mapping that has the expected plus-strand alt in its alt set.
    mapping = None
    for m in info.get("mappings", []):
        alleles = m.get("allele_string", "").split("/")
        # Only single-base alleles. A missing allele_string splits to [""],
        # which this length check rejects as well.
        if any(len(a) != 1 or a not in "ACGTN" for a in alleles):
            continue
        # Skip patch/alt contigs
        chrom = m.get("seq_region_name", "")
        if not (chrom.isdigit() or chrom in ("X", "Y", "MT")):
            continue
        if v["plus_alt"] in alleles[1:]:
            mapping = m
            break
    if mapping is None:
        raise RuntimeError(f"{v['rs']}: no SNV mapping containing alt {v['plus_alt']}")
    ref = mapping["allele_string"].split("/")[0]
    alt = v["plus_alt"]
    chrom = mapping["seq_region_name"]
    pos = mapping["start"]   # 1-based

    # Fetch a window centered on the variant: pos - HALF + 1 .. pos + HALF
    win_start = pos - HALF + 1   # 1-based
    win_end = pos + HALF         # 1-based, inclusive
    seq_data = get_json(f"/sequence/region/human/{chrom}:{win_start}..{win_end}:1?content-type=application/json")
    plus_seq = seq_data["seq"].upper()
    # Explicit check instead of `assert`: this validates external data and
    # must not disappear when Python runs with -O.
    if len(plus_seq) != WINDOW:
        raise RuntimeError(
            f"{v['rs']}: expected a {WINDOW}-bp window, got {len(plus_seq)} bp"
        )
    var_offset_plus = pos - win_start   # 0-based offset into plus_seq

    # Determine gene's strand for orientation.
    g = get_json(f"/lookup/symbol/homo_sapiens/{v['gene']}")
    gene_strand = g["strand"]

    if gene_strand == 1:
        seq = plus_seq
        var_offset = var_offset_plus
        ref_g, alt_g = ref, alt
    else:
        # Minus-strand gene: flip the window and mirror the offset so it
        # still points at the variant base.
        seq = revcomp(plus_seq)
        var_offset = WINDOW - 1 - var_offset_plus
        ref_g, alt_g = revcomp(ref), revcomp(alt)

    # Final check: seq[var_offset] should equal ref_g
    if seq[var_offset] != ref_g:
        # Try the other orientation
        # (this happens when Ensembl reports allele_string on gene strand for - strand genes)
        if seq[var_offset] == revcomp(ref_g):
            ref_g, alt_g = revcomp(ref_g), revcomp(alt_g)
        else:
            raise RuntimeError(
                f"{v['rs']}: ref mismatch — seq[{var_offset}]={seq[var_offset]} expected {ref_g} (or {revcomp(ref_g)})"
            )

    return {
        "rs": v["rs"],
        "gene": v["gene"],
        "name": v["name"],
        "sig": v["sig"],
        "blurb": v["blurb"],
        "chrom": chrom,
        "pos": pos,
        "gene_strand": gene_strand,
        "ref": ref_g,
        "alt": alt_g,
        "ref_window": seq,                               # full WINDOW bp on gene strand
        "var_offset": var_offset,                        # 0-based
    }


def main():
    """Fetch every entry of ``VARIANTS`` and write them to ``data/variants.json``.

    Variants that fail to resolve are reported on stderr and skipped, so the
    output holds only the successfully fetched ones. The ``data`` directory
    is a sibling of the directory containing this script and is created if
    it does not exist yet.
    """
    out = []
    for v in VARIANTS:
        print(f"fetching {v['rs']} ({v['gene']})…", flush=True)
        try:
            data = fetch_variant(v)
            window = data["ref_window"]
            off = data["var_offset"]
            left = window[max(0, off - 5):off]
            right = window[off + 1:off + 6]
            print(f"  ✓ {data['ref']}>{data['alt']} at offset {off} | window: …{left}[{window[off]}]{right}…")
            out.append(data)
        except Exception as e:
            # Best-effort: one bad variant must not abort the whole run.
            print(f"  ✗ FAILED — {e}", file=sys.stderr)
        finally:
            # Pause between variants on failure too, so an erroring API is
            # not hit again immediately.
            time.sleep(0.4)
    here = os.path.dirname(os.path.abspath(__file__))
    out_dir = os.path.join(os.path.dirname(here), "data")
    # Create the target directory up front so a fresh checkout does not lose
    # all fetched results to a FileNotFoundError at write time.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "variants.json")
    with open(out_path, "w") as f:
        json.dump(out, f, indent=2)
    print(f"\nwrote {out_path} ({os.path.getsize(out_path)/1024:.1f} KB) — {len(out)} variants")


# Run the fetch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()