File size: 15,449 Bytes
9d2f5a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88bf984
9d2f5a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
"""
vep_handler.py — PeVe v1.4
===========================
Fixes:
  1. Genome build normalisation  (GRCh38 enforced, "chr" prefix stripped)
  2. Correct Ensembl REST endpoint and headers
  3. Full HTTP debug logging to Space stdout
  4. Retry with back-off
  5. annotation_available flag surfaced in every return value
  6. AF lookup independent of VEP success
  7. Test block:  python vep_handler.py
"""
from __future__ import annotations

import json
import time
import traceback
import urllib.error
import urllib.request
from dataclasses import dataclass, field
from typing import Optional

# ── Constants ─────────────────────────────────────────────────────────────────
# Base URLs of the two remote services this module talks to.
_ENSEMBL_REST   = "https://rest.ensembl.org"
_GNOMAD_API     = "https://gnomad.broadinstitute.org/api"
# Genome build the callers are expected to supply coordinates in.  Within this
# module the value is only echoed in fetch_vep's log output; nothing validates
# or converts coordinates against it.
_GENOME_BUILD   = "GRCh38"          # assumed build (logged, not checked)
# Per-request socket timeout passed to urllib.request.urlopen.
_REQUEST_TIMEOUT = 20               # seconds
# Upper bound on attempts made by _http_get for a single URL.
_MAX_RETRIES    = 3
# Initial pause between _http_get attempts.
_RETRY_DELAY    = 2                 # seconds, doubled each retry

# ── Return types ──────────────────────────────────────────────────────────────

@dataclass
class VEPResult:
    """Outcome of one Ensembl VEP lookup.

    Every field has a default, so ``VEPResult()`` on its own describes a
    variant for which no annotation could be obtained.

    Attributes:
        consequence: First consequence term of the record that was used.
        impact: Impact rating reported for that record.
        gene: Gene symbol, or an empty string when none was reported.
        transcript: Transcript ID, or an empty string when none was reported.
        all_consequences: First consequence term of each transcript record;
            ``["unknown"]`` when there were none.
        annotation_available: True only when a usable VEP entry was parsed.
        error_message: Human-readable failure reason, if any.
        raw_response: The VEP entry the other fields were extracted from.
    """

    consequence: str = "unknown"
    impact: str = "MODIFIER"
    gene: str = ""
    transcript: str = ""
    # A fresh list per instance, so results never share a mutable default.
    all_consequences: list = field(default_factory=lambda: ["unknown"])
    annotation_available: bool = False
    error_message: Optional[str] = None
    raw_response: Optional[dict] = None


@dataclass
class AFResult:
    """Outcome of one gnomAD allele-frequency lookup.

    Every field has a default, so ``AFResult()`` on its own describes a
    variant whose frequency is unknown.

    Attributes:
        state: ``"AF_UNKNOWN"``, ``"AF_RARE"`` or ``"AF_COMMON"``.
        global_af: Genome-wide allele frequency, or None when not available.
        population_afs: Mapping of population id to allele frequency.
        is_rare: Whether the frequency used for classification fell below
            the rarity threshold; None when no frequency was available.
        founder_variant_flag: True when a founder population's frequency
            clearly exceeds the frequency used for classification.
        annotation_available: True when gnomAD returned data for the variant.
        error_message: Human-readable failure reason, if any.
    """

    state: str = "AF_UNKNOWN"
    global_af: Optional[float] = None
    # A fresh dict per instance, so results never share a mutable default.
    population_afs: dict = field(default_factory=dict)
    is_rare: Optional[bool] = None
    founder_variant_flag: bool = False
    annotation_available: bool = False
    error_message: Optional[str] = None


# ══════════════════════════════════════════════════════════════════════════════
# Helpers
# ══════════════════════════════════════════════════════════════════════════════

def _normalise_chrom(chrom: str) -> str:
    """Strip 'chr' prefix, uppercase.  '17' and 'chr17' → '17'."""
    return str(chrom).strip().upper().lstrip("CHR")


def _http_get(url: str, label: str) -> Optional[dict | list | str]:
    """
    GET with retry + back-off.

    Args:
        url:   Fully formed URL to request.
        label: Short tag included in every log line for this request.

    Returns:
        The decoded JSON payload (whatever ``json.loads`` yields, i.e. a dict
        or a list), the raw body text when the body is not valid JSON, or
        None when no attempt succeeded.

    Up to ``_MAX_RETRIES`` attempts are made, sleeping ``_RETRY_DELAY``
    seconds before the first retry and doubling the pause each time.  HTTP
    4xx responses other than 408/429 are treated as permanent and are not
    retried, because repeating an unchanged request cannot succeed.
    Logs full request/response to stdout (visible in HF Space logs).
    """
    headers = {
        "Content-Type": "application/json",
        "Accept":        "application/json",
    }
    delay = _RETRY_DELAY
    last_err = ""
    attempts_made = 0

    for attempt in range(1, _MAX_RETRIES + 1):
        attempts_made = attempt
        retryable = True
        print(f"[VEP] {label} → GET {url}  (attempt {attempt}/{_MAX_RETRIES})")
        try:
            req  = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT) as resp:
                status = resp.status
                body   = resp.read().decode("utf-8")
            print(f"[VEP] {label} ← HTTP {status}  ({len(body)} bytes)")

            # Try JSON parse; fall back to raw string
            try:
                return json.loads(body)
            except json.JSONDecodeError:
                return body

        except urllib.error.HTTPError as e:
            last_err = f"HTTP {e.code}: {e.reason}"
            # Reading the error body is best-effort; it only feeds the log.
            try:
                body_preview = e.read().decode("utf-8", errors="replace")[:300]
            except Exception:
                body_preview = ""
            print(f"[VEP] {label} attempt {attempt} HTTPError: {last_err}")
            print(f"[VEP]   body: {body_preview}")
            # Client errors mean the request itself is wrong; only timeouts
            # (408), rate limiting (429) and server errors are worth retrying.
            retryable = e.code >= 500 or e.code in (408, 429)

        except urllib.error.URLError as e:
            last_err = f"URLError: {e.reason}"
            print(f"[VEP] {label} attempt {attempt} URLError: {last_err}")

        except Exception:
            last_err = traceback.format_exc()
            print(f"[VEP] {label} attempt {attempt} Exception:\n{last_err}")

        if not retryable:
            print(f"[VEP] {label} not retrying: error is not transient")
            break

        if attempt < _MAX_RETRIES:
            print(f"[VEP] {label} retrying in {delay}s …")
            time.sleep(delay)
            delay *= 2

    print(f"[VEP] {label} FAILED after {attempts_made} attempt(s): {last_err}")
    return None


# ══════════════════════════════════════════════════════════════════════════════
# VEP Annotation
# ══════════════════════════════════════════════════════════════════════════════

def _first_consequence(record: dict) -> str:
    """Return the first consequence term of a VEP record, or 'unknown'."""
    terms = record.get("consequence_terms") or ["unknown"]
    return terms[0]


def fetch_vep(chrom: str, pos: int, ref: str, alt: str) -> VEPResult:
    """
    Query Ensembl REST VEP (GRCh38).
    Endpoint: /vep/human/region/{chrom}:{start}-{end}:1/{alt}

    The region endpoint takes only the variant (alt) allele in the path; the
    reference allele is implied by the region, so ``ref`` is used to size the
    region (start = pos, end = pos + len(ref) - 1) and for logging, and is
    not sent as a path segment of its own.

    Args:
        chrom: Chromosome name, with or without a 'chr' prefix.
        pos:   1-based position of the first reference base.
        ref:   Reference allele.
        alt:   Alternate allele.

    Returns:
        A VEPResult.  ``annotation_available`` is False, with
        ``error_message`` set, when the request failed or the response could
        not be interpreted; otherwise the fields come from the first
        transcript consequence, or from the first intergenic consequence
        when no transcript consequence is present.

    Debug info printed to stdout on every call.
    """
    chrom_norm = _normalise_chrom(chrom)

    # The region must span the whole reference allele (a single base for SNVs).
    end = int(pos) + max(len(ref), 1) - 1

    # Ensembl VEP REST requires no "chr" prefix for human GRCh38
    url = (
        f"{_ENSEMBL_REST}/vep/human/region/"
        f"{chrom_norm}:{pos}-{end}:1/{alt}"
        f"?content-type=application/json"
        f"&canonical=1"
        f"&pick=1"
        f"&hgvs=1"
        f"&LoF=1"
    )

    print(f"[VEP] Querying VEP | build={_GENOME_BUILD} | "
          f"coord={chrom_norm}:{pos} {ref}>{alt}")
    print(f"[VEP] URL: {url}")

    data = _http_get(url, f"VEP {chrom_norm}:{pos}")

    if data is None:
        return VEPResult(
            annotation_available=False,
            error_message="HTTP request failed — see logs above",
        )

    if not isinstance(data, list) or len(data) == 0:
        msg = f"Unexpected VEP response type={type(data).__name__}: {str(data)[:200]}"
        print(f"[VEP] ✗ {msg}")
        return VEPResult(annotation_available=False, error_message=msg)

    entry = data[0]

    # Everything below relies on dict access; reject any other element type.
    if not isinstance(entry, dict):
        msg = f"Unexpected VEP entry type={type(entry).__name__}: {str(entry)[:200]}"
        print(f"[VEP] ✗ {msg}")
        return VEPResult(annotation_available=False, error_message=msg)

    # Check for Ensembl error object
    if "error" in entry:
        msg = f"Ensembl VEP error: {entry['error']}"
        print(f"[VEP] ✗ {msg}")
        return VEPResult(annotation_available=False, error_message=msg)

    tcs = entry.get("transcript_consequences") or []
    if not tcs:
        # Try intergenic
        ics = entry.get("intergenic_consequences") or [{}]
        tc  = ics[0]
        print(f"[VEP] ⚠ No transcript consequences — variant may be intergenic")
    else:
        tc = tcs[0]

    result = VEPResult(
        consequence          = _first_consequence(tc),
        impact               = tc.get("impact", "MODIFIER"),
        gene                 = tc.get("gene_symbol", ""),
        transcript           = tc.get("transcript_id", ""),
        all_consequences     = [_first_consequence(t) for t in tcs] or ["unknown"],
        annotation_available = True,
        raw_response         = entry,
    )

    print(f"[VEP] ✓ gene={result.gene}  consequence={result.consequence}  "
          f"impact={result.impact}  tx={result.transcript}")
    return result


# ══════════════════════════════════════════════════════════════════════════════
# Allele Frequency  (gnomAD v4 GraphQL)
# ══════════════════════════════════════════════════════════════════════════════

# GraphQL document sent to the gnomAD API by fetch_af and its v2 fallback.
# It requests the genome-wide allele frequency plus one frequency per
# population for a single variant in a single dataset.
# NOTE(review): verify the selected field names (in particular `af` under
# `populations`) against the current gnomAD GraphQL schema — TODO confirm.
_GNOMAD_QUERY = """
query VariantAF($variantId: String!, $dataset: DatasetId!) {
  variant(variantId: $variantId, dataset: $dataset) {
    variant_id
    genome {
      af
      populations {
        id
        af
      }
    }
  }
}
"""

# Population ids checked when computing AFResult.founder_variant_flag.
_POP_FOUNDER = {"asj", "fin"}          # populations with founder-effect risk
# A variant is classified AF_RARE when its frequency is strictly below this.
_RARE_THRESHOLD = 0.001                 # AF < 0.1%


def fetch_af(
    chrom: str,
    pos: int,
    ref: str,
    alt: str,
    ancestry: Optional[str] = None,
) -> AFResult:
    """
    Query gnomAD v4 for global + population AF.
    variant_id format:  17-43092176-G-T  (no 'chr' prefix)

    Args:
        chrom:    Chromosome name, with or without a 'chr' prefix.
        pos:      Variant position.
        ref:      Reference allele.
        alt:      Alternate allele.
        ancestry: Optional population id (e.g. "nfe").  When a matching
            population frequency exists it is used instead of the global
            frequency for the rare/common call and the founder check.  An
            exact, case-insensitive id match is preferred; otherwise the
            first population whose id contains the value is used.

    Returns:
        An AFResult.  When the v4 response carries no genome data the lookup
        is delegated to the gnomAD v2 fallback.  ``global_af`` is None when
        gnomAD reports no genome-wide frequency, even if a population
        frequency allowed a rare/common call.
    """
    chrom_norm  = _normalise_chrom(chrom)
    variant_id  = f"{chrom_norm}-{pos}-{ref}-{alt}"
    dataset     = "gnomad_r4"

    print(f"[AF]  Querying gnomAD | variant_id={variant_id}")

    query_body = json.dumps({
        "query":     _GNOMAD_QUERY,
        "variables": {"variantId": variant_id, "dataset": dataset},
    }).encode("utf-8")

    try:
        req = urllib.request.Request(
            _GNOMAD_API,
            data=query_body,
            headers={
                "Content-Type": "application/json",
                "Accept":       "application/json",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT) as resp:
            status = resp.status
            body   = resp.read().decode("utf-8")
        print(f"[AF]  gnomAD ← HTTP {status}  ({len(body)} bytes)")
        data = json.loads(body)

    except Exception:
        tb = traceback.format_exc()
        print(f"[AF]  gnomAD request failed:\n{tb}")
        return AFResult(
            state="AF_UNKNOWN",
            annotation_available=False,
            error_message=tb,
        )

    # The code below uses dict access; any other JSON value is unusable.
    if not isinstance(data, dict):
        msg = f"Unexpected gnomAD response type={type(data).__name__}"
        print(f"[AF]  ✗ {msg}")
        return AFResult(
            state="AF_UNKNOWN",
            annotation_available=False,
            error_message=msg,
        )

    # GraphQL reports query and lookup problems in an "errors" array rather
    # than through the HTTP status, so surface them in the logs.
    gql_errors = data.get("errors")
    if gql_errors:
        print(f"[AF]  ⚠ gnomAD GraphQL errors: {str(gql_errors)[:300]}")

    variant_data = (
        (data.get("data") or {}).get("variant") or {}
    )
    genome_data  = variant_data.get("genome") or {}

    if not genome_data:
        print(f"[AF]  ⚠ No genome AF data for {variant_id} in {dataset}")
        # Try v2 fallback
        return _fetch_af_gnomad_v2(chrom_norm, pos, ref, alt, ancestry)

    global_af = genome_data.get("af")
    pops_raw  = genome_data.get("populations") or []
    pop_afs   = {
        p["id"]: p["af"]
        for p in pops_raw
        if p.get("id") is not None and p.get("af") is not None
    }

    # Ancestry-specific AF: exact id match first, substring match second, so
    # that e.g. "nfe" cannot be captured by a sub-group id listed before it.
    anc_af = None
    anc_key = ancestry.strip().lower() if ancestry else ""
    if anc_key:
        for k, v in pop_afs.items():
            if k.lower() == anc_key:
                anc_af = v
                break
        else:
            for k, v in pop_afs.items():
                if anc_key in k.lower():
                    anc_af = v
                    break

    effective_af = anc_af if anc_af is not None else global_af

    if effective_af is None:
        print(f"[AF]  ⚠ AF is null for {variant_id}")
        return AFResult(
            state="AF_UNKNOWN",
            population_afs=pop_afs,
            annotation_available=True,
            error_message="AF field is null — variant may be absent from gnomAD",
        )

    is_rare        = effective_af < _RARE_THRESHOLD
    # Flag when a founder population carries the allele at more than five
    # times the frequency used for classification.
    founder_flag   = any(
        pop_afs[p] > effective_af * 5
        for p in _POP_FOUNDER
        if p in pop_afs
    )

    state = "AF_RARE" if is_rare else "AF_COMMON"
    # global_af can be null while an ancestry-specific AF exists; formatting
    # or converting None would raise, so handle that case explicitly.
    global_txt = "n/a" if global_af is None else f"{global_af:.6f}"
    print(f"[AF]  ✓ global_af={global_txt}  "
          f"effective={effective_af:.6f}  "
          f"rare={is_rare}  founder={founder_flag}")

    return AFResult(
        state                = state,
        global_af            = None if global_af is None else float(global_af),
        population_afs       = pop_afs,
        is_rare              = is_rare,
        founder_variant_flag = founder_flag,
        annotation_available = True,
    )


def _fetch_af_gnomad_v2(
    chrom: str, pos: int, ref: str, alt: str,
    ancestry: Optional[str]
) -> AFResult:
    """Fallback lookup of the global AF in the ``gnomad_r2_1`` dataset.

    Args:
        chrom:    Chromosome name, used as given (no normalisation here).
        pos:      Variant position.
        ref:      Reference allele.
        alt:      Alternate allele.
        ancestry: Accepted for signature parity; not used by this lookup.

    Returns:
        An AFResult carrying the global AF and rare/common state when the
        dataset reports one; otherwise an ``AF_UNKNOWN`` result with
        ``annotation_available=False``.  Any exception is logged and yields
        that same ``AF_UNKNOWN`` result.

    NOTE(review): coordinates are forwarded unchanged.  If ``gnomad_r2_1`` is
    indexed on GRCh37 (confirm), GRCh38 positions would need a liftover
    before this query can be trusted.
    """
    v2_variant_id = f"{chrom}-{pos}-{ref}-{alt}"
    print(f"[AF]  Trying gnomAD v2 fallback | variant_id={v2_variant_id}")

    payload = {
        "query":     _GNOMAD_QUERY,
        "variables": {"variantId": v2_variant_id, "dataset": "gnomad_r2_1"},
    }
    encoded_payload = json.dumps(payload).encode("utf-8")
    request_headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # Shared result for "no AF reported" and "request blew up".
    not_found = AFResult(
        state="AF_UNKNOWN",
        annotation_available=False,
        error_message="Both gnomAD v4 and v2 lookups failed",
    )

    try:
        request = urllib.request.Request(
            _GNOMAD_API,
            data=encoded_payload,
            headers=request_headers,
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT) as resp:
            raw_body = resp.read()
        reply = json.loads(raw_body.decode("utf-8"))

        variant_node = (reply.get("data") or {}).get("variant") or {}
        genome_node  = variant_node.get("genome") or {}
        af_value     = genome_node.get("af")

        if af_value is None:
            return not_found

        rare = float(af_value) < _RARE_THRESHOLD
        print(f"[AF]  ✓ gnomAD v2 fallback: global_af={af_value:.6f}")
        if rare:
            state = "AF_RARE"
        else:
            state = "AF_COMMON"
        return AFResult(
            state=state,
            global_af=float(af_value),
            is_rare=rare,
            annotation_available=True,
        )
    except Exception:
        print(f"[AF]  gnomAD v2 fallback failed:\n{traceback.format_exc()}")
        return not_found


# ── Compat shims for existing app.py / decision_engine calls ──────────────────

def format_af_display(af_result: AFResult) -> str:
    """Render the global allele frequency of *af_result* for display.

    Returns:
        The frequency with six decimal places, or "Not found in gnomAD"
        when ``af_result.global_af`` is None.
    """
    frequency = af_result.global_af
    return "Not found in gnomAD" if frequency is None else f"{frequency:.6f}"


# ══════════════════════════════════════════════════════════════════════════════
# Test block
# ══════════════════════════════════════════════════════════════════════════════

def _test():
    """Smoke-test both lookups against a fixed BRCA1 variant.

    Calls fetch_vep and fetch_af for 17:43092176 G>T, prints what each
    returned alongside the expected values, and asserts on the VEP result.
    Both calls go out to the remote services.
    """
    banner = "=" * 60
    print(banner)
    print("TEST: chr17:43092176 G>T  (BRCA1 known pathogenic)")
    print(banner)

    annotation = fetch_vep("17", 43092176, "G", "T")
    vep_report = [
        "\nVEP result:",
        f"  annotation_available : {annotation.annotation_available}",
        f"  gene                 : {annotation.gene}",
        f"  consequence          : {annotation.consequence}",
        f"  impact               : {annotation.impact}",
        f"  transcript           : {annotation.transcript}",
        f"  error_message        : {annotation.error_message}",
    ]
    print("\n".join(vep_report))

    print()

    frequency = fetch_af("17", 43092176, "G", "T", ancestry="nfe")
    af_report = [
        "AF result:",
        f"  annotation_available : {frequency.annotation_available}",
        f"  state                : {frequency.state}",
        f"  global_af            : {frequency.global_af}",
        f"  is_rare              : {frequency.is_rare}",
        f"  founder_flag         : {frequency.founder_variant_flag}",
        f"  error_message        : {frequency.error_message}",
    ]
    print("\n".join(af_report))

    print()
    expectations = [
        "EXPECTED:",
        "  gene        = BRCA1",
        "  consequence = missense_variant (or similar)",
        "  global_af   = very small float (rare)",
    ]
    print("\n".join(expectations))

    # Only the VEP result is asserted on; the AF result is printed for review.
    assert annotation.annotation_available, "VEP annotation_available should be True"
    assert annotation.gene == "BRCA1",       f"Expected BRCA1, got '{annotation.gene}'"
    assert annotation.consequence != "unknown", "consequence should not be 'unknown'"
    print("\n✓ All assertions passed")


if __name__ == "__main__":
    _test()