File size: 4,243 Bytes
0ed74db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""Pre-flight check the unified Pfam marker library.

For each Pfam ID in markers.all_markers():
  - download the HMM from InterPro
  - confirm the response is a valid HMM (has a NAME line + ALPH amino + LENG > 0)
  - parse the family description from the DESC line
  - flag any ID where our annotated role mentions a protein family that doesn't
    obviously match the DESC (e.g. we said "biotin synthase" but DESC says
    "transcription factor")

Output:
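  - data/markers/_verification.tsv with columns: pfam_id, category, our_name,
    our_role, pfam_name, pfam_desc, status (ok / 404 / an error label for
    other HTTP or parse failures). Role-vs-DESC mismatches are not detected
    automatically; they are found by reviewing this TSV by hand.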

Run this BEFORE committing to a multi-hour scan. Cost: ~30 sec for 70 HMMs.
"""
from __future__ import annotations

import gzip
import time
import zlib
from pathlib import Path

import requests

from microbe_model import config
from microbe_model.features.markers import all_markers, category_for

# InterPro API URL template. ``{pfam}`` is filled in with a Pfam ID by
# fetch_hmm_text(); ``annotation=hmm`` asks for the family's HMM.
INTERPRO_HMM_URL = "https://www.ebi.ac.uk/interpro/wwwapi/entry/pfam/{pfam}/?annotation=hmm"
# Destination of the verification report written by main().
OUT = config.DATA / "markers" / "_verification.tsv"


def fetch_hmm_text(pfam_id: str) -> tuple[str | None, int]:
    """Download the HMM for one Pfam ID and return it as ASCII text.

    The response body is accepted either gzip-compressed or as plain text:
    decompression is attempted first and, if the body is not usable gzip,
    the raw bytes are decoded instead.

    Args:
        pfam_id: Pfam identifier substituted into ``INTERPRO_HMM_URL``.

    Returns:
        A ``(text, status)`` pair:

        * ``(text, 200)`` -- the body was fetched and decoded.
        * ``(None, 200)`` -- the server answered 200 but the body could not
          be turned into ASCII text (corrupt gzip or non-ASCII bytes).
        * ``(None, code)`` -- the server answered with a non-200 ``code``.
        * ``(None, 0)`` -- the request itself failed (network error, timeout).

    This function never raises for a bad payload, so one malformed response
    cannot abort a whole verification run.
    """
    url = INTERPRO_HMM_URL.format(pfam=pfam_id)
    try:
        resp = requests.get(url, timeout=30)
    except requests.RequestException:
        return None, 0
    if resp.status_code != 200:
        return None, resp.status_code

    raw = resp.content
    try:
        payload = gzip.decompress(raw)
    except (gzip.BadGzipFile, EOFError, zlib.error):
        # BadGzipFile: not gzip at all (e.g. already plain text).
        # EOFError / zlib.error: truncated or corrupt gzip stream.
        # In every case fall back to the raw bytes; if those are not text
        # either, the decode below reports the failure.
        payload = raw

    # Decode outside the gzip ``try`` so that a non-ASCII payload is reported
    # as ``(None, 200)`` whether or not it was compressed.
    try:
        return payload.decode("ascii"), 200
    except UnicodeDecodeError:
        return None, 200


def parse_hmm_header(text: str) -> dict[str, str]:
    """Extract the NAME, DESC, LENG and ALPH fields from an HMM file header.

    Lines are scanned in order until the first one starting with ``"HMM "``,
    which ends the header. For every earlier line that begins with one of the
    four tags, the stripped remainder of the line is stored under that tag;
    a later occurrence of a tag overwrites an earlier one. Tags that never
    appear are simply absent from the result.
    """
    wanted = ("NAME", "DESC", "LENG", "ALPH")
    fields: dict[str, str] = {}
    for raw_line in text.splitlines():
        if raw_line[:4] == "HMM ":
            # Start of the model body: nothing after this is header.
            return fields
        tag = next((t for t in wanted if raw_line.startswith(t)), None)
        if tag is not None:
            fields[tag] = raw_line[len(tag):].strip()
    return fields


# Column order of the verification TSV (also used for the header line).
_TSV_COLUMNS = ("pfam_id", "category", "our_name", "our_role",
                "pfam_name", "pfam_desc", "status")

# Characters that would break the one-row-per-line, tab-separated layout.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\r": " ", "\n": " "})


def _header_problem(header: dict[str, str]) -> str | None:
    """Return why *header* is not a valid amino-acid HMM header, or None if it is.

    A header is accepted when it has a non-empty NAME, an ALPH of ``amino``
    and a LENG that parses as a positive integer.
    """
    if not header.get("NAME"):
        return "missing NAME"
    alph = header.get("ALPH", "")
    if alph.lower() != "amino":
        return f"ALPH is {alph!r}, expected 'amino'"
    leng = header.get("LENG", "")
    try:
        length = int(leng)
    except ValueError:
        length = 0
    if length <= 0:
        return f"LENG is {leng!r}, expected a positive integer"
    return None


def _tsv_line(row: dict[str, str]) -> str:
    """Format one result row as a TSV line, in ``_TSV_COLUMNS`` order.

    Tabs and line breaks inside a field are replaced by spaces so that every
    row stays on one line with the expected number of columns.
    Assumes every value is a ``str`` (including the one returned by
    ``category_for``) -- TODO confirm.
    """
    return "\t".join(row[col].translate(_TSV_UNSAFE) for col in _TSV_COLUMNS) + "\n"


def main() -> None:
    """Fetch every marker's HMM, print a per-ID verdict and write the TSV report.

    For each Pfam ID returned by ``all_markers()`` the HMM is downloaded and
    its header parsed. Each ID gets one of these statuses in the report:

    * ``ok`` -- downloaded and the header is a valid amino-acid HMM header.
    * ``404`` -- InterPro does not know the ID.
    * ``http_<code>_or_parse`` -- other HTTP failure or undecodable body.
    * ``parse_error`` -- body decoded but is not a valid HMM (no NAME,
      ALPH not amino, or LENG not a positive integer).

    The report is written to ``OUT`` once all IDs have been processed.
    """
    markers = all_markers()
    print(f"Verifying {len(markers)} markers from the unified library\n")

    # Create the output directory up front so a bad path fails before the
    # downloads rather than after them.
    OUT.parent.mkdir(parents=True, exist_ok=True)
    rows: list[dict[str, str]] = []
    n_ok = n_404 = n_other = 0

    for pfam_id, (our_name, our_role) in markers.items():
        time.sleep(0.05)  # be nice to InterPro
        text, status = fetch_hmm_text(pfam_id)
        pfam_name = pfam_desc = ""

        if status == 404:
            print(f"  ❌ {pfam_id:8s}  404 not found  ({our_name})")
            n_404 += 1
            outcome = "404"
        elif text is None:
            print(f"  ❓ {pfam_id:8s}  HTTP {status} or parse error  ({our_name})")
            n_other += 1
            outcome = f"http_{status}_or_parse"
        else:
            header = parse_hmm_header(text)
            pfam_name = header.get("NAME", "")
            pfam_desc = header.get("DESC", "")
            problem = _header_problem(header)
            if problem is None:
                n_ok += 1
                outcome = "ok"
                print(f"  ✅ {pfam_id:8s}  {header.get('NAME', '???'):25s}  "
                      f"{header.get('DESC', '')[:50]}")
            else:
                # A 200 response that is not actually an HMM (e.g. an error
                # page) must not be counted as verified.
                print(f"  ❓ {pfam_id:8s}  not a valid HMM: {problem}  ({our_name})")
                n_other += 1
                outcome = "parse_error"

        rows.append({
            "pfam_id": pfam_id,
            "category": category_for(pfam_id),
            "our_name": our_name,
            "our_role": our_role,
            "pfam_name": pfam_name,
            "pfam_desc": pfam_desc,
            "status": outcome,
        })

    print(f"\nSummary: {n_ok} ok, {n_404} not-found, {n_other} other")
    # Explicit UTF-8 so the report does not depend on the platform default.
    with OUT.open("w", encoding="utf-8") as fh:
        fh.write("\t".join(_TSV_COLUMNS) + "\n")
        fh.writelines(_tsv_line(row) for row in rows)
    print(f"Wrote {OUT}")
    print("\nReview the TSV and update markers.py for any ID where "
          "our_role disagrees with pfam_desc.")


# Run the verification only when executed as a script, not when imported.
if __name__ == "__main__":
    main()