File size: 4,252 Bytes
0ed74db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""Fetch KEGG module definitions via REST API.

Each module has a DEFINITION line that is a boolean rule over KEGG Orthologue
(KO) IDs:

    DEFINITION  K00001 (K01810,K06859) K00850 ...

Rule grammar:
  - whitespace      → AND  (sequential pathway steps)
  - comma in parens → OR   (alternative isozymes)
  - parens          → grouping

We parse this rule into an AST so it can be evaluated against a per-genome KO
set to produce a 0.0-1.0 completeness score (see 28_compute_module_completeness).

Output: data/kegg/modules.json
    [
      {"id": "M00001", "name": "Glycolysis ...", "definition": "K00844 ...",
       "ko_list": ["K00844", ...], "category": "Carbohydrate metabolism"},
      ...
    ]

KEGG REST has a soft rate limit; we sleep 100 ms between calls to stay polite.
~470 modules → ~1 minute total.
"""
from __future__ import annotations

import json
import re
import time
from pathlib import Path

import requests

from microbe_model import config

# KEGG REST endpoint requested by fetch_module_list() to enumerate modules.
LIST_URL = "https://rest.kegg.jp/list/module"
# Per-module endpoint template; fetch_module_detail() fills in {module_id}.
GET_URL = "https://rest.kegg.jp/get/{module_id}"
# Pause in seconds (100 ms) that main() takes before each per-module request.
SLEEP_S = 0.10


def fetch_module_list() -> list[tuple[str, str]]:
    """Download the KEGG module index as ``(module_id, name)`` pairs.

    The response body is read as tab-separated lines. A line is kept only
    when it contains a tab and its first column looks like a module
    accession: the letter ``M`` followed by digits only. The name is the
    remainder of the line after the first tab.

    Raises:
        requests.HTTPError: if the list endpoint answers with an error status.
    """
    response = requests.get(LIST_URL, timeout=30)
    response.raise_for_status()

    # partition() yields an empty separator when the row has no tab at all,
    # which is how tab-less rows are filtered out below.
    split_rows = (row.partition("\t") for row in response.text.splitlines())
    return [
        (accession, title)
        for accession, tab, title in split_rows
        if tab and accession[:1] == "M" and accession[1:].isdigit()
    ]


def _parse_module_entry(module_id: str, text: str) -> dict | None:
    """Build a module record from the flat-file text of one KEGG entry.

    Only the NAME, DEFINITION and CLASS fields are read. DEFINITION and CLASS
    may wrap onto following lines that start with a space; those lines are
    appended to the field, joined by a single space.

    Args:
        module_id: Module accession stored under the record's ``"id"`` key.
        text: Body of the ``get`` response for that module.

    Returns:
        A dict with keys ``id``, ``name``, ``definition``, ``ko_list`` and
        ``category``, or ``None`` when no DEFINITION was found.
    """
    fields = {"NAME": "", "DEFINITION": "", "CLASS": ""}
    # Field currently accepting indented continuation lines, if any.
    active: str | None = None

    for line in text.splitlines():
        for key in fields:
            if line.startswith(key):
                fields[key] = line[len(key):].strip()
                # NAME is taken from its first line only; the others may wrap.
                active = None if key == "NAME" else key
                break
        else:
            if active is not None and line.startswith(" "):
                fields[active] += " " + line.strip()
            else:
                # Any other field header (or a stray line) ends the run.
                active = None

    definition = fields["DEFINITION"]
    if not definition:
        return None

    # Every KO identifier (K + 5 digits) in the rule, de-duplicated and sorted.
    ko_list = sorted(set(re.findall(r"K\d{5}", definition)))

    return {
        "id": module_id,
        "name": fields["NAME"],
        "definition": definition.strip(),
        "ko_list": ko_list,
        "category": fields["CLASS"].strip(),
    }


def fetch_module_detail(module_id: str) -> dict | None:
    """Fetch one KEGG module entry and extract its definition record.

    Args:
        module_id: Module accession substituted into ``GET_URL``.

    Returns:
        The record described in ``_parse_module_entry``, or ``None`` when the
        request fails (transport error or non-200 status) or the entry has no
        DEFINITION field.
    """
    try:
        resp = requests.get(GET_URL.format(module_id=module_id), timeout=30)
    except requests.RequestException:
        # A timeout or dropped connection is reported the same way as an HTTP
        # failure, so one bad request does not abort a run that has already
        # fetched other modules.
        return None
    if resp.status_code != 200:
        return None
    return _parse_module_entry(module_id, resp.text)


def main() -> None:
    """Fetch every listed KEGG module and write the records to modules.json.

    The output file is ``modules.json`` inside the ``kegg`` directory under
    ``config.DATA``; the directory is created if missing. Modules whose
    detail fetch returns ``None`` are reported and left out. Progress and a
    short summary are printed to stdout.
    """
    kegg_dir = config.DATA / "kegg"
    kegg_dir.mkdir(parents=True, exist_ok=True)
    destination = kegg_dir / "modules.json"

    print("Fetching KEGG module list...")
    listing = fetch_module_list()
    total = len(listing)
    print(f"  {total} modules listed")

    print("\nFetching definitions (with 100 ms politeness sleep)...")
    fetched: list[dict] = []
    for position, (module_id, _title) in enumerate(listing, start=1):
        # Sleep before every request, including the first.
        time.sleep(SLEEP_S)
        detail = fetch_module_detail(module_id)
        if detail is not None:
            fetched.append(detail)
            # Progress is reported only on successful fetches at multiples of 50.
            if position % 50 == 0:
                print(f"  {position}/{total}  latest: {module_id} {detail['name'][:50]}")
        else:
            print(f"  ⚠ {module_id}: failed to fetch detail")

    print(f"\nFetched {len(fetched)} modules")
    with destination.open("w") as sink:
        json.dump(fetched, sink, indent=2)
    print(f"Wrote {destination}")

    distinct_kos: set[str] = set()
    for record in fetched:
        distinct_kos.update(record["ko_list"])
    print(f"Distinct KOs across all modules: {len(distinct_kos):,}")
    print()
    print("Sample modules (first 5):")
    for record in fetched[:5]:
        print(f"  {record['id']}: {record['name'][:60]}  ({len(record['ko_list'])} KOs)")


# Run the fetch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()