Spaces:
Running
Running
| """Fetch KEGG module definitions via REST API. | |
| Each module has a DEFINITION line that is a boolean rule over KEGG Orthologue | |
| (KO) IDs: | |
| DEFINITION K00001 (K01810,K06859) K00850 ... | |
| Rule grammar: | |
| - whitespace → AND (sequential pathway steps) | |
| - comma in parens → OR (alternative isozymes) | |
| - parens → grouping | |
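| This script only stores the raw rule string and the flat list of KOs it | |
| mentions. Parsing the rule into an AST and evaluating it against a per-genome | |
| KO set to produce a 0.0-1.0 completeness score happens downstream (see | |
| 28_compute_module_completeness). | |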
| Output: data/kegg/modules.json | |
| [ | |
| {"id": "M00001", "name": "Glycolysis ...", "definition": "K00844 ...", | |
| "ko_list": ["K00844", ...], "category": "Carbohydrate metabolism"}, | |
| ... | |
| ] | |
| KEGG REST has a soft rate limit; we sleep 100 ms between calls to stay polite. | |
| ~470 modules → ~1 minute total. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import re | |
| import time | |
| from pathlib import Path | |
| import requests | |
| from microbe_model import config | |
# KEGG REST endpoint listing all modules; rows are read as "<id>\t<name>".
LIST_URL = "https://rest.kegg.jp/list/module"
# KEGG REST endpoint template returning the flat-file entry of one module.
GET_URL = "https://rest.kegg.jp/get/{module_id}"
# Pause before each detail request, in seconds (100 ms), to stay under
# KEGG's soft rate limit.
SLEEP_S = 0.10
def fetch_module_list() -> list[tuple[str, str]]:
    """Download the KEGG module index.

    Returns:
        ``(module_id, name)`` pairs such as ``("M00001", "Glycolysis ...")``,
        in the order the server lists them. Rows without a tab, or whose
        first column is not ``M`` followed by digits only, are dropped.

    Raises:
        requests.HTTPError: if the list endpoint answers with an error status.
    """
    response = requests.get(LIST_URL, timeout=30)
    response.raise_for_status()

    # Partition on the first tab only, so any later tabs stay in the name.
    # An empty separator means the row had no tab at all.
    rows = (row.partition("\t") for row in response.text.splitlines())
    return [
        (mod_id, name)
        for mod_id, tab, name in rows
        if tab and mod_id.startswith("M") and mod_id[1:].isdigit()
    ]
def _parse_module_entry(module_id: str, text: str) -> dict | None:
    """Pull NAME, DEFINITION and CLASS out of one KEGG flat-file entry.

    Returns the module record, or None when the entry has no DEFINITION.
    """
    values = {"NAME": "", "DEFINITION": "", "CLASS": ""}
    # Field that an indented continuation line would extend, if any.
    open_field: str | None = None

    for line in text.splitlines():
        keyword = next((key for key in values if line.startswith(key)), None)
        if keyword is not None:
            values[keyword] = line[len(keyword):].strip()
            # NAME is read as a single line; only DEFINITION and CLASS may
            # continue onto the following indented lines.
            open_field = None if keyword == "NAME" else keyword
        elif open_field is not None and line.startswith(" "):
            values[open_field] += " " + line.strip()
        else:
            # Any other line ends the field currently being collected.
            open_field = None

    definition = values["DEFINITION"]
    if not definition:
        return None

    return {
        "id": module_id,
        "name": values["NAME"],
        "definition": definition.strip(),
        # Every distinct KO id (Knnnnn) mentioned in the rule, sorted.
        "ko_list": sorted(set(re.findall(r"K\d{5}", definition))),
        "category": values["CLASS"].strip(),
    }


def fetch_module_detail(module_id: str) -> dict | None:
    """Fetch one KEGG module entry and return its parsed record.

    Args:
        module_id: KEGG module identifier, e.g. ``"M00001"``.

    Returns:
        A dict with keys ``id``, ``name``, ``definition``, ``ko_list`` and
        ``category``, or None when the request fails, the server does not
        answer 200, or the entry has no DEFINITION line.
    """
    try:
        resp = requests.get(GET_URL.format(module_id=module_id), timeout=30)
    except requests.RequestException:
        # A timeout or dropped connection is reported like any other failed
        # fetch, so one bad request cannot abort a whole multi-module run.
        return None
    if resp.status_code != 200:
        return None
    return _parse_module_entry(module_id, resp.text)
def main() -> None:
    """Fetch every KEGG module definition and write them to modules.json.

    Creates ``config.DATA / "kegg"`` if needed, downloads the module list,
    fetches each module's detail with a ``SLEEP_S`` pause before every
    request, writes the successful records as indented JSON, and prints a
    short summary. Modules whose detail fetch returns None are reported and
    skipped.
    """
    kegg_dir = config.DATA / "kegg"
    kegg_dir.mkdir(parents=True, exist_ok=True)
    out_path = kegg_dir / "modules.json"

    print("Fetching KEGG module list...")
    listing = fetch_module_list()
    total = len(listing)
    print(f" {total} modules listed")

    print("\nFetching definitions (with 100 ms politeness sleep)...")
    fetched: list[dict] = []
    for position, (mod_id, _name) in enumerate(listing, start=1):
        time.sleep(SLEEP_S)
        detail = fetch_module_detail(mod_id)
        if detail is not None:
            fetched.append(detail)
            # Progress line every 50th module, only when that module succeeded.
            if position % 50 == 0:
                print(f" {position}/{total} latest: {mod_id} {detail['name'][:50]}")
        else:
            print(f" ⚠ {mod_id}: failed to fetch detail")

    print(f"\nFetched {len(fetched)} modules")
    with open(out_path, "w") as handle:
        json.dump(fetched, handle, indent=2)
    print(f"Wrote {out_path}")

    # Union of every module's KO list, for the summary count.
    distinct_kos: set[str] = set()
    for record in fetched:
        distinct_kos.update(record["ko_list"])
    print(f"Distinct KOs across all modules: {len(distinct_kos):,}")

    print()
    print("Sample modules (first 5):")
    for record in fetched[:5]:
        print(f" {record['id']}: {record['name'][:60]} ({len(record['ko_list'])} KOs)")
# Run the fetch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()