# microbe-model / scripts / 27_fetch_kegg_modules.py
# Miyu Horiuchi
# Deploy app from main@a3254bf (no paper/ binaries)
# 0ed74db
"""Fetch KEGG module definitions via REST API.
Each module has a DEFINITION line that is a boolean rule over KEGG Orthologue
(KO) IDs:
DEFINITION K00001 (K01810,K06859) K00850 ...
Rule grammar:
- whitespace → AND (sequential pathway steps)
- comma in parens → OR (alternative isozymes)
- parens → grouping
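This script stores the rule verbatim together with the flat list of KOs it
mentions; it does not parse the rule itself. Parsing the rule into an AST and
evaluating it against a per-genome KO set to produce a 0.0-1.0 completeness
score is left to a later step (see 28_compute_module_completeness).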
Output: data/kegg/modules.json
[
{"id": "M00001", "name": "Glycolysis ...", "definition": "K00844 ...",
"ko_list": ["K00844", ...], "category": "Carbohydrate metabolism"},
...
]
KEGG REST has a soft rate limit; we sleep 100 ms between calls to stay polite.
~470 modules → ~1 minute total.
"""
from __future__ import annotations
import json
import re
import time
from pathlib import Path
import requests
from microbe_model import config
# KEGG REST endpoint listing all modules; each row is parsed as "<id>\t<name>".
LIST_URL = "https://rest.kegg.jp/list/module"
# KEGG REST endpoint template for the text record of a single module.
GET_URL = "https://rest.kegg.jp/get/{module_id}"
# Pause between detail requests, in seconds (100 ms), to stay polite towards
# KEGG's soft rate limit.
SLEEP_S = 0.10
def fetch_module_list() -> list[tuple[str, str]]:
    """Download the KEGG module index as ``(module_id, name)`` pairs.

    The response body is read line by line. A line is kept only when it
    contains a tab and the text before the first tab looks like a module ID
    (the letter ``M`` followed by one or more digits); everything after that
    first tab is taken as the module name.

    Returns:
        Pairs such as ``("M00001", "Glycolysis ...")`` in response order.

    Raises:
        requests.HTTPError: if the server replies with an error status
            (raised by ``raise_for_status``).
    """
    response = requests.get(LIST_URL, timeout=30)
    response.raise_for_status()

    def _looks_like_module_id(token: str) -> bool:
        # "M" followed by at least one digit; a bare "M" is rejected because
        # "".isdigit() is False.
        return token.startswith("M") and token[1:].isdigit()

    # partition() yields an empty separator when the row has no tab, which
    # lets the comprehension drop those rows without a separate membership test.
    rows = (row.partition("\t") for row in response.text.splitlines())
    return [
        (ident, label)
        for ident, tab, label in rows
        if tab and _looks_like_module_id(ident)
    ]
def _parse_module_entry(module_id: str, text: str) -> dict | None:
    """Pull NAME, DEFINITION and CLASS out of one KEGG module text record.

    Returns ``None`` when the record carries no DEFINITION text.
    """
    name = ""
    definition = ""
    category = ""
    # Which multi-line field the indented continuation lines belong to:
    # "definition", "class", or None when continuations must be ignored.
    continuing: str | None = None

    for line in text.splitlines():
        if line.startswith("NAME"):
            name = line[len("NAME"):].strip()
            continuing = None  # NAME continuations are not collected
        elif line.startswith("DEFINITION"):
            definition = line[len("DEFINITION"):].strip()
            continuing = "definition"
        elif line.startswith("CLASS"):
            category = line[len("CLASS"):].strip()
            continuing = "class"
        elif continuing is not None and line.startswith(" "):
            # Indented line directly after DEFINITION/CLASS: the field wraps.
            fragment = line.strip()
            if continuing == "definition":
                definition += " " + fragment
            else:
                category += " " + fragment
        else:
            # Any other field keyword (or unrelated line) ends the wrap.
            continuing = None

    if not definition:
        return None

    # Every KO identifier (K + 5 digits) mentioned anywhere in the rule,
    # de-duplicated and sorted for stable output.
    ko_list = sorted(set(re.findall(r"K\d{5}", definition)))
    return {
        "id": module_id,
        "name": name,
        "definition": definition.strip(),
        "ko_list": ko_list,
        "category": category.strip(),
    }


def fetch_module_detail(module_id: str) -> dict | None:
    """Fetch one KEGG module record and return its parsed fields.

    Args:
        module_id: Module identifier substituted into ``GET_URL``
            (e.g. ``"M00001"``).

    Returns:
        A dict with the keys ``id``, ``name``, ``definition``, ``ko_list``
        (sorted unique ``Knnnnn`` IDs found in the definition) and
        ``category`` (the CLASS field text), or ``None`` when the module
        could not be obtained: the request failed at the transport level,
        the server did not answer with status 200, or the record has no
        DEFINITION.
    """
    try:
        resp = requests.get(GET_URL.format(module_id=module_id), timeout=30)
    except requests.RequestException:
        # Timeouts / connection errors are reported the same way as a bad
        # status code, so that one flaky request does not abort a run over
        # hundreds of modules; the caller already handles ``None``.
        return None
    if resp.status_code != 200:
        return None
    return _parse_module_entry(module_id, resp.text)
def main() -> None:
    """Fetch every listed KEGG module and write the records to disk.

    Steps: make sure ``config.DATA / "kegg"`` exists, download the module
    list, fetch each module's detail (sleeping ``SLEEP_S`` seconds before
    every request), dump the successfully fetched records as indented JSON to
    ``modules.json`` in that directory, then print a short summary.
    Modules whose detail fetch returns ``None`` are reported and skipped.
    """
    target_dir = config.DATA / "kegg"
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / "modules.json"

    print("Fetching KEGG module list...")
    listing = fetch_module_list()
    total = len(listing)
    print(f" {total} modules listed")

    print("\nFetching definitions (with 100 ms politeness sleep)...")
    collected: list[dict] = []
    for position, entry in enumerate(listing, start=1):
        module_id = entry[0]  # the name from the list call is not needed here
        time.sleep(SLEEP_S)
        detail = fetch_module_detail(module_id)
        if detail is not None:
            collected.append(detail)
            # Progress line every 50th module, only when that one succeeded.
            if position % 50 == 0:
                print(f" {position}/{total} latest: {module_id} {detail['name'][:50]}")
        else:
            print(f" ⚠ {module_id}: failed to fetch detail")

    print(f"\nFetched {len(collected)} modules")
    serialized = json.dumps(collected, indent=2)
    with open(target_file, "w") as sink:
        sink.write(serialized)
    print(f"Wrote {target_file}")

    # Union of all per-module KO lists gives the distinct-KO count.
    distinct_kos = set().union(*(item["ko_list"] for item in collected))
    print(f"Distinct KOs across all modules: {len(distinct_kos):,}")
    print()
    print("Sample modules (first 5):")
    for item in collected[:5]:
        ko_count = len(item["ko_list"])
        print(f" {item['id']}: {item['name'][:60]} ({ko_count} KOs)")


# Run the fetch only when executed as a script, not on import.
if __name__ == "__main__":
    main()