Spaces:

miyuiu
/

microbe-model

Running

File size: 4,252 Bytes

0ed74db

"""Fetch KEGG module definitions via REST API.

Each module has a DEFINITION line that is a boolean rule over KEGG Orthologue
(KO) IDs:

    DEFINITION  K00001 (K01810,K06859) K00850 ...

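Rule grammar:
  - whitespace      → AND  (sequential pathway steps)
  - comma in parens → OR   (alternative isozymes)
  - parens          → grouping
  - plus  (+)       → complex (KOs that are subunits of one enzyme complex)
  - minus (-)       → optional (non-essential component of a complex)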

This script stores the rule text together with the flat list of KO IDs it
mentions. Parsing the rule into an AST and evaluating it against a per-genome
KO set to produce a 0.0-1.0 completeness score happens downstream
(see 28_compute_module_completeness).

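Output: data/kegg/modules.json
    [
      {"id": "M00001", "name": "Glycolysis ...", "definition": "K00844 ...",
       "ko_list": ["K00844", ...],
       "category": "Pathway modules; Carbohydrate metabolism; ..."},
      ...
    ]
("category" holds the full text of the entry's CLASS field, not a single level.)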

KEGG REST has a soft rate limit; we sleep 100 ms between calls to stay polite.
~470 modules → ~1 minute total.
"""
from __future__ import annotations

import json
import re
import time
from pathlib import Path

import requests

from microbe_model import config

# Endpoint requested once by fetch_module_list() to enumerate all modules.
LIST_URL = "https://rest.kegg.jp/list/module"
# Per-module endpoint template; fetch_module_detail() fills in {module_id}.
GET_URL = "https://rest.kegg.jp/get/{module_id}"
# Seconds main() sleeps before each detail request (0.10 s = 100 ms).
# NOTE: main()'s progress message hard-codes "100 ms"; keep the two in sync.
SLEEP_S = 0.10


def fetch_module_list() -> list[tuple[str, str]]:
    """Download the KEGG module index as ``(module_id, name)`` pairs.

    The response body is read as one tab-separated record per line. Lines
    without a tab are skipped, as are lines whose first column is not an
    ``M`` followed only by digits (e.g. ``M00001``).

    Raises:
        requests.HTTPError: if the list endpoint answers with an error status.
    """
    response = requests.get(LIST_URL, timeout=30)
    response.raise_for_status()

    # partition() yields an empty separator when the line has no tab, which
    # lets a single truthiness check stand in for the "\t" membership test.
    columns = (row.partition("\t") for row in response.text.splitlines())
    return [
        (module_id, title)
        for module_id, tab, title in columns
        if tab and module_id[:1] == "M" and module_id[1:].isdigit()
    ]


def _parse_module_entry(module_id: str, text: str) -> dict | None:
    """Extract NAME, DEFINITION and CLASS from one KEGG flat-file entry.

    Returns the module record dict, or None when the entry carries no
    DEFINITION text.
    """
    values = {"NAME": "", "DEFINITION": "", "CLASS": ""}
    # Only these fields have their continuation lines collected; a NAME
    # continuation line is ignored.
    continued = ("DEFINITION", "CLASS")
    active: str | None = None  # field currently accepting continuation lines

    for line in text.splitlines():
        keyword = next((key for key in values if line.startswith(key)), None)
        if keyword is not None:
            values[keyword] = line[len(keyword):].strip()
            active = keyword if keyword in continued else None
        elif active is not None and line.startswith(" "):
            # Indented line directly after DEFINITION/CLASS: same field.
            values[active] += " " + line.strip()
        else:
            # Any other field keyword (or the entry terminator) ends the run.
            active = None

    definition = values["DEFINITION"]
    if not definition:
        return None

    return {
        "id": module_id,
        "name": values["NAME"],
        "definition": definition.strip(),
        # Every distinct KO identifier (K + 5 digits) mentioned in the rule.
        "ko_list": sorted(set(re.findall(r"K\d{5}", definition))),
        # Full CLASS field text, continuation lines joined with spaces.
        "category": values["CLASS"].strip(),
    }


def fetch_module_detail(module_id: str) -> dict | None:
    """Fetch one KEGG module entry and return its parsed record.

    Args:
        module_id: KEGG module identifier, e.g. ``"M00001"``.

    Returns:
        A dict with keys ``id``, ``name``, ``definition``, ``ko_list`` and
        ``category``, or None when the request fails (network error,
        timeout, non-200 status) or the entry has no DEFINITION field.
    """
    try:
        resp = requests.get(GET_URL.format(module_id=module_id), timeout=30)
    except requests.RequestException:
        # Callers already treat None as "skip this module"; a transient
        # network failure must not abort the whole multi-hundred-request run.
        return None
    if resp.status_code != 200:
        return None
    return _parse_module_entry(module_id, resp.text)


def main() -> None:
    """Download every KEGG module definition and write data/kegg/modules.json.

    Steps: create the output directory, fetch the module list, fetch each
    module's detail with a SLEEP_S pause before every request, dump the
    successful records as indented JSON, then print a short summary.
    Modules whose detail fetch returns None are reported and skipped.
    """
    target_dir = config.DATA / "kegg"
    target_dir.mkdir(parents=True, exist_ok=True)
    out_path = target_dir / "modules.json"

    print("Fetching KEGG module list...")
    listing = fetch_module_list()
    total = len(listing)
    print(f"  {total} modules listed")

    print("\nFetching definitions (with 100 ms politeness sleep)...")
    records: list[dict] = []
    for position, (module_id, _listed_name) in enumerate(listing, start=1):
        time.sleep(SLEEP_S)
        detail = fetch_module_detail(module_id)
        if detail is not None:
            records.append(detail)
            # Progress line every 50th list position (failed ones print the
            # warning below instead).
            if position % 50 == 0:
                print(f"  {position}/{total}  latest: {module_id} {detail['name'][:50]}")
        else:
            print(f"  ⚠ {module_id}: failed to fetch detail")

    print(f"\nFetched {len(records)} modules")
    with open(out_path, "w") as sink:
        json.dump(records, sink, indent=2)
    print(f"Wrote {out_path}")

    distinct_kos: set[str] = set()
    for entry in records:
        distinct_kos.update(entry["ko_list"])
    print(f"Distinct KOs across all modules: {len(distinct_kos):,}")
    print()
    print("Sample modules (first 5):")
    for entry in records[:5]:
        print(f"  {entry['id']}: {entry['name'][:60]}  ({len(entry['ko_list'])} KOs)")

if __name__ == "__main__":
    main()