Spaces:
Running
Running
File size: 4,252 Bytes
"""Fetch KEGG module definitions via REST API.
Each module has a DEFINITION line that is a boolean rule over KEGG Orthologue
(KO) IDs:
DEFINITION K00001 (K01810,K06859) K00850 ...
Rule grammar:
- whitespace → AND (sequential pathway steps)
- comma in parens → OR (alternative isozymes)
- parens → grouping
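This script stores each rule verbatim, together with the flat list of KOs it
mentions; the rule is meant to be parsed into an AST downstream and evaluated
against a per-genome KO set to produce a 0.0-1.0 completeness score
(see 28_compute_module_completeness).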
Output: data/kegg/modules.json
[
{"id": "M00001", "name": "Glycolysis ...", "definition": "K00844 ...",
"ko_list": ["K00844", ...], "category": "Carbohydrate metabolism"},
...
]
KEGG REST has a soft rate limit; we sleep 100 ms between calls to stay polite.
~470 modules → ~1 minute total.
"""
from __future__ import annotations
import json
import re
import time
from pathlib import Path
import requests
from microbe_model import config
# Endpoint requested by fetch_module_list() to enumerate all modules.
LIST_URL = "https://rest.kegg.jp/list/module"
# Per-module endpoint template; fetch_module_detail() fills in {module_id}.
GET_URL = "https://rest.kegg.jp/get/{module_id}"
# Seconds main() sleeps before each detail request (0.10 s = 100 ms).
SLEEP_S = 0.10
def fetch_module_list() -> list[tuple[str, str]]:
    """Download the module index and return it as ``(module_id, name)`` pairs.

    Only rows of the form ``<id>\\t<name>`` whose id is the letter ``M``
    followed by digits are kept, e.g. ``("M00001", "Glycolysis ...")``.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on an error response.
    """
    response = requests.get(LIST_URL, timeout=30)
    response.raise_for_status()

    # partition() yields an empty separator for rows that contain no tab,
    # which lets the filter below drop them without a separate check.
    rows = (row.partition("\t") for row in response.text.splitlines())
    return [
        (ident, title)
        for ident, tab, title in rows
        if tab and ident.startswith("M") and ident[1:].isdigit()
    ]
def fetch_module_detail(module_id: str) -> dict | None:
    """Fetch one KEGG module entry and extract its name, definition and class.

    The response body is scanned line by line. A line starting with ``NAME``,
    ``DEFINITION`` or ``CLASS`` opens that field and stores the rest of the
    line. Following lines that start with a space are appended (joined with a
    single space) to DEFINITION or CLASS, whichever is currently open; NAME is
    treated as single-line. Any other line closes the open field.

    Args:
        module_id: Module identifier substituted into ``GET_URL``
            (e.g. ``"M00001"``).

    Returns:
        A dict with keys ``id``, ``name``, ``definition``, ``ko_list`` (sorted,
        de-duplicated ``Knnnnn`` IDs found in the definition) and ``category``
        (the CLASS text), or ``None`` when the request fails (network error,
        timeout or non-200 status) or the entry has no DEFINITION.
    """
    try:
        resp = requests.get(GET_URL.format(module_id=module_id), timeout=30)
    except requests.RequestException:
        # Report transport failures the same way as a bad status code, so one
        # flaky request makes the caller skip this module instead of aborting
        # the whole multi-hundred-module run.
        return None
    if resp.status_code != 200:
        return None

    fields = {"NAME": "", "DEFINITION": "", "CLASS": ""}
    open_field: str | None = None  # field that continuation lines extend
    for line in resp.text.splitlines():
        for key in fields:
            if line.startswith(key):
                fields[key] = line[len(key):].strip()
                # Only DEFINITION and CLASS may wrap onto continuation lines.
                open_field = None if key == "NAME" else key
                break
        else:
            if open_field is not None and line.startswith(" "):
                fields[open_field] += " " + line.strip()
            else:
                # Some other field started (or a stray line): stop appending.
                open_field = None

    definition = fields["DEFINITION"]
    if not definition:
        return None

    # Every KO (Knnnnn) appearing anywhere in the definition, de-duplicated.
    ko_list = sorted(set(re.findall(r"K\d{5}", definition)))
    return {
        "id": module_id,
        "name": fields["NAME"],
        "definition": definition.strip(),
        "ko_list": ko_list,
        "category": fields["CLASS"].strip(),
    }
def main() -> None:
    """Download every KEGG module definition and save them as one JSON file.

    Steps: ensure ``config.DATA / "kegg"`` exists, list the modules, fetch each
    module's detail (sleeping ``SLEEP_S`` seconds before every request), report
    and skip any module whose detail comes back as ``None``, write the
    collected records to ``modules.json`` and print a short summary.
    """
    kegg_dir = config.DATA / "kegg"
    kegg_dir.mkdir(parents=True, exist_ok=True)
    target = kegg_dir / "modules.json"

    print("Fetching KEGG module list...")
    listing = fetch_module_list()
    total = len(listing)
    print(f" {total} modules listed")

    print("\nFetching definitions (with 100 ms politeness sleep)...")
    collected: list[dict] = []
    for position, (module_id, _listed_name) in enumerate(listing, start=1):
        time.sleep(SLEEP_S)
        detail = fetch_module_detail(module_id)
        if detail is not None:
            collected.append(detail)
            # Progress line on every 50th position, only if that fetch worked.
            if position % 50 == 0:
                print(f" {position}/{total} latest: {module_id} {detail['name'][:50]}")
        else:
            print(f" ⚠ {module_id}: failed to fetch detail")

    print(f"\nFetched {len(collected)} modules")
    with open(target, "w") as sink:
        json.dump(collected, sink, indent=2)
    print(f"Wrote {target}")

    # Union of all per-module KO lists gives the distinct KO count.
    distinct_kos = set().union(*(entry["ko_list"] for entry in collected))
    print(f"Distinct KOs across all modules: {len(distinct_kos):,}")
    print()
    print("Sample modules (first 5):")
    for entry in collected[:5]:
        print(f" {entry['id']}: {entry['name'][:60]} ({len(entry['ko_list'])} KOs)")
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|