Spaces:
Running
Running
File size: 2,857 Bytes
"""Compute per-genome KEGG module completeness from KOfam hits.

Reads:
    data/kegg/modules.json — module definitions (from script 27)
    data/kofam_hits.jsonl — per-genome KO sets (from script 28)

Writes:
    data/kegg_modules.parquet — one row per genome, columns are kegg_<module_id>

Each cell is a fractional completeness score between 0.0 and 1.0 (KEGG-style:
averaged over the AND chain, maximum across OR alternatives), so XGBoost can
read the values directly.

Quick to run (~seconds for 22K genomes); rerun any time the module set changes.
"""
from __future__ import annotations
import json
import pandas as pd
from tqdm import tqdm
from microbe_model import config
from microbe_model.features.kegg_modules import module_completeness
def main() -> None:
    """Score each genome's KEGG module completeness and write a parquet table.

    Steps:
      1. Load module definitions from ``config.DATA / "kegg" / "modules.json"``
         and parse each definition once; modules that fail to parse are
         reported and skipped.
      2. Stream ``config.DATA / "kofam_hits.jsonl"`` one record per line and
         evaluate every parsed module against that genome's KO set. Only the
         first record per accession is kept.
      3. Write ``config.DATA / "kegg_modules.parquet"`` with a
         ``genome_accession`` column plus one ``kegg_<module_id>`` column per
         parsed module, then print the ten modules with the highest mean score.

    Raises:
        SystemExit: If either input file does not exist.
    """
    modules_path = config.DATA / "kegg" / "modules.json"
    if not modules_path.exists():
        raise SystemExit(f"Missing {modules_path}. Run scripts/27_fetch_kegg_modules.py first.")
    hits_path = config.DATA / "kofam_hits.jsonl"
    if not hits_path.exists():
        raise SystemExit(f"Missing {hits_path}. Run scripts/28_kofam_scan.py first.")

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(modules_path, encoding="utf-8") as fh:
        modules = json.load(fh)
    print(f"Loaded {len(modules)} KEGG modules")

    # Pre-parse each rule once so we don't re-parse per genome
    from microbe_model.features.kegg_modules import parse_definition, evaluate

    # (output column name, parsed rule) — the column name is built here once
    # instead of once per genome × module pair.
    parsed: list[tuple[str, object]] = []
    for m in modules:
        try:
            ast = parse_definition(m["definition"])
            parsed.append((f"kegg_{m['id']}", ast))
        except Exception as exc:
            # Best effort: one malformed definition must not abort the run.
            print(f" ⚠ couldn't parse {m['id']}: {exc}")
    print(f"Parsed {len(parsed)} module rules")

    rows: list[dict] = []
    seen: set[str] = set()
    with open(hits_path, encoding="utf-8") as fh:
        for line in tqdm(fh, desc="genomes"):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            r = json.loads(line)
            acc = r.get("genome_accession") or r.get("accession")
            if not acc:
                continue
            # Deduplicate on the same string form that is stored in `seen`,
            # so non-string accessions are deduplicated as well.
            acc_key = str(acc)
            if acc_key in seen:
                continue
            seen.add(acc_key)
            # `or []` also covers an explicit null in the record.
            ko_set = set(r.get("ko_hits") or [])
            row: dict = {"genome_accession": acc}
            for column, ast in parsed:
                row[column] = evaluate(ast, ko_set, fractional=True)
            rows.append(row)

    df = pd.DataFrame(rows)
    out = config.DATA / "kegg_modules.parquet"
    df.to_parquet(out, index=False)
    print(f"\nWrote {out}: {len(df):,} rows × {df.shape[1]} cols")

    if len(df) > 0:
        # First occurrence wins, matching a first-match scan over `modules`.
        names: dict[str, str] = {}
        for m in modules:
            names.setdefault(m["id"], m.get("name") or "")

        print("\nMean completeness per module (top 10 most-present):")
        # Column 0 is genome_accession; every other column is a module score.
        means = df.iloc[:, 1:].mean().sort_values(ascending=False).head(10)
        for col, val in means.items():
            mod_id = col[len("kegg_"):]
            name = names.get(mod_id, "")
            print(f" {val:.2f} {mod_id} {name[:55]}")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|