microbe-model / scripts /05_overnight_summary.py
Miyu Horiuchi
Final cleanup: sync OVERNIGHT_SUMMARY.md + fix size display for small files
6b52ab8
"""Render a one-page OVERNIGHT_SUMMARY.md the user reads first.
Pulls headline numbers from the eval report + featurize log + git history,
and renders a single short page that orients the reader cold.
"""
from __future__ import annotations
import re
import subprocess
from datetime import UTC, datetime
from pathlib import Path
import pandas as pd
from microbe_model import config
# Base directory taken from the project config: git is queried in it and
# OVERNIGHT_SUMMARY.md is written directly under it.
ROOT = config.ROOT
def _read_jsonl_count(path: Path) -> int:
    """Return the number of records (non-blank lines) in the JSONL file at *path*.

    A missing file counts as zero records. Whitespace-only lines are skipped
    so a stray blank line does not inflate the row count.
    """
    try:
        # Binary mode: counting lines needs no decoding, so odd bytes in a
        # partially written file cannot raise UnicodeDecodeError.
        with open(path, "rb") as fh:
            return sum(1 for line in fh if line.strip())
    except FileNotFoundError:
        return 0
def _read_featurize_log_summary(path: Path) -> dict[str, int]:
    """Parse the last tqdm progress line for completed/total and success/fail counts.

    Returns a dict with the integer keys ``completed``, ``total``, ``success``
    and ``fail``, or an empty dict when the log is missing or contains no
    recognisable progress line.
    """
    if not path.exists():
        return {}
    # Progress bars contain non-ASCII block glyphs; decode explicitly as UTF-8
    # and replace undecodable bytes so a half-written log never aborts the report.
    with open(path, encoding="utf-8", errors="replace") as fh:
        # tqdm redraws the bar with carriage returns; make each redraw its own line.
        text = fh.read().replace("\r", "\n")

    # Named groups let both postfix orderings share one extraction path.
    # "." does not match a newline, so every match stays within one progress line.
    # The fail-first ordering is preferred; success-first is only the fallback.
    orderings = (
        r"(?P<completed>\d+)/(?P<total>\d+).*?fail=(?P<fail>\d+).*?success=(?P<success>\d+)",
        r"(?P<completed>\d+)/(?P<total>\d+).*?success=(?P<success>\d+).*?fail=(?P<fail>\d+)",
    )
    for ordering in orderings:
        last_match = None
        for last_match in re.finditer(ordering, text):
            pass  # keep only the final (most recent) progress update
        if last_match is not None:
            return {
                key: int(last_match[key])
                for key in ("completed", "total", "success", "fail")
            }
    return {}
def _git_log_since_yesterday() -> list[str]:
    """Return the non-empty ``git log --oneline --since=yesterday`` lines for ROOT.

    This is best-effort decoration for the summary page: an empty list is
    returned when git exits with an error (for example ROOT is not a
    repository) or when the git executable cannot be started at all.
    """
    try:
        out = subprocess.check_output(
            ["git", "-C", str(ROOT), "log", "--oneline", "--since=yesterday"],
            text=True,
            # Decode explicitly so non-ASCII commit subjects cannot raise
            # under a non-UTF-8 locale.
            encoding="utf-8",
            errors="replace",
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable git binary.
        return []
    return [stripped for line in out.splitlines() if (stripped := line.strip())]
def _format_size(size: int) -> str:
    """Return *size* (bytes) as a short label: MB from 100,000 bytes up, KB below."""
    if size >= 100_000:
        return f"{size / 1e6:.1f} MB"
    return f"{size / 1e3:.1f} KB"


def _bacdive_status_lines(pheno: pd.DataFrame | None) -> list[str]:
    """Return the status bullets for the BacDive phenotype scan."""
    if pheno is None:
        return ["- ❌ BacDive scan did not complete (data/bacdive_phenotypes.parquet missing)"]
    has_genome = pheno["genome_accession"].notna()
    has_temp = pheno["optimal_temperature_c"].notna()
    # Sub-bullets use a two-space indent so Markdown nests them under the parent.
    return [
        f"- ✅ BacDive scan: **{len(pheno):,}** strains pulled",
        f"  - {has_genome.sum():,} have genome accessions",
        f"  - {has_temp.sum():,} have optimal_temperature_c labels",
        f"  - **{(has_genome & has_temp).sum():,}** strains are training-ready (genome + T_opt)",
    ]


def _featurize_status_lines(summary: dict[str, str | int], feats_n: int) -> list[str]:
    """Return the status bullets for the featurize step.

    *summary* is the parsed progress dict (possibly empty); *feats_n* is the
    feature-row count used as a fallback when the log could not be parsed.
    """
    if not summary:
        if feats_n:
            return [f"- 🟡 Featurize: {feats_n:,} feature rows (log not parseable)"]
        return ["- ❌ Featurize did not run"]

    completed = int(summary["completed"])
    total = int(summary["total"])
    success = int(summary["success"])
    fail = int(summary["fail"])
    # max(1, ...) guards against division by zero on an empty run.
    pct = 100 * completed / max(1, total)
    success_rate = 100 * success / max(1, completed)
    if completed >= total:
        status, descriptor = "✅", "complete"
    else:
        status, descriptor = "🟡", f"in progress ({pct:.0f}%)"
    return [
        f"- {status} Featurize: {descriptor}",
        f"  - Processed: {completed:,} / {total:,}",
        f"  - Successful: {success:,} ({success_rate:.1f}%)",
        f"  - Failed: {fail:,} (mostly suppressed/withdrawn NCBI assemblies)",
    ]


def main() -> None:
    """Assemble OVERNIGHT_SUMMARY.md from the pipeline artifacts and write it under ROOT.

    Reads the phenotype parquet, the feature JSONL row count, the featurize
    log, the presence of the train log / eval report / blockers file and the
    recent git history, then renders one Markdown page. The page is written
    to a temporary file first and moved into place.
    """
    pheno_path = config.DATA / "bacdive_phenotypes.parquet"
    eval_report = config.ARTIFACTS / "eval_report.md"
    blockers = config.ARTIFACTS / "blockers.md"
    train_log = config.ARTIFACTS / "train.log"

    pheno = pd.read_parquet(pheno_path) if pheno_path.exists() else None
    feats_n = _read_jsonl_count(config.DATA / "features.jsonl")
    summary = _read_featurize_log_summary(config.ARTIFACTS / "featurize.log")
    has_eval_report = eval_report.exists()

    lines: list[str] = [
        "# Overnight run — summary",
        "",
        f"_Written {datetime.now(UTC).isoformat(timespec='minutes')}_",
        "",
    ]

    # Pipeline status
    lines += ["## Pipeline status", ""]
    lines += _bacdive_status_lines(pheno)
    lines += _featurize_status_lines(summary, feats_n)
    if train_log.exists():
        lines.append("- ✅ Training: see `artifacts/train.log` for stdout")
    else:
        lines.append("- ⏭ Training: not yet run (waits for featurize completion)")
    if has_eval_report:
        lines.append(f"- ✅ Eval report: **`{eval_report.relative_to(ROOT)}`**")
    else:
        lines.append("- ⏭ Eval report: not yet generated")
    lines.append("")

    # What to read first
    lines += ["## What to read first", ""]
    next_steps: list[str] = []
    if has_eval_report:
        next_steps.append(
            f"Open **`{eval_report.relative_to(ROOT)}`** — headline metrics + per-target detail."
        )
    else:
        next_steps.append("Wait for `artifacts/eval_report.md` to be generated, then open it.")
    if blockers.exists():
        next_steps.append(f"Open **`{blockers.relative_to(ROOT)}`** — anything I got stuck on.")
    next_steps.append("Check `git log --oneline` to see the commit timeline.")
    lines += [f"{i}. {step}" for i, step in enumerate(next_steps, start=1)]
    lines.append("")

    # Files of interest
    lines += ["## Files of interest", ""]
    files_of_interest = [
        ("artifacts/eval_report.md", "headline result + metrics"),
        ("artifacts/baseline_results.json", "machine-readable per-fold scores"),
        ("data/bacdive_phenotypes.parquet", "phenotype labels (gitignored)"),
        ("data/features.parquet", "extracted genome features (gitignored)"),
        ("data/training_table.parquet", "merged + group-keyed table used for training (gitignored)"),
    ]
    for rel, desc in files_of_interest:
        try:
            # stat() directly instead of exists()+stat(): the periodic regen
            # loop may remove a file between the two calls.
            size = (ROOT / rel).stat().st_size
        except OSError:
            # Missing file: dash marker, and no size (so no trailing space).
            lines.append(f"- — `{rel}` — {desc}")
        else:
            lines.append(f"- ✅ `{rel}` — {desc} {_format_size(size)}")
    lines.append("")

    # Commits overnight
    commits = _git_log_since_yesterday()
    if commits:
        lines += ["## Commits since yesterday", ""]
        lines += [f"- {c}" for c in commits]
        lines.append("")

    # Reminders
    lines += [
        "## Reminders",
        "",
        "- The NCBI API key was pasted into chat earlier in this session — "
        "rotate it at ncbi.nlm.nih.gov → Account Settings → API Key Management → "
        "Revoke + Create New.",
        "- The `caffeinate -dimsu` you started for the overnight run can be stopped "
        "now (`Ctrl+C` in that terminal).",
        "- Display sleep / Battery settings: revert to your normal preferences if "
        "you changed them last night.",
        "",
    ]

    out_path = ROOT / "OVERNIGHT_SUMMARY.md"
    # Atomic write — avoid races with the periodic regen loop.
    tmp_path = out_path.with_suffix(".md.tmp")
    # The page contains emoji and em dashes; force UTF-8 so the platform's
    # default encoding cannot make the write fail.
    tmp_path.write_text("\n".join(lines), encoding="utf-8")
    tmp_path.replace(out_path)
    print(f"Wrote {out_path}")
# Generate the summary only when run as a script, not when imported.
if __name__ == "__main__":
    main()