"""Render a one-page OVERNIGHT_SUMMARY.md the user reads first. Pulls headline numbers from the eval report + featurize log + git history, and renders a single short page that orients the reader cold. """ from __future__ import annotations import re import subprocess from datetime import UTC, datetime from pathlib import Path import pandas as pd from microbe_model import config ROOT = config.ROOT def _read_jsonl_count(path: Path) -> int: if not path.exists(): return 0 with open(path) as fh: return sum(1 for _ in fh) def _read_featurize_log_summary(path: Path) -> dict[str, str | int]: """Parse the last tqdm line for success/fail counts.""" if not path.exists(): return {} with open(path) as fh: text = fh.read().replace("\r", "\n") last_match = None pattern = re.compile(r"(\d+)/(\d+).*?fail=(\d+).*?success=(\d+)") for m in pattern.finditer(text): last_match = m if last_match is None: # Fall back to alternate ordering (success first then fail) alt = re.compile(r"(\d+)/(\d+).*?success=(\d+).*?fail=(\d+)") for m in alt.finditer(text): last_match = m if last_match is None: return {} completed, total, success, fail = (int(last_match.group(i)) for i in range(1, 5)) else: completed, total, fail, success = (int(last_match.group(i)) for i in range(1, 5)) return { "completed": completed, "total": total, "success": success, "fail": fail, } def _git_log_since_yesterday() -> list[str]: try: out = subprocess.check_output( ["git", "-C", str(ROOT), "log", "--oneline", "--since=yesterday"], text=True, ) except subprocess.CalledProcessError: return [] return [line.strip() for line in out.splitlines() if line.strip()] def main() -> None: pheno_path = config.DATA / "bacdive_phenotypes.parquet" features_jsonl = config.DATA / "features.jsonl" featurize_log = config.ARTIFACTS / "featurize.log" eval_report = config.ARTIFACTS / "eval_report.md" blockers = config.ARTIFACTS / "blockers.md" train_log = config.ARTIFACTS / "train.log" pheno = pd.read_parquet(pheno_path) if pheno_path.exists() else None feats_n = _read_jsonl_count(features_jsonl) summary = _read_featurize_log_summary(featurize_log) lines: list[str] = [] lines.append("# Overnight run — summary") lines.append("") lines.append(f"_Written {datetime.now(UTC).isoformat(timespec='minutes')}_") lines.append("") # Pipeline status lines.append("## Pipeline status") lines.append("") if pheno is not None: lines.append(f"- ✅ BacDive scan: **{len(pheno):,}** strains pulled") n_genome = pheno["genome_accession"].notna().sum() n_temp = pheno["optimal_temperature_c"].notna().sum() n_train_ready = ( pheno["genome_accession"].notna() & pheno["optimal_temperature_c"].notna() ).sum() lines.append(f" - {n_genome:,} have genome accessions") lines.append(f" - {n_temp:,} have optimal_temperature_c labels") lines.append(f" - **{n_train_ready:,}** strains are training-ready (genome + T_opt)") else: lines.append("- ❌ BacDive scan did not complete (data/bacdive_phenotypes.parquet missing)") if summary: pct = 100 * summary["completed"] / max(1, summary["total"]) success_rate = 100 * summary["success"] / max(1, summary["completed"]) if summary["completed"] >= summary["total"]: status = "✅" descriptor = "complete" else: status = "🟡" descriptor = f"in progress ({pct:.0f}%)" lines.append(f"- {status} Featurize: {descriptor}") lines.append(f" - Processed: {summary['completed']:,} / {summary['total']:,}") lines.append(f" - Successful: {summary['success']:,} ({success_rate:.1f}%)") lines.append(f" - Failed: {summary['fail']:,} (mostly suppressed/withdrawn NCBI assemblies)") elif feats_n: lines.append(f"- 🟡 Featurize: {feats_n:,} feature rows (log not parseable)") else: lines.append("- ❌ Featurize did not run") if train_log.exists(): lines.append("- ✅ Training: see `artifacts/train.log` for stdout") else: lines.append("- ⏭ Training: not yet run (waits for featurize completion)") if eval_report.exists(): lines.append(f"- ✅ Eval report: **`{eval_report.relative_to(ROOT)}`**") else: lines.append("- ⏭ Eval report: not yet generated") lines.append("") # What to read first lines.append("## What to read first") lines.append("") next_steps: list[str] = [] if eval_report.exists(): next_steps.append(f"Open **`{eval_report.relative_to(ROOT)}`** — headline metrics + per-target detail.") else: next_steps.append("Wait for `artifacts/eval_report.md` to be generated, then open it.") if blockers.exists(): next_steps.append(f"Open **`{blockers.relative_to(ROOT)}`** — anything I got stuck on.") next_steps.append("Check `git log --oneline` to see the commit timeline.") for i, step in enumerate(next_steps, start=1): lines.append(f"{i}. {step}") lines.append("") # Files written this run lines.append("## Files of interest") lines.append("") files_of_interest = [ ("artifacts/eval_report.md", "headline result + metrics"), ("artifacts/baseline_results.json", "machine-readable per-fold scores"), ("data/bacdive_phenotypes.parquet", "phenotype labels (gitignored)"), ("data/features.parquet", "extracted genome features (gitignored)"), ("data/training_table.parquet", "merged + group-keyed table used for training (gitignored)"), ] for rel, desc in files_of_interest: path = ROOT / rel marker = "✅" if path.exists() else "—" if path.exists(): size = path.stat().st_size size_label = f"{size / 1e6:.1f} MB" if size >= 100_000 else f"{size / 1e3:.1f} KB" else: size_label = "" lines.append(f"- {marker} `{rel}` — {desc} {size_label}") lines.append("") # Commits overnight commits = _git_log_since_yesterday() if commits: lines.append("## Commits since yesterday") lines.append("") for c in commits: lines.append(f"- {c}") lines.append("") # Reminders lines.append("## Reminders") lines.append("") lines.append("- The NCBI API key was pasted into chat earlier in this session — " "rotate it at ncbi.nlm.nih.gov → Account Settings → API Key Management → " "Revoke + Create New.") lines.append("- The `caffeinate -dimsu` you started for the overnight run can be stopped " "now (`Ctrl+C` in that terminal).") lines.append("- Display sleep / Battery settings: revert to your normal preferences if " "you changed them last night.") lines.append("") out_path = ROOT / "OVERNIGHT_SUMMARY.md" # Atomic write — avoid races with the periodic regen loop tmp_path = out_path.with_suffix(".md.tmp") tmp_path.write_text("\n".join(lines)) tmp_path.replace(out_path) print(f"Wrote {out_path}") if __name__ == "__main__": main()