"""Render a one-page OVERNIGHT_SUMMARY.md the user reads first.

Pulls headline numbers from the BacDive phenotype table, the featurize log
and the git history, points at the eval report, and renders a single short
page that orients the reader cold.
"""
from __future__ import annotations
import re
import subprocess
from datetime import UTC, datetime
from pathlib import Path
import pandas as pd
from microbe_model import config
# Base directory taken from config.ROOT; used below as the git working tree
# (`git -C`) and as the directory OVERNIGHT_SUMMARY.md is written into.
ROOT = config.ROOT
def _read_jsonl_count(path: Path) -> int:
    """Return the number of records in the JSON Lines file at *path*.

    A record is any line that contains something other than whitespace, so a
    trailing blank line (or a file holding nothing but newlines) does not
    inflate the count.

    Returns 0 when the file does not exist.
    """
    try:
        # errors="replace": only lines are counted, so an undecodable byte
        # must not abort the count.
        with open(path, encoding="utf-8", errors="replace") as fh:
            return sum(1 for record in fh if record.strip())
    except FileNotFoundError:
        # EAFP instead of exists()+open(): the file may vanish in between.
        return 0
def _read_featurize_log_summary(path: Path) -> dict[str, str | int]:
    """Parse the last tqdm progress line for success/fail counts.

    A progress line is one that contains ``<completed>/<total>`` followed by
    both a ``fail=<n>`` and a ``success=<n>`` counter, in either order.
    Carriage returns are treated as line breaks because tqdm redraws its bar
    in place with ``\\r``.

    Returns a dict with integer ``completed``, ``total``, ``success`` and
    ``fail`` entries taken from the last such line in the file, or an empty
    dict when the file is missing or holds no progress line.
    """
    try:
        # Progress bars contain non-ASCII block glyphs; decode as UTF-8 and
        # replace anything undecodable instead of failing on the whole log.
        with open(path, encoding="utf-8", errors="replace") as fh:
            text = fh.read().replace("\r", "\n")
    except FileNotFoundError:
        return {}

    # One pattern accepts both counter orders, so the match that is last in
    # the file always wins regardless of which order it uses. "." does not
    # cross "\n", so every match stays within a single progress line.
    progress = re.compile(
        r"(?P<completed>\d+)/(?P<total>\d+).*?"
        r"(?:fail=(?P<fail_a>\d+).*?success=(?P<success_a>\d+)"
        r"|success=(?P<success_b>\d+).*?fail=(?P<fail_b>\d+))"
    )
    last_match = None
    for last_match in progress.finditer(text):
        pass
    if last_match is None:
        return {}

    if last_match["fail_a"] is not None:
        fail, success = last_match["fail_a"], last_match["success_a"]
    else:
        fail, success = last_match["fail_b"], last_match["success_b"]
    return {
        "completed": int(last_match["completed"]),
        "total": int(last_match["total"]),
        "success": int(success),
        "fail": int(fail),
    }
def _git_log_since_yesterday() -> list[str]:
    """Return the ``git log --oneline --since=yesterday`` entries for ROOT.

    Each entry is one stripped, non-empty output line. The lookup is
    best-effort: an empty list is returned when git exits with a non-zero
    status or when the git executable cannot be launched at all.
    """
    cmd = ["git", "-C", str(ROOT), "log", "--oneline", "--since=yesterday"]
    try:
        out = subprocess.check_output(cmd, text=True)
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git ran but failed.
        # OSError (e.g. FileNotFoundError): git is not installed / not on
        # PATH; the summary should still render without the commit list.
        return []
    return [entry for raw in out.splitlines() if (entry := raw.strip())]
def main() -> None:
    """Render ``OVERNIGHT_SUMMARY.md`` under ``ROOT``.

    Collects the state of each pipeline stage from files on disk (the BacDive
    phenotype table, the featurize output and log, the train log and the eval
    report), lists the main artifacts with their sizes, appends the commits
    reported by ``_git_log_since_yesterday()`` plus a few housekeeping
    reminders, and writes the page through a temporary file that is then
    renamed over the final path.

    The status glyphs, dashes and arrows in the generated Markdown are real
    Unicode characters, so the file is written explicitly as UTF-8.
    """
    pheno_path = config.DATA / "bacdive_phenotypes.parquet"
    features_jsonl = config.DATA / "features.jsonl"
    featurize_log = config.ARTIFACTS / "featurize.log"
    eval_report = config.ARTIFACTS / "eval_report.md"
    blockers = config.ARTIFACTS / "blockers.md"
    train_log = config.ARTIFACTS / "train.log"

    pheno = pd.read_parquet(pheno_path) if pheno_path.exists() else None
    feats_n = _read_jsonl_count(features_jsonl)
    summary = _read_featurize_log_summary(featurize_log)
    # Queried once so both sections below agree even if the report appears
    # while this function is running.
    has_eval_report = eval_report.exists()

    lines: list[str] = [
        "# Overnight run — summary",
        "",
        f"_Written {datetime.now(UTC).isoformat(timespec='minutes')}_",
        "",
    ]

    # Pipeline status
    lines += ["## Pipeline status", ""]
    if pheno is not None:
        has_genome = pheno["genome_accession"].notna()
        has_temp = pheno["optimal_temperature_c"].notna()
        n_genome = has_genome.sum()
        n_temp = has_temp.sum()
        n_train_ready = (has_genome & has_temp).sum()
        lines.append(f"- ✅ BacDive scan: **{len(pheno):,}** strains pulled")
        # Two-space indent so Markdown nests these under the bullet above.
        lines.append(f"  - {n_genome:,} have genome accessions")
        lines.append(f"  - {n_temp:,} have optimal_temperature_c labels")
        lines.append(f"  - **{n_train_ready:,}** strains are training-ready (genome + T_opt)")
    else:
        lines.append("- ❌ BacDive scan did not complete (data/bacdive_phenotypes.parquet missing)")

    if summary:
        # max(1, ...) guards against division by zero on an empty run.
        pct = 100 * summary["completed"] / max(1, summary["total"])
        success_rate = 100 * summary["success"] / max(1, summary["completed"])
        if summary["completed"] >= summary["total"]:
            status = "✅"
            descriptor = "complete"
        else:
            status = "🟡"
            descriptor = f"in progress ({pct:.0f}%)"
        lines.append(f"- {status} Featurize: {descriptor}")
        lines.append(f"  - Processed: {summary['completed']:,} / {summary['total']:,}")
        lines.append(f"  - Successful: {summary['success']:,} ({success_rate:.1f}%)")
        lines.append(f"  - Failed: {summary['fail']:,} (mostly suppressed/withdrawn NCBI assemblies)")
    elif feats_n:
        lines.append(f"- 🟡 Featurize: {feats_n:,} feature rows (log not parseable)")
    else:
        lines.append("- ❌ Featurize did not run")

    if train_log.exists():
        lines.append("- ✅ Training: see `artifacts/train.log` for stdout")
    else:
        lines.append("- ❌ Training: not yet run (waits for featurize completion)")

    if has_eval_report:
        lines.append(f"- ✅ Eval report: **`{eval_report.relative_to(ROOT)}`**")
    else:
        lines.append("- ❌ Eval report: not yet generated")
    lines.append("")

    # What to read first
    lines += ["## What to read first", ""]
    next_steps: list[str] = []
    if has_eval_report:
        next_steps.append(
            f"Open **`{eval_report.relative_to(ROOT)}`** — headline metrics + per-target detail."
        )
    else:
        next_steps.append("Wait for `artifacts/eval_report.md` to be generated, then open it.")
    if blockers.exists():
        next_steps.append(f"Open **`{blockers.relative_to(ROOT)}`** — anything I got stuck on.")
    next_steps.append("Check `git log --oneline` to see the commit timeline.")
    lines.extend(f"{i}. {step}" for i, step in enumerate(next_steps, start=1))
    lines.append("")

    # Files written this run
    lines += ["## Files of interest", ""]
    files_of_interest = [
        ("artifacts/eval_report.md", "headline result + metrics"),
        ("artifacts/baseline_results.json", "machine-readable per-fold scores"),
        ("data/bacdive_phenotypes.parquet", "phenotype labels (gitignored)"),
        ("data/features.parquet", "extracted genome features (gitignored)"),
        ("data/training_table.parquet", "merged + group-keyed table used for training (gitignored)"),
    ]
    for rel, desc in files_of_interest:
        path = ROOT / rel
        if path.exists():
            marker = "✅"
            size = path.stat().st_size
            # Anything from 100 kB upwards is shown in MB, smaller files in KB.
            size_label = f"{size / 1e6:.1f} MB" if size >= 100_000 else f"{size / 1e3:.1f} KB"
        else:
            marker = "❌"
            size_label = ""
        lines.append(f"- {marker} `{rel}` — {desc} {size_label}")
    lines.append("")

    # Commits overnight
    commits = _git_log_since_yesterday()
    if commits:
        lines += ["## Commits since yesterday", ""]
        lines.extend(f"- {c}" for c in commits)
        lines.append("")

    # Reminders
    lines += ["## Reminders", ""]
    lines.append("- The NCBI API key was pasted into chat earlier in this session — "
                 "rotate it at ncbi.nlm.nih.gov → Account Settings → API Key Management → "
                 "Revoke + Create New.")
    lines.append("- The `caffeinate -dimsu` you started for the overnight run can be stopped "
                 "now (`Ctrl+C` in that terminal).")
    lines.append("- Display sleep / Battery settings: revert to your normal preferences if "
                 "you changed them last night.")
    lines.append("")

    out_path = ROOT / "OVERNIGHT_SUMMARY.md"
    # Atomic write — avoid races with the periodic regen loop
    tmp_path = out_path.with_suffix(".md.tmp")
    # Explicit UTF-8: the page contains emoji and dashes that a non-UTF-8
    # locale encoding could not represent.
    tmp_path.write_text("\n".join(lines), encoding="utf-8")
    tmp_path.replace(out_path)
    print(f"Wrote {out_path}")
# Script entry point: regenerate the summary when run directly.
if __name__ == "__main__":
    main()