File size: 7,445 Bytes
4b79970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b52ab8
 
 
 
 
 
4b79970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de9e822
 
 
 
4b79970
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""Render a one-page OVERNIGHT_SUMMARY.md the user reads first.

Pulls headline numbers from the BacDive phenotype table, the featurize log,
and recent git history, points to the eval report and blockers file when they
exist, and renders a single short page that orients the reader cold.
"""
from __future__ import annotations

import re
import subprocess
from datetime import UTC, datetime
from pathlib import Path

import pandas as pd

from microbe_model import config

# Project root taken from the package config; git is invoked against it and
# the summary file is written directly inside it.
ROOT = config.ROOT


def _read_jsonl_count(path: Path) -> int:
    if not path.exists():
        return 0
    with open(path) as fh:
        return sum(1 for _ in fh)


def _read_featurize_log_summary(path: Path) -> dict[str, str | int]:
    """Parse the last tqdm line for success/fail counts."""
    if not path.exists():
        return {}
    with open(path) as fh:
        text = fh.read().replace("\r", "\n")
    last_match = None
    pattern = re.compile(r"(\d+)/(\d+).*?fail=(\d+).*?success=(\d+)")
    for m in pattern.finditer(text):
        last_match = m
    if last_match is None:
        # Fall back to alternate ordering (success first then fail)
        alt = re.compile(r"(\d+)/(\d+).*?success=(\d+).*?fail=(\d+)")
        for m in alt.finditer(text):
            last_match = m
        if last_match is None:
            return {}
        completed, total, success, fail = (int(last_match.group(i)) for i in range(1, 5))
    else:
        completed, total, fail, success = (int(last_match.group(i)) for i in range(1, 5))
    return {
        "completed": completed,
        "total": total,
        "success": success,
        "fail": fail,
    }


def _git_log_since_yesterday() -> list[str]:
    """Return ``git log --oneline --since=yesterday`` entries for ``ROOT``.

    This is best-effort: the commit list is an optional section of the
    summary, so any failure to obtain it yields an empty list rather than
    aborting the whole page.

    Returns:
        One stripped, non-empty string per commit line printed by git, or an
        empty list when git exits non-zero or cannot be launched at all.
    """
    try:
        out = subprocess.check_output(
            ["git", "-C", str(ROOT), "log", "--oneline", "--since=yesterday"],
            text=True,
            # Decode explicitly so a non-ASCII commit subject cannot raise
            # under a non-UTF-8 locale.
            encoding="utf-8",
            errors="replace",
        )
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git ran but failed (e.g. not a repository).
        # OSError: git could not be started (e.g. not installed / not on PATH).
        return []
    return [entry for entry in (line.strip() for line in out.splitlines()) if entry]


def main() -> None:
    """Assemble OVERNIGHT_SUMMARY.md from the current pipeline artifacts.

    Reads the phenotype parquet, the features JSONL, the featurize log and
    recent git history, builds the Markdown sections in order (pipeline
    status, what to read first, files of interest, commits, reminders), and
    writes the page to ``ROOT / "OVERNIGHT_SUMMARY.md"`` through a temporary
    file that is then renamed into place. Prints the output path when done.
    """
    pheno_path = config.DATA / "bacdive_phenotypes.parquet"
    features_jsonl = config.DATA / "features.jsonl"
    featurize_log = config.ARTIFACTS / "featurize.log"
    eval_report = config.ARTIFACTS / "eval_report.md"
    blockers = config.ARTIFACTS / "blockers.md"
    train_log = config.ARTIFACTS / "train.log"

    pheno = pd.read_parquet(pheno_path) if pheno_path.exists() else None
    feats_n = _read_jsonl_count(features_jsonl)
    summary = _read_featurize_log_summary(featurize_log)

    lines: list[str] = [
        "# Overnight run — summary",
        "",
        f"_Written {datetime.now(UTC).isoformat(timespec='minutes')}_",
        "",
    ]

    # --- Pipeline status -------------------------------------------------
    lines.append("## Pipeline status")
    lines.append("")
    if pheno is not None:
        has_genome = pheno["genome_accession"].notna()
        has_temp = pheno["optimal_temperature_c"].notna()
        n_genome = has_genome.sum()
        n_temp = has_temp.sum()
        # A strain is usable for training only when it has both a genome
        # accession and a temperature label.
        n_train_ready = (has_genome & has_temp).sum()
        lines.append(f"- ✅ BacDive scan: **{len(pheno):,}** strains pulled")
        lines.append(f"  - {n_genome:,} have genome accessions")
        lines.append(f"  - {n_temp:,} have optimal_temperature_c labels")
        lines.append(f"  - **{n_train_ready:,}** strains are training-ready (genome + T_opt)")
    else:
        lines.append("- ❌ BacDive scan did not complete (data/bacdive_phenotypes.parquet missing)")

    if summary:
        # max(1, ...) guards the divisions against a zero counter.
        pct = 100 * summary["completed"] / max(1, summary["total"])
        success_rate = 100 * summary["success"] / max(1, summary["completed"])
        if summary["completed"] >= summary["total"]:
            status = "✅"
            descriptor = "complete"
        else:
            status = "🟡"
            descriptor = f"in progress ({pct:.0f}%)"
        lines.append(f"- {status} Featurize: {descriptor}")
        lines.append(f"  - Processed: {summary['completed']:,} / {summary['total']:,}")
        lines.append(f"  - Successful: {summary['success']:,} ({success_rate:.1f}%)")
        lines.append(f"  - Failed: {summary['fail']:,} (mostly suppressed/withdrawn NCBI assemblies)")
    elif feats_n:
        # No parseable progress line, but feature rows exist on disk.
        lines.append(f"- 🟡 Featurize: {feats_n:,} feature rows (log not parseable)")
    else:
        lines.append("- ❌ Featurize did not run")

    if train_log.exists():
        lines.append("- ✅ Training: see `artifacts/train.log` for stdout")
    else:
        lines.append("- ⏭ Training: not yet run (waits for featurize completion)")

    if eval_report.exists():
        lines.append(f"- ✅ Eval report: **`{eval_report.relative_to(ROOT)}`**")
    else:
        lines.append("- ⏭ Eval report: not yet generated")

    lines.append("")

    # --- What to read first ----------------------------------------------
    lines.append("## What to read first")
    lines.append("")
    next_steps: list[str] = []
    if eval_report.exists():
        next_steps.append(
            f"Open **`{eval_report.relative_to(ROOT)}`** — headline metrics + per-target detail."
        )
    else:
        next_steps.append("Wait for `artifacts/eval_report.md` to be generated, then open it.")
    if blockers.exists():
        next_steps.append(f"Open **`{blockers.relative_to(ROOT)}`** — anything I got stuck on.")
    next_steps.append("Check `git log --oneline` to see the commit timeline.")
    lines.extend(f"{i}. {step}" for i, step in enumerate(next_steps, start=1))
    lines.append("")

    # --- Files of interest -----------------------------------------------
    lines.append("## Files of interest")
    lines.append("")
    files_of_interest = [
        ("artifacts/eval_report.md", "headline result + metrics"),
        ("artifacts/baseline_results.json", "machine-readable per-fold scores"),
        ("data/bacdive_phenotypes.parquet", "phenotype labels (gitignored)"),
        ("data/features.parquet", "extracted genome features (gitignored)"),
        ("data/training_table.parquet", "merged + group-keyed table used for training (gitignored)"),
    ]
    for rel, desc in files_of_interest:
        try:
            # A single stat() both tests for existence and fetches the size,
            # so the file cannot disappear between two separate calls.
            size = (ROOT / rel).stat().st_size
        except OSError:
            # Missing file: dash marker and no size (and no trailing space).
            lines.append(f"- — `{rel}` — {desc}")
            continue
        size_label = f"{size / 1e6:.1f} MB" if size >= 100_000 else f"{size / 1e3:.1f} KB"
        lines.append(f"- ✅ `{rel}` — {desc} {size_label}")
    lines.append("")

    # --- Commits overnight (section omitted when there are none) ----------
    commits = _git_log_since_yesterday()
    if commits:
        lines.append("## Commits since yesterday")
        lines.append("")
        lines.extend(f"- {c}" for c in commits)
        lines.append("")

    # --- Reminders ---------------------------------------------------------
    lines.append("## Reminders")
    lines.append("")
    lines.append("- The NCBI API key was pasted into chat earlier in this session — "
                 "rotate it at ncbi.nlm.nih.gov → Account Settings → API Key Management → "
                 "Revoke + Create New.")
    lines.append("- The `caffeinate -dimsu` you started for the overnight run can be stopped "
                 "now (`Ctrl+C` in that terminal).")
    lines.append("- Display sleep / Battery settings: revert to your normal preferences if "
                 "you changed them last night.")
    lines.append("")

    out_path = ROOT / "OVERNIGHT_SUMMARY.md"
    # Atomic write — avoid races with the periodic regen loop: write the full
    # page to a sibling temp file, then rename it over the real one.
    tmp_path = out_path.with_suffix(".md.tmp")
    # The page contains emoji and dashes, so the encoding must be explicit;
    # the platform default may be unable to encode them.
    tmp_path.write_text("\n".join(lines), encoding="utf-8")
    tmp_path.replace(out_path)
    print(f"Wrote {out_path}")


# Script entry point: regenerate the summary when this file is run directly.
if __name__ == "__main__":
    main()