"""Evaluation report generation. Renders a markdown report from a trained-results JSON (the output of train/baseline.py) joined with the source dataset. Designed to be readable cold — every number includes a comparison baseline so the reader can interpret it without context. """ from __future__ import annotations import json from datetime import UTC, datetime from pathlib import Path from typing import Any import numpy as np import pandas as pd from microbe_model import config def _baseline_mae(y: np.ndarray) -> float: """MAE of the always-predict-mean baseline (sanity floor).""" if len(y) == 0: return float("nan") return float(np.mean(np.abs(y - np.mean(y)))) def _baseline_f1(y: np.ndarray) -> float: """Macro-F1 of the always-predict-majority baseline.""" from sklearn.metrics import f1_score if len(y) == 0: return float("nan") values, counts = np.unique(y, return_counts=True) majority = values[np.argmax(counts)] pred = np.full_like(y, majority) return float(f1_score(y, pred, average="macro")) def render_report( results_path: Path, dataset_path: Path, out_path: Path, *, n_strains: int | None = None, runtime_seconds: float | None = None, predictions_path: Path | None = None, feature_cols: list[str] | None = None, ) -> None: raw_results: dict[str, Any] = json.loads(results_path.read_text()) meta = raw_results.pop("__meta__", {}) if feature_cols is None and "feature_cols" in meta: feature_cols = meta["feature_cols"] results: dict[str, Any] = raw_results df = pd.read_parquet(dataset_path) predictions = ( pd.read_parquet(predictions_path) if predictions_path is not None and predictions_path.exists() else None ) lines: list[str] = [] lines.append("# microbe-model — v0 baseline eval report") lines.append("") lines.append(f"_Generated: {datetime.now(UTC).isoformat(timespec='seconds')}_") lines.append("") # Section: TL;DR — the headline number lines.append("## TL;DR") lines.append("") headline_lines = [] for target, r in results.items(): if not r["folds"]: continue y = df[target].dropna().to_numpy() if r["task"] == "regression": baseline = _baseline_mae(y.astype(float)) improvement = (baseline - r["mean_metric"]) / max(0.001, baseline) * 100 headline_lines.append( f"- **`{target}`**: MAE = **{r['mean_metric']:.2f}** " f"(vs always-predict-mean {baseline:.2f}, **{improvement:+.0f}%**)" ) else: baseline = _baseline_f1(y) improvement = (r["mean_metric"] - baseline) / max(0.001, baseline) * 100 headline_lines.append( f"- **`{target}`**: macro-F1 = **{r['mean_metric']:.3f}** " f"(vs always-predict-majority {baseline:.3f}, **{improvement:+.0f}%**)" ) lines.extend(headline_lines if headline_lines else [ "- _No targets trained successfully — see logs._" ]) lines.append("") n_features = ( len(feature_cols) if feature_cols is not None else sum( 1 for c in df.columns if c.startswith(( "aa_frac_", "genome_size", "gc_", "n_predicted", "coding_", "mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_", )) ) ) lines.append(f"Trained on **{len(df):,}** strains with **{n_features}** genome-derived features. " f"Cross-validation: 5-fold GroupKFold by taxonomic family.") lines.append("") # Section: corpus lines.append("## Corpus") lines.append("") lines.append(f"- Total strains in feature table: **{len(df):,}**") if n_strains is not None: lines.append(f"- Total strains attempted (had genome accession + label): {n_strains:,}") lines.append(f"- Feature-extraction success rate: {100 * len(df) / max(1, n_strains):.1f}%") if runtime_seconds is not None: lines.append(f"- Featurize wall time: {runtime_seconds / 60:.1f} min") # Per-target label counts lines.append("- Labeled-strain counts by target:") for target in ("optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"): if target in df.columns: n = df[target].notna().sum() lines.append(f" - `{target}`: {n:,}") lines.append("") # Section: data exploration — distributions of the regression targets lines.append("## Target distributions") lines.append("") for target in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"): if target not in df.columns: continue y = df[target].dropna() if len(y) == 0: continue lines.append( f"- `{target}`: n={len(y):,}, mean={y.mean():.2f}, " f"std={y.std():.2f}, p10={y.quantile(0.1):.2f}, " f"median={y.median():.2f}, p90={y.quantile(0.9):.2f}" ) if "oxygen_requirement" in df.columns: lines.append("- `oxygen_requirement`:") for cls, n in df["oxygen_requirement"].value_counts().head(10).items(): lines.append(f" - `{cls}`: {n:,}") lines.append("") # Section: per-target results lines.append("## Per-target results (5-fold GroupKFold by family)") lines.append("") lines.append("Metrics: regression = MAE (lower is better), classification = macro-F1 (higher is better).") lines.append("Each is shown alongside the dumb-baseline (always-predict-mean / always-predict-majority).") lines.append("") lines.append("| Target | Task | n labeled | Model metric | Baseline | Improvement |") lines.append("|---|---|---|---|---|---|") for target, r in results.items(): if not r["folds"]: lines.append(f"| {target} | {r['task']} | — | _skipped (insufficient data)_ | — | — |") continue y = df[target].dropna().to_numpy() n_labeled = len(y) if r["task"] == "regression": baseline = _baseline_mae(y.astype(float)) mean = r["mean_metric"] improvement = f"{(baseline - mean) / baseline * 100:+.1f}%" lines.append(f"| `{target}` | regression | {n_labeled:,} | " f"MAE={mean:.3f} | MAE={baseline:.3f} | {improvement} |") else: baseline = _baseline_f1(y) mean = r["mean_metric"] improvement = f"{(mean - baseline) / max(0.01, baseline) * 100:+.1f}%" lines.append(f"| `{target}` | classification | {n_labeled:,} | " f"F1={mean:.3f} | F1={baseline:.3f} | {improvement} |") lines.append("") # Section: per-fold detail for target, r in results.items(): if not r["folds"]: continue lines.append(f"### `{target}` — fold-by-fold") lines.append("") lines.append("| Fold | Metric | Train | Test |") lines.append("|---|---|---|---|") for i, f in enumerate(r["folds"]): lines.append(f"| {i+1} | {f['metric_name']} = {f['value']:.3f} | " f"n={f['n_train']:,} | n={f['n_test']:,} |") lines.append("") top = r.get("top_features", {}) if top: lines.append(f"**Top 10 features for `{target}`:**") lines.append("") for name, importance in list(top.items())[:10]: lines.append(f"- `{name}` — {importance:.4f}") lines.append("") # Section: feature-target correlations (data-exploration sanity check) detected_feature_cols = feature_cols if feature_cols is not None else [ c for c in df.columns if c.startswith(("aa_frac_", "genome_size", "gc_", "n_predicted", "coding_", "mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_", "f")) and pd.api.types.is_numeric_dtype(df[c]) ] if detected_feature_cols: from microbe_model.explore import feature_target_correlations lines.append("## Feature ↔ target correlations (Spearman, top 10)") lines.append("") lines.append("Sanity-checks the biology — features known to track each target should " "appear here at high |ρ|. E.g. `ivywrel_frac` should correlate with " "`optimal_temperature_c` (Zeldovich 2007 thermophile signature).") lines.append("") for target in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"): corrs = feature_target_correlations(df, detected_feature_cols, target, top_n=10) if not corrs: continue lines.append(f"### `{target}`") lines.append("") lines.append("| Feature | Spearman ρ | p-value |") lines.append("|---|---|---|") for row in corrs: lines.append(f"| `{row['feature']}` | {row['spearman_rho']:+.3f} | " f"{row['p_value']:.1e} |") lines.append("") # Section: per-phylum error breakdown (regression targets only) if predictions is not None and not predictions.empty and "row_idx" in predictions.columns: joined = predictions.merge( df[["genus", "family"]].rename_axis("row_idx").reset_index(), on="row_idx", how="left", ) regression_preds = joined[joined["task"] == "regression"] if not regression_preds.empty: lines.append("## Per-family error breakdown (regression targets)") lines.append("") lines.append("Top 15 most-represented families, MAE per family. Highlights where the " "model is doing well vs. struggling.") lines.append("") for target in regression_preds["target"].unique(): sub = regression_preds[regression_preds["target"] == target].copy() sub["abs_error"] = ( pd.to_numeric(sub["predicted"]) - pd.to_numeric(sub["observed"]) ).abs() grp = (sub.groupby("family", dropna=False) .agg(n=("abs_error", "size"), mae=("abs_error", "mean")) .sort_values("n", ascending=False) .head(15)) if grp.empty: continue lines.append(f"### `{target}`") lines.append("") lines.append("| Family | n | MAE |") lines.append("|---|---|---|") for fam, row in grp.iterrows(): fam_label = fam if pd.notna(fam) else "_(no family)_" lines.append(f"| {fam_label} | {int(row['n'])} | {row['mae']:.3f} |") lines.append("") # Section: limitations lines.append("## Known limitations") lines.append("") lines.append("- **Survivorship bias.** BacDive only contains organisms that have been cultured " "successfully at least once. The model cannot generalize to truly uncultured strains " "without explicit out-of-distribution evaluation.") lines.append("- **Optimum derivation is heuristic.** Most BacDive temperature entries are tagged " "as `growth` (positive growth at this temperature), not `optimum`. We approximate " "the optimum as the median of positive-growth temperatures when no explicit " "optimum is recorded — this can be off by 5°C or more for some strains.") lines.append("- **Family grouping is naive.** The current `family` column is derived from the " "genus (first word of binomial name). A proper LPSN/GTDB family assignment would " "give tighter taxonomic grouping.") lines.append("- **Feature set is shallow.** No HMM/KEGG annotations, no codon usage indices, no " "tRNA counts. These are interpretable next steps before moving to genome LMs.") lines.append("- **Pyrodigal accuracy.** Gene prediction quality drops on highly-fragmented " "assemblies and atypical genetic codes. Not currently flagged in the feature set.") lines.append("") # Section: next steps lines.append("## Next steps") lines.append("") lines.append("1. **Add tetranucleotide / codon-usage features.** ~50 extra columns, " "well-known signal for thermophily.") lines.append("2. **Replace naive family lookup with LPSN/GTDB join.** Reduces leakage in CV.") lines.append("3. **Integrate KOMODO media DB** as a richer label source than BacDive alone.") lines.append("4. **Move to genome embeddings** (Nucleotide Transformer / Evo-1 / DNABERT-2) " "once the tabular ceiling is established.") lines.append("5. **Active learning loop**: select novel-family strains where the model is " "uncertain, prioritize these for wet-lab cultivation testing.") lines.append("") out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text("\n".join(lines)) if __name__ == "__main__": render_report( results_path=config.ARTIFACTS / "baseline_results.json", dataset_path=config.DATA / "training_table.parquet", out_path=config.ARTIFACTS / "eval_report.md", predictions_path=config.ARTIFACTS / "predictions.parquet", )