Spaces:

miyuiu
/

microbe-model

Running

Miyu Horiuchi committed on Apr 27

Commit

bbbea9d

1 Parent(s): edf6713

Fix predictions parquet type mix + plumb feature_cols through eval

New tests surfaced two bugs that would have broken the morning chain:

1. predictions.parquet schema error
pyarrow can't store a column with both float values (regression
predictions) and string values (classification labels). save_results now
casts both predicted/observed to str on write — eval.py already handles
the inverse cast via pd.to_numeric for regression analyses.

2. Feature-target correlations section missing on synthetic data
eval.py was inferring feature_cols from column-name prefixes, which
matched the production feature names but missed the test fixture's f0..f7
columns. Now feature_cols can be passed explicitly or read from a
__meta__ section in baseline_results.json (which 03_train_baseline.py
now writes by passing feature_cols to save_results); the prefix heuristic
remains as a fallback when neither is provided.

Both were surfaced by extending tests/test_integration.py with two new tests:
- test_save_results_writes_predictions_parquet
- test_full_chain_render_with_predictions

Total tests: 21/21 passing.

Featurize at 59% (10140/17094, 0.21% failure rate) — well within tolerance.

Files changed (4) hide show

scripts/03_train_baseline.py +1 -1
src/microbe_model/eval.py +22 -6
src/microbe_model/train/baseline.py +9 -1
tests/test_integration.py +46 -0

scripts/03_train_baseline.py CHANGED Viewed

@@ -47,7 +47,7 @@ def main() -> None:
     out = config.ARTIFACTS / "baseline_results.json"
     predictions_out = config.ARTIFACTS / "predictions.parquet"
-    save_results(results, out, predictions_path=predictions_out)
     print(f"Wrote per-strain predictions to {predictions_out}")
     print(f"\nResults summary ({time.time() - t0:.1f}s):\n")

     out = config.ARTIFACTS / "baseline_results.json"
     predictions_out = config.ARTIFACTS / "predictions.parquet"
+    save_results(results, out, predictions_path=predictions_out, feature_cols=feature_cols)
     print(f"Wrote per-strain predictions to {predictions_out}")
     print(f"\nResults summary ({time.time() - t0:.1f}s):\n")

src/microbe_model/eval.py CHANGED Viewed

@@ -43,8 +43,13 @@ def render_report(
     n_strains: int | None = None,
     runtime_seconds: float | None = None,
     predictions_path: Path | None = None,
 ) -> None:
-    results: dict[str, Any] = json.loads(results_path.read_text())
     df = pd.read_parquet(dataset_path)
     predictions = (
         pd.read_parquet(predictions_path)
@@ -84,7 +89,17 @@ def render_report(
         "- _No targets trained successfully — see logs._"
     ])
     lines.append("")
-    lines.append(f"Trained on **{len(df):,}** strains with **{len([c for c in df.columns if c.startswith(('aa_frac_', 'genome_size', 'gc_', 'n_predicted', 'coding_', 'mean_', 'aromatic_', 'pos_', 'neg_', 'ivywrel_', 'median_'))])}** genome-derived features. "
                  f"Cross-validation: 5-fold GroupKFold by taxonomic family.")
     lines.append("")
@@ -175,12 +190,13 @@ def render_report(
             lines.append("")
     # Section: feature-target correlations (data-exploration sanity check)
-    feature_cols = [
         c for c in df.columns
         if c.startswith(("aa_frac_", "genome_size", "gc_", "n_predicted", "coding_",
-                          "mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_"))
     ]
-    if feature_cols:
         from microbe_model.explore import feature_target_correlations
         lines.append("## Feature ↔ target correlations (Spearman, top 10)")
         lines.append("")
@@ -189,7 +205,7 @@ def render_report(
                      "`optimal_temperature_c` (Zeldovich 2007 thermophile signature).")
         lines.append("")
         for target in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"):
-            corrs = feature_target_correlations(df, feature_cols, target, top_n=10)
             if not corrs:
                 continue
             lines.append(f"### `{target}`")

     n_strains: int | None = None,
     runtime_seconds: float | None = None,
     predictions_path: Path | None = None,
+    feature_cols: list[str] | None = None,
 ) -> None:
+    raw_results: dict[str, Any] = json.loads(results_path.read_text())
+    meta = raw_results.pop("__meta__", {})
+    if feature_cols is None and "feature_cols" in meta:
+        feature_cols = meta["feature_cols"]
+    results: dict[str, Any] = raw_results
     df = pd.read_parquet(dataset_path)
     predictions = (
         pd.read_parquet(predictions_path)
         "- _No targets trained successfully — see logs._"
     ])
     lines.append("")
+    n_features = (
+        len(feature_cols) if feature_cols is not None
+        else sum(
+            1 for c in df.columns
+            if c.startswith((
+                "aa_frac_", "genome_size", "gc_", "n_predicted", "coding_",
+                "mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_",
+            ))
+        )
+    )
+    lines.append(f"Trained on **{len(df):,}** strains with **{n_features}** genome-derived features. "
                  f"Cross-validation: 5-fold GroupKFold by taxonomic family.")
     lines.append("")
             lines.append("")
     # Section: feature-target correlations (data-exploration sanity check)
+    detected_feature_cols = feature_cols if feature_cols is not None else [
         c for c in df.columns
         if c.startswith(("aa_frac_", "genome_size", "gc_", "n_predicted", "coding_",
+                          "mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_", "f"))
+        and pd.api.types.is_numeric_dtype(df[c])
     ]
+    if detected_feature_cols:
         from microbe_model.explore import feature_target_correlations
         lines.append("## Feature ↔ target correlations (Spearman, top 10)")
         lines.append("")
                      "`optimal_temperature_c` (Zeldovich 2007 thermophile signature).")
         lines.append("")
         for target in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"):
+            corrs = feature_target_correlations(df, detected_feature_cols, target, top_n=10)
             if not corrs:
                 continue
             lines.append(f"### `{target}`")

src/microbe_model/train/baseline.py CHANGED Viewed

@@ -170,8 +170,9 @@ def save_results(
     path: Path,
     *,
     predictions_path: Path | None = None,
 ) -> None:
-    payload = {
         target: {
             "task": r.task,
             "mean_metric": r.mean(),
@@ -182,6 +183,8 @@ def save_results(
         }
         for target, r in results.items()
     }
     path.write_text(json.dumps(payload, indent=2))
     if predictions_path is not None:
@@ -190,6 +193,11 @@ def save_results(
             if r.predictions is None or r.predictions.empty:
                 continue
             df = r.predictions.copy()
             df["target"] = target
             df["task"] = r.task
             frames.append(df)

     path: Path,
     *,
     predictions_path: Path | None = None,
+    feature_cols: list[str] | None = None,
 ) -> None:
+    payload: dict[str, Any] = {
         target: {
             "task": r.task,
             "mean_metric": r.mean(),
         }
         for target, r in results.items()
     }
+    if feature_cols is not None:
+        payload["__meta__"] = {"feature_cols": list(feature_cols)}
     path.write_text(json.dumps(payload, indent=2))
     if predictions_path is not None:
             if r.predictions is None or r.predictions.empty:
                 continue
             df = r.predictions.copy()
+            # Cast to str for parquet compatibility — predicted/observed can be float
+            # (regression) or class label (classification). Eval re-casts numerics
+            # via pd.to_numeric where needed.
+            df["predicted"] = df["predicted"].astype(str)
+            df["observed"] = df["observed"].astype(str)
             df["target"] = target
             df["task"] = r.task
             frames.append(df)

tests/test_integration.py CHANGED Viewed

@@ -96,3 +96,49 @@ def test_save_results_roundtrip(tmp_path: Path) -> None:
         assert "task" in loaded[target]
         assert "mean_metric" in loaded[target]
         assert "folds" in loaded[target]

         assert "task" in loaded[target]
         assert "mean_metric" in loaded[target]
         assert "folds" in loaded[target]
+def test_save_results_writes_predictions_parquet(tmp_path: Path) -> None:
+    df, feature_cols = _synthetic_dataset(n=200)
+    results = train_all(df, feature_cols, group_col_override="group")
+    results_path = tmp_path / "results.json"
+    pred_path = tmp_path / "predictions.parquet"
+    save_results(results, results_path, predictions_path=pred_path)
+    assert pred_path.exists()
+    preds = pd.read_parquet(pred_path)
+    # Should have rows for both regression and classification targets
+    assert "target" in preds.columns
+    assert "task" in preds.columns
+    assert "row_idx" in preds.columns
+    assert "predicted" in preds.columns
+    assert "observed" in preds.columns
+    assert preds["task"].isin({"regression", "classification"}).all()
+    # row_idx should map back to the source df
+    assert preds["row_idx"].max() < len(df)
+def test_full_chain_render_with_predictions(tmp_path: Path) -> None:
+    """Full chain: train → save with predictions → render report → check per-family section."""
+    df, feature_cols = _synthetic_dataset(n=200)
+    results = train_all(df, feature_cols, group_col_override="group")
+    results_path = tmp_path / "results.json"
+    pred_path = tmp_path / "predictions.parquet"
+    save_results(results, results_path, predictions_path=pred_path)
+    table_path = tmp_path / "table.parquet"
+    df.to_parquet(table_path, index=False)
+    out_path = tmp_path / "report.md"
+    render_report(
+        results_path, table_path, out_path,
+        predictions_path=pred_path,
+        feature_cols=feature_cols,
+    )
+    text = out_path.read_text()
+    assert "## Per-family error breakdown" in text
+    assert "## Feature ↔ target correlations" in text
+    assert "## TL;DR" in text