Spaces:

miyuiu
/

microbe-model

Running

Miyu Horiuchi Claude Opus 4.7 (1M context) committed on Apr 26

Commit

7ba05d6

1 Parent(s): d082ced

Fix classification fold bug + add end-to-end integration tests

Bug: XGBClassifier requires contiguous class labels 0..k-1, but global LabelEncoder
followed by GroupKFold can produce train folds with non-contiguous subsets (e.g.
classes {0,1,3,4,5} when class 2 happened to be all in the test fold). xgboost
raised ValueError and the entire classification target failed.

Fix in src/microbe_model/train/baseline.py:
- Re-encode labels per fold (LabelEncoder fit on train fold only)
- Drop test samples whose class never appeared in train (correct behavior — model
cannot be evaluated on a class it has never seen)
- Skip folds where train fold has fewer than 2 distinct classes

Caught by a partial-data smoke run on the live featurize output. Now caught
automatically by tests/test_integration.py:
- test_train_all_handles_classification_with_missing_classes_per_fold — exercises
the bug case with synthetic data containing 5 oxygen classes across 12 families
- test_render_report_writes_markdown — full train→save→render path
- test_save_results_roundtrip — JSON serialization

All three pass. Total tests: 15/15.

Featurize at 11% (1799/17094 strains, 0 failures across the version-fallback path,
3 ascertainment failures elsewhere — all within tolerance).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

src/microbe_model/train/baseline.py +23 -12
tests/test_integration.py +98 -0

src/microbe_model/train/baseline.py CHANGED Viewed

@@ -60,10 +60,9 @@ def train_target(
         return TargetResult(target=target, task=task)
     if task == "classification":
-        encoder = LabelEncoder()
-        y_enc = encoder.fit_transform(y.astype(str))
     else:
-        y_enc = y.to_numpy(dtype=float)
     n_unique_groups = groups.nunique()
     splits = min(n_splits, max(2, n_unique_groups))
@@ -73,11 +72,21 @@ def train_target(
     importance_acc = np.zeros(len(feature_cols), dtype=float)
     fold_count = 0
-    for tr_idx, te_idx in kfold.split(X, y_enc, groups):
         if task == "classification":
-            n_classes = len(np.unique(y_enc[tr_idx]))
-            if n_classes < 2:
                 continue
             model = xgb.XGBClassifier(
                 n_estimators=300,
                 max_depth=5,
@@ -86,10 +95,11 @@ def train_target(
                 n_jobs=-1,
                 eval_metric="mlogloss",
             )
-            model.fit(X.iloc[tr_idx], y_enc[tr_idx])
-            preds = model.predict(X.iloc[te_idx])
-            score = f1_score(y_enc[te_idx], preds, average="macro")
             metric = "f1_macro"
         else:
             model = xgb.XGBRegressor(
                 n_estimators=500,
@@ -98,10 +108,11 @@ def train_target(
                 tree_method="hist",
                 n_jobs=-1,
             )
-            model.fit(X.iloc[tr_idx], y_enc[tr_idx])
             preds = model.predict(X.iloc[te_idx])
-            score = mean_absolute_error(y_enc[te_idx], preds)
             metric = "mae"
         result.folds.append(FoldResult(
             target=target,
@@ -109,7 +120,7 @@ def train_target(
             metric_name=metric,
             value=float(score),
             n_train=int(len(tr_idx)),
-            n_test=int(len(te_idx)),
         ))
         importance_acc += model.feature_importances_
         fold_count += 1

         return TargetResult(target=target, task=task)
     if task == "classification":
+        y_str = y.astype(str).to_numpy()
     else:
+        y_arr = y.to_numpy(dtype=float)
     n_unique_groups = groups.nunique()
     splits = min(n_splits, max(2, n_unique_groups))
     importance_acc = np.zeros(len(feature_cols), dtype=float)
     fold_count = 0
+    split_iter = kfold.split(X, y_str if task == "classification" else y_arr, groups)
+    for tr_idx, te_idx in split_iter:
         if task == "classification":
+            # Per-fold encoding: ensures contiguous 0..k-1 labels for xgboost.
+            # Test samples whose class never appears in train are dropped from eval.
+            fold_encoder = LabelEncoder()
+            y_tr = fold_encoder.fit_transform(y_str[tr_idx])
+            if len(fold_encoder.classes_) < 2:
                 continue
+            known = set(fold_encoder.classes_)
+            te_mask = np.array([c in known for c in y_str[te_idx]])
+            if te_mask.sum() == 0:
+                continue
+            y_te = fold_encoder.transform(y_str[te_idx][te_mask])
             model = xgb.XGBClassifier(
                 n_estimators=300,
                 max_depth=5,
                 n_jobs=-1,
                 eval_metric="mlogloss",
             )
+            model.fit(X.iloc[tr_idx], y_tr)
+            preds = model.predict(X.iloc[te_idx][te_mask])
+            score = f1_score(y_te, preds, average="macro")
             metric = "f1_macro"
+            n_test = int(te_mask.sum())
         else:
             model = xgb.XGBRegressor(
                 n_estimators=500,
                 tree_method="hist",
                 n_jobs=-1,
             )
+            model.fit(X.iloc[tr_idx], y_arr[tr_idx])
             preds = model.predict(X.iloc[te_idx])
+            score = mean_absolute_error(y_arr[te_idx], preds)
             metric = "mae"
+            n_test = int(len(te_idx))
         result.folds.append(FoldResult(
             target=target,
             metric_name=metric,
             value=float(score),
             n_train=int(len(tr_idx)),
+            n_test=n_test,
         ))
         importance_acc += model.feature_importances_
         fold_count += 1

tests/test_integration.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""Integration test: train_all + render_report end-to-end on synthetic data.
+Exercises the contiguous-class fix in the classification path, the GroupKFold split,
+and the markdown rendering — without needing real BacDive or NCBI data.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from microbe_model.eval import render_report
+from microbe_model.train.baseline import save_results, train_all
+def _synthetic_dataset(n: int = 300, seed: int = 0) -> tuple[pd.DataFrame, list[str]]:
+    rng = np.random.default_rng(seed)
+    feature_cols = [f"f{i}" for i in range(8)]
+    df = pd.DataFrame(rng.normal(size=(n, 8)), columns=feature_cols)
+    df["bacdive_id"] = np.arange(n)
+    df["genome_accession"] = [f"GCA_{i:09d}.1" for i in range(n)]
+    df["family"] = [f"family_{i % 12}" for i in range(n)]
+    df["genus"] = [f"genus_{i % 30}" for i in range(n)]
+    df["species"] = [f"species_{i}" for i in range(n)]
+    # Regression target with real signal in f0 + noise
+    df["optimal_temperature_c"] = 30 + 5 * df["f0"] + rng.normal(scale=2, size=n)
+    df["optimal_ph"] = 7.0 + 0.5 * df["f1"] + rng.normal(scale=0.1, size=n)
+    df["salt_tolerance_pct"] = np.abs(2 + df["f2"] + rng.normal(scale=0.5, size=n))
+    # Classification target — sometimes only some classes appear in a fold
+    classes = ["aerobe", "anaerobe", "facultative", "microaerophile", "obligate aerobe"]
+    df["oxygen_requirement"] = rng.choice(classes, size=n)
+    # Inject some NaNs to mirror real BacDive sparsity
+    nan_mask = rng.random(n) > 0.7
+    df.loc[nan_mask, "optimal_ph"] = np.nan
+    nan_mask = rng.random(n) > 0.5
+    df.loc[nan_mask, "salt_tolerance_pct"] = np.nan
+    df["group"] = df["family"]
+    return df, feature_cols
def test_train_all_handles_classification_with_missing_classes_per_fold(tmp_path: Path) -> None:
    """Train every target on a 200-row synthetic table and sanity-check the output.

    Checks that each of the four targets is present in the result mapping with
    a non-empty ``folds`` list, and that the temperature result's ``mean()``
    is below the mean absolute deviation of the target around its own mean.
    """
    frame, features = _synthetic_dataset(n=200)
    outcome = train_all(frame, features, group_col_override="group")

    # All four targets should produce at least one fold of results
    expected_targets = (
        "optimal_temperature_c",
        "optimal_ph",
        "oxygen_requirement",
        "salt_tolerance_pct",
    )
    for target in expected_targets:
        assert target in outcome
        assert outcome[target].folds, f"{target} produced no folds"

    # Regression should beat the always-mean baseline since f0 carries real signal.
    # NOTE(review): mean() is presumably the average fold MAE - confirm in TargetResult.
    temperature = frame["optimal_temperature_c"]
    absolute_deviation = np.abs(temperature - temperature.mean())
    baseline_mae = float(np.mean(absolute_deviation))
    model_score = outcome["optimal_temperature_c"].mean()
    assert model_score < baseline_mae, "model worse than always-mean baseline"
def test_render_report_writes_markdown(tmp_path: Path) -> None:
    """Run train -> save -> render on synthetic data and check the report text.

    Results are saved as JSON and the table as parquet under ``tmp_path``;
    ``render_report`` is then asked to write ``report.md``, whose title,
    section headings and target names are asserted.
    """
    frame, features = _synthetic_dataset(n=150)
    outcome = train_all(frame, features, group_col_override="group")

    results_file = tmp_path / "results.json"
    table_file = tmp_path / "table.parquet"
    report_file = tmp_path / "report.md"

    save_results(outcome, results_file)
    frame.to_parquet(table_file, index=False)
    render_report(results_file, table_file, report_file)

    report = report_file.read_text()
    assert report.startswith("# microbe-model")
    # Checked in the same order as the report is expected to mention them.
    required_fragments = (
        "## Per-target results",
        "optimal_temperature_c",
        "oxygen_requirement",
        "## Known limitations",
        "## Next steps",
    )
    for fragment in required_fragments:
        assert fragment in report
def test_save_results_roundtrip(tmp_path: Path) -> None:
    """Save training results to JSON and check the shape of what is read back.

    Every target in the in-memory results must appear as a top-level key of the
    loaded JSON, and each entry must carry ``task``, ``mean_metric`` and
    ``folds`` keys.
    """
    frame, features = _synthetic_dataset(n=100)
    outcome = train_all(frame, features, group_col_override="group")

    json_file = tmp_path / "results.json"
    save_results(outcome, json_file)
    payload = json.loads(json_file.read_text())

    required_keys = ("task", "mean_metric", "folds")
    for target in outcome:
        assert target in payload
        entry = payload[target]
        for key in required_keys:
            assert key in entry