microbe-model / tests /test_integration.py
Miyu Horiuchi
Fix predictions parquet type mix + plumb feature_cols through eval
bbbea9d
"""Integration test: train_all + render_report end-to-end on synthetic data.
Exercises the contiguous-class fix in the classification path, the GroupKFold split,
and the markdown rendering — without needing real BacDive or NCBI data.
"""
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import pandas as pd
from microbe_model.eval import render_report
from microbe_model.train.baseline import save_results, train_all
def _synthetic_dataset(n: int = 300, seed: int = 0) -> tuple[pd.DataFrame, list[str]]:
rng = np.random.default_rng(seed)
feature_cols = [f"f{i}" for i in range(8)]
df = pd.DataFrame(rng.normal(size=(n, 8)), columns=feature_cols)
df["bacdive_id"] = np.arange(n)
df["genome_accession"] = [f"GCA_{i:09d}.1" for i in range(n)]
df["family"] = [f"family_{i % 12}" for i in range(n)]
df["genus"] = [f"genus_{i % 30}" for i in range(n)]
df["species"] = [f"species_{i}" for i in range(n)]
# Regression target with real signal in f0 + noise
df["optimal_temperature_c"] = 30 + 5 * df["f0"] + rng.normal(scale=2, size=n)
df["optimal_ph"] = 7.0 + 0.5 * df["f1"] + rng.normal(scale=0.1, size=n)
df["salt_tolerance_pct"] = np.abs(2 + df["f2"] + rng.normal(scale=0.5, size=n))
# Classification target — sometimes only some classes appear in a fold
classes = ["aerobe", "anaerobe", "facultative", "microaerophile", "obligate aerobe"]
df["oxygen_requirement"] = rng.choice(classes, size=n)
# Inject some NaNs to mirror real BacDive sparsity
nan_mask = rng.random(n) > 0.7
df.loc[nan_mask, "optimal_ph"] = np.nan
nan_mask = rng.random(n) > 0.5
df.loc[nan_mask, "salt_tolerance_pct"] = np.nan
df["group"] = df["family"]
return df, feature_cols
def test_train_all_handles_classification_with_missing_classes_per_fold(tmp_path: Path) -> None:
df, feature_cols = _synthetic_dataset(n=200)
results = train_all(df, feature_cols, group_col_override="group")
# All four targets should produce at least one fold of results
for target in ("optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct"):
assert target in results
assert results[target].folds, f"{target} produced no folds"
# Regression should beat the always-mean baseline since f0 carries real signal
temp_result = results["optimal_temperature_c"]
baseline_mae = float(np.mean(np.abs(
df["optimal_temperature_c"] - df["optimal_temperature_c"].mean()
)))
assert temp_result.mean() < baseline_mae, "model worse than always-mean baseline"
def test_render_report_writes_markdown(tmp_path: Path) -> None:
df, feature_cols = _synthetic_dataset(n=150)
results = train_all(df, feature_cols, group_col_override="group")
results_path = tmp_path / "results.json"
save_results(results, results_path)
table_path = tmp_path / "table.parquet"
df.to_parquet(table_path, index=False)
out_path = tmp_path / "report.md"
render_report(results_path, table_path, out_path)
text = out_path.read_text()
assert text.startswith("# microbe-model")
assert "## Per-target results" in text
assert "optimal_temperature_c" in text
assert "oxygen_requirement" in text
assert "## Known limitations" in text
assert "## Next steps" in text
def test_save_results_roundtrip(tmp_path: Path) -> None:
df, feature_cols = _synthetic_dataset(n=100)
results = train_all(df, feature_cols, group_col_override="group")
path = tmp_path / "results.json"
save_results(results, path)
loaded = json.loads(path.read_text())
for target in results:
assert target in loaded
assert "task" in loaded[target]
assert "mean_metric" in loaded[target]
assert "folds" in loaded[target]
def test_save_results_writes_predictions_parquet(tmp_path: Path) -> None:
df, feature_cols = _synthetic_dataset(n=200)
results = train_all(df, feature_cols, group_col_override="group")
results_path = tmp_path / "results.json"
pred_path = tmp_path / "predictions.parquet"
save_results(results, results_path, predictions_path=pred_path)
assert pred_path.exists()
preds = pd.read_parquet(pred_path)
# Should have rows for both regression and classification targets
assert "target" in preds.columns
assert "task" in preds.columns
assert "row_idx" in preds.columns
assert "predicted" in preds.columns
assert "observed" in preds.columns
assert preds["task"].isin({"regression", "classification"}).all()
# row_idx should map back to the source df
assert preds["row_idx"].max() < len(df)
def test_full_chain_render_with_predictions(tmp_path: Path) -> None:
"""Full chain: train → save with predictions → render report → check per-family section."""
df, feature_cols = _synthetic_dataset(n=200)
results = train_all(df, feature_cols, group_col_override="group")
results_path = tmp_path / "results.json"
pred_path = tmp_path / "predictions.parquet"
save_results(results, results_path, predictions_path=pred_path)
table_path = tmp_path / "table.parquet"
df.to_parquet(table_path, index=False)
out_path = tmp_path / "report.md"
render_report(
results_path, table_path, out_path,
predictions_path=pred_path,
feature_cols=feature_cols,
)
text = out_path.read_text()
assert "## Per-family error breakdown" in text
assert "## Feature ↔ target correlations" in text
assert "## TL;DR" in text