Spaces:

miyuiu
/

microbe-model

Running

Miyu Horiuchi commited on Apr 26

Commit

95a1bfd

1 Parent(s): d278e18

Add feature↔target correlation analysis to eval report

src/microbe_model/explore.py:
- feature_target_correlations: Spearman ρ + p-value for each feature against
a regression target, sorted by |ρ|
- class_mean_features: per-class feature means for classification targets,
ranked by between-class / within-class variance ratio (F-stat-like)

Wired into the eval report as a "Feature ↔ target correlations (Spearman, top 10)"
section, immediately after per-fold detail. This is the biology sanity check —
the user can see at a glance that, e.g., ivywrel_frac correlates with
optimal_temperature_c (Zeldovich 2007 thermophile signature). If the expected
correlations don't show up at high |ρ|, something is wrong with feature
extraction.

scipy is already pulled in transitively by scikit-learn — no new deps.

Tests: 15/15 still passing.

Files changed (2) hide show

src/microbe_model/eval.py +27 -0
src/microbe_model/explore.py +88 -0

src/microbe_model/eval.py CHANGED Viewed

@@ -174,6 +174,33 @@ def render_report(
                 lines.append(f"- `{name}` — {importance:.4f}")
             lines.append("")
     # Section: per-phylum error breakdown (regression targets only)
     if predictions is not None and not predictions.empty and "row_idx" in predictions.columns:
         joined = predictions.merge(

                 lines.append(f"- `{name}` — {importance:.4f}")
             lines.append("")
+    # Section: feature-target correlations (data-exploration sanity check)
+    feature_cols = [
+        c for c in df.columns
+        if c.startswith(("aa_frac_", "genome_size", "gc_", "n_predicted", "coding_",
+                          "mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_"))
+    ]
+    if feature_cols:
+        from microbe_model.explore import feature_target_correlations
+        lines.append("## Feature ↔ target correlations (Spearman, top 10)")
+        lines.append("")
+        lines.append("Sanity-checks the biology — features known to track each target should "
+                     "appear here at high |ρ|. E.g. `ivywrel_frac` should correlate with "
+                     "`optimal_temperature_c` (Zeldovich 2007 thermophile signature).")
+        lines.append("")
+        for target in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"):
+            corrs = feature_target_correlations(df, feature_cols, target, top_n=10)
+            if not corrs:
+                continue
+            lines.append(f"### `{target}`")
+            lines.append("")
+            lines.append("| Feature | Spearman ρ | p-value |")
+            lines.append("|---|---|---|")
+            for row in corrs:
+                lines.append(f"| `{row['feature']}` | {row['spearman_rho']:+.3f} | "
+                             f"{row['p_value']:.1e} |")
+            lines.append("")
     # Section: per-phylum error breakdown (regression targets only)
     if predictions is not None and not predictions.empty and "row_idx" in predictions.columns:
         joined = predictions.merge(

src/microbe_model/explore.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""Lightweight data exploration helpers.
+Computes Pearson + Spearman correlations between features and regression targets,
+and class-mean differences for classification targets. Used by the eval renderer
+to surface biologically interpretable signals (e.g. "IVYWREL fraction correlates
+with optimal_temperature_c at r=0.71" — confirms the thermophile signature).
+"""
+from __future__ import annotations
+from typing import Any
+import numpy as np
+import pandas as pd
+from scipy.stats import spearmanr
+def feature_target_correlations(
+    df: pd.DataFrame,
+    feature_cols: list[str],
+    target: str,
+    *,
+    top_n: int = 15,
+) -> list[dict[str, Any]]:
+    """For a regression target, return top features by |Spearman correlation|."""
+    if target not in df.columns:
+        return []
+    sub = df[df[target].notna()]
+    if len(sub) < 30:
+        return []
+    rows = []
+    y = sub[target].to_numpy(dtype=float)
+    for col in feature_cols:
+        x = pd.to_numeric(sub[col], errors="coerce").to_numpy()
+        mask = ~np.isnan(x)
+        if mask.sum() < 30:
+            continue
+        try:
+            rho, p = spearmanr(x[mask], y[mask])
+        except Exception:  # noqa: BLE001
+            continue
+        if np.isnan(rho):
+            continue
+        rows.append({"feature": col, "spearman_rho": float(rho), "p_value": float(p)})
+    rows.sort(key=lambda r: abs(r["spearman_rho"]), reverse=True)
+    return rows[:top_n]
+def class_mean_features(
+    df: pd.DataFrame,
+    feature_cols: list[str],
+    target: str,
+    *,
+    top_features: int = 5,
+) -> list[dict[str, Any]]:
+    """For a classification target, return per-class means for the top-N most varying features.
+    Variance is measured across class means — features whose mean differs most across classes
+    are most informative.
+    """
+    if target not in df.columns:
+        return []
+    sub = df[df[target].notna()]
+    if sub[target].nunique() < 2:
+        return []
+    # Rank features by F-statistic-style metric: variance of class means / pooled variance
+    feature_scores = []
+    for col in feature_cols:
+        vals = pd.to_numeric(sub[col], errors="coerce")
+        if vals.notna().sum() < 30:
+            continue
+        between = sub.assign(_x=vals).groupby(target, dropna=False)["_x"].mean().var()
+        within = vals.var()
+        if between is None or within is None or within == 0 or pd.isna(between):
+            continue
+        feature_scores.append((col, float(between / within)))
+    feature_scores.sort(key=lambda kv: kv[1], reverse=True)
+    output: list[dict[str, Any]] = []
+    for col, score in feature_scores[:top_features]:
+        means_by_class = (
+            sub.assign(_x=pd.to_numeric(sub[col], errors="coerce"))
+               .groupby(target, dropna=False)["_x"]
+               .mean()
+               .to_dict()
+        )
+        output.append({"feature": col, "score": score, "means": means_by_class})
+    return output