Spaces:
Running
Add feature↔target correlation analysis to eval report
Browse files
src/microbe_model/explore.py:
- feature_target_correlations: Spearman ρ + p-value for each feature against
a regression target, sorted by |ρ|
- class_mean_features: per-class feature means for classification targets,
ranked by the variance of the class means relative to the feature's overall variance (F-stat-like)
Wired into the eval report as a "Feature ↔ target correlations (Spearman, top 10)"
section, immediately after per-fold detail. This is the biology sanity check —
the user can see at a glance that, e.g., ivywrel_frac correlates with
optimal_temperature_c (Zeldovich 2007 thermophile signature). If the expected
correlations don't show up at high |ρ|, something is wrong with feature
extraction.
scipy is already pulled in transitively by scikit-learn — no new deps.
Tests: 15/15 still passing.
- src/microbe_model/eval.py +27 -0
- src/microbe_model/explore.py +88 -0
|
@@ -174,6 +174,33 @@ def render_report(
|
|
| 174 |
lines.append(f"- `{name}` — {importance:.4f}")
|
| 175 |
lines.append("")
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
# Section: per-phylum error breakdown (regression targets only)
|
| 178 |
if predictions is not None and not predictions.empty and "row_idx" in predictions.columns:
|
| 179 |
joined = predictions.merge(
|
|
|
|
| 174 |
lines.append(f"- `{name}` — {importance:.4f}")
|
| 175 |
lines.append("")
|
| 176 |
|
| 177 |
+
# Section: feature-target correlations (data-exploration sanity check)
|
| 178 |
+
feature_cols = [
|
| 179 |
+
c for c in df.columns
|
| 180 |
+
if c.startswith(("aa_frac_", "genome_size", "gc_", "n_predicted", "coding_",
|
| 181 |
+
"mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_"))
|
| 182 |
+
]
|
| 183 |
+
if feature_cols:
|
| 184 |
+
from microbe_model.explore import feature_target_correlations
|
| 185 |
+
lines.append("## Feature ↔ target correlations (Spearman, top 10)")
|
| 186 |
+
lines.append("")
|
| 187 |
+
lines.append("Sanity-checks the biology — features known to track each target should "
|
| 188 |
+
"appear here at high |ρ|. E.g. `ivywrel_frac` should correlate with "
|
| 189 |
+
"`optimal_temperature_c` (Zeldovich 2007 thermophile signature).")
|
| 190 |
+
lines.append("")
|
| 191 |
+
for target in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"):
|
| 192 |
+
corrs = feature_target_correlations(df, feature_cols, target, top_n=10)
|
| 193 |
+
if not corrs:
|
| 194 |
+
continue
|
| 195 |
+
lines.append(f"### `{target}`")
|
| 196 |
+
lines.append("")
|
| 197 |
+
lines.append("| Feature | Spearman ρ | p-value |")
|
| 198 |
+
lines.append("|---|---|---|")
|
| 199 |
+
for row in corrs:
|
| 200 |
+
lines.append(f"| `{row['feature']}` | {row['spearman_rho']:+.3f} | "
|
| 201 |
+
f"{row['p_value']:.1e} |")
|
| 202 |
+
lines.append("")
|
| 203 |
+
|
| 204 |
# Section: per-phylum error breakdown (regression targets only)
|
| 205 |
if predictions is not None and not predictions.empty and "row_idx" in predictions.columns:
|
| 206 |
joined = predictions.merge(
|
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lightweight data exploration helpers.
|
| 2 |
+
|
| 3 |
+
Computes Spearman rank correlations between features and regression targets,
|
| 4 |
+
and class-mean differences for classification targets. Used by the eval renderer
|
| 5 |
+
to surface biologically interpretable signals (e.g. "IVYWREL fraction correlates
|
| 6 |
+
with optimal_temperature_c at ρ=0.71" — confirms the thermophile signature).
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from scipy.stats import spearmanr
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def feature_target_correlations(
    df: pd.DataFrame,
    feature_cols: list[str],
    target: str,
    *,
    top_n: int = 15,
    min_samples: int = 30,
) -> list[dict[str, Any]]:
    """Rank features by the strength of their Spearman correlation with a target.

    Args:
        df: Table holding both the feature columns and the target column.
        feature_cols: Candidate feature column names. Names that are not
            present in ``df`` are skipped rather than raising ``KeyError``.
        target: Name of the regression target column.
        top_n: Maximum number of rows to return.
        min_samples: Minimum number of usable observations required, both for
            the target overall and for each individual feature.

    Returns:
        Up to ``top_n`` dicts with keys ``feature``, ``spearman_rho`` and
        ``p_value``, sorted by ``|spearman_rho|`` descending. Returns an empty
        list when the target column is missing or has too few non-null rows.
    """
    if target not in df.columns:
        return []
    sub = df[df[target].notna()]
    if len(sub) < min_samples:
        return []

    y = sub[target].to_numpy(dtype=float)
    rows: list[dict[str, Any]] = []
    for col in feature_cols:
        if col not in sub.columns:
            continue
        # Force a float array with NaN for missing values. A plain
        # ``to_numpy()`` on a nullable-dtype column yields an object array
        # containing ``pd.NA``, on which ``np.isnan`` raises ``TypeError``.
        x = pd.to_numeric(sub[col], errors="coerce").to_numpy(
            dtype=float, na_value=np.nan
        )
        mask = ~np.isnan(x)
        n_valid = int(mask.sum())
        if n_valid < min_samples or n_valid < 3:
            continue
        xv = x[mask]
        yv = y[mask]
        # Spearman is undefined for a constant input; skipping here avoids a
        # SciPy warning followed by a NaN result.
        if xv.min() == xv.max() or yv.min() == yv.max():
            continue
        try:
            rho, p = spearmanr(xv, yv)
        except (ValueError, FloatingPointError):
            # Best effort: one un-rankable feature should not abort the report.
            continue
        if np.isnan(rho):
            continue
        rows.append(
            {"feature": col, "spearman_rho": float(rho), "p_value": float(p)}
        )

    rows.sort(key=lambda r: abs(r["spearman_rho"]), reverse=True)
    return rows[:top_n]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def class_mean_features(
    df: pd.DataFrame,
    feature_cols: list[str],
    target: str,
    *,
    top_features: int = 5,
    min_samples: int = 30,
) -> list[dict[str, Any]]:
    """Return per-class means for the features that best separate the classes.

    Each feature is scored as the sample variance of its per-class means
    divided by the feature's overall sample variance. This is an
    F-statistic-like ratio, but note that the denominator is the *overall*
    variance of the feature, not a pooled within-class variance.

    Args:
        df: Table holding both the feature columns and the target column.
        feature_cols: Candidate feature column names. Names that are not
            present in ``df`` are skipped rather than raising ``KeyError``.
        target: Name of the classification target column.
        top_features: Maximum number of features to return.
        min_samples: Minimum number of non-null numeric values a feature needs
            in order to be scored.

    Returns:
        Up to ``top_features`` dicts with keys ``feature``, ``score`` and
        ``means`` (a mapping of class label to mean feature value), sorted by
        ``score`` descending. Returns an empty list when the target column is
        missing or has fewer than two distinct non-null classes.
    """
    if target not in df.columns:
        return []
    sub = df[df[target].notna()]
    labels = sub[target]
    if labels.nunique() < 2:
        return []

    scored: list[tuple[str, float, dict[Any, Any]]] = []
    for col in feature_cols:
        if col not in sub.columns:
            continue
        vals = pd.to_numeric(sub[col], errors="coerce")
        if vals.notna().sum() < min_samples:
            continue
        # Group the single column by the label Series instead of copying the
        # whole frame with ``assign``; the means are kept so they do not have
        # to be recomputed for the features that make the cut.
        class_means = vals.groupby(labels).mean()
        between = class_means.var()
        overall = vals.var()
        # Check for missing values first: comparing ``pd.NA`` with 0 is ambiguous.
        if pd.isna(between) or pd.isna(overall) or overall == 0:
            continue
        scored.append((col, float(between / overall), class_means.to_dict()))

    scored.sort(key=lambda item: item[1], reverse=True)
    return [
        {"feature": col, "score": score, "means": means}
        for col, score, means in scored[:top_features]
    ]
|