Miyu Horiuchi committed on
Commit
95a1bfd
·
1 Parent(s): d278e18

Add feature↔target correlation analysis to eval report

Browse files

src/microbe_model/explore.py:
- feature_target_correlations: Spearman ρ + p-value for each feature against
a regression target, sorted by |ρ|
- class_mean_features: per-class feature means for classification targets,
ranked by between-class / within-class variance ratio (F-stat-like)

Wired into the eval report as a "Feature ↔ target correlations (Spearman, top 10)"
section, immediately after per-fold detail. This is the biology sanity check —
the user can see at a glance that, e.g., ivywrel_frac correlates with
optimal_temperature_c (Zeldovich 2007 thermophile signature). If the expected
correlations don't show up at high |ρ|, something is wrong with feature
extraction.

scipy is already pulled in transitively by scikit-learn — no new deps.

Tests: 15/15 still passing.

src/microbe_model/eval.py CHANGED
@@ -174,6 +174,33 @@ def render_report(
174
  lines.append(f"- `{name}` — {importance:.4f}")
175
  lines.append("")
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  # Section: per-phylum error breakdown (regression targets only)
178
  if predictions is not None and not predictions.empty and "row_idx" in predictions.columns:
179
  joined = predictions.merge(
 
174
  lines.append(f"- `{name}` — {importance:.4f}")
175
  lines.append("")
176
 
177
+ # Section: feature-target correlations (data-exploration sanity check)
178
+ feature_cols = [
179
+ c for c in df.columns
180
+ if c.startswith(("aa_frac_", "genome_size", "gc_", "n_predicted", "coding_",
181
+ "mean_", "aromatic_", "pos_", "neg_", "ivywrel_", "median_"))
182
+ ]
183
+ if feature_cols:
184
+ from microbe_model.explore import feature_target_correlations
185
+ lines.append("## Feature ↔ target correlations (Spearman, top 10)")
186
+ lines.append("")
187
+ lines.append("Sanity-checks the biology — features known to track each target should "
188
+ "appear here at high |ρ|. E.g. `ivywrel_frac` should correlate with "
189
+ "`optimal_temperature_c` (Zeldovich 2007 thermophile signature).")
190
+ lines.append("")
191
+ for target in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"):
192
+ corrs = feature_target_correlations(df, feature_cols, target, top_n=10)
193
+ if not corrs:
194
+ continue
195
+ lines.append(f"### `{target}`")
196
+ lines.append("")
197
+ lines.append("| Feature | Spearman ρ | p-value |")
198
+ lines.append("|---|---|---|")
199
+ for row in corrs:
200
+ lines.append(f"| `{row['feature']}` | {row['spearman_rho']:+.3f} | "
201
+ f"{row['p_value']:.1e} |")
202
+ lines.append("")
203
+
204
  # Section: per-phylum error breakdown (regression targets only)
205
  if predictions is not None and not predictions.empty and "row_idx" in predictions.columns:
206
  joined = predictions.merge(
src/microbe_model/explore.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lightweight data exploration helpers.
2
+
3
+ Computes Spearman correlations between features and regression targets,
4
+ and class-mean differences for classification targets. Used by the eval renderer
5
+ to surface biologically interpretable signals (e.g. "IVYWREL fraction correlates
6
+ with optimal_temperature_c at ρ=0.71" — confirms the thermophile signature).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ from scipy.stats import spearmanr
15
+
16
+
17
def feature_target_correlations(
    df: pd.DataFrame,
    feature_cols: list[str],
    target: str,
    *,
    top_n: int = 15,
    min_samples: int = 30,
) -> list[dict[str, Any]]:
    """Rank features by absolute Spearman correlation with a regression target.

    Args:
        df: Table holding both the feature columns and the target column.
        feature_cols: Names of candidate feature columns. Names that are not
            present in ``df`` are skipped.
        target: Name of the regression target column.
        top_n: Maximum number of rows to return; values below zero are
            treated as zero.
        min_samples: Minimum number of usable observations required, both for
            the target overall and for each individual feature.

    Returns:
        A list of ``{"feature", "spearman_rho", "p_value"}`` dicts sorted by
        descending ``|spearman_rho|``. The list is empty when the target is
        missing or has fewer than ``min_samples`` numeric values.
    """
    if target not in df.columns:
        return []

    # Coerce the target exactly like the features so a stray non-numeric
    # value is treated as missing instead of raising during conversion.
    y_all = pd.to_numeric(df[target], errors="coerce").to_numpy(
        dtype=float, na_value=np.nan
    )
    has_target = ~np.isnan(y_all)
    if has_target.sum() < min_samples:
        return []
    sub = df[has_target]
    y = y_all[has_target]

    rows: list[dict[str, Any]] = []
    for col in feature_cols:
        if col not in sub.columns:
            continue
        # dtype/na_value force a plain float array even for nullable dtypes,
        # which np.isnan cannot handle when they come back as object arrays.
        x = pd.to_numeric(sub[col], errors="coerce").to_numpy(
            dtype=float, na_value=np.nan
        )
        mask = ~np.isnan(x)
        if mask.sum() < min_samples:
            continue
        xv, yv = x[mask], y[mask]
        # Rank correlation is undefined for a constant input; skip up front
        # rather than calling spearmanr only to discard a NaN result.
        if xv.min() == xv.max() or yv.min() == yv.max():
            continue
        try:
            rho, p = spearmanr(xv, yv)
        except ValueError:
            # Best effort: one unusable feature must not abort the ranking.
            continue
        if np.isnan(rho):
            continue
        rows.append({"feature": col, "spearman_rho": float(rho), "p_value": float(p)})

    rows.sort(key=lambda r: abs(r["spearman_rho"]), reverse=True)
    return rows[: max(top_n, 0)]
46
+
47
+
48
def class_mean_features(
    df: pd.DataFrame,
    feature_cols: list[str],
    target: str,
    *,
    top_features: int = 5,
    min_samples: int = 30,
) -> list[dict[str, Any]]:
    """Return per-class feature means for the features that best separate classes.

    Each feature is scored with an F-statistic-style ratio: the variance of
    its class means divided by the pooled within-class variance. A feature
    with zero within-class variance but differing class means scores ``inf``.

    Args:
        df: Table holding both the feature columns and the target column.
        feature_cols: Names of candidate feature columns. Names that are not
            present in ``df`` are skipped.
        target: Name of the classification target column.
        top_features: Maximum number of features to return; values below zero
            are treated as zero.
        min_samples: Minimum number of numeric observations a feature needs
            to be scored.

    Returns:
        A list of ``{"feature", "score", "means"}`` dicts sorted by descending
        score, where ``means`` maps each class label to the feature mean in
        that class. The list is empty when the target is missing or has fewer
        than two distinct non-null values.
    """
    if target not in df.columns:
        return []
    sub = df[df[target].notna()]
    labels = sub[target]
    if labels.nunique() < 2:
        return []

    scored: list[tuple[str, float, dict[Any, Any]]] = []
    for col in feature_cols:
        if col not in sub.columns:
            continue
        # Normalise to a plain float Series so the statistics below behave
        # the same for int, bool and nullable columns.
        vals = pd.Series(
            pd.to_numeric(sub[col], errors="coerce").to_numpy(
                dtype=float, na_value=np.nan
            ),
            index=sub.index,
        )
        if vals.notna().sum() < min_samples:
            continue

        # Group the Series directly instead of assigning a scratch column,
        # which would overwrite a real column that happens to be named "_x".
        grouped = vals.groupby(labels, dropna=False)
        class_means = grouped.mean()
        between = class_means.var()
        if pd.isna(between):
            continue

        counts = grouped.count().to_numpy(dtype=float)
        variances = grouped.var().to_numpy(dtype=float)
        dof = counts.sum() - (counts > 0).sum()
        if dof <= 0:
            continue
        # Classes with fewer than two observations have NaN variance and
        # contribute nothing to the within-class sum of squares.
        within = float(np.nansum((counts - 1.0) * variances)) / float(dof)

        if within == 0:
            if between == 0:
                # Constant feature: carries no information about the classes.
                continue
            score = float("inf")
        else:
            score = float(between / within)
        scored.append((col, score, class_means.to_dict()))

    scored.sort(key=lambda item: item[1], reverse=True)
    return [
        {"feature": col, "score": score, "means": means}
        for col, score, means in scored[: max(top_features, 0)]
    ]