Spaces:

miyuiu
/

microbe-model

Running

Miyu Horiuchi Claude Opus 4.7 (1M context) committed on 27 days ago

Commit

4c18dfd

1 Parent(s): 9365561

Add unified strain catalog (100K rows w/ provenance) + selective weak supervision for pH

Three new artifacts:

scripts/21_build_strain_catalog.py
Emits data/strain_catalog.parquet — every BacDive strain (n=100,866) with each
phenotype as {value, source}. source ∈ {bacdive, mediadive_weak, unknown}.
Coverage: T_opt 50% (BacDive only), pH 29% (5,794 BacDive + 23,574 MediaDive),
oxygen 23%, salt 30% (4,242 BacDive + 26,055 MediaDive). 50K strains have an
explicitly 'unknown' temperature, the largest single bucket.

scripts/23_weak_label_apples_to_apples.py + artifacts/weak_label_test.log
Honest semi-supervised test: train on (curated + MediaDive-weak), evaluate on
held-out *curated* test rows only — does weak supervision help generalization
to the gold-standard distribution?
optimal_ph curated 0.5133 → curated+weak 0.4934 (-3.9%, HELPS)
salt_tolerance_pct curated 2.1060 → curated+weak 2.1859 (+3.8%, HURTS)
Matches the pre-experiment correlation probe: pH↔MediaDive corr 0.62 helps,
salt↔MediaDive corr 0.42 hurts.

scripts/15_train_phenotype_heads.py
Backfills pH labels from MediaDive (5,103 → 26,181) before training the deployed
pH quantile heads. Salt deliberately stays curated-only. Re-saved .ubj heads.

scripts/22_train_with_weak_labels.py kept for reproducibility (full-table weak
training; superseded by scripts/23 for the rigorous comparison).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (7) hide show

models/phenotype/optimal_ph_q10.ubj +2 -2
models/phenotype/optimal_ph_q50.ubj +2 -2
models/phenotype/optimal_ph_q90.ubj +2 -2
scripts/15_train_phenotype_heads.py +17 -0
scripts/21_build_strain_catalog.py +104 -0
scripts/22_train_with_weak_labels.py +74 -0
scripts/23_weak_label_apples_to_apples.py +128 -0

models/phenotype/optimal_ph_q10.ubj CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4ee3dc1e9911b70ea915af55dcfdf5cf73b815d6a454ae8279dbac2142c7b596
-size 827905

 version https://git-lfs.github.com/spec/v1
+oid sha256:16498ff43324f45392404520ef74f6e29f3f40572b57662247f4c03f46c2401d
+size 937045

models/phenotype/optimal_ph_q50.ubj CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:99652ebd8c1d5bbdaefd59d4724e15f6126cd56313dded2af8f1c32fc14ead66
-size 892163

 version https://git-lfs.github.com/spec/v1
+oid sha256:8e9682110e2d5ee8e1e8bd36c9dc48911566f946df2a2b8b656d658290e471bf
+size 1058561

models/phenotype/optimal_ph_q90.ubj CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:136173f080b8ea4b0bab00c23a340c0c5da4eb0dbbba9032701e12fc1fdd91cd
-size 833683

 version https://git-lfs.github.com/spec/v1
+oid sha256:c680f7a908fa9ffe63f63c376a734799f5df5fd6a37baf0de52900d2e95a3cbc
+size 904405

scripts/15_train_phenotype_heads.py CHANGED Viewed

@@ -38,6 +38,23 @@ def main() -> None:
     df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
     feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
     out_dir = config.ROOT / "models" / "phenotype"
     out_dir.mkdir(parents=True, exist_ok=True)
     (out_dir / "feature_cols.json").write_text(json.dumps(feature_cols))

     df = pheno.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
     feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
+    # Backfill pH with MediaDive-derived weak labels where BacDive has none.
+    # Apples-to-apples test (scripts/23) showed this nets -3.9% MAE on held-out
+    # curated test rows (corr 0.62 with optima). Salt did NOT pass the same test
+    # (+3.8% MAE, corr only 0.42), so we deliberately don't backfill salt.
+    catalog_path = config.DATA / "strain_catalog.parquet"
+    if catalog_path.exists():
+        catalog = pd.read_parquet(catalog_path)[["bacdive_id", "optimal_ph", "optimal_ph_source"]]
+        catalog["bacdive_id"] = catalog["bacdive_id"].astype(int)
+        catalog["optimal_ph"] = pd.to_numeric(catalog["optimal_ph"], errors="coerce")
+        df["bacdive_id"] = df["bacdive_id"].astype(int)
+        df = df.merge(catalog, on="bacdive_id", how="left", suffixes=("", "_cat"))
+        n_before = df["optimal_ph"].notna().sum()
+        ph_missing = df["optimal_ph"].isna() & df["optimal_ph_cat"].notna() & df["optimal_ph_source"].eq("mediadive_weak")
+        df.loc[ph_missing, "optimal_ph"] = df.loc[ph_missing, "optimal_ph_cat"]
+        n_after = df["optimal_ph"].notna().sum()
+        print(f"pH labels: {n_before:,} curated → {n_after:,} after MediaDive backfill (+{n_after - n_before:,})")
     out_dir = config.ROOT / "models" / "phenotype"
     out_dir.mkdir(parents=True, exist_ok=True)
     (out_dir / "feature_cols.json").write_text(json.dumps(feature_cols))

scripts/21_build_strain_catalog.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""Build a unified per-strain catalog with label provenance.
+Every BacDive strain in data/bacdive_phenotypes.parquet gets a row. For each of the
+4 phenotype targets, two columns are emitted:
+  - <target>          numeric/categorical value (or NaN if unknown)
+  - <target>_source   one of: 'bacdive' | 'mediadive_weak' | 'unknown'
+`bacdive` means the value came from BacDive's curated optimum / oxygen tolerance
+(high-quality). `mediadive_weak` means we derived it from the median pH/NaCl% of
+the DSMZ media the strain has been recorded as growing on (lower-quality fallback,
+only filled in when BacDive has no value). `unknown` means we have nothing.
+Saves to data/strain_catalog.parquet.
+"""
+from __future__ import annotations
+import pandas as pd
+from microbe_model import config
+def _is_missing(value: object, *, blank_str_is_missing: bool = False) -> bool:
+    """Return True when *value* carries no usable label.
+    Covers None, NaN of any float width (including numpy.float32), pd.NA and pd.NaT.
+    A string is never NaN; an empty string counts as missing only when
+    *blank_str_is_missing* is set.
+    """
+    if isinstance(value, str):
+        return blank_str_is_missing and not value
+    # is_scalar keeps list/array cells away from pd.isna, which is element-wise for them.
+    return bool(pd.api.types.is_scalar(value) and pd.isna(value))
+def main() -> None:
+    """Build data/strain_catalog.parquet and print per-target label coverage.
+    One row is written per row of data/bacdive_phenotypes.parquet. For every target a
+    value column and a `<target>_source` column are emitted; the source is 'bacdive'
+    when the phenotype table has a value, 'mediadive_weak' when only the MediaDive
+    median column has one, and 'unknown' otherwise.
+    """
+    # Drop any index stored in the parquet file: the value/source Series built below
+    # use a fresh RangeIndex and pd.DataFrame aligns its columns on index, so a
+    # non-default index here would silently misalign the passthrough columns.
+    pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet").reset_index(drop=True)
+    pheno["bacdive_id"] = pheno["bacdive_id"].astype(int)
+    # Pull MediaDive-derived signals (per-strain median across grown media)
+    md_path = config.DATA / "mediadive_features.parquet"
+    md = pd.read_parquet(md_path) if md_path.exists() else pd.DataFrame()
+    has_md = len(md) > 0
+    if has_md:
+        md["bacdive_id"] = md["bacdive_id"].astype(int)
+    # Identity / taxonomy columns are copied through unchanged when present.
+    out_cols: dict[str, pd.Series] = {}
+    for col in ("bacdive_id", "species", "genus", "family", "ncbi_taxon_id",
+                "genome_accession", "genome_source"):
+        if col in pheno.columns:
+            out_cols[col] = pheno[col]
+    # Direct BacDive labels (canonical). For each, fall back to MediaDive when missing.
+    targets = {
+        "optimal_temperature_c": None,        # no MediaDive proxy for temperature
+        "optimal_ph": "md_ph_median" if has_md else None,
+        "oxygen_requirement": None,           # no media-based proxy
+        "salt_tolerance_pct": "md_nacl_pct_median" if has_md else None,
+    }
+    md_indexed = md.set_index("bacdive_id") if has_md else None
+    strain_ids = pheno["bacdive_id"]
+    n_rows = len(pheno)
+    for target, md_col in targets.items():
+        # pheno[target] is already in pheno row order; reading it directly also keeps
+        # working when bacdive_id is not unique (a set_index/reindex round trip raises).
+        curated_vals = pheno[target].values if target in pheno.columns else [None] * n_rows
+        # MediaDive-derived fallback, looked up by strain id.
+        if md_col and md_indexed is not None and md_col in md_indexed.columns:
+            weak_vals = md_indexed[md_col].reindex(strain_ids).values
+        else:
+            weak_vals = [None] * n_rows
+        values: list[object] = []
+        sources: list[str] = []
+        for curated, weak in zip(curated_vals, weak_vals):
+            if not _is_missing(curated, blank_str_is_missing=True):
+                values.append(curated)
+                sources.append("bacdive")
+            elif not _is_missing(weak):
+                values.append(weak)
+                sources.append("mediadive_weak")
+            else:
+                values.append(None)
+                sources.append("unknown")
+        # object dtype lets one column layout hold numeric and categorical targets alike.
+        out_cols[target] = pd.Series(values, dtype="object")
+        out_cols[f"{target}_source"] = pd.Series(sources, dtype="string")
+    catalog = pd.DataFrame(out_cols)
+    out = config.DATA / "strain_catalog.parquet"
+    catalog.to_parquet(out, index=False)
+    print(f"wrote {len(catalog):,} strains to {out}\n")
+    # Summary per target
+    n_total = len(catalog)
+    for target in targets:
+        src_col = f"{target}_source"
+        counts = catalog[src_col].value_counts().to_dict()
+        n_known = counts.get("bacdive", 0) + counts.get("mediadive_weak", 0)
+        # Avoid ZeroDivisionError when the phenotype table is empty.
+        pct_known = 100 * n_known / n_total if n_total else 0.0
+        print(f"{target}")
+        print(f"  bacdive (curated):   {counts.get('bacdive', 0):>7,}")
+        print(f"  mediadive_weak:      {counts.get('mediadive_weak', 0):>7,}")
+        print(f"  unknown:             {counts.get('unknown', 0):>7,}")
+        print(f"  ─ any-known:         {n_known:>7,} ({pct_known:.0f}%)")
+        print()
+if __name__ == "__main__":
+    main()

scripts/22_train_with_weak_labels.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""Train baseline using BacDive-first + MediaDive-weak fallback labels for pH/salt.
+Compares vs the curated-only baseline (artifacts/baseline_results.json) to see whether
+the weak labels are net-helpful for the model. T_opt and oxygen are unaffected (no
+weak source). pH and salt get many more training rows but with noisier labels.
+Output: artifacts/baseline_results_weak.json
+"""
+from __future__ import annotations
+import time
+import pandas as pd
+from microbe_model import config
+from microbe_model.train.baseline import save_results, train_all
+# Reuse derive_group and encode_isolation_categories from scripts/03. That file's name
+# starts with a digit, so it cannot be imported with a plain `import` statement; it is
+# loaded by path here instead of modifying sys.path. Note that exec_module runs the
+# script's top-level code.
+import importlib.util
+spec = importlib.util.spec_from_file_location("train03", config.ROOT / "scripts" / "03_train_baseline.py")
+train03 = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(train03)
+def main() -> None:
+    """Train every target on the catalog labels and save the results.
+    Joins the strain catalog with the isolation-category columns from the phenotype
+    table and with the feature table, derives the CV group per row, coerces the numeric
+    targets, then hands the table to train_all and writes
+    artifacts/baseline_results_weak.json.
+    """
+    t0 = time.time()
+    data_dir = config.DATA
+    catalog = pd.read_parquet(data_dir / "strain_catalog.parquet")
+    feats = pd.read_parquet(data_dir / "features.parquet")
+    pheno = pd.read_parquet(data_dir / "bacdive_phenotypes.parquet")
+    # Isolation categories only exist in the phenotype table; carry them over by id.
+    iso_source_cols = [name for name in pheno.columns if name.startswith("isolation_cat")]
+    iso_table = pheno[["bacdive_id"] + iso_source_cols].copy()
+    # Normalise the join key to int on every table taking part in a merge.
+    for frame in (iso_table, catalog, feats):
+        frame["bacdive_id"] = frame["bacdive_id"].astype(int)
+    catalog = catalog.merge(iso_table, on="bacdive_id", how="left")
+    df = catalog.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
+    df["group"] = df.apply(train03.derive_group, axis=1)
+    # The catalog stores target values as object dtype; make the numeric ones numeric.
+    numeric_targets = ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct")
+    for name in numeric_targets:
+        if name not in df.columns:
+            continue
+        df[name] = pd.to_numeric(df[name], errors="coerce")
+    df, iso_cols = train03.encode_isolation_categories(df)
+    print(f"Encoded {len(iso_cols)} isolation features")
+    # IMPORTANT: MediaDive features stay out of the inputs. The weak pH and salt labels
+    # are derived from those same columns (per-strain median across DSMZ media), so
+    # feeding them in would leak the target. The question being asked is whether the
+    # weak labels help a genome+isolation model generalize.
+    id_cols = {"bacdive_id", "genome_accession"}
+    feature_cols = [name for name in feats.columns if name not in id_cols] + iso_cols
+    print(f"\nTraining table: {len(df):,} strains × {len(feature_cols)} features")
+    all_targets = ("optimal_temperature_c", "optimal_ph", "oxygen_requirement", "salt_tolerance_pct")
+    for tgt in all_targets:
+        if tgt in df.columns:
+            n = df[tgt].notna().sum()
+        else:
+            n = 0
+        print(f"  {tgt:25s} labeled={n:>6,}")
+    print()
+    results = train_all(df, feature_cols, group_col_override="group")
+    out = config.ARTIFACTS / "baseline_results_weak.json"
+    save_results(results, out, predictions_path=None, feature_cols=feature_cols)
+    print(f"\nResults summary ({time.time() - t0:.1f}s):\n")
+    for target, r in results.items():
+        if not r.folds:
+            continue
+        print(f"  {target:25s} {r.folds[0].metric_name:10s} = {r.mean():.4f}")
+if __name__ == "__main__":
+    main()

scripts/23_weak_label_apples_to_apples.py ADDED Viewed

	@@ -0,0 +1,128 @@

+"""Honest test: do MediaDive-derived weak labels help generalize to BacDive-curated optima?
+For pH and salt, two training regimes — same held-out test rows (curated only):
+  A. CURATED-ONLY: train on BacDive curated labels.
+  B. CURATED + WEAK: train on BacDive curated + MediaDive-derived weak labels.
+In both, the test set per fold is the *intersection* of the held-out group with the
+curated subset. This isolates whether weak labels help the model do better on the
+gold-standard distribution, rather than just helping it predict the medium pH/salt
+of the strain itself (which would be circular).
+No MediaDive features are used (deployed model parity).
+"""
+from __future__ import annotations
+import importlib.util
+import sys
+import time
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.metrics import mean_absolute_error
+from sklearn.model_selection import GroupKFold
+from microbe_model import config
+# Repository root: two directory levels above this file.
+ROOT = Path(__file__).resolve().parent.parent
+# Put scripts/ at the front of the module search path.
+# NOTE(review): presumably so 03_train_baseline.py can resolve sibling imports — confirm.
+sys.path.insert(0, str(ROOT / "scripts"))
+# The scripts/03 file name starts with a digit, so it is loaded by path rather than with
+# an `import` statement; main() uses its derive_group and encode_isolation_categories.
+spec = importlib.util.spec_from_file_location("train03", ROOT / "scripts" / "03_train_baseline.py")
+train03 = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(train03)
+def cv_mae(
+    df: pd.DataFrame,
+    feature_cols: list[str],
+    target: str,
+    *,
+    train_mask: pd.Series,
+    test_mask: pd.Series,
+    n_splits: int = 5,
+    min_train_rows: int = 100,
+    min_test_rows: int = 50,
+) -> tuple[float, int]:
+    """Cross-validated MAE of an XGBoost regressor under separate train/test row masks.
+    Folds come from GroupKFold over the `group` column (missing groups are pooled as
+    "__unknown__"), computed on every row whose *target* is numeric. Because the folds
+    depend only on *df* and *target*, two calls with different masks share the same
+    fold boundaries. Within each fold the model is fit on the training side restricted
+    to *train_mask* and scored on the held-out side restricted to *test_mask*.
+    Args:
+        df: table holding *feature_cols*, *target* and a `group` column.
+        feature_cols: model input columns.
+        target: name of the numeric label column.
+        train_mask: boolean Series indexed like *df*; rows allowed for fitting.
+        test_mask: boolean Series indexed like *df*; rows allowed for scoring.
+        n_splits: upper bound on the number of folds (capped by the group count).
+        min_train_rows: folds with fewer masked training rows are skipped.
+        min_test_rows: folds with fewer masked test rows are skipped.
+    Returns:
+        (mean MAE over the folds that ran, total number of scored rows). The MAE is
+        NaN when no fold ran. Skipped folds emit a RuntimeWarning, since two regimes
+        that skip different folds are no longer averaged over the same test rows.
+    """
+    import warnings
+    eligible = df[df[target].notna()].copy()
+    eligible[target] = pd.to_numeric(eligible[target], errors="coerce")
+    eligible = eligible[eligible[target].notna()]
+    groups = eligible["group"].fillna("__unknown__")
+    n_groups = groups.nunique()
+    if n_groups < 2:
+        # GroupKFold cannot split fewer than two groups; report "no result" the same
+        # way as when every fold is skipped instead of raising from inside sklearn.
+        warnings.warn(
+            f"cv_mae({target!r}): {n_groups} group(s) with labels, cannot cross-validate",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        return float("nan"), 0
+    kf = GroupKFold(n_splits=min(n_splits, n_groups))
+    maes: list[float] = []
+    n_eval_total = 0
+    for fold, (tr_idx, te_idx) in enumerate(kf.split(eligible, eligible[target], groups), start=1):
+        tr = eligible.iloc[tr_idx]
+        te = eligible.iloc[te_idx]
+        # Restrict each side to its mask; rows absent from a mask count as excluded.
+        tr = tr[train_mask.reindex(tr.index, fill_value=False).values]
+        te = te[test_mask.reindex(te.index, fill_value=False).values]
+        if len(tr) < min_train_rows or len(te) < min_test_rows:
+            warnings.warn(
+                f"cv_mae({target!r}): skipping fold {fold} "
+                f"(train rows={len(tr)}, test rows={len(te)})",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            continue
+        m = xgb.XGBRegressor(
+            n_estimators=400, max_depth=5, learning_rate=0.05,
+            tree_method="hist", n_jobs=-1,
+        )
+        m.fit(tr[feature_cols], tr[target].astype(float))
+        preds = m.predict(te[feature_cols])
+        maes.append(mean_absolute_error(te[target].astype(float), preds))
+        n_eval_total += len(te)
+    return (float(np.mean(maes)) if maes else float("nan")), n_eval_total
+def main() -> None:
+    """Compare curated-only vs curated+weak training for pH and salt.
+    Builds the training table from the strain catalog, the isolation categories of the
+    phenotype table and the feature table, then for each target runs cv_mae twice with
+    the same curated-only test mask: once training on curated rows, once on curated
+    plus MediaDive-weak rows. Prints both MAEs, the relative change and a verdict.
+    """
+    t0 = time.time()
+    catalog = pd.read_parquet(config.DATA / "strain_catalog.parquet")
+    feats = pd.read_parquet(config.DATA / "features.parquet")
+    pheno = pd.read_parquet(config.DATA / "bacdive_phenotypes.parquet")
+    # Isolation categories only exist in the phenotype table; carry them over by id.
+    iso_in = [c for c in pheno.columns if c.startswith("isolation_cat")]
+    pheno_iso = pheno[["bacdive_id"] + iso_in].copy()
+    pheno_iso["bacdive_id"] = pheno_iso["bacdive_id"].astype(int)
+    catalog["bacdive_id"] = catalog["bacdive_id"].astype(int)
+    feats["bacdive_id"] = feats["bacdive_id"].astype(int)
+    catalog = catalog.merge(pheno_iso, on="bacdive_id", how="left")
+    df = catalog.merge(feats, on=["bacdive_id", "genome_accession"], how="inner")
+    df["group"] = df.apply(train03.derive_group, axis=1)
+    # Numeric coercion
+    for col in ("optimal_temperature_c", "optimal_ph", "salt_tolerance_pct"):
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+    df, iso_cols = train03.encode_isolation_categories(df)
+    feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
+    feature_cols = feature_cols + iso_cols
+    print(f"\nTraining table: {len(df):,} strains × {len(feature_cols)} features")
+    print("Held-out test rows are always BacDive-curated only.\n")
+    for target in ("optimal_ph", "salt_tolerance_pct"):
+        src_col = f"{target}_source"
+        curated = (df[src_col] == "bacdive")
+        weak = (df[src_col] == "mediadive_weak")
+        print(f"=== {target} ===")
+        print(f"  curated rows:        {curated.sum():,}")
+        print(f"  weak rows:           {weak.sum():,}")
+        # A) CURATED-ONLY training
+        mae_a, n_a = cv_mae(df, feature_cols, target,
+                            train_mask=curated, test_mask=curated)
+        # B) CURATED + WEAK training
+        mae_b, n_b = cv_mae(df, feature_cols, target,
+                            train_mask=(curated | weak), test_mask=curated)
+        # cv_mae returns NaN when no fold ran. NaN fails every comparison, so without
+        # this guard a missing result would be reported as "WASH".
+        comparable = not (np.isnan(mae_a) or np.isnan(mae_b))
+        if not comparable:
+            verdict = "N/A"
+        elif mae_b < mae_a - 0.001:
+            verdict = "HELPS"
+        elif mae_b > mae_a + 0.001:
+            verdict = "HURTS"
+        else:
+            verdict = "WASH"
+        # A zero baseline MAE has no meaningful relative change (and would divide by zero).
+        if comparable and mae_a > 0:
+            delta_pct = 100 * (mae_b - mae_a) / mae_a
+        else:
+            delta_pct = float("nan")
+        print(f"  A. curated-only  MAE = {mae_a:.4f}  (eval n={n_a:,})")
+        print(f"  B. curated+weak  MAE = {mae_b:.4f}  (eval n={n_b:,})")
+        if n_a != n_b:
+            # Both runs use the same folds and test mask, so differing counts mean a
+            # fold was skipped in only one regime.
+            print("  (note: eval n differs between A and B — MAEs are not over identical test rows)")
+        print(f"  → Δ = {delta_pct:+.1f}%   [{verdict}]\n")
+    print(f"({time.time() - t0:.1f}s total)")
+if __name__ == "__main__":
+    main()