Spaces:

miyuiu
/

microbe-model

Running

Miyu Horiuchi Claude Opus 4.7 (1M context) committed 27 days ago

Commit

5df9ef8

1 Parent(s): 56b0c4e

Add MediaDive-derived features (medium pH, NaCl, n_media) — all 4 targets improve

For each strain in data/strain_media.parquet, compute median pH and NaCl% across
the DSMZ media it has been grown on, plus a count of media. These are model inputs
(features), NOT labels — the previous probe showed BacDive↔MediaDive label
correlation is only 0.42 for salt, so using MediaDive as a label source would
corrupt the now-clean salt MAE. As features, the model learns the right weighting.

5 new features per strain:
md_n_media count of media the strain grows on
md_ph_median median midpoint(min_pH, max_pH)
md_ph_range spread (max - min) of medium pH
md_nacl_pct_median median NaCl % w/v across recipes
md_nacl_pct_max highest tolerated NaCl

Coverage: 28,704 strains (62% of the 46K-strain training table) have MediaDive data.

Cumulative metrics vs original v0 genome-only baseline:
optimal_temperature_c MAE 3.28 → 2.86 (-12.9%)
optimal_ph MAE 0.52 → 0.48 ( -7.7%)
oxygen_requirement F1 0.279 → 0.357 (+28.2%)
salt_tolerance_pct MAE 2.51 → 2.11 (-15.9%)

This step alone:
T_opt 2.94 → 2.86
pH 0.51 → 0.48
Oxygen 0.341 → 0.357
Salt 2.17 → 2.11

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (4) hide show

artifacts/baseline_results.json +110 -105
artifacts/eval_report.md +120 -120
scripts/03_train_baseline.py +13 -1
scripts/20_build_mediadive_features.py +94 -0

artifacts/baseline_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
   "optimal_temperature_c": {
     "task": "regression",
-    "mean_metric": 2.939444159350111,
     "folds": [
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.103597222252415,
         "n_train": 36496,
         "n_test": 9125
       },
@@ -15,7 +15,7 @@
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.7356862682357583,
         "n_train": 36497,
         "n_test": 9124
       },
@@ -23,7 +23,7 @@
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.145843773419164,
         "n_train": 36497,
         "n_test": 9124
       },
@@ -31,7 +31,7 @@
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 3.2767152481045656,
         "n_train": 36497,
         "n_test": 9124
       },
@@ -39,43 +39,43 @@
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.43537828473865,
         "n_train": 36497,
         "n_test": 9124
       }
     ],
     "top_features": {
-      "ivywrel_frac": 0.12668818831443787,
-      "iso_cat2_thermophilic_gt45_c": 0.029868930205702783,
-      "n_predicted_cds": 0.025075340643525124,
-      "iso_cat2_human": 0.020858772844076157,
-      "iso_cat1_infection": 0.020640516839921474,
-      "iso_cat2_patient": 0.017751351464539766,
-      "aa_frac_C": 0.015003016591072083,
-      "genome_size_nt": 0.012203263118863106,
-      "aa_frac_D": 0.011290411837399006,
-      "codon_AGG": 0.010900856088846922,
-      "iso_cat1_environmental": 0.010176281817257405,
-      "tetra_GCCT": 0.009658925677649676,
-      "tetra_TAGT": 0.00883282758295536,
-      "aa_frac_Y": 0.008421392692252994,
-      "aa_frac_E": 0.007741594593971968,
-      "tetra_TTCC": 0.007376640872098506,
-      "mean_isoelectric_point": 0.007058459660038352,
-      "tetra_CTAA": 0.0070426638238132,
-      "iso_cat2_built_environment": 0.006164434866514057,
-      "iso_cat2_industrial": 0.005895084328949451
     }
   },
   "optimal_ph": {
     "task": "regression",
-    "mean_metric": 0.5090253015368336,
     "folds": [
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.45639293885487886,
         "n_train": 4082,
         "n_test": 1021
       },
@@ -83,7 +83,7 @@
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.6262803867911733,
         "n_train": 4082,
         "n_test": 1021
       },
@@ -91,7 +91,7 @@
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.528334212326513,
         "n_train": 4082,
         "n_test": 1021
       },
@@ -99,7 +99,7 @@
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.48048674237494376,
         "n_train": 4083,
         "n_test": 1020
       },
@@ -107,43 +107,43 @@
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
-        "value": 0.4536322273366591,
         "n_train": 4083,
         "n_test": 1020
       }
     ],
     "top_features": {
-      "iso_cat2_acidic": 0.05219607315957546,
-      "iso_cat2_alkaline": 0.043521419167518616,
-      "neg_charged_frac": 0.016875072754919528,
-      "aa_frac_E": 0.008599728252738715,
-      "tetra_CTCT": 0.008368687890470027,
-      "aa_frac_H": 0.008003219496458769,
-      "mean_isoelectric_point": 0.007599354162812233,
-      "tetra_CACT": 0.007427609874866903,
-      "tetra_AGAC": 0.007137532206252217,
-      "tetra_AGGT": 0.005891842069104314,
-      "tetra_GACT": 0.005873983446508646,
-      "tetra_GAGA": 0.005548427533358336,
-      "tetra_GTCT": 0.005475769587792456,
-      "codon_GAA": 0.005408304557204246,
-      "n_predicted_cds": 0.005280579440295696,
-      "iso_cat2_plants": 0.005045945569872856,
-      "tetra_TTGA": 0.004973787232302129,
-      "codon_AAG": 0.0048154488438740374,
-      "tetra_ACGA": 0.004731484339572489,
-      "aa_frac_Y": 0.0046834095381200315
     }
   },
   "oxygen_requirement": {
     "task": "classification",
-    "mean_metric": 0.34127360853732613,
     "folds": [
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.31515576471296236,
         "n_train": 17311,
         "n_test": 4328
       },
@@ -151,7 +151,7 @@
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.38181774862206597,
         "n_train": 17311,
         "n_test": 4326
       },
@@ -159,7 +159,7 @@
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.34440677114867413,
         "n_train": 17311,
         "n_test": 4328
       },
@@ -167,7 +167,7 @@
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.25943178539399836,
         "n_train": 17311,
         "n_test": 4328
       },
@@ -175,43 +175,43 @@
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
-        "value": 0.40555597280892947,
         "n_train": 17312,
         "n_test": 4327
       }
     ],
     "top_features": {
-      "codon_ATA": 0.0414140235632658,
-      "iso_cat1_host": 0.02601129524409771,
-      "n_predicted_cds": 0.025201210007071494,
-      "aa_frac_C": 0.019132474437355995,
-      "iso_cat1_environmental": 0.01645018421113491,
-      "codon_CGT": 0.014759847987443208,
-      "iso_cat1_engineered": 0.01378793753683567,
-      "genome_size_nt": 0.011305144988000393,
-      "iso_cat2_human": 0.010168002359569073,
-      "codon_TAA": 0.00900037819519639,
-      "aa_frac_V": 0.008459322061389685,
-      "aa_frac_Y": 0.008259046915918588,
-      "aa_frac_L": 0.0072497081011533735,
-      "tetra_CTGG": 0.006922230357304215,
-      "aa_frac_T": 0.006535647064447403,
-      "codon_TGG": 0.006477221753448248,
-      "aa_frac_Q": 0.0063397581689059734,
-      "aa_frac_M": 0.006198597187176347,
-      "tetra_CAAA": 0.006141273584216833,
-      "codon_CAA": 0.00611291266977787
     }
   },
   "salt_tolerance_pct": {
     "task": "regression",
-    "mean_metric": 2.1678824807340775,
     "folds": [
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.0015166708623555,
         "n_train": 3075,
         "n_test": 769
       },
@@ -219,7 +219,7 @@
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 1.933744682528282,
         "n_train": 3075,
         "n_test": 769
       },
@@ -227,7 +227,7 @@
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.8480368776648506,
         "n_train": 3075,
         "n_test": 769
       },
@@ -235,7 +235,7 @@
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 1.9080503232621326,
         "n_train": 3075,
         "n_test": 769
       },
@@ -243,32 +243,32 @@
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
-        "value": 2.148063849352766,
         "n_train": 3076,
         "n_test": 768
       }
     ],
     "top_features": {
-      "neg_charged_frac": 0.07161131724715233,
-      "tetra_ATCC": 0.042717094696126875,
-      "aa_frac_C": 0.03307443875819445,
-      "iso_cat2_saline": 0.029842879995703696,
-      "aa_frac_T": 0.011370222107507289,
-      "codon_CCG": 0.01071425569243729,
-      "tetra_GTTC": 0.008600032026879489,
-      "codon_ATT": 0.007889647269621491,
-      "iso_cat2_built_environment": 0.0076506318233441565,
-      "tetra_TGAT": 0.006314040301367641,
-      "tetra_CGCT": 0.006236091535538435,
-      "tetra_AATT": 0.006198087707161903,
-      "codon_CGT": 0.006119634560309351,
-      "mean_isoelectric_point": 0.005993681214749813,
-      "tetra_GTAT": 0.005874662450514734,
-      "tetra_TCCA": 0.005588621075730771,
-      "aa_frac_Y": 0.005549108772538602,
-      "codon_ACG": 0.005454356223344803,
-      "tetra_TTCC": 0.0053013143828138706,
-      "tetra_CACA": 0.005279732123017311
     }
   },
   "__meta__": {
@@ -690,7 +690,12 @@
       "iso_cat2_urogenital_tract",
       "iso_cat2_waste",
       "iso_cat2_xerophilic",
-      "iso_cat2_yeast"
     ]
   }
 }

 {
   "optimal_temperature_c": {
     "task": "regression",
+    "mean_metric": 2.8569134461172,
     "folds": [
       {
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.952921209821309,
         "n_train": 36496,
         "n_test": 9125
       },
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.6256106255400447,
         "n_train": 36497,
         "n_test": 9124
       },
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.0601953129348187,
         "n_train": 36497,
         "n_test": 9124
       },
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 3.2652047467513965,
         "n_train": 36497,
         "n_test": 9124
       },
         "target": "optimal_temperature_c",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.38063533553843,
         "n_train": 36497,
         "n_test": 9124
       }
     ],
     "top_features": {
+      "ivywrel_frac": 0.12348818182945251,
+      "iso_cat2_thermophilic_gt45_c": 0.028791341930627823,
+      "iso_cat2_patient": 0.025099934451282023,
+      "iso_cat2_human": 0.02344932146370411,
+      "n_predicted_cds": 0.021633704751729967,
+      "iso_cat1_infection": 0.020425693690776826,
+      "aa_frac_C": 0.014341578260064125,
+      "genome_size_nt": 0.01227616611868143,
+      "tetra_CTAA": 0.011777869192883372,
+      "aa_frac_D": 0.01087347036227584,
+      "codon_AGG": 0.009832531120628119,
+      "tetra_GCCT": 0.009409325825981796,
+      "aa_frac_E": 0.008742744009941817,
+      "tetra_TTAG": 0.008621749537996947,
+      "iso_cat1_environmental": 0.00781458979472518,
+      "mean_isoelectric_point": 0.007003072090446949,
+      "aa_frac_Y": 0.00684288409538567,
+      "tetra_AGGC": 0.006669195392169059,
+      "iso_cat2_industrial": 0.006660213135182858,
+      "tetra_TTCC": 0.006564890965819359
     }
   },
   "optimal_ph": {
     "task": "regression",
+    "mean_metric": 0.4824498969036545,
     "folds": [
       {
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.440339089476747,
         "n_train": 4082,
         "n_test": 1021
       },
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.5678683244049492,
         "n_train": 4082,
         "n_test": 1021
       },
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.4943884038785062,
         "n_train": 4082,
         "n_test": 1021
       },
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.46583879377327714,
         "n_train": 4083,
         "n_test": 1020
       },
         "target": "optimal_ph",
         "task": "regression",
         "metric_name": "mae",
+        "value": 0.44381487298479266,
         "n_train": 4083,
         "n_test": 1020
       }
     ],
     "top_features": {
+      "md_ph_median": 0.05177119821310043,
+      "iso_cat2_acidic": 0.030658208578824998,
+      "iso_cat2_alkaline": 0.02869502492249012,
+      "neg_charged_frac": 0.014565921388566494,
+      "aa_frac_H": 0.008134929556399583,
+      "aa_frac_E": 0.007721887435764074,
+      "tetra_CTCT": 0.007108068186789751,
+      "iso_cat2_plant": 0.006769544072449207,
+      "tetra_AGAC": 0.006719858897849917,
+      "tetra_CACT": 0.006461512250825763,
+      "tetra_GACT": 0.0064593076705932615,
+      "tetra_TCTC": 0.005769496783614159,
+      "tetra_TGGG": 0.005730107612907887,
+      "codon_ACG": 0.005510704545304179,
+      "tetra_TAAC": 0.004951662756502629,
+      "mean_isoelectric_point": 0.004634805396199227,
+      "tetra_TGGT": 0.004585431469604373,
+      "tetra_AGTC": 0.004529385082423687,
+      "aa_frac_Y": 0.0043250532820820805,
+      "iso_cat2_plants": 0.004201814788393677
     }
   },
   "oxygen_requirement": {
     "task": "classification",
+    "mean_metric": 0.3574661512390337,
     "folds": [
       {
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.3529974318071035,
         "n_train": 17311,
         "n_test": 4328
       },
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.37463614052135813,
         "n_train": 17311,
         "n_test": 4326
       },
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.357449136753726,
         "n_train": 17311,
         "n_test": 4328
       },
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.2736180772079518,
         "n_train": 17311,
         "n_test": 4328
       },
         "target": "oxygen_requirement",
         "task": "classification",
         "metric_name": "f1_macro",
+        "value": 0.4286299699050292,
         "n_train": 17312,
         "n_test": 4327
       }
     ],
     "top_features": {
+      "codon_ATA": 0.03948382511734962,
+      "iso_cat1_host": 0.02685815468430519,
+      "n_predicted_cds": 0.02664864845573902,
+      "aa_frac_C": 0.01948722042143345,
+      "iso_cat1_environmental": 0.016227377578616142,
+      "codon_CGT": 0.014393045380711556,
+      "iso_cat1_engineered": 0.013875876553356647,
+      "iso_cat2_human": 0.012424463033676147,
+      "genome_size_nt": 0.010264858696609735,
+      "codon_TAA": 0.0082530552521348,
+      "tetra_CAAA": 0.007871841243468226,
+      "aa_frac_V": 0.0073866051156073805,
+      "aa_frac_Y": 0.007194060226902365,
+      "aa_frac_L": 0.006919718533754349,
+      "aa_frac_T": 0.006779328640550375,
+      "md_ph_median": 0.006684697233140469,
+      "aa_frac_Q": 0.006629320327192545,
+      "codon_CAA": 0.006617056485265493,
+      "aa_frac_M": 0.006288983486592769,
+      "codon_TGG": 0.00552113470621407
     }
   },
   "salt_tolerance_pct": {
     "task": "regression",
+    "mean_metric": 2.1124094661234083,
     "folds": [
       {
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 1.9258830039615904,
         "n_train": 3075,
         "n_test": 769
       },
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 1.892595597748997,
         "n_train": 3075,
         "n_test": 769
       },
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.7457253220944784,
         "n_train": 3075,
         "n_test": 769
       },
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 1.870206453444744,
         "n_train": 3075,
         "n_test": 769
       },
         "target": "salt_tolerance_pct",
         "task": "regression",
         "metric_name": "mae",
+        "value": 2.127636953367231,
         "n_train": 3076,
         "n_test": 768
       }
     ],
     "top_features": {
+      "neg_charged_frac": 0.07017230689525604,
+      "tetra_ATCC": 0.04281170674366876,
+      "aa_frac_C": 0.029778398107737303,
+      "iso_cat2_saline": 0.028634220734238623,
+      "md_nacl_pct_median": 0.02563472166657448,
+      "tetra_ACAT": 0.025493022409500556,
+      "md_nacl_pct_max": 0.012753746472299099,
+      "aa_frac_T": 0.011963088880293071,
+      "codon_CCG": 0.009299659519456327,
+      "tetra_TGAT": 0.008889634546358138,
+      "tetra_GTTC": 0.00881260905880481,
+      "codon_TCA": 0.00808500499697402,
+      "mean_isoelectric_point": 0.007483909465372562,
+      "codon_ATT": 0.0072766575030982494,
+      "codon_ACT": 0.006583173375111074,
+      "codon_CGT": 0.005766081786714494,
+      "tetra_TTCG": 0.005739881168119609,
+      "tetra_CGCT": 0.005698419373948127,
+      "codon_TGT": 0.005473139556124806,
+      "aa_frac_S": 0.005398909421637654
     }
   },
   "__meta__": {
       "iso_cat2_urogenital_tract",
       "iso_cat2_waste",
       "iso_cat2_xerophilic",
+      "iso_cat2_yeast",
+      "md_n_media",
+      "md_ph_median",
+      "md_ph_range",
+      "md_nacl_pct_median",
+      "md_nacl_pct_max"
     ]
   }
 }

artifacts/eval_report.md CHANGED Viewed

@@ -1,15 +1,15 @@
 # microbe-model — v0 baseline eval report
-_Generated: 2026-05-05T08:48:33+00:00_
 ## TL;DR
-- **`optimal_temperature_c`**: MAE = **2.94** (vs always-predict-mean 4.98, **+41%**)
-- **`optimal_ph`**: MAE = **0.51** (vs always-predict-mean 0.55, **+7%**)
-- **`oxygen_requirement`**: macro-F1 = **0.341** (vs always-predict-majority 0.059, **+479%**)
-- **`salt_tolerance_pct`**: MAE = **2.17** (vs always-predict-mean 2.51, **+14%**)
-Trained on **46,029** strains with **418** genome-derived features. Cross-validation: 5-fold GroupKFold by taxonomic family.
 ## Corpus
@@ -43,102 +43,102 @@ Each is shown alongside the dumb-baseline (always-predict-mean / always-predict-
 | Target | Task | n labeled | Model metric | Baseline | Improvement |
 |---|---|---|---|---|---|
-| `optimal_temperature_c` | regression | 45,621 | MAE=2.939 | MAE=4.981 | +41.0% |
-| `optimal_ph` | regression | 5,103 | MAE=0.509 | MAE=0.546 | +6.8% |
-| `oxygen_requirement` | classification | 21,639 | F1=0.341 | F1=0.059 | +479.5% |
-| `salt_tolerance_pct` | regression | 3,844 | MAE=2.168 | MAE=2.515 | +13.8% |
 ### `optimal_temperature_c` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | mae = 3.104 | n=36,496 | n=9,125 |
-| 2 | mae = 2.736 | n=36,497 | n=9,124 |
-| 3 | mae = 3.146 | n=36,497 | n=9,124 |
-| 4 | mae = 3.277 | n=36,497 | n=9,124 |
-| 5 | mae = 2.435 | n=36,497 | n=9,124 |
 **Top 10 features for `optimal_temperature_c`:**
-- `ivywrel_frac` — 0.1267
-- `iso_cat2_thermophilic_gt45_c` — 0.0299
-- `n_predicted_cds` — 0.0251
-- `iso_cat2_human` — 0.0209
-- `iso_cat1_infection` — 0.0206
-- `iso_cat2_patient` — 0.0178
-- `aa_frac_C` — 0.0150
-- `genome_size_nt` — 0.0122
-- `aa_frac_D` — 0.0113
-- `codon_AGG` — 0.0109
 ### `optimal_ph` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | mae = 0.456 | n=4,082 | n=1,021 |
-| 2 | mae = 0.626 | n=4,082 | n=1,021 |
-| 3 | mae = 0.528 | n=4,082 | n=1,021 |
-| 4 | mae = 0.480 | n=4,083 | n=1,020 |
-| 5 | mae = 0.454 | n=4,083 | n=1,020 |
 **Top 10 features for `optimal_ph`:**
-- `iso_cat2_acidic` — 0.0522
-- `iso_cat2_alkaline` — 0.0435
-- `neg_charged_frac` — 0.0169
-- `aa_frac_E` — 0.0086
-- `tetra_CTCT` — 0.0084
-- `aa_frac_H` — 0.0080
-- `mean_isoelectric_point` — 0.0076
-- `tetra_CACT` — 0.0074
-- `tetra_AGAC` — 0.0071
-- `tetra_AGGT` — 0.0059
 ### `oxygen_requirement` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | f1_macro = 0.315 | n=17,311 | n=4,328 |
-| 2 | f1_macro = 0.382 | n=17,311 | n=4,326 |
-| 3 | f1_macro = 0.344 | n=17,311 | n=4,328 |
-| 4 | f1_macro = 0.259 | n=17,311 | n=4,328 |
-| 5 | f1_macro = 0.406 | n=17,312 | n=4,327 |
 **Top 10 features for `oxygen_requirement`:**
-- `codon_ATA` — 0.0414
-- `iso_cat1_host` — 0.0260
-- `n_predicted_cds` — 0.0252
-- `aa_frac_C` — 0.0191
-- `iso_cat1_environmental` — 0.0165
-- `codon_CGT` — 0.0148
-- `iso_cat1_engineered` — 0.0138
-- `genome_size_nt` — 0.0113
-- `iso_cat2_human` — 0.0102
-- `codon_TAA` — 0.0090
 ### `salt_tolerance_pct` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
-| 1 | mae = 2.002 | n=3,075 | n=769 |
-| 2 | mae = 1.934 | n=3,075 | n=769 |
-| 3 | mae = 2.848 | n=3,075 | n=769 |
-| 4 | mae = 1.908 | n=3,075 | n=769 |
-| 5 | mae = 2.148 | n=3,076 | n=768 |
 **Top 10 features for `salt_tolerance_pct`:**
-- `neg_charged_frac` — 0.0716
-- `tetra_ATCC` — 0.0427
-- `aa_frac_C` — 0.0331
-- `iso_cat2_saline` — 0.0298
-- `aa_frac_T` — 0.0114
-- `codon_CCG` — 0.0107
-- `tetra_GTTC` — 0.0086
-- `codon_ATT` — 0.0079
-- `iso_cat2_built_environment` — 0.0077
-- `tetra_TGAT` — 0.0063
 ## Feature ↔ target correlations (Spearman, top 10)
@@ -163,16 +163,16 @@ Sanity-checks the biology — features known to track each target should appear
 | Feature | Spearman ρ | p-value |
 |---|---|---|
 | `neg_charged_frac` | +0.304 | 1.6e-109 |
 | `mean_isoelectric_point` | -0.278 | 1.8e-91 |
 | `aa_frac_E` | +0.256 | 4.5e-77 |
 | `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
 | `ivywrel_frac` | +0.159 | 2.4e-30 |
 | `codon_AAG` | -0.154 | 1.7e-28 |
 | `codon_CGA` | +0.153 | 5.8e-28 |
-| `codon_TGC` | -0.151 | 2.6e-27 |
-| `iso_cat2_saline` | +0.137 | 8.9e-23 |
-| `tetra_CACT` | +0.135 | 4.3e-22 |
 ### `salt_tolerance_pct`
@@ -183,11 +183,11 @@ Sanity-checks the biology — features known to track each target should appear
 | `aa_frac_E` | +0.310 | 3.1e-86 |
 | `tetra_GACT` | +0.302 | 4.3e-82 |
 | `tetra_AGTC` | +0.302 | 1.0e-81 |
 | `tetra_ACTC` | +0.282 | 2.2e-71 |
 | `tetra_GAGT` | +0.273 | 1.9e-66 |
 | `iso_cat2_saline` | +0.263 | 9.4e-62 |
-| `aa_frac_D` | +0.257 | 5.3e-59 |
-| `codon_AGC` | -0.252 | 6.0e-57 |
 ## Per-family error breakdown (regression targets)
@@ -197,61 +197,61 @@ Top 15 most-represented families, MAE per family. Highlights where the model is
 | Family | n | MAE |
 |---|---|---|
-| Enterobacteriaceae | 2662 | 4.086 |
-| Streptomycetaceae | 2212 | 1.919 |
-| Bacillaceae | 1886 | 3.195 |
-| Lactobacillaceae | 1732 | 3.537 |
-| Pseudomonadaceae | 1621 | 2.576 |
-| Myxococcaceae | 1546 | 0.403 |
-| Streptococcaceae | 1170 | 2.367 |
-| Staphylococcaceae | 1068 | 4.288 |
-| Flavobacteriaceae | 981 | 4.202 |
-| Corynebacteriaceae | 900 | 2.231 |
-| Moraxellaceae | 890 | 3.514 |
-| Paenibacillaceae | 760 | 2.967 |
-| Microbacteriaceae | 734 | 2.482 |
-| Micrococcaceae | 719 | 2.991 |
-| Nocardiaceae | 715 | 2.679 |
 ### `optimal_ph`
 | Family | n | MAE |
 |---|---|---|
-| Flavobacteriaceae | 355 | 0.391 |
-| Bacillaceae | 298 | 0.678 |
-| Roseobacteraceae | 204 | 0.400 |
-| Paenibacillaceae | 139 | 0.435 |
-| Microbacteriaceae | 120 | 0.438 |
-| Sphingobacteriaceae | 114 | 0.353 |
-| Sphingomonadaceae | 102 | 0.346 |
-| Streptomycetaceae | 98 | 0.599 |
-| Pseudonocardiaceae | 93 | 0.495 |
-| Halomonadaceae | 82 | 0.603 |
-| Micrococcaceae | 82 | 0.619 |
-| Nocardioidaceae | 80 | 0.490 |
-| Paracoccaceae | 76 | 0.564 |
-| Alteromonadaceae | 71 | 0.349 |
-| Erythrobacteraceae | 68 | 0.423 |
 ### `salt_tolerance_pct`
 | Family | n | MAE |
 |---|---|---|
-| Flavobacteriaceae | 267 | 1.917 |
-| Streptomycetaceae | 264 | 2.022 |
-| Bacillaceae | 201 | 3.508 |
-| Roseobacteraceae | 127 | 1.416 |
-| Pseudonocardiaceae | 123 | 2.315 |
-| Paenibacillaceae | 93 | 1.792 |
-| Enterococcaceae | 93 | 2.822 |
-| Microbacteriaceae | 91 | 2.824 |
-| Micromonosporaceae | 90 | 1.550 |
-| Sphingomonadaceae | 81 | 0.923 |
-| Micrococcaceae | 71 | 2.768 |
-| Streptosporangiaceae | 68 | 1.546 |
-| Lactobacillaceae | 66 | 2.367 |
-| Sphingobacteriaceae | 55 | 1.236 |
-| Halomonadaceae | 52 | 2.820 |
 ## Known limitations

 # microbe-model — v0 baseline eval report
+_Generated: 2026-05-05T10:42:09+00:00_
 ## TL;DR
+- **`optimal_temperature_c`**: MAE = **2.86** (vs always-predict-mean 4.98, **+43%**)
+- **`optimal_ph`**: MAE = **0.48** (vs always-predict-mean 0.55, **+12%**)
+- **`oxygen_requirement`**: macro-F1 = **0.357** (vs always-predict-majority 0.059, **+507%**)
+- **`salt_tolerance_pct`**: MAE = **2.11** (vs always-predict-mean 2.51, **+16%**)
+Trained on **46,029** strains with **423** genome-derived features. Cross-validation: 5-fold GroupKFold by taxonomic family.
 ## Corpus
 | Target | Task | n labeled | Model metric | Baseline | Improvement |
 |---|---|---|---|---|---|
+| `optimal_temperature_c` | regression | 45,621 | MAE=2.857 | MAE=4.981 | +42.6% |
+| `optimal_ph` | regression | 5,103 | MAE=0.482 | MAE=0.546 | +11.6% |
+| `oxygen_requirement` | classification | 21,639 | F1=0.357 | F1=0.059 | +507.0% |
+| `salt_tolerance_pct` | regression | 3,844 | MAE=2.112 | MAE=2.515 | +16.0% |
 ### `optimal_temperature_c` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | mae = 2.953 | n=36,496 | n=9,125 |
+| 2 | mae = 2.626 | n=36,497 | n=9,124 |
+| 3 | mae = 3.060 | n=36,497 | n=9,124 |
+| 4 | mae = 3.265 | n=36,497 | n=9,124 |
+| 5 | mae = 2.381 | n=36,497 | n=9,124 |
 **Top 10 features for `optimal_temperature_c`:**
+- `ivywrel_frac` — 0.1235
+- `iso_cat2_thermophilic_gt45_c` — 0.0288
+- `iso_cat2_patient` — 0.0251
+- `iso_cat2_human` — 0.0234
+- `n_predicted_cds` — 0.0216
+- `iso_cat1_infection` — 0.0204
+- `aa_frac_C` — 0.0143
+- `genome_size_nt` — 0.0123
+- `tetra_CTAA` — 0.0118
+- `aa_frac_D` — 0.0109
 ### `optimal_ph` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | mae = 0.440 | n=4,082 | n=1,021 |
+| 2 | mae = 0.568 | n=4,082 | n=1,021 |
+| 3 | mae = 0.494 | n=4,082 | n=1,021 |
+| 4 | mae = 0.466 | n=4,083 | n=1,020 |
+| 5 | mae = 0.444 | n=4,083 | n=1,020 |
 **Top 10 features for `optimal_ph`:**
+- `md_ph_median` — 0.0518
+- `iso_cat2_acidic` — 0.0307
+- `iso_cat2_alkaline` — 0.0287
+- `neg_charged_frac` — 0.0146
+- `aa_frac_H` — 0.0081
+- `aa_frac_E` — 0.0077
+- `tetra_CTCT` — 0.0071
+- `iso_cat2_plant` — 0.0068
+- `tetra_AGAC` — 0.0067
+- `tetra_CACT` — 0.0065
 ### `oxygen_requirement` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | f1_macro = 0.353 | n=17,311 | n=4,328 |
+| 2 | f1_macro = 0.375 | n=17,311 | n=4,326 |
+| 3 | f1_macro = 0.357 | n=17,311 | n=4,328 |
+| 4 | f1_macro = 0.274 | n=17,311 | n=4,328 |
+| 5 | f1_macro = 0.429 | n=17,312 | n=4,327 |
 **Top 10 features for `oxygen_requirement`:**
+- `codon_ATA` — 0.0395
+- `iso_cat1_host` — 0.0269
+- `n_predicted_cds` — 0.0266
+- `aa_frac_C` — 0.0195
+- `iso_cat1_environmental` — 0.0162
+- `codon_CGT` — 0.0144
+- `iso_cat1_engineered` — 0.0139
+- `iso_cat2_human` — 0.0124
+- `genome_size_nt` — 0.0103
+- `codon_TAA` — 0.0083
 ### `salt_tolerance_pct` — fold-by-fold
 | Fold | Metric | Train | Test |
 |---|---|---|---|
+| 1 | mae = 1.926 | n=3,075 | n=769 |
+| 2 | mae = 1.893 | n=3,075 | n=769 |
+| 3 | mae = 2.746 | n=3,075 | n=769 |
+| 4 | mae = 1.870 | n=3,075 | n=769 |
+| 5 | mae = 2.128 | n=3,076 | n=768 |
 **Top 10 features for `salt_tolerance_pct`:**
+- `neg_charged_frac` — 0.0702
+- `tetra_ATCC` — 0.0428
+- `aa_frac_C` — 0.0298
+- `iso_cat2_saline` — 0.0286
+- `md_nacl_pct_median` — 0.0256
+- `tetra_ACAT` — 0.0255
+- `md_nacl_pct_max` — 0.0128
+- `aa_frac_T` — 0.0120
+- `codon_CCG` — 0.0093
+- `tetra_TGAT` — 0.0089
 ## Feature ↔ target correlations (Spearman, top 10)
 | Feature | Spearman ρ | p-value |
 |---|---|---|
+| `md_ph_median` | +0.429 | 4.0e-131 |
 | `neg_charged_frac` | +0.304 | 1.6e-109 |
 | `mean_isoelectric_point` | -0.278 | 1.8e-91 |
 | `aa_frac_E` | +0.256 | 4.5e-77 |
+| `md_nacl_pct_max` | +0.218 | 1.9e-33 |
+| `md_nacl_pct_median` | +0.212 | 9.9e-32 |
 | `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
 | `ivywrel_frac` | +0.159 | 2.4e-30 |
 | `codon_AAG` | -0.154 | 1.7e-28 |
 | `codon_CGA` | +0.153 | 5.8e-28 |
 ### `salt_tolerance_pct`
 | `aa_frac_E` | +0.310 | 3.1e-86 |
 | `tetra_GACT` | +0.302 | 4.3e-82 |
 | `tetra_AGTC` | +0.302 | 1.0e-81 |
+| `md_nacl_pct_max` | +0.298 | 2.9e-52 |
+| `md_nacl_pct_median` | +0.290 | 1.6e-49 |
 | `tetra_ACTC` | +0.282 | 2.2e-71 |
 | `tetra_GAGT` | +0.273 | 1.9e-66 |
 | `iso_cat2_saline` | +0.263 | 9.4e-62 |
 ## Per-family error breakdown (regression targets)
 | Family | n | MAE |
 |---|---|---|
+| Enterobacteriaceae | 2662 | 3.792 |
+| Streptomycetaceae | 2212 | 1.783 |
+| Bacillaceae | 1886 | 3.174 |
+| Lactobacillaceae | 1732 | 3.709 |
+| Pseudomonadaceae | 1621 | 2.488 |
+| Myxococcaceae | 1546 | 0.238 |
+| Streptococcaceae | 1170 | 2.537 |
+| Staphylococcaceae | 1068 | 3.374 |
+| Flavobacteriaceae | 981 | 4.116 |
+| Corynebacteriaceae | 900 | 2.146 |
+| Moraxellaceae | 890 | 3.388 |
+| Paenibacillaceae | 760 | 3.081 |
+| Microbacteriaceae | 734 | 2.459 |
+| Micrococcaceae | 719 | 2.811 |
+| Nocardiaceae | 715 | 2.276 |
 ### `optimal_ph`
 | Family | n | MAE |
 |---|---|---|
+| Flavobacteriaceae | 355 | 0.405 |
+| Bacillaceae | 298 | 0.606 |
+| Roseobacteraceae | 204 | 0.375 |
+| Paenibacillaceae | 139 | 0.469 |
+| Microbacteriaceae | 120 | 0.446 |
+| Sphingobacteriaceae | 114 | 0.336 |
+| Sphingomonadaceae | 102 | 0.319 |
+| Streptomycetaceae | 98 | 0.513 |
+| Pseudonocardiaceae | 93 | 0.479 |
+| Halomonadaceae | 82 | 0.584 |
+| Micrococcaceae | 82 | 0.613 |
+| Nocardioidaceae | 80 | 0.502 |
+| Paracoccaceae | 76 | 0.574 |
+| Alteromonadaceae | 71 | 0.355 |
+| Erythrobacteraceae | 68 | 0.446 |
 ### `salt_tolerance_pct`
 | Family | n | MAE |
 |---|---|---|
+| Flavobacteriaceae | 267 | 1.713 |
+| Streptomycetaceae | 264 | 1.987 |
+| Bacillaceae | 201 | 3.315 |
+| Roseobacteraceae | 127 | 1.395 |
+| Pseudonocardiaceae | 123 | 2.280 |
+| Paenibacillaceae | 93 | 1.651 |
+| Enterococcaceae | 93 | 2.935 |
+| Microbacteriaceae | 91 | 2.789 |
+| Micromonosporaceae | 90 | 1.609 |
+| Sphingomonadaceae | 81 | 1.028 |
+| Micrococcaceae | 71 | 2.613 |
+| Streptosporangiaceae | 68 | 1.480 |
+| Lactobacillaceae | 66 | 2.559 |
+| Sphingobacteriaceae | 55 | 1.218 |
+| Halomonadaceae | 52 | 2.815 |
 ## Known limitations

scripts/03_train_baseline.py CHANGED Viewed

@@ -71,8 +71,20 @@ def main() -> None:
     print(f"Encoded {len(iso_cols)} isolation-category features "
           f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
     feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
-    feature_cols = feature_cols + iso_cols
     print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
     print(f"Distinct groups: {df['group'].nunique():,}")

     print(f"Encoded {len(iso_cols)} isolation-category features "
           f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
+    md_path = config.DATA / "mediadive_features.parquet"
+    md_cols: list[str] = []
+    if md_path.exists():
+        md = pd.read_parquet(md_path)
+        md["bacdive_id"] = md["bacdive_id"].astype(int)
+        df["bacdive_id"] = df["bacdive_id"].astype(int)
+        md_cols = [c for c in md.columns if c != "bacdive_id"]
+        df = df.merge(md, on="bacdive_id", how="left")
+        n_with_md = df[md_cols[0]].notna().sum() if md_cols else 0
+        print(f"Joined MediaDive features ({len(md_cols)} cols) — "
+              f"{n_with_md:,}/{len(df):,} training rows have MediaDive data")
     feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
+    feature_cols = feature_cols + iso_cols + md_cols
     print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
     print(f"Distinct groups: {df['group'].nunique():,}")

scripts/20_build_mediadive_features.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""Build per-strain MediaDive features from strain_media + media_recipes + raw JSON.
+For each BacDive strain, compute the median pH and NaCl% across all DSMZ media that
+strain has been recorded as growing on. These are NOT labels — they're additional
+features the model can use to predict the actual phenotype optima. Saves to
+data/mediadive_features.parquet (joined into the training table by scripts/03).
+Per-strain features written:
+  - md_n_media:        count of media the strain grows on
+  - md_ph_median:      median midpoint(min_pH, max_pH) across those media
+  - md_ph_range:       max - min of medium pH across those media
+  - md_nacl_pct_median:median NaCl % w/v across those media
+  - md_nacl_pct_max:   max NaCl % w/v (highest tolerated)
+Sanity check: where a BacDive optimum_pH or salt_tolerance_pct exists, we expect
+moderate (not perfect) correlation with the corresponding MediaDive feature.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import pandas as pd
+from microbe_model import config
+NACL_CAP_PCT = 30.0  # clip recipes with absurd NaCl values (parse artifacts)
def build_medium_ph_map(cache_dir: "Path | None" = None) -> dict[str, float]:
    """Return ``{medium_id: midpoint pH}`` from the raw MediaDive JSON cache.

    Every ``*.json`` file in the cache directory is expected to hold an object
    with a ``"medium"`` object carrying ``id``, ``min_pH`` and ``max_pH``. The
    midpoint is ``(min_pH + max_pH) / 2``. Files that cannot be decoded or
    parsed, or whose fields are missing or non-numeric, are skipped: the scan
    is best-effort and never raises for a single bad file.

    Args:
        cache_dir: Directory holding the cached JSON files. Defaults to
            ``config.DATA / "mediadive"`` when omitted.

    Returns:
        Mapping from ``str(medium id)`` to the midpoint pH as a float.
    """
    if cache_dir is None:
        cache_dir = config.DATA / "mediadive"

    out: dict[str, float] = {}
    # Sorted so that, if two files ever carry the same medium id, the winner
    # does not depend on filesystem enumeration order.
    for path in sorted(Path(cache_dir).glob("*.json")):
        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            d = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            continue
        if not isinstance(d, dict):
            continue
        m = d.get("medium")
        if not isinstance(m, dict):
            continue
        mid = m.get("id")
        min_ph = m.get("min_pH")
        max_ph = m.get("max_pH")
        if mid is None or min_ph is None or max_ph is None:
            continue
        try:
            midpoint = (float(min_ph) + float(max_ph)) / 2
        except (ValueError, TypeError):
            continue
        # Keys are strings so they line up with string-cast medium ids.
        out[str(mid)] = midpoint
    return out
def build_medium_nacl_map(
    recipes: "pd.DataFrame | None" = None,
    cap_pct: "float | None" = None,
) -> dict[str, float]:
    """Return ``{medium_id: NaCl % w/v}`` summed from recipe compounds (clipped).

    Rows whose ``compound`` matches ``sodium chlor`` anywhere, or is exactly
    ``nacl`` (case-insensitive), are treated as NaCl. Their ``g_l`` amounts are
    summed per medium and divided by 10 to convert g/L to % w/v, then clipped
    to the cap.

    Args:
        recipes: Recipe table with ``medium_id``, ``compound`` and ``g_l``
            columns. Defaults to ``config.DATA / "media_recipes.parquet"``.
        cap_pct: Upper clip for the per-medium percentage. Defaults to the
            module-level ``NACL_CAP_PCT``.

    Returns:
        Mapping from ``str(medium id)`` to the clipped NaCl percentage.
    """
    if recipes is None:
        recipes = pd.read_parquet(config.DATA / "media_recipes.parquet")
    if cap_pct is None:
        cap_pct = NACL_CAP_PCT

    is_nacl = recipes["compound"].str.contains(
        r"sodium chlor|^nacl$", case=False, na=False, regex=True
    )
    # Drop rows without a medium id before the string cast, otherwise they
    # would all collapse into a bogus "nan" medium.
    nacl = recipes.loc[is_nacl, ["medium_id", "g_l"]].dropna(subset=["medium_id"])
    # Keys must be strings: the consumer looks media up by string-cast id, and
    # a raw integer key would silently miss and be treated as "no NaCl".
    medium_ids = nacl["medium_id"].astype(str)
    pct = (nacl["g_l"].groupby(medium_ids).sum() / 10).clip(upper=cap_pct)
    return pct.astype(float).to_dict()
def main() -> None:
    """Build per-strain MediaDive features and write them to parquet.

    Keeps only strain/medium rows whose ``growth`` is "yes" (case-insensitive),
    attaches each medium's midpoint pH and NaCl percentage, aggregates per
    ``bacdive_id`` and saves the result to
    ``config.DATA / "mediadive_features.parquet"``.
    """
    strain_media = pd.read_parquet(config.DATA / "strain_media.parquet")
    grows = strain_media["growth"].str.lower() == "yes"
    strain_media = strain_media.loc[grows].copy()
    # Medium ids are compared as strings against the lookup maps below.
    strain_media["medium_id"] = strain_media["medium_id"].astype(str)

    ph_map = build_medium_ph_map()
    nacl_map = build_medium_nacl_map()
    print(f"medium pH map: {len(ph_map):,} media")
    print(f"medium NaCl map: {len(nacl_map):,} media")

    medium_ids = strain_media["medium_id"]
    # Media without a known pH stay NaN and are ignored by the aggregations.
    strain_media["m_ph"] = medium_ids.map(ph_map)
    # Strains may grow on media not in the recipe table — treat absent as 0% NaCl
    strain_media["m_nacl"] = medium_ids.map(nacl_map).fillna(0.0)

    # One output row per strain.
    by_strain = strain_media.groupby("bacdive_id")
    ph = by_strain["m_ph"]
    nacl = by_strain["m_nacl"]
    ph_spread = ph.max() - ph.min()
    columns = {
        "md_n_media": by_strain.size(),
        "md_ph_median": ph.median(),
        "md_ph_range": ph_spread,
        "md_nacl_pct_median": nacl.median(),
        "md_nacl_pct_max": nacl.max(),
    }
    feat = pd.DataFrame(columns).reset_index()

    out = config.DATA / "mediadive_features.parquet"
    feat.to_parquet(out, index=False)
    print(f"\nwrote {len(feat):,} strains to {out}")
    print(feat.describe().round(2).to_string())


if __name__ == "__main__":
    main()