Miyu Horiuchi Claude Opus 4.7 (1M context) committed on
Commit
5df9ef8
·
1 Parent(s): 56b0c4e

Add MediaDive-derived features (medium pH, NaCl, n_media) — all 4 targets improve

Browse files

For each strain in data/strain_media.parquet, compute median pH and NaCl% across
the DSMZ media it has been grown on, plus a count of media. These are model inputs
(features), NOT labels — the previous probe showed BacDive↔MediaDive label
correlation is only 0.42 for salt, so using MediaDive as a label source would
corrupt the now-clean salt MAE. As features, the model learns the right weighting.

5 new features per strain:
md_n_media count of media the strain grows on
md_ph_median median midpoint(min_pH, max_pH)
md_ph_range spread (max - min) of medium pH
md_nacl_pct_median median NaCl % w/v across recipes
md_nacl_pct_max highest NaCl % w/v among the media the strain grows on

Coverage: 28,704 strains (62% of 46K training table) have MediaDive data.

Cumulative metrics vs original v0 genome-only baseline:
optimal_temperature_c MAE 3.28 → 2.86 (-12.9%)
optimal_ph MAE 0.52 → 0.48 ( -7.7%)
oxygen_requirement F1 0.279 → 0.357 (+28.2%)
salt_tolerance_pct MAE 2.51 → 2.11 (-15.9%)

This step alone:
T_opt 2.94 → 2.86
pH 0.51 → 0.48
Oxygen 0.341 → 0.357
Salt 2.17 → 2.11

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

artifacts/baseline_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "optimal_temperature_c": {
3
  "task": "regression",
4
- "mean_metric": 2.939444159350111,
5
  "folds": [
6
  {
7
  "target": "optimal_temperature_c",
8
  "task": "regression",
9
  "metric_name": "mae",
10
- "value": 3.103597222252415,
11
  "n_train": 36496,
12
  "n_test": 9125
13
  },
@@ -15,7 +15,7 @@
15
  "target": "optimal_temperature_c",
16
  "task": "regression",
17
  "metric_name": "mae",
18
- "value": 2.7356862682357583,
19
  "n_train": 36497,
20
  "n_test": 9124
21
  },
@@ -23,7 +23,7 @@
23
  "target": "optimal_temperature_c",
24
  "task": "regression",
25
  "metric_name": "mae",
26
- "value": 3.145843773419164,
27
  "n_train": 36497,
28
  "n_test": 9124
29
  },
@@ -31,7 +31,7 @@
31
  "target": "optimal_temperature_c",
32
  "task": "regression",
33
  "metric_name": "mae",
34
- "value": 3.2767152481045656,
35
  "n_train": 36497,
36
  "n_test": 9124
37
  },
@@ -39,43 +39,43 @@
39
  "target": "optimal_temperature_c",
40
  "task": "regression",
41
  "metric_name": "mae",
42
- "value": 2.43537828473865,
43
  "n_train": 36497,
44
  "n_test": 9124
45
  }
46
  ],
47
  "top_features": {
48
- "ivywrel_frac": 0.12668818831443787,
49
- "iso_cat2_thermophilic_gt45_c": 0.029868930205702783,
50
- "n_predicted_cds": 0.025075340643525124,
51
- "iso_cat2_human": 0.020858772844076157,
52
- "iso_cat1_infection": 0.020640516839921474,
53
- "iso_cat2_patient": 0.017751351464539766,
54
- "aa_frac_C": 0.015003016591072083,
55
- "genome_size_nt": 0.012203263118863106,
56
- "aa_frac_D": 0.011290411837399006,
57
- "codon_AGG": 0.010900856088846922,
58
- "iso_cat1_environmental": 0.010176281817257405,
59
- "tetra_GCCT": 0.009658925677649676,
60
- "tetra_TAGT": 0.00883282758295536,
61
- "aa_frac_Y": 0.008421392692252994,
62
- "aa_frac_E": 0.007741594593971968,
63
- "tetra_TTCC": 0.007376640872098506,
64
- "mean_isoelectric_point": 0.007058459660038352,
65
- "tetra_CTAA": 0.0070426638238132,
66
- "iso_cat2_built_environment": 0.006164434866514057,
67
- "iso_cat2_industrial": 0.005895084328949451
68
  }
69
  },
70
  "optimal_ph": {
71
  "task": "regression",
72
- "mean_metric": 0.5090253015368336,
73
  "folds": [
74
  {
75
  "target": "optimal_ph",
76
  "task": "regression",
77
  "metric_name": "mae",
78
- "value": 0.45639293885487886,
79
  "n_train": 4082,
80
  "n_test": 1021
81
  },
@@ -83,7 +83,7 @@
83
  "target": "optimal_ph",
84
  "task": "regression",
85
  "metric_name": "mae",
86
- "value": 0.6262803867911733,
87
  "n_train": 4082,
88
  "n_test": 1021
89
  },
@@ -91,7 +91,7 @@
91
  "target": "optimal_ph",
92
  "task": "regression",
93
  "metric_name": "mae",
94
- "value": 0.528334212326513,
95
  "n_train": 4082,
96
  "n_test": 1021
97
  },
@@ -99,7 +99,7 @@
99
  "target": "optimal_ph",
100
  "task": "regression",
101
  "metric_name": "mae",
102
- "value": 0.48048674237494376,
103
  "n_train": 4083,
104
  "n_test": 1020
105
  },
@@ -107,43 +107,43 @@
107
  "target": "optimal_ph",
108
  "task": "regression",
109
  "metric_name": "mae",
110
- "value": 0.4536322273366591,
111
  "n_train": 4083,
112
  "n_test": 1020
113
  }
114
  ],
115
  "top_features": {
116
- "iso_cat2_acidic": 0.05219607315957546,
117
- "iso_cat2_alkaline": 0.043521419167518616,
118
- "neg_charged_frac": 0.016875072754919528,
119
- "aa_frac_E": 0.008599728252738715,
120
- "tetra_CTCT": 0.008368687890470027,
121
- "aa_frac_H": 0.008003219496458769,
122
- "mean_isoelectric_point": 0.007599354162812233,
123
- "tetra_CACT": 0.007427609874866903,
124
- "tetra_AGAC": 0.007137532206252217,
125
- "tetra_AGGT": 0.005891842069104314,
126
- "tetra_GACT": 0.005873983446508646,
127
- "tetra_GAGA": 0.005548427533358336,
128
- "tetra_GTCT": 0.005475769587792456,
129
- "codon_GAA": 0.005408304557204246,
130
- "n_predicted_cds": 0.005280579440295696,
131
- "iso_cat2_plants": 0.005045945569872856,
132
- "tetra_TTGA": 0.004973787232302129,
133
- "codon_AAG": 0.0048154488438740374,
134
- "tetra_ACGA": 0.004731484339572489,
135
- "aa_frac_Y": 0.0046834095381200315
136
  }
137
  },
138
  "oxygen_requirement": {
139
  "task": "classification",
140
- "mean_metric": 0.34127360853732613,
141
  "folds": [
142
  {
143
  "target": "oxygen_requirement",
144
  "task": "classification",
145
  "metric_name": "f1_macro",
146
- "value": 0.31515576471296236,
147
  "n_train": 17311,
148
  "n_test": 4328
149
  },
@@ -151,7 +151,7 @@
151
  "target": "oxygen_requirement",
152
  "task": "classification",
153
  "metric_name": "f1_macro",
154
- "value": 0.38181774862206597,
155
  "n_train": 17311,
156
  "n_test": 4326
157
  },
@@ -159,7 +159,7 @@
159
  "target": "oxygen_requirement",
160
  "task": "classification",
161
  "metric_name": "f1_macro",
162
- "value": 0.34440677114867413,
163
  "n_train": 17311,
164
  "n_test": 4328
165
  },
@@ -167,7 +167,7 @@
167
  "target": "oxygen_requirement",
168
  "task": "classification",
169
  "metric_name": "f1_macro",
170
- "value": 0.25943178539399836,
171
  "n_train": 17311,
172
  "n_test": 4328
173
  },
@@ -175,43 +175,43 @@
175
  "target": "oxygen_requirement",
176
  "task": "classification",
177
  "metric_name": "f1_macro",
178
- "value": 0.40555597280892947,
179
  "n_train": 17312,
180
  "n_test": 4327
181
  }
182
  ],
183
  "top_features": {
184
- "codon_ATA": 0.0414140235632658,
185
- "iso_cat1_host": 0.02601129524409771,
186
- "n_predicted_cds": 0.025201210007071494,
187
- "aa_frac_C": 0.019132474437355995,
188
- "iso_cat1_environmental": 0.01645018421113491,
189
- "codon_CGT": 0.014759847987443208,
190
- "iso_cat1_engineered": 0.01378793753683567,
191
- "genome_size_nt": 0.011305144988000393,
192
- "iso_cat2_human": 0.010168002359569073,
193
- "codon_TAA": 0.00900037819519639,
194
- "aa_frac_V": 0.008459322061389685,
195
- "aa_frac_Y": 0.008259046915918588,
196
- "aa_frac_L": 0.0072497081011533735,
197
- "tetra_CTGG": 0.006922230357304215,
198
- "aa_frac_T": 0.006535647064447403,
199
- "codon_TGG": 0.006477221753448248,
200
- "aa_frac_Q": 0.0063397581689059734,
201
- "aa_frac_M": 0.006198597187176347,
202
- "tetra_CAAA": 0.006141273584216833,
203
- "codon_CAA": 0.00611291266977787
204
  }
205
  },
206
  "salt_tolerance_pct": {
207
  "task": "regression",
208
- "mean_metric": 2.1678824807340775,
209
  "folds": [
210
  {
211
  "target": "salt_tolerance_pct",
212
  "task": "regression",
213
  "metric_name": "mae",
214
- "value": 2.0015166708623555,
215
  "n_train": 3075,
216
  "n_test": 769
217
  },
@@ -219,7 +219,7 @@
219
  "target": "salt_tolerance_pct",
220
  "task": "regression",
221
  "metric_name": "mae",
222
- "value": 1.933744682528282,
223
  "n_train": 3075,
224
  "n_test": 769
225
  },
@@ -227,7 +227,7 @@
227
  "target": "salt_tolerance_pct",
228
  "task": "regression",
229
  "metric_name": "mae",
230
- "value": 2.8480368776648506,
231
  "n_train": 3075,
232
  "n_test": 769
233
  },
@@ -235,7 +235,7 @@
235
  "target": "salt_tolerance_pct",
236
  "task": "regression",
237
  "metric_name": "mae",
238
- "value": 1.9080503232621326,
239
  "n_train": 3075,
240
  "n_test": 769
241
  },
@@ -243,32 +243,32 @@
243
  "target": "salt_tolerance_pct",
244
  "task": "regression",
245
  "metric_name": "mae",
246
- "value": 2.148063849352766,
247
  "n_train": 3076,
248
  "n_test": 768
249
  }
250
  ],
251
  "top_features": {
252
- "neg_charged_frac": 0.07161131724715233,
253
- "tetra_ATCC": 0.042717094696126875,
254
- "aa_frac_C": 0.03307443875819445,
255
- "iso_cat2_saline": 0.029842879995703696,
256
- "aa_frac_T": 0.011370222107507289,
257
- "codon_CCG": 0.01071425569243729,
258
- "tetra_GTTC": 0.008600032026879489,
259
- "codon_ATT": 0.007889647269621491,
260
- "iso_cat2_built_environment": 0.0076506318233441565,
261
- "tetra_TGAT": 0.006314040301367641,
262
- "tetra_CGCT": 0.006236091535538435,
263
- "tetra_AATT": 0.006198087707161903,
264
- "codon_CGT": 0.006119634560309351,
265
- "mean_isoelectric_point": 0.005993681214749813,
266
- "tetra_GTAT": 0.005874662450514734,
267
- "tetra_TCCA": 0.005588621075730771,
268
- "aa_frac_Y": 0.005549108772538602,
269
- "codon_ACG": 0.005454356223344803,
270
- "tetra_TTCC": 0.0053013143828138706,
271
- "tetra_CACA": 0.005279732123017311
272
  }
273
  },
274
  "__meta__": {
@@ -690,7 +690,12 @@
690
  "iso_cat2_urogenital_tract",
691
  "iso_cat2_waste",
692
  "iso_cat2_xerophilic",
693
- "iso_cat2_yeast"
 
 
 
 
 
694
  ]
695
  }
696
  }
 
1
  {
2
  "optimal_temperature_c": {
3
  "task": "regression",
4
+ "mean_metric": 2.8569134461172,
5
  "folds": [
6
  {
7
  "target": "optimal_temperature_c",
8
  "task": "regression",
9
  "metric_name": "mae",
10
+ "value": 2.952921209821309,
11
  "n_train": 36496,
12
  "n_test": 9125
13
  },
 
15
  "target": "optimal_temperature_c",
16
  "task": "regression",
17
  "metric_name": "mae",
18
+ "value": 2.6256106255400447,
19
  "n_train": 36497,
20
  "n_test": 9124
21
  },
 
23
  "target": "optimal_temperature_c",
24
  "task": "regression",
25
  "metric_name": "mae",
26
+ "value": 3.0601953129348187,
27
  "n_train": 36497,
28
  "n_test": 9124
29
  },
 
31
  "target": "optimal_temperature_c",
32
  "task": "regression",
33
  "metric_name": "mae",
34
+ "value": 3.2652047467513965,
35
  "n_train": 36497,
36
  "n_test": 9124
37
  },
 
39
  "target": "optimal_temperature_c",
40
  "task": "regression",
41
  "metric_name": "mae",
42
+ "value": 2.38063533553843,
43
  "n_train": 36497,
44
  "n_test": 9124
45
  }
46
  ],
47
  "top_features": {
48
+ "ivywrel_frac": 0.12348818182945251,
49
+ "iso_cat2_thermophilic_gt45_c": 0.028791341930627823,
50
+ "iso_cat2_patient": 0.025099934451282023,
51
+ "iso_cat2_human": 0.02344932146370411,
52
+ "n_predicted_cds": 0.021633704751729967,
53
+ "iso_cat1_infection": 0.020425693690776826,
54
+ "aa_frac_C": 0.014341578260064125,
55
+ "genome_size_nt": 0.01227616611868143,
56
+ "tetra_CTAA": 0.011777869192883372,
57
+ "aa_frac_D": 0.01087347036227584,
58
+ "codon_AGG": 0.009832531120628119,
59
+ "tetra_GCCT": 0.009409325825981796,
60
+ "aa_frac_E": 0.008742744009941817,
61
+ "tetra_TTAG": 0.008621749537996947,
62
+ "iso_cat1_environmental": 0.00781458979472518,
63
+ "mean_isoelectric_point": 0.007003072090446949,
64
+ "aa_frac_Y": 0.00684288409538567,
65
+ "tetra_AGGC": 0.006669195392169059,
66
+ "iso_cat2_industrial": 0.006660213135182858,
67
+ "tetra_TTCC": 0.006564890965819359
68
  }
69
  },
70
  "optimal_ph": {
71
  "task": "regression",
72
+ "mean_metric": 0.4824498969036545,
73
  "folds": [
74
  {
75
  "target": "optimal_ph",
76
  "task": "regression",
77
  "metric_name": "mae",
78
+ "value": 0.440339089476747,
79
  "n_train": 4082,
80
  "n_test": 1021
81
  },
 
83
  "target": "optimal_ph",
84
  "task": "regression",
85
  "metric_name": "mae",
86
+ "value": 0.5678683244049492,
87
  "n_train": 4082,
88
  "n_test": 1021
89
  },
 
91
  "target": "optimal_ph",
92
  "task": "regression",
93
  "metric_name": "mae",
94
+ "value": 0.4943884038785062,
95
  "n_train": 4082,
96
  "n_test": 1021
97
  },
 
99
  "target": "optimal_ph",
100
  "task": "regression",
101
  "metric_name": "mae",
102
+ "value": 0.46583879377327714,
103
  "n_train": 4083,
104
  "n_test": 1020
105
  },
 
107
  "target": "optimal_ph",
108
  "task": "regression",
109
  "metric_name": "mae",
110
+ "value": 0.44381487298479266,
111
  "n_train": 4083,
112
  "n_test": 1020
113
  }
114
  ],
115
  "top_features": {
116
+ "md_ph_median": 0.05177119821310043,
117
+ "iso_cat2_acidic": 0.030658208578824998,
118
+ "iso_cat2_alkaline": 0.02869502492249012,
119
+ "neg_charged_frac": 0.014565921388566494,
120
+ "aa_frac_H": 0.008134929556399583,
121
+ "aa_frac_E": 0.007721887435764074,
122
+ "tetra_CTCT": 0.007108068186789751,
123
+ "iso_cat2_plant": 0.006769544072449207,
124
+ "tetra_AGAC": 0.006719858897849917,
125
+ "tetra_CACT": 0.006461512250825763,
126
+ "tetra_GACT": 0.0064593076705932615,
127
+ "tetra_TCTC": 0.005769496783614159,
128
+ "tetra_TGGG": 0.005730107612907887,
129
+ "codon_ACG": 0.005510704545304179,
130
+ "tetra_TAAC": 0.004951662756502629,
131
+ "mean_isoelectric_point": 0.004634805396199227,
132
+ "tetra_TGGT": 0.004585431469604373,
133
+ "tetra_AGTC": 0.004529385082423687,
134
+ "aa_frac_Y": 0.0043250532820820805,
135
+ "iso_cat2_plants": 0.004201814788393677
136
  }
137
  },
138
  "oxygen_requirement": {
139
  "task": "classification",
140
+ "mean_metric": 0.3574661512390337,
141
  "folds": [
142
  {
143
  "target": "oxygen_requirement",
144
  "task": "classification",
145
  "metric_name": "f1_macro",
146
+ "value": 0.3529974318071035,
147
  "n_train": 17311,
148
  "n_test": 4328
149
  },
 
151
  "target": "oxygen_requirement",
152
  "task": "classification",
153
  "metric_name": "f1_macro",
154
+ "value": 0.37463614052135813,
155
  "n_train": 17311,
156
  "n_test": 4326
157
  },
 
159
  "target": "oxygen_requirement",
160
  "task": "classification",
161
  "metric_name": "f1_macro",
162
+ "value": 0.357449136753726,
163
  "n_train": 17311,
164
  "n_test": 4328
165
  },
 
167
  "target": "oxygen_requirement",
168
  "task": "classification",
169
  "metric_name": "f1_macro",
170
+ "value": 0.2736180772079518,
171
  "n_train": 17311,
172
  "n_test": 4328
173
  },
 
175
  "target": "oxygen_requirement",
176
  "task": "classification",
177
  "metric_name": "f1_macro",
178
+ "value": 0.4286299699050292,
179
  "n_train": 17312,
180
  "n_test": 4327
181
  }
182
  ],
183
  "top_features": {
184
+ "codon_ATA": 0.03948382511734962,
185
+ "iso_cat1_host": 0.02685815468430519,
186
+ "n_predicted_cds": 0.02664864845573902,
187
+ "aa_frac_C": 0.01948722042143345,
188
+ "iso_cat1_environmental": 0.016227377578616142,
189
+ "codon_CGT": 0.014393045380711556,
190
+ "iso_cat1_engineered": 0.013875876553356647,
191
+ "iso_cat2_human": 0.012424463033676147,
192
+ "genome_size_nt": 0.010264858696609735,
193
+ "codon_TAA": 0.0082530552521348,
194
+ "tetra_CAAA": 0.007871841243468226,
195
+ "aa_frac_V": 0.0073866051156073805,
196
+ "aa_frac_Y": 0.007194060226902365,
197
+ "aa_frac_L": 0.006919718533754349,
198
+ "aa_frac_T": 0.006779328640550375,
199
+ "md_ph_median": 0.006684697233140469,
200
+ "aa_frac_Q": 0.006629320327192545,
201
+ "codon_CAA": 0.006617056485265493,
202
+ "aa_frac_M": 0.006288983486592769,
203
+ "codon_TGG": 0.00552113470621407
204
  }
205
  },
206
  "salt_tolerance_pct": {
207
  "task": "regression",
208
+ "mean_metric": 2.1124094661234083,
209
  "folds": [
210
  {
211
  "target": "salt_tolerance_pct",
212
  "task": "regression",
213
  "metric_name": "mae",
214
+ "value": 1.9258830039615904,
215
  "n_train": 3075,
216
  "n_test": 769
217
  },
 
219
  "target": "salt_tolerance_pct",
220
  "task": "regression",
221
  "metric_name": "mae",
222
+ "value": 1.892595597748997,
223
  "n_train": 3075,
224
  "n_test": 769
225
  },
 
227
  "target": "salt_tolerance_pct",
228
  "task": "regression",
229
  "metric_name": "mae",
230
+ "value": 2.7457253220944784,
231
  "n_train": 3075,
232
  "n_test": 769
233
  },
 
235
  "target": "salt_tolerance_pct",
236
  "task": "regression",
237
  "metric_name": "mae",
238
+ "value": 1.870206453444744,
239
  "n_train": 3075,
240
  "n_test": 769
241
  },
 
243
  "target": "salt_tolerance_pct",
244
  "task": "regression",
245
  "metric_name": "mae",
246
+ "value": 2.127636953367231,
247
  "n_train": 3076,
248
  "n_test": 768
249
  }
250
  ],
251
  "top_features": {
252
+ "neg_charged_frac": 0.07017230689525604,
253
+ "tetra_ATCC": 0.04281170674366876,
254
+ "aa_frac_C": 0.029778398107737303,
255
+ "iso_cat2_saline": 0.028634220734238623,
256
+ "md_nacl_pct_median": 0.02563472166657448,
257
+ "tetra_ACAT": 0.025493022409500556,
258
+ "md_nacl_pct_max": 0.012753746472299099,
259
+ "aa_frac_T": 0.011963088880293071,
260
+ "codon_CCG": 0.009299659519456327,
261
+ "tetra_TGAT": 0.008889634546358138,
262
+ "tetra_GTTC": 0.00881260905880481,
263
+ "codon_TCA": 0.00808500499697402,
264
+ "mean_isoelectric_point": 0.007483909465372562,
265
+ "codon_ATT": 0.0072766575030982494,
266
+ "codon_ACT": 0.006583173375111074,
267
+ "codon_CGT": 0.005766081786714494,
268
+ "tetra_TTCG": 0.005739881168119609,
269
+ "tetra_CGCT": 0.005698419373948127,
270
+ "codon_TGT": 0.005473139556124806,
271
+ "aa_frac_S": 0.005398909421637654
272
  }
273
  },
274
  "__meta__": {
 
690
  "iso_cat2_urogenital_tract",
691
  "iso_cat2_waste",
692
  "iso_cat2_xerophilic",
693
+ "iso_cat2_yeast",
694
+ "md_n_media",
695
+ "md_ph_median",
696
+ "md_ph_range",
697
+ "md_nacl_pct_median",
698
+ "md_nacl_pct_max"
699
  ]
700
  }
701
  }
artifacts/eval_report.md CHANGED
@@ -1,15 +1,15 @@
1
  # microbe-model — v0 baseline eval report
2
 
3
- _Generated: 2026-05-05T08:48:33+00:00_
4
 
5
  ## TL;DR
6
 
7
- - **`optimal_temperature_c`**: MAE = **2.94** (vs always-predict-mean 4.98, **+41%**)
8
- - **`optimal_ph`**: MAE = **0.51** (vs always-predict-mean 0.55, **+7%**)
9
- - **`oxygen_requirement`**: macro-F1 = **0.341** (vs always-predict-majority 0.059, **+479%**)
10
- - **`salt_tolerance_pct`**: MAE = **2.17** (vs always-predict-mean 2.51, **+14%**)
11
 
12
- Trained on **46,029** strains with **418** genome-derived features. Cross-validation: 5-fold GroupKFold by taxonomic family.
13
 
14
  ## Corpus
15
 
@@ -43,102 +43,102 @@ Each is shown alongside the dumb-baseline (always-predict-mean / always-predict-
43
 
44
  | Target | Task | n labeled | Model metric | Baseline | Improvement |
45
  |---|---|---|---|---|---|
46
- | `optimal_temperature_c` | regression | 45,621 | MAE=2.939 | MAE=4.981 | +41.0% |
47
- | `optimal_ph` | regression | 5,103 | MAE=0.509 | MAE=0.546 | +6.8% |
48
- | `oxygen_requirement` | classification | 21,639 | F1=0.341 | F1=0.059 | +479.5% |
49
- | `salt_tolerance_pct` | regression | 3,844 | MAE=2.168 | MAE=2.515 | +13.8% |
50
 
51
  ### `optimal_temperature_c` — fold-by-fold
52
 
53
  | Fold | Metric | Train | Test |
54
  |---|---|---|---|
55
- | 1 | mae = 3.104 | n=36,496 | n=9,125 |
56
- | 2 | mae = 2.736 | n=36,497 | n=9,124 |
57
- | 3 | mae = 3.146 | n=36,497 | n=9,124 |
58
- | 4 | mae = 3.277 | n=36,497 | n=9,124 |
59
- | 5 | mae = 2.435 | n=36,497 | n=9,124 |
60
 
61
  **Top 10 features for `optimal_temperature_c`:**
62
 
63
- - `ivywrel_frac` — 0.1267
64
- - `iso_cat2_thermophilic_gt45_c` — 0.0299
65
- - `n_predicted_cds` — 0.0251
66
- - `iso_cat2_human` — 0.0209
67
- - `iso_cat1_infection` — 0.0206
68
- - `iso_cat2_patient` — 0.0178
69
- - `aa_frac_C` — 0.0150
70
- - `genome_size_nt` — 0.0122
71
- - `aa_frac_D` — 0.0113
72
- - `codon_AGG` — 0.0109
73
 
74
  ### `optimal_ph` — fold-by-fold
75
 
76
  | Fold | Metric | Train | Test |
77
  |---|---|---|---|
78
- | 1 | mae = 0.456 | n=4,082 | n=1,021 |
79
- | 2 | mae = 0.626 | n=4,082 | n=1,021 |
80
- | 3 | mae = 0.528 | n=4,082 | n=1,021 |
81
- | 4 | mae = 0.480 | n=4,083 | n=1,020 |
82
- | 5 | mae = 0.454 | n=4,083 | n=1,020 |
83
 
84
  **Top 10 features for `optimal_ph`:**
85
 
86
- - `iso_cat2_acidic` — 0.0522
87
- - `iso_cat2_alkaline` — 0.0435
88
- - `neg_charged_frac` — 0.0169
89
- - `aa_frac_E` — 0.0086
90
- - `tetra_CTCT` — 0.0084
91
- - `aa_frac_H` — 0.0080
92
- - `mean_isoelectric_point` — 0.0076
93
- - `tetra_CACT` — 0.0074
94
- - `tetra_AGAC` — 0.0071
95
- - `tetra_AGGT` — 0.0059
96
 
97
  ### `oxygen_requirement` — fold-by-fold
98
 
99
  | Fold | Metric | Train | Test |
100
  |---|---|---|---|
101
- | 1 | f1_macro = 0.315 | n=17,311 | n=4,328 |
102
- | 2 | f1_macro = 0.382 | n=17,311 | n=4,326 |
103
- | 3 | f1_macro = 0.344 | n=17,311 | n=4,328 |
104
- | 4 | f1_macro = 0.259 | n=17,311 | n=4,328 |
105
- | 5 | f1_macro = 0.406 | n=17,312 | n=4,327 |
106
 
107
  **Top 10 features for `oxygen_requirement`:**
108
 
109
- - `codon_ATA` — 0.0414
110
- - `iso_cat1_host` — 0.0260
111
- - `n_predicted_cds` — 0.0252
112
- - `aa_frac_C` — 0.0191
113
- - `iso_cat1_environmental` — 0.0165
114
- - `codon_CGT` — 0.0148
115
- - `iso_cat1_engineered` — 0.0138
116
- - `genome_size_nt` — 0.0113
117
- - `iso_cat2_human` — 0.0102
118
- - `codon_TAA` — 0.0090
119
 
120
  ### `salt_tolerance_pct` — fold-by-fold
121
 
122
  | Fold | Metric | Train | Test |
123
  |---|---|---|---|
124
- | 1 | mae = 2.002 | n=3,075 | n=769 |
125
- | 2 | mae = 1.934 | n=3,075 | n=769 |
126
- | 3 | mae = 2.848 | n=3,075 | n=769 |
127
- | 4 | mae = 1.908 | n=3,075 | n=769 |
128
- | 5 | mae = 2.148 | n=3,076 | n=768 |
129
 
130
  **Top 10 features for `salt_tolerance_pct`:**
131
 
132
- - `neg_charged_frac` — 0.0716
133
- - `tetra_ATCC` — 0.0427
134
- - `aa_frac_C` — 0.0331
135
- - `iso_cat2_saline` — 0.0298
136
- - `aa_frac_T` — 0.0114
137
- - `codon_CCG` — 0.0107
138
- - `tetra_GTTC` — 0.0086
139
- - `codon_ATT` — 0.0079
140
- - `iso_cat2_built_environment` — 0.0077
141
- - `tetra_TGAT` — 0.0063
142
 
143
  ## Feature ↔ target correlations (Spearman, top 10)
144
 
@@ -163,16 +163,16 @@ Sanity-checks the biology — features known to track each target should appear
163
 
164
  | Feature | Spearman ρ | p-value |
165
  |---|---|---|
 
166
  | `neg_charged_frac` | +0.304 | 1.6e-109 |
167
  | `mean_isoelectric_point` | -0.278 | 1.8e-91 |
168
  | `aa_frac_E` | +0.256 | 4.5e-77 |
 
 
169
  | `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
170
  | `ivywrel_frac` | +0.159 | 2.4e-30 |
171
  | `codon_AAG` | -0.154 | 1.7e-28 |
172
  | `codon_CGA` | +0.153 | 5.8e-28 |
173
- | `codon_TGC` | -0.151 | 2.6e-27 |
174
- | `iso_cat2_saline` | +0.137 | 8.9e-23 |
175
- | `tetra_CACT` | +0.135 | 4.3e-22 |
176
 
177
  ### `salt_tolerance_pct`
178
 
@@ -183,11 +183,11 @@ Sanity-checks the biology — features known to track each target should appear
183
  | `aa_frac_E` | +0.310 | 3.1e-86 |
184
  | `tetra_GACT` | +0.302 | 4.3e-82 |
185
  | `tetra_AGTC` | +0.302 | 1.0e-81 |
 
 
186
  | `tetra_ACTC` | +0.282 | 2.2e-71 |
187
  | `tetra_GAGT` | +0.273 | 1.9e-66 |
188
  | `iso_cat2_saline` | +0.263 | 9.4e-62 |
189
- | `aa_frac_D` | +0.257 | 5.3e-59 |
190
- | `codon_AGC` | -0.252 | 6.0e-57 |
191
 
192
  ## Per-family error breakdown (regression targets)
193
 
@@ -197,61 +197,61 @@ Top 15 most-represented families, MAE per family. Highlights where the model is
197
 
198
  | Family | n | MAE |
199
  |---|---|---|
200
- | Enterobacteriaceae | 2662 | 4.086 |
201
- | Streptomycetaceae | 2212 | 1.919 |
202
- | Bacillaceae | 1886 | 3.195 |
203
- | Lactobacillaceae | 1732 | 3.537 |
204
- | Pseudomonadaceae | 1621 | 2.576 |
205
- | Myxococcaceae | 1546 | 0.403 |
206
- | Streptococcaceae | 1170 | 2.367 |
207
- | Staphylococcaceae | 1068 | 4.288 |
208
- | Flavobacteriaceae | 981 | 4.202 |
209
- | Corynebacteriaceae | 900 | 2.231 |
210
- | Moraxellaceae | 890 | 3.514 |
211
- | Paenibacillaceae | 760 | 2.967 |
212
- | Microbacteriaceae | 734 | 2.482 |
213
- | Micrococcaceae | 719 | 2.991 |
214
- | Nocardiaceae | 715 | 2.679 |
215
 
216
  ### `optimal_ph`
217
 
218
  | Family | n | MAE |
219
  |---|---|---|
220
- | Flavobacteriaceae | 355 | 0.391 |
221
- | Bacillaceae | 298 | 0.678 |
222
- | Roseobacteraceae | 204 | 0.400 |
223
- | Paenibacillaceae | 139 | 0.435 |
224
- | Microbacteriaceae | 120 | 0.438 |
225
- | Sphingobacteriaceae | 114 | 0.353 |
226
- | Sphingomonadaceae | 102 | 0.346 |
227
- | Streptomycetaceae | 98 | 0.599 |
228
- | Pseudonocardiaceae | 93 | 0.495 |
229
- | Halomonadaceae | 82 | 0.603 |
230
- | Micrococcaceae | 82 | 0.619 |
231
- | Nocardioidaceae | 80 | 0.490 |
232
- | Paracoccaceae | 76 | 0.564 |
233
- | Alteromonadaceae | 71 | 0.349 |
234
- | Erythrobacteraceae | 68 | 0.423 |
235
 
236
  ### `salt_tolerance_pct`
237
 
238
  | Family | n | MAE |
239
  |---|---|---|
240
- | Flavobacteriaceae | 267 | 1.917 |
241
- | Streptomycetaceae | 264 | 2.022 |
242
- | Bacillaceae | 201 | 3.508 |
243
- | Roseobacteraceae | 127 | 1.416 |
244
- | Pseudonocardiaceae | 123 | 2.315 |
245
- | Paenibacillaceae | 93 | 1.792 |
246
- | Enterococcaceae | 93 | 2.822 |
247
- | Microbacteriaceae | 91 | 2.824 |
248
- | Micromonosporaceae | 90 | 1.550 |
249
- | Sphingomonadaceae | 81 | 0.923 |
250
- | Micrococcaceae | 71 | 2.768 |
251
- | Streptosporangiaceae | 68 | 1.546 |
252
- | Lactobacillaceae | 66 | 2.367 |
253
- | Sphingobacteriaceae | 55 | 1.236 |
254
- | Halomonadaceae | 52 | 2.820 |
255
 
256
  ## Known limitations
257
 
 
1
  # microbe-model — v0 baseline eval report
2
 
3
+ _Generated: 2026-05-05T10:42:09+00:00_
4
 
5
  ## TL;DR
6
 
7
+ - **`optimal_temperature_c`**: MAE = **2.86** (vs always-predict-mean 4.98, **+43%**)
8
+ - **`optimal_ph`**: MAE = **0.48** (vs always-predict-mean 0.55, **+12%**)
9
+ - **`oxygen_requirement`**: macro-F1 = **0.357** (vs always-predict-majority 0.059, **+507%**)
10
+ - **`salt_tolerance_pct`**: MAE = **2.11** (vs always-predict-mean 2.51, **+16%**)
11
 
12
+ Trained on **46,029** strains with **423** features (genome-derived, isolation-category, and MediaDive-derived). Cross-validation: 5-fold GroupKFold by taxonomic family.
13
 
14
  ## Corpus
15
 
 
43
 
44
  | Target | Task | n labeled | Model metric | Baseline | Improvement |
45
  |---|---|---|---|---|---|
46
+ | `optimal_temperature_c` | regression | 45,621 | MAE=2.857 | MAE=4.981 | +42.6% |
47
+ | `optimal_ph` | regression | 5,103 | MAE=0.482 | MAE=0.546 | +11.6% |
48
+ | `oxygen_requirement` | classification | 21,639 | F1=0.357 | F1=0.059 | +507.0% |
49
+ | `salt_tolerance_pct` | regression | 3,844 | MAE=2.112 | MAE=2.515 | +16.0% |
50
 
51
  ### `optimal_temperature_c` — fold-by-fold
52
 
53
  | Fold | Metric | Train | Test |
54
  |---|---|---|---|
55
+ | 1 | mae = 2.953 | n=36,496 | n=9,125 |
56
+ | 2 | mae = 2.626 | n=36,497 | n=9,124 |
57
+ | 3 | mae = 3.060 | n=36,497 | n=9,124 |
58
+ | 4 | mae = 3.265 | n=36,497 | n=9,124 |
59
+ | 5 | mae = 2.381 | n=36,497 | n=9,124 |
60
 
61
  **Top 10 features for `optimal_temperature_c`:**
62
 
63
+ - `ivywrel_frac` — 0.1235
64
+ - `iso_cat2_thermophilic_gt45_c` — 0.0288
65
+ - `iso_cat2_patient` — 0.0251
66
+ - `iso_cat2_human` — 0.0234
67
+ - `n_predicted_cds` — 0.0216
68
+ - `iso_cat1_infection` — 0.0204
69
+ - `aa_frac_C` — 0.0143
70
+ - `genome_size_nt` — 0.0123
71
+ - `tetra_CTAA` — 0.0118
72
+ - `aa_frac_D` — 0.0109
73
 
74
  ### `optimal_ph` — fold-by-fold
75
 
76
  | Fold | Metric | Train | Test |
77
  |---|---|---|---|
78
+ | 1 | mae = 0.440 | n=4,082 | n=1,021 |
79
+ | 2 | mae = 0.568 | n=4,082 | n=1,021 |
80
+ | 3 | mae = 0.494 | n=4,082 | n=1,021 |
81
+ | 4 | mae = 0.466 | n=4,083 | n=1,020 |
82
+ | 5 | mae = 0.444 | n=4,083 | n=1,020 |
83
 
84
  **Top 10 features for `optimal_ph`:**
85
 
86
+ - `md_ph_median` — 0.0518
87
+ - `iso_cat2_acidic` — 0.0307
88
+ - `iso_cat2_alkaline` — 0.0287
89
+ - `neg_charged_frac` — 0.0146
90
+ - `aa_frac_H` — 0.0081
91
+ - `aa_frac_E` — 0.0077
92
+ - `tetra_CTCT` — 0.0071
93
+ - `iso_cat2_plant` — 0.0068
94
+ - `tetra_AGAC` — 0.0067
95
+ - `tetra_CACT` — 0.0065
96
 
97
  ### `oxygen_requirement` — fold-by-fold
98
 
99
  | Fold | Metric | Train | Test |
100
  |---|---|---|---|
101
+ | 1 | f1_macro = 0.353 | n=17,311 | n=4,328 |
102
+ | 2 | f1_macro = 0.375 | n=17,311 | n=4,326 |
103
+ | 3 | f1_macro = 0.357 | n=17,311 | n=4,328 |
104
+ | 4 | f1_macro = 0.274 | n=17,311 | n=4,328 |
105
+ | 5 | f1_macro = 0.429 | n=17,312 | n=4,327 |
106
 
107
  **Top 10 features for `oxygen_requirement`:**
108
 
109
+ - `codon_ATA` — 0.0395
110
+ - `iso_cat1_host` — 0.0269
111
+ - `n_predicted_cds` — 0.0266
112
+ - `aa_frac_C` — 0.0195
113
+ - `iso_cat1_environmental` — 0.0162
114
+ - `codon_CGT` — 0.0144
115
+ - `iso_cat1_engineered` — 0.0139
116
+ - `iso_cat2_human` — 0.0124
117
+ - `genome_size_nt` — 0.0103
118
+ - `codon_TAA` — 0.0083
119
 
120
  ### `salt_tolerance_pct` — fold-by-fold
121
 
122
  | Fold | Metric | Train | Test |
123
  |---|---|---|---|
124
+ | 1 | mae = 1.926 | n=3,075 | n=769 |
125
+ | 2 | mae = 1.893 | n=3,075 | n=769 |
126
+ | 3 | mae = 2.746 | n=3,075 | n=769 |
127
+ | 4 | mae = 1.870 | n=3,075 | n=769 |
128
+ | 5 | mae = 2.128 | n=3,076 | n=768 |
129
 
130
  **Top 10 features for `salt_tolerance_pct`:**
131
 
132
+ - `neg_charged_frac` — 0.0702
133
+ - `tetra_ATCC` — 0.0428
134
+ - `aa_frac_C` — 0.0298
135
+ - `iso_cat2_saline` — 0.0286
136
+ - `md_nacl_pct_median` — 0.0256
137
+ - `tetra_ACAT` — 0.0255
138
+ - `md_nacl_pct_max` — 0.0128
139
+ - `aa_frac_T` — 0.0120
140
+ - `codon_CCG` — 0.0093
141
+ - `tetra_TGAT` — 0.0089
142
 
143
  ## Feature ↔ target correlations (Spearman, top 10)
144
 
 
163
 
164
  | Feature | Spearman ρ | p-value |
165
  |---|---|---|
166
+ | `md_ph_median` | +0.429 | 4.0e-131 |
167
  | `neg_charged_frac` | +0.304 | 1.6e-109 |
168
  | `mean_isoelectric_point` | -0.278 | 1.8e-91 |
169
  | `aa_frac_E` | +0.256 | 4.5e-77 |
170
+ | `md_nacl_pct_max` | +0.218 | 1.9e-33 |
171
+ | `md_nacl_pct_median` | +0.212 | 9.9e-32 |
172
  | `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
173
  | `ivywrel_frac` | +0.159 | 2.4e-30 |
174
  | `codon_AAG` | -0.154 | 1.7e-28 |
175
  | `codon_CGA` | +0.153 | 5.8e-28 |
 
 
 
176
 
177
  ### `salt_tolerance_pct`
178
 
 
183
  | `aa_frac_E` | +0.310 | 3.1e-86 |
184
  | `tetra_GACT` | +0.302 | 4.3e-82 |
185
  | `tetra_AGTC` | +0.302 | 1.0e-81 |
186
+ | `md_nacl_pct_max` | +0.298 | 2.9e-52 |
187
+ | `md_nacl_pct_median` | +0.290 | 1.6e-49 |
188
  | `tetra_ACTC` | +0.282 | 2.2e-71 |
189
  | `tetra_GAGT` | +0.273 | 1.9e-66 |
190
  | `iso_cat2_saline` | +0.263 | 9.4e-62 |
 
 
191
 
192
  ## Per-family error breakdown (regression targets)
193
 
 
197
 
198
  | Family | n | MAE |
199
  |---|---|---|
200
+ | Enterobacteriaceae | 2662 | 3.792 |
201
+ | Streptomycetaceae | 2212 | 1.783 |
202
+ | Bacillaceae | 1886 | 3.174 |
203
+ | Lactobacillaceae | 1732 | 3.709 |
204
+ | Pseudomonadaceae | 1621 | 2.488 |
205
+ | Myxococcaceae | 1546 | 0.238 |
206
+ | Streptococcaceae | 1170 | 2.537 |
207
+ | Staphylococcaceae | 1068 | 3.374 |
208
+ | Flavobacteriaceae | 981 | 4.116 |
209
+ | Corynebacteriaceae | 900 | 2.146 |
210
+ | Moraxellaceae | 890 | 3.388 |
211
+ | Paenibacillaceae | 760 | 3.081 |
212
+ | Microbacteriaceae | 734 | 2.459 |
213
+ | Micrococcaceae | 719 | 2.811 |
214
+ | Nocardiaceae | 715 | 2.276 |
215
 
216
  ### `optimal_ph`
217
 
218
  | Family | n | MAE |
219
  |---|---|---|
220
+ | Flavobacteriaceae | 355 | 0.405 |
221
+ | Bacillaceae | 298 | 0.606 |
222
+ | Roseobacteraceae | 204 | 0.375 |
223
+ | Paenibacillaceae | 139 | 0.469 |
224
+ | Microbacteriaceae | 120 | 0.446 |
225
+ | Sphingobacteriaceae | 114 | 0.336 |
226
+ | Sphingomonadaceae | 102 | 0.319 |
227
+ | Streptomycetaceae | 98 | 0.513 |
228
+ | Pseudonocardiaceae | 93 | 0.479 |
229
+ | Halomonadaceae | 82 | 0.584 |
230
+ | Micrococcaceae | 82 | 0.613 |
231
+ | Nocardioidaceae | 80 | 0.502 |
232
+ | Paracoccaceae | 76 | 0.574 |
233
+ | Alteromonadaceae | 71 | 0.355 |
234
+ | Erythrobacteraceae | 68 | 0.446 |
235
 
236
  ### `salt_tolerance_pct`
237
 
238
  | Family | n | MAE |
239
  |---|---|---|
240
+ | Flavobacteriaceae | 267 | 1.713 |
241
+ | Streptomycetaceae | 264 | 1.987 |
242
+ | Bacillaceae | 201 | 3.315 |
243
+ | Roseobacteraceae | 127 | 1.395 |
244
+ | Pseudonocardiaceae | 123 | 2.280 |
245
+ | Paenibacillaceae | 93 | 1.651 |
246
+ | Enterococcaceae | 93 | 2.935 |
247
+ | Microbacteriaceae | 91 | 2.789 |
248
+ | Micromonosporaceae | 90 | 1.609 |
249
+ | Sphingomonadaceae | 81 | 1.028 |
250
+ | Micrococcaceae | 71 | 2.613 |
251
+ | Streptosporangiaceae | 68 | 1.480 |
252
+ | Lactobacillaceae | 66 | 2.559 |
253
+ | Sphingobacteriaceae | 55 | 1.218 |
254
+ | Halomonadaceae | 52 | 2.815 |
255
 
256
  ## Known limitations
257
 
scripts/03_train_baseline.py CHANGED
@@ -71,8 +71,20 @@ def main() -> None:
71
  print(f"Encoded {len(iso_cols)} isolation-category features "
72
  f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
73
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
75
- feature_cols = feature_cols + iso_cols
76
 
77
  print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
78
  print(f"Distinct groups: {df['group'].nunique():,}")
 
71
  print(f"Encoded {len(iso_cols)} isolation-category features "
72
  f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
73
 
74
+ md_path = config.DATA / "mediadive_features.parquet"
75
+ md_cols: list[str] = []
76
+ if md_path.exists():
77
+ md = pd.read_parquet(md_path)
78
+ md["bacdive_id"] = md["bacdive_id"].astype(int)
79
+ df["bacdive_id"] = df["bacdive_id"].astype(int)
80
+ md_cols = [c for c in md.columns if c != "bacdive_id"]
81
+ df = df.merge(md, on="bacdive_id", how="left")
82
+ n_with_md = df[md_cols[0]].notna().sum() if md_cols else 0
83
+ print(f"Joined MediaDive features ({len(md_cols)} cols) — "
84
+ f"{n_with_md:,}/{len(df):,} training rows have MediaDive data")
85
+
86
  feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
87
+ feature_cols = feature_cols + iso_cols + md_cols
88
 
89
  print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
90
  print(f"Distinct groups: {df['group'].nunique():,}")
scripts/20_build_mediadive_features.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build per-strain MediaDive features from strain_media + media_recipes + raw JSON.
2
+
3
+ For each BacDive strain, compute the median pH and NaCl% across all DSMZ media that
4
+ strain has been recorded as growing on. These are NOT labels — they're additional
5
+ features the model can use to predict the actual phenotype optima. Saves to
6
+ data/mediadive_features.parquet (joined into the training table by scripts/03).
7
+
8
+ Per-strain features written:
9
+ - md_n_media: count of media the strain grows on
10
+ - md_ph_median: median midpoint(min_pH, max_pH) across those media
11
+ - md_ph_range: max - min of medium pH across those media
12
+ - md_nacl_pct_median: median NaCl % w/v across those media
13
+ - md_nacl_pct_max: max NaCl % w/v across those media (highest NaCl with recorded growth)
14
+
15
+ Sanity check: where a BacDive optimal_ph or salt_tolerance_pct exists, we expect
16
+ moderate (not perfect) correlation with the corresponding MediaDive feature.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ from pathlib import Path
22
+
23
+ import pandas as pd
24
+
25
+ from microbe_model import config
26
+
27
+ NACL_CAP_PCT = 30.0 # clip recipes with absurd NaCl values (parse artifacts)
28
+
29
+
30
+ def build_medium_ph_map() -> dict[str, float]:
31
+ """Return {medium_id: midpoint pH} from raw MediaDive cache."""
32
+ out: dict[str, float] = {}
33
+ for path in Path(config.DATA / "mediadive").glob("*.json"):
34
+ try:
35
+ d = json.loads(path.read_text())
36
+ except json.JSONDecodeError:
37
+ continue
38
+ if not isinstance(d, dict):
39
+ continue
40
+ m = d.get("medium")
41
+ if not isinstance(m, dict):
42
+ continue
43
+ mid = m.get("id")
44
+ min_ph = m.get("min_pH")
45
+ max_ph = m.get("max_pH")
46
+ if mid is None or min_ph is None or max_ph is None:
47
+ continue
48
+ try:
49
+ out[str(mid)] = (float(min_ph) + float(max_ph)) / 2
50
+ except (ValueError, TypeError):
51
+ continue
52
+ return out
53
+
54
+
55
+ def build_medium_nacl_map() -> dict[str, float]:
56
+ """Return {medium_id: NaCl % w/v} summed from recipe compounds (clipped)."""
57
+ mr = pd.read_parquet(config.DATA / "media_recipes.parquet")
58
+ nacl = mr[mr["compound"].str.contains(r"sodium chlor|^nacl$", case=False, na=False, regex=True)]
59
+ pct = (nacl.groupby("medium_id")["g_l"].sum() / 10).clip(upper=NACL_CAP_PCT)
60
+ return pct.astype(float).to_dict()
61
+
62
+
63
def main() -> None:
    """Build per-strain MediaDive features and write them to parquet.

    Reads ``strain_media.parquet``, keeps rows whose ``growth`` is "yes"
    (case-insensitive), attaches each medium's midpoint pH and NaCl % w/v,
    aggregates per ``bacdive_id`` and writes ``mediadive_features.parquet``
    under ``config.DATA``. A summary of the result is printed.
    """
    strain_media = pd.read_parquet(config.DATA / "strain_media.parquet")
    grows = strain_media["growth"].str.lower() == "yes"
    strain_media = strain_media[grows].copy()
    strain_media["medium_id"] = strain_media["medium_id"].astype(str)

    ph_map = build_medium_ph_map()
    nacl_map = build_medium_nacl_map()
    print(f"medium pH map: {len(ph_map):,} media")
    print(f"medium NaCl map: {len(nacl_map):,} media")

    medium_ids = strain_media["medium_id"]
    # Media without a pH entry stay NaN and are ignored by the aggregations.
    strain_media["m_ph"] = medium_ids.map(ph_map)
    # Strains may grow on media not in the recipe table — treat absent as 0% NaCl
    strain_media["m_nacl"] = medium_ids.map(nacl_map).fillna(0.0)

    # One row per strain.
    per_strain = strain_media.groupby("bacdive_id")
    ph = per_strain["m_ph"]
    nacl = per_strain["m_nacl"]
    columns = {
        "md_n_media": per_strain.size(),
        "md_ph_median": ph.median(),
        "md_ph_range": ph.max() - ph.min(),
        "md_nacl_pct_median": nacl.median(),
        "md_nacl_pct_max": nacl.max(),
    }
    feat = pd.DataFrame(columns).reset_index()

    out = config.DATA / "mediadive_features.parquet"
    feat.to_parquet(out, index=False)
    print(f"\nwrote {len(feat):,} strains to {out}")
    print(feat.describe().round(2).to_string())
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()