Spaces:
Running
Add MediaDive-derived features (medium pH, NaCl, n_media) — all 4 targets improve
Browse files
For each strain in data/strain_media.parquet, compute median pH and NaCl% across
the DSMZ media it has been grown on, plus a count of media. These are model inputs
(features), NOT labels — the previous probe showed BacDive↔MediaDive label
correlation is only 0.42 for salt, so using MediaDive as a label source would
corrupt the now-clean salt MAE. As features, the model learns the right weighting.
5 new features per strain:
md_n_media — count of media the strain grows on
md_ph_median — median of midpoint(min_pH, max_pH) across media
md_ph_range — spread (max - min) of medium pH
md_nacl_pct_median — median NaCl % w/v across recipes
md_nacl_pct_max — highest NaCl % w/v across recipes (the saltiest medium the strain is recorded growing on)
Coverage: 28,704 strains (62% of the 46K-strain training table) have MediaDive data.
Cumulative metrics vs original v0 genome-only baseline:
optimal_temperature_c MAE 3.28 → 2.86 (-12.9%)
optimal_ph MAE 0.52 → 0.48 ( -7.7%)
oxygen_requirement F1 0.279 → 0.357 (+28.2%)
salt_tolerance_pct MAE 2.51 → 2.11 (-15.9%)
This step alone:
T_opt 2.94 → 2.86
pH 0.51 → 0.48
Oxygen 0.341 → 0.357
Salt 2.17 → 2.11
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- artifacts/baseline_results.json +110 -105
- artifacts/eval_report.md +120 -120
- scripts/03_train_baseline.py +13 -1
- scripts/20_build_mediadive_features.py +94 -0
|
@@ -1,13 +1,13 @@
|
|
| 1 |
{
|
| 2 |
"optimal_temperature_c": {
|
| 3 |
"task": "regression",
|
| 4 |
-
"mean_metric": 2.
|
| 5 |
"folds": [
|
| 6 |
{
|
| 7 |
"target": "optimal_temperature_c",
|
| 8 |
"task": "regression",
|
| 9 |
"metric_name": "mae",
|
| 10 |
-
"value":
|
| 11 |
"n_train": 36496,
|
| 12 |
"n_test": 9125
|
| 13 |
},
|
|
@@ -15,7 +15,7 @@
|
|
| 15 |
"target": "optimal_temperature_c",
|
| 16 |
"task": "regression",
|
| 17 |
"metric_name": "mae",
|
| 18 |
-
"value": 2.
|
| 19 |
"n_train": 36497,
|
| 20 |
"n_test": 9124
|
| 21 |
},
|
|
@@ -23,7 +23,7 @@
|
|
| 23 |
"target": "optimal_temperature_c",
|
| 24 |
"task": "regression",
|
| 25 |
"metric_name": "mae",
|
| 26 |
-
"value": 3.
|
| 27 |
"n_train": 36497,
|
| 28 |
"n_test": 9124
|
| 29 |
},
|
|
@@ -31,7 +31,7 @@
|
|
| 31 |
"target": "optimal_temperature_c",
|
| 32 |
"task": "regression",
|
| 33 |
"metric_name": "mae",
|
| 34 |
-
"value": 3.
|
| 35 |
"n_train": 36497,
|
| 36 |
"n_test": 9124
|
| 37 |
},
|
|
@@ -39,43 +39,43 @@
|
|
| 39 |
"target": "optimal_temperature_c",
|
| 40 |
"task": "regression",
|
| 41 |
"metric_name": "mae",
|
| 42 |
-
"value": 2.
|
| 43 |
"n_train": 36497,
|
| 44 |
"n_test": 9124
|
| 45 |
}
|
| 46 |
],
|
| 47 |
"top_features": {
|
| 48 |
-
"ivywrel_frac": 0.
|
| 49 |
-
"iso_cat2_thermophilic_gt45_c": 0.
|
| 50 |
-
"
|
| 51 |
-
"iso_cat2_human": 0.
|
| 52 |
-
"
|
| 53 |
-
"
|
| 54 |
-
"aa_frac_C": 0.
|
| 55 |
-
"genome_size_nt": 0.
|
| 56 |
-
"
|
| 57 |
-
"
|
| 58 |
-
"
|
| 59 |
-
"tetra_GCCT": 0.
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
"
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
}
|
| 69 |
},
|
| 70 |
"optimal_ph": {
|
| 71 |
"task": "regression",
|
| 72 |
-
"mean_metric": 0.
|
| 73 |
"folds": [
|
| 74 |
{
|
| 75 |
"target": "optimal_ph",
|
| 76 |
"task": "regression",
|
| 77 |
"metric_name": "mae",
|
| 78 |
-
"value": 0.
|
| 79 |
"n_train": 4082,
|
| 80 |
"n_test": 1021
|
| 81 |
},
|
|
@@ -83,7 +83,7 @@
|
|
| 83 |
"target": "optimal_ph",
|
| 84 |
"task": "regression",
|
| 85 |
"metric_name": "mae",
|
| 86 |
-
"value": 0.
|
| 87 |
"n_train": 4082,
|
| 88 |
"n_test": 1021
|
| 89 |
},
|
|
@@ -91,7 +91,7 @@
|
|
| 91 |
"target": "optimal_ph",
|
| 92 |
"task": "regression",
|
| 93 |
"metric_name": "mae",
|
| 94 |
-
"value": 0.
|
| 95 |
"n_train": 4082,
|
| 96 |
"n_test": 1021
|
| 97 |
},
|
|
@@ -99,7 +99,7 @@
|
|
| 99 |
"target": "optimal_ph",
|
| 100 |
"task": "regression",
|
| 101 |
"metric_name": "mae",
|
| 102 |
-
"value": 0.
|
| 103 |
"n_train": 4083,
|
| 104 |
"n_test": 1020
|
| 105 |
},
|
|
@@ -107,43 +107,43 @@
|
|
| 107 |
"target": "optimal_ph",
|
| 108 |
"task": "regression",
|
| 109 |
"metric_name": "mae",
|
| 110 |
-
"value": 0.
|
| 111 |
"n_train": 4083,
|
| 112 |
"n_test": 1020
|
| 113 |
}
|
| 114 |
],
|
| 115 |
"top_features": {
|
| 116 |
-
"
|
| 117 |
-
"
|
| 118 |
-
"
|
| 119 |
-
"
|
| 120 |
-
"
|
| 121 |
-
"
|
| 122 |
-
"
|
| 123 |
-
"
|
| 124 |
-
"tetra_AGAC": 0.
|
| 125 |
-
"
|
| 126 |
-
"tetra_GACT": 0.
|
| 127 |
-
"
|
| 128 |
-
"
|
| 129 |
-
"
|
| 130 |
-
"
|
| 131 |
-
"
|
| 132 |
-
"
|
| 133 |
-
"
|
| 134 |
-
"
|
| 135 |
-
"
|
| 136 |
}
|
| 137 |
},
|
| 138 |
"oxygen_requirement": {
|
| 139 |
"task": "classification",
|
| 140 |
-
"mean_metric": 0.
|
| 141 |
"folds": [
|
| 142 |
{
|
| 143 |
"target": "oxygen_requirement",
|
| 144 |
"task": "classification",
|
| 145 |
"metric_name": "f1_macro",
|
| 146 |
-
"value": 0.
|
| 147 |
"n_train": 17311,
|
| 148 |
"n_test": 4328
|
| 149 |
},
|
|
@@ -151,7 +151,7 @@
|
|
| 151 |
"target": "oxygen_requirement",
|
| 152 |
"task": "classification",
|
| 153 |
"metric_name": "f1_macro",
|
| 154 |
-
"value": 0.
|
| 155 |
"n_train": 17311,
|
| 156 |
"n_test": 4326
|
| 157 |
},
|
|
@@ -159,7 +159,7 @@
|
|
| 159 |
"target": "oxygen_requirement",
|
| 160 |
"task": "classification",
|
| 161 |
"metric_name": "f1_macro",
|
| 162 |
-
"value": 0.
|
| 163 |
"n_train": 17311,
|
| 164 |
"n_test": 4328
|
| 165 |
},
|
|
@@ -167,7 +167,7 @@
|
|
| 167 |
"target": "oxygen_requirement",
|
| 168 |
"task": "classification",
|
| 169 |
"metric_name": "f1_macro",
|
| 170 |
-
"value": 0.
|
| 171 |
"n_train": 17311,
|
| 172 |
"n_test": 4328
|
| 173 |
},
|
|
@@ -175,43 +175,43 @@
|
|
| 175 |
"target": "oxygen_requirement",
|
| 176 |
"task": "classification",
|
| 177 |
"metric_name": "f1_macro",
|
| 178 |
-
"value": 0.
|
| 179 |
"n_train": 17312,
|
| 180 |
"n_test": 4327
|
| 181 |
}
|
| 182 |
],
|
| 183 |
"top_features": {
|
| 184 |
-
"codon_ATA": 0.
|
| 185 |
-
"iso_cat1_host": 0.
|
| 186 |
-
"n_predicted_cds": 0.
|
| 187 |
-
"aa_frac_C": 0.
|
| 188 |
-
"iso_cat1_environmental": 0.
|
| 189 |
-
"codon_CGT": 0.
|
| 190 |
-
"iso_cat1_engineered": 0.
|
| 191 |
-
"
|
| 192 |
-
"
|
| 193 |
-
"codon_TAA": 0.
|
| 194 |
-
"
|
| 195 |
-
"
|
| 196 |
-
"
|
| 197 |
-
"
|
| 198 |
-
"aa_frac_T": 0.
|
| 199 |
-
"
|
| 200 |
-
"aa_frac_Q": 0.
|
| 201 |
-
"
|
| 202 |
-
"
|
| 203 |
-
"
|
| 204 |
}
|
| 205 |
},
|
| 206 |
"salt_tolerance_pct": {
|
| 207 |
"task": "regression",
|
| 208 |
-
"mean_metric": 2.
|
| 209 |
"folds": [
|
| 210 |
{
|
| 211 |
"target": "salt_tolerance_pct",
|
| 212 |
"task": "regression",
|
| 213 |
"metric_name": "mae",
|
| 214 |
-
"value":
|
| 215 |
"n_train": 3075,
|
| 216 |
"n_test": 769
|
| 217 |
},
|
|
@@ -219,7 +219,7 @@
|
|
| 219 |
"target": "salt_tolerance_pct",
|
| 220 |
"task": "regression",
|
| 221 |
"metric_name": "mae",
|
| 222 |
-
"value": 1.
|
| 223 |
"n_train": 3075,
|
| 224 |
"n_test": 769
|
| 225 |
},
|
|
@@ -227,7 +227,7 @@
|
|
| 227 |
"target": "salt_tolerance_pct",
|
| 228 |
"task": "regression",
|
| 229 |
"metric_name": "mae",
|
| 230 |
-
"value": 2.
|
| 231 |
"n_train": 3075,
|
| 232 |
"n_test": 769
|
| 233 |
},
|
|
@@ -235,7 +235,7 @@
|
|
| 235 |
"target": "salt_tolerance_pct",
|
| 236 |
"task": "regression",
|
| 237 |
"metric_name": "mae",
|
| 238 |
-
"value": 1.
|
| 239 |
"n_train": 3075,
|
| 240 |
"n_test": 769
|
| 241 |
},
|
|
@@ -243,32 +243,32 @@
|
|
| 243 |
"target": "salt_tolerance_pct",
|
| 244 |
"task": "regression",
|
| 245 |
"metric_name": "mae",
|
| 246 |
-
"value": 2.
|
| 247 |
"n_train": 3076,
|
| 248 |
"n_test": 768
|
| 249 |
}
|
| 250 |
],
|
| 251 |
"top_features": {
|
| 252 |
-
"neg_charged_frac": 0.
|
| 253 |
-
"tetra_ATCC": 0.
|
| 254 |
-
"aa_frac_C": 0.
|
| 255 |
-
"iso_cat2_saline": 0.
|
| 256 |
-
"
|
| 257 |
-
"
|
| 258 |
-
"
|
| 259 |
-
"
|
| 260 |
-
"
|
| 261 |
-
"tetra_TGAT": 0.
|
| 262 |
-
"
|
| 263 |
-
"
|
| 264 |
-
"
|
| 265 |
-
"
|
| 266 |
-
"
|
| 267 |
-
"
|
| 268 |
-
"
|
| 269 |
-
"
|
| 270 |
-
"
|
| 271 |
-
"
|
| 272 |
}
|
| 273 |
},
|
| 274 |
"__meta__": {
|
|
@@ -690,7 +690,12 @@
|
|
| 690 |
"iso_cat2_urogenital_tract",
|
| 691 |
"iso_cat2_waste",
|
| 692 |
"iso_cat2_xerophilic",
|
| 693 |
-
"iso_cat2_yeast"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
]
|
| 695 |
}
|
| 696 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"optimal_temperature_c": {
|
| 3 |
"task": "regression",
|
| 4 |
+
"mean_metric": 2.8569134461172,
|
| 5 |
"folds": [
|
| 6 |
{
|
| 7 |
"target": "optimal_temperature_c",
|
| 8 |
"task": "regression",
|
| 9 |
"metric_name": "mae",
|
| 10 |
+
"value": 2.952921209821309,
|
| 11 |
"n_train": 36496,
|
| 12 |
"n_test": 9125
|
| 13 |
},
|
|
|
|
| 15 |
"target": "optimal_temperature_c",
|
| 16 |
"task": "regression",
|
| 17 |
"metric_name": "mae",
|
| 18 |
+
"value": 2.6256106255400447,
|
| 19 |
"n_train": 36497,
|
| 20 |
"n_test": 9124
|
| 21 |
},
|
|
|
|
| 23 |
"target": "optimal_temperature_c",
|
| 24 |
"task": "regression",
|
| 25 |
"metric_name": "mae",
|
| 26 |
+
"value": 3.0601953129348187,
|
| 27 |
"n_train": 36497,
|
| 28 |
"n_test": 9124
|
| 29 |
},
|
|
|
|
| 31 |
"target": "optimal_temperature_c",
|
| 32 |
"task": "regression",
|
| 33 |
"metric_name": "mae",
|
| 34 |
+
"value": 3.2652047467513965,
|
| 35 |
"n_train": 36497,
|
| 36 |
"n_test": 9124
|
| 37 |
},
|
|
|
|
| 39 |
"target": "optimal_temperature_c",
|
| 40 |
"task": "regression",
|
| 41 |
"metric_name": "mae",
|
| 42 |
+
"value": 2.38063533553843,
|
| 43 |
"n_train": 36497,
|
| 44 |
"n_test": 9124
|
| 45 |
}
|
| 46 |
],
|
| 47 |
"top_features": {
|
| 48 |
+
"ivywrel_frac": 0.12348818182945251,
|
| 49 |
+
"iso_cat2_thermophilic_gt45_c": 0.028791341930627823,
|
| 50 |
+
"iso_cat2_patient": 0.025099934451282023,
|
| 51 |
+
"iso_cat2_human": 0.02344932146370411,
|
| 52 |
+
"n_predicted_cds": 0.021633704751729967,
|
| 53 |
+
"iso_cat1_infection": 0.020425693690776826,
|
| 54 |
+
"aa_frac_C": 0.014341578260064125,
|
| 55 |
+
"genome_size_nt": 0.01227616611868143,
|
| 56 |
+
"tetra_CTAA": 0.011777869192883372,
|
| 57 |
+
"aa_frac_D": 0.01087347036227584,
|
| 58 |
+
"codon_AGG": 0.009832531120628119,
|
| 59 |
+
"tetra_GCCT": 0.009409325825981796,
|
| 60 |
+
"aa_frac_E": 0.008742744009941817,
|
| 61 |
+
"tetra_TTAG": 0.008621749537996947,
|
| 62 |
+
"iso_cat1_environmental": 0.00781458979472518,
|
| 63 |
+
"mean_isoelectric_point": 0.007003072090446949,
|
| 64 |
+
"aa_frac_Y": 0.00684288409538567,
|
| 65 |
+
"tetra_AGGC": 0.006669195392169059,
|
| 66 |
+
"iso_cat2_industrial": 0.006660213135182858,
|
| 67 |
+
"tetra_TTCC": 0.006564890965819359
|
| 68 |
}
|
| 69 |
},
|
| 70 |
"optimal_ph": {
|
| 71 |
"task": "regression",
|
| 72 |
+
"mean_metric": 0.4824498969036545,
|
| 73 |
"folds": [
|
| 74 |
{
|
| 75 |
"target": "optimal_ph",
|
| 76 |
"task": "regression",
|
| 77 |
"metric_name": "mae",
|
| 78 |
+
"value": 0.440339089476747,
|
| 79 |
"n_train": 4082,
|
| 80 |
"n_test": 1021
|
| 81 |
},
|
|
|
|
| 83 |
"target": "optimal_ph",
|
| 84 |
"task": "regression",
|
| 85 |
"metric_name": "mae",
|
| 86 |
+
"value": 0.5678683244049492,
|
| 87 |
"n_train": 4082,
|
| 88 |
"n_test": 1021
|
| 89 |
},
|
|
|
|
| 91 |
"target": "optimal_ph",
|
| 92 |
"task": "regression",
|
| 93 |
"metric_name": "mae",
|
| 94 |
+
"value": 0.4943884038785062,
|
| 95 |
"n_train": 4082,
|
| 96 |
"n_test": 1021
|
| 97 |
},
|
|
|
|
| 99 |
"target": "optimal_ph",
|
| 100 |
"task": "regression",
|
| 101 |
"metric_name": "mae",
|
| 102 |
+
"value": 0.46583879377327714,
|
| 103 |
"n_train": 4083,
|
| 104 |
"n_test": 1020
|
| 105 |
},
|
|
|
|
| 107 |
"target": "optimal_ph",
|
| 108 |
"task": "regression",
|
| 109 |
"metric_name": "mae",
|
| 110 |
+
"value": 0.44381487298479266,
|
| 111 |
"n_train": 4083,
|
| 112 |
"n_test": 1020
|
| 113 |
}
|
| 114 |
],
|
| 115 |
"top_features": {
|
| 116 |
+
"md_ph_median": 0.05177119821310043,
|
| 117 |
+
"iso_cat2_acidic": 0.030658208578824998,
|
| 118 |
+
"iso_cat2_alkaline": 0.02869502492249012,
|
| 119 |
+
"neg_charged_frac": 0.014565921388566494,
|
| 120 |
+
"aa_frac_H": 0.008134929556399583,
|
| 121 |
+
"aa_frac_E": 0.007721887435764074,
|
| 122 |
+
"tetra_CTCT": 0.007108068186789751,
|
| 123 |
+
"iso_cat2_plant": 0.006769544072449207,
|
| 124 |
+
"tetra_AGAC": 0.006719858897849917,
|
| 125 |
+
"tetra_CACT": 0.006461512250825763,
|
| 126 |
+
"tetra_GACT": 0.0064593076705932615,
|
| 127 |
+
"tetra_TCTC": 0.005769496783614159,
|
| 128 |
+
"tetra_TGGG": 0.005730107612907887,
|
| 129 |
+
"codon_ACG": 0.005510704545304179,
|
| 130 |
+
"tetra_TAAC": 0.004951662756502629,
|
| 131 |
+
"mean_isoelectric_point": 0.004634805396199227,
|
| 132 |
+
"tetra_TGGT": 0.004585431469604373,
|
| 133 |
+
"tetra_AGTC": 0.004529385082423687,
|
| 134 |
+
"aa_frac_Y": 0.0043250532820820805,
|
| 135 |
+
"iso_cat2_plants": 0.004201814788393677
|
| 136 |
}
|
| 137 |
},
|
| 138 |
"oxygen_requirement": {
|
| 139 |
"task": "classification",
|
| 140 |
+
"mean_metric": 0.3574661512390337,
|
| 141 |
"folds": [
|
| 142 |
{
|
| 143 |
"target": "oxygen_requirement",
|
| 144 |
"task": "classification",
|
| 145 |
"metric_name": "f1_macro",
|
| 146 |
+
"value": 0.3529974318071035,
|
| 147 |
"n_train": 17311,
|
| 148 |
"n_test": 4328
|
| 149 |
},
|
|
|
|
| 151 |
"target": "oxygen_requirement",
|
| 152 |
"task": "classification",
|
| 153 |
"metric_name": "f1_macro",
|
| 154 |
+
"value": 0.37463614052135813,
|
| 155 |
"n_train": 17311,
|
| 156 |
"n_test": 4326
|
| 157 |
},
|
|
|
|
| 159 |
"target": "oxygen_requirement",
|
| 160 |
"task": "classification",
|
| 161 |
"metric_name": "f1_macro",
|
| 162 |
+
"value": 0.357449136753726,
|
| 163 |
"n_train": 17311,
|
| 164 |
"n_test": 4328
|
| 165 |
},
|
|
|
|
| 167 |
"target": "oxygen_requirement",
|
| 168 |
"task": "classification",
|
| 169 |
"metric_name": "f1_macro",
|
| 170 |
+
"value": 0.2736180772079518,
|
| 171 |
"n_train": 17311,
|
| 172 |
"n_test": 4328
|
| 173 |
},
|
|
|
|
| 175 |
"target": "oxygen_requirement",
|
| 176 |
"task": "classification",
|
| 177 |
"metric_name": "f1_macro",
|
| 178 |
+
"value": 0.4286299699050292,
|
| 179 |
"n_train": 17312,
|
| 180 |
"n_test": 4327
|
| 181 |
}
|
| 182 |
],
|
| 183 |
"top_features": {
|
| 184 |
+
"codon_ATA": 0.03948382511734962,
|
| 185 |
+
"iso_cat1_host": 0.02685815468430519,
|
| 186 |
+
"n_predicted_cds": 0.02664864845573902,
|
| 187 |
+
"aa_frac_C": 0.01948722042143345,
|
| 188 |
+
"iso_cat1_environmental": 0.016227377578616142,
|
| 189 |
+
"codon_CGT": 0.014393045380711556,
|
| 190 |
+
"iso_cat1_engineered": 0.013875876553356647,
|
| 191 |
+
"iso_cat2_human": 0.012424463033676147,
|
| 192 |
+
"genome_size_nt": 0.010264858696609735,
|
| 193 |
+
"codon_TAA": 0.0082530552521348,
|
| 194 |
+
"tetra_CAAA": 0.007871841243468226,
|
| 195 |
+
"aa_frac_V": 0.0073866051156073805,
|
| 196 |
+
"aa_frac_Y": 0.007194060226902365,
|
| 197 |
+
"aa_frac_L": 0.006919718533754349,
|
| 198 |
+
"aa_frac_T": 0.006779328640550375,
|
| 199 |
+
"md_ph_median": 0.006684697233140469,
|
| 200 |
+
"aa_frac_Q": 0.006629320327192545,
|
| 201 |
+
"codon_CAA": 0.006617056485265493,
|
| 202 |
+
"aa_frac_M": 0.006288983486592769,
|
| 203 |
+
"codon_TGG": 0.00552113470621407
|
| 204 |
}
|
| 205 |
},
|
| 206 |
"salt_tolerance_pct": {
|
| 207 |
"task": "regression",
|
| 208 |
+
"mean_metric": 2.1124094661234083,
|
| 209 |
"folds": [
|
| 210 |
{
|
| 211 |
"target": "salt_tolerance_pct",
|
| 212 |
"task": "regression",
|
| 213 |
"metric_name": "mae",
|
| 214 |
+
"value": 1.9258830039615904,
|
| 215 |
"n_train": 3075,
|
| 216 |
"n_test": 769
|
| 217 |
},
|
|
|
|
| 219 |
"target": "salt_tolerance_pct",
|
| 220 |
"task": "regression",
|
| 221 |
"metric_name": "mae",
|
| 222 |
+
"value": 1.892595597748997,
|
| 223 |
"n_train": 3075,
|
| 224 |
"n_test": 769
|
| 225 |
},
|
|
|
|
| 227 |
"target": "salt_tolerance_pct",
|
| 228 |
"task": "regression",
|
| 229 |
"metric_name": "mae",
|
| 230 |
+
"value": 2.7457253220944784,
|
| 231 |
"n_train": 3075,
|
| 232 |
"n_test": 769
|
| 233 |
},
|
|
|
|
| 235 |
"target": "salt_tolerance_pct",
|
| 236 |
"task": "regression",
|
| 237 |
"metric_name": "mae",
|
| 238 |
+
"value": 1.870206453444744,
|
| 239 |
"n_train": 3075,
|
| 240 |
"n_test": 769
|
| 241 |
},
|
|
|
|
| 243 |
"target": "salt_tolerance_pct",
|
| 244 |
"task": "regression",
|
| 245 |
"metric_name": "mae",
|
| 246 |
+
"value": 2.127636953367231,
|
| 247 |
"n_train": 3076,
|
| 248 |
"n_test": 768
|
| 249 |
}
|
| 250 |
],
|
| 251 |
"top_features": {
|
| 252 |
+
"neg_charged_frac": 0.07017230689525604,
|
| 253 |
+
"tetra_ATCC": 0.04281170674366876,
|
| 254 |
+
"aa_frac_C": 0.029778398107737303,
|
| 255 |
+
"iso_cat2_saline": 0.028634220734238623,
|
| 256 |
+
"md_nacl_pct_median": 0.02563472166657448,
|
| 257 |
+
"tetra_ACAT": 0.025493022409500556,
|
| 258 |
+
"md_nacl_pct_max": 0.012753746472299099,
|
| 259 |
+
"aa_frac_T": 0.011963088880293071,
|
| 260 |
+
"codon_CCG": 0.009299659519456327,
|
| 261 |
+
"tetra_TGAT": 0.008889634546358138,
|
| 262 |
+
"tetra_GTTC": 0.00881260905880481,
|
| 263 |
+
"codon_TCA": 0.00808500499697402,
|
| 264 |
+
"mean_isoelectric_point": 0.007483909465372562,
|
| 265 |
+
"codon_ATT": 0.0072766575030982494,
|
| 266 |
+
"codon_ACT": 0.006583173375111074,
|
| 267 |
+
"codon_CGT": 0.005766081786714494,
|
| 268 |
+
"tetra_TTCG": 0.005739881168119609,
|
| 269 |
+
"tetra_CGCT": 0.005698419373948127,
|
| 270 |
+
"codon_TGT": 0.005473139556124806,
|
| 271 |
+
"aa_frac_S": 0.005398909421637654
|
| 272 |
}
|
| 273 |
},
|
| 274 |
"__meta__": {
|
|
|
|
| 690 |
"iso_cat2_urogenital_tract",
|
| 691 |
"iso_cat2_waste",
|
| 692 |
"iso_cat2_xerophilic",
|
| 693 |
+
"iso_cat2_yeast",
|
| 694 |
+
"md_n_media",
|
| 695 |
+
"md_ph_median",
|
| 696 |
+
"md_ph_range",
|
| 697 |
+
"md_nacl_pct_median",
|
| 698 |
+
"md_nacl_pct_max"
|
| 699 |
]
|
| 700 |
}
|
| 701 |
}
|
|
@@ -1,15 +1,15 @@
|
|
| 1 |
# microbe-model — v0 baseline eval report
|
| 2 |
|
| 3 |
-
_Generated: 2026-05-
|
| 4 |
|
| 5 |
## TL;DR
|
| 6 |
|
| 7 |
-
- **`optimal_temperature_c`**: MAE = **2.
|
| 8 |
-
- **`optimal_ph`**: MAE = **0.
|
| 9 |
-
- **`oxygen_requirement`**: macro-F1 = **0.
|
| 10 |
-
- **`salt_tolerance_pct`**: MAE = **2.
|
| 11 |
|
| 12 |
-
Trained on **46,029** strains with **
|
| 13 |
|
| 14 |
## Corpus
|
| 15 |
|
|
@@ -43,102 +43,102 @@ Each is shown alongside the dumb-baseline (always-predict-mean / always-predict-
|
|
| 43 |
|
| 44 |
| Target | Task | n labeled | Model metric | Baseline | Improvement |
|
| 45 |
|---|---|---|---|---|---|
|
| 46 |
-
| `optimal_temperature_c` | regression | 45,621 | MAE=2.
|
| 47 |
-
| `optimal_ph` | regression | 5,103 | MAE=0.
|
| 48 |
-
| `oxygen_requirement` | classification | 21,639 | F1=0.
|
| 49 |
-
| `salt_tolerance_pct` | regression | 3,844 | MAE=2.
|
| 50 |
|
| 51 |
### `optimal_temperature_c` — fold-by-fold
|
| 52 |
|
| 53 |
| Fold | Metric | Train | Test |
|
| 54 |
|---|---|---|---|
|
| 55 |
-
| 1 | mae =
|
| 56 |
-
| 2 | mae = 2.
|
| 57 |
-
| 3 | mae = 3.
|
| 58 |
-
| 4 | mae = 3.
|
| 59 |
-
| 5 | mae = 2.
|
| 60 |
|
| 61 |
**Top 10 features for `optimal_temperature_c`:**
|
| 62 |
|
| 63 |
-
- `ivywrel_frac` — 0.
|
| 64 |
-
- `iso_cat2_thermophilic_gt45_c` — 0.
|
| 65 |
-
- `
|
| 66 |
-
- `iso_cat2_human` — 0.
|
| 67 |
-
- `
|
| 68 |
-
- `
|
| 69 |
-
- `aa_frac_C` — 0.
|
| 70 |
-
- `genome_size_nt` — 0.
|
| 71 |
-
- `
|
| 72 |
-
- `
|
| 73 |
|
| 74 |
### `optimal_ph` — fold-by-fold
|
| 75 |
|
| 76 |
| Fold | Metric | Train | Test |
|
| 77 |
|---|---|---|---|
|
| 78 |
-
| 1 | mae = 0.
|
| 79 |
-
| 2 | mae = 0.
|
| 80 |
-
| 3 | mae = 0.
|
| 81 |
-
| 4 | mae = 0.
|
| 82 |
-
| 5 | mae = 0.
|
| 83 |
|
| 84 |
**Top 10 features for `optimal_ph`:**
|
| 85 |
|
| 86 |
-
- `
|
| 87 |
-
- `
|
| 88 |
-
- `
|
| 89 |
-
- `
|
| 90 |
-
- `
|
| 91 |
-
- `
|
| 92 |
-
- `
|
| 93 |
-
- `
|
| 94 |
-
- `tetra_AGAC` — 0.
|
| 95 |
-
- `
|
| 96 |
|
| 97 |
### `oxygen_requirement` — fold-by-fold
|
| 98 |
|
| 99 |
| Fold | Metric | Train | Test |
|
| 100 |
|---|---|---|---|
|
| 101 |
-
| 1 | f1_macro = 0.
|
| 102 |
-
| 2 | f1_macro = 0.
|
| 103 |
-
| 3 | f1_macro = 0.
|
| 104 |
-
| 4 | f1_macro = 0.
|
| 105 |
-
| 5 | f1_macro = 0.
|
| 106 |
|
| 107 |
**Top 10 features for `oxygen_requirement`:**
|
| 108 |
|
| 109 |
-
- `codon_ATA` — 0.
|
| 110 |
-
- `iso_cat1_host` — 0.
|
| 111 |
-
- `n_predicted_cds` — 0.
|
| 112 |
-
- `aa_frac_C` — 0.
|
| 113 |
-
- `iso_cat1_environmental` — 0.
|
| 114 |
-
- `codon_CGT` — 0.
|
| 115 |
-
- `iso_cat1_engineered` — 0.
|
| 116 |
-
- `
|
| 117 |
-
- `
|
| 118 |
-
- `codon_TAA` — 0.
|
| 119 |
|
| 120 |
### `salt_tolerance_pct` — fold-by-fold
|
| 121 |
|
| 122 |
| Fold | Metric | Train | Test |
|
| 123 |
|---|---|---|---|
|
| 124 |
-
| 1 | mae =
|
| 125 |
-
| 2 | mae = 1.
|
| 126 |
-
| 3 | mae = 2.
|
| 127 |
-
| 4 | mae = 1.
|
| 128 |
-
| 5 | mae = 2.
|
| 129 |
|
| 130 |
**Top 10 features for `salt_tolerance_pct`:**
|
| 131 |
|
| 132 |
-
- `neg_charged_frac` — 0.
|
| 133 |
-
- `tetra_ATCC` — 0.
|
| 134 |
-
- `aa_frac_C` — 0.
|
| 135 |
-
- `iso_cat2_saline` — 0.
|
| 136 |
-
- `
|
| 137 |
-
- `
|
| 138 |
-
- `
|
| 139 |
-
- `
|
| 140 |
-
- `
|
| 141 |
-
- `tetra_TGAT` — 0.
|
| 142 |
|
| 143 |
## Feature ↔ target correlations (Spearman, top 10)
|
| 144 |
|
|
@@ -163,16 +163,16 @@ Sanity-checks the biology — features known to track each target should appear
|
|
| 163 |
|
| 164 |
| Feature | Spearman ρ | p-value |
|
| 165 |
|---|---|---|
|
|
|
|
| 166 |
| `neg_charged_frac` | +0.304 | 1.6e-109 |
|
| 167 |
| `mean_isoelectric_point` | -0.278 | 1.8e-91 |
|
| 168 |
| `aa_frac_E` | +0.256 | 4.5e-77 |
|
|
|
|
|
|
|
| 169 |
| `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
|
| 170 |
| `ivywrel_frac` | +0.159 | 2.4e-30 |
|
| 171 |
| `codon_AAG` | -0.154 | 1.7e-28 |
|
| 172 |
| `codon_CGA` | +0.153 | 5.8e-28 |
|
| 173 |
-
| `codon_TGC` | -0.151 | 2.6e-27 |
|
| 174 |
-
| `iso_cat2_saline` | +0.137 | 8.9e-23 |
|
| 175 |
-
| `tetra_CACT` | +0.135 | 4.3e-22 |
|
| 176 |
|
| 177 |
### `salt_tolerance_pct`
|
| 178 |
|
|
@@ -183,11 +183,11 @@ Sanity-checks the biology — features known to track each target should appear
|
|
| 183 |
| `aa_frac_E` | +0.310 | 3.1e-86 |
|
| 184 |
| `tetra_GACT` | +0.302 | 4.3e-82 |
|
| 185 |
| `tetra_AGTC` | +0.302 | 1.0e-81 |
|
|
|
|
|
|
|
| 186 |
| `tetra_ACTC` | +0.282 | 2.2e-71 |
|
| 187 |
| `tetra_GAGT` | +0.273 | 1.9e-66 |
|
| 188 |
| `iso_cat2_saline` | +0.263 | 9.4e-62 |
|
| 189 |
-
| `aa_frac_D` | +0.257 | 5.3e-59 |
|
| 190 |
-
| `codon_AGC` | -0.252 | 6.0e-57 |
|
| 191 |
|
| 192 |
## Per-family error breakdown (regression targets)
|
| 193 |
|
|
@@ -197,61 +197,61 @@ Top 15 most-represented families, MAE per family. Highlights where the model is
|
|
| 197 |
|
| 198 |
| Family | n | MAE |
|
| 199 |
|---|---|---|
|
| 200 |
-
| Enterobacteriaceae | 2662 |
|
| 201 |
-
| Streptomycetaceae | 2212 | 1.
|
| 202 |
-
| Bacillaceae | 1886 | 3.
|
| 203 |
-
| Lactobacillaceae | 1732 | 3.
|
| 204 |
-
| Pseudomonadaceae | 1621 | 2.
|
| 205 |
-
| Myxococcaceae | 1546 | 0.
|
| 206 |
-
| Streptococcaceae | 1170 | 2.
|
| 207 |
-
| Staphylococcaceae | 1068 |
|
| 208 |
-
| Flavobacteriaceae | 981 | 4.
|
| 209 |
-
| Corynebacteriaceae | 900 | 2.
|
| 210 |
-
| Moraxellaceae | 890 | 3.
|
| 211 |
-
| Paenibacillaceae | 760 |
|
| 212 |
-
| Microbacteriaceae | 734 | 2.
|
| 213 |
-
| Micrococcaceae | 719 | 2.
|
| 214 |
-
| Nocardiaceae | 715 | 2.
|
| 215 |
|
| 216 |
### `optimal_ph`
|
| 217 |
|
| 218 |
| Family | n | MAE |
|
| 219 |
|---|---|---|
|
| 220 |
-
| Flavobacteriaceae | 355 | 0.
|
| 221 |
-
| Bacillaceae | 298 | 0.
|
| 222 |
-
| Roseobacteraceae | 204 | 0.
|
| 223 |
-
| Paenibacillaceae | 139 | 0.
|
| 224 |
-
| Microbacteriaceae | 120 | 0.
|
| 225 |
-
| Sphingobacteriaceae | 114 | 0.
|
| 226 |
-
| Sphingomonadaceae | 102 | 0.
|
| 227 |
-
| Streptomycetaceae | 98 | 0.
|
| 228 |
-
| Pseudonocardiaceae | 93 | 0.
|
| 229 |
-
| Halomonadaceae | 82 | 0.
|
| 230 |
-
| Micrococcaceae | 82 | 0.
|
| 231 |
-
| Nocardioidaceae | 80 | 0.
|
| 232 |
-
| Paracoccaceae | 76 | 0.
|
| 233 |
-
| Alteromonadaceae | 71 | 0.
|
| 234 |
-
| Erythrobacteraceae | 68 | 0.
|
| 235 |
|
| 236 |
### `salt_tolerance_pct`
|
| 237 |
|
| 238 |
| Family | n | MAE |
|
| 239 |
|---|---|---|
|
| 240 |
-
| Flavobacteriaceae | 267 | 1.
|
| 241 |
-
| Streptomycetaceae | 264 |
|
| 242 |
-
| Bacillaceae | 201 | 3.
|
| 243 |
-
| Roseobacteraceae | 127 | 1.
|
| 244 |
-
| Pseudonocardiaceae | 123 | 2.
|
| 245 |
-
| Paenibacillaceae | 93 | 1.
|
| 246 |
-
| Enterococcaceae | 93 | 2.
|
| 247 |
-
| Microbacteriaceae | 91 | 2.
|
| 248 |
-
| Micromonosporaceae | 90 | 1.
|
| 249 |
-
| Sphingomonadaceae | 81 |
|
| 250 |
-
| Micrococcaceae | 71 | 2.
|
| 251 |
-
| Streptosporangiaceae | 68 | 1.
|
| 252 |
-
| Lactobacillaceae | 66 | 2.
|
| 253 |
-
| Sphingobacteriaceae | 55 | 1.
|
| 254 |
-
| Halomonadaceae | 52 | 2.
|
| 255 |
|
| 256 |
## Known limitations
|
| 257 |
|
|
|
|
| 1 |
# microbe-model — v0 baseline eval report
|
| 2 |
|
| 3 |
+
_Generated: 2026-05-05T10:42:09+00:00_
|
| 4 |
|
| 5 |
## TL;DR
|
| 6 |
|
| 7 |
+
- **`optimal_temperature_c`**: MAE = **2.86** (vs always-predict-mean 4.98, **+43%**)
|
| 8 |
+
- **`optimal_ph`**: MAE = **0.48** (vs always-predict-mean 0.55, **+12%**)
|
| 9 |
+
- **`oxygen_requirement`**: macro-F1 = **0.357** (vs always-predict-majority 0.059, **+507%**)
|
| 10 |
+
- **`salt_tolerance_pct`**: MAE = **2.11** (vs always-predict-mean 2.51, **+16%**)
|
| 11 |
|
| 12 |
+
Trained on **46,029** strains with **423** features (genome-derived, isolation-category, and MediaDive-derived). Cross-validation: 5-fold GroupKFold by taxonomic family.
|
| 13 |
|
| 14 |
## Corpus
|
| 15 |
|
|
|
|
| 43 |
|
| 44 |
| Target | Task | n labeled | Model metric | Baseline | Improvement |
|
| 45 |
|---|---|---|---|---|---|
|
| 46 |
+
| `optimal_temperature_c` | regression | 45,621 | MAE=2.857 | MAE=4.981 | +42.6% |
|
| 47 |
+
| `optimal_ph` | regression | 5,103 | MAE=0.482 | MAE=0.546 | +11.6% |
|
| 48 |
+
| `oxygen_requirement` | classification | 21,639 | F1=0.357 | F1=0.059 | +507.0% |
|
| 49 |
+
| `salt_tolerance_pct` | regression | 3,844 | MAE=2.112 | MAE=2.515 | +16.0% |
|
| 50 |
|
| 51 |
### `optimal_temperature_c` — fold-by-fold
|
| 52 |
|
| 53 |
| Fold | Metric | Train | Test |
|
| 54 |
|---|---|---|---|
|
| 55 |
+
| 1 | mae = 2.953 | n=36,496 | n=9,125 |
|
| 56 |
+
| 2 | mae = 2.626 | n=36,497 | n=9,124 |
|
| 57 |
+
| 3 | mae = 3.060 | n=36,497 | n=9,124 |
|
| 58 |
+
| 4 | mae = 3.265 | n=36,497 | n=9,124 |
|
| 59 |
+
| 5 | mae = 2.381 | n=36,497 | n=9,124 |
|
| 60 |
|
| 61 |
**Top 10 features for `optimal_temperature_c`:**
|
| 62 |
|
| 63 |
+
- `ivywrel_frac` — 0.1235
|
| 64 |
+
- `iso_cat2_thermophilic_gt45_c` — 0.0288
|
| 65 |
+
- `iso_cat2_patient` — 0.0251
|
| 66 |
+
- `iso_cat2_human` — 0.0234
|
| 67 |
+
- `n_predicted_cds` — 0.0216
|
| 68 |
+
- `iso_cat1_infection` — 0.0204
|
| 69 |
+
- `aa_frac_C` — 0.0143
|
| 70 |
+
- `genome_size_nt` — 0.0123
|
| 71 |
+
- `tetra_CTAA` — 0.0118
|
| 72 |
+
- `aa_frac_D` — 0.0109
|
| 73 |
|
| 74 |
### `optimal_ph` — fold-by-fold
|
| 75 |
|
| 76 |
| Fold | Metric | Train | Test |
|
| 77 |
|---|---|---|---|
|
| 78 |
+
| 1 | mae = 0.440 | n=4,082 | n=1,021 |
|
| 79 |
+
| 2 | mae = 0.568 | n=4,082 | n=1,021 |
|
| 80 |
+
| 3 | mae = 0.494 | n=4,082 | n=1,021 |
|
| 81 |
+
| 4 | mae = 0.466 | n=4,083 | n=1,020 |
|
| 82 |
+
| 5 | mae = 0.444 | n=4,083 | n=1,020 |
|
| 83 |
|
| 84 |
**Top 10 features for `optimal_ph`:**
|
| 85 |
|
| 86 |
+
- `md_ph_median` — 0.0518
|
| 87 |
+
- `iso_cat2_acidic` — 0.0307
|
| 88 |
+
- `iso_cat2_alkaline` — 0.0287
|
| 89 |
+
- `neg_charged_frac` — 0.0146
|
| 90 |
+
- `aa_frac_H` — 0.0081
|
| 91 |
+
- `aa_frac_E` — 0.0077
|
| 92 |
+
- `tetra_CTCT` — 0.0071
|
| 93 |
+
- `iso_cat2_plant` — 0.0068
|
| 94 |
+
- `tetra_AGAC` — 0.0067
|
| 95 |
+
- `tetra_CACT` — 0.0065
|
| 96 |
|
| 97 |
### `oxygen_requirement` — fold-by-fold
|
| 98 |
|
| 99 |
| Fold | Metric | Train | Test |
|
| 100 |
|---|---|---|---|
|
| 101 |
+
| 1 | f1_macro = 0.353 | n=17,311 | n=4,328 |
|
| 102 |
+
| 2 | f1_macro = 0.375 | n=17,311 | n=4,326 |
|
| 103 |
+
| 3 | f1_macro = 0.357 | n=17,311 | n=4,328 |
|
| 104 |
+
| 4 | f1_macro = 0.274 | n=17,311 | n=4,328 |
|
| 105 |
+
| 5 | f1_macro = 0.429 | n=17,312 | n=4,327 |
|
| 106 |
|
| 107 |
**Top 10 features for `oxygen_requirement`:**
|
| 108 |
|
| 109 |
+
- `codon_ATA` — 0.0395
|
| 110 |
+
- `iso_cat1_host` — 0.0269
|
| 111 |
+
- `n_predicted_cds` — 0.0266
|
| 112 |
+
- `aa_frac_C` — 0.0195
|
| 113 |
+
- `iso_cat1_environmental` — 0.0162
|
| 114 |
+
- `codon_CGT` — 0.0144
|
| 115 |
+
- `iso_cat1_engineered` — 0.0139
|
| 116 |
+
- `iso_cat2_human` — 0.0124
|
| 117 |
+
- `genome_size_nt` — 0.0103
|
| 118 |
+
- `codon_TAA` — 0.0083
|
| 119 |
|
| 120 |
### `salt_tolerance_pct` — fold-by-fold
|
| 121 |
|
| 122 |
| Fold | Metric | Train | Test |
|
| 123 |
|---|---|---|---|
|
| 124 |
+
| 1 | mae = 1.926 | n=3,075 | n=769 |
|
| 125 |
+
| 2 | mae = 1.893 | n=3,075 | n=769 |
|
| 126 |
+
| 3 | mae = 2.746 | n=3,075 | n=769 |
|
| 127 |
+
| 4 | mae = 1.870 | n=3,075 | n=769 |
|
| 128 |
+
| 5 | mae = 2.128 | n=3,076 | n=768 |
|
| 129 |
|
| 130 |
**Top 10 features for `salt_tolerance_pct`:**
|
| 131 |
|
| 132 |
+
- `neg_charged_frac` — 0.0702
|
| 133 |
+
- `tetra_ATCC` — 0.0428
|
| 134 |
+
- `aa_frac_C` — 0.0298
|
| 135 |
+
- `iso_cat2_saline` — 0.0286
|
| 136 |
+
- `md_nacl_pct_median` — 0.0256
|
| 137 |
+
- `tetra_ACAT` — 0.0255
|
| 138 |
+
- `md_nacl_pct_max` — 0.0128
|
| 139 |
+
- `aa_frac_T` — 0.0120
|
| 140 |
+
- `codon_CCG` — 0.0093
|
| 141 |
+
- `tetra_TGAT` — 0.0089
|
| 142 |
|
| 143 |
## Feature ↔ target correlations (Spearman, top 10)
|
| 144 |
|
|
|
|
| 163 |
|
| 164 |
| Feature | Spearman ρ | p-value |
|
| 165 |
|---|---|---|
|
| 166 |
+
| `md_ph_median` | +0.429 | 4.0e-131 |
|
| 167 |
| `neg_charged_frac` | +0.304 | 1.6e-109 |
|
| 168 |
| `mean_isoelectric_point` | -0.278 | 1.8e-91 |
|
| 169 |
| `aa_frac_E` | +0.256 | 4.5e-77 |
|
| 170 |
+
| `md_nacl_pct_max` | +0.218 | 1.9e-33 |
|
| 171 |
+
| `md_nacl_pct_median` | +0.212 | 9.9e-32 |
|
| 172 |
| `iso_cat2_alkaline` | +0.165 | 2.5e-32 |
|
| 173 |
| `ivywrel_frac` | +0.159 | 2.4e-30 |
|
| 174 |
| `codon_AAG` | -0.154 | 1.7e-28 |
|
| 175 |
| `codon_CGA` | +0.153 | 5.8e-28 |
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
### `salt_tolerance_pct`
|
| 178 |
|
|
|
|
| 183 |
| `aa_frac_E` | +0.310 | 3.1e-86 |
|
| 184 |
| `tetra_GACT` | +0.302 | 4.3e-82 |
|
| 185 |
| `tetra_AGTC` | +0.302 | 1.0e-81 |
|
| 186 |
+
| `md_nacl_pct_max` | +0.298 | 2.9e-52 |
|
| 187 |
+
| `md_nacl_pct_median` | +0.290 | 1.6e-49 |
|
| 188 |
| `tetra_ACTC` | +0.282 | 2.2e-71 |
|
| 189 |
| `tetra_GAGT` | +0.273 | 1.9e-66 |
|
| 190 |
| `iso_cat2_saline` | +0.263 | 9.4e-62 |
|
|
|
|
|
|
|
| 191 |
|
| 192 |
## Per-family error breakdown (regression targets)
|
| 193 |
|
|
|
|
| 197 |
|
| 198 |
| Family | n | MAE |
|
| 199 |
|---|---|---|
|
| 200 |
+
| Enterobacteriaceae | 2662 | 3.792 |
|
| 201 |
+
| Streptomycetaceae | 2212 | 1.783 |
|
| 202 |
+
| Bacillaceae | 1886 | 3.174 |
|
| 203 |
+
| Lactobacillaceae | 1732 | 3.709 |
|
| 204 |
+
| Pseudomonadaceae | 1621 | 2.488 |
|
| 205 |
+
| Myxococcaceae | 1546 | 0.238 |
|
| 206 |
+
| Streptococcaceae | 1170 | 2.537 |
|
| 207 |
+
| Staphylococcaceae | 1068 | 3.374 |
|
| 208 |
+
| Flavobacteriaceae | 981 | 4.116 |
|
| 209 |
+
| Corynebacteriaceae | 900 | 2.146 |
|
| 210 |
+
| Moraxellaceae | 890 | 3.388 |
|
| 211 |
+
| Paenibacillaceae | 760 | 3.081 |
|
| 212 |
+
| Microbacteriaceae | 734 | 2.459 |
|
| 213 |
+
| Micrococcaceae | 719 | 2.811 |
|
| 214 |
+
| Nocardiaceae | 715 | 2.276 |
|
| 215 |
|
| 216 |
### `optimal_ph`
|
| 217 |
|
| 218 |
| Family | n | MAE |
|
| 219 |
|---|---|---|
|
| 220 |
+
| Flavobacteriaceae | 355 | 0.405 |
|
| 221 |
+
| Bacillaceae | 298 | 0.606 |
|
| 222 |
+
| Roseobacteraceae | 204 | 0.375 |
|
| 223 |
+
| Paenibacillaceae | 139 | 0.469 |
|
| 224 |
+
| Microbacteriaceae | 120 | 0.446 |
|
| 225 |
+
| Sphingobacteriaceae | 114 | 0.336 |
|
| 226 |
+
| Sphingomonadaceae | 102 | 0.319 |
|
| 227 |
+
| Streptomycetaceae | 98 | 0.513 |
|
| 228 |
+
| Pseudonocardiaceae | 93 | 0.479 |
|
| 229 |
+
| Halomonadaceae | 82 | 0.584 |
|
| 230 |
+
| Micrococcaceae | 82 | 0.613 |
|
| 231 |
+
| Nocardioidaceae | 80 | 0.502 |
|
| 232 |
+
| Paracoccaceae | 76 | 0.574 |
|
| 233 |
+
| Alteromonadaceae | 71 | 0.355 |
|
| 234 |
+
| Erythrobacteraceae | 68 | 0.446 |
|
| 235 |
|
| 236 |
### `salt_tolerance_pct`
|
| 237 |
|
| 238 |
| Family | n | MAE |
|
| 239 |
|---|---|---|
|
| 240 |
+
| Flavobacteriaceae | 267 | 1.713 |
|
| 241 |
+
| Streptomycetaceae | 264 | 1.987 |
|
| 242 |
+
| Bacillaceae | 201 | 3.315 |
|
| 243 |
+
| Roseobacteraceae | 127 | 1.395 |
|
| 244 |
+
| Pseudonocardiaceae | 123 | 2.280 |
|
| 245 |
+
| Paenibacillaceae | 93 | 1.651 |
|
| 246 |
+
| Enterococcaceae | 93 | 2.935 |
|
| 247 |
+
| Microbacteriaceae | 91 | 2.789 |
|
| 248 |
+
| Micromonosporaceae | 90 | 1.609 |
|
| 249 |
+
| Sphingomonadaceae | 81 | 1.028 |
|
| 250 |
+
| Micrococcaceae | 71 | 2.613 |
|
| 251 |
+
| Streptosporangiaceae | 68 | 1.480 |
|
| 252 |
+
| Lactobacillaceae | 66 | 2.559 |
|
| 253 |
+
| Sphingobacteriaceae | 55 | 1.218 |
|
| 254 |
+
| Halomonadaceae | 52 | 2.815 |
|
| 255 |
|
| 256 |
## Known limitations
|
| 257 |
|
|
@@ -71,8 +71,20 @@ def main() -> None:
|
|
| 71 |
print(f"Encoded {len(iso_cols)} isolation-category features "
|
| 72 |
f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
|
| 75 |
-
feature_cols = feature_cols + iso_cols
|
| 76 |
|
| 77 |
print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
|
| 78 |
print(f"Distinct groups: {df['group'].nunique():,}")
|
|
|
|
| 71 |
print(f"Encoded {len(iso_cols)} isolation-category features "
|
| 72 |
f"({df[iso_cols].sum().sum():.0f} non-zero entries)")
|
| 73 |
|
| 74 |
+
md_path = config.DATA / "mediadive_features.parquet"
|
| 75 |
+
md_cols: list[str] = []
|
| 76 |
+
if md_path.exists():
|
| 77 |
+
md = pd.read_parquet(md_path)
|
| 78 |
+
md["bacdive_id"] = md["bacdive_id"].astype(int)
|
| 79 |
+
df["bacdive_id"] = df["bacdive_id"].astype(int)
|
| 80 |
+
md_cols = [c for c in md.columns if c != "bacdive_id"]
|
| 81 |
+
df = df.merge(md, on="bacdive_id", how="left")
|
| 82 |
+
n_with_md = df[md_cols[0]].notna().sum() if md_cols else 0
|
| 83 |
+
print(f"Joined MediaDive features ({len(md_cols)} cols) — "
|
| 84 |
+
f"{n_with_md:,}/{len(df):,} training rows have MediaDive data")
|
| 85 |
+
|
| 86 |
feature_cols = [c for c in feats.columns if c not in {"bacdive_id", "genome_accession"}]
|
| 87 |
+
feature_cols = feature_cols + iso_cols + md_cols
|
| 88 |
|
| 89 |
print(f"Training table: {len(df):,} strains × {len(feature_cols)} features")
|
| 90 |
print(f"Distinct groups: {df['group'].nunique():,}")
|
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build per-strain MediaDive features from strain_media + media_recipes + raw JSON.
|
| 2 |
+
|
| 3 |
+
For each BacDive strain, compute the median pH and NaCl% across all DSMZ media that
|
| 4 |
+
strain has been recorded as growing on. These are NOT labels — they're additional
|
| 5 |
+
features the model can use to predict the actual phenotype optima. Saves to
|
| 6 |
+
data/mediadive_features.parquet (joined into the training table by scripts/03).
|
| 7 |
+
|
| 8 |
+
Per-strain features written:
|
| 9 |
+
- md_n_media: count of media the strain grows on
|
| 10 |
+
- md_ph_median: median midpoint(min_pH, max_pH) across those media
|
| 11 |
+
- md_ph_range: max - min of medium pH across those media
|
| 12 |
+
- md_nacl_pct_median: median NaCl % w/v across those media
|
| 13 |
+
- md_nacl_pct_max: max NaCl % w/v (highest tolerated)
|
| 14 |
+
|
| 15 |
+
Sanity check: where a BacDive optimum_pH or salt_tolerance_pct exists, we expect
|
| 16 |
+
moderate (not perfect) correlation with the corresponding MediaDive feature.
|
| 17 |
+
"""
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import json
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
import pandas as pd
|
| 24 |
+
|
| 25 |
+
from microbe_model import config
|
| 26 |
+
|
| 27 |
+
# Upper bound (in % w/v) applied to each medium's summed NaCl content; larger
# values are treated as recipe-parsing artifacts and clipped to this cap.
NACL_CAP_PCT = 30.0
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def build_medium_ph_map() -> dict[str, float]:
    """Return {medium_id: midpoint pH} from the raw MediaDive JSON cache.

    Scans every ``*.json`` file under ``config.DATA / "mediadive"``. A file
    contributes an entry only when it parses to a dict whose ``"medium"`` value
    is itself a dict carrying ``id``, ``min_pH`` and ``max_pH``; the stored
    value is ``(min_pH + max_pH) / 2``. Files that cannot be decoded or parsed,
    or that lack any of those fields, are skipped rather than failing the build.

    Returns:
        Mapping from medium id (always a ``str``) to the midpoint pH.
    """
    ph_by_medium: dict[str, float] = {}
    # Sorted so that, should two cache files carry the same medium id, the
    # surviving entry does not depend on filesystem enumeration order.
    for path in sorted(Path(config.DATA / "mediadive").glob("*.json")):
        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            payload = json.loads(path.read_text(encoding="utf-8"))
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError: one corrupt
            # cache file must not abort the whole feature build.
            continue

        medium = payload.get("medium") if isinstance(payload, dict) else None
        if not isinstance(medium, dict):
            continue

        medium_id = medium.get("id")
        min_ph = medium.get("min_pH")
        max_ph = medium.get("max_pH")
        if medium_id is None or min_ph is None or max_ph is None:
            continue

        try:
            midpoint = (float(min_ph) + float(max_ph)) / 2
        except (ValueError, TypeError):
            # Non-numeric pH bounds: skip this medium.
            continue
        ph_by_medium[str(medium_id)] = midpoint
    return ph_by_medium
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def build_medium_nacl_map() -> dict[str, float]:
    """Return {medium_id: NaCl % w/v} summed from recipe compounds (clipped).

    Reads ``media_recipes.parquet`` from ``config.DATA``, keeps rows whose
    ``compound`` matches "sodium chlor..." or is exactly "NaCl"
    (case-insensitive), sums ``g_l`` per medium, converts g/L to % w/v by
    dividing by 10, and clips the result at ``NACL_CAP_PCT``.

    Returns:
        Mapping from medium id (always a ``str``) to NaCl percentage. Keys are
        stringified so they line up with callers that look media up by
        ``str`` id; without this, a non-string ``medium_id`` column would make
        every lookup miss silently.
    """
    recipes = pd.read_parquet(config.DATA / "media_recipes.parquet")
    is_nacl = recipes["compound"].str.contains(
        r"sodium chlor|^nacl$", case=False, na=False, regex=True
    )
    # Drop rows without a medium id first: groupby ignores missing keys, and
    # casting them to str would otherwise create a bogus "nan" medium.
    nacl_rows = recipes.loc[is_nacl].dropna(subset=["medium_id"])
    medium_ids = nacl_rows["medium_id"].astype(str)
    grams_per_litre = nacl_rows["g_l"].groupby(medium_ids).sum()
    pct = (grams_per_litre / 10).clip(upper=NACL_CAP_PCT)
    return {str(medium_id): float(value) for medium_id, value in pct.items()}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def main() -> None:
    """Build the per-strain MediaDive feature table and write it to parquet.

    Keeps only strain/medium rows whose ``growth`` is "yes" (case-insensitive),
    attaches each medium's midpoint pH and NaCl %, aggregates per
    ``bacdive_id`` and writes ``mediadive_features.parquet`` under
    ``config.DATA``. Prints map sizes, the output row count and summary
    statistics along the way.
    """
    strain_media = pd.read_parquet(config.DATA / "strain_media.parquet")
    grows = strain_media["growth"].str.lower() == "yes"
    sm = strain_media[grows].copy()
    # Medium ids are compared as strings against the lookup maps below.
    sm["medium_id"] = sm["medium_id"].astype(str)

    ph_map = build_medium_ph_map()
    nacl_map = build_medium_nacl_map()
    print(f"medium pH map: {len(ph_map):,} media")
    print(f"medium NaCl map: {len(nacl_map):,} media")

    # Media without a known pH stay missing here.
    sm["m_ph"] = sm["medium_id"].map(ph_map)
    # Strains may grow on media not in the recipe table — treat absent as 0% NaCl
    nacl_per_row = sm["medium_id"].map(nacl_map)
    sm["m_nacl"] = nacl_per_row.fillna(0.0)

    # Aggregate per-strain
    grouped = sm.groupby("bacdive_id")
    ph = grouped["m_ph"]
    nacl = grouped["m_nacl"]
    columns = {
        "md_n_media": grouped.size(),
        "md_ph_median": ph.median(),
        "md_ph_range": ph.max() - ph.min(),
        "md_nacl_pct_median": nacl.median(),
        "md_nacl_pct_max": nacl.max(),
    }
    feat = pd.DataFrame(columns).reset_index()

    out = config.DATA / "mediadive_features.parquet"
    feat.to_parquet(out, index=False)
    print(f"\nwrote {len(feat):,} strains to {out}")
    print(feat.describe().round(2).to_string())
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
|
| 94 |
+
main()
|