ashaddams committed on
Commit
e7518d8
·
verified ·
1 Parent(s): e28c08a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -6
app.py CHANGED
@@ -175,6 +175,57 @@ def get_bounds(species: str, media: str, target: str):
175
  return None, None
176
  lo, hi = rng[tg]
177
  return float(lo), float(hi)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  # -----------------------------
180
  # Helpers
@@ -744,19 +795,31 @@ def _load_ensemble(target: str) -> EnsembleBundle:
744
  return bundle
745
 
746
  def _encode_df_for_bundle(bundle: EnsembleBundle, df_like: pd.DataFrame) -> pd.DataFrame:
747
- """Apply the SAVED encoders + numeric parsing + SAVED imputer; returns imputed numeric DF in training feature order."""
 
 
 
748
  def _norm(x):
749
  return "nan" if pd.isna(x) else str(x).strip().lower()
750
 
 
751
  X = pd.DataFrame({c: df_like[c] if c in df_like.columns else np.nan for c in bundle.feature_order})
752
 
753
- # encode categoricals
 
 
 
 
 
 
 
 
 
 
754
  for col in bundle.categorical_cols:
755
- if col in X.columns:
756
- X[col] = X[col].map(_norm).astype(str)
757
- X[col] = X[col].apply(lambda v: bundle.encoders[col].transform([v])[0])
758
 
759
- # numerics – parse like training
760
  def _extract_first_float(x):
761
  if pd.isna(x): return np.nan
762
  s = str(x); m = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", s)
@@ -773,6 +836,7 @@ def _encode_df_for_bundle(bundle: EnsembleBundle, df_like: pd.DataFrame) -> pd.D
773
  if c in X.columns:
774
  X[c] = X[c].apply(_extract_first_float)
775
 
 
776
  X_imp = pd.DataFrame(bundle.imputer.transform(X[bundle.feature_order]), columns=bundle.feature_order)
777
  return X_imp
778
 
 
175
  return None, None
176
  lo, hi = rng[tg]
177
  return float(lo), float(hi)
178
+ # --- Robust canonicalization for species/media -> encoder classes ---
179
+ # Accepts dotted-without-space, dotted-with-space, synonyms, fuzzy fallback.
180
+
181
# Canonical species token -> alternative spellings accepted for it.
# Keys are the canonical form ("<initial>. <epithet>"); each list holds the
# dotted-without-space form, the undotted form, and full genus names.
SPECIES_ALIASES_CANON: dict[str, list[str]] = {
    "a. platensis": ["a.platensis", "a platensis", "arthrospira platensis", "spirulina platensis"],
    "c. pyrenoidosa": ["c.pyrenoidosa", "c pyrenoidosa", "chlorella pyrenoidosa"],
    "c. sorokiniana": ["c.sorokiniana", "c sorokiniana", "chlorella sorokiniana"],
    "c. variabilis": ["c.variabilis", "c variabilis", "chlorella variabilis"],
    "c. vulgaris": ["c.vulgaris", "c vulgaris", "chlorella vulgaris"],
    "c. zofingiensis": ["c.zofingiensis", "c zofingiensis", "chromochloris zofingiensis", "chlorella zofingiensis"],
    "h. pluvialis": ["h.pluvialis", "h pluvialis", "haematococcus pluvialis"],
    "p. purpureum": ["p.purpureum", "p purpureum", "porphyridium purpureum"],
    "scenedesmus sp.": ["scenedesmus", "scenedesmus sp", "desmodesmus sp."],
}
192
+
193
def _canon_from_alias(value: str, alias_map: dict[str, list[str]]) -> str:
    """Map *value* onto a canonical key of *alias_map* when one matches.

    The value is first passed through ``normalize_str``. It is then compared
    against every key of *alias_map* and against each key's synonyms (also
    passed through ``normalize_str``), using three spellings in order:

    1. the normalized value as given,
    2. the value with spaces around dots collapsed (``"a. x"`` -> ``"a.x"``),
    3. the value with exactly one space after each dot (``"a.x"`` -> ``"a. x"``).

    Args:
        value: Raw label to canonicalize.
        alias_map: Canonical token -> list of accepted alternative spellings.

    Returns:
        The matching canonical key, or the normalized value unchanged when
        no key or synonym matches.
    """
    v = normalize_str(value)
    if v in alias_map:
        return v

    # Normalize every synonym list once up front; the lookups below reuse it
    # for each spelling variant instead of re-normalizing per pass.
    normalized = [
        (key, {normalize_str(s) for s in syns})
        for key, syns in alias_map.items()
    ]

    # Collapse spaces around dots: "a . x" / "a. x" -> "a.x".
    tight = v.replace(" .", ".").replace(". ", ".")
    # Exactly one space after each dot. Built from the collapsed form and
    # whitespace-squeezed so an existing space never becomes a double space.
    spaced = " ".join(tight.replace(".", ". ").split())

    for candidate in (v, tight, spaced):
        for key, syns in normalized:
            if candidate == key or candidate in syns:
                return key
    return v
211
+
212
+ def _canon_to_known(value: str, known_classes: list[str] | np.ndarray, alias_map: dict[str, list[str]]) -> str:
213
+ """Return a token that is guaranteed to exist in known_classes."""
214
+ known = [str(k) for k in list(known_classes)]
215
+ v = _canon_from_alias(value, alias_map)
216
+ if v in known:
217
+ return v
218
+ # if alias key maps to a known token, use it
219
+ for k, syns in alias_map.items():
220
+ if v == k or v in [normalize_str(s) for s in syns]:
221
+ if k in known:
222
+ return k
223
+ # try fuzzy on known classes
224
+ hit = get_close_matches(v, known, n=1, cutoff=0.6)
225
+ if hit:
226
+ return hit[0]
227
+ # graceful fallback: try 'nan' if present, else the most frequent class (index 0)
228
+ return "nan" if "nan" in known else known[0]
229
 
230
  # -----------------------------
231
  # Helpers
 
795
  return bundle
796
 
797
  def _encode_df_for_bundle(bundle: EnsembleBundle, df_like: pd.DataFrame) -> pd.DataFrame:
798
+ """
799
+ Apply the SAVED encoders + numeric parsing + SAVED imputer; returns imputed numeric DF in training feature order.
800
+ Critically: canonizes species/media to avoid 'previously unseen labels' errors.
801
+ """
802
  def _norm(x):
803
  return "nan" if pd.isna(x) else str(x).strip().lower()
804
 
805
+ # Build X with exactly the training feature order
806
  X = pd.DataFrame({c: df_like[c] if c in df_like.columns else np.nan for c in bundle.feature_order})
807
 
808
+ # ---- Canonicalize categoricals to known encoder classes BEFORE transform ----
809
+ if "species" in X.columns:
810
+ X["species"] = X["species"].map(_norm).apply(
811
+ lambda v: _canon_to_known(v, bundle.encoders["species"].classes_, SPECIES_ALIASES_CANON)
812
+ )
813
+ if "media" in X.columns:
814
+ X["media"] = X["media"].map(_norm).apply(
815
+ lambda v: _canon_to_known(v, bundle.encoders["media"].classes_, MEDIA_ALIASES)
816
+ )
817
+
818
+ # Now safe to transform: every token is guaranteed to exist in classes_
819
  for col in bundle.categorical_cols:
820
+ X[col] = bundle.encoders[col].transform(X[col].astype(str))
 
 
821
 
822
+ # ---- Numerics – parse exactly like training ----
823
  def _extract_first_float(x):
824
  if pd.isna(x): return np.nan
825
  s = str(x); m = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", s)
 
836
  if c in X.columns:
837
  X[c] = X[c].apply(_extract_first_float)
838
 
839
+ # ---- Impute to match training numeric space ----
840
  X_imp = pd.DataFrame(bundle.imputer.transform(X[bundle.feature_order]), columns=bundle.feature_order)
841
  return X_imp
842