Spaces:

ashaddamsAT
/

algae_yield_predictor

Sleeping

App Files Files Community

ashaddams committed on Sep 22, 2025

Commit

d386725

verified ·

1 Parent(s): 5defdae

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -3

app.py CHANGED Viewed

@@ -289,6 +289,49 @@ def _normalize_species_label(s: str) -> str:
         "scenedesmus": "scenedesmus sp.", "scenedesmus sp": "scenedesmus sp.", "scenedesmus sp.": "scenedesmus sp.",
     }
     return alias.get(s2, s2)
 def _format_suggestion_md(species: str, target: str) -> str:
     sp = _normalize_species_label(species)
@@ -562,9 +605,14 @@ def preprocess_row(species, media, light, expo_day, expo_night, temp_c, ph, days
         "_c": temp_c, "ph": ph, "days": days
     }], columns=FEATURES)
-    # encode cats
-    for col in CATEGORICAL:
-        row[col] = encoders[col].transform([row.loc[0, col]])[0]
     # numerics
     row["light"] = row["light"].apply(parse_cycle_first)

         "scenedesmus": "scenedesmus sp.", "scenedesmus sp": "scenedesmus sp.", "scenedesmus sp.": "scenedesmus sp.",
     }
     return alias.get(s2, s2)
+    from difflib import get_close_matches
+def _canon_categorical_for_encoder(col: str, v, enc) -> str:
+    """
+    Map user's string to a label known by the saved LabelEncoder.
+    - Normalize + alias species/media
+    - Try exact match to encoder.classes_
+    - Try normalized-to-original mapping
+    - Try fuzzy match
+    - Fall back to 'nan' (if present) or the first class
+    """
+    s = "nan" if pd.isna(v) else str(v).strip().lower()
+    # apply your canonicalizers
+    if col == "species":
+        s = _normalize_species_label(s)
+    elif col == "media":
+        s = _canon_media_for_bounds(s)
+    # quick exact match if encoder classes already lowercased
+    if s in enc.classes_:
+        return s
+    # build a normalized->original lookup over the encoder classes
+    norm_map = {str(c).strip().lower(): c for c in enc.classes_}
+    if s in norm_map:
+        return norm_map[s]
+    # try a couple of punctuation/spacing tweaks (common in your data)
+    s2 = s.replace(" .", ".").replace(". ", ".")
+    if s2 in norm_map:
+        return norm_map[s2]
+    # fuzzy match to what's in the encoder
+    hits = get_close_matches(s, list(norm_map.keys()), n=1, cutoff=0.6)
+    if hits:
+        return norm_map[hits[0]]
+    # graceful fallback
+    if "nan" in enc.classes_:
+        return "nan"
+    return enc.classes_[0]
 def _format_suggestion_md(species: str, target: str) -> str:
     sp = _normalize_species_label(species)
         "_c": temp_c, "ph": ph, "days": days
     }], columns=FEATURES)
+    # encode categoricals — robust to unseen labels
+for col in bundle.categorical_cols:
+    if col in X.columns:
+        enc = bundle.encoders[col]
+        def _to_known_code(v):
+            known = _canon_categorical_for_encoder(col, v, enc)
+            return enc.transform([known])[0]
+        X[col] = X[col].apply(_to_known_code)
     # numerics
     row["light"] = row["light"].apply(parse_cycle_first)