Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -289,6 +289,49 @@ def _normalize_species_label(s: str) -> str:
|
|
| 289 |
"scenedesmus": "scenedesmus sp.", "scenedesmus sp": "scenedesmus sp.", "scenedesmus sp.": "scenedesmus sp.",
|
| 290 |
}
|
| 291 |
return alias.get(s2, s2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
def _format_suggestion_md(species: str, target: str) -> str:
|
| 294 |
sp = _normalize_species_label(species)
|
|
@@ -562,9 +605,14 @@ def preprocess_row(species, media, light, expo_day, expo_night, temp_c, ph, days
|
|
| 562 |
"_c": temp_c, "ph": ph, "days": days
|
| 563 |
}], columns=FEATURES)
|
| 564 |
|
| 565 |
-
# encode
|
| 566 |
-
|
| 567 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
|
| 569 |
# numerics
|
| 570 |
row["light"] = row["light"].apply(parse_cycle_first)
|
|
|
|
| 289 |
"scenedesmus": "scenedesmus sp.", "scenedesmus sp": "scenedesmus sp.", "scenedesmus sp.": "scenedesmus sp.",
|
| 290 |
}
|
| 291 |
return alias.get(s2, s2)
|
| 292 |
+
from difflib import get_close_matches
|
| 293 |
+
|
| 294 |
+
def _canon_categorical_for_encoder(col: str, v, enc) -> str:
    """
    Map a user-supplied categorical value to a label known by the saved encoder.

    Resolution order:
      1. Normalize the value (strip + lower-case; missing values become the
         literal string 'nan') and, for the 'species' / 'media' columns, run
         it through the file's canonicalizer for that column.
      2. Exact match against ``enc.classes_``.
      3. Case/whitespace-insensitive match against ``enc.classes_``.
      4. The same lookup after collapsing spaces around periods.
      5. Fuzzy match (difflib, cutoff 0.6). Skipped for missing values so the
         placeholder text 'nan' is never mistaken for a look-alike class.
      6. Fall back to the encoder's 'nan' class (any casing) if it has one,
         otherwise to the first class.

    Args:
        col: Column name; only 'species' and 'media' get extra canonicalization.
        v: Raw value from the user (may be None/NaN).
        enc: Fitted encoder exposing ``classes_``.

    Returns:
        A member of ``enc.classes_`` (or the normalized string when it is an
        exact match), safe to pass to ``enc.transform``.

    Raises:
        IndexError: if ``enc.classes_`` is empty and nothing matched.
    """
    s = "nan" if pd.isna(v) else str(v).strip().lower()
    # Remember this before canonicalization: 'nan' here is a placeholder for
    # "no value", not text the user typed that deserves a fuzzy guess.
    is_missing = s == "nan"

    # apply the column-specific canonicalizers
    if col == "species":
        s = _normalize_species_label(s)
    elif col == "media":
        s = _canon_media_for_bounds(s)

    # quick exact match if encoder classes are already lower-cased
    if s in enc.classes_:
        return s

    # normalized -> original lookup over the encoder classes
    norm_map = {str(c).strip().lower(): c for c in enc.classes_}
    if s in norm_map:
        return norm_map[s]

    # punctuation/spacing tweaks around periods (e.g. "sp ." -> "sp.")
    s2 = s.replace(" .", ".").replace(". ", ".")
    if s2 in norm_map:
        return norm_map[s2]

    # fuzzy match to what's in the encoder (never for missing values)
    if not is_missing:
        hits = get_close_matches(s, list(norm_map), n=1, cutoff=0.6)
        if hits:
            return norm_map[hits[0]]

    # graceful fallback: prefer the encoder's own 'nan' class, matched the
    # same case-insensitive way as every lookup above
    if "nan" in norm_map:
        return norm_map["nan"]
    return enc.classes_[0]
|
| 334 |
+
|
| 335 |
|
| 336 |
def _format_suggestion_md(species: str, target: str) -> str:
|
| 337 |
sp = _normalize_species_label(species)
|
|
|
|
| 605 |
"_c": temp_c, "ph": ph, "days": days
|
| 606 |
}], columns=FEATURES)
|
| 607 |
|
| 608 |
+
# encode categoricals — robust to unseen labels
|
| 609 |
+
for col in bundle.categorical_cols:
|
| 610 |
+
if col in X.columns:
|
| 611 |
+
enc = bundle.encoders[col]
|
| 612 |
+
def _to_known_code(v):
|
| 613 |
+
known = _canon_categorical_for_encoder(col, v, enc)
|
| 614 |
+
return enc.transform([known])[0]
|
| 615 |
+
X[col] = X[col].apply(_to_known_code)
|
| 616 |
|
| 617 |
# numerics
|
| 618 |
row["light"] = row["light"].apply(parse_cycle_first)
|