ashaddams committed on
Commit
d386725
·
verified ·
1 Parent(s): 5defdae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -3
app.py CHANGED
@@ -289,6 +289,49 @@ def _normalize_species_label(s: str) -> str:
289
  "scenedesmus": "scenedesmus sp.", "scenedesmus sp": "scenedesmus sp.", "scenedesmus sp.": "scenedesmus sp.",
290
  }
291
  return alias.get(s2, s2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  def _format_suggestion_md(species: str, target: str) -> str:
294
  sp = _normalize_species_label(species)
@@ -562,9 +605,14 @@ def preprocess_row(species, media, light, expo_day, expo_night, temp_c, ph, days
562
  "_c": temp_c, "ph": ph, "days": days
563
  }], columns=FEATURES)
564
 
565
- # encode cats
566
- for col in CATEGORICAL:
567
- row[col] = encoders[col].transform([row.loc[0, col]])[0]
 
 
 
 
 
568
 
569
  # numerics
570
  row["light"] = row["light"].apply(parse_cycle_first)
 
289
  "scenedesmus": "scenedesmus sp.", "scenedesmus sp": "scenedesmus sp.", "scenedesmus sp.": "scenedesmus sp.",
290
  }
291
  return alias.get(s2, s2)
292
+ from difflib import get_close_matches
293
+
294
def _canon_categorical_for_encoder(col: str, v, enc) -> str:
    """
    Map a user-supplied categorical value to a label the fitted encoder knows.

    Resolution order (first hit wins):
      1. Normalize: missing values become "nan"; anything else is stringified,
         stripped and lower-cased. "species" values are then passed through
         _normalize_species_label and "media" values through
         _canon_media_for_bounds.
      2. Exact match against ``enc.classes_``.
      3. Case/whitespace-insensitive match against ``enc.classes_``.
      4. Same as (3) after collapsing " ." and ". " to ".".
      5. Closest fuzzy match (difflib, cutoff 0.6).
      6. Fallback: the encoder's "nan" class if it has one (matched
         case-insensitively and returned in the encoder's own spelling),
         otherwise the first class.

    Args:
        col: Column name; only "species" and "media" trigger extra
            canonicalization.
        v: Raw scalar value for that column (may be missing/NaN).
        enc: Object exposing a ``classes_`` sequence of known labels
            (presumably a fitted sklearn LabelEncoder -- confirm with caller).

    Returns:
        A label taken from ``enc.classes_`` (or the normalized string itself
        when it matches a class exactly), safe to pass to ``enc.transform``.

    Raises:
        IndexError: if ``enc.classes_`` is empty and nothing matched.
    """
    # Local import keeps the helper self-contained.
    from difflib import get_close_matches

    s = "nan" if pd.isna(v) else str(v).strip().lower()

    # Column-specific canonicalizers.
    if col == "species":
        s = _normalize_species_label(s)
    elif col == "media":
        s = _canon_media_for_bounds(s)

    # Snapshot the known labels once; every later step reuses it.
    classes = list(enc.classes_)

    # Quick exact match (covers encoders whose classes are already lower-case).
    if s in classes:
        return s

    # Normalized -> original spelling lookup over the encoder classes.
    norm_map = {str(c).strip().lower(): c for c in classes}
    if s in norm_map:
        return norm_map[s]

    # Tolerate stray spaces around periods, e.g. "chlorella sp ." vs "sp.".
    s2 = s.replace(" .", ".").replace(". ", ".")
    if s2 in norm_map:
        return norm_map[s2]

    # Fuzzy match against what the encoder actually knows.
    hits = get_close_matches(s, list(norm_map), n=1, cutoff=0.6)
    if hits:
        return norm_map[hits[0]]

    # Graceful fallback. Look "nan" up through the normalized map so a class
    # stored as "NaN" or " nan " is found too, and return the encoder's own
    # spelling so enc.transform() accepts it.
    if "nan" in norm_map:
        return norm_map["nan"]
    return classes[0]
334
+
335
 
336
  def _format_suggestion_md(species: str, target: str) -> str:
337
  sp = _normalize_species_label(species)
 
605
  "_c": temp_c, "ph": ph, "days": days
606
  }], columns=FEATURES)
607
 
608
+ # encode categoricals — robust to unseen labels
609
+ for col in bundle.categorical_cols:
610
+ if col in X.columns:
611
+ enc = bundle.encoders[col]
612
+ def _to_known_code(v):
613
+ known = _canon_categorical_for_encoder(col, v, enc)
614
+ return enc.transform([known])[0]
615
+ X[col] = X[col].apply(_to_known_code)
616
 
617
  # numerics
618
  row["light"] = row["light"].apply(parse_cycle_first)