Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -175,6 +175,57 @@ def get_bounds(species: str, media: str, target: str):
|
|
| 175 |
return None, None
|
| 176 |
lo, hi = rng[tg]
|
| 177 |
return float(lo), float(hi)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
# -----------------------------
|
| 180 |
# Helpers
|
|
@@ -744,19 +795,31 @@ def _load_ensemble(target: str) -> EnsembleBundle:
|
|
| 744 |
return bundle
|
| 745 |
|
| 746 |
def _encode_df_for_bundle(bundle: EnsembleBundle, df_like: pd.DataFrame) -> pd.DataFrame:
|
| 747 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 748 |
def _norm(x):
|
| 749 |
return "nan" if pd.isna(x) else str(x).strip().lower()
|
| 750 |
|
|
|
|
| 751 |
X = pd.DataFrame({c: df_like[c] if c in df_like.columns else np.nan for c in bundle.feature_order})
|
| 752 |
|
| 753 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
for col in bundle.categorical_cols:
|
| 755 |
-
|
| 756 |
-
X[col] = X[col].map(_norm).astype(str)
|
| 757 |
-
X[col] = X[col].apply(lambda v: bundle.encoders[col].transform([v])[0])
|
| 758 |
|
| 759 |
-
#
|
| 760 |
def _extract_first_float(x):
|
| 761 |
if pd.isna(x): return np.nan
|
| 762 |
s = str(x); m = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", s)
|
|
@@ -773,6 +836,7 @@ def _encode_df_for_bundle(bundle: EnsembleBundle, df_like: pd.DataFrame) -> pd.D
|
|
| 773 |
if c in X.columns:
|
| 774 |
X[c] = X[c].apply(_extract_first_float)
|
| 775 |
|
|
|
|
| 776 |
X_imp = pd.DataFrame(bundle.imputer.transform(X[bundle.feature_order]), columns=bundle.feature_order)
|
| 777 |
return X_imp
|
| 778 |
|
|
|
|
| 175 |
return None, None
|
| 176 |
lo, hi = rng[tg]
|
| 177 |
return float(lo), float(hi)
|
| 178 |
+
# --- Robust canonicalization for species/media -> encoder classes ---
|
| 179 |
+
# Accepts dotted-without-space, dotted-with-space, synonyms, fuzzy fallback.
|
| 180 |
+
|
| 181 |
+
# Canonical species label -> alternative spellings accepted for that label.
# Each key is the canonical token; the list holds the other spellings
# (dotted-without-space, undotted, and full-name synonyms) that resolve to it.
SPECIES_ALIASES_CANON = {
    "a. platensis": [
        "a.platensis",
        "a platensis",
        "arthrospira platensis",
        "spirulina platensis",
    ],
    "c. pyrenoidosa": [
        "c.pyrenoidosa",
        "c pyrenoidosa",
        "chlorella pyrenoidosa",
    ],
    "c. sorokiniana": [
        "c.sorokiniana",
        "c sorokiniana",
        "chlorella sorokiniana",
    ],
    "c. variabilis": [
        "c.variabilis",
        "c variabilis",
        "chlorella variabilis",
    ],
    "c. vulgaris": [
        "c.vulgaris",
        "c vulgaris",
        "chlorella vulgaris",
    ],
    "c. zofingiensis": [
        "c.zofingiensis",
        "c zofingiensis",
        "chromochloris zofingiensis",
        "chlorella zofingiensis",
    ],
    "h. pluvialis": [
        "h.pluvialis",
        "h pluvialis",
        "haematococcus pluvialis",
    ],
    "p. purpureum": [
        "p.purpureum",
        "p purpureum",
        "porphyridium purpureum",
    ],
    "scenedesmus sp.": [
        "scenedesmus",
        "scenedesmus sp",
        "desmodesmus sp.",
    ],
}
|
| 192 |
+
|
| 193 |
+
def _canon_from_alias(value: str, alias_map: dict[str, list[str]]) -> str:
    """Map a free-form label onto its canonical key in ``alias_map``.

    The value is first passed through ``normalize_str`` (a helper defined
    elsewhere in this file). The normalized value, followed by a few
    dot/space spelling variants of it, is then compared against every
    canonical key and every normalized synonym in ``alias_map``.

    Args:
        value: Raw label to canonicalize.
        alias_map: Mapping of canonical key -> list of synonym spellings.

    Returns:
        The canonical key of the first candidate that matches, or the
        normalized value unchanged when nothing matches.
    """
    v = normalize_str(value)
    if v in alias_map:
        return v

    # Reverse index built once per call: key or normalized synonym -> key.
    # setdefault keeps the first key (in map order) that claims a token, which
    # is the same winner a sequential scan over alias_map.items() would pick.
    lookup: dict[str, str] = {}
    for key, synonyms in alias_map.items():
        lookup.setdefault(key, key)
        for syn in synonyms:
            lookup.setdefault(normalize_str(syn), key)

    # Punctuation/spacing heuristics for dotted labels.
    dot_tight = v.replace(" .", ".").replace(". ", ".")   # collapse spaces around dots
    dot_spaced = v.replace(" .", ".").replace(".", ". ")  # ensure a space after each dot
    # Inserting ". " can leave doubled or trailing spaces ("x . y" -> "x.  y",
    # "sp." -> "sp. "); the whitespace-collapsed form is tried last so that it
    # only adds matches and never pre-empts one of the earlier candidates.
    dot_spaced_clean = " ".join(dot_spaced.split())

    for candidate in (v, dot_tight, dot_spaced, dot_spaced_clean):
        if candidate in lookup:
            return lookup[candidate]
    return v
|
| 211 |
+
|
| 212 |
+
def _canon_to_known(value: str, known_classes: list[str] | np.ndarray, alias_map: dict[str, list[str]]) -> str:
    """Resolve ``value`` to a token that is present in ``known_classes``.

    Resolution order (first hit wins):
      1. the canonical form returned by ``_canon_from_alias``, if it is known;
      2. an alias-map key whose key/synonyms contain that form, if the key
         is known;
      3. a synonym spelling of such a key that is itself known (covers
         encoders whose classes use an alternative spelling of the label);
      4. the closest fuzzy match among the known classes (similarity >= 0.6);
      5. ``"nan"`` if it is a known class, otherwise the first known class.

    Args:
        value: Raw label to resolve.
        known_classes: Tokens accepted by the encoder; each is compared as ``str``.
        alias_map: Mapping of canonical key -> list of synonym spellings.

    Returns:
        A string that is an element of ``known_classes`` (after ``str()``).

    Raises:
        IndexError: If ``known_classes`` is empty, since no valid token exists.
    """
    known = [str(k) for k in known_classes]
    if not known:
        # Same exception type the bare ``known[0]`` fallback would raise,
        # but with a message that names the actual problem.
        raise IndexError("known_classes is empty; cannot map value to a known class")
    known_set = set(known)  # O(1) membership for the repeated checks below

    v = _canon_from_alias(value, alias_map)
    if v in known_set:
        return v

    # Collect every alias entry that v belongs to (as key or as synonym).
    matched = []
    for key, synonyms in alias_map.items():
        norm_synonyms = [normalize_str(s) for s in synonyms]
        if v == key or v in norm_synonyms:
            matched.append((key, synonyms, norm_synonyms))

    # If an alias key maps to a known token, use it.
    for key, _, _ in matched:
        if key in known_set:
            return key

    # The canonical key is not a known class: accept a synonym spelling that is.
    for _, synonyms, norm_synonyms in matched:
        for raw, norm in zip(synonyms, norm_synonyms):
            if norm in known_set:
                return norm
            if raw in known_set:
                return raw

    # Fuzzy match against the known classes.
    hit = get_close_matches(v, known, n=1, cutoff=0.6)
    if hit:
        return hit[0]

    # Last resort: 'nan' if present, else the first entry of known_classes.
    # NOTE(review): index 0 is simply the first stored class, not necessarily
    # the most frequent one (presumably classes_ comes from a label encoder
    # that stores its classes sorted; confirm against the training code).
    return "nan" if "nan" in known_set else known[0]
|
| 229 |
|
| 230 |
# -----------------------------
|
| 231 |
# Helpers
|
|
|
|
| 795 |
return bundle
|
| 796 |
|
| 797 |
def _encode_df_for_bundle(bundle: EnsembleBundle, df_like: pd.DataFrame) -> pd.DataFrame:
|
| 798 |
+
"""
|
| 799 |
+
Apply the SAVED encoders + numeric parsing + SAVED imputer; returns imputed numeric DF in training feature order.
|
| 800 |
+
Critically: canonicalizes species/media to avoid 'previously unseen labels' errors.
|
| 801 |
+
"""
|
| 802 |
def _norm(x):
|
| 803 |
return "nan" if pd.isna(x) else str(x).strip().lower()
|
| 804 |
|
| 805 |
+
# Build X with exactly the training feature order
|
| 806 |
X = pd.DataFrame({c: df_like[c] if c in df_like.columns else np.nan for c in bundle.feature_order})
|
| 807 |
|
| 808 |
+
# ---- Canonicalize categoricals to known encoder classes BEFORE transform ----
|
| 809 |
+
if "species" in X.columns:
|
| 810 |
+
X["species"] = X["species"].map(_norm).apply(
|
| 811 |
+
lambda v: _canon_to_known(v, bundle.encoders["species"].classes_, SPECIES_ALIASES_CANON)
|
| 812 |
+
)
|
| 813 |
+
if "media" in X.columns:
|
| 814 |
+
X["media"] = X["media"].map(_norm).apply(
|
| 815 |
+
lambda v: _canon_to_known(v, bundle.encoders["media"].classes_, MEDIA_ALIASES)
|
| 816 |
+
)
|
| 817 |
+
|
| 818 |
+
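# species/media were canonicalized to known encoder classes above; any other column in bundle.categorical_cols is only cast to str here, so its encoder can still reject unseen labels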
|
| 819 |
for col in bundle.categorical_cols:
|
| 820 |
+
X[col] = bundle.encoders[col].transform(X[col].astype(str))
|
|
|
|
|
|
|
| 821 |
|
| 822 |
+
# ---- Numerics – parse exactly like training ----
|
| 823 |
def _extract_first_float(x):
|
| 824 |
if pd.isna(x): return np.nan
|
| 825 |
s = str(x); m = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", s)
|
|
|
|
| 836 |
if c in X.columns:
|
| 837 |
X[c] = X[c].apply(_extract_first_float)
|
| 838 |
|
| 839 |
+
# ---- Impute to match training numeric space ----
|
| 840 |
X_imp = pd.DataFrame(bundle.imputer.transform(X[bundle.feature_order]), columns=bundle.feature_order)
|
| 841 |
return X_imp
|
| 842 |
|