Spaces:

ashaddamsAT
/

algae_yield_predictor

Build error

App Files Files Community

ashaddams committed on Sep 23, 2025

Commit

7da4f51

verified ·

1 Parent(s): 2f16d3c

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -18

app.py CHANGED Viewed

@@ -538,23 +538,29 @@ def _maybe_load_doi():
                      .str.lower()
                      .str.replace("[^0-9a-zA-Z]+", "_", regex=True)
         )
         for c in ["species", "media"]:
             if c in df_doi_raw.columns:
                 df_doi_raw[c] = df_doi_raw[c].map(normalize_str)
         if "light" in df_doi_raw.columns:
             df_doi_raw["light"] = coerce_numeric(df_doi_raw["light"], "cycle_first")
         for c in ["expo_day","expo_night","_c","ph","days"]:
             if c in df_doi_raw.columns:
                 df_doi_raw[c] = coerce_numeric(df_doi_raw[c], "float")
         doi_col_candidates = [c for c in df_doi_raw.columns if c in {"doi","doi_id","reference","url","link"}]
         doi_col = doi_col_candidates[0] if doi_col_candidates else None
-        NUMERIC_COLS = ["light","expo_day","expo_night","_c","ph","days"]
         scales = {}
-        for col in NUMERIC_COLS:
-            if col not in df_doi_raw.columns:
-                continue
             v = pd.to_numeric(df_doi_raw[col], errors="coerce").dropna()
             if len(v) >= 4:
                 lo, hi = np.percentile(v, [5,95]); span = max(1e-6, hi - lo)
@@ -563,10 +569,12 @@ def _maybe_load_doi():
             else:
                 span = 1.0
             scales[col] = span
         return df_doi_raw, doi_col, scales, True
     except Exception:
         return None, None, None, False
 df_doi_raw, DOI_COL, DOI_SCALES, DOI_READY = _maybe_load_doi()
 def _media_similarity(a, b):
@@ -587,16 +595,33 @@ def _doi_url(x):
     s = s.lower().replace("doi:", "").strip()
     return f"https://doi.org/{s}"
-def _closest_doi(species, media, light, expo_day, expo_night, temp_c, ph, days, topk=3):
     if not DOI_READY or df_doi_raw is None or len(df_doi_raw) == 0:
         return "> ℹ️ doi.csv not found or not readable."
     s_key = _normalize_species_label(normalize_str(species))
-    df_cand = df_doi_raw[df_doi_raw["species"] == s_key]
-    if df_cand.empty:
         sp_unique = df_doi_raw["species"].dropna().unique().tolist()
         best = get_close_matches(s_key, sp_unique, n=1, cutoff=0.6)
         df_cand = df_doi_raw[df_doi_raw["species"] == (best[0] if best else s_key)]
     q = {
         "light": parse_cycle_first(light),
         "expo_day": extract_first_float(expo_day),
@@ -606,31 +631,72 @@ def _closest_doi(species, media, light, expo_day, expo_night, temp_c, ph, days,
         "days": extract_first_float(days),
     }
     rows = []
     for _, r in df_cand.iterrows():
-        sim = _media_similarity(media, r.get("media", "")); media_penalty = (1.0 - sim) * 0.5
         dist = 0.0; denom = 0
         for col in ["light","expo_day","expo_night","_c","ph","days"]:
             if col in df_cand.columns:
                 rv, qv = r.get(col, np.nan), q[col]
                 if pd.notna(rv) and pd.notna(qv):
                     span = DOI_SCALES.get(col, 1.0) if DOI_SCALES else 1.0
-                    dist += abs(float(qv) - float(rv)) / span; denom += 1
         dist = dist/denom if denom>0 else 1.0
-        rows.append((media_penalty + dist, r))
     if not rows:
         return "> ℹ️ No comparable rows in doi.csv."
-    rows.sort(key=lambda x: x[0]); top = rows[:topk]
-    md = "### 📚 Closest DOI matches\n"
     for rank, (score, r) in enumerate(top, 1):
         sim_pct = max(0.0, min(100.0, 100.0 * np.exp(-score)))
         doi_link = _doi_url(r.get(DOI_COL)) if DOI_COL else None
-        head = f"**{rank}. {r.get('species','?')} — {r.get('media','?')}** · Similarity **{sim_pct:.1f}%**"
-        if doi_link: head += f" · [DOI]({doi_link})"
-        md += head + "\n"
-        md += (f"• Light: {r.get('light','NA')} · Day: {r.get('expo_day','NA')} · Night: {r.get('expo_night','NA')} · "
-               f"T(°C): {r.get('_c','NA')} · pH: {r.get('ph','NA')} · Days: {r.get('days','NA')}\n")
     return md
 # -----------------------------
@@ -1027,7 +1093,37 @@ def predict_and_plot_ui(
 def doi_matches_ui(target, species, media, light, expo_day, expo_night, temp_c, ph, days):
-    return _closest_doi(species, media, light, expo_day, expo_night, temp_c, ph, days, topk=3)
 # -----------------------------
 # UI — professional layout

                      .str.lower()
                      .str.replace("[^0-9a-zA-Z]+", "_", regex=True)
         )
+        # normalize categoricals
         for c in ["species", "media"]:
             if c in df_doi_raw.columns:
                 df_doi_raw[c] = df_doi_raw[c].map(normalize_str)
+        # parse numerics
         if "light" in df_doi_raw.columns:
             df_doi_raw["light"] = coerce_numeric(df_doi_raw["light"], "cycle_first")
         for c in ["expo_day","expo_night","_c","ph","days"]:
             if c in df_doi_raw.columns:
                 df_doi_raw[c] = coerce_numeric(df_doi_raw[c], "float")
+        # find a DOI-like column to link
         doi_col_candidates = [c for c in df_doi_raw.columns if c in {"doi","doi_id","reference","url","link"}]
         doi_col = doi_col_candidates[0] if doi_col_candidates else None
+        # build scales for numeric cols, including any target columns present
+        base_num = ["light","expo_day","expo_night","_c","ph","days"]
+        target_cols_present = [t for t in TARGETS if t in df_doi_raw.columns]
+        num_cols = base_num + target_cols_present
         scales = {}
+        for col in num_cols:
             v = pd.to_numeric(df_doi_raw[col], errors="coerce").dropna()
             if len(v) >= 4:
                 lo, hi = np.percentile(v, [5,95]); span = max(1e-6, hi - lo)
             else:
                 span = 1.0
             scales[col] = span
         return df_doi_raw, doi_col, scales, True
     except Exception:
         return None, None, None, False
 df_doi_raw, DOI_COL, DOI_SCALES, DOI_READY = _maybe_load_doi()
 def _media_similarity(a, b):
     s = s.lower().replace("doi:", "").strip()
     return f"https://doi.org/{s}"
+def _closest_doi(
+    target_name,            # "biomass" | "lipid" | "protein" | "carb"
+    species, media,
+    light, expo_day, expo_night, temp_c, ph, days,
+    y_target=None,          # float | None (model point prediction for the target)
+    topk=5
+):
     if not DOI_READY or df_doi_raw is None or len(df_doi_raw) == 0:
         return "> ℹ️ doi.csv not found or not readable."
+    # narrow to species (with fuzzy fallback)
     s_key = _normalize_species_label(normalize_str(species))
+    df_cand = df_doi_raw[df_doi_raw.get("species", "") == s_key]
+    if df_cand.empty and "species" in df_doi_raw.columns:
         sp_unique = df_doi_raw["species"].dropna().unique().tolist()
         best = get_close_matches(s_key, sp_unique, n=1, cutoff=0.6)
         df_cand = df_doi_raw[df_doi_raw["species"] == (best[0] if best else s_key)]
+    if df_cand.empty:
+        df_cand = df_doi_raw  # last-resort: search whole table
+    # require rows that at least *have* a value for the chosen target (if present in table)
+    if target_name in df_cand.columns:
+        df_cand = df_cand[pd.to_numeric(df_cand[target_name], errors="coerce").notna()].copy()
+        if df_cand.empty:
+            return f"> ℹ️ No entries with '{target_name}' found for species filter."
+    # query vector
     q = {
         "light": parse_cycle_first(light),
         "expo_day": extract_first_float(expo_day),
         "days": extract_first_float(days),
     }
+    # weights (tune if desired): give the target a stronger influence if we have y_target
+    w_media = 0.5        # penalty scale for media dissimilarity (0..0.5)
+    w_num   = 1.0        # per-numeric component weight
+    w_tgt   = 2.0 if y_target is not None else 0.0  # emphasize matching the target value
     rows = []
     for _, r in df_cand.iterrows():
+        # media similarity
+        sim = _media_similarity(media, r.get("media", ""))
+        media_penalty = (1.0 - sim) * w_media
+        # numeric distance (scaled by column span)
         dist = 0.0; denom = 0
         for col in ["light","expo_day","expo_night","_c","ph","days"]:
             if col in df_cand.columns:
                 rv, qv = r.get(col, np.nan), q[col]
                 if pd.notna(rv) and pd.notna(qv):
                     span = DOI_SCALES.get(col, 1.0) if DOI_SCALES else 1.0
+                    dist += w_num * abs(float(qv) - float(rv)) / span
+                    denom += 1
         dist = dist/denom if denom>0 else 1.0
+        # target proximity (if we have both the column and a predicted y)
+        tgt_term = 0.0
+        if w_tgt > 0 and target_name in df_cand.columns:
+            rv = r.get(target_name, np.nan)
+            if pd.notna(rv):
+                span = DOI_SCALES.get(target_name, 1.0) if DOI_SCALES else 1.0
+                tgt_term = w_tgt * abs(float(y_target) - float(rv)) / span
+        score = media_penalty + dist + tgt_term
+        rows.append((score, r))
     if not rows:
         return "> ℹ️ No comparable rows in doi.csv."
+    # rank by combined score
+    rows.sort(key=lambda x: x[0])
+    top = rows[:topk]
+    # build markdown
+    head_note = f" (target: **{target_name}**"
+    if y_target is not None:
+        head_note += f", y≈**{float(y_target):.3f}**"
+    head_note += ")"
+    md = f"### 📚 Closest DOI matches{head_note}\n"
     for rank, (score, r) in enumerate(top, 1):
+        # Convert score to a readable similarity heuristic
         sim_pct = max(0.0, min(100.0, 100.0 * np.exp(-score)))
         doi_link = _doi_url(r.get(DOI_COL)) if DOI_COL else None
+        title = f"**{rank}. {r.get('species','?')} — {r.get('media','?')}** · Similarity **{sim_pct:.1f}%**"
+        if doi_link:
+            title += f" · [DOI]({doi_link})"
+        md += title + "\n"
+        # details line including target value if present
+        tgt_str = ""
+        if target_name in df_cand.columns and pd.notna(r.get(target_name, np.nan)):
+            tgt_str = f" · {target_name}: {r.get(target_name)}"
+        md += (
+            f"• Light: {r.get('light','NA')} · Day: {r.get('expo_day','NA')} · Night: {r.get('expo_night','NA')} · "
+            f"T(°C): {r.get('_c','NA')} · pH: {r.get('ph','NA')} · Days: {r.get('days','NA')}{tgt_str}\n"
+        )
     return md
 # -----------------------------
 def doi_matches_ui(target, species, media, light, expo_day, expo_night, temp_c, ph, days):
     """Return Markdown listing the DOI rows closest to the given conditions.

     A point prediction for ``target`` is attempted first (the "STACK" model
     when it is listed as available, otherwise the first available model).
     A finite prediction is forwarded to ``_closest_doi`` as ``y_target`` so
     the ranking also considers proximity of the target value; otherwise
     ``y_target`` is ``None`` and the ranking uses culture conditions only.

     Args:
         target: Name of the target being predicted; forwarded to
             ``_closest_doi`` as ``target_name``.
         species, media, light, expo_day, expo_night, temp_c, ph, days:
             Raw UI inputs; passed through unchanged to the model row and
             to ``_closest_doi``.

     Returns:
         Whatever ``_closest_doi`` returns for ``topk=5``.
     """
     import logging  # local import: the module-level import list is not visible here
     yhat = None
     try:
         avail = _available_models_for_target(target)
         # prefer STACK if present; otherwise fall back to the first available model
         chosen = "STACK" if "STACK" in avail else (avail[0] if avail else None)
         if chosen is not None:
             df_one = pd.DataFrame([{
                 "species": species, "media": media, "light": light,
                 "expo_day": expo_day, "expo_night": expo_night,
                 "_c": temp_c, "ph": ph, "days": days,
             }])
             y_point = _predict_with_model_choice(target, chosen, df_one)
             # Flatten first so scalar, 1-D and (1, 1)-shaped outputs all work.
             first = float(np.ravel(np.asarray(y_point, dtype=float))[0])
             # A NaN/inf prediction would turn every row's score in
             # _closest_doi into NaN, so treat it as "no prediction".
             if np.isfinite(first):
                 yhat = first
     except Exception:
         # Best-effort: the DOI lookup must still work without a prediction,
         # but leave a trace so model failures are diagnosable.
         logging.getLogger(__name__).warning(
             "Point prediction for target %r failed; ranking DOI matches without it.",
             target, exc_info=True,
         )
         yhat = None
     return _closest_doi(
         target_name=target,
         species=species, media=media,
         light=light, expo_day=expo_day, expo_night=expo_night, temp_c=temp_c, ph=ph, days=days,
         y_target=yhat,
         topk=5,
     )
 # -----------------------------
 # UI — professional layout