Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -538,23 +538,29 @@ def _maybe_load_doi():
|
|
| 538 |
.str.lower()
|
| 539 |
.str.replace("[^0-9a-zA-Z]+", "_", regex=True)
|
| 540 |
)
|
|
|
|
| 541 |
for c in ["species", "media"]:
|
| 542 |
if c in df_doi_raw.columns:
|
| 543 |
df_doi_raw[c] = df_doi_raw[c].map(normalize_str)
|
|
|
|
|
|
|
| 544 |
if "light" in df_doi_raw.columns:
|
| 545 |
df_doi_raw["light"] = coerce_numeric(df_doi_raw["light"], "cycle_first")
|
| 546 |
for c in ["expo_day","expo_night","_c","ph","days"]:
|
| 547 |
if c in df_doi_raw.columns:
|
| 548 |
df_doi_raw[c] = coerce_numeric(df_doi_raw[c], "float")
|
| 549 |
|
|
|
|
| 550 |
doi_col_candidates = [c for c in df_doi_raw.columns if c in {"doi","doi_id","reference","url","link"}]
|
| 551 |
doi_col = doi_col_candidates[0] if doi_col_candidates else None
|
| 552 |
|
| 553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
scales = {}
|
| 555 |
-
for col in
|
| 556 |
-
if col not in df_doi_raw.columns:
|
| 557 |
-
continue
|
| 558 |
v = pd.to_numeric(df_doi_raw[col], errors="coerce").dropna()
|
| 559 |
if len(v) >= 4:
|
| 560 |
lo, hi = np.percentile(v, [5,95]); span = max(1e-6, hi - lo)
|
|
@@ -563,10 +569,12 @@ def _maybe_load_doi():
|
|
| 563 |
else:
|
| 564 |
span = 1.0
|
| 565 |
scales[col] = span
|
|
|
|
| 566 |
return df_doi_raw, doi_col, scales, True
|
| 567 |
except Exception:
|
| 568 |
return None, None, None, False
|
| 569 |
|
|
|
|
| 570 |
df_doi_raw, DOI_COL, DOI_SCALES, DOI_READY = _maybe_load_doi()
|
| 571 |
|
| 572 |
def _media_similarity(a, b):
|
|
@@ -587,16 +595,33 @@ def _doi_url(x):
|
|
| 587 |
s = s.lower().replace("doi:", "").strip()
|
| 588 |
return f"https://doi.org/{s}"
|
| 589 |
|
| 590 |
-
def _closest_doi(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
if not DOI_READY or df_doi_raw is None or len(df_doi_raw) == 0:
|
| 592 |
return "> ℹ️ doi.csv not found or not readable."
|
|
|
|
|
|
|
| 593 |
s_key = _normalize_species_label(normalize_str(species))
|
| 594 |
-
df_cand = df_doi_raw[df_doi_raw
|
| 595 |
-
if df_cand.empty:
|
| 596 |
sp_unique = df_doi_raw["species"].dropna().unique().tolist()
|
| 597 |
best = get_close_matches(s_key, sp_unique, n=1, cutoff=0.6)
|
| 598 |
df_cand = df_doi_raw[df_doi_raw["species"] == (best[0] if best else s_key)]
|
|
|
|
|
|
|
| 599 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
q = {
|
| 601 |
"light": parse_cycle_first(light),
|
| 602 |
"expo_day": extract_first_float(expo_day),
|
|
@@ -606,31 +631,72 @@ def _closest_doi(species, media, light, expo_day, expo_night, temp_c, ph, days,
|
|
| 606 |
"days": extract_first_float(days),
|
| 607 |
}
|
| 608 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 609 |
rows = []
|
| 610 |
for _, r in df_cand.iterrows():
|
| 611 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
dist = 0.0; denom = 0
|
| 613 |
for col in ["light","expo_day","expo_night","_c","ph","days"]:
|
| 614 |
if col in df_cand.columns:
|
| 615 |
rv, qv = r.get(col, np.nan), q[col]
|
| 616 |
if pd.notna(rv) and pd.notna(qv):
|
| 617 |
span = DOI_SCALES.get(col, 1.0) if DOI_SCALES else 1.0
|
| 618 |
-
dist += abs(float(qv) - float(rv)) / span
|
|
|
|
| 619 |
dist = dist/denom if denom>0 else 1.0
|
| 620 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
if not rows:
|
| 622 |
return "> ℹ️ No comparable rows in doi.csv."
|
| 623 |
-
rows.sort(key=lambda x: x[0]); top = rows[:topk]
|
| 624 |
|
| 625 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
for rank, (score, r) in enumerate(top, 1):
|
|
|
|
| 627 |
sim_pct = max(0.0, min(100.0, 100.0 * np.exp(-score)))
|
| 628 |
doi_link = _doi_url(r.get(DOI_COL)) if DOI_COL else None
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
return md
|
| 635 |
|
| 636 |
# -----------------------------
|
|
@@ -1027,7 +1093,37 @@ def predict_and_plot_ui(
|
|
| 1027 |
|
| 1028 |
|
| 1029 |
def doi_matches_ui(target, species, media, light, expo_day, expo_night, temp_c, ph, days):
|
| 1030 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1031 |
|
| 1032 |
# -----------------------------
|
| 1033 |
# UI — professional layout
|
|
|
|
| 538 |
.str.lower()
|
| 539 |
.str.replace("[^0-9a-zA-Z]+", "_", regex=True)
|
| 540 |
)
|
| 541 |
+
# normalize categoricals
|
| 542 |
for c in ["species", "media"]:
|
| 543 |
if c in df_doi_raw.columns:
|
| 544 |
df_doi_raw[c] = df_doi_raw[c].map(normalize_str)
|
| 545 |
+
|
| 546 |
+
# parse numerics
|
| 547 |
if "light" in df_doi_raw.columns:
|
| 548 |
df_doi_raw["light"] = coerce_numeric(df_doi_raw["light"], "cycle_first")
|
| 549 |
for c in ["expo_day","expo_night","_c","ph","days"]:
|
| 550 |
if c in df_doi_raw.columns:
|
| 551 |
df_doi_raw[c] = coerce_numeric(df_doi_raw[c], "float")
|
| 552 |
|
| 553 |
+
# find a DOI-like column to link
|
| 554 |
doi_col_candidates = [c for c in df_doi_raw.columns if c in {"doi","doi_id","reference","url","link"}]
|
| 555 |
doi_col = doi_col_candidates[0] if doi_col_candidates else None
|
| 556 |
|
| 557 |
+
# build scales for numeric cols, including any target columns present
|
| 558 |
+
base_num = ["light","expo_day","expo_night","_c","ph","days"]
|
| 559 |
+
target_cols_present = [t for t in TARGETS if t in df_doi_raw.columns]
|
| 560 |
+
num_cols = base_num + target_cols_present
|
| 561 |
+
|
| 562 |
scales = {}
|
| 563 |
+
for col in num_cols:
|
|
|
|
|
|
|
| 564 |
v = pd.to_numeric(df_doi_raw[col], errors="coerce").dropna()
|
| 565 |
if len(v) >= 4:
|
| 566 |
lo, hi = np.percentile(v, [5,95]); span = max(1e-6, hi - lo)
|
|
|
|
| 569 |
else:
|
| 570 |
span = 1.0
|
| 571 |
scales[col] = span
|
| 572 |
+
|
| 573 |
return df_doi_raw, doi_col, scales, True
|
| 574 |
except Exception:
|
| 575 |
return None, None, None, False
|
| 576 |
|
| 577 |
+
|
| 578 |
df_doi_raw, DOI_COL, DOI_SCALES, DOI_READY = _maybe_load_doi()
|
| 579 |
|
| 580 |
def _media_similarity(a, b):
|
|
|
|
| 595 |
s = s.lower().replace("doi:", "").strip()
|
| 596 |
return f"https://doi.org/{s}"
|
| 597 |
|
| 598 |
+
def _closest_doi(
|
| 599 |
+
target_name, # "biomass" | "lipid" | "protein" | "carb"
|
| 600 |
+
species, media,
|
| 601 |
+
light, expo_day, expo_night, temp_c, ph, days,
|
| 602 |
+
y_target=None, # float | None (model point prediction for the target)
|
| 603 |
+
topk=5
|
| 604 |
+
):
|
| 605 |
if not DOI_READY or df_doi_raw is None or len(df_doi_raw) == 0:
|
| 606 |
return "> ℹ️ doi.csv not found or not readable."
|
| 607 |
+
|
| 608 |
+
# narrow to species (with fuzzy fallback)
|
| 609 |
s_key = _normalize_species_label(normalize_str(species))
|
| 610 |
+
df_cand = df_doi_raw[df_doi_raw.get("species", "") == s_key]
|
| 611 |
+
if df_cand.empty and "species" in df_doi_raw.columns:
|
| 612 |
sp_unique = df_doi_raw["species"].dropna().unique().tolist()
|
| 613 |
best = get_close_matches(s_key, sp_unique, n=1, cutoff=0.6)
|
| 614 |
df_cand = df_doi_raw[df_doi_raw["species"] == (best[0] if best else s_key)]
|
| 615 |
+
if df_cand.empty:
|
| 616 |
+
df_cand = df_doi_raw # last-resort: search whole table
|
| 617 |
|
| 618 |
+
# require rows that at least *have* a value for the chosen target (if present in table)
|
| 619 |
+
if target_name in df_cand.columns:
|
| 620 |
+
df_cand = df_cand[pd.to_numeric(df_cand[target_name], errors="coerce").notna()].copy()
|
| 621 |
+
if df_cand.empty:
|
| 622 |
+
return f"> ℹ️ No entries with '{target_name}' found for species filter."
|
| 623 |
+
|
| 624 |
+
# query vector
|
| 625 |
q = {
|
| 626 |
"light": parse_cycle_first(light),
|
| 627 |
"expo_day": extract_first_float(expo_day),
|
|
|
|
| 631 |
"days": extract_first_float(days),
|
| 632 |
}
|
| 633 |
|
| 634 |
+
# weights (tune if desired): give the target a stronger influence if we have y_target
|
| 635 |
+
w_media = 0.5 # penalty scale for media dissimilarity (0..0.5)
|
| 636 |
+
w_num = 1.0 # per-numeric component weight
|
| 637 |
+
w_tgt = 2.0 if y_target is not None else 0.0 # emphasize matching the target value
|
| 638 |
+
|
| 639 |
rows = []
|
| 640 |
for _, r in df_cand.iterrows():
|
| 641 |
+
# media similarity
|
| 642 |
+
sim = _media_similarity(media, r.get("media", ""))
|
| 643 |
+
media_penalty = (1.0 - sim) * w_media
|
| 644 |
+
|
| 645 |
+
# numeric distance (scaled by column span)
|
| 646 |
dist = 0.0; denom = 0
|
| 647 |
for col in ["light","expo_day","expo_night","_c","ph","days"]:
|
| 648 |
if col in df_cand.columns:
|
| 649 |
rv, qv = r.get(col, np.nan), q[col]
|
| 650 |
if pd.notna(rv) and pd.notna(qv):
|
| 651 |
span = DOI_SCALES.get(col, 1.0) if DOI_SCALES else 1.0
|
| 652 |
+
dist += w_num * abs(float(qv) - float(rv)) / span
|
| 653 |
+
denom += 1
|
| 654 |
dist = dist/denom if denom>0 else 1.0
|
| 655 |
+
|
| 656 |
+
# target proximity (if we have both the column and a predicted y)
|
| 657 |
+
tgt_term = 0.0
|
| 658 |
+
if w_tgt > 0 and target_name in df_cand.columns:
|
| 659 |
+
rv = r.get(target_name, np.nan)
|
| 660 |
+
if pd.notna(rv):
|
| 661 |
+
span = DOI_SCALES.get(target_name, 1.0) if DOI_SCALES else 1.0
|
| 662 |
+
tgt_term = w_tgt * abs(float(y_target) - float(rv)) / span
|
| 663 |
+
|
| 664 |
+
score = media_penalty + dist + tgt_term
|
| 665 |
+
rows.append((score, r))
|
| 666 |
+
|
| 667 |
if not rows:
|
| 668 |
return "> ℹ️ No comparable rows in doi.csv."
|
|
|
|
| 669 |
|
| 670 |
+
# rank by combined score
|
| 671 |
+
rows.sort(key=lambda x: x[0])
|
| 672 |
+
top = rows[:topk]
|
| 673 |
+
|
| 674 |
+
# build markdown
|
| 675 |
+
head_note = f" (target: **{target_name}**"
|
| 676 |
+
if y_target is not None:
|
| 677 |
+
head_note += f", y≈**{float(y_target):.3f}**"
|
| 678 |
+
head_note += ")"
|
| 679 |
+
|
| 680 |
+
md = f"### 📚 Closest DOI matches{head_note}\n"
|
| 681 |
for rank, (score, r) in enumerate(top, 1):
|
| 682 |
+
# Convert score to a readable similarity heuristic
|
| 683 |
sim_pct = max(0.0, min(100.0, 100.0 * np.exp(-score)))
|
| 684 |
doi_link = _doi_url(r.get(DOI_COL)) if DOI_COL else None
|
| 685 |
+
|
| 686 |
+
title = f"**{rank}. {r.get('species','?')} — {r.get('media','?')}** · Similarity **{sim_pct:.1f}%**"
|
| 687 |
+
if doi_link:
|
| 688 |
+
title += f" · [DOI]({doi_link})"
|
| 689 |
+
md += title + "\n"
|
| 690 |
+
|
| 691 |
+
# details line including target value if present
|
| 692 |
+
tgt_str = ""
|
| 693 |
+
if target_name in df_cand.columns and pd.notna(r.get(target_name, np.nan)):
|
| 694 |
+
tgt_str = f" · {target_name}: {r.get(target_name)}"
|
| 695 |
+
|
| 696 |
+
md += (
|
| 697 |
+
f"• Light: {r.get('light','NA')} · Day: {r.get('expo_day','NA')} · Night: {r.get('expo_night','NA')} · "
|
| 698 |
+
f"T(°C): {r.get('_c','NA')} · pH: {r.get('ph','NA')} · Days: {r.get('days','NA')}{tgt_str}\n"
|
| 699 |
+
)
|
| 700 |
return md
|
| 701 |
|
| 702 |
# -----------------------------
|
|
|
|
| 1093 |
|
| 1094 |
|
| 1095 |
def doi_matches_ui(target, species, media, light, expo_day, expo_night, temp_c, ph, days):
    """Return the markdown produced by ``_closest_doi`` for the given inputs.

    A point prediction for ``target`` is attempted first: the "STACK" model is
    used when ``_available_models_for_target`` lists it, otherwise the first
    listed model. That prediction is handed to ``_closest_doi`` as
    ``y_target`` (with ``topk=5``) so rows can be ranked on target proximity as
    well as on culture conditions.

    When no model is available, the prediction raises, or the predicted value
    is not finite, ``y_target`` is passed as ``None``.

    Args:
        target: Name of the target to predict and match on.
        species, media, light, expo_day, expo_night, temp_c, ph, days:
            Raw condition values, forwarded unchanged to the model and to
            ``_closest_doi``.

    Returns:
        Whatever ``_closest_doi`` returns for these arguments.
    """
    yhat = None
    try:
        # Single-row frame keyed by the column names used elsewhere in this
        # file (note the temperature column is named "_c").
        df_one = pd.DataFrame([{
            "species": species, "media": media, "light": light,
            "expo_day": expo_day, "expo_night": expo_night,
            "_c": temp_c, "ph": ph, "days": days,
        }])
        # Prefer STACK if present; otherwise fall back to the first available model.
        avail = _available_models_for_target(target)
        chosen = "STACK" if "STACK" in avail else (avail[0] if avail else None)
        if chosen is not None:
            y_point = _predict_with_model_choice(target, chosen, df_one)
            candidate = float(y_point[0])
            # A NaN/inf prediction is not None, so _closest_doi would switch on
            # its target weight and every row's score would become NaN,
            # leaving the ranking undefined. Treat it as "no prediction".
            if np.isfinite(candidate):
                yhat = candidate
    except Exception:
        # Deliberate best-effort at the UI boundary: a failed prediction must
        # not prevent the DOI lookup, which then ranks on conditions only.
        yhat = None

    return _closest_doi(
        target_name=target,
        species=species, media=media,
        light=light, expo_day=expo_day, expo_night=expo_night,
        temp_c=temp_c, ph=ph, days=days,
        y_target=yhat,
        topk=5,
    )
|
| 1127 |
|
| 1128 |
# -----------------------------
|
| 1129 |
# UI — professional layout
|