ashaddams committed on
Commit
7da4f51
·
verified ·
1 Parent(s): 2f16d3c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -18
app.py CHANGED
@@ -538,23 +538,29 @@ def _maybe_load_doi():
538
  .str.lower()
539
  .str.replace("[^0-9a-zA-Z]+", "_", regex=True)
540
  )
 
541
  for c in ["species", "media"]:
542
  if c in df_doi_raw.columns:
543
  df_doi_raw[c] = df_doi_raw[c].map(normalize_str)
 
 
544
  if "light" in df_doi_raw.columns:
545
  df_doi_raw["light"] = coerce_numeric(df_doi_raw["light"], "cycle_first")
546
  for c in ["expo_day","expo_night","_c","ph","days"]:
547
  if c in df_doi_raw.columns:
548
  df_doi_raw[c] = coerce_numeric(df_doi_raw[c], "float")
549
 
 
550
  doi_col_candidates = [c for c in df_doi_raw.columns if c in {"doi","doi_id","reference","url","link"}]
551
  doi_col = doi_col_candidates[0] if doi_col_candidates else None
552
 
553
- NUMERIC_COLS = ["light","expo_day","expo_night","_c","ph","days"]
 
 
 
 
554
  scales = {}
555
- for col in NUMERIC_COLS:
556
- if col not in df_doi_raw.columns:
557
- continue
558
  v = pd.to_numeric(df_doi_raw[col], errors="coerce").dropna()
559
  if len(v) >= 4:
560
  lo, hi = np.percentile(v, [5,95]); span = max(1e-6, hi - lo)
@@ -563,10 +569,12 @@ def _maybe_load_doi():
563
  else:
564
  span = 1.0
565
  scales[col] = span
 
566
  return df_doi_raw, doi_col, scales, True
567
  except Exception:
568
  return None, None, None, False
569
 
 
570
  df_doi_raw, DOI_COL, DOI_SCALES, DOI_READY = _maybe_load_doi()
571
 
572
  def _media_similarity(a, b):
@@ -587,16 +595,33 @@ def _doi_url(x):
587
  s = s.lower().replace("doi:", "").strip()
588
  return f"https://doi.org/{s}"
589
 
590
- def _closest_doi(species, media, light, expo_day, expo_night, temp_c, ph, days, topk=3):
 
 
 
 
 
 
591
  if not DOI_READY or df_doi_raw is None or len(df_doi_raw) == 0:
592
  return "> ℹ️ doi.csv not found or not readable."
 
 
593
  s_key = _normalize_species_label(normalize_str(species))
594
- df_cand = df_doi_raw[df_doi_raw["species"] == s_key]
595
- if df_cand.empty:
596
  sp_unique = df_doi_raw["species"].dropna().unique().tolist()
597
  best = get_close_matches(s_key, sp_unique, n=1, cutoff=0.6)
598
  df_cand = df_doi_raw[df_doi_raw["species"] == (best[0] if best else s_key)]
 
 
599
 
 
 
 
 
 
 
 
600
  q = {
601
  "light": parse_cycle_first(light),
602
  "expo_day": extract_first_float(expo_day),
@@ -606,31 +631,72 @@ def _closest_doi(species, media, light, expo_day, expo_night, temp_c, ph, days,
606
  "days": extract_first_float(days),
607
  }
608
 
 
 
 
 
 
609
  rows = []
610
  for _, r in df_cand.iterrows():
611
- sim = _media_similarity(media, r.get("media", "")); media_penalty = (1.0 - sim) * 0.5
 
 
 
 
612
  dist = 0.0; denom = 0
613
  for col in ["light","expo_day","expo_night","_c","ph","days"]:
614
  if col in df_cand.columns:
615
  rv, qv = r.get(col, np.nan), q[col]
616
  if pd.notna(rv) and pd.notna(qv):
617
  span = DOI_SCALES.get(col, 1.0) if DOI_SCALES else 1.0
618
- dist += abs(float(qv) - float(rv)) / span; denom += 1
 
619
  dist = dist/denom if denom>0 else 1.0
620
- rows.append((media_penalty + dist, r))
 
 
 
 
 
 
 
 
 
 
 
621
  if not rows:
622
  return "> ℹ️ No comparable rows in doi.csv."
623
- rows.sort(key=lambda x: x[0]); top = rows[:topk]
624
 
625
- md = "### 📚 Closest DOI matches\n"
 
 
 
 
 
 
 
 
 
 
626
  for rank, (score, r) in enumerate(top, 1):
 
627
  sim_pct = max(0.0, min(100.0, 100.0 * np.exp(-score)))
628
  doi_link = _doi_url(r.get(DOI_COL)) if DOI_COL else None
629
- head = f"**{rank}. {r.get('species','?')} — {r.get('media','?')}** · Similarity **{sim_pct:.1f}%**"
630
- if doi_link: head += f" · [DOI]({doi_link})"
631
- md += head + "\n"
632
- md += (f" Light: {r.get('light','NA')} · Day: {r.get('expo_day','NA')} · Night: {r.get('expo_night','NA')} · "
633
- f"T(°C): {r.get('_c','NA')} · pH: {r.get('ph','NA')} · Days: {r.get('days','NA')}\n")
 
 
 
 
 
 
 
 
 
 
634
  return md
635
 
636
  # -----------------------------
@@ -1027,7 +1093,37 @@ def predict_and_plot_ui(
1027
 
1028
 
1029
  def doi_matches_ui(target, species, media, light, expo_day, expo_night, temp_c, ph, days):
1030
- return _closest_doi(species, media, light, expo_day, expo_night, temp_c, ph, days, topk=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1031
 
1032
  # -----------------------------
1033
  # UI — professional layout
 
538
  .str.lower()
539
  .str.replace("[^0-9a-zA-Z]+", "_", regex=True)
540
  )
541
+ # normalize categoricals
542
  for c in ["species", "media"]:
543
  if c in df_doi_raw.columns:
544
  df_doi_raw[c] = df_doi_raw[c].map(normalize_str)
545
+
546
+ # parse numerics
547
  if "light" in df_doi_raw.columns:
548
  df_doi_raw["light"] = coerce_numeric(df_doi_raw["light"], "cycle_first")
549
  for c in ["expo_day","expo_night","_c","ph","days"]:
550
  if c in df_doi_raw.columns:
551
  df_doi_raw[c] = coerce_numeric(df_doi_raw[c], "float")
552
 
553
+ # find a DOI-like column to link
554
  doi_col_candidates = [c for c in df_doi_raw.columns if c in {"doi","doi_id","reference","url","link"}]
555
  doi_col = doi_col_candidates[0] if doi_col_candidates else None
556
 
557
+ # build scales for numeric cols, including any target columns present
558
+ base_num = ["light","expo_day","expo_night","_c","ph","days"]
559
+ target_cols_present = [t for t in TARGETS if t in df_doi_raw.columns]
560
+ num_cols = base_num + target_cols_present
561
+
562
  scales = {}
563
+ for col in num_cols:
 
 
564
  v = pd.to_numeric(df_doi_raw[col], errors="coerce").dropna()
565
  if len(v) >= 4:
566
  lo, hi = np.percentile(v, [5,95]); span = max(1e-6, hi - lo)
 
569
  else:
570
  span = 1.0
571
  scales[col] = span
572
+
573
  return df_doi_raw, doi_col, scales, True
574
  except Exception:
575
  return None, None, None, False
576
 
577
+
578
  df_doi_raw, DOI_COL, DOI_SCALES, DOI_READY = _maybe_load_doi()
579
 
580
  def _media_similarity(a, b):
 
595
  s = s.lower().replace("doi:", "").strip()
596
  return f"https://doi.org/{s}"
597
 
598
def _closest_doi(
    target_name,            # "biomass" | "lipid" | "protein" | "carb"
    species, media,
    light, expo_day, expo_night, temp_c, ph, days,
    y_target=None,          # float | None (model point prediction for the target)
    topk=5
):
    """Rank doi.csv rows by proximity to the queried culture conditions.

    The score for each candidate row combines:
      * a media-dissimilarity penalty (0..w_media),
      * a span-scaled mean absolute distance over the numeric condition
        columns present in the table, and
      * optionally, proximity of the row's *target_name* value to the
        model prediction *y_target* (weighted by w_tgt).

    Returns a Markdown string listing the ``topk`` best matches, or an
    informational Markdown note when no comparable rows exist.
    """
    if not DOI_READY or df_doi_raw is None or len(df_doi_raw) == 0:
        return "> ℹ️ doi.csv not found or not readable."

    # narrow to species (with fuzzy fallback)
    s_key = _normalize_species_label(normalize_str(species))
    # Guard the column explicitly: DataFrame.get("species", "") returns the
    # scalar "" when the column is missing, and indexing with the resulting
    # scalar bool would raise KeyError instead of reaching the fallback below.
    if "species" in df_doi_raw.columns:
        df_cand = df_doi_raw[df_doi_raw["species"] == s_key]
    else:
        df_cand = df_doi_raw.iloc[0:0]
    if df_cand.empty and "species" in df_doi_raw.columns:
        sp_unique = df_doi_raw["species"].dropna().unique().tolist()
        best = get_close_matches(s_key, sp_unique, n=1, cutoff=0.6)
        df_cand = df_doi_raw[df_doi_raw["species"] == (best[0] if best else s_key)]
    if df_cand.empty:
        df_cand = df_doi_raw  # last-resort: search whole table

    # require rows that at least *have* a value for the chosen target (if present in table)
    if target_name in df_cand.columns:
        df_cand = df_cand[pd.to_numeric(df_cand[target_name], errors="coerce").notna()].copy()
        if df_cand.empty:
            return f"> ℹ️ No entries with '{target_name}' found for species filter."

    # query vector (keys mirror the numeric columns compared in the loop below;
    # NOTE(review): the "_c"/"ph"/"expo_night" entries were not visible in the
    # diff rendering — parsing via extract_first_float assumed, confirm upstream)
    q = {
        "light": parse_cycle_first(light),
        "expo_day": extract_first_float(expo_day),
        "expo_night": extract_first_float(expo_night),
        "_c": extract_first_float(temp_c),
        "ph": extract_first_float(ph),
        "days": extract_first_float(days),
    }

    # weights (tune if desired): give the target a stronger influence if we have y_target
    w_media = 0.5   # penalty scale for media dissimilarity (0..0.5)
    w_num = 1.0     # per-numeric component weight
    w_tgt = 2.0 if y_target is not None else 0.0  # emphasize matching the target value

    rows = []
    for _, r in df_cand.iterrows():
        # media similarity
        sim = _media_similarity(media, r.get("media", ""))
        media_penalty = (1.0 - sim) * w_media

        # numeric distance (scaled by column span)
        dist = 0.0; denom = 0
        for col in ["light","expo_day","expo_night","_c","ph","days"]:
            if col in df_cand.columns:
                rv, qv = r.get(col, np.nan), q[col]
                if pd.notna(rv) and pd.notna(qv):
                    span = DOI_SCALES.get(col, 1.0) if DOI_SCALES else 1.0
                    dist += w_num * abs(float(qv) - float(rv)) / span
                    denom += 1
        dist = dist/denom if denom>0 else 1.0

        # target proximity (if we have both the column and a predicted y)
        tgt_term = 0.0
        if w_tgt > 0 and target_name in df_cand.columns:
            rv = r.get(target_name, np.nan)
            if pd.notna(rv):
                span = DOI_SCALES.get(target_name, 1.0) if DOI_SCALES else 1.0
                tgt_term = w_tgt * abs(float(y_target) - float(rv)) / span

        score = media_penalty + dist + tgt_term
        rows.append((score, r))

    if not rows:
        return "> ℹ️ No comparable rows in doi.csv."

    # rank by combined score (key= compares scores only, so ties between
    # pandas Series payloads never trigger Series comparison)
    rows.sort(key=lambda x: x[0])
    top = rows[:topk]

    # build markdown
    head_note = f" (target: **{target_name}**"
    if y_target is not None:
        head_note += f", y≈**{float(y_target):.3f}**"
    head_note += ")"

    md = f"### 📚 Closest DOI matches{head_note}\n"
    for rank, (score, r) in enumerate(top, 1):
        # Convert score to a readable similarity heuristic
        sim_pct = max(0.0, min(100.0, 100.0 * np.exp(-score)))
        doi_link = _doi_url(r.get(DOI_COL)) if DOI_COL else None

        title = f"**{rank}. {r.get('species','?')} — {r.get('media','?')}** · Similarity **{sim_pct:.1f}%**"
        if doi_link:
            title += f" · [DOI]({doi_link})"
        md += title + "\n"

        # details line including target value if present
        tgt_str = ""
        if target_name in df_cand.columns and pd.notna(r.get(target_name, np.nan)):
            tgt_str = f" · {target_name}: {r.get(target_name)}"

        md += (
            f"• Light: {r.get('light','NA')} · Day: {r.get('expo_day','NA')} · Night: {r.get('expo_night','NA')} · "
            f"T(°C): {r.get('_c','NA')} · pH: {r.get('ph','NA')} · Days: {r.get('days','NA')}{tgt_str}\n"
        )
    return md
701
 
702
  # -----------------------------
 
1093
 
1094
 
1095
def doi_matches_ui(target, species, media, light, expo_day, expo_night, temp_c, ph, days):
    """Return Markdown listing the 5 DOI rows closest to the given conditions.

    A point prediction ŷ for *target* is attempted first — preferring the
    STACK model when available — so that the DOI ranking can also weigh
    how close each row's target value is to ŷ. Prediction failures are
    tolerated: ranking then falls back to condition proximity alone.
    """
    predicted = None
    try:
        single = pd.DataFrame([{
            "species": species, "media": media, "light": light,
            "expo_day": expo_day, "expo_night": expo_night,
            "_c": temp_c, "ph": ph, "days": days
        }])
        # prefer STACK if present; otherwise fallback to first available model
        models = _available_models_for_target(target)
        if models:
            pick = "STACK" if "STACK" in models else models[0]
            predicted = float(_predict_with_model_choice(target, pick, single)[0])
    except Exception:
        # silently continue without a prediction; the target term is skipped
        predicted = None

    return _closest_doi(
        target_name=target,
        species=species, media=media,
        light=light, expo_day=expo_day, expo_night=expo_night, temp_c=temp_c, ph=ph, days=days,
        y_target=predicted,
        topk=5
    )
1127
 
1128
  # -----------------------------
1129
  # UI — professional layout