Inframat-x committed on
Commit
a3b7322
·
verified ·
1 Parent(s): b902f03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -54
app.py CHANGED
@@ -1,8 +1,8 @@
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
- # - Predictor tab: identical behavior (kept)
4
  # - Literature tab: Hybrid RAG; LLM runs silently when available
5
- # - UX: no visible "LLM & Controls" window; prediction=0.0 if incomplete
6
  # ================================================================
7
 
8
  # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -12,7 +12,7 @@ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
12
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
 
14
  # ------------------------------- Imports ------------------------------
15
- import re, time, joblib, warnings, json
16
  from pathlib import Path
17
  from typing import List, Dict, Any
18
 
@@ -49,6 +49,7 @@ LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and
49
  # ========================= Predictor (kept) =========================
50
  CF_COL = "Conductive Filler Conc. (wt%)"
51
  TARGET_COL = "Stress GF (MPa-1)"
 
52
 
53
  MAIN_VARIABLES = [
54
  "Filler 1 Type",
@@ -112,24 +113,54 @@ OPTIONAL_FIELDS = {
112
  "Filler 2 Dimensionality",
113
  }
114
 
115
- DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
116
- CURRENT_CHOICES = ["DC", "AC", "NA"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  MODEL_CANDIDATES = [
119
  "stress_gf_xgb.joblib",
120
  "models/stress_gf_xgb.joblib",
121
  "/home/user/app/stress_gf_xgb.joblib",
 
122
  ]
123
 
124
  def _load_model_or_error():
125
- for p in MODEL_CANDIDATES:
126
  if os.path.exists(p):
127
  try:
128
  return joblib.load(p)
129
  except Exception as e:
130
- return f"Could not load model from {p}: {e}"
131
- return ("Model file not found. Upload your trained pipeline as "
132
- "stress_gf_xgb.joblib (or put it in models/).")
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
135
  row = {}
@@ -143,14 +174,16 @@ def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
143
  row[col] = float(v)
144
  except Exception:
145
  row[col] = np.nan
 
 
146
  else:
147
- row[col] = "" if v in (None, "NA") else str(v).strip()
 
 
148
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
149
 
150
  def _is_complete(form_dict: dict) -> bool:
151
- for col in MAIN_VARIABLES:
152
- if col in OPTIONAL_FIELDS:
153
- continue
154
  v = form_dict.get(col, None)
155
  if col in NUMERIC_COLS:
156
  try:
@@ -158,31 +191,48 @@ def _is_complete(form_dict: dict) -> bool:
158
  return False
159
  except Exception:
160
  return False
161
- elif col in CATEGORICAL_COLS:
162
- s = "" if v in (None, "NA") else str(v).strip()
163
- if s == "":
164
- return False
165
  else:
166
- s = "" if v is None else str(v).strip()
167
- if s == "":
168
  return False
169
  return True
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  def predict_fn(**kwargs):
 
172
  if not _is_complete(kwargs):
173
  return 0.0
174
  mdl = _load_model_or_error()
175
- if isinstance(mdl, str):
176
- return mdl
177
  X_new = _coerce_to_row(kwargs)
 
178
  try:
179
- y_log = mdl.predict(X_new) # model predicts log1p(target)
180
- y = float(np.expm1(y_log)[0]) # back to original scale MPa^-1
181
- if -1e-10 < y < 0:
182
- y = 0.0
 
 
 
183
  return y
184
  except Exception as e:
185
- return f"Prediction error: {e}"
 
 
186
 
187
  EXAMPLE = {
188
  "Filler 1 Type": "CNT",
@@ -191,7 +241,7 @@ EXAMPLE = {
191
  "Filler 1 Length (mm)": 1.2,
192
  CF_COL: 0.5,
193
  "Filler 2 Type": "",
194
- "Filler 2 Dimensionality": "NA",
195
  "Filler 2 Diameter (µm)": None,
196
  "Filler 2 Length (mm)": None,
197
  "Specimen Volume (mm3)": 1000,
@@ -219,9 +269,9 @@ def _clear_all():
219
  if col in NUMERIC_COLS:
220
  cleared.append(None)
221
  elif col in {"Filler 1 Dimensionality", "Filler 2 Dimensionality"}:
222
- cleared.append("NA")
223
  elif col == "Current Type":
224
- cleared.append("NA")
225
  else:
226
  cleared.append("")
227
  return cleared
@@ -343,7 +393,7 @@ def build_or_load_hybrid(pdf_dir: Path):
343
  emb = None
344
 
345
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
346
- joblib.dump(X_tfidF:=X_tfidf, TFIDF_MAT_PATH) # assign + save
347
  if BM25Okapi is not None:
348
  joblib.dump(all_tokens, BM25_TOK_PATH)
349
  meta.to_parquet(RAG_META_PATH, index=False)
@@ -354,7 +404,7 @@ bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not N
354
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
355
 
356
  def _extract_page(text_chunk: str) -> str:
357
- m = list(re.finditer(r"\\[\\[PAGE=(\\d+)\\]\\]", text_chunk or ""))
358
  return (m[-1].group(1) if m else "?")
359
 
360
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
@@ -383,7 +433,7 @@ def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAUL
383
 
384
  # BM25 scores
385
  if bm25 is not None:
386
- q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\\-\\/\\.%%]+", query)]
387
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
388
  else:
389
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
@@ -456,10 +506,9 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_ch
456
  for i in remain:
457
  sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
458
  score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
459
- cand_scores.append((score, i))
460
- cand_scores.sort(reverse=True)
461
- best_i = cand_scores[0][1]
462
- selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
463
  return selected
464
 
465
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
@@ -518,7 +567,6 @@ def rag_reply(
518
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
519
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
520
 
521
- # Hidden policy: if strict==True → no paraphrasing; else try LLM if available
522
  if strict_quotes_only:
523
  if not selected:
524
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
@@ -644,27 +692,28 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
644
  with gr.Row():
645
  with gr.Column(scale=7):
646
  with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
647
- f1_type = gr.Textbox(label="Filler 1 Type", placeholder="e.g., CNT, Graphite, Steel fiber")
648
- f1_diam = gr.Number(label="Filler 1 Diameter (µm)")
649
- f1_len = gr.Number(label="Filler 1 Length (mm)")
650
- cf_conc = gr.Number(label=f"{CF_COL}", info="Weight percent of total binder")
651
- f1_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 1 Dimensionality")
 
652
 
653
  with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
654
  f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
655
  f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
656
  f2_len = gr.Number(label="Filler 2 Length (mm)")
657
- f2_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 2 Dimensionality")
658
 
659
  with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
660
- spec_vol = gr.Number(label="Specimen Volume (mm3)")
661
- probe_cnt = gr.Number(label="Probe Count")
662
- probe_mat = gr.Textbox(label="Probe Material", placeholder="e.g., Copper, Silver paste")
663
- wb = gr.Number(label="W/B")
664
- sb = gr.Number(label="S/B")
665
- gauge_len = gr.Number(label="Gauge Length (mm)")
666
- curing = gr.Textbox(label="Curing Condition", placeholder="e.g., 28d water, 20°C")
667
- n_fillers = gr.Number(label="Number of Fillers")
668
 
669
  with gr.Accordion("Processing", open=False, elem_classes=["card"]):
670
  dry_temp = gr.Number(label="Drying Temperature (°C)")
@@ -672,8 +721,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
672
 
673
  with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
674
  load_rate = gr.Number(label="Loading Rate (MPa/s)")
675
- E_mod = gr.Number(label="Modulus of Elasticity (GPa)")
676
- current = gr.Dropdown(CURRENT_CHOICES, value="NA", label="Current Type")
677
  voltage = gr.Number(label="Applied Voltage (V)")
678
 
679
  with gr.Column(scale=5):
@@ -687,7 +736,7 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
687
  with gr.Accordion("About this model", open=False, elem_classes=["card"]):
688
  gr.Markdown(
689
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
690
- "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model trains on log1p).\n"
691
  "- Missing values are safely imputed per-feature.\n"
692
  "- Trained columns:\n"
693
  f" `{', '.join(MAIN_VARIABLES)}`",
 
1
  # ================================================================
2
  # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
+ # - Predictor tab: identical behavior (kept) + * marks for required fields
4
  # - Literature tab: Hybrid RAG; LLM runs silently when available
5
+ # - UX: no visible "LLM & Controls" window; prediction=0.0 if incomplete/error
6
  # ================================================================
7
 
8
  # ---------------------- Runtime flags (HF-safe) ----------------------
 
12
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
 
14
  # ------------------------------- Imports ------------------------------
15
+ import re, time, joblib, warnings, json, traceback
16
  from pathlib import Path
17
  from typing import List, Dict, Any
18
 
 
49
  # ========================= Predictor (kept) =========================
50
  CF_COL = "Conductive Filler Conc. (wt%)"
51
  TARGET_COL = "Stress GF (MPa-1)"
52
+ CANON_NA = "NA" # <-- canonical placeholder for categoricals
53
 
54
  MAIN_VARIABLES = [
55
  "Filler 1 Type",
 
113
  "Filler 2 Dimensionality",
114
  }
115
 
116
+ # Only these fields are required
117
+ REQUIRED_FIELDS = {
118
+ "Filler 1 Type",
119
+ "Filler 1 Diameter (µm)",
120
+ "Filler 1 Length (mm)",
121
+ CF_COL,
122
+ "Filler 1 Dimensionality",
123
+ "Specimen Volume (mm3)",
124
+ "Probe Count",
125
+ "Probe Material",
126
+ "W/B",
127
+ "S/B",
128
+ "Gauge Length (mm)",
129
+ "Curing Condition",
130
+ "Number of Fillers",
131
+ "Modulus of Elasticity (GPa)"
132
+ }
133
+
134
+ DIM_CHOICES = ["0D", "1D", "2D", "3D", CANON_NA]
135
+ CURRENT_CHOICES = ["DC", "AC", CANON_NA]
136
 
137
  MODEL_CANDIDATES = [
138
  "stress_gf_xgb.joblib",
139
  "models/stress_gf_xgb.joblib",
140
  "/home/user/app/stress_gf_xgb.joblib",
141
+ os.getenv("MODEL_PATH", "")
142
  ]
143
 
144
  def _load_model_or_error():
145
+ for p in [x for x in MODEL_CANDIDATES if x]:
146
  if os.path.exists(p):
147
  try:
148
  return joblib.load(p)
149
  except Exception as e:
150
+ print(f"[ModelLoad] Error from {p}: {e}")
151
+ traceback.print_exc()
152
+ return None
153
+ print("[ModelLoad] Model file not found. Upload stress_gf_xgb.joblib (or set MODEL_PATH).")
154
+ return None
155
+
156
+ def _canon_cat(v: Any) -> str:
157
+ """Stable, canonical category placeholder normalization."""
158
+ if v is None:
159
+ return CANON_NA
160
+ s = str(v).strip()
161
+ if s == "" or s.upper() in {"N/A", "NONE", "NULL"}:
162
+ return CANON_NA
163
+ return s
164
 
165
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
166
  row = {}
 
174
  row[col] = float(v)
175
  except Exception:
176
  row[col] = np.nan
177
+ elif col in CATEGORICAL_COLS:
178
+ row[col] = _canon_cat(v)
179
  else:
180
+ # non-numeric, non-categorical (free text) keep trimmed, but not empty
181
+ s = str(v).strip() if v is not None else ""
182
+ row[col] = s if s else CANON_NA
183
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
184
 
185
  def _is_complete(form_dict: dict) -> bool:
186
+ for col in REQUIRED_FIELDS:
 
 
187
  v = form_dict.get(col, None)
188
  if col in NUMERIC_COLS:
189
  try:
 
191
  return False
192
  except Exception:
193
  return False
 
 
 
 
194
  else:
195
+ s = _canon_cat(v)
196
+ if s == CANON_NA:
197
  return False
198
  return True
199
 
200
+ def _align_columns_to_model(df: pd.DataFrame, mdl) -> pd.DataFrame:
201
+ """Align incoming dataframe columns to the model's expected feature order."""
202
+ try:
203
+ feat = getattr(mdl, "feature_names_in_", None)
204
+ if feat is not None and len(feat) > 0:
205
+ for c in feat:
206
+ if c not in df.columns:
207
+ df[c] = np.nan
208
+ df = df[list(feat)]
209
+ except Exception as e:
210
+ print(f"[Align] Could not align columns: {e}")
211
+ traceback.print_exc()
212
+ return df
213
+
214
def predict_fn(**kwargs):
    """Predict Stress GF (MPa^-1) from the form fields in *kwargs*.

    Always yields a plain float so the gr.Number output stays valid:
    0.0 is returned when required fields are incomplete, when the model
    cannot be loaded, or when prediction raises.
    """
    if not _is_complete(kwargs):
        return 0.0

    model = _load_model_or_error()
    if model is None:
        return 0.0

    features = _align_columns_to_model(_coerce_to_row(kwargs), model)
    try:
        raw = model.predict(features)
        # Models trained on log1p(target) carry a saved flag
        # (target_is_log1p_); undo the transform only when it is set.
        if getattr(model, "target_is_log1p_", False):
            values = np.expm1(raw)
        else:
            values = raw
        first = np.asarray(values).ravel()[0]
        # Clamp tiny negatives to zero and hand back a native float.
        return float(np.maximum(first, 0.0))
    except Exception as exc:
        print(f"[Predict] {exc}")
        traceback.print_exc()
        return 0.0
236
 
237
  EXAMPLE = {
238
  "Filler 1 Type": "CNT",
 
241
  "Filler 1 Length (mm)": 1.2,
242
  CF_COL: 0.5,
243
  "Filler 2 Type": "",
244
+ "Filler 2 Dimensionality": CANON_NA,
245
  "Filler 2 Diameter (µm)": None,
246
  "Filler 2 Length (mm)": None,
247
  "Specimen Volume (mm3)": 1000,
 
269
  if col in NUMERIC_COLS:
270
  cleared.append(None)
271
  elif col in {"Filler 1 Dimensionality", "Filler 2 Dimensionality"}:
272
+ cleared.append(CANON_NA)
273
  elif col == "Current Type":
274
+ cleared.append(CANON_NA)
275
  else:
276
  cleared.append("")
277
  return cleared
 
393
  emb = None
394
 
395
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
396
+ joblib.dump(X_tfidF:=X_tfidf, TFIDF_MAT_PATH)
397
  if BM25Okapi is not None:
398
  joblib.dump(all_tokens, BM25_TOK_PATH)
399
  meta.to_parquet(RAG_META_PATH, index=False)
 
404
  st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
405
 
406
  def _extract_page(text_chunk: str) -> str:
407
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
408
  return (m[-1].group(1) if m else "?")
409
 
410
  def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
 
433
 
434
  # BM25 scores
435
  if bm25 is not None:
436
+ q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-\/\.%]+", query)]
437
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
438
  else:
439
  bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
 
506
  for i in remain:
507
  sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
508
  score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
509
+ cand_scores.sort(reverse=True)
510
+ best_i = cand_scores[0][1]
511
+ selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
 
512
  return selected
513
 
514
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
 
567
  srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
568
  coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
569
 
 
570
  if strict_quotes_only:
571
  if not selected:
572
  return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
 
692
  with gr.Row():
693
  with gr.Column(scale=7):
694
  with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
695
+ # * marks on required inputs only
696
+ f1_type = gr.Textbox(label="Filler 1 Type *", placeholder="e.g., CNT, Graphite, Steel fiber")
697
+ f1_diam = gr.Number(label="Filler 1 Diameter (µm) *")
698
+ f1_len = gr.Number(label="Filler 1 Length (mm) *")
699
+ cf_conc = gr.Number(label=f"{CF_COL} *", info="Weight percent of total binder")
700
+ f1_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 1 Dimensionality *")
701
 
702
  with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
703
  f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
704
  f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
705
  f2_len = gr.Number(label="Filler 2 Length (mm)")
706
+ f2_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 2 Dimensionality")
707
 
708
  with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
709
+ spec_vol = gr.Number(label="Specimen Volume (mm3) *")
710
+ probe_cnt = gr.Number(label="Probe Count *")
711
+ probe_mat = gr.Textbox(label="Probe Material *", placeholder="e.g., Copper, Silver paste")
712
+ wb = gr.Number(label="W/B *")
713
+ sb = gr.Number(label="S/B *")
714
+ gauge_len = gr.Number(label="Gauge Length (mm) *")
715
+ curing = gr.Textbox(label="Curing Condition *", placeholder="e.g., 28d water, 20°C")
716
+ n_fillers = gr.Number(label="Number of Fillers *")
717
 
718
  with gr.Accordion("Processing", open=False, elem_classes=["card"]):
719
  dry_temp = gr.Number(label="Drying Temperature (°C)")
 
721
 
722
  with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
723
  load_rate = gr.Number(label="Loading Rate (MPa/s)")
724
+ E_mod = gr.Number(label="Modulus of Elasticity (GPa) *")
725
+ current = gr.Dropdown(CURRENT_CHOICES, value=CANON_NA, label="Current Type")
726
  voltage = gr.Number(label="Applied Voltage (V)")
727
 
728
  with gr.Column(scale=5):
 
736
  with gr.Accordion("About this model", open=False, elem_classes=["card"]):
737
  gr.Markdown(
738
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
739
+ "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model may train on log1p; saved flag used at inference).\n"
740
  "- Missing values are safely imputed per-feature.\n"
741
  "- Trained columns:\n"
742
  f" `{', '.join(MAIN_VARIABLES)}`",