OmarOmar91 committed on
Commit
64b5e18
·
verified ·
1 Parent(s): 8239631

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +524 -135
app.py CHANGED
@@ -1,16 +1,50 @@
1
- # Gradio UI aligned to the training script column names (October1.xlsx)
2
- # - Uses the trained pipeline saved as: stress_gf_xgb.joblib
3
- # - Makes many inputs optional; missing values are handled by the pipeline imputers
4
-
 
 
 
 
5
  import os
6
- import joblib
 
 
 
 
 
 
 
 
7
  import numpy as np
8
  import pandas as pd
9
  import gradio as gr
10
 
11
- # ========================= Column Names (match training script) =========================
12
-
13
- CF_COL = "Conductive Filler Conc. (wt%)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  TARGET_COL = "Stress GF (MPa-1)"
15
 
16
  MAIN_VARIABLES = [
@@ -68,12 +102,9 @@ CATEGORICAL_COLS = {
68
  "Current Type"
69
  }
70
 
71
- # Reasonable UI choices (free text is still allowed)
72
- DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
73
  CURRENT_CHOICES = ["DC", "AC", "NA"]
74
 
75
- # ========================= Model Loader ================================================
76
-
77
  MODEL_CANDIDATES = [
78
  "stress_gf_xgb.joblib",
79
  "models/stress_gf_xgb.joblib",
@@ -87,19 +118,10 @@ def _load_model_or_error():
87
  return joblib.load(p)
88
  except Exception as e:
89
  return f"Could not load model from {p}: {e}"
90
- return (
91
- "Model file not found. Upload your trained pipeline as "
92
- "stress_gf_xgb.joblib (or put it in models/)."
93
- )
94
-
95
- # ========================= Input Coercion =============================================
96
 
97
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
98
- """
99
- Convert raw UI dict -> single-row DataFrame with columns MAIN_VARIABLES.
100
- Numeric fields: float or NaN; categorical: stripped string (or empty).
101
- Missing columns are filled with NaN/'' so the pipeline imputers can handle them.
102
- """
103
  row = {}
104
  for col in MAIN_VARIABLES:
105
  v = form_dict.get(col, None)
@@ -109,35 +131,26 @@ def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
109
  else:
110
  try:
111
  row[col] = float(v)
112
- except:
113
  row[col] = np.nan
114
  else:
115
  row[col] = "" if v in (None, "NA") else str(v).strip()
116
- # Ensure exact column order
117
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
118
 
119
- # ========================= Predict Function ===========================================
120
-
121
  def predict_fn(**kwargs):
122
  mdl = _load_model_or_error()
123
  if isinstance(mdl, str):
124
- # return a friendly error string
125
  return mdl
126
-
127
  X_new = _coerce_to_row(kwargs)
128
-
129
  try:
130
- y_log = mdl.predict(X_new) # model predicts log1p(target)
131
- y = float(np.expm1(y_log)[0]) # back to original scale MPa^-1
132
- # protect from tiny negative numeric noise
133
  if -1e-10 < y < 0:
134
  y = 0.0
135
  return y
136
  except Exception as e:
137
  return f"Prediction error: {e}"
138
 
139
- # ========================= Example Prefill ============================================
140
-
141
  EXAMPLE = {
142
  "Filler 1 Type": "CNT",
143
  "Filler 1 Dimensionality": "1D",
@@ -168,7 +181,6 @@ def _fill_example():
168
  return [EXAMPLE.get(k, None) for k in MAIN_VARIABLES]
169
 
170
  def _clear_all():
171
- # Return blanks in the same order as MAIN_VARIABLES
172
  cleared = []
173
  for col in MAIN_VARIABLES:
174
  if col in NUMERIC_COLS:
@@ -181,118 +193,495 @@ def _clear_all():
181
  cleared.append("")
182
  return cleared
183
 
184
- # ========================= UI =========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
 
186
  CSS = """
187
- /* soft gradient background */
188
- .gradio-container {background: linear-gradient(135deg,#0f172a 0%, #1f2937 60%, #334155 100%);}
 
 
189
  * {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
190
- /* cards */
191
- .card {background: rgba(255,255,255,0.04) !important; border: 1px solid rgba(255,255,255,0.08);}
192
- label.svelte-1ipelgc {color: #e5e7eb !important;}
193
  """
194
 
195
- theme = gr.themes.Soft(primary_hue="blue", neutral_hue="slate").set(
196
- body_background_fill="#0f172a",
197
- body_text_color="#e5e7eb",
198
- input_background_fill="#111827",
199
- input_border_color="#1f2937",
200
- button_primary_background_fill="#3b82f6",
 
 
 
201
  button_primary_text_color="#ffffff",
202
- button_secondary_background_fill="#111827",
 
203
  )
204
 
205
  with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
206
  gr.Markdown(
207
- "<h1 style='margin:0'>🔮 Stress Gauge Factor (MPa⁻¹) — ML Predictor</h1>"
208
- "<p style='opacity:.9'>Fields and units match your training data. "
209
- "Leave anything blank if unknown the model handles missing values.</p>"
 
 
 
210
  )
211
 
212
- with gr.Row():
213
- # ---------------- Inputs (Left) ----------------
214
- with gr.Column(scale=7):
215
- # Primary filler
216
- with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
217
- f1_type = gr.Textbox(label="Filler 1 Type", placeholder="e.g., CNT, Graphite, Steel fiber")
218
- f1_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 1 Dimensionality")
219
- f1_diam = gr.Number(label="Filler 1 Diameter (µm)")
220
- f1_len = gr.Number(label="Filler 1 Length (mm)")
221
- cf_conc = gr.Number(label=f"{CF_COL}", info="Weight percent of total binder")
222
-
223
- # Secondary filler (optional)
224
- with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
225
- f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
226
- f2_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 2 Dimensionality")
227
- f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
228
- f2_len = gr.Number(label="Filler 2 Length (mm)")
229
-
230
- # Mix & specimen
231
- with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
232
- spec_vol = gr.Number(label="Specimen Volume (mm3)")
233
- probe_cnt = gr.Number(label="Probe Count")
234
- probe_mat = gr.Textbox(label="Probe Material", placeholder="e.g., Copper, Silver paste")
235
- wb = gr.Number(label="W/B")
236
- sb = gr.Number(label="S/B")
237
- gauge_len = gr.Number(label="Gauge Length (mm)")
238
- curing = gr.Textbox(label="Curing Condition", placeholder="e.g., 28d water, 20°C")
239
- n_fillers = gr.Number(label="Number of Fillers")
240
-
241
- # Processing
242
- with gr.Accordion("Processing", open=False, elem_classes=["card"]):
243
- dry_temp = gr.Number(label="Drying Temperature (°C)")
244
- dry_hrs = gr.Number(label="Drying Duration (hr)")
245
-
246
- # Mechanical & Electrical loading
247
- with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
248
- load_rate = gr.Number(label="Loading Rate (MPa/s)")
249
- E_mod = gr.Number(label="Modulus of Elasticity (GPa)")
250
- current = gr.Dropdown(CURRENT_CHOICES, value="NA", label="Current Type")
251
- voltage = gr.Number(label="Applied Voltage (V)")
252
-
253
- # ---------------- Output (Right) ----------------
254
- with gr.Column(scale=5):
255
- with gr.Group(elem_classes=["card"]):
256
- out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", precision=6)
257
- with gr.Row():
258
- btn_pred = gr.Button("Predict", variant="primary")
259
- btn_clear = gr.Button("Clear")
260
- btn_demo = gr.Button("Fill Example")
261
-
262
- with gr.Accordion("About this model", open=False, elem_classes=["card"]):
263
- gr.Markdown(
264
- "- Pipeline: **ColumnTransformer → (RobustScaler + OneHot) → XGBoost**\n"
265
- "- Target: **Stress GF (MPa⁻¹)** on original scale (model trains on log1p).\n"
266
- "- Missing values are safely imputed per-feature.\n"
267
- "- Trained columns:\n"
268
- f" `{', '.join(MAIN_VARIABLES)}`"
269
- )
270
-
271
- # Wire buttons
272
- inputs_in_order = [
273
- # MAIN_VARIABLES exact order:
274
- # "Filler 1 Type","Filler 1 Diameter (µm)","Filler 1 Length (mm)", CF_COL,
275
- # "Filler 1 Dimensionality","Filler 2 Type","Filler 2 Diameter (µm)","Filler 2 Length (mm)",
276
- # "Filler 2 Dimensionality","Specimen Volume (mm3)","Probe Count","Probe Material",
277
- # "W/B","S/B","Gauge Length (mm)","Curing Condition","Number of Fillers",
278
- # "Drying Temperature (°C)","Drying Duration (hr)","Loading Rate (MPa/s)",
279
- # "Modulus of Elasticity (GPa)","Current Type","Applied Voltage (V)"
280
- f1_type, f1_diam, f1_len, cf_conc,
281
- f1_dim, f2_type, f2_diam, f2_len,
282
- f2_dim, spec_vol, probe_cnt, probe_mat,
283
- wb, sb, gauge_len, curing, n_fillers,
284
- dry_temp, dry_hrs, load_rate,
285
- E_mod, current, voltage
286
- ]
287
-
288
- def _predict_wrapper(*vals):
289
- data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
290
- return predict_fn(**data)
291
-
292
- btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
293
- btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order)
294
- btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
  # ------------- Launch -------------
297
  if __name__ == "__main__":
 
298
  demo.queue().launch()
 
1
+ # ================================================================
2
+ # Self-Sensing Concrete Assistant Predictor (XGB) + Hybrid RAG
3
+ # - Predictor tab: identical behavior to your "second code"
4
+ # - Literature tab: from your "first code" (Hybrid RAG + MMR)
5
+ # - Hugging Face friendly: online PDF fetching OFF by default
6
+ # ================================================================
7
+
8
+ # ---------------------- Runtime flags (HF-safe) ----------------------
9
  import os
10
+ os.environ["TRANSFORMERS_NO_TF"] = "1"
11
+ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
12
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
+
14
+ # ------------------------------- Imports ------------------------------
15
+ import re, time, joblib, warnings, json
16
+ from pathlib import Path
17
+ from typing import List, Dict, Any
18
+
19
  import numpy as np
20
  import pandas as pd
21
  import gradio as gr
22
 
23
+ warnings.filterwarnings("ignore", category=UserWarning)
24
+
25
+ # Optional deps (handled gracefully if missing)
26
+ USE_DENSE = True
27
+ try:
28
+ from sentence_transformers import SentenceTransformer
29
+ except Exception:
30
+ USE_DENSE = False
31
+
32
+ try:
33
+ from rank_bm25 import BM25Okapi
34
+ except Exception:
35
+ BM25Okapi = None
36
+ print("rank_bm25 not installed; BM25 disabled (TF-IDF still works).")
37
+
38
+ # Optional OpenAI (for LLM paraphrase)
39
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
40
+ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
41
+ try:
42
+ from openai import OpenAI
43
+ except Exception:
44
+ OpenAI = None
45
+
46
+ # ========================= Predictor (kept same as 2nd) =========================
47
+ CF_COL = "Conductive Filler Conc. (wt%)"
48
  TARGET_COL = "Stress GF (MPa-1)"
49
 
50
  MAIN_VARIABLES = [
 
102
  "Current Type"
103
  }
104
 
105
+ DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
 
106
  CURRENT_CHOICES = ["DC", "AC", "NA"]
107
 
 
 
108
  MODEL_CANDIDATES = [
109
  "stress_gf_xgb.joblib",
110
  "models/stress_gf_xgb.joblib",
 
118
  return joblib.load(p)
119
  except Exception as e:
120
  return f"Could not load model from {p}: {e}"
121
+ return ("Model file not found. Upload your trained pipeline as "
122
+ "stress_gf_xgb.joblib (or put it in models/).")
 
 
 
 
123
 
124
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
 
 
 
 
 
125
  row = {}
126
  for col in MAIN_VARIABLES:
127
  v = form_dict.get(col, None)
 
131
  else:
132
  try:
133
  row[col] = float(v)
134
+ except Exception:
135
  row[col] = np.nan
136
  else:
137
  row[col] = "" if v in (None, "NA") else str(v).strip()
 
138
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
139
 
 
 
140
  def predict_fn(**kwargs):
141
  mdl = _load_model_or_error()
142
  if isinstance(mdl, str):
 
143
  return mdl
 
144
  X_new = _coerce_to_row(kwargs)
 
145
  try:
146
+ y_log = mdl.predict(X_new) # model predicts log1p(target)
147
+ y = float(np.expm1(y_log)[0]) # back to original scale MPa^-1
 
148
  if -1e-10 < y < 0:
149
  y = 0.0
150
  return y
151
  except Exception as e:
152
  return f"Prediction error: {e}"
153
 
 
 
154
  EXAMPLE = {
155
  "Filler 1 Type": "CNT",
156
  "Filler 1 Dimensionality": "1D",
 
181
  return [EXAMPLE.get(k, None) for k in MAIN_VARIABLES]
182
 
183
  def _clear_all():
 
184
  cleared = []
185
  for col in MAIN_VARIABLES:
186
  if col in NUMERIC_COLS:
 
193
  cleared.append("")
194
  return cleared
195
 
196
+ # ========================= Hybrid RAG (from 1st code) =========================
197
+ # Configuration
198
+ ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
199
+ TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
200
+ TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
201
+ BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
202
+ EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
203
+ RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
204
+
205
+ # PDF source (HF-safe: rely on local /papers by default)
206
+ LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
207
+ USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
208
+
209
+ # Retrieval weights
210
+ W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
211
+ W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
212
+ W_EMB_DEFAULT = 0.00 if not USE_DENSE else 0.40
213
+
214
+ # Simple text processing
215
+ _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
216
+ TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
217
+ def sent_split(text: str) -> List[str]:
218
+ sents = [s.strip() for s in _SENT_SPLIT_RE.split(text) if s.strip()]
219
+ return [s for s in sents if len(s.split()) >= 5]
220
+ def tokenize(text: str) -> List[str]:
221
+ return [t.lower() for t in TOKEN_RE.findall(text)]
222
+
223
+ # PDF text extraction (PyMuPDF preferred; pypdf fallback)
224
+ def _extract_pdf_text(pdf_path: Path) -> str:
225
+ try:
226
+ import fitz
227
+ doc = fitz.open(pdf_path)
228
+ out = []
229
+ for i, page in enumerate(doc):
230
+ out.append(f"[[PAGE={i+1}]]\n{page.get_text('text') or ''}")
231
+ return "\n\n".join(out)
232
+ except Exception:
233
+ try:
234
+ from pypdf import PdfReader
235
+ reader = PdfReader(str(pdf_path))
236
+ out = []
237
+ for i, p in enumerate(reader.pages):
238
+ txt = p.extract_text() or ""
239
+ out.append(f"[[PAGE={i+1}]]\n{txt}")
240
+ return "\n\n".join(out)
241
+ except Exception as e:
242
+ print(f"PDF read error ({pdf_path}): {e}")
243
+ return ""
244
+
245
+ def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
246
+ sents = sent_split(text)
247
+ chunks, step = [], max(1, win_size - overlap)
248
+ for i in range(0, len(sents), step):
249
+ window = sents[i:i+win_size]
250
+ if not window: break
251
+ chunks.append(" ".join(window))
252
+ return chunks
253
+
254
+ def _safe_init_st_model(name: str):
255
+ global USE_DENSE
256
+ if not USE_DENSE:
257
+ return None
258
+ try:
259
+ return SentenceTransformer(name)
260
+ except Exception as e:
261
+ print("Dense embeddings unavailable:", e)
262
+ USE_DENSE = False
263
+ return None
264
+
265
+ # Build or load index
266
+ def build_or_load_hybrid(pdf_dir: Path):
267
+ have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
268
+ and RAG_META_PATH.exists()
269
+ and (BM25_TOK_PATH.exists() or BM25Okapi is None)
270
+ and (EMB_NPY_PATH.exists() or not USE_DENSE))
271
+ if have_cache:
272
+ vectorizer = joblib.load(TFIDF_VECT_PATH)
273
+ X_tfidf = joblib.load(TFIDF_MAT_PATH)
274
+ meta = pd.read_parquet(RAG_META_PATH)
275
+ bm25_toks = joblib.load(BM25_TOK_PATH) if BM25Okapi is not None else None
276
+ emb = np.load(EMB_NPY_PATH) if (USE_DENSE and EMB_NPY_PATH.exists()) else None
277
+ return vectorizer, X_tfidf, meta, bm25_toks, emb
278
+
279
+ rows, all_tokens = [], []
280
+ pdf_paths = list(Path(pdf_dir).glob("**/*.pdf"))
281
+ print(f"Indexing PDFs in {pdf_dir} — found {len(pdf_paths)} files.")
282
+ for pdf in pdf_paths:
283
+ raw = _extract_pdf_text(pdf)
284
+ if not raw.strip():
285
+ continue
286
+ for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
287
+ rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
288
+ all_tokens.append(tokenize(ch))
289
+ if not rows:
290
+ # create empty stub to avoid crashes; UI will message user to upload PDFs
291
+ meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
292
+ vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
293
+ return vectorizer, X_tfidf, meta, all_tokens, emb
294
+
295
+ meta = pd.DataFrame(rows)
296
+
297
+ from sklearn.feature_extraction.text import TfidfVectorizer
298
+ vectorizer = TfidfVectorizer(
299
+ ngram_range=(1,2),
300
+ min_df=1, max_df=0.95,
301
+ sublinear_tf=True, smooth_idf=True,
302
+ lowercase=True,
303
+ token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b"
304
+ )
305
+ X_tfidf = vectorizer.fit_transform(meta["text"].tolist())
306
+
307
+ emb = None
308
+ if USE_DENSE:
309
+ try:
310
+ st_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
311
+ if st_model is not None:
312
+ from sklearn.preprocessing import normalize as sk_normalize
313
+ em = st_model.encode(meta["text"].tolist(), batch_size=64, show_progress_bar=False, convert_to_numpy=True)
314
+ emb = sk_normalize(em)
315
+ np.save(EMB_NPY_PATH, emb)
316
+ except Exception as e:
317
+ print("Dense embedding failed:", e)
318
+ emb = None
319
+
320
+ # Save artifacts
321
+ joblib.dump(vectorizer, TFIDF_VECT_PATH)
322
+ joblib.dump(X_tfidf, TFIDF_MAT_PATH)
323
+ if BM25Okapi is not None:
324
+ joblib.dump(all_tokens, BM25_TOK_PATH)
325
+ meta.to_parquet(RAG_META_PATH, index=False)
326
+
327
+ return vectorizer, X_tfidf, meta, all_tokens, emb
328
+
329
+ tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
330
+ bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
331
+ st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
332
+
333
+ def _extract_page(text_chunk: str) -> str:
334
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
335
+ return (m[-1].group(1) if m else "?")
336
+
337
+ def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
338
+ if rag_meta is None or rag_meta.empty:
339
+ return pd.DataFrame()
340
+
341
+ # Dense scores
342
+ if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
343
+ try:
344
+ from sklearn.preprocessing import normalize as sk_normalize
345
+ q_emb = st_query_model.encode([query], convert_to_numpy=True)
346
+ q_emb = sk_normalize(q_emb)[0]
347
+ dense_scores = emb_matrix @ q_emb
348
+ except Exception as e:
349
+ print("Dense query encoding failed:", e)
350
+ dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
351
+ else:
352
+ dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
353
+
354
+ # TF-IDF scores
355
+ if tfidf_vectorizer is not None and tfidf_matrix is not None:
356
+ q_vec = tfidf_vectorizer.transform([query])
357
+ tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
358
+ else:
359
+ tfidf_scores = np.zeros(len(rag_meta), dtype=float); w_tfidf = 0.0
360
+
361
+ # BM25 scores
362
+ if bm25 is not None:
363
+ q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
364
+ bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
365
+ else:
366
+ bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
367
+
368
+ def _norm(x):
369
+ x = np.asarray(x, dtype=float)
370
+ if np.allclose(x.max(), x.min()):
371
+ return np.zeros_like(x)
372
+ return (x - x.min()) / (x.max() - x.min())
373
+
374
+ s_dense = _norm(dense_scores)
375
+ s_tfidf = _norm(tfidf_scores)
376
+ s_bm25 = _norm(bm25_scores)
377
+
378
+ total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
379
+ w_tfidf, w_bm25, w_emb = w_tfidf/total_w, w_bm25/total_w, w_emb/total_w
380
+
381
+ combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
382
+ idx = np.argsort(-combo)[:k]
383
+ hits = rag_meta.iloc[idx].copy()
384
+ hits["score_dense"] = s_dense[idx]
385
+ hits["score_tfidf"] = s_tfidf[idx]
386
+ hits["score_bm25"] = s_bm25[idx]
387
+ hits["score"] = combo[idx]
388
+ return hits.reset_index(drop=True)
389
+
390
+ def split_sentences(text: str) -> List[str]:
391
+ sents = sent_split(text)
392
+ return [s for s in sents if 6 <= len(s.split()) <= 60]
393
+
394
+ def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_chunk=6, lambda_div=0.7):
395
+ pool = []
396
+ for _, row in hits.iterrows():
397
+ doc = Path(row["doc_path"]).name
398
+ page = _extract_page(row["text"])
399
+ for s in split_sentences(row["text"])[:pool_per_chunk]:
400
+ pool.append({"sent": s, "doc": doc, "page": page})
401
+ if not pool:
402
+ return []
403
+
404
+ sent_texts = [p["sent"] for p in pool]
405
+
406
+ # Embedding-based relevance if available, else TF-IDF
407
+ use_dense = USE_DENSE and st_query_model is not None
408
+ if use_dense:
409
+ try:
410
+ from sklearn.preprocessing import normalize as sk_normalize
411
+ texts = [question] + sent_texts
412
+ enc = st_query_model.encode(texts, convert_to_numpy=True)
413
+ q_vec = sk_normalize(enc[:1])[0]
414
+ S = sk_normalize(enc[1:])
415
+ rel = (S @ q_vec)
416
+ def sim_fn(i, j): return float(S[i] @ S[j])
417
+ except Exception:
418
+ use_dense = False
419
+
420
+ if not use_dense:
421
+ from sklearn.feature_extraction.text import TfidfVectorizer
422
+ vect = TfidfVectorizer().fit(sent_texts + [question])
423
+ Q = vect.transform([question]); S = vect.transform(sent_texts)
424
+ rel = (S @ Q.T).toarray().ravel()
425
+ def sim_fn(i, j): return float((S[i] @ S[j].T).toarray()[0, 0])
426
+
427
+ selected, selected_idx = [], []
428
+ remain = list(range(len(pool)))
429
+ first = int(np.argmax(rel))
430
+ selected.append(pool[first]); selected_idx.append(first); remain.remove(first)
431
+
432
+ while len(selected) < top_n and remain:
433
+ cand_scores = []
434
+ for i in remain:
435
+ sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
436
+ score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
437
+ cand_scores.append((score, i))
438
+ cand_scores.sort(reverse=True)
439
+ best_i = cand_scores[0][1]
440
+ selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
441
+ return selected
442
+
443
+ def compose_extractive(selected: List[Dict[str, Any]]) -> str:
444
+ if not selected:
445
+ return ""
446
+ return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
447
+
448
+ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2) -> str:
449
+ if OPENAI_API_KEY is None or OpenAI is None:
450
+ return None
451
+ client = OpenAI(api_key=OPENAI_API_KEY)
452
+ model = model or OPENAI_MODEL
453
+ SYSTEM_PROMPT = (
454
+ "You are a scientific assistant for self-sensing cementitious materials.\n"
455
+ "Answer STRICTLY using the provided sentences.\n"
456
+ "Do not invent facts. Keep it concise (3–6 sentences).\n"
457
+ "Retain inline citations like (Doc.pdf, p.X) exactly as given."
458
+ )
459
+ user_prompt = (
460
+ f"Question: {question}\n\n"
461
+ f"Use ONLY these sentences to answer; keep their inline citations:\n" +
462
+ "\n".join(f"- {s}" for s in sentence_lines)
463
+ )
464
+ try:
465
+ resp = client.responses.create(
466
+ model=model,
467
+ input=[
468
+ {"role": "system", "content": SYSTEM_PROMPT},
469
+ {"role": "user", "content": user_prompt},
470
+ ],
471
+ temperature=temperature,
472
+ )
473
+ return getattr(resp, "output_text", None) or str(resp)
474
+ except Exception:
475
+ return None
476
+
477
+ def rag_reply(
478
+ question: str,
479
+ k: int = 8,
480
+ n_sentences: int = 4,
481
+ include_passages: bool = False,
482
+ use_llm: bool = False,
483
+ model: str = None,
484
+ temperature: float = 0.2,
485
+ strict_quotes_only: bool = False,
486
+ w_tfidf: float = W_TFIDF_DEFAULT,
487
+ w_bm25: float = W_BM25_DEFAULT,
488
+ w_emb: float = W_EMB_DEFAULT
489
+ ) -> str:
490
+ hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
491
+ if hits is None or hits.empty:
492
+ return "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
493
+
494
+ selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
495
+ header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
496
+ srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
497
+ coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
498
+
499
+ if strict_quotes_only:
500
+ if not selected:
501
+ return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
502
+ msg = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
503
+ msg += f"\n\n**Citations:** {header_cites}{coverage_note}"
504
+ if include_passages:
505
+ msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
506
+ return msg
507
+
508
+ extractive = compose_extractive(selected)
509
+ if use_llm and selected:
510
+ lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
511
+ llm_text = synthesize_with_llm(question, lines, model=model, temperature=temperature)
512
+ if llm_text:
513
+ msg = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
514
+ if include_passages:
515
+ msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
516
+ return msg
517
+
518
+ if not extractive:
519
+ return f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
520
+
521
+ msg = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
522
+ if include_passages:
523
+ msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
524
+ return msg
525
+
526
+ def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
527
+ use_llm, model_name, temperature, strict_quotes_only,
528
+ w_tfidf, w_bm25, w_emb):
529
+ if not message or not message.strip():
530
+ return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
531
+ try:
532
+ return rag_reply(
533
+ question=message,
534
+ k=int(top_k),
535
+ n_sentences=int(n_sentences),
536
+ include_passages=bool(include_passages),
537
+ use_llm=bool(use_llm),
538
+ model=(model_name or None),
539
+ temperature=float(temperature),
540
+ strict_quotes_only=bool(strict_quotes_only),
541
+ w_tfidf=float(w_tfidf),
542
+ w_bm25=float(w_bm25),
543
+ w_emb=float(w_emb),
544
+ )
545
+ except Exception as e:
546
+ return f"RAG error: {e}"
547
 
548
+ # ========================= UI (predictor styling kept) =========================
549
  CSS = """
550
+ /* Blue to green gradient background */
551
+ .gradio-container {
552
+ background: linear-gradient(135deg, #1e3a8a 0%, #166534 60%, #15803d 100%) !important;
553
+ }
554
  * {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
555
+ .card {background: rgba(255,255,255,0.07) !important; border: 1px solid rgba(255,255,255,0.12);}
556
+ label.svelte-1ipelgc {color: #e0f2fe !important;}
 
557
  """
558
 
559
+ theme = gr.themes.Soft(
560
+ primary_hue="blue",
561
+ neutral_hue="green"
562
+ ).set(
563
+ body_background_fill="#1e3a8a",
564
+ body_text_color="#e0f2fe",
565
+ input_background_fill="#172554",
566
+ input_border_color="#1e40af",
567
+ button_primary_background_fill="#2563eb",
568
  button_primary_text_color="#ffffff",
569
+ button_secondary_background_fill="#14532d",
570
+ button_secondary_text_color="#ecfdf5",
571
  )
572
 
573
  with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
574
  gr.Markdown(
575
+ "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
576
+ "<p style='opacity:.9'>"
577
+ "Left tab: ML prediction for Stress Gauge Factor (kept identical to your deployed predictor). "
578
+ "Right tab: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection. "
579
+ "Upload PDFs into <code>papers/</code> in your Space repo."
580
+ "</p>"
581
  )
582
 
583
+ with gr.Tabs():
584
+ # ------------------------- Predictor Tab -------------------------
585
+ with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
586
+ with gr.Row():
587
+ with gr.Column(scale=7):
588
+ with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
589
+ f1_type = gr.Textbox(label="Filler 1 Type", placeholder="e.g., CNT, Graphite, Steel fiber")
590
+ f1_diam = gr.Number(label="Filler 1 Diameter (µm)")
591
+ f1_len = gr.Number(label="Filler 1 Length (mm)")
592
+ cf_conc = gr.Number(label=f"{CF_COL}", info="Weight percent of total binder")
593
+ f1_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 1 Dimensionality")
594
+
595
+ with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
596
+ f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
597
+ f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
598
+ f2_len = gr.Number(label="Filler 2 Length (mm)")
599
+ f2_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 2 Dimensionality")
600
+
601
+ with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
602
+ spec_vol = gr.Number(label="Specimen Volume (mm3)")
603
+ probe_cnt = gr.Number(label="Probe Count")
604
+ probe_mat = gr.Textbox(label="Probe Material", placeholder="e.g., Copper, Silver paste")
605
+ wb = gr.Number(label="W/B")
606
+ sb = gr.Number(label="S/B")
607
+ gauge_len = gr.Number(label="Gauge Length (mm)")
608
+ curing = gr.Textbox(label="Curing Condition", placeholder="e.g., 28d water, 20°C")
609
+ n_fillers = gr.Number(label="Number of Fillers")
610
+
611
+ with gr.Accordion("Processing", open=False, elem_classes=["card"]):
612
+ dry_temp = gr.Number(label="Drying Temperature (°C)")
613
+ dry_hrs = gr.Number(label="Drying Duration (hr)")
614
+
615
+ with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
616
+ load_rate = gr.Number(label="Loading Rate (MPa/s)")
617
+ E_mod = gr.Number(label="Modulus of Elasticity (GPa)")
618
+ current = gr.Dropdown(CURRENT_CHOICES, value="NA", label="Current Type")
619
+ voltage = gr.Number(label="Applied Voltage (V)")
620
+
621
+ with gr.Column(scale=5):
622
+ with gr.Group(elem_classes=["card"]):
623
+ out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", precision=6)
624
+ with gr.Row():
625
+ btn_pred = gr.Button("Predict", variant="primary")
626
+ btn_clear = gr.Button("Clear")
627
+ btn_demo = gr.Button("Fill Example")
628
+
629
+ with gr.Accordion("About this model", open=False, elem_classes=["card"]):
630
+ gr.Markdown(
631
+ "- Pipeline: ColumnTransformer -> (RobustScaler + OneHot) -> XGBoost\n"
632
+ "- Target: Stress GF (MPa^-1) on original scale (model trains on log1p).\n"
633
+ "- Missing values are safely imputed per-feature.\n"
634
+ "- Trained columns:\n"
635
+ f" `{', '.join(MAIN_VARIABLES)}`"
636
+ )
637
+
638
+ # Wire predictor buttons
639
+ inputs_in_order = [
640
+ f1_type, f1_diam, f1_len, cf_conc,
641
+ f1_dim, f2_type, f2_diam, f2_len,
642
+ f2_dim, spec_vol, probe_cnt, probe_mat,
643
+ wb, sb, gauge_len, curing, n_fillers,
644
+ dry_temp, dry_hrs, load_rate,
645
+ E_mod, current, voltage
646
+ ]
647
+
648
+ def _predict_wrapper(*vals):
649
+ data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
650
+ return predict_fn(**data)
651
+
652
+ btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
653
+ btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order)
654
+ btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)
655
+
656
+ # ------------------------- Literature Tab -------------------------
657
+ with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)"):
658
+ gr.Markdown(
659
+ "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
660
+ "Answers cite (Doc.pdf, p.X). Toggle strict quotes or optional LLM paraphrasing."
661
+ )
662
+ with gr.Row():
663
+ top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
664
+ n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
665
+ include_passages = gr.Checkbox(value=False, label="Include supporting passages")
666
+ with gr.Accordion("Retriever weights (advanced)", open=False):
667
+ w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
668
+ w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
669
+ w_emb = gr.Slider(0.0, 1.0, value=W_EMB_DEFAULT, step=0.05, label="Dense weight (set 0 if disabled)")
670
+ with gr.Accordion("LLM & Controls", open=False):
671
+ strict_quotes_only = gr.Checkbox(value=False, label="Strict quotes only (no paraphrasing)")
672
+ use_llm = gr.Checkbox(value=False, label="Use LLM to paraphrase selected sentences")
673
+ model_name = gr.Textbox(value=os.getenv("OPENAI_MODEL", OPENAI_MODEL),
674
+ label="LLM model", placeholder="e.g., gpt-5 or gpt-5-mini")
675
+ temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
676
+ gr.ChatInterface(
677
+ fn=rag_chat_fn,
678
+ additional_inputs=[top_k, n_sentences, include_passages, use_llm, model_name,
679
+ temperature, strict_quotes_only, w_tfidf, w_bm25, w_emb],
680
+ title="Literature Q&A",
681
+ description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations. Toggle strict/LLM modes."
682
+ )
683
 
684
  # ------------- Launch -------------
685
  if __name__ == "__main__":
686
+ # queue() helps HF Spaces with concurrency; show_error suggests upload PDFs if none
687
  demo.queue().launch()