Inframat-x commited on
Commit
207d57b
·
verified ·
1 Parent(s): b2bcf33

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +831 -0
app.py ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ================================================================
2
+ # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
+ # - Uses local 'papers/' folder for literature
4
+ # - Robust MMR sentence selection (no list index errors)
5
+ # - Predictor: safe model caching + safe feature alignment
6
+ # - Stable categoricals ("NA"); no over-strict completeness gate
7
+ # - Fixed [[PAGE=...]] regex
8
+ # ================================================================
9
+
10
+ # ---------------------- Runtime flags (HF-safe) ----------------------
11
+ import os
12
+ os.environ["TRANSFORMERS_NO_TF"] = "1"
13
+ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
14
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
15
+
16
+ # ------------------------------- Imports ------------------------------
17
+ import re, joblib, warnings, json, traceback
18
+ from pathlib import Path
19
from typing import Any, Dict, List, Optional
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import gradio as gr
24
+
25
+ warnings.filterwarnings("ignore", category=UserWarning)
26
+
27
+ # Optional deps (handled gracefully if missing)
28
+ USE_DENSE = True
29
+ try:
30
+ from sentence_transformers import SentenceTransformer
31
+ except Exception:
32
+ USE_DENSE = False
33
+
34
+ try:
35
+ from rank_bm25 import BM25Okapi
36
+ except Exception:
37
+ BM25Okapi = None
38
+ print("rank_bm25 not installed; BM25 disabled (TF-IDF still works).")
39
+
40
+ # Optional OpenAI (for LLM paraphrase)
41
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
42
+ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
43
+ try:
44
+ from openai import OpenAI
45
+ except Exception:
46
+ OpenAI = None
47
+
48
+ # LLM availability flag — used internally; UI remains hidden
49
+ LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
50
+
51
# ========================= Predictor (kept) =========================
# Column-name constants shared by the input form, the coercion helpers,
# and the trained pipeline. Names must match the training data exactly.
CF_COL = "Conductive Filler Conc. (wt%)"
TARGET_COL = "Stress GF (MPa-1)"
CANON_NA = "NA" # canonical placeholder for categoricals

# All model input columns, in the order the UI collects them.
MAIN_VARIABLES = [
    "Filler 1 Type",
    "Filler 1 Diameter (µm)",
    "Filler 1 Length (mm)",
    CF_COL,
    "Filler 1 Dimensionality",
    "Filler 2 Type",
    "Filler 2 Diameter (µm)",
    "Filler 2 Length (mm)",
    "Filler 2 Dimensionality",
    "Specimen Volume (mm3)",
    "Probe Count",
    "Probe Material",
    "W/B",
    "S/B",
    "Gauge Length (mm)",
    "Curing Condition",
    "Number of Fillers",
    "Drying Temperature (°C)",
    "Drying Duration (hr)",
    "Loading Rate (MPa/s)",
    "Modulus of Elasticity (GPa)",
    "Current Type",
    "Applied Voltage (V)"
]

# Columns coerced to float; missing values become NaN for the imputer.
NUMERIC_COLS = {
    "Filler 1 Diameter (µm)",
    "Filler 1 Length (mm)",
    CF_COL,
    "Filler 2 Diameter (µm)",
    "Filler 2 Length (mm)",
    "Specimen Volume (mm3)",
    "Probe Count",
    "W/B",
    "S/B",
    "Gauge Length (mm)",
    "Number of Fillers",
    "Drying Temperature (°C)",
    "Drying Duration (hr)",
    "Loading Rate (MPa/s)",
    "Modulus of Elasticity (GPa)",
    "Applied Voltage (V)"
}

# Columns normalized to a stable category string (CANON_NA when missing).
CATEGORICAL_COLS = {
    "Filler 1 Type",
    "Filler 1 Dimensionality",
    "Filler 2 Type",
    "Filler 2 Dimensionality",
    "Probe Material",
    "Curing Condition",
    "Current Type"
}

# Dropdown option lists for the predictor form.
DIM_CHOICES = ["0D", "1D", "2D", "3D", CANON_NA]
CURRENT_CHOICES = ["DC", "AC", CANON_NA]

# Locations probed (in order) for the serialized pipeline; the env-var
# entry may be "" and is filtered out by the loader.
MODEL_CANDIDATES = [
    "stress_gf_xgb.joblib",
    "models/stress_gf_xgb.joblib",
    "/home/user/app/stress_gf_xgb.joblib",
    os.getenv("MODEL_PATH", "")
]

# ---------- Model caching + status ----------
# Module-level cache, populated once by _try_load_model() at import time.
# MODEL is the loaded pipeline (or None); MODEL_STATUS is shown in the UI.
MODEL = None
MODEL_STATUS = "🔴 Model not loaded"
124
+
125
def _try_load_model():
    """Load the first readable model from MODEL_CANDIDATES into the cache.

    Sets the module globals MODEL (pipeline or None) and MODEL_STATUS
    (human-readable string for the UI). Never raises: a load failure is
    logged and the next candidate path is tried.
    """
    global MODEL, MODEL_STATUS
    for p in [x for x in MODEL_CANDIDATES if x]:
        if os.path.exists(p):
            try:
                MODEL = joblib.load(p)
                MODEL_STATUS = f"🟢 Loaded model: {Path(p).name}"
                print("[ModelLoad] Loaded:", p)
                return
            except Exception as e:
                print(f"[ModelLoad] Error from {p}: {e}")
                traceback.print_exc()
                MODEL = None  # reset so the next candidate is attempted
    if MODEL is None:
        MODEL_STATUS = "🔴 Model not found (place stress_gf_xgb.joblib at repo root or models/, or set MODEL_PATH)"
        print("[ModelLoad]", MODEL_STATUS)

_try_load_model() # load at import time
143
+
144
+ def _canon_cat(v: Any) -> str:
145
+ """Stable, canonical category placeholder normalization."""
146
+ if v is None:
147
+ return CANON_NA
148
+ s = str(v).strip()
149
+ if s == "" or s.upper() in {"N/A", "NONE", "NULL"}:
150
+ return CANON_NA
151
+ return s
152
+
153
+ def _to_float_or_nan(v):
154
+ if v in ("", None):
155
+ return np.nan
156
+ try:
157
+ return float(str(v).replace(",", ""))
158
+ except Exception:
159
+ return np.nan
160
+
161
def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
    """Convert one form submission into a single-row DataFrame.

    Each column in MAIN_VARIABLES is coerced per its kind: numerics via
    _to_float_or_nan, categoricals via _canon_cat, and anything else to a
    stripped string (CANON_NA when blank).
    """
    def _coerce(col):
        value = form_dict.get(col)
        if col in NUMERIC_COLS:
            return _to_float_or_nan(value)
        if col in CATEGORICAL_COLS:
            return _canon_cat(value)
        text = "" if value is None else str(value).strip()
        return text or CANON_NA

    values = {col: _coerce(col) for col in MAIN_VARIABLES}
    return pd.DataFrame([values], columns=MAIN_VARIABLES)
173
+
174
def _align_columns_to_model(df: pd.DataFrame, mdl) -> pd.DataFrame:
    """
    SAFE alignment:
    - If mdl.feature_names_in_ exists AND is a subset of df.columns (raw names), reorder to it.
    - Else, try a Pipeline step (e.g., 'preprocessor') with feature_names_in_ subset of df.columns.
    - Else, DO NOT align (let the pipeline handle columns by name).
    """
    try:
        # Case 1: the estimator itself exposes the raw input column names.
        feat = getattr(mdl, "feature_names_in_", None)
        if isinstance(feat, (list, np.ndarray, pd.Index)):
            feat = list(feat)
            if all(c in df.columns for c in feat):
                return df[feat]

        # Case 2: a sklearn Pipeline — look for a preprocessing step that
        # recorded its input names at fit time.
        if hasattr(mdl, "named_steps"):
            for key in ["preprocessor", "columntransformer"]:
                if key in mdl.named_steps:
                    step = mdl.named_steps[key]
                    feat2 = getattr(step, "feature_names_in_", None)
                    if isinstance(feat2, (list, np.ndarray, pd.Index)):
                        feat2 = list(feat2)
                        if all(c in df.columns for c in feat2):
                            return df[feat2]
            # fallback to first step if it exposes input names
            try:
                first_key = list(mdl.named_steps.keys())[0]
                step = mdl.named_steps[first_key]
                feat3 = getattr(step, "feature_names_in_", None)
                if isinstance(feat3, (list, np.ndarray, pd.Index)):
                    feat3 = list(feat3)
                    if all(c in df.columns for c in feat3):
                        return df[feat3]
            except Exception:
                pass

        # Case 3: no usable name metadata — pass columns through unchanged.
        return df
    except Exception as e:
        # Alignment is an optimization, never a hard requirement.
        print(f"[Align] Skip aligning due to: {e}")
        traceback.print_exc()
        return df
214
+
215
def predict_fn(**kwargs):
    """
    Always attempt prediction.
    - Missing numerics -> NaN (imputer handles)
    - Categoricals -> 'NA'
    - If model missing or inference error -> 0.0 (keeps UI stable)

    kwargs are keyed by MAIN_VARIABLES column names; returns a float
    gauge-factor prediction on the original (MPa^-1) scale.
    """
    if MODEL is None:
        return 0.0
    X_new = _coerce_to_row(kwargs)
    X_new = _align_columns_to_model(X_new, MODEL)
    try:
        y_raw = MODEL.predict(X_new) # log1p or original scale depending on training
        # target_is_log1p_ is a custom attribute saved on the pipeline at
        # training time; when set, undo the log1p target transform here.
        if getattr(MODEL, "target_is_log1p_", False):
            y = np.expm1(y_raw)
        else:
            y = y_raw
        y = float(np.asarray(y).ravel()[0])
        # Clamp small negative predictions to zero for display.
        return max(y, 0.0)
    except Exception as e:
        print(f"[Predict] {e}")
        traceback.print_exc()
        return 0.0
238
+
239
# Demo values for the "Fill Example" button. Keys must match MAIN_VARIABLES;
# the blank/None Filler 2 entries are coerced downstream to "NA"/NaN.
EXAMPLE = {
    "Filler 1 Type": "CNT",
    "Filler 1 Dimensionality": "1D",
    "Filler 1 Diameter (µm)": 0.02,
    "Filler 1 Length (mm)": 1.2,
    CF_COL: 0.5,
    "Filler 2 Type": "",
    "Filler 2 Dimensionality": CANON_NA,
    "Filler 2 Diameter (µm)": None,
    "Filler 2 Length (mm)": None,
    "Specimen Volume (mm3)": 1000,
    "Probe Count": 2,
    "Probe Material": "Copper",
    "W/B": 0.4,
    "S/B": 2.5,
    "Gauge Length (mm)": 20,
    "Curing Condition": "28d water, 20°C",
    "Number of Fillers": 1,
    "Drying Temperature (°C)": 60,
    "Drying Duration (hr)": 24,
    "Loading Rate (MPa/s)": 0.1,
    "Modulus of Elasticity (GPa)": 25,
    "Current Type": "DC",
    "Applied Voltage (V)": 5.0,
}
264
+
265
def _fill_example():
    """Return the EXAMPLE values ordered to match the form widgets."""
    return [EXAMPLE.get(column) for column in MAIN_VARIABLES]
267
+
268
def _clear_all():
    """Blank values for every input widget, in MAIN_VARIABLES order.

    Numerics clear to None, dropdowns reset to CANON_NA, and free-text
    fields become empty strings.
    """
    dropdown_cols = {"Filler 1 Dimensionality", "Filler 2 Dimensionality", "Current Type"}

    def _blank(col):
        if col in NUMERIC_COLS:
            return None
        return CANON_NA if col in dropdown_cols else ""

    return [_blank(col) for col in MAIN_VARIABLES]
280
+
281
# ========================= Hybrid RAG =========================
# Retriever artifacts cached on disk so restarts skip re-indexing.
ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"

# Literature corpus lives in a local folder; online fetching is opt-in via env.
LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"

# Default retriever blend weights; the dense channel only gets weight when
# sentence-transformers imported successfully (USE_DENSE).
W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40

# Sentence boundary: end punctuation followed by whitespace, or newline runs.
_SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
# Token: word chars plus symbols common in materials text (e.g. "w/b", "0.5%").
TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
298
def sent_split(text: str) -> List[str]:
    """Split *text* at sentence punctuation or newlines, keeping only
    trimmed segments of at least 5 words."""
    pieces = (piece.strip() for piece in re.split(r"(?<=[.!?])\s+|\n+", text))
    return [piece for piece in pieces if piece and len(piece.split()) >= 5]
301
def tokenize(text: str) -> List[str]:
    """Lower-cased tokens: word characters plus #, +, -, /, ., and %."""
    matches = re.findall(r"[A-Za-z0-9_#+\-/\.%]+", text)
    return [match.lower() for match in matches]
303
+
304
def _extract_pdf_text(pdf_path: Path) -> str:
    """Extract a PDF's text with a [[PAGE=n]] marker prepended per page.

    Prefers PyMuPDF (fitz); falls back to pypdf; returns "" when both fail.
    The markers are what _extract_page() later parses for citations.
    NOTE(review): markers sit on their own line and have fewer than 5 words,
    so sent_split() likely filters them out of chunks — verify citations
    actually carry page numbers.
    """
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(pdf_path)
        out = []
        for i, page in enumerate(doc):
            out.append(f"[[PAGE={i+1}]]\n{page.get_text('text') or ''}")
        return "\n\n".join(out)
    except Exception:
        # PyMuPDF missing or failed on this file — try pypdf instead.
        try:
            from pypdf import PdfReader
            reader = PdfReader(str(pdf_path))
            out = []
            for i, p in enumerate(reader.pages):
                txt = p.extract_text() or ""
                out.append(f"[[PAGE={i+1}]]\n{txt}")
            return "\n\n".join(out)
        except Exception as e:
            print(f"PDF read error ({pdf_path}): {e}")
            return ""
324
+
325
def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
    """Join sentences into overlapping windows of up to win_size sentences.

    Consecutive windows share *overlap* sentences; stride is at least 1.
    """
    sentences = sent_split(text)
    stride = max(1, win_size - overlap)
    chunks: List[str] = []
    start = 0
    while start < len(sentences):
        window = sentences[start:start + win_size]
        if not window:
            break
        chunks.append(" ".join(window))
        start += stride
    return chunks
333
+
334
def _safe_init_st_model(name: str):
    """Instantiate a SentenceTransformer, or return None when unavailable.

    On any load failure the global USE_DENSE flag is switched off so the
    rest of the app stops attempting dense retrieval.
    """
    global USE_DENSE
    if not USE_DENSE:
        return None
    try:
        return SentenceTransformer(name)
    except Exception as e:
        print("Dense embeddings unavailable:", e)
        USE_DENSE = False
        return None
344
+
345
def build_or_load_hybrid(pdf_dir: Path):
    """Build (or load from rag_artifacts/) the hybrid retrieval index.

    Returns (vectorizer, X_tfidf, meta, bm25_tokens, emb); the last two are
    None when BM25/dense are unavailable. When no PDFs yield text, meta is
    an empty DataFrame and all other elements are None.
    """
    # Build or load the hybrid retriever cache
    # Cache is usable only if every artifact needed for the enabled
    # channels exists (BM25/dense artifacts are optional when disabled).
    have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
                  and RAG_META_PATH.exists()
                  and (BM25_TOK_PATH.exists() or BM25Okapi is None)
                  and (EMB_NPY_PATH.exists() or not USE_DENSE))
    if have_cache:
        vectorizer = joblib.load(TFIDF_VECT_PATH)
        X_tfidf = joblib.load(TFIDF_MAT_PATH)
        meta = pd.read_parquet(RAG_META_PATH)
        bm25_toks = joblib.load(BM25_TOK_PATH) if BM25Okapi is not None else None
        emb = np.load(EMB_NPY_PATH) if (USE_DENSE and EMB_NPY_PATH.exists()) else None
        return vectorizer, X_tfidf, meta, bm25_toks, emb

    # Fresh build: extract, chunk, and tokenize every PDF under pdf_dir.
    rows, all_tokens = [], []
    pdf_paths = list(Path(pdf_dir).glob("**/*.pdf"))
    print(f"Indexing PDFs in {pdf_dir} — found {len(pdf_paths)} files.")
    for pdf in pdf_paths:
        raw = _extract_pdf_text(pdf)
        if not raw.strip():
            continue
        for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
            rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
            all_tokens.append(tokenize(ch))
    if not rows:
        # Nothing indexed: return an empty skeleton; no artifacts written.
        meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
        vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
        return vectorizer, X_tfidf, meta, all_tokens, emb

    meta = pd.DataFrame(rows)
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=1, max_df=0.95,
        sublinear_tf=True, smooth_idf=True,
        lowercase=True,
        token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b"
    )
    X_tfidf = vectorizer.fit_transform(meta["text"].tolist())

    # Dense embeddings are best-effort; any failure simply disables them.
    emb = None
    if USE_DENSE:
        try:
            st_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
            if st_model is not None:
                from sklearn.preprocessing import normalize as sk_normalize
                em = st_model.encode(meta["text"].tolist(), batch_size=64, show_progress_bar=False, convert_to_numpy=True)
                emb = sk_normalize(em)  # L2-normalize so dot product acts as cosine
                np.save(EMB_NPY_PATH, emb)
        except Exception as e:
            print("Dense embedding failed:", e)
            emb = None

    # Persist artifacts so later launches take the cache path above.
    joblib.dump(vectorizer, TFIDF_VECT_PATH)
    joblib.dump(X_tfidf, TFIDF_MAT_PATH)
    if BM25Okapi is not None:
        joblib.dump(all_tokens, BM25_TOK_PATH)
    meta.to_parquet(RAG_META_PATH, index=False)
    return vectorizer, X_tfidf, meta, all_tokens, emb
404
+
405
# Build the retrieval index once at import time; these module globals are
# read by hybrid_search() and mmr_select_sentences().
tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
408
+
409
+ def _extract_page(text_chunk: str) -> str:
410
+ # Correct: [[PAGE=123]]
411
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
412
+ return (m[-1].group(1) if m else "?")
413
+
414
def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
    """Score every indexed chunk with dense/TF-IDF/BM25 and blend the scores.

    Any unavailable channel contributes zeros and its weight is forced to 0;
    the remaining weights are renormalized to sum to 1. Returns the top-k
    rows of rag_meta with per-channel and blended score columns, or an
    empty DataFrame when nothing is indexed.
    """
    if rag_meta is None or rag_meta.empty:
        return pd.DataFrame()

    # Dense scores
    if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
        try:
            from sklearn.preprocessing import normalize as sk_normalize
            q_emb = st_query_model.encode([query], convert_to_numpy=True)
            q_emb = sk_normalize(q_emb)[0]
            # emb_matrix rows are L2-normalized, so the dot product is cosine.
            dense_scores = emb_matrix @ q_emb
        except Exception as e:
            print("Dense query encoding failed:", e)
            dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
    else:
        dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0

    # TF-IDF scores
    if tfidf_vectorizer is not None and tfidf_matrix is not None:
        q_vec = tfidf_vectorizer.transform([query])
        tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
    else:
        tfidf_scores = np.zeros(len(rag_meta), dtype=float); w_tfidf = 0.0

    # BM25 scores
    if bm25 is not None:
        q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-\/\.%]+", query)]
        bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
    else:
        bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0

    def _norm(x):
        # Min-max normalize so the three score scales are comparable;
        # a constant vector maps to all-zeros instead of dividing by zero.
        x = np.asarray(x, dtype=float)
        if np.allclose(x.max(), x.min()):
            return np.zeros_like(x)
        return (x - x.min()) / (x.max() - x.min())

    s_dense = _norm(dense_scores)
    s_tfidf = _norm(tfidf_scores)
    s_bm25 = _norm(bm25_scores)

    # Renormalize weights; 'or 1.0' guards the all-zero-weights case.
    total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
    w_tfidf, w_bm25, w_emb = w_tfidf/total_w, w_bm25/total_w, w_emb/total_w

    combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
    idx = np.argsort(-combo)[:k]  # indices sorted by descending blended score
    hits = rag_meta.iloc[idx].copy()
    hits["score_dense"] = s_dense[idx]
    hits["score_tfidf"] = s_tfidf[idx]
    hits["score_bm25"] = s_bm25[idx]
    hits["score"] = combo[idx]
    return hits.reset_index(drop=True)
466
+
467
def split_sentences(text: str) -> List[str]:
    """Sentence candidates for MMR: trimmed segments of 6–60 words.

    Inlines sent_split's splitting rule (punctuation-or-newline boundaries);
    the 6-word floor subsumes sent_split's own 5-word minimum.
    """
    pieces = (piece.strip() for piece in re.split(r"(?<=[.!?])\s+|\n+", text))
    return [piece for piece in pieces if piece and 6 <= len(piece.split()) <= 60]
470
+
471
def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_chunk=6, lambda_div=0.7):
    """
    Robust MMR sentence picker:
    - Handles empty pools
    - Clamps top_n to pool size
    - Avoids 'list index out of range'

    Returns a list of {"sent", "doc", "page"} dicts, greedily chosen to
    balance relevance to *question* (weight lambda_div) against similarity
    to already-selected sentences (weight 1 - lambda_div).
    """
    # Build pool: up to pool_per_chunk candidate sentences per retrieved chunk.
    pool = []
    for _, row in hits.iterrows():
        doc = Path(row["doc_path"]).name
        page = _extract_page(row["text"])
        sents = split_sentences(row["text"])
        if not sents:
            continue
        for s in sents[:max(1, int(pool_per_chunk))]:
            pool.append({"sent": s, "doc": doc, "page": page})

    if not pool:
        return []

    # Relevance vectors: dense embeddings when available, else TF-IDF.
    # sim_fn(i, j) gives pairwise sentence similarity for the diversity term.
    sent_texts = [p["sent"] for p in pool]
    use_dense = USE_DENSE and st_query_model is not None
    try:
        if use_dense:
            from sklearn.preprocessing import normalize as sk_normalize
            enc = st_query_model.encode([question] + sent_texts, convert_to_numpy=True)
            q_vec = sk_normalize(enc[:1])[0]
            S = sk_normalize(enc[1:])
            rel = (S @ q_vec)
            def sim_fn(i, j): return float(S[i] @ S[j])
        else:
            from sklearn.feature_extraction.text import TfidfVectorizer
            vect = TfidfVectorizer().fit(sent_texts + [question])
            Q = vect.transform([question]); S = vect.transform(sent_texts)
            rel = (S @ Q.T).toarray().ravel()
            def sim_fn(i, j):
                num = (S[i] @ S[j].T)
                return float(num.toarray()[0, 0]) if hasattr(num, "toarray") else float(num)
    except Exception:
        # Fallback: uniform relevance if vectorization fails
        rel = np.ones(len(sent_texts), dtype=float)
        def sim_fn(i, j): return 0.0

    # Normalize lambda_div
    lambda_div = float(np.clip(lambda_div, 0.0, 1.0))

    # Select first by highest relevance
    remain = list(range(len(pool)))
    if not remain:
        return []
    first = int(np.argmax(rel))
    selected_idx = [first]
    selected = [pool[first]]
    remain.remove(first)

    # Clamp top_n so we never index past the pool.
    max_pick = min(int(top_n), len(pool))
    while len(selected) < max_pick and remain:
        cand_scores = []
        for i in remain:
            # Penalize candidates by their closest already-selected sentence.
            div_i = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
            score = lambda_div * float(rel[i]) - (1.0 - lambda_div) * div_i
            cand_scores.append((score, i))
        if not cand_scores:
            break
        cand_scores.sort(reverse=True)
        _, best_i = cand_scores[0]
        selected_idx.append(best_i)
        selected.append(pool[best_i])
        remain.remove(best_i)

    return selected
545
+
546
def compose_extractive(selected: List[Dict[str, Any]]) -> str:
    """Join selected sentences into one string, each followed by its
    inline (doc, p.page) citation. Empty input yields ""."""
    if not selected:
        return ""
    cited = [
        "{0} ({1}, p.{2})".format(item["sent"], item["doc"], item["page"])
        for item in selected
    ]
    return " ".join(cited)
550
+
551
def synthesize_with_llm(question: str, sentence_lines: List[str], model: Optional[str] = None, temperature: float = 0.2) -> Optional[str]:
    """Paraphrase the selected sentences with an LLM.

    Args:
        question: The user's literature question.
        sentence_lines: Pre-cited sentences the model must restrict itself to.
        model: Override model name; defaults to OPENAI_MODEL.
        temperature: Sampling temperature for the API call.

    Returns:
        The synthesized answer text, or None when the LLM is not configured
        or any API error occurs (callers fall back to extractive mode).

    Fixes vs. original: the return annotation was `-> str` despite every
    failure path returning None, and the OpenAI() constructor sat outside
    the try block, so a constructor error crashed the reply path instead
    of triggering the documented None-on-failure fallback.
    """
    if not LLM_AVAILABLE:
        return None
    model = model or OPENAI_MODEL
    SYSTEM_PROMPT = (
        "You are a scientific assistant for self-sensing cementitious materials.\n"
        "Answer STRICTLY using the provided sentences.\n"
        "Do not invent facts. Keep it concise (3–6 sentences).\n"
        "Retain inline citations like (Doc.pdf, p.X) exactly as given."
    )
    user_prompt = (
        f"Question: {question}\n\n"
        f"Use ONLY these sentences to answer; keep their inline citations:\n" +
        "\n".join(f"- {s}" for s in sentence_lines)
    )
    try:
        # Constructor inside the try: client setup failures degrade to None
        # just like request failures, keeping the caller's fallback path.
        client = OpenAI(api_key=OPENAI_API_KEY)
        resp = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=temperature,
        )
        return getattr(resp, "output_text", None) or str(resp)
    except Exception:
        return None
579
+
580
def rag_reply(
    question: str,
    k: int = 8,
    n_sentences: int = 4,
    include_passages: bool = False,
    use_llm: bool = False,
    model: str = None,
    temperature: float = 0.2,
    strict_quotes_only: bool = False,
    w_tfidf: float = W_TFIDF_DEFAULT,
    w_bm25: float = W_BM25_DEFAULT,
    w_emb: float = W_EMB_DEFAULT
) -> str:
    """Answer a literature question from the local index as markdown.

    Retrieves top-k chunks, picks diverse sentences via MMR, then formats
    a reply with (Doc.pdf, p.X) citations. Mode precedence: strict quotes
    -> LLM synthesis (when enabled and successful) -> extractive join ->
    raw passages as a last resort.
    """
    hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
    if hits is None or hits.empty:
        return "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."

    selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
    # Header citations come from the top 6 chunks; coverage note warns when
    # fewer than 3 distinct documents contributed.
    header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
    srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
    coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."

    if strict_quotes_only:
        # Quotes-only mode: verbatim sentences, or raw passages if MMR found none.
        if not selected:
            return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
        msg = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
        msg += f"\n\n**Citations:** {header_cites}{coverage_note}"
        if include_passages:
            msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
        return msg

    extractive = compose_extractive(selected)
    if use_llm and selected:
        # LLM paraphrase is best-effort; on None we fall through to extractive.
        lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
        llm_text = synthesize_with_llm(question, lines, model=model, temperature=temperature)
        if llm_text:
            msg = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
            if include_passages:
                msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
            return msg

    if not extractive:
        # No usable sentences at all: show the top raw passages instead.
        return f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])

    msg = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
    if include_passages:
        msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
    return msg
628
+
629
def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
                use_llm, model_name, temperature, strict_quotes_only,
                w_tfidf, w_bm25, w_emb):
    """Gradio ChatInterface callback: coerce UI values and delegate to
    rag_reply(); any exception is rendered as text so the chat never breaks."""
    if not message or not message.strip():
        return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
    try:
        # Coercions stay inside the try so bad widget values surface as
        # a "RAG error" message rather than an unhandled exception.
        options = {
            "question": message,
            "k": int(top_k),
            "n_sentences": int(n_sentences),
            "include_passages": bool(include_passages),
            "use_llm": bool(use_llm),
            "model": (model_name or None),
            "temperature": float(temperature),
            "strict_quotes_only": bool(strict_quotes_only),
            "w_tfidf": float(w_tfidf),
            "w_bm25": float(w_bm25),
            "w_emb": float(w_emb),
        }
        return rag_reply(**options)
    except Exception as exc:
        return f"RAG error: {exc}"
650
+
651
# ========================= UI (science-oriented styling) =========================
# Raw CSS injected into the Gradio page; #rag-tab selectors scope the
# literature tab, #pred-out styles the prediction readout.
CSS = """
/* Science-oriented: crisp contrast + readable numerics */
* {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
.gradio-container {
  background: linear-gradient(135deg, #0b1020 0%, #0c2b1a 60%, #0a2b4d 100%) !important;
}
.card {background: rgba(255,255,255,0.06) !important; border: 1px solid rgba(255,255,255,0.14); border-radius: 12px;}
label {color: #e8f7ff !important; text-shadow: 0 1px 0 rgba(0,0,0,0.35); cursor: pointer;}
input[type="number"] {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;}

/* Checkbox clickability fixes */
input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
.gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
#rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }

/* RAG tab background and elements */
#rag-tab .block, #rag-tab .group, #rag-tab .accordion {
  background: linear-gradient(160deg, #1f2937 0%, #14532d 55%, #0b3b68 100%) !important;
  border-radius: 12px;
  border: 1px solid rgba(255,255,255,0.14);
}
#rag-tab input, #rag-tab textarea, #rag-tab select, #rag-tab .scroll-hide, #rag-tab .chatbot textarea {
  background: rgba(17, 24, 39, 0.85) !important;
  border: 1px solid #60a5fa !important;
  color: #e5f2ff !important;
}
#rag-tab input[type="range"] { accent-color: #22c55e !important; }
#rag-tab button { border-radius: 10px !important; font-weight: 600 !important; }
#rag-tab .chatbot {
  background: rgba(15, 23, 42, 0.6) !important;
  border: 1px solid rgba(148, 163, 184, 0.35) !important;
}
#rag-tab .message.user {
  background: rgba(34, 197, 94, 0.15) !important;
  border-left: 3px solid #22c55e !important;
}
#rag-tab .message.bot {
  background: rgba(59, 130, 246, 0.15) !important;
  border-left: 3px solid #60a5fa !important;
  color: #eef6ff !important;
}

/* Predictor output emphasis */
#pred-out .wrap { font-size: 20px; font-weight: 700; color: #ecfdf5; }
"""
697
+
698
# Gradio theme: dark blue/green palette matching the CSS gradients.
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="green"
).set(
    body_background_fill="#0b1020",
    body_text_color="#e0f2fe",
    input_background_fill="#0f172a",
    input_border_color="#1e40af",
    button_primary_background_fill="#2563eb",
    button_primary_text_color="#ffffff",
    button_secondary_background_fill="#14532d",
    button_secondary_text_color="#ecfdf5",
)
711
+
712
# Top-level UI: two tabs — the XGB predictor form and the RAG chat.
with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
    gr.Markdown(
        "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
        "<p style='opacity:.9'>"
        "Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
        "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection."
        "</p>"
    )

    with gr.Tabs():
        # ------------------------- Predictor Tab -------------------------
        with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
            with gr.Row():
                with gr.Column(scale=7):
                    # Input widgets grouped into accordions; one widget per
                    # MAIN_VARIABLES column (see inputs_in_order below).
                    with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
                        f1_type = gr.Textbox(label="Filler 1 Type *", placeholder="e.g., CNT, Graphite, Steel fiber")
                        f1_diam = gr.Number(label="Filler 1 Diameter (µm) *")
                        f1_len = gr.Number(label="Filler 1 Length (mm) *")
                        cf_conc = gr.Number(label=f"{CF_COL} *", info="Weight percent of total binder")
                        f1_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 1 Dimensionality *")

                    with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
                        f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
                        f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
                        f2_len = gr.Number(label="Filler 2 Length (mm)")
                        f2_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 2 Dimensionality")

                    with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
                        spec_vol = gr.Number(label="Specimen Volume (mm3) *")
                        probe_cnt = gr.Number(label="Probe Count *")
                        probe_mat = gr.Textbox(label="Probe Material *", placeholder="e.g., Copper, Silver paste")
                        wb = gr.Number(label="W/B *")
                        sb = gr.Number(label="S/B *")
                        gauge_len = gr.Number(label="Gauge Length (mm) *")
                        curing = gr.Textbox(label="Curing Condition *", placeholder="e.g., 28d water, 20°C")
                        n_fillers = gr.Number(label="Number of Fillers *")

                    with gr.Accordion("Processing", open=False, elem_classes=["card"]):
                        dry_temp = gr.Number(label="Drying Temperature (°C)")
                        dry_hrs = gr.Number(label="Drying Duration (hr)")

                    with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
                        load_rate = gr.Number(label="Loading Rate (MPa/s)")
                        E_mod = gr.Number(label="Modulus of Elasticity (GPa) *")
                        current = gr.Dropdown(CURRENT_CHOICES, value=CANON_NA, label="Current Type")
                        voltage = gr.Number(label="Applied Voltage (V)")

                with gr.Column(scale=5):
                    with gr.Group(elem_classes=["card"]):
                        out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", value=0.0, precision=6, elem_id="pred-out")
                        # MODEL_STATUS was set once at import; shown as-is.
                        gr.Markdown(f"<small>{MODEL_STATUS}</small>")
                        with gr.Row():
                            btn_pred = gr.Button("Predict", variant="primary")
                            btn_clear = gr.Button("Clear")
                            btn_demo = gr.Button("Fill Example")

                    with gr.Accordion("About this model", open=False, elem_classes=["card"]):
                        gr.Markdown(
                            "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
                            "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model may train on log1p; saved flag used at inference).\n"
                            "- Missing values are safely imputed per-feature.\n"
                            "- Trained columns:\n"
                            f"  `{', '.join(MAIN_VARIABLES)}`",
                            elem_classes=["prose"]
                        )

            # Widget order here must mirror MAIN_VARIABLES exactly; the
            # wrapper zips the two lists into the predict_fn kwargs.
            inputs_in_order = [
                f1_type, f1_diam, f1_len, cf_conc,
                f1_dim, f2_type, f2_diam, f2_len,
                f2_dim, spec_vol, probe_cnt, probe_mat,
                wb, sb, gauge_len, curing, n_fillers,
                dry_temp, dry_hrs, load_rate,
                E_mod, current, voltage
            ]

            def _predict_wrapper(*vals):
                # Map positional widget values onto named model columns.
                data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
                return predict_fn(**data)

            btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
            # Clear resets all inputs, then the chained .then() zeroes the readout.
            btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order).then(lambda: 0.0, outputs=out_pred)
            btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)

        # ------------------------- Literature Tab -------------------------
        with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
            pdf_count = len(list(LOCAL_PDF_DIR.glob("**/*.pdf")))
            gr.Markdown(
                f"Using local folder <code>papers/</code> — **{pdf_count} PDF(s)** indexed. "
                "Upload more PDFs and reload the Space to expand coverage. Answers cite (Doc.pdf, p.X)."
            )
            with gr.Row():
                top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
                n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
                include_passages = gr.Checkbox(value=False, label="Include supporting passages", interactive=True)

            with gr.Accordion("Retriever weights (advanced)", open=False):
                w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
                w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
                w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")

            # Hidden states (unchanged)
            # LLM options are fixed server-side; no visible controls for them.
            state_use_llm = gr.State(LLM_AVAILABLE)
            state_model_name = gr.State(os.getenv("OPENAI_MODEL", OPENAI_MODEL))
            state_temperature = gr.State(0.2)
            state_strict = gr.State(False)

            gr.ChatInterface(
                fn=rag_chat_fn,
                additional_inputs=[
                    top_k, n_sentences, include_passages,
                    state_use_llm, state_model_name, state_temperature, state_strict,
                    w_tfidf, w_bm25, w_emb
                ],
                title="Literature Q&A",
                description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
            )

# ------------- Launch -------------
if __name__ == "__main__":
    demo.queue().launch()