Files changed (1) hide show
  1. app.py +664 -101
app.py CHANGED
@@ -1,124 +1,687 @@
1
- # ========================= UI (predictor styling kept) =========================
2
- CSS = """
3
- /* ----- Base layout: light, high-contrast ----- */
4
- .gradio-container {
5
- background: #f8fafc !important; /* slate-50 */
6
- color: #0f172a !important; /* slate-900 */
7
- --card-bg: #ffffff;
8
- --card-brd: #e2e8f0; /* slate-200 */
9
- }
10
 
11
- /* Cards / groups / accordions */
12
- .card, .gr-accordion, .gr-group, .gr-box {
13
- background: var(--card-bg) !important;
14
- border: 1px solid var(--card-brd) !important;
15
- box-shadow: 0 1px 2px rgba(15, 23, 42, 0.06);
16
- border-radius: 14px !important;
17
- }
18
 
19
- /* Inputs: white background, dark text */
20
- .gradio-container input,
21
- .gradio-container textarea,
22
- .gradio-container select,
23
- .gradio-container .gr-input,
24
- .gradio-container .gr-textbox textarea {
25
- background: #ffffff !important;
26
- color: #0f172a !important;
27
- border: 1px solid #cbd5e1 !important; /* slate-300 */
28
- }
29
 
30
- /* Buttons */
31
- .gradio-container button {
32
- font-weight: 700 !important;
33
- }
34
 
35
- /* ----- Label colors by component type (Blue / Green / Red) ----- */
36
- /* Blue: text-like fields & sliders */
37
- .gradio-container .gr-textbox label,
38
- .gradio-container .gr-markdown h1,
39
- .gradio-container .gr-markdown h2,
40
- .gradio-container .gr-markdown h3,
41
- .gradio-container .gr-slider label {
42
- color: #1d4ed8 !important; /* blue-700 */
43
- font-weight: 700 !important;
44
- text-shadow: 0 0 0.01px rgba(29,78,216,0.3);
45
- }
46
 
47
- /* Green: selections & toggles */
48
- .gradio-container .gr-dropdown label,
49
- .gradio-container .gr-checkbox label,
50
- .gradio-container .gr-checkbox-group label {
51
- color: #166534 !important; /* green-800 */
52
- font-weight: 700 !important;
53
- text-shadow: 0 0 0.01px rgba(22,101,52,0.3);
54
- }
55
 
56
- /* Red: numeric/measurement inputs (to stand out) */
57
- .gradio-container .gr-number label {
58
- color: #b91c1c !important; /* red-700 */
59
- font-weight: 800 !important;
60
- text-shadow: 0 0 0.01px rgba(185,28,28,0.25);
61
- }
62
 
63
- /* Secondary hint/info text under labels */
64
- .gradio-container .label > .text-gray-500,
65
- .gradio-container .label .secondary-text,
66
- .gradio-container .gr-input .text-gray-500 {
67
- color: #334155 !important; /* slate-700 */
68
- }
 
69
 
70
- /* Tabs: clearer selected state */
71
- .gradio-container .tabs .tabitem.selected {
72
- border-bottom: 3px solid #1d4ed8 !important; /* blue underline */
73
- font-weight: 800 !important;
74
- }
75
 
76
- /* Chat bubbles: better contrast */
77
- .gradio-container .message.user {
78
- background: #e0f2fe !important; /* sky-100 */
79
- border: 1px solid #bae6fd !important;
80
- color: #0c4a6e !important;
81
- }
82
- .gradio-container .message.bot {
83
- background: #ecfdf5 !important; /* emerald-50 */
84
- border: 1px solid #d1fae5 !important;
85
- color: #064e3b !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  }
87
 
88
- /* Sliders & focus states */
89
- .gradio-container input:focus,
90
- .gradio-container textarea:focus,
91
- .gradio-container select:focus {
92
- outline: 2px solid #1d4ed8 !important; /* blue focus ring */
93
- border-color: #1d4ed8 !important;
 
 
94
  }
95
 
96
- /* Headline block at top */
97
- .gradio-container h1, .gradio-container .prose h1 {
98
- color: #0f172a !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  }
100
 
101
- /* Small bump to label size */
102
- .gradio-container label {
103
- font-size: 0.98rem !important;
104
- letter-spacing: 0.1px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  }
 
 
 
106
  """
107
 
108
- # Tailwind-like hues mapped into Gradio theme tokens
109
  theme = gr.themes.Soft(
110
  primary_hue="blue",
111
- secondary_hue="green",
112
- neutral_hue="slate"
113
  ).set(
114
- body_background_fill="#f8fafc",
115
- body_text_color="#0f172a",
116
- input_background_fill="#ffffff",
117
- input_border_color="#cbd5e1",
118
- button_primary_background_fill="#1d4ed8", # blue
119
  button_primary_text_color="#ffffff",
120
- button_secondary_background_fill="#16a34a", # green
121
- button_secondary_text_color="#ffffff",
122
- radius_large="14px",
123
- spacing_size="8px"
124
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ================================================================
# Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
# - Predictor tab: identical behavior to your "second code"
# - Literature tab: from your "first code" (Hybrid RAG + MMR)
# - Hugging Face friendly: online PDF fetching OFF by default
# ================================================================

# ---------------------- Runtime flags (HF-safe) ----------------------
# Must be set BEFORE transformers/sentence-transformers import, to keep
# them from probing for TF/Flax backends on a CPU-only Space.
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ------------------------------- Imports ------------------------------
# NOTE(review): `time` and `json` appear unused in this file — confirm before removing.
import re, time, joblib, warnings, json
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import gradio as gr

warnings.filterwarnings("ignore", category=UserWarning)

# Optional deps (handled gracefully if missing)
# USE_DENSE gates all dense-embedding retrieval; flipped off again later
# if model download/instantiation fails (see _safe_init_st_model).
USE_DENSE = True
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    USE_DENSE = False

try:
    from rank_bm25 import BM25Okapi
except Exception:
    BM25Okapi = None  # sentinel checked throughout: BM25 scoring disabled
    print("rank_bm25 not installed; BM25 disabled (TF-IDF still works).")

# Optional OpenAI (for LLM paraphrase)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
try:
    from openai import OpenAI
except Exception:
    OpenAI = None  # sentinel: LLM synthesis silently unavailable
45
 
46
# ========================= Predictor (kept same as 2nd) =========================
# Column names must match the training DataFrame exactly (including units
# and the micro sign in "µm") — the fitted ColumnTransformer selects by name.
CF_COL = "Conductive Filler Conc. (wt%)"
TARGET_COL = "Stress GF (MPa-1)"

# Feature order expected by the pipeline; also drives UI widget wiring.
MAIN_VARIABLES = [
    "Filler 1 Type",
    "Filler 1 Diameter (µm)",
    "Filler 1 Length (mm)",
    CF_COL,
    "Filler 1 Dimensionality",
    "Filler 2 Type",
    "Filler 2 Diameter (µm)",
    "Filler 2 Length (mm)",
    "Filler 2 Dimensionality",
    "Specimen Volume (mm3)",
    "Probe Count",
    "Probe Material",
    "W/B",
    "S/B",
    "Gauge Length (mm)",
    "Curing Condition",
    "Number of Fillers",
    "Drying Temperature (°C)",
    "Drying Duration (hr)",
    "Loading Rate (MPa/s)",
    "Modulus of Elasticity (GPa)",
    "Current Type",
    "Applied Voltage (V)"
]

# Columns coerced to float (blank/unparseable -> NaN) in _coerce_to_row.
NUMERIC_COLS = {
    "Filler 1 Diameter (µm)",
    "Filler 1 Length (mm)",
    CF_COL,
    "Filler 2 Diameter (µm)",
    "Filler 2 Length (mm)",
    "Specimen Volume (mm3)",
    "Probe Count",
    "W/B",
    "S/B",
    "Gauge Length (mm)",
    "Number of Fillers",
    "Drying Temperature (°C)",
    "Drying Duration (hr)",
    "Loading Rate (MPa/s)",
    "Modulus of Elasticity (GPa)",
    "Applied Voltage (V)"
}

# NOTE(review): CATEGORICAL_COLS is not referenced by the visible code
# (_coerce_to_row treats "not numeric" as categorical) — kept for documentation.
CATEGORICAL_COLS = {
    "Filler 1 Type",
    "Filler 1 Dimensionality",
    "Filler 2 Type",
    "Filler 2 Dimensionality",
    "Probe Material",
    "Curing Condition",
    "Current Type"
}

# Dropdown choice lists; "NA" maps to empty string before prediction.
DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
CURRENT_CHOICES = ["DC", "AC", "NA"]

# Candidate locations for the serialized pipeline, tried in order.
MODEL_CANDIDATES = [
    "stress_gf_xgb.joblib",
    "models/stress_gf_xgb.joblib",
    "/home/user/app/stress_gf_xgb.joblib",
]
113
+
114
def _load_model_or_error():
    """Load the trained XGB pipeline from the first usable candidate path.

    Returns:
        The unpickled model object on success, or a human-readable error
        string on failure (``predict_fn`` distinguishes the two with
        ``isinstance(result, str)``).
    """
    load_errors = []
    for p in MODEL_CANDIDATES:
        if not os.path.exists(p):
            continue
        try:
            return joblib.load(p)
        except Exception as e:
            # Fix: remember the failure but keep trying the remaining
            # candidates instead of aborting on the first corrupt or
            # version-incompatible file.
            load_errors.append(f"Could not load model from {p}: {e}")
    if load_errors:
        # At least one file existed but could not be loaded; surface the
        # first (highest-priority) failure to the user.
        return load_errors[0]
    return ("Model file not found. Upload your trained pipeline as "
            "stress_gf_xgb.joblib (or put it in models/).")
123
+
124
def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
    """Normalize raw UI form values into a single-row DataFrame.

    Numeric columns become floats (blank/None/unparseable -> NaN); all
    other columns become stripped strings, with None and the "NA"
    dropdown placeholder collapsing to "". Column order follows
    MAIN_VARIABLES so the fitted pipeline sees its training layout.
    """
    def _as_float(value):
        # Empty string / None / unparseable input all map to NaN.
        if value in ("", None):
            return np.nan
        try:
            return float(value)
        except Exception:
            return np.nan

    def _as_text(value):
        # None and the "NA" placeholder collapse to an empty string.
        return "" if value in (None, "NA") else str(value).strip()

    row = {
        col: (_as_float(form_dict.get(col)) if col in NUMERIC_COLS
              else _as_text(form_dict.get(col)))
        for col in MAIN_VARIABLES
    }
    return pd.DataFrame([row], columns=MAIN_VARIABLES)
139
+
140
def predict_fn(**kwargs):
    """Predict Stress GF (MPa^-1) from keyword-form feature values.

    Returns a float on success, or an error message string when the
    model is missing/unloadable or prediction fails.
    """
    model = _load_model_or_error()
    if isinstance(model, str):
        # Loader returned an error message rather than a model object.
        return model
    features = _coerce_to_row(kwargs)
    try:
        # The pipeline was trained on log1p(target); invert with expm1.
        log_pred = model.predict(features)
        value = float(np.expm1(log_pred)[0])
        # Clamp tiny negative round-off (e.g. -1e-12) to exactly zero.
        return 0.0 if -1e-10 < value < 0 else value
    except Exception as exc:
        return f"Prediction error: {exc}"
153
+
154
# One plausible CNT-based mix used by the "Fill Example" button.
# Keys must match MAIN_VARIABLES; missing keys fall back to None in _fill_example.
EXAMPLE = {
    "Filler 1 Type": "CNT",
    "Filler 1 Dimensionality": "1D",
    "Filler 1 Diameter (µm)": 0.02,
    "Filler 1 Length (mm)": 1.2,
    CF_COL: 0.5,
    "Filler 2 Type": "",
    "Filler 2 Dimensionality": "NA",
    "Filler 2 Diameter (µm)": None,   # no secondary filler in this example
    "Filler 2 Length (mm)": None,
    "Specimen Volume (mm3)": 1000,
    "Probe Count": 2,
    "Probe Material": "Copper",
    "W/B": 0.4,
    "S/B": 2.5,
    "Gauge Length (mm)": 20,
    "Curing Condition": "28d water, 20°C",
    "Number of Fillers": 1,
    "Drying Temperature (°C)": 60,
    "Drying Duration (hr)": 24,
    "Loading Rate (MPa/s)": 0.1,
    "Modulus of Elasticity (GPa)": 25,
    "Current Type": "DC",
    "Applied Voltage (V)": 5.0,
}
179
 
180
def _fill_example():
    """Return the EXAMPLE values in MAIN_VARIABLES order (for widget outputs)."""
    ordered = []
    for key in MAIN_VARIABLES:
        ordered.append(EXAMPLE.get(key, None))
    return ordered
182
+
183
def _clear_all():
    """Produce reset values for every input widget, in MAIN_VARIABLES order."""
    # Dropdowns whose "cleared" state is the explicit "NA" choice.
    na_dropdowns = {"Filler 1 Dimensionality", "Filler 2 Dimensionality",
                    "Current Type"}

    def _blank(col):
        if col in NUMERIC_COLS:
            return None   # empty Number widget
        if col in na_dropdowns:
            return "NA"   # dropdown back to its placeholder choice
        return ""         # free-text fields become empty

    return [_blank(col) for col in MAIN_VARIABLES]
195
+
196
# ========================= Hybrid RAG (from 1st code) =========================
# Configuration
# Cached index artifacts live here so the PDF corpus is only parsed once.
ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"

# PDF source (HF-safe: rely on local /papers by default)
# NOTE(review): USE_ONLINE_SOURCES is read from the environment but not
# referenced elsewhere in this file — confirm whether online fetching
# was intentionally removed.
LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"

# Retrieval weights: with dense embeddings available the mass is split
# 0.30/0.30/0.40 (tfidf/bm25/dense); otherwise 0.50/0.50 lexical only.
W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_EMB_DEFAULT = 0.00 if not USE_DENSE else 0.40
213
+
214
# Simple text processing
# Split after sentence-ending punctuation, or on any run of newlines.
_SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
# "Word-ish" tokens: letters/digits plus the symbols _ # + - / . %
TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")


def sent_split(text: str) -> List[str]:
    """Split *text* into sentences, keeping only those with >= 5 words."""
    candidates = (piece.strip() for piece in _SENT_SPLIT_RE.split(text))
    return [c for c in candidates if c and len(c.split()) >= 5]


def tokenize(text: str) -> List[str]:
    """Return lowercased word-ish tokens from *text*."""
    return [match.lower() for match in TOKEN_RE.findall(text)]
222
+
223
# PDF text extraction (PyMuPDF preferred; pypdf fallback)
def _extract_pdf_text(pdf_path: Path) -> str:
    """Extract full text from a PDF, tagging each page with [[PAGE=n]].

    Tries PyMuPDF (fitz) first; falls back to pypdf. Returns "" (after
    printing a diagnostic) when neither backend can read the file.
    """
    try:
        import fitz  # PyMuPDF
        pages = []
        for idx, page in enumerate(fitz.open(pdf_path)):
            body = page.get_text('text') or ''
            pages.append(f"[[PAGE={idx+1}]]\n{body}")
        return "\n\n".join(pages)
    except Exception:
        pass  # fall through to the pypdf backend

    try:
        from pypdf import PdfReader
        pages = []
        for idx, page in enumerate(PdfReader(str(pdf_path)).pages):
            body = page.extract_text() or ""
            pages.append(f"[[PAGE={idx+1}]]\n{body}")
        return "\n\n".join(pages)
    except Exception as e:
        print(f"PDF read error ({pdf_path}): {e}")
        return ""
244
+
245
def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
    """Chunk *text* into overlapping windows of up to *win_size* sentences.

    Consecutive windows advance by ``win_size - overlap`` sentences (at
    least 1), so adjacent chunks share *overlap* sentences of context.
    """
    sentences = sent_split(text)
    stride = max(1, win_size - overlap)
    windows, start = [], 0
    while start < len(sentences):
        windows.append(" ".join(sentences[start:start + win_size]))
        start += stride
    return windows
253
+
254
def _safe_init_st_model(name: str):
    """Instantiate a SentenceTransformer, or None if dense mode is off/broken.

    Side effect: on instantiation failure the module-global USE_DENSE is
    flipped to False so later code stops attempting dense retrieval.
    """
    global USE_DENSE
    if USE_DENSE:
        try:
            return SentenceTransformer(name)
        except Exception as err:
            print("Dense embeddings unavailable:", err)
            USE_DENSE = False
    return None
264
+
265
# Build or load index
def build_or_load_hybrid(pdf_dir: Path):
    """Build (or load from rag_artifacts/) the hybrid retrieval index.

    Returns a 5-tuple ``(vectorizer, X_tfidf, meta, bm25_tokens, emb)``;
    any element may be None when that backend is unavailable. When no
    PDFs yield text, returns an all-None stub with an empty meta frame
    so the UI can tell the user to upload PDFs instead of crashing.
    """
    # Cache is usable only if every artifact required by the currently
    # available backends exists (BM25/dense artifacts optional when disabled).
    have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
                  and RAG_META_PATH.exists()
                  and (BM25_TOK_PATH.exists() or BM25Okapi is None)
                  and (EMB_NPY_PATH.exists() or not USE_DENSE))
    if have_cache:
        vectorizer = joblib.load(TFIDF_VECT_PATH)
        X_tfidf = joblib.load(TFIDF_MAT_PATH)
        meta = pd.read_parquet(RAG_META_PATH)
        bm25_toks = joblib.load(BM25_TOK_PATH) if BM25Okapi is not None else None
        emb = np.load(EMB_NPY_PATH) if (USE_DENSE and EMB_NPY_PATH.exists()) else None
        return vectorizer, X_tfidf, meta, bm25_toks, emb

    # Fresh build: extract, chunk, and tokenize every PDF under pdf_dir.
    rows, all_tokens = [], []
    pdf_paths = list(Path(pdf_dir).glob("**/*.pdf"))
    print(f"Indexing PDFs in {pdf_dir} — found {len(pdf_paths)} files.")
    for pdf in pdf_paths:
        raw = _extract_pdf_text(pdf)
        if not raw.strip():
            continue  # unreadable/empty PDF: skip silently
        for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
            rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
            all_tokens.append(tokenize(ch))
    if not rows:
        # create empty stub to avoid crashes; UI will message user to upload PDFs
        meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
        vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
        return vectorizer, X_tfidf, meta, all_tokens, emb

    meta = pd.DataFrame(rows)

    # Sparse lexical index (unigrams + bigrams).
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=1, max_df=0.95,
        sublinear_tf=True, smooth_idf=True,
        lowercase=True,
        token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b"
    )
    X_tfidf = vectorizer.fit_transform(meta["text"].tolist())

    # Optional dense embeddings, L2-normalized so dot product = cosine sim.
    emb = None
    if USE_DENSE:
        try:
            st_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
            if st_model is not None:
                from sklearn.preprocessing import normalize as sk_normalize
                em = st_model.encode(meta["text"].tolist(), batch_size=64, show_progress_bar=False, convert_to_numpy=True)
                emb = sk_normalize(em)
                np.save(EMB_NPY_PATH, emb)
        except Exception as e:
            print("Dense embedding failed:", e)
            emb = None

    # Save artifacts
    joblib.dump(vectorizer, TFIDF_VECT_PATH)
    joblib.dump(X_tfidf, TFIDF_MAT_PATH)
    if BM25Okapi is not None:
        joblib.dump(all_tokens, BM25_TOK_PATH)
    meta.to_parquet(RAG_META_PATH, index=False)

    return vectorizer, X_tfidf, meta, all_tokens, emb
328
+
329
# Build (or load) the retrieval index once at import time; on HF Spaces
# this runs during app startup.
tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
# BM25 index over the per-chunk token lists (None when disabled or empty corpus).
bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
# Separate encoder instance for queries; None when dense retrieval is off.
st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
332
+
333
+ def _extract_page(text_chunk: str) -> str:
334
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
335
+ return (m[-1].group(1) if m else "?")
336
+
337
def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
    """Rank indexed chunks for *query* by a weighted blend of three scorers.

    Each available scorer (dense cosine, TF-IDF, BM25) is min-max
    normalized; the weight of any unavailable scorer is zeroed and the
    remaining weights are renormalized. Returns the top-*k* rows of
    rag_meta with score columns added, or an empty DataFrame when no
    PDFs are indexed.
    """
    if rag_meta is None or rag_meta.empty:
        return pd.DataFrame()

    # Dense scores (emb_matrix rows are L2-normalized, so @ = cosine sim)
    if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
        try:
            from sklearn.preprocessing import normalize as sk_normalize
            q_emb = st_query_model.encode([query], convert_to_numpy=True)
            q_emb = sk_normalize(q_emb)[0]
            dense_scores = emb_matrix @ q_emb
        except Exception as e:
            print("Dense query encoding failed:", e)
            dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
    else:
        dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0

    # TF-IDF scores
    if tfidf_vectorizer is not None and tfidf_matrix is not None:
        q_vec = tfidf_vectorizer.transform([query])
        tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
    else:
        tfidf_scores = np.zeros(len(rag_meta), dtype=float); w_tfidf = 0.0

    # BM25 scores (same token pattern as the module-level TOKEN_RE)
    if bm25 is not None:
        q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
        bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
    else:
        bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0

    def _norm(x):
        # Min-max normalize to [0, 1]; constant vectors become all zeros.
        x = np.asarray(x, dtype=float)
        if np.allclose(x.max(), x.min()):
            return np.zeros_like(x)
        return (x - x.min()) / (x.max() - x.min())

    s_dense = _norm(dense_scores)
    s_tfidf = _norm(tfidf_scores)
    s_bm25 = _norm(bm25_scores)

    # Renormalize weights so active scorers always sum to 1
    # (`or 1.0` guards against division by zero when all weights are 0).
    total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
    w_tfidf, w_bm25, w_emb = w_tfidf/total_w, w_bm25/total_w, w_emb/total_w

    combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
    idx = np.argsort(-combo)[:k]  # indices of the k highest combined scores
    hits = rag_meta.iloc[idx].copy()
    hits["score_dense"] = s_dense[idx]
    hits["score_tfidf"] = s_tfidf[idx]
    hits["score_bm25"] = s_bm25[idx]
    hits["score"] = combo[idx]
    return hits.reset_index(drop=True)
389
+
390
def split_sentences(text: str) -> List[str]:
    """Sentences suitable for quoting: between 6 and 60 words long."""
    return [sentence for sentence in sent_split(text)
            if 6 <= len(sentence.split()) <= 60]
393
+
394
def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_chunk=6, lambda_div=0.7):
    """Pick *top_n* diverse, relevant sentences from retrieved chunks via MMR.

    Builds a candidate pool of up to *pool_per_chunk* sentences per hit
    (each tagged with its source doc and page), scores relevance with
    dense embeddings when available (TF-IDF fallback), then applies
    Maximal Marginal Relevance: score = lambda_div * relevance
    - (1 - lambda_div) * max similarity to already-selected sentences.
    Returns a list of {"sent", "doc", "page"} dicts (possibly empty).
    """
    pool = []
    for _, row in hits.iterrows():
        doc = Path(row["doc_path"]).name
        page = _extract_page(row["text"])
        for s in split_sentences(row["text"])[:pool_per_chunk]:
            pool.append({"sent": s, "doc": doc, "page": page})
    if not pool:
        return []

    sent_texts = [p["sent"] for p in pool]

    # Embedding-based relevance if available, else TF-IDF
    use_dense = USE_DENSE and st_query_model is not None
    if use_dense:
        try:
            from sklearn.preprocessing import normalize as sk_normalize
            # Encode question + candidates in one batch; normalize so dot = cosine.
            texts = [question] + sent_texts
            enc = st_query_model.encode(texts, convert_to_numpy=True)
            q_vec = sk_normalize(enc[:1])[0]
            S = sk_normalize(enc[1:])
            rel = (S @ q_vec)
            def sim_fn(i, j): return float(S[i] @ S[j])
        except Exception:
            use_dense = False  # fall through to the TF-IDF path below

    if not use_dense:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vect = TfidfVectorizer().fit(sent_texts + [question])
        Q = vect.transform([question]); S = vect.transform(sent_texts)
        rel = (S @ Q.T).toarray().ravel()
        def sim_fn(i, j): return float((S[i] @ S[j].T).toarray()[0, 0])

    # Greedy MMR: seed with the single most relevant sentence.
    selected, selected_idx = [], []
    remain = list(range(len(pool)))
    first = int(np.argmax(rel))
    selected.append(pool[first]); selected_idx.append(first); remain.remove(first)

    while len(selected) < top_n and remain:
        cand_scores = []
        for i in remain:
            # Penalize redundancy against everything already selected.
            sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
            score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
            cand_scores.append((score, i))
        cand_scores.sort(reverse=True)
        best_i = cand_scores[0][1]
        selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
    return selected
442
+
443
def compose_extractive(selected: List[Dict[str, Any]]) -> str:
    """Join selected sentences into one string with inline (doc, p.N) citations."""
    if not selected:
        return ""
    cited = [f"{item['sent']} ({item['doc']}, p.{item['page']})"
             for item in selected]
    return " ".join(cited)
447
+
448
def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2) -> str:
    """Paraphrase the selected sentences into a short answer via OpenAI.

    Returns the LLM's text, or None when the OpenAI SDK/key is absent or
    the API call fails — callers fall back to the extractive answer.
    NOTE(review): `temperature` is forwarded to responses.create; confirm
    the configured model accepts it.
    """
    if OPENAI_API_KEY is None or OpenAI is None:
        return None
    client = OpenAI(api_key=OPENAI_API_KEY)
    model = model or OPENAI_MODEL
    # Grounding prompt: the model may only rephrase the supplied sentences
    # and must keep their inline (Doc.pdf, p.X) citations.
    SYSTEM_PROMPT = (
        "You are a scientific assistant for self-sensing cementitious materials.\n"
        "Answer STRICTLY using the provided sentences.\n"
        "Do not invent facts. Keep it concise (3–6 sentences).\n"
        "Retain inline citations like (Doc.pdf, p.X) exactly as given."
    )
    user_prompt = (
        f"Question: {question}\n\n"
        f"Use ONLY these sentences to answer; keep their inline citations:\n" +
        "\n".join(f"- {s}" for s in sentence_lines)
    )
    try:
        resp = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=temperature,
        )
        # Prefer the SDK's convenience accessor; stringify as a last resort.
        return getattr(resp, "output_text", None) or str(resp)
    except Exception:
        return None  # any API failure degrades gracefully to extractive mode
476
+
477
def rag_reply(
    question: str,
    k: int = 8,                       # top-K chunks from hybrid_search
    n_sentences: int = 4,             # sentences selected by MMR
    include_passages: bool = False,   # append 2 raw supporting chunks
    use_llm: bool = False,            # paraphrase via synthesize_with_llm
    model: str = None,
    temperature: float = 0.2,
    strict_quotes_only: bool = False, # bullet-quote mode, no paraphrase
    w_tfidf: float = W_TFIDF_DEFAULT,
    w_bm25: float = W_BM25_DEFAULT,
    w_emb: float = W_EMB_DEFAULT
) -> str:
    """Answer a literature question as Markdown with inline citations.

    Pipeline: hybrid retrieval -> MMR sentence selection -> one of three
    renderings (strict quotes / LLM synthesis / extractive join), each
    followed by a citation header and an optional coverage warning when
    fewer than 3 distinct source PDFs contributed.
    """
    hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
    if hits is None or hits.empty:
        return "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."

    selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
    # Citation header from the 6 best chunks; coverage note if source diversity is low.
    header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
    srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
    coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."

    if strict_quotes_only:
        # No paraphrasing at all: bullet the selected sentences verbatim,
        # or fall back to raw passages when MMR found nothing quotable.
        if not selected:
            return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
        msg = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
        msg += f"\n\n**Citations:** {header_cites}{coverage_note}"
        if include_passages:
            msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
        return msg

    extractive = compose_extractive(selected)
    if use_llm and selected:
        lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
        llm_text = synthesize_with_llm(question, lines, model=model, temperature=temperature)
        if llm_text:  # None means LLM unavailable/failed -> extractive fallback below
            msg = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
            if include_passages:
                msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
            return msg

    if not extractive:
        # MMR selected nothing usable: show raw passages instead.
        return f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])

    msg = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
    if include_passages:
        msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
    return msg
525
+
526
def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
                use_llm, model_name, temperature, strict_quotes_only,
                w_tfidf, w_bm25, w_emb):
    """Gradio ChatInterface adapter: coerce widget values and call rag_reply.

    Any failure (including bad numeric conversions) is returned as a
    "RAG error: ..." string so the chat never raises.
    """
    if not message or not message.strip():
        return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
    try:
        # Conversions stay inside the try so a bad widget value surfaces
        # as a chat message instead of an exception.
        options = {
            "question": message,
            "k": int(top_k),
            "n_sentences": int(n_sentences),
            "include_passages": bool(include_passages),
            "use_llm": bool(use_llm),
            "model": (model_name or None),
            "temperature": float(temperature),
            "strict_quotes_only": bool(strict_quotes_only),
            "w_tfidf": float(w_tfidf),
            "w_bm25": float(w_bm25),
            "w_emb": float(w_emb),
        }
        return rag_reply(**options)
    except Exception as e:
        return f"RAG error: {e}"
547
+
548
# ========================= UI (predictor styling kept) =========================
# CSS is a runtime string injected into the page; the selectors target
# Gradio's rendered DOM. NOTE(review): `label.svelte-1ipelgc` pins a
# Svelte-generated class name that can change between Gradio versions.
CSS = """
/* Blue to green gradient background */
.gradio-container {
  background: linear-gradient(135deg, #1e3a8a 0%, #166534 60%, #15803d 100%) !important;
}
* {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
.card {background: rgba(255,255,255,0.07) !important; border: 1px solid rgba(255,255,255,0.12);}
label.svelte-1ipelgc {color: #e0f2fe !important;}
"""

# Dark blue/green theme tokens matching the CSS gradient above.
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="green"
).set(
    body_background_fill="#1e3a8a",
    body_text_color="#e0f2fe",
    input_background_fill="#172554",
    input_border_color="#1e40af",
    button_primary_background_fill="#2563eb",
    button_primary_text_color="#ffffff",
    button_secondary_background_fill="#14532d",
    button_secondary_text_color="#ecfdf5",
)

with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
    gr.Markdown(
        "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
        "<p style='opacity:.9'>"
        "Left tab: ML prediction for Stress Gauge Factor (kept identical to your deployed predictor). "
        "Right tab: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection. "
        "Upload PDFs into <code>papers/</code> in your Space repo."
        "</p>"
    )

    with gr.Tabs():
        # ------------------------- Predictor Tab -------------------------
        with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
            with gr.Row():
                with gr.Column(scale=7):
                    with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
                        f1_type = gr.Textbox(label="Filler 1 Type", placeholder="e.g., CNT, Graphite, Steel fiber")
                        f1_diam = gr.Number(label="Filler 1 Diameter (µm)")
                        f1_len = gr.Number(label="Filler 1 Length (mm)")
                        cf_conc = gr.Number(label=f"{CF_COL}", info="Weight percent of total binder")
                        f1_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 1 Dimensionality")

                    with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
                        f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
                        f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
                        f2_len = gr.Number(label="Filler 2 Length (mm)")
                        f2_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 2 Dimensionality")

                    with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
                        spec_vol = gr.Number(label="Specimen Volume (mm3)")
                        probe_cnt = gr.Number(label="Probe Count")
                        probe_mat = gr.Textbox(label="Probe Material", placeholder="e.g., Copper, Silver paste")
                        wb = gr.Number(label="W/B")
                        sb = gr.Number(label="S/B")
                        gauge_len = gr.Number(label="Gauge Length (mm)")
                        curing = gr.Textbox(label="Curing Condition", placeholder="e.g., 28d water, 20°C")
                        n_fillers = gr.Number(label="Number of Fillers")

                    with gr.Accordion("Processing", open=False, elem_classes=["card"]):
                        dry_temp = gr.Number(label="Drying Temperature (°C)")
                        dry_hrs = gr.Number(label="Drying Duration (hr)")

                    with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
                        load_rate = gr.Number(label="Loading Rate (MPa/s)")
                        E_mod = gr.Number(label="Modulus of Elasticity (GPa)")
                        current = gr.Dropdown(CURRENT_CHOICES, value="NA", label="Current Type")
                        voltage = gr.Number(label="Applied Voltage (V)")

                with gr.Column(scale=5):
                    with gr.Group(elem_classes=["card"]):
                        # NOTE(review): predict_fn can return an error *string*,
                        # which a gr.Number output may not render — confirm.
                        out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", precision=6)
                        with gr.Row():
                            btn_pred = gr.Button("Predict", variant="primary")
                            btn_clear = gr.Button("Clear")
                            btn_demo = gr.Button("Fill Example")

                    with gr.Accordion("About this model", open=False, elem_classes=["card"]):
                        gr.Markdown(
                            "- Pipeline: ColumnTransformer -> (RobustScaler + OneHot) -> XGBoost\n"
                            "- Target: Stress GF (MPa^-1) on original scale (model trains on log1p).\n"
                            "- Missing values are safely imputed per-feature.\n"
                            "- Trained columns:\n"
                            f"  `{', '.join(MAIN_VARIABLES)}`"
                        )

            # Wire predictor buttons
            # Widget order MUST match MAIN_VARIABLES: _predict_wrapper zips
            # the two lists positionally.
            inputs_in_order = [
                f1_type, f1_diam, f1_len, cf_conc,
                f1_dim, f2_type, f2_diam, f2_len,
                f2_dim, spec_vol, probe_cnt, probe_mat,
                wb, sb, gauge_len, curing, n_fillers,
                dry_temp, dry_hrs, load_rate,
                E_mod, current, voltage
            ]

            def _predict_wrapper(*vals):
                # Re-associate positional widget values with column names.
                data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
                return predict_fn(**data)

            btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
            btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order)
            btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)

        # ------------------------- Literature Tab -------------------------
        with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)"):
            gr.Markdown(
                "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
                "Answers cite (Doc.pdf, p.X). Toggle strict quotes or optional LLM paraphrasing."
            )
            with gr.Row():
                top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
                n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
                include_passages = gr.Checkbox(value=False, label="Include supporting passages")
            with gr.Accordion("Retriever weights (advanced)", open=False):
                w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
                w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
                w_emb = gr.Slider(0.0, 1.0, value=W_EMB_DEFAULT, step=0.05, label="Dense weight (set 0 if disabled)")
            with gr.Accordion("LLM & Controls", open=False):
                strict_quotes_only = gr.Checkbox(value=False, label="Strict quotes only (no paraphrasing)")
                use_llm = gr.Checkbox(value=False, label="Use LLM to paraphrase selected sentences")
                model_name = gr.Textbox(value=os.getenv("OPENAI_MODEL", OPENAI_MODEL),
                                        label="LLM model", placeholder="e.g., gpt-5 or gpt-5-mini")
                temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
            # additional_inputs order must match rag_chat_fn's parameters
            # after (message, history).
            gr.ChatInterface(
                fn=rag_chat_fn,
                additional_inputs=[top_k, n_sentences, include_passages, use_llm, model_name,
                                   temperature, strict_quotes_only, w_tfidf, w_bm25, w_emb],
                title="Literature Q&A",
                description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations. Toggle strict/LLM modes."
            )

# ------------- Launch -------------
if __name__ == "__main__":
    # queue() helps HF Spaces with concurrency; show_error suggests upload PDFs if none
    demo.queue().launch()