Inframat-x committed on
Commit
480df8c
·
1 Parent(s): 0688a59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +689 -687
app.py CHANGED
@@ -1,687 +1,689 @@
1
- # ================================================================
2
- # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
- # - Predictor tab: identical behavior to your "second code"
4
- # - Literature tab: from your "first code" (Hybrid RAG + MMR)
5
- # - Hugging Face friendly: online PDF fetching OFF by default
6
- # ================================================================
7
-
8
- # ---------------------- Runtime flags (HF-safe) ----------------------
9
- import os
10
- os.environ["TRANSFORMERS_NO_TF"] = "1"
11
- os.environ["TRANSFORMERS_NO_FLAX"] = "1"
12
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
-
14
- # ------------------------------- Imports ------------------------------
15
- import re, time, joblib, warnings, json
16
- from pathlib import Path
17
- from typing import List, Dict, Any
18
-
19
- import numpy as np
20
- import pandas as pd
21
- import gradio as gr
22
-
23
- warnings.filterwarnings("ignore", category=UserWarning)
24
-
25
- # Optional deps (handled gracefully if missing)
26
- USE_DENSE = True
27
- try:
28
- from sentence_transformers import SentenceTransformer
29
- except Exception:
30
- USE_DENSE = False
31
-
32
- try:
33
- from rank_bm25 import BM25Okapi
34
- except Exception:
35
- BM25Okapi = None
36
- print("rank_bm25 not installed; BM25 disabled (TF-IDF still works).")
37
-
38
- # Optional OpenAI (for LLM paraphrase)
39
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
40
- OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
41
- try:
42
- from openai import OpenAI
43
- except Exception:
44
- OpenAI = None
45
-
46
# ========================= Predictor (kept same as 2nd) =========================
# Column-name constants shared by the trained pipeline and the UI form.
CF_COL = "Conductive Filler Conc. (wt%)"   # dosage of the primary conductive filler
TARGET_COL = "Stress GF (MPa-1)"           # regression target: stress gauge factor

# Exact feature columns (and their order) the trained pipeline expects.
# _coerce_to_row() and the widget wiring below both depend on this order.
MAIN_VARIABLES = [
    "Filler 1 Type",
    "Filler 1 Diameter (µm)",
    "Filler 1 Length (mm)",
    CF_COL,
    "Filler 1 Dimensionality",
    "Filler 2 Type",
    "Filler 2 Diameter (µm)",
    "Filler 2 Length (mm)",
    "Filler 2 Dimensionality",
    "Specimen Volume (mm3)",
    "Probe Count",
    "Probe Material",
    "W/B",
    "S/B",
    "Gauge Length (mm)",
    "Curing Condition",
    "Number of Fillers",
    "Drying Temperature (°C)",
    "Drying Duration (hr)",
    "Loading Rate (MPa/s)",
    "Modulus of Elasticity (GPa)",
    "Current Type",
    "Applied Voltage (V)"
]

# Columns coerced to float (missing/invalid -> NaN) before prediction.
NUMERIC_COLS = {
    "Filler 1 Diameter (µm)",
    "Filler 1 Length (mm)",
    CF_COL,
    "Filler 2 Diameter (µm)",
    "Filler 2 Length (mm)",
    "Specimen Volume (mm3)",
    "Probe Count",
    "W/B",
    "S/B",
    "Gauge Length (mm)",
    "Number of Fillers",
    "Drying Temperature (°C)",
    "Drying Duration (hr)",
    "Loading Rate (MPa/s)",
    "Modulus of Elasticity (GPa)",
    "Applied Voltage (V)"
}

# Columns passed through as stripped strings ("NA"/None become "").
CATEGORICAL_COLS = {
    "Filler 1 Type",
    "Filler 1 Dimensionality",
    "Filler 2 Type",
    "Filler 2 Dimensionality",
    "Probe Material",
    "Curing Condition",
    "Current Type"
}

# Dropdown option sets for the UI.
DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
CURRENT_CHOICES = ["DC", "AC", "NA"]

# Locations probed, in order, for the serialized model pipeline.
MODEL_CANDIDATES = [
    "stress_gf_xgb.joblib",
    "models/stress_gf_xgb.joblib",
    "/home/user/app/stress_gf_xgb.joblib",
]
113
-
114
- def _load_model_or_error():
115
- for p in MODEL_CANDIDATES:
116
- if os.path.exists(p):
117
- try:
118
- return joblib.load(p)
119
- except Exception as e:
120
- return f"Could not load model from {p}: {e}"
121
- return ("Model file not found. Upload your trained pipeline as "
122
- "stress_gf_xgb.joblib (or put it in models/).")
123
-
124
def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
    """Build a single-row DataFrame matching the pipeline's training schema.

    Numeric columns are coerced to float (blank/invalid -> NaN); all other
    columns become stripped strings, with None/"NA" mapped to "".
    """
    def _as_number(value):
        if value in ("", None):
            return np.nan
        try:
            return float(value)
        except Exception:
            return np.nan

    def _as_category(value):
        return "" if value in (None, "NA") else str(value).strip()

    row = {
        col: (_as_number(form_dict.get(col)) if col in NUMERIC_COLS
              else _as_category(form_dict.get(col)))
        for col in MAIN_VARIABLES
    }
    return pd.DataFrame([row], columns=MAIN_VARIABLES)
139
-
140
def predict_fn(**kwargs):
    """Predict the Stress GF (MPa^-1) for one set of form values.

    Returns a float on success, or an error string when the model is
    missing or the prediction fails (the UI displays either).
    """
    model = _load_model_or_error()
    if isinstance(model, str):
        # Loader returned an error message instead of a model.
        return model
    features = _coerce_to_row(kwargs)
    try:
        log_pred = model.predict(features)        # model predicts log1p(target)
        value = float(np.expm1(log_pred)[0])      # back to original scale MPa^-1
        # Clamp tiny negative round-off back to exactly zero.
        return 0.0 if -1e-10 < value < 0 else value
    except Exception as e:
        return f"Prediction error: {e}"
153
-
154
# One plausible input set for the "Fill Example" button.
# Keys must match MAIN_VARIABLES; None leaves the numeric field blank.
EXAMPLE = {
    "Filler 1 Type": "CNT",
    "Filler 1 Dimensionality": "1D",
    "Filler 1 Diameter (µm)": 0.02,
    "Filler 1 Length (mm)": 1.2,
    CF_COL: 0.5,
    "Filler 2 Type": "",
    "Filler 2 Dimensionality": "NA",
    "Filler 2 Diameter (µm)": None,
    "Filler 2 Length (mm)": None,
    "Specimen Volume (mm3)": 1000,
    "Probe Count": 2,
    "Probe Material": "Copper",
    "W/B": 0.4,
    "S/B": 2.5,
    "Gauge Length (mm)": 20,
    "Curing Condition": "28d water, 20°C",
    "Number of Fillers": 1,
    "Drying Temperature (°C)": 60,
    "Drying Duration (hr)": 24,
    "Loading Rate (MPa/s)": 0.1,
    "Modulus of Elasticity (GPa)": 25,
    "Current Type": "DC",
    "Applied Voltage (V)": 5.0,
}
179
-
180
def _fill_example():
    """Return EXAMPLE's values ordered like MAIN_VARIABLES (Fill Example button)."""
    return [EXAMPLE.get(column) for column in MAIN_VARIABLES]
182
-
183
def _clear_all():
    """Blank values for every predictor input (Clear button).

    Numeric fields become None, the NA-style dropdowns reset to "NA",
    and free-text fields become empty strings.
    """
    na_dropdowns = {"Filler 1 Dimensionality", "Filler 2 Dimensionality", "Current Type"}

    def _blank(col):
        if col in NUMERIC_COLS:
            return None
        return "NA" if col in na_dropdowns else ""

    return [_blank(col) for col in MAIN_VARIABLES]
195
-
196
# ========================= Hybrid RAG (from 1st code) =========================
# Configuration
# Cached retrieval artifacts (built once, reused across restarts).
ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"   # fitted TfidfVectorizer
TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"        # chunk x term sparse matrix
BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"          # per-chunk token lists
EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"         # normalized dense embeddings
RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"              # chunk text + provenance

# PDF source (HF-safe: rely on local /papers by default)
LOCAL_PDF_DIR = Path("./literature_pdfs"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"

# Retrieval weights: lexical-only 50/50 split when dense embeddings are
# unavailable, otherwise 30/30/40 in favor of the dense retriever.
W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_EMB_DEFAULT = 0.00 if not USE_DENSE else 0.40
213
-
214
# Lightweight text processing -------------------------------------------------
# Sentence boundary: after ./!/? followed by whitespace, or any newline run.
_SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
# Token: alphanumerics plus chars common in materials notation (wt%, A-B, 0.5).
TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")

def sent_split(text: str) -> List[str]:
    """Split *text* into sentences, keeping only those with at least 5 words."""
    pieces = (piece.strip() for piece in _SENT_SPLIT_RE.split(text))
    return [piece for piece in pieces if piece and len(piece.split()) >= 5]

def tokenize(text: str) -> List[str]:
    """Return lowercased tokens of *text* (for BM25 indexing)."""
    return [match.lower() for match in TOKEN_RE.findall(text)]
222
-
223
- # PDF text extraction (PyMuPDF preferred; pypdf fallback)
224
def _extract_pdf_text(pdf_path: Path) -> str:
    """Extract page-tagged text from a PDF (PyMuPDF preferred, pypdf fallback).

    Each page's text is prefixed with a "[[PAGE=n]]" marker so downstream
    chunking can recover page numbers for citations. Returns "" when the
    file cannot be read by either backend.
    """
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(pdf_path)
        try:
            return "\n\n".join(
                f"[[PAGE={i+1}]]\n{page.get_text('text') or ''}"
                for i, page in enumerate(doc)
            )
        finally:
            # Fix: the original never closed the document, leaking a file
            # handle per PDF during bulk indexing.
            doc.close()
    except Exception:
        try:
            from pypdf import PdfReader
            reader = PdfReader(str(pdf_path))
            out = []
            for i, p in enumerate(reader.pages):
                txt = p.extract_text() or ""
                out.append(f"[[PAGE={i+1}]]\n{txt}")
            return "\n\n".join(out)
        except Exception as e:
            # Best-effort: log and skip unreadable files instead of crashing.
            print(f"PDF read error ({pdf_path}): {e}")
            return ""
244
-
245
def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
    """Chunk *text* into overlapping windows of *win_size* sentences.

    Consecutive windows share *overlap* sentences; returns [] when *text*
    yields no usable sentences.
    """
    sentences = sent_split(text)
    stride = max(1, win_size - overlap)
    return [
        " ".join(sentences[start:start + win_size])
        for start in range(0, len(sentences), stride)
    ]
253
-
254
def _safe_init_st_model(name: str):
    """Instantiate a SentenceTransformer called *name*, or return None.

    On any failure (package missing, model download blocked, etc.) dense
    retrieval is permanently disabled for this process by flipping the
    module-level USE_DENSE flag, so later calls return None immediately.
    """
    global USE_DENSE
    if not USE_DENSE:
        return None
    try:
        return SentenceTransformer(name)
    except Exception as e:
        print("Dense embeddings unavailable:", e)
        USE_DENSE = False  # don't retry on subsequent calls
        return None
264
-
265
- # Build or load index
266
def build_or_load_hybrid(pdf_dir: Path):
    """Build the hybrid retrieval index over PDFs in *pdf_dir*, or load it from cache.

    Returns a 5-tuple: (tfidf_vectorizer, tfidf_matrix, chunk_metadata_df,
    bm25_token_lists, dense_embeddings). Any element except the metadata
    DataFrame may be None when the corresponding backend is unavailable or
    when no PDFs were found.
    """
    # Cache is usable only when every artifact required by the enabled
    # backends exists (BM25/dense artifacts are optional if that backend is off).
    have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
                  and RAG_META_PATH.exists()
                  and (BM25_TOK_PATH.exists() or BM25Okapi is None)
                  and (EMB_NPY_PATH.exists() or not USE_DENSE))
    if have_cache:
        vectorizer = joblib.load(TFIDF_VECT_PATH)
        X_tfidf = joblib.load(TFIDF_MAT_PATH)
        meta = pd.read_parquet(RAG_META_PATH)
        bm25_toks = joblib.load(BM25_TOK_PATH) if BM25Okapi is not None else None
        emb = np.load(EMB_NPY_PATH) if (USE_DENSE and EMB_NPY_PATH.exists()) else None
        return vectorizer, X_tfidf, meta, bm25_toks, emb

    # Full rebuild: extract and chunk every PDF under pdf_dir (recursive).
    rows, all_tokens = [], []
    pdf_paths = list(Path(pdf_dir).glob("**/*.pdf"))
    print(f"Indexing PDFs in {pdf_dir} — found {len(pdf_paths)} files.")
    for pdf in pdf_paths:
        raw = _extract_pdf_text(pdf)
        if not raw.strip():
            continue
        for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
            rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
            all_tokens.append(tokenize(ch))
    if not rows:
        # create empty stub to avoid crashes; UI will message user to upload PDFs
        meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
        vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
        return vectorizer, X_tfidf, meta, all_tokens, emb

    meta = pd.DataFrame(rows)

    # Sparse lexical index: TF-IDF over uni/bi-grams with a token pattern
    # matching the materials-notation tokens used elsewhere.
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=1, max_df=0.95,
        sublinear_tf=True, smooth_idf=True,
        lowercase=True,
        token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b"
    )
    X_tfidf = vectorizer.fit_transform(meta["text"].tolist())

    # Optional dense embeddings, L2-normalized so a dot product == cosine.
    emb = None
    if USE_DENSE:
        try:
            st_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
            if st_model is not None:
                from sklearn.preprocessing import normalize as sk_normalize
                em = st_model.encode(meta["text"].tolist(), batch_size=64, show_progress_bar=False, convert_to_numpy=True)
                emb = sk_normalize(em)
                np.save(EMB_NPY_PATH, emb)
        except Exception as e:
            print("Dense embedding failed:", e)
            emb = None

    # Save artifacts
    joblib.dump(vectorizer, TFIDF_VECT_PATH)
    joblib.dump(X_tfidf, TFIDF_MAT_PATH)
    if BM25Okapi is not None:
        joblib.dump(all_tokens, BM25_TOK_PATH)
    meta.to_parquet(RAG_META_PATH, index=False)

    return vectorizer, X_tfidf, meta, all_tokens, emb
328
-
329
# Build (or load the cached) retrieval index once at startup, then create the
# BM25 scorer and the dense query encoder from the same artifacts.
tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
332
-
333
- def _extract_page(text_chunk: str) -> str:
334
- m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
335
- return (m[-1].group(1) if m else "?")
336
-
337
def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
    """Rank indexed chunks for *query* by a weighted blend of retriever scores.

    Each available backend (dense embeddings, TF-IDF, BM25) scores every
    chunk; scores are min-max normalized, weights of unavailable backends
    are zeroed, remaining weights are re-normalized to sum to 1, and the
    top *k* chunks are returned as a DataFrame with per-backend score
    columns plus the combined "score". Returns an empty DataFrame when
    nothing is indexed.
    """
    if rag_meta is None or rag_meta.empty:
        return pd.DataFrame()

    # Dense scores (cosine via dot product — emb_matrix rows are normalized)
    if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
        try:
            from sklearn.preprocessing import normalize as sk_normalize
            q_emb = st_query_model.encode([query], convert_to_numpy=True)
            q_emb = sk_normalize(q_emb)[0]
            dense_scores = emb_matrix @ q_emb
        except Exception as e:
            print("Dense query encoding failed:", e)
            dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
    else:
        dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0

    # TF-IDF scores
    if tfidf_vectorizer is not None and tfidf_matrix is not None:
        q_vec = tfidf_vectorizer.transform([query])
        tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
    else:
        tfidf_scores = np.zeros(len(rag_meta), dtype=float); w_tfidf = 0.0

    # BM25 scores
    if bm25 is not None:
        q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
        bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
    else:
        bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0

    def _norm(x):
        # Min-max normalize; an all-constant vector maps to all zeros.
        x = np.asarray(x, dtype=float)
        if np.allclose(x.max(), x.min()):
            return np.zeros_like(x)
        return (x - x.min()) / (x.max() - x.min())

    s_dense = _norm(dense_scores)
    s_tfidf = _norm(tfidf_scores)
    s_bm25 = _norm(bm25_scores)

    # Re-normalize remaining weights (the `or 1.0` guards all-zero weights).
    total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
    w_tfidf, w_bm25, w_emb = w_tfidf/total_w, w_bm25/total_w, w_emb/total_w

    combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
    idx = np.argsort(-combo)[:k]
    hits = rag_meta.iloc[idx].copy()
    hits["score_dense"] = s_dense[idx]
    hits["score_tfidf"] = s_tfidf[idx]
    hits["score_bm25"] = s_bm25[idx]
    hits["score"] = combo[idx]
    return hits.reset_index(drop=True)
389
-
390
def split_sentences(text: str) -> List[str]:
    """Sentences of *text* sized for MMR selection (6 to 60 words inclusive)."""
    return [sentence for sentence in sent_split(text)
            if 6 <= len(sentence.split()) <= 60]
393
-
394
def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_chunk=6, lambda_div=0.7):
    """Pick up to *top_n* relevant-but-diverse sentences from retrieved chunks.

    Builds a candidate pool (first *pool_per_chunk* usable sentences per
    chunk, each tagged with its source doc and page), scores relevance to
    *question* with dense embeddings when available (TF-IDF cosine
    otherwise), then greedily applies Maximal Marginal Relevance:
    *lambda_div* weighs relevance against the max similarity to anything
    already selected. Returns a list of {"sent", "doc", "page"} dicts
    (empty when no candidate sentences exist).
    """
    pool = []
    for _, row in hits.iterrows():
        doc = Path(row["doc_path"]).name
        page = _extract_page(row["text"])
        for s in split_sentences(row["text"])[:pool_per_chunk]:
            pool.append({"sent": s, "doc": doc, "page": page})
    if not pool:
        return []

    sent_texts = [p["sent"] for p in pool]

    # Embedding-based relevance if available, else TF-IDF
    use_dense = USE_DENSE and st_query_model is not None
    if use_dense:
        try:
            from sklearn.preprocessing import normalize as sk_normalize
            texts = [question] + sent_texts
            enc = st_query_model.encode(texts, convert_to_numpy=True)
            q_vec = sk_normalize(enc[:1])[0]
            S = sk_normalize(enc[1:])
            rel = (S @ q_vec)
            # sim_fn closes over the dense sentence matrix S.
            def sim_fn(i, j): return float(S[i] @ S[j])
        except Exception:
            use_dense = False

    if not use_dense:
        # Sparse fallback: fit a throwaway TF-IDF on the pool + question.
        from sklearn.feature_extraction.text import TfidfVectorizer
        vect = TfidfVectorizer().fit(sent_texts + [question])
        Q = vect.transform([question]); S = vect.transform(sent_texts)
        rel = (S @ Q.T).toarray().ravel()
        def sim_fn(i, j): return float((S[i] @ S[j].T).toarray()[0, 0])

    # Greedy MMR: seed with the single most relevant sentence, then repeatedly
    # add the candidate maximizing lambda*relevance - (1-lambda)*redundancy.
    selected, selected_idx = [], []
    remain = list(range(len(pool)))
    first = int(np.argmax(rel))
    selected.append(pool[first]); selected_idx.append(first); remain.remove(first)

    while len(selected) < top_n and remain:
        cand_scores = []
        for i in remain:
            sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
            score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
            cand_scores.append((score, i))
        cand_scores.sort(reverse=True)
        best_i = cand_scores[0][1]
        selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
    return selected
442
-
443
def compose_extractive(selected: List[Dict[str, Any]]) -> str:
    """Join selected sentences into one string with inline (doc, page) citations."""
    cited = [f"{item['sent']} ({item['doc']}, p.{item['page']})" for item in selected]
    return " ".join(cited)
447
-
448
def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2) -> str:
    """Paraphrase the selected cited sentences into a short answer via OpenAI.

    Returns the synthesized text, or None when the OpenAI package/key is
    unavailable or the API call fails — callers treat None as "fall back
    to the extractive answer". (Despite the `-> str` annotation, None is a
    deliberate return value here.)
    """
    if OPENAI_API_KEY is None or OpenAI is None:
        return None
    client = OpenAI(api_key=OPENAI_API_KEY)
    model = model or OPENAI_MODEL
    SYSTEM_PROMPT = (
        "You are a scientific assistant for self-sensing cementitious materials.\n"
        "Answer STRICTLY using the provided sentences.\n"
        "Do not invent facts. Keep it concise (3–6 sentences).\n"
        "Retain inline citations like (Doc.pdf, p.X) exactly as given."
    )
    user_prompt = (
        f"Question: {question}\n\n"
        f"Use ONLY these sentences to answer; keep their inline citations:\n" +
        "\n".join(f"- {s}" for s in sentence_lines)
    )
    try:
        # Uses the OpenAI Responses API (client.responses.create).
        resp = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=temperature,
        )
        # output_text is the convenience accessor; fall back to str(resp).
        return getattr(resp, "output_text", None) or str(resp)
    except Exception:
        return None
476
-
477
def rag_reply(
    question: str,
    k: int = 8,
    n_sentences: int = 4,
    include_passages: bool = False,
    use_llm: bool = False,
    model: str = None,
    temperature: float = 0.2,
    strict_quotes_only: bool = False,
    w_tfidf: float = W_TFIDF_DEFAULT,
    w_bm25: float = W_BM25_DEFAULT,
    w_emb: float = W_EMB_DEFAULT
) -> str:
    """Answer *question* from the indexed literature; returns Markdown.

    Pipeline: hybrid retrieval of *k* chunks -> MMR sentence selection ->
    one of three renderings: strict verbatim quotes, LLM paraphrase (when
    enabled and available), or the extractive answer. Every mode carries
    (Doc.pdf, p.X) citations plus a coverage note when fewer than three
    distinct sources contributed.
    """
    hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
    if hits is None or hits.empty:
        return "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."

    selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
    header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
    srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
    coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."

    # Mode 1: verbatim quotes only, no paraphrasing of any kind.
    if strict_quotes_only:
        if not selected:
            return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
        msg = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
        msg += f"\n\n**Citations:** {header_cites}{coverage_note}"
        if include_passages:
            msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
        return msg

    # Mode 2: optional LLM paraphrase; silently falls through on failure.
    extractive = compose_extractive(selected)
    if use_llm and selected:
        lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
        llm_text = synthesize_with_llm(question, lines, model=model, temperature=temperature)
        if llm_text:
            msg = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
            if include_passages:
                msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
            return msg

    # Mode 3: extractive answer (raw passages when no sentence survived MMR).
    if not extractive:
        return f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])

    msg = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
    if include_passages:
        msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
    return msg
525
-
526
def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
                use_llm, model_name, temperature, strict_quotes_only,
                w_tfidf, w_bm25, w_emb):
    """Gradio ChatInterface callback: validate the message, coerce widget
    values to their expected types, and delegate to rag_reply.

    Any exception is rendered as an error string so the chat UI never crashes.
    """
    if not (message and message.strip()):
        return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
    try:
        reply = rag_reply(
            question=message,
            k=int(top_k),
            n_sentences=int(n_sentences),
            include_passages=bool(include_passages),
            use_llm=bool(use_llm),
            model=(model_name or None),
            temperature=float(temperature),
            strict_quotes_only=bool(strict_quotes_only),
            w_tfidf=float(w_tfidf),
            w_bm25=float(w_bm25),
            w_emb=float(w_emb),
        )
    except Exception as e:
        return f"RAG error: {e}"
    return reply
547
-
548
# ========================= UI (predictor styling kept) =========================
# Custom CSS: gradient page background, translucent "card" panels, light labels.
# NOTE(review): the `label.svelte-1ipelgc` selector targets a Gradio-internal
# class that can change between Gradio releases — verify after upgrades.
CSS = """
/* Blue to green gradient background */
.gradio-container {
background: linear-gradient(135deg, #1e3a8a 0%, #166534 60%, #15803d 100%) !important;
}
* {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
.card {background: rgba(255,255,255,0.07) !important; border: 1px solid rgba(255,255,255,0.12);}
label.svelte-1ipelgc {color: #e0f2fe !important;}
"""

# Gradio theme tuned to the same blue/green palette as the CSS above.
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="green"
).set(
    body_background_fill="#1e3a8a",
    body_text_color="#e0f2fe",
    input_background_fill="#172554",
    input_border_color="#1e40af",
    button_primary_background_fill="#2563eb",
    button_primary_text_color="#ffffff",
    button_secondary_background_fill="#14532d",
    button_secondary_text_color="#ecfdf5",
)
572
-
573
with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
    # App header shown above both tabs.
    gr.Markdown(
        "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
        "<p style='opacity:.9'>"
        "Left tab: ML prediction for Stress Gauge Factor (kept identical to your deployed predictor). "
        "Right tab: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection. "
        "Upload PDFs into <code>papers/</code> in your Space repo."
        "</p>"
    )

    with gr.Tabs():
        # ------------------------- Predictor Tab -------------------------
        with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
            with gr.Row():
                with gr.Column(scale=7):
                    with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
                        f1_type = gr.Textbox(label="Filler 1 Type", placeholder="e.g., CNT, Graphite, Steel fiber")
                        f1_diam = gr.Number(label="Filler 1 Diameter (µm)")
                        f1_len = gr.Number(label="Filler 1 Length (mm)")
                        cf_conc = gr.Number(label=f"{CF_COL}", info="Weight percent of total binder")
                        f1_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 1 Dimensionality")

                    with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
                        f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
                        f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
                        f2_len = gr.Number(label="Filler 2 Length (mm)")
                        f2_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 2 Dimensionality")

                    with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
                        spec_vol = gr.Number(label="Specimen Volume (mm3)")
                        probe_cnt = gr.Number(label="Probe Count")
                        probe_mat = gr.Textbox(label="Probe Material", placeholder="e.g., Copper, Silver paste")
                        wb = gr.Number(label="W/B")
                        sb = gr.Number(label="S/B")
                        gauge_len = gr.Number(label="Gauge Length (mm)")
                        curing = gr.Textbox(label="Curing Condition", placeholder="e.g., 28d water, 20°C")
                        n_fillers = gr.Number(label="Number of Fillers")

                    with gr.Accordion("Processing", open=False, elem_classes=["card"]):
                        dry_temp = gr.Number(label="Drying Temperature (°C)")
                        dry_hrs = gr.Number(label="Drying Duration (hr)")

                    with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
                        load_rate = gr.Number(label="Loading Rate (MPa/s)")
                        E_mod = gr.Number(label="Modulus of Elasticity (GPa)")
                        current = gr.Dropdown(CURRENT_CHOICES, value="NA", label="Current Type")
                        voltage = gr.Number(label="Applied Voltage (V)")

                with gr.Column(scale=5):
                    with gr.Group(elem_classes=["card"]):
                        out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", precision=6)
                        with gr.Row():
                            btn_pred = gr.Button("Predict", variant="primary")
                            btn_clear = gr.Button("Clear")
                            btn_demo = gr.Button("Fill Example")

                    with gr.Accordion("About this model", open=False, elem_classes=["card"]):
                        gr.Markdown(
                            "- Pipeline: ColumnTransformer -> (RobustScaler + OneHot) -> XGBoost\n"
                            "- Target: Stress GF (MPa^-1) on original scale (model trains on log1p).\n"
                            "- Missing values are safely imputed per-feature.\n"
                            "- Trained columns:\n"
                            f"  `{', '.join(MAIN_VARIABLES)}`"
                        )

            # Wire predictor buttons
            # NOTE: this widget order must match MAIN_VARIABLES exactly —
            # _predict_wrapper zips the two lists positionally.
            inputs_in_order = [
                f1_type, f1_diam, f1_len, cf_conc,
                f1_dim, f2_type, f2_diam, f2_len,
                f2_dim, spec_vol, probe_cnt, probe_mat,
                wb, sb, gauge_len, curing, n_fillers,
                dry_temp, dry_hrs, load_rate,
                E_mod, current, voltage
            ]

            def _predict_wrapper(*vals):
                # Re-associate positional widget values with their column names.
                data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
                return predict_fn(**data)

            btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
            btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order)
            btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)

        # ------------------------- Literature Tab -------------------------
        with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)"):
            gr.Markdown(
                "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
                "Answers cite (Doc.pdf, p.X). Toggle strict quotes or optional LLM paraphrasing."
            )
            # Retrieval controls passed to rag_chat_fn as additional inputs.
            with gr.Row():
                top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
                n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
                include_passages = gr.Checkbox(value=False, label="Include supporting passages")
            with gr.Accordion("Retriever weights (advanced)", open=False):
                w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
                w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
                w_emb = gr.Slider(0.0, 1.0, value=W_EMB_DEFAULT, step=0.05, label="Dense weight (set 0 if disabled)")
            with gr.Accordion("LLM & Controls", open=False):
                strict_quotes_only = gr.Checkbox(value=False, label="Strict quotes only (no paraphrasing)")
                use_llm = gr.Checkbox(value=True, label="Use LLM to paraphrase selected sentences")
                model_name = gr.Textbox(value=os.getenv("OPENAI_MODEL", OPENAI_MODEL),
                                        label="LLM model", placeholder="e.g., gpt-5 or gpt-5-mini")
                temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
            # The additional_inputs order must match rag_chat_fn's signature
            # after (message, history).
            gr.ChatInterface(
                fn=rag_chat_fn,
                additional_inputs=[top_k, n_sentences, include_passages, use_llm, model_name,
                                   temperature, strict_quotes_only, w_tfidf, w_bm25, w_emb],
                title="Literature Q&A",
                description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations. Toggle strict/LLM modes."
            )
683
-
684
# ------------- Launch -------------
if __name__ == "__main__":
    # queue() enables request queuing so concurrent HF Spaces users don't
    # collide on the prediction/RAG handlers.
    demo.queue().launch()
 
 
 
1
+ # ================================================================
2
+ # Self-Sensing Concrete Assistant — Predictor (XGB) + Hybrid RAG
3
+ # - Predictor tab: identical behavior to your "second code"
4
+ # - Literature tab: from your "first code" (Hybrid RAG + MMR)
5
+ # - Hugging Face friendly: online PDF fetching OFF by default
6
+ # ================================================================
7
+
8
+ # ---------------------- Runtime flags (HF-safe) ----------------------
9
+ import os
10
+ os.environ["TRANSFORMERS_NO_TF"] = "1"
11
+ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
12
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
+
14
+ # ------------------------------- Imports ------------------------------
15
+ import re, time, joblib, warnings, json
16
+ from pathlib import Path
17
+ from typing import List, Dict, Any
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ import gradio as gr
22
+
23
+ warnings.filterwarnings("ignore", category=UserWarning)
24
+
25
+ # Optional deps (handled gracefully if missing)
26
+ USE_DENSE = True
27
+ try:
28
+ from sentence_transformers import SentenceTransformer
29
+ except Exception:
30
+ USE_DENSE = False
31
+
32
+ try:
33
+ from rank_bm25 import BM25Okapi
34
+ except Exception:
35
+ BM25Okapi = None
36
+ print("rank_bm25 not installed; BM25 disabled (TF-IDF still works).")
37
+
38
+ # Optional OpenAI (for LLM paraphrase)
39
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
40
+ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
41
+ try:
42
+ from openai import OpenAI
43
+ except Exception:
44
+ OpenAI = None
45
+
46
+ # ========================= Predictor (kept same as 2nd) =========================
47
+ CF_COL = "Conductive Filler Conc. (wt%)"
48
+ TARGET_COL = "Stress GF (MPa-1)"
49
+
50
+ MAIN_VARIABLES = [
51
+ "Filler 1 Type",
52
+ "Filler 1 Diameter (µm)",
53
+ "Filler 1 Length (mm)",
54
+ CF_COL,
55
+ "Filler 1 Dimensionality",
56
+ "Filler 2 Type",
57
+ "Filler 2 Diameter (µm)",
58
+ "Filler 2 Length (mm)",
59
+ "Filler 2 Dimensionality",
60
+ "Specimen Volume (mm3)",
61
+ "Probe Count",
62
+ "Probe Material",
63
+ "W/B",
64
+ "S/B",
65
+ "Gauge Length (mm)",
66
+ "Curing Condition",
67
+ "Number of Fillers",
68
+ "Drying Temperature (°C)",
69
+ "Drying Duration (hr)",
70
+ "Loading Rate (MPa/s)",
71
+ "Modulus of Elasticity (GPa)",
72
+ "Current Type",
73
+ "Applied Voltage (V)"
74
+ ]
75
+
76
+ NUMERIC_COLS = {
77
+ "Filler 1 Diameter (µm)",
78
+ "Filler 1 Length (mm)",
79
+ CF_COL,
80
+ "Filler 2 Diameter (µm)",
81
+ "Filler 2 Length (mm)",
82
+ "Specimen Volume (mm3)",
83
+ "Probe Count",
84
+ "W/B",
85
+ "S/B",
86
+ "Gauge Length (mm)",
87
+ "Number of Fillers",
88
+ "Drying Temperature (°C)",
89
+ "Drying Duration (hr)",
90
+ "Loading Rate (MPa/s)",
91
+ "Modulus of Elasticity (GPa)",
92
+ "Applied Voltage (V)"
93
+ }
94
+
95
+ CATEGORICAL_COLS = {
96
+ "Filler 1 Type",
97
+ "Filler 1 Dimensionality",
98
+ "Filler 2 Type",
99
+ "Filler 2 Dimensionality",
100
+ "Probe Material",
101
+ "Curing Condition",
102
+ "Current Type"
103
+ }
104
+
105
+ DIM_CHOICES = ["0D", "1D", "2D", "3D", "NA"]
106
+ CURRENT_CHOICES = ["DC", "AC", "NA"]
107
+
108
+ MODEL_CANDIDATES = [
109
+ "stress_gf_xgb.joblib",
110
+ "models/stress_gf_xgb.joblib",
111
+ "/home/user/app/stress_gf_xgb.joblib",
112
+ ]
113
+
114
+ def _load_model_or_error():
115
+ for p in MODEL_CANDIDATES:
116
+ if os.path.exists(p):
117
+ try:
118
+ return joblib.load(p)
119
+ except Exception as e:
120
+ return f"Could not load model from {p}: {e}"
121
+ return ("Model file not found. Upload your trained pipeline as "
122
+ "stress_gf_xgb.joblib (or put it in models/).")
123
+
124
def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
    """Build a single-row DataFrame matching the pipeline's training schema.

    Numeric columns are coerced to float (blank/invalid -> NaN); all other
    columns become stripped strings, with None/"NA" mapped to "".
    """
    def _as_number(value):
        if value in ("", None):
            return np.nan
        try:
            return float(value)
        except Exception:
            return np.nan

    def _as_category(value):
        return "" if value in (None, "NA") else str(value).strip()

    row = {
        col: (_as_number(form_dict.get(col)) if col in NUMERIC_COLS
              else _as_category(form_dict.get(col)))
        for col in MAIN_VARIABLES
    }
    return pd.DataFrame([row], columns=MAIN_VARIABLES)
139
+
140
def predict_fn(**kwargs):
    """Predict the Stress GF (MPa^-1) for one set of form values.

    Returns a float on success, or an error string when the model is
    missing or the prediction fails (the UI displays either).
    """
    model = _load_model_or_error()
    if isinstance(model, str):
        # Loader returned an error message instead of a model.
        return model
    features = _coerce_to_row(kwargs)
    try:
        log_pred = model.predict(features)        # model predicts log1p(target)
        value = float(np.expm1(log_pred)[0])      # back to original scale MPa^-1
        # Clamp tiny negative round-off back to exactly zero.
        return 0.0 if -1e-10 < value < 0 else value
    except Exception as e:
        return f"Prediction error: {e}"
153
+
154
# A complete, realistic input used by the "Fill Example" button
# (a CNT-filled mortar specimen; secondary filler left empty).
EXAMPLE = {
    "Filler 1 Type": "CNT",
    "Filler 1 Dimensionality": "1D",
    "Filler 1 Diameter (µm)": 0.02,
    "Filler 1 Length (mm)": 1.2,
    CF_COL: 0.5,
    "Filler 2 Type": "",
    "Filler 2 Dimensionality": "NA",
    "Filler 2 Diameter (µm)": None,
    "Filler 2 Length (mm)": None,
    "Specimen Volume (mm3)": 1000,
    "Probe Count": 2,
    "Probe Material": "Copper",
    "W/B": 0.4,
    "S/B": 2.5,
    "Gauge Length (mm)": 20,
    "Curing Condition": "28d water, 20°C",
    "Number of Fillers": 1,
    "Drying Temperature (°C)": 60,
    "Drying Duration (hr)": 24,
    "Loading Rate (MPa/s)": 0.1,
    "Modulus of Elasticity (GPa)": 25,
    "Current Type": "DC",
    "Applied Voltage (V)": 5.0,
}
179
+
180
def _fill_example():
    """Return EXAMPLE values ordered by MAIN_VARIABLES (None for missing keys)."""
    return [EXAMPLE.get(name) for name in MAIN_VARIABLES]
182
+
183
def _clear_all():
    """Return reset values for every predictor input, in MAIN_VARIABLES order.

    Numbers clear to None, dropdowns reset to the "NA" sentinel, free-text
    fields reset to the empty string.
    """
    dropdown_cols = {"Filler 1 Dimensionality", "Filler 2 Dimensionality",
                     "Current Type"}
    cleared = []
    for name in MAIN_VARIABLES:
        if name in NUMERIC_COLS:
            cleared.append(None)
        elif name in dropdown_cols:
            cleared.append("NA")
        else:
            cleared.append("")
    return cleared
195
+
196
# ========================= Hybrid RAG (from 1st code) =========================
# Configuration: on-disk cache locations for the retrieval artifacts.
ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"

# PDF source (HF-safe: rely on local /papers by default)
# NOTE(review): the comment above and the UI text mention "papers/", but the
# indexer actually reads this folder ("literature_pdfs") — confirm which
# folder name is intended and make them consistent.
LOCAL_PDF_DIR = Path("./literature_pdfs"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"

# Retrieval weights: with dense embeddings available, split 30/30/40
# (TF-IDF / BM25 / dense); otherwise fall back to 50/50 sparse-only.
W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
W_EMB_DEFAULT = 0.00 if not USE_DENSE else 0.40
213
+
214
# Simple text processing helpers shared by the chunker and retriever.
_SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")

def sent_split(text: str) -> List[str]:
    """Split *text* into sentences, keeping only those with >= 5 words."""
    pieces = (piece.strip() for piece in _SENT_SPLIT_RE.split(text))
    return [piece for piece in pieces if piece and len(piece.split()) >= 5]

def tokenize(text: str) -> List[str]:
    """Lowercased tokens matching the shared token pattern (keeps %, ., /, #)."""
    return [match.lower() for match in TOKEN_RE.findall(text)]
222
+
223
+ # PDF text extraction (PyMuPDF preferred; pypdf fallback)
224
+ def _extract_pdf_text(pdf_path: Path) -> str:
225
+ try:
226
+ import fitz
227
+ doc = fitz.open(pdf_path)
228
+ out = []
229
+ for i, page in enumerate(doc):
230
+ out.append(f"[[PAGE={i+1}]]\n{page.get_text('text') or ''}")
231
+ return "\n\n".join(out)
232
+ except Exception:
233
+ try:
234
+ from pypdf import PdfReader
235
+ reader = PdfReader(str(pdf_path))
236
+ out = []
237
+ for i, p in enumerate(reader.pages):
238
+ txt = p.extract_text() or ""
239
+ out.append(f"[[PAGE={i+1}]]\n{txt}")
240
+ return "\n\n".join(out)
241
+ except Exception as e:
242
+ print(f"PDF read error ({pdf_path}): {e}")
243
+ return ""
244
+
245
def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
    """Overlapping chunks of up to *win_size* sentences, advancing by
    ``win_size - overlap`` sentences each step (stride clamped to >= 1)."""
    sentences = sent_split(text)
    stride = max(1, win_size - overlap)
    chunks = []
    for start in range(0, len(sentences), stride):
        window = sentences[start:start + win_size]
        if not window:
            break
        chunks.append(" ".join(window))
    return chunks
253
+
254
def _safe_init_st_model(name: str):
    """Try to build a SentenceTransformer; disable dense retrieval on failure.

    Returns the model, or None when dense mode is (or becomes) unavailable.
    Sets the module-level USE_DENSE flag to False on construction failure.
    """
    global USE_DENSE
    if USE_DENSE:
        try:
            return SentenceTransformer(name)
        except Exception as err:
            print("Dense embeddings unavailable:", err)
            USE_DENSE = False
    return None
264
+
265
# Build or load index
def build_or_load_hybrid(pdf_dir: Path):
    """Build (or load from cache) the hybrid retrieval index over *pdf_dir*.

    Returns a 5-tuple ``(vectorizer, X_tfidf, meta, bm25_tokens, emb)`` where
    any element may be None when its backend is unavailable or no PDFs exist.
    Artifacts are cached under ARTIFACT_DIR so subsequent startups are fast.
    """
    # Cache is usable only if every artifact required by the enabled
    # backends exists (BM25/dense artifacts are optional when disabled).
    have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
                  and RAG_META_PATH.exists()
                  and (BM25_TOK_PATH.exists() or BM25Okapi is None)
                  and (EMB_NPY_PATH.exists() or not USE_DENSE))
    if have_cache:
        vectorizer = joblib.load(TFIDF_VECT_PATH)
        X_tfidf = joblib.load(TFIDF_MAT_PATH)
        meta = pd.read_parquet(RAG_META_PATH)
        bm25_toks = joblib.load(BM25_TOK_PATH) if BM25Okapi is not None else None
        emb = np.load(EMB_NPY_PATH) if (USE_DENSE and EMB_NPY_PATH.exists()) else None
        return vectorizer, X_tfidf, meta, bm25_toks, emb

    # Fresh build: extract text from every PDF and chunk into sentence windows.
    rows, all_tokens = [], []
    pdf_paths = list(Path(pdf_dir).glob("**/*.pdf"))
    print(f"Indexing PDFs in {pdf_dir} — found {len(pdf_paths)} files.")
    for pdf in pdf_paths:
        raw = _extract_pdf_text(pdf)
        if not raw.strip():
            continue  # unreadable or empty PDF
        for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
            rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
            all_tokens.append(tokenize(ch))
    if not rows:
        # create empty stub to avoid crashes; UI will message user to upload PDFs
        meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
        vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
        return vectorizer, X_tfidf, meta, all_tokens, emb

    meta = pd.DataFrame(rows)

    # Sparse lexical index (TF-IDF over uni/bi-grams).
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=1, max_df=0.95,
        sublinear_tf=True, smooth_idf=True,
        lowercase=True,
        token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b"
    )
    X_tfidf = vectorizer.fit_transform(meta["text"].tolist())

    # Dense embeddings (optional): L2-normalized so dot product = cosine sim.
    emb = None
    if USE_DENSE:
        try:
            st_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
            if st_model is not None:
                from sklearn.preprocessing import normalize as sk_normalize
                em = st_model.encode(meta["text"].tolist(), batch_size=64, show_progress_bar=False, convert_to_numpy=True)
                emb = sk_normalize(em)
                np.save(EMB_NPY_PATH, emb)
        except Exception as e:
            print("Dense embedding failed:", e)
            emb = None

    # Save artifacts
    joblib.dump(vectorizer, TFIDF_VECT_PATH)
    joblib.dump(X_tfidf, TFIDF_MAT_PATH)
    if BM25Okapi is not None:
        joblib.dump(all_tokens, BM25_TOK_PATH)
    meta.to_parquet(RAG_META_PATH, index=False)

    return vectorizer, X_tfidf, meta, all_tokens, emb
328
+
329
# Module-level retriever state built once at import time (Space startup).
tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
# BM25 index is only constructed when the library and the tokenized corpus exist.
bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
# Query-side encoder (None when dense retrieval is disabled or failed to load).
st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
332
+
333
+ def _extract_page(text_chunk: str) -> str:
334
+ m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
335
+ return (m[-1].group(1) if m else "?")
336
+
337
def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
    """Rank indexed chunks for *query* by a weighted blend of dense, TF-IDF
    and BM25 scores; returns the top-*k* rows of ``rag_meta`` with per-backend
    scores attached, or an empty DataFrame when nothing is indexed.

    A backend that is unavailable gets its weight forced to 0; remaining
    weights are renormalized so they always sum to 1.
    """
    if rag_meta is None or rag_meta.empty:
        return pd.DataFrame()

    # Dense scores
    if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
        try:
            from sklearn.preprocessing import normalize as sk_normalize
            q_emb = st_query_model.encode([query], convert_to_numpy=True)
            q_emb = sk_normalize(q_emb)[0]
            # Corpus embeddings are pre-normalized, so this dot product is cosine similarity.
            dense_scores = emb_matrix @ q_emb
        except Exception as e:
            print("Dense query encoding failed:", e)
            dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
    else:
        dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0

    # TF-IDF scores
    if tfidf_vectorizer is not None and tfidf_matrix is not None:
        q_vec = tfidf_vectorizer.transform([query])
        tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
    else:
        tfidf_scores = np.zeros(len(rag_meta), dtype=float); w_tfidf = 0.0

    # BM25 scores
    if bm25 is not None:
        # Same token pattern as the indexing-time tokenize() helper.
        q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-/\.%]+", query)]
        bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
    else:
        bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0

    def _norm(x):
        # Min-max normalize to [0, 1]; constant vectors map to all-zeros.
        x = np.asarray(x, dtype=float)
        if np.allclose(x.max(), x.min()):
            return np.zeros_like(x)
        return (x - x.min()) / (x.max() - x.min())

    s_dense = _norm(dense_scores)
    s_tfidf = _norm(tfidf_scores)
    s_bm25 = _norm(bm25_scores)

    # Renormalize weights (guards against all-zero weights with `or 1.0`).
    total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
    w_tfidf, w_bm25, w_emb = w_tfidf/total_w, w_bm25/total_w, w_emb/total_w

    combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
    idx = np.argsort(-combo)[:k]
    hits = rag_meta.iloc[idx].copy()
    hits["score_dense"] = s_dense[idx]
    hits["score_tfidf"] = s_tfidf[idx]
    hits["score_bm25"] = s_bm25[idx]
    hits["score"] = combo[idx]
    return hits.reset_index(drop=True)
389
+
390
def split_sentences(text: str) -> List[str]:
    """Sentences suitable for quoting: between 6 and 60 words inclusive."""
    return [sent for sent in sent_split(text) if 6 <= len(sent.split()) <= 60]
393
+
394
def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_chunk=6, lambda_div=0.7):
    """Select up to *top_n* diverse, relevant sentences from retrieved chunks
    via Maximal Marginal Relevance.

    Each candidate keeps its source doc name and page for inline citation.
    ``lambda_div`` trades relevance (1.0) against diversity (0.0).
    Returns a list of ``{"sent", "doc", "page"}`` dicts (possibly empty).
    """
    # Build the candidate pool: first `pool_per_chunk` sentences per hit.
    pool = []
    for _, row in hits.iterrows():
        doc = Path(row["doc_path"]).name
        page = _extract_page(row["text"])
        for s in split_sentences(row["text"])[:pool_per_chunk]:
            pool.append({"sent": s, "doc": doc, "page": page})
    if not pool:
        return []

    sent_texts = [p["sent"] for p in pool]

    # Embedding-based relevance if available, else TF-IDF
    use_dense = USE_DENSE and st_query_model is not None
    if use_dense:
        try:
            from sklearn.preprocessing import normalize as sk_normalize
            texts = [question] + sent_texts
            enc = st_query_model.encode(texts, convert_to_numpy=True)
            q_vec = sk_normalize(enc[:1])[0]
            S = sk_normalize(enc[1:])
            rel = (S @ q_vec)
            # Pairwise sentence similarity over the normalized embeddings.
            def sim_fn(i, j): return float(S[i] @ S[j])
        except Exception:
            use_dense = False

    if not use_dense:
        # Sparse fallback: fit TF-IDF on the pool plus the question itself.
        from sklearn.feature_extraction.text import TfidfVectorizer
        vect = TfidfVectorizer().fit(sent_texts + [question])
        Q = vect.transform([question]); S = vect.transform(sent_texts)
        rel = (S @ Q.T).toarray().ravel()
        def sim_fn(i, j): return float((S[i] @ S[j].T).toarray()[0, 0])

    # Greedy MMR: seed with the single most relevant sentence...
    selected, selected_idx = [], []
    remain = list(range(len(pool)))
    first = int(np.argmax(rel))
    selected.append(pool[first]); selected_idx.append(first); remain.remove(first)

    # ...then repeatedly add the candidate maximizing
    # lambda*relevance - (1-lambda)*max-similarity-to-already-selected.
    while len(selected) < top_n and remain:
        cand_scores = []
        for i in remain:
            sim_to_sel = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
            score = lambda_div * rel[i] - (1 - lambda_div) * sim_to_sel
            cand_scores.append((score, i))
        cand_scores.sort(reverse=True)
        best_i = cand_scores[0][1]
        selected.append(pool[best_i]); selected_idx.append(best_i); remain.remove(best_i)
    return selected
442
+
443
def compose_extractive(selected: List[Dict[str, Any]]) -> str:
    """Join selected sentences into one string, each followed by its inline
    (doc, p.X) citation; empty input yields an empty string."""
    cited = [f"{item['sent']} ({item['doc']}, p.{item['page']})" for item in selected]
    return " ".join(cited)
447
+
448
def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2) -> str:
    """Paraphrase the selected, cited sentences into a concise answer via OpenAI.

    Returns the synthesized text, or None when the OpenAI client/key is
    unavailable or the API call fails — callers then fall back to the
    extractive answer.
    """
    if OPENAI_API_KEY is None or OpenAI is None:
        # BUG FIX: original called console.log(...) (JavaScript) here and
        # below, which raises NameError in Python; use print() instead.
        print("OpenAI key or client unavailable; skipping LLM synthesis")
        return None
    client = OpenAI(api_key=OPENAI_API_KEY)
    model = model or OPENAI_MODEL
    print("using:", model)
    SYSTEM_PROMPT = (
        "You are a scientific assistant for self-sensing cementitious materials.\n"
        "Answer STRICTLY using the provided sentences.\n"
        "Do not invent facts. Keep it concise (3–6 sentences).\n"
        "Retain inline citations like (Doc.pdf, p.X) exactly as given."
    )
    user_prompt = (
        f"Question: {question}\n\n"
        f"Use ONLY these sentences to answer; keep their inline citations:\n" +
        "\n".join(f"- {s}" for s in sentence_lines)
    )
    try:
        resp = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=temperature,
        )
        # Responses API exposes the concatenated text as .output_text.
        return getattr(resp, "output_text", None) or str(resp)
    except Exception:
        # Any API failure silently degrades to the extractive answer.
        return None
478
+
479
def rag_reply(
    question: str,
    k: int = 8,
    n_sentences: int = 4,
    include_passages: bool = False,
    use_llm: bool = True,
    model: str = "gpt-4o-mini",
    temperature: float = 0.2,
    strict_quotes_only: bool = False,
    w_tfidf: float = W_TFIDF_DEFAULT,
    w_bm25: float = W_BM25_DEFAULT,
    w_emb: float = W_EMB_DEFAULT
) -> str:
    """Answer a literature question: hybrid retrieval -> MMR sentence
    selection -> (optionally) LLM paraphrase, always with inline citations.

    Returns a markdown string. Modes: strict quotes only, LLM synthesis
    (falls back to extractive on failure), or plain extractive.
    """
    hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
    if hits is None or hits.empty:
        # BUG FIX: this message previously told users to upload to 'papers/',
        # but the indexer reads LOCAL_PDF_DIR ('literature_pdfs'). Derive the
        # folder name from the actual configuration instead of hard-coding it.
        return (f"No indexed PDFs found. Upload PDFs to the '{LOCAL_PDF_DIR.name}/' "
                f"folder and reload the Space.")

    selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
    # Header citation line from the top hits, plus a coverage warning when
    # fewer than 3 distinct source documents contributed.
    header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
    srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
    coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."

    if strict_quotes_only:
        if not selected:
            return f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
        msg = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
        msg += f"\n\n**Citations:** {header_cites}{coverage_note}"
        if include_passages:
            msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
        return msg

    extractive = compose_extractive(selected)
    if use_llm and selected:
        lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
        llm_text = synthesize_with_llm(question, lines, model=model, temperature=temperature)
        if llm_text:
            msg = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
            if include_passages:
                msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
            return msg

    # No quotable sentences survived filtering: show raw passages instead.
    if not extractive:
        return f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])

    msg = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
    if include_passages:
        msg += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
    return msg
527
+
528
def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
                use_llm, model_name, temperature, strict_quotes_only,
                w_tfidf, w_bm25, w_emb):
    """Gradio ChatInterface callback: validate the message, coerce the
    widget values to their expected types, and delegate to rag_reply().

    Any exception is rendered as a chat message rather than raised.
    """
    if not (message and message.strip()):
        return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
    params = dict(
        question=message,
        k=int(top_k),
        n_sentences=int(n_sentences),
        include_passages=bool(include_passages),
        use_llm=bool(use_llm),
        model=(model_name or None),
        temperature=float(temperature),
        strict_quotes_only=bool(strict_quotes_only),
        w_tfidf=float(w_tfidf),
        w_bm25=float(w_bm25),
        w_emb=float(w_emb),
    )
    try:
        return rag_reply(**params)
    except Exception as err:
        return f"RAG error: {err}"
549
+
550
# ========================= UI (predictor styling kept) =========================
# Custom CSS injected into the Gradio page (gradient background, card styling).
CSS = """
/* Blue to green gradient background */
.gradio-container {
background: linear-gradient(135deg, #1e3a8a 0%, #166534 60%, #15803d 100%) !important;
}
* {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
.card {background: rgba(255,255,255,0.07) !important; border: 1px solid rgba(255,255,255,0.12);}
label.svelte-1ipelgc {color: #e0f2fe !important;}
"""

# Gradio theme matching the CSS palette (dark blue inputs, green secondary buttons).
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="green"
).set(
    body_background_fill="#1e3a8a",
    body_text_color="#e0f2fe",
    input_background_fill="#172554",
    input_border_color="#1e40af",
    button_primary_background_fill="#2563eb",
    button_primary_text_color="#ffffff",
    button_secondary_background_fill="#14532d",
    button_secondary_text_color="#ecfdf5",
)
574
+
575
# Top-level Gradio app: two tabs (ML predictor + literature RAG chat).
# NOTE(review): the markdown below tells users to upload PDFs to "papers/",
# but the indexer reads LOCAL_PDF_DIR ("literature_pdfs") — confirm and align.
with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
    gr.Markdown(
        "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
        "<p style='opacity:.9'>"
        "Left tab: ML prediction for Stress Gauge Factor (kept identical to your deployed predictor). "
        "Right tab: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection. "
        "Upload PDFs into <code>papers/</code> in your Space repo."
        "</p>"
    )

    with gr.Tabs():
        # ------------------------- Predictor Tab -------------------------
        with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
            with gr.Row():
                # Left column: grouped input accordions.
                with gr.Column(scale=7):
                    with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
                        f1_type = gr.Textbox(label="Filler 1 Type", placeholder="e.g., CNT, Graphite, Steel fiber")
                        f1_diam = gr.Number(label="Filler 1 Diameter (µm)")
                        f1_len = gr.Number(label="Filler 1 Length (mm)")
                        cf_conc = gr.Number(label=f"{CF_COL}", info="Weight percent of total binder")
                        f1_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 1 Dimensionality")

                    with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
                        f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
                        f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
                        f2_len = gr.Number(label="Filler 2 Length (mm)")
                        f2_dim = gr.Dropdown(DIM_CHOICES, value="NA", label="Filler 2 Dimensionality")

                    with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
                        spec_vol = gr.Number(label="Specimen Volume (mm3)")
                        probe_cnt = gr.Number(label="Probe Count")
                        probe_mat = gr.Textbox(label="Probe Material", placeholder="e.g., Copper, Silver paste")
                        wb = gr.Number(label="W/B")
                        sb = gr.Number(label="S/B")
                        gauge_len = gr.Number(label="Gauge Length (mm)")
                        curing = gr.Textbox(label="Curing Condition", placeholder="e.g., 28d water, 20°C")
                        n_fillers = gr.Number(label="Number of Fillers")

                    with gr.Accordion("Processing", open=False, elem_classes=["card"]):
                        dry_temp = gr.Number(label="Drying Temperature (°C)")
                        dry_hrs = gr.Number(label="Drying Duration (hr)")

                    with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
                        load_rate = gr.Number(label="Loading Rate (MPa/s)")
                        E_mod = gr.Number(label="Modulus of Elasticity (GPa)")
                        current = gr.Dropdown(CURRENT_CHOICES, value="NA", label="Current Type")
                        voltage = gr.Number(label="Applied Voltage (V)")

                # Right column: prediction output, action buttons, model notes.
                with gr.Column(scale=5):
                    with gr.Group(elem_classes=["card"]):
                        out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", precision=6)
                        with gr.Row():
                            btn_pred = gr.Button("Predict", variant="primary")
                            btn_clear = gr.Button("Clear")
                            btn_demo = gr.Button("Fill Example")

                    with gr.Accordion("About this model", open=False, elem_classes=["card"]):
                        gr.Markdown(
                            "- Pipeline: ColumnTransformer -> (RobustScaler + OneHot) -> XGBoost\n"
                            "- Target: Stress GF (MPa^-1) on original scale (model trains on log1p).\n"
                            "- Missing values are safely imputed per-feature.\n"
                            "- Trained columns:\n"
                            f"  `{', '.join(MAIN_VARIABLES)}`"
                        )

            # Wire predictor buttons
            # Widget order here MUST match MAIN_VARIABLES order — the zip in
            # _predict_wrapper pairs them positionally.
            inputs_in_order = [
                f1_type, f1_diam, f1_len, cf_conc,
                f1_dim, f2_type, f2_diam, f2_len,
                f2_dim, spec_vol, probe_cnt, probe_mat,
                wb, sb, gauge_len, curing, n_fillers,
                dry_temp, dry_hrs, load_rate,
                E_mod, current, voltage
            ]

            def _predict_wrapper(*vals):
                # Map positional widget values to their column names.
                data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
                return predict_fn(**data)

            btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
            btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order)
            btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)

        # ------------------------- Literature Tab -------------------------
        with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)"):
            gr.Markdown(
                "Upload PDFs into the repository folder <code>papers/</code> then reload the Space. "
                "Answers cite (Doc.pdf, p.X). Toggle strict quotes or optional LLM paraphrasing."
            )
            # Retrieval controls passed to rag_chat_fn as additional inputs.
            with gr.Row():
                top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
                n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
                include_passages = gr.Checkbox(value=False, label="Include supporting passages")
            with gr.Accordion("Retriever weights (advanced)", open=False):
                w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
                w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
                w_emb = gr.Slider(0.0, 1.0, value=W_EMB_DEFAULT, step=0.05, label="Dense weight (set 0 if disabled)")
            with gr.Accordion("LLM & Controls", open=False):
                strict_quotes_only = gr.Checkbox(value=False, label="Strict quotes only (no paraphrasing)")
                use_llm = gr.Checkbox(value=True, label="Use LLM to paraphrase selected sentences")
                model_name = gr.Textbox(value=os.getenv("OPENAI_MODEL", OPENAI_MODEL),
                                        label="LLM model", placeholder="e.g., gpt-5 or gpt-5-mini")
                temperature = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
            gr.ChatInterface(
                fn=rag_chat_fn,
                additional_inputs=[top_k, n_sentences, include_passages, use_llm, model_name,
                                   temperature, strict_quotes_only, w_tfidf, w_bm25, w_emb],
                title="Literature Q&A",
                description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations. Toggle strict/LLM modes."
            )
685
+
686
# ------------- Launch -------------
# Entry point when run as a script (HF Spaces also imports and serves `demo`).
if __name__ == "__main__":
    # queue() helps HF Spaces with concurrency; show_error suggests upload PDFs if none
    demo.queue().launch()