Inframat-x committed on
Commit
222e225
·
verified ·
1 Parent(s): 301030b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +817 -411
app.py CHANGED
@@ -1,7 +1,24 @@
1
- # ===================== app.py =====================
2
- # RAG core + logging + grid eval + Gradio UI wrapper
3
-
4
- import os, re, json, time, uuid, traceback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from pathlib import Path
6
  from typing import List, Dict, Any, Optional
7
 
@@ -9,12 +26,9 @@ import numpy as np
9
  import pandas as pd
10
  import gradio as gr
11
 
12
- from rag_eval_metrics import evaluate_rag # must exist as rag_eval_metrics.py
13
-
14
 
15
- # ===================== RAG CORE + LOGGING + GRID EVAL =====================
16
-
17
- # Optional dense + BM25
18
  USE_DENSE = True
19
  try:
20
  from sentence_transformers import SentenceTransformer
@@ -27,7 +41,7 @@ except Exception:
27
  BM25Okapi = None
28
  print("rank_bm25 not installed; BM25 disabled (TF-IDF still works).")
29
 
30
- # Optional OpenAI (for LLM synthesis, not needed for grid search)
31
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
32
  OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
33
  try:
@@ -35,44 +49,265 @@ try:
35
  except Exception:
36
  OpenAI = None
37
 
38
- LLM_AVAILABLE = (
39
- OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None
40
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # -------------------------- Paths & artifacts ---------------------------
43
- ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
44
- LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
 
 
46
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
47
  TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
48
  BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
49
  EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
50
  RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
51
 
52
- LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
53
-
54
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
55
 
56
- # default hybrid weights
57
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
58
  W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
59
- W_EMB_DEFAULT = 0.00 if not USE_DENSE else 0.40
60
 
61
- # ---------- basic text helpers ----------
62
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
63
  TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
64
-
65
def sent_split(text: str) -> List[str]:
    """Split *text* on sentence boundaries and keep sentences of >= 5 words."""
    candidates = (piece.strip() for piece in _SENT_SPLIT_RE.split(text))
    return [c for c in candidates if c and len(c.split()) >= 5]
68
-
69
def tokenize(text: str) -> List[str]:
    """Return the lowercase tokens of *text* as matched by TOKEN_RE."""
    raw_tokens = TOKEN_RE.findall(text)
    return [tok.lower() for tok in raw_tokens]
71
 
72
- # ---------- PDF text extraction ----------
73
  def _extract_pdf_text(pdf_path: Path) -> str:
74
  try:
75
- import fitz # PyMuPDF
76
  doc = fitz.open(pdf_path)
77
  out = []
78
  for i, page in enumerate(doc):
@@ -96,12 +331,10 @@ def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
96
  chunks, step = [], max(1, win_size - overlap)
97
  for i in range(0, len(sents), step):
98
  window = sents[i:i+win_size]
99
- if not window:
100
- break
101
  chunks.append(" ".join(window))
102
  return chunks
103
 
104
- # ---------- dense encoder ----------
105
  def _safe_init_st_model(name: str):
106
  global USE_DENSE
107
  if not USE_DENSE:
@@ -113,19 +346,12 @@ def _safe_init_st_model(name: str):
113
  USE_DENSE = False
114
  return None
115
 
116
- # ---------- build / load hybrid index ----------
117
  def build_or_load_hybrid(pdf_dir: Path):
118
- from sklearn.feature_extraction.text import TfidfVectorizer
119
- import joblib
120
-
121
- have_cache = (
122
- TFIDF_VECT_PATH.exists()
123
- and TFIDF_MAT_PATH.exists()
124
- and RAG_META_PATH.exists()
125
- and (BM25_TOK_PATH.exists() or BM25Okapi is None)
126
- and (EMB_NPY_PATH.exists() or not USE_DENSE)
127
- )
128
-
129
  if have_cache:
130
  vectorizer = joblib.load(TFIDF_VECT_PATH)
131
  X_tfidf = joblib.load(TFIDF_MAT_PATH)
@@ -135,8 +361,8 @@ def build_or_load_hybrid(pdf_dir: Path):
135
  return vectorizer, X_tfidf, meta, bm25_toks, emb
136
 
137
  rows, all_tokens = [], []
138
- pdf_paths = list(pdf_dir.glob("**/*.pdf"))
139
- print(f"Indexing PDFs in {pdf_dir} — found {len(pdf_paths)} file(s).")
140
  for pdf in pdf_paths:
141
  raw = _extract_pdf_text(pdf)
142
  if not raw.strip():
@@ -144,79 +370,56 @@ def build_or_load_hybrid(pdf_dir: Path):
144
  for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
145
  rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
146
  all_tokens.append(tokenize(ch))
147
-
148
  if not rows:
149
  meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
150
- return None, None, meta, None, None
 
151
 
152
  meta = pd.DataFrame(rows)
153
-
154
  vectorizer = TfidfVectorizer(
155
- ngram_range=(1, 2),
156
- min_df=1,
157
- max_df=0.95,
158
- sublinear_tf=True,
159
- smooth_idf=True,
160
  lowercase=True,
161
- token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b",
162
  )
163
  X_tfidf = vectorizer.fit_transform(meta["text"].tolist())
164
 
165
  emb = None
166
  if USE_DENSE:
167
  try:
168
- st_model = _safe_init_st_model(
169
- os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
170
- )
171
  if st_model is not None:
172
  from sklearn.preprocessing import normalize as sk_normalize
173
- em = st_model.encode(
174
- meta["text"].tolist(),
175
- batch_size=64,
176
- show_progress_bar=False,
177
- convert_to_numpy=True,
178
- )
179
  emb = sk_normalize(em)
180
  np.save(EMB_NPY_PATH, emb)
181
  except Exception as e:
182
  print("Dense embedding failed:", e)
183
  emb = None
184
 
185
- import joblib
186
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
187
  joblib.dump(X_tfidf, TFIDF_MAT_PATH)
188
  if BM25Okapi is not None:
189
  joblib.dump(all_tokens, BM25_TOK_PATH)
190
  meta.to_parquet(RAG_META_PATH, index=False)
191
-
192
  return vectorizer, X_tfidf, meta, all_tokens, emb
193
 
194
- tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(
195
- LOCAL_PDF_DIR
196
- )
197
  bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
198
- st_query_model = _safe_init_st_model(
199
- os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
200
- )
201
 
202
- # ---------- hybrid retrieval ----------
203
  def _extract_page(text_chunk: str) -> str:
 
204
  m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
205
- return m[-1].group(1) if m else "?"
206
 
207
- def hybrid_search(
208
- query: str,
209
- k: int = 8,
210
- w_tfidf: float = W_TFIDF_DEFAULT,
211
- w_bm25: float = W_BM25_DEFAULT,
212
- w_emb: float = W_EMB_DEFAULT,
213
- ) -> pd.DataFrame:
214
  if rag_meta is None or rag_meta.empty:
215
  return pd.DataFrame()
216
 
217
- n_chunks = len(rag_meta)
218
-
219
- # dense scores
220
  if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
221
  try:
222
  from sklearn.preprocessing import normalize as sk_normalize
@@ -225,23 +428,23 @@ def hybrid_search(
225
  dense_scores = emb_matrix @ q_emb
226
  except Exception as e:
227
  print("Dense query encoding failed:", e)
228
- dense_scores = np.zeros(n_chunks); w_emb = 0.0
229
  else:
230
- dense_scores = np.zeros(n_chunks); w_emb = 0.0
231
 
232
- # tf-idf
233
  if tfidf_vectorizer is not None and tfidf_matrix is not None:
234
  q_vec = tfidf_vectorizer.transform([query])
235
  tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
236
  else:
237
- tfidf_scores = np.zeros(n_chunks); w_tfidf = 0.0
238
 
239
- # bm25
240
  if bm25 is not None:
241
- q_tokens = [t.lower() for t in TOKEN_RE.findall(query)]
242
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
243
  else:
244
- bm25_scores = np.zeros(n_chunks); w_bm25 = 0.0
245
 
246
  def _norm(x):
247
  x = np.asarray(x, dtype=float)
@@ -254,11 +457,10 @@ def hybrid_search(
254
  s_bm25 = _norm(bm25_scores)
255
 
256
  total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
257
- w_tfidf, w_bm25, w_emb = w_tfidf / total_w, w_bm25 / total_w, w_emb / total_w
258
 
259
  combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
260
  idx = np.argsort(-combo)[:k]
261
-
262
  hits = rag_meta.iloc[idx].copy()
263
  hits["score_dense"] = s_dense[idx]
264
  hits["score_tfidf"] = s_tfidf[idx]
@@ -266,18 +468,18 @@ def hybrid_search(
266
  hits["score"] = combo[idx]
267
  return hits.reset_index(drop=True)
268
 
269
- # ---------- MMR sentence selection ----------
270
  def split_sentences(text: str) -> List[str]:
271
  sents = sent_split(text)
272
  return [s for s in sents if 6 <= len(s.split()) <= 60]
273
 
274
- def mmr_select_sentences(
275
- question: str,
276
- hits: pd.DataFrame,
277
- top_n: int = 4,
278
- pool_per_chunk: int = 6,
279
- lambda_div: float = 0.7,
280
- ) -> List[Dict[str, Any]]:
 
281
  pool = []
282
  for _, row in hits.iterrows():
283
  doc = Path(row["doc_path"]).name
@@ -287,19 +489,20 @@ def mmr_select_sentences(
287
  continue
288
  for s in sents[:max(1, int(pool_per_chunk))]:
289
  pool.append({"sent": s, "doc": doc, "page": page})
 
290
  if not pool:
291
  return []
292
 
 
293
  sent_texts = [p["sent"] for p in pool]
294
  use_dense = USE_DENSE and st_query_model is not None
295
-
296
  try:
297
  if use_dense:
298
  from sklearn.preprocessing import normalize as sk_normalize
299
  enc = st_query_model.encode([question] + sent_texts, convert_to_numpy=True)
300
  q_vec = sk_normalize(enc[:1])[0]
301
  S = sk_normalize(enc[1:])
302
- rel = S @ q_vec
303
  def sim_fn(i, j): return float(S[i] @ S[j])
304
  else:
305
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -310,17 +513,23 @@ def mmr_select_sentences(
310
  num = (S[i] @ S[j].T)
311
  return float(num.toarray()[0, 0]) if hasattr(num, "toarray") else float(num)
312
  except Exception:
313
- rel = np.ones(len(sent_texts))
 
314
  def sim_fn(i, j): return 0.0
315
 
 
316
  lambda_div = float(np.clip(lambda_div, 0.0, 1.0))
317
 
 
318
  remain = list(range(len(pool)))
 
 
319
  first = int(np.argmax(rel))
320
  selected_idx = [first]
321
  selected = [pool[first]]
322
  remain.remove(first)
323
 
 
324
  max_pick = min(int(top_n), len(pool))
325
  while len(selected) < max_pick and remain:
326
  cand_scores = []
@@ -328,6 +537,8 @@ def mmr_select_sentences(
328
  div_i = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
329
  score = lambda_div * float(rel[i]) - (1.0 - lambda_div) * div_i
330
  cand_scores.append((score, i))
 
 
331
  cand_scores.sort(reverse=True)
332
  _, best_i = cand_scores[0]
333
  selected_idx.append(best_i)
@@ -341,7 +552,8 @@ def compose_extractive(selected: List[Dict[str, Any]]) -> str:
341
  return ""
342
  return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
343
 
344
- # ---------- logging helpers ----------
 
345
  OPENAI_IN_COST_PER_1K = float(os.getenv("OPENAI_COST_IN_PER_1K", "0"))
346
  OPENAI_OUT_COST_PER_1K = float(os.getenv("OPENAI_COST_OUT_PER_1K", "0"))
347
 
@@ -355,22 +567,14 @@ def _safe_write_jsonl(path: Path, record: dict):
355
  def _calc_cost_usd(prompt_toks, completion_toks):
356
  if prompt_toks is None or completion_toks is None:
357
  return None
358
- return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (
359
- completion_toks / 1000.0
360
- ) * OPENAI_OUT_COST_PER_1K
361
 
362
- # ---------- optional LLM synthesis ----------
363
- def synthesize_with_llm(
364
- question: str,
365
- sentence_lines: List[str],
366
- model: Optional[str] = None,
367
- temperature: float = 0.2,
368
- ):
369
  if not LLM_AVAILABLE:
370
  return None, None
371
  client = OpenAI(api_key=OPENAI_API_KEY)
372
  model = model or OPENAI_MODEL
373
-
374
  SYSTEM_PROMPT = (
375
  "You are a scientific assistant for self-sensing cementitious materials.\n"
376
  "Answer STRICTLY using the provided sentences.\n"
@@ -379,10 +583,9 @@ def synthesize_with_llm(
379
  )
380
  user_prompt = (
381
  f"Question: {question}\n\n"
382
- "Use ONLY these sentences to answer; keep their inline citations:\n"
383
- + "\n".join(f"- {s}" for s in sentence_lines)
384
  )
385
-
386
  try:
387
  resp = client.responses.create(
388
  model=model,
@@ -406,183 +609,124 @@ def synthesize_with_llm(
406
  except Exception:
407
  return None, None
408
 
409
- # ---------- main RAG reply (with config_id) ----------
410
  def rag_reply(
411
  question: str,
412
  k: int = 8,
413
  n_sentences: int = 4,
414
  include_passages: bool = False,
415
  use_llm: bool = False,
416
- model: Optional[str] = None,
417
  temperature: float = 0.2,
418
  strict_quotes_only: bool = False,
419
  w_tfidf: float = W_TFIDF_DEFAULT,
420
- w_bm25: float = W_BM25_DEFAULT,
421
- w_emb: float = W_EMB_DEFAULT,
422
- config_id: Optional[str] = None,
423
  ) -> str:
424
  run_id = str(uuid.uuid4())
425
  t0_total = time.time()
426
  t0_retr = time.time()
427
 
428
- hits = hybrid_search(
429
- question,
430
- k=int(k),
431
- w_tfidf=float(w_tfidf),
432
- w_bm25=float(w_bm25),
433
- w_emb=float(w_emb),
434
- )
435
  t1_retr = time.time()
436
  latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
437
 
438
  if hits is None or hits.empty:
439
- final = "No indexed PDFs found."
440
  record = {
441
  "run_id": run_id,
442
- "ts": int(time.time() * 1000),
443
  "inputs": {
444
- "question": question,
445
- "top_k": int(k),
446
- "n_sentences": int(n_sentences),
447
- "w_tfidf": float(w_tfidf),
448
- "w_bm25": float(w_bm25),
449
- "w_emb": float(w_emb),
450
- "use_llm": bool(use_llm),
451
- "model": model,
452
- "temperature": float(temperature),
453
- "config_id": config_id,
454
  },
455
  "retrieval": {"hits": [], "latency_ms_retriever": latency_ms_retriever},
456
  "output": {"final_answer": final, "used_sentences": []},
457
- "latency_ms_total": int((time.time() - t0_total) * 1000),
458
- "openai": None,
459
  }
460
  _safe_write_jsonl(LOG_PATH, record)
461
  return final
462
 
463
- selected = mmr_select_sentences(
464
- question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7
465
- )
466
- header_cites = "; ".join(
467
- f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})"
468
- for _, r in hits.head(6).iterrows()
469
- )
470
-
471
- srcs = {Path(r["doc_path"]).name for _, r in hits.iterrows()}
472
- coverage_note = (
473
- ""
474
- if len(srcs) >= 3
475
- else f"\n\n> Note: Only {len(srcs)} unique source(s). Add more PDFs or increase Top-K."
476
- )
477
 
 
478
  retr_list = []
479
  for _, r in hits.iterrows():
480
- retr_list.append(
481
- {
482
- "doc": Path(r["doc_path"]).name,
483
- "page": _extract_page(r["text"]),
484
- "score_tfidf": float(r.get("score_tfidf", 0.0)),
485
- "score_bm25": float(r.get("score_bm25", 0.0)),
486
- "score_dense": float(r.get("score_dense", 0.0)),
487
- "combo_score": float(r.get("score", 0.0)),
488
- }
489
- )
490
-
491
- # retrieval-only / strict quotations (used in grid search)
492
  if strict_quotes_only:
493
  if not selected:
494
- final = (
495
- f"**Quoted Passages:**\n\n---\n"
496
- + "\n\n".join(hits["text"].tolist()[:2])
497
- + f"\n\n**Citations:** {header_cites}{coverage_note}"
498
- )
499
  else:
500
- final = "**Quoted Passages:**\n- " + "\n- ".join(
501
- f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected
502
- )
503
  final += f"\n\n**Citations:** {header_cites}{coverage_note}"
504
  if include_passages:
505
- final += "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
506
 
507
  record = {
508
  "run_id": run_id,
509
- "ts": int(time.time() * 1000),
510
  "inputs": {
511
- "question": question,
512
- "top_k": int(k),
513
- "n_sentences": int(n_sentences),
514
- "w_tfidf": float(w_tfidf),
515
- "w_bm25": float(w_bm25),
516
- "w_emb": float(w_emb),
517
- "use_llm": False,
518
- "model": None,
519
- "temperature": float(temperature),
520
- "config_id": config_id,
521
  },
522
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
523
  "output": {
524
  "final_answer": final,
525
- "used_sentences": [
526
- {"sent": s["sent"], "doc": s["doc"], "page": s["page"]}
527
- for s in selected
528
- ],
529
  },
530
- "latency_ms_total": int((time.time() - t0_total) * 1000),
531
- "openai": None,
532
  }
533
  _safe_write_jsonl(LOG_PATH, record)
534
  return final
535
 
536
- # extractive / LLM synthesis
537
  extractive = compose_extractive(selected)
538
  llm_usage = None
539
  llm_latency_ms = None
540
-
541
  if use_llm and selected:
542
  lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
543
  t0_llm = time.time()
544
- llm_text, llm_usage = synthesize_with_llm(
545
- question, lines, model=model, temperature=temperature
546
- )
547
  t1_llm = time.time()
548
  llm_latency_ms = int((t1_llm - t0_llm) * 1000)
549
 
550
  if llm_text:
551
- final = (
552
- f"**Answer (LLM synthesis):** {llm_text}\n\n"
553
- f"**Citations:** {header_cites}{coverage_note}"
554
- )
555
  if include_passages:
556
- final += "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
557
  else:
558
  if not extractive:
559
- final = (
560
- f"**Answer:** Here are relevant passages.\n\n"
561
- f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
562
- + "\n\n".join(hits["text"].tolist()[:2])
563
- )
564
  else:
565
- final = (
566
- f"**Answer:** {extractive}\n\n"
567
- f"**Citations:** {header_cites}{coverage_note}"
568
- )
569
  if include_passages:
570
- final += "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
571
  else:
572
  if not extractive:
573
- final = (
574
- f"**Answer:** Here are relevant passages.\n\n"
575
- f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
576
- + "\n\n".join(hits["text"].tolist()[:2])
577
- )
578
  else:
579
- final = (
580
- f"**Answer:** {extractive}\n\n"
581
- f"**Citations:** {header_cites}{coverage_note}"
582
- )
583
  if include_passages:
584
- final += "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
585
 
 
586
  prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
587
  completion_toks = llm_usage.get("completion_tokens") if llm_usage else None
588
  cost_usd = _calc_cost_usd(prompt_toks, completion_toks)
@@ -590,226 +734,488 @@ def rag_reply(
590
  total_ms = int((time.time() - t0_total) * 1000)
591
  record = {
592
  "run_id": run_id,
593
- "ts": int(time.time() * 1000),
594
  "inputs": {
595
- "question": question,
596
- "top_k": int(k),
597
- "n_sentences": int(n_sentences),
598
- "w_tfidf": float(w_tfidf),
599
- "w_bm25": float(w_bm25),
600
- "w_emb": float(w_emb),
601
- "use_llm": bool(use_llm),
602
- "model": model,
603
- "temperature": float(temperature),
604
- "config_id": config_id,
605
  },
606
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
607
  "output": {
608
  "final_answer": final,
609
- "used_sentences": [
610
- {"sent": s["sent"], "doc": s["doc"], "page": s["page"]}
611
- for s in selected
612
- ],
613
  },
614
  "latency_ms_total": total_ms,
615
  "latency_ms_llm": llm_latency_ms,
616
  "openai": {
617
  "prompt_tokens": prompt_toks,
618
  "completion_tokens": completion_toks,
619
- "cost_usd": cost_usd,
620
- }
621
- if use_llm
622
- else None,
623
  }
624
  _safe_write_jsonl(LOG_PATH, record)
625
  return final
626
 
627
- # ---------- automated grid evaluation over weights ----------
628
- def run_weight_grid_eval(
629
- gold_csv: str,
630
- weight_grid: List[Dict[str, float]],
631
- k: int = 8,
632
- n_sentences: int = 4,
633
- ) -> None:
634
- """
635
- Automatically evaluate many (w_tfidf, w_bm25, w_emb) combinations
636
- on the full gold question set.
637
-
638
- - Reads questions from gold_csv (column 'question')
639
- - For each configuration in weight_grid, calls rag_reply(...)
640
- with use_llm=False and strict_quotes_only=True
641
- - All runs are logged into rag_logs.jsonl with a 'config_id'
642
- and the exact weights.
643
- """
644
- gold_df = pd.read_csv(gold_csv)
645
- if "question" not in gold_df.columns:
646
- raise ValueError("gold_csv must contain a 'question' column.")
647
- questions = gold_df["question"].astype(str).tolist()
648
-
649
- for cfg in weight_grid:
650
- wt = float(cfg.get("w_tfidf", 0.0))
651
- wb = float(cfg.get("w_bm25", 0.0))
652
- we = float(cfg.get("w_emb", 0.0))
653
- cid = cfg.get("id") or f"tfidf{wt}_bm25{wb}_emb{we}"
654
-
655
- print(
656
- f"\n[GridEval] Running config {cid} "
657
- f"(w_tfidf={wt}, w_bm25={wb}, w_emb={we}, k={k})"
658
- )
659
-
660
- for q in questions:
661
- _ = rag_reply(
662
- question=q,
663
- k=int(k),
664
- n_sentences=int(n_sentences),
665
- include_passages=False,
666
- use_llm=False,
667
- model=None,
668
- temperature=0.0,
669
- strict_quotes_only=True,
670
- w_tfidf=wt,
671
- w_bm25=wb,
672
- w_emb=we,
673
- config_id=cid,
674
- )
675
-
676
- print("✅ RAG core + grid evaluation helpers loaded.")
677
-
678
-
679
- # ===================== GRADIO UI WRAPPER =====================
680
-
681
- # --- Chat wrapper ---
682
- def rag_chat_fn(
683
- message,
684
- history,
685
- top_k,
686
- n_sentences,
687
- include_passages,
688
- w_tfidf,
689
- w_bm25,
690
- w_emb,
691
- ):
692
  if not message or not message.strip():
693
  return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
694
- return rag_reply(
695
- question=message,
696
- k=int(top_k),
697
- n_sentences=int(n_sentences),
698
- include_passages=bool(include_passages),
699
- use_llm=False,
700
- model=None,
701
- temperature=0.2,
702
- strict_quotes_only=False,
703
- w_tfidf=float(w_tfidf),
704
- w_bm25=float(w_bm25),
705
- w_emb=float(w_emb),
706
- config_id=None,
707
- )
708
-
709
- # --- Evaluation wrapper: returns log + file paths ---
710
- def run_eval_ui(gold_file, k):
711
- if gold_file is None:
712
- gold_path = Path("gold.csv")
713
- if not gold_path.exists():
714
- return (
715
- "**No gold.csv provided or found in the working directory.**\n"
716
- "Upload a file or place gold.csv next to app.py.",
717
- None,
718
- None,
719
- None,
720
- )
721
- gold_csv = str(gold_path)
722
- else:
723
- gold_csv = gold_file.name
724
-
725
- logs_jsonl = str(LOG_PATH)
726
- out_dir = str(ARTIFACT_DIR)
727
-
728
- # run evaluation and write artifacts
729
- evaluate_rag(
730
- gold_csv,
731
- logs_jsonl,
732
- k=int(k),
733
- out_dir=out_dir,
734
- group_by_weights=True,
735
- )
736
-
737
- base = ARTIFACT_DIR
738
- perq = base / "metrics_per_question.csv"
739
- agg = base / "metrics_aggregate.json"
740
- surf = base / "metrics_by_weights.csv"
741
-
742
- perq_path = str(perq) if perq.exists() else None
743
- agg_path = str(agg) if agg.exists() else None
744
- surf_path = str(surf) if surf.exists() else None
745
-
746
- log_msg = (
747
- "✅ Evaluation finished.\n\n"
748
- f"- Per-question metrics: `{perq}`\n"
749
- f"- Aggregate metrics: `{agg}`\n"
750
- f"- Config surface: `{surf}`\n\n"
751
- "Use the download links below to save the files."
752
- )
753
-
754
- return log_msg, perq_path, agg_path, surf_path
755
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756
 
757
- # ------------- Build Gradio UI -----------------
758
- with gr.Blocks(title="Self-Sensing Concrete RAG") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
759
  gr.Markdown(
760
- "<h1>Self-Sensing Concrete Assistant — Hybrid RAG</h1>"
761
- "<p>Ask questions about self-sensing concrete; answers are grounded in your local PDFs.</p>"
 
 
 
762
  )
763
 
764
  with gr.Tabs():
765
- # --------- RAG Chat tab ---------
766
- with gr.Tab("📚 RAG Chat"):
767
  with gr.Row():
768
- top_k = gr.Slider(3, 15, value=8, step=1, label="Top-K chunks")
769
- n_sentences = gr.Slider(2, 8, value=4, step=1, label="Answer length (sentences)")
770
- include_passages = gr.Checkbox(
771
- value=False, label="Include supporting passages"
772
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
773
  with gr.Row():
 
 
 
 
 
774
  w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
775
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
776
- w_emb = gr.Slider(0.0, 1.0, value=W_EMB_DEFAULT, step=0.05, label="Dense weight")
 
 
 
 
 
 
777
 
778
  gr.ChatInterface(
779
  fn=rag_chat_fn,
780
- additional_inputs=[top_k, n_sentences, include_passages, w_tfidf, w_bm25, w_emb],
781
- title="Hybrid RAG Q&A",
782
- description="Hybrid BM25 + TF-IDF + dense retrieval with MMR sentence selection."
 
 
 
 
783
  )
784
 
785
- # --------- Evaluation tab ---------
786
- with gr.Tab("📏 Evaluate RAG"):
787
- gr.Markdown(
788
- "Upload **gold.csv** and compute retrieval metrics against "
789
- "`rag_artifacts/rag_logs.jsonl`.\n\n"
790
- "When you click **Run Evaluation**, the CSV/JSON files will be created "
791
- "and appear as download links below."
792
- )
793
- gold_file = gr.File(label="gold.csv", file_types=[".csv"])
794
- k_slider = gr.Slider(3, 15, value=8, step=1, label="k for Hit/Recall/nDCG")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
 
796
- btn_eval = gr.Button("Run Evaluation", variant="primary")
797
- eval_out = gr.Markdown(label="Evaluation log")
798
 
799
- perq_file = gr.File(label="Per-question metrics (CSV)")
800
- agg_file = gr.File(label="Aggregate metrics (JSON)")
801
- surf_file = gr.File(label="Config surface (CSV)")
802
 
803
- btn_eval.click(
804
- fn=run_eval_ui,
805
- inputs=[gold_file, k_slider],
806
- outputs=[eval_out, perq_file, agg_file, surf_file],
807
- )
808
 
809
- # ------------- Launch app -----------------
810
- if __name__ == "__main__":
811
- demo.queue().launch(
812
- server_name="0.0.0.0",
813
- server_port=7860,
814
- share=False,
815
- )
 
1
+ # ================================================================
2
+ # Self-Sensing Concrete Assistant Predictor (XGB) + Hybrid RAG
3
+ # - Uses local 'papers/' folder for literature
4
+ # - Robust MMR sentence selection (no list index errors)
5
+ # - Predictor: safe model caching + safe feature alignment
6
+ # - Stable categoricals ("NA"); no over-strict completeness gate
7
+ # - Fixed [[PAGE=...]] regex
8
+ # - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
9
+ # - UPDATED THEME: Dark-blue tabs + Evaluate tab + k-slider styling
10
+ # - PATCH: Per-question/aggregate File + JSON outputs now dark-themed via elem_id hooks
11
+ # - OPTIONAL JS: Adds .eval-active class when Evaluate tab is selected
12
+ # ================================================================
13
+
14
+ # ---------------------- Runtime flags (HF-safe) ----------------------
15
+ import os
16
+ os.environ["TRANSFORMERS_NO_TF"] = "1"
17
+ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
18
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
19
+
20
+ # ------------------------------- Imports ------------------------------
21
+ import re, joblib, warnings, json, traceback, time, uuid, subprocess, sys
22
  from pathlib import Path
23
  from typing import List, Dict, Any, Optional
24
 
 
26
  import pandas as pd
27
  import gradio as gr
28
 
29
+ warnings.filterwarnings("ignore", category=UserWarning)
 
30
 
31
+ # Optional deps (handled gracefully if missing)
 
 
32
  USE_DENSE = True
33
  try:
34
  from sentence_transformers import SentenceTransformer
 
41
  BM25Okapi = None
42
  print("rank_bm25 not installed; BM25 disabled (TF-IDF still works).")
43
 
44
+ # Optional OpenAI (for LLM paraphrase)
45
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
46
  OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
47
  try:
 
49
  except Exception:
50
  OpenAI = None
51
 
52
+ # LLM availability flag — used internally; UI remains hidden
53
+ LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
54
+
55
+ # ========================= Predictor (kept) =========================
56
+ CF_COL = "Conductive Filler Conc. (wt%)"
57
+ TARGET_COL = "Stress GF (MPa-1)"
58
+ CANON_NA = "NA" # canonical placeholder for categoricals
59
+
60
+ MAIN_VARIABLES = [
61
+ "Filler 1 Type",
62
+ "Filler 1 Diameter (µm)",
63
+ "Filler 1 Length (mm)",
64
+ CF_COL,
65
+ "Filler 1 Dimensionality",
66
+ "Filler 2 Type",
67
+ "Filler 2 Diameter (µm)",
68
+ "Filler 2 Length (mm)",
69
+ "Filler 2 Dimensionality",
70
+ "Specimen Volume (mm3)",
71
+ "Probe Count",
72
+ "Probe Material",
73
+ "W/B",
74
+ "S/B",
75
+ "Gauge Length (mm)",
76
+ "Curing Condition",
77
+ "Number of Fillers",
78
+ "Drying Temperature (°C)",
79
+ "Drying Duration (hr)",
80
+ "Loading Rate (MPa/s)",
81
+ "Modulus of Elasticity (GPa)",
82
+ "Current Type",
83
+ "Applied Voltage (V)"
84
+ ]
85
+
86
+ NUMERIC_COLS = {
87
+ "Filler 1 Diameter (µm)",
88
+ "Filler 1 Length (mm)",
89
+ CF_COL,
90
+ "Filler 2 Diameter (µm)",
91
+ "Filler 2 Length (mm)",
92
+ "Specimen Volume (mm3)",
93
+ "Probe Count",
94
+ "W/B",
95
+ "S/B",
96
+ "Gauge Length (mm)",
97
+ "Number of Fillers",
98
+ "Drying Temperature (°C)",
99
+ "Drying Duration (hr)",
100
+ "Loading Rate (MPa/s)",
101
+ "Modulus of Elasticity (GPa)",
102
+ "Applied Voltage (V)"
103
+ }
104
+
105
+ CATEGORICAL_COLS = {
106
+ "Filler 1 Type",
107
+ "Filler 1 Dimensionality",
108
+ "Filler 2 Type",
109
+ "Filler 2 Dimensionality",
110
+ "Probe Material",
111
+ "Curing Condition",
112
+ "Current Type"
113
+ }
114
+
115
+ DIM_CHOICES = ["0D", "1D", "2D", "3D", CANON_NA]
116
+ CURRENT_CHOICES = ["DC", "AC", CANON_NA]
117
+
118
+ MODEL_CANDIDATES = [
119
+ "stress_gf_xgb.joblib",
120
+ "models/stress_gf_xgb.joblib",
121
+ "/home/user/app/stress_gf_xgb.joblib",
122
+ os.getenv("MODEL_PATH", "")
123
+ ]
124
+
125
+ # ---------- Model caching + status ----------
126
+ MODEL = None
127
+ MODEL_STATUS = "🔴 Model not loaded"
128
+
129
def _try_load_model():
    """Populate module-level MODEL/MODEL_STATUS from the first loadable candidate.

    Walks MODEL_CANDIDATES (skipping empty entries and missing paths); the first
    path that joblib can load wins and sets a green status. A failed load is
    logged and the search continues. If nothing loads, MODEL stays None and the
    status explains where to place the model file.
    """
    global MODEL, MODEL_STATUS
    for candidate in (p for p in MODEL_CANDIDATES if p):
        if not os.path.exists(candidate):
            continue
        try:
            MODEL = joblib.load(candidate)
        except Exception as e:
            # Keep searching the remaining candidates after a bad file.
            print(f"[ModelLoad] Error from {candidate}: {e}")
            traceback.print_exc()
            MODEL = None
        else:
            MODEL_STATUS = f"🟢 Loaded model: {Path(candidate).name}"
            print("[ModelLoad] Loaded:", candidate)
            return
    if MODEL is None:
        MODEL_STATUS = "🔴 Model not found (place stress_gf_xgb.joblib at repo root or models/, or set MODEL_PATH)"
        print("[ModelLoad]", MODEL_STATUS)
145
+
146
+ _try_load_model() # load at import time
147
+
148
def _canon_cat(v: Any) -> str:
    """Normalize a raw categorical form value to a stable string.

    None, empty/whitespace-only strings, and common missing-value spellings
    (case-insensitive "NA", "N/A", "NAN", "NONE", "NULL") all collapse to the
    canonical CANON_NA placeholder so the model sees a single consistent
    category level. Any other value is returned stripped of surrounding
    whitespace.
    """
    if v is None:
        return CANON_NA
    s = str(v).strip()
    # "NA" and "NAN" added to the set: previously "na"/"Na" and a stringified
    # float NaN slipped through as category levels distinct from CANON_NA.
    if s == "" or s.upper() in {"NA", "N/A", "NAN", "NONE", "NULL"}:
        return CANON_NA
    return s
156
+
157
+ def _to_float_or_nan(v):
158
+ if v in ("", None):
159
+ return np.nan
160
+ try:
161
+ return float(str(v).replace(",", ""))
162
+ except Exception:
163
+ return np.nan
164
+
165
def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
    """Build a single-row DataFrame over MAIN_VARIABLES from raw form values.

    Numeric columns are parsed with _to_float_or_nan (missing -> NaN for the
    imputer), categorical columns are canonicalized via _canon_cat, and any
    remaining column falls back to its stripped string value or the CANON_NA
    placeholder when empty.
    """
    values: Dict[str, Any] = {}
    for col in MAIN_VARIABLES:
        raw = form_dict.get(col)
        if col in NUMERIC_COLS:
            values[col] = _to_float_or_nan(raw)
        elif col in CATEGORICAL_COLS:
            values[col] = _canon_cat(raw)
        else:
            text = "" if raw is None else str(raw).strip()
            values[col] = text if text else CANON_NA
    return pd.DataFrame([values], columns=MAIN_VARIABLES)
177
 
178
def _align_columns_to_model(df: pd.DataFrame, mdl) -> pd.DataFrame:
    """
    SAFE alignment:
      - If mdl.feature_names_in_ exists AND is a subset of df.columns (raw names), reorder to it.
      - Else, try a Pipeline step (e.g., 'preprocessor') with feature_names_in_ subset of df.columns.
      - Else, DO NOT align (let the pipeline handle columns by name).
    """
    try:
        # 1) Direct attribute on the estimator/pipeline itself.
        feat = getattr(mdl, "feature_names_in_", None)
        if isinstance(feat, (list, np.ndarray, pd.Index)):
            feat = list(feat)
            if all(c in df.columns for c in feat):
                return df[feat]

        # 2) Pipeline case: look for a known preprocessing step by name.
        if hasattr(mdl, "named_steps"):
            for key in ["preprocessor", "columntransformer"]:
                if key in mdl.named_steps:
                    step = mdl.named_steps[key]
                    feat2 = getattr(step, "feature_names_in_", None)
                    if isinstance(feat2, (list, np.ndarray, pd.Index)):
                        feat2 = list(feat2)
                        if all(c in df.columns for c in feat2):
                            return df[feat2]
            # fallback to first step if it exposes input names
            try:
                first_key = list(mdl.named_steps.keys())[0]
                step = mdl.named_steps[first_key]
                feat3 = getattr(step, "feature_names_in_", None)
                if isinstance(feat3, (list, np.ndarray, pd.Index)):
                    feat3 = list(feat3)
                    if all(c in df.columns for c in feat3):
                        return df[feat3]
            except Exception:
                pass

        # 3) No safe reordering found: hand df through unchanged.
        return df
    except Exception as e:
        # Alignment is best-effort; never block prediction on it.
        print(f"[Align] Skip aligning due to: {e}")
        traceback.print_exc()
        return df
218
+
219
def predict_fn(**kwargs):
    """
    Always attempt prediction.
    - Missing numerics -> NaN (imputer handles)
    - Categoricals -> 'NA'
    - If model missing or inference error -> 0.0 (keeps UI stable)
    """
    if MODEL is None:
        return 0.0

    features = _align_columns_to_model(_coerce_to_row(kwargs), MODEL)
    try:
        raw_pred = MODEL.predict(features)
        # Undo the log1p target transform when the saved model flags it;
        # otherwise the prediction is already on the original scale.
        if getattr(MODEL, "target_is_log1p_", False):
            pred = np.expm1(raw_pred)
        else:
            pred = raw_pred
        value = float(np.asarray(pred).ravel()[0])
        return max(value, 0.0)  # gauge factor cannot be negative
    except Exception as e:
        print(f"[Predict] {e}")
        traceback.print_exc()
        return 0.0
242
+
243
# Demo payload for the "Fill Example" button: a 1D CNT-filled mortar specimen.
# Keys must match MAIN_VARIABLES entries; blanks/None exercise the NA handling.
EXAMPLE = {
    "Filler 1 Type": "CNT",
    "Filler 1 Dimensionality": "1D",
    "Filler 1 Diameter (µm)": 0.02,
    "Filler 1 Length (mm)": 1.2,
    CF_COL: 0.5,
    "Filler 2 Type": "",
    "Filler 2 Dimensionality": CANON_NA,
    "Filler 2 Diameter (µm)": None,
    "Filler 2 Length (mm)": None,
    "Specimen Volume (mm3)": 1000,
    "Probe Count": 2,
    "Probe Material": "Copper",
    "W/B": 0.4,
    "S/B": 2.5,
    "Gauge Length (mm)": 20,
    "Curing Condition": "28d water, 20°C",
    "Number of Fillers": 1,
    "Drying Temperature (°C)": 60,
    "Drying Duration (hr)": 24,
    "Loading Rate (MPa/s)": 0.1,
    "Modulus of Elasticity (GPa)": 25,
    "Current Type": "DC",
    "Applied Voltage (V)": 5.0,
}
268
+
269
def _fill_example():
    """Return the EXAMPLE values ordered to match the MAIN_VARIABLES inputs."""
    return [EXAMPLE.get(name) for name in MAIN_VARIABLES]
271
+
272
def _clear_all():
    """Produce 'cleared' widget values, one per MAIN_VARIABLES entry.

    Numerics clear to None, dropdowns reset to CANON_NA, and free-text
    fields become empty strings.
    """
    dropdown_cols = {"Filler 1 Dimensionality", "Filler 2 Dimensionality", "Current Type"}
    cleared_values = []
    for name in MAIN_VARIABLES:
        if name in NUMERIC_COLS:
            cleared_values.append(None)
        elif name in dropdown_cols:
            cleared_values.append(CANON_NA)
        else:
            cleared_values.append("")
    return cleared_values
284
 
285
+ # ========================= Hybrid RAG =========================
286
+ ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
287
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
288
  TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
289
  BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
290
  EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
291
  RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
292
 
293
+ LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
 
294
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
295
 
 
296
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
297
  W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
298
+ W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
299
 
 
300
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
301
  TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
 
302
def sent_split(text: str) -> List[str]:
    """Split *text* into sentences, keeping only those with at least 5 words."""
    pieces = (p.strip() for p in _SENT_SPLIT_RE.split(text))
    return [p for p in pieces if p and len(p.split()) >= 5]
 
305
def tokenize(text: str) -> List[str]:
    """Lowercased technical tokens (letters, digits, and _ # + - / . % symbols)."""
    return [match.lower() for match in TOKEN_RE.findall(text)]
307
 
 
308
  def _extract_pdf_text(pdf_path: Path) -> str:
309
  try:
310
+ import fitz
311
  doc = fitz.open(pdf_path)
312
  out = []
313
  for i, page in enumerate(doc):
 
331
  chunks, step = [], max(1, win_size - overlap)
332
  for i in range(0, len(sents), step):
333
  window = sents[i:i+win_size]
334
+ if not window: break
 
335
  chunks.append(" ".join(window))
336
  return chunks
337
 
 
338
  def _safe_init_st_model(name: str):
339
  global USE_DENSE
340
  if not USE_DENSE:
 
346
  USE_DENSE = False
347
  return None
348
 
 
349
  def build_or_load_hybrid(pdf_dir: Path):
350
+ # Build or load the hybrid retriever cache
351
+ have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
352
+ and RAG_META_PATH.exists()
353
+ and (BM25_TOK_PATH.exists() or BM25Okapi is None)
354
+ and (EMB_NPY_PATH.exists() or not USE_DENSE))
 
 
 
 
 
 
355
  if have_cache:
356
  vectorizer = joblib.load(TFIDF_VECT_PATH)
357
  X_tfidf = joblib.load(TFIDF_MAT_PATH)
 
361
  return vectorizer, X_tfidf, meta, bm25_toks, emb
362
 
363
  rows, all_tokens = [], []
364
+ pdf_paths = list(Path(pdf_dir).glob("**/*.pdf"))
365
+ print(f"Indexing PDFs in {pdf_dir} — found {len(pdf_paths)} files.")
366
  for pdf in pdf_paths:
367
  raw = _extract_pdf_text(pdf)
368
  if not raw.strip():
 
370
  for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
371
  rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
372
  all_tokens.append(tokenize(ch))
 
373
  if not rows:
374
  meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
375
+ vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
376
+ return vectorizer, X_tfidf, meta, all_tokens, emb
377
 
378
  meta = pd.DataFrame(rows)
379
+ from sklearn.feature_extraction.text import TfidfVectorizer
380
  vectorizer = TfidfVectorizer(
381
+ ngram_range=(1,2),
382
+ min_df=1, max_df=0.95,
383
+ sublinear_tf=True, smooth_idf=True,
 
 
384
  lowercase=True,
385
+ token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b"
386
  )
387
  X_tfidf = vectorizer.fit_transform(meta["text"].tolist())
388
 
389
  emb = None
390
  if USE_DENSE:
391
  try:
392
+ st_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
 
 
393
  if st_model is not None:
394
  from sklearn.preprocessing import normalize as sk_normalize
395
+ em = st_model.encode(meta["text"].tolist(), batch_size=64, show_progress_bar=False, convert_to_numpy=True)
 
 
 
 
 
396
  emb = sk_normalize(em)
397
  np.save(EMB_NPY_PATH, emb)
398
  except Exception as e:
399
  print("Dense embedding failed:", e)
400
  emb = None
401
 
 
402
  joblib.dump(vectorizer, TFIDF_VECT_PATH)
403
  joblib.dump(X_tfidf, TFIDF_MAT_PATH)
404
  if BM25Okapi is not None:
405
  joblib.dump(all_tokens, BM25_TOK_PATH)
406
  meta.to_parquet(RAG_META_PATH, index=False)
 
407
  return vectorizer, X_tfidf, meta, all_tokens, emb
408
 
409
# Build (or load from cache) the hybrid retriever index at import time.
tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)

# BM25 over pre-tokenized chunks — only when rank_bm25 is installed and tokens exist.
bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
# Query-side sentence-transformer encoder (None when dense retrieval is disabled).
st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
 
 
412
 
 
413
  def _extract_page(text_chunk: str) -> str:
414
+ # Correct: [[PAGE=123]]
415
  m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
416
+ return (m[-1].group(1) if m else "?")
417
 
418
+ def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
 
 
 
 
 
 
419
  if rag_meta is None or rag_meta.empty:
420
  return pd.DataFrame()
421
 
422
+ # Dense scores
 
 
423
  if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
424
  try:
425
  from sklearn.preprocessing import normalize as sk_normalize
 
428
  dense_scores = emb_matrix @ q_emb
429
  except Exception as e:
430
  print("Dense query encoding failed:", e)
431
+ dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
432
  else:
433
+ dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
434
 
435
+ # TF-IDF scores
436
  if tfidf_vectorizer is not None and tfidf_matrix is not None:
437
  q_vec = tfidf_vectorizer.transform([query])
438
  tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
439
  else:
440
+ tfidf_scores = np.zeros(len(rag_meta), dtype=float); w_tfidf = 0.0
441
 
442
+ # BM25 scores
443
  if bm25 is not None:
444
+ q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-\/\.%]+", query)]
445
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
446
  else:
447
+ bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
448
 
449
  def _norm(x):
450
  x = np.asarray(x, dtype=float)
 
457
  s_bm25 = _norm(bm25_scores)
458
 
459
  total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
460
+ w_tfidf, w_bm25, w_emb = w_tfidf/total_w, w_bm25/total_w, w_emb/total_w
461
 
462
  combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
463
  idx = np.argsort(-combo)[:k]
 
464
  hits = rag_meta.iloc[idx].copy()
465
  hits["score_dense"] = s_dense[idx]
466
  hits["score_tfidf"] = s_tfidf[idx]
 
468
  hits["score"] = combo[idx]
469
  return hits.reset_index(drop=True)
470
 
 
471
def split_sentences(text: str) -> List[str]:
    """Sentences from *text* whose word count lies in [6, 60]."""
    return [s for s in sent_split(text) if 6 <= len(s.split()) <= 60]
474
 
475
+ def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_chunk=6, lambda_div=0.7):
476
+ """
477
+ Robust MMR sentence picker:
478
+ - Handles empty pools
479
+ - Clamps top_n to pool size
480
+ - Avoids 'list index out of range'
481
+ """
482
+ # Build pool
483
  pool = []
484
  for _, row in hits.iterrows():
485
  doc = Path(row["doc_path"]).name
 
489
  continue
490
  for s in sents[:max(1, int(pool_per_chunk))]:
491
  pool.append({"sent": s, "doc": doc, "page": page})
492
+
493
  if not pool:
494
  return []
495
 
496
+ # Relevance vectors
497
  sent_texts = [p["sent"] for p in pool]
498
  use_dense = USE_DENSE and st_query_model is not None
 
499
  try:
500
  if use_dense:
501
  from sklearn.preprocessing import normalize as sk_normalize
502
  enc = st_query_model.encode([question] + sent_texts, convert_to_numpy=True)
503
  q_vec = sk_normalize(enc[:1])[0]
504
  S = sk_normalize(enc[1:])
505
+ rel = (S @ q_vec)
506
  def sim_fn(i, j): return float(S[i] @ S[j])
507
  else:
508
  from sklearn.feature_extraction.text import TfidfVectorizer
 
513
  num = (S[i] @ S[j].T)
514
  return float(num.toarray()[0, 0]) if hasattr(num, "toarray") else float(num)
515
  except Exception:
516
+ # Fallback: uniform relevance if vectorization fails
517
+ rel = np.ones(len(sent_texts), dtype=float)
518
  def sim_fn(i, j): return 0.0
519
 
520
+ # Normalize lambda_div
521
  lambda_div = float(np.clip(lambda_div, 0.0, 1.0))
522
 
523
+ # Select first by highest relevance
524
  remain = list(range(len(pool)))
525
+ if not remain:
526
+ return []
527
  first = int(np.argmax(rel))
528
  selected_idx = [first]
529
  selected = [pool[first]]
530
  remain.remove(first)
531
 
532
+ # Clamp top_n
533
  max_pick = min(int(top_n), len(pool))
534
  while len(selected) < max_pick and remain:
535
  cand_scores = []
 
537
  div_i = max(sim_fn(i, j) for j in selected_idx) if selected_idx else 0.0
538
  score = lambda_div * float(rel[i]) - (1.0 - lambda_div) * div_i
539
  cand_scores.append((score, i))
540
+ if not cand_scores:
541
+ break
542
  cand_scores.sort(reverse=True)
543
  _, best_i = cand_scores[0]
544
  selected_idx.append(best_i)
 
552
  return ""
553
  return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
554
 
555
+ # ========================= NEW: Instrumentation helpers =========================
556
+ LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
557
  OPENAI_IN_COST_PER_1K = float(os.getenv("OPENAI_COST_IN_PER_1K", "0"))
558
  OPENAI_OUT_COST_PER_1K = float(os.getenv("OPENAI_COST_OUT_PER_1K", "0"))
559
 
 
567
def _calc_cost_usd(prompt_toks, completion_toks):
    """Estimate the USD cost of one OpenAI call from its token counts.

    Returns None when either count is missing (no usage info available).
    """
    if prompt_toks is None or completion_toks is None:
        return None
    input_cost = (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K
    output_cost = (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
    return input_cost + output_cost
 
 
571
 
572
+ # ----------------- Modified to return (text, usage_dict) -----------------
573
+ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
 
 
 
 
 
574
  if not LLM_AVAILABLE:
575
  return None, None
576
  client = OpenAI(api_key=OPENAI_API_KEY)
577
  model = model or OPENAI_MODEL
 
578
  SYSTEM_PROMPT = (
579
  "You are a scientific assistant for self-sensing cementitious materials.\n"
580
  "Answer STRICTLY using the provided sentences.\n"
 
583
  )
584
  user_prompt = (
585
  f"Question: {question}\n\n"
586
+ f"Use ONLY these sentences to answer; keep their inline citations:\n" +
587
+ "\n".join(f"- {s}" for s in sentence_lines)
588
  )
 
589
  try:
590
  resp = client.responses.create(
591
  model=model,
 
609
  except Exception:
610
  return None, None
611
 
 
612
  def rag_reply(
613
  question: str,
614
  k: int = 8,
615
  n_sentences: int = 4,
616
  include_passages: bool = False,
617
  use_llm: bool = False,
618
+ model: str = None,
619
  temperature: float = 0.2,
620
  strict_quotes_only: bool = False,
621
  w_tfidf: float = W_TFIDF_DEFAULT,
622
+ w_bm25: float = W_BM25_DEFAULT,
623
+ w_emb: float = W_EMB_DEFAULT
 
624
  ) -> str:
625
  run_id = str(uuid.uuid4())
626
  t0_total = time.time()
627
  t0_retr = time.time()
628
 
629
+ # --- Retrieval ---
630
+ hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
 
 
 
 
 
631
  t1_retr = time.time()
632
  latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
633
 
634
  if hits is None or hits.empty:
635
+ final = "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
636
  record = {
637
  "run_id": run_id,
638
+ "ts": int(time.time()*1000),
639
  "inputs": {
640
+ "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
641
+ "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
642
+ "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
 
 
 
 
 
 
 
643
  },
644
  "retrieval": {"hits": [], "latency_ms_retriever": latency_ms_retriever},
645
  "output": {"final_answer": final, "used_sentences": []},
646
+ "latency_ms_total": int((time.time()-t0_total)*1000),
647
+ "openai": None
648
  }
649
  _safe_write_jsonl(LOG_PATH, record)
650
  return final
651
 
652
+ # Select sentences
653
+ selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
654
+ header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
655
+ srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
656
+ coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
 
 
 
 
 
 
 
 
 
657
 
658
+ # Prepare retrieval list for logging
659
  retr_list = []
660
  for _, r in hits.iterrows():
661
+ retr_list.append({
662
+ "doc": Path(r["doc_path"]).name,
663
+ "page": _extract_page(r["text"]),
664
+ "score_tfidf": float(r.get("score_tfidf", 0.0)),
665
+ "score_bm25": float(r.get("score_bm25", 0.0)),
666
+ "score_dense": float(r.get("score_dense", 0.0)),
667
+ "combo_score": float(r.get("score", 0.0)),
668
+ })
669
+
670
+ # Strict quotes only (no LLM)
 
 
671
  if strict_quotes_only:
672
  if not selected:
673
+ final = f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
 
674
  else:
675
+ final = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
 
 
676
  final += f"\n\n**Citations:** {header_cites}{coverage_note}"
677
  if include_passages:
678
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
679
 
680
  record = {
681
  "run_id": run_id,
682
+ "ts": int(time.time()*1000),
683
  "inputs": {
684
+ "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
685
+ "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
686
+ "use_llm": False, "model": None, "temperature": float(temperature)
 
 
 
 
 
 
 
687
  },
688
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
689
  "output": {
690
  "final_answer": final,
691
+ "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
 
 
 
692
  },
693
+ "latency_ms_total": int((time.time()-t0_total)*1000),
694
+ "openai": None
695
  }
696
  _safe_write_jsonl(LOG_PATH, record)
697
  return final
698
 
699
+ # Extractive or LLM synthesis
700
  extractive = compose_extractive(selected)
701
  llm_usage = None
702
  llm_latency_ms = None
 
703
  if use_llm and selected:
704
  lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
705
  t0_llm = time.time()
706
+ llm_text, llm_usage = synthesize_with_llm(question, lines, model=model, temperature=temperature)
 
 
707
  t1_llm = time.time()
708
  llm_latency_ms = int((t1_llm - t0_llm) * 1000)
709
 
710
  if llm_text:
711
+ final = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
712
  if include_passages:
713
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
714
  else:
715
  if not extractive:
716
+ final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
 
 
717
  else:
718
+ final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
719
  if include_passages:
720
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
721
  else:
722
  if not extractive:
723
+ final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
 
 
724
  else:
725
+ final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
726
  if include_passages:
727
+ final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
728
 
729
+ # --------- Log full run ---------
730
  prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
731
  completion_toks = llm_usage.get("completion_tokens") if llm_usage else None
732
  cost_usd = _calc_cost_usd(prompt_toks, completion_toks)
 
734
  total_ms = int((time.time() - t0_total) * 1000)
735
  record = {
736
  "run_id": run_id,
737
+ "ts": int(time.time()*1000),
738
  "inputs": {
739
+ "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
740
+ "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
741
+ "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
 
 
 
 
 
 
 
742
  },
743
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
744
  "output": {
745
  "final_answer": final,
746
+ "used_sentences": [{"sent": s['sent'], "doc": s['doc'], "page": s['page']} for s in selected]
 
 
 
747
  },
748
  "latency_ms_total": total_ms,
749
  "latency_ms_llm": llm_latency_ms,
750
  "openai": {
751
  "prompt_tokens": prompt_toks,
752
  "completion_tokens": completion_toks,
753
+ "cost_usd": cost_usd
754
+ } if use_llm else None
 
 
755
  }
756
  _safe_write_jsonl(LOG_PATH, record)
757
  return final
758
 
759
def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
                use_llm, model_name, temperature, strict_quotes_only,
                w_tfidf, w_bm25, w_emb):
    """Gradio chat callback: validate the message, coerce widget values to
    their expected types, delegate to rag_reply, and surface any failure as
    a chat string instead of raising (keeps the UI responsive)."""
    if not message or not message.strip():
        return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
    params = dict(
        question=message,
        k=int(top_k),
        n_sentences=int(n_sentences),
        include_passages=bool(include_passages),
        use_llm=bool(use_llm),
        model=(model_name or None),
        temperature=float(temperature),
        strict_quotes_only=bool(strict_quotes_only),
        w_tfidf=float(w_tfidf),
        w_bm25=float(w_bm25),
        w_emb=float(w_emb),
    )
    try:
        return rag_reply(**params)
    except Exception as e:
        return f"RAG error: {e}"
780
+
781
+ # ========================= UI (science-oriented styling) =========================
782
+ CSS = """
783
+ /* Science-oriented: crisp contrast + readable numerics */
784
+ * {font-family: ui-sans-serif, system-ui, -apple-system, 'Segoe UI', Roboto, 'Helvetica Neue', Arial;}
785
+ .gradio-container {
786
+ background: linear-gradient(135deg, #0b1020 0%, #0c2b1a 60%, #0a2b4d 100%) !important;
787
+ }
788
+ .card {background: rgba(255,255,255,0.06) !important; border: 1px solid rgba(255,255,255,0.14); border-radius: 12px;}
789
+ label {color: #e8f7ff !important; text-shadow: 0 1px 0 rgba(0,0,0,0.35); cursor: pointer;}
790
+ input[type="number"] {font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;}
791
+
792
+ /* Checkbox clickability fixes */
793
+ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
794
+ .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
795
+ #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }
796
+
797
+ /* RAG tab styling */
798
+ #rag-tab .block, #rag-tab .group, #rag-tab .accordion {
799
+ background: linear-gradient(160deg, #1f2937 0%, #14532d 55%, #0b3b68 100%) !important;
800
+ border-radius: 12px;
801
+ border: 1px solid rgba(255,255,255,0.14);
802
+ }
803
+ #rag-tab input, #rag-tab textarea, #rag-tab select, #rag-tab .scroll-hide, #rag-tab .chatbot textarea {
804
+ background: rgba(17, 24, 39, 0.85) !important;
805
+ border: 1px solid #60a5fa !important;
806
+ color: #e5f2ff !important;
807
+ }
808
+ #rag-tab input[type="range"] { accent-color: #22c55e !important; }
809
+ #rag-tab button { border-radius: 10px !important; font-weight: 600 !important; }
810
+ #rag-tab .chatbot {
811
+ background: rgba(15, 23, 42, 0.6) !important;
812
+ border: 1px solid rgba(148, 163, 184, 0.35) !important;
813
+ }
814
+ #rag-tab .message.user {
815
+ background: rgba(34, 197, 94, 0.15) !important;
816
+ border-left: 3px solid #22c55e !important;
817
+ }
818
+ #rag-tab .message.bot {
819
+ background: rgba(59, 130, 246, 0.15) !important;
820
+ border-left: 3px solid #60a5fa !important;
821
+ color: #eef6ff !important;
822
+ }
823
+
824
+ /* Evaluate tab dark/high-contrast styling */
825
+ #eval-tab .block, #eval-tab .group, #eval-tab .accordion {
826
+ background: linear-gradient(165deg, #0a0f1f 0%, #0d1a31 60%, #0a1c2e 100%) !important;
827
+ border-radius: 12px;
828
+ border: 1px solid rgba(139, 197, 255, 0.28);
829
+ }
830
+ #eval-tab label, #eval-tab .markdown, #eval-tab .prose, #eval-tab p, #eval-tab span {
831
+ color: #e6f2ff !important;
832
+ }
833
+ #eval-tab input, #eval-tab .gr-file, #eval-tab .scroll-hide, #eval-tab textarea, #eval-tab select {
834
+ background: rgba(8, 13, 26, 0.9) !important;
835
+ border: 1px solid #3b82f6 !important;
836
+ color: #dbeafe !important;
837
+ }
838
+ #eval-tab input[type="range"] { accent-color: #22c55e !important; }
839
+ #eval-tab button {
840
+ border-radius: 10px !important;
841
+ font-weight: 700 !important;
842
+ background: #0ea5e9 !important;
843
+ color: #001321 !important;
844
+ border: 1px solid #7dd3fc !important;
845
+ }
846
+ #eval-tab .gr-json, #eval-tab .markdown pre, #eval-tab .markdown code {
847
+ background: rgba(2, 6, 23, 0.85) !important;
848
+ color: #e2e8f0 !important;
849
+ border: 1px solid rgba(148, 163, 184, 0.3) !important;
850
+ border-radius: 10px !important;
851
+ }
852
+
853
+ /* Predictor output emphasis */
854
+ #pred-out .wrap { font-size: 20px; font-weight: 700; color: #ecfdf5; }
855
+
856
+ /* Tab header: darker blue theme for all tabs */
857
+ .gradio-container .tab-nav button[role="tab"] {
858
+ background: #0b1b34 !important;
859
+ color: #cfe6ff !important;
860
+ border: 1px solid #1e3a8a !important;
861
+ }
862
+ .gradio-container .tab-nav button[role="tab"][aria-selected="true"] {
863
+ background: #0e2a57 !important;
864
+ color: #e0f2fe !important;
865
+ border-color: #3b82f6 !important;
866
+ }
867
+
868
+ /* Evaluate tab: enforce dark-blue text for labels/marks */
869
+ #eval-tab .label,
870
+ #eval-tab label,
871
+ #eval-tab .gr-slider .label,
872
+ #eval-tab .wrap .label,
873
+ #eval-tab .prose,
874
+ #eval-tab .markdown,
875
+ #eval-tab p,
876
+ #eval-tab span {
877
+ color: #cfe6ff !important; /* softer than pure white */
878
+ }
879
+
880
+ /* Target the specific k-slider label strongly */
881
+ #k-slider .label,
882
+ #k-slider label,
883
+ #k-slider .wrap .label {
884
+ color: #cfe6ff !important;
885
+ text-shadow: 0 1px 0 rgba(0,0,0,0.35);
886
+ }
887
+
888
+ /* Slider track/thumb (dark blue gradient + blue thumb) */
889
+ #eval-tab input[type="range"] {
890
+ accent-color: #3b82f6 !important; /* fallback */
891
+ }
892
+
893
+ /* WebKit */
894
+ #eval-tab input[type="range"]::-webkit-slider-runnable-track {
895
+ height: 6px;
896
+ background: linear-gradient(90deg, #0b3b68, #1e3a8a);
897
+ border-radius: 4px;
898
+ }
899
+ #eval-tab input[type="range"]::-webkit-slider-thumb {
900
+ -webkit-appearance: none;
901
+ appearance: none;
902
+ margin-top: -6px; /* centers thumb on 6px track */
903
+ width: 18px; height: 18px;
904
+ background: #1d4ed8;
905
+ border: 1px solid #60a5fa;
906
+ border-radius: 50%;
907
+ }
908
+
909
+ /* Firefox */
910
+ #eval-tab input[type="range"]::-moz-range-track {
911
+ height: 6px;
912
+ background: linear-gradient(90deg, #0b3b68, #1e3a8a);
913
+ border-radius: 4px;
914
+ }
915
+ #eval-tab input[type="range"]::-moz-range-thumb {
916
+ width: 18px; height: 18px;
917
+ background: #1d4ed8;
918
+ border: 1px solid #60a5fa;
919
+ border-radius: 50%;
920
+ }
921
+
922
+ /* ======== PATCH: Style the File + JSON outputs by ID ======== */
923
+ #perq-file, #agg-file {
924
+ background: rgba(8, 13, 26, 0.9) !important;
925
+ border: 1px solid #3b82f6 !important;
926
+ border-radius: 12px !important;
927
+ padding: 8px !important;
928
+ }
929
+ #perq-file * , #agg-file * { color: #dbeafe !important; }
930
+ #perq-file a, #agg-file a {
931
+ background: #0e2a57 !important;
932
+ color: #e0f2fe !important;
933
+ border: 1px solid #60a5fa !important;
934
+ border-radius: 8px !important;
935
+ padding: 6px 10px !important;
936
+ text-decoration: none !important;
937
+ }
938
+ #perq-file a:hover, #agg-file a:hover {
939
+ background: #10356f !important;
940
+ border-color: #93c5fd !important;
941
+ }
942
+ /* File preview wrappers (covers multiple Gradio render modes) */
943
+ #perq-file .file-preview, #agg-file .file-preview,
944
+ #perq-file .wrap, #agg-file .wrap {
945
+ background: rgba(2, 6, 23, 0.85) !important;
946
+ border-radius: 10px !important;
947
+ border: 1px solid rgba(148,163,184,.3) !important;
948
+ }
949
+
950
+ /* JSON output: dark panel + readable text */
951
+ #agg-json {
952
+ background: rgba(2, 6, 23, 0.85) !important;
953
+ border: 1px solid rgba(148,163,184,.35) !important;
954
+ border-radius: 12px !important;
955
+ padding: 8px !important;
956
+ }
957
+ #agg-json *, #agg-json .json, #agg-json .wrap { color: #e6f2ff !important; }
958
+ #agg-json pre, #agg-json code {
959
+ background: rgba(4, 10, 24, 0.9) !important;
960
+ color: #e2e8f0 !important;
961
+ border: 1px solid rgba(148,163,184,.35) !important;
962
+ border-radius: 10px !important;
963
+ }
964
+ /* Tree/overflow modes */
965
+ #agg-json [data-testid="json-tree"],
966
+ #agg-json [role="tree"],
967
+ #agg-json .overflow-auto {
968
+ background: rgba(4, 10, 24, 0.9) !important;
969
+ color: #e6f2ff !important;
970
+ border-radius: 10px !important;
971
+ border: 1px solid rgba(148,163,184,.35) !important;
972
+ }
973
+
974
+ /* Eval log markdown */
975
+ #eval-log, #eval-log * { color: #cfe6ff !important; }
976
+ #eval-log pre, #eval-log code {
977
+ background: rgba(2, 6, 23, 0.85) !important;
978
+ color: #e2e8f0 !important;
979
+ border: 1px solid rgba(148,163,184,.3) !important;
980
+ border-radius: 10px !important;
981
+ }
982
+
983
+ /* When Evaluate tab is active and JS has added .eval-active, bump contrast subtly */
984
+ #eval-tab.eval-active .block,
985
+ #eval-tab.eval-active .group {
986
+ border-color: #60a5fa !important;
987
+ }
988
+ #eval-tab.eval-active .label {
989
+ color: #e6f2ff !important;
990
+ }
991
+ """
992
+
993
# Gradio theme: blue primary / green neutral hues on a dark navy base, kept
# in sync with the hand-written CSS gradients used elsewhere in this file.
theme = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="green"
).set(
    body_background_fill="#0b1020",
    body_text_color="#e0f2fe",
    input_background_fill="#0f172a",
    input_border_color="#1e40af",
    button_primary_background_fill="#2563eb",
    button_primary_text_color="#ffffff",
    button_secondary_background_fill="#14532d",
    button_secondary_text_color="#ecfdf5",
)
1006
 
1007
+ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1008
+ # Optional: JS to toggle .eval-active when Evaluate tab selected
1009
+ gr.HTML("""
1010
+ <script>
1011
+ (function(){
1012
+ const applyEvalActive = () => {
1013
+ const selected = document.querySelector('.tab-nav button[role="tab"][aria-selected="true"]');
1014
+ const evalPanel = document.querySelector('#eval-tab');
1015
+ if (!evalPanel) return;
1016
+ if (selected && /Evaluate/.test(selected.textContent)) {
1017
+ evalPanel.classList.add('eval-active');
1018
+ } else {
1019
+ evalPanel.classList.remove('eval-active');
1020
+ }
1021
+ };
1022
+ document.addEventListener('click', function(e) {
1023
+ if (e.target && e.target.getAttribute('role') === 'tab') {
1024
+ setTimeout(applyEvalActive, 50);
1025
+ }
1026
+ }, true);
1027
+ document.addEventListener('DOMContentLoaded', applyEvalActive);
1028
+ setTimeout(applyEvalActive, 300);
1029
+ })();
1030
+ </script>
1031
+ """)
1032
+
1033
  gr.Markdown(
1034
+ "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
1035
+ "<p style='opacity:.9'>"
1036
+ "Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
1037
+ "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection."
1038
+ "</p>"
1039
  )
1040
 
1041
  with gr.Tabs():
1042
+ # ------------------------- Predictor Tab -------------------------
1043
+ with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
1044
  with gr.Row():
1045
+ with gr.Column(scale=7):
1046
+ with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
1047
+ f1_type = gr.Textbox(label="Filler 1 Type *", placeholder="e.g., CNT, Graphite, Steel fiber")
1048
+ f1_diam = gr.Number(label="Filler 1 Diameter (µm) *")
1049
+ f1_len = gr.Number(label="Filler 1 Length (mm) *")
1050
+ cf_conc = gr.Number(label=f"{CF_COL} *", info="Weight percent of total binder")
1051
+ f1_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 1 Dimensionality *")
1052
+
1053
+ with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
1054
+ f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
1055
+ f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
1056
+ f2_len = gr.Number(label="Filler 2 Length (mm)")
1057
+ f2_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 2 Dimensionality")
1058
+
1059
+ with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
1060
+ spec_vol = gr.Number(label="Specimen Volume (mm3) *")
1061
+ probe_cnt = gr.Number(label="Probe Count *")
1062
+ probe_mat = gr.Textbox(label="Probe Material *", placeholder="e.g., Copper, Silver paste")
1063
+ wb = gr.Number(label="W/B *")
1064
+ sb = gr.Number(label="S/B *")
1065
+ gauge_len = gr.Number(label="Gauge Length (mm) *")
1066
+ curing = gr.Textbox(label="Curing Condition *", placeholder="e.g., 28d water, 20°C")
1067
+ n_fillers = gr.Number(label="Number of Fillers *")
1068
+
1069
+ with gr.Accordion("Processing", open=False, elem_classes=["card"]):
1070
+ dry_temp = gr.Number(label="Drying Temperature (°C)")
1071
+ dry_hrs = gr.Number(label="Drying Duration (hr)")
1072
+
1073
+ with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
1074
+ load_rate = gr.Number(label="Loading Rate (MPa/s)")
1075
+ E_mod = gr.Number(label="Modulus of Elasticity (GPa) *")
1076
+ current = gr.Dropdown(CURRENT_CHOICES, value=CANON_NA, label="Current Type")
1077
+ voltage = gr.Number(label="Applied Voltage (V)")
1078
+
1079
+ with gr.Column(scale=5):
1080
+ with gr.Group(elem_classes=["card"]):
1081
+ out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", value=0.0, precision=6, elem_id="pred-out")
1082
+ gr.Markdown(f"<small>{MODEL_STATUS}</small>")
1083
+ with gr.Row():
1084
+ btn_pred = gr.Button("Predict", variant="primary")
1085
+ btn_clear = gr.Button("Clear")
1086
+ btn_demo = gr.Button("Fill Example")
1087
+
1088
+ with gr.Accordion("About this model", open=False, elem_classes=["card"]):
1089
+ gr.Markdown(
1090
+ "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
1091
+ "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model may train on log1p; saved flag used at inference).\n"
1092
+ "- Missing values are safely imputed per-feature.\n"
1093
+ "- Trained columns:\n"
1094
+ f" `{', '.join(MAIN_VARIABLES)}`",
1095
+ elem_classes=["prose"]
1096
+ )
1097
+
1098
+ inputs_in_order = [
1099
+ f1_type, f1_diam, f1_len, cf_conc,
1100
+ f1_dim, f2_type, f2_diam, f2_len,
1101
+ f2_dim, spec_vol, probe_cnt, probe_mat,
1102
+ wb, sb, gauge_len, curing, n_fillers,
1103
+ dry_temp, dry_hrs, load_rate,
1104
+ E_mod, current, voltage
1105
+ ]
1106
+
1107
+ def _predict_wrapper(*vals):
1108
+ data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
1109
+ return predict_fn(**data)
1110
+
1111
+ btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
1112
+ btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order).then(lambda: 0.0, outputs=out_pred)
1113
+ btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)
1114
+
1115
+ # ------------------------- Literature Tab -------------------------
1116
+ with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
1117
+ pdf_count = len(list(LOCAL_PDF_DIR.glob("**/*.pdf")))
1118
+ gr.Markdown(
1119
+ f"Using local folder <code>papers/</code> — **{pdf_count} PDF(s)** indexed. "
1120
+ "Upload more PDFs and reload the Space to expand coverage. Answers cite (Doc.pdf, p.X)."
1121
+ )
1122
  with gr.Row():
1123
+ top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
1124
+ n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
1125
+ include_passages = gr.Checkbox(value=False, label="Include supporting passages", interactive=True)
1126
+
1127
+ with gr.Accordion("Retriever weights (advanced)", open=False):
1128
  w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
1129
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
1130
+ w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")
1131
+
1132
+ # Hidden states (unchanged)
1133
+ state_use_llm = gr.State(LLM_AVAILABLE)
1134
+ state_model_name = gr.State(os.getenv("OPENAI_MODEL", OPENAI_MODEL))
1135
+ state_temperature = gr.State(0.2)
1136
+ state_strict = gr.State(False)
1137
 
1138
  gr.ChatInterface(
1139
  fn=rag_chat_fn,
1140
+ additional_inputs=[
1141
+ top_k, n_sentences, include_passages,
1142
+ state_use_llm, state_model_name, state_temperature, state_strict,
1143
+ w_tfidf, w_bm25, w_emb
1144
+ ],
1145
+ title="Literature Q&A",
1146
+ description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
1147
  )
1148
 
1149
+ # ====== Evaluate (Gold vs Logs) — darker, higher-contrast ======
1150
+ with gr.Tab("📏 Evaluate (Gold vs Logs)", elem_id="eval-tab"):
1151
+ gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
1152
+ with gr.Row():
1153
+ gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
1154
+ k_slider = gr.Slider(3, 12, value=8, step=1, label="k for Hit/Recall/nDCG", elem_id="k-slider")
1155
+ with gr.Row():
1156
+ btn_eval = gr.Button("Compute Metrics", variant="primary")
1157
+ with gr.Row():
1158
+ out_perq = gr.File(label="Per-question metrics (CSV)", elem_id="perq-file")
1159
+ out_agg = gr.File(label="Aggregate metrics (JSON)", elem_id="agg-file")
1160
+ out_json = gr.JSON(label="Aggregate summary", elem_id="agg-json")
1161
+ out_log = gr.Markdown(label="Run log", elem_id="eval-log")
1162
+
1163
+ def _run_eval_inproc(gold_path: str, k: int = 8):
1164
+ import json as _json
1165
+ out_dir = str(ARTIFACT_DIR)
1166
+ logs = str(LOG_PATH)
1167
+ cmd = [
1168
+ sys.executable, "rag_eval_metrics.py",
1169
+ "--gold_csv", gold_path,
1170
+ "--logs_jsonl", logs,
1171
+ "--k", str(k),
1172
+ "--out_dir", out_dir
1173
+ ]
1174
+ try:
1175
+ p = subprocess.run(cmd, capture_output=True, text=True, check=False)
1176
+ stdout = p.stdout or ""
1177
+ stderr = p.stderr or ""
1178
+ perq = ARTIFACT_DIR / "metrics_per_question.csv"
1179
+ agg = ARTIFACT_DIR / "metrics_aggregate.json"
1180
+ agg_json = {}
1181
+ if agg.exists():
1182
+ agg_json = _json.loads(agg.read_text(encoding="utf-8"))
1183
+ report = "```\n" + (stdout.strip() or "(no stdout)") + ("\n" + stderr.strip() if stderr else "") + "\n```"
1184
+ return (str(perq) if perq.exists() else None,
1185
+ str(agg) if agg.exists() else None,
1186
+ agg_json,
1187
+ report)
1188
+ except Exception as e:
1189
+ return (None, None, {}, f"**Eval error:** {e}")
1190
+
1191
+ def _eval_wrapper(gf, k):
1192
+ from pathlib import Path
1193
+ if gf is None:
1194
+ default_gold = Path("gold.csv")
1195
+ if not default_gold.exists():
1196
+ return None, None, {}, "**No gold.csv provided or found in repo root.**"
1197
+ gold_path = str(default_gold)
1198
+ else:
1199
+ gold_path = gf.name
1200
+ return _run_eval_inproc(gold_path, int(k))
1201
+
1202
+ btn_eval.click(_eval_wrapper, inputs=[gold_file, k_slider],
1203
+ outputs=[out_perq, out_agg, out_json, out_log])
1204
+
1205
+ # ------------- Launch -------------
1206
+ if __name__ == "__main__":
1207
+ demo.queue().launch()
1208
+ import os
1209
+ import pandas as pd
1210
 
1211
+ # Folder where your RAG files are stored
1212
+ folder = "papers" # change if needed
1213
 
1214
+ # List all files in the folder
1215
+ files = sorted(os.listdir(folder))
 
1216
 
1217
+ # Save them to a CSV file
1218
+ pd.DataFrame({"doc": files}).to_csv("paper_list.csv", index=False)
1219
+
1220
+ print("✅ Saved paper_list.csv with", len(files), "papers")
 
1221