Inframat-x committed on
Commit
0617995
·
verified ·
1 Parent(s): 5d07d24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +649 -300
app.py CHANGED
@@ -5,10 +5,9 @@
5
  # - Predictor: safe model caching + safe feature alignment
6
  # - Stable categoricals ("NA"); no over-strict completeness gate
7
  # - Fixed [[PAGE=...]] regex
8
- # - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
9
- # - UPDATED THEME: Dark-blue tabs + Evaluate tab + k-slider styling
10
- # - PATCH: Per-question/aggregate File + JSON outputs now dark-themed via elem_id hooks
11
- # - NEW: APA-like inline citations via CITATION_MAP + format_citation()
12
  # ================================================================
13
 
14
  # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -18,7 +17,15 @@ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
18
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
19
 
20
  # ------------------------------- Imports ------------------------------
21
- import re, joblib, warnings, json, traceback, time, uuid, subprocess, sys
 
 
 
 
 
 
 
 
22
  from pathlib import Path
23
  from typing import List, Dict, Any, Optional
24
 
@@ -43,7 +50,7 @@ except Exception:
43
 
44
  # Optional OpenAI (for LLM paraphrase)
45
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
46
- OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
47
  try:
48
  from openai import OpenAI
49
  except Exception:
@@ -56,36 +63,10 @@ LLM_AVAILABLE = (
56
  and OpenAI is not None
57
  )
58
 
59
- # ========================= APA-style citation helpers =========================
60
-
61
- # Map PDF basenames → short APA-style in-text citations.
62
- # IMPORTANT: edit these entries to match your actual filenames + desired citations.
63
- CITATION_MAP: Dict[str, str] = {
64
- # "filename.pdf": "Author et al., YEAR",
65
- # Examples (you can edit/remove these):
66
- "S92-Research-on-the-self-sensing-and-mechanical-properties-of_2021_Cement-and-Co.pdf": "Omar et al., 2021",
67
- "S60-Study-on-self-sensing-capabilities-of-smart-cements-filled-with-graphene-oxide-under-dynamic-cyclic-loading.pdf": "Zhang et al., 2019",
68
- "S59-Modifying self-sensing cement-based composites through multiscale composition.pdf": "Li et al., 2020",
69
- }
70
-
71
- def format_citation(doc_name: str, page: Any) -> str:
72
- """
73
- Convert a doc filename + page into an APA-like inline citation.
74
- Fallback: use filename if doc not in CITATION_MAP.
75
- """
76
- base = Path(doc_name).name
77
- short = CITATION_MAP.get(base, base)
78
- if page is None:
79
- return short
80
- page_str = str(page).strip()
81
- if page_str == "" or page_str == "?":
82
- return short
83
- return f"{short}, p. {page_str}"
84
-
85
  # ========================= Predictor (kept) =========================
86
- CF_COL = "Conductive Filler Conc. (wt%)"
87
  TARGET_COL = "Stress GF (MPa-1)"
88
- CANON_NA = "NA" # canonical placeholder for categoricals
89
 
90
  MAIN_VARIABLES = [
91
  "Filler 1 Type",
@@ -110,7 +91,7 @@ MAIN_VARIABLES = [
110
  "Loading Rate (MPa/s)",
111
  "Modulus of Elasticity (GPa)",
112
  "Current Type",
113
- "Applied Voltage (V)"
114
  ]
115
 
116
  NUMERIC_COLS = {
@@ -129,7 +110,7 @@ NUMERIC_COLS = {
129
  "Drying Duration (hr)",
130
  "Loading Rate (MPa/s)",
131
  "Modulus of Elasticity (GPa)",
132
- "Applied Voltage (V)"
133
  }
134
 
135
  CATEGORICAL_COLS = {
@@ -139,23 +120,24 @@ CATEGORICAL_COLS = {
139
  "Filler 2 Dimensionality",
140
  "Probe Material",
141
  "Curing Condition",
142
- "Current Type"
143
  }
144
 
145
- DIM_CHOICES = ["0D", "1D", "2D", "3D", CANON_NA]
146
  CURRENT_CHOICES = ["DC", "AC", CANON_NA]
147
 
148
  MODEL_CANDIDATES = [
149
  "stress_gf_xgb.joblib",
150
  "models/stress_gf_xgb.joblib",
151
  "/home/user/app/stress_gf_xgb.joblib",
152
- os.getenv("MODEL_PATH", "")
153
  ]
154
 
155
  # ---------- Model caching + status ----------
156
  MODEL = None
157
  MODEL_STATUS = "🔴 Model not loaded"
158
 
 
159
  def _try_load_model():
160
  global MODEL, MODEL_STATUS
161
  for p in [x for x in MODEL_CANDIDATES if x]:
@@ -171,12 +153,15 @@ def _try_load_model():
171
  MODEL = None
172
  if MODEL is None:
173
  MODEL_STATUS = (
174
- "🔴 Model not found (place stress_gf_xgb.joblib at repo root or models/, or set MODEL_PATH)"
 
175
  )
176
  print("[ModelLoad]", MODEL_STATUS)
177
 
 
178
  _try_load_model() # load at import time
179
 
 
180
  def _canon_cat(v: Any) -> str:
181
  """Stable, canonical category placeholder normalization."""
182
  if v is None:
@@ -186,6 +171,7 @@ def _canon_cat(v: Any) -> str:
186
  return CANON_NA
187
  return s
188
 
 
189
  def _to_float_or_nan(v):
190
  if v in ("", None):
191
  return np.nan
@@ -194,6 +180,7 @@ def _to_float_or_nan(v):
194
  except Exception:
195
  return np.nan
196
 
 
197
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
198
  row = {}
199
  for col in MAIN_VARIABLES:
@@ -207,6 +194,7 @@ def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
207
  row[col] = s if s else CANON_NA
208
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
209
 
 
210
  def _align_columns_to_model(df: pd.DataFrame, mdl) -> pd.DataFrame:
211
  """
212
  SAFE alignment:
@@ -248,6 +236,7 @@ def _align_columns_to_model(df: pd.DataFrame, mdl) -> pd.DataFrame:
248
  traceback.print_exc()
249
  return df
250
 
 
251
  def predict_fn(**kwargs):
252
  """
253
  Always attempt prediction.
@@ -272,6 +261,7 @@ def predict_fn(**kwargs):
272
  traceback.print_exc()
273
  return 0.0
274
 
 
275
  EXAMPLE = {
276
  "Filler 1 Type": "CNT",
277
  "Filler 1 Dimensionality": "1D",
@@ -298,9 +288,11 @@ EXAMPLE = {
298
  "Applied Voltage (V)": 5.0,
299
  }
300
 
 
301
  def _fill_example():
302
  return [EXAMPLE.get(k, None) for k in MAIN_VARIABLES]
303
 
 
304
  def _clear_all():
305
  cleared = []
306
  for col in MAIN_VARIABLES:
@@ -314,32 +306,41 @@ def _clear_all():
314
  cleared.append("")
315
  return cleared
316
 
 
317
  # ========================= Hybrid RAG =========================
318
- ARTIFACT_DIR = Path("rag_artifacts"); ARTIFACT_DIR.mkdir(exist_ok=True)
 
319
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
320
- TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
321
- BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
322
- EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
323
- RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
324
 
325
- LOCAL_PDF_DIR = Path("papers"); LOCAL_PDF_DIR.mkdir(exist_ok=True)
 
326
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
327
 
328
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
329
- W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
330
- W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
331
 
332
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
333
- TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
 
 
334
  def sent_split(text: str) -> List[str]:
335
  sents = [s.strip() for s in _SENT_SPLIT_RE.split(text) if s.strip()]
336
  return [s for s in sents if len(s.split()) >= 5]
 
 
337
  def tokenize(text: str) -> List[str]:
338
  return [t.lower() for t in TOKEN_RE.findall(text)]
339
 
 
340
  def _extract_pdf_text(pdf_path: Path) -> str:
341
  try:
342
  import fitz
 
343
  doc = fitz.open(pdf_path)
344
  out = []
345
  for i, page in enumerate(doc):
@@ -348,6 +349,7 @@ def _extract_pdf_text(pdf_path: Path) -> str:
348
  except Exception:
349
  try:
350
  from pypdf import PdfReader
 
351
  reader = PdfReader(str(pdf_path))
352
  out = []
353
  for i, p in enumerate(reader.pages):
@@ -358,16 +360,18 @@ def _extract_pdf_text(pdf_path: Path) -> str:
358
  print(f"PDF read error ({pdf_path}): {e}")
359
  return ""
360
 
 
361
  def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
362
  sents = sent_split(text)
363
  chunks, step = [], max(1, win_size - overlap)
364
  for i in range(0, len(sents), step):
365
- window = sents[i:i+win_size]
366
  if not window:
367
  break
368
  chunks.append(" ".join(window))
369
  return chunks
370
 
 
371
  def _safe_init_st_model(name: str):
372
  global USE_DENSE
373
  if not USE_DENSE:
@@ -379,20 +383,26 @@ def _safe_init_st_model(name: str):
379
  USE_DENSE = False
380
  return None
381
 
 
382
  def build_or_load_hybrid(pdf_dir: Path):
383
  # Build or load the hybrid retriever cache
384
  have_cache = (
385
- TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
 
386
  and RAG_META_PATH.exists()
387
  and (BM25_TOK_PATH.exists() or BM25Okapi is None)
388
  and (EMB_NPY_PATH.exists() or not USE_DENSE)
389
  )
390
  if have_cache:
391
  vectorizer = joblib.load(TFIDF_VECT_PATH)
392
- X_tfidf = joblib.load(TFIDF_MAT_PATH)
393
- meta = pd.read_parquet(RAG_META_PATH)
394
- bm25_toks = joblib.load(BM25_TOK_PATH) if BM25Okapi is not None else None
395
- emb = np.load(EMB_NPY_PATH) if (USE_DENSE and EMB_NPY_PATH.exists()) else None
 
 
 
 
396
  return vectorizer, X_tfidf, meta, bm25_toks, emb
397
 
398
  rows, all_tokens = [], []
@@ -402,22 +412,30 @@ def build_or_load_hybrid(pdf_dir: Path):
402
  raw = _extract_pdf_text(pdf)
403
  if not raw.strip():
404
  continue
405
- for i, ch in enumerate(chunk_by_sentence_windows(raw, win_size=8, overlap=2)):
 
 
406
  rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
407
  all_tokens.append(tokenize(ch))
408
  if not rows:
409
  meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
410
- vectorizer = None; X_tfidf = None; emb = None; all_tokens = None
 
 
 
411
  return vectorizer, X_tfidf, meta, all_tokens, emb
412
 
413
  meta = pd.DataFrame(rows)
414
  from sklearn.feature_extraction.text import TfidfVectorizer
 
415
  vectorizer = TfidfVectorizer(
416
  ngram_range=(1, 2),
417
- min_df=1, max_df=0.95,
418
- sublinear_tf=True, smooth_idf=True,
 
 
419
  lowercase=True,
420
- token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b"
421
  )
422
  X_tfidf = vectorizer.fit_transform(meta["text"].tolist())
423
 
@@ -425,15 +443,18 @@ def build_or_load_hybrid(pdf_dir: Path):
425
  if USE_DENSE:
426
  try:
427
  st_model = _safe_init_st_model(
428
- os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
 
 
429
  )
430
  if st_model is not None:
431
  from sklearn.preprocessing import normalize as sk_normalize
 
432
  em = st_model.encode(
433
  meta["text"].tolist(),
434
  batch_size=64,
435
  show_progress_bar=False,
436
- convert_to_numpy=True
437
  )
438
  emb = sk_normalize(em)
439
  np.save(EMB_NPY_PATH, emb)
@@ -448,19 +469,33 @@ def build_or_load_hybrid(pdf_dir: Path):
448
  meta.to_parquet(RAG_META_PATH, index=False)
449
  return vectorizer, X_tfidf, meta, all_tokens, emb
450
 
451
- tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(LOCAL_PDF_DIR)
452
- bm25 = BM25Okapi(bm25_tokens) if (BM25Okapi is not None and bm25_tokens is not None) else None
453
- st_query_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
 
 
 
 
 
 
 
 
 
 
454
 
455
  def _extract_page(text_chunk: str) -> str:
456
  # Correct: [[PAGE=123]]
457
  m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
458
- return (m[-1].group(1) if m else "?")
 
459
 
460
- def hybrid_search(query: str, k=8,
461
- w_tfidf=W_TFIDF_DEFAULT,
462
- w_bm25=W_BM25_DEFAULT,
463
- w_emb=W_EMB_DEFAULT):
 
 
 
464
  if rag_meta is None or rag_meta.empty:
465
  return pd.DataFrame()
466
 
@@ -468,28 +503,35 @@ def hybrid_search(query: str, k=8,
468
  if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
469
  try:
470
  from sklearn.preprocessing import normalize as sk_normalize
 
471
  q_emb = st_query_model.encode([query], convert_to_numpy=True)
472
  q_emb = sk_normalize(q_emb)[0]
473
  dense_scores = emb_matrix @ q_emb
474
  except Exception as e:
475
  print("Dense query encoding failed:", e)
476
- dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
 
477
  else:
478
- dense_scores = np.zeros(len(rag_meta), dtype=float); w_emb = 0.0
 
479
 
480
  # TF-IDF scores
481
  if tfidf_vectorizer is not None and tfidf_matrix is not None:
482
  q_vec = tfidf_vectorizer.transform([query])
483
  tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
484
  else:
485
- tfidf_scores = np.zeros(len(rag_meta), dtype=float); w_tfidf = 0.0
 
486
 
487
  # BM25 scores
488
  if bm25 is not None:
489
- q_tokens = [t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-\/\.%]+", query)]
 
 
490
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
491
  else:
492
- bm25_scores = np.zeros(len(rag_meta), dtype=float); w_bm25 = 0.0
 
493
 
494
  def _norm(x):
495
  x = np.asarray(x, dtype=float)
@@ -499,26 +541,37 @@ def hybrid_search(query: str, k=8,
499
 
500
  s_dense = _norm(dense_scores)
501
  s_tfidf = _norm(tfidf_scores)
502
- s_bm25 = _norm(bm25_scores)
503
 
504
  total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
505
- w_tfidf, w_bm25, w_emb = w_tfidf/total_w, w_bm25/total_w, w_emb/total_w
 
 
 
 
506
 
507
  combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
508
  idx = np.argsort(-combo)[:k]
509
  hits = rag_meta.iloc[idx].copy()
510
  hits["score_dense"] = s_dense[idx]
511
  hits["score_tfidf"] = s_tfidf[idx]
512
- hits["score_bm25"] = s_bm25[idx]
513
- hits["score"] = combo[idx]
514
  return hits.reset_index(drop=True)
515
 
 
516
  def split_sentences(text: str) -> List[str]:
517
  sents = sent_split(text)
518
  return [s for s in sents if 6 <= len(s.split()) <= 60]
519
 
520
- def mmr_select_sentences(question: str, hits: pd.DataFrame,
521
- top_n=4, pool_per_chunk=6, lambda_div=0.7):
 
 
 
 
 
 
522
  """
523
  Robust MMR sentence picker:
524
  - Handles empty pools
@@ -528,12 +581,12 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame,
528
  # Build pool
529
  pool = []
530
  for _, row in hits.iterrows():
531
- doc = Path(row["doc_path"]).name
532
  page = _extract_page(row["text"])
533
  sents = split_sentences(row["text"])
534
  if not sents:
535
  continue
536
- for s in sents[:max(1, int(pool_per_chunk))]:
537
  pool.append({"sent": s, "doc": doc, "page": page})
538
 
539
  if not pool:
@@ -545,23 +598,39 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame,
545
  try:
546
  if use_dense:
547
  from sklearn.preprocessing import normalize as sk_normalize
548
- enc = st_query_model.encode([question] + sent_texts, convert_to_numpy=True)
 
 
 
549
  q_vec = sk_normalize(enc[:1])[0]
550
- S = sk_normalize(enc[1:])
551
- rel = (S @ q_vec)
552
- def sim_fn(i, j): return float(S[i] @ S[j])
 
 
 
553
  else:
554
  from sklearn.feature_extraction.text import TfidfVectorizer
 
555
  vect = TfidfVectorizer().fit(sent_texts + [question])
556
- Q = vect.transform([question]); S = vect.transform(sent_texts)
 
557
  rel = (S @ Q.T).toarray().ravel()
 
558
  def sim_fn(i, j):
559
- num = (S[i] @ S[j].T)
560
- return float(num.toarray()[0, 0]) if hasattr(num, "toarray") else float(num)
 
 
 
 
 
561
  except Exception:
562
  # Fallback: uniform relevance if vectorization fails
563
  rel = np.ones(len(sent_texts), dtype=float)
564
- def sim_fn(i, j): return 0.0
 
 
565
 
566
  # Normalize lambda_div
567
  lambda_div = float(np.clip(lambda_div, 0.0, 1.0))
@@ -572,7 +641,7 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame,
572
  return []
573
  first = int(np.argmax(rel))
574
  selected_idx = [first]
575
- selected = [pool[first]]
576
  remain.remove(first)
577
 
578
  # Clamp top_n
@@ -593,10 +662,63 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame,
593
 
594
  return selected
595
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
597
  """
598
  Build an extractive answer with APA-style inline citations.
599
- Each sentence ends with (Author et al., YEAR, p. X) when mapped in CITATION_MAP.
600
  """
601
  if not selected:
602
  return ""
@@ -606,11 +728,13 @@ def compose_extractive(selected: List[Dict[str, Any]]) -> str:
606
  parts.append(f"{s['sent']} ({cite})")
607
  return " ".join(parts)
608
 
 
609
  # ========================= NEW: Instrumentation helpers =========================
610
  LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
611
- OPENAI_IN_COST_PER_1K = float(os.getenv("OPENAI_COST_IN_PER_1K", "0"))
612
  OPENAI_OUT_COST_PER_1K = float(os.getenv("OPENAI_COST_OUT_PER_1K", "0"))
613
 
 
614
  def _safe_write_jsonl(path: Path, record: dict):
615
  try:
616
  with open(path, "a", encoding="utf-8") as f:
@@ -618,14 +742,23 @@ def _safe_write_jsonl(path: Path, record: dict):
618
  except Exception as e:
619
  print("[Log] write failed:", e)
620
 
 
621
  def _calc_cost_usd(prompt_toks, completion_toks):
622
  if prompt_toks is None or completion_toks is None:
623
  return None
624
- return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
 
 
 
 
625
 
626
  # ----------------- Modified to return (text, usage_dict) -----------------
627
- def synthesize_with_llm(question: str, sentence_lines: List[str],
628
- model: str = None, temperature: float = 0.2):
 
 
 
 
629
  if not LLM_AVAILABLE:
630
  return None, None
631
  client = OpenAI(api_key=OPENAI_API_KEY)
@@ -634,12 +767,12 @@ def synthesize_with_llm(question: str, sentence_lines: List[str],
634
  "You are a scientific assistant for self-sensing cementitious materials.\n"
635
  "Answer STRICTLY using the provided sentences.\n"
636
  "Do not invent facts. Keep it concise (3–6 sentences).\n"
637
- "Retain inline citations exactly as given (e.g., Omar et al., 2021, p. X).\n"
638
  )
639
  user_prompt = (
640
  f"Question: {question}\n\n"
641
- f"Use ONLY these sentences to answer; keep their inline citations:\n" +
642
- "\n".join(f"- {s}" for s in sentence_lines)
643
  )
644
  try:
645
  resp = client.responses.create(
@@ -655,8 +788,16 @@ def synthesize_with_llm(question: str, sentence_lines: List[str],
655
  try:
656
  u = getattr(resp, "usage", None)
657
  if u:
658
- pt = getattr(u, "prompt_tokens", None) if hasattr(u, "prompt_tokens") else u.get("prompt_tokens", None)
659
- ct = getattr(u, "completion_tokens", None) if hasattr(u, "completion_tokens") else u.get("completion_tokens", None)
 
 
 
 
 
 
 
 
660
  usage = {"prompt_tokens": pt, "completion_tokens": ct}
661
  except Exception:
662
  usage = None
@@ -664,6 +805,7 @@ def synthesize_with_llm(question: str, sentence_lines: List[str],
664
  except Exception:
665
  return None, None
666
 
 
667
  def rag_reply(
668
  question: str,
669
  k: int = 8,
@@ -674,102 +816,119 @@ def rag_reply(
674
  temperature: float = 0.2,
675
  strict_quotes_only: bool = False,
676
  w_tfidf: float = W_TFIDF_DEFAULT,
677
- w_bm25: float = W_BM25_DEFAULT,
678
- w_emb: float = W_EMB_DEFAULT
679
  ) -> str:
680
  run_id = str(uuid.uuid4())
681
  t0_total = time.time()
682
- t0_retr = time.time()
683
 
684
  # --- Retrieval ---
685
  hits = hybrid_search(
686
- question, k=k,
687
- w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb
 
 
 
688
  )
689
  t1_retr = time.time()
690
  latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
691
 
692
  if hits is None or hits.empty:
693
- final = "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
 
 
694
  record = {
695
  "run_id": run_id,
696
- "ts": int(time.time()*1000),
697
  "inputs": {
698
- "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
699
- "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
700
- "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
 
 
 
 
 
 
701
  },
702
  "retrieval": {"hits": [], "latency_ms_retriever": latency_ms_retriever},
703
  "output": {"final_answer": final, "used_sentences": []},
704
- "latency_ms_total": int((time.time()-t0_total)*1000),
705
- "openai": None
706
  }
707
  _safe_write_jsonl(LOG_PATH, record)
708
  return final
709
 
710
  # Select sentences
711
  selected = mmr_select_sentences(
712
- question, hits,
 
713
  top_n=int(n_sentences),
714
  pool_per_chunk=6,
715
- lambda_div=0.7
716
  )
717
-
718
- # APA-style header citations: e.g., "Omar et al., 2021, p. 5; Zhang et al., 2019, p. 12"
719
  header_cites = "; ".join(
720
  format_citation(
721
  Path(r["doc_path"]).name,
722
- _extract_page(r["text"])
723
  )
724
  for _, r in hits.head(6).iterrows()
725
  )
726
  srcs = {Path(r["doc_path"]).name for _, r in hits.iterrows()}
727
- coverage_note = "" if len(srcs) >= 3 else (
728
- f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. "
729
- f"Add more PDFs or increase Top-K."
730
  )
731
 
732
  # Prepare retrieval list for logging
733
  retr_list = []
734
  for _, r in hits.iterrows():
735
- retr_list.append({
736
- "doc": Path(r["doc_path"]).name,
737
- "page": _extract_page(r["text"]),
738
- "score_tfidf": float(r.get("score_tfidf", 0.0)),
739
- "score_bm25": float(r.get("score_bm25", 0.0)),
740
- "score_dense": float(r.get("score_dense", 0.0)),
741
- "combo_score": float(r.get("score", 0.0)),
742
- })
 
 
743
 
744
  # Strict quotes only (no LLM)
745
  if strict_quotes_only:
746
  if not selected:
747
  final = (
748
- f"**Quoted Passages:**\n\n---\n"
749
- + "\n\n".join(hits['text'].tolist()[:2])
750
  + f"\n\n**Citations:** {header_cites}{coverage_note}"
751
  )
752
  else:
753
- final = (
754
- "**Quoted Passages:**\n- "
755
- + "\n- ".join(
756
- f"{s['sent']} ({format_citation(s['doc'], s['page'])})"
757
- for s in selected
758
- )
759
- )
760
  final += f"\n\n**Citations:** {header_cites}{coverage_note}"
761
  if include_passages:
762
  final += (
763
- "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
764
  )
765
 
766
  record = {
767
  "run_id": run_id,
768
- "ts": int(time.time()*1000),
769
  "inputs": {
770
- "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
771
- "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
772
- "use_llm": False, "model": None, "temperature": float(temperature)
 
 
 
 
 
 
773
  },
774
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
775
  "output": {
@@ -777,10 +936,10 @@ def rag_reply(
777
  "used_sentences": [
778
  {"sent": s["sent"], "doc": s["doc"], "page": s["page"]}
779
  for s in selected
780
- ]
781
  },
782
- "latency_ms_total": int((time.time()-t0_total)*1000),
783
- "openai": None
784
  }
785
  _safe_write_jsonl(LOG_PATH, record)
786
  return final
@@ -790,15 +949,14 @@ def rag_reply(
790
  llm_usage = None
791
  llm_latency_ms = None
792
  if use_llm and selected:
793
- # Pass APA-style citations into the LLM so it preserves them
794
  lines = [
795
  f"{s['sent']} ({format_citation(s['doc'], s['page'])})"
796
  for s in selected
797
  ]
798
  t0_llm = time.time()
799
  llm_text, llm_usage = synthesize_with_llm(
800
- question, lines,
801
- model=model, temperature=temperature
802
  )
803
  t1_llm = time.time()
804
  llm_latency_ms = int((t1_llm - t0_llm) * 1000)
@@ -810,14 +968,14 @@ def rag_reply(
810
  )
811
  if include_passages:
812
  final += (
813
- "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
814
  )
815
  else:
816
  if not extractive:
817
  final = (
818
- f"**Answer:** Here are relevant passages.\n\n"
819
  f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
820
- + "\n\n".join(hits['text'].tolist()[:2])
821
  )
822
  else:
823
  final = (
@@ -826,14 +984,14 @@ def rag_reply(
826
  )
827
  if include_passages:
828
  final += (
829
- "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
830
  )
831
  else:
832
  if not extractive:
833
  final = (
834
- f"**Answer:** Here are relevant passages.\n\n"
835
  f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
836
- + "\n\n".join(hits['text'].tolist()[:2])
837
  )
838
  else:
839
  final = (
@@ -842,47 +1000,71 @@ def rag_reply(
842
  )
843
  if include_passages:
844
  final += (
845
- "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
846
  )
847
 
848
  # --------- Log full run ---------
849
  prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
850
- completion_toks = llm_usage.get("completion_tokens") if llm_usage else None
 
 
851
  cost_usd = _calc_cost_usd(prompt_toks, completion_toks)
852
 
853
  total_ms = int((time.time() - t0_total) * 1000)
854
  record = {
855
  "run_id": run_id,
856
- "ts": int(time.time()*1000),
857
  "inputs": {
858
- "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
859
- "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
860
- "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
 
 
 
 
 
 
861
  },
862
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
863
  "output": {
864
  "final_answer": final,
865
  "used_sentences": [
866
- {"sent": s['sent'], "doc": s['doc'], "page": s['page']}
867
  for s in selected
868
- ]
869
  },
870
  "latency_ms_total": total_ms,
871
  "latency_ms_llm": llm_latency_ms,
872
  "openai": {
873
  "prompt_tokens": prompt_toks,
874
  "completion_tokens": completion_toks,
875
- "cost_usd": cost_usd
876
- } if use_llm else None
 
 
877
  }
878
  _safe_write_jsonl(LOG_PATH, record)
879
  return final
880
 
881
- def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
882
- use_llm, model_name, temperature, strict_quotes_only,
883
- w_tfidf, w_bm25, w_emb):
 
 
 
 
 
 
 
 
 
 
 
 
884
  if not message or not message.strip():
885
- return "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
 
 
886
  try:
887
  return rag_reply(
888
  question=message,
@@ -900,6 +1082,7 @@ def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
900
  except Exception as e:
901
  return f"RAG error: {e}"
902
 
 
903
  # ========================= UI (science-oriented styling) =========================
904
  CSS = """
905
  /* Science-oriented: crisp contrast + readable numerics */
@@ -958,11 +1141,11 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !i
958
  color: #dbeafe !important;
959
  }
960
  #eval-tab input[type="range"] { accent-color: #22c55e !important; }
961
- #eval-tab button {
962
- border-radius: 10px !important;
963
- font-weight: 700 !important;
964
- background: #0ea5e9 !important;
965
- color: #001321 !important;
966
  border: 1px solid #7dd3fc !important;
967
  }
968
  #eval-tab .gr-json, #eval-tab .markdown pre, #eval-tab .markdown code {
@@ -988,27 +1171,27 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !i
988
  }
989
 
990
  /* Evaluate tab: enforce dark-blue text for labels/marks */
991
- #eval-tab .label,
992
- #eval-tab label,
993
  #eval-tab .gr-slider .label,
994
  #eval-tab .wrap .label,
995
- #eval-tab .prose,
996
- #eval-tab .markdown,
997
- #eval-tab p,
998
  #eval-tab span {
999
  color: #cfe6ff !important; /* softer than pure white */
1000
  }
1001
 
1002
  /* Target the specific k-slider label strongly */
1003
- #k-slider .label,
1004
- #k-slider label,
1005
  #k-slider .wrap .label {
1006
  color: #cfe6ff !important;
1007
  text-shadow: 0 1px 0 rgba(0,0,0,0.35);
1008
  }
1009
 
1010
  /* Slider track/thumb (dark blue gradient + blue thumb) */
1011
- #eval-tab input[type="range"] {
1012
  accent-color: #3b82f6 !important; /* fallback */
1013
  }
1014
 
@@ -1103,7 +1286,7 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !i
1103
  }
1104
 
1105
  /* When Evaluate tab is active and JS has added .eval-active, bump contrast subtly */
1106
- #eval-tab.eval-active .block,
1107
  #eval-tab.eval-active .group {
1108
  border-color: #60a5fa !important;
1109
  }
@@ -1114,7 +1297,7 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !i
1114
 
1115
  theme = gr.themes.Soft(
1116
  primary_hue="blue",
1117
- neutral_hue="green"
1118
  ).set(
1119
  body_background_fill="#0b1020",
1120
  body_text_color="#e0f2fe",
@@ -1128,7 +1311,8 @@ theme = gr.themes.Soft(
1128
 
1129
  with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1130
  # Optional: JS to toggle .eval-active when Evaluate tab selected
1131
- gr.HTML("""
 
1132
  <script>
1133
  (function(){
1134
  const applyEvalActive = () => {
@@ -1150,14 +1334,14 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1150
  setTimeout(applyEvalActive, 300);
1151
  })();
1152
  </script>
1153
- """)
1154
-
 
1155
  gr.Markdown(
1156
  "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
1157
  "<p style='opacity:.9'>"
1158
  "Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
1159
- "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection. "
1160
- "Answers use APA-style inline citations (e.g., Omar et al., 2021, p. X)."
1161
  "</p>"
1162
  )
1163
 
@@ -1166,38 +1350,108 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1166
  with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
1167
  with gr.Row():
1168
  with gr.Column(scale=7):
1169
- with gr.Accordion("Primary conductive filler", open=True, elem_classes=["card"]):
1170
- f1_type = gr.Textbox(label="Filler 1 Type *", placeholder="e.g., CNT, Graphite, Steel fiber")
1171
- f1_diam = gr.Number(label="Filler 1 Diameter (µm) *")
1172
- f1_len = gr.Number(label="Filler 1 Length (mm) *")
1173
- cf_conc = gr.Number(label=f"{CF_COL} *", info="Weight percent of total binder")
1174
- f1_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 1 Dimensionality *")
1175
-
1176
- with gr.Accordion("Secondary filler (optional)", open=False, elem_classes=["card"]):
1177
- f2_type = gr.Textbox(label="Filler 2 Type", placeholder="Optional")
1178
- f2_diam = gr.Number(label="Filler 2 Diameter (µm)")
1179
- f2_len = gr.Number(label="Filler 2 Length (mm)")
1180
- f2_dim = gr.Dropdown(DIM_CHOICES, value=CANON_NA, label="Filler 2 Dimensionality")
1181
-
1182
- with gr.Accordion("Mix design & specimen", open=False, elem_classes=["card"]):
1183
- spec_vol = gr.Number(label="Specimen Volume (mm3) *")
1184
- probe_cnt = gr.Number(label="Probe Count *")
1185
- probe_mat = gr.Textbox(label="Probe Material *", placeholder="e.g., Copper, Silver paste")
1186
- wb = gr.Number(label="W/B *")
1187
- sb = gr.Number(label="S/B *")
1188
- gauge_len = gr.Number(label="Gauge Length (mm) *")
1189
- curing = gr.Textbox(label="Curing Condition *", placeholder="e.g., 28d water, 20°C")
1190
- n_fillers = gr.Number(label="Number of Fillers *")
1191
-
1192
- with gr.Accordion("Processing", open=False, elem_classes=["card"]):
1193
- dry_temp = gr.Number(label="Drying Temperature (°C)")
1194
- dry_hrs = gr.Number(label="Drying Duration (hr)")
1195
-
1196
- with gr.Accordion("Mechanical & electrical loading", open=False, elem_classes=["card"]):
1197
- load_rate = gr.Number(label="Loading Rate (MPa/s)")
1198
- E_mod = gr.Number(label="Modulus of Elasticity (GPa) *")
1199
- current = gr.Dropdown(CURRENT_CHOICES, value=CANON_NA, label="Current Type")
1200
- voltage = gr.Number(label="Applied Voltage (V)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1201
 
1202
  with gr.Column(scale=5):
1203
  with gr.Group(elem_classes=["card"]):
@@ -1205,32 +1459,54 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1205
  label="Predicted Stress GF (MPa-1)",
1206
  value=0.0,
1207
  precision=6,
1208
- elem_id="pred-out"
1209
  )
1210
  gr.Markdown(f"<small>{MODEL_STATUS}</small>")
1211
  with gr.Row():
1212
- btn_pred = gr.Button("Predict", variant="primary")
 
 
1213
  btn_clear = gr.Button("Clear")
1214
- btn_demo = gr.Button("Fill Example")
1215
 
1216
- with gr.Accordion("About this model", open=False, elem_classes=["card"]):
 
 
 
 
1217
  gr.Markdown(
1218
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
1219
- "- Target: Stress GF (MPa<sup>-1</sup>) on original scale "
1220
- "(model may train on log1p; saved flag used at inference).\n"
1221
  "- Missing values are safely imputed per-feature.\n"
1222
  "- Trained columns:\n"
1223
  f" `{', '.join(MAIN_VARIABLES)}`",
1224
- elem_classes=["prose"]
1225
  )
1226
 
1227
  inputs_in_order = [
1228
- f1_type, f1_diam, f1_len, cf_conc,
1229
- f1_dim, f2_type, f2_diam, f2_len,
1230
- f2_dim, spec_vol, probe_cnt, probe_mat,
1231
- wb, sb, gauge_len, curing, n_fillers,
1232
- dry_temp, dry_hrs, load_rate,
1233
- E_mod, current, voltage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
  ]
1235
 
1236
  def _predict_wrapper(*vals):
@@ -1240,106 +1516,177 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1240
  btn_pred.click(
1241
  _predict_wrapper,
1242
  inputs=inputs_in_order,
1243
- outputs=out_pred
1244
  )
1245
  btn_clear.click(
1246
  lambda: _clear_all(),
1247
  inputs=None,
1248
- outputs=inputs_in_order
1249
- ).then(
1250
- lambda: 0.0,
1251
- outputs=out_pred
1252
- )
1253
  btn_demo.click(
1254
  lambda: _fill_example(),
1255
  inputs=None,
1256
- outputs=inputs_in_order
1257
  )
1258
 
1259
  # ------------------------- Literature Tab -------------------------
1260
- with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
 
 
1261
  pdf_count = len(list(LOCAL_PDF_DIR.glob("**/*.pdf")))
1262
  gr.Markdown(
1263
  f"Using local folder <code>papers/</code> — **{pdf_count} PDF(s)** indexed. "
1264
- "Upload more PDFs and reload the Space to expand coverage. "
1265
- "Answers use APA-like inline citations (e.g., Omar et al., 2021, p. X)."
1266
  )
1267
  with gr.Row():
1268
- top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
1269
- n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
 
 
 
 
 
 
 
 
 
 
 
 
1270
  include_passages = gr.Checkbox(
1271
  value=False,
1272
  label="Include supporting passages",
1273
- interactive=True
1274
  )
1275
 
1276
- with gr.Accordion("Retriever weights (advanced)", open=False):
1277
- w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
1278
- w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
1279
- w_emb = gr.Slider(
1280
- 0.0, 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1281
  value=(0.0 if not USE_DENSE else 0.40),
1282
  step=0.05,
1283
- label="Dense weight (set 0 if disabled)"
1284
  )
1285
 
1286
  # Hidden states (unchanged)
1287
- state_use_llm = gr.State(LLM_AVAILABLE)
1288
- state_model_name = gr.State(os.getenv("OPENAI_MODEL", OPENAI_MODEL))
 
 
1289
  state_temperature = gr.State(0.2)
1290
- state_strict = gr.State(False)
1291
 
1292
  gr.ChatInterface(
1293
  fn=rag_chat_fn,
1294
  additional_inputs=[
1295
- top_k, n_sentences, include_passages,
1296
- state_use_llm, state_model_name, state_temperature, state_strict,
1297
- w_tfidf, w_bm25, w_emb
 
 
 
 
 
 
 
1298
  ],
1299
  title="Literature Q&A",
1300
- description="Hybrid retrieval with diversity. Answers carry APA-style inline citations."
 
 
 
1301
  )
1302
 
1303
- # ====== Evaluate (Gold vs Logs) — darker, higher-contrast ======
1304
- with gr.Tab("📏 Evaluate (Gold vs Logs)", elem_id="eval-tab"):
1305
- gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
 
 
 
 
1306
  with gr.Row():
1307
- gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
1308
- k_slider = gr.Slider(3, 12, value=8, step=1,
1309
- label="k for Hit/Recall/nDCG", elem_id="k-slider")
 
 
 
 
 
 
 
 
 
 
1310
  with gr.Row():
1311
- btn_eval = gr.Button("Compute Metrics", variant="primary")
 
 
1312
  with gr.Row():
1313
- out_perq = gr.File(label="Per-question metrics (CSV)", elem_id="perq-file")
1314
- out_agg = gr.File(label="Aggregate metrics (JSON)", elem_id="agg-file")
1315
- out_json = gr.JSON(label="Aggregate summary", elem_id="agg-json")
1316
- out_log = gr.Markdown(label="Run log", elem_id="eval-log")
 
 
 
 
 
 
 
 
 
 
1317
 
1318
  def _run_eval_inproc(gold_path: str, k: int = 8):
1319
  import json as _json
 
1320
  out_dir = str(ARTIFACT_DIR)
1321
  logs = str(LOG_PATH)
1322
  cmd = [
1323
- sys.executable, "rag_eval_metrics.py",
1324
- "--gold_csv", gold_path,
1325
- "--logs_jsonl", logs,
1326
- "--k", str(k),
1327
- "--out_dir", out_dir
 
 
 
 
 
1328
  ]
1329
  try:
1330
  p = subprocess.run(
1331
  cmd,
1332
  capture_output=True,
1333
  text=True,
1334
- check=False
1335
  )
1336
  stdout = p.stdout or ""
1337
  stderr = p.stderr or ""
1338
  perq = ARTIFACT_DIR / "metrics_per_question.csv"
1339
- agg = ARTIFACT_DIR / "metrics_aggregate.json"
1340
  agg_json = {}
1341
  if agg.exists():
1342
- agg_json = _json.loads(agg.read_text(encoding="utf-8"))
 
 
1343
  report = (
1344
  "```\n"
1345
  + (stdout.strip() or "(no stdout)")
@@ -1348,19 +1695,30 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1348
  )
1349
  return (
1350
  str(perq) if perq.exists() else None,
1351
- str(agg) if agg.exists() else None,
1352
  agg_json,
1353
- report
1354
  )
1355
  except Exception as e:
1356
- return (None, None, {}, f"**Eval error:** {e}")
 
 
 
 
 
1357
 
1358
  def _eval_wrapper(gf, k):
1359
- from pathlib import Path as _Path
 
1360
  if gf is None:
1361
- default_gold = _Path("gold.csv")
1362
  if not default_gold.exists():
1363
- return None, None, {}, "**No gold.csv provided or found in repo root.**"
 
 
 
 
 
1364
  gold_path = str(default_gold)
1365
  else:
1366
  gold_path = gf.name
@@ -1369,23 +1727,14 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1369
  btn_eval.click(
1370
  _eval_wrapper,
1371
  inputs=[gold_file, k_slider],
1372
- outputs=[out_perq, out_agg, out_json, out_log]
1373
  )
1374
 
1375
  # ------------- Launch -------------
1376
  if __name__ == "__main__":
1377
  demo.queue().launch()
1378
-
1379
- import os
1380
- import pandas as pd
1381
-
1382
- # Folder where your RAG files are stored
1383
- folder = "papers" # change if needed
1384
-
1385
- # List all files in the folder
1386
- files = sorted(os.listdir(folder))
1387
-
1388
- # Save them to a CSV file
1389
  pd.DataFrame({"doc": files}).to_csv("paper_list.csv", index=False)
1390
-
1391
- print("✅ Saved paper_list.csv with", len(files), "papers")
 
5
  # - Predictor: safe model caching + safe feature alignment
6
  # - Stable categoricals ("NA"); no over-strict completeness gate
7
  # - Fixed [[PAGE=...]] regex
8
+ # - Lightweight instrumentation (JSONL logs per RAG turn)
9
+ # - Dark-blue theme + Evaluate tab + k-slider styling
10
+ # - APA-style citations via paper_citations.csv
 
11
  # ================================================================
12
 
13
  # ---------------------- Runtime flags (HF-safe) ----------------------
 
17
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
 
19
  # ------------------------------- Imports ------------------------------
20
+ import re
21
+ import joblib
22
+ import warnings
23
+ import json
24
+ import traceback
25
+ import time
26
+ import uuid
27
+ import subprocess
28
+ import sys
29
  from pathlib import Path
30
  from typing import List, Dict, Any, Optional
31
 
 
50
 
51
  # Optional OpenAI (for LLM paraphrase)
52
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
53
+ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
54
  try:
55
  from openai import OpenAI
56
  except Exception:
 
63
  and OpenAI is not None
64
  )
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # ========================= Predictor (kept) =========================
67
+ CF_COL = "Conductive Filler Conc. (wt%)"
68
  TARGET_COL = "Stress GF (MPa-1)"
69
+ CANON_NA = "NA" # canonical placeholder for categoricals
70
 
71
  MAIN_VARIABLES = [
72
  "Filler 1 Type",
 
91
  "Loading Rate (MPa/s)",
92
  "Modulus of Elasticity (GPa)",
93
  "Current Type",
94
+ "Applied Voltage (V)",
95
  ]
96
 
97
  NUMERIC_COLS = {
 
110
  "Drying Duration (hr)",
111
  "Loading Rate (MPa/s)",
112
  "Modulus of Elasticity (GPa)",
113
+ "Applied Voltage (V)",
114
  }
115
 
116
  CATEGORICAL_COLS = {
 
120
  "Filler 2 Dimensionality",
121
  "Probe Material",
122
  "Curing Condition",
123
+ "Current Type",
124
  }
125
 
126
+ DIM_CHOICES = ["0D", "1D", "2D", "3D", CANON_NA]
127
  CURRENT_CHOICES = ["DC", "AC", CANON_NA]
128
 
129
  MODEL_CANDIDATES = [
130
  "stress_gf_xgb.joblib",
131
  "models/stress_gf_xgb.joblib",
132
  "/home/user/app/stress_gf_xgb.joblib",
133
+ os.getenv("MODEL_PATH", ""),
134
  ]
135
 
136
  # ---------- Model caching + status ----------
137
  MODEL = None
138
  MODEL_STATUS = "🔴 Model not loaded"
139
 
140
+
141
  def _try_load_model():
142
  global MODEL, MODEL_STATUS
143
  for p in [x for x in MODEL_CANDIDATES if x]:
 
153
  MODEL = None
154
  if MODEL is None:
155
  MODEL_STATUS = (
156
+ "🔴 Model not found (place stress_gf_xgb.joblib at repo root or models/, "
157
+ "or set MODEL_PATH)"
158
  )
159
  print("[ModelLoad]", MODEL_STATUS)
160
 
161
+
162
  _try_load_model() # load at import time
163
 
164
+
165
  def _canon_cat(v: Any) -> str:
166
  """Stable, canonical category placeholder normalization."""
167
  if v is None:
 
171
  return CANON_NA
172
  return s
173
 
174
+
175
  def _to_float_or_nan(v):
176
  if v in ("", None):
177
  return np.nan
 
180
  except Exception:
181
  return np.nan
182
 
183
+
184
  def _coerce_to_row(form_dict: dict) -> pd.DataFrame:
185
  row = {}
186
  for col in MAIN_VARIABLES:
 
194
  row[col] = s if s else CANON_NA
195
  return pd.DataFrame([row], columns=MAIN_VARIABLES)
196
 
197
+
198
  def _align_columns_to_model(df: pd.DataFrame, mdl) -> pd.DataFrame:
199
  """
200
  SAFE alignment:
 
236
  traceback.print_exc()
237
  return df
238
 
239
+
240
  def predict_fn(**kwargs):
241
  """
242
  Always attempt prediction.
 
261
  traceback.print_exc()
262
  return 0.0
263
 
264
+
265
  EXAMPLE = {
266
  "Filler 1 Type": "CNT",
267
  "Filler 1 Dimensionality": "1D",
 
288
  "Applied Voltage (V)": 5.0,
289
  }
290
 
291
+
292
  def _fill_example():
293
  return [EXAMPLE.get(k, None) for k in MAIN_VARIABLES]
294
 
295
+
296
  def _clear_all():
297
  cleared = []
298
  for col in MAIN_VARIABLES:
 
306
  cleared.append("")
307
  return cleared
308
 
309
+
310
  # ========================= Hybrid RAG =========================
311
+ ARTIFACT_DIR = Path("rag_artifacts")
312
+ ARTIFACT_DIR.mkdir(exist_ok=True)
313
  TFIDF_VECT_PATH = ARTIFACT_DIR / "tfidf_vectorizer.joblib"
314
+ TFIDF_MAT_PATH = ARTIFACT_DIR / "tfidf_matrix.joblib"
315
+ BM25_TOK_PATH = ARTIFACT_DIR / "bm25_tokens.joblib"
316
+ EMB_NPY_PATH = ARTIFACT_DIR / "chunk_embeddings.npy"
317
+ RAG_META_PATH = ARTIFACT_DIR / "chunks.parquet"
318
 
319
+ LOCAL_PDF_DIR = Path("papers")
320
+ LOCAL_PDF_DIR.mkdir(exist_ok=True)
321
  USE_ONLINE_SOURCES = os.getenv("USE_ONLINE_SOURCES", "false").lower() == "true"
322
 
323
  W_TFIDF_DEFAULT = 0.50 if not USE_DENSE else 0.30
324
+ W_BM25_DEFAULT = 0.50 if not USE_DENSE else 0.30
325
+ W_EMB_DEFAULT = 0.00 if USE_DENSE is False else 0.40
326
 
327
  _SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
328
+ TOKEN_RE = re.compile(r"[A-Za-z0-9_#+\-/\.%]+")
329
+
330
+
331
  def sent_split(text: str) -> List[str]:
332
  sents = [s.strip() for s in _SENT_SPLIT_RE.split(text) if s.strip()]
333
  return [s for s in sents if len(s.split()) >= 5]
334
+
335
+
336
  def tokenize(text: str) -> List[str]:
337
  return [t.lower() for t in TOKEN_RE.findall(text)]
338
 
339
+
340
  def _extract_pdf_text(pdf_path: Path) -> str:
341
  try:
342
  import fitz
343
+
344
  doc = fitz.open(pdf_path)
345
  out = []
346
  for i, page in enumerate(doc):
 
349
  except Exception:
350
  try:
351
  from pypdf import PdfReader
352
+
353
  reader = PdfReader(str(pdf_path))
354
  out = []
355
  for i, p in enumerate(reader.pages):
 
360
  print(f"PDF read error ({pdf_path}): {e}")
361
  return ""
362
 
363
+
364
  def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
365
  sents = sent_split(text)
366
  chunks, step = [], max(1, win_size - overlap)
367
  for i in range(0, len(sents), step):
368
+ window = sents[i : i + win_size]
369
  if not window:
370
  break
371
  chunks.append(" ".join(window))
372
  return chunks
373
 
374
+
375
  def _safe_init_st_model(name: str):
376
  global USE_DENSE
377
  if not USE_DENSE:
 
383
  USE_DENSE = False
384
  return None
385
 
386
+
387
  def build_or_load_hybrid(pdf_dir: Path):
388
  # Build or load the hybrid retriever cache
389
  have_cache = (
390
+ TFIDF_VECT_PATH.exists()
391
+ and TFIDF_MAT_PATH.exists()
392
  and RAG_META_PATH.exists()
393
  and (BM25_TOK_PATH.exists() or BM25Okapi is None)
394
  and (EMB_NPY_PATH.exists() or not USE_DENSE)
395
  )
396
  if have_cache:
397
  vectorizer = joblib.load(TFIDF_VECT_PATH)
398
+ X_tfidf = joblib.load(TFIDF_MAT_PATH)
399
+ meta = pd.read_parquet(RAG_META_PATH)
400
+ bm25_toks = joblib.load(BM25_TOK_PATH) if BM25Okapi is not None else None
401
+ emb = (
402
+ np.load(EMB_NPY_PATH)
403
+ if (USE_DENSE and EMB_NPY_PATH.exists())
404
+ else None
405
+ )
406
  return vectorizer, X_tfidf, meta, bm25_toks, emb
407
 
408
  rows, all_tokens = [], []
 
412
  raw = _extract_pdf_text(pdf)
413
  if not raw.strip():
414
  continue
415
+ for i, ch in enumerate(
416
+ chunk_by_sentence_windows(raw, win_size=8, overlap=2)
417
+ ):
418
  rows.append({"doc_path": str(pdf), "chunk_id": i, "text": ch})
419
  all_tokens.append(tokenize(ch))
420
  if not rows:
421
  meta = pd.DataFrame(columns=["doc_path", "chunk_id", "text"])
422
+ vectorizer = None
423
+ X_tfidf = None
424
+ emb = None
425
+ all_tokens = None
426
  return vectorizer, X_tfidf, meta, all_tokens, emb
427
 
428
  meta = pd.DataFrame(rows)
429
  from sklearn.feature_extraction.text import TfidfVectorizer
430
+
431
  vectorizer = TfidfVectorizer(
432
  ngram_range=(1, 2),
433
+ min_df=1,
434
+ max_df=0.95,
435
+ sublinear_tf=True,
436
+ smooth_idf=True,
437
  lowercase=True,
438
+ token_pattern=r"(?u)\b\w[\w\-\./%+#]*\b",
439
  )
440
  X_tfidf = vectorizer.fit_transform(meta["text"].tolist())
441
 
 
443
  if USE_DENSE:
444
  try:
445
  st_model = _safe_init_st_model(
446
+ os.getenv(
447
+ "EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"
448
+ )
449
  )
450
  if st_model is not None:
451
  from sklearn.preprocessing import normalize as sk_normalize
452
+
453
  em = st_model.encode(
454
  meta["text"].tolist(),
455
  batch_size=64,
456
  show_progress_bar=False,
457
+ convert_to_numpy=True,
458
  )
459
  emb = sk_normalize(em)
460
  np.save(EMB_NPY_PATH, emb)
 
469
  meta.to_parquet(RAG_META_PATH, index=False)
470
  return vectorizer, X_tfidf, meta, all_tokens, emb
471
 
472
+
473
+ tfidf_vectorizer, tfidf_matrix, rag_meta, bm25_tokens, emb_matrix = build_or_load_hybrid(
474
+ LOCAL_PDF_DIR
475
+ )
476
+ bm25 = (
477
+ BM25Okapi(bm25_tokens)
478
+ if (BM25Okapi is not None and bm25_tokens is not None)
479
+ else None
480
+ )
481
+ st_query_model = _safe_init_st_model(
482
+ os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
483
+ )
484
+
485
 
486
  def _extract_page(text_chunk: str) -> str:
487
  # Correct: [[PAGE=123]]
488
  m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
489
+ return m[-1].group(1) if m else "?"
490
+
491
 
492
+ def hybrid_search(
493
+ query: str,
494
+ k: int = 8,
495
+ w_tfidf: float = W_TFIDF_DEFAULT,
496
+ w_bm25: float = W_BM25_DEFAULT,
497
+ w_emb: float = W_EMB_DEFAULT,
498
+ ):
499
  if rag_meta is None or rag_meta.empty:
500
  return pd.DataFrame()
501
 
 
503
  if USE_DENSE and st_query_model is not None and emb_matrix is not None and w_emb > 0:
504
  try:
505
  from sklearn.preprocessing import normalize as sk_normalize
506
+
507
  q_emb = st_query_model.encode([query], convert_to_numpy=True)
508
  q_emb = sk_normalize(q_emb)[0]
509
  dense_scores = emb_matrix @ q_emb
510
  except Exception as e:
511
  print("Dense query encoding failed:", e)
512
+ dense_scores = np.zeros(len(rag_meta), dtype=float)
513
+ w_emb = 0.0
514
  else:
515
+ dense_scores = np.zeros(len(rag_meta), dtype=float)
516
+ w_emb = 0.0
517
 
518
  # TF-IDF scores
519
  if tfidf_vectorizer is not None and tfidf_matrix is not None:
520
  q_vec = tfidf_vectorizer.transform([query])
521
  tfidf_scores = (tfidf_matrix @ q_vec.T).toarray().ravel()
522
  else:
523
+ tfidf_scores = np.zeros(len(rag_meta), dtype=float)
524
+ w_tfidf = 0.0
525
 
526
  # BM25 scores
527
  if bm25 is not None:
528
+ q_tokens = [
529
+ t.lower() for t in re.findall(r"[A-Za-z0-9_#+\-\/\.%]+", query)
530
+ ]
531
  bm25_scores = np.array(bm25.get_scores(q_tokens), dtype=float)
532
  else:
533
+ bm25_scores = np.zeros(len(rag_meta), dtype=float)
534
+ w_bm25 = 0.0
535
 
536
  def _norm(x):
537
  x = np.asarray(x, dtype=float)
 
541
 
542
  s_dense = _norm(dense_scores)
543
  s_tfidf = _norm(tfidf_scores)
544
+ s_bm25 = _norm(bm25_scores)
545
 
546
  total_w = (w_tfidf + w_bm25 + w_emb) or 1.0
547
+ w_tfidf, w_bm25, w_emb = (
548
+ w_tfidf / total_w,
549
+ w_bm25 / total_w,
550
+ w_emb / total_w,
551
+ )
552
 
553
  combo = w_emb * s_dense + w_tfidf * s_tfidf + w_bm25 * s_bm25
554
  idx = np.argsort(-combo)[:k]
555
  hits = rag_meta.iloc[idx].copy()
556
  hits["score_dense"] = s_dense[idx]
557
  hits["score_tfidf"] = s_tfidf[idx]
558
+ hits["score_bm25"] = s_bm25[idx]
559
+ hits["score"] = combo[idx]
560
  return hits.reset_index(drop=True)
561
 
562
+
563
  def split_sentences(text: str) -> List[str]:
564
  sents = sent_split(text)
565
  return [s for s in sents if 6 <= len(s.split()) <= 60]
566
 
567
+
568
+ def mmr_select_sentences(
569
+ question: str,
570
+ hits: pd.DataFrame,
571
+ top_n: int = 4,
572
+ pool_per_chunk: int = 6,
573
+ lambda_div: float = 0.7,
574
+ ):
575
  """
576
  Robust MMR sentence picker:
577
  - Handles empty pools
 
581
  # Build pool
582
  pool = []
583
  for _, row in hits.iterrows():
584
+ doc = Path(row["doc_path"]).name
585
  page = _extract_page(row["text"])
586
  sents = split_sentences(row["text"])
587
  if not sents:
588
  continue
589
+ for s in sents[: max(1, int(pool_per_chunk))]:
590
  pool.append({"sent": s, "doc": doc, "page": page})
591
 
592
  if not pool:
 
598
  try:
599
  if use_dense:
600
  from sklearn.preprocessing import normalize as sk_normalize
601
+
602
+ enc = st_query_model.encode(
603
+ [question] + sent_texts, convert_to_numpy=True
604
+ )
605
  q_vec = sk_normalize(enc[:1])[0]
606
+ S = sk_normalize(enc[1:])
607
+ rel = S @ q_vec
608
+
609
+ def sim_fn(i, j):
610
+ return float(S[i] @ S[j])
611
+
612
  else:
613
  from sklearn.feature_extraction.text import TfidfVectorizer
614
+
615
  vect = TfidfVectorizer().fit(sent_texts + [question])
616
+ Q = vect.transform([question])
617
+ S = vect.transform(sent_texts)
618
  rel = (S @ Q.T).toarray().ravel()
619
+
620
  def sim_fn(i, j):
621
+ num = S[i] @ S[j].T
622
+ return (
623
+ float(num.toarray()[0, 0])
624
+ if hasattr(num, "toarray")
625
+ else float(num)
626
+ )
627
+
628
  except Exception:
629
  # Fallback: uniform relevance if vectorization fails
630
  rel = np.ones(len(sent_texts), dtype=float)
631
+
632
+ def sim_fn(i, j):
633
+ return 0.0
634
 
635
  # Normalize lambda_div
636
  lambda_div = float(np.clip(lambda_div, 0.0, 1.0))
 
641
  return []
642
  first = int(np.argmax(rel))
643
  selected_idx = [first]
644
+ selected = [pool[first]]
645
  remain.remove(first)
646
 
647
  # Clamp top_n
 
662
 
663
  return selected
664
 
665
+
666
+ # ========================= APA-style citation helpers =========================
667
+ CITATION_CSV = Path("paper_citations.csv")
668
+
669
+
670
+ def _load_citation_map() -> Dict[str, str]:
671
+ """
672
+ Load a mapping from PDF basename -> APA-style in-text citation
673
+ from paper_citations.csv (columns: doc, citation).
674
+ """
675
+ if not CITATION_CSV.exists():
676
+ print(
677
+ "[Citations] paper_citations.csv not found; "
678
+ "will fall back to raw filenames."
679
+ )
680
+ return {}
681
+ try:
682
+ df = pd.read_csv(CITATION_CSV)
683
+ if not {"doc", "citation"}.issubset(df.columns):
684
+ print(
685
+ "[Citations] paper_citations.csv must have columns: 'doc', 'citation'."
686
+ )
687
+ return {}
688
+ cmap: Dict[str, str] = {}
689
+ for _, row in df.iterrows():
690
+ doc = str(row["doc"]).strip()
691
+ cit = str(row["citation"]).strip()
692
+ if doc and cit:
693
+ cmap[doc] = cit
694
+ print(
695
+ f"[Citations] Loaded {len(cmap)} citation entries from paper_citations.csv"
696
+ )
697
+ return cmap
698
+ except Exception as e:
699
+ print("[Citations] Failed to load paper_citations.csv:", e)
700
+ return {}
701
+
702
+
703
+ CITATION_MAP: Dict[str, str] = _load_citation_map()
704
+
705
+
706
+ def format_citation(doc_name: str, page: Any) -> str:
707
+ """
708
+ Convert a doc filename + page into an APA-like inline citation.
709
+ Fallback: use basename without .pdf if there's no mapping.
710
+ """
711
+ base = Path(doc_name).name
712
+ short = CITATION_MAP.get(base, Path(base).stem)
713
+ page_str = str(page).strip() if page is not None else ""
714
+ if not page_str or page_str == "?":
715
+ return short
716
+ return f"{short}, p. {page_str}"
717
+
718
+
719
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
720
  """
721
  Build an extractive answer with APA-style inline citations.
 
722
  """
723
  if not selected:
724
  return ""
 
728
  parts.append(f"{s['sent']} ({cite})")
729
  return " ".join(parts)
730
 
731
+
732
  # ========================= NEW: Instrumentation helpers =========================
733
  LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
734
+ OPENAI_IN_COST_PER_1K = float(os.getenv("OPENAI_COST_IN_PER_1K", "0"))
735
  OPENAI_OUT_COST_PER_1K = float(os.getenv("OPENAI_COST_OUT_PER_1K", "0"))
736
 
737
+
738
  def _safe_write_jsonl(path: Path, record: dict):
739
  try:
740
  with open(path, "a", encoding="utf-8") as f:
 
742
  except Exception as e:
743
  print("[Log] write failed:", e)
744
 
745
+
746
  def _calc_cost_usd(prompt_toks, completion_toks):
747
  if prompt_toks is None or completion_toks is None:
748
  return None
749
+ return (
750
+ (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K
751
+ + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
752
+ )
753
+
754
 
755
  # ----------------- Modified to return (text, usage_dict) -----------------
756
+ def synthesize_with_llm(
757
+ question: str,
758
+ sentence_lines: List[str],
759
+ model: str = None,
760
+ temperature: float = 0.2,
761
+ ):
762
  if not LLM_AVAILABLE:
763
  return None, None
764
  client = OpenAI(api_key=OPENAI_API_KEY)
 
767
  "You are a scientific assistant for self-sensing cementitious materials.\n"
768
  "Answer STRICTLY using the provided sentences.\n"
769
  "Do not invent facts. Keep it concise (3–6 sentences).\n"
770
+ "Retain inline citations exactly as given."
771
  )
772
  user_prompt = (
773
  f"Question: {question}\n\n"
774
+ f"Use ONLY these sentences to answer; keep their inline citations:\n"
775
+ + "\n".join(f"- {s}" for s in sentence_lines)
776
  )
777
  try:
778
  resp = client.responses.create(
 
788
  try:
789
  u = getattr(resp, "usage", None)
790
  if u:
791
+ pt = (
792
+ getattr(u, "prompt_tokens", None)
793
+ if hasattr(u, "prompt_tokens")
794
+ else u.get("prompt_tokens", None)
795
+ )
796
+ ct = (
797
+ getattr(u, "completion_tokens", None)
798
+ if hasattr(u, "completion_tokens")
799
+ else u.get("completion_tokens", None)
800
+ )
801
  usage = {"prompt_tokens": pt, "completion_tokens": ct}
802
  except Exception:
803
  usage = None
 
805
  except Exception:
806
  return None, None
807
 
808
+
809
  def rag_reply(
810
  question: str,
811
  k: int = 8,
 
816
  temperature: float = 0.2,
817
  strict_quotes_only: bool = False,
818
  w_tfidf: float = W_TFIDF_DEFAULT,
819
+ w_bm25: float = W_BM25_DEFAULT,
820
+ w_emb: float = W_EMB_DEFAULT,
821
  ) -> str:
822
  run_id = str(uuid.uuid4())
823
  t0_total = time.time()
824
+ t0_retr = time.time()
825
 
826
  # --- Retrieval ---
827
  hits = hybrid_search(
828
+ question,
829
+ k=k,
830
+ w_tfidf=w_tfidf,
831
+ w_bm25=w_bm25,
832
+ w_emb=w_emb,
833
  )
834
  t1_retr = time.time()
835
  latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
836
 
837
  if hits is None or hits.empty:
838
+ final = (
839
+ "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
840
+ )
841
  record = {
842
  "run_id": run_id,
843
+ "ts": int(time.time() * 1000),
844
  "inputs": {
845
+ "question": question,
846
+ "top_k": int(k),
847
+ "n_sentences": int(n_sentences),
848
+ "w_tfidf": float(w_tfidf),
849
+ "w_bm25": float(w_bm25),
850
+ "w_emb": float(w_emb),
851
+ "use_llm": bool(use_llm),
852
+ "model": model,
853
+ "temperature": float(temperature),
854
  },
855
  "retrieval": {"hits": [], "latency_ms_retriever": latency_ms_retriever},
856
  "output": {"final_answer": final, "used_sentences": []},
857
+ "latency_ms_total": int((time.time() - t0_total) * 1000),
858
+ "openai": None,
859
  }
860
  _safe_write_jsonl(LOG_PATH, record)
861
  return final
862
 
863
  # Select sentences
864
  selected = mmr_select_sentences(
865
+ question,
866
+ hits,
867
  top_n=int(n_sentences),
868
  pool_per_chunk=6,
869
+ lambda_div=0.7,
870
  )
871
+ # Header citations: APA-style
 
872
  header_cites = "; ".join(
873
  format_citation(
874
  Path(r["doc_path"]).name,
875
+ _extract_page(r["text"]),
876
  )
877
  for _, r in hits.head(6).iterrows()
878
  )
879
  srcs = {Path(r["doc_path"]).name for _, r in hits.iterrows()}
880
+ coverage_note = (
881
+ "" if len(srcs) >= 3 else
882
+ f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
883
  )
884
 
885
  # Prepare retrieval list for logging
886
  retr_list = []
887
  for _, r in hits.iterrows():
888
+ retr_list.append(
889
+ {
890
+ "doc": Path(r["doc_path"]).name,
891
+ "page": _extract_page(r["text"]),
892
+ "score_tfidf": float(r.get("score_tfidf", 0.0)),
893
+ "score_bm25": float(r.get("score_bm25", 0.0)),
894
+ "score_dense": float(r.get("score_dense", 0.0)),
895
+ "combo_score": float(r.get("score", 0.0)),
896
+ }
897
+ )
898
 
899
  # Strict quotes only (no LLM)
900
  if strict_quotes_only:
901
  if not selected:
902
  final = (
903
+ "**Quoted Passages:**\n\n---\n"
904
+ + "\n\n".join(hits["text"].tolist()[:2])
905
  + f"\n\n**Citations:** {header_cites}{coverage_note}"
906
  )
907
  else:
908
+ bullets = []
909
+ for s in selected:
910
+ cite = format_citation(s["doc"], s["page"])
911
+ bullets.append(f"{s['sent']} ({cite})")
912
+ final = "**Quoted Passages:**\n- " + "\n- ".join(bullets)
 
 
913
  final += f"\n\n**Citations:** {header_cites}{coverage_note}"
914
  if include_passages:
915
  final += (
916
+ "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
917
  )
918
 
919
  record = {
920
  "run_id": run_id,
921
+ "ts": int(time.time() * 1000),
922
  "inputs": {
923
+ "question": question,
924
+ "top_k": int(k),
925
+ "n_sentences": int(n_sentences),
926
+ "w_tfidf": float(w_tfidf),
927
+ "w_bm25": float(w_bm25),
928
+ "w_emb": float(w_emb),
929
+ "use_llm": False,
930
+ "model": None,
931
+ "temperature": float(temperature),
932
  },
933
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
934
  "output": {
 
936
  "used_sentences": [
937
  {"sent": s["sent"], "doc": s["doc"], "page": s["page"]}
938
  for s in selected
939
+ ],
940
  },
941
+ "latency_ms_total": int((time.time() - t0_total) * 1000),
942
+ "openai": None,
943
  }
944
  _safe_write_jsonl(LOG_PATH, record)
945
  return final
 
949
  llm_usage = None
950
  llm_latency_ms = None
951
  if use_llm and selected:
952
+ # Pass APA-cited sentences into LLM
953
  lines = [
954
  f"{s['sent']} ({format_citation(s['doc'], s['page'])})"
955
  for s in selected
956
  ]
957
  t0_llm = time.time()
958
  llm_text, llm_usage = synthesize_with_llm(
959
+ question, lines, model=model, temperature=temperature
 
960
  )
961
  t1_llm = time.time()
962
  llm_latency_ms = int((t1_llm - t0_llm) * 1000)
 
968
  )
969
  if include_passages:
970
  final += (
971
+ "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
972
  )
973
  else:
974
  if not extractive:
975
  final = (
976
+ "**Answer:** Here are relevant passages.\n\n"
977
  f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
978
+ + "\n\n".join(hits["text"].tolist()[:2])
979
  )
980
  else:
981
  final = (
 
984
  )
985
  if include_passages:
986
  final += (
987
+ "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
988
  )
989
  else:
990
  if not extractive:
991
  final = (
992
+ "**Answer:** Here are relevant passages.\n\n"
993
  f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
994
+ + "\n\n".join(hits["text"].tolist()[:2])
995
  )
996
  else:
997
  final = (
 
1000
  )
1001
  if include_passages:
1002
  final += (
1003
+ "\n\n---\n" + "\n\n".join(hits["text"].tolist()[:2])
1004
  )
1005
 
1006
  # --------- Log full run ---------
1007
  prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
1008
+ completion_toks = (
1009
+ llm_usage.get("completion_tokens") if llm_usage else None
1010
+ )
1011
  cost_usd = _calc_cost_usd(prompt_toks, completion_toks)
1012
 
1013
  total_ms = int((time.time() - t0_total) * 1000)
1014
  record = {
1015
  "run_id": run_id,
1016
+ "ts": int(time.time() * 1000),
1017
  "inputs": {
1018
+ "question": question,
1019
+ "top_k": int(k),
1020
+ "n_sentences": int(n_sentences),
1021
+ "w_tfidf": float(w_tfidf),
1022
+ "w_bm25": float(w_bm25),
1023
+ "w_emb": float(w_emb),
1024
+ "use_llm": bool(use_llm),
1025
+ "model": model,
1026
+ "temperature": float(temperature),
1027
  },
1028
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
1029
  "output": {
1030
  "final_answer": final,
1031
  "used_sentences": [
1032
+ {"sent": s["sent"], "doc": s["doc"], "page": s["page"]}
1033
  for s in selected
1034
+ ],
1035
  },
1036
  "latency_ms_total": total_ms,
1037
  "latency_ms_llm": llm_latency_ms,
1038
  "openai": {
1039
  "prompt_tokens": prompt_toks,
1040
  "completion_tokens": completion_toks,
1041
+ "cost_usd": cost_usd,
1042
+ }
1043
+ if use_llm
1044
+ else None,
1045
  }
1046
  _safe_write_jsonl(LOG_PATH, record)
1047
  return final
1048
 
1049
+
1050
+ def rag_chat_fn(
1051
+ message,
1052
+ history,
1053
+ top_k,
1054
+ n_sentences,
1055
+ include_passages,
1056
+ use_llm,
1057
+ model_name,
1058
+ temperature,
1059
+ strict_quotes_only,
1060
+ w_tfidf,
1061
+ w_bm25,
1062
+ w_emb,
1063
+ ):
1064
  if not message or not message.strip():
1065
+ return (
1066
+ "Ask a literature question (e.g., *How does CNT length affect gauge factor?*)"
1067
+ )
1068
  try:
1069
  return rag_reply(
1070
  question=message,
 
1082
  except Exception as e:
1083
  return f"RAG error: {e}"
1084
 
1085
+
1086
  # ========================= UI (science-oriented styling) =========================
1087
  CSS = """
1088
  /* Science-oriented: crisp contrast + readable numerics */
 
1141
  color: #dbeafe !important;
1142
  }
1143
  #eval-tab input[type="range"] { accent-color: #22c55e !important; }
1144
+ #eval-tab button {
1145
+ border-radius: 10px !important;
1146
+ font-weight: 700 !important;
1147
+ background: #0ea5e9 !important;
1148
+ color: #001321 !important;
1149
  border: 1px solid #7dd3fc !important;
1150
  }
1151
  #eval-tab .gr-json, #eval-tab .markdown pre, #eval-tab .markdown code {
 
1171
  }
1172
 
1173
  /* Evaluate tab: enforce dark-blue text for labels/marks */
1174
+ #eval-tab .label,
1175
+ #eval-tab label,
1176
  #eval-tab .gr-slider .label,
1177
  #eval-tab .wrap .label,
1178
+ #eval-tab .prose,
1179
+ #eval-tab .markdown,
1180
+ #eval-tab p,
1181
  #eval-tab span {
1182
  color: #cfe6ff !important; /* softer than pure white */
1183
  }
1184
 
1185
  /* Target the specific k-slider label strongly */
1186
+ #k-slider .label,
1187
+ #k-slider label,
1188
  #k-slider .wrap .label {
1189
  color: #cfe6ff !important;
1190
  text-shadow: 0 1px 0 rgba(0,0,0,0.35);
1191
  }
1192
 
1193
  /* Slider track/thumb (dark blue gradient + blue thumb) */
1194
+ #eval-tab input[type="range"] {
1195
  accent-color: #3b82f6 !important; /* fallback */
1196
  }
1197
 
 
1286
  }
1287
 
1288
  /* When Evaluate tab is active and JS has added .eval-active, bump contrast subtly */
1289
+ #eval-tab.eval-active .block,
1290
  #eval-tab.eval-active .group {
1291
  border-color: #60a5fa !important;
1292
  }
 
1297
 
1298
  theme = gr.themes.Soft(
1299
  primary_hue="blue",
1300
+ neutral_hue="green",
1301
  ).set(
1302
  body_background_fill="#0b1020",
1303
  body_text_color="#e0f2fe",
 
1311
 
1312
  with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1313
  # Optional: JS to toggle .eval-active when Evaluate tab selected
1314
+ gr.HTML(
1315
+ """
1316
  <script>
1317
  (function(){
1318
  const applyEvalActive = () => {
 
1334
  setTimeout(applyEvalActive, 300);
1335
  })();
1336
  </script>
1337
+ """
1338
+ )
1339
+
1340
  gr.Markdown(
1341
  "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
1342
  "<p style='opacity:.9'>"
1343
  "Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
1344
+ "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection."
 
1345
  "</p>"
1346
  )
1347
 
 
1350
  with gr.Tab("🔮 Predict Gauge Factor (XGB)"):
1351
  with gr.Row():
1352
  with gr.Column(scale=7):
1353
+ with gr.Accordion(
1354
+ "Primary conductive filler",
1355
+ open=True,
1356
+ elem_classes=["card"],
1357
+ ):
1358
+ f1_type = gr.Textbox(
1359
+ label="Filler 1 Type *",
1360
+ placeholder="e.g., CNT, Graphite, Steel fiber",
1361
+ )
1362
+ f1_diam = gr.Number(
1363
+ label="Filler 1 Diameter (µm) *"
1364
+ )
1365
+ f1_len = gr.Number(
1366
+ label="Filler 1 Length (mm) *"
1367
+ )
1368
+ cf_conc = gr.Number(
1369
+ label=f"{CF_COL} *",
1370
+ info="Weight percent of total binder",
1371
+ )
1372
+ f1_dim = gr.Dropdown(
1373
+ DIM_CHOICES,
1374
+ value=CANON_NA,
1375
+ label="Filler 1 Dimensionality *",
1376
+ )
1377
+
1378
+ with gr.Accordion(
1379
+ "Secondary filler (optional)",
1380
+ open=False,
1381
+ elem_classes=["card"],
1382
+ ):
1383
+ f2_type = gr.Textbox(
1384
+ label="Filler 2 Type", placeholder="Optional"
1385
+ )
1386
+ f2_diam = gr.Number(
1387
+ label="Filler 2 Diameter (µm)"
1388
+ )
1389
+ f2_len = gr.Number(
1390
+ label="Filler 2 Length (mm)"
1391
+ )
1392
+ f2_dim = gr.Dropdown(
1393
+ DIM_CHOICES,
1394
+ value=CANON_NA,
1395
+ label="Filler 2 Dimensionality",
1396
+ )
1397
+
1398
+ with gr.Accordion(
1399
+ "Mix design & specimen",
1400
+ open=False,
1401
+ elem_classes=["card"],
1402
+ ):
1403
+ spec_vol = gr.Number(
1404
+ label="Specimen Volume (mm3) *"
1405
+ )
1406
+ probe_cnt = gr.Number(
1407
+ label="Probe Count *"
1408
+ )
1409
+ probe_mat = gr.Textbox(
1410
+ label="Probe Material *",
1411
+ placeholder="e.g., Copper, Silver paste",
1412
+ )
1413
+ wb = gr.Number(label="W/B *")
1414
+ sb = gr.Number(label="S/B *")
1415
+ gauge_len = gr.Number(
1416
+ label="Gauge Length (mm) *"
1417
+ )
1418
+ curing = gr.Textbox(
1419
+ label="Curing Condition *",
1420
+ placeholder="e.g., 28d water, 20°C",
1421
+ )
1422
+ n_fillers = gr.Number(
1423
+ label="Number of Fillers *"
1424
+ )
1425
+
1426
+ with gr.Accordion(
1427
+ "Processing", open=False, elem_classes=["card"]
1428
+ ):
1429
+ dry_temp = gr.Number(
1430
+ label="Drying Temperature (°C)"
1431
+ )
1432
+ dry_hrs = gr.Number(
1433
+ label="Drying Duration (hr)"
1434
+ )
1435
+
1436
+ with gr.Accordion(
1437
+ "Mechanical & electrical loading",
1438
+ open=False,
1439
+ elem_classes=["card"],
1440
+ ):
1441
+ load_rate = gr.Number(
1442
+ label="Loading Rate (MPa/s)"
1443
+ )
1444
+ E_mod = gr.Number(
1445
+ label="Modulus of Elasticity (GPa) *"
1446
+ )
1447
+ current = gr.Dropdown(
1448
+ CURRENT_CHOICES,
1449
+ value=CANON_NA,
1450
+ label="Current Type",
1451
+ )
1452
+ voltage = gr.Number(
1453
+ label="Applied Voltage (V)"
1454
+ )
1455
 
1456
  with gr.Column(scale=5):
1457
  with gr.Group(elem_classes=["card"]):
 
1459
  label="Predicted Stress GF (MPa-1)",
1460
  value=0.0,
1461
  precision=6,
1462
+ elem_id="pred-out",
1463
  )
1464
  gr.Markdown(f"<small>{MODEL_STATUS}</small>")
1465
  with gr.Row():
1466
+ btn_pred = gr.Button(
1467
+ "Predict", variant="primary"
1468
+ )
1469
  btn_clear = gr.Button("Clear")
1470
+ btn_demo = gr.Button("Fill Example")
1471
 
1472
+ with gr.Accordion(
1473
+ "About this model",
1474
+ open=False,
1475
+ elem_classes=["card"],
1476
+ ):
1477
  gr.Markdown(
1478
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
1479
+ "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model may train on log1p; saved flag used at inference).\n"
 
1480
  "- Missing values are safely imputed per-feature.\n"
1481
  "- Trained columns:\n"
1482
  f" `{', '.join(MAIN_VARIABLES)}`",
1483
+ elem_classes=["prose"],
1484
  )
1485
 
1486
  inputs_in_order = [
1487
+ f1_type,
1488
+ f1_diam,
1489
+ f1_len,
1490
+ cf_conc,
1491
+ f1_dim,
1492
+ f2_type,
1493
+ f2_diam,
1494
+ f2_len,
1495
+ f2_dim,
1496
+ spec_vol,
1497
+ probe_cnt,
1498
+ probe_mat,
1499
+ wb,
1500
+ sb,
1501
+ gauge_len,
1502
+ curing,
1503
+ n_fillers,
1504
+ dry_temp,
1505
+ dry_hrs,
1506
+ load_rate,
1507
+ E_mod,
1508
+ current,
1509
+ voltage,
1510
  ]
1511
 
1512
  def _predict_wrapper(*vals):
 
1516
  btn_pred.click(
1517
  _predict_wrapper,
1518
  inputs=inputs_in_order,
1519
+ outputs=out_pred,
1520
  )
1521
  btn_clear.click(
1522
  lambda: _clear_all(),
1523
  inputs=None,
1524
+ outputs=inputs_in_order,
1525
+ ).then(lambda: 0.0, outputs=out_pred)
 
 
 
1526
  btn_demo.click(
1527
  lambda: _fill_example(),
1528
  inputs=None,
1529
+ outputs=inputs_in_order,
1530
  )
1531
 
1532
  # ------------------------- Literature Tab -------------------------
1533
+ with gr.Tab(
1534
+ "📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"
1535
+ ):
1536
  pdf_count = len(list(LOCAL_PDF_DIR.glob("**/*.pdf")))
1537
  gr.Markdown(
1538
  f"Using local folder <code>papers/</code> — **{pdf_count} PDF(s)** indexed. "
1539
+ "Upload more PDFs and reload the Space to expand coverage. Answers use APA-style inline citations."
 
1540
  )
1541
  with gr.Row():
1542
+ top_k = gr.Slider(
1543
+ 5,
1544
+ 12,
1545
+ value=8,
1546
+ step=1,
1547
+ label="Top-K chunks",
1548
+ )
1549
+ n_sentences = gr.Slider(
1550
+ 2,
1551
+ 6,
1552
+ value=4,
1553
+ step=1,
1554
+ label="Answer length (sentences)",
1555
+ )
1556
  include_passages = gr.Checkbox(
1557
  value=False,
1558
  label="Include supporting passages",
1559
+ interactive=True,
1560
  )
1561
 
1562
+ with gr.Accordion(
1563
+ "Retriever weights (advanced)", open=False
1564
+ ):
1565
+ w_tfidf = gr.Slider(
1566
+ 0.0,
1567
+ 1.0,
1568
+ value=W_TFIDF_DEFAULT,
1569
+ step=0.05,
1570
+ label="TF-IDF weight",
1571
+ )
1572
+ w_bm25 = gr.Slider(
1573
+ 0.0,
1574
+ 1.0,
1575
+ value=W_BM25_DEFAULT,
1576
+ step=0.05,
1577
+ label="BM25 weight",
1578
+ )
1579
+ w_emb = gr.Slider(
1580
+ 0.0,
1581
+ 1.0,
1582
  value=(0.0 if not USE_DENSE else 0.40),
1583
  step=0.05,
1584
+ label="Dense weight (set 0 if disabled)",
1585
  )
1586
 
1587
  # Hidden states (unchanged)
1588
+ state_use_llm = gr.State(LLM_AVAILABLE)
1589
+ state_model_name = gr.State(
1590
+ os.getenv("OPENAI_MODEL", OPENAI_MODEL)
1591
+ )
1592
  state_temperature = gr.State(0.2)
1593
+ state_strict = gr.State(False)
1594
 
1595
  gr.ChatInterface(
1596
  fn=rag_chat_fn,
1597
  additional_inputs=[
1598
+ top_k,
1599
+ n_sentences,
1600
+ include_passages,
1601
+ state_use_llm,
1602
+ state_model_name,
1603
+ state_temperature,
1604
+ state_strict,
1605
+ w_tfidf,
1606
+ w_bm25,
1607
+ w_emb,
1608
  ],
1609
  title="Literature Q&A",
1610
+ description=(
1611
+ "Hybrid retrieval with diversity. Answers carry APA-style inline citations "
1612
+ "derived from paper_citations.csv."
1613
+ ),
1614
  )
1615
 
1616
+ # ====== Evaluate (Gold vs Logs) ======
1617
+ with gr.Tab(
1618
+ "📏 Evaluate (Gold vs Logs)", elem_id="eval-tab"
1619
+ ):
1620
+ gr.Markdown(
1621
+ "Upload your **gold.csv** and compute retrieval metrics against the app logs."
1622
+ )
1623
  with gr.Row():
1624
+ gold_file = gr.File(
1625
+ label="gold.csv",
1626
+ file_types=[".csv"],
1627
+ interactive=True,
1628
+ )
1629
+ k_slider = gr.Slider(
1630
+ 3,
1631
+ 12,
1632
+ value=8,
1633
+ step=1,
1634
+ label="k for Hit/Recall/nDCG",
1635
+ elem_id="k-slider",
1636
+ )
1637
  with gr.Row():
1638
+ btn_eval = gr.Button(
1639
+ "Compute Metrics", variant="primary"
1640
+ )
1641
  with gr.Row():
1642
+ out_perq = gr.File(
1643
+ label="Per-question metrics (CSV)",
1644
+ elem_id="perq-file",
1645
+ )
1646
+ out_agg = gr.File(
1647
+ label="Aggregate metrics (JSON)",
1648
+ elem_id="agg-file",
1649
+ )
1650
+ out_json = gr.JSON(
1651
+ label="Aggregate summary", elem_id="agg-json"
1652
+ )
1653
+ out_log = gr.Markdown(
1654
+ label="Run log", elem_id="eval-log"
1655
+ )
1656
 
1657
  def _run_eval_inproc(gold_path: str, k: int = 8):
1658
  import json as _json
1659
+
1660
  out_dir = str(ARTIFACT_DIR)
1661
  logs = str(LOG_PATH)
1662
  cmd = [
1663
+ sys.executable,
1664
+ "rag_eval_metrics.py",
1665
+ "--gold_csv",
1666
+ gold_path,
1667
+ "--logs_jsonl",
1668
+ logs,
1669
+ "--k",
1670
+ str(k),
1671
+ "--out_dir",
1672
+ out_dir,
1673
  ]
1674
  try:
1675
  p = subprocess.run(
1676
  cmd,
1677
  capture_output=True,
1678
  text=True,
1679
+ check=False,
1680
  )
1681
  stdout = p.stdout or ""
1682
  stderr = p.stderr or ""
1683
  perq = ARTIFACT_DIR / "metrics_per_question.csv"
1684
+ agg = ARTIFACT_DIR / "metrics_aggregate.json"
1685
  agg_json = {}
1686
  if agg.exists():
1687
+ agg_json = _json.loads(
1688
+ agg.read_text(encoding="utf-8")
1689
+ )
1690
  report = (
1691
  "```\n"
1692
  + (stdout.strip() or "(no stdout)")
 
1695
  )
1696
  return (
1697
  str(perq) if perq.exists() else None,
1698
+ str(agg) if agg.exists() else None,
1699
  agg_json,
1700
+ report,
1701
  )
1702
  except Exception as e:
1703
+ return (
1704
+ None,
1705
+ None,
1706
+ {},
1707
+ f"**Eval error:** {e}",
1708
+ )
1709
 
1710
  def _eval_wrapper(gf, k):
1711
+ from pathlib import Path as _P
1712
+
1713
  if gf is None:
1714
+ default_gold = _P("gold.csv")
1715
  if not default_gold.exists():
1716
+ return (
1717
+ None,
1718
+ None,
1719
+ {},
1720
+ "**No gold.csv provided or found in repo root.**",
1721
+ )
1722
  gold_path = str(default_gold)
1723
  else:
1724
  gold_path = gf.name
 
1727
  btn_eval.click(
1728
  _eval_wrapper,
1729
  inputs=[gold_file, k_slider],
1730
+ outputs=[out_perq, out_agg, out_json, out_log],
1731
  )
1732
 
1733
  # ------------- Launch -------------
1734
  if __name__ == "__main__":
1735
  demo.queue().launch()
1736
+ # Build/update paper_list.csv for convenience
1737
+ folder = "papers"
1738
+ files = sorted(os.listdir(folder)) if os.path.isdir(folder) else []
 
 
 
 
 
 
 
 
1739
  pd.DataFrame({"doc": files}).to_csv("paper_list.csv", index=False)
1740
+ print(f"✅ Saved paper_list.csv with {len(files)} papers")