Inframat-x committed on
Commit
5d07d24
·
verified ·
1 Parent(s): 1697774

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +236 -66
app.py CHANGED
@@ -8,7 +8,7 @@
8
  # - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
9
  # - UPDATED THEME: Dark-blue tabs + Evaluate tab + k-slider styling
10
  # - PATCH: Per-question/aggregate File + JSON outputs now dark-themed via elem_id hooks
11
- # - OPTIONAL JS: Adds .eval-active class when Evaluate tab is selected
12
  # ================================================================
13
 
14
  # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -50,7 +50,37 @@ except Exception:
50
  OpenAI = None
51
 
52
  # LLM availability flag — used internally; UI remains hidden
53
- LLM_AVAILABLE = (OPENAI_API_KEY is not None and OPENAI_API_KEY.strip() != "" and OpenAI is not None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  # ========================= Predictor (kept) =========================
56
  CF_COL = "Conductive Filler Conc. (wt%)"
@@ -140,7 +170,9 @@ def _try_load_model():
140
  traceback.print_exc()
141
  MODEL = None
142
  if MODEL is None:
143
- MODEL_STATUS = "🔴 Model not found (place stress_gf_xgb.joblib at repo root or models/, or set MODEL_PATH)"
 
 
144
  print("[ModelLoad]", MODEL_STATUS)
145
 
146
  _try_load_model() # load at import time
@@ -331,7 +363,8 @@ def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
331
  chunks, step = [], max(1, win_size - overlap)
332
  for i in range(0, len(sents), step):
333
  window = sents[i:i+win_size]
334
- if not window: break
 
335
  chunks.append(" ".join(window))
336
  return chunks
337
 
@@ -348,10 +381,12 @@ def _safe_init_st_model(name: str):
348
 
349
  def build_or_load_hybrid(pdf_dir: Path):
350
  # Build or load the hybrid retriever cache
351
- have_cache = (TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
352
- and RAG_META_PATH.exists()
353
- and (BM25_TOK_PATH.exists() or BM25Okapi is None)
354
- and (EMB_NPY_PATH.exists() or not USE_DENSE))
 
 
355
  if have_cache:
356
  vectorizer = joblib.load(TFIDF_VECT_PATH)
357
  X_tfidf = joblib.load(TFIDF_MAT_PATH)
@@ -378,7 +413,7 @@ def build_or_load_hybrid(pdf_dir: Path):
378
  meta = pd.DataFrame(rows)
379
  from sklearn.feature_extraction.text import TfidfVectorizer
380
  vectorizer = TfidfVectorizer(
381
- ngram_range=(1,2),
382
  min_df=1, max_df=0.95,
383
  sublinear_tf=True, smooth_idf=True,
384
  lowercase=True,
@@ -389,10 +424,17 @@ def build_or_load_hybrid(pdf_dir: Path):
389
  emb = None
390
  if USE_DENSE:
391
  try:
392
- st_model = _safe_init_st_model(os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"))
 
 
393
  if st_model is not None:
394
  from sklearn.preprocessing import normalize as sk_normalize
395
- em = st_model.encode(meta["text"].tolist(), batch_size=64, show_progress_bar=False, convert_to_numpy=True)
 
 
 
 
 
396
  emb = sk_normalize(em)
397
  np.save(EMB_NPY_PATH, emb)
398
  except Exception as e:
@@ -415,7 +457,10 @@ def _extract_page(text_chunk: str) -> str:
415
  m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
416
  return (m[-1].group(1) if m else "?")
417
 
418
- def hybrid_search(query: str, k=8, w_tfidf=W_TFIDF_DEFAULT, w_bm25=W_BM25_DEFAULT, w_emb=W_EMB_DEFAULT):
 
 
 
419
  if rag_meta is None or rag_meta.empty:
420
  return pd.DataFrame()
421
 
@@ -472,7 +517,8 @@ def split_sentences(text: str) -> List[str]:
472
  sents = sent_split(text)
473
  return [s for s in sents if 6 <= len(s.split()) <= 60]
474
 
475
- def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_chunk=6, lambda_div=0.7):
 
476
  """
477
  Robust MMR sentence picker:
478
  - Handles empty pools
@@ -548,9 +594,17 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_ch
548
  return selected
549
 
550
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
 
 
 
 
551
  if not selected:
552
  return ""
553
- return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
 
 
 
 
554
 
555
  # ========================= NEW: Instrumentation helpers =========================
556
  LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
@@ -570,7 +624,8 @@ def _calc_cost_usd(prompt_toks, completion_toks):
570
  return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
571
 
572
  # ----------------- Modified to return (text, usage_dict) -----------------
573
- def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
 
574
  if not LLM_AVAILABLE:
575
  return None, None
576
  client = OpenAI(api_key=OPENAI_API_KEY)
@@ -579,7 +634,7 @@ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = N
579
  "You are a scientific assistant for self-sensing cementitious materials.\n"
580
  "Answer STRICTLY using the provided sentences.\n"
581
  "Do not invent facts. Keep it concise (3–6 sentences).\n"
582
- "Retain inline citations like (Doc.pdf, p.X) exactly as given."
583
  )
584
  user_prompt = (
585
  f"Question: {question}\n\n"
@@ -627,7 +682,10 @@ def rag_reply(
627
  t0_retr = time.time()
628
 
629
  # --- Retrieval ---
630
- hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
 
 
 
631
  t1_retr = time.time()
632
  latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
633
 
@@ -650,10 +708,26 @@ def rag_reply(
650
  return final
651
 
652
  # Select sentences
653
- selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
654
- header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
655
- srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
656
- coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657
 
658
  # Prepare retrieval list for logging
659
  retr_list = []
@@ -670,12 +744,24 @@ def rag_reply(
670
  # Strict quotes only (no LLM)
671
  if strict_quotes_only:
672
  if not selected:
673
- final = f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
 
674
  else:
675
- final = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
 
 
 
 
 
 
676
  final += f"\n\n**Citations:** {header_cites}{coverage_note}"
677
  if include_passages:
678
- final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
679
 
680
  record = {
681
  "run_id": run_id,
@@ -688,7 +774,10 @@ def rag_reply(
688
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
689
  "output": {
690
  "final_answer": final,
691
- "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
 
 
 
692
  },
693
  "latency_ms_total": int((time.time()-t0_total)*1000),
694
  "openai": None
@@ -701,30 +790,60 @@ def rag_reply(
701
  llm_usage = None
702
  llm_latency_ms = None
703
  if use_llm and selected:
704
- lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
 
 
 
 
705
  t0_llm = time.time()
706
- llm_text, llm_usage = synthesize_with_llm(question, lines, model=model, temperature=temperature)
 
 
 
707
  t1_llm = time.time()
708
  llm_latency_ms = int((t1_llm - t0_llm) * 1000)
709
 
710
  if llm_text:
711
- final = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
712
  if include_passages:
713
- final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
714
  else:
715
  if not extractive:
716
- final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
 
 
717
  else:
718
- final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
719
  if include_passages:
720
- final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
721
  else:
722
  if not extractive:
723
- final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
 
 
724
  else:
725
- final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
 
 
 
726
  if include_passages:
727
- final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
 
 
728
 
729
  # --------- Log full run ---------
730
  prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
@@ -743,7 +862,10 @@ def rag_reply(
743
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
744
  "output": {
745
  "final_answer": final,
746
- "used_sentences": [{"sent": s['sent'], "doc": s['doc'], "page": s['page']} for s in selected]
 
 
 
747
  },
748
  "latency_ms_total": total_ms,
749
  "latency_ms_llm": llm_latency_ms,
@@ -1034,7 +1156,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1034
  "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
1035
  "<p style='opacity:.9'>"
1036
  "Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
1037
- "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection."
 
1038
  "</p>"
1039
  )
1040
 
@@ -1078,7 +1201,12 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1078
 
1079
  with gr.Column(scale=5):
1080
  with gr.Group(elem_classes=["card"]):
1081
- out_pred = gr.Number(label="Predicted Stress GF (MPa-1)", value=0.0, precision=6, elem_id="pred-out")
 
 
 
 
 
1082
  gr.Markdown(f"<small>{MODEL_STATUS}</small>")
1083
  with gr.Row():
1084
  btn_pred = gr.Button("Predict", variant="primary")
@@ -1088,7 +1216,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1088
  with gr.Accordion("About this model", open=False, elem_classes=["card"]):
1089
  gr.Markdown(
1090
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
1091
- "- Target: Stress GF (MPa<sup>-1</sup>) on original scale (model may train on log1p; saved flag used at inference).\n"
 
1092
  "- Missing values are safely imputed per-feature.\n"
1093
  "- Trained columns:\n"
1094
  f" `{', '.join(MAIN_VARIABLES)}`",
@@ -1108,26 +1237,51 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1108
  data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
1109
  return predict_fn(**data)
1110
 
1111
- btn_pred.click(_predict_wrapper, inputs=inputs_in_order, outputs=out_pred)
1112
- btn_clear.click(lambda: _clear_all(), inputs=None, outputs=inputs_in_order).then(lambda: 0.0, outputs=out_pred)
1113
- btn_demo.click(lambda: _fill_example(), inputs=None, outputs=inputs_in_order)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1114
 
1115
  # ------------------------- Literature Tab -------------------------
1116
  with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
1117
  pdf_count = len(list(LOCAL_PDF_DIR.glob("**/*.pdf")))
1118
  gr.Markdown(
1119
  f"Using local folder <code>papers/</code> — **{pdf_count} PDF(s)** indexed. "
1120
- "Upload more PDFs and reload the Space to expand coverage. Answers cite (Doc.pdf, p.X)."
 
1121
  )
1122
  with gr.Row():
1123
  top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
1124
  n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
1125
- include_passages = gr.Checkbox(value=False, label="Include supporting passages", interactive=True)
 
 
 
 
1126
 
1127
  with gr.Accordion("Retriever weights (advanced)", open=False):
1128
  w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
1129
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
1130
- w_emb = gr.Slider(0.0, 1.0, value=(0.0 if not USE_DENSE else 0.40), step=0.05, label="Dense weight (set 0 if disabled)")
 
 
 
 
 
1131
 
1132
  # Hidden states (unchanged)
1133
  state_use_llm = gr.State(LLM_AVAILABLE)
@@ -1143,7 +1297,7 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1143
  w_tfidf, w_bm25, w_emb
1144
  ],
1145
  title="Literature Q&A",
1146
- description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
1147
  )
1148
 
1149
  # ====== Evaluate (Gold vs Logs) — darker, higher-contrast ======
@@ -1151,7 +1305,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1151
  gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
1152
  with gr.Row():
1153
  gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
1154
- k_slider = gr.Slider(3, 12, value=8, step=1, label="k for Hit/Recall/nDCG", elem_id="k-slider")
 
1155
  with gr.Row():
1156
  btn_eval = gr.Button("Compute Metrics", variant="primary")
1157
  with gr.Row():
@@ -1172,7 +1327,12 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1172
  "--out_dir", out_dir
1173
  ]
1174
  try:
1175
- p = subprocess.run(cmd, capture_output=True, text=True, check=False)
 
 
 
 
 
1176
  stdout = p.stdout or ""
1177
  stderr = p.stderr or ""
1178
  perq = ARTIFACT_DIR / "metrics_per_question.csv"
@@ -1180,18 +1340,25 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1180
  agg_json = {}
1181
  if agg.exists():
1182
  agg_json = _json.loads(agg.read_text(encoding="utf-8"))
1183
- report = "```\n" + (stdout.strip() or "(no stdout)") + ("\n" + stderr.strip() if stderr else "") + "\n```"
1184
- return (str(perq) if perq.exists() else None,
1185
- str(agg) if agg.exists() else None,
1186
- agg_json,
1187
- report)
 
 
 
 
 
 
 
1188
  except Exception as e:
1189
  return (None, None, {}, f"**Eval error:** {e}")
1190
 
1191
  def _eval_wrapper(gf, k):
1192
- from pathlib import Path
1193
  if gf is None:
1194
- default_gold = Path("gold.csv")
1195
  if not default_gold.exists():
1196
  return None, None, {}, "**No gold.csv provided or found in repo root.**"
1197
  gold_path = str(default_gold)
@@ -1199,23 +1366,26 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
1199
  gold_path = gf.name
1200
  return _run_eval_inproc(gold_path, int(k))
1201
 
1202
- btn_eval.click(_eval_wrapper, inputs=[gold_file, k_slider],
1203
- outputs=[out_perq, out_agg, out_json, out_log])
 
 
 
1204
 
1205
  # ------------- Launch -------------
1206
  if __name__ == "__main__":
1207
  demo.queue().launch()
1208
- import os
1209
- import pandas as pd
1210
 
1211
- # Folder where your RAG files are stored
1212
- folder = "papers" # change if needed
1213
 
1214
- # List all files in the folder
1215
- files = sorted(os.listdir(folder))
1216
 
1217
- # Save them to a CSV file
1218
- pd.DataFrame({"doc": files}).to_csv("paper_list.csv", index=False)
1219
 
1220
- print("✅ Saved paper_list.csv with", len(files), "papers")
 
1221
 
 
 
8
  # - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
9
  # - UPDATED THEME: Dark-blue tabs + Evaluate tab + k-slider styling
10
  # - PATCH: Per-question/aggregate File + JSON outputs now dark-themed via elem_id hooks
11
+ # - NEW: APA-like inline citations via CITATION_MAP + format_citation()
12
  # ================================================================
13
 
14
  # ---------------------- Runtime flags (HF-safe) ----------------------
 
50
  OpenAI = None
51
 
52
  # LLM availability flag — used internally; UI remains hidden
53
+ LLM_AVAILABLE = (
54
+ OPENAI_API_KEY is not None
55
+ and OPENAI_API_KEY.strip() != ""
56
+ and OpenAI is not None
57
+ )
58
+
59
+ # ========================= APA-style citation helpers =========================
60
+
61
+ # Map PDF basenames → short APA-style in-text citations.
62
+ # IMPORTANT: edit these entries to match your actual filenames + desired citations.
63
+ CITATION_MAP: Dict[str, str] = {
64
+ # "filename.pdf": "Author et al., YEAR",
65
+ # Examples (you can edit/remove these):
66
+ "S92-Research-on-the-self-sensing-and-mechanical-properties-of_2021_Cement-and-Co.pdf": "Omar et al., 2021",
67
+ "S60-Study-on-self-sensing-capabilities-of-smart-cements-filled-with-graphene-oxide-under-dynamic-cyclic-loading.pdf": "Zhang et al., 2019",
68
+ "S59-Modifying self-sensing cement-based composites through multiscale composition.pdf": "Li et al., 2020",
69
+ }
70
+
71
+ def format_citation(doc_name: str, page: Any) -> str:
72
+ """
73
+ Convert a doc filename + page into an APA-like inline citation.
74
+ Fallback: use filename if doc not in CITATION_MAP.
75
+ """
76
+ base = Path(doc_name).name
77
+ short = CITATION_MAP.get(base, base)
78
+ if page is None:
79
+ return short
80
+ page_str = str(page).strip()
81
+ if page_str == "" or page_str == "?":
82
+ return short
83
+ return f"{short}, p. {page_str}"
84
 
85
  # ========================= Predictor (kept) =========================
86
  CF_COL = "Conductive Filler Conc. (wt%)"
 
170
  traceback.print_exc()
171
  MODEL = None
172
  if MODEL is None:
173
+ MODEL_STATUS = (
174
+ "🔴 Model not found (place stress_gf_xgb.joblib at repo root or models/, or set MODEL_PATH)"
175
+ )
176
  print("[ModelLoad]", MODEL_STATUS)
177
 
178
  _try_load_model() # load at import time
 
363
  chunks, step = [], max(1, win_size - overlap)
364
  for i in range(0, len(sents), step):
365
  window = sents[i:i+win_size]
366
+ if not window:
367
+ break
368
  chunks.append(" ".join(window))
369
  return chunks
370
 
 
381
 
382
  def build_or_load_hybrid(pdf_dir: Path):
383
  # Build or load the hybrid retriever cache
384
+ have_cache = (
385
+ TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
386
+ and RAG_META_PATH.exists()
387
+ and (BM25_TOK_PATH.exists() or BM25Okapi is None)
388
+ and (EMB_NPY_PATH.exists() or not USE_DENSE)
389
+ )
390
  if have_cache:
391
  vectorizer = joblib.load(TFIDF_VECT_PATH)
392
  X_tfidf = joblib.load(TFIDF_MAT_PATH)
 
413
  meta = pd.DataFrame(rows)
414
  from sklearn.feature_extraction.text import TfidfVectorizer
415
  vectorizer = TfidfVectorizer(
416
+ ngram_range=(1, 2),
417
  min_df=1, max_df=0.95,
418
  sublinear_tf=True, smooth_idf=True,
419
  lowercase=True,
 
424
  emb = None
425
  if USE_DENSE:
426
  try:
427
+ st_model = _safe_init_st_model(
428
+ os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
429
+ )
430
  if st_model is not None:
431
  from sklearn.preprocessing import normalize as sk_normalize
432
+ em = st_model.encode(
433
+ meta["text"].tolist(),
434
+ batch_size=64,
435
+ show_progress_bar=False,
436
+ convert_to_numpy=True
437
+ )
438
  emb = sk_normalize(em)
439
  np.save(EMB_NPY_PATH, emb)
440
  except Exception as e:
 
457
  m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
458
  return (m[-1].group(1) if m else "?")
459
 
460
+ def hybrid_search(query: str, k=8,
461
+ w_tfidf=W_TFIDF_DEFAULT,
462
+ w_bm25=W_BM25_DEFAULT,
463
+ w_emb=W_EMB_DEFAULT):
464
  if rag_meta is None or rag_meta.empty:
465
  return pd.DataFrame()
466
 
 
517
  sents = sent_split(text)
518
  return [s for s in sents if 6 <= len(s.split()) <= 60]
519
 
520
+ def mmr_select_sentences(question: str, hits: pd.DataFrame,
521
+ top_n=4, pool_per_chunk=6, lambda_div=0.7):
522
  """
523
  Robust MMR sentence picker:
524
  - Handles empty pools
 
594
  return selected
595
 
596
  def compose_extractive(selected: List[Dict[str, Any]]) -> str:
597
+ """
598
+ Build an extractive answer with APA-style inline citations.
599
+ Each sentence ends with (Author et al., YEAR, p. X) when mapped in CITATION_MAP.
600
+ """
601
  if not selected:
602
  return ""
603
+ parts = []
604
+ for s in selected:
605
+ cite = format_citation(s["doc"], s["page"])
606
+ parts.append(f"{s['sent']} ({cite})")
607
+ return " ".join(parts)
608
 
609
  # ========================= NEW: Instrumentation helpers =========================
610
  LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
 
624
  return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
625
 
626
  # ----------------- Modified to return (text, usage_dict) -----------------
627
+ def synthesize_with_llm(question: str, sentence_lines: List[str],
628
+ model: str = None, temperature: float = 0.2):
629
  if not LLM_AVAILABLE:
630
  return None, None
631
  client = OpenAI(api_key=OPENAI_API_KEY)
 
634
  "You are a scientific assistant for self-sensing cementitious materials.\n"
635
  "Answer STRICTLY using the provided sentences.\n"
636
  "Do not invent facts. Keep it concise (3–6 sentences).\n"
637
+ "Retain inline citations exactly as given (e.g., Omar et al., 2021, p. X).\n"
638
  )
639
  user_prompt = (
640
  f"Question: {question}\n\n"
 
682
  t0_retr = time.time()
683
 
684
  # --- Retrieval ---
685
+ hits = hybrid_search(
686
+ question, k=k,
687
+ w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb
688
+ )
689
  t1_retr = time.time()
690
  latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
691
 
 
708
  return final
709
 
710
  # Select sentences
711
+ selected = mmr_select_sentences(
712
+ question, hits,
713
+ top_n=int(n_sentences),
714
+ pool_per_chunk=6,
715
+ lambda_div=0.7
716
+ )
717
+
718
+ # APA-style header citations: e.g., "Omar et al., 2021, p. 5; Zhang et al., 2019, p. 12"
719
+ header_cites = "; ".join(
720
+ format_citation(
721
+ Path(r["doc_path"]).name,
722
+ _extract_page(r["text"])
723
+ )
724
+ for _, r in hits.head(6).iterrows()
725
+ )
726
+ srcs = {Path(r["doc_path"]).name for _, r in hits.iterrows()}
727
+ coverage_note = "" if len(srcs) >= 3 else (
728
+ f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. "
729
+ f"Add more PDFs or increase Top-K."
730
+ )
731
 
732
  # Prepare retrieval list for logging
733
  retr_list = []
 
744
  # Strict quotes only (no LLM)
745
  if strict_quotes_only:
746
  if not selected:
747
+ final = (
748
+ f"**Quoted Passages:**\n\n---\n"
749
+ + "\n\n".join(hits['text'].tolist()[:2])
750
+ + f"\n\n**Citations:** {header_cites}{coverage_note}"
751
+ )
752
  else:
753
+ final = (
754
+ "**Quoted Passages:**\n- "
755
+ + "\n- ".join(
756
+ f"{s['sent']} ({format_citation(s['doc'], s['page'])})"
757
+ for s in selected
758
+ )
759
+ )
760
  final += f"\n\n**Citations:** {header_cites}{coverage_note}"
761
  if include_passages:
762
+ final += (
763
+ "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
764
+ )
765
 
766
  record = {
767
  "run_id": run_id,
 
774
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
775
  "output": {
776
  "final_answer": final,
777
+ "used_sentences": [
778
+ {"sent": s["sent"], "doc": s["doc"], "page": s["page"]}
779
+ for s in selected
780
+ ]
781
  },
782
  "latency_ms_total": int((time.time()-t0_total)*1000),
783
  "openai": None
 
790
  llm_usage = None
791
  llm_latency_ms = None
792
  if use_llm and selected:
793
+ # Pass APA-style citations into the LLM so it preserves them
794
+ lines = [
795
+ f"{s['sent']} ({format_citation(s['doc'], s['page'])})"
796
+ for s in selected
797
+ ]
798
  t0_llm = time.time()
799
+ llm_text, llm_usage = synthesize_with_llm(
800
+ question, lines,
801
+ model=model, temperature=temperature
802
+ )
803
  t1_llm = time.time()
804
  llm_latency_ms = int((t1_llm - t0_llm) * 1000)
805
 
806
  if llm_text:
807
+ final = (
808
+ f"**Answer (LLM synthesis):** {llm_text}\n\n"
809
+ f"**Citations:** {header_cites}{coverage_note}"
810
+ )
811
  if include_passages:
812
+ final += (
813
+ "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
814
+ )
815
  else:
816
  if not extractive:
817
+ final = (
818
+ f"**Answer:** Here are relevant passages.\n\n"
819
+ f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
820
+ + "\n\n".join(hits['text'].tolist()[:2])
821
+ )
822
  else:
823
+ final = (
824
+ f"**Answer:** {extractive}\n\n"
825
+ f"**Citations:** {header_cites}{coverage_note}"
826
+ )
827
  if include_passages:
828
+ final += (
829
+ "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
830
+ )
831
  else:
832
  if not extractive:
833
+ final = (
834
+ f"**Answer:** Here are relevant passages.\n\n"
835
+ f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
836
+ + "\n\n".join(hits['text'].tolist()[:2])
837
+ )
838
  else:
839
+ final = (
840
+ f"**Answer:** {extractive}\n\n"
841
+ f"**Citations:** {header_cites}{coverage_note}"
842
+ )
843
  if include_passages:
844
+ final += (
845
+ "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
846
+ )
847
 
848
  # --------- Log full run ---------
849
  prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
 
862
  "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
863
  "output": {
864
  "final_answer": final,
865
+ "used_sentences": [
866
+ {"sent": s['sent'], "doc": s['doc'], "page": s['page']}
867
+ for s in selected
868
+ ]
869
  },
870
  "latency_ms_total": total_ms,
871
  "latency_ms_llm": llm_latency_ms,
 
1156
  "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
1157
  "<p style='opacity:.9'>"
1158
  "Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
1159
+ "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection. "
1160
+ "Answers use APA-style inline citations (e.g., Omar et al., 2021, p. X)."
1161
  "</p>"
1162
  )
1163
 
 
1201
 
1202
  with gr.Column(scale=5):
1203
  with gr.Group(elem_classes=["card"]):
1204
+ out_pred = gr.Number(
1205
+ label="Predicted Stress GF (MPa-1)",
1206
+ value=0.0,
1207
+ precision=6,
1208
+ elem_id="pred-out"
1209
+ )
1210
  gr.Markdown(f"<small>{MODEL_STATUS}</small>")
1211
  with gr.Row():
1212
  btn_pred = gr.Button("Predict", variant="primary")
 
1216
  with gr.Accordion("About this model", open=False, elem_classes=["card"]):
1217
  gr.Markdown(
1218
  "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
1219
+ "- Target: Stress GF (MPa<sup>-1</sup>) on original scale "
1220
+ "(model may train on log1p; saved flag used at inference).\n"
1221
  "- Missing values are safely imputed per-feature.\n"
1222
  "- Trained columns:\n"
1223
  f" `{', '.join(MAIN_VARIABLES)}`",
 
1237
  data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
1238
  return predict_fn(**data)
1239
 
1240
+ btn_pred.click(
1241
+ _predict_wrapper,
1242
+ inputs=inputs_in_order,
1243
+ outputs=out_pred
1244
+ )
1245
+ btn_clear.click(
1246
+ lambda: _clear_all(),
1247
+ inputs=None,
1248
+ outputs=inputs_in_order
1249
+ ).then(
1250
+ lambda: 0.0,
1251
+ outputs=out_pred
1252
+ )
1253
+ btn_demo.click(
1254
+ lambda: _fill_example(),
1255
+ inputs=None,
1256
+ outputs=inputs_in_order
1257
+ )
1258
 
1259
  # ------------------------- Literature Tab -------------------------
1260
  with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
1261
  pdf_count = len(list(LOCAL_PDF_DIR.glob("**/*.pdf")))
1262
  gr.Markdown(
1263
  f"Using local folder <code>papers/</code> — **{pdf_count} PDF(s)** indexed. "
1264
+ "Upload more PDFs and reload the Space to expand coverage. "
1265
+ "Answers use APA-like inline citations (e.g., Omar et al., 2021, p. X)."
1266
  )
1267
  with gr.Row():
1268
  top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
1269
  n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
1270
+ include_passages = gr.Checkbox(
1271
+ value=False,
1272
+ label="Include supporting passages",
1273
+ interactive=True
1274
+ )
1275
 
1276
  with gr.Accordion("Retriever weights (advanced)", open=False):
1277
  w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
1278
  w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
1279
+ w_emb = gr.Slider(
1280
+ 0.0, 1.0,
1281
+ value=(0.0 if not USE_DENSE else 0.40),
1282
+ step=0.05,
1283
+ label="Dense weight (set 0 if disabled)"
1284
+ )
1285
 
1286
  # Hidden states (unchanged)
1287
  state_use_llm = gr.State(LLM_AVAILABLE)
 
1297
  w_tfidf, w_bm25, w_emb
1298
  ],
1299
  title="Literature Q&A",
1300
+ description="Hybrid retrieval with diversity. Answers carry APA-style inline citations."
1301
  )
1302
 
1303
  # ====== Evaluate (Gold vs Logs) — darker, higher-contrast ======
 
1305
  gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
1306
  with gr.Row():
1307
  gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
1308
+ k_slider = gr.Slider(3, 12, value=8, step=1,
1309
+ label="k for Hit/Recall/nDCG", elem_id="k-slider")
1310
  with gr.Row():
1311
  btn_eval = gr.Button("Compute Metrics", variant="primary")
1312
  with gr.Row():
 
1327
  "--out_dir", out_dir
1328
  ]
1329
  try:
1330
+ p = subprocess.run(
1331
+ cmd,
1332
+ capture_output=True,
1333
+ text=True,
1334
+ check=False
1335
+ )
1336
  stdout = p.stdout or ""
1337
  stderr = p.stderr or ""
1338
  perq = ARTIFACT_DIR / "metrics_per_question.csv"
 
1340
  agg_json = {}
1341
  if agg.exists():
1342
  agg_json = _json.loads(agg.read_text(encoding="utf-8"))
1343
+ report = (
1344
+ "```\n"
1345
+ + (stdout.strip() or "(no stdout)")
1346
+ + ("\n" + stderr.strip() if stderr else "")
1347
+ + "\n```"
1348
+ )
1349
+ return (
1350
+ str(perq) if perq.exists() else None,
1351
+ str(agg) if agg.exists() else None,
1352
+ agg_json,
1353
+ report
1354
+ )
1355
  except Exception as e:
1356
  return (None, None, {}, f"**Eval error:** {e}")
1357
 
1358
  def _eval_wrapper(gf, k):
1359
+ from pathlib import Path as _Path
1360
  if gf is None:
1361
+ default_gold = _Path("gold.csv")
1362
  if not default_gold.exists():
1363
  return None, None, {}, "**No gold.csv provided or found in repo root.**"
1364
  gold_path = str(default_gold)
 
1366
  gold_path = gf.name
1367
  return _run_eval_inproc(gold_path, int(k))
1368
 
1369
+ btn_eval.click(
1370
+ _eval_wrapper,
1371
+ inputs=[gold_file, k_slider],
1372
+ outputs=[out_perq, out_agg, out_json, out_log]
1373
+ )
1374
 
1375
  # ------------- Launch -------------
1376
  if __name__ == "__main__":
1377
  demo.queue().launch()
 
 
1378
 
1379
+ import os
1380
+ import pandas as pd
1381
 
1382
+ # Folder where your RAG files are stored
1383
+ folder = "papers" # change if needed
1384
 
1385
+ # List all files in the folder
1386
+ files = sorted(os.listdir(folder))
1387
 
1388
+ # Save them to a CSV file
1389
+ pd.DataFrame({"doc": files}).to_csv("paper_list.csv", index=False)
1390
 
1391
+ print("✅ Saved paper_list.csv with", len(files), "papers")