Update app.py

app.py CHANGED
@@ -8,7 +8,7 @@
 # - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
 # - UPDATED THEME: Dark-blue tabs + Evaluate tab + k-slider styling
 # - PATCH: Per-question/aggregate File + JSON outputs now dark-themed via elem_id hooks
-# -
+# - NEW: APA-like inline citations via CITATION_MAP + format_citation()
 # ================================================================

 # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -50,7 +50,37 @@ except Exception:
     OpenAI = None

 # LLM availability flag - used internally; UI remains hidden
-LLM_AVAILABLE = (
+LLM_AVAILABLE = (
+    OPENAI_API_KEY is not None
+    and OPENAI_API_KEY.strip() != ""
+    and OpenAI is not None
+)
+
+# ========================= APA-style citation helpers =========================
+
+# Map PDF basenames -> short APA-style in-text citations.
+# IMPORTANT: edit these entries to match your actual filenames + desired citations.
+CITATION_MAP: Dict[str, str] = {
+    # "filename.pdf": "Author et al., YEAR",
+    # Examples (you can edit/remove these):
+    "S92-Research-on-the-self-sensing-and-mechanical-properties-of_2021_Cement-and-Co.pdf": "Omar et al., 2021",
+    "S60-Study-on-self-sensing-capabilities-of-smart-cements-filled-with-graphene-oxide-under-dynamic-cyclic-loading.pdf": "Zhang et al., 2019",
+    "S59-Modifying self-sensing cement-based composites through multiscale composition.pdf": "Li et al., 2020",
+}
+
+def format_citation(doc_name: str, page: Any) -> str:
+    """
+    Convert a doc filename + page into an APA-like inline citation.
+    Fallback: use filename if doc not in CITATION_MAP.
+    """
+    base = Path(doc_name).name
+    short = CITATION_MAP.get(base, base)
+    if page is None:
+        return short
+    page_str = str(page).strip()
+    if page_str == "" or page_str == "?":
+        return short
+    return f"{short}, p. {page_str}"

 # ========================= Predictor (kept) =========================
 CF_COL = "Conductive Filler Conc. (wt%)"
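
As a quick orientation for the helper added above, a hypothetical call pattern; the expected outputs follow directly from CITATION_MAP and format_citation() as written, and the unmapped filename is made up:

    # Hypothetical usage of format_citation(); outputs derived from the code above
    print(format_citation(
        "S92-Research-on-the-self-sensing-and-mechanical-properties-of_2021_Cement-and-Co.pdf",
        5,
    ))  # "Omar et al., 2021, p. 5"
    print(format_citation("papers/some-unmapped-paper.pdf", "?"))  # "some-unmapped-paper.pdf"
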
@@ -140,7 +170,9 @@ def _try_load_model():
         traceback.print_exc()
         MODEL = None
     if MODEL is None:
-        MODEL_STATUS =
+        MODEL_STATUS = (
+            "🔴 Model not found (place stress_gf_xgb.joblib at repo root or models/, or set MODEL_PATH)"
+        )
     print("[ModelLoad]", MODEL_STATUS)

 _try_load_model()  # load at import time
@@ -331,7 +363,8 @@ def chunk_by_sentence_windows(text: str, win_size=8, overlap=2) -> List[str]:
     chunks, step = [], max(1, win_size - overlap)
     for i in range(0, len(sents), step):
         window = sents[i:i+win_size]
-        if not window:
+        if not window:
+            break
         chunks.append(" ".join(window))
     return chunks
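
To make the windowing arithmetic concrete: with the defaults win_size=8 and overlap=2 the step is 6, so consecutive windows share two sentences. A standalone illustration (not code from the diff):

    win_size, overlap, n_sents = 8, 2, 20
    step = max(1, win_size - overlap)        # 6
    starts = list(range(0, n_sents, step))   # [0, 6, 12, 18]
    # windows cover sents[0:8], sents[6:14], sents[12:20], sents[18:20];
    # the new `if not window: break` guard is a defensive stop for empty windows.
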
@@ -348,10 +381,12 @@ def _safe_init_st_model(name: str):

 def build_or_load_hybrid(pdf_dir: Path):
     # Build or load the hybrid retriever cache
-    have_cache = (
-
-
-
+    have_cache = (
+        TFIDF_VECT_PATH.exists() and TFIDF_MAT_PATH.exists()
+        and RAG_META_PATH.exists()
+        and (BM25_TOK_PATH.exists() or BM25Okapi is None)
+        and (EMB_NPY_PATH.exists() or not USE_DENSE)
+    )
     if have_cache:
         vectorizer = joblib.load(TFIDF_VECT_PATH)
         X_tfidf = joblib.load(TFIDF_MAT_PATH)
@@ -378,7 +413,7 @@ def build_or_load_hybrid(pdf_dir: Path):
     meta = pd.DataFrame(rows)
     from sklearn.feature_extraction.text import TfidfVectorizer
     vectorizer = TfidfVectorizer(
-        ngram_range=(1,2),
+        ngram_range=(1, 2),
         min_df=1, max_df=0.95,
         sublinear_tf=True, smooth_idf=True,
         lowercase=True,
@@ -389,10 +424,17 @@ def build_or_load_hybrid(pdf_dir: Path):
     emb = None
     if USE_DENSE:
         try:
-            st_model = _safe_init_st_model(
+            st_model = _safe_init_st_model(
+                os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
+            )
             if st_model is not None:
                 from sklearn.preprocessing import normalize as sk_normalize
-                em = st_model.encode(
+                em = st_model.encode(
+                    meta["text"].tolist(),
+                    batch_size=64,
+                    show_progress_bar=False,
+                    convert_to_numpy=True
+                )
                 emb = sk_normalize(em)
                 np.save(EMB_NPY_PATH, emb)
         except Exception as e:
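
Because the chunk embeddings are L2-normalized via sk_normalize before being cached to EMB_NPY_PATH, cosine similarity at query time reduces to a plain dot product. A minimal sketch of that property (illustrative names, not code from this diff):

    import numpy as np

    def dense_scores(q_unit: np.ndarray, emb_unit: np.ndarray) -> np.ndarray:
        # emb_unit: (n_chunks, d) unit-length rows; q_unit: (d,) unit-length query vector
        return emb_unit @ q_unit  # identical to cosine similarity for unit vectors
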
@@ -415,7 +457,10 @@ def _extract_page(text_chunk: str) -> str:
     m = list(re.finditer(r"\[\[PAGE=(\d+)\]\]", text_chunk or ""))
     return (m[-1].group(1) if m else "?")

-def hybrid_search(query: str, k=8,
+def hybrid_search(query: str, k=8,
+                  w_tfidf=W_TFIDF_DEFAULT,
+                  w_bm25=W_BM25_DEFAULT,
+                  w_emb=W_EMB_DEFAULT):
     if rag_meta is None or rag_meta.empty:
         return pd.DataFrame()
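
The body of hybrid_search is unchanged by this diff (only the signature gains the three weight parameters), so the fusion logic itself is not visible here. As a rough sketch of the weighted-fusion pattern such a signature implies: normalize each retriever's scores to a common scale, then rank by the weighted sum. All names below are illustrative assumptions, not the app's code:

    import numpy as np

    def _minmax(x: np.ndarray) -> np.ndarray:
        rng = float(x.max() - x.min())
        return (x - x.min()) / rng if rng > 0 else np.zeros_like(x)

    def fuse_scores(tfidf_s, bm25_s, emb_s, w_tfidf=0.3, w_bm25=0.3, w_emb=0.4):
        # Blend per-retriever scores on a shared [0, 1] scale; higher is better
        return (w_tfidf * _minmax(tfidf_s)
                + w_bm25 * _minmax(bm25_s)
                + w_emb * _minmax(emb_s))
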
@@ -472,7 +517,8 @@ def split_sentences(text: str) -> List[str]:
     sents = sent_split(text)
     return [s for s in sents if 6 <= len(s.split()) <= 60]

-def mmr_select_sentences(question: str, hits: pd.DataFrame,
+def mmr_select_sentences(question: str, hits: pd.DataFrame,
+                         top_n=4, pool_per_chunk=6, lambda_div=0.7):
     """
     Robust MMR sentence picker:
     - Handles empty pools
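
For readers unfamiliar with MMR: each pick maximizes lambda_div times query relevance minus (1 - lambda_div) times the candidate's worst-case redundancy against what is already selected. A compact reference sketch of that criterion (not the function body, which this diff leaves unchanged):

    # MMR(s) = lam * sim(s, query) - (1 - lam) * max over t in selected of sim(s, t)
    def mmr_pick(cand_idx, sim_to_query, sim_matrix, top_n=4, lam=0.7):
        selected = []
        cands = list(cand_idx)
        while cands and len(selected) < top_n:
            def score(i):
                redundancy = max((sim_matrix[i][j] for j in selected), default=0.0)
                return lam * sim_to_query[i] - (1 - lam) * redundancy
            best = max(cands, key=score)
            selected.append(best)
            cands.remove(best)
        return selected
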
@@ -548,9 +594,17 @@ def mmr_select_sentences(question: str, hits: pd.DataFrame, top_n=4, pool_per_ch
     return selected

 def compose_extractive(selected: List[Dict[str, Any]]) -> str:
+    """
+    Build an extractive answer with APA-style inline citations.
+    Each sentence ends with (Author et al., YEAR, p. X) when mapped in CITATION_MAP.
+    """
     if not selected:
         return ""
-
+    parts = []
+    for s in selected:
+        cite = format_citation(s["doc"], s["page"])
+        parts.append(f"{s['sent']} ({cite})")
+    return " ".join(parts)

 # ========================= NEW: Instrumentation helpers =========================
 LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
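
The net effect: the extractive answer is the selected sentences joined with trailing APA-style citations. An illustrative input/output pair (the sentence is invented; the filename is one of the CITATION_MAP entries above):

    selected = [{
        "sent": "Conductive fillers markedly increased the stress gauge factor.",
        "doc": "S92-Research-on-the-self-sensing-and-mechanical-properties-of_2021_Cement-and-Co.pdf",
        "page": 5,
    }]
    print(compose_extractive(selected))
    # Conductive fillers markedly increased the stress gauge factor. (Omar et al., 2021, p. 5)
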
@@ -570,7 +624,8 @@ def _calc_cost_usd(prompt_toks, completion_toks):
     return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K

 # ----------------- Modified to return (text, usage_dict) -----------------
-def synthesize_with_llm(question: str, sentence_lines: List[str],
+def synthesize_with_llm(question: str, sentence_lines: List[str],
+                        model: str = None, temperature: float = 0.2):
     if not LLM_AVAILABLE:
         return None, None
     client = OpenAI(api_key=OPENAI_API_KEY)
@@ -579,7 +634,7 @@ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = N
         "You are a scientific assistant for self-sensing cementitious materials.\n"
         "Answer STRICTLY using the provided sentences.\n"
         "Do not invent facts. Keep it concise (3-6 sentences).\n"
-        "Retain inline citations
+        "Retain inline citations exactly as given (e.g., Omar et al., 2021, p. X).\n"
     )
     user_prompt = (
         f"Question: {question}\n\n"
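
The remainder of synthesize_with_llm is outside this diff. Given the comment that it now returns (text, usage_dict), and that callers later read usage via .get("prompt_tokens"), the call presumably resembles the following openai>=1.x sketch; the fallback model name and all variable names are assumptions, not the app's code:

    def _synthesize_tail(client, system_prompt, user_prompt, model, temperature):
        # Hypothetical tail of synthesize_with_llm(): one chat call plus a usage dict
        resp = client.chat.completions.create(
            model=model or "gpt-4o-mini",   # assumed fallback; not visible in the diff
            temperature=temperature,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        usage = {
            "prompt_tokens": resp.usage.prompt_tokens,
            "completion_tokens": resp.usage.completion_tokens,
        }
        return resp.choices[0].message.content, usage
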
@@ -627,7 +682,10 @@ def rag_reply(
     t0_retr = time.time()

     # --- Retrieval ---
-    hits = hybrid_search(
+    hits = hybrid_search(
+        question, k=k,
+        w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb
+    )
     t1_retr = time.time()
     latency_ms_retriever = int((t1_retr - t0_retr) * 1000)
@@ -650,10 +708,26 @@ def rag_reply(
         return final

     # Select sentences
-    selected = mmr_select_sentences(
-
-
-
+    selected = mmr_select_sentences(
+        question, hits,
+        top_n=int(n_sentences),
+        pool_per_chunk=6,
+        lambda_div=0.7
+    )
+
+    # APA-style header citations: e.g., "Omar et al., 2021, p. 5; Zhang et al., 2019, p. 12"
+    header_cites = "; ".join(
+        format_citation(
+            Path(r["doc_path"]).name,
+            _extract_page(r["text"])
+        )
+        for _, r in hits.head(6).iterrows()
+    )
+    srcs = {Path(r["doc_path"]).name for _, r in hits.iterrows()}
+    coverage_note = "" if len(srcs) >= 3 else (
+        f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. "
+        f"Add more PDFs or increase Top-K."
+    )

     # Prepare retrieval list for logging
     retr_list = []
@@ -670,12 +744,24 @@ def rag_reply(
     # Strict quotes only (no LLM)
     if strict_quotes_only:
         if not selected:
-            final =
+            final = (
+                f"**Quoted Passages:**\n\n---\n"
+                + "\n\n".join(hits['text'].tolist()[:2])
+                + f"\n\n**Citations:** {header_cites}{coverage_note}"
+            )
         else:
-            final =
+            final = (
+                "**Quoted Passages:**\n- "
+                + "\n- ".join(
+                    f"{s['sent']} ({format_citation(s['doc'], s['page'])})"
+                    for s in selected
+                )
+            )
             final += f"\n\n**Citations:** {header_cites}{coverage_note}"
         if include_passages:
-            final +=
+            final += (
+                "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+            )

     record = {
         "run_id": run_id,
@@ -688,7 +774,10 @@ def rag_reply(
         "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
         "output": {
             "final_answer": final,
-            "used_sentences": [
+            "used_sentences": [
+                {"sent": s["sent"], "doc": s["doc"], "page": s["page"]}
+                for s in selected
+            ]
         },
         "latency_ms_total": int((time.time()-t0_total)*1000),
         "openai": None
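
Because each turn appends one JSON object per line, the rag_logs.jsonl file is easy to analyze offline. Field names below come from the record dicts in this diff; the artifacts/ path is an assumption about where ARTIFACT_DIR points:

    import json
    from pathlib import Path

    log_path = Path("artifacts/rag_logs.jsonl")  # adjust to your ARTIFACT_DIR
    records = [json.loads(ln) for ln in log_path.read_text(encoding="utf-8").splitlines() if ln.strip()]
    latencies = [r["latency_ms_total"] for r in records]
    print(f"{len(records)} logged turns; mean total latency "
          f"{sum(latencies) / max(1, len(latencies)):.0f} ms")
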
@@ -701,30 +790,60 @@ def rag_reply(
     llm_usage = None
     llm_latency_ms = None
     if use_llm and selected:
-
+        # Pass APA-style citations into the LLM so it preserves them
+        lines = [
+            f"{s['sent']} ({format_citation(s['doc'], s['page'])})"
+            for s in selected
+        ]
         t0_llm = time.time()
-        llm_text, llm_usage = synthesize_with_llm(
+        llm_text, llm_usage = synthesize_with_llm(
+            question, lines,
+            model=model, temperature=temperature
+        )
         t1_llm = time.time()
         llm_latency_ms = int((t1_llm - t0_llm) * 1000)

         if llm_text:
-            final =
+            final = (
+                f"**Answer (LLM synthesis):** {llm_text}\n\n"
+                f"**Citations:** {header_cites}{coverage_note}"
+            )
             if include_passages:
-                final +=
+                final += (
+                    "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+                )
         else:
             if not extractive:
-                final =
+                final = (
+                    f"**Answer:** Here are relevant passages.\n\n"
+                    f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
+                    + "\n\n".join(hits['text'].tolist()[:2])
+                )
             else:
-                final =
+                final = (
+                    f"**Answer:** {extractive}\n\n"
+                    f"**Citations:** {header_cites}{coverage_note}"
+                )
             if include_passages:
-                final +=
+                final += (
+                    "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+                )
     else:
         if not extractive:
-            final =
+            final = (
+                f"**Answer:** Here are relevant passages.\n\n"
+                f"**Citations:** {header_cites}{coverage_note}\n\n---\n"
+                + "\n\n".join(hits['text'].tolist()[:2])
+            )
         else:
-            final =
+            final = (
+                f"**Answer:** {extractive}\n\n"
+                f"**Citations:** {header_cites}{coverage_note}"
+            )
        if include_passages:
-            final +=
+            final += (
+                "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+            )

     # --------- Log full run ---------
     prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
@@ -743,7 +862,10 @@ def rag_reply(
         "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
         "output": {
             "final_answer": final,
-            "used_sentences": [
+            "used_sentences": [
+                {"sent": s['sent'], "doc": s['doc'], "page": s['page']}
+                for s in selected
+            ]
         },
         "latency_ms_total": total_ms,
         "latency_ms_llm": llm_latency_ms,
@@ -1034,7 +1156,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
         "<h1 style='margin:0'>Self-Sensing Concrete Assistant</h1>"
         "<p style='opacity:.9'>"
         "Left: ML prediction for Stress Gauge Factor (original scale, MPa<sup>-1</sup>). "
-        "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection."
+        "Right: Literature Q&A via Hybrid RAG (BM25 + TF-IDF + optional dense) with MMR sentence selection. "
+        "Answers use APA-style inline citations (e.g., Omar et al., 2021, p. X)."
         "</p>"
     )
@@ -1078,7 +1201,12 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:

         with gr.Column(scale=5):
             with gr.Group(elem_classes=["card"]):
-                out_pred = gr.Number(
+                out_pred = gr.Number(
+                    label="Predicted Stress GF (MPa-1)",
+                    value=0.0,
+                    precision=6,
+                    elem_id="pred-out"
+                )
                 gr.Markdown(f"<small>{MODEL_STATUS}</small>")
                 with gr.Row():
                     btn_pred = gr.Button("Predict", variant="primary")
@@ -1088,7 +1216,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
         with gr.Accordion("About this model", open=False, elem_classes=["card"]):
             gr.Markdown(
                 "- Pipeline: ColumnTransformer → (RobustScaler + OneHot) → XGBoost\n"
-                "- Target: Stress GF (MPa<sup>-1</sup>) on original scale
+                "- Target: Stress GF (MPa<sup>-1</sup>) on original scale "
+                "(model may train on log1p; saved flag used at inference).\n"
                 "- Missing values are safely imputed per-feature.\n"
                 "- Trained columns:\n"
                 f"  `{', '.join(MAIN_VARIABLES)}`",
@@ -1108,26 +1237,51 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
             data = {k: v for k, v in zip(MAIN_VARIABLES, vals)}
             return predict_fn(**data)

-        btn_pred.click(
-
-
+        btn_pred.click(
+            _predict_wrapper,
+            inputs=inputs_in_order,
+            outputs=out_pred
+        )
+        btn_clear.click(
+            lambda: _clear_all(),
+            inputs=None,
+            outputs=inputs_in_order
+        ).then(
+            lambda: 0.0,
+            outputs=out_pred
+        )
+        btn_demo.click(
+            lambda: _fill_example(),
+            inputs=None,
+            outputs=inputs_in_order
+        )

     # ------------------------- Literature Tab -------------------------
     with gr.Tab("📚 Ask the Literature (Hybrid RAG + MMR)", elem_id="rag-tab"):
         pdf_count = len(list(LOCAL_PDF_DIR.glob("**/*.pdf")))
         gr.Markdown(
             f"Using local folder <code>papers/</code> → **{pdf_count} PDF(s)** indexed. "
-            "Upload more PDFs and reload the Space to expand coverage.
+            "Upload more PDFs and reload the Space to expand coverage. "
+            "Answers use APA-like inline citations (e.g., Omar et al., 2021, p. X)."
         )
         with gr.Row():
             top_k = gr.Slider(5, 12, value=8, step=1, label="Top-K chunks")
             n_sentences = gr.Slider(2, 6, value=4, step=1, label="Answer length (sentences)")
-            include_passages = gr.Checkbox(
+            include_passages = gr.Checkbox(
+                value=False,
+                label="Include supporting passages",
+                interactive=True
+            )

         with gr.Accordion("Retriever weights (advanced)", open=False):
             w_tfidf = gr.Slider(0.0, 1.0, value=W_TFIDF_DEFAULT, step=0.05, label="TF-IDF weight")
             w_bm25 = gr.Slider(0.0, 1.0, value=W_BM25_DEFAULT, step=0.05, label="BM25 weight")
-            w_emb = gr.Slider(
+            w_emb = gr.Slider(
+                0.0, 1.0,
+                value=(0.0 if not USE_DENSE else 0.40),
+                step=0.05,
+                label="Dense weight (set 0 if disabled)"
+            )

     # Hidden states (unchanged)
     state_use_llm = gr.State(LLM_AVAILABLE)
@@ -1143,7 +1297,7 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
             w_tfidf, w_bm25, w_emb
         ],
         title="Literature Q&A",
-        description="Hybrid retrieval with diversity. Answers carry inline
+        description="Hybrid retrieval with diversity. Answers carry APA-style inline citations."
     )

     # ====== Evaluate (Gold vs Logs) - darker, higher-contrast ======
@@ -1151,7 +1305,8 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
         gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
         with gr.Row():
             gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
-            k_slider = gr.Slider(3, 12, value=8, step=1,
+            k_slider = gr.Slider(3, 12, value=8, step=1,
+                                 label="k for Hit/Recall/nDCG", elem_id="k-slider")
         with gr.Row():
             btn_eval = gr.Button("Compute Metrics", variant="primary")
         with gr.Row():
@@ -1172,7 +1327,12 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
                 "--out_dir", out_dir
             ]
             try:
-                p = subprocess.run(
+                p = subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    check=False
+                )
                 stdout = p.stdout or ""
                 stderr = p.stderr or ""
                 perq = ARTIFACT_DIR / "metrics_per_question.csv"
@@ -1180,18 +1340,25 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
                 agg_json = {}
                 if agg.exists():
                     agg_json = _json.loads(agg.read_text(encoding="utf-8"))
-                report =
-
-
-
-
+                report = (
+                    "```\n"
+                    + (stdout.strip() or "(no stdout)")
+                    + ("\n" + stderr.strip() if stderr else "")
+                    + "\n```"
+                )
+                return (
+                    str(perq) if perq.exists() else None,
+                    str(agg) if agg.exists() else None,
+                    agg_json,
+                    report
+                )
             except Exception as e:
                 return (None, None, {}, f"**Eval error:** {e}")

         def _eval_wrapper(gf, k):
-            from pathlib import Path
+            from pathlib import Path as _Path
             if gf is None:
-                default_gold =
+                default_gold = _Path("gold.csv")
                 if not default_gold.exists():
                     return None, None, {}, "**No gold.csv provided or found in repo root.**"
                 gold_path = str(default_gold)
@@ -1199,23 +1366,26 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
             gold_path = gf.name
             return _run_eval_inproc(gold_path, int(k))

-        btn_eval.click(
-
+        btn_eval.click(
+            _eval_wrapper,
+            inputs=[gold_file, k_slider],
+            outputs=[out_perq, out_agg, out_json, out_log]
+        )

 # ------------- Launch -------------
 if __name__ == "__main__":
     demo.queue().launch()
-    import os
-    import pandas as pd
-
-
+    import os
+    import pandas as pd

-    #
-
+    # Folder where your RAG files are stored
+    folder = "papers"  # change if needed
+
+    # List all files in the folder
+    files = sorted(os.listdir(folder))

-    #
-
+    # Save them to a CSV file
+    pd.DataFrame({"doc": files}).to_csv("paper_list.csv", index=False)
+
+    print("✅ Saved paper_list.csv with", len(files), "papers")