Update app.py
app.py
CHANGED
@@ -5,6 +5,7 @@
 # - Predictor: safe model caching + safe feature alignment
 # - Stable categoricals ("NA"); no over-strict completeness gate
 # - Fixed [[PAGE=...]] regex
+# - NEW: Lightweight instrumentation (JSONL logs per RAG turn)
 # ================================================================

 # ---------------------- Runtime flags (HF-safe) ----------------------
@@ -14,7 +15,7 @@ os.environ["TRANSFORMERS_NO_FLAX"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 # ------------------------------- Imports ------------------------------
-import re, joblib, warnings, json, traceback
+import re, joblib, warnings, json, traceback, time, uuid, subprocess, sys
 from pathlib import Path
 from typing import List, Dict, Any

@@ -548,9 +549,27 @@ def compose_extractive(selected: List[Dict[str, Any]]) -> str:
         return ""
     return " ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)

-def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
-    if not LLM_AVAILABLE:
+# ========================= NEW: Instrumentation helpers =========================
+LOG_PATH = ARTIFACT_DIR / "rag_logs.jsonl"
+OPENAI_IN_COST_PER_1K = float(os.getenv("OPENAI_COST_IN_PER_1K", "0"))
+OPENAI_OUT_COST_PER_1K = float(os.getenv("OPENAI_COST_OUT_PER_1K", "0"))
+
+def _safe_write_jsonl(path: Path, record: dict):
+    try:
+        with open(path, "a", encoding="utf-8") as f:
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+    except Exception as e:
+        print("[Log] write failed:", e)
+
+def _calc_cost_usd(prompt_toks, completion_toks):
+    if prompt_toks is None or completion_toks is None:
         return None
+    return (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
+
+# ----------------- Modified to return (text, usage_dict) -----------------
+def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
+    if not LLM_AVAILABLE:
+        return None, None
     client = OpenAI(api_key=OPENAI_API_KEY)
     model = model or OPENAI_MODEL
     SYSTEM_PROMPT = (
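Note on the hunk above: _safe_write_jsonl appends one JSON object per line and swallows write errors (printing them instead), so a failed disk write cannot break a chat turn. A quick usage sketch of the same pattern, with purely illustrative field values mirroring a subset of the schema logged later in this diff:

# --- sketch: what a rag_logs.jsonl line looks like (illustrative values) ---
import json
from pathlib import Path

record = {"run_id": "demo-123", "ts": 1700000000000,
          "inputs": {"question": "What is MMR?", "top_k": 8},
          "latency_ms_total": 412, "openai": None}
with open(Path("rag_logs.jsonl"), "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")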
@@ -573,9 +592,19 @@ def synthesize_with_llm(question: str, sentence_lines: List[str], model: str = None, temperature: float = 0.2):
             ],
             temperature=temperature,
         )
-        return getattr(resp, "output_text", None) or str(resp)
+        out_text = getattr(resp, "output_text", None) or str(resp)
+        usage = None
+        try:
+            u = getattr(resp, "usage", None)
+            if u:
+                pt = getattr(u, "prompt_tokens", None) if hasattr(u, "prompt_tokens") else u.get("prompt_tokens", None)
+                ct = getattr(u, "completion_tokens", None) if hasattr(u, "completion_tokens") else u.get("completion_tokens", None)
+                usage = {"prompt_tokens": pt, "completion_tokens": ct}
+        except Exception:
+            usage = None
+        return out_text, usage
     except Exception:
-        return None
+        return None, None

 def rag_reply(
     question: str,
@@ -590,41 +619,139 @@ def rag_reply(
     w_bm25: float = W_BM25_DEFAULT,
     w_emb: float = W_EMB_DEFAULT
 ) -> str:
+    run_id = str(uuid.uuid4())
+    t0_total = time.time()
+    t0_retr = time.time()
+
+    # --- Retrieval ---
     hits = hybrid_search(question, k=k, w_tfidf=w_tfidf, w_bm25=w_bm25, w_emb=w_emb)
-
-
+    t1_retr = time.time()
+    latency_ms_retriever = int((t1_retr - t0_retr) * 1000)

+    if hits is None or hits.empty:
+        final = "No indexed PDFs found. Upload PDFs to the 'papers/' folder and reload the Space."
+        record = {
+            "run_id": run_id,
+            "ts": int(time.time()*1000),
+            "inputs": {
+                "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
+                "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
+                "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
+            },
+            "retrieval": {"hits": [], "latency_ms_retriever": latency_ms_retriever},
+            "output": {"final_answer": final, "used_sentences": []},
+            "latency_ms_total": int((time.time()-t0_total)*1000),
+            "openai": None
+        }
+        _safe_write_jsonl(LOG_PATH, record)
+        return final
+
+    # Select sentences
     selected = mmr_select_sentences(question, hits, top_n=int(n_sentences), pool_per_chunk=6, lambda_div=0.7)
     header_cites = "; ".join(f"{Path(r['doc_path']).name} (p.{_extract_page(r['text'])})" for _, r in hits.head(6).iterrows())
     srcs = {Path(r['doc_path']).name for _, r in hits.iterrows()}
     coverage_note = "" if len(srcs) >= 3 else f"\n\n> Note: Only {len(srcs)} unique source(s) contributed. Add more PDFs or increase Top-K."

+    # Prepare retrieval list for logging
+    retr_list = []
+    for _, r in hits.iterrows():
+        retr_list.append({
+            "doc": Path(r["doc_path"]).name,
+            "page": _extract_page(r["text"]),
+            "score_tfidf": float(r.get("score_tfidf", 0.0)),
+            "score_bm25": float(r.get("score_bm25", 0.0)),
+            "score_dense": float(r.get("score_dense", 0.0)),
+            "combo_score": float(r.get("score", 0.0)),
+        })
+
+    # Strict quotes only (no LLM)
     if strict_quotes_only:
         if not selected:
-
-
-
-
-
-
-
+            final = f"**Quoted Passages:**\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2]) + f"\n\n**Citations:** {header_cites}{coverage_note}"
+        else:
+            final = "**Quoted Passages:**\n- " + "\n- ".join(f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected)
+            final += f"\n\n**Citations:** {header_cites}{coverage_note}"
+        if include_passages:
+            final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+
+        record = {
+            "run_id": run_id,
+            "ts": int(time.time()*1000),
+            "inputs": {
+                "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
+                "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
+                "use_llm": False, "model": None, "temperature": float(temperature)
+            },
+            "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
+            "output": {
+                "final_answer": final,
+                "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
+            },
+            "latency_ms_total": int((time.time()-t0_total)*1000),
+            "openai": None
+        }
+        _safe_write_jsonl(LOG_PATH, record)
+        return final
+
+    # Extractive or LLM synthesis
     extractive = compose_extractive(selected)
+    llm_usage = None
+    llm_latency_ms = None
     if use_llm and selected:
         lines = [f"{s['sent']} ({s['doc']}, p.{s['page']})" for s in selected]
-
+        t0_llm = time.time()
+        llm_text, llm_usage = synthesize_with_llm(question, lines, model=model, temperature=temperature)
+        t1_llm = time.time()
+        llm_latency_ms = int((t1_llm - t0_llm) * 1000)
+
         if llm_text:
-
+            final = f"**Answer (LLM synthesis):** {llm_text}\n\n**Citations:** {header_cites}{coverage_note}"
             if include_passages:
-
-
-
-
-
-
-
-
-
-
+                final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+        else:
+            if not extractive:
+                final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+            else:
+                final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
+                if include_passages:
+                    final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+    else:
+        if not extractive:
+            final = f"**Answer:** Here are relevant passages.\n\n**Citations:** {header_cites}{coverage_note}\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+        else:
+            final = f"**Answer:** {extractive}\n\n**Citations:** {header_cites}{coverage_note}"
+            if include_passages:
+                final += "\n\n---\n" + "\n\n".join(hits['text'].tolist()[:2])
+
+    # --------- Log full run ---------
+    prompt_toks = llm_usage.get("prompt_tokens") if llm_usage else None
+    completion_toks = llm_usage.get("completion_tokens") if llm_usage else None
+    cost_usd = _calc_cost_usd(prompt_toks, completion_toks)
+
+    total_ms = int((time.time() - t0_total) * 1000)
+    record = {
+        "run_id": run_id,
+        "ts": int(time.time()*1000),
+        "inputs": {
+            "question": question, "top_k": int(k), "n_sentences": int(n_sentences),
+            "w_tfidf": float(w_tfidf), "w_bm25": float(w_bm25), "w_emb": float(w_emb),
+            "use_llm": bool(use_llm), "model": model, "temperature": float(temperature)
+        },
+        "retrieval": {"hits": retr_list, "latency_ms_retriever": latency_ms_retriever},
+        "output": {
+            "final_answer": final,
+            "used_sentences": [{"sent": s["sent"], "doc": s["doc"], "page": s["page"]} for s in selected]
+        },
+        "latency_ms_total": total_ms,
+        "latency_ms_llm": llm_latency_ms,
+        "openai": {
+            "prompt_tokens": prompt_toks,
+            "completion_tokens": completion_toks,
+            "cost_usd": cost_usd
+        } if use_llm else None
+    }
+    _safe_write_jsonl(LOG_PATH, record)
+    return final

 def rag_chat_fn(message, history, top_k, n_sentences, include_passages,
                 use_llm, model_name, temperature, strict_quotes_only,
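Note: with this hunk, every rag_reply call appends one JSON record (inputs, per-hit retrieval scores, output, latencies, token usage) to rag_logs.jsonl. A minimal sketch of summarizing that log offline, assuming pandas is available and using "artifacts/" as a stand-in for ARTIFACT_DIR, which is defined elsewhere in app.py and not shown in this diff:

# --- sketch: summarize rag_logs.jsonl (not part of the commit) ---
import json
from pathlib import Path

import pandas as pd

log_path = Path("artifacts/rag_logs.jsonl")  # assumption: ARTIFACT_DIR resolves here

rows = []
with open(log_path, encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        usage = rec.get("openai") or {}  # None when the LLM was not used
        rows.append({
            "run_id": rec["run_id"],
            "question": rec["inputs"]["question"],
            "n_hits": len(rec["retrieval"]["hits"]),
            "latency_ms_total": rec["latency_ms_total"],
            "cost_usd": usage.get("cost_usd"),
        })

print(pd.DataFrame(rows).sort_values("latency_ms_total", ascending=False).head())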
@@ -664,7 +791,7 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
 .gr-checkbox label, .gr-check-radio label { pointer-events: auto !important; cursor: pointer; }
 #rag-tab input[type="checkbox"] { accent-color: #60a5fa !important; }

-/* RAG tab
+/* RAG tab styling */
 #rag-tab .block, #rag-tab .group, #rag-tab .accordion {
   background: linear-gradient(160deg, #1f2937 0%, #14532d 55%, #0b3b68 100%) !important;
   border-radius: 12px;
@@ -691,6 +818,35 @@ input[type="checkbox"], .gr-checkbox, .gr-checkbox > * { pointer-events: auto !important; }
   color: #eef6ff !important;
 }

+/* NEW — Evaluate tab dark/high-contrast styling */
+#eval-tab .block, #eval-tab .group, #eval-tab .accordion {
+  background: linear-gradient(165deg, #0a0f1f 0%, #0d1a31 60%, #0a1c2e 100%) !important;
+  border-radius: 12px;
+  border: 1px solid rgba(139, 197, 255, 0.28);
+}
+#eval-tab label, #eval-tab .markdown, #eval-tab .prose, #eval-tab p, #eval-tab span {
+  color: #e6f2ff !important;
+}
+#eval-tab input, #eval-tab .gr-file, #eval-tab .scroll-hide, #eval-tab textarea, #eval-tab select {
+  background: rgba(8, 13, 26, 0.9) !important;
+  border: 1px solid #3b82f6 !important;
+  color: #dbeafe !important;
+}
+#eval-tab input[type="range"] { accent-color: #22c55e !important; }
+#eval-tab button {
+  border-radius: 10px !important;
+  font-weight: 700 !important;
+  background: #0ea5e9 !important;
+  color: #001321 !important;
+  border: 1px solid #7dd3fc !important;
+}
+#eval-tab .gr-json, #eval-tab .markdown pre, #eval-tab .markdown code {
+  background: rgba(2, 6, 23, 0.85) !important;
+  color: #e2e8f0 !important;
+  border: 1px solid rgba(148, 163, 184, 0.3) !important;
+  border-radius: 10px !important;
+}
+
 /* Predictor output emphasis */
 #pred-out .wrap { font-size: 20px; font-weight: 700; color: #ecfdf5; }
 """
@@ -826,6 +982,62 @@ with gr.Blocks(css=CSS, theme=theme, fill_height=True) as demo:
         description="Hybrid retrieval with diversity. Answers carry inline (Doc, p.X) citations."
     )

+    # ====== Evaluate (Gold vs Logs) — darker, higher-contrast ======
+    with gr.Tab("📏 Evaluate (Gold vs Logs)", elem_id="eval-tab"):
+        gr.Markdown("Upload your **gold.csv** and compute metrics against the app logs.")
+        with gr.Row():
+            gold_file = gr.File(label="gold.csv", file_types=[".csv"], interactive=True)
+            k_slider = gr.Slider(3, 12, value=8, step=1, label="k for Hit/Recall/nDCG")
+        with gr.Row():
+            btn_eval = gr.Button("Compute Metrics", variant="primary")
+        with gr.Row():
+            out_perq = gr.File(label="Per-question metrics (CSV)")
+            out_agg = gr.File(label="Aggregate metrics (JSON)")
+        out_json = gr.JSON(label="Aggregate summary")
+        out_log = gr.Markdown(label="Run log")
+
+        def _run_eval_inproc(gold_path: str, k: int = 8):
+            import json as _json
+            out_dir = str(ARTIFACT_DIR)
+            logs = str(LOG_PATH)
+            cmd = [
+                sys.executable, "rag_eval_metrics.py",
+                "--gold_csv", gold_path,
+                "--logs_jsonl", logs,
+                "--k", str(k),
+                "--out_dir", out_dir
+            ]
+            try:
+                p = subprocess.run(cmd, capture_output=True, text=True, check=False)
+                stdout = p.stdout or ""
+                stderr = p.stderr or ""
+                perq = ARTIFACT_DIR / "metrics_per_question.csv"
+                agg = ARTIFACT_DIR / "metrics_aggregate.json"
+                agg_json = {}
+                if agg.exists():
+                    agg_json = _json.loads(agg.read_text(encoding="utf-8"))
+                report = "```\n" + (stdout.strip() or "(no stdout)") + ("\n" + stderr.strip() if stderr else "") + "\n```"
+                return (str(perq) if perq.exists() else None,
+                        str(agg) if agg.exists() else None,
+                        agg_json,
+                        report)
+            except Exception as e:
+                return (None, None, {}, f"**Eval error:** {e}")
+
+        def _eval_wrapper(gf, k):
+            from pathlib import Path
+            if gf is None:
+                default_gold = Path("gold.csv")
+                if not default_gold.exists():
+                    return None, None, {}, "**No gold.csv provided or found in repo root.**"
+                gold_path = str(default_gold)
+            else:
+                gold_path = gf.name
+            return _run_eval_inproc(gold_path, int(k))
+
+        btn_eval.click(_eval_wrapper, inputs=[gold_file, k_slider],
+                       outputs=[out_perq, out_agg, out_json, out_log])
+
 # ------------- Launch -------------
 if __name__ == "__main__":
     demo.queue().launch()
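Note: the Evaluate tab shells out to rag_eval_metrics.py, which is not part of this commit, so the exact gold.csv schema and metric definitions live there. Purely as an illustration of the Hit/Recall/nDCG@k family the slider refers to, here is a generic nDCG@k over one logged hit list, under the assumption that a (doc, page) pair counts as relevant when it appears in the gold set:

# --- sketch: generic nDCG@k (illustrative, not the script's actual code) ---
import math
from typing import List, Set, Tuple

def ndcg_at_k(ranked: List[Tuple[str, int]], gold: Set[Tuple[str, int]], k: int) -> float:
    """ranked: (doc, page) pairs in retrieval order; gold: the relevant pairs."""
    gains = [1.0 if pair in gold else 0.0 for pair in ranked[:k]]
    dcg = sum(g / math.log2(i + 2) for i, g in enumerate(gains))
    ideal = sum(1.0 / math.log2(i + 2) for i in range(min(len(gold), k)))
    return dcg / ideal if ideal else 0.0

ranked = [("paper1.pdf", 3), ("paper2.pdf", 7), ("paper1.pdf", 4)]
gold = {("paper1.pdf", 3), ("paper1.pdf", 4)}
print(f"nDCG@3 = {ndcg_at_k(ranked, gold, 3):.3f}")  # 0.920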
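One operational detail worth calling out: OPENAI_COST_IN_PER_1K and OPENAI_COST_OUT_PER_1K default to "0", so cost_usd in the logs stays 0.0 until you set them (e.g. as Space variables) to your model's per-1K-token rates. The rates below are placeholders, not real prices; the arithmetic mirrors _calc_cost_usd from the diff:

# --- sketch: what _calc_cost_usd returns for given rates (placeholder rates) ---
OPENAI_IN_COST_PER_1K = 0.0005    # hypothetical USD per 1K prompt tokens
OPENAI_OUT_COST_PER_1K = 0.0015   # hypothetical USD per 1K completion tokens

prompt_toks, completion_toks = 1200, 300
cost_usd = (prompt_toks / 1000.0) * OPENAI_IN_COST_PER_1K + (completion_toks / 1000.0) * OPENAI_OUT_COST_PER_1K
print(f"${cost_usd:.5f}")  # $0.00105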