Spaces:
Sleeping
Sleeping
Add reranking + RAGAS evaluation
Browse files
app.py
CHANGED
|
@@ -1,15 +1,21 @@
|
|
|
|
|
| 1 |
import re
|
| 2 |
import time
|
|
|
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import plotly.express as px
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
|
| 8 |
from rag_chain import (
|
| 9 |
retrieve_docs, stream_llm, query, add_to_kb, vectorstore_exists,
|
| 10 |
get_all_philosophers, get_kb_stats, get_umap_data,
|
| 11 |
)
|
| 12 |
-
from config import
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# ---------------------------------------------------------------------------
|
| 15 |
# Display helpers
|
|
@@ -86,14 +92,22 @@ def _format_retrieved_chunks(docs: list, scores: list[float]) -> str:
|
|
| 86 |
if not docs:
|
| 87 |
return "_No chunks retrieved._"
|
| 88 |
|
| 89 |
-
|
| 90 |
-
avg = sum(
|
| 91 |
has_bm25 = any(s < 0 for s in scores)
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
lines = [
|
| 95 |
f"**{len(docs)} chunks** Β· {method}"
|
| 96 |
-
f" Β·
|
| 97 |
]
|
| 98 |
for i, (doc, score) in enumerate(zip(docs, scores), 1):
|
| 99 |
phil = doc.metadata.get("philosopher", "?")
|
|
@@ -288,6 +302,109 @@ def build_umap_plot():
|
|
| 288 |
return fig
|
| 289 |
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
# ---------------------------------------------------------------------------
|
| 292 |
# UI
|
| 293 |
# ---------------------------------------------------------------------------
|
|
@@ -332,8 +449,8 @@ with gr.Blocks(title="Philosopher Chat") as demo:
|
|
| 332 |
# π Philosopher Chat
|
| 333 |
**RAG chatbot grounded in Western philosophical primary texts**
|
| 334 |
|
| 335 |
-
Hybrid
|
| 336 |
-
Β· Multi-provider LLM routing Β· 12 primary texts Β· ~5 700 chunks
|
| 337 |
"""
|
| 338 |
)
|
| 339 |
|
|
@@ -395,7 +512,8 @@ Hybrid BM25 + Semantic retrieval · Real-time streaming
|
|
| 395 |
with gr.Group():
|
| 396 |
gr.Markdown("**βΉοΈ Stack**", elem_classes="section-label")
|
| 397 |
gr.Markdown(
|
| 398 |
-
"- Retrieval: **Hybrid
|
|
|
|
| 399 |
"- Embeddings: **EmbeddingGemma-300M**\n"
|
| 400 |
"- Vector DB: **ChromaDB**\n"
|
| 401 |
"- Framework: **LangChain LCEL**\n"
|
|
@@ -492,6 +610,30 @@ Hybrid BM25 + Semantic retrieval · Real-time streaming
|
|
| 492 |
elem_classes="status-box",
|
| 493 |
)
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
# ── Event wiring ─────────────────────────────────────────────────────
|
| 496 |
|
| 497 |
msg_input.submit(
|
|
@@ -516,6 +658,8 @@ Hybrid BM25 + Semantic retrieval · Real-time streaming
|
|
| 516 |
outputs=[upload_status, philosopher_filter],
|
| 517 |
).then(refresh_kb, outputs=kb_display)
|
| 518 |
|
|
|
|
|
|
|
| 519 |
|
| 520 |
def _auto_ingest() -> None:
|
| 521 |
"""Trigger background KB build on first Spaces run (non-blocking)."""
|
|
|
|
| 1 |
+
import json
|
| 2 |
import re
|
| 3 |
import time
|
| 4 |
+
from pathlib import Path
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
import plotly.express as px
|
| 8 |
+
import plotly.graph_objects as go
|
| 9 |
import pandas as pd
|
| 10 |
|
| 11 |
from rag_chain import (
|
| 12 |
retrieve_docs, stream_llm, query, add_to_kb, vectorstore_exists,
|
| 13 |
get_all_philosophers, get_kb_stats, get_umap_data,
|
| 14 |
)
|
| 15 |
+
from config import (
|
| 16 |
+
LLM_OPTIONS, DEFAULT_LLM, EMBEDDING_OPTIONS, DEFAULT_EMBEDDING,
|
| 17 |
+
USE_RERANKER, RERANKER_MODEL,
|
| 18 |
+
)
|
| 19 |
|
| 20 |
# ---------------------------------------------------------------------------
|
| 21 |
# Display helpers
|
|
|
|
| 92 |
if not docs:
|
| 93 |
return "_No chunks retrieved._"
|
| 94 |
|
| 95 |
+
pos_scores = [s for s in scores if s >= 0]
|
| 96 |
+
avg = sum(pos_scores) / len(pos_scores) if pos_scores else 0.0
|
| 97 |
has_bm25 = any(s < 0 for s in scores)
|
| 98 |
+
if USE_RERANKER:
|
| 99 |
+
method = "Hybrid (RRF) β Cross-Encoder Rerank"
|
| 100 |
+
score_label = "avg relevance"
|
| 101 |
+
elif has_bm25:
|
| 102 |
+
method = "Hybrid BM25 + Semantic"
|
| 103 |
+
score_label = "avg similarity"
|
| 104 |
+
else:
|
| 105 |
+
method = "Semantic"
|
| 106 |
+
score_label = "avg similarity"
|
| 107 |
|
| 108 |
lines = [
|
| 109 |
f"**{len(docs)} chunks** Β· {method}"
|
| 110 |
+
f" Β· {score_label}: **{avg:.3f}**\n"
|
| 111 |
]
|
| 112 |
for i, (doc, score) in enumerate(zip(docs, scores), 1):
|
| 113 |
phil = doc.metadata.get("philosopher", "?")
|
|
|
|
| 302 |
return fig
|
| 303 |
|
| 304 |
|
| 305 |
+
# ---------------------------------------------------------------------------
|
| 306 |
+
# RAGAS evaluation results
|
| 307 |
+
# ---------------------------------------------------------------------------
|
| 308 |
+
|
| 309 |
+
# Results file produced by the offline evaluation run; looked up next to this module.
_EVAL_PATH = Path(__file__).parent.joinpath("eval_results.json")

# Metric key (as stored in the results file) -> label shown in the UI.
# Insertion order is the row/bar order used by the table and the chart.
_METRIC_LABELS = dict(
    faithfulness="Faithfulness",
    answer_relevancy="Answer Relevancy",
    context_precision="Context Precision",
    context_recall="Context Recall",
)

# Metric key -> one-line explanation rendered under the label in the table.
_METRIC_DESC = dict(
    faithfulness="Share of answer claims supported by retrieved context (anti-hallucination)",
    answer_relevancy="How directly the answer addresses the question",
    context_precision="Are the relevant chunks ranked near the top?",
    context_recall="Share of the reference answer covered by retrieved context",
)
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def _load_eval() -> dict | None:
|
| 325 |
+
if not _EVAL_PATH.exists():
|
| 326 |
+
return None
|
| 327 |
+
try:
|
| 328 |
+
return json.loads(_EVAL_PATH.read_text(encoding="utf-8"))
|
| 329 |
+
except Exception:
|
| 330 |
+
return None
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
def _eval_score(scores: object, metric: str) -> float:
    """Return ``scores[metric]`` as a float, or 0.0 when absent or non-numeric."""
    value = scores.get(metric) if isinstance(scores, dict) else None
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def build_eval_table() -> str:
    """Render the evaluation results as a Markdown comparison table.

    The first two entries of ``configs`` (in file order) are compared; each
    row shows both scores and their delta with a coloured marker.

    Returns:
        Markdown text: a hint to run ``evaluate.py`` when no results can be
        loaded, a short notice when the file does not hold two
        configurations, or otherwise a header line, the metric table and a
        footer.
    """
    data = _load_eval()
    if data is None:
        return (
            "_No evaluation results yet. Run_ `python evaluate.py` _to generate "
            "`eval_results.json` (RAGAS metrics, ~12 min)._"
        )

    configs = data.get("configs")
    if not isinstance(configs, dict) or len(configs) < 2:
        # This runs while the UI is being built, so a malformed results file
        # must degrade to a message instead of raising KeyError/IndexError.
        return (
            "_`eval_results.json` does not contain two configurations to "
            "compare. Re-run_ `python evaluate.py`_._"
        )
    base, rer = list(configs)[:2]

    deltas = data.get("deltas")
    if not isinstance(deltas, dict):
        deltas = {}
    meta = data.get("metadata")
    if not isinstance(meta, dict):
        meta = {}

    lines = [
        f"**Evaluated with `{meta.get('framework', 'ragas')}`** · "
        f"{meta.get('n_questions', '?')} questions "
        f" · judge: `{meta.get('judge_model', '?')}` "
        f" · reranker: `{meta.get('reranker_model', '?')}`\n",
        f"| Metric | {base} | {rer} | Δ |",
        "|---|:---:|:---:|:---:|",
    ]
    for metric, label in _METRIC_LABELS.items():
        b = _eval_score(configs[base], metric)
        r = _eval_score(configs[rer], metric)
        # Prefer the stored delta; otherwise derive it from the two columns.
        # NOTE(review): assumes delta means second config minus first — confirm
        # against evaluate.py.
        d = _eval_score(deltas, metric) if metric in deltas else r - b
        # +/-0.005 dead band: changes that small are shown as neutral.
        arrow = "🟢" if d > 0.005 else ("🔴" if d < -0.005 else "⚪")
        lines.append(
            f"| **{label}**<br><sub>{_METRIC_DESC[metric]}</sub> "
            f"| {b:.3f} | {r:.3f} | {arrow} {d:+.3f} |"
        )
    lines.append(
        f"\n_Generated {meta.get('generated_at', '?')} · "
        "computed with the [RAGAS](https://docs.ragas.io) library "
        "(LLM-as-judge)._"
    )
    return "\n".join(lines)
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def build_eval_chart():
    """Build a grouped bar chart of the evaluation metrics, one series per config.

    Returns:
        A Plotly figure, or ``None`` when no results can be loaded or the
        results file has no non-empty ``configs`` mapping.
    """
    data = _load_eval()
    configs = data.get("configs") if data is not None else None
    if not isinstance(configs, dict) or not configs:
        # Also covers a results file without "configs", which previously
        # raised KeyError/IndexError while the UI was being built.
        return None

    metrics = list(_METRIC_LABELS)
    labels = [_METRIC_LABELS[m] for m in metrics]
    # Fixed colours for the first two configs; any further config gets
    # marker_color=None, exactly as before.
    colours = ("#6366F1", "#22C55E")

    fig = go.Figure()
    for idx, (cfg, scores) in enumerate(configs.items()):
        if not isinstance(scores, dict):
            scores = {}
        # Look each score up once; treat missing/null/non-numeric as 0.0 so
        # the ".2f" format below cannot fail.
        values = []
        for m in metrics:
            raw = scores.get(m, 0.0)
            is_number = isinstance(raw, (int, float)) and not isinstance(raw, bool)
            values.append(float(raw) if is_number else 0.0)
        fig.add_bar(
            name=cfg,
            x=labels,
            y=values,
            marker_color=colours[idx] if idx < len(colours) else None,
            text=[f"{v:.2f}" for v in values],
            textposition="outside",
        )
    fig.update_layout(
        barmode="group",
        template="plotly_dark",
        title="Retrieval Quality — Baseline vs Cross-Encoder Rerank",
        title_font=dict(size=14),
        height=460,
        # Upper bound slightly above 1 leaves room for the "outside" labels.
        yaxis=dict(range=[0, 1.05], title="score", gridcolor="rgba(255,255,255,0.08)"),
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        font=dict(color="rgba(220,220,220,0.9)"),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        margin=dict(l=50, r=20, t=70, b=40),
    )
    return fig
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def refresh_eval():
    """Rebuild the evaluation table and chart from the results file on disk.

    Returns:
        A pair ``(table_update, chart)``: a Gradio update carrying the new
        Markdown table, and the rebuilt chart (``None`` when unavailable).
    """
    # Table first, then chart: same evaluation order as the one-line form.
    table_update = gr.update(value=build_eval_table())
    chart = build_eval_chart()
    return table_update, chart
|
| 406 |
+
|
| 407 |
+
|
| 408 |
# ---------------------------------------------------------------------------
|
| 409 |
# UI
|
| 410 |
# ---------------------------------------------------------------------------
|
|
|
|
| 449 |
# π Philosopher Chat
|
| 450 |
**RAG chatbot grounded in Western philosophical primary texts**
|
| 451 |
|
| 452 |
+
Hybrid retrieval + cross-encoder reranking Β· Real-time streaming
|
| 453 |
+
Β· Multi-provider LLM routing Β· RAGAS-evaluated Β· 12 primary texts Β· ~5 700 chunks
|
| 454 |
"""
|
| 455 |
)
|
| 456 |
|
|
|
|
| 512 |
with gr.Group():
|
| 513 |
gr.Markdown("**βΉοΈ Stack**", elem_classes="section-label")
|
| 514 |
gr.Markdown(
|
| 515 |
+
"- Retrieval: **Hybrid (RRF) + Rerank**\n"
|
| 516 |
+
"- Reranker: **BGE-reranker-v2-m3**\n"
|
| 517 |
"- Embeddings: **EmbeddingGemma-300M**\n"
|
| 518 |
"- Vector DB: **ChromaDB**\n"
|
| 519 |
"- Framework: **LangChain LCEL**\n"
|
|
|
|
| 610 |
elem_classes="status-box",
|
| 611 |
)
|
| 612 |
|
| 613 |
+
# ── Tab 4 — Evaluation ───────────────────────────────────────────
|
| 614 |
+
with gr.Tab("π Evaluation"):
|
| 615 |
+
gr.Markdown(
|
| 616 |
+
"### Does reranking actually help?\n"
|
| 617 |
+
"The retrieval pipeline is measured with four **RAGAS** metrics "
|
| 618 |
+
"over a curated question set with reference answers β once with the "
|
| 619 |
+
"cross-encoder reranker **off** (hybrid baseline) and once **on**. "
|
| 620 |
+
"This quantifies the impact of each retrieval component instead of "
|
| 621 |
+
"guessing. _(Computed offline by_ `evaluate.py`_; an LLM acts as judge.)_"
|
| 622 |
+
)
|
| 623 |
+
with gr.Row(equal_height=False):
|
| 624 |
+
with gr.Column(scale=1):
|
| 625 |
+
eval_table = gr.Markdown(build_eval_table())
|
| 626 |
+
with gr.Column(scale=1):
|
| 627 |
+
eval_chart = gr.Plot(build_eval_chart())
|
| 628 |
+
refresh_eval_btn = gr.Button("β» Reload results", size="sm")
|
| 629 |
+
gr.Markdown(
|
| 630 |
+
"**Metric definitions** Β· "
|
| 631 |
+
"**Faithfulness**: answer grounded in context (anti-hallucination) Β· "
|
| 632 |
+
"**Answer Relevancy**: answer addresses the question Β· "
|
| 633 |
+
"**Context Precision**: relevant chunks ranked high Β· "
|
| 634 |
+
"**Context Recall**: reference answer covered by context."
|
| 635 |
+
)
|
| 636 |
+
|
| 637 |
# ── Event wiring ─────────────────────────────────────────────────────
|
| 638 |
|
| 639 |
msg_input.submit(
|
|
|
|
| 658 |
outputs=[upload_status, philosopher_filter],
|
| 659 |
).then(refresh_kb, outputs=kb_display)
|
| 660 |
|
| 661 |
+
refresh_eval_btn.click(refresh_eval, outputs=[eval_table, eval_chart])
|
| 662 |
+
|
| 663 |
|
| 664 |
def _auto_ingest() -> None:
|
| 665 |
"""Trigger background KB build on first Spaces run (non-blocking)."""
|