fikri0o0 committed on
Commit
7959ec4
·
verified ·
1 Parent(s): 0be220a

Add reranking + RAGAS evaluation

Browse files
Files changed (1) hide show
  1. app.py +152 -8
app.py CHANGED
@@ -1,15 +1,21 @@
 
1
  import re
2
  import time
 
3
 
4
  import gradio as gr
5
  import plotly.express as px
 
6
  import pandas as pd
7
 
8
  from rag_chain import (
9
  retrieve_docs, stream_llm, query, add_to_kb, vectorstore_exists,
10
  get_all_philosophers, get_kb_stats, get_umap_data,
11
  )
12
- from config import LLM_OPTIONS, DEFAULT_LLM, EMBEDDING_OPTIONS, DEFAULT_EMBEDDING
 
 
 
13
 
14
  # ---------------------------------------------------------------------------
15
  # Display helpers
@@ -86,14 +92,22 @@ def _format_retrieved_chunks(docs: list, scores: list[float]) -> str:
86
  if not docs:
87
  return "_No chunks retrieved._"
88
 
89
- semantic_scores = [s for s in scores if s >= 0]
90
- avg = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0.0
91
  has_bm25 = any(s < 0 for s in scores)
92
- method = "Hybrid BM25 + Semantic" if has_bm25 else "Semantic"
 
 
 
 
 
 
 
 
93
 
94
  lines = [
95
  f"**{len(docs)} chunks** &nbsp;Β·&nbsp; {method}"
96
- f" &nbsp;Β·&nbsp; avg similarity: **{avg:.3f}**\n"
97
  ]
98
  for i, (doc, score) in enumerate(zip(docs, scores), 1):
99
  phil = doc.metadata.get("philosopher", "?")
@@ -288,6 +302,109 @@ def build_umap_plot():
288
  return fig
289
 
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  # ---------------------------------------------------------------------------
292
  # UI
293
  # ---------------------------------------------------------------------------
@@ -332,8 +449,8 @@ with gr.Blocks(title="Philosopher Chat") as demo:
332
  # πŸ“š Philosopher Chat
333
  **RAG chatbot grounded in Western philosophical primary texts**
334
 
335
- Hybrid BM25 + Semantic retrieval &nbsp;Β·&nbsp; Real-time streaming
336
- &nbsp;Β·&nbsp; Multi-provider LLM routing &nbsp;Β·&nbsp; 12 primary texts Β· ~5 700 chunks
337
  """
338
  )
339
 
@@ -395,7 +512,8 @@ Hybrid BM25 + Semantic retrieval &nbsp;Β·&nbsp; Real-time streaming
395
  with gr.Group():
396
  gr.Markdown("**ℹ️ Stack**", elem_classes="section-label")
397
  gr.Markdown(
398
- "- Retrieval: **Hybrid BM25 + Semantic**\n"
 
399
  "- Embeddings: **EmbeddingGemma-300M**\n"
400
  "- Vector DB: **ChromaDB**\n"
401
  "- Framework: **LangChain LCEL**\n"
@@ -492,6 +610,30 @@ Hybrid BM25 + Semantic retrieval &nbsp;Β·&nbsp; Real-time streaming
492
  elem_classes="status-box",
493
  )
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  # ── Event wiring ─────────────────────────────────────────────────────
496
 
497
  msg_input.submit(
@@ -516,6 +658,8 @@ Hybrid BM25 + Semantic retrieval &nbsp;Β·&nbsp; Real-time streaming
516
  outputs=[upload_status, philosopher_filter],
517
  ).then(refresh_kb, outputs=kb_display)
518
 
 
 
519
 
520
  def _auto_ingest() -> None:
521
  """Trigger background KB build on first Spaces run (non-blocking)."""
 
1
+ import json
2
  import re
3
  import time
4
+ from pathlib import Path
5
 
6
  import gradio as gr
7
  import plotly.express as px
8
+ import plotly.graph_objects as go
9
  import pandas as pd
10
 
11
  from rag_chain import (
12
  retrieve_docs, stream_llm, query, add_to_kb, vectorstore_exists,
13
  get_all_philosophers, get_kb_stats, get_umap_data,
14
  )
15
+ from config import (
16
+ LLM_OPTIONS, DEFAULT_LLM, EMBEDDING_OPTIONS, DEFAULT_EMBEDDING,
17
+ USE_RERANKER, RERANKER_MODEL,
18
+ )
19
 
20
  # ---------------------------------------------------------------------------
21
  # Display helpers
 
92
  if not docs:
93
  return "_No chunks retrieved._"
94
 
95
+ pos_scores = [s for s in scores if s >= 0]
96
+ avg = sum(pos_scores) / len(pos_scores) if pos_scores else 0.0
97
  has_bm25 = any(s < 0 for s in scores)
98
+ if USE_RERANKER:
99
+ method = "Hybrid (RRF) β†’ Cross-Encoder Rerank"
100
+ score_label = "avg relevance"
101
+ elif has_bm25:
102
+ method = "Hybrid BM25 + Semantic"
103
+ score_label = "avg similarity"
104
+ else:
105
+ method = "Semantic"
106
+ score_label = "avg similarity"
107
 
108
  lines = [
109
  f"**{len(docs)} chunks** &nbsp;Β·&nbsp; {method}"
110
+ f" &nbsp;Β·&nbsp; {score_label}: **{avg:.3f}**\n"
111
  ]
112
  for i, (doc, score) in enumerate(zip(docs, scores), 1):
113
  phil = doc.metadata.get("philosopher", "?")
 
302
  return fig
303
 
304
 
305
+ # ---------------------------------------------------------------------------
306
+ # RAGAS evaluation results
307
+ # ---------------------------------------------------------------------------
308
+
309
+ _EVAL_PATH = Path(__file__).parent / "eval_results.json"
310
+ _METRIC_LABELS = {
311
+ "faithfulness": "Faithfulness",
312
+ "answer_relevancy": "Answer Relevancy",
313
+ "context_precision": "Context Precision",
314
+ "context_recall": "Context Recall",
315
+ }
316
+ _METRIC_DESC = {
317
+ "faithfulness": "Share of answer claims supported by retrieved context (anti-hallucination)",
318
+ "answer_relevancy": "How directly the answer addresses the question",
319
+ "context_precision": "Are the relevant chunks ranked near the top?",
320
+ "context_recall": "Share of the reference answer covered by retrieved context",
321
+ }
322
+
323
+
324
+ def _load_eval() -> dict | None:
325
+ if not _EVAL_PATH.exists():
326
+ return None
327
+ try:
328
+ return json.loads(_EVAL_PATH.read_text(encoding="utf-8"))
329
+ except Exception:
330
+ return None
331
+
332
+
333
+ def build_eval_table() -> str:
334
+ data = _load_eval()
335
+ if data is None:
336
+ return (
337
+ "_No evaluation results yet. Run_ `python evaluate.py` _to generate "
338
+ "`eval_results.json` (RAGAS metrics, ~12 min)._"
339
+ )
340
+ cfgs = list(data["configs"].keys())
341
+ base, rer = cfgs[0], cfgs[1]
342
+ meta = data.get("metadata", {})
343
+
344
+ lines = [
345
+ f"**Evaluated with `{meta.get('framework', 'ragas')}`** &nbsp;Β·&nbsp; "
346
+ f"{meta.get('n_questions', '?')} questions "
347
+ f"&nbsp;Β·&nbsp; judge: `{meta.get('judge_model', '?')}` "
348
+ f"&nbsp;Β·&nbsp; reranker: `{meta.get('reranker_model', '?')}`\n",
349
+ f"| Metric | {base} | {rer} | Ξ” |",
350
+ "|---|:---:|:---:|:---:|",
351
+ ]
352
+ for m in _METRIC_LABELS:
353
+ b = data["configs"][base].get(m, 0.0)
354
+ r = data["configs"][rer].get(m, 0.0)
355
+ d = data["deltas"].get(m, 0.0)
356
+ arrow = "🟒" if d > 0.005 else ("πŸ”΄" if d < -0.005 else "βšͺ")
357
+ lines.append(
358
+ f"| **{_METRIC_LABELS[m]}**<br><sub>{_METRIC_DESC[m]}</sub> "
359
+ f"| {b:.3f} | {r:.3f} | {arrow} {d:+.3f} |"
360
+ )
361
+ lines.append(
362
+ f"\n_Generated {meta.get('generated_at', '?')} &nbsp;Β·&nbsp; "
363
+ "computed with the [RAGAS](https://docs.ragas.io) library "
364
+ "(LLM-as-judge)._"
365
+ )
366
+ return "\n".join(lines)
367
+
368
+
369
+ def build_eval_chart():
370
+ data = _load_eval()
371
+ if data is None:
372
+ return None
373
+ cfgs = list(data["configs"].keys())
374
+ metrics = list(_METRIC_LABELS.keys())
375
+ labels = [_METRIC_LABELS[m] for m in metrics]
376
+ palette = {cfgs[0]: "#6366F1", cfgs[1]: "#22C55E"}
377
+
378
+ fig = go.Figure()
379
+ for cfg in cfgs:
380
+ fig.add_bar(
381
+ name=cfg,
382
+ x=labels,
383
+ y=[data["configs"][cfg].get(m, 0.0) for m in metrics],
384
+ marker_color=palette.get(cfg),
385
+ text=[f"{data['configs'][cfg].get(m, 0.0):.2f}" for m in metrics],
386
+ textposition="outside",
387
+ )
388
+ fig.update_layout(
389
+ barmode="group",
390
+ template="plotly_dark",
391
+ title="Retrieval Quality β€” Baseline vs Cross-Encoder Rerank",
392
+ title_font=dict(size=14),
393
+ height=460,
394
+ yaxis=dict(range=[0, 1.05], title="score", gridcolor="rgba(255,255,255,0.08)"),
395
+ plot_bgcolor="rgba(0,0,0,0)",
396
+ paper_bgcolor="rgba(0,0,0,0)",
397
+ font=dict(color="rgba(220,220,220,0.9)"),
398
+ legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
399
+ margin=dict(l=50, r=20, t=70, b=40),
400
+ )
401
+ return fig
402
+
403
+
404
+ def refresh_eval():
405
+ return gr.update(value=build_eval_table()), build_eval_chart()
406
+
407
+
408
  # ---------------------------------------------------------------------------
409
  # UI
410
  # ---------------------------------------------------------------------------
 
449
  # πŸ“š Philosopher Chat
450
  **RAG chatbot grounded in Western philosophical primary texts**
451
 
452
+ Hybrid retrieval + cross-encoder reranking &nbsp;Β·&nbsp; Real-time streaming
453
+ &nbsp;Β·&nbsp; Multi-provider LLM routing &nbsp;Β·&nbsp; RAGAS-evaluated &nbsp;Β·&nbsp; 12 primary texts Β· ~5 700 chunks
454
  """
455
  )
456
 
 
512
  with gr.Group():
513
  gr.Markdown("**ℹ️ Stack**", elem_classes="section-label")
514
  gr.Markdown(
515
+ "- Retrieval: **Hybrid (RRF) + Rerank**\n"
516
+ "- Reranker: **BGE-reranker-v2-m3**\n"
517
  "- Embeddings: **EmbeddingGemma-300M**\n"
518
  "- Vector DB: **ChromaDB**\n"
519
  "- Framework: **LangChain LCEL**\n"
 
610
  elem_classes="status-box",
611
  )
612
 
613
+ # ── Tab 4 ─ Evaluation ───────────────────────────────────────────
614
+ with gr.Tab("πŸ“Š Evaluation"):
615
+ gr.Markdown(
616
+ "### Does reranking actually help?\n"
617
+ "The retrieval pipeline is measured with four **RAGAS** metrics "
618
+ "over a curated question set with reference answers β€” once with the "
619
+ "cross-encoder reranker **off** (hybrid baseline) and once **on**. "
620
+ "This quantifies the impact of each retrieval component instead of "
621
+ "guessing. _(Computed offline by_ `evaluate.py`_; an LLM acts as judge.)_"
622
+ )
623
+ with gr.Row(equal_height=False):
624
+ with gr.Column(scale=1):
625
+ eval_table = gr.Markdown(build_eval_table())
626
+ with gr.Column(scale=1):
627
+ eval_chart = gr.Plot(build_eval_chart())
628
+ refresh_eval_btn = gr.Button("↻ Reload results", size="sm")
629
+ gr.Markdown(
630
+ "**Metric definitions** &nbsp;Β·&nbsp; "
631
+ "**Faithfulness**: answer grounded in context (anti-hallucination) &nbsp;Β·&nbsp; "
632
+ "**Answer Relevancy**: answer addresses the question &nbsp;Β·&nbsp; "
633
+ "**Context Precision**: relevant chunks ranked high &nbsp;Β·&nbsp; "
634
+ "**Context Recall**: reference answer covered by context."
635
+ )
636
+
637
  # ── Event wiring ─────────────────────────────────────────────────────
638
 
639
  msg_input.submit(
 
658
  outputs=[upload_status, philosopher_filter],
659
  ).then(refresh_kb, outputs=kb_display)
660
 
661
+ refresh_eval_btn.click(refresh_eval, outputs=[eval_table, eval_chart])
662
+
663
 
664
  def _auto_ingest() -> None:
665
  """Trigger background KB build on first Spaces run (non-blocking)."""