# Source: Hugging Face Space file viewer (Space status: Sleeping; file size: 16,534 bytes; revision 982896e).
"""
RAG Transparency Lab — HuggingFace Gradio Space
By Zalina Dezhina, PhD | AI Evaluation Scientist
Visualises every step of a RAG pipeline on a user-uploaded scientific PDF.
"""
import os
import io
import numpy as np
import gradio as gr
import pypdf
import pandas as pd
from rag_pipeline.chunker import STRATEGIES, Chunk
from rag_pipeline.embedder import embed_texts
from rag_pipeline.retriever import retrieve
from rag_pipeline.reranker import rerank_and_filter
from rag_pipeline.generator import generate_answer
# ── State (in-memory per session via gr.State) ───────────────────────────────
def score_color(score: float) -> str:
    """Return a traffic-light emoji classifying a score.

    Args:
        score: The score to classify.

    Returns:
        "🟢" when ``score >= 0.65``, "🟡" when ``score >= 0.35``,
        otherwise "🔴".
    """
    if score >= 0.65:
        return "🟢"
    if score >= 0.35:
        return "🟡"
    return "🔴"
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page in the PDF, separated by blank lines.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string, so the number of joined segments always equals the page count.
    """
    page_texts = []
    for page in pypdf.PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n\n".join(page_texts)
# ── Tab 1: Upload & Chunk ────────────────────────────────────────────────────
def process_pdf(pdf_file, strategy_name: str):
    """Extract text from an uploaded PDF, chunk it, and summarise the result.

    Args:
        pdf_file: The uploaded file, or ``None`` when nothing was uploaded.
            Either a filesystem path string or an object exposing the path
            as ``.name`` is accepted.
        strategy_name: Key into ``STRATEGIES`` selecting the chunking function.

    Returns:
        A 3-tuple ``(summary_markdown, chunk_dataframe, state)`` where
        ``state`` is ``(text, chunks)``. When there is nothing to show
        (no upload, no extractable text, unknown strategy, or no chunks)
        the tuple is ``(warning_message, None, None)``.
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF.", None, None

    # The upload may arrive as a plain path or as a wrapper with ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return "⚠️ Could not extract text from this PDF.", None, None

    if strategy_name not in STRATEGIES:
        return "⚠️ Please choose a chunking strategy.", None, None
    chunks = STRATEGIES[strategy_name](text)
    if not chunks:
        # A DataFrame built from no rows has no "Words" column, so the
        # summary below would raise KeyError; report the problem instead.
        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

    # Build display dataframe
    df = pd.DataFrame(
        [
            {
                "ID": c.chunk_id,
                "Words": c.word_count,
                "Sentences": c.sentence_count,
                "Preview": c.preview(100),
            }
            for c in chunks
        ]
    )
    summary = (
        f"### ✅ Document processed\n"
        f"- **Strategy:** {strategy_name}\n"
        f"- **Total chunks:** {len(chunks)}\n"
        f"- **Avg words/chunk:** {df['Words'].mean():.0f}\n"
        f"- **Total words:** {df['Words'].sum()}\n\n"
        f"**Why chunking matters:** If chunks break mid-sentence or mid-argument, "
        f"retrieval will fail — the model receives incomplete evidence. "
        f"Semantic chunking preserves full reasoning units."
    )
    return summary, df, (text, chunks)
# ── Tab 2: Retrieval Explorer ────────────────────────────────────────────────
def run_retrieval(query: str, state, dense_weight: float):
    """Run hybrid retrieval for ``query`` over the chunks stored in ``state``.

    Args:
        query: The user's question; blank or ``None`` yields a warning.
        state: ``(text, chunks)`` as returned by ``process_pdf``, or ``None``
            when no document has been processed yet.
        dense_weight: Weight given to the dense score; the sparse weight is
            ``round(1.0 - dense_weight, 2)``.

    Returns:
        A 3-tuple ``(insight_markdown, score_dataframe, retrieval_state)``
        where ``retrieval_state`` is ``(embeddings, results, chunks)``.
        On any precondition failure the tuple is ``(warning, None, None)``.
    """
    if not query or not query.strip():
        return "⚠️ Please enter a question.", None, None
    if state is None:
        return "⚠️ Please process a PDF first (Tab 1).", None, None

    _text, chunks = state
    if not chunks:
        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

    embeddings = embed_texts([c.text for c in chunks])
    sparse_weight = round(1.0 - dense_weight, 2)
    results = retrieve(
        query, chunks, embeddings,
        top_k=min(10, len(chunks)),  # never request more hits than chunks
        dense_weight=dense_weight,
        sparse_weight=sparse_weight,
    )

    df = pd.DataFrame(
        [
            {
                "Rank": r.rank,
                "Chunk ID": r.chunk.chunk_id,
                "Dense 🔵": f"{score_color(r.dense_score)} {r.dense_score:.3f}",
                "Sparse 🟠": f"{score_color(r.sparse_score)} {r.sparse_score:.3f}",
                "Hybrid ⚡": f"{score_color(r.hybrid_score)} {r.hybrid_score:.3f}",
                "Preview": r.chunk.preview(90),
            }
            for r in results
        ]
    )
    insight = (
        f"### 🔍 Retrieval results for: *\"{query}\"*\n"
        f"- **Dense weight:** {dense_weight} | **Sparse (BM25) weight:** {sparse_weight}\n"
        f"- **Dense** captures semantic meaning — finds conceptually similar text\n"
        f"- **Sparse** captures exact keywords — catches specific terms\n"
        f"- **Hybrid** combines both — more robust than either alone\n\n"
        f"**Notice:** chunks with high dense but low sparse score "
        f"are semantically related but don't share your exact keywords. "
        f"Chunks with high sparse but low dense score match keywords but may be off-topic."
    )
    return insight, df, (embeddings, results, chunks)
# ── Tab 3: Reranking & Filtering ─────────────────────────────────────────────
def run_reranking(query: str, retrieval_state, threshold: float, top_n: int):
    """Rerank and filter the retrieved candidates and tabulate every decision.

    Args:
        query: The user's question; blank or ``None`` yields a warning.
        retrieval_state: ``(embeddings, results, chunks)`` as returned by
            ``run_retrieval``, or ``None`` when retrieval has not run.
        threshold: Passed to ``rerank_and_filter`` as ``score_threshold``.
        top_n: Passed to ``rerank_and_filter`` as ``top_n`` after ``int()``
            coercion.

    Returns:
        A 3-tuple ``(insight_markdown, decision_dataframe, reranked)``, or
        ``(warning, None, None)`` when a precondition fails.
    """
    if retrieval_state is None:
        return "⚠️ Run retrieval first (Tab 2).", None, None
    if not query or not query.strip():
        return "⚠️ Please enter a question.", None, None

    # Only the retrieval results are needed here.
    _embeddings, results, _chunks = retrieval_state
    reranked = rerank_and_filter(
        query, results,
        score_threshold=threshold,
        top_n=int(top_n),
    )

    df = pd.DataFrame(
        [
            {
                "Status": "✅ KEPT" if r.kept else "❌ DROPPED",
                "Chunk ID": r.chunk_id,
                "Original rank": r.original_rank,
                "New rank": r.new_rank if r.kept else "—",
                "Rerank score": f"{score_color(r.rerank_score)} {r.rerank_score:.3f}",
                "Filter reason": r.filter_reason or "—",
                "Preview": r.preview(80),
            }
            for r in reranked
        ]
    )
    kept_count = sum(1 for r in reranked if r.kept)
    dropped_count = len(reranked) - kept_count
    insight = (
        f"### ⚖️ Reranking & Filtering\n"
        f"- **Kept:** {kept_count} chunks | **Dropped:** {dropped_count} chunks\n"
        f"- **Score threshold:** {threshold} — chunks below this are removed\n\n"
        f"**Why rerank?** The initial retrieval finds candidates quickly but noisily. "
        f"Reranking re-scores using richer signals (keyword overlap + semantic score). "
        f"Filtering removes low-quality and duplicate chunks before they reach the LLM.\n\n"
        f"**Key insight:** Passing noisy chunks to the LLM is the #1 cause of "
        f"hallucination in RAG systems. Clean context = sharper answers."
    )
    return insight, df, reranked
# ── Tab 4: Final Answer ──────────────────────────────────────────────────────
def run_generation(query: str, rerank_state, api_key: str):
    """Generate the final answer from the chunks that survived filtering.

    Args:
        query: The user's question; blank or ``None`` yields a warning.
        rerank_state: The reranked results from ``run_reranking``, or
            ``None`` when reranking has not run.
        api_key: API key text; it is stripped, and an empty or ``None``
            value is forwarded to ``generate_answer`` as ``None``.

    Returns:
        A 3-tuple ``(answer, sources_markdown, prompt_markdown)``. When a
        precondition fails, the first element is a warning and the other two
        are empty strings.
    """
    if rerank_state is None:
        return "⚠️ Run reranking first (Tab 3).", "", ""
    if not query or not query.strip():
        return "⚠️ Please enter a question.", "", ""

    kept = [r for r in rerank_state if r.kept]
    if not kept:
        return (
            "⚠️ No chunks passed the filters. "
            "Try lowering the threshold in Tab 3 or rephrasing your question.",
            "", ""
        )

    answer, prompt = generate_answer(
        query, kept, api_key=(api_key or "").strip() or None
    )

    # Assemble the excerpt list in one join rather than repeated "+=".
    parts = ["### 📚 Source excerpts used\n\n"]
    for i, r in enumerate(kept, 1):
        suffix = "..." if len(r.text) > 200 else ""
        parts.append(
            f"**[Excerpt {i}]** (Chunk {r.chunk_id}, score {r.rerank_score:.3f})\n"
        )
        parts.append(f"> {r.text[:200]}{suffix}\n\n")
    sources_md = "".join(parts)

    prompt_display = f"```\n{prompt}\n```"
    return answer, sources_md, prompt_display
# ── Gradio UI ────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks(css=CSS): colour classes for
# high / mid / low scores.
CSS = """
.score-high { color: #22c55e; font-weight: 500; }
.score-mid { color: #f59e0b; }
.score-low { color: #ef4444; }
"""
# Page header markdown. Blank lines separate paragraphs so that the text
# before the closing "---" is not rendered as a setext heading.
_HEADER_MD = """
# 🔬 RAG Transparency Lab

**See inside every step of a RAG pipeline — applied to scientific papers.**

Most RAG demos show you only the final answer. This tool shows you *why* that answer is good or bad — by exposing chunking, retrieval scores, reranking decisions, and the exact prompt sent to the LLM.

Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina) · AI Evaluation Scientist

*Part of the RAG Education Series — Project 1 of 3*

---
"""

# Content of the "About & Methodology" tab.
_ABOUT_MD = """
## Why RAG Transparency Matters

Most RAG tutorials show you the output. This tool shows you the *pipeline* — because the output quality is determined entirely by what happens before the LLM sees any text.

### The 4 stages explained

**Stage 1 — Chunking**
Documents must be split into pieces small enough to retrieve but large enough to contain meaningful context. Fixed-size chunking is fast but breaks sentences. Semantic chunking preserves argument structure.

**Stage 2 — Hybrid Retrieval**
Dense retrieval (embeddings) captures semantic similarity. Sparse retrieval (BM25) captures exact keyword matches. Neither alone is sufficient — hybrid is the production standard.

**Stage 3 — Reranking & Filtering**
Initial retrieval casts a wide net. Reranking re-scores candidates with richer signals. Filtering removes low-quality chunks and near-duplicates. This is the stage most tutorials skip — and it's where most hallucinations originate.

**Stage 4 — Grounded Generation**
The LLM receives only the filtered, ranked excerpts — constrained to cite its sources. The prompt is shown in full so you can see exactly what the model was asked.

---

## RAG Education Series

This is **Project 1 of 3**:

1. 🔬 RAG Transparency Lab — you are here
2. ⚡ Classic vs Advanced RAG — side-by-side comparison
3. 🧪 Scientific Claim Verifier — hallucination detection on research papers

---

Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina)
AI Evaluation Scientist | RLHF Specialist | Computational Neuroscientist
[GitHub](https://github.com/Mioulin) · dezhina@gmail.com
"""

# Build the UI: two shared inputs (API key, question), three gr.State slots
# that carry each stage's output to the next, and one tab per pipeline stage.
with gr.Blocks(
    title="RAG Transparency Lab",
    theme=gr.themes.Soft(primary_hue="slate", secondary_hue="blue"),
    css=CSS,
) as demo:
    # ── Header ───────────────────────────────────────────────────────────
    gr.Markdown(_HEADER_MD)

    # ── Global inputs ────────────────────────────────────────────────────
    with gr.Row():
        api_key_input = gr.Textbox(
            label="🔑 Anthropic API Key (for Tab 4)",
            placeholder="sk-ant-...",
            type="password",
            scale=2,
        )
        question_input = gr.Textbox(
            label="❓ Your question about the paper",
            placeholder="What is the main finding of this paper?",
            scale=3,
        )

    # ── State ────────────────────────────────────────────────────────────
    chunk_state = gr.State(None)      # written by process_pdf
    retrieval_state = gr.State(None)  # written by run_retrieval
    rerank_state = gr.State(None)     # written by run_reranking

    # ── Tabs ─────────────────────────────────────────────────────────────
    with gr.Tabs():
        # Tab 1 ───────────────────────────────────────────────────────────
        with gr.TabItem("📄 Step 1 — Upload & Chunk"):
            gr.Markdown(
                "Upload a scientific PDF and choose a chunking strategy. "
                "See how the document is split into pieces that the retrieval system will search."
            )
            with gr.Row():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], scale=2)
                strategy_input = gr.Dropdown(
                    choices=list(STRATEGIES.keys()),
                    value="Semantic (5 sentences)",
                    label="Chunking strategy",
                    scale=1,
                )
            process_btn = gr.Button("⚙️ Process document", variant="primary")
            chunk_summary = gr.Markdown()
            chunk_table = gr.DataFrame(label="All chunks", wrap=True)
            process_btn.click(
                fn=process_pdf,
                inputs=[pdf_input, strategy_input],
                outputs=[chunk_summary, chunk_table, chunk_state],
            )

        # Tab 2 ───────────────────────────────────────────────────────────
        with gr.TabItem("🔍 Step 2 — Retrieval Explorer"):
            gr.Markdown(
                "Run hybrid retrieval (dense + sparse). "
                "See the individual scores for each candidate chunk — "
                "and adjust the balance between semantic and keyword search."
            )
            dense_weight_slider = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.6, step=0.1,
                label="Dense weight (1 - this = sparse/BM25 weight)",
            )
            retrieve_btn = gr.Button("🔍 Run retrieval", variant="primary")
            retrieval_insight = gr.Markdown()
            retrieval_table = gr.DataFrame(label="Retrieval scores (top 10)", wrap=True)
            retrieve_btn.click(
                fn=run_retrieval,
                inputs=[question_input, chunk_state, dense_weight_slider],
                outputs=[retrieval_insight, retrieval_table, retrieval_state],
            )

        # Tab 3 ───────────────────────────────────────────────────────────
        with gr.TabItem("⚖️ Step 3 — Rerank & Filter"):
            gr.Markdown(
                "Rerank candidates with richer scoring, then filter out "
                "low-quality and duplicate chunks. "
                "See exactly which chunks were dropped — and why."
            )
            with gr.Row():
                threshold_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.25, step=0.05,
                    label="Score threshold (chunks below this are dropped)",
                    scale=2,
                )
                top_n_slider = gr.Slider(
                    minimum=1, maximum=10, value=5, step=1,
                    label="Top N chunks to keep",
                    scale=1,
                )
            rerank_btn = gr.Button("⚖️ Rerank & Filter", variant="primary")
            rerank_insight = gr.Markdown()
            rerank_table = gr.DataFrame(label="Reranking decisions", wrap=True)
            rerank_btn.click(
                fn=run_reranking,
                inputs=[question_input, retrieval_state, threshold_slider, top_n_slider],
                outputs=[rerank_insight, rerank_table, rerank_state],
            )

        # Tab 4 ───────────────────────────────────────────────────────────
        with gr.TabItem("💬 Step 4 — Final Answer"):
            gr.Markdown(
                "Generate the final answer using only the filtered, reranked context. "
                "See the exact prompt sent to the LLM and the source excerpts it used."
            )
            generate_btn = gr.Button("💬 Generate answer", variant="primary")
            with gr.Row():
                with gr.Column(scale=2):
                    answer_out = gr.Markdown(label="Answer")
                with gr.Column(scale=1):
                    sources_out = gr.Markdown(label="Source excerpts")
            with gr.Accordion("📋 Prompt sent to LLM (full transparency)", open=False):
                prompt_out = gr.Markdown()
            generate_btn.click(
                fn=run_generation,
                inputs=[question_input, rerank_state, api_key_input],
                outputs=[answer_out, sources_out, prompt_out],
            )

        # About tab ───────────────────────────────────────────────────────
        with gr.TabItem("📖 About & Methodology"):
            gr.Markdown(_ABOUT_MD)

demo.launch()
# End of file.