# Hugging Face Space file: app.py
# Uploaded by mioulin in commit 982896e ("Create app.py", verified)
"""
RAG Transparency Lab — HuggingFace Gradio Space
By Zalina Dezhina, PhD | AI Evaluation Scientist
Visualises every step of a RAG pipeline on a user-uploaded scientific PDF.
"""
import os
import io
import numpy as np
import gradio as gr
import pypdf
import pandas as pd
from rag_pipeline.chunker import STRATEGIES, Chunk
from rag_pipeline.embedder import embed_texts
from rag_pipeline.retriever import retrieve
from rag_pipeline.reranker import rerank_and_filter
from rag_pipeline.generator import generate_answer
# ── State (in-memory per session via gr.State) ───────────────────────────────
def score_color(score: float) -> str:
    """Map a score to a traffic-light emoji for display in the result tables.

    Args:
        score: The score to classify.

    Returns:
        "🟢" when ``score >= 0.65``, "🟡" when ``0.35 <= score < 0.65``,
        and "🔴" for anything lower.
    """
    if score >= 0.65:
        return "🟢"
    if score >= 0.35:
        return "🟡"
    return "🔴"
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by blank lines.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pypdf.PdfReader``.

    Returns:
        The per-page text joined with ``"\\n\\n"``. A page for which
        ``extract_text()`` returns a falsy value contributes an empty string.
    """
    page_texts = []
    for page in pypdf.PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n\n".join(page_texts)
# ── Tab 1: Upload & Chunk ────────────────────────────────────────────────────
def process_pdf(pdf_file, strategy_name: str):
    """Extract the text of an uploaded PDF and split it into chunks.

    Args:
        pdf_file: Value delivered by the file-upload input. ``None`` means
            nothing was uploaded; otherwise either an object exposing the
            file path as ``.name`` or the path itself.
        strategy_name: Key into ``STRATEGIES`` selecting the chunking function.

    Returns:
        A ``(summary_markdown, chunk_dataframe, state)`` tuple, where
        ``state`` is ``(text, chunks)`` for the later pipeline steps.
        When the input cannot be processed the tuple is
        ``(warning_message, None, None)``.
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF.", None, None

    # Accept both an upload object with a ``.name`` path and a bare path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return "⚠️ Could not extract text from this PDF.", None, None

    strategy_fn = STRATEGIES.get(strategy_name)
    if strategy_fn is None:
        return f"⚠️ Unknown chunking strategy: {strategy_name!r}.", None, None

    chunks = strategy_fn(text)
    if not chunks:
        # An empty DataFrame has no "Words" column, so the summary statistics
        # below would raise KeyError; report the problem instead.
        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

    # Build display dataframe
    df = pd.DataFrame(
        [
            {
                "ID": c.chunk_id,
                "Words": c.word_count,
                "Sentences": c.sentence_count,
                "Preview": c.preview(100),
            }
            for c in chunks
        ]
    )
    summary = (
        f"### ✅ Document processed\n"
        f"- **Strategy:** {strategy_name}\n"
        f"- **Total chunks:** {len(chunks)}\n"
        f"- **Avg words/chunk:** {df['Words'].mean():.0f}\n"
        f"- **Total words:** {df['Words'].sum()}\n\n"
        "**Why chunking matters:** If chunks break mid-sentence or mid-argument, "
        "retrieval will fail — the model receives incomplete evidence. "
        "Semantic chunking preserves full reasoning units."
    )
    return summary, df, (text, chunks)
# ── Tab 2: Retrieval Explorer ────────────────────────────────────────────────
def run_retrieval(query: str, state, dense_weight: float):
    """Run hybrid retrieval for a question and tabulate the per-chunk scores.

    Args:
        query: The user's question. Empty, whitespace-only or ``None``
            values are rejected with a warning.
        state: The ``(text, chunks)`` tuple produced by the chunking step,
            or ``None`` when no document has been processed yet.
        dense_weight: Weight given to the dense score. The sparse weight is
            ``1 - dense_weight`` rounded to two decimals.

    Returns:
        ``(insight_markdown, scores_dataframe, retrieval_state)`` where
        ``retrieval_state`` is ``(embeddings, results, chunks)``. When an
        input is missing the tuple is ``(warning_message, None, None)``.
    """
    if not (query or "").strip():
        return "⚠️ Please enter a question.", None, None
    if state is None:
        return "⚠️ Please process a PDF first (Tab 1).", None, None

    _text, chunks = state
    if not chunks:
        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

    # The chunk state carries no embeddings, so they are computed on each call.
    embeddings = embed_texts([c.text for c in chunks])
    # Rounding keeps floating-point noise (e.g. 0.30000000000000004) out of the UI.
    sparse_weight = round(1.0 - dense_weight, 2)
    results = retrieve(
        query, chunks, embeddings,
        top_k=min(10, len(chunks)),
        dense_weight=dense_weight,
        sparse_weight=sparse_weight,
    )

    df = pd.DataFrame(
        [
            {
                "Rank": r.rank,
                "Chunk ID": r.chunk.chunk_id,
                "Dense 🔵": f"{score_color(r.dense_score)} {r.dense_score:.3f}",
                "Sparse 🟠": f"{score_color(r.sparse_score)} {r.sparse_score:.3f}",
                "Hybrid ⚡": f"{score_color(r.hybrid_score)} {r.hybrid_score:.3f}",
                "Preview": r.chunk.preview(90),
            }
            for r in results
        ]
    )
    insight = (
        f"### 🔍 Retrieval results for: *\"{query}\"*\n"
        f"- **Dense weight:** {dense_weight} | **Sparse (BM25) weight:** {sparse_weight}\n"
        "- **Dense** captures semantic meaning — finds conceptually similar text\n"
        "- **Sparse** captures exact keywords — catches specific terms\n"
        "- **Hybrid** combines both — more robust than either alone\n\n"
        "**Notice:** chunks with high dense but low sparse score "
        "are semantically related but don't share your exact keywords. "
        "Chunks with high sparse but low dense score match keywords but may be off-topic."
    )
    return insight, df, (embeddings, results, chunks)
# ── Tab 3: Reranking & Filtering ─────────────────────────────────────────────
def run_reranking(query: str, retrieval_state, threshold: float, top_n: int):
    """Rerank and filter the retrieved candidates and tabulate each decision.

    Args:
        query: The user's question. Empty, whitespace-only or ``None``
            values are rejected with a warning.
        retrieval_state: The ``(embeddings, results, chunks)`` tuple from the
            retrieval step, or ``None`` when retrieval has not been run.
        threshold: Passed to ``rerank_and_filter`` as ``score_threshold``.
        top_n: Passed to ``rerank_and_filter`` as ``top_n`` after conversion
            to ``int`` (a slider may deliver a float).

    Returns:
        ``(insight_markdown, decisions_dataframe, reranked)``, where
        ``reranked`` is the list returned by ``rerank_and_filter``. When an
        input is missing the tuple is ``(warning_message, None, None)``.
    """
    if retrieval_state is None:
        return "⚠️ Run retrieval first (Tab 2).", None, None
    if not (query or "").strip():
        return "⚠️ Please enter a question.", None, None

    # Only the retrieval results are needed here.
    _embeddings, results, _chunks = retrieval_state
    reranked = rerank_and_filter(
        query, results,
        score_threshold=threshold,
        top_n=int(top_n),
    )

    df = pd.DataFrame(
        [
            {
                "Status": "✅ KEPT" if r.kept else "❌ DROPPED",
                "Chunk ID": r.chunk_id,
                "Original rank": r.original_rank,
                "New rank": r.new_rank if r.kept else "—",
                "Rerank score": f"{score_color(r.rerank_score)} {r.rerank_score:.3f}",
                "Filter reason": r.filter_reason or "—",
                "Preview": r.preview(80),
            }
            for r in reranked
        ]
    )
    kept_count = sum(1 for r in reranked if r.kept)
    dropped_count = len(reranked) - kept_count
    insight = (
        "### ⚖️ Reranking & Filtering\n"
        f"- **Kept:** {kept_count} chunks | **Dropped:** {dropped_count} chunks\n"
        f"- **Score threshold:** {threshold} — chunks below this are removed\n\n"
        "**Why rerank?** The initial retrieval finds candidates quickly but noisily. "
        "Reranking re-scores using richer signals (keyword overlap + semantic score). "
        "Filtering removes low-quality and duplicate chunks before they reach the LLM.\n\n"
        "**Key insight:** Passing noisy chunks to the LLM is the #1 cause of "
        "hallucination in RAG systems. Clean context = sharper answers."
    )
    return insight, df, reranked
# ── Tab 4: Final Answer ──────────────────────────────────────────────────────
def run_generation(query: str, rerank_state, api_key: str):
    """Generate the final answer from the chunks that survived filtering.

    Args:
        query: The user's question. Empty, whitespace-only or ``None``
            values are rejected with a warning.
        rerank_state: The list produced by the reranking step, or ``None``
            when reranking has not been run. Only items whose ``kept``
            attribute is truthy are used.
        api_key: API key typed by the user. Blank or ``None`` is forwarded
            to ``generate_answer`` as ``None``.

    Returns:
        ``(answer, sources_markdown, prompt_markdown)``. When an input is
        missing or no chunk was kept, the tuple is ``(warning_message, "", "")``.
    """
    if rerank_state is None:
        return "⚠️ Run reranking first (Tab 3).", "", ""
    if not (query or "").strip():
        return "⚠️ Please enter a question.", "", ""

    kept = [r for r in rerank_state if r.kept]
    if not kept:
        return (
            "⚠️ No chunks passed the filters. "
            "Try lowering the threshold in Tab 3 or rephrasing your question.",
            "", ""
        )

    answer, prompt = generate_answer(
        query, kept, api_key=(api_key or "").strip() or None
    )

    # Show at most the first 200 characters of each excerpt.
    parts = ["### 📄 Source excerpts used\n\n"]
    for i, r in enumerate(kept, 1):
        suffix = "..." if len(r.text) > 200 else ""
        parts.append(
            f"**[Excerpt {i}]** (Chunk {r.chunk_id}, score {r.rerank_score:.3f})\n"
        )
        parts.append(f"> {r.text[:200]}{suffix}\n\n")
    sources_md = "".join(parts)

    prompt_display = f"```\n{prompt}\n```"
    return answer, sources_md, prompt_display
# ── Gradio UI ────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks via css=CSS. It defines colour
# classes for high / mid / low score styling.
CSS = """
.score-high { color: #22c55e; font-weight: 500; }
.score-mid { color: #f59e0b; }
.score-low { color: #ef4444; }
"""
# Build the Gradio interface: shared inputs, per-session state, one tab per
# pipeline step and an "About" tab. Each button passes its step's state
# forward via gr.State so the next tab can pick it up.
with gr.Blocks(
    title="RAG Transparency Lab",
    theme=gr.themes.Soft(primary_hue="slate", secondary_hue="blue"),
    css=CSS,
) as demo:
    # ── Header ───────────────────────────────────────────────────────────
    # Markdown bodies start at column 0 so no line is indented in the
    # rendered text; blank lines keep "---" a rule rather than a heading.
    gr.Markdown("""
# 🔬 RAG Transparency Lab

**See inside every step of a RAG pipeline — applied to scientific papers.**

Most RAG demos show you only the final answer. This tool shows you *why* that answer is good or bad — by exposing chunking, retrieval scores, reranking decisions, and the exact prompt sent to the LLM.

Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina) · AI Evaluation Scientist

*Part of the RAG Education Series — Project 1 of 3*

---
""")

    # ── Global inputs ────────────────────────────────────────────────────
    with gr.Row():
        api_key_input = gr.Textbox(
            label="🔑 Anthropic API Key (for Tab 4)",
            placeholder="sk-ant-...",
            type="password",
            scale=2,
        )
        question_input = gr.Textbox(
            label="❓ Your question about the paper",
            placeholder="What is the main finding of this paper?",
            scale=3,
        )

    # ── State ────────────────────────────────────────────────────────────
    chunk_state = gr.State(None)
    retrieval_state = gr.State(None)
    rerank_state = gr.State(None)

    # ── Tabs ─────────────────────────────────────────────────────────────
    with gr.Tabs():
        # Tab 1 ───────────────────────────────────────────────────────────
        with gr.TabItem("📄 Step 1 — Upload & Chunk"):
            gr.Markdown(
                "Upload a scientific PDF and choose a chunking strategy. "
                "See how the document is split into pieces that the retrieval system will search."
            )
            with gr.Row():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], scale=2)
                strategy_input = gr.Dropdown(
                    choices=list(STRATEGIES.keys()),
                    value="Semantic (5 sentences)",
                    label="Chunking strategy",
                    scale=1,
                )
            process_btn = gr.Button("⚙️ Process document", variant="primary")
            chunk_summary = gr.Markdown()
            chunk_table = gr.DataFrame(label="All chunks", wrap=True)
            process_btn.click(
                fn=process_pdf,
                inputs=[pdf_input, strategy_input],
                outputs=[chunk_summary, chunk_table, chunk_state],
            )

        # Tab 2 ───────────────────────────────────────────────────────────
        with gr.TabItem("🔍 Step 2 — Retrieval Explorer"):
            gr.Markdown(
                "Run hybrid retrieval (dense + sparse). "
                "See the individual scores for each candidate chunk — "
                "and adjust the balance between semantic and keyword search."
            )
            dense_weight_slider = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.6, step=0.1,
                label="Dense weight (1 - this = sparse/BM25 weight)",
            )
            retrieve_btn = gr.Button("🔍 Run retrieval", variant="primary")
            retrieval_insight = gr.Markdown()
            retrieval_table = gr.DataFrame(label="Retrieval scores (top 10)", wrap=True)
            retrieve_btn.click(
                fn=run_retrieval,
                inputs=[question_input, chunk_state, dense_weight_slider],
                outputs=[retrieval_insight, retrieval_table, retrieval_state],
            )

        # Tab 3 ───────────────────────────────────────────────────────────
        with gr.TabItem("⚖️ Step 3 — Rerank & Filter"):
            gr.Markdown(
                "Rerank candidates with richer scoring, then filter out "
                "low-quality and duplicate chunks. "
                "See exactly which chunks were dropped — and why."
            )
            with gr.Row():
                threshold_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.25, step=0.05,
                    label="Score threshold (chunks below this are dropped)",
                    scale=2,
                )
                top_n_slider = gr.Slider(
                    minimum=1, maximum=10, value=5, step=1,
                    label="Top N chunks to keep",
                    scale=1,
                )
            rerank_btn = gr.Button("⚖️ Rerank & Filter", variant="primary")
            rerank_insight = gr.Markdown()
            rerank_table = gr.DataFrame(label="Reranking decisions", wrap=True)
            rerank_btn.click(
                fn=run_reranking,
                inputs=[question_input, retrieval_state, threshold_slider, top_n_slider],
                outputs=[rerank_insight, rerank_table, rerank_state],
            )

        # Tab 4 ───────────────────────────────────────────────────────────
        with gr.TabItem("💬 Step 4 — Final Answer"):
            gr.Markdown(
                "Generate the final answer using only the filtered, reranked context. "
                "See the exact prompt sent to the LLM and the source excerpts it used."
            )
            generate_btn = gr.Button("💬 Generate answer", variant="primary")
            with gr.Row():
                with gr.Column(scale=2):
                    answer_out = gr.Markdown(label="Answer")
                with gr.Column(scale=1):
                    sources_out = gr.Markdown(label="Source excerpts")
            with gr.Accordion("🔍 Prompt sent to LLM (full transparency)", open=False):
                prompt_out = gr.Markdown()
            generate_btn.click(
                fn=run_generation,
                inputs=[question_input, rerank_state, api_key_input],
                outputs=[answer_out, sources_out, prompt_out],
            )

        # About tab ───────────────────────────────────────────────────────
        with gr.TabItem("📖 About & Methodology"):
            gr.Markdown("""
## Why RAG Transparency Matters

Most RAG tutorials show you the output. This tool shows you the *pipeline* — because the output quality is determined entirely by what happens before the LLM sees any text.

### The 4 stages explained

**Stage 1 — Chunking**
Documents must be split into pieces small enough to retrieve but large enough to contain meaningful context. Fixed-size chunking is fast but breaks sentences. Semantic chunking preserves argument structure.

**Stage 2 — Hybrid Retrieval**
Dense retrieval (embeddings) captures semantic similarity. Sparse retrieval (BM25) captures exact keyword matches. Neither alone is sufficient — hybrid is the production standard.

**Stage 3 — Reranking & Filtering**
Initial retrieval casts a wide net. Reranking re-scores candidates with richer signals. Filtering removes low-quality chunks and near-duplicates. This is the stage most tutorials skip — and it's where most hallucinations originate.

**Stage 4 — Grounded Generation**
The LLM receives only the filtered, ranked excerpts — constrained to cite its sources. The prompt is shown in full so you can see exactly what the model was asked.

---

## RAG Education Series

This is **Project 1 of 3**:

1. 🔬 RAG Transparency Lab ← you are here
2. ⚡ Classic vs Advanced RAG — side-by-side comparison
3. 🧪 Scientific Claim Verifier — hallucination detection on research papers

---

Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina)
AI Evaluation Scientist | RLHF Specialist | Computational Neuroscientist
[GitHub](https://github.com/Mioulin) · dezhina@gmail.com
""")

# Start the app when the file is executed (unchanged module-level behaviour).
demo.launch()