""" RAG Transparency Lab — HuggingFace Gradio Space By Zalina Dezhina, PhD | AI Evaluation Scientist Visualises every step of a RAG pipeline on a user-uploaded scientific PDF. """ import os import io import numpy as np import gradio as gr import pypdf import pandas as pd from rag_pipeline.chunker import STRATEGIES, Chunk from rag_pipeline.embedder import embed_texts from rag_pipeline.retriever import retrieve from rag_pipeline.reranker import rerank_and_filter from rag_pipeline.generator import generate_answer # ── State (in-memory per session via gr.State) ─────────────────────────────── def score_color(score: float) -> str: if score >= 0.65: return "🟢" elif score >= 0.35: return "🟡" return "🔴" def extract_text_from_pdf(pdf_path: str) -> str: reader = pypdf.PdfReader(pdf_path) pages = [page.extract_text() or "" for page in reader.pages] return "\n\n".join(pages) # ── Tab 1: Upload & Chunk ──────────────────────────────────────────────────── def process_pdf(pdf_file, strategy_name: str): if pdf_file is None: return "⚠️ Please upload a PDF.", None, None text = extract_text_from_pdf(pdf_file.name) if not text.strip(): return "⚠️ Could not extract text from this PDF.", None, None strategy_fn = STRATEGIES[strategy_name] chunks = strategy_fn(text) # Build display dataframe rows = [] for c in chunks: rows.append({ "ID": c.chunk_id, "Words": c.word_count, "Sentences": c.sentence_count, "Preview": c.preview(100), }) df = pd.DataFrame(rows) summary = ( f"### ✅ Document processed\n" f"- **Strategy:** {strategy_name}\n" f"- **Total chunks:** {len(chunks)}\n" f"- **Avg words/chunk:** {df['Words'].mean():.0f}\n" f"- **Total words:** {df['Words'].sum()}\n\n" f"**Why chunking matters:** If chunks break mid-sentence or mid-argument, " f"retrieval will fail — the model receives incomplete evidence. " f"Semantic chunking preserves full reasoning units." ) return summary, df, (text, chunks) # ── Tab 2: Retrieval Explorer ──────────────────────────────────────────────── def run_retrieval(query: str, state, dense_weight: float): if not query.strip(): return "⚠️ Please enter a question.", None, None if state is None: return "⚠️ Please process a PDF first (Tab 1).", None, None text, chunks = state if not chunks: return "⚠️ No chunks found. Try a different chunking strategy.", None, None chunk_texts = [c.text for c in chunks] embeddings = embed_texts(chunk_texts) sparse_weight = round(1.0 - dense_weight, 2) results = retrieve( query, chunks, embeddings, top_k=min(10, len(chunks)), dense_weight=dense_weight, sparse_weight=sparse_weight, ) rows = [] for r in results: rows.append({ "Rank": r.rank, "Chunk ID": r.chunk.chunk_id, "Dense 🔵": f"{score_color(r.dense_score)} {r.dense_score:.3f}", "Sparse 🟠": f"{score_color(r.sparse_score)} {r.sparse_score:.3f}", "Hybrid ⚡": f"{score_color(r.hybrid_score)} {r.hybrid_score:.3f}", "Preview": r.chunk.preview(90), }) df = pd.DataFrame(rows) insight = ( f"### 🔍 Retrieval results for: *\"{query}\"*\n" f"- **Dense weight:** {dense_weight} | **Sparse (BM25) weight:** {sparse_weight}\n" f"- **Dense** captures semantic meaning — finds conceptually similar text\n" f"- **Sparse** captures exact keywords — catches specific terms\n" f"- **Hybrid** combines both — more robust than either alone\n\n" f"**Notice:** chunks with high dense but low sparse score " f"are semantically related but don't share your exact keywords. " f"Chunks with high sparse but low dense score match keywords but may be off-topic." ) return insight, df, (embeddings, results, chunks) # ── Tab 3: Reranking & Filtering ───────────────────────────────────────────── def run_reranking(query: str, retrieval_state, threshold: float, top_n: int): if retrieval_state is None: return "⚠️ Run retrieval first (Tab 2).", None, None if not query.strip(): return "⚠️ Please enter a question.", None, None embeddings, results, chunks = retrieval_state reranked = rerank_and_filter( query, results, score_threshold=threshold, top_n=int(top_n), ) rows = [] for r in reranked: status = "✅ KEPT" if r.kept else "❌ DROPPED" reason = r.filter_reason or "—" rows.append({ "Status": status, "Chunk ID": r.chunk_id, "Original rank": r.original_rank, "New rank": r.new_rank if r.kept else "—", "Rerank score": f"{score_color(r.rerank_score)} {r.rerank_score:.3f}", "Filter reason": reason, "Preview": r.preview(80), }) df = pd.DataFrame(rows) kept = [r for r in reranked if r.kept] dropped = [r for r in reranked if not r.kept] insight = ( f"### ⚖️ Reranking & Filtering\n" f"- **Kept:** {len(kept)} chunks | **Dropped:** {len(dropped)} chunks\n" f"- **Score threshold:** {threshold} — chunks below this are removed\n\n" f"**Why rerank?** The initial retrieval finds candidates quickly but noisily. " f"Reranking re-scores using richer signals (keyword overlap + semantic score). " f"Filtering removes low-quality and duplicate chunks before they reach the LLM.\n\n" f"**Key insight:** Passing noisy chunks to the LLM is the #1 cause of " f"hallucination in RAG systems. Clean context = sharper answers." ) return insight, df, reranked # ── Tab 4: Final Answer ─────────────────────────────────────────────────────── def run_generation(query: str, rerank_state, api_key: str): if rerank_state is None: return "⚠️ Run reranking first (Tab 3).", "", "" if not query.strip(): return "⚠️ Please enter a question.", "", "" kept = [r for r in rerank_state if r.kept] if not kept: return ( "⚠️ No chunks passed the filters. " "Try lowering the threshold in Tab 3 or rephrasing your question.", "", "" ) answer, prompt = generate_answer(query, kept, api_key=api_key.strip() or None) sources_md = "### 📄 Source excerpts used\n\n" for i, r in enumerate(kept, 1): sources_md += f"**[Excerpt {i}]** (Chunk {r.chunk_id}, score {r.rerank_score:.3f})\n" sources_md += f"> {r.text[:200]}{'...' if len(r.text) > 200 else ''}\n\n" prompt_display = f"```\n{prompt}\n```" return answer, sources_md, prompt_display # ── Gradio UI ───────────────────────────────────────────────────────────────── CSS = """ .score-high { color: #22c55e; font-weight: 500; } .score-mid { color: #f59e0b; } .score-low { color: #ef4444; } """ with gr.Blocks( title="RAG Transparency Lab", theme=gr.themes.Soft(primary_hue="slate", secondary_hue="blue"), css=CSS, ) as demo: # ── Header ─────────────────────────────────────────────────────────── gr.Markdown(""" # 🔬 RAG Transparency Lab **See inside every step of a RAG pipeline — applied to scientific papers.** Most RAG demos show you only the final answer. This tool shows you *why* that answer is good or bad — by exposing chunking, retrieval scores, reranking decisions, and the exact prompt sent to the LLM. Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina) · AI Evaluation Scientist *Part of the RAG Education Series — Project 1 of 3* --- """) # ── Global inputs ──────────────────────────────────────────────────── with gr.Row(): api_key_input = gr.Textbox( label="🔑 Anthropic API Key (for Tab 4)", placeholder="sk-ant-...", type="password", scale=2, ) question_input = gr.Textbox( label="❓ Your question about the paper", placeholder="What is the main finding of this paper?", scale=3, ) # ── State ──────────────────────────────────────────────────────────── chunk_state = gr.State(None) retrieval_state = gr.State(None) rerank_state = gr.State(None) # ── Tabs ───────────────────────────────────────────────────────────── with gr.Tabs(): # Tab 1 ─────────────────────────────────────────────────────────── with gr.TabItem("📄 Step 1 — Upload & Chunk"): gr.Markdown( "Upload a scientific PDF and choose a chunking strategy. " "See how the document is split into pieces that the retrieval system will search." ) with gr.Row(): pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], scale=2) strategy_input = gr.Dropdown( choices=list(STRATEGIES.keys()), value="Semantic (5 sentences)", label="Chunking strategy", scale=1, ) process_btn = gr.Button("⚙️ Process document", variant="primary") chunk_summary = gr.Markdown() chunk_table = gr.DataFrame(label="All chunks", wrap=True) process_btn.click( fn=process_pdf, inputs=[pdf_input, strategy_input], outputs=[chunk_summary, chunk_table, chunk_state], ) # Tab 2 ─────────────────────────────────────────────────────────── with gr.TabItem("🔍 Step 2 — Retrieval Explorer"): gr.Markdown( "Run hybrid retrieval (dense + sparse). " "See the individual scores for each candidate chunk — " "and adjust the balance between semantic and keyword search." ) dense_weight_slider = gr.Slider( minimum=0.0, maximum=1.0, value=0.6, step=0.1, label="Dense weight (1 - this = sparse/BM25 weight)", ) retrieve_btn = gr.Button("🔍 Run retrieval", variant="primary") retrieval_insight = gr.Markdown() retrieval_table = gr.DataFrame(label="Retrieval scores (top 10)", wrap=True) retrieve_btn.click( fn=run_retrieval, inputs=[question_input, chunk_state, dense_weight_slider], outputs=[retrieval_insight, retrieval_table, retrieval_state], ) # Tab 3 ─────────────────────────────────────────────────────────── with gr.TabItem("⚖️ Step 3 — Rerank & Filter"): gr.Markdown( "Rerank candidates with richer scoring, then filter out " "low-quality and duplicate chunks. " "See exactly which chunks were dropped — and why." ) with gr.Row(): threshold_slider = gr.Slider( minimum=0.0, maximum=1.0, value=0.25, step=0.05, label="Score threshold (chunks below this are dropped)", scale=2, ) top_n_slider = gr.Slider( minimum=1, maximum=10, value=5, step=1, label="Top N chunks to keep", scale=1, ) rerank_btn = gr.Button("⚖️ Rerank & Filter", variant="primary") rerank_insight = gr.Markdown() rerank_table = gr.DataFrame(label="Reranking decisions", wrap=True) rerank_btn.click( fn=run_reranking, inputs=[question_input, retrieval_state, threshold_slider, top_n_slider], outputs=[rerank_insight, rerank_table, rerank_state], ) # Tab 4 ─────────────────────────────────────────────────────────── with gr.TabItem("💬 Step 4 — Final Answer"): gr.Markdown( "Generate the final answer using only the filtered, reranked context. " "See the exact prompt sent to the LLM and the source excerpts it used." ) generate_btn = gr.Button("💬 Generate answer", variant="primary") with gr.Row(): with gr.Column(scale=2): answer_out = gr.Markdown(label="Answer") with gr.Column(scale=1): sources_out = gr.Markdown(label="Source excerpts") with gr.Accordion("🔍 Prompt sent to LLM (full transparency)", open=False): prompt_out = gr.Markdown() generate_btn.click( fn=run_generation, inputs=[question_input, rerank_state, api_key_input], outputs=[answer_out, sources_out, prompt_out], ) # About tab ─────────────────────────────────────────────────────── with gr.TabItem("📖 About & Methodology"): gr.Markdown(""" ## Why RAG Transparency Matters Most RAG tutorials show you the output. This tool shows you the *pipeline* — because the output quality is determined entirely by what happens before the LLM sees any text. ### The 4 stages explained **Stage 1 — Chunking** Documents must be split into pieces small enough to retrieve but large enough to contain meaningful context. Fixed-size chunking is fast but breaks sentences. Semantic chunking preserves argument structure. **Stage 2 — Hybrid Retrieval** Dense retrieval (embeddings) captures semantic similarity. Sparse retrieval (BM25) captures exact keyword matches. Neither alone is sufficient — hybrid is the production standard. **Stage 3 — Reranking & Filtering** Initial retrieval casts a wide net. Reranking re-scores candidates with richer signals. Filtering removes low-quality chunks and near-duplicates. This is the stage most tutorials skip — and it's where most hallucinations originate. **Stage 4 — Grounded Generation** The LLM receives only the filtered, ranked excerpts — constrained to cite its sources. The prompt is shown in full so you can see exactly what the model was asked. --- ## RAG Education Series This is **Project 1 of 3**: 1. 🔬 RAG Transparency Lab ← you are here 2. ⚡ Classic vs Advanced RAG — side-by-side comparison 3. 🧪 Scientific Claim Verifier — hallucination detection on research papers --- Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina) AI Evaluation Scientist | RLHF Specialist | Computational Neuroscientist [GitHub](https://github.com/Mioulin) · dezhina@gmail.com """) demo.launch()