Spaces:
Sleeping
Sleeping
| """ | |
RAG Transparency Lab — HuggingFace Gradio Space
By Zalina Dezhina, PhD | AI Evaluation Scientist
Visualises every step of a RAG pipeline on a user-uploaded scientific PDF.
| """ | |
| import os | |
| import io | |
| import numpy as np | |
| import gradio as gr | |
| import pypdf | |
| import pandas as pd | |
| from rag_pipeline.chunker import STRATEGIES, Chunk | |
| from rag_pipeline.embedder import embed_texts | |
| from rag_pipeline.retriever import retrieve | |
| from rag_pipeline.reranker import rerank_and_filter | |
| from rag_pipeline.generator import generate_answer | |
# ── State (in-memory per session via gr.State) ───────────────────────────────
def score_color(score: float) -> str:
    """Map a score to a traffic-light emoji marker.

    Args:
        score: Numeric score to classify. The cut-offs below presumably
            assume a 0-1 scale — confirm against the scoring modules.

    Returns:
        A green circle for ``score >= 0.65``, a yellow circle for
        ``score >= 0.35``, and a red circle for anything else (including
        NaN, for which both comparisons are false).
    """
    # Escape sequences are used instead of literal emoji so the markers
    # cannot be corrupted when the file is re-encoded or copied around.
    if score >= 0.65:
        return "\U0001F7E2"  # large green circle
    if score >= 0.35:
        return "\U0001F7E1"  # large yellow circle
    return "\U0001F534"  # large red circle
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by blank lines.

    Args:
        pdf_path: Path handed to ``pypdf.PdfReader``.

    Returns:
        The per-page results of ``extract_text()`` joined with ``"\\n\\n"``.
        A page whose extraction yields nothing contributes an empty string,
        so the number of separators still reflects the page count.
    """
    page_texts = []
    for page in pypdf.PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        # Normalise a falsy result (None / "") to an empty string.
        page_texts.append(extracted if extracted else "")
    return "\n\n".join(page_texts)
# ── Tab 1: Upload & Chunk ────────────────────────────────────────────────────
def process_pdf(pdf_file, strategy_name: str):
    """Extract text from the uploaded PDF, chunk it, and summarise the chunks.

    Args:
        pdf_file: Value delivered by the upload component, or ``None`` when
            nothing was uploaded. Either an object exposing the file path as
            ``.name`` or a plain path string is accepted.
        strategy_name: Key into ``STRATEGIES`` selecting the chunking function.

    Returns:
        A 3-tuple ``(summary_markdown, chunk_dataframe, state)`` where
        ``state`` is ``(text, chunks)`` for the later pipeline steps. When no
        file was supplied or no text could be extracted, the tuple is
        ``(warning_message, None, None)``.

    Raises:
        KeyError: If ``strategy_name`` is not a key of ``STRATEGIES``.
    """
    if pdf_file is None:
        return "β οΈ Please upload a PDF.", None, None

    # Depending on the Gradio version / File ``type`` setting the upload may
    # arrive as a bare path string rather than an object with ``.name``;
    # fall back to the value itself so both forms work.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return "β οΈ Could not extract text from this PDF.", None, None

    chunks = STRATEGIES[strategy_name](text)

    # Build the display dataframe. The columns are named explicitly so the
    # frame keeps its schema even when the chunker returns nothing; without
    # this, an empty result has no "Words" column and the summary raises
    # KeyError.
    columns = ["ID", "Words", "Sentences", "Preview"]
    rows = [
        {
            "ID": c.chunk_id,
            "Words": c.word_count,
            "Sentences": c.sentence_count,
            "Preview": c.preview(100),
        }
        for c in chunks
    ]
    df = pd.DataFrame(rows, columns=columns)

    total_words = df["Words"].sum()
    # mean() of an empty column is NaN; report 0 instead of "nan".
    avg_words = df["Words"].mean() if len(df) else 0

    summary = (
        f"### β Document processed\n"
        f"- **Strategy:** {strategy_name}\n"
        f"- **Total chunks:** {len(chunks)}\n"
        f"- **Avg words/chunk:** {avg_words:.0f}\n"
        f"- **Total words:** {total_words}\n\n"
        f"**Why chunking matters:** If chunks break mid-sentence or mid-argument, "
        f"retrieval will fail β the model receives incomplete evidence. "
        f"Semantic chunking preserves full reasoning units."
    )
    return summary, df, (text, chunks)
# ── Tab 2: Retrieval Explorer ────────────────────────────────────────────────
def run_retrieval(query: str, state, dense_weight: float):
    """Run hybrid retrieval for ``query`` over the chunks stored in ``state``.

    Args:
        query: The user's question; blank input is rejected.
        state: ``(text, chunks)`` as produced by the chunking step, or
            ``None`` if no document has been processed yet.
        dense_weight: Weight given to the dense score; the sparse weight is
            derived as ``round(1.0 - dense_weight, 2)``.

    Returns:
        ``(insight_markdown, scores_dataframe, retrieval_state)`` where
        ``retrieval_state`` is ``(embeddings, results, chunks)``. On any
        rejected input the tuple is ``(warning_message, None, None)``.
    """
    if not query.strip():
        return "β οΈ Please enter a question.", None, None
    if state is None:
        return "β οΈ Please process a PDF first (Tab 1).", None, None

    _text, chunks = state
    if not chunks:
        return "β οΈ No chunks found. Try a different chunking strategy.", None, None

    embeddings = embed_texts([chunk.text for chunk in chunks])
    sparse_weight = round(1.0 - dense_weight, 2)
    # Ask for at most 10 candidates, fewer when the document is small.
    candidate_count = min(10, len(chunks))
    results = retrieve(
        query,
        chunks,
        embeddings,
        top_k=candidate_count,
        dense_weight=dense_weight,
        sparse_weight=sparse_weight,
    )

    # One table row per hit, each score prefixed with its colour marker.
    table_rows = [
        {
            "Rank": hit.rank,
            "Chunk ID": hit.chunk.chunk_id,
            "Dense π΅": f"{score_color(hit.dense_score)} {hit.dense_score:.3f}",
            "Sparse π ": f"{score_color(hit.sparse_score)} {hit.sparse_score:.3f}",
            "Hybrid β‘": f"{score_color(hit.hybrid_score)} {hit.hybrid_score:.3f}",
            "Preview": hit.chunk.preview(90),
        }
        for hit in results
    ]
    df = pd.DataFrame(table_rows)

    insight = (
        f"### π Retrieval results for: *\"{query}\"*\n"
        f"- **Dense weight:** {dense_weight} | **Sparse (BM25) weight:** {sparse_weight}\n"
        "- **Dense** captures semantic meaning β finds conceptually similar text\n"
        "- **Sparse** captures exact keywords β catches specific terms\n"
        "- **Hybrid** combines both β more robust than either alone\n\n"
        "**Notice:** chunks with high dense but low sparse score "
        "are semantically related but don't share your exact keywords. "
        "Chunks with high sparse but low dense score match keywords but may be off-topic."
    )
    return insight, df, (embeddings, results, chunks)
# ── Tab 3: Reranking & Filtering ─────────────────────────────────────────────
def run_reranking(query: str, retrieval_state, threshold: float, top_n: int):
    """Rerank the retrieved candidates and report which ones survive filtering.

    Args:
        query: The user's question; blank input is rejected.
        retrieval_state: ``(embeddings, results, chunks)`` from the retrieval
            step, or ``None`` if retrieval has not been run.
        threshold: Passed to ``rerank_and_filter`` as ``score_threshold``.
        top_n: Passed to ``rerank_and_filter`` as ``top_n`` after ``int()``
            conversion (slider values may arrive as floats).

    Returns:
        ``(insight_markdown, decisions_dataframe, reranked)``; on rejected
        input the tuple is ``(warning_message, None, None)``.
    """
    if retrieval_state is None:
        return "β οΈ Run retrieval first (Tab 2).", None, None
    if not query.strip():
        return "β οΈ Please enter a question.", None, None

    # Only the retrieval hits are used here; the state is still unpacked in
    # full so a malformed tuple fails immediately.
    _embeddings, results, _chunks = retrieval_state
    reranked = rerank_and_filter(
        query,
        results,
        score_threshold=threshold,
        top_n=int(top_n),
    )

    table_rows = [
        {
            "Status": "β KEPT" if item.kept else "β DROPPED",
            "Chunk ID": item.chunk_id,
            "Original rank": item.original_rank,
            "New rank": item.new_rank if item.kept else "β",
            "Rerank score": f"{score_color(item.rerank_score)} {item.rerank_score:.3f}",
            "Filter reason": item.filter_reason or "β",
            "Preview": item.preview(80),
        }
        for item in reranked
    ]
    df = pd.DataFrame(table_rows)

    kept_count = sum(1 for item in reranked if item.kept)
    dropped_count = sum(1 for item in reranked if not item.kept)

    insight = (
        "### βοΈ Reranking & Filtering\n"
        f"- **Kept:** {kept_count} chunks | **Dropped:** {dropped_count} chunks\n"
        f"- **Score threshold:** {threshold} β chunks below this are removed\n\n"
        "**Why rerank?** The initial retrieval finds candidates quickly but noisily. "
        "Reranking re-scores using richer signals (keyword overlap + semantic score). "
        "Filtering removes low-quality and duplicate chunks before they reach the LLM.\n\n"
        "**Key insight:** Passing noisy chunks to the LLM is the #1 cause of "
        "hallucination in RAG systems. Clean context = sharper answers."
    )
    return insight, df, reranked
# ── Tab 4: Final Answer ──────────────────────────────────────────────────────
def run_generation(query: str, rerank_state, api_key: str):
    """Generate the final answer from the chunks that survived reranking.

    Args:
        query: The user's question; blank input is rejected.
        rerank_state: The reranked items from the previous step, or ``None``
            if reranking has not been run. Only items with ``kept`` set are
            passed on.
        api_key: Key text from the UI; stripped, and forwarded to
            ``generate_answer`` as ``None`` when blank.

    Returns:
        ``(answer, sources_markdown, prompt_markdown)``. When the step cannot
        run, the first element is a warning and the other two are ``""``.
    """
    if rerank_state is None:
        return "β οΈ Run reranking first (Tab 3).", "", ""
    if not query.strip():
        return "β οΈ Please enter a question.", "", ""

    kept = [item for item in rerank_state if item.kept]
    if not kept:
        no_context_msg = (
            "β οΈ No chunks passed the filters. "
            "Try lowering the threshold in Tab 3 or rephrasing your question."
        )
        return no_context_msg, "", ""

    cleaned_key = api_key.strip()
    answer, prompt = generate_answer(query, kept, api_key=cleaned_key or None)

    # One markdown entry per kept chunk; excerpts are cut at 200 characters
    # and marked with "..." only when something was actually cut.
    excerpt_entries = []
    for position, item in enumerate(kept, 1):
        suffix = "..." if len(item.text) > 200 else ""
        excerpt_entries.append(
            f"**[Excerpt {position}]** (Chunk {item.chunk_id}, score {item.rerank_score:.3f})\n"
            f"> {item.text[:200]}{suffix}\n\n"
        )
    sources_md = "### π Source excerpts used\n\n" + "".join(excerpt_entries)

    prompt_display = f"```\n{prompt}\n```"
    return answer, sources_md, prompt_display
# ── Gradio UI ────────────────────────────────────────────────────────────────
# Custom stylesheet passed to gr.Blocks(css=CSS): colour classes for
# high / mid / low scores.
# NOTE(review): nothing visible in this file applies these classes (scores are
# shown with emoji markers from score_color) — confirm they are still needed.
CSS = """
.score-high { color: #22c55e; font-weight: 500; }
.score-mid { color: #f59e0b; }
.score-low { color: #ef4444; }
"""
# Top-level UI definition. Components are created in display order inside the
# Blocks context; each pipeline step lives in its own tab and hands its result
# to the next step through a gr.State slot.
with gr.Blocks(
    title="RAG Transparency Lab",
    theme=gr.themes.Soft(primary_hue="slate", secondary_hue="blue"),
    css=CSS,
) as demo:
    # ── Header ──
    # NOTE(review): the markdown below has no blank lines between headings
    # and paragraphs — confirm it renders as intended.
    gr.Markdown("""
# π¬ RAG Transparency Lab
**See inside every step of a RAG pipeline β applied to scientific papers.**
Most RAG demos show you only the final answer. This tool shows you *why* that answer is good or bad β by exposing chunking, retrieval scores, reranking decisions, and the exact prompt sent to the LLM.
Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina) Β· AI Evaluation Scientist
*Part of the RAG Education Series β Project 1 of 3*
---
""")
    # ── Global inputs ──
    # Shared by several tabs: the question feeds Tabs 2-4, the API key Tab 4.
    with gr.Row():
        api_key_input = gr.Textbox(
            label="π Anthropic API Key (for Tab 4)",
            placeholder="sk-ant-...",
            type="password",
            scale=2,
        )
        question_input = gr.Textbox(
            label="β Your question about the paper",
            placeholder="What is the main finding of this paper?",
            scale=3,
        )
    # ── State ──
    # Hand-off slots between steps, all initially None:
    #   chunk_state     <- process_pdf    -> read by run_retrieval
    #   retrieval_state <- run_retrieval  -> read by run_reranking
    #   rerank_state    <- run_reranking  -> read by run_generation
    chunk_state = gr.State(None)
    retrieval_state = gr.State(None)
    rerank_state = gr.State(None)
    # ── Tabs ──
    with gr.Tabs():
        # Tab 1: upload a PDF, pick a chunking strategy, inspect the chunks.
        with gr.TabItem("π Step 1 β Upload & Chunk"):
            gr.Markdown(
                "Upload a scientific PDF and choose a chunking strategy. "
                "See how the document is split into pieces that the retrieval system will search."
            )
            with gr.Row():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], scale=2)
                # The default value must be one of the STRATEGIES keys.
                strategy_input = gr.Dropdown(
                    choices=list(STRATEGIES.keys()),
                    value="Semantic (5 sentences)",
                    label="Chunking strategy",
                    scale=1,
                )
            process_btn = gr.Button("βοΈ Process document", variant="primary")
            chunk_summary = gr.Markdown()
            chunk_table = gr.DataFrame(label="All chunks", wrap=True)
            # process_pdf returns (summary, dataframe, state) in this order.
            process_btn.click(
                fn=process_pdf,
                inputs=[pdf_input, strategy_input],
                outputs=[chunk_summary, chunk_table, chunk_state],
            )
        # Tab 2: hybrid retrieval with an adjustable dense/sparse balance.
        with gr.TabItem("π Step 2 β Retrieval Explorer"):
            gr.Markdown(
                "Run hybrid retrieval (dense + sparse). "
                "See the individual scores for each candidate chunk β "
                "and adjust the balance between semantic and keyword search."
            )
            dense_weight_slider = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.6, step=0.1,
                label="Dense weight (1 - this = sparse/BM25 weight)",
            )
            retrieve_btn = gr.Button("π Run retrieval", variant="primary")
            retrieval_insight = gr.Markdown()
            retrieval_table = gr.DataFrame(label="Retrieval scores (top 10)", wrap=True)
            # run_retrieval returns (insight, dataframe, state) in this order.
            retrieve_btn.click(
                fn=run_retrieval,
                inputs=[question_input, chunk_state, dense_weight_slider],
                outputs=[retrieval_insight, retrieval_table, retrieval_state],
            )
        # Tab 3: rerank the candidates and filter by score threshold / top N.
        with gr.TabItem("βοΈ Step 3 β Rerank & Filter"):
            gr.Markdown(
                "Rerank candidates with richer scoring, then filter out "
                "low-quality and duplicate chunks. "
                "See exactly which chunks were dropped β and why."
            )
            with gr.Row():
                threshold_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.25, step=0.05,
                    label="Score threshold (chunks below this are dropped)",
                    scale=2,
                )
                top_n_slider = gr.Slider(
                    minimum=1, maximum=10, value=5, step=1,
                    label="Top N chunks to keep",
                    scale=1,
                )
            rerank_btn = gr.Button("βοΈ Rerank & Filter", variant="primary")
            rerank_insight = gr.Markdown()
            rerank_table = gr.DataFrame(label="Reranking decisions", wrap=True)
            # run_reranking returns (insight, dataframe, reranked) in this order.
            rerank_btn.click(
                fn=run_reranking,
                inputs=[question_input, retrieval_state, threshold_slider, top_n_slider],
                outputs=[rerank_insight, rerank_table, rerank_state],
            )
        # Tab 4: generate the answer and show the sources and the prompt.
        with gr.TabItem("π¬ Step 4 β Final Answer"):
            gr.Markdown(
                "Generate the final answer using only the filtered, reranked context. "
                "See the exact prompt sent to the LLM and the source excerpts it used."
            )
            generate_btn = gr.Button("π¬ Generate answer", variant="primary")
            with gr.Row():
                with gr.Column(scale=2):
                    answer_out = gr.Markdown(label="Answer")
                with gr.Column(scale=1):
                    sources_out = gr.Markdown(label="Source excerpts")
            # The prompt is collapsed by default to keep the answer in focus.
            with gr.Accordion("π Prompt sent to LLM (full transparency)", open=False):
                prompt_out = gr.Markdown()
            # run_generation returns (answer, sources, prompt) in this order.
            generate_btn.click(
                fn=run_generation,
                inputs=[question_input, rerank_state, api_key_input],
                outputs=[answer_out, sources_out, prompt_out],
            )
        # About tab: static methodology text, no event handlers.
        with gr.TabItem("π About & Methodology"):
            gr.Markdown("""
## Why RAG Transparency Matters
Most RAG tutorials show you the output. This tool shows you the *pipeline* β because the output quality is determined entirely by what happens before the LLM sees any text.
### The 4 stages explained
**Stage 1 β Chunking**
Documents must be split into pieces small enough to retrieve but large enough to contain meaningful context. Fixed-size chunking is fast but breaks sentences. Semantic chunking preserves argument structure.
**Stage 2 β Hybrid Retrieval**
Dense retrieval (embeddings) captures semantic similarity. Sparse retrieval (BM25) captures exact keyword matches. Neither alone is sufficient β hybrid is the production standard.
**Stage 3 β Reranking & Filtering**
Initial retrieval casts a wide net. Reranking re-scores candidates with richer signals. Filtering removes low-quality chunks and near-duplicates. This is the stage most tutorials skip β and it's where most hallucinations originate.
**Stage 4 β Grounded Generation**
The LLM receives only the filtered, ranked excerpts β constrained to cite its sources. The prompt is shown in full so you can see exactly what the model was asked.
---
## RAG Education Series
This is **Project 1 of 3**:
1. π¬ RAG Transparency Lab β you are here
2. β‘ Classic vs Advanced RAG β side-by-side comparison
3. π§ͺ Scientific Claim Verifier β hallucination detection on research papers
---
Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina)
AI Evaluation Scientist | RLHF Specialist | Computational Neuroscientist
[GitHub](https://github.com/Mioulin) Β· dezhina@gmail.com
""")
# Start the Gradio server only when this file is executed as a script, so that
# importing the module (for example from tests or tooling) builds `demo`
# without launching the app as a side effect.
if __name__ == "__main__":
    demo.launch()