Spaces:

mioulin
/

rag-transparency-lab

Sleeping

File size: 16,534 Bytes

982896e

"""
RAG Transparency Lab — HuggingFace Gradio Space
By Zalina Dezhina, PhD | AI Evaluation Scientist

Visualises every step of a RAG pipeline on a user-uploaded scientific PDF.
"""

import os
import io
import numpy as np
import gradio as gr
import pypdf
import pandas as pd

from rag_pipeline.chunker import STRATEGIES, Chunk
from rag_pipeline.embedder import embed_texts
from rag_pipeline.retriever import retrieve
from rag_pipeline.reranker import rerank_and_filter
from rag_pipeline.generator import generate_answer

# ── Helpers (per-session state lives in the gr.State objects of the UI) ──────

def score_color(score: float) -> str:
    """Map a score to a traffic-light emoji.

    Returns 🟢 for scores of at least 0.65, 🟡 for scores of at least 0.35,
    and 🔴 for anything lower.
    """
    # Ordered from the highest floor down; the first floor the score reaches wins.
    bands = ((0.65, "🟢"), (0.35, "🟡"))
    for floor, emoji in bands:
        if score >= floor:
            return emoji
    return "🔴"


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF, joined by blank lines.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string, so the number of joined segments equals the number of pages.
    """
    page_texts = []
    for page in pypdf.PdfReader(pdf_path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n\n".join(page_texts)


# ── Tab 1: Upload & Chunk ────────────────────────────────────────────────────

def process_pdf(pdf_file, strategy_name: str):
    """Extract text from an uploaded PDF, chunk it, and summarise the chunks.

    Args:
        pdf_file: Value of the upload component. Either an object exposing the
            file path as ``.name`` or a plain path string; ``None`` when
            nothing has been uploaded.
        strategy_name: Key into ``STRATEGIES`` selecting the chunking function.

    Returns:
        A ``(summary_markdown, chunk_dataframe, state)`` tuple where ``state``
        is ``(text, chunks)``. On any failure the first element is a warning
        message and the other two elements are ``None``.
    """
    if pdf_file is None:
        return "⚠️ Please upload a PDF.", None, None

    # Accept both file-like upload objects (path in ``.name``) and bare paths.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        text = extract_text_from_pdf(pdf_path)
    except (pypdf.errors.PyPdfError, OSError):
        # Malformed or unreadable uploads should surface as a message in the
        # UI rather than as an unhandled exception.
        return "⚠️ Could not read this PDF. The file may be corrupted or encrypted.", None, None
    if not text.strip():
        return "⚠️ Could not extract text from this PDF.", None, None

    # The dropdown can be cleared, so do not assume the key exists.
    strategy_fn = STRATEGIES.get(strategy_name)
    if strategy_fn is None:
        return "⚠️ Please choose a chunking strategy.", None, None

    chunks = strategy_fn(text)
    if not chunks:
        # Without this guard the empty DataFrame below has no "Words" column
        # and building the summary raises KeyError.
        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

    # Build display dataframe
    df = pd.DataFrame([
        {
            "ID": c.chunk_id,
            "Words": c.word_count,
            "Sentences": c.sentence_count,
            "Preview": c.preview(100),
        }
        for c in chunks
    ])

    summary = (
        f"### ✅ Document processed\n"
        f"- **Strategy:** {strategy_name}\n"
        f"- **Total chunks:** {len(chunks)}\n"
        f"- **Avg words/chunk:** {df['Words'].mean():.0f}\n"
        f"- **Total words:** {df['Words'].sum()}\n\n"
        f"**Why chunking matters:** If chunks break mid-sentence or mid-argument, "
        f"retrieval will fail — the model receives incomplete evidence. "
        f"Semantic chunking preserves full reasoning units."
    )

    return summary, df, (text, chunks)


# ── Tab 2: Retrieval Explorer ────────────────────────────────────────────────

def run_retrieval(query: str, state, dense_weight: float):
    """Score the processed chunks against the question with hybrid retrieval.

    Args:
        query: The user's question.
        state: A ``(text, chunks)`` tuple, or ``None`` if no document has been
            processed yet.
        dense_weight: Weight given to the dense score; the sparse weight is
            ``1 - dense_weight`` rounded to two decimals.

    Returns:
        ``(insight_markdown, scores_dataframe, retrieval_state)`` where
        ``retrieval_state`` is ``(embeddings, results, chunks)``. When a
        precondition is not met, a warning message followed by two ``None``.
    """
    # Guard clauses: each unmet precondition short-circuits with a warning.
    if not query.strip():
        return "⚠️ Please enter a question.", None, None
    if state is None:
        return "⚠️ Please process a PDF first (Tab 1).", None, None

    _, chunks = state
    if not chunks:
        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

    # Every call embeds all chunk texts again before retrieving.
    embeddings = embed_texts([chunk.text for chunk in chunks])

    sparse_weight = round(1.0 - dense_weight, 2)
    candidate_count = min(10, len(chunks))
    results = retrieve(
        query,
        chunks,
        embeddings,
        top_k=candidate_count,
        dense_weight=dense_weight,
        sparse_weight=sparse_weight,
    )

    def _badge(value):
        # Colour marker followed by the score at three decimals.
        return f"{score_color(value)} {value:.3f}"

    df = pd.DataFrame([
        {
            "Rank": hit.rank,
            "Chunk ID": hit.chunk.chunk_id,
            "Dense 🔵": _badge(hit.dense_score),
            "Sparse 🟠": _badge(hit.sparse_score),
            "Hybrid ⚡": _badge(hit.hybrid_score),
            "Preview": hit.chunk.preview(90),
        }
        for hit in results
    ])

    insight_lines = [
        f"### 🔍 Retrieval results for: *\"{query}\"*\n",
        f"- **Dense weight:** {dense_weight} | **Sparse (BM25) weight:** {sparse_weight}\n",
        "- **Dense** captures semantic meaning — finds conceptually similar text\n",
        "- **Sparse** captures exact keywords — catches specific terms\n",
        "- **Hybrid** combines both — more robust than either alone\n\n",
        "**Notice:** chunks with high dense but low sparse score ",
        "are semantically related but don't share your exact keywords. ",
        "Chunks with high sparse but low dense score match keywords but may be off-topic.",
    ]
    insight = "".join(insight_lines)

    return insight, df, (embeddings, results, chunks)


# ── Tab 3: Reranking & Filtering ─────────────────────────────────────────────

def run_reranking(query: str, retrieval_state, threshold: float, top_n: int):
    """Rerank the retrieved candidates and report which ones survive filtering.

    Args:
        query: The user's question.
        retrieval_state: An ``(embeddings, results, chunks)`` tuple, or
            ``None`` if retrieval has not been run; only ``results`` is used.
        threshold: Passed to ``rerank_and_filter`` as ``score_threshold``.
        top_n: Passed to ``rerank_and_filter`` as ``top_n`` after conversion
            to ``int``.

    Returns:
        ``(insight_markdown, decisions_dataframe, reranked)``. When a
        precondition is not met, a warning message followed by two ``None``.
    """
    # The missing-state check deliberately comes before the empty-question check.
    if retrieval_state is None:
        return "⚠️ Run retrieval first (Tab 2).", None, None
    if not query.strip():
        return "⚠️ Please enter a question.", None, None

    _, results, _ = retrieval_state

    reranked = rerank_and_filter(
        query,
        results,
        score_threshold=threshold,
        top_n=int(top_n),
    )

    def _as_row(item):
        # One table row per candidate, kept or dropped.
        return {
            "Status": "✅ KEPT" if item.kept else "❌ DROPPED",
            "Chunk ID": item.chunk_id,
            "Original rank": item.original_rank,
            "New rank": item.new_rank if item.kept else "—",
            "Rerank score": f"{score_color(item.rerank_score)} {item.rerank_score:.3f}",
            "Filter reason": item.filter_reason or "—",
            "Preview": item.preview(80),
        }

    df = pd.DataFrame([_as_row(item) for item in reranked])

    kept_count = sum(1 for item in reranked if item.kept)
    dropped_count = sum(1 for item in reranked if not item.kept)

    insight_parts = [
        "### ⚖️ Reranking & Filtering\n",
        f"- **Kept:** {kept_count} chunks | **Dropped:** {dropped_count} chunks\n",
        f"- **Score threshold:** {threshold} — chunks below this are removed\n\n",
        "**Why rerank?** The initial retrieval finds candidates quickly but noisily. ",
        "Reranking re-scores using richer signals (keyword overlap + semantic score). ",
        "Filtering removes low-quality and duplicate chunks before they reach the LLM.\n\n",
        "**Key insight:** Passing noisy chunks to the LLM is the #1 cause of ",
        "hallucination in RAG systems. Clean context = sharper answers.",
    ]
    insight = "".join(insight_parts)

    return insight, df, reranked


# ── Tab 4: Final Answer ───────────────────────────────────────────────────────

def run_generation(query: str, rerank_state, api_key: str):
    """Generate the final answer from the chunks that survived reranking.

    Args:
        query: The user's question.
        rerank_state: Iterable of reranked items, or ``None`` if reranking has
            not been run; only items whose ``kept`` flag is truthy are used.
        api_key: API key text; surrounding whitespace is stripped and an empty
            result is forwarded to ``generate_answer`` as ``None``.

    Returns:
        ``(answer, sources_markdown, prompt_markdown)``. When a precondition
        is not met, a warning message followed by two empty strings.
    """
    if rerank_state is None:
        return "⚠️ Run reranking first (Tab 3).", "", ""
    if not query.strip():
        return "⚠️ Please enter a question.", "", ""

    kept = [item for item in rerank_state if item.kept]
    if not kept:
        warning = (
            "⚠️ No chunks passed the filters. "
            "Try lowering the threshold in Tab 3 or rephrasing your question."
        )
        return warning, "", ""

    key = api_key.strip() or None
    answer, prompt = generate_answer(query, kept, api_key=key)

    # One markdown entry per kept chunk: a header line plus a quoted excerpt
    # truncated to 200 characters.
    excerpt_blocks = []
    for idx, item in enumerate(kept, 1):
        suffix = "..." if len(item.text) > 200 else ""
        excerpt_blocks.append(
            f"**[Excerpt {idx}]** (Chunk {item.chunk_id}, score {item.rerank_score:.3f})\n"
            f"> {item.text[:200]}{suffix}\n\n"
        )
    sources_md = "### 📄 Source excerpts used\n\n" + "".join(excerpt_blocks)

    prompt_display = f"```\n{prompt}\n```"

    return answer, sources_md, prompt_display


# ── Gradio UI ─────────────────────────────────────────────────────────────────

# Custom stylesheet handed to gr.Blocks below.
# NOTE(review): the .score-* classes are not referenced anywhere else in this
# file (score_color emits emoji instead) — confirm whether they are still needed.
CSS = """
.score-high { color: #22c55e; font-weight: 500; }
.score-mid  { color: #f59e0b; }
.score-low  { color: #ef4444; }
"""

# The whole UI is declared inside this context manager; `demo` is the app
# object that is launched at the bottom of the file.
with gr.Blocks(
    title="RAG Transparency Lab",
    theme=gr.themes.Soft(primary_hue="slate", secondary_hue="blue"),
    css=CSS,
) as demo:

    # ── Header ───────────────────────────────────────────────────────────
    gr.Markdown("""
# 🔬 RAG Transparency Lab

**See inside every step of a RAG pipeline — applied to scientific papers.**

Most RAG demos show you only the final answer. This tool shows you *why* that answer is good or bad — by exposing chunking, retrieval scores, reranking decisions, and the exact prompt sent to the LLM.

Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina) · AI Evaluation Scientist  
*Part of the RAG Education Series — Project 1 of 3*

---
    """)

    # ── Global inputs ────────────────────────────────────────────────────
    # Declared outside the tabs because they are shared: the question is an
    # input to the handlers of Tabs 2, 3 and 4, the API key only to Tab 4.
    with gr.Row():
        api_key_input = gr.Textbox(
            label="🔑 Anthropic API Key (for Tab 4)",
            placeholder="sk-ant-...",
            type="password",
            scale=2,
        )
        question_input = gr.Textbox(
            label="❓ Your question about the paper",
            placeholder="What is the main finding of this paper?",
            scale=3,
        )

    # ── State ────────────────────────────────────────────────────────────
    # Each stage's handler writes its result into one of these and the next
    # stage's handler reads it. They start as None, which the handlers treat
    # as "the previous step has not been run yet".
    chunk_state = gr.State(None)
    retrieval_state = gr.State(None)
    rerank_state = gr.State(None)

    # ── Tabs ─────────────────────────────────────────────────────────────
    with gr.Tabs():

        # Tab 1 ───────────────────────────────────────────────────────────
        with gr.TabItem("📄 Step 1 — Upload & Chunk"):
            gr.Markdown(
                "Upload a scientific PDF and choose a chunking strategy. "
                "See how the document is split into pieces that the retrieval system will search."
            )
            with gr.Row():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], scale=2)
                # NOTE(review): the default value must be one of the STRATEGIES
                # keys — verify against rag_pipeline.chunker.
                strategy_input = gr.Dropdown(
                    choices=list(STRATEGIES.keys()),
                    value="Semantic (5 sentences)",
                    label="Chunking strategy",
                    scale=1,
                )
            process_btn = gr.Button("⚙️ Process document", variant="primary")
            chunk_summary = gr.Markdown()
            chunk_table = gr.DataFrame(label="All chunks", wrap=True)

            # Outputs are listed in the order process_pdf returns them:
            # summary markdown, chunk table, (text, chunks) state.
            process_btn.click(
                fn=process_pdf,
                inputs=[pdf_input, strategy_input],
                outputs=[chunk_summary, chunk_table, chunk_state],
            )

        # Tab 2 ───────────────────────────────────────────────────────────
        with gr.TabItem("🔍 Step 2 — Retrieval Explorer"):
            gr.Markdown(
                "Run hybrid retrieval (dense + sparse). "
                "See the individual scores for each candidate chunk — "
                "and adjust the balance between semantic and keyword search."
            )
            dense_weight_slider = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.6, step=0.1,
                label="Dense weight (1 - this = sparse/BM25 weight)",
            )
            retrieve_btn = gr.Button("🔍 Run retrieval", variant="primary")
            retrieval_insight = gr.Markdown()
            retrieval_table = gr.DataFrame(label="Retrieval scores (top 10)", wrap=True)

            # Reads chunk_state written by Tab 1; writes retrieval_state for Tab 3.
            retrieve_btn.click(
                fn=run_retrieval,
                inputs=[question_input, chunk_state, dense_weight_slider],
                outputs=[retrieval_insight, retrieval_table, retrieval_state],
            )

        # Tab 3 ───────────────────────────────────────────────────────────
        with gr.TabItem("⚖️ Step 3 — Rerank & Filter"):
            gr.Markdown(
                "Rerank candidates with richer scoring, then filter out "
                "low-quality and duplicate chunks. "
                "See exactly which chunks were dropped — and why."
            )
            with gr.Row():
                threshold_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.25, step=0.05,
                    label="Score threshold (chunks below this are dropped)",
                    scale=2,
                )
                # run_reranking converts this slider's value with int() before use.
                top_n_slider = gr.Slider(
                    minimum=1, maximum=10, value=5, step=1,
                    label="Top N chunks to keep",
                    scale=1,
                )
            rerank_btn = gr.Button("⚖️ Rerank & Filter", variant="primary")
            rerank_insight = gr.Markdown()
            rerank_table = gr.DataFrame(label="Reranking decisions", wrap=True)

            # Reads retrieval_state written by Tab 2; writes rerank_state for Tab 4.
            rerank_btn.click(
                fn=run_reranking,
                inputs=[question_input, retrieval_state, threshold_slider, top_n_slider],
                outputs=[rerank_insight, rerank_table, rerank_state],
            )

        # Tab 4 ───────────────────────────────────────────────────────────
        with gr.TabItem("💬 Step 4 — Final Answer"):
            gr.Markdown(
                "Generate the final answer using only the filtered, reranked context. "
                "See the exact prompt sent to the LLM and the source excerpts it used."
            )
            generate_btn = gr.Button("💬 Generate answer", variant="primary")

            with gr.Row():
                with gr.Column(scale=2):
                    answer_out = gr.Markdown(label="Answer")
                with gr.Column(scale=1):
                    sources_out = gr.Markdown(label="Source excerpts")

            # Collapsed by default; shows the prompt string returned by run_generation.
            with gr.Accordion("🔍 Prompt sent to LLM (full transparency)", open=False):
                prompt_out = gr.Markdown()

            # Reads rerank_state written by Tab 3; the only handler given the API key.
            generate_btn.click(
                fn=run_generation,
                inputs=[question_input, rerank_state, api_key_input],
                outputs=[answer_out, sources_out, prompt_out],
            )

        # About tab ───────────────────────────────────────────────────────
        # Static text only; no handlers are attached in this tab.
        with gr.TabItem("📖 About & Methodology"):
            gr.Markdown("""
## Why RAG Transparency Matters

Most RAG tutorials show you the output. This tool shows you the *pipeline* — because the output quality is determined entirely by what happens before the LLM sees any text.

### The 4 stages explained

**Stage 1 — Chunking**
Documents must be split into pieces small enough to retrieve but large enough to contain meaningful context. Fixed-size chunking is fast but breaks sentences. Semantic chunking preserves argument structure.

**Stage 2 — Hybrid Retrieval**
Dense retrieval (embeddings) captures semantic similarity. Sparse retrieval (BM25) captures exact keyword matches. Neither alone is sufficient — hybrid is the production standard.

**Stage 3 — Reranking & Filtering**
Initial retrieval casts a wide net. Reranking re-scores candidates with richer signals. Filtering removes low-quality chunks and near-duplicates. This is the stage most tutorials skip — and it's where most hallucinations originate.

**Stage 4 — Grounded Generation**
The LLM receives only the filtered, ranked excerpts — constrained to cite its sources. The prompt is shown in full so you can see exactly what the model was asked.

---

## RAG Education Series

This is **Project 1 of 3**:
1. 🔬 RAG Transparency Lab ← you are here
2. ⚡ Classic vs Advanced RAG — side-by-side comparison
3. 🧪 Scientific Claim Verifier — hallucination detection on research papers

---

Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina)  
AI Evaluation Scientist | RLHF Specialist | Computational Neuroscientist  
[GitHub](https://github.com/Mioulin) · dezhina@gmail.com
            """)

# Starts the app. There is no __main__ guard, so this also runs whenever the
# module is imported.
demo.launch()