Spaces:

mioulin
/

rag-transparency-lab

Sleeping

App Files Files Community

rag-transparency-lab / app.py

mioulin

Create app.py

982896e verified about 1 month ago

raw

history blame contribute delete

16.5 kB

	"""
	RAG Transparency Lab — HuggingFace Gradio Space
	By Zalina Dezhina, PhD | AI Evaluation Scientist

	Visualises every step of a RAG pipeline on a user-uploaded scientific PDF.
	"""

	import os
	import io
	import numpy as np
	import gradio as gr
	import pypdf
	import pandas as pd

	from rag_pipeline.chunker import STRATEGIES, Chunk
	from rag_pipeline.embedder import embed_texts
	from rag_pipeline.retriever import retrieve
	from rag_pipeline.reranker import rerank_and_filter
	from rag_pipeline.generator import generate_answer

	# ── Helpers (per-session state is held in gr.State objects in the UI) ────────

	def score_color(score: float) -> str:
	    """Map a score to a traffic-light emoji.

	    Returns "🟢" for scores of at least 0.65, "🟡" for scores of at least
	    0.35, and "🔴" for everything else.
	    """
	    # Bands are checked from the highest cutoff down; the first cutoff the
	    # score reaches decides the emoji.
	    for cutoff, emoji in ((0.65, "🟢"), (0.35, "🟡")):
	        if score >= cutoff:
	            return emoji
	    return "🔴"


	def extract_text_from_pdf(pdf_path: str) -> str:
	    """Return the text of every page of the PDF at *pdf_path*.

	    Page texts are joined with a blank line. A page whose
	    ``extract_text()`` result is falsy contributes an empty string, so the
	    number of separators always reflects the number of pages.
	    """
	    page_texts = []
	    for page in pypdf.PdfReader(pdf_path).pages:
	        extracted = page.extract_text()
	        page_texts.append(extracted if extracted else "")
	    return "\n\n".join(page_texts)


	# ── Tab 1: Upload & Chunk ────────────────────────────────────────────────────

	def process_pdf(pdf_file, strategy_name: str):
	    """Extract text from the uploaded PDF, chunk it and summarise the result.

	    Args:
	        pdf_file: Value delivered by the ``gr.File`` input: a filesystem
	            path, an object exposing the path as ``.name``, or ``None`` when
	            nothing was uploaded.
	        strategy_name: Key into ``STRATEGIES`` selecting the chunking function.

	    Returns:
	        A ``(summary_markdown, chunk_dataframe, state)`` tuple. On success
	        ``state`` is ``(text, chunks)``. On any failure the first element is
	        a warning message and the other two elements are ``None``.
	    """
	    if pdf_file is None:
	        return "⚠️ Please upload a PDF.", None, None

	    # The upload may arrive as a plain path or as a file wrapper carrying
	    # the path in `.name` (depends on the gr.File configuration/version),
	    # so accept both instead of assuming `.name` exists.
	    if isinstance(pdf_file, (str, os.PathLike)):
	        pdf_path = pdf_file
	    else:
	        pdf_path = pdf_file.name

	    text = extract_text_from_pdf(pdf_path)
	    if not text.strip():
	        return "⚠️ Could not extract text from this PDF.", None, None

	    # A cleared dropdown or a stale value would otherwise raise KeyError.
	    if strategy_name not in STRATEGIES:
	        return "⚠️ Please choose a chunking strategy.", None, None

	    chunks = STRATEGIES[strategy_name](text)
	    # With no chunks the DataFrame below would have no "Words" column and
	    # the summary would raise KeyError; report it the same way the
	    # retrieval step does.
	    if not chunks:
	        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

	    # Build display dataframe
	    df = pd.DataFrame([
	        {
	            "ID": c.chunk_id,
	            "Words": c.word_count,
	            "Sentences": c.sentence_count,
	            "Preview": c.preview(100),
	        }
	        for c in chunks
	    ])

	    summary = (
	        f"### ✅ Document processed\n"
	        f"- Strategy: {strategy_name}\n"
	        f"- Total chunks: {len(chunks)}\n"
	        f"- Avg words/chunk: {df['Words'].mean():.0f}\n"
	        f"- Total words: {df['Words'].sum()}\n\n"
	        f"Why chunking matters: If chunks break mid-sentence or mid-argument, "
	        f"retrieval will fail — the model receives incomplete evidence. "
	        f"Semantic chunking preserves full reasoning units."
	    )

	    return summary, df, (text, chunks)


	# ── Tab 2: Retrieval Explorer ────────────────────────────────────────────────

	def run_retrieval(query: str, state, dense_weight: float):
	    """Run hybrid retrieval for *query* over the chunks produced in Tab 1.

	    Args:
	        query: The user's question.
	        state: ``(text, chunks)`` as returned by ``process_pdf``, or ``None``
	            when no document has been processed yet.
	        dense_weight: Weight given to the dense score; the sparse weight is
	            ``1 - dense_weight``.

	    Returns:
	        A ``(insight_markdown, scores_dataframe, retrieval_state)`` tuple.
	        On success ``retrieval_state`` is ``(embeddings, results, chunks)``.
	        On any failure the first element is a warning message and the other
	        two elements are ``None``.
	    """
	    # `not query` also covers a None value, which has no .strip().
	    if not query or not query.strip():
	        return "⚠️ Please enter a question.", None, None
	    if state is None:
	        return "⚠️ Please process a PDF first (Tab 1).", None, None

	    _text, chunks = state
	    if not chunks:
	        return "⚠️ No chunks found. Try a different chunking strategy.", None, None

	    embeddings = embed_texts([c.text for c in chunks])

	    # Rounded so float noise (e.g. 1.0 - 0.7 -> 0.30000000000000004) does
	    # not leak into the displayed weight.
	    sparse_weight = round(1.0 - dense_weight, 2)
	    results = retrieve(
	        query, chunks, embeddings,
	        top_k=min(10, len(chunks)),
	        dense_weight=dense_weight,
	        sparse_weight=sparse_weight,
	    )

	    df = pd.DataFrame([
	        {
	            "Rank": r.rank,
	            "Chunk ID": r.chunk.chunk_id,
	            "Dense 🔵": f"{score_color(r.dense_score)} {r.dense_score:.3f}",
	            "Sparse 🟠": f"{score_color(r.sparse_score)} {r.sparse_score:.3f}",
	            "Hybrid ⚡": f"{score_color(r.hybrid_score)} {r.hybrid_score:.3f}",
	            "Preview": r.chunk.preview(90),
	        }
	        for r in results
	    ])

	    # "\\|" emits the same backslash-pipe text as before, without relying
	    # on the invalid "\|" escape sequence.
	    insight = (
	        f"### 🔍 Retrieval results for: \"{query}\"\n"
	        f"- Dense weight: {dense_weight} \\| Sparse (BM25) weight: {sparse_weight}\n"
	        f"- Dense captures semantic meaning — finds conceptually similar text\n"
	        f"- Sparse captures exact keywords — catches specific terms\n"
	        f"- Hybrid combines both — more robust than either alone\n\n"
	        f"Notice: chunks with high dense but low sparse score "
	        f"are semantically related but don't share your exact keywords. "
	        f"Chunks with high sparse but low dense score match keywords but may be off-topic."
	    )

	    return insight, df, (embeddings, results, chunks)


	# ── Tab 3: Reranking & Filtering ─────────────────────────────────────────────

	def run_reranking(query: str, retrieval_state, threshold: float, top_n: int):
	    """Rerank the retrieved candidates and report which ones were kept.

	    Args:
	        query: The user's question.
	        retrieval_state: ``(embeddings, results, chunks)`` as returned by
	            ``run_retrieval``, or ``None`` when retrieval has not run yet.
	        threshold: Score threshold forwarded to ``rerank_and_filter``.
	        top_n: Maximum number of chunks to keep; coerced to ``int``.

	    Returns:
	        A ``(insight_markdown, decisions_dataframe, reranked)`` tuple, where
	        ``reranked`` is whatever ``rerank_and_filter`` returned. On any
	        failure the first element is a warning message and the other two
	        elements are ``None``.
	    """
	    if retrieval_state is None:
	        return "⚠️ Run retrieval first (Tab 2).", None, None
	    # `not query` also covers a None value, which has no .strip().
	    if not query or not query.strip():
	        return "⚠️ Please enter a question.", None, None

	    # Only the retrieval results are needed here.
	    _embeddings, results, _chunks = retrieval_state

	    reranked = rerank_and_filter(
	        query, results,
	        score_threshold=threshold,
	        # presumably the slider value can arrive as a float — hence int()
	        top_n=int(top_n),
	    )

	    df = pd.DataFrame([
	        {
	            "Status": "✅ KEPT" if r.kept else "❌ DROPPED",
	            "Chunk ID": r.chunk_id,
	            "Original rank": r.original_rank,
	            "New rank": r.new_rank if r.kept else "—",
	            "Rerank score": f"{score_color(r.rerank_score)} {r.rerank_score:.3f}",
	            "Filter reason": r.filter_reason or "—",
	            "Preview": r.preview(80),
	        }
	        for r in reranked
	    ])

	    kept = [r for r in reranked if r.kept]
	    dropped = [r for r in reranked if not r.kept]

	    # "\\|" emits the same backslash-pipe text as before, without relying
	    # on the invalid "\|" escape sequence.
	    insight = (
	        f"### ⚖️ Reranking & Filtering\n"
	        f"- Kept: {len(kept)} chunks \\| Dropped: {len(dropped)} chunks\n"
	        f"- Score threshold: {threshold} — chunks below this are removed\n\n"
	        f"Why rerank? The initial retrieval finds candidates quickly but noisily. "
	        f"Reranking re-scores using richer signals (keyword overlap + semantic score). "
	        f"Filtering removes low-quality and duplicate chunks before they reach the LLM.\n\n"
	        f"Key insight: Passing noisy chunks to the LLM is the #1 cause of "
	        f"hallucination in RAG systems. Clean context = sharper answers."
	    )

	    return insight, df, reranked


	# ── Tab 4: Final Answer ───────────────────────────────────────────────────────

	def run_generation(query: str, rerank_state, api_key: str):
	    """Generate the final answer from the chunks that survived reranking.

	    Args:
	        query: The user's question.
	        rerank_state: The reranked items from ``run_reranking``, or ``None``
	            when reranking has not run yet.
	        api_key: API key text; a blank value is passed on as ``None``.

	    Returns:
	        A ``(answer, sources_markdown, prompt_markdown)`` tuple. When a
	        precondition fails, the first element is a warning message and the
	        other two elements are empty strings.
	    """
	    if rerank_state is None:
	        return "⚠️ Run reranking first (Tab 3).", "", ""
	    if not query.strip():
	        return "⚠️ Please enter a question.", "", ""

	    survivors = [item for item in rerank_state if item.kept]
	    if not survivors:
	        message = (
	            "⚠️ No chunks passed the filters. "
	            "Try lowering the threshold in Tab 3 or rephrasing your question."
	        )
	        return message, "", ""

	    cleaned_key = api_key.strip() or None
	    answer, prompt = generate_answer(query, survivors, api_key=cleaned_key)

	    # One markdown entry per kept chunk; excerpts longer than 200 characters
	    # are cut and marked with an ellipsis.
	    excerpt_blocks = []
	    for idx, item in enumerate(survivors, start=1):
	        suffix = "..." if len(item.text) > 200 else ""
	        excerpt_blocks.append(
	            f"[Excerpt {idx}] (Chunk {item.chunk_id}, score {item.rerank_score:.3f})\n"
	            f"> {item.text[:200]}{suffix}\n\n"
	        )
	    sources_md = "### 📄 Source excerpts used\n\n" + "".join(excerpt_blocks)

	    prompt_display = f"```\n{prompt}\n```"

	    return answer, sources_md, prompt_display


	# ── Gradio UI ─────────────────────────────────────────────────────────────────

	# Stylesheet handed to gr.Blocks(css=...) below.
	CSS = """
	.score-high { color: #22c55e; font-weight: 500; }
	.score-mid { color: #f59e0b; }
	.score-low { color: #ef4444; }
	"""

	# Layout: header, two global inputs (API key + question), three gr.State
	# holders that carry each step's output to the next, and one tab per
	# pipeline step plus an About tab.
	# NOTE(review): nesting was reconstructed from the `with` statements of a
	# whitespace-stripped copy — confirm widget placement (in particular the
	# Accordion in Tab 4) against the deployed app.
	with gr.Blocks(
	    title="RAG Transparency Lab",
	    theme=gr.themes.Soft(primary_hue="slate", secondary_hue="blue"),
	    css=CSS,
	) as demo:

	    # ── Header ───────────────────────────────────────────────────────────
	    gr.Markdown("""
	# 🔬 RAG Transparency Lab

	See inside every step of a RAG pipeline — applied to scientific papers.

	Most RAG demos show you only the final answer. This tool shows you why that answer is good or bad — by exposing chunking, retrieval scores, reranking decisions, and the exact prompt sent to the LLM.

	Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina) · AI Evaluation Scientist
	Part of the RAG Education Series — Project 1 of 3

	---
	""")

	    # ── Global inputs ────────────────────────────────────────────────────
	    with gr.Row():
	        api_key_input = gr.Textbox(
	            label="🔑 Anthropic API Key (for Tab 4)",
	            placeholder="sk-ant-...",
	            type="password",
	            scale=2,
	        )
	        question_input = gr.Textbox(
	            label="❓ Your question about the paper",
	            placeholder="What is the main finding of this paper?",
	            scale=3,
	        )

	    # ── State ────────────────────────────────────────────────────────────
	    # Each holder is written by one step's handler and read by the next.
	    chunk_state = gr.State(None)
	    retrieval_state = gr.State(None)
	    rerank_state = gr.State(None)

	    # ── Tabs ─────────────────────────────────────────────────────────────
	    with gr.Tabs():

	        # Tab 1 ───────────────────────────────────────────────────────────
	        with gr.TabItem("📄 Step 1 — Upload & Chunk"):
	            gr.Markdown(
	                "Upload a scientific PDF and choose a chunking strategy. "
	                "See how the document is split into pieces that the retrieval system will search."
	            )
	            with gr.Row():
	                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], scale=2)
	                strategy_input = gr.Dropdown(
	                    choices=list(STRATEGIES.keys()),
	                    value="Semantic (5 sentences)",
	                    label="Chunking strategy",
	                    scale=1,
	                )
	            process_btn = gr.Button("⚙️ Process document", variant="primary")
	            chunk_summary = gr.Markdown()
	            chunk_table = gr.DataFrame(label="All chunks", wrap=True)

	            process_btn.click(
	                fn=process_pdf,
	                inputs=[pdf_input, strategy_input],
	                outputs=[chunk_summary, chunk_table, chunk_state],
	            )

	        # Tab 2 ───────────────────────────────────────────────────────────
	        with gr.TabItem("🔍 Step 2 — Retrieval Explorer"):
	            gr.Markdown(
	                "Run hybrid retrieval (dense + sparse). "
	                "See the individual scores for each candidate chunk — "
	                "and adjust the balance between semantic and keyword search."
	            )
	            dense_weight_slider = gr.Slider(
	                minimum=0.0, maximum=1.0, value=0.6, step=0.1,
	                label="Dense weight (1 - this = sparse/BM25 weight)",
	            )
	            retrieve_btn = gr.Button("🔍 Run retrieval", variant="primary")
	            retrieval_insight = gr.Markdown()
	            retrieval_table = gr.DataFrame(label="Retrieval scores (top 10)", wrap=True)

	            retrieve_btn.click(
	                fn=run_retrieval,
	                inputs=[question_input, chunk_state, dense_weight_slider],
	                outputs=[retrieval_insight, retrieval_table, retrieval_state],
	            )

	        # Tab 3 ───────────────────────────────────────────────────────────
	        with gr.TabItem("⚖️ Step 3 — Rerank & Filter"):
	            gr.Markdown(
	                "Rerank candidates with richer scoring, then filter out "
	                "low-quality and duplicate chunks. "
	                "See exactly which chunks were dropped — and why."
	            )
	            with gr.Row():
	                threshold_slider = gr.Slider(
	                    minimum=0.0, maximum=1.0, value=0.25, step=0.05,
	                    label="Score threshold (chunks below this are dropped)",
	                    scale=2,
	                )
	                top_n_slider = gr.Slider(
	                    minimum=1, maximum=10, value=5, step=1,
	                    label="Top N chunks to keep",
	                    scale=1,
	                )
	            rerank_btn = gr.Button("⚖️ Rerank & Filter", variant="primary")
	            rerank_insight = gr.Markdown()
	            rerank_table = gr.DataFrame(label="Reranking decisions", wrap=True)

	            rerank_btn.click(
	                fn=run_reranking,
	                inputs=[question_input, retrieval_state, threshold_slider, top_n_slider],
	                outputs=[rerank_insight, rerank_table, rerank_state],
	            )

	        # Tab 4 ───────────────────────────────────────────────────────────
	        with gr.TabItem("💬 Step 4 — Final Answer"):
	            gr.Markdown(
	                "Generate the final answer using only the filtered, reranked context. "
	                "See the exact prompt sent to the LLM and the source excerpts it used."
	            )
	            generate_btn = gr.Button("💬 Generate answer", variant="primary")

	            with gr.Row():
	                with gr.Column(scale=2):
	                    answer_out = gr.Markdown(label="Answer")
	                with gr.Column(scale=1):
	                    sources_out = gr.Markdown(label="Source excerpts")

	            with gr.Accordion("🔍 Prompt sent to LLM (full transparency)", open=False):
	                prompt_out = gr.Markdown()

	            generate_btn.click(
	                fn=run_generation,
	                inputs=[question_input, rerank_state, api_key_input],
	                outputs=[answer_out, sources_out, prompt_out],
	            )

	        # About tab ───────────────────────────────────────────────────────
	        # The "\\|" sequences below emit the same backslash-pipe text as
	        # before, without relying on the invalid "\|" escape sequence.
	        with gr.TabItem("📖 About & Methodology"):
	            gr.Markdown("""
	## Why RAG Transparency Matters

	Most RAG tutorials show you the output. This tool shows you the pipeline — because the output quality is determined entirely by what happens before the LLM sees any text.

	### The 4 stages explained

	Stage 1 — Chunking
	Documents must be split into pieces small enough to retrieve but large enough to contain meaningful context. Fixed-size chunking is fast but breaks sentences. Semantic chunking preserves argument structure.

	Stage 2 — Hybrid Retrieval
	Dense retrieval (embeddings) captures semantic similarity. Sparse retrieval (BM25) captures exact keyword matches. Neither alone is sufficient — hybrid is the production standard.

	Stage 3 — Reranking & Filtering
	Initial retrieval casts a wide net. Reranking re-scores candidates with richer signals. Filtering removes low-quality chunks and near-duplicates. This is the stage most tutorials skip — and it's where most hallucinations originate.

	Stage 4 — Grounded Generation
	The LLM receives only the filtered, ranked excerpts — constrained to cite its sources. The prompt is shown in full so you can see exactly what the model was asked.

	---

	## RAG Education Series

	This is Project 1 of 3:
	1. 🔬 RAG Transparency Lab ← you are here
	2. ⚡ Classic vs Advanced RAG — side-by-side comparison
	3. 🧪 Scientific Claim Verifier — hallucination detection on research papers

	---

	Built by [Zalina Dezhina, PhD](https://linkedin.com/in/zalina-dezhina)
	AI Evaluation Scientist \\| RLHF Specialist \\| Computational Neuroscientist
	[GitHub](https://github.com/Mioulin) · dezhina@gmail.com
	""")

	# Start the app when the module is executed.
	demo.launch()