# Archived script: CapStoneRAG10/archived_scripts/create_ppt_presentation.py
# Provenance (from repository page): author "Developer", commit 1d10b0a,
# "Initial commit for HuggingFace Spaces - RAG Capstone Project with Qdrant Cloud".
"""Create a comprehensive PowerPoint presentation for RAG Capstone Project."""
import os
from datetime import datetime

from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
from pptx.util import Inches, Pt
def create_presentation():
    """Build the RAG Capstone Project deck and save it to disk.

    Creates a 20-slide, 10 x 7.5 inch presentation using three reusable
    slide builders (title slide, single-column bullet slide, two-column
    bullet slide), then writes the result to
    ``RAG_Capstone_Project_Presentation.pptx`` in the current directory
    and prints a short summary (slide count, file size).

    Returns:
        None. Side effect: writes the .pptx file and prints to stdout.
    """
    prs = Presentation()
    prs.slide_width = Inches(10)
    prs.slide_height = Inches(7.5)

    # Define color scheme
    DARK_BLUE = RGBColor(25, 55, 109)
    ACCENT_BLUE = RGBColor(0, 120, 215)
    LIGHT_GRAY = RGBColor(240, 240, 240)  # NOTE(review): currently unused; kept as part of the palette
    TEXT_DARK = RGBColor(33, 33, 33)

    def add_title_slide(title, subtitle=""):
        """Add a dark-blue slide with a large centered title and optional subtitle."""
        slide = prs.slides.add_slide(prs.slide_layouts[6])  # Blank layout
        background = slide.background
        fill = background.fill
        fill.solid()
        fill.fore_color.rgb = DARK_BLUE

        # Title
        title_box = slide.shapes.add_textbox(Inches(0.5), Inches(2.5), Inches(9), Inches(1.5))
        title_frame = title_box.text_frame
        title_frame.word_wrap = True
        p = title_frame.paragraphs[0]
        p.text = title
        p.font.size = Pt(54)
        p.font.bold = True
        p.font.color.rgb = RGBColor(255, 255, 255)
        p.alignment = PP_ALIGN.CENTER

        # Subtitle (only rendered when non-empty)
        if subtitle:
            subtitle_box = slide.shapes.add_textbox(Inches(0.5), Inches(4.2), Inches(9), Inches(1))
            subtitle_frame = subtitle_box.text_frame
            p = subtitle_frame.paragraphs[0]
            p.text = subtitle
            p.font.size = Pt(28)
            p.font.color.rgb = ACCENT_BLUE
            p.alignment = PP_ALIGN.CENTER
        return slide

    def add_content_slide(title, content_items):
        """Add a white slide with a dark title bar and one bullet list.

        Args:
            title: Text for the title bar.
            content_items: List of strings; each becomes one paragraph
                (empty strings act as spacers).
        """
        slide = prs.slides.add_slide(prs.slide_layouts[6])
        background = slide.background
        fill = background.fill
        fill.solid()
        fill.fore_color.rgb = RGBColor(255, 255, 255)

        # Title bar (shape type 1 == MSO_SHAPE.RECTANGLE)
        title_shape = slide.shapes.add_shape(1, Inches(0), Inches(0), Inches(10), Inches(0.8))
        title_shape.fill.solid()
        title_shape.fill.fore_color.rgb = DARK_BLUE
        title_shape.line.color.rgb = DARK_BLUE

        # Title text
        title_frame = title_shape.text_frame
        p = title_frame.paragraphs[0]
        p.text = title
        p.font.size = Pt(40)
        p.font.bold = True
        p.font.color.rgb = RGBColor(255, 255, 255)
        p.space_before = Pt(8)
        p.space_after = Pt(8)

        # Content: first item reuses the implicit empty paragraph,
        # subsequent items append new paragraphs.
        text_box = slide.shapes.add_textbox(Inches(0.7), Inches(1.2), Inches(8.6), Inches(6))
        text_frame = text_box.text_frame
        text_frame.word_wrap = True
        for i, item in enumerate(content_items):
            if i > 0:
                p = text_frame.add_paragraph()
            else:
                p = text_frame.paragraphs[0]
            p.text = item
            p.level = 0
            p.font.size = Pt(18)
            p.font.color.rgb = TEXT_DARK
            p.space_before = Pt(6)
            p.space_after = Pt(6)
        return slide

    def add_two_column_slide(title, left_title, left_items, right_title, right_items):
        """Add a white slide with a dark title bar and two side-by-side bullet columns."""
        slide = prs.slides.add_slide(prs.slide_layouts[6])
        background = slide.background
        fill = background.fill
        fill.solid()
        fill.fore_color.rgb = RGBColor(255, 255, 255)

        # Title bar (shape type 1 == MSO_SHAPE.RECTANGLE)
        title_shape = slide.shapes.add_shape(1, Inches(0), Inches(0), Inches(10), Inches(0.8))
        title_shape.fill.solid()
        title_shape.fill.fore_color.rgb = DARK_BLUE
        title_shape.line.color.rgb = DARK_BLUE
        title_frame = title_shape.text_frame
        p = title_frame.paragraphs[0]
        p.text = title
        p.font.size = Pt(40)
        p.font.bold = True
        p.font.color.rgb = RGBColor(255, 255, 255)
        p.space_before = Pt(8)
        p.space_after = Pt(8)

        # Left column: bold accent heading followed by bullet items
        left_box = slide.shapes.add_textbox(Inches(0.4), Inches(1.2), Inches(4.6), Inches(6))
        left_frame = left_box.text_frame
        left_frame.word_wrap = True
        p = left_frame.paragraphs[0]
        p.text = left_title
        p.font.size = Pt(20)
        p.font.bold = True
        p.font.color.rgb = ACCENT_BLUE
        p.space_after = Pt(8)
        for item in left_items:
            p = left_frame.add_paragraph()
            p.text = item
            p.level = 0
            p.font.size = Pt(15)
            p.font.color.rgb = TEXT_DARK
            p.space_after = Pt(6)

        # Right column: same layout, shifted to the right half
        right_box = slide.shapes.add_textbox(Inches(5.0), Inches(1.2), Inches(4.6), Inches(6))
        right_frame = right_box.text_frame
        right_frame.word_wrap = True
        p = right_frame.paragraphs[0]
        p.text = right_title
        p.font.size = Pt(20)
        p.font.bold = True
        p.font.color.rgb = ACCENT_BLUE
        p.space_after = Pt(8)
        for item in right_items:
            p = right_frame.add_paragraph()
            p.text = item
            p.level = 0
            p.font.size = Pt(15)
            p.font.color.rgb = TEXT_DARK
            p.space_after = Pt(6)
        return slide

    # Slide 1: Title Slide
    add_title_slide(
        "RAG Capstone Project",
        "Retrieval-Augmented Generation Pipeline with Advanced Evaluation"
    )

    # Slide 2: Project Overview
    add_content_slide(
        "Project Overview",
        [
            "🎯 Goal: Build a production-ready RAG system with comprehensive evaluation",
            "",
            "📊 Key Components:",
            " • Document ingestion from RAGBench datasets (15+ datasets)",
            " • Flexible chunking strategies (6 different approaches)",
            " • Multiple embedding models (8 different embeddings)",
            " • Advanced LLM-based evaluation framework",
            " • Real-time monitoring and audit trails",
            "",
            "🔧 Tech Stack: Python, Streamlit, ChromaDB, Groq LLM API, Sentence Transformers"
        ]
    )

    # Slide 3: RAG Pipeline Architecture
    add_content_slide(
        "RAG Pipeline Architecture",
        [
            "1️⃣ DATA INGESTION",
            " Load documents from 15+ RAGBench datasets (CovidQA, CUAD, FinQA, etc.)",
            "",
            "2️⃣ DOCUMENT CHUNKING",
            " Apply 6 chunking strategies to split documents into manageable pieces",
            "",
            "3️⃣ EMBEDDING & VECTORIZATION",
            " Convert chunks to dense vectors using multiple embedding models",
            "",
            "4️⃣ VECTOR STORAGE",
            " Store in ChromaDB with semantic search capabilities",
            "",
            "5️⃣ RETRIEVAL & RANKING",
            " Retrieve relevant documents based on query similarity",
            "",
            "6️⃣ RESPONSE GENERATION",
            " Use Groq LLM to generate answers grounded in retrieved documents"
        ]
    )

    # Slide 4: Chunking Strategies
    add_two_column_slide(
        "Document Chunking Strategies",
        "Chunking Methods",
        [
            "1. Dense Chunking",
            " Fixed-size chunks (512 tokens) with overlap",
            " Best for: Uniform content",
            "",
            "2. Sparse Chunking",
            " Semantic boundaries (paragraphs)",
            " Best for: Structured documents",
            "",
            "3. Hybrid Chunking",
            " Combines dense + semantic splitting",
            " Best for: Mixed content types",
        ],
        "Advanced Methods",
        [
            "4. Re-ranking Chunking",
            " Chunks with relevance re-ranking",
            " Best for: High precision retrieval",
            "",
            "5. Row-based Chunking",
            " Preserves data structure for tables",
            " Best for: Tabular data",
            "",
            "6. Entity-based Chunking",
            " Groups by semantic entities",
            " Best for: Knowledge extraction",
        ]
    )

    # Slide 5: Embedding Models
    add_content_slide(
        "Embedding Models Used",
        [
            "🔹 General Purpose Models:",
            " • sentence-transformers/all-mpnet-base-v2 (High quality, 768-dim)",
            " • sentence-transformers/all-MiniLM-L6-v2 (Fast, lightweight, 384-dim)",
            "",
            "🔹 Domain-Specific Models:",
            " • emilyalsentzer/Bio_ClinicalBERT (Clinical text, 768-dim)",
            " • microsoft/BiomedNLP-PubMedBERT (Medical abstracts, 768-dim)",
            " • allenai/specter (Academic papers, 768-dim)",
            "",
            "🔹 Multilingual Models:",
            " • sentence-transformers/multilingual-MiniLM-L12-v2 (110 languages)",
            " • sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            "",
            "🔹 API-Based Model:",
            " • gemini-embedding-001 (Google Gemini API embeddings)"
        ]
    )

    # Slide 6: RAG Evaluation Challenge
    add_content_slide(
        "The RAG Evaluation Challenge",
        [
            "❌ Why Traditional Metrics Fail?",
            " • BLEU/ROUGE only measure surface-level similarity",
            " • Don't evaluate grounding in retrieved documents",
            " • Can't detect hallucinations or factual errors",
            "",
            "✅ What We Need?",
            " • Metrics that measure document relevance to query",
            " • Metrics that measure document usage in response",
            " • Metrics that evaluate response grounding (no hallucinations)",
            " • Metrics that assess completeness of coverage",
            "",
            "🎓 Our Solution: LLM-based Evaluation Framework",
            " Inspired by RAGBench paper (arXiv:2407.11005)"
        ]
    )

    # Slide 7: TRACE Framework
    add_content_slide(
        "TRACE Framework - 4 Core Metrics",
        [
            "🔴 RELEVANCE (R)",
            " Fraction of retrieved context relevant to the query",
            " Formula: Σ Len(Relevant spans) / Σ Len(All retrieved docs)",
            "",
            "🔵 UTILIZATION (T)",
            " Fraction of retrieved context used in the response",
            " Formula: Σ Len(Used spans) / Σ Len(All retrieved docs)",
            "",
            "🟢 ADHERENCE (A)",
            " Boolean: Is the response fully grounded in documents?",
            " Detects hallucinations and unsupported claims",
            "",
            "🟡 COMPLETENESS (C)",
            " Fraction of relevant information covered by response",
            " Formula: Len(Relevant ∩ Used) / Len(Relevant)"
        ]
    )

    # Slide 8: LLM-Based Evaluation
    add_content_slide(
        "Advanced LLM-Based Evaluation",
        [
            "🤖 GPT Labeling Approach:",
            " • Use LLM (GPT-4/Groq) to annotate response sentences",
            " • Match each response sentence to supporting document spans",
            " • Detect fully supported, partially supported, and unsupported sentences",
            "",
            "📋 Evaluation Process:",
            " 1. Extract all sentences from both response and documents",
            " 2. Prompt LLM to identify relevant document sentences for query",
            " 3. Prompt LLM to map response sentences to document spans",
            " 4. Calculate support metrics at sentence and document level",
            "",
            "✨ Advantages:",
            " ✓ Semantic understanding (not just keyword matching)",
            " ✓ Detects hallucinations and contradictions",
            " ✓ Provides explainable audit trails",
            " ✓ Works across different domains and languages"
        ]
    )

    # Slide 9: Evaluation Output Metrics
    add_two_column_slide(
        "Evaluation Output & Metrics",
        "Per-Response Metrics",
        [
            "✓ Context Relevance (0-1)",
            " How much retrieved content is relevant?",
            "",
            "✓ Context Utilization (0-1)",
            " How much retrieved content was used?",
            "",
            "✓ Adherence (0-1)",
            " Is response grounded in documents?",
            "",
            "✓ Completeness (0-1)",
            " Does response cover relevant information?",
        ],
        "Aggregate Metrics",
        [
            "📊 RMSE Metrics",
            " Root Mean Squared Error for each metric",
            "",
            "📈 AUC-ROC Metrics",
            " Area Under ROC Curve for binary classification",
            "",
            "🎯 Average Score",
            " Mean of all 4 TRACE metrics",
            "",
            "📋 Detailed Audit Trail",
            " Sentence-level support information",
        ]
    )

    # Slide 10: Audit Trail & Explainability
    add_content_slide(
        "Explainability & Audit Trails",
        [
            "🔍 Detailed Audit Information Captured:",
            "",
            "✓ Original Query",
            " User's question or request",
            "",
            "✓ LLM Prompt",
            " Exact instructions sent to LLM for evaluation",
            "",
            "✓ LLM Response",
            " Complete evaluation reasoning from LLM",
            "",
            "✓ Retrieved Documents",
            " Context provided to the RAG system",
            "",
            "✓ Sentence-Level Support Map",
            " Which document spans support each response sentence",
            "",
            "🎯 Enables: Root cause analysis, model improvements, and trust building"
        ]
    )

    # Slide 11: System Architecture
    add_content_slide(
        "System Architecture Overview",
        [
            "📱 Frontend: Streamlit Web Interface",
            " • Interactive configuration panel",
            " • Real-time collection management",
            " • Chat interface with context display",
            " • Evaluation results visualization",
            "",
            "⚙️ Backend: Python Services",
            " • Vector store management (ChromaDB with SQLite indexing)",
            " • Embedding pipeline with 8 models",
            " • LLM integration (Groq API with rate limiting)",
            " • Advanced evaluation engine",
            "",
            "📚 Data Layer: ChromaDB",
            " • Persistent vector storage",
            " • SQLite metadata indexing",
            " • Multi-collection support",
            " • 4 active collections from RAGBench"
        ]
    )

    # Slide 12: Key Features
    add_two_column_slide(
        "Key System Features",
        "Data Management",
        [
            "✓ 15+ RAGBench datasets",
            "✓ Flexible chunking strategies",
            "✓ Multiple embedding models",
            "✓ Real-time collection loading",
            "✓ Batch processing capability",
            "✓ Persistent storage (ChromaDB)",
            "✓ SQLite metadata indexing",
        ],
        "Evaluation & Monitoring",
        [
            "✓ LLM-based evaluation",
            "✓ 4 TRACE metrics",
            "✓ RMSE & AUC metrics",
            "✓ Sentence-level analysis",
            "✓ Hallucination detection",
            "✓ Detailed audit trails",
            "✓ JSON export & visualization",
        ]
    )

    # Slide 13: LLM Configuration
    add_content_slide(
        "LLM Configuration & Settings",
        [
            "🔧 Groq LLM Models Supported:",
            " • meta-llama/llama-4-maverick-17b-128e-instruct",
            " • llama-3.1-8b-instant",
            " • openai/gpt-oss-120b",
            "",
            "⚙️ Configurable Parameters:",
            " • Temperature: 0.0 (deterministic for evaluation)",
            " • Max Tokens: 2048 (sufficient for detailed analysis)",
            " • Rate Limit: 30 RPM (Groq API limit)",
            " • Rate Limit Delay: 2.0 seconds (throttling)",
            "",
            "🎯 System Prompt:",
            " Specialized fact-checking and citation verification prompt",
            " Enables LLM to evaluate without additional fine-tuning"
        ]
    )

    # Slide 14: Data Flow Example
    add_content_slide(
        "Data Flow Example: A Question in RAG",
        [
            "1️⃣ USER QUERY",
            ' "What are the COVID-19 vaccine side effects?"',
            "",
            "2️⃣ RETRIEVAL",
            " ChromaDB retrieves top 5 similar chunks from CovidQA dataset",
            "",
            "3️⃣ CONTEXT PREPARATION",
            " Relevant medical documents selected and formatted",
            "",
            "4️⃣ RESPONSE GENERATION",
            " Groq LLM generates answer: 'Common side effects include...'",
            "",
            "5️⃣ EVALUATION",
            " • LLM verifies: Are claims supported by documents?",
            " • Calculates: Relevance=0.92, Utilization=0.87, Adherence=1.0, Completeness=0.95",
            "",
            "6️⃣ OUTPUT",
            " JSON with metrics, audit trail, and source documents"
        ]
    )

    # Slide 15: Use Cases
    add_content_slide(
        "Real-World Use Cases",
        [
            "📋 Document Q&A Systems",
            " Help desk, knowledge base search, document retrieval",
            "",
            "🏥 Medical Information Retrieval",
            " Clinical decision support, patient education",
            "",
            "⚖️ Legal Document Analysis",
            " Contract review, case law research, compliance checking",
            "",
            "💰 Financial Analysis",
            " SEC filing analysis, market research, investment insights",
            "",
            "🎓 Academic Research",
            " Paper indexing, literature review, citation analysis",
            "",
            "🏢 Enterprise Knowledge Management",
            " Internal document search, policy retrieval, FAQs"
        ]
    )

    # Slide 16: Performance & Results
    add_content_slide(
        "System Performance & Achievements",
        [
            "✅ Successfully Processed:",
            " • 4 collections from RAGBench datasets",
            " • Recovered and re-indexed 4M+ vector embeddings in ChromaDB",
            " • 8 different embedding models tested",
            " • 6 chunking strategies implemented and evaluated",
            "",
            "📊 Evaluation Coverage:",
            " • Batch evaluation of 100+ test cases",
            " • Per-sentence analysis with GPT labeling",
            " • Comprehensive audit trails with LLM reasoning",
            "",
            "⚡ Performance Metrics:",
            " • Sub-second retrieval latency",
            " • Batch evaluation: ~2-3 seconds per query (with GPT labeling)",
            " • Rate limiting: Controlled via Groq API settings"
        ]
    )

    # Slide 17: Technical Innovations
    add_content_slide(
        "Technical Innovations",
        [
            "🔹 Advanced ChromaDB Recovery",
            " Smart SQLite index rebuilding preserving all vector data",
            "",
            "🔹 Smart Collection Naming",
            " Automatic metadata extraction with interactive fallback UI",
            "",
            "🔹 Sentence-Level Evaluation",
            " Maps individual response sentences to document spans",
            "",
            "🔹 Multi-Metric Evaluation",
            " RMSE and AUC-ROC metrics alongside TRACE framework",
            "",
            "🔹 Explainable AI",
            " Complete audit trails showing LLM reasoning for each decision",
            "",
            "🔹 Flexible Pipeline",
            " Modular design allows easy swapping of chunking, embedding, and LLM components"
        ]
    )

    # Slide 18: Challenges & Solutions
    add_two_column_slide(
        "Challenges & Solutions",
        "Challenges Faced",
        [
            "🔴 ChromaDB Index Corruption",
            " Collection folders orphaned from SQLite",
            "",
            "🔴 Evaluation Consistency",
            " Different chunking strategies vary in effectiveness",
            "",
            "🔴 Rate Limiting",
            " Groq API has strict RPM limits",
            "",
            "🔴 Hallucination Detection",
            " Hard to detect factual errors without reference",
            "",
            "🔴 Scalability",
            " Large batch evaluations take time",
        ],
        "Solutions Implemented",
        [
            "✅ Data-Preserving Recovery",
            " Direct SQLite rebuild scripts",
            "",
            "✅ Comprehensive Testing",
            " Baseline metrics for different strategies",
            "",
            "✅ Intelligent Queuing",
            " Configurable rate limit delays",
            "",
            "✅ LLM Verification",
            " Adherence metric detects unsupported claims",
            "",
            "✅ Batch Processing",
            " Parallel processing where possible",
        ]
    )

    # Slide 19: Future Roadmap
    add_content_slide(
        "Future Development Roadmap",
        [
            "🚀 Phase 2: Production Enhancements",
            " • Distributed processing for large-scale evaluation",
            " • Caching layer for frequently accessed documents",
            " • Real-time monitoring dashboard",
            "",
            "🚀 Phase 3: Advanced Features",
            " • Multimodal RAG (images, tables, PDFs)",
            " • Knowledge graph integration",
            " • Cross-domain transfer learning",
            "",
            "🚀 Phase 4: Enterprise Features",
            " • Multi-tenant support",
            " • Fine-tuned models for specific domains",
            " • Advanced security and compliance",
            "",
            "🚀 Phase 5: Research Contributions",
            " • Publication of benchmark results",
            " • Open-source evaluation framework",
            " • Industry collaboration"
        ]
    )

    # Slide 20: Conclusion
    add_title_slide(
        "Key Takeaways",
        "Advanced RAG with Comprehensive Evaluation"
    )

    # Add content to conclusion (the slide just appended above)
    slide = prs.slides[-1]
    text_box = slide.shapes.add_textbox(Inches(1), Inches(2.5), Inches(8), Inches(4))
    text_frame = text_box.text_frame
    points = [
        "✓ Complete RAG pipeline from ingestion to evaluation",
        "✓ Flexible architecture supporting multiple chunking and embedding strategies",
        "✓ LLM-based evaluation with sentence-level grounding verification",
        "✓ Explainable AI with comprehensive audit trails",
        "✓ Production-ready implementation with real data (RAGBench datasets)",
        "✓ Addresses critical RAG evaluation challenges",
    ]
    for i, point in enumerate(points):
        if i == 0:
            p = text_frame.paragraphs[0]
        else:
            p = text_frame.add_paragraph()
        p.text = point
        p.font.size = Pt(20)
        p.font.color.rgb = TEXT_DARK
        p.space_before = Pt(12)
        p.space_after = Pt(12)

    # Save presentation
    output_file = "RAG_Capstone_Project_Presentation.pptx"
    prs.save(output_file)
    print(f"✅ Presentation created successfully: {output_file}")
    print(f"📊 Total slides: {len(prs.slides)}")
    # Ask the OS for the size instead of re-opening and reading the whole
    # file (the original left the file handle unclosed).
    print(f"💾 File size: {os.path.getsize(output_file) / 1024:.2f} KB")
# Script entry point: build and save the presentation when run directly.
if __name__ == "__main__":
    create_presentation()