"""Create a comprehensive architecture diagram for RAG Capstone Project.""" import matplotlib.pyplot as plt import matplotlib.patches as mpatches from matplotlib.patches import FancyBboxPatch, FancyArrowPatch import matplotlib.lines as mlines from matplotlib.patches import Rectangle import numpy as np # Create figure with larger size for detailed diagram fig, ax = plt.subplots(1, 1, figsize=(18, 14)) ax.set_xlim(0, 20) ax.set_ylim(0, 16) ax.axis('off') # Color palette COLOR_INPUT = '#E8F4F8' COLOR_PROCESS = '#B3E5FC' COLOR_STORAGE = '#81D4FA' COLOR_EVAL = '#FFE0B2' COLOR_JUDGE = '#FFCC80' COLOR_OUTPUT = '#C8E6C9' COLOR_ARROW = '#424242' COLOR_TEXT = '#212121' def draw_box(ax, x, y, width, height, text, color, fontsize=9, bold=False): """Draw a rounded rectangle box with text.""" box = FancyBboxPatch( (x - width/2, y - height/2), width, height, boxstyle="round,pad=0.1", edgecolor='#424242', facecolor=color, linewidth=2 ) ax.add_patch(box) weight = 'bold' if bold else 'normal' ax.text(x, y, text, ha='center', va='center', fontsize=fontsize, weight=weight, color=COLOR_TEXT, wrap=True) def draw_arrow(ax, x1, y1, x2, y2, label='', style='->', color=COLOR_ARROW, linewidth=2.5): """Draw an arrow between two points.""" arrow = FancyArrowPatch( (x1, y1), (x2, y2), arrowstyle=style, mutation_scale=25, color=color, linewidth=linewidth ) ax.add_patch(arrow) if label: mid_x, mid_y = (x1 + x2) / 2, (y1 + y2) / 2 ax.text(mid_x + 0.3, mid_y + 0.2, label, fontsize=8, bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8)) def draw_section_header(ax, x, y, text, color): """Draw a section header.""" header = Rectangle((x - 3, y - 0.3), 6, 0.6, facecolor=color, edgecolor='#424242', linewidth=2) ax.add_patch(header) ax.text(x, y, text, ha='center', va='center', fontsize=11, weight='bold', color='white') # Title ax.text(10, 15.2, 'RAG Capstone Project - Architecture Diagram', ha='center', va='top', fontsize=18, weight='bold', color=COLOR_TEXT) ax.text(10, 14.7, 'Collection Creation & TRACE Evaluation Framework', ha='center', va='top', fontsize=12, style='italic', color='#666') # ============================================================================ # SECTION 1: DATA INGESTION & COLLECTION CREATION (Left Side) # ============================================================================ draw_section_header(ax, 3.5, 13.8, '1. COLLECTION CREATION', COLOR_INPUT) # Data sources draw_box(ax, 1.5, 12.8, 2, 0.8, 'RAGBench\nDatasets\n(15+)', COLOR_INPUT, 8, True) draw_box(ax, 5.5, 12.8, 2, 0.8, 'User\nDocuments', COLOR_INPUT, 8, True) # Data loading draw_arrow(ax, 1.5, 12.4, 3.5, 11.5) draw_arrow(ax, 5.5, 12.4, 3.5, 11.5) draw_box(ax, 3.5, 11.1, 2.5, 0.8, 'Data Loader\n(dataset_loader.py)', COLOR_PROCESS, 9, True) # Chunking strategies draw_arrow(ax, 3.5, 10.7, 3.5, 9.9) draw_section_header(ax, 3.5, 9.6, 'Chunking Strategies', '#FFD54F') chunking_strategies = [ ('Dense', 0.8), ('Sparse', 2.0), ('Hybrid', 3.2), ('Re-rank', 4.4), ('Row-based', 5.6), ('Entity', 6.8) ] for i, (name, x_offset) in enumerate(chunking_strategies): x = 0.5 + x_offset draw_box(ax, x, 8.8, 1.2, 0.7, name, '#FFF9C4', 8) if i < 3: draw_arrow(ax, x, 8.45, x, 7.9) else: draw_arrow(ax, x, 8.45, x, 7.9) # Embedding models draw_section_header(ax, 3.5, 7.6, 'Embedding Models', '#BBDEFB') embedding_models = [ ('MPNet', 0.5), ('MiniLM', 1.5), ('BioClinical\nBERT', 2.8), ('PubMedBERT', 4.2), ('Specter', 5.4), ('Multilingual', 6.6) ] for name, x_offset in embedding_models: x = 0.5 + x_offset draw_box(ax, x, 6.8, 1.1, 0.8, name, COLOR_STORAGE, 7) draw_arrow(ax, x, 6.4, x - 0.3, 5.7, color='#1976D2') # Vector Storage draw_arrow(ax, 2, 5.3, 1.5, 4.5, color='#1976D2') draw_arrow(ax, 3, 5.3, 3.5, 4.5, color='#1976D2') draw_arrow(ax, 4, 5.3, 4, 4.5, color='#1976D2') draw_arrow(ax, 5, 5.3, 5.5, 4.5, color='#1976D2') draw_arrow(ax, 6, 5.3, 6.5, 4.5, color='#1976D2') draw_arrow(ax, 7, 5.3, 7, 4.5, color='#1976D2') draw_box(ax, 4.5, 3.8, 3.5, 1, 'ChromaDB Vector Store\n(Persistent Storage)', COLOR_STORAGE, 10, True) draw_box(ax, 1.5, 2.5, 2.2, 0.8, 'SQLite Index\n(Metadata)', '#E0BEE7', 9) draw_arrow(ax, 3.2, 3.4, 1.5, 2.9) # Collection Output draw_arrow(ax, 4.5, 3.3, 4.5, 2.4) draw_box(ax, 4.5, 1.8, 3.5, 0.8, 'Named Collections\n(Collection Registry)', COLOR_OUTPUT, 10, True) ax.text(4.5, 0.9, 'Ready for Chat & Evaluation', ha='center', fontsize=8, style='italic') # ============================================================================ # SECTION 2: TRACE EVALUATION FRAMEWORK (Center) # ============================================================================ draw_section_header(ax, 10, 13.8, '2. EVALUATION FRAMEWORK (TRACE)', COLOR_EVAL) # Query & Response Input draw_box(ax, 8, 12.5, 1.8, 0.8, 'User\nQuery', COLOR_INPUT, 9, True) draw_box(ax, 12, 12.5, 1.8, 0.8, 'LLM\nResponse', COLOR_INPUT, 9, True) draw_arrow(ax, 8, 12.1, 9.5, 11.3) draw_arrow(ax, 12, 12.1, 10.5, 11.3) draw_box(ax, 10, 10.9, 3.5, 0.8, 'Evaluation Input Preparation', COLOR_PROCESS, 9, True) # TRACE Metrics (4 columns) draw_arrow(ax, 10, 10.5, 10, 9.7) # Create 4 TRACE metric boxes metrics = [ ('RELEVANCE\n(R)', 7.5, '#FF6B6B', 'Fraction of retrieved\ncontext relevant\nto query'), ('UTILIZATION\n(T)', 9.5, '#4ECDC4', 'Fraction of retrieved\ncontext used in\nresponse'), ('ADHERENCE\n(A)', 11.5, '#45B7D1', 'Is response fully\ngrounded in\ndocuments?'), ('COMPLETENESS\n(C)', 13.5, '#FFA07A', 'Fraction of relevant\ninfo covered by\nresponse') ] metric_y = 9.2 for name, x, color, desc in metrics: draw_box(ax, x, metric_y + 0.5, 1.6, 0.8, name, color, 10, True) draw_box(ax, x, metric_y - 0.8, 1.8, 1.2, desc, '#F5F5F5', 8) draw_arrow(ax, x, metric_y + 0.1, x, metric_y - 0.2) # Calculation formulas formula_y = 6.8 ax.text(7.5, formula_y + 0.3, 'R = Σ Len(Relevant)\n/ Σ Len(All Docs)', ha='center', fontsize=7, bbox=dict(boxstyle='round,pad=0.3', facecolor='#FFE0B2', alpha=0.7), family='monospace') ax.text(9.5, formula_y + 0.3, 'T = Σ Len(Used)\n/ Σ Len(All Docs)', ha='center', fontsize=7, bbox=dict(boxstyle='round,pad=0.3', facecolor='#FFE0B2', alpha=0.7), family='monospace') ax.text(11.5, formula_y + 0.3, 'A = Boolean\n(Hallucination\nDetection)', ha='center', fontsize=7, bbox=dict(boxstyle='round,pad=0.3', facecolor='#FFE0B2', alpha=0.7), family='monospace') ax.text(13.5, formula_y + 0.3, 'C = Len(R ∩ T)\n/ Len(R)', ha='center', fontsize=7, bbox=dict(boxstyle='round,pad=0.3', facecolor='#FFE0B2', alpha=0.7), family='monospace') # Arrows converging to evaluation draw_arrow(ax, 7.5, 6.3, 9.5, 5.7, color='#E91E63') draw_arrow(ax, 9.5, 6.3, 9.5, 5.7, color='#E91E63') draw_arrow(ax, 11.5, 6.3, 10.5, 5.7, color='#E91E63') draw_arrow(ax, 13.5, 6.3, 11.5, 5.7, color='#E91E63') # ============================================================================ # SECTION 3: JUDGE - LLM-BASED EVALUATION (Right Side) # ============================================================================ draw_section_header(ax, 15.5, 13.8, '3. JUDGE EVALUATION', COLOR_JUDGE) # Judge component draw_box(ax, 15.5, 11.5, 3.5, 1.2, 'GPT Labeling\nJudge\n(advanced_rag_evaluator.py)', COLOR_JUDGE, 10, True) draw_arrow(ax, 12.5, 11, 14, 11.5) ax.text(12.8, 11.3, 'Retrieved\nDocs', fontsize=8, ha='center', bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.8)) draw_arrow(ax, 13.5, 11, 14, 11.5) ax.text(13.8, 11.3, 'Question\n& Response', fontsize=8, ha='center', bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.8)) # Sentencizer draw_arrow(ax, 15.5, 10.9, 15.5, 10.2) draw_box(ax, 15.5, 9.7, 3.5, 0.8, 'DocumentSentencizer\n(Split into sentences with keys)', COLOR_PROCESS, 8) # Prompt Generator draw_arrow(ax, 15.5, 9.3, 15.5, 8.6) draw_box(ax, 15.5, 8.1, 3.5, 0.8, 'GPTLabelingPromptGenerator\n(Create structured prompt)', COLOR_PROCESS, 8) # LLM Call draw_arrow(ax, 15.5, 7.7, 15.5, 7.0) draw_box(ax, 15.5, 6.3, 3.2, 1, 'Groq LLM\nAPI Call\n(llm_client.py)', '#C5CAE9', 10, True) # JSON Parsing draw_arrow(ax, 15.5, 5.8, 15.5, 5.1) draw_box(ax, 15.5, 4.6, 3.5, 0.8, 'JSON Response Parsing\n(Extract metrics & mapping)', COLOR_PROCESS, 8) # Output metrics draw_arrow(ax, 15.5, 4.2, 15.5, 3.5) output_metrics = [ 'Sentence Support Map', 'RMSE Metrics', 'AUC-ROC Metrics', 'Audit Trail' ] draw_section_header(ax, 15.5, 3.1, 'Judge Output', COLOR_OUTPUT) for i, metric in enumerate(output_metrics): y = 2.5 - (i * 0.5) draw_box(ax, 15.5, y, 3, 0.4, f'• {metric}', COLOR_OUTPUT, 8) # ============================================================================ # SECTION 4: INTEGRATION & DATA FLOW # ============================================================================ # Arrow from collection to evaluation draw_arrow(ax, 6.5, 1.8, 8, 11, style='->', color='#00796B', linewidth=2) ax.text(6.8, 6.5, 'Loaded\nCollection', fontsize=9, weight='bold', ha='center', bbox=dict(boxstyle='round,pad=0.4', facecolor='#B2DFDB', alpha=0.9)) # Arrow from TRACE to Judge draw_arrow(ax, 11.5, 5.3, 13.5, 6.3, style='->', color='#C62828', linewidth=2) ax.text(11.8, 5.7, 'Metric\nCalculation', fontsize=9, weight='bold', ha='center', bbox=dict(boxstyle='round,pad=0.4', facecolor='#FFCDD2', alpha=0.9)) # Final output draw_arrow(ax, 15.5, 0.5, 10, -0.5) draw_box(ax, 10, -1.2, 5, 1, 'Comprehensive Evaluation Report\n(Metrics + Audit Trail + JSON Export)', COLOR_OUTPUT, 11, True) # ============================================================================ # Legend and Notes # ============================================================================ # Legend position legend_y = -2.5 ax.text(0.5, legend_y, 'Legend:', fontsize=10, weight='bold') legend_items = [ (COLOR_INPUT, 'Input Data'), (COLOR_PROCESS, 'Processing'), (COLOR_STORAGE, 'Storage'), (COLOR_EVAL, 'Evaluation'), (COLOR_JUDGE, 'Judge'), (COLOR_OUTPUT, 'Output') ] for i, (color, label) in enumerate(legend_items): x = 0.5 + (i % 3) * 3.5 y = legend_y - 0.6 - ((i // 3) * 0.5) rect = Rectangle((x, y - 0.15), 0.3, 0.3, facecolor=color, edgecolor='#424242') ax.add_patch(rect) ax.text(x + 0.5, y, label, fontsize=9, va='center') # Key features features_y = -4.2 ax.text(0.5, features_y, 'Key Features:', fontsize=10, weight='bold') features = [ '✓ 6 Chunking Strategies ✓ 8 Embedding Models ✓ 15+ RAGBench Datasets', '✓ Sentence-Level Support Mapping ✓ Hallucination Detection ✓ Complete Audit Trail' ] for i, feature in enumerate(features): ax.text(0.5, features_y - 0.5 - (i * 0.4), feature, fontsize=9) plt.tight_layout() plt.savefig('RAG_Architecture_Diagram.png', dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none') print("✅ Architecture diagram created: RAG_Architecture_Diagram.png") print(f"📊 Diagram size: 18x14 inches at 300 DPI") print("📋 Includes: Collection Creation → TRACE Metrics → Judge Evaluation") plt.close() # ============================================================================ # Create a second detailed diagram focusing on data flow # ============================================================================ fig2, ax2 = plt.subplots(1, 1, figsize=(16, 12)) ax2.set_xlim(0, 16) ax2.set_ylim(0, 14) ax2.axis('off') # Title ax2.text(8, 13.5, 'Detailed Data Flow: From Query to Evaluation', ha='center', va='top', fontsize=16, weight='bold', color=COLOR_TEXT) # Define stages stages = [ { 'y': 12.5, 'title': '1. Query Processing', 'items': [ ('User Query', 'What is COVID-19?', 1), ('Collection Selected', 'CovidQA Dataset', 6), ] }, { 'y': 11, 'title': '2. Retrieval', 'items': [ ('Vector Search', 'Query → Embeddings\n↓\nChromaDB Search', 1), ('Top-K Results', 'Retrieved 5 documents\nwith similarity scores', 6), ] }, { 'y': 9.3, 'title': '3. Response Generation', 'items': [ ('Context', 'Ranked documents\nas context', 1), ('LLM Generation', 'Groq LLM\n(llm_client.py)', 6), ('Response', 'Generated answer\ngrounded in docs', 11), ] }, { 'y': 7.4, 'title': '4. Evaluation Setup', 'items': [ ('Sentencize', 'Split docs &\nresponse into\nsentences', 1), ('Create Keys', 'doc_0_s0, doc_0_s1\nresp_s0, resp_s1...', 5), ('Generate Prompt', 'GPTLabelingPrompt\nGenerator', 10), ] }, { 'y': 5.3, 'title': '5. Judge Evaluation', 'items': [ ('LLM Prompt', 'Send prompt +\nsentencized data\nto Groq', 1), ('LLM Response', 'JSON with\nsentence mapping\n& support info', 6), ] }, { 'y': 3.3, 'title': '6. Metric Calculation', 'items': [ ('Parse JSON', 'Extract support\nmapping', 1), ('Calculate TRACE', 'R, T, A, C\nmetrics', 5), ('RMSE & AUC', 'Additional\nmetrics', 9), ] }, { 'y': 1.3, 'title': '7. Output', 'items': [ ('Report', 'JSON with all metrics\n+ audit trail', 3), ('Visualization', 'Charts & tables\nin Streamlit', 8), ('Export', 'Download results', 13), ] } ] # Draw stages for stage in stages: # Stage header header_rect = Rectangle((0.2, stage['y'] - 0.15), 15.6, 0.3, facecolor='#1976D2', alpha=0.8) ax2.add_patch(header_rect) ax2.text(0.5, stage['y'], stage['title'], fontsize=11, weight='bold', color='white', va='center') # Stage items for item in stage['items']: if len(item) == 3: title, desc, x = item box = FancyBboxPatch( (x - 1.8, stage['y'] - 0.7), 3.6, 0.5, boxstyle="round,pad=0.08", edgecolor='#424242', facecolor='#E3F2FD', linewidth=1.5 ) ax2.add_patch(box) ax2.text(x, stage['y'] - 0.45, f'{title}\n{desc}', ha='center', va='center', fontsize=8, weight='bold') # Arrow to next stage if stage != stages[-1]: next_y = stage['y'] - 1 arrow = FancyArrowPatch( (8, stage['y'] - 0.85), (8, next_y + 0.15), arrowstyle='->', mutation_scale=30, color='#1976D2', linewidth=2.5 ) ax2.add_patch(arrow) # Add code file references on the side code_refs = [ ('dataset_loader.py', 12.5), ('vector_store.py', 11), ('llm_client.py', 9.3), ('chunking_strategies.py\nembedding_models.py', 7.4), ('advanced_rag_evaluator.py', 5.3), ('trace_evaluator.py', 3.3), ('streamlit_app.py', 1.3), ] for ref, y in code_refs: ax2.text(15.2, y, ref, fontsize=7, style='italic', ha='right', bbox=dict(boxstyle='round,pad=0.3', facecolor='#F5F5F5', edgecolor='#757575', linewidth=1)) plt.tight_layout() plt.savefig('RAG_Data_Flow_Diagram.png', dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none') print("\n✅ Data flow diagram created: RAG_Data_Flow_Diagram.png") print(f"📊 Diagram size: 16x12 inches at 300 DPI") print("📋 Shows: 7-step data flow from query to evaluation results") plt.close() print("\n" + "="*60) print("Architecture Diagrams Created Successfully!") print("="*60) print("\n📁 Files Generated:") print(" 1. RAG_Architecture_Diagram.png - Full system architecture") print(" 2. RAG_Data_Flow_Diagram.png - Detailed data flow diagram") print("\n✨ Both diagrams are ready for presentations and documentation!")