"""Create simplified flow diagrams for TRACE metrics.""" import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from matplotlib.patches import FancyBboxPatch, FancyArrowPatch, Rectangle import matplotlib.patches as mpatches # Create first diagram fig, ax = plt.subplots(figsize=(12, 10)) ax.set_xlim(0, 10) ax.set_ylim(0, 12) ax.axis('off') # Color scheme COLOR_INPUT = '#E3F2FD' COLOR_PROCESS = '#BBDEFB' COLOR_DATA = '#81D4FA' COLOR_METRIC = '#FFE0B2' COLOR_OUTPUT = '#C8E6C9' def draw_box(ax, x, y, w, h, text, color, size=9): box = FancyBboxPatch((x-w/2, y-h/2), w, h, boxstyle="round,pad=0.05", edgecolor='#333', facecolor=color, linewidth=1.5) ax.add_patch(box) ax.text(x, y, text, ha='center', va='center', fontsize=size, weight='normal') def draw_arrow(ax, x1, y1, x2, y2): arrow = FancyArrowPatch((x1, y1), (x2, y2), arrowstyle='->', mutation_scale=20, color='#333', linewidth=2) ax.add_patch(arrow) # Title ax.text(5, 11.5, 'GPT Labeling Response → TRACE Metrics', ha='center', fontsize=13, weight='bold') # Step 1 draw_box(ax, 2, 10.5, 3, 0.6, 'Query + Response\n+ Documents', COLOR_INPUT, 9) draw_arrow(ax, 3.5, 10.2, 3.5, 9.7) # Step 2 draw_box(ax, 2, 9.3, 3, 0.6, 'Sentencize\n(Get keyed sentences)', COLOR_PROCESS, 9) draw_arrow(ax, 3.5, 9.0, 3.5, 8.5) # Step 3 draw_box(ax, 2, 8.1, 3, 0.6, 'Generate GPT\nLabeling Prompt', COLOR_PROCESS, 9) draw_arrow(ax, 3.5, 7.8, 3.5, 7.3) # Step 4 draw_box(ax, 2, 6.9, 3, 0.6, 'Call Groq LLM API', COLOR_PROCESS, 9) draw_arrow(ax, 3.5, 6.6, 3.5, 6.1) # Step 5 draw_box(ax, 2, 5.7, 3, 0.6, 'LLM Returns JSON\nwith sentence mapping', COLOR_DATA, 9) draw_arrow(ax, 3.5, 5.4, 3.5, 4.9) # Step 6 draw_box(ax, 2, 4.5, 3, 0.6, 'Extract Key Data:\n- relevant_keys\n- utilized_keys\n- support_info', COLOR_DATA, 8) draw_arrow(ax, 2, 4.2, 1.2, 3.5) draw_arrow(ax, 2.5, 4.2, 2.5, 3.5) draw_arrow(ax, 3, 4.2, 2.8, 3.5) draw_arrow(ax, 3.5, 4.2, 3.5, 3.5) draw_arrow(ax, 4, 4.2, 4.2, 3.5) # TRACE Metrics metrics = [ (0.8, 3, 'Relevance\n(R)', 'len(relevant)\n/ 20', COLOR_METRIC), (2.2, 3, 'Utilization\n(T)', 'len(used) /\nlen(relevant)', COLOR_METRIC), (3.6, 3, 'Completeness\n(C)', 'len(R∩T) /\nlen(R)', COLOR_METRIC), (5, 3, 'Adherence\n(A)', 'All fully\nsupported?', COLOR_METRIC), ] for x, y, title, formula, color in metrics: draw_box(ax, x, y, 1.2, 0.5, title, color, 7) draw_box(ax, x, y-0.6, 1.2, 0.4, formula, '#FFF9C4', 6) # Final output draw_arrow(ax, 0.8, 1.9, 2.5, 1.5) draw_arrow(ax, 2.2, 1.9, 2.5, 1.5) draw_arrow(ax, 3.6, 1.9, 2.5, 1.5) draw_arrow(ax, 5, 1.9, 2.5, 1.5) draw_box(ax, 2.5, 0.8, 3.5, 0.6, 'AdvancedTRACEScores\n(R, T, C, A + metadata)', COLOR_OUTPUT, 9) # Example ax.text(7, 10.5, 'Example:', fontsize=11, weight='bold') example = ''' Inputs: • relevant sentences: 3 • utilized sentences: 2 • all fully supported: Yes Results: R = 3/20 = 0.15 T = 2/3 = 0.67 C = 2/3 = 0.67 A = 1.0 (no hallucinations) Avg = 0.62 ''' ax.text(7.2, 7.5, example, fontsize=8, family='monospace', bbox=dict(boxstyle='round', facecolor='#F5F5F5', alpha=0.8), verticalalignment='top') plt.tight_layout() plt.savefig('TRACE_Metrics_Flow.png', dpi=300, bbox_inches='tight', facecolor='white') print("✅ Created: TRACE_Metrics_Flow.png") plt.close() # Create second diagram - Sentence mapping fig, ax = plt.subplots(figsize=(12, 8)) ax.set_xlim(0, 12) ax.set_ylim(0, 9) ax.axis('off') ax.text(6, 8.5, 'Sentence Support Mapping from GPT Response', ha='center', fontsize=13, weight='bold') # Documents ax.text(1.5, 7.8, 'Retrieved Documents', fontsize=10, weight='bold', color='#1976D2') docs = [ ('doc_0_s0', 'COVID-19 is respiratory disease', True), ('doc_0_s1', 'caused by virus', True), ('doc_1_s0', 'Spreads via droplets', True), ] for i, (key, text, rel) in enumerate(docs): y = 7.2 - i*0.6 color = '#C8E6C9' if rel else '#FFCDD2' draw_box(ax, 1.5, y, 2.5, 0.5, f'{key}\n{text}', color, 7) # Response ax.text(6, 7.8, 'Response + Support Info', fontsize=10, weight='bold', color='#1976D2') responses = [ ('resp_s0', 'COVID-19 is respiratory', 'doc_0_s0,s1', True), ('resp_s1', 'Spreads person-to-person', 'doc_1_s0', True), ] for i, (key, text, support, full) in enumerate(responses): y = 7.2 - i*0.6 color = '#C8E6C9' if full else '#FFCDD2' draw_box(ax, 6, y, 2.5, 0.5, f'{key}: {text}', color, 7) draw_box(ax, 9.5, y, 2, 0.5, f'Support: {support}\nFull: {"✓" if full else "✗"}', '#FFF9C4' if full else '#FFE0B2', 6) # Calculations calc_text = ''' Metric Calculations: ──────────────────── Relevant count = 3 [doc_0_s0, doc_0_s1, doc_1_s0] Utilized count = 3 [doc_0_s0, doc_0_s1, doc_1_s0] Fully supported = 2/2 responses Relevance = 3/20 = 0.15 Utilization = 3/3 = 1.00 Completeness = 3/3 = 1.00 Adherence = 1.0 (no hallucinations) Average Score = 0.79 ''' ax.text(1, 4, calc_text, fontsize=8, family='monospace', bbox=dict(boxstyle='round', facecolor='#F5F5F5', edgecolor='#666'), verticalalignment='top') # Legend ax.text(7, 4, 'Legend:', fontsize=10, weight='bold', color='#1976D2') legend_items = [ ('#C8E6C9', 'Relevant/Supported'), ('#FFCDD2', 'Not relevant/unsupported'), ('#FFF9C4', 'Fully supported'), ('#FFE0B2', 'Partially supported'), ] for i, (color, label) in enumerate(legend_items): y = 3.2 - i*0.4 rect = Rectangle((6.5, y-0.12), 0.25, 0.25, facecolor=color, edgecolor='#333') ax.add_patch(rect) ax.text(7, y, label, fontsize=8, va='center') plt.tight_layout() plt.savefig('Sentence_Mapping_Example.png', dpi=300, bbox_inches='tight', facecolor='white') print("✅ Created: Sentence_Mapping_Example.png") plt.close() print("\n" + "="*50) print("Flow Diagrams Created Successfully!") print("="*50) print("\nGenerated files:") print(" 1. TRACE_Metrics_Flow.png - 8-step process flow") print(" 2. Sentence_Mapping_Example.png - Sentence mapping details") draw_box(ax, 4.5, y_pos - 0.8, 2, 0.7, 'LLM Response\n"COVID-19 is..."', COLOR_INPUT, 8) draw_box(ax, 7.5, y_pos - 0.8, 2.5, 0.7, 'Retrieved Documents\n[Doc1, Doc2, Doc3]', COLOR_INPUT, 8) # ============================================================================ # PHASE 2: Sentencization # ============================================================================ y_pos = 14.8 ax.text(1, y_pos, 'PHASE 2: Sentencization', fontsize=12, weight='bold', color='#1976D2') draw_arrow(ax, 1.5, 15.4, 1.5, 15.0) draw_arrow(ax, 4.5, 15.4, 4.5, 15.0) draw_arrow(ax, 7.5, 15.4, 7.5, 15.0) draw_box(ax, 1.5, y_pos - 0.8, 2.5, 1, 'Query Sentences\n(Usually 1 sentence)', COLOR_PROCESS, 8) draw_box(ax, 4.5, y_pos - 0.8, 2.5, 1, 'Response Sentences\nresp_s0, resp_s1\nresp_s2...', COLOR_PROCESS, 8) draw_box(ax, 7.5, y_pos - 0.8, 2.8, 1, 'Document Sentences\ndoc_0_s0, doc_0_s1\ndoc_1_s0, doc_1_s1...', COLOR_PROCESS, 8) # ============================================================================ # PHASE 3: Prompt Generation # ============================================================================ y_pos = 13 ax.text(1, y_pos, 'PHASE 3: GPT Labeling Prompt Generation', fontsize=12, weight='bold', color='#1976D2') draw_arrow(ax, 1.5, 14.0, 2.5, 13.5) draw_arrow(ax, 4.5, 14.0, 3.5, 13.5) draw_arrow(ax, 7.5, 14.0, 4.5, 13.5) draw_box(ax, 3.5, y_pos - 0.9, 5.5, 1.5, 'GPTLabelingPromptGenerator.generate_labeling_prompt()\n\nCreates:\n- ROLE section\n- TASK OVERVIEW\n- INPUT DATA (with keys)\n- OUTPUT REQUIREMENTS\n- JSON SCHEMA', COLOR_PROCESS, 8, True) draw_arrow(ax, 3.5, y_pos - 1.4, 3.5, 12) draw_box(ax, 3.5, y_pos - 2.3, 5.8, 0.9, 'Structured Prompt with Sentencized Data\n(Ready to send to LLM)', COLOR_DATA, 8, True) # ============================================================================ # PHASE 4: LLM Call # ============================================================================ y_pos = 11 ax.text(1, y_pos, 'PHASE 4: LLM API Call (Groq)', fontsize=12, weight='bold', color='#1976D2') draw_arrow(ax, 3.5, 11.7, 3.5, 11.4) draw_box(ax, 3.5, y_pos - 0.7, 5, 0.9, 'Groq LLM\n(llm_client.generate)', '#C5CAE9', 9, True) draw_arrow(ax, 3.5, y_pos - 1.1, 3.5, 9.5) # ============================================================================ # PHASE 5: JSON Response # ============================================================================ y_pos = 9 ax.text(1, y_pos, 'PHASE 5: JSON Response Parsing', fontsize=12, weight='bold', color='#1976D2') # Show the JSON response structure json_text = '''LLM Response (JSON): { "relevance_explanation": "...", "all_relevant_sentence_keys": ["doc_0_s0", "doc_0_s1"], "overall_supported": true, "sentence_support_information": [ {"response_sentence_key": "resp_s0", "fully_supported": true, "supporting_sentence_keys": ["doc_0_s0"]}, {"response_sentence_key": "resp_s1", "fully_supported": true, "supporting_sentence_keys": ["doc_0_s1"]} ], "all_utilized_sentence_keys": ["doc_0_s0", "doc_0_s1"] }''' draw_box(ax, 3.5, y_pos - 2.2, 6.2, 3.2, json_text, COLOR_DATA, 7, False) # ============================================================================ # PHASE 6: Extract Key Data # ============================================================================ y_pos = 4.5 ax.text(1, y_pos, 'PHASE 6: Extract Data from JSON', fontsize=12, weight='bold', color='#1976D2') draw_arrow(ax, 3.5, 5.8, 3.5, 5.2) # Extract different data points draw_box(ax, 1, y_pos - 0.8, 2.2, 0.9, 'Relevant Sentences\nall_relevant_\nsentence_keys\n\n["doc_0_s0",\n "doc_0_s1"]', COLOR_METRIC, 7) draw_box(ax, 3.5, y_pos - 0.8, 2.2, 0.9, 'Utilized Sentences\nall_utilized_\nsentence_keys\n\n["doc_0_s0",\n "doc_0_s1"]', COLOR_METRIC, 7) draw_box(ax, 6, y_pos - 0.8, 2.2, 0.9, 'Support Info\nsentence_\nsupport_\ninformation\n\n[{...}, {...}]', COLOR_METRIC, 7) draw_box(ax, 8.5, y_pos - 0.8, 2.2, 0.9, 'Overall Support\noverall_\nsupported\n\ntrue/false', COLOR_METRIC, 7) # ============================================================================ # PHASE 7: Calculate TRACE Metrics # ============================================================================ y_pos = 2.2 ax.text(1, y_pos, 'PHASE 7: Calculate TRACE Metrics', fontsize=12, weight='bold', color='#1976D2') # Draw arrows from extracted data to metrics draw_arrow(ax, 1, 3.7, 1.5, 2.9) draw_arrow(ax, 3.5, 3.7, 3.5, 2.9) draw_arrow(ax, 6, 3.7, 5.5, 2.9) draw_arrow(ax, 8.5, 3.7, 7, 2.9) # Four TRACE metrics metrics = [ ('Relevance (R)\nlen(relevant)/20', 1.5, '#FF6B6B'), ('Utilization (T)\nlen(used)/\nlen(relevant)', 4, '#4ECDC4'), ('Completeness (C)\nlen(R∩T)/\nlen(R)', 6.5, '#45B7D1'), ('Adherence (A)\nall fully_\nsupported?', 9, '#FFA07A'), ] for name, x, color in metrics: draw_box(ax, x, y_pos - 0.8, 1.8, 1.1, name, color, 8, True) # ============================================================================ # PHASE 8: Output # ============================================================================ y_pos = 0.2 ax.text(1, y_pos, 'PHASE 8: Final Output', fontsize=12, weight='bold', color='#1976D2') # Draw arrows from metrics to output draw_arrow(ax, 1.5, 1.4, 3, 0.9) draw_arrow(ax, 4, 1.4, 5, 0.9) draw_arrow(ax, 6.5, 1.4, 7, 0.9) draw_arrow(ax, 9, 1.4, 8.5, 0.9) draw_box(ax, 5.5, y_pos - 0.6, 4.5, 0.8, 'AdvancedTRACEScores Object\n(R, T, C, A values + metadata)', COLOR_OUTPUT, 9, True) # ============================================================================ # Side Panel: Example Values # ============================================================================ ax.text(11.5, 17.3, 'Example Calculation', fontsize=12, weight='bold', color='#1976D2') example_text = '''Given: • Relevant sentences: 2 all_relevant_sentence_keys: ["doc_0_s0", "doc_0_s1"] • Utilized sentences: 2 all_utilized_sentence_keys: ["doc_0_s0", "doc_0_s1"] • Supported sentences: 2/2 All with fully_supported=true TRACE Metrics: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ R = 2 / 20 = 0.10 → 10% of docs relevant T = 2 / 2 = 1.00 → 100% of relevant used C = 2 / 2 = 1.00 → 100% relevant info used A = 1.00 → No hallucinations Average = (0.10+1+1+1)/4 = 0.775 ''' draw_box(ax, 11.5, 13.5, 4.5, 6.5, example_text, '#F5F5F5', 7, False) # ============================================================================ # Key Formula Reference # ============================================================================ ax.text(11.5, 6.5, 'Key Formulas', fontsize=12, weight='bold', color='#1976D2') formulas_text = '''Relevance (R): R = |relevant_sentences| / 20 Utilization (T): T = |utilized_sentences| / |relevant_sentences| Completeness (C): C = |relevant ∩ utilized| / |relevant| Adherence (A): A = 1.0 if all fully_supported else 0.0 ''' draw_box(ax, 11.5, 4.2, 4.5, 3.8, formulas_text, '#F5F5F5', 8, False) plt.tight_layout() plt.savefig('TRACE_Metrics_Calculation_Flow.png', dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none') print("✅ Flow diagram created: TRACE_Metrics_Calculation_Flow.png") print("📊 Shows 8-phase process from input to TRACE metrics") plt.close() # Create a second diagram showing the detailed sentence mapping fig2, ax2 = plt.subplots(1, 1, figsize=(14, 10)) ax2.set_xlim(0, 14) ax2.set_ylim(0, 11) ax2.axis('off') ax2.text(7, 10.5, 'Sentence Mapping & Support Detection', ha='center', fontsize=14, weight='bold', color='#212121') # Document sentences ax2.text(1, 9.8, 'Retrieved Documents (Sentencized)', fontsize=11, weight='bold', color='#1976D2') doc_sentences = [ ('doc_0_s0', 'COVID-19 is a respiratory disease', True), ('doc_0_s1', 'caused by SARS-CoV-2', True), ('doc_1_s0', 'The virus spreads via droplets', True), ('doc_2_s0', 'Vaccines prevent infection', False), ] for i, (key, text, relevant) in enumerate(doc_sentences): y = 9.2 - (i * 0.6) color = '#C8E6C9' if relevant else '#FFCDD2' draw_box(ax2, 1, y, 2.5, 0.5, f'{key}\n{text}', color, 7) # Arrow in middle for i in range(4): y = 9.2 - (i * 0.6) draw_arrow(ax2, 2.8, y, 4.2, y - 2.5, color='#1976D2') # Response sentences with support mapping ax2.text(7, 9.8, 'Response Sentences (with Support)', fontsize=11, weight='bold', color='#1976D2') response_sentences = [ ('resp_s0', 'COVID-19 is a respiratory disease', 'doc_0_s0, doc_0_s1', True), ('resp_s1', 'It spreads through droplets', 'doc_1_s0', True), ] for i, (key, text, support, fully_supported) in enumerate(response_sentences): y = 9.2 - (i * 1.2) color = '#C8E6C9' if fully_supported else '#FFCDD2' # Response sentence box draw_box(ax2, 7, y, 2.8, 0.5, f'{key}: {text}', color, 7) # Support information draw_box(ax2, 10, y, 2.5, 0.5, f'Supports: {support}\nFully: {"✓" if fully_supported else "✗"}', '#FFF9C4' if fully_supported else '#FFE0B2', 7) # Connect with arrow draw_arrow(ax2, 8.8, y, 8.8, y, color='#757575') # Summary stats ax2.text(1, 5.8, 'Metric Calculations', fontsize=11, weight='bold', color='#1976D2') stats_text = '''Relevant Sentences: doc_0_s0 ✓, doc_0_s1 ✓, doc_1_s0 ✓ Count: 3 Relevance (R) = 3/20 = 0.15 Utilized Sentences: doc_0_s0, doc_0_s1, doc_1_s0 Count: 3 Utilization (T) = 3/3 = 1.00 Completeness (C) = 3/3 = 1.00 Adherence (A) = 1.0 (All 2 sentences fully supported) Average = (0.15 + 1.0 + 1.0 + 1.0) / 4 = 0.79 ''' draw_box(ax2, 3.5, 3.5, 5.2, 4.2, stats_text, '#E3F2FD', 8) # Legend ax2.text(9.5, 5.8, 'Legend', fontsize=11, weight='bold', color='#1976D2') legend_items = [ ('#C8E6C9', 'Relevant / Fully Supported'), ('#FFCDD2', 'Not Relevant / Not Supported'), ('#FFF9C4', 'Fully Supported'), ('#FFE0B2', 'Partially Supported'), ] for i, (color, label) in enumerate(legend_items): y = 5.2 - (i * 0.5) rect = mpatches.Rectangle((9.2, y - 0.15), 0.3, 0.3, facecolor=color, edgecolor='#424242') ax2.add_patch(rect) ax2.text(9.7, y, label, fontsize=8, va='center') plt.tight_layout() plt.savefig('Sentence_Support_Mapping.png', dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none') print("✅ Mapping diagram created: Sentence_Support_Mapping.png") print("📊 Shows sentence-level support detection and metric calculation") print("\n" + "="*60) print("Flow Diagrams Created Successfully!") print("="*60) print("\nFiles generated:") print(" 1. TRACE_Metrics_Calculation_Flow.png") print(" 2. Sentence_Support_Mapping.png")