# core/eval_utils.py (NEW FILE)
"""Evaluation visualization and reporting utilities."""

import json
from pathlib import Path
from typing import Any, Dict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 10)


def generate_evaluation_report(csv_path: str) -> Dict[str, Any]:
    """
    Generate a comprehensive evaluation report with visualizations.

    Args:
        csv_path: Path to evaluation CSV file

    Returns:
        Dictionary with report statistics
    """
    # Load data
    df = pd.read_csv(csv_path)

    # Derive an output prefix next to the CSV and ensure its directory exists
    output_path = csv_path.replace('.csv', '_report')
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Create figure with subplots
    fig = plt.figure(figsize=(16, 12))
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    # Plot 1: Average Total Time Comparison (Bar Chart)
    ax1 = fig.add_subplot(gs[0, 0])
    times = df[['base_total_time', 'hier_total_time']].mean()
    bars = ax1.bar(['Base-RAG', 'Hier-RAG'], times,
                   color=['#3498db', '#e74c3c'], alpha=0.8)
    ax1.set_ylabel('Time (seconds)', fontsize=11)
    ax1.set_title('Average Total Query Time', fontsize=12, fontweight='bold')
    ax1.grid(axis='y', alpha=0.3)
    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{height:.2f}s', ha='center', va='bottom', fontsize=10)

    # Plot 2: Speedup Distribution (Histogram)
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.hist(df['speedup'], bins=15, color='#2ecc71', edgecolor='black', alpha=0.7)
    ax2.axvline(1.0, color='red', linestyle='--', linewidth=2, label='No improvement')
    ax2.axvline(df['speedup'].mean(), color='blue', linestyle='--', linewidth=2,
                label=f'Mean: {df["speedup"].mean():.2f}x')
    ax2.set_xlabel('Speedup Factor', fontsize=11)
    ax2.set_ylabel('Frequency', fontsize=11)
    ax2.set_title('Speedup Distribution', fontsize=12, fontweight='bold')
    ax2.legend(fontsize=9)
    ax2.grid(alpha=0.3)

    # Plot 3: Retrieval Time Scatter
    ax3 = fig.add_subplot(gs[0, 2])
    ax3.scatter(df['base_retrieval_time'], df['hier_retrieval_time'],
                s=100, alpha=0.6, color='#9b59b6', edgecolors='black')
    max_val = max(df['base_retrieval_time'].max(), df['hier_retrieval_time'].max())
    ax3.plot([0, max_val], [0, max_val], 'r--', linewidth=2, label='Equal performance')
    ax3.set_xlabel('Base-RAG Retrieval Time (s)', fontsize=11)
    ax3.set_ylabel('Hier-RAG Retrieval Time (s)', fontsize=11)
    ax3.set_title('Retrieval Time Comparison', fontsize=12, fontweight='bold')
    ax3.legend(fontsize=9)
    ax3.grid(alpha=0.3)

    # Plot 4: Query-wise Speedup (Horizontal Bar)
    ax4 = fig.add_subplot(gs[1, :])
    queries = [f"Q{i+1}" for i in range(len(df))]
    colors = ['#2ecc71' if x > 1.0 else '#e74c3c' for x in df['speedup']]
    ax4.barh(queries, df['speedup'], color=colors, alpha=0.7, edgecolor='black')
    ax4.axvline(1.0, color='black', linestyle='--', linewidth=2, label='Break-even')
    ax4.set_xlabel('Speedup Factor', fontsize=11)
    ax4.set_ylabel('Query', fontsize=11)
    ax4.set_title('Per-Query Speedup (Green = Hier-RAG Faster)',
                  fontsize=12, fontweight='bold')
    ax4.legend(fontsize=9)
    ax4.grid(axis='x', alpha=0.3)
    # Add value labels
    for i, val in enumerate(df['speedup']):
        ax4.text(val, i, f' {val:.2f}x', va='center', fontsize=9)

    # Plot 5: Time Breakdown (Stacked Bar)
    ax5 = fig.add_subplot(gs[2, 0])
    base_gen = df['base_total_time'] - df['base_retrieval_time']
    hier_gen = df['hier_total_time'] - df['hier_retrieval_time']
    x = ['Base-RAG', 'Hier-RAG']
    retrieval = [df['base_retrieval_time'].mean(), df['hier_retrieval_time'].mean()]
    generation = [base_gen.mean(), hier_gen.mean()]
    ax5.bar(x, retrieval, label='Retrieval', color='#3498db', alpha=0.8)
    ax5.bar(x, generation, bottom=retrieval, label='Generation', color='#e67e22', alpha=0.8)
    ax5.set_ylabel('Time (seconds)', fontsize=11)
    ax5.set_title('Average Time Breakdown', fontsize=12, fontweight='bold')
    ax5.legend(fontsize=9)
    ax5.grid(axis='y', alpha=0.3)

    # Plot 6: Filter Match Analysis
    ax6 = fig.add_subplot(gs[2, 1])
    filter_columns = ['filter_level1', 'filter_level2', 'filter_level3', 'filter_doc_type']
    filter_counts = {}
    for col in filter_columns:
        if col in df.columns:
            non_none = df[col].notna() & (df[col] != 'None')
            filter_counts[col.replace('filter_', '')] = non_none.sum()
    if filter_counts:
        ax6.bar(list(filter_counts.keys()), list(filter_counts.values()),
                color='#f39c12', alpha=0.8)
        ax6.set_ylabel('Number of Queries', fontsize=11)
        ax6.set_title('Filter Application Frequency', fontsize=12, fontweight='bold')
        ax6.tick_params(axis='x', rotation=45)
        ax6.grid(axis='y', alpha=0.3)

    # Plot 7: Performance Summary (Text Box)
    ax7 = fig.add_subplot(gs[2, 2])
    ax7.axis('off')
    # Calculate statistics
    stats_text = f"""
PERFORMANCE SUMMARY
{'=' * 30}

Total Queries: {len(df)}

Base-RAG:
  Avg Retrieval: {df['base_retrieval_time'].mean():.3f}s
  Avg Total: {df['base_total_time'].mean():.3f}s

Hier-RAG:
  Avg Retrieval: {df['hier_retrieval_time'].mean():.3f}s
  Avg Total: {df['hier_total_time'].mean():.3f}s

Speedup:
  Mean: {df['speedup'].mean():.2f}x
  Median: {df['speedup'].median():.2f}x
  Max: {df['speedup'].max():.2f}x
  Min: {df['speedup'].min():.2f}x

Hier-RAG Wins: {(df['speedup'] > 1.0).sum()}/{len(df)} ({(df['speedup'] > 1.0).sum() / len(df) * 100:.1f}%)
"""
    ax7.text(0.05, 0.95, stats_text, transform=ax7.transAxes,
             fontsize=10, verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # Save figure
    plt.suptitle('RAG Performance Evaluation Report',
                 fontsize=16, fontweight='bold', y=0.995)
    plt.savefig(f'{output_path}_charts.png', dpi=300, bbox_inches='tight')
    print(f"[SUCCESS] Visualization saved: {output_path}_charts.png")

    # Generate summary statistics
    summary_stats = {
        'total_queries': len(df),
        'base_avg_total': df['base_total_time'].mean(),
        'hier_avg_total': df['hier_total_time'].mean(),
        'avg_speedup': df['speedup'].mean(),
        'median_speedup': df['speedup'].median(),
        'max_speedup': df['speedup'].max(),
        'min_speedup': df['speedup'].min(),
        'hier_wins': (df['speedup'] > 1.0).sum(),
        'win_rate': (df['speedup'] > 1.0).sum() / len(df) * 100,
        'base_avg_retrieval': df['base_retrieval_time'].mean(),
        'hier_avg_retrieval': df['hier_retrieval_time'].mean(),
        'retrieval_improvement': (df['base_retrieval_time'].mean()
                                  - df['hier_retrieval_time'].mean())
                                 / df['base_retrieval_time'].mean() * 100
    }

    # Generate markdown report
    markdown_report = f"""# Evaluation Report

## Summary Statistics
- **Total Queries Evaluated**: {summary_stats['total_queries']}
- **Hier-RAG Win Rate**: {summary_stats['win_rate']:.1f}% ({summary_stats['hier_wins']}/{summary_stats['total_queries']} queries)

## Performance Metrics

### Average Total Time
- **Base-RAG**: {summary_stats['base_avg_total']:.3f}s
- **Hier-RAG**: {summary_stats['hier_avg_total']:.3f}s
- **Improvement**: {((summary_stats['base_avg_total'] - summary_stats['hier_avg_total']) / summary_stats['base_avg_total'] * 100):.1f}%

### Average Retrieval Time
- **Base-RAG**: {summary_stats['base_avg_retrieval']:.3f}s
- **Hier-RAG**: {summary_stats['hier_avg_retrieval']:.3f}s
- **Improvement**: {summary_stats['retrieval_improvement']:.1f}%

### Speedup Statistics
- **Mean Speedup**: {summary_stats['avg_speedup']:.2f}x
- **Median Speedup**: {summary_stats['median_speedup']:.2f}x
- **Maximum Speedup**: {summary_stats['max_speedup']:.2f}x
- **Minimum Speedup**: {summary_stats['min_speedup']:.2f}x

## Key Findings

"""

    if summary_stats['avg_speedup'] > 1.2:
        markdown_report += "[SUCCESS] **Hier-RAG shows significant performance improvement** (>20% faster on average)\n\n"
    elif summary_stats['avg_speedup'] > 1.0:
        markdown_report += "[SUCCESS] **Hier-RAG shows moderate performance improvement** (faster on average)\n\n"
    else:
        markdown_report += "[WARNING] **Hier-RAG shows no performance improvement** - consider improving filter inference\n\n"

    if summary_stats['retrieval_improvement'] > 20:
        markdown_report += "[SUCCESS] **Retrieval time significantly reduced** with hierarchical filtering\n\n"

    if summary_stats['win_rate'] >= 60:
        markdown_report += f"[SUCCESS] **High win rate** ({summary_stats['win_rate']:.1f}%) indicates effective hierarchical filtering\n\n"
    else:
        markdown_report += f"[WARNING] **Low win rate** ({summary_stats['win_rate']:.1f}%) suggests filter inference needs improvement\n\n"

    markdown_report += """
## Recommendations

"""

    if summary_stats['win_rate'] < 50:
        markdown_report += """1. **Improve Auto-Inference**: Current classification accuracy is low. Consider:
   - Using LLM-based classification instead of keyword matching
   - Fine-tuning classification prompts
   - Adding more domain-specific keywords

"""
    if summary_stats['retrieval_improvement'] < 10:
        markdown_report += """2. **Optimize Filtering Strategy**: Limited retrieval improvement suggests:
   - Filters may be too broad (not reducing search space enough)
   - Consider adding more granular metadata levels
   - Evaluate if hierarchy structure matches document distribution

"""
    # Reference the chart by file name only, so the link resolves relative to
    # the report's own directory on any platform.
    markdown_report += """3. **Continue Monitoring**: Run evaluation regularly on new documents to track performance trends

## Visualization
![Evaluation Charts]({}_charts.png)

---
*Report generated automatically by Hierarchical RAG Evaluation System*
""".format(Path(output_path).name)

    # Save markdown report with UTF-8 encoding (the status tags are plain
    # ASCII, so no emoji fallback is needed).
    try:
        with open(f'{output_path}_summary.md', 'w', encoding='utf-8') as f:
            f.write(markdown_report)
        print(f"[SUCCESS] Summary saved: {output_path}_summary.md")
    except OSError as e:
        print(f"[ERROR] Failed to save summary: {e}")

    plt.close()
    return summary_stats
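

# ---------------------------------------------------------------------------
# Usage sketch (a minimal example, not part of the evaluation pipeline).
# Assumptions: the CSV passed on the command line (or the hypothetical default
# path below) exists and contains the columns this module reads —
# base_total_time, hier_total_time, base_retrieval_time, hier_retrieval_time,
# speedup, and optionally the filter_* columns.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    # Hypothetical default path; pass your own evaluation CSV as argv[1].
    csv = sys.argv[1] if len(sys.argv) > 1 else "results/evaluation.csv"
    stats = generate_evaluation_report(csv)
    # Pandas aggregates return NumPy scalars; default=float coerces the ones
    # (e.g. np.int64) that json cannot serialize directly.
    print(json.dumps(stats, indent=2, default=float))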