Spaces:
Sleeping
Sleeping
| # core/eval_utils.py (NEW FILE) | |
| """Evaluation visualization and reporting utilities.""" | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from pathlib import Path | |
| from typing import Dict, Any | |
| import json | |
# Module-level plotting defaults, applied when this module is imported:
# the seaborn "whitegrid" style and a default matplotlib figure size of (14, 10).
# generate_evaluation_report() passes its own figsize=(16, 12), so the size
# default below only affects figures created without an explicit figsize.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 10)
| # core/eval_utils.py - Fix the encoding issue | |
def generate_evaluation_report(csv_path: str) -> Dict[str, Any]:
    """Generate an evaluation report (charts + Markdown summary) from a CSV.

    Two files are written next to the CSV, named after it:
    ``<stem>_report_charts.png`` and ``<stem>_report_summary.md``.

    Args:
        csv_path: Path to the evaluation CSV file. It must contain the columns
            ``base_total_time``, ``hier_total_time``, ``base_retrieval_time``,
            ``hier_retrieval_time`` and ``speedup``. The optional columns
            ``filter_level1``, ``filter_level2``, ``filter_level3`` and
            ``filter_doc_type`` feed the filter-frequency chart when present.

    Returns:
        Dictionary of summary statistics. All values are built-in ``int`` /
        ``float`` objects so the result can be passed straight to
        ``json.dumps``.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    csv_file = Path(csv_path)
    df = pd.read_csv(csv_file)

    # Fail before any figure is created so nothing has to be cleaned up.
    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"Evaluation CSV is missing required columns: {missing}")

    # Only strip a trailing ".csv" suffix; a plain str.replace would also
    # rewrite ".csv" occurrences inside directory names.
    stem = csv_file.stem if csv_file.suffix.lower() == '.csv' else csv_file.name
    report_name = f"{stem}_report"
    chart_path = csv_file.with_name(f"{report_name}_charts.png")
    summary_path = csv_file.with_name(f"{report_name}_summary.md")
    chart_path.parent.mkdir(parents=True, exist_ok=True)

    summary_stats = _compute_summary_stats(df)

    _render_charts(df, summary_stats, chart_path)
    print(f"[SUCCESS] Visualization saved: {chart_path}")

    # The image is referenced by bare file name because the Markdown file is
    # written to the same directory as the chart.
    markdown_report = _build_markdown_report(summary_stats, chart_path.name)

    # Saving the summary is best-effort: a failure is reported, not raised,
    # so the caller still receives the computed statistics.
    try:
        summary_path.write_text(markdown_report, encoding='utf-8')
    except OSError as e:
        print(f"[WARNING] Could not save markdown summary: {str(e)}")
    else:
        print(f"[SUCCESS] Summary saved: {summary_path}")

    return summary_stats


# Columns that every evaluation CSV must provide.
_REQUIRED_COLUMNS = (
    'base_total_time',
    'hier_total_time',
    'base_retrieval_time',
    'hier_retrieval_time',
    'speedup',
)

# Optional metadata-filter columns counted in the filter-frequency chart.
_FILTER_COLUMNS = ('filter_level1', 'filter_level2', 'filter_level3', 'filter_doc_type')


def _percent_reduction(baseline: float, value: float) -> float:
    """Return how much smaller ``value`` is than ``baseline``, in percent.

    Returns NaN when the baseline is zero or NaN instead of dividing by it.
    """
    if pd.isna(baseline) or baseline == 0:
        return float('nan')
    return (baseline - value) / baseline * 100


def _compute_summary_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """Compute the summary statistics dictionary using built-in numeric types."""
    speedup = df['speedup']
    total_queries = int(len(df))
    hier_wins = int((speedup > 1.0).sum())
    base_avg_retrieval = float(df['base_retrieval_time'].mean())
    hier_avg_retrieval = float(df['hier_retrieval_time'].mean())

    return {
        'total_queries': total_queries,
        'base_avg_total': float(df['base_total_time'].mean()),
        'hier_avg_total': float(df['hier_total_time'].mean()),
        'avg_speedup': float(speedup.mean()),
        'median_speedup': float(speedup.median()),
        'max_speedup': float(speedup.max()),
        'min_speedup': float(speedup.min()),
        'hier_wins': hier_wins,
        # Guard the empty-CSV case instead of dividing by zero.
        'win_rate': hier_wins / total_queries * 100 if total_queries else 0.0,
        'base_avg_retrieval': base_avg_retrieval,
        'hier_avg_retrieval': hier_avg_retrieval,
        'retrieval_improvement': _percent_reduction(base_avg_retrieval, hier_avg_retrieval),
    }


def _summary_text(stats: Dict[str, Any]) -> str:
    """Format the plain-text block shown in the summary panel of the chart."""
    lines = [
        "",
        "PERFORMANCE SUMMARY",
        '=' * 30,
        f"Total Queries: {stats['total_queries']}",
        "Base-RAG:",
        f"Avg Retrieval: {stats['base_avg_retrieval']:.3f}s",
        f"Avg Total: {stats['base_avg_total']:.3f}s",
        "Hier-RAG:",
        f"Avg Retrieval: {stats['hier_avg_retrieval']:.3f}s",
        f"Avg Total: {stats['hier_avg_total']:.3f}s",
        "Speedup:",
        f"Mean: {stats['avg_speedup']:.2f}x",
        f"Median: {stats['median_speedup']:.2f}x",
        f"Max: {stats['max_speedup']:.2f}x",
        f"Min: {stats['min_speedup']:.2f}x",
        f"Hier-RAG Wins: {stats['hier_wins']}/{stats['total_queries']}",
        f"({stats['win_rate']:.1f}%)",
        "",
    ]
    return "\n".join(lines)


def _render_charts(df: pd.DataFrame, stats: Dict[str, Any], chart_path: Path) -> None:
    """Draw the seven-panel comparison figure and save it to ``chart_path``."""
    fig = plt.figure(figsize=(16, 12))
    try:
        grid = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)
        systems = ['Base-RAG', 'Hier-RAG']
        speedup = df['speedup']

        # Panel 1: average total time per system (bar chart).
        ax_total = fig.add_subplot(grid[0, 0])
        total_bars = ax_total.bar(
            systems,
            [stats['base_avg_total'], stats['hier_avg_total']],
            color=['#3498db', '#e74c3c'],
            alpha=0.8,
        )
        ax_total.set_ylabel('Time (seconds)', fontsize=11)
        ax_total.set_title('Average Total Query Time', fontsize=12, fontweight='bold')
        ax_total.grid(axis='y', alpha=0.3)
        for bar in total_bars:
            height = bar.get_height()
            ax_total.text(bar.get_x() + bar.get_width() / 2., height,
                          f'{height:.2f}s',
                          ha='center', va='bottom', fontsize=10)

        # Panel 2: distribution of per-query speedup (histogram).
        ax_hist = fig.add_subplot(grid[0, 1])
        ax_hist.hist(speedup, bins=15, color='#2ecc71', edgecolor='black', alpha=0.7)
        ax_hist.axvline(1.0, color='red', linestyle='--', linewidth=2, label='No improvement')
        ax_hist.axvline(stats['avg_speedup'], color='blue', linestyle='--', linewidth=2,
                        label=f"Mean: {stats['avg_speedup']:.2f}x")
        ax_hist.set_xlabel('Speedup Factor', fontsize=11)
        ax_hist.set_ylabel('Frequency', fontsize=11)
        ax_hist.set_title('Speedup Distribution', fontsize=12, fontweight='bold')
        ax_hist.legend(fontsize=9)
        ax_hist.grid(alpha=0.3)

        # Panel 3: retrieval time of one system against the other (scatter).
        ax_scatter = fig.add_subplot(grid[0, 2])
        ax_scatter.scatter(df['base_retrieval_time'], df['hier_retrieval_time'],
                           s=100, alpha=0.6, color='#9b59b6', edgecolors='black')
        max_val = max(df['base_retrieval_time'].max(), df['hier_retrieval_time'].max())
        # Diagonal reference: points on it took the same time in both systems.
        ax_scatter.plot([0, max_val], [0, max_val], 'r--', linewidth=2, label='Equal performance')
        ax_scatter.set_xlabel('Base-RAG Retrieval Time (s)', fontsize=11)
        ax_scatter.set_ylabel('Hier-RAG Retrieval Time (s)', fontsize=11)
        ax_scatter.set_title('Retrieval Time Comparison', fontsize=12, fontweight='bold')
        ax_scatter.legend(fontsize=9)
        ax_scatter.grid(alpha=0.3)

        # Panel 4: speedup of every query (horizontal bars spanning the row).
        ax_query = fig.add_subplot(grid[1, :])
        query_labels = [f"Q{i+1}" for i in range(len(df))]
        bar_colors = ['#2ecc71' if value > 1.0 else '#e74c3c' for value in speedup]
        ax_query.barh(query_labels, speedup, color=bar_colors, alpha=0.7, edgecolor='black')
        ax_query.axvline(1.0, color='black', linestyle='--', linewidth=2, label='Break-even')
        ax_query.set_xlabel('Speedup Factor', fontsize=11)
        ax_query.set_ylabel('Query', fontsize=11)
        ax_query.set_title('Per-Query Speedup (Green = Hier-RAG Faster)',
                           fontsize=12, fontweight='bold')
        ax_query.legend(fontsize=9)
        ax_query.grid(axis='x', alpha=0.3)
        for row, value in enumerate(speedup):
            ax_query.text(value, row, f' {value:.2f}x', va='center', fontsize=9)

        # Panel 5: average time split into retrieval and the remainder
        # (total minus retrieval), stacked per system.
        ax_split = fig.add_subplot(grid[2, 0])
        base_gen = df['base_total_time'] - df['base_retrieval_time']
        hier_gen = df['hier_total_time'] - df['hier_retrieval_time']
        retrieval = [stats['base_avg_retrieval'], stats['hier_avg_retrieval']]
        generation = [base_gen.mean(), hier_gen.mean()]
        ax_split.bar(systems, retrieval, label='Retrieval', color='#3498db', alpha=0.8)
        ax_split.bar(systems, generation, bottom=retrieval, label='Generation',
                     color='#e67e22', alpha=0.8)
        ax_split.set_ylabel('Time (seconds)', fontsize=11)
        ax_split.set_title('Average Time Breakdown', fontsize=12, fontweight='bold')
        ax_split.legend(fontsize=9)
        ax_split.grid(axis='y', alpha=0.3)

        # Panel 6: number of queries per filter column holding a real value
        # (neither missing nor the literal string 'None').
        ax_filter = fig.add_subplot(grid[2, 1])
        filter_counts = {}
        for col in _FILTER_COLUMNS:
            if col in df.columns:
                applied = df[col].notna() & (df[col] != 'None')
                filter_counts[col.replace('filter_', '')] = int(applied.sum())
        if filter_counts:
            # Lists rather than dict views: plain sequences are what bar() documents.
            ax_filter.bar(list(filter_counts), list(filter_counts.values()),
                          color='#f39c12', alpha=0.8)
            ax_filter.set_ylabel('Number of Queries', fontsize=11)
            ax_filter.set_title('Filter Application Frequency', fontsize=12, fontweight='bold')
            ax_filter.tick_params(axis='x', rotation=45)
            ax_filter.grid(axis='y', alpha=0.3)

        # Panel 7: textual performance summary.
        ax_text = fig.add_subplot(grid[2, 2])
        ax_text.axis('off')
        ax_text.text(0.05, 0.95, _summary_text(stats), transform=ax_text.transAxes,
                     fontsize=10, verticalalignment='top',
                     fontfamily='monospace',
                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        fig.suptitle('RAG Performance Evaluation Report', fontsize=16, fontweight='bold', y=0.995)
        fig.savefig(chart_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, including when plotting or saving fails.
        plt.close(fig)


def _build_markdown_report(stats: Dict[str, Any], chart_name: str) -> str:
    """Build the Markdown summary; ``chart_name`` is the chart image file name."""
    total_improvement = _percent_reduction(stats['base_avg_total'], stats['hier_avg_total'])

    lines = [
        "# Evaluation Report",
        "",
        "## Summary Statistics",
        "",
        f"- **Total Queries Evaluated**: {stats['total_queries']}",
        f"- **Hier-RAG Win Rate**: {stats['win_rate']:.1f}% "
        f"({stats['hier_wins']}/{stats['total_queries']} queries)",
        "",
        "## Performance Metrics",
        "",
        "### Average Total Time",
        "",
        f"- **Base-RAG**: {stats['base_avg_total']:.3f}s",
        f"- **Hier-RAG**: {stats['hier_avg_total']:.3f}s",
        f"- **Improvement**: {total_improvement:.1f}%",
        "",
        "### Average Retrieval Time",
        "",
        f"- **Base-RAG**: {stats['base_avg_retrieval']:.3f}s",
        f"- **Hier-RAG**: {stats['hier_avg_retrieval']:.3f}s",
        f"- **Improvement**: {stats['retrieval_improvement']:.1f}%",
        "",
        "### Speedup Statistics",
        "",
        f"- **Mean Speedup**: {stats['avg_speedup']:.2f}x",
        f"- **Median Speedup**: {stats['median_speedup']:.2f}x",
        f"- **Maximum Speedup**: {stats['max_speedup']:.2f}x",
        f"- **Minimum Speedup**: {stats['min_speedup']:.2f}x",
        "",
        "## Key Findings",
        "",
    ]

    findings = []
    if stats['avg_speedup'] > 1.2:
        findings.append("[SUCCESS] **Hier-RAG shows significant performance improvement** "
                        "(>20% faster on average)")
    elif stats['avg_speedup'] > 1.0:
        findings.append("[SUCCESS] **Hier-RAG shows moderate performance improvement** "
                        "(>0% faster on average)")
    else:
        findings.append("[WARNING] **Hier-RAG shows no performance improvement** "
                        "- Consider improving filter inference")
    if stats['retrieval_improvement'] > 20:
        findings.append("[SUCCESS] **Retrieval time significantly reduced** "
                        "with hierarchical filtering")
    if stats['win_rate'] >= 60:
        findings.append(f"[SUCCESS] **High win rate** ({stats['win_rate']:.1f}%) "
                        "indicates effective hierarchical filtering")
    else:
        findings.append(f"[WARNING] **Low win rate** ({stats['win_rate']:.1f}%) "
                        "suggests filter inference needs improvement")
    for finding in findings:
        lines += [finding, ""]

    # Each recommendation is a (headline, sub-bullets) pair. Numbers are
    # assigned afterwards so the list always starts at 1, whichever of the
    # conditional items apply.
    recommendations = []
    if stats['win_rate'] < 50:
        recommendations.append((
            "**Improve Auto-Inference**: Current classification accuracy is low. Consider:",
            [
                "Using LLM-based classification instead of keyword matching",
                "Fine-tuning classification prompts",
                "Adding more domain-specific keywords",
            ],
        ))
    if stats['retrieval_improvement'] < 10:
        recommendations.append((
            "**Optimize Filtering Strategy**: Limited retrieval improvement suggests:",
            [
                "Filters may be too broad (not reducing search space enough)",
                "Consider adding more granular metadata levels",
                "Evaluate if hierarchy structure matches document distribution",
            ],
        ))
    recommendations.append((
        "**Continue Monitoring**: Run evaluation regularly on new documents "
        "to track performance trends",
        [],
    ))

    lines += ["## Recommendations", ""]
    for number, (headline, details) in enumerate(recommendations, start=1):
        lines.append(f"{number}. {headline}")
        # Three-space indent nests the bullets under the numbered item.
        lines += [f"   - {detail}" for detail in details]
    lines.append("")

    lines += [
        "## Visualization",
        "",
        f"![Performance Charts]({chart_name})",
        # The blank line keeps '---' a horizontal rule; directly under a text
        # line Markdown would treat it as a heading underline.
        "",
        "---",
        "",
        "*Report generated automatically by Hierarchical RAG Evaluation System*",
        "",
    ]
    return "\n".join(lines)