# core/eval_utils.py
"""Evaluation visualization and reporting utilities."""
import json
from pathlib import Path
from typing import Any, Dict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 10)
def generate_evaluation_report(csv_path: str) -> Dict[str, Any]:
"""
Generate comprehensive evaluation report with visualizations.
Args:
csv_path: Path to evaluation CSV file
Returns:
Dictionary with report statistics
"""
    # Load data
    df = pd.read_csv(csv_path)

    # Derive the output prefix from the CSV path (robust to paths that do not
    # end in '.csv', unlike a plain str.replace)
    output_path = str(Path(csv_path).with_suffix('')) + '_report'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # Create figure with subplots
    fig = plt.figure(figsize=(16, 12))
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    # Plot 1: Average Total Time Comparison (Bar Chart)
    ax1 = fig.add_subplot(gs[0, 0])
    times = df[['base_total_time', 'hier_total_time']].mean()
    bars = ax1.bar(['Base-RAG', 'Hier-RAG'], times, color=['#3498db', '#e74c3c'], alpha=0.8)
    ax1.set_ylabel('Time (seconds)', fontsize=11)
    ax1.set_title('Average Total Query Time', fontsize=12, fontweight='bold')
    ax1.grid(axis='y', alpha=0.3)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width() / 2., height,
                 f'{height:.2f}s',
                 ha='center', va='bottom', fontsize=10)

    # Plot 2: Speedup Distribution (Histogram)
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.hist(df['speedup'], bins=15, color='#2ecc71', edgecolor='black', alpha=0.7)
    ax2.axvline(1.0, color='red', linestyle='--', linewidth=2, label='No improvement')
    ax2.axvline(df['speedup'].mean(), color='blue', linestyle='--', linewidth=2,
                label=f'Mean: {df["speedup"].mean():.2f}x')
    ax2.set_xlabel('Speedup Factor', fontsize=11)
    ax2.set_ylabel('Frequency', fontsize=11)
    ax2.set_title('Speedup Distribution', fontsize=12, fontweight='bold')
    ax2.legend(fontsize=9)
    ax2.grid(alpha=0.3)

    # Plot 3: Retrieval Time Scatter
    ax3 = fig.add_subplot(gs[0, 2])
    ax3.scatter(df['base_retrieval_time'], df['hier_retrieval_time'],
                s=100, alpha=0.6, color='#9b59b6', edgecolors='black')
    max_val = max(df['base_retrieval_time'].max(), df['hier_retrieval_time'].max())
    ax3.plot([0, max_val], [0, max_val], 'r--', linewidth=2, label='Equal performance')
    ax3.set_xlabel('Base-RAG Retrieval Time (s)', fontsize=11)
    ax3.set_ylabel('Hier-RAG Retrieval Time (s)', fontsize=11)
    ax3.set_title('Retrieval Time Comparison', fontsize=12, fontweight='bold')
    ax3.legend(fontsize=9)
    ax3.grid(alpha=0.3)

    # Plot 4: Query-wise Speedup (Horizontal Bar)
    ax4 = fig.add_subplot(gs[1, :])
    queries = [f"Q{i + 1}" for i in range(len(df))]
    colors = ['#2ecc71' if x > 1.0 else '#e74c3c' for x in df['speedup']]
    ax4.barh(queries, df['speedup'], color=colors, alpha=0.7, edgecolor='black')
    ax4.axvline(1.0, color='black', linestyle='--', linewidth=2, label='Break-even')
    ax4.set_xlabel('Speedup Factor', fontsize=11)
    ax4.set_ylabel('Query', fontsize=11)
    ax4.set_title('Per-Query Speedup (Green = Hier-RAG Faster)', fontsize=12, fontweight='bold')
    ax4.legend(fontsize=9)
    ax4.grid(axis='x', alpha=0.3)

    # Add value labels
    for i, val in enumerate(df['speedup']):
        ax4.text(val, i, f' {val:.2f}x', va='center', fontsize=9)

    # Plot 5: Time Breakdown (Stacked Bar); generation time is total minus retrieval
    ax5 = fig.add_subplot(gs[2, 0])
    base_gen = df['base_total_time'] - df['base_retrieval_time']
    hier_gen = df['hier_total_time'] - df['hier_retrieval_time']
    x = ['Base-RAG', 'Hier-RAG']
    retrieval = [df['base_retrieval_time'].mean(), df['hier_retrieval_time'].mean()]
    generation = [base_gen.mean(), hier_gen.mean()]
    ax5.bar(x, retrieval, label='Retrieval', color='#3498db', alpha=0.8)
    ax5.bar(x, generation, bottom=retrieval, label='Generation', color='#e67e22', alpha=0.8)
    ax5.set_ylabel('Time (seconds)', fontsize=11)
    ax5.set_title('Average Time Breakdown', fontsize=12, fontweight='bold')
    ax5.legend(fontsize=9)
    ax5.grid(axis='y', alpha=0.3)

    # Plot 6: Filter Match Analysis (how often each hierarchy filter was applied)
    ax6 = fig.add_subplot(gs[2, 1])
    filter_columns = ['filter_level1', 'filter_level2', 'filter_level3', 'filter_doc_type']
    filter_counts = {}
    for col in filter_columns:
        if col in df.columns:
            non_none = df[col].notna() & (df[col] != 'None')
            filter_counts[col.replace('filter_', '')] = non_none.sum()
    if filter_counts:
        ax6.bar(filter_counts.keys(), filter_counts.values(), color='#f39c12', alpha=0.8)
    ax6.set_ylabel('Number of Queries', fontsize=11)
    ax6.set_title('Filter Application Frequency', fontsize=12, fontweight='bold')
    ax6.tick_params(axis='x', rotation=45)
    ax6.grid(axis='y', alpha=0.3)

    # Plot 7: Performance Summary (Text Box)
    ax7 = fig.add_subplot(gs[2, 2])
    ax7.axis('off')

    # Statistics text (kept flush-left so the monospace box renders cleanly)
    stats_text = f"""
PERFORMANCE SUMMARY
{'=' * 30}
Total Queries: {len(df)}

Base-RAG:
  Avg Retrieval: {df['base_retrieval_time'].mean():.3f}s
  Avg Total:     {df['base_total_time'].mean():.3f}s

Hier-RAG:
  Avg Retrieval: {df['hier_retrieval_time'].mean():.3f}s
  Avg Total:     {df['hier_total_time'].mean():.3f}s

Speedup:
  Mean:   {df['speedup'].mean():.2f}x
  Median: {df['speedup'].median():.2f}x
  Max:    {df['speedup'].max():.2f}x
  Min:    {df['speedup'].min():.2f}x

Hier-RAG Wins: {(df['speedup'] > 1.0).sum()}/{len(df)}
  ({(df['speedup'] > 1.0).sum() / len(df) * 100:.1f}%)
"""
    ax7.text(0.05, 0.95, stats_text, transform=ax7.transAxes,
             fontsize=10, verticalalignment='top',
             fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # Save figure
    plt.suptitle('RAG Performance Evaluation Report', fontsize=16, fontweight='bold', y=0.995)
    plt.savefig(f'{output_path}_charts.png', dpi=300, bbox_inches='tight')
    print(f"[SUCCESS] Visualization saved: {output_path}_charts.png")

    # Generate summary statistics (cast numpy scalars to plain Python types so
    # the dict is JSON-serializable)
    summary_stats = {
        'total_queries': len(df),
        'base_avg_total': float(df['base_total_time'].mean()),
        'hier_avg_total': float(df['hier_total_time'].mean()),
        'avg_speedup': float(df['speedup'].mean()),
        'median_speedup': float(df['speedup'].median()),
        'max_speedup': float(df['speedup'].max()),
        'min_speedup': float(df['speedup'].min()),
        'hier_wins': int((df['speedup'] > 1.0).sum()),
        'win_rate': float((df['speedup'] > 1.0).sum() / len(df) * 100),
        'base_avg_retrieval': float(df['base_retrieval_time'].mean()),
        'hier_avg_retrieval': float(df['hier_retrieval_time'].mean()),
        'retrieval_improvement': float(
            (df['base_retrieval_time'].mean() - df['hier_retrieval_time'].mean())
            / df['base_retrieval_time'].mean() * 100
        ),
    }
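
    # Persist the summary as a JSON sidecar next to the charts (a small
    # convenience addition; the `_summary.json` filename is an assumption,
    # not part of the original pipeline). summary_stats holds plain Python
    # types, so json.dump needs no custom encoder.
    with open(f'{output_path}_summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary_stats, f, indent=2)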

    # Generate markdown report
    markdown_report = f"""# Evaluation Report

## Summary Statistics

- **Total Queries Evaluated**: {summary_stats['total_queries']}
- **Hier-RAG Win Rate**: {summary_stats['win_rate']:.1f}% ({summary_stats['hier_wins']}/{summary_stats['total_queries']} queries)

## Performance Metrics

### Average Total Time

- **Base-RAG**: {summary_stats['base_avg_total']:.3f}s
- **Hier-RAG**: {summary_stats['hier_avg_total']:.3f}s
- **Improvement**: {((summary_stats['base_avg_total'] - summary_stats['hier_avg_total']) / summary_stats['base_avg_total'] * 100):.1f}%

### Average Retrieval Time

- **Base-RAG**: {summary_stats['base_avg_retrieval']:.3f}s
- **Hier-RAG**: {summary_stats['hier_avg_retrieval']:.3f}s
- **Improvement**: {summary_stats['retrieval_improvement']:.1f}%

### Speedup Statistics

- **Mean Speedup**: {summary_stats['avg_speedup']:.2f}x
- **Median Speedup**: {summary_stats['median_speedup']:.2f}x
- **Maximum Speedup**: {summary_stats['max_speedup']:.2f}x
- **Minimum Speedup**: {summary_stats['min_speedup']:.2f}x

## Key Findings

"""
    if summary_stats['avg_speedup'] > 1.2:
        markdown_report += "[SUCCESS] **Hier-RAG shows significant performance improvement** (>20% faster on average)\n\n"
    elif summary_stats['avg_speedup'] > 1.0:
        markdown_report += "[SUCCESS] **Hier-RAG shows moderate performance improvement** (faster on average, but by less than 20%)\n\n"
    else:
        markdown_report += "[WARNING] **Hier-RAG shows no performance improvement** - consider improving filter inference\n\n"

    if summary_stats['retrieval_improvement'] > 20:
        markdown_report += "[SUCCESS] **Retrieval time significantly reduced** with hierarchical filtering\n\n"

    if summary_stats['win_rate'] >= 60:
        markdown_report += f"[SUCCESS] **High win rate** ({summary_stats['win_rate']:.1f}%) indicates effective hierarchical filtering\n\n"
    else:
        markdown_report += f"[WARNING] **Low win rate** ({summary_stats['win_rate']:.1f}%) suggests filter inference needs improvement\n\n"
markdown_report += f"""
## Recommendations
"""
if summary_stats['win_rate'] < 50:
markdown_report += """1. **Improve Auto-Inference**: Current classification accuracy is low. Consider:
- Using LLM-based classification instead of keyword matching
- Fine-tuning classification prompts
- Adding more domain-specific keywords
"""
if summary_stats['retrieval_improvement'] < 10:
markdown_report += """2. **Optimize Filtering Strategy**: Limited retrieval improvement suggests:
- Filters may be too broad (not reducing search space enough)
- Consider adding more granular metadata levels
- Evaluate if hierarchy structure matches document distribution
"""
markdown_report += """3. **Continue Monitoring**: Run evaluation regularly on new documents to track performance trends
## Visualization
![Evaluation Charts]({}_charts.png)
---
*Report generated automatically by Hierarchical RAG Evaluation System*
""".format(output_path.split('/')[-1].replace('\\', '/'))

    # Save markdown report with UTF-8 encoding so non-ASCII characters survive
    # on any platform (the report itself uses plain [SUCCESS]/[WARNING] markers,
    # so no emoji fallback is needed)
    try:
        with open(f'{output_path}_summary.md', 'w', encoding='utf-8') as f:
            f.write(markdown_report)
        print(f"[SUCCESS] Summary saved: {output_path}_summary.md")
    except OSError as e:
        print(f"[WARNING] Could not save markdown summary: {e}")

    plt.close(fig)
    return summary_stats
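

# Minimal CLI entry point so the report can be generated standalone (a usage
# sketch; it assumes the CSV already contains the base_*/hier_* timing columns
# and a `speedup` column produced by the evaluation run).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate a RAG evaluation report from a results CSV.")
    parser.add_argument("csv_path", help="Path to the evaluation CSV file")
    args = parser.parse_args()

    stats = generate_evaluation_report(args.csv_path)
    print(f"Mean speedup: {stats['avg_speedup']:.2f}x "
          f"over {stats['total_queries']} queries")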