"""Comparison and reporting functionality for benchmarks.""" import numpy as np from typing import Dict, Any, List, Optional from scipy import stats import logging logger = logging.getLogger(__name__) class BenchmarkComparison: """ Compares benchmark results and generates reports. Computes improvement deltas and statistical significance. """ def __init__(self): """Initialize comparison tool.""" pass def compare_results( self, baseline: Dict[str, Any], trained: Dict[str, Any] ) -> Dict[str, Any]: """ Compare baseline and trained model results. Args: baseline: Baseline benchmark results trained: Trained model benchmark results Returns: Comparison dictionary with deltas and significance """ comparison = { 'baseline': baseline, 'trained': trained, 'deltas': {}, 'improvements': {}, 'statistical_significance': {} } # Compute deltas for all numeric metrics metric_keys = set(baseline.keys()) & set(trained.keys()) for key in metric_keys: if isinstance(baseline.get(key), (int, float)) and isinstance(trained.get(key), (int, float)): baseline_val = baseline[key] trained_val = trained[key] # Compute delta delta = trained_val - baseline_val comparison['deltas'][key] = delta # Determine if this is an improvement # For error rates, lower is better if 'error' in key.lower() or 'distortion' in key.lower(): is_improvement = delta < 0 improvement_pct = -100 * delta / baseline_val if baseline_val != 0 else 0 else: # For quality scores, higher is better is_improvement = delta > 0 improvement_pct = 100 * delta / baseline_val if baseline_val != 0 else 0 comparison['improvements'][key] = { 'improved': is_improvement, 'delta': delta, 'percent_change': improvement_pct } return comparison def compute_statistical_significance( self, baseline_samples: List[float], trained_samples: List[float], alpha: float = 0.05 ) -> Dict[str, Any]: """ Compute statistical significance of improvement. Uses paired t-test to determine if difference is significant. Args: baseline_samples: Baseline metric values trained_samples: Trained model metric values alpha: Significance level Returns: Dictionary with test results """ if len(baseline_samples) != len(trained_samples): raise ValueError("Sample lists must have same length") if len(baseline_samples) < 2: return { 'significant': False, 'p_value': 1.0, 'test': 'insufficient_data' } # Perform paired t-test t_statistic, p_value = stats.ttest_rel(baseline_samples, trained_samples) is_significant = p_value < alpha return { 'significant': bool(is_significant), 'p_value': float(p_value), 't_statistic': float(t_statistic), 'alpha': alpha, 'test': 'paired_t_test', 'n_samples': len(baseline_samples) } def rank_improvements( self, comparison: Dict[str, Any] ) -> List[Dict[str, Any]]: """ Rank metrics by improvement magnitude. Args: comparison: Comparison dictionary from compare_results Returns: List of improvements sorted by magnitude """ improvements = comparison.get('improvements', {}) ranked = [] for metric, info in improvements.items(): ranked.append({ 'metric': metric, 'improved': info['improved'], 'delta': info['delta'], 'percent_change': info['percent_change'] }) # Sort by absolute percent change ranked.sort(key=lambda x: abs(x['percent_change']), reverse=True) return ranked def generate_summary_report( self, comparison: Dict[str, Any], significance_results: Optional[Dict[str, Dict]] = None ) -> str: """ Generate human-readable summary report. 
Args: comparison: Comparison dictionary significance_results: Optional statistical significance results per metric Returns: Formatted report string """ lines = [] lines.append("=" * 60) lines.append("BENCHMARK COMPARISON REPORT") lines.append("=" * 60) lines.append("") # Model info baseline = comparison.get('baseline', {}) trained = comparison.get('trained', {}) lines.append(f"Baseline Model: {baseline.get('model_name', 'Unknown')}") lines.append(f"Trained Model: {trained.get('model_name', 'Unknown')}") lines.append(f"Baseline Timestamp: {baseline.get('timestamp', 'Unknown')}") lines.append(f"Trained Timestamp: {trained.get('timestamp', 'Unknown')}") lines.append("") # Improvements lines.append("IMPROVEMENTS:") lines.append("-" * 60) ranked = self.rank_improvements(comparison) for item in ranked: metric = item['metric'] delta = item['delta'] pct = item['percent_change'] improved = item['improved'] status = "✓ IMPROVED" if improved else "✗ REGRESSED" sig_marker = "" if significance_results and metric in significance_results: if significance_results[metric].get('significant'): sig_marker = " *" lines.append(f"{metric:40s} {status:12s} {delta:+10.4f} ({pct:+6.2f}%){sig_marker}") if significance_results: lines.append("") lines.append("* Statistically significant at α=0.05") lines.append("") lines.append("=" * 60) return "\n".join(lines)
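

# Example usage: a minimal sketch with hypothetical metric names and values,
# illustrating the intended workflow. Only keys present in both result
# dictionaries with numeric values are compared; 'model_name' and 'timestamp'
# feed the report header.
if __name__ == "__main__":
    comparator = BenchmarkComparison()

    # Hypothetical aggregate benchmark results for two model checkpoints.
    baseline_results = {
        'model_name': 'baseline-v1',
        'timestamp': '2024-01-01T00:00:00',
        'word_error_rate': 0.142,
        'quality_score': 3.1,
    }
    trained_results = {
        'model_name': 'trained-v2',
        'timestamp': '2024-02-01T00:00:00',
        'word_error_rate': 0.118,
        'quality_score': 3.4,
    }

    comparison = comparator.compare_results(baseline_results, trained_results)

    # Per-example metric values (paired by test case) for the paired t-test;
    # the sample values here are made up purely for illustration.
    significance = {
        'word_error_rate': comparator.compute_statistical_significance(
            baseline_samples=[0.15, 0.13, 0.14, 0.16, 0.13],
            trained_samples=[0.12, 0.11, 0.12, 0.13, 0.11],
        )
    }

    print(comparator.generate_summary_report(comparison, significance))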