Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Script to compare response times (inference times) from two benchmark JSON files. | |
| Generates a visualization comparing the models from both benchmarks. | |
| """ | |
| import json | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| from pathlib import Path | |
# File paths: both benchmark result files are resolved relative to the
# directory containing this script (one level up from it).
benchmark_path = Path(__file__).parent / "../benchmark_20260310_090052.json"
single_benchmark_path = Path(__file__).parent / "../single_benchmark_20260310_090011.json"

# Load benchmark data. The files are decoded explicitly as UTF-8 so that
# parsing does not depend on the platform's default locale encoding.
with open(benchmark_path, 'r', encoding='utf-8') as f:
    benchmark_data = json.load(f)
with open(single_benchmark_path, 'r', encoding='utf-8') as f:
    single_benchmark_data = json.load(f)
| # Extract model data | |
def extract_model_data(data_dict):
    """Collect the per-model statistics used by the plots and reports.

    Args:
        data_dict: Parsed benchmark JSON. Its optional ``'models'`` entry maps
            a model name to a dict of metrics.

    Returns:
        A dict mapping each model name to a dict with the keys ``mean``,
        ``std``, ``min``, ``max``, ``p50``, ``p95``, ``p99`` (read from the
        corresponding ``inference_time_*`` entries), ``accuracy`` and
        ``timing_samples``. Missing numeric entries default to ``0`` and
        missing ``timing_samples`` to an empty list.
    """
    timing_stats = ('mean', 'std', 'min', 'max', 'p50', 'p95', 'p99')
    extracted = {}
    for name, info in data_dict.get('models', {}).items():
        # Timing statistics are stored under an "inference_time_" prefix.
        entry = {stat: info.get(f'inference_time_{stat}', 0) for stat in timing_stats}
        entry['accuracy'] = info.get('accuracy', 0)
        entry['timing_samples'] = info.get('timing_samples', [])
        extracted[name] = entry
    return extracted
benchmark_models = extract_model_data(benchmark_data)
single_benchmark_models = extract_model_data(single_benchmark_data)

# Every plot and table below looks each model up in BOTH dictionaries, so only
# models present in both files can be compared. Using the intersection avoids
# a KeyError when one file contains a model the other lacks; when the two
# files list the same models the result is unchanged.
all_model_names = sorted(benchmark_models.keys() & single_benchmark_models.keys())
unmatched_model_names = sorted(benchmark_models.keys() ^ single_benchmark_models.keys())
if unmatched_model_names:
    print("Warning: skipping models not present in both benchmark files: "
          + ", ".join(unmatched_model_names))
# Create the combined figure; the panels are placed on a 2x3 subplot grid
fig = plt.figure(figsize=(16, 10))

# 1. Bar chart comparing mean inference times
ax1 = fig.add_subplot(2, 3, 1)
width = 0.35
x = np.arange(len(all_model_names))

# Convert to ms
benchmark_means, single_means = (
    [models[m]['mean'] * 1000 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)

# Two bars per model, placed half a bar width to either side of the tick
bars1, bars2 = (
    ax1.bar(x + shift, heights, width, label=legend, alpha=0.8)
    for shift, heights, legend in (
        (-width / 2, benchmark_means, 'Multi-benchmark (100 samples)'),
        (width / 2, single_means, 'Single-benchmark (10 samples)'),
    )
)

ax1.set(
    xlabel='Model',
    ylabel='Mean Inference Time (ms)',
    title='Comparison of Mean Inference Times',
)
ax1.set_xticks(x)
ax1.set_xticklabels(all_model_names, rotation=45, ha='right')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Add value labels on bars (3 points above the top of each bar)
for bar in (*bars1, *bars2):
    top = bar.get_height()
    center = bar.get_x() + bar.get_width() / 2
    ax1.annotate(f'{top:.3f}',
                 xy=(center, top),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontsize=8)
# 2. Box plot comparing timing distributions
ax2 = fig.add_subplot(2, 3, 2)

# Prepare data for box plot: two boxes per model (multi first, then single),
# both drawn in the model's colour from the default colour cycle.
all_data = []
labels = []
colors = []
for i, model_name in enumerate(all_model_names):
    for run_models, run_name in ((benchmark_models, 'Multi'),
                                 (single_benchmark_models, 'Single')):
        # Use first 10 samples for comparison, converted to ms
        samples = run_models[model_name]['timing_samples'][:10]
        all_data.append([s * 1000 for s in samples])
        labels.append(f'{model_name}\n{run_name}')
        colors.append(f'C{i}')

# Tick labels are set after plotting instead of through boxplot's `labels=`
# keyword, which newer Matplotlib releases deprecate (renamed `tick_labels`).
# `vert=True` is the default orientation, so it is not passed explicitly
# either. This form behaves the same on old and new Matplotlib versions.
bp = ax2.boxplot(all_data, patch_artist=True)
ax2.set_xticklabels(labels)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)

ax2.set_xlabel('Model (Benchmark Type)')
ax2.set_ylabel('Inference Time (ms)')
ax2.set_title('Distribution of Inference Times (Box Plot)')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(axis='y', alpha=0.3)
# 3. Comparison scatter plot with accuracy
ax3 = fig.add_subplot(2, 3, 3)

# Accuracy is scaled to percent and mean inference time to milliseconds
benchmark_accs, single_accs = (
    [models[m]['accuracy'] * 100 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)
benchmark_times, single_times = (
    [models[m]['mean'] * 1000 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)

# Create scatter plot: one colour per model; circles mark the multi-benchmark
# point and squares the single-benchmark point
per_model = zip(all_model_names, benchmark_times, benchmark_accs, single_times, single_accs)
for i, (model_name, multi_time, multi_acc, single_time, single_acc) in enumerate(per_model):
    shared_style = dict(s=100, alpha=0.8, color=f'C{i}')
    ax3.scatter([multi_time], [multi_acc], marker='o',
                label=f'{model_name} (Multi)', **shared_style)
    ax3.scatter([single_time], [single_acc], marker='s',
                label=f'{model_name} (Single)', **shared_style)

ax3.set(
    xlabel='Mean Inference Time (ms)',
    ylabel='Accuracy (%)',
    title='Accuracy vs Inference Time Comparison',
)
# The legend is anchored just outside the right edge of the axes
ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
ax3.grid(True, alpha=0.3)
# 4. Percentile comparison
ax4 = fig.add_subplot(2, 3, 4)
x = np.arange(len(all_model_names))
width = 0.25

# Percentile values in ms, one list per percentile and benchmark
benchmark_p50, benchmark_p95, benchmark_p99 = (
    [benchmark_models[m][key] * 1000 for m in all_model_names]
    for key in ('p50', 'p95', 'p99')
)
single_p50, single_p95, single_p99 = (
    [single_benchmark_models[m][key] * 1000 for m in all_model_names]
    for key in ('p50', 'p95', 'p99')
)

percentile_names = ('P50', 'P95', 'P99')
offsets = (-width, 0, width)

# Multi-benchmark bars: three full-width bars per model
bars_p50, bars_p95, bars_p99 = [
    ax4.bar(x + offset, values, width, label=f'{name} (Multi)', alpha=0.8)
    for name, offset, values in zip(percentile_names, offsets,
                                    (benchmark_p50, benchmark_p95, benchmark_p99))
]

# Single benchmark percentiles (offset): narrower, hatched bars shifted by 0.05
for name, offset, values in zip(percentile_names, offsets,
                                (single_p50, single_p95, single_p99)):
    ax4.bar(x + offset + 0.05, values, width*0.8,
            label=f'{name} (Single)', alpha=0.6, hatch='//')

ax4.set(
    xlabel='Model',
    ylabel='Inference Time (ms)',
    title='Percentile Comparison (P50, P95, P99)',
)
ax4.set_xticks(x)
ax4.set_xticklabels(all_model_names, rotation=45, ha='right')
ax4.legend(fontsize='small')
ax4.grid(axis='y', alpha=0.3)
# 5. Standard deviation comparison
ax5 = fig.add_subplot(2, 3, 5)

# Standard deviations converted to ms
benchmark_std, single_std = (
    [models[m]['std'] * 1000 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)
x = np.arange(len(all_model_names))
width = 0.35

# Two bars per model, placed half a bar width to either side of the tick
bars_std1, bars_std2 = (
    ax5.bar(x + shift, heights, width, label=legend, alpha=0.8)
    for shift, heights, legend in (
        (-width / 2, benchmark_std, 'Multi-benchmark'),
        (width / 2, single_std, 'Single-benchmark'),
    )
)

ax5.set(
    xlabel='Model',
    ylabel='Standard Deviation (ms)',
    title='Standard Deviation of Inference Times',
)
ax5.set_xticks(x)
ax5.set_xticklabels(all_model_names, rotation=45, ha='right')
ax5.legend()
ax5.grid(axis='y', alpha=0.3)

# Add value labels (4 decimals, 3 points above the top of each bar)
for bar in (*bars_std1, *bars_std2):
    top = bar.get_height()
    center = bar.get_x() + bar.get_width() / 2
    ax5.annotate(f'{top:.4f}',
                 xy=(center, top),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontsize=7)
# 6. Summary statistics table
ax6 = fig.add_subplot(2, 3, 6)
ax6.axis('off')

# Create table data: one row per model holding the model name, four
# multi-benchmark cells, then the same four cells for the single benchmark.
table_data = []
for model_name in all_model_names:
    multi = benchmark_models[model_name]
    single = single_benchmark_models[model_name]
    table_data.append([
        model_name,
        f"{multi['mean']*1000:.3f} ± {multi['std']*1000:.3f}",
        f"{multi['min']*1000:.3f}",
        f"{multi['max']*1000:.3f}",
        f"{multi['accuracy']*100:.1f}%",
        f"{single['mean']*1000:.3f} ± {single['std']*1000:.3f}",
        f"{single['min']*1000:.3f}",
        f"{single['max']*1000:.3f}",
        f"{single['accuracy']*100:.1f}%",
    ])

columns = ['Model', 'Mean ± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)',
           'Mean ± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']

# Create table
table = ax6.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1.1, 1.8)

# Style the table body; row 0 is the header row and is left untouched
for row_idx in range(1, len(all_model_names) + 1):
    for col_idx in range(len(columns)):
        cell = table[(row_idx, col_idx)]
        cell.set_height(0.4)
        if col_idx < 5:
            cell.set_facecolor('#f0f0f0')  # Light gray for model name + multi-benchmark columns
        else:
            cell.set_facecolor('#e0e0f0')  # Light blue for single-benchmark columns

ax6.set_title('Summary Statistics Comparison', fontsize=12, pad=20)

output_dir = Path(__file__).parent

# Save the combined six-panel figure. It was previously built but never written
# to disk, although the HTML report generated by this script embeds
# "response_time_comparison.png" from this same directory. Closing the figure
# afterwards releases it before the individual figures are created.
fig.tight_layout()
fig.savefig(output_dir / "response_time_comparison.png", dpi=300, bbox_inches='tight')
plt.close(fig)
print("Saved: response_time_comparison.png")

# Save each subplot as a separate PNG image
# 1. Bar chart comparing mean inference times (standalone figure)
fig1, ax1_single = plt.subplots(figsize=(10, 6))
width = 0.35
x = np.arange(len(all_model_names))

# Mean inference times converted to ms
benchmark_means, single_means = (
    [models[m]['mean'] * 1000 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)

# Two bars per model, placed half a bar width to either side of the tick
bars1, bars2 = (
    ax1_single.bar(x + shift, heights, width, label=legend, alpha=0.8)
    for shift, heights, legend in (
        (-width / 2, benchmark_means, 'Multi-benchmark (100 samples)'),
        (width / 2, single_means, 'Single-benchmark (10 samples)'),
    )
)

ax1_single.set(
    xlabel='Model',
    ylabel='Mean Inference Time (ms)',
    title='Comparison of Mean Inference Times',
)
ax1_single.set_xticks(x)
ax1_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax1_single.legend()
ax1_single.grid(axis='y', alpha=0.3)

# Value labels, 3 points above the top of each bar
for bar in (*bars1, *bars2):
    top = bar.get_height()
    center = bar.get_x() + bar.get_width() / 2
    ax1_single.annotate(f'{top:.3f}', xy=(center, top), xytext=(0, 3),
                        textcoords="offset points", ha='center', va='bottom', fontsize=8)

# fig1 is the current pyplot figure here, so it is laid out and saved directly
fig1.tight_layout()
fig1.savefig(output_dir / "mean_inference_times.png", dpi=300, bbox_inches='tight')
plt.close(fig1)
print("Saved: mean_inference_times.png")
# 2. Box plot comparing timing distributions (standalone figure)
fig2, ax2_single = plt.subplots(figsize=(12, 6))

# Two boxes per model (multi first, then single), both drawn in the model's
# colour from the default colour cycle.
all_data = []
labels = []
colors = []
for i, model_name in enumerate(all_model_names):
    for run_models, run_name in ((benchmark_models, 'Multi'),
                                 (single_benchmark_models, 'Single')):
        # First 10 timing samples of each run, converted to ms
        samples = run_models[model_name]['timing_samples'][:10]
        all_data.append([s * 1000 for s in samples])
        labels.append(f'{model_name}\n{run_name}')
        colors.append(f'C{i}')

# Tick labels are set after plotting instead of through boxplot's `labels=`
# keyword, which newer Matplotlib releases deprecate (renamed `tick_labels`).
# `vert=True` is the default orientation, so it is not passed explicitly
# either. This form behaves the same on old and new Matplotlib versions.
bp = ax2_single.boxplot(all_data, patch_artist=True)
ax2_single.set_xticklabels(labels)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)

ax2_single.set_xlabel('Model (Benchmark Type)')
ax2_single.set_ylabel('Inference Time (ms)')
ax2_single.set_title('Distribution of Inference Times (Box Plot)')
ax2_single.tick_params(axis='x', rotation=45)
ax2_single.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(output_dir / "inference_time_distribution.png", dpi=300, bbox_inches='tight')
plt.close(fig2)
print("Saved: inference_time_distribution.png")
# 3. Comparison scatter plot with accuracy (standalone figure)
fig3, ax3_single = plt.subplots(figsize=(10, 6))

# Accuracy is scaled to percent and mean inference time to milliseconds
benchmark_accs, single_accs = (
    [models[m]['accuracy'] * 100 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)
benchmark_times, single_times = (
    [models[m]['mean'] * 1000 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)

# One colour per model; circles mark the multi-benchmark point and squares the
# single-benchmark point
per_model = zip(all_model_names, benchmark_times, benchmark_accs, single_times, single_accs)
for i, (model_name, multi_time, multi_acc, single_time, single_acc) in enumerate(per_model):
    shared_style = dict(s=100, alpha=0.8, color=f'C{i}')
    ax3_single.scatter([multi_time], [multi_acc], marker='o',
                       label=f'{model_name} (Multi)', **shared_style)
    ax3_single.scatter([single_time], [single_acc], marker='s',
                       label=f'{model_name} (Single)', **shared_style)

ax3_single.set(
    xlabel='Mean Inference Time (ms)',
    ylabel='Accuracy (%)',
    title='Accuracy vs Inference Time Comparison',
)
# The legend is anchored just outside the right edge of the axes
ax3_single.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
ax3_single.grid(True, alpha=0.3)

# fig3 is the current pyplot figure here, so it is laid out and saved directly
fig3.tight_layout()
fig3.savefig(output_dir / "accuracy_vs_inference_time.png", dpi=300, bbox_inches='tight')
plt.close(fig3)
print("Saved: accuracy_vs_inference_time.png")
# 4. Percentile comparison (standalone figure)
fig4, ax4_single = plt.subplots(figsize=(12, 6))
x = np.arange(len(all_model_names))
width = 0.25

# Percentile values in ms, one list per percentile and benchmark
benchmark_p50, benchmark_p95, benchmark_p99 = (
    [benchmark_models[m][key] * 1000 for m in all_model_names]
    for key in ('p50', 'p95', 'p99')
)
single_p50, single_p95, single_p99 = (
    [single_benchmark_models[m][key] * 1000 for m in all_model_names]
    for key in ('p50', 'p95', 'p99')
)

percentile_names = ('P50', 'P95', 'P99')
offsets = (-width, 0, width)

# Multi-benchmark bars: three full-width bars per model
bars_p50, bars_p95, bars_p99 = [
    ax4_single.bar(x + offset, values, width, label=f'{name} (Multi)', alpha=0.8)
    for name, offset, values in zip(percentile_names, offsets,
                                    (benchmark_p50, benchmark_p95, benchmark_p99))
]

# Single-benchmark bars: narrower, hatched, and shifted by 0.05
for name, offset, values in zip(percentile_names, offsets,
                                (single_p50, single_p95, single_p99)):
    ax4_single.bar(x + offset + 0.05, values, width*0.8,
                   label=f'{name} (Single)', alpha=0.6, hatch='//')

ax4_single.set(
    xlabel='Model',
    ylabel='Inference Time (ms)',
    title='Percentile Comparison (P50, P95, P99)',
)
ax4_single.set_xticks(x)
ax4_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax4_single.legend(fontsize='small')
ax4_single.grid(axis='y', alpha=0.3)

# fig4 is the current pyplot figure here, so it is laid out and saved directly
fig4.tight_layout()
fig4.savefig(output_dir / "percentile_comparison.png", dpi=300, bbox_inches='tight')
plt.close(fig4)
print("Saved: percentile_comparison.png")
# 5. Standard deviation comparison (standalone figure)
fig5, ax5_single = plt.subplots(figsize=(10, 6))

# Standard deviations converted to ms
benchmark_std, single_std = (
    [models[m]['std'] * 1000 for m in all_model_names]
    for models in (benchmark_models, single_benchmark_models)
)
x = np.arange(len(all_model_names))
width = 0.35

# Two bars per model, placed half a bar width to either side of the tick
bars_std1, bars_std2 = (
    ax5_single.bar(x + shift, heights, width, label=legend, alpha=0.8)
    for shift, heights, legend in (
        (-width / 2, benchmark_std, 'Multi-benchmark'),
        (width / 2, single_std, 'Single-benchmark'),
    )
)

ax5_single.set(
    xlabel='Model',
    ylabel='Standard Deviation (ms)',
    title='Standard Deviation of Inference Times',
)
ax5_single.set_xticks(x)
ax5_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax5_single.legend()
ax5_single.grid(axis='y', alpha=0.3)

# Value labels (4 decimals), 3 points above the top of each bar
for bar in (*bars_std1, *bars_std2):
    top = bar.get_height()
    center = bar.get_x() + bar.get_width() / 2
    ax5_single.annotate(f'{top:.4f}', xy=(center, top), xytext=(0, 3),
                        textcoords="offset points", ha='center', va='bottom', fontsize=7)

# fig5 is the current pyplot figure here, so it is laid out and saved directly
fig5.tight_layout()
fig5.savefig(output_dir / "standard_deviation_comparison.png", dpi=300, bbox_inches='tight')
plt.close(fig5)
print("Saved: standard_deviation_comparison.png")
# 6. Summary statistics table (standalone figure)
fig6, ax6_single = plt.subplots(figsize=(14, 6))
ax6_single.axis('off')

# Each row is the model name followed by the same four statistics, first for
# the multi-benchmark and then for the single benchmark.
table_data = []
for model_name in all_model_names:
    row = [model_name]
    for stats in (benchmark_models[model_name], single_benchmark_models[model_name]):
        row += [
            f"{stats['mean']*1000:.3f} ± {stats['std']*1000:.3f}",
            f"{stats['min']*1000:.3f}",
            f"{stats['max']*1000:.3f}",
            f"{stats['accuracy']*100:.1f}%",
        ]
    table_data.append(row)

stat_headers = ['Mean ± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']
columns = ['Model'] + stat_headers * 2

table = ax6_single.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1.1, 1.8)

# Style the body rows only; row 0 holds the column headers. The first five
# columns get one shade, the remaining (single-benchmark) columns another.
for row_idx in range(1, len(all_model_names) + 1):
    for col_idx in range(len(columns)):
        cell = table[row_idx, col_idx]
        cell.set_height(0.4)
        cell.set_facecolor('#f0f0f0' if col_idx < 5 else '#e0e0f0')

ax6_single.set_title('Summary Statistics Comparison', fontsize=12, pad=20)

# fig6 is the current pyplot figure here, so it is laid out and saved directly
fig6.tight_layout()
fig6.savefig(output_dir / "summary_statistics.png", dpi=300, bbox_inches='tight')
plt.close(fig6)
print("Saved: summary_statistics.png")
print(f"\nAll individual visualizations saved to: {output_dir}")
# Also save the comparison as an HTML report next to this script
html_output = Path(__file__).parent / "response_time_comparison.html"


def _format_change(new_value, base_value):
    """Return the relative change of new_value versus base_value as 'x.x%'.

    Returns 'N/A' when base_value is zero (or otherwise falsy), because the
    relative change is undefined there. Statistics missing from the benchmark
    JSON default to 0, so a zero baseline would otherwise abort the report
    with a ZeroDivisionError.
    """
    if not base_value:
        return "N/A"
    return f"{(new_value - base_value) / base_value * 100:.1f}%"


# NOTE(review): the <img> tag below expects "response_time_comparison.png" to
# exist in the same directory as the HTML file - confirm that it is produced.
# The file is written as UTF-8 and declares that charset, so the output does
# not depend on the platform's default encoding.
with open(html_output, 'w', encoding='utf-8') as f:
    f.write(f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Benchmark Response Time Comparison</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1 {{ text-align: center; }}
.chart {{ max-width: 1200px; margin: 0 auto; }}
.model-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
.model-title {{ font-weight: bold; font-size: 1.2em; margin-bottom: 10px; }}
table {{ width: 100%; border-collapse: collapse; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background-color: #f4f4f4; }}
</style>
</head>
<body>
<h1>Benchmark Response Time Comparison</h1>
<p><strong>Multi-benchmark:</strong> {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats</p>
<p><strong>Single-benchmark:</strong> {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats</p>
<p><img src="response_time_comparison.png" alt="Comparison Chart" class="chart"></p>
<h2>Detailed Statistics</h2>
""")
    # One section per model; times are shown in ms, accuracy in percent, and
    # the accuracy change in percentage points (pp).
    for model_name in all_model_names:
        multi = benchmark_models[model_name]
        single = single_benchmark_models[model_name]
        f.write(f"""
<div class="model-section">
<div class="model-title">{model_name}</div>
<table>
<tr>
<th>Metric</th>
<th>Multi-benchmark</th>
<th>Single-benchmark</th>
<th>Change</th>
</tr>
<tr>
<td>Mean (ms)</td>
<td>{multi['mean']*1000:.4f}</td>
<td>{single['mean']*1000:.4f}</td>
<td>{_format_change(single['mean'], multi['mean'])}</td>
</tr>
<tr>
<td>Std (ms)</td>
<td>{multi['std']*1000:.4f}</td>
<td>{single['std']*1000:.4f}</td>
<td>{_format_change(single['std'], multi['std'])}</td>
</tr>
<tr>
<td>Min (ms)</td>
<td>{multi['min']*1000:.4f}</td>
<td>{single['min']*1000:.4f}</td>
<td>{_format_change(single['min'], multi['min'])}</td>
</tr>
<tr>
<td>Max (ms)</td>
<td>{multi['max']*1000:.4f}</td>
<td>{single['max']*1000:.4f}</td>
<td>{_format_change(single['max'], multi['max'])}</td>
</tr>
<tr>
<td>Accuracy</td>
<td>{multi['accuracy']*100:.1f}%</td>
<td>{single['accuracy']*100:.1f}%</td>
<td>{(single['accuracy'] - multi['accuracy']) * 100:.1f}pp</td>
</tr>
</table>
</div>
""")
    f.write("""
</body>
</html>""")
print(f"HTML report saved to: {html_output}")
# Print summary to console
print("\n=== Summary ===")
print(f"Multi-benchmark: {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats")
print(f"Single-benchmark: {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats")
print("\nModel Comparison:")
print("-" * 80)
for model_name in all_model_names:
    # Mean inference times in ms
    b_mean = benchmark_models[model_name]['mean'] * 1000
    s_mean = single_benchmark_models[model_name]['mean'] * 1000
    # The relative change is undefined for a zero baseline (a missing mean
    # defaults to 0), so report "N/A" instead of raising ZeroDivisionError.
    if b_mean:
        change_text = f"{(s_mean - b_mean) / b_mean * 100:+6.1f}%"
    else:
        change_text = f"{'N/A':>7s}"
    print(f"{model_name:20s} | Multi: {b_mean:6.3f}ms | Single: {s_mean:6.3f}ms | Change: {change_text}")