#!/usr/bin/env python3
"""
Script to compare response times (inference times) from two benchmark JSON files.
Generates a visualization comparing the models from both benchmarks.
"""

import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# File paths
benchmark_path = Path(__file__).parent / "../benchmark_20260310_090052.json"
single_benchmark_path = Path(__file__).parent / "../single_benchmark_20260310_090011.json"

# Load benchmark data
with open(benchmark_path, 'r') as f:
    benchmark_data = json.load(f)

with open(single_benchmark_path, 'r') as f:
    single_benchmark_data = json.load(f)

# Extract model data
def extract_model_data(data_dict):
    models = {}
    for model_name, model_info in data_dict.get('models', {}).items():
        models[model_name] = {
            'mean': model_info.get('inference_time_mean', 0),
            'std': model_info.get('inference_time_std', 0),
            'min': model_info.get('inference_time_min', 0),
            'max': model_info.get('inference_time_max', 0),
            'p50': model_info.get('inference_time_p50', 0),
            'p95': model_info.get('inference_time_p95', 0),
            'p99': model_info.get('inference_time_p99', 0),
            'accuracy': model_info.get('accuracy', 0),
            'timing_samples': model_info.get('timing_samples', [])
        }
    return models

benchmark_models = extract_model_data(benchmark_data)
single_benchmark_models = extract_model_data(single_benchmark_data)

# Get all model names (should be the same in both)
all_model_names = sorted(benchmark_models.keys())

# Create figure with subplots
fig = plt.figure(figsize=(16, 10))

# 1. Bar chart comparing mean inference times
ax1 = fig.add_subplot(2, 3, 1)
x = np.arange(len(all_model_names))
width = 0.35

benchmark_means = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]  # Convert to ms
single_means = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]  # Convert to ms

bars1 = ax1.bar(x - width/2, benchmark_means, width, label='Multi-benchmark (100 samples)', alpha=0.8)
bars2 = ax1.bar(x + width/2, single_means, width, label='Single-benchmark (10 samples)', alpha=0.8)

ax1.set_xlabel('Model')
ax1.set_ylabel('Mean Inference Time (ms)')
ax1.set_title('Comparison of Mean Inference Times')
ax1.set_xticks(x)
ax1.set_xticklabels(all_model_names, rotation=45, ha='right')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in bars1:
    height = bar.get_height()
    ax1.annotate(f'{height:.3f}',
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontsize=8)

for bar in bars2:
    height = bar.get_height()
    ax1.annotate(f'{height:.3f}',
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontsize=8)

# 2. Box plot comparing timing distributions
ax2 = fig.add_subplot(2, 3, 2)

# Prepare data for box plot
all_data = []
labels = []
colors = []

for i, model_name in enumerate(all_model_names):
    benchmark_samples = benchmark_models[model_name]['timing_samples'][:10]  # Use first 10 for comparison
    single_samples = single_benchmark_models[model_name]['timing_samples'][:10]  # Use first 10 for comparison

    # Convert to ms
    benchmark_ms = [s * 1000 for s in benchmark_samples]
    single_ms = [s * 1000 for s in single_samples]

    all_data.append(benchmark_ms)
    all_data.append(single_ms)
    labels.append(f'{model_name}\nMulti')
    labels.append(f'{model_name}\nSingle')
    colors.extend([f'C{i}', f'C{i}'])

bp = ax2.boxplot(all_data, labels=labels, patch_artist=True, vert=True)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)

ax2.set_xlabel('Model (Benchmark Type)')
ax2.set_ylabel('Inference Time (ms)')
ax2.set_title('Distribution of Inference Times (Box Plot)')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(axis='y', alpha=0.3)

# 3. Comparison scatter plot with accuracy
ax3 = fig.add_subplot(2, 3, 3)

benchmark_accs = [benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
single_accs = [single_benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
benchmark_times = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
single_times = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]

# Create scatter plot
for i, model_name in enumerate(all_model_names):
    ax3.scatter([benchmark_times[i]], [benchmark_accs[i]], marker='o', s=100,
                label=f'{model_name} (Multi)', alpha=0.8, color=f'C{i}')
    ax3.scatter([single_times[i]], [single_accs[i]], marker='s', s=100,
                label=f'{model_name} (Single)', alpha=0.8, color=f'C{i}')

ax3.set_xlabel('Mean Inference Time (ms)')
ax3.set_ylabel('Accuracy (%)')
ax3.set_title('Accuracy vs Inference Time Comparison')
ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
ax3.grid(True, alpha=0.3)

# 4. Percentile comparison
ax4 = fig.add_subplot(2, 3, 4)

x = np.arange(len(all_model_names))
width = 0.25

benchmark_p50 = [benchmark_models[m]['p50'] * 1000 for m in all_model_names]
benchmark_p95 = [benchmark_models[m]['p95'] * 1000 for m in all_model_names]
benchmark_p99 = [benchmark_models[m]['p99'] * 1000 for m in all_model_names]

single_p50 = [single_benchmark_models[m]['p50'] * 1000 for m in all_model_names]
single_p95 = [single_benchmark_models[m]['p95'] * 1000 for m in all_model_names]
single_p99 = [single_benchmark_models[m]['p99'] * 1000 for m in all_model_names]

bars_p50 = ax4.bar(x - width, benchmark_p50, width, label='P50 (Multi)', alpha=0.8)
bars_p95 = ax4.bar(x, benchmark_p95, width, label='P95 (Multi)', alpha=0.8)
bars_p99 = ax4.bar(x + width, benchmark_p99, width, label='P99 (Multi)', alpha=0.8)

# Single benchmark percentiles (offset)
ax4.bar(x - width + 0.05, single_p50, width*0.8, label='P50 (Single)', alpha=0.6, hatch='//')
ax4.bar(x + 0.05, single_p95, width*0.8, label='P95 (Single)', alpha=0.6, hatch='//')
ax4.bar(x + width + 0.05, single_p99, width*0.8, label='P99 (Single)', alpha=0.6, hatch='//')

ax4.set_xlabel('Model')
ax4.set_ylabel('Inference Time (ms)')
ax4.set_title('Percentile Comparison (P50, P95, P99)')
ax4.set_xticks(x)
ax4.set_xticklabels(all_model_names, rotation=45, ha='right')
ax4.legend(fontsize='small')
ax4.grid(axis='y', alpha=0.3)

# 5. Standard deviation comparison
ax5 = fig.add_subplot(2, 3, 5)

benchmark_std = [benchmark_models[m]['std'] * 1000 for m in all_model_names]
single_std = [single_benchmark_models[m]['std'] * 1000 for m in all_model_names]

x = np.arange(len(all_model_names))
width = 0.35

bars_std1 = ax5.bar(x - width/2, benchmark_std, width, label='Multi-benchmark', alpha=0.8)
bars_std2 = ax5.bar(x + width/2, single_std, width, label='Single-benchmark', alpha=0.8)

ax5.set_xlabel('Model')
ax5.set_ylabel('Standard Deviation (ms)')
ax5.set_title('Standard Deviation of Inference Times')
ax5.set_xticks(x)
ax5.set_xticklabels(all_model_names, rotation=45, ha='right')
ax5.legend()
ax5.grid(axis='y', alpha=0.3)

# Add value labels
for bar in bars_std1:
    height = bar.get_height()
    ax5.annotate(f'{height:.4f}',
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontsize=7)

for bar in bars_std2:
    height = bar.get_height()
    ax5.annotate(f'{height:.4f}',
                 xy=(bar.get_x() + bar.get_width() / 2, height),
                 xytext=(0, 3),
                 textcoords="offset points",
                 ha='center', va='bottom', fontsize=7)

# 6. Summary statistics table
ax6 = fig.add_subplot(2, 3, 6)
ax6.axis('off')

# Create table data
table_data = []
for model_name in all_model_names:
    row = [
        model_name,
        f"{benchmark_models[model_name]['mean']*1000:.3f} ± {benchmark_models[model_name]['std']*1000:.3f}",
        f"{benchmark_models[model_name]['min']*1000:.3f}",
        f"{benchmark_models[model_name]['max']*1000:.3f}",
        f"{benchmark_models[model_name]['accuracy']*100:.1f}%",
        f"{single_benchmark_models[model_name]['mean']*1000:.3f} ± {single_benchmark_models[model_name]['std']*1000:.3f}",
        f"{single_benchmark_models[model_name]['min']*1000:.3f}",
        f"{single_benchmark_models[model_name]['max']*1000:.3f}",
        f"{single_benchmark_models[model_name]['accuracy']*100:.1f}%"
    ]
    table_data.append(row)

columns = ['Model', 'Mean ± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)',
           'Mean ± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']
row_labels = ['Multi', 'Single'] * len(all_model_names)

# Create table
table = ax6.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1.1, 1.8)

# Style the table
for i in range(len(all_model_names)):
    for j in range(len(columns)):
        cell = table[(i+1, j)]
        cell.set_height(0.4)
        if j < 5:
            cell.set_facecolor('#f0f0f0')  # Light gray for multi-benchmark columns
        else:
            cell.set_facecolor('#e0e0f0')  # Light blue for single-benchmark columns

ax6.set_title('Summary Statistics Comparison', fontsize=12, pad=20)

# Save each subplot as a separate PNG image
output_dir = Path(__file__).parent

# 1. Bar chart comparing mean inference times
fig1, ax1_single = plt.subplots(figsize=(10, 6))
x = np.arange(len(all_model_names))
width = 0.35
benchmark_means = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
single_means = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]
bars1 = ax1_single.bar(x - width/2, benchmark_means, width, label='Multi-benchmark (100 samples)', alpha=0.8)
bars2 = ax1_single.bar(x + width/2, single_means, width, label='Single-benchmark (10 samples)', alpha=0.8)
ax1_single.set_xlabel('Model')
ax1_single.set_ylabel('Mean Inference Time (ms)')
ax1_single.set_title('Comparison of Mean Inference Times')
ax1_single.set_xticks(x)
ax1_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax1_single.legend()
ax1_single.grid(axis='y', alpha=0.3)
for bar in bars1:
    height = bar.get_height()
    ax1_single.annotate(f'{height:.3f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)
for bar in bars2:
    height = bar.get_height()
    ax1_single.annotate(f'{height:.3f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)
plt.tight_layout()
plt.savefig(output_dir / "mean_inference_times.png", dpi=300, bbox_inches='tight')
plt.close(fig1)
print(f"Saved: mean_inference_times.png")

# 2. Box plot comparing timing distributions
fig2, ax2_single = plt.subplots(figsize=(12, 6))
all_data = []
labels = []
colors = []
for i, model_name in enumerate(all_model_names):
    benchmark_samples = benchmark_models[model_name]['timing_samples'][:10]
    single_samples = single_benchmark_models[model_name]['timing_samples'][:10]
    benchmark_ms = [s * 1000 for s in benchmark_samples]
    single_ms = [s * 1000 for s in single_samples]
    all_data.append(benchmark_ms)
    all_data.append(single_ms)
    labels.append(f'{model_name}\nMulti')
    labels.append(f'{model_name}\nSingle')
    colors.extend([f'C{i}', f'C{i}'])
bp = ax2_single.boxplot(all_data, labels=labels, patch_artist=True, vert=True)
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)
ax2_single.set_xlabel('Model (Benchmark Type)')
ax2_single.set_ylabel('Inference Time (ms)')
ax2_single.set_title('Distribution of Inference Times (Box Plot)')
ax2_single.tick_params(axis='x', rotation=45)
ax2_single.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(output_dir / "inference_time_distribution.png", dpi=300, bbox_inches='tight')
plt.close(fig2)
print(f"Saved: inference_time_distribution.png")

# 3. Comparison scatter plot with accuracy
fig3, ax3_single = plt.subplots(figsize=(10, 6))
benchmark_accs = [benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
single_accs = [single_benchmark_models[m]['accuracy'] * 100 for m in all_model_names]
benchmark_times = [benchmark_models[m]['mean'] * 1000 for m in all_model_names]
single_times = [single_benchmark_models[m]['mean'] * 1000 for m in all_model_names]
for i, model_name in enumerate(all_model_names):
    ax3_single.scatter([benchmark_times[i]], [benchmark_accs[i]], marker='o', s=100, label=f'{model_name} (Multi)', alpha=0.8, color=f'C{i}')
    ax3_single.scatter([single_times[i]], [single_accs[i]], marker='s', s=100, label=f'{model_name} (Single)', alpha=0.8, color=f'C{i}')
ax3_single.set_xlabel('Mean Inference Time (ms)')
ax3_single.set_ylabel('Accuracy (%)')
ax3_single.set_title('Accuracy vs Inference Time Comparison')
ax3_single.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
ax3_single.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_dir / "accuracy_vs_inference_time.png", dpi=300, bbox_inches='tight')
plt.close(fig3)
print(f"Saved: accuracy_vs_inference_time.png")

# 4. Percentile comparison
fig4, ax4_single = plt.subplots(figsize=(12, 6))
x = np.arange(len(all_model_names))
width = 0.25
benchmark_p50 = [benchmark_models[m]['p50'] * 1000 for m in all_model_names]
benchmark_p95 = [benchmark_models[m]['p95'] * 1000 for m in all_model_names]
benchmark_p99 = [benchmark_models[m]['p99'] * 1000 for m in all_model_names]
single_p50 = [single_benchmark_models[m]['p50'] * 1000 for m in all_model_names]
single_p95 = [single_benchmark_models[m]['p95'] * 1000 for m in all_model_names]
single_p99 = [single_benchmark_models[m]['p99'] * 1000 for m in all_model_names]
bars_p50 = ax4_single.bar(x - width, benchmark_p50, width, label='P50 (Multi)', alpha=0.8)
bars_p95 = ax4_single.bar(x, benchmark_p95, width, label='P95 (Multi)', alpha=0.8)
bars_p99 = ax4_single.bar(x + width, benchmark_p99, width, label='P99 (Multi)', alpha=0.8)
ax4_single.bar(x - width + 0.05, single_p50, width*0.8, label='P50 (Single)', alpha=0.6, hatch='//')
ax4_single.bar(x + 0.05, single_p95, width*0.8, label='P95 (Single)', alpha=0.6, hatch='//')
ax4_single.bar(x + width + 0.05, single_p99, width*0.8, label='P99 (Single)', alpha=0.6, hatch='//')
ax4_single.set_xlabel('Model')
ax4_single.set_ylabel('Inference Time (ms)')
ax4_single.set_title('Percentile Comparison (P50, P95, P99)')
ax4_single.set_xticks(x)
ax4_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax4_single.legend(fontsize='small')
ax4_single.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(output_dir / "percentile_comparison.png", dpi=300, bbox_inches='tight')
plt.close(fig4)
print(f"Saved: percentile_comparison.png")

# 5. Standard deviation comparison
fig5, ax5_single = plt.subplots(figsize=(10, 6))
benchmark_std = [benchmark_models[m]['std'] * 1000 for m in all_model_names]
single_std = [single_benchmark_models[m]['std'] * 1000 for m in all_model_names]
x = np.arange(len(all_model_names))
width = 0.35
bars_std1 = ax5_single.bar(x - width/2, benchmark_std, width, label='Multi-benchmark', alpha=0.8)
bars_std2 = ax5_single.bar(x + width/2, single_std, width, label='Single-benchmark', alpha=0.8)
ax5_single.set_xlabel('Model')
ax5_single.set_ylabel('Standard Deviation (ms)')
ax5_single.set_title('Standard Deviation of Inference Times')
ax5_single.set_xticks(x)
ax5_single.set_xticklabels(all_model_names, rotation=45, ha='right')
ax5_single.legend()
ax5_single.grid(axis='y', alpha=0.3)
for bar in bars_std1:
    height = bar.get_height()
    ax5_single.annotate(f'{height:.4f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=7)
for bar in bars_std2:
    height = bar.get_height()
    ax5_single.annotate(f'{height:.4f}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=7)
plt.tight_layout()
plt.savefig(output_dir / "standard_deviation_comparison.png", dpi=300, bbox_inches='tight')
plt.close(fig5)
print(f"Saved: standard_deviation_comparison.png")

# 6. Summary statistics table
fig6, ax6_single = plt.subplots(figsize=(14, 6))
ax6_single.axis('off')
table_data = []
for model_name in all_model_names:
    row = [
        model_name,
        f"{benchmark_models[model_name]['mean']*1000:.3f} ± {benchmark_models[model_name]['std']*1000:.3f}",
        f"{benchmark_models[model_name]['min']*1000:.3f}",
        f"{benchmark_models[model_name]['max']*1000:.3f}",
        f"{benchmark_models[model_name]['accuracy']*100:.1f}%",
        f"{single_benchmark_models[model_name]['mean']*1000:.3f} ± {single_benchmark_models[model_name]['std']*1000:.3f}",
        f"{single_benchmark_models[model_name]['min']*1000:.3f}",
        f"{single_benchmark_models[model_name]['max']*1000:.3f}",
        f"{single_benchmark_models[model_name]['accuracy']*100:.1f}%"
    ]
    table_data.append(row)
columns = ['Model', 'Mean ± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)',
           'Mean ± Std (ms)', 'Min (ms)', 'Max (ms)', 'Acc (%)']
table = ax6_single.table(cellText=table_data, colLabels=columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1.1, 1.8)
for i in range(len(all_model_names)):
    for j in range(len(columns)):
        cell = table[(i+1, j)]
        cell.set_height(0.4)
        if j < 5:
            cell.set_facecolor('#f0f0f0')
        else:
            cell.set_facecolor('#e0e0f0')
ax6_single.set_title('Summary Statistics Comparison', fontsize=12, pad=20)
plt.tight_layout()
plt.savefig(output_dir / "summary_statistics.png", dpi=300, bbox_inches='tight')
plt.close(fig6)
print(f"Saved: summary_statistics.png")

print(f"\nAll individual visualizations saved to: {output_dir}")

# Also save as interactive HTML
html_output = Path(__file__).parent / "response_time_comparison.html"
with open(html_output, 'w') as f:
    f.write(f"""<!DOCTYPE html>
<html>
<head>
    <title>Benchmark Response Time Comparison</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1 {{ text-align: center; }}
        .chart {{ max-width: 1200px; margin: 0 auto; }}
        .model-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
        .model-title {{ font-weight: bold; font-size: 1.2em; margin-bottom: 10px; }}
        table {{ width: 100%; border-collapse: collapse; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #f4f4f4; }}
    </style>
</head>
<body>
    <h1>Benchmark Response Time Comparison</h1>
    <p><strong>Multi-benchmark:</strong> {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats</p>
    <p><strong>Single-benchmark:</strong> {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats</p>
    <p><img src="response_time_comparison.png" alt="Comparison Chart" class="chart"></p>
    <h2>Detailed Statistics</h2>
""")
    for model_name in all_model_names:
        f.write(f"""
    <div class="model-section">
        <div class="model-title">{model_name}</div>
        <table>
            <tr>
                <th>Metric</th>
                <th>Multi-benchmark</th>
                <th>Single-benchmark</th>
                <th>Change</th>
            </tr>
            <tr>
                <td>Mean (ms)</td>
                <td>{benchmark_models[model_name]['mean']*1000:.4f}</td>
                <td>{single_benchmark_models[model_name]['mean']*1000:.4f}</td>
                <td>{((single_benchmark_models[model_name]['mean'] - benchmark_models[model_name]['mean']) / benchmark_models[model_name]['mean'] * 100):.1f}%</td>
            </tr>
            <tr>
                <td>Std (ms)</td>
                <td>{benchmark_models[model_name]['std']*1000:.4f}</td>
                <td>{single_benchmark_models[model_name]['std']*1000:.4f}</td>
                <td>{((single_benchmark_models[model_name]['std'] - benchmark_models[model_name]['std']) / benchmark_models[model_name]['std'] * 100):.1f}%</td>
            </tr>
            <tr>
                <td>Min (ms)</td>
                <td>{benchmark_models[model_name]['min']*1000:.4f}</td>
                <td>{single_benchmark_models[model_name]['min']*1000:.4f}</td>
                <td>{((single_benchmark_models[model_name]['min'] - benchmark_models[model_name]['min']) / benchmark_models[model_name]['min'] * 100):.1f}%</td>
            </tr>
            <tr>
                <td>Max (ms)</td>
                <td>{benchmark_models[model_name]['max']*1000:.4f}</td>
                <td>{single_benchmark_models[model_name]['max']*1000:.4f}</td>
                <td>{((single_benchmark_models[model_name]['max'] - benchmark_models[model_name]['max']) / benchmark_models[model_name]['max'] * 100):.1f}%</td>
            </tr>
            <tr>
                <td>Accuracy</td>
                <td>{benchmark_models[model_name]['accuracy']*100:.1f}%</td>
                <td>{single_benchmark_models[model_name]['accuracy']*100:.1f}%</td>
                <td>{(single_benchmark_models[model_name]['accuracy'] - benchmark_models[model_name]['accuracy']) * 100:.1f}pp</td>
            </tr>
        </table>
    </div>
""")
    f.write("""
</body>
</html>""")
print(f"HTML report saved to: {html_output}")

# Print summary to console
print("\n=== Summary ===")
print(f"Multi-benchmark: {benchmark_data['num_samples']} samples, {benchmark_data['num_repeats']} repeats")
print(f"Single-benchmark: {single_benchmark_data['num_samples']} samples, {single_benchmark_data['num_repeats']} repeats")
print("\nModel Comparison:")
print("-" * 80)
for model_name in all_model_names:
    b_mean = benchmark_models[model_name]['mean'] * 1000
    s_mean = single_benchmark_models[model_name]['mean'] * 1000
    change = ((s_mean - b_mean) / b_mean * 100)
    print(f"{model_name:20s} | Multi: {b_mean:6.3f}ms | Single: {s_mean:6.3f}ms | Change: {change:+6.1f}%")