| """Generate additional benchmarking plots.""" |
| import json |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
|
|
# Directory holding the per-model `<model>_samples.json` result files.
RESULTS_DIR = "/Users/abhdey/Documents/My LLM/Research & Experiment/serving-results"
# Directory the generated PNG plots are written into.
OUTPUT_DIR = "/Users/abhdey/Documents/My LLM/Research & Experiment/TinyLLMExperiment/serving-results"


# Parallel lists: position i in each list describes the same model
# (results-file stem, legend label, scatter colour).
models = [
    "tinystories_10m",
    "tinystories_7m",
    "tinystories_5m",
    "tinystories_2_5m",
]
labels = ["10M", "7M", "5M", "2.5M"]
colors = ["#0969da", "#e5383b", "#2d6a4f", "#9b5de5"]
|
|
# Load the per-sample metrics for every model, keyed by the short legend label.
# Each results file must parse to an iterable of samples that each carry a
# 'metrics' entry; only that entry is kept, in file order.
data = {}
for model, label in zip(models, labels):
    # JSON text is UTF-8 by specification; name the encoding explicitly so the
    # read does not depend on the platform's locale default.
    with open(f'{RESULTS_DIR}/{model}_samples.json', encoding='utf-8') as f:
        samples = json.load(f)
    data[label] = [s['metrics'] for s in samples]
|
|
| |
# Plot 4: scatter of 'tokens_generated' (x) against 'repetition_rate' scaled by
# 100 (y, labelled as a percentage), one colour-coded series per model.
# NOTE(review): the title hard-codes "100 samples per model"; nothing here
# checks how many samples were actually loaded -- confirm it matches the data.
fig, ax = plt.subplots(figsize=(10, 6))
for series_label, series_color in zip(labels, colors):
    records = data[series_label]
    x_tokens = [rec['tokens_generated'] for rec in records]
    y_repetition = [rec['repetition_rate'] * 100 for rec in records]
    ax.scatter(
        x_tokens,
        y_repetition,
        c=series_color,
        alpha=0.6,
        s=30,
        label=series_label,
    )
ax.set_title('Tokens Generated vs Repetition Rate (100 samples per model)', fontsize=13)
ax.set_xlabel('Tokens Generated', fontsize=12)
ax.set_ylabel('Repetition Rate (%)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)
# `fig` is the current pyplot figure, so these act on the same figure that the
# pyplot-level tight_layout/savefig/close calls would.
fig.tight_layout()
fig.savefig(f'{OUTPUT_DIR}/plot4_tokens_vs_repetition.png', dpi=150)
plt.close(fig)
print("Plot 4 saved: tokens vs repetition")
|
|
| |
# Plot 5: scatter of 'tokens_generated' (x) against 'unique_tokens' scaled by
# 100 (y, labelled "Vocab Diversity (%)"), one colour-coded series per model.
# NOTE(review): the y values are only a true percentage if 'unique_tokens' is
# stored as a 0-1 ratio rather than a raw count -- confirm against the code
# that writes the metrics.
# NOTE(review): the title hard-codes "100 samples per model"; verify the count.
fig, ax = plt.subplots(figsize=(10, 6))
for series_label, series_color in zip(labels, colors):
    records = data[series_label]
    x_tokens = [rec['tokens_generated'] for rec in records]
    y_diversity = [rec['unique_tokens'] * 100 for rec in records]
    ax.scatter(
        x_tokens,
        y_diversity,
        c=series_color,
        alpha=0.6,
        s=30,
        label=series_label,
    )
ax.set_title('Tokens Generated vs Vocab Diversity (100 samples per model)', fontsize=13)
ax.set_xlabel('Tokens Generated', fontsize=12)
ax.set_ylabel('Vocab Diversity (%)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)
# `fig` is the current pyplot figure, so these act on the same figure that the
# pyplot-level tight_layout/savefig/close calls would.
fig.tight_layout()
fig.savefig(f'{OUTPUT_DIR}/plot5_tokens_vs_diversity.png', dpi=150)
plt.close(fig)
print("Plot 5 saved: tokens vs diversity")
|
|
| |
# Plot 6: scatter of 'avg_token_prob' (x) against 'coherence_length' (y), both
# plotted as stored, one colour-coded series per model.
# NOTE(review): the title hard-codes "100 samples per model"; verify the count.
fig, ax = plt.subplots(figsize=(10, 6))
for series_label, series_color in zip(labels, colors):
    records = data[series_label]
    x_confidence = [rec['avg_token_prob'] for rec in records]
    y_coherence = [rec['coherence_length'] for rec in records]
    ax.scatter(
        x_confidence,
        y_coherence,
        c=series_color,
        alpha=0.6,
        s=30,
        label=series_label,
    )
ax.set_title('Token Confidence vs Coherence Length (100 samples per model)', fontsize=13)
ax.set_xlabel('Avg Token Probability', fontsize=12)
ax.set_ylabel('Coherence Length', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)
# `fig` is the current pyplot figure, so these act on the same figure that the
# pyplot-level tight_layout/savefig/close calls would.
fig.tight_layout()
fig.savefig(f'{OUTPUT_DIR}/plot6_confidence_vs_coherence.png', dpi=150)
plt.close(fig)
print("Plot 6 saved: confidence vs coherence")
|
|
| |
# Plot 9: scatter of 'tps' (x) against 'total_latency_ms' (y), one colour-coded
# series per model. The first sample of every model is skipped; the title
# describes that sample as the cold start.
# NOTE(review): the title hard-codes "99 samples per model"; verify the count.
fig, ax = plt.subplots(figsize=(10, 6))
for series_label, series_color in zip(labels, colors):
    warm_records = data[series_label][1:]
    x_tps = [rec['tps'] for rec in warm_records]
    y_latency = [rec['total_latency_ms'] for rec in warm_records]
    ax.scatter(
        x_tps,
        y_latency,
        c=series_color,
        alpha=0.6,
        s=30,
        label=series_label,
    )
ax.set_title('TPS vs Total Latency (99 samples per model, cold start excluded)', fontsize=13)
ax.set_xlabel('TPS (tokens/sec)', fontsize=12)
ax.set_ylabel('Total Latency (ms)', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)
# `fig` is the current pyplot figure, so these act on the same figure that the
# pyplot-level tight_layout/savefig/close calls would.
fig.tight_layout()
fig.savefig(f'{OUTPUT_DIR}/plot9_tps_vs_latency.png', dpi=150)
plt.close(fig)
print("Plot 9 saved: tps vs latency")


# Final summary line once every plot has been written.
print(f"\nAll 4 plots saved to: {OUTPUT_DIR}")
|
|