|
|
""" |
|
|
plot_results.py - Generate visualizations from benchmark results. |
|
|
|
|
|
Creates publication-quality plots comparing RippleGPT vs VanillaGPT2. |
|
|
""" |
|
|
|
|
|
import json |
|
|
import argparse |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Optional |
|
|
import matplotlib.pyplot as plt |
|
|
import matplotlib.patches as mpatches |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
# Shared color palette for every figure in this module (dark theme).
COLORS = {
    "ripple": "#4CAF50",      # RippleGPT series (green)
    "baseline": "#2196F3",    # VanillaGPT2 series (blue)
    "highlight": "#FF9800",   # reference lines / call-outs (orange)
    "background": "#1a1a2e",  # figure and axes background
    "text": "#ffffff",        # annotation / table text
    "grid": "#333355"         # grid lines, axes edges, table header fill
}

# Apply the dark theme globally at import time so every plot produced by
# this module is styled consistently without per-figure setup.
plt.style.use('dark_background')
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'figure.facecolor': COLORS['background'],
    'axes.facecolor': COLORS['background'],
    'savefig.facecolor': COLORS['background'],  # saved PNGs keep the dark bg
    'axes.edgecolor': COLORS['grid'],
    'axes.grid': True,
    'grid.color': COLORS['grid'],
    'grid.alpha': 0.3
})
|
|
|
|
|
|
|
|
def load_results(results_dir: Path) -> List[Dict]:
    """Load all benchmark result files from directory.

    Reads every ``benchmark_*.json`` file in *results_dir* (filesystem
    glob order) and returns the parsed payloads.
    """
    def _read(path: Path) -> Dict:
        # Parse a single JSON result file.
        with path.open() as handle:
            return json.load(handle)

    return [_read(p) for p in results_dir.glob("benchmark_*.json")]
|
|
|
|
|
|
|
|
def plot_parameter_comparison(results: List[Dict], output_path: Path):
    """Bar chart comparing parameter counts of RippleGPT vs VanillaGPT2.

    Args:
        results: Parsed benchmark result dicts (see ``load_results``).
        output_path: Directory where ``parameter_comparison.png`` is written.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    datasets = []
    ripple_params = []
    baseline_params = []

    for r in results:
        label = f"{r['metadata']['dataset']}_{r['metadata']['size']}"
        datasets.append(label)
        # Parameter counts are stored raw; plot in millions.
        ripple_params.append(r['parameters']['ripple'] / 1e6)
        baseline_params.append(r['parameters']['baseline'] / 1e6)

    x = np.arange(len(datasets))
    width = 0.35

    bars1 = ax.bar(x - width/2, ripple_params, width,
                   label='RippleGPT', color=COLORS['ripple'], alpha=0.9)
    bars2 = ax.bar(x + width/2, baseline_params, width,
                   label='VanillaGPT2', color=COLORS['baseline'], alpha=0.9)

    ax.set_ylabel('Parameters (Millions)')
    ax.set_title('Parameter Comparison: RippleGPT vs VanillaGPT2')
    ax.set_xticks(x)
    ax.set_xticklabels(datasets, rotation=15, ha='right')
    ax.legend()

    def _annotate(bars, values):
        # Value label just above each bar (e.g. "12.3M").
        for bar, val in zip(bars, values):
            ax.annotate(f'{val:.1f}M',
                        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                        xytext=(0, 3), textcoords="offset points",
                        ha='center', va='bottom', fontsize=9, color=COLORS['text'])

    _annotate(bars1, ripple_params)
    _annotate(bars2, baseline_params)

    plt.tight_layout()
    plt.savefig(output_path / 'parameter_comparison.png', dpi=150)
    plt.close()
    print(f"Saved: {output_path / 'parameter_comparison.png'}")
|
|
|
|
|
|
|
|
def plot_loss_curves(results: List[Dict], output_path: Path):
    """Plot training loss curves for all benchmarks on a subplot grid.

    Lays curves out in a 2-column grid (1 column for a single result) and
    hides any trailing unused axes of a partially filled grid.

    Args:
        results: Parsed benchmark result dicts.
        output_path: Directory where ``loss_curves.png`` is written.
    """
    n_results = len(results)
    cols = min(2, n_results)
    rows = (n_results + cols - 1) // cols  # ceiling division

    fig, axes = plt.subplots(rows, cols, figsize=(6*cols, 4*rows))
    # Normalize `axes` to a flat list: plt.subplots returns a bare Axes,
    # a 1-D array, or a 2-D array depending on the grid shape.
    if n_results == 1:
        axes = [axes]
    else:
        axes = list(np.ravel(axes))

    for idx, r in enumerate(results):
        ax = axes[idx]

        # Loss curves are stored as [(iteration, loss), ...] pairs.
        ripple_curve = r['ripple']['training']['loss_curve']
        baseline_curve = r['baseline']['training']['loss_curve']

        r_iters = [pt[0] for pt in ripple_curve]
        r_losses = [pt[1] for pt in ripple_curve]
        b_iters = [pt[0] for pt in baseline_curve]
        b_losses = [pt[1] for pt in baseline_curve]

        ax.plot(r_iters, r_losses, color=COLORS['ripple'],
                linewidth=2, label='RippleGPT', marker='o', markersize=4)
        ax.plot(b_iters, b_losses, color=COLORS['baseline'],
                linewidth=2, label='VanillaGPT2', marker='s', markersize=4)

        title = f"{r['metadata']['dataset'].capitalize()} ({r['metadata']['size']})"
        ax.set_title(title)
        ax.set_xlabel('Iteration')
        ax.set_ylabel('Loss')
        ax.legend(loc='upper right')

    # Blank out leftover axes when the grid has more cells than results.
    for idx in range(len(results), len(axes)):
        axes[idx].set_visible(False)

    plt.suptitle('Training Loss Curves', fontsize=16, y=1.02)
    plt.tight_layout()
    plt.savefig(output_path / 'loss_curves.png', dpi=150)
    plt.close()
    print(f"Saved: {output_path / 'loss_curves.png'}")
|
|
|
|
|
|
|
|
def plot_extrapolation(results: List[Dict], output_path: Path):
    """Plot extrapolation capability (perplexity vs. context-length ratio).

    Only results carrying a non-empty ``ripple.extrapolation`` section are
    plotted; if none exist a warning is printed and no figure is produced.

    Args:
        results: Parsed benchmark result dicts.
        output_path: Directory where ``extrapolation.png`` is written.
    """
    # Keep only runs that actually measured extrapolation.
    extrap_results = [r for r in results if r['ripple'].get('extrapolation')]

    if not extrap_results:
        print("Warning: No extrapolation data found in results")
        return

    fig, ax = plt.subplots(figsize=(10, 6))

    train_ppl = None  # kept after the loop for the baseline reference line
    for r in extrap_results:
        extrap = r['ripple']['extrapolation']
        train_block = r['metadata']['model_config']['block_size']

        # Keys are context sizes serialized as strings; values are
        # perplexities measured at that context length.
        sizes = sorted(int(k) for k in extrap.keys())
        ppls = [extrap[str(s)] for s in sizes]
        ratios = [s / train_block for s in sizes]

        # Anchor each curve at the training context (ratio 1.0) using the
        # perplexity implied by the final training loss.
        train_loss = r['ripple']['training']['final_loss']
        train_ppl = np.exp(train_loss)

        all_ratios = [1.0] + ratios
        all_ppls = [train_ppl] + ppls

        label = f"{r['metadata']['dataset']} ({r['metadata']['size']})"
        ax.plot(all_ratios, all_ppls, marker='o', linewidth=2,
                label=label, markersize=8)

    # NOTE(review): this reference line uses train_ppl from the LAST result
    # only; with multiple benchmarks the 'Training baseline' is ambiguous —
    # confirm whether a per-result baseline was intended.
    ax.axhline(y=train_ppl, color=COLORS['highlight'], linestyle='--',
               alpha=0.5, label='Training baseline')
    ax.axvline(x=1.0, color=COLORS['grid'], linestyle=':', alpha=0.5)

    ax.set_xlabel('Context Ratio (relative to training)')
    ax.set_ylabel('Perplexity')
    ax.set_title('RippleGPT Extrapolation Capability\n(Lower is better, <1.0x = shorter, >1.0x = longer than training)')
    ax.legend()

    # Mark the training-context position on the x axis.
    ax.annotate('Training\nContext', xy=(1.0, ax.get_ylim()[0]),
                xytext=(1.0, ax.get_ylim()[0] + 0.5),
                ha='center', fontsize=9, color=COLORS['text'])

    plt.tight_layout()
    plt.savefig(output_path / 'extrapolation.png', dpi=150)
    plt.close()
    print(f"Saved: {output_path / 'extrapolation.png'}")
|
|
|
|
|
|
|
|
def plot_summary_table(results: List[Dict], output_path: Path):
    """Create a summary table of all benchmarks as a PNG image.

    One row per benchmark: dataset, size, parameter counts (millions),
    final losses, and which model achieved the lower final loss.

    Args:
        results: Parsed benchmark result dicts.
        output_path: Directory where ``summary_table.png`` is written.
    """
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.axis('off')  # the table itself is the whole figure

    columns = ['Dataset', 'Size', 'Ripple Params', 'GPT2 Params',
               'Ripple Loss', 'GPT2 Loss', 'Winner']

    rows = []
    for r in results:
        r_params = f"{r['parameters']['ripple']/1e6:.1f}M"
        b_params = f"{r['parameters']['baseline']/1e6:.1f}M"
        r_loss = f"{r['ripple']['training']['final_loss']:.4f}"
        b_loss = f"{r['baseline']['training']['final_loss']:.4f}"

        # Winner is decided by lower final training loss (ties go to GPT2).
        winner = "RippleGPT" if r['ripple']['training']['final_loss'] < r['baseline']['training']['final_loss'] else "VanillaGPT2"

        rows.append([
            r['metadata']['dataset'].capitalize(),
            r['metadata']['size'].capitalize(),
            r_params,
            b_params,
            r_loss,
            b_loss,
            winner
        ])

    table = ax.table(
        cellText=rows,
        colLabels=columns,
        loc='center',
        cellLoc='center',
        colColours=[COLORS['grid']] * len(columns)
    )

    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.5)

    # Style the header row distinctly from the data rows.
    for (row, col), cell in table.get_celld().items():
        if row == 0:
            cell.set_text_props(weight='bold', color=COLORS['text'])
            cell.set_facecolor(COLORS['grid'])
        else:
            cell.set_facecolor(COLORS['background'])
            cell.set_text_props(color=COLORS['text'])

    ax.set_title('Benchmark Summary', fontsize=14, pad=20)
    plt.tight_layout()
    plt.savefig(output_path / 'summary_table.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved: {output_path / 'summary_table.png'}")
|
|
|
|
|
|
|
|
def generate_all_plots(results_dir: str):
    """Generate all plots from benchmark results.

    Loads every benchmark_*.json file under *results_dir* and writes the
    four figures into a ``plots/`` subdirectory (created if missing).
    Prints an error and returns early when the directory or results are
    missing.

    Args:
        results_dir: Path (as a string) to the benchmark results directory.
    """
    results_path = Path(results_dir)

    if not results_path.exists():
        print(f"Error: Results directory not found: {results_path}")
        return

    results = load_results(results_path)

    if not results:
        print(f"Error: No benchmark results found in {results_path}")
        return

    print(f"\nFound {len(results)} benchmark results")

    plots_dir = results_path / 'plots'
    plots_dir.mkdir(exist_ok=True)

    print("\nGenerating plots...")
    plot_parameter_comparison(results, plots_dir)
    plot_loss_curves(results, plots_dir)
    plot_extrapolation(results, plots_dir)
    plot_summary_table(results, plots_dir)

    print(f"\nAll plots saved to: {plots_dir}")
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: parse the results directory and render every plot.
    cli = argparse.ArgumentParser(description="Generate benchmark plots")
    cli.add_argument(
        "--results",
        type=str,
        default="validation/benchmarks/results",
        help="Path to results directory",
    )
    generate_all_plots(cli.parse_args().results)
|
|
|