#!/usr/bin/env python3
"""
Create visualizations for Model Scaling Study.

Generates publication-ready charts and tables (300 DPI PNGs) comparing three
model sizes (Base / Medium / Large) on generation quality and on the Nguyen
symbolic-regression benchmarks. Reads results from
``results_nguyen_benchmarks/summary.json`` and writes figures into
``visualizations/``.
"""

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Shared ordering, colors and sizes for the three models, used by every figure
# so bars/lines always line up across charts.
MODELS = ['Base', 'Medium', 'Large']
MODEL_COLORS = ['#3498db', '#e74c3c', '#2ecc71']
MODEL_PARAMS_M = [124, 355, 774]  # parameter counts, in millions

# Quality-evaluation results (500 generated samples per model); inlined here
# exactly as in the original analysis.
QUALITY_DATA = {
    'Base': {'valid_rate': 0.994, 'diversity': 0.978, 'unique': 489, 'samples': 500},
    'Medium': {'valid_rate': 0.992, 'diversity': 0.988, 'unique': 494, 'samples': 500},
    'Large': {'valid_rate': 1.000, 'diversity': 0.986, 'unique': 493, 'samples': 500},
}


def configure_style():
    """Apply publication-quality seaborn/matplotlib defaults."""
    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize'] = (12, 8)
    plt.rcParams['font.size'] = 12
    plt.rcParams['axes.labelsize'] = 14
    plt.rcParams['axes.titlesize'] = 16
    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['legend.fontsize'] = 12


def load_nguyen_data(path='results_nguyen_benchmarks/summary.json'):
    """Load the raw Nguyen benchmark summary JSON from *path*."""
    with open(path) as f:
        return json.load(f)


def summarize_nguyen(nguyen_data):
    """Aggregate per-model statistics from the raw Nguyen results.

    Returns a dict keyed by capitalized model name ('Base'/'Medium'/'Large')
    with the average valid rate, average and maximum best R², and the number
    of benchmarks solved with R² > 0.99.
    """
    stats = {}
    for model in ['base', 'medium', 'large']:
        results = [r for r in nguyen_data['results'] if r['model'] == model]
        stats[model.capitalize()] = {
            'avg_valid_rate': np.mean([r['valid_rate'] for r in results]),
            'avg_best_r2': np.mean([r['best_r2'] for r in results]),
            'max_r2': max(r['best_r2'] for r in results),
            'benchmarks_gt_099': sum(1 for r in results if r['best_r2'] > 0.99),
        }
    return stats


def _label_bars(ax, bars, offset, fmt, fontsize=12):
    """Write a bold value label just above each bar in *bars*."""
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + offset,
                fmt.format(height), ha='center', va='bottom',
                fontsize=fontsize, fontweight='bold')


def _save_figure(output_dir, filename):
    """Tight-layout, save the current figure at 300 DPI, and close it."""
    path = output_dir / filename
    plt.tight_layout()
    plt.savefig(path, dpi=300, bbox_inches='tight')
    print(f" Saved: {path}")
    plt.close()


def make_fig1(output_dir, benchmark_valid):
    """Figure 1: valid-expression rate in quality eval vs Nguyen benchmarks."""
    print("Creating Figure 1: Valid Rate Comparison...")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: quality-evaluation valid rates (as percentages).
    quality_valid = [QUALITY_DATA[m]['valid_rate'] * 100 for m in MODELS]
    bars1 = ax1.bar(MODELS, quality_valid, color=MODEL_COLORS, alpha=0.8,
                    edgecolor='black', linewidth=1.5)
    ax1.set_ylabel('Valid Expression Rate (%)', fontsize=14, fontweight='bold')
    ax1.set_title('Quality Evaluation\n(500 samples per model)',
                  fontsize=16, fontweight='bold')
    ax1.set_ylim([95, 101])
    ax1.axhline(y=100, color='green', linestyle='--', linewidth=2,
                label='Perfect (100%)')
    ax1.legend()
    _label_bars(ax1, bars1, 0.3, '{:.1f}%')

    # Right panel: average valid rates over the Nguyen benchmark runs.
    bars2 = ax2.bar(MODELS, benchmark_valid, color=MODEL_COLORS, alpha=0.8,
                    edgecolor='black', linewidth=1.5)
    ax2.set_ylabel('Valid Expression Rate (%)', fontsize=14, fontweight='bold')
    ax2.set_title('Nguyen Benchmarks\n(36 experiments, 3,600 expressions)',
                  fontsize=16, fontweight='bold')
    ax2.set_ylim([0, 100])
    _label_bars(ax2, bars2, 2, '{:.1f}%')

    _save_figure(output_dir, 'fig1_valid_rate_comparison.png')


def make_fig2(output_dir, avg_r2, max_r2):
    """Figure 2: average and maximum best-R² per model (grouped bars)."""
    print("Creating Figure 2: R² Performance...")
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(MODELS))
    width = 0.25

    bars1 = ax.bar(x - width / 2, avg_r2, width, label='Average Best R²',
                   color='#3498db', alpha=0.8, edgecolor='black', linewidth=1.5)
    bars2 = ax.bar(x + width / 2, max_r2, width, label='Maximum R²',
                   color='#e74c3c', alpha=0.8, edgecolor='black', linewidth=1.5)

    ax.set_ylabel('R² Score', fontsize=14, fontweight='bold')
    ax.set_title('Symbolic Regression Performance (Nguyen Benchmarks)',
                 fontsize=16, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(MODELS)
    ax.set_ylim([0.85, 1.05])
    # BUG FIX: the reference line must be drawn BEFORE legend() is called,
    # otherwise its 'Perfect Fit' label never appears in the legend (legend()
    # only collects artists that already exist).
    ax.axhline(y=1.0, color='green', linestyle='--', linewidth=2, alpha=0.5,
               label='Perfect Fit')
    ax.legend(fontsize=12)
    ax.grid(axis='y', alpha=0.3)

    # Label both bar groups with 4-decimal R² values.
    _label_bars(ax, list(bars1) + list(bars2), 0.01, '{:.4f}', fontsize=11)

    _save_figure(output_dir, 'fig2_r2_performance.png')


def make_fig3(output_dir, nguyen_data):
    """Figure 3: heatmap of best R² per (benchmark, model) cell."""
    print("Creating Figure 3: Per-Benchmark Heatmap...")

    # Build a 12x3 matrix of best-R² scores; missing runs show as 0.
    benchmark_matrix = np.zeros((12, 3))
    for i, bench in enumerate(range(1, 13)):
        for j, model in enumerate(['base', 'medium', 'large']):
            result = [r for r in nguyen_data['results']
                      if r['model'] == model and r['benchmark'] == f'nguyen_{bench}']
            if result:
                benchmark_matrix[i, j] = result[0]['best_r2']

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(benchmark_matrix, cmap='RdYlGn', aspect='auto',
                   vmin=0.5, vmax=1.0)

    ax.set_xticks(np.arange(3))
    ax.set_yticks(np.arange(12))
    ax.set_xticklabels(['Base\n(124M)', 'Medium\n(355M)', 'Large\n(774M)'],
                       fontsize=12)
    ax.set_yticklabels([f'Nguyen-{i+1}' for i in range(12)], fontsize=11)

    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label('R² Score', rotation=270, labelpad=20,
                   fontsize=14, fontweight='bold')

    # Annotate every cell with its numeric score.
    for i in range(12):
        for j in range(3):
            ax.text(j, i, f'{benchmark_matrix[i, j]:.3f}',
                    ha="center", va="center", color="black",
                    fontsize=10, fontweight='bold')

    ax.set_title('R² Scores by Model and Benchmark',
                 fontsize=16, fontweight='bold', pad=20)
    _save_figure(output_dir, 'fig3_benchmark_heatmap.png')


def make_fig4(output_dir, benchmark_valid, avg_r2):
    """Figure 4: valid rate and average R² as functions of model size."""
    print("Creating Figure 4: Scaling Progression...")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: benchmark valid rate vs parameter count.
    ax1.plot(MODEL_PARAMS_M, benchmark_valid, 'o-', color='#3498db',
             linewidth=3, markersize=12, label='Nguyen Valid Rate',
             markeredgecolor='black', markeredgewidth=2)
    ax1.set_xlabel('Model Size (Million Parameters)', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Valid Expression Rate (%)', fontsize=14, fontweight='bold')
    ax1.set_title('Valid Rate vs Model Size', fontsize=16, fontweight='bold')
    ax1.grid(True, alpha=0.3)
    ax1.legend(fontsize=12)
    for x, y in zip(MODEL_PARAMS_M, benchmark_valid):
        ax1.text(x, y + 2, f'{y:.1f}%', ha='center',
                 fontsize=11, fontweight='bold')

    # Right panel: average best R² vs parameter count.
    ax2.plot(MODEL_PARAMS_M, avg_r2, 'o-', color='#e74c3c', linewidth=3,
             markersize=12, label='Average Best R²',
             markeredgecolor='black', markeredgewidth=2)
    ax2.axhline(y=1.0, color='green', linestyle='--', linewidth=2, alpha=0.5,
                label='Perfect Fit')
    ax2.set_xlabel('Model Size (Million Parameters)', fontsize=14, fontweight='bold')
    ax2.set_ylabel('R² Score', fontsize=14, fontweight='bold')
    ax2.set_title('R² vs Model Size', fontsize=16, fontweight='bold')
    ax2.set_ylim([0.9, 1.02])
    ax2.grid(True, alpha=0.3)
    ax2.legend(fontsize=12)
    for x, y in zip(MODEL_PARAMS_M, avg_r2):
        ax2.text(x, y + 0.005, f'{y:.4f}', ha='center',
                 fontsize=11, fontweight='bold')

    _save_figure(output_dir, 'fig4_scaling_progression.png')


def main():
    """Load results, render all four figures, and report what was written."""
    configure_style()

    output_dir = Path('visualizations')
    output_dir.mkdir(exist_ok=True)

    print("=" * 80)
    print("CREATING VISUALIZATIONS FOR MODEL SCALING STUDY")
    print("=" * 80)
    print()

    print("Loading data...")
    nguyen_data = load_nguyen_data()
    nguyen_stats = summarize_nguyen(nguyen_data)
    print("Data loaded successfully!")
    print()

    # Series shared by several figures, in MODELS order.
    benchmark_valid = [nguyen_stats[m]['avg_valid_rate'] * 100 for m in MODELS]
    avg_r2 = [nguyen_stats[m]['avg_best_r2'] for m in MODELS]
    max_r2 = [nguyen_stats[m]['max_r2'] for m in MODELS]

    make_fig1(output_dir, benchmark_valid)
    make_fig2(output_dir, avg_r2, max_r2)
    make_fig3(output_dir, nguyen_data)
    make_fig4(output_dir, benchmark_valid, avg_r2)

    print()
    print("=" * 80)
    print("ALL VISUALIZATIONS CREATED SUCCESSFULLY!")
    print("=" * 80)
    print()
    print(f"Output directory: {output_dir.absolute()}")
    print()
    print("Generated files:")
    print(" 1. fig1_valid_rate_comparison.png - Quality vs Benchmark valid rates")
    print(" 2. fig2_r2_performance.png - R² scores comparison")
    print(" 3. fig3_benchmark_heatmap.png - Per-benchmark R² heatmap")
    print(" 4. fig4_scaling_progression.png - Scaling laws visualization")
    print()
    print("These figures are publication-ready (300 DPI, high resolution)")
    print()


if __name__ == '__main__':
    main()