"""
Gradient Clipping Experiment

This script demonstrates how gradient clipping stabilizes training by preventing
sudden large weight updates caused by rare, high-loss data points.

Experiment Setup:
- Simple model: Embedding(4, 16) -> Linear(16, 4)
- Vocabulary: ['A', 'B', 'C', 'D']
- Dataset: 1000 samples with imbalanced targets (990 'A', 10 'B')
- Compare training with and without gradient clipping
"""

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import random

# Set seeds for reproducibility
SEED = 42


def set_seeds(seed=SEED):
    """Set all random seeds for reproducibility."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


# =============================================================================
# 1. MODEL DEFINITION
# =============================================================================

class SimpleNextTokenModel(nn.Module):
    """
    Simple model that takes a token index and predicts the next token.
    Architecture: Embedding -> Linear
    """
    def __init__(self, vocab_size=4, embedding_dim=16):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
    
    def forward(self, x):
        """
        Args:
            x: Token indices of shape (batch_size,)
        Returns:
            Logits of shape (batch_size, vocab_size)
        """
        embedded = self.embedding(x)  # (batch_size, embedding_dim)
        logits = self.linear(embedded)  # (batch_size, vocab_size)
        return logits


# =============================================================================
# 2. DATASET CREATION
# =============================================================================

def create_imbalanced_dataset(n_samples=1000, n_rare=10, seed=SEED):
    """
    Create a synthetic dataset with imbalanced targets.
    
    Args:
        n_samples: Total number of samples
        n_rare: Number of rare 'B' samples
        seed: Random seed for reproducibility
    
    Returns:
        inputs: Random token indices (0-3)
        targets: 990 'A' (0) and 10 'B' (1)
        rare_indices: Indices where target is 'B'
    """
    # Set seed for reproducibility
    set_seeds(seed)
    
    vocab = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    
    # Random input tokens
    inputs = torch.randint(0, 4, (n_samples,))
    
    # Create imbalanced targets: mostly 'A' (0), few 'B' (1)
    targets = torch.zeros(n_samples, dtype=torch.long)  # All 'A' initially
    
    # Randomly select indices for rare 'B' samples
    rare_indices = random.sample(range(n_samples), n_rare)
    targets[rare_indices] = 1  # Set to 'B'
    
    return inputs, targets, sorted(rare_indices)


# =============================================================================
# 3. UTILITY FUNCTIONS
# =============================================================================

def compute_weight_norm(model):
    """Compute L2 norm of all model weights."""
    total_norm = 0.0
    for param in model.parameters():
        total_norm += param.data.norm(2).item() ** 2
    return total_norm ** 0.5


def get_initial_weights(seed=SEED):
    """Get initial weights for reproducible model initialization."""
    set_seeds(seed)
    model = SimpleNextTokenModel(vocab_size=4, embedding_dim=16)
    return {name: param.clone() for name, param in model.state_dict().items()}


def train_epoch(model, optimizer, criterion, inputs, targets, clip_grad=False, max_norm=1.0):
    """
    Train for one epoch, recording metrics at each step.
    
    Args:
        model: The neural network
        optimizer: SGD optimizer
        criterion: CrossEntropyLoss
        inputs: Input token indices
        targets: Target token indices
        clip_grad: Whether to apply gradient clipping
        max_norm: Maximum gradient norm (if clipping)
    
    Returns:
        losses: List of losses per step
        grad_norms: List of gradient norms per step (before clipping)
        weight_norms: List of weight norms per step
    """
    model.train()
    
    losses = []
    grad_norms = []
    weight_norms = []
    
    # Train on each sample individually to see the effect of rare samples
    for i in range(len(inputs)):
        x = inputs[i:i+1]  # Single sample
        y = targets[i:i+1]
        
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(x)
        loss = criterion(logits, y)
        
        # Backward pass
        loss.backward()
        
        # Compute gradient norm BEFORE clipping
        # Use a large value to just compute the norm without clipping
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), float('inf'))
        
        # Apply gradient clipping if requested
        if clip_grad:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        
        # Update weights
        optimizer.step()
        
        # Record metrics
        losses.append(loss.item())
        grad_norms.append(grad_norm.item())
        weight_norms.append(compute_weight_norm(model))
    
    return losses, grad_norms, weight_norms


# =============================================================================
# 4. TRAINING FUNCTIONS
# =============================================================================

def run_training(inputs, targets, rare_indices, clip_grad=False, max_norm=1.0, n_epochs=3, lr=0.1, init_weights=None):
    """
    Run complete training loop.
    
    Args:
        inputs: Input token indices
        targets: Target token indices
        rare_indices: Indices of rare 'B' samples
        clip_grad: Whether to apply gradient clipping
        max_norm: Maximum gradient norm threshold
        n_epochs: Number of training epochs
        lr: Learning rate
        init_weights: Initial model weights for reproducibility
    
    Returns:
        all_losses, all_grad_norms, all_weight_norms: Metrics across all steps
    """
    # Create fresh model with same initial weights
    set_seeds(SEED)
    model = SimpleNextTokenModel(vocab_size=4, embedding_dim=16)
    if init_weights:
        model.load_state_dict(init_weights)
    
    optimizer = optim.SGD(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    all_losses = []
    all_grad_norms = []
    all_weight_norms = []
    
    mode = "WITH" if clip_grad else "WITHOUT"
    print(f"\n{'='*60}")
    print(f"Training {mode} gradient clipping (max_norm={max_norm})")
    print(f"{'='*60}")
    
    for epoch in range(n_epochs):
        losses, grad_norms, weight_norms = train_epoch(
            model, optimizer, criterion, inputs, targets, 
            clip_grad=clip_grad, max_norm=max_norm
        )
        
        all_losses.extend(losses)
        all_grad_norms.extend(grad_norms)
        all_weight_norms.extend(weight_norms)
        
        avg_loss = np.mean(losses)
        max_grad = np.max(grad_norms)
        print(f"Epoch {epoch+1}/{n_epochs}: Avg Loss={avg_loss:.4f}, Max Grad Norm={max_grad:.4f}")
    
    return all_losses, all_grad_norms, all_weight_norms


# =============================================================================
# 5. PLOTTING FUNCTIONS
# =============================================================================

def plot_metrics(losses, grad_norms, weight_norms, title, filename, rare_indices=None, n_samples=1000):
    """
    Plot training metrics: loss, gradient norm, and weight norm.
    
    Args:
        losses: List of losses per step
        grad_norms: List of gradient norms per step
        weight_norms: List of weight norms per step
        title: Plot title
        filename: Output filename
        rare_indices: Indices of rare 'B' samples (for highlighting)
        n_samples: Number of samples per epoch
    """
    fig, axes = plt.subplots(3, 1, figsize=(12, 10), sharex=True)
    
    steps = range(len(losses))
    n_epochs = len(losses) // n_samples
    
    # Plot 1: Training Loss
    axes[0].plot(steps, losses, 'b-', alpha=0.7, linewidth=0.5)
    axes[0].set_ylabel('Training Loss', fontsize=12)
    axes[0].set_title(title, fontsize=14, fontweight='bold')
    axes[0].grid(True, alpha=0.3)
    
    # Highlight rare sample positions
    if rare_indices:
        for epoch in range(n_epochs):
            for idx in rare_indices:
                step = epoch * n_samples + idx
                if step < len(losses):
                    axes[0].axvline(x=step, color='red', alpha=0.3, linewidth=0.5)
    
    # Plot 2: Gradient Norm
    axes[1].plot(steps, grad_norms, 'g-', alpha=0.7, linewidth=0.5)
    axes[1].set_ylabel('Gradient L2 Norm', fontsize=12)
    axes[1].grid(True, alpha=0.3)
    
    # Add horizontal line at clipping threshold
    if "With" in title or "WITH" in title:
        axes[1].axhline(y=1.0, color='red', linestyle='--', label='Clip threshold (1.0)')
        axes[1].legend()
    
    if rare_indices:
        for epoch in range(n_epochs):
            for idx in rare_indices:
                step = epoch * n_samples + idx
                if step < len(grad_norms):
                    axes[1].axvline(x=step, color='red', alpha=0.3, linewidth=0.5)
    
    # Plot 3: Weight Norm
    axes[2].plot(steps, weight_norms, 'm-', alpha=0.7, linewidth=0.5)
    axes[2].set_ylabel('Weight L2 Norm', fontsize=12)
    axes[2].set_xlabel('Training Step', fontsize=12)
    axes[2].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(filename, dpi=150, bbox_inches='tight')
    plt.close()
    
    print(f"Plot saved to: {filename}")


def plot_comparison(metrics_no_clip, metrics_with_clip, rare_indices, filename, n_samples=1000):
    """
    Create side-by-side comparison plot.
    
    Args:
        metrics_no_clip: (losses, grad_norms, weight_norms) without clipping
        metrics_with_clip: (losses, grad_norms, weight_norms) with clipping
        rare_indices: Indices of rare 'B' samples
        filename: Output filename
        n_samples: Number of samples per epoch
    """
    fig, axes = plt.subplots(3, 2, figsize=(16, 12))
    
    losses_no, grads_no, weights_no = metrics_no_clip
    losses_with, grads_with, weights_with = metrics_with_clip
    
    steps = range(len(losses_no))
    n_epochs = len(losses_no) // n_samples
    
    # Column 1: Without Clipping
    axes[0, 0].plot(steps, losses_no, 'b-', alpha=0.7, linewidth=0.5)
    axes[0, 0].set_ylabel('Training Loss', fontsize=11)
    axes[0, 0].set_title('WITHOUT Gradient Clipping', fontsize=13, fontweight='bold', color='red')
    axes[0, 0].grid(True, alpha=0.3)
    
    axes[1, 0].plot(steps, grads_no, 'g-', alpha=0.7, linewidth=0.5)
    axes[1, 0].set_ylabel('Gradient L2 Norm', fontsize=11)
    axes[1, 0].grid(True, alpha=0.3)
    
    axes[2, 0].plot(steps, weights_no, 'm-', alpha=0.7, linewidth=0.5)
    axes[2, 0].set_ylabel('Weight L2 Norm', fontsize=11)
    axes[2, 0].set_xlabel('Training Step', fontsize=11)
    axes[2, 0].grid(True, alpha=0.3)
    
    # Column 2: With Clipping
    axes[0, 1].plot(steps, losses_with, 'b-', alpha=0.7, linewidth=0.5)
    axes[0, 1].set_title('WITH Gradient Clipping (max_norm=1.0)', fontsize=13, fontweight='bold', color='green')
    axes[0, 1].grid(True, alpha=0.3)
    
    axes[1, 1].plot(steps, grads_with, 'g-', alpha=0.7, linewidth=0.5)
    axes[1, 1].axhline(y=1.0, color='red', linestyle='--', linewidth=2, label='Clip threshold')
    axes[1, 1].legend(loc='upper right')
    axes[1, 1].grid(True, alpha=0.3)
    
    axes[2, 1].plot(steps, weights_with, 'm-', alpha=0.7, linewidth=0.5)
    axes[2, 1].set_xlabel('Training Step', fontsize=11)
    axes[2, 1].grid(True, alpha=0.3)
    
    # Highlight rare sample positions in all plots
    for col in range(2):
        for row in range(3):
            for epoch in range(n_epochs):
                for idx in rare_indices:
                    step = epoch * n_samples + idx
                    if step < len(losses_no):
                        axes[row, col].axvline(x=step, color='red', alpha=0.2, linewidth=0.5)
    
    # Add legend for rare samples
    axes[0, 0].axvline(x=-100, color='red', alpha=0.5, linewidth=2, label="Rare 'B' samples")
    axes[0, 0].legend(loc='upper right')
    
    # Add overall title
    fig.suptitle('Effect of Gradient Clipping on Training Stability\n(Red lines indicate rare "B" samples)', 
                 fontsize=14, fontweight='bold', y=1.02)
    
    plt.tight_layout()
    plt.savefig(filename, dpi=150, bbox_inches='tight')
    plt.close()
    
    print(f"Comparison plot saved to: {filename}")


# =============================================================================
# 6. MAIN EXECUTION
# =============================================================================

def main():
    print("="*60)
    print("GRADIENT CLIPPING EXPERIMENT")
    print("="*60)
    print("\nThis experiment demonstrates how gradient clipping stabilizes")
    print("training by preventing sudden large weight updates caused by")
    print("rare, high-loss data points.\n")
    
    # Create dataset ONCE (used for both runs)
    inputs, targets, rare_indices = create_imbalanced_dataset(n_samples=1000, n_rare=10, seed=SEED)
    
    print(f"Dataset created:")
    print(f"  Total samples: {len(inputs)}")
    print(f"  Target 'A' (0): {(targets == 0).sum().item()}")
    print(f"  Target 'B' (1): {(targets == 1).sum().item()}")
    print(f"  Rare 'B' indices: {rare_indices}")
    
    # Get initial weights (same for both runs)
    init_weights = get_initial_weights(seed=SEED)
    
    # Run training WITHOUT gradient clipping
    losses_no_clip, grads_no_clip, weights_no_clip = run_training(
        inputs, targets, rare_indices,
        clip_grad=False, n_epochs=3, lr=0.1, init_weights=init_weights
    )
    
    # Run training WITH gradient clipping
    losses_with_clip, grads_with_clip, weights_with_clip = run_training(
        inputs, targets, rare_indices,
        clip_grad=True, max_norm=1.0, n_epochs=3, lr=0.1, init_weights=init_weights
    )
    
    # Generate individual plots
    print("\n" + "="*60)
    print("GENERATING PLOTS")
    print("="*60)
    
    plot_metrics(
        losses_no_clip, grads_no_clip, weights_no_clip,
        "Training WITHOUT Gradient Clipping",
        "no_clipping.png",
        rare_indices
    )
    
    plot_metrics(
        losses_with_clip, grads_with_clip, weights_with_clip,
        "Training WITH Gradient Clipping (max_norm=1.0)",
        "with_clipping.png",
        rare_indices
    )
    
    # Generate comparison plot
    plot_comparison(
        (losses_no_clip, grads_no_clip, weights_no_clip),
        (losses_with_clip, grads_with_clip, weights_with_clip),
        rare_indices,
        "comparison.png"
    )
    
    # Print summary statistics
    print("\n" + "="*60)
    print("SUMMARY STATISTICS")
    print("="*60)
    
    print("\nWithout Gradient Clipping:")
    print(f"  Max Gradient Norm: {max(grads_no_clip):.4f}")
    print(f"  Mean Gradient Norm: {np.mean(grads_no_clip):.4f}")
    print(f"  Std Gradient Norm: {np.std(grads_no_clip):.4f}")
    print(f"  Final Weight Norm: {weights_no_clip[-1]:.4f}")
    print(f"  Final Loss: {losses_no_clip[-1]:.4f}")
    
    print("\nWith Gradient Clipping (max_norm=1.0):")
    print(f"  Max Gradient Norm: {max(grads_with_clip):.4f}")
    print(f"  Mean Gradient Norm: {np.mean(grads_with_clip):.4f}")
    print(f"  Std Gradient Norm: {np.std(grads_with_clip):.4f}")
    print(f"  Final Weight Norm: {weights_with_clip[-1]:.4f}")
    print(f"  Final Loss: {losses_with_clip[-1]:.4f}")
    
    # Return statistics for report
    return {
        'no_clip': {
            'max_grad': max(grads_no_clip),
            'mean_grad': np.mean(grads_no_clip),
            'std_grad': np.std(grads_no_clip),
            'final_weight': weights_no_clip[-1],
            'final_loss': losses_no_clip[-1]
        },
        'with_clip': {
            'max_grad': max(grads_with_clip),
            'mean_grad': np.mean(grads_with_clip),
            'std_grad': np.std(grads_with_clip),
            'final_weight': weights_with_clip[-1],
            'final_loss': losses_with_clip[-1]
        },
        'rare_indices': rare_indices
    }


if __name__ == "__main__":
    stats = main()
    print("\n" + "="*60)
    print("EXPERIMENT COMPLETE!")
    print("="*60)