AmberLJC
/

activation_functions

Model card Files Files and versions

xet

Community

AmberLJC committed on Jan 29

Commit

778f232

verified ·

1 Parent(s): dee95e9

Upload train_dynamics.py with huggingface_hub

Browse files

Files changed (1) hide show

train_dynamics.py +742 -0

train_dynamics.py ADDED Viewed

	@@ -0,0 +1,742 @@

+"""
+Activation Functions Comparison Experiment - Extended Training Dynamics Analysis
+Compares Linear, Sigmoid, ReLU, Leaky ReLU, and GELU activation functions
+on a deep neural network (10 hidden layers) for 1D non-linear regression.
+NEW FEATURES:
+- Gradient measurements at epochs 1, 100, and 200
+- Training dynamics visualizations showing how activations evolve
+- Gradient flow evolution over training
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import matplotlib.pyplot as plt
+import json
+import os
+from datetime import datetime
+# Set random seeds for reproducibility
+np.random.seed(42)  # seeds NumPy's global RNG, which draws the dataset noise below
+torch.manual_seed(42)  # seeded once here; the models created later draw from this RNG one after another
+# Create output directory
+os.makedirs('activation_functions', exist_ok=True)  # every JSON/PNG written by this script goes into this directory
+print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting Activation Functions - Training Dynamics Experiment")
+print("=" * 70)
+# ============================================================
+# 1. Generate Synthetic Dataset
+# ============================================================
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Generating synthetic dataset...")
+x = np.linspace(-np.pi, np.pi, 200)  # 200 evenly spaced inputs on [-pi, pi]
+y = np.sin(x) + np.random.normal(0, 0.1, 200)  # sin(x) plus Gaussian noise (mean 0, std 0.1)
+# Convert to PyTorch tensors
+X_train = torch.tensor(x, dtype=torch.float32).reshape(-1, 1)  # column vector: one feature per sample
+Y_train = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)  # column vector of noisy targets
+# Create a fine grid for evaluation/visualization
+x_eval = np.linspace(-np.pi, np.pi, 500)  # 500 points over the same interval as the training data
+X_eval = torch.tensor(x_eval, dtype=torch.float32).reshape(-1, 1)
+y_true = np.sin(x_eval)  # Ground truth
+print(f"  Training samples: {len(X_train)}")
+print(f"  Evaluation samples: {len(X_eval)}")
+# ============================================================
+# 2. Define Deep MLP Architecture
+# ============================================================
+class DeepMLP(nn.Module):
+    """
+    Deep MLP with 10 hidden layers of 64 neurons each.
+    Stores intermediate activations and gradients for analysis.
+    """
+    def __init__(self, activation_fn=None, activation_name="linear"):  # activation_fn=None means no nonlinearity is applied anywhere
+        super(DeepMLP, self).__init__()
+        self.activation_name = activation_name  # label only; not read by forward()
+        # Input layer
+        self.input_layer = nn.Linear(1, 64)  # maps the scalar input to 64 features
+        # 10 hidden layers
+        self.hidden_layers = nn.ModuleList([
+            nn.Linear(64, 64) for _ in range(10)
+        ])
+        # Output layer
+        self.output_layer = nn.Linear(64, 1)  # maps the 64 features back to a scalar prediction
+        # Activation function
+        self.activation_fn = activation_fn  # any callable applied element-wise to a tensor, or None
+        # Storage for activations (for analysis)
+        self.activations = {}  # filled by forward(store_activations=True); keys 'layer_1' .. 'layer_10'
+    def forward(self, x, store_activations=False):  # returns the network output for input batch x
+        # Input layer
+        x = self.input_layer(x)
+        if self.activation_fn is not None:
+            x = self.activation_fn(x)  # the input layer's activation is applied but never stored
+        # Hidden layers
+        for i, layer in enumerate(self.hidden_layers):
+            x = layer(x)
+            if self.activation_fn is not None:
+                x = self.activation_fn(x)
+            # Store activations for all layers when requested
+            if store_activations:
+                self.activations[f'layer_{i+1}'] = x.detach().clone()  # 1-based key; detached copy, so it holds no autograd graph
+        # Output layer (no activation)
+        x = self.output_layer(x)
+        return x  # self.activations keeps the values from the most recent store_activations=True call
+    def get_gradient_magnitudes(self):  # reads .grad, so call after backward() and before the next zero_grad()
+        """Get average gradient magnitude for each hidden layer."""
+        magnitudes = []
+        for i, layer in enumerate(self.hidden_layers):  # NOTE(review): index i is unused in this loop
+            if layer.weight.grad is not None:
+                mag = layer.weight.grad.abs().mean().item()  # mean absolute weight gradient; bias gradients are not included
+                magnitudes.append(mag)
+            else:
+                magnitudes.append(0.0)  # no gradient available yet for this layer
+        return magnitudes  # list of floats, one per hidden layer, in layer order
+    def get_weight_stats(self):  # returns a list of dicts, one per hidden layer
+        """Get weight statistics for each hidden layer."""
+        stats = []
+        for i, layer in enumerate(self.hidden_layers):  # NOTE(review): index i is unused in this loop
+            w = layer.weight.data  # weight matrix only; biases are not summarised
+            stats.append({
+                'mean': w.mean().item(),
+                'std': w.std().item(),
+                'min': w.min().item(),
+                'max': w.max().item()
+            })
+        return stats
+def create_model(activation_type):  # activation_type: one of "linear", "sigmoid", "relu", "leaky_relu", "gelu"
+    """Create a model with the specified activation function."""
+    if activation_type == "linear":
+        return DeepMLP(activation_fn=None, activation_name="linear")  # no activation: the stack is purely linear layers
+    elif activation_type == "sigmoid":
+        return DeepMLP(activation_fn=torch.sigmoid, activation_name="sigmoid")
+    elif activation_type == "relu":
+        return DeepMLP(activation_fn=torch.relu, activation_name="relu")
+    elif activation_type == "leaky_relu":
+        return DeepMLP(activation_fn=nn.LeakyReLU(0.01), activation_name="leaky_relu")  # negative slope 0.01
+    elif activation_type == "gelu":
+        return DeepMLP(activation_fn=nn.GELU(), activation_name="gelu")
+    else:
+        raise ValueError(f"Unknown activation type: {activation_type}")  # any other string is rejected
+# ============================================================
+# 3. Training Function with Extended Metrics
+# ============================================================
+def train_model(model, X_train, Y_train, X_eval, epochs=500, lr=0.001):  # full-batch training: one optimizer step per epoch
+    """
+    Train a model and collect comprehensive metrics.
+    Returns:
+        - loss_history: List of losses per epoch
+        - gradient_history: Dict of gradient magnitudes at key epochs (1, 100, 200)
+        - activation_history: Activations at various epochs
+        - weight_history: Weight statistics over training
+        - prediction_history: Model predictions at key epochs
+    """
+    optimizer = optim.Adam(model.parameters(), lr=lr)
+    criterion = nn.MSELoss()
+    loss_history = []
+    gradient_history = {}  # Gradients at epochs 1, 100, 200
+    activation_history = {}  # epoch -> {layer key -> NumPy array of activations on X_train}
+    weight_history = {}  # epoch -> list of per-layer weight statistics
+    prediction_history = {}  # epoch -> flattened predictions on X_eval
+    # Key epochs for analysis
+    gradient_epochs = [1, 100, 200]  # Epochs to measure gradients
+    activation_epochs = [0, 50, 100, 150, 200, 300, 400, 499]  # Epochs to save activations
+    prediction_epochs = [0, 50, 100, 200, 300, 499]  # Epochs to save predictions
+    for epoch in range(epochs):  # 0-based, so "epoch 1" above is the second step; NOTE(review): 499 is the last epoch only when epochs == 500
+        model.train()
+        optimizer.zero_grad()
+        # Forward pass (store activations at specific epochs)
+        store_acts = epoch in activation_epochs
+        predictions = model(X_train, store_activations=store_acts)  # model must accept the store_activations keyword
+        # Compute loss
+        loss = criterion(predictions, Y_train)
+        # Backward pass
+        loss.backward()
+        # Capture gradient magnitudes at key epochs
+        if epoch in gradient_epochs:
+            gradient_history[epoch] = model.get_gradient_magnitudes()  # captured after backward() and before the weight update
+            print(f"    [Gradient Capture] Epoch {epoch}: Layer 1={gradient_history[epoch][0]:.2e}, Layer 10={gradient_history[epoch][9]:.2e}")  # index 9 requires at least 10 entries
+        # Update weights
+        optimizer.step()
+        # Record loss
+        loss_history.append(loss.item())  # loss computed before this epoch's weight update
+        # Store activations
+        if store_acts:
+            activation_history[epoch] = {
+                k: v.numpy().copy() for k, v in model.activations.items()  # NOTE(review): .numpy() presumably requires CPU tensors - confirm if a device is ever set
+            }
+        # Store weight statistics periodically
+        if epoch % 50 == 0:
+            weight_history[epoch] = model.get_weight_stats()  # taken after optimizer.step(), so epoch 0 already reflects one update
+        # Store predictions at key epochs
+        if epoch in prediction_epochs:
+            model.eval()
+            with torch.no_grad():
+                pred = model(X_eval)  # store_activations is not passed here, so it takes the model's default
+                prediction_history[epoch] = pred.numpy().flatten()
+            model.train()
+        # Print progress
+        if epoch % 100 == 0 or epoch == epochs - 1:
+            print(f"    Epoch {epoch:4d}/{epochs}: Loss = {loss.item():.6f}")
+    return loss_history, gradient_history, activation_history, weight_history, prediction_history
+# ============================================================
+# 4. Train All Models
+# ============================================================
+activation_types = ["linear", "sigmoid", "relu", "leaky_relu", "gelu"]  # training order; also fixes bar/panel order in the plots
+activation_labels = {
+    "linear": "Linear (None)",
+    "sigmoid": "Sigmoid",
+    "relu": "ReLU",
+    "leaky_relu": "Leaky ReLU",
+    "gelu": "GELU"
+}
+results = {}  # activation type -> dict holding the trained model and all collected metrics
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Training models with extended metrics...")
+print("=" * 70)
+for act_type in activation_types:
+    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Training {activation_labels[act_type]} model...")
+    model = create_model(act_type)  # the RNG is not re-seeded per model, so initial weights differ between activations
+    loss_history, grad_history, act_history, weight_history, pred_history = train_model(
+        model, X_train, Y_train, X_eval, epochs=500, lr=0.001
+    )
+    # Get final predictions
+    model.eval()
+    with torch.no_grad():
+        final_predictions = model(X_eval, store_activations=True)  # overwrites model.activations with values computed on the evaluation grid
+    results[act_type] = {
+        "model": model,
+        "loss_history": loss_history,
+        "gradient_history": grad_history,  # Gradients at epochs 1, 100, 200
+        "activation_history": act_history,  # activations computed on X_train during training
+        "weight_history": weight_history,
+        "prediction_history": pred_history,
+        "final_predictions": final_predictions.numpy().flatten(),
+        "final_activations": {k: v.numpy().copy() for k, v in model.activations.items()},  # computed on X_eval, unlike activation_history
+        "final_loss": loss_history[-1]  # training loss recorded in the last epoch, not a loss on X_eval
+    }
+    print(f"    Final MSE Loss: {loss_history[-1]:.6f}")
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] All models trained!")
+# ============================================================
+# 5. Save Extended Data
+# ============================================================
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Saving extended data...")
+# Save gradient magnitudes at all measured epochs
+gradient_data = {}  # activation type -> {epoch as string -> list of per-layer gradient magnitudes}
+for act_type in activation_types:
+    gradient_data[act_type] = {
+        str(epoch): grads for epoch, grads in results[act_type]["gradient_history"].items()  # epoch keys are stringified explicitly for the JSON file
+    }
+with open('activation_functions/gradient_magnitudes_epochs.json', 'w') as f:
+    json.dump(gradient_data, f, indent=2)
+# Save loss histories
+loss_data = {
+    act_type: results[act_type]["loss_history"]
+    for act_type in activation_types
+}
+with open('activation_functions/loss_histories.json', 'w') as f:
+    json.dump(loss_data, f, indent=2)
+# Save final losses
+final_losses = {
+    act_type: results[act_type]["final_loss"]
+    for act_type in activation_types
+}  # reused by the summary section at the end of the script
+with open('activation_functions/final_losses.json', 'w') as f:
+    json.dump(final_losses, f, indent=2)
+print("  Saved: gradient_magnitudes_epochs.json, loss_histories.json, final_losses.json")
+# ============================================================
+# 6. Generate Visualizations
+# ============================================================
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Generating visualizations...")
+# Set style
+plt.style.use('seaborn-v0_8-whitegrid')  # NOTE(review): this style name presumably depends on the installed Matplotlib version - confirm
+colors = {
+    "linear": "#1f77b4",
+    "sigmoid": "#ff7f0e",
+    "relu": "#2ca02c",
+    "leaky_relu": "#d62728",
+    "gelu": "#9467bd"
+}  # one fixed colour per activation, shared by every figure below
+# --- Plot 1: Learned Functions ---
+print("  Creating learned_functions.png...")
+fig, ax = plt.subplots(figsize=(12, 8))
+# Ground truth
+ax.plot(x_eval, y_true, 'k-', linewidth=2.5, label='Ground Truth (sin(x))', zorder=10)  # high zorder draws it on top of the model curves
+# Noisy data points
+ax.scatter(x, y, c='gray', alpha=0.5, s=30, label='Noisy Data', zorder=5)
+# Learned functions
+for act_type in activation_types:
+    ax.plot(x_eval, results[act_type]["final_predictions"],
+            color=colors[act_type], linewidth=2,
+            label=f'{activation_labels[act_type]} (MSE: {results[act_type]["final_loss"]:.4f})',  # the MSE shown is the last recorded training loss
+            alpha=0.8)
+ax.set_xlabel('x', fontsize=12)
+ax.set_ylabel('y', fontsize=12)
+ax.set_title('Learned Functions: Comparison of Activation Functions\n(10 Hidden Layers, 64 Neurons Each, 500 Epochs)', fontsize=14)
+ax.legend(loc='upper right', fontsize=10)
+ax.set_xlim(-np.pi, np.pi)
+ax.set_ylim(-1.5, 1.5)  # fixed range; predictions outside it are clipped from view
+ax.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.savefig('activation_functions/learned_functions.png', dpi=150, bbox_inches='tight')
+plt.close()  # release the figure before the next one is created
+# --- Plot 2: Loss Curves ---
+print("  Creating loss_curves.png...")
+fig, ax = plt.subplots(figsize=(12, 8))
+for act_type in activation_types:
+    ax.plot(results[act_type]["loss_history"],  # x-axis is the list index, i.e. the 0-based epoch
+            color=colors[act_type], linewidth=2,
+            label=f'{activation_labels[act_type]}')
+ax.set_xlabel('Epoch', fontsize=12)
+ax.set_ylabel('MSE Loss', fontsize=12)
+ax.set_title('Training Loss Curves: Comparison of Activation Functions', fontsize=14)
+ax.legend(loc='upper right', fontsize=10)
+ax.set_yscale('log')  # log scale so losses of different magnitudes stay distinguishable
+ax.grid(True, alpha=0.3)
+plt.tight_layout()
+plt.savefig('activation_functions/loss_curves.png', dpi=150, bbox_inches='tight')
+plt.close()
+# --- Plot 3: Gradient Flow at Epochs 1, 100, 200 ---
+print("  Creating gradient_flow_epochs.png...")
+fig, axes = plt.subplots(1, 3, figsize=(18, 6))  # one panel per captured epoch
+gradient_epochs = [1, 100, 200]  # must match the epochs captured inside train_model
+layer_indices = list(range(1, 11))  # 1-based labels for the 10 hidden layers
+for idx, epoch in enumerate(gradient_epochs):
+    ax = axes[idx]
+    bar_width = 0.15
+    x_positions = np.arange(len(layer_indices))
+    for i, act_type in enumerate(activation_types):
+        grad_mags = results[act_type]["gradient_history"].get(epoch, [0]*10)  # zeros when this epoch was not captured
+        offset = (i - 2) * bar_width  # centres the five grouped bars on each layer tick
+        bars = ax.bar(x_positions + offset, grad_mags, bar_width,  # `bars` is not used afterwards
+                      label=activation_labels[act_type] if idx == 0 else "",
+                      color=colors[act_type], alpha=0.8)
+    ax.set_xlabel('Hidden Layer', fontsize=11)
+    ax.set_ylabel('Avg Gradient Magnitude', fontsize=11)
+    ax.set_title(f'Epoch {epoch}', fontsize=13, fontweight='bold')
+    ax.set_xticks(x_positions)
+    ax.set_xticklabels([f'L{i}' for i in layer_indices], fontsize=9)
+    ax.set_yscale('log')
+    ax.grid(True, alpha=0.3, axis='y')
+    ax.set_ylim(1e-12, 1e0)  # same fixed range on all panels; values below 1e-12 (including the zero fallback) are not visible
+# Add legend to first subplot
+axes[0].legend(loc='upper right', fontsize=9)
+fig.suptitle('Gradient Flow Analysis Across Training\n(Gradient Magnitude per Layer at Epochs 1, 100, 200)', fontsize=14, y=1.02)
+plt.tight_layout()
+plt.savefig('activation_functions/gradient_flow_epochs.png', dpi=150, bbox_inches='tight')
+plt.close()
+# --- Plot 4: Original Gradient Flow (Epoch 1 only for compatibility) ---
+print("  Creating gradient_flow.png...")
+fig, ax = plt.subplots(figsize=(12, 8))
+bar_width = 0.15
+x_positions = np.arange(len(layer_indices))  # relies on layer_indices defined for the previous figure
+for i, act_type in enumerate(activation_types):
+    grad_mags = results[act_type]["gradient_history"].get(1, [0]*10)  # epoch key 1 only; zeros if it was not captured
+    offset = (i - 2) * bar_width  # centres the five grouped bars on each layer tick
+    bars = ax.bar(x_positions + offset, grad_mags, bar_width,  # `bars` is not used afterwards
+                  label=activation_labels[act_type], color=colors[act_type], alpha=0.8)
+ax.set_xlabel('Hidden Layer', fontsize=12)
+ax.set_ylabel('Average Gradient Magnitude', fontsize=12)
+ax.set_title('Gradient Flow Analysis: Average Gradient Magnitude per Layer\n(Measured at Epoch 1)', fontsize=14)
+ax.set_xticks(x_positions)
+ax.set_xticklabels([f'Layer {i}' for i in layer_indices])
+ax.legend(loc='upper right', fontsize=10)
+ax.set_yscale('log')  # unlike the three-panel figure, no fixed y-limits are set here
+ax.grid(True, alpha=0.3, axis='y')
+plt.tight_layout()
+plt.savefig('activation_functions/gradient_flow.png', dpi=150, bbox_inches='tight')
+plt.close()
+# --- Plot 5: Hidden Activations ---
+print("  Creating hidden_activations.png...")
+fig, axes = plt.subplots(3, 5, figsize=(18, 12))  # rows: selected layers, columns: activation types
+layers_to_plot = ['layer_1', 'layer_5', 'layer_10']  # keys as written by DeepMLP.forward
+layer_titles = ['Layer 1 (First)', 'Layer 5 (Middle)', 'Layer 10 (Last)']
+for row, (layer_key, layer_title) in enumerate(zip(layers_to_plot, layer_titles)):
+    for col, act_type in enumerate(activation_types):
+        ax = axes[row, col]
+        # Get activations for this layer
+        activations = results[act_type]["final_activations"].get(layer_key, None)  # post-training activations on the evaluation grid
+        if activations is not None:
+            # Plot histogram of activation values
+            ax.hist(activations.flatten(), bins=50, color=colors[act_type],  # no finite-value filtering here, unlike the evolution figure
+                    alpha=0.7, edgecolor='black', linewidth=0.5)
+            # Add statistics
+            mean_val = activations.mean()
+            std_val = activations.std()
+            ax.axvline(mean_val, color='red', linestyle='--', linewidth=1.5)  # dashed red line marks the mean
+            ax.set_title(f'{activation_labels[act_type]}\n{layer_title}', fontsize=10)
+            ax.set_xlabel('Activation Value', fontsize=8)
+            ax.set_ylabel('Frequency', fontsize=8)
+            # Add text box with stats
+            textstr = f'μ={mean_val:.3f}\nσ={std_val:.3f}'
+            props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
+            ax.text(0.95, 0.95, textstr, transform=ax.transAxes, fontsize=8,  # axes coordinates: top-right corner of the panel
+                    verticalalignment='top', horizontalalignment='right', bbox=props)
+        else:
+            ax.text(0.5, 0.5, 'No Data', ha='center', va='center', transform=ax.transAxes)  # placeholder when the layer key is missing
+            ax.set_title(f'{activation_labels[act_type]}\n{layer_title}', fontsize=10)
+fig.suptitle('Hidden Layer Activation Distributions (After Training)', fontsize=14, y=1.02)
+plt.tight_layout()
+plt.savefig('activation_functions/hidden_activations.png', dpi=150, bbox_inches='tight')
+plt.close()
+# --- NEW Plot 6: Training Dynamics - Function Learning Over Time ---
+print("  Creating training_dynamics_functions.png...")
+fig, axes = plt.subplots(2, 3, figsize=(16, 10))
+axes = axes.flatten()  # flat indexing: one panel per activation, sixth panel unused
+# Show how each activation learns the function over epochs
+prediction_epochs = [0, 50, 100, 200, 300, 499]  # must match the epochs saved inside train_model
+epoch_colors = plt.cm.viridis(np.linspace(0, 1, len(prediction_epochs)))  # one colour per epoch, ordered along the colormap
+for idx, act_type in enumerate(activation_types):
+    ax = axes[idx]
+    # Ground truth
+    ax.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)
+    # Predictions at different epochs
+    for ep_idx, epoch in enumerate(prediction_epochs):
+        if epoch in results[act_type]["prediction_history"]:  # epochs that were not recorded are silently skipped
+            pred = results[act_type]["prediction_history"][epoch]
+            ax.plot(x_eval, pred, color=epoch_colors[ep_idx], linewidth=1.5,
+                    label=f'Epoch {epoch}', alpha=0.8)
+    ax.set_xlabel('x', fontsize=10)
+    ax.set_ylabel('y', fontsize=10)
+    ax.set_title(f'{activation_labels[act_type]}', fontsize=12, fontweight='bold')
+    ax.set_xlim(-np.pi, np.pi)
+    ax.set_ylim(-2, 2)  # fixed range; predictions outside it are clipped from view
+    ax.grid(True, alpha=0.3)
+    ax.legend(loc='upper right', fontsize=7)
+# Hide the 6th subplot (we have 5 activations)
+axes[5].axis('off')
+fig.suptitle('Training Dynamics: How Each Activation Learns the Function Over Time', fontsize=14, y=1.02)
+plt.tight_layout()
+plt.savefig('activation_functions/training_dynamics_functions.png', dpi=150, bbox_inches='tight')
+plt.close()
+# --- NEW Plot 7: Gradient Evolution Over Training ---
+print("  Creating gradient_evolution.png...")
+fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+# Left plot: Gradient ratio (Layer 10 / Layer 1) evolution
+ax1 = axes[0]
+gradient_epochs = [1, 100, 200]  # must match the epochs captured inside train_model
+x_pos = np.arange(len(gradient_epochs))
+bar_width = 0.15
+for i, act_type in enumerate(activation_types):
+    ratios = []
+    for epoch in gradient_epochs:
+        grads = results[act_type]["gradient_history"].get(epoch, [1e-10]*10)  # uniform fallback gives a ratio of 1 when the epoch is missing
+        # Avoid division by zero
+        if grads[0] > 1e-15:
+            ratio = grads[9] / grads[0]  # Layer 10 / Layer 1
+        else:
+            ratio = 1e10  # Very large ratio indicates vanishing gradients
+        ratios.append(ratio)  # NOTE(review): the summary table uses inf for this same case, not 1e10
+    offset = (i - 2) * bar_width  # centres the five grouped bars on each epoch tick
+    ax1.bar(x_pos + offset, ratios, bar_width, label=activation_labels[act_type],
+            color=colors[act_type], alpha=0.8)
+ax1.set_xlabel('Epoch', fontsize=12)
+ax1.set_ylabel('Gradient Ratio (Layer 10 / Layer 1)', fontsize=12)
+ax1.set_title('Gradient Ratio Evolution\n(Higher = More Vanishing)', fontsize=13)  # NOTE(review): holds only if the layer-1 gradient is the smaller one - confirm against the data
+ax1.set_xticks(x_pos)
+ax1.set_xticklabels([f'Epoch {e}' for e in gradient_epochs])
+ax1.set_yscale('log')
+ax1.axhline(y=1, color='black', linestyle='--', linewidth=1, label='Ideal (ratio=1)')  # reference line: equal gradient magnitude in layers 1 and 10
+ax1.legend(loc='upper left', fontsize=9)
+ax1.grid(True, alpha=0.3, axis='y')
+# Right plot: Layer 1 gradient magnitude over epochs
+ax2 = axes[1]
+for act_type in activation_types:
+    layer1_grads = []
+    for epoch in gradient_epochs:
+        grads = results[act_type]["gradient_history"].get(epoch, [0]*10)  # zero fallback here, unlike the 1e-10 used for the ratio
+        layer1_grads.append(grads[0])
+    ax2.plot(gradient_epochs, layer1_grads, 'o-', color=colors[act_type],  # x-axis uses the real epoch numbers, not bar positions
+             linewidth=2, markersize=8, label=activation_labels[act_type])
+ax2.set_xlabel('Epoch', fontsize=12)
+ax2.set_ylabel('Layer 1 Gradient Magnitude', fontsize=12)
+ax2.set_title('First Layer Gradient Over Training\n(Key Indicator of Learning)', fontsize=13)
+ax2.set_yscale('log')
+ax2.legend(loc='upper right', fontsize=9)
+ax2.grid(True, alpha=0.3)
+fig.suptitle('Activation Effect on Gradient Dynamics During Training', fontsize=14, y=1.02)
+plt.tight_layout()
+plt.savefig('activation_functions/gradient_evolution.png', dpi=150, bbox_inches='tight')
+plt.close()
+# --- NEW Plot 8: Activation Distribution Evolution ---
+print("  Creating activation_evolution.png...")
+fig, axes = plt.subplots(5, 4, figsize=(16, 18))  # rows: activation types, columns: epochs
+# Show activation distributions at epochs 0, 100, 200, 499 for layer 5
+epochs_to_show = [0, 100, 200, 499]  # all four are among the activation epochs saved inside train_model
+for row, act_type in enumerate(activation_types):
+    for col, epoch in enumerate(epochs_to_show):
+        ax = axes[row, col]
+        if epoch in results[act_type]["activation_history"]:  # panel stays empty if this epoch was not recorded
+            activations = results[act_type]["activation_history"][epoch].get('layer_5', None)  # training-set activations of hidden layer 5
+            if activations is not None:
+                # Clean data for histogram
+                acts_clean = activations.flatten()
+                acts_clean = acts_clean[np.isfinite(acts_clean)]  # drop NaN/inf values before building the histogram
+                if len(acts_clean) > 0:
+                    ax.hist(acts_clean, bins=50, color=colors[act_type],
+                            alpha=0.7, edgecolor='black', linewidth=0.5)
+                    mean_val = np.nanmean(acts_clean)
+                    std_val = np.nanstd(acts_clean)
+                    ax.axvline(mean_val, color='red', linestyle='--', linewidth=1.5)  # dashed red line marks the mean
+                    textstr = f'μ={mean_val:.3f}\nσ={std_val:.3f}'
+                    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
+                    ax.text(0.95, 0.95, textstr, transform=ax.transAxes, fontsize=8,
+                            verticalalignment='top', horizontalalignment='right', bbox=props)
+        if row == 0:
+            ax.set_title(f'Epoch {epoch}', fontsize=11, fontweight='bold')  # column headers on the top row only
+        if col == 0:
+            ax.set_ylabel(f'{activation_labels[act_type]}', fontsize=10)  # row labels on the first column only
+fig.suptitle('Activation Distribution Evolution (Layer 5 - Middle Layer)\nHow Activations Change During Training', fontsize=14, y=1.01)
+plt.tight_layout()
+plt.savefig('activation_functions/activation_evolution.png', dpi=150, bbox_inches='tight')
+plt.close()
+# --- NEW Plot 9: Comprehensive Training Dynamics Summary ---
+print("  Creating training_dynamics_summary.png...")
+fig = plt.figure(figsize=(20, 16))
+# Create grid layout
+gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)  # 3x3 grid: overview row, gradient row, prediction row
+# Panel 1: Loss curves (top-left)
+ax1 = fig.add_subplot(gs[0, 0])
+for act_type in activation_types:
+    ax1.plot(results[act_type]["loss_history"],
+             color=colors[act_type], linewidth=2, label=activation_labels[act_type])
+ax1.set_xlabel('Epoch', fontsize=11)
+ax1.set_ylabel('MSE Loss', fontsize=11)
+ax1.set_title('A. Training Loss Curves', fontsize=12, fontweight='bold')
+ax1.set_yscale('log')
+ax1.legend(loc='upper right', fontsize=8)
+ax1.grid(True, alpha=0.3)
+# Panel 2: Gradient ratio evolution (top-middle)
+ax2 = fig.add_subplot(gs[0, 1])
+for act_type in activation_types:
+    ratios = []
+    for epoch in [1, 100, 200]:
+        grads = results[act_type]["gradient_history"].get(epoch, [1e-10]*10)  # uniform fallback gives a ratio of 1 when the epoch is missing
+        if grads[0] > 1e-15:
+            ratio = grads[9] / grads[0]  # layer 10 over layer 1, as in the gradient-evolution figure
+        else:
+            ratio = 1e10  # finite stand-in for a division by (near) zero so the point can be drawn on a log axis
+        ratios.append(ratio)
+    ax2.plot([1, 100, 200], ratios, 'o-', color=colors[act_type],
+             linewidth=2, markersize=8, label=activation_labels[act_type])
+ax2.set_xlabel('Epoch', fontsize=11)
+ax2.set_ylabel('Gradient Ratio (L10/L1)', fontsize=11)
+ax2.set_title('B. Gradient Ratio Over Training', fontsize=12, fontweight='bold')
+ax2.set_yscale('log')
+ax2.axhline(y=1, color='black', linestyle='--', linewidth=1, alpha=0.5)  # reference line: equal gradient magnitude in layers 1 and 10
+ax2.legend(loc='upper left', fontsize=8)
+ax2.grid(True, alpha=0.3)
+# Panel 3: Final learned functions (top-right)
+ax3 = fig.add_subplot(gs[0, 2])
+ax3.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)
+for act_type in activation_types:
+    ax3.plot(x_eval, results[act_type]["final_predictions"],
+             color=colors[act_type], linewidth=1.5, label=activation_labels[act_type], alpha=0.8)
+ax3.set_xlabel('x', fontsize=11)
+ax3.set_ylabel('y', fontsize=11)
+ax3.set_title('C. Final Learned Functions', fontsize=12, fontweight='bold')
+ax3.legend(loc='upper right', fontsize=8)
+ax3.grid(True, alpha=0.3)  # no axis limits are set on this panel, unlike the prediction panels below
+# Panels 4-6: Gradient flow at epochs 1, 100, 200 (middle row)
+for idx, epoch in enumerate([1, 100, 200]):
+    ax = fig.add_subplot(gs[1, idx])
+    bar_width = 0.15
+    x_positions = np.arange(10)  # one tick per hidden layer
+    for i, act_type in enumerate(activation_types):
+        grad_mags = results[act_type]["gradient_history"].get(epoch, [0]*10)  # zeros when this epoch was not captured
+        offset = (i - 2) * bar_width  # centres the five grouped bars on each layer tick
+        ax.bar(x_positions + offset, grad_mags, bar_width,  # no label is passed, so these panels carry no legend
+               color=colors[act_type], alpha=0.8)
+    ax.set_xlabel('Layer', fontsize=10)
+    ax.set_ylabel('Gradient Magnitude', fontsize=10)
+    ax.set_title(f'D{idx+1}. Gradient Flow - Epoch {epoch}', fontsize=12, fontweight='bold')
+    ax.set_xticks(x_positions)
+    ax.set_xticklabels([f'{i+1}' for i in range(10)], fontsize=8)
+    ax.set_yscale('log')
+    ax.set_ylim(1e-12, 1e0)  # same fixed range on all three panels; smaller values are not visible
+    ax.grid(True, alpha=0.3, axis='y')
+# Panels 7-9: Function learning at epochs 50, 200, 499 (bottom row)
+for idx, epoch in enumerate([50, 200, 499]):
+    ax = fig.add_subplot(gs[2, idx])
+    ax.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)
+    for act_type in activation_types:
+        if epoch in results[act_type]["prediction_history"]:  # epochs that were not recorded are silently skipped
+            pred = results[act_type]["prediction_history"][epoch]
+            ax.plot(x_eval, pred, color=colors[act_type], linewidth=1.5,
+                    label=activation_labels[act_type], alpha=0.8)
+    ax.set_xlabel('x', fontsize=10)
+    ax.set_ylabel('y', fontsize=10)
+    ax.set_title(f'E{idx+1}. Predictions at Epoch {epoch}', fontsize=12, fontweight='bold')
+    ax.set_xlim(-np.pi, np.pi)
+    ax.set_ylim(-2, 2)
+    ax.grid(True, alpha=0.3)
+    if idx == 2:
+        ax.legend(loc='upper right', fontsize=7)  # legend only on the last panel of the row
+fig.suptitle('Comprehensive Training Dynamics Analysis: Activation Functions in Deep Networks\n(10 Layers × 64 Neurons, 500 Epochs, Adam Optimizer)', fontsize=16, y=1.01)
+plt.savefig('activation_functions/training_dynamics_summary.png', dpi=150, bbox_inches='tight')  # no tight_layout() call here; spacing comes from the gridspec settings
+plt.close()
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] All visualizations saved!")
+print("  - learned_functions.png")  # this list is hard-coded; keep it in sync with the savefig calls above
+print("  - loss_curves.png")
+print("  - gradient_flow.png")
+print("  - gradient_flow_epochs.png (NEW)")
+print("  - hidden_activations.png")
+print("  - training_dynamics_functions.png (NEW)")
+print("  - gradient_evolution.png (NEW)")
+print("  - activation_evolution.png (NEW)")
+print("  - training_dynamics_summary.png (NEW)")
+# ============================================================
+# 7. Print Summary Statistics
+# ============================================================
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Summary Statistics")
+print("=" * 70)
+print("\n### Gradient Magnitudes at Key Epochs ###")
+print("-" * 70)
+print(f"{'Activation':<15} {'Epoch':<8} {'Layer 1':<12} {'Layer 5':<12} {'Layer 10':<12} {'Ratio (L10/L1)':<15}")  # left-aligned fixed-width table header
+print("-" * 70)
+for act_type in activation_types:
+    for epoch in [1, 100, 200]:
+        grads = results[act_type]["gradient_history"].get(epoch, [0]*10)  # zeros when this epoch was not captured
+        if grads[0] > 1e-15:
+            ratio = grads[9] / grads[0]  # layer 10 over layer 1
+        else:
+            ratio = float('inf')  # NOTE(review): the plots substitute 1e10 for this same case
+        print(f"{activation_labels[act_type]:<15} {epoch:<8} {grads[0]:<12.2e} {grads[4]:<12.2e} {grads[9]:<12.2e} {ratio:<15.2e}")  # indices 0, 4, 9 are hidden layers 1, 5, 10
+print("\n### Final MSE Losses ###")
+print("-" * 40)
+sorted_losses = sorted(final_losses.items(), key=lambda x: x[1])  # ascending, so the lowest loss is printed first
+for act_type, loss in sorted_losses:
+    print(f"{activation_labels[act_type]:<20}: {loss:.6f}")
+print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Experiment complete!")
+print("=" * 70)