AmberLJC committed on
Commit
778f232
·
verified ·
1 Parent(s): dee95e9

Upload train_dynamics.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_dynamics.py +742 -0
train_dynamics.py ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Activation Functions Comparison Experiment - Extended Training Dynamics Analysis
3
+
4
+ Compares Linear, Sigmoid, ReLU, Leaky ReLU, and GELU activation functions
5
+ on a deep neural network (10 hidden layers) for 1D non-linear regression.
6
+
7
+ NEW FEATURES:
8
+ - Gradient measurements at epochs 1, 100, and 200
9
+ - Training dynamics visualizations showing how activations evolve
10
+ - Gradient flow evolution over training
11
+ """
12
+
13
+ import numpy as np
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.optim as optim
17
+ import matplotlib.pyplot as plt
18
+ import json
19
+ import os
20
+ from datetime import datetime
21
+
22
# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Create output directory (all JSON/PNG artifacts below are written here)
os.makedirs('activation_functions', exist_ok=True)

print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting Activation Functions - Training Dynamics Experiment")
print("=" * 70)

# ============================================================
# 1. Generate Synthetic Dataset
# ============================================================
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Generating synthetic dataset...")

# 200 evenly spaced inputs on [-pi, pi]; targets are sin(x) plus Gaussian
# noise (std 0.1). The noise size follows x so the two cannot drift apart.
x = np.linspace(-np.pi, np.pi, 200)
y = np.sin(x) + np.random.normal(0, 0.1, x.shape)

# Convert to PyTorch tensors shaped (n_samples, 1)
X_train = torch.tensor(x, dtype=torch.float32).reshape(-1, 1)
Y_train = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

# Create a fine grid for evaluation/visualization
x_eval = np.linspace(-np.pi, np.pi, 500)
X_eval = torch.tensor(x_eval, dtype=torch.float32).reshape(-1, 1)
y_true = np.sin(x_eval)  # Noise-free ground truth on the evaluation grid

print(f" Training samples: {len(X_train)}")
print(f" Evaluation samples: {len(X_eval)}")
51
+
52
+ # ============================================================
53
+ # 2. Define Deep MLP Architecture
54
+ # ============================================================
55
class DeepMLP(nn.Module):
    """
    Deep MLP for scalar-input, scalar-output regression.

    By default the network has 10 hidden layers of 64 neurons each. The same
    activation function (or none, for a purely linear network) is applied
    after the input layer and after every hidden layer; the output layer is
    left linear.

    Post-activation outputs of the hidden layers can be captured on demand
    (see ``forward``), and per-layer gradient/weight statistics can be read
    back with ``get_gradient_magnitudes`` and ``get_weight_stats``.

    Args:
        activation_fn: Callable applied element-wise after each layer, or
            ``None`` for no activation.
        activation_name: Label stored on the instance as ``activation_name``.
        num_hidden_layers: Number of hidden ``nn.Linear`` layers.
        hidden_dim: Width of the input and hidden layers.
    """

    def __init__(self, activation_fn=None, activation_name="linear",
                 num_hidden_layers=10, hidden_dim=64):
        super().__init__()
        self.activation_name = activation_name

        # Layers are created in input -> hidden -> output order so parameter
        # initialisation consumes the RNG in a fixed sequence.
        self.input_layer = nn.Linear(1, hidden_dim)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_hidden_layers)]
        )
        self.output_layer = nn.Linear(hidden_dim, 1)

        # Activation function (None means the network stays linear)
        self.activation_fn = activation_fn

        # Most recently captured hidden-layer outputs, keyed 'layer_1'..'layer_N'
        self.activations = {}

    def forward(self, x, store_activations=False):
        """
        Run the network on ``x``.

        Args:
            x: Input tensor fed to the first ``nn.Linear(1, hidden_dim)``.
            store_activations: When true, a detached copy of every hidden
                layer's output (after the activation, if any) is written to
                ``self.activations`` under ``'layer_<k>'`` with k starting at 1.
                The input layer's output is not captured.

        Returns:
            The output-layer result (no activation applied).
        """
        x = self.input_layer(x)
        if self.activation_fn is not None:
            x = self.activation_fn(x)

        for i, layer in enumerate(self.hidden_layers):
            x = layer(x)
            if self.activation_fn is not None:
                x = self.activation_fn(x)

            # detach + clone so the snapshot is independent of autograd and
            # of later in-place changes.
            if store_activations:
                self.activations[f'layer_{i+1}'] = x.detach().clone()

        # Output layer (no activation)
        return self.output_layer(x)

    def get_gradient_magnitudes(self):
        """
        Return the mean absolute weight gradient of each hidden layer.

        A layer whose weight has no gradient yet (``grad is None``)
        contributes ``0.0``. The list is ordered from the first hidden layer
        to the last.
        """
        return [
            layer.weight.grad.abs().mean().item()
            if layer.weight.grad is not None else 0.0
            for layer in self.hidden_layers
        ]

    def get_weight_stats(self):
        """
        Return mean/std/min/max of each hidden layer's weight matrix.

        The result is a list of dicts (one per hidden layer, in order) with
        the keys ``'mean'``, ``'std'``, ``'min'`` and ``'max'``.
        """
        stats = []
        for layer in self.hidden_layers:
            w = layer.weight.data
            stats.append({
                'mean': w.mean().item(),
                'std': w.std().item(),
                'min': w.min().item(),
                'max': w.max().item(),
            })
        return stats
124
+
125
+
126
def create_model(activation_type):
    """
    Create a DeepMLP with the specified activation function.

    Args:
        activation_type: One of "linear", "sigmoid", "relu", "leaky_relu"
            or "gelu". "linear" builds a network with no activation.

    Returns:
        A new DeepMLP whose ``activation_name`` equals ``activation_type``.

    Raises:
        ValueError: If ``activation_type`` is not one of the names above.
    """
    # Factories rather than instances, so only the requested activation
    # module is constructed.
    activation_factories = {
        "linear": lambda: None,
        "sigmoid": lambda: torch.sigmoid,
        "relu": lambda: torch.relu,
        "leaky_relu": lambda: nn.LeakyReLU(0.01),
        "gelu": lambda: nn.GELU(),
    }
    try:
        factory = activation_factories[activation_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are equally unknown.
        raise ValueError(f"Unknown activation type: {activation_type}") from None
    return DeepMLP(activation_fn=factory(), activation_name=activation_type)
140
+
141
+
142
+ # ============================================================
143
+ # 3. Training Function with Extended Metrics
144
+ # ============================================================
145
def train_model(model, X_train, Y_train, X_eval, epochs=500, lr=0.001):
    """
    Train a model full-batch with Adam + MSE and collect diagnostics.

    Epochs are 0-indexed throughout (the loop runs ``range(epochs)``), so
    "epoch 1" below is the second optimisation step.

    Args:
        model: Module called as ``model(x, store_activations=bool)`` that
            exposes an ``activations`` dict plus ``get_gradient_magnitudes()``
            and ``get_weight_stats()`` (e.g. DeepMLP).
        X_train: Training inputs.
        Y_train: Training targets, compared to the model output with MSE.
        X_eval: Inputs used for the stored prediction snapshots.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.

    Returns:
        Tuple of
        - loss_history: List of training losses, one per epoch
        - gradient_history: Dict epoch -> per-layer gradient magnitudes,
          captured at epochs 1, 100, 200 (before the optimizer step)
        - activation_history: Dict epoch -> {layer name: ndarray} at
          epochs 0, 50, 100, 150, 200, 300, 400, 499 and the final epoch
        - weight_history: Dict epoch -> weight statistics, every 50 epochs
          (after the optimizer step)
        - prediction_history: Dict epoch -> flat ndarray of predictions on
          X_eval at epochs 0, 50, 100, 200, 300, 499 and the final epoch
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    loss_history = []
    gradient_history = {}
    activation_history = {}
    weight_history = {}
    prediction_history = {}

    # The last epoch is always snapshotted. 499 is kept explicitly so runs
    # longer than 500 epochs still produce the snapshot the plots look up.
    final_epoch = epochs - 1
    gradient_epochs = {1, 100, 200}
    activation_epochs = {0, 50, 100, 150, 200, 300, 400, 499, final_epoch}
    prediction_epochs = {0, 50, 100, 200, 300, 499, final_epoch}

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        # Forward pass (store activations at specific epochs)
        store_acts = epoch in activation_epochs
        predictions = model(X_train, store_activations=store_acts)

        loss = criterion(predictions, Y_train)
        loss.backward()

        # Gradients must be read after backward() and before step().
        if epoch in gradient_epochs:
            magnitudes = model.get_gradient_magnitudes()
            gradient_history[epoch] = magnitudes
            if magnitudes:
                print(f" [Gradient Capture] Epoch {epoch}: Layer 1={magnitudes[0]:.2e}, Layer {len(magnitudes)}={magnitudes[-1]:.2e}")

        optimizer.step()

        loss_value = loss.item()
        loss_history.append(loss_value)

        # Copy the snapshot out of the model: the dict is overwritten by the
        # next storing forward pass.
        if store_acts:
            activation_history[epoch] = {
                name: tensor.detach().cpu().numpy().copy()
                for name, tensor in model.activations.items()
            }

        # Store weight statistics periodically
        if epoch % 50 == 0:
            weight_history[epoch] = model.get_weight_stats()

        # Store predictions at key epochs (eval mode, no autograd)
        if epoch in prediction_epochs:
            model.eval()
            with torch.no_grad():
                pred = model(X_eval)
            prediction_history[epoch] = pred.cpu().numpy().flatten()
            model.train()

        # Print progress
        if epoch % 100 == 0 or epoch == final_epoch:
            print(f" Epoch {epoch:4d}/{epochs}: Loss = {loss_value:.6f}")

    return loss_history, gradient_history, activation_history, weight_history, prediction_history
218
+
219
+
220
# ============================================================
# 4. Train All Models
# ============================================================
activation_types = ["linear", "sigmoid", "relu", "leaky_relu", "gelu"]
activation_labels = {
    "linear": "Linear (None)",
    "sigmoid": "Sigmoid",
    "relu": "ReLU",
    "leaky_relu": "Leaky ReLU",
    "gelu": "GELU"
}

# results[act_type] collects everything the saving/plotting sections need.
results = {}

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Training models with extended metrics...")
print("=" * 70)

for act_type in activation_types:
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Training {activation_labels[act_type]} model...")

    model = create_model(act_type)
    loss_history, grad_history, act_history, weight_history, pred_history = train_model(
        model, X_train, Y_train, X_eval, epochs=500, lr=0.001
    )

    # Final predictions on the evaluation grid; store_activations=True also
    # refreshes model.activations with the post-training hidden outputs.
    model.eval()
    with torch.no_grad():
        final_predictions = model(X_eval, store_activations=True)

    results[act_type] = {
        "model": model,
        "loss_history": loss_history,
        "gradient_history": grad_history,  # Gradients at epochs 1, 100, 200
        "activation_history": act_history,
        "weight_history": weight_history,
        "prediction_history": pred_history,
        "final_predictions": final_predictions.cpu().numpy().flatten(),
        "final_activations": {
            name: tensor.cpu().numpy().copy()
            for name, tensor in model.activations.items()
        },
        # Last recorded training loss
        "final_loss": loss_history[-1]
    }

    print(f" Final MSE Loss: {loss_history[-1]:.6f}")

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] All models trained!")
265
+
266
# ============================================================
# 5. Save Extended Data
# ============================================================
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Saving extended data...")

# Save gradient magnitudes at all measured epochs.
# Epoch keys are converted to strings so the JSON keys are explicit.
gradient_data = {}
for act_type in activation_types:
    gradient_data[act_type] = {
        str(epoch): grads
        for epoch, grads in results[act_type]["gradient_history"].items()
    }
with open('activation_functions/gradient_magnitudes_epochs.json', 'w', encoding='utf-8') as f:
    json.dump(gradient_data, f, indent=2)

# Save loss histories (one list of per-epoch losses per activation)
loss_data = {
    act_type: results[act_type]["loss_history"]
    for act_type in activation_types
}
with open('activation_functions/loss_histories.json', 'w', encoding='utf-8') as f:
    json.dump(loss_data, f, indent=2)

# Save final losses (also reused by the summary printed at the end)
final_losses = {
    act_type: results[act_type]["final_loss"]
    for act_type in activation_types
}
with open('activation_functions/final_losses.json', 'w', encoding='utf-8') as f:
    json.dump(final_losses, f, indent=2)

print(" Saved: gradient_magnitudes_epochs.json, loss_histories.json, final_losses.json")
297
+
298
# ============================================================
# 6. Generate Visualizations
# ============================================================
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Generating visualizations...")

# Set style and one fixed colour per activation, shared by every plot below
plt.style.use('seaborn-v0_8-whitegrid')
colors = {
    "linear": "#1f77b4",
    "sigmoid": "#ff7f0e",
    "relu": "#2ca02c",
    "leaky_relu": "#d62728",
    "gelu": "#9467bd"
}

# --- Plot 1: Learned Functions ---
# Final prediction of every model over the ground truth and the noisy data.
print(" Creating learned_functions.png...")
fig, ax = plt.subplots(figsize=(12, 8))

# Ground truth (highest zorder so it stays visible on top)
ax.plot(x_eval, y_true, 'k-', linewidth=2.5, label='Ground Truth (sin(x))', zorder=10)

# Noisy data points
ax.scatter(x, y, c='gray', alpha=0.5, s=30, label='Noisy Data', zorder=5)

# Learned functions
for act_type in activation_types:
    ax.plot(x_eval, results[act_type]["final_predictions"],
            color=colors[act_type], linewidth=2,
            label=f'{activation_labels[act_type]} (MSE: {results[act_type]["final_loss"]:.4f})',
            alpha=0.8)

ax.set_xlabel('x', fontsize=12)
ax.set_ylabel('y', fontsize=12)
ax.set_title('Learned Functions: Comparison of Activation Functions\n(10 Hidden Layers, 64 Neurons Each, 500 Epochs)', fontsize=14)
ax.legend(loc='upper right', fontsize=10)
ax.set_xlim(-np.pi, np.pi)
ax.set_ylim(-1.5, 1.5)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('activation_functions/learned_functions.png', dpi=150, bbox_inches='tight')
plt.close()
341
+
342
# --- Plot 2: Loss Curves ---
# Per-epoch training loss of every model on a log-scaled y axis.
print(" Creating loss_curves.png...")
fig, ax = plt.subplots(figsize=(12, 8))

for act_type in activation_types:
    ax.plot(results[act_type]["loss_history"],
            color=colors[act_type], linewidth=2,
            label=f'{activation_labels[act_type]}')

ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('MSE Loss', fontsize=12)
ax.set_title('Training Loss Curves: Comparison of Activation Functions', fontsize=14)
ax.legend(loc='upper right', fontsize=10)
ax.set_yscale('log')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('activation_functions/loss_curves.png', dpi=150, bbox_inches='tight')
plt.close()
361
+
362
# --- Plot 3: Gradient Flow at Epochs 1, 100, 200 ---
# One grouped bar chart per measured epoch: hidden layers on the x axis,
# one bar per activation function inside each group.
print(" Creating gradient_flow_epochs.png...")
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

gradient_epochs = [1, 100, 200]
layer_indices = list(range(1, 11))

for idx, epoch in enumerate(gradient_epochs):
    ax = axes[idx]
    bar_width = 0.15
    x_positions = np.arange(len(layer_indices))

    for i, act_type in enumerate(activation_types):
        # Zeros stand in for an epoch that was never captured.
        grad_mags = results[act_type]["gradient_history"].get(epoch, [0]*10)
        # (i - 2) centres the five bars of a group on the layer's tick.
        offset = (i - 2) * bar_width
        ax.bar(x_positions + offset, grad_mags, bar_width,
               label=activation_labels[act_type] if idx == 0 else "",
               color=colors[act_type], alpha=0.8)

    ax.set_xlabel('Hidden Layer', fontsize=11)
    ax.set_ylabel('Avg Gradient Magnitude', fontsize=11)
    ax.set_title(f'Epoch {epoch}', fontsize=13, fontweight='bold')
    ax.set_xticks(x_positions)
    ax.set_xticklabels([f'L{i}' for i in layer_indices], fontsize=9)
    ax.set_yscale('log')
    ax.grid(True, alpha=0.3, axis='y')
    # Same fixed range on every panel so the epochs are directly comparable
    ax.set_ylim(1e-12, 1e0)

# Add legend to first subplot (only that panel's bars carry labels)
axes[0].legend(loc='upper right', fontsize=9)

fig.suptitle('Gradient Flow Analysis Across Training\n(Gradient Magnitude per Layer at Epochs 1, 100, 200)', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/gradient_flow_epochs.png', dpi=150, bbox_inches='tight')
plt.close()
397
+
398
# --- Plot 4: Original Gradient Flow (Epoch 1 only for compatibility) ---
# Grouped bar chart of per-layer gradient magnitude captured at epoch 1.
print(" Creating gradient_flow.png...")
fig, ax = plt.subplots(figsize=(12, 8))

# Defined here as well so this plot does not depend on an earlier plot's
# leftover variables.
layer_indices = list(range(1, 11))
bar_width = 0.15
x_positions = np.arange(len(layer_indices))

for i, act_type in enumerate(activation_types):
    grad_mags = results[act_type]["gradient_history"].get(1, [0]*10)
    # (i - 2) centres the five bars of a group on the layer's tick.
    offset = (i - 2) * bar_width
    ax.bar(x_positions + offset, grad_mags, bar_width,
           label=activation_labels[act_type], color=colors[act_type], alpha=0.8)

ax.set_xlabel('Hidden Layer', fontsize=12)
ax.set_ylabel('Average Gradient Magnitude', fontsize=12)
ax.set_title('Gradient Flow Analysis: Average Gradient Magnitude per Layer\n(Measured at Epoch 1)', fontsize=14)
ax.set_xticks(x_positions)
ax.set_xticklabels([f'Layer {i}' for i in layer_indices])
ax.legend(loc='upper right', fontsize=10)
ax.set_yscale('log')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('activation_functions/gradient_flow.png', dpi=150, bbox_inches='tight')
plt.close()
423
+
424
# --- Plot 5: Hidden Activations ---
# Histogram grid of post-training activations: rows are layers 1/5/10,
# columns are the activation functions.
print(" Creating hidden_activations.png...")
fig, axes = plt.subplots(3, 5, figsize=(18, 12))

layers_to_plot = ['layer_1', 'layer_5', 'layer_10']
layer_titles = ['Layer 1 (First)', 'Layer 5 (Middle)', 'Layer 10 (Last)']

for row, (layer_key, layer_title) in enumerate(zip(layers_to_plot, layer_titles)):
    for col, act_type in enumerate(activation_types):
        ax = axes[row, col]

        # Get activations for this layer
        activations = results[act_type]["final_activations"].get(layer_key, None)

        # Drop NaN/inf before plotting (same cleaning as the activation
        # evolution plot): a histogram over non-finite values cannot pick a
        # finite bin range and would abort the whole figure.
        values = None
        if activations is not None:
            values = activations.flatten()
            values = values[np.isfinite(values)]

        if values is not None and values.size > 0:
            # Plot histogram of activation values
            ax.hist(values, bins=50, color=colors[act_type],
                    alpha=0.7, edgecolor='black', linewidth=0.5)

            # Add statistics (red dashed line marks the mean)
            mean_val = values.mean()
            std_val = values.std()
            ax.axvline(mean_val, color='red', linestyle='--', linewidth=1.5)

            ax.set_title(f'{activation_labels[act_type]}\n{layer_title}', fontsize=10)
            ax.set_xlabel('Activation Value', fontsize=8)
            ax.set_ylabel('Frequency', fontsize=8)

            # Add text box with stats
            textstr = f'μ={mean_val:.3f}\nσ={std_val:.3f}'
            props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
            ax.text(0.95, 0.95, textstr, transform=ax.transAxes, fontsize=8,
                    verticalalignment='top', horizontalalignment='right', bbox=props)
        else:
            ax.text(0.5, 0.5, 'No Data', ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f'{activation_labels[act_type]}\n{layer_title}', fontsize=10)

fig.suptitle('Hidden Layer Activation Distributions (After Training)', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/hidden_activations.png', dpi=150, bbox_inches='tight')
plt.close()
465
+
466
# --- NEW Plot 6: Training Dynamics - Function Learning Over Time ---
# One panel per activation, overlaying its predictions at several epochs.
print(" Creating training_dynamics_functions.png...")
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

# Show how each activation learns the function over epochs
prediction_epochs = [0, 50, 100, 200, 300, 499]
# One viridis colour per snapshot epoch
epoch_colors = plt.cm.viridis(np.linspace(0, 1, len(prediction_epochs)))

for idx, act_type in enumerate(activation_types):
    ax = axes[idx]

    # Ground truth
    ax.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)

    # Predictions at different epochs (epochs without a snapshot are skipped)
    history = results[act_type]["prediction_history"]
    for ep_idx, epoch in enumerate(prediction_epochs):
        if epoch in history:
            ax.plot(x_eval, history[epoch], color=epoch_colors[ep_idx], linewidth=1.5,
                    label=f'Epoch {epoch}', alpha=0.8)

    ax.set_xlabel('x', fontsize=10)
    ax.set_ylabel('y', fontsize=10)
    ax.set_title(f'{activation_labels[act_type]}', fontsize=12, fontweight='bold')
    ax.set_xlim(-np.pi, np.pi)
    ax.set_ylim(-2, 2)
    ax.grid(True, alpha=0.3)
    ax.legend(loc='upper right', fontsize=7)

# Hide the 6th subplot (we have 5 activations)
axes[5].axis('off')

fig.suptitle('Training Dynamics: How Each Activation Learns the Function Over Time', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/training_dynamics_functions.png', dpi=150, bbox_inches='tight')
plt.close()
503
+
504
# --- NEW Plot 7: Gradient Evolution Over Training ---
print(" Creating gradient_evolution.png...")
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left plot: Gradient ratio (Layer 10 / Layer 1) evolution
ax1 = axes[0]
gradient_epochs = [1, 100, 200]
x_pos = np.arange(len(gradient_epochs))
bar_width = 0.15

for i, act_type in enumerate(activation_types):
    ratios = []
    for epoch in gradient_epochs:
        grads = results[act_type]["gradient_history"].get(epoch, [1e-10]*10)
        # Avoid division by zero: a (near-)zero first-layer gradient is
        # reported as a fixed, very large ratio instead.
        if grads[0] > 1e-15:
            ratio = grads[9] / grads[0]  # Layer 10 / Layer 1
        else:
            ratio = 1e10  # Very large ratio indicates vanishing gradients
        ratios.append(ratio)

    # (i - 2) centres the five bars of a group on the epoch's tick.
    offset = (i - 2) * bar_width
    ax1.bar(x_pos + offset, ratios, bar_width, label=activation_labels[act_type],
            color=colors[act_type], alpha=0.8)

ax1.set_xlabel('Epoch', fontsize=12)
ax1.set_ylabel('Gradient Ratio (Layer 10 / Layer 1)', fontsize=12)
ax1.set_title('Gradient Ratio Evolution\n(Higher = More Vanishing)', fontsize=13)
ax1.set_xticks(x_pos)
ax1.set_xticklabels([f'Epoch {e}' for e in gradient_epochs])
ax1.set_yscale('log')
ax1.axhline(y=1, color='black', linestyle='--', linewidth=1, label='Ideal (ratio=1)')
ax1.legend(loc='upper left', fontsize=9)
ax1.grid(True, alpha=0.3, axis='y')

# Right plot: Layer 1 gradient magnitude over epochs
ax2 = axes[1]

for act_type in activation_types:
    history = results[act_type]["gradient_history"]
    layer1_grads = [history.get(epoch, [0]*10)[0] for epoch in gradient_epochs]

    ax2.plot(gradient_epochs, layer1_grads, 'o-', color=colors[act_type],
             linewidth=2, markersize=8, label=activation_labels[act_type])

ax2.set_xlabel('Epoch', fontsize=12)
ax2.set_ylabel('Layer 1 Gradient Magnitude', fontsize=12)
ax2.set_title('First Layer Gradient Over Training\n(Key Indicator of Learning)', fontsize=13)
ax2.set_yscale('log')
ax2.legend(loc='upper right', fontsize=9)
ax2.grid(True, alpha=0.3)

fig.suptitle('Activation Effect on Gradient Dynamics During Training', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('activation_functions/gradient_evolution.png', dpi=150, bbox_inches='tight')
plt.close()
562
+
563
# --- NEW Plot 8: Activation Distribution Evolution ---
# Histogram grid for hidden layer 5: rows are activation functions,
# columns are training epochs.
print(" Creating activation_evolution.png...")
fig, axes = plt.subplots(5, 4, figsize=(16, 18))

# Show activation distributions at epochs 0, 100, 200, 499 for layer 5
epochs_to_show = [0, 100, 200, 499]

for row, act_type in enumerate(activation_types):
    for col, epoch in enumerate(epochs_to_show):
        ax = axes[row, col]

        # A panel stays empty when no snapshot (or no finite value) exists.
        if epoch in results[act_type]["activation_history"]:
            activations = results[act_type]["activation_history"][epoch].get('layer_5', None)

            if activations is not None:
                # Clean data for histogram (drop NaN/inf)
                acts_clean = activations.flatten()
                acts_clean = acts_clean[np.isfinite(acts_clean)]

                if len(acts_clean) > 0:
                    ax.hist(acts_clean, bins=50, color=colors[act_type],
                            alpha=0.7, edgecolor='black', linewidth=0.5)

                    mean_val = np.nanmean(acts_clean)
                    std_val = np.nanstd(acts_clean)

                    # Red dashed line marks the mean
                    ax.axvline(mean_val, color='red', linestyle='--', linewidth=1.5)

                    textstr = f'μ={mean_val:.3f}\nσ={std_val:.3f}'
                    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
                    ax.text(0.95, 0.95, textstr, transform=ax.transAxes, fontsize=8,
                            verticalalignment='top', horizontalalignment='right', bbox=props)

        # NOTE(review): nesting of the two labels below was reconstructed from
        # a flattened listing; placed at panel level so the grid is labelled
        # even when a panel has no data -- confirm against the original file.
        if row == 0:
            ax.set_title(f'Epoch {epoch}', fontsize=11, fontweight='bold')
        if col == 0:
            ax.set_ylabel(f'{activation_labels[act_type]}', fontsize=10)

fig.suptitle('Activation Distribution Evolution (Layer 5 - Middle Layer)\nHow Activations Change During Training', fontsize=14, y=1.01)
plt.tight_layout()
plt.savefig('activation_functions/activation_evolution.png', dpi=150, bbox_inches='tight')
plt.close()
605
+
606
# --- NEW Plot 9: Comprehensive Training Dynamics Summary ---
# 3x3 dashboard: losses / gradient ratio / final fits (top row), gradient
# flow at three epochs (middle row), predictions at three epochs (bottom row).
print(" Creating training_dynamics_summary.png...")
fig = plt.figure(figsize=(20, 16))

# Create grid layout
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Panel 1: Loss curves (top-left)
ax1 = fig.add_subplot(gs[0, 0])
for act_type in activation_types:
    ax1.plot(results[act_type]["loss_history"],
             color=colors[act_type], linewidth=2, label=activation_labels[act_type])
ax1.set_xlabel('Epoch', fontsize=11)
ax1.set_ylabel('MSE Loss', fontsize=11)
ax1.set_title('A. Training Loss Curves', fontsize=12, fontweight='bold')
ax1.set_yscale('log')
ax1.legend(loc='upper right', fontsize=8)
ax1.grid(True, alpha=0.3)

# Panel 2: Gradient ratio evolution (top-middle)
ax2 = fig.add_subplot(gs[0, 1])
for act_type in activation_types:
    ratios = []
    for epoch in [1, 100, 200]:
        grads = results[act_type]["gradient_history"].get(epoch, [1e-10]*10)
        # A (near-)zero first-layer gradient is reported as a fixed, very
        # large ratio instead of dividing by it.
        if grads[0] > 1e-15:
            ratio = grads[9] / grads[0]
        else:
            ratio = 1e10
        ratios.append(ratio)
    ax2.plot([1, 100, 200], ratios, 'o-', color=colors[act_type],
             linewidth=2, markersize=8, label=activation_labels[act_type])
ax2.set_xlabel('Epoch', fontsize=11)
ax2.set_ylabel('Gradient Ratio (L10/L1)', fontsize=11)
ax2.set_title('B. Gradient Ratio Over Training', fontsize=12, fontweight='bold')
ax2.set_yscale('log')
ax2.axhline(y=1, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax2.legend(loc='upper left', fontsize=8)
ax2.grid(True, alpha=0.3)

# Panel 3: Final learned functions (top-right)
ax3 = fig.add_subplot(gs[0, 2])
ax3.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)
for act_type in activation_types:
    ax3.plot(x_eval, results[act_type]["final_predictions"],
             color=colors[act_type], linewidth=1.5, label=activation_labels[act_type], alpha=0.8)
ax3.set_xlabel('x', fontsize=11)
ax3.set_ylabel('y', fontsize=11)
ax3.set_title('C. Final Learned Functions', fontsize=12, fontweight='bold')
ax3.legend(loc='upper right', fontsize=8)
ax3.grid(True, alpha=0.3)

# Panels 4-6: Gradient flow at epochs 1, 100, 200 (middle row)
for idx, epoch in enumerate([1, 100, 200]):
    ax = fig.add_subplot(gs[1, idx])
    bar_width = 0.15
    x_positions = np.arange(10)

    for i, act_type in enumerate(activation_types):
        grad_mags = results[act_type]["gradient_history"].get(epoch, [0]*10)
        # (i - 2) centres the five bars of a group on the layer's tick.
        offset = (i - 2) * bar_width
        ax.bar(x_positions + offset, grad_mags, bar_width,
               color=colors[act_type], alpha=0.8)

    ax.set_xlabel('Layer', fontsize=10)
    ax.set_ylabel('Gradient Magnitude', fontsize=10)
    ax.set_title(f'D{idx+1}. Gradient Flow - Epoch {epoch}', fontsize=12, fontweight='bold')
    ax.set_xticks(x_positions)
    ax.set_xticklabels([f'{i+1}' for i in range(10)], fontsize=8)
    ax.set_yscale('log')
    ax.set_ylim(1e-12, 1e0)
    ax.grid(True, alpha=0.3, axis='y')

# Panels 7-9: Function learning at epochs 50, 200, 499 (bottom row)
for idx, epoch in enumerate([50, 200, 499]):
    ax = fig.add_subplot(gs[2, idx])
    ax.plot(x_eval, y_true, 'k--', linewidth=2, label='Ground Truth', alpha=0.7)

    for act_type in activation_types:
        history = results[act_type]["prediction_history"]
        if epoch in history:
            ax.plot(x_eval, history[epoch], color=colors[act_type], linewidth=1.5,
                    label=activation_labels[act_type], alpha=0.8)

    ax.set_xlabel('x', fontsize=10)
    ax.set_ylabel('y', fontsize=10)
    ax.set_title(f'E{idx+1}. Predictions at Epoch {epoch}', fontsize=12, fontweight='bold')
    ax.set_xlim(-np.pi, np.pi)
    ax.set_ylim(-2, 2)
    ax.grid(True, alpha=0.3)
    # Only the last panel carries the legend
    if idx == 2:
        ax.legend(loc='upper right', fontsize=7)

fig.suptitle('Comprehensive Training Dynamics Analysis: Activation Functions in Deep Networks\n(10 Layers × 64 Neurons, 500 Epochs, Adam Optimizer)', fontsize=16, y=1.01)
plt.savefig('activation_functions/training_dynamics_summary.png', dpi=150, bbox_inches='tight')
plt.close()

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] All visualizations saved!")
print(" - learned_functions.png")
print(" - loss_curves.png")
print(" - gradient_flow.png")
print(" - gradient_flow_epochs.png (NEW)")
print(" - hidden_activations.png")
print(" - training_dynamics_functions.png (NEW)")
print(" - gradient_evolution.png (NEW)")
print(" - activation_evolution.png (NEW)")
print(" - training_dynamics_summary.png (NEW)")
713
+
714
+
715
# ============================================================
# 7. Print Summary Statistics
# ============================================================
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Summary Statistics")
print("=" * 70)

print("\n### Gradient Magnitudes at Key Epochs ###")
print("-" * 70)
print(f"{'Activation':<15} {'Epoch':<8} {'Layer 1':<12} {'Layer 5':<12} {'Layer 10':<12} {'Ratio (L10/L1)':<15}")
print("-" * 70)

for act_type in activation_types:
    for epoch in [1, 100, 200]:
        # Zeros stand in for an epoch that was never captured.
        grads = results[act_type]["gradient_history"].get(epoch, [0]*10)
        # Report an infinite ratio rather than dividing by a (near-)zero
        # first-layer gradient.
        if grads[0] > 1e-15:
            ratio = grads[9] / grads[0]
        else:
            ratio = float('inf')
        print(f"{activation_labels[act_type]:<15} {epoch:<8} {grads[0]:<12.2e} {grads[4]:<12.2e} {grads[9]:<12.2e} {ratio:<15.2e}")

print("\n### Final MSE Losses ###")
print("-" * 40)
# Best (lowest) loss first
sorted_losses = sorted(final_losses.items(), key=lambda item: item[1])
for act_type, loss in sorted_losses:
    print(f"{activation_labels[act_type]:<20}: {loss:.6f}")

print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Experiment complete!")
print("=" * 70)