Kompella Sri Aasrith Souri
committed on
Commit
·
c866f18
1
Parent(s):
76a1306
fixed gradient norm error
Browse files- supernova/train.py +23 -4
- test_gradients.py +128 -0
supernova/train.py
CHANGED
|
@@ -19,12 +19,25 @@ from .data import load_sources_from_yaml, TokenChunkDataset, DataSource
|
|
| 19 |
# ------------------------------
|
| 20 |
# Utilities
|
| 21 |
# ------------------------------
|
| 22 |
-
def compute_grad_norm(model: nn.Module) -> float:
|
| 23 |
total = 0.0
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
if p.grad is not None:
|
|
|
|
| 26 |
param_norm = p.grad.data.float().norm(2).item()
|
| 27 |
total += param_norm * param_norm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
return math.sqrt(total)
|
| 29 |
|
| 30 |
def atomic_save(obj: Dict[str, Any], path: str):
|
|
@@ -237,6 +250,13 @@ def train(
|
|
| 237 |
scaler.unscale_(optimizer)
|
| 238 |
torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
scaler.step(optimizer)
|
| 241 |
scaler.update()
|
| 242 |
optimizer.zero_grad(set_to_none=True)
|
|
@@ -247,8 +267,7 @@ def train(
|
|
| 247 |
step += 1
|
| 248 |
|
| 249 |
# logging
|
| 250 |
-
if step % 50 == 0 and (not ddp or local_rank == 0):
|
| 251 |
-
grad_norm = compute_grad_norm(model if not ddp else model.module)
|
| 252 |
avg_loss = running_loss * grad_accum / 50.0
|
| 253 |
running_loss = 0.0
|
| 254 |
elapsed = time.time() - t0
|
|
|
|
| 19 |
# ------------------------------
|
| 20 |
# Utilities
|
| 21 |
# ------------------------------
|
| 22 |
+
def compute_grad_norm(model: nn.Module, debug: bool = False) -> float:
    """Return the global L2 norm of all parameter gradients in *model*.

    Computed as sqrt(sum over parameters of ||grad||^2), i.e. the same
    quantity reported by ``torch.nn.utils.clip_grad_norm_``.

    Args:
        model: Module whose parameters are inspected.
        debug: When True, print a per-parameter gradient report plus a
            summary line (useful for diagnosing vanishing or missing
            gradients).

    Returns:
        The total gradient L2 norm as a Python float; 0.0 when no
        parameter currently holds a gradient.
    """
    total = 0.0
    grad_count = 0
    param_count = 0

    for name, p in model.named_parameters():
        param_count += 1
        if p.grad is not None:
            grad_count += 1
            # .detach() instead of the deprecated .data attribute; .float()
            # guards against overflow when grads are fp16 under AMP.
            param_norm = p.grad.detach().float().norm(2).item()
            total += param_norm * param_norm
            if debug and param_norm > 1e-8:  # Only print non-zero gradients
                print(f" {name}: grad_norm={param_norm:.6f}")
        elif debug:
            print(f" {name}: NO GRAD")

    if debug:
        print(f"Gradient stats: {grad_count}/{param_count} parameters have gradients, total_norm={math.sqrt(total):.6f}")

    return math.sqrt(total)
|
| 42 |
|
| 43 |
def atomic_save(obj: Dict[str, Any], path: str):
|
|
|
|
| 250 |
scaler.unscale_(optimizer)
|
| 251 |
torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
|
| 252 |
|
| 253 |
+
# Compute gradient norm BEFORE clearing gradients (only when needed for logging)
|
| 254 |
+
grad_norm = None
|
| 255 |
+
if (step + 1) % 50 == 0 and (not ddp or local_rank == 0):
|
| 256 |
+
# Enable debug mode for first few steps to diagnose gradient issues
|
| 257 |
+
debug_gradients = step < 5
|
| 258 |
+
grad_norm = compute_grad_norm(model if not ddp else model.module, debug=debug_gradients)
|
| 259 |
+
|
| 260 |
scaler.step(optimizer)
|
| 261 |
scaler.update()
|
| 262 |
optimizer.zero_grad(set_to_none=True)
|
|
|
|
| 267 |
step += 1
|
| 268 |
|
| 269 |
# logging
|
| 270 |
+
if step % 50 == 0 and (not ddp or local_rank == 0) and grad_norm is not None:
|
|
|
|
| 271 |
avg_loss = running_loss * grad_accum / 50.0
|
| 272 |
running_loss = 0.0
|
| 273 |
elapsed = time.time() - t0
|
test_gradients.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic script to test gradient flow in SupernovaModel
|
| 4 |
+
"""
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
from supernova.config import ModelConfig
|
| 8 |
+
from supernova.model import SupernovaModel
|
| 9 |
+
from supernova.tokenizer import load_gpt2_tokenizer
|
| 10 |
+
import math
|
| 11 |
+
|
| 12 |
+
def compute_grad_norm(model, debug=True):
    """Compute the global L2 gradient norm of *model*, with a printed report.

    Walks every named parameter, accumulates the squared per-parameter
    gradient norms, and always prints a one-line summary of how many
    parameters carry gradients. With ``debug`` enabled it additionally
    prints one line per parameter.

    Returns the total norm as a Python float (0.0 if nothing has a grad).
    """
    squared_sum = 0.0
    with_grad = 0
    seen = 0

    for name, p in model.named_parameters():
        seen += 1
        if p.grad is None:
            # Frozen or disconnected parameter — report and move on.
            if debug:
                print(f" {name}: NO GRAD, requires_grad={p.requires_grad}")
            continue
        with_grad += 1
        param_norm = p.grad.data.float().norm(2).item()
        squared_sum += param_norm * param_norm
        if debug and param_norm > 1e-8:
            print(f" {name}: grad_norm={param_norm:.6f}, shape={p.grad.shape}")

    total_norm = math.sqrt(squared_sum)
    print(f"Gradient stats: {with_grad}/{seen} parameters have gradients, total_norm={total_norm:.6f}")
    return total_norm
|
| 31 |
+
|
| 32 |
+
def test_gradient_flow():
    """End-to-end diagnostic of gradient flow through SupernovaModel.

    Runs four stages and prints what it finds:
      1. a no-grad forward pass (shapes, loss),
      2. a plain backward pass with a per-parameter gradient report,
      3. a mixed-precision (GradScaler) backward pass, comparing the
         gradient norm before and after unscaling,
      4. a parameter inventory (total vs trainable).

    Falls back to a hard-coded minimal config when
    ``supernova_25m_config.json`` is absent.
    """
    print("Testing gradient flow in SupernovaModel...")

    # Load config
    try:
        cfg = ModelConfig.from_json_file("supernova_25m_config.json")
        print(f"Loaded config: {cfg.d_model}d, {cfg.n_layers}L, {cfg.n_heads}H")
    except FileNotFoundError:
        print("Config file not found, creating minimal config...")
        cfg = ModelConfig(
            vocab_size=50257,
            d_model=512,
            n_layers=8,
            n_heads=8,
            mlp_ratio=4,
            dropout=0.1,
            n_positions=1024,
            use_positional_embedding=True,
            final_layer_norm=True
        )

    # Create model
    model = SupernovaModel(cfg)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    print(f"Model parameters: {model.num_parameters():,}")
    print(f"Using device: {device}")

    # Create dummy data
    batch_size = 2
    seq_len = 64
    input_ids = torch.randint(0, cfg.vocab_size, (batch_size, seq_len), device=device)
    targets = torch.randint(0, cfg.vocab_size, (batch_size, seq_len), device=device)

    print(f"Input shape: {input_ids.shape}, Target shape: {targets.shape}")

    # Test 1: Basic forward pass
    print("\n=== Test 1: Basic forward pass ===")
    with torch.no_grad():
        logits, loss = model(input_ids, targets)
    print(f"Logits shape: {logits.shape}")
    print(f"Loss: {loss.item():.6f}")

    # Test 2: Forward pass with gradients
    print("\n=== Test 2: Forward pass with gradients ===")
    model.zero_grad()
    logits, loss = model(input_ids, targets)
    print(f"Loss before backward: {loss.item():.6f}")

    loss.backward()
    print("After backward pass:")
    grad_norm = compute_grad_norm(model, debug=True)

    # Test 3: With mixed precision
    print("\n=== Test 3: With mixed precision ===")
    model.zero_grad()
    # NOTE(review): torch.cuda.amp.GradScaler is deprecated in recent torch
    # releases in favor of torch.amp.GradScaler("cuda", ...) — worth migrating.
    scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda"))
    # Fix: build the optimizer ONCE up front. Previously a throwaway AdamW
    # was constructed inline at the unscale_() call, which unscales the
    # gradients of an optimizer nothing else ever uses.
    optimizer = torch.optim.AdamW(model.parameters())

    device_type = 'cuda' if device.type == 'cuda' else 'cpu'
    with torch.amp.autocast(device_type, enabled=(device.type == "cuda")):
        logits, loss = model(input_ids, targets)
    print(f"Loss with autocast: {loss.item():.6f}")
    scaled_loss = scaler.scale(loss)
    print(f"Scaled loss: {scaled_loss.item():.6f}")

    scaled_loss.backward()
    print("After scaled backward pass:")
    grad_norm_before_unscale = compute_grad_norm(model, debug=False)
    print(f"Grad norm before unscale: {grad_norm_before_unscale:.6f}")

    scaler.unscale_(optimizer)
    print("After unscaling:")
    grad_norm_after_unscale = compute_grad_norm(model, debug=True)

    # Test 4: Parameter inspection
    print("\n=== Test 4: Parameter inspection ===")
    total_params = 0
    trainable_params = 0
    for name, param in model.named_parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    # Check specific layers
    print("\nChecking specific layer parameters:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name}: shape={param.shape}, dtype={param.dtype}, device={param.device}")
            break  # Just show first few
| 126 |
+
|
| 127 |
+
# Allow running the diagnostic directly: python test_gradients.py
if __name__ == "__main__":
    test_gradient_flow()
|