Fix check_numerical_stability accuracy and completeness
- Norm fp32 check: inspect all unique norm classes instead of breaking
after first; use type(module).forward to resolve inherited methods
- Remove dead loss_fp32_note variable (condition always False)
- Add GradScaler for fp16 backward to prevent gradient underflow
from producing false positives in gradient health checks
- Add activation growth trend detection (std ratio across layers)
to fulfill docstring promise of catching initialization/norm bugs
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- llm_lab/training/debugger.py +30 -10
llm_lab/training/debugger.py
CHANGED
|
@@ -510,20 +510,20 @@ class LossDebugger:
|
|
| 510 |
|
| 511 |
# Check RMSNorm fp32 upcast
|
| 512 |
norm_fp32_ok = True
|
|
|
|
| 513 |
for name, module in model.named_modules():
|
| 514 |
cls_name = module.__class__.__name__
|
| 515 |
-
if "Norm" in cls_name:
|
| 516 |
-
|
| 517 |
import inspect
|
| 518 |
try:
|
| 519 |
-
src = inspect.getsource(module.forward)
|
| 520 |
has_upcast = ".float()" in src or "float32" in src
|
| 521 |
except (TypeError, OSError):
|
| 522 |
has_upcast = True # assume ok if can't inspect
|
| 523 |
if not has_upcast:
|
| 524 |
norm_fp32_ok = False
|
| 525 |
-
print(f" π΄ {
|
| 526 |
-
break # check one norm layer is enough
|
| 527 |
if norm_fp32_ok:
|
| 528 |
print(f" β
Norm layers use fp32 upcast (safe)")
|
| 529 |
|
|
@@ -533,10 +533,6 @@ class LossDebugger:
|
|
| 533 |
))
|
| 534 |
|
| 535 |
# Check loss computation dtype
|
| 536 |
-
loss_fp32_note = (
|
| 537 |
-
dtype in (torch.bfloat16, torch.float16)
|
| 538 |
-
and "cross_entropy" in str(type(model))
|
| 539 |
-
)
|
| 540 |
if dtype in (torch.bfloat16, torch.float16):
|
| 541 |
print(f" βΉοΈ Best practice: compute loss in fp32 when using {dtype}")
|
| 542 |
print(f" logits_fp32 = logits.float()")
|
|
@@ -579,6 +575,9 @@ class LossDebugger:
|
|
| 579 |
# ββ Forward + Backward ββ
|
| 580 |
model.train()
|
| 581 |
model.zero_grad(set_to_none=True)
|
|
|
|
|
|
|
|
|
|
| 582 |
with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
|
| 583 |
logits, loss = model(input_ids, targets)
|
| 584 |
|
|
@@ -590,7 +589,12 @@ class LossDebugger:
|
|
| 590 |
f"Loss = {loss_val:.4f}" if loss_ok else f"Loss = {loss_val} (NaN/Inf!)"
|
| 591 |
))
|
| 592 |
|
| 593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
|
| 595 |
# Remove hooks
|
| 596 |
for h in hooks:
|
|
@@ -650,6 +654,22 @@ class LossDebugger:
|
|
| 650 |
f"{act_nan_count} layers with NaN/Inf" if not act_ok else "All layers healthy",
|
| 651 |
))
|
| 652 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
# ββ Logit scale check ββ
|
| 654 |
logit_max = logits.float().abs().max().item()
|
| 655 |
logit_ok = logit_max < 1000
|
|
|
|
| 510 |
|
| 511 |
# Check RMSNorm fp32 upcast
|
| 512 |
norm_fp32_ok = True
|
| 513 |
+
checked_norm_classes: set = set()
|
| 514 |
for name, module in model.named_modules():
|
| 515 |
cls_name = module.__class__.__name__
|
| 516 |
+
if "Norm" in cls_name and cls_name not in checked_norm_classes:
|
| 517 |
+
checked_norm_classes.add(cls_name)
|
| 518 |
import inspect
|
| 519 |
try:
|
| 520 |
+
src = inspect.getsource(type(module).forward)
|
| 521 |
has_upcast = ".float()" in src or "float32" in src
|
| 522 |
except (TypeError, OSError):
|
| 523 |
has_upcast = True # assume ok if can't inspect
|
| 524 |
if not has_upcast:
|
| 525 |
norm_fp32_ok = False
|
| 526 |
+
print(f" π΄ {cls_name}: no fp32 upcast detected!")
|
|
|
|
| 527 |
if norm_fp32_ok:
|
| 528 |
print(f" β
Norm layers use fp32 upcast (safe)")
|
| 529 |
|
|
|
|
| 533 |
))
|
| 534 |
|
| 535 |
# Check loss computation dtype
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
if dtype in (torch.bfloat16, torch.float16):
|
| 537 |
print(f" βΉοΈ Best practice: compute loss in fp32 when using {dtype}")
|
| 538 |
print(f" logits_fp32 = logits.float()")
|
|
|
|
| 575 |
# ββ Forward + Backward ββ
|
| 576 |
model.train()
|
| 577 |
model.zero_grad(set_to_none=True)
|
| 578 |
+
use_scaler = dtype == torch.float16 and torch.cuda.is_available()
|
| 579 |
+
scaler = torch.amp.GradScaler() if use_scaler else None
|
| 580 |
+
|
| 581 |
with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
|
| 582 |
logits, loss = model(input_ids, targets)
|
| 583 |
|
|
|
|
| 589 |
f"Loss = {loss_val:.4f}" if loss_ok else f"Loss = {loss_val} (NaN/Inf!)"
|
| 590 |
))
|
| 591 |
|
| 592 |
+
if scaler is not None:
|
| 593 |
+
scaler.scale(loss).backward()
|
| 594 |
+
_temp_opt = torch.optim.SGD(model.parameters(), lr=0)
|
| 595 |
+
scaler.unscale_(_temp_opt)
|
| 596 |
+
else:
|
| 597 |
+
loss.backward()
|
| 598 |
|
| 599 |
# Remove hooks
|
| 600 |
for h in hooks:
|
|
|
|
| 654 |
f"{act_nan_count} layers with NaN/Inf" if not act_ok else "All layers healthy",
|
| 655 |
))
|
| 656 |
|
| 657 |
+
# ββ Activation growth trend ββ
|
| 658 |
+
if len(activation_stats) >= 2:
|
| 659 |
+
stds = [s["std"] for s in activation_stats]
|
| 660 |
+
if stds[0] > 1e-8:
|
| 661 |
+
growth_ratio = stds[-1] / stds[0]
|
| 662 |
+
growth_ok = growth_ratio < 10
|
| 663 |
+
detail = (
|
| 664 |
+
f"Activation std ratio (last/first): {growth_ratio:.1f}x "
|
| 665 |
+
f"(layer_0={stds[0]:.4f}, last={stds[-1]:.4f})"
|
| 666 |
+
)
|
| 667 |
+
results.append(_check_result("Activation growth", growth_ok, detail))
|
| 668 |
+
icon = "β
" if growth_ok else "π‘"
|
| 669 |
+
print(f" {icon} {detail}")
|
| 670 |
+
if not growth_ok:
|
| 671 |
+
print(f" Possible initialization or normalization issue")
|
| 672 |
+
|
| 673 |
# ββ Logit scale check ββ
|
| 674 |
logit_max = logits.float().abs().max().item()
|
| 675 |
logit_ok = logit_max < 1000
|