Fix gradient diagnostic thresholds with evidence-based criteria in LossDebugger
- Use exact grad_clip instead of 0.95 multiplier (PyTorch clip_grad_norm_ behavior)
- Raise clip rate threshold from 30% to 50% (Google Deep Learning Tuning Playbook)
- Replace absolute tiny-grad threshold (0.01) with relative one (grad_clip * 0.01)
- Change tiny-grad diagnosis from "LR too low" to "vanishing gradients"
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- llm_lab/training/debugger.py +13 -8
llm_lab/training/debugger.py
CHANGED
|
@@ -719,16 +719,21 @@ class LossDebugger:
|
|
| 719 |
|
| 720 |
if grad_norms:
|
| 721 |
avg_grad = sum(grad_norms) / len(grad_norms)
|
| 722 |
-
|
|
|
|
| 723 |
clip_rate = clip_count / len(grad_norms)
|
| 724 |
-
|
|
|
|
|
|
|
| 725 |
tiny_rate = tiny_count / len(grad_norms)
|
| 726 |
|
| 727 |
print(f" Avg grad norm: {avg_grad:.4f}")
|
| 728 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 729 |
-
print(f"   Tiny grad rate: {tiny_rate * 100:.1f}% (< 0.01)")
|
| 730 |
|
| 731 |
-
|
|
|
|
|
|
|
| 732 |
findings.append({
|
| 733 |
"issue": "LR may be too high",
|
| 734 |
"evidence": f"Grad norm hits clip limit {clip_rate * 100:.0f}% of the time",
|
|
@@ -737,11 +742,11 @@ class LossDebugger:
|
|
| 737 |
print(f" π‘ Grad clipping frequent ({clip_rate * 100:.0f}%) β LR may be too high")
|
| 738 |
elif tiny_rate > 0.5:
|
| 739 |
findings.append({
|
| 740 |
-
"issue": "LR may be too low",
|
| 741 |
-
"evidence": f"Grad norm < 0.01 in {tiny_rate * 100:.0f}% of steps",
|
| 742 |
-
"action": "Consider increasing the learning rate",
|
| 743 |
})
|
| 744 |
-
print(f"   💡 Grad norm too small ({tiny_rate * 100:.0f}% < 0.01) → LR may be too low")
|
| 745 |
else:
|
| 746 |
print(f"   ✅ LR looks appropriate")
|
| 747 |
|
|
|
|
| 719 |
|
| 720 |
if grad_norms:
|
| 721 |
avg_grad = sum(grad_norms) / len(grad_norms)
|
| 722 |
+
# Ref: PyTorch clip_grad_norm_ clips when total_norm > max_norm
|
| 723 |
+
clip_count = sum(1 for g in grad_norms if g >= config.grad_clip)
|
| 724 |
clip_rate = clip_count / len(grad_norms)
|
| 725 |
+
# Relative threshold: < 1% of clip limit (model-size independent)
|
| 726 |
+
tiny_threshold = config.grad_clip * 0.01
|
| 727 |
+
tiny_count = sum(1 for g in grad_norms if g < tiny_threshold)
|
| 728 |
tiny_rate = tiny_count / len(grad_norms)
|
| 729 |
|
| 730 |
print(f" Avg grad norm: {avg_grad:.4f}")
|
| 731 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 732 |
+
print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
|
| 733 |
|
| 734 |
+
# Ref: Google Deep Learning Tuning Playbook β >50% clipping is
|
| 735 |
+
# "extremely aggressive" and effectively a strange LR reduction.
|
| 736 |
+
if clip_rate > 0.5:
|
| 737 |
findings.append({
|
| 738 |
"issue": "LR may be too high",
|
| 739 |
"evidence": f"Grad norm hits clip limit {clip_rate * 100:.0f}% of the time",
|
|
|
|
| 742 |
print(f" π‘ Grad clipping frequent ({clip_rate * 100:.0f}%) β LR may be too high")
|
| 743 |
elif tiny_rate > 0.5:
|
| 744 |
findings.append({
|
| 745 |
+
"issue": "Possible vanishing gradients",
|
| 746 |
+
"evidence": f"Grad norm < {tiny_threshold:.4f} in {tiny_rate * 100:.0f}% of steps",
|
| 747 |
+
"action": "Check weight initialization, layer norms, and model depth",
|
| 748 |
})
|
| 749 |
+
print(f" π‘ Grad norm too small ({tiny_rate * 100:.0f}% < {tiny_threshold:.4f}) β possible vanishing gradients")
|
| 750 |
else:
|
| 751 |
print(f"   ✅ LR looks appropriate")
|
| 752 |
|