Fix false/inaccurate citations in LossDebugger
- Remove false attribution to Google Deep Learning Tuning Playbook for
gradient clipping heuristic (playbook has no clipping content)
- Remove Krizhevsky 2014 citation for sqrt batch-LR scaling (he proposed
it theoretically but preferred linear for SGD; Malladi et al. 2022 is
the proper reference for Adam)
- Clarify expected loss/PPL ranges are estimates from GPT-2 benchmarks
and Chinchilla scaling laws, not FineWeb-Edu specific results
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
llm_lab/training/debugger.py
CHANGED
|
@@ -29,7 +29,9 @@ from llm_lab.config import TrainConfig
|
|
| 29 |
# Constants
|
| 30 |
# ───────────────────────────────────────────────────────────────────
|
| 31 |
|
| 32 |
-
#
|
|
|
|
|
|
|
| 33 |
_EXPECTED_TRAIN_LOSS = (2.5, 3.3)
|
| 34 |
_EXPECTED_VAL_LOSS = (2.7, 3.6)
|
| 35 |
_EXPECTED_VAL_PPL = (15, 37)
|
|
@@ -733,8 +735,9 @@ class LossDebugger:
|
|
| 733 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 734 |
print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
|
| 735 |
|
| 736 |
-
#
|
| 737 |
-
#
|
|
|
|
| 738 |
if clip_rate > 0.5:
|
| 739 |
findings.append({
|
| 740 |
"issue": "LR may be too high",
|
|
@@ -829,7 +832,7 @@ class LossDebugger:
|
|
| 829 |
# ── Batch-LR scaling guidance ──
|
| 830 |
print("\n Batch-LR Scaling Rules:")
|
| 831 |
print(" • Batch ×2 → LR ×√2 (square root scaling, recommended for Adam)")
|
| 832 |
-
print(" (Malladi et al. NeurIPS 2022
|
| 833 |
print(" • Batch ×2 → LR ×2 (linear scaling, Goyal et al. 2017, mainly SGD)")
|
| 834 |
print(" • 1B model: ~1K-2K sequences (~2-4M tokens) is typical")
|
| 835 |
print(" (Pythia-1B: ~2M tokens, TinyLlama: ~2M, OLMo-1B: ~4M)")
|
|
|
|
| 29 |
# Constants
|
| 30 |
# ───────────────────────────────────────────────────────────────────
|
| 31 |
|
| 32 |
+
# Approximate convergence ranges for a 1B model trained on ~10B tokens.
|
| 33 |
+
# Estimated from GPT-2 scaling benchmarks (Radford et al. 2019) and
|
| 34 |
+
# Chinchilla scaling laws (Hoffmann et al. 2022). Not dataset-specific.
|
| 35 |
_EXPECTED_TRAIN_LOSS = (2.5, 3.3)
|
| 36 |
_EXPECTED_VAL_LOSS = (2.7, 3.6)
|
| 37 |
_EXPECTED_VAL_PPL = (15, 37)
|
|
|
|
| 735 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 736 |
print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
|
| 737 |
|
| 738 |
+
# Heuristic: >50% clipping means most steps are capped, so the
|
| 739 |
+
# effective LR is lower than configured. Practitioners generally
|
| 740 |
+
# treat this as a sign that peak LR is too high.
|
| 741 |
if clip_rate > 0.5:
|
| 742 |
findings.append({
|
| 743 |
"issue": "LR may be too high",
|
|
|
|
| 832 |
# ── Batch-LR scaling guidance ──
|
| 833 |
print("\n Batch-LR Scaling Rules:")
|
| 834 |
print(" • Batch ×2 → LR ×√2 (square root scaling, recommended for Adam)")
|
| 835 |
+
print(" (Malladi et al. NeurIPS 2022, 'On the SDEs and Scaling Rules for Adaptive Gradient Algorithms')")
|
| 836 |
print(" • Batch ×2 → LR ×2 (linear scaling, Goyal et al. 2017, mainly SGD)")
|
| 837 |
print(" • 1B model: ~1K-2K sequences (~2-4M tokens) is typical")
|
| 838 |
print(" (Pythia-1B: ~2M tokens, TinyLlama: ~2M, OLMo-1B: ~4M)")
|