Fix false/inaccurate citations in LossDebugger
- Remove false attribution to Google Deep Learning Tuning Playbook for
gradient clipping heuristic (playbook has no clipping content)
- Remove Krizhevsky 2014 citation for sqrt batch-LR scaling (he proposed
it theoretically but preferred linear for SGD; Malladi et al. 2022 is
the proper reference for Adam)
- Clarify expected loss/PPL ranges are estimates from GPT-2 benchmarks
and Chinchilla scaling laws, not FineWeb-Edu specific results
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
llm_lab/training/debugger.py
CHANGED
|
@@ -29,7 +29,9 @@ from llm_lab.config import TrainConfig
|
|
| 29 |
# Constants
|
| 30 |
# ───────────────────────────────────────────────────────────────────
|
| 31 |
|
| 32 |
-
#
|
|
|
|
|
|
|
| 33 |
_EXPECTED_TRAIN_LOSS = (2.5, 3.3)
|
| 34 |
_EXPECTED_VAL_LOSS = (2.7, 3.6)
|
| 35 |
_EXPECTED_VAL_PPL = (15, 37)
|
|
@@ -733,8 +735,9 @@ class LossDebugger:
|
|
| 733 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 734 |
print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
|
| 735 |
|
| 736 |
-
#
|
| 737 |
-
#
|
|
|
|
| 738 |
if clip_rate > 0.5:
|
| 739 |
findings.append({
|
| 740 |
"issue": "LR may be too high",
|
|
@@ -829,7 +832,7 @@ class LossDebugger:
|
|
| 829 |
# ── Batch-LR scaling guidance ──
|
| 830 |
print("\n Batch-LR Scaling Rules:")
|
| 831 |
print(" • Batch ×2 → LR ×√2 (square root scaling, recommended for Adam)")
|
| 832 |
-
print(" (Malladi et al. NeurIPS 2022
|
| 833 |
print(" • Batch ×2 → LR ×2 (linear scaling, Goyal et al. 2017, mainly SGD)")
|
| 834 |
print(" • 1B model: ~1K-2K sequences (~2-4M tokens) is typical")
|
| 835 |
print(" (Pythia-1B: ~2M tokens, TinyLlama: ~2M, OLMo-1B: ~4M)")
|
|
|
|
| 29 |
# Constants
|
| 30 |
# ───────────────────────────────────────────────────────────────────
|
| 31 |
|
| 32 |
+
# Approximate convergence ranges for a 1B model trained on ~10B tokens.
|
| 33 |
+
# Estimated from GPT-2 scaling benchmarks (Radford et al. 2019) and
|
| 34 |
+
# Chinchilla scaling laws (Hoffmann et al. 2022). Not dataset-specific.
|
| 35 |
_EXPECTED_TRAIN_LOSS = (2.5, 3.3)
|
| 36 |
_EXPECTED_VAL_LOSS = (2.7, 3.6)
|
| 37 |
_EXPECTED_VAL_PPL = (15, 37)
|
|
|
|
| 735 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 736 |
print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
|
| 737 |
|
| 738 |
+
# Heuristic: >50% clipping means most steps are capped, so the
|
| 739 |
+
# effective LR is lower than configured. Practitioners generally
|
| 740 |
+
# treat this as a sign that peak LR is too high.
|
| 741 |
if clip_rate > 0.5:
|
| 742 |
findings.append({
|
| 743 |
"issue": "LR may be too high",
|
|
|
|
| 832 |
# ── Batch-LR scaling guidance ──
|
| 833 |
print("\n Batch-LR Scaling Rules:")
|
| 834 |
print(" • Batch ×2 → LR ×√2 (square root scaling, recommended for Adam)")
|
| 835 |
+
print(" (Malladi et al. NeurIPS 2022, 'On the SDEs and Scaling Rules for Adaptive Gradient Algorithms')")
|
| 836 |
print(" • Batch ×2 → LR ×2 (linear scaling, Goyal et al. 2017, mainly SGD)")
|
| 837 |
print(" • 1B model: ~1K-2K sequences (~2-4M tokens) is typical")
|
| 838 |
print(" (Pythia-1B: ~2M tokens, TinyLlama: ~2M, OLMo-1B: ~4M)")
|