Vjeong Claude Opus 4.6 committed on
Commit
4322ea0
Β·
1 Parent(s): e96b9d3

Fix false/inaccurate citations in LossDebugger

Browse files

- Remove false attribution to Google Deep Learning Tuning Playbook for
gradient clipping heuristic (playbook has no clipping content)
- Remove Krizhevsky 2014 citation for sqrt batch-LR scaling (he proposed
it theoretically but preferred linear for SGD; Malladi et al. 2022 is
the proper reference for Adam)
- Clarify expected loss/PPL ranges are estimates from GPT-2 benchmarks
and Chinchilla scaling laws, not FineWeb-Edu specific results

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. llm_lab/training/debugger.py +7 -4
llm_lab/training/debugger.py CHANGED
@@ -29,7 +29,9 @@ from llm_lab.config import TrainConfig
29
  # Constants
30
  # ═══════════════════════════════════════════════════════════════════
31
 
32
- # Normal convergence ranges for a 1B model trained on ~10B tokens (FineWeb-Edu)
 
 
33
  _EXPECTED_TRAIN_LOSS = (2.5, 3.3)
34
  _EXPECTED_VAL_LOSS = (2.7, 3.6)
35
  _EXPECTED_VAL_PPL = (15, 37)
@@ -733,8 +735,9 @@ class LossDebugger:
733
  print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
734
  print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
735
 
736
- # Ref: Google Deep Learning Tuning Playbook β€” >50% clipping is
737
- # "extremely aggressive" and effectively a strange LR reduction.
 
738
  if clip_rate > 0.5:
739
  findings.append({
740
  "issue": "LR may be too high",
@@ -829,7 +832,7 @@ class LossDebugger:
829
  # ── Batch-LR scaling guidance ──
830
  print("\n Batch-LR Scaling Rules:")
831
  print(" β€’ Batch Γ—2 β†’ LR Γ—βˆš2 (square root scaling, recommended for Adam)")
832
- print(" (Malladi et al. NeurIPS 2022; Krizhevsky 2014)")
833
  print(" β€’ Batch Γ—2 β†’ LR Γ—2 (linear scaling, Goyal et al. 2017, mainly SGD)")
834
  print(" β€’ 1B model: ~1K-2K sequences (~2-4M tokens) is typical")
835
  print(" (Pythia-1B: ~2M tokens, TinyLlama: ~2M, OLMo-1B: ~4M)")
 
29
  # Constants
30
  # ═══════════════════════════════════════════════════════════════════
31
 
32
+ # Approximate convergence ranges for a 1B model trained on ~10B tokens.
33
+ # Estimated from GPT-2 scaling benchmarks (Radford et al. 2019) and
34
+ # Chinchilla scaling laws (Hoffmann et al. 2022). Not dataset-specific.
35
  _EXPECTED_TRAIN_LOSS = (2.5, 3.3)
36
  _EXPECTED_VAL_LOSS = (2.7, 3.6)
37
  _EXPECTED_VAL_PPL = (15, 37)
 
735
  print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
736
  print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
737
 
738
+ # Heuristic: >50% clipping means most steps are capped, so the
739
+ # effective LR is lower than configured. Practitioners generally
740
+ # treat this as a sign that peak LR is too high.
741
  if clip_rate > 0.5:
742
  findings.append({
743
  "issue": "LR may be too high",
 
832
  # ── Batch-LR scaling guidance ──
833
  print("\n Batch-LR Scaling Rules:")
834
  print(" β€’ Batch Γ—2 β†’ LR Γ—βˆš2 (square root scaling, recommended for Adam)")
835
+ print(" (Malladi et al. NeurIPS 2022, 'On the SDEs and Scaling Rules for Adaptive Gradient Algorithms')")
836
  print(" β€’ Batch Γ—2 β†’ LR Γ—2 (linear scaling, Goyal et al. 2017, mainly SGD)")
837
  print(" β€’ 1B model: ~1K-2K sequences (~2-4M tokens) is typical")
838
  print(" (Pythia-1B: ~2M tokens, TinyLlama: ~2M, OLMo-1B: ~4M)")