Vjeong Claude Opus 4.6 committed on
Commit
362e9ea
·
1 Parent(s): 5359f06

Fix gradient diagnostic thresholds with evidence-based criteria in LossDebugger

Browse files

- Use exact grad_clip instead of 0.95 multiplier (PyTorch clip_grad_norm_ behavior)
- Raise clip rate threshold from 30% to 50% (Google Deep Learning Tuning Playbook)
- Replace absolute tiny-grad threshold (0.01) with relative one (grad_clip * 0.01)
- Change tiny-grad diagnosis from "LR too low" to "vanishing gradients"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. llm_lab/training/debugger.py +13 -8
llm_lab/training/debugger.py CHANGED
@@ -719,16 +719,21 @@ class LossDebugger:
719
 
720
  if grad_norms:
721
  avg_grad = sum(grad_norms) / len(grad_norms)
722
- clip_count = sum(1 for g in grad_norms if g >= config.grad_clip * 0.95)
 
723
  clip_rate = clip_count / len(grad_norms)
724
- tiny_count = sum(1 for g in grad_norms if g < 0.01)
 
 
725
  tiny_rate = tiny_count / len(grad_norms)
726
 
727
  print(f" Avg grad norm: {avg_grad:.4f}")
728
  print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
729
- print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< 0.01)")
730
 
731
- if clip_rate > 0.3:
 
 
732
  findings.append({
733
  "issue": "LR may be too high",
734
  "evidence": f"Grad norm hits clip limit {clip_rate * 100:.0f}% of the time",
@@ -737,11 +742,11 @@ class LossDebugger:
737
  print(f" 🟡 Grad clipping frequent ({clip_rate * 100:.0f}%) → LR may be too high")
738
  elif tiny_rate > 0.5:
739
  findings.append({
740
- "issue": "LR may be too low",
741
- "evidence": f"Grad norm < 0.01 in {tiny_rate * 100:.0f}% of steps",
742
- "action": f"Try LR = {config.learning_rate * 2:.2e} (×2)",
743
  })
744
- print(f" 🟡 Grad norm too small ({tiny_rate * 100:.0f}% < 0.01) → LR may be too low")
745
  else:
746
  print(f" ✅ LR looks appropriate")
747
 
 
719
 
720
  if grad_norms:
721
  avg_grad = sum(grad_norms) / len(grad_norms)
722
+ # Ref: PyTorch clip_grad_norm_ clips when total_norm > max_norm
723
+ clip_count = sum(1 for g in grad_norms if g >= config.grad_clip)
724
  clip_rate = clip_count / len(grad_norms)
725
+ # Relative threshold: < 1% of clip limit (model-size independent)
726
+ tiny_threshold = config.grad_clip * 0.01
727
+ tiny_count = sum(1 for g in grad_norms if g < tiny_threshold)
728
  tiny_rate = tiny_count / len(grad_norms)
729
 
730
  print(f" Avg grad norm: {avg_grad:.4f}")
731
  print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
732
+ print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
733
 
734
+ # Ref: Google Deep Learning Tuning Playbook — >50% clipping is
735
+ # "extremely aggressive" and effectively a strange LR reduction.
736
+ if clip_rate > 0.5:
737
  findings.append({
738
  "issue": "LR may be too high",
739
  "evidence": f"Grad norm hits clip limit {clip_rate * 100:.0f}% of the time",
 
742
  print(f" 🟡 Grad clipping frequent ({clip_rate * 100:.0f}%) → LR may be too high")
743
  elif tiny_rate > 0.5:
744
  findings.append({
745
+ "issue": "Possible vanishing gradients",
746
+ "evidence": f"Grad norm < {tiny_threshold:.4f} in {tiny_rate * 100:.0f}% of steps",
747
+ "action": "Check weight initialization, layer norms, and model depth",
748
  })
749
+ print(f" 🟡 Grad norm too small ({tiny_rate * 100:.0f}% < {tiny_threshold:.4f}) → possible vanishing gradients")
750
  else:
751
  print(f" ✅ LR looks appropriate")
752