Fix gradient diagnostic thresholds with evidence-based criteria in LossDebugger
- Use exact grad_clip instead of 0.95 multiplier (PyTorch clip_grad_norm_ behavior)
- Raise clip rate threshold from 30% to 50% (Google Deep Learning Tuning Playbook)
- Replace absolute tiny-grad threshold (0.01) with relative one (grad_clip * 0.01)
- Change tiny-grad diagnosis from "LR too low" to "vanishing gradients"
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- llm_lab/training/debugger.py +13 -8
llm_lab/training/debugger.py
CHANGED
|
@@ -719,16 +719,21 @@ class LossDebugger:
|
|
| 719 |
|
| 720 |
if grad_norms:
|
| 721 |
avg_grad = sum(grad_norms) / len(grad_norms)
|
| 722 |
-
|
|
|
|
| 723 |
clip_rate = clip_count / len(grad_norms)
|
| 724 |
-
|
|
|
|
|
|
|
| 725 |
tiny_rate = tiny_count / len(grad_norms)
|
| 726 |
|
| 727 |
print(f" Avg grad norm: {avg_grad:.4f}")
|
| 728 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 729 |
-
print(f"   Tiny grad rate: {tiny_rate * 100:.1f}% (< 0.01)")
|
| 730 |
|
| 731 |
-
|
|
|
|
|
|
|
| 732 |
findings.append({
|
| 733 |
"issue": "LR may be too high",
|
| 734 |
"evidence": f"Grad norm hits clip limit {clip_rate * 100:.0f}% of the time",
|
|
@@ -737,11 +742,11 @@ class LossDebugger:
|
|
| 737 |
print(f" π‘ Grad clipping frequent ({clip_rate * 100:.0f}%) β LR may be too high")
|
| 738 |
elif tiny_rate > 0.5:
|
| 739 |
findings.append({
|
| 740 |
-
"issue": "LR may be too low",
|
| 741 |
-
"evidence": f"Grad norm < 0.01 in {tiny_rate * 100:.0f}% of steps",
|
| 742 |
-
"action": "Consider increasing the learning rate",
|
| 743 |
})
|
| 744 |
-
print(f"   💡 Grad norm too small ({tiny_rate * 100:.0f}% < 0.01) → LR may be too low")
|
| 745 |
else:
|
| 746 |
print(f"   ✅ LR looks appropriate")
|
| 747 |
|
|
|
|
| 719 |
|
| 720 |
if grad_norms:
|
| 721 |
avg_grad = sum(grad_norms) / len(grad_norms)
|
| 722 |
+
# Ref: PyTorch clip_grad_norm_ clips when total_norm > max_norm
|
| 723 |
+
clip_count = sum(1 for g in grad_norms if g >= config.grad_clip)
|
| 724 |
clip_rate = clip_count / len(grad_norms)
|
| 725 |
+
# Relative threshold: < 1% of clip limit (model-size independent)
|
| 726 |
+
tiny_threshold = config.grad_clip * 0.01
|
| 727 |
+
tiny_count = sum(1 for g in grad_norms if g < tiny_threshold)
|
| 728 |
tiny_rate = tiny_count / len(grad_norms)
|
| 729 |
|
| 730 |
print(f" Avg grad norm: {avg_grad:.4f}")
|
| 731 |
print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
|
| 732 |
+
print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
|
| 733 |
|
| 734 |
+
# Ref: Google Deep Learning Tuning Playbook β >50% clipping is
|
| 735 |
+
# "extremely aggressive" and effectively a strange LR reduction.
|
| 736 |
+
if clip_rate > 0.5:
|
| 737 |
findings.append({
|
| 738 |
"issue": "LR may be too high",
|
| 739 |
"evidence": f"Grad norm hits clip limit {clip_rate * 100:.0f}% of the time",
|
|
|
|
| 742 |
print(f" π‘ Grad clipping frequent ({clip_rate * 100:.0f}%) β LR may be too high")
|
| 743 |
elif tiny_rate > 0.5:
|
| 744 |
findings.append({
|
| 745 |
+
"issue": "Possible vanishing gradients",
|
| 746 |
+
"evidence": f"Grad norm < {tiny_threshold:.4f} in {tiny_rate * 100:.0f}% of steps",
|
| 747 |
+
"action": "Check weight initialization, layer norms, and model depth",
|
| 748 |
})
|
| 749 |
+
print(f" π‘ Grad norm too small ({tiny_rate * 100:.0f}% < {tiny_threshold:.4f}) β possible vanishing gradients")
|
| 750 |
else:
|
| 751 |
print(f"   ✅ LR looks appropriate")
|
| 752 |
|