Remove redundant detect_scenario from LossDebugger
Level 0 diagnose_status already covers the same 4 scenarios (no decrease,
NaN, loss bounce, plateau) with better accuracy (moving-average smoothing,
val trend awareness, more granular categories). Remove the duplicate method
and its notebook section to avoid confusion.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- llm_lab/training/debugger.py +0 -126
- notebooks/05_debugging.ipynb +59 -134
llm_lab/training/debugger.py
CHANGED
|
@@ -1289,129 +1289,6 @@ class LossDebugger:
|
|
| 1289 |
"weight_issues": weight_issues,
|
| 1290 |
}
|
| 1291 |
|
| 1292 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1293 |
-
# Scenario Auto-Detection
|
| 1294 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1295 |
-
|
| 1296 |
-
@staticmethod
|
| 1297 |
-
def detect_scenario(
|
| 1298 |
-
metrics_history: Dict[str, list],
|
| 1299 |
-
vocab_size: int = 32000,
|
| 1300 |
-
) -> Dict[str, Any]:
|
| 1301 |
-
"""Auto-detect which debugging scenario applies.
|
| 1302 |
-
|
| 1303 |
-
Scenarios (from the guide):
|
| 1304 |
-
A: Loss stuck at ~10.37 (doesn't decrease at all)
|
| 1305 |
-
B: Loss was decreasing then suddenly NaN
|
| 1306 |
-
C: Loss decreased to X then started increasing
|
| 1307 |
-
D: Loss stuck at high value (e.g. 4.0 for 1B model)
|
| 1308 |
-
"""
|
| 1309 |
-
print(_header("Scenario Auto-Detection"))
|
| 1310 |
-
|
| 1311 |
-
train_losses = metrics_history.get("train_loss", [])
|
| 1312 |
-
val_losses = [v for v in metrics_history.get("val_loss", []) if v is not None]
|
| 1313 |
-
expected_initial = math.log(vocab_size)
|
| 1314 |
-
|
| 1315 |
-
if len(train_losses) < 5:
|
| 1316 |
-
print(" [!] Not enough data to detect scenario.")
|
| 1317 |
-
return {"scenario": "unknown", "steps": []}
|
| 1318 |
-
|
| 1319 |
-
# Filter out NaN for statistics
|
| 1320 |
-
valid_losses = [l for l in train_losses if not math.isnan(l)]
|
| 1321 |
-
if len(valid_losses) < 5:
|
| 1322 |
-
print(" [!] Not enough valid (non-NaN) data to detect scenario.")
|
| 1323 |
-
return {"scenario": "unknown", "steps": []}
|
| 1324 |
-
|
| 1325 |
-
first_loss = valid_losses[0]
|
| 1326 |
-
last_loss = valid_losses[-1]
|
| 1327 |
-
has_nan = len(valid_losses) < len(train_losses)
|
| 1328 |
-
min_loss = min(valid_losses)
|
| 1329 |
-
min_loss_idx = next(i for i, l in enumerate(train_losses)
|
| 1330 |
-
if not math.isnan(l) and l == min_loss)
|
| 1331 |
-
loss_recovered = last_loss > min_loss + 0.3 and min_loss_idx < len(train_losses) * 0.8
|
| 1332 |
-
|
| 1333 |
-
# Trend analysis: is loss still decreasing?
|
| 1334 |
-
mid = len(valid_losses) // 2
|
| 1335 |
-
first_half_avg = sum(valid_losses[:mid]) / mid
|
| 1336 |
-
second_half_avg = sum(valid_losses[mid:]) / (len(valid_losses) - mid)
|
| 1337 |
-
still_decreasing = (first_half_avg - second_half_avg) > 0.3
|
| 1338 |
-
|
| 1339 |
-
scenario = "unknown"
|
| 1340 |
-
steps: List[str] = []
|
| 1341 |
-
|
| 1342 |
-
# Scenario A: Loss stuck near initial value
|
| 1343 |
-
if abs(last_loss - expected_initial) < 1.5 and abs(first_loss - last_loss) < 0.5:
|
| 1344 |
-
scenario = "A"
|
| 1345 |
-
steps = [
|
| 1346 |
-
"1. Run single-batch overfit test β if it fails, model/loss has a bug",
|
| 1347 |
-
"2. Check if gradients are zero β optimizer.step() may be missing",
|
| 1348 |
-
"3. Verify input_ids/targets shift β data pipeline bug",
|
| 1349 |
-
"4. Check LR β is it set to 0?",
|
| 1350 |
-
"5. Check model.train() β eval mode changes norm/dropout behavior",
|
| 1351 |
-
]
|
| 1352 |
-
|
| 1353 |
-
# Scenario B: NaN appeared
|
| 1354 |
-
elif has_nan:
|
| 1355 |
-
nan_idx = next(i for i, l in enumerate(train_losses) if math.isnan(l))
|
| 1356 |
-
scenario = "B"
|
| 1357 |
-
steps = [
|
| 1358 |
-
f"1. NaN appeared at step ~{nan_idx}. Check that batch's data for bad tokens",
|
| 1359 |
-
"2. Check gradient norm just before NaN β was there a spike?",
|
| 1360 |
-
"3. Check LR schedule β does NaN coincide with warmup end?",
|
| 1361 |
-
"4. Check specific layer weights for Inf values",
|
| 1362 |
-
"5. Try switching to fp32 to see if it's a mixed precision issue",
|
| 1363 |
-
" (Pythia-1B had irrecoverable fp16 loss spikes β switched to bf16,",
|
| 1364 |
-
" Biderman et al. 2023)",
|
| 1365 |
-
]
|
| 1366 |
-
|
| 1367 |
-
# Scenario C: Loss decreased then increased
|
| 1368 |
-
elif loss_recovered:
|
| 1369 |
-
scenario = "C"
|
| 1370 |
-
steps = [
|
| 1371 |
-
"1. Check Train and Val loss simultaneously:",
|
| 1372 |
-
" - Both increasing β LR too high (check cosine decay)",
|
| 1373 |
-
" - Only train increasing β data quality changed (streaming order)",
|
| 1374 |
-
" - Only val increasing β overfitting started",
|
| 1375 |
-
"2. Verify LR schedule is decaying as intended",
|
| 1376 |
-
"3. Check data shuffling β same data repeating?",
|
| 1377 |
-
]
|
| 1378 |
-
|
| 1379 |
-
# Scenario D: Loss stuck at high value (not still decreasing)
|
| 1380 |
-
elif (last_loss > _EXPECTED_TRAIN_LOSS[1]
|
| 1381 |
-
and abs(last_loss - min_loss) < 0.3
|
| 1382 |
-
and not still_decreasing):
|
| 1383 |
-
scenario = "D"
|
| 1384 |
-
total_tokens = len(train_losses) * 262144 # approximate
|
| 1385 |
-
steps = [
|
| 1386 |
-
f"1. Check total tokens trained: ~{total_tokens / 1e9:.1f}B "
|
| 1387 |
-
f"(need 5-10B for 1B model)",
|
| 1388 |
-
"2. Compare with smaller model (100M) at same step β "
|
| 1389 |
-
"if 100M is lower, 1B may have a bug",
|
| 1390 |
-
"3. Run LR range test β current LR may not be optimal",
|
| 1391 |
-
"4. Sample training data β check for noise, duplicates, low quality",
|
| 1392 |
-
"5. Try different effective batch size (64 vs 128 vs 256)",
|
| 1393 |
-
]
|
| 1394 |
-
|
| 1395 |
-
else:
|
| 1396 |
-
scenario = "none"
|
| 1397 |
-
steps = ["Training appears normal. No specific scenario detected."]
|
| 1398 |
-
|
| 1399 |
-
label = {
|
| 1400 |
-
"A": "Loss stuck at initial value (~10.37)",
|
| 1401 |
-
"B": "Loss was decreasing, then NaN",
|
| 1402 |
-
"C": "Loss decreased then started increasing",
|
| 1403 |
-
"D": f"Loss stuck at high value (>{_EXPECTED_TRAIN_LOSS[1]})",
|
| 1404 |
-
"none": "No problematic scenario detected",
|
| 1405 |
-
"unknown": "Cannot determine",
|
| 1406 |
-
}
|
| 1407 |
-
|
| 1408 |
-
print(f"\n Detected: Scenario {scenario} β {label.get(scenario, 'Unknown')}")
|
| 1409 |
-
print(f"\n Recommended debugging steps:")
|
| 1410 |
-
for step in steps:
|
| 1411 |
-
print(f" {step}")
|
| 1412 |
-
|
| 1413 |
-
return {"scenario": scenario, "label": label.get(scenario, ""), "steps": steps}
|
| 1414 |
-
|
| 1415 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1416 |
# Main Entry Point
|
| 1417 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1491,9 +1368,6 @@ class LossDebugger:
|
|
| 1491 |
model, dataloader, device,
|
| 1492 |
)
|
| 1493 |
|
| 1494 |
-
# Auto-detect scenario
|
| 1495 |
-
report["scenario"] = LossDebugger.detect_scenario(metrics_history, vocab_size)
|
| 1496 |
-
|
| 1497 |
# Final summary
|
| 1498 |
print("\n" + "β" * 60)
|
| 1499 |
print(" Diagnostics Complete")
|
|
|
|
| 1289 |
"weight_issues": weight_issues,
|
| 1290 |
}
|
| 1291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1292 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1293 |
# Main Entry Point
|
| 1294 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1368 |
model, dataloader, device,
|
| 1369 |
)
|
| 1370 |
|
|
|
|
|
|
|
|
|
|
| 1371 |
# Final summary
|
| 1372 |
print("\n" + "β" * 60)
|
| 1373 |
print(" Diagnostics Complete")
|
notebooks/05_debugging.ipynb
CHANGED
|
@@ -18,7 +18,8 @@
|
|
| 18 |
"Level 4: Fitting Diagnosis β overfitting vs underfitting\n",
|
| 19 |
"Level 5: Architecture β initialization, per-layer activation\n",
|
| 20 |
"```"
|
| 21 |
-
]
|
|
|
|
| 22 |
},
|
| 23 |
{
|
| 24 |
"cell_type": "code",
|
|
@@ -28,7 +29,8 @@
|
|
| 28 |
"source": [
|
| 29 |
"# No additional packages required\n",
|
| 30 |
"# LossDebugger only uses torch and built-in llm_lab modules"
|
| 31 |
-
]
|
|
|
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"cell_type": "code",
|
|
@@ -55,7 +57,8 @@
|
|
| 55 |
"from llm_lab.data import setup_data_pipeline\n",
|
| 56 |
"from llm_lab.training import LossDebugger\n",
|
| 57 |
"from llm_lab.utils import auto_configure, get_device"
|
| 58 |
-
]
|
|
|
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"cell_type": "markdown",
|
|
@@ -64,7 +67,8 @@
|
|
| 64 |
"## 0. Configuration\n",
|
| 65 |
"\n",
|
| 66 |
"Use the `debug_10m` preset so it runs quickly even on CPU."
|
| 67 |
-
]
|
|
|
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"cell_type": "code",
|
|
@@ -89,7 +93,8 @@
|
|
| 89 |
"print(f\"Device: {device}, dtype: {dtype}\")\n",
|
| 90 |
"print(f\"Vocab size: {vocab_size:,}\")\n",
|
| 91 |
"print(f\"Expected initial loss: ln({vocab_size}) = {math.log(vocab_size):.2f}\")"
|
| 92 |
-
]
|
|
|
|
| 93 |
},
|
| 94 |
{
|
| 95 |
"cell_type": "code",
|
|
@@ -103,7 +108,8 @@
|
|
| 103 |
"\n",
|
| 104 |
"# --- Data pipeline (GPT-2 tokenizer used automatically) ---\n",
|
| 105 |
"tokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
|
| 106 |
-
]
|
|
|
|
| 107 |
},
|
| 108 |
{
|
| 109 |
"cell_type": "markdown",
|
|
@@ -118,7 +124,8 @@
|
|
| 118 |
"```python\n",
|
| 119 |
"# metrics_history = trainer.metrics.history\n",
|
| 120 |
"```"
|
| 121 |
-
]
|
|
|
|
| 122 |
},
|
| 123 |
{
|
| 124 |
"cell_type": "code",
|
|
@@ -134,7 +141,6 @@
|
|
| 134 |
"# --- Scenario A: loss stuck near ln(vocab_size) ---\n",
|
| 135 |
"# loss barely decreasing (suspected data/implementation bug)\n",
|
| 136 |
"# diagnose_status condition: loss_change < 0.1\n",
|
| 137 |
-
"# detect_scenario condition: abs(last - initial) < 1.5, abs(first - last) < 0.5\n",
|
| 138 |
"n_steps_a = 200\n",
|
| 139 |
"mock_history_a = {\n",
|
| 140 |
" \"step\": list(range(1, n_steps_a + 1)),\n",
|
|
@@ -308,7 +314,8 @@
|
|
| 308 |
"}\n",
|
| 309 |
"print(f\"\\nMock H β train_loss: only 1 step\")\n",
|
| 310 |
"print(f\" Expected: INSUFFICIENT_DATA (need >= 2 steps)\")"
|
| 311 |
-
]
|
|
|
|
| 312 |
},
|
| 313 |
{
|
| 314 |
"cell_type": "markdown",
|
|
@@ -326,7 +333,8 @@
|
|
| 326 |
"| `PLATEAU` | Loss stuck at high value | yellow |\n",
|
| 327 |
"| `OVERFITTING` | trainβ valβ | yellow |\n",
|
| 328 |
"| `UNSTABLE` | High loss variance | yellow |"
|
| 329 |
-
]
|
|
|
|
| 330 |
},
|
| 331 |
{
|
| 332 |
"cell_type": "code",
|
|
@@ -367,7 +375,8 @@
|
|
| 367 |
" print(f\" {icon} {name:25s} expected={expected:20s} got={actual}\")\n",
|
| 368 |
"passed = sum(1 for *_, m in results if m == \"PASS\")\n",
|
| 369 |
"print(f\"\\n {passed}/{len(results)} passed\")"
|
| 370 |
-
]
|
|
|
|
| 371 |
},
|
| 372 |
{
|
| 373 |
"cell_type": "code",
|
|
@@ -380,7 +389,8 @@
|
|
| 380 |
"\n",
|
| 381 |
"result = LossDebugger.diagnose_status(vocab_size, metrics_history)\n",
|
| 382 |
"print(f\"\\nReal checkpoint diagnosis: {result['status']}\")"
|
| 383 |
-
]
|
|
|
|
| 384 |
},
|
| 385 |
{
|
| 386 |
"cell_type": "markdown",
|
|
@@ -396,7 +406,8 @@
|
|
| 396 |
"4. **Single-batch overfit** β repeat one batch for 200 steps β verify loss β 0\n",
|
| 397 |
"5. **Tokenizer round-trip** β verify text is preserved after encode β decode\n",
|
| 398 |
"6. **Data quality** β visually inspect sample text"
|
| 399 |
-
]
|
|
|
|
| 400 |
},
|
| 401 |
{
|
| 402 |
"cell_type": "code",
|
|
@@ -416,7 +427,8 @@
|
|
| 416 |
"print(f\"\\nPassed: {len(level1['passed'])}, Failed: {len(level1['failed'])}\")\n",
|
| 417 |
"for f in level1['failed']:\n",
|
| 418 |
" print(f\" FAILED: {f['name']} β {f['detail']}\")"
|
| 419 |
-
]
|
|
|
|
| 420 |
},
|
| 421 |
{
|
| 422 |
"cell_type": "markdown",
|
|
@@ -453,7 +465,8 @@
|
|
| 453 |
"| Loss β Inf | log of 0 | Add eps, set ignore_index |\n",
|
| 454 |
"| Loss oscillating | fp16 gradient underflow | Switch to bf16 or use GradScaler |\n",
|
| 455 |
"| Late-stage NaN | Gradual activation increase | Verify RMSNorm, check weight decay |"
|
| 456 |
-
]
|
|
|
|
| 457 |
},
|
| 458 |
{
|
| 459 |
"cell_type": "code",
|
|
@@ -467,7 +480,8 @@
|
|
| 467 |
" device=device,\n",
|
| 468 |
" dtype=dtype,\n",
|
| 469 |
")"
|
| 470 |
-
]
|
|
|
|
| 471 |
},
|
| 472 |
{
|
| 473 |
"cell_type": "markdown",
|
|
@@ -506,7 +520,8 @@
|
|
| 506 |
"- Batch Γ2 β LR Γβ2 (square root scaling, safer)\n",
|
| 507 |
"- Batch Γ2 β LR Γ2 (linear scaling, used in GPT-3 etc.)\n",
|
| 508 |
"- Effective batch 64~512 is typical for 1B models"
|
| 509 |
-
]
|
|
|
|
| 510 |
},
|
| 511 |
{
|
| 512 |
"cell_type": "code",
|
|
@@ -518,7 +533,8 @@
|
|
| 518 |
" metrics_history=mock_history_a,\n",
|
| 519 |
" config=train_config,\n",
|
| 520 |
")"
|
| 521 |
-
]
|
|
|
|
| 522 |
},
|
| 523 |
{
|
| 524 |
"cell_type": "markdown",
|
|
@@ -531,7 +547,8 @@
|
|
| 531 |
"\n",
|
| 532 |
"> May take some time (approximately 1 minute for debug_10m).\n",
|
| 533 |
"> Run this once before starting actual training."
|
| 534 |
-
]
|
|
|
|
| 535 |
},
|
| 536 |
{
|
| 537 |
"cell_type": "code",
|
|
@@ -548,7 +565,8 @@
|
|
| 548 |
")\n",
|
| 549 |
"\n",
|
| 550 |
"print(f\"\\nSuggested peak LR: {lr_result['suggested_lr']:.2e}\")"
|
| 551 |
-
]
|
|
|
|
| 552 |
},
|
| 553 |
{
|
| 554 |
"cell_type": "markdown",
|
|
@@ -586,7 +604,8 @@
|
|
| 586 |
" - Pythia, TinyLlama, OLMo, LLaMA all use dropout=0\n",
|
| 587 |
" - With sufficient data, the data itself is the best regularization\n",
|
| 588 |
" - Dropout is useful for fine-tuning with small data"
|
| 589 |
-
]
|
|
|
|
| 590 |
},
|
| 591 |
{
|
| 592 |
"cell_type": "code",
|
|
@@ -602,7 +621,8 @@
|
|
| 602 |
" model_params=model_params,\n",
|
| 603 |
" total_tokens=total_tokens,\n",
|
| 604 |
")"
|
| 605 |
-
]
|
|
|
|
| 606 |
},
|
| 607 |
{
|
| 608 |
"cell_type": "markdown",
|
|
@@ -634,7 +654,8 @@
|
|
| 634 |
"| RoPE β absolute positional embedding | Small difference for short sequences | Check RoPE implementation |\n",
|
| 635 |
"| SwiGLU β ReLU FFN | Loss +0.05~0.15 | Check SwiGLU implementation |\n",
|
| 636 |
"| GQA β MHA | Almost same loss (memory difference only) | KV repeat bug |"
|
| 637 |
-
]
|
|
|
|
| 638 |
},
|
| 639 |
{
|
| 640 |
"cell_type": "code",
|
|
@@ -649,115 +670,14 @@
|
|
| 649 |
")\n",
|
| 650 |
"\n",
|
| 651 |
"print(f\"\\n>>> Diagnosis: {level5['diagnosis']}\")"
|
| 652 |
-
]
|
|
|
|
| 653 |
},
|
| 654 |
{
|
| 655 |
"cell_type": "markdown",
|
| 656 |
"metadata": {},
|
| 657 |
"source": [
|
| 658 |
-
"## 7.
|
| 659 |
-
"\n",
|
| 660 |
-
"Analyzes `metrics_history` to automatically identify which of the 4 scenarios applies:\n",
|
| 661 |
-
"\n",
|
| 662 |
-
"| Scenario | Symptom | Main Cause | Key Diagnostic |\n",
|
| 663 |
-
"|----------|---------|-----------|----------------|\n",
|
| 664 |
-
"| **A** | Loss stuck near ~10.37 | Data/implementation bug | Single-batch overfit test |\n",
|
| 665 |
-
"| **B** | Sudden NaN during loss decrease | Numerical instability | Check grad norm just before NaN |\n",
|
| 666 |
-
"| **C** | Loss increases after decreasing | LR/data issue | Check train/val simultaneously |\n",
|
| 667 |
-
"| **D** | Loss stuck at high value | Insufficient training/LR issue | Check token count, run LR Range Test |\n",
|
| 668 |
-
"\n",
|
| 669 |
-
"### Scenario-Specific Diagnostic Points\n",
|
| 670 |
-
"\n",
|
| 671 |
-
"**Scenario A**: \"Loss isn't decreasing from 10.37 at all\"\n",
|
| 672 |
-
"1. Single-batch overfit β if fails, model/loss bug\n",
|
| 673 |
-
"2. Check if gradients are 0 β missing `optimizer.step()`?\n",
|
| 674 |
-
"3. Check `input_ids/targets` shift β data pipeline bug\n",
|
| 675 |
-
"4. Check LR β is it set to 0?\n",
|
| 676 |
-
"5. Verify `model.train()` is called\n",
|
| 677 |
-
"\n",
|
| 678 |
-
"**Scenario B**: \"Loss was decreasing then suddenly NaN\"\n",
|
| 679 |
-
"1. Inspect the batch data at that step\n",
|
| 680 |
-
"2. Check gradient norm spike just before NaN\n",
|
| 681 |
-
"3. Compare LR schedule with NaN timing\n",
|
| 682 |
-
"4. Mixed precision issue β try reproducing with fp32\n",
|
| 683 |
-
" - (Pythia-1B: fp16 β bf16 switch case, Biderman et al. 2023)\n",
|
| 684 |
-
"\n",
|
| 685 |
-
"**Scenario C**: \"Loss decreased to 3.5 then went back up\"\n",
|
| 686 |
-
"1. Check train/val simultaneously:\n",
|
| 687 |
-
" - Both up β LR too high\n",
|
| 688 |
-
" - Only train up β data quality change (streaming order)\n",
|
| 689 |
-
" - Only val up β overfitting starting\n",
|
| 690 |
-
"2. Check LR schedule, check data shuffling\n",
|
| 691 |
-
"\n",
|
| 692 |
-
"**Scenario D**: \"Loss won't go below 4.0\"\n",
|
| 693 |
-
"1. Check training token count (if < 5B, insufficient training)\n",
|
| 694 |
-
"2. Compare with 100M model\n",
|
| 695 |
-
"3. Run LR Range Test\n",
|
| 696 |
-
"4. Sample and inspect data quality"
|
| 697 |
-
]
|
| 698 |
-
},
|
| 699 |
-
{
|
| 700 |
-
"cell_type": "code",
|
| 701 |
-
"execution_count": null,
|
| 702 |
-
"metadata": {},
|
| 703 |
-
"outputs": [],
|
| 704 |
-
"source": [
|
| 705 |
-
"# --- Scenario A ---\n",
|
| 706 |
-
"print(\"=\" * 50)\n",
|
| 707 |
-
"print(\"Testing Scenario A (loss stuck at initial value)\")\n",
|
| 708 |
-
"print(\"=\" * 50)\n",
|
| 709 |
-
"scenario_a = LossDebugger.detect_scenario(\n",
|
| 710 |
-
" metrics_history=mock_history_a,\n",
|
| 711 |
-
" vocab_size=vocab_size,\n",
|
| 712 |
-
")\n",
|
| 713 |
-
"print(f\"\\n>>> Detected: Scenario {scenario_a['scenario']}\")\n",
|
| 714 |
-
"\n",
|
| 715 |
-
"# --- Scenario B ---\n",
|
| 716 |
-
"print(\"\\n\" + \"=\" * 50)\n",
|
| 717 |
-
"print(\"Testing Scenario B (NaN appeared)\")\n",
|
| 718 |
-
"print(\"=\" * 50)\n",
|
| 719 |
-
"scenario_b = LossDebugger.detect_scenario(\n",
|
| 720 |
-
" metrics_history=mock_history_b,\n",
|
| 721 |
-
" vocab_size=vocab_size,\n",
|
| 722 |
-
")\n",
|
| 723 |
-
"print(f\"\\n>>> Detected: Scenario {scenario_b['scenario']}\")\n",
|
| 724 |
-
"\n",
|
| 725 |
-
"# --- Scenario C ---\n",
|
| 726 |
-
"print(\"\\n\" + \"=\" * 50)\n",
|
| 727 |
-
"print(\"Testing Scenario C (loss bounce)\")\n",
|
| 728 |
-
"print(\"=\" * 50)\n",
|
| 729 |
-
"scenario_c = LossDebugger.detect_scenario(\n",
|
| 730 |
-
" metrics_history=mock_history_c,\n",
|
| 731 |
-
" vocab_size=vocab_size,\n",
|
| 732 |
-
")\n",
|
| 733 |
-
"print(f\"\\n>>> Detected: Scenario {scenario_c['scenario']}\")\n",
|
| 734 |
-
"\n",
|
| 735 |
-
"# --- Scenario D ---\n",
|
| 736 |
-
"print(\"\\n\" + \"=\" * 50)\n",
|
| 737 |
-
"print(\"Testing Scenario D (loss plateau)\")\n",
|
| 738 |
-
"print(\"=\" * 50)\n",
|
| 739 |
-
"scenario_d = LossDebugger.detect_scenario(\n",
|
| 740 |
-
" metrics_history=mock_history_d,\n",
|
| 741 |
-
" vocab_size=vocab_size,\n",
|
| 742 |
-
")\n",
|
| 743 |
-
"print(f\"\\n>>> Detected: Scenario {scenario_d['scenario']}\")\n",
|
| 744 |
-
"\n",
|
| 745 |
-
"# --- Normal ---\n",
|
| 746 |
-
"print(\"\\n\" + \"=\" * 50)\n",
|
| 747 |
-
"print(\"Testing Normal scenario\")\n",
|
| 748 |
-
"print(\"=\" * 50)\n",
|
| 749 |
-
"scenario_n = LossDebugger.detect_scenario(\n",
|
| 750 |
-
" metrics_history=mock_history_normal,\n",
|
| 751 |
-
" vocab_size=vocab_size,\n",
|
| 752 |
-
")\n",
|
| 753 |
-
"print(f\"\\n>>> Detected: Scenario {scenario_n['scenario']}\")"
|
| 754 |
-
]
|
| 755 |
-
},
|
| 756 |
-
{
|
| 757 |
-
"cell_type": "markdown",
|
| 758 |
-
"metadata": {},
|
| 759 |
-
"source": [
|
| 760 |
-
"## 8. Full Diagnostics (run_diagnostics)\n",
|
| 761 |
"\n",
|
| 762 |
"Runs all levels above at once. Use the `levels` parameter to select which levels to run.\n",
|
| 763 |
"\n",
|
|
@@ -770,7 +690,8 @@
|
|
| 770 |
"# device=device, dtype=dtype,\n",
|
| 771 |
"# )\n",
|
| 772 |
"```"
|
| 773 |
-
]
|
|
|
|
| 774 |
},
|
| 775 |
{
|
| 776 |
"cell_type": "code",
|
|
@@ -789,16 +710,18 @@
|
|
| 789 |
" vocab_size=vocab_size,\n",
|
| 790 |
" levels=[0, 1, 2, 3, 4, 5],\n",
|
| 791 |
")"
|
| 792 |
-
]
|
|
|
|
| 793 |
},
|
| 794 |
{
|
| 795 |
"cell_type": "markdown",
|
| 796 |
"metadata": {},
|
| 797 |
"source": [
|
| 798 |
-
"##
|
| 799 |
"\n",
|
| 800 |
"A systematic learning path for LLM training optimization."
|
| 801 |
-
]
|
|
|
|
| 802 |
},
|
| 803 |
{
|
| 804 |
"cell_type": "code",
|
|
@@ -807,13 +730,14 @@
|
|
| 807 |
"outputs": [],
|
| 808 |
"source": [
|
| 809 |
"LossDebugger.print_study_roadmap()"
|
| 810 |
-
]
|
|
|
|
| 811 |
},
|
| 812 |
{
|
| 813 |
"cell_type": "markdown",
|
| 814 |
"metadata": {},
|
| 815 |
"source": [
|
| 816 |
-
"##
|
| 817 |
"\n",
|
| 818 |
"**Recommended order:**\n",
|
| 819 |
"1. Check status with Level 0 β tells you which level to inspect\n",
|
|
@@ -843,7 +767,8 @@
|
|
| 843 |
"---\n",
|
| 844 |
"**Previous step:** Train the model in `03_training.ipynb`. \n",
|
| 845 |
"**Next step:** Evaluate the trained model in `04_evaluation.ipynb`."
|
| 846 |
-
]
|
|
|
|
| 847 |
}
|
| 848 |
],
|
| 849 |
"metadata": {
|
|
|
|
| 18 |
"Level 4: Fitting Diagnosis β overfitting vs underfitting\n",
|
| 19 |
"Level 5: Architecture β initialization, per-layer activation\n",
|
| 20 |
"```"
|
| 21 |
+
],
|
| 22 |
+
"id": "19f7e954"
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"cell_type": "code",
|
|
|
|
| 29 |
"source": [
|
| 30 |
"# No additional packages required\n",
|
| 31 |
"# LossDebugger only uses torch and built-in llm_lab modules"
|
| 32 |
+
],
|
| 33 |
+
"id": "af6a605f"
|
| 34 |
},
|
| 35 |
{
|
| 36 |
"cell_type": "code",
|
|
|
|
| 57 |
"from llm_lab.data import setup_data_pipeline\n",
|
| 58 |
"from llm_lab.training import LossDebugger\n",
|
| 59 |
"from llm_lab.utils import auto_configure, get_device"
|
| 60 |
+
],
|
| 61 |
+
"id": "23b92c8d"
|
| 62 |
},
|
| 63 |
{
|
| 64 |
"cell_type": "markdown",
|
|
|
|
| 67 |
"## 0. Configuration\n",
|
| 68 |
"\n",
|
| 69 |
"Use the `debug_10m` preset so it runs quickly even on CPU."
|
| 70 |
+
],
|
| 71 |
+
"id": "f3daf6ad"
|
| 72 |
},
|
| 73 |
{
|
| 74 |
"cell_type": "code",
|
|
|
|
| 93 |
"print(f\"Device: {device}, dtype: {dtype}\")\n",
|
| 94 |
"print(f\"Vocab size: {vocab_size:,}\")\n",
|
| 95 |
"print(f\"Expected initial loss: ln({vocab_size}) = {math.log(vocab_size):.2f}\")"
|
| 96 |
+
],
|
| 97 |
+
"id": "dc7aa763"
|
| 98 |
},
|
| 99 |
{
|
| 100 |
"cell_type": "code",
|
|
|
|
| 108 |
"\n",
|
| 109 |
"# --- Data pipeline (GPT-2 tokenizer used automatically) ---\n",
|
| 110 |
"tokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
|
| 111 |
+
],
|
| 112 |
+
"id": "6ed5cdad"
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"cell_type": "markdown",
|
|
|
|
| 124 |
"```python\n",
|
| 125 |
"# metrics_history = trainer.metrics.history\n",
|
| 126 |
"```"
|
| 127 |
+
],
|
| 128 |
+
"id": "58a40e59"
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"cell_type": "code",
|
|
|
|
| 141 |
"# --- Scenario A: loss stuck near ln(vocab_size) ---\n",
|
| 142 |
"# loss barely decreasing (suspected data/implementation bug)\n",
|
| 143 |
"# diagnose_status condition: loss_change < 0.1\n",
|
|
|
|
| 144 |
"n_steps_a = 200\n",
|
| 145 |
"mock_history_a = {\n",
|
| 146 |
" \"step\": list(range(1, n_steps_a + 1)),\n",
|
|
|
|
| 314 |
"}\n",
|
| 315 |
"print(f\"\\nMock H β train_loss: only 1 step\")\n",
|
| 316 |
"print(f\" Expected: INSUFFICIENT_DATA (need >= 2 steps)\")"
|
| 317 |
+
],
|
| 318 |
+
"id": "0fc90eec"
|
| 319 |
},
|
| 320 |
{
|
| 321 |
"cell_type": "markdown",
|
|
|
|
| 333 |
"| `PLATEAU` | Loss stuck at high value | yellow |\n",
|
| 334 |
"| `OVERFITTING` | trainβ valβ | yellow |\n",
|
| 335 |
"| `UNSTABLE` | High loss variance | yellow |"
|
| 336 |
+
],
|
| 337 |
+
"id": "0eb757cb"
|
| 338 |
},
|
| 339 |
{
|
| 340 |
"cell_type": "code",
|
|
|
|
| 375 |
" print(f\" {icon} {name:25s} expected={expected:20s} got={actual}\")\n",
|
| 376 |
"passed = sum(1 for *_, m in results if m == \"PASS\")\n",
|
| 377 |
"print(f\"\\n {passed}/{len(results)} passed\")"
|
| 378 |
+
],
|
| 379 |
+
"id": "b22fc65a"
|
| 380 |
},
|
| 381 |
{
|
| 382 |
"cell_type": "code",
|
|
|
|
| 389 |
"\n",
|
| 390 |
"result = LossDebugger.diagnose_status(vocab_size, metrics_history)\n",
|
| 391 |
"print(f\"\\nReal checkpoint diagnosis: {result['status']}\")"
|
| 392 |
+
],
|
| 393 |
+
"id": "da4a7552"
|
| 394 |
},
|
| 395 |
{
|
| 396 |
"cell_type": "markdown",
|
|
|
|
| 406 |
"4. **Single-batch overfit** β repeat one batch for 200 steps β verify loss β 0\n",
|
| 407 |
"5. **Tokenizer round-trip** β verify text is preserved after encode β decode\n",
|
| 408 |
"6. **Data quality** β visually inspect sample text"
|
| 409 |
+
],
|
| 410 |
+
"id": "793c1fc8"
|
| 411 |
},
|
| 412 |
{
|
| 413 |
"cell_type": "code",
|
|
|
|
| 427 |
"print(f\"\\nPassed: {len(level1['passed'])}, Failed: {len(level1['failed'])}\")\n",
|
| 428 |
"for f in level1['failed']:\n",
|
| 429 |
" print(f\" FAILED: {f['name']} β {f['detail']}\")"
|
| 430 |
+
],
|
| 431 |
+
"id": "d774e1c8"
|
| 432 |
},
|
| 433 |
{
|
| 434 |
"cell_type": "markdown",
|
|
|
|
| 465 |
"| Loss β Inf | log of 0 | Add eps, set ignore_index |\n",
|
| 466 |
"| Loss oscillating | fp16 gradient underflow | Switch to bf16 or use GradScaler |\n",
|
| 467 |
"| Late-stage NaN | Gradual activation increase | Verify RMSNorm, check weight decay |"
|
| 468 |
+
],
|
| 469 |
+
"id": "eb7e5160"
|
| 470 |
},
|
| 471 |
{
|
| 472 |
"cell_type": "code",
|
|
|
|
| 480 |
" device=device,\n",
|
| 481 |
" dtype=dtype,\n",
|
| 482 |
")"
|
| 483 |
+
],
|
| 484 |
+
"id": "1beffcec"
|
| 485 |
},
|
| 486 |
{
|
| 487 |
"cell_type": "markdown",
|
|
|
|
| 520 |
"- Batch Γ2 β LR Γβ2 (square root scaling, safer)\n",
|
| 521 |
"- Batch Γ2 β LR Γ2 (linear scaling, used in GPT-3 etc.)\n",
|
| 522 |
"- Effective batch 64~512 is typical for 1B models"
|
| 523 |
+
],
|
| 524 |
+
"id": "0cc8f6f9"
|
| 525 |
},
|
| 526 |
{
|
| 527 |
"cell_type": "code",
|
|
|
|
| 533 |
" metrics_history=mock_history_a,\n",
|
| 534 |
" config=train_config,\n",
|
| 535 |
")"
|
| 536 |
+
],
|
| 537 |
+
"id": "217f1cd3"
|
| 538 |
},
|
| 539 |
{
|
| 540 |
"cell_type": "markdown",
|
|
|
|
| 547 |
"\n",
|
| 548 |
"> May take some time (approximately 1 minute for debug_10m).\n",
|
| 549 |
"> Run this once before starting actual training."
|
| 550 |
+
],
|
| 551 |
+
"id": "fea85ba4"
|
| 552 |
},
|
| 553 |
{
|
| 554 |
"cell_type": "code",
|
|
|
|
| 565 |
")\n",
|
| 566 |
"\n",
|
| 567 |
"print(f\"\\nSuggested peak LR: {lr_result['suggested_lr']:.2e}\")"
|
| 568 |
+
],
|
| 569 |
+
"id": "bfd64503"
|
| 570 |
},
|
| 571 |
{
|
| 572 |
"cell_type": "markdown",
|
|
|
|
| 604 |
" - Pythia, TinyLlama, OLMo, LLaMA all use dropout=0\n",
|
| 605 |
" - With sufficient data, the data itself is the best regularization\n",
|
| 606 |
" - Dropout is useful for fine-tuning with small data"
|
| 607 |
+
],
|
| 608 |
+
"id": "edb91923"
|
| 609 |
},
|
| 610 |
{
|
| 611 |
"cell_type": "code",
|
|
|
|
| 621 |
" model_params=model_params,\n",
|
| 622 |
" total_tokens=total_tokens,\n",
|
| 623 |
")"
|
| 624 |
+
],
|
| 625 |
+
"id": "505692ed"
|
| 626 |
},
|
| 627 |
{
|
| 628 |
"cell_type": "markdown",
|
|
|
|
| 654 |
"| RoPE β absolute positional embedding | Small difference for short sequences | Check RoPE implementation |\n",
|
| 655 |
"| SwiGLU β ReLU FFN | Loss +0.05~0.15 | Check SwiGLU implementation |\n",
|
| 656 |
"| GQA β MHA | Almost same loss (memory difference only) | KV repeat bug |"
|
| 657 |
+
],
|
| 658 |
+
"id": "3b65db22"
|
| 659 |
},
|
| 660 |
{
|
| 661 |
"cell_type": "code",
|
|
|
|
| 670 |
")\n",
|
| 671 |
"\n",
|
| 672 |
"print(f\"\\n>>> Diagnosis: {level5['diagnosis']}\")"
|
| 673 |
+
],
|
| 674 |
+
"id": "e1d7a64e"
|
| 675 |
},
|
| 676 |
{
|
| 677 |
"cell_type": "markdown",
|
| 678 |
"metadata": {},
|
| 679 |
"source": [
|
| 680 |
+
"## 7. Full Diagnostics (run_diagnostics)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 681 |
"\n",
|
| 682 |
"Runs all levels above at once. Use the `levels` parameter to select which levels to run.\n",
|
| 683 |
"\n",
|
|
|
|
| 690 |
"# device=device, dtype=dtype,\n",
|
| 691 |
"# )\n",
|
| 692 |
"```"
|
| 693 |
+
],
|
| 694 |
+
"id": "8cc8a5f1"
|
| 695 |
},
|
| 696 |
{
|
| 697 |
"cell_type": "code",
|
|
|
|
| 710 |
" vocab_size=vocab_size,\n",
|
| 711 |
" levels=[0, 1, 2, 3, 4, 5],\n",
|
| 712 |
")"
|
| 713 |
+
],
|
| 714 |
+
"id": "c7f3a147"
|
| 715 |
},
|
| 716 |
{
|
| 717 |
"cell_type": "markdown",
|
| 718 |
"metadata": {},
|
| 719 |
"source": [
|
| 720 |
+
"## 8. Study Roadmap\n",
|
| 721 |
"\n",
|
| 722 |
"A systematic learning path for LLM training optimization."
|
| 723 |
+
],
|
| 724 |
+
"id": "093ac866"
|
| 725 |
},
|
| 726 |
{
|
| 727 |
"cell_type": "code",
|
|
|
|
| 730 |
"outputs": [],
|
| 731 |
"source": [
|
| 732 |
"LossDebugger.print_study_roadmap()"
|
| 733 |
+
],
|
| 734 |
+
"id": "fc94ba5f"
|
| 735 |
},
|
| 736 |
{
|
| 737 |
"cell_type": "markdown",
|
| 738 |
"metadata": {},
|
| 739 |
"source": [
|
| 740 |
+
"## 9. Debugging Tips\n",
|
| 741 |
"\n",
|
| 742 |
"**Recommended order:**\n",
|
| 743 |
"1. Check status with Level 0 β tells you which level to inspect\n",
|
|
|
|
| 767 |
"---\n",
|
| 768 |
"**Previous step:** Train the model in `03_training.ipynb`. \n",
|
| 769 |
"**Next step:** Evaluate the trained model in `04_evaluation.ipynb`."
|
| 770 |
+
],
|
| 771 |
+
"id": "eb140c33"
|
| 772 |
}
|
| 773 |
],
|
| 774 |
"metadata": {
|