Vjeong Claude Opus 4.6 committed on
Commit
2a50172
·
1 Parent(s): a424729

Remove redundant detect_scenario from LossDebugger

Browse files

Level 0 diagnose_status already covers the same 4 scenarios (no decrease,
NaN, loss bounce, plateau) with better accuracy (moving-average smoothing,
val trend awareness, more granular categories). Remove the duplicate method
and its notebook section to avoid confusion.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

llm_lab/training/debugger.py CHANGED
@@ -1289,129 +1289,6 @@ class LossDebugger:
1289
  "weight_issues": weight_issues,
1290
  }
1291
 
1292
- # ───────────────────────────────────────────────────────────────
1293
- # Scenario Auto-Detection
1294
- # ───────────────────────────────────────────────────────────────
1295
-
1296
- @staticmethod
1297
- def detect_scenario(
1298
- metrics_history: Dict[str, list],
1299
- vocab_size: int = 32000,
1300
- ) -> Dict[str, Any]:
1301
- """Auto-detect which debugging scenario applies.
1302
-
1303
- Scenarios (from the guide):
1304
- A: Loss stuck at ~10.37 (doesn't decrease at all)
1305
- B: Loss was decreasing then suddenly NaN
1306
- C: Loss decreased to X then started increasing
1307
- D: Loss stuck at high value (e.g. 4.0 for 1B model)
1308
- """
1309
- print(_header("Scenario Auto-Detection"))
1310
-
1311
- train_losses = metrics_history.get("train_loss", [])
1312
- val_losses = [v for v in metrics_history.get("val_loss", []) if v is not None]
1313
- expected_initial = math.log(vocab_size)
1314
-
1315
- if len(train_losses) < 5:
1316
- print(" [!] Not enough data to detect scenario.")
1317
- return {"scenario": "unknown", "steps": []}
1318
-
1319
- # Filter out NaN for statistics
1320
- valid_losses = [l for l in train_losses if not math.isnan(l)]
1321
- if len(valid_losses) < 5:
1322
- print(" [!] Not enough valid (non-NaN) data to detect scenario.")
1323
- return {"scenario": "unknown", "steps": []}
1324
-
1325
- first_loss = valid_losses[0]
1326
- last_loss = valid_losses[-1]
1327
- has_nan = len(valid_losses) < len(train_losses)
1328
- min_loss = min(valid_losses)
1329
- min_loss_idx = next(i for i, l in enumerate(train_losses)
1330
- if not math.isnan(l) and l == min_loss)
1331
- loss_recovered = last_loss > min_loss + 0.3 and min_loss_idx < len(train_losses) * 0.8
1332
-
1333
- # Trend analysis: is loss still decreasing?
1334
- mid = len(valid_losses) // 2
1335
- first_half_avg = sum(valid_losses[:mid]) / mid
1336
- second_half_avg = sum(valid_losses[mid:]) / (len(valid_losses) - mid)
1337
- still_decreasing = (first_half_avg - second_half_avg) > 0.3
1338
-
1339
- scenario = "unknown"
1340
- steps: List[str] = []
1341
-
1342
- # Scenario A: Loss stuck near initial value
1343
- if abs(last_loss - expected_initial) < 1.5 and abs(first_loss - last_loss) < 0.5:
1344
- scenario = "A"
1345
- steps = [
1346
- "1. Run single-batch overfit test → if it fails, model/loss has a bug",
1347
- "2. Check if gradients are zero → optimizer.step() may be missing",
1348
- "3. Verify input_ids/targets shift → data pipeline bug",
1349
- "4. Check LR → is it set to 0?",
1350
- "5. Check model.train() → eval mode changes norm/dropout behavior",
1351
- ]
1352
-
1353
- # Scenario B: NaN appeared
1354
- elif has_nan:
1355
- nan_idx = next(i for i, l in enumerate(train_losses) if math.isnan(l))
1356
- scenario = "B"
1357
- steps = [
1358
- f"1. NaN appeared at step ~{nan_idx}. Check that batch's data for bad tokens",
1359
- "2. Check gradient norm just before NaN → was there a spike?",
1360
- "3. Check LR schedule → does NaN coincide with warmup end?",
1361
- "4. Check specific layer weights for Inf values",
1362
- "5. Try switching to fp32 to see if it's a mixed precision issue",
1363
- " (Pythia-1B had irrecoverable fp16 loss spikes → switched to bf16,",
1364
- " Biderman et al. 2023)",
1365
- ]
1366
-
1367
- # Scenario C: Loss decreased then increased
1368
- elif loss_recovered:
1369
- scenario = "C"
1370
- steps = [
1371
- "1. Check Train and Val loss simultaneously:",
1372
- " - Both increasing → LR too high (check cosine decay)",
1373
- " - Only train increasing → data quality changed (streaming order)",
1374
- " - Only val increasing → overfitting started",
1375
- "2. Verify LR schedule is decaying as intended",
1376
- "3. Check data shuffling → same data repeating?",
1377
- ]
1378
-
1379
- # Scenario D: Loss stuck at high value (not still decreasing)
1380
- elif (last_loss > _EXPECTED_TRAIN_LOSS[1]
1381
- and abs(last_loss - min_loss) < 0.3
1382
- and not still_decreasing):
1383
- scenario = "D"
1384
- total_tokens = len(train_losses) * 262144 # approximate
1385
- steps = [
1386
- f"1. Check total tokens trained: ~{total_tokens / 1e9:.1f}B "
1387
- f"(need 5-10B for 1B model)",
1388
- "2. Compare with smaller model (100M) at same step → "
1389
- "if 100M is lower, 1B may have a bug",
1390
- "3. Run LR range test → current LR may not be optimal",
1391
- "4. Sample training data → check for noise, duplicates, low quality",
1392
- "5. Try different effective batch size (64 vs 128 vs 256)",
1393
- ]
1394
-
1395
- else:
1396
- scenario = "none"
1397
- steps = ["Training appears normal. No specific scenario detected."]
1398
-
1399
- label = {
1400
- "A": "Loss stuck at initial value (~10.37)",
1401
- "B": "Loss was decreasing, then NaN",
1402
- "C": "Loss decreased then started increasing",
1403
- "D": f"Loss stuck at high value (>{_EXPECTED_TRAIN_LOSS[1]})",
1404
- "none": "No problematic scenario detected",
1405
- "unknown": "Cannot determine",
1406
- }
1407
-
1408
- print(f"\n Detected: Scenario {scenario} — {label.get(scenario, 'Unknown')}")
1409
- print(f"\n Recommended debugging steps:")
1410
- for step in steps:
1411
- print(f" {step}")
1412
-
1413
- return {"scenario": scenario, "label": label.get(scenario, ""), "steps": steps}
1414
-
1415
  # ───────────────────────────────────────────────────────────────
1416
  # Main Entry Point
1417
  # ───────────────────────────────────────────────────────────────
@@ -1491,9 +1368,6 @@ class LossDebugger:
1491
  model, dataloader, device,
1492
  )
1493
 
1494
- # Auto-detect scenario
1495
- report["scenario"] = LossDebugger.detect_scenario(metrics_history, vocab_size)
1496
-
1497
  # Final summary
1498
  print("\n" + "═" * 60)
1499
  print(" Diagnostics Complete")
 
1289
  "weight_issues": weight_issues,
1290
  }
1291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1292
  # ───────────────────────────────────────────────────────────────
1293
  # Main Entry Point
1294
  # ───────────────────────────────────────────────────────────────
 
1368
  model, dataloader, device,
1369
  )
1370
 
 
 
 
1371
  # Final summary
1372
  print("\n" + "═" * 60)
1373
  print(" Diagnostics Complete")
notebooks/05_debugging.ipynb CHANGED
@@ -18,7 +18,8 @@
18
  "Level 4: Fitting Diagnosis ← overfitting vs underfitting\n",
19
  "Level 5: Architecture ← initialization, per-layer activation\n",
20
  "```"
21
- ]
 
22
  },
23
  {
24
  "cell_type": "code",
@@ -28,7 +29,8 @@
28
  "source": [
29
  "# No additional packages required\n",
30
  "# LossDebugger only uses torch and built-in llm_lab modules"
31
- ]
 
32
  },
33
  {
34
  "cell_type": "code",
@@ -55,7 +57,8 @@
55
  "from llm_lab.data import setup_data_pipeline\n",
56
  "from llm_lab.training import LossDebugger\n",
57
  "from llm_lab.utils import auto_configure, get_device"
58
- ]
 
59
  },
60
  {
61
  "cell_type": "markdown",
@@ -64,7 +67,8 @@
64
  "## 0. Configuration\n",
65
  "\n",
66
  "Use the `debug_10m` preset so it runs quickly even on CPU."
67
- ]
 
68
  },
69
  {
70
  "cell_type": "code",
@@ -89,7 +93,8 @@
89
  "print(f\"Device: {device}, dtype: {dtype}\")\n",
90
  "print(f\"Vocab size: {vocab_size:,}\")\n",
91
  "print(f\"Expected initial loss: ln({vocab_size}) = {math.log(vocab_size):.2f}\")"
92
- ]
 
93
  },
94
  {
95
  "cell_type": "code",
@@ -103,7 +108,8 @@
103
  "\n",
104
  "# --- Data pipeline (GPT-2 tokenizer used automatically) ---\n",
105
  "tokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
106
- ]
 
107
  },
108
  {
109
  "cell_type": "markdown",
@@ -118,7 +124,8 @@
118
  "```python\n",
119
  "# metrics_history = trainer.metrics.history\n",
120
  "```"
121
- ]
 
122
  },
123
  {
124
  "cell_type": "code",
@@ -134,7 +141,6 @@
134
  "# --- Scenario A: loss stuck near ln(vocab_size) ---\n",
135
  "# loss barely decreasing (suspected data/implementation bug)\n",
136
  "# diagnose_status condition: loss_change < 0.1\n",
137
- "# detect_scenario condition: abs(last - initial) < 1.5, abs(first - last) < 0.5\n",
138
  "n_steps_a = 200\n",
139
  "mock_history_a = {\n",
140
  " \"step\": list(range(1, n_steps_a + 1)),\n",
@@ -308,7 +314,8 @@
308
  "}\n",
309
  "print(f\"\\nMock H — train_loss: only 1 step\")\n",
310
  "print(f\" Expected: INSUFFICIENT_DATA (need >= 2 steps)\")"
311
- ]
 
312
  },
313
  {
314
  "cell_type": "markdown",
@@ -326,7 +333,8 @@
326
  "| `PLATEAU` | Loss stuck at high value | yellow |\n",
327
  "| `OVERFITTING` | train↓ val↑ | yellow |\n",
328
  "| `UNSTABLE` | High loss variance | yellow |"
329
- ]
 
330
  },
331
  {
332
  "cell_type": "code",
@@ -367,7 +375,8 @@
367
  " print(f\" {icon} {name:25s} expected={expected:20s} got={actual}\")\n",
368
  "passed = sum(1 for *_, m in results if m == \"PASS\")\n",
369
  "print(f\"\\n {passed}/{len(results)} passed\")"
370
- ]
 
371
  },
372
  {
373
  "cell_type": "code",
@@ -380,7 +389,8 @@
380
  "\n",
381
  "result = LossDebugger.diagnose_status(vocab_size, metrics_history)\n",
382
  "print(f\"\\nReal checkpoint diagnosis: {result['status']}\")"
383
- ]
 
384
  },
385
  {
386
  "cell_type": "markdown",
@@ -396,7 +406,8 @@
396
  "4. **Single-batch overfit** — repeat one batch for 200 steps → verify loss ≈ 0\n",
397
  "5. **Tokenizer round-trip** — verify text is preserved after encode → decode\n",
398
  "6. **Data quality** — visually inspect sample text"
399
- ]
 
400
  },
401
  {
402
  "cell_type": "code",
@@ -416,7 +427,8 @@
416
  "print(f\"\\nPassed: {len(level1['passed'])}, Failed: {len(level1['failed'])}\")\n",
417
  "for f in level1['failed']:\n",
418
  " print(f\" FAILED: {f['name']} — {f['detail']}\")"
419
- ]
 
420
  },
421
  {
422
  "cell_type": "markdown",
@@ -453,7 +465,8 @@
453
  "| Loss → Inf | log of 0 | Add eps, set ignore_index |\n",
454
  "| Loss oscillating | fp16 gradient underflow | Switch to bf16 or use GradScaler |\n",
455
  "| Late-stage NaN | Gradual activation increase | Verify RMSNorm, check weight decay |"
456
- ]
 
457
  },
458
  {
459
  "cell_type": "code",
@@ -467,7 +480,8 @@
467
  " device=device,\n",
468
  " dtype=dtype,\n",
469
  ")"
470
- ]
 
471
  },
472
  {
473
  "cell_type": "markdown",
@@ -506,7 +520,8 @@
506
  "- Batch ×2 → LR ×√2 (square root scaling, safer)\n",
507
  "- Batch ×2 → LR ×2 (linear scaling, used in GPT-3 etc.)\n",
508
  "- Effective batch 64~512 is typical for 1B models"
509
- ]
 
510
  },
511
  {
512
  "cell_type": "code",
@@ -518,7 +533,8 @@
518
  " metrics_history=mock_history_a,\n",
519
  " config=train_config,\n",
520
  ")"
521
- ]
 
522
  },
523
  {
524
  "cell_type": "markdown",
@@ -531,7 +547,8 @@
531
  "\n",
532
  "> May take some time (approximately 1 minute for debug_10m).\n",
533
  "> Run this once before starting actual training."
534
- ]
 
535
  },
536
  {
537
  "cell_type": "code",
@@ -548,7 +565,8 @@
548
  ")\n",
549
  "\n",
550
  "print(f\"\\nSuggested peak LR: {lr_result['suggested_lr']:.2e}\")"
551
- ]
 
552
  },
553
  {
554
  "cell_type": "markdown",
@@ -586,7 +604,8 @@
586
  " - Pythia, TinyLlama, OLMo, LLaMA all use dropout=0\n",
587
  " - With sufficient data, the data itself is the best regularization\n",
588
  " - Dropout is useful for fine-tuning with small data"
589
- ]
 
590
  },
591
  {
592
  "cell_type": "code",
@@ -602,7 +621,8 @@
602
  " model_params=model_params,\n",
603
  " total_tokens=total_tokens,\n",
604
  ")"
605
- ]
 
606
  },
607
  {
608
  "cell_type": "markdown",
@@ -634,7 +654,8 @@
634
  "| RoPE → absolute positional embedding | Small difference for short sequences | Check RoPE implementation |\n",
635
  "| SwiGLU → ReLU FFN | Loss +0.05~0.15 | Check SwiGLU implementation |\n",
636
  "| GQA → MHA | Almost same loss (memory difference only) | KV repeat bug |"
637
- ]
 
638
  },
639
  {
640
  "cell_type": "code",
@@ -649,115 +670,14 @@
649
  ")\n",
650
  "\n",
651
  "print(f\"\\n>>> Diagnosis: {level5['diagnosis']}\")"
652
- ]
 
653
  },
654
  {
655
  "cell_type": "markdown",
656
  "metadata": {},
657
  "source": [
658
- "## 7. Scenario Auto-Detection\n",
659
- "\n",
660
- "Analyzes `metrics_history` to automatically identify which of the 4 scenarios applies:\n",
661
- "\n",
662
- "| Scenario | Symptom | Main Cause | Key Diagnostic |\n",
663
- "|----------|---------|-----------|----------------|\n",
664
- "| **A** | Loss stuck near ~10.37 | Data/implementation bug | Single-batch overfit test |\n",
665
- "| **B** | Sudden NaN during loss decrease | Numerical instability | Check grad norm just before NaN |\n",
666
- "| **C** | Loss increases after decreasing | LR/data issue | Check train/val simultaneously |\n",
667
- "| **D** | Loss stuck at high value | Insufficient training/LR issue | Check token count, run LR Range Test |\n",
668
- "\n",
669
- "### Scenario-Specific Diagnostic Points\n",
670
- "\n",
671
- "**Scenario A**: \"Loss isn't decreasing from 10.37 at all\"\n",
672
- "1. Single-batch overfit → if fails, model/loss bug\n",
673
- "2. Check if gradients are 0 → missing `optimizer.step()`?\n",
674
- "3. Check `input_ids/targets` shift → data pipeline bug\n",
675
- "4. Check LR → is it set to 0?\n",
676
- "5. Verify `model.train()` is called\n",
677
- "\n",
678
- "**Scenario B**: \"Loss was decreasing then suddenly NaN\"\n",
679
- "1. Inspect the batch data at that step\n",
680
- "2. Check gradient norm spike just before NaN\n",
681
- "3. Compare LR schedule with NaN timing\n",
682
- "4. Mixed precision issue → try reproducing with fp32\n",
683
- " - (Pythia-1B: fp16 → bf16 switch case, Biderman et al. 2023)\n",
684
- "\n",
685
- "**Scenario C**: \"Loss decreased to 3.5 then went back up\"\n",
686
- "1. Check train/val simultaneously:\n",
687
- " - Both up → LR too high\n",
688
- " - Only train up → data quality change (streaming order)\n",
689
- " - Only val up → overfitting starting\n",
690
- "2. Check LR schedule, check data shuffling\n",
691
- "\n",
692
- "**Scenario D**: \"Loss won't go below 4.0\"\n",
693
- "1. Check training token count (if < 5B, insufficient training)\n",
694
- "2. Compare with 100M model\n",
695
- "3. Run LR Range Test\n",
696
- "4. Sample and inspect data quality"
697
- ]
698
- },
699
- {
700
- "cell_type": "code",
701
- "execution_count": null,
702
- "metadata": {},
703
- "outputs": [],
704
- "source": [
705
- "# --- Scenario A ---\n",
706
- "print(\"=\" * 50)\n",
707
- "print(\"Testing Scenario A (loss stuck at initial value)\")\n",
708
- "print(\"=\" * 50)\n",
709
- "scenario_a = LossDebugger.detect_scenario(\n",
710
- " metrics_history=mock_history_a,\n",
711
- " vocab_size=vocab_size,\n",
712
- ")\n",
713
- "print(f\"\\n>>> Detected: Scenario {scenario_a['scenario']}\")\n",
714
- "\n",
715
- "# --- Scenario B ---\n",
716
- "print(\"\\n\" + \"=\" * 50)\n",
717
- "print(\"Testing Scenario B (NaN appeared)\")\n",
718
- "print(\"=\" * 50)\n",
719
- "scenario_b = LossDebugger.detect_scenario(\n",
720
- " metrics_history=mock_history_b,\n",
721
- " vocab_size=vocab_size,\n",
722
- ")\n",
723
- "print(f\"\\n>>> Detected: Scenario {scenario_b['scenario']}\")\n",
724
- "\n",
725
- "# --- Scenario C ---\n",
726
- "print(\"\\n\" + \"=\" * 50)\n",
727
- "print(\"Testing Scenario C (loss bounce)\")\n",
728
- "print(\"=\" * 50)\n",
729
- "scenario_c = LossDebugger.detect_scenario(\n",
730
- " metrics_history=mock_history_c,\n",
731
- " vocab_size=vocab_size,\n",
732
- ")\n",
733
- "print(f\"\\n>>> Detected: Scenario {scenario_c['scenario']}\")\n",
734
- "\n",
735
- "# --- Scenario D ---\n",
736
- "print(\"\\n\" + \"=\" * 50)\n",
737
- "print(\"Testing Scenario D (loss plateau)\")\n",
738
- "print(\"=\" * 50)\n",
739
- "scenario_d = LossDebugger.detect_scenario(\n",
740
- " metrics_history=mock_history_d,\n",
741
- " vocab_size=vocab_size,\n",
742
- ")\n",
743
- "print(f\"\\n>>> Detected: Scenario {scenario_d['scenario']}\")\n",
744
- "\n",
745
- "# --- Normal ---\n",
746
- "print(\"\\n\" + \"=\" * 50)\n",
747
- "print(\"Testing Normal scenario\")\n",
748
- "print(\"=\" * 50)\n",
749
- "scenario_n = LossDebugger.detect_scenario(\n",
750
- " metrics_history=mock_history_normal,\n",
751
- " vocab_size=vocab_size,\n",
752
- ")\n",
753
- "print(f\"\\n>>> Detected: Scenario {scenario_n['scenario']}\")"
754
- ]
755
- },
756
- {
757
- "cell_type": "markdown",
758
- "metadata": {},
759
- "source": [
760
- "## 8. Full Diagnostics (run_diagnostics)\n",
761
  "\n",
762
  "Runs all levels above at once. Use the `levels` parameter to select which levels to run.\n",
763
  "\n",
@@ -770,7 +690,8 @@
770
  "# device=device, dtype=dtype,\n",
771
  "# )\n",
772
  "```"
773
- ]
 
774
  },
775
  {
776
  "cell_type": "code",
@@ -789,16 +710,18 @@
789
  " vocab_size=vocab_size,\n",
790
  " levels=[0, 1, 2, 3, 4, 5],\n",
791
  ")"
792
- ]
 
793
  },
794
  {
795
  "cell_type": "markdown",
796
  "metadata": {},
797
  "source": [
798
- "## 9. Study Roadmap\n",
799
  "\n",
800
  "A systematic learning path for LLM training optimization."
801
- ]
 
802
  },
803
  {
804
  "cell_type": "code",
@@ -807,13 +730,14 @@
807
  "outputs": [],
808
  "source": [
809
  "LossDebugger.print_study_roadmap()"
810
- ]
 
811
  },
812
  {
813
  "cell_type": "markdown",
814
  "metadata": {},
815
  "source": [
816
- "## 10. Debugging Tips\n",
817
  "\n",
818
  "**Recommended order:**\n",
819
  "1. Check status with Level 0 → tells you which level to inspect\n",
@@ -843,7 +767,8 @@
843
  "---\n",
844
  "**Previous step:** Train the model in `03_training.ipynb`. \n",
845
  "**Next step:** Evaluate the trained model in `04_evaluation.ipynb`."
846
- ]
 
847
  }
848
  ],
849
  "metadata": {
 
18
  "Level 4: Fitting Diagnosis ← overfitting vs underfitting\n",
19
  "Level 5: Architecture ← initialization, per-layer activation\n",
20
  "```"
21
+ ],
22
+ "id": "19f7e954"
23
  },
24
  {
25
  "cell_type": "code",
 
29
  "source": [
30
  "# No additional packages required\n",
31
  "# LossDebugger only uses torch and built-in llm_lab modules"
32
+ ],
33
+ "id": "af6a605f"
34
  },
35
  {
36
  "cell_type": "code",
 
57
  "from llm_lab.data import setup_data_pipeline\n",
58
  "from llm_lab.training import LossDebugger\n",
59
  "from llm_lab.utils import auto_configure, get_device"
60
+ ],
61
+ "id": "23b92c8d"
62
  },
63
  {
64
  "cell_type": "markdown",
 
67
  "## 0. Configuration\n",
68
  "\n",
69
  "Use the `debug_10m` preset so it runs quickly even on CPU."
70
+ ],
71
+ "id": "f3daf6ad"
72
  },
73
  {
74
  "cell_type": "code",
 
93
  "print(f\"Device: {device}, dtype: {dtype}\")\n",
94
  "print(f\"Vocab size: {vocab_size:,}\")\n",
95
  "print(f\"Expected initial loss: ln({vocab_size}) = {math.log(vocab_size):.2f}\")"
96
+ ],
97
+ "id": "dc7aa763"
98
  },
99
  {
100
  "cell_type": "code",
 
108
  "\n",
109
  "# --- Data pipeline (GPT-2 tokenizer used automatically) ---\n",
110
  "tokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
111
+ ],
112
+ "id": "6ed5cdad"
113
  },
114
  {
115
  "cell_type": "markdown",
 
124
  "```python\n",
125
  "# metrics_history = trainer.metrics.history\n",
126
  "```"
127
+ ],
128
+ "id": "58a40e59"
129
  },
130
  {
131
  "cell_type": "code",
 
141
  "# --- Scenario A: loss stuck near ln(vocab_size) ---\n",
142
  "# loss barely decreasing (suspected data/implementation bug)\n",
143
  "# diagnose_status condition: loss_change < 0.1\n",
 
144
  "n_steps_a = 200\n",
145
  "mock_history_a = {\n",
146
  " \"step\": list(range(1, n_steps_a + 1)),\n",
 
314
  "}\n",
315
  "print(f\"\\nMock H — train_loss: only 1 step\")\n",
316
  "print(f\" Expected: INSUFFICIENT_DATA (need >= 2 steps)\")"
317
+ ],
318
+ "id": "0fc90eec"
319
  },
320
  {
321
  "cell_type": "markdown",
 
333
  "| `PLATEAU` | Loss stuck at high value | yellow |\n",
334
  "| `OVERFITTING` | train↓ val↑ | yellow |\n",
335
  "| `UNSTABLE` | High loss variance | yellow |"
336
+ ],
337
+ "id": "0eb757cb"
338
  },
339
  {
340
  "cell_type": "code",
 
375
  " print(f\" {icon} {name:25s} expected={expected:20s} got={actual}\")\n",
376
  "passed = sum(1 for *_, m in results if m == \"PASS\")\n",
377
  "print(f\"\\n {passed}/{len(results)} passed\")"
378
+ ],
379
+ "id": "b22fc65a"
380
  },
381
  {
382
  "cell_type": "code",
 
389
  "\n",
390
  "result = LossDebugger.diagnose_status(vocab_size, metrics_history)\n",
391
  "print(f\"\\nReal checkpoint diagnosis: {result['status']}\")"
392
+ ],
393
+ "id": "da4a7552"
394
  },
395
  {
396
  "cell_type": "markdown",
 
406
  "4. **Single-batch overfit** — repeat one batch for 200 steps → verify loss ≈ 0\n",
407
  "5. **Tokenizer round-trip** — verify text is preserved after encode → decode\n",
408
  "6. **Data quality** — visually inspect sample text"
409
+ ],
410
+ "id": "793c1fc8"
411
  },
412
  {
413
  "cell_type": "code",
 
427
  "print(f\"\\nPassed: {len(level1['passed'])}, Failed: {len(level1['failed'])}\")\n",
428
  "for f in level1['failed']:\n",
429
  " print(f\" FAILED: {f['name']} — {f['detail']}\")"
430
+ ],
431
+ "id": "d774e1c8"
432
  },
433
  {
434
  "cell_type": "markdown",
 
465
  "| Loss → Inf | log of 0 | Add eps, set ignore_index |\n",
466
  "| Loss oscillating | fp16 gradient underflow | Switch to bf16 or use GradScaler |\n",
467
  "| Late-stage NaN | Gradual activation increase | Verify RMSNorm, check weight decay |"
468
+ ],
469
+ "id": "eb7e5160"
470
  },
471
  {
472
  "cell_type": "code",
 
480
  " device=device,\n",
481
  " dtype=dtype,\n",
482
  ")"
483
+ ],
484
+ "id": "1beffcec"
485
  },
486
  {
487
  "cell_type": "markdown",
 
520
  "- Batch ×2 → LR ×√2 (square root scaling, safer)\n",
521
  "- Batch ×2 → LR ×2 (linear scaling, used in GPT-3 etc.)\n",
522
  "- Effective batch 64~512 is typical for 1B models"
523
+ ],
524
+ "id": "0cc8f6f9"
525
  },
526
  {
527
  "cell_type": "code",
 
533
  " metrics_history=mock_history_a,\n",
534
  " config=train_config,\n",
535
  ")"
536
+ ],
537
+ "id": "217f1cd3"
538
  },
539
  {
540
  "cell_type": "markdown",
 
547
  "\n",
548
  "> May take some time (approximately 1 minute for debug_10m).\n",
549
  "> Run this once before starting actual training."
550
+ ],
551
+ "id": "fea85ba4"
552
  },
553
  {
554
  "cell_type": "code",
 
565
  ")\n",
566
  "\n",
567
  "print(f\"\\nSuggested peak LR: {lr_result['suggested_lr']:.2e}\")"
568
+ ],
569
+ "id": "bfd64503"
570
  },
571
  {
572
  "cell_type": "markdown",
 
604
  " - Pythia, TinyLlama, OLMo, LLaMA all use dropout=0\n",
605
  " - With sufficient data, the data itself is the best regularization\n",
606
  " - Dropout is useful for fine-tuning with small data"
607
+ ],
608
+ "id": "edb91923"
609
  },
610
  {
611
  "cell_type": "code",
 
621
  " model_params=model_params,\n",
622
  " total_tokens=total_tokens,\n",
623
  ")"
624
+ ],
625
+ "id": "505692ed"
626
  },
627
  {
628
  "cell_type": "markdown",
 
654
  "| RoPE → absolute positional embedding | Small difference for short sequences | Check RoPE implementation |\n",
655
  "| SwiGLU → ReLU FFN | Loss +0.05~0.15 | Check SwiGLU implementation |\n",
656
  "| GQA → MHA | Almost same loss (memory difference only) | KV repeat bug |"
657
+ ],
658
+ "id": "3b65db22"
659
  },
660
  {
661
  "cell_type": "code",
 
670
  ")\n",
671
  "\n",
672
  "print(f\"\\n>>> Diagnosis: {level5['diagnosis']}\")"
673
+ ],
674
+ "id": "e1d7a64e"
675
  },
676
  {
677
  "cell_type": "markdown",
678
  "metadata": {},
679
  "source": [
680
+ "## 7. Full Diagnostics (run_diagnostics)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
  "\n",
682
  "Runs all levels above at once. Use the `levels` parameter to select which levels to run.\n",
683
  "\n",
 
690
  "# device=device, dtype=dtype,\n",
691
  "# )\n",
692
  "```"
693
+ ],
694
+ "id": "8cc8a5f1"
695
  },
696
  {
697
  "cell_type": "code",
 
710
  " vocab_size=vocab_size,\n",
711
  " levels=[0, 1, 2, 3, 4, 5],\n",
712
  ")"
713
+ ],
714
+ "id": "c7f3a147"
715
  },
716
  {
717
  "cell_type": "markdown",
718
  "metadata": {},
719
  "source": [
720
+ "## 8. Study Roadmap\n",
721
  "\n",
722
  "A systematic learning path for LLM training optimization."
723
+ ],
724
+ "id": "093ac866"
725
  },
726
  {
727
  "cell_type": "code",
 
730
  "outputs": [],
731
  "source": [
732
  "LossDebugger.print_study_roadmap()"
733
+ ],
734
+ "id": "fc94ba5f"
735
  },
736
  {
737
  "cell_type": "markdown",
738
  "metadata": {},
739
  "source": [
740
+ "## 9. Debugging Tips\n",
741
  "\n",
742
  "**Recommended order:**\n",
743
  "1. Check status with Level 0 → tells you which level to inspect\n",
 
767
  "---\n",
768
  "**Previous step:** Train the model in `03_training.ipynb`. \n",
769
  "**Next step:** Evaluate the trained model in `04_evaluation.ipynb`."
770
+ ],
771
+ "id": "eb140c33"
772
  }
773
  ],
774
  "metadata": {