faro1219 commited on
Commit
e7f9bae
·
verified ·
1 Parent(s): 715bcf9

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/config.json +14 -0
  3. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/losses_lr.png +0 -0
  4. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr.png +0 -0
  5. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr_iter.png +0 -0
  6. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/training.log +253 -0
  7. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/config.json +14 -0
  8. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/losses_lr.png +0 -0
  9. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr.png +0 -0
  10. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr_iter.png +0 -0
  11. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/training.log +232 -0
  12. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/config.json +14 -0
  13. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr.png +0 -0
  14. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr_iter.png +0 -0
  15. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/training.log +640 -0
  16. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/config.json +14 -0
  17. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/losses_lr.png +0 -0
  18. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr.png +0 -0
  19. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr_iter.png +0 -0
  20. fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/training.log +386 -0
  21. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_000000.pt +3 -0
  22. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_002000.pt +3 -0
  23. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_003000.pt +3 -0
  24. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_004000.pt +3 -0
  25. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_005000.pt +3 -0
  26. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_007000.pt +3 -0
  27. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_008000.pt +3 -0
  28. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_009000.pt +3 -0
  29. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_010000.pt +3 -0
  30. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_011000.pt +3 -0
  31. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_012000.pt +3 -0
  32. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_013000.pt +3 -0
  33. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_014000.pt +3 -0
  34. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_015000.pt +3 -0
  35. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_016000.pt +3 -0
  36. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_017000.pt +3 -0
  37. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_019000.pt +3 -0
  38. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_020000.pt +3 -0
  39. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_25001.pt +3 -0
  40. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_000000.pt +3 -0
  41. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_002000.pt +3 -0
  42. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_003000.pt +3 -0
  43. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_006000.pt +3 -0
  44. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_007000.pt +3 -0
  45. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_008000.pt +3 -0
  46. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_009000.pt +3 -0
  47. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_010000.pt +3 -0
  48. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_011000.pt +3 -0
  49. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_012000.pt +3 -0
  50. fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_013000.pt +3 -0
.gitattributes CHANGED
@@ -172,3 +172,8 @@ fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_
172
  fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
173
  fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
174
  fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
172
  fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
173
  fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
174
  fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
175
+ fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
176
+ fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
177
+ fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
178
+ fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.png filter=lfs diff=lfs merge=lfs -text
179
+ fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.png filter=lfs diff=lfs merge=lfs -text
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gpt2_small",
3
+ "factor_min": 0.6,
4
+ "factor_max": 1.5,
5
+ "factor_num": 10,
6
+ "error": 0.0001,
7
+ "accum_steps": 4,
8
+ "num_iterations": 50,
9
+ "num_checkpoint": 1000,
10
+ "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
11
+ "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
12
+ "timestamp": "250622_035242",
13
+ "raw": false
14
+ }
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/losses_lr.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr_iter.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/training.log ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-25 06:40:04,162 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt
2
+ 2025-06-25 06:42:00,983 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt
3
+ 2025-06-25 06:42:02,949 | INFO | Loaded checkpoint with optimizer: adam
4
+ 2025-06-25 06:42:02,949 | INFO | Current learning rate: 0.0018
5
+ 2025-06-25 06:42:03,551 | INFO | Weight decay: 0.1
6
+ 2025-06-25 06:42:03,551 | INFO | Epsilon: 1e-08
7
+ 2025-06-25 06:42:03,551 | INFO | Loaded 147 first moment (m) buffers
8
+ 2025-06-25 06:42:03,551 | INFO | Loaded 147 second moment (v) buffers
9
+ 2025-06-25 06:42:03,551 | INFO | Optimizer state loading completed!
10
+ 2025-06-25 06:42:05,486 | INFO | Initialized xs with norm: 1.273417
11
+ 2025-06-25 06:42:05,497 | INFO | -------------------------------- EoS --------------------------------
12
+ 2025-06-25 06:42:05,497 | INFO | Starting LR test 1/10: lr=0.0011
13
+ 2025-06-25 06:42:05,497 | INFO | Starting EoS for LR factor 0.6000
14
+ 2025-06-25 06:42:05,497 | INFO | Starting EoS for checkpoint 001000
15
+ 2025-06-25 06:42:05,497 | INFO | Starting EoS for model gpt2_small
16
+ 2025-06-25 06:42:05,497 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
17
+ 2025-06-25 06:42:05,497 | INFO | Starting EoS for num_iterations 50
18
+ 2025-06-25 06:42:05,497 | INFO | Starting EoS for accum_steps 4
19
+ 2025-06-25 06:42:05,497 | INFO | Loading model and checkpoint...
20
+ 2025-06-25 06:42:06,378 | INFO | Wrapping model with DDP...
21
+ 2025-06-25 06:42:06,406 | INFO | Loading state dict...
22
+ 2025-06-25 06:42:06,410 | INFO | Model loaded successfully!
23
+ 2025-06-25 06:42:13,594 | INFO | iter 000000 | lr 0.0011 | loss 4.3392 | norm 13.7445
24
+ 2025-06-25 06:42:18,961 | INFO | iter 000001 | lr 0.0011 | loss 4.4330 | norm 27.2540
25
+ 2025-06-25 06:42:24,075 | INFO | iter 000002 | lr 0.0011 | loss 4.3832 | norm 40.6489
26
+ 2025-06-25 06:42:29,377 | INFO | iter 000003 | lr 0.0011 | loss 4.3913 | norm 53.8958
27
+ 2025-06-25 06:42:34,634 | INFO | iter 000004 | lr 0.0011 | loss 4.3451 | norm 66.9831
28
+ 2025-06-25 06:42:39,893 | INFO | iter 000005 | lr 0.0011 | loss 4.4464 | norm 79.9053
29
+ 2025-06-25 06:42:45,174 | INFO | iter 000006 | lr 0.0011 | loss 4.4459 | norm 92.6583
30
+ 2025-06-25 06:42:50,203 | INFO | iter 000007 | lr 0.0011 | loss 4.4652 | norm 105.2383
31
+ 2025-06-25 06:42:55,371 | INFO | iter 000008 | lr 0.0011 | loss 4.4837 | norm 117.6420
32
+ 2025-06-25 06:43:00,884 | INFO | iter 000009 | lr 0.0011 | loss 4.5257 | norm 129.8671
33
+ 2025-06-25 06:43:06,419 | INFO | iter 000010 | lr 0.0011 | loss 4.5709 | norm 141.9121
34
+ 2025-06-25 06:43:11,725 | INFO | iter 000011 | lr 0.0011 | loss 4.5868 | norm 153.7757
35
+ 2025-06-25 06:43:16,715 | INFO | iter 000012 | lr 0.0011 | loss 4.6285 | norm 165.4574
36
+ 2025-06-25 06:43:21,934 | INFO | iter 000013 | lr 0.0011 | loss 4.4997 | norm 176.9570
37
+ 2025-06-25 06:43:27,054 | INFO | iter 000014 | lr 0.0011 | loss 4.8695 | norm 188.2745
38
+ 2025-06-25 06:43:32,694 | INFO | iter 000015 | lr 0.0011 | loss 4.8345 | norm 199.4108
39
+ 2025-06-25 06:43:37,757 | INFO | iter 000016 | lr 0.0011 | loss 4.7835 | norm 210.3668
40
+ 2025-06-25 06:43:42,881 | INFO | iter 000017 | lr 0.0011 | loss 4.6182 | norm 221.1439
41
+ 2025-06-25 06:43:48,183 | INFO | iter 000018 | lr 0.0011 | loss 4.9529 | norm 231.7438
42
+ 2025-06-25 06:43:53,391 | INFO | iter 000019 | lr 0.0011 | loss 4.7017 | norm 242.1687
43
+ 2025-06-25 06:43:58,710 | INFO | iter 000020 | lr 0.0011 | loss 4.7435 | norm 252.4208
44
+ 2025-06-25 06:44:04,463 | INFO | iter 000021 | lr 0.0011 | loss 4.7189 | norm 262.5029
45
+ 2025-06-25 06:44:09,748 | INFO | iter 000022 | lr 0.0011 | loss 4.8201 | norm 272.4178
46
+ 2025-06-25 06:44:14,874 | INFO | iter 000023 | lr 0.0011 | loss 4.5389 | norm 282.1686
47
+ 2025-06-25 06:44:20,048 | INFO | iter 000024 | lr 0.0011 | loss 4.6885 | norm 291.7586
48
+ 2025-06-25 06:44:25,212 | INFO | iter 000025 | lr 0.0011 | loss 4.6047 | norm 301.1913
49
+ 2025-06-25 06:44:30,907 | INFO | iter 000026 | lr 0.0011 | loss 4.8046 | norm 310.4704
50
+ 2025-06-25 06:44:36,109 | INFO | iter 000027 | lr 0.0011 | loss 4.8286 | norm 319.5996
51
+ 2025-06-25 06:44:41,223 | INFO | iter 000028 | lr 0.0011 | loss 4.4733 | norm 328.5828
52
+ 2025-06-25 06:44:46,307 | INFO | iter 000029 | lr 0.0011 | loss 4.7715 | norm 337.4237
53
+ 2025-06-25 06:44:51,454 | INFO | iter 000030 | lr 0.0011 | loss 4.3984 | norm 346.1263
54
+ 2025-06-25 06:44:56,725 | INFO | iter 000031 | lr 0.0011 | loss 4.5874 | norm 354.6947
55
+ 2025-06-25 06:45:02,103 | INFO | iter 000032 | lr 0.0011 | loss 4.2035 | norm 363.1331
56
+ 2025-06-25 06:45:07,293 | INFO | iter 000033 | lr 0.0011 | loss 3.9937 | norm 371.4456
57
+ 2025-06-25 06:45:12,413 | INFO | iter 000034 | lr 0.0011 | loss 4.2272 | norm 379.6361
58
+ 2025-06-25 06:45:17,562 | INFO | iter 000035 | lr 0.0011 | loss 4.0314 | norm 387.7087
59
+ 2025-06-25 06:45:22,574 | INFO | iter 000036 | lr 0.0011 | loss 4.0062 | norm 395.6674
60
+ 2025-06-25 06:45:27,804 | INFO | iter 000037 | lr 0.0011 | loss 4.2448 | norm 403.5161
61
+ 2025-06-25 06:45:33,292 | INFO | iter 000038 | lr 0.0011 | loss 3.5407 | norm 411.2590
62
+ 2025-06-25 06:45:38,469 | INFO | iter 000039 | lr 0.0011 | loss 3.3566 | norm 418.8998
63
+ 2025-06-25 06:45:43,663 | INFO | iter 000040 | lr 0.0011 | loss 3.4557 | norm 426.4421
64
+ 2025-06-25 06:45:48,847 | INFO | iter 000041 | lr 0.0011 | loss 3.3656 | norm 433.8899
65
+ 2025-06-25 06:45:54,209 | INFO | iter 000042 | lr 0.0011 | loss 2.9874 | norm 441.2469
66
+ 2025-06-25 06:45:59,878 | INFO | iter 000043 | lr 0.0011 | loss 2.7940 | norm 448.5164
67
+ 2025-06-25 06:46:05,083 | INFO | iter 000044 | lr 0.0011 | loss 2.7455 | norm 455.7017
68
+ 2025-06-25 06:46:10,201 | INFO | iter 000045 | lr 0.0011 | loss 2.6695 | norm 462.8061
69
+ 2025-06-25 06:46:15,556 | INFO | iter 000046 | lr 0.0011 | loss 2.5043 | norm 469.8330
70
+ 2025-06-25 06:46:20,540 | INFO | iter 000047 | lr 0.0011 | loss 2.3519 | norm 476.7853
71
+ 2025-06-25 06:46:25,531 | INFO | iter 000048 | lr 0.0011 | loss 1.8817 | norm 483.6662
72
+ 2025-06-25 06:46:31,412 | INFO | iter 000049 | lr 0.0011 | loss 1.3600 | norm 490.4789
73
+ 2025-06-25 06:46:31,413 | INFO | Completed LR test 1/10: lr=0.0011
74
+ 2025-06-25 06:46:31,437 | INFO | -------------------------------- EoS --------------------------------
75
+ 2025-06-25 06:46:31,437 | INFO | Starting LR test 2/10: lr=0.0018
76
+ 2025-06-25 06:46:31,437 | INFO | Starting EoS for LR factor 1.0000
77
+ 2025-06-25 06:46:31,437 | INFO | Starting EoS for checkpoint 001000
78
+ 2025-06-25 06:46:31,437 | INFO | Starting EoS for model gpt2_small
79
+ 2025-06-25 06:46:31,437 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
80
+ 2025-06-25 06:46:31,437 | INFO | Starting EoS for num_iterations 50
81
+ 2025-06-25 06:46:31,437 | INFO | Starting EoS for accum_steps 4
82
+ 2025-06-25 06:46:31,437 | INFO | Loading model and checkpoint...
83
+ 2025-06-25 06:46:32,149 | INFO | Wrapping model with DDP...
84
+ 2025-06-25 06:46:32,538 | INFO | Loading state dict...
85
+ 2025-06-25 06:46:32,542 | INFO | Model loaded successfully!
86
+ 2025-06-25 06:46:39,146 | INFO | iter 000000 | lr 0.0018 | loss 4.3392 | norm 22.8459
87
+ 2025-06-25 06:46:44,139 | INFO | iter 000001 | lr 0.0018 | loss 4.4466 | norm 45.1634
88
+ 2025-06-25 06:46:49,296 | INFO | iter 000002 | lr 0.0018 | loss 4.4217 | norm 67.1257
89
+ 2025-06-25 06:46:54,367 | INFO | iter 000003 | lr 0.0018 | loss 4.4527 | norm 88.6757
90
+ 2025-06-25 06:46:59,621 | INFO | iter 000004 | lr 0.0018 | loss 4.4288 | norm 109.7928
91
+ 2025-06-25 06:47:05,068 | INFO | iter 000005 | lr 0.0018 | loss 4.5626 | norm 130.4703
92
+ 2025-06-25 06:47:10,277 | INFO | iter 000006 | lr 0.0018 | loss 4.5940 | norm 150.7033
93
+ 2025-06-25 06:47:15,415 | INFO | iter 000007 | lr 0.0018 | loss 4.6409 | norm 170.4862
94
+ 2025-06-25 06:47:20,565 | INFO | iter 000008 | lr 0.0018 | loss 4.6550 | norm 189.8151
95
+ 2025-06-25 06:47:25,896 | INFO | iter 000009 | lr 0.0018 | loss 4.7106 | norm 208.6894
96
+ 2025-06-25 06:47:31,787 | INFO | iter 000010 | lr 0.0018 | loss 4.7663 | norm 227.1111
97
+ 2025-06-25 06:47:37,142 | INFO | iter 000011 | lr 0.0018 | loss 4.7738 | norm 245.0833
98
+ 2025-06-25 06:47:42,360 | INFO | iter 000012 | lr 0.0018 | loss 4.7983 | norm 262.6107
99
+ 2025-06-25 06:47:47,575 | INFO | iter 000013 | lr 0.0018 | loss 4.6526 | norm 279.6987
100
+ 2025-06-25 06:47:52,603 | INFO | iter 000014 | lr 0.0018 | loss 5.0270 | norm 296.3541
101
+ 2025-06-25 06:47:57,840 | INFO | iter 000015 | lr 0.0018 | loss 4.9013 | norm 312.5854
102
+ 2025-06-25 06:48:03,561 | INFO | iter 000016 | lr 0.0018 | loss 4.8465 | norm 328.4014
103
+ 2025-06-25 06:48:08,832 | INFO | iter 000017 | lr 0.0018 | loss 4.5672 | norm 343.8123
104
+ 2025-06-25 06:48:13,857 | INFO | iter 000018 | lr 0.0018 | loss 4.7794 | norm 358.8289
105
+ 2025-06-25 06:48:19,012 | INFO | iter 000019 | lr 0.0018 | loss 4.4661 | norm 373.4629
106
+ 2025-06-25 06:48:24,132 | INFO | iter 000020 | lr 0.0018 | loss 4.5023 | norm 387.7265
107
+ 2025-06-25 06:48:29,868 | INFO | iter 000021 | lr 0.0018 | loss 4.2938 | norm 401.6327
108
+ 2025-06-25 06:48:35,428 | INFO | iter 000022 | lr 0.0018 | loss 4.3684 | norm 415.1946
109
+ 2025-06-25 06:48:40,742 | INFO | iter 000023 | lr 0.0018 | loss 3.8490 | norm 428.4261
110
+ 2025-06-25 06:48:45,938 | INFO | iter 000024 | lr 0.0018 | loss 4.0617 | norm 441.3406
111
+ 2025-06-25 06:48:51,129 | INFO | iter 000025 | lr 0.0018 | loss 3.6068 | norm 453.9526
112
+ 2025-06-25 06:48:56,489 | INFO | iter 000026 | lr 0.0018 | loss 4.0782 | norm 466.2762
113
+ 2025-06-25 06:49:02,061 | INFO | iter 000027 | lr 0.0018 | loss 3.8365 | norm 478.3254
114
+ 2025-06-25 06:49:07,402 | INFO | iter 000028 | lr 0.0018 | loss 2.9535 | norm 490.1142
115
+ 2025-06-25 06:49:12,466 | INFO | iter 000029 | lr 0.0018 | loss 3.1557 | norm 501.6562
116
+ 2025-06-25 06:49:17,583 | INFO | iter 000030 | lr 0.0018 | loss 2.7154 | norm 512.9648
117
+ 2025-06-25 06:49:22,770 | INFO | iter 000031 | lr 0.0018 | loss 2.8398 | norm 524.0536
118
+ 2025-06-25 06:49:27,964 | INFO | iter 000032 | lr 0.0018 | loss 1.6209 | norm 534.9364
119
+ 2025-06-25 06:49:33,586 | INFO | iter 000033 | lr 0.0018 | loss 0.8802 | norm 545.6261
120
+ 2025-06-25 06:49:38,805 | INFO | iter 000034 | lr 0.0018 | loss 1.5411 | norm 556.1344
121
+ 2025-06-25 06:49:44,031 | INFO | iter 000035 | lr 0.0018 | loss 0.7415 | norm 566.4735
122
+ 2025-06-25 06:49:49,298 | INFO | iter 000036 | lr 0.0018 | loss 0.3749 | norm 576.6551
123
+ 2025-06-25 06:49:54,505 | INFO | iter 000037 | lr 0.0018 | loss 0.9944 | norm 586.6894
124
+ 2025-06-25 06:50:00,198 | INFO | iter 000038 | lr 0.0018 | loss -1.2345 | norm 596.5885
125
+ 2025-06-25 06:50:05,762 | INFO | iter 000039 | lr 0.0018 | loss -1.4369 | norm 606.3619
126
+ 2025-06-25 06:50:10,887 | INFO | iter 000040 | lr 0.0018 | loss -1.5657 | norm 616.0191
127
+ 2025-06-25 06:50:16,128 | INFO | iter 000041 | lr 0.0018 | loss -2.0936 | norm 625.5695
128
+ 2025-06-25 06:50:21,496 | INFO | iter 000042 | lr 0.0018 | loss -3.7056 | norm 635.0224
129
+ 2025-06-25 06:50:26,613 | INFO | iter 000043 | lr 0.0018 | loss -3.8761 | norm 644.3855
130
+ 2025-06-25 06:50:32,217 | INFO | iter 000044 | lr 0.0018 | loss -4.3276 | norm 653.6660
131
+ 2025-06-25 06:50:37,522 | INFO | iter 000045 | lr 0.0018 | loss -4.6411 | norm 662.8710
132
+ 2025-06-25 06:50:42,783 | INFO | iter 000046 | lr 0.0018 | loss -5.7818 | norm 672.0079
133
+ 2025-06-25 06:50:47,790 | INFO | iter 000047 | lr 0.0018 | loss -5.7992 | norm 681.0823
134
+ 2025-06-25 06:50:52,994 | INFO | iter 000048 | lr 0.0018 | loss -7.2470 | norm 690.1007
135
+ 2025-06-25 06:50:58,040 | INFO | iter 000049 | lr 0.0018 | loss -9.4092 | norm 699.0692
136
+ 2025-06-25 06:50:58,041 | INFO | Completed LR test 2/10: lr=0.0018
137
+ 2025-06-25 06:50:58,068 | INFO | -------------------------------- EoS --------------------------------
138
+ 2025-06-25 06:50:58,068 | INFO | Starting LR test 3/10: lr=0.0025
139
+ 2025-06-25 06:50:58,068 | INFO | Starting EoS for LR factor 1.4000
140
+ 2025-06-25 06:50:58,068 | INFO | Starting EoS for checkpoint 001000
141
+ 2025-06-25 06:50:58,068 | INFO | Starting EoS for model gpt2_small
142
+ 2025-06-25 06:50:58,068 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
143
+ 2025-06-25 06:50:58,068 | INFO | Starting EoS for num_iterations 50
144
+ 2025-06-25 06:50:58,068 | INFO | Starting EoS for accum_steps 4
145
+ 2025-06-25 06:50:58,068 | INFO | Loading model and checkpoint...
146
+ 2025-06-25 06:50:58,772 | INFO | Wrapping model with DDP...
147
+ 2025-06-25 06:50:59,088 | INFO | Loading state dict...
148
+ 2025-06-25 06:50:59,091 | INFO | Model loaded successfully!
149
+ 2025-06-25 06:51:05,754 | INFO | iter 000000 | lr 0.0025 | loss 4.3392 | norm 31.9607
150
+ 2025-06-25 06:51:10,755 | INFO | iter 000001 | lr 0.0025 | loss 4.4680 | norm 62.8866
151
+ 2025-06-25 06:51:15,818 | INFO | iter 000002 | lr 0.0025 | loss 4.4710 | norm 93.1217
152
+ 2025-06-25 06:51:20,907 | INFO | iter 000003 | lr 0.0025 | loss 4.5262 | norm 122.5640
153
+ 2025-06-25 06:51:26,040 | INFO | iter 000004 | lr 0.0025 | loss 4.5248 | norm 151.1809
154
+ 2025-06-25 06:51:31,812 | INFO | iter 000005 | lr 0.0025 | loss 4.6891 | norm 178.9703
155
+ 2025-06-25 06:51:37,067 | INFO | iter 000006 | lr 0.0025 | loss 4.7442 | norm 205.9325
156
+ 2025-06-25 06:51:42,230 | INFO | iter 000007 | lr 0.0025 | loss 4.8070 | norm 232.0657
157
+ 2025-06-25 06:51:47,272 | INFO | iter 000008 | lr 0.0025 | loss 4.7906 | norm 257.3720
158
+ 2025-06-25 06:51:52,535 | INFO | iter 000009 | lr 0.0025 | loss 4.8344 | norm 281.8610
159
+ 2025-06-25 06:51:57,924 | INFO | iter 000010 | lr 0.0025 | loss 4.8678 | norm 305.5477
160
+ 2025-06-25 06:52:03,450 | INFO | iter 000011 | lr 0.0025 | loss 4.8304 | norm 328.4503
161
+ 2025-06-25 06:52:08,629 | INFO | iter 000012 | lr 0.0025 | loss 4.7894 | norm 350.5884
162
+ 2025-06-25 06:52:13,898 | INFO | iter 000013 | lr 0.0025 | loss 4.5864 | norm 371.9838
163
+ 2025-06-25 06:52:19,143 | INFO | iter 000014 | lr 0.0025 | loss 4.9290 | norm 392.6606
164
+ 2025-06-25 06:52:24,307 | INFO | iter 000015 | lr 0.0025 | loss 4.6104 | norm 412.6456
165
+ 2025-06-25 06:52:29,763 | INFO | iter 000016 | lr 0.0025 | loss 4.4951 | norm 431.9664
166
+ 2025-06-25 06:52:34,877 | INFO | iter 000017 | lr 0.0025 | loss 4.0138 | norm 450.6518
167
+ 2025-06-25 06:52:40,147 | INFO | iter 000018 | lr 0.0025 | loss 3.9656 | norm 468.7319
168
+ 2025-06-25 06:52:45,265 | INFO | iter 000019 | lr 0.0025 | loss 3.5134 | norm 486.2377
169
+ 2025-06-25 06:52:50,435 | INFO | iter 000020 | lr 0.0025 | loss 3.4981 | norm 503.2006
170
+ 2025-06-25 06:52:55,571 | INFO | iter 000021 | lr 0.0025 | loss 3.0034 | norm 519.6526
171
+ 2025-06-25 06:53:01,174 | INFO | iter 000022 | lr 0.0025 | loss 3.0086 | norm 535.6248
172
+ 2025-06-25 06:53:06,599 | INFO | iter 000023 | lr 0.0025 | loss 2.0425 | norm 551.1493
173
+ 2025-06-25 06:53:11,956 | INFO | iter 000024 | lr 0.0025 | loss 2.3420 | norm 566.2564
174
+ 2025-06-25 06:53:17,132 | INFO | iter 000025 | lr 0.0025 | loss 1.2484 | norm 580.9775
175
+ 2025-06-25 06:53:22,381 | INFO | iter 000026 | lr 0.0025 | loss 1.9663 | norm 595.3424
176
+ 2025-06-25 06:53:27,583 | INFO | iter 000027 | lr 0.0025 | loss 1.4790 | norm 609.3793
177
+ 2025-06-25 06:53:33,275 | INFO | iter 000028 | lr 0.0025 | loss -0.2816 | norm 623.1166
178
+ 2025-06-25 06:53:38,353 | INFO | iter 000029 | lr 0.0025 | loss -0.2689 | norm 636.5806
179
+ 2025-06-25 06:53:43,494 | INFO | iter 000030 | lr 0.0025 | loss -0.8742 | norm 649.7966
180
+ 2025-06-25 06:53:48,723 | INFO | iter 000031 | lr 0.0025 | loss -0.8715 | norm 662.7894
181
+ 2025-06-25 06:59:15,808 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt
182
+ 2025-06-25 06:59:17,627 | INFO | Loaded checkpoint with optimizer: adam
183
+ 2025-06-25 06:59:17,627 | INFO | Current learning rate: 0.0018
184
+ 2025-06-25 06:59:18,250 | INFO | Weight decay: 0.1
185
+ 2025-06-25 06:59:18,250 | INFO | Epsilon: 1e-08
186
+ 2025-06-25 06:59:18,251 | INFO | Loaded 147 first moment (m) buffers
187
+ 2025-06-25 06:59:18,251 | INFO | Loaded 147 second moment (v) buffers
188
+ 2025-06-25 06:59:18,251 | INFO | Optimizer state loading completed!
189
+ 2025-06-25 06:59:20,150 | INFO | Initialized xs with norm: 1.273537
190
+ 2025-06-25 06:59:20,157 | INFO | -------------------------------- EoS --------------------------------
191
+ 2025-06-25 06:59:20,157 | INFO | Starting LR test 1/10: lr=0.0025
192
+ 2025-06-25 06:59:20,157 | INFO | Starting EoS for LR factor 1.4000
193
+ 2025-06-25 06:59:20,158 | INFO | Starting EoS for checkpoint 001000
194
+ 2025-06-25 06:59:20,158 | INFO | Starting EoS for model gpt2_small
195
+ 2025-06-25 06:59:20,158 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
196
+ 2025-06-25 06:59:20,158 | INFO | Starting EoS for num_iterations 50
197
+ 2025-06-25 06:59:20,158 | INFO | Starting EoS for accum_steps 4
198
+ 2025-06-25 06:59:20,158 | INFO | Loading model and checkpoint...
199
+ 2025-06-25 06:59:21,280 | INFO | Wrapping model with DDP...
200
+ 2025-06-25 06:59:21,347 | INFO | Loading state dict...
201
+ 2025-06-25 06:59:21,350 | INFO | Model loaded successfully!
202
+ 2025-06-25 06:59:29,329 | INFO | iter 000000 | lr 0.0025 | loss 4.3393 | norm 15.4976
203
+ 2025-06-25 06:59:34,563 | INFO | iter 000001 | lr 0.0025 | loss 5.0174 | norm 26.2033
204
+ 2025-06-25 06:59:39,547 | INFO | iter 000002 | lr 0.0025 | loss 7.0303 | norm 34.2732
205
+ 2025-06-25 06:59:44,649 | INFO | iter 000003 | lr 0.0025 | loss 4.8583 | norm 42.0572
206
+ 2025-06-25 06:59:49,612 | INFO | iter 000004 | lr 0.0025 | loss 5.5988 | norm 48.8932
207
+ 2025-06-25 06:59:54,632 | INFO | iter 000005 | lr 0.0025 | loss 4.8926 | norm 55.2742
208
+ 2025-06-25 07:00:00,343 | INFO | iter 000006 | lr 0.0025 | loss 4.0337 | norm 61.4189
209
+ 2025-06-25 07:00:05,824 | INFO | iter 000007 | lr 0.0025 | loss 3.8261 | norm 67.2519
210
+ 2025-06-25 07:00:10,921 | INFO | iter 000008 | lr 0.0025 | loss 1.5497 | norm 73.1772
211
+ 2025-06-25 07:00:15,950 | INFO | iter 000009 | lr 0.0025 | loss -0.9868 | norm 79.3588
212
+ 2025-06-25 07:00:21,158 | INFO | iter 000010 | lr 0.0025 | loss -4.7045 | norm 85.9307
213
+ 2025-06-25 07:00:26,356 | INFO | iter 000011 | lr 0.0025 | loss -9.1555 | norm 92.9646
214
+ 2025-06-25 07:00:32,012 | INFO | iter 000012 | lr 0.0025 | loss -18.3042 | norm 100.6413
215
+ 2025-06-25 07:00:37,134 | INFO | iter 000013 | lr 0.0025 | loss -24.4566 | norm 108.9797
216
+ 2025-06-25 07:00:42,361 | INFO | iter 000014 | lr 0.0025 | loss -34.7879 | norm 117.8907
217
+ 2025-06-25 07:00:47,543 | INFO | iter 000015 | lr 0.0025 | loss -56.0771 | norm 127.5276
218
+ 2025-06-25 07:00:52,587 | INFO | iter 000016 | lr 0.0025 | loss -74.3452 | norm 137.8423
219
+ 2025-06-25 07:00:57,843 | INFO | iter 000017 | lr 0.0025 | loss -97.3003 | norm 148.7922
220
+ 2025-06-25 07:01:03,546 | INFO | iter 000018 | lr 0.0025 | loss -128.8613 | norm 160.3333
221
+ 2025-06-25 07:01:08,639 | INFO | iter 000019 | lr 0.0025 | loss -153.6679 | norm 172.4106
222
+ 2025-06-25 07:01:13,817 | INFO | iter 000020 | lr 0.0025 | loss -177.9772 | norm 184.8852
223
+ 2025-06-25 07:01:19,038 | INFO | iter 000021 | lr 0.0025 | loss -212.6209 | norm 197.7662
224
+ 2025-06-25 07:01:24,099 | INFO | iter 000022 | lr 0.0025 | loss -228.8878 | norm 210.9760
225
+ 2025-06-25 07:01:29,523 | INFO | iter 000023 | lr 0.0025 | loss -302.6816 | norm 224.5441
226
+ 2025-06-25 07:01:34,575 | INFO | iter 000024 | lr 0.0025 | loss -313.9990 | norm 238.3132
227
+ 2025-06-25 07:01:39,718 | INFO | iter 000025 | lr 0.0025 | loss -412.0068 | norm 252.3185
228
+ 2025-06-25 07:01:44,741 | INFO | iter 000026 | lr 0.0025 | loss -436.6941 | norm 266.0395
229
+ 2025-06-25 07:01:50,024 | INFO | iter 000027 | lr 0.0025 | loss -440.2226 | norm 279.5380
230
+ 2025-06-25 07:01:55,265 | INFO | iter 000028 | lr 0.0025 | loss -601.8568 | norm 293.2904
231
+ 2025-06-25 07:02:00,592 | INFO | iter 000029 | lr 0.0025 | loss -676.1694 | norm 307.3040
232
+ 2025-06-25 07:02:05,933 | INFO | iter 000030 | lr 0.0025 | loss -704.9308 | norm 321.5338
233
+ 2025-06-25 07:02:11,124 | INFO | iter 000031 | lr 0.0025 | loss -774.7177 | norm 335.7946
234
+ 2025-06-25 07:02:16,435 | INFO | iter 000032 | lr 0.0025 | loss -920.0737 | norm 350.3229
235
+ 2025-06-25 07:02:21,641 | INFO | iter 000033 | lr 0.0025 | loss -1063.0433 | norm 364.4135
236
+ 2025-06-25 07:02:26,771 | INFO | iter 000034 | lr 0.0025 | loss -1016.9738 | norm 378.6853
237
+ 2025-06-25 07:02:32,448 | INFO | iter 000035 | lr 0.0025 | loss -1197.0923 | norm 393.2000
238
+ 2025-06-25 07:02:37,618 | INFO | iter 000036 | lr 0.0025 | loss -1259.9513 | norm 407.9392
239
+ 2025-06-25 07:02:42,716 | INFO | iter 000037 | lr 0.0025 | loss -1382.7266 | norm 422.7625
240
+ 2025-06-25 07:02:47,839 | INFO | iter 000038 | lr 0.0025 | loss -1556.2228 | norm 437.7693
241
+ 2025-06-25 07:02:53,065 | INFO | iter 000039 | lr 0.0025 | loss -1659.4865 | norm 452.8923
242
+ 2025-06-25 07:02:58,112 | INFO | iter 000040 | lr 0.0025 | loss -1659.5458 | norm 468.0710
243
+ 2025-06-25 07:03:03,864 | INFO | iter 000041 | lr 0.0025 | loss -1773.1254 | norm 483.1063
244
+ 2025-06-25 07:03:09,029 | INFO | iter 000042 | lr 0.0025 | loss -2041.8066 | norm 498.2888
245
+ 2025-06-25 07:03:14,123 | INFO | iter 000043 | lr 0.0025 | loss -2230.4387 | norm 513.6289
246
+ 2025-06-25 07:03:19,337 | INFO | iter 000044 | lr 0.0025 | loss -2268.1362 | norm 529.1135
247
+ 2025-06-25 07:03:24,439 | INFO | iter 000045 | lr 0.0025 | loss -2196.4004 | norm 544.5320
248
+ 2025-06-25 07:03:29,662 | INFO | iter 000046 | lr 0.0025 | loss -2567.3474 | norm 560.0720
249
+ 2025-06-25 07:03:35,121 | INFO | iter 000047 | lr 0.0025 | loss -2551.3745 | norm 575.5949
250
+ 2025-06-25 07:03:40,366 | INFO | iter 000048 | lr 0.0025 | loss -2840.6702 | norm 591.2187
251
+ 2025-06-25 07:03:45,515 | INFO | iter 000049 | lr 0.0025 | loss -3193.7876 | norm 606.9574
252
+ 2025-06-25 07:03:45,516 | INFO | Completed LR test 1/10: lr=0.0025
253
+ 2025-06-25 07:03:45,809 | INFO | Cleanup complete
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gpt2_small",
3
+ "factor_min": 0.6,
4
+ "factor_max": 1.5,
5
+ "factor_num": 10,
6
+ "error": 0.0001,
7
+ "accum_steps": 4,
8
+ "num_iterations": 50,
9
+ "num_checkpoint": 2000,
10
+ "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
11
+ "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
12
+ "timestamp": "250622_035242",
13
+ "raw": false
14
+ }
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/losses_lr.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr_iter.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/training.log ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-25 07:04:49,319 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
2
+ 2025-06-25 07:04:50,921 | INFO | Loaded checkpoint with optimizer: adam
3
+ 2025-06-25 07:04:50,922 | INFO | Current learning rate: 0.0018
4
+ 2025-06-25 07:04:51,534 | INFO | Weight decay: 0.1
5
+ 2025-06-25 07:04:51,534 | INFO | Epsilon: 1e-08
6
+ 2025-06-25 07:04:51,534 | INFO | Loaded 147 first moment (m) buffers
7
+ 2025-06-25 07:04:51,534 | INFO | Loaded 147 second moment (v) buffers
8
+ 2025-06-25 07:04:51,534 | INFO | Optimizer state loading completed!
9
+ 2025-06-25 07:04:53,371 | INFO | Initialized xs with norm: 1.273644
10
+ 2025-06-25 07:04:53,383 | INFO | -------------------------------- EoS --------------------------------
11
+ 2025-06-25 07:04:53,383 | INFO | Starting LR test 1/10: lr=0.0025
12
+ 2025-06-25 07:04:53,383 | INFO | Starting EoS for LR factor 1.4000
13
+ 2025-06-25 07:04:53,383 | INFO | Starting EoS for checkpoint 002000
14
+ 2025-06-25 07:04:53,383 | INFO | Starting EoS for model gpt2_small
15
+ 2025-06-25 07:04:53,383 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
16
+ 2025-06-25 07:04:53,383 | INFO | Starting EoS for num_iterations 50
17
+ 2025-06-25 07:04:53,383 | INFO | Starting EoS for accum_steps 4
18
+ 2025-06-25 07:04:53,383 | INFO | Loading model and checkpoint...
19
+ 2025-06-25 07:04:54,128 | INFO | Wrapping model with DDP...
20
+ 2025-06-25 07:04:54,613 | INFO | Loading state dict...
21
+ 2025-06-25 07:04:54,617 | INFO | Model loaded successfully!
22
+ 2025-06-25 07:05:02,467 | INFO | iter 000000 | lr 0.0025 | loss 3.9757 | norm 15.2051
23
+ 2025-06-25 07:05:07,565 | INFO | iter 000001 | lr 0.0025 | loss 4.3233 | norm 25.7014
24
+ 2025-06-25 07:05:12,643 | INFO | iter 000002 | lr 0.0025 | loss 5.8232 | norm 33.3926
25
+ 2025-06-25 07:05:17,743 | INFO | iter 000003 | lr 0.0025 | loss 4.4479 | norm 40.7412
26
+ 2025-06-25 07:05:22,835 | INFO | iter 000004 | lr 0.0025 | loss 5.0561 | norm 47.0897
27
+ 2025-06-25 07:05:27,984 | INFO | iter 000005 | lr 0.0025 | loss 4.2371 | norm 53.4390
28
+ 2025-06-25 07:05:33,593 | INFO | iter 000006 | lr 0.0025 | loss 4.2628 | norm 59.3858
29
+ 2025-06-25 07:05:38,622 | INFO | iter 000007 | lr 0.0025 | loss 4.4517 | norm 64.9470
30
+ 2025-06-25 07:05:43,671 | INFO | iter 000008 | lr 0.0025 | loss 3.7213 | norm 70.3563
31
+ 2025-06-25 07:05:48,942 | INFO | iter 000009 | lr 0.0025 | loss 3.4583 | norm 75.6566
32
+ 2025-06-25 07:05:54,202 | INFO | iter 000010 | lr 0.0025 | loss 3.2253 | norm 80.8225
33
+ 2025-06-25 07:05:59,788 | INFO | iter 000011 | lr 0.0025 | loss 2.1490 | norm 86.0380
34
+ 2025-06-25 07:06:05,316 | INFO | iter 000012 | lr 0.0025 | loss 0.5857 | norm 91.4942
35
+ 2025-06-25 07:06:10,581 | INFO | iter 000013 | lr 0.0025 | loss -0.7333 | norm 97.2915
36
+ 2025-06-25 07:06:15,719 | INFO | iter 000014 | lr 0.0025 | loss -2.5905 | norm 103.4982
37
+ 2025-06-25 07:06:20,943 | INFO | iter 000015 | lr 0.0025 | loss -6.6798 | norm 110.1739
38
+ 2025-06-25 07:08:00,350 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
39
+ 2025-06-25 07:08:02,106 | INFO | Loaded checkpoint with optimizer: adam
40
+ 2025-06-25 07:08:02,107 | INFO | Current learning rate: 0.0018
41
+ 2025-06-25 07:08:02,720 | INFO | Weight decay: 0.1
42
+ 2025-06-25 07:08:02,720 | INFO | Epsilon: 1e-08
43
+ 2025-06-25 07:08:02,721 | INFO | Loaded 147 first moment (m) buffers
44
+ 2025-06-25 07:08:02,721 | INFO | Loaded 147 second moment (v) buffers
45
+ 2025-06-25 07:08:02,721 | INFO | Optimizer state loading completed!
46
+ 2025-06-25 07:08:04,562 | INFO | Initialized xs with norm: 1.273412
47
+ 2025-06-25 07:08:04,573 | INFO | -------------------------------- EoS --------------------------------
48
+ 2025-06-25 07:08:04,574 | INFO | Starting LR test 1/10: lr=0.0090
49
+ 2025-06-25 07:08:04,574 | INFO | Starting EoS for LR factor 5.0000
50
+ 2025-06-25 07:08:04,574 | INFO | Starting EoS for checkpoint 002000
51
+ 2025-06-25 07:08:04,574 | INFO | Starting EoS for model gpt2_small
52
+ 2025-06-25 07:08:04,574 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
53
+ 2025-06-25 07:08:04,574 | INFO | Starting EoS for num_iterations 50
54
+ 2025-06-25 07:08:04,574 | INFO | Starting EoS for accum_steps 4
55
+ 2025-06-25 07:08:04,574 | INFO | Loading model and checkpoint...
56
+ 2025-06-25 07:08:05,317 | INFO | Wrapping model with DDP...
57
+ 2025-06-25 07:08:05,851 | INFO | Loading state dict...
58
+ 2025-06-25 07:08:05,855 | INFO | Model loaded successfully!
59
+ 2025-06-25 07:08:12,930 | INFO | iter 000000 | lr 0.0090 | loss 3.9757 | norm 54.1260
60
+ 2025-06-25 07:08:17,867 | INFO | iter 000001 | lr 0.0090 | loss 7.9464 | norm 93.7556
61
+ 2025-06-25 07:08:22,976 | INFO | iter 000002 | lr 0.0090 | loss 65.9051 | norm 114.5414
62
+ 2025-06-25 07:08:27,990 | INFO | iter 000003 | lr 0.0090 | loss 9.6467 | norm 142.1960
63
+ 2025-06-25 07:08:33,721 | INFO | iter 000004 | lr 0.0090 | loss 37.9767 | norm 165.1957
64
+ 2025-06-25 07:08:38,762 | INFO | iter 000005 | lr 0.0090 | loss 34.2442 | norm 186.2563
65
+ 2025-06-25 07:08:44,002 | INFO | iter 000006 | lr 0.0090 | loss 12.5688 | norm 207.7581
66
+ 2025-06-25 07:08:49,138 | INFO | iter 000007 | lr 0.0090 | loss 13.4724 | norm 228.4881
67
+ 2025-06-25 07:08:54,269 | INFO | iter 000008 | lr 0.0090 | loss 18.9169 | norm 247.7513
68
+ 2025-06-25 07:08:59,417 | INFO | iter 000009 | lr 0.0090 | loss 14.7739 | norm 265.9009
69
+ 2025-06-25 07:09:04,859 | INFO | iter 000010 | lr 0.0090 | loss 4.6113 | norm 283.6754
70
+ 2025-06-25 07:09:10,221 | INFO | iter 000011 | lr 0.0090 | loss -2.8853 | norm 301.4828
71
+ 2025-06-25 07:09:15,414 | INFO | iter 000012 | lr 0.0090 | loss -8.5041 | norm 319.3103
72
+ 2025-06-25 07:09:20,524 | INFO | iter 000013 | lr 0.0090 | loss -16.0165 | norm 337.3585
73
+ 2025-06-25 07:09:25,783 | INFO | iter 000014 | lr 0.0090 | loss -30.7357 | norm 356.0018
74
+ 2025-06-25 07:09:31,353 | INFO | iter 000015 | lr 0.0090 | loss -59.7186 | norm 375.0746
75
+ 2025-06-25 07:09:36,559 | INFO | iter 000016 | lr 0.0090 | loss -85.9098 | norm 395.1222
76
+ 2025-06-25 07:09:41,686 | INFO | iter 000017 | lr 0.0090 | loss -113.7542 | norm 416.7910
77
+ 2025-06-25 07:09:47,076 | INFO | iter 000018 | lr 0.0090 | loss -182.1024 | norm 439.8696
78
+ 2025-06-25 07:27:24,437 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
79
+ 2025-06-25 07:27:26,239 | INFO | Loaded checkpoint with optimizer: adam
80
+ 2025-06-25 07:27:26,240 | INFO | Current learning rate: 0.0018
81
+ 2025-06-25 07:27:26,858 | INFO | Weight decay: 0.1
82
+ 2025-06-25 07:27:26,858 | INFO | Epsilon: 1e-08
83
+ 2025-06-25 07:27:26,858 | INFO | Loaded 147 first moment (m) buffers
84
+ 2025-06-25 07:27:26,858 | INFO | Loaded 147 second moment (v) buffers
85
+ 2025-06-25 07:27:26,858 | INFO | Optimizer state loading completed!
86
+ 2025-06-25 07:27:29,212 | INFO | Initialized xs with norm: 1.273458
87
+ 2025-06-25 07:27:29,221 | INFO | -------------------------------- EoS --------------------------------
88
+ 2025-06-25 07:27:29,221 | INFO | Starting LR test 1/10: lr=0.0180
89
+ 2025-06-25 07:27:29,221 | INFO | Starting EoS for LR factor 10.0000
90
+ 2025-06-25 07:27:29,221 | INFO | Starting EoS for checkpoint 002000
91
+ 2025-06-25 07:27:29,221 | INFO | Starting EoS for model gpt2_small
92
+ 2025-06-25 07:27:29,221 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
93
+ 2025-06-25 07:27:29,221 | INFO | Starting EoS for num_iterations 50
94
+ 2025-06-25 07:27:29,221 | INFO | Starting EoS for accum_steps 4
95
+ 2025-06-25 07:27:29,221 | INFO | Loading model and checkpoint...
96
+ 2025-06-25 07:27:29,979 | INFO | Wrapping model with DDP...
97
+ 2025-06-25 07:27:30,468 | INFO | Loading state dict...
98
+ 2025-06-25 07:27:30,471 | INFO | Model loaded successfully!
99
+ 2025-06-25 07:27:37,640 | INFO | iter 000000 | lr 0.0180 | loss 3.9757 | norm 108.2326
100
+ 2025-06-25 07:27:42,495 | INFO | iter 000001 | lr 0.0180 | loss 20.5162 | norm 189.4878
101
+ 2025-06-25 07:27:47,595 | INFO | iter 000002 | lr 0.0180 | loss 292.6721 | norm 227.1594
102
+ 2025-06-25 07:27:52,527 | INFO | iter 000003 | lr 0.0180 | loss 30.5704 | norm 282.5874
103
+ 2025-06-25 07:27:57,708 | INFO | iter 000004 | lr 0.0180 | loss 145.1973 | norm 329.8537
104
+ 2025-06-25 07:28:03,188 | INFO | iter 000005 | lr 0.0180 | loss 154.9424 | norm 371.7672
105
+ 2025-06-25 07:28:08,432 | INFO | iter 000006 | lr 0.0180 | loss 54.9122 | norm 414.5362
106
+ 2025-06-25 07:28:13,591 | INFO | iter 000007 | lr 0.0180 | loss 38.6102 | norm 456.6207
107
+ 2025-06-25 07:28:18,809 | INFO | iter 000008 | lr 0.0180 | loss 66.4117 | norm 495.8205
108
+ 2025-06-25 07:28:23,750 | INFO | iter 000009 | lr 0.0180 | loss 67.2475 | norm 531.9467
109
+ 2025-06-25 07:28:29,478 | INFO | iter 000010 | lr 0.0180 | loss 27.9346 | norm 566.8688
110
+ 2025-06-25 07:28:34,849 | INFO | iter 000011 | lr 0.0180 | loss -10.5653 | norm 601.7505
111
+ 2025-06-25 07:28:40,026 | INFO | iter 000012 | lr 0.0180 | loss -32.7803 | norm 636.7070
112
+ 2025-06-25 07:28:45,181 | INFO | iter 000013 | lr 0.0180 | loss -49.7714 | norm 671.7962
113
+ 2025-06-25 07:28:50,620 | INFO | iter 000014 | lr 0.0180 | loss -94.9201 | norm 707.5554
114
+ 2025-06-25 07:28:55,620 | INFO | iter 000015 | lr 0.0180 | loss -195.6995 | norm 743.8602
115
+ 2025-06-25 07:29:00,782 | INFO | iter 000016 | lr 0.0180 | loss -289.3528 | norm 781.8421
116
+ 2025-06-25 07:29:05,969 | INFO | iter 000017 | lr 0.0180 | loss -385.6000 | norm 822.7772
117
+ 2025-06-25 07:29:11,247 | INFO | iter 000018 | lr 0.0180 | loss -617.5768 | norm 866.3561
118
+ 2025-06-25 07:29:16,457 | INFO | iter 000019 | lr 0.0180 | loss -758.4005 | norm 913.5998
119
+ 2025-06-25 07:29:21,562 | INFO | iter 000020 | lr 0.0180 | loss -962.2928 | norm 964.2577
120
+ 2025-06-25 07:29:26,751 | INFO | iter 000021 | lr 0.0180 | loss -1308.8584 | norm 1018.9549
121
+ 2025-06-25 07:29:32,200 | INFO | iter 000022 | lr 0.0180 | loss -1507.0786 | norm 1075.5761
122
+ 2025-06-25 07:29:37,305 | INFO | iter 000023 | lr 0.0180 | loss -1928.5552 | norm 1136.3061
123
+ 2025-06-25 07:29:42,442 | INFO | iter 000024 | lr 0.0180 | loss -2365.0591 | norm 1200.9531
124
+ 2025-06-25 07:29:47,581 | INFO | iter 000025 | lr 0.0180 | loss -2911.7729 | norm 1269.1230
125
+ 2025-06-25 07:29:52,848 | INFO | iter 000026 | lr 0.0180 | loss -3918.7095 | norm 1340.4335
126
+ 2025-06-25 07:29:58,025 | INFO | iter 000027 | lr 0.0180 | loss -4075.2781 | norm 1415.0733
127
+ 2025-06-25 07:30:03,634 | INFO | iter 000028 | lr 0.0180 | loss -4800.8032 | norm 1492.5634
128
+ 2025-06-25 07:30:08,717 | INFO | iter 000029 | lr 0.0180 | loss -5429.1694 | norm 1572.5761
129
+ 2025-06-25 07:30:13,912 | INFO | iter 000030 | lr 0.0180 | loss -6909.1343 | norm 1655.3578
130
+ 2025-06-25 07:30:19,088 | INFO | iter 000031 | lr 0.0180 | loss -7403.7188 | norm 1740.2757
131
+ 2025-06-25 07:30:24,170 | INFO | iter 000032 | lr 0.0180 | loss -8883.3643 | norm 1827.2884
132
+ 2025-06-25 07:30:29,463 | INFO | iter 000033 | lr 0.0180 | loss -9913.4092 | norm 1916.4071
133
+ 2025-06-25 07:30:34,811 | INFO | iter 000034 | lr 0.0180 | loss -12094.2510 | norm 2007.2927
134
+ 2025-06-25 07:30:40,049 | INFO | iter 000035 | lr 0.0180 | loss -13123.3652 | norm 2099.8649
135
+ 2025-06-25 07:30:45,184 | INFO | iter 000036 | lr 0.0180 | loss -13453.2988 | norm 2187.7686
136
+ 2025-06-25 07:30:50,352 | INFO | iter 000037 | lr 0.0180 | loss -15590.8887 | norm 2277.7149
137
+ 2025-06-25 07:30:55,349 | INFO | iter 000038 | lr 0.0180 | loss -17174.6211 | norm 2369.4460
138
+ 2025-06-25 07:31:00,515 | INFO | iter 000039 | lr 0.0180 | loss -18859.8008 | norm 2462.6141
139
+ 2025-06-25 07:31:05,868 | INFO | iter 000040 | lr 0.0180 | loss -22396.0918 | norm 2557.2235
140
+ 2025-06-25 07:31:11,195 | INFO | iter 000041 | lr 0.0180 | loss -23291.8730 | norm 2652.7654
141
+ 2025-06-25 07:31:16,272 | INFO | iter 000042 | lr 0.0180 | loss -24466.0820 | norm 2748.3429
142
+ 2025-06-25 07:31:21,340 | INFO | iter 000043 | lr 0.0180 | loss -26257.3105 | norm 2844.3545
143
+ 2025-06-25 07:31:26,604 | INFO | iter 000044 | lr 0.0180 | loss -28454.4160 | norm 2941.0400
144
+ 2025-06-25 07:31:32,098 | INFO | iter 000045 | lr 0.0180 | loss -30838.6445 | norm 3038.6056
145
+ 2025-06-25 07:31:37,115 | INFO | iter 000046 | lr 0.0180 | loss -32684.4766 | norm 3136.7237
146
+ 2025-06-25 07:31:42,152 | INFO | iter 000047 | lr 0.0180 | loss -39129.3398 | norm 3235.0764
147
+ 2025-06-25 07:31:47,357 | INFO | iter 000048 | lr 0.0180 | loss -37924.5391 | norm 3333.8420
148
+ 2025-06-25 07:31:52,616 | INFO | iter 000049 | lr 0.0180 | loss -39045.4688 | norm 3432.5284
149
+ 2025-06-25 07:31:52,617 | INFO | Completed LR test 1/10: lr=0.0180
150
+ 2025-06-25 07:31:52,965 | INFO | Cleanup complete
151
+ 2025-06-25 08:00:39,916 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
152
+ 2025-06-25 08:00:41,765 | INFO | Loaded checkpoint with optimizer: adam
153
+ 2025-06-25 08:00:41,766 | INFO | Current learning rate: 0.0018
154
+ 2025-06-25 08:00:42,410 | INFO | Weight decay: 0.1
155
+ 2025-06-25 08:00:42,411 | INFO | Epsilon: 1e-08
156
+ 2025-06-25 08:00:42,411 | INFO | Loaded 147 first moment (m) buffers
157
+ 2025-06-25 08:00:42,411 | INFO | Loaded 147 second moment (v) buffers
158
+ 2025-06-25 08:00:42,411 | INFO | Optimizer state loading completed!
159
+ 2025-06-25 08:00:44,469 | INFO | Initialized xs with norm: 1.273415
160
+ 2025-06-25 08:00:44,473 | INFO | -------------------------------- EoS --------------------------------
161
+ 2025-06-25 08:00:44,473 | INFO | Starting LR test 1/10: lr=0.0180
162
+ 2025-06-25 08:00:44,473 | INFO | Starting EoS for LR factor 10.0000
163
+ 2025-06-25 08:00:44,474 | INFO | Starting EoS for checkpoint 002000
164
+ 2025-06-25 08:00:44,474 | INFO | Starting EoS for model gpt2_small
165
+ 2025-06-25 08:00:44,474 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
166
+ 2025-06-25 08:00:44,474 | INFO | Starting EoS for num_iterations 50
167
+ 2025-06-25 08:00:44,474 | INFO | Starting EoS for accum_steps 4
168
+ 2025-06-25 08:00:44,474 | INFO | Loading model and checkpoint...
169
+ 2025-06-25 08:00:45,423 | INFO | Wrapping model with DDP...
170
+ 2025-06-25 08:00:45,442 | INFO | Loading state dict...
171
+ 2025-06-25 08:00:45,445 | INFO | Model loaded successfully!
172
+ 2025-06-25 08:00:52,795 | INFO | iter 000000 | lr 0.0180 | loss 4.0603 | norm 103.6942
173
+ 2025-06-25 08:00:57,878 | INFO | iter 000001 | lr 0.0180 | loss 14.7676 | norm 181.4721
174
+ 2025-06-25 08:01:03,515 | INFO | iter 000002 | lr 0.0180 | loss 240.4388 | norm 221.1537
175
+ 2025-06-25 08:01:08,778 | INFO | iter 000003 | lr 0.0180 | loss 37.1792 | norm 277.3678
176
+ 2025-06-25 08:01:13,940 | INFO | iter 000004 | lr 0.0180 | loss 125.9179 | norm 326.9716
177
+ 2025-06-25 08:01:19,048 | INFO | iter 000005 | lr 0.0180 | loss 119.1384 | norm 371.6566
178
+ 2025-06-25 08:01:24,067 | INFO | iter 000006 | lr 0.0180 | loss 58.8411 | norm 415.6504
179
+ 2025-06-25 08:01:29,398 | INFO | iter 000007 | lr 0.0180 | loss 50.0099 | norm 457.2432
180
+ 2025-06-25 08:01:34,715 | INFO | iter 000008 | lr 0.0180 | loss 57.1595 | norm 495.9399
181
+ 2025-06-25 08:01:39,808 | INFO | iter 000009 | lr 0.0180 | loss 50.3250 | norm 531.3629
182
+ 2025-06-25 08:01:45,102 | INFO | iter 000010 | lr 0.0180 | loss 18.0556 | norm 566.8195
183
+ 2025-06-25 08:01:50,160 | INFO | iter 000011 | lr 0.0180 | loss -12.1102 | norm 602.2888
184
+ 2025-06-25 08:01:55,242 | INFO | iter 000012 | lr 0.0180 | loss -66.1828 | norm 639.0583
185
+ 2025-06-25 08:02:00,842 | INFO | iter 000013 | lr 0.0180 | loss -103.6073 | norm 677.0137
186
+ 2025-06-25 08:02:05,876 | INFO | iter 000014 | lr 0.0180 | loss -169.7566 | norm 715.7398
187
+ 2025-06-25 08:02:11,234 | INFO | iter 000015 | lr 0.0180 | loss -249.2370 | norm 755.9124
188
+ 2025-06-25 08:02:16,241 | INFO | iter 000016 | lr 0.0180 | loss -360.8513 | norm 798.1494
189
+ 2025-06-25 08:02:21,418 | INFO | iter 000017 | lr 0.0180 | loss -541.1733 | norm 843.1959
190
+ 2025-06-25 08:02:26,629 | INFO | iter 000018 | lr 0.0180 | loss -727.0453 | norm 891.4551
191
+ 2025-06-25 08:02:32,162 | INFO | iter 000019 | lr 0.0180 | loss -957.3318 | norm 943.1777
192
+ 2025-06-25 08:02:43,080 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
193
+ 2025-06-25 08:02:44,945 | INFO | Loaded checkpoint with optimizer: adam
194
+ 2025-06-25 08:02:44,945 | INFO | Current learning rate: 0.0018
195
+ 2025-06-25 08:02:45,555 | INFO | Weight decay: 0.1
196
+ 2025-06-25 08:02:45,555 | INFO | Epsilon: 1e-08
197
+ 2025-06-25 08:02:45,555 | INFO | Loaded 147 first moment (m) buffers
198
+ 2025-06-25 08:02:45,555 | INFO | Loaded 147 second moment (v) buffers
199
+ 2025-06-25 08:02:45,555 | INFO | Optimizer state loading completed!
200
+ 2025-06-25 08:02:47,458 | INFO | Initialized xs with norm: 1.273634
201
+ 2025-06-25 08:02:47,466 | INFO | -------------------------------- EoS --------------------------------
202
+ 2025-06-25 08:02:47,466 | INFO | Starting LR test 1/10: lr=0.1800
203
+ 2025-06-25 08:02:47,466 | INFO | Starting EoS for LR factor 100.0000
204
+ 2025-06-25 08:02:47,466 | INFO | Starting EoS for checkpoint 002000
205
+ 2025-06-25 08:02:47,466 | INFO | Starting EoS for model gpt2_small
206
+ 2025-06-25 08:02:47,466 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
207
+ 2025-06-25 08:02:47,467 | INFO | Starting EoS for num_iterations 50
208
+ 2025-06-25 08:02:47,467 | INFO | Starting EoS for accum_steps 4
209
+ 2025-06-25 08:02:47,467 | INFO | Loading model and checkpoint...
210
+ 2025-06-25 08:02:48,209 | INFO | Wrapping model with DDP...
211
+ 2025-06-25 08:02:48,467 | INFO | Loading state dict...
212
+ 2025-06-25 08:02:48,470 | INFO | Model loaded successfully!
213
+ 2025-06-25 08:02:54,668 | INFO | iter 000000 | lr 0.1800 | loss 4.0603 | norm 1036.7952
214
+ 2025-06-25 08:02:59,931 | INFO | iter 000001 | lr 0.1800 | loss 1118.2144 | norm 1860.5170
215
+ 2025-06-25 08:03:05,288 | INFO | iter 000002 | lr 0.1800 | loss 27679.1367 | norm 2199.6821
216
+ 2025-06-25 08:03:10,533 | INFO | iter 000003 | lr 0.1800 | loss 3684.1533 | norm 2738.3308
217
+ 2025-06-25 08:03:15,885 | INFO | iter 000004 | lr 0.1800 | loss 12768.7715 | norm 3217.2167
218
+ 2025-06-25 08:03:21,102 | INFO | iter 000005 | lr 0.1800 | loss 13399.8350 | norm 3629.7486
219
+ 2025-06-25 08:03:26,163 | INFO | iter 000006 | lr 0.1800 | loss 6779.9473 | norm 4026.9859
220
+ 2025-06-25 08:03:31,728 | INFO | iter 000007 | lr 0.1800 | loss 4730.6021 | norm 4397.5922
221
+ 2025-06-25 08:03:37,023 | INFO | iter 000008 | lr 0.1800 | loss 5649.2324 | norm 4731.8789
222
+ 2025-06-25 08:03:42,224 | INFO | iter 000009 | lr 0.1800 | loss 5887.6724 | norm 5023.6548
223
+ 2025-06-25 08:03:47,336 | INFO | iter 000010 | lr 0.1800 | loss 2948.2642 | norm 5307.5434
224
+ 2025-06-25 08:03:52,742 | INFO | iter 000011 | lr 0.1800 | loss 679.2209 | norm 5583.8226
225
+ 2025-06-25 08:03:57,967 | INFO | iter 000012 | lr 0.1800 | loss -3517.8269 | norm 5866.7620
226
+ 2025-06-25 08:04:03,641 | INFO | iter 000013 | lr 0.1800 | loss -6241.0791 | norm 6155.4482
227
+ 2025-06-25 08:04:08,650 | INFO | iter 000014 | lr 0.1800 | loss -10283.7734 | norm 6445.5204
228
+ 2025-06-25 08:04:14,081 | INFO | iter 000015 | lr 0.1800 | loss -15390.3262 | norm 6741.4577
229
+ 2025-06-25 08:04:19,406 | INFO | iter 000016 | lr 0.1800 | loss -23139.1680 | norm 7049.2437
230
+ 2025-06-25 08:04:24,416 | INFO | iter 000017 | lr 0.1800 | loss -35265.1953 | norm 7376.9862
231
+ 2025-06-25 08:04:30,114 | INFO | iter 000018 | lr 0.1800 | loss -47734.4375 | norm 7729.4336
232
+ 2025-06-25 08:04:35,524 | INFO | iter 000019 | lr 0.1800 | loss -63256.2305 | norm 8108.3238
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gpt2_small",
3
+ "factor_min": 0.6,
4
+ "factor_max": 1.5,
5
+ "factor_num": 10,
6
+ "error": 0.0001,
7
+ "accum_steps": 4,
8
+ "num_iterations": 50,
9
+ "num_checkpoint": 7000,
10
+ "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
11
+ "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
12
+ "timestamp": "250622_035242",
13
+ "raw": false
14
+ }
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr_iter.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/training.log ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-25 05:35:53,415 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_007000.pt
2
+ 2025-06-25 05:35:55,332 | INFO | Loaded checkpoint with optimizer: adam
3
+ 2025-06-25 05:35:55,332 | INFO | Current learning rate: 0.0018
4
+ 2025-06-25 05:35:55,930 | INFO | Weight decay: 0.1
5
+ 2025-06-25 05:35:55,930 | INFO | Epsilon: 1e-08
6
+ 2025-06-25 05:35:55,930 | INFO | Loaded 147 first moment (m) buffers
7
+ 2025-06-25 05:35:55,930 | INFO | Loaded 147 second moment (v) buffers
8
+ 2025-06-25 05:35:55,930 | INFO | Optimizer state loading completed!
9
+ 2025-06-25 05:35:57,847 | INFO | Initialized xs with norm: 1.273580
10
+ 2025-06-25 05:35:57,853 | INFO | -------------------------------- EoS --------------------------------
11
+ 2025-06-25 05:35:57,853 | INFO | Starting LR test 1/10: lr=0.0011
12
+ 2025-06-25 05:35:57,853 | INFO | Starting EoS for LR factor 0.6000
13
+ 2025-06-25 05:35:57,853 | INFO | Starting EoS for checkpoint 007000
14
+ 2025-06-25 05:35:57,853 | INFO | Starting EoS for model gpt2_small
15
+ 2025-06-25 05:35:57,853 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
16
+ 2025-06-25 05:35:57,854 | INFO | Starting EoS for num_iterations 50
17
+ 2025-06-25 05:35:57,854 | INFO | Starting EoS for accum_steps 4
18
+ 2025-06-25 05:35:57,854 | INFO | Loading model and checkpoint...
19
+ 2025-06-25 05:35:58,674 | INFO | Wrapping model with DDP...
20
+ 2025-06-25 05:35:58,743 | INFO | Loading state dict...
21
+ 2025-06-25 05:35:58,747 | INFO | Model loaded successfully!
22
+ 2025-06-25 05:36:05,712 | INFO | iter 000000 | lr 0.0011 | loss 3.6876 | norm 13.7931
23
+ 2025-06-25 05:36:10,973 | INFO | iter 000001 | lr 0.0011 | loss 3.5650 | norm 27.3894
24
+ 2025-06-25 05:36:16,178 | INFO | iter 000002 | lr 0.0011 | loss 3.5923 | norm 40.9245
25
+ 2025-06-25 05:36:21,425 | INFO | iter 000003 | lr 0.0011 | loss 3.7197 | norm 54.3712
26
+ 2025-06-25 05:36:26,555 | INFO | iter 000004 | lr 0.0011 | loss 3.7198 | norm 67.7207
27
+ 2025-06-25 05:36:32,221 | INFO | iter 000005 | lr 0.0011 | loss 3.7145 | norm 80.9691
28
+ 2025-06-25 05:36:37,311 | INFO | iter 000006 | lr 0.0011 | loss 3.8629 | norm 94.1130
29
+ 2025-06-25 05:36:42,481 | INFO | iter 000007 | lr 0.0011 | loss 3.8424 | norm 107.1491
30
+ 2025-06-25 05:36:47,571 | INFO | iter 000008 | lr 0.0011 | loss 3.9408 | norm 120.0743
31
+ 2025-06-25 05:36:52,819 | INFO | iter 000009 | lr 0.0011 | loss 3.9754 | norm 132.8863
32
+ 2025-06-25 05:36:57,995 | INFO | iter 000010 | lr 0.0011 | loss 3.8358 | norm 145.5831
33
+ 2025-06-25 05:37:03,602 | INFO | iter 000011 | lr 0.0011 | loss 3.7628 | norm 158.1634
34
+ 2025-06-25 05:37:08,848 | INFO | iter 000012 | lr 0.0011 | loss 4.0600 | norm 170.6257
35
+ 2025-06-25 05:37:14,100 | INFO | iter 000013 | lr 0.0011 | loss 3.9553 | norm 182.9687
36
+ 2025-06-25 05:37:19,299 | INFO | iter 000014 | lr 0.0011 | loss 3.9901 | norm 195.1916
37
+ 2025-06-25 05:37:24,464 | INFO | iter 000015 | lr 0.0011 | loss 4.3327 | norm 207.2938
38
+ 2025-06-25 05:37:29,984 | INFO | iter 000016 | lr 0.0011 | loss 4.2055 | norm 219.2749
39
+ 2025-06-25 05:37:35,307 | INFO | iter 000017 | lr 0.0011 | loss 4.0625 | norm 231.1345
40
+ 2025-06-25 05:37:40,541 | INFO | iter 000018 | lr 0.0011 | loss 4.1686 | norm 242.8727
41
+ 2025-06-25 05:37:45,565 | INFO | iter 000019 | lr 0.0011 | loss 4.1835 | norm 254.4896
42
+ 2025-06-25 05:37:50,753 | INFO | iter 000020 | lr 0.0011 | loss 4.1426 | norm 265.9856
43
+ 2025-06-25 05:37:56,056 | INFO | iter 000021 | lr 0.0011 | loss 4.2872 | norm 277.3614
44
+ 2025-06-25 05:38:01,898 | INFO | iter 000022 | lr 0.0011 | loss 4.5773 | norm 288.6175
45
+ 2025-06-25 05:38:07,056 | INFO | iter 000023 | lr 0.0011 | loss 4.4376 | norm 299.7551
46
+ 2025-06-25 05:38:12,351 | INFO | iter 000024 | lr 0.0011 | loss 4.2737 | norm 310.7748
47
+ 2025-06-25 05:38:17,566 | INFO | iter 000025 | lr 0.0011 | loss 4.4620 | norm 321.6779
48
+ 2025-06-25 05:38:22,639 | INFO | iter 000026 | lr 0.0011 | loss 4.3275 | norm 332.4654
49
+ 2025-06-25 05:38:27,858 | INFO | iter 000027 | lr 0.0011 | loss 4.3138 | norm 343.1386
50
+ 2025-06-25 05:38:33,577 | INFO | iter 000028 | lr 0.0011 | loss 4.5462 | norm 353.6991
51
+ 2025-06-25 05:38:38,886 | INFO | iter 000029 | lr 0.0011 | loss 4.4448 | norm 364.1481
52
+ 2025-06-25 05:38:43,950 | INFO | iter 000030 | lr 0.0011 | loss 4.6483 | norm 374.4874
53
+ 2025-06-25 05:38:49,201 | INFO | iter 000031 | lr 0.0011 | loss 4.6357 | norm 384.7184
54
+ 2025-06-25 05:38:54,409 | INFO | iter 000032 | lr 0.0011 | loss 4.6015 | norm 394.8427
55
+ 2025-06-25 05:38:59,422 | INFO | iter 000033 | lr 0.0011 | loss 4.7027 | norm 404.8622
56
+ 2025-06-25 05:39:05,147 | INFO | iter 000034 | lr 0.0011 | loss 4.5023 | norm 414.7783
57
+ 2025-06-25 05:39:10,566 | INFO | iter 000035 | lr 0.0011 | loss 4.9184 | norm 424.5929
58
+ 2025-06-25 05:39:15,684 | INFO | iter 000036 | lr 0.0011 | loss 4.8844 | norm 434.3076
59
+ 2025-06-25 05:39:20,799 | INFO | iter 000037 | lr 0.0011 | loss 5.1758 | norm 443.9242
60
+ 2025-06-25 05:39:25,840 | INFO | iter 000038 | lr 0.0011 | loss 5.0171 | norm 453.4445
61
+ 2025-06-25 05:39:31,938 | INFO | iter 000039 | lr 0.0011 | loss 4.8304 | norm 462.8701
62
+ 2025-06-25 05:39:36,990 | INFO | iter 000040 | lr 0.0011 | loss 5.0139 | norm 472.2029
63
+ 2025-06-25 05:39:42,282 | INFO | iter 000041 | lr 0.0011 | loss 4.5102 | norm 481.4444
64
+ 2025-06-25 05:39:47,604 | INFO | iter 000042 | lr 0.0011 | loss 5.0961 | norm 490.5965
65
+ 2025-06-25 05:39:52,697 | INFO | iter 000043 | lr 0.0011 | loss 4.9875 | norm 499.6608
66
+ 2025-06-25 05:39:57,946 | INFO | iter 000044 | lr 0.0011 | loss 4.8258 | norm 508.6389
67
+ 2025-06-25 05:40:03,733 | INFO | iter 000045 | lr 0.0011 | loss 5.1973 | norm 517.5325
68
+ 2025-06-25 05:40:08,855 | INFO | iter 000046 | lr 0.0011 | loss 5.2978 | norm 526.3431
69
+ 2025-06-25 05:40:14,128 | INFO | iter 000047 | lr 0.0011 | loss 5.1401 | norm 535.0723
70
+ 2025-06-25 05:40:19,467 | INFO | iter 000048 | lr 0.0011 | loss 5.3447 | norm 543.7217
71
+ 2025-06-25 05:40:24,714 | INFO | iter 000049 | lr 0.0011 | loss 4.6829 | norm 552.2926
72
+ 2025-06-25 05:40:24,714 | INFO | Completed LR test 1/10: lr=0.0011
73
+ 2025-06-25 05:40:24,751 | INFO | -------------------------------- EoS --------------------------------
74
+ 2025-06-25 05:40:24,751 | INFO | Starting LR test 2/10: lr=0.0013
75
+ 2025-06-25 05:40:24,751 | INFO | Starting EoS for LR factor 0.7000
76
+ 2025-06-25 05:40:24,751 | INFO | Starting EoS for checkpoint 007000
77
+ 2025-06-25 05:40:24,751 | INFO | Starting EoS for model gpt2_small
78
+ 2025-06-25 05:40:24,751 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
79
+ 2025-06-25 05:40:24,751 | INFO | Starting EoS for num_iterations 50
80
+ 2025-06-25 05:40:24,751 | INFO | Starting EoS for accum_steps 4
81
+ 2025-06-25 05:40:24,751 | INFO | Loading model and checkpoint...
82
+ 2025-06-25 05:40:25,477 | INFO | Wrapping model with DDP...
83
+ 2025-06-25 05:40:25,746 | INFO | Loading state dict...
84
+ 2025-06-25 05:40:25,749 | INFO | Model loaded successfully!
85
+ 2025-06-25 05:40:33,135 | INFO | iter 000000 | lr 0.0013 | loss 3.6876 | norm 16.0739
86
+ 2025-06-25 05:40:38,238 | INFO | iter 000001 | lr 0.0013 | loss 3.5668 | norm 31.9203
87
+ 2025-06-25 05:40:43,503 | INFO | iter 000002 | lr 0.0013 | loss 3.5993 | norm 47.6747
88
+ 2025-06-25 05:40:48,640 | INFO | iter 000003 | lr 0.0013 | loss 3.7332 | norm 63.3073
89
+ 2025-06-25 05:40:53,858 | INFO | iter 000004 | lr 0.0013 | loss 3.7399 | norm 78.8077
90
+ 2025-06-25 05:40:59,072 | INFO | iter 000005 | lr 0.0013 | loss 3.7418 | norm 94.1714
91
+ 2025-06-25 05:41:04,563 | INFO | iter 000006 | lr 0.0013 | loss 3.9009 | norm 109.3946
92
+ 2025-06-25 05:41:09,648 | INFO | iter 000007 | lr 0.0013 | loss 3.8821 | norm 124.4729
93
+ 2025-06-25 05:41:15,013 | INFO | iter 000008 | lr 0.0013 | loss 3.9903 | norm 139.4024
94
+ 2025-06-25 05:41:20,139 | INFO | iter 000009 | lr 0.0013 | loss 4.0268 | norm 154.1803
95
+ 2025-06-25 05:41:25,227 | INFO | iter 000010 | lr 0.0013 | loss 3.8996 | norm 168.8045
96
+ 2025-06-25 05:41:31,038 | INFO | iter 000011 | lr 0.0013 | loss 3.8270 | norm 183.2730
97
+ 2025-06-25 05:41:36,315 | INFO | iter 000012 | lr 0.0013 | loss 4.1310 | norm 197.5845
98
+ 2025-06-25 05:41:41,326 | INFO | iter 000013 | lr 0.0013 | loss 4.0251 | norm 211.7373
99
+ 2025-06-25 05:41:46,451 | INFO | iter 000014 | lr 0.0013 | loss 4.0945 | norm 225.7308
100
+ 2025-06-25 05:41:51,631 | INFO | iter 000015 | lr 0.0013 | loss 4.4285 | norm 239.5642
101
+ 2025-06-25 05:41:56,676 | INFO | iter 000016 | lr 0.0013 | loss 4.3138 | norm 253.2372
102
+ 2025-06-25 05:42:02,225 | INFO | iter 000017 | lr 0.0013 | loss 4.1754 | norm 266.7497
103
+ 2025-06-25 05:42:07,383 | INFO | iter 000018 | lr 0.0013 | loss 4.2904 | norm 280.1020
104
+ 2025-06-25 05:42:12,527 | INFO | iter 000019 | lr 0.0013 | loss 4.2943 | norm 293.2945
105
+ 2025-06-25 05:42:17,700 | INFO | iter 000020 | lr 0.0013 | loss 4.2564 | norm 306.3280
106
+ 2025-06-25 05:42:23,000 | INFO | iter 000021 | lr 0.0013 | loss 4.4312 | norm 319.2035
107
+ 2025-06-25 05:42:28,355 | INFO | iter 000022 | lr 0.0013 | loss 4.7017 | norm 331.9223
108
+ 2025-06-25 05:42:34,085 | INFO | iter 000023 | lr 0.0013 | loss 4.6376 | norm 344.4858
109
+ 2025-06-25 05:42:39,128 | INFO | iter 000024 | lr 0.0013 | loss 4.4395 | norm 356.8954
110
+ 2025-06-25 05:42:44,446 | INFO | iter 000025 | lr 0.0013 | loss 4.6249 | norm 369.1527
111
+ 2025-06-25 05:42:49,650 | INFO | iter 000026 | lr 0.0013 | loss 4.4875 | norm 381.2595
112
+ 2025-06-25 05:42:54,760 | INFO | iter 000027 | lr 0.0013 | loss 4.4658 | norm 393.2177
113
+ 2025-06-25 05:43:00,105 | INFO | iter 000028 | lr 0.0013 | loss 4.7561 | norm 405.0296
114
+ 2025-06-25 05:43:05,673 | INFO | iter 000029 | lr 0.0013 | loss 4.6378 | norm 416.6971
115
+ 2025-06-25 05:43:10,891 | INFO | iter 000030 | lr 0.0013 | loss 4.8317 | norm 428.2225
116
+ 2025-06-25 05:43:16,007 | INFO | iter 000031 | lr 0.0013 | loss 4.8500 | norm 439.6082
117
+ 2025-06-25 05:43:21,121 | INFO | iter 000032 | lr 0.0013 | loss 4.8139 | norm 450.8564
118
+ 2025-06-25 05:43:26,282 | INFO | iter 000033 | lr 0.0013 | loss 4.9246 | norm 461.9696
119
+ 2025-06-25 05:43:31,833 | INFO | iter 000034 | lr 0.0013 | loss 4.6747 | norm 472.9502
120
+ 2025-06-25 05:43:37,078 | INFO | iter 000035 | lr 0.0013 | loss 5.1966 | norm 483.8006
121
+ 2025-06-25 05:43:42,249 | INFO | iter 000036 | lr 0.0013 | loss 5.1442 | norm 494.5233
122
+ 2025-06-25 05:43:47,360 | INFO | iter 000037 | lr 0.0013 | loss 5.4601 | norm 505.1207
123
+ 2025-06-25 05:43:52,622 | INFO | iter 000038 | lr 0.0013 | loss 5.2407 | norm 515.5953
124
+ 2025-06-25 05:43:57,845 | INFO | iter 000039 | lr 0.0013 | loss 5.0629 | norm 525.9496
125
+ 2025-06-25 05:44:03,515 | INFO | iter 000040 | lr 0.0013 | loss 5.2719 | norm 536.1859
126
+ 2025-06-25 05:44:08,699 | INFO | iter 000041 | lr 0.0013 | loss 4.7011 | norm 546.3068
127
+ 2025-06-25 05:44:13,777 | INFO | iter 000042 | lr 0.0013 | loss 5.3736 | norm 556.3145
128
+ 2025-06-25 05:44:19,131 | INFO | iter 000043 | lr 0.0013 | loss 5.2889 | norm 566.2115
129
+ 2025-06-25 05:44:24,440 | INFO | iter 000044 | lr 0.0013 | loss 5.0845 | norm 576.0000
130
+ 2025-06-25 05:44:30,026 | INFO | iter 000045 | lr 0.0013 | loss 5.4554 | norm 585.6822
131
+ 2025-06-25 05:44:35,085 | INFO | iter 000046 | lr 0.0013 | loss 5.5968 | norm 595.2605
132
+ 2025-06-25 05:44:40,229 | INFO | iter 000047 | lr 0.0013 | loss 5.4326 | norm 604.7369
133
+ 2025-06-25 05:44:45,571 | INFO | iter 000048 | lr 0.0013 | loss 5.6167 | norm 614.1136
134
+ 2025-06-25 05:44:50,721 | INFO | iter 000049 | lr 0.0013 | loss 4.9240 | norm 623.3926
135
+ 2025-06-25 05:44:50,722 | INFO | Completed LR test 2/10: lr=0.0013
136
+ 2025-06-25 05:44:50,738 | INFO | -------------------------------- EoS --------------------------------
137
+ 2025-06-25 05:44:50,738 | INFO | Starting LR test 3/10: lr=0.0014
138
+ 2025-06-25 05:44:50,738 | INFO | Starting EoS for LR factor 0.8000
139
+ 2025-06-25 05:44:50,738 | INFO | Starting EoS for checkpoint 007000
140
+ 2025-06-25 05:44:50,738 | INFO | Starting EoS for model gpt2_small
141
+ 2025-06-25 05:44:50,738 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
142
+ 2025-06-25 05:44:50,738 | INFO | Starting EoS for num_iterations 50
143
+ 2025-06-25 05:44:50,738 | INFO | Starting EoS for accum_steps 4
144
+ 2025-06-25 05:44:50,738 | INFO | Loading model and checkpoint...
145
+ 2025-06-25 05:44:51,462 | INFO | Wrapping model with DDP...
146
+ 2025-06-25 05:44:51,701 | INFO | Loading state dict...
147
+ 2025-06-25 05:44:51,704 | INFO | Model loaded successfully!
148
+ 2025-06-25 05:44:58,088 | INFO | iter 000000 | lr 0.0014 | loss 3.6876 | norm 18.3568
149
+ 2025-06-25 05:45:03,653 | INFO | iter 000001 | lr 0.0014 | loss 3.5691 | norm 36.4448
150
+ 2025-06-25 05:45:08,934 | INFO | iter 000002 | lr 0.0014 | loss 3.6073 | norm 54.4071
151
+ 2025-06-25 05:45:14,009 | INFO | iter 000003 | lr 0.0014 | loss 3.7478 | norm 72.2096
152
+ 2025-06-25 05:45:19,098 | INFO | iter 000004 | lr 0.0014 | loss 3.7622 | norm 89.8399
153
+ 2025-06-25 05:45:24,184 | INFO | iter 000005 | lr 0.0014 | loss 3.7683 | norm 107.2932
154
+ 2025-06-25 05:45:29,635 | INFO | iter 000006 | lr 0.0014 | loss 3.9400 | norm 124.5648
155
+ 2025-06-25 05:45:35,126 | INFO | iter 000007 | lr 0.0014 | loss 3.9224 | norm 141.6493
156
+ 2025-06-25 05:45:40,271 | INFO | iter 000008 | lr 0.0014 | loss 4.0389 | norm 158.5423
157
+ 2025-06-25 05:45:45,417 | INFO | iter 000009 | lr 0.0014 | loss 4.0786 | norm 175.2402
158
+ 2025-06-25 05:45:50,522 | INFO | iter 000010 | lr 0.0014 | loss 3.9723 | norm 191.7406
159
+ 2025-06-25 05:45:55,674 | INFO | iter 000011 | lr 0.0014 | loss 3.8953 | norm 208.0416
160
+ 2025-06-25 05:46:01,485 | INFO | iter 000012 | lr 0.0014 | loss 4.2137 | norm 224.1416
161
+ 2025-06-25 05:46:06,578 | INFO | iter 000013 | lr 0.0014 | loss 4.0981 | norm 240.0388
162
+ 2025-06-25 05:46:11,525 | INFO | iter 000014 | lr 0.0014 | loss 4.1986 | norm 255.7326
163
+ 2025-06-25 05:46:16,727 | INFO | iter 000015 | lr 0.0014 | loss 4.5317 | norm 271.2225
164
+ 2025-06-25 05:46:21,994 | INFO | iter 000016 | lr 0.0014 | loss 4.4237 | norm 286.5083
165
+ 2025-06-25 05:46:27,164 | INFO | iter 000017 | lr 0.0014 | loss 4.3022 | norm 301.5904
166
+ 2025-06-25 05:46:32,884 | INFO | iter 000018 | lr 0.0014 | loss 4.3945 | norm 316.4691
167
+ 2025-06-25 05:46:38,027 | INFO | iter 000019 | lr 0.0014 | loss 4.4074 | norm 331.1456
168
+ 2025-06-25 05:46:43,453 | INFO | iter 000020 | lr 0.0014 | loss 4.3716 | norm 345.6211
169
+ 2025-06-25 05:46:48,517 | INFO | iter 000021 | lr 0.0014 | loss 4.5736 | norm 359.8973
170
+ 2025-06-25 05:46:53,691 | INFO | iter 000022 | lr 0.0014 | loss 4.8349 | norm 373.9760
171
+ 2025-06-25 05:46:58,799 | INFO | iter 000023 | lr 0.0014 | loss 4.8390 | norm 387.8593
172
+ 2025-06-25 05:47:04,367 | INFO | iter 000024 | lr 0.0014 | loss 4.6071 | norm 401.5493
173
+ 2025-06-25 05:47:09,593 | INFO | iter 000025 | lr 0.0014 | loss 4.8229 | norm 415.0486
174
+ 2025-06-25 05:47:14,751 | INFO | iter 000026 | lr 0.0014 | loss 4.6378 | norm 428.3594
175
+ 2025-06-25 05:47:19,821 | INFO | iter 000027 | lr 0.0014 | loss 4.6228 | norm 441.4847
176
+ 2025-06-25 05:47:24,895 | INFO | iter 000028 | lr 0.0014 | loss 4.9751 | norm 454.4274
177
+ 2025-06-25 05:47:30,528 | INFO | iter 000029 | lr 0.0014 | loss 4.8282 | norm 467.1905
178
+ 2025-06-25 05:47:35,961 | INFO | iter 000030 | lr 0.0014 | loss 5.0116 | norm 479.7771
179
+ 2025-06-25 05:47:41,409 | INFO | iter 000031 | lr 0.0014 | loss 5.0729 | norm 492.1905
180
+ 2025-06-25 05:47:46,555 | INFO | iter 000032 | lr 0.0014 | loss 5.0228 | norm 504.4338
181
+ 2025-06-25 05:47:51,877 | INFO | iter 000033 | lr 0.0014 | loss 5.1236 | norm 516.5103
182
+ 2025-06-25 05:47:57,014 | INFO | iter 000034 | lr 0.0014 | loss 4.8552 | norm 528.4233
183
+ 2025-06-25 05:48:02,608 | INFO | iter 000035 | lr 0.0014 | loss 5.4885 | norm 540.1762
184
+ 2025-06-25 05:48:07,748 | INFO | iter 000036 | lr 0.0014 | loss 5.4107 | norm 551.7723
185
+ 2025-06-25 05:48:12,707 | INFO | iter 000037 | lr 0.0014 | loss 5.7052 | norm 563.2150
186
+ 2025-06-25 05:48:17,970 | INFO | iter 000038 | lr 0.0014 | loss 5.4606 | norm 574.5076
187
+ 2025-06-25 05:48:23,145 | INFO | iter 000039 | lr 0.0014 | loss 5.2917 | norm 585.6534
188
+ 2025-06-25 05:48:28,398 | INFO | iter 000040 | lr 0.0014 | loss 5.5305 | norm 596.6557
189
+ 2025-06-25 05:48:33,822 | INFO | iter 000041 | lr 0.0014 | loss 4.8888 | norm 607.5177
190
+ 2025-06-25 05:48:39,061 | INFO | iter 000042 | lr 0.0014 | loss 5.6519 | norm 618.2427
191
+ 2025-06-25 05:48:44,089 | INFO | iter 000043 | lr 0.0014 | loss 5.5571 | norm 628.8338
192
+ 2025-06-25 05:48:49,294 | INFO | iter 000044 | lr 0.0014 | loss 5.3017 | norm 639.2940
193
+ 2025-06-25 05:48:54,335 | INFO | iter 000045 | lr 0.0014 | loss 5.7088 | norm 649.6263
194
+ 2025-06-25 05:48:59,520 | INFO | iter 000046 | lr 0.0014 | loss 5.8573 | norm 659.8338
195
+ 2025-06-25 05:49:05,181 | INFO | iter 000047 | lr 0.0014 | loss 5.7481 | norm 669.9192
196
+ 2025-06-25 05:49:10,172 | INFO | iter 000048 | lr 0.0014 | loss 5.8767 | norm 679.8854
197
+ 2025-06-25 05:49:15,104 | INFO | iter 000049 | lr 0.0014 | loss 5.1259 | norm 689.7351
198
+ 2025-06-25 05:49:15,104 | INFO | Completed LR test 3/10: lr=0.0014
199
+ 2025-06-25 05:49:15,127 | INFO | -------------------------------- EoS --------------------------------
200
+ 2025-06-25 05:49:15,127 | INFO | Starting LR test 4/10: lr=0.0016
201
+ 2025-06-25 05:49:15,127 | INFO | Starting EoS for LR factor 0.9000
202
+ 2025-06-25 05:49:15,127 | INFO | Starting EoS for checkpoint 007000
203
+ 2025-06-25 05:49:15,127 | INFO | Starting EoS for model gpt2_small
204
+ 2025-06-25 05:49:15,127 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
205
+ 2025-06-25 05:49:15,127 | INFO | Starting EoS for num_iterations 50
206
+ 2025-06-25 05:49:15,127 | INFO | Starting EoS for accum_steps 4
207
+ 2025-06-25 05:49:15,127 | INFO | Loading model and checkpoint...
208
+ 2025-06-25 05:49:15,838 | INFO | Wrapping model with DDP...
209
+ 2025-06-25 05:49:16,179 | INFO | Loading state dict...
210
+ 2025-06-25 05:49:16,183 | INFO | Model loaded successfully!
211
+ 2025-06-25 05:49:22,383 | INFO | iter 000000 | lr 0.0016 | loss 3.6876 | norm 20.6411
212
+ 2025-06-25 05:49:27,494 | INFO | iter 000001 | lr 0.0016 | loss 3.5716 | norm 40.9628
213
+ 2025-06-25 05:49:33,578 | INFO | iter 000002 | lr 0.0016 | loss 3.6157 | norm 61.1216
214
+ 2025-06-25 05:49:38,620 | INFO | iter 000003 | lr 0.0016 | loss 3.7629 | norm 81.0780
215
+ 2025-06-25 05:49:43,686 | INFO | iter 000004 | lr 0.0016 | loss 3.7846 | norm 100.8176
216
+ 2025-06-25 05:49:48,747 | INFO | iter 000005 | lr 0.0016 | loss 3.7971 | norm 120.3347
217
+ 2025-06-25 05:49:53,747 | INFO | iter 000006 | lr 0.0016 | loss 3.9829 | norm 139.6241
218
+ 2025-06-25 05:49:58,945 | INFO | iter 000007 | lr 0.0016 | loss 3.9620 | norm 158.6796
219
+ 2025-06-25 05:50:04,505 | INFO | iter 000008 | lr 0.0016 | loss 4.0909 | norm 177.4956
220
+ 2025-06-25 05:50:09,658 | INFO | iter 000009 | lr 0.0016 | loss 4.1380 | norm 196.0683
221
+ 2025-06-25 05:50:14,683 | INFO | iter 000010 | lr 0.0016 | loss 4.0397 | norm 214.3950
222
+ 2025-06-25 05:50:19,816 | INFO | iter 000011 | lr 0.0016 | loss 3.9675 | norm 232.4738
223
+ 2025-06-25 05:50:24,917 | INFO | iter 000012 | lr 0.0016 | loss 4.2959 | norm 250.3029
224
+ 2025-06-25 05:50:30,599 | INFO | iter 000013 | lr 0.0016 | loss 4.1695 | norm 267.8807
225
+ 2025-06-25 05:50:35,603 | INFO | iter 000014 | lr 0.0016 | loss 4.3111 | norm 285.2066
226
+ 2025-06-25 05:50:40,827 | INFO | iter 000015 | lr 0.0016 | loss 4.6379 | norm 302.2806
227
+ 2025-06-25 05:50:46,091 | INFO | iter 000016 | lr 0.0016 | loss 4.5418 | norm 319.1028
228
+ 2025-06-25 05:50:51,447 | INFO | iter 000017 | lr 0.0016 | loss 4.4444 | norm 335.6738
229
+ 2025-06-25 05:50:56,560 | INFO | iter 000018 | lr 0.0016 | loss 4.5090 | norm 351.9947
230
+ 2025-06-25 05:51:02,059 | INFO | iter 000019 | lr 0.0016 | loss 4.5233 | norm 368.0672
231
+ 2025-06-25 05:51:07,086 | INFO | iter 000020 | lr 0.0016 | loss 4.4959 | norm 383.8933
232
+ 2025-06-25 05:51:12,277 | INFO | iter 000021 | lr 0.0016 | loss 4.7354 | norm 399.4754
233
+ 2025-06-25 05:51:17,599 | INFO | iter 000022 | lr 0.0016 | loss 4.9572 | norm 414.8162
234
+ 2025-06-25 05:51:22,816 | INFO | iter 000023 | lr 0.0016 | loss 5.0554 | norm 429.9186
235
+ 2025-06-25 05:51:28,254 | INFO | iter 000024 | lr 0.0016 | loss 4.7812 | norm 444.7857
236
+ 2025-06-25 05:51:33,877 | INFO | iter 000025 | lr 0.0016 | loss 4.9812 | norm 459.4208
237
+ 2025-06-25 05:51:38,996 | INFO | iter 000026 | lr 0.0016 | loss 4.7977 | norm 473.8274
238
+ 2025-06-25 05:51:44,352 | INFO | iter 000027 | lr 0.0016 | loss 4.7787 | norm 488.0092
239
+ 2025-06-25 05:51:49,761 | INFO | iter 000028 | lr 0.0016 | loss 5.1998 | norm 501.9703
240
+ 2025-06-25 05:51:55,046 | INFO | iter 000029 | lr 0.0016 | loss 5.0280 | norm 515.7147
241
+ 2025-06-25 05:52:00,665 | INFO | iter 000030 | lr 0.0016 | loss 5.1958 | norm 529.2467
242
+ 2025-06-25 05:52:05,662 | INFO | iter 000031 | lr 0.0016 | loss 5.2719 | norm 542.5705
243
+ 2025-06-25 05:52:10,734 | INFO | iter 000032 | lr 0.0016 | loss 5.2195 | norm 555.6903
244
+ 2025-06-25 05:52:15,809 | INFO | iter 000033 | lr 0.0016 | loss 5.3781 | norm 568.6107
245
+ 2025-06-25 05:52:20,946 | INFO | iter 000034 | lr 0.0016 | loss 5.0383 | norm 581.3358
246
+ 2025-06-25 05:52:26,121 | INFO | iter 000035 | lr 0.0016 | loss 5.7508 | norm 593.8701
247
+ 2025-06-25 05:52:31,748 | INFO | iter 000036 | lr 0.0016 | loss 5.6478 | norm 606.2180
248
+ 2025-06-25 05:52:36,972 | INFO | iter 000037 | lr 0.0016 | loss 5.9527 | norm 618.3839
249
+ 2025-06-25 05:52:42,355 | INFO | iter 000038 | lr 0.0016 | loss 5.6528 | norm 630.3722
250
+ 2025-06-25 05:52:47,675 | INFO | iter 000039 | lr 0.0016 | loss 5.4899 | norm 642.1872
251
+ 2025-06-25 05:52:52,854 | INFO | iter 000040 | lr 0.0016 | loss 5.7534 | norm 653.8331
252
+ 2025-06-25 05:52:57,905 | INFO | iter 000041 | lr 0.0016 | loss 5.0604 | norm 665.3142
253
+ 2025-06-25 05:53:03,388 | INFO | iter 000042 | lr 0.0016 | loss 5.9295 | norm 676.6346
254
+ 2025-06-25 05:53:08,623 | INFO | iter 000043 | lr 0.0016 | loss 5.7979 | norm 687.7984
255
+ 2025-06-25 05:53:13,916 | INFO | iter 000044 | lr 0.0016 | loss 5.5369 | norm 698.8096
256
+ 2025-06-25 05:53:19,248 | INFO | iter 000045 | lr 0.0016 | loss 5.9652 | norm 709.6719
257
+ 2025-06-25 05:53:24,627 | INFO | iter 000046 | lr 0.0016 | loss 6.0899 | norm 720.3892
258
+ 2025-06-25 05:53:30,007 | INFO | iter 000047 | lr 0.0016 | loss 6.0334 | norm 730.9651
259
+ 2025-06-25 05:53:35,384 | INFO | iter 000048 | lr 0.0016 | loss 6.1324 | norm 741.4034
260
+ 2025-06-25 05:53:40,548 | INFO | iter 000049 | lr 0.0016 | loss 5.3401 | norm 751.7074
261
+ 2025-06-25 05:53:40,549 | INFO | Completed LR test 4/10: lr=0.0016
262
+ 2025-06-25 05:53:40,579 | INFO | -------------------------------- EoS --------------------------------
263
+ 2025-06-25 05:53:40,580 | INFO | Starting LR test 5/10: lr=0.0018
264
+ 2025-06-25 05:53:40,580 | INFO | Starting EoS for LR factor 1.0000
265
+ 2025-06-25 05:53:40,580 | INFO | Starting EoS for checkpoint 007000
266
+ 2025-06-25 05:53:40,580 | INFO | Starting EoS for model gpt2_small
267
+ 2025-06-25 05:53:40,580 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
268
+ 2025-06-25 05:53:40,580 | INFO | Starting EoS for num_iterations 50
269
+ 2025-06-25 05:53:40,580 | INFO | Starting EoS for accum_steps 4
270
+ 2025-06-25 05:53:40,580 | INFO | Loading model and checkpoint...
271
+ 2025-06-25 05:53:41,270 | INFO | Wrapping model with DDP...
272
+ 2025-06-25 05:53:41,650 | INFO | Loading state dict...
273
+ 2025-06-25 05:53:41,653 | INFO | Model loaded successfully!
274
+ 2025-06-25 05:53:47,958 | INFO | iter 000000 | lr 0.0018 | loss 3.6876 | norm 22.9263
275
+ 2025-06-25 05:53:53,039 | INFO | iter 000001 | lr 0.0018 | loss 3.5747 | norm 45.4740
276
+ 2025-06-25 05:53:58,328 | INFO | iter 000002 | lr 0.0018 | loss 3.6252 | norm 67.8180
277
+ 2025-06-25 05:54:03,897 | INFO | iter 000003 | lr 0.0018 | loss 3.7789 | norm 89.9128
278
+ 2025-06-25 05:54:08,955 | INFO | iter 000004 | lr 0.0018 | loss 3.8081 | norm 111.7409
279
+ 2025-06-25 05:54:14,093 | INFO | iter 000005 | lr 0.0018 | loss 3.8254 | norm 133.2964
280
+ 2025-06-25 05:54:19,498 | INFO | iter 000006 | lr 0.0018 | loss 4.0263 | norm 154.5736
281
+ 2025-06-25 05:54:24,625 | INFO | iter 000007 | lr 0.0018 | loss 4.0083 | norm 175.5650
282
+ 2025-06-25 05:54:30,300 | INFO | iter 000008 | lr 0.0018 | loss 4.1424 | norm 196.2643
283
+ 2025-06-25 05:54:35,636 | INFO | iter 000009 | lr 0.0018 | loss 4.1973 | norm 216.6673
284
+ 2025-06-25 05:54:40,921 | INFO | iter 000010 | lr 0.0018 | loss 4.1163 | norm 236.7712
285
+ 2025-06-25 05:54:46,048 | INFO | iter 000011 | lr 0.0018 | loss 4.0434 | norm 256.5742
286
+ 2025-06-25 05:54:51,400 | INFO | iter 000012 | lr 0.0018 | loss 4.3868 | norm 276.0744
287
+ 2025-06-25 05:54:56,503 | INFO | iter 000013 | lr 0.0018 | loss 4.2525 | norm 295.2705
288
+ 2025-06-25 05:55:02,238 | INFO | iter 000014 | lr 0.0018 | loss 4.4289 | norm 314.1622
289
+ 2025-06-25 05:55:07,532 | INFO | iter 000015 | lr 0.0018 | loss 4.7470 | norm 332.7499
290
+ 2025-06-25 05:55:12,639 | INFO | iter 000016 | lr 0.0018 | loss 4.6653 | norm 351.0343
291
+ 2025-06-25 05:55:17,801 | INFO | iter 000017 | lr 0.0018 | loss 4.5801 | norm 369.0165
292
+ 2025-06-25 05:55:22,922 | INFO | iter 000018 | lr 0.0018 | loss 4.6499 | norm 386.6985
293
+ 2025-06-25 05:55:28,023 | INFO | iter 000019 | lr 0.0018 | loss 4.6453 | norm 404.0825
294
+ 2025-06-25 05:55:33,721 | INFO | iter 000020 | lr 0.0018 | loss 4.6186 | norm 421.1716
295
+ 2025-06-25 05:55:38,680 | INFO | iter 000021 | lr 0.0018 | loss 4.8857 | norm 437.9693
296
+ 2025-06-25 05:55:43,961 | INFO | iter 000022 | lr 0.0018 | loss 5.0973 | norm 454.4791
297
+ 2025-06-25 05:55:49,091 | INFO | iter 000023 | lr 0.0018 | loss 5.2755 | norm 470.7052
298
+ 2025-06-25 05:55:54,224 | INFO | iter 000024 | lr 0.0018 | loss 4.9374 | norm 486.6516
299
+ 2025-06-25 05:55:59,987 | INFO | iter 000025 | lr 0.0018 | loss 5.1755 | norm 502.3227
300
+ 2025-06-25 05:56:05,519 | INFO | iter 000026 | lr 0.0018 | loss 4.9887 | norm 517.7234
301
+ 2025-06-25 05:56:10,497 | INFO | iter 000027 | lr 0.0018 | loss 4.9314 | norm 532.8585
302
+ 2025-06-25 05:56:15,715 | INFO | iter 000028 | lr 0.0018 | loss 5.3997 | norm 547.7333
303
+ 2025-06-25 05:56:20,871 | INFO | iter 000029 | lr 0.0018 | loss 5.1985 | norm 562.3531
304
+ 2025-06-25 05:56:26,149 | INFO | iter 000030 | lr 0.0018 | loss 5.3587 | norm 576.7233
305
+ 2025-06-25 05:56:31,899 | INFO | iter 000031 | lr 0.0018 | loss 5.4842 | norm 590.8496
306
+ 2025-06-25 05:56:37,096 | INFO | iter 000032 | lr 0.0018 | loss 5.4218 | norm 604.7374
307
+ 2025-06-25 05:56:42,378 | INFO | iter 000033 | lr 0.0018 | loss 5.5774 | norm 618.3925
308
+ 2025-06-25 05:56:47,635 | INFO | iter 000034 | lr 0.0018 | loss 5.2042 | norm 631.8203
309
+ 2025-06-25 05:56:52,745 | INFO | iter 000035 | lr 0.0018 | loss 6.0363 | norm 645.0265
310
+ 2025-06-25 05:56:58,068 | INFO | iter 000036 | lr 0.0018 | loss 5.8952 | norm 658.0167
311
+ 2025-06-25 05:57:03,613 | INFO | iter 000037 | lr 0.0018 | loss 6.2191 | norm 670.7966
312
+ 2025-06-25 05:57:08,695 | INFO | iter 000038 | lr 0.0018 | loss 5.8633 | norm 683.3716
313
+ 2025-06-25 05:57:13,857 | INFO | iter 000039 | lr 0.0018 | loss 5.6797 | norm 695.7473
314
+ 2025-06-25 05:57:18,927 | INFO | iter 000040 | lr 0.0018 | loss 5.9959 | norm 707.9289
315
+ 2025-06-25 05:57:24,051 | INFO | iter 000041 | lr 0.0018 | loss 5.2168 | norm 719.9219
316
+ 2025-06-25 05:57:29,317 | INFO | iter 000042 | lr 0.0018 | loss 6.1407 | norm 731.7314
317
+ 2025-06-25 05:57:34,877 | INFO | iter 000043 | lr 0.0018 | loss 6.0471 | norm 743.3626
318
+ 2025-06-25 05:57:40,102 | INFO | iter 000044 | lr 0.0018 | loss 5.7566 | norm 754.8203
319
+ 2025-06-25 05:57:45,152 | INFO | iter 000045 | lr 0.0018 | loss 6.1483 | norm 766.1095
320
+ 2025-06-25 05:57:50,342 | INFO | iter 000046 | lr 0.0018 | loss 6.3447 | norm 777.2347
321
+ 2025-06-25 05:57:55,654 | INFO | iter 000047 | lr 0.0018 | loss 6.2507 | norm 788.2008
322
+ 2025-06-25 05:58:01,245 | INFO | iter 000048 | lr 0.0018 | loss 6.3594 | norm 799.0120
323
+ 2025-06-25 05:58:06,311 | INFO | iter 000049 | lr 0.0018 | loss 5.4779 | norm 809.6727
324
+ 2025-06-25 05:58:06,311 | INFO | Completed LR test 5/10: lr=0.0018
325
+ 2025-06-25 05:58:06,326 | INFO | -------------------------------- EoS --------------------------------
326
+ 2025-06-25 05:58:06,326 | INFO | Starting LR test 6/10: lr=0.0020
327
+ 2025-06-25 05:58:06,326 | INFO | Starting EoS for LR factor 1.1000
328
+ 2025-06-25 05:58:06,327 | INFO | Starting EoS for checkpoint 007000
329
+ 2025-06-25 05:58:06,327 | INFO | Starting EoS for model gpt2_small
330
+ 2025-06-25 05:58:06,327 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
331
+ 2025-06-25 05:58:06,327 | INFO | Starting EoS for num_iterations 50
332
+ 2025-06-25 05:58:06,327 | INFO | Starting EoS for accum_steps 4
333
+ 2025-06-25 05:58:06,327 | INFO | Loading model and checkpoint...
334
+ 2025-06-25 05:58:07,014 | INFO | Wrapping model with DDP...
335
+ 2025-06-25 05:58:07,353 | INFO | Loading state dict...
336
+ 2025-06-25 05:58:07,356 | INFO | Model loaded successfully!
337
+ 2025-06-25 05:58:13,544 | INFO | iter 000000 | lr 0.0020 | loss 3.6876 | norm 25.2123
338
+ 2025-06-25 05:58:18,866 | INFO | iter 000001 | lr 0.0020 | loss 3.5779 | norm 49.9783
339
+ 2025-06-25 05:58:24,009 | INFO | iter 000002 | lr 0.0020 | loss 3.6351 | norm 74.4965
340
+ 2025-06-25 05:58:29,266 | INFO | iter 000003 | lr 0.0020 | loss 3.7960 | norm 98.7138
341
+ 2025-06-25 05:58:34,900 | INFO | iter 000004 | lr 0.0020 | loss 3.8337 | norm 122.6100
342
+ 2025-06-25 05:58:40,074 | INFO | iter 000005 | lr 0.0020 | loss 3.8578 | norm 146.1787
343
+ 2025-06-25 05:58:45,172 | INFO | iter 000006 | lr 0.0020 | loss 4.0691 | norm 169.4137
344
+ 2025-06-25 05:58:50,219 | INFO | iter 000007 | lr 0.0020 | loss 4.0526 | norm 192.3065
345
+ 2025-06-25 05:58:55,431 | INFO | iter 000008 | lr 0.0020 | loss 4.1982 | norm 214.8500
346
+ 2025-06-25 05:59:00,812 | INFO | iter 000009 | lr 0.0020 | loss 4.2537 | norm 237.0394
347
+ 2025-06-25 05:59:06,232 | INFO | iter 000010 | lr 0.0020 | loss 4.1896 | norm 258.8723
348
+ 2025-06-25 05:59:11,389 | INFO | iter 000011 | lr 0.0020 | loss 4.1233 | norm 280.3468
349
+ 2025-06-25 05:59:16,596 | INFO | iter 000012 | lr 0.0020 | loss 4.4756 | norm 301.4616
350
+ 2025-06-25 05:59:21,692 | INFO | iter 000013 | lr 0.0020 | loss 4.3328 | norm 322.2153
351
+ 2025-06-25 05:59:26,948 | INFO | iter 000014 | lr 0.0020 | loss 4.5445 | norm 342.6083
352
+ 2025-06-25 05:59:32,443 | INFO | iter 000015 | lr 0.0020 | loss 4.8651 | norm 362.6415
353
+ 2025-06-25 05:59:37,749 | INFO | iter 000016 | lr 0.0020 | loss 4.7906 | norm 382.3164
354
+ 2025-06-25 05:59:42,700 | INFO | iter 000017 | lr 0.0020 | loss 4.6993 | norm 401.6350
355
+ 2025-06-25 05:59:47,904 | INFO | iter 000018 | lr 0.0020 | loss 4.7672 | norm 420.5997
356
+ 2025-06-25 05:59:53,164 | INFO | iter 000019 | lr 0.0020 | loss 4.7585 | norm 439.2143
357
+ 2025-06-25 05:59:58,222 | INFO | iter 000020 | lr 0.0020 | loss 4.7307 | norm 457.4828
358
+ 2025-06-25 06:00:03,684 | INFO | iter 000021 | lr 0.0020 | loss 5.0488 | norm 475.4099
359
+ 2025-06-25 06:00:08,860 | INFO | iter 000022 | lr 0.0020 | loss 5.2354 | norm 493.0004
360
+ 2025-06-25 06:00:14,235 | INFO | iter 000023 | lr 0.0020 | loss 5.4842 | norm 510.2596
361
+ 2025-06-25 06:00:19,440 | INFO | iter 000024 | lr 0.0020 | loss 5.1194 | norm 527.1931
362
+ 2025-06-25 06:00:24,496 | INFO | iter 000025 | lr 0.0020 | loss 5.3493 | norm 543.8065
363
+ 2025-06-25 06:00:30,381 | INFO | iter 000026 | lr 0.0020 | loss 5.1178 | norm 560.1060
364
+ 2025-06-25 06:00:35,582 | INFO | iter 000027 | lr 0.0020 | loss 5.0840 | norm 576.0979
365
+ 2025-06-25 06:00:40,743 | INFO | iter 000028 | lr 0.0020 | loss 5.6218 | norm 591.7890
366
+ 2025-06-25 06:00:46,041 | INFO | iter 000029 | lr 0.0020 | loss 5.3631 | norm 607.1861
367
+ 2025-06-25 06:00:51,225 | INFO | iter 000030 | lr 0.0020 | loss 5.5315 | norm 622.2960
368
+ 2025-06-25 06:00:56,346 | INFO | iter 000031 | lr 0.0020 | loss 5.6890 | norm 637.1259
369
+ 2025-06-25 06:01:02,073 | INFO | iter 000032 | lr 0.0020 | loss 5.5865 | norm 651.6826
370
+ 2025-06-25 06:01:07,316 | INFO | iter 000033 | lr 0.0020 | loss 5.7409 | norm 665.9732
371
+ 2025-06-25 06:01:12,507 | INFO | iter 000034 | lr 0.0020 | loss 5.3656 | norm 680.0048
372
+ 2025-06-25 06:01:17,515 | INFO | iter 000035 | lr 0.0020 | loss 6.3080 | norm 693.7843
373
+ 2025-06-25 06:01:22,649 | INFO | iter 000036 | lr 0.0020 | loss 6.1109 | norm 707.3187
374
+ 2025-06-25 06:01:27,838 | INFO | iter 000037 | lr 0.0020 | loss 6.4594 | norm 720.6151
375
+ 2025-06-25 06:01:33,547 | INFO | iter 000038 | lr 0.0020 | loss 6.0626 | norm 733.6802
376
+ 2025-06-25 06:01:38,679 | INFO | iter 000039 | lr 0.0020 | loss 5.8983 | norm 746.5208
377
+ 2025-06-25 06:01:43,863 | INFO | iter 000040 | lr 0.0020 | loss 6.1886 | norm 759.1436
378
+ 2025-06-25 06:01:48,902 | INFO | iter 000041 | lr 0.0020 | loss 5.4046 | norm 771.5550
379
+ 2025-06-25 06:01:54,199 | INFO | iter 000042 | lr 0.0020 | loss 6.3809 | norm 783.7615
380
+ 2025-06-25 06:01:59,802 | INFO | iter 000043 | lr 0.0020 | loss 6.2593 | norm 795.7693
381
+ 2025-06-25 06:02:04,920 | INFO | iter 000044 | lr 0.0020 | loss 5.9433 | norm 807.5844
382
+ 2025-06-25 06:02:09,977 | INFO | iter 000045 | lr 0.0020 | loss 6.3769 | norm 819.2127
383
+ 2025-06-25 06:02:15,095 | INFO | iter 000046 | lr 0.0020 | loss 6.5365 | norm 830.6600
384
+ 2025-06-25 06:02:20,439 | INFO | iter 000047 | lr 0.0020 | loss 6.4452 | norm 841.9317
385
+ 2025-06-25 06:02:25,606 | INFO | iter 000048 | lr 0.0020 | loss 6.5175 | norm 853.0334
386
+ 2025-06-25 06:02:31,548 | INFO | iter 000049 | lr 0.0020 | loss 5.6289 | norm 863.9703
387
+ 2025-06-25 06:02:31,548 | INFO | Completed LR test 6/10: lr=0.0020
388
+ 2025-06-25 06:02:31,566 | INFO | -------------------------------- EoS --------------------------------
389
+ 2025-06-25 06:02:31,566 | INFO | Starting LR test 7/10: lr=0.0022
390
+ 2025-06-25 06:02:31,566 | INFO | Starting EoS for LR factor 1.2000
391
+ 2025-06-25 06:02:31,566 | INFO | Starting EoS for checkpoint 007000
392
+ 2025-06-25 06:02:31,566 | INFO | Starting EoS for model gpt2_small
393
+ 2025-06-25 06:02:31,566 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
394
+ 2025-06-25 06:02:31,566 | INFO | Starting EoS for num_iterations 50
395
+ 2025-06-25 06:02:31,566 | INFO | Starting EoS for accum_steps 4
396
+ 2025-06-25 06:02:31,566 | INFO | Loading model and checkpoint...
397
+ 2025-06-25 06:02:32,259 | INFO | Wrapping model with DDP...
398
+ 2025-06-25 06:02:32,591 | INFO | Loading state dict...
399
+ 2025-06-25 06:02:32,594 | INFO | Model loaded successfully!
400
+ 2025-06-25 06:02:38,924 | INFO | iter 000000 | lr 0.0022 | loss 3.6876 | norm 27.4988
401
+ 2025-06-25 06:02:44,220 | INFO | iter 000001 | lr 0.0022 | loss 3.5816 | norm 54.4755
402
+ 2025-06-25 06:02:49,456 | INFO | iter 000002 | lr 0.0022 | loss 3.6457 | norm 81.1569
403
+ 2025-06-25 06:02:54,513 | INFO | iter 000003 | lr 0.0022 | loss 3.8138 | norm 107.4811
404
+ 2025-06-25 06:02:59,811 | INFO | iter 000004 | lr 0.0022 | loss 3.8571 | norm 133.4251
405
+ 2025-06-25 06:03:05,250 | INFO | iter 000005 | lr 0.0022 | loss 3.8876 | norm 158.9821
406
+ 2025-06-25 06:03:10,559 | INFO | iter 000006 | lr 0.0022 | loss 4.1145 | norm 184.1454
407
+ 2025-06-25 06:03:15,729 | INFO | iter 000007 | lr 0.0022 | loss 4.0991 | norm 208.9055
408
+ 2025-06-25 06:03:20,970 | INFO | iter 000008 | lr 0.0022 | loss 4.2516 | norm 233.2544
409
+ 2025-06-25 06:03:26,170 | INFO | iter 000009 | lr 0.0022 | loss 4.3180 | norm 257.1872
410
+ 2025-06-25 06:03:31,967 | INFO | iter 000010 | lr 0.0022 | loss 4.2748 | norm 280.7017
411
+ 2025-06-25 06:03:37,096 | INFO | iter 000011 | lr 0.0022 | loss 4.2000 | norm 303.7964
412
+ 2025-06-25 06:03:42,276 | INFO | iter 000012 | lr 0.0022 | loss 4.5692 | norm 326.4702
413
+ 2025-06-25 06:03:47,622 | INFO | iter 000013 | lr 0.0022 | loss 4.4204 | norm 348.7223
414
+ 2025-06-25 06:03:52,647 | INFO | iter 000014 | lr 0.0022 | loss 4.6603 | norm 370.5538
415
+ 2025-06-25 06:03:57,845 | INFO | iter 000015 | lr 0.0022 | loss 4.9774 | norm 391.9663
416
+ 2025-06-25 06:04:03,363 | INFO | iter 000016 | lr 0.0022 | loss 4.9215 | norm 412.9624
417
+ 2025-06-25 06:04:08,447 | INFO | iter 000017 | lr 0.0022 | loss 4.8513 | norm 433.5448
418
+ 2025-06-25 06:04:13,412 | INFO | iter 000018 | lr 0.0022 | loss 4.9012 | norm 453.7173
419
+ 2025-06-25 06:04:18,556 | INFO | iter 000019 | lr 0.0022 | loss 4.8747 | norm 473.4848
420
+ 2025-06-25 06:04:23,868 | INFO | iter 000020 | lr 0.0022 | loss 4.8614 | norm 492.8527
421
+ 2025-06-25 06:04:29,288 | INFO | iter 000021 | lr 0.0022 | loss 5.2076 | norm 511.8271
422
+ 2025-06-25 06:04:34,599 | INFO | iter 000022 | lr 0.0022 | loss 5.3604 | norm 530.4143
423
+ 2025-06-25 06:04:39,952 | INFO | iter 000023 | lr 0.0022 | loss 5.7092 | norm 548.6212
424
+ 2025-06-25 06:04:45,030 | INFO | iter 000024 | lr 0.0022 | loss 5.2997 | norm 566.4548
425
+ 2025-06-25 06:04:49,964 | INFO | iter 000025 | lr 0.0022 | loss 5.4966 | norm 583.9225
426
+ 2025-06-25 06:04:55,124 | INFO | iter 000026 | lr 0.0022 | loss 5.2925 | norm 601.0318
427
+ 2025-06-25 06:05:00,430 | INFO | iter 000027 | lr 0.0022 | loss 5.2346 | norm 617.7909
428
+ 2025-06-25 06:05:05,872 | INFO | iter 000028 | lr 0.0022 | loss 5.8256 | norm 634.2081
429
+ 2025-06-25 06:05:11,076 | INFO | iter 000029 | lr 0.0022 | loss 5.5557 | norm 650.2919
430
+ 2025-06-25 06:05:16,271 | INFO | iter 000030 | lr 0.0022 | loss 5.6963 | norm 666.0509
431
+ 2025-06-25 06:05:21,464 | INFO | iter 000031 | lr 0.0022 | loss 5.9050 | norm 681.4938
432
+ 2025-06-25 06:05:26,686 | INFO | iter 000032 | lr 0.0022 | loss 5.8065 | norm 696.6291
433
+ 2025-06-25 06:05:32,275 | INFO | iter 000033 | lr 0.0022 | loss 6.0051 | norm 711.4657
434
+ 2025-06-25 06:05:37,544 | INFO | iter 000034 | lr 0.0022 | loss 5.5032 | norm 726.0119
435
+ 2025-06-25 06:05:42,775 | INFO | iter 000035 | lr 0.0022 | loss 6.5643 | norm 740.2764
436
+ 2025-06-25 06:05:48,005 | INFO | iter 000036 | lr 0.0022 | loss 6.3361 | norm 754.2676
437
+ 2025-06-25 06:05:53,224 | INFO | iter 000037 | lr 0.0022 | loss 6.6566 | norm 767.9942
438
+ 2025-06-25 06:05:58,432 | INFO | iter 000038 | lr 0.0022 | loss 6.2322 | norm 781.4643
439
+ 2025-06-25 06:06:04,272 | INFO | iter 000039 | lr 0.0022 | loss 6.0644 | norm 794.6861
440
+ 2025-06-25 06:06:09,583 | INFO | iter 000040 | lr 0.0022 | loss 6.3769 | norm 807.6677
441
+ 2025-06-25 06:06:14,860 | INFO | iter 000041 | lr 0.0022 | loss 5.4851 | norm 820.4168
442
+ 2025-06-25 06:06:20,186 | INFO | iter 000042 | lr 0.0022 | loss 6.6002 | norm 832.9411
443
+ 2025-06-25 06:06:25,378 | INFO | iter 000043 | lr 0.0022 | loss 6.4176 | norm 845.2482
444
+ 2025-06-25 06:06:30,818 | INFO | iter 000044 | lr 0.0022 | loss 6.1134 | norm 857.3452
445
+ 2025-06-25 06:06:35,947 | INFO | iter 000045 | lr 0.0022 | loss 6.5890 | norm 869.2391
446
+ 2025-06-25 06:06:40,987 | INFO | iter 000046 | lr 0.0022 | loss 6.7593 | norm 880.9367
447
+ 2025-06-25 06:06:46,195 | INFO | iter 000047 | lr 0.0022 | loss 6.6622 | norm 892.4446
448
+ 2025-06-25 06:06:51,454 | INFO | iter 000048 | lr 0.0022 | loss 6.6858 | norm 903.7693
449
+ 2025-06-25 06:06:56,659 | INFO | iter 000049 | lr 0.0022 | loss 5.7619 | norm 914.9168
450
+ 2025-06-25 06:06:56,660 | INFO | Completed LR test 7/10: lr=0.0022
451
+ 2025-06-25 06:06:56,681 | INFO | -------------------------------- EoS --------------------------------
452
+ 2025-06-25 06:06:56,681 | INFO | Starting LR test 8/10: lr=0.0023
453
+ 2025-06-25 06:06:56,681 | INFO | Starting EoS for LR factor 1.3000
454
+ 2025-06-25 06:06:56,681 | INFO | Starting EoS for checkpoint 007000
455
+ 2025-06-25 06:06:56,681 | INFO | Starting EoS for model gpt2_small
456
+ 2025-06-25 06:06:56,681 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
457
+ 2025-06-25 06:06:56,682 | INFO | Starting EoS for num_iterations 50
458
+ 2025-06-25 06:06:56,682 | INFO | Starting EoS for accum_steps 4
459
+ 2025-06-25 06:06:56,682 | INFO | Loading model and checkpoint...
460
+ 2025-06-25 06:06:57,381 | INFO | Wrapping model with DDP...
461
+ 2025-06-25 06:06:57,664 | INFO | Loading state dict...
462
+ 2025-06-25 06:06:57,668 | INFO | Model loaded successfully!
463
+ 2025-06-25 06:07:04,307 | INFO | iter 000000 | lr 0.0023 | loss 3.6876 | norm 29.7858
464
+ 2025-06-25 06:07:09,325 | INFO | iter 000001 | lr 0.0023 | loss 3.5855 | norm 58.9657
465
+ 2025-06-25 06:07:14,299 | INFO | iter 000002 | lr 0.0023 | loss 3.6563 | norm 87.7992
466
+ 2025-06-25 06:07:19,266 | INFO | iter 000003 | lr 0.0023 | loss 3.8318 | norm 116.2150
467
+ 2025-06-25 06:07:24,351 | INFO | iter 000004 | lr 0.0023 | loss 3.8853 | norm 144.1865
468
+ 2025-06-25 06:07:29,939 | INFO | iter 000005 | lr 0.0023 | loss 3.9210 | norm 171.7070
469
+ 2025-06-25 06:07:35,284 | INFO | iter 000006 | lr 0.0023 | loss 4.1628 | norm 198.7692
470
+ 2025-06-25 06:07:40,587 | INFO | iter 000007 | lr 0.0023 | loss 4.1507 | norm 225.3628
471
+ 2025-06-25 06:07:45,771 | INFO | iter 000008 | lr 0.0023 | loss 4.3096 | norm 251.4790
472
+ 2025-06-25 06:07:50,939 | INFO | iter 000009 | lr 0.0023 | loss 4.3805 | norm 277.1128
473
+ 2025-06-25 06:07:56,136 | INFO | iter 000010 | lr 0.0023 | loss 4.3467 | norm 302.2623
474
+ 2025-06-25 06:08:01,650 | INFO | iter 000011 | lr 0.0023 | loss 4.2815 | norm 326.9267
475
+ 2025-06-25 06:08:06,740 | INFO | iter 000012 | lr 0.0023 | loss 4.6548 | norm 351.1054
476
+ 2025-06-25 06:08:11,812 | INFO | iter 000013 | lr 0.0023 | loss 4.5047 | norm 374.7983
477
+ 2025-06-25 06:08:17,126 | INFO | iter 000014 | lr 0.0023 | loss 4.7994 | norm 398.0073
478
+ 2025-06-25 06:08:22,168 | INFO | iter 000015 | lr 0.0023 | loss 5.0968 | norm 420.7350
479
+ 2025-06-25 06:08:27,247 | INFO | iter 000016 | lr 0.0023 | loss 5.0512 | norm 442.9852
480
+ 2025-06-25 06:08:32,847 | INFO | iter 000017 | lr 0.0023 | loss 4.9901 | norm 464.7617
481
+ 2025-06-25 06:08:37,944 | INFO | iter 000018 | lr 0.0023 | loss 5.0242 | norm 486.0699
482
+ 2025-06-25 06:08:43,006 | INFO | iter 000019 | lr 0.0023 | loss 4.9823 | norm 506.9158
483
+ 2025-06-25 06:08:48,089 | INFO | iter 000020 | lr 0.0023 | loss 4.9903 | norm 527.3068
484
+ 2025-06-25 06:08:53,055 | INFO | iter 000021 | lr 0.0023 | loss 5.3445 | norm 547.2504
485
+ 2025-06-25 06:08:58,067 | INFO | iter 000022 | lr 0.0023 | loss 5.4917 | norm 566.7548
486
+ 2025-06-25 06:09:03,752 | INFO | iter 000023 | lr 0.0023 | loss 5.9406 | norm 585.8286
487
+ 2025-06-25 06:09:08,966 | INFO | iter 000024 | lr 0.0023 | loss 5.4620 | norm 604.4806
488
+ 2025-06-25 06:09:13,890 | INFO | iter 000025 | lr 0.0023 | loss 5.7073 | norm 622.7200
489
+ 2025-06-25 06:09:19,064 | INFO | iter 000026 | lr 0.0023 | loss 5.4810 | norm 640.5562
490
+ 2025-06-25 06:09:24,103 | INFO | iter 000027 | lr 0.0023 | loss 5.3813 | norm 657.9991
491
+ 2025-06-25 06:09:29,610 | INFO | iter 000028 | lr 0.0023 | loss 6.0298 | norm 675.0590
492
+ 2025-06-25 06:09:34,627 | INFO | iter 000029 | lr 0.0023 | loss 5.7523 | norm 691.7463
493
+ 2025-06-25 06:09:39,706 | INFO | iter 000030 | lr 0.0023 | loss 5.8543 | norm 708.0714
494
+ 2025-06-25 06:09:44,756 | INFO | iter 000031 | lr 0.0023 | loss 6.0789 | norm 724.0448
495
+ 2025-06-25 06:09:49,863 | INFO | iter 000032 | lr 0.0023 | loss 5.9534 | norm 739.6771
496
+ 2025-06-25 06:09:54,951 | INFO | iter 000033 | lr 0.0023 | loss 6.1380 | norm 754.9785
497
+ 2025-06-25 06:10:00,358 | INFO | iter 000034 | lr 0.0023 | loss 5.6740 | norm 769.9594
498
+ 2025-06-25 06:10:05,430 | INFO | iter 000035 | lr 0.0023 | loss 6.7911 | norm 784.6301
499
+ 2025-06-25 06:10:10,341 | INFO | iter 000036 | lr 0.0023 | loss 6.5128 | norm 799.0007
500
+ 2025-06-25 06:10:15,590 | INFO | iter 000037 | lr 0.0023 | loss 6.8682 | norm 813.0815
501
+ 2025-06-25 06:10:20,823 | INFO | iter 000038 | lr 0.0023 | loss 6.3677 | norm 826.8822
502
+ 2025-06-25 06:10:25,968 | INFO | iter 000039 | lr 0.0023 | loss 6.1648 | norm 840.4125
503
+ 2025-06-25 06:10:31,768 | INFO | iter 000040 | lr 0.0023 | loss 6.5954 | norm 853.6819
504
+ 2025-06-25 06:10:37,016 | INFO | iter 000041 | lr 0.0023 | loss 5.6618 | norm 866.6996
505
+ 2025-06-25 06:10:42,164 | INFO | iter 000042 | lr 0.0023 | loss 6.7585 | norm 879.4748
506
+ 2025-06-25 06:10:47,336 | INFO | iter 000043 | lr 0.0023 | loss 6.5646 | norm 892.0160
507
+ 2025-06-25 06:10:52,421 | INFO | iter 000044 | lr 0.0023 | loss 6.2502 | norm 904.3319
508
+ 2025-06-25 06:10:57,760 | INFO | iter 000045 | lr 0.0023 | loss 6.7723 | norm 916.4305
509
+ 2025-06-25 06:11:03,658 | INFO | iter 000046 | lr 0.0023 | loss 6.9262 | norm 928.3196
510
+ 2025-06-25 06:11:08,849 | INFO | iter 000047 | lr 0.0023 | loss 6.8335 | norm 940.0070
511
+ 2025-06-25 06:11:14,029 | INFO | iter 000048 | lr 0.0023 | loss 6.8953 | norm 951.5001
512
+ 2025-06-25 06:11:19,142 | INFO | iter 000049 | lr 0.0023 | loss 5.8852 | norm 962.8060
513
+ 2025-06-25 06:11:19,143 | INFO | Completed LR test 8/10: lr=0.0023
514
+ 2025-06-25 06:11:19,174 | INFO | -------------------------------- EoS --------------------------------
515
+ 2025-06-25 06:11:19,174 | INFO | Starting LR test 9/10: lr=0.0025
516
+ 2025-06-25 06:11:19,174 | INFO | Starting EoS for LR factor 1.4000
517
+ 2025-06-25 06:11:19,174 | INFO | Starting EoS for checkpoint 007000
518
+ 2025-06-25 06:11:19,174 | INFO | Starting EoS for model gpt2_small
519
+ 2025-06-25 06:11:19,175 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
520
+ 2025-06-25 06:11:19,175 | INFO | Starting EoS for num_iterations 50
521
+ 2025-06-25 06:11:19,175 | INFO | Starting EoS for accum_steps 4
522
+ 2025-06-25 06:11:19,175 | INFO | Loading model and checkpoint...
523
+ 2025-06-25 06:11:19,871 | INFO | Wrapping model with DDP...
524
+ 2025-06-25 06:11:20,178 | INFO | Loading state dict...
525
+ 2025-06-25 06:11:20,181 | INFO | Model loaded successfully!
526
+ 2025-06-25 06:11:26,593 | INFO | iter 000000 | lr 0.0025 | loss 3.6876 | norm 32.0730
527
+ 2025-06-25 06:11:32,036 | INFO | iter 000001 | lr 0.0025 | loss 3.5896 | norm 63.4489
528
+ 2025-06-25 06:11:37,229 | INFO | iter 000002 | lr 0.0025 | loss 3.6682 | norm 94.4235
529
+ 2025-06-25 06:11:42,390 | INFO | iter 000003 | lr 0.0025 | loss 3.8500 | norm 124.9154
530
+ 2025-06-25 06:11:47,646 | INFO | iter 000004 | lr 0.0025 | loss 3.9122 | norm 154.8944
531
+ 2025-06-25 06:11:52,952 | INFO | iter 000005 | lr 0.0025 | loss 3.9551 | norm 184.3538
532
+ 2025-06-25 06:11:57,954 | INFO | iter 000006 | lr 0.0025 | loss 4.2136 | norm 213.2859
533
+ 2025-06-25 06:12:03,477 | INFO | iter 000007 | lr 0.0025 | loss 4.1950 | norm 241.6798
534
+ 2025-06-25 06:12:08,505 | INFO | iter 000008 | lr 0.0025 | loss 4.3723 | norm 269.5256
535
+ 2025-06-25 06:12:13,681 | INFO | iter 000009 | lr 0.0025 | loss 4.4440 | norm 296.8186
536
+ 2025-06-25 06:12:18,787 | INFO | iter 000010 | lr 0.0025 | loss 4.4323 | norm 323.5575
537
+ 2025-06-25 06:12:23,836 | INFO | iter 000011 | lr 0.0025 | loss 4.3655 | norm 349.7422
538
+ 2025-06-25 06:12:29,157 | INFO | iter 000012 | lr 0.0025 | loss 4.7546 | norm 375.3729
539
+ 2025-06-25 06:12:34,519 | INFO | iter 000013 | lr 0.0025 | loss 4.5906 | norm 400.4503
540
+ 2025-06-25 06:12:39,712 | INFO | iter 000014 | lr 0.0025 | loss 4.9124 | norm 424.9773
541
+ 2025-06-25 06:12:45,174 | INFO | iter 000015 | lr 0.0025 | loss 5.2241 | norm 448.9581
542
+ 2025-06-25 06:12:50,358 | INFO | iter 000016 | lr 0.0025 | loss 5.1840 | norm 472.3975
543
+ 2025-06-25 06:12:55,418 | INFO | iter 000017 | lr 0.0025 | loss 5.1327 | norm 495.3010
544
+ 2025-06-25 06:13:00,485 | INFO | iter 000018 | lr 0.0025 | loss 5.1446 | norm 517.6753
545
+ 2025-06-25 06:13:06,131 | INFO | iter 000019 | lr 0.0025 | loss 5.1064 | norm 539.5286
546
+ 2025-06-25 06:13:11,225 | INFO | iter 000020 | lr 0.0025 | loss 5.1112 | norm 560.8697
547
+ 2025-06-25 06:13:16,220 | INFO | iter 000021 | lr 0.0025 | loss 5.5130 | norm 581.7085
548
+ 2025-06-25 06:13:21,342 | INFO | iter 000022 | lr 0.0025 | loss 5.6241 | norm 602.0549
549
+ 2025-06-25 06:13:26,451 | INFO | iter 000023 | lr 0.0025 | loss 6.1646 | norm 621.9195
550
+ 2025-06-25 06:13:32,022 | INFO | iter 000024 | lr 0.0025 | loss 5.6253 | norm 641.3132
551
+ 2025-06-25 06:13:37,001 | INFO | iter 000025 | lr 0.0025 | loss 5.8789 | norm 660.2472
552
+ 2025-06-25 06:13:42,161 | INFO | iter 000026 | lr 0.0025 | loss 5.5988 | norm 678.7329
553
+ 2025-06-25 06:13:47,209 | INFO | iter 000027 | lr 0.0025 | loss 5.4927 | norm 696.7825
554
+ 2025-06-25 06:13:52,260 | INFO | iter 000028 | lr 0.0025 | loss 6.2231 | norm 714.4082
555
+ 2025-06-25 06:13:57,457 | INFO | iter 000029 | lr 0.0025 | loss 5.9250 | norm 731.6225
556
+ 2025-06-25 06:14:03,013 | INFO | iter 000030 | lr 0.0025 | loss 5.9891 | norm 748.4380
557
+ 2025-06-25 06:14:08,128 | INFO | iter 000031 | lr 0.0025 | loss 6.2910 | norm 764.8672
558
+ 2025-06-25 06:14:13,228 | INFO | iter 000032 | lr 0.0025 | loss 6.1202 | norm 780.9225
559
+ 2025-06-25 06:14:18,498 | INFO | iter 000033 | lr 0.0025 | loss 6.2973 | norm 796.6163
560
+ 2025-06-25 06:14:23,706 | INFO | iter 000034 | lr 0.0025 | loss 5.7886 | norm 811.9608
561
+ 2025-06-25 06:14:28,852 | INFO | iter 000035 | lr 0.0025 | loss 6.9882 | norm 826.9680
562
+ 2025-06-25 06:14:34,397 | INFO | iter 000036 | lr 0.0025 | loss 6.7193 | norm 841.6499
563
+ 2025-06-25 06:14:39,459 | INFO | iter 000037 | lr 0.0025 | loss 7.0613 | norm 856.0185
564
+ 2025-06-25 06:14:44,530 | INFO | iter 000038 | lr 0.0025 | loss 6.5425 | norm 870.0853
565
+ 2025-06-25 06:14:49,627 | INFO | iter 000039 | lr 0.0025 | loss 6.3130 | norm 883.8614
566
+ 2025-06-25 06:14:54,817 | INFO | iter 000040 | lr 0.0025 | loss 6.7243 | norm 897.3581
567
+ 2025-06-25 06:15:00,128 | INFO | iter 000041 | lr 0.0025 | loss 5.7344 | norm 910.5860
568
+ 2025-06-25 06:15:05,761 | INFO | iter 000042 | lr 0.0025 | loss 6.9161 | norm 923.5555
569
+ 2025-06-25 06:15:10,962 | INFO | iter 000043 | lr 0.0025 | loss 6.8284 | norm 936.2766
570
+ 2025-06-25 06:15:16,242 | INFO | iter 000044 | lr 0.0025 | loss 6.4281 | norm 948.7593
571
+ 2025-06-25 06:15:21,392 | INFO | iter 000045 | lr 0.0025 | loss 6.8733 | norm 961.0127
572
+ 2025-06-25 06:15:26,535 | INFO | iter 000046 | lr 0.0025 | loss 7.0664 | norm 973.0458
573
+ 2025-06-25 06:15:31,984 | INFO | iter 000047 | lr 0.0025 | loss 6.9932 | norm 984.8674
574
+ 2025-06-25 06:15:37,095 | INFO | iter 000048 | lr 0.0025 | loss 7.0181 | norm 996.4859
575
+ 2025-06-25 06:15:42,239 | INFO | iter 000049 | lr 0.0025 | loss 6.0350 | norm 1007.9093
576
+ 2025-06-25 06:15:42,239 | INFO | Completed LR test 9/10: lr=0.0025
577
+ 2025-06-25 06:15:42,268 | INFO | -------------------------------- EoS --------------------------------
578
+ 2025-06-25 06:15:42,268 | INFO | Starting LR test 10/10: lr=0.0027
579
+ 2025-06-25 06:15:42,268 | INFO | Starting EoS for LR factor 1.5000
580
+ 2025-06-25 06:15:42,268 | INFO | Starting EoS for checkpoint 007000
581
+ 2025-06-25 06:15:42,268 | INFO | Starting EoS for model gpt2_small
582
+ 2025-06-25 06:15:42,268 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
583
+ 2025-06-25 06:15:42,268 | INFO | Starting EoS for num_iterations 50
584
+ 2025-06-25 06:15:42,268 | INFO | Starting EoS for accum_steps 4
585
+ 2025-06-25 06:15:42,268 | INFO | Loading model and checkpoint...
586
+ 2025-06-25 06:15:42,968 | INFO | Wrapping model with DDP...
587
+ 2025-06-25 06:15:43,317 | INFO | Loading state dict...
588
+ 2025-06-25 06:15:43,321 | INFO | Model loaded successfully!
589
+ 2025-06-25 06:15:49,488 | INFO | iter 000000 | lr 0.0027 | loss 3.6876 | norm 34.3605
590
+ 2025-06-25 06:15:54,389 | INFO | iter 000001 | lr 0.0027 | loss 3.5943 | norm 67.9249
591
+ 2025-06-25 06:15:59,699 | INFO | iter 000002 | lr 0.0027 | loss 3.6800 | norm 101.0297
592
+ 2025-06-25 06:16:05,013 | INFO | iter 000003 | lr 0.0027 | loss 3.8703 | norm 133.5825
593
+ 2025-06-25 06:16:10,313 | INFO | iter 000004 | lr 0.0027 | loss 3.9398 | norm 165.5491
594
+ 2025-06-25 06:16:15,323 | INFO | iter 000005 | lr 0.0027 | loss 3.9873 | norm 196.9229
595
+ 2025-06-25 06:16:20,431 | INFO | iter 000006 | lr 0.0027 | loss 4.2650 | norm 227.6963
596
+ 2025-06-25 06:16:25,491 | INFO | iter 000007 | lr 0.0027 | loss 4.2481 | norm 257.8574
597
+ 2025-06-25 06:16:31,169 | INFO | iter 000008 | lr 0.0027 | loss 4.4322 | norm 287.3958
598
+ 2025-06-25 06:16:36,127 | INFO | iter 000009 | lr 0.0027 | loss 4.5136 | norm 316.3069
599
+ 2025-06-25 06:16:41,329 | INFO | iter 000010 | lr 0.0027 | loss 4.5131 | norm 344.5903
600
+ 2025-06-25 06:16:46,561 | INFO | iter 000011 | lr 0.0027 | loss 4.4507 | norm 372.2470
601
+ 2025-06-25 06:16:51,600 | INFO | iter 000012 | lr 0.0027 | loss 4.8472 | norm 399.2780
602
+ 2025-06-25 06:16:56,803 | INFO | iter 000013 | lr 0.0027 | loss 4.6747 | norm 425.6851
603
+ 2025-06-25 06:17:02,481 | INFO | iter 000014 | lr 0.0027 | loss 5.0471 | norm 451.4725
604
+ 2025-06-25 06:17:07,499 | INFO | iter 000015 | lr 0.0027 | loss 5.3386 | norm 476.6461
605
+ 2025-06-25 06:17:12,469 | INFO | iter 000016 | lr 0.0027 | loss 5.3129 | norm 501.2121
606
+ 2025-06-25 06:17:17,681 | INFO | iter 000017 | lr 0.0027 | loss 5.2700 | norm 525.1778
607
+ 2025-06-25 06:17:22,697 | INFO | iter 000018 | lr 0.0027 | loss 5.2826 | norm 548.5519
608
+ 2025-06-25 06:17:27,938 | INFO | iter 000019 | lr 0.0027 | loss 5.2451 | norm 571.3443
609
+ 2025-06-25 06:17:33,339 | INFO | iter 000020 | lr 0.0027 | loss 5.2423 | norm 593.5663
610
+ 2025-06-25 06:17:38,686 | INFO | iter 000021 | lr 0.0027 | loss 5.6845 | norm 615.2298
611
+ 2025-06-25 06:17:43,870 | INFO | iter 000022 | lr 0.0027 | loss 5.7502 | norm 636.3470
612
+ 2025-06-25 06:17:48,833 | INFO | iter 000023 | lr 0.0027 | loss 6.3552 | norm 656.9308
613
+ 2025-06-25 06:17:54,207 | INFO | iter 000024 | lr 0.0027 | loss 5.7950 | norm 676.9943
614
+ 2025-06-25 06:17:59,920 | INFO | iter 000025 | lr 0.0027 | loss 6.0212 | norm 696.5510
615
+ 2025-06-25 06:18:05,173 | INFO | iter 000026 | lr 0.0027 | loss 5.7221 | norm 715.6146
616
+ 2025-06-25 06:18:10,751 | INFO | iter 000027 | lr 0.0027 | loss 5.6325 | norm 734.1994
617
+ 2025-06-25 06:18:15,764 | INFO | iter 000028 | lr 0.0027 | loss 6.3907 | norm 752.3203
618
+ 2025-06-25 06:18:20,879 | INFO | iter 000029 | lr 0.0027 | loss 6.0694 | norm 769.9918
619
+ 2025-06-25 06:18:26,035 | INFO | iter 000030 | lr 0.0027 | loss 6.1491 | norm 787.2289
620
+ 2025-06-25 06:18:31,673 | INFO | iter 000031 | lr 0.0027 | loss 6.4236 | norm 804.0462
621
+ 2025-06-25 06:18:36,833 | INFO | iter 000032 | lr 0.0027 | loss 6.2595 | norm 820.4583
622
+ 2025-06-25 06:18:41,950 | INFO | iter 000033 | lr 0.0027 | loss 6.5047 | norm 836.4797
623
+ 2025-06-25 06:18:47,129 | INFO | iter 000034 | lr 0.0027 | loss 5.9256 | norm 852.1246
624
+ 2025-06-25 06:18:52,172 | INFO | iter 000035 | lr 0.0027 | loss 7.2157 | norm 867.4069
625
+ 2025-06-25 06:18:57,259 | INFO | iter 000036 | lr 0.0027 | loss 6.8648 | norm 882.3406
626
+ 2025-06-25 06:19:02,908 | INFO | iter 000037 | lr 0.0027 | loss 7.2547 | norm 896.9395
627
+ 2025-06-25 06:19:08,076 | INFO | iter 000038 | lr 0.0027 | loss 6.6690 | norm 911.2167
628
+ 2025-06-25 06:19:13,095 | INFO | iter 000039 | lr 0.0027 | loss 6.4529 | norm 925.1854
629
+ 2025-06-25 06:19:18,346 | INFO | iter 000040 | lr 0.0027 | loss 6.8312 | norm 938.8581
630
+ 2025-06-25 06:19:23,338 | INFO | iter 000041 | lr 0.0027 | loss 5.8287 | norm 952.2471
631
+ 2025-06-25 06:19:28,610 | INFO | iter 000042 | lr 0.0027 | loss 7.0746 | norm 965.3642
632
+ 2025-06-25 06:19:33,936 | INFO | iter 000043 | lr 0.0027 | loss 7.0014 | norm 978.2209
633
+ 2025-06-25 06:19:39,018 | INFO | iter 000044 | lr 0.0027 | loss 6.5323 | norm 990.8282
634
+ 2025-06-25 06:19:44,171 | INFO | iter 000045 | lr 0.0027 | loss 7.0292 | norm 1003.1965
635
+ 2025-06-25 06:19:49,407 | INFO | iter 000046 | lr 0.0027 | loss 7.2137 | norm 1015.3359
636
+ 2025-06-25 06:19:54,614 | INFO | iter 000047 | lr 0.0027 | loss 7.1640 | norm 1027.2564
637
+ 2025-06-25 06:19:59,877 | INFO | iter 000048 | lr 0.0027 | loss 7.1574 | norm 1038.9671
638
+ 2025-06-25 06:20:05,133 | INFO | iter 000049 | lr 0.0027 | loss 6.1115 | norm 1050.4771
639
+ 2025-06-25 06:20:05,133 | INFO | Completed LR test 10/10: lr=0.0027
640
+ 2025-06-25 06:20:05,458 | INFO | Cleanup complete
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "gpt2_small",
3
+ "factor_min": 0.6,
4
+ "factor_max": 1.5,
5
+ "factor_num": 10,
6
+ "error": 0.0001,
7
+ "accum_steps": 4,
8
+ "num_iterations": 50,
9
+ "num_checkpoint": 2000,
10
+ "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
11
+ "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
12
+ "timestamp": "250622_035242",
13
+ "raw": false
14
+ }
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/losses_lr.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr_iter.png ADDED
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/training.log ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-06-25 08:05:01,878 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
2
+ 2025-06-25 08:05:03,791 | INFO | Loaded checkpoint with optimizer: adam
3
+ 2025-06-25 08:05:03,792 | INFO | Current learning rate: 0.0018
4
+ 2025-06-25 08:05:04,404 | INFO | Weight decay: 0.1
5
+ 2025-06-25 08:05:04,404 | INFO | Epsilon: 1e-08
6
+ 2025-06-25 08:05:04,404 | INFO | Loaded 147 first moment (m) buffers
7
+ 2025-06-25 08:05:04,404 | INFO | Loaded 147 second moment (v) buffers
8
+ 2025-06-25 08:05:04,404 | INFO | Optimizer state loading completed!
9
+ 2025-06-25 08:05:06,318 | INFO | Initialized xs with norm: 1.273501
10
+ 2025-06-25 08:05:06,326 | INFO | -------------------------------- EoS --------------------------------
11
+ 2025-06-25 08:05:06,326 | INFO | Starting LR test 1/10: lr=0.1800
12
+ 2025-06-25 08:05:06,326 | INFO | Starting EoS for LR factor 100.0000
13
+ 2025-06-25 08:05:06,326 | INFO | Starting EoS for checkpoint 010000
14
+ 2025-06-25 08:05:06,326 | INFO | Starting EoS for model gpt2_small
15
+ 2025-06-25 08:05:06,326 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
16
+ 2025-06-25 08:05:06,326 | INFO | Starting EoS for num_iterations 50
17
+ 2025-06-25 08:05:06,326 | INFO | Starting EoS for accum_steps 4
18
+ 2025-06-25 08:05:06,326 | INFO | Loading model and checkpoint...
19
+ 2025-06-25 08:05:07,207 | INFO | Wrapping model with DDP...
20
+ 2025-06-25 08:05:07,346 | INFO | Loading state dict...
21
+ 2025-06-25 08:05:07,350 | INFO | Model loaded successfully!
22
+ 2025-06-25 08:05:14,155 | INFO | iter 000000 | lr 0.1800 | loss 3.7162 | norm 1035.2508
23
+ 2025-06-25 08:05:19,180 | INFO | iter 000001 | lr 0.1800 | loss 1022.7610 | norm 1854.6921
24
+ 2025-06-25 08:05:24,315 | INFO | iter 000002 | lr 0.1800 | loss 12916.4541 | norm 2262.8654
25
+ 2025-06-25 08:05:29,639 | INFO | iter 000003 | lr 0.1800 | loss 2381.4231 | norm 2784.9186
26
+ 2025-06-25 08:05:35,017 | INFO | iter 000004 | lr 0.1800 | loss 5663.5991 | norm 3258.5670
27
+ 2025-06-25 08:05:40,197 | INFO | iter 000005 | lr 0.1800 | loss 6263.7207 | norm 3659.7969
28
+ 2025-06-25 08:05:45,423 | INFO | iter 000006 | lr 0.1800 | loss 3300.3076 | norm 4042.1641
29
+ 2025-06-25 08:05:50,614 | INFO | iter 000007 | lr 0.1800 | loss 2205.2788 | norm 4403.1766
30
+ 2025-06-25 08:05:55,705 | INFO | iter 000008 | lr 0.1800 | loss 3169.1311 | norm 4721.1078
31
+ 2025-06-25 08:06:01,395 | INFO | iter 000009 | lr 0.1800 | loss 2604.3735 | norm 5007.2289
32
+ 2025-06-25 08:06:06,537 | INFO | iter 000010 | lr 0.1800 | loss 1049.3635 | norm 5277.7821
33
+ 2025-06-25 08:06:11,635 | INFO | iter 000011 | lr 0.1800 | loss -534.1947 | norm 5542.2629
34
+ 2025-06-25 08:06:16,925 | INFO | iter 000012 | lr 0.1800 | loss -1829.3038 | norm 5804.4541
35
+ 2025-06-25 08:06:22,270 | INFO | iter 000013 | lr 0.1800 | loss -3147.5239 | norm 6068.7285
36
+ 2025-06-25 08:06:27,494 | INFO | iter 000014 | lr 0.1800 | loss -5675.9150 | norm 6341.2563
37
+ 2025-06-25 08:06:33,272 | INFO | iter 000015 | lr 0.1800 | loss -8397.9707 | norm 6625.5651
38
+ 2025-06-25 08:06:38,517 | INFO | iter 000016 | lr 0.1800 | loss -12464.1982 | norm 6928.3923
39
+ 2025-06-25 08:06:43,692 | INFO | iter 000017 | lr 0.1800 | loss -19611.1348 | norm 7248.3939
40
+ 2025-06-25 08:06:48,881 | INFO | iter 000018 | lr 0.1800 | loss -27940.6465 | norm 7598.8381
41
+ 2025-06-25 08:06:54,254 | INFO | iter 000019 | lr 0.1800 | loss -37102.6367 | norm 7980.6707
42
+ 2025-06-25 08:06:59,396 | INFO | iter 000020 | lr 0.1800 | loss -41850.2695 | norm 8391.1617
43
+ 2025-06-25 08:07:04,889 | INFO | iter 000021 | lr 0.1800 | loss -55457.6641 | norm 8833.9090
44
+ 2025-06-25 08:07:10,211 | INFO | iter 000022 | lr 0.1800 | loss -77246.1016 | norm 9308.0797
45
+ 2025-06-25 08:07:15,407 | INFO | iter 000023 | lr 0.1800 | loss -88869.9531 | norm 9810.1623
46
+ 2025-06-25 08:07:20,721 | INFO | iter 000024 | lr 0.1800 | loss -106978.7188 | norm 10315.7090
47
+ 2025-06-25 08:07:26,169 | INFO | iter 000025 | lr 0.1800 | loss -128147.9297 | norm 10848.4611
48
+ 2025-06-25 08:07:31,697 | INFO | iter 000026 | lr 0.1800 | loss -167841.3906 | norm 11408.8324
49
+ 2025-06-25 08:07:37,001 | INFO | iter 000027 | lr 0.1800 | loss -180567.0000 | norm 11986.0506
50
+ 2025-06-25 08:07:42,165 | INFO | iter 000028 | lr 0.1800 | loss -195498.7031 | norm 12575.1052
51
+ 2025-06-25 08:07:47,397 | INFO | iter 000029 | lr 0.1800 | loss -226350.6406 | norm 13174.6499
52
+ 2025-06-25 08:07:52,643 | INFO | iter 000030 | lr 0.1800 | loss -278469.5625 | norm 13789.2878
53
+ 2025-06-25 08:07:57,699 | INFO | iter 000031 | lr 0.1800 | loss -291369.5000 | norm 14408.3851
54
+ 2025-06-25 08:08:03,348 | INFO | iter 000032 | lr 0.1800 | loss -340664.4062 | norm 15027.4904
55
+ 2025-06-25 08:08:08,536 | INFO | iter 000033 | lr 0.1800 | loss -381711.4062 | norm 15650.6969
56
+ 2025-06-25 08:08:13,710 | INFO | iter 000034 | lr 0.1800 | loss -447641.5625 | norm 16275.4851
57
+ 2025-06-25 08:08:18,817 | INFO | iter 000035 | lr 0.1800 | loss -478080.0938 | norm 16906.2037
58
+ 2025-06-25 08:08:23,855 | INFO | iter 000036 | lr 0.1800 | loss -474737.6875 | norm 17529.9377
59
+ 2025-06-25 08:08:29,137 | INFO | iter 000037 | lr 0.1800 | loss -562731.2500 | norm 18158.1498
60
+ 2025-06-25 08:08:34,554 | INFO | iter 000038 | lr 0.1800 | loss -671918.5000 | norm 18787.8108
61
+ 2025-06-25 08:08:39,659 | INFO | iter 000039 | lr 0.1800 | loss -611043.3750 | norm 19413.9402
62
+ 2025-06-25 08:08:44,757 | INFO | iter 000040 | lr 0.1800 | loss -695682.8750 | norm 20036.0407
63
+ 2025-06-25 08:08:49,914 | INFO | iter 000041 | lr 0.1800 | loss -780705.8750 | norm 20644.8346
64
+ 2025-06-25 08:08:55,158 | INFO | iter 000042 | lr 0.1800 | loss -995004.5625 | norm 21239.0209
65
+ 2025-06-25 08:09:00,499 | INFO | iter 000043 | lr 0.1800 | loss -818858.2500 | norm 21828.2696
66
+ 2025-06-25 08:09:05,819 | INFO | iter 000044 | lr 0.1800 | loss -880965.5000 | norm 22411.6198
67
+ 2025-06-25 08:09:11,010 | INFO | iter 000045 | lr 0.1800 | loss -1003927.8125 | norm 22994.3220
68
+ 2025-06-25 08:09:16,158 | INFO | iter 000046 | lr 0.1800 | loss -1131396.7500 | norm 23577.3465
69
+ 2025-06-25 08:09:21,395 | INFO | iter 000047 | lr 0.1800 | loss -1100723.7500 | norm 24149.2858
70
+ 2025-06-25 08:09:26,617 | INFO | iter 000048 | lr 0.1800 | loss -1096843.2500 | norm 24709.4051
71
+ 2025-06-25 08:09:32,274 | INFO | iter 000049 | lr 0.1800 | loss -1231046.2500 | norm 25267.4215
72
+ 2025-06-25 08:09:32,275 | INFO | Completed LR test 1/10: lr=0.1800
73
+ 2025-06-25 08:09:32,476 | INFO | Cleanup complete
74
+ 2025-06-25 08:14:04,088 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
75
+ 2025-06-25 08:14:06,081 | INFO | Loaded checkpoint with optimizer: adam
76
+ 2025-06-25 08:14:06,081 | INFO | Current learning rate: 0.0018
77
+ 2025-06-25 08:14:06,733 | INFO | Weight decay: 0.1
78
+ 2025-06-25 08:14:06,733 | INFO | Epsilon: 1e-08
79
+ 2025-06-25 08:14:06,733 | INFO | Loaded 147 first moment (m) buffers
80
+ 2025-06-25 08:14:06,733 | INFO | Loaded 147 second moment (v) buffers
81
+ 2025-06-25 08:14:06,733 | INFO | Optimizer state loading completed!
82
+ 2025-06-25 08:14:08,702 | INFO | Initialized xs with norm: 1.273654
83
+ 2025-06-25 08:14:08,705 | INFO | -------------------------------- EoS --------------------------------
84
+ 2025-06-25 08:14:08,705 | INFO | Starting LR test 1/10: lr=0.1800
85
+ 2025-06-25 08:14:08,705 | INFO | Starting EoS for LR factor 100.0000
86
+ 2025-06-25 08:14:08,705 | INFO | Starting EoS for checkpoint 010000
87
+ 2025-06-25 08:14:08,706 | INFO | Starting EoS for model gpt2_small
88
+ 2025-06-25 08:14:08,706 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
89
+ 2025-06-25 08:14:08,706 | INFO | Starting EoS for num_iterations 50
90
+ 2025-06-25 08:14:08,706 | INFO | Starting EoS for accum_steps 4
91
+ 2025-06-25 08:14:08,706 | INFO | Loading model and checkpoint...
92
+ 2025-06-25 08:14:09,604 | INFO | Wrapping model with DDP...
93
+ 2025-06-25 08:14:09,826 | INFO | Loading state dict...
94
+ 2025-06-25 08:14:09,830 | INFO | Model loaded successfully!
95
+ 2025-06-25 08:14:16,113 | INFO | iter 000000 | lr 0.1800 | loss 3.7162 | norm 458.7718
96
+ 2025-06-25 08:14:21,223 | INFO | iter 000001 | lr 0.1800 | loss 207.3757 | norm 1023.7285
97
+ 2025-06-25 08:14:26,544 | INFO | iter 000002 | lr 0.1800 | loss 5262.5127 | norm 1344.5259
98
+ 2025-06-25 08:14:32,322 | INFO | iter 000003 | lr 0.1800 | loss 1121.6940 | norm 1825.7758
99
+ 2025-06-25 08:14:37,581 | INFO | iter 000004 | lr 0.1800 | loss 3489.6094 | norm 2256.3649
100
+ 2025-06-25 08:14:42,793 | INFO | iter 000005 | lr 0.1800 | loss 2611.4473 | norm 2665.4942
101
+ 2025-06-25 08:14:47,946 | INFO | iter 000006 | lr 0.1800 | loss 1170.9673 | norm 3086.3505
102
+ 2025-06-25 08:14:53,249 | INFO | iter 000007 | lr 0.1800 | loss 1968.7089 | norm 3471.1555
103
+ 2025-06-25 08:14:58,293 | INFO | iter 000008 | lr 0.1800 | loss 2037.9554 | norm 3819.5898
104
+ 2025-06-25 08:15:04,132 | INFO | iter 000009 | lr 0.1800 | loss 831.5104 | norm 4163.8926
105
+ 2025-06-25 08:15:09,325 | INFO | iter 000010 | lr 0.1800 | loss 281.9943 | norm 4504.7350
106
+ 2025-06-25 08:15:14,666 | INFO | iter 000011 | lr 0.1800 | loss -326.7262 | norm 4836.5242
107
+ 2025-06-25 08:15:19,988 | INFO | iter 000012 | lr 0.1800 | loss -1960.6803 | norm 5168.6060
108
+ 2025-06-25 08:15:25,055 | INFO | iter 000013 | lr 0.1800 | loss -4240.2627 | norm 5518.1032
109
+ 2025-06-25 08:15:30,756 | INFO | iter 000014 | lr 0.1800 | loss -7296.6255 | norm 5891.2474
110
+ 2025-06-25 08:15:35,838 | INFO | iter 000015 | lr 0.1800 | loss -10337.6426 | norm 6286.4372
111
+ 2025-06-25 08:15:40,922 | INFO | iter 000016 | lr 0.1800 | loss -15605.5254 | norm 6710.1341
112
+ 2025-06-25 08:15:46,100 | INFO | iter 000017 | lr 0.1800 | loss -25703.7539 | norm 7163.8796
113
+ 2025-06-25 08:15:51,309 | INFO | iter 000018 | lr 0.1800 | loss -37443.0664 | norm 7659.6704
114
+ 2025-06-25 08:15:56,583 | INFO | iter 000019 | lr 0.1800 | loss -50078.2383 | norm 8193.8946
115
+ 2025-06-25 08:16:01,998 | INFO | iter 000020 | lr 0.1800 | loss -57032.1875 | norm 8760.0695
116
+ 2025-06-25 08:16:07,120 | INFO | iter 000021 | lr 0.1800 | loss -76344.0781 | norm 9360.5588
117
+ 2025-06-25 08:16:12,341 | INFO | iter 000022 | lr 0.1800 | loss -106728.8047 | norm 9991.8501
118
+ 2025-06-25 08:16:17,385 | INFO | iter 000023 | lr 0.1800 | loss -122665.7422 | norm 10647.0357
119
+ 2025-06-25 08:16:22,444 | INFO | iter 000024 | lr 0.1800 | loss -146900.8906 | norm 11291.6713
120
+ 2025-06-25 08:16:27,677 | INFO | iter 000025 | lr 0.1800 | loss -175862.7344 | norm 11958.6899
121
+ 2025-06-25 08:16:33,138 | INFO | iter 000026 | lr 0.1800 | loss -229347.8750 | norm 12648.0304
122
+ 2025-06-25 08:16:38,360 | INFO | iter 000027 | lr 0.1800 | loss -245424.7344 | norm 13346.8304
123
+ 2025-06-25 08:16:43,584 | INFO | iter 000028 | lr 0.1800 | loss -263935.7500 | norm 14047.6949
124
+ 2025-06-25 08:16:48,605 | INFO | iter 000029 | lr 0.1800 | loss -304780.0625 | norm 14750.5359
125
+ 2025-06-25 08:16:53,925 | INFO | iter 000030 | lr 0.1800 | loss -372978.8438 | norm 15461.7537
126
+ 2025-06-25 08:16:59,207 | INFO | iter 000031 | lr 0.1800 | loss -386555.4062 | norm 16169.9843
127
+ 2025-06-25 08:17:04,766 | INFO | iter 000032 | lr 0.1800 | loss -447786.1250 | norm 16870.0891
128
+ 2025-06-25 08:17:09,788 | INFO | iter 000033 | lr 0.1800 | loss -501011.2500 | norm 17566.7465
129
+ 2025-06-25 08:17:14,895 | INFO | iter 000034 | lr 0.1800 | loss -584175.8750 | norm 18257.1731
130
+ 2025-06-25 08:17:20,270 | INFO | iter 000035 | lr 0.1800 | loss -620550.2500 | norm 18947.4654
131
+ 2025-06-25 08:17:25,581 | INFO | iter 000036 | lr 0.1800 | loss -611509.9375 | norm 19623.6972
132
+ 2025-06-25 08:17:31,267 | INFO | iter 000037 | lr 0.1800 | loss -720793.8750 | norm 20299.6148
133
+ 2025-06-25 08:17:36,391 | INFO | iter 000038 | lr 0.1800 | loss -856993.6875 | norm 20972.3499
134
+ 2025-06-25 08:17:41,749 | INFO | iter 000039 | lr 0.1800 | loss -774917.7500 | norm 21636.3731
135
+ 2025-06-25 08:17:46,809 | INFO | iter 000040 | lr 0.1800 | loss -875320.1250 | norm 22291.3078
136
+ 2025-06-25 08:17:51,936 | INFO | iter 000041 | lr 0.1800 | loss -978267.6250 | norm 22928.2164
137
+ 2025-06-25 08:17:57,182 | INFO | iter 000042 | lr 0.1800 | loss -1236919.7500 | norm 23545.3115
138
+ 2025-06-25 08:18:02,979 | INFO | iter 000043 | lr 0.1800 | loss -1016738.6875 | norm 24153.6447
139
+ 2025-06-25 08:18:08,099 | INFO | iter 000044 | lr 0.1800 | loss -1085286.3750 | norm 24751.7884
140
+ 2025-06-25 08:18:13,218 | INFO | iter 000045 | lr 0.1800 | loss -1233294.6250 | norm 25346.0331
141
+ 2025-06-25 08:18:18,370 | INFO | iter 000046 | lr 0.1800 | loss -1382542.3750 | norm 25937.9944
142
+ 2025-06-25 08:18:23,739 | INFO | iter 000047 | lr 0.1800 | loss -1339756.2500 | norm 26515.0226
143
+ 2025-06-25 08:18:29,372 | INFO | iter 000048 | lr 0.1800 | loss -1327051.5000 | norm 27076.3794
144
+ 2025-06-25 08:18:34,762 | INFO | iter 000049 | lr 0.1800 | loss -1484669.5000 | norm 27633.4660
145
+ 2025-06-25 08:18:34,762 | INFO | Completed LR test 1/10: lr=0.1800
146
+ 2025-06-25 08:18:34,933 | INFO | Cleanup complete
147
+ 2025-06-25 08:19:09,783 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
148
+ 2025-06-25 08:19:11,705 | INFO | Loaded checkpoint with optimizer: adam
149
+ 2025-06-25 08:19:11,705 | INFO | Current learning rate: 0.0018
150
+ 2025-06-25 08:19:12,319 | INFO | Weight decay: 0.1
151
+ 2025-06-25 08:19:12,319 | INFO | Epsilon: 1e-08
152
+ 2025-06-25 08:19:12,319 | INFO | Loaded 147 first moment (m) buffers
153
+ 2025-06-25 08:19:12,319 | INFO | Loaded 147 second moment (v) buffers
154
+ 2025-06-25 08:19:12,319 | INFO | Optimizer state loading completed!
155
+ 2025-06-25 08:19:14,382 | INFO | Initialized xs with norm: 1.273542
156
+ 2025-06-25 08:19:14,386 | INFO | -------------------------------- EoS --------------------------------
157
+ 2025-06-25 08:19:14,386 | INFO | Starting LR test 1/10: lr=0.1800
158
+ 2025-06-25 08:19:14,387 | INFO | Starting EoS for LR factor 100.0000
159
+ 2025-06-25 08:19:14,387 | INFO | Starting EoS for checkpoint 010000
160
+ 2025-06-25 08:19:14,387 | INFO | Starting EoS for model gpt2_small
161
+ 2025-06-25 08:19:14,387 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
162
+ 2025-06-25 08:19:14,387 | INFO | Starting EoS for num_iterations 50
163
+ 2025-06-25 08:19:14,387 | INFO | Starting EoS for accum_steps 4
164
+ 2025-06-25 08:19:14,387 | INFO | Loading model and checkpoint...
165
+ 2025-06-25 08:19:15,188 | INFO | Wrapping model with DDP...
166
+ 2025-06-25 08:19:15,646 | INFO | Loading state dict...
167
+ 2025-06-25 08:19:15,649 | INFO | Model loaded successfully!
168
+ 2025-06-25 08:19:22,000 | INFO | iter 000000 | lr 0.1800 | loss 3.5472 | norm 471.7711
169
+ 2025-06-25 08:19:27,321 | INFO | iter 000001 | lr 0.1800 | loss 258.9568 | norm 1033.3546
170
+ 2025-06-25 08:19:33,176 | INFO | iter 000002 | lr 0.1800 | loss 6505.9888 | norm 1313.8736
171
+ 2025-06-25 08:19:38,376 | INFO | iter 000003 | lr 0.1800 | loss 720.6740 | norm 1808.7756
172
+ 2025-06-25 08:19:43,461 | INFO | iter 000004 | lr 0.1800 | loss 4549.4497 | norm 2236.3996
173
+ 2025-06-25 08:19:48,625 | INFO | iter 000005 | lr 0.1800 | loss 3147.6418 | norm 2648.9176
174
+ 2025-06-25 08:19:53,829 | INFO | iter 000006 | lr 0.1800 | loss 1210.4675 | norm 3081.6848
175
+ 2025-06-25 08:19:58,930 | INFO | iter 000007 | lr 0.1800 | loss 2729.4216 | norm 3473.4778
176
+ 2025-06-25 08:20:04,381 | INFO | iter 000008 | lr 0.1800 | loss 2761.1892 | norm 3830.6859
177
+ 2025-06-25 08:20:09,528 | INFO | iter 000009 | lr 0.1800 | loss 1629.8550 | norm 4172.5473
178
+ 2025-06-25 08:20:14,583 | INFO | iter 000010 | lr 0.1800 | loss 738.9063 | norm 4510.9757
179
+ 2025-06-25 08:20:19,734 | INFO | iter 000011 | lr 0.1800 | loss 429.7834 | norm 4842.2049
180
+ 2025-06-25 08:20:24,921 | INFO | iter 000012 | lr 0.1800 | loss -247.2831 | norm 5150.2921
181
+ 2025-06-25 08:20:30,290 | INFO | iter 000013 | lr 0.1800 | loss -2646.1260 | norm 5471.6201
182
+ 2025-06-25 08:20:35,302 | INFO | iter 000014 | lr 0.1800 | loss -5207.6968 | norm 5812.3796
183
+ 2025-06-25 08:20:40,540 | INFO | iter 000015 | lr 0.1800 | loss -6475.6606 | norm 6168.6810
184
+ 2025-06-25 08:20:45,761 | INFO | iter 000016 | lr 0.1800 | loss -11955.2441 | norm 6540.5293
185
+ 2025-06-25 08:20:50,785 | INFO | iter 000017 | lr 0.1800 | loss -18120.8965 | norm 6942.5453
186
+ 2025-06-25 08:20:56,027 | INFO | iter 000018 | lr 0.1800 | loss -27042.0312 | norm 7375.6815
187
+ 2025-06-25 08:21:01,764 | INFO | iter 000019 | lr 0.1800 | loss -32353.4766 | norm 7844.6077
188
+ 2025-06-25 08:21:06,819 | INFO | iter 000020 | lr 0.1800 | loss -42713.4258 | norm 8350.1531
189
+ 2025-06-25 08:21:11,942 | INFO | iter 000021 | lr 0.1800 | loss -63141.9570 | norm 8895.7420
190
+ 2025-06-25 08:21:17,296 | INFO | iter 000022 | lr 0.1800 | loss -79225.8125 | norm 9464.6960
191
+ 2025-06-25 08:21:22,364 | INFO | iter 000023 | lr 0.1800 | loss -85295.2188 | norm 10064.8914
192
+ 2025-06-25 08:21:27,555 | INFO | iter 000024 | lr 0.1800 | loss -104268.3984 | norm 10670.1012
193
+ 2025-06-25 08:21:33,391 | INFO | iter 000025 | lr 0.1800 | loss -138191.5625 | norm 11306.4591
194
+ 2025-06-25 08:21:38,449 | INFO | iter 000026 | lr 0.1800 | loss -130759.4922 | norm 11963.0958
195
+ 2025-06-25 08:21:43,851 | INFO | iter 000027 | lr 0.1800 | loss -151880.2188 | norm 12623.4203
196
+ 2025-06-25 08:21:48,943 | INFO | iter 000028 | lr 0.1800 | loss -199277.5312 | norm 13299.8124
197
+ 2025-06-25 08:21:54,133 | INFO | iter 000029 | lr 0.1800 | loss -231175.6094 | norm 13985.3231
198
+ 2025-06-25 08:21:59,601 | INFO | iter 000030 | lr 0.1800 | loss -227062.6875 | norm 14638.4871
199
+ 2025-06-25 08:22:04,755 | INFO | iter 000031 | lr 0.1800 | loss -305321.5312 | norm 15300.2854
200
+ 2025-06-25 08:22:09,996 | INFO | iter 000032 | lr 0.1800 | loss -312911.1562 | norm 15967.4907
201
+ 2025-06-25 08:22:15,180 | INFO | iter 000033 | lr 0.1800 | loss -394100.9688 | norm 16633.2168
202
+ 2025-06-25 08:22:20,480 | INFO | iter 000034 | lr 0.1800 | loss -396323.7812 | norm 17301.4404
203
+ 2025-06-25 08:22:25,744 | INFO | iter 000035 | lr 0.1800 | loss -495372.3750 | norm 17975.7867
204
+ 2025-06-25 08:22:31,676 | INFO | iter 000036 | lr 0.1800 | loss -536708.7500 | norm 18632.5354
205
+ 2025-06-25 08:22:36,888 | INFO | iter 000037 | lr 0.1800 | loss -565078.6250 | norm 19274.2470
206
+ 2025-06-25 08:22:42,081 | INFO | iter 000038 | lr 0.1800 | loss -567815.5000 | norm 19919.4392
207
+ 2025-06-25 08:22:47,333 | INFO | iter 000039 | lr 0.1800 | loss -618632.5625 | norm 20562.6429
208
+ 2025-06-25 08:22:52,496 | INFO | iter 000040 | lr 0.1800 | loss -746847.8750 | norm 21208.1400
209
+ 2025-06-25 08:22:57,533 | INFO | iter 000041 | lr 0.1800 | loss -694621.2500 | norm 21850.9988
210
+ 2025-06-25 08:23:03,478 | INFO | iter 000042 | lr 0.1800 | loss -805244.6250 | norm 22487.1557
211
+ 2025-06-25 08:23:08,470 | INFO | iter 000043 | lr 0.1800 | loss -911537.5000 | norm 23123.2202
212
+ 2025-06-25 08:23:13,595 | INFO | iter 000044 | lr 0.1800 | loss -935658.9375 | norm 23750.3923
213
+ 2025-06-25 08:23:18,852 | INFO | iter 000045 | lr 0.1800 | loss -877843.8125 | norm 24361.4878
214
+ 2025-06-25 08:23:23,961 | INFO | iter 000046 | lr 0.1800 | loss -1091743.1250 | norm 24972.6994
215
+ 2025-06-25 08:23:29,213 | INFO | iter 000047 | lr 0.1800 | loss -1240315.0000 | norm 25579.9303
216
+ 2025-06-25 08:23:34,577 | INFO | iter 000048 | lr 0.1800 | loss -1015323.0000 | norm 26170.5807
217
+ 2025-06-25 08:23:39,739 | INFO | iter 000049 | lr 0.1800 | loss -1083682.8750 | norm 26751.1997
218
+ 2025-06-25 08:23:39,739 | INFO | Completed LR test 1/10: lr=0.1800
219
+ 2025-06-25 08:23:40,094 | INFO | Cleanup complete
220
+ 2025-06-25 08:25:41,547 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
221
+ 2025-06-25 08:25:43,132 | INFO | Loaded checkpoint with optimizer: adam
222
+ 2025-06-25 08:25:43,133 | INFO | Current learning rate: 0.0018
223
+ 2025-06-25 08:25:43,761 | INFO | Weight decay: 0.1
224
+ 2025-06-25 08:25:43,761 | INFO | Epsilon: 1e-08
225
+ 2025-06-25 08:25:43,761 | INFO | Loaded 147 first moment (m) buffers
226
+ 2025-06-25 08:25:43,761 | INFO | Loaded 147 second moment (v) buffers
227
+ 2025-06-25 08:25:43,761 | INFO | Optimizer state loading completed!
228
+ 2025-06-25 08:25:45,718 | INFO | Initialized xs with norm: 1.273535
229
+ 2025-06-25 08:25:45,726 | INFO | -------------------------------- EoS --------------------------------
230
+ 2025-06-25 08:25:45,726 | INFO | Starting LR test 1/10: lr=0.1800
231
+ 2025-06-25 08:25:45,726 | INFO | Starting EoS for LR factor 100.0000
232
+ 2025-06-25 08:25:45,726 | INFO | Starting EoS for checkpoint 010000
233
+ 2025-06-25 08:25:45,726 | INFO | Starting EoS for model gpt2_small
234
+ 2025-06-25 08:25:45,726 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
235
+ 2025-06-25 08:25:45,726 | INFO | Starting EoS for num_iterations 50
236
+ 2025-06-25 08:25:45,726 | INFO | Starting EoS for accum_steps 4
237
+ 2025-06-25 08:25:45,726 | INFO | Loading model and checkpoint...
238
+ 2025-06-25 08:25:46,600 | INFO | Wrapping model with DDP...
239
+ 2025-06-25 08:25:46,828 | INFO | Loading state dict...
240
+ 2025-06-25 08:25:46,831 | INFO | Model loaded successfully!
241
+ 2025-06-25 08:25:54,406 | INFO | iter 000000 | lr 0.1800 | loss 0.0000 | norm 471.7869
242
+ 2025-06-25 08:25:59,712 | INFO | iter 000001 | lr 0.1800 | loss 255.7516 | norm 1033.5834
243
+ 2025-06-25 08:26:04,882 | INFO | iter 000002 | lr 0.1800 | loss 6477.0049 | norm 1315.8706
244
+ 2025-06-25 08:26:10,059 | INFO | iter 000003 | lr 0.1800 | loss 717.1234 | norm 1811.0524
245
+ 2025-06-25 08:26:15,184 | INFO | iter 000004 | lr 0.1800 | loss 4544.1074 | norm 2238.2034
246
+ 2025-06-25 08:26:20,441 | INFO | iter 000005 | lr 0.1800 | loss 3125.7515 | norm 2650.5604
247
+ 2025-06-25 08:26:25,413 | INFO | iter 000006 | lr 0.1800 | loss 1199.5897 | norm 3083.3089
248
+ 2025-06-25 08:26:30,945 | INFO | iter 000007 | lr 0.1800 | loss 2725.4370 | norm 3475.1633
249
+ 2025-06-25 08:26:36,160 | INFO | iter 000008 | lr 0.1800 | loss 2744.2969 | norm 3832.5715
250
+ 2025-06-25 08:26:41,444 | INFO | iter 000009 | lr 0.1800 | loss 1625.9283 | norm 4174.4268
251
+ 2025-06-25 08:26:46,671 | INFO | iter 000010 | lr 0.1800 | loss 732.4396 | norm 4512.6526
252
+ 2025-06-25 08:26:51,717 | INFO | iter 000011 | lr 0.1800 | loss 401.3749 | norm 4843.9269
253
+ 2025-06-25 08:26:56,797 | INFO | iter 000012 | lr 0.1800 | loss -414.9888 | norm 5153.5810
254
+ 2025-06-25 08:27:02,557 | INFO | iter 000013 | lr 0.1800 | loss -2720.9531 | norm 5476.3233
255
+ 2025-06-25 08:27:07,729 | INFO | iter 000014 | lr 0.1800 | loss -5391.8271 | norm 5818.1825
256
+ 2025-06-25 08:27:12,870 | INFO | iter 000015 | lr 0.1800 | loss -6670.7456 | norm 6176.3078
257
+ 2025-06-25 08:27:18,153 | INFO | iter 000016 | lr 0.1800 | loss -12359.0928 | norm 6550.6398
258
+ 2025-06-25 08:27:23,424 | INFO | iter 000017 | lr 0.1800 | loss -18700.9395 | norm 6955.5065
259
+ 2025-06-25 08:27:28,958 | INFO | iter 000018 | lr 0.1800 | loss -28184.4316 | norm 7388.6123
260
+ 2025-06-25 08:27:34,339 | INFO | iter 000019 | lr 0.1800 | loss -33184.2344 | norm 7858.2671
261
+ 2025-06-25 08:27:39,610 | INFO | iter 000020 | lr 0.1800 | loss -43730.0781 | norm 8364.9345
262
+ 2025-06-25 08:27:44,683 | INFO | iter 000021 | lr 0.1800 | loss -64642.3008 | norm 8911.7296
263
+ 2025-06-25 08:27:49,940 | INFO | iter 000022 | lr 0.1800 | loss -81204.4609 | norm 9481.5446
264
+ 2025-06-25 08:27:55,204 | INFO | iter 000023 | lr 0.1800 | loss -86869.9766 | norm 10082.6514
265
+ 2025-06-25 08:28:00,953 | INFO | iter 000024 | lr 0.1800 | loss -105953.4062 | norm 10687.3279
266
+ 2025-06-25 08:28:06,307 | INFO | iter 000025 | lr 0.1800 | loss -140775.6562 | norm 11322.9195
267
+ 2025-06-25 08:28:11,582 | INFO | iter 000026 | lr 0.1800 | loss -132647.5938 | norm 11978.4767
268
+ 2025-06-25 08:28:16,754 | INFO | iter 000027 | lr 0.1800 | loss -153962.5781 | norm 12637.7207
269
+ 2025-06-25 08:28:21,813 | INFO | iter 000028 | lr 0.1800 | loss -202029.5312 | norm 13312.6754
270
+ 2025-06-25 08:28:27,054 | INFO | iter 000029 | lr 0.1800 | loss -234245.6406 | norm 13996.1748
271
+ 2025-06-25 08:28:33,097 | INFO | iter 000030 | lr 0.1800 | loss -229900.0625 | norm 14646.0295
272
+ 2025-06-25 08:28:38,339 | INFO | iter 000031 | lr 0.1800 | loss -308670.3125 | norm 15304.3426
273
+ 2025-06-25 08:28:43,514 | INFO | iter 000032 | lr 0.1800 | loss -316647.1562 | norm 15968.0695
274
+ 2025-06-25 08:28:48,671 | INFO | iter 000033 | lr 0.1800 | loss -398862.5625 | norm 16630.9157
275
+ 2025-06-25 08:28:53,992 | INFO | iter 000034 | lr 0.1800 | loss -400159.2812 | norm 17296.2316
276
+ 2025-06-25 08:28:59,229 | INFO | iter 000035 | lr 0.1800 | loss -500405.0000 | norm 17967.7855
277
+ 2025-06-25 08:29:04,924 | INFO | iter 000036 | lr 0.1800 | loss -542243.6875 | norm 18618.2132
278
+ 2025-06-25 08:29:10,021 | INFO | iter 000037 | lr 0.1800 | loss -565329.5000 | norm 19245.1076
279
+ 2025-06-25 08:29:24,306 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
280
+ 2025-06-25 08:29:26,082 | INFO | Loaded checkpoint with optimizer: adam
281
+ 2025-06-25 08:29:26,082 | INFO | Current learning rate: 0.0018
282
+ 2025-06-25 08:29:26,684 | INFO | Weight decay: 0.1
283
+ 2025-06-25 08:29:26,684 | INFO | Epsilon: 1e-08
284
+ 2025-06-25 08:29:26,684 | INFO | Loaded 147 first moment (m) buffers
285
+ 2025-06-25 08:29:26,684 | INFO | Loaded 147 second moment (v) buffers
286
+ 2025-06-25 08:29:26,684 | INFO | Optimizer state loading completed!
287
+ 2025-06-25 08:29:28,983 | INFO | Initialized xs with norm: 1.273466
288
+ 2025-06-25 08:29:28,995 | INFO | -------------------------------- EoS --------------------------------
289
+ 2025-06-25 08:29:28,995 | INFO | Starting LR test 1/10: lr=0.1800
290
+ 2025-06-25 08:29:28,995 | INFO | Starting EoS for LR factor 100.0000
291
+ 2025-06-25 08:29:28,995 | INFO | Starting EoS for checkpoint 010000
292
+ 2025-06-25 08:29:28,996 | INFO | Starting EoS for model gpt2_small
293
+ 2025-06-25 08:29:28,996 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
294
+ 2025-06-25 08:29:28,996 | INFO | Starting EoS for num_iterations 50
295
+ 2025-06-25 08:29:28,996 | INFO | Starting EoS for accum_steps 4
296
+ 2025-06-25 08:29:28,996 | INFO | Loading model and checkpoint...
297
+ 2025-06-25 08:29:29,754 | INFO | Wrapping model with DDP...
298
+ 2025-06-25 08:29:30,234 | INFO | Loading state dict...
299
+ 2025-06-25 08:29:30,237 | INFO | Model loaded successfully!
300
+ 2025-06-25 08:29:37,767 | INFO | iter 000000 | lr 0.1800 | loss 0.0000 | norm 471.7844
301
+ 2025-06-25 08:29:42,698 | INFO | iter 000001 | lr 0.1800 | loss 256.3545 | norm 1033.6785
302
+ 2025-06-25 08:29:47,593 | INFO | iter 000002 | lr 0.1800 | loss 6480.4614 | norm 1317.1781
303
+ 2025-06-25 08:29:52,655 | INFO | iter 000003 | lr 0.1800 | loss 707.6561 | norm 1814.0126
304
+ 2025-06-25 08:29:57,762 | INFO | iter 000004 | lr 0.1800 | loss 4542.7993 | norm 2242.5040
305
+ 2025-06-25 08:30:03,258 | INFO | iter 000005 | lr 0.1800 | loss 3116.1213 | norm 2655.8104
306
+ 2025-06-25 08:30:08,366 | INFO | iter 000006 | lr 0.1800 | loss 1191.1149 | norm 3089.5643
307
+ 2025-06-25 08:30:13,829 | INFO | iter 000007 | lr 0.1800 | loss 2736.9172 | norm 3481.7995
308
+ 2025-06-25 08:30:19,004 | INFO | iter 000008 | lr 0.1800 | loss 2748.8972 | norm 3839.4604
309
+ 2025-06-25 08:30:24,084 | INFO | iter 000009 | lr 0.1800 | loss 1633.2867 | norm 4181.4834
310
+ 2025-06-25 08:30:29,379 | INFO | iter 000010 | lr 0.1800 | loss 741.7749 | norm 4519.9052
311
+ 2025-06-25 08:30:34,505 | INFO | iter 000011 | lr 0.1800 | loss 424.5821 | norm 4851.2794
312
+ 2025-06-25 08:30:39,626 | INFO | iter 000012 | lr 0.1800 | loss -395.4122 | norm 5162.5043
313
+ 2025-06-25 08:30:44,815 | INFO | iter 000013 | lr 0.1800 | loss -2702.6868 | norm 5485.5912
314
+ 2025-06-25 08:30:50,029 | INFO | iter 000014 | lr 0.1800 | loss -5335.0098 | norm 5827.3394
315
+ 2025-06-25 08:30:55,330 | INFO | iter 000015 | lr 0.1800 | loss -6652.4419 | norm 6184.9165
316
+ 2025-06-25 08:31:00,446 | INFO | iter 000016 | lr 0.1800 | loss -12321.6143 | norm 6558.2939
317
+ 2025-06-25 08:31:06,042 | INFO | iter 000017 | lr 0.1800 | loss -18648.2637 | norm 6962.6391
318
+ 2025-06-25 08:31:11,354 | INFO | iter 000018 | lr 0.1800 | loss -27954.8457 | norm 7397.7840
319
+ 2025-06-25 08:31:16,490 | INFO | iter 000019 | lr 0.1800 | loss -33195.2930 | norm 7869.4220
320
+ 2025-06-25 08:31:21,610 | INFO | iter 000020 | lr 0.1800 | loss -43725.9531 | norm 8378.2009
321
+ 2025-06-25 08:31:35,131 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
322
+ 2025-06-25 08:31:37,075 | INFO | Loaded checkpoint with optimizer: adam
323
+ 2025-06-25 08:31:37,075 | INFO | Current learning rate: 0.0018
324
+ 2025-06-25 08:31:37,696 | INFO | Weight decay: 0.1
325
+ 2025-06-25 08:31:37,696 | INFO | Epsilon: 1e-08
326
+ 2025-06-25 08:31:37,696 | INFO | Loaded 147 first moment (m) buffers
327
+ 2025-06-25 08:31:37,696 | INFO | Loaded 147 second moment (v) buffers
328
+ 2025-06-25 08:31:37,696 | INFO | Optimizer state loading completed!
329
+ 2025-06-25 08:31:39,726 | INFO | Initialized xs with norm: 1.273655
330
+ 2025-06-25 08:31:39,743 | INFO | -------------------------------- EoS --------------------------------
331
+ 2025-06-25 08:31:39,743 | INFO | Starting LR test 1/10: lr=18.0000
332
+ 2025-06-25 08:31:39,743 | INFO | Starting EoS for LR factor 10000.0000
333
+ 2025-06-25 08:31:39,743 | INFO | Starting EoS for checkpoint 010000
334
+ 2025-06-25 08:31:39,743 | INFO | Starting EoS for model gpt2_small
335
+ 2025-06-25 08:31:39,743 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
336
+ 2025-06-25 08:31:39,743 | INFO | Starting EoS for num_iterations 50
337
+ 2025-06-25 08:31:39,743 | INFO | Starting EoS for accum_steps 4
338
+ 2025-06-25 08:31:39,743 | INFO | Loading model and checkpoint...
339
+ 2025-06-25 08:31:40,508 | INFO | Wrapping model with DDP...
340
+ 2025-06-25 08:31:40,770 | INFO | Loading state dict...
341
+ 2025-06-25 08:31:40,773 | INFO | Model loaded successfully!
342
+ 2025-06-25 08:31:48,938 | INFO | iter 000000 | lr 18.0000 | loss 0.0000 | norm 47178.7107
343
+ 2025-06-25 08:31:54,180 | INFO | iter 000001 | lr 18.0000 | loss 2579423.0000 | norm 105552.8516
344
+ 2025-06-25 08:31:59,433 | INFO | iter 000002 | lr 18.0000 | loss 111763472.0000 | norm 135456.5704
345
+ 2025-06-25 08:32:04,746 | INFO | iter 000003 | lr 18.0000 | loss 276864480.0000 | norm 133713.0656
346
+ 2025-06-25 08:32:09,937 | INFO | iter 000004 | lr 18.0000 | loss 320959104.0000 | norm 132714.9397
347
+ 2025-06-25 08:32:15,243 | INFO | iter 000005 | lr 18.0000 | loss 336375200.0000 | norm 124895.1838
348
+ 2025-06-25 08:32:20,604 | INFO | iter 000006 | lr 18.0000 | loss 306018016.0000 | norm 117796.8739
349
+ 2025-06-25 08:32:25,779 | INFO | iter 000007 | lr 18.0000 | loss 267033840.0000 | norm 108461.6875
350
+ 2025-06-25 08:32:31,373 | INFO | iter 000008 | lr 18.0000 | loss 243656960.0000 | norm 102025.2348
351
+ 2025-06-25 08:32:36,467 | INFO | iter 000009 | lr 18.0000 | loss 221454464.0000 | norm 93885.3865
352
+ 2025-06-25 08:32:41,725 | INFO | iter 000010 | lr 18.0000 | loss 185239232.0000 | norm 87718.0808
353
+ 2025-06-25 08:32:47,036 | INFO | iter 000011 | lr 18.0000 | loss 154752192.0000 | norm 80196.6753
354
+ 2025-06-25 08:32:52,225 | INFO | iter 000012 | lr 18.0000 | loss 132501480.0000 | norm 77225.2747
355
+ 2025-06-25 08:32:57,474 | INFO | iter 000013 | lr 18.0000 | loss 119106048.0000 | norm 68886.8834
356
+ 2025-06-25 08:33:03,295 | INFO | iter 000014 | lr 18.0000 | loss 97062160.0000 | norm 65299.5486
357
+ 2025-06-25 08:33:08,516 | INFO | iter 000015 | lr 18.0000 | loss 79378960.0000 | norm 59360.0908
358
+ 2025-06-25 08:33:13,789 | INFO | iter 000016 | lr 18.0000 | loss 68932920.0000 | norm 56258.6832
359
+ 2025-06-25 08:33:19,001 | INFO | iter 000017 | lr 18.0000 | loss 61816232.0000 | norm 51914.3800
360
+ 2025-06-25 08:33:24,459 | INFO | iter 000018 | lr 18.0000 | loss 52157472.0000 | norm 49234.2090
361
+ 2025-06-25 08:33:30,191 | INFO | iter 000019 | lr 18.0000 | loss 42893376.0000 | norm 45645.4134
362
+ 2025-06-25 08:33:35,359 | INFO | iter 000020 | lr 18.0000 | loss 39586664.0000 | norm 43603.7019
363
+ 2025-06-25 08:33:40,646 | INFO | iter 000021 | lr 18.0000 | loss 35820804.0000 | norm 40888.9978
364
+ 2025-06-25 08:33:45,894 | INFO | iter 000022 | lr 18.0000 | loss 30331428.0000 | norm 39151.8028
365
+ 2025-06-25 08:33:51,024 | INFO | iter 000023 | lr 18.0000 | loss 26849960.0000 | norm 37251.1340
366
+ 2025-06-25 08:33:56,173 | INFO | iter 000024 | lr 18.0000 | loss 25112784.0000 | norm 36166.3473
367
+ 2025-06-25 08:34:01,635 | INFO | iter 000025 | lr 18.0000 | loss 23161016.0000 | norm 34488.3613
368
+ 2025-06-25 08:34:06,878 | INFO | iter 000026 | lr 18.0000 | loss 21752610.0000 | norm 33774.8960
369
+ 2025-06-25 08:34:12,170 | INFO | iter 000027 | lr 18.0000 | loss 20102200.0000 | norm 32634.3167
370
+ 2025-06-25 08:34:17,332 | INFO | iter 000028 | lr 18.0000 | loss 20159082.0000 | norm 32413.1788
371
+ 2025-06-25 08:34:22,621 | INFO | iter 000029 | lr 18.0000 | loss 20947662.0000 | norm 32105.9349
372
+ 2025-06-25 08:34:27,921 | INFO | iter 000030 | lr 18.0000 | loss 19995154.0000 | norm 33069.7203
373
+ 2025-06-25 08:34:33,805 | INFO | iter 000031 | lr 18.0000 | loss 20508292.0000 | norm 32105.0565
374
+ 2025-06-25 08:34:39,056 | INFO | iter 000032 | lr 18.0000 | loss 21091504.0000 | norm 32818.1221
375
+ 2025-06-25 08:34:44,173 | INFO | iter 000033 | lr 18.0000 | loss 22435536.0000 | norm 32885.5579
376
+ 2025-06-25 08:34:49,412 | INFO | iter 000034 | lr 18.0000 | loss 22536620.0000 | norm 33441.7257
377
+ 2025-06-25 08:34:54,548 | INFO | iter 000035 | lr 18.0000 | loss 23596192.0000 | norm 33815.4563
378
+ 2025-06-25 08:34:59,711 | INFO | iter 000036 | lr 18.0000 | loss 25515312.0000 | norm 34967.7110
379
+ 2025-06-25 08:35:04,972 | INFO | iter 000037 | lr 18.0000 | loss 27476122.0000 | norm 35625.2597
380
+ 2025-06-25 08:35:10,098 | INFO | iter 000038 | lr 18.0000 | loss 29265056.0000 | norm 36547.8041
381
+ 2025-06-25 08:35:15,160 | INFO | iter 000039 | lr 18.0000 | loss 31501882.0000 | norm 37360.1106
382
+ 2025-06-25 08:35:20,403 | INFO | iter 000040 | lr 18.0000 | loss 33444498.0000 | norm 38455.2587
383
+ 2025-06-25 08:35:25,428 | INFO | iter 000041 | lr 18.0000 | loss 35255556.0000 | norm 39234.6789
384
+ 2025-06-25 08:35:31,052 | INFO | iter 000042 | lr 18.0000 | loss 36108252.0000 | norm 40022.0439
385
+ 2025-06-25 08:35:36,298 | INFO | iter 000043 | lr 18.0000 | loss 40793144.0000 | norm 41435.8015
386
+ 2025-06-25 08:35:41,428 | INFO | iter 000044 | lr 18.0000 | loss 40548568.0000 | norm 41333.9848
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_000000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12623cbe522c1e7a4e735fc1067e87afff8187ac246afb4dba2caad567fffb0e
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_002000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01c656090c88cf1d156a4a323f0c8f4eb5d418043b6aa18ff6745a0a58a6d000
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_003000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63646361ad0ece816c247161cf7978270f56d7739e29ab5af1169d0bef798d3e
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_004000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a51b112a9e09695144c5d1072b1e368431f3d522e7db2c8e95d4e05d5a3fee7f
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_005000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fefc324e20a91a5de78a3313f19a003cf2f274abe56f6d08944c319fc715520
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_007000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d180f12608316931efe85f49ff95f7f6fa91364bc6672244bd094a069646f7ef
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_008000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25017e80be486ae3b5dbbb781de85c121e8d2d88f908581c97dfe988b3730084
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_009000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2543f607872dbdd3c04424f6d6d02b415b80cfd19f43b8f134b1cc8496c70cd3
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_010000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90ddbe1ada352f48db260bc8c6d74f792303e93eb1160184ae995f2335b7c88e
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_011000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d4eaf93dc3d53ab7442bafd01c362dc03e2eee7fe304baa0918f3543a2415d9
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_012000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca6cfdb6bde4442e646552ed717969483e3e24394674bd3d6cc8b1b67529c1aa
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_013000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:981221565146678811b2a7f34a61f973fb9c0f92583119d993bfdfea2d05c0dd
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_014000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f4ef37fffcb4fe27f7bed035fadcfdb90af73c77676d69e20d0a1fa1e1efe85
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_015000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13445b8a5bc5476cc9805b3ef65eb2b4237fcbe39361704c64a775661f726252
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_016000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dbbcc9daba97cbcdabde080086f9b41f275b4342e0b4892bca7adfcd4f573ae
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_017000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bee60122d717616ce4b40fd27948e3db7503784caa6b6f579ded5a34fce8cb7
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_019000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2efad723a74e72ca5b5e4d8792afd4c8b3ef2567e075350dd3cd5f44f2b66121
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_020000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afb0c7371eb7b09cc13faa342fee7fd484dde2516baa90f9539695420d16738b
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_25001.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ede13b57abfda1ebab3d0bb1091e5df05106498fe4a993c0b607bddd954657
3
+ size 988816320
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_000000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59d09284853ab69674c8e10e863ce85717ddc0ea15f1a80fa0122ec48913521e
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_002000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:985bf2b31f814aa1642190a5258c516511c660d45371fc995ac4944a453fc4ec
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_003000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf9c1bf4d9b3a9e1c570be899f9141bb934ecf325e0708496f455a7518c9ff06
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_006000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adde10dc5829f7236b67eb63ab8f0ed6d58ecfb0fb17deee3cdadf01949879d3
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_007000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dffb6249ddca4645632f62a4f38ef97fdb00f775f1b41568d0d35a644e8c99a3
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_008000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9b28451924e18d20c8691bdda04c00516426541675534bb3fd1fc036d0d7940
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_009000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b90a3953ceb60cd88cd4519a01e4d76c4dbab205ecb7c54dd084ed0f3ef72c79
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_010000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd19034279aa37701ee63a9ef80d1c6bcd4e040dfc2b93d9f02050cc704cd96f
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_011000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b78860a70fe8718833963ca3879abc0d0b0645edabc7832de26df64a922f6b1d
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_012000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e534e382e6d665965f8fbb8b7435bb52eaaba18e95a6df95cae4a5aa4bf56a3
3
+ size 1297616507
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_013000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1d6da5d86436c34b53dba6c5726c8a79af0780ded43fba94632b58c757f8c8a
3
+ size 1297616507