Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +5 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/config.json +14 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/losses_lr.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr_iter.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/training.log +253 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/config.json +14 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/losses_lr.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr_iter.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/training.log +232 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/config.json +14 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr_iter.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/training.log +640 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/config.json +14 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/losses_lr.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr_iter.png +0 -0
- fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/training.log +386 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_000000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_002000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_003000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_004000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_005000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_007000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_008000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_009000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_010000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_011000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_012000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_013000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_014000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_015000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_016000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_017000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_019000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_020000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_25001.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_000000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_002000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_003000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_006000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_007000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_008000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_009000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_010000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_011000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_012000.pt +3 -0
- fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_013000.pt +3 -0
.gitattributes
CHANGED
|
@@ -172,3 +172,8 @@ fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_
|
|
| 172 |
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
|
| 173 |
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
|
| 174 |
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
|
| 173 |
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
|
| 174 |
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
|
| 175 |
+
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
|
| 176 |
+
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text
|
| 177 |
+
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text
|
| 178 |
+
fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.png filter=lfs diff=lfs merge=lfs -text
|
| 179 |
+
fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.png filter=lfs diff=lfs merge=lfs -text
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "gpt2_small",
|
| 3 |
+
"factor_min": 0.6,
|
| 4 |
+
"factor_max": 1.5,
|
| 5 |
+
"factor_num": 10,
|
| 6 |
+
"error": 0.0001,
|
| 7 |
+
"accum_steps": 4,
|
| 8 |
+
"num_iterations": 50,
|
| 9 |
+
"num_checkpoint": 1000,
|
| 10 |
+
"input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
|
| 11 |
+
"run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
|
| 12 |
+
"timestamp": "250622_035242",
|
| 13 |
+
"raw": false
|
| 14 |
+
}
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/losses_lr.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr_iter.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/training.log
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-25 06:40:04,162 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt
|
| 2 |
+
2025-06-25 06:42:00,983 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt
|
| 3 |
+
2025-06-25 06:42:02,949 | INFO | Loaded checkpoint with optimizer: adam
|
| 4 |
+
2025-06-25 06:42:02,949 | INFO | Current learning rate: 0.0018
|
| 5 |
+
2025-06-25 06:42:03,551 | INFO | Weight decay: 0.1
|
| 6 |
+
2025-06-25 06:42:03,551 | INFO | Epsilon: 1e-08
|
| 7 |
+
2025-06-25 06:42:03,551 | INFO | Loaded 147 first moment (m) buffers
|
| 8 |
+
2025-06-25 06:42:03,551 | INFO | Loaded 147 second moment (v) buffers
|
| 9 |
+
2025-06-25 06:42:03,551 | INFO | Optimizer state loading completed!
|
| 10 |
+
2025-06-25 06:42:05,486 | INFO | Initialized xs with norm: 1.273417
|
| 11 |
+
2025-06-25 06:42:05,497 | INFO | -------------------------------- EoS --------------------------------
|
| 12 |
+
2025-06-25 06:42:05,497 | INFO | Starting LR test 1/10: lr=0.0011
|
| 13 |
+
2025-06-25 06:42:05,497 | INFO | Starting EoS for LR factor 0.6000
|
| 14 |
+
2025-06-25 06:42:05,497 | INFO | Starting EoS for checkpoint 001000
|
| 15 |
+
2025-06-25 06:42:05,497 | INFO | Starting EoS for model gpt2_small
|
| 16 |
+
2025-06-25 06:42:05,497 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 17 |
+
2025-06-25 06:42:05,497 | INFO | Starting EoS for num_iterations 50
|
| 18 |
+
2025-06-25 06:42:05,497 | INFO | Starting EoS for accum_steps 4
|
| 19 |
+
2025-06-25 06:42:05,497 | INFO | Loading model and checkpoint...
|
| 20 |
+
2025-06-25 06:42:06,378 | INFO | Wrapping model with DDP...
|
| 21 |
+
2025-06-25 06:42:06,406 | INFO | Loading state dict...
|
| 22 |
+
2025-06-25 06:42:06,410 | INFO | Model loaded successfully!
|
| 23 |
+
2025-06-25 06:42:13,594 | INFO | iter 000000 | lr 0.0011 | loss 4.3392 | norm 13.7445
|
| 24 |
+
2025-06-25 06:42:18,961 | INFO | iter 000001 | lr 0.0011 | loss 4.4330 | norm 27.2540
|
| 25 |
+
2025-06-25 06:42:24,075 | INFO | iter 000002 | lr 0.0011 | loss 4.3832 | norm 40.6489
|
| 26 |
+
2025-06-25 06:42:29,377 | INFO | iter 000003 | lr 0.0011 | loss 4.3913 | norm 53.8958
|
| 27 |
+
2025-06-25 06:42:34,634 | INFO | iter 000004 | lr 0.0011 | loss 4.3451 | norm 66.9831
|
| 28 |
+
2025-06-25 06:42:39,893 | INFO | iter 000005 | lr 0.0011 | loss 4.4464 | norm 79.9053
|
| 29 |
+
2025-06-25 06:42:45,174 | INFO | iter 000006 | lr 0.0011 | loss 4.4459 | norm 92.6583
|
| 30 |
+
2025-06-25 06:42:50,203 | INFO | iter 000007 | lr 0.0011 | loss 4.4652 | norm 105.2383
|
| 31 |
+
2025-06-25 06:42:55,371 | INFO | iter 000008 | lr 0.0011 | loss 4.4837 | norm 117.6420
|
| 32 |
+
2025-06-25 06:43:00,884 | INFO | iter 000009 | lr 0.0011 | loss 4.5257 | norm 129.8671
|
| 33 |
+
2025-06-25 06:43:06,419 | INFO | iter 000010 | lr 0.0011 | loss 4.5709 | norm 141.9121
|
| 34 |
+
2025-06-25 06:43:11,725 | INFO | iter 000011 | lr 0.0011 | loss 4.5868 | norm 153.7757
|
| 35 |
+
2025-06-25 06:43:16,715 | INFO | iter 000012 | lr 0.0011 | loss 4.6285 | norm 165.4574
|
| 36 |
+
2025-06-25 06:43:21,934 | INFO | iter 000013 | lr 0.0011 | loss 4.4997 | norm 176.9570
|
| 37 |
+
2025-06-25 06:43:27,054 | INFO | iter 000014 | lr 0.0011 | loss 4.8695 | norm 188.2745
|
| 38 |
+
2025-06-25 06:43:32,694 | INFO | iter 000015 | lr 0.0011 | loss 4.8345 | norm 199.4108
|
| 39 |
+
2025-06-25 06:43:37,757 | INFO | iter 000016 | lr 0.0011 | loss 4.7835 | norm 210.3668
|
| 40 |
+
2025-06-25 06:43:42,881 | INFO | iter 000017 | lr 0.0011 | loss 4.6182 | norm 221.1439
|
| 41 |
+
2025-06-25 06:43:48,183 | INFO | iter 000018 | lr 0.0011 | loss 4.9529 | norm 231.7438
|
| 42 |
+
2025-06-25 06:43:53,391 | INFO | iter 000019 | lr 0.0011 | loss 4.7017 | norm 242.1687
|
| 43 |
+
2025-06-25 06:43:58,710 | INFO | iter 000020 | lr 0.0011 | loss 4.7435 | norm 252.4208
|
| 44 |
+
2025-06-25 06:44:04,463 | INFO | iter 000021 | lr 0.0011 | loss 4.7189 | norm 262.5029
|
| 45 |
+
2025-06-25 06:44:09,748 | INFO | iter 000022 | lr 0.0011 | loss 4.8201 | norm 272.4178
|
| 46 |
+
2025-06-25 06:44:14,874 | INFO | iter 000023 | lr 0.0011 | loss 4.5389 | norm 282.1686
|
| 47 |
+
2025-06-25 06:44:20,048 | INFO | iter 000024 | lr 0.0011 | loss 4.6885 | norm 291.7586
|
| 48 |
+
2025-06-25 06:44:25,212 | INFO | iter 000025 | lr 0.0011 | loss 4.6047 | norm 301.1913
|
| 49 |
+
2025-06-25 06:44:30,907 | INFO | iter 000026 | lr 0.0011 | loss 4.8046 | norm 310.4704
|
| 50 |
+
2025-06-25 06:44:36,109 | INFO | iter 000027 | lr 0.0011 | loss 4.8286 | norm 319.5996
|
| 51 |
+
2025-06-25 06:44:41,223 | INFO | iter 000028 | lr 0.0011 | loss 4.4733 | norm 328.5828
|
| 52 |
+
2025-06-25 06:44:46,307 | INFO | iter 000029 | lr 0.0011 | loss 4.7715 | norm 337.4237
|
| 53 |
+
2025-06-25 06:44:51,454 | INFO | iter 000030 | lr 0.0011 | loss 4.3984 | norm 346.1263
|
| 54 |
+
2025-06-25 06:44:56,725 | INFO | iter 000031 | lr 0.0011 | loss 4.5874 | norm 354.6947
|
| 55 |
+
2025-06-25 06:45:02,103 | INFO | iter 000032 | lr 0.0011 | loss 4.2035 | norm 363.1331
|
| 56 |
+
2025-06-25 06:45:07,293 | INFO | iter 000033 | lr 0.0011 | loss 3.9937 | norm 371.4456
|
| 57 |
+
2025-06-25 06:45:12,413 | INFO | iter 000034 | lr 0.0011 | loss 4.2272 | norm 379.6361
|
| 58 |
+
2025-06-25 06:45:17,562 | INFO | iter 000035 | lr 0.0011 | loss 4.0314 | norm 387.7087
|
| 59 |
+
2025-06-25 06:45:22,574 | INFO | iter 000036 | lr 0.0011 | loss 4.0062 | norm 395.6674
|
| 60 |
+
2025-06-25 06:45:27,804 | INFO | iter 000037 | lr 0.0011 | loss 4.2448 | norm 403.5161
|
| 61 |
+
2025-06-25 06:45:33,292 | INFO | iter 000038 | lr 0.0011 | loss 3.5407 | norm 411.2590
|
| 62 |
+
2025-06-25 06:45:38,469 | INFO | iter 000039 | lr 0.0011 | loss 3.3566 | norm 418.8998
|
| 63 |
+
2025-06-25 06:45:43,663 | INFO | iter 000040 | lr 0.0011 | loss 3.4557 | norm 426.4421
|
| 64 |
+
2025-06-25 06:45:48,847 | INFO | iter 000041 | lr 0.0011 | loss 3.3656 | norm 433.8899
|
| 65 |
+
2025-06-25 06:45:54,209 | INFO | iter 000042 | lr 0.0011 | loss 2.9874 | norm 441.2469
|
| 66 |
+
2025-06-25 06:45:59,878 | INFO | iter 000043 | lr 0.0011 | loss 2.7940 | norm 448.5164
|
| 67 |
+
2025-06-25 06:46:05,083 | INFO | iter 000044 | lr 0.0011 | loss 2.7455 | norm 455.7017
|
| 68 |
+
2025-06-25 06:46:10,201 | INFO | iter 000045 | lr 0.0011 | loss 2.6695 | norm 462.8061
|
| 69 |
+
2025-06-25 06:46:15,556 | INFO | iter 000046 | lr 0.0011 | loss 2.5043 | norm 469.8330
|
| 70 |
+
2025-06-25 06:46:20,540 | INFO | iter 000047 | lr 0.0011 | loss 2.3519 | norm 476.7853
|
| 71 |
+
2025-06-25 06:46:25,531 | INFO | iter 000048 | lr 0.0011 | loss 1.8817 | norm 483.6662
|
| 72 |
+
2025-06-25 06:46:31,412 | INFO | iter 000049 | lr 0.0011 | loss 1.3600 | norm 490.4789
|
| 73 |
+
2025-06-25 06:46:31,413 | INFO | Completed LR test 1/10: lr=0.0011
|
| 74 |
+
2025-06-25 06:46:31,437 | INFO | -------------------------------- EoS --------------------------------
|
| 75 |
+
2025-06-25 06:46:31,437 | INFO | Starting LR test 2/10: lr=0.0018
|
| 76 |
+
2025-06-25 06:46:31,437 | INFO | Starting EoS for LR factor 1.0000
|
| 77 |
+
2025-06-25 06:46:31,437 | INFO | Starting EoS for checkpoint 001000
|
| 78 |
+
2025-06-25 06:46:31,437 | INFO | Starting EoS for model gpt2_small
|
| 79 |
+
2025-06-25 06:46:31,437 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 80 |
+
2025-06-25 06:46:31,437 | INFO | Starting EoS for num_iterations 50
|
| 81 |
+
2025-06-25 06:46:31,437 | INFO | Starting EoS for accum_steps 4
|
| 82 |
+
2025-06-25 06:46:31,437 | INFO | Loading model and checkpoint...
|
| 83 |
+
2025-06-25 06:46:32,149 | INFO | Wrapping model with DDP...
|
| 84 |
+
2025-06-25 06:46:32,538 | INFO | Loading state dict...
|
| 85 |
+
2025-06-25 06:46:32,542 | INFO | Model loaded successfully!
|
| 86 |
+
2025-06-25 06:46:39,146 | INFO | iter 000000 | lr 0.0018 | loss 4.3392 | norm 22.8459
|
| 87 |
+
2025-06-25 06:46:44,139 | INFO | iter 000001 | lr 0.0018 | loss 4.4466 | norm 45.1634
|
| 88 |
+
2025-06-25 06:46:49,296 | INFO | iter 000002 | lr 0.0018 | loss 4.4217 | norm 67.1257
|
| 89 |
+
2025-06-25 06:46:54,367 | INFO | iter 000003 | lr 0.0018 | loss 4.4527 | norm 88.6757
|
| 90 |
+
2025-06-25 06:46:59,621 | INFO | iter 000004 | lr 0.0018 | loss 4.4288 | norm 109.7928
|
| 91 |
+
2025-06-25 06:47:05,068 | INFO | iter 000005 | lr 0.0018 | loss 4.5626 | norm 130.4703
|
| 92 |
+
2025-06-25 06:47:10,277 | INFO | iter 000006 | lr 0.0018 | loss 4.5940 | norm 150.7033
|
| 93 |
+
2025-06-25 06:47:15,415 | INFO | iter 000007 | lr 0.0018 | loss 4.6409 | norm 170.4862
|
| 94 |
+
2025-06-25 06:47:20,565 | INFO | iter 000008 | lr 0.0018 | loss 4.6550 | norm 189.8151
|
| 95 |
+
2025-06-25 06:47:25,896 | INFO | iter 000009 | lr 0.0018 | loss 4.7106 | norm 208.6894
|
| 96 |
+
2025-06-25 06:47:31,787 | INFO | iter 000010 | lr 0.0018 | loss 4.7663 | norm 227.1111
|
| 97 |
+
2025-06-25 06:47:37,142 | INFO | iter 000011 | lr 0.0018 | loss 4.7738 | norm 245.0833
|
| 98 |
+
2025-06-25 06:47:42,360 | INFO | iter 000012 | lr 0.0018 | loss 4.7983 | norm 262.6107
|
| 99 |
+
2025-06-25 06:47:47,575 | INFO | iter 000013 | lr 0.0018 | loss 4.6526 | norm 279.6987
|
| 100 |
+
2025-06-25 06:47:52,603 | INFO | iter 000014 | lr 0.0018 | loss 5.0270 | norm 296.3541
|
| 101 |
+
2025-06-25 06:47:57,840 | INFO | iter 000015 | lr 0.0018 | loss 4.9013 | norm 312.5854
|
| 102 |
+
2025-06-25 06:48:03,561 | INFO | iter 000016 | lr 0.0018 | loss 4.8465 | norm 328.4014
|
| 103 |
+
2025-06-25 06:48:08,832 | INFO | iter 000017 | lr 0.0018 | loss 4.5672 | norm 343.8123
|
| 104 |
+
2025-06-25 06:48:13,857 | INFO | iter 000018 | lr 0.0018 | loss 4.7794 | norm 358.8289
|
| 105 |
+
2025-06-25 06:48:19,012 | INFO | iter 000019 | lr 0.0018 | loss 4.4661 | norm 373.4629
|
| 106 |
+
2025-06-25 06:48:24,132 | INFO | iter 000020 | lr 0.0018 | loss 4.5023 | norm 387.7265
|
| 107 |
+
2025-06-25 06:48:29,868 | INFO | iter 000021 | lr 0.0018 | loss 4.2938 | norm 401.6327
|
| 108 |
+
2025-06-25 06:48:35,428 | INFO | iter 000022 | lr 0.0018 | loss 4.3684 | norm 415.1946
|
| 109 |
+
2025-06-25 06:48:40,742 | INFO | iter 000023 | lr 0.0018 | loss 3.8490 | norm 428.4261
|
| 110 |
+
2025-06-25 06:48:45,938 | INFO | iter 000024 | lr 0.0018 | loss 4.0617 | norm 441.3406
|
| 111 |
+
2025-06-25 06:48:51,129 | INFO | iter 000025 | lr 0.0018 | loss 3.6068 | norm 453.9526
|
| 112 |
+
2025-06-25 06:48:56,489 | INFO | iter 000026 | lr 0.0018 | loss 4.0782 | norm 466.2762
|
| 113 |
+
2025-06-25 06:49:02,061 | INFO | iter 000027 | lr 0.0018 | loss 3.8365 | norm 478.3254
|
| 114 |
+
2025-06-25 06:49:07,402 | INFO | iter 000028 | lr 0.0018 | loss 2.9535 | norm 490.1142
|
| 115 |
+
2025-06-25 06:49:12,466 | INFO | iter 000029 | lr 0.0018 | loss 3.1557 | norm 501.6562
|
| 116 |
+
2025-06-25 06:49:17,583 | INFO | iter 000030 | lr 0.0018 | loss 2.7154 | norm 512.9648
|
| 117 |
+
2025-06-25 06:49:22,770 | INFO | iter 000031 | lr 0.0018 | loss 2.8398 | norm 524.0536
|
| 118 |
+
2025-06-25 06:49:27,964 | INFO | iter 000032 | lr 0.0018 | loss 1.6209 | norm 534.9364
|
| 119 |
+
2025-06-25 06:49:33,586 | INFO | iter 000033 | lr 0.0018 | loss 0.8802 | norm 545.6261
|
| 120 |
+
2025-06-25 06:49:38,805 | INFO | iter 000034 | lr 0.0018 | loss 1.5411 | norm 556.1344
|
| 121 |
+
2025-06-25 06:49:44,031 | INFO | iter 000035 | lr 0.0018 | loss 0.7415 | norm 566.4735
|
| 122 |
+
2025-06-25 06:49:49,298 | INFO | iter 000036 | lr 0.0018 | loss 0.3749 | norm 576.6551
|
| 123 |
+
2025-06-25 06:49:54,505 | INFO | iter 000037 | lr 0.0018 | loss 0.9944 | norm 586.6894
|
| 124 |
+
2025-06-25 06:50:00,198 | INFO | iter 000038 | lr 0.0018 | loss -1.2345 | norm 596.5885
|
| 125 |
+
2025-06-25 06:50:05,762 | INFO | iter 000039 | lr 0.0018 | loss -1.4369 | norm 606.3619
|
| 126 |
+
2025-06-25 06:50:10,887 | INFO | iter 000040 | lr 0.0018 | loss -1.5657 | norm 616.0191
|
| 127 |
+
2025-06-25 06:50:16,128 | INFO | iter 000041 | lr 0.0018 | loss -2.0936 | norm 625.5695
|
| 128 |
+
2025-06-25 06:50:21,496 | INFO | iter 000042 | lr 0.0018 | loss -3.7056 | norm 635.0224
|
| 129 |
+
2025-06-25 06:50:26,613 | INFO | iter 000043 | lr 0.0018 | loss -3.8761 | norm 644.3855
|
| 130 |
+
2025-06-25 06:50:32,217 | INFO | iter 000044 | lr 0.0018 | loss -4.3276 | norm 653.6660
|
| 131 |
+
2025-06-25 06:50:37,522 | INFO | iter 000045 | lr 0.0018 | loss -4.6411 | norm 662.8710
|
| 132 |
+
2025-06-25 06:50:42,783 | INFO | iter 000046 | lr 0.0018 | loss -5.7818 | norm 672.0079
|
| 133 |
+
2025-06-25 06:50:47,790 | INFO | iter 000047 | lr 0.0018 | loss -5.7992 | norm 681.0823
|
| 134 |
+
2025-06-25 06:50:52,994 | INFO | iter 000048 | lr 0.0018 | loss -7.2470 | norm 690.1007
|
| 135 |
+
2025-06-25 06:50:58,040 | INFO | iter 000049 | lr 0.0018 | loss -9.4092 | norm 699.0692
|
| 136 |
+
2025-06-25 06:50:58,041 | INFO | Completed LR test 2/10: lr=0.0018
|
| 137 |
+
2025-06-25 06:50:58,068 | INFO | -------------------------------- EoS --------------------------------
|
| 138 |
+
2025-06-25 06:50:58,068 | INFO | Starting LR test 3/10: lr=0.0025
|
| 139 |
+
2025-06-25 06:50:58,068 | INFO | Starting EoS for LR factor 1.4000
|
| 140 |
+
2025-06-25 06:50:58,068 | INFO | Starting EoS for checkpoint 001000
|
| 141 |
+
2025-06-25 06:50:58,068 | INFO | Starting EoS for model gpt2_small
|
| 142 |
+
2025-06-25 06:50:58,068 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 143 |
+
2025-06-25 06:50:58,068 | INFO | Starting EoS for num_iterations 50
|
| 144 |
+
2025-06-25 06:50:58,068 | INFO | Starting EoS for accum_steps 4
|
| 145 |
+
2025-06-25 06:50:58,068 | INFO | Loading model and checkpoint...
|
| 146 |
+
2025-06-25 06:50:58,772 | INFO | Wrapping model with DDP...
|
| 147 |
+
2025-06-25 06:50:59,088 | INFO | Loading state dict...
|
| 148 |
+
2025-06-25 06:50:59,091 | INFO | Model loaded successfully!
|
| 149 |
+
2025-06-25 06:51:05,754 | INFO | iter 000000 | lr 0.0025 | loss 4.3392 | norm 31.9607
|
| 150 |
+
2025-06-25 06:51:10,755 | INFO | iter 000001 | lr 0.0025 | loss 4.4680 | norm 62.8866
|
| 151 |
+
2025-06-25 06:51:15,818 | INFO | iter 000002 | lr 0.0025 | loss 4.4710 | norm 93.1217
|
| 152 |
+
2025-06-25 06:51:20,907 | INFO | iter 000003 | lr 0.0025 | loss 4.5262 | norm 122.5640
|
| 153 |
+
2025-06-25 06:51:26,040 | INFO | iter 000004 | lr 0.0025 | loss 4.5248 | norm 151.1809
|
| 154 |
+
2025-06-25 06:51:31,812 | INFO | iter 000005 | lr 0.0025 | loss 4.6891 | norm 178.9703
|
| 155 |
+
2025-06-25 06:51:37,067 | INFO | iter 000006 | lr 0.0025 | loss 4.7442 | norm 205.9325
|
| 156 |
+
2025-06-25 06:51:42,230 | INFO | iter 000007 | lr 0.0025 | loss 4.8070 | norm 232.0657
|
| 157 |
+
2025-06-25 06:51:47,272 | INFO | iter 000008 | lr 0.0025 | loss 4.7906 | norm 257.3720
|
| 158 |
+
2025-06-25 06:51:52,535 | INFO | iter 000009 | lr 0.0025 | loss 4.8344 | norm 281.8610
|
| 159 |
+
2025-06-25 06:51:57,924 | INFO | iter 000010 | lr 0.0025 | loss 4.8678 | norm 305.5477
|
| 160 |
+
2025-06-25 06:52:03,450 | INFO | iter 000011 | lr 0.0025 | loss 4.8304 | norm 328.4503
|
| 161 |
+
2025-06-25 06:52:08,629 | INFO | iter 000012 | lr 0.0025 | loss 4.7894 | norm 350.5884
|
| 162 |
+
2025-06-25 06:52:13,898 | INFO | iter 000013 | lr 0.0025 | loss 4.5864 | norm 371.9838
|
| 163 |
+
2025-06-25 06:52:19,143 | INFO | iter 000014 | lr 0.0025 | loss 4.9290 | norm 392.6606
|
| 164 |
+
2025-06-25 06:52:24,307 | INFO | iter 000015 | lr 0.0025 | loss 4.6104 | norm 412.6456
|
| 165 |
+
2025-06-25 06:52:29,763 | INFO | iter 000016 | lr 0.0025 | loss 4.4951 | norm 431.9664
|
| 166 |
+
2025-06-25 06:52:34,877 | INFO | iter 000017 | lr 0.0025 | loss 4.0138 | norm 450.6518
|
| 167 |
+
2025-06-25 06:52:40,147 | INFO | iter 000018 | lr 0.0025 | loss 3.9656 | norm 468.7319
|
| 168 |
+
2025-06-25 06:52:45,265 | INFO | iter 000019 | lr 0.0025 | loss 3.5134 | norm 486.2377
|
| 169 |
+
2025-06-25 06:52:50,435 | INFO | iter 000020 | lr 0.0025 | loss 3.4981 | norm 503.2006
|
| 170 |
+
2025-06-25 06:52:55,571 | INFO | iter 000021 | lr 0.0025 | loss 3.0034 | norm 519.6526
|
| 171 |
+
2025-06-25 06:53:01,174 | INFO | iter 000022 | lr 0.0025 | loss 3.0086 | norm 535.6248
|
| 172 |
+
2025-06-25 06:53:06,599 | INFO | iter 000023 | lr 0.0025 | loss 2.0425 | norm 551.1493
|
| 173 |
+
2025-06-25 06:53:11,956 | INFO | iter 000024 | lr 0.0025 | loss 2.3420 | norm 566.2564
|
| 174 |
+
2025-06-25 06:53:17,132 | INFO | iter 000025 | lr 0.0025 | loss 1.2484 | norm 580.9775
|
| 175 |
+
2025-06-25 06:53:22,381 | INFO | iter 000026 | lr 0.0025 | loss 1.9663 | norm 595.3424
|
| 176 |
+
2025-06-25 06:53:27,583 | INFO | iter 000027 | lr 0.0025 | loss 1.4790 | norm 609.3793
|
| 177 |
+
2025-06-25 06:53:33,275 | INFO | iter 000028 | lr 0.0025 | loss -0.2816 | norm 623.1166
|
| 178 |
+
2025-06-25 06:53:38,353 | INFO | iter 000029 | lr 0.0025 | loss -0.2689 | norm 636.5806
|
| 179 |
+
2025-06-25 06:53:43,494 | INFO | iter 000030 | lr 0.0025 | loss -0.8742 | norm 649.7966
|
| 180 |
+
2025-06-25 06:53:48,723 | INFO | iter 000031 | lr 0.0025 | loss -0.8715 | norm 662.7894
|
| 181 |
+
2025-06-25 06:59:15,808 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt
|
| 182 |
+
2025-06-25 06:59:17,627 | INFO | Loaded checkpoint with optimizer: adam
|
| 183 |
+
2025-06-25 06:59:17,627 | INFO | Current learning rate: 0.0018
|
| 184 |
+
2025-06-25 06:59:18,250 | INFO | Weight decay: 0.1
|
| 185 |
+
2025-06-25 06:59:18,250 | INFO | Epsilon: 1e-08
|
| 186 |
+
2025-06-25 06:59:18,251 | INFO | Loaded 147 first moment (m) buffers
|
| 187 |
+
2025-06-25 06:59:18,251 | INFO | Loaded 147 second moment (v) buffers
|
| 188 |
+
2025-06-25 06:59:18,251 | INFO | Optimizer state loading completed!
|
| 189 |
+
2025-06-25 06:59:20,150 | INFO | Initialized xs with norm: 1.273537
|
| 190 |
+
2025-06-25 06:59:20,157 | INFO | -------------------------------- EoS --------------------------------
|
| 191 |
+
2025-06-25 06:59:20,157 | INFO | Starting LR test 1/10: lr=0.0025
|
| 192 |
+
2025-06-25 06:59:20,157 | INFO | Starting EoS for LR factor 1.4000
|
| 193 |
+
2025-06-25 06:59:20,158 | INFO | Starting EoS for checkpoint 001000
|
| 194 |
+
2025-06-25 06:59:20,158 | INFO | Starting EoS for model gpt2_small
|
| 195 |
+
2025-06-25 06:59:20,158 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 196 |
+
2025-06-25 06:59:20,158 | INFO | Starting EoS for num_iterations 50
|
| 197 |
+
2025-06-25 06:59:20,158 | INFO | Starting EoS for accum_steps 4
|
| 198 |
+
2025-06-25 06:59:20,158 | INFO | Loading model and checkpoint...
|
| 199 |
+
2025-06-25 06:59:21,280 | INFO | Wrapping model with DDP...
|
| 200 |
+
2025-06-25 06:59:21,347 | INFO | Loading state dict...
|
| 201 |
+
2025-06-25 06:59:21,350 | INFO | Model loaded successfully!
|
| 202 |
+
2025-06-25 06:59:29,329 | INFO | iter 000000 | lr 0.0025 | loss 4.3393 | norm 15.4976
|
| 203 |
+
2025-06-25 06:59:34,563 | INFO | iter 000001 | lr 0.0025 | loss 5.0174 | norm 26.2033
|
| 204 |
+
2025-06-25 06:59:39,547 | INFO | iter 000002 | lr 0.0025 | loss 7.0303 | norm 34.2732
|
| 205 |
+
2025-06-25 06:59:44,649 | INFO | iter 000003 | lr 0.0025 | loss 4.8583 | norm 42.0572
|
| 206 |
+
2025-06-25 06:59:49,612 | INFO | iter 000004 | lr 0.0025 | loss 5.5988 | norm 48.8932
|
| 207 |
+
2025-06-25 06:59:54,632 | INFO | iter 000005 | lr 0.0025 | loss 4.8926 | norm 55.2742
|
| 208 |
+
2025-06-25 07:00:00,343 | INFO | iter 000006 | lr 0.0025 | loss 4.0337 | norm 61.4189
|
| 209 |
+
2025-06-25 07:00:05,824 | INFO | iter 000007 | lr 0.0025 | loss 3.8261 | norm 67.2519
|
| 210 |
+
2025-06-25 07:00:10,921 | INFO | iter 000008 | lr 0.0025 | loss 1.5497 | norm 73.1772
|
| 211 |
+
2025-06-25 07:00:15,950 | INFO | iter 000009 | lr 0.0025 | loss -0.9868 | norm 79.3588
|
| 212 |
+
2025-06-25 07:00:21,158 | INFO | iter 000010 | lr 0.0025 | loss -4.7045 | norm 85.9307
|
| 213 |
+
2025-06-25 07:00:26,356 | INFO | iter 000011 | lr 0.0025 | loss -9.1555 | norm 92.9646
|
| 214 |
+
2025-06-25 07:00:32,012 | INFO | iter 000012 | lr 0.0025 | loss -18.3042 | norm 100.6413
|
| 215 |
+
2025-06-25 07:00:37,134 | INFO | iter 000013 | lr 0.0025 | loss -24.4566 | norm 108.9797
|
| 216 |
+
2025-06-25 07:00:42,361 | INFO | iter 000014 | lr 0.0025 | loss -34.7879 | norm 117.8907
|
| 217 |
+
2025-06-25 07:00:47,543 | INFO | iter 000015 | lr 0.0025 | loss -56.0771 | norm 127.5276
|
| 218 |
+
2025-06-25 07:00:52,587 | INFO | iter 000016 | lr 0.0025 | loss -74.3452 | norm 137.8423
|
| 219 |
+
2025-06-25 07:00:57,843 | INFO | iter 000017 | lr 0.0025 | loss -97.3003 | norm 148.7922
|
| 220 |
+
2025-06-25 07:01:03,546 | INFO | iter 000018 | lr 0.0025 | loss -128.8613 | norm 160.3333
|
| 221 |
+
2025-06-25 07:01:08,639 | INFO | iter 000019 | lr 0.0025 | loss -153.6679 | norm 172.4106
|
| 222 |
+
2025-06-25 07:01:13,817 | INFO | iter 000020 | lr 0.0025 | loss -177.9772 | norm 184.8852
|
| 223 |
+
2025-06-25 07:01:19,038 | INFO | iter 000021 | lr 0.0025 | loss -212.6209 | norm 197.7662
|
| 224 |
+
2025-06-25 07:01:24,099 | INFO | iter 000022 | lr 0.0025 | loss -228.8878 | norm 210.9760
|
| 225 |
+
2025-06-25 07:01:29,523 | INFO | iter 000023 | lr 0.0025 | loss -302.6816 | norm 224.5441
|
| 226 |
+
2025-06-25 07:01:34,575 | INFO | iter 000024 | lr 0.0025 | loss -313.9990 | norm 238.3132
|
| 227 |
+
2025-06-25 07:01:39,718 | INFO | iter 000025 | lr 0.0025 | loss -412.0068 | norm 252.3185
|
| 228 |
+
2025-06-25 07:01:44,741 | INFO | iter 000026 | lr 0.0025 | loss -436.6941 | norm 266.0395
|
| 229 |
+
2025-06-25 07:01:50,024 | INFO | iter 000027 | lr 0.0025 | loss -440.2226 | norm 279.5380
|
| 230 |
+
2025-06-25 07:01:55,265 | INFO | iter 000028 | lr 0.0025 | loss -601.8568 | norm 293.2904
|
| 231 |
+
2025-06-25 07:02:00,592 | INFO | iter 000029 | lr 0.0025 | loss -676.1694 | norm 307.3040
|
| 232 |
+
2025-06-25 07:02:05,933 | INFO | iter 000030 | lr 0.0025 | loss -704.9308 | norm 321.5338
|
| 233 |
+
2025-06-25 07:02:11,124 | INFO | iter 000031 | lr 0.0025 | loss -774.7177 | norm 335.7946
|
| 234 |
+
2025-06-25 07:02:16,435 | INFO | iter 000032 | lr 0.0025 | loss -920.0737 | norm 350.3229
|
| 235 |
+
2025-06-25 07:02:21,641 | INFO | iter 000033 | lr 0.0025 | loss -1063.0433 | norm 364.4135
|
| 236 |
+
2025-06-25 07:02:26,771 | INFO | iter 000034 | lr 0.0025 | loss -1016.9738 | norm 378.6853
|
| 237 |
+
2025-06-25 07:02:32,448 | INFO | iter 000035 | lr 0.0025 | loss -1197.0923 | norm 393.2000
|
| 238 |
+
2025-06-25 07:02:37,618 | INFO | iter 000036 | lr 0.0025 | loss -1259.9513 | norm 407.9392
|
| 239 |
+
2025-06-25 07:02:42,716 | INFO | iter 000037 | lr 0.0025 | loss -1382.7266 | norm 422.7625
|
| 240 |
+
2025-06-25 07:02:47,839 | INFO | iter 000038 | lr 0.0025 | loss -1556.2228 | norm 437.7693
|
| 241 |
+
2025-06-25 07:02:53,065 | INFO | iter 000039 | lr 0.0025 | loss -1659.4865 | norm 452.8923
|
| 242 |
+
2025-06-25 07:02:58,112 | INFO | iter 000040 | lr 0.0025 | loss -1659.5458 | norm 468.0710
|
| 243 |
+
2025-06-25 07:03:03,864 | INFO | iter 000041 | lr 0.0025 | loss -1773.1254 | norm 483.1063
|
| 244 |
+
2025-06-25 07:03:09,029 | INFO | iter 000042 | lr 0.0025 | loss -2041.8066 | norm 498.2888
|
| 245 |
+
2025-06-25 07:03:14,123 | INFO | iter 000043 | lr 0.0025 | loss -2230.4387 | norm 513.6289
|
| 246 |
+
2025-06-25 07:03:19,337 | INFO | iter 000044 | lr 0.0025 | loss -2268.1362 | norm 529.1135
|
| 247 |
+
2025-06-25 07:03:24,439 | INFO | iter 000045 | lr 0.0025 | loss -2196.4004 | norm 544.5320
|
| 248 |
+
2025-06-25 07:03:29,662 | INFO | iter 000046 | lr 0.0025 | loss -2567.3474 | norm 560.0720
|
| 249 |
+
2025-06-25 07:03:35,121 | INFO | iter 000047 | lr 0.0025 | loss -2551.3745 | norm 575.5949
|
| 250 |
+
2025-06-25 07:03:40,366 | INFO | iter 000048 | lr 0.0025 | loss -2840.6702 | norm 591.2187
|
| 251 |
+
2025-06-25 07:03:45,515 | INFO | iter 000049 | lr 0.0025 | loss -3193.7876 | norm 606.9574
|
| 252 |
+
2025-06-25 07:03:45,516 | INFO | Completed LR test 1/10: lr=0.0025
|
| 253 |
+
2025-06-25 07:03:45,809 | INFO | Cleanup complete
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "gpt2_small",
|
| 3 |
+
"factor_min": 0.6,
|
| 4 |
+
"factor_max": 1.5,
|
| 5 |
+
"factor_num": 10,
|
| 6 |
+
"error": 0.0001,
|
| 7 |
+
"accum_steps": 4,
|
| 8 |
+
"num_iterations": 50,
|
| 9 |
+
"num_checkpoint": 2000,
|
| 10 |
+
"input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
|
| 11 |
+
"run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
|
| 12 |
+
"timestamp": "250622_035242",
|
| 13 |
+
"raw": false
|
| 14 |
+
}
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/losses_lr.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr_iter.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/training.log
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-25 07:04:49,319 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
|
| 2 |
+
2025-06-25 07:04:50,921 | INFO | Loaded checkpoint with optimizer: adam
|
| 3 |
+
2025-06-25 07:04:50,922 | INFO | Current learning rate: 0.0018
|
| 4 |
+
2025-06-25 07:04:51,534 | INFO | Weight decay: 0.1
|
| 5 |
+
2025-06-25 07:04:51,534 | INFO | Epsilon: 1e-08
|
| 6 |
+
2025-06-25 07:04:51,534 | INFO | Loaded 147 first moment (m) buffers
|
| 7 |
+
2025-06-25 07:04:51,534 | INFO | Loaded 147 second moment (v) buffers
|
| 8 |
+
2025-06-25 07:04:51,534 | INFO | Optimizer state loading completed!
|
| 9 |
+
2025-06-25 07:04:53,371 | INFO | Initialized xs with norm: 1.273644
|
| 10 |
+
2025-06-25 07:04:53,383 | INFO | -------------------------------- EoS --------------------------------
|
| 11 |
+
2025-06-25 07:04:53,383 | INFO | Starting LR test 1/10: lr=0.0025
|
| 12 |
+
2025-06-25 07:04:53,383 | INFO | Starting EoS for LR factor 1.4000
|
| 13 |
+
2025-06-25 07:04:53,383 | INFO | Starting EoS for checkpoint 002000
|
| 14 |
+
2025-06-25 07:04:53,383 | INFO | Starting EoS for model gpt2_small
|
| 15 |
+
2025-06-25 07:04:53,383 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 16 |
+
2025-06-25 07:04:53,383 | INFO | Starting EoS for num_iterations 50
|
| 17 |
+
2025-06-25 07:04:53,383 | INFO | Starting EoS for accum_steps 4
|
| 18 |
+
2025-06-25 07:04:53,383 | INFO | Loading model and checkpoint...
|
| 19 |
+
2025-06-25 07:04:54,128 | INFO | Wrapping model with DDP...
|
| 20 |
+
2025-06-25 07:04:54,613 | INFO | Loading state dict...
|
| 21 |
+
2025-06-25 07:04:54,617 | INFO | Model loaded successfully!
|
| 22 |
+
2025-06-25 07:05:02,467 | INFO | iter 000000 | lr 0.0025 | loss 3.9757 | norm 15.2051
|
| 23 |
+
2025-06-25 07:05:07,565 | INFO | iter 000001 | lr 0.0025 | loss 4.3233 | norm 25.7014
|
| 24 |
+
2025-06-25 07:05:12,643 | INFO | iter 000002 | lr 0.0025 | loss 5.8232 | norm 33.3926
|
| 25 |
+
2025-06-25 07:05:17,743 | INFO | iter 000003 | lr 0.0025 | loss 4.4479 | norm 40.7412
|
| 26 |
+
2025-06-25 07:05:22,835 | INFO | iter 000004 | lr 0.0025 | loss 5.0561 | norm 47.0897
|
| 27 |
+
2025-06-25 07:05:27,984 | INFO | iter 000005 | lr 0.0025 | loss 4.2371 | norm 53.4390
|
| 28 |
+
2025-06-25 07:05:33,593 | INFO | iter 000006 | lr 0.0025 | loss 4.2628 | norm 59.3858
|
| 29 |
+
2025-06-25 07:05:38,622 | INFO | iter 000007 | lr 0.0025 | loss 4.4517 | norm 64.9470
|
| 30 |
+
2025-06-25 07:05:43,671 | INFO | iter 000008 | lr 0.0025 | loss 3.7213 | norm 70.3563
|
| 31 |
+
2025-06-25 07:05:48,942 | INFO | iter 000009 | lr 0.0025 | loss 3.4583 | norm 75.6566
|
| 32 |
+
2025-06-25 07:05:54,202 | INFO | iter 000010 | lr 0.0025 | loss 3.2253 | norm 80.8225
|
| 33 |
+
2025-06-25 07:05:59,788 | INFO | iter 000011 | lr 0.0025 | loss 2.1490 | norm 86.0380
|
| 34 |
+
2025-06-25 07:06:05,316 | INFO | iter 000012 | lr 0.0025 | loss 0.5857 | norm 91.4942
|
| 35 |
+
2025-06-25 07:06:10,581 | INFO | iter 000013 | lr 0.0025 | loss -0.7333 | norm 97.2915
|
| 36 |
+
2025-06-25 07:06:15,719 | INFO | iter 000014 | lr 0.0025 | loss -2.5905 | norm 103.4982
|
| 37 |
+
2025-06-25 07:06:20,943 | INFO | iter 000015 | lr 0.0025 | loss -6.6798 | norm 110.1739
|
| 38 |
+
2025-06-25 07:08:00,350 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
|
| 39 |
+
2025-06-25 07:08:02,106 | INFO | Loaded checkpoint with optimizer: adam
|
| 40 |
+
2025-06-25 07:08:02,107 | INFO | Current learning rate: 0.0018
|
| 41 |
+
2025-06-25 07:08:02,720 | INFO | Weight decay: 0.1
|
| 42 |
+
2025-06-25 07:08:02,720 | INFO | Epsilon: 1e-08
|
| 43 |
+
2025-06-25 07:08:02,721 | INFO | Loaded 147 first moment (m) buffers
|
| 44 |
+
2025-06-25 07:08:02,721 | INFO | Loaded 147 second moment (v) buffers
|
| 45 |
+
2025-06-25 07:08:02,721 | INFO | Optimizer state loading completed!
|
| 46 |
+
2025-06-25 07:08:04,562 | INFO | Initialized xs with norm: 1.273412
|
| 47 |
+
2025-06-25 07:08:04,573 | INFO | -------------------------------- EoS --------------------------------
|
| 48 |
+
2025-06-25 07:08:04,574 | INFO | Starting LR test 1/10: lr=0.0090
|
| 49 |
+
2025-06-25 07:08:04,574 | INFO | Starting EoS for LR factor 5.0000
|
| 50 |
+
2025-06-25 07:08:04,574 | INFO | Starting EoS for checkpoint 002000
|
| 51 |
+
2025-06-25 07:08:04,574 | INFO | Starting EoS for model gpt2_small
|
| 52 |
+
2025-06-25 07:08:04,574 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 53 |
+
2025-06-25 07:08:04,574 | INFO | Starting EoS for num_iterations 50
|
| 54 |
+
2025-06-25 07:08:04,574 | INFO | Starting EoS for accum_steps 4
|
| 55 |
+
2025-06-25 07:08:04,574 | INFO | Loading model and checkpoint...
|
| 56 |
+
2025-06-25 07:08:05,317 | INFO | Wrapping model with DDP...
|
| 57 |
+
2025-06-25 07:08:05,851 | INFO | Loading state dict...
|
| 58 |
+
2025-06-25 07:08:05,855 | INFO | Model loaded successfully!
|
| 59 |
+
2025-06-25 07:08:12,930 | INFO | iter 000000 | lr 0.0090 | loss 3.9757 | norm 54.1260
|
| 60 |
+
2025-06-25 07:08:17,867 | INFO | iter 000001 | lr 0.0090 | loss 7.9464 | norm 93.7556
|
| 61 |
+
2025-06-25 07:08:22,976 | INFO | iter 000002 | lr 0.0090 | loss 65.9051 | norm 114.5414
|
| 62 |
+
2025-06-25 07:08:27,990 | INFO | iter 000003 | lr 0.0090 | loss 9.6467 | norm 142.1960
|
| 63 |
+
2025-06-25 07:08:33,721 | INFO | iter 000004 | lr 0.0090 | loss 37.9767 | norm 165.1957
|
| 64 |
+
2025-06-25 07:08:38,762 | INFO | iter 000005 | lr 0.0090 | loss 34.2442 | norm 186.2563
|
| 65 |
+
2025-06-25 07:08:44,002 | INFO | iter 000006 | lr 0.0090 | loss 12.5688 | norm 207.7581
|
| 66 |
+
2025-06-25 07:08:49,138 | INFO | iter 000007 | lr 0.0090 | loss 13.4724 | norm 228.4881
|
| 67 |
+
2025-06-25 07:08:54,269 | INFO | iter 000008 | lr 0.0090 | loss 18.9169 | norm 247.7513
|
| 68 |
+
2025-06-25 07:08:59,417 | INFO | iter 000009 | lr 0.0090 | loss 14.7739 | norm 265.9009
|
| 69 |
+
2025-06-25 07:09:04,859 | INFO | iter 000010 | lr 0.0090 | loss 4.6113 | norm 283.6754
|
| 70 |
+
2025-06-25 07:09:10,221 | INFO | iter 000011 | lr 0.0090 | loss -2.8853 | norm 301.4828
|
| 71 |
+
2025-06-25 07:09:15,414 | INFO | iter 000012 | lr 0.0090 | loss -8.5041 | norm 319.3103
|
| 72 |
+
2025-06-25 07:09:20,524 | INFO | iter 000013 | lr 0.0090 | loss -16.0165 | norm 337.3585
|
| 73 |
+
2025-06-25 07:09:25,783 | INFO | iter 000014 | lr 0.0090 | loss -30.7357 | norm 356.0018
|
| 74 |
+
2025-06-25 07:09:31,353 | INFO | iter 000015 | lr 0.0090 | loss -59.7186 | norm 375.0746
|
| 75 |
+
2025-06-25 07:09:36,559 | INFO | iter 000016 | lr 0.0090 | loss -85.9098 | norm 395.1222
|
| 76 |
+
2025-06-25 07:09:41,686 | INFO | iter 000017 | lr 0.0090 | loss -113.7542 | norm 416.7910
|
| 77 |
+
2025-06-25 07:09:47,076 | INFO | iter 000018 | lr 0.0090 | loss -182.1024 | norm 439.8696
|
| 78 |
+
2025-06-25 07:27:24,437 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
|
| 79 |
+
2025-06-25 07:27:26,239 | INFO | Loaded checkpoint with optimizer: adam
|
| 80 |
+
2025-06-25 07:27:26,240 | INFO | Current learning rate: 0.0018
|
| 81 |
+
2025-06-25 07:27:26,858 | INFO | Weight decay: 0.1
|
| 82 |
+
2025-06-25 07:27:26,858 | INFO | Epsilon: 1e-08
|
| 83 |
+
2025-06-25 07:27:26,858 | INFO | Loaded 147 first moment (m) buffers
|
| 84 |
+
2025-06-25 07:27:26,858 | INFO | Loaded 147 second moment (v) buffers
|
| 85 |
+
2025-06-25 07:27:26,858 | INFO | Optimizer state loading completed!
|
| 86 |
+
2025-06-25 07:27:29,212 | INFO | Initialized xs with norm: 1.273458
|
| 87 |
+
2025-06-25 07:27:29,221 | INFO | -------------------------------- EoS --------------------------------
|
| 88 |
+
2025-06-25 07:27:29,221 | INFO | Starting LR test 1/10: lr=0.0180
|
| 89 |
+
2025-06-25 07:27:29,221 | INFO | Starting EoS for LR factor 10.0000
|
| 90 |
+
2025-06-25 07:27:29,221 | INFO | Starting EoS for checkpoint 002000
|
| 91 |
+
2025-06-25 07:27:29,221 | INFO | Starting EoS for model gpt2_small
|
| 92 |
+
2025-06-25 07:27:29,221 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 93 |
+
2025-06-25 07:27:29,221 | INFO | Starting EoS for num_iterations 50
|
| 94 |
+
2025-06-25 07:27:29,221 | INFO | Starting EoS for accum_steps 4
|
| 95 |
+
2025-06-25 07:27:29,221 | INFO | Loading model and checkpoint...
|
| 96 |
+
2025-06-25 07:27:29,979 | INFO | Wrapping model with DDP...
|
| 97 |
+
2025-06-25 07:27:30,468 | INFO | Loading state dict...
|
| 98 |
+
2025-06-25 07:27:30,471 | INFO | Model loaded successfully!
|
| 99 |
+
2025-06-25 07:27:37,640 | INFO | iter 000000 | lr 0.0180 | loss 3.9757 | norm 108.2326
|
| 100 |
+
2025-06-25 07:27:42,495 | INFO | iter 000001 | lr 0.0180 | loss 20.5162 | norm 189.4878
|
| 101 |
+
2025-06-25 07:27:47,595 | INFO | iter 000002 | lr 0.0180 | loss 292.6721 | norm 227.1594
|
| 102 |
+
2025-06-25 07:27:52,527 | INFO | iter 000003 | lr 0.0180 | loss 30.5704 | norm 282.5874
|
| 103 |
+
2025-06-25 07:27:57,708 | INFO | iter 000004 | lr 0.0180 | loss 145.1973 | norm 329.8537
|
| 104 |
+
2025-06-25 07:28:03,188 | INFO | iter 000005 | lr 0.0180 | loss 154.9424 | norm 371.7672
|
| 105 |
+
2025-06-25 07:28:08,432 | INFO | iter 000006 | lr 0.0180 | loss 54.9122 | norm 414.5362
|
| 106 |
+
2025-06-25 07:28:13,591 | INFO | iter 000007 | lr 0.0180 | loss 38.6102 | norm 456.6207
|
| 107 |
+
2025-06-25 07:28:18,809 | INFO | iter 000008 | lr 0.0180 | loss 66.4117 | norm 495.8205
|
| 108 |
+
2025-06-25 07:28:23,750 | INFO | iter 000009 | lr 0.0180 | loss 67.2475 | norm 531.9467
|
| 109 |
+
2025-06-25 07:28:29,478 | INFO | iter 000010 | lr 0.0180 | loss 27.9346 | norm 566.8688
|
| 110 |
+
2025-06-25 07:28:34,849 | INFO | iter 000011 | lr 0.0180 | loss -10.5653 | norm 601.7505
|
| 111 |
+
2025-06-25 07:28:40,026 | INFO | iter 000012 | lr 0.0180 | loss -32.7803 | norm 636.7070
|
| 112 |
+
2025-06-25 07:28:45,181 | INFO | iter 000013 | lr 0.0180 | loss -49.7714 | norm 671.7962
|
| 113 |
+
2025-06-25 07:28:50,620 | INFO | iter 000014 | lr 0.0180 | loss -94.9201 | norm 707.5554
|
| 114 |
+
2025-06-25 07:28:55,620 | INFO | iter 000015 | lr 0.0180 | loss -195.6995 | norm 743.8602
|
| 115 |
+
2025-06-25 07:29:00,782 | INFO | iter 000016 | lr 0.0180 | loss -289.3528 | norm 781.8421
|
| 116 |
+
2025-06-25 07:29:05,969 | INFO | iter 000017 | lr 0.0180 | loss -385.6000 | norm 822.7772
|
| 117 |
+
2025-06-25 07:29:11,247 | INFO | iter 000018 | lr 0.0180 | loss -617.5768 | norm 866.3561
|
| 118 |
+
2025-06-25 07:29:16,457 | INFO | iter 000019 | lr 0.0180 | loss -758.4005 | norm 913.5998
|
| 119 |
+
2025-06-25 07:29:21,562 | INFO | iter 000020 | lr 0.0180 | loss -962.2928 | norm 964.2577
|
| 120 |
+
2025-06-25 07:29:26,751 | INFO | iter 000021 | lr 0.0180 | loss -1308.8584 | norm 1018.9549
|
| 121 |
+
2025-06-25 07:29:32,200 | INFO | iter 000022 | lr 0.0180 | loss -1507.0786 | norm 1075.5761
|
| 122 |
+
2025-06-25 07:29:37,305 | INFO | iter 000023 | lr 0.0180 | loss -1928.5552 | norm 1136.3061
|
| 123 |
+
2025-06-25 07:29:42,442 | INFO | iter 000024 | lr 0.0180 | loss -2365.0591 | norm 1200.9531
|
| 124 |
+
2025-06-25 07:29:47,581 | INFO | iter 000025 | lr 0.0180 | loss -2911.7729 | norm 1269.1230
|
| 125 |
+
2025-06-25 07:29:52,848 | INFO | iter 000026 | lr 0.0180 | loss -3918.7095 | norm 1340.4335
|
| 126 |
+
2025-06-25 07:29:58,025 | INFO | iter 000027 | lr 0.0180 | loss -4075.2781 | norm 1415.0733
|
| 127 |
+
2025-06-25 07:30:03,634 | INFO | iter 000028 | lr 0.0180 | loss -4800.8032 | norm 1492.5634
|
| 128 |
+
2025-06-25 07:30:08,717 | INFO | iter 000029 | lr 0.0180 | loss -5429.1694 | norm 1572.5761
|
| 129 |
+
2025-06-25 07:30:13,912 | INFO | iter 000030 | lr 0.0180 | loss -6909.1343 | norm 1655.3578
|
| 130 |
+
2025-06-25 07:30:19,088 | INFO | iter 000031 | lr 0.0180 | loss -7403.7188 | norm 1740.2757
|
| 131 |
+
2025-06-25 07:30:24,170 | INFO | iter 000032 | lr 0.0180 | loss -8883.3643 | norm 1827.2884
|
| 132 |
+
2025-06-25 07:30:29,463 | INFO | iter 000033 | lr 0.0180 | loss -9913.4092 | norm 1916.4071
|
| 133 |
+
2025-06-25 07:30:34,811 | INFO | iter 000034 | lr 0.0180 | loss -12094.2510 | norm 2007.2927
|
| 134 |
+
2025-06-25 07:30:40,049 | INFO | iter 000035 | lr 0.0180 | loss -13123.3652 | norm 2099.8649
|
| 135 |
+
2025-06-25 07:30:45,184 | INFO | iter 000036 | lr 0.0180 | loss -13453.2988 | norm 2187.7686
|
| 136 |
+
2025-06-25 07:30:50,352 | INFO | iter 000037 | lr 0.0180 | loss -15590.8887 | norm 2277.7149
|
| 137 |
+
2025-06-25 07:30:55,349 | INFO | iter 000038 | lr 0.0180 | loss -17174.6211 | norm 2369.4460
|
| 138 |
+
2025-06-25 07:31:00,515 | INFO | iter 000039 | lr 0.0180 | loss -18859.8008 | norm 2462.6141
|
| 139 |
+
2025-06-25 07:31:05,868 | INFO | iter 000040 | lr 0.0180 | loss -22396.0918 | norm 2557.2235
|
| 140 |
+
2025-06-25 07:31:11,195 | INFO | iter 000041 | lr 0.0180 | loss -23291.8730 | norm 2652.7654
|
| 141 |
+
2025-06-25 07:31:16,272 | INFO | iter 000042 | lr 0.0180 | loss -24466.0820 | norm 2748.3429
|
| 142 |
+
2025-06-25 07:31:21,340 | INFO | iter 000043 | lr 0.0180 | loss -26257.3105 | norm 2844.3545
|
| 143 |
+
2025-06-25 07:31:26,604 | INFO | iter 000044 | lr 0.0180 | loss -28454.4160 | norm 2941.0400
|
| 144 |
+
2025-06-25 07:31:32,098 | INFO | iter 000045 | lr 0.0180 | loss -30838.6445 | norm 3038.6056
|
| 145 |
+
2025-06-25 07:31:37,115 | INFO | iter 000046 | lr 0.0180 | loss -32684.4766 | norm 3136.7237
|
| 146 |
+
2025-06-25 07:31:42,152 | INFO | iter 000047 | lr 0.0180 | loss -39129.3398 | norm 3235.0764
|
| 147 |
+
2025-06-25 07:31:47,357 | INFO | iter 000048 | lr 0.0180 | loss -37924.5391 | norm 3333.8420
|
| 148 |
+
2025-06-25 07:31:52,616 | INFO | iter 000049 | lr 0.0180 | loss -39045.4688 | norm 3432.5284
|
| 149 |
+
2025-06-25 07:31:52,617 | INFO | Completed LR test 1/10: lr=0.0180
|
| 150 |
+
2025-06-25 07:31:52,965 | INFO | Cleanup complete
|
| 151 |
+
2025-06-25 08:00:39,916 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
|
| 152 |
+
2025-06-25 08:00:41,765 | INFO | Loaded checkpoint with optimizer: adam
|
| 153 |
+
2025-06-25 08:00:41,766 | INFO | Current learning rate: 0.0018
|
| 154 |
+
2025-06-25 08:00:42,410 | INFO | Weight decay: 0.1
|
| 155 |
+
2025-06-25 08:00:42,411 | INFO | Epsilon: 1e-08
|
| 156 |
+
2025-06-25 08:00:42,411 | INFO | Loaded 147 first moment (m) buffers
|
| 157 |
+
2025-06-25 08:00:42,411 | INFO | Loaded 147 second moment (v) buffers
|
| 158 |
+
2025-06-25 08:00:42,411 | INFO | Optimizer state loading completed!
|
| 159 |
+
2025-06-25 08:00:44,469 | INFO | Initialized xs with norm: 1.273415
|
| 160 |
+
2025-06-25 08:00:44,473 | INFO | -------------------------------- EoS --------------------------------
|
| 161 |
+
2025-06-25 08:00:44,473 | INFO | Starting LR test 1/10: lr=0.0180
|
| 162 |
+
2025-06-25 08:00:44,473 | INFO | Starting EoS for LR factor 10.0000
|
| 163 |
+
2025-06-25 08:00:44,474 | INFO | Starting EoS for checkpoint 002000
|
| 164 |
+
2025-06-25 08:00:44,474 | INFO | Starting EoS for model gpt2_small
|
| 165 |
+
2025-06-25 08:00:44,474 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 166 |
+
2025-06-25 08:00:44,474 | INFO | Starting EoS for num_iterations 50
|
| 167 |
+
2025-06-25 08:00:44,474 | INFO | Starting EoS for accum_steps 4
|
| 168 |
+
2025-06-25 08:00:44,474 | INFO | Loading model and checkpoint...
|
| 169 |
+
2025-06-25 08:00:45,423 | INFO | Wrapping model with DDP...
|
| 170 |
+
2025-06-25 08:00:45,442 | INFO | Loading state dict...
|
| 171 |
+
2025-06-25 08:00:45,445 | INFO | Model loaded successfully!
|
| 172 |
+
2025-06-25 08:00:52,795 | INFO | iter 000000 | lr 0.0180 | loss 4.0603 | norm 103.6942
|
| 173 |
+
2025-06-25 08:00:57,878 | INFO | iter 000001 | lr 0.0180 | loss 14.7676 | norm 181.4721
|
| 174 |
+
2025-06-25 08:01:03,515 | INFO | iter 000002 | lr 0.0180 | loss 240.4388 | norm 221.1537
|
| 175 |
+
2025-06-25 08:01:08,778 | INFO | iter 000003 | lr 0.0180 | loss 37.1792 | norm 277.3678
|
| 176 |
+
2025-06-25 08:01:13,940 | INFO | iter 000004 | lr 0.0180 | loss 125.9179 | norm 326.9716
|
| 177 |
+
2025-06-25 08:01:19,048 | INFO | iter 000005 | lr 0.0180 | loss 119.1384 | norm 371.6566
|
| 178 |
+
2025-06-25 08:01:24,067 | INFO | iter 000006 | lr 0.0180 | loss 58.8411 | norm 415.6504
|
| 179 |
+
2025-06-25 08:01:29,398 | INFO | iter 000007 | lr 0.0180 | loss 50.0099 | norm 457.2432
|
| 180 |
+
2025-06-25 08:01:34,715 | INFO | iter 000008 | lr 0.0180 | loss 57.1595 | norm 495.9399
|
| 181 |
+
2025-06-25 08:01:39,808 | INFO | iter 000009 | lr 0.0180 | loss 50.3250 | norm 531.3629
|
| 182 |
+
2025-06-25 08:01:45,102 | INFO | iter 000010 | lr 0.0180 | loss 18.0556 | norm 566.8195
|
| 183 |
+
2025-06-25 08:01:50,160 | INFO | iter 000011 | lr 0.0180 | loss -12.1102 | norm 602.2888
|
| 184 |
+
2025-06-25 08:01:55,242 | INFO | iter 000012 | lr 0.0180 | loss -66.1828 | norm 639.0583
|
| 185 |
+
2025-06-25 08:02:00,842 | INFO | iter 000013 | lr 0.0180 | loss -103.6073 | norm 677.0137
|
| 186 |
+
2025-06-25 08:02:05,876 | INFO | iter 000014 | lr 0.0180 | loss -169.7566 | norm 715.7398
|
| 187 |
+
2025-06-25 08:02:11,234 | INFO | iter 000015 | lr 0.0180 | loss -249.2370 | norm 755.9124
|
| 188 |
+
2025-06-25 08:02:16,241 | INFO | iter 000016 | lr 0.0180 | loss -360.8513 | norm 798.1494
|
| 189 |
+
2025-06-25 08:02:21,418 | INFO | iter 000017 | lr 0.0180 | loss -541.1733 | norm 843.1959
|
| 190 |
+
2025-06-25 08:02:26,629 | INFO | iter 000018 | lr 0.0180 | loss -727.0453 | norm 891.4551
|
| 191 |
+
2025-06-25 08:02:32,162 | INFO | iter 000019 | lr 0.0180 | loss -957.3318 | norm 943.1777
|
| 192 |
+
2025-06-25 08:02:43,080 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt
|
| 193 |
+
2025-06-25 08:02:44,945 | INFO | Loaded checkpoint with optimizer: adam
|
| 194 |
+
2025-06-25 08:02:44,945 | INFO | Current learning rate: 0.0018
|
| 195 |
+
2025-06-25 08:02:45,555 | INFO | Weight decay: 0.1
|
| 196 |
+
2025-06-25 08:02:45,555 | INFO | Epsilon: 1e-08
|
| 197 |
+
2025-06-25 08:02:45,555 | INFO | Loaded 147 first moment (m) buffers
|
| 198 |
+
2025-06-25 08:02:45,555 | INFO | Loaded 147 second moment (v) buffers
|
| 199 |
+
2025-06-25 08:02:45,555 | INFO | Optimizer state loading completed!
|
| 200 |
+
2025-06-25 08:02:47,458 | INFO | Initialized xs with norm: 1.273634
|
| 201 |
+
2025-06-25 08:02:47,466 | INFO | -------------------------------- EoS --------------------------------
|
| 202 |
+
2025-06-25 08:02:47,466 | INFO | Starting LR test 1/10: lr=0.1800
|
| 203 |
+
2025-06-25 08:02:47,466 | INFO | Starting EoS for LR factor 100.0000
|
| 204 |
+
2025-06-25 08:02:47,466 | INFO | Starting EoS for checkpoint 002000
|
| 205 |
+
2025-06-25 08:02:47,466 | INFO | Starting EoS for model gpt2_small
|
| 206 |
+
2025-06-25 08:02:47,466 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 207 |
+
2025-06-25 08:02:47,467 | INFO | Starting EoS for num_iterations 50
|
| 208 |
+
2025-06-25 08:02:47,467 | INFO | Starting EoS for accum_steps 4
|
| 209 |
+
2025-06-25 08:02:47,467 | INFO | Loading model and checkpoint...
|
| 210 |
+
2025-06-25 08:02:48,209 | INFO | Wrapping model with DDP...
|
| 211 |
+
2025-06-25 08:02:48,467 | INFO | Loading state dict...
|
| 212 |
+
2025-06-25 08:02:48,470 | INFO | Model loaded successfully!
|
| 213 |
+
2025-06-25 08:02:54,668 | INFO | iter 000000 | lr 0.1800 | loss 4.0603 | norm 1036.7952
|
| 214 |
+
2025-06-25 08:02:59,931 | INFO | iter 000001 | lr 0.1800 | loss 1118.2144 | norm 1860.5170
|
| 215 |
+
2025-06-25 08:03:05,288 | INFO | iter 000002 | lr 0.1800 | loss 27679.1367 | norm 2199.6821
|
| 216 |
+
2025-06-25 08:03:10,533 | INFO | iter 000003 | lr 0.1800 | loss 3684.1533 | norm 2738.3308
|
| 217 |
+
2025-06-25 08:03:15,885 | INFO | iter 000004 | lr 0.1800 | loss 12768.7715 | norm 3217.2167
|
| 218 |
+
2025-06-25 08:03:21,102 | INFO | iter 000005 | lr 0.1800 | loss 13399.8350 | norm 3629.7486
|
| 219 |
+
2025-06-25 08:03:26,163 | INFO | iter 000006 | lr 0.1800 | loss 6779.9473 | norm 4026.9859
|
| 220 |
+
2025-06-25 08:03:31,728 | INFO | iter 000007 | lr 0.1800 | loss 4730.6021 | norm 4397.5922
|
| 221 |
+
2025-06-25 08:03:37,023 | INFO | iter 000008 | lr 0.1800 | loss 5649.2324 | norm 4731.8789
|
| 222 |
+
2025-06-25 08:03:42,224 | INFO | iter 000009 | lr 0.1800 | loss 5887.6724 | norm 5023.6548
|
| 223 |
+
2025-06-25 08:03:47,336 | INFO | iter 000010 | lr 0.1800 | loss 2948.2642 | norm 5307.5434
|
| 224 |
+
2025-06-25 08:03:52,742 | INFO | iter 000011 | lr 0.1800 | loss 679.2209 | norm 5583.8226
|
| 225 |
+
2025-06-25 08:03:57,967 | INFO | iter 000012 | lr 0.1800 | loss -3517.8269 | norm 5866.7620
|
| 226 |
+
2025-06-25 08:04:03,641 | INFO | iter 000013 | lr 0.1800 | loss -6241.0791 | norm 6155.4482
|
| 227 |
+
2025-06-25 08:04:08,650 | INFO | iter 000014 | lr 0.1800 | loss -10283.7734 | norm 6445.5204
|
| 228 |
+
2025-06-25 08:04:14,081 | INFO | iter 000015 | lr 0.1800 | loss -15390.3262 | norm 6741.4577
|
| 229 |
+
2025-06-25 08:04:19,406 | INFO | iter 000016 | lr 0.1800 | loss -23139.1680 | norm 7049.2437
|
| 230 |
+
2025-06-25 08:04:24,416 | INFO | iter 000017 | lr 0.1800 | loss -35265.1953 | norm 7376.9862
|
| 231 |
+
2025-06-25 08:04:30,114 | INFO | iter 000018 | lr 0.1800 | loss -47734.4375 | norm 7729.4336
|
| 232 |
+
2025-06-25 08:04:35,524 | INFO | iter 000019 | lr 0.1800 | loss -63256.2305 | norm 8108.3238
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "gpt2_small",
|
| 3 |
+
"factor_min": 0.6,
|
| 4 |
+
"factor_max": 1.5,
|
| 5 |
+
"factor_num": 10,
|
| 6 |
+
"error": 0.0001,
|
| 7 |
+
"accum_steps": 4,
|
| 8 |
+
"num_iterations": 50,
|
| 9 |
+
"num_checkpoint": 7000,
|
| 10 |
+
"input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
|
| 11 |
+
"run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
|
| 12 |
+
"timestamp": "250622_035242",
|
| 13 |
+
"raw": false
|
| 14 |
+
}
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr_iter.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/training.log
ADDED
|
@@ -0,0 +1,640 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-25 05:35:53,415 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_007000.pt
|
| 2 |
+
2025-06-25 05:35:55,332 | INFO | Loaded checkpoint with optimizer: adam
|
| 3 |
+
2025-06-25 05:35:55,332 | INFO | Current learning rate: 0.0018
|
| 4 |
+
2025-06-25 05:35:55,930 | INFO | Weight decay: 0.1
|
| 5 |
+
2025-06-25 05:35:55,930 | INFO | Epsilon: 1e-08
|
| 6 |
+
2025-06-25 05:35:55,930 | INFO | Loaded 147 first moment (m) buffers
|
| 7 |
+
2025-06-25 05:35:55,930 | INFO | Loaded 147 second moment (v) buffers
|
| 8 |
+
2025-06-25 05:35:55,930 | INFO | Optimizer state loading completed!
|
| 9 |
+
2025-06-25 05:35:57,847 | INFO | Initialized xs with norm: 1.273580
|
| 10 |
+
2025-06-25 05:35:57,853 | INFO | -------------------------------- EoS --------------------------------
|
| 11 |
+
2025-06-25 05:35:57,853 | INFO | Starting LR test 1/10: lr=0.0011
|
| 12 |
+
2025-06-25 05:35:57,853 | INFO | Starting EoS for LR factor 0.6000
|
| 13 |
+
2025-06-25 05:35:57,853 | INFO | Starting EoS for checkpoint 007000
|
| 14 |
+
2025-06-25 05:35:57,853 | INFO | Starting EoS for model gpt2_small
|
| 15 |
+
2025-06-25 05:35:57,853 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 16 |
+
2025-06-25 05:35:57,854 | INFO | Starting EoS for num_iterations 50
|
| 17 |
+
2025-06-25 05:35:57,854 | INFO | Starting EoS for accum_steps 4
|
| 18 |
+
2025-06-25 05:35:57,854 | INFO | Loading model and checkpoint...
|
| 19 |
+
2025-06-25 05:35:58,674 | INFO | Wrapping model with DDP...
|
| 20 |
+
2025-06-25 05:35:58,743 | INFO | Loading state dict...
|
| 21 |
+
2025-06-25 05:35:58,747 | INFO | Model loaded successfully!
|
| 22 |
+
2025-06-25 05:36:05,712 | INFO | iter 000000 | lr 0.0011 | loss 3.6876 | norm 13.7931
|
| 23 |
+
2025-06-25 05:36:10,973 | INFO | iter 000001 | lr 0.0011 | loss 3.5650 | norm 27.3894
|
| 24 |
+
2025-06-25 05:36:16,178 | INFO | iter 000002 | lr 0.0011 | loss 3.5923 | norm 40.9245
|
| 25 |
+
2025-06-25 05:36:21,425 | INFO | iter 000003 | lr 0.0011 | loss 3.7197 | norm 54.3712
|
| 26 |
+
2025-06-25 05:36:26,555 | INFO | iter 000004 | lr 0.0011 | loss 3.7198 | norm 67.7207
|
| 27 |
+
2025-06-25 05:36:32,221 | INFO | iter 000005 | lr 0.0011 | loss 3.7145 | norm 80.9691
|
| 28 |
+
2025-06-25 05:36:37,311 | INFO | iter 000006 | lr 0.0011 | loss 3.8629 | norm 94.1130
|
| 29 |
+
2025-06-25 05:36:42,481 | INFO | iter 000007 | lr 0.0011 | loss 3.8424 | norm 107.1491
|
| 30 |
+
2025-06-25 05:36:47,571 | INFO | iter 000008 | lr 0.0011 | loss 3.9408 | norm 120.0743
|
| 31 |
+
2025-06-25 05:36:52,819 | INFO | iter 000009 | lr 0.0011 | loss 3.9754 | norm 132.8863
|
| 32 |
+
2025-06-25 05:36:57,995 | INFO | iter 000010 | lr 0.0011 | loss 3.8358 | norm 145.5831
|
| 33 |
+
2025-06-25 05:37:03,602 | INFO | iter 000011 | lr 0.0011 | loss 3.7628 | norm 158.1634
|
| 34 |
+
2025-06-25 05:37:08,848 | INFO | iter 000012 | lr 0.0011 | loss 4.0600 | norm 170.6257
|
| 35 |
+
2025-06-25 05:37:14,100 | INFO | iter 000013 | lr 0.0011 | loss 3.9553 | norm 182.9687
|
| 36 |
+
2025-06-25 05:37:19,299 | INFO | iter 000014 | lr 0.0011 | loss 3.9901 | norm 195.1916
|
| 37 |
+
2025-06-25 05:37:24,464 | INFO | iter 000015 | lr 0.0011 | loss 4.3327 | norm 207.2938
|
| 38 |
+
2025-06-25 05:37:29,984 | INFO | iter 000016 | lr 0.0011 | loss 4.2055 | norm 219.2749
|
| 39 |
+
2025-06-25 05:37:35,307 | INFO | iter 000017 | lr 0.0011 | loss 4.0625 | norm 231.1345
|
| 40 |
+
2025-06-25 05:37:40,541 | INFO | iter 000018 | lr 0.0011 | loss 4.1686 | norm 242.8727
|
| 41 |
+
2025-06-25 05:37:45,565 | INFO | iter 000019 | lr 0.0011 | loss 4.1835 | norm 254.4896
|
| 42 |
+
2025-06-25 05:37:50,753 | INFO | iter 000020 | lr 0.0011 | loss 4.1426 | norm 265.9856
|
| 43 |
+
2025-06-25 05:37:56,056 | INFO | iter 000021 | lr 0.0011 | loss 4.2872 | norm 277.3614
|
| 44 |
+
2025-06-25 05:38:01,898 | INFO | iter 000022 | lr 0.0011 | loss 4.5773 | norm 288.6175
|
| 45 |
+
2025-06-25 05:38:07,056 | INFO | iter 000023 | lr 0.0011 | loss 4.4376 | norm 299.7551
|
| 46 |
+
2025-06-25 05:38:12,351 | INFO | iter 000024 | lr 0.0011 | loss 4.2737 | norm 310.7748
|
| 47 |
+
2025-06-25 05:38:17,566 | INFO | iter 000025 | lr 0.0011 | loss 4.4620 | norm 321.6779
|
| 48 |
+
2025-06-25 05:38:22,639 | INFO | iter 000026 | lr 0.0011 | loss 4.3275 | norm 332.4654
|
| 49 |
+
2025-06-25 05:38:27,858 | INFO | iter 000027 | lr 0.0011 | loss 4.3138 | norm 343.1386
|
| 50 |
+
2025-06-25 05:38:33,577 | INFO | iter 000028 | lr 0.0011 | loss 4.5462 | norm 353.6991
|
| 51 |
+
2025-06-25 05:38:38,886 | INFO | iter 000029 | lr 0.0011 | loss 4.4448 | norm 364.1481
|
| 52 |
+
2025-06-25 05:38:43,950 | INFO | iter 000030 | lr 0.0011 | loss 4.6483 | norm 374.4874
|
| 53 |
+
2025-06-25 05:38:49,201 | INFO | iter 000031 | lr 0.0011 | loss 4.6357 | norm 384.7184
|
| 54 |
+
2025-06-25 05:38:54,409 | INFO | iter 000032 | lr 0.0011 | loss 4.6015 | norm 394.8427
|
| 55 |
+
2025-06-25 05:38:59,422 | INFO | iter 000033 | lr 0.0011 | loss 4.7027 | norm 404.8622
|
| 56 |
+
2025-06-25 05:39:05,147 | INFO | iter 000034 | lr 0.0011 | loss 4.5023 | norm 414.7783
|
| 57 |
+
2025-06-25 05:39:10,566 | INFO | iter 000035 | lr 0.0011 | loss 4.9184 | norm 424.5929
|
| 58 |
+
2025-06-25 05:39:15,684 | INFO | iter 000036 | lr 0.0011 | loss 4.8844 | norm 434.3076
|
| 59 |
+
2025-06-25 05:39:20,799 | INFO | iter 000037 | lr 0.0011 | loss 5.1758 | norm 443.9242
|
| 60 |
+
2025-06-25 05:39:25,840 | INFO | iter 000038 | lr 0.0011 | loss 5.0171 | norm 453.4445
|
| 61 |
+
2025-06-25 05:39:31,938 | INFO | iter 000039 | lr 0.0011 | loss 4.8304 | norm 462.8701
|
| 62 |
+
2025-06-25 05:39:36,990 | INFO | iter 000040 | lr 0.0011 | loss 5.0139 | norm 472.2029
|
| 63 |
+
2025-06-25 05:39:42,282 | INFO | iter 000041 | lr 0.0011 | loss 4.5102 | norm 481.4444
|
| 64 |
+
2025-06-25 05:39:47,604 | INFO | iter 000042 | lr 0.0011 | loss 5.0961 | norm 490.5965
|
| 65 |
+
2025-06-25 05:39:52,697 | INFO | iter 000043 | lr 0.0011 | loss 4.9875 | norm 499.6608
|
| 66 |
+
2025-06-25 05:39:57,946 | INFO | iter 000044 | lr 0.0011 | loss 4.8258 | norm 508.6389
|
| 67 |
+
2025-06-25 05:40:03,733 | INFO | iter 000045 | lr 0.0011 | loss 5.1973 | norm 517.5325
|
| 68 |
+
2025-06-25 05:40:08,855 | INFO | iter 000046 | lr 0.0011 | loss 5.2978 | norm 526.3431
|
| 69 |
+
2025-06-25 05:40:14,128 | INFO | iter 000047 | lr 0.0011 | loss 5.1401 | norm 535.0723
|
| 70 |
+
2025-06-25 05:40:19,467 | INFO | iter 000048 | lr 0.0011 | loss 5.3447 | norm 543.7217
|
| 71 |
+
2025-06-25 05:40:24,714 | INFO | iter 000049 | lr 0.0011 | loss 4.6829 | norm 552.2926
|
| 72 |
+
2025-06-25 05:40:24,714 | INFO | Completed LR test 1/10: lr=0.0011
|
| 73 |
+
2025-06-25 05:40:24,751 | INFO | -------------------------------- EoS --------------------------------
|
| 74 |
+
2025-06-25 05:40:24,751 | INFO | Starting LR test 2/10: lr=0.0013
|
| 75 |
+
2025-06-25 05:40:24,751 | INFO | Starting EoS for LR factor 0.7000
|
| 76 |
+
2025-06-25 05:40:24,751 | INFO | Starting EoS for checkpoint 007000
|
| 77 |
+
2025-06-25 05:40:24,751 | INFO | Starting EoS for model gpt2_small
|
| 78 |
+
2025-06-25 05:40:24,751 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 79 |
+
2025-06-25 05:40:24,751 | INFO | Starting EoS for num_iterations 50
|
| 80 |
+
2025-06-25 05:40:24,751 | INFO | Starting EoS for accum_steps 4
|
| 81 |
+
2025-06-25 05:40:24,751 | INFO | Loading model and checkpoint...
|
| 82 |
+
2025-06-25 05:40:25,477 | INFO | Wrapping model with DDP...
|
| 83 |
+
2025-06-25 05:40:25,746 | INFO | Loading state dict...
|
| 84 |
+
2025-06-25 05:40:25,749 | INFO | Model loaded successfully!
|
| 85 |
+
2025-06-25 05:40:33,135 | INFO | iter 000000 | lr 0.0013 | loss 3.6876 | norm 16.0739
|
| 86 |
+
2025-06-25 05:40:38,238 | INFO | iter 000001 | lr 0.0013 | loss 3.5668 | norm 31.9203
|
| 87 |
+
2025-06-25 05:40:43,503 | INFO | iter 000002 | lr 0.0013 | loss 3.5993 | norm 47.6747
|
| 88 |
+
2025-06-25 05:40:48,640 | INFO | iter 000003 | lr 0.0013 | loss 3.7332 | norm 63.3073
|
| 89 |
+
2025-06-25 05:40:53,858 | INFO | iter 000004 | lr 0.0013 | loss 3.7399 | norm 78.8077
|
| 90 |
+
2025-06-25 05:40:59,072 | INFO | iter 000005 | lr 0.0013 | loss 3.7418 | norm 94.1714
|
| 91 |
+
2025-06-25 05:41:04,563 | INFO | iter 000006 | lr 0.0013 | loss 3.9009 | norm 109.3946
|
| 92 |
+
2025-06-25 05:41:09,648 | INFO | iter 000007 | lr 0.0013 | loss 3.8821 | norm 124.4729
|
| 93 |
+
2025-06-25 05:41:15,013 | INFO | iter 000008 | lr 0.0013 | loss 3.9903 | norm 139.4024
|
| 94 |
+
2025-06-25 05:41:20,139 | INFO | iter 000009 | lr 0.0013 | loss 4.0268 | norm 154.1803
|
| 95 |
+
2025-06-25 05:41:25,227 | INFO | iter 000010 | lr 0.0013 | loss 3.8996 | norm 168.8045
|
| 96 |
+
2025-06-25 05:41:31,038 | INFO | iter 000011 | lr 0.0013 | loss 3.8270 | norm 183.2730
|
| 97 |
+
2025-06-25 05:41:36,315 | INFO | iter 000012 | lr 0.0013 | loss 4.1310 | norm 197.5845
|
| 98 |
+
2025-06-25 05:41:41,326 | INFO | iter 000013 | lr 0.0013 | loss 4.0251 | norm 211.7373
|
| 99 |
+
2025-06-25 05:41:46,451 | INFO | iter 000014 | lr 0.0013 | loss 4.0945 | norm 225.7308
|
| 100 |
+
2025-06-25 05:41:51,631 | INFO | iter 000015 | lr 0.0013 | loss 4.4285 | norm 239.5642
|
| 101 |
+
2025-06-25 05:41:56,676 | INFO | iter 000016 | lr 0.0013 | loss 4.3138 | norm 253.2372
|
| 102 |
+
2025-06-25 05:42:02,225 | INFO | iter 000017 | lr 0.0013 | loss 4.1754 | norm 266.7497
|
| 103 |
+
2025-06-25 05:42:07,383 | INFO | iter 000018 | lr 0.0013 | loss 4.2904 | norm 280.1020
|
| 104 |
+
2025-06-25 05:42:12,527 | INFO | iter 000019 | lr 0.0013 | loss 4.2943 | norm 293.2945
|
| 105 |
+
2025-06-25 05:42:17,700 | INFO | iter 000020 | lr 0.0013 | loss 4.2564 | norm 306.3280
|
| 106 |
+
2025-06-25 05:42:23,000 | INFO | iter 000021 | lr 0.0013 | loss 4.4312 | norm 319.2035
|
| 107 |
+
2025-06-25 05:42:28,355 | INFO | iter 000022 | lr 0.0013 | loss 4.7017 | norm 331.9223
|
| 108 |
+
2025-06-25 05:42:34,085 | INFO | iter 000023 | lr 0.0013 | loss 4.6376 | norm 344.4858
|
| 109 |
+
2025-06-25 05:42:39,128 | INFO | iter 000024 | lr 0.0013 | loss 4.4395 | norm 356.8954
|
| 110 |
+
2025-06-25 05:42:44,446 | INFO | iter 000025 | lr 0.0013 | loss 4.6249 | norm 369.1527
|
| 111 |
+
2025-06-25 05:42:49,650 | INFO | iter 000026 | lr 0.0013 | loss 4.4875 | norm 381.2595
|
| 112 |
+
2025-06-25 05:42:54,760 | INFO | iter 000027 | lr 0.0013 | loss 4.4658 | norm 393.2177
|
| 113 |
+
2025-06-25 05:43:00,105 | INFO | iter 000028 | lr 0.0013 | loss 4.7561 | norm 405.0296
|
| 114 |
+
2025-06-25 05:43:05,673 | INFO | iter 000029 | lr 0.0013 | loss 4.6378 | norm 416.6971
|
| 115 |
+
2025-06-25 05:43:10,891 | INFO | iter 000030 | lr 0.0013 | loss 4.8317 | norm 428.2225
|
| 116 |
+
2025-06-25 05:43:16,007 | INFO | iter 000031 | lr 0.0013 | loss 4.8500 | norm 439.6082
|
| 117 |
+
2025-06-25 05:43:21,121 | INFO | iter 000032 | lr 0.0013 | loss 4.8139 | norm 450.8564
|
| 118 |
+
2025-06-25 05:43:26,282 | INFO | iter 000033 | lr 0.0013 | loss 4.9246 | norm 461.9696
|
| 119 |
+
2025-06-25 05:43:31,833 | INFO | iter 000034 | lr 0.0013 | loss 4.6747 | norm 472.9502
|
| 120 |
+
2025-06-25 05:43:37,078 | INFO | iter 000035 | lr 0.0013 | loss 5.1966 | norm 483.8006
|
| 121 |
+
2025-06-25 05:43:42,249 | INFO | iter 000036 | lr 0.0013 | loss 5.1442 | norm 494.5233
|
| 122 |
+
2025-06-25 05:43:47,360 | INFO | iter 000037 | lr 0.0013 | loss 5.4601 | norm 505.1207
|
| 123 |
+
2025-06-25 05:43:52,622 | INFO | iter 000038 | lr 0.0013 | loss 5.2407 | norm 515.5953
|
| 124 |
+
2025-06-25 05:43:57,845 | INFO | iter 000039 | lr 0.0013 | loss 5.0629 | norm 525.9496
|
| 125 |
+
2025-06-25 05:44:03,515 | INFO | iter 000040 | lr 0.0013 | loss 5.2719 | norm 536.1859
|
| 126 |
+
2025-06-25 05:44:08,699 | INFO | iter 000041 | lr 0.0013 | loss 4.7011 | norm 546.3068
|
| 127 |
+
2025-06-25 05:44:13,777 | INFO | iter 000042 | lr 0.0013 | loss 5.3736 | norm 556.3145
|
| 128 |
+
2025-06-25 05:44:19,131 | INFO | iter 000043 | lr 0.0013 | loss 5.2889 | norm 566.2115
|
| 129 |
+
2025-06-25 05:44:24,440 | INFO | iter 000044 | lr 0.0013 | loss 5.0845 | norm 576.0000
|
| 130 |
+
2025-06-25 05:44:30,026 | INFO | iter 000045 | lr 0.0013 | loss 5.4554 | norm 585.6822
|
| 131 |
+
2025-06-25 05:44:35,085 | INFO | iter 000046 | lr 0.0013 | loss 5.5968 | norm 595.2605
|
| 132 |
+
2025-06-25 05:44:40,229 | INFO | iter 000047 | lr 0.0013 | loss 5.4326 | norm 604.7369
|
| 133 |
+
2025-06-25 05:44:45,571 | INFO | iter 000048 | lr 0.0013 | loss 5.6167 | norm 614.1136
|
| 134 |
+
2025-06-25 05:44:50,721 | INFO | iter 000049 | lr 0.0013 | loss 4.9240 | norm 623.3926
|
| 135 |
+
2025-06-25 05:44:50,722 | INFO | Completed LR test 2/10: lr=0.0013
|
| 136 |
+
2025-06-25 05:44:50,738 | INFO | -------------------------------- EoS --------------------------------
|
| 137 |
+
2025-06-25 05:44:50,738 | INFO | Starting LR test 3/10: lr=0.0014
|
| 138 |
+
2025-06-25 05:44:50,738 | INFO | Starting EoS for LR factor 0.8000
|
| 139 |
+
2025-06-25 05:44:50,738 | INFO | Starting EoS for checkpoint 007000
|
| 140 |
+
2025-06-25 05:44:50,738 | INFO | Starting EoS for model gpt2_small
|
| 141 |
+
2025-06-25 05:44:50,738 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 142 |
+
2025-06-25 05:44:50,738 | INFO | Starting EoS for num_iterations 50
|
| 143 |
+
2025-06-25 05:44:50,738 | INFO | Starting EoS for accum_steps 4
|
| 144 |
+
2025-06-25 05:44:50,738 | INFO | Loading model and checkpoint...
|
| 145 |
+
2025-06-25 05:44:51,462 | INFO | Wrapping model with DDP...
|
| 146 |
+
2025-06-25 05:44:51,701 | INFO | Loading state dict...
|
| 147 |
+
2025-06-25 05:44:51,704 | INFO | Model loaded successfully!
|
| 148 |
+
2025-06-25 05:44:58,088 | INFO | iter 000000 | lr 0.0014 | loss 3.6876 | norm 18.3568
|
| 149 |
+
2025-06-25 05:45:03,653 | INFO | iter 000001 | lr 0.0014 | loss 3.5691 | norm 36.4448
|
| 150 |
+
2025-06-25 05:45:08,934 | INFO | iter 000002 | lr 0.0014 | loss 3.6073 | norm 54.4071
|
| 151 |
+
2025-06-25 05:45:14,009 | INFO | iter 000003 | lr 0.0014 | loss 3.7478 | norm 72.2096
|
| 152 |
+
2025-06-25 05:45:19,098 | INFO | iter 000004 | lr 0.0014 | loss 3.7622 | norm 89.8399
|
| 153 |
+
2025-06-25 05:45:24,184 | INFO | iter 000005 | lr 0.0014 | loss 3.7683 | norm 107.2932
|
| 154 |
+
2025-06-25 05:45:29,635 | INFO | iter 000006 | lr 0.0014 | loss 3.9400 | norm 124.5648
|
| 155 |
+
2025-06-25 05:45:35,126 | INFO | iter 000007 | lr 0.0014 | loss 3.9224 | norm 141.6493
|
| 156 |
+
2025-06-25 05:45:40,271 | INFO | iter 000008 | lr 0.0014 | loss 4.0389 | norm 158.5423
|
| 157 |
+
2025-06-25 05:45:45,417 | INFO | iter 000009 | lr 0.0014 | loss 4.0786 | norm 175.2402
|
| 158 |
+
2025-06-25 05:45:50,522 | INFO | iter 000010 | lr 0.0014 | loss 3.9723 | norm 191.7406
|
| 159 |
+
2025-06-25 05:45:55,674 | INFO | iter 000011 | lr 0.0014 | loss 3.8953 | norm 208.0416
|
| 160 |
+
2025-06-25 05:46:01,485 | INFO | iter 000012 | lr 0.0014 | loss 4.2137 | norm 224.1416
|
| 161 |
+
2025-06-25 05:46:06,578 | INFO | iter 000013 | lr 0.0014 | loss 4.0981 | norm 240.0388
|
| 162 |
+
2025-06-25 05:46:11,525 | INFO | iter 000014 | lr 0.0014 | loss 4.1986 | norm 255.7326
|
| 163 |
+
2025-06-25 05:46:16,727 | INFO | iter 000015 | lr 0.0014 | loss 4.5317 | norm 271.2225
|
| 164 |
+
2025-06-25 05:46:21,994 | INFO | iter 000016 | lr 0.0014 | loss 4.4237 | norm 286.5083
|
| 165 |
+
2025-06-25 05:46:27,164 | INFO | iter 000017 | lr 0.0014 | loss 4.3022 | norm 301.5904
|
| 166 |
+
2025-06-25 05:46:32,884 | INFO | iter 000018 | lr 0.0014 | loss 4.3945 | norm 316.4691
|
| 167 |
+
2025-06-25 05:46:38,027 | INFO | iter 000019 | lr 0.0014 | loss 4.4074 | norm 331.1456
|
| 168 |
+
2025-06-25 05:46:43,453 | INFO | iter 000020 | lr 0.0014 | loss 4.3716 | norm 345.6211
|
| 169 |
+
2025-06-25 05:46:48,517 | INFO | iter 000021 | lr 0.0014 | loss 4.5736 | norm 359.8973
|
| 170 |
+
2025-06-25 05:46:53,691 | INFO | iter 000022 | lr 0.0014 | loss 4.8349 | norm 373.9760
|
| 171 |
+
2025-06-25 05:46:58,799 | INFO | iter 000023 | lr 0.0014 | loss 4.8390 | norm 387.8593
|
| 172 |
+
2025-06-25 05:47:04,367 | INFO | iter 000024 | lr 0.0014 | loss 4.6071 | norm 401.5493
|
| 173 |
+
2025-06-25 05:47:09,593 | INFO | iter 000025 | lr 0.0014 | loss 4.8229 | norm 415.0486
|
| 174 |
+
2025-06-25 05:47:14,751 | INFO | iter 000026 | lr 0.0014 | loss 4.6378 | norm 428.3594
|
| 175 |
+
2025-06-25 05:47:19,821 | INFO | iter 000027 | lr 0.0014 | loss 4.6228 | norm 441.4847
|
| 176 |
+
2025-06-25 05:47:24,895 | INFO | iter 000028 | lr 0.0014 | loss 4.9751 | norm 454.4274
|
| 177 |
+
2025-06-25 05:47:30,528 | INFO | iter 000029 | lr 0.0014 | loss 4.8282 | norm 467.1905
|
| 178 |
+
2025-06-25 05:47:35,961 | INFO | iter 000030 | lr 0.0014 | loss 5.0116 | norm 479.7771
|
| 179 |
+
2025-06-25 05:47:41,409 | INFO | iter 000031 | lr 0.0014 | loss 5.0729 | norm 492.1905
|
| 180 |
+
2025-06-25 05:47:46,555 | INFO | iter 000032 | lr 0.0014 | loss 5.0228 | norm 504.4338
|
| 181 |
+
2025-06-25 05:47:51,877 | INFO | iter 000033 | lr 0.0014 | loss 5.1236 | norm 516.5103
|
| 182 |
+
2025-06-25 05:47:57,014 | INFO | iter 000034 | lr 0.0014 | loss 4.8552 | norm 528.4233
|
| 183 |
+
2025-06-25 05:48:02,608 | INFO | iter 000035 | lr 0.0014 | loss 5.4885 | norm 540.1762
|
| 184 |
+
2025-06-25 05:48:07,748 | INFO | iter 000036 | lr 0.0014 | loss 5.4107 | norm 551.7723
|
| 185 |
+
2025-06-25 05:48:12,707 | INFO | iter 000037 | lr 0.0014 | loss 5.7052 | norm 563.2150
|
| 186 |
+
2025-06-25 05:48:17,970 | INFO | iter 000038 | lr 0.0014 | loss 5.4606 | norm 574.5076
|
| 187 |
+
2025-06-25 05:48:23,145 | INFO | iter 000039 | lr 0.0014 | loss 5.2917 | norm 585.6534
|
| 188 |
+
2025-06-25 05:48:28,398 | INFO | iter 000040 | lr 0.0014 | loss 5.5305 | norm 596.6557
|
| 189 |
+
2025-06-25 05:48:33,822 | INFO | iter 000041 | lr 0.0014 | loss 4.8888 | norm 607.5177
|
| 190 |
+
2025-06-25 05:48:39,061 | INFO | iter 000042 | lr 0.0014 | loss 5.6519 | norm 618.2427
|
| 191 |
+
2025-06-25 05:48:44,089 | INFO | iter 000043 | lr 0.0014 | loss 5.5571 | norm 628.8338
|
| 192 |
+
2025-06-25 05:48:49,294 | INFO | iter 000044 | lr 0.0014 | loss 5.3017 | norm 639.2940
|
| 193 |
+
2025-06-25 05:48:54,335 | INFO | iter 000045 | lr 0.0014 | loss 5.7088 | norm 649.6263
|
| 194 |
+
2025-06-25 05:48:59,520 | INFO | iter 000046 | lr 0.0014 | loss 5.8573 | norm 659.8338
|
| 195 |
+
2025-06-25 05:49:05,181 | INFO | iter 000047 | lr 0.0014 | loss 5.7481 | norm 669.9192
|
| 196 |
+
2025-06-25 05:49:10,172 | INFO | iter 000048 | lr 0.0014 | loss 5.8767 | norm 679.8854
|
| 197 |
+
2025-06-25 05:49:15,104 | INFO | iter 000049 | lr 0.0014 | loss 5.1259 | norm 689.7351
|
| 198 |
+
2025-06-25 05:49:15,104 | INFO | Completed LR test 3/10: lr=0.0014
|
| 199 |
+
2025-06-25 05:49:15,127 | INFO | -------------------------------- EoS --------------------------------
|
| 200 |
+
2025-06-25 05:49:15,127 | INFO | Starting LR test 4/10: lr=0.0016
|
| 201 |
+
2025-06-25 05:49:15,127 | INFO | Starting EoS for LR factor 0.9000
|
| 202 |
+
2025-06-25 05:49:15,127 | INFO | Starting EoS for checkpoint 007000
|
| 203 |
+
2025-06-25 05:49:15,127 | INFO | Starting EoS for model gpt2_small
|
| 204 |
+
2025-06-25 05:49:15,127 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 205 |
+
2025-06-25 05:49:15,127 | INFO | Starting EoS for num_iterations 50
|
| 206 |
+
2025-06-25 05:49:15,127 | INFO | Starting EoS for accum_steps 4
|
| 207 |
+
2025-06-25 05:49:15,127 | INFO | Loading model and checkpoint...
|
| 208 |
+
2025-06-25 05:49:15,838 | INFO | Wrapping model with DDP...
|
| 209 |
+
2025-06-25 05:49:16,179 | INFO | Loading state dict...
|
| 210 |
+
2025-06-25 05:49:16,183 | INFO | Model loaded successfully!
|
| 211 |
+
2025-06-25 05:49:22,383 | INFO | iter 000000 | lr 0.0016 | loss 3.6876 | norm 20.6411
|
| 212 |
+
2025-06-25 05:49:27,494 | INFO | iter 000001 | lr 0.0016 | loss 3.5716 | norm 40.9628
|
| 213 |
+
2025-06-25 05:49:33,578 | INFO | iter 000002 | lr 0.0016 | loss 3.6157 | norm 61.1216
|
| 214 |
+
2025-06-25 05:49:38,620 | INFO | iter 000003 | lr 0.0016 | loss 3.7629 | norm 81.0780
|
| 215 |
+
2025-06-25 05:49:43,686 | INFO | iter 000004 | lr 0.0016 | loss 3.7846 | norm 100.8176
|
| 216 |
+
2025-06-25 05:49:48,747 | INFO | iter 000005 | lr 0.0016 | loss 3.7971 | norm 120.3347
|
| 217 |
+
2025-06-25 05:49:53,747 | INFO | iter 000006 | lr 0.0016 | loss 3.9829 | norm 139.6241
|
| 218 |
+
2025-06-25 05:49:58,945 | INFO | iter 000007 | lr 0.0016 | loss 3.9620 | norm 158.6796
|
| 219 |
+
2025-06-25 05:50:04,505 | INFO | iter 000008 | lr 0.0016 | loss 4.0909 | norm 177.4956
|
| 220 |
+
2025-06-25 05:50:09,658 | INFO | iter 000009 | lr 0.0016 | loss 4.1380 | norm 196.0683
|
| 221 |
+
2025-06-25 05:50:14,683 | INFO | iter 000010 | lr 0.0016 | loss 4.0397 | norm 214.3950
|
| 222 |
+
2025-06-25 05:50:19,816 | INFO | iter 000011 | lr 0.0016 | loss 3.9675 | norm 232.4738
|
| 223 |
+
2025-06-25 05:50:24,917 | INFO | iter 000012 | lr 0.0016 | loss 4.2959 | norm 250.3029
|
| 224 |
+
2025-06-25 05:50:30,599 | INFO | iter 000013 | lr 0.0016 | loss 4.1695 | norm 267.8807
|
| 225 |
+
2025-06-25 05:50:35,603 | INFO | iter 000014 | lr 0.0016 | loss 4.3111 | norm 285.2066
|
| 226 |
+
2025-06-25 05:50:40,827 | INFO | iter 000015 | lr 0.0016 | loss 4.6379 | norm 302.2806
|
| 227 |
+
2025-06-25 05:50:46,091 | INFO | iter 000016 | lr 0.0016 | loss 4.5418 | norm 319.1028
|
| 228 |
+
2025-06-25 05:50:51,447 | INFO | iter 000017 | lr 0.0016 | loss 4.4444 | norm 335.6738
|
| 229 |
+
2025-06-25 05:50:56,560 | INFO | iter 000018 | lr 0.0016 | loss 4.5090 | norm 351.9947
|
| 230 |
+
2025-06-25 05:51:02,059 | INFO | iter 000019 | lr 0.0016 | loss 4.5233 | norm 368.0672
|
| 231 |
+
2025-06-25 05:51:07,086 | INFO | iter 000020 | lr 0.0016 | loss 4.4959 | norm 383.8933
|
| 232 |
+
2025-06-25 05:51:12,277 | INFO | iter 000021 | lr 0.0016 | loss 4.7354 | norm 399.4754
|
| 233 |
+
2025-06-25 05:51:17,599 | INFO | iter 000022 | lr 0.0016 | loss 4.9572 | norm 414.8162
|
| 234 |
+
2025-06-25 05:51:22,816 | INFO | iter 000023 | lr 0.0016 | loss 5.0554 | norm 429.9186
|
| 235 |
+
2025-06-25 05:51:28,254 | INFO | iter 000024 | lr 0.0016 | loss 4.7812 | norm 444.7857
|
| 236 |
+
2025-06-25 05:51:33,877 | INFO | iter 000025 | lr 0.0016 | loss 4.9812 | norm 459.4208
|
| 237 |
+
2025-06-25 05:51:38,996 | INFO | iter 000026 | lr 0.0016 | loss 4.7977 | norm 473.8274
|
| 238 |
+
2025-06-25 05:51:44,352 | INFO | iter 000027 | lr 0.0016 | loss 4.7787 | norm 488.0092
|
| 239 |
+
2025-06-25 05:51:49,761 | INFO | iter 000028 | lr 0.0016 | loss 5.1998 | norm 501.9703
|
| 240 |
+
2025-06-25 05:51:55,046 | INFO | iter 000029 | lr 0.0016 | loss 5.0280 | norm 515.7147
|
| 241 |
+
2025-06-25 05:52:00,665 | INFO | iter 000030 | lr 0.0016 | loss 5.1958 | norm 529.2467
|
| 242 |
+
2025-06-25 05:52:05,662 | INFO | iter 000031 | lr 0.0016 | loss 5.2719 | norm 542.5705
|
| 243 |
+
2025-06-25 05:52:10,734 | INFO | iter 000032 | lr 0.0016 | loss 5.2195 | norm 555.6903
|
| 244 |
+
2025-06-25 05:52:15,809 | INFO | iter 000033 | lr 0.0016 | loss 5.3781 | norm 568.6107
|
| 245 |
+
2025-06-25 05:52:20,946 | INFO | iter 000034 | lr 0.0016 | loss 5.0383 | norm 581.3358
|
| 246 |
+
2025-06-25 05:52:26,121 | INFO | iter 000035 | lr 0.0016 | loss 5.7508 | norm 593.8701
|
| 247 |
+
2025-06-25 05:52:31,748 | INFO | iter 000036 | lr 0.0016 | loss 5.6478 | norm 606.2180
|
| 248 |
+
2025-06-25 05:52:36,972 | INFO | iter 000037 | lr 0.0016 | loss 5.9527 | norm 618.3839
|
| 249 |
+
2025-06-25 05:52:42,355 | INFO | iter 000038 | lr 0.0016 | loss 5.6528 | norm 630.3722
|
| 250 |
+
2025-06-25 05:52:47,675 | INFO | iter 000039 | lr 0.0016 | loss 5.4899 | norm 642.1872
|
| 251 |
+
2025-06-25 05:52:52,854 | INFO | iter 000040 | lr 0.0016 | loss 5.7534 | norm 653.8331
|
| 252 |
+
2025-06-25 05:52:57,905 | INFO | iter 000041 | lr 0.0016 | loss 5.0604 | norm 665.3142
|
| 253 |
+
2025-06-25 05:53:03,388 | INFO | iter 000042 | lr 0.0016 | loss 5.9295 | norm 676.6346
|
| 254 |
+
2025-06-25 05:53:08,623 | INFO | iter 000043 | lr 0.0016 | loss 5.7979 | norm 687.7984
|
| 255 |
+
2025-06-25 05:53:13,916 | INFO | iter 000044 | lr 0.0016 | loss 5.5369 | norm 698.8096
|
| 256 |
+
2025-06-25 05:53:19,248 | INFO | iter 000045 | lr 0.0016 | loss 5.9652 | norm 709.6719
|
| 257 |
+
2025-06-25 05:53:24,627 | INFO | iter 000046 | lr 0.0016 | loss 6.0899 | norm 720.3892
|
| 258 |
+
2025-06-25 05:53:30,007 | INFO | iter 000047 | lr 0.0016 | loss 6.0334 | norm 730.9651
|
| 259 |
+
2025-06-25 05:53:35,384 | INFO | iter 000048 | lr 0.0016 | loss 6.1324 | norm 741.4034
|
| 260 |
+
2025-06-25 05:53:40,548 | INFO | iter 000049 | lr 0.0016 | loss 5.3401 | norm 751.7074
|
| 261 |
+
2025-06-25 05:53:40,549 | INFO | Completed LR test 4/10: lr=0.0016
|
| 262 |
+
2025-06-25 05:53:40,579 | INFO | -------------------------------- EoS --------------------------------
|
| 263 |
+
2025-06-25 05:53:40,580 | INFO | Starting LR test 5/10: lr=0.0018
|
| 264 |
+
2025-06-25 05:53:40,580 | INFO | Starting EoS for LR factor 1.0000
|
| 265 |
+
2025-06-25 05:53:40,580 | INFO | Starting EoS for checkpoint 007000
|
| 266 |
+
2025-06-25 05:53:40,580 | INFO | Starting EoS for model gpt2_small
|
| 267 |
+
2025-06-25 05:53:40,580 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 268 |
+
2025-06-25 05:53:40,580 | INFO | Starting EoS for num_iterations 50
|
| 269 |
+
2025-06-25 05:53:40,580 | INFO | Starting EoS for accum_steps 4
|
| 270 |
+
2025-06-25 05:53:40,580 | INFO | Loading model and checkpoint...
|
| 271 |
+
2025-06-25 05:53:41,270 | INFO | Wrapping model with DDP...
|
| 272 |
+
2025-06-25 05:53:41,650 | INFO | Loading state dict...
|
| 273 |
+
2025-06-25 05:53:41,653 | INFO | Model loaded successfully!
|
| 274 |
+
2025-06-25 05:53:47,958 | INFO | iter 000000 | lr 0.0018 | loss 3.6876 | norm 22.9263
|
| 275 |
+
2025-06-25 05:53:53,039 | INFO | iter 000001 | lr 0.0018 | loss 3.5747 | norm 45.4740
|
| 276 |
+
2025-06-25 05:53:58,328 | INFO | iter 000002 | lr 0.0018 | loss 3.6252 | norm 67.8180
|
| 277 |
+
2025-06-25 05:54:03,897 | INFO | iter 000003 | lr 0.0018 | loss 3.7789 | norm 89.9128
|
| 278 |
+
2025-06-25 05:54:08,955 | INFO | iter 000004 | lr 0.0018 | loss 3.8081 | norm 111.7409
|
| 279 |
+
2025-06-25 05:54:14,093 | INFO | iter 000005 | lr 0.0018 | loss 3.8254 | norm 133.2964
|
| 280 |
+
2025-06-25 05:54:19,498 | INFO | iter 000006 | lr 0.0018 | loss 4.0263 | norm 154.5736
|
| 281 |
+
2025-06-25 05:54:24,625 | INFO | iter 000007 | lr 0.0018 | loss 4.0083 | norm 175.5650
|
| 282 |
+
2025-06-25 05:54:30,300 | INFO | iter 000008 | lr 0.0018 | loss 4.1424 | norm 196.2643
|
| 283 |
+
2025-06-25 05:54:35,636 | INFO | iter 000009 | lr 0.0018 | loss 4.1973 | norm 216.6673
|
| 284 |
+
2025-06-25 05:54:40,921 | INFO | iter 000010 | lr 0.0018 | loss 4.1163 | norm 236.7712
|
| 285 |
+
2025-06-25 05:54:46,048 | INFO | iter 000011 | lr 0.0018 | loss 4.0434 | norm 256.5742
|
| 286 |
+
2025-06-25 05:54:51,400 | INFO | iter 000012 | lr 0.0018 | loss 4.3868 | norm 276.0744
|
| 287 |
+
2025-06-25 05:54:56,503 | INFO | iter 000013 | lr 0.0018 | loss 4.2525 | norm 295.2705
|
| 288 |
+
2025-06-25 05:55:02,238 | INFO | iter 000014 | lr 0.0018 | loss 4.4289 | norm 314.1622
|
| 289 |
+
2025-06-25 05:55:07,532 | INFO | iter 000015 | lr 0.0018 | loss 4.7470 | norm 332.7499
|
| 290 |
+
2025-06-25 05:55:12,639 | INFO | iter 000016 | lr 0.0018 | loss 4.6653 | norm 351.0343
|
| 291 |
+
2025-06-25 05:55:17,801 | INFO | iter 000017 | lr 0.0018 | loss 4.5801 | norm 369.0165
|
| 292 |
+
2025-06-25 05:55:22,922 | INFO | iter 000018 | lr 0.0018 | loss 4.6499 | norm 386.6985
|
| 293 |
+
2025-06-25 05:55:28,023 | INFO | iter 000019 | lr 0.0018 | loss 4.6453 | norm 404.0825
|
| 294 |
+
2025-06-25 05:55:33,721 | INFO | iter 000020 | lr 0.0018 | loss 4.6186 | norm 421.1716
|
| 295 |
+
2025-06-25 05:55:38,680 | INFO | iter 000021 | lr 0.0018 | loss 4.8857 | norm 437.9693
|
| 296 |
+
2025-06-25 05:55:43,961 | INFO | iter 000022 | lr 0.0018 | loss 5.0973 | norm 454.4791
|
| 297 |
+
2025-06-25 05:55:49,091 | INFO | iter 000023 | lr 0.0018 | loss 5.2755 | norm 470.7052
|
| 298 |
+
2025-06-25 05:55:54,224 | INFO | iter 000024 | lr 0.0018 | loss 4.9374 | norm 486.6516
|
| 299 |
+
2025-06-25 05:55:59,987 | INFO | iter 000025 | lr 0.0018 | loss 5.1755 | norm 502.3227
|
| 300 |
+
2025-06-25 05:56:05,519 | INFO | iter 000026 | lr 0.0018 | loss 4.9887 | norm 517.7234
|
| 301 |
+
2025-06-25 05:56:10,497 | INFO | iter 000027 | lr 0.0018 | loss 4.9314 | norm 532.8585
|
| 302 |
+
2025-06-25 05:56:15,715 | INFO | iter 000028 | lr 0.0018 | loss 5.3997 | norm 547.7333
|
| 303 |
+
2025-06-25 05:56:20,871 | INFO | iter 000029 | lr 0.0018 | loss 5.1985 | norm 562.3531
|
| 304 |
+
2025-06-25 05:56:26,149 | INFO | iter 000030 | lr 0.0018 | loss 5.3587 | norm 576.7233
|
| 305 |
+
2025-06-25 05:56:31,899 | INFO | iter 000031 | lr 0.0018 | loss 5.4842 | norm 590.8496
|
| 306 |
+
2025-06-25 05:56:37,096 | INFO | iter 000032 | lr 0.0018 | loss 5.4218 | norm 604.7374
|
| 307 |
+
2025-06-25 05:56:42,378 | INFO | iter 000033 | lr 0.0018 | loss 5.5774 | norm 618.3925
|
| 308 |
+
2025-06-25 05:56:47,635 | INFO | iter 000034 | lr 0.0018 | loss 5.2042 | norm 631.8203
|
| 309 |
+
2025-06-25 05:56:52,745 | INFO | iter 000035 | lr 0.0018 | loss 6.0363 | norm 645.0265
|
| 310 |
+
2025-06-25 05:56:58,068 | INFO | iter 000036 | lr 0.0018 | loss 5.8952 | norm 658.0167
|
| 311 |
+
2025-06-25 05:57:03,613 | INFO | iter 000037 | lr 0.0018 | loss 6.2191 | norm 670.7966
|
| 312 |
+
2025-06-25 05:57:08,695 | INFO | iter 000038 | lr 0.0018 | loss 5.8633 | norm 683.3716
|
| 313 |
+
2025-06-25 05:57:13,857 | INFO | iter 000039 | lr 0.0018 | loss 5.6797 | norm 695.7473
|
| 314 |
+
2025-06-25 05:57:18,927 | INFO | iter 000040 | lr 0.0018 | loss 5.9959 | norm 707.9289
|
| 315 |
+
2025-06-25 05:57:24,051 | INFO | iter 000041 | lr 0.0018 | loss 5.2168 | norm 719.9219
|
| 316 |
+
2025-06-25 05:57:29,317 | INFO | iter 000042 | lr 0.0018 | loss 6.1407 | norm 731.7314
|
| 317 |
+
2025-06-25 05:57:34,877 | INFO | iter 000043 | lr 0.0018 | loss 6.0471 | norm 743.3626
|
| 318 |
+
2025-06-25 05:57:40,102 | INFO | iter 000044 | lr 0.0018 | loss 5.7566 | norm 754.8203
|
| 319 |
+
2025-06-25 05:57:45,152 | INFO | iter 000045 | lr 0.0018 | loss 6.1483 | norm 766.1095
|
| 320 |
+
2025-06-25 05:57:50,342 | INFO | iter 000046 | lr 0.0018 | loss 6.3447 | norm 777.2347
|
| 321 |
+
2025-06-25 05:57:55,654 | INFO | iter 000047 | lr 0.0018 | loss 6.2507 | norm 788.2008
|
| 322 |
+
2025-06-25 05:58:01,245 | INFO | iter 000048 | lr 0.0018 | loss 6.3594 | norm 799.0120
|
| 323 |
+
2025-06-25 05:58:06,311 | INFO | iter 000049 | lr 0.0018 | loss 5.4779 | norm 809.6727
|
| 324 |
+
2025-06-25 05:58:06,311 | INFO | Completed LR test 5/10: lr=0.0018
|
| 325 |
+
2025-06-25 05:58:06,326 | INFO | -------------------------------- EoS --------------------------------
|
| 326 |
+
2025-06-25 05:58:06,326 | INFO | Starting LR test 6/10: lr=0.0020
|
| 327 |
+
2025-06-25 05:58:06,326 | INFO | Starting EoS for LR factor 1.1000
|
| 328 |
+
2025-06-25 05:58:06,327 | INFO | Starting EoS for checkpoint 007000
|
| 329 |
+
2025-06-25 05:58:06,327 | INFO | Starting EoS for model gpt2_small
|
| 330 |
+
2025-06-25 05:58:06,327 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 331 |
+
2025-06-25 05:58:06,327 | INFO | Starting EoS for num_iterations 50
|
| 332 |
+
2025-06-25 05:58:06,327 | INFO | Starting EoS for accum_steps 4
|
| 333 |
+
2025-06-25 05:58:06,327 | INFO | Loading model and checkpoint...
|
| 334 |
+
2025-06-25 05:58:07,014 | INFO | Wrapping model with DDP...
|
| 335 |
+
2025-06-25 05:58:07,353 | INFO | Loading state dict...
|
| 336 |
+
2025-06-25 05:58:07,356 | INFO | Model loaded successfully!
|
| 337 |
+
2025-06-25 05:58:13,544 | INFO | iter 000000 | lr 0.0020 | loss 3.6876 | norm 25.2123
|
| 338 |
+
2025-06-25 05:58:18,866 | INFO | iter 000001 | lr 0.0020 | loss 3.5779 | norm 49.9783
|
| 339 |
+
2025-06-25 05:58:24,009 | INFO | iter 000002 | lr 0.0020 | loss 3.6351 | norm 74.4965
|
| 340 |
+
2025-06-25 05:58:29,266 | INFO | iter 000003 | lr 0.0020 | loss 3.7960 | norm 98.7138
|
| 341 |
+
2025-06-25 05:58:34,900 | INFO | iter 000004 | lr 0.0020 | loss 3.8337 | norm 122.6100
|
| 342 |
+
2025-06-25 05:58:40,074 | INFO | iter 000005 | lr 0.0020 | loss 3.8578 | norm 146.1787
|
| 343 |
+
2025-06-25 05:58:45,172 | INFO | iter 000006 | lr 0.0020 | loss 4.0691 | norm 169.4137
|
| 344 |
+
2025-06-25 05:58:50,219 | INFO | iter 000007 | lr 0.0020 | loss 4.0526 | norm 192.3065
|
| 345 |
+
2025-06-25 05:58:55,431 | INFO | iter 000008 | lr 0.0020 | loss 4.1982 | norm 214.8500
|
| 346 |
+
2025-06-25 05:59:00,812 | INFO | iter 000009 | lr 0.0020 | loss 4.2537 | norm 237.0394
|
| 347 |
+
2025-06-25 05:59:06,232 | INFO | iter 000010 | lr 0.0020 | loss 4.1896 | norm 258.8723
|
| 348 |
+
2025-06-25 05:59:11,389 | INFO | iter 000011 | lr 0.0020 | loss 4.1233 | norm 280.3468
|
| 349 |
+
2025-06-25 05:59:16,596 | INFO | iter 000012 | lr 0.0020 | loss 4.4756 | norm 301.4616
|
| 350 |
+
2025-06-25 05:59:21,692 | INFO | iter 000013 | lr 0.0020 | loss 4.3328 | norm 322.2153
|
| 351 |
+
2025-06-25 05:59:26,948 | INFO | iter 000014 | lr 0.0020 | loss 4.5445 | norm 342.6083
|
| 352 |
+
2025-06-25 05:59:32,443 | INFO | iter 000015 | lr 0.0020 | loss 4.8651 | norm 362.6415
|
| 353 |
+
2025-06-25 05:59:37,749 | INFO | iter 000016 | lr 0.0020 | loss 4.7906 | norm 382.3164
|
| 354 |
+
2025-06-25 05:59:42,700 | INFO | iter 000017 | lr 0.0020 | loss 4.6993 | norm 401.6350
|
| 355 |
+
2025-06-25 05:59:47,904 | INFO | iter 000018 | lr 0.0020 | loss 4.7672 | norm 420.5997
|
| 356 |
+
2025-06-25 05:59:53,164 | INFO | iter 000019 | lr 0.0020 | loss 4.7585 | norm 439.2143
|
| 357 |
+
2025-06-25 05:59:58,222 | INFO | iter 000020 | lr 0.0020 | loss 4.7307 | norm 457.4828
|
| 358 |
+
2025-06-25 06:00:03,684 | INFO | iter 000021 | lr 0.0020 | loss 5.0488 | norm 475.4099
|
| 359 |
+
2025-06-25 06:00:08,860 | INFO | iter 000022 | lr 0.0020 | loss 5.2354 | norm 493.0004
|
| 360 |
+
2025-06-25 06:00:14,235 | INFO | iter 000023 | lr 0.0020 | loss 5.4842 | norm 510.2596
|
| 361 |
+
2025-06-25 06:00:19,440 | INFO | iter 000024 | lr 0.0020 | loss 5.1194 | norm 527.1931
|
| 362 |
+
2025-06-25 06:00:24,496 | INFO | iter 000025 | lr 0.0020 | loss 5.3493 | norm 543.8065
|
| 363 |
+
2025-06-25 06:00:30,381 | INFO | iter 000026 | lr 0.0020 | loss 5.1178 | norm 560.1060
|
| 364 |
+
2025-06-25 06:00:35,582 | INFO | iter 000027 | lr 0.0020 | loss 5.0840 | norm 576.0979
|
| 365 |
+
2025-06-25 06:00:40,743 | INFO | iter 000028 | lr 0.0020 | loss 5.6218 | norm 591.7890
|
| 366 |
+
2025-06-25 06:00:46,041 | INFO | iter 000029 | lr 0.0020 | loss 5.3631 | norm 607.1861
|
| 367 |
+
2025-06-25 06:00:51,225 | INFO | iter 000030 | lr 0.0020 | loss 5.5315 | norm 622.2960
|
| 368 |
+
2025-06-25 06:00:56,346 | INFO | iter 000031 | lr 0.0020 | loss 5.6890 | norm 637.1259
|
| 369 |
+
2025-06-25 06:01:02,073 | INFO | iter 000032 | lr 0.0020 | loss 5.5865 | norm 651.6826
|
| 370 |
+
2025-06-25 06:01:07,316 | INFO | iter 000033 | lr 0.0020 | loss 5.7409 | norm 665.9732
|
| 371 |
+
2025-06-25 06:01:12,507 | INFO | iter 000034 | lr 0.0020 | loss 5.3656 | norm 680.0048
|
| 372 |
+
2025-06-25 06:01:17,515 | INFO | iter 000035 | lr 0.0020 | loss 6.3080 | norm 693.7843
|
| 373 |
+
2025-06-25 06:01:22,649 | INFO | iter 000036 | lr 0.0020 | loss 6.1109 | norm 707.3187
|
| 374 |
+
2025-06-25 06:01:27,838 | INFO | iter 000037 | lr 0.0020 | loss 6.4594 | norm 720.6151
|
| 375 |
+
2025-06-25 06:01:33,547 | INFO | iter 000038 | lr 0.0020 | loss 6.0626 | norm 733.6802
|
| 376 |
+
2025-06-25 06:01:38,679 | INFO | iter 000039 | lr 0.0020 | loss 5.8983 | norm 746.5208
|
| 377 |
+
2025-06-25 06:01:43,863 | INFO | iter 000040 | lr 0.0020 | loss 6.1886 | norm 759.1436
|
| 378 |
+
2025-06-25 06:01:48,902 | INFO | iter 000041 | lr 0.0020 | loss 5.4046 | norm 771.5550
|
| 379 |
+
2025-06-25 06:01:54,199 | INFO | iter 000042 | lr 0.0020 | loss 6.3809 | norm 783.7615
|
| 380 |
+
2025-06-25 06:01:59,802 | INFO | iter 000043 | lr 0.0020 | loss 6.2593 | norm 795.7693
|
| 381 |
+
2025-06-25 06:02:04,920 | INFO | iter 000044 | lr 0.0020 | loss 5.9433 | norm 807.5844
|
| 382 |
+
2025-06-25 06:02:09,977 | INFO | iter 000045 | lr 0.0020 | loss 6.3769 | norm 819.2127
|
| 383 |
+
2025-06-25 06:02:15,095 | INFO | iter 000046 | lr 0.0020 | loss 6.5365 | norm 830.6600
|
| 384 |
+
2025-06-25 06:02:20,439 | INFO | iter 000047 | lr 0.0020 | loss 6.4452 | norm 841.9317
|
| 385 |
+
2025-06-25 06:02:25,606 | INFO | iter 000048 | lr 0.0020 | loss 6.5175 | norm 853.0334
|
| 386 |
+
2025-06-25 06:02:31,548 | INFO | iter 000049 | lr 0.0020 | loss 5.6289 | norm 863.9703
|
| 387 |
+
2025-06-25 06:02:31,548 | INFO | Completed LR test 6/10: lr=0.0020
|
| 388 |
+
2025-06-25 06:02:31,566 | INFO | -------------------------------- EoS --------------------------------
|
| 389 |
+
2025-06-25 06:02:31,566 | INFO | Starting LR test 7/10: lr=0.0022
|
| 390 |
+
2025-06-25 06:02:31,566 | INFO | Starting EoS for LR factor 1.2000
|
| 391 |
+
2025-06-25 06:02:31,566 | INFO | Starting EoS for checkpoint 007000
|
| 392 |
+
2025-06-25 06:02:31,566 | INFO | Starting EoS for model gpt2_small
|
| 393 |
+
2025-06-25 06:02:31,566 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 394 |
+
2025-06-25 06:02:31,566 | INFO | Starting EoS for num_iterations 50
|
| 395 |
+
2025-06-25 06:02:31,566 | INFO | Starting EoS for accum_steps 4
|
| 396 |
+
2025-06-25 06:02:31,566 | INFO | Loading model and checkpoint...
|
| 397 |
+
2025-06-25 06:02:32,259 | INFO | Wrapping model with DDP...
|
| 398 |
+
2025-06-25 06:02:32,591 | INFO | Loading state dict...
|
| 399 |
+
2025-06-25 06:02:32,594 | INFO | Model loaded successfully!
|
| 400 |
+
2025-06-25 06:02:38,924 | INFO | iter 000000 | lr 0.0022 | loss 3.6876 | norm 27.4988
|
| 401 |
+
2025-06-25 06:02:44,220 | INFO | iter 000001 | lr 0.0022 | loss 3.5816 | norm 54.4755
|
| 402 |
+
2025-06-25 06:02:49,456 | INFO | iter 000002 | lr 0.0022 | loss 3.6457 | norm 81.1569
|
| 403 |
+
2025-06-25 06:02:54,513 | INFO | iter 000003 | lr 0.0022 | loss 3.8138 | norm 107.4811
|
| 404 |
+
2025-06-25 06:02:59,811 | INFO | iter 000004 | lr 0.0022 | loss 3.8571 | norm 133.4251
|
| 405 |
+
2025-06-25 06:03:05,250 | INFO | iter 000005 | lr 0.0022 | loss 3.8876 | norm 158.9821
|
| 406 |
+
2025-06-25 06:03:10,559 | INFO | iter 000006 | lr 0.0022 | loss 4.1145 | norm 184.1454
|
| 407 |
+
2025-06-25 06:03:15,729 | INFO | iter 000007 | lr 0.0022 | loss 4.0991 | norm 208.9055
|
| 408 |
+
2025-06-25 06:03:20,970 | INFO | iter 000008 | lr 0.0022 | loss 4.2516 | norm 233.2544
|
| 409 |
+
2025-06-25 06:03:26,170 | INFO | iter 000009 | lr 0.0022 | loss 4.3180 | norm 257.1872
|
| 410 |
+
2025-06-25 06:03:31,967 | INFO | iter 000010 | lr 0.0022 | loss 4.2748 | norm 280.7017
|
| 411 |
+
2025-06-25 06:03:37,096 | INFO | iter 000011 | lr 0.0022 | loss 4.2000 | norm 303.7964
|
| 412 |
+
2025-06-25 06:03:42,276 | INFO | iter 000012 | lr 0.0022 | loss 4.5692 | norm 326.4702
|
| 413 |
+
2025-06-25 06:03:47,622 | INFO | iter 000013 | lr 0.0022 | loss 4.4204 | norm 348.7223
|
| 414 |
+
2025-06-25 06:03:52,647 | INFO | iter 000014 | lr 0.0022 | loss 4.6603 | norm 370.5538
|
| 415 |
+
2025-06-25 06:03:57,845 | INFO | iter 000015 | lr 0.0022 | loss 4.9774 | norm 391.9663
|
| 416 |
+
2025-06-25 06:04:03,363 | INFO | iter 000016 | lr 0.0022 | loss 4.9215 | norm 412.9624
|
| 417 |
+
2025-06-25 06:04:08,447 | INFO | iter 000017 | lr 0.0022 | loss 4.8513 | norm 433.5448
|
| 418 |
+
2025-06-25 06:04:13,412 | INFO | iter 000018 | lr 0.0022 | loss 4.9012 | norm 453.7173
|
| 419 |
+
2025-06-25 06:04:18,556 | INFO | iter 000019 | lr 0.0022 | loss 4.8747 | norm 473.4848
|
| 420 |
+
2025-06-25 06:04:23,868 | INFO | iter 000020 | lr 0.0022 | loss 4.8614 | norm 492.8527
|
| 421 |
+
2025-06-25 06:04:29,288 | INFO | iter 000021 | lr 0.0022 | loss 5.2076 | norm 511.8271
|
| 422 |
+
2025-06-25 06:04:34,599 | INFO | iter 000022 | lr 0.0022 | loss 5.3604 | norm 530.4143
|
| 423 |
+
2025-06-25 06:04:39,952 | INFO | iter 000023 | lr 0.0022 | loss 5.7092 | norm 548.6212
|
| 424 |
+
2025-06-25 06:04:45,030 | INFO | iter 000024 | lr 0.0022 | loss 5.2997 | norm 566.4548
|
| 425 |
+
2025-06-25 06:04:49,964 | INFO | iter 000025 | lr 0.0022 | loss 5.4966 | norm 583.9225
|
| 426 |
+
2025-06-25 06:04:55,124 | INFO | iter 000026 | lr 0.0022 | loss 5.2925 | norm 601.0318
|
| 427 |
+
2025-06-25 06:05:00,430 | INFO | iter 000027 | lr 0.0022 | loss 5.2346 | norm 617.7909
|
| 428 |
+
2025-06-25 06:05:05,872 | INFO | iter 000028 | lr 0.0022 | loss 5.8256 | norm 634.2081
|
| 429 |
+
2025-06-25 06:05:11,076 | INFO | iter 000029 | lr 0.0022 | loss 5.5557 | norm 650.2919
|
| 430 |
+
2025-06-25 06:05:16,271 | INFO | iter 000030 | lr 0.0022 | loss 5.6963 | norm 666.0509
|
| 431 |
+
2025-06-25 06:05:21,464 | INFO | iter 000031 | lr 0.0022 | loss 5.9050 | norm 681.4938
|
| 432 |
+
2025-06-25 06:05:26,686 | INFO | iter 000032 | lr 0.0022 | loss 5.8065 | norm 696.6291
|
| 433 |
+
2025-06-25 06:05:32,275 | INFO | iter 000033 | lr 0.0022 | loss 6.0051 | norm 711.4657
|
| 434 |
+
2025-06-25 06:05:37,544 | INFO | iter 000034 | lr 0.0022 | loss 5.5032 | norm 726.0119
|
| 435 |
+
2025-06-25 06:05:42,775 | INFO | iter 000035 | lr 0.0022 | loss 6.5643 | norm 740.2764
|
| 436 |
+
2025-06-25 06:05:48,005 | INFO | iter 000036 | lr 0.0022 | loss 6.3361 | norm 754.2676
|
| 437 |
+
2025-06-25 06:05:53,224 | INFO | iter 000037 | lr 0.0022 | loss 6.6566 | norm 767.9942
|
| 438 |
+
2025-06-25 06:05:58,432 | INFO | iter 000038 | lr 0.0022 | loss 6.2322 | norm 781.4643
|
| 439 |
+
2025-06-25 06:06:04,272 | INFO | iter 000039 | lr 0.0022 | loss 6.0644 | norm 794.6861
|
| 440 |
+
2025-06-25 06:06:09,583 | INFO | iter 000040 | lr 0.0022 | loss 6.3769 | norm 807.6677
|
| 441 |
+
2025-06-25 06:06:14,860 | INFO | iter 000041 | lr 0.0022 | loss 5.4851 | norm 820.4168
|
| 442 |
+
2025-06-25 06:06:20,186 | INFO | iter 000042 | lr 0.0022 | loss 6.6002 | norm 832.9411
|
| 443 |
+
2025-06-25 06:06:25,378 | INFO | iter 000043 | lr 0.0022 | loss 6.4176 | norm 845.2482
|
| 444 |
+
2025-06-25 06:06:30,818 | INFO | iter 000044 | lr 0.0022 | loss 6.1134 | norm 857.3452
|
| 445 |
+
2025-06-25 06:06:35,947 | INFO | iter 000045 | lr 0.0022 | loss 6.5890 | norm 869.2391
|
| 446 |
+
2025-06-25 06:06:40,987 | INFO | iter 000046 | lr 0.0022 | loss 6.7593 | norm 880.9367
|
| 447 |
+
2025-06-25 06:06:46,195 | INFO | iter 000047 | lr 0.0022 | loss 6.6622 | norm 892.4446
|
| 448 |
+
2025-06-25 06:06:51,454 | INFO | iter 000048 | lr 0.0022 | loss 6.6858 | norm 903.7693
|
| 449 |
+
2025-06-25 06:06:56,659 | INFO | iter 000049 | lr 0.0022 | loss 5.7619 | norm 914.9168
|
| 450 |
+
2025-06-25 06:06:56,660 | INFO | Completed LR test 7/10: lr=0.0022
|
| 451 |
+
2025-06-25 06:06:56,681 | INFO | -------------------------------- EoS --------------------------------
|
| 452 |
+
2025-06-25 06:06:56,681 | INFO | Starting LR test 8/10: lr=0.0023
|
| 453 |
+
2025-06-25 06:06:56,681 | INFO | Starting EoS for LR factor 1.3000
|
| 454 |
+
2025-06-25 06:06:56,681 | INFO | Starting EoS for checkpoint 007000
|
| 455 |
+
2025-06-25 06:06:56,681 | INFO | Starting EoS for model gpt2_small
|
| 456 |
+
2025-06-25 06:06:56,681 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 457 |
+
2025-06-25 06:06:56,682 | INFO | Starting EoS for num_iterations 50
|
| 458 |
+
2025-06-25 06:06:56,682 | INFO | Starting EoS for accum_steps 4
|
| 459 |
+
2025-06-25 06:06:56,682 | INFO | Loading model and checkpoint...
|
| 460 |
+
2025-06-25 06:06:57,381 | INFO | Wrapping model with DDP...
|
| 461 |
+
2025-06-25 06:06:57,664 | INFO | Loading state dict...
|
| 462 |
+
2025-06-25 06:06:57,668 | INFO | Model loaded successfully!
|
| 463 |
+
2025-06-25 06:07:04,307 | INFO | iter 000000 | lr 0.0023 | loss 3.6876 | norm 29.7858
|
| 464 |
+
2025-06-25 06:07:09,325 | INFO | iter 000001 | lr 0.0023 | loss 3.5855 | norm 58.9657
|
| 465 |
+
2025-06-25 06:07:14,299 | INFO | iter 000002 | lr 0.0023 | loss 3.6563 | norm 87.7992
|
| 466 |
+
2025-06-25 06:07:19,266 | INFO | iter 000003 | lr 0.0023 | loss 3.8318 | norm 116.2150
|
| 467 |
+
2025-06-25 06:07:24,351 | INFO | iter 000004 | lr 0.0023 | loss 3.8853 | norm 144.1865
|
| 468 |
+
2025-06-25 06:07:29,939 | INFO | iter 000005 | lr 0.0023 | loss 3.9210 | norm 171.7070
|
| 469 |
+
2025-06-25 06:07:35,284 | INFO | iter 000006 | lr 0.0023 | loss 4.1628 | norm 198.7692
|
| 470 |
+
2025-06-25 06:07:40,587 | INFO | iter 000007 | lr 0.0023 | loss 4.1507 | norm 225.3628
|
| 471 |
+
2025-06-25 06:07:45,771 | INFO | iter 000008 | lr 0.0023 | loss 4.3096 | norm 251.4790
|
| 472 |
+
2025-06-25 06:07:50,939 | INFO | iter 000009 | lr 0.0023 | loss 4.3805 | norm 277.1128
|
| 473 |
+
2025-06-25 06:07:56,136 | INFO | iter 000010 | lr 0.0023 | loss 4.3467 | norm 302.2623
|
| 474 |
+
2025-06-25 06:08:01,650 | INFO | iter 000011 | lr 0.0023 | loss 4.2815 | norm 326.9267
|
| 475 |
+
2025-06-25 06:08:06,740 | INFO | iter 000012 | lr 0.0023 | loss 4.6548 | norm 351.1054
|
| 476 |
+
2025-06-25 06:08:11,812 | INFO | iter 000013 | lr 0.0023 | loss 4.5047 | norm 374.7983
|
| 477 |
+
2025-06-25 06:08:17,126 | INFO | iter 000014 | lr 0.0023 | loss 4.7994 | norm 398.0073
|
| 478 |
+
2025-06-25 06:08:22,168 | INFO | iter 000015 | lr 0.0023 | loss 5.0968 | norm 420.7350
|
| 479 |
+
2025-06-25 06:08:27,247 | INFO | iter 000016 | lr 0.0023 | loss 5.0512 | norm 442.9852
|
| 480 |
+
2025-06-25 06:08:32,847 | INFO | iter 000017 | lr 0.0023 | loss 4.9901 | norm 464.7617
|
| 481 |
+
2025-06-25 06:08:37,944 | INFO | iter 000018 | lr 0.0023 | loss 5.0242 | norm 486.0699
|
| 482 |
+
2025-06-25 06:08:43,006 | INFO | iter 000019 | lr 0.0023 | loss 4.9823 | norm 506.9158
|
| 483 |
+
2025-06-25 06:08:48,089 | INFO | iter 000020 | lr 0.0023 | loss 4.9903 | norm 527.3068
|
| 484 |
+
2025-06-25 06:08:53,055 | INFO | iter 000021 | lr 0.0023 | loss 5.3445 | norm 547.2504
|
| 485 |
+
2025-06-25 06:08:58,067 | INFO | iter 000022 | lr 0.0023 | loss 5.4917 | norm 566.7548
|
| 486 |
+
2025-06-25 06:09:03,752 | INFO | iter 000023 | lr 0.0023 | loss 5.9406 | norm 585.8286
|
| 487 |
+
2025-06-25 06:09:08,966 | INFO | iter 000024 | lr 0.0023 | loss 5.4620 | norm 604.4806
|
| 488 |
+
2025-06-25 06:09:13,890 | INFO | iter 000025 | lr 0.0023 | loss 5.7073 | norm 622.7200
|
| 489 |
+
2025-06-25 06:09:19,064 | INFO | iter 000026 | lr 0.0023 | loss 5.4810 | norm 640.5562
|
| 490 |
+
2025-06-25 06:09:24,103 | INFO | iter 000027 | lr 0.0023 | loss 5.3813 | norm 657.9991
|
| 491 |
+
2025-06-25 06:09:29,610 | INFO | iter 000028 | lr 0.0023 | loss 6.0298 | norm 675.0590
|
| 492 |
+
2025-06-25 06:09:34,627 | INFO | iter 000029 | lr 0.0023 | loss 5.7523 | norm 691.7463
|
| 493 |
+
2025-06-25 06:09:39,706 | INFO | iter 000030 | lr 0.0023 | loss 5.8543 | norm 708.0714
|
| 494 |
+
2025-06-25 06:09:44,756 | INFO | iter 000031 | lr 0.0023 | loss 6.0789 | norm 724.0448
|
| 495 |
+
2025-06-25 06:09:49,863 | INFO | iter 000032 | lr 0.0023 | loss 5.9534 | norm 739.6771
|
| 496 |
+
2025-06-25 06:09:54,951 | INFO | iter 000033 | lr 0.0023 | loss 6.1380 | norm 754.9785
|
| 497 |
+
2025-06-25 06:10:00,358 | INFO | iter 000034 | lr 0.0023 | loss 5.6740 | norm 769.9594
|
| 498 |
+
2025-06-25 06:10:05,430 | INFO | iter 000035 | lr 0.0023 | loss 6.7911 | norm 784.6301
|
| 499 |
+
2025-06-25 06:10:10,341 | INFO | iter 000036 | lr 0.0023 | loss 6.5128 | norm 799.0007
|
| 500 |
+
2025-06-25 06:10:15,590 | INFO | iter 000037 | lr 0.0023 | loss 6.8682 | norm 813.0815
|
| 501 |
+
2025-06-25 06:10:20,823 | INFO | iter 000038 | lr 0.0023 | loss 6.3677 | norm 826.8822
|
| 502 |
+
2025-06-25 06:10:25,968 | INFO | iter 000039 | lr 0.0023 | loss 6.1648 | norm 840.4125
|
| 503 |
+
2025-06-25 06:10:31,768 | INFO | iter 000040 | lr 0.0023 | loss 6.5954 | norm 853.6819
|
| 504 |
+
2025-06-25 06:10:37,016 | INFO | iter 000041 | lr 0.0023 | loss 5.6618 | norm 866.6996
|
| 505 |
+
2025-06-25 06:10:42,164 | INFO | iter 000042 | lr 0.0023 | loss 6.7585 | norm 879.4748
|
| 506 |
+
2025-06-25 06:10:47,336 | INFO | iter 000043 | lr 0.0023 | loss 6.5646 | norm 892.0160
|
| 507 |
+
2025-06-25 06:10:52,421 | INFO | iter 000044 | lr 0.0023 | loss 6.2502 | norm 904.3319
|
| 508 |
+
2025-06-25 06:10:57,760 | INFO | iter 000045 | lr 0.0023 | loss 6.7723 | norm 916.4305
|
| 509 |
+
2025-06-25 06:11:03,658 | INFO | iter 000046 | lr 0.0023 | loss 6.9262 | norm 928.3196
|
| 510 |
+
2025-06-25 06:11:08,849 | INFO | iter 000047 | lr 0.0023 | loss 6.8335 | norm 940.0070
|
| 511 |
+
2025-06-25 06:11:14,029 | INFO | iter 000048 | lr 0.0023 | loss 6.8953 | norm 951.5001
|
| 512 |
+
2025-06-25 06:11:19,142 | INFO | iter 000049 | lr 0.0023 | loss 5.8852 | norm 962.8060
|
| 513 |
+
2025-06-25 06:11:19,143 | INFO | Completed LR test 8/10: lr=0.0023
|
| 514 |
+
2025-06-25 06:11:19,174 | INFO | -------------------------------- EoS --------------------------------
|
| 515 |
+
2025-06-25 06:11:19,174 | INFO | Starting LR test 9/10: lr=0.0025
|
| 516 |
+
2025-06-25 06:11:19,174 | INFO | Starting EoS for LR factor 1.4000
|
| 517 |
+
2025-06-25 06:11:19,174 | INFO | Starting EoS for checkpoint 007000
|
| 518 |
+
2025-06-25 06:11:19,174 | INFO | Starting EoS for model gpt2_small
|
| 519 |
+
2025-06-25 06:11:19,175 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 520 |
+
2025-06-25 06:11:19,175 | INFO | Starting EoS for num_iterations 50
|
| 521 |
+
2025-06-25 06:11:19,175 | INFO | Starting EoS for accum_steps 4
|
| 522 |
+
2025-06-25 06:11:19,175 | INFO | Loading model and checkpoint...
|
| 523 |
+
2025-06-25 06:11:19,871 | INFO | Wrapping model with DDP...
|
| 524 |
+
2025-06-25 06:11:20,178 | INFO | Loading state dict...
|
| 525 |
+
2025-06-25 06:11:20,181 | INFO | Model loaded successfully!
|
| 526 |
+
2025-06-25 06:11:26,593 | INFO | iter 000000 | lr 0.0025 | loss 3.6876 | norm 32.0730
|
| 527 |
+
2025-06-25 06:11:32,036 | INFO | iter 000001 | lr 0.0025 | loss 3.5896 | norm 63.4489
|
| 528 |
+
2025-06-25 06:11:37,229 | INFO | iter 000002 | lr 0.0025 | loss 3.6682 | norm 94.4235
|
| 529 |
+
2025-06-25 06:11:42,390 | INFO | iter 000003 | lr 0.0025 | loss 3.8500 | norm 124.9154
|
| 530 |
+
2025-06-25 06:11:47,646 | INFO | iter 000004 | lr 0.0025 | loss 3.9122 | norm 154.8944
|
| 531 |
+
2025-06-25 06:11:52,952 | INFO | iter 000005 | lr 0.0025 | loss 3.9551 | norm 184.3538
|
| 532 |
+
2025-06-25 06:11:57,954 | INFO | iter 000006 | lr 0.0025 | loss 4.2136 | norm 213.2859
|
| 533 |
+
2025-06-25 06:12:03,477 | INFO | iter 000007 | lr 0.0025 | loss 4.1950 | norm 241.6798
|
| 534 |
+
2025-06-25 06:12:08,505 | INFO | iter 000008 | lr 0.0025 | loss 4.3723 | norm 269.5256
|
| 535 |
+
2025-06-25 06:12:13,681 | INFO | iter 000009 | lr 0.0025 | loss 4.4440 | norm 296.8186
|
| 536 |
+
2025-06-25 06:12:18,787 | INFO | iter 000010 | lr 0.0025 | loss 4.4323 | norm 323.5575
|
| 537 |
+
2025-06-25 06:12:23,836 | INFO | iter 000011 | lr 0.0025 | loss 4.3655 | norm 349.7422
|
| 538 |
+
2025-06-25 06:12:29,157 | INFO | iter 000012 | lr 0.0025 | loss 4.7546 | norm 375.3729
|
| 539 |
+
2025-06-25 06:12:34,519 | INFO | iter 000013 | lr 0.0025 | loss 4.5906 | norm 400.4503
|
| 540 |
+
2025-06-25 06:12:39,712 | INFO | iter 000014 | lr 0.0025 | loss 4.9124 | norm 424.9773
|
| 541 |
+
2025-06-25 06:12:45,174 | INFO | iter 000015 | lr 0.0025 | loss 5.2241 | norm 448.9581
|
| 542 |
+
2025-06-25 06:12:50,358 | INFO | iter 000016 | lr 0.0025 | loss 5.1840 | norm 472.3975
|
| 543 |
+
2025-06-25 06:12:55,418 | INFO | iter 000017 | lr 0.0025 | loss 5.1327 | norm 495.3010
|
| 544 |
+
2025-06-25 06:13:00,485 | INFO | iter 000018 | lr 0.0025 | loss 5.1446 | norm 517.6753
|
| 545 |
+
2025-06-25 06:13:06,131 | INFO | iter 000019 | lr 0.0025 | loss 5.1064 | norm 539.5286
|
| 546 |
+
2025-06-25 06:13:11,225 | INFO | iter 000020 | lr 0.0025 | loss 5.1112 | norm 560.8697
|
| 547 |
+
2025-06-25 06:13:16,220 | INFO | iter 000021 | lr 0.0025 | loss 5.5130 | norm 581.7085
|
| 548 |
+
2025-06-25 06:13:21,342 | INFO | iter 000022 | lr 0.0025 | loss 5.6241 | norm 602.0549
|
| 549 |
+
2025-06-25 06:13:26,451 | INFO | iter 000023 | lr 0.0025 | loss 6.1646 | norm 621.9195
|
| 550 |
+
2025-06-25 06:13:32,022 | INFO | iter 000024 | lr 0.0025 | loss 5.6253 | norm 641.3132
|
| 551 |
+
2025-06-25 06:13:37,001 | INFO | iter 000025 | lr 0.0025 | loss 5.8789 | norm 660.2472
|
| 552 |
+
2025-06-25 06:13:42,161 | INFO | iter 000026 | lr 0.0025 | loss 5.5988 | norm 678.7329
|
| 553 |
+
2025-06-25 06:13:47,209 | INFO | iter 000027 | lr 0.0025 | loss 5.4927 | norm 696.7825
|
| 554 |
+
2025-06-25 06:13:52,260 | INFO | iter 000028 | lr 0.0025 | loss 6.2231 | norm 714.4082
|
| 555 |
+
2025-06-25 06:13:57,457 | INFO | iter 000029 | lr 0.0025 | loss 5.9250 | norm 731.6225
|
| 556 |
+
2025-06-25 06:14:03,013 | INFO | iter 000030 | lr 0.0025 | loss 5.9891 | norm 748.4380
|
| 557 |
+
2025-06-25 06:14:08,128 | INFO | iter 000031 | lr 0.0025 | loss 6.2910 | norm 764.8672
|
| 558 |
+
2025-06-25 06:14:13,228 | INFO | iter 000032 | lr 0.0025 | loss 6.1202 | norm 780.9225
|
| 559 |
+
2025-06-25 06:14:18,498 | INFO | iter 000033 | lr 0.0025 | loss 6.2973 | norm 796.6163
|
| 560 |
+
2025-06-25 06:14:23,706 | INFO | iter 000034 | lr 0.0025 | loss 5.7886 | norm 811.9608
|
| 561 |
+
2025-06-25 06:14:28,852 | INFO | iter 000035 | lr 0.0025 | loss 6.9882 | norm 826.9680
|
| 562 |
+
2025-06-25 06:14:34,397 | INFO | iter 000036 | lr 0.0025 | loss 6.7193 | norm 841.6499
|
| 563 |
+
2025-06-25 06:14:39,459 | INFO | iter 000037 | lr 0.0025 | loss 7.0613 | norm 856.0185
|
| 564 |
+
2025-06-25 06:14:44,530 | INFO | iter 000038 | lr 0.0025 | loss 6.5425 | norm 870.0853
|
| 565 |
+
2025-06-25 06:14:49,627 | INFO | iter 000039 | lr 0.0025 | loss 6.3130 | norm 883.8614
|
| 566 |
+
2025-06-25 06:14:54,817 | INFO | iter 000040 | lr 0.0025 | loss 6.7243 | norm 897.3581
|
| 567 |
+
2025-06-25 06:15:00,128 | INFO | iter 000041 | lr 0.0025 | loss 5.7344 | norm 910.5860
|
| 568 |
+
2025-06-25 06:15:05,761 | INFO | iter 000042 | lr 0.0025 | loss 6.9161 | norm 923.5555
|
| 569 |
+
2025-06-25 06:15:10,962 | INFO | iter 000043 | lr 0.0025 | loss 6.8284 | norm 936.2766
|
| 570 |
+
2025-06-25 06:15:16,242 | INFO | iter 000044 | lr 0.0025 | loss 6.4281 | norm 948.7593
|
| 571 |
+
2025-06-25 06:15:21,392 | INFO | iter 000045 | lr 0.0025 | loss 6.8733 | norm 961.0127
|
| 572 |
+
2025-06-25 06:15:26,535 | INFO | iter 000046 | lr 0.0025 | loss 7.0664 | norm 973.0458
|
| 573 |
+
2025-06-25 06:15:31,984 | INFO | iter 000047 | lr 0.0025 | loss 6.9932 | norm 984.8674
|
| 574 |
+
2025-06-25 06:15:37,095 | INFO | iter 000048 | lr 0.0025 | loss 7.0181 | norm 996.4859
|
| 575 |
+
2025-06-25 06:15:42,239 | INFO | iter 000049 | lr 0.0025 | loss 6.0350 | norm 1007.9093
|
| 576 |
+
2025-06-25 06:15:42,239 | INFO | Completed LR test 9/10: lr=0.0025
|
| 577 |
+
2025-06-25 06:15:42,268 | INFO | -------------------------------- EoS --------------------------------
|
| 578 |
+
2025-06-25 06:15:42,268 | INFO | Starting LR test 10/10: lr=0.0027
|
| 579 |
+
2025-06-25 06:15:42,268 | INFO | Starting EoS for LR factor 1.5000
|
| 580 |
+
2025-06-25 06:15:42,268 | INFO | Starting EoS for checkpoint 007000
|
| 581 |
+
2025-06-25 06:15:42,268 | INFO | Starting EoS for model gpt2_small
|
| 582 |
+
2025-06-25 06:15:42,268 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 583 |
+
2025-06-25 06:15:42,268 | INFO | Starting EoS for num_iterations 50
|
| 584 |
+
2025-06-25 06:15:42,268 | INFO | Starting EoS for accum_steps 4
|
| 585 |
+
2025-06-25 06:15:42,268 | INFO | Loading model and checkpoint...
|
| 586 |
+
2025-06-25 06:15:42,968 | INFO | Wrapping model with DDP...
|
| 587 |
+
2025-06-25 06:15:43,317 | INFO | Loading state dict...
|
| 588 |
+
2025-06-25 06:15:43,321 | INFO | Model loaded successfully!
|
| 589 |
+
2025-06-25 06:15:49,488 | INFO | iter 000000 | lr 0.0027 | loss 3.6876 | norm 34.3605
|
| 590 |
+
2025-06-25 06:15:54,389 | INFO | iter 000001 | lr 0.0027 | loss 3.5943 | norm 67.9249
|
| 591 |
+
2025-06-25 06:15:59,699 | INFO | iter 000002 | lr 0.0027 | loss 3.6800 | norm 101.0297
|
| 592 |
+
2025-06-25 06:16:05,013 | INFO | iter 000003 | lr 0.0027 | loss 3.8703 | norm 133.5825
|
| 593 |
+
2025-06-25 06:16:10,313 | INFO | iter 000004 | lr 0.0027 | loss 3.9398 | norm 165.5491
|
| 594 |
+
2025-06-25 06:16:15,323 | INFO | iter 000005 | lr 0.0027 | loss 3.9873 | norm 196.9229
|
| 595 |
+
2025-06-25 06:16:20,431 | INFO | iter 000006 | lr 0.0027 | loss 4.2650 | norm 227.6963
|
| 596 |
+
2025-06-25 06:16:25,491 | INFO | iter 000007 | lr 0.0027 | loss 4.2481 | norm 257.8574
|
| 597 |
+
2025-06-25 06:16:31,169 | INFO | iter 000008 | lr 0.0027 | loss 4.4322 | norm 287.3958
|
| 598 |
+
2025-06-25 06:16:36,127 | INFO | iter 000009 | lr 0.0027 | loss 4.5136 | norm 316.3069
|
| 599 |
+
2025-06-25 06:16:41,329 | INFO | iter 000010 | lr 0.0027 | loss 4.5131 | norm 344.5903
|
| 600 |
+
2025-06-25 06:16:46,561 | INFO | iter 000011 | lr 0.0027 | loss 4.4507 | norm 372.2470
|
| 601 |
+
2025-06-25 06:16:51,600 | INFO | iter 000012 | lr 0.0027 | loss 4.8472 | norm 399.2780
|
| 602 |
+
2025-06-25 06:16:56,803 | INFO | iter 000013 | lr 0.0027 | loss 4.6747 | norm 425.6851
|
| 603 |
+
2025-06-25 06:17:02,481 | INFO | iter 000014 | lr 0.0027 | loss 5.0471 | norm 451.4725
|
| 604 |
+
2025-06-25 06:17:07,499 | INFO | iter 000015 | lr 0.0027 | loss 5.3386 | norm 476.6461
|
| 605 |
+
2025-06-25 06:17:12,469 | INFO | iter 000016 | lr 0.0027 | loss 5.3129 | norm 501.2121
|
| 606 |
+
2025-06-25 06:17:17,681 | INFO | iter 000017 | lr 0.0027 | loss 5.2700 | norm 525.1778
|
| 607 |
+
2025-06-25 06:17:22,697 | INFO | iter 000018 | lr 0.0027 | loss 5.2826 | norm 548.5519
|
| 608 |
+
2025-06-25 06:17:27,938 | INFO | iter 000019 | lr 0.0027 | loss 5.2451 | norm 571.3443
|
| 609 |
+
2025-06-25 06:17:33,339 | INFO | iter 000020 | lr 0.0027 | loss 5.2423 | norm 593.5663
|
| 610 |
+
2025-06-25 06:17:38,686 | INFO | iter 000021 | lr 0.0027 | loss 5.6845 | norm 615.2298
|
| 611 |
+
2025-06-25 06:17:43,870 | INFO | iter 000022 | lr 0.0027 | loss 5.7502 | norm 636.3470
|
| 612 |
+
2025-06-25 06:17:48,833 | INFO | iter 000023 | lr 0.0027 | loss 6.3552 | norm 656.9308
|
| 613 |
+
2025-06-25 06:17:54,207 | INFO | iter 000024 | lr 0.0027 | loss 5.7950 | norm 676.9943
|
| 614 |
+
2025-06-25 06:17:59,920 | INFO | iter 000025 | lr 0.0027 | loss 6.0212 | norm 696.5510
|
| 615 |
+
2025-06-25 06:18:05,173 | INFO | iter 000026 | lr 0.0027 | loss 5.7221 | norm 715.6146
|
| 616 |
+
2025-06-25 06:18:10,751 | INFO | iter 000027 | lr 0.0027 | loss 5.6325 | norm 734.1994
|
| 617 |
+
2025-06-25 06:18:15,764 | INFO | iter 000028 | lr 0.0027 | loss 6.3907 | norm 752.3203
|
| 618 |
+
2025-06-25 06:18:20,879 | INFO | iter 000029 | lr 0.0027 | loss 6.0694 | norm 769.9918
|
| 619 |
+
2025-06-25 06:18:26,035 | INFO | iter 000030 | lr 0.0027 | loss 6.1491 | norm 787.2289
|
| 620 |
+
2025-06-25 06:18:31,673 | INFO | iter 000031 | lr 0.0027 | loss 6.4236 | norm 804.0462
|
| 621 |
+
2025-06-25 06:18:36,833 | INFO | iter 000032 | lr 0.0027 | loss 6.2595 | norm 820.4583
|
| 622 |
+
2025-06-25 06:18:41,950 | INFO | iter 000033 | lr 0.0027 | loss 6.5047 | norm 836.4797
|
| 623 |
+
2025-06-25 06:18:47,129 | INFO | iter 000034 | lr 0.0027 | loss 5.9256 | norm 852.1246
|
| 624 |
+
2025-06-25 06:18:52,172 | INFO | iter 000035 | lr 0.0027 | loss 7.2157 | norm 867.4069
|
| 625 |
+
2025-06-25 06:18:57,259 | INFO | iter 000036 | lr 0.0027 | loss 6.8648 | norm 882.3406
|
| 626 |
+
2025-06-25 06:19:02,908 | INFO | iter 000037 | lr 0.0027 | loss 7.2547 | norm 896.9395
|
| 627 |
+
2025-06-25 06:19:08,076 | INFO | iter 000038 | lr 0.0027 | loss 6.6690 | norm 911.2167
|
| 628 |
+
2025-06-25 06:19:13,095 | INFO | iter 000039 | lr 0.0027 | loss 6.4529 | norm 925.1854
|
| 629 |
+
2025-06-25 06:19:18,346 | INFO | iter 000040 | lr 0.0027 | loss 6.8312 | norm 938.8581
|
| 630 |
+
2025-06-25 06:19:23,338 | INFO | iter 000041 | lr 0.0027 | loss 5.8287 | norm 952.2471
|
| 631 |
+
2025-06-25 06:19:28,610 | INFO | iter 000042 | lr 0.0027 | loss 7.0746 | norm 965.3642
|
| 632 |
+
2025-06-25 06:19:33,936 | INFO | iter 000043 | lr 0.0027 | loss 7.0014 | norm 978.2209
|
| 633 |
+
2025-06-25 06:19:39,018 | INFO | iter 000044 | lr 0.0027 | loss 6.5323 | norm 990.8282
|
| 634 |
+
2025-06-25 06:19:44,171 | INFO | iter 000045 | lr 0.0027 | loss 7.0292 | norm 1003.1965
|
| 635 |
+
2025-06-25 06:19:49,407 | INFO | iter 000046 | lr 0.0027 | loss 7.2137 | norm 1015.3359
|
| 636 |
+
2025-06-25 06:19:54,614 | INFO | iter 000047 | lr 0.0027 | loss 7.1640 | norm 1027.2564
|
| 637 |
+
2025-06-25 06:19:59,877 | INFO | iter 000048 | lr 0.0027 | loss 7.1574 | norm 1038.9671
|
| 638 |
+
2025-06-25 06:20:05,133 | INFO | iter 000049 | lr 0.0027 | loss 6.1115 | norm 1050.4771
|
| 639 |
+
2025-06-25 06:20:05,133 | INFO | Completed LR test 10/10: lr=0.0027
|
| 640 |
+
2025-06-25 06:20:05,458 | INFO | Cleanup complete
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "gpt2_small",
|
| 3 |
+
"factor_min": 0.6,
|
| 4 |
+
"factor_max": 1.5,
|
| 5 |
+
"factor_num": 10,
|
| 6 |
+
"error": 0.0001,
|
| 7 |
+
"accum_steps": 4,
|
| 8 |
+
"num_iterations": 50,
|
| 9 |
+
"num_checkpoint": 2000,
|
| 10 |
+
"input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin",
|
| 11 |
+
"run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536",
|
| 12 |
+
"timestamp": "250622_035242",
|
| 13 |
+
"raw": false
|
| 14 |
+
}
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/losses_lr.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr_iter.png
ADDED
|
fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/training.log
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-25 08:05:01,878 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
|
| 2 |
+
2025-06-25 08:05:03,791 | INFO | Loaded checkpoint with optimizer: adam
|
| 3 |
+
2025-06-25 08:05:03,792 | INFO | Current learning rate: 0.0018
|
| 4 |
+
2025-06-25 08:05:04,404 | INFO | Weight decay: 0.1
|
| 5 |
+
2025-06-25 08:05:04,404 | INFO | Epsilon: 1e-08
|
| 6 |
+
2025-06-25 08:05:04,404 | INFO | Loaded 147 first moment (m) buffers
|
| 7 |
+
2025-06-25 08:05:04,404 | INFO | Loaded 147 second moment (v) buffers
|
| 8 |
+
2025-06-25 08:05:04,404 | INFO | Optimizer state loading completed!
|
| 9 |
+
2025-06-25 08:05:06,318 | INFO | Initialized xs with norm: 1.273501
|
| 10 |
+
2025-06-25 08:05:06,326 | INFO | -------------------------------- EoS --------------------------------
|
| 11 |
+
2025-06-25 08:05:06,326 | INFO | Starting LR test 1/10: lr=0.1800
|
| 12 |
+
2025-06-25 08:05:06,326 | INFO | Starting EoS for LR factor 100.0000
|
| 13 |
+
2025-06-25 08:05:06,326 | INFO | Starting EoS for checkpoint 010000
|
| 14 |
+
2025-06-25 08:05:06,326 | INFO | Starting EoS for model gpt2_small
|
| 15 |
+
2025-06-25 08:05:06,326 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 16 |
+
2025-06-25 08:05:06,326 | INFO | Starting EoS for num_iterations 50
|
| 17 |
+
2025-06-25 08:05:06,326 | INFO | Starting EoS for accum_steps 4
|
| 18 |
+
2025-06-25 08:05:06,326 | INFO | Loading model and checkpoint...
|
| 19 |
+
2025-06-25 08:05:07,207 | INFO | Wrapping model with DDP...
|
| 20 |
+
2025-06-25 08:05:07,346 | INFO | Loading state dict...
|
| 21 |
+
2025-06-25 08:05:07,350 | INFO | Model loaded successfully!
|
| 22 |
+
2025-06-25 08:05:14,155 | INFO | iter 000000 | lr 0.1800 | loss 3.7162 | norm 1035.2508
|
| 23 |
+
2025-06-25 08:05:19,180 | INFO | iter 000001 | lr 0.1800 | loss 1022.7610 | norm 1854.6921
|
| 24 |
+
2025-06-25 08:05:24,315 | INFO | iter 000002 | lr 0.1800 | loss 12916.4541 | norm 2262.8654
|
| 25 |
+
2025-06-25 08:05:29,639 | INFO | iter 000003 | lr 0.1800 | loss 2381.4231 | norm 2784.9186
|
| 26 |
+
2025-06-25 08:05:35,017 | INFO | iter 000004 | lr 0.1800 | loss 5663.5991 | norm 3258.5670
|
| 27 |
+
2025-06-25 08:05:40,197 | INFO | iter 000005 | lr 0.1800 | loss 6263.7207 | norm 3659.7969
|
| 28 |
+
2025-06-25 08:05:45,423 | INFO | iter 000006 | lr 0.1800 | loss 3300.3076 | norm 4042.1641
|
| 29 |
+
2025-06-25 08:05:50,614 | INFO | iter 000007 | lr 0.1800 | loss 2205.2788 | norm 4403.1766
|
| 30 |
+
2025-06-25 08:05:55,705 | INFO | iter 000008 | lr 0.1800 | loss 3169.1311 | norm 4721.1078
|
| 31 |
+
2025-06-25 08:06:01,395 | INFO | iter 000009 | lr 0.1800 | loss 2604.3735 | norm 5007.2289
|
| 32 |
+
2025-06-25 08:06:06,537 | INFO | iter 000010 | lr 0.1800 | loss 1049.3635 | norm 5277.7821
|
| 33 |
+
2025-06-25 08:06:11,635 | INFO | iter 000011 | lr 0.1800 | loss -534.1947 | norm 5542.2629
|
| 34 |
+
2025-06-25 08:06:16,925 | INFO | iter 000012 | lr 0.1800 | loss -1829.3038 | norm 5804.4541
|
| 35 |
+
2025-06-25 08:06:22,270 | INFO | iter 000013 | lr 0.1800 | loss -3147.5239 | norm 6068.7285
|
| 36 |
+
2025-06-25 08:06:27,494 | INFO | iter 000014 | lr 0.1800 | loss -5675.9150 | norm 6341.2563
|
| 37 |
+
2025-06-25 08:06:33,272 | INFO | iter 000015 | lr 0.1800 | loss -8397.9707 | norm 6625.5651
|
| 38 |
+
2025-06-25 08:06:38,517 | INFO | iter 000016 | lr 0.1800 | loss -12464.1982 | norm 6928.3923
|
| 39 |
+
2025-06-25 08:06:43,692 | INFO | iter 000017 | lr 0.1800 | loss -19611.1348 | norm 7248.3939
|
| 40 |
+
2025-06-25 08:06:48,881 | INFO | iter 000018 | lr 0.1800 | loss -27940.6465 | norm 7598.8381
|
| 41 |
+
2025-06-25 08:06:54,254 | INFO | iter 000019 | lr 0.1800 | loss -37102.6367 | norm 7980.6707
|
| 42 |
+
2025-06-25 08:06:59,396 | INFO | iter 000020 | lr 0.1800 | loss -41850.2695 | norm 8391.1617
|
| 43 |
+
2025-06-25 08:07:04,889 | INFO | iter 000021 | lr 0.1800 | loss -55457.6641 | norm 8833.9090
|
| 44 |
+
2025-06-25 08:07:10,211 | INFO | iter 000022 | lr 0.1800 | loss -77246.1016 | norm 9308.0797
|
| 45 |
+
2025-06-25 08:07:15,407 | INFO | iter 000023 | lr 0.1800 | loss -88869.9531 | norm 9810.1623
|
| 46 |
+
2025-06-25 08:07:20,721 | INFO | iter 000024 | lr 0.1800 | loss -106978.7188 | norm 10315.7090
|
| 47 |
+
2025-06-25 08:07:26,169 | INFO | iter 000025 | lr 0.1800 | loss -128147.9297 | norm 10848.4611
|
| 48 |
+
2025-06-25 08:07:31,697 | INFO | iter 000026 | lr 0.1800 | loss -167841.3906 | norm 11408.8324
|
| 49 |
+
2025-06-25 08:07:37,001 | INFO | iter 000027 | lr 0.1800 | loss -180567.0000 | norm 11986.0506
|
| 50 |
+
2025-06-25 08:07:42,165 | INFO | iter 000028 | lr 0.1800 | loss -195498.7031 | norm 12575.1052
|
| 51 |
+
2025-06-25 08:07:47,397 | INFO | iter 000029 | lr 0.1800 | loss -226350.6406 | norm 13174.6499
|
| 52 |
+
2025-06-25 08:07:52,643 | INFO | iter 000030 | lr 0.1800 | loss -278469.5625 | norm 13789.2878
|
| 53 |
+
2025-06-25 08:07:57,699 | INFO | iter 000031 | lr 0.1800 | loss -291369.5000 | norm 14408.3851
|
| 54 |
+
2025-06-25 08:08:03,348 | INFO | iter 000032 | lr 0.1800 | loss -340664.4062 | norm 15027.4904
|
| 55 |
+
2025-06-25 08:08:08,536 | INFO | iter 000033 | lr 0.1800 | loss -381711.4062 | norm 15650.6969
|
| 56 |
+
2025-06-25 08:08:13,710 | INFO | iter 000034 | lr 0.1800 | loss -447641.5625 | norm 16275.4851
|
| 57 |
+
2025-06-25 08:08:18,817 | INFO | iter 000035 | lr 0.1800 | loss -478080.0938 | norm 16906.2037
|
| 58 |
+
2025-06-25 08:08:23,855 | INFO | iter 000036 | lr 0.1800 | loss -474737.6875 | norm 17529.9377
|
| 59 |
+
2025-06-25 08:08:29,137 | INFO | iter 000037 | lr 0.1800 | loss -562731.2500 | norm 18158.1498
|
| 60 |
+
2025-06-25 08:08:34,554 | INFO | iter 000038 | lr 0.1800 | loss -671918.5000 | norm 18787.8108
|
| 61 |
+
2025-06-25 08:08:39,659 | INFO | iter 000039 | lr 0.1800 | loss -611043.3750 | norm 19413.9402
|
| 62 |
+
2025-06-25 08:08:44,757 | INFO | iter 000040 | lr 0.1800 | loss -695682.8750 | norm 20036.0407
|
| 63 |
+
2025-06-25 08:08:49,914 | INFO | iter 000041 | lr 0.1800 | loss -780705.8750 | norm 20644.8346
|
| 64 |
+
2025-06-25 08:08:55,158 | INFO | iter 000042 | lr 0.1800 | loss -995004.5625 | norm 21239.0209
|
| 65 |
+
2025-06-25 08:09:00,499 | INFO | iter 000043 | lr 0.1800 | loss -818858.2500 | norm 21828.2696
|
| 66 |
+
2025-06-25 08:09:05,819 | INFO | iter 000044 | lr 0.1800 | loss -880965.5000 | norm 22411.6198
|
| 67 |
+
2025-06-25 08:09:11,010 | INFO | iter 000045 | lr 0.1800 | loss -1003927.8125 | norm 22994.3220
|
| 68 |
+
2025-06-25 08:09:16,158 | INFO | iter 000046 | lr 0.1800 | loss -1131396.7500 | norm 23577.3465
|
| 69 |
+
2025-06-25 08:09:21,395 | INFO | iter 000047 | lr 0.1800 | loss -1100723.7500 | norm 24149.2858
|
| 70 |
+
2025-06-25 08:09:26,617 | INFO | iter 000048 | lr 0.1800 | loss -1096843.2500 | norm 24709.4051
|
| 71 |
+
2025-06-25 08:09:32,274 | INFO | iter 000049 | lr 0.1800 | loss -1231046.2500 | norm 25267.4215
|
| 72 |
+
2025-06-25 08:09:32,275 | INFO | Completed LR test 1/10: lr=0.1800
|
| 73 |
+
2025-06-25 08:09:32,476 | INFO | Cleanup complete
|
| 74 |
+
2025-06-25 08:14:04,088 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
|
| 75 |
+
2025-06-25 08:14:06,081 | INFO | Loaded checkpoint with optimizer: adam
|
| 76 |
+
2025-06-25 08:14:06,081 | INFO | Current learning rate: 0.0018
|
| 77 |
+
2025-06-25 08:14:06,733 | INFO | Weight decay: 0.1
|
| 78 |
+
2025-06-25 08:14:06,733 | INFO | Epsilon: 1e-08
|
| 79 |
+
2025-06-25 08:14:06,733 | INFO | Loaded 147 first moment (m) buffers
|
| 80 |
+
2025-06-25 08:14:06,733 | INFO | Loaded 147 second moment (v) buffers
|
| 81 |
+
2025-06-25 08:14:06,733 | INFO | Optimizer state loading completed!
|
| 82 |
+
2025-06-25 08:14:08,702 | INFO | Initialized xs with norm: 1.273654
|
| 83 |
+
2025-06-25 08:14:08,705 | INFO | -------------------------------- EoS --------------------------------
|
| 84 |
+
2025-06-25 08:14:08,705 | INFO | Starting LR test 1/10: lr=0.1800
|
| 85 |
+
2025-06-25 08:14:08,705 | INFO | Starting EoS for LR factor 100.0000
|
| 86 |
+
2025-06-25 08:14:08,705 | INFO | Starting EoS for checkpoint 010000
|
| 87 |
+
2025-06-25 08:14:08,706 | INFO | Starting EoS for model gpt2_small
|
| 88 |
+
2025-06-25 08:14:08,706 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 89 |
+
2025-06-25 08:14:08,706 | INFO | Starting EoS for num_iterations 50
|
| 90 |
+
2025-06-25 08:14:08,706 | INFO | Starting EoS for accum_steps 4
|
| 91 |
+
2025-06-25 08:14:08,706 | INFO | Loading model and checkpoint...
|
| 92 |
+
2025-06-25 08:14:09,604 | INFO | Wrapping model with DDP...
|
| 93 |
+
2025-06-25 08:14:09,826 | INFO | Loading state dict...
|
| 94 |
+
2025-06-25 08:14:09,830 | INFO | Model loaded successfully!
|
| 95 |
+
2025-06-25 08:14:16,113 | INFO | iter 000000 | lr 0.1800 | loss 3.7162 | norm 458.7718
|
| 96 |
+
2025-06-25 08:14:21,223 | INFO | iter 000001 | lr 0.1800 | loss 207.3757 | norm 1023.7285
|
| 97 |
+
2025-06-25 08:14:26,544 | INFO | iter 000002 | lr 0.1800 | loss 5262.5127 | norm 1344.5259
|
| 98 |
+
2025-06-25 08:14:32,322 | INFO | iter 000003 | lr 0.1800 | loss 1121.6940 | norm 1825.7758
|
| 99 |
+
2025-06-25 08:14:37,581 | INFO | iter 000004 | lr 0.1800 | loss 3489.6094 | norm 2256.3649
|
| 100 |
+
2025-06-25 08:14:42,793 | INFO | iter 000005 | lr 0.1800 | loss 2611.4473 | norm 2665.4942
|
| 101 |
+
2025-06-25 08:14:47,946 | INFO | iter 000006 | lr 0.1800 | loss 1170.9673 | norm 3086.3505
|
| 102 |
+
2025-06-25 08:14:53,249 | INFO | iter 000007 | lr 0.1800 | loss 1968.7089 | norm 3471.1555
|
| 103 |
+
2025-06-25 08:14:58,293 | INFO | iter 000008 | lr 0.1800 | loss 2037.9554 | norm 3819.5898
|
| 104 |
+
2025-06-25 08:15:04,132 | INFO | iter 000009 | lr 0.1800 | loss 831.5104 | norm 4163.8926
|
| 105 |
+
2025-06-25 08:15:09,325 | INFO | iter 000010 | lr 0.1800 | loss 281.9943 | norm 4504.7350
|
| 106 |
+
2025-06-25 08:15:14,666 | INFO | iter 000011 | lr 0.1800 | loss -326.7262 | norm 4836.5242
|
| 107 |
+
2025-06-25 08:15:19,988 | INFO | iter 000012 | lr 0.1800 | loss -1960.6803 | norm 5168.6060
|
| 108 |
+
2025-06-25 08:15:25,055 | INFO | iter 000013 | lr 0.1800 | loss -4240.2627 | norm 5518.1032
|
| 109 |
+
2025-06-25 08:15:30,756 | INFO | iter 000014 | lr 0.1800 | loss -7296.6255 | norm 5891.2474
|
| 110 |
+
2025-06-25 08:15:35,838 | INFO | iter 000015 | lr 0.1800 | loss -10337.6426 | norm 6286.4372
|
| 111 |
+
2025-06-25 08:15:40,922 | INFO | iter 000016 | lr 0.1800 | loss -15605.5254 | norm 6710.1341
|
| 112 |
+
2025-06-25 08:15:46,100 | INFO | iter 000017 | lr 0.1800 | loss -25703.7539 | norm 7163.8796
|
| 113 |
+
2025-06-25 08:15:51,309 | INFO | iter 000018 | lr 0.1800 | loss -37443.0664 | norm 7659.6704
|
| 114 |
+
2025-06-25 08:15:56,583 | INFO | iter 000019 | lr 0.1800 | loss -50078.2383 | norm 8193.8946
|
| 115 |
+
2025-06-25 08:16:01,998 | INFO | iter 000020 | lr 0.1800 | loss -57032.1875 | norm 8760.0695
|
| 116 |
+
2025-06-25 08:16:07,120 | INFO | iter 000021 | lr 0.1800 | loss -76344.0781 | norm 9360.5588
|
| 117 |
+
2025-06-25 08:16:12,341 | INFO | iter 000022 | lr 0.1800 | loss -106728.8047 | norm 9991.8501
|
| 118 |
+
2025-06-25 08:16:17,385 | INFO | iter 000023 | lr 0.1800 | loss -122665.7422 | norm 10647.0357
|
| 119 |
+
2025-06-25 08:16:22,444 | INFO | iter 000024 | lr 0.1800 | loss -146900.8906 | norm 11291.6713
|
| 120 |
+
2025-06-25 08:16:27,677 | INFO | iter 000025 | lr 0.1800 | loss -175862.7344 | norm 11958.6899
|
| 121 |
+
2025-06-25 08:16:33,138 | INFO | iter 000026 | lr 0.1800 | loss -229347.8750 | norm 12648.0304
|
| 122 |
+
2025-06-25 08:16:38,360 | INFO | iter 000027 | lr 0.1800 | loss -245424.7344 | norm 13346.8304
|
| 123 |
+
2025-06-25 08:16:43,584 | INFO | iter 000028 | lr 0.1800 | loss -263935.7500 | norm 14047.6949
|
| 124 |
+
2025-06-25 08:16:48,605 | INFO | iter 000029 | lr 0.1800 | loss -304780.0625 | norm 14750.5359
|
| 125 |
+
2025-06-25 08:16:53,925 | INFO | iter 000030 | lr 0.1800 | loss -372978.8438 | norm 15461.7537
|
| 126 |
+
2025-06-25 08:16:59,207 | INFO | iter 000031 | lr 0.1800 | loss -386555.4062 | norm 16169.9843
|
| 127 |
+
2025-06-25 08:17:04,766 | INFO | iter 000032 | lr 0.1800 | loss -447786.1250 | norm 16870.0891
|
| 128 |
+
2025-06-25 08:17:09,788 | INFO | iter 000033 | lr 0.1800 | loss -501011.2500 | norm 17566.7465
|
| 129 |
+
2025-06-25 08:17:14,895 | INFO | iter 000034 | lr 0.1800 | loss -584175.8750 | norm 18257.1731
|
| 130 |
+
2025-06-25 08:17:20,270 | INFO | iter 000035 | lr 0.1800 | loss -620550.2500 | norm 18947.4654
|
| 131 |
+
2025-06-25 08:17:25,581 | INFO | iter 000036 | lr 0.1800 | loss -611509.9375 | norm 19623.6972
|
| 132 |
+
2025-06-25 08:17:31,267 | INFO | iter 000037 | lr 0.1800 | loss -720793.8750 | norm 20299.6148
|
| 133 |
+
2025-06-25 08:17:36,391 | INFO | iter 000038 | lr 0.1800 | loss -856993.6875 | norm 20972.3499
|
| 134 |
+
2025-06-25 08:17:41,749 | INFO | iter 000039 | lr 0.1800 | loss -774917.7500 | norm 21636.3731
|
| 135 |
+
2025-06-25 08:17:46,809 | INFO | iter 000040 | lr 0.1800 | loss -875320.1250 | norm 22291.3078
|
| 136 |
+
2025-06-25 08:17:51,936 | INFO | iter 000041 | lr 0.1800 | loss -978267.6250 | norm 22928.2164
|
| 137 |
+
2025-06-25 08:17:57,182 | INFO | iter 000042 | lr 0.1800 | loss -1236919.7500 | norm 23545.3115
|
| 138 |
+
2025-06-25 08:18:02,979 | INFO | iter 000043 | lr 0.1800 | loss -1016738.6875 | norm 24153.6447
|
| 139 |
+
2025-06-25 08:18:08,099 | INFO | iter 000044 | lr 0.1800 | loss -1085286.3750 | norm 24751.7884
|
| 140 |
+
2025-06-25 08:18:13,218 | INFO | iter 000045 | lr 0.1800 | loss -1233294.6250 | norm 25346.0331
|
| 141 |
+
2025-06-25 08:18:18,370 | INFO | iter 000046 | lr 0.1800 | loss -1382542.3750 | norm 25937.9944
|
| 142 |
+
2025-06-25 08:18:23,739 | INFO | iter 000047 | lr 0.1800 | loss -1339756.2500 | norm 26515.0226
|
| 143 |
+
2025-06-25 08:18:29,372 | INFO | iter 000048 | lr 0.1800 | loss -1327051.5000 | norm 27076.3794
|
| 144 |
+
2025-06-25 08:18:34,762 | INFO | iter 000049 | lr 0.1800 | loss -1484669.5000 | norm 27633.4660
|
| 145 |
+
2025-06-25 08:18:34,762 | INFO | Completed LR test 1/10: lr=0.1800
|
| 146 |
+
2025-06-25 08:18:34,933 | INFO | Cleanup complete
|
| 147 |
+
2025-06-25 08:19:09,783 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
|
| 148 |
+
2025-06-25 08:19:11,705 | INFO | Loaded checkpoint with optimizer: adam
|
| 149 |
+
2025-06-25 08:19:11,705 | INFO | Current learning rate: 0.0018
|
| 150 |
+
2025-06-25 08:19:12,319 | INFO | Weight decay: 0.1
|
| 151 |
+
2025-06-25 08:19:12,319 | INFO | Epsilon: 1e-08
|
| 152 |
+
2025-06-25 08:19:12,319 | INFO | Loaded 147 first moment (m) buffers
|
| 153 |
+
2025-06-25 08:19:12,319 | INFO | Loaded 147 second moment (v) buffers
|
| 154 |
+
2025-06-25 08:19:12,319 | INFO | Optimizer state loading completed!
|
| 155 |
+
2025-06-25 08:19:14,382 | INFO | Initialized xs with norm: 1.273542
|
| 156 |
+
2025-06-25 08:19:14,386 | INFO | -------------------------------- EoS --------------------------------
|
| 157 |
+
2025-06-25 08:19:14,386 | INFO | Starting LR test 1/10: lr=0.1800
|
| 158 |
+
2025-06-25 08:19:14,387 | INFO | Starting EoS for LR factor 100.0000
|
| 159 |
+
2025-06-25 08:19:14,387 | INFO | Starting EoS for checkpoint 010000
|
| 160 |
+
2025-06-25 08:19:14,387 | INFO | Starting EoS for model gpt2_small
|
| 161 |
+
2025-06-25 08:19:14,387 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 162 |
+
2025-06-25 08:19:14,387 | INFO | Starting EoS for num_iterations 50
|
| 163 |
+
2025-06-25 08:19:14,387 | INFO | Starting EoS for accum_steps 4
|
| 164 |
+
2025-06-25 08:19:14,387 | INFO | Loading model and checkpoint...
|
| 165 |
+
2025-06-25 08:19:15,188 | INFO | Wrapping model with DDP...
|
| 166 |
+
2025-06-25 08:19:15,646 | INFO | Loading state dict...
|
| 167 |
+
2025-06-25 08:19:15,649 | INFO | Model loaded successfully!
|
| 168 |
+
2025-06-25 08:19:22,000 | INFO | iter 000000 | lr 0.1800 | loss 3.5472 | norm 471.7711
|
| 169 |
+
2025-06-25 08:19:27,321 | INFO | iter 000001 | lr 0.1800 | loss 258.9568 | norm 1033.3546
|
| 170 |
+
2025-06-25 08:19:33,176 | INFO | iter 000002 | lr 0.1800 | loss 6505.9888 | norm 1313.8736
|
| 171 |
+
2025-06-25 08:19:38,376 | INFO | iter 000003 | lr 0.1800 | loss 720.6740 | norm 1808.7756
|
| 172 |
+
2025-06-25 08:19:43,461 | INFO | iter 000004 | lr 0.1800 | loss 4549.4497 | norm 2236.3996
|
| 173 |
+
2025-06-25 08:19:48,625 | INFO | iter 000005 | lr 0.1800 | loss 3147.6418 | norm 2648.9176
|
| 174 |
+
2025-06-25 08:19:53,829 | INFO | iter 000006 | lr 0.1800 | loss 1210.4675 | norm 3081.6848
|
| 175 |
+
2025-06-25 08:19:58,930 | INFO | iter 000007 | lr 0.1800 | loss 2729.4216 | norm 3473.4778
|
| 176 |
+
2025-06-25 08:20:04,381 | INFO | iter 000008 | lr 0.1800 | loss 2761.1892 | norm 3830.6859
|
| 177 |
+
2025-06-25 08:20:09,528 | INFO | iter 000009 | lr 0.1800 | loss 1629.8550 | norm 4172.5473
|
| 178 |
+
2025-06-25 08:20:14,583 | INFO | iter 000010 | lr 0.1800 | loss 738.9063 | norm 4510.9757
|
| 179 |
+
2025-06-25 08:20:19,734 | INFO | iter 000011 | lr 0.1800 | loss 429.7834 | norm 4842.2049
|
| 180 |
+
2025-06-25 08:20:24,921 | INFO | iter 000012 | lr 0.1800 | loss -247.2831 | norm 5150.2921
|
| 181 |
+
2025-06-25 08:20:30,290 | INFO | iter 000013 | lr 0.1800 | loss -2646.1260 | norm 5471.6201
|
| 182 |
+
2025-06-25 08:20:35,302 | INFO | iter 000014 | lr 0.1800 | loss -5207.6968 | norm 5812.3796
|
| 183 |
+
2025-06-25 08:20:40,540 | INFO | iter 000015 | lr 0.1800 | loss -6475.6606 | norm 6168.6810
|
| 184 |
+
2025-06-25 08:20:45,761 | INFO | iter 000016 | lr 0.1800 | loss -11955.2441 | norm 6540.5293
|
| 185 |
+
2025-06-25 08:20:50,785 | INFO | iter 000017 | lr 0.1800 | loss -18120.8965 | norm 6942.5453
|
| 186 |
+
2025-06-25 08:20:56,027 | INFO | iter 000018 | lr 0.1800 | loss -27042.0312 | norm 7375.6815
|
| 187 |
+
2025-06-25 08:21:01,764 | INFO | iter 000019 | lr 0.1800 | loss -32353.4766 | norm 7844.6077
|
| 188 |
+
2025-06-25 08:21:06,819 | INFO | iter 000020 | lr 0.1800 | loss -42713.4258 | norm 8350.1531
|
| 189 |
+
2025-06-25 08:21:11,942 | INFO | iter 000021 | lr 0.1800 | loss -63141.9570 | norm 8895.7420
|
| 190 |
+
2025-06-25 08:21:17,296 | INFO | iter 000022 | lr 0.1800 | loss -79225.8125 | norm 9464.6960
|
| 191 |
+
2025-06-25 08:21:22,364 | INFO | iter 000023 | lr 0.1800 | loss -85295.2188 | norm 10064.8914
|
| 192 |
+
2025-06-25 08:21:27,555 | INFO | iter 000024 | lr 0.1800 | loss -104268.3984 | norm 10670.1012
|
| 193 |
+
2025-06-25 08:21:33,391 | INFO | iter 000025 | lr 0.1800 | loss -138191.5625 | norm 11306.4591
|
| 194 |
+
2025-06-25 08:21:38,449 | INFO | iter 000026 | lr 0.1800 | loss -130759.4922 | norm 11963.0958
|
| 195 |
+
2025-06-25 08:21:43,851 | INFO | iter 000027 | lr 0.1800 | loss -151880.2188 | norm 12623.4203
|
| 196 |
+
2025-06-25 08:21:48,943 | INFO | iter 000028 | lr 0.1800 | loss -199277.5312 | norm 13299.8124
|
| 197 |
+
2025-06-25 08:21:54,133 | INFO | iter 000029 | lr 0.1800 | loss -231175.6094 | norm 13985.3231
|
| 198 |
+
2025-06-25 08:21:59,601 | INFO | iter 000030 | lr 0.1800 | loss -227062.6875 | norm 14638.4871
|
| 199 |
+
2025-06-25 08:22:04,755 | INFO | iter 000031 | lr 0.1800 | loss -305321.5312 | norm 15300.2854
|
| 200 |
+
2025-06-25 08:22:09,996 | INFO | iter 000032 | lr 0.1800 | loss -312911.1562 | norm 15967.4907
|
| 201 |
+
2025-06-25 08:22:15,180 | INFO | iter 000033 | lr 0.1800 | loss -394100.9688 | norm 16633.2168
|
| 202 |
+
2025-06-25 08:22:20,480 | INFO | iter 000034 | lr 0.1800 | loss -396323.7812 | norm 17301.4404
|
| 203 |
+
2025-06-25 08:22:25,744 | INFO | iter 000035 | lr 0.1800 | loss -495372.3750 | norm 17975.7867
|
| 204 |
+
2025-06-25 08:22:31,676 | INFO | iter 000036 | lr 0.1800 | loss -536708.7500 | norm 18632.5354
|
| 205 |
+
2025-06-25 08:22:36,888 | INFO | iter 000037 | lr 0.1800 | loss -565078.6250 | norm 19274.2470
|
| 206 |
+
2025-06-25 08:22:42,081 | INFO | iter 000038 | lr 0.1800 | loss -567815.5000 | norm 19919.4392
|
| 207 |
+
2025-06-25 08:22:47,333 | INFO | iter 000039 | lr 0.1800 | loss -618632.5625 | norm 20562.6429
|
| 208 |
+
2025-06-25 08:22:52,496 | INFO | iter 000040 | lr 0.1800 | loss -746847.8750 | norm 21208.1400
|
| 209 |
+
2025-06-25 08:22:57,533 | INFO | iter 000041 | lr 0.1800 | loss -694621.2500 | norm 21850.9988
|
| 210 |
+
2025-06-25 08:23:03,478 | INFO | iter 000042 | lr 0.1800 | loss -805244.6250 | norm 22487.1557
|
| 211 |
+
2025-06-25 08:23:08,470 | INFO | iter 000043 | lr 0.1800 | loss -911537.5000 | norm 23123.2202
|
| 212 |
+
2025-06-25 08:23:13,595 | INFO | iter 000044 | lr 0.1800 | loss -935658.9375 | norm 23750.3923
|
| 213 |
+
2025-06-25 08:23:18,852 | INFO | iter 000045 | lr 0.1800 | loss -877843.8125 | norm 24361.4878
|
| 214 |
+
2025-06-25 08:23:23,961 | INFO | iter 000046 | lr 0.1800 | loss -1091743.1250 | norm 24972.6994
|
| 215 |
+
2025-06-25 08:23:29,213 | INFO | iter 000047 | lr 0.1800 | loss -1240315.0000 | norm 25579.9303
|
| 216 |
+
2025-06-25 08:23:34,577 | INFO | iter 000048 | lr 0.1800 | loss -1015323.0000 | norm 26170.5807
|
| 217 |
+
2025-06-25 08:23:39,739 | INFO | iter 000049 | lr 0.1800 | loss -1083682.8750 | norm 26751.1997
|
| 218 |
+
2025-06-25 08:23:39,739 | INFO | Completed LR test 1/10: lr=0.1800
|
| 219 |
+
2025-06-25 08:23:40,094 | INFO | Cleanup complete
|
| 220 |
+
2025-06-25 08:25:41,547 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
|
| 221 |
+
2025-06-25 08:25:43,132 | INFO | Loaded checkpoint with optimizer: adam
|
| 222 |
+
2025-06-25 08:25:43,133 | INFO | Current learning rate: 0.0018
|
| 223 |
+
2025-06-25 08:25:43,761 | INFO | Weight decay: 0.1
|
| 224 |
+
2025-06-25 08:25:43,761 | INFO | Epsilon: 1e-08
|
| 225 |
+
2025-06-25 08:25:43,761 | INFO | Loaded 147 first moment (m) buffers
|
| 226 |
+
2025-06-25 08:25:43,761 | INFO | Loaded 147 second moment (v) buffers
|
| 227 |
+
2025-06-25 08:25:43,761 | INFO | Optimizer state loading completed!
|
| 228 |
+
2025-06-25 08:25:45,718 | INFO | Initialized xs with norm: 1.273535
|
| 229 |
+
2025-06-25 08:25:45,726 | INFO | -------------------------------- EoS --------------------------------
|
| 230 |
+
2025-06-25 08:25:45,726 | INFO | Starting LR test 1/10: lr=0.1800
|
| 231 |
+
2025-06-25 08:25:45,726 | INFO | Starting EoS for LR factor 100.0000
|
| 232 |
+
2025-06-25 08:25:45,726 | INFO | Starting EoS for checkpoint 010000
|
| 233 |
+
2025-06-25 08:25:45,726 | INFO | Starting EoS for model gpt2_small
|
| 234 |
+
2025-06-25 08:25:45,726 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 235 |
+
2025-06-25 08:25:45,726 | INFO | Starting EoS for num_iterations 50
|
| 236 |
+
2025-06-25 08:25:45,726 | INFO | Starting EoS for accum_steps 4
|
| 237 |
+
2025-06-25 08:25:45,726 | INFO | Loading model and checkpoint...
|
| 238 |
+
2025-06-25 08:25:46,600 | INFO | Wrapping model with DDP...
|
| 239 |
+
2025-06-25 08:25:46,828 | INFO | Loading state dict...
|
| 240 |
+
2025-06-25 08:25:46,831 | INFO | Model loaded successfully!
|
| 241 |
+
2025-06-25 08:25:54,406 | INFO | iter 000000 | lr 0.1800 | loss 0.0000 | norm 471.7869
|
| 242 |
+
2025-06-25 08:25:59,712 | INFO | iter 000001 | lr 0.1800 | loss 255.7516 | norm 1033.5834
|
| 243 |
+
2025-06-25 08:26:04,882 | INFO | iter 000002 | lr 0.1800 | loss 6477.0049 | norm 1315.8706
|
| 244 |
+
2025-06-25 08:26:10,059 | INFO | iter 000003 | lr 0.1800 | loss 717.1234 | norm 1811.0524
|
| 245 |
+
2025-06-25 08:26:15,184 | INFO | iter 000004 | lr 0.1800 | loss 4544.1074 | norm 2238.2034
|
| 246 |
+
2025-06-25 08:26:20,441 | INFO | iter 000005 | lr 0.1800 | loss 3125.7515 | norm 2650.5604
|
| 247 |
+
2025-06-25 08:26:25,413 | INFO | iter 000006 | lr 0.1800 | loss 1199.5897 | norm 3083.3089
|
| 248 |
+
2025-06-25 08:26:30,945 | INFO | iter 000007 | lr 0.1800 | loss 2725.4370 | norm 3475.1633
|
| 249 |
+
2025-06-25 08:26:36,160 | INFO | iter 000008 | lr 0.1800 | loss 2744.2969 | norm 3832.5715
|
| 250 |
+
2025-06-25 08:26:41,444 | INFO | iter 000009 | lr 0.1800 | loss 1625.9283 | norm 4174.4268
|
| 251 |
+
2025-06-25 08:26:46,671 | INFO | iter 000010 | lr 0.1800 | loss 732.4396 | norm 4512.6526
|
| 252 |
+
2025-06-25 08:26:51,717 | INFO | iter 000011 | lr 0.1800 | loss 401.3749 | norm 4843.9269
|
| 253 |
+
2025-06-25 08:26:56,797 | INFO | iter 000012 | lr 0.1800 | loss -414.9888 | norm 5153.5810
|
| 254 |
+
2025-06-25 08:27:02,557 | INFO | iter 000013 | lr 0.1800 | loss -2720.9531 | norm 5476.3233
|
| 255 |
+
2025-06-25 08:27:07,729 | INFO | iter 000014 | lr 0.1800 | loss -5391.8271 | norm 5818.1825
|
| 256 |
+
2025-06-25 08:27:12,870 | INFO | iter 000015 | lr 0.1800 | loss -6670.7456 | norm 6176.3078
|
| 257 |
+
2025-06-25 08:27:18,153 | INFO | iter 000016 | lr 0.1800 | loss -12359.0928 | norm 6550.6398
|
| 258 |
+
2025-06-25 08:27:23,424 | INFO | iter 000017 | lr 0.1800 | loss -18700.9395 | norm 6955.5065
|
| 259 |
+
2025-06-25 08:27:28,958 | INFO | iter 000018 | lr 0.1800 | loss -28184.4316 | norm 7388.6123
|
| 260 |
+
2025-06-25 08:27:34,339 | INFO | iter 000019 | lr 0.1800 | loss -33184.2344 | norm 7858.2671
|
| 261 |
+
2025-06-25 08:27:39,610 | INFO | iter 000020 | lr 0.1800 | loss -43730.0781 | norm 8364.9345
|
| 262 |
+
2025-06-25 08:27:44,683 | INFO | iter 000021 | lr 0.1800 | loss -64642.3008 | norm 8911.7296
|
| 263 |
+
2025-06-25 08:27:49,940 | INFO | iter 000022 | lr 0.1800 | loss -81204.4609 | norm 9481.5446
|
| 264 |
+
2025-06-25 08:27:55,204 | INFO | iter 000023 | lr 0.1800 | loss -86869.9766 | norm 10082.6514
|
| 265 |
+
2025-06-25 08:28:00,953 | INFO | iter 000024 | lr 0.1800 | loss -105953.4062 | norm 10687.3279
|
| 266 |
+
2025-06-25 08:28:06,307 | INFO | iter 000025 | lr 0.1800 | loss -140775.6562 | norm 11322.9195
|
| 267 |
+
2025-06-25 08:28:11,582 | INFO | iter 000026 | lr 0.1800 | loss -132647.5938 | norm 11978.4767
|
| 268 |
+
2025-06-25 08:28:16,754 | INFO | iter 000027 | lr 0.1800 | loss -153962.5781 | norm 12637.7207
|
| 269 |
+
2025-06-25 08:28:21,813 | INFO | iter 000028 | lr 0.1800 | loss -202029.5312 | norm 13312.6754
|
| 270 |
+
2025-06-25 08:28:27,054 | INFO | iter 000029 | lr 0.1800 | loss -234245.6406 | norm 13996.1748
|
| 271 |
+
2025-06-25 08:28:33,097 | INFO | iter 000030 | lr 0.1800 | loss -229900.0625 | norm 14646.0295
|
| 272 |
+
2025-06-25 08:28:38,339 | INFO | iter 000031 | lr 0.1800 | loss -308670.3125 | norm 15304.3426
|
| 273 |
+
2025-06-25 08:28:43,514 | INFO | iter 000032 | lr 0.1800 | loss -316647.1562 | norm 15968.0695
|
| 274 |
+
2025-06-25 08:28:48,671 | INFO | iter 000033 | lr 0.1800 | loss -398862.5625 | norm 16630.9157
|
| 275 |
+
2025-06-25 08:28:53,992 | INFO | iter 000034 | lr 0.1800 | loss -400159.2812 | norm 17296.2316
|
| 276 |
+
2025-06-25 08:28:59,229 | INFO | iter 000035 | lr 0.1800 | loss -500405.0000 | norm 17967.7855
|
| 277 |
+
2025-06-25 08:29:04,924 | INFO | iter 000036 | lr 0.1800 | loss -542243.6875 | norm 18618.2132
|
| 278 |
+
2025-06-25 08:29:10,021 | INFO | iter 000037 | lr 0.1800 | loss -565329.5000 | norm 19245.1076
|
| 279 |
+
2025-06-25 08:29:24,306 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
|
| 280 |
+
2025-06-25 08:29:26,082 | INFO | Loaded checkpoint with optimizer: adam
|
| 281 |
+
2025-06-25 08:29:26,082 | INFO | Current learning rate: 0.0018
|
| 282 |
+
2025-06-25 08:29:26,684 | INFO | Weight decay: 0.1
|
| 283 |
+
2025-06-25 08:29:26,684 | INFO | Epsilon: 1e-08
|
| 284 |
+
2025-06-25 08:29:26,684 | INFO | Loaded 147 first moment (m) buffers
|
| 285 |
+
2025-06-25 08:29:26,684 | INFO | Loaded 147 second moment (v) buffers
|
| 286 |
+
2025-06-25 08:29:26,684 | INFO | Optimizer state loading completed!
|
| 287 |
+
2025-06-25 08:29:28,983 | INFO | Initialized xs with norm: 1.273466
|
| 288 |
+
2025-06-25 08:29:28,995 | INFO | -------------------------------- EoS --------------------------------
|
| 289 |
+
2025-06-25 08:29:28,995 | INFO | Starting LR test 1/10: lr=0.1800
|
| 290 |
+
2025-06-25 08:29:28,995 | INFO | Starting EoS for LR factor 100.0000
|
| 291 |
+
2025-06-25 08:29:28,995 | INFO | Starting EoS for checkpoint 010000
|
| 292 |
+
2025-06-25 08:29:28,996 | INFO | Starting EoS for model gpt2_small
|
| 293 |
+
2025-06-25 08:29:28,996 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 294 |
+
2025-06-25 08:29:28,996 | INFO | Starting EoS for num_iterations 50
|
| 295 |
+
2025-06-25 08:29:28,996 | INFO | Starting EoS for accum_steps 4
|
| 296 |
+
2025-06-25 08:29:28,996 | INFO | Loading model and checkpoint...
|
| 297 |
+
2025-06-25 08:29:29,754 | INFO | Wrapping model with DDP...
|
| 298 |
+
2025-06-25 08:29:30,234 | INFO | Loading state dict...
|
| 299 |
+
2025-06-25 08:29:30,237 | INFO | Model loaded successfully!
|
| 300 |
+
2025-06-25 08:29:37,767 | INFO | iter 000000 | lr 0.1800 | loss 0.0000 | norm 471.7844
|
| 301 |
+
2025-06-25 08:29:42,698 | INFO | iter 000001 | lr 0.1800 | loss 256.3545 | norm 1033.6785
|
| 302 |
+
2025-06-25 08:29:47,593 | INFO | iter 000002 | lr 0.1800 | loss 6480.4614 | norm 1317.1781
|
| 303 |
+
2025-06-25 08:29:52,655 | INFO | iter 000003 | lr 0.1800 | loss 707.6561 | norm 1814.0126
|
| 304 |
+
2025-06-25 08:29:57,762 | INFO | iter 000004 | lr 0.1800 | loss 4542.7993 | norm 2242.5040
|
| 305 |
+
2025-06-25 08:30:03,258 | INFO | iter 000005 | lr 0.1800 | loss 3116.1213 | norm 2655.8104
|
| 306 |
+
2025-06-25 08:30:08,366 | INFO | iter 000006 | lr 0.1800 | loss 1191.1149 | norm 3089.5643
|
| 307 |
+
2025-06-25 08:30:13,829 | INFO | iter 000007 | lr 0.1800 | loss 2736.9172 | norm 3481.7995
|
| 308 |
+
2025-06-25 08:30:19,004 | INFO | iter 000008 | lr 0.1800 | loss 2748.8972 | norm 3839.4604
|
| 309 |
+
2025-06-25 08:30:24,084 | INFO | iter 000009 | lr 0.1800 | loss 1633.2867 | norm 4181.4834
|
| 310 |
+
2025-06-25 08:30:29,379 | INFO | iter 000010 | lr 0.1800 | loss 741.7749 | norm 4519.9052
|
| 311 |
+
2025-06-25 08:30:34,505 | INFO | iter 000011 | lr 0.1800 | loss 424.5821 | norm 4851.2794
|
| 312 |
+
2025-06-25 08:30:39,626 | INFO | iter 000012 | lr 0.1800 | loss -395.4122 | norm 5162.5043
|
| 313 |
+
2025-06-25 08:30:44,815 | INFO | iter 000013 | lr 0.1800 | loss -2702.6868 | norm 5485.5912
|
| 314 |
+
2025-06-25 08:30:50,029 | INFO | iter 000014 | lr 0.1800 | loss -5335.0098 | norm 5827.3394
|
| 315 |
+
2025-06-25 08:30:55,330 | INFO | iter 000015 | lr 0.1800 | loss -6652.4419 | norm 6184.9165
|
| 316 |
+
2025-06-25 08:31:00,446 | INFO | iter 000016 | lr 0.1800 | loss -12321.6143 | norm 6558.2939
|
| 317 |
+
2025-06-25 08:31:06,042 | INFO | iter 000017 | lr 0.1800 | loss -18648.2637 | norm 6962.6391
|
| 318 |
+
2025-06-25 08:31:11,354 | INFO | iter 000018 | lr 0.1800 | loss -27954.8457 | norm 7397.7840
|
| 319 |
+
2025-06-25 08:31:16,490 | INFO | iter 000019 | lr 0.1800 | loss -33195.2930 | norm 7869.4220
|
| 320 |
+
2025-06-25 08:31:21,610 | INFO | iter 000020 | lr 0.1800 | loss -43725.9531 | norm 8378.2009
|
| 321 |
+
2025-06-25 08:31:35,131 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt
|
| 322 |
+
2025-06-25 08:31:37,075 | INFO | Loaded checkpoint with optimizer: adam
|
| 323 |
+
2025-06-25 08:31:37,075 | INFO | Current learning rate: 0.0018
|
| 324 |
+
2025-06-25 08:31:37,696 | INFO | Weight decay: 0.1
|
| 325 |
+
2025-06-25 08:31:37,696 | INFO | Epsilon: 1e-08
|
| 326 |
+
2025-06-25 08:31:37,696 | INFO | Loaded 147 first moment (m) buffers
|
| 327 |
+
2025-06-25 08:31:37,696 | INFO | Loaded 147 second moment (v) buffers
|
| 328 |
+
2025-06-25 08:31:37,696 | INFO | Optimizer state loading completed!
|
| 329 |
+
2025-06-25 08:31:39,726 | INFO | Initialized xs with norm: 1.273655
|
| 330 |
+
2025-06-25 08:31:39,743 | INFO | -------------------------------- EoS --------------------------------
|
| 331 |
+
2025-06-25 08:31:39,743 | INFO | Starting LR test 1/10: lr=18.0000
|
| 332 |
+
2025-06-25 08:31:39,743 | INFO | Starting EoS for LR factor 10000.0000
|
| 333 |
+
2025-06-25 08:31:39,743 | INFO | Starting EoS for checkpoint 010000
|
| 334 |
+
2025-06-25 08:31:39,743 | INFO | Starting EoS for model gpt2_small
|
| 335 |
+
2025-06-25 08:31:39,743 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin
|
| 336 |
+
2025-06-25 08:31:39,743 | INFO | Starting EoS for num_iterations 50
|
| 337 |
+
2025-06-25 08:31:39,743 | INFO | Starting EoS for accum_steps 4
|
| 338 |
+
2025-06-25 08:31:39,743 | INFO | Loading model and checkpoint...
|
| 339 |
+
2025-06-25 08:31:40,508 | INFO | Wrapping model with DDP...
|
| 340 |
+
2025-06-25 08:31:40,770 | INFO | Loading state dict...
|
| 341 |
+
2025-06-25 08:31:40,773 | INFO | Model loaded successfully!
|
| 342 |
+
2025-06-25 08:31:48,938 | INFO | iter 000000 | lr 18.0000 | loss 0.0000 | norm 47178.7107
|
| 343 |
+
2025-06-25 08:31:54,180 | INFO | iter 000001 | lr 18.0000 | loss 2579423.0000 | norm 105552.8516
|
| 344 |
+
2025-06-25 08:31:59,433 | INFO | iter 000002 | lr 18.0000 | loss 111763472.0000 | norm 135456.5704
|
| 345 |
+
2025-06-25 08:32:04,746 | INFO | iter 000003 | lr 18.0000 | loss 276864480.0000 | norm 133713.0656
|
| 346 |
+
2025-06-25 08:32:09,937 | INFO | iter 000004 | lr 18.0000 | loss 320959104.0000 | norm 132714.9397
|
| 347 |
+
2025-06-25 08:32:15,243 | INFO | iter 000005 | lr 18.0000 | loss 336375200.0000 | norm 124895.1838
|
| 348 |
+
2025-06-25 08:32:20,604 | INFO | iter 000006 | lr 18.0000 | loss 306018016.0000 | norm 117796.8739
|
| 349 |
+
2025-06-25 08:32:25,779 | INFO | iter 000007 | lr 18.0000 | loss 267033840.0000 | norm 108461.6875
|
| 350 |
+
2025-06-25 08:32:31,373 | INFO | iter 000008 | lr 18.0000 | loss 243656960.0000 | norm 102025.2348
|
| 351 |
+
2025-06-25 08:32:36,467 | INFO | iter 000009 | lr 18.0000 | loss 221454464.0000 | norm 93885.3865
|
| 352 |
+
2025-06-25 08:32:41,725 | INFO | iter 000010 | lr 18.0000 | loss 185239232.0000 | norm 87718.0808
|
| 353 |
+
2025-06-25 08:32:47,036 | INFO | iter 000011 | lr 18.0000 | loss 154752192.0000 | norm 80196.6753
|
| 354 |
+
2025-06-25 08:32:52,225 | INFO | iter 000012 | lr 18.0000 | loss 132501480.0000 | norm 77225.2747
|
| 355 |
+
2025-06-25 08:32:57,474 | INFO | iter 000013 | lr 18.0000 | loss 119106048.0000 | norm 68886.8834
|
| 356 |
+
2025-06-25 08:33:03,295 | INFO | iter 000014 | lr 18.0000 | loss 97062160.0000 | norm 65299.5486
|
| 357 |
+
2025-06-25 08:33:08,516 | INFO | iter 000015 | lr 18.0000 | loss 79378960.0000 | norm 59360.0908
|
| 358 |
+
2025-06-25 08:33:13,789 | INFO | iter 000016 | lr 18.0000 | loss 68932920.0000 | norm 56258.6832
|
| 359 |
+
2025-06-25 08:33:19,001 | INFO | iter 000017 | lr 18.0000 | loss 61816232.0000 | norm 51914.3800
|
| 360 |
+
2025-06-25 08:33:24,459 | INFO | iter 000018 | lr 18.0000 | loss 52157472.0000 | norm 49234.2090
|
| 361 |
+
2025-06-25 08:33:30,191 | INFO | iter 000019 | lr 18.0000 | loss 42893376.0000 | norm 45645.4134
|
| 362 |
+
2025-06-25 08:33:35,359 | INFO | iter 000020 | lr 18.0000 | loss 39586664.0000 | norm 43603.7019
|
| 363 |
+
2025-06-25 08:33:40,646 | INFO | iter 000021 | lr 18.0000 | loss 35820804.0000 | norm 40888.9978
|
| 364 |
+
2025-06-25 08:33:45,894 | INFO | iter 000022 | lr 18.0000 | loss 30331428.0000 | norm 39151.8028
|
| 365 |
+
2025-06-25 08:33:51,024 | INFO | iter 000023 | lr 18.0000 | loss 26849960.0000 | norm 37251.1340
|
| 366 |
+
2025-06-25 08:33:56,173 | INFO | iter 000024 | lr 18.0000 | loss 25112784.0000 | norm 36166.3473
|
| 367 |
+
2025-06-25 08:34:01,635 | INFO | iter 000025 | lr 18.0000 | loss 23161016.0000 | norm 34488.3613
|
| 368 |
+
2025-06-25 08:34:06,878 | INFO | iter 000026 | lr 18.0000 | loss 21752610.0000 | norm 33774.8960
|
| 369 |
+
2025-06-25 08:34:12,170 | INFO | iter 000027 | lr 18.0000 | loss 20102200.0000 | norm 32634.3167
|
| 370 |
+
2025-06-25 08:34:17,332 | INFO | iter 000028 | lr 18.0000 | loss 20159082.0000 | norm 32413.1788
|
| 371 |
+
2025-06-25 08:34:22,621 | INFO | iter 000029 | lr 18.0000 | loss 20947662.0000 | norm 32105.9349
|
| 372 |
+
2025-06-25 08:34:27,921 | INFO | iter 000030 | lr 18.0000 | loss 19995154.0000 | norm 33069.7203
|
| 373 |
+
2025-06-25 08:34:33,805 | INFO | iter 000031 | lr 18.0000 | loss 20508292.0000 | norm 32105.0565
|
| 374 |
+
2025-06-25 08:34:39,056 | INFO | iter 000032 | lr 18.0000 | loss 21091504.0000 | norm 32818.1221
|
| 375 |
+
2025-06-25 08:34:44,173 | INFO | iter 000033 | lr 18.0000 | loss 22435536.0000 | norm 32885.5579
|
| 376 |
+
2025-06-25 08:34:49,412 | INFO | iter 000034 | lr 18.0000 | loss 22536620.0000 | norm 33441.7257
|
| 377 |
+
2025-06-25 08:34:54,548 | INFO | iter 000035 | lr 18.0000 | loss 23596192.0000 | norm 33815.4563
|
| 378 |
+
2025-06-25 08:34:59,711 | INFO | iter 000036 | lr 18.0000 | loss 25515312.0000 | norm 34967.7110
|
| 379 |
+
2025-06-25 08:35:04,972 | INFO | iter 000037 | lr 18.0000 | loss 27476122.0000 | norm 35625.2597
|
| 380 |
+
2025-06-25 08:35:10,098 | INFO | iter 000038 | lr 18.0000 | loss 29265056.0000 | norm 36547.8041
|
| 381 |
+
2025-06-25 08:35:15,160 | INFO | iter 000039 | lr 18.0000 | loss 31501882.0000 | norm 37360.1106
|
| 382 |
+
2025-06-25 08:35:20,403 | INFO | iter 000040 | lr 18.0000 | loss 33444498.0000 | norm 38455.2587
|
| 383 |
+
2025-06-25 08:35:25,428 | INFO | iter 000041 | lr 18.0000 | loss 35255556.0000 | norm 39234.6789
|
| 384 |
+
2025-06-25 08:35:31,052 | INFO | iter 000042 | lr 18.0000 | loss 36108252.0000 | norm 40022.0439
|
| 385 |
+
2025-06-25 08:35:36,298 | INFO | iter 000043 | lr 18.0000 | loss 40793144.0000 | norm 41435.8015
|
| 386 |
+
2025-06-25 08:35:41,428 | INFO | iter 000044 | lr 18.0000 | loss 40548568.0000 | norm 41333.9848
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_000000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12623cbe522c1e7a4e735fc1067e87afff8187ac246afb4dba2caad567fffb0e
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_002000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01c656090c88cf1d156a4a323f0c8f4eb5d418043b6aa18ff6745a0a58a6d000
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_003000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63646361ad0ece816c247161cf7978270f56d7739e29ab5af1169d0bef798d3e
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_004000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a51b112a9e09695144c5d1072b1e368431f3d522e7db2c8e95d4e05d5a3fee7f
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_005000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fefc324e20a91a5de78a3313f19a003cf2f274abe56f6d08944c319fc715520
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_007000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d180f12608316931efe85f49ff95f7f6fa91364bc6672244bd094a069646f7ef
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_008000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25017e80be486ae3b5dbbb781de85c121e8d2d88f908581c97dfe988b3730084
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_009000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2543f607872dbdd3c04424f6d6d02b415b80cfd19f43b8f134b1cc8496c70cd3
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_010000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90ddbe1ada352f48db260bc8c6d74f792303e93eb1160184ae995f2335b7c88e
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_011000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d4eaf93dc3d53ab7442bafd01c362dc03e2eee7fe304baa0918f3543a2415d9
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_012000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca6cfdb6bde4442e646552ed717969483e3e24394674bd3d6cc8b1b67529c1aa
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_013000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:981221565146678811b2a7f34a61f973fb9c0f92583119d993bfdfea2d05c0dd
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_014000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f4ef37fffcb4fe27f7bed035fadcfdb90af73c77676d69e20d0a1fa1e1efe85
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_015000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13445b8a5bc5476cc9805b3ef65eb2b4237fcbe39361704c64a775661f726252
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_016000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9dbbcc9daba97cbcdabde080086f9b41f275b4342e0b4892bca7adfcd4f573ae
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_017000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0bee60122d717616ce4b40fd27948e3db7503784caa6b6f579ded5a34fce8cb7
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_019000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2efad723a74e72ca5b5e4d8792afd4c8b3ef2567e075350dd3cd5f44f2b66121
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_020000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:afb0c7371eb7b09cc13faa342fee7fd484dde2516baa90f9539695420d16738b
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_25001.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92ede13b57abfda1ebab3d0bb1091e5df05106498fe4a993c0b607bddd954657
|
| 3 |
+
size 988816320
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_000000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59d09284853ab69674c8e10e863ce85717ddc0ea15f1a80fa0122ec48913521e
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_002000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:985bf2b31f814aa1642190a5258c516511c660d45371fc995ac4944a453fc4ec
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_003000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf9c1bf4d9b3a9e1c570be899f9141bb934ecf325e0708496f455a7518c9ff06
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_006000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:adde10dc5829f7236b67eb63ab8f0ed6d58ecfb0fb17deee3cdadf01949879d3
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_007000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dffb6249ddca4645632f62a4f38ef97fdb00f775f1b41568d0d35a644e8c99a3
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_008000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9b28451924e18d20c8691bdda04c00516426541675534bb3fd1fc036d0d7940
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_009000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b90a3953ceb60cd88cd4519a01e4d76c4dbab205ecb7c54dd084ed0f3ef72c79
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_010000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd19034279aa37701ee63a9ef80d1c6bcd4e040dfc2b93d9f02050cc704cd96f
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_011000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b78860a70fe8718833963ca3879abc0d0b0645edabc7832de26df64a922f6b1d
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_012000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e534e382e6d665965f8fbb8b7435bb52eaaba18e95a6df95cae4a5aa4bf56a3
|
| 3 |
+
size 1297616507
|
fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_013000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1d6da5d86436c34b53dba6c5726c8a79af0780ded43fba94632b58c757f8c8a
|
| 3 |
+
size 1297616507
|