diff --git a/.gitattributes b/.gitattributes index 77a4a498fb2e8a3604a2ea42e25d9341da71afd1..4d5d6025c4d36ac55ba720769dc11e8de83a88f0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -172,3 +172,8 @@ fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_ fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_015000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text +fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text +fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis_ori.png filter=lfs diff=lfs merge=lfs -text +fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis.png filter=lfs diff=lfs merge=lfs -text +fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.png filter=lfs diff=lfs merge=lfs -text +fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.png filter=lfs diff=lfs merge=lfs -text diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/config.json b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..db1f5936ef16800a86f026eb1243e8ec6b36b74c --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/config.json @@ -0,0 +1,14 @@ +{ + "model_name": "gpt2_small", + "factor_min": 0.6, + "factor_max": 1.5, + "factor_num": 10, + "error": 0.0001, + "accum_steps": 4, + "num_iterations": 50, + "num_checkpoint": 1000, + "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin", + "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536", + "timestamp": "250622_035242", + "raw": false +} \ No newline at end of file diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/losses_lr.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/losses_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..486594b70be2ec351194d9af968eb09b348b9c7d Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/losses_lr.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..488b5a6fefa74a04fa869333a2041d3aabef368a Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr_iter.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr_iter.png new file mode 100644 index 0000000000000000000000000000000000000000..424b89f9a3187ec43b46993ea14bbdc7ce4c6cce Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/norms_lr_iter.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/training.log b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/training.log new file mode 100644 index 0000000000000000000000000000000000000000..7ac0dead991ba7f060d736618c1fbf143b538703 --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_001000/training.log @@ -0,0 +1,253 @@ +2025-06-25 06:40:04,162 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt +2025-06-25 06:42:00,983 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt +2025-06-25 06:42:02,949 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 06:42:02,949 | INFO | Current learning rate: 0.0018 +2025-06-25 06:42:03,551 | INFO | Weight decay: 0.1 +2025-06-25 06:42:03,551 | INFO | Epsilon: 1e-08 +2025-06-25 06:42:03,551 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 06:42:03,551 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 06:42:03,551 | INFO | Optimizer state loading completed! +2025-06-25 06:42:05,486 | INFO | Initialized xs with norm: 1.273417 +2025-06-25 06:42:05,497 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:42:05,497 | INFO | Starting LR test 1/10: lr=0.0011 +2025-06-25 06:42:05,497 | INFO | Starting EoS for LR factor 0.6000 +2025-06-25 06:42:05,497 | INFO | Starting EoS for checkpoint 001000 +2025-06-25 06:42:05,497 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:42:05,497 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:42:05,497 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:42:05,497 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:42:05,497 | INFO | Loading model and checkpoint... +2025-06-25 06:42:06,378 | INFO | Wrapping model with DDP... +2025-06-25 06:42:06,406 | INFO | Loading state dict... +2025-06-25 06:42:06,410 | INFO | Model loaded successfully! +2025-06-25 06:42:13,594 | INFO | iter 000000 | lr 0.0011 | loss 4.3392 | norm 13.7445 +2025-06-25 06:42:18,961 | INFO | iter 000001 | lr 0.0011 | loss 4.4330 | norm 27.2540 +2025-06-25 06:42:24,075 | INFO | iter 000002 | lr 0.0011 | loss 4.3832 | norm 40.6489 +2025-06-25 06:42:29,377 | INFO | iter 000003 | lr 0.0011 | loss 4.3913 | norm 53.8958 +2025-06-25 06:42:34,634 | INFO | iter 000004 | lr 0.0011 | loss 4.3451 | norm 66.9831 +2025-06-25 06:42:39,893 | INFO | iter 000005 | lr 0.0011 | loss 4.4464 | norm 79.9053 +2025-06-25 06:42:45,174 | INFO | iter 000006 | lr 0.0011 | loss 4.4459 | norm 92.6583 +2025-06-25 06:42:50,203 | INFO | iter 000007 | lr 0.0011 | loss 4.4652 | norm 105.2383 +2025-06-25 06:42:55,371 | INFO | iter 000008 | lr 0.0011 | loss 4.4837 | norm 117.6420 +2025-06-25 06:43:00,884 | INFO | iter 000009 | lr 0.0011 | loss 4.5257 | norm 129.8671 +2025-06-25 06:43:06,419 | INFO | iter 000010 | lr 0.0011 | loss 4.5709 | norm 141.9121 +2025-06-25 06:43:11,725 | INFO | iter 000011 | lr 0.0011 | loss 4.5868 | norm 153.7757 +2025-06-25 06:43:16,715 | INFO | iter 000012 | lr 0.0011 | loss 4.6285 | norm 165.4574 +2025-06-25 06:43:21,934 | INFO | iter 000013 | lr 0.0011 | loss 4.4997 | norm 176.9570 +2025-06-25 06:43:27,054 | INFO | iter 000014 | lr 0.0011 | loss 4.8695 | norm 188.2745 +2025-06-25 06:43:32,694 | INFO | iter 000015 | lr 0.0011 | loss 4.8345 | norm 199.4108 +2025-06-25 06:43:37,757 | INFO | iter 000016 | lr 0.0011 | loss 4.7835 | norm 210.3668 +2025-06-25 06:43:42,881 | INFO | iter 000017 | lr 0.0011 | loss 4.6182 | norm 221.1439 +2025-06-25 06:43:48,183 | INFO | iter 000018 | lr 0.0011 | loss 4.9529 | norm 231.7438 +2025-06-25 06:43:53,391 | INFO | iter 000019 | lr 0.0011 | loss 4.7017 | norm 242.1687 +2025-06-25 06:43:58,710 | INFO | iter 000020 | lr 0.0011 | loss 4.7435 | norm 252.4208 +2025-06-25 06:44:04,463 | INFO | iter 000021 | lr 0.0011 | loss 4.7189 | norm 262.5029 +2025-06-25 06:44:09,748 | INFO | iter 000022 | lr 0.0011 | loss 4.8201 | norm 272.4178 +2025-06-25 06:44:14,874 | INFO | iter 000023 | lr 0.0011 | loss 4.5389 | norm 282.1686 +2025-06-25 06:44:20,048 | INFO | iter 000024 | lr 0.0011 | loss 4.6885 | norm 291.7586 +2025-06-25 06:44:25,212 | INFO | iter 000025 | lr 0.0011 | loss 4.6047 | norm 301.1913 +2025-06-25 06:44:30,907 | INFO | iter 000026 | lr 0.0011 | loss 4.8046 | norm 310.4704 +2025-06-25 06:44:36,109 | INFO | iter 000027 | lr 0.0011 | loss 4.8286 | norm 319.5996 +2025-06-25 06:44:41,223 | INFO | iter 000028 | lr 0.0011 | loss 4.4733 | norm 328.5828 +2025-06-25 06:44:46,307 | INFO | iter 000029 | lr 0.0011 | loss 4.7715 | norm 337.4237 +2025-06-25 06:44:51,454 | INFO | iter 000030 | lr 0.0011 | loss 4.3984 | norm 346.1263 +2025-06-25 06:44:56,725 | INFO | iter 000031 | lr 0.0011 | loss 4.5874 | norm 354.6947 +2025-06-25 06:45:02,103 | INFO | iter 000032 | lr 0.0011 | loss 4.2035 | norm 363.1331 +2025-06-25 06:45:07,293 | INFO | iter 000033 | lr 0.0011 | loss 3.9937 | norm 371.4456 +2025-06-25 06:45:12,413 | INFO | iter 000034 | lr 0.0011 | loss 4.2272 | norm 379.6361 +2025-06-25 06:45:17,562 | INFO | iter 000035 | lr 0.0011 | loss 4.0314 | norm 387.7087 +2025-06-25 06:45:22,574 | INFO | iter 000036 | lr 0.0011 | loss 4.0062 | norm 395.6674 +2025-06-25 06:45:27,804 | INFO | iter 000037 | lr 0.0011 | loss 4.2448 | norm 403.5161 +2025-06-25 06:45:33,292 | INFO | iter 000038 | lr 0.0011 | loss 3.5407 | norm 411.2590 +2025-06-25 06:45:38,469 | INFO | iter 000039 | lr 0.0011 | loss 3.3566 | norm 418.8998 +2025-06-25 06:45:43,663 | INFO | iter 000040 | lr 0.0011 | loss 3.4557 | norm 426.4421 +2025-06-25 06:45:48,847 | INFO | iter 000041 | lr 0.0011 | loss 3.3656 | norm 433.8899 +2025-06-25 06:45:54,209 | INFO | iter 000042 | lr 0.0011 | loss 2.9874 | norm 441.2469 +2025-06-25 06:45:59,878 | INFO | iter 000043 | lr 0.0011 | loss 2.7940 | norm 448.5164 +2025-06-25 06:46:05,083 | INFO | iter 000044 | lr 0.0011 | loss 2.7455 | norm 455.7017 +2025-06-25 06:46:10,201 | INFO | iter 000045 | lr 0.0011 | loss 2.6695 | norm 462.8061 +2025-06-25 06:46:15,556 | INFO | iter 000046 | lr 0.0011 | loss 2.5043 | norm 469.8330 +2025-06-25 06:46:20,540 | INFO | iter 000047 | lr 0.0011 | loss 2.3519 | norm 476.7853 +2025-06-25 06:46:25,531 | INFO | iter 000048 | lr 0.0011 | loss 1.8817 | norm 483.6662 +2025-06-25 06:46:31,412 | INFO | iter 000049 | lr 0.0011 | loss 1.3600 | norm 490.4789 +2025-06-25 06:46:31,413 | INFO | Completed LR test 1/10: lr=0.0011 +2025-06-25 06:46:31,437 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:46:31,437 | INFO | Starting LR test 2/10: lr=0.0018 +2025-06-25 06:46:31,437 | INFO | Starting EoS for LR factor 1.0000 +2025-06-25 06:46:31,437 | INFO | Starting EoS for checkpoint 001000 +2025-06-25 06:46:31,437 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:46:31,437 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:46:31,437 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:46:31,437 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:46:31,437 | INFO | Loading model and checkpoint... +2025-06-25 06:46:32,149 | INFO | Wrapping model with DDP... +2025-06-25 06:46:32,538 | INFO | Loading state dict... +2025-06-25 06:46:32,542 | INFO | Model loaded successfully! +2025-06-25 06:46:39,146 | INFO | iter 000000 | lr 0.0018 | loss 4.3392 | norm 22.8459 +2025-06-25 06:46:44,139 | INFO | iter 000001 | lr 0.0018 | loss 4.4466 | norm 45.1634 +2025-06-25 06:46:49,296 | INFO | iter 000002 | lr 0.0018 | loss 4.4217 | norm 67.1257 +2025-06-25 06:46:54,367 | INFO | iter 000003 | lr 0.0018 | loss 4.4527 | norm 88.6757 +2025-06-25 06:46:59,621 | INFO | iter 000004 | lr 0.0018 | loss 4.4288 | norm 109.7928 +2025-06-25 06:47:05,068 | INFO | iter 000005 | lr 0.0018 | loss 4.5626 | norm 130.4703 +2025-06-25 06:47:10,277 | INFO | iter 000006 | lr 0.0018 | loss 4.5940 | norm 150.7033 +2025-06-25 06:47:15,415 | INFO | iter 000007 | lr 0.0018 | loss 4.6409 | norm 170.4862 +2025-06-25 06:47:20,565 | INFO | iter 000008 | lr 0.0018 | loss 4.6550 | norm 189.8151 +2025-06-25 06:47:25,896 | INFO | iter 000009 | lr 0.0018 | loss 4.7106 | norm 208.6894 +2025-06-25 06:47:31,787 | INFO | iter 000010 | lr 0.0018 | loss 4.7663 | norm 227.1111 +2025-06-25 06:47:37,142 | INFO | iter 000011 | lr 0.0018 | loss 4.7738 | norm 245.0833 +2025-06-25 06:47:42,360 | INFO | iter 000012 | lr 0.0018 | loss 4.7983 | norm 262.6107 +2025-06-25 06:47:47,575 | INFO | iter 000013 | lr 0.0018 | loss 4.6526 | norm 279.6987 +2025-06-25 06:47:52,603 | INFO | iter 000014 | lr 0.0018 | loss 5.0270 | norm 296.3541 +2025-06-25 06:47:57,840 | INFO | iter 000015 | lr 0.0018 | loss 4.9013 | norm 312.5854 +2025-06-25 06:48:03,561 | INFO | iter 000016 | lr 0.0018 | loss 4.8465 | norm 328.4014 +2025-06-25 06:48:08,832 | INFO | iter 000017 | lr 0.0018 | loss 4.5672 | norm 343.8123 +2025-06-25 06:48:13,857 | INFO | iter 000018 | lr 0.0018 | loss 4.7794 | norm 358.8289 +2025-06-25 06:48:19,012 | INFO | iter 000019 | lr 0.0018 | loss 4.4661 | norm 373.4629 +2025-06-25 06:48:24,132 | INFO | iter 000020 | lr 0.0018 | loss 4.5023 | norm 387.7265 +2025-06-25 06:48:29,868 | INFO | iter 000021 | lr 0.0018 | loss 4.2938 | norm 401.6327 +2025-06-25 06:48:35,428 | INFO | iter 000022 | lr 0.0018 | loss 4.3684 | norm 415.1946 +2025-06-25 06:48:40,742 | INFO | iter 000023 | lr 0.0018 | loss 3.8490 | norm 428.4261 +2025-06-25 06:48:45,938 | INFO | iter 000024 | lr 0.0018 | loss 4.0617 | norm 441.3406 +2025-06-25 06:48:51,129 | INFO | iter 000025 | lr 0.0018 | loss 3.6068 | norm 453.9526 +2025-06-25 06:48:56,489 | INFO | iter 000026 | lr 0.0018 | loss 4.0782 | norm 466.2762 +2025-06-25 06:49:02,061 | INFO | iter 000027 | lr 0.0018 | loss 3.8365 | norm 478.3254 +2025-06-25 06:49:07,402 | INFO | iter 000028 | lr 0.0018 | loss 2.9535 | norm 490.1142 +2025-06-25 06:49:12,466 | INFO | iter 000029 | lr 0.0018 | loss 3.1557 | norm 501.6562 +2025-06-25 06:49:17,583 | INFO | iter 000030 | lr 0.0018 | loss 2.7154 | norm 512.9648 +2025-06-25 06:49:22,770 | INFO | iter 000031 | lr 0.0018 | loss 2.8398 | norm 524.0536 +2025-06-25 06:49:27,964 | INFO | iter 000032 | lr 0.0018 | loss 1.6209 | norm 534.9364 +2025-06-25 06:49:33,586 | INFO | iter 000033 | lr 0.0018 | loss 0.8802 | norm 545.6261 +2025-06-25 06:49:38,805 | INFO | iter 000034 | lr 0.0018 | loss 1.5411 | norm 556.1344 +2025-06-25 06:49:44,031 | INFO | iter 000035 | lr 0.0018 | loss 0.7415 | norm 566.4735 +2025-06-25 06:49:49,298 | INFO | iter 000036 | lr 0.0018 | loss 0.3749 | norm 576.6551 +2025-06-25 06:49:54,505 | INFO | iter 000037 | lr 0.0018 | loss 0.9944 | norm 586.6894 +2025-06-25 06:50:00,198 | INFO | iter 000038 | lr 0.0018 | loss -1.2345 | norm 596.5885 +2025-06-25 06:50:05,762 | INFO | iter 000039 | lr 0.0018 | loss -1.4369 | norm 606.3619 +2025-06-25 06:50:10,887 | INFO | iter 000040 | lr 0.0018 | loss -1.5657 | norm 616.0191 +2025-06-25 06:50:16,128 | INFO | iter 000041 | lr 0.0018 | loss -2.0936 | norm 625.5695 +2025-06-25 06:50:21,496 | INFO | iter 000042 | lr 0.0018 | loss -3.7056 | norm 635.0224 +2025-06-25 06:50:26,613 | INFO | iter 000043 | lr 0.0018 | loss -3.8761 | norm 644.3855 +2025-06-25 06:50:32,217 | INFO | iter 000044 | lr 0.0018 | loss -4.3276 | norm 653.6660 +2025-06-25 06:50:37,522 | INFO | iter 000045 | lr 0.0018 | loss -4.6411 | norm 662.8710 +2025-06-25 06:50:42,783 | INFO | iter 000046 | lr 0.0018 | loss -5.7818 | norm 672.0079 +2025-06-25 06:50:47,790 | INFO | iter 000047 | lr 0.0018 | loss -5.7992 | norm 681.0823 +2025-06-25 06:50:52,994 | INFO | iter 000048 | lr 0.0018 | loss -7.2470 | norm 690.1007 +2025-06-25 06:50:58,040 | INFO | iter 000049 | lr 0.0018 | loss -9.4092 | norm 699.0692 +2025-06-25 06:50:58,041 | INFO | Completed LR test 2/10: lr=0.0018 +2025-06-25 06:50:58,068 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:50:58,068 | INFO | Starting LR test 3/10: lr=0.0025 +2025-06-25 06:50:58,068 | INFO | Starting EoS for LR factor 1.4000 +2025-06-25 06:50:58,068 | INFO | Starting EoS for checkpoint 001000 +2025-06-25 06:50:58,068 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:50:58,068 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:50:58,068 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:50:58,068 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:50:58,068 | INFO | Loading model and checkpoint... +2025-06-25 06:50:58,772 | INFO | Wrapping model with DDP... +2025-06-25 06:50:59,088 | INFO | Loading state dict... +2025-06-25 06:50:59,091 | INFO | Model loaded successfully! +2025-06-25 06:51:05,754 | INFO | iter 000000 | lr 0.0025 | loss 4.3392 | norm 31.9607 +2025-06-25 06:51:10,755 | INFO | iter 000001 | lr 0.0025 | loss 4.4680 | norm 62.8866 +2025-06-25 06:51:15,818 | INFO | iter 000002 | lr 0.0025 | loss 4.4710 | norm 93.1217 +2025-06-25 06:51:20,907 | INFO | iter 000003 | lr 0.0025 | loss 4.5262 | norm 122.5640 +2025-06-25 06:51:26,040 | INFO | iter 000004 | lr 0.0025 | loss 4.5248 | norm 151.1809 +2025-06-25 06:51:31,812 | INFO | iter 000005 | lr 0.0025 | loss 4.6891 | norm 178.9703 +2025-06-25 06:51:37,067 | INFO | iter 000006 | lr 0.0025 | loss 4.7442 | norm 205.9325 +2025-06-25 06:51:42,230 | INFO | iter 000007 | lr 0.0025 | loss 4.8070 | norm 232.0657 +2025-06-25 06:51:47,272 | INFO | iter 000008 | lr 0.0025 | loss 4.7906 | norm 257.3720 +2025-06-25 06:51:52,535 | INFO | iter 000009 | lr 0.0025 | loss 4.8344 | norm 281.8610 +2025-06-25 06:51:57,924 | INFO | iter 000010 | lr 0.0025 | loss 4.8678 | norm 305.5477 +2025-06-25 06:52:03,450 | INFO | iter 000011 | lr 0.0025 | loss 4.8304 | norm 328.4503 +2025-06-25 06:52:08,629 | INFO | iter 000012 | lr 0.0025 | loss 4.7894 | norm 350.5884 +2025-06-25 06:52:13,898 | INFO | iter 000013 | lr 0.0025 | loss 4.5864 | norm 371.9838 +2025-06-25 06:52:19,143 | INFO | iter 000014 | lr 0.0025 | loss 4.9290 | norm 392.6606 +2025-06-25 06:52:24,307 | INFO | iter 000015 | lr 0.0025 | loss 4.6104 | norm 412.6456 +2025-06-25 06:52:29,763 | INFO | iter 000016 | lr 0.0025 | loss 4.4951 | norm 431.9664 +2025-06-25 06:52:34,877 | INFO | iter 000017 | lr 0.0025 | loss 4.0138 | norm 450.6518 +2025-06-25 06:52:40,147 | INFO | iter 000018 | lr 0.0025 | loss 3.9656 | norm 468.7319 +2025-06-25 06:52:45,265 | INFO | iter 000019 | lr 0.0025 | loss 3.5134 | norm 486.2377 +2025-06-25 06:52:50,435 | INFO | iter 000020 | lr 0.0025 | loss 3.4981 | norm 503.2006 +2025-06-25 06:52:55,571 | INFO | iter 000021 | lr 0.0025 | loss 3.0034 | norm 519.6526 +2025-06-25 06:53:01,174 | INFO | iter 000022 | lr 0.0025 | loss 3.0086 | norm 535.6248 +2025-06-25 06:53:06,599 | INFO | iter 000023 | lr 0.0025 | loss 2.0425 | norm 551.1493 +2025-06-25 06:53:11,956 | INFO | iter 000024 | lr 0.0025 | loss 2.3420 | norm 566.2564 +2025-06-25 06:53:17,132 | INFO | iter 000025 | lr 0.0025 | loss 1.2484 | norm 580.9775 +2025-06-25 06:53:22,381 | INFO | iter 000026 | lr 0.0025 | loss 1.9663 | norm 595.3424 +2025-06-25 06:53:27,583 | INFO | iter 000027 | lr 0.0025 | loss 1.4790 | norm 609.3793 +2025-06-25 06:53:33,275 | INFO | iter 000028 | lr 0.0025 | loss -0.2816 | norm 623.1166 +2025-06-25 06:53:38,353 | INFO | iter 000029 | lr 0.0025 | loss -0.2689 | norm 636.5806 +2025-06-25 06:53:43,494 | INFO | iter 000030 | lr 0.0025 | loss -0.8742 | norm 649.7966 +2025-06-25 06:53:48,723 | INFO | iter 000031 | lr 0.0025 | loss -0.8715 | norm 662.7894 +2025-06-25 06:59:15,808 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_001000.pt +2025-06-25 06:59:17,627 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 06:59:17,627 | INFO | Current learning rate: 0.0018 +2025-06-25 06:59:18,250 | INFO | Weight decay: 0.1 +2025-06-25 06:59:18,250 | INFO | Epsilon: 1e-08 +2025-06-25 06:59:18,251 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 06:59:18,251 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 06:59:18,251 | INFO | Optimizer state loading completed! +2025-06-25 06:59:20,150 | INFO | Initialized xs with norm: 1.273537 +2025-06-25 06:59:20,157 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:59:20,157 | INFO | Starting LR test 1/10: lr=0.0025 +2025-06-25 06:59:20,157 | INFO | Starting EoS for LR factor 1.4000 +2025-06-25 06:59:20,158 | INFO | Starting EoS for checkpoint 001000 +2025-06-25 06:59:20,158 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:59:20,158 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:59:20,158 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:59:20,158 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:59:20,158 | INFO | Loading model and checkpoint... +2025-06-25 06:59:21,280 | INFO | Wrapping model with DDP... +2025-06-25 06:59:21,347 | INFO | Loading state dict... +2025-06-25 06:59:21,350 | INFO | Model loaded successfully! +2025-06-25 06:59:29,329 | INFO | iter 000000 | lr 0.0025 | loss 4.3393 | norm 15.4976 +2025-06-25 06:59:34,563 | INFO | iter 000001 | lr 0.0025 | loss 5.0174 | norm 26.2033 +2025-06-25 06:59:39,547 | INFO | iter 000002 | lr 0.0025 | loss 7.0303 | norm 34.2732 +2025-06-25 06:59:44,649 | INFO | iter 000003 | lr 0.0025 | loss 4.8583 | norm 42.0572 +2025-06-25 06:59:49,612 | INFO | iter 000004 | lr 0.0025 | loss 5.5988 | norm 48.8932 +2025-06-25 06:59:54,632 | INFO | iter 000005 | lr 0.0025 | loss 4.8926 | norm 55.2742 +2025-06-25 07:00:00,343 | INFO | iter 000006 | lr 0.0025 | loss 4.0337 | norm 61.4189 +2025-06-25 07:00:05,824 | INFO | iter 000007 | lr 0.0025 | loss 3.8261 | norm 67.2519 +2025-06-25 07:00:10,921 | INFO | iter 000008 | lr 0.0025 | loss 1.5497 | norm 73.1772 +2025-06-25 07:00:15,950 | INFO | iter 000009 | lr 0.0025 | loss -0.9868 | norm 79.3588 +2025-06-25 07:00:21,158 | INFO | iter 000010 | lr 0.0025 | loss -4.7045 | norm 85.9307 +2025-06-25 07:00:26,356 | INFO | iter 000011 | lr 0.0025 | loss -9.1555 | norm 92.9646 +2025-06-25 07:00:32,012 | INFO | iter 000012 | lr 0.0025 | loss -18.3042 | norm 100.6413 +2025-06-25 07:00:37,134 | INFO | iter 000013 | lr 0.0025 | loss -24.4566 | norm 108.9797 +2025-06-25 07:00:42,361 | INFO | iter 000014 | lr 0.0025 | loss -34.7879 | norm 117.8907 +2025-06-25 07:00:47,543 | INFO | iter 000015 | lr 0.0025 | loss -56.0771 | norm 127.5276 +2025-06-25 07:00:52,587 | INFO | iter 000016 | lr 0.0025 | loss -74.3452 | norm 137.8423 +2025-06-25 07:00:57,843 | INFO | iter 000017 | lr 0.0025 | loss -97.3003 | norm 148.7922 +2025-06-25 07:01:03,546 | INFO | iter 000018 | lr 0.0025 | loss -128.8613 | norm 160.3333 +2025-06-25 07:01:08,639 | INFO | iter 000019 | lr 0.0025 | loss -153.6679 | norm 172.4106 +2025-06-25 07:01:13,817 | INFO | iter 000020 | lr 0.0025 | loss -177.9772 | norm 184.8852 +2025-06-25 07:01:19,038 | INFO | iter 000021 | lr 0.0025 | loss -212.6209 | norm 197.7662 +2025-06-25 07:01:24,099 | INFO | iter 000022 | lr 0.0025 | loss -228.8878 | norm 210.9760 +2025-06-25 07:01:29,523 | INFO | iter 000023 | lr 0.0025 | loss -302.6816 | norm 224.5441 +2025-06-25 07:01:34,575 | INFO | iter 000024 | lr 0.0025 | loss -313.9990 | norm 238.3132 +2025-06-25 07:01:39,718 | INFO | iter 000025 | lr 0.0025 | loss -412.0068 | norm 252.3185 +2025-06-25 07:01:44,741 | INFO | iter 000026 | lr 0.0025 | loss -436.6941 | norm 266.0395 +2025-06-25 07:01:50,024 | INFO | iter 000027 | lr 0.0025 | loss -440.2226 | norm 279.5380 +2025-06-25 07:01:55,265 | INFO | iter 000028 | lr 0.0025 | loss -601.8568 | norm 293.2904 +2025-06-25 07:02:00,592 | INFO | iter 000029 | lr 0.0025 | loss -676.1694 | norm 307.3040 +2025-06-25 07:02:05,933 | INFO | iter 000030 | lr 0.0025 | loss -704.9308 | norm 321.5338 +2025-06-25 07:02:11,124 | INFO | iter 000031 | lr 0.0025 | loss -774.7177 | norm 335.7946 +2025-06-25 07:02:16,435 | INFO | iter 000032 | lr 0.0025 | loss -920.0737 | norm 350.3229 +2025-06-25 07:02:21,641 | INFO | iter 000033 | lr 0.0025 | loss -1063.0433 | norm 364.4135 +2025-06-25 07:02:26,771 | INFO | iter 000034 | lr 0.0025 | loss -1016.9738 | norm 378.6853 +2025-06-25 07:02:32,448 | INFO | iter 000035 | lr 0.0025 | loss -1197.0923 | norm 393.2000 +2025-06-25 07:02:37,618 | INFO | iter 000036 | lr 0.0025 | loss -1259.9513 | norm 407.9392 +2025-06-25 07:02:42,716 | INFO | iter 000037 | lr 0.0025 | loss -1382.7266 | norm 422.7625 +2025-06-25 07:02:47,839 | INFO | iter 000038 | lr 0.0025 | loss -1556.2228 | norm 437.7693 +2025-06-25 07:02:53,065 | INFO | iter 000039 | lr 0.0025 | loss -1659.4865 | norm 452.8923 +2025-06-25 07:02:58,112 | INFO | iter 000040 | lr 0.0025 | loss -1659.5458 | norm 468.0710 +2025-06-25 07:03:03,864 | INFO | iter 000041 | lr 0.0025 | loss -1773.1254 | norm 483.1063 +2025-06-25 07:03:09,029 | INFO | iter 000042 | lr 0.0025 | loss -2041.8066 | norm 498.2888 +2025-06-25 07:03:14,123 | INFO | iter 000043 | lr 0.0025 | loss -2230.4387 | norm 513.6289 +2025-06-25 07:03:19,337 | INFO | iter 000044 | lr 0.0025 | loss -2268.1362 | norm 529.1135 +2025-06-25 07:03:24,439 | INFO | iter 000045 | lr 0.0025 | loss -2196.4004 | norm 544.5320 +2025-06-25 07:03:29,662 | INFO | iter 000046 | lr 0.0025 | loss -2567.3474 | norm 560.0720 +2025-06-25 07:03:35,121 | INFO | iter 000047 | lr 0.0025 | loss -2551.3745 | norm 575.5949 +2025-06-25 07:03:40,366 | INFO | iter 000048 | lr 0.0025 | loss -2840.6702 | norm 591.2187 +2025-06-25 07:03:45,515 | INFO | iter 000049 | lr 0.0025 | loss -3193.7876 | norm 606.9574 +2025-06-25 07:03:45,516 | INFO | Completed LR test 1/10: lr=0.0025 +2025-06-25 07:03:45,809 | INFO | Cleanup complete diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/config.json b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..964a39d3f5ab39acc0f83d8cb7bee0ac6334026b --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/config.json @@ -0,0 +1,14 @@ +{ + "model_name": "gpt2_small", + "factor_min": 0.6, + "factor_max": 1.5, + "factor_num": 10, + "error": 0.0001, + "accum_steps": 4, + "num_iterations": 50, + "num_checkpoint": 2000, + "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin", + "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536", + "timestamp": "250622_035242", + "raw": false +} \ No newline at end of file diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/losses_lr.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/losses_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..a75ed7902f926fd081599f0c48392e5e9d87d5ba Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/losses_lr.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..f744b31462de11fea29931cbdbadfdf780239ec8 Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr_iter.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr_iter.png new file mode 100644 index 0000000000000000000000000000000000000000..a832381ae6de1b2c6b32b7027beb46e2ddc371a5 Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/norms_lr_iter.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/training.log b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/training.log new file mode 100644 index 0000000000000000000000000000000000000000..de1466f2c4c92bc776189343a50c47e3c7d805d1 --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_002000/training.log @@ -0,0 +1,232 @@ +2025-06-25 07:04:49,319 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt +2025-06-25 07:04:50,921 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 07:04:50,922 | INFO | Current learning rate: 0.0018 +2025-06-25 07:04:51,534 | INFO | Weight decay: 0.1 +2025-06-25 07:04:51,534 | INFO | Epsilon: 1e-08 +2025-06-25 07:04:51,534 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 07:04:51,534 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 07:04:51,534 | INFO | Optimizer state loading completed! +2025-06-25 07:04:53,371 | INFO | Initialized xs with norm: 1.273644 +2025-06-25 07:04:53,383 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 07:04:53,383 | INFO | Starting LR test 1/10: lr=0.0025 +2025-06-25 07:04:53,383 | INFO | Starting EoS for LR factor 1.4000 +2025-06-25 07:04:53,383 | INFO | Starting EoS for checkpoint 002000 +2025-06-25 07:04:53,383 | INFO | Starting EoS for model gpt2_small +2025-06-25 07:04:53,383 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 07:04:53,383 | INFO | Starting EoS for num_iterations 50 +2025-06-25 07:04:53,383 | INFO | Starting EoS for accum_steps 4 +2025-06-25 07:04:53,383 | INFO | Loading model and checkpoint... +2025-06-25 07:04:54,128 | INFO | Wrapping model with DDP... +2025-06-25 07:04:54,613 | INFO | Loading state dict... +2025-06-25 07:04:54,617 | INFO | Model loaded successfully! +2025-06-25 07:05:02,467 | INFO | iter 000000 | lr 0.0025 | loss 3.9757 | norm 15.2051 +2025-06-25 07:05:07,565 | INFO | iter 000001 | lr 0.0025 | loss 4.3233 | norm 25.7014 +2025-06-25 07:05:12,643 | INFO | iter 000002 | lr 0.0025 | loss 5.8232 | norm 33.3926 +2025-06-25 07:05:17,743 | INFO | iter 000003 | lr 0.0025 | loss 4.4479 | norm 40.7412 +2025-06-25 07:05:22,835 | INFO | iter 000004 | lr 0.0025 | loss 5.0561 | norm 47.0897 +2025-06-25 07:05:27,984 | INFO | iter 000005 | lr 0.0025 | loss 4.2371 | norm 53.4390 +2025-06-25 07:05:33,593 | INFO | iter 000006 | lr 0.0025 | loss 4.2628 | norm 59.3858 +2025-06-25 07:05:38,622 | INFO | iter 000007 | lr 0.0025 | loss 4.4517 | norm 64.9470 +2025-06-25 07:05:43,671 | INFO | iter 000008 | lr 0.0025 | loss 3.7213 | norm 70.3563 +2025-06-25 07:05:48,942 | INFO | iter 000009 | lr 0.0025 | loss 3.4583 | norm 75.6566 +2025-06-25 07:05:54,202 | INFO | iter 000010 | lr 0.0025 | loss 3.2253 | norm 80.8225 +2025-06-25 07:05:59,788 | INFO | iter 000011 | lr 0.0025 | loss 2.1490 | norm 86.0380 +2025-06-25 07:06:05,316 | INFO | iter 000012 | lr 0.0025 | loss 0.5857 | norm 91.4942 +2025-06-25 07:06:10,581 | INFO | iter 000013 | lr 0.0025 | loss -0.7333 | norm 97.2915 +2025-06-25 07:06:15,719 | INFO | iter 000014 | lr 0.0025 | loss -2.5905 | norm 103.4982 +2025-06-25 07:06:20,943 | INFO | iter 000015 | lr 0.0025 | loss -6.6798 | norm 110.1739 +2025-06-25 07:08:00,350 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt +2025-06-25 07:08:02,106 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 07:08:02,107 | INFO | Current learning rate: 0.0018 +2025-06-25 07:08:02,720 | INFO | Weight decay: 0.1 +2025-06-25 07:08:02,720 | INFO | Epsilon: 1e-08 +2025-06-25 07:08:02,721 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 07:08:02,721 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 07:08:02,721 | INFO | Optimizer state loading completed! +2025-06-25 07:08:04,562 | INFO | Initialized xs with norm: 1.273412 +2025-06-25 07:08:04,573 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 07:08:04,574 | INFO | Starting LR test 1/10: lr=0.0090 +2025-06-25 07:08:04,574 | INFO | Starting EoS for LR factor 5.0000 +2025-06-25 07:08:04,574 | INFO | Starting EoS for checkpoint 002000 +2025-06-25 07:08:04,574 | INFO | Starting EoS for model gpt2_small +2025-06-25 07:08:04,574 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 07:08:04,574 | INFO | Starting EoS for num_iterations 50 +2025-06-25 07:08:04,574 | INFO | Starting EoS for accum_steps 4 +2025-06-25 07:08:04,574 | INFO | Loading model and checkpoint... +2025-06-25 07:08:05,317 | INFO | Wrapping model with DDP... +2025-06-25 07:08:05,851 | INFO | Loading state dict... +2025-06-25 07:08:05,855 | INFO | Model loaded successfully! +2025-06-25 07:08:12,930 | INFO | iter 000000 | lr 0.0090 | loss 3.9757 | norm 54.1260 +2025-06-25 07:08:17,867 | INFO | iter 000001 | lr 0.0090 | loss 7.9464 | norm 93.7556 +2025-06-25 07:08:22,976 | INFO | iter 000002 | lr 0.0090 | loss 65.9051 | norm 114.5414 +2025-06-25 07:08:27,990 | INFO | iter 000003 | lr 0.0090 | loss 9.6467 | norm 142.1960 +2025-06-25 07:08:33,721 | INFO | iter 000004 | lr 0.0090 | loss 37.9767 | norm 165.1957 +2025-06-25 07:08:38,762 | INFO | iter 000005 | lr 0.0090 | loss 34.2442 | norm 186.2563 +2025-06-25 07:08:44,002 | INFO | iter 000006 | lr 0.0090 | loss 12.5688 | norm 207.7581 +2025-06-25 07:08:49,138 | INFO | iter 000007 | lr 0.0090 | loss 13.4724 | norm 228.4881 +2025-06-25 07:08:54,269 | INFO | iter 000008 | lr 0.0090 | loss 18.9169 | norm 247.7513 +2025-06-25 07:08:59,417 | INFO | iter 000009 | lr 0.0090 | loss 14.7739 | norm 265.9009 +2025-06-25 07:09:04,859 | INFO | iter 000010 | lr 0.0090 | loss 4.6113 | norm 283.6754 +2025-06-25 07:09:10,221 | INFO | iter 000011 | lr 0.0090 | loss -2.8853 | norm 301.4828 +2025-06-25 07:09:15,414 | INFO | iter 000012 | lr 0.0090 | loss -8.5041 | norm 319.3103 +2025-06-25 07:09:20,524 | INFO | iter 000013 | lr 0.0090 | loss -16.0165 | norm 337.3585 +2025-06-25 07:09:25,783 | INFO | iter 000014 | lr 0.0090 | loss -30.7357 | norm 356.0018 +2025-06-25 07:09:31,353 | INFO | iter 000015 | lr 0.0090 | loss -59.7186 | norm 375.0746 +2025-06-25 07:09:36,559 | INFO | iter 000016 | lr 0.0090 | loss -85.9098 | norm 395.1222 +2025-06-25 07:09:41,686 | INFO | iter 000017 | lr 0.0090 | loss -113.7542 | norm 416.7910 +2025-06-25 07:09:47,076 | INFO | iter 000018 | lr 0.0090 | loss -182.1024 | norm 439.8696 +2025-06-25 07:27:24,437 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt +2025-06-25 07:27:26,239 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 07:27:26,240 | INFO | Current learning rate: 0.0018 +2025-06-25 07:27:26,858 | INFO | Weight decay: 0.1 +2025-06-25 07:27:26,858 | INFO | Epsilon: 1e-08 +2025-06-25 07:27:26,858 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 07:27:26,858 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 07:27:26,858 | INFO | Optimizer state loading completed! +2025-06-25 07:27:29,212 | INFO | Initialized xs with norm: 1.273458 +2025-06-25 07:27:29,221 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 07:27:29,221 | INFO | Starting LR test 1/10: lr=0.0180 +2025-06-25 07:27:29,221 | INFO | Starting EoS for LR factor 10.0000 +2025-06-25 07:27:29,221 | INFO | Starting EoS for checkpoint 002000 +2025-06-25 07:27:29,221 | INFO | Starting EoS for model gpt2_small +2025-06-25 07:27:29,221 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 07:27:29,221 | INFO | Starting EoS for num_iterations 50 +2025-06-25 07:27:29,221 | INFO | Starting EoS for accum_steps 4 +2025-06-25 07:27:29,221 | INFO | Loading model and checkpoint... +2025-06-25 07:27:29,979 | INFO | Wrapping model with DDP... +2025-06-25 07:27:30,468 | INFO | Loading state dict... +2025-06-25 07:27:30,471 | INFO | Model loaded successfully! +2025-06-25 07:27:37,640 | INFO | iter 000000 | lr 0.0180 | loss 3.9757 | norm 108.2326 +2025-06-25 07:27:42,495 | INFO | iter 000001 | lr 0.0180 | loss 20.5162 | norm 189.4878 +2025-06-25 07:27:47,595 | INFO | iter 000002 | lr 0.0180 | loss 292.6721 | norm 227.1594 +2025-06-25 07:27:52,527 | INFO | iter 000003 | lr 0.0180 | loss 30.5704 | norm 282.5874 +2025-06-25 07:27:57,708 | INFO | iter 000004 | lr 0.0180 | loss 145.1973 | norm 329.8537 +2025-06-25 07:28:03,188 | INFO | iter 000005 | lr 0.0180 | loss 154.9424 | norm 371.7672 +2025-06-25 07:28:08,432 | INFO | iter 000006 | lr 0.0180 | loss 54.9122 | norm 414.5362 +2025-06-25 07:28:13,591 | INFO | iter 000007 | lr 0.0180 | loss 38.6102 | norm 456.6207 +2025-06-25 07:28:18,809 | INFO | iter 000008 | lr 0.0180 | loss 66.4117 | norm 495.8205 +2025-06-25 07:28:23,750 | INFO | iter 000009 | lr 0.0180 | loss 67.2475 | norm 531.9467 +2025-06-25 07:28:29,478 | INFO | iter 000010 | lr 0.0180 | loss 27.9346 | norm 566.8688 +2025-06-25 07:28:34,849 | INFO | iter 000011 | lr 0.0180 | loss -10.5653 | norm 601.7505 +2025-06-25 07:28:40,026 | INFO | iter 000012 | lr 0.0180 | loss -32.7803 | norm 636.7070 +2025-06-25 07:28:45,181 | INFO | iter 000013 | lr 0.0180 | loss -49.7714 | norm 671.7962 +2025-06-25 07:28:50,620 | INFO | iter 000014 | lr 0.0180 | loss -94.9201 | norm 707.5554 +2025-06-25 07:28:55,620 | INFO | iter 000015 | lr 0.0180 | loss -195.6995 | norm 743.8602 +2025-06-25 07:29:00,782 | INFO | iter 000016 | lr 0.0180 | loss -289.3528 | norm 781.8421 +2025-06-25 07:29:05,969 | INFO | iter 000017 | lr 0.0180 | loss -385.6000 | norm 822.7772 +2025-06-25 07:29:11,247 | INFO | iter 000018 | lr 0.0180 | loss -617.5768 | norm 866.3561 +2025-06-25 07:29:16,457 | INFO | iter 000019 | lr 0.0180 | loss -758.4005 | norm 913.5998 +2025-06-25 07:29:21,562 | INFO | iter 000020 | lr 0.0180 | loss -962.2928 | norm 964.2577 +2025-06-25 07:29:26,751 | INFO | iter 000021 | lr 0.0180 | loss -1308.8584 | norm 1018.9549 +2025-06-25 07:29:32,200 | INFO | iter 000022 | lr 0.0180 | loss -1507.0786 | norm 1075.5761 +2025-06-25 07:29:37,305 | INFO | iter 000023 | lr 0.0180 | loss -1928.5552 | norm 1136.3061 +2025-06-25 07:29:42,442 | INFO | iter 000024 | lr 0.0180 | loss -2365.0591 | norm 1200.9531 +2025-06-25 07:29:47,581 | INFO | iter 000025 | lr 0.0180 | loss -2911.7729 | norm 1269.1230 +2025-06-25 07:29:52,848 | INFO | iter 000026 | lr 0.0180 | loss -3918.7095 | norm 1340.4335 +2025-06-25 07:29:58,025 | INFO | iter 000027 | lr 0.0180 | loss -4075.2781 | norm 1415.0733 +2025-06-25 07:30:03,634 | INFO | iter 000028 | lr 0.0180 | loss -4800.8032 | norm 1492.5634 +2025-06-25 07:30:08,717 | INFO | iter 000029 | lr 0.0180 | loss -5429.1694 | norm 1572.5761 +2025-06-25 07:30:13,912 | INFO | iter 000030 | lr 0.0180 | loss -6909.1343 | norm 1655.3578 +2025-06-25 07:30:19,088 | INFO | iter 000031 | lr 0.0180 | loss -7403.7188 | norm 1740.2757 +2025-06-25 07:30:24,170 | INFO | iter 000032 | lr 0.0180 | loss -8883.3643 | norm 1827.2884 +2025-06-25 07:30:29,463 | INFO | iter 000033 | lr 0.0180 | loss -9913.4092 | norm 1916.4071 +2025-06-25 07:30:34,811 | INFO | iter 000034 | lr 0.0180 | loss -12094.2510 | norm 2007.2927 +2025-06-25 07:30:40,049 | INFO | iter 000035 | lr 0.0180 | loss -13123.3652 | norm 2099.8649 +2025-06-25 07:30:45,184 | INFO | iter 000036 | lr 0.0180 | loss -13453.2988 | norm 2187.7686 +2025-06-25 07:30:50,352 | INFO | iter 000037 | lr 0.0180 | loss -15590.8887 | norm 2277.7149 +2025-06-25 07:30:55,349 | INFO | iter 000038 | lr 0.0180 | loss -17174.6211 | norm 2369.4460 +2025-06-25 07:31:00,515 | INFO | iter 000039 | lr 0.0180 | loss -18859.8008 | norm 2462.6141 +2025-06-25 07:31:05,868 | INFO | iter 000040 | lr 0.0180 | loss -22396.0918 | norm 2557.2235 +2025-06-25 07:31:11,195 | INFO | iter 000041 | lr 0.0180 | loss -23291.8730 | norm 2652.7654 +2025-06-25 07:31:16,272 | INFO | iter 000042 | lr 0.0180 | loss -24466.0820 | norm 2748.3429 +2025-06-25 07:31:21,340 | INFO | iter 000043 | lr 0.0180 | loss -26257.3105 | norm 2844.3545 +2025-06-25 07:31:26,604 | INFO | iter 000044 | lr 0.0180 | loss -28454.4160 | norm 2941.0400 +2025-06-25 07:31:32,098 | INFO | iter 000045 | lr 0.0180 | loss -30838.6445 | norm 3038.6056 +2025-06-25 07:31:37,115 | INFO | iter 000046 | lr 0.0180 | loss -32684.4766 | norm 3136.7237 +2025-06-25 07:31:42,152 | INFO | iter 000047 | lr 0.0180 | loss -39129.3398 | norm 3235.0764 +2025-06-25 07:31:47,357 | INFO | iter 000048 | lr 0.0180 | loss -37924.5391 | norm 3333.8420 +2025-06-25 07:31:52,616 | INFO | iter 000049 | lr 0.0180 | loss -39045.4688 | norm 3432.5284 +2025-06-25 07:31:52,617 | INFO | Completed LR test 1/10: lr=0.0180 +2025-06-25 07:31:52,965 | INFO | Cleanup complete +2025-06-25 08:00:39,916 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt +2025-06-25 08:00:41,765 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:00:41,766 | INFO | Current learning rate: 0.0018 +2025-06-25 08:00:42,410 | INFO | Weight decay: 0.1 +2025-06-25 08:00:42,411 | INFO | Epsilon: 1e-08 +2025-06-25 08:00:42,411 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:00:42,411 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:00:42,411 | INFO | Optimizer state loading completed! +2025-06-25 08:00:44,469 | INFO | Initialized xs with norm: 1.273415 +2025-06-25 08:00:44,473 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:00:44,473 | INFO | Starting LR test 1/10: lr=0.0180 +2025-06-25 08:00:44,473 | INFO | Starting EoS for LR factor 10.0000 +2025-06-25 08:00:44,474 | INFO | Starting EoS for checkpoint 002000 +2025-06-25 08:00:44,474 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:00:44,474 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:00:44,474 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:00:44,474 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:00:44,474 | INFO | Loading model and checkpoint... +2025-06-25 08:00:45,423 | INFO | Wrapping model with DDP... +2025-06-25 08:00:45,442 | INFO | Loading state dict... +2025-06-25 08:00:45,445 | INFO | Model loaded successfully! +2025-06-25 08:00:52,795 | INFO | iter 000000 | lr 0.0180 | loss 4.0603 | norm 103.6942 +2025-06-25 08:00:57,878 | INFO | iter 000001 | lr 0.0180 | loss 14.7676 | norm 181.4721 +2025-06-25 08:01:03,515 | INFO | iter 000002 | lr 0.0180 | loss 240.4388 | norm 221.1537 +2025-06-25 08:01:08,778 | INFO | iter 000003 | lr 0.0180 | loss 37.1792 | norm 277.3678 +2025-06-25 08:01:13,940 | INFO | iter 000004 | lr 0.0180 | loss 125.9179 | norm 326.9716 +2025-06-25 08:01:19,048 | INFO | iter 000005 | lr 0.0180 | loss 119.1384 | norm 371.6566 +2025-06-25 08:01:24,067 | INFO | iter 000006 | lr 0.0180 | loss 58.8411 | norm 415.6504 +2025-06-25 08:01:29,398 | INFO | iter 000007 | lr 0.0180 | loss 50.0099 | norm 457.2432 +2025-06-25 08:01:34,715 | INFO | iter 000008 | lr 0.0180 | loss 57.1595 | norm 495.9399 +2025-06-25 08:01:39,808 | INFO | iter 000009 | lr 0.0180 | loss 50.3250 | norm 531.3629 +2025-06-25 08:01:45,102 | INFO | iter 000010 | lr 0.0180 | loss 18.0556 | norm 566.8195 +2025-06-25 08:01:50,160 | INFO | iter 000011 | lr 0.0180 | loss -12.1102 | norm 602.2888 +2025-06-25 08:01:55,242 | INFO | iter 000012 | lr 0.0180 | loss -66.1828 | norm 639.0583 +2025-06-25 08:02:00,842 | INFO | iter 000013 | lr 0.0180 | loss -103.6073 | norm 677.0137 +2025-06-25 08:02:05,876 | INFO | iter 000014 | lr 0.0180 | loss -169.7566 | norm 715.7398 +2025-06-25 08:02:11,234 | INFO | iter 000015 | lr 0.0180 | loss -249.2370 | norm 755.9124 +2025-06-25 08:02:16,241 | INFO | iter 000016 | lr 0.0180 | loss -360.8513 | norm 798.1494 +2025-06-25 08:02:21,418 | INFO | iter 000017 | lr 0.0180 | loss -541.1733 | norm 843.1959 +2025-06-25 08:02:26,629 | INFO | iter 000018 | lr 0.0180 | loss -727.0453 | norm 891.4551 +2025-06-25 08:02:32,162 | INFO | iter 000019 | lr 0.0180 | loss -957.3318 | norm 943.1777 +2025-06-25 08:02:43,080 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_002000.pt +2025-06-25 08:02:44,945 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:02:44,945 | INFO | Current learning rate: 0.0018 +2025-06-25 08:02:45,555 | INFO | Weight decay: 0.1 +2025-06-25 08:02:45,555 | INFO | Epsilon: 1e-08 +2025-06-25 08:02:45,555 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:02:45,555 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:02:45,555 | INFO | Optimizer state loading completed! +2025-06-25 08:02:47,458 | INFO | Initialized xs with norm: 1.273634 +2025-06-25 08:02:47,466 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:02:47,466 | INFO | Starting LR test 1/10: lr=0.1800 +2025-06-25 08:02:47,466 | INFO | Starting EoS for LR factor 100.0000 +2025-06-25 08:02:47,466 | INFO | Starting EoS for checkpoint 002000 +2025-06-25 08:02:47,466 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:02:47,466 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:02:47,467 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:02:47,467 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:02:47,467 | INFO | Loading model and checkpoint... +2025-06-25 08:02:48,209 | INFO | Wrapping model with DDP... +2025-06-25 08:02:48,467 | INFO | Loading state dict... +2025-06-25 08:02:48,470 | INFO | Model loaded successfully! +2025-06-25 08:02:54,668 | INFO | iter 000000 | lr 0.1800 | loss 4.0603 | norm 1036.7952 +2025-06-25 08:02:59,931 | INFO | iter 000001 | lr 0.1800 | loss 1118.2144 | norm 1860.5170 +2025-06-25 08:03:05,288 | INFO | iter 000002 | lr 0.1800 | loss 27679.1367 | norm 2199.6821 +2025-06-25 08:03:10,533 | INFO | iter 000003 | lr 0.1800 | loss 3684.1533 | norm 2738.3308 +2025-06-25 08:03:15,885 | INFO | iter 000004 | lr 0.1800 | loss 12768.7715 | norm 3217.2167 +2025-06-25 08:03:21,102 | INFO | iter 000005 | lr 0.1800 | loss 13399.8350 | norm 3629.7486 +2025-06-25 08:03:26,163 | INFO | iter 000006 | lr 0.1800 | loss 6779.9473 | norm 4026.9859 +2025-06-25 08:03:31,728 | INFO | iter 000007 | lr 0.1800 | loss 4730.6021 | norm 4397.5922 +2025-06-25 08:03:37,023 | INFO | iter 000008 | lr 0.1800 | loss 5649.2324 | norm 4731.8789 +2025-06-25 08:03:42,224 | INFO | iter 000009 | lr 0.1800 | loss 5887.6724 | norm 5023.6548 +2025-06-25 08:03:47,336 | INFO | iter 000010 | lr 0.1800 | loss 2948.2642 | norm 5307.5434 +2025-06-25 08:03:52,742 | INFO | iter 000011 | lr 0.1800 | loss 679.2209 | norm 5583.8226 +2025-06-25 08:03:57,967 | INFO | iter 000012 | lr 0.1800 | loss -3517.8269 | norm 5866.7620 +2025-06-25 08:04:03,641 | INFO | iter 000013 | lr 0.1800 | loss -6241.0791 | norm 6155.4482 +2025-06-25 08:04:08,650 | INFO | iter 000014 | lr 0.1800 | loss -10283.7734 | norm 6445.5204 +2025-06-25 08:04:14,081 | INFO | iter 000015 | lr 0.1800 | loss -15390.3262 | norm 6741.4577 +2025-06-25 08:04:19,406 | INFO | iter 000016 | lr 0.1800 | loss -23139.1680 | norm 7049.2437 +2025-06-25 08:04:24,416 | INFO | iter 000017 | lr 0.1800 | loss -35265.1953 | norm 7376.9862 +2025-06-25 08:04:30,114 | INFO | iter 000018 | lr 0.1800 | loss -47734.4375 | norm 7729.4336 +2025-06-25 08:04:35,524 | INFO | iter 000019 | lr 0.1800 | loss -63256.2305 | norm 8108.3238 diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/config.json b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..7195d69b38d09af441ee118b08d608bec324c160 --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/config.json @@ -0,0 +1,14 @@ +{ + "model_name": "gpt2_small", + "factor_min": 0.6, + "factor_max": 1.5, + "factor_num": 10, + "error": 0.0001, + "accum_steps": 4, + "num_iterations": 50, + "num_checkpoint": 7000, + "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin", + "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536", + "timestamp": "250622_035242", + "raw": false +} \ No newline at end of file diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..88b5cb41bef6784ee20db911c6f4e5250cf062ce Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr_iter.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr_iter.png new file mode 100644 index 0000000000000000000000000000000000000000..8d38fa6077d642532c3d4a8877715e334c40a0c0 Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/norms_lr_iter.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/training.log b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/training.log new file mode 100644 index 0000000000000000000000000000000000000000..a90e12884cf7471ebe1ec5d44f84f665f88940bb --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_007000/training.log @@ -0,0 +1,640 @@ +2025-06-25 05:35:53,415 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_007000.pt +2025-06-25 05:35:55,332 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 05:35:55,332 | INFO | Current learning rate: 0.0018 +2025-06-25 05:35:55,930 | INFO | Weight decay: 0.1 +2025-06-25 05:35:55,930 | INFO | Epsilon: 1e-08 +2025-06-25 05:35:55,930 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 05:35:55,930 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 05:35:55,930 | INFO | Optimizer state loading completed! +2025-06-25 05:35:57,847 | INFO | Initialized xs with norm: 1.273580 +2025-06-25 05:35:57,853 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 05:35:57,853 | INFO | Starting LR test 1/10: lr=0.0011 +2025-06-25 05:35:57,853 | INFO | Starting EoS for LR factor 0.6000 +2025-06-25 05:35:57,853 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 05:35:57,853 | INFO | Starting EoS for model gpt2_small +2025-06-25 05:35:57,853 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 05:35:57,854 | INFO | Starting EoS for num_iterations 50 +2025-06-25 05:35:57,854 | INFO | Starting EoS for accum_steps 4 +2025-06-25 05:35:57,854 | INFO | Loading model and checkpoint... +2025-06-25 05:35:58,674 | INFO | Wrapping model with DDP... +2025-06-25 05:35:58,743 | INFO | Loading state dict... +2025-06-25 05:35:58,747 | INFO | Model loaded successfully! +2025-06-25 05:36:05,712 | INFO | iter 000000 | lr 0.0011 | loss 3.6876 | norm 13.7931 +2025-06-25 05:36:10,973 | INFO | iter 000001 | lr 0.0011 | loss 3.5650 | norm 27.3894 +2025-06-25 05:36:16,178 | INFO | iter 000002 | lr 0.0011 | loss 3.5923 | norm 40.9245 +2025-06-25 05:36:21,425 | INFO | iter 000003 | lr 0.0011 | loss 3.7197 | norm 54.3712 +2025-06-25 05:36:26,555 | INFO | iter 000004 | lr 0.0011 | loss 3.7198 | norm 67.7207 +2025-06-25 05:36:32,221 | INFO | iter 000005 | lr 0.0011 | loss 3.7145 | norm 80.9691 +2025-06-25 05:36:37,311 | INFO | iter 000006 | lr 0.0011 | loss 3.8629 | norm 94.1130 +2025-06-25 05:36:42,481 | INFO | iter 000007 | lr 0.0011 | loss 3.8424 | norm 107.1491 +2025-06-25 05:36:47,571 | INFO | iter 000008 | lr 0.0011 | loss 3.9408 | norm 120.0743 +2025-06-25 05:36:52,819 | INFO | iter 000009 | lr 0.0011 | loss 3.9754 | norm 132.8863 +2025-06-25 05:36:57,995 | INFO | iter 000010 | lr 0.0011 | loss 3.8358 | norm 145.5831 +2025-06-25 05:37:03,602 | INFO | iter 000011 | lr 0.0011 | loss 3.7628 | norm 158.1634 +2025-06-25 05:37:08,848 | INFO | iter 000012 | lr 0.0011 | loss 4.0600 | norm 170.6257 +2025-06-25 05:37:14,100 | INFO | iter 000013 | lr 0.0011 | loss 3.9553 | norm 182.9687 +2025-06-25 05:37:19,299 | INFO | iter 000014 | lr 0.0011 | loss 3.9901 | norm 195.1916 +2025-06-25 05:37:24,464 | INFO | iter 000015 | lr 0.0011 | loss 4.3327 | norm 207.2938 +2025-06-25 05:37:29,984 | INFO | iter 000016 | lr 0.0011 | loss 4.2055 | norm 219.2749 +2025-06-25 05:37:35,307 | INFO | iter 000017 | lr 0.0011 | loss 4.0625 | norm 231.1345 +2025-06-25 05:37:40,541 | INFO | iter 000018 | lr 0.0011 | loss 4.1686 | norm 242.8727 +2025-06-25 05:37:45,565 | INFO | iter 000019 | lr 0.0011 | loss 4.1835 | norm 254.4896 +2025-06-25 05:37:50,753 | INFO | iter 000020 | lr 0.0011 | loss 4.1426 | norm 265.9856 +2025-06-25 05:37:56,056 | INFO | iter 000021 | lr 0.0011 | loss 4.2872 | norm 277.3614 +2025-06-25 05:38:01,898 | INFO | iter 000022 | lr 0.0011 | loss 4.5773 | norm 288.6175 +2025-06-25 05:38:07,056 | INFO | iter 000023 | lr 0.0011 | loss 4.4376 | norm 299.7551 +2025-06-25 05:38:12,351 | INFO | iter 000024 | lr 0.0011 | loss 4.2737 | norm 310.7748 +2025-06-25 05:38:17,566 | INFO | iter 000025 | lr 0.0011 | loss 4.4620 | norm 321.6779 +2025-06-25 05:38:22,639 | INFO | iter 000026 | lr 0.0011 | loss 4.3275 | norm 332.4654 +2025-06-25 05:38:27,858 | INFO | iter 000027 | lr 0.0011 | loss 4.3138 | norm 343.1386 +2025-06-25 05:38:33,577 | INFO | iter 000028 | lr 0.0011 | loss 4.5462 | norm 353.6991 +2025-06-25 05:38:38,886 | INFO | iter 000029 | lr 0.0011 | loss 4.4448 | norm 364.1481 +2025-06-25 05:38:43,950 | INFO | iter 000030 | lr 0.0011 | loss 4.6483 | norm 374.4874 +2025-06-25 05:38:49,201 | INFO | iter 000031 | lr 0.0011 | loss 4.6357 | norm 384.7184 +2025-06-25 05:38:54,409 | INFO | iter 000032 | lr 0.0011 | loss 4.6015 | norm 394.8427 +2025-06-25 05:38:59,422 | INFO | iter 000033 | lr 0.0011 | loss 4.7027 | norm 404.8622 +2025-06-25 05:39:05,147 | INFO | iter 000034 | lr 0.0011 | loss 4.5023 | norm 414.7783 +2025-06-25 05:39:10,566 | INFO | iter 000035 | lr 0.0011 | loss 4.9184 | norm 424.5929 +2025-06-25 05:39:15,684 | INFO | iter 000036 | lr 0.0011 | loss 4.8844 | norm 434.3076 +2025-06-25 05:39:20,799 | INFO | iter 000037 | lr 0.0011 | loss 5.1758 | norm 443.9242 +2025-06-25 05:39:25,840 | INFO | iter 000038 | lr 0.0011 | loss 5.0171 | norm 453.4445 +2025-06-25 05:39:31,938 | INFO | iter 000039 | lr 0.0011 | loss 4.8304 | norm 462.8701 +2025-06-25 05:39:36,990 | INFO | iter 000040 | lr 0.0011 | loss 5.0139 | norm 472.2029 +2025-06-25 05:39:42,282 | INFO | iter 000041 | lr 0.0011 | loss 4.5102 | norm 481.4444 +2025-06-25 05:39:47,604 | INFO | iter 000042 | lr 0.0011 | loss 5.0961 | norm 490.5965 +2025-06-25 05:39:52,697 | INFO | iter 000043 | lr 0.0011 | loss 4.9875 | norm 499.6608 +2025-06-25 05:39:57,946 | INFO | iter 000044 | lr 0.0011 | loss 4.8258 | norm 508.6389 +2025-06-25 05:40:03,733 | INFO | iter 000045 | lr 0.0011 | loss 5.1973 | norm 517.5325 +2025-06-25 05:40:08,855 | INFO | iter 000046 | lr 0.0011 | loss 5.2978 | norm 526.3431 +2025-06-25 05:40:14,128 | INFO | iter 000047 | lr 0.0011 | loss 5.1401 | norm 535.0723 +2025-06-25 05:40:19,467 | INFO | iter 000048 | lr 0.0011 | loss 5.3447 | norm 543.7217 +2025-06-25 05:40:24,714 | INFO | iter 000049 | lr 0.0011 | loss 4.6829 | norm 552.2926 +2025-06-25 05:40:24,714 | INFO | Completed LR test 1/10: lr=0.0011 +2025-06-25 05:40:24,751 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 05:40:24,751 | INFO | Starting LR test 2/10: lr=0.0013 +2025-06-25 05:40:24,751 | INFO | Starting EoS for LR factor 0.7000 +2025-06-25 05:40:24,751 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 05:40:24,751 | INFO | Starting EoS for model gpt2_small +2025-06-25 05:40:24,751 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 05:40:24,751 | INFO | Starting EoS for num_iterations 50 +2025-06-25 05:40:24,751 | INFO | Starting EoS for accum_steps 4 +2025-06-25 05:40:24,751 | INFO | Loading model and checkpoint... +2025-06-25 05:40:25,477 | INFO | Wrapping model with DDP... +2025-06-25 05:40:25,746 | INFO | Loading state dict... +2025-06-25 05:40:25,749 | INFO | Model loaded successfully! +2025-06-25 05:40:33,135 | INFO | iter 000000 | lr 0.0013 | loss 3.6876 | norm 16.0739 +2025-06-25 05:40:38,238 | INFO | iter 000001 | lr 0.0013 | loss 3.5668 | norm 31.9203 +2025-06-25 05:40:43,503 | INFO | iter 000002 | lr 0.0013 | loss 3.5993 | norm 47.6747 +2025-06-25 05:40:48,640 | INFO | iter 000003 | lr 0.0013 | loss 3.7332 | norm 63.3073 +2025-06-25 05:40:53,858 | INFO | iter 000004 | lr 0.0013 | loss 3.7399 | norm 78.8077 +2025-06-25 05:40:59,072 | INFO | iter 000005 | lr 0.0013 | loss 3.7418 | norm 94.1714 +2025-06-25 05:41:04,563 | INFO | iter 000006 | lr 0.0013 | loss 3.9009 | norm 109.3946 +2025-06-25 05:41:09,648 | INFO | iter 000007 | lr 0.0013 | loss 3.8821 | norm 124.4729 +2025-06-25 05:41:15,013 | INFO | iter 000008 | lr 0.0013 | loss 3.9903 | norm 139.4024 +2025-06-25 05:41:20,139 | INFO | iter 000009 | lr 0.0013 | loss 4.0268 | norm 154.1803 +2025-06-25 05:41:25,227 | INFO | iter 000010 | lr 0.0013 | loss 3.8996 | norm 168.8045 +2025-06-25 05:41:31,038 | INFO | iter 000011 | lr 0.0013 | loss 3.8270 | norm 183.2730 +2025-06-25 05:41:36,315 | INFO | iter 000012 | lr 0.0013 | loss 4.1310 | norm 197.5845 +2025-06-25 05:41:41,326 | INFO | iter 000013 | lr 0.0013 | loss 4.0251 | norm 211.7373 +2025-06-25 05:41:46,451 | INFO | iter 000014 | lr 0.0013 | loss 4.0945 | norm 225.7308 +2025-06-25 05:41:51,631 | INFO | iter 000015 | lr 0.0013 | loss 4.4285 | norm 239.5642 +2025-06-25 05:41:56,676 | INFO | iter 000016 | lr 0.0013 | loss 4.3138 | norm 253.2372 +2025-06-25 05:42:02,225 | INFO | iter 000017 | lr 0.0013 | loss 4.1754 | norm 266.7497 +2025-06-25 05:42:07,383 | INFO | iter 000018 | lr 0.0013 | loss 4.2904 | norm 280.1020 +2025-06-25 05:42:12,527 | INFO | iter 000019 | lr 0.0013 | loss 4.2943 | norm 293.2945 +2025-06-25 05:42:17,700 | INFO | iter 000020 | lr 0.0013 | loss 4.2564 | norm 306.3280 +2025-06-25 05:42:23,000 | INFO | iter 000021 | lr 0.0013 | loss 4.4312 | norm 319.2035 +2025-06-25 05:42:28,355 | INFO | iter 000022 | lr 0.0013 | loss 4.7017 | norm 331.9223 +2025-06-25 05:42:34,085 | INFO | iter 000023 | lr 0.0013 | loss 4.6376 | norm 344.4858 +2025-06-25 05:42:39,128 | INFO | iter 000024 | lr 0.0013 | loss 4.4395 | norm 356.8954 +2025-06-25 05:42:44,446 | INFO | iter 000025 | lr 0.0013 | loss 4.6249 | norm 369.1527 +2025-06-25 05:42:49,650 | INFO | iter 000026 | lr 0.0013 | loss 4.4875 | norm 381.2595 +2025-06-25 05:42:54,760 | INFO | iter 000027 | lr 0.0013 | loss 4.4658 | norm 393.2177 +2025-06-25 05:43:00,105 | INFO | iter 000028 | lr 0.0013 | loss 4.7561 | norm 405.0296 +2025-06-25 05:43:05,673 | INFO | iter 000029 | lr 0.0013 | loss 4.6378 | norm 416.6971 +2025-06-25 05:43:10,891 | INFO | iter 000030 | lr 0.0013 | loss 4.8317 | norm 428.2225 +2025-06-25 05:43:16,007 | INFO | iter 000031 | lr 0.0013 | loss 4.8500 | norm 439.6082 +2025-06-25 05:43:21,121 | INFO | iter 000032 | lr 0.0013 | loss 4.8139 | norm 450.8564 +2025-06-25 05:43:26,282 | INFO | iter 000033 | lr 0.0013 | loss 4.9246 | norm 461.9696 +2025-06-25 05:43:31,833 | INFO | iter 000034 | lr 0.0013 | loss 4.6747 | norm 472.9502 +2025-06-25 05:43:37,078 | INFO | iter 000035 | lr 0.0013 | loss 5.1966 | norm 483.8006 +2025-06-25 05:43:42,249 | INFO | iter 000036 | lr 0.0013 | loss 5.1442 | norm 494.5233 +2025-06-25 05:43:47,360 | INFO | iter 000037 | lr 0.0013 | loss 5.4601 | norm 505.1207 +2025-06-25 05:43:52,622 | INFO | iter 000038 | lr 0.0013 | loss 5.2407 | norm 515.5953 +2025-06-25 05:43:57,845 | INFO | iter 000039 | lr 0.0013 | loss 5.0629 | norm 525.9496 +2025-06-25 05:44:03,515 | INFO | iter 000040 | lr 0.0013 | loss 5.2719 | norm 536.1859 +2025-06-25 05:44:08,699 | INFO | iter 000041 | lr 0.0013 | loss 4.7011 | norm 546.3068 +2025-06-25 05:44:13,777 | INFO | iter 000042 | lr 0.0013 | loss 5.3736 | norm 556.3145 +2025-06-25 05:44:19,131 | INFO | iter 000043 | lr 0.0013 | loss 5.2889 | norm 566.2115 +2025-06-25 05:44:24,440 | INFO | iter 000044 | lr 0.0013 | loss 5.0845 | norm 576.0000 +2025-06-25 05:44:30,026 | INFO | iter 000045 | lr 0.0013 | loss 5.4554 | norm 585.6822 +2025-06-25 05:44:35,085 | INFO | iter 000046 | lr 0.0013 | loss 5.5968 | norm 595.2605 +2025-06-25 05:44:40,229 | INFO | iter 000047 | lr 0.0013 | loss 5.4326 | norm 604.7369 +2025-06-25 05:44:45,571 | INFO | iter 000048 | lr 0.0013 | loss 5.6167 | norm 614.1136 +2025-06-25 05:44:50,721 | INFO | iter 000049 | lr 0.0013 | loss 4.9240 | norm 623.3926 +2025-06-25 05:44:50,722 | INFO | Completed LR test 2/10: lr=0.0013 +2025-06-25 05:44:50,738 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 05:44:50,738 | INFO | Starting LR test 3/10: lr=0.0014 +2025-06-25 05:44:50,738 | INFO | Starting EoS for LR factor 0.8000 +2025-06-25 05:44:50,738 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 05:44:50,738 | INFO | Starting EoS for model gpt2_small +2025-06-25 05:44:50,738 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 05:44:50,738 | INFO | Starting EoS for num_iterations 50 +2025-06-25 05:44:50,738 | INFO | Starting EoS for accum_steps 4 +2025-06-25 05:44:50,738 | INFO | Loading model and checkpoint... +2025-06-25 05:44:51,462 | INFO | Wrapping model with DDP... +2025-06-25 05:44:51,701 | INFO | Loading state dict... +2025-06-25 05:44:51,704 | INFO | Model loaded successfully! +2025-06-25 05:44:58,088 | INFO | iter 000000 | lr 0.0014 | loss 3.6876 | norm 18.3568 +2025-06-25 05:45:03,653 | INFO | iter 000001 | lr 0.0014 | loss 3.5691 | norm 36.4448 +2025-06-25 05:45:08,934 | INFO | iter 000002 | lr 0.0014 | loss 3.6073 | norm 54.4071 +2025-06-25 05:45:14,009 | INFO | iter 000003 | lr 0.0014 | loss 3.7478 | norm 72.2096 +2025-06-25 05:45:19,098 | INFO | iter 000004 | lr 0.0014 | loss 3.7622 | norm 89.8399 +2025-06-25 05:45:24,184 | INFO | iter 000005 | lr 0.0014 | loss 3.7683 | norm 107.2932 +2025-06-25 05:45:29,635 | INFO | iter 000006 | lr 0.0014 | loss 3.9400 | norm 124.5648 +2025-06-25 05:45:35,126 | INFO | iter 000007 | lr 0.0014 | loss 3.9224 | norm 141.6493 +2025-06-25 05:45:40,271 | INFO | iter 000008 | lr 0.0014 | loss 4.0389 | norm 158.5423 +2025-06-25 05:45:45,417 | INFO | iter 000009 | lr 0.0014 | loss 4.0786 | norm 175.2402 +2025-06-25 05:45:50,522 | INFO | iter 000010 | lr 0.0014 | loss 3.9723 | norm 191.7406 +2025-06-25 05:45:55,674 | INFO | iter 000011 | lr 0.0014 | loss 3.8953 | norm 208.0416 +2025-06-25 05:46:01,485 | INFO | iter 000012 | lr 0.0014 | loss 4.2137 | norm 224.1416 +2025-06-25 05:46:06,578 | INFO | iter 000013 | lr 0.0014 | loss 4.0981 | norm 240.0388 +2025-06-25 05:46:11,525 | INFO | iter 000014 | lr 0.0014 | loss 4.1986 | norm 255.7326 +2025-06-25 05:46:16,727 | INFO | iter 000015 | lr 0.0014 | loss 4.5317 | norm 271.2225 +2025-06-25 05:46:21,994 | INFO | iter 000016 | lr 0.0014 | loss 4.4237 | norm 286.5083 +2025-06-25 05:46:27,164 | INFO | iter 000017 | lr 0.0014 | loss 4.3022 | norm 301.5904 +2025-06-25 05:46:32,884 | INFO | iter 000018 | lr 0.0014 | loss 4.3945 | norm 316.4691 +2025-06-25 05:46:38,027 | INFO | iter 000019 | lr 0.0014 | loss 4.4074 | norm 331.1456 +2025-06-25 05:46:43,453 | INFO | iter 000020 | lr 0.0014 | loss 4.3716 | norm 345.6211 +2025-06-25 05:46:48,517 | INFO | iter 000021 | lr 0.0014 | loss 4.5736 | norm 359.8973 +2025-06-25 05:46:53,691 | INFO | iter 000022 | lr 0.0014 | loss 4.8349 | norm 373.9760 +2025-06-25 05:46:58,799 | INFO | iter 000023 | lr 0.0014 | loss 4.8390 | norm 387.8593 +2025-06-25 05:47:04,367 | INFO | iter 000024 | lr 0.0014 | loss 4.6071 | norm 401.5493 +2025-06-25 05:47:09,593 | INFO | iter 000025 | lr 0.0014 | loss 4.8229 | norm 415.0486 +2025-06-25 05:47:14,751 | INFO | iter 000026 | lr 0.0014 | loss 4.6378 | norm 428.3594 +2025-06-25 05:47:19,821 | INFO | iter 000027 | lr 0.0014 | loss 4.6228 | norm 441.4847 +2025-06-25 05:47:24,895 | INFO | iter 000028 | lr 0.0014 | loss 4.9751 | norm 454.4274 +2025-06-25 05:47:30,528 | INFO | iter 000029 | lr 0.0014 | loss 4.8282 | norm 467.1905 +2025-06-25 05:47:35,961 | INFO | iter 000030 | lr 0.0014 | loss 5.0116 | norm 479.7771 +2025-06-25 05:47:41,409 | INFO | iter 000031 | lr 0.0014 | loss 5.0729 | norm 492.1905 +2025-06-25 05:47:46,555 | INFO | iter 000032 | lr 0.0014 | loss 5.0228 | norm 504.4338 +2025-06-25 05:47:51,877 | INFO | iter 000033 | lr 0.0014 | loss 5.1236 | norm 516.5103 +2025-06-25 05:47:57,014 | INFO | iter 000034 | lr 0.0014 | loss 4.8552 | norm 528.4233 +2025-06-25 05:48:02,608 | INFO | iter 000035 | lr 0.0014 | loss 5.4885 | norm 540.1762 +2025-06-25 05:48:07,748 | INFO | iter 000036 | lr 0.0014 | loss 5.4107 | norm 551.7723 +2025-06-25 05:48:12,707 | INFO | iter 000037 | lr 0.0014 | loss 5.7052 | norm 563.2150 +2025-06-25 05:48:17,970 | INFO | iter 000038 | lr 0.0014 | loss 5.4606 | norm 574.5076 +2025-06-25 05:48:23,145 | INFO | iter 000039 | lr 0.0014 | loss 5.2917 | norm 585.6534 +2025-06-25 05:48:28,398 | INFO | iter 000040 | lr 0.0014 | loss 5.5305 | norm 596.6557 +2025-06-25 05:48:33,822 | INFO | iter 000041 | lr 0.0014 | loss 4.8888 | norm 607.5177 +2025-06-25 05:48:39,061 | INFO | iter 000042 | lr 0.0014 | loss 5.6519 | norm 618.2427 +2025-06-25 05:48:44,089 | INFO | iter 000043 | lr 0.0014 | loss 5.5571 | norm 628.8338 +2025-06-25 05:48:49,294 | INFO | iter 000044 | lr 0.0014 | loss 5.3017 | norm 639.2940 +2025-06-25 05:48:54,335 | INFO | iter 000045 | lr 0.0014 | loss 5.7088 | norm 649.6263 +2025-06-25 05:48:59,520 | INFO | iter 000046 | lr 0.0014 | loss 5.8573 | norm 659.8338 +2025-06-25 05:49:05,181 | INFO | iter 000047 | lr 0.0014 | loss 5.7481 | norm 669.9192 +2025-06-25 05:49:10,172 | INFO | iter 000048 | lr 0.0014 | loss 5.8767 | norm 679.8854 +2025-06-25 05:49:15,104 | INFO | iter 000049 | lr 0.0014 | loss 5.1259 | norm 689.7351 +2025-06-25 05:49:15,104 | INFO | Completed LR test 3/10: lr=0.0014 +2025-06-25 05:49:15,127 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 05:49:15,127 | INFO | Starting LR test 4/10: lr=0.0016 +2025-06-25 05:49:15,127 | INFO | Starting EoS for LR factor 0.9000 +2025-06-25 05:49:15,127 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 05:49:15,127 | INFO | Starting EoS for model gpt2_small +2025-06-25 05:49:15,127 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 05:49:15,127 | INFO | Starting EoS for num_iterations 50 +2025-06-25 05:49:15,127 | INFO | Starting EoS for accum_steps 4 +2025-06-25 05:49:15,127 | INFO | Loading model and checkpoint... +2025-06-25 05:49:15,838 | INFO | Wrapping model with DDP... +2025-06-25 05:49:16,179 | INFO | Loading state dict... +2025-06-25 05:49:16,183 | INFO | Model loaded successfully! +2025-06-25 05:49:22,383 | INFO | iter 000000 | lr 0.0016 | loss 3.6876 | norm 20.6411 +2025-06-25 05:49:27,494 | INFO | iter 000001 | lr 0.0016 | loss 3.5716 | norm 40.9628 +2025-06-25 05:49:33,578 | INFO | iter 000002 | lr 0.0016 | loss 3.6157 | norm 61.1216 +2025-06-25 05:49:38,620 | INFO | iter 000003 | lr 0.0016 | loss 3.7629 | norm 81.0780 +2025-06-25 05:49:43,686 | INFO | iter 000004 | lr 0.0016 | loss 3.7846 | norm 100.8176 +2025-06-25 05:49:48,747 | INFO | iter 000005 | lr 0.0016 | loss 3.7971 | norm 120.3347 +2025-06-25 05:49:53,747 | INFO | iter 000006 | lr 0.0016 | loss 3.9829 | norm 139.6241 +2025-06-25 05:49:58,945 | INFO | iter 000007 | lr 0.0016 | loss 3.9620 | norm 158.6796 +2025-06-25 05:50:04,505 | INFO | iter 000008 | lr 0.0016 | loss 4.0909 | norm 177.4956 +2025-06-25 05:50:09,658 | INFO | iter 000009 | lr 0.0016 | loss 4.1380 | norm 196.0683 +2025-06-25 05:50:14,683 | INFO | iter 000010 | lr 0.0016 | loss 4.0397 | norm 214.3950 +2025-06-25 05:50:19,816 | INFO | iter 000011 | lr 0.0016 | loss 3.9675 | norm 232.4738 +2025-06-25 05:50:24,917 | INFO | iter 000012 | lr 0.0016 | loss 4.2959 | norm 250.3029 +2025-06-25 05:50:30,599 | INFO | iter 000013 | lr 0.0016 | loss 4.1695 | norm 267.8807 +2025-06-25 05:50:35,603 | INFO | iter 000014 | lr 0.0016 | loss 4.3111 | norm 285.2066 +2025-06-25 05:50:40,827 | INFO | iter 000015 | lr 0.0016 | loss 4.6379 | norm 302.2806 +2025-06-25 05:50:46,091 | INFO | iter 000016 | lr 0.0016 | loss 4.5418 | norm 319.1028 +2025-06-25 05:50:51,447 | INFO | iter 000017 | lr 0.0016 | loss 4.4444 | norm 335.6738 +2025-06-25 05:50:56,560 | INFO | iter 000018 | lr 0.0016 | loss 4.5090 | norm 351.9947 +2025-06-25 05:51:02,059 | INFO | iter 000019 | lr 0.0016 | loss 4.5233 | norm 368.0672 +2025-06-25 05:51:07,086 | INFO | iter 000020 | lr 0.0016 | loss 4.4959 | norm 383.8933 +2025-06-25 05:51:12,277 | INFO | iter 000021 | lr 0.0016 | loss 4.7354 | norm 399.4754 +2025-06-25 05:51:17,599 | INFO | iter 000022 | lr 0.0016 | loss 4.9572 | norm 414.8162 +2025-06-25 05:51:22,816 | INFO | iter 000023 | lr 0.0016 | loss 5.0554 | norm 429.9186 +2025-06-25 05:51:28,254 | INFO | iter 000024 | lr 0.0016 | loss 4.7812 | norm 444.7857 +2025-06-25 05:51:33,877 | INFO | iter 000025 | lr 0.0016 | loss 4.9812 | norm 459.4208 +2025-06-25 05:51:38,996 | INFO | iter 000026 | lr 0.0016 | loss 4.7977 | norm 473.8274 +2025-06-25 05:51:44,352 | INFO | iter 000027 | lr 0.0016 | loss 4.7787 | norm 488.0092 +2025-06-25 05:51:49,761 | INFO | iter 000028 | lr 0.0016 | loss 5.1998 | norm 501.9703 +2025-06-25 05:51:55,046 | INFO | iter 000029 | lr 0.0016 | loss 5.0280 | norm 515.7147 +2025-06-25 05:52:00,665 | INFO | iter 000030 | lr 0.0016 | loss 5.1958 | norm 529.2467 +2025-06-25 05:52:05,662 | INFO | iter 000031 | lr 0.0016 | loss 5.2719 | norm 542.5705 +2025-06-25 05:52:10,734 | INFO | iter 000032 | lr 0.0016 | loss 5.2195 | norm 555.6903 +2025-06-25 05:52:15,809 | INFO | iter 000033 | lr 0.0016 | loss 5.3781 | norm 568.6107 +2025-06-25 05:52:20,946 | INFO | iter 000034 | lr 0.0016 | loss 5.0383 | norm 581.3358 +2025-06-25 05:52:26,121 | INFO | iter 000035 | lr 0.0016 | loss 5.7508 | norm 593.8701 +2025-06-25 05:52:31,748 | INFO | iter 000036 | lr 0.0016 | loss 5.6478 | norm 606.2180 +2025-06-25 05:52:36,972 | INFO | iter 000037 | lr 0.0016 | loss 5.9527 | norm 618.3839 +2025-06-25 05:52:42,355 | INFO | iter 000038 | lr 0.0016 | loss 5.6528 | norm 630.3722 +2025-06-25 05:52:47,675 | INFO | iter 000039 | lr 0.0016 | loss 5.4899 | norm 642.1872 +2025-06-25 05:52:52,854 | INFO | iter 000040 | lr 0.0016 | loss 5.7534 | norm 653.8331 +2025-06-25 05:52:57,905 | INFO | iter 000041 | lr 0.0016 | loss 5.0604 | norm 665.3142 +2025-06-25 05:53:03,388 | INFO | iter 000042 | lr 0.0016 | loss 5.9295 | norm 676.6346 +2025-06-25 05:53:08,623 | INFO | iter 000043 | lr 0.0016 | loss 5.7979 | norm 687.7984 +2025-06-25 05:53:13,916 | INFO | iter 000044 | lr 0.0016 | loss 5.5369 | norm 698.8096 +2025-06-25 05:53:19,248 | INFO | iter 000045 | lr 0.0016 | loss 5.9652 | norm 709.6719 +2025-06-25 05:53:24,627 | INFO | iter 000046 | lr 0.0016 | loss 6.0899 | norm 720.3892 +2025-06-25 05:53:30,007 | INFO | iter 000047 | lr 0.0016 | loss 6.0334 | norm 730.9651 +2025-06-25 05:53:35,384 | INFO | iter 000048 | lr 0.0016 | loss 6.1324 | norm 741.4034 +2025-06-25 05:53:40,548 | INFO | iter 000049 | lr 0.0016 | loss 5.3401 | norm 751.7074 +2025-06-25 05:53:40,549 | INFO | Completed LR test 4/10: lr=0.0016 +2025-06-25 05:53:40,579 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 05:53:40,580 | INFO | Starting LR test 5/10: lr=0.0018 +2025-06-25 05:53:40,580 | INFO | Starting EoS for LR factor 1.0000 +2025-06-25 05:53:40,580 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 05:53:40,580 | INFO | Starting EoS for model gpt2_small +2025-06-25 05:53:40,580 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 05:53:40,580 | INFO | Starting EoS for num_iterations 50 +2025-06-25 05:53:40,580 | INFO | Starting EoS for accum_steps 4 +2025-06-25 05:53:40,580 | INFO | Loading model and checkpoint... +2025-06-25 05:53:41,270 | INFO | Wrapping model with DDP... +2025-06-25 05:53:41,650 | INFO | Loading state dict... +2025-06-25 05:53:41,653 | INFO | Model loaded successfully! +2025-06-25 05:53:47,958 | INFO | iter 000000 | lr 0.0018 | loss 3.6876 | norm 22.9263 +2025-06-25 05:53:53,039 | INFO | iter 000001 | lr 0.0018 | loss 3.5747 | norm 45.4740 +2025-06-25 05:53:58,328 | INFO | iter 000002 | lr 0.0018 | loss 3.6252 | norm 67.8180 +2025-06-25 05:54:03,897 | INFO | iter 000003 | lr 0.0018 | loss 3.7789 | norm 89.9128 +2025-06-25 05:54:08,955 | INFO | iter 000004 | lr 0.0018 | loss 3.8081 | norm 111.7409 +2025-06-25 05:54:14,093 | INFO | iter 000005 | lr 0.0018 | loss 3.8254 | norm 133.2964 +2025-06-25 05:54:19,498 | INFO | iter 000006 | lr 0.0018 | loss 4.0263 | norm 154.5736 +2025-06-25 05:54:24,625 | INFO | iter 000007 | lr 0.0018 | loss 4.0083 | norm 175.5650 +2025-06-25 05:54:30,300 | INFO | iter 000008 | lr 0.0018 | loss 4.1424 | norm 196.2643 +2025-06-25 05:54:35,636 | INFO | iter 000009 | lr 0.0018 | loss 4.1973 | norm 216.6673 +2025-06-25 05:54:40,921 | INFO | iter 000010 | lr 0.0018 | loss 4.1163 | norm 236.7712 +2025-06-25 05:54:46,048 | INFO | iter 000011 | lr 0.0018 | loss 4.0434 | norm 256.5742 +2025-06-25 05:54:51,400 | INFO | iter 000012 | lr 0.0018 | loss 4.3868 | norm 276.0744 +2025-06-25 05:54:56,503 | INFO | iter 000013 | lr 0.0018 | loss 4.2525 | norm 295.2705 +2025-06-25 05:55:02,238 | INFO | iter 000014 | lr 0.0018 | loss 4.4289 | norm 314.1622 +2025-06-25 05:55:07,532 | INFO | iter 000015 | lr 0.0018 | loss 4.7470 | norm 332.7499 +2025-06-25 05:55:12,639 | INFO | iter 000016 | lr 0.0018 | loss 4.6653 | norm 351.0343 +2025-06-25 05:55:17,801 | INFO | iter 000017 | lr 0.0018 | loss 4.5801 | norm 369.0165 +2025-06-25 05:55:22,922 | INFO | iter 000018 | lr 0.0018 | loss 4.6499 | norm 386.6985 +2025-06-25 05:55:28,023 | INFO | iter 000019 | lr 0.0018 | loss 4.6453 | norm 404.0825 +2025-06-25 05:55:33,721 | INFO | iter 000020 | lr 0.0018 | loss 4.6186 | norm 421.1716 +2025-06-25 05:55:38,680 | INFO | iter 000021 | lr 0.0018 | loss 4.8857 | norm 437.9693 +2025-06-25 05:55:43,961 | INFO | iter 000022 | lr 0.0018 | loss 5.0973 | norm 454.4791 +2025-06-25 05:55:49,091 | INFO | iter 000023 | lr 0.0018 | loss 5.2755 | norm 470.7052 +2025-06-25 05:55:54,224 | INFO | iter 000024 | lr 0.0018 | loss 4.9374 | norm 486.6516 +2025-06-25 05:55:59,987 | INFO | iter 000025 | lr 0.0018 | loss 5.1755 | norm 502.3227 +2025-06-25 05:56:05,519 | INFO | iter 000026 | lr 0.0018 | loss 4.9887 | norm 517.7234 +2025-06-25 05:56:10,497 | INFO | iter 000027 | lr 0.0018 | loss 4.9314 | norm 532.8585 +2025-06-25 05:56:15,715 | INFO | iter 000028 | lr 0.0018 | loss 5.3997 | norm 547.7333 +2025-06-25 05:56:20,871 | INFO | iter 000029 | lr 0.0018 | loss 5.1985 | norm 562.3531 +2025-06-25 05:56:26,149 | INFO | iter 000030 | lr 0.0018 | loss 5.3587 | norm 576.7233 +2025-06-25 05:56:31,899 | INFO | iter 000031 | lr 0.0018 | loss 5.4842 | norm 590.8496 +2025-06-25 05:56:37,096 | INFO | iter 000032 | lr 0.0018 | loss 5.4218 | norm 604.7374 +2025-06-25 05:56:42,378 | INFO | iter 000033 | lr 0.0018 | loss 5.5774 | norm 618.3925 +2025-06-25 05:56:47,635 | INFO | iter 000034 | lr 0.0018 | loss 5.2042 | norm 631.8203 +2025-06-25 05:56:52,745 | INFO | iter 000035 | lr 0.0018 | loss 6.0363 | norm 645.0265 +2025-06-25 05:56:58,068 | INFO | iter 000036 | lr 0.0018 | loss 5.8952 | norm 658.0167 +2025-06-25 05:57:03,613 | INFO | iter 000037 | lr 0.0018 | loss 6.2191 | norm 670.7966 +2025-06-25 05:57:08,695 | INFO | iter 000038 | lr 0.0018 | loss 5.8633 | norm 683.3716 +2025-06-25 05:57:13,857 | INFO | iter 000039 | lr 0.0018 | loss 5.6797 | norm 695.7473 +2025-06-25 05:57:18,927 | INFO | iter 000040 | lr 0.0018 | loss 5.9959 | norm 707.9289 +2025-06-25 05:57:24,051 | INFO | iter 000041 | lr 0.0018 | loss 5.2168 | norm 719.9219 +2025-06-25 05:57:29,317 | INFO | iter 000042 | lr 0.0018 | loss 6.1407 | norm 731.7314 +2025-06-25 05:57:34,877 | INFO | iter 000043 | lr 0.0018 | loss 6.0471 | norm 743.3626 +2025-06-25 05:57:40,102 | INFO | iter 000044 | lr 0.0018 | loss 5.7566 | norm 754.8203 +2025-06-25 05:57:45,152 | INFO | iter 000045 | lr 0.0018 | loss 6.1483 | norm 766.1095 +2025-06-25 05:57:50,342 | INFO | iter 000046 | lr 0.0018 | loss 6.3447 | norm 777.2347 +2025-06-25 05:57:55,654 | INFO | iter 000047 | lr 0.0018 | loss 6.2507 | norm 788.2008 +2025-06-25 05:58:01,245 | INFO | iter 000048 | lr 0.0018 | loss 6.3594 | norm 799.0120 +2025-06-25 05:58:06,311 | INFO | iter 000049 | lr 0.0018 | loss 5.4779 | norm 809.6727 +2025-06-25 05:58:06,311 | INFO | Completed LR test 5/10: lr=0.0018 +2025-06-25 05:58:06,326 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 05:58:06,326 | INFO | Starting LR test 6/10: lr=0.0020 +2025-06-25 05:58:06,326 | INFO | Starting EoS for LR factor 1.1000 +2025-06-25 05:58:06,327 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 05:58:06,327 | INFO | Starting EoS for model gpt2_small +2025-06-25 05:58:06,327 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 05:58:06,327 | INFO | Starting EoS for num_iterations 50 +2025-06-25 05:58:06,327 | INFO | Starting EoS for accum_steps 4 +2025-06-25 05:58:06,327 | INFO | Loading model and checkpoint... +2025-06-25 05:58:07,014 | INFO | Wrapping model with DDP... +2025-06-25 05:58:07,353 | INFO | Loading state dict... +2025-06-25 05:58:07,356 | INFO | Model loaded successfully! +2025-06-25 05:58:13,544 | INFO | iter 000000 | lr 0.0020 | loss 3.6876 | norm 25.2123 +2025-06-25 05:58:18,866 | INFO | iter 000001 | lr 0.0020 | loss 3.5779 | norm 49.9783 +2025-06-25 05:58:24,009 | INFO | iter 000002 | lr 0.0020 | loss 3.6351 | norm 74.4965 +2025-06-25 05:58:29,266 | INFO | iter 000003 | lr 0.0020 | loss 3.7960 | norm 98.7138 +2025-06-25 05:58:34,900 | INFO | iter 000004 | lr 0.0020 | loss 3.8337 | norm 122.6100 +2025-06-25 05:58:40,074 | INFO | iter 000005 | lr 0.0020 | loss 3.8578 | norm 146.1787 +2025-06-25 05:58:45,172 | INFO | iter 000006 | lr 0.0020 | loss 4.0691 | norm 169.4137 +2025-06-25 05:58:50,219 | INFO | iter 000007 | lr 0.0020 | loss 4.0526 | norm 192.3065 +2025-06-25 05:58:55,431 | INFO | iter 000008 | lr 0.0020 | loss 4.1982 | norm 214.8500 +2025-06-25 05:59:00,812 | INFO | iter 000009 | lr 0.0020 | loss 4.2537 | norm 237.0394 +2025-06-25 05:59:06,232 | INFO | iter 000010 | lr 0.0020 | loss 4.1896 | norm 258.8723 +2025-06-25 05:59:11,389 | INFO | iter 000011 | lr 0.0020 | loss 4.1233 | norm 280.3468 +2025-06-25 05:59:16,596 | INFO | iter 000012 | lr 0.0020 | loss 4.4756 | norm 301.4616 +2025-06-25 05:59:21,692 | INFO | iter 000013 | lr 0.0020 | loss 4.3328 | norm 322.2153 +2025-06-25 05:59:26,948 | INFO | iter 000014 | lr 0.0020 | loss 4.5445 | norm 342.6083 +2025-06-25 05:59:32,443 | INFO | iter 000015 | lr 0.0020 | loss 4.8651 | norm 362.6415 +2025-06-25 05:59:37,749 | INFO | iter 000016 | lr 0.0020 | loss 4.7906 | norm 382.3164 +2025-06-25 05:59:42,700 | INFO | iter 000017 | lr 0.0020 | loss 4.6993 | norm 401.6350 +2025-06-25 05:59:47,904 | INFO | iter 000018 | lr 0.0020 | loss 4.7672 | norm 420.5997 +2025-06-25 05:59:53,164 | INFO | iter 000019 | lr 0.0020 | loss 4.7585 | norm 439.2143 +2025-06-25 05:59:58,222 | INFO | iter 000020 | lr 0.0020 | loss 4.7307 | norm 457.4828 +2025-06-25 06:00:03,684 | INFO | iter 000021 | lr 0.0020 | loss 5.0488 | norm 475.4099 +2025-06-25 06:00:08,860 | INFO | iter 000022 | lr 0.0020 | loss 5.2354 | norm 493.0004 +2025-06-25 06:00:14,235 | INFO | iter 000023 | lr 0.0020 | loss 5.4842 | norm 510.2596 +2025-06-25 06:00:19,440 | INFO | iter 000024 | lr 0.0020 | loss 5.1194 | norm 527.1931 +2025-06-25 06:00:24,496 | INFO | iter 000025 | lr 0.0020 | loss 5.3493 | norm 543.8065 +2025-06-25 06:00:30,381 | INFO | iter 000026 | lr 0.0020 | loss 5.1178 | norm 560.1060 +2025-06-25 06:00:35,582 | INFO | iter 000027 | lr 0.0020 | loss 5.0840 | norm 576.0979 +2025-06-25 06:00:40,743 | INFO | iter 000028 | lr 0.0020 | loss 5.6218 | norm 591.7890 +2025-06-25 06:00:46,041 | INFO | iter 000029 | lr 0.0020 | loss 5.3631 | norm 607.1861 +2025-06-25 06:00:51,225 | INFO | iter 000030 | lr 0.0020 | loss 5.5315 | norm 622.2960 +2025-06-25 06:00:56,346 | INFO | iter 000031 | lr 0.0020 | loss 5.6890 | norm 637.1259 +2025-06-25 06:01:02,073 | INFO | iter 000032 | lr 0.0020 | loss 5.5865 | norm 651.6826 +2025-06-25 06:01:07,316 | INFO | iter 000033 | lr 0.0020 | loss 5.7409 | norm 665.9732 +2025-06-25 06:01:12,507 | INFO | iter 000034 | lr 0.0020 | loss 5.3656 | norm 680.0048 +2025-06-25 06:01:17,515 | INFO | iter 000035 | lr 0.0020 | loss 6.3080 | norm 693.7843 +2025-06-25 06:01:22,649 | INFO | iter 000036 | lr 0.0020 | loss 6.1109 | norm 707.3187 +2025-06-25 06:01:27,838 | INFO | iter 000037 | lr 0.0020 | loss 6.4594 | norm 720.6151 +2025-06-25 06:01:33,547 | INFO | iter 000038 | lr 0.0020 | loss 6.0626 | norm 733.6802 +2025-06-25 06:01:38,679 | INFO | iter 000039 | lr 0.0020 | loss 5.8983 | norm 746.5208 +2025-06-25 06:01:43,863 | INFO | iter 000040 | lr 0.0020 | loss 6.1886 | norm 759.1436 +2025-06-25 06:01:48,902 | INFO | iter 000041 | lr 0.0020 | loss 5.4046 | norm 771.5550 +2025-06-25 06:01:54,199 | INFO | iter 000042 | lr 0.0020 | loss 6.3809 | norm 783.7615 +2025-06-25 06:01:59,802 | INFO | iter 000043 | lr 0.0020 | loss 6.2593 | norm 795.7693 +2025-06-25 06:02:04,920 | INFO | iter 000044 | lr 0.0020 | loss 5.9433 | norm 807.5844 +2025-06-25 06:02:09,977 | INFO | iter 000045 | lr 0.0020 | loss 6.3769 | norm 819.2127 +2025-06-25 06:02:15,095 | INFO | iter 000046 | lr 0.0020 | loss 6.5365 | norm 830.6600 +2025-06-25 06:02:20,439 | INFO | iter 000047 | lr 0.0020 | loss 6.4452 | norm 841.9317 +2025-06-25 06:02:25,606 | INFO | iter 000048 | lr 0.0020 | loss 6.5175 | norm 853.0334 +2025-06-25 06:02:31,548 | INFO | iter 000049 | lr 0.0020 | loss 5.6289 | norm 863.9703 +2025-06-25 06:02:31,548 | INFO | Completed LR test 6/10: lr=0.0020 +2025-06-25 06:02:31,566 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:02:31,566 | INFO | Starting LR test 7/10: lr=0.0022 +2025-06-25 06:02:31,566 | INFO | Starting EoS for LR factor 1.2000 +2025-06-25 06:02:31,566 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 06:02:31,566 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:02:31,566 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:02:31,566 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:02:31,566 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:02:31,566 | INFO | Loading model and checkpoint... +2025-06-25 06:02:32,259 | INFO | Wrapping model with DDP... +2025-06-25 06:02:32,591 | INFO | Loading state dict... +2025-06-25 06:02:32,594 | INFO | Model loaded successfully! +2025-06-25 06:02:38,924 | INFO | iter 000000 | lr 0.0022 | loss 3.6876 | norm 27.4988 +2025-06-25 06:02:44,220 | INFO | iter 000001 | lr 0.0022 | loss 3.5816 | norm 54.4755 +2025-06-25 06:02:49,456 | INFO | iter 000002 | lr 0.0022 | loss 3.6457 | norm 81.1569 +2025-06-25 06:02:54,513 | INFO | iter 000003 | lr 0.0022 | loss 3.8138 | norm 107.4811 +2025-06-25 06:02:59,811 | INFO | iter 000004 | lr 0.0022 | loss 3.8571 | norm 133.4251 +2025-06-25 06:03:05,250 | INFO | iter 000005 | lr 0.0022 | loss 3.8876 | norm 158.9821 +2025-06-25 06:03:10,559 | INFO | iter 000006 | lr 0.0022 | loss 4.1145 | norm 184.1454 +2025-06-25 06:03:15,729 | INFO | iter 000007 | lr 0.0022 | loss 4.0991 | norm 208.9055 +2025-06-25 06:03:20,970 | INFO | iter 000008 | lr 0.0022 | loss 4.2516 | norm 233.2544 +2025-06-25 06:03:26,170 | INFO | iter 000009 | lr 0.0022 | loss 4.3180 | norm 257.1872 +2025-06-25 06:03:31,967 | INFO | iter 000010 | lr 0.0022 | loss 4.2748 | norm 280.7017 +2025-06-25 06:03:37,096 | INFO | iter 000011 | lr 0.0022 | loss 4.2000 | norm 303.7964 +2025-06-25 06:03:42,276 | INFO | iter 000012 | lr 0.0022 | loss 4.5692 | norm 326.4702 +2025-06-25 06:03:47,622 | INFO | iter 000013 | lr 0.0022 | loss 4.4204 | norm 348.7223 +2025-06-25 06:03:52,647 | INFO | iter 000014 | lr 0.0022 | loss 4.6603 | norm 370.5538 +2025-06-25 06:03:57,845 | INFO | iter 000015 | lr 0.0022 | loss 4.9774 | norm 391.9663 +2025-06-25 06:04:03,363 | INFO | iter 000016 | lr 0.0022 | loss 4.9215 | norm 412.9624 +2025-06-25 06:04:08,447 | INFO | iter 000017 | lr 0.0022 | loss 4.8513 | norm 433.5448 +2025-06-25 06:04:13,412 | INFO | iter 000018 | lr 0.0022 | loss 4.9012 | norm 453.7173 +2025-06-25 06:04:18,556 | INFO | iter 000019 | lr 0.0022 | loss 4.8747 | norm 473.4848 +2025-06-25 06:04:23,868 | INFO | iter 000020 | lr 0.0022 | loss 4.8614 | norm 492.8527 +2025-06-25 06:04:29,288 | INFO | iter 000021 | lr 0.0022 | loss 5.2076 | norm 511.8271 +2025-06-25 06:04:34,599 | INFO | iter 000022 | lr 0.0022 | loss 5.3604 | norm 530.4143 +2025-06-25 06:04:39,952 | INFO | iter 000023 | lr 0.0022 | loss 5.7092 | norm 548.6212 +2025-06-25 06:04:45,030 | INFO | iter 000024 | lr 0.0022 | loss 5.2997 | norm 566.4548 +2025-06-25 06:04:49,964 | INFO | iter 000025 | lr 0.0022 | loss 5.4966 | norm 583.9225 +2025-06-25 06:04:55,124 | INFO | iter 000026 | lr 0.0022 | loss 5.2925 | norm 601.0318 +2025-06-25 06:05:00,430 | INFO | iter 000027 | lr 0.0022 | loss 5.2346 | norm 617.7909 +2025-06-25 06:05:05,872 | INFO | iter 000028 | lr 0.0022 | loss 5.8256 | norm 634.2081 +2025-06-25 06:05:11,076 | INFO | iter 000029 | lr 0.0022 | loss 5.5557 | norm 650.2919 +2025-06-25 06:05:16,271 | INFO | iter 000030 | lr 0.0022 | loss 5.6963 | norm 666.0509 +2025-06-25 06:05:21,464 | INFO | iter 000031 | lr 0.0022 | loss 5.9050 | norm 681.4938 +2025-06-25 06:05:26,686 | INFO | iter 000032 | lr 0.0022 | loss 5.8065 | norm 696.6291 +2025-06-25 06:05:32,275 | INFO | iter 000033 | lr 0.0022 | loss 6.0051 | norm 711.4657 +2025-06-25 06:05:37,544 | INFO | iter 000034 | lr 0.0022 | loss 5.5032 | norm 726.0119 +2025-06-25 06:05:42,775 | INFO | iter 000035 | lr 0.0022 | loss 6.5643 | norm 740.2764 +2025-06-25 06:05:48,005 | INFO | iter 000036 | lr 0.0022 | loss 6.3361 | norm 754.2676 +2025-06-25 06:05:53,224 | INFO | iter 000037 | lr 0.0022 | loss 6.6566 | norm 767.9942 +2025-06-25 06:05:58,432 | INFO | iter 000038 | lr 0.0022 | loss 6.2322 | norm 781.4643 +2025-06-25 06:06:04,272 | INFO | iter 000039 | lr 0.0022 | loss 6.0644 | norm 794.6861 +2025-06-25 06:06:09,583 | INFO | iter 000040 | lr 0.0022 | loss 6.3769 | norm 807.6677 +2025-06-25 06:06:14,860 | INFO | iter 000041 | lr 0.0022 | loss 5.4851 | norm 820.4168 +2025-06-25 06:06:20,186 | INFO | iter 000042 | lr 0.0022 | loss 6.6002 | norm 832.9411 +2025-06-25 06:06:25,378 | INFO | iter 000043 | lr 0.0022 | loss 6.4176 | norm 845.2482 +2025-06-25 06:06:30,818 | INFO | iter 000044 | lr 0.0022 | loss 6.1134 | norm 857.3452 +2025-06-25 06:06:35,947 | INFO | iter 000045 | lr 0.0022 | loss 6.5890 | norm 869.2391 +2025-06-25 06:06:40,987 | INFO | iter 000046 | lr 0.0022 | loss 6.7593 | norm 880.9367 +2025-06-25 06:06:46,195 | INFO | iter 000047 | lr 0.0022 | loss 6.6622 | norm 892.4446 +2025-06-25 06:06:51,454 | INFO | iter 000048 | lr 0.0022 | loss 6.6858 | norm 903.7693 +2025-06-25 06:06:56,659 | INFO | iter 000049 | lr 0.0022 | loss 5.7619 | norm 914.9168 +2025-06-25 06:06:56,660 | INFO | Completed LR test 7/10: lr=0.0022 +2025-06-25 06:06:56,681 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:06:56,681 | INFO | Starting LR test 8/10: lr=0.0023 +2025-06-25 06:06:56,681 | INFO | Starting EoS for LR factor 1.3000 +2025-06-25 06:06:56,681 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 06:06:56,681 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:06:56,681 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:06:56,682 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:06:56,682 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:06:56,682 | INFO | Loading model and checkpoint... +2025-06-25 06:06:57,381 | INFO | Wrapping model with DDP... +2025-06-25 06:06:57,664 | INFO | Loading state dict... +2025-06-25 06:06:57,668 | INFO | Model loaded successfully! +2025-06-25 06:07:04,307 | INFO | iter 000000 | lr 0.0023 | loss 3.6876 | norm 29.7858 +2025-06-25 06:07:09,325 | INFO | iter 000001 | lr 0.0023 | loss 3.5855 | norm 58.9657 +2025-06-25 06:07:14,299 | INFO | iter 000002 | lr 0.0023 | loss 3.6563 | norm 87.7992 +2025-06-25 06:07:19,266 | INFO | iter 000003 | lr 0.0023 | loss 3.8318 | norm 116.2150 +2025-06-25 06:07:24,351 | INFO | iter 000004 | lr 0.0023 | loss 3.8853 | norm 144.1865 +2025-06-25 06:07:29,939 | INFO | iter 000005 | lr 0.0023 | loss 3.9210 | norm 171.7070 +2025-06-25 06:07:35,284 | INFO | iter 000006 | lr 0.0023 | loss 4.1628 | norm 198.7692 +2025-06-25 06:07:40,587 | INFO | iter 000007 | lr 0.0023 | loss 4.1507 | norm 225.3628 +2025-06-25 06:07:45,771 | INFO | iter 000008 | lr 0.0023 | loss 4.3096 | norm 251.4790 +2025-06-25 06:07:50,939 | INFO | iter 000009 | lr 0.0023 | loss 4.3805 | norm 277.1128 +2025-06-25 06:07:56,136 | INFO | iter 000010 | lr 0.0023 | loss 4.3467 | norm 302.2623 +2025-06-25 06:08:01,650 | INFO | iter 000011 | lr 0.0023 | loss 4.2815 | norm 326.9267 +2025-06-25 06:08:06,740 | INFO | iter 000012 | lr 0.0023 | loss 4.6548 | norm 351.1054 +2025-06-25 06:08:11,812 | INFO | iter 000013 | lr 0.0023 | loss 4.5047 | norm 374.7983 +2025-06-25 06:08:17,126 | INFO | iter 000014 | lr 0.0023 | loss 4.7994 | norm 398.0073 +2025-06-25 06:08:22,168 | INFO | iter 000015 | lr 0.0023 | loss 5.0968 | norm 420.7350 +2025-06-25 06:08:27,247 | INFO | iter 000016 | lr 0.0023 | loss 5.0512 | norm 442.9852 +2025-06-25 06:08:32,847 | INFO | iter 000017 | lr 0.0023 | loss 4.9901 | norm 464.7617 +2025-06-25 06:08:37,944 | INFO | iter 000018 | lr 0.0023 | loss 5.0242 | norm 486.0699 +2025-06-25 06:08:43,006 | INFO | iter 000019 | lr 0.0023 | loss 4.9823 | norm 506.9158 +2025-06-25 06:08:48,089 | INFO | iter 000020 | lr 0.0023 | loss 4.9903 | norm 527.3068 +2025-06-25 06:08:53,055 | INFO | iter 000021 | lr 0.0023 | loss 5.3445 | norm 547.2504 +2025-06-25 06:08:58,067 | INFO | iter 000022 | lr 0.0023 | loss 5.4917 | norm 566.7548 +2025-06-25 06:09:03,752 | INFO | iter 000023 | lr 0.0023 | loss 5.9406 | norm 585.8286 +2025-06-25 06:09:08,966 | INFO | iter 000024 | lr 0.0023 | loss 5.4620 | norm 604.4806 +2025-06-25 06:09:13,890 | INFO | iter 000025 | lr 0.0023 | loss 5.7073 | norm 622.7200 +2025-06-25 06:09:19,064 | INFO | iter 000026 | lr 0.0023 | loss 5.4810 | norm 640.5562 +2025-06-25 06:09:24,103 | INFO | iter 000027 | lr 0.0023 | loss 5.3813 | norm 657.9991 +2025-06-25 06:09:29,610 | INFO | iter 000028 | lr 0.0023 | loss 6.0298 | norm 675.0590 +2025-06-25 06:09:34,627 | INFO | iter 000029 | lr 0.0023 | loss 5.7523 | norm 691.7463 +2025-06-25 06:09:39,706 | INFO | iter 000030 | lr 0.0023 | loss 5.8543 | norm 708.0714 +2025-06-25 06:09:44,756 | INFO | iter 000031 | lr 0.0023 | loss 6.0789 | norm 724.0448 +2025-06-25 06:09:49,863 | INFO | iter 000032 | lr 0.0023 | loss 5.9534 | norm 739.6771 +2025-06-25 06:09:54,951 | INFO | iter 000033 | lr 0.0023 | loss 6.1380 | norm 754.9785 +2025-06-25 06:10:00,358 | INFO | iter 000034 | lr 0.0023 | loss 5.6740 | norm 769.9594 +2025-06-25 06:10:05,430 | INFO | iter 000035 | lr 0.0023 | loss 6.7911 | norm 784.6301 +2025-06-25 06:10:10,341 | INFO | iter 000036 | lr 0.0023 | loss 6.5128 | norm 799.0007 +2025-06-25 06:10:15,590 | INFO | iter 000037 | lr 0.0023 | loss 6.8682 | norm 813.0815 +2025-06-25 06:10:20,823 | INFO | iter 000038 | lr 0.0023 | loss 6.3677 | norm 826.8822 +2025-06-25 06:10:25,968 | INFO | iter 000039 | lr 0.0023 | loss 6.1648 | norm 840.4125 +2025-06-25 06:10:31,768 | INFO | iter 000040 | lr 0.0023 | loss 6.5954 | norm 853.6819 +2025-06-25 06:10:37,016 | INFO | iter 000041 | lr 0.0023 | loss 5.6618 | norm 866.6996 +2025-06-25 06:10:42,164 | INFO | iter 000042 | lr 0.0023 | loss 6.7585 | norm 879.4748 +2025-06-25 06:10:47,336 | INFO | iter 000043 | lr 0.0023 | loss 6.5646 | norm 892.0160 +2025-06-25 06:10:52,421 | INFO | iter 000044 | lr 0.0023 | loss 6.2502 | norm 904.3319 +2025-06-25 06:10:57,760 | INFO | iter 000045 | lr 0.0023 | loss 6.7723 | norm 916.4305 +2025-06-25 06:11:03,658 | INFO | iter 000046 | lr 0.0023 | loss 6.9262 | norm 928.3196 +2025-06-25 06:11:08,849 | INFO | iter 000047 | lr 0.0023 | loss 6.8335 | norm 940.0070 +2025-06-25 06:11:14,029 | INFO | iter 000048 | lr 0.0023 | loss 6.8953 | norm 951.5001 +2025-06-25 06:11:19,142 | INFO | iter 000049 | lr 0.0023 | loss 5.8852 | norm 962.8060 +2025-06-25 06:11:19,143 | INFO | Completed LR test 8/10: lr=0.0023 +2025-06-25 06:11:19,174 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:11:19,174 | INFO | Starting LR test 9/10: lr=0.0025 +2025-06-25 06:11:19,174 | INFO | Starting EoS for LR factor 1.4000 +2025-06-25 06:11:19,174 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 06:11:19,174 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:11:19,175 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:11:19,175 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:11:19,175 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:11:19,175 | INFO | Loading model and checkpoint... +2025-06-25 06:11:19,871 | INFO | Wrapping model with DDP... +2025-06-25 06:11:20,178 | INFO | Loading state dict... +2025-06-25 06:11:20,181 | INFO | Model loaded successfully! +2025-06-25 06:11:26,593 | INFO | iter 000000 | lr 0.0025 | loss 3.6876 | norm 32.0730 +2025-06-25 06:11:32,036 | INFO | iter 000001 | lr 0.0025 | loss 3.5896 | norm 63.4489 +2025-06-25 06:11:37,229 | INFO | iter 000002 | lr 0.0025 | loss 3.6682 | norm 94.4235 +2025-06-25 06:11:42,390 | INFO | iter 000003 | lr 0.0025 | loss 3.8500 | norm 124.9154 +2025-06-25 06:11:47,646 | INFO | iter 000004 | lr 0.0025 | loss 3.9122 | norm 154.8944 +2025-06-25 06:11:52,952 | INFO | iter 000005 | lr 0.0025 | loss 3.9551 | norm 184.3538 +2025-06-25 06:11:57,954 | INFO | iter 000006 | lr 0.0025 | loss 4.2136 | norm 213.2859 +2025-06-25 06:12:03,477 | INFO | iter 000007 | lr 0.0025 | loss 4.1950 | norm 241.6798 +2025-06-25 06:12:08,505 | INFO | iter 000008 | lr 0.0025 | loss 4.3723 | norm 269.5256 +2025-06-25 06:12:13,681 | INFO | iter 000009 | lr 0.0025 | loss 4.4440 | norm 296.8186 +2025-06-25 06:12:18,787 | INFO | iter 000010 | lr 0.0025 | loss 4.4323 | norm 323.5575 +2025-06-25 06:12:23,836 | INFO | iter 000011 | lr 0.0025 | loss 4.3655 | norm 349.7422 +2025-06-25 06:12:29,157 | INFO | iter 000012 | lr 0.0025 | loss 4.7546 | norm 375.3729 +2025-06-25 06:12:34,519 | INFO | iter 000013 | lr 0.0025 | loss 4.5906 | norm 400.4503 +2025-06-25 06:12:39,712 | INFO | iter 000014 | lr 0.0025 | loss 4.9124 | norm 424.9773 +2025-06-25 06:12:45,174 | INFO | iter 000015 | lr 0.0025 | loss 5.2241 | norm 448.9581 +2025-06-25 06:12:50,358 | INFO | iter 000016 | lr 0.0025 | loss 5.1840 | norm 472.3975 +2025-06-25 06:12:55,418 | INFO | iter 000017 | lr 0.0025 | loss 5.1327 | norm 495.3010 +2025-06-25 06:13:00,485 | INFO | iter 000018 | lr 0.0025 | loss 5.1446 | norm 517.6753 +2025-06-25 06:13:06,131 | INFO | iter 000019 | lr 0.0025 | loss 5.1064 | norm 539.5286 +2025-06-25 06:13:11,225 | INFO | iter 000020 | lr 0.0025 | loss 5.1112 | norm 560.8697 +2025-06-25 06:13:16,220 | INFO | iter 000021 | lr 0.0025 | loss 5.5130 | norm 581.7085 +2025-06-25 06:13:21,342 | INFO | iter 000022 | lr 0.0025 | loss 5.6241 | norm 602.0549 +2025-06-25 06:13:26,451 | INFO | iter 000023 | lr 0.0025 | loss 6.1646 | norm 621.9195 +2025-06-25 06:13:32,022 | INFO | iter 000024 | lr 0.0025 | loss 5.6253 | norm 641.3132 +2025-06-25 06:13:37,001 | INFO | iter 000025 | lr 0.0025 | loss 5.8789 | norm 660.2472 +2025-06-25 06:13:42,161 | INFO | iter 000026 | lr 0.0025 | loss 5.5988 | norm 678.7329 +2025-06-25 06:13:47,209 | INFO | iter 000027 | lr 0.0025 | loss 5.4927 | norm 696.7825 +2025-06-25 06:13:52,260 | INFO | iter 000028 | lr 0.0025 | loss 6.2231 | norm 714.4082 +2025-06-25 06:13:57,457 | INFO | iter 000029 | lr 0.0025 | loss 5.9250 | norm 731.6225 +2025-06-25 06:14:03,013 | INFO | iter 000030 | lr 0.0025 | loss 5.9891 | norm 748.4380 +2025-06-25 06:14:08,128 | INFO | iter 000031 | lr 0.0025 | loss 6.2910 | norm 764.8672 +2025-06-25 06:14:13,228 | INFO | iter 000032 | lr 0.0025 | loss 6.1202 | norm 780.9225 +2025-06-25 06:14:18,498 | INFO | iter 000033 | lr 0.0025 | loss 6.2973 | norm 796.6163 +2025-06-25 06:14:23,706 | INFO | iter 000034 | lr 0.0025 | loss 5.7886 | norm 811.9608 +2025-06-25 06:14:28,852 | INFO | iter 000035 | lr 0.0025 | loss 6.9882 | norm 826.9680 +2025-06-25 06:14:34,397 | INFO | iter 000036 | lr 0.0025 | loss 6.7193 | norm 841.6499 +2025-06-25 06:14:39,459 | INFO | iter 000037 | lr 0.0025 | loss 7.0613 | norm 856.0185 +2025-06-25 06:14:44,530 | INFO | iter 000038 | lr 0.0025 | loss 6.5425 | norm 870.0853 +2025-06-25 06:14:49,627 | INFO | iter 000039 | lr 0.0025 | loss 6.3130 | norm 883.8614 +2025-06-25 06:14:54,817 | INFO | iter 000040 | lr 0.0025 | loss 6.7243 | norm 897.3581 +2025-06-25 06:15:00,128 | INFO | iter 000041 | lr 0.0025 | loss 5.7344 | norm 910.5860 +2025-06-25 06:15:05,761 | INFO | iter 000042 | lr 0.0025 | loss 6.9161 | norm 923.5555 +2025-06-25 06:15:10,962 | INFO | iter 000043 | lr 0.0025 | loss 6.8284 | norm 936.2766 +2025-06-25 06:15:16,242 | INFO | iter 000044 | lr 0.0025 | loss 6.4281 | norm 948.7593 +2025-06-25 06:15:21,392 | INFO | iter 000045 | lr 0.0025 | loss 6.8733 | norm 961.0127 +2025-06-25 06:15:26,535 | INFO | iter 000046 | lr 0.0025 | loss 7.0664 | norm 973.0458 +2025-06-25 06:15:31,984 | INFO | iter 000047 | lr 0.0025 | loss 6.9932 | norm 984.8674 +2025-06-25 06:15:37,095 | INFO | iter 000048 | lr 0.0025 | loss 7.0181 | norm 996.4859 +2025-06-25 06:15:42,239 | INFO | iter 000049 | lr 0.0025 | loss 6.0350 | norm 1007.9093 +2025-06-25 06:15:42,239 | INFO | Completed LR test 9/10: lr=0.0025 +2025-06-25 06:15:42,268 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 06:15:42,268 | INFO | Starting LR test 10/10: lr=0.0027 +2025-06-25 06:15:42,268 | INFO | Starting EoS for LR factor 1.5000 +2025-06-25 06:15:42,268 | INFO | Starting EoS for checkpoint 007000 +2025-06-25 06:15:42,268 | INFO | Starting EoS for model gpt2_small +2025-06-25 06:15:42,268 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 06:15:42,268 | INFO | Starting EoS for num_iterations 50 +2025-06-25 06:15:42,268 | INFO | Starting EoS for accum_steps 4 +2025-06-25 06:15:42,268 | INFO | Loading model and checkpoint... +2025-06-25 06:15:42,968 | INFO | Wrapping model with DDP... +2025-06-25 06:15:43,317 | INFO | Loading state dict... +2025-06-25 06:15:43,321 | INFO | Model loaded successfully! +2025-06-25 06:15:49,488 | INFO | iter 000000 | lr 0.0027 | loss 3.6876 | norm 34.3605 +2025-06-25 06:15:54,389 | INFO | iter 000001 | lr 0.0027 | loss 3.5943 | norm 67.9249 +2025-06-25 06:15:59,699 | INFO | iter 000002 | lr 0.0027 | loss 3.6800 | norm 101.0297 +2025-06-25 06:16:05,013 | INFO | iter 000003 | lr 0.0027 | loss 3.8703 | norm 133.5825 +2025-06-25 06:16:10,313 | INFO | iter 000004 | lr 0.0027 | loss 3.9398 | norm 165.5491 +2025-06-25 06:16:15,323 | INFO | iter 000005 | lr 0.0027 | loss 3.9873 | norm 196.9229 +2025-06-25 06:16:20,431 | INFO | iter 000006 | lr 0.0027 | loss 4.2650 | norm 227.6963 +2025-06-25 06:16:25,491 | INFO | iter 000007 | lr 0.0027 | loss 4.2481 | norm 257.8574 +2025-06-25 06:16:31,169 | INFO | iter 000008 | lr 0.0027 | loss 4.4322 | norm 287.3958 +2025-06-25 06:16:36,127 | INFO | iter 000009 | lr 0.0027 | loss 4.5136 | norm 316.3069 +2025-06-25 06:16:41,329 | INFO | iter 000010 | lr 0.0027 | loss 4.5131 | norm 344.5903 +2025-06-25 06:16:46,561 | INFO | iter 000011 | lr 0.0027 | loss 4.4507 | norm 372.2470 +2025-06-25 06:16:51,600 | INFO | iter 000012 | lr 0.0027 | loss 4.8472 | norm 399.2780 +2025-06-25 06:16:56,803 | INFO | iter 000013 | lr 0.0027 | loss 4.6747 | norm 425.6851 +2025-06-25 06:17:02,481 | INFO | iter 000014 | lr 0.0027 | loss 5.0471 | norm 451.4725 +2025-06-25 06:17:07,499 | INFO | iter 000015 | lr 0.0027 | loss 5.3386 | norm 476.6461 +2025-06-25 06:17:12,469 | INFO | iter 000016 | lr 0.0027 | loss 5.3129 | norm 501.2121 +2025-06-25 06:17:17,681 | INFO | iter 000017 | lr 0.0027 | loss 5.2700 | norm 525.1778 +2025-06-25 06:17:22,697 | INFO | iter 000018 | lr 0.0027 | loss 5.2826 | norm 548.5519 +2025-06-25 06:17:27,938 | INFO | iter 000019 | lr 0.0027 | loss 5.2451 | norm 571.3443 +2025-06-25 06:17:33,339 | INFO | iter 000020 | lr 0.0027 | loss 5.2423 | norm 593.5663 +2025-06-25 06:17:38,686 | INFO | iter 000021 | lr 0.0027 | loss 5.6845 | norm 615.2298 +2025-06-25 06:17:43,870 | INFO | iter 000022 | lr 0.0027 | loss 5.7502 | norm 636.3470 +2025-06-25 06:17:48,833 | INFO | iter 000023 | lr 0.0027 | loss 6.3552 | norm 656.9308 +2025-06-25 06:17:54,207 | INFO | iter 000024 | lr 0.0027 | loss 5.7950 | norm 676.9943 +2025-06-25 06:17:59,920 | INFO | iter 000025 | lr 0.0027 | loss 6.0212 | norm 696.5510 +2025-06-25 06:18:05,173 | INFO | iter 000026 | lr 0.0027 | loss 5.7221 | norm 715.6146 +2025-06-25 06:18:10,751 | INFO | iter 000027 | lr 0.0027 | loss 5.6325 | norm 734.1994 +2025-06-25 06:18:15,764 | INFO | iter 000028 | lr 0.0027 | loss 6.3907 | norm 752.3203 +2025-06-25 06:18:20,879 | INFO | iter 000029 | lr 0.0027 | loss 6.0694 | norm 769.9918 +2025-06-25 06:18:26,035 | INFO | iter 000030 | lr 0.0027 | loss 6.1491 | norm 787.2289 +2025-06-25 06:18:31,673 | INFO | iter 000031 | lr 0.0027 | loss 6.4236 | norm 804.0462 +2025-06-25 06:18:36,833 | INFO | iter 000032 | lr 0.0027 | loss 6.2595 | norm 820.4583 +2025-06-25 06:18:41,950 | INFO | iter 000033 | lr 0.0027 | loss 6.5047 | norm 836.4797 +2025-06-25 06:18:47,129 | INFO | iter 000034 | lr 0.0027 | loss 5.9256 | norm 852.1246 +2025-06-25 06:18:52,172 | INFO | iter 000035 | lr 0.0027 | loss 7.2157 | norm 867.4069 +2025-06-25 06:18:57,259 | INFO | iter 000036 | lr 0.0027 | loss 6.8648 | norm 882.3406 +2025-06-25 06:19:02,908 | INFO | iter 000037 | lr 0.0027 | loss 7.2547 | norm 896.9395 +2025-06-25 06:19:08,076 | INFO | iter 000038 | lr 0.0027 | loss 6.6690 | norm 911.2167 +2025-06-25 06:19:13,095 | INFO | iter 000039 | lr 0.0027 | loss 6.4529 | norm 925.1854 +2025-06-25 06:19:18,346 | INFO | iter 000040 | lr 0.0027 | loss 6.8312 | norm 938.8581 +2025-06-25 06:19:23,338 | INFO | iter 000041 | lr 0.0027 | loss 5.8287 | norm 952.2471 +2025-06-25 06:19:28,610 | INFO | iter 000042 | lr 0.0027 | loss 7.0746 | norm 965.3642 +2025-06-25 06:19:33,936 | INFO | iter 000043 | lr 0.0027 | loss 7.0014 | norm 978.2209 +2025-06-25 06:19:39,018 | INFO | iter 000044 | lr 0.0027 | loss 6.5323 | norm 990.8282 +2025-06-25 06:19:44,171 | INFO | iter 000045 | lr 0.0027 | loss 7.0292 | norm 1003.1965 +2025-06-25 06:19:49,407 | INFO | iter 000046 | lr 0.0027 | loss 7.2137 | norm 1015.3359 +2025-06-25 06:19:54,614 | INFO | iter 000047 | lr 0.0027 | loss 7.1640 | norm 1027.2564 +2025-06-25 06:19:59,877 | INFO | iter 000048 | lr 0.0027 | loss 7.1574 | norm 1038.9671 +2025-06-25 06:20:05,133 | INFO | iter 000049 | lr 0.0027 | loss 6.1115 | norm 1050.4771 +2025-06-25 06:20:05,133 | INFO | Completed LR test 10/10: lr=0.0027 +2025-06-25 06:20:05,458 | INFO | Cleanup complete diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/config.json b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..964a39d3f5ab39acc0f83d8cb7bee0ac6334026b --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/config.json @@ -0,0 +1,14 @@ +{ + "model_name": "gpt2_small", + "factor_min": 0.6, + "factor_max": 1.5, + "factor_num": 10, + "error": 0.0001, + "accum_steps": 4, + "num_iterations": 50, + "num_checkpoint": 2000, + "input_bin": "data/fineweb/fineweb10B/fineweb_train_*.bin", + "run_settings": "lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536", + "timestamp": "250622_035242", + "raw": false +} \ No newline at end of file diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/losses_lr.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/losses_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..5af9b582139af7721ed01a43d6dfe2fecd98b4a6 Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/losses_lr.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..e86309996841b648324304730566a3d5c8d7dbf4 Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr_iter.png b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr_iter.png new file mode 100644 index 0000000000000000000000000000000000000000..898b3c1178ec720594f2c4f3750bb2cf300de921 Binary files /dev/null and b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/norms_lr_iter.png differ diff --git a/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/training.log b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/training.log new file mode 100644 index 0000000000000000000000000000000000000000..4b51d0f571ccd1d6075fd3f8d99009aff6c7d311 --- /dev/null +++ b/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/eos/step_010000/training.log @@ -0,0 +1,386 @@ +2025-06-25 08:05:01,878 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt +2025-06-25 08:05:03,791 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:05:03,792 | INFO | Current learning rate: 0.0018 +2025-06-25 08:05:04,404 | INFO | Weight decay: 0.1 +2025-06-25 08:05:04,404 | INFO | Epsilon: 1e-08 +2025-06-25 08:05:04,404 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:05:04,404 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:05:04,404 | INFO | Optimizer state loading completed! +2025-06-25 08:05:06,318 | INFO | Initialized xs with norm: 1.273501 +2025-06-25 08:05:06,326 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:05:06,326 | INFO | Starting LR test 1/10: lr=0.1800 +2025-06-25 08:05:06,326 | INFO | Starting EoS for LR factor 100.0000 +2025-06-25 08:05:06,326 | INFO | Starting EoS for checkpoint 010000 +2025-06-25 08:05:06,326 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:05:06,326 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:05:06,326 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:05:06,326 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:05:06,326 | INFO | Loading model and checkpoint... +2025-06-25 08:05:07,207 | INFO | Wrapping model with DDP... +2025-06-25 08:05:07,346 | INFO | Loading state dict... +2025-06-25 08:05:07,350 | INFO | Model loaded successfully! +2025-06-25 08:05:14,155 | INFO | iter 000000 | lr 0.1800 | loss 3.7162 | norm 1035.2508 +2025-06-25 08:05:19,180 | INFO | iter 000001 | lr 0.1800 | loss 1022.7610 | norm 1854.6921 +2025-06-25 08:05:24,315 | INFO | iter 000002 | lr 0.1800 | loss 12916.4541 | norm 2262.8654 +2025-06-25 08:05:29,639 | INFO | iter 000003 | lr 0.1800 | loss 2381.4231 | norm 2784.9186 +2025-06-25 08:05:35,017 | INFO | iter 000004 | lr 0.1800 | loss 5663.5991 | norm 3258.5670 +2025-06-25 08:05:40,197 | INFO | iter 000005 | lr 0.1800 | loss 6263.7207 | norm 3659.7969 +2025-06-25 08:05:45,423 | INFO | iter 000006 | lr 0.1800 | loss 3300.3076 | norm 4042.1641 +2025-06-25 08:05:50,614 | INFO | iter 000007 | lr 0.1800 | loss 2205.2788 | norm 4403.1766 +2025-06-25 08:05:55,705 | INFO | iter 000008 | lr 0.1800 | loss 3169.1311 | norm 4721.1078 +2025-06-25 08:06:01,395 | INFO | iter 000009 | lr 0.1800 | loss 2604.3735 | norm 5007.2289 +2025-06-25 08:06:06,537 | INFO | iter 000010 | lr 0.1800 | loss 1049.3635 | norm 5277.7821 +2025-06-25 08:06:11,635 | INFO | iter 000011 | lr 0.1800 | loss -534.1947 | norm 5542.2629 +2025-06-25 08:06:16,925 | INFO | iter 000012 | lr 0.1800 | loss -1829.3038 | norm 5804.4541 +2025-06-25 08:06:22,270 | INFO | iter 000013 | lr 0.1800 | loss -3147.5239 | norm 6068.7285 +2025-06-25 08:06:27,494 | INFO | iter 000014 | lr 0.1800 | loss -5675.9150 | norm 6341.2563 +2025-06-25 08:06:33,272 | INFO | iter 000015 | lr 0.1800 | loss -8397.9707 | norm 6625.5651 +2025-06-25 08:06:38,517 | INFO | iter 000016 | lr 0.1800 | loss -12464.1982 | norm 6928.3923 +2025-06-25 08:06:43,692 | INFO | iter 000017 | lr 0.1800 | loss -19611.1348 | norm 7248.3939 +2025-06-25 08:06:48,881 | INFO | iter 000018 | lr 0.1800 | loss -27940.6465 | norm 7598.8381 +2025-06-25 08:06:54,254 | INFO | iter 000019 | lr 0.1800 | loss -37102.6367 | norm 7980.6707 +2025-06-25 08:06:59,396 | INFO | iter 000020 | lr 0.1800 | loss -41850.2695 | norm 8391.1617 +2025-06-25 08:07:04,889 | INFO | iter 000021 | lr 0.1800 | loss -55457.6641 | norm 8833.9090 +2025-06-25 08:07:10,211 | INFO | iter 000022 | lr 0.1800 | loss -77246.1016 | norm 9308.0797 +2025-06-25 08:07:15,407 | INFO | iter 000023 | lr 0.1800 | loss -88869.9531 | norm 9810.1623 +2025-06-25 08:07:20,721 | INFO | iter 000024 | lr 0.1800 | loss -106978.7188 | norm 10315.7090 +2025-06-25 08:07:26,169 | INFO | iter 000025 | lr 0.1800 | loss -128147.9297 | norm 10848.4611 +2025-06-25 08:07:31,697 | INFO | iter 000026 | lr 0.1800 | loss -167841.3906 | norm 11408.8324 +2025-06-25 08:07:37,001 | INFO | iter 000027 | lr 0.1800 | loss -180567.0000 | norm 11986.0506 +2025-06-25 08:07:42,165 | INFO | iter 000028 | lr 0.1800 | loss -195498.7031 | norm 12575.1052 +2025-06-25 08:07:47,397 | INFO | iter 000029 | lr 0.1800 | loss -226350.6406 | norm 13174.6499 +2025-06-25 08:07:52,643 | INFO | iter 000030 | lr 0.1800 | loss -278469.5625 | norm 13789.2878 +2025-06-25 08:07:57,699 | INFO | iter 000031 | lr 0.1800 | loss -291369.5000 | norm 14408.3851 +2025-06-25 08:08:03,348 | INFO | iter 000032 | lr 0.1800 | loss -340664.4062 | norm 15027.4904 +2025-06-25 08:08:08,536 | INFO | iter 000033 | lr 0.1800 | loss -381711.4062 | norm 15650.6969 +2025-06-25 08:08:13,710 | INFO | iter 000034 | lr 0.1800 | loss -447641.5625 | norm 16275.4851 +2025-06-25 08:08:18,817 | INFO | iter 000035 | lr 0.1800 | loss -478080.0938 | norm 16906.2037 +2025-06-25 08:08:23,855 | INFO | iter 000036 | lr 0.1800 | loss -474737.6875 | norm 17529.9377 +2025-06-25 08:08:29,137 | INFO | iter 000037 | lr 0.1800 | loss -562731.2500 | norm 18158.1498 +2025-06-25 08:08:34,554 | INFO | iter 000038 | lr 0.1800 | loss -671918.5000 | norm 18787.8108 +2025-06-25 08:08:39,659 | INFO | iter 000039 | lr 0.1800 | loss -611043.3750 | norm 19413.9402 +2025-06-25 08:08:44,757 | INFO | iter 000040 | lr 0.1800 | loss -695682.8750 | norm 20036.0407 +2025-06-25 08:08:49,914 | INFO | iter 000041 | lr 0.1800 | loss -780705.8750 | norm 20644.8346 +2025-06-25 08:08:55,158 | INFO | iter 000042 | lr 0.1800 | loss -995004.5625 | norm 21239.0209 +2025-06-25 08:09:00,499 | INFO | iter 000043 | lr 0.1800 | loss -818858.2500 | norm 21828.2696 +2025-06-25 08:09:05,819 | INFO | iter 000044 | lr 0.1800 | loss -880965.5000 | norm 22411.6198 +2025-06-25 08:09:11,010 | INFO | iter 000045 | lr 0.1800 | loss -1003927.8125 | norm 22994.3220 +2025-06-25 08:09:16,158 | INFO | iter 000046 | lr 0.1800 | loss -1131396.7500 | norm 23577.3465 +2025-06-25 08:09:21,395 | INFO | iter 000047 | lr 0.1800 | loss -1100723.7500 | norm 24149.2858 +2025-06-25 08:09:26,617 | INFO | iter 000048 | lr 0.1800 | loss -1096843.2500 | norm 24709.4051 +2025-06-25 08:09:32,274 | INFO | iter 000049 | lr 0.1800 | loss -1231046.2500 | norm 25267.4215 +2025-06-25 08:09:32,275 | INFO | Completed LR test 1/10: lr=0.1800 +2025-06-25 08:09:32,476 | INFO | Cleanup complete +2025-06-25 08:14:04,088 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt +2025-06-25 08:14:06,081 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:14:06,081 | INFO | Current learning rate: 0.0018 +2025-06-25 08:14:06,733 | INFO | Weight decay: 0.1 +2025-06-25 08:14:06,733 | INFO | Epsilon: 1e-08 +2025-06-25 08:14:06,733 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:14:06,733 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:14:06,733 | INFO | Optimizer state loading completed! +2025-06-25 08:14:08,702 | INFO | Initialized xs with norm: 1.273654 +2025-06-25 08:14:08,705 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:14:08,705 | INFO | Starting LR test 1/10: lr=0.1800 +2025-06-25 08:14:08,705 | INFO | Starting EoS for LR factor 100.0000 +2025-06-25 08:14:08,705 | INFO | Starting EoS for checkpoint 010000 +2025-06-25 08:14:08,706 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:14:08,706 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:14:08,706 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:14:08,706 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:14:08,706 | INFO | Loading model and checkpoint... +2025-06-25 08:14:09,604 | INFO | Wrapping model with DDP... +2025-06-25 08:14:09,826 | INFO | Loading state dict... +2025-06-25 08:14:09,830 | INFO | Model loaded successfully! +2025-06-25 08:14:16,113 | INFO | iter 000000 | lr 0.1800 | loss 3.7162 | norm 458.7718 +2025-06-25 08:14:21,223 | INFO | iter 000001 | lr 0.1800 | loss 207.3757 | norm 1023.7285 +2025-06-25 08:14:26,544 | INFO | iter 000002 | lr 0.1800 | loss 5262.5127 | norm 1344.5259 +2025-06-25 08:14:32,322 | INFO | iter 000003 | lr 0.1800 | loss 1121.6940 | norm 1825.7758 +2025-06-25 08:14:37,581 | INFO | iter 000004 | lr 0.1800 | loss 3489.6094 | norm 2256.3649 +2025-06-25 08:14:42,793 | INFO | iter 000005 | lr 0.1800 | loss 2611.4473 | norm 2665.4942 +2025-06-25 08:14:47,946 | INFO | iter 000006 | lr 0.1800 | loss 1170.9673 | norm 3086.3505 +2025-06-25 08:14:53,249 | INFO | iter 000007 | lr 0.1800 | loss 1968.7089 | norm 3471.1555 +2025-06-25 08:14:58,293 | INFO | iter 000008 | lr 0.1800 | loss 2037.9554 | norm 3819.5898 +2025-06-25 08:15:04,132 | INFO | iter 000009 | lr 0.1800 | loss 831.5104 | norm 4163.8926 +2025-06-25 08:15:09,325 | INFO | iter 000010 | lr 0.1800 | loss 281.9943 | norm 4504.7350 +2025-06-25 08:15:14,666 | INFO | iter 000011 | lr 0.1800 | loss -326.7262 | norm 4836.5242 +2025-06-25 08:15:19,988 | INFO | iter 000012 | lr 0.1800 | loss -1960.6803 | norm 5168.6060 +2025-06-25 08:15:25,055 | INFO | iter 000013 | lr 0.1800 | loss -4240.2627 | norm 5518.1032 +2025-06-25 08:15:30,756 | INFO | iter 000014 | lr 0.1800 | loss -7296.6255 | norm 5891.2474 +2025-06-25 08:15:35,838 | INFO | iter 000015 | lr 0.1800 | loss -10337.6426 | norm 6286.4372 +2025-06-25 08:15:40,922 | INFO | iter 000016 | lr 0.1800 | loss -15605.5254 | norm 6710.1341 +2025-06-25 08:15:46,100 | INFO | iter 000017 | lr 0.1800 | loss -25703.7539 | norm 7163.8796 +2025-06-25 08:15:51,309 | INFO | iter 000018 | lr 0.1800 | loss -37443.0664 | norm 7659.6704 +2025-06-25 08:15:56,583 | INFO | iter 000019 | lr 0.1800 | loss -50078.2383 | norm 8193.8946 +2025-06-25 08:16:01,998 | INFO | iter 000020 | lr 0.1800 | loss -57032.1875 | norm 8760.0695 +2025-06-25 08:16:07,120 | INFO | iter 000021 | lr 0.1800 | loss -76344.0781 | norm 9360.5588 +2025-06-25 08:16:12,341 | INFO | iter 000022 | lr 0.1800 | loss -106728.8047 | norm 9991.8501 +2025-06-25 08:16:17,385 | INFO | iter 000023 | lr 0.1800 | loss -122665.7422 | norm 10647.0357 +2025-06-25 08:16:22,444 | INFO | iter 000024 | lr 0.1800 | loss -146900.8906 | norm 11291.6713 +2025-06-25 08:16:27,677 | INFO | iter 000025 | lr 0.1800 | loss -175862.7344 | norm 11958.6899 +2025-06-25 08:16:33,138 | INFO | iter 000026 | lr 0.1800 | loss -229347.8750 | norm 12648.0304 +2025-06-25 08:16:38,360 | INFO | iter 000027 | lr 0.1800 | loss -245424.7344 | norm 13346.8304 +2025-06-25 08:16:43,584 | INFO | iter 000028 | lr 0.1800 | loss -263935.7500 | norm 14047.6949 +2025-06-25 08:16:48,605 | INFO | iter 000029 | lr 0.1800 | loss -304780.0625 | norm 14750.5359 +2025-06-25 08:16:53,925 | INFO | iter 000030 | lr 0.1800 | loss -372978.8438 | norm 15461.7537 +2025-06-25 08:16:59,207 | INFO | iter 000031 | lr 0.1800 | loss -386555.4062 | norm 16169.9843 +2025-06-25 08:17:04,766 | INFO | iter 000032 | lr 0.1800 | loss -447786.1250 | norm 16870.0891 +2025-06-25 08:17:09,788 | INFO | iter 000033 | lr 0.1800 | loss -501011.2500 | norm 17566.7465 +2025-06-25 08:17:14,895 | INFO | iter 000034 | lr 0.1800 | loss -584175.8750 | norm 18257.1731 +2025-06-25 08:17:20,270 | INFO | iter 000035 | lr 0.1800 | loss -620550.2500 | norm 18947.4654 +2025-06-25 08:17:25,581 | INFO | iter 000036 | lr 0.1800 | loss -611509.9375 | norm 19623.6972 +2025-06-25 08:17:31,267 | INFO | iter 000037 | lr 0.1800 | loss -720793.8750 | norm 20299.6148 +2025-06-25 08:17:36,391 | INFO | iter 000038 | lr 0.1800 | loss -856993.6875 | norm 20972.3499 +2025-06-25 08:17:41,749 | INFO | iter 000039 | lr 0.1800 | loss -774917.7500 | norm 21636.3731 +2025-06-25 08:17:46,809 | INFO | iter 000040 | lr 0.1800 | loss -875320.1250 | norm 22291.3078 +2025-06-25 08:17:51,936 | INFO | iter 000041 | lr 0.1800 | loss -978267.6250 | norm 22928.2164 +2025-06-25 08:17:57,182 | INFO | iter 000042 | lr 0.1800 | loss -1236919.7500 | norm 23545.3115 +2025-06-25 08:18:02,979 | INFO | iter 000043 | lr 0.1800 | loss -1016738.6875 | norm 24153.6447 +2025-06-25 08:18:08,099 | INFO | iter 000044 | lr 0.1800 | loss -1085286.3750 | norm 24751.7884 +2025-06-25 08:18:13,218 | INFO | iter 000045 | lr 0.1800 | loss -1233294.6250 | norm 25346.0331 +2025-06-25 08:18:18,370 | INFO | iter 000046 | lr 0.1800 | loss -1382542.3750 | norm 25937.9944 +2025-06-25 08:18:23,739 | INFO | iter 000047 | lr 0.1800 | loss -1339756.2500 | norm 26515.0226 +2025-06-25 08:18:29,372 | INFO | iter 000048 | lr 0.1800 | loss -1327051.5000 | norm 27076.3794 +2025-06-25 08:18:34,762 | INFO | iter 000049 | lr 0.1800 | loss -1484669.5000 | norm 27633.4660 +2025-06-25 08:18:34,762 | INFO | Completed LR test 1/10: lr=0.1800 +2025-06-25 08:18:34,933 | INFO | Cleanup complete +2025-06-25 08:19:09,783 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt +2025-06-25 08:19:11,705 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:19:11,705 | INFO | Current learning rate: 0.0018 +2025-06-25 08:19:12,319 | INFO | Weight decay: 0.1 +2025-06-25 08:19:12,319 | INFO | Epsilon: 1e-08 +2025-06-25 08:19:12,319 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:19:12,319 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:19:12,319 | INFO | Optimizer state loading completed! +2025-06-25 08:19:14,382 | INFO | Initialized xs with norm: 1.273542 +2025-06-25 08:19:14,386 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:19:14,386 | INFO | Starting LR test 1/10: lr=0.1800 +2025-06-25 08:19:14,387 | INFO | Starting EoS for LR factor 100.0000 +2025-06-25 08:19:14,387 | INFO | Starting EoS for checkpoint 010000 +2025-06-25 08:19:14,387 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:19:14,387 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:19:14,387 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:19:14,387 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:19:14,387 | INFO | Loading model and checkpoint... +2025-06-25 08:19:15,188 | INFO | Wrapping model with DDP... +2025-06-25 08:19:15,646 | INFO | Loading state dict... +2025-06-25 08:19:15,649 | INFO | Model loaded successfully! +2025-06-25 08:19:22,000 | INFO | iter 000000 | lr 0.1800 | loss 3.5472 | norm 471.7711 +2025-06-25 08:19:27,321 | INFO | iter 000001 | lr 0.1800 | loss 258.9568 | norm 1033.3546 +2025-06-25 08:19:33,176 | INFO | iter 000002 | lr 0.1800 | loss 6505.9888 | norm 1313.8736 +2025-06-25 08:19:38,376 | INFO | iter 000003 | lr 0.1800 | loss 720.6740 | norm 1808.7756 +2025-06-25 08:19:43,461 | INFO | iter 000004 | lr 0.1800 | loss 4549.4497 | norm 2236.3996 +2025-06-25 08:19:48,625 | INFO | iter 000005 | lr 0.1800 | loss 3147.6418 | norm 2648.9176 +2025-06-25 08:19:53,829 | INFO | iter 000006 | lr 0.1800 | loss 1210.4675 | norm 3081.6848 +2025-06-25 08:19:58,930 | INFO | iter 000007 | lr 0.1800 | loss 2729.4216 | norm 3473.4778 +2025-06-25 08:20:04,381 | INFO | iter 000008 | lr 0.1800 | loss 2761.1892 | norm 3830.6859 +2025-06-25 08:20:09,528 | INFO | iter 000009 | lr 0.1800 | loss 1629.8550 | norm 4172.5473 +2025-06-25 08:20:14,583 | INFO | iter 000010 | lr 0.1800 | loss 738.9063 | norm 4510.9757 +2025-06-25 08:20:19,734 | INFO | iter 000011 | lr 0.1800 | loss 429.7834 | norm 4842.2049 +2025-06-25 08:20:24,921 | INFO | iter 000012 | lr 0.1800 | loss -247.2831 | norm 5150.2921 +2025-06-25 08:20:30,290 | INFO | iter 000013 | lr 0.1800 | loss -2646.1260 | norm 5471.6201 +2025-06-25 08:20:35,302 | INFO | iter 000014 | lr 0.1800 | loss -5207.6968 | norm 5812.3796 +2025-06-25 08:20:40,540 | INFO | iter 000015 | lr 0.1800 | loss -6475.6606 | norm 6168.6810 +2025-06-25 08:20:45,761 | INFO | iter 000016 | lr 0.1800 | loss -11955.2441 | norm 6540.5293 +2025-06-25 08:20:50,785 | INFO | iter 000017 | lr 0.1800 | loss -18120.8965 | norm 6942.5453 +2025-06-25 08:20:56,027 | INFO | iter 000018 | lr 0.1800 | loss -27042.0312 | norm 7375.6815 +2025-06-25 08:21:01,764 | INFO | iter 000019 | lr 0.1800 | loss -32353.4766 | norm 7844.6077 +2025-06-25 08:21:06,819 | INFO | iter 000020 | lr 0.1800 | loss -42713.4258 | norm 8350.1531 +2025-06-25 08:21:11,942 | INFO | iter 000021 | lr 0.1800 | loss -63141.9570 | norm 8895.7420 +2025-06-25 08:21:17,296 | INFO | iter 000022 | lr 0.1800 | loss -79225.8125 | norm 9464.6960 +2025-06-25 08:21:22,364 | INFO | iter 000023 | lr 0.1800 | loss -85295.2188 | norm 10064.8914 +2025-06-25 08:21:27,555 | INFO | iter 000024 | lr 0.1800 | loss -104268.3984 | norm 10670.1012 +2025-06-25 08:21:33,391 | INFO | iter 000025 | lr 0.1800 | loss -138191.5625 | norm 11306.4591 +2025-06-25 08:21:38,449 | INFO | iter 000026 | lr 0.1800 | loss -130759.4922 | norm 11963.0958 +2025-06-25 08:21:43,851 | INFO | iter 000027 | lr 0.1800 | loss -151880.2188 | norm 12623.4203 +2025-06-25 08:21:48,943 | INFO | iter 000028 | lr 0.1800 | loss -199277.5312 | norm 13299.8124 +2025-06-25 08:21:54,133 | INFO | iter 000029 | lr 0.1800 | loss -231175.6094 | norm 13985.3231 +2025-06-25 08:21:59,601 | INFO | iter 000030 | lr 0.1800 | loss -227062.6875 | norm 14638.4871 +2025-06-25 08:22:04,755 | INFO | iter 000031 | lr 0.1800 | loss -305321.5312 | norm 15300.2854 +2025-06-25 08:22:09,996 | INFO | iter 000032 | lr 0.1800 | loss -312911.1562 | norm 15967.4907 +2025-06-25 08:22:15,180 | INFO | iter 000033 | lr 0.1800 | loss -394100.9688 | norm 16633.2168 +2025-06-25 08:22:20,480 | INFO | iter 000034 | lr 0.1800 | loss -396323.7812 | norm 17301.4404 +2025-06-25 08:22:25,744 | INFO | iter 000035 | lr 0.1800 | loss -495372.3750 | norm 17975.7867 +2025-06-25 08:22:31,676 | INFO | iter 000036 | lr 0.1800 | loss -536708.7500 | norm 18632.5354 +2025-06-25 08:22:36,888 | INFO | iter 000037 | lr 0.1800 | loss -565078.6250 | norm 19274.2470 +2025-06-25 08:22:42,081 | INFO | iter 000038 | lr 0.1800 | loss -567815.5000 | norm 19919.4392 +2025-06-25 08:22:47,333 | INFO | iter 000039 | lr 0.1800 | loss -618632.5625 | norm 20562.6429 +2025-06-25 08:22:52,496 | INFO | iter 000040 | lr 0.1800 | loss -746847.8750 | norm 21208.1400 +2025-06-25 08:22:57,533 | INFO | iter 000041 | lr 0.1800 | loss -694621.2500 | norm 21850.9988 +2025-06-25 08:23:03,478 | INFO | iter 000042 | lr 0.1800 | loss -805244.6250 | norm 22487.1557 +2025-06-25 08:23:08,470 | INFO | iter 000043 | lr 0.1800 | loss -911537.5000 | norm 23123.2202 +2025-06-25 08:23:13,595 | INFO | iter 000044 | lr 0.1800 | loss -935658.9375 | norm 23750.3923 +2025-06-25 08:23:18,852 | INFO | iter 000045 | lr 0.1800 | loss -877843.8125 | norm 24361.4878 +2025-06-25 08:23:23,961 | INFO | iter 000046 | lr 0.1800 | loss -1091743.1250 | norm 24972.6994 +2025-06-25 08:23:29,213 | INFO | iter 000047 | lr 0.1800 | loss -1240315.0000 | norm 25579.9303 +2025-06-25 08:23:34,577 | INFO | iter 000048 | lr 0.1800 | loss -1015323.0000 | norm 26170.5807 +2025-06-25 08:23:39,739 | INFO | iter 000049 | lr 0.1800 | loss -1083682.8750 | norm 26751.1997 +2025-06-25 08:23:39,739 | INFO | Completed LR test 1/10: lr=0.1800 +2025-06-25 08:23:40,094 | INFO | Cleanup complete +2025-06-25 08:25:41,547 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt +2025-06-25 08:25:43,132 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:25:43,133 | INFO | Current learning rate: 0.0018 +2025-06-25 08:25:43,761 | INFO | Weight decay: 0.1 +2025-06-25 08:25:43,761 | INFO | Epsilon: 1e-08 +2025-06-25 08:25:43,761 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:25:43,761 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:25:43,761 | INFO | Optimizer state loading completed! +2025-06-25 08:25:45,718 | INFO | Initialized xs with norm: 1.273535 +2025-06-25 08:25:45,726 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:25:45,726 | INFO | Starting LR test 1/10: lr=0.1800 +2025-06-25 08:25:45,726 | INFO | Starting EoS for LR factor 100.0000 +2025-06-25 08:25:45,726 | INFO | Starting EoS for checkpoint 010000 +2025-06-25 08:25:45,726 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:25:45,726 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:25:45,726 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:25:45,726 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:25:45,726 | INFO | Loading model and checkpoint... +2025-06-25 08:25:46,600 | INFO | Wrapping model with DDP... +2025-06-25 08:25:46,828 | INFO | Loading state dict... +2025-06-25 08:25:46,831 | INFO | Model loaded successfully! +2025-06-25 08:25:54,406 | INFO | iter 000000 | lr 0.1800 | loss 0.0000 | norm 471.7869 +2025-06-25 08:25:59,712 | INFO | iter 000001 | lr 0.1800 | loss 255.7516 | norm 1033.5834 +2025-06-25 08:26:04,882 | INFO | iter 000002 | lr 0.1800 | loss 6477.0049 | norm 1315.8706 +2025-06-25 08:26:10,059 | INFO | iter 000003 | lr 0.1800 | loss 717.1234 | norm 1811.0524 +2025-06-25 08:26:15,184 | INFO | iter 000004 | lr 0.1800 | loss 4544.1074 | norm 2238.2034 +2025-06-25 08:26:20,441 | INFO | iter 000005 | lr 0.1800 | loss 3125.7515 | norm 2650.5604 +2025-06-25 08:26:25,413 | INFO | iter 000006 | lr 0.1800 | loss 1199.5897 | norm 3083.3089 +2025-06-25 08:26:30,945 | INFO | iter 000007 | lr 0.1800 | loss 2725.4370 | norm 3475.1633 +2025-06-25 08:26:36,160 | INFO | iter 000008 | lr 0.1800 | loss 2744.2969 | norm 3832.5715 +2025-06-25 08:26:41,444 | INFO | iter 000009 | lr 0.1800 | loss 1625.9283 | norm 4174.4268 +2025-06-25 08:26:46,671 | INFO | iter 000010 | lr 0.1800 | loss 732.4396 | norm 4512.6526 +2025-06-25 08:26:51,717 | INFO | iter 000011 | lr 0.1800 | loss 401.3749 | norm 4843.9269 +2025-06-25 08:26:56,797 | INFO | iter 000012 | lr 0.1800 | loss -414.9888 | norm 5153.5810 +2025-06-25 08:27:02,557 | INFO | iter 000013 | lr 0.1800 | loss -2720.9531 | norm 5476.3233 +2025-06-25 08:27:07,729 | INFO | iter 000014 | lr 0.1800 | loss -5391.8271 | norm 5818.1825 +2025-06-25 08:27:12,870 | INFO | iter 000015 | lr 0.1800 | loss -6670.7456 | norm 6176.3078 +2025-06-25 08:27:18,153 | INFO | iter 000016 | lr 0.1800 | loss -12359.0928 | norm 6550.6398 +2025-06-25 08:27:23,424 | INFO | iter 000017 | lr 0.1800 | loss -18700.9395 | norm 6955.5065 +2025-06-25 08:27:28,958 | INFO | iter 000018 | lr 0.1800 | loss -28184.4316 | norm 7388.6123 +2025-06-25 08:27:34,339 | INFO | iter 000019 | lr 0.1800 | loss -33184.2344 | norm 7858.2671 +2025-06-25 08:27:39,610 | INFO | iter 000020 | lr 0.1800 | loss -43730.0781 | norm 8364.9345 +2025-06-25 08:27:44,683 | INFO | iter 000021 | lr 0.1800 | loss -64642.3008 | norm 8911.7296 +2025-06-25 08:27:49,940 | INFO | iter 000022 | lr 0.1800 | loss -81204.4609 | norm 9481.5446 +2025-06-25 08:27:55,204 | INFO | iter 000023 | lr 0.1800 | loss -86869.9766 | norm 10082.6514 +2025-06-25 08:28:00,953 | INFO | iter 000024 | lr 0.1800 | loss -105953.4062 | norm 10687.3279 +2025-06-25 08:28:06,307 | INFO | iter 000025 | lr 0.1800 | loss -140775.6562 | norm 11322.9195 +2025-06-25 08:28:11,582 | INFO | iter 000026 | lr 0.1800 | loss -132647.5938 | norm 11978.4767 +2025-06-25 08:28:16,754 | INFO | iter 000027 | lr 0.1800 | loss -153962.5781 | norm 12637.7207 +2025-06-25 08:28:21,813 | INFO | iter 000028 | lr 0.1800 | loss -202029.5312 | norm 13312.6754 +2025-06-25 08:28:27,054 | INFO | iter 000029 | lr 0.1800 | loss -234245.6406 | norm 13996.1748 +2025-06-25 08:28:33,097 | INFO | iter 000030 | lr 0.1800 | loss -229900.0625 | norm 14646.0295 +2025-06-25 08:28:38,339 | INFO | iter 000031 | lr 0.1800 | loss -308670.3125 | norm 15304.3426 +2025-06-25 08:28:43,514 | INFO | iter 000032 | lr 0.1800 | loss -316647.1562 | norm 15968.0695 +2025-06-25 08:28:48,671 | INFO | iter 000033 | lr 0.1800 | loss -398862.5625 | norm 16630.9157 +2025-06-25 08:28:53,992 | INFO | iter 000034 | lr 0.1800 | loss -400159.2812 | norm 17296.2316 +2025-06-25 08:28:59,229 | INFO | iter 000035 | lr 0.1800 | loss -500405.0000 | norm 17967.7855 +2025-06-25 08:29:04,924 | INFO | iter 000036 | lr 0.1800 | loss -542243.6875 | norm 18618.2132 +2025-06-25 08:29:10,021 | INFO | iter 000037 | lr 0.1800 | loss -565329.5000 | norm 19245.1076 +2025-06-25 08:29:24,306 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt +2025-06-25 08:29:26,082 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:29:26,082 | INFO | Current learning rate: 0.0018 +2025-06-25 08:29:26,684 | INFO | Weight decay: 0.1 +2025-06-25 08:29:26,684 | INFO | Epsilon: 1e-08 +2025-06-25 08:29:26,684 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:29:26,684 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:29:26,684 | INFO | Optimizer state loading completed! +2025-06-25 08:29:28,983 | INFO | Initialized xs with norm: 1.273466 +2025-06-25 08:29:28,995 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:29:28,995 | INFO | Starting LR test 1/10: lr=0.1800 +2025-06-25 08:29:28,995 | INFO | Starting EoS for LR factor 100.0000 +2025-06-25 08:29:28,995 | INFO | Starting EoS for checkpoint 010000 +2025-06-25 08:29:28,996 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:29:28,996 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:29:28,996 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:29:28,996 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:29:28,996 | INFO | Loading model and checkpoint... +2025-06-25 08:29:29,754 | INFO | Wrapping model with DDP... +2025-06-25 08:29:30,234 | INFO | Loading state dict... +2025-06-25 08:29:30,237 | INFO | Model loaded successfully! +2025-06-25 08:29:37,767 | INFO | iter 000000 | lr 0.1800 | loss 0.0000 | norm 471.7844 +2025-06-25 08:29:42,698 | INFO | iter 000001 | lr 0.1800 | loss 256.3545 | norm 1033.6785 +2025-06-25 08:29:47,593 | INFO | iter 000002 | lr 0.1800 | loss 6480.4614 | norm 1317.1781 +2025-06-25 08:29:52,655 | INFO | iter 000003 | lr 0.1800 | loss 707.6561 | norm 1814.0126 +2025-06-25 08:29:57,762 | INFO | iter 000004 | lr 0.1800 | loss 4542.7993 | norm 2242.5040 +2025-06-25 08:30:03,258 | INFO | iter 000005 | lr 0.1800 | loss 3116.1213 | norm 2655.8104 +2025-06-25 08:30:08,366 | INFO | iter 000006 | lr 0.1800 | loss 1191.1149 | norm 3089.5643 +2025-06-25 08:30:13,829 | INFO | iter 000007 | lr 0.1800 | loss 2736.9172 | norm 3481.7995 +2025-06-25 08:30:19,004 | INFO | iter 000008 | lr 0.1800 | loss 2748.8972 | norm 3839.4604 +2025-06-25 08:30:24,084 | INFO | iter 000009 | lr 0.1800 | loss 1633.2867 | norm 4181.4834 +2025-06-25 08:30:29,379 | INFO | iter 000010 | lr 0.1800 | loss 741.7749 | norm 4519.9052 +2025-06-25 08:30:34,505 | INFO | iter 000011 | lr 0.1800 | loss 424.5821 | norm 4851.2794 +2025-06-25 08:30:39,626 | INFO | iter 000012 | lr 0.1800 | loss -395.4122 | norm 5162.5043 +2025-06-25 08:30:44,815 | INFO | iter 000013 | lr 0.1800 | loss -2702.6868 | norm 5485.5912 +2025-06-25 08:30:50,029 | INFO | iter 000014 | lr 0.1800 | loss -5335.0098 | norm 5827.3394 +2025-06-25 08:30:55,330 | INFO | iter 000015 | lr 0.1800 | loss -6652.4419 | norm 6184.9165 +2025-06-25 08:31:00,446 | INFO | iter 000016 | lr 0.1800 | loss -12321.6143 | norm 6558.2939 +2025-06-25 08:31:06,042 | INFO | iter 000017 | lr 0.1800 | loss -18648.2637 | norm 6962.6391 +2025-06-25 08:31:11,354 | INFO | iter 000018 | lr 0.1800 | loss -27954.8457 | norm 7397.7840 +2025-06-25 08:31:16,490 | INFO | iter 000019 | lr 0.1800 | loss -33195.2930 | norm 7869.4220 +2025-06-25 08:31:21,610 | INFO | iter 000020 | lr 0.1800 | loss -43725.9531 | norm 8378.2009 +2025-06-25 08:31:35,131 | INFO | Loading checkpoint from: logs/fineweb-10B/gpt2/eos/adam/linear/linear-256-2048-15000-0.1/lr_0.0018-wd_0.1-betas_0.9_0.95-bs_32-seq_1024-iters_15000/250622_035242/checkpoints/step_010000.pt +2025-06-25 08:31:37,075 | INFO | Loaded checkpoint with optimizer: adam +2025-06-25 08:31:37,075 | INFO | Current learning rate: 0.0018 +2025-06-25 08:31:37,696 | INFO | Weight decay: 0.1 +2025-06-25 08:31:37,696 | INFO | Epsilon: 1e-08 +2025-06-25 08:31:37,696 | INFO | Loaded 147 first moment (m) buffers +2025-06-25 08:31:37,696 | INFO | Loaded 147 second moment (v) buffers +2025-06-25 08:31:37,696 | INFO | Optimizer state loading completed! +2025-06-25 08:31:39,726 | INFO | Initialized xs with norm: 1.273655 +2025-06-25 08:31:39,743 | INFO | -------------------------------- EoS -------------------------------- +2025-06-25 08:31:39,743 | INFO | Starting LR test 1/10: lr=18.0000 +2025-06-25 08:31:39,743 | INFO | Starting EoS for LR factor 10000.0000 +2025-06-25 08:31:39,743 | INFO | Starting EoS for checkpoint 010000 +2025-06-25 08:31:39,743 | INFO | Starting EoS for model gpt2_small +2025-06-25 08:31:39,743 | INFO | Starting EoS for input_bin data/fineweb/fineweb10B/fineweb_train_*.bin +2025-06-25 08:31:39,743 | INFO | Starting EoS for num_iterations 50 +2025-06-25 08:31:39,743 | INFO | Starting EoS for accum_steps 4 +2025-06-25 08:31:39,743 | INFO | Loading model and checkpoint... +2025-06-25 08:31:40,508 | INFO | Wrapping model with DDP... +2025-06-25 08:31:40,770 | INFO | Loading state dict... +2025-06-25 08:31:40,773 | INFO | Model loaded successfully! +2025-06-25 08:31:48,938 | INFO | iter 000000 | lr 18.0000 | loss 0.0000 | norm 47178.7107 +2025-06-25 08:31:54,180 | INFO | iter 000001 | lr 18.0000 | loss 2579423.0000 | norm 105552.8516 +2025-06-25 08:31:59,433 | INFO | iter 000002 | lr 18.0000 | loss 111763472.0000 | norm 135456.5704 +2025-06-25 08:32:04,746 | INFO | iter 000003 | lr 18.0000 | loss 276864480.0000 | norm 133713.0656 +2025-06-25 08:32:09,937 | INFO | iter 000004 | lr 18.0000 | loss 320959104.0000 | norm 132714.9397 +2025-06-25 08:32:15,243 | INFO | iter 000005 | lr 18.0000 | loss 336375200.0000 | norm 124895.1838 +2025-06-25 08:32:20,604 | INFO | iter 000006 | lr 18.0000 | loss 306018016.0000 | norm 117796.8739 +2025-06-25 08:32:25,779 | INFO | iter 000007 | lr 18.0000 | loss 267033840.0000 | norm 108461.6875 +2025-06-25 08:32:31,373 | INFO | iter 000008 | lr 18.0000 | loss 243656960.0000 | norm 102025.2348 +2025-06-25 08:32:36,467 | INFO | iter 000009 | lr 18.0000 | loss 221454464.0000 | norm 93885.3865 +2025-06-25 08:32:41,725 | INFO | iter 000010 | lr 18.0000 | loss 185239232.0000 | norm 87718.0808 +2025-06-25 08:32:47,036 | INFO | iter 000011 | lr 18.0000 | loss 154752192.0000 | norm 80196.6753 +2025-06-25 08:32:52,225 | INFO | iter 000012 | lr 18.0000 | loss 132501480.0000 | norm 77225.2747 +2025-06-25 08:32:57,474 | INFO | iter 000013 | lr 18.0000 | loss 119106048.0000 | norm 68886.8834 +2025-06-25 08:33:03,295 | INFO | iter 000014 | lr 18.0000 | loss 97062160.0000 | norm 65299.5486 +2025-06-25 08:33:08,516 | INFO | iter 000015 | lr 18.0000 | loss 79378960.0000 | norm 59360.0908 +2025-06-25 08:33:13,789 | INFO | iter 000016 | lr 18.0000 | loss 68932920.0000 | norm 56258.6832 +2025-06-25 08:33:19,001 | INFO | iter 000017 | lr 18.0000 | loss 61816232.0000 | norm 51914.3800 +2025-06-25 08:33:24,459 | INFO | iter 000018 | lr 18.0000 | loss 52157472.0000 | norm 49234.2090 +2025-06-25 08:33:30,191 | INFO | iter 000019 | lr 18.0000 | loss 42893376.0000 | norm 45645.4134 +2025-06-25 08:33:35,359 | INFO | iter 000020 | lr 18.0000 | loss 39586664.0000 | norm 43603.7019 +2025-06-25 08:33:40,646 | INFO | iter 000021 | lr 18.0000 | loss 35820804.0000 | norm 40888.9978 +2025-06-25 08:33:45,894 | INFO | iter 000022 | lr 18.0000 | loss 30331428.0000 | norm 39151.8028 +2025-06-25 08:33:51,024 | INFO | iter 000023 | lr 18.0000 | loss 26849960.0000 | norm 37251.1340 +2025-06-25 08:33:56,173 | INFO | iter 000024 | lr 18.0000 | loss 25112784.0000 | norm 36166.3473 +2025-06-25 08:34:01,635 | INFO | iter 000025 | lr 18.0000 | loss 23161016.0000 | norm 34488.3613 +2025-06-25 08:34:06,878 | INFO | iter 000026 | lr 18.0000 | loss 21752610.0000 | norm 33774.8960 +2025-06-25 08:34:12,170 | INFO | iter 000027 | lr 18.0000 | loss 20102200.0000 | norm 32634.3167 +2025-06-25 08:34:17,332 | INFO | iter 000028 | lr 18.0000 | loss 20159082.0000 | norm 32413.1788 +2025-06-25 08:34:22,621 | INFO | iter 000029 | lr 18.0000 | loss 20947662.0000 | norm 32105.9349 +2025-06-25 08:34:27,921 | INFO | iter 000030 | lr 18.0000 | loss 19995154.0000 | norm 33069.7203 +2025-06-25 08:34:33,805 | INFO | iter 000031 | lr 18.0000 | loss 20508292.0000 | norm 32105.0565 +2025-06-25 08:34:39,056 | INFO | iter 000032 | lr 18.0000 | loss 21091504.0000 | norm 32818.1221 +2025-06-25 08:34:44,173 | INFO | iter 000033 | lr 18.0000 | loss 22435536.0000 | norm 32885.5579 +2025-06-25 08:34:49,412 | INFO | iter 000034 | lr 18.0000 | loss 22536620.0000 | norm 33441.7257 +2025-06-25 08:34:54,548 | INFO | iter 000035 | lr 18.0000 | loss 23596192.0000 | norm 33815.4563 +2025-06-25 08:34:59,711 | INFO | iter 000036 | lr 18.0000 | loss 25515312.0000 | norm 34967.7110 +2025-06-25 08:35:04,972 | INFO | iter 000037 | lr 18.0000 | loss 27476122.0000 | norm 35625.2597 +2025-06-25 08:35:10,098 | INFO | iter 000038 | lr 18.0000 | loss 29265056.0000 | norm 36547.8041 +2025-06-25 08:35:15,160 | INFO | iter 000039 | lr 18.0000 | loss 31501882.0000 | norm 37360.1106 +2025-06-25 08:35:20,403 | INFO | iter 000040 | lr 18.0000 | loss 33444498.0000 | norm 38455.2587 +2025-06-25 08:35:25,428 | INFO | iter 000041 | lr 18.0000 | loss 35255556.0000 | norm 39234.6789 +2025-06-25 08:35:31,052 | INFO | iter 000042 | lr 18.0000 | loss 36108252.0000 | norm 40022.0439 +2025-06-25 08:35:36,298 | INFO | iter 000043 | lr 18.0000 | loss 40793144.0000 | norm 41435.8015 +2025-06-25 08:35:41,428 | INFO | iter 000044 | lr 18.0000 | loss 40548568.0000 | norm 41333.9848 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_000000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_000000.pt new file mode 100644 index 0000000000000000000000000000000000000000..cba23225c7fb0926ae7aca24b4ec79d837a5f544 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_000000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12623cbe522c1e7a4e735fc1067e87afff8187ac246afb4dba2caad567fffb0e +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_002000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_002000.pt new file mode 100644 index 0000000000000000000000000000000000000000..12d794f4ae85271d9881d7dc94ba0d7c10f198ee --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_002000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c656090c88cf1d156a4a323f0c8f4eb5d418043b6aa18ff6745a0a58a6d000 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_003000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_003000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8f8f6a35c4cbae2b0d0f2fc9e2128d380602a0f --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_003000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63646361ad0ece816c247161cf7978270f56d7739e29ab5af1169d0bef798d3e +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_004000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_004000.pt new file mode 100644 index 0000000000000000000000000000000000000000..49577479e5f4c94f6e277ac1a67ecb2fa1caf125 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_004000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a51b112a9e09695144c5d1072b1e368431f3d522e7db2c8e95d4e05d5a3fee7f +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_005000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_005000.pt new file mode 100644 index 0000000000000000000000000000000000000000..56ea456851319975a1a3cd64618022daf02d09d1 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_005000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fefc324e20a91a5de78a3313f19a003cf2f274abe56f6d08944c319fc715520 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_007000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_007000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e674c758b95db5bbf62001799049e92a8b58934d --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_007000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d180f12608316931efe85f49ff95f7f6fa91364bc6672244bd094a069646f7ef +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_008000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_008000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d118641d181b2f8994d193de6384c88e64bb0ea9 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_008000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25017e80be486ae3b5dbbb781de85c121e8d2d88f908581c97dfe988b3730084 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_009000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_009000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6199b07f63dbe78e6fd462be52af155b6af0249 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_009000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2543f607872dbdd3c04424f6d6d02b415b80cfd19f43b8f134b1cc8496c70cd3 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_010000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_010000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2d810b510413ade88a31f63808b8a1727721da6 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_010000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ddbe1ada352f48db260bc8c6d74f792303e93eb1160184ae995f2335b7c88e +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_011000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_011000.pt new file mode 100644 index 0000000000000000000000000000000000000000..48d68af60d5b1992d9d594e0b170c939926ab60a --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_011000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d4eaf93dc3d53ab7442bafd01c362dc03e2eee7fe304baa0918f3543a2415d9 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_012000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_012000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e175accd1b8ad1c493ab75f4d3cac5f8a200f114 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_012000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca6cfdb6bde4442e646552ed717969483e3e24394674bd3d6cc8b1b67529c1aa +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_013000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_013000.pt new file mode 100644 index 0000000000000000000000000000000000000000..853b97405ba6cdc5df651454e695977f3500d493 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_013000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:981221565146678811b2a7f34a61f973fb9c0f92583119d993bfdfea2d05c0dd +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_014000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_014000.pt new file mode 100644 index 0000000000000000000000000000000000000000..efd948af895c6a55e0b52dbd2676145e343f6d6e --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_014000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f4ef37fffcb4fe27f7bed035fadcfdb90af73c77676d69e20d0a1fa1e1efe85 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_015000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_015000.pt new file mode 100644 index 0000000000000000000000000000000000000000..216f18ba1a424143e640149378f8f393132cd29c --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_015000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13445b8a5bc5476cc9805b3ef65eb2b4237fcbe39361704c64a775661f726252 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_016000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_016000.pt new file mode 100644 index 0000000000000000000000000000000000000000..406195c1fa93a6413a27d5f6c580da129c7e88fb --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_016000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dbbcc9daba97cbcdabde080086f9b41f275b4342e0b4892bca7adfcd4f573ae +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_017000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_017000.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d274c5176a9f74c2aee909707a69d662f36444e --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_017000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bee60122d717616ce4b40fd27948e3db7503784caa6b6f579ded5a34fce8cb7 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_019000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_019000.pt new file mode 100644 index 0000000000000000000000000000000000000000..492ccd827d093c56df53df60c2f1742f38f3fc44 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_019000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2efad723a74e72ca5b5e4d8792afd4c8b3ef2567e075350dd3cd5f44f2b66121 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_020000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_020000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e607769043b16ceaa69bfeb36d80c224a2b7a84 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_020000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb0c7371eb7b09cc13faa342fee7fd484dde2516baa90f9539695420d16738b +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_25001.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_25001.pt new file mode 100644 index 0000000000000000000000000000000000000000..6da53354a0d35586c2f87aa64d547b0b9e3b4937 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250730_000401/checkpoints/step_25001.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ede13b57abfda1ebab3d0bb1091e5df05106498fe4a993c0b607bddd954657 +size 988816320 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_000000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_000000.pt new file mode 100644 index 0000000000000000000000000000000000000000..06d81db1ea4b62beffa433f4ee7764d62027f254 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_000000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59d09284853ab69674c8e10e863ce85717ddc0ea15f1a80fa0122ec48913521e +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_002000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_002000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1480a1ea9ee070ddda1dfa90ee2c0894064e1afa --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_002000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:985bf2b31f814aa1642190a5258c516511c660d45371fc995ac4944a453fc4ec +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_003000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_003000.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b2dd7a4cc28ba7aaa9acf901ad4f1cb3c21aa5e --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_003000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf9c1bf4d9b3a9e1c570be899f9141bb934ecf325e0708496f455a7518c9ff06 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_006000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_006000.pt new file mode 100644 index 0000000000000000000000000000000000000000..db1401b436e16057f31f06831474896efeef8101 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_006000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adde10dc5829f7236b67eb63ab8f0ed6d58ecfb0fb17deee3cdadf01949879d3 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_007000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_007000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bf18015db0bff3ef1285d9528e1de515179d5d3 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_007000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dffb6249ddca4645632f62a4f38ef97fdb00f775f1b41568d0d35a644e8c99a3 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_008000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_008000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6197fe357779415ea9206212c8e3757b4790bba --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_008000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b28451924e18d20c8691bdda04c00516426541675534bb3fd1fc036d0d7940 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_009000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_009000.pt new file mode 100644 index 0000000000000000000000000000000000000000..49d0fda01bcddc046c584d82e6cf12014da5b123 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_009000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b90a3953ceb60cd88cd4519a01e4d76c4dbab205ecb7c54dd084ed0f3ef72c79 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_010000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_010000.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffea366a0cb35edb918425d0fe87925c44679113 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_010000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd19034279aa37701ee63a9ef80d1c6bcd4e040dfc2b93d9f02050cc704cd96f +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_011000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_011000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a918d63633d4927ab2c5578ee5124df8c9543166 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_011000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78860a70fe8718833963ca3879abc0d0b0645edabc7832de26df64a922f6b1d +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_012000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_012000.pt new file mode 100644 index 0000000000000000000000000000000000000000..485ee966847f8d6be0b5e9b4ce17ffce1e4b1869 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_012000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e534e382e6d665965f8fbb8b7435bb52eaaba18e95a6df95cae4a5aa4bf56a3 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_013000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_013000.pt new file mode 100644 index 0000000000000000000000000000000000000000..404144b8e82b04909bc6dfade848bc1e0ad4d260 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_013000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d6da5d86436c34b53dba6c5726c8a79af0780ded43fba94632b58c757f8c8a +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_014000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_014000.pt new file mode 100644 index 0000000000000000000000000000000000000000..80cac16c1dfa5efa681d606b50fc8ccdba346c4f --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_014000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cae4747e1799381c2f78e537227b6cd6d6697d5edff67c7bba009e264a72862 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_017000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_017000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9decdf8da1747676dbd39b28f1f2156caac1d70 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_017000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4a9726adc3b2200ad355329bb2b6679ce985cc6c991c13c30306640e1a85ed4 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_018000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_018000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a12b90e9601ff614c745fe9bc668537a5dfc0c13 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_018000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00540147eb98191f8dffd814d8eb63792d424ad88c483042001eaa9c874fd4e7 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_019000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_019000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d08bb6095d0e51f55d4c40ddcb83c530a9f14877 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_019000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9e1bcaebca99bb0ee1922de798796fb81a5ab828d1b00d0b207e88ade1a7bf6 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_021000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_021000.pt new file mode 100644 index 0000000000000000000000000000000000000000..810bae2b1488ed329ee095587863c7e6484441be --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_021000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6bc050edc931fbfd4ce8f80eecbc231c7a033bb969edcc0e212079e05d20364 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_025000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_025000.pt new file mode 100644 index 0000000000000000000000000000000000000000..fabec0617b21fb0ce0e498aa7a8b30f9ca40fbfe --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_025000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26fe9525d2e96a47323258521b9ffeedb75edd852556ee21d8c07f539762526c +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_25001.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_25001.pt new file mode 100644 index 0000000000000000000000000000000000000000..e80a9e49b0fdb2ebcf7ba980ee0c69c5272a160a --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/checkpoints/step_25001.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae1e6b07a78a253d42057954e3a6ff7e49269ec89cea0655ea52d6db20a2072 +size 981544960 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis.png b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis.png new file mode 100644 index 0000000000000000000000000000000000000000..df9ab2b1692a18039ef5d4a794f99a9ee71364a3 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_003000/combined_eos_analysis.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9472ad439de5f7a59aef4c8e8999be56de3cea7a434feaac02aa2ac1410d4a01 +size 661369 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis.png b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis.png new file mode 100644 index 0000000000000000000000000000000000000000..288355dcdcca1652cb7e1bce12d3d5ce9824fec3 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a71c69ec94fdeffddfe7a2aa7c0b9c7661998e0d8928c7a9a4a1d60cee569e53 +size 455107 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis_ori.png b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis_ori.png new file mode 100644 index 0000000000000000000000000000000000000000..a745ca312733286bae4903d2b6deb5669a36ae77 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.01-wd_0.0001-m_0.98-bs_32-seq_1024-iters_25000/250730_022725/eos/step_025000/combined_eos_analysis_ori.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db0a09ca7d635c4786bfdce8b48a7a72cc7e375252f30392d12d55d95376818 +size 751646 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_000500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_000500.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f8be9cdbe17e81965eb748b92283ff28b43447d --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_000500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c289460d45ec0e45a81fad9fd3f33a7150773339b20349e35a0498e74c247a6 +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_001500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_001500.pt new file mode 100644 index 0000000000000000000000000000000000000000..296f1c191ede511b242432752aad3bcac8b2f9a6 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_001500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d12b8dff9dec2c30357851fd43673101fbb53cd5f6240a72590eb9634515185 +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_002000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_002000.pt new file mode 100644 index 0000000000000000000000000000000000000000..40e797bddfaba760ef29256691cbbfd3f9cf1ff6 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_002000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd873b951355944637ab7e82795846733b6947e252453f1baeb374e8fa3fea75 +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_002500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_002500.pt new file mode 100644 index 0000000000000000000000000000000000000000..1322eccd2f4703f382f9d0a967865cc9ce524a3d --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_002500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40a8a989dc51ccd600d480b86d7e3d9caf84071bc9d9d72b28a4c908536e8c66 +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_003000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_003000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fdbd901bcf7d7fd47e52dc2d4b837da61da383c --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_003000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14d3cc1a6ead75fa5bf13605d400e030c3e877f4b3324684c6631909973045bd +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_004500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_004500.pt new file mode 100644 index 0000000000000000000000000000000000000000..3da2528921b14db1ac166b27bf13bbdaeaedb714 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_004500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb013bca5d10f528f93c6e5897eaec23709dc26d74feb61ce1c9c240200f01e +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_005000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_005000.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ae80cdd27ea5a0523df2df477711a67b5ca9f9a --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_005000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a367c3f4b512bd45b79da11a47ee4d9da8681c1e281e5c00faf00f73b2d449d +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_006000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_006000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8a8758b3b8a74a02c1c700d57f4bf3b050919a3 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_006000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04fbf9f1fac7ba062253d26bf0f68ecb596f0c87c11c9ff48fc81ddc7f63b021 +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_006500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_006500.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c634242ad8513474c39acb541c5474b08311bcc --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_006500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8757a02bf965700bb3f54ca4136b6e0311534f7c62c6c0a86c1ebb010bcc9b4 +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_008000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_008000.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c8538a418b35ed042614edc914cd5b2304f5f4a --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_008000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e506ee24fa65e26959dd2747a1b9eec276a206d88914fa6ae9c717e277c4afed +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_008500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_008500.pt new file mode 100644 index 0000000000000000000000000000000000000000..b66d145afd8d69ae7c4c7b06ea521f015ca856e6 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_008500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351b1fc8964ecc180359151ecbfa5627af51cc02e206a7392f684dd48ab11caf +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_009000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_009000.pt new file mode 100644 index 0000000000000000000000000000000000000000..822b637f179969adc559b881698c961ee7ecb4ef --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_009000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:368acf447821183c2325228df3a33912e43b431f4a1b9020cfb4bc94c2d58b67 +size 648821973 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_9537.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_9537.pt new file mode 100644 index 0000000000000000000000000000000000000000..287dc9d46ed9bbe113aa8475454bcd8cdb48d29b --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/checkpoints/step_9537.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96efd45a80d36bbeff427954b5cd68b828579a67de5f317b2b9738a2ca4d54a2 +size 648821643 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.png b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..5f959201a4be612a831299cc6f3b8e5f5742201e --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dafe95d627f30563ac4ea54866a5f584560d1c975fb4b5f09b7d61bbf85a765 +size 114478 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..86a0c33d772948fa4be67e9a3c488178c9f0aa15 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66b7d3983e7d5d1278455cc3087b87d32c8c414b0283394351c88b41e4ccbc65 +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b33a8007cb91c3b0954c420c819b05ed2365383 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_001000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53115f8eae29600244581dffd96ccc30d6bb66d2aa2c47bca562f7f32c46321a +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.png b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.png new file mode 100644 index 0000000000000000000000000000000000000000..3ea1fd9432aec646d21217e416686057d450fbf9 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af15e22d941fe5560bfce9206cbcc82370467f973b623089d81a313d4824c21e +size 100869 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..225f917b2fdd0c7e1b86883f3598a614f5d163f0 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd87cb333ccc0c5b37e6aeab12c7c63163c04d6c3ae4201c1ffdfad2e1b5c33 +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce4d48f8ea38cf6eef210089cf0b43fc55362972 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_003500/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857c5e4babdbe0ec737bae3c45e454ab327cd747b24ad1be3001458016fffe47 +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_007500/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_007500/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a9dda7dff12d6bf77a2e20cb544eabc0ebf050b --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_007500/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea014cacce6bcfff88126b8d55cbdb47951c5b587a09c11bc4c9502f86d578fa +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_007500/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_007500/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..61e1aba56b7c2efbcd02261761d268b2529b1bb6 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_007500/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0155ba6da817f5b09ad9b3f8ecfad4b574cfb32fa2aedd62763315a501fe541 +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_009000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_009000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..2378a13af66ee51c5cd5426671e51d29b2f663cd --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_009000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa4c3e76a0f4372fb8458d149f41dd771bf4c40d561859f3d40fb44611fcd43b +size 1911 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_009000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_009000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6b226513065ecb9d7c6000ea175245ac7753aa7 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250622_011417/eos/step_009000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0bf39de11638dbfd1c3509c6d80ad1e9738030085b01ade4cfa1900ec7bc821 +size 1904 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250728_063551/checkpoints/step_000000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250728_063551/checkpoints/step_000000.pt new file mode 100644 index 0000000000000000000000000000000000000000..80ff35fd0fc811b0b4f70c2a05cb563b07bda275 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.0-bs_32-seq_1024-iters_9536/250728_063551/checkpoints/step_000000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:722a4ac4609f01f43d9f711a20042290a2466bf5f8fd0e192d8c0c26543f7f7a +size 648830293 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_009000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_009000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4dea8b0d25db8603b7a264509206e52c1b0a549 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_009000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a332d94995ec0064d87b4df74e4088bbd1a5c3617546a0a1bbce3295086ce7c +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_011000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_011000.pt new file mode 100644 index 0000000000000000000000000000000000000000..883c1408958a25a0b4d468274133d1be8cf4bb09 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_011000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d3e710535d3469edaa1595cd2ce5a7cc4f63d92dd71bb96d5466f689272de7c +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_016000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_016000.pt new file mode 100644 index 0000000000000000000000000000000000000000..cebd74fddf6e1e6274020790c3b78601206ccf6d --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_016000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a5a3d8ee57945232054203ebda47ecad9930af172b942936fec381a1f361820 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_021000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_021000.pt new file mode 100644 index 0000000000000000000000000000000000000000..a27c5cac1062be7859fcec9d5a4c4b1a94f9dcba --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.95-bs_32-seq_1024-iters_25000/250729_212048/checkpoints/step_021000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff95e077e5744c7ad6f65b7caf5ee8ddcb1077219e3b97eace01ee87d7eeec9 +size 1297616507 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_000500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_000500.pt new file mode 100644 index 0000000000000000000000000000000000000000..82e1a01f992edd4e5f17ac190c57864aa0aa1420 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_000500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c148115a7fb619f3d789dc4461ecc53e8085f22e826e723ad3aa11dfe6b0955d +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_001000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_001000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d10bfb42ac3631301a99b4410933d40a97c0d4cd --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_001000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72327895ee3ccaba827a6d12aa057735d38694417eba676970ae6e9260cc8576 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_001500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_001500.pt new file mode 100644 index 0000000000000000000000000000000000000000..9570ad179c6c8312a41660702bedd43bb274e827 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_001500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9375cbb2bc20357e44ed0cf9be7871ac0a8dc4c1ba44f61477cec49ece400e58 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_002000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_002000.pt new file mode 100644 index 0000000000000000000000000000000000000000..0211c9d2c583e68d47f29027fbb8591fa74aadb2 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_002000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cdebdec3e98ffeb535d33f39a8ec11a3b145804aa035a478944585e2abaeb31 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_002500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_002500.pt new file mode 100644 index 0000000000000000000000000000000000000000..e93b1dff9d9b38b544ae779bc3f7f990df9a4e16 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_002500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d9a3eec21075d229818e1a294a360b7d86a02b62012c384a3803a57fdde9f7 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_003000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_003000.pt new file mode 100644 index 0000000000000000000000000000000000000000..58765d9e5dbd8283cc7781c5e310f8100057d5ba --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_003000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c80573eb1ace5e01fe2a29b441ff3568b6777cf5194887fdbc6cb3b1d061eb +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_003500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_003500.pt new file mode 100644 index 0000000000000000000000000000000000000000..2147bbeee280d3e61b729eccea3de25de35d7d5e --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_003500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6008a835cd454d2c269636a2f6232c6125c92ee9fa87dd3333eb69b94e053480 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_004000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_004000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3be12e9a80ff73d72b44c3e0bc4c19ee3f9ad3b --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_004000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e41bcb3018cea4126248bd9108bf9c0a6ffa5caccc06ed766a6b8cc1ebd4516 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_004500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_004500.pt new file mode 100644 index 0000000000000000000000000000000000000000..a79f862bdbf6ef5e31e548bde2075de0cf16b3ee --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_004500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af98df88457dc586bcbc25dcbea2c4f5157b6b7e47f651cc75eb8ea97c5e4690 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_005000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_005000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f265a32bccbb1a83628967a468d68372bd9d835d --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_005000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d6a31e253d2549ff93a3f8d30b595a396625037b52ad54d6b8924d4e91d2750 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_005500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_005500.pt new file mode 100644 index 0000000000000000000000000000000000000000..94024bea5cbaf16c5ef5482ff523e9a73f96a5a0 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_005500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c83e71cc777770d50284dc30504278bfbfaa2dbfce7eccea38f23efb6162f459 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_006000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_006000.pt new file mode 100644 index 0000000000000000000000000000000000000000..819211e9299190284a99ade6e90bfd149d0d3fca --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_006000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9775fe074dda3d08374a71edef0afab752f5f88b3200b8e19cfcce1074d4edb +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_006500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_006500.pt new file mode 100644 index 0000000000000000000000000000000000000000..2084c836b1a0c9a5e8b8af9401be28cc19f5008f --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_006500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2082a52d7cae8aa4e377289528524b74a03a6332910f5612a1cc0272e0fe81e5 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_007000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_007000.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fb9203ecabe0b3df00d44d3f187d93781a1da82 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_007000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b8e16f8bd7da15ffee3362aa250d695fc7f4c9957ee9cdf1a1e678e553c6cac +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_007500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_007500.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ab3a98994509e40439e1046d1152128265f3511 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_007500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9508db9bf5885863eda48fbc61f6cf96acfd07fcbf5424cde67778872bfec8f +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_008000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_008000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b55b782927d9e7343ce9cf62eff2469990181ad7 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_008000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82d0bb8e2108772eb993e4c96f0b8c87e1ccd874baf079ec7302efae693dff8 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_008500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_008500.pt new file mode 100644 index 0000000000000000000000000000000000000000..79996f52742ed5e186a6e5ec9543b56446e794d0 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_008500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54f9478ea89c288bbcd4c7162a383de37305d36eece32a79fa4a44542a6f7738 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_009000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_009000.pt new file mode 100644 index 0000000000000000000000000000000000000000..253bafd60e4c557403b86697964b0e7855ea1352 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_009000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05bf2e6d07970ca1f24d003019a43ca2821c78df98d49454af325763da0472cc +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_009500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_009500.pt new file mode 100644 index 0000000000000000000000000000000000000000..7105b018d2aa7eb66d7d188687bc3224a6ff9911 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_009500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32ae4ef70a90a7946f5cce88749f2a970b415a0d0a8b09b860eb99bd27a5ae08 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_9537.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_9537.pt new file mode 100644 index 0000000000000000000000000000000000000000..adbfb0671a27dbef96f91613cc866d1fb656802c --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.03-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250622_020657/checkpoints/step_9537.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da2bc3f2b271fc8bd198d08ac9e61651d5cc715d45c562bbdf8c5e92047b8dda +size 1297607371 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_000000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_000000.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a5a9eee1a61241dcaa5f09697cd2ca460f292d9 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_000000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2c8e4073da6887e5f8caf0fc848f84d442f1c17e33df50c631f8f380c179a3 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_000500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_000500.pt new file mode 100644 index 0000000000000000000000000000000000000000..1be0bd334e2e57f1aa2f952cd15edec5fb0934e3 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_000500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2758b8f5e45f0bd3fdd2cb188259ce67a020c8d878e3b30ff97180e52efdc9c7 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_001000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_001000.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7065d9f9f93a9d4c8c56e814a4c8e363c1997ee --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_001000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d9393d149bd76e8feaf2e8ad4e098547a627bd6ceb3c1505fb896192d77984 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_001500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_001500.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2ab327a57469db0d7bf04d35d9490e7d1d5c9ba --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_001500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:092727727c8f5f436a843fb56366a75bbdeb38fc4f6a6da1a488e81b4acbfcf8 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_002000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_002000.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e64d321c60121dfb59c12e10a911ce09421e105 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_002000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1117d1ee873a4efede8de278a63c4549cb817cee1aea16241ff7127461b81d74 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_002500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_002500.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b90864cec00c877f8cc182cad30ca4f270c2482 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_002500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dae543c446c10912e89e70e283865f97d2196c9d26cc22442594cf67c12c358 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_003500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_003500.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fa9458b6a114cb496fc2cb415ae23a95ffbebf2 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_003500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e42df5e2ab659c58116731c872dca68c1ebc821c509405e9a7bc477aa66b805 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_004000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_004000.pt new file mode 100644 index 0000000000000000000000000000000000000000..1272aedafae2c8eaa4ea7781d090a747f067a7d4 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_004000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78955e29a655f017106845418a8275db26accf20b5324a077e32210f26f69206 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_005000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_005000.pt new file mode 100644 index 0000000000000000000000000000000000000000..f85af629af5a34f9ad94b9b17e90160ce517d57b --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_005000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:863d9c719cdb886b580543ecbfe3cfd1ed4605d35206275b890441d2fe0ba303 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_005500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_005500.pt new file mode 100644 index 0000000000000000000000000000000000000000..df9f81706d176e2f2ded0cc6ac75d2142b1165c2 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.95-bs_32-seq_1024-iters_9536/250702_163449/checkpoints/step_005500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341701ec8dfb3e0be1222e1a83581d25785cf9ef41446d77f8af915c913eb6cd +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_001000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_001000.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3f2f7dac7729e8a7fff21b58a052936b97e6750 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_001000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de25116aecdb4fd50760b9c94ccae762dc230fe79655fc935676bc5a77d626bd +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_001500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_001500.pt new file mode 100644 index 0000000000000000000000000000000000000000..9481231a74f01c65885d87b7384a67a1925fd485 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_001500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa6588ac8388dcd7aebee5785a73ad29db610a39d0a9d6fc230ba85ccf1c092 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_002000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_002000.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8fcac2f31645aa23651946dc842beed8584aa11 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_002000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39554aba2b3309523196d2eeed4316d5144e4756d1869c3faa7552ad0cbd56dc +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_002500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_002500.pt new file mode 100644 index 0000000000000000000000000000000000000000..adef645c02e5fa3b9bc6d14189ec45defb5f9ea8 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_002500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72eef06e4636da8cb48163dd5bc8dbb206ad3bb8fd608f65c0e65b102f61435f +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_003000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_003000.pt new file mode 100644 index 0000000000000000000000000000000000000000..6245e797f216ceb002d27c15790ac892c503e238 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_003000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ef559be6a370fe9aefc6fc8c6bb0e48317503552be83e23a1c7db0cd17d44b +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_003500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_003500.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1183db0a80257107b25f0aed6f93605eeb80a56 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_003500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca92fb91fdc0dd2ea89fb55dd1931a7f07a4f6ef61f645a922e6eecde0f7269 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_004000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_004000.pt new file mode 100644 index 0000000000000000000000000000000000000000..21fb7635179315972e756b43e285f04cf07d5c5a --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_004000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4851136b361297b4683f7340a13c49bce66f44c692c31126cb6479150fffc2f2 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_004500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_004500.pt new file mode 100644 index 0000000000000000000000000000000000000000..91f151748f0d0fe33ee1a89fcec67ac672e73a4c --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_004500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04c1c1edd47ec9ad986fc9f4311221f0004de94a1927cd05200978f209718b64 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_005000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_005000.pt new file mode 100644 index 0000000000000000000000000000000000000000..63fa47b9038613247a0f3235d4a3dd4c6adfcf14 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_005000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49b913395f6017837b38a21c3be425ee4b1f02eb5c063b0500f446a6d010260c +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_005500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_005500.pt new file mode 100644 index 0000000000000000000000000000000000000000..17c963f0583bcc1ad44b42b580ca685da3b17dd5 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_005500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b0611a33adfaf16fb0ab70fd6200770d069ed02e54b7bf74d8b7afd57442be +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_006000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_006000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a7373e7e2139bec46fac63a9be199c20cf490c3 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_006000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8512680c6561a1e4d415b1b366f99d0a5eab0f150f61067d1b1aa2b5c642b00 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_006500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_006500.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7f60104813e5eb30b46f5cc972a390a3b34c7d1 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_006500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c37502bdb2910028b25f45e86b04491a31281adf8013eb3bf47449313ab6d406 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_007000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_007000.pt new file mode 100644 index 0000000000000000000000000000000000000000..3373b541ab82386d2fa26a0aed14c98a0f7c3a72 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_007000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec86fc6a67784b0ecc4c3272642109f301f357aebaf2996e2b671d7ad289f21 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_007500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_007500.pt new file mode 100644 index 0000000000000000000000000000000000000000..5791e954572b788d50ec6ec0a6e493a7da689b52 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_007500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2550e737b918c96c9e684a6996fdb5ffebdabbfdd75bacc8be77fe7f606e6d8d +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_008000.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_008000.pt new file mode 100644 index 0000000000000000000000000000000000000000..16d35b3b06fb88a2bdadc08988930c674f39988c --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_008000.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be6e89ae3458a2427ad7a6a36f706cee8d8b91ccf1ce53c44d397822d1c6a137 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_008500.pt b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_008500.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc913607478b72832becfb4eb9b6d591b9b0d7d0 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/constant/lr_0.06-wd_0.0001-m_0.98-bs_32-seq_1024-iters_9536/250702_174110/checkpoints/step_008500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb18b58b35db167b759eb8f16e6a16ed8da9ae8403a5459b3d3eda69ed64b97 +size 1297607995 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_002000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_002000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..74b4c2c98b1af6766eb9febd94a3bb6b552a90d5 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_002000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b53a4eff73cd4256343517dc55099ddfb93e02d4c334b0d8f00376575a2b1c +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_002000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_002000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6454eaac4990142399c29965294ca6bca3b70c9 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_002000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94acafd8f8d4c7d926439938884912ee75bb454847aeab75d1f24cc1a1953444 +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_003000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_003000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e879391237f2b4435a2b8ec3b3ddfd30a496af9 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_003000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc7dc17f228a52a59f3b9947f317916f3b3d4bfc8ed2f7ec9b2ca2f07870c6dd +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_003000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_003000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..c54630f1a537eb37c97838eaec5cf4c5125a668e --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_003000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6d88720cf64159920029f2eae64ade7d871a1b2a52d179f50abaf0e717650f4 +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_004000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_004000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..267294452fc647a24e27d7d0c9b06a17a1a1a749 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_004000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0960f7a7b5f044e4253a0c2b6f19acea6515f23afd0ec01f1af2805a1e65505e +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_004000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_004000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..c485d302f38e93207d2d2045a3ebf36a5dffa3e2 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_004000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c21f3ee55271e9bcabca8c100ea4974cd4ee56e4acf0a5eda9de041b1e21c8 +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_005000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_005000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..e89b5837758bd4b7397ce945082666db2c84339b --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_005000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f838c24a5fb1b93cf797ebf9a092ed620c51344c3bb1bd9de112efbb3cfa9105 +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_005000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_005000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfa616464bf8ac95852ad80cf45fcf9857338c44 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_005000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09d39d82bf3d2e04777031f2a45abcd5bc9689d71352db5ed8d7d0d8a1b2cfa +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_006000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_006000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..772fc59e92db278c200d2e66d72947c1b13f393d --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_006000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67be69e4be81942c020c71364cf9a043bc67017723bfabaa39608d3fed7ff179 +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_007000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_007000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b4038e1890b9c28e550193552e6e1e8fa99e5a5 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_007000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8729969581ab3322e44b28c8dfe0f4d3d2a29b02794b3eeaef2dcfc00828920b +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_007000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_007000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..216ac604e4134ac208f4b093ce023b622c2c4992 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_007000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:507ec6248c32aece694d3c9fcc1f4fb2198ad7bfbc123fd4bcdad897937a52bb +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_008000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_008000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd03dfadd95f590020f2c28c27bb6b31a0e1881b --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_008000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4710835eccc2222502e6c53e57963d29c069b5c555fd4763249633b273a0007 +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_008000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_008000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..44d9b40a78bf59e504fd2816309f9fd04e561e9c --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_008000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3d3ad65e8ee338fc3aaf9033dc67a0be5a95fcc4cb3a634b6cd2ffe0f3264eb +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_010000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_010000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..da05c6f29aa28d411521aba99543a20cfc72bea0 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_010000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6ca22aa9050959eb2b3b8f1e3d27545338e6f81acf82c2ed87c97838722597 +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_010000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_010000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..af4f73e59276126aa501a2319a84d033f269b251 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_010000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0846f878ac997c607bbc6961445d5871194291c0d282b8ababa84626baecbdc9 +size 3568 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_015000/losses_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_015000/losses_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bb9d3a919bc59c38cec419e520de41ac16329c9 --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_015000/losses_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ea2ca144d99b3135d8ea9521f20146263fe3d2675045b503c704abf50544201 +size 3575 diff --git a/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_015000/norms_lr.pt b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_015000/norms_lr.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0bd48a2ddfbe8e638bd021161e7b3a049e5bbcf --- /dev/null +++ b/fineweb-10B/gpt2/eos/sgd/cosine/cosine-2048-12000-15000-0.1/lr_0.1-wd_0.0001-m_0.0-bs_32-seq_1024-iters_15000/250622_051646/eos_original/step_015000/norms_lr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3714d5549388de337aab3ac5484946ea9a8574be00de67bc0355a1a07b2c845 +size 3568