{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 30,
"global_step": 590,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017094017094017096,
"grad_norm": 23.0,
"learning_rate": 0.0,
"loss": 0.9788,
"num_tokens": 25595.0,
"step": 1
},
{
"epoch": 0.03418803418803419,
"grad_norm": 25.625,
"learning_rate": 1.6666666666666668e-07,
"loss": 1.108,
"num_tokens": 49684.0,
"step": 2
},
{
"epoch": 0.05128205128205128,
"grad_norm": 25.0,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.0319,
"num_tokens": 74064.0,
"step": 3
},
{
"epoch": 0.06837606837606838,
"grad_norm": 24.25,
"learning_rate": 5.000000000000001e-07,
"loss": 0.9993,
"num_tokens": 99232.0,
"step": 4
},
{
"epoch": 0.08547008547008547,
"grad_norm": 25.0,
"learning_rate": 6.666666666666667e-07,
"loss": 1.0217,
"num_tokens": 123515.0,
"step": 5
},
{
"epoch": 0.10256410256410256,
"grad_norm": 23.625,
"learning_rate": 8.333333333333333e-07,
"loss": 1.0487,
"num_tokens": 149121.0,
"step": 6
},
{
"epoch": 0.11965811965811966,
"grad_norm": 24.625,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0543,
"num_tokens": 173222.0,
"step": 7
},
{
"epoch": 0.13675213675213677,
"grad_norm": 24.5,
"learning_rate": 1.1666666666666668e-06,
"loss": 1.001,
"num_tokens": 198434.0,
"step": 8
},
{
"epoch": 0.15384615384615385,
"grad_norm": 24.375,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.0028,
"num_tokens": 222130.0,
"step": 9
},
{
"epoch": 0.17094017094017094,
"grad_norm": 22.375,
"learning_rate": 1.5e-06,
"loss": 0.9943,
"num_tokens": 247232.0,
"step": 10
},
{
"epoch": 0.18803418803418803,
"grad_norm": 23.0,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.037,
"num_tokens": 271615.0,
"step": 11
},
{
"epoch": 0.20512820512820512,
"grad_norm": 21.625,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.9606,
"num_tokens": 296685.0,
"step": 12
},
{
"epoch": 0.2222222222222222,
"grad_norm": 21.25,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9682,
"num_tokens": 321158.0,
"step": 13
},
{
"epoch": 0.23931623931623933,
"grad_norm": 20.375,
"learning_rate": 2.166666666666667e-06,
"loss": 0.9362,
"num_tokens": 345732.0,
"step": 14
},
{
"epoch": 0.2564102564102564,
"grad_norm": 18.125,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.9439,
"num_tokens": 370226.0,
"step": 15
},
{
"epoch": 0.27350427350427353,
"grad_norm": 15.5625,
"learning_rate": 2.5e-06,
"loss": 0.8796,
"num_tokens": 395659.0,
"step": 16
},
{
"epoch": 0.2905982905982906,
"grad_norm": 13.6875,
"learning_rate": 2.666666666666667e-06,
"loss": 0.8444,
"num_tokens": 421481.0,
"step": 17
},
{
"epoch": 0.3076923076923077,
"grad_norm": 13.5,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.8746,
"num_tokens": 445806.0,
"step": 18
},
{
"epoch": 0.3247863247863248,
"grad_norm": 12.0625,
"learning_rate": 3e-06,
"loss": 0.801,
"num_tokens": 470449.0,
"step": 19
},
{
"epoch": 0.3418803418803419,
"grad_norm": 11.4375,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.7975,
"num_tokens": 495165.0,
"step": 20
},
{
"epoch": 0.358974358974359,
"grad_norm": 10.375,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7578,
"num_tokens": 520803.0,
"step": 21
},
{
"epoch": 0.37606837606837606,
"grad_norm": 10.0625,
"learning_rate": 3.5e-06,
"loss": 0.7703,
"num_tokens": 545292.0,
"step": 22
},
{
"epoch": 0.39316239316239315,
"grad_norm": 9.0,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.7019,
"num_tokens": 570089.0,
"step": 23
},
{
"epoch": 0.41025641025641024,
"grad_norm": 7.96875,
"learning_rate": 3.833333333333334e-06,
"loss": 0.6988,
"num_tokens": 594989.0,
"step": 24
},
{
"epoch": 0.42735042735042733,
"grad_norm": 7.6875,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7039,
"num_tokens": 619663.0,
"step": 25
},
{
"epoch": 0.4444444444444444,
"grad_norm": 7.09375,
"learning_rate": 4.166666666666667e-06,
"loss": 0.6563,
"num_tokens": 644898.0,
"step": 26
},
{
"epoch": 0.46153846153846156,
"grad_norm": 7.0625,
"learning_rate": 4.333333333333334e-06,
"loss": 0.653,
"num_tokens": 669606.0,
"step": 27
},
{
"epoch": 0.47863247863247865,
"grad_norm": 6.625,
"learning_rate": 4.5e-06,
"loss": 0.6002,
"num_tokens": 694163.0,
"step": 28
},
{
"epoch": 0.49572649572649574,
"grad_norm": 5.96875,
"learning_rate": 4.666666666666667e-06,
"loss": 0.6082,
"num_tokens": 720346.0,
"step": 29
},
{
"epoch": 0.5128205128205128,
"grad_norm": 5.59375,
"learning_rate": 4.833333333333333e-06,
"loss": 0.5732,
"num_tokens": 744937.0,
"step": 30
},
{
"epoch": 0.5128205128205128,
"eval_loss": 0.6271482706069946,
"eval_num_tokens": 744937.0,
"eval_runtime": 3.9192,
"eval_samples_per_second": 336.551,
"eval_steps_per_second": 10.717,
"step": 30
},
{
"epoch": 0.5299145299145299,
"grad_norm": 4.15625,
"learning_rate": 5e-06,
"loss": 0.6064,
"num_tokens": 770773.0,
"step": 31
},
{
"epoch": 0.5470085470085471,
"grad_norm": 3.65625,
"learning_rate": 4.999960660162164e-06,
"loss": 0.55,
"num_tokens": 795350.0,
"step": 32
},
{
"epoch": 0.5641025641025641,
"grad_norm": 3.359375,
"learning_rate": 4.999842641886752e-06,
"loss": 0.5366,
"num_tokens": 820624.0,
"step": 33
},
{
"epoch": 0.5811965811965812,
"grad_norm": 3.734375,
"learning_rate": 4.9996459488880215e-06,
"loss": 0.572,
"num_tokens": 844564.0,
"step": 34
},
{
"epoch": 0.5982905982905983,
"grad_norm": 3.71875,
"learning_rate": 4.999370587356267e-06,
"loss": 0.5462,
"num_tokens": 867822.0,
"step": 35
},
{
"epoch": 0.6153846153846154,
"grad_norm": 3.53125,
"learning_rate": 4.999016565957633e-06,
"loss": 0.5399,
"num_tokens": 893203.0,
"step": 36
},
{
"epoch": 0.6324786324786325,
"grad_norm": 2.921875,
"learning_rate": 4.998583895833834e-06,
"loss": 0.5476,
"num_tokens": 919455.0,
"step": 37
},
{
"epoch": 0.6495726495726496,
"grad_norm": 2.640625,
"learning_rate": 4.998072590601808e-06,
"loss": 0.518,
"num_tokens": 944217.0,
"step": 38
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.703125,
"learning_rate": 4.997482666353287e-06,
"loss": 0.57,
"num_tokens": 968490.0,
"step": 39
},
{
"epoch": 0.6837606837606838,
"grad_norm": 2.71875,
"learning_rate": 4.996814141654291e-06,
"loss": 0.5546,
"num_tokens": 992306.0,
"step": 40
},
{
"epoch": 0.7008547008547008,
"grad_norm": 2.4375,
"learning_rate": 4.996067037544542e-06,
"loss": 0.5206,
"num_tokens": 1016929.0,
"step": 41
},
{
"epoch": 0.717948717948718,
"grad_norm": 2.46875,
"learning_rate": 4.9952413775368034e-06,
"loss": 0.525,
"num_tokens": 1042218.0,
"step": 42
},
{
"epoch": 0.7350427350427351,
"grad_norm": 2.3125,
"learning_rate": 4.99433718761614e-06,
"loss": 0.5278,
"num_tokens": 1066745.0,
"step": 43
},
{
"epoch": 0.7521367521367521,
"grad_norm": 2.3125,
"learning_rate": 4.993354496239101e-06,
"loss": 0.5256,
"num_tokens": 1091730.0,
"step": 44
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.203125,
"learning_rate": 4.992293334332821e-06,
"loss": 0.5076,
"num_tokens": 1116183.0,
"step": 45
},
{
"epoch": 0.7863247863247863,
"grad_norm": 2.15625,
"learning_rate": 4.9911537352940485e-06,
"loss": 0.5113,
"num_tokens": 1142542.0,
"step": 46
},
{
"epoch": 0.8034188034188035,
"grad_norm": 2.203125,
"learning_rate": 4.989935734988098e-06,
"loss": 0.5558,
"num_tokens": 1168446.0,
"step": 47
},
{
"epoch": 0.8205128205128205,
"grad_norm": 2.21875,
"learning_rate": 4.988639371747717e-06,
"loss": 0.5316,
"num_tokens": 1193788.0,
"step": 48
},
{
"epoch": 0.8376068376068376,
"grad_norm": 2.125,
"learning_rate": 4.987264686371881e-06,
"loss": 0.5293,
"num_tokens": 1218441.0,
"step": 49
},
{
"epoch": 0.8547008547008547,
"grad_norm": 2.359375,
"learning_rate": 4.98581172212451e-06,
"loss": 0.5493,
"num_tokens": 1242720.0,
"step": 50
},
{
"epoch": 0.8717948717948718,
"grad_norm": 2.125,
"learning_rate": 4.984280524733107e-06,
"loss": 0.5171,
"num_tokens": 1268282.0,
"step": 51
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.046875,
"learning_rate": 4.982671142387316e-06,
"loss": 0.5173,
"num_tokens": 1292780.0,
"step": 52
},
{
"epoch": 0.905982905982906,
"grad_norm": 2.09375,
"learning_rate": 4.980983625737411e-06,
"loss": 0.5292,
"num_tokens": 1318538.0,
"step": 53
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.0625,
"learning_rate": 4.979218027892696e-06,
"loss": 0.5264,
"num_tokens": 1343366.0,
"step": 54
},
{
"epoch": 0.9401709401709402,
"grad_norm": 2.0,
"learning_rate": 4.977374404419838e-06,
"loss": 0.5078,
"num_tokens": 1368198.0,
"step": 55
},
{
"epoch": 0.9572649572649573,
"grad_norm": 1.984375,
"learning_rate": 4.9754528133411144e-06,
"loss": 0.503,
"num_tokens": 1394723.0,
"step": 56
},
{
"epoch": 0.9743589743589743,
"grad_norm": 2.015625,
"learning_rate": 4.973453315132592e-06,
"loss": 0.5035,
"num_tokens": 1419991.0,
"step": 57
},
{
"epoch": 0.9914529914529915,
"grad_norm": 2.03125,
"learning_rate": 4.9713759727222184e-06,
"loss": 0.527,
"num_tokens": 1444550.0,
"step": 58
},
{
"epoch": 1.0,
"grad_norm": 3.0,
"learning_rate": 4.9692208514878445e-06,
"loss": 0.5228,
"num_tokens": 1454461.0,
"step": 59
},
{
"epoch": 1.017094017094017,
"grad_norm": 1.9296875,
"learning_rate": 4.966988019255167e-06,
"loss": 0.5265,
"num_tokens": 1479102.0,
"step": 60
},
{
"epoch": 1.017094017094017,
"eval_loss": 0.5714334845542908,
"eval_num_tokens": 1479102.0,
"eval_runtime": 3.6744,
"eval_samples_per_second": 358.965,
"eval_steps_per_second": 11.43,
"step": 60
},
{
"epoch": 1.0341880341880343,
"grad_norm": 1.9375,
"learning_rate": 4.96467754629559e-06,
"loss": 0.5178,
"num_tokens": 1504664.0,
"step": 61
},
{
"epoch": 1.0512820512820513,
"grad_norm": 1.875,
"learning_rate": 4.962289505324021e-06,
"loss": 0.4939,
"num_tokens": 1529770.0,
"step": 62
},
{
"epoch": 1.0683760683760684,
"grad_norm": 1.984375,
"learning_rate": 4.959823971496575e-06,
"loss": 0.5263,
"num_tokens": 1554337.0,
"step": 63
},
{
"epoch": 1.0854700854700854,
"grad_norm": 1.9453125,
"learning_rate": 4.957281022408212e-06,
"loss": 0.5433,
"num_tokens": 1579659.0,
"step": 64
},
{
"epoch": 1.1025641025641026,
"grad_norm": 1.984375,
"learning_rate": 4.954660738090297e-06,
"loss": 0.5237,
"num_tokens": 1605926.0,
"step": 65
},
{
"epoch": 1.1196581196581197,
"grad_norm": 1.96875,
"learning_rate": 4.9519632010080765e-06,
"loss": 0.5467,
"num_tokens": 1631620.0,
"step": 66
},
{
"epoch": 1.1367521367521367,
"grad_norm": 2.015625,
"learning_rate": 4.949188496058089e-06,
"loss": 0.5179,
"num_tokens": 1656811.0,
"step": 67
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.8359375,
"learning_rate": 4.946336710565489e-06,
"loss": 0.4956,
"num_tokens": 1682668.0,
"step": 68
},
{
"epoch": 1.170940170940171,
"grad_norm": 1.9609375,
"learning_rate": 4.943407934281298e-06,
"loss": 0.4481,
"num_tokens": 1707713.0,
"step": 69
},
{
"epoch": 1.188034188034188,
"grad_norm": 1.9921875,
"learning_rate": 4.940402259379585e-06,
"loss": 0.4895,
"num_tokens": 1732157.0,
"step": 70
},
{
"epoch": 1.205128205128205,
"grad_norm": 1.9453125,
"learning_rate": 4.937319780454559e-06,
"loss": 0.5289,
"num_tokens": 1757982.0,
"step": 71
},
{
"epoch": 1.2222222222222223,
"grad_norm": 1.9609375,
"learning_rate": 4.934160594517598e-06,
"loss": 0.5181,
"num_tokens": 1782772.0,
"step": 72
},
{
"epoch": 1.2393162393162394,
"grad_norm": 2.015625,
"learning_rate": 4.930924800994192e-06,
"loss": 0.5355,
"num_tokens": 1807558.0,
"step": 73
},
{
"epoch": 1.2564102564102564,
"grad_norm": 2.078125,
"learning_rate": 4.9276125017208144e-06,
"loss": 0.5314,
"num_tokens": 1831248.0,
"step": 74
},
{
"epoch": 1.2735042735042734,
"grad_norm": 1.8828125,
"learning_rate": 4.924223800941718e-06,
"loss": 0.4989,
"num_tokens": 1855690.0,
"step": 75
},
{
"epoch": 1.2905982905982907,
"grad_norm": 1.859375,
"learning_rate": 4.920758805305654e-06,
"loss": 0.5124,
"num_tokens": 1879982.0,
"step": 76
},
{
"epoch": 1.3076923076923077,
"grad_norm": 1.96875,
"learning_rate": 4.917217623862516e-06,
"loss": 0.4972,
"num_tokens": 1903772.0,
"step": 77
},
{
"epoch": 1.3247863247863247,
"grad_norm": 1.875,
"learning_rate": 4.913600368059908e-06,
"loss": 0.5096,
"num_tokens": 1928519.0,
"step": 78
},
{
"epoch": 1.341880341880342,
"grad_norm": 1.953125,
"learning_rate": 4.909907151739634e-06,
"loss": 0.5159,
"num_tokens": 1953015.0,
"step": 79
},
{
"epoch": 1.358974358974359,
"grad_norm": 1.78125,
"learning_rate": 4.906138091134118e-06,
"loss": 0.4839,
"num_tokens": 1977983.0,
"step": 80
},
{
"epoch": 1.376068376068376,
"grad_norm": 1.84375,
"learning_rate": 4.9022933048627496e-06,
"loss": 0.5061,
"num_tokens": 2002217.0,
"step": 81
},
{
"epoch": 1.393162393162393,
"grad_norm": 1.90625,
"learning_rate": 4.89837291392814e-06,
"loss": 0.4503,
"num_tokens": 2026080.0,
"step": 82
},
{
"epoch": 1.4102564102564101,
"grad_norm": 1.8984375,
"learning_rate": 4.894377041712327e-06,
"loss": 0.4999,
"num_tokens": 2051323.0,
"step": 83
},
{
"epoch": 1.4273504273504274,
"grad_norm": 1.8671875,
"learning_rate": 4.89030581397288e-06,
"loss": 0.504,
"num_tokens": 2077546.0,
"step": 84
},
{
"epoch": 1.4444444444444444,
"grad_norm": 1.8046875,
"learning_rate": 4.886159358838952e-06,
"loss": 0.4974,
"num_tokens": 2102220.0,
"step": 85
},
{
"epoch": 1.4615384615384617,
"grad_norm": 1.8515625,
"learning_rate": 4.881937806807241e-06,
"loss": 0.4785,
"num_tokens": 2126431.0,
"step": 86
},
{
"epoch": 1.4786324786324787,
"grad_norm": 2.0625,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.5217,
"num_tokens": 2150716.0,
"step": 87
},
{
"epoch": 1.4957264957264957,
"grad_norm": 2.015625,
"learning_rate": 4.873269945850279e-06,
"loss": 0.5251,
"num_tokens": 2175905.0,
"step": 88
},
{
"epoch": 1.5128205128205128,
"grad_norm": 2.0625,
"learning_rate": 4.868823909718823e-06,
"loss": 0.5125,
"num_tokens": 2201148.0,
"step": 89
},
{
"epoch": 1.5299145299145298,
"grad_norm": 1.828125,
"learning_rate": 4.864303322268588e-06,
"loss": 0.4781,
"num_tokens": 2225118.0,
"step": 90
},
{
"epoch": 1.5299145299145298,
"eval_loss": 0.5624352097511292,
"eval_num_tokens": 2225118.0,
"eval_runtime": 3.7185,
"eval_samples_per_second": 354.711,
"eval_steps_per_second": 11.295,
"step": 90
},
{
"epoch": 1.547008547008547,
"grad_norm": 1.859375,
"learning_rate": 4.859708325770919e-06,
"loss": 0.4883,
"num_tokens": 2250080.0,
"step": 91
},
{
"epoch": 1.564102564102564,
"grad_norm": 1.8984375,
"learning_rate": 4.8550390648389475e-06,
"loss": 0.5148,
"num_tokens": 2276542.0,
"step": 92
},
{
"epoch": 1.5811965811965814,
"grad_norm": 1.9140625,
"learning_rate": 4.850295686423048e-06,
"loss": 0.5161,
"num_tokens": 2301549.0,
"step": 93
},
{
"epoch": 1.5982905982905984,
"grad_norm": 1.8515625,
"learning_rate": 4.845478339806211e-06,
"loss": 0.4967,
"num_tokens": 2326231.0,
"step": 94
},
{
"epoch": 1.6153846153846154,
"grad_norm": 1.9375,
"learning_rate": 4.8405871765993435e-06,
"loss": 0.5131,
"num_tokens": 2350342.0,
"step": 95
},
{
"epoch": 1.6324786324786325,
"grad_norm": 1.8671875,
"learning_rate": 4.835622350736499e-06,
"loss": 0.4803,
"num_tokens": 2374576.0,
"step": 96
},
{
"epoch": 1.6495726495726495,
"grad_norm": 1.84375,
"learning_rate": 4.830584018470036e-06,
"loss": 0.4972,
"num_tokens": 2401193.0,
"step": 97
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.9375,
"learning_rate": 4.825472338365691e-06,
"loss": 0.498,
"num_tokens": 2426218.0,
"step": 98
},
{
"epoch": 1.6837606837606838,
"grad_norm": 1.8203125,
"learning_rate": 4.820287471297598e-06,
"loss": 0.4849,
"num_tokens": 2450946.0,
"step": 99
},
{
"epoch": 1.7008547008547008,
"grad_norm": 2.0625,
"learning_rate": 4.81502958044322e-06,
"loss": 0.5148,
"num_tokens": 2475890.0,
"step": 100
},
{
"epoch": 1.717948717948718,
"grad_norm": 2.015625,
"learning_rate": 4.809698831278217e-06,
"loss": 0.5278,
"num_tokens": 2500942.0,
"step": 101
},
{
"epoch": 1.735042735042735,
"grad_norm": 1.9140625,
"learning_rate": 4.8042953915712354e-06,
"loss": 0.473,
"num_tokens": 2525522.0,
"step": 102
},
{
"epoch": 1.7521367521367521,
"grad_norm": 1.859375,
"learning_rate": 4.7988194313786275e-06,
"loss": 0.4824,
"num_tokens": 2550057.0,
"step": 103
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.90625,
"learning_rate": 4.7932711230391015e-06,
"loss": 0.4926,
"num_tokens": 2574298.0,
"step": 104
},
{
"epoch": 1.7863247863247862,
"grad_norm": 1.8671875,
"learning_rate": 4.7876506411683e-06,
"loss": 0.5095,
"num_tokens": 2599108.0,
"step": 105
},
{
"epoch": 1.8034188034188035,
"grad_norm": 1.96875,
"learning_rate": 4.781958162653298e-06,
"loss": 0.5089,
"num_tokens": 2624100.0,
"step": 106
},
{
"epoch": 1.8205128205128205,
"grad_norm": 1.734375,
"learning_rate": 4.7761938666470405e-06,
"loss": 0.4619,
"num_tokens": 2650393.0,
"step": 107
},
{
"epoch": 1.8376068376068377,
"grad_norm": 1.9921875,
"learning_rate": 4.770357934562704e-06,
"loss": 0.504,
"num_tokens": 2675279.0,
"step": 108
},
{
"epoch": 1.8547008547008548,
"grad_norm": 1.96875,
"learning_rate": 4.764450550067986e-06,
"loss": 0.4738,
"num_tokens": 2698998.0,
"step": 109
},
{
"epoch": 1.8717948717948718,
"grad_norm": 1.9140625,
"learning_rate": 4.758471899079325e-06,
"loss": 0.5221,
"num_tokens": 2723560.0,
"step": 110
},
{
"epoch": 1.8888888888888888,
"grad_norm": 1.8203125,
"learning_rate": 4.752422169756048e-06,
"loss": 0.5061,
"num_tokens": 2748478.0,
"step": 111
},
{
"epoch": 1.9059829059829059,
"grad_norm": 1.875,
"learning_rate": 4.746301552494453e-06,
"loss": 0.5154,
"num_tokens": 2774852.0,
"step": 112
},
{
"epoch": 1.9230769230769231,
"grad_norm": 1.9296875,
"learning_rate": 4.740110239921813e-06,
"loss": 0.5106,
"num_tokens": 2800119.0,
"step": 113
},
{
"epoch": 1.9401709401709402,
"grad_norm": 1.78125,
"learning_rate": 4.7338484268903125e-06,
"loss": 0.4917,
"num_tokens": 2826077.0,
"step": 114
},
{
"epoch": 1.9572649572649574,
"grad_norm": 1.921875,
"learning_rate": 4.72751631047092e-06,
"loss": 0.5087,
"num_tokens": 2851371.0,
"step": 115
},
{
"epoch": 1.9743589743589745,
"grad_norm": 2.03125,
"learning_rate": 4.721114089947181e-06,
"loss": 0.4763,
"num_tokens": 2875284.0,
"step": 116
},
{
"epoch": 1.9914529914529915,
"grad_norm": 1.9921875,
"learning_rate": 4.71464196680895e-06,
"loss": 0.5029,
"num_tokens": 2899750.0,
"step": 117
},
{
"epoch": 2.0,
"grad_norm": 3.265625,
"learning_rate": 4.708100144746046e-06,
"loss": 0.5426,
"num_tokens": 2908922.0,
"step": 118
},
{
"epoch": 2.017094017094017,
"grad_norm": 1.7109375,
"learning_rate": 4.701488829641845e-06,
"loss": 0.4663,
"num_tokens": 2935424.0,
"step": 119
},
{
"epoch": 2.034188034188034,
"grad_norm": 1.8515625,
"learning_rate": 4.6948082295667985e-06,
"loss": 0.4984,
"num_tokens": 2959733.0,
"step": 120
},
{
"epoch": 2.034188034188034,
"eval_loss": 0.5578281283378601,
"eval_num_tokens": 2959733.0,
"eval_runtime": 3.753,
"eval_samples_per_second": 351.454,
"eval_steps_per_second": 11.191,
"step": 120
},
{
"epoch": 2.051282051282051,
"grad_norm": 1.84375,
"learning_rate": 4.6880585547718845e-06,
"loss": 0.5201,
"num_tokens": 2984243.0,
"step": 121
},
{
"epoch": 2.0683760683760686,
"grad_norm": 1.9375,
"learning_rate": 4.681240017681994e-06,
"loss": 0.4877,
"num_tokens": 3007597.0,
"step": 122
},
{
"epoch": 2.0854700854700856,
"grad_norm": 1.8984375,
"learning_rate": 4.674352832889239e-06,
"loss": 0.52,
"num_tokens": 3031599.0,
"step": 123
},
{
"epoch": 2.1025641025641026,
"grad_norm": 1.9140625,
"learning_rate": 4.667397217146208e-06,
"loss": 0.4729,
"num_tokens": 3055857.0,
"step": 124
},
{
"epoch": 2.1196581196581197,
"grad_norm": 1.9375,
"learning_rate": 4.660373389359137e-06,
"loss": 0.507,
"num_tokens": 3080352.0,
"step": 125
},
{
"epoch": 2.1367521367521367,
"grad_norm": 1.859375,
"learning_rate": 4.653281570581023e-06,
"loss": 0.4814,
"num_tokens": 3104247.0,
"step": 126
},
{
"epoch": 2.1538461538461537,
"grad_norm": 1.8203125,
"learning_rate": 4.646121984004666e-06,
"loss": 0.462,
"num_tokens": 3128125.0,
"step": 127
},
{
"epoch": 2.1709401709401708,
"grad_norm": 1.9375,
"learning_rate": 4.638894854955645e-06,
"loss": 0.4944,
"num_tokens": 3152844.0,
"step": 128
},
{
"epoch": 2.1880341880341883,
"grad_norm": 1.8046875,
"learning_rate": 4.631600410885231e-06,
"loss": 0.4977,
"num_tokens": 3177853.0,
"step": 129
},
{
"epoch": 2.2051282051282053,
"grad_norm": 1.8828125,
"learning_rate": 4.624238881363219e-06,
"loss": 0.5246,
"num_tokens": 3203411.0,
"step": 130
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.8984375,
"learning_rate": 4.6168104980707105e-06,
"loss": 0.5069,
"num_tokens": 3228533.0,
"step": 131
},
{
"epoch": 2.2393162393162394,
"grad_norm": 2.046875,
"learning_rate": 4.609315494792823e-06,
"loss": 0.5008,
"num_tokens": 3252563.0,
"step": 132
},
{
"epoch": 2.2564102564102564,
"grad_norm": 2.078125,
"learning_rate": 4.601754107411326e-06,
"loss": 0.4893,
"num_tokens": 3276004.0,
"step": 133
},
{
"epoch": 2.2735042735042734,
"grad_norm": 1.84375,
"learning_rate": 4.594126573897222e-06,
"loss": 0.449,
"num_tokens": 3300197.0,
"step": 134
},
{
"epoch": 2.2905982905982905,
"grad_norm": 1.7890625,
"learning_rate": 4.586433134303257e-06,
"loss": 0.4602,
"num_tokens": 3324679.0,
"step": 135
},
{
"epoch": 2.3076923076923075,
"grad_norm": 1.7734375,
"learning_rate": 4.578674030756364e-06,
"loss": 0.4809,
"num_tokens": 3350582.0,
"step": 136
},
{
"epoch": 2.324786324786325,
"grad_norm": 1.9140625,
"learning_rate": 4.570849507450042e-06,
"loss": 0.5126,
"num_tokens": 3375575.0,
"step": 137
},
{
"epoch": 2.341880341880342,
"grad_norm": 1.8046875,
"learning_rate": 4.562959810636674e-06,
"loss": 0.4904,
"num_tokens": 3401385.0,
"step": 138
},
{
"epoch": 2.358974358974359,
"grad_norm": 1.7578125,
"learning_rate": 4.555005188619776e-06,
"loss": 0.4845,
"num_tokens": 3426459.0,
"step": 139
},
{
"epoch": 2.376068376068376,
"grad_norm": 1.8828125,
"learning_rate": 4.546985891746177e-06,
"loss": 0.5121,
"num_tokens": 3451522.0,
"step": 140
},
{
"epoch": 2.393162393162393,
"grad_norm": 1.828125,
"learning_rate": 4.538902172398151e-06,
"loss": 0.5063,
"num_tokens": 3477049.0,
"step": 141
},
{
"epoch": 2.41025641025641,
"grad_norm": 1.921875,
"learning_rate": 4.530754284985463e-06,
"loss": 0.4813,
"num_tokens": 3501648.0,
"step": 142
},
{
"epoch": 2.427350427350427,
"grad_norm": 1.84375,
"learning_rate": 4.522542485937369e-06,
"loss": 0.4835,
"num_tokens": 3526588.0,
"step": 143
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.765625,
"learning_rate": 4.514267033694544e-06,
"loss": 0.48,
"num_tokens": 3551875.0,
"step": 144
},
{
"epoch": 2.4615384615384617,
"grad_norm": 1.8359375,
"learning_rate": 4.505928188700946e-06,
"loss": 0.4868,
"num_tokens": 3576916.0,
"step": 145
},
{
"epoch": 2.4786324786324787,
"grad_norm": 1.921875,
"learning_rate": 4.4975262133956235e-06,
"loss": 0.4892,
"num_tokens": 3600882.0,
"step": 146
},
{
"epoch": 2.4957264957264957,
"grad_norm": 2.046875,
"learning_rate": 4.4890613722044526e-06,
"loss": 0.4793,
"num_tokens": 3625564.0,
"step": 147
},
{
"epoch": 2.5128205128205128,
"grad_norm": 2.03125,
"learning_rate": 4.480533931531819e-06,
"loss": 0.4893,
"num_tokens": 3650390.0,
"step": 148
},
{
"epoch": 2.52991452991453,
"grad_norm": 1.8515625,
"learning_rate": 4.471944159752228e-06,
"loss": 0.5143,
"num_tokens": 3675958.0,
"step": 149
},
{
"epoch": 2.547008547008547,
"grad_norm": 1.78125,
"learning_rate": 4.463292327201862e-06,
"loss": 0.4976,
"num_tokens": 3701445.0,
"step": 150
},
{
"epoch": 2.547008547008547,
"eval_loss": 0.5555291175842285,
"eval_num_tokens": 3701445.0,
"eval_runtime": 3.6005,
"eval_samples_per_second": 366.333,
"eval_steps_per_second": 11.665,
"step": 150
},
{
"epoch": 2.564102564102564,
"grad_norm": 1.734375,
"learning_rate": 4.454578706170075e-06,
"loss": 0.4661,
"num_tokens": 3726919.0,
"step": 151
},
{
"epoch": 2.5811965811965814,
"grad_norm": 1.953125,
"learning_rate": 4.445803570890815e-06,
"loss": 0.483,
"num_tokens": 3751903.0,
"step": 152
},
{
"epoch": 2.5982905982905984,
"grad_norm": 1.8671875,
"learning_rate": 4.436967197534003e-06,
"loss": 0.5107,
"num_tokens": 3776558.0,
"step": 153
},
{
"epoch": 2.6153846153846154,
"grad_norm": 1.75,
"learning_rate": 4.4280698641968335e-06,
"loss": 0.4787,
"num_tokens": 3802036.0,
"step": 154
},
{
"epoch": 2.6324786324786325,
"grad_norm": 1.7265625,
"learning_rate": 4.4191118508950286e-06,
"loss": 0.4584,
"num_tokens": 3827070.0,
"step": 155
},
{
"epoch": 2.6495726495726495,
"grad_norm": 1.859375,
"learning_rate": 4.410093439554019e-06,
"loss": 0.5037,
"num_tokens": 3852076.0,
"step": 156
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.8125,
"learning_rate": 4.401014914000078e-06,
"loss": 0.4796,
"num_tokens": 3877546.0,
"step": 157
},
{
"epoch": 2.683760683760684,
"grad_norm": 1.671875,
"learning_rate": 4.391876559951383e-06,
"loss": 0.4525,
"num_tokens": 3902477.0,
"step": 158
},
{
"epoch": 2.700854700854701,
"grad_norm": 1.7890625,
"learning_rate": 4.382678665009028e-06,
"loss": 0.4831,
"num_tokens": 3927621.0,
"step": 159
},
{
"epoch": 2.717948717948718,
"grad_norm": 1.859375,
"learning_rate": 4.373421518647968e-06,
"loss": 0.508,
"num_tokens": 3953135.0,
"step": 160
},
{
"epoch": 2.735042735042735,
"grad_norm": 1.9375,
"learning_rate": 4.364105412207914e-06,
"loss": 0.4923,
"num_tokens": 3977233.0,
"step": 161
},
{
"epoch": 2.752136752136752,
"grad_norm": 1.859375,
"learning_rate": 4.35473063888416e-06,
"loss": 0.4915,
"num_tokens": 4002412.0,
"step": 162
},
{
"epoch": 2.769230769230769,
"grad_norm": 1.7890625,
"learning_rate": 4.345297493718352e-06,
"loss": 0.5062,
"num_tokens": 4027883.0,
"step": 163
},
{
"epoch": 2.786324786324786,
"grad_norm": 1.7265625,
"learning_rate": 4.335806273589214e-06,
"loss": 0.4649,
"num_tokens": 4052975.0,
"step": 164
},
{
"epoch": 2.8034188034188032,
"grad_norm": 1.90625,
"learning_rate": 4.326257277203194e-06,
"loss": 0.499,
"num_tokens": 4077632.0,
"step": 165
},
{
"epoch": 2.8205128205128203,
"grad_norm": 1.921875,
"learning_rate": 4.316650805085068e-06,
"loss": 0.4717,
"num_tokens": 4100768.0,
"step": 166
},
{
"epoch": 2.8376068376068377,
"grad_norm": 1.8359375,
"learning_rate": 4.3069871595684795e-06,
"loss": 0.4634,
"num_tokens": 4125382.0,
"step": 167
},
{
"epoch": 2.8547008547008548,
"grad_norm": 1.8984375,
"learning_rate": 4.297266644786426e-06,
"loss": 0.5104,
"num_tokens": 4151107.0,
"step": 168
},
{
"epoch": 2.871794871794872,
"grad_norm": 1.8828125,
"learning_rate": 4.287489566661689e-06,
"loss": 0.5049,
"num_tokens": 4175799.0,
"step": 169
},
{
"epoch": 2.888888888888889,
"grad_norm": 1.75,
"learning_rate": 4.277656232897201e-06,
"loss": 0.5009,
"num_tokens": 4201835.0,
"step": 170
},
{
"epoch": 2.905982905982906,
"grad_norm": 1.8046875,
"learning_rate": 4.267766952966369e-06,
"loss": 0.4712,
"num_tokens": 4226851.0,
"step": 171
},
{
"epoch": 2.9230769230769234,
"grad_norm": 1.9453125,
"learning_rate": 4.257822038103326e-06,
"loss": 0.4979,
"num_tokens": 4252450.0,
"step": 172
},
{
"epoch": 2.9401709401709404,
"grad_norm": 1.78125,
"learning_rate": 4.247821801293144e-06,
"loss": 0.4507,
"num_tokens": 4278416.0,
"step": 173
},
{
"epoch": 2.9572649572649574,
"grad_norm": 1.828125,
"learning_rate": 4.237766557261977e-06,
"loss": 0.4681,
"num_tokens": 4302908.0,
"step": 174
},
{
"epoch": 2.9743589743589745,
"grad_norm": 1.734375,
"learning_rate": 4.227656622467162e-06,
"loss": 0.479,
"num_tokens": 4328683.0,
"step": 175
},
{
"epoch": 2.9914529914529915,
"grad_norm": 1.7890625,
"learning_rate": 4.217492315087255e-06,
"loss": 0.4522,
"num_tokens": 4354142.0,
"step": 176
},
{
"epoch": 3.0,
"grad_norm": 3.09375,
"learning_rate": 4.207273955012018e-06,
"loss": 0.4855,
"num_tokens": 4363383.0,
"step": 177
},
{
"epoch": 3.017094017094017,
"grad_norm": 1.7890625,
"learning_rate": 4.197001863832355e-06,
"loss": 0.4493,
"num_tokens": 4388966.0,
"step": 178
},
{
"epoch": 3.034188034188034,
"grad_norm": 1.8046875,
"learning_rate": 4.186676364830187e-06,
"loss": 0.5116,
"num_tokens": 4414781.0,
"step": 179
},
{
"epoch": 3.051282051282051,
"grad_norm": 1.7890625,
"learning_rate": 4.176297782968277e-06,
"loss": 0.4707,
"num_tokens": 4440274.0,
"step": 180
},
{
"epoch": 3.051282051282051,
"eval_loss": 0.5535330772399902,
"eval_num_tokens": 4440274.0,
"eval_runtime": 3.8645,
"eval_samples_per_second": 341.31,
"eval_steps_per_second": 10.868,
"step": 180
},
{
"epoch": 3.0683760683760686,
"grad_norm": 1.8828125,
"learning_rate": 4.1658664448800105e-06,
"loss": 0.4768,
"num_tokens": 4463887.0,
"step": 181
},
{
"epoch": 3.0854700854700856,
"grad_norm": 1.7578125,
"learning_rate": 4.155382678859103e-06,
"loss": 0.4578,
"num_tokens": 4489299.0,
"step": 182
},
{
"epoch": 3.1025641025641026,
"grad_norm": 1.8828125,
"learning_rate": 4.144846814849282e-06,
"loss": 0.5149,
"num_tokens": 4514175.0,
"step": 183
},
{
"epoch": 3.1196581196581197,
"grad_norm": 1.9375,
"learning_rate": 4.134259184433891e-06,
"loss": 0.4797,
"num_tokens": 4538592.0,
"step": 184
},
{
"epoch": 3.1367521367521367,
"grad_norm": 1.859375,
"learning_rate": 4.123620120825459e-06,
"loss": 0.5062,
"num_tokens": 4563767.0,
"step": 185
},
{
"epoch": 3.1538461538461537,
"grad_norm": 1.921875,
"learning_rate": 4.11292995885522e-06,
"loss": 0.4816,
"num_tokens": 4588715.0,
"step": 186
},
{
"epoch": 3.1709401709401708,
"grad_norm": 1.8125,
"learning_rate": 4.102189034962561e-06,
"loss": 0.4779,
"num_tokens": 4613441.0,
"step": 187
},
{
"epoch": 3.1880341880341883,
"grad_norm": 1.9453125,
"learning_rate": 4.091397687184447e-06,
"loss": 0.4652,
"num_tokens": 4637436.0,
"step": 188
},
{
"epoch": 3.2051282051282053,
"grad_norm": 1.9453125,
"learning_rate": 4.080556255144775e-06,
"loss": 0.4572,
"num_tokens": 4661231.0,
"step": 189
},
{
"epoch": 3.2222222222222223,
"grad_norm": 1.8125,
"learning_rate": 4.069665080043687e-06,
"loss": 0.4529,
"num_tokens": 4686469.0,
"step": 190
},
{
"epoch": 3.2393162393162394,
"grad_norm": 1.796875,
"learning_rate": 4.058724504646834e-06,
"loss": 0.4585,
"num_tokens": 4711342.0,
"step": 191
},
{
"epoch": 3.2564102564102564,
"grad_norm": 1.8359375,
"learning_rate": 4.047734873274586e-06,
"loss": 0.4569,
"num_tokens": 4735893.0,
"step": 192
},
{
"epoch": 3.2735042735042734,
"grad_norm": 1.765625,
"learning_rate": 4.036696531791193e-06,
"loss": 0.4952,
"num_tokens": 4761547.0,
"step": 193
},
{
"epoch": 3.2905982905982905,
"grad_norm": 2.03125,
"learning_rate": 4.025609827593909e-06,
"loss": 0.463,
"num_tokens": 4785024.0,
"step": 194
},
{
"epoch": 3.3076923076923075,
"grad_norm": 1.75,
"learning_rate": 4.01447510960205e-06,
"loss": 0.4517,
"num_tokens": 4809836.0,
"step": 195
},
{
"epoch": 3.324786324786325,
"grad_norm": 1.8984375,
"learning_rate": 4.003292728246015e-06,
"loss": 0.4726,
"num_tokens": 4835379.0,
"step": 196
},
{
"epoch": 3.341880341880342,
"grad_norm": 1.9296875,
"learning_rate": 3.9920630354562595e-06,
"loss": 0.4653,
"num_tokens": 4860091.0,
"step": 197
},
{
"epoch": 3.358974358974359,
"grad_norm": 1.9140625,
"learning_rate": 3.9807863846522186e-06,
"loss": 0.4589,
"num_tokens": 4884680.0,
"step": 198
},
{
"epoch": 3.376068376068376,
"grad_norm": 1.8359375,
"learning_rate": 3.969463130731183e-06,
"loss": 0.4956,
"num_tokens": 4909308.0,
"step": 199
},
{
"epoch": 3.393162393162393,
"grad_norm": 1.90625,
"learning_rate": 3.958093630057132e-06,
"loss": 0.437,
"num_tokens": 4933587.0,
"step": 200
},
{
"epoch": 3.41025641025641,
"grad_norm": 1.828125,
"learning_rate": 3.946678240449515e-06,
"loss": 0.4809,
"num_tokens": 4958005.0,
"step": 201
},
{
"epoch": 3.427350427350427,
"grad_norm": 1.7109375,
"learning_rate": 3.935217321171992e-06,
"loss": 0.4505,
"num_tokens": 4983166.0,
"step": 202
},
{
"epoch": 3.4444444444444446,
"grad_norm": 2.03125,
"learning_rate": 3.92371123292113e-06,
"loss": 0.4751,
"num_tokens": 5008231.0,
"step": 203
},
{
"epoch": 3.4615384615384617,
"grad_norm": 1.8359375,
"learning_rate": 3.912160337815045e-06,
"loss": 0.4717,
"num_tokens": 5033072.0,
"step": 204
},
{
"epoch": 3.4786324786324787,
"grad_norm": 1.765625,
"learning_rate": 3.900564999382007e-06,
"loss": 0.473,
"num_tokens": 5058720.0,
"step": 205
},
{
"epoch": 3.4957264957264957,
"grad_norm": 1.84375,
"learning_rate": 3.888925582549006e-06,
"loss": 0.5132,
"num_tokens": 5084491.0,
"step": 206
},
{
"epoch": 3.5128205128205128,
"grad_norm": 1.9296875,
"learning_rate": 3.8772424536302565e-06,
"loss": 0.4912,
"num_tokens": 5108798.0,
"step": 207
},
{
"epoch": 3.52991452991453,
"grad_norm": 1.7890625,
"learning_rate": 3.865515980315677e-06,
"loss": 0.4467,
"num_tokens": 5133691.0,
"step": 208
},
{
"epoch": 3.547008547008547,
"grad_norm": 1.7734375,
"learning_rate": 3.853746531659315e-06,
"loss": 0.4636,
"num_tokens": 5158639.0,
"step": 209
},
{
"epoch": 3.564102564102564,
"grad_norm": 1.90625,
"learning_rate": 3.84193447806773e-06,
"loss": 0.4878,
"num_tokens": 5183688.0,
"step": 210
},
{
"epoch": 3.564102564102564,
"eval_loss": 0.5530414581298828,
"eval_num_tokens": 5183688.0,
"eval_runtime": 3.7837,
"eval_samples_per_second": 348.596,
"eval_steps_per_second": 11.1,
"step": 210
},
{
"epoch": 3.5811965811965814,
"grad_norm": 1.8828125,
"learning_rate": 3.830080191288342e-06,
"loss": 0.4893,
"num_tokens": 5208452.0,
"step": 211
},
{
"epoch": 3.5982905982905984,
"grad_norm": 1.875,
"learning_rate": 3.8181840443977254e-06,
"loss": 0.5098,
"num_tokens": 5233187.0,
"step": 212
},
{
"epoch": 3.6153846153846154,
"grad_norm": 1.75,
"learning_rate": 3.806246411789872e-06,
"loss": 0.4971,
"num_tokens": 5259781.0,
"step": 213
},
{
"epoch": 3.6324786324786325,
"grad_norm": 1.90625,
"learning_rate": 3.794267669164408e-06,
"loss": 0.5001,
"num_tokens": 5284965.0,
"step": 214
},
{
"epoch": 3.6495726495726495,
"grad_norm": 1.8984375,
"learning_rate": 3.782248193514766e-06,
"loss": 0.4903,
"num_tokens": 5308660.0,
"step": 215
},
{
"epoch": 3.6666666666666665,
"grad_norm": 1.8671875,
"learning_rate": 3.770188363116324e-06,
"loss": 0.4756,
"num_tokens": 5332679.0,
"step": 216
},
{
"epoch": 3.683760683760684,
"grad_norm": 2.0,
"learning_rate": 3.758088557514501e-06,
"loss": 0.4933,
"num_tokens": 5356650.0,
"step": 217
},
{
"epoch": 3.700854700854701,
"grad_norm": 1.8828125,
"learning_rate": 3.7459491575128076e-06,
"loss": 0.5105,
"num_tokens": 5381408.0,
"step": 218
},
{
"epoch": 3.717948717948718,
"grad_norm": 1.7890625,
"learning_rate": 3.7337705451608676e-06,
"loss": 0.4813,
"num_tokens": 5406861.0,
"step": 219
},
{
"epoch": 3.735042735042735,
"grad_norm": 1.8359375,
"learning_rate": 3.721553103742388e-06,
"loss": 0.5061,
"num_tokens": 5432583.0,
"step": 220
},
{
"epoch": 3.752136752136752,
"grad_norm": 1.8515625,
"learning_rate": 3.7092972177630998e-06,
"loss": 0.4987,
"num_tokens": 5457632.0,
"step": 221
},
{
"epoch": 3.769230769230769,
"grad_norm": 1.8359375,
"learning_rate": 3.6970032729386573e-06,
"loss": 0.4942,
"num_tokens": 5482843.0,
"step": 222
},
{
"epoch": 3.786324786324786,
"grad_norm": 1.6875,
"learning_rate": 3.684671656182497e-06,
"loss": 0.451,
"num_tokens": 5507998.0,
"step": 223
},
{
"epoch": 3.8034188034188032,
"grad_norm": 1.734375,
"learning_rate": 3.672302755593661e-06,
"loss": 0.4682,
"num_tokens": 5533208.0,
"step": 224
},
{
"epoch": 3.8205128205128203,
"grad_norm": 1.984375,
"learning_rate": 3.6598969604445854e-06,
"loss": 0.4823,
"num_tokens": 5557678.0,
"step": 225
},
{
"epoch": 3.8376068376068377,
"grad_norm": 1.875,
"learning_rate": 3.6474546611688446e-06,
"loss": 0.4448,
"num_tokens": 5582601.0,
"step": 226
},
{
"epoch": 3.8547008547008548,
"grad_norm": 1.921875,
"learning_rate": 3.634976249348867e-06,
"loss": 0.4716,
"num_tokens": 5608386.0,
"step": 227
},
{
"epoch": 3.871794871794872,
"grad_norm": 1.78125,
"learning_rate": 3.622462117703612e-06,
"loss": 0.4714,
"num_tokens": 5633161.0,
"step": 228
},
{
"epoch": 3.888888888888889,
"grad_norm": 1.84375,
"learning_rate": 3.6099126600762056e-06,
"loss": 0.5174,
"num_tokens": 5658917.0,
"step": 229
},
{
"epoch": 3.905982905982906,
"grad_norm": 1.7578125,
"learning_rate": 3.5973282714215514e-06,
"loss": 0.475,
"num_tokens": 5684214.0,
"step": 230
},
{
"epoch": 3.9230769230769234,
"grad_norm": 1.75,
"learning_rate": 3.5847093477938955e-06,
"loss": 0.4715,
"num_tokens": 5709024.0,
"step": 231
},
{
"epoch": 3.9401709401709404,
"grad_norm": 1.84375,
"learning_rate": 3.5720562863343668e-06,
"loss": 0.4783,
"num_tokens": 5734477.0,
"step": 232
},
{
"epoch": 3.9572649572649574,
"grad_norm": 1.8671875,
"learning_rate": 3.559369485258472e-06,
"loss": 0.482,
"num_tokens": 5758325.0,
"step": 233
},
{
"epoch": 3.9743589743589745,
"grad_norm": 1.859375,
"learning_rate": 3.5466493438435707e-06,
"loss": 0.4831,
"num_tokens": 5783893.0,
"step": 234
},
{
"epoch": 3.9914529914529915,
"grad_norm": 1.8515625,
"learning_rate": 3.533896262416302e-06,
"loss": 0.4902,
"num_tokens": 5809115.0,
"step": 235
},
{
"epoch": 4.0,
"grad_norm": 3.03125,
"learning_rate": 3.521110642339991e-06,
"loss": 0.4344,
"num_tokens": 5817844.0,
"step": 236
},
{
"epoch": 4.017094017094017,
"grad_norm": 1.7421875,
"learning_rate": 3.508292886002013e-06,
"loss": 0.4787,
"num_tokens": 5843069.0,
"step": 237
},
{
"epoch": 4.034188034188034,
"grad_norm": 1.96875,
"learning_rate": 3.495443396801134e-06,
"loss": 0.4796,
"num_tokens": 5866440.0,
"step": 238
},
{
"epoch": 4.051282051282051,
"grad_norm": 1.8359375,
"learning_rate": 3.4825625791348093e-06,
"loss": 0.4859,
"num_tokens": 5891282.0,
"step": 239
},
{
"epoch": 4.068376068376068,
"grad_norm": 1.8046875,
"learning_rate": 3.4696508383864636e-06,
"loss": 0.4602,
"num_tokens": 5915722.0,
"step": 240
},
{
"epoch": 4.068376068376068,
"eval_loss": 0.55167156457901,
"eval_num_tokens": 5915722.0,
"eval_runtime": 3.8113,
"eval_samples_per_second": 346.076,
"eval_steps_per_second": 11.02,
"step": 240
},
{
"epoch": 4.085470085470085,
"grad_norm": 1.796875,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.4978,
"num_tokens": 5940349.0,
"step": 241
},
{
"epoch": 4.102564102564102,
"grad_norm": 1.7578125,
"learning_rate": 3.4437362140306423e-06,
"loss": 0.4968,
"num_tokens": 5966103.0,
"step": 242
},
{
"epoch": 4.119658119658119,
"grad_norm": 1.8046875,
"learning_rate": 3.4307341460048633e-06,
"loss": 0.4989,
"num_tokens": 5991498.0,
"step": 243
},
{
"epoch": 4.136752136752137,
"grad_norm": 1.7890625,
"learning_rate": 3.417702786034786e-06,
"loss": 0.4921,
"num_tokens": 6017439.0,
"step": 244
},
{
"epoch": 4.153846153846154,
"grad_norm": 1.859375,
"learning_rate": 3.4046425442416807e-06,
"loss": 0.4648,
"num_tokens": 6042023.0,
"step": 245
},
{
"epoch": 4.170940170940171,
"grad_norm": 1.796875,
"learning_rate": 3.391553831655783e-06,
"loss": 0.5102,
"num_tokens": 6067004.0,
"step": 246
},
{
"epoch": 4.188034188034188,
"grad_norm": 1.953125,
"learning_rate": 3.3784370602033572e-06,
"loss": 0.4846,
"num_tokens": 6090221.0,
"step": 247
},
{
"epoch": 4.205128205128205,
"grad_norm": 1.9453125,
"learning_rate": 3.3652926426937327e-06,
"loss": 0.5134,
"num_tokens": 6115452.0,
"step": 248
},
{
"epoch": 4.222222222222222,
"grad_norm": 1.7421875,
"learning_rate": 3.3521209928063127e-06,
"loss": 0.4557,
"num_tokens": 6140900.0,
"step": 249
},
{
"epoch": 4.239316239316239,
"grad_norm": 1.8125,
"learning_rate": 3.3389225250775533e-06,
"loss": 0.4488,
"num_tokens": 6166140.0,
"step": 250
},
{
"epoch": 4.256410256410256,
"grad_norm": 1.921875,
"learning_rate": 3.3256976548879183e-06,
"loss": 0.4794,
"num_tokens": 6191157.0,
"step": 251
},
{
"epoch": 4.273504273504273,
"grad_norm": 1.8203125,
"learning_rate": 3.3124467984488067e-06,
"loss": 0.4485,
"num_tokens": 6216180.0,
"step": 252
},
{
"epoch": 4.2905982905982905,
"grad_norm": 1.7578125,
"learning_rate": 3.299170372789454e-06,
"loss": 0.4309,
"num_tokens": 6240305.0,
"step": 253
},
{
"epoch": 4.3076923076923075,
"grad_norm": 1.7265625,
"learning_rate": 3.2858687957438056e-06,
"loss": 0.464,
"num_tokens": 6266444.0,
"step": 254
},
{
"epoch": 4.3247863247863245,
"grad_norm": 1.8046875,
"learning_rate": 3.272542485937369e-06,
"loss": 0.4926,
"num_tokens": 6290807.0,
"step": 255
},
{
"epoch": 4.3418803418803416,
"grad_norm": 1.8828125,
"learning_rate": 3.259191862774037e-06,
"loss": 0.4686,
"num_tokens": 6315562.0,
"step": 256
},
{
"epoch": 4.358974358974359,
"grad_norm": 1.875,
"learning_rate": 3.2458173464228905e-06,
"loss": 0.494,
"num_tokens": 6340242.0,
"step": 257
},
{
"epoch": 4.3760683760683765,
"grad_norm": 1.90625,
"learning_rate": 3.2324193578049727e-06,
"loss": 0.4891,
"num_tokens": 6364338.0,
"step": 258
},
{
"epoch": 4.3931623931623935,
"grad_norm": 1.7890625,
"learning_rate": 3.218998318580043e-06,
"loss": 0.488,
"num_tokens": 6389547.0,
"step": 259
},
{
"epoch": 4.410256410256411,
"grad_norm": 1.765625,
"learning_rate": 3.205554651133308e-06,
"loss": 0.465,
"num_tokens": 6414507.0,
"step": 260
},
{
"epoch": 4.427350427350428,
"grad_norm": 1.8828125,
"learning_rate": 3.1920887785621233e-06,
"loss": 0.4636,
"num_tokens": 6439551.0,
"step": 261
},
{
"epoch": 4.444444444444445,
"grad_norm": 1.9140625,
"learning_rate": 3.1786011246626858e-06,
"loss": 0.4931,
"num_tokens": 6464477.0,
"step": 262
},
{
"epoch": 4.461538461538462,
"grad_norm": 1.921875,
"learning_rate": 3.165092113916688e-06,
"loss": 0.4377,
"num_tokens": 6488087.0,
"step": 263
},
{
"epoch": 4.478632478632479,
"grad_norm": 1.984375,
"learning_rate": 3.151562171477964e-06,
"loss": 0.5193,
"num_tokens": 6512609.0,
"step": 264
},
{
"epoch": 4.495726495726496,
"grad_norm": 1.8046875,
"learning_rate": 3.138011723159107e-06,
"loss": 0.4167,
"num_tokens": 6536550.0,
"step": 265
},
{
"epoch": 4.512820512820513,
"grad_norm": 1.796875,
"learning_rate": 3.1244411954180677e-06,
"loss": 0.4742,
"num_tokens": 6562337.0,
"step": 266
},
{
"epoch": 4.52991452991453,
"grad_norm": 1.796875,
"learning_rate": 3.1108510153447352e-06,
"loss": 0.4381,
"num_tokens": 6587473.0,
"step": 267
},
{
"epoch": 4.547008547008547,
"grad_norm": 1.9453125,
"learning_rate": 3.0972416106474946e-06,
"loss": 0.4589,
"num_tokens": 6611709.0,
"step": 268
},
{
"epoch": 4.564102564102564,
"grad_norm": 1.8203125,
"learning_rate": 3.0836134096397642e-06,
"loss": 0.4719,
"num_tokens": 6637580.0,
"step": 269
},
{
"epoch": 4.581196581196581,
"grad_norm": 1.9375,
"learning_rate": 3.0699668412265175e-06,
"loss": 0.4721,
"num_tokens": 6662574.0,
"step": 270
},
{
"epoch": 4.581196581196581,
"eval_loss": 0.5516491532325745,
"eval_num_tokens": 6662574.0,
"eval_runtime": 3.7149,
"eval_samples_per_second": 355.054,
"eval_steps_per_second": 11.306,
"step": 270
},
{
"epoch": 4.598290598290598,
"grad_norm": 1.8203125,
"learning_rate": 3.056302334890786e-06,
"loss": 0.4657,
"num_tokens": 6687298.0,
"step": 271
},
{
"epoch": 4.615384615384615,
"grad_norm": 1.8828125,
"learning_rate": 3.0426203206801407e-06,
"loss": 0.4967,
"num_tokens": 6712463.0,
"step": 272
},
{
"epoch": 4.632478632478632,
"grad_norm": 1.71875,
"learning_rate": 3.0289212291931576e-06,
"loss": 0.447,
"num_tokens": 6739418.0,
"step": 273
},
{
"epoch": 4.64957264957265,
"grad_norm": 1.7890625,
"learning_rate": 3.0152054915658664e-06,
"loss": 0.4805,
"num_tokens": 6765453.0,
"step": 274
},
{
"epoch": 4.666666666666667,
"grad_norm": 1.8203125,
"learning_rate": 3.0014735394581824e-06,
"loss": 0.4596,
"num_tokens": 6791101.0,
"step": 275
},
{
"epoch": 4.683760683760684,
"grad_norm": 1.875,
"learning_rate": 2.9877258050403214e-06,
"loss": 0.4765,
"num_tokens": 6815841.0,
"step": 276
},
{
"epoch": 4.700854700854701,
"grad_norm": 1.9453125,
"learning_rate": 2.9739627209791965e-06,
"loss": 0.4836,
"num_tokens": 6840193.0,
"step": 277
},
{
"epoch": 4.717948717948718,
"grad_norm": 1.96875,
"learning_rate": 2.9601847204248045e-06,
"loss": 0.4644,
"num_tokens": 6864869.0,
"step": 278
},
{
"epoch": 4.735042735042735,
"grad_norm": 1.9140625,
"learning_rate": 2.946392236996592e-06,
"loss": 0.4973,
"num_tokens": 6889912.0,
"step": 279
},
{
"epoch": 4.752136752136752,
"grad_norm": 1.875,
"learning_rate": 2.932585704769807e-06,
"loss": 0.464,
"num_tokens": 6913871.0,
"step": 280
},
{
"epoch": 4.769230769230769,
"grad_norm": 1.90625,
"learning_rate": 2.9187655582618413e-06,
"loss": 0.4396,
"num_tokens": 6938132.0,
"step": 281
},
{
"epoch": 4.786324786324786,
"grad_norm": 1.7890625,
"learning_rate": 2.9049322324185524e-06,
"loss": 0.4538,
"num_tokens": 6963411.0,
"step": 282
},
{
"epoch": 4.803418803418803,
"grad_norm": 1.7578125,
"learning_rate": 2.8910861626005774e-06,
"loss": 0.4609,
"num_tokens": 6988757.0,
"step": 283
},
{
"epoch": 4.82051282051282,
"grad_norm": 1.8046875,
"learning_rate": 2.877227784569629e-06,
"loss": 0.4293,
"num_tokens": 7013231.0,
"step": 284
},
{
"epoch": 4.837606837606837,
"grad_norm": 1.90625,
"learning_rate": 2.863357534474782e-06,
"loss": 0.4561,
"num_tokens": 7038366.0,
"step": 285
},
{
"epoch": 4.854700854700854,
"grad_norm": 1.7265625,
"learning_rate": 2.849475848838749e-06,
"loss": 0.4761,
"num_tokens": 7063893.0,
"step": 286
},
{
"epoch": 4.871794871794872,
"grad_norm": 1.8203125,
"learning_rate": 2.835583164544139e-06,
"loss": 0.4588,
"num_tokens": 7089629.0,
"step": 287
},
{
"epoch": 4.888888888888889,
"grad_norm": 1.8828125,
"learning_rate": 2.8216799188197096e-06,
"loss": 0.4962,
"num_tokens": 7114684.0,
"step": 288
},
{
"epoch": 4.905982905982906,
"grad_norm": 2.015625,
"learning_rate": 2.8077665492266077e-06,
"loss": 0.4569,
"num_tokens": 7138648.0,
"step": 289
},
{
"epoch": 4.923076923076923,
"grad_norm": 1.8046875,
"learning_rate": 2.7938434936445946e-06,
"loss": 0.4669,
"num_tokens": 7163715.0,
"step": 290
},
{
"epoch": 4.94017094017094,
"grad_norm": 1.9375,
"learning_rate": 2.7799111902582697e-06,
"loss": 0.4888,
"num_tokens": 7188583.0,
"step": 291
},
{
"epoch": 4.957264957264957,
"grad_norm": 1.84375,
"learning_rate": 2.7659700775432785e-06,
"loss": 0.4718,
"num_tokens": 7213949.0,
"step": 292
},
{
"epoch": 4.9743589743589745,
"grad_norm": 1.84375,
"learning_rate": 2.752020594252511e-06,
"loss": 0.4759,
"num_tokens": 7238071.0,
"step": 293
},
{
"epoch": 4.9914529914529915,
"grad_norm": 1.7890625,
"learning_rate": 2.738063179402297e-06,
"loss": 0.4706,
"num_tokens": 7263181.0,
"step": 294
},
{
"epoch": 5.0,
"grad_norm": 3.09375,
"learning_rate": 2.724098272258584e-06,
"loss": 0.4948,
"num_tokens": 7272305.0,
"step": 295
},
{
"epoch": 5.017094017094017,
"grad_norm": 1.78125,
"learning_rate": 2.710126312323119e-06,
"loss": 0.4636,
"num_tokens": 7296700.0,
"step": 296
},
{
"epoch": 5.034188034188034,
"grad_norm": 1.7578125,
"learning_rate": 2.696147739319613e-06,
"loss": 0.4625,
"num_tokens": 7322323.0,
"step": 297
},
{
"epoch": 5.051282051282051,
"grad_norm": 1.828125,
"learning_rate": 2.6821629931799007e-06,
"loss": 0.4597,
"num_tokens": 7346129.0,
"step": 298
},
{
"epoch": 5.068376068376068,
"grad_norm": 1.8984375,
"learning_rate": 2.6681725140300995e-06,
"loss": 0.4551,
"num_tokens": 7370889.0,
"step": 299
},
{
"epoch": 5.085470085470085,
"grad_norm": 1.8984375,
"learning_rate": 2.654176742176754e-06,
"loss": 0.4572,
"num_tokens": 7395404.0,
"step": 300
},
{
"epoch": 5.085470085470085,
"eval_loss": 0.5508914589881897,
"eval_num_tokens": 7395404.0,
"eval_runtime": 3.6111,
"eval_samples_per_second": 365.259,
"eval_steps_per_second": 11.631,
"step": 300
},
{
"epoch": 5.102564102564102,
"grad_norm": 1.734375,
"learning_rate": 2.6401761180929798e-06,
"loss": 0.4685,
"num_tokens": 7421156.0,
"step": 301
},
{
"epoch": 5.119658119658119,
"grad_norm": 1.90625,
"learning_rate": 2.626171082404602e-06,
"loss": 0.456,
"num_tokens": 7445126.0,
"step": 302
},
{
"epoch": 5.136752136752137,
"grad_norm": 1.8203125,
"learning_rate": 2.6121620758762877e-06,
"loss": 0.466,
"num_tokens": 7470250.0,
"step": 303
},
{
"epoch": 5.153846153846154,
"grad_norm": 1.84375,
"learning_rate": 2.5981495393976718e-06,
"loss": 0.493,
"num_tokens": 7495266.0,
"step": 304
},
{
"epoch": 5.170940170940171,
"grad_norm": 1.75,
"learning_rate": 2.5841339139694856e-06,
"loss": 0.4603,
"num_tokens": 7521185.0,
"step": 305
},
{
"epoch": 5.188034188034188,
"grad_norm": 1.765625,
"learning_rate": 2.5701156406896726e-06,
"loss": 0.4628,
"num_tokens": 7546972.0,
"step": 306
},
{
"epoch": 5.205128205128205,
"grad_norm": 1.875,
"learning_rate": 2.556095160739513e-06,
"loss": 0.4763,
"num_tokens": 7572102.0,
"step": 307
},
{
"epoch": 5.222222222222222,
"grad_norm": 1.9765625,
"learning_rate": 2.542072915369731e-06,
"loss": 0.483,
"num_tokens": 7595823.0,
"step": 308
},
{
"epoch": 5.239316239316239,
"grad_norm": 1.8046875,
"learning_rate": 2.528049345886615e-06,
"loss": 0.4536,
"num_tokens": 7621019.0,
"step": 309
},
{
"epoch": 5.256410256410256,
"grad_norm": 1.8125,
"learning_rate": 2.5140248936381245e-06,
"loss": 0.4774,
"num_tokens": 7645327.0,
"step": 310
},
{
"epoch": 5.273504273504273,
"grad_norm": 1.8359375,
"learning_rate": 2.5e-06,
"loss": 0.4717,
"num_tokens": 7670908.0,
"step": 311
},
{
"epoch": 5.2905982905982905,
"grad_norm": 1.90625,
"learning_rate": 2.4859751063618763e-06,
"loss": 0.4603,
"num_tokens": 7696041.0,
"step": 312
},
{
"epoch": 5.3076923076923075,
"grad_norm": 1.7890625,
"learning_rate": 2.4719506541133857e-06,
"loss": 0.4505,
"num_tokens": 7721184.0,
"step": 313
},
{
"epoch": 5.3247863247863245,
"grad_norm": 1.9921875,
"learning_rate": 2.45792708463027e-06,
"loss": 0.5045,
"num_tokens": 7746578.0,
"step": 314
},
{
"epoch": 5.3418803418803416,
"grad_norm": 1.8515625,
"learning_rate": 2.443904839260488e-06,
"loss": 0.483,
"num_tokens": 7772741.0,
"step": 315
},
{
"epoch": 5.358974358974359,
"grad_norm": 1.953125,
"learning_rate": 2.429884359310328e-06,
"loss": 0.4507,
"num_tokens": 7796809.0,
"step": 316
},
{
"epoch": 5.3760683760683765,
"grad_norm": 1.8515625,
"learning_rate": 2.415866086030516e-06,
"loss": 0.5084,
"num_tokens": 7823222.0,
"step": 317
},
{
"epoch": 5.3931623931623935,
"grad_norm": 1.9140625,
"learning_rate": 2.4018504606023295e-06,
"loss": 0.4656,
"num_tokens": 7847421.0,
"step": 318
},
{
"epoch": 5.410256410256411,
"grad_norm": 1.796875,
"learning_rate": 2.3878379241237136e-06,
"loss": 0.4679,
"num_tokens": 7872182.0,
"step": 319
},
{
"epoch": 5.427350427350428,
"grad_norm": 1.8046875,
"learning_rate": 2.373828917595398e-06,
"loss": 0.4579,
"num_tokens": 7897776.0,
"step": 320
},
{
"epoch": 5.444444444444445,
"grad_norm": 1.8046875,
"learning_rate": 2.3598238819070206e-06,
"loss": 0.4579,
"num_tokens": 7922850.0,
"step": 321
},
{
"epoch": 5.461538461538462,
"grad_norm": 1.8984375,
"learning_rate": 2.345823257823246e-06,
"loss": 0.4629,
"num_tokens": 7947751.0,
"step": 322
},
{
"epoch": 5.478632478632479,
"grad_norm": 1.8125,
"learning_rate": 2.331827485969901e-06,
"loss": 0.4474,
"num_tokens": 7971691.0,
"step": 323
},
{
"epoch": 5.495726495726496,
"grad_norm": 1.90625,
"learning_rate": 2.3178370068201e-06,
"loss": 0.4859,
"num_tokens": 7996117.0,
"step": 324
},
{
"epoch": 5.512820512820513,
"grad_norm": 1.84375,
"learning_rate": 2.3038522606803882e-06,
"loss": 0.504,
"num_tokens": 8022056.0,
"step": 325
},
{
"epoch": 5.52991452991453,
"grad_norm": 1.7734375,
"learning_rate": 2.2898736876768816e-06,
"loss": 0.4544,
"num_tokens": 8048659.0,
"step": 326
},
{
"epoch": 5.547008547008547,
"grad_norm": 1.8046875,
"learning_rate": 2.2759017277414165e-06,
"loss": 0.4736,
"num_tokens": 8072668.0,
"step": 327
},
{
"epoch": 5.564102564102564,
"grad_norm": 1.84375,
"learning_rate": 2.2619368205977038e-06,
"loss": 0.4115,
"num_tokens": 8096796.0,
"step": 328
},
{
"epoch": 5.581196581196581,
"grad_norm": 1.7890625,
"learning_rate": 2.2479794057474893e-06,
"loss": 0.4616,
"num_tokens": 8121830.0,
"step": 329
},
{
"epoch": 5.598290598290598,
"grad_norm": 1.9609375,
"learning_rate": 2.234029922456722e-06,
"loss": 0.4777,
"num_tokens": 8145954.0,
"step": 330
},
{
"epoch": 5.598290598290598,
"eval_loss": 0.5508424043655396,
"eval_num_tokens": 8145954.0,
"eval_runtime": 3.7263,
"eval_samples_per_second": 353.969,
"eval_steps_per_second": 11.271,
"step": 330
},
{
"epoch": 5.615384615384615,
"grad_norm": 1.875,
"learning_rate": 2.2200888097417308e-06,
"loss": 0.4598,
"num_tokens": 8169934.0,
"step": 331
},
{
"epoch": 5.632478632478632,
"grad_norm": 1.84375,
"learning_rate": 2.2061565063554063e-06,
"loss": 0.4541,
"num_tokens": 8194695.0,
"step": 332
},
{
"epoch": 5.64957264957265,
"grad_norm": 1.7890625,
"learning_rate": 2.192233450773393e-06,
"loss": 0.5074,
"num_tokens": 8220946.0,
"step": 333
},
{
"epoch": 5.666666666666667,
"grad_norm": 1.8828125,
"learning_rate": 2.178320081180291e-06,
"loss": 0.4731,
"num_tokens": 8245010.0,
"step": 334
},
{
"epoch": 5.683760683760684,
"grad_norm": 1.8359375,
"learning_rate": 2.1644168354558623e-06,
"loss": 0.48,
"num_tokens": 8269142.0,
"step": 335
},
{
"epoch": 5.700854700854701,
"grad_norm": 1.8671875,
"learning_rate": 2.1505241511612522e-06,
"loss": 0.4469,
"num_tokens": 8293617.0,
"step": 336
},
{
"epoch": 5.717948717948718,
"grad_norm": 1.9609375,
"learning_rate": 2.136642465525219e-06,
"loss": 0.4769,
"num_tokens": 8317882.0,
"step": 337
},
{
"epoch": 5.735042735042735,
"grad_norm": 1.796875,
"learning_rate": 2.1227722154303714e-06,
"loss": 0.4728,
"num_tokens": 8342847.0,
"step": 338
},
{
"epoch": 5.752136752136752,
"grad_norm": 1.796875,
"learning_rate": 2.1089138373994226e-06,
"loss": 0.4697,
"num_tokens": 8367548.0,
"step": 339
},
{
"epoch": 5.769230769230769,
"grad_norm": 1.875,
"learning_rate": 2.095067767581447e-06,
"loss": 0.4634,
"num_tokens": 8392538.0,
"step": 340
},
{
"epoch": 5.786324786324786,
"grad_norm": 1.8203125,
"learning_rate": 2.0812344417381595e-06,
"loss": 0.4732,
"num_tokens": 8417232.0,
"step": 341
},
{
"epoch": 5.803418803418803,
"grad_norm": 1.796875,
"learning_rate": 2.0674142952301932e-06,
"loss": 0.4754,
"num_tokens": 8442838.0,
"step": 342
},
{
"epoch": 5.82051282051282,
"grad_norm": 1.6875,
"learning_rate": 2.053607763003409e-06,
"loss": 0.4584,
"num_tokens": 8468492.0,
"step": 343
},
{
"epoch": 5.837606837606837,
"grad_norm": 1.8125,
"learning_rate": 2.039815279575196e-06,
"loss": 0.4924,
"num_tokens": 8494007.0,
"step": 344
},
{
"epoch": 5.854700854700854,
"grad_norm": 1.875,
"learning_rate": 2.026037279020804e-06,
"loss": 0.468,
"num_tokens": 8519316.0,
"step": 345
},
{
"epoch": 5.871794871794872,
"grad_norm": 1.9375,
"learning_rate": 2.01227419495968e-06,
"loss": 0.4625,
"num_tokens": 8543710.0,
"step": 346
},
{
"epoch": 5.888888888888889,
"grad_norm": 1.8828125,
"learning_rate": 1.9985264605418185e-06,
"loss": 0.465,
"num_tokens": 8568173.0,
"step": 347
},
{
"epoch": 5.905982905982906,
"grad_norm": 1.921875,
"learning_rate": 1.9847945084341345e-06,
"loss": 0.4927,
"num_tokens": 8593427.0,
"step": 348
},
{
"epoch": 5.923076923076923,
"grad_norm": 1.7890625,
"learning_rate": 1.9710787708068433e-06,
"loss": 0.4748,
"num_tokens": 8618045.0,
"step": 349
},
{
"epoch": 5.94017094017094,
"grad_norm": 1.8203125,
"learning_rate": 1.9573796793198597e-06,
"loss": 0.4403,
"num_tokens": 8643396.0,
"step": 350
},
{
"epoch": 5.957264957264957,
"grad_norm": 1.78125,
"learning_rate": 1.9436976651092143e-06,
"loss": 0.4478,
"num_tokens": 8667537.0,
"step": 351
},
{
"epoch": 5.9743589743589745,
"grad_norm": 1.8828125,
"learning_rate": 1.9300331587734838e-06,
"loss": 0.4519,
"num_tokens": 8691813.0,
"step": 352
},
{
"epoch": 5.9914529914529915,
"grad_norm": 1.9296875,
"learning_rate": 1.9163865903602374e-06,
"loss": 0.4919,
"num_tokens": 8717266.0,
"step": 353
},
{
"epoch": 6.0,
"grad_norm": 3.203125,
"learning_rate": 1.9027583893525067e-06,
"loss": 0.4946,
"num_tokens": 8726766.0,
"step": 354
},
{
"epoch": 6.017094017094017,
"grad_norm": 1.7265625,
"learning_rate": 1.8891489846552645e-06,
"loss": 0.4804,
"num_tokens": 8751647.0,
"step": 355
},
{
"epoch": 6.034188034188034,
"grad_norm": 1.8671875,
"learning_rate": 1.8755588045819325e-06,
"loss": 0.456,
"num_tokens": 8776752.0,
"step": 356
},
{
"epoch": 6.051282051282051,
"grad_norm": 1.953125,
"learning_rate": 1.8619882768408936e-06,
"loss": 0.4773,
"num_tokens": 8801564.0,
"step": 357
},
{
"epoch": 6.068376068376068,
"grad_norm": 1.8125,
"learning_rate": 1.8484378285220367e-06,
"loss": 0.4717,
"num_tokens": 8826820.0,
"step": 358
},
{
"epoch": 6.085470085470085,
"grad_norm": 1.796875,
"learning_rate": 1.8349078860833125e-06,
"loss": 0.4486,
"num_tokens": 8852240.0,
"step": 359
},
{
"epoch": 6.102564102564102,
"grad_norm": 1.875,
"learning_rate": 1.8213988753373147e-06,
"loss": 0.4355,
"num_tokens": 8876431.0,
"step": 360
},
{
"epoch": 6.102564102564102,
"eval_loss": 0.5509193539619446,
"eval_num_tokens": 8876431.0,
"eval_runtime": 3.8061,
"eval_samples_per_second": 346.55,
"eval_steps_per_second": 11.035,
"step": 360
},
{
"epoch": 6.119658119658119,
"grad_norm": 1.8515625,
"learning_rate": 1.8079112214378769e-06,
"loss": 0.4951,
"num_tokens": 8902069.0,
"step": 361
},
{
"epoch": 6.136752136752137,
"grad_norm": 1.8046875,
"learning_rate": 1.7944453488666929e-06,
"loss": 0.4818,
"num_tokens": 8927782.0,
"step": 362
},
{
"epoch": 6.153846153846154,
"grad_norm": 1.7734375,
"learning_rate": 1.781001681419957e-06,
"loss": 0.4467,
"num_tokens": 8953475.0,
"step": 363
},
{
"epoch": 6.170940170940171,
"grad_norm": 1.796875,
"learning_rate": 1.7675806421950278e-06,
"loss": 0.4827,
"num_tokens": 8978643.0,
"step": 364
},
{
"epoch": 6.188034188034188,
"grad_norm": 1.8125,
"learning_rate": 1.75418265357711e-06,
"loss": 0.4599,
"num_tokens": 9004133.0,
"step": 365
},
{
"epoch": 6.205128205128205,
"grad_norm": 1.890625,
"learning_rate": 1.7408081372259633e-06,
"loss": 0.4838,
"num_tokens": 9029609.0,
"step": 366
},
{
"epoch": 6.222222222222222,
"grad_norm": 1.9765625,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.469,
"num_tokens": 9053446.0,
"step": 367
},
{
"epoch": 6.239316239316239,
"grad_norm": 1.84375,
"learning_rate": 1.714131204256195e-06,
"loss": 0.4354,
"num_tokens": 9078936.0,
"step": 368
},
{
"epoch": 6.256410256410256,
"grad_norm": 1.859375,
"learning_rate": 1.7008296272105469e-06,
"loss": 0.4997,
"num_tokens": 9102790.0,
"step": 369
},
{
"epoch": 6.273504273504273,
"grad_norm": 2.015625,
"learning_rate": 1.6875532015511945e-06,
"loss": 0.4745,
"num_tokens": 9127911.0,
"step": 370
},
{
"epoch": 6.2905982905982905,
"grad_norm": 1.9140625,
"learning_rate": 1.6743023451120831e-06,
"loss": 0.4707,
"num_tokens": 9153311.0,
"step": 371
},
{
"epoch": 6.3076923076923075,
"grad_norm": 1.859375,
"learning_rate": 1.6610774749224484e-06,
"loss": 0.4863,
"num_tokens": 9177952.0,
"step": 372
},
{
"epoch": 6.3247863247863245,
"grad_norm": 1.8203125,
"learning_rate": 1.6478790071936875e-06,
"loss": 0.4565,
"num_tokens": 9203024.0,
"step": 373
},
{
"epoch": 6.3418803418803416,
"grad_norm": 1.90625,
"learning_rate": 1.634707357306267e-06,
"loss": 0.5029,
"num_tokens": 9227504.0,
"step": 374
},
{
"epoch": 6.358974358974359,
"grad_norm": 1.84375,
"learning_rate": 1.6215629397966432e-06,
"loss": 0.4841,
"num_tokens": 9252329.0,
"step": 375
},
{
"epoch": 6.3760683760683765,
"grad_norm": 1.828125,
"learning_rate": 1.6084461683442176e-06,
"loss": 0.485,
"num_tokens": 9278535.0,
"step": 376
},
{
"epoch": 6.3931623931623935,
"grad_norm": 1.9375,
"learning_rate": 1.5953574557583202e-06,
"loss": 0.4622,
"num_tokens": 9301964.0,
"step": 377
},
{
"epoch": 6.410256410256411,
"grad_norm": 1.8359375,
"learning_rate": 1.5822972139652148e-06,
"loss": 0.4401,
"num_tokens": 9326744.0,
"step": 378
},
{
"epoch": 6.427350427350428,
"grad_norm": 1.8984375,
"learning_rate": 1.5692658539951371e-06,
"loss": 0.4447,
"num_tokens": 9351391.0,
"step": 379
},
{
"epoch": 6.444444444444445,
"grad_norm": 1.890625,
"learning_rate": 1.5562637859693585e-06,
"loss": 0.4805,
"num_tokens": 9375727.0,
"step": 380
},
{
"epoch": 6.461538461538462,
"grad_norm": 1.9140625,
"learning_rate": 1.5432914190872757e-06,
"loss": 0.4464,
"num_tokens": 9399774.0,
"step": 381
},
{
"epoch": 6.478632478632479,
"grad_norm": 1.953125,
"learning_rate": 1.5303491616135374e-06,
"loss": 0.4372,
"num_tokens": 9424390.0,
"step": 382
},
{
"epoch": 6.495726495726496,
"grad_norm": 1.828125,
"learning_rate": 1.5174374208651913e-06,
"loss": 0.4291,
"num_tokens": 9448880.0,
"step": 383
},
{
"epoch": 6.512820512820513,
"grad_norm": 1.8359375,
"learning_rate": 1.5045566031988672e-06,
"loss": 0.4669,
"num_tokens": 9473258.0,
"step": 384
},
{
"epoch": 6.52991452991453,
"grad_norm": 1.828125,
"learning_rate": 1.4917071139979877e-06,
"loss": 0.5007,
"num_tokens": 9497996.0,
"step": 385
},
{
"epoch": 6.547008547008547,
"grad_norm": 1.859375,
"learning_rate": 1.47888935766001e-06,
"loss": 0.4926,
"num_tokens": 9523600.0,
"step": 386
},
{
"epoch": 6.564102564102564,
"grad_norm": 1.7578125,
"learning_rate": 1.466103737583699e-06,
"loss": 0.4558,
"num_tokens": 9548721.0,
"step": 387
},
{
"epoch": 6.581196581196581,
"grad_norm": 1.7734375,
"learning_rate": 1.4533506561564305e-06,
"loss": 0.4606,
"num_tokens": 9573873.0,
"step": 388
},
{
"epoch": 6.598290598290598,
"grad_norm": 1.7890625,
"learning_rate": 1.4406305147415284e-06,
"loss": 0.4623,
"num_tokens": 9598689.0,
"step": 389
},
{
"epoch": 6.615384615384615,
"grad_norm": 1.8515625,
"learning_rate": 1.4279437136656338e-06,
"loss": 0.4886,
"num_tokens": 9624643.0,
"step": 390
},
{
"epoch": 6.615384615384615,
"eval_loss": 0.5507991313934326,
"eval_num_tokens": 9624643.0,
"eval_runtime": 3.8292,
"eval_samples_per_second": 344.459,
"eval_steps_per_second": 10.968,
"step": 390
},
{
"epoch": 6.632478632478632,
"grad_norm": 1.8203125,
"learning_rate": 1.415290652206105e-06,
"loss": 0.4754,
"num_tokens": 9650106.0,
"step": 391
},
{
"epoch": 6.64957264957265,
"grad_norm": 1.890625,
"learning_rate": 1.402671728578449e-06,
"loss": 0.4861,
"num_tokens": 9674783.0,
"step": 392
},
{
"epoch": 6.666666666666667,
"grad_norm": 1.8125,
"learning_rate": 1.3900873399237953e-06,
"loss": 0.4472,
"num_tokens": 9699648.0,
"step": 393
},
{
"epoch": 6.683760683760684,
"grad_norm": 1.921875,
"learning_rate": 1.3775378822963884e-06,
"loss": 0.4676,
"num_tokens": 9723820.0,
"step": 394
},
{
"epoch": 6.700854700854701,
"grad_norm": 1.921875,
"learning_rate": 1.3650237506511333e-06,
"loss": 0.4269,
"num_tokens": 9747820.0,
"step": 395
},
{
"epoch": 6.717948717948718,
"grad_norm": 1.875,
"learning_rate": 1.3525453388311554e-06,
"loss": 0.4732,
"num_tokens": 9773075.0,
"step": 396
},
{
"epoch": 6.735042735042735,
"grad_norm": 1.8671875,
"learning_rate": 1.3401030395554152e-06,
"loss": 0.4903,
"num_tokens": 9797797.0,
"step": 397
},
{
"epoch": 6.752136752136752,
"grad_norm": 1.8359375,
"learning_rate": 1.3276972444063386e-06,
"loss": 0.4503,
"num_tokens": 9822635.0,
"step": 398
},
{
"epoch": 6.769230769230769,
"grad_norm": 1.8828125,
"learning_rate": 1.3153283438175036e-06,
"loss": 0.4821,
"num_tokens": 9847708.0,
"step": 399
},
{
"epoch": 6.786324786324786,
"grad_norm": 1.8203125,
"learning_rate": 1.3029967270613435e-06,
"loss": 0.4617,
"num_tokens": 9871866.0,
"step": 400
},
{
"epoch": 6.803418803418803,
"grad_norm": 1.8671875,
"learning_rate": 1.2907027822369006e-06,
"loss": 0.4251,
"num_tokens": 9895958.0,
"step": 401
},
{
"epoch": 6.82051282051282,
"grad_norm": 1.78125,
"learning_rate": 1.2784468962576136e-06,
"loss": 0.4433,
"num_tokens": 9920177.0,
"step": 402
},
{
"epoch": 6.837606837606837,
"grad_norm": 1.734375,
"learning_rate": 1.266229454839133e-06,
"loss": 0.4532,
"num_tokens": 9945727.0,
"step": 403
},
{
"epoch": 6.854700854700854,
"grad_norm": 1.828125,
"learning_rate": 1.2540508424871934e-06,
"loss": 0.4492,
"num_tokens": 9970657.0,
"step": 404
},
{
"epoch": 6.871794871794872,
"grad_norm": 1.9296875,
"learning_rate": 1.2419114424855e-06,
"loss": 0.492,
"num_tokens": 9995805.0,
"step": 405
},
{
"epoch": 6.888888888888889,
"grad_norm": 1.828125,
"learning_rate": 1.2298116368836772e-06,
"loss": 0.4824,
"num_tokens": 10020922.0,
"step": 406
},
{
"epoch": 6.905982905982906,
"grad_norm": 1.8046875,
"learning_rate": 1.217751806485235e-06,
"loss": 0.4254,
"num_tokens": 10046307.0,
"step": 407
},
{
"epoch": 6.923076923076923,
"grad_norm": 1.875,
"learning_rate": 1.2057323308355922e-06,
"loss": 0.4682,
"num_tokens": 10071470.0,
"step": 408
},
{
"epoch": 6.94017094017094,
"grad_norm": 1.828125,
"learning_rate": 1.193753588210128e-06,
"loss": 0.4738,
"num_tokens": 10096698.0,
"step": 409
},
{
"epoch": 6.957264957264957,
"grad_norm": 1.8515625,
"learning_rate": 1.1818159556022748e-06,
"loss": 0.4758,
"num_tokens": 10122020.0,
"step": 410
},
{
"epoch": 6.9743589743589745,
"grad_norm": 1.7890625,
"learning_rate": 1.169919808711659e-06,
"loss": 0.4722,
"num_tokens": 10146447.0,
"step": 411
},
{
"epoch": 6.9914529914529915,
"grad_norm": 1.90625,
"learning_rate": 1.15806552193227e-06,
"loss": 0.4545,
"num_tokens": 10171539.0,
"step": 412
},
{
"epoch": 7.0,
"grad_norm": 3.140625,
"learning_rate": 1.1462534683406859e-06,
"loss": 0.5255,
"num_tokens": 10181227.0,
"step": 413
},
{
"epoch": 7.017094017094017,
"grad_norm": 1.8359375,
"learning_rate": 1.1344840196843228e-06,
"loss": 0.4529,
"num_tokens": 10206262.0,
"step": 414
},
{
"epoch": 7.034188034188034,
"grad_norm": 1.84375,
"learning_rate": 1.122757546369744e-06,
"loss": 0.4641,
"num_tokens": 10230139.0,
"step": 415
},
{
"epoch": 7.051282051282051,
"grad_norm": 1.9140625,
"learning_rate": 1.1110744174509952e-06,
"loss": 0.4764,
"num_tokens": 10255012.0,
"step": 416
},
{
"epoch": 7.068376068376068,
"grad_norm": 1.8203125,
"learning_rate": 1.0994350006179933e-06,
"loss": 0.4632,
"num_tokens": 10280931.0,
"step": 417
},
{
"epoch": 7.085470085470085,
"grad_norm": 1.890625,
"learning_rate": 1.0878396621849565e-06,
"loss": 0.4694,
"num_tokens": 10306239.0,
"step": 418
},
{
"epoch": 7.102564102564102,
"grad_norm": 1.7890625,
"learning_rate": 1.0762887670788702e-06,
"loss": 0.4779,
"num_tokens": 10331372.0,
"step": 419
},
{
"epoch": 7.119658119658119,
"grad_norm": 1.875,
"learning_rate": 1.0647826788280084e-06,
"loss": 0.4851,
"num_tokens": 10356892.0,
"step": 420
},
{
"epoch": 7.119658119658119,
"eval_loss": 0.5506927967071533,
"eval_num_tokens": 10356892.0,
"eval_runtime": 3.8256,
"eval_samples_per_second": 344.787,
"eval_steps_per_second": 10.979,
"step": 420
},
{
"epoch": 7.136752136752137,
"grad_norm": 1.8515625,
"learning_rate": 1.0533217595504859e-06,
"loss": 0.4674,
"num_tokens": 10381394.0,
"step": 421
},
{
"epoch": 7.153846153846154,
"grad_norm": 1.890625,
"learning_rate": 1.041906369942869e-06,
"loss": 0.4644,
"num_tokens": 10405722.0,
"step": 422
},
{
"epoch": 7.170940170940171,
"grad_norm": 2.015625,
"learning_rate": 1.0305368692688175e-06,
"loss": 0.4791,
"num_tokens": 10430615.0,
"step": 423
},
{
"epoch": 7.188034188034188,
"grad_norm": 1.8359375,
"learning_rate": 1.0192136153477825e-06,
"loss": 0.4578,
"num_tokens": 10455551.0,
"step": 424
},
{
"epoch": 7.205128205128205,
"grad_norm": 1.859375,
"learning_rate": 1.0079369645437411e-06,
"loss": 0.4761,
"num_tokens": 10480682.0,
"step": 425
},
{
"epoch": 7.222222222222222,
"grad_norm": 1.828125,
"learning_rate": 9.967072717539852e-07,
"loss": 0.481,
"num_tokens": 10505454.0,
"step": 426
},
{
"epoch": 7.239316239316239,
"grad_norm": 1.9296875,
"learning_rate": 9.855248903979505e-07,
"loss": 0.4992,
"num_tokens": 10529967.0,
"step": 427
},
{
"epoch": 7.256410256410256,
"grad_norm": 1.9453125,
"learning_rate": 9.743901724060905e-07,
"loss": 0.5156,
"num_tokens": 10555112.0,
"step": 428
},
{
"epoch": 7.273504273504273,
"grad_norm": 1.9375,
"learning_rate": 9.633034682088072e-07,
"loss": 0.4761,
"num_tokens": 10580728.0,
"step": 429
},
{
"epoch": 7.2905982905982905,
"grad_norm": 1.890625,
"learning_rate": 9.522651267254149e-07,
"loss": 0.448,
"num_tokens": 10605792.0,
"step": 430
},
{
"epoch": 7.3076923076923075,
"grad_norm": 1.828125,
"learning_rate": 9.412754953531664e-07,
"loss": 0.463,
"num_tokens": 10630947.0,
"step": 431
},
{
"epoch": 7.3247863247863245,
"grad_norm": 1.7265625,
"learning_rate": 9.303349199563131e-07,
"loss": 0.4274,
"num_tokens": 10656582.0,
"step": 432
},
{
"epoch": 7.3418803418803416,
"grad_norm": 1.7265625,
"learning_rate": 9.19443744855226e-07,
"loss": 0.4583,
"num_tokens": 10682836.0,
"step": 433
},
{
"epoch": 7.358974358974359,
"grad_norm": 1.890625,
"learning_rate": 9.086023128155543e-07,
"loss": 0.5,
"num_tokens": 10707868.0,
"step": 434
},
{
"epoch": 7.3760683760683765,
"grad_norm": 1.7265625,
"learning_rate": 8.978109650374398e-07,
"loss": 0.4611,
"num_tokens": 10733254.0,
"step": 435
},
{
"epoch": 7.3931623931623935,
"grad_norm": 1.7890625,
"learning_rate": 8.870700411447817e-07,
"loss": 0.4758,
"num_tokens": 10757749.0,
"step": 436
},
{
"epoch": 7.410256410256411,
"grad_norm": 1.8828125,
"learning_rate": 8.763798791745413e-07,
"loss": 0.4334,
"num_tokens": 10782950.0,
"step": 437
},
{
"epoch": 7.427350427350428,
"grad_norm": 1.84375,
"learning_rate": 8.657408155661109e-07,
"loss": 0.4593,
"num_tokens": 10806903.0,
"step": 438
},
{
"epoch": 7.444444444444445,
"grad_norm": 1.734375,
"learning_rate": 8.551531851507186e-07,
"loss": 0.4515,
"num_tokens": 10831694.0,
"step": 439
},
{
"epoch": 7.461538461538462,
"grad_norm": 1.8671875,
"learning_rate": 8.446173211408972e-07,
"loss": 0.4773,
"num_tokens": 10856829.0,
"step": 440
},
{
"epoch": 7.478632478632479,
"grad_norm": 1.875,
"learning_rate": 8.341335551199903e-07,
"loss": 0.4766,
"num_tokens": 10880998.0,
"step": 441
},
{
"epoch": 7.495726495726496,
"grad_norm": 1.8828125,
"learning_rate": 8.237022170317235e-07,
"loss": 0.4733,
"num_tokens": 10906303.0,
"step": 442
},
{
"epoch": 7.512820512820513,
"grad_norm": 1.796875,
"learning_rate": 8.133236351698143e-07,
"loss": 0.4881,
"num_tokens": 10930727.0,
"step": 443
},
{
"epoch": 7.52991452991453,
"grad_norm": 1.84375,
"learning_rate": 8.029981361676456e-07,
"loss": 0.4623,
"num_tokens": 10955710.0,
"step": 444
},
{
"epoch": 7.547008547008547,
"grad_norm": 1.8046875,
"learning_rate": 7.927260449879828e-07,
"loss": 0.4491,
"num_tokens": 10980669.0,
"step": 445
},
{
"epoch": 7.564102564102564,
"grad_norm": 1.8984375,
"learning_rate": 7.825076849127458e-07,
"loss": 0.4894,
"num_tokens": 11004817.0,
"step": 446
},
{
"epoch": 7.581196581196581,
"grad_norm": 1.921875,
"learning_rate": 7.723433775328385e-07,
"loss": 0.4662,
"num_tokens": 11029057.0,
"step": 447
},
{
"epoch": 7.598290598290598,
"grad_norm": 1.8359375,
"learning_rate": 7.622334427380229e-07,
"loss": 0.4717,
"num_tokens": 11054362.0,
"step": 448
},
{
"epoch": 7.615384615384615,
"grad_norm": 1.7734375,
"learning_rate": 7.521781987068566e-07,
"loss": 0.4596,
"num_tokens": 11079670.0,
"step": 449
},
{
"epoch": 7.632478632478632,
"grad_norm": 1.9375,
"learning_rate": 7.421779618966737e-07,
"loss": 0.4315,
"num_tokens": 11104223.0,
"step": 450
},
{
"epoch": 7.632478632478632,
"eval_loss": 0.5506695508956909,
"eval_num_tokens": 11104223.0,
"eval_runtime": 3.646,
"eval_samples_per_second": 361.764,
"eval_steps_per_second": 11.519,
"step": 450
},
{
"epoch": 7.64957264957265,
"grad_norm": 1.8671875,
"learning_rate": 7.322330470336314e-07,
"loss": 0.4418,
"num_tokens": 11129525.0,
"step": 451
},
{
"epoch": 7.666666666666667,
"grad_norm": 1.8671875,
"learning_rate": 7.223437671027994e-07,
"loss": 0.4092,
"num_tokens": 11153830.0,
"step": 452
},
{
"epoch": 7.683760683760684,
"grad_norm": 1.78125,
"learning_rate": 7.125104333383117e-07,
"loss": 0.4523,
"num_tokens": 11178840.0,
"step": 453
},
{
"epoch": 7.700854700854701,
"grad_norm": 1.84375,
"learning_rate": 7.027333552135748e-07,
"loss": 0.4741,
"num_tokens": 11203539.0,
"step": 454
},
{
"epoch": 7.717948717948718,
"grad_norm": 1.703125,
"learning_rate": 6.930128404315214e-07,
"loss": 0.4736,
"num_tokens": 11229499.0,
"step": 455
},
{
"epoch": 7.735042735042735,
"grad_norm": 1.8671875,
"learning_rate": 6.833491949149329e-07,
"loss": 0.4764,
"num_tokens": 11254429.0,
"step": 456
},
{
"epoch": 7.752136752136752,
"grad_norm": 1.9375,
"learning_rate": 6.737427227968063e-07,
"loss": 0.4781,
"num_tokens": 11279074.0,
"step": 457
},
{
"epoch": 7.769230769230769,
"grad_norm": 1.828125,
"learning_rate": 6.641937264107868e-07,
"loss": 0.4477,
"num_tokens": 11303890.0,
"step": 458
},
{
"epoch": 7.786324786324786,
"grad_norm": 1.859375,
"learning_rate": 6.547025062816487e-07,
"loss": 0.4673,
"num_tokens": 11328933.0,
"step": 459
},
{
"epoch": 7.803418803418803,
"grad_norm": 1.875,
"learning_rate": 6.452693611158412e-07,
"loss": 0.5019,
"num_tokens": 11354261.0,
"step": 460
},
{
"epoch": 7.82051282051282,
"grad_norm": 1.8046875,
"learning_rate": 6.358945877920861e-07,
"loss": 0.4578,
"num_tokens": 11379618.0,
"step": 461
},
{
"epoch": 7.837606837606837,
"grad_norm": 1.7421875,
"learning_rate": 6.265784813520318e-07,
"loss": 0.4628,
"num_tokens": 11405767.0,
"step": 462
},
{
"epoch": 7.854700854700854,
"grad_norm": 1.890625,
"learning_rate": 6.17321334990973e-07,
"loss": 0.4494,
"num_tokens": 11430505.0,
"step": 463
},
{
"epoch": 7.871794871794872,
"grad_norm": 1.890625,
"learning_rate": 6.081234400486172e-07,
"loss": 0.4984,
"num_tokens": 11455251.0,
"step": 464
},
{
"epoch": 7.888888888888889,
"grad_norm": 1.9296875,
"learning_rate": 5.989850859999227e-07,
"loss": 0.4737,
"num_tokens": 11480173.0,
"step": 465
},
{
"epoch": 7.905982905982906,
"grad_norm": 1.84375,
"learning_rate": 5.899065604459814e-07,
"loss": 0.4607,
"num_tokens": 11504623.0,
"step": 466
},
{
"epoch": 7.923076923076923,
"grad_norm": 1.7734375,
"learning_rate": 5.808881491049723e-07,
"loss": 0.4354,
"num_tokens": 11529494.0,
"step": 467
},
{
"epoch": 7.94017094017094,
"grad_norm": 1.8203125,
"learning_rate": 5.719301358031665e-07,
"loss": 0.4639,
"num_tokens": 11553965.0,
"step": 468
},
{
"epoch": 7.957264957264957,
"grad_norm": 1.8984375,
"learning_rate": 5.630328024659979e-07,
"loss": 0.4911,
"num_tokens": 11578169.0,
"step": 469
},
{
"epoch": 7.9743589743589745,
"grad_norm": 1.8515625,
"learning_rate": 5.541964291091856e-07,
"loss": 0.4318,
"num_tokens": 11603175.0,
"step": 470
},
{
"epoch": 7.9914529914529915,
"grad_norm": 1.9609375,
"learning_rate": 5.454212938299256e-07,
"loss": 0.4223,
"num_tokens": 11626457.0,
"step": 471
},
{
"epoch": 8.0,
"grad_norm": 2.953125,
"learning_rate": 5.367076727981383e-07,
"loss": 0.4906,
"num_tokens": 11635688.0,
"step": 472
},
{
"epoch": 8.017094017094017,
"grad_norm": 1.8515625,
"learning_rate": 5.280558402477726e-07,
"loss": 0.4614,
"num_tokens": 11661177.0,
"step": 473
},
{
"epoch": 8.034188034188034,
"grad_norm": 1.875,
"learning_rate": 5.194660684681818e-07,
"loss": 0.4407,
"num_tokens": 11685654.0,
"step": 474
},
{
"epoch": 8.051282051282051,
"grad_norm": 1.78125,
"learning_rate": 5.109386277955477e-07,
"loss": 0.4605,
"num_tokens": 11710852.0,
"step": 475
},
{
"epoch": 8.068376068376068,
"grad_norm": 1.890625,
"learning_rate": 5.02473786604378e-07,
"loss": 0.4734,
"num_tokens": 11735336.0,
"step": 476
},
{
"epoch": 8.085470085470085,
"grad_norm": 1.8203125,
"learning_rate": 4.940718112990553e-07,
"loss": 0.4598,
"num_tokens": 11760061.0,
"step": 477
},
{
"epoch": 8.102564102564102,
"grad_norm": 1.84375,
"learning_rate": 4.857329663054569e-07,
"loss": 0.4642,
"num_tokens": 11784720.0,
"step": 478
},
{
"epoch": 8.11965811965812,
"grad_norm": 1.8125,
"learning_rate": 4.774575140626317e-07,
"loss": 0.4626,
"num_tokens": 11809420.0,
"step": 479
},
{
"epoch": 8.136752136752136,
"grad_norm": 1.890625,
"learning_rate": 4.6924571501453743e-07,
"loss": 0.4586,
"num_tokens": 11833862.0,
"step": 480
},
{
"epoch": 8.136752136752136,
"eval_loss": 0.5508862733840942,
"eval_num_tokens": 11833862.0,
"eval_runtime": 3.8867,
"eval_samples_per_second": 339.36,
"eval_steps_per_second": 10.806,
"step": 480
},
{
"epoch": 8.153846153846153,
"grad_norm": 1.828125,
"learning_rate": 4.610978276018496e-07,
"loss": 0.5026,
"num_tokens": 11858120.0,
"step": 481
},
{
"epoch": 8.17094017094017,
"grad_norm": 1.71875,
"learning_rate": 4.530141082538231e-07,
"loss": 0.426,
"num_tokens": 11883665.0,
"step": 482
},
{
"epoch": 8.188034188034187,
"grad_norm": 1.8671875,
"learning_rate": 4.4499481138022546e-07,
"loss": 0.4912,
"num_tokens": 11907624.0,
"step": 483
},
{
"epoch": 8.205128205128204,
"grad_norm": 1.875,
"learning_rate": 4.370401893633261e-07,
"loss": 0.4683,
"num_tokens": 11931156.0,
"step": 484
},
{
"epoch": 8.222222222222221,
"grad_norm": 1.8203125,
"learning_rate": 4.29150492549959e-07,
"loss": 0.4757,
"num_tokens": 11956116.0,
"step": 485
},
{
"epoch": 8.239316239316238,
"grad_norm": 1.8671875,
"learning_rate": 4.2132596924363666e-07,
"loss": 0.4757,
"num_tokens": 11981019.0,
"step": 486
},
{
"epoch": 8.256410256410255,
"grad_norm": 1.8125,
"learning_rate": 4.1356686569674344e-07,
"loss": 0.4684,
"num_tokens": 12006158.0,
"step": 487
},
{
"epoch": 8.273504273504274,
"grad_norm": 1.7890625,
"learning_rate": 4.058734261027789e-07,
"loss": 0.4658,
"num_tokens": 12031870.0,
"step": 488
},
{
"epoch": 8.290598290598291,
"grad_norm": 1.8515625,
"learning_rate": 3.982458925886748e-07,
"loss": 0.436,
"num_tokens": 12055810.0,
"step": 489
},
{
"epoch": 8.307692307692308,
"grad_norm": 1.859375,
"learning_rate": 3.9068450520717784e-07,
"loss": 0.4608,
"num_tokens": 12080118.0,
"step": 490
},
{
"epoch": 8.324786324786325,
"grad_norm": 1.78125,
"learning_rate": 3.831895019292897e-07,
"loss": 0.4328,
"num_tokens": 12104565.0,
"step": 491
},
{
"epoch": 8.341880341880342,
"grad_norm": 1.8359375,
"learning_rate": 3.757611186367824e-07,
"loss": 0.5016,
"num_tokens": 12129884.0,
"step": 492
},
{
"epoch": 8.35897435897436,
"grad_norm": 1.859375,
"learning_rate": 3.683995891147696e-07,
"loss": 0.4719,
"num_tokens": 12154751.0,
"step": 493
},
{
"epoch": 8.376068376068377,
"grad_norm": 1.9296875,
"learning_rate": 3.611051450443551e-07,
"loss": 0.473,
"num_tokens": 12179366.0,
"step": 494
},
{
"epoch": 8.393162393162394,
"grad_norm": 1.8046875,
"learning_rate": 3.538780159953348e-07,
"loss": 0.4526,
"num_tokens": 12204166.0,
"step": 495
},
{
"epoch": 8.41025641025641,
"grad_norm": 1.8359375,
"learning_rate": 3.4671842941897764e-07,
"loss": 0.4377,
"num_tokens": 12228036.0,
"step": 496
},
{
"epoch": 8.427350427350428,
"grad_norm": 1.75,
"learning_rate": 3.3962661064086356e-07,
"loss": 0.4701,
"num_tokens": 12252716.0,
"step": 497
},
{
"epoch": 8.444444444444445,
"grad_norm": 1.7578125,
"learning_rate": 3.3260278285379225e-07,
"loss": 0.448,
"num_tokens": 12278652.0,
"step": 498
},
{
"epoch": 8.461538461538462,
"grad_norm": 1.8359375,
"learning_rate": 3.256471671107617e-07,
"loss": 0.477,
"num_tokens": 12303497.0,
"step": 499
},
{
"epoch": 8.478632478632479,
"grad_norm": 1.890625,
"learning_rate": 3.187599823180071e-07,
"loss": 0.4697,
"num_tokens": 12328553.0,
"step": 500
},
{
"epoch": 8.495726495726496,
"grad_norm": 1.8203125,
"learning_rate": 3.119414452281158e-07,
"loss": 0.4502,
"num_tokens": 12354244.0,
"step": 501
},
{
"epoch": 8.512820512820513,
"grad_norm": 1.8515625,
"learning_rate": 3.051917704332016e-07,
"loss": 0.4643,
"num_tokens": 12379660.0,
"step": 502
},
{
"epoch": 8.52991452991453,
"grad_norm": 1.8984375,
"learning_rate": 2.98511170358155e-07,
"loss": 0.4925,
"num_tokens": 12404081.0,
"step": 503
},
{
"epoch": 8.547008547008547,
"grad_norm": 1.8203125,
"learning_rate": 2.918998552539545e-07,
"loss": 0.4617,
"num_tokens": 12428994.0,
"step": 504
},
{
"epoch": 8.564102564102564,
"grad_norm": 1.9296875,
"learning_rate": 2.8535803319105047e-07,
"loss": 0.5077,
"num_tokens": 12454375.0,
"step": 505
},
{
"epoch": 8.581196581196581,
"grad_norm": 1.71875,
"learning_rate": 2.788859100528196e-07,
"loss": 0.4401,
"num_tokens": 12479403.0,
"step": 506
},
{
"epoch": 8.598290598290598,
"grad_norm": 1.90625,
"learning_rate": 2.7248368952908055e-07,
"loss": 0.4569,
"num_tokens": 12503591.0,
"step": 507
},
{
"epoch": 8.615384615384615,
"grad_norm": 1.875,
"learning_rate": 2.6615157310968783e-07,
"loss": 0.4808,
"num_tokens": 12528170.0,
"step": 508
},
{
"epoch": 8.632478632478632,
"grad_norm": 1.859375,
"learning_rate": 2.598897600781872e-07,
"loss": 0.4257,
"num_tokens": 12552175.0,
"step": 509
},
{
"epoch": 8.649572649572649,
"grad_norm": 1.828125,
"learning_rate": 2.5369844750554704e-07,
"loss": 0.4638,
"num_tokens": 12576744.0,
"step": 510
},
{
"epoch": 8.649572649572649,
"eval_loss": 0.5507171154022217,
"eval_num_tokens": 12576744.0,
"eval_runtime": 3.7722,
"eval_samples_per_second": 349.662,
"eval_steps_per_second": 11.134,
"step": 510
},
{
"epoch": 8.666666666666666,
"grad_norm": 1.8828125,
"learning_rate": 2.4757783024395244e-07,
"loss": 0.4743,
"num_tokens": 12602284.0,
"step": 511
},
{
"epoch": 8.683760683760683,
"grad_norm": 1.7890625,
"learning_rate": 2.415281009206766e-07,
"loss": 0.3997,
"num_tokens": 12627067.0,
"step": 512
},
{
"epoch": 8.7008547008547,
"grad_norm": 1.8046875,
"learning_rate": 2.355494499320149e-07,
"loss": 0.484,
"num_tokens": 12652451.0,
"step": 513
},
{
"epoch": 8.717948717948717,
"grad_norm": 1.7421875,
"learning_rate": 2.2964206543729662e-07,
"loss": 0.4632,
"num_tokens": 12678633.0,
"step": 514
},
{
"epoch": 8.735042735042736,
"grad_norm": 1.7734375,
"learning_rate": 2.2380613335296037e-07,
"loss": 0.4607,
"num_tokens": 12703802.0,
"step": 515
},
{
"epoch": 8.752136752136753,
"grad_norm": 1.78125,
"learning_rate": 2.1804183734670277e-07,
"loss": 0.4545,
"num_tokens": 12728959.0,
"step": 516
},
{
"epoch": 8.76923076923077,
"grad_norm": 1.78125,
"learning_rate": 2.1234935883170048e-07,
"loss": 0.4693,
"num_tokens": 12754654.0,
"step": 517
},
{
"epoch": 8.786324786324787,
"grad_norm": 1.84375,
"learning_rate": 2.0672887696089826e-07,
"loss": 0.4835,
"num_tokens": 12779552.0,
"step": 518
},
{
"epoch": 8.803418803418804,
"grad_norm": 1.8046875,
"learning_rate": 2.0118056862137358e-07,
"loss": 0.4516,
"num_tokens": 12804786.0,
"step": 519
},
{
"epoch": 8.820512820512821,
"grad_norm": 1.828125,
"learning_rate": 1.9570460842876532e-07,
"loss": 0.4568,
"num_tokens": 12829963.0,
"step": 520
},
{
"epoch": 8.837606837606838,
"grad_norm": 1.7421875,
"learning_rate": 1.9030116872178317e-07,
"loss": 0.471,
"num_tokens": 12855101.0,
"step": 521
},
{
"epoch": 8.854700854700855,
"grad_norm": 1.875,
"learning_rate": 1.8497041955678057e-07,
"loss": 0.4531,
"num_tokens": 12880194.0,
"step": 522
},
{
"epoch": 8.871794871794872,
"grad_norm": 1.859375,
"learning_rate": 1.7971252870240292e-07,
"loss": 0.4766,
"num_tokens": 12905170.0,
"step": 523
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.8515625,
"learning_rate": 1.7452766163430973e-07,
"loss": 0.4839,
"num_tokens": 12931567.0,
"step": 524
},
{
"epoch": 8.905982905982906,
"grad_norm": 1.7890625,
"learning_rate": 1.6941598152996453e-07,
"loss": 0.4803,
"num_tokens": 12957115.0,
"step": 525
},
{
"epoch": 8.923076923076923,
"grad_norm": 1.8515625,
"learning_rate": 1.6437764926350074e-07,
"loss": 0.4782,
"num_tokens": 12982463.0,
"step": 526
},
{
"epoch": 8.94017094017094,
"grad_norm": 1.8984375,
"learning_rate": 1.59412823400657e-07,
"loss": 0.4778,
"num_tokens": 13006357.0,
"step": 527
},
{
"epoch": 8.957264957264957,
"grad_norm": 1.9921875,
"learning_rate": 1.5452166019378989e-07,
"loss": 0.5211,
"num_tokens": 13030925.0,
"step": 528
},
{
"epoch": 8.974358974358974,
"grad_norm": 1.8515625,
"learning_rate": 1.4970431357695241e-07,
"loss": 0.4759,
"num_tokens": 13055908.0,
"step": 529
},
{
"epoch": 8.991452991452991,
"grad_norm": 1.734375,
"learning_rate": 1.449609351610526e-07,
"loss": 0.4291,
"num_tokens": 13080922.0,
"step": 530
},
{
"epoch": 9.0,
"grad_norm": 3.359375,
"learning_rate": 1.4029167422908107e-07,
"loss": 0.5111,
"num_tokens": 13090149.0,
"step": 531
},
{
"epoch": 9.017094017094017,
"grad_norm": 1.765625,
"learning_rate": 1.3569667773141143e-07,
"loss": 0.4281,
"num_tokens": 13114358.0,
"step": 532
},
{
"epoch": 9.034188034188034,
"grad_norm": 1.8046875,
"learning_rate": 1.3117609028117816e-07,
"loss": 0.4658,
"num_tokens": 13139842.0,
"step": 533
},
{
"epoch": 9.051282051282051,
"grad_norm": 1.8671875,
"learning_rate": 1.2673005414972184e-07,
"loss": 0.453,
"num_tokens": 13164650.0,
"step": 534
},
{
"epoch": 9.068376068376068,
"grad_norm": 1.8359375,
"learning_rate": 1.223587092621162e-07,
"loss": 0.4579,
"num_tokens": 13189596.0,
"step": 535
},
{
"epoch": 9.085470085470085,
"grad_norm": 1.8203125,
"learning_rate": 1.1806219319275918e-07,
"loss": 0.4829,
"num_tokens": 13214616.0,
"step": 536
},
{
"epoch": 9.102564102564102,
"grad_norm": 1.75,
"learning_rate": 1.138406411610482e-07,
"loss": 0.4632,
"num_tokens": 13239530.0,
"step": 537
},
{
"epoch": 9.11965811965812,
"grad_norm": 1.7890625,
"learning_rate": 1.0969418602712001e-07,
"loss": 0.4663,
"num_tokens": 13264484.0,
"step": 538
},
{
"epoch": 9.136752136752136,
"grad_norm": 1.9140625,
"learning_rate": 1.0562295828767388e-07,
"loss": 0.4653,
"num_tokens": 13288508.0,
"step": 539
},
{
"epoch": 9.153846153846153,
"grad_norm": 1.953125,
"learning_rate": 1.0162708607186044e-07,
"loss": 0.4705,
"num_tokens": 13312812.0,
"step": 540
},
{
"epoch": 9.153846153846153,
"eval_loss": 0.550665557384491,
"eval_num_tokens": 13312812.0,
"eval_runtime": 3.7846,
"eval_samples_per_second": 348.522,
"eval_steps_per_second": 11.098,
"step": 540
},
{
"epoch": 9.17094017094017,
"grad_norm": 1.8125,
"learning_rate": 9.770669513725128e-08,
"loss": 0.4354,
"num_tokens": 13337364.0,
"step": 541
},
{
"epoch": 9.188034188034187,
"grad_norm": 1.9609375,
"learning_rate": 9.386190886588208e-08,
"loss": 0.4643,
"num_tokens": 13362934.0,
"step": 542
},
{
"epoch": 9.205128205128204,
"grad_norm": 1.8125,
"learning_rate": 9.00928482603669e-08,
"loss": 0.4821,
"num_tokens": 13387608.0,
"step": 543
},
{
"epoch": 9.222222222222221,
"grad_norm": 1.796875,
"learning_rate": 8.639963194009282e-08,
"loss": 0.4411,
"num_tokens": 13412296.0,
"step": 544
},
{
"epoch": 9.239316239316238,
"grad_norm": 1.7265625,
"learning_rate": 8.278237613748408e-08,
"loss": 0.4886,
"num_tokens": 13438087.0,
"step": 545
},
{
"epoch": 9.256410256410255,
"grad_norm": 1.9375,
"learning_rate": 7.924119469434666e-08,
"loss": 0.5127,
"num_tokens": 13462986.0,
"step": 546
},
{
"epoch": 9.273504273504274,
"grad_norm": 2.0,
"learning_rate": 7.577619905828281e-08,
"loss": 0.4918,
"num_tokens": 13486219.0,
"step": 547
},
{
"epoch": 9.290598290598291,
"grad_norm": 1.7734375,
"learning_rate": 7.238749827918639e-08,
"loss": 0.4315,
"num_tokens": 13511066.0,
"step": 548
},
{
"epoch": 9.307692307692308,
"grad_norm": 1.796875,
"learning_rate": 6.907519900580862e-08,
"loss": 0.4421,
"num_tokens": 13536763.0,
"step": 549
},
{
"epoch": 9.324786324786325,
"grad_norm": 1.796875,
"learning_rate": 6.583940548240186e-08,
"loss": 0.4938,
"num_tokens": 13562043.0,
"step": 550
},
{
"epoch": 9.341880341880342,
"grad_norm": 1.6796875,
"learning_rate": 6.268021954544095e-08,
"loss": 0.4603,
"num_tokens": 13587428.0,
"step": 551
},
{
"epoch": 9.35897435897436,
"grad_norm": 1.9296875,
"learning_rate": 5.95977406204154e-08,
"loss": 0.4731,
"num_tokens": 13611667.0,
"step": 552
},
{
"epoch": 9.376068376068377,
"grad_norm": 1.859375,
"learning_rate": 5.659206571870218e-08,
"loss": 0.4834,
"num_tokens": 13637877.0,
"step": 553
},
{
"epoch": 9.393162393162394,
"grad_norm": 1.921875,
"learning_rate": 5.366328943451154e-08,
"loss": 0.4741,
"num_tokens": 13662051.0,
"step": 554
},
{
"epoch": 9.41025641025641,
"grad_norm": 1.8671875,
"learning_rate": 5.0811503941911314e-08,
"loss": 0.4699,
"num_tokens": 13687351.0,
"step": 555
},
{
"epoch": 9.427350427350428,
"grad_norm": 1.828125,
"learning_rate": 4.8036798991923925e-08,
"loss": 0.4709,
"num_tokens": 13712712.0,
"step": 556
},
{
"epoch": 9.444444444444445,
"grad_norm": 1.84375,
"learning_rate": 4.5339261909704e-08,
"loss": 0.4521,
"num_tokens": 13737853.0,
"step": 557
},
{
"epoch": 9.461538461538462,
"grad_norm": 1.84375,
"learning_rate": 4.2718977591788836e-08,
"loss": 0.4645,
"num_tokens": 13762998.0,
"step": 558
},
{
"epoch": 9.478632478632479,
"grad_norm": 1.9453125,
"learning_rate": 4.017602850342584e-08,
"loss": 0.4701,
"num_tokens": 13787441.0,
"step": 559
},
{
"epoch": 9.495726495726496,
"grad_norm": 1.84375,
"learning_rate": 3.771049467597959e-08,
"loss": 0.5049,
"num_tokens": 13813278.0,
"step": 560
},
{
"epoch": 9.512820512820513,
"grad_norm": 1.8671875,
"learning_rate": 3.5322453704410286e-08,
"loss": 0.4402,
"num_tokens": 13837870.0,
"step": 561
},
{
"epoch": 9.52991452991453,
"grad_norm": 1.8828125,
"learning_rate": 3.3011980744833974e-08,
"loss": 0.4524,
"num_tokens": 13861602.0,
"step": 562
},
{
"epoch": 9.547008547008547,
"grad_norm": 1.8515625,
"learning_rate": 3.077914851215585e-08,
"loss": 0.4996,
"num_tokens": 13887368.0,
"step": 563
},
{
"epoch": 9.564102564102564,
"grad_norm": 1.84375,
"learning_rate": 2.8624027277781852e-08,
"loss": 0.4986,
"num_tokens": 13912654.0,
"step": 564
},
{
"epoch": 9.581196581196581,
"grad_norm": 1.859375,
"learning_rate": 2.6546684867408412e-08,
"loss": 0.4248,
"num_tokens": 13937672.0,
"step": 565
},
{
"epoch": 9.598290598290598,
"grad_norm": 1.8515625,
"learning_rate": 2.454718665888589e-08,
"loss": 0.468,
"num_tokens": 13963103.0,
"step": 566
},
{
"epoch": 9.615384615384615,
"grad_norm": 1.7421875,
"learning_rate": 2.262559558016325e-08,
"loss": 0.4627,
"num_tokens": 13987379.0,
"step": 567
},
{
"epoch": 9.632478632478632,
"grad_norm": 1.7890625,
"learning_rate": 2.078197210730465e-08,
"loss": 0.4802,
"num_tokens": 14011324.0,
"step": 568
},
{
"epoch": 9.649572649572649,
"grad_norm": 1.8046875,
"learning_rate": 1.9016374262589842e-08,
"loss": 0.4588,
"num_tokens": 14036876.0,
"step": 569
},
{
"epoch": 9.666666666666666,
"grad_norm": 1.8046875,
"learning_rate": 1.732885761268427e-08,
"loss": 0.4391,
"num_tokens": 14061453.0,
"step": 570
},
{
"epoch": 9.666666666666666,
"eval_loss": 0.550795316696167,
"eval_num_tokens": 14061453.0,
"eval_runtime": 3.8222,
"eval_samples_per_second": 345.091,
"eval_steps_per_second": 10.988,
"step": 570
},
{
"epoch": 9.683760683760683,
"grad_norm": 1.8203125,
"learning_rate": 1.571947526689349e-08,
"loss": 0.4896,
"num_tokens": 14086842.0,
"step": 571
},
{
"epoch": 9.7008547008547,
"grad_norm": 1.8203125,
"learning_rate": 1.418827787548982e-08,
"loss": 0.4609,
"num_tokens": 14112308.0,
"step": 572
},
{
"epoch": 9.717948717948717,
"grad_norm": 1.8671875,
"learning_rate": 1.273531362811914e-08,
"loss": 0.4825,
"num_tokens": 14137169.0,
"step": 573
},
{
"epoch": 9.735042735042736,
"grad_norm": 1.8515625,
"learning_rate": 1.1360628252283513e-08,
"loss": 0.4742,
"num_tokens": 14161587.0,
"step": 574
},
{
"epoch": 9.752136752136753,
"grad_norm": 1.78125,
"learning_rate": 1.006426501190233e-08,
"loss": 0.4446,
"num_tokens": 14186463.0,
"step": 575
},
{
"epoch": 9.76923076923077,
"grad_norm": 1.75,
"learning_rate": 8.84626470595229e-09,
"loss": 0.4671,
"num_tokens": 14211825.0,
"step": 576
},
{
"epoch": 9.786324786324787,
"grad_norm": 1.8671875,
"learning_rate": 7.70666566718009e-09,
"loss": 0.4834,
"num_tokens": 14237567.0,
"step": 577
},
{
"epoch": 9.803418803418804,
"grad_norm": 1.9296875,
"learning_rate": 6.645503760899508e-09,
"loss": 0.4333,
"num_tokens": 14262209.0,
"step": 578
},
{
"epoch": 9.820512820512821,
"grad_norm": 1.8046875,
"learning_rate": 5.662812383859795e-09,
"loss": 0.4579,
"num_tokens": 14287579.0,
"step": 579
},
{
"epoch": 9.837606837606838,
"grad_norm": 1.84375,
"learning_rate": 4.758622463196805e-09,
"loss": 0.4538,
"num_tokens": 14311419.0,
"step": 580
},
{
"epoch": 9.854700854700855,
"grad_norm": 1.875,
"learning_rate": 3.932962455458489e-09,
"loss": 0.4787,
"num_tokens": 14336303.0,
"step": 581
},
{
"epoch": 9.871794871794872,
"grad_norm": 1.78125,
"learning_rate": 3.1858583457095026e-09,
"loss": 0.4192,
"num_tokens": 14361504.0,
"step": 582
},
{
"epoch": 9.88888888888889,
"grad_norm": 1.7578125,
"learning_rate": 2.5173336467135266e-09,
"loss": 0.4746,
"num_tokens": 14386137.0,
"step": 583
},
{
"epoch": 9.905982905982906,
"grad_norm": 1.8671875,
"learning_rate": 1.9274093981927476e-09,
"loss": 0.4572,
"num_tokens": 14410674.0,
"step": 584
},
{
"epoch": 9.923076923076923,
"grad_norm": 1.8671875,
"learning_rate": 1.4161041661667208e-09,
"loss": 0.4646,
"num_tokens": 14434870.0,
"step": 585
},
{
"epoch": 9.94017094017094,
"grad_norm": 1.8671875,
"learning_rate": 9.834340423678368e-10,
"loss": 0.4593,
"num_tokens": 14460092.0,
"step": 586
},
{
"epoch": 9.957264957264957,
"grad_norm": 1.828125,
"learning_rate": 6.294126437336734e-10,
"loss": 0.4751,
"num_tokens": 14485799.0,
"step": 587
},
{
"epoch": 9.974358974358974,
"grad_norm": 1.78125,
"learning_rate": 3.5405111197955866e-10,
"loss": 0.4483,
"num_tokens": 14510709.0,
"step": 588
},
{
"epoch": 9.991452991452991,
"grad_norm": 1.9921875,
"learning_rate": 1.5735811324857354e-10,
"loss": 0.4797,
"num_tokens": 14534764.0,
"step": 589
},
{
"epoch": 10.0,
"grad_norm": 2.734375,
"learning_rate": 3.933983783677153e-11,
"loss": 0.4571,
"num_tokens": 14544610.0,
"step": 590
}
],
"logging_steps": 1,
"max_steps": 590,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3842096471023616e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}