train_copa_42_1760623610 / trainer_state.json
rbelanec's picture
End of training
65d586c verified
{
"best_global_step": 1530,
"best_metric": 0.06136542558670044,
"best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_copa_42_1760623610/checkpoint-1530",
"epoch": 20.0,
"eval_steps": 90,
"global_step": 1800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05555555555555555,
"grad_norm": 2.2088112831115723,
"learning_rate": 1.1111111111111112e-06,
"loss": 0.4573,
"num_input_tokens_seen": 1600,
"step": 5,
"train_runtime": 2.9456,
"train_tokens_per_second": 543.175
},
{
"epoch": 0.1111111111111111,
"grad_norm": 4.39024019241333,
"learning_rate": 2.5e-06,
"loss": 0.5421,
"num_input_tokens_seen": 3200,
"step": 10,
"train_runtime": 3.6773,
"train_tokens_per_second": 870.196
},
{
"epoch": 0.16666666666666666,
"grad_norm": 3.716674327850342,
"learning_rate": 3.888888888888889e-06,
"loss": 0.6127,
"num_input_tokens_seen": 4768,
"step": 15,
"train_runtime": 4.4116,
"train_tokens_per_second": 1080.799
},
{
"epoch": 0.2222222222222222,
"grad_norm": 4.793196201324463,
"learning_rate": 5.277777777777778e-06,
"loss": 0.8032,
"num_input_tokens_seen": 6336,
"step": 20,
"train_runtime": 5.1293,
"train_tokens_per_second": 1235.252
},
{
"epoch": 0.2777777777777778,
"grad_norm": 4.469949722290039,
"learning_rate": 6.666666666666667e-06,
"loss": 0.708,
"num_input_tokens_seen": 7904,
"step": 25,
"train_runtime": 5.8465,
"train_tokens_per_second": 1351.911
},
{
"epoch": 0.3333333333333333,
"grad_norm": 3.813312530517578,
"learning_rate": 8.055555555555557e-06,
"loss": 0.7098,
"num_input_tokens_seen": 9504,
"step": 30,
"train_runtime": 6.5631,
"train_tokens_per_second": 1448.106
},
{
"epoch": 0.3888888888888889,
"grad_norm": 5.289863586425781,
"learning_rate": 9.444444444444445e-06,
"loss": 0.5975,
"num_input_tokens_seen": 11072,
"step": 35,
"train_runtime": 7.2773,
"train_tokens_per_second": 1521.439
},
{
"epoch": 0.4444444444444444,
"grad_norm": 5.294936656951904,
"learning_rate": 1.0833333333333334e-05,
"loss": 0.6869,
"num_input_tokens_seen": 12672,
"step": 40,
"train_runtime": 7.9904,
"train_tokens_per_second": 1585.904
},
{
"epoch": 0.5,
"grad_norm": 4.334069728851318,
"learning_rate": 1.2222222222222222e-05,
"loss": 0.5514,
"num_input_tokens_seen": 14176,
"step": 45,
"train_runtime": 8.7025,
"train_tokens_per_second": 1628.959
},
{
"epoch": 0.5555555555555556,
"grad_norm": 3.8888776302337646,
"learning_rate": 1.3611111111111111e-05,
"loss": 0.5715,
"num_input_tokens_seen": 15776,
"step": 50,
"train_runtime": 9.4207,
"train_tokens_per_second": 1674.613
},
{
"epoch": 0.6111111111111112,
"grad_norm": 3.423602819442749,
"learning_rate": 1.5e-05,
"loss": 0.7156,
"num_input_tokens_seen": 17312,
"step": 55,
"train_runtime": 10.1344,
"train_tokens_per_second": 1708.236
},
{
"epoch": 0.6666666666666666,
"grad_norm": 5.6678619384765625,
"learning_rate": 1.638888888888889e-05,
"loss": 0.6582,
"num_input_tokens_seen": 18848,
"step": 60,
"train_runtime": 10.8468,
"train_tokens_per_second": 1737.659
},
{
"epoch": 0.7222222222222222,
"grad_norm": 3.3733015060424805,
"learning_rate": 1.777777777777778e-05,
"loss": 0.5932,
"num_input_tokens_seen": 20448,
"step": 65,
"train_runtime": 11.5622,
"train_tokens_per_second": 1768.529
},
{
"epoch": 0.7777777777777778,
"grad_norm": 5.003718852996826,
"learning_rate": 1.9166666666666667e-05,
"loss": 0.6346,
"num_input_tokens_seen": 22016,
"step": 70,
"train_runtime": 12.2783,
"train_tokens_per_second": 1793.078
},
{
"epoch": 0.8333333333333334,
"grad_norm": 5.266268253326416,
"learning_rate": 2.0555555555555555e-05,
"loss": 0.6911,
"num_input_tokens_seen": 23616,
"step": 75,
"train_runtime": 12.9932,
"train_tokens_per_second": 1817.571
},
{
"epoch": 0.8888888888888888,
"grad_norm": 3.5122318267822266,
"learning_rate": 2.1944444444444445e-05,
"loss": 0.6298,
"num_input_tokens_seen": 25152,
"step": 80,
"train_runtime": 13.7078,
"train_tokens_per_second": 1834.864
},
{
"epoch": 0.9444444444444444,
"grad_norm": 2.586906671524048,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.502,
"num_input_tokens_seen": 26688,
"step": 85,
"train_runtime": 14.4298,
"train_tokens_per_second": 1849.51
},
{
"epoch": 1.0,
"grad_norm": 5.285933017730713,
"learning_rate": 2.4722222222222223e-05,
"loss": 0.6707,
"num_input_tokens_seen": 28256,
"step": 90,
"train_runtime": 15.1724,
"train_tokens_per_second": 1862.324
},
{
"epoch": 1.0,
"eval_loss": 0.5807542204856873,
"eval_runtime": 0.5309,
"eval_samples_per_second": 75.343,
"eval_steps_per_second": 18.836,
"num_input_tokens_seen": 28256,
"step": 90
},
{
"epoch": 1.0555555555555556,
"grad_norm": 4.274877548217773,
"learning_rate": 2.6111111111111114e-05,
"loss": 0.6538,
"num_input_tokens_seen": 29824,
"step": 95,
"train_runtime": 17.2747,
"train_tokens_per_second": 1726.451
},
{
"epoch": 1.1111111111111112,
"grad_norm": 4.135989665985107,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.6772,
"num_input_tokens_seen": 31360,
"step": 100,
"train_runtime": 17.9993,
"train_tokens_per_second": 1742.294
},
{
"epoch": 1.1666666666666667,
"grad_norm": 4.4399800300598145,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.6713,
"num_input_tokens_seen": 32960,
"step": 105,
"train_runtime": 18.7241,
"train_tokens_per_second": 1760.298
},
{
"epoch": 1.2222222222222223,
"grad_norm": 4.539364814758301,
"learning_rate": 3.0277777777777776e-05,
"loss": 0.7478,
"num_input_tokens_seen": 34464,
"step": 110,
"train_runtime": 19.5695,
"train_tokens_per_second": 1761.109
},
{
"epoch": 1.2777777777777777,
"grad_norm": 2.650864601135254,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.4845,
"num_input_tokens_seen": 36032,
"step": 115,
"train_runtime": 20.2991,
"train_tokens_per_second": 1775.058
},
{
"epoch": 1.3333333333333333,
"grad_norm": 4.143145561218262,
"learning_rate": 3.3055555555555553e-05,
"loss": 0.6008,
"num_input_tokens_seen": 37600,
"step": 120,
"train_runtime": 21.0254,
"train_tokens_per_second": 1788.313
},
{
"epoch": 1.3888888888888888,
"grad_norm": 3.905705690383911,
"learning_rate": 3.444444444444445e-05,
"loss": 0.4756,
"num_input_tokens_seen": 39168,
"step": 125,
"train_runtime": 21.7597,
"train_tokens_per_second": 1800.026
},
{
"epoch": 1.4444444444444444,
"grad_norm": 2.9782958030700684,
"learning_rate": 3.5833333333333335e-05,
"loss": 0.6311,
"num_input_tokens_seen": 40736,
"step": 130,
"train_runtime": 22.4852,
"train_tokens_per_second": 1811.679
},
{
"epoch": 1.5,
"grad_norm": 1.8527201414108276,
"learning_rate": 3.722222222222222e-05,
"loss": 0.3536,
"num_input_tokens_seen": 42240,
"step": 135,
"train_runtime": 23.2063,
"train_tokens_per_second": 1820.199
},
{
"epoch": 1.5555555555555556,
"grad_norm": 2.4050815105438232,
"learning_rate": 3.8611111111111116e-05,
"loss": 0.2908,
"num_input_tokens_seen": 43840,
"step": 140,
"train_runtime": 23.9311,
"train_tokens_per_second": 1831.925
},
{
"epoch": 1.6111111111111112,
"grad_norm": 4.370201110839844,
"learning_rate": 4e-05,
"loss": 0.1444,
"num_input_tokens_seen": 45408,
"step": 145,
"train_runtime": 24.6546,
"train_tokens_per_second": 1841.766
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.18738660216331482,
"learning_rate": 4.138888888888889e-05,
"loss": 0.1972,
"num_input_tokens_seen": 46976,
"step": 150,
"train_runtime": 25.3788,
"train_tokens_per_second": 1850.994
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.4951828718185425,
"learning_rate": 4.277777777777778e-05,
"loss": 0.0597,
"num_input_tokens_seen": 48512,
"step": 155,
"train_runtime": 26.1051,
"train_tokens_per_second": 1858.334
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.1515243649482727,
"learning_rate": 4.4166666666666665e-05,
"loss": 0.0147,
"num_input_tokens_seen": 50112,
"step": 160,
"train_runtime": 26.8225,
"train_tokens_per_second": 1868.284
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.11960956454277039,
"learning_rate": 4.555555555555556e-05,
"loss": 0.1213,
"num_input_tokens_seen": 51712,
"step": 165,
"train_runtime": 27.5514,
"train_tokens_per_second": 1876.925
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.05862782150506973,
"learning_rate": 4.6944444444444446e-05,
"loss": 0.2448,
"num_input_tokens_seen": 53280,
"step": 170,
"train_runtime": 28.2801,
"train_tokens_per_second": 1884.007
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.4216754138469696,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.051,
"num_input_tokens_seen": 54880,
"step": 175,
"train_runtime": 29.0067,
"train_tokens_per_second": 1891.977
},
{
"epoch": 2.0,
"grad_norm": 0.050075557082891464,
"learning_rate": 4.972222222222223e-05,
"loss": 0.1046,
"num_input_tokens_seen": 56480,
"step": 180,
"train_runtime": 29.7614,
"train_tokens_per_second": 1897.76
},
{
"epoch": 2.0,
"eval_loss": 0.11521875858306885,
"eval_runtime": 0.5418,
"eval_samples_per_second": 73.822,
"eval_steps_per_second": 18.455,
"num_input_tokens_seen": 56480,
"step": 180
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.3309241831302643,
"learning_rate": 4.9999247861994194e-05,
"loss": 0.0641,
"num_input_tokens_seen": 58048,
"step": 185,
"train_runtime": 31.8276,
"train_tokens_per_second": 1823.827
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.06720643490552902,
"learning_rate": 4.9996192378909786e-05,
"loss": 0.0257,
"num_input_tokens_seen": 59584,
"step": 190,
"train_runtime": 32.5618,
"train_tokens_per_second": 1829.874
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.46070975065231323,
"learning_rate": 4.999078682916774e-05,
"loss": 0.0885,
"num_input_tokens_seen": 61216,
"step": 195,
"train_runtime": 33.2917,
"train_tokens_per_second": 1838.775
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.796092689037323,
"learning_rate": 4.998303172098155e-05,
"loss": 0.0881,
"num_input_tokens_seen": 62784,
"step": 200,
"train_runtime": 34.0299,
"train_tokens_per_second": 1844.965
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.1357884556055069,
"learning_rate": 4.997292778346312e-05,
"loss": 0.0261,
"num_input_tokens_seen": 64352,
"step": 205,
"train_runtime": 34.7664,
"train_tokens_per_second": 1850.985
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.033360954374074936,
"learning_rate": 4.996047596655418e-05,
"loss": 0.0676,
"num_input_tokens_seen": 65952,
"step": 210,
"train_runtime": 35.5584,
"train_tokens_per_second": 1854.75
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.7607372403144836,
"learning_rate": 4.994567744093703e-05,
"loss": 0.1574,
"num_input_tokens_seen": 67552,
"step": 215,
"train_runtime": 36.2847,
"train_tokens_per_second": 1861.723
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.0921010971069336,
"learning_rate": 4.992853359792444e-05,
"loss": 0.1509,
"num_input_tokens_seen": 69120,
"step": 220,
"train_runtime": 37.0158,
"train_tokens_per_second": 1867.312
},
{
"epoch": 2.5,
"grad_norm": 0.037158917635679245,
"learning_rate": 4.9909046049328846e-05,
"loss": 0.1674,
"num_input_tokens_seen": 70688,
"step": 225,
"train_runtime": 37.7412,
"train_tokens_per_second": 1872.967
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.04759933799505234,
"learning_rate": 4.988721662731083e-05,
"loss": 0.0838,
"num_input_tokens_seen": 72288,
"step": 230,
"train_runtime": 38.4649,
"train_tokens_per_second": 1879.323
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.03261607512831688,
"learning_rate": 4.9863047384206835e-05,
"loss": 0.038,
"num_input_tokens_seen": 73856,
"step": 235,
"train_runtime": 39.1892,
"train_tokens_per_second": 1884.601
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.04402220621705055,
"learning_rate": 4.983654059233626e-05,
"loss": 0.1015,
"num_input_tokens_seen": 75392,
"step": 240,
"train_runtime": 39.9138,
"train_tokens_per_second": 1888.87
},
{
"epoch": 2.7222222222222223,
"grad_norm": 1.1526827812194824,
"learning_rate": 4.9807698743787744e-05,
"loss": 0.0368,
"num_input_tokens_seen": 76960,
"step": 245,
"train_runtime": 40.6396,
"train_tokens_per_second": 1893.719
},
{
"epoch": 2.7777777777777777,
"grad_norm": 1.0865987539291382,
"learning_rate": 4.9776524550184965e-05,
"loss": 0.1612,
"num_input_tokens_seen": 78496,
"step": 250,
"train_runtime": 41.368,
"train_tokens_per_second": 1897.507
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.0332452766597271,
"learning_rate": 4.974302094243164e-05,
"loss": 0.0852,
"num_input_tokens_seen": 80000,
"step": 255,
"train_runtime": 42.1021,
"train_tokens_per_second": 1900.142
},
{
"epoch": 2.888888888888889,
"grad_norm": 2.336534261703491,
"learning_rate": 4.970719107043595e-05,
"loss": 0.1561,
"num_input_tokens_seen": 81568,
"step": 260,
"train_runtime": 42.8216,
"train_tokens_per_second": 1904.832
},
{
"epoch": 2.9444444444444446,
"grad_norm": 2.321704387664795,
"learning_rate": 4.966903830281449e-05,
"loss": 0.1484,
"num_input_tokens_seen": 83168,
"step": 265,
"train_runtime": 43.5535,
"train_tokens_per_second": 1909.561
},
{
"epoch": 3.0,
"grad_norm": 2.2072253227233887,
"learning_rate": 4.962856622657541e-05,
"loss": 0.218,
"num_input_tokens_seen": 84736,
"step": 270,
"train_runtime": 44.3017,
"train_tokens_per_second": 1912.705
},
{
"epoch": 3.0,
"eval_loss": 0.09266040474176407,
"eval_runtime": 0.5311,
"eval_samples_per_second": 75.318,
"eval_steps_per_second": 18.829,
"num_input_tokens_seen": 84736,
"step": 270
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.04840150848031044,
"learning_rate": 4.9585778646781364e-05,
"loss": 0.0109,
"num_input_tokens_seen": 86304,
"step": 275,
"train_runtime": 46.6293,
"train_tokens_per_second": 1850.855
},
{
"epoch": 3.111111111111111,
"grad_norm": 1.0470004081726074,
"learning_rate": 4.9540679586191605e-05,
"loss": 0.1111,
"num_input_tokens_seen": 87904,
"step": 280,
"train_runtime": 47.3578,
"train_tokens_per_second": 1856.166
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.1631588339805603,
"learning_rate": 4.9493273284883854e-05,
"loss": 0.082,
"num_input_tokens_seen": 89408,
"step": 285,
"train_runtime": 48.0854,
"train_tokens_per_second": 1859.357
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.17166022956371307,
"learning_rate": 4.9443564199855666e-05,
"loss": 0.1572,
"num_input_tokens_seen": 91008,
"step": 290,
"train_runtime": 48.8832,
"train_tokens_per_second": 1861.742
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.2681538462638855,
"learning_rate": 4.939155700460536e-05,
"loss": 0.0726,
"num_input_tokens_seen": 92512,
"step": 295,
"train_runtime": 49.6117,
"train_tokens_per_second": 1864.72
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.024233149364590645,
"learning_rate": 4.933725658869267e-05,
"loss": 0.0139,
"num_input_tokens_seen": 94080,
"step": 300,
"train_runtime": 50.3418,
"train_tokens_per_second": 1868.825
},
{
"epoch": 3.388888888888889,
"grad_norm": 1.682276964187622,
"learning_rate": 4.9280668057279014e-05,
"loss": 0.0565,
"num_input_tokens_seen": 95680,
"step": 305,
"train_runtime": 51.0683,
"train_tokens_per_second": 1873.569
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.29281699657440186,
"learning_rate": 4.9221796730647516e-05,
"loss": 0.0126,
"num_input_tokens_seen": 97248,
"step": 310,
"train_runtime": 51.7945,
"train_tokens_per_second": 1877.573
},
{
"epoch": 3.5,
"grad_norm": 0.06379231810569763,
"learning_rate": 4.916064814370287e-05,
"loss": 0.033,
"num_input_tokens_seen": 98784,
"step": 315,
"train_runtime": 52.5174,
"train_tokens_per_second": 1880.976
},
{
"epoch": 3.5555555555555554,
"grad_norm": 1.4467811584472656,
"learning_rate": 4.9097228045450864e-05,
"loss": 0.1501,
"num_input_tokens_seen": 100384,
"step": 320,
"train_runtime": 53.2424,
"train_tokens_per_second": 1885.414
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.5704734921455383,
"learning_rate": 4.9031542398457974e-05,
"loss": 0.0704,
"num_input_tokens_seen": 101952,
"step": 325,
"train_runtime": 53.9719,
"train_tokens_per_second": 1888.983
},
{
"epoch": 3.6666666666666665,
"grad_norm": 2.3169894218444824,
"learning_rate": 4.896359737829071e-05,
"loss": 0.1576,
"num_input_tokens_seen": 103520,
"step": 330,
"train_runtime": 54.6975,
"train_tokens_per_second": 1892.592
},
{
"epoch": 3.7222222222222223,
"grad_norm": 1.74516761302948,
"learning_rate": 4.889339937293508e-05,
"loss": 0.1599,
"num_input_tokens_seen": 105120,
"step": 335,
"train_runtime": 55.4274,
"train_tokens_per_second": 1896.536
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.154998779296875,
"learning_rate": 4.8820954982195905e-05,
"loss": 0.0478,
"num_input_tokens_seen": 106720,
"step": 340,
"train_runtime": 56.1522,
"train_tokens_per_second": 1900.55
},
{
"epoch": 3.8333333333333335,
"grad_norm": 1.1225404739379883,
"learning_rate": 4.874627101707644e-05,
"loss": 0.0306,
"num_input_tokens_seen": 108320,
"step": 345,
"train_runtime": 56.8749,
"train_tokens_per_second": 1904.532
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.2039882242679596,
"learning_rate": 4.8669354499137955e-05,
"loss": 0.1915,
"num_input_tokens_seen": 109888,
"step": 350,
"train_runtime": 57.6027,
"train_tokens_per_second": 1907.689
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.016289830207824707,
"learning_rate": 4.859021265983959e-05,
"loss": 0.0131,
"num_input_tokens_seen": 111424,
"step": 355,
"train_runtime": 58.327,
"train_tokens_per_second": 1910.333
},
{
"epoch": 4.0,
"grad_norm": 0.05436905845999718,
"learning_rate": 4.850885293985853e-05,
"loss": 0.1464,
"num_input_tokens_seen": 113024,
"step": 360,
"train_runtime": 59.0796,
"train_tokens_per_second": 1913.08
},
{
"epoch": 4.0,
"eval_loss": 0.0865594819188118,
"eval_runtime": 0.5373,
"eval_samples_per_second": 74.451,
"eval_steps_per_second": 18.613,
"num_input_tokens_seen": 113024,
"step": 360
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.26754438877105713,
"learning_rate": 4.8425282988390376e-05,
"loss": 0.0143,
"num_input_tokens_seen": 114624,
"step": 365,
"train_runtime": 61.1601,
"train_tokens_per_second": 1874.164
},
{
"epoch": 4.111111111111111,
"grad_norm": 1.4411932229995728,
"learning_rate": 4.8339510662430046e-05,
"loss": 0.0517,
"num_input_tokens_seen": 116224,
"step": 370,
"train_runtime": 61.8991,
"train_tokens_per_second": 1877.636
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.9027936458587646,
"learning_rate": 4.825154402603308e-05,
"loss": 0.0357,
"num_input_tokens_seen": 117760,
"step": 375,
"train_runtime": 62.634,
"train_tokens_per_second": 1880.13
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.05497714877128601,
"learning_rate": 4.816139134955746e-05,
"loss": 0.2172,
"num_input_tokens_seen": 119360,
"step": 380,
"train_runtime": 63.3708,
"train_tokens_per_second": 1883.518
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.019861461594700813,
"learning_rate": 4.806906110888606e-05,
"loss": 0.0508,
"num_input_tokens_seen": 120960,
"step": 385,
"train_runtime": 64.1576,
"train_tokens_per_second": 1885.358
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.0410231314599514,
"learning_rate": 4.797456198462979e-05,
"loss": 0.0525,
"num_input_tokens_seen": 122528,
"step": 390,
"train_runtime": 64.8796,
"train_tokens_per_second": 1888.545
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.019991597160696983,
"learning_rate": 4.7877902861311446e-05,
"loss": 0.0366,
"num_input_tokens_seen": 124096,
"step": 395,
"train_runtime": 65.602,
"train_tokens_per_second": 1891.651
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.22315886616706848,
"learning_rate": 4.777909282653042e-05,
"loss": 0.0932,
"num_input_tokens_seen": 125696,
"step": 400,
"train_runtime": 66.3299,
"train_tokens_per_second": 1895.012
},
{
"epoch": 4.5,
"grad_norm": 0.017999814823269844,
"learning_rate": 4.7678141170108345e-05,
"loss": 0.0129,
"num_input_tokens_seen": 127264,
"step": 405,
"train_runtime": 67.0642,
"train_tokens_per_second": 1897.644
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.014455858618021011,
"learning_rate": 4.757505738321563e-05,
"loss": 0.0619,
"num_input_tokens_seen": 128832,
"step": 410,
"train_runtime": 67.8,
"train_tokens_per_second": 1900.178
},
{
"epoch": 4.611111111111111,
"grad_norm": 1.1234349012374878,
"learning_rate": 4.7469851157479177e-05,
"loss": 0.0666,
"num_input_tokens_seen": 130464,
"step": 415,
"train_runtime": 68.5354,
"train_tokens_per_second": 1903.6
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.1306644231081009,
"learning_rate": 4.736253238407119e-05,
"loss": 0.021,
"num_input_tokens_seen": 132032,
"step": 420,
"train_runtime": 69.2654,
"train_tokens_per_second": 1906.174
},
{
"epoch": 4.722222222222222,
"grad_norm": 2.1763293743133545,
"learning_rate": 4.725311115277924e-05,
"loss": 0.124,
"num_input_tokens_seen": 133632,
"step": 425,
"train_runtime": 69.9934,
"train_tokens_per_second": 1909.21
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.031760722398757935,
"learning_rate": 4.714159775105765e-05,
"loss": 0.0831,
"num_input_tokens_seen": 135232,
"step": 430,
"train_runtime": 70.7244,
"train_tokens_per_second": 1912.098
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.8396589756011963,
"learning_rate": 4.70280026630603e-05,
"loss": 0.0929,
"num_input_tokens_seen": 136768,
"step": 435,
"train_runtime": 71.4563,
"train_tokens_per_second": 1914.009
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.8139696717262268,
"learning_rate": 4.6912336568654925e-05,
"loss": 0.0923,
"num_input_tokens_seen": 138368,
"step": 440,
"train_runtime": 72.1877,
"train_tokens_per_second": 1916.78
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.037018146365880966,
"learning_rate": 4.679461034241906e-05,
"loss": 0.0602,
"num_input_tokens_seen": 139904,
"step": 445,
"train_runtime": 72.9133,
"train_tokens_per_second": 1918.772
},
{
"epoch": 5.0,
"grad_norm": 0.5598136186599731,
"learning_rate": 4.667483505261762e-05,
"loss": 0.1828,
"num_input_tokens_seen": 141440,
"step": 450,
"train_runtime": 73.6811,
"train_tokens_per_second": 1919.625
},
{
"epoch": 5.0,
"eval_loss": 0.08515938371419907,
"eval_runtime": 0.5401,
"eval_samples_per_second": 74.06,
"eval_steps_per_second": 18.515,
"num_input_tokens_seen": 141440,
"step": 450
},
{
"epoch": 5.055555555555555,
"grad_norm": 0.03553323820233345,
"learning_rate": 4.655302196016228e-05,
"loss": 0.0195,
"num_input_tokens_seen": 142976,
"step": 455,
"train_runtime": 75.8668,
"train_tokens_per_second": 1884.567
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.38880011439323425,
"learning_rate": 4.642918251755281e-05,
"loss": 0.0529,
"num_input_tokens_seen": 144576,
"step": 460,
"train_runtime": 76.5962,
"train_tokens_per_second": 1887.508
},
{
"epoch": 5.166666666666667,
"grad_norm": 1.82427179813385,
"learning_rate": 4.6303328367800284e-05,
"loss": 0.0852,
"num_input_tokens_seen": 146144,
"step": 465,
"train_runtime": 77.3337,
"train_tokens_per_second": 1889.783
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.13176696002483368,
"learning_rate": 4.6175471343332485e-05,
"loss": 0.029,
"num_input_tokens_seen": 147712,
"step": 470,
"train_runtime": 78.0599,
"train_tokens_per_second": 1892.29
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.042600348591804504,
"learning_rate": 4.604562346488144e-05,
"loss": 0.016,
"num_input_tokens_seen": 149248,
"step": 475,
"train_runtime": 78.8528,
"train_tokens_per_second": 1892.743
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.2647045850753784,
"learning_rate": 4.591379694035325e-05,
"loss": 0.0317,
"num_input_tokens_seen": 150816,
"step": 480,
"train_runtime": 79.5814,
"train_tokens_per_second": 1895.117
},
{
"epoch": 5.388888888888889,
"grad_norm": 0.010275711305439472,
"learning_rate": 4.5780004163680365e-05,
"loss": 0.0623,
"num_input_tokens_seen": 152352,
"step": 485,
"train_runtime": 80.307,
"train_tokens_per_second": 1897.119
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.01466486044228077,
"learning_rate": 4.5644257713656356e-05,
"loss": 0.0467,
"num_input_tokens_seen": 153888,
"step": 490,
"train_runtime": 81.0311,
"train_tokens_per_second": 1899.123
},
{
"epoch": 5.5,
"grad_norm": 0.2574822008609772,
"learning_rate": 4.550657035275323e-05,
"loss": 0.0727,
"num_input_tokens_seen": 155488,
"step": 495,
"train_runtime": 81.7508,
"train_tokens_per_second": 1901.976
},
{
"epoch": 5.555555555555555,
"grad_norm": 1.9353536367416382,
"learning_rate": 4.536695502592162e-05,
"loss": 0.1666,
"num_input_tokens_seen": 157024,
"step": 500,
"train_runtime": 82.479,
"train_tokens_per_second": 1903.805
},
{
"epoch": 5.611111111111111,
"grad_norm": 0.1050436869263649,
"learning_rate": 4.522542485937369e-05,
"loss": 0.0724,
"num_input_tokens_seen": 158528,
"step": 505,
"train_runtime": 83.2079,
"train_tokens_per_second": 1905.205
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.029432374984025955,
"learning_rate": 4.5081993159349056e-05,
"loss": 0.1272,
"num_input_tokens_seen": 160064,
"step": 510,
"train_runtime": 83.935,
"train_tokens_per_second": 1906.999
},
{
"epoch": 5.722222222222222,
"grad_norm": 0.026064742356538773,
"learning_rate": 4.493667341086379e-05,
"loss": 0.1286,
"num_input_tokens_seen": 161664,
"step": 515,
"train_runtime": 84.6515,
"train_tokens_per_second": 1909.758
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.04574081301689148,
"learning_rate": 4.478947927644258e-05,
"loss": 0.1791,
"num_input_tokens_seen": 163264,
"step": 520,
"train_runtime": 85.3662,
"train_tokens_per_second": 1912.513
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.4281242787837982,
"learning_rate": 4.464042459483425e-05,
"loss": 0.0292,
"num_input_tokens_seen": 164864,
"step": 525,
"train_runtime": 86.0827,
"train_tokens_per_second": 1915.182
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.1832948625087738,
"learning_rate": 4.448952337971064e-05,
"loss": 0.0392,
"num_input_tokens_seen": 166432,
"step": 530,
"train_runtime": 86.8048,
"train_tokens_per_second": 1917.314
},
{
"epoch": 5.944444444444445,
"grad_norm": 0.8455609083175659,
"learning_rate": 4.43367898183491e-05,
"loss": 0.0844,
"num_input_tokens_seen": 168032,
"step": 535,
"train_runtime": 87.5225,
"train_tokens_per_second": 1919.871
},
{
"epoch": 6.0,
"grad_norm": 0.29642245173454285,
"learning_rate": 4.418223827029867e-05,
"loss": 0.0243,
"num_input_tokens_seen": 169600,
"step": 540,
"train_runtime": 88.2761,
"train_tokens_per_second": 1921.246
},
{
"epoch": 6.0,
"eval_loss": 0.0781724601984024,
"eval_runtime": 0.534,
"eval_samples_per_second": 74.912,
"eval_steps_per_second": 18.728,
"num_input_tokens_seen": 169600,
"step": 540
},
{
"epoch": 6.055555555555555,
"grad_norm": 0.04110460728406906,
"learning_rate": 4.402588326603002e-05,
"loss": 0.0079,
"num_input_tokens_seen": 171168,
"step": 545,
"train_runtime": 90.4103,
"train_tokens_per_second": 1893.235
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.7516536116600037,
"learning_rate": 4.386773950556931e-05,
"loss": 0.0233,
"num_input_tokens_seen": 172672,
"step": 550,
"train_runtime": 91.138,
"train_tokens_per_second": 1894.62
},
{
"epoch": 6.166666666666667,
"grad_norm": 0.054245639592409134,
"learning_rate": 4.3707821857116176e-05,
"loss": 0.0854,
"num_input_tokens_seen": 174240,
"step": 555,
"train_runtime": 91.8687,
"train_tokens_per_second": 1896.619
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.02101464383304119,
"learning_rate": 4.354614535564588e-05,
"loss": 0.1191,
"num_input_tokens_seen": 175776,
"step": 560,
"train_runtime": 92.5926,
"train_tokens_per_second": 1898.381
},
{
"epoch": 6.277777777777778,
"grad_norm": 0.41236135363578796,
"learning_rate": 4.3382725201495723e-05,
"loss": 0.0971,
"num_input_tokens_seen": 177376,
"step": 565,
"train_runtime": 93.3832,
"train_tokens_per_second": 1899.442
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.45961734652519226,
"learning_rate": 4.321757675893596e-05,
"loss": 0.0352,
"num_input_tokens_seen": 178912,
"step": 570,
"train_runtime": 94.1059,
"train_tokens_per_second": 1901.177
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.3838369846343994,
"learning_rate": 4.305071555472534e-05,
"loss": 0.0241,
"num_input_tokens_seen": 180480,
"step": 575,
"train_runtime": 94.8324,
"train_tokens_per_second": 1903.147
},
{
"epoch": 6.444444444444445,
"grad_norm": 1.6661782264709473,
"learning_rate": 4.288215727665129e-05,
"loss": 0.2398,
"num_input_tokens_seen": 182048,
"step": 580,
"train_runtime": 95.5571,
"train_tokens_per_second": 1905.123
},
{
"epoch": 6.5,
"grad_norm": 0.010891405865550041,
"learning_rate": 4.2711917772055e-05,
"loss": 0.0831,
"num_input_tokens_seen": 183648,
"step": 585,
"train_runtime": 96.272,
"train_tokens_per_second": 1907.595
},
{
"epoch": 6.555555555555555,
"grad_norm": 1.7999361753463745,
"learning_rate": 4.254001304634151e-05,
"loss": 0.1064,
"num_input_tokens_seen": 185248,
"step": 590,
"train_runtime": 96.9909,
"train_tokens_per_second": 1909.952
},
{
"epoch": 6.611111111111111,
"grad_norm": 0.1158963292837143,
"learning_rate": 4.2366459261474933e-05,
"loss": 0.0343,
"num_input_tokens_seen": 186720,
"step": 595,
"train_runtime": 97.7079,
"train_tokens_per_second": 1911.002
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.14980565011501312,
"learning_rate": 4.2191272734458955e-05,
"loss": 0.0058,
"num_input_tokens_seen": 188288,
"step": 600,
"train_runtime": 98.425,
"train_tokens_per_second": 1913.01
},
{
"epoch": 6.722222222222222,
"grad_norm": 0.031336456537246704,
"learning_rate": 4.201446993580276e-05,
"loss": 0.0557,
"num_input_tokens_seen": 189888,
"step": 605,
"train_runtime": 99.1477,
"train_tokens_per_second": 1915.202
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.10014252364635468,
"learning_rate": 4.183606748797251e-05,
"loss": 0.0224,
"num_input_tokens_seen": 191424,
"step": 610,
"train_runtime": 99.8674,
"train_tokens_per_second": 1916.782
},
{
"epoch": 6.833333333333333,
"grad_norm": 0.02181674912571907,
"learning_rate": 4.1656082163828566e-05,
"loss": 0.0818,
"num_input_tokens_seen": 193056,
"step": 615,
"train_runtime": 100.5921,
"train_tokens_per_second": 1919.196
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.392986536026001,
"learning_rate": 4.147453088504854e-05,
"loss": 0.0517,
"num_input_tokens_seen": 194592,
"step": 620,
"train_runtime": 101.3102,
"train_tokens_per_second": 1920.754
},
{
"epoch": 6.944444444444445,
"grad_norm": 1.3498486280441284,
"learning_rate": 4.129143072053638e-05,
"loss": 0.0833,
"num_input_tokens_seen": 196192,
"step": 625,
"train_runtime": 102.0313,
"train_tokens_per_second": 1922.862
},
{
"epoch": 7.0,
"grad_norm": 0.016840647906064987,
"learning_rate": 4.110679888481763e-05,
"loss": 0.0427,
"num_input_tokens_seen": 197792,
"step": 630,
"train_runtime": 102.7824,
"train_tokens_per_second": 1924.376
},
{
"epoch": 7.0,
"eval_loss": 0.07322361320257187,
"eval_runtime": 0.5374,
"eval_samples_per_second": 74.436,
"eval_steps_per_second": 18.609,
"num_input_tokens_seen": 197792,
"step": 630
},
{
"epoch": 7.055555555555555,
"grad_norm": 0.7116149663925171,
"learning_rate": 4.09206527364209e-05,
"loss": 0.0141,
"num_input_tokens_seen": 199392,
"step": 635,
"train_runtime": 106.2986,
"train_tokens_per_second": 1875.773
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.010340097360312939,
"learning_rate": 4.073300977624594e-05,
"loss": 0.0111,
"num_input_tokens_seen": 200992,
"step": 640,
"train_runtime": 107.0352,
"train_tokens_per_second": 1877.811
},
{
"epoch": 7.166666666666667,
"grad_norm": 1.3066519498825073,
"learning_rate": 4.054388764591822e-05,
"loss": 0.0636,
"num_input_tokens_seen": 202592,
"step": 645,
"train_runtime": 107.763,
"train_tokens_per_second": 1879.977
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.02088622748851776,
"learning_rate": 4.035330412613035e-05,
"loss": 0.0997,
"num_input_tokens_seen": 204064,
"step": 650,
"train_runtime": 108.496,
"train_tokens_per_second": 1880.843
},
{
"epoch": 7.277777777777778,
"grad_norm": 0.028347406536340714,
"learning_rate": 4.0161277134970345e-05,
"loss": 0.0063,
"num_input_tokens_seen": 205664,
"step": 655,
"train_runtime": 109.2906,
"train_tokens_per_second": 1881.809
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.39772096276283264,
"learning_rate": 3.996782472623705e-05,
"loss": 0.1851,
"num_input_tokens_seen": 207264,
"step": 660,
"train_runtime": 110.0122,
"train_tokens_per_second": 1884.01
},
{
"epoch": 7.388888888888889,
"grad_norm": 0.7303412556648254,
"learning_rate": 3.977296508774278e-05,
"loss": 0.0443,
"num_input_tokens_seen": 208832,
"step": 665,
"train_runtime": 110.7332,
"train_tokens_per_second": 1885.902
},
{
"epoch": 7.444444444444445,
"grad_norm": 1.5940624475479126,
"learning_rate": 3.957671653960337e-05,
"loss": 0.1003,
"num_input_tokens_seen": 210368,
"step": 670,
"train_runtime": 111.4506,
"train_tokens_per_second": 1887.545
},
{
"epoch": 7.5,
"grad_norm": 0.5533637404441833,
"learning_rate": 3.9379097532515725e-05,
"loss": 0.0987,
"num_input_tokens_seen": 211936,
"step": 675,
"train_runtime": 112.1701,
"train_tokens_per_second": 1889.417
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.022317882627248764,
"learning_rate": 3.918012664602317e-05,
"loss": 0.0593,
"num_input_tokens_seen": 213536,
"step": 680,
"train_runtime": 112.8872,
"train_tokens_per_second": 1891.587
},
{
"epoch": 7.611111111111111,
"grad_norm": 0.3806142210960388,
"learning_rate": 3.897982258676867e-05,
"loss": 0.0728,
"num_input_tokens_seen": 215136,
"step": 685,
"train_runtime": 113.6042,
"train_tokens_per_second": 1893.732
},
{
"epoch": 7.666666666666667,
"grad_norm": 1.6564552783966064,
"learning_rate": 3.8778204186736076e-05,
"loss": 0.1161,
"num_input_tokens_seen": 216736,
"step": 690,
"train_runtime": 114.3211,
"train_tokens_per_second": 1895.853
},
{
"epoch": 7.722222222222222,
"grad_norm": 0.03356202319264412,
"learning_rate": 3.8575290401479586e-05,
"loss": 0.0112,
"num_input_tokens_seen": 218272,
"step": 695,
"train_runtime": 115.0384,
"train_tokens_per_second": 1897.384
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.5456864237785339,
"learning_rate": 3.837110030834161e-05,
"loss": 0.0608,
"num_input_tokens_seen": 219808,
"step": 700,
"train_runtime": 115.752,
"train_tokens_per_second": 1898.956
},
{
"epoch": 7.833333333333333,
"grad_norm": 0.02318716049194336,
"learning_rate": 3.8165653104659185e-05,
"loss": 0.0097,
"num_input_tokens_seen": 221312,
"step": 705,
"train_runtime": 116.4694,
"train_tokens_per_second": 1900.173
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.13537460565567017,
"learning_rate": 3.79589681059591e-05,
"loss": 0.0481,
"num_input_tokens_seen": 222880,
"step": 710,
"train_runtime": 117.1878,
"train_tokens_per_second": 1901.905
},
{
"epoch": 7.944444444444445,
"grad_norm": 0.28246793150901794,
"learning_rate": 3.775106474414188e-05,
"loss": 0.0856,
"num_input_tokens_seen": 224416,
"step": 715,
"train_runtime": 117.9062,
"train_tokens_per_second": 1903.343
},
{
"epoch": 8.0,
"grad_norm": 0.4781140387058258,
"learning_rate": 3.75419625656549e-05,
"loss": 0.018,
"num_input_tokens_seen": 225984,
"step": 720,
"train_runtime": 118.6553,
"train_tokens_per_second": 1904.542
},
{
"epoch": 8.0,
"eval_loss": 0.07187919318675995,
"eval_runtime": 0.5325,
"eval_samples_per_second": 75.117,
"eval_steps_per_second": 18.779,
"num_input_tokens_seen": 225984,
"step": 720
},
{
"epoch": 8.055555555555555,
"grad_norm": 0.024942545220255852,
"learning_rate": 3.7331681229654635e-05,
"loss": 0.0101,
"num_input_tokens_seen": 227552,
"step": 725,
"train_runtime": 120.7421,
"train_tokens_per_second": 1884.612
},
{
"epoch": 8.11111111111111,
"grad_norm": 0.10381424427032471,
"learning_rate": 3.712024050615843e-05,
"loss": 0.0972,
"num_input_tokens_seen": 229088,
"step": 730,
"train_runtime": 121.4789,
"train_tokens_per_second": 1885.826
},
{
"epoch": 8.166666666666666,
"grad_norm": 0.13342487812042236,
"learning_rate": 3.690766027418573e-05,
"loss": 0.0919,
"num_input_tokens_seen": 230656,
"step": 735,
"train_runtime": 122.2116,
"train_tokens_per_second": 1887.349
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.3371022343635559,
"learning_rate": 3.6693960519889106e-05,
"loss": 0.0366,
"num_input_tokens_seen": 232224,
"step": 740,
"train_runtime": 122.9443,
"train_tokens_per_second": 1888.855
},
{
"epoch": 8.277777777777779,
"grad_norm": 0.0277867391705513,
"learning_rate": 3.6479161334675296e-05,
"loss": 0.0324,
"num_input_tokens_seen": 233792,
"step": 745,
"train_runtime": 123.6753,
"train_tokens_per_second": 1890.369
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.24517586827278137,
"learning_rate": 3.626328291331618e-05,
"loss": 0.1219,
"num_input_tokens_seen": 235328,
"step": 750,
"train_runtime": 124.4739,
"train_tokens_per_second": 1890.582
},
{
"epoch": 8.38888888888889,
"grad_norm": 0.1492447555065155,
"learning_rate": 3.60463455520502e-05,
"loss": 0.0053,
"num_input_tokens_seen": 236864,
"step": 755,
"train_runtime": 125.2052,
"train_tokens_per_second": 1891.806
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.2043927162885666,
"learning_rate": 3.582836964667408e-05,
"loss": 0.0213,
"num_input_tokens_seen": 238368,
"step": 760,
"train_runtime": 125.9297,
"train_tokens_per_second": 1892.865
},
{
"epoch": 8.5,
"grad_norm": 1.6784281730651855,
"learning_rate": 3.560937569062538e-05,
"loss": 0.0844,
"num_input_tokens_seen": 239936,
"step": 765,
"train_runtime": 126.6543,
"train_tokens_per_second": 1894.416
},
{
"epoch": 8.555555555555555,
"grad_norm": 0.9631844758987427,
"learning_rate": 3.538938427305573e-05,
"loss": 0.056,
"num_input_tokens_seen": 241536,
"step": 770,
"train_runtime": 127.3734,
"train_tokens_per_second": 1896.282
},
{
"epoch": 8.61111111111111,
"grad_norm": 1.8858811855316162,
"learning_rate": 3.516841607689501e-05,
"loss": 0.1102,
"num_input_tokens_seen": 243136,
"step": 775,
"train_runtime": 128.0932,
"train_tokens_per_second": 1898.118
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.04732911288738251,
"learning_rate": 3.494649187690695e-05,
"loss": 0.0402,
"num_input_tokens_seen": 244704,
"step": 780,
"train_runtime": 128.81,
"train_tokens_per_second": 1899.728
},
{
"epoch": 8.722222222222221,
"grad_norm": 0.5676568150520325,
"learning_rate": 3.4723632537735846e-05,
"loss": 0.05,
"num_input_tokens_seen": 246272,
"step": 785,
"train_runtime": 129.5253,
"train_tokens_per_second": 1901.342
},
{
"epoch": 8.777777777777779,
"grad_norm": 0.014905107207596302,
"learning_rate": 3.449985901194498e-05,
"loss": 0.0379,
"num_input_tokens_seen": 247808,
"step": 790,
"train_runtime": 130.2469,
"train_tokens_per_second": 1902.603
},
{
"epoch": 8.833333333333334,
"grad_norm": 0.23896387219429016,
"learning_rate": 3.427519233804667e-05,
"loss": 0.0577,
"num_input_tokens_seen": 249376,
"step": 795,
"train_runtime": 130.965,
"train_tokens_per_second": 1904.142
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.4831705689430237,
"learning_rate": 3.404965363852437e-05,
"loss": 0.0611,
"num_input_tokens_seen": 250944,
"step": 800,
"train_runtime": 131.6819,
"train_tokens_per_second": 1905.683
},
{
"epoch": 8.944444444444445,
"grad_norm": 0.32540127635002136,
"learning_rate": 3.382326411784672e-05,
"loss": 0.0375,
"num_input_tokens_seen": 252512,
"step": 805,
"train_runtime": 132.3995,
"train_tokens_per_second": 1907.198
},
{
"epoch": 9.0,
"grad_norm": 0.10771387815475464,
"learning_rate": 3.359604506047403e-05,
"loss": 0.0989,
"num_input_tokens_seen": 254112,
"step": 810,
"train_runtime": 133.15,
"train_tokens_per_second": 1908.465
},
{
"epoch": 9.0,
"eval_loss": 0.07317575812339783,
"eval_runtime": 0.554,
"eval_samples_per_second": 72.204,
"eval_steps_per_second": 18.051,
"num_input_tokens_seen": 254112,
"step": 810
},
{
"epoch": 9.055555555555555,
"grad_norm": 0.1082310676574707,
"learning_rate": 3.336801782885712e-05,
"loss": 0.0755,
"num_input_tokens_seen": 255680,
"step": 815,
"train_runtime": 135.2209,
"train_tokens_per_second": 1890.832
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.40736010670661926,
"learning_rate": 3.313920386142892e-05,
"loss": 0.0437,
"num_input_tokens_seen": 257216,
"step": 820,
"train_runtime": 135.9533,
"train_tokens_per_second": 1891.943
},
{
"epoch": 9.166666666666666,
"grad_norm": 0.014512820169329643,
"learning_rate": 3.290962467058891e-05,
"loss": 0.0275,
"num_input_tokens_seen": 258816,
"step": 825,
"train_runtime": 136.6805,
"train_tokens_per_second": 1893.584
},
{
"epoch": 9.222222222222221,
"grad_norm": 0.8402073383331299,
"learning_rate": 3.267930184068057e-05,
"loss": 0.1428,
"num_input_tokens_seen": 260384,
"step": 830,
"train_runtime": 137.4096,
"train_tokens_per_second": 1894.948
},
{
"epoch": 9.277777777777779,
"grad_norm": 0.2935343086719513,
"learning_rate": 3.244825702596205e-05,
"loss": 0.0559,
"num_input_tokens_seen": 262048,
"step": 835,
"train_runtime": 138.1989,
"train_tokens_per_second": 1896.166
},
{
"epoch": 9.333333333333334,
"grad_norm": 0.3444117605686188,
"learning_rate": 3.2216511948570374e-05,
"loss": 0.0122,
"num_input_tokens_seen": 263616,
"step": 840,
"train_runtime": 138.9225,
"train_tokens_per_second": 1897.577
},
{
"epoch": 9.38888888888889,
"grad_norm": 0.539638340473175,
"learning_rate": 3.198408839647911e-05,
"loss": 0.0114,
"num_input_tokens_seen": 265152,
"step": 845,
"train_runtime": 139.6464,
"train_tokens_per_second": 1898.739
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.12687230110168457,
"learning_rate": 3.1751008221450025e-05,
"loss": 0.0371,
"num_input_tokens_seen": 266688,
"step": 850,
"train_runtime": 140.3648,
"train_tokens_per_second": 1899.963
},
{
"epoch": 9.5,
"grad_norm": 0.21428878605365753,
"learning_rate": 3.151729333697854e-05,
"loss": 0.0334,
"num_input_tokens_seen": 268256,
"step": 855,
"train_runtime": 141.0832,
"train_tokens_per_second": 1901.403
},
{
"epoch": 9.555555555555555,
"grad_norm": 0.2163764238357544,
"learning_rate": 3.1282965716233594e-05,
"loss": 0.0095,
"num_input_tokens_seen": 269824,
"step": 860,
"train_runtime": 141.8029,
"train_tokens_per_second": 1902.811
},
{
"epoch": 9.61111111111111,
"grad_norm": 1.6254862546920776,
"learning_rate": 3.104804738999169e-05,
"loss": 0.1594,
"num_input_tokens_seen": 271424,
"step": 865,
"train_runtime": 142.5204,
"train_tokens_per_second": 1904.458
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.29750144481658936,
"learning_rate": 3.0812560444565745e-05,
"loss": 0.0159,
"num_input_tokens_seen": 272960,
"step": 870,
"train_runtime": 143.2426,
"train_tokens_per_second": 1905.578
},
{
"epoch": 9.722222222222221,
"grad_norm": 0.03307567536830902,
"learning_rate": 3.057652701972848e-05,
"loss": 0.0186,
"num_input_tokens_seen": 274528,
"step": 875,
"train_runtime": 143.9668,
"train_tokens_per_second": 1906.885
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.3856286406517029,
"learning_rate": 3.0339969306631005e-05,
"loss": 0.079,
"num_input_tokens_seen": 276128,
"step": 880,
"train_runtime": 144.6918,
"train_tokens_per_second": 1908.387
},
{
"epoch": 9.833333333333334,
"grad_norm": 0.039159130305051804,
"learning_rate": 3.0102909545716396e-05,
"loss": 0.0675,
"num_input_tokens_seen": 277664,
"step": 885,
"train_runtime": 145.4169,
"train_tokens_per_second": 1909.434
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.31276217103004456,
"learning_rate": 2.9865370024628775e-05,
"loss": 0.0866,
"num_input_tokens_seen": 279232,
"step": 890,
"train_runtime": 146.1333,
"train_tokens_per_second": 1910.804
},
{
"epoch": 9.944444444444445,
"grad_norm": 0.19623835384845734,
"learning_rate": 2.9627373076117863e-05,
"loss": 0.1173,
"num_input_tokens_seen": 280768,
"step": 895,
"train_runtime": 146.8573,
"train_tokens_per_second": 1911.842
},
{
"epoch": 10.0,
"grad_norm": 0.009075737558305264,
"learning_rate": 2.9388941075939334e-05,
"loss": 0.0072,
"num_input_tokens_seen": 282368,
"step": 900,
"train_runtime": 147.6123,
"train_tokens_per_second": 1912.904
},
{
"epoch": 10.0,
"eval_loss": 0.06860806792974472,
"eval_runtime": 0.5358,
"eval_samples_per_second": 74.655,
"eval_steps_per_second": 18.664,
"num_input_tokens_seen": 282368,
"step": 900
},
{
"epoch": 10.055555555555555,
"grad_norm": 0.757532000541687,
"learning_rate": 2.9150096440751107e-05,
"loss": 0.0278,
"num_input_tokens_seen": 283936,
"step": 905,
"train_runtime": 149.7123,
"train_tokens_per_second": 1896.544
},
{
"epoch": 10.11111111111111,
"grad_norm": 0.22894321382045746,
"learning_rate": 2.8910861626005776e-05,
"loss": 0.023,
"num_input_tokens_seen": 285504,
"step": 910,
"train_runtime": 150.4441,
"train_tokens_per_second": 1897.742
},
{
"epoch": 10.166666666666666,
"grad_norm": 0.018537739291787148,
"learning_rate": 2.8671259123839472e-05,
"loss": 0.1103,
"num_input_tokens_seen": 287072,
"step": 915,
"train_runtime": 151.1726,
"train_tokens_per_second": 1898.968
},
{
"epoch": 10.222222222222221,
"grad_norm": 0.017388442531228065,
"learning_rate": 2.843131146095719e-05,
"loss": 0.0086,
"num_input_tokens_seen": 288576,
"step": 920,
"train_runtime": 151.9305,
"train_tokens_per_second": 1899.394
},
{
"epoch": 10.277777777777779,
"grad_norm": 1.3574343919754028,
"learning_rate": 2.8191041196514873e-05,
"loss": 0.0723,
"num_input_tokens_seen": 290144,
"step": 925,
"train_runtime": 152.6969,
"train_tokens_per_second": 1900.13
},
{
"epoch": 10.333333333333334,
"grad_norm": 0.005878520663827658,
"learning_rate": 2.795047091999849e-05,
"loss": 0.0798,
"num_input_tokens_seen": 291744,
"step": 930,
"train_runtime": 153.4169,
"train_tokens_per_second": 1901.642
},
{
"epoch": 10.38888888888889,
"grad_norm": 1.1414101123809814,
"learning_rate": 2.770962324910027e-05,
"loss": 0.0842,
"num_input_tokens_seen": 293344,
"step": 935,
"train_runtime": 154.1374,
"train_tokens_per_second": 1903.134
},
{
"epoch": 10.444444444444445,
"grad_norm": 0.028480498120188713,
"learning_rate": 2.7468520827592197e-05,
"loss": 0.0096,
"num_input_tokens_seen": 294912,
"step": 940,
"train_runtime": 154.8553,
"train_tokens_per_second": 1904.436
},
{
"epoch": 10.5,
"grad_norm": 0.46967682242393494,
"learning_rate": 2.7227186323197162e-05,
"loss": 0.0998,
"num_input_tokens_seen": 296480,
"step": 945,
"train_runtime": 155.5729,
"train_tokens_per_second": 1905.731
},
{
"epoch": 10.555555555555555,
"grad_norm": 0.1990341991186142,
"learning_rate": 2.6985642425457757e-05,
"loss": 0.0329,
"num_input_tokens_seen": 298048,
"step": 950,
"train_runtime": 156.289,
"train_tokens_per_second": 1907.031
},
{
"epoch": 10.61111111111111,
"grad_norm": 0.12564487755298615,
"learning_rate": 2.674391184360313e-05,
"loss": 0.0243,
"num_input_tokens_seen": 299648,
"step": 955,
"train_runtime": 157.005,
"train_tokens_per_second": 1908.525
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.9219412207603455,
"learning_rate": 2.650201730441392e-05,
"loss": 0.0245,
"num_input_tokens_seen": 301216,
"step": 960,
"train_runtime": 157.721,
"train_tokens_per_second": 1909.802
},
{
"epoch": 10.722222222222221,
"grad_norm": 0.3017958402633667,
"learning_rate": 2.6259981550085504e-05,
"loss": 0.045,
"num_input_tokens_seen": 302784,
"step": 965,
"train_runtime": 158.4375,
"train_tokens_per_second": 1911.063
},
{
"epoch": 10.777777777777779,
"grad_norm": 0.06184842064976692,
"learning_rate": 2.60178273360899e-05,
"loss": 0.0051,
"num_input_tokens_seen": 304320,
"step": 970,
"train_runtime": 159.1528,
"train_tokens_per_second": 1912.125
},
{
"epoch": 10.833333333333334,
"grad_norm": 0.1001506820321083,
"learning_rate": 2.5775577429036345e-05,
"loss": 0.0531,
"num_input_tokens_seen": 305856,
"step": 975,
"train_runtime": 159.872,
"train_tokens_per_second": 1913.131
},
{
"epoch": 10.88888888888889,
"grad_norm": 1.2442516088485718,
"learning_rate": 2.553325460453086e-05,
"loss": 0.0839,
"num_input_tokens_seen": 307424,
"step": 980,
"train_runtime": 160.5886,
"train_tokens_per_second": 1914.357
},
{
"epoch": 10.944444444444445,
"grad_norm": 0.007865453138947487,
"learning_rate": 2.5290881645034932e-05,
"loss": 0.0476,
"num_input_tokens_seen": 308992,
"step": 985,
"train_runtime": 161.3127,
"train_tokens_per_second": 1915.484
},
{
"epoch": 11.0,
"grad_norm": 0.5944153666496277,
"learning_rate": 2.504848133772358e-05,
"loss": 0.1143,
"num_input_tokens_seen": 310560,
"step": 990,
"train_runtime": 162.0625,
"train_tokens_per_second": 1916.298
},
{
"epoch": 11.0,
"eval_loss": 0.06816816329956055,
"eval_runtime": 0.5339,
"eval_samples_per_second": 74.924,
"eval_steps_per_second": 18.731,
"num_input_tokens_seen": 310560,
"step": 990
},
{
"epoch": 11.055555555555555,
"grad_norm": 0.38990718126296997,
"learning_rate": 2.4806076472342997e-05,
"loss": 0.0147,
"num_input_tokens_seen": 312160,
"step": 995,
"train_runtime": 164.1664,
"train_tokens_per_second": 1901.485
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.10253170877695084,
"learning_rate": 2.4563689839067913e-05,
"loss": 0.0616,
"num_input_tokens_seen": 313728,
"step": 1000,
"train_runtime": 164.9003,
"train_tokens_per_second": 1902.532
},
{
"epoch": 11.166666666666666,
"grad_norm": 0.08587852865457535,
"learning_rate": 2.432134422635893e-05,
"loss": 0.015,
"num_input_tokens_seen": 315264,
"step": 1005,
"train_runtime": 165.6348,
"train_tokens_per_second": 1903.368
},
{
"epoch": 11.222222222222221,
"grad_norm": 0.022479208186268806,
"learning_rate": 2.4079062418820002e-05,
"loss": 0.0089,
"num_input_tokens_seen": 316864,
"step": 1010,
"train_runtime": 166.376,
"train_tokens_per_second": 1904.505
},
{
"epoch": 11.277777777777779,
"grad_norm": 0.12377724051475525,
"learning_rate": 2.3836867195056335e-05,
"loss": 0.0228,
"num_input_tokens_seen": 318432,
"step": 1015,
"train_runtime": 167.1701,
"train_tokens_per_second": 1904.838
},
{
"epoch": 11.333333333333334,
"grad_norm": 0.2768175005912781,
"learning_rate": 2.3594781325532784e-05,
"loss": 0.0939,
"num_input_tokens_seen": 320032,
"step": 1020,
"train_runtime": 167.8931,
"train_tokens_per_second": 1906.165
},
{
"epoch": 11.38888888888889,
"grad_norm": 0.44363412261009216,
"learning_rate": 2.3352827570433036e-05,
"loss": 0.0239,
"num_input_tokens_seen": 321536,
"step": 1025,
"train_runtime": 168.6167,
"train_tokens_per_second": 1906.904
},
{
"epoch": 11.444444444444445,
"grad_norm": 0.22889579832553864,
"learning_rate": 2.3111028677519804e-05,
"loss": 0.0277,
"num_input_tokens_seen": 323040,
"step": 1030,
"train_runtime": 169.3373,
"train_tokens_per_second": 1907.672
},
{
"epoch": 11.5,
"grad_norm": 0.03744177892804146,
"learning_rate": 2.2869407379996088e-05,
"loss": 0.0354,
"num_input_tokens_seen": 324608,
"step": 1035,
"train_runtime": 170.0547,
"train_tokens_per_second": 1908.844
},
{
"epoch": 11.555555555555555,
"grad_norm": 0.27139776945114136,
"learning_rate": 2.2627986394367938e-05,
"loss": 0.0294,
"num_input_tokens_seen": 326144,
"step": 1040,
"train_runtime": 170.7721,
"train_tokens_per_second": 1909.82
},
{
"epoch": 11.61111111111111,
"grad_norm": 0.15249714255332947,
"learning_rate": 2.238678841830867e-05,
"loss": 0.1264,
"num_input_tokens_seen": 327712,
"step": 1045,
"train_runtime": 171.4941,
"train_tokens_per_second": 1910.923
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.1465960144996643,
"learning_rate": 2.2145836128524902e-05,
"loss": 0.0362,
"num_input_tokens_seen": 329248,
"step": 1050,
"train_runtime": 172.2187,
"train_tokens_per_second": 1911.802
},
{
"epoch": 11.722222222222221,
"grad_norm": 0.30058616399765015,
"learning_rate": 2.1905152178624595e-05,
"loss": 0.1676,
"num_input_tokens_seen": 330816,
"step": 1055,
"train_runtime": 172.9388,
"train_tokens_per_second": 1912.908
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.12410334497690201,
"learning_rate": 2.1664759196987182e-05,
"loss": 0.049,
"num_input_tokens_seen": 332416,
"step": 1060,
"train_runtime": 173.661,
"train_tokens_per_second": 1914.166
},
{
"epoch": 11.833333333333334,
"grad_norm": 0.007951710373163223,
"learning_rate": 2.1424679784636144e-05,
"loss": 0.0453,
"num_input_tokens_seen": 334016,
"step": 1065,
"train_runtime": 174.3868,
"train_tokens_per_second": 1915.375
},
{
"epoch": 11.88888888888889,
"grad_norm": 0.2507447600364685,
"learning_rate": 2.118493651311413e-05,
"loss": 0.0451,
"num_input_tokens_seen": 335616,
"step": 1070,
"train_runtime": 175.1045,
"train_tokens_per_second": 1916.661
},
{
"epoch": 11.944444444444445,
"grad_norm": 0.25571751594543457,
"learning_rate": 2.0945551922360818e-05,
"loss": 0.0556,
"num_input_tokens_seen": 337152,
"step": 1075,
"train_runtime": 175.8246,
"train_tokens_per_second": 1917.548
},
{
"epoch": 12.0,
"grad_norm": 0.020534232258796692,
"learning_rate": 2.070654851859383e-05,
"loss": 0.0457,
"num_input_tokens_seen": 338784,
"step": 1080,
"train_runtime": 176.5722,
"train_tokens_per_second": 1918.671
},
{
"epoch": 12.0,
"eval_loss": 0.06687770783901215,
"eval_runtime": 0.5323,
"eval_samples_per_second": 75.144,
"eval_steps_per_second": 18.786,
"num_input_tokens_seen": 338784,
"step": 1080
},
{
"epoch": 12.055555555555555,
"grad_norm": 1.2923614978790283,
"learning_rate": 2.0467948772192713e-05,
"loss": 0.0482,
"num_input_tokens_seen": 340288,
"step": 1085,
"train_runtime": 178.6516,
"train_tokens_per_second": 1904.757
},
{
"epoch": 12.11111111111111,
"grad_norm": 1.6049034595489502,
"learning_rate": 2.022977511558638e-05,
"loss": 0.083,
"num_input_tokens_seen": 341888,
"step": 1090,
"train_runtime": 179.3776,
"train_tokens_per_second": 1905.968
},
{
"epoch": 12.166666666666666,
"grad_norm": 0.49522802233695984,
"learning_rate": 1.9992049941144066e-05,
"loss": 0.0925,
"num_input_tokens_seen": 343488,
"step": 1095,
"train_runtime": 180.1027,
"train_tokens_per_second": 1907.179
},
{
"epoch": 12.222222222222221,
"grad_norm": 0.21042613685131073,
"learning_rate": 1.9754795599070068e-05,
"loss": 0.0725,
"num_input_tokens_seen": 344992,
"step": 1100,
"train_runtime": 180.8284,
"train_tokens_per_second": 1907.842
},
{
"epoch": 12.277777777777779,
"grad_norm": 0.0459279902279377,
"learning_rate": 1.9518034395302414e-05,
"loss": 0.0175,
"num_input_tokens_seen": 346560,
"step": 1105,
"train_runtime": 181.6209,
"train_tokens_per_second": 1908.151
},
{
"epoch": 12.333333333333334,
"grad_norm": 0.42952170968055725,
"learning_rate": 1.9281788589415804e-05,
"loss": 0.029,
"num_input_tokens_seen": 348160,
"step": 1110,
"train_runtime": 182.3469,
"train_tokens_per_second": 1909.327
},
{
"epoch": 12.38888888888889,
"grad_norm": 0.00739190774038434,
"learning_rate": 1.9046080392528735e-05,
"loss": 0.0087,
"num_input_tokens_seen": 349760,
"step": 1115,
"train_runtime": 183.0709,
"train_tokens_per_second": 1910.517
},
{
"epoch": 12.444444444444445,
"grad_norm": 0.7268601655960083,
"learning_rate": 1.8810931965215356e-05,
"loss": 0.0211,
"num_input_tokens_seen": 351328,
"step": 1120,
"train_runtime": 183.788,
"train_tokens_per_second": 1911.593
},
{
"epoch": 12.5,
"grad_norm": 0.021834464743733406,
"learning_rate": 1.857636541542195e-05,
"loss": 0.0294,
"num_input_tokens_seen": 352896,
"step": 1125,
"train_runtime": 184.5069,
"train_tokens_per_second": 1912.644
},
{
"epoch": 12.555555555555555,
"grad_norm": 0.1519467979669571,
"learning_rate": 1.8342402796388445e-05,
"loss": 0.0115,
"num_input_tokens_seen": 354464,
"step": 1130,
"train_runtime": 185.2247,
"train_tokens_per_second": 1913.697
},
{
"epoch": 12.61111111111111,
"grad_norm": 1.5149493217468262,
"learning_rate": 1.8109066104575023e-05,
"loss": 0.0877,
"num_input_tokens_seen": 356032,
"step": 1135,
"train_runtime": 185.9478,
"train_tokens_per_second": 1914.688
},
{
"epoch": 12.666666666666666,
"grad_norm": 0.12189318984746933,
"learning_rate": 1.7876377277594053e-05,
"loss": 0.0222,
"num_input_tokens_seen": 357632,
"step": 1140,
"train_runtime": 186.6713,
"train_tokens_per_second": 1915.838
},
{
"epoch": 12.722222222222221,
"grad_norm": 1.7125587463378906,
"learning_rate": 1.764435819214762e-05,
"loss": 0.1248,
"num_input_tokens_seen": 359168,
"step": 1145,
"train_runtime": 187.4026,
"train_tokens_per_second": 1916.558
},
{
"epoch": 12.777777777777779,
"grad_norm": 1.6973289251327515,
"learning_rate": 1.7413030661970742e-05,
"loss": 0.0736,
"num_input_tokens_seen": 360736,
"step": 1150,
"train_runtime": 188.1281,
"train_tokens_per_second": 1917.501
},
{
"epoch": 12.833333333333334,
"grad_norm": 0.1838073432445526,
"learning_rate": 1.7182416435780454e-05,
"loss": 0.0277,
"num_input_tokens_seen": 362304,
"step": 1155,
"train_runtime": 188.8455,
"train_tokens_per_second": 1918.52
},
{
"epoch": 12.88888888888889,
"grad_norm": 0.34476426243782043,
"learning_rate": 1.695253719523115e-05,
"loss": 0.0162,
"num_input_tokens_seen": 363872,
"step": 1160,
"train_runtime": 189.5628,
"train_tokens_per_second": 1919.533
},
{
"epoch": 12.944444444444445,
"grad_norm": 0.5110657215118408,
"learning_rate": 1.672341455287605e-05,
"loss": 0.047,
"num_input_tokens_seen": 365376,
"step": 1165,
"train_runtime": 190.2824,
"train_tokens_per_second": 1920.178
},
{
"epoch": 13.0,
"grad_norm": 0.016537530347704887,
"learning_rate": 1.649507005013532e-05,
"loss": 0.0551,
"num_input_tokens_seen": 366944,
"step": 1170,
"train_runtime": 191.037,
"train_tokens_per_second": 1920.801
},
{
"epoch": 13.0,
"eval_loss": 0.06656987965106964,
"eval_runtime": 0.5343,
"eval_samples_per_second": 74.869,
"eval_steps_per_second": 18.717,
"num_input_tokens_seen": 366944,
"step": 1170
},
{
"epoch": 13.055555555555555,
"grad_norm": 0.048492301255464554,
"learning_rate": 1.6267525155270773e-05,
"loss": 0.0259,
"num_input_tokens_seen": 368384,
"step": 1175,
"train_runtime": 193.1048,
"train_tokens_per_second": 1907.689
},
{
"epoch": 13.11111111111111,
"grad_norm": 0.009170373901724815,
"learning_rate": 1.6040801261367493e-05,
"loss": 0.0216,
"num_input_tokens_seen": 369984,
"step": 1180,
"train_runtime": 193.8321,
"train_tokens_per_second": 1908.786
},
{
"epoch": 13.166666666666666,
"grad_norm": 1.6876851320266724,
"learning_rate": 1.5814919684322545e-05,
"loss": 0.1097,
"num_input_tokens_seen": 371520,
"step": 1185,
"train_runtime": 194.5588,
"train_tokens_per_second": 1909.552
},
{
"epoch": 13.222222222222221,
"grad_norm": 0.023002078756690025,
"learning_rate": 1.5589901660840896e-05,
"loss": 0.0883,
"num_input_tokens_seen": 373120,
"step": 1190,
"train_runtime": 195.2853,
"train_tokens_per_second": 1910.64
},
{
"epoch": 13.277777777777779,
"grad_norm": 0.3338475227355957,
"learning_rate": 1.5365768346438797e-05,
"loss": 0.023,
"num_input_tokens_seen": 374688,
"step": 1195,
"train_runtime": 196.0748,
"train_tokens_per_second": 1910.944
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.32469648122787476,
"learning_rate": 1.5142540813454836e-05,
"loss": 0.0364,
"num_input_tokens_seen": 376288,
"step": 1200,
"train_runtime": 196.7963,
"train_tokens_per_second": 1912.068
},
{
"epoch": 13.38888888888889,
"grad_norm": 0.16993819177150726,
"learning_rate": 1.4920240049068748e-05,
"loss": 0.0063,
"num_input_tokens_seen": 377888,
"step": 1205,
"train_runtime": 197.5237,
"train_tokens_per_second": 1913.127
},
{
"epoch": 13.444444444444445,
"grad_norm": 0.03197206184267998,
"learning_rate": 1.4698886953328292e-05,
"loss": 0.0037,
"num_input_tokens_seen": 379424,
"step": 1210,
"train_runtime": 198.243,
"train_tokens_per_second": 1913.934
},
{
"epoch": 13.5,
"grad_norm": 0.009404771961271763,
"learning_rate": 1.4478502337184274e-05,
"loss": 0.072,
"num_input_tokens_seen": 380992,
"step": 1215,
"train_runtime": 198.9608,
"train_tokens_per_second": 1914.91
},
{
"epoch": 13.555555555555555,
"grad_norm": 0.730259895324707,
"learning_rate": 1.4259106920533955e-05,
"loss": 0.0251,
"num_input_tokens_seen": 382592,
"step": 1220,
"train_runtime": 199.6822,
"train_tokens_per_second": 1916.005
},
{
"epoch": 13.61111111111111,
"grad_norm": 0.23059076070785522,
"learning_rate": 1.4040721330273062e-05,
"loss": 0.0444,
"num_input_tokens_seen": 384192,
"step": 1225,
"train_runtime": 200.4048,
"train_tokens_per_second": 1917.08
},
{
"epoch": 13.666666666666666,
"grad_norm": 0.990837574005127,
"learning_rate": 1.3823366098356487e-05,
"loss": 0.0375,
"num_input_tokens_seen": 385760,
"step": 1230,
"train_runtime": 201.1286,
"train_tokens_per_second": 1917.977
},
{
"epoch": 13.722222222222221,
"grad_norm": 0.01306883804500103,
"learning_rate": 1.3607061659867892e-05,
"loss": 0.0579,
"num_input_tokens_seen": 387328,
"step": 1235,
"train_runtime": 201.8474,
"train_tokens_per_second": 1918.915
},
{
"epoch": 13.777777777777779,
"grad_norm": 0.020695485174655914,
"learning_rate": 1.3391828351098578e-05,
"loss": 0.0117,
"num_input_tokens_seen": 388896,
"step": 1240,
"train_runtime": 202.5664,
"train_tokens_per_second": 1919.845
},
{
"epoch": 13.833333333333334,
"grad_norm": 0.6323536038398743,
"learning_rate": 1.3177686407635417e-05,
"loss": 0.0299,
"num_input_tokens_seen": 390496,
"step": 1245,
"train_runtime": 203.284,
"train_tokens_per_second": 1920.938
},
{
"epoch": 13.88888888888889,
"grad_norm": 0.011660303920507431,
"learning_rate": 1.29646559624584e-05,
"loss": 0.0102,
"num_input_tokens_seen": 392064,
"step": 1250,
"train_runtime": 204.0027,
"train_tokens_per_second": 1921.856
},
{
"epoch": 13.944444444444445,
"grad_norm": 0.35882309079170227,
"learning_rate": 1.2752757044047827e-05,
"loss": 0.0984,
"num_input_tokens_seen": 393632,
"step": 1255,
"train_runtime": 204.7229,
"train_tokens_per_second": 1922.755
},
{
"epoch": 14.0,
"grad_norm": 1.5153825283050537,
"learning_rate": 1.2542009574501246e-05,
"loss": 0.139,
"num_input_tokens_seen": 395104,
"step": 1260,
"train_runtime": 205.4779,
"train_tokens_per_second": 1922.854
},
{
"epoch": 14.0,
"eval_loss": 0.06650747358798981,
"eval_runtime": 0.5302,
"eval_samples_per_second": 75.448,
"eval_steps_per_second": 18.862,
"num_input_tokens_seen": 395104,
"step": 1260
},
{
"epoch": 14.055555555555555,
"grad_norm": 0.3795919716358185,
"learning_rate": 1.2332433367660442e-05,
"loss": 0.0579,
"num_input_tokens_seen": 396672,
"step": 1265,
"train_runtime": 207.563,
"train_tokens_per_second": 1911.092
},
{
"epoch": 14.11111111111111,
"grad_norm": 0.027463024482131004,
"learning_rate": 1.2124048127248644e-05,
"loss": 0.0444,
"num_input_tokens_seen": 398304,
"step": 1270,
"train_runtime": 208.3001,
"train_tokens_per_second": 1912.164
},
{
"epoch": 14.166666666666666,
"grad_norm": 0.7048236131668091,
"learning_rate": 1.1916873445017982e-05,
"loss": 0.0254,
"num_input_tokens_seen": 399840,
"step": 1275,
"train_runtime": 209.037,
"train_tokens_per_second": 1912.772
},
{
"epoch": 14.222222222222221,
"grad_norm": 0.009322111494839191,
"learning_rate": 1.1710928798907556e-05,
"loss": 0.0749,
"num_input_tokens_seen": 401440,
"step": 1280,
"train_runtime": 209.7672,
"train_tokens_per_second": 1913.74
},
{
"epoch": 14.277777777777779,
"grad_norm": 0.01669895090162754,
"learning_rate": 1.1506233551212186e-05,
"loss": 0.0216,
"num_input_tokens_seen": 403040,
"step": 1285,
"train_runtime": 210.5591,
"train_tokens_per_second": 1914.142
},
{
"epoch": 14.333333333333334,
"grad_norm": 0.504178524017334,
"learning_rate": 1.1302806946762004e-05,
"loss": 0.0221,
"num_input_tokens_seen": 404640,
"step": 1290,
"train_runtime": 211.281,
"train_tokens_per_second": 1915.175
},
{
"epoch": 14.38888888888889,
"grad_norm": 0.4881063997745514,
"learning_rate": 1.1100668111113166e-05,
"loss": 0.0227,
"num_input_tokens_seen": 406208,
"step": 1295,
"train_runtime": 212.0164,
"train_tokens_per_second": 1915.927
},
{
"epoch": 14.444444444444445,
"grad_norm": 0.0046058776788413525,
"learning_rate": 1.0899836048749645e-05,
"loss": 0.0794,
"num_input_tokens_seen": 407776,
"step": 1300,
"train_runtime": 212.743,
"train_tokens_per_second": 1916.754
},
{
"epoch": 14.5,
"grad_norm": 0.3913433849811554,
"learning_rate": 1.0700329641296541e-05,
"loss": 0.0937,
"num_input_tokens_seen": 409312,
"step": 1305,
"train_runtime": 213.4663,
"train_tokens_per_second": 1917.455
},
{
"epoch": 14.555555555555555,
"grad_norm": 0.04091079160571098,
"learning_rate": 1.0502167645744895e-05,
"loss": 0.0034,
"num_input_tokens_seen": 410816,
"step": 1310,
"train_runtime": 214.1934,
"train_tokens_per_second": 1917.968
},
{
"epoch": 14.61111111111111,
"grad_norm": 0.7491574883460999,
"learning_rate": 1.0305368692688174e-05,
"loss": 0.0553,
"num_input_tokens_seen": 412416,
"step": 1315,
"train_runtime": 214.9138,
"train_tokens_per_second": 1918.983
},
{
"epoch": 14.666666666666666,
"grad_norm": 0.09902898222208023,
"learning_rate": 1.01099512845707e-05,
"loss": 0.0155,
"num_input_tokens_seen": 414016,
"step": 1320,
"train_runtime": 215.6296,
"train_tokens_per_second": 1920.034
},
{
"epoch": 14.722222222222221,
"grad_norm": 0.13095584511756897,
"learning_rate": 9.91593379394811e-06,
"loss": 0.0518,
"num_input_tokens_seen": 415552,
"step": 1325,
"train_runtime": 216.3457,
"train_tokens_per_second": 1920.778
},
{
"epoch": 14.777777777777779,
"grad_norm": 0.008948145434260368,
"learning_rate": 9.723334461760006e-06,
"loss": 0.0101,
"num_input_tokens_seen": 417088,
"step": 1330,
"train_runtime": 217.0664,
"train_tokens_per_second": 1921.477
},
{
"epoch": 14.833333333333334,
"grad_norm": 1.4915354251861572,
"learning_rate": 9.532171395615036e-06,
"loss": 0.127,
"num_input_tokens_seen": 418656,
"step": 1335,
"train_runtime": 217.7857,
"train_tokens_per_second": 1922.33
},
{
"epoch": 14.88888888888889,
"grad_norm": 0.006428571883589029,
"learning_rate": 9.342462568088416e-06,
"loss": 0.0352,
"num_input_tokens_seen": 420256,
"step": 1340,
"train_runtime": 218.5045,
"train_tokens_per_second": 1923.329
},
{
"epoch": 14.944444444444445,
"grad_norm": 0.2612595856189728,
"learning_rate": 9.154225815032242e-06,
"loss": 0.0122,
"num_input_tokens_seen": 421792,
"step": 1345,
"train_runtime": 219.2285,
"train_tokens_per_second": 1923.984
},
{
"epoch": 15.0,
"grad_norm": 0.021902482956647873,
"learning_rate": 8.967478833898612e-06,
"loss": 0.0655,
"num_input_tokens_seen": 423360,
"step": 1350,
"train_runtime": 219.9807,
"train_tokens_per_second": 1924.533
},
{
"epoch": 15.0,
"eval_loss": 0.06403695046901703,
"eval_runtime": 0.5355,
"eval_samples_per_second": 74.701,
"eval_steps_per_second": 18.675,
"num_input_tokens_seen": 423360,
"step": 1350
},
{
"epoch": 15.055555555555555,
"grad_norm": 0.0330515094101429,
"learning_rate": 8.78223918207575e-06,
"loss": 0.0151,
"num_input_tokens_seen": 424992,
"step": 1355,
"train_runtime": 222.0766,
"train_tokens_per_second": 1913.718
},
{
"epoch": 15.11111111111111,
"grad_norm": 0.03642648831009865,
"learning_rate": 8.598524275237322e-06,
"loss": 0.0554,
"num_input_tokens_seen": 426528,
"step": 1360,
"train_runtime": 222.8019,
"train_tokens_per_second": 1914.382
},
{
"epoch": 15.166666666666666,
"grad_norm": 0.11118751019239426,
"learning_rate": 8.41635138570507e-06,
"loss": 0.0283,
"num_input_tokens_seen": 428096,
"step": 1365,
"train_runtime": 223.5275,
"train_tokens_per_second": 1915.182
},
{
"epoch": 15.222222222222221,
"grad_norm": 0.015841670334339142,
"learning_rate": 8.235737640824908e-06,
"loss": 0.0262,
"num_input_tokens_seen": 429600,
"step": 1370,
"train_runtime": 224.256,
"train_tokens_per_second": 1915.668
},
{
"epoch": 15.277777777777779,
"grad_norm": 0.7373225688934326,
"learning_rate": 8.056700021356694e-06,
"loss": 0.1276,
"num_input_tokens_seen": 431200,
"step": 1375,
"train_runtime": 225.046,
"train_tokens_per_second": 1916.053
},
{
"epoch": 15.333333333333334,
"grad_norm": 0.047713033854961395,
"learning_rate": 7.879255359877705e-06,
"loss": 0.0159,
"num_input_tokens_seen": 432736,
"step": 1380,
"train_runtime": 225.7705,
"train_tokens_per_second": 1916.707
},
{
"epoch": 15.38888888888889,
"grad_norm": 0.6301924586296082,
"learning_rate": 7.703420339200101e-06,
"loss": 0.0261,
"num_input_tokens_seen": 434336,
"step": 1385,
"train_runtime": 226.5014,
"train_tokens_per_second": 1917.587
},
{
"epoch": 15.444444444444445,
"grad_norm": 0.19205856323242188,
"learning_rate": 7.529211490802498e-06,
"loss": 0.0145,
"num_input_tokens_seen": 435904,
"step": 1390,
"train_runtime": 227.2344,
"train_tokens_per_second": 1918.301
},
{
"epoch": 15.5,
"grad_norm": 1.5189799070358276,
"learning_rate": 7.3566451932756744e-06,
"loss": 0.0874,
"num_input_tokens_seen": 437440,
"step": 1395,
"train_runtime": 227.9626,
"train_tokens_per_second": 1918.911
},
{
"epoch": 15.555555555555555,
"grad_norm": 0.2762519121170044,
"learning_rate": 7.185737670782727e-06,
"loss": 0.0199,
"num_input_tokens_seen": 438976,
"step": 1400,
"train_runtime": 228.6902,
"train_tokens_per_second": 1919.522
},
{
"epoch": 15.61111111111111,
"grad_norm": 0.03604911267757416,
"learning_rate": 7.016504991533726e-06,
"loss": 0.0216,
"num_input_tokens_seen": 440512,
"step": 1405,
"train_runtime": 229.4139,
"train_tokens_per_second": 1920.163
},
{
"epoch": 15.666666666666666,
"grad_norm": 0.008183962665498257,
"learning_rate": 6.848963066275027e-06,
"loss": 0.0666,
"num_input_tokens_seen": 442112,
"step": 1410,
"train_runtime": 230.1355,
"train_tokens_per_second": 1921.095
},
{
"epoch": 15.722222222222221,
"grad_norm": 0.07913227379322052,
"learning_rate": 6.683127646793411e-06,
"loss": 0.027,
"num_input_tokens_seen": 443616,
"step": 1415,
"train_runtime": 230.8622,
"train_tokens_per_second": 1921.562
},
{
"epoch": 15.777777777777779,
"grad_norm": 0.13029180467128754,
"learning_rate": 6.519014324435102e-06,
"loss": 0.1112,
"num_input_tokens_seen": 445184,
"step": 1420,
"train_runtime": 231.5771,
"train_tokens_per_second": 1922.401
},
{
"epoch": 15.833333333333334,
"grad_norm": 0.43220093846321106,
"learning_rate": 6.356638528639955e-06,
"loss": 0.0265,
"num_input_tokens_seen": 446752,
"step": 1425,
"train_runtime": 232.2972,
"train_tokens_per_second": 1923.192
},
{
"epoch": 15.88888888888889,
"grad_norm": 0.008491002023220062,
"learning_rate": 6.196015525490825e-06,
"loss": 0.0415,
"num_input_tokens_seen": 448352,
"step": 1430,
"train_runtime": 233.0234,
"train_tokens_per_second": 1924.064
},
{
"epoch": 15.944444444444445,
"grad_norm": 0.21576499938964844,
"learning_rate": 6.037160416278278e-06,
"loss": 0.0316,
"num_input_tokens_seen": 449888,
"step": 1435,
"train_runtime": 233.7414,
"train_tokens_per_second": 1924.725
},
{
"epoch": 16.0,
"grad_norm": 0.010713284835219383,
"learning_rate": 5.880088136080814e-06,
"loss": 0.0589,
"num_input_tokens_seen": 451424,
"step": 1440,
"train_runtime": 234.4958,
"train_tokens_per_second": 1925.084
},
{
"epoch": 16.0,
"eval_loss": 0.06382830440998077,
"eval_runtime": 0.5348,
"eval_samples_per_second": 74.794,
"eval_steps_per_second": 18.698,
"num_input_tokens_seen": 451424,
"step": 1440
},
{
"epoch": 16.055555555555557,
"grad_norm": 0.01984066516160965,
"learning_rate": 5.724813452360736e-06,
"loss": 0.0216,
"num_input_tokens_seen": 452992,
"step": 1445,
"train_runtime": 236.5497,
"train_tokens_per_second": 1914.997
},
{
"epoch": 16.11111111111111,
"grad_norm": 0.051839593797922134,
"learning_rate": 5.571350963575728e-06,
"loss": 0.019,
"num_input_tokens_seen": 454496,
"step": 1450,
"train_runtime": 237.277,
"train_tokens_per_second": 1915.466
},
{
"epoch": 16.166666666666668,
"grad_norm": 0.21877069771289825,
"learning_rate": 5.4197150978063965e-06,
"loss": 0.1096,
"num_input_tokens_seen": 456096,
"step": 1455,
"train_runtime": 238.0039,
"train_tokens_per_second": 1916.338
},
{
"epoch": 16.22222222222222,
"grad_norm": 0.11044277250766754,
"learning_rate": 5.269920111399732e-06,
"loss": 0.024,
"num_input_tokens_seen": 457696,
"step": 1460,
"train_runtime": 238.7265,
"train_tokens_per_second": 1917.24
},
{
"epoch": 16.27777777777778,
"grad_norm": 0.009204542264342308,
"learning_rate": 5.121980087628803e-06,
"loss": 0.0034,
"num_input_tokens_seen": 459232,
"step": 1465,
"train_runtime": 239.5225,
"train_tokens_per_second": 1917.281
},
{
"epoch": 16.333333333333332,
"grad_norm": 0.09842357784509659,
"learning_rate": 4.975908935368701e-06,
"loss": 0.0446,
"num_input_tokens_seen": 460832,
"step": 1470,
"train_runtime": 240.2512,
"train_tokens_per_second": 1918.126
},
{
"epoch": 16.38888888888889,
"grad_norm": 0.02341254986822605,
"learning_rate": 4.831720387788827e-06,
"loss": 0.0052,
"num_input_tokens_seen": 462432,
"step": 1475,
"train_runtime": 240.9793,
"train_tokens_per_second": 1918.97
},
{
"epoch": 16.444444444444443,
"grad_norm": 0.9226027727127075,
"learning_rate": 4.689428001061774e-06,
"loss": 0.0307,
"num_input_tokens_seen": 464000,
"step": 1480,
"train_runtime": 241.7004,
"train_tokens_per_second": 1919.732
},
{
"epoch": 16.5,
"grad_norm": 1.7933276891708374,
"learning_rate": 4.549045153088813e-06,
"loss": 0.1125,
"num_input_tokens_seen": 465536,
"step": 1485,
"train_runtime": 242.4254,
"train_tokens_per_second": 1920.326
},
{
"epoch": 16.555555555555557,
"grad_norm": 0.010659473948180676,
"learning_rate": 4.410585042242124e-06,
"loss": 0.0177,
"num_input_tokens_seen": 467136,
"step": 1490,
"train_runtime": 243.144,
"train_tokens_per_second": 1921.232
},
{
"epoch": 16.61111111111111,
"grad_norm": 0.09569656848907471,
"learning_rate": 4.274060686123959e-06,
"loss": 0.0152,
"num_input_tokens_seen": 468672,
"step": 1495,
"train_runtime": 243.8646,
"train_tokens_per_second": 1921.853
},
{
"epoch": 16.666666666666668,
"grad_norm": 1.5692845582962036,
"learning_rate": 4.1394849203427284e-06,
"loss": 0.0559,
"num_input_tokens_seen": 470272,
"step": 1500,
"train_runtime": 244.5845,
"train_tokens_per_second": 1922.738
},
{
"epoch": 16.72222222222222,
"grad_norm": 0.8992767333984375,
"learning_rate": 4.006870397306256e-06,
"loss": 0.1462,
"num_input_tokens_seen": 471872,
"step": 1505,
"train_runtime": 245.3043,
"train_tokens_per_second": 1923.619
},
{
"epoch": 16.77777777777778,
"grad_norm": 0.015070810914039612,
"learning_rate": 3.876229585032245e-06,
"loss": 0.0266,
"num_input_tokens_seen": 473440,
"step": 1510,
"train_runtime": 246.02,
"train_tokens_per_second": 1924.397
},
{
"epoch": 16.833333333333332,
"grad_norm": 0.1533195823431015,
"learning_rate": 3.7475747659760502e-06,
"loss": 0.0416,
"num_input_tokens_seen": 475008,
"step": 1515,
"train_runtime": 246.7407,
"train_tokens_per_second": 1925.13
},
{
"epoch": 16.88888888888889,
"grad_norm": 0.4271990656852722,
"learning_rate": 3.6209180358759394e-06,
"loss": 0.0719,
"num_input_tokens_seen": 476608,
"step": 1520,
"train_runtime": 247.4592,
"train_tokens_per_second": 1926.006
},
{
"epoch": 16.944444444444443,
"grad_norm": 0.008633787743747234,
"learning_rate": 3.4962713026158694e-06,
"loss": 0.0058,
"num_input_tokens_seen": 478176,
"step": 1525,
"train_runtime": 248.1747,
"train_tokens_per_second": 1926.772
},
{
"epoch": 17.0,
"grad_norm": 0.9597387313842773,
"learning_rate": 3.373646285105958e-06,
"loss": 0.0388,
"num_input_tokens_seen": 479744,
"step": 1530,
"train_runtime": 248.9317,
"train_tokens_per_second": 1927.212
},
{
"epoch": 17.0,
"eval_loss": 0.06136542558670044,
"eval_runtime": 0.5322,
"eval_samples_per_second": 75.155,
"eval_steps_per_second": 18.789,
"num_input_tokens_seen": 479744,
"step": 1530
},
{
"epoch": 17.055555555555557,
"grad_norm": 0.025216510519385338,
"learning_rate": 3.2530545121807145e-06,
"loss": 0.0248,
"num_input_tokens_seen": 481344,
"step": 1535,
"train_runtime": 251.0216,
"train_tokens_per_second": 1917.54
},
{
"epoch": 17.11111111111111,
"grad_norm": 0.8008201718330383,
"learning_rate": 3.1345073215151066e-06,
"loss": 0.0326,
"num_input_tokens_seen": 482944,
"step": 1540,
"train_runtime": 251.7525,
"train_tokens_per_second": 1918.329
},
{
"epoch": 17.166666666666668,
"grad_norm": 0.010983273386955261,
"learning_rate": 3.0180158585586397e-06,
"loss": 0.0151,
"num_input_tokens_seen": 484480,
"step": 1545,
"train_runtime": 252.4883,
"train_tokens_per_second": 1918.821
},
{
"epoch": 17.22222222222222,
"grad_norm": 0.6954006552696228,
"learning_rate": 2.9035910754875136e-06,
"loss": 0.0588,
"num_input_tokens_seen": 486016,
"step": 1550,
"train_runtime": 253.217,
"train_tokens_per_second": 1919.366
},
{
"epoch": 17.27777777777778,
"grad_norm": 0.2412204146385193,
"learning_rate": 2.7912437301749026e-06,
"loss": 0.0203,
"num_input_tokens_seen": 487584,
"step": 1555,
"train_runtime": 253.9521,
"train_tokens_per_second": 1919.984
},
{
"epoch": 17.333333333333332,
"grad_norm": 0.11864454299211502,
"learning_rate": 2.6809843851795357e-06,
"loss": 0.0815,
"num_input_tokens_seen": 489088,
"step": 1560,
"train_runtime": 254.7449,
"train_tokens_per_second": 1919.913
},
{
"epoch": 17.38888888888889,
"grad_norm": 0.07409928739070892,
"learning_rate": 2.57282340675267e-06,
"loss": 0.0082,
"num_input_tokens_seen": 490688,
"step": 1565,
"train_runtime": 255.4649,
"train_tokens_per_second": 1920.765
},
{
"epoch": 17.444444444444443,
"grad_norm": 0.15076597034931183,
"learning_rate": 2.4667709638634434e-06,
"loss": 0.0798,
"num_input_tokens_seen": 492288,
"step": 1570,
"train_runtime": 256.1878,
"train_tokens_per_second": 1921.59
},
{
"epoch": 17.5,
"grad_norm": 0.7357990741729736,
"learning_rate": 2.3628370272428564e-06,
"loss": 0.0163,
"num_input_tokens_seen": 493824,
"step": 1575,
"train_runtime": 256.9114,
"train_tokens_per_second": 1922.156
},
{
"epoch": 17.555555555555557,
"grad_norm": 0.37361836433410645,
"learning_rate": 2.2610313684463177e-06,
"loss": 0.0241,
"num_input_tokens_seen": 495456,
"step": 1580,
"train_runtime": 257.6329,
"train_tokens_per_second": 1923.108
},
{
"epoch": 17.61111111111111,
"grad_norm": 0.2539951801300049,
"learning_rate": 2.1613635589349756e-06,
"loss": 0.0799,
"num_input_tokens_seen": 497024,
"step": 1585,
"train_runtime": 258.3515,
"train_tokens_per_second": 1923.829
},
{
"epoch": 17.666666666666668,
"grad_norm": 0.278292179107666,
"learning_rate": 2.063842969175847e-06,
"loss": 0.0776,
"num_input_tokens_seen": 498592,
"step": 1590,
"train_runtime": 259.0701,
"train_tokens_per_second": 1924.544
},
{
"epoch": 17.72222222222222,
"grad_norm": 0.16478262841701508,
"learning_rate": 1.968478767760812e-06,
"loss": 0.0617,
"num_input_tokens_seen": 500128,
"step": 1595,
"train_runtime": 259.7856,
"train_tokens_per_second": 1925.156
},
{
"epoch": 17.77777777777778,
"grad_norm": 0.0895620584487915,
"learning_rate": 1.8752799205445982e-06,
"loss": 0.0131,
"num_input_tokens_seen": 501696,
"step": 1600,
"train_runtime": 260.5015,
"train_tokens_per_second": 1925.885
},
{
"epoch": 17.833333333333332,
"grad_norm": 0.46476849913597107,
"learning_rate": 1.784255189801895e-06,
"loss": 0.0508,
"num_input_tokens_seen": 503232,
"step": 1605,
"train_runtime": 261.2201,
"train_tokens_per_second": 1926.467
},
{
"epoch": 17.88888888888889,
"grad_norm": 0.1637742668390274,
"learning_rate": 1.6954131334034922e-06,
"loss": 0.0527,
"num_input_tokens_seen": 504736,
"step": 1610,
"train_runtime": 261.9407,
"train_tokens_per_second": 1926.909
},
{
"epoch": 17.944444444444443,
"grad_norm": 0.03226396441459656,
"learning_rate": 1.6087621040117157e-06,
"loss": 0.0253,
"num_input_tokens_seen": 506304,
"step": 1615,
"train_runtime": 262.6551,
"train_tokens_per_second": 1927.638
},
{
"epoch": 18.0,
"grad_norm": 0.058289699256420135,
"learning_rate": 1.524310248295152e-06,
"loss": 0.0656,
"num_input_tokens_seen": 507872,
"step": 1620,
"train_runtime": 263.4036,
"train_tokens_per_second": 1928.114
},
{
"epoch": 18.0,
"eval_loss": 0.06606583297252655,
"eval_runtime": 0.5352,
"eval_samples_per_second": 74.735,
"eval_steps_per_second": 18.684,
"num_input_tokens_seen": 507872,
"step": 1620
},
{
"epoch": 18.055555555555557,
"grad_norm": 0.4969352185726166,
"learning_rate": 1.4420655061626932e-06,
"loss": 0.0163,
"num_input_tokens_seen": 509408,
"step": 1625,
"train_runtime": 265.4903,
"train_tokens_per_second": 1918.744
},
{
"epoch": 18.11111111111111,
"grad_norm": 0.008122888393700123,
"learning_rate": 1.362035610017079e-06,
"loss": 0.0931,
"num_input_tokens_seen": 510912,
"step": 1630,
"train_runtime": 266.224,
"train_tokens_per_second": 1919.106
},
{
"epoch": 18.166666666666668,
"grad_norm": 0.9085597395896912,
"learning_rate": 1.2842280840278997e-06,
"loss": 0.0386,
"num_input_tokens_seen": 512384,
"step": 1635,
"train_runtime": 266.9589,
"train_tokens_per_second": 1919.337
},
{
"epoch": 18.22222222222222,
"grad_norm": 0.7538381218910217,
"learning_rate": 1.2086502434241865e-06,
"loss": 0.0215,
"num_input_tokens_seen": 513952,
"step": 1640,
"train_runtime": 267.6859,
"train_tokens_per_second": 1919.981
},
{
"epoch": 18.27777777777778,
"grad_norm": 0.017959676682949066,
"learning_rate": 1.1353091938067023e-06,
"loss": 0.0395,
"num_input_tokens_seen": 515520,
"step": 1645,
"train_runtime": 268.4843,
"train_tokens_per_second": 1920.112
},
{
"epoch": 18.333333333333332,
"grad_norm": 0.9613012671470642,
"learning_rate": 1.0642118304798442e-06,
"loss": 0.0392,
"num_input_tokens_seen": 517120,
"step": 1650,
"train_runtime": 269.2113,
"train_tokens_per_second": 1920.87
},
{
"epoch": 18.38888888888889,
"grad_norm": 0.24886614084243774,
"learning_rate": 9.95364837803392e-07,
"loss": 0.1484,
"num_input_tokens_seen": 518688,
"step": 1655,
"train_runtime": 269.9469,
"train_tokens_per_second": 1921.444
},
{
"epoch": 18.444444444444443,
"grad_norm": 0.0767144188284874,
"learning_rate": 9.287746885640603e-07,
"loss": 0.064,
"num_input_tokens_seen": 520224,
"step": 1660,
"train_runtime": 270.6677,
"train_tokens_per_second": 1922.003
},
{
"epoch": 18.5,
"grad_norm": 0.028083791956305504,
"learning_rate": 8.64447643366953e-07,
"loss": 0.0058,
"num_input_tokens_seen": 521824,
"step": 1665,
"train_runtime": 271.3872,
"train_tokens_per_second": 1922.802
},
{
"epoch": 18.555555555555557,
"grad_norm": 0.02712034434080124,
"learning_rate": 8.023897500469391e-07,
"loss": 0.0357,
"num_input_tokens_seen": 523424,
"step": 1670,
"train_runtime": 272.1108,
"train_tokens_per_second": 1923.57
},
{
"epoch": 18.61111111111111,
"grad_norm": 0.21604163944721222,
"learning_rate": 7.426068431000882e-07,
"loss": 0.0135,
"num_input_tokens_seen": 524960,
"step": 1675,
"train_runtime": 272.8372,
"train_tokens_per_second": 1924.078
},
{
"epoch": 18.666666666666668,
"grad_norm": 0.018388254567980766,
"learning_rate": 6.851045431350927e-07,
"loss": 0.0227,
"num_input_tokens_seen": 526496,
"step": 1680,
"train_runtime": 273.564,
"train_tokens_per_second": 1924.581
},
{
"epoch": 18.72222222222222,
"grad_norm": 0.4869348704814911,
"learning_rate": 6.298882563448599e-07,
"loss": 0.0792,
"num_input_tokens_seen": 528064,
"step": 1685,
"train_runtime": 274.2916,
"train_tokens_per_second": 1925.192
},
{
"epoch": 18.77777777777778,
"grad_norm": 0.8644710779190063,
"learning_rate": 5.769631739982267e-07,
"loss": 0.0555,
"num_input_tokens_seen": 529632,
"step": 1690,
"train_runtime": 275.017,
"train_tokens_per_second": 1925.815
},
{
"epoch": 18.833333333333332,
"grad_norm": 0.19972096383571625,
"learning_rate": 5.263342719518921e-07,
"loss": 0.015,
"num_input_tokens_seen": 531232,
"step": 1695,
"train_runtime": 275.7339,
"train_tokens_per_second": 1926.611
},
{
"epoch": 18.88888888888889,
"grad_norm": 0.13014666736125946,
"learning_rate": 4.780063101826132e-07,
"loss": 0.0051,
"num_input_tokens_seen": 532800,
"step": 1700,
"train_runtime": 276.4579,
"train_tokens_per_second": 1927.237
},
{
"epoch": 18.944444444444443,
"grad_norm": 0.4027363955974579,
"learning_rate": 4.319838323396691e-07,
"loss": 0.0241,
"num_input_tokens_seen": 534400,
"step": 1705,
"train_runtime": 277.1752,
"train_tokens_per_second": 1928.023
},
{
"epoch": 19.0,
"grad_norm": 0.22673600912094116,
"learning_rate": 3.88271165317694e-07,
"loss": 0.0614,
"num_input_tokens_seen": 535968,
"step": 1710,
"train_runtime": 277.9233,
"train_tokens_per_second": 1928.475
},
{
"epoch": 19.0,
"eval_loss": 0.06138663738965988,
"eval_runtime": 0.5312,
"eval_samples_per_second": 75.296,
"eval_steps_per_second": 18.824,
"num_input_tokens_seen": 535968,
"step": 1710
},
{
"epoch": 19.055555555555557,
"grad_norm": 0.3861265182495117,
"learning_rate": 3.468724188498751e-07,
"loss": 0.0172,
"num_input_tokens_seen": 537536,
"step": 1715,
"train_runtime": 280.0071,
"train_tokens_per_second": 1919.723
},
{
"epoch": 19.11111111111111,
"grad_norm": 0.4449467957019806,
"learning_rate": 3.077914851215585e-07,
"loss": 0.0148,
"num_input_tokens_seen": 539072,
"step": 1720,
"train_runtime": 280.754,
"train_tokens_per_second": 1920.087
},
{
"epoch": 19.166666666666668,
"grad_norm": 0.09849676489830017,
"learning_rate": 2.71032038404323e-07,
"loss": 0.0522,
"num_input_tokens_seen": 540608,
"step": 1725,
"train_runtime": 281.488,
"train_tokens_per_second": 1920.537
},
{
"epoch": 19.22222222222222,
"grad_norm": 0.3086458146572113,
"learning_rate": 2.365975347105448e-07,
"loss": 0.1071,
"num_input_tokens_seen": 542208,
"step": 1730,
"train_runtime": 282.2214,
"train_tokens_per_second": 1921.215
},
{
"epoch": 19.27777777777778,
"grad_norm": 1.7546472549438477,
"learning_rate": 2.0449121146845774e-07,
"loss": 0.0479,
"num_input_tokens_seen": 543776,
"step": 1735,
"train_runtime": 283.019,
"train_tokens_per_second": 1921.341
},
{
"epoch": 19.333333333333332,
"grad_norm": 1.386121153831482,
"learning_rate": 1.747160872177883e-07,
"loss": 0.0542,
"num_input_tokens_seen": 545280,
"step": 1740,
"train_runtime": 283.7447,
"train_tokens_per_second": 1921.727
},
{
"epoch": 19.38888888888889,
"grad_norm": 0.21310308575630188,
"learning_rate": 1.472749613259661e-07,
"loss": 0.0952,
"num_input_tokens_seen": 546848,
"step": 1745,
"train_runtime": 284.4692,
"train_tokens_per_second": 1922.345
},
{
"epoch": 19.444444444444443,
"grad_norm": 0.484055757522583,
"learning_rate": 1.22170413724923e-07,
"loss": 0.0119,
"num_input_tokens_seen": 548416,
"step": 1750,
"train_runtime": 285.1914,
"train_tokens_per_second": 1922.975
},
{
"epoch": 19.5,
"grad_norm": 0.11522525548934937,
"learning_rate": 9.940480466855417e-08,
"loss": 0.0149,
"num_input_tokens_seen": 550016,
"step": 1755,
"train_runtime": 285.9187,
"train_tokens_per_second": 1923.68
},
{
"epoch": 19.555555555555557,
"grad_norm": 0.22439134120941162,
"learning_rate": 7.898027451078982e-08,
"loss": 0.0109,
"num_input_tokens_seen": 551584,
"step": 1760,
"train_runtime": 286.6439,
"train_tokens_per_second": 1924.283
},
{
"epoch": 19.61111111111111,
"grad_norm": 0.30249354243278503,
"learning_rate": 6.089874350439506e-08,
"loss": 0.0504,
"num_input_tokens_seen": 553152,
"step": 1765,
"train_runtime": 287.3754,
"train_tokens_per_second": 1924.841
},
{
"epoch": 19.666666666666668,
"grad_norm": 1.4332835674285889,
"learning_rate": 4.516191162040051e-08,
"loss": 0.0612,
"num_input_tokens_seen": 554752,
"step": 1770,
"train_runtime": 288.1001,
"train_tokens_per_second": 1925.553
},
{
"epoch": 19.72222222222222,
"grad_norm": 0.00620208540931344,
"learning_rate": 3.177125838830786e-08,
"loss": 0.0222,
"num_input_tokens_seen": 556288,
"step": 1775,
"train_runtime": 288.8278,
"train_tokens_per_second": 1926.02
},
{
"epoch": 19.77777777777778,
"grad_norm": 0.9789679050445557,
"learning_rate": 2.0728042756967824e-08,
"loss": 0.0446,
"num_input_tokens_seen": 557888,
"step": 1780,
"train_runtime": 289.5544,
"train_tokens_per_second": 1926.712
},
{
"epoch": 19.833333333333332,
"grad_norm": 0.006394943222403526,
"learning_rate": 1.2033302976222071e-08,
"loss": 0.0178,
"num_input_tokens_seen": 559424,
"step": 1785,
"train_runtime": 290.2832,
"train_tokens_per_second": 1927.166
},
{
"epoch": 19.88888888888889,
"grad_norm": 0.8413034081459045,
"learning_rate": 5.687856499297928e-09,
"loss": 0.0878,
"num_input_tokens_seen": 560960,
"step": 1790,
"train_runtime": 291.0104,
"train_tokens_per_second": 1927.629
},
{
"epoch": 19.944444444444443,
"grad_norm": 0.5030195116996765,
"learning_rate": 1.692299905944883e-09,
"loss": 0.0524,
"num_input_tokens_seen": 562592,
"step": 1795,
"train_runtime": 291.7296,
"train_tokens_per_second": 1928.471
},
{
"epoch": 20.0,
"grad_norm": 0.01651857979595661,
"learning_rate": 4.700884634611935e-11,
"loss": 0.0111,
"num_input_tokens_seen": 564096,
"step": 1800,
"train_runtime": 292.485,
"train_tokens_per_second": 1928.632
},
{
"epoch": 20.0,
"eval_loss": 0.06419730186462402,
"eval_runtime": 0.5388,
"eval_samples_per_second": 74.238,
"eval_steps_per_second": 18.56,
"num_input_tokens_seen": 564096,
"step": 1800
},
{
"epoch": 20.0,
"num_input_tokens_seen": 564096,
"step": 1800,
"total_flos": 2.540165336137728e+16,
"train_loss": 0.10145172014832497,
"train_runtime": 294.5966,
"train_samples_per_second": 24.44,
"train_steps_per_second": 6.11
}
],
"logging_steps": 5,
"max_steps": 1800,
"num_input_tokens_seen": 564096,
"num_train_epochs": 20,
"save_steps": 90,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.540165336137728e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}