train_openbookqa_1754507498 / trainer_state.json
rbelanec's picture
End of training
f85104a verified
{
"best_global_step": 3348,
"best_metric": 0.24597814679145813,
"best_model_checkpoint": "saves/prompt-tuning/llama-3-8b-instruct/train_openbookqa_1754507498/checkpoint-3348",
"epoch": 10.0,
"eval_steps": 558,
"global_step": 11160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004480286738351254,
"grad_norm": 3.8125,
"learning_rate": 1.7921146953405018e-07,
"loss": 0.7127,
"num_input_tokens_seen": 1792,
"step": 5
},
{
"epoch": 0.008960573476702509,
"grad_norm": 8.4375,
"learning_rate": 4.032258064516129e-07,
"loss": 0.5934,
"num_input_tokens_seen": 3776,
"step": 10
},
{
"epoch": 0.013440860215053764,
"grad_norm": 15.5,
"learning_rate": 6.272401433691756e-07,
"loss": 1.1274,
"num_input_tokens_seen": 5632,
"step": 15
},
{
"epoch": 0.017921146953405017,
"grad_norm": 13.4375,
"learning_rate": 8.512544802867385e-07,
"loss": 0.5251,
"num_input_tokens_seen": 7392,
"step": 20
},
{
"epoch": 0.022401433691756272,
"grad_norm": 5.4375,
"learning_rate": 1.0752688172043011e-06,
"loss": 0.7529,
"num_input_tokens_seen": 9312,
"step": 25
},
{
"epoch": 0.026881720430107527,
"grad_norm": 5.53125,
"learning_rate": 1.2992831541218638e-06,
"loss": 0.6538,
"num_input_tokens_seen": 11104,
"step": 30
},
{
"epoch": 0.03136200716845878,
"grad_norm": 7.09375,
"learning_rate": 1.5232974910394266e-06,
"loss": 0.4464,
"num_input_tokens_seen": 12928,
"step": 35
},
{
"epoch": 0.035842293906810034,
"grad_norm": 9.375,
"learning_rate": 1.7473118279569893e-06,
"loss": 0.6928,
"num_input_tokens_seen": 14816,
"step": 40
},
{
"epoch": 0.04032258064516129,
"grad_norm": 7.65625,
"learning_rate": 1.971326164874552e-06,
"loss": 0.627,
"num_input_tokens_seen": 16832,
"step": 45
},
{
"epoch": 0.044802867383512544,
"grad_norm": 5.25,
"learning_rate": 2.1953405017921145e-06,
"loss": 0.9248,
"num_input_tokens_seen": 18560,
"step": 50
},
{
"epoch": 0.0492831541218638,
"grad_norm": 10.75,
"learning_rate": 2.4193548387096776e-06,
"loss": 1.1119,
"num_input_tokens_seen": 20416,
"step": 55
},
{
"epoch": 0.053763440860215055,
"grad_norm": 15.3125,
"learning_rate": 2.6433691756272402e-06,
"loss": 0.44,
"num_input_tokens_seen": 22144,
"step": 60
},
{
"epoch": 0.05824372759856631,
"grad_norm": 12.3125,
"learning_rate": 2.867383512544803e-06,
"loss": 0.8532,
"num_input_tokens_seen": 24064,
"step": 65
},
{
"epoch": 0.06272401433691756,
"grad_norm": 4.78125,
"learning_rate": 3.091397849462366e-06,
"loss": 0.4425,
"num_input_tokens_seen": 25920,
"step": 70
},
{
"epoch": 0.06720430107526881,
"grad_norm": 15.125,
"learning_rate": 3.3154121863799286e-06,
"loss": 0.3559,
"num_input_tokens_seen": 27744,
"step": 75
},
{
"epoch": 0.07168458781362007,
"grad_norm": 2.90625,
"learning_rate": 3.5394265232974912e-06,
"loss": 0.4622,
"num_input_tokens_seen": 29664,
"step": 80
},
{
"epoch": 0.07616487455197132,
"grad_norm": 11.5,
"learning_rate": 3.763440860215054e-06,
"loss": 0.655,
"num_input_tokens_seen": 31552,
"step": 85
},
{
"epoch": 0.08064516129032258,
"grad_norm": 7.96875,
"learning_rate": 3.987455197132617e-06,
"loss": 1.2417,
"num_input_tokens_seen": 33536,
"step": 90
},
{
"epoch": 0.08512544802867383,
"grad_norm": 17.0,
"learning_rate": 4.21146953405018e-06,
"loss": 0.9778,
"num_input_tokens_seen": 35360,
"step": 95
},
{
"epoch": 0.08960573476702509,
"grad_norm": 1.6875,
"learning_rate": 4.435483870967742e-06,
"loss": 0.4708,
"num_input_tokens_seen": 37216,
"step": 100
},
{
"epoch": 0.09408602150537634,
"grad_norm": 5.0625,
"learning_rate": 4.659498207885305e-06,
"loss": 0.3768,
"num_input_tokens_seen": 39104,
"step": 105
},
{
"epoch": 0.0985663082437276,
"grad_norm": 13.5625,
"learning_rate": 4.883512544802868e-06,
"loss": 0.8014,
"num_input_tokens_seen": 40960,
"step": 110
},
{
"epoch": 0.10304659498207885,
"grad_norm": 11.0,
"learning_rate": 5.1075268817204305e-06,
"loss": 0.5901,
"num_input_tokens_seen": 42880,
"step": 115
},
{
"epoch": 0.10752688172043011,
"grad_norm": 5.90625,
"learning_rate": 5.331541218637993e-06,
"loss": 0.939,
"num_input_tokens_seen": 44768,
"step": 120
},
{
"epoch": 0.11200716845878136,
"grad_norm": 11.0625,
"learning_rate": 5.555555555555556e-06,
"loss": 0.5213,
"num_input_tokens_seen": 46528,
"step": 125
},
{
"epoch": 0.11648745519713262,
"grad_norm": 7.59375,
"learning_rate": 5.779569892473118e-06,
"loss": 0.6191,
"num_input_tokens_seen": 48544,
"step": 130
},
{
"epoch": 0.12096774193548387,
"grad_norm": 7.40625,
"learning_rate": 6.003584229390681e-06,
"loss": 0.9498,
"num_input_tokens_seen": 50432,
"step": 135
},
{
"epoch": 0.12544802867383512,
"grad_norm": 5.21875,
"learning_rate": 6.227598566308244e-06,
"loss": 0.5463,
"num_input_tokens_seen": 52416,
"step": 140
},
{
"epoch": 0.12992831541218638,
"grad_norm": 5.53125,
"learning_rate": 6.451612903225806e-06,
"loss": 0.3972,
"num_input_tokens_seen": 54304,
"step": 145
},
{
"epoch": 0.13440860215053763,
"grad_norm": 5.25,
"learning_rate": 6.67562724014337e-06,
"loss": 1.0382,
"num_input_tokens_seen": 56256,
"step": 150
},
{
"epoch": 0.1388888888888889,
"grad_norm": 4.96875,
"learning_rate": 6.8996415770609325e-06,
"loss": 0.2313,
"num_input_tokens_seen": 58144,
"step": 155
},
{
"epoch": 0.14336917562724014,
"grad_norm": 4.375,
"learning_rate": 7.1236559139784956e-06,
"loss": 0.3777,
"num_input_tokens_seen": 60032,
"step": 160
},
{
"epoch": 0.1478494623655914,
"grad_norm": 3.46875,
"learning_rate": 7.347670250896058e-06,
"loss": 0.2322,
"num_input_tokens_seen": 61824,
"step": 165
},
{
"epoch": 0.15232974910394265,
"grad_norm": 13.5625,
"learning_rate": 7.571684587813621e-06,
"loss": 0.8871,
"num_input_tokens_seen": 63712,
"step": 170
},
{
"epoch": 0.15681003584229392,
"grad_norm": 9.0625,
"learning_rate": 7.795698924731183e-06,
"loss": 0.4732,
"num_input_tokens_seen": 65600,
"step": 175
},
{
"epoch": 0.16129032258064516,
"grad_norm": 6.96875,
"learning_rate": 8.019713261648744e-06,
"loss": 0.7266,
"num_input_tokens_seen": 67392,
"step": 180
},
{
"epoch": 0.16577060931899643,
"grad_norm": 3.28125,
"learning_rate": 8.24372759856631e-06,
"loss": 0.4565,
"num_input_tokens_seen": 69120,
"step": 185
},
{
"epoch": 0.17025089605734767,
"grad_norm": 11.875,
"learning_rate": 8.46774193548387e-06,
"loss": 0.5642,
"num_input_tokens_seen": 71008,
"step": 190
},
{
"epoch": 0.17473118279569894,
"grad_norm": 13.125,
"learning_rate": 8.691756272401434e-06,
"loss": 0.4376,
"num_input_tokens_seen": 72896,
"step": 195
},
{
"epoch": 0.17921146953405018,
"grad_norm": 6.0,
"learning_rate": 8.915770609318997e-06,
"loss": 0.6527,
"num_input_tokens_seen": 74880,
"step": 200
},
{
"epoch": 0.18369175627240145,
"grad_norm": 2.703125,
"learning_rate": 9.13978494623656e-06,
"loss": 0.5218,
"num_input_tokens_seen": 76768,
"step": 205
},
{
"epoch": 0.1881720430107527,
"grad_norm": 3.53125,
"learning_rate": 9.363799283154121e-06,
"loss": 0.5326,
"num_input_tokens_seen": 78656,
"step": 210
},
{
"epoch": 0.19265232974910393,
"grad_norm": 9.25,
"learning_rate": 9.587813620071686e-06,
"loss": 0.7109,
"num_input_tokens_seen": 80512,
"step": 215
},
{
"epoch": 0.1971326164874552,
"grad_norm": 10.0,
"learning_rate": 9.811827956989247e-06,
"loss": 0.4761,
"num_input_tokens_seen": 82240,
"step": 220
},
{
"epoch": 0.20161290322580644,
"grad_norm": 6.25,
"learning_rate": 1.003584229390681e-05,
"loss": 0.5844,
"num_input_tokens_seen": 84288,
"step": 225
},
{
"epoch": 0.2060931899641577,
"grad_norm": 1.8359375,
"learning_rate": 1.0259856630824373e-05,
"loss": 0.2788,
"num_input_tokens_seen": 86240,
"step": 230
},
{
"epoch": 0.21057347670250895,
"grad_norm": 2.375,
"learning_rate": 1.0483870967741936e-05,
"loss": 0.1649,
"num_input_tokens_seen": 87968,
"step": 235
},
{
"epoch": 0.21505376344086022,
"grad_norm": 3.078125,
"learning_rate": 1.0707885304659498e-05,
"loss": 0.4605,
"num_input_tokens_seen": 89920,
"step": 240
},
{
"epoch": 0.21953405017921146,
"grad_norm": 8.125,
"learning_rate": 1.0931899641577063e-05,
"loss": 0.4359,
"num_input_tokens_seen": 91776,
"step": 245
},
{
"epoch": 0.22401433691756273,
"grad_norm": 2.40625,
"learning_rate": 1.1155913978494624e-05,
"loss": 0.2946,
"num_input_tokens_seen": 93728,
"step": 250
},
{
"epoch": 0.22849462365591397,
"grad_norm": 2.8125,
"learning_rate": 1.1379928315412187e-05,
"loss": 0.3842,
"num_input_tokens_seen": 95552,
"step": 255
},
{
"epoch": 0.23297491039426524,
"grad_norm": 7.65625,
"learning_rate": 1.160394265232975e-05,
"loss": 0.567,
"num_input_tokens_seen": 97440,
"step": 260
},
{
"epoch": 0.23745519713261648,
"grad_norm": 5.53125,
"learning_rate": 1.1827956989247313e-05,
"loss": 0.3364,
"num_input_tokens_seen": 99296,
"step": 265
},
{
"epoch": 0.24193548387096775,
"grad_norm": 6.3125,
"learning_rate": 1.2051971326164874e-05,
"loss": 0.1801,
"num_input_tokens_seen": 101184,
"step": 270
},
{
"epoch": 0.246415770609319,
"grad_norm": 8.5625,
"learning_rate": 1.227598566308244e-05,
"loss": 0.6582,
"num_input_tokens_seen": 103136,
"step": 275
},
{
"epoch": 0.25089605734767023,
"grad_norm": 3.0,
"learning_rate": 1.25e-05,
"loss": 0.7251,
"num_input_tokens_seen": 105024,
"step": 280
},
{
"epoch": 0.2553763440860215,
"grad_norm": 9.9375,
"learning_rate": 1.2724014336917564e-05,
"loss": 0.4209,
"num_input_tokens_seen": 106880,
"step": 285
},
{
"epoch": 0.25985663082437277,
"grad_norm": 5.65625,
"learning_rate": 1.2948028673835125e-05,
"loss": 0.4534,
"num_input_tokens_seen": 108640,
"step": 290
},
{
"epoch": 0.26433691756272404,
"grad_norm": 2.96875,
"learning_rate": 1.3172043010752688e-05,
"loss": 0.4958,
"num_input_tokens_seen": 110528,
"step": 295
},
{
"epoch": 0.26881720430107525,
"grad_norm": 14.6875,
"learning_rate": 1.3396057347670251e-05,
"loss": 1.3137,
"num_input_tokens_seen": 112480,
"step": 300
},
{
"epoch": 0.2732974910394265,
"grad_norm": 7.6875,
"learning_rate": 1.3620071684587816e-05,
"loss": 0.8399,
"num_input_tokens_seen": 114368,
"step": 305
},
{
"epoch": 0.2777777777777778,
"grad_norm": 4.5,
"learning_rate": 1.3844086021505376e-05,
"loss": 0.6965,
"num_input_tokens_seen": 116352,
"step": 310
},
{
"epoch": 0.28225806451612906,
"grad_norm": 14.25,
"learning_rate": 1.4068100358422939e-05,
"loss": 0.7252,
"num_input_tokens_seen": 118272,
"step": 315
},
{
"epoch": 0.2867383512544803,
"grad_norm": 8.875,
"learning_rate": 1.4292114695340503e-05,
"loss": 0.7915,
"num_input_tokens_seen": 120192,
"step": 320
},
{
"epoch": 0.29121863799283154,
"grad_norm": 14.375,
"learning_rate": 1.4516129032258066e-05,
"loss": 0.3508,
"num_input_tokens_seen": 121952,
"step": 325
},
{
"epoch": 0.2956989247311828,
"grad_norm": 12.3125,
"learning_rate": 1.4740143369175626e-05,
"loss": 0.5308,
"num_input_tokens_seen": 123712,
"step": 330
},
{
"epoch": 0.300179211469534,
"grad_norm": 8.375,
"learning_rate": 1.4964157706093191e-05,
"loss": 0.353,
"num_input_tokens_seen": 125568,
"step": 335
},
{
"epoch": 0.3046594982078853,
"grad_norm": 5.9375,
"learning_rate": 1.5188172043010754e-05,
"loss": 0.3982,
"num_input_tokens_seen": 127584,
"step": 340
},
{
"epoch": 0.30913978494623656,
"grad_norm": 4.09375,
"learning_rate": 1.5412186379928317e-05,
"loss": 0.0836,
"num_input_tokens_seen": 129440,
"step": 345
},
{
"epoch": 0.31362007168458783,
"grad_norm": 1.734375,
"learning_rate": 1.563620071684588e-05,
"loss": 0.2822,
"num_input_tokens_seen": 131360,
"step": 350
},
{
"epoch": 0.31810035842293904,
"grad_norm": 8.625,
"learning_rate": 1.586021505376344e-05,
"loss": 0.2722,
"num_input_tokens_seen": 133312,
"step": 355
},
{
"epoch": 0.3225806451612903,
"grad_norm": 5.9375,
"learning_rate": 1.6084229390681005e-05,
"loss": 0.2362,
"num_input_tokens_seen": 135200,
"step": 360
},
{
"epoch": 0.3270609318996416,
"grad_norm": 15.4375,
"learning_rate": 1.630824372759857e-05,
"loss": 0.677,
"num_input_tokens_seen": 136896,
"step": 365
},
{
"epoch": 0.33154121863799285,
"grad_norm": 5.03125,
"learning_rate": 1.653225806451613e-05,
"loss": 0.168,
"num_input_tokens_seen": 139008,
"step": 370
},
{
"epoch": 0.33602150537634407,
"grad_norm": 3.734375,
"learning_rate": 1.6756272401433692e-05,
"loss": 0.2682,
"num_input_tokens_seen": 140832,
"step": 375
},
{
"epoch": 0.34050179211469533,
"grad_norm": 5.78125,
"learning_rate": 1.6980286738351257e-05,
"loss": 0.315,
"num_input_tokens_seen": 142688,
"step": 380
},
{
"epoch": 0.3449820788530466,
"grad_norm": 1.125,
"learning_rate": 1.7204301075268818e-05,
"loss": 0.4681,
"num_input_tokens_seen": 144640,
"step": 385
},
{
"epoch": 0.34946236559139787,
"grad_norm": 3.8125,
"learning_rate": 1.742831541218638e-05,
"loss": 0.239,
"num_input_tokens_seen": 146496,
"step": 390
},
{
"epoch": 0.3539426523297491,
"grad_norm": 2.75,
"learning_rate": 1.7652329749103944e-05,
"loss": 0.6051,
"num_input_tokens_seen": 148288,
"step": 395
},
{
"epoch": 0.35842293906810035,
"grad_norm": 6.4375,
"learning_rate": 1.7876344086021506e-05,
"loss": 0.4644,
"num_input_tokens_seen": 150208,
"step": 400
},
{
"epoch": 0.3629032258064516,
"grad_norm": 0.109375,
"learning_rate": 1.8100358422939067e-05,
"loss": 0.4193,
"num_input_tokens_seen": 152128,
"step": 405
},
{
"epoch": 0.3673835125448029,
"grad_norm": 12.625,
"learning_rate": 1.8324372759856632e-05,
"loss": 0.7114,
"num_input_tokens_seen": 153728,
"step": 410
},
{
"epoch": 0.3718637992831541,
"grad_norm": 15.875,
"learning_rate": 1.8548387096774193e-05,
"loss": 0.5758,
"num_input_tokens_seen": 155648,
"step": 415
},
{
"epoch": 0.3763440860215054,
"grad_norm": 0.12109375,
"learning_rate": 1.8772401433691758e-05,
"loss": 0.6792,
"num_input_tokens_seen": 157568,
"step": 420
},
{
"epoch": 0.38082437275985664,
"grad_norm": 8.9375,
"learning_rate": 1.899641577060932e-05,
"loss": 0.4102,
"num_input_tokens_seen": 159392,
"step": 425
},
{
"epoch": 0.38530465949820786,
"grad_norm": 8.5625,
"learning_rate": 1.922043010752688e-05,
"loss": 0.5033,
"num_input_tokens_seen": 161312,
"step": 430
},
{
"epoch": 0.3897849462365591,
"grad_norm": 6.4375,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.5428,
"num_input_tokens_seen": 163168,
"step": 435
},
{
"epoch": 0.3942652329749104,
"grad_norm": 15.5,
"learning_rate": 1.966845878136201e-05,
"loss": 0.1898,
"num_input_tokens_seen": 165120,
"step": 440
},
{
"epoch": 0.39874551971326166,
"grad_norm": 6.1875,
"learning_rate": 1.989247311827957e-05,
"loss": 0.6915,
"num_input_tokens_seen": 166976,
"step": 445
},
{
"epoch": 0.4032258064516129,
"grad_norm": 4.09375,
"learning_rate": 2.0116487455197133e-05,
"loss": 0.387,
"num_input_tokens_seen": 168768,
"step": 450
},
{
"epoch": 0.40770609318996415,
"grad_norm": 15.875,
"learning_rate": 2.0340501792114698e-05,
"loss": 0.6329,
"num_input_tokens_seen": 170592,
"step": 455
},
{
"epoch": 0.4121863799283154,
"grad_norm": 13.3125,
"learning_rate": 2.056451612903226e-05,
"loss": 0.1658,
"num_input_tokens_seen": 172736,
"step": 460
},
{
"epoch": 0.4166666666666667,
"grad_norm": 12.6875,
"learning_rate": 2.078853046594982e-05,
"loss": 0.6479,
"num_input_tokens_seen": 174592,
"step": 465
},
{
"epoch": 0.4211469534050179,
"grad_norm": 6.09375,
"learning_rate": 2.1012544802867385e-05,
"loss": 0.6009,
"num_input_tokens_seen": 176704,
"step": 470
},
{
"epoch": 0.42562724014336917,
"grad_norm": 11.75,
"learning_rate": 2.1236559139784946e-05,
"loss": 0.3966,
"num_input_tokens_seen": 178816,
"step": 475
},
{
"epoch": 0.43010752688172044,
"grad_norm": 11.0625,
"learning_rate": 2.146057347670251e-05,
"loss": 0.3061,
"num_input_tokens_seen": 180608,
"step": 480
},
{
"epoch": 0.4345878136200717,
"grad_norm": 5.875,
"learning_rate": 2.1684587813620073e-05,
"loss": 0.7979,
"num_input_tokens_seen": 182368,
"step": 485
},
{
"epoch": 0.4390681003584229,
"grad_norm": 0.83203125,
"learning_rate": 2.1908602150537634e-05,
"loss": 0.327,
"num_input_tokens_seen": 184128,
"step": 490
},
{
"epoch": 0.4435483870967742,
"grad_norm": 6.875,
"learning_rate": 2.21326164874552e-05,
"loss": 0.4103,
"num_input_tokens_seen": 186144,
"step": 495
},
{
"epoch": 0.44802867383512546,
"grad_norm": 1.25,
"learning_rate": 2.235663082437276e-05,
"loss": 0.3142,
"num_input_tokens_seen": 188160,
"step": 500
},
{
"epoch": 0.4525089605734767,
"grad_norm": 4.78125,
"learning_rate": 2.258064516129032e-05,
"loss": 0.3128,
"num_input_tokens_seen": 190016,
"step": 505
},
{
"epoch": 0.45698924731182794,
"grad_norm": 6.21875,
"learning_rate": 2.2804659498207886e-05,
"loss": 0.822,
"num_input_tokens_seen": 191840,
"step": 510
},
{
"epoch": 0.4614695340501792,
"grad_norm": 5.6875,
"learning_rate": 2.302867383512545e-05,
"loss": 0.3575,
"num_input_tokens_seen": 193920,
"step": 515
},
{
"epoch": 0.4659498207885305,
"grad_norm": 5.3125,
"learning_rate": 2.325268817204301e-05,
"loss": 0.6149,
"num_input_tokens_seen": 195744,
"step": 520
},
{
"epoch": 0.47043010752688175,
"grad_norm": 9.4375,
"learning_rate": 2.3476702508960574e-05,
"loss": 0.2299,
"num_input_tokens_seen": 197632,
"step": 525
},
{
"epoch": 0.47491039426523296,
"grad_norm": 8.875,
"learning_rate": 2.370071684587814e-05,
"loss": 0.213,
"num_input_tokens_seen": 199424,
"step": 530
},
{
"epoch": 0.47939068100358423,
"grad_norm": 8.0,
"learning_rate": 2.39247311827957e-05,
"loss": 0.5455,
"num_input_tokens_seen": 201280,
"step": 535
},
{
"epoch": 0.4838709677419355,
"grad_norm": 4.625,
"learning_rate": 2.414874551971326e-05,
"loss": 0.5224,
"num_input_tokens_seen": 203072,
"step": 540
},
{
"epoch": 0.4883512544802867,
"grad_norm": 9.875,
"learning_rate": 2.4372759856630826e-05,
"loss": 0.6619,
"num_input_tokens_seen": 204992,
"step": 545
},
{
"epoch": 0.492831541218638,
"grad_norm": 9.875,
"learning_rate": 2.4596774193548387e-05,
"loss": 0.4848,
"num_input_tokens_seen": 206912,
"step": 550
},
{
"epoch": 0.49731182795698925,
"grad_norm": 35.75,
"learning_rate": 2.4820788530465952e-05,
"loss": 0.3495,
"num_input_tokens_seen": 208864,
"step": 555
},
{
"epoch": 0.5,
"eval_loss": 0.47937411069869995,
"eval_runtime": 9.6368,
"eval_samples_per_second": 51.469,
"eval_steps_per_second": 12.867,
"num_input_tokens_seen": 210048,
"step": 558
},
{
"epoch": 0.5017921146953405,
"grad_norm": 8.4375,
"learning_rate": 2.5044802867383517e-05,
"loss": 0.4117,
"num_input_tokens_seen": 210816,
"step": 560
},
{
"epoch": 0.5062724014336918,
"grad_norm": 18.0,
"learning_rate": 2.5268817204301075e-05,
"loss": 0.5386,
"num_input_tokens_seen": 212640,
"step": 565
},
{
"epoch": 0.510752688172043,
"grad_norm": 7.21875,
"learning_rate": 2.5492831541218636e-05,
"loss": 0.4978,
"num_input_tokens_seen": 214560,
"step": 570
},
{
"epoch": 0.5152329749103942,
"grad_norm": 2.796875,
"learning_rate": 2.5716845878136204e-05,
"loss": 0.4296,
"num_input_tokens_seen": 216384,
"step": 575
},
{
"epoch": 0.5197132616487455,
"grad_norm": 25.125,
"learning_rate": 2.5940860215053762e-05,
"loss": 0.3333,
"num_input_tokens_seen": 218304,
"step": 580
},
{
"epoch": 0.5241935483870968,
"grad_norm": 21.375,
"learning_rate": 2.616487455197133e-05,
"loss": 0.4829,
"num_input_tokens_seen": 220128,
"step": 585
},
{
"epoch": 0.5286738351254481,
"grad_norm": 0.97265625,
"learning_rate": 2.6388888888888892e-05,
"loss": 0.4184,
"num_input_tokens_seen": 222016,
"step": 590
},
{
"epoch": 0.5331541218637993,
"grad_norm": 9.0,
"learning_rate": 2.661290322580645e-05,
"loss": 0.582,
"num_input_tokens_seen": 224000,
"step": 595
},
{
"epoch": 0.5376344086021505,
"grad_norm": 0.95703125,
"learning_rate": 2.6836917562724018e-05,
"loss": 0.1031,
"num_input_tokens_seen": 225920,
"step": 600
},
{
"epoch": 0.5421146953405018,
"grad_norm": 16.625,
"learning_rate": 2.706093189964158e-05,
"loss": 0.3882,
"num_input_tokens_seen": 227712,
"step": 605
},
{
"epoch": 0.546594982078853,
"grad_norm": 13.0,
"learning_rate": 2.728494623655914e-05,
"loss": 0.6183,
"num_input_tokens_seen": 229824,
"step": 610
},
{
"epoch": 0.5510752688172043,
"grad_norm": 19.0,
"learning_rate": 2.7508960573476705e-05,
"loss": 0.3255,
"num_input_tokens_seen": 231840,
"step": 615
},
{
"epoch": 0.5555555555555556,
"grad_norm": 13.5625,
"learning_rate": 2.7732974910394267e-05,
"loss": 0.4895,
"num_input_tokens_seen": 233856,
"step": 620
},
{
"epoch": 0.5600358422939068,
"grad_norm": 12.6875,
"learning_rate": 2.7956989247311828e-05,
"loss": 0.4306,
"num_input_tokens_seen": 235680,
"step": 625
},
{
"epoch": 0.5645161290322581,
"grad_norm": 1.078125,
"learning_rate": 2.8181003584229393e-05,
"loss": 0.3221,
"num_input_tokens_seen": 237536,
"step": 630
},
{
"epoch": 0.5689964157706093,
"grad_norm": 0.2158203125,
"learning_rate": 2.8405017921146954e-05,
"loss": 0.4984,
"num_input_tokens_seen": 239488,
"step": 635
},
{
"epoch": 0.5734767025089605,
"grad_norm": 20.75,
"learning_rate": 2.862903225806452e-05,
"loss": 0.4833,
"num_input_tokens_seen": 241376,
"step": 640
},
{
"epoch": 0.5779569892473119,
"grad_norm": 31.125,
"learning_rate": 2.885304659498208e-05,
"loss": 0.1616,
"num_input_tokens_seen": 243200,
"step": 645
},
{
"epoch": 0.5824372759856631,
"grad_norm": 6.125,
"learning_rate": 2.9077060931899642e-05,
"loss": 0.4352,
"num_input_tokens_seen": 245056,
"step": 650
},
{
"epoch": 0.5869175627240143,
"grad_norm": 23.125,
"learning_rate": 2.9301075268817207e-05,
"loss": 0.2331,
"num_input_tokens_seen": 246880,
"step": 655
},
{
"epoch": 0.5913978494623656,
"grad_norm": 2.390625,
"learning_rate": 2.9525089605734768e-05,
"loss": 0.51,
"num_input_tokens_seen": 248832,
"step": 660
},
{
"epoch": 0.5958781362007168,
"grad_norm": 23.875,
"learning_rate": 2.974910394265233e-05,
"loss": 1.0846,
"num_input_tokens_seen": 250880,
"step": 665
},
{
"epoch": 0.600358422939068,
"grad_norm": 14.125,
"learning_rate": 2.9973118279569894e-05,
"loss": 0.3097,
"num_input_tokens_seen": 252864,
"step": 670
},
{
"epoch": 0.6048387096774194,
"grad_norm": 11.6875,
"learning_rate": 3.0197132616487455e-05,
"loss": 0.3327,
"num_input_tokens_seen": 254656,
"step": 675
},
{
"epoch": 0.6093189964157706,
"grad_norm": 42.0,
"learning_rate": 3.0421146953405024e-05,
"loss": 0.1946,
"num_input_tokens_seen": 256640,
"step": 680
},
{
"epoch": 0.6137992831541219,
"grad_norm": 4.5,
"learning_rate": 3.0645161290322585e-05,
"loss": 0.1472,
"num_input_tokens_seen": 258560,
"step": 685
},
{
"epoch": 0.6182795698924731,
"grad_norm": 10.875,
"learning_rate": 3.0869175627240146e-05,
"loss": 0.2914,
"num_input_tokens_seen": 260544,
"step": 690
},
{
"epoch": 0.6227598566308243,
"grad_norm": 13.3125,
"learning_rate": 3.109318996415771e-05,
"loss": 0.4216,
"num_input_tokens_seen": 262368,
"step": 695
},
{
"epoch": 0.6272401433691757,
"grad_norm": 25.75,
"learning_rate": 3.131720430107527e-05,
"loss": 0.2527,
"num_input_tokens_seen": 264288,
"step": 700
},
{
"epoch": 0.6317204301075269,
"grad_norm": 27.625,
"learning_rate": 3.154121863799283e-05,
"loss": 0.3436,
"num_input_tokens_seen": 266112,
"step": 705
},
{
"epoch": 0.6362007168458781,
"grad_norm": 76.0,
"learning_rate": 3.17652329749104e-05,
"loss": 0.5614,
"num_input_tokens_seen": 268064,
"step": 710
},
{
"epoch": 0.6406810035842294,
"grad_norm": 47.5,
"learning_rate": 3.198924731182796e-05,
"loss": 0.5969,
"num_input_tokens_seen": 269792,
"step": 715
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.609375,
"learning_rate": 3.221326164874552e-05,
"loss": 0.34,
"num_input_tokens_seen": 271616,
"step": 720
},
{
"epoch": 0.649641577060932,
"grad_norm": 13.625,
"learning_rate": 3.243727598566308e-05,
"loss": 0.1537,
"num_input_tokens_seen": 273568,
"step": 725
},
{
"epoch": 0.6541218637992832,
"grad_norm": 35.5,
"learning_rate": 3.2661290322580644e-05,
"loss": 0.4837,
"num_input_tokens_seen": 275552,
"step": 730
},
{
"epoch": 0.6586021505376344,
"grad_norm": 23.5,
"learning_rate": 3.288530465949821e-05,
"loss": 0.5085,
"num_input_tokens_seen": 277472,
"step": 735
},
{
"epoch": 0.6630824372759857,
"grad_norm": 23.375,
"learning_rate": 3.3109318996415774e-05,
"loss": 0.7406,
"num_input_tokens_seen": 279360,
"step": 740
},
{
"epoch": 0.6675627240143369,
"grad_norm": 0.130859375,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.3821,
"num_input_tokens_seen": 281440,
"step": 745
},
{
"epoch": 0.6720430107526881,
"grad_norm": 18.25,
"learning_rate": 3.3557347670250896e-05,
"loss": 0.4432,
"num_input_tokens_seen": 283328,
"step": 750
},
{
"epoch": 0.6765232974910395,
"grad_norm": 13.75,
"learning_rate": 3.378136200716846e-05,
"loss": 0.2282,
"num_input_tokens_seen": 285184,
"step": 755
},
{
"epoch": 0.6810035842293907,
"grad_norm": 13.25,
"learning_rate": 3.400537634408602e-05,
"loss": 0.2104,
"num_input_tokens_seen": 286944,
"step": 760
},
{
"epoch": 0.6854838709677419,
"grad_norm": 1.6015625,
"learning_rate": 3.422939068100359e-05,
"loss": 0.2136,
"num_input_tokens_seen": 289120,
"step": 765
},
{
"epoch": 0.6899641577060932,
"grad_norm": 31.5,
"learning_rate": 3.445340501792115e-05,
"loss": 0.6873,
"num_input_tokens_seen": 291008,
"step": 770
},
{
"epoch": 0.6944444444444444,
"grad_norm": 19.625,
"learning_rate": 3.467741935483872e-05,
"loss": 0.2732,
"num_input_tokens_seen": 292960,
"step": 775
},
{
"epoch": 0.6989247311827957,
"grad_norm": 20.0,
"learning_rate": 3.490143369175627e-05,
"loss": 0.351,
"num_input_tokens_seen": 294752,
"step": 780
},
{
"epoch": 0.703405017921147,
"grad_norm": 17.875,
"learning_rate": 3.512544802867383e-05,
"loss": 0.2875,
"num_input_tokens_seen": 296672,
"step": 785
},
{
"epoch": 0.7078853046594982,
"grad_norm": 27.5,
"learning_rate": 3.53494623655914e-05,
"loss": 0.3433,
"num_input_tokens_seen": 298528,
"step": 790
},
{
"epoch": 0.7123655913978495,
"grad_norm": 9.625,
"learning_rate": 3.557347670250896e-05,
"loss": 0.2105,
"num_input_tokens_seen": 300672,
"step": 795
},
{
"epoch": 0.7168458781362007,
"grad_norm": 5.6875,
"learning_rate": 3.5797491039426524e-05,
"loss": 0.2457,
"num_input_tokens_seen": 302528,
"step": 800
},
{
"epoch": 0.7213261648745519,
"grad_norm": 21.125,
"learning_rate": 3.602150537634409e-05,
"loss": 0.583,
"num_input_tokens_seen": 304512,
"step": 805
},
{
"epoch": 0.7258064516129032,
"grad_norm": 11.3125,
"learning_rate": 3.624551971326165e-05,
"loss": 0.4005,
"num_input_tokens_seen": 306240,
"step": 810
},
{
"epoch": 0.7302867383512545,
"grad_norm": 6.25,
"learning_rate": 3.6469534050179214e-05,
"loss": 0.3943,
"num_input_tokens_seen": 308160,
"step": 815
},
{
"epoch": 0.7347670250896058,
"grad_norm": 10.75,
"learning_rate": 3.6693548387096776e-05,
"loss": 0.3597,
"num_input_tokens_seen": 309952,
"step": 820
},
{
"epoch": 0.739247311827957,
"grad_norm": 18.375,
"learning_rate": 3.691756272401434e-05,
"loss": 0.2387,
"num_input_tokens_seen": 311936,
"step": 825
},
{
"epoch": 0.7437275985663082,
"grad_norm": 7.96875,
"learning_rate": 3.7141577060931905e-05,
"loss": 0.1461,
"num_input_tokens_seen": 313760,
"step": 830
},
{
"epoch": 0.7482078853046595,
"grad_norm": 7.25,
"learning_rate": 3.736559139784947e-05,
"loss": 0.2155,
"num_input_tokens_seen": 315456,
"step": 835
},
{
"epoch": 0.7526881720430108,
"grad_norm": 11.875,
"learning_rate": 3.758960573476703e-05,
"loss": 0.2241,
"num_input_tokens_seen": 317312,
"step": 840
},
{
"epoch": 0.757168458781362,
"grad_norm": 16.125,
"learning_rate": 3.781362007168459e-05,
"loss": 0.1211,
"num_input_tokens_seen": 319264,
"step": 845
},
{
"epoch": 0.7616487455197133,
"grad_norm": 22.0,
"learning_rate": 3.803763440860215e-05,
"loss": 0.4899,
"num_input_tokens_seen": 321248,
"step": 850
},
{
"epoch": 0.7661290322580645,
"grad_norm": 26.125,
"learning_rate": 3.826164874551971e-05,
"loss": 0.2401,
"num_input_tokens_seen": 323072,
"step": 855
},
{
"epoch": 0.7706093189964157,
"grad_norm": 36.75,
"learning_rate": 3.848566308243728e-05,
"loss": 0.3035,
"num_input_tokens_seen": 324896,
"step": 860
},
{
"epoch": 0.775089605734767,
"grad_norm": 45.75,
"learning_rate": 3.870967741935484e-05,
"loss": 0.6418,
"num_input_tokens_seen": 326592,
"step": 865
},
{
"epoch": 0.7795698924731183,
"grad_norm": 13.0,
"learning_rate": 3.89336917562724e-05,
"loss": 0.0548,
"num_input_tokens_seen": 328480,
"step": 870
},
{
"epoch": 0.7840501792114696,
"grad_norm": 7.65625,
"learning_rate": 3.9157706093189964e-05,
"loss": 0.485,
"num_input_tokens_seen": 330240,
"step": 875
},
{
"epoch": 0.7885304659498208,
"grad_norm": 36.0,
"learning_rate": 3.9381720430107526e-05,
"loss": 0.5505,
"num_input_tokens_seen": 331936,
"step": 880
},
{
"epoch": 0.793010752688172,
"grad_norm": 0.69921875,
"learning_rate": 3.9605734767025094e-05,
"loss": 0.1709,
"num_input_tokens_seen": 333664,
"step": 885
},
{
"epoch": 0.7974910394265233,
"grad_norm": 24.0,
"learning_rate": 3.9829749103942655e-05,
"loss": 0.3262,
"num_input_tokens_seen": 335488,
"step": 890
},
{
"epoch": 0.8019713261648745,
"grad_norm": 37.75,
"learning_rate": 4.005376344086022e-05,
"loss": 0.269,
"num_input_tokens_seen": 337280,
"step": 895
},
{
"epoch": 0.8064516129032258,
"grad_norm": 41.75,
"learning_rate": 4.027777777777778e-05,
"loss": 0.4521,
"num_input_tokens_seen": 339360,
"step": 900
},
{
"epoch": 0.8109318996415771,
"grad_norm": 51.75,
"learning_rate": 4.050179211469534e-05,
"loss": 0.4407,
"num_input_tokens_seen": 341184,
"step": 905
},
{
"epoch": 0.8154121863799283,
"grad_norm": 24.0,
"learning_rate": 4.072580645161291e-05,
"loss": 0.7413,
"num_input_tokens_seen": 343072,
"step": 910
},
{
"epoch": 0.8198924731182796,
"grad_norm": 26.25,
"learning_rate": 4.094982078853047e-05,
"loss": 0.3121,
"num_input_tokens_seen": 344928,
"step": 915
},
{
"epoch": 0.8243727598566308,
"grad_norm": 27.0,
"learning_rate": 4.117383512544803e-05,
"loss": 0.3929,
"num_input_tokens_seen": 346752,
"step": 920
},
{
"epoch": 0.828853046594982,
"grad_norm": 40.0,
"learning_rate": 4.13978494623656e-05,
"loss": 0.2492,
"num_input_tokens_seen": 348608,
"step": 925
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.25,
"learning_rate": 4.162186379928315e-05,
"loss": 0.2024,
"num_input_tokens_seen": 350496,
"step": 930
},
{
"epoch": 0.8378136200716846,
"grad_norm": 7.78125,
"learning_rate": 4.1845878136200714e-05,
"loss": 0.5547,
"num_input_tokens_seen": 352416,
"step": 935
},
{
"epoch": 0.8422939068100358,
"grad_norm": 35.25,
"learning_rate": 4.206989247311828e-05,
"loss": 0.5555,
"num_input_tokens_seen": 354272,
"step": 940
},
{
"epoch": 0.8467741935483871,
"grad_norm": 23.375,
"learning_rate": 4.2293906810035844e-05,
"loss": 0.5327,
"num_input_tokens_seen": 356064,
"step": 945
},
{
"epoch": 0.8512544802867383,
"grad_norm": 20.625,
"learning_rate": 4.2517921146953405e-05,
"loss": 0.2641,
"num_input_tokens_seen": 358016,
"step": 950
},
{
"epoch": 0.8557347670250897,
"grad_norm": 0.296875,
"learning_rate": 4.2741935483870973e-05,
"loss": 0.0947,
"num_input_tokens_seen": 359808,
"step": 955
},
{
"epoch": 0.8602150537634409,
"grad_norm": 1.90625,
"learning_rate": 4.296594982078853e-05,
"loss": 0.0573,
"num_input_tokens_seen": 361664,
"step": 960
},
{
"epoch": 0.8646953405017921,
"grad_norm": 12.25,
"learning_rate": 4.3189964157706096e-05,
"loss": 0.1219,
"num_input_tokens_seen": 363488,
"step": 965
},
{
"epoch": 0.8691756272401434,
"grad_norm": 5.53125,
"learning_rate": 4.341397849462366e-05,
"loss": 0.239,
"num_input_tokens_seen": 365632,
"step": 970
},
{
"epoch": 0.8736559139784946,
"grad_norm": 2.9375,
"learning_rate": 4.363799283154122e-05,
"loss": 0.1536,
"num_input_tokens_seen": 367616,
"step": 975
},
{
"epoch": 0.8781362007168458,
"grad_norm": 66.0,
"learning_rate": 4.386200716845879e-05,
"loss": 0.3751,
"num_input_tokens_seen": 369408,
"step": 980
},
{
"epoch": 0.8826164874551972,
"grad_norm": 54.0,
"learning_rate": 4.408602150537635e-05,
"loss": 0.977,
"num_input_tokens_seen": 371232,
"step": 985
},
{
"epoch": 0.8870967741935484,
"grad_norm": 25.875,
"learning_rate": 4.431003584229391e-05,
"loss": 0.6969,
"num_input_tokens_seen": 373088,
"step": 990
},
{
"epoch": 0.8915770609318996,
"grad_norm": 0.0849609375,
"learning_rate": 4.453405017921147e-05,
"loss": 0.3713,
"num_input_tokens_seen": 374944,
"step": 995
},
{
"epoch": 0.8960573476702509,
"grad_norm": 14.0,
"learning_rate": 4.475806451612903e-05,
"loss": 0.1883,
"num_input_tokens_seen": 376800,
"step": 1000
},
{
"epoch": 0.9005376344086021,
"grad_norm": 17.625,
"learning_rate": 4.49820788530466e-05,
"loss": 0.3327,
"num_input_tokens_seen": 378720,
"step": 1005
},
{
"epoch": 0.9050179211469535,
"grad_norm": 1.203125,
"learning_rate": 4.520609318996416e-05,
"loss": 0.1485,
"num_input_tokens_seen": 380768,
"step": 1010
},
{
"epoch": 0.9094982078853047,
"grad_norm": 22.125,
"learning_rate": 4.543010752688172e-05,
"loss": 0.3173,
"num_input_tokens_seen": 382752,
"step": 1015
},
{
"epoch": 0.9139784946236559,
"grad_norm": 18.625,
"learning_rate": 4.5654121863799285e-05,
"loss": 0.4751,
"num_input_tokens_seen": 384576,
"step": 1020
},
{
"epoch": 0.9184587813620072,
"grad_norm": 39.75,
"learning_rate": 4.5878136200716846e-05,
"loss": 0.4613,
"num_input_tokens_seen": 386368,
"step": 1025
},
{
"epoch": 0.9229390681003584,
"grad_norm": 18.625,
"learning_rate": 4.610215053763441e-05,
"loss": 0.6101,
"num_input_tokens_seen": 388192,
"step": 1030
},
{
"epoch": 0.9274193548387096,
"grad_norm": 12.0,
"learning_rate": 4.6326164874551976e-05,
"loss": 0.143,
"num_input_tokens_seen": 390016,
"step": 1035
},
{
"epoch": 0.931899641577061,
"grad_norm": 0.98046875,
"learning_rate": 4.655017921146954e-05,
"loss": 0.2871,
"num_input_tokens_seen": 391904,
"step": 1040
},
{
"epoch": 0.9363799283154122,
"grad_norm": 14.0,
"learning_rate": 4.67741935483871e-05,
"loss": 0.3632,
"num_input_tokens_seen": 393888,
"step": 1045
},
{
"epoch": 0.9408602150537635,
"grad_norm": 21.625,
"learning_rate": 4.699820788530466e-05,
"loss": 0.3306,
"num_input_tokens_seen": 396000,
"step": 1050
},
{
"epoch": 0.9453405017921147,
"grad_norm": 4.40625,
"learning_rate": 4.722222222222222e-05,
"loss": 0.0991,
"num_input_tokens_seen": 397824,
"step": 1055
},
{
"epoch": 0.9498207885304659,
"grad_norm": 4.8125,
"learning_rate": 4.744623655913979e-05,
"loss": 0.3083,
"num_input_tokens_seen": 399648,
"step": 1060
},
{
"epoch": 0.9543010752688172,
"grad_norm": 17.5,
"learning_rate": 4.767025089605735e-05,
"loss": 0.3862,
"num_input_tokens_seen": 401728,
"step": 1065
},
{
"epoch": 0.9587813620071685,
"grad_norm": 12.375,
"learning_rate": 4.789426523297491e-05,
"loss": 0.4106,
"num_input_tokens_seen": 403616,
"step": 1070
},
{
"epoch": 0.9632616487455197,
"grad_norm": 36.5,
"learning_rate": 4.811827956989248e-05,
"loss": 0.3813,
"num_input_tokens_seen": 405504,
"step": 1075
},
{
"epoch": 0.967741935483871,
"grad_norm": 20.625,
"learning_rate": 4.8342293906810035e-05,
"loss": 0.2995,
"num_input_tokens_seen": 407392,
"step": 1080
},
{
"epoch": 0.9722222222222222,
"grad_norm": 15.6875,
"learning_rate": 4.8566308243727596e-05,
"loss": 0.3626,
"num_input_tokens_seen": 409216,
"step": 1085
},
{
"epoch": 0.9767025089605734,
"grad_norm": 51.75,
"learning_rate": 4.8790322580645164e-05,
"loss": 0.6226,
"num_input_tokens_seen": 411104,
"step": 1090
},
{
"epoch": 0.9811827956989247,
"grad_norm": 29.625,
"learning_rate": 4.9014336917562726e-05,
"loss": 0.4303,
"num_input_tokens_seen": 412992,
"step": 1095
},
{
"epoch": 0.985663082437276,
"grad_norm": 6.71875,
"learning_rate": 4.9238351254480294e-05,
"loss": 0.2378,
"num_input_tokens_seen": 414656,
"step": 1100
},
{
"epoch": 0.9901433691756273,
"grad_norm": 18.125,
"learning_rate": 4.9462365591397855e-05,
"loss": 0.5119,
"num_input_tokens_seen": 416736,
"step": 1105
},
{
"epoch": 0.9946236559139785,
"grad_norm": 5.125,
"learning_rate": 4.968637992831541e-05,
"loss": 0.2563,
"num_input_tokens_seen": 418496,
"step": 1110
},
{
"epoch": 0.9991039426523297,
"grad_norm": 1.0625,
"learning_rate": 4.991039426523298e-05,
"loss": 0.2307,
"num_input_tokens_seen": 420448,
"step": 1115
},
{
"epoch": 1.0,
"eval_loss": 0.29340866208076477,
"eval_runtime": 9.6403,
"eval_samples_per_second": 51.451,
"eval_steps_per_second": 12.863,
"num_input_tokens_seen": 420520,
"step": 1116
},
{
"epoch": 1.003584229390681,
"grad_norm": 18.75,
"learning_rate": 4.9999988993763824e-05,
"loss": 0.2259,
"num_input_tokens_seen": 422088,
"step": 1120
},
{
"epoch": 1.0080645161290323,
"grad_norm": 42.75,
"learning_rate": 4.9999921733466727e-05,
"loss": 0.3143,
"num_input_tokens_seen": 423912,
"step": 1125
},
{
"epoch": 1.0125448028673836,
"grad_norm": 38.75,
"learning_rate": 4.9999793327612486e-05,
"loss": 0.1442,
"num_input_tokens_seen": 425768,
"step": 1130
},
{
"epoch": 1.0170250896057347,
"grad_norm": 35.5,
"learning_rate": 4.999960377651517e-05,
"loss": 0.0878,
"num_input_tokens_seen": 427528,
"step": 1135
},
{
"epoch": 1.021505376344086,
"grad_norm": 6.9375,
"learning_rate": 4.9999353080638376e-05,
"loss": 0.3032,
"num_input_tokens_seen": 429416,
"step": 1140
},
{
"epoch": 1.0259856630824373,
"grad_norm": 33.25,
"learning_rate": 4.9999041240595276e-05,
"loss": 0.1883,
"num_input_tokens_seen": 431080,
"step": 1145
},
{
"epoch": 1.0304659498207884,
"grad_norm": 44.0,
"learning_rate": 4.9998668257148576e-05,
"loss": 0.8901,
"num_input_tokens_seen": 432936,
"step": 1150
},
{
"epoch": 1.0349462365591398,
"grad_norm": 30.625,
"learning_rate": 4.999823413121053e-05,
"loss": 0.2307,
"num_input_tokens_seen": 434984,
"step": 1155
},
{
"epoch": 1.039426523297491,
"grad_norm": 22.375,
"learning_rate": 4.999773886384293e-05,
"loss": 0.0785,
"num_input_tokens_seen": 436744,
"step": 1160
},
{
"epoch": 1.0439068100358422,
"grad_norm": 0.1484375,
"learning_rate": 4.9997182456257116e-05,
"loss": 0.147,
"num_input_tokens_seen": 438568,
"step": 1165
},
{
"epoch": 1.0483870967741935,
"grad_norm": 0.29296875,
"learning_rate": 4.999656490981397e-05,
"loss": 0.2614,
"num_input_tokens_seen": 440424,
"step": 1170
},
{
"epoch": 1.0528673835125448,
"grad_norm": 80.5,
"learning_rate": 4.9995886226023913e-05,
"loss": 0.3234,
"num_input_tokens_seen": 442504,
"step": 1175
},
{
"epoch": 1.0573476702508962,
"grad_norm": 38.5,
"learning_rate": 4.999514640654688e-05,
"loss": 0.5882,
"num_input_tokens_seen": 444328,
"step": 1180
},
{
"epoch": 1.0618279569892473,
"grad_norm": 16.0,
"learning_rate": 4.999434545319234e-05,
"loss": 0.1195,
"num_input_tokens_seen": 446312,
"step": 1185
},
{
"epoch": 1.0663082437275986,
"grad_norm": 1.765625,
"learning_rate": 4.999348336791929e-05,
"loss": 0.2408,
"num_input_tokens_seen": 448232,
"step": 1190
},
{
"epoch": 1.07078853046595,
"grad_norm": 22.375,
"learning_rate": 4.9992560152836264e-05,
"loss": 0.2582,
"num_input_tokens_seen": 450056,
"step": 1195
},
{
"epoch": 1.075268817204301,
"grad_norm": 4.40625,
"learning_rate": 4.999157581020126e-05,
"loss": 0.3751,
"num_input_tokens_seen": 451976,
"step": 1200
},
{
"epoch": 1.0797491039426523,
"grad_norm": 13.4375,
"learning_rate": 4.9990530342421835e-05,
"loss": 0.0868,
"num_input_tokens_seen": 454056,
"step": 1205
},
{
"epoch": 1.0842293906810037,
"grad_norm": 11.9375,
"learning_rate": 4.998942375205502e-05,
"loss": 0.1627,
"num_input_tokens_seen": 455880,
"step": 1210
},
{
"epoch": 1.0887096774193548,
"grad_norm": 61.25,
"learning_rate": 4.9988256041807334e-05,
"loss": 0.4289,
"num_input_tokens_seen": 457736,
"step": 1215
},
{
"epoch": 1.093189964157706,
"grad_norm": 41.25,
"learning_rate": 4.998702721453481e-05,
"loss": 0.2704,
"num_input_tokens_seen": 459752,
"step": 1220
},
{
"epoch": 1.0976702508960574,
"grad_norm": 39.75,
"learning_rate": 4.998573727324295e-05,
"loss": 0.0674,
"num_input_tokens_seen": 461608,
"step": 1225
},
{
"epoch": 1.1021505376344085,
"grad_norm": 4.6875,
"learning_rate": 4.998438622108673e-05,
"loss": 0.5046,
"num_input_tokens_seen": 463464,
"step": 1230
},
{
"epoch": 1.1066308243727598,
"grad_norm": 58.0,
"learning_rate": 4.9982974061370594e-05,
"loss": 0.441,
"num_input_tokens_seen": 465416,
"step": 1235
},
{
"epoch": 1.1111111111111112,
"grad_norm": 71.0,
"learning_rate": 4.9981500797548445e-05,
"loss": 0.3863,
"num_input_tokens_seen": 467208,
"step": 1240
},
{
"epoch": 1.1155913978494623,
"grad_norm": 42.25,
"learning_rate": 4.9979966433223627e-05,
"loss": 0.3945,
"num_input_tokens_seen": 469096,
"step": 1245
},
{
"epoch": 1.1200716845878136,
"grad_norm": 11.8125,
"learning_rate": 4.997837097214895e-05,
"loss": 0.3466,
"num_input_tokens_seen": 470984,
"step": 1250
},
{
"epoch": 1.124551971326165,
"grad_norm": 7.1875,
"learning_rate": 4.997671441822662e-05,
"loss": 0.12,
"num_input_tokens_seen": 472904,
"step": 1255
},
{
"epoch": 1.129032258064516,
"grad_norm": 7.125,
"learning_rate": 4.997499677550831e-05,
"loss": 0.2053,
"num_input_tokens_seen": 474792,
"step": 1260
},
{
"epoch": 1.1335125448028673,
"grad_norm": 14.0,
"learning_rate": 4.997321804819506e-05,
"loss": 0.1768,
"num_input_tokens_seen": 476680,
"step": 1265
},
{
"epoch": 1.1379928315412187,
"grad_norm": 29.5,
"learning_rate": 4.9971378240637345e-05,
"loss": 0.4991,
"num_input_tokens_seen": 478344,
"step": 1270
},
{
"epoch": 1.14247311827957,
"grad_norm": 1.2265625,
"learning_rate": 4.9969477357335025e-05,
"loss": 0.3667,
"num_input_tokens_seen": 480328,
"step": 1275
},
{
"epoch": 1.146953405017921,
"grad_norm": 15.4375,
"learning_rate": 4.9967515402937334e-05,
"loss": 0.3431,
"num_input_tokens_seen": 482344,
"step": 1280
},
{
"epoch": 1.1514336917562724,
"grad_norm": 17.75,
"learning_rate": 4.996549238224288e-05,
"loss": 0.1361,
"num_input_tokens_seen": 484296,
"step": 1285
},
{
"epoch": 1.1559139784946237,
"grad_norm": 23.625,
"learning_rate": 4.996340830019962e-05,
"loss": 0.6209,
"num_input_tokens_seen": 486216,
"step": 1290
},
{
"epoch": 1.1603942652329748,
"grad_norm": 38.75,
"learning_rate": 4.996126316190488e-05,
"loss": 0.1789,
"num_input_tokens_seen": 488040,
"step": 1295
},
{
"epoch": 1.1648745519713262,
"grad_norm": 23.0,
"learning_rate": 4.995905697260528e-05,
"loss": 0.1307,
"num_input_tokens_seen": 489832,
"step": 1300
},
{
"epoch": 1.1693548387096775,
"grad_norm": 0.78515625,
"learning_rate": 4.995678973769681e-05,
"loss": 0.5493,
"num_input_tokens_seen": 491688,
"step": 1305
},
{
"epoch": 1.1738351254480286,
"grad_norm": 5.90625,
"learning_rate": 4.995446146272472e-05,
"loss": 0.0698,
"num_input_tokens_seen": 493736,
"step": 1310
},
{
"epoch": 1.17831541218638,
"grad_norm": 0.375,
"learning_rate": 4.9952072153383575e-05,
"loss": 0.6333,
"num_input_tokens_seen": 495656,
"step": 1315
},
{
"epoch": 1.1827956989247312,
"grad_norm": 0.06787109375,
"learning_rate": 4.994962181551725e-05,
"loss": 0.3869,
"num_input_tokens_seen": 497640,
"step": 1320
},
{
"epoch": 1.1872759856630823,
"grad_norm": 28.375,
"learning_rate": 4.994711045511881e-05,
"loss": 0.2626,
"num_input_tokens_seen": 499464,
"step": 1325
},
{
"epoch": 1.1917562724014337,
"grad_norm": 16.75,
"learning_rate": 4.9944538078330646e-05,
"loss": 0.2115,
"num_input_tokens_seen": 501352,
"step": 1330
},
{
"epoch": 1.196236559139785,
"grad_norm": 44.5,
"learning_rate": 4.994190469144434e-05,
"loss": 0.6451,
"num_input_tokens_seen": 503240,
"step": 1335
},
{
"epoch": 1.2007168458781363,
"grad_norm": 26.875,
"learning_rate": 4.993921030090072e-05,
"loss": 0.5396,
"num_input_tokens_seen": 505320,
"step": 1340
},
{
"epoch": 1.2051971326164874,
"grad_norm": 19.875,
"learning_rate": 4.99364549132898e-05,
"loss": 0.2078,
"num_input_tokens_seen": 507336,
"step": 1345
},
{
"epoch": 1.2096774193548387,
"grad_norm": 5.0625,
"learning_rate": 4.993363853535079e-05,
"loss": 0.399,
"num_input_tokens_seen": 509128,
"step": 1350
},
{
"epoch": 1.2141577060931898,
"grad_norm": 27.375,
"learning_rate": 4.9930761173972076e-05,
"loss": 0.0919,
"num_input_tokens_seen": 510952,
"step": 1355
},
{
"epoch": 1.2186379928315412,
"grad_norm": 1.6796875,
"learning_rate": 4.992782283619118e-05,
"loss": 0.1124,
"num_input_tokens_seen": 512808,
"step": 1360
},
{
"epoch": 1.2231182795698925,
"grad_norm": 18.375,
"learning_rate": 4.99248235291948e-05,
"loss": 0.0796,
"num_input_tokens_seen": 514504,
"step": 1365
},
{
"epoch": 1.2275985663082438,
"grad_norm": 1.1875,
"learning_rate": 4.992176326031872e-05,
"loss": 0.1368,
"num_input_tokens_seen": 516360,
"step": 1370
},
{
"epoch": 1.232078853046595,
"grad_norm": 3.59375,
"learning_rate": 4.991864203704783e-05,
"loss": 0.1688,
"num_input_tokens_seen": 518152,
"step": 1375
},
{
"epoch": 1.2365591397849462,
"grad_norm": 30.0,
"learning_rate": 4.991545986701611e-05,
"loss": 0.1337,
"num_input_tokens_seen": 520072,
"step": 1380
},
{
"epoch": 1.2410394265232976,
"grad_norm": 17.25,
"learning_rate": 4.991221675800662e-05,
"loss": 0.1065,
"num_input_tokens_seen": 521928,
"step": 1385
},
{
"epoch": 1.2455197132616487,
"grad_norm": 10.1875,
"learning_rate": 4.990891271795145e-05,
"loss": 0.172,
"num_input_tokens_seen": 523880,
"step": 1390
},
{
"epoch": 1.25,
"grad_norm": 45.75,
"learning_rate": 4.99055477549317e-05,
"loss": 0.5192,
"num_input_tokens_seen": 525832,
"step": 1395
},
{
"epoch": 1.2544802867383513,
"grad_norm": 19.625,
"learning_rate": 4.990212187717753e-05,
"loss": 0.1398,
"num_input_tokens_seen": 527560,
"step": 1400
},
{
"epoch": 1.2589605734767024,
"grad_norm": 26.625,
"learning_rate": 4.9898635093068036e-05,
"loss": 0.6003,
"num_input_tokens_seen": 529480,
"step": 1405
},
{
"epoch": 1.2634408602150538,
"grad_norm": 22.375,
"learning_rate": 4.98950874111313e-05,
"loss": 0.5657,
"num_input_tokens_seen": 531592,
"step": 1410
},
{
"epoch": 1.267921146953405,
"grad_norm": 0.625,
"learning_rate": 4.989147884004435e-05,
"loss": 0.64,
"num_input_tokens_seen": 533480,
"step": 1415
},
{
"epoch": 1.2724014336917562,
"grad_norm": 30.875,
"learning_rate": 4.988780938863314e-05,
"loss": 0.3896,
"num_input_tokens_seen": 535464,
"step": 1420
},
{
"epoch": 1.2768817204301075,
"grad_norm": 0.07666015625,
"learning_rate": 4.9884079065872514e-05,
"loss": 0.2331,
"num_input_tokens_seen": 537512,
"step": 1425
},
{
"epoch": 1.2813620071684588,
"grad_norm": 27.5,
"learning_rate": 4.988028788088622e-05,
"loss": 0.138,
"num_input_tokens_seen": 539560,
"step": 1430
},
{
"epoch": 1.2858422939068102,
"grad_norm": 0.71875,
"learning_rate": 4.9876435842946845e-05,
"loss": 0.05,
"num_input_tokens_seen": 541448,
"step": 1435
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.90234375,
"learning_rate": 4.987252296147582e-05,
"loss": 0.2335,
"num_input_tokens_seen": 543336,
"step": 1440
},
{
"epoch": 1.2948028673835126,
"grad_norm": 32.75,
"learning_rate": 4.986854924604339e-05,
"loss": 0.3187,
"num_input_tokens_seen": 545320,
"step": 1445
},
{
"epoch": 1.2992831541218637,
"grad_norm": 4.625,
"learning_rate": 4.986451470636858e-05,
"loss": 0.1929,
"num_input_tokens_seen": 547240,
"step": 1450
},
{
"epoch": 1.303763440860215,
"grad_norm": 26.5,
"learning_rate": 4.98604193523192e-05,
"loss": 0.1406,
"num_input_tokens_seen": 549000,
"step": 1455
},
{
"epoch": 1.3082437275985663,
"grad_norm": 26.25,
"learning_rate": 4.985626319391178e-05,
"loss": 0.2374,
"num_input_tokens_seen": 550920,
"step": 1460
},
{
"epoch": 1.3127240143369177,
"grad_norm": 2.25,
"learning_rate": 4.985204624131157e-05,
"loss": 0.2113,
"num_input_tokens_seen": 552744,
"step": 1465
},
{
"epoch": 1.3172043010752688,
"grad_norm": 0.439453125,
"learning_rate": 4.984776850483254e-05,
"loss": 0.3066,
"num_input_tokens_seen": 554632,
"step": 1470
},
{
"epoch": 1.32168458781362,
"grad_norm": 6.96875,
"learning_rate": 4.9843429994937284e-05,
"loss": 0.4033,
"num_input_tokens_seen": 556392,
"step": 1475
},
{
"epoch": 1.3261648745519714,
"grad_norm": 5.34375,
"learning_rate": 4.983903072223708e-05,
"loss": 0.2324,
"num_input_tokens_seen": 558248,
"step": 1480
},
{
"epoch": 1.3306451612903225,
"grad_norm": 0.012451171875,
"learning_rate": 4.983457069749178e-05,
"loss": 0.0514,
"num_input_tokens_seen": 560200,
"step": 1485
},
{
"epoch": 1.3351254480286738,
"grad_norm": 6.75,
"learning_rate": 4.983004993160986e-05,
"loss": 0.0832,
"num_input_tokens_seen": 562024,
"step": 1490
},
{
"epoch": 1.3396057347670252,
"grad_norm": 23.875,
"learning_rate": 4.982546843564834e-05,
"loss": 0.0806,
"num_input_tokens_seen": 563848,
"step": 1495
},
{
"epoch": 1.3440860215053765,
"grad_norm": 30.25,
"learning_rate": 4.982082622081279e-05,
"loss": 0.4706,
"num_input_tokens_seen": 565832,
"step": 1500
},
{
"epoch": 1.3485663082437276,
"grad_norm": 25.375,
"learning_rate": 4.981612329845726e-05,
"loss": 0.0855,
"num_input_tokens_seen": 567688,
"step": 1505
},
{
"epoch": 1.353046594982079,
"grad_norm": 9.5,
"learning_rate": 4.98113596800843e-05,
"loss": 0.2845,
"num_input_tokens_seen": 569544,
"step": 1510
},
{
"epoch": 1.35752688172043,
"grad_norm": 1.125,
"learning_rate": 4.980653537734493e-05,
"loss": 0.2787,
"num_input_tokens_seen": 571432,
"step": 1515
},
{
"epoch": 1.3620071684587813,
"grad_norm": 1.5625,
"learning_rate": 4.9801650402038555e-05,
"loss": 0.0709,
"num_input_tokens_seen": 573256,
"step": 1520
},
{
"epoch": 1.3664874551971327,
"grad_norm": 17.125,
"learning_rate": 4.979670476611301e-05,
"loss": 0.1638,
"num_input_tokens_seen": 575208,
"step": 1525
},
{
"epoch": 1.370967741935484,
"grad_norm": 0.07275390625,
"learning_rate": 4.979169848166446e-05,
"loss": 0.2012,
"num_input_tokens_seen": 576936,
"step": 1530
},
{
"epoch": 1.375448028673835,
"grad_norm": 65.5,
"learning_rate": 4.978663156093744e-05,
"loss": 0.2071,
"num_input_tokens_seen": 579016,
"step": 1535
},
{
"epoch": 1.3799283154121864,
"grad_norm": 0.5859375,
"learning_rate": 4.978150401632477e-05,
"loss": 0.7948,
"num_input_tokens_seen": 580872,
"step": 1540
},
{
"epoch": 1.3844086021505375,
"grad_norm": 0.23828125,
"learning_rate": 4.9776315860367564e-05,
"loss": 0.2772,
"num_input_tokens_seen": 582696,
"step": 1545
},
{
"epoch": 1.3888888888888888,
"grad_norm": 6.125,
"learning_rate": 4.9771067105755145e-05,
"loss": 0.4533,
"num_input_tokens_seen": 584456,
"step": 1550
},
{
"epoch": 1.3933691756272402,
"grad_norm": 18.875,
"learning_rate": 4.976575776532509e-05,
"loss": 0.5198,
"num_input_tokens_seen": 586408,
"step": 1555
},
{
"epoch": 1.3978494623655915,
"grad_norm": 36.75,
"learning_rate": 4.976038785206315e-05,
"loss": 0.382,
"num_input_tokens_seen": 588296,
"step": 1560
},
{
"epoch": 1.4023297491039426,
"grad_norm": 0.484375,
"learning_rate": 4.9754957379103205e-05,
"loss": 0.1701,
"num_input_tokens_seen": 590280,
"step": 1565
},
{
"epoch": 1.406810035842294,
"grad_norm": 21.0,
"learning_rate": 4.974946635972728e-05,
"loss": 0.195,
"num_input_tokens_seen": 592104,
"step": 1570
},
{
"epoch": 1.4112903225806452,
"grad_norm": 0.033203125,
"learning_rate": 4.974391480736546e-05,
"loss": 0.1872,
"num_input_tokens_seen": 593960,
"step": 1575
},
{
"epoch": 1.4157706093189963,
"grad_norm": 14.4375,
"learning_rate": 4.973830273559591e-05,
"loss": 0.4913,
"num_input_tokens_seen": 595720,
"step": 1580
},
{
"epoch": 1.4202508960573477,
"grad_norm": 31.0,
"learning_rate": 4.97326301581448e-05,
"loss": 0.3379,
"num_input_tokens_seen": 597704,
"step": 1585
},
{
"epoch": 1.424731182795699,
"grad_norm": 13.0625,
"learning_rate": 4.9726897088886294e-05,
"loss": 0.1852,
"num_input_tokens_seen": 599560,
"step": 1590
},
{
"epoch": 1.4292114695340503,
"grad_norm": 42.25,
"learning_rate": 4.972110354184249e-05,
"loss": 0.2524,
"num_input_tokens_seen": 601384,
"step": 1595
},
{
"epoch": 1.4336917562724014,
"grad_norm": 23.375,
"learning_rate": 4.971524953118344e-05,
"loss": 0.2603,
"num_input_tokens_seen": 603176,
"step": 1600
},
{
"epoch": 1.4381720430107527,
"grad_norm": 29.5,
"learning_rate": 4.9709335071227046e-05,
"loss": 0.6264,
"num_input_tokens_seen": 605064,
"step": 1605
},
{
"epoch": 1.4426523297491038,
"grad_norm": 4.8125,
"learning_rate": 4.970336017643907e-05,
"loss": 0.3977,
"num_input_tokens_seen": 606920,
"step": 1610
},
{
"epoch": 1.4471326164874552,
"grad_norm": 2.0625,
"learning_rate": 4.969732486143309e-05,
"loss": 0.1518,
"num_input_tokens_seen": 608712,
"step": 1615
},
{
"epoch": 1.4516129032258065,
"grad_norm": 64.5,
"learning_rate": 4.969122914097046e-05,
"loss": 0.4154,
"num_input_tokens_seen": 610600,
"step": 1620
},
{
"epoch": 1.4560931899641578,
"grad_norm": 24.375,
"learning_rate": 4.968507302996029e-05,
"loss": 0.3731,
"num_input_tokens_seen": 612488,
"step": 1625
},
{
"epoch": 1.460573476702509,
"grad_norm": 9.9375,
"learning_rate": 4.967885654345936e-05,
"loss": 0.0926,
"num_input_tokens_seen": 614344,
"step": 1630
},
{
"epoch": 1.4650537634408602,
"grad_norm": 0.44921875,
"learning_rate": 4.9672579696672136e-05,
"loss": 0.1411,
"num_input_tokens_seen": 616168,
"step": 1635
},
{
"epoch": 1.4695340501792113,
"grad_norm": 12.375,
"learning_rate": 4.966624250495075e-05,
"loss": 0.2866,
"num_input_tokens_seen": 618024,
"step": 1640
},
{
"epoch": 1.4740143369175627,
"grad_norm": 27.125,
"learning_rate": 4.9659844983794855e-05,
"loss": 0.3223,
"num_input_tokens_seen": 619848,
"step": 1645
},
{
"epoch": 1.478494623655914,
"grad_norm": 8.3125,
"learning_rate": 4.965338714885173e-05,
"loss": 0.5086,
"num_input_tokens_seen": 621576,
"step": 1650
},
{
"epoch": 1.4829749103942653,
"grad_norm": 0.7109375,
"learning_rate": 4.964686901591612e-05,
"loss": 0.3582,
"num_input_tokens_seen": 623592,
"step": 1655
},
{
"epoch": 1.4874551971326164,
"grad_norm": 34.75,
"learning_rate": 4.964029060093029e-05,
"loss": 0.2213,
"num_input_tokens_seen": 625384,
"step": 1660
},
{
"epoch": 1.4919354838709677,
"grad_norm": 24.5,
"learning_rate": 4.96336519199839e-05,
"loss": 0.2235,
"num_input_tokens_seen": 627400,
"step": 1665
},
{
"epoch": 1.496415770609319,
"grad_norm": 4.0,
"learning_rate": 4.9626952989314065e-05,
"loss": 0.2229,
"num_input_tokens_seen": 629192,
"step": 1670
},
{
"epoch": 1.5,
"eval_loss": 0.28633037209510803,
"eval_runtime": 9.6809,
"eval_samples_per_second": 51.235,
"eval_steps_per_second": 12.809,
"num_input_tokens_seen": 630888,
"step": 1674
},
{
"epoch": 1.5008960573476702,
"grad_norm": 13.6875,
"learning_rate": 4.962019382530521e-05,
"loss": 0.0335,
"num_input_tokens_seen": 631336,
"step": 1675
},
{
"epoch": 1.5053763440860215,
"grad_norm": 29.875,
"learning_rate": 4.9613374444489095e-05,
"loss": 0.4891,
"num_input_tokens_seen": 633160,
"step": 1680
},
{
"epoch": 1.5098566308243728,
"grad_norm": 10.1875,
"learning_rate": 4.960649486354478e-05,
"loss": 0.4411,
"num_input_tokens_seen": 635176,
"step": 1685
},
{
"epoch": 1.5143369175627241,
"grad_norm": 10.1875,
"learning_rate": 4.959955509929854e-05,
"loss": 0.2609,
"num_input_tokens_seen": 636936,
"step": 1690
},
{
"epoch": 1.5188172043010753,
"grad_norm": 18.75,
"learning_rate": 4.9592555168723875e-05,
"loss": 0.2843,
"num_input_tokens_seen": 638984,
"step": 1695
},
{
"epoch": 1.5232974910394266,
"grad_norm": 29.625,
"learning_rate": 4.95854950889414e-05,
"loss": 0.2994,
"num_input_tokens_seen": 640904,
"step": 1700
},
{
"epoch": 1.5277777777777777,
"grad_norm": 59.0,
"learning_rate": 4.957837487721889e-05,
"loss": 0.3065,
"num_input_tokens_seen": 642792,
"step": 1705
},
{
"epoch": 1.532258064516129,
"grad_norm": 19.875,
"learning_rate": 4.957119455097117e-05,
"loss": 0.2598,
"num_input_tokens_seen": 644680,
"step": 1710
},
{
"epoch": 1.5367383512544803,
"grad_norm": 22.875,
"learning_rate": 4.956395412776008e-05,
"loss": 0.48,
"num_input_tokens_seen": 646504,
"step": 1715
},
{
"epoch": 1.5412186379928317,
"grad_norm": 40.25,
"learning_rate": 4.955665362529448e-05,
"loss": 0.3764,
"num_input_tokens_seen": 648520,
"step": 1720
},
{
"epoch": 1.5456989247311828,
"grad_norm": 10.375,
"learning_rate": 4.954929306143016e-05,
"loss": 0.3446,
"num_input_tokens_seen": 650312,
"step": 1725
},
{
"epoch": 1.550179211469534,
"grad_norm": 4.6875,
"learning_rate": 4.9541872454169794e-05,
"loss": 0.2728,
"num_input_tokens_seen": 652200,
"step": 1730
},
{
"epoch": 1.5546594982078852,
"grad_norm": 4.28125,
"learning_rate": 4.953439182166293e-05,
"loss": 0.1235,
"num_input_tokens_seen": 654280,
"step": 1735
},
{
"epoch": 1.5591397849462365,
"grad_norm": 2.484375,
"learning_rate": 4.952685118220593e-05,
"loss": 0.0467,
"num_input_tokens_seen": 656168,
"step": 1740
},
{
"epoch": 1.5636200716845878,
"grad_norm": 45.75,
"learning_rate": 4.951925055424191e-05,
"loss": 0.4338,
"num_input_tokens_seen": 657992,
"step": 1745
},
{
"epoch": 1.5681003584229392,
"grad_norm": 7.1875,
"learning_rate": 4.951158995636071e-05,
"loss": 0.1005,
"num_input_tokens_seen": 659720,
"step": 1750
},
{
"epoch": 1.5725806451612905,
"grad_norm": 18.25,
"learning_rate": 4.9503869407298856e-05,
"loss": 0.2278,
"num_input_tokens_seen": 661768,
"step": 1755
},
{
"epoch": 1.5770609318996416,
"grad_norm": 22.625,
"learning_rate": 4.94960889259395e-05,
"loss": 0.3737,
"num_input_tokens_seen": 663592,
"step": 1760
},
{
"epoch": 1.5815412186379927,
"grad_norm": 36.0,
"learning_rate": 4.948824853131236e-05,
"loss": 0.5136,
"num_input_tokens_seen": 665384,
"step": 1765
},
{
"epoch": 1.586021505376344,
"grad_norm": 0.283203125,
"learning_rate": 4.948034824259373e-05,
"loss": 0.2581,
"num_input_tokens_seen": 667400,
"step": 1770
},
{
"epoch": 1.5905017921146953,
"grad_norm": 63.5,
"learning_rate": 4.947238807910637e-05,
"loss": 0.2677,
"num_input_tokens_seen": 669192,
"step": 1775
},
{
"epoch": 1.5949820788530467,
"grad_norm": 29.375,
"learning_rate": 4.9464368060319465e-05,
"loss": 0.6715,
"num_input_tokens_seen": 671112,
"step": 1780
},
{
"epoch": 1.599462365591398,
"grad_norm": 1.203125,
"learning_rate": 4.9456288205848634e-05,
"loss": 0.2151,
"num_input_tokens_seen": 672968,
"step": 1785
},
{
"epoch": 1.603942652329749,
"grad_norm": 0.46875,
"learning_rate": 4.944814853545583e-05,
"loss": 0.2373,
"num_input_tokens_seen": 674792,
"step": 1790
},
{
"epoch": 1.6084229390681004,
"grad_norm": 14.125,
"learning_rate": 4.9439949069049294e-05,
"loss": 0.0408,
"num_input_tokens_seen": 676776,
"step": 1795
},
{
"epoch": 1.6129032258064515,
"grad_norm": 1.359375,
"learning_rate": 4.943168982668352e-05,
"loss": 0.148,
"num_input_tokens_seen": 678760,
"step": 1800
},
{
"epoch": 1.6173835125448028,
"grad_norm": 38.25,
"learning_rate": 4.9423370828559236e-05,
"loss": 0.1347,
"num_input_tokens_seen": 680584,
"step": 1805
},
{
"epoch": 1.6218637992831542,
"grad_norm": 2.875,
"learning_rate": 4.941499209502327e-05,
"loss": 0.3771,
"num_input_tokens_seen": 682504,
"step": 1810
},
{
"epoch": 1.6263440860215055,
"grad_norm": 5.8125,
"learning_rate": 4.9406553646568594e-05,
"loss": 0.3276,
"num_input_tokens_seen": 684424,
"step": 1815
},
{
"epoch": 1.6308243727598566,
"grad_norm": 0.1787109375,
"learning_rate": 4.939805550383421e-05,
"loss": 0.2047,
"num_input_tokens_seen": 686248,
"step": 1820
},
{
"epoch": 1.635304659498208,
"grad_norm": 13.8125,
"learning_rate": 4.9389497687605135e-05,
"loss": 0.3788,
"num_input_tokens_seen": 688104,
"step": 1825
},
{
"epoch": 1.639784946236559,
"grad_norm": 70.5,
"learning_rate": 4.938088021881233e-05,
"loss": 0.5566,
"num_input_tokens_seen": 689896,
"step": 1830
},
{
"epoch": 1.6442652329749103,
"grad_norm": 9.4375,
"learning_rate": 4.9372203118532655e-05,
"loss": 0.4025,
"num_input_tokens_seen": 691912,
"step": 1835
},
{
"epoch": 1.6487455197132617,
"grad_norm": 3.609375,
"learning_rate": 4.936346640798883e-05,
"loss": 0.1555,
"num_input_tokens_seen": 693640,
"step": 1840
},
{
"epoch": 1.653225806451613,
"grad_norm": 0.439453125,
"learning_rate": 4.935467010854936e-05,
"loss": 0.1278,
"num_input_tokens_seen": 695528,
"step": 1845
},
{
"epoch": 1.6577060931899643,
"grad_norm": 5.875,
"learning_rate": 4.9345814241728495e-05,
"loss": 0.2873,
"num_input_tokens_seen": 697256,
"step": 1850
},
{
"epoch": 1.6621863799283154,
"grad_norm": 0.1357421875,
"learning_rate": 4.933689882918618e-05,
"loss": 0.1152,
"num_input_tokens_seen": 699112,
"step": 1855
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.9375,
"learning_rate": 4.9327923892728e-05,
"loss": 0.2526,
"num_input_tokens_seen": 700904,
"step": 1860
},
{
"epoch": 1.6711469534050178,
"grad_norm": 10.625,
"learning_rate": 4.9318889454305115e-05,
"loss": 0.4701,
"num_input_tokens_seen": 702632,
"step": 1865
},
{
"epoch": 1.6756272401433692,
"grad_norm": 8.5,
"learning_rate": 4.930979553601423e-05,
"loss": 0.2473,
"num_input_tokens_seen": 704680,
"step": 1870
},
{
"epoch": 1.6801075268817205,
"grad_norm": 1.625,
"learning_rate": 4.930064216009754e-05,
"loss": 0.1458,
"num_input_tokens_seen": 706792,
"step": 1875
},
{
"epoch": 1.6845878136200718,
"grad_norm": 53.5,
"learning_rate": 4.929142934894262e-05,
"loss": 0.1463,
"num_input_tokens_seen": 708552,
"step": 1880
},
{
"epoch": 1.689068100358423,
"grad_norm": 21.875,
"learning_rate": 4.928215712508245e-05,
"loss": 0.2246,
"num_input_tokens_seen": 710568,
"step": 1885
},
{
"epoch": 1.6935483870967742,
"grad_norm": 34.75,
"learning_rate": 4.9272825511195316e-05,
"loss": 0.2671,
"num_input_tokens_seen": 712488,
"step": 1890
},
{
"epoch": 1.6980286738351253,
"grad_norm": 58.25,
"learning_rate": 4.9263434530104755e-05,
"loss": 0.085,
"num_input_tokens_seen": 714440,
"step": 1895
},
{
"epoch": 1.7025089605734767,
"grad_norm": 25.5,
"learning_rate": 4.92539842047795e-05,
"loss": 0.121,
"num_input_tokens_seen": 716328,
"step": 1900
},
{
"epoch": 1.706989247311828,
"grad_norm": 25.5,
"learning_rate": 4.924447455833346e-05,
"loss": 0.126,
"num_input_tokens_seen": 718216,
"step": 1905
},
{
"epoch": 1.7114695340501793,
"grad_norm": 0.796875,
"learning_rate": 4.9234905614025594e-05,
"loss": 0.4191,
"num_input_tokens_seen": 719976,
"step": 1910
},
{
"epoch": 1.7159498207885304,
"grad_norm": 44.75,
"learning_rate": 4.922527739525993e-05,
"loss": 0.3575,
"num_input_tokens_seen": 721928,
"step": 1915
},
{
"epoch": 1.7204301075268817,
"grad_norm": 2.65625,
"learning_rate": 4.9215589925585434e-05,
"loss": 0.2411,
"num_input_tokens_seen": 723752,
"step": 1920
},
{
"epoch": 1.7249103942652328,
"grad_norm": 43.25,
"learning_rate": 4.9205843228696036e-05,
"loss": 0.0597,
"num_input_tokens_seen": 725480,
"step": 1925
},
{
"epoch": 1.7293906810035842,
"grad_norm": 34.0,
"learning_rate": 4.9196037328430475e-05,
"loss": 0.2429,
"num_input_tokens_seen": 727400,
"step": 1930
},
{
"epoch": 1.7338709677419355,
"grad_norm": 39.25,
"learning_rate": 4.918617224877232e-05,
"loss": 0.2412,
"num_input_tokens_seen": 729576,
"step": 1935
},
{
"epoch": 1.7383512544802868,
"grad_norm": 0.016357421875,
"learning_rate": 4.917624801384988e-05,
"loss": 0.3517,
"num_input_tokens_seen": 731528,
"step": 1940
},
{
"epoch": 1.7428315412186381,
"grad_norm": 75.0,
"learning_rate": 4.916626464793616e-05,
"loss": 0.0598,
"num_input_tokens_seen": 733448,
"step": 1945
},
{
"epoch": 1.7473118279569892,
"grad_norm": 29.375,
"learning_rate": 4.915622217544875e-05,
"loss": 0.1007,
"num_input_tokens_seen": 735304,
"step": 1950
},
{
"epoch": 1.7517921146953404,
"grad_norm": 0.0172119140625,
"learning_rate": 4.9146120620949854e-05,
"loss": 0.4202,
"num_input_tokens_seen": 737032,
"step": 1955
},
{
"epoch": 1.7562724014336917,
"grad_norm": 65.0,
"learning_rate": 4.9135960009146135e-05,
"loss": 0.4773,
"num_input_tokens_seen": 738856,
"step": 1960
},
{
"epoch": 1.760752688172043,
"grad_norm": 13.0625,
"learning_rate": 4.912574036488874e-05,
"loss": 0.0261,
"num_input_tokens_seen": 740712,
"step": 1965
},
{
"epoch": 1.7652329749103943,
"grad_norm": 76.5,
"learning_rate": 4.9115461713173174e-05,
"loss": 0.4438,
"num_input_tokens_seen": 742568,
"step": 1970
},
{
"epoch": 1.7697132616487457,
"grad_norm": 0.13671875,
"learning_rate": 4.910512407913926e-05,
"loss": 0.1644,
"num_input_tokens_seen": 744584,
"step": 1975
},
{
"epoch": 1.7741935483870968,
"grad_norm": 19.375,
"learning_rate": 4.9094727488071114e-05,
"loss": 0.2322,
"num_input_tokens_seen": 746376,
"step": 1980
},
{
"epoch": 1.778673835125448,
"grad_norm": 3.625,
"learning_rate": 4.9084271965397014e-05,
"loss": 0.601,
"num_input_tokens_seen": 748488,
"step": 1985
},
{
"epoch": 1.7831541218637992,
"grad_norm": 0.16015625,
"learning_rate": 4.907375753668939e-05,
"loss": 0.1492,
"num_input_tokens_seen": 750376,
"step": 1990
},
{
"epoch": 1.7876344086021505,
"grad_norm": 35.5,
"learning_rate": 4.906318422766476e-05,
"loss": 0.2954,
"num_input_tokens_seen": 752104,
"step": 1995
},
{
"epoch": 1.7921146953405018,
"grad_norm": 0.6171875,
"learning_rate": 4.9052552064183624e-05,
"loss": 0.328,
"num_input_tokens_seen": 753960,
"step": 2000
},
{
"epoch": 1.7965949820788532,
"grad_norm": 60.75,
"learning_rate": 4.904186107225046e-05,
"loss": 0.1256,
"num_input_tokens_seen": 755944,
"step": 2005
},
{
"epoch": 1.8010752688172043,
"grad_norm": 12.125,
"learning_rate": 4.903111127801361e-05,
"loss": 0.3917,
"num_input_tokens_seen": 757832,
"step": 2010
},
{
"epoch": 1.8055555555555556,
"grad_norm": 36.5,
"learning_rate": 4.902030270776524e-05,
"loss": 0.5439,
"num_input_tokens_seen": 759816,
"step": 2015
},
{
"epoch": 1.8100358422939067,
"grad_norm": 31.125,
"learning_rate": 4.9009435387941274e-05,
"loss": 0.156,
"num_input_tokens_seen": 761640,
"step": 2020
},
{
"epoch": 1.814516129032258,
"grad_norm": 0.169921875,
"learning_rate": 4.899850934512134e-05,
"loss": 0.1535,
"num_input_tokens_seen": 763400,
"step": 2025
},
{
"epoch": 1.8189964157706093,
"grad_norm": 20.875,
"learning_rate": 4.898752460602866e-05,
"loss": 0.1526,
"num_input_tokens_seen": 765288,
"step": 2030
},
{
"epoch": 1.8234767025089607,
"grad_norm": 2.53125,
"learning_rate": 4.897648119753006e-05,
"loss": 0.0112,
"num_input_tokens_seen": 767144,
"step": 2035
},
{
"epoch": 1.827956989247312,
"grad_norm": 0.125,
"learning_rate": 4.8965379146635816e-05,
"loss": 0.0499,
"num_input_tokens_seen": 769064,
"step": 2040
},
{
"epoch": 1.832437275985663,
"grad_norm": 74.0,
"learning_rate": 4.895421848049968e-05,
"loss": 0.4675,
"num_input_tokens_seen": 770856,
"step": 2045
},
{
"epoch": 1.8369175627240142,
"grad_norm": 40.25,
"learning_rate": 4.894299922641873e-05,
"loss": 0.3426,
"num_input_tokens_seen": 772776,
"step": 2050
},
{
"epoch": 1.8413978494623655,
"grad_norm": 1.6875,
"learning_rate": 4.893172141183335e-05,
"loss": 0.4338,
"num_input_tokens_seen": 774728,
"step": 2055
},
{
"epoch": 1.8458781362007168,
"grad_norm": 0.25390625,
"learning_rate": 4.892038506432717e-05,
"loss": 0.3803,
"num_input_tokens_seen": 776712,
"step": 2060
},
{
"epoch": 1.8503584229390682,
"grad_norm": 0.90625,
"learning_rate": 4.890899021162696e-05,
"loss": 0.0123,
"num_input_tokens_seen": 778472,
"step": 2065
},
{
"epoch": 1.8548387096774195,
"grad_norm": 51.25,
"learning_rate": 4.8897536881602594e-05,
"loss": 0.2825,
"num_input_tokens_seen": 780360,
"step": 2070
},
{
"epoch": 1.8593189964157706,
"grad_norm": 42.0,
"learning_rate": 4.888602510226697e-05,
"loss": 0.4634,
"num_input_tokens_seen": 782280,
"step": 2075
},
{
"epoch": 1.863799283154122,
"grad_norm": 48.25,
"learning_rate": 4.8874454901775936e-05,
"loss": 0.4387,
"num_input_tokens_seen": 784136,
"step": 2080
},
{
"epoch": 1.868279569892473,
"grad_norm": 14.125,
"learning_rate": 4.8862826308428244e-05,
"loss": 0.0998,
"num_input_tokens_seen": 785960,
"step": 2085
},
{
"epoch": 1.8727598566308243,
"grad_norm": 33.0,
"learning_rate": 4.885113935066545e-05,
"loss": 0.1383,
"num_input_tokens_seen": 787848,
"step": 2090
},
{
"epoch": 1.8772401433691757,
"grad_norm": 25.75,
"learning_rate": 4.883939405707186e-05,
"loss": 0.3347,
"num_input_tokens_seen": 789800,
"step": 2095
},
{
"epoch": 1.881720430107527,
"grad_norm": 3.5625,
"learning_rate": 4.882759045637449e-05,
"loss": 0.026,
"num_input_tokens_seen": 791592,
"step": 2100
},
{
"epoch": 1.886200716845878,
"grad_norm": 16.5,
"learning_rate": 4.88157285774429e-05,
"loss": 0.5513,
"num_input_tokens_seen": 793544,
"step": 2105
},
{
"epoch": 1.8906810035842294,
"grad_norm": 36.25,
"learning_rate": 4.8803808449289264e-05,
"loss": 0.5565,
"num_input_tokens_seen": 795368,
"step": 2110
},
{
"epoch": 1.8951612903225805,
"grad_norm": 28.5,
"learning_rate": 4.879183010106817e-05,
"loss": 0.299,
"num_input_tokens_seen": 797128,
"step": 2115
},
{
"epoch": 1.8996415770609318,
"grad_norm": 57.75,
"learning_rate": 4.877979356207663e-05,
"loss": 0.0732,
"num_input_tokens_seen": 798888,
"step": 2120
},
{
"epoch": 1.9041218637992832,
"grad_norm": 100.0,
"learning_rate": 4.876769886175396e-05,
"loss": 0.6387,
"num_input_tokens_seen": 800616,
"step": 2125
},
{
"epoch": 1.9086021505376345,
"grad_norm": 0.77734375,
"learning_rate": 4.8755546029681746e-05,
"loss": 0.1866,
"num_input_tokens_seen": 802472,
"step": 2130
},
{
"epoch": 1.9130824372759858,
"grad_norm": 69.0,
"learning_rate": 4.874333509558375e-05,
"loss": 0.2513,
"num_input_tokens_seen": 804328,
"step": 2135
},
{
"epoch": 1.917562724014337,
"grad_norm": 0.040283203125,
"learning_rate": 4.873106608932585e-05,
"loss": 0.1903,
"num_input_tokens_seen": 806152,
"step": 2140
},
{
"epoch": 1.922043010752688,
"grad_norm": 41.25,
"learning_rate": 4.871873904091593e-05,
"loss": 0.1546,
"num_input_tokens_seen": 808168,
"step": 2145
},
{
"epoch": 1.9265232974910393,
"grad_norm": 14.75,
"learning_rate": 4.870635398050387e-05,
"loss": 0.2671,
"num_input_tokens_seen": 810056,
"step": 2150
},
{
"epoch": 1.9310035842293907,
"grad_norm": 0.173828125,
"learning_rate": 4.8693910938381404e-05,
"loss": 0.067,
"num_input_tokens_seen": 812008,
"step": 2155
},
{
"epoch": 1.935483870967742,
"grad_norm": 50.25,
"learning_rate": 4.868140994498211e-05,
"loss": 0.5541,
"num_input_tokens_seen": 813736,
"step": 2160
},
{
"epoch": 1.9399641577060933,
"grad_norm": 41.25,
"learning_rate": 4.86688510308813e-05,
"loss": 0.1515,
"num_input_tokens_seen": 815752,
"step": 2165
},
{
"epoch": 1.9444444444444444,
"grad_norm": 51.25,
"learning_rate": 4.865623422679593e-05,
"loss": 0.2993,
"num_input_tokens_seen": 817544,
"step": 2170
},
{
"epoch": 1.9489247311827957,
"grad_norm": 76.5,
"learning_rate": 4.864355956358454e-05,
"loss": 0.2149,
"num_input_tokens_seen": 819432,
"step": 2175
},
{
"epoch": 1.9534050179211468,
"grad_norm": 11.0625,
"learning_rate": 4.8630827072247244e-05,
"loss": 0.4008,
"num_input_tokens_seen": 821224,
"step": 2180
},
{
"epoch": 1.9578853046594982,
"grad_norm": 16.75,
"learning_rate": 4.8618036783925516e-05,
"loss": 0.1681,
"num_input_tokens_seen": 822984,
"step": 2185
},
{
"epoch": 1.9623655913978495,
"grad_norm": 40.5,
"learning_rate": 4.860518872990223e-05,
"loss": 0.1108,
"num_input_tokens_seen": 824968,
"step": 2190
},
{
"epoch": 1.9668458781362008,
"grad_norm": 36.5,
"learning_rate": 4.859228294160155e-05,
"loss": 0.448,
"num_input_tokens_seen": 826984,
"step": 2195
},
{
"epoch": 1.971326164874552,
"grad_norm": 3.375,
"learning_rate": 4.857931945058884e-05,
"loss": 0.1038,
"num_input_tokens_seen": 828872,
"step": 2200
},
{
"epoch": 1.9758064516129032,
"grad_norm": 0.1611328125,
"learning_rate": 4.856629828857059e-05,
"loss": 0.0145,
"num_input_tokens_seen": 830760,
"step": 2205
},
{
"epoch": 1.9802867383512543,
"grad_norm": 22.5,
"learning_rate": 4.855321948739435e-05,
"loss": 0.1439,
"num_input_tokens_seen": 832712,
"step": 2210
},
{
"epoch": 1.9847670250896057,
"grad_norm": 0.96484375,
"learning_rate": 4.8540083079048645e-05,
"loss": 0.1092,
"num_input_tokens_seen": 834696,
"step": 2215
},
{
"epoch": 1.989247311827957,
"grad_norm": 0.5078125,
"learning_rate": 4.85268890956629e-05,
"loss": 0.4771,
"num_input_tokens_seen": 836648,
"step": 2220
},
{
"epoch": 1.9937275985663083,
"grad_norm": 0.291015625,
"learning_rate": 4.851363756950733e-05,
"loss": 0.6373,
"num_input_tokens_seen": 838760,
"step": 2225
},
{
"epoch": 1.9982078853046596,
"grad_norm": 6.40625,
"learning_rate": 4.8500328532992945e-05,
"loss": 0.1181,
"num_input_tokens_seen": 840584,
"step": 2230
},
{
"epoch": 2.0,
"eval_loss": 0.30103394389152527,
"eval_runtime": 9.6528,
"eval_samples_per_second": 51.384,
"eval_steps_per_second": 12.846,
"num_input_tokens_seen": 841024,
"step": 2232
},
{
"epoch": 2.002688172043011,
"grad_norm": 26.0,
"learning_rate": 4.848696201867138e-05,
"loss": 0.0222,
"num_input_tokens_seen": 842272,
"step": 2235
},
{
"epoch": 2.007168458781362,
"grad_norm": 1.265625,
"learning_rate": 4.847353805923484e-05,
"loss": 0.1276,
"num_input_tokens_seen": 844000,
"step": 2240
},
{
"epoch": 2.011648745519713,
"grad_norm": 0.71875,
"learning_rate": 4.846005668751605e-05,
"loss": 0.216,
"num_input_tokens_seen": 845760,
"step": 2245
},
{
"epoch": 2.0161290322580645,
"grad_norm": 21.625,
"learning_rate": 4.844651793648817e-05,
"loss": 0.2396,
"num_input_tokens_seen": 847776,
"step": 2250
},
{
"epoch": 2.020609318996416,
"grad_norm": 0.77734375,
"learning_rate": 4.843292183926466e-05,
"loss": 0.249,
"num_input_tokens_seen": 849728,
"step": 2255
},
{
"epoch": 2.025089605734767,
"grad_norm": 85.5,
"learning_rate": 4.841926842909928e-05,
"loss": 0.3291,
"num_input_tokens_seen": 851584,
"step": 2260
},
{
"epoch": 2.0295698924731185,
"grad_norm": 20.875,
"learning_rate": 4.840555773938594e-05,
"loss": 0.1802,
"num_input_tokens_seen": 853408,
"step": 2265
},
{
"epoch": 2.0340501792114694,
"grad_norm": 66.0,
"learning_rate": 4.839178980365866e-05,
"loss": 0.3472,
"num_input_tokens_seen": 855360,
"step": 2270
},
{
"epoch": 2.0385304659498207,
"grad_norm": 1.2265625,
"learning_rate": 4.8377964655591465e-05,
"loss": 0.0127,
"num_input_tokens_seen": 857504,
"step": 2275
},
{
"epoch": 2.043010752688172,
"grad_norm": 39.75,
"learning_rate": 4.8364082328998314e-05,
"loss": 0.6509,
"num_input_tokens_seen": 859360,
"step": 2280
},
{
"epoch": 2.0474910394265233,
"grad_norm": 6.40625,
"learning_rate": 4.835014285783303e-05,
"loss": 0.2578,
"num_input_tokens_seen": 861312,
"step": 2285
},
{
"epoch": 2.0519713261648747,
"grad_norm": 72.0,
"learning_rate": 4.833614627618918e-05,
"loss": 0.3848,
"num_input_tokens_seen": 863168,
"step": 2290
},
{
"epoch": 2.056451612903226,
"grad_norm": 1.2890625,
"learning_rate": 4.832209261830002e-05,
"loss": 0.0045,
"num_input_tokens_seen": 865184,
"step": 2295
},
{
"epoch": 2.060931899641577,
"grad_norm": 41.75,
"learning_rate": 4.8307981918538405e-05,
"loss": 0.4656,
"num_input_tokens_seen": 867168,
"step": 2300
},
{
"epoch": 2.065412186379928,
"grad_norm": 7.4375,
"learning_rate": 4.829381421141671e-05,
"loss": 0.7217,
"num_input_tokens_seen": 869056,
"step": 2305
},
{
"epoch": 2.0698924731182795,
"grad_norm": 46.0,
"learning_rate": 4.827958953158675e-05,
"loss": 0.2643,
"num_input_tokens_seen": 870816,
"step": 2310
},
{
"epoch": 2.074372759856631,
"grad_norm": 0.11083984375,
"learning_rate": 4.8265307913839655e-05,
"loss": 0.2274,
"num_input_tokens_seen": 872736,
"step": 2315
},
{
"epoch": 2.078853046594982,
"grad_norm": 0.019287109375,
"learning_rate": 4.825096939310584e-05,
"loss": 0.0393,
"num_input_tokens_seen": 874656,
"step": 2320
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.50390625,
"learning_rate": 4.823657400445489e-05,
"loss": 0.1225,
"num_input_tokens_seen": 876576,
"step": 2325
},
{
"epoch": 2.0878136200716844,
"grad_norm": 34.25,
"learning_rate": 4.822212178309548e-05,
"loss": 0.014,
"num_input_tokens_seen": 878528,
"step": 2330
},
{
"epoch": 2.0922939068100357,
"grad_norm": 110.0,
"learning_rate": 4.820761276437527e-05,
"loss": 0.3721,
"num_input_tokens_seen": 880288,
"step": 2335
},
{
"epoch": 2.096774193548387,
"grad_norm": 0.1357421875,
"learning_rate": 4.819304698378089e-05,
"loss": 0.0933,
"num_input_tokens_seen": 882304,
"step": 2340
},
{
"epoch": 2.1012544802867383,
"grad_norm": 33.75,
"learning_rate": 4.817842447693771e-05,
"loss": 0.177,
"num_input_tokens_seen": 884256,
"step": 2345
},
{
"epoch": 2.1057347670250897,
"grad_norm": 94.0,
"learning_rate": 4.816374527960994e-05,
"loss": 0.6941,
"num_input_tokens_seen": 886240,
"step": 2350
},
{
"epoch": 2.110215053763441,
"grad_norm": 0.20703125,
"learning_rate": 4.8149009427700377e-05,
"loss": 0.0897,
"num_input_tokens_seen": 888064,
"step": 2355
},
{
"epoch": 2.1146953405017923,
"grad_norm": 34.5,
"learning_rate": 4.813421695725041e-05,
"loss": 0.6207,
"num_input_tokens_seen": 889728,
"step": 2360
},
{
"epoch": 2.119175627240143,
"grad_norm": 24.0,
"learning_rate": 4.81193679044399e-05,
"loss": 0.2703,
"num_input_tokens_seen": 891520,
"step": 2365
},
{
"epoch": 2.1236559139784945,
"grad_norm": 52.25,
"learning_rate": 4.810446230558714e-05,
"loss": 0.1226,
"num_input_tokens_seen": 893344,
"step": 2370
},
{
"epoch": 2.128136200716846,
"grad_norm": 29.875,
"learning_rate": 4.8089500197148654e-05,
"loss": 0.0878,
"num_input_tokens_seen": 895328,
"step": 2375
},
{
"epoch": 2.132616487455197,
"grad_norm": 27.875,
"learning_rate": 4.807448161571922e-05,
"loss": 0.0906,
"num_input_tokens_seen": 897248,
"step": 2380
},
{
"epoch": 2.1370967741935485,
"grad_norm": 0.7109375,
"learning_rate": 4.805940659803174e-05,
"loss": 0.3801,
"num_input_tokens_seen": 899200,
"step": 2385
},
{
"epoch": 2.1415770609319,
"grad_norm": 48.0,
"learning_rate": 4.804427518095715e-05,
"loss": 0.2034,
"num_input_tokens_seen": 901120,
"step": 2390
},
{
"epoch": 2.1460573476702507,
"grad_norm": 25.875,
"learning_rate": 4.802908740150431e-05,
"loss": 0.1236,
"num_input_tokens_seen": 903040,
"step": 2395
},
{
"epoch": 2.150537634408602,
"grad_norm": 36.25,
"learning_rate": 4.801384329681996e-05,
"loss": 0.3222,
"num_input_tokens_seen": 904960,
"step": 2400
},
{
"epoch": 2.1550179211469533,
"grad_norm": 4.09375,
"learning_rate": 4.799854290418858e-05,
"loss": 0.4735,
"num_input_tokens_seen": 906816,
"step": 2405
},
{
"epoch": 2.1594982078853047,
"grad_norm": 15.5625,
"learning_rate": 4.798318626103233e-05,
"loss": 0.0533,
"num_input_tokens_seen": 908544,
"step": 2410
},
{
"epoch": 2.163978494623656,
"grad_norm": 0.90234375,
"learning_rate": 4.7967773404910946e-05,
"loss": 0.2093,
"num_input_tokens_seen": 910336,
"step": 2415
},
{
"epoch": 2.1684587813620073,
"grad_norm": 37.75,
"learning_rate": 4.7952304373521644e-05,
"loss": 0.0997,
"num_input_tokens_seen": 912288,
"step": 2420
},
{
"epoch": 2.1729390681003586,
"grad_norm": 15.5625,
"learning_rate": 4.793677920469906e-05,
"loss": 0.2692,
"num_input_tokens_seen": 914144,
"step": 2425
},
{
"epoch": 2.1774193548387095,
"grad_norm": 50.5,
"learning_rate": 4.7921197936415106e-05,
"loss": 0.311,
"num_input_tokens_seen": 915904,
"step": 2430
},
{
"epoch": 2.181899641577061,
"grad_norm": 13.4375,
"learning_rate": 4.7905560606778924e-05,
"loss": 0.1883,
"num_input_tokens_seen": 917824,
"step": 2435
},
{
"epoch": 2.186379928315412,
"grad_norm": 29.375,
"learning_rate": 4.7889867254036755e-05,
"loss": 0.8453,
"num_input_tokens_seen": 919744,
"step": 2440
},
{
"epoch": 2.1908602150537635,
"grad_norm": 3.1875,
"learning_rate": 4.787411791657188e-05,
"loss": 0.0186,
"num_input_tokens_seen": 921728,
"step": 2445
},
{
"epoch": 2.195340501792115,
"grad_norm": 29.625,
"learning_rate": 4.785831263290449e-05,
"loss": 0.1221,
"num_input_tokens_seen": 923648,
"step": 2450
},
{
"epoch": 2.199820788530466,
"grad_norm": 81.5,
"learning_rate": 4.784245144169162e-05,
"loss": 0.2469,
"num_input_tokens_seen": 925536,
"step": 2455
},
{
"epoch": 2.204301075268817,
"grad_norm": 63.5,
"learning_rate": 4.782653438172705e-05,
"loss": 0.2926,
"num_input_tokens_seen": 927392,
"step": 2460
},
{
"epoch": 2.2087813620071683,
"grad_norm": 51.0,
"learning_rate": 4.781056149194121e-05,
"loss": 0.8263,
"num_input_tokens_seen": 929536,
"step": 2465
},
{
"epoch": 2.2132616487455197,
"grad_norm": 37.0,
"learning_rate": 4.779453281140107e-05,
"loss": 0.5175,
"num_input_tokens_seen": 931520,
"step": 2470
},
{
"epoch": 2.217741935483871,
"grad_norm": 4.21875,
"learning_rate": 4.777844837931005e-05,
"loss": 0.3372,
"num_input_tokens_seen": 933504,
"step": 2475
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.5625,
"learning_rate": 4.776230823500793e-05,
"loss": 0.0259,
"num_input_tokens_seen": 935360,
"step": 2480
},
{
"epoch": 2.2267025089605736,
"grad_norm": 41.75,
"learning_rate": 4.7746112417970766e-05,
"loss": 0.1463,
"num_input_tokens_seen": 937440,
"step": 2485
},
{
"epoch": 2.2311827956989245,
"grad_norm": 31.125,
"learning_rate": 4.772986096781078e-05,
"loss": 0.1728,
"num_input_tokens_seen": 939424,
"step": 2490
},
{
"epoch": 2.235663082437276,
"grad_norm": 42.75,
"learning_rate": 4.771355392427624e-05,
"loss": 0.2997,
"num_input_tokens_seen": 941312,
"step": 2495
},
{
"epoch": 2.240143369175627,
"grad_norm": 17.0,
"learning_rate": 4.769719132725141e-05,
"loss": 0.0581,
"num_input_tokens_seen": 943264,
"step": 2500
},
{
"epoch": 2.2446236559139785,
"grad_norm": 0.71484375,
"learning_rate": 4.768077321675643e-05,
"loss": 0.0267,
"num_input_tokens_seen": 945248,
"step": 2505
},
{
"epoch": 2.24910394265233,
"grad_norm": 70.5,
"learning_rate": 4.766429963294719e-05,
"loss": 0.0792,
"num_input_tokens_seen": 947168,
"step": 2510
},
{
"epoch": 2.253584229390681,
"grad_norm": 0.34375,
"learning_rate": 4.7647770616115265e-05,
"loss": 0.0358,
"num_input_tokens_seen": 948960,
"step": 2515
},
{
"epoch": 2.258064516129032,
"grad_norm": 29.625,
"learning_rate": 4.763118620668785e-05,
"loss": 0.1084,
"num_input_tokens_seen": 950688,
"step": 2520
},
{
"epoch": 2.2625448028673834,
"grad_norm": 108.0,
"learning_rate": 4.761454644522757e-05,
"loss": 0.1494,
"num_input_tokens_seen": 952608,
"step": 2525
},
{
"epoch": 2.2670250896057347,
"grad_norm": 3.40625,
"learning_rate": 4.759785137243245e-05,
"loss": 0.1925,
"num_input_tokens_seen": 954368,
"step": 2530
},
{
"epoch": 2.271505376344086,
"grad_norm": 0.0162353515625,
"learning_rate": 4.758110102913581e-05,
"loss": 0.0402,
"num_input_tokens_seen": 956160,
"step": 2535
},
{
"epoch": 2.2759856630824373,
"grad_norm": 36.25,
"learning_rate": 4.7564295456306136e-05,
"loss": 0.2656,
"num_input_tokens_seen": 957984,
"step": 2540
},
{
"epoch": 2.2804659498207887,
"grad_norm": 22.25,
"learning_rate": 4.7547434695047e-05,
"loss": 0.1124,
"num_input_tokens_seen": 959872,
"step": 2545
},
{
"epoch": 2.28494623655914,
"grad_norm": 82.0,
"learning_rate": 4.7530518786596954e-05,
"loss": 0.1503,
"num_input_tokens_seen": 961664,
"step": 2550
},
{
"epoch": 2.289426523297491,
"grad_norm": 0.0230712890625,
"learning_rate": 4.7513547772329446e-05,
"loss": 0.2294,
"num_input_tokens_seen": 963712,
"step": 2555
},
{
"epoch": 2.293906810035842,
"grad_norm": 0.08544921875,
"learning_rate": 4.749652169375268e-05,
"loss": 0.0015,
"num_input_tokens_seen": 965696,
"step": 2560
},
{
"epoch": 2.2983870967741935,
"grad_norm": 43.0,
"learning_rate": 4.747944059250955e-05,
"loss": 0.5989,
"num_input_tokens_seen": 967488,
"step": 2565
},
{
"epoch": 2.302867383512545,
"grad_norm": 77.5,
"learning_rate": 4.746230451037752e-05,
"loss": 0.4637,
"num_input_tokens_seen": 969344,
"step": 2570
},
{
"epoch": 2.307347670250896,
"grad_norm": 0.05419921875,
"learning_rate": 4.7445113489268544e-05,
"loss": 0.2774,
"num_input_tokens_seen": 971168,
"step": 2575
},
{
"epoch": 2.3118279569892475,
"grad_norm": 42.75,
"learning_rate": 4.7427867571228926e-05,
"loss": 0.4368,
"num_input_tokens_seen": 973088,
"step": 2580
},
{
"epoch": 2.3163082437275984,
"grad_norm": 0.04541015625,
"learning_rate": 4.741056679843926e-05,
"loss": 0.0937,
"num_input_tokens_seen": 975008,
"step": 2585
},
{
"epoch": 2.3207885304659497,
"grad_norm": 0.48046875,
"learning_rate": 4.739321121321428e-05,
"loss": 0.0161,
"num_input_tokens_seen": 976864,
"step": 2590
},
{
"epoch": 2.325268817204301,
"grad_norm": 45.75,
"learning_rate": 4.737580085800282e-05,
"loss": 0.486,
"num_input_tokens_seen": 978752,
"step": 2595
},
{
"epoch": 2.3297491039426523,
"grad_norm": 0.328125,
"learning_rate": 4.735833577538762e-05,
"loss": 0.277,
"num_input_tokens_seen": 980576,
"step": 2600
},
{
"epoch": 2.3342293906810037,
"grad_norm": 4.75,
"learning_rate": 4.734081600808531e-05,
"loss": 0.4868,
"num_input_tokens_seen": 982336,
"step": 2605
},
{
"epoch": 2.338709677419355,
"grad_norm": 5.34375,
"learning_rate": 4.732324159894627e-05,
"loss": 0.3468,
"num_input_tokens_seen": 984064,
"step": 2610
},
{
"epoch": 2.3431899641577063,
"grad_norm": 36.25,
"learning_rate": 4.730561259095451e-05,
"loss": 0.4078,
"num_input_tokens_seen": 985888,
"step": 2615
},
{
"epoch": 2.347670250896057,
"grad_norm": 0.310546875,
"learning_rate": 4.728792902722759e-05,
"loss": 0.309,
"num_input_tokens_seen": 987712,
"step": 2620
},
{
"epoch": 2.3521505376344085,
"grad_norm": 2.0625,
"learning_rate": 4.7270190951016493e-05,
"loss": 0.1506,
"num_input_tokens_seen": 989568,
"step": 2625
},
{
"epoch": 2.35663082437276,
"grad_norm": 0.2138671875,
"learning_rate": 4.7252398405705535e-05,
"loss": 0.089,
"num_input_tokens_seen": 991584,
"step": 2630
},
{
"epoch": 2.361111111111111,
"grad_norm": 1.0390625,
"learning_rate": 4.723455143481227e-05,
"loss": 0.2575,
"num_input_tokens_seen": 993472,
"step": 2635
},
{
"epoch": 2.3655913978494625,
"grad_norm": 57.25,
"learning_rate": 4.721665008198734e-05,
"loss": 0.2878,
"num_input_tokens_seen": 995296,
"step": 2640
},
{
"epoch": 2.370071684587814,
"grad_norm": 12.3125,
"learning_rate": 4.719869439101442e-05,
"loss": 0.3644,
"num_input_tokens_seen": 997120,
"step": 2645
},
{
"epoch": 2.3745519713261647,
"grad_norm": 0.85546875,
"learning_rate": 4.718068440581007e-05,
"loss": 0.0694,
"num_input_tokens_seen": 999104,
"step": 2650
},
{
"epoch": 2.379032258064516,
"grad_norm": 1.65625,
"learning_rate": 4.7162620170423655e-05,
"loss": 0.0827,
"num_input_tokens_seen": 1000864,
"step": 2655
},
{
"epoch": 2.3835125448028673,
"grad_norm": 54.5,
"learning_rate": 4.714450172903722e-05,
"loss": 0.1444,
"num_input_tokens_seen": 1002976,
"step": 2660
},
{
"epoch": 2.3879928315412187,
"grad_norm": 29.0,
"learning_rate": 4.712632912596538e-05,
"loss": 0.451,
"num_input_tokens_seen": 1004672,
"step": 2665
},
{
"epoch": 2.39247311827957,
"grad_norm": 37.0,
"learning_rate": 4.710810240565526e-05,
"loss": 0.4115,
"num_input_tokens_seen": 1006560,
"step": 2670
},
{
"epoch": 2.3969534050179213,
"grad_norm": 112.0,
"learning_rate": 4.7089821612686295e-05,
"loss": 0.4026,
"num_input_tokens_seen": 1008384,
"step": 2675
},
{
"epoch": 2.4014336917562726,
"grad_norm": 0.1484375,
"learning_rate": 4.707148679177021e-05,
"loss": 0.0386,
"num_input_tokens_seen": 1010208,
"step": 2680
},
{
"epoch": 2.4059139784946235,
"grad_norm": 47.25,
"learning_rate": 4.705309798775084e-05,
"loss": 0.1642,
"num_input_tokens_seen": 1012128,
"step": 2685
},
{
"epoch": 2.410394265232975,
"grad_norm": 66.5,
"learning_rate": 4.703465524560409e-05,
"loss": 0.3227,
"num_input_tokens_seen": 1014080,
"step": 2690
},
{
"epoch": 2.414874551971326,
"grad_norm": 1.03125,
"learning_rate": 4.7016158610437764e-05,
"loss": 0.4077,
"num_input_tokens_seen": 1015968,
"step": 2695
},
{
"epoch": 2.4193548387096775,
"grad_norm": 1.515625,
"learning_rate": 4.69976081274915e-05,
"loss": 0.0207,
"num_input_tokens_seen": 1017696,
"step": 2700
},
{
"epoch": 2.423835125448029,
"grad_norm": 19.25,
"learning_rate": 4.6979003842136596e-05,
"loss": 0.3542,
"num_input_tokens_seen": 1019552,
"step": 2705
},
{
"epoch": 2.4283154121863797,
"grad_norm": 89.0,
"learning_rate": 4.6960345799875995e-05,
"loss": 0.3138,
"num_input_tokens_seen": 1021344,
"step": 2710
},
{
"epoch": 2.432795698924731,
"grad_norm": 0.072265625,
"learning_rate": 4.694163404634408e-05,
"loss": 0.3215,
"num_input_tokens_seen": 1023136,
"step": 2715
},
{
"epoch": 2.4372759856630823,
"grad_norm": 98.0,
"learning_rate": 4.692286862730663e-05,
"loss": 0.3449,
"num_input_tokens_seen": 1024960,
"step": 2720
},
{
"epoch": 2.4417562724014337,
"grad_norm": 11.5,
"learning_rate": 4.690404958866066e-05,
"loss": 0.034,
"num_input_tokens_seen": 1026720,
"step": 2725
},
{
"epoch": 2.446236559139785,
"grad_norm": 20.75,
"learning_rate": 4.6885176976434344e-05,
"loss": 0.2984,
"num_input_tokens_seen": 1028544,
"step": 2730
},
{
"epoch": 2.4507168458781363,
"grad_norm": 1.1875,
"learning_rate": 4.6866250836786876e-05,
"loss": 0.4133,
"num_input_tokens_seen": 1030368,
"step": 2735
},
{
"epoch": 2.4551971326164876,
"grad_norm": 11.9375,
"learning_rate": 4.684727121600838e-05,
"loss": 0.0667,
"num_input_tokens_seen": 1032224,
"step": 2740
},
{
"epoch": 2.4596774193548385,
"grad_norm": 1.3046875,
"learning_rate": 4.6828238160519775e-05,
"loss": 0.0825,
"num_input_tokens_seen": 1034112,
"step": 2745
},
{
"epoch": 2.46415770609319,
"grad_norm": 39.5,
"learning_rate": 4.680915171687269e-05,
"loss": 0.2739,
"num_input_tokens_seen": 1036000,
"step": 2750
},
{
"epoch": 2.468637992831541,
"grad_norm": 0.0390625,
"learning_rate": 4.6790011931749314e-05,
"loss": 0.433,
"num_input_tokens_seen": 1037888,
"step": 2755
},
{
"epoch": 2.4731182795698925,
"grad_norm": 48.0,
"learning_rate": 4.6770818851962305e-05,
"loss": 0.0424,
"num_input_tokens_seen": 1039776,
"step": 2760
},
{
"epoch": 2.477598566308244,
"grad_norm": 0.08203125,
"learning_rate": 4.675157252445467e-05,
"loss": 0.1378,
"num_input_tokens_seen": 1041600,
"step": 2765
},
{
"epoch": 2.482078853046595,
"grad_norm": 34.25,
"learning_rate": 4.673227299629966e-05,
"loss": 0.1319,
"num_input_tokens_seen": 1043456,
"step": 2770
},
{
"epoch": 2.486559139784946,
"grad_norm": 18.125,
"learning_rate": 4.6712920314700624e-05,
"loss": 0.2271,
"num_input_tokens_seen": 1045248,
"step": 2775
},
{
"epoch": 2.4910394265232974,
"grad_norm": 27.25,
"learning_rate": 4.6693514526990955e-05,
"loss": 0.2379,
"num_input_tokens_seen": 1047168,
"step": 2780
},
{
"epoch": 2.4955197132616487,
"grad_norm": 0.6640625,
"learning_rate": 4.6674055680633885e-05,
"loss": 0.1001,
"num_input_tokens_seen": 1049056,
"step": 2785
},
{
"epoch": 2.5,
"grad_norm": 0.0830078125,
"learning_rate": 4.665454382322246e-05,
"loss": 0.0775,
"num_input_tokens_seen": 1051168,
"step": 2790
},
{
"epoch": 2.5,
"eval_loss": 0.3032233417034149,
"eval_runtime": 9.669,
"eval_samples_per_second": 51.298,
"eval_steps_per_second": 12.824,
"num_input_tokens_seen": 1051168,
"step": 2790
},
{
"epoch": 2.5044802867383513,
"grad_norm": 18.375,
"learning_rate": 4.663497900247936e-05,
"loss": 0.1346,
"num_input_tokens_seen": 1053120,
"step": 2795
},
{
"epoch": 2.5089605734767026,
"grad_norm": 19.25,
"learning_rate": 4.6615361266256805e-05,
"loss": 0.0693,
"num_input_tokens_seen": 1055008,
"step": 2800
},
{
"epoch": 2.513440860215054,
"grad_norm": 8.625,
"learning_rate": 4.6595690662536436e-05,
"loss": 0.1463,
"num_input_tokens_seen": 1056832,
"step": 2805
},
{
"epoch": 2.517921146953405,
"grad_norm": 79.0,
"learning_rate": 4.657596723942923e-05,
"loss": 0.1714,
"num_input_tokens_seen": 1058656,
"step": 2810
},
{
"epoch": 2.522401433691756,
"grad_norm": 0.95703125,
"learning_rate": 4.65561910451753e-05,
"loss": 0.3259,
"num_input_tokens_seen": 1060416,
"step": 2815
},
{
"epoch": 2.5268817204301075,
"grad_norm": 12.6875,
"learning_rate": 4.653636212814386e-05,
"loss": 0.4379,
"num_input_tokens_seen": 1062176,
"step": 2820
},
{
"epoch": 2.531362007168459,
"grad_norm": 118.5,
"learning_rate": 4.651648053683308e-05,
"loss": 0.4418,
"num_input_tokens_seen": 1064032,
"step": 2825
},
{
"epoch": 2.53584229390681,
"grad_norm": 2.84375,
"learning_rate": 4.649654631986994e-05,
"loss": 0.2399,
"num_input_tokens_seen": 1065920,
"step": 2830
},
{
"epoch": 2.540322580645161,
"grad_norm": 32.25,
"learning_rate": 4.6476559526010146e-05,
"loss": 0.4892,
"num_input_tokens_seen": 1067840,
"step": 2835
},
{
"epoch": 2.5448028673835124,
"grad_norm": 47.25,
"learning_rate": 4.6456520204137996e-05,
"loss": 0.1832,
"num_input_tokens_seen": 1069824,
"step": 2840
},
{
"epoch": 2.5492831541218637,
"grad_norm": 96.5,
"learning_rate": 4.643642840326627e-05,
"loss": 0.3606,
"num_input_tokens_seen": 1071744,
"step": 2845
},
{
"epoch": 2.553763440860215,
"grad_norm": 1.46875,
"learning_rate": 4.64162841725361e-05,
"loss": 0.4519,
"num_input_tokens_seen": 1073536,
"step": 2850
},
{
"epoch": 2.5582437275985663,
"grad_norm": 0.119140625,
"learning_rate": 4.639608756121684e-05,
"loss": 0.3869,
"num_input_tokens_seen": 1075424,
"step": 2855
},
{
"epoch": 2.5627240143369177,
"grad_norm": 0.10205078125,
"learning_rate": 4.637583861870596e-05,
"loss": 0.0681,
"num_input_tokens_seen": 1077472,
"step": 2860
},
{
"epoch": 2.567204301075269,
"grad_norm": 30.125,
"learning_rate": 4.635553739452895e-05,
"loss": 0.359,
"num_input_tokens_seen": 1079296,
"step": 2865
},
{
"epoch": 2.5716845878136203,
"grad_norm": 9.8125,
"learning_rate": 4.6335183938339125e-05,
"loss": 0.3854,
"num_input_tokens_seen": 1081152,
"step": 2870
},
{
"epoch": 2.576164874551971,
"grad_norm": 1.4296875,
"learning_rate": 4.631477829991761e-05,
"loss": 0.1559,
"num_input_tokens_seen": 1083168,
"step": 2875
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.09228515625,
"learning_rate": 4.629432052917309e-05,
"loss": 0.485,
"num_input_tokens_seen": 1084992,
"step": 2880
},
{
"epoch": 2.585125448028674,
"grad_norm": 27.375,
"learning_rate": 4.627381067614182e-05,
"loss": 0.0729,
"num_input_tokens_seen": 1086784,
"step": 2885
},
{
"epoch": 2.589605734767025,
"grad_norm": 91.5,
"learning_rate": 4.625324879098741e-05,
"loss": 0.1319,
"num_input_tokens_seen": 1088608,
"step": 2890
},
{
"epoch": 2.5940860215053765,
"grad_norm": 3.21875,
"learning_rate": 4.6232634924000725e-05,
"loss": 0.396,
"num_input_tokens_seen": 1090592,
"step": 2895
},
{
"epoch": 2.5985663082437274,
"grad_norm": 14.875,
"learning_rate": 4.621196912559978e-05,
"loss": 0.1397,
"num_input_tokens_seen": 1092448,
"step": 2900
},
{
"epoch": 2.6030465949820787,
"grad_norm": 0.052978515625,
"learning_rate": 4.619125144632961e-05,
"loss": 0.006,
"num_input_tokens_seen": 1094368,
"step": 2905
},
{
"epoch": 2.60752688172043,
"grad_norm": 50.25,
"learning_rate": 4.617048193686213e-05,
"loss": 0.3659,
"num_input_tokens_seen": 1096288,
"step": 2910
},
{
"epoch": 2.6120071684587813,
"grad_norm": 0.036865234375,
"learning_rate": 4.614966064799603e-05,
"loss": 0.6803,
"num_input_tokens_seen": 1098240,
"step": 2915
},
{
"epoch": 2.6164874551971327,
"grad_norm": 49.0,
"learning_rate": 4.612878763065664e-05,
"loss": 0.294,
"num_input_tokens_seen": 1100096,
"step": 2920
},
{
"epoch": 2.620967741935484,
"grad_norm": 0.07470703125,
"learning_rate": 4.610786293589581e-05,
"loss": 0.1025,
"num_input_tokens_seen": 1101984,
"step": 2925
},
{
"epoch": 2.6254480286738353,
"grad_norm": 52.25,
"learning_rate": 4.608688661489179e-05,
"loss": 0.4215,
"num_input_tokens_seen": 1104000,
"step": 2930
},
{
"epoch": 2.6299283154121866,
"grad_norm": 31.0,
"learning_rate": 4.60658587189491e-05,
"loss": 0.14,
"num_input_tokens_seen": 1106080,
"step": 2935
},
{
"epoch": 2.6344086021505375,
"grad_norm": 42.0,
"learning_rate": 4.604477929949837e-05,
"loss": 0.4079,
"num_input_tokens_seen": 1108096,
"step": 2940
},
{
"epoch": 2.638888888888889,
"grad_norm": 60.25,
"learning_rate": 4.60236484080963e-05,
"loss": 0.3598,
"num_input_tokens_seen": 1109952,
"step": 2945
},
{
"epoch": 2.64336917562724,
"grad_norm": 0.0986328125,
"learning_rate": 4.600246609642546e-05,
"loss": 0.2179,
"num_input_tokens_seen": 1111840,
"step": 2950
},
{
"epoch": 2.6478494623655915,
"grad_norm": 31.25,
"learning_rate": 4.598123241629416e-05,
"loss": 0.1949,
"num_input_tokens_seen": 1113600,
"step": 2955
},
{
"epoch": 2.652329749103943,
"grad_norm": 13.125,
"learning_rate": 4.5959947419636394e-05,
"loss": 0.1126,
"num_input_tokens_seen": 1115424,
"step": 2960
},
{
"epoch": 2.6568100358422937,
"grad_norm": 0.69140625,
"learning_rate": 4.593861115851163e-05,
"loss": 0.1644,
"num_input_tokens_seen": 1117376,
"step": 2965
},
{
"epoch": 2.661290322580645,
"grad_norm": 42.5,
"learning_rate": 4.5917223685104735e-05,
"loss": 0.4655,
"num_input_tokens_seen": 1119232,
"step": 2970
},
{
"epoch": 2.6657706093189963,
"grad_norm": 1.1640625,
"learning_rate": 4.5895785051725836e-05,
"loss": 0.0752,
"num_input_tokens_seen": 1121184,
"step": 2975
},
{
"epoch": 2.6702508960573477,
"grad_norm": 53.75,
"learning_rate": 4.587429531081019e-05,
"loss": 0.2294,
"num_input_tokens_seen": 1123424,
"step": 2980
},
{
"epoch": 2.674731182795699,
"grad_norm": 30.5,
"learning_rate": 4.5852754514918034e-05,
"loss": 0.2392,
"num_input_tokens_seen": 1125152,
"step": 2985
},
{
"epoch": 2.6792114695340503,
"grad_norm": 0.291015625,
"learning_rate": 4.58311627167345e-05,
"loss": 0.061,
"num_input_tokens_seen": 1127136,
"step": 2990
},
{
"epoch": 2.6836917562724016,
"grad_norm": 39.25,
"learning_rate": 4.580951996906946e-05,
"loss": 0.0348,
"num_input_tokens_seen": 1128992,
"step": 2995
},
{
"epoch": 2.688172043010753,
"grad_norm": 36.75,
"learning_rate": 4.578782632485738e-05,
"loss": 0.6858,
"num_input_tokens_seen": 1130976,
"step": 3000
},
{
"epoch": 2.692652329749104,
"grad_norm": 0.65625,
"learning_rate": 4.576608183715724e-05,
"loss": 0.2667,
"num_input_tokens_seen": 1132832,
"step": 3005
},
{
"epoch": 2.697132616487455,
"grad_norm": 20.75,
"learning_rate": 4.574428655915235e-05,
"loss": 0.245,
"num_input_tokens_seen": 1134720,
"step": 3010
},
{
"epoch": 2.7016129032258065,
"grad_norm": 32.0,
"learning_rate": 4.572244054415026e-05,
"loss": 0.3798,
"num_input_tokens_seen": 1136576,
"step": 3015
},
{
"epoch": 2.706093189964158,
"grad_norm": 26.25,
"learning_rate": 4.570054384558259e-05,
"loss": 0.1725,
"num_input_tokens_seen": 1138560,
"step": 3020
},
{
"epoch": 2.7105734767025087,
"grad_norm": 15.625,
"learning_rate": 4.5678596517004966e-05,
"loss": 0.1076,
"num_input_tokens_seen": 1140480,
"step": 3025
},
{
"epoch": 2.71505376344086,
"grad_norm": 0.11767578125,
"learning_rate": 4.56565986120968e-05,
"loss": 0.2918,
"num_input_tokens_seen": 1142432,
"step": 3030
},
{
"epoch": 2.7195340501792113,
"grad_norm": 59.25,
"learning_rate": 4.563455018466125e-05,
"loss": 0.5467,
"num_input_tokens_seen": 1144256,
"step": 3035
},
{
"epoch": 2.7240143369175627,
"grad_norm": 0.11328125,
"learning_rate": 4.5612451288624996e-05,
"loss": 0.0189,
"num_input_tokens_seen": 1146240,
"step": 3040
},
{
"epoch": 2.728494623655914,
"grad_norm": 26.875,
"learning_rate": 4.559030197803819e-05,
"loss": 0.0649,
"num_input_tokens_seen": 1148192,
"step": 3045
},
{
"epoch": 2.7329749103942653,
"grad_norm": 22.0,
"learning_rate": 4.5568102307074286e-05,
"loss": 0.16,
"num_input_tokens_seen": 1149984,
"step": 3050
},
{
"epoch": 2.7374551971326166,
"grad_norm": 11.0625,
"learning_rate": 4.554585233002989e-05,
"loss": 0.3895,
"num_input_tokens_seen": 1151872,
"step": 3055
},
{
"epoch": 2.741935483870968,
"grad_norm": 3.296875,
"learning_rate": 4.552355210132467e-05,
"loss": 0.0959,
"num_input_tokens_seen": 1153696,
"step": 3060
},
{
"epoch": 2.746415770609319,
"grad_norm": 14.6875,
"learning_rate": 4.550120167550119e-05,
"loss": 0.2767,
"num_input_tokens_seen": 1155584,
"step": 3065
},
{
"epoch": 2.75089605734767,
"grad_norm": 0.109375,
"learning_rate": 4.54788011072248e-05,
"loss": 0.2045,
"num_input_tokens_seen": 1157376,
"step": 3070
},
{
"epoch": 2.7553763440860215,
"grad_norm": 4.0,
"learning_rate": 4.545635045128347e-05,
"loss": 0.3703,
"num_input_tokens_seen": 1159104,
"step": 3075
},
{
"epoch": 2.759856630824373,
"grad_norm": 14.9375,
"learning_rate": 4.5433849762587685e-05,
"loss": 0.3276,
"num_input_tokens_seen": 1161024,
"step": 3080
},
{
"epoch": 2.764336917562724,
"grad_norm": 0.2890625,
"learning_rate": 4.541129909617031e-05,
"loss": 0.3263,
"num_input_tokens_seen": 1162848,
"step": 3085
},
{
"epoch": 2.768817204301075,
"grad_norm": 13.5,
"learning_rate": 4.5388698507186445e-05,
"loss": 0.0605,
"num_input_tokens_seen": 1164608,
"step": 3090
},
{
"epoch": 2.7732974910394264,
"grad_norm": 80.5,
"learning_rate": 4.536604805091327e-05,
"loss": 0.2655,
"num_input_tokens_seen": 1166368,
"step": 3095
},
{
"epoch": 2.7777777777777777,
"grad_norm": 63.75,
"learning_rate": 4.534334778274997e-05,
"loss": 0.1119,
"num_input_tokens_seen": 1168064,
"step": 3100
},
{
"epoch": 2.782258064516129,
"grad_norm": 50.25,
"learning_rate": 4.532059775821752e-05,
"loss": 0.2074,
"num_input_tokens_seen": 1170016,
"step": 3105
},
{
"epoch": 2.7867383512544803,
"grad_norm": 7.21875,
"learning_rate": 4.529779803295863e-05,
"loss": 0.183,
"num_input_tokens_seen": 1171712,
"step": 3110
},
{
"epoch": 2.7912186379928317,
"grad_norm": 11.4375,
"learning_rate": 4.527494866273753e-05,
"loss": 0.1321,
"num_input_tokens_seen": 1173536,
"step": 3115
},
{
"epoch": 2.795698924731183,
"grad_norm": 94.5,
"learning_rate": 4.525204970343991e-05,
"loss": 0.5104,
"num_input_tokens_seen": 1175456,
"step": 3120
},
{
"epoch": 2.8001792114695343,
"grad_norm": 0.462890625,
"learning_rate": 4.5229101211072736e-05,
"loss": 0.175,
"num_input_tokens_seen": 1177536,
"step": 3125
},
{
"epoch": 2.804659498207885,
"grad_norm": 96.0,
"learning_rate": 4.52061032417641e-05,
"loss": 0.353,
"num_input_tokens_seen": 1179328,
"step": 3130
},
{
"epoch": 2.8091397849462365,
"grad_norm": 36.0,
"learning_rate": 4.518305585176313e-05,
"loss": 0.1537,
"num_input_tokens_seen": 1181152,
"step": 3135
},
{
"epoch": 2.813620071684588,
"grad_norm": 2.609375,
"learning_rate": 4.5159959097439833e-05,
"loss": 0.0965,
"num_input_tokens_seen": 1183104,
"step": 3140
},
{
"epoch": 2.818100358422939,
"grad_norm": 36.75,
"learning_rate": 4.513681303528493e-05,
"loss": 0.284,
"num_input_tokens_seen": 1184960,
"step": 3145
},
{
"epoch": 2.8225806451612905,
"grad_norm": 1.84375,
"learning_rate": 4.511361772190975e-05,
"loss": 0.238,
"num_input_tokens_seen": 1186784,
"step": 3150
},
{
"epoch": 2.8270609318996414,
"grad_norm": 40.75,
"learning_rate": 4.50903732140461e-05,
"loss": 0.2708,
"num_input_tokens_seen": 1188960,
"step": 3155
},
{
"epoch": 2.8315412186379927,
"grad_norm": 0.1240234375,
"learning_rate": 4.506707956854608e-05,
"loss": 0.2754,
"num_input_tokens_seen": 1190784,
"step": 3160
},
{
"epoch": 2.836021505376344,
"grad_norm": 0.671875,
"learning_rate": 4.5043736842382e-05,
"loss": 0.6852,
"num_input_tokens_seen": 1192896,
"step": 3165
},
{
"epoch": 2.8405017921146953,
"grad_norm": 17.5,
"learning_rate": 4.5020345092646176e-05,
"loss": 0.2619,
"num_input_tokens_seen": 1195008,
"step": 3170
},
{
"epoch": 2.8449820788530467,
"grad_norm": 0.0869140625,
"learning_rate": 4.4996904376550876e-05,
"loss": 0.3451,
"num_input_tokens_seen": 1196800,
"step": 3175
},
{
"epoch": 2.849462365591398,
"grad_norm": 0.22265625,
"learning_rate": 4.497341475142808e-05,
"loss": 0.1641,
"num_input_tokens_seen": 1198688,
"step": 3180
},
{
"epoch": 2.8539426523297493,
"grad_norm": 10.0625,
"learning_rate": 4.494987627472943e-05,
"loss": 0.0617,
"num_input_tokens_seen": 1200704,
"step": 3185
},
{
"epoch": 2.8584229390681006,
"grad_norm": 0.173828125,
"learning_rate": 4.492628900402604e-05,
"loss": 0.4403,
"num_input_tokens_seen": 1202560,
"step": 3190
},
{
"epoch": 2.8629032258064515,
"grad_norm": 2.953125,
"learning_rate": 4.4902652997008365e-05,
"loss": 0.0074,
"num_input_tokens_seen": 1204448,
"step": 3195
},
{
"epoch": 2.867383512544803,
"grad_norm": 31.5,
"learning_rate": 4.487896831148605e-05,
"loss": 0.1876,
"num_input_tokens_seen": 1206400,
"step": 3200
},
{
"epoch": 2.871863799283154,
"grad_norm": 40.5,
"learning_rate": 4.48552350053878e-05,
"loss": 0.2149,
"num_input_tokens_seen": 1208160,
"step": 3205
},
{
"epoch": 2.8763440860215055,
"grad_norm": 9.75,
"learning_rate": 4.483145313676127e-05,
"loss": 0.2587,
"num_input_tokens_seen": 1209920,
"step": 3210
},
{
"epoch": 2.8808243727598564,
"grad_norm": 0.158203125,
"learning_rate": 4.480762276377284e-05,
"loss": 0.0023,
"num_input_tokens_seen": 1211872,
"step": 3215
},
{
"epoch": 2.8853046594982077,
"grad_norm": 12.1875,
"learning_rate": 4.4783743944707576e-05,
"loss": 0.1927,
"num_input_tokens_seen": 1213856,
"step": 3220
},
{
"epoch": 2.889784946236559,
"grad_norm": 4.9375,
"learning_rate": 4.475981673796899e-05,
"loss": 0.55,
"num_input_tokens_seen": 1215680,
"step": 3225
},
{
"epoch": 2.8942652329749103,
"grad_norm": 53.25,
"learning_rate": 4.473584120207896e-05,
"loss": 0.2385,
"num_input_tokens_seen": 1217600,
"step": 3230
},
{
"epoch": 2.8987455197132617,
"grad_norm": 49.25,
"learning_rate": 4.471181739567758e-05,
"loss": 0.5878,
"num_input_tokens_seen": 1219488,
"step": 3235
},
{
"epoch": 2.903225806451613,
"grad_norm": 41.5,
"learning_rate": 4.468774537752299e-05,
"loss": 0.9067,
"num_input_tokens_seen": 1221216,
"step": 3240
},
{
"epoch": 2.9077060931899643,
"grad_norm": 61.25,
"learning_rate": 4.466362520649125e-05,
"loss": 0.1145,
"num_input_tokens_seen": 1222944,
"step": 3245
},
{
"epoch": 2.9121863799283156,
"grad_norm": 43.5,
"learning_rate": 4.463945694157621e-05,
"loss": 0.3004,
"num_input_tokens_seen": 1224832,
"step": 3250
},
{
"epoch": 2.9166666666666665,
"grad_norm": 64.0,
"learning_rate": 4.461524064188931e-05,
"loss": 0.4856,
"num_input_tokens_seen": 1226560,
"step": 3255
},
{
"epoch": 2.921146953405018,
"grad_norm": 61.0,
"learning_rate": 4.459097636665953e-05,
"loss": 0.1785,
"num_input_tokens_seen": 1228480,
"step": 3260
},
{
"epoch": 2.925627240143369,
"grad_norm": 3.265625,
"learning_rate": 4.456666417523314e-05,
"loss": 0.0048,
"num_input_tokens_seen": 1230208,
"step": 3265
},
{
"epoch": 2.9301075268817205,
"grad_norm": 0.640625,
"learning_rate": 4.4542304127073644e-05,
"loss": 0.1475,
"num_input_tokens_seen": 1232160,
"step": 3270
},
{
"epoch": 2.934587813620072,
"grad_norm": 46.25,
"learning_rate": 4.451789628176155e-05,
"loss": 0.5688,
"num_input_tokens_seen": 1234112,
"step": 3275
},
{
"epoch": 2.9390681003584227,
"grad_norm": 0.1796875,
"learning_rate": 4.449344069899433e-05,
"loss": 0.1465,
"num_input_tokens_seen": 1236064,
"step": 3280
},
{
"epoch": 2.943548387096774,
"grad_norm": 0.63671875,
"learning_rate": 4.446893743858615e-05,
"loss": 0.408,
"num_input_tokens_seen": 1237856,
"step": 3285
},
{
"epoch": 2.9480286738351253,
"grad_norm": 0.6484375,
"learning_rate": 4.4444386560467836e-05,
"loss": 0.1433,
"num_input_tokens_seen": 1239968,
"step": 3290
},
{
"epoch": 2.9525089605734767,
"grad_norm": 8.25,
"learning_rate": 4.441978812468666e-05,
"loss": 0.1547,
"num_input_tokens_seen": 1241760,
"step": 3295
},
{
"epoch": 2.956989247311828,
"grad_norm": 27.125,
"learning_rate": 4.439514219140621e-05,
"loss": 0.4699,
"num_input_tokens_seen": 1243840,
"step": 3300
},
{
"epoch": 2.9614695340501793,
"grad_norm": 48.5,
"learning_rate": 4.4370448820906246e-05,
"loss": 0.3878,
"num_input_tokens_seen": 1245664,
"step": 3305
},
{
"epoch": 2.9659498207885306,
"grad_norm": 27.0,
"learning_rate": 4.434570807358255e-05,
"loss": 0.3861,
"num_input_tokens_seen": 1247488,
"step": 3310
},
{
"epoch": 2.970430107526882,
"grad_norm": 23.5,
"learning_rate": 4.4320920009946795e-05,
"loss": 0.1782,
"num_input_tokens_seen": 1249280,
"step": 3315
},
{
"epoch": 2.974910394265233,
"grad_norm": 1.6640625,
"learning_rate": 4.4296084690626356e-05,
"loss": 0.2657,
"num_input_tokens_seen": 1251136,
"step": 3320
},
{
"epoch": 2.979390681003584,
"grad_norm": 30.25,
"learning_rate": 4.427120217636421e-05,
"loss": 0.506,
"num_input_tokens_seen": 1253024,
"step": 3325
},
{
"epoch": 2.9838709677419355,
"grad_norm": 3.90625,
"learning_rate": 4.424627252801874e-05,
"loss": 0.0064,
"num_input_tokens_seen": 1254848,
"step": 3330
},
{
"epoch": 2.988351254480287,
"grad_norm": 27.25,
"learning_rate": 4.422129580656365e-05,
"loss": 0.2992,
"num_input_tokens_seen": 1256704,
"step": 3335
},
{
"epoch": 2.992831541218638,
"grad_norm": 28.375,
"learning_rate": 4.419627207308773e-05,
"loss": 0.1532,
"num_input_tokens_seen": 1258624,
"step": 3340
},
{
"epoch": 2.997311827956989,
"grad_norm": 68.0,
"learning_rate": 4.4171201388794795e-05,
"loss": 0.3557,
"num_input_tokens_seen": 1260480,
"step": 3345
},
{
"epoch": 3.0,
"eval_loss": 0.24597814679145813,
"eval_runtime": 9.6734,
"eval_samples_per_second": 51.275,
"eval_steps_per_second": 12.819,
"num_input_tokens_seen": 1261304,
"step": 3348
},
{
"epoch": 3.0017921146953404,
"grad_norm": 41.75,
"learning_rate": 4.414608381500347e-05,
"loss": 0.2989,
"num_input_tokens_seen": 1262008,
"step": 3350
},
{
"epoch": 3.0062724014336917,
"grad_norm": 44.75,
"learning_rate": 4.4120919413147054e-05,
"loss": 0.2034,
"num_input_tokens_seen": 1263800,
"step": 3355
},
{
"epoch": 3.010752688172043,
"grad_norm": 0.036865234375,
"learning_rate": 4.409570824477341e-05,
"loss": 0.4329,
"num_input_tokens_seen": 1265592,
"step": 3360
},
{
"epoch": 3.0152329749103943,
"grad_norm": 3.90625,
"learning_rate": 4.407045037154478e-05,
"loss": 0.037,
"num_input_tokens_seen": 1267512,
"step": 3365
},
{
"epoch": 3.0197132616487457,
"grad_norm": 69.0,
"learning_rate": 4.40451458552376e-05,
"loss": 0.1203,
"num_input_tokens_seen": 1269400,
"step": 3370
},
{
"epoch": 3.024193548387097,
"grad_norm": 0.06884765625,
"learning_rate": 4.4019794757742426e-05,
"loss": 0.2207,
"num_input_tokens_seen": 1271192,
"step": 3375
},
{
"epoch": 3.028673835125448,
"grad_norm": 0.251953125,
"learning_rate": 4.3994397141063734e-05,
"loss": 0.1431,
"num_input_tokens_seen": 1273080,
"step": 3380
},
{
"epoch": 3.033154121863799,
"grad_norm": 20.125,
"learning_rate": 4.3968953067319777e-05,
"loss": 0.0794,
"num_input_tokens_seen": 1275000,
"step": 3385
},
{
"epoch": 3.0376344086021505,
"grad_norm": 82.0,
"learning_rate": 4.394346259874242e-05,
"loss": 0.424,
"num_input_tokens_seen": 1276856,
"step": 3390
},
{
"epoch": 3.042114695340502,
"grad_norm": 3.8125,
"learning_rate": 4.3917925797677025e-05,
"loss": 0.3653,
"num_input_tokens_seen": 1278648,
"step": 3395
},
{
"epoch": 3.046594982078853,
"grad_norm": 7.1875,
"learning_rate": 4.389234272658227e-05,
"loss": 0.1618,
"num_input_tokens_seen": 1280504,
"step": 3400
},
{
"epoch": 3.0510752688172045,
"grad_norm": 4.96875,
"learning_rate": 4.386671344802998e-05,
"loss": 0.4422,
"num_input_tokens_seen": 1282488,
"step": 3405
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.458984375,
"learning_rate": 4.384103802470502e-05,
"loss": 0.0044,
"num_input_tokens_seen": 1284312,
"step": 3410
},
{
"epoch": 3.0600358422939067,
"grad_norm": 62.25,
"learning_rate": 4.381531651940511e-05,
"loss": 0.475,
"num_input_tokens_seen": 1286200,
"step": 3415
},
{
"epoch": 3.064516129032258,
"grad_norm": 7.4375,
"learning_rate": 4.378954899504068e-05,
"loss": 0.0366,
"num_input_tokens_seen": 1288088,
"step": 3420
},
{
"epoch": 3.0689964157706093,
"grad_norm": 36.75,
"learning_rate": 4.3763735514634706e-05,
"loss": 0.3139,
"num_input_tokens_seen": 1290232,
"step": 3425
},
{
"epoch": 3.0734767025089607,
"grad_norm": 35.0,
"learning_rate": 4.3737876141322576e-05,
"loss": 0.3636,
"num_input_tokens_seen": 1292184,
"step": 3430
},
{
"epoch": 3.077956989247312,
"grad_norm": 2.671875,
"learning_rate": 4.371197093835192e-05,
"loss": 0.0568,
"num_input_tokens_seen": 1294168,
"step": 3435
},
{
"epoch": 3.0824372759856633,
"grad_norm": 11.875,
"learning_rate": 4.368601996908246e-05,
"loss": 0.0303,
"num_input_tokens_seen": 1296088,
"step": 3440
},
{
"epoch": 3.086917562724014,
"grad_norm": 54.0,
"learning_rate": 4.366002329698585e-05,
"loss": 0.2945,
"num_input_tokens_seen": 1297816,
"step": 3445
},
{
"epoch": 3.0913978494623655,
"grad_norm": 75.5,
"learning_rate": 4.3633980985645526e-05,
"loss": 0.4732,
"num_input_tokens_seen": 1299704,
"step": 3450
},
{
"epoch": 3.095878136200717,
"grad_norm": 0.09521484375,
"learning_rate": 4.360789309875656e-05,
"loss": 0.0064,
"num_input_tokens_seen": 1301656,
"step": 3455
},
{
"epoch": 3.100358422939068,
"grad_norm": 55.5,
"learning_rate": 4.358175970012549e-05,
"loss": 0.2436,
"num_input_tokens_seen": 1303608,
"step": 3460
},
{
"epoch": 3.1048387096774195,
"grad_norm": 0.79296875,
"learning_rate": 4.3555580853670154e-05,
"loss": 0.0034,
"num_input_tokens_seen": 1305432,
"step": 3465
},
{
"epoch": 3.109318996415771,
"grad_norm": 56.75,
"learning_rate": 4.352935662341956e-05,
"loss": 0.2448,
"num_input_tokens_seen": 1307288,
"step": 3470
},
{
"epoch": 3.1137992831541217,
"grad_norm": 9.5,
"learning_rate": 4.350308707351372e-05,
"loss": 0.1786,
"num_input_tokens_seen": 1309272,
"step": 3475
},
{
"epoch": 3.118279569892473,
"grad_norm": 58.25,
"learning_rate": 4.347677226820349e-05,
"loss": 0.6146,
"num_input_tokens_seen": 1311128,
"step": 3480
},
{
"epoch": 3.1227598566308243,
"grad_norm": 28.25,
"learning_rate": 4.3450412271850406e-05,
"loss": 0.4737,
"num_input_tokens_seen": 1312856,
"step": 3485
},
{
"epoch": 3.1272401433691757,
"grad_norm": 73.5,
"learning_rate": 4.342400714892653e-05,
"loss": 0.0644,
"num_input_tokens_seen": 1315000,
"step": 3490
},
{
"epoch": 3.131720430107527,
"grad_norm": 2.203125,
"learning_rate": 4.339755696401431e-05,
"loss": 0.1211,
"num_input_tokens_seen": 1316792,
"step": 3495
},
{
"epoch": 3.1362007168458783,
"grad_norm": 0.65234375,
"learning_rate": 4.337106178180639e-05,
"loss": 0.0078,
"num_input_tokens_seen": 1318616,
"step": 3500
},
{
"epoch": 3.140681003584229,
"grad_norm": 137.0,
"learning_rate": 4.3344521667105486e-05,
"loss": 0.2265,
"num_input_tokens_seen": 1320504,
"step": 3505
},
{
"epoch": 3.1451612903225805,
"grad_norm": 38.0,
"learning_rate": 4.331793668482421e-05,
"loss": 0.1701,
"num_input_tokens_seen": 1322488,
"step": 3510
},
{
"epoch": 3.149641577060932,
"grad_norm": 1.46875,
"learning_rate": 4.329130689998491e-05,
"loss": 0.3285,
"num_input_tokens_seen": 1324440,
"step": 3515
},
{
"epoch": 3.154121863799283,
"grad_norm": 0.06787109375,
"learning_rate": 4.3264632377719496e-05,
"loss": 0.2985,
"num_input_tokens_seen": 1326488,
"step": 3520
},
{
"epoch": 3.1586021505376345,
"grad_norm": 0.78515625,
"learning_rate": 4.323791318326932e-05,
"loss": 0.3602,
"num_input_tokens_seen": 1328536,
"step": 3525
},
{
"epoch": 3.163082437275986,
"grad_norm": 72.0,
"learning_rate": 4.3211149381984996e-05,
"loss": 0.3522,
"num_input_tokens_seen": 1330328,
"step": 3530
},
{
"epoch": 3.1675627240143367,
"grad_norm": 6.5,
"learning_rate": 4.318434103932622e-05,
"loss": 0.0185,
"num_input_tokens_seen": 1332280,
"step": 3535
},
{
"epoch": 3.172043010752688,
"grad_norm": 0.279296875,
"learning_rate": 4.315748822086164e-05,
"loss": 0.3073,
"num_input_tokens_seen": 1334360,
"step": 3540
},
{
"epoch": 3.1765232974910393,
"grad_norm": 0.232421875,
"learning_rate": 4.3130590992268695e-05,
"loss": 0.1884,
"num_input_tokens_seen": 1336472,
"step": 3545
},
{
"epoch": 3.1810035842293907,
"grad_norm": 0.1484375,
"learning_rate": 4.3103649419333424e-05,
"loss": 0.0361,
"num_input_tokens_seen": 1338296,
"step": 3550
},
{
"epoch": 3.185483870967742,
"grad_norm": 77.0,
"learning_rate": 4.307666356795033e-05,
"loss": 0.2318,
"num_input_tokens_seen": 1340216,
"step": 3555
},
{
"epoch": 3.1899641577060933,
"grad_norm": 0.043212890625,
"learning_rate": 4.3049633504122215e-05,
"loss": 0.2766,
"num_input_tokens_seen": 1341912,
"step": 3560
},
{
"epoch": 3.1944444444444446,
"grad_norm": 2.484375,
"learning_rate": 4.302255929396003e-05,
"loss": 0.0706,
"num_input_tokens_seen": 1343672,
"step": 3565
},
{
"epoch": 3.1989247311827955,
"grad_norm": 37.5,
"learning_rate": 4.299544100368268e-05,
"loss": 0.2026,
"num_input_tokens_seen": 1345528,
"step": 3570
},
{
"epoch": 3.203405017921147,
"grad_norm": 0.09912109375,
"learning_rate": 4.2968278699616885e-05,
"loss": 0.3174,
"num_input_tokens_seen": 1347640,
"step": 3575
},
{
"epoch": 3.207885304659498,
"grad_norm": 68.0,
"learning_rate": 4.294107244819704e-05,
"loss": 0.1331,
"num_input_tokens_seen": 1349432,
"step": 3580
},
{
"epoch": 3.2123655913978495,
"grad_norm": 0.828125,
"learning_rate": 4.291382231596499e-05,
"loss": 0.0614,
"num_input_tokens_seen": 1351160,
"step": 3585
},
{
"epoch": 3.216845878136201,
"grad_norm": 22.25,
"learning_rate": 4.2886528369569935e-05,
"loss": 0.1384,
"num_input_tokens_seen": 1353016,
"step": 3590
},
{
"epoch": 3.221326164874552,
"grad_norm": 101.0,
"learning_rate": 4.285919067576822e-05,
"loss": 0.1975,
"num_input_tokens_seen": 1354904,
"step": 3595
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.451171875,
"learning_rate": 4.283180930142322e-05,
"loss": 0.1572,
"num_input_tokens_seen": 1356792,
"step": 3600
},
{
"epoch": 3.2302867383512543,
"grad_norm": 39.0,
"learning_rate": 4.280438431350508e-05,
"loss": 0.1177,
"num_input_tokens_seen": 1358680,
"step": 3605
},
{
"epoch": 3.2347670250896057,
"grad_norm": 46.5,
"learning_rate": 4.2776915779090674e-05,
"loss": 0.1206,
"num_input_tokens_seen": 1360536,
"step": 3610
},
{
"epoch": 3.239247311827957,
"grad_norm": 0.0556640625,
"learning_rate": 4.274940376536338e-05,
"loss": 0.0378,
"num_input_tokens_seen": 1362424,
"step": 3615
},
{
"epoch": 3.2437275985663083,
"grad_norm": 85.0,
"learning_rate": 4.272184833961289e-05,
"loss": 0.2942,
"num_input_tokens_seen": 1364408,
"step": 3620
},
{
"epoch": 3.2482078853046596,
"grad_norm": 0.1591796875,
"learning_rate": 4.269424956923509e-05,
"loss": 0.1833,
"num_input_tokens_seen": 1366200,
"step": 3625
},
{
"epoch": 3.252688172043011,
"grad_norm": 0.7265625,
"learning_rate": 4.2666607521731883e-05,
"loss": 0.1425,
"num_input_tokens_seen": 1368024,
"step": 3630
},
{
"epoch": 3.257168458781362,
"grad_norm": 24.5,
"learning_rate": 4.2638922264711026e-05,
"loss": 0.0869,
"num_input_tokens_seen": 1369784,
"step": 3635
},
{
"epoch": 3.261648745519713,
"grad_norm": 0.0517578125,
"learning_rate": 4.2611193865885926e-05,
"loss": 0.1546,
"num_input_tokens_seen": 1371608,
"step": 3640
},
{
"epoch": 3.2661290322580645,
"grad_norm": 0.07080078125,
"learning_rate": 4.258342239307554e-05,
"loss": 0.2975,
"num_input_tokens_seen": 1373400,
"step": 3645
},
{
"epoch": 3.270609318996416,
"grad_norm": 41.25,
"learning_rate": 4.255560791420417e-05,
"loss": 0.0842,
"num_input_tokens_seen": 1375256,
"step": 3650
},
{
"epoch": 3.275089605734767,
"grad_norm": 1.3125,
"learning_rate": 4.2527750497301323e-05,
"loss": 0.0152,
"num_input_tokens_seen": 1377336,
"step": 3655
},
{
"epoch": 3.279569892473118,
"grad_norm": 0.01373291015625,
"learning_rate": 4.249985021050147e-05,
"loss": 0.2119,
"num_input_tokens_seen": 1379064,
"step": 3660
},
{
"epoch": 3.2840501792114694,
"grad_norm": 2.53125,
"learning_rate": 4.247190712204398e-05,
"loss": 0.1704,
"num_input_tokens_seen": 1380920,
"step": 3665
},
{
"epoch": 3.2885304659498207,
"grad_norm": 0.54296875,
"learning_rate": 4.2443921300272895e-05,
"loss": 0.2307,
"num_input_tokens_seen": 1382872,
"step": 3670
},
{
"epoch": 3.293010752688172,
"grad_norm": 1.171875,
"learning_rate": 4.241589281363678e-05,
"loss": 0.2672,
"num_input_tokens_seen": 1384888,
"step": 3675
},
{
"epoch": 3.2974910394265233,
"grad_norm": 19.375,
"learning_rate": 4.2387821730688545e-05,
"loss": 0.453,
"num_input_tokens_seen": 1386776,
"step": 3680
},
{
"epoch": 3.3019713261648747,
"grad_norm": 113.5,
"learning_rate": 4.2359708120085286e-05,
"loss": 0.4951,
"num_input_tokens_seen": 1388600,
"step": 3685
},
{
"epoch": 3.306451612903226,
"grad_norm": 0.01483154296875,
"learning_rate": 4.233155205058811e-05,
"loss": 0.2872,
"num_input_tokens_seen": 1390488,
"step": 3690
},
{
"epoch": 3.3109318996415773,
"grad_norm": 151.0,
"learning_rate": 4.230335359106198e-05,
"loss": 0.7319,
"num_input_tokens_seen": 1392344,
"step": 3695
},
{
"epoch": 3.315412186379928,
"grad_norm": 78.0,
"learning_rate": 4.227511281047552e-05,
"loss": 0.0535,
"num_input_tokens_seen": 1394296,
"step": 3700
},
{
"epoch": 3.3198924731182795,
"grad_norm": 43.0,
"learning_rate": 4.22468297779009e-05,
"loss": 0.5711,
"num_input_tokens_seen": 1396216,
"step": 3705
},
{
"epoch": 3.324372759856631,
"grad_norm": 0.043212890625,
"learning_rate": 4.2218504562513584e-05,
"loss": 0.1102,
"num_input_tokens_seen": 1398136,
"step": 3710
},
{
"epoch": 3.328853046594982,
"grad_norm": 6.03125,
"learning_rate": 4.219013723359224e-05,
"loss": 0.1749,
"num_input_tokens_seen": 1400088,
"step": 3715
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.1416015625,
"learning_rate": 4.216172786051854e-05,
"loss": 0.1353,
"num_input_tokens_seen": 1402040,
"step": 3720
},
{
"epoch": 3.3378136200716844,
"grad_norm": 0.0286865234375,
"learning_rate": 4.213327651277697e-05,
"loss": 0.3169,
"num_input_tokens_seen": 1403960,
"step": 3725
},
{
"epoch": 3.3422939068100357,
"grad_norm": 49.0,
"learning_rate": 4.2104783259954687e-05,
"loss": 0.2742,
"num_input_tokens_seen": 1405848,
"step": 3730
},
{
"epoch": 3.346774193548387,
"grad_norm": 43.75,
"learning_rate": 4.207624817174135e-05,
"loss": 0.1502,
"num_input_tokens_seen": 1407736,
"step": 3735
},
{
"epoch": 3.3512544802867383,
"grad_norm": 130.0,
"learning_rate": 4.204767131792892e-05,
"loss": 0.4354,
"num_input_tokens_seen": 1409624,
"step": 3740
},
{
"epoch": 3.3557347670250897,
"grad_norm": 36.25,
"learning_rate": 4.201905276841153e-05,
"loss": 0.1456,
"num_input_tokens_seen": 1411480,
"step": 3745
},
{
"epoch": 3.360215053763441,
"grad_norm": 0.2890625,
"learning_rate": 4.199039259318529e-05,
"loss": 0.4737,
"num_input_tokens_seen": 1413400,
"step": 3750
},
{
"epoch": 3.3646953405017923,
"grad_norm": 42.75,
"learning_rate": 4.196169086234811e-05,
"loss": 0.2921,
"num_input_tokens_seen": 1415224,
"step": 3755
},
{
"epoch": 3.369175627240143,
"grad_norm": 4.6875,
"learning_rate": 4.193294764609954e-05,
"loss": 0.3297,
"num_input_tokens_seen": 1416952,
"step": 3760
},
{
"epoch": 3.3736559139784945,
"grad_norm": 0.1435546875,
"learning_rate": 4.190416301474059e-05,
"loss": 0.3041,
"num_input_tokens_seen": 1418840,
"step": 3765
},
{
"epoch": 3.378136200716846,
"grad_norm": 127.0,
"learning_rate": 4.18753370386736e-05,
"loss": 0.4041,
"num_input_tokens_seen": 1420536,
"step": 3770
},
{
"epoch": 3.382616487455197,
"grad_norm": 41.5,
"learning_rate": 4.184646978840198e-05,
"loss": 0.1347,
"num_input_tokens_seen": 1422456,
"step": 3775
},
{
"epoch": 3.3870967741935485,
"grad_norm": 21.75,
"learning_rate": 4.181756133453013e-05,
"loss": 0.3585,
"num_input_tokens_seen": 1424312,
"step": 3780
},
{
"epoch": 3.3915770609319,
"grad_norm": 47.75,
"learning_rate": 4.17886117477632e-05,
"loss": 0.2509,
"num_input_tokens_seen": 1426360,
"step": 3785
},
{
"epoch": 3.3960573476702507,
"grad_norm": 55.0,
"learning_rate": 4.175962109890696e-05,
"loss": 0.1269,
"num_input_tokens_seen": 1428152,
"step": 3790
},
{
"epoch": 3.400537634408602,
"grad_norm": 46.5,
"learning_rate": 4.173058945886762e-05,
"loss": 0.5953,
"num_input_tokens_seen": 1430360,
"step": 3795
},
{
"epoch": 3.4050179211469533,
"grad_norm": 68.0,
"learning_rate": 4.1701516898651614e-05,
"loss": 0.1791,
"num_input_tokens_seen": 1432184,
"step": 3800
},
{
"epoch": 3.4094982078853047,
"grad_norm": 1.96875,
"learning_rate": 4.1672403489365505e-05,
"loss": 0.3912,
"num_input_tokens_seen": 1434008,
"step": 3805
},
{
"epoch": 3.413978494623656,
"grad_norm": 99.5,
"learning_rate": 4.164324930221571e-05,
"loss": 0.0815,
"num_input_tokens_seen": 1435928,
"step": 3810
},
{
"epoch": 3.4184587813620073,
"grad_norm": 10.625,
"learning_rate": 4.161405440850844e-05,
"loss": 0.0856,
"num_input_tokens_seen": 1437784,
"step": 3815
},
{
"epoch": 3.4229390681003586,
"grad_norm": 1.6875,
"learning_rate": 4.1584818879649426e-05,
"loss": 0.074,
"num_input_tokens_seen": 1439640,
"step": 3820
},
{
"epoch": 3.4274193548387095,
"grad_norm": 16.125,
"learning_rate": 4.1555542787143795e-05,
"loss": 0.3335,
"num_input_tokens_seen": 1441496,
"step": 3825
},
{
"epoch": 3.431899641577061,
"grad_norm": 69.5,
"learning_rate": 4.1526226202595915e-05,
"loss": 0.0748,
"num_input_tokens_seen": 1443512,
"step": 3830
},
{
"epoch": 3.436379928315412,
"grad_norm": 40.75,
"learning_rate": 4.1496869197709146e-05,
"loss": 0.0182,
"num_input_tokens_seen": 1445432,
"step": 3835
},
{
"epoch": 3.4408602150537635,
"grad_norm": 15.3125,
"learning_rate": 4.1467471844285724e-05,
"loss": 0.3318,
"num_input_tokens_seen": 1447384,
"step": 3840
},
{
"epoch": 3.445340501792115,
"grad_norm": 58.0,
"learning_rate": 4.14380342142266e-05,
"loss": 0.2609,
"num_input_tokens_seen": 1449208,
"step": 3845
},
{
"epoch": 3.449820788530466,
"grad_norm": 116.5,
"learning_rate": 4.1408556379531186e-05,
"loss": 0.0643,
"num_input_tokens_seen": 1451064,
"step": 3850
},
{
"epoch": 3.454301075268817,
"grad_norm": 99.5,
"learning_rate": 4.137903841229727e-05,
"loss": 0.3941,
"num_input_tokens_seen": 1452856,
"step": 3855
},
{
"epoch": 3.4587813620071683,
"grad_norm": 0.0118408203125,
"learning_rate": 4.1349480384720765e-05,
"loss": 0.1693,
"num_input_tokens_seen": 1454712,
"step": 3860
},
{
"epoch": 3.4632616487455197,
"grad_norm": 6.15625,
"learning_rate": 4.13198823690956e-05,
"loss": 0.0449,
"num_input_tokens_seen": 1456600,
"step": 3865
},
{
"epoch": 3.467741935483871,
"grad_norm": 40.5,
"learning_rate": 4.1290244437813475e-05,
"loss": 0.1901,
"num_input_tokens_seen": 1458552,
"step": 3870
},
{
"epoch": 3.4722222222222223,
"grad_norm": 24.875,
"learning_rate": 4.126056666336373e-05,
"loss": 0.4483,
"num_input_tokens_seen": 1460408,
"step": 3875
},
{
"epoch": 3.4767025089605736,
"grad_norm": 39.75,
"learning_rate": 4.123084911833315e-05,
"loss": 0.2603,
"num_input_tokens_seen": 1462392,
"step": 3880
},
{
"epoch": 3.481182795698925,
"grad_norm": 30.375,
"learning_rate": 4.120109187540581e-05,
"loss": 0.1122,
"num_input_tokens_seen": 1464184,
"step": 3885
},
{
"epoch": 3.485663082437276,
"grad_norm": 42.5,
"learning_rate": 4.117129500736286e-05,
"loss": 0.0401,
"num_input_tokens_seen": 1466040,
"step": 3890
},
{
"epoch": 3.490143369175627,
"grad_norm": 61.25,
"learning_rate": 4.114145858708236e-05,
"loss": 0.3267,
"num_input_tokens_seen": 1467960,
"step": 3895
},
{
"epoch": 3.4946236559139785,
"grad_norm": 59.75,
"learning_rate": 4.111158268753914e-05,
"loss": 0.174,
"num_input_tokens_seen": 1469944,
"step": 3900
},
{
"epoch": 3.49910394265233,
"grad_norm": 0.1533203125,
"learning_rate": 4.108166738180455e-05,
"loss": 0.1723,
"num_input_tokens_seen": 1471736,
"step": 3905
},
{
"epoch": 3.5,
"eval_loss": 0.26632943749427795,
"eval_runtime": 9.6716,
"eval_samples_per_second": 51.284,
"eval_steps_per_second": 12.821,
"num_input_tokens_seen": 1472152,
"step": 3906
},
{
"epoch": 3.503584229390681,
"grad_norm": 56.75,
"learning_rate": 4.105171274304637e-05,
"loss": 0.3704,
"num_input_tokens_seen": 1473624,
"step": 3910
},
{
"epoch": 3.508064516129032,
"grad_norm": 0.10107421875,
"learning_rate": 4.102171884452852e-05,
"loss": 0.1405,
"num_input_tokens_seen": 1475480,
"step": 3915
},
{
"epoch": 3.5125448028673834,
"grad_norm": 0.7265625,
"learning_rate": 4.099168575961099e-05,
"loss": 0.0035,
"num_input_tokens_seen": 1477336,
"step": 3920
},
{
"epoch": 3.5170250896057347,
"grad_norm": 0.39453125,
"learning_rate": 4.096161356174959e-05,
"loss": 0.338,
"num_input_tokens_seen": 1479256,
"step": 3925
},
{
"epoch": 3.521505376344086,
"grad_norm": 98.0,
"learning_rate": 4.093150232449581e-05,
"loss": 0.1806,
"num_input_tokens_seen": 1481080,
"step": 3930
},
{
"epoch": 3.5259856630824373,
"grad_norm": 4.0,
"learning_rate": 4.0901352121496613e-05,
"loss": 0.5536,
"num_input_tokens_seen": 1483128,
"step": 3935
},
{
"epoch": 3.5304659498207887,
"grad_norm": 45.25,
"learning_rate": 4.087116302649428e-05,
"loss": 0.0726,
"num_input_tokens_seen": 1484984,
"step": 3940
},
{
"epoch": 3.53494623655914,
"grad_norm": 38.25,
"learning_rate": 4.0840935113326184e-05,
"loss": 0.369,
"num_input_tokens_seen": 1486744,
"step": 3945
},
{
"epoch": 3.5394265232974913,
"grad_norm": 0.01507568359375,
"learning_rate": 4.081066845592467e-05,
"loss": 0.0967,
"num_input_tokens_seen": 1488632,
"step": 3950
},
{
"epoch": 3.543906810035842,
"grad_norm": 0.53515625,
"learning_rate": 4.0780363128316844e-05,
"loss": 0.126,
"num_input_tokens_seen": 1490584,
"step": 3955
},
{
"epoch": 3.5483870967741935,
"grad_norm": 31.25,
"learning_rate": 4.0750019204624356e-05,
"loss": 0.4181,
"num_input_tokens_seen": 1492472,
"step": 3960
},
{
"epoch": 3.552867383512545,
"grad_norm": 86.0,
"learning_rate": 4.071963675906331e-05,
"loss": 0.2291,
"num_input_tokens_seen": 1494488,
"step": 3965
},
{
"epoch": 3.557347670250896,
"grad_norm": 48.5,
"learning_rate": 4.0689215865944e-05,
"loss": 0.2454,
"num_input_tokens_seen": 1496504,
"step": 3970
},
{
"epoch": 3.561827956989247,
"grad_norm": 13.25,
"learning_rate": 4.0658756599670735e-05,
"loss": 0.1955,
"num_input_tokens_seen": 1498392,
"step": 3975
},
{
"epoch": 3.5663082437275984,
"grad_norm": 1.96875,
"learning_rate": 4.062825903474172e-05,
"loss": 0.2453,
"num_input_tokens_seen": 1500376,
"step": 3980
},
{
"epoch": 3.5707885304659497,
"grad_norm": 1.046875,
"learning_rate": 4.059772324574881e-05,
"loss": 0.1609,
"num_input_tokens_seen": 1502200,
"step": 3985
},
{
"epoch": 3.575268817204301,
"grad_norm": 80.0,
"learning_rate": 4.056714930737735e-05,
"loss": 0.0665,
"num_input_tokens_seen": 1503928,
"step": 3990
},
{
"epoch": 3.5797491039426523,
"grad_norm": 74.5,
"learning_rate": 4.053653729440599e-05,
"loss": 0.4792,
"num_input_tokens_seen": 1505816,
"step": 3995
},
{
"epoch": 3.5842293906810037,
"grad_norm": 0.09375,
"learning_rate": 4.05058872817065e-05,
"loss": 0.4599,
"num_input_tokens_seen": 1507608,
"step": 4000
},
{
"epoch": 3.588709677419355,
"grad_norm": 24.5,
"learning_rate": 4.047519934424362e-05,
"loss": 0.1509,
"num_input_tokens_seen": 1509560,
"step": 4005
},
{
"epoch": 3.5931899641577063,
"grad_norm": 27.75,
"learning_rate": 4.044447355707483e-05,
"loss": 0.2225,
"num_input_tokens_seen": 1511672,
"step": 4010
},
{
"epoch": 3.597670250896057,
"grad_norm": 0.59765625,
"learning_rate": 4.0413709995350145e-05,
"loss": 0.0008,
"num_input_tokens_seen": 1513560,
"step": 4015
},
{
"epoch": 3.6021505376344085,
"grad_norm": 81.0,
"learning_rate": 4.038290873431203e-05,
"loss": 0.1673,
"num_input_tokens_seen": 1515544,
"step": 4020
},
{
"epoch": 3.60663082437276,
"grad_norm": 0.0269775390625,
"learning_rate": 4.035206984929513e-05,
"loss": 0.0736,
"num_input_tokens_seen": 1517496,
"step": 4025
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.07666015625,
"learning_rate": 4.032119341572612e-05,
"loss": 0.0997,
"num_input_tokens_seen": 1519448,
"step": 4030
},
{
"epoch": 3.6155913978494625,
"grad_norm": 0.05029296875,
"learning_rate": 4.0290279509123483e-05,
"loss": 0.2004,
"num_input_tokens_seen": 1521272,
"step": 4035
},
{
"epoch": 3.6200716845878134,
"grad_norm": 71.0,
"learning_rate": 4.02593282050974e-05,
"loss": 0.6456,
"num_input_tokens_seen": 1523128,
"step": 4040
},
{
"epoch": 3.6245519713261647,
"grad_norm": 26.875,
"learning_rate": 4.022833957934949e-05,
"loss": 0.2973,
"num_input_tokens_seen": 1525144,
"step": 4045
},
{
"epoch": 3.629032258064516,
"grad_norm": 2.421875,
"learning_rate": 4.019731370767267e-05,
"loss": 0.1385,
"num_input_tokens_seen": 1526968,
"step": 4050
},
{
"epoch": 3.6335125448028673,
"grad_norm": 1.1328125,
"learning_rate": 4.016625066595092e-05,
"loss": 0.4064,
"num_input_tokens_seen": 1528760,
"step": 4055
},
{
"epoch": 3.6379928315412187,
"grad_norm": 59.5,
"learning_rate": 4.013515053015918e-05,
"loss": 0.4711,
"num_input_tokens_seen": 1530584,
"step": 4060
},
{
"epoch": 3.64247311827957,
"grad_norm": 0.04931640625,
"learning_rate": 4.010401337636309e-05,
"loss": 0.6193,
"num_input_tokens_seen": 1532312,
"step": 4065
},
{
"epoch": 3.6469534050179213,
"grad_norm": 1.515625,
"learning_rate": 4.007283928071882e-05,
"loss": 0.0759,
"num_input_tokens_seen": 1534008,
"step": 4070
},
{
"epoch": 3.6514336917562726,
"grad_norm": 86.0,
"learning_rate": 4.0041628319472926e-05,
"loss": 0.8775,
"num_input_tokens_seen": 1535896,
"step": 4075
},
{
"epoch": 3.6559139784946235,
"grad_norm": 0.515625,
"learning_rate": 4.001038056896211e-05,
"loss": 0.3808,
"num_input_tokens_seen": 1537752,
"step": 4080
},
{
"epoch": 3.660394265232975,
"grad_norm": 29.5,
"learning_rate": 3.9979096105613035e-05,
"loss": 0.2546,
"num_input_tokens_seen": 1539640,
"step": 4085
},
{
"epoch": 3.664874551971326,
"grad_norm": 1.4375,
"learning_rate": 3.99477750059422e-05,
"loss": 0.0093,
"num_input_tokens_seen": 1541528,
"step": 4090
},
{
"epoch": 3.6693548387096775,
"grad_norm": 31.75,
"learning_rate": 3.991641734655568e-05,
"loss": 0.0852,
"num_input_tokens_seen": 1543448,
"step": 4095
},
{
"epoch": 3.673835125448029,
"grad_norm": 0.4140625,
"learning_rate": 3.988502320414897e-05,
"loss": 0.3155,
"num_input_tokens_seen": 1545240,
"step": 4100
},
{
"epoch": 3.6783154121863797,
"grad_norm": 0.033935546875,
"learning_rate": 3.985359265550682e-05,
"loss": 0.2394,
"num_input_tokens_seen": 1547096,
"step": 4105
},
{
"epoch": 3.682795698924731,
"grad_norm": 65.0,
"learning_rate": 3.9822125777502995e-05,
"loss": 0.0386,
"num_input_tokens_seen": 1549016,
"step": 4110
},
{
"epoch": 3.6872759856630823,
"grad_norm": 9.3125,
"learning_rate": 3.979062264710012e-05,
"loss": 0.0393,
"num_input_tokens_seen": 1550968,
"step": 4115
},
{
"epoch": 3.6917562724014337,
"grad_norm": 48.0,
"learning_rate": 3.975908334134952e-05,
"loss": 0.4508,
"num_input_tokens_seen": 1552760,
"step": 4120
},
{
"epoch": 3.696236559139785,
"grad_norm": 76.5,
"learning_rate": 3.9727507937390954e-05,
"loss": 0.3538,
"num_input_tokens_seen": 1554680,
"step": 4125
},
{
"epoch": 3.7007168458781363,
"grad_norm": 14.4375,
"learning_rate": 3.969589651245249e-05,
"loss": 0.1282,
"num_input_tokens_seen": 1556536,
"step": 4130
},
{
"epoch": 3.7051971326164876,
"grad_norm": 0.28515625,
"learning_rate": 3.9664249143850304e-05,
"loss": 0.0188,
"num_input_tokens_seen": 1558424,
"step": 4135
},
{
"epoch": 3.709677419354839,
"grad_norm": 0.1123046875,
"learning_rate": 3.9632565908988476e-05,
"loss": 0.1234,
"num_input_tokens_seen": 1560344,
"step": 4140
},
{
"epoch": 3.71415770609319,
"grad_norm": 2.234375,
"learning_rate": 3.960084688535881e-05,
"loss": 0.1734,
"num_input_tokens_seen": 1562264,
"step": 4145
},
{
"epoch": 3.718637992831541,
"grad_norm": 7.625,
"learning_rate": 3.956909215054066e-05,
"loss": 0.4418,
"num_input_tokens_seen": 1564120,
"step": 4150
},
{
"epoch": 3.7231182795698925,
"grad_norm": 46.25,
"learning_rate": 3.953730178220067e-05,
"loss": 0.7101,
"num_input_tokens_seen": 1566072,
"step": 4155
},
{
"epoch": 3.727598566308244,
"grad_norm": 45.25,
"learning_rate": 3.9505475858092705e-05,
"loss": 0.345,
"num_input_tokens_seen": 1567992,
"step": 4160
},
{
"epoch": 3.732078853046595,
"grad_norm": 90.0,
"learning_rate": 3.947361445605755e-05,
"loss": 0.5134,
"num_input_tokens_seen": 1569816,
"step": 4165
},
{
"epoch": 3.736559139784946,
"grad_norm": 0.40625,
"learning_rate": 3.944171765402279e-05,
"loss": 0.1613,
"num_input_tokens_seen": 1571672,
"step": 4170
},
{
"epoch": 3.7410394265232974,
"grad_norm": 0.84375,
"learning_rate": 3.9409785530002565e-05,
"loss": 0.2148,
"num_input_tokens_seen": 1573496,
"step": 4175
},
{
"epoch": 3.7455197132616487,
"grad_norm": 1.1484375,
"learning_rate": 3.937781816209742e-05,
"loss": 0.2359,
"num_input_tokens_seen": 1575416,
"step": 4180
},
{
"epoch": 3.75,
"grad_norm": 8.5625,
"learning_rate": 3.934581562849411e-05,
"loss": 0.0983,
"num_input_tokens_seen": 1577272,
"step": 4185
},
{
"epoch": 3.7544802867383513,
"grad_norm": 0.251953125,
"learning_rate": 3.931377800746538e-05,
"loss": 0.5382,
"num_input_tokens_seen": 1579064,
"step": 4190
},
{
"epoch": 3.7589605734767026,
"grad_norm": 42.25,
"learning_rate": 3.928170537736981e-05,
"loss": 0.5398,
"num_input_tokens_seen": 1580984,
"step": 4195
},
{
"epoch": 3.763440860215054,
"grad_norm": 0.0361328125,
"learning_rate": 3.924959781665159e-05,
"loss": 0.1501,
"num_input_tokens_seen": 1583096,
"step": 4200
},
{
"epoch": 3.767921146953405,
"grad_norm": 61.75,
"learning_rate": 3.921745540384038e-05,
"loss": 0.1818,
"num_input_tokens_seen": 1584824,
"step": 4205
},
{
"epoch": 3.772401433691756,
"grad_norm": 5.6875,
"learning_rate": 3.918527821755101e-05,
"loss": 0.4318,
"num_input_tokens_seen": 1586872,
"step": 4210
},
{
"epoch": 3.7768817204301075,
"grad_norm": 41.75,
"learning_rate": 3.915306633648345e-05,
"loss": 0.1079,
"num_input_tokens_seen": 1588696,
"step": 4215
},
{
"epoch": 3.781362007168459,
"grad_norm": 0.06103515625,
"learning_rate": 3.9120819839422456e-05,
"loss": 0.1854,
"num_input_tokens_seen": 1590712,
"step": 4220
},
{
"epoch": 3.78584229390681,
"grad_norm": 1.4765625,
"learning_rate": 3.908853880523748e-05,
"loss": 0.2814,
"num_input_tokens_seen": 1592472,
"step": 4225
},
{
"epoch": 3.790322580645161,
"grad_norm": 1.3203125,
"learning_rate": 3.905622331288246e-05,
"loss": 0.0522,
"num_input_tokens_seen": 1594168,
"step": 4230
},
{
"epoch": 3.7948028673835124,
"grad_norm": 5.125,
"learning_rate": 3.9023873441395574e-05,
"loss": 0.0743,
"num_input_tokens_seen": 1595992,
"step": 4235
},
{
"epoch": 3.7992831541218637,
"grad_norm": 3.328125,
"learning_rate": 3.899148926989912e-05,
"loss": 0.297,
"num_input_tokens_seen": 1597720,
"step": 4240
},
{
"epoch": 3.803763440860215,
"grad_norm": 59.25,
"learning_rate": 3.895907087759926e-05,
"loss": 0.5197,
"num_input_tokens_seen": 1599736,
"step": 4245
},
{
"epoch": 3.8082437275985663,
"grad_norm": 38.25,
"learning_rate": 3.8926618343785876e-05,
"loss": 0.0797,
"num_input_tokens_seen": 1601496,
"step": 4250
},
{
"epoch": 3.8127240143369177,
"grad_norm": 60.75,
"learning_rate": 3.8894131747832354e-05,
"loss": 0.2333,
"num_input_tokens_seen": 1603352,
"step": 4255
},
{
"epoch": 3.817204301075269,
"grad_norm": 0.2412109375,
"learning_rate": 3.886161116919537e-05,
"loss": 0.122,
"num_input_tokens_seen": 1605208,
"step": 4260
},
{
"epoch": 3.8216845878136203,
"grad_norm": 0.107421875,
"learning_rate": 3.8829056687414735e-05,
"loss": 0.0433,
"num_input_tokens_seen": 1607128,
"step": 4265
},
{
"epoch": 3.826164874551971,
"grad_norm": 1.4140625,
"learning_rate": 3.8796468382113184e-05,
"loss": 0.152,
"num_input_tokens_seen": 1609176,
"step": 4270
},
{
"epoch": 3.8306451612903225,
"grad_norm": 4.84375,
"learning_rate": 3.876384633299616e-05,
"loss": 0.289,
"num_input_tokens_seen": 1611096,
"step": 4275
},
{
"epoch": 3.835125448028674,
"grad_norm": 2.15625,
"learning_rate": 3.873119061985164e-05,
"loss": 0.0553,
"num_input_tokens_seen": 1613048,
"step": 4280
},
{
"epoch": 3.839605734767025,
"grad_norm": 49.25,
"learning_rate": 3.869850132254996e-05,
"loss": 0.0471,
"num_input_tokens_seen": 1615128,
"step": 4285
},
{
"epoch": 3.8440860215053765,
"grad_norm": 42.25,
"learning_rate": 3.866577852104358e-05,
"loss": 0.3536,
"num_input_tokens_seen": 1616952,
"step": 4290
},
{
"epoch": 3.8485663082437274,
"grad_norm": 0.09375,
"learning_rate": 3.86330222953669e-05,
"loss": 0.3689,
"num_input_tokens_seen": 1618840,
"step": 4295
},
{
"epoch": 3.8530465949820787,
"grad_norm": 77.0,
"learning_rate": 3.860023272563609e-05,
"loss": 0.4489,
"num_input_tokens_seen": 1620760,
"step": 4300
},
{
"epoch": 3.85752688172043,
"grad_norm": 0.039794921875,
"learning_rate": 3.856740989204884e-05,
"loss": 0.125,
"num_input_tokens_seen": 1622648,
"step": 4305
},
{
"epoch": 3.8620071684587813,
"grad_norm": 0.9921875,
"learning_rate": 3.8534553874884244e-05,
"loss": 0.2182,
"num_input_tokens_seen": 1624632,
"step": 4310
},
{
"epoch": 3.8664874551971327,
"grad_norm": 0.14453125,
"learning_rate": 3.850166475450252e-05,
"loss": 0.1119,
"num_input_tokens_seen": 1626520,
"step": 4315
},
{
"epoch": 3.870967741935484,
"grad_norm": 34.5,
"learning_rate": 3.846874261134485e-05,
"loss": 0.2454,
"num_input_tokens_seen": 1628536,
"step": 4320
},
{
"epoch": 3.8754480286738353,
"grad_norm": 4.09375,
"learning_rate": 3.843578752593323e-05,
"loss": 0.1671,
"num_input_tokens_seen": 1630488,
"step": 4325
},
{
"epoch": 3.8799283154121866,
"grad_norm": 33.75,
"learning_rate": 3.840279957887017e-05,
"loss": 0.4504,
"num_input_tokens_seen": 1632344,
"step": 4330
},
{
"epoch": 3.8844086021505375,
"grad_norm": 45.5,
"learning_rate": 3.836977885083858e-05,
"loss": 0.0496,
"num_input_tokens_seen": 1634296,
"step": 4335
},
{
"epoch": 3.888888888888889,
"grad_norm": 36.5,
"learning_rate": 3.833672542260156e-05,
"loss": 0.1614,
"num_input_tokens_seen": 1636312,
"step": 4340
},
{
"epoch": 3.89336917562724,
"grad_norm": 0.86328125,
"learning_rate": 3.830363937500216e-05,
"loss": 0.3482,
"num_input_tokens_seen": 1638296,
"step": 4345
},
{
"epoch": 3.8978494623655915,
"grad_norm": 40.75,
"learning_rate": 3.827052078896323e-05,
"loss": 0.2429,
"num_input_tokens_seen": 1640248,
"step": 4350
},
{
"epoch": 3.902329749103943,
"grad_norm": 0.0301513671875,
"learning_rate": 3.8237369745487205e-05,
"loss": 0.2033,
"num_input_tokens_seen": 1642040,
"step": 4355
},
{
"epoch": 3.9068100358422937,
"grad_norm": 6.625,
"learning_rate": 3.820418632565589e-05,
"loss": 0.1894,
"num_input_tokens_seen": 1643736,
"step": 4360
},
{
"epoch": 3.911290322580645,
"grad_norm": 59.5,
"learning_rate": 3.817097061063028e-05,
"loss": 0.3942,
"num_input_tokens_seen": 1645784,
"step": 4365
},
{
"epoch": 3.9157706093189963,
"grad_norm": 25.125,
"learning_rate": 3.81377226816504e-05,
"loss": 0.0785,
"num_input_tokens_seen": 1647480,
"step": 4370
},
{
"epoch": 3.9202508960573477,
"grad_norm": 4.96875,
"learning_rate": 3.8104442620035e-05,
"loss": 0.0073,
"num_input_tokens_seen": 1649336,
"step": 4375
},
{
"epoch": 3.924731182795699,
"grad_norm": 1.0625,
"learning_rate": 3.8071130507181466e-05,
"loss": 0.2769,
"num_input_tokens_seen": 1651192,
"step": 4380
},
{
"epoch": 3.9292114695340503,
"grad_norm": 0.0771484375,
"learning_rate": 3.803778642456553e-05,
"loss": 0.2948,
"num_input_tokens_seen": 1653080,
"step": 4385
},
{
"epoch": 3.9336917562724016,
"grad_norm": 2.046875,
"learning_rate": 3.800441045374119e-05,
"loss": 0.0056,
"num_input_tokens_seen": 1655000,
"step": 4390
},
{
"epoch": 3.938172043010753,
"grad_norm": 145.0,
"learning_rate": 3.797100267634038e-05,
"loss": 0.3582,
"num_input_tokens_seen": 1656824,
"step": 4395
},
{
"epoch": 3.942652329749104,
"grad_norm": 0.10302734375,
"learning_rate": 3.7937563174072826e-05,
"loss": 0.5087,
"num_input_tokens_seen": 1658712,
"step": 4400
},
{
"epoch": 3.947132616487455,
"grad_norm": 71.5,
"learning_rate": 3.790409202872588e-05,
"loss": 0.322,
"num_input_tokens_seen": 1660568,
"step": 4405
},
{
"epoch": 3.9516129032258065,
"grad_norm": 0.287109375,
"learning_rate": 3.787058932216427e-05,
"loss": 0.0615,
"num_input_tokens_seen": 1662392,
"step": 4410
},
{
"epoch": 3.956093189964158,
"grad_norm": 2.890625,
"learning_rate": 3.783705513632992e-05,
"loss": 0.1443,
"num_input_tokens_seen": 1664088,
"step": 4415
},
{
"epoch": 3.9605734767025087,
"grad_norm": 0.341796875,
"learning_rate": 3.780348955324173e-05,
"loss": 0.0813,
"num_input_tokens_seen": 1665912,
"step": 4420
},
{
"epoch": 3.96505376344086,
"grad_norm": 0.173828125,
"learning_rate": 3.7769892654995444e-05,
"loss": 0.3217,
"num_input_tokens_seen": 1667832,
"step": 4425
},
{
"epoch": 3.9695340501792113,
"grad_norm": 0.1279296875,
"learning_rate": 3.773626452376332e-05,
"loss": 0.2543,
"num_input_tokens_seen": 1669816,
"step": 4430
},
{
"epoch": 3.9740143369175627,
"grad_norm": 89.0,
"learning_rate": 3.7702605241794073e-05,
"loss": 0.2574,
"num_input_tokens_seen": 1671608,
"step": 4435
},
{
"epoch": 3.978494623655914,
"grad_norm": 38.0,
"learning_rate": 3.7668914891412574e-05,
"loss": 0.4786,
"num_input_tokens_seen": 1673400,
"step": 4440
},
{
"epoch": 3.9829749103942653,
"grad_norm": 1.109375,
"learning_rate": 3.7635193555019697e-05,
"loss": 0.1937,
"num_input_tokens_seen": 1675192,
"step": 4445
},
{
"epoch": 3.9874551971326166,
"grad_norm": 15.75,
"learning_rate": 3.760144131509209e-05,
"loss": 0.6487,
"num_input_tokens_seen": 1677048,
"step": 4450
},
{
"epoch": 3.991935483870968,
"grad_norm": 0.171875,
"learning_rate": 3.756765825418199e-05,
"loss": 0.1002,
"num_input_tokens_seen": 1679128,
"step": 4455
},
{
"epoch": 3.996415770609319,
"grad_norm": 15.25,
"learning_rate": 3.7533844454917025e-05,
"loss": 0.0172,
"num_input_tokens_seen": 1680856,
"step": 4460
},
{
"epoch": 4.0,
"eval_loss": 0.2599673867225647,
"eval_runtime": 9.6754,
"eval_samples_per_second": 51.264,
"eval_steps_per_second": 12.816,
"num_input_tokens_seen": 1682016,
"step": 4464
},
{
"epoch": 4.000896057347671,
"grad_norm": 0.8125,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0387,
"num_input_tokens_seen": 1682336,
"step": 4465
},
{
"epoch": 4.005376344086022,
"grad_norm": 3.5,
"learning_rate": 3.746612497220869e-05,
"loss": 0.2266,
"num_input_tokens_seen": 1684096,
"step": 4470
},
{
"epoch": 4.009856630824372,
"grad_norm": 0.59375,
"learning_rate": 3.743221945439566e-05,
"loss": 0.0754,
"num_input_tokens_seen": 1686112,
"step": 4475
},
{
"epoch": 4.014336917562724,
"grad_norm": 0.412109375,
"learning_rate": 3.739828352948803e-05,
"loss": 0.2888,
"num_input_tokens_seen": 1687872,
"step": 4480
},
{
"epoch": 4.018817204301075,
"grad_norm": 79.0,
"learning_rate": 3.736431728048731e-05,
"loss": 0.5343,
"num_input_tokens_seen": 1689760,
"step": 4485
},
{
"epoch": 4.023297491039426,
"grad_norm": 14.0625,
"learning_rate": 3.733032079046916e-05,
"loss": 0.3996,
"num_input_tokens_seen": 1691584,
"step": 4490
},
{
"epoch": 4.027777777777778,
"grad_norm": 33.5,
"learning_rate": 3.7296294142583225e-05,
"loss": 0.0841,
"num_input_tokens_seen": 1693376,
"step": 4495
},
{
"epoch": 4.032258064516129,
"grad_norm": 58.0,
"learning_rate": 3.726223742005289e-05,
"loss": 0.1875,
"num_input_tokens_seen": 1695232,
"step": 4500
},
{
"epoch": 4.03673835125448,
"grad_norm": 11.125,
"learning_rate": 3.7228150706175116e-05,
"loss": 0.0216,
"num_input_tokens_seen": 1697088,
"step": 4505
},
{
"epoch": 4.041218637992832,
"grad_norm": 28.625,
"learning_rate": 3.7194034084320195e-05,
"loss": 0.1411,
"num_input_tokens_seen": 1699008,
"step": 4510
},
{
"epoch": 4.045698924731183,
"grad_norm": 0.3984375,
"learning_rate": 3.715988763793158e-05,
"loss": 0.295,
"num_input_tokens_seen": 1701216,
"step": 4515
},
{
"epoch": 4.050179211469534,
"grad_norm": 55.5,
"learning_rate": 3.7125711450525704e-05,
"loss": 0.3443,
"num_input_tokens_seen": 1703040,
"step": 4520
},
{
"epoch": 4.054659498207886,
"grad_norm": 22.375,
"learning_rate": 3.7091505605691674e-05,
"loss": 0.2259,
"num_input_tokens_seen": 1704800,
"step": 4525
},
{
"epoch": 4.059139784946237,
"grad_norm": 2.03125,
"learning_rate": 3.705727018709118e-05,
"loss": 0.4618,
"num_input_tokens_seen": 1706592,
"step": 4530
},
{
"epoch": 4.063620071684587,
"grad_norm": 0.37109375,
"learning_rate": 3.702300527845825e-05,
"loss": 0.4405,
"num_input_tokens_seen": 1708544,
"step": 4535
},
{
"epoch": 4.068100358422939,
"grad_norm": 1.1328125,
"learning_rate": 3.6988710963598993e-05,
"loss": 0.1059,
"num_input_tokens_seen": 1710720,
"step": 4540
},
{
"epoch": 4.07258064516129,
"grad_norm": 0.022216796875,
"learning_rate": 3.695438732639149e-05,
"loss": 0.238,
"num_input_tokens_seen": 1712480,
"step": 4545
},
{
"epoch": 4.077060931899641,
"grad_norm": 24.0,
"learning_rate": 3.6920034450785526e-05,
"loss": 0.3368,
"num_input_tokens_seen": 1714368,
"step": 4550
},
{
"epoch": 4.081541218637993,
"grad_norm": 13.1875,
"learning_rate": 3.688565242080238e-05,
"loss": 0.1543,
"num_input_tokens_seen": 1716256,
"step": 4555
},
{
"epoch": 4.086021505376344,
"grad_norm": 14.3125,
"learning_rate": 3.6851241320534665e-05,
"loss": 0.0893,
"num_input_tokens_seen": 1718208,
"step": 4560
},
{
"epoch": 4.090501792114695,
"grad_norm": 12.4375,
"learning_rate": 3.681680123414606e-05,
"loss": 0.0782,
"num_input_tokens_seen": 1719936,
"step": 4565
},
{
"epoch": 4.094982078853047,
"grad_norm": 0.30859375,
"learning_rate": 3.678233224587118e-05,
"loss": 0.0109,
"num_input_tokens_seen": 1721856,
"step": 4570
},
{
"epoch": 4.099462365591398,
"grad_norm": 0.263671875,
"learning_rate": 3.6747834440015294e-05,
"loss": 0.0467,
"num_input_tokens_seen": 1723808,
"step": 4575
},
{
"epoch": 4.103942652329749,
"grad_norm": 65.0,
"learning_rate": 3.671330790095417e-05,
"loss": 0.2825,
"num_input_tokens_seen": 1725696,
"step": 4580
},
{
"epoch": 4.108422939068101,
"grad_norm": 0.1640625,
"learning_rate": 3.667875271313386e-05,
"loss": 0.0591,
"num_input_tokens_seen": 1727584,
"step": 4585
},
{
"epoch": 4.112903225806452,
"grad_norm": 1.2578125,
"learning_rate": 3.664416896107047e-05,
"loss": 0.4676,
"num_input_tokens_seen": 1729568,
"step": 4590
},
{
"epoch": 4.117383512544803,
"grad_norm": 66.0,
"learning_rate": 3.660955672934998e-05,
"loss": 0.5191,
"num_input_tokens_seen": 1731328,
"step": 4595
},
{
"epoch": 4.121863799283154,
"grad_norm": 0.6171875,
"learning_rate": 3.657491610262802e-05,
"loss": 0.2543,
"num_input_tokens_seen": 1733344,
"step": 4600
},
{
"epoch": 4.126344086021505,
"grad_norm": 23.0,
"learning_rate": 3.654024716562968e-05,
"loss": 0.0116,
"num_input_tokens_seen": 1735232,
"step": 4605
},
{
"epoch": 4.130824372759856,
"grad_norm": 0.040771484375,
"learning_rate": 3.650555000314927e-05,
"loss": 0.1877,
"num_input_tokens_seen": 1737248,
"step": 4610
},
{
"epoch": 4.135304659498208,
"grad_norm": 108.0,
"learning_rate": 3.6470824700050155e-05,
"loss": 0.0822,
"num_input_tokens_seen": 1739264,
"step": 4615
},
{
"epoch": 4.139784946236559,
"grad_norm": 71.5,
"learning_rate": 3.643607134126452e-05,
"loss": 0.2854,
"num_input_tokens_seen": 1741184,
"step": 4620
},
{
"epoch": 4.14426523297491,
"grad_norm": 1.859375,
"learning_rate": 3.6401290011793185e-05,
"loss": 0.1791,
"num_input_tokens_seen": 1742976,
"step": 4625
},
{
"epoch": 4.148745519713262,
"grad_norm": 0.1953125,
"learning_rate": 3.636648079670534e-05,
"loss": 0.4787,
"num_input_tokens_seen": 1744832,
"step": 4630
},
{
"epoch": 4.153225806451613,
"grad_norm": 0.259765625,
"learning_rate": 3.6331643781138426e-05,
"loss": 0.2963,
"num_input_tokens_seen": 1746592,
"step": 4635
},
{
"epoch": 4.157706093189964,
"grad_norm": 45.25,
"learning_rate": 3.629677905029785e-05,
"loss": 0.4201,
"num_input_tokens_seen": 1748384,
"step": 4640
},
{
"epoch": 4.162186379928316,
"grad_norm": 3.1875,
"learning_rate": 3.626188668945683e-05,
"loss": 0.1839,
"num_input_tokens_seen": 1750272,
"step": 4645
},
{
"epoch": 4.166666666666667,
"grad_norm": 46.0,
"learning_rate": 3.622696678395613e-05,
"loss": 0.2603,
"num_input_tokens_seen": 1752128,
"step": 4650
},
{
"epoch": 4.171146953405018,
"grad_norm": 0.5703125,
"learning_rate": 3.619201941920389e-05,
"loss": 0.3704,
"num_input_tokens_seen": 1754112,
"step": 4655
},
{
"epoch": 4.175627240143369,
"grad_norm": 0.0439453125,
"learning_rate": 3.615704468067545e-05,
"loss": 0.1367,
"num_input_tokens_seen": 1755936,
"step": 4660
},
{
"epoch": 4.18010752688172,
"grad_norm": 41.0,
"learning_rate": 3.612204265391306e-05,
"loss": 0.1925,
"num_input_tokens_seen": 1757792,
"step": 4665
},
{
"epoch": 4.184587813620071,
"grad_norm": 105.0,
"learning_rate": 3.608701342452573e-05,
"loss": 0.2616,
"num_input_tokens_seen": 1759680,
"step": 4670
},
{
"epoch": 4.189068100358423,
"grad_norm": 25.125,
"learning_rate": 3.605195707818898e-05,
"loss": 0.0973,
"num_input_tokens_seen": 1761568,
"step": 4675
},
{
"epoch": 4.193548387096774,
"grad_norm": 0.029052734375,
"learning_rate": 3.6016873700644685e-05,
"loss": 0.0057,
"num_input_tokens_seen": 1763488,
"step": 4680
},
{
"epoch": 4.198028673835125,
"grad_norm": 97.0,
"learning_rate": 3.598176337770082e-05,
"loss": 0.4031,
"num_input_tokens_seen": 1765440,
"step": 4685
},
{
"epoch": 4.202508960573477,
"grad_norm": 28.0,
"learning_rate": 3.594662619523127e-05,
"loss": 0.1148,
"num_input_tokens_seen": 1767296,
"step": 4690
},
{
"epoch": 4.206989247311828,
"grad_norm": 23.25,
"learning_rate": 3.5911462239175595e-05,
"loss": 0.2243,
"num_input_tokens_seen": 1769248,
"step": 4695
},
{
"epoch": 4.211469534050179,
"grad_norm": 1.2578125,
"learning_rate": 3.587627159553886e-05,
"loss": 0.0521,
"num_input_tokens_seen": 1770976,
"step": 4700
},
{
"epoch": 4.215949820788531,
"grad_norm": 0.2333984375,
"learning_rate": 3.5841054350391386e-05,
"loss": 0.3147,
"num_input_tokens_seen": 1772960,
"step": 4705
},
{
"epoch": 4.220430107526882,
"grad_norm": 0.0279541015625,
"learning_rate": 3.580581058986858e-05,
"loss": 0.1456,
"num_input_tokens_seen": 1774752,
"step": 4710
},
{
"epoch": 4.224910394265233,
"grad_norm": 98.0,
"learning_rate": 3.5770540400170675e-05,
"loss": 0.5658,
"num_input_tokens_seen": 1776800,
"step": 4715
},
{
"epoch": 4.229390681003585,
"grad_norm": 66.5,
"learning_rate": 3.573524386756256e-05,
"loss": 0.3347,
"num_input_tokens_seen": 1778752,
"step": 4720
},
{
"epoch": 4.233870967741935,
"grad_norm": 0.578125,
"learning_rate": 3.569992107837356e-05,
"loss": 0.0016,
"num_input_tokens_seen": 1780736,
"step": 4725
},
{
"epoch": 4.238351254480286,
"grad_norm": 69.0,
"learning_rate": 3.56645721189972e-05,
"loss": 0.4625,
"num_input_tokens_seen": 1782688,
"step": 4730
},
{
"epoch": 4.242831541218638,
"grad_norm": 5.625,
"learning_rate": 3.562919707589102e-05,
"loss": 0.1422,
"num_input_tokens_seen": 1784416,
"step": 4735
},
{
"epoch": 4.247311827956989,
"grad_norm": 0.083984375,
"learning_rate": 3.5593796035576373e-05,
"loss": 0.4159,
"num_input_tokens_seen": 1786432,
"step": 4740
},
{
"epoch": 4.25179211469534,
"grad_norm": 55.5,
"learning_rate": 3.555836908463817e-05,
"loss": 0.1763,
"num_input_tokens_seen": 1788352,
"step": 4745
},
{
"epoch": 4.256272401433692,
"grad_norm": 107.0,
"learning_rate": 3.552291630972472e-05,
"loss": 0.0591,
"num_input_tokens_seen": 1790208,
"step": 4750
},
{
"epoch": 4.260752688172043,
"grad_norm": 8.6875,
"learning_rate": 3.5487437797547476e-05,
"loss": 0.0395,
"num_input_tokens_seen": 1792032,
"step": 4755
},
{
"epoch": 4.265232974910394,
"grad_norm": 0.0361328125,
"learning_rate": 3.545193363488085e-05,
"loss": 0.0057,
"num_input_tokens_seen": 1793792,
"step": 4760
},
{
"epoch": 4.269713261648746,
"grad_norm": 0.1513671875,
"learning_rate": 3.5416403908561966e-05,
"loss": 0.1855,
"num_input_tokens_seen": 1795712,
"step": 4765
},
{
"epoch": 4.274193548387097,
"grad_norm": 33.25,
"learning_rate": 3.538084870549052e-05,
"loss": 0.2746,
"num_input_tokens_seen": 1797536,
"step": 4770
},
{
"epoch": 4.278673835125448,
"grad_norm": 5.0,
"learning_rate": 3.534526811262848e-05,
"loss": 0.3673,
"num_input_tokens_seen": 1799392,
"step": 4775
},
{
"epoch": 4.2831541218638,
"grad_norm": 39.75,
"learning_rate": 3.530966221699992e-05,
"loss": 0.2047,
"num_input_tokens_seen": 1801184,
"step": 4780
},
{
"epoch": 4.287634408602151,
"grad_norm": 73.0,
"learning_rate": 3.5274031105690826e-05,
"loss": 0.2754,
"num_input_tokens_seen": 1803104,
"step": 4785
},
{
"epoch": 4.292114695340501,
"grad_norm": 20.5,
"learning_rate": 3.523837486584881e-05,
"loss": 0.0328,
"num_input_tokens_seen": 1805024,
"step": 4790
},
{
"epoch": 4.296594982078853,
"grad_norm": 0.25,
"learning_rate": 3.5202693584682986e-05,
"loss": 0.4132,
"num_input_tokens_seen": 1806848,
"step": 4795
},
{
"epoch": 4.301075268817204,
"grad_norm": 66.5,
"learning_rate": 3.51669873494637e-05,
"loss": 0.2033,
"num_input_tokens_seen": 1808832,
"step": 4800
},
{
"epoch": 4.305555555555555,
"grad_norm": 115.0,
"learning_rate": 3.513125624752232e-05,
"loss": 0.4578,
"num_input_tokens_seen": 1810656,
"step": 4805
},
{
"epoch": 4.310035842293907,
"grad_norm": 15.25,
"learning_rate": 3.509550036625106e-05,
"loss": 0.0701,
"num_input_tokens_seen": 1812512,
"step": 4810
},
{
"epoch": 4.314516129032258,
"grad_norm": 50.0,
"learning_rate": 3.5059719793102716e-05,
"loss": 0.3195,
"num_input_tokens_seen": 1814336,
"step": 4815
},
{
"epoch": 4.318996415770609,
"grad_norm": 42.5,
"learning_rate": 3.502391461559049e-05,
"loss": 0.0871,
"num_input_tokens_seen": 1816288,
"step": 4820
},
{
"epoch": 4.323476702508961,
"grad_norm": 0.0247802734375,
"learning_rate": 3.498808492128776e-05,
"loss": 0.2108,
"num_input_tokens_seen": 1818144,
"step": 4825
},
{
"epoch": 4.327956989247312,
"grad_norm": 0.06591796875,
"learning_rate": 3.495223079782785e-05,
"loss": 0.0933,
"num_input_tokens_seen": 1820224,
"step": 4830
},
{
"epoch": 4.332437275985663,
"grad_norm": 92.0,
"learning_rate": 3.491635233290387e-05,
"loss": 0.2013,
"num_input_tokens_seen": 1822048,
"step": 4835
},
{
"epoch": 4.336917562724015,
"grad_norm": 240.0,
"learning_rate": 3.488044961426843e-05,
"loss": 0.4813,
"num_input_tokens_seen": 1824000,
"step": 4840
},
{
"epoch": 4.341397849462366,
"grad_norm": 88.0,
"learning_rate": 3.484452272973347e-05,
"loss": 0.1187,
"num_input_tokens_seen": 1825856,
"step": 4845
},
{
"epoch": 4.345878136200717,
"grad_norm": 0.32421875,
"learning_rate": 3.480857176717005e-05,
"loss": 0.2748,
"num_input_tokens_seen": 1827776,
"step": 4850
},
{
"epoch": 4.350358422939068,
"grad_norm": 0.453125,
"learning_rate": 3.4772596814508104e-05,
"loss": 0.2588,
"num_input_tokens_seen": 1829600,
"step": 4855
},
{
"epoch": 4.354838709677419,
"grad_norm": 0.2373046875,
"learning_rate": 3.473659795973626e-05,
"loss": 0.0906,
"num_input_tokens_seen": 1831360,
"step": 4860
},
{
"epoch": 4.35931899641577,
"grad_norm": 76.5,
"learning_rate": 3.470057529090159e-05,
"loss": 0.5825,
"num_input_tokens_seen": 1833152,
"step": 4865
},
{
"epoch": 4.363799283154122,
"grad_norm": 1.3125,
"learning_rate": 3.46645288961094e-05,
"loss": 0.3683,
"num_input_tokens_seen": 1834976,
"step": 4870
},
{
"epoch": 4.368279569892473,
"grad_norm": 1.078125,
"learning_rate": 3.462845886352306e-05,
"loss": 0.1733,
"num_input_tokens_seen": 1837024,
"step": 4875
},
{
"epoch": 4.372759856630824,
"grad_norm": 57.75,
"learning_rate": 3.4592365281363734e-05,
"loss": 0.1822,
"num_input_tokens_seen": 1838848,
"step": 4880
},
{
"epoch": 4.377240143369176,
"grad_norm": 0.376953125,
"learning_rate": 3.455624823791018e-05,
"loss": 0.0133,
"num_input_tokens_seen": 1840640,
"step": 4885
},
{
"epoch": 4.381720430107527,
"grad_norm": 66.5,
"learning_rate": 3.4520107821498544e-05,
"loss": 0.2264,
"num_input_tokens_seen": 1842752,
"step": 4890
},
{
"epoch": 4.386200716845878,
"grad_norm": 1.6640625,
"learning_rate": 3.448394412052215e-05,
"loss": 0.0015,
"num_input_tokens_seen": 1844480,
"step": 4895
},
{
"epoch": 4.39068100358423,
"grad_norm": 6.5625,
"learning_rate": 3.444775722343124e-05,
"loss": 0.3029,
"num_input_tokens_seen": 1846240,
"step": 4900
},
{
"epoch": 4.395161290322581,
"grad_norm": 0.048583984375,
"learning_rate": 3.441154721873284e-05,
"loss": 0.4875,
"num_input_tokens_seen": 1848064,
"step": 4905
},
{
"epoch": 4.399641577060932,
"grad_norm": 0.0341796875,
"learning_rate": 3.437531419499043e-05,
"loss": 0.0244,
"num_input_tokens_seen": 1850048,
"step": 4910
},
{
"epoch": 4.404121863799283,
"grad_norm": 1.2421875,
"learning_rate": 3.4339058240823843e-05,
"loss": 0.3374,
"num_input_tokens_seen": 1852032,
"step": 4915
},
{
"epoch": 4.408602150537634,
"grad_norm": 3.0625,
"learning_rate": 3.430277944490898e-05,
"loss": 0.0193,
"num_input_tokens_seen": 1853888,
"step": 4920
},
{
"epoch": 4.413082437275985,
"grad_norm": 0.333984375,
"learning_rate": 3.42664778959776e-05,
"loss": 0.0034,
"num_input_tokens_seen": 1855776,
"step": 4925
},
{
"epoch": 4.417562724014337,
"grad_norm": 93.0,
"learning_rate": 3.423015368281711e-05,
"loss": 0.3589,
"num_input_tokens_seen": 1857600,
"step": 4930
},
{
"epoch": 4.422043010752688,
"grad_norm": 157.0,
"learning_rate": 3.419380689427038e-05,
"loss": 0.3846,
"num_input_tokens_seen": 1859520,
"step": 4935
},
{
"epoch": 4.426523297491039,
"grad_norm": 68.0,
"learning_rate": 3.415743761923546e-05,
"loss": 0.1358,
"num_input_tokens_seen": 1861440,
"step": 4940
},
{
"epoch": 4.431003584229391,
"grad_norm": 195.0,
"learning_rate": 3.412104594666541e-05,
"loss": 0.3683,
"num_input_tokens_seen": 1863200,
"step": 4945
},
{
"epoch": 4.435483870967742,
"grad_norm": 9.3125,
"learning_rate": 3.408463196556807e-05,
"loss": 0.1198,
"num_input_tokens_seen": 1865120,
"step": 4950
},
{
"epoch": 4.439964157706093,
"grad_norm": 3.6875,
"learning_rate": 3.404819576500586e-05,
"loss": 0.0337,
"num_input_tokens_seen": 1867136,
"step": 4955
},
{
"epoch": 4.444444444444445,
"grad_norm": 59.75,
"learning_rate": 3.401173743409552e-05,
"loss": 0.4173,
"num_input_tokens_seen": 1868864,
"step": 4960
},
{
"epoch": 4.448924731182796,
"grad_norm": 78.0,
"learning_rate": 3.397525706200793e-05,
"loss": 0.3072,
"num_input_tokens_seen": 1870720,
"step": 4965
},
{
"epoch": 4.453405017921147,
"grad_norm": 49.0,
"learning_rate": 3.393875473796787e-05,
"loss": 0.3405,
"num_input_tokens_seen": 1872640,
"step": 4970
},
{
"epoch": 4.457885304659499,
"grad_norm": 67.0,
"learning_rate": 3.390223055125383e-05,
"loss": 0.4417,
"num_input_tokens_seen": 1874528,
"step": 4975
},
{
"epoch": 4.462365591397849,
"grad_norm": 97.0,
"learning_rate": 3.3865684591197745e-05,
"loss": 0.6058,
"num_input_tokens_seen": 1876416,
"step": 4980
},
{
"epoch": 4.4668458781362,
"grad_norm": 2.484375,
"learning_rate": 3.3829116947184823e-05,
"loss": 0.0121,
"num_input_tokens_seen": 1878176,
"step": 4985
},
{
"epoch": 4.471326164874552,
"grad_norm": 40.0,
"learning_rate": 3.379252770865331e-05,
"loss": 0.2859,
"num_input_tokens_seen": 1879968,
"step": 4990
},
{
"epoch": 4.475806451612903,
"grad_norm": 42.25,
"learning_rate": 3.375591696509425e-05,
"loss": 0.5438,
"num_input_tokens_seen": 1881760,
"step": 4995
},
{
"epoch": 4.480286738351254,
"grad_norm": 36.0,
"learning_rate": 3.371928480605131e-05,
"loss": 0.5051,
"num_input_tokens_seen": 1883648,
"step": 5000
},
{
"epoch": 4.484767025089606,
"grad_norm": 15.375,
"learning_rate": 3.3682631321120504e-05,
"loss": 0.0665,
"num_input_tokens_seen": 1885696,
"step": 5005
},
{
"epoch": 4.489247311827957,
"grad_norm": 3.21875,
"learning_rate": 3.3645956599950044e-05,
"loss": 0.1431,
"num_input_tokens_seen": 1887488,
"step": 5010
},
{
"epoch": 4.493727598566308,
"grad_norm": 27.5,
"learning_rate": 3.360926073224004e-05,
"loss": 0.172,
"num_input_tokens_seen": 1889472,
"step": 5015
},
{
"epoch": 4.49820788530466,
"grad_norm": 30.875,
"learning_rate": 3.3572543807742364e-05,
"loss": 0.3535,
"num_input_tokens_seen": 1891360,
"step": 5020
},
{
"epoch": 4.5,
"eval_loss": 0.26038724184036255,
"eval_runtime": 9.6376,
"eval_samples_per_second": 51.465,
"eval_steps_per_second": 12.866,
"num_input_tokens_seen": 1892160,
"step": 5022
},
{
"epoch": 4.502688172043011,
"grad_norm": 0.087890625,
"learning_rate": 3.3535805916260346e-05,
"loss": 0.0725,
"num_input_tokens_seen": 1893312,
"step": 5025
},
{
"epoch": 4.507168458781362,
"grad_norm": 1.140625,
"learning_rate": 3.3499047147648645e-05,
"loss": 0.3165,
"num_input_tokens_seen": 1895072,
"step": 5030
},
{
"epoch": 4.511648745519714,
"grad_norm": 47.0,
"learning_rate": 3.346226759181294e-05,
"loss": 0.4865,
"num_input_tokens_seen": 1896928,
"step": 5035
},
{
"epoch": 4.516129032258064,
"grad_norm": 26.875,
"learning_rate": 3.342546733870977e-05,
"loss": 0.1517,
"num_input_tokens_seen": 1898816,
"step": 5040
},
{
"epoch": 4.520609318996415,
"grad_norm": 0.06103515625,
"learning_rate": 3.338864647834631e-05,
"loss": 0.1587,
"num_input_tokens_seen": 1900672,
"step": 5045
},
{
"epoch": 4.525089605734767,
"grad_norm": 38.5,
"learning_rate": 3.335180510078012e-05,
"loss": 0.0171,
"num_input_tokens_seen": 1902528,
"step": 5050
},
{
"epoch": 4.529569892473118,
"grad_norm": 124.5,
"learning_rate": 3.331494329611894e-05,
"loss": 0.3617,
"num_input_tokens_seen": 1904672,
"step": 5055
},
{
"epoch": 4.534050179211469,
"grad_norm": 0.259765625,
"learning_rate": 3.327806115452046e-05,
"loss": 0.4538,
"num_input_tokens_seen": 1906720,
"step": 5060
},
{
"epoch": 4.538530465949821,
"grad_norm": 1.796875,
"learning_rate": 3.324115876619215e-05,
"loss": 0.2685,
"num_input_tokens_seen": 1908544,
"step": 5065
},
{
"epoch": 4.543010752688172,
"grad_norm": 0.3125,
"learning_rate": 3.3204236221390975e-05,
"loss": 0.0322,
"num_input_tokens_seen": 1910496,
"step": 5070
},
{
"epoch": 4.547491039426523,
"grad_norm": 118.5,
"learning_rate": 3.316729361042319e-05,
"loss": 0.453,
"num_input_tokens_seen": 1912288,
"step": 5075
},
{
"epoch": 4.551971326164875,
"grad_norm": 9.0,
"learning_rate": 3.3130331023644134e-05,
"loss": 0.2588,
"num_input_tokens_seen": 1914208,
"step": 5080
},
{
"epoch": 4.556451612903226,
"grad_norm": 4.53125,
"learning_rate": 3.309334855145803e-05,
"loss": 0.2538,
"num_input_tokens_seen": 1915968,
"step": 5085
},
{
"epoch": 4.560931899641577,
"grad_norm": 35.75,
"learning_rate": 3.30563462843177e-05,
"loss": 0.3915,
"num_input_tokens_seen": 1917824,
"step": 5090
},
{
"epoch": 4.565412186379929,
"grad_norm": 65.0,
"learning_rate": 3.301932431272439e-05,
"loss": 0.3966,
"num_input_tokens_seen": 1919808,
"step": 5095
},
{
"epoch": 4.56989247311828,
"grad_norm": 40.75,
"learning_rate": 3.2982282727227565e-05,
"loss": 0.1499,
"num_input_tokens_seen": 1921728,
"step": 5100
},
{
"epoch": 4.574372759856631,
"grad_norm": 0.72265625,
"learning_rate": 3.294522161842463e-05,
"loss": 0.4451,
"num_input_tokens_seen": 1923584,
"step": 5105
},
{
"epoch": 4.578853046594982,
"grad_norm": 24.5,
"learning_rate": 3.2908141076960766e-05,
"loss": 0.2992,
"num_input_tokens_seen": 1925504,
"step": 5110
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.35546875,
"learning_rate": 3.287104119352867e-05,
"loss": 0.2643,
"num_input_tokens_seen": 1927360,
"step": 5115
},
{
"epoch": 4.587813620071684,
"grad_norm": 0.322265625,
"learning_rate": 3.283392205886833e-05,
"loss": 0.1173,
"num_input_tokens_seen": 1929376,
"step": 5120
},
{
"epoch": 4.592293906810036,
"grad_norm": 42.0,
"learning_rate": 3.279678376376686e-05,
"loss": 0.2603,
"num_input_tokens_seen": 1931232,
"step": 5125
},
{
"epoch": 4.596774193548387,
"grad_norm": 136.0,
"learning_rate": 3.2759626399058196e-05,
"loss": 0.1167,
"num_input_tokens_seen": 1933056,
"step": 5130
},
{
"epoch": 4.601254480286738,
"grad_norm": 1.8203125,
"learning_rate": 3.2722450055622946e-05,
"loss": 0.1099,
"num_input_tokens_seen": 1935040,
"step": 5135
},
{
"epoch": 4.60573476702509,
"grad_norm": 78.0,
"learning_rate": 3.268525482438813e-05,
"loss": 0.2141,
"num_input_tokens_seen": 1937056,
"step": 5140
},
{
"epoch": 4.610215053763441,
"grad_norm": 0.042236328125,
"learning_rate": 3.264804079632693e-05,
"loss": 0.0394,
"num_input_tokens_seen": 1938944,
"step": 5145
},
{
"epoch": 4.614695340501792,
"grad_norm": 105.0,
"learning_rate": 3.2610808062458554e-05,
"loss": 0.259,
"num_input_tokens_seen": 1940768,
"step": 5150
},
{
"epoch": 4.619175627240144,
"grad_norm": 1.484375,
"learning_rate": 3.257355671384794e-05,
"loss": 0.3531,
"num_input_tokens_seen": 1942560,
"step": 5155
},
{
"epoch": 4.623655913978495,
"grad_norm": 11.0625,
"learning_rate": 3.253628684160554e-05,
"loss": 0.0807,
"num_input_tokens_seen": 1944480,
"step": 5160
},
{
"epoch": 4.628136200716845,
"grad_norm": 77.0,
"learning_rate": 3.2498998536887114e-05,
"loss": 0.0842,
"num_input_tokens_seen": 1946336,
"step": 5165
},
{
"epoch": 4.632616487455197,
"grad_norm": 13.5,
"learning_rate": 3.246169189089354e-05,
"loss": 0.0355,
"num_input_tokens_seen": 1948064,
"step": 5170
},
{
"epoch": 4.637096774193548,
"grad_norm": 55.25,
"learning_rate": 3.2424366994870515e-05,
"loss": 0.1918,
"num_input_tokens_seen": 1949952,
"step": 5175
},
{
"epoch": 4.641577060931899,
"grad_norm": 0.0159912109375,
"learning_rate": 3.238702394010839e-05,
"loss": 0.2356,
"num_input_tokens_seen": 1951680,
"step": 5180
},
{
"epoch": 4.646057347670251,
"grad_norm": 0.057861328125,
"learning_rate": 3.234966281794193e-05,
"loss": 0.0076,
"num_input_tokens_seen": 1953472,
"step": 5185
},
{
"epoch": 4.650537634408602,
"grad_norm": 0.34765625,
"learning_rate": 3.231228371975007e-05,
"loss": 0.2244,
"num_input_tokens_seen": 1955328,
"step": 5190
},
{
"epoch": 4.655017921146953,
"grad_norm": 16.875,
"learning_rate": 3.2274886736955744e-05,
"loss": 0.2196,
"num_input_tokens_seen": 1957184,
"step": 5195
},
{
"epoch": 4.659498207885305,
"grad_norm": 15.0,
"learning_rate": 3.223747196102561e-05,
"loss": 0.1521,
"num_input_tokens_seen": 1959040,
"step": 5200
},
{
"epoch": 4.663978494623656,
"grad_norm": 2.046875,
"learning_rate": 3.220003948346984e-05,
"loss": 0.0014,
"num_input_tokens_seen": 1961088,
"step": 5205
},
{
"epoch": 4.668458781362007,
"grad_norm": 0.89453125,
"learning_rate": 3.216258939584192e-05,
"loss": 0.0633,
"num_input_tokens_seen": 1962752,
"step": 5210
},
{
"epoch": 4.672939068100359,
"grad_norm": 16.0,
"learning_rate": 3.2125121789738384e-05,
"loss": 0.263,
"num_input_tokens_seen": 1964704,
"step": 5215
},
{
"epoch": 4.67741935483871,
"grad_norm": 30.375,
"learning_rate": 3.2087636756798635e-05,
"loss": 0.5342,
"num_input_tokens_seen": 1966688,
"step": 5220
},
{
"epoch": 4.681899641577061,
"grad_norm": 60.0,
"learning_rate": 3.205013438870468e-05,
"loss": 0.3616,
"num_input_tokens_seen": 1968480,
"step": 5225
},
{
"epoch": 4.686379928315413,
"grad_norm": 0.80078125,
"learning_rate": 3.201261477718093e-05,
"loss": 0.2161,
"num_input_tokens_seen": 1970304,
"step": 5230
},
{
"epoch": 4.690860215053764,
"grad_norm": 78.5,
"learning_rate": 3.197507801399399e-05,
"loss": 0.2878,
"num_input_tokens_seen": 1972224,
"step": 5235
},
{
"epoch": 4.695340501792114,
"grad_norm": 0.416015625,
"learning_rate": 3.193752419095239e-05,
"loss": 0.0702,
"num_input_tokens_seen": 1974016,
"step": 5240
},
{
"epoch": 4.699820788530466,
"grad_norm": 16.125,
"learning_rate": 3.18999533999064e-05,
"loss": 0.0138,
"num_input_tokens_seen": 1975840,
"step": 5245
},
{
"epoch": 4.704301075268817,
"grad_norm": 0.08544921875,
"learning_rate": 3.186236573274779e-05,
"loss": 0.0054,
"num_input_tokens_seen": 1977728,
"step": 5250
},
{
"epoch": 4.708781362007168,
"grad_norm": 0.177734375,
"learning_rate": 3.1824761281409574e-05,
"loss": 0.4327,
"num_input_tokens_seen": 1979776,
"step": 5255
},
{
"epoch": 4.71326164874552,
"grad_norm": 63.75,
"learning_rate": 3.178714013786587e-05,
"loss": 0.3298,
"num_input_tokens_seen": 1981728,
"step": 5260
},
{
"epoch": 4.717741935483871,
"grad_norm": 0.050048828125,
"learning_rate": 3.174950239413161e-05,
"loss": 0.424,
"num_input_tokens_seen": 1983776,
"step": 5265
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.57421875,
"learning_rate": 3.171184814226228e-05,
"loss": 0.3031,
"num_input_tokens_seen": 1985632,
"step": 5270
},
{
"epoch": 4.726702508960574,
"grad_norm": 0.05712890625,
"learning_rate": 3.167417747435379e-05,
"loss": 0.5461,
"num_input_tokens_seen": 1987456,
"step": 5275
},
{
"epoch": 4.731182795698925,
"grad_norm": 0.455078125,
"learning_rate": 3.16364904825422e-05,
"loss": 0.3433,
"num_input_tokens_seen": 1989344,
"step": 5280
},
{
"epoch": 4.735663082437276,
"grad_norm": 0.031494140625,
"learning_rate": 3.1598787259003476e-05,
"loss": 0.1058,
"num_input_tokens_seen": 1991232,
"step": 5285
},
{
"epoch": 4.740143369175628,
"grad_norm": 34.0,
"learning_rate": 3.1561067895953276e-05,
"loss": 0.2881,
"num_input_tokens_seen": 1993216,
"step": 5290
},
{
"epoch": 4.744623655913978,
"grad_norm": 51.0,
"learning_rate": 3.152333248564677e-05,
"loss": 0.1803,
"num_input_tokens_seen": 1995040,
"step": 5295
},
{
"epoch": 4.749103942652329,
"grad_norm": 76.0,
"learning_rate": 3.148558112037835e-05,
"loss": 0.1115,
"num_input_tokens_seen": 1996928,
"step": 5300
},
{
"epoch": 4.753584229390681,
"grad_norm": 1.3828125,
"learning_rate": 3.1447813892481425e-05,
"loss": 0.1554,
"num_input_tokens_seen": 1998976,
"step": 5305
},
{
"epoch": 4.758064516129032,
"grad_norm": 17.875,
"learning_rate": 3.141003089432822e-05,
"loss": 0.1214,
"num_input_tokens_seen": 2000864,
"step": 5310
},
{
"epoch": 4.762544802867383,
"grad_norm": 32.25,
"learning_rate": 3.137223221832951e-05,
"loss": 0.1336,
"num_input_tokens_seen": 2002688,
"step": 5315
},
{
"epoch": 4.767025089605735,
"grad_norm": 30.375,
"learning_rate": 3.133441795693445e-05,
"loss": 0.1565,
"num_input_tokens_seen": 2004864,
"step": 5320
},
{
"epoch": 4.771505376344086,
"grad_norm": 137.0,
"learning_rate": 3.129658820263028e-05,
"loss": 0.2658,
"num_input_tokens_seen": 2006880,
"step": 5325
},
{
"epoch": 4.775985663082437,
"grad_norm": 0.107421875,
"learning_rate": 3.125874304794214e-05,
"loss": 0.2702,
"num_input_tokens_seen": 2008704,
"step": 5330
},
{
"epoch": 4.780465949820789,
"grad_norm": 0.158203125,
"learning_rate": 3.122088258543287e-05,
"loss": 0.2167,
"num_input_tokens_seen": 2010592,
"step": 5335
},
{
"epoch": 4.78494623655914,
"grad_norm": 3.015625,
"learning_rate": 3.1183006907702684e-05,
"loss": 0.0527,
"num_input_tokens_seen": 2012448,
"step": 5340
},
{
"epoch": 4.789426523297491,
"grad_norm": 4.53125,
"learning_rate": 3.114511610738907e-05,
"loss": 0.0908,
"num_input_tokens_seen": 2014208,
"step": 5345
},
{
"epoch": 4.793906810035843,
"grad_norm": 14.0625,
"learning_rate": 3.110721027716649e-05,
"loss": 0.0619,
"num_input_tokens_seen": 2016032,
"step": 5350
},
{
"epoch": 4.798387096774194,
"grad_norm": 0.23046875,
"learning_rate": 3.106928950974614e-05,
"loss": 0.1405,
"num_input_tokens_seen": 2017920,
"step": 5355
},
{
"epoch": 4.802867383512545,
"grad_norm": 18.75,
"learning_rate": 3.103135389787578e-05,
"loss": 0.0146,
"num_input_tokens_seen": 2019936,
"step": 5360
},
{
"epoch": 4.807347670250896,
"grad_norm": 0.08349609375,
"learning_rate": 3.099340353433946e-05,
"loss": 0.4135,
"num_input_tokens_seen": 2021824,
"step": 5365
},
{
"epoch": 4.811827956989247,
"grad_norm": 0.01251220703125,
"learning_rate": 3.095543851195732e-05,
"loss": 0.0512,
"num_input_tokens_seen": 2023904,
"step": 5370
},
{
"epoch": 4.816308243727598,
"grad_norm": 93.5,
"learning_rate": 3.091745892358535e-05,
"loss": 0.1243,
"num_input_tokens_seen": 2025728,
"step": 5375
},
{
"epoch": 4.82078853046595,
"grad_norm": 0.0301513671875,
"learning_rate": 3.087946486211515e-05,
"loss": 0.021,
"num_input_tokens_seen": 2027520,
"step": 5380
},
{
"epoch": 4.825268817204301,
"grad_norm": 31.875,
"learning_rate": 3.084145642047374e-05,
"loss": 0.0517,
"num_input_tokens_seen": 2029568,
"step": 5385
},
{
"epoch": 4.829749103942652,
"grad_norm": 4.1875,
"learning_rate": 3.080343369162332e-05,
"loss": 0.0818,
"num_input_tokens_seen": 2031552,
"step": 5390
},
{
"epoch": 4.834229390681004,
"grad_norm": 3.265625,
"learning_rate": 3.076539676856101e-05,
"loss": 0.2148,
"num_input_tokens_seen": 2033472,
"step": 5395
},
{
"epoch": 4.838709677419355,
"grad_norm": 0.6953125,
"learning_rate": 3.0727345744318645e-05,
"loss": 0.0076,
"num_input_tokens_seen": 2035424,
"step": 5400
},
{
"epoch": 4.843189964157706,
"grad_norm": 55.75,
"learning_rate": 3.068928071196256e-05,
"loss": 0.1407,
"num_input_tokens_seen": 2037248,
"step": 5405
},
{
"epoch": 4.847670250896058,
"grad_norm": 0.0576171875,
"learning_rate": 3.065120176459338e-05,
"loss": 0.146,
"num_input_tokens_seen": 2039040,
"step": 5410
},
{
"epoch": 4.852150537634409,
"grad_norm": 69.0,
"learning_rate": 3.0613108995345694e-05,
"loss": 0.5169,
"num_input_tokens_seen": 2041152,
"step": 5415
},
{
"epoch": 4.856630824372759,
"grad_norm": 0.031494140625,
"learning_rate": 3.057500249738796e-05,
"loss": 0.1505,
"num_input_tokens_seen": 2043072,
"step": 5420
},
{
"epoch": 4.861111111111111,
"grad_norm": 92.0,
"learning_rate": 3.053688236392219e-05,
"loss": 0.0781,
"num_input_tokens_seen": 2045088,
"step": 5425
},
{
"epoch": 4.865591397849462,
"grad_norm": 51.0,
"learning_rate": 3.0498748688183744e-05,
"loss": 0.3524,
"num_input_tokens_seen": 2046912,
"step": 5430
},
{
"epoch": 4.870071684587813,
"grad_norm": 0.0908203125,
"learning_rate": 3.046060156344111e-05,
"loss": 0.3182,
"num_input_tokens_seen": 2048768,
"step": 5435
},
{
"epoch": 4.874551971326165,
"grad_norm": 21.5,
"learning_rate": 3.0422441082995667e-05,
"loss": 0.342,
"num_input_tokens_seen": 2050624,
"step": 5440
},
{
"epoch": 4.879032258064516,
"grad_norm": 108.5,
"learning_rate": 3.0384267340181462e-05,
"loss": 0.2555,
"num_input_tokens_seen": 2052608,
"step": 5445
},
{
"epoch": 4.883512544802867,
"grad_norm": 73.0,
"learning_rate": 3.0346080428364974e-05,
"loss": 0.6581,
"num_input_tokens_seen": 2054368,
"step": 5450
},
{
"epoch": 4.887992831541219,
"grad_norm": 0.50390625,
"learning_rate": 3.0307880440944902e-05,
"loss": 0.0765,
"num_input_tokens_seen": 2056448,
"step": 5455
},
{
"epoch": 4.89247311827957,
"grad_norm": 0.470703125,
"learning_rate": 3.026966747135192e-05,
"loss": 0.134,
"num_input_tokens_seen": 2058368,
"step": 5460
},
{
"epoch": 4.896953405017921,
"grad_norm": 0.609375,
"learning_rate": 3.023144161304844e-05,
"loss": 0.3222,
"num_input_tokens_seen": 2060256,
"step": 5465
},
{
"epoch": 4.901433691756273,
"grad_norm": 0.06103515625,
"learning_rate": 3.0193202959528426e-05,
"loss": 0.2726,
"num_input_tokens_seen": 2062240,
"step": 5470
},
{
"epoch": 4.905913978494624,
"grad_norm": 0.0296630859375,
"learning_rate": 3.0154951604317118e-05,
"loss": 0.2746,
"num_input_tokens_seen": 2063968,
"step": 5475
},
{
"epoch": 4.910394265232975,
"grad_norm": 116.5,
"learning_rate": 3.0116687640970814e-05,
"loss": 0.2709,
"num_input_tokens_seen": 2065920,
"step": 5480
},
{
"epoch": 4.914874551971327,
"grad_norm": 7.09375,
"learning_rate": 3.0078411163076682e-05,
"loss": 0.2316,
"num_input_tokens_seen": 2067808,
"step": 5485
},
{
"epoch": 4.919354838709677,
"grad_norm": 4.375,
"learning_rate": 3.0040122264252457e-05,
"loss": 0.193,
"num_input_tokens_seen": 2069888,
"step": 5490
},
{
"epoch": 4.923835125448028,
"grad_norm": 0.12353515625,
"learning_rate": 3.0001821038146287e-05,
"loss": 0.0151,
"num_input_tokens_seen": 2071712,
"step": 5495
},
{
"epoch": 4.92831541218638,
"grad_norm": 10.3125,
"learning_rate": 2.9963507578436456e-05,
"loss": 0.0039,
"num_input_tokens_seen": 2073536,
"step": 5500
},
{
"epoch": 4.932795698924731,
"grad_norm": 0.04443359375,
"learning_rate": 2.9925181978831163e-05,
"loss": 0.1415,
"num_input_tokens_seen": 2075392,
"step": 5505
},
{
"epoch": 4.937275985663082,
"grad_norm": 0.026611328125,
"learning_rate": 2.9886844333068314e-05,
"loss": 0.0181,
"num_input_tokens_seen": 2077280,
"step": 5510
},
{
"epoch": 4.941756272401434,
"grad_norm": 0.62109375,
"learning_rate": 2.9848494734915276e-05,
"loss": 0.2349,
"num_input_tokens_seen": 2079360,
"step": 5515
},
{
"epoch": 4.946236559139785,
"grad_norm": 57.25,
"learning_rate": 2.9810133278168643e-05,
"loss": 0.4095,
"num_input_tokens_seen": 2081216,
"step": 5520
},
{
"epoch": 4.950716845878136,
"grad_norm": 45.0,
"learning_rate": 2.9771760056654e-05,
"loss": 0.46,
"num_input_tokens_seen": 2082944,
"step": 5525
},
{
"epoch": 4.955197132616488,
"grad_norm": 0.031982421875,
"learning_rate": 2.973337516422574e-05,
"loss": 0.2482,
"num_input_tokens_seen": 2084768,
"step": 5530
},
{
"epoch": 4.959677419354839,
"grad_norm": 24.875,
"learning_rate": 2.9694978694766767e-05,
"loss": 0.0753,
"num_input_tokens_seen": 2086752,
"step": 5535
},
{
"epoch": 4.96415770609319,
"grad_norm": 8.9375,
"learning_rate": 2.9656570742188332e-05,
"loss": 0.0431,
"num_input_tokens_seen": 2088448,
"step": 5540
},
{
"epoch": 4.968637992831541,
"grad_norm": 0.28125,
"learning_rate": 2.961815140042974e-05,
"loss": 0.2202,
"num_input_tokens_seen": 2090432,
"step": 5545
},
{
"epoch": 4.973118279569892,
"grad_norm": 3.171875,
"learning_rate": 2.957972076345817e-05,
"loss": 0.2299,
"num_input_tokens_seen": 2092384,
"step": 5550
},
{
"epoch": 4.977598566308243,
"grad_norm": 79.0,
"learning_rate": 2.9541278925268428e-05,
"loss": 0.2494,
"num_input_tokens_seen": 2094080,
"step": 5555
},
{
"epoch": 4.982078853046595,
"grad_norm": 41.25,
"learning_rate": 2.950282597988272e-05,
"loss": 0.2931,
"num_input_tokens_seen": 2095776,
"step": 5560
},
{
"epoch": 4.986559139784946,
"grad_norm": 0.314453125,
"learning_rate": 2.9464362021350395e-05,
"loss": 0.0044,
"num_input_tokens_seen": 2097664,
"step": 5565
},
{
"epoch": 4.991039426523297,
"grad_norm": 73.5,
"learning_rate": 2.9425887143747773e-05,
"loss": 0.1293,
"num_input_tokens_seen": 2099456,
"step": 5570
},
{
"epoch": 4.995519713261649,
"grad_norm": 82.5,
"learning_rate": 2.938740144117784e-05,
"loss": 0.0742,
"num_input_tokens_seen": 2101312,
"step": 5575
},
{
"epoch": 5.0,
"grad_norm": 66.0,
"learning_rate": 2.93489050077701e-05,
"loss": 0.0257,
"num_input_tokens_seen": 2102920,
"step": 5580
},
{
"epoch": 5.0,
"eval_loss": 0.2711038887500763,
"eval_runtime": 9.6552,
"eval_samples_per_second": 51.371,
"eval_steps_per_second": 12.843,
"num_input_tokens_seen": 2102920,
"step": 5580
},
{
"epoch": 5.004480286738351,
"grad_norm": 0.28125,
"learning_rate": 2.9310397937680277e-05,
"loss": 0.1502,
"num_input_tokens_seen": 2104808,
"step": 5585
},
{
"epoch": 5.008960573476703,
"grad_norm": 49.25,
"learning_rate": 2.9271880325090105e-05,
"loss": 0.3615,
"num_input_tokens_seen": 2106568,
"step": 5590
},
{
"epoch": 5.013440860215054,
"grad_norm": 0.83984375,
"learning_rate": 2.9233352264207133e-05,
"loss": 0.23,
"num_input_tokens_seen": 2108456,
"step": 5595
},
{
"epoch": 5.017921146953405,
"grad_norm": 30.625,
"learning_rate": 2.919481384926443e-05,
"loss": 0.2035,
"num_input_tokens_seen": 2110184,
"step": 5600
},
{
"epoch": 5.022401433691757,
"grad_norm": 0.044921875,
"learning_rate": 2.9156265174520414e-05,
"loss": 0.1907,
"num_input_tokens_seen": 2112104,
"step": 5605
},
{
"epoch": 5.026881720430108,
"grad_norm": 76.5,
"learning_rate": 2.911770633425858e-05,
"loss": 0.1092,
"num_input_tokens_seen": 2114056,
"step": 5610
},
{
"epoch": 5.031362007168458,
"grad_norm": 56.0,
"learning_rate": 2.90791374227873e-05,
"loss": 0.2243,
"num_input_tokens_seen": 2115880,
"step": 5615
},
{
"epoch": 5.03584229390681,
"grad_norm": 5.3125,
"learning_rate": 2.9040558534439564e-05,
"loss": 0.0053,
"num_input_tokens_seen": 2117640,
"step": 5620
},
{
"epoch": 5.040322580645161,
"grad_norm": 2.1875,
"learning_rate": 2.9001969763572802e-05,
"loss": 0.0162,
"num_input_tokens_seen": 2119496,
"step": 5625
},
{
"epoch": 5.044802867383512,
"grad_norm": 1.1171875,
"learning_rate": 2.8963371204568542e-05,
"loss": 0.0019,
"num_input_tokens_seen": 2121384,
"step": 5630
},
{
"epoch": 5.049283154121864,
"grad_norm": 34.5,
"learning_rate": 2.892476295183232e-05,
"loss": 0.3719,
"num_input_tokens_seen": 2123336,
"step": 5635
},
{
"epoch": 5.053763440860215,
"grad_norm": 90.0,
"learning_rate": 2.888614509979336e-05,
"loss": 0.1717,
"num_input_tokens_seen": 2125064,
"step": 5640
},
{
"epoch": 5.058243727598566,
"grad_norm": 0.37890625,
"learning_rate": 2.8847517742904352e-05,
"loss": 0.0029,
"num_input_tokens_seen": 2126920,
"step": 5645
},
{
"epoch": 5.062724014336918,
"grad_norm": 90.5,
"learning_rate": 2.880888097564124e-05,
"loss": 0.3299,
"num_input_tokens_seen": 2128744,
"step": 5650
},
{
"epoch": 5.067204301075269,
"grad_norm": 17.5,
"learning_rate": 2.877023489250299e-05,
"loss": 0.1958,
"num_input_tokens_seen": 2130664,
"step": 5655
},
{
"epoch": 5.07168458781362,
"grad_norm": 59.25,
"learning_rate": 2.8731579588011343e-05,
"loss": 0.4188,
"num_input_tokens_seen": 2132520,
"step": 5660
},
{
"epoch": 5.076164874551972,
"grad_norm": 0.052490234375,
"learning_rate": 2.8692915156710615e-05,
"loss": 0.2564,
"num_input_tokens_seen": 2134536,
"step": 5665
},
{
"epoch": 5.080645161290323,
"grad_norm": 0.017333984375,
"learning_rate": 2.8654241693167423e-05,
"loss": 0.1847,
"num_input_tokens_seen": 2136616,
"step": 5670
},
{
"epoch": 5.085125448028673,
"grad_norm": 47.0,
"learning_rate": 2.8615559291970474e-05,
"loss": 0.3279,
"num_input_tokens_seen": 2138408,
"step": 5675
},
{
"epoch": 5.089605734767025,
"grad_norm": 1.0625,
"learning_rate": 2.8576868047730354e-05,
"loss": 0.0548,
"num_input_tokens_seen": 2140264,
"step": 5680
},
{
"epoch": 5.094086021505376,
"grad_norm": 0.4453125,
"learning_rate": 2.8538168055079262e-05,
"loss": 0.4299,
"num_input_tokens_seen": 2142152,
"step": 5685
},
{
"epoch": 5.098566308243727,
"grad_norm": 39.0,
"learning_rate": 2.8499459408670796e-05,
"loss": 0.202,
"num_input_tokens_seen": 2144040,
"step": 5690
},
{
"epoch": 5.103046594982079,
"grad_norm": 30.75,
"learning_rate": 2.846074220317973e-05,
"loss": 0.1125,
"num_input_tokens_seen": 2145896,
"step": 5695
},
{
"epoch": 5.10752688172043,
"grad_norm": 0.515625,
"learning_rate": 2.8422016533301753e-05,
"loss": 0.3313,
"num_input_tokens_seen": 2147720,
"step": 5700
},
{
"epoch": 5.112007168458781,
"grad_norm": 0.408203125,
"learning_rate": 2.8383282493753283e-05,
"loss": 0.1883,
"num_input_tokens_seen": 2149704,
"step": 5705
},
{
"epoch": 5.116487455197133,
"grad_norm": 0.1533203125,
"learning_rate": 2.8344540179271178e-05,
"loss": 0.1705,
"num_input_tokens_seen": 2151592,
"step": 5710
},
{
"epoch": 5.120967741935484,
"grad_norm": 118.5,
"learning_rate": 2.830578968461256e-05,
"loss": 0.2821,
"num_input_tokens_seen": 2153320,
"step": 5715
},
{
"epoch": 5.125448028673835,
"grad_norm": 137.0,
"learning_rate": 2.8267031104554552e-05,
"loss": 0.2323,
"num_input_tokens_seen": 2155144,
"step": 5720
},
{
"epoch": 5.129928315412187,
"grad_norm": 1.3125,
"learning_rate": 2.822826453389404e-05,
"loss": 0.2988,
"num_input_tokens_seen": 2156904,
"step": 5725
},
{
"epoch": 5.134408602150538,
"grad_norm": 65.5,
"learning_rate": 2.8189490067447473e-05,
"loss": 0.3838,
"num_input_tokens_seen": 2158792,
"step": 5730
},
{
"epoch": 5.138888888888889,
"grad_norm": 10.6875,
"learning_rate": 2.815070780005059e-05,
"loss": 0.2683,
"num_input_tokens_seen": 2160776,
"step": 5735
},
{
"epoch": 5.14336917562724,
"grad_norm": 40.25,
"learning_rate": 2.811191782655823e-05,
"loss": 0.591,
"num_input_tokens_seen": 2162568,
"step": 5740
},
{
"epoch": 5.147849462365591,
"grad_norm": 0.1689453125,
"learning_rate": 2.8073120241844077e-05,
"loss": 0.098,
"num_input_tokens_seen": 2164488,
"step": 5745
},
{
"epoch": 5.152329749103942,
"grad_norm": 103.0,
"learning_rate": 2.8034315140800414e-05,
"loss": 0.0768,
"num_input_tokens_seen": 2166184,
"step": 5750
},
{
"epoch": 5.156810035842294,
"grad_norm": 16.875,
"learning_rate": 2.7995502618337933e-05,
"loss": 0.0856,
"num_input_tokens_seen": 2168040,
"step": 5755
},
{
"epoch": 5.161290322580645,
"grad_norm": 58.0,
"learning_rate": 2.795668276938545e-05,
"loss": 0.2065,
"num_input_tokens_seen": 2170024,
"step": 5760
},
{
"epoch": 5.165770609318996,
"grad_norm": 114.0,
"learning_rate": 2.7917855688889717e-05,
"loss": 0.3425,
"num_input_tokens_seen": 2171848,
"step": 5765
},
{
"epoch": 5.170250896057348,
"grad_norm": 31.375,
"learning_rate": 2.787902147181517e-05,
"loss": 0.0603,
"num_input_tokens_seen": 2173608,
"step": 5770
},
{
"epoch": 5.174731182795699,
"grad_norm": 53.0,
"learning_rate": 2.7840180213143712e-05,
"loss": 0.2927,
"num_input_tokens_seen": 2175336,
"step": 5775
},
{
"epoch": 5.17921146953405,
"grad_norm": 123.0,
"learning_rate": 2.7801332007874437e-05,
"loss": 0.1042,
"num_input_tokens_seen": 2177192,
"step": 5780
},
{
"epoch": 5.183691756272402,
"grad_norm": 3.34375,
"learning_rate": 2.776247695102345e-05,
"loss": 0.1639,
"num_input_tokens_seen": 2178952,
"step": 5785
},
{
"epoch": 5.188172043010753,
"grad_norm": 3.46875,
"learning_rate": 2.7723615137623637e-05,
"loss": 0.0554,
"num_input_tokens_seen": 2180968,
"step": 5790
},
{
"epoch": 5.192652329749104,
"grad_norm": 0.51171875,
"learning_rate": 2.7684746662724363e-05,
"loss": 0.0475,
"num_input_tokens_seen": 2182792,
"step": 5795
},
{
"epoch": 5.197132616487456,
"grad_norm": 0.0478515625,
"learning_rate": 2.7645871621391305e-05,
"loss": 0.0097,
"num_input_tokens_seen": 2184648,
"step": 5800
},
{
"epoch": 5.201612903225806,
"grad_norm": 55.25,
"learning_rate": 2.760699010870622e-05,
"loss": 0.2714,
"num_input_tokens_seen": 2186440,
"step": 5805
},
{
"epoch": 5.206093189964157,
"grad_norm": 179.0,
"learning_rate": 2.7568102219766666e-05,
"loss": 0.1893,
"num_input_tokens_seen": 2188424,
"step": 5810
},
{
"epoch": 5.210573476702509,
"grad_norm": 44.75,
"learning_rate": 2.7529208049685807e-05,
"loss": 0.447,
"num_input_tokens_seen": 2190152,
"step": 5815
},
{
"epoch": 5.21505376344086,
"grad_norm": 0.0205078125,
"learning_rate": 2.7490307693592172e-05,
"loss": 0.0654,
"num_input_tokens_seen": 2192072,
"step": 5820
},
{
"epoch": 5.219534050179211,
"grad_norm": 40.5,
"learning_rate": 2.7451401246629403e-05,
"loss": 0.2314,
"num_input_tokens_seen": 2194056,
"step": 5825
},
{
"epoch": 5.224014336917563,
"grad_norm": 1.2421875,
"learning_rate": 2.741248880395607e-05,
"loss": 0.1646,
"num_input_tokens_seen": 2195816,
"step": 5830
},
{
"epoch": 5.228494623655914,
"grad_norm": 80.5,
"learning_rate": 2.7373570460745384e-05,
"loss": 0.4708,
"num_input_tokens_seen": 2197736,
"step": 5835
},
{
"epoch": 5.232974910394265,
"grad_norm": 8.9375,
"learning_rate": 2.7334646312184997e-05,
"loss": 0.1236,
"num_input_tokens_seen": 2199688,
"step": 5840
},
{
"epoch": 5.237455197132617,
"grad_norm": 106.5,
"learning_rate": 2.7295716453476755e-05,
"loss": 0.282,
"num_input_tokens_seen": 2201576,
"step": 5845
},
{
"epoch": 5.241935483870968,
"grad_norm": 2.15625,
"learning_rate": 2.7256780979836466e-05,
"loss": 0.0954,
"num_input_tokens_seen": 2203624,
"step": 5850
},
{
"epoch": 5.246415770609319,
"grad_norm": 0.1845703125,
"learning_rate": 2.721783998649369e-05,
"loss": 0.1783,
"num_input_tokens_seen": 2205448,
"step": 5855
},
{
"epoch": 5.250896057347671,
"grad_norm": 65.5,
"learning_rate": 2.717889356869146e-05,
"loss": 0.3688,
"num_input_tokens_seen": 2207272,
"step": 5860
},
{
"epoch": 5.255376344086022,
"grad_norm": 119.5,
"learning_rate": 2.71399418216861e-05,
"loss": 0.3141,
"num_input_tokens_seen": 2209160,
"step": 5865
},
{
"epoch": 5.259856630824372,
"grad_norm": 31.125,
"learning_rate": 2.7100984840746956e-05,
"loss": 0.2935,
"num_input_tokens_seen": 2211080,
"step": 5870
},
{
"epoch": 5.264336917562724,
"grad_norm": 65.0,
"learning_rate": 2.7062022721156177e-05,
"loss": 0.21,
"num_input_tokens_seen": 2213032,
"step": 5875
},
{
"epoch": 5.268817204301075,
"grad_norm": 0.0230712890625,
"learning_rate": 2.7023055558208487e-05,
"loss": 0.0095,
"num_input_tokens_seen": 2214824,
"step": 5880
},
{
"epoch": 5.273297491039426,
"grad_norm": 59.0,
"learning_rate": 2.6984083447210945e-05,
"loss": 0.1312,
"num_input_tokens_seen": 2216648,
"step": 5885
},
{
"epoch": 5.277777777777778,
"grad_norm": 13.25,
"learning_rate": 2.6945106483482686e-05,
"loss": 0.4985,
"num_input_tokens_seen": 2218440,
"step": 5890
},
{
"epoch": 5.282258064516129,
"grad_norm": 0.0230712890625,
"learning_rate": 2.690612476235475e-05,
"loss": 0.3734,
"num_input_tokens_seen": 2220424,
"step": 5895
},
{
"epoch": 5.28673835125448,
"grad_norm": 93.0,
"learning_rate": 2.6867138379169802e-05,
"loss": 0.3469,
"num_input_tokens_seen": 2222152,
"step": 5900
},
{
"epoch": 5.291218637992832,
"grad_norm": 39.0,
"learning_rate": 2.6828147429281902e-05,
"loss": 0.057,
"num_input_tokens_seen": 2223976,
"step": 5905
},
{
"epoch": 5.295698924731183,
"grad_norm": 142.0,
"learning_rate": 2.6789152008056272e-05,
"loss": 0.1789,
"num_input_tokens_seen": 2225960,
"step": 5910
},
{
"epoch": 5.300179211469534,
"grad_norm": 0.302734375,
"learning_rate": 2.6750152210869095e-05,
"loss": 0.5755,
"num_input_tokens_seen": 2227912,
"step": 5915
},
{
"epoch": 5.304659498207886,
"grad_norm": 0.150390625,
"learning_rate": 2.6711148133107233e-05,
"loss": 0.213,
"num_input_tokens_seen": 2229736,
"step": 5920
},
{
"epoch": 5.309139784946237,
"grad_norm": 39.5,
"learning_rate": 2.6672139870168034e-05,
"loss": 0.2354,
"num_input_tokens_seen": 2231720,
"step": 5925
},
{
"epoch": 5.313620071684587,
"grad_norm": 98.5,
"learning_rate": 2.6633127517459066e-05,
"loss": 0.3518,
"num_input_tokens_seen": 2233544,
"step": 5930
},
{
"epoch": 5.318100358422939,
"grad_norm": 0.94140625,
"learning_rate": 2.6594111170397916e-05,
"loss": 0.1099,
"num_input_tokens_seen": 2235336,
"step": 5935
},
{
"epoch": 5.32258064516129,
"grad_norm": 10.6875,
"learning_rate": 2.655509092441194e-05,
"loss": 0.1688,
"num_input_tokens_seen": 2237128,
"step": 5940
},
{
"epoch": 5.327060931899641,
"grad_norm": 12.25,
"learning_rate": 2.6516066874938023e-05,
"loss": 0.5294,
"num_input_tokens_seen": 2239016,
"step": 5945
},
{
"epoch": 5.331541218637993,
"grad_norm": 0.10791015625,
"learning_rate": 2.6477039117422335e-05,
"loss": 0.3577,
"num_input_tokens_seen": 2240968,
"step": 5950
},
{
"epoch": 5.336021505376344,
"grad_norm": 1.1484375,
"learning_rate": 2.6438007747320153e-05,
"loss": 0.0191,
"num_input_tokens_seen": 2242728,
"step": 5955
},
{
"epoch": 5.340501792114695,
"grad_norm": 118.5,
"learning_rate": 2.639897286009556e-05,
"loss": 0.4098,
"num_input_tokens_seen": 2244584,
"step": 5960
},
{
"epoch": 5.344982078853047,
"grad_norm": 0.1796875,
"learning_rate": 2.6359934551221267e-05,
"loss": 0.2375,
"num_input_tokens_seen": 2246408,
"step": 5965
},
{
"epoch": 5.349462365591398,
"grad_norm": 0.2216796875,
"learning_rate": 2.6320892916178326e-05,
"loss": 0.327,
"num_input_tokens_seen": 2248456,
"step": 5970
},
{
"epoch": 5.353942652329749,
"grad_norm": 0.046142578125,
"learning_rate": 2.628184805045593e-05,
"loss": 0.0072,
"num_input_tokens_seen": 2250216,
"step": 5975
},
{
"epoch": 5.358422939068101,
"grad_norm": 59.5,
"learning_rate": 2.6242800049551192e-05,
"loss": 0.4777,
"num_input_tokens_seen": 2252040,
"step": 5980
},
{
"epoch": 5.362903225806452,
"grad_norm": 76.0,
"learning_rate": 2.620374900896889e-05,
"loss": 0.0377,
"num_input_tokens_seen": 2253992,
"step": 5985
},
{
"epoch": 5.367383512544803,
"grad_norm": 0.09716796875,
"learning_rate": 2.6164695024221215e-05,
"loss": 0.2612,
"num_input_tokens_seen": 2255816,
"step": 5990
},
{
"epoch": 5.371863799283154,
"grad_norm": 77.5,
"learning_rate": 2.612563819082757e-05,
"loss": 0.1335,
"num_input_tokens_seen": 2257672,
"step": 5995
},
{
"epoch": 5.376344086021505,
"grad_norm": 80.0,
"learning_rate": 2.6086578604314337e-05,
"loss": 0.0819,
"num_input_tokens_seen": 2259688,
"step": 6000
},
{
"epoch": 5.380824372759856,
"grad_norm": 72.5,
"learning_rate": 2.6047516360214623e-05,
"loss": 0.1127,
"num_input_tokens_seen": 2261512,
"step": 6005
},
{
"epoch": 5.385304659498208,
"grad_norm": 0.1298828125,
"learning_rate": 2.6008451554068025e-05,
"loss": 0.215,
"num_input_tokens_seen": 2263240,
"step": 6010
},
{
"epoch": 5.389784946236559,
"grad_norm": 4.6875,
"learning_rate": 2.5969384281420424e-05,
"loss": 0.0062,
"num_input_tokens_seen": 2265000,
"step": 6015
},
{
"epoch": 5.39426523297491,
"grad_norm": 8.9375,
"learning_rate": 2.593031463782371e-05,
"loss": 0.1147,
"num_input_tokens_seen": 2266792,
"step": 6020
},
{
"epoch": 5.398745519713262,
"grad_norm": 81.0,
"learning_rate": 2.5891242718835614e-05,
"loss": 0.3116,
"num_input_tokens_seen": 2268648,
"step": 6025
},
{
"epoch": 5.403225806451613,
"grad_norm": 0.33984375,
"learning_rate": 2.5852168620019385e-05,
"loss": 0.0036,
"num_input_tokens_seen": 2270472,
"step": 6030
},
{
"epoch": 5.407706093189964,
"grad_norm": 0.08447265625,
"learning_rate": 2.5813092436943626e-05,
"loss": 0.0404,
"num_input_tokens_seen": 2272296,
"step": 6035
},
{
"epoch": 5.412186379928316,
"grad_norm": 139.0,
"learning_rate": 2.577401426518204e-05,
"loss": 0.0725,
"num_input_tokens_seen": 2274248,
"step": 6040
},
{
"epoch": 5.416666666666667,
"grad_norm": 1.3984375,
"learning_rate": 2.573493420031318e-05,
"loss": 0.3543,
"num_input_tokens_seen": 2276168,
"step": 6045
},
{
"epoch": 5.421146953405018,
"grad_norm": 0.162109375,
"learning_rate": 2.569585233792027e-05,
"loss": 0.1768,
"num_input_tokens_seen": 2278056,
"step": 6050
},
{
"epoch": 5.425627240143369,
"grad_norm": 111.5,
"learning_rate": 2.5656768773590854e-05,
"loss": 0.2,
"num_input_tokens_seen": 2279944,
"step": 6055
},
{
"epoch": 5.43010752688172,
"grad_norm": 0.29296875,
"learning_rate": 2.5617683602916714e-05,
"loss": 0.278,
"num_input_tokens_seen": 2281896,
"step": 6060
},
{
"epoch": 5.434587813620071,
"grad_norm": 0.89453125,
"learning_rate": 2.5578596921493525e-05,
"loss": 0.1334,
"num_input_tokens_seen": 2283592,
"step": 6065
},
{
"epoch": 5.439068100358423,
"grad_norm": 8.875,
"learning_rate": 2.553950882492066e-05,
"loss": 0.1822,
"num_input_tokens_seen": 2285640,
"step": 6070
},
{
"epoch": 5.443548387096774,
"grad_norm": 0.19140625,
"learning_rate": 2.5500419408800953e-05,
"loss": 0.0844,
"num_input_tokens_seen": 2287464,
"step": 6075
},
{
"epoch": 5.448028673835125,
"grad_norm": 97.5,
"learning_rate": 2.546132876874048e-05,
"loss": 0.2133,
"num_input_tokens_seen": 2289352,
"step": 6080
},
{
"epoch": 5.452508960573477,
"grad_norm": 0.2119140625,
"learning_rate": 2.5422237000348276e-05,
"loss": 0.2076,
"num_input_tokens_seen": 2291240,
"step": 6085
},
{
"epoch": 5.456989247311828,
"grad_norm": 0.056640625,
"learning_rate": 2.5383144199236188e-05,
"loss": 0.025,
"num_input_tokens_seen": 2293352,
"step": 6090
},
{
"epoch": 5.461469534050179,
"grad_norm": 98.0,
"learning_rate": 2.5344050461018542e-05,
"loss": 0.2368,
"num_input_tokens_seen": 2295464,
"step": 6095
},
{
"epoch": 5.465949820788531,
"grad_norm": 0.0299072265625,
"learning_rate": 2.530495588131197e-05,
"loss": 0.3277,
"num_input_tokens_seen": 2297160,
"step": 6100
},
{
"epoch": 5.470430107526882,
"grad_norm": 1.03125,
"learning_rate": 2.526586055573518e-05,
"loss": 0.1369,
"num_input_tokens_seen": 2299048,
"step": 6105
},
{
"epoch": 5.474910394265233,
"grad_norm": 2.8125,
"learning_rate": 2.5226764579908678e-05,
"loss": 0.1135,
"num_input_tokens_seen": 2300904,
"step": 6110
},
{
"epoch": 5.479390681003585,
"grad_norm": 0.08935546875,
"learning_rate": 2.5187668049454583e-05,
"loss": 0.4181,
"num_input_tokens_seen": 2302824,
"step": 6115
},
{
"epoch": 5.483870967741936,
"grad_norm": 0.03271484375,
"learning_rate": 2.5148571059996346e-05,
"loss": 0.0702,
"num_input_tokens_seen": 2304648,
"step": 6120
},
{
"epoch": 5.488351254480286,
"grad_norm": 0.09228515625,
"learning_rate": 2.5109473707158565e-05,
"loss": 0.1783,
"num_input_tokens_seen": 2306760,
"step": 6125
},
{
"epoch": 5.492831541218638,
"grad_norm": 21.25,
"learning_rate": 2.5070376086566704e-05,
"loss": 0.3409,
"num_input_tokens_seen": 2308648,
"step": 6130
},
{
"epoch": 5.497311827956989,
"grad_norm": 67.0,
"learning_rate": 2.5031278293846922e-05,
"loss": 0.2185,
"num_input_tokens_seen": 2310728,
"step": 6135
},
{
"epoch": 5.5,
"eval_loss": 0.2833144962787628,
"eval_runtime": 9.672,
"eval_samples_per_second": 51.282,
"eval_steps_per_second": 12.821,
"num_input_tokens_seen": 2311976,
"step": 6138
},
{
"epoch": 5.50179211469534,
"grad_norm": 0.047119140625,
"learning_rate": 2.4992180424625737e-05,
"loss": 0.2343,
"num_input_tokens_seen": 2312904,
"step": 6140
},
{
"epoch": 5.506272401433692,
"grad_norm": 0.0986328125,
"learning_rate": 2.4953082574529906e-05,
"loss": 0.319,
"num_input_tokens_seen": 2314856,
"step": 6145
},
{
"epoch": 5.510752688172043,
"grad_norm": 0.12158203125,
"learning_rate": 2.491398483918612e-05,
"loss": 0.4704,
"num_input_tokens_seen": 2316808,
"step": 6150
},
{
"epoch": 5.515232974910394,
"grad_norm": 11.3125,
"learning_rate": 2.48748873142208e-05,
"loss": 0.0074,
"num_input_tokens_seen": 2318664,
"step": 6155
},
{
"epoch": 5.519713261648746,
"grad_norm": 112.0,
"learning_rate": 2.4835790095259825e-05,
"loss": 0.1822,
"num_input_tokens_seen": 2320552,
"step": 6160
},
{
"epoch": 5.524193548387097,
"grad_norm": 86.5,
"learning_rate": 2.479669327792835e-05,
"loss": 0.1711,
"num_input_tokens_seen": 2322632,
"step": 6165
},
{
"epoch": 5.528673835125448,
"grad_norm": 0.0245361328125,
"learning_rate": 2.475759695785054e-05,
"loss": 0.1648,
"num_input_tokens_seen": 2324360,
"step": 6170
},
{
"epoch": 5.5331541218638,
"grad_norm": 2.46875,
"learning_rate": 2.4718501230649355e-05,
"loss": 0.1408,
"num_input_tokens_seen": 2326184,
"step": 6175
},
{
"epoch": 5.53763440860215,
"grad_norm": 0.216796875,
"learning_rate": 2.4679406191946285e-05,
"loss": 0.215,
"num_input_tokens_seen": 2328072,
"step": 6180
},
{
"epoch": 5.542114695340501,
"grad_norm": 85.5,
"learning_rate": 2.464031193736116e-05,
"loss": 0.0337,
"num_input_tokens_seen": 2329960,
"step": 6185
},
{
"epoch": 5.546594982078853,
"grad_norm": 74.5,
"learning_rate": 2.4601218562511856e-05,
"loss": 0.0812,
"num_input_tokens_seen": 2331816,
"step": 6190
},
{
"epoch": 5.551075268817204,
"grad_norm": 8.375,
"learning_rate": 2.4562126163014134e-05,
"loss": 0.0034,
"num_input_tokens_seen": 2333800,
"step": 6195
},
{
"epoch": 5.555555555555555,
"grad_norm": 7.1875,
"learning_rate": 2.452303483448136e-05,
"loss": 0.2057,
"num_input_tokens_seen": 2335624,
"step": 6200
},
{
"epoch": 5.560035842293907,
"grad_norm": 59.25,
"learning_rate": 2.4483944672524263e-05,
"loss": 0.0976,
"num_input_tokens_seen": 2337544,
"step": 6205
},
{
"epoch": 5.564516129032258,
"grad_norm": 0.58203125,
"learning_rate": 2.444485577275075e-05,
"loss": 0.0545,
"num_input_tokens_seen": 2339400,
"step": 6210
},
{
"epoch": 5.568996415770609,
"grad_norm": 46.5,
"learning_rate": 2.44057682307656e-05,
"loss": 0.3086,
"num_input_tokens_seen": 2341128,
"step": 6215
},
{
"epoch": 5.573476702508961,
"grad_norm": 0.22265625,
"learning_rate": 2.436668214217031e-05,
"loss": 0.0009,
"num_input_tokens_seen": 2342920,
"step": 6220
},
{
"epoch": 5.577956989247312,
"grad_norm": 5.625,
"learning_rate": 2.4327597602562792e-05,
"loss": 0.084,
"num_input_tokens_seen": 2344968,
"step": 6225
},
{
"epoch": 5.582437275985663,
"grad_norm": 65.0,
"learning_rate": 2.428851470753719e-05,
"loss": 0.2391,
"num_input_tokens_seen": 2346824,
"step": 6230
},
{
"epoch": 5.586917562724015,
"grad_norm": 56.25,
"learning_rate": 2.4249433552683627e-05,
"loss": 0.2893,
"num_input_tokens_seen": 2348712,
"step": 6235
},
{
"epoch": 5.591397849462366,
"grad_norm": 56.5,
"learning_rate": 2.4210354233587955e-05,
"loss": 0.0758,
"num_input_tokens_seen": 2350600,
"step": 6240
},
{
"epoch": 5.595878136200717,
"grad_norm": 0.056640625,
"learning_rate": 2.417127684583154e-05,
"loss": 0.0303,
"num_input_tokens_seen": 2352584,
"step": 6245
},
{
"epoch": 5.600358422939068,
"grad_norm": 0.11376953125,
"learning_rate": 2.413220148499103e-05,
"loss": 0.185,
"num_input_tokens_seen": 2354408,
"step": 6250
},
{
"epoch": 5.604838709677419,
"grad_norm": 8.8125,
"learning_rate": 2.409312824663811e-05,
"loss": 0.146,
"num_input_tokens_seen": 2356264,
"step": 6255
},
{
"epoch": 5.60931899641577,
"grad_norm": 6.3125,
"learning_rate": 2.405405722633928e-05,
"loss": 0.2155,
"num_input_tokens_seen": 2358152,
"step": 6260
},
{
"epoch": 5.613799283154122,
"grad_norm": 0.328125,
"learning_rate": 2.4014988519655618e-05,
"loss": 0.3023,
"num_input_tokens_seen": 2359912,
"step": 6265
},
{
"epoch": 5.618279569892473,
"grad_norm": 0.039306640625,
"learning_rate": 2.3975922222142517e-05,
"loss": 0.3518,
"num_input_tokens_seen": 2361864,
"step": 6270
},
{
"epoch": 5.622759856630824,
"grad_norm": 0.1123046875,
"learning_rate": 2.3936858429349508e-05,
"loss": 0.4005,
"num_input_tokens_seen": 2363784,
"step": 6275
},
{
"epoch": 5.627240143369176,
"grad_norm": 18.125,
"learning_rate": 2.389779723681999e-05,
"loss": 0.1477,
"num_input_tokens_seen": 2365608,
"step": 6280
},
{
"epoch": 5.631720430107527,
"grad_norm": 109.0,
"learning_rate": 2.3858738740090995e-05,
"loss": 0.3798,
"num_input_tokens_seen": 2367496,
"step": 6285
},
{
"epoch": 5.636200716845878,
"grad_norm": 0.033203125,
"learning_rate": 2.3819683034692953e-05,
"loss": 0.5149,
"num_input_tokens_seen": 2369416,
"step": 6290
},
{
"epoch": 5.64068100358423,
"grad_norm": 73.0,
"learning_rate": 2.3780630216149506e-05,
"loss": 0.7061,
"num_input_tokens_seen": 2371336,
"step": 6295
},
{
"epoch": 5.645161290322581,
"grad_norm": 47.5,
"learning_rate": 2.374158037997717e-05,
"loss": 0.5409,
"num_input_tokens_seen": 2373416,
"step": 6300
},
{
"epoch": 5.649641577060932,
"grad_norm": 0.02880859375,
"learning_rate": 2.3702533621685228e-05,
"loss": 0.1612,
"num_input_tokens_seen": 2375304,
"step": 6305
},
{
"epoch": 5.654121863799283,
"grad_norm": 49.5,
"learning_rate": 2.36634900367754e-05,
"loss": 0.4988,
"num_input_tokens_seen": 2377192,
"step": 6310
},
{
"epoch": 5.658602150537634,
"grad_norm": 39.75,
"learning_rate": 2.3624449720741654e-05,
"loss": 0.1996,
"num_input_tokens_seen": 2379080,
"step": 6315
},
{
"epoch": 5.663082437275985,
"grad_norm": 80.0,
"learning_rate": 2.3585412769069984e-05,
"loss": 0.0562,
"num_input_tokens_seen": 2381384,
"step": 6320
},
{
"epoch": 5.667562724014337,
"grad_norm": 0.55859375,
"learning_rate": 2.3546379277238107e-05,
"loss": 0.076,
"num_input_tokens_seen": 2383304,
"step": 6325
},
{
"epoch": 5.672043010752688,
"grad_norm": 0.14453125,
"learning_rate": 2.3507349340715322e-05,
"loss": 0.3423,
"num_input_tokens_seen": 2385128,
"step": 6330
},
{
"epoch": 5.676523297491039,
"grad_norm": 12.8125,
"learning_rate": 2.3468323054962213e-05,
"loss": 0.0732,
"num_input_tokens_seen": 2386952,
"step": 6335
},
{
"epoch": 5.681003584229391,
"grad_norm": 24.75,
"learning_rate": 2.3429300515430437e-05,
"loss": 0.1547,
"num_input_tokens_seen": 2388872,
"step": 6340
},
{
"epoch": 5.685483870967742,
"grad_norm": 1.671875,
"learning_rate": 2.3390281817562496e-05,
"loss": 0.3452,
"num_input_tokens_seen": 2390888,
"step": 6345
},
{
"epoch": 5.689964157706093,
"grad_norm": 7.375,
"learning_rate": 2.335126705679149e-05,
"loss": 0.0136,
"num_input_tokens_seen": 2392712,
"step": 6350
},
{
"epoch": 5.694444444444445,
"grad_norm": 41.0,
"learning_rate": 2.331225632854087e-05,
"loss": 0.1487,
"num_input_tokens_seen": 2394728,
"step": 6355
},
{
"epoch": 5.698924731182796,
"grad_norm": 215.0,
"learning_rate": 2.327324972822426e-05,
"loss": 0.5102,
"num_input_tokens_seen": 2396648,
"step": 6360
},
{
"epoch": 5.703405017921147,
"grad_norm": 74.0,
"learning_rate": 2.3234247351245177e-05,
"loss": 0.254,
"num_input_tokens_seen": 2398632,
"step": 6365
},
{
"epoch": 5.707885304659499,
"grad_norm": 70.0,
"learning_rate": 2.3195249292996786e-05,
"loss": 0.108,
"num_input_tokens_seen": 2400616,
"step": 6370
},
{
"epoch": 5.71236559139785,
"grad_norm": 2.40625,
"learning_rate": 2.3156255648861723e-05,
"loss": 0.0128,
"num_input_tokens_seen": 2402472,
"step": 6375
},
{
"epoch": 5.7168458781362,
"grad_norm": 41.25,
"learning_rate": 2.3117266514211788e-05,
"loss": 0.5534,
"num_input_tokens_seen": 2404392,
"step": 6380
},
{
"epoch": 5.721326164874552,
"grad_norm": 0.09326171875,
"learning_rate": 2.3078281984407787e-05,
"loss": 0.0044,
"num_input_tokens_seen": 2406312,
"step": 6385
},
{
"epoch": 5.725806451612903,
"grad_norm": 1.921875,
"learning_rate": 2.3039302154799256e-05,
"loss": 0.0634,
"num_input_tokens_seen": 2408168,
"step": 6390
},
{
"epoch": 5.730286738351254,
"grad_norm": 32.0,
"learning_rate": 2.300032712072422e-05,
"loss": 0.8311,
"num_input_tokens_seen": 2409992,
"step": 6395
},
{
"epoch": 5.734767025089606,
"grad_norm": 0.0242919921875,
"learning_rate": 2.2961356977508984e-05,
"loss": 0.0305,
"num_input_tokens_seen": 2411944,
"step": 6400
},
{
"epoch": 5.739247311827957,
"grad_norm": 100.0,
"learning_rate": 2.2922391820467905e-05,
"loss": 0.0975,
"num_input_tokens_seen": 2413928,
"step": 6405
},
{
"epoch": 5.743727598566308,
"grad_norm": 0.240234375,
"learning_rate": 2.2883431744903115e-05,
"loss": 0.2387,
"num_input_tokens_seen": 2415848,
"step": 6410
},
{
"epoch": 5.74820788530466,
"grad_norm": 1.7734375,
"learning_rate": 2.284447684610434e-05,
"loss": 0.2636,
"num_input_tokens_seen": 2417704,
"step": 6415
},
{
"epoch": 5.752688172043011,
"grad_norm": 110.5,
"learning_rate": 2.2805527219348632e-05,
"loss": 0.923,
"num_input_tokens_seen": 2419656,
"step": 6420
},
{
"epoch": 5.757168458781362,
"grad_norm": 2.671875,
"learning_rate": 2.276658295990016e-05,
"loss": 0.1966,
"num_input_tokens_seen": 2421512,
"step": 6425
},
{
"epoch": 5.761648745519714,
"grad_norm": 101.0,
"learning_rate": 2.272764416300997e-05,
"loss": 0.0598,
"num_input_tokens_seen": 2423272,
"step": 6430
},
{
"epoch": 5.766129032258064,
"grad_norm": 0.546875,
"learning_rate": 2.2688710923915718e-05,
"loss": 0.0565,
"num_input_tokens_seen": 2425288,
"step": 6435
},
{
"epoch": 5.770609318996415,
"grad_norm": 34.75,
"learning_rate": 2.264978333784149e-05,
"loss": 0.1536,
"num_input_tokens_seen": 2427112,
"step": 6440
},
{
"epoch": 5.775089605734767,
"grad_norm": 0.037353515625,
"learning_rate": 2.261086149999755e-05,
"loss": 0.0738,
"num_input_tokens_seen": 2428968,
"step": 6445
},
{
"epoch": 5.779569892473118,
"grad_norm": 55.25,
"learning_rate": 2.257194550558009e-05,
"loss": 0.5424,
"num_input_tokens_seen": 2431048,
"step": 6450
},
{
"epoch": 5.784050179211469,
"grad_norm": 0.0185546875,
"learning_rate": 2.253303544977101e-05,
"loss": 0.0002,
"num_input_tokens_seen": 2432904,
"step": 6455
},
{
"epoch": 5.788530465949821,
"grad_norm": 76.0,
"learning_rate": 2.249413142773771e-05,
"loss": 0.1553,
"num_input_tokens_seen": 2434856,
"step": 6460
},
{
"epoch": 5.793010752688172,
"grad_norm": 1.9921875,
"learning_rate": 2.245523353463278e-05,
"loss": 0.0023,
"num_input_tokens_seen": 2436680,
"step": 6465
},
{
"epoch": 5.797491039426523,
"grad_norm": 0.07177734375,
"learning_rate": 2.2416341865593875e-05,
"loss": 0.2671,
"num_input_tokens_seen": 2438344,
"step": 6470
},
{
"epoch": 5.801971326164875,
"grad_norm": 4.96875,
"learning_rate": 2.2377456515743396e-05,
"loss": 0.4818,
"num_input_tokens_seen": 2440168,
"step": 6475
},
{
"epoch": 5.806451612903226,
"grad_norm": 0.0113525390625,
"learning_rate": 2.2338577580188296e-05,
"loss": 0.0399,
"num_input_tokens_seen": 2442056,
"step": 6480
},
{
"epoch": 5.810931899641577,
"grad_norm": 88.0,
"learning_rate": 2.2299705154019846e-05,
"loss": 0.3148,
"num_input_tokens_seen": 2443976,
"step": 6485
},
{
"epoch": 5.815412186379929,
"grad_norm": 73.0,
"learning_rate": 2.2260839332313375e-05,
"loss": 0.3558,
"num_input_tokens_seen": 2445832,
"step": 6490
},
{
"epoch": 5.81989247311828,
"grad_norm": 78.5,
"learning_rate": 2.222198021012809e-05,
"loss": 0.2281,
"num_input_tokens_seen": 2447720,
"step": 6495
},
{
"epoch": 5.824372759856631,
"grad_norm": 34.25,
"learning_rate": 2.218312788250678e-05,
"loss": 0.3957,
"num_input_tokens_seen": 2449704,
"step": 6500
},
{
"epoch": 5.828853046594982,
"grad_norm": 0.119140625,
"learning_rate": 2.2144282444475638e-05,
"loss": 0.6334,
"num_input_tokens_seen": 2451592,
"step": 6505
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.9765625,
"learning_rate": 2.2105443991044006e-05,
"loss": 0.0037,
"num_input_tokens_seen": 2453640,
"step": 6510
},
{
"epoch": 5.837813620071684,
"grad_norm": 0.048095703125,
"learning_rate": 2.206661261720414e-05,
"loss": 0.47,
"num_input_tokens_seen": 2455496,
"step": 6515
},
{
"epoch": 5.842293906810036,
"grad_norm": 0.185546875,
"learning_rate": 2.2027788417930962e-05,
"loss": 0.1956,
"num_input_tokens_seen": 2457320,
"step": 6520
},
{
"epoch": 5.846774193548387,
"grad_norm": 0.58984375,
"learning_rate": 2.1988971488181862e-05,
"loss": 0.0028,
"num_input_tokens_seen": 2459400,
"step": 6525
},
{
"epoch": 5.851254480286738,
"grad_norm": 40.5,
"learning_rate": 2.1950161922896452e-05,
"loss": 0.0882,
"num_input_tokens_seen": 2461352,
"step": 6530
},
{
"epoch": 5.85573476702509,
"grad_norm": 0.046630859375,
"learning_rate": 2.1911359816996342e-05,
"loss": 0.0033,
"num_input_tokens_seen": 2463112,
"step": 6535
},
{
"epoch": 5.860215053763441,
"grad_norm": 0.05419921875,
"learning_rate": 2.1872565265384867e-05,
"loss": 0.1153,
"num_input_tokens_seen": 2465096,
"step": 6540
},
{
"epoch": 5.864695340501792,
"grad_norm": 0.40234375,
"learning_rate": 2.1833778362946914e-05,
"loss": 0.2862,
"num_input_tokens_seen": 2467016,
"step": 6545
},
{
"epoch": 5.869175627240144,
"grad_norm": 113.5,
"learning_rate": 2.179499920454864e-05,
"loss": 0.3319,
"num_input_tokens_seen": 2468680,
"step": 6550
},
{
"epoch": 5.873655913978495,
"grad_norm": 0.86328125,
"learning_rate": 2.1756227885037277e-05,
"loss": 0.0074,
"num_input_tokens_seen": 2470632,
"step": 6555
},
{
"epoch": 5.878136200716845,
"grad_norm": 67.5,
"learning_rate": 2.1717464499240882e-05,
"loss": 0.1412,
"num_input_tokens_seen": 2472616,
"step": 6560
},
{
"epoch": 5.882616487455197,
"grad_norm": 5.3125,
"learning_rate": 2.16787091419681e-05,
"loss": 0.1842,
"num_input_tokens_seen": 2474536,
"step": 6565
},
{
"epoch": 5.887096774193548,
"grad_norm": 0.1201171875,
"learning_rate": 2.1639961908007962e-05,
"loss": 0.5357,
"num_input_tokens_seen": 2476616,
"step": 6570
},
{
"epoch": 5.891577060931899,
"grad_norm": 104.0,
"learning_rate": 2.160122289212958e-05,
"loss": 0.13,
"num_input_tokens_seen": 2478504,
"step": 6575
},
{
"epoch": 5.896057347670251,
"grad_norm": 12.5625,
"learning_rate": 2.1562492189082023e-05,
"loss": 0.1211,
"num_input_tokens_seen": 2480296,
"step": 6580
},
{
"epoch": 5.900537634408602,
"grad_norm": 78.0,
"learning_rate": 2.1523769893593997e-05,
"loss": 0.3241,
"num_input_tokens_seen": 2482312,
"step": 6585
},
{
"epoch": 5.905017921146953,
"grad_norm": 0.02490234375,
"learning_rate": 2.1485056100373646e-05,
"loss": 0.0185,
"num_input_tokens_seen": 2483976,
"step": 6590
},
{
"epoch": 5.909498207885305,
"grad_norm": 14.9375,
"learning_rate": 2.1446350904108346e-05,
"loss": 0.1649,
"num_input_tokens_seen": 2485704,
"step": 6595
},
{
"epoch": 5.913978494623656,
"grad_norm": 166.0,
"learning_rate": 2.14076543994644e-05,
"loss": 0.5884,
"num_input_tokens_seen": 2487688,
"step": 6600
},
{
"epoch": 5.918458781362007,
"grad_norm": 0.1513671875,
"learning_rate": 2.1368966681086892e-05,
"loss": 0.2412,
"num_input_tokens_seen": 2489512,
"step": 6605
},
{
"epoch": 5.922939068100359,
"grad_norm": 282.0,
"learning_rate": 2.1330287843599393e-05,
"loss": 0.11,
"num_input_tokens_seen": 2491464,
"step": 6610
},
{
"epoch": 5.92741935483871,
"grad_norm": 37.5,
"learning_rate": 2.1291617981603766e-05,
"loss": 0.2387,
"num_input_tokens_seen": 2493416,
"step": 6615
},
{
"epoch": 5.931899641577061,
"grad_norm": 0.10693359375,
"learning_rate": 2.1252957189679927e-05,
"loss": 0.0173,
"num_input_tokens_seen": 2495272,
"step": 6620
},
{
"epoch": 5.936379928315413,
"grad_norm": 47.75,
"learning_rate": 2.1214305562385592e-05,
"loss": 0.2664,
"num_input_tokens_seen": 2497192,
"step": 6625
},
{
"epoch": 5.940860215053764,
"grad_norm": 0.10595703125,
"learning_rate": 2.1175663194256056e-05,
"loss": 0.2336,
"num_input_tokens_seen": 2499080,
"step": 6630
},
{
"epoch": 5.945340501792114,
"grad_norm": 69.5,
"learning_rate": 2.113703017980399e-05,
"loss": 0.4033,
"num_input_tokens_seen": 2500872,
"step": 6635
},
{
"epoch": 5.949820788530466,
"grad_norm": 1.1484375,
"learning_rate": 2.1098406613519178e-05,
"loss": 0.1082,
"num_input_tokens_seen": 2502760,
"step": 6640
},
{
"epoch": 5.954301075268817,
"grad_norm": 74.0,
"learning_rate": 2.10597925898683e-05,
"loss": 0.0665,
"num_input_tokens_seen": 2504680,
"step": 6645
},
{
"epoch": 5.958781362007168,
"grad_norm": 3.6875,
"learning_rate": 2.102118820329469e-05,
"loss": 0.1748,
"num_input_tokens_seen": 2506504,
"step": 6650
},
{
"epoch": 5.96326164874552,
"grad_norm": 152.0,
"learning_rate": 2.09825935482181e-05,
"loss": 0.2835,
"num_input_tokens_seen": 2508328,
"step": 6655
},
{
"epoch": 5.967741935483871,
"grad_norm": 62.75,
"learning_rate": 2.09440087190345e-05,
"loss": 0.1264,
"num_input_tokens_seen": 2510280,
"step": 6660
},
{
"epoch": 5.972222222222222,
"grad_norm": 0.01495361328125,
"learning_rate": 2.0905433810115828e-05,
"loss": 0.1247,
"num_input_tokens_seen": 2512264,
"step": 6665
},
{
"epoch": 5.976702508960574,
"grad_norm": 2.59375,
"learning_rate": 2.0866868915809733e-05,
"loss": 0.1406,
"num_input_tokens_seen": 2514216,
"step": 6670
},
{
"epoch": 5.981182795698925,
"grad_norm": 0.0155029296875,
"learning_rate": 2.0828314130439408e-05,
"loss": 0.4045,
"num_input_tokens_seen": 2516104,
"step": 6675
},
{
"epoch": 5.985663082437276,
"grad_norm": 64.5,
"learning_rate": 2.0789769548303303e-05,
"loss": 0.2105,
"num_input_tokens_seen": 2518120,
"step": 6680
},
{
"epoch": 5.990143369175628,
"grad_norm": 0.515625,
"learning_rate": 2.0751235263674893e-05,
"loss": 0.1598,
"num_input_tokens_seen": 2519880,
"step": 6685
},
{
"epoch": 5.994623655913978,
"grad_norm": 2.734375,
"learning_rate": 2.0712711370802495e-05,
"loss": 0.2459,
"num_input_tokens_seen": 2521800,
"step": 6690
},
{
"epoch": 5.999103942652329,
"grad_norm": 0.0211181640625,
"learning_rate": 2.0674197963908997e-05,
"loss": 0.1686,
"num_input_tokens_seen": 2523592,
"step": 6695
},
{
"epoch": 6.0,
"eval_loss": 0.27129343152046204,
"eval_runtime": 9.6531,
"eval_samples_per_second": 51.382,
"eval_steps_per_second": 12.846,
"num_input_tokens_seen": 2523672,
"step": 6696
},
{
"epoch": 6.003584229390681,
"grad_norm": 0.01434326171875,
"learning_rate": 2.0635695137191646e-05,
"loss": 0.1545,
"num_input_tokens_seen": 2525048,
"step": 6700
},
{
"epoch": 6.008064516129032,
"grad_norm": 113.5,
"learning_rate": 2.0597202984821815e-05,
"loss": 0.1623,
"num_input_tokens_seen": 2526776,
"step": 6705
},
{
"epoch": 6.012544802867383,
"grad_norm": 0.228515625,
"learning_rate": 2.0558721600944754e-05,
"loss": 0.1659,
"num_input_tokens_seen": 2528696,
"step": 6710
},
{
"epoch": 6.017025089605735,
"grad_norm": 5.0625,
"learning_rate": 2.0520251079679373e-05,
"loss": 0.0018,
"num_input_tokens_seen": 2530520,
"step": 6715
},
{
"epoch": 6.021505376344086,
"grad_norm": 0.0185546875,
"learning_rate": 2.048179151511804e-05,
"loss": 0.1527,
"num_input_tokens_seen": 2532344,
"step": 6720
},
{
"epoch": 6.025985663082437,
"grad_norm": 11.3125,
"learning_rate": 2.0443343001326303e-05,
"loss": 0.0181,
"num_input_tokens_seen": 2534264,
"step": 6725
},
{
"epoch": 6.030465949820789,
"grad_norm": 40.0,
"learning_rate": 2.04049056323427e-05,
"loss": 0.4009,
"num_input_tokens_seen": 2535992,
"step": 6730
},
{
"epoch": 6.03494623655914,
"grad_norm": 3.0,
"learning_rate": 2.0366479502178497e-05,
"loss": 0.2622,
"num_input_tokens_seen": 2537944,
"step": 6735
},
{
"epoch": 6.039426523297491,
"grad_norm": 0.0830078125,
"learning_rate": 2.0328064704817458e-05,
"loss": 0.0025,
"num_input_tokens_seen": 2539864,
"step": 6740
},
{
"epoch": 6.043906810035843,
"grad_norm": 0.32421875,
"learning_rate": 2.028966133421565e-05,
"loss": 0.0211,
"num_input_tokens_seen": 2541784,
"step": 6745
},
{
"epoch": 6.048387096774194,
"grad_norm": 12.4375,
"learning_rate": 2.0251269484301193e-05,
"loss": 0.1111,
"num_input_tokens_seen": 2543640,
"step": 6750
},
{
"epoch": 6.052867383512544,
"grad_norm": 40.5,
"learning_rate": 2.021288924897402e-05,
"loss": 0.1402,
"num_input_tokens_seen": 2545656,
"step": 6755
},
{
"epoch": 6.057347670250896,
"grad_norm": 103.0,
"learning_rate": 2.0174520722105673e-05,
"loss": 0.2089,
"num_input_tokens_seen": 2547448,
"step": 6760
},
{
"epoch": 6.061827956989247,
"grad_norm": 50.0,
"learning_rate": 2.0136163997539017e-05,
"loss": 0.484,
"num_input_tokens_seen": 2549272,
"step": 6765
},
{
"epoch": 6.066308243727598,
"grad_norm": 78.5,
"learning_rate": 2.0097819169088096e-05,
"loss": 0.1831,
"num_input_tokens_seen": 2551032,
"step": 6770
},
{
"epoch": 6.07078853046595,
"grad_norm": 30.5,
"learning_rate": 2.0059486330537835e-05,
"loss": 0.201,
"num_input_tokens_seen": 2552824,
"step": 6775
},
{
"epoch": 6.075268817204301,
"grad_norm": 0.047607421875,
"learning_rate": 2.0021165575643837e-05,
"loss": 0.1521,
"num_input_tokens_seen": 2554520,
"step": 6780
},
{
"epoch": 6.079749103942652,
"grad_norm": 0.2216796875,
"learning_rate": 1.998285699813215e-05,
"loss": 0.0063,
"num_input_tokens_seen": 2556376,
"step": 6785
},
{
"epoch": 6.084229390681004,
"grad_norm": 142.0,
"learning_rate": 1.9944560691699057e-05,
"loss": 0.1079,
"num_input_tokens_seen": 2558200,
"step": 6790
},
{
"epoch": 6.088709677419355,
"grad_norm": 40.5,
"learning_rate": 1.9906276750010792e-05,
"loss": 0.181,
"num_input_tokens_seen": 2560312,
"step": 6795
},
{
"epoch": 6.093189964157706,
"grad_norm": 101.0,
"learning_rate": 1.9868005266703364e-05,
"loss": 0.4135,
"num_input_tokens_seen": 2562328,
"step": 6800
},
{
"epoch": 6.097670250896058,
"grad_norm": 72.5,
"learning_rate": 1.982974633538232e-05,
"loss": 0.061,
"num_input_tokens_seen": 2564248,
"step": 6805
},
{
"epoch": 6.102150537634409,
"grad_norm": 0.012939453125,
"learning_rate": 1.9791500049622505e-05,
"loss": 0.0073,
"num_input_tokens_seen": 2566296,
"step": 6810
},
{
"epoch": 6.10663082437276,
"grad_norm": 0.039794921875,
"learning_rate": 1.975326650296782e-05,
"loss": 0.0021,
"num_input_tokens_seen": 2568376,
"step": 6815
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.0147705078125,
"learning_rate": 1.9715045788931037e-05,
"loss": 0.0003,
"num_input_tokens_seen": 2570328,
"step": 6820
},
{
"epoch": 6.115591397849462,
"grad_norm": 69.0,
"learning_rate": 1.967683800099349e-05,
"loss": 0.0665,
"num_input_tokens_seen": 2572184,
"step": 6825
},
{
"epoch": 6.120071684587813,
"grad_norm": 0.81640625,
"learning_rate": 1.9638643232604957e-05,
"loss": 0.055,
"num_input_tokens_seen": 2573944,
"step": 6830
},
{
"epoch": 6.124551971326165,
"grad_norm": 0.408203125,
"learning_rate": 1.9600461577183344e-05,
"loss": 0.0024,
"num_input_tokens_seen": 2575864,
"step": 6835
},
{
"epoch": 6.129032258064516,
"grad_norm": 1.7421875,
"learning_rate": 1.9562293128114473e-05,
"loss": 0.0295,
"num_input_tokens_seen": 2577656,
"step": 6840
},
{
"epoch": 6.133512544802867,
"grad_norm": 50.5,
"learning_rate": 1.95241379787519e-05,
"loss": 0.3736,
"num_input_tokens_seen": 2579416,
"step": 6845
},
{
"epoch": 6.137992831541219,
"grad_norm": 51.0,
"learning_rate": 1.9485996222416607e-05,
"loss": 0.1525,
"num_input_tokens_seen": 2581208,
"step": 6850
},
{
"epoch": 6.14247311827957,
"grad_norm": 11.875,
"learning_rate": 1.944786795239686e-05,
"loss": 0.461,
"num_input_tokens_seen": 2583096,
"step": 6855
},
{
"epoch": 6.146953405017921,
"grad_norm": 0.427734375,
"learning_rate": 1.9409753261947927e-05,
"loss": 0.2243,
"num_input_tokens_seen": 2585112,
"step": 6860
},
{
"epoch": 6.151433691756273,
"grad_norm": 98.0,
"learning_rate": 1.9371652244291842e-05,
"loss": 0.2973,
"num_input_tokens_seen": 2586968,
"step": 6865
},
{
"epoch": 6.155913978494624,
"grad_norm": 129.0,
"learning_rate": 1.9333564992617232e-05,
"loss": 0.1993,
"num_input_tokens_seen": 2588760,
"step": 6870
},
{
"epoch": 6.160394265232975,
"grad_norm": 0.060546875,
"learning_rate": 1.9295491600079035e-05,
"loss": 0.0732,
"num_input_tokens_seen": 2590680,
"step": 6875
},
{
"epoch": 6.164874551971327,
"grad_norm": 82.0,
"learning_rate": 1.925743215979829e-05,
"loss": 0.2354,
"num_input_tokens_seen": 2592824,
"step": 6880
},
{
"epoch": 6.169354838709677,
"grad_norm": 0.052978515625,
"learning_rate": 1.9219386764861908e-05,
"loss": 0.0506,
"num_input_tokens_seen": 2594648,
"step": 6885
},
{
"epoch": 6.173835125448028,
"grad_norm": 56.75,
"learning_rate": 1.9181355508322462e-05,
"loss": 0.4853,
"num_input_tokens_seen": 2596536,
"step": 6890
},
{
"epoch": 6.17831541218638,
"grad_norm": 77.5,
"learning_rate": 1.914333848319795e-05,
"loss": 0.4347,
"num_input_tokens_seen": 2598424,
"step": 6895
},
{
"epoch": 6.182795698924731,
"grad_norm": 0.181640625,
"learning_rate": 1.9105335782471534e-05,
"loss": 0.0203,
"num_input_tokens_seen": 2600216,
"step": 6900
},
{
"epoch": 6.187275985663082,
"grad_norm": 50.0,
"learning_rate": 1.9067347499091364e-05,
"loss": 0.5187,
"num_input_tokens_seen": 2601944,
"step": 6905
},
{
"epoch": 6.191756272401434,
"grad_norm": 0.029052734375,
"learning_rate": 1.9029373725970313e-05,
"loss": 0.1452,
"num_input_tokens_seen": 2603896,
"step": 6910
},
{
"epoch": 6.196236559139785,
"grad_norm": 2.15625,
"learning_rate": 1.8991414555985783e-05,
"loss": 0.1083,
"num_input_tokens_seen": 2605880,
"step": 6915
},
{
"epoch": 6.200716845878136,
"grad_norm": 102.0,
"learning_rate": 1.895347008197945e-05,
"loss": 0.0908,
"num_input_tokens_seen": 2607672,
"step": 6920
},
{
"epoch": 6.205197132616488,
"grad_norm": 1.40625,
"learning_rate": 1.891554039675703e-05,
"loss": 0.4013,
"num_input_tokens_seen": 2609496,
"step": 6925
},
{
"epoch": 6.209677419354839,
"grad_norm": 108.0,
"learning_rate": 1.8877625593088104e-05,
"loss": 0.3397,
"num_input_tokens_seen": 2611320,
"step": 6930
},
{
"epoch": 6.21415770609319,
"grad_norm": 37.0,
"learning_rate": 1.8839725763705814e-05,
"loss": 0.362,
"num_input_tokens_seen": 2613304,
"step": 6935
},
{
"epoch": 6.218637992831542,
"grad_norm": 80.0,
"learning_rate": 1.880184100130671e-05,
"loss": 0.1593,
"num_input_tokens_seen": 2615128,
"step": 6940
},
{
"epoch": 6.223118279569892,
"grad_norm": 87.0,
"learning_rate": 1.876397139855047e-05,
"loss": 0.2587,
"num_input_tokens_seen": 2617016,
"step": 6945
},
{
"epoch": 6.227598566308243,
"grad_norm": 85.5,
"learning_rate": 1.8726117048059704e-05,
"loss": 0.4973,
"num_input_tokens_seen": 2618840,
"step": 6950
},
{
"epoch": 6.232078853046595,
"grad_norm": 7.46875,
"learning_rate": 1.8688278042419734e-05,
"loss": 0.2206,
"num_input_tokens_seen": 2620664,
"step": 6955
},
{
"epoch": 6.236559139784946,
"grad_norm": 0.0576171875,
"learning_rate": 1.8650454474178298e-05,
"loss": 0.1435,
"num_input_tokens_seen": 2622360,
"step": 6960
},
{
"epoch": 6.241039426523297,
"grad_norm": 0.53125,
"learning_rate": 1.8612646435845443e-05,
"loss": 0.0034,
"num_input_tokens_seen": 2624120,
"step": 6965
},
{
"epoch": 6.245519713261649,
"grad_norm": 5.25,
"learning_rate": 1.857485401989318e-05,
"loss": 0.1865,
"num_input_tokens_seen": 2625976,
"step": 6970
},
{
"epoch": 6.25,
"grad_norm": 0.859375,
"learning_rate": 1.853707731875534e-05,
"loss": 0.4451,
"num_input_tokens_seen": 2627896,
"step": 6975
},
{
"epoch": 6.254480286738351,
"grad_norm": 0.77734375,
"learning_rate": 1.849931642482732e-05,
"loss": 0.0016,
"num_input_tokens_seen": 2629752,
"step": 6980
},
{
"epoch": 6.258960573476703,
"grad_norm": 85.0,
"learning_rate": 1.8461571430465834e-05,
"loss": 0.3968,
"num_input_tokens_seen": 2631608,
"step": 6985
},
{
"epoch": 6.263440860215054,
"grad_norm": 95.5,
"learning_rate": 1.8423842427988722e-05,
"loss": 0.5997,
"num_input_tokens_seen": 2633528,
"step": 6990
},
{
"epoch": 6.267921146953405,
"grad_norm": 57.25,
"learning_rate": 1.83861295096747e-05,
"loss": 0.1262,
"num_input_tokens_seen": 2635544,
"step": 6995
},
{
"epoch": 6.272401433691757,
"grad_norm": 2.3125,
"learning_rate": 1.8348432767763162e-05,
"loss": 0.2623,
"num_input_tokens_seen": 2637496,
"step": 7000
},
{
"epoch": 6.276881720430108,
"grad_norm": 7.03125,
"learning_rate": 1.8310752294453924e-05,
"loss": 0.0074,
"num_input_tokens_seen": 2639320,
"step": 7005
},
{
"epoch": 6.281362007168458,
"grad_norm": 11.4375,
"learning_rate": 1.8273088181907034e-05,
"loss": 0.4166,
"num_input_tokens_seen": 2641176,
"step": 7010
},
{
"epoch": 6.28584229390681,
"grad_norm": 2.8125,
"learning_rate": 1.823544052224247e-05,
"loss": 0.037,
"num_input_tokens_seen": 2642936,
"step": 7015
},
{
"epoch": 6.290322580645161,
"grad_norm": 2.71875,
"learning_rate": 1.8197809407540028e-05,
"loss": 0.0041,
"num_input_tokens_seen": 2644696,
"step": 7020
},
{
"epoch": 6.294802867383512,
"grad_norm": 0.029296875,
"learning_rate": 1.816019492983902e-05,
"loss": 0.0115,
"num_input_tokens_seen": 2646520,
"step": 7025
},
{
"epoch": 6.299283154121864,
"grad_norm": 0.302734375,
"learning_rate": 1.812259718113805e-05,
"loss": 0.2259,
"num_input_tokens_seen": 2648312,
"step": 7030
},
{
"epoch": 6.303763440860215,
"grad_norm": 51.5,
"learning_rate": 1.8085016253394817e-05,
"loss": 0.0161,
"num_input_tokens_seen": 2650200,
"step": 7035
},
{
"epoch": 6.308243727598566,
"grad_norm": 0.01019287109375,
"learning_rate": 1.8047452238525896e-05,
"loss": 0.2444,
"num_input_tokens_seen": 2651992,
"step": 7040
},
{
"epoch": 6.312724014336918,
"grad_norm": 4.71875,
"learning_rate": 1.8009905228406458e-05,
"loss": 0.092,
"num_input_tokens_seen": 2653848,
"step": 7045
},
{
"epoch": 6.317204301075269,
"grad_norm": 83.0,
"learning_rate": 1.797237531487012e-05,
"loss": 0.342,
"num_input_tokens_seen": 2655672,
"step": 7050
},
{
"epoch": 6.32168458781362,
"grad_norm": 129.0,
"learning_rate": 1.7934862589708657e-05,
"loss": 0.1229,
"num_input_tokens_seen": 2657432,
"step": 7055
},
{
"epoch": 6.326164874551972,
"grad_norm": 27.25,
"learning_rate": 1.789736714467182e-05,
"loss": 0.3312,
"num_input_tokens_seen": 2659256,
"step": 7060
},
{
"epoch": 6.330645161290323,
"grad_norm": 54.75,
"learning_rate": 1.7859889071467102e-05,
"loss": 0.1276,
"num_input_tokens_seen": 2661144,
"step": 7065
},
{
"epoch": 6.335125448028673,
"grad_norm": 44.5,
"learning_rate": 1.7822428461759483e-05,
"loss": 0.1877,
"num_input_tokens_seen": 2662904,
"step": 7070
},
{
"epoch": 6.339605734767025,
"grad_norm": 50.5,
"learning_rate": 1.778498540717124e-05,
"loss": 0.3016,
"num_input_tokens_seen": 2664728,
"step": 7075
},
{
"epoch": 6.344086021505376,
"grad_norm": 0.01141357421875,
"learning_rate": 1.7747559999281723e-05,
"loss": 0.0197,
"num_input_tokens_seen": 2666616,
"step": 7080
},
{
"epoch": 6.348566308243727,
"grad_norm": 0.1240234375,
"learning_rate": 1.771015232962712e-05,
"loss": 0.3446,
"num_input_tokens_seen": 2668376,
"step": 7085
},
{
"epoch": 6.353046594982079,
"grad_norm": 42.0,
"learning_rate": 1.7672762489700227e-05,
"loss": 0.3859,
"num_input_tokens_seen": 2670168,
"step": 7090
},
{
"epoch": 6.35752688172043,
"grad_norm": 0.77734375,
"learning_rate": 1.7635390570950246e-05,
"loss": 0.0013,
"num_input_tokens_seen": 2672120,
"step": 7095
},
{
"epoch": 6.362007168458781,
"grad_norm": 69.5,
"learning_rate": 1.7598036664782508e-05,
"loss": 0.1313,
"num_input_tokens_seen": 2674232,
"step": 7100
},
{
"epoch": 6.366487455197133,
"grad_norm": 0.1484375,
"learning_rate": 1.7560700862558325e-05,
"loss": 0.0013,
"num_input_tokens_seen": 2676120,
"step": 7105
},
{
"epoch": 6.370967741935484,
"grad_norm": 0.0791015625,
"learning_rate": 1.7523383255594735e-05,
"loss": 0.0022,
"num_input_tokens_seen": 2678072,
"step": 7110
},
{
"epoch": 6.375448028673835,
"grad_norm": 0.0947265625,
"learning_rate": 1.7486083935164244e-05,
"loss": 0.0872,
"num_input_tokens_seen": 2679960,
"step": 7115
},
{
"epoch": 6.379928315412187,
"grad_norm": 0.10107421875,
"learning_rate": 1.7448802992494657e-05,
"loss": 0.0994,
"num_input_tokens_seen": 2681816,
"step": 7120
},
{
"epoch": 6.384408602150538,
"grad_norm": 65.0,
"learning_rate": 1.7411540518768805e-05,
"loss": 0.3304,
"num_input_tokens_seen": 2683768,
"step": 7125
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.01239013671875,
"learning_rate": 1.737429660512437e-05,
"loss": 0.0003,
"num_input_tokens_seen": 2685464,
"step": 7130
},
{
"epoch": 6.393369175627241,
"grad_norm": 47.0,
"learning_rate": 1.733707134265363e-05,
"loss": 0.2056,
"num_input_tokens_seen": 2687384,
"step": 7135
},
{
"epoch": 6.397849462365591,
"grad_norm": 0.1640625,
"learning_rate": 1.7299864822403257e-05,
"loss": 0.4393,
"num_input_tokens_seen": 2689176,
"step": 7140
},
{
"epoch": 6.402329749103942,
"grad_norm": 67.0,
"learning_rate": 1.7262677135374053e-05,
"loss": 0.4403,
"num_input_tokens_seen": 2691000,
"step": 7145
},
{
"epoch": 6.406810035842294,
"grad_norm": 65.5,
"learning_rate": 1.72255083725208e-05,
"loss": 0.3452,
"num_input_tokens_seen": 2692824,
"step": 7150
},
{
"epoch": 6.411290322580645,
"grad_norm": 35.75,
"learning_rate": 1.7188358624751954e-05,
"loss": 0.1687,
"num_input_tokens_seen": 2694648,
"step": 7155
},
{
"epoch": 6.415770609318996,
"grad_norm": 0.09375,
"learning_rate": 1.7151227982929477e-05,
"loss": 0.1701,
"num_input_tokens_seen": 2696760,
"step": 7160
},
{
"epoch": 6.420250896057348,
"grad_norm": 39.75,
"learning_rate": 1.711411653786861e-05,
"loss": 0.3491,
"num_input_tokens_seen": 2698680,
"step": 7165
},
{
"epoch": 6.424731182795699,
"grad_norm": 3.578125,
"learning_rate": 1.7077024380337646e-05,
"loss": 0.3573,
"num_input_tokens_seen": 2700568,
"step": 7170
},
{
"epoch": 6.42921146953405,
"grad_norm": 0.0245361328125,
"learning_rate": 1.7039951601057692e-05,
"loss": 0.0496,
"num_input_tokens_seen": 2702360,
"step": 7175
},
{
"epoch": 6.433691756272402,
"grad_norm": 103.0,
"learning_rate": 1.7002898290702454e-05,
"loss": 0.3859,
"num_input_tokens_seen": 2704376,
"step": 7180
},
{
"epoch": 6.438172043010753,
"grad_norm": 11.0625,
"learning_rate": 1.6965864539898026e-05,
"loss": 0.4469,
"num_input_tokens_seen": 2706200,
"step": 7185
},
{
"epoch": 6.442652329749104,
"grad_norm": 104.0,
"learning_rate": 1.6928850439222666e-05,
"loss": 0.0832,
"num_input_tokens_seen": 2708088,
"step": 7190
},
{
"epoch": 6.447132616487455,
"grad_norm": 0.0263671875,
"learning_rate": 1.689185607920658e-05,
"loss": 0.263,
"num_input_tokens_seen": 2709912,
"step": 7195
},
{
"epoch": 6.451612903225806,
"grad_norm": 0.69921875,
"learning_rate": 1.685488155033167e-05,
"loss": 0.1324,
"num_input_tokens_seen": 2711672,
"step": 7200
},
{
"epoch": 6.456093189964157,
"grad_norm": 0.263671875,
"learning_rate": 1.681792694303136e-05,
"loss": 0.0709,
"num_input_tokens_seen": 2713528,
"step": 7205
},
{
"epoch": 6.460573476702509,
"grad_norm": 2.15625,
"learning_rate": 1.6780992347690313e-05,
"loss": 0.2784,
"num_input_tokens_seen": 2715416,
"step": 7210
},
{
"epoch": 6.46505376344086,
"grad_norm": 120.5,
"learning_rate": 1.6744077854644282e-05,
"loss": 0.2818,
"num_input_tokens_seen": 2717464,
"step": 7215
},
{
"epoch": 6.469534050179211,
"grad_norm": 0.474609375,
"learning_rate": 1.6707183554179846e-05,
"loss": 0.0052,
"num_input_tokens_seen": 2719352,
"step": 7220
},
{
"epoch": 6.474014336917563,
"grad_norm": 196.0,
"learning_rate": 1.6670309536534172e-05,
"loss": 0.7244,
"num_input_tokens_seen": 2721368,
"step": 7225
},
{
"epoch": 6.478494623655914,
"grad_norm": 100.0,
"learning_rate": 1.6633455891894858e-05,
"loss": 0.8984,
"num_input_tokens_seen": 2723320,
"step": 7230
},
{
"epoch": 6.482974910394265,
"grad_norm": 0.490234375,
"learning_rate": 1.659662271039963e-05,
"loss": 0.0817,
"num_input_tokens_seen": 2725240,
"step": 7235
},
{
"epoch": 6.487455197132617,
"grad_norm": 0.25,
"learning_rate": 1.65598100821362e-05,
"loss": 0.1743,
"num_input_tokens_seen": 2727288,
"step": 7240
},
{
"epoch": 6.491935483870968,
"grad_norm": 0.0537109375,
"learning_rate": 1.652301809714199e-05,
"loss": 0.6101,
"num_input_tokens_seen": 2729080,
"step": 7245
},
{
"epoch": 6.496415770609319,
"grad_norm": 6.09375,
"learning_rate": 1.648624684540394e-05,
"loss": 0.3082,
"num_input_tokens_seen": 2730904,
"step": 7250
},
{
"epoch": 6.5,
"eval_loss": 0.2827821373939514,
"eval_runtime": 9.6651,
"eval_samples_per_second": 51.319,
"eval_steps_per_second": 12.83,
"num_input_tokens_seen": 2732440,
"step": 7254
},
{
"epoch": 6.500896057347671,
"grad_norm": 2.5625,
"learning_rate": 1.6449496416858284e-05,
"loss": 0.0785,
"num_input_tokens_seen": 2732792,
"step": 7255
},
{
"epoch": 6.505376344086022,
"grad_norm": 0.031982421875,
"learning_rate": 1.6412766901390314e-05,
"loss": 0.0834,
"num_input_tokens_seen": 2734616,
"step": 7260
},
{
"epoch": 6.509856630824372,
"grad_norm": 80.0,
"learning_rate": 1.6376058388834183e-05,
"loss": 0.1102,
"num_input_tokens_seen": 2736472,
"step": 7265
},
{
"epoch": 6.514336917562724,
"grad_norm": 58.5,
"learning_rate": 1.633937096897266e-05,
"loss": 0.2998,
"num_input_tokens_seen": 2738360,
"step": 7270
},
{
"epoch": 6.518817204301075,
"grad_norm": 0.466796875,
"learning_rate": 1.630270473153695e-05,
"loss": 0.2932,
"num_input_tokens_seen": 2740408,
"step": 7275
},
{
"epoch": 6.523297491039426,
"grad_norm": 1.3046875,
"learning_rate": 1.6266059766206425e-05,
"loss": 0.0011,
"num_input_tokens_seen": 2742168,
"step": 7280
},
{
"epoch": 6.527777777777778,
"grad_norm": 89.0,
"learning_rate": 1.6229436162608448e-05,
"loss": 0.6019,
"num_input_tokens_seen": 2743928,
"step": 7285
},
{
"epoch": 6.532258064516129,
"grad_norm": 4.40625,
"learning_rate": 1.619283401031811e-05,
"loss": 0.094,
"num_input_tokens_seen": 2745944,
"step": 7290
},
{
"epoch": 6.53673835125448,
"grad_norm": 84.5,
"learning_rate": 1.6156253398858058e-05,
"loss": 0.411,
"num_input_tokens_seen": 2747960,
"step": 7295
},
{
"epoch": 6.541218637992832,
"grad_norm": 16.5,
"learning_rate": 1.6119694417698246e-05,
"loss": 0.0081,
"num_input_tokens_seen": 2749848,
"step": 7300
},
{
"epoch": 6.545698924731183,
"grad_norm": 8.75,
"learning_rate": 1.6083157156255733e-05,
"loss": 0.0831,
"num_input_tokens_seen": 2751704,
"step": 7305
},
{
"epoch": 6.550179211469534,
"grad_norm": 0.0240478515625,
"learning_rate": 1.6046641703894434e-05,
"loss": 0.11,
"num_input_tokens_seen": 2753528,
"step": 7310
},
{
"epoch": 6.554659498207886,
"grad_norm": 4.59375,
"learning_rate": 1.6010148149924956e-05,
"loss": 0.2642,
"num_input_tokens_seen": 2755320,
"step": 7315
},
{
"epoch": 6.559139784946236,
"grad_norm": 0.0311279296875,
"learning_rate": 1.5973676583604298e-05,
"loss": 0.0031,
"num_input_tokens_seen": 2757368,
"step": 7320
},
{
"epoch": 6.563620071684587,
"grad_norm": 115.5,
"learning_rate": 1.5937227094135733e-05,
"loss": 0.3553,
"num_input_tokens_seen": 2759224,
"step": 7325
},
{
"epoch": 6.568100358422939,
"grad_norm": 0.3125,
"learning_rate": 1.5900799770668495e-05,
"loss": 0.185,
"num_input_tokens_seen": 2761112,
"step": 7330
},
{
"epoch": 6.57258064516129,
"grad_norm": 0.62109375,
"learning_rate": 1.5864394702297636e-05,
"loss": 0.133,
"num_input_tokens_seen": 2763096,
"step": 7335
},
{
"epoch": 6.577060931899641,
"grad_norm": 17.0,
"learning_rate": 1.5828011978063765e-05,
"loss": 0.2268,
"num_input_tokens_seen": 2764888,
"step": 7340
},
{
"epoch": 6.581541218637993,
"grad_norm": 0.62890625,
"learning_rate": 1.5791651686952823e-05,
"loss": 0.0927,
"num_input_tokens_seen": 2766776,
"step": 7345
},
{
"epoch": 6.586021505376344,
"grad_norm": 67.5,
"learning_rate": 1.575531391789591e-05,
"loss": 0.2766,
"num_input_tokens_seen": 2768760,
"step": 7350
},
{
"epoch": 6.590501792114695,
"grad_norm": 58.25,
"learning_rate": 1.5718998759769025e-05,
"loss": 0.2579,
"num_input_tokens_seen": 2770584,
"step": 7355
},
{
"epoch": 6.594982078853047,
"grad_norm": 0.4609375,
"learning_rate": 1.5682706301392867e-05,
"loss": 0.3561,
"num_input_tokens_seen": 2772408,
"step": 7360
},
{
"epoch": 6.599462365591398,
"grad_norm": 0.515625,
"learning_rate": 1.564643663153263e-05,
"loss": 0.0008,
"num_input_tokens_seen": 2774328,
"step": 7365
},
{
"epoch": 6.603942652329749,
"grad_norm": 72.5,
"learning_rate": 1.561018983889775e-05,
"loss": 0.0683,
"num_input_tokens_seen": 2776120,
"step": 7370
},
{
"epoch": 6.608422939068101,
"grad_norm": 0.4453125,
"learning_rate": 1.557396601214171e-05,
"loss": 0.1187,
"num_input_tokens_seen": 2778040,
"step": 7375
},
{
"epoch": 6.612903225806452,
"grad_norm": 78.5,
"learning_rate": 1.5537765239861838e-05,
"loss": 0.027,
"num_input_tokens_seen": 2779928,
"step": 7380
},
{
"epoch": 6.617383512544803,
"grad_norm": 61.25,
"learning_rate": 1.550158761059907e-05,
"loss": 0.2125,
"num_input_tokens_seen": 2781656,
"step": 7385
},
{
"epoch": 6.621863799283155,
"grad_norm": 0.34765625,
"learning_rate": 1.5465433212837726e-05,
"loss": 0.1611,
"num_input_tokens_seen": 2783544,
"step": 7390
},
{
"epoch": 6.626344086021505,
"grad_norm": 102.5,
"learning_rate": 1.542930213500533e-05,
"loss": 0.0498,
"num_input_tokens_seen": 2785304,
"step": 7395
},
{
"epoch": 6.630824372759856,
"grad_norm": 1.625,
"learning_rate": 1.5393194465472337e-05,
"loss": 0.5324,
"num_input_tokens_seen": 2787256,
"step": 7400
},
{
"epoch": 6.635304659498208,
"grad_norm": 60.25,
"learning_rate": 1.535711029255197e-05,
"loss": 0.2301,
"num_input_tokens_seen": 2789144,
"step": 7405
},
{
"epoch": 6.639784946236559,
"grad_norm": 0.515625,
"learning_rate": 1.532104970449999e-05,
"loss": 0.0235,
"num_input_tokens_seen": 2791000,
"step": 7410
},
{
"epoch": 6.64426523297491,
"grad_norm": 102.5,
"learning_rate": 1.5285012789514446e-05,
"loss": 0.4687,
"num_input_tokens_seen": 2793016,
"step": 7415
},
{
"epoch": 6.648745519713262,
"grad_norm": 0.031494140625,
"learning_rate": 1.5248999635735516e-05,
"loss": 0.0938,
"num_input_tokens_seen": 2795000,
"step": 7420
},
{
"epoch": 6.653225806451613,
"grad_norm": 0.09375,
"learning_rate": 1.5213010331245259e-05,
"loss": 0.4469,
"num_input_tokens_seen": 2796984,
"step": 7425
},
{
"epoch": 6.657706093189964,
"grad_norm": 5.5,
"learning_rate": 1.5177044964067372e-05,
"loss": 0.0245,
"num_input_tokens_seen": 2798872,
"step": 7430
},
{
"epoch": 6.662186379928316,
"grad_norm": 46.0,
"learning_rate": 1.5141103622167041e-05,
"loss": 0.4422,
"num_input_tokens_seen": 2800888,
"step": 7435
},
{
"epoch": 6.666666666666667,
"grad_norm": 1.1875,
"learning_rate": 1.5105186393450665e-05,
"loss": 0.3887,
"num_input_tokens_seen": 2802776,
"step": 7440
},
{
"epoch": 6.671146953405018,
"grad_norm": 704.0,
"learning_rate": 1.5069293365765685e-05,
"loss": 0.1628,
"num_input_tokens_seen": 2804504,
"step": 7445
},
{
"epoch": 6.675627240143369,
"grad_norm": 69.0,
"learning_rate": 1.5033424626900353e-05,
"loss": 0.1589,
"num_input_tokens_seen": 2806424,
"step": 7450
},
{
"epoch": 6.68010752688172,
"grad_norm": 15.1875,
"learning_rate": 1.4997580264583488e-05,
"loss": 0.0102,
"num_input_tokens_seen": 2808312,
"step": 7455
},
{
"epoch": 6.684587813620071,
"grad_norm": 59.25,
"learning_rate": 1.4961760366484307e-05,
"loss": 0.7118,
"num_input_tokens_seen": 2810232,
"step": 7460
},
{
"epoch": 6.689068100358423,
"grad_norm": 5.15625,
"learning_rate": 1.492596502021219e-05,
"loss": 0.843,
"num_input_tokens_seen": 2812248,
"step": 7465
},
{
"epoch": 6.693548387096774,
"grad_norm": 13.875,
"learning_rate": 1.4890194313316478e-05,
"loss": 0.1223,
"num_input_tokens_seen": 2814168,
"step": 7470
},
{
"epoch": 6.698028673835125,
"grad_norm": 64.5,
"learning_rate": 1.4854448333286222e-05,
"loss": 0.4203,
"num_input_tokens_seen": 2816088,
"step": 7475
},
{
"epoch": 6.702508960573477,
"grad_norm": 54.25,
"learning_rate": 1.4818727167550025e-05,
"loss": 0.1728,
"num_input_tokens_seen": 2817944,
"step": 7480
},
{
"epoch": 6.706989247311828,
"grad_norm": 37.25,
"learning_rate": 1.478303090347577e-05,
"loss": 0.1402,
"num_input_tokens_seen": 2819928,
"step": 7485
},
{
"epoch": 6.711469534050179,
"grad_norm": 0.00567626953125,
"learning_rate": 1.474735962837045e-05,
"loss": 0.2419,
"num_input_tokens_seen": 2821880,
"step": 7490
},
{
"epoch": 6.715949820788531,
"grad_norm": 5.875,
"learning_rate": 1.4711713429479945e-05,
"loss": 0.0031,
"num_input_tokens_seen": 2823800,
"step": 7495
},
{
"epoch": 6.720430107526882,
"grad_norm": 1.2421875,
"learning_rate": 1.4676092393988791e-05,
"loss": 0.3066,
"num_input_tokens_seen": 2825656,
"step": 7500
},
{
"epoch": 6.724910394265233,
"grad_norm": 1.1171875,
"learning_rate": 1.4640496609019993e-05,
"loss": 0.0133,
"num_input_tokens_seen": 2827512,
"step": 7505
},
{
"epoch": 6.729390681003585,
"grad_norm": 0.0211181640625,
"learning_rate": 1.4604926161634768e-05,
"loss": 0.0007,
"num_input_tokens_seen": 2829336,
"step": 7510
},
{
"epoch": 6.733870967741936,
"grad_norm": 192.0,
"learning_rate": 1.45693811388324e-05,
"loss": 0.1292,
"num_input_tokens_seen": 2831224,
"step": 7515
},
{
"epoch": 6.738351254480286,
"grad_norm": 0.06298828125,
"learning_rate": 1.4533861627549953e-05,
"loss": 0.0033,
"num_input_tokens_seen": 2833176,
"step": 7520
},
{
"epoch": 6.742831541218638,
"grad_norm": 0.0135498046875,
"learning_rate": 1.4498367714662128e-05,
"loss": 0.406,
"num_input_tokens_seen": 2835096,
"step": 7525
},
{
"epoch": 6.747311827956989,
"grad_norm": 16.25,
"learning_rate": 1.4462899486980994e-05,
"loss": 0.2964,
"num_input_tokens_seen": 2836952,
"step": 7530
},
{
"epoch": 6.75179211469534,
"grad_norm": 3.46875,
"learning_rate": 1.4427457031255803e-05,
"loss": 0.3914,
"num_input_tokens_seen": 2838936,
"step": 7535
},
{
"epoch": 6.756272401433692,
"grad_norm": 25.5,
"learning_rate": 1.4392040434172773e-05,
"loss": 0.1999,
"num_input_tokens_seen": 2840888,
"step": 7540
},
{
"epoch": 6.760752688172043,
"grad_norm": 0.07666015625,
"learning_rate": 1.4356649782354872e-05,
"loss": 0.5523,
"num_input_tokens_seen": 2842776,
"step": 7545
},
{
"epoch": 6.765232974910394,
"grad_norm": 0.1865234375,
"learning_rate": 1.432128516236163e-05,
"loss": 0.0383,
"num_input_tokens_seen": 2844568,
"step": 7550
},
{
"epoch": 6.769713261648746,
"grad_norm": 49.5,
"learning_rate": 1.4285946660688888e-05,
"loss": 0.1105,
"num_input_tokens_seen": 2846328,
"step": 7555
},
{
"epoch": 6.774193548387097,
"grad_norm": 16.625,
"learning_rate": 1.4250634363768601e-05,
"loss": 0.2696,
"num_input_tokens_seen": 2848216,
"step": 7560
},
{
"epoch": 6.778673835125448,
"grad_norm": 74.5,
"learning_rate": 1.4215348357968669e-05,
"loss": 0.2144,
"num_input_tokens_seen": 2850104,
"step": 7565
},
{
"epoch": 6.7831541218638,
"grad_norm": 0.6796875,
"learning_rate": 1.4180088729592633e-05,
"loss": 0.005,
"num_input_tokens_seen": 2851960,
"step": 7570
},
{
"epoch": 6.78763440860215,
"grad_norm": 53.75,
"learning_rate": 1.4144855564879553e-05,
"loss": 0.4174,
"num_input_tokens_seen": 2853944,
"step": 7575
},
{
"epoch": 6.792114695340501,
"grad_norm": 73.5,
"learning_rate": 1.410964895000377e-05,
"loss": 0.2266,
"num_input_tokens_seen": 2855672,
"step": 7580
},
{
"epoch": 6.796594982078853,
"grad_norm": 94.0,
"learning_rate": 1.4074468971074673e-05,
"loss": 0.3265,
"num_input_tokens_seen": 2857496,
"step": 7585
},
{
"epoch": 6.801075268817204,
"grad_norm": 0.83203125,
"learning_rate": 1.4039315714136502e-05,
"loss": 0.1876,
"num_input_tokens_seen": 2859320,
"step": 7590
},
{
"epoch": 6.805555555555555,
"grad_norm": 89.5,
"learning_rate": 1.4004189265168149e-05,
"loss": 0.5259,
"num_input_tokens_seen": 2861272,
"step": 7595
},
{
"epoch": 6.810035842293907,
"grad_norm": 132.0,
"learning_rate": 1.3969089710082927e-05,
"loss": 0.3453,
"num_input_tokens_seen": 2863256,
"step": 7600
},
{
"epoch": 6.814516129032258,
"grad_norm": 0.79296875,
"learning_rate": 1.3934017134728397e-05,
"loss": 0.0031,
"num_input_tokens_seen": 2865048,
"step": 7605
},
{
"epoch": 6.818996415770609,
"grad_norm": 102.5,
"learning_rate": 1.3898971624886101e-05,
"loss": 0.4995,
"num_input_tokens_seen": 2866904,
"step": 7610
},
{
"epoch": 6.823476702508961,
"grad_norm": 0.04052734375,
"learning_rate": 1.386395326627139e-05,
"loss": 0.2642,
"num_input_tokens_seen": 2868920,
"step": 7615
},
{
"epoch": 6.827956989247312,
"grad_norm": 185.0,
"learning_rate": 1.3828962144533242e-05,
"loss": 0.0519,
"num_input_tokens_seen": 2870840,
"step": 7620
},
{
"epoch": 6.832437275985663,
"grad_norm": 0.0291748046875,
"learning_rate": 1.379399834525395e-05,
"loss": 0.0002,
"num_input_tokens_seen": 2872792,
"step": 7625
},
{
"epoch": 6.836917562724015,
"grad_norm": 63.75,
"learning_rate": 1.3759061953949054e-05,
"loss": 0.5697,
"num_input_tokens_seen": 2874552,
"step": 7630
},
{
"epoch": 6.841397849462366,
"grad_norm": 9.375,
"learning_rate": 1.3724153056067013e-05,
"loss": 0.028,
"num_input_tokens_seen": 2876312,
"step": 7635
},
{
"epoch": 6.845878136200717,
"grad_norm": 1.6484375,
"learning_rate": 1.3689271736989046e-05,
"loss": 0.4029,
"num_input_tokens_seen": 2878200,
"step": 7640
},
{
"epoch": 6.850358422939068,
"grad_norm": 1.5390625,
"learning_rate": 1.3654418082028956e-05,
"loss": 0.0386,
"num_input_tokens_seen": 2879992,
"step": 7645
},
{
"epoch": 6.854838709677419,
"grad_norm": 48.5,
"learning_rate": 1.3619592176432816e-05,
"loss": 0.1928,
"num_input_tokens_seen": 2881816,
"step": 7650
},
{
"epoch": 6.85931899641577,
"grad_norm": 86.0,
"learning_rate": 1.3584794105378904e-05,
"loss": 0.0587,
"num_input_tokens_seen": 2883832,
"step": 7655
},
{
"epoch": 6.863799283154122,
"grad_norm": 0.05322265625,
"learning_rate": 1.3550023953977367e-05,
"loss": 0.0789,
"num_input_tokens_seen": 2885848,
"step": 7660
},
{
"epoch": 6.868279569892473,
"grad_norm": 0.1982421875,
"learning_rate": 1.3515281807270075e-05,
"loss": 0.0988,
"num_input_tokens_seen": 2887864,
"step": 7665
},
{
"epoch": 6.872759856630824,
"grad_norm": 0.09912109375,
"learning_rate": 1.3480567750230433e-05,
"loss": 0.2827,
"num_input_tokens_seen": 2889816,
"step": 7670
},
{
"epoch": 6.877240143369176,
"grad_norm": 89.5,
"learning_rate": 1.344588186776311e-05,
"loss": 0.27,
"num_input_tokens_seen": 2891608,
"step": 7675
},
{
"epoch": 6.881720430107527,
"grad_norm": 14.5625,
"learning_rate": 1.3411224244703873e-05,
"loss": 0.3221,
"num_input_tokens_seen": 2893528,
"step": 7680
},
{
"epoch": 6.886200716845878,
"grad_norm": 156.0,
"learning_rate": 1.3376594965819378e-05,
"loss": 0.331,
"num_input_tokens_seen": 2895576,
"step": 7685
},
{
"epoch": 6.89068100358423,
"grad_norm": 0.0267333984375,
"learning_rate": 1.3341994115806943e-05,
"loss": 0.2155,
"num_input_tokens_seen": 2897592,
"step": 7690
},
{
"epoch": 6.895161290322581,
"grad_norm": 24.25,
"learning_rate": 1.3307421779294377e-05,
"loss": 0.0054,
"num_input_tokens_seen": 2899384,
"step": 7695
},
{
"epoch": 6.899641577060932,
"grad_norm": 0.056640625,
"learning_rate": 1.3272878040839742e-05,
"loss": 0.0034,
"num_input_tokens_seen": 2901240,
"step": 7700
},
{
"epoch": 6.904121863799283,
"grad_norm": 0.12890625,
"learning_rate": 1.3238362984931113e-05,
"loss": 0.2677,
"num_input_tokens_seen": 2903224,
"step": 7705
},
{
"epoch": 6.908602150537634,
"grad_norm": 84.5,
"learning_rate": 1.3203876695986478e-05,
"loss": 0.3084,
"num_input_tokens_seen": 2905112,
"step": 7710
},
{
"epoch": 6.913082437275985,
"grad_norm": 50.5,
"learning_rate": 1.3169419258353433e-05,
"loss": 0.2589,
"num_input_tokens_seen": 2907192,
"step": 7715
},
{
"epoch": 6.917562724014337,
"grad_norm": 0.083984375,
"learning_rate": 1.313499075630899e-05,
"loss": 0.1176,
"num_input_tokens_seen": 2909016,
"step": 7720
},
{
"epoch": 6.922043010752688,
"grad_norm": 0.5859375,
"learning_rate": 1.3100591274059431e-05,
"loss": 0.0086,
"num_input_tokens_seen": 2910968,
"step": 7725
},
{
"epoch": 6.926523297491039,
"grad_norm": 12.25,
"learning_rate": 1.3066220895740039e-05,
"loss": 0.2284,
"num_input_tokens_seen": 2913080,
"step": 7730
},
{
"epoch": 6.931003584229391,
"grad_norm": 0.01068115234375,
"learning_rate": 1.3031879705414907e-05,
"loss": 0.1095,
"num_input_tokens_seen": 2914968,
"step": 7735
},
{
"epoch": 6.935483870967742,
"grad_norm": 0.4921875,
"learning_rate": 1.2997567787076747e-05,
"loss": 0.4782,
"num_input_tokens_seen": 2916824,
"step": 7740
},
{
"epoch": 6.939964157706093,
"grad_norm": 0.011474609375,
"learning_rate": 1.296328522464667e-05,
"loss": 0.0118,
"num_input_tokens_seen": 2919032,
"step": 7745
},
{
"epoch": 6.944444444444445,
"grad_norm": 68.0,
"learning_rate": 1.2929032101974009e-05,
"loss": 0.175,
"num_input_tokens_seen": 2920920,
"step": 7750
},
{
"epoch": 6.948924731182796,
"grad_norm": 0.1123046875,
"learning_rate": 1.289480850283607e-05,
"loss": 0.0374,
"num_input_tokens_seen": 2923032,
"step": 7755
},
{
"epoch": 6.953405017921147,
"grad_norm": 0.109375,
"learning_rate": 1.2860614510937955e-05,
"loss": 0.1278,
"num_input_tokens_seen": 2924856,
"step": 7760
},
{
"epoch": 6.957885304659499,
"grad_norm": 142.0,
"learning_rate": 1.2826450209912355e-05,
"loss": 0.1233,
"num_input_tokens_seen": 2926680,
"step": 7765
},
{
"epoch": 6.96236559139785,
"grad_norm": 0.1142578125,
"learning_rate": 1.2792315683319328e-05,
"loss": 0.2875,
"num_input_tokens_seen": 2928568,
"step": 7770
},
{
"epoch": 6.9668458781362,
"grad_norm": 127.0,
"learning_rate": 1.2758211014646143e-05,
"loss": 0.3072,
"num_input_tokens_seen": 2930424,
"step": 7775
},
{
"epoch": 6.971326164874552,
"grad_norm": 119.5,
"learning_rate": 1.2724136287307009e-05,
"loss": 0.3411,
"num_input_tokens_seen": 2932376,
"step": 7780
},
{
"epoch": 6.975806451612903,
"grad_norm": 0.12353515625,
"learning_rate": 1.2690091584642916e-05,
"loss": 0.0032,
"num_input_tokens_seen": 2934072,
"step": 7785
},
{
"epoch": 6.980286738351254,
"grad_norm": 0.0184326171875,
"learning_rate": 1.2656076989921417e-05,
"loss": 0.6442,
"num_input_tokens_seen": 2935896,
"step": 7790
},
{
"epoch": 6.984767025089606,
"grad_norm": 0.36328125,
"learning_rate": 1.2622092586336415e-05,
"loss": 0.1848,
"num_input_tokens_seen": 2937720,
"step": 7795
},
{
"epoch": 6.989247311827957,
"grad_norm": 2.28125,
"learning_rate": 1.2588138457008e-05,
"loss": 0.1248,
"num_input_tokens_seen": 2939480,
"step": 7800
},
{
"epoch": 6.993727598566308,
"grad_norm": 16.375,
"learning_rate": 1.2554214684982191e-05,
"loss": 0.1974,
"num_input_tokens_seen": 2941304,
"step": 7805
},
{
"epoch": 6.99820788530466,
"grad_norm": 6.9375,
"learning_rate": 1.2520321353230769e-05,
"loss": 0.002,
"num_input_tokens_seen": 2943256,
"step": 7810
},
{
"epoch": 7.0,
"eval_loss": 0.28480419516563416,
"eval_runtime": 9.6469,
"eval_samples_per_second": 51.416,
"eval_steps_per_second": 12.854,
"num_input_tokens_seen": 2943688,
"step": 7812
},
{
"epoch": 7.002688172043011,
"grad_norm": 0.03857421875,
"learning_rate": 1.248645854465105e-05,
"loss": 0.1313,
"num_input_tokens_seen": 2944680,
"step": 7815
},
{
"epoch": 7.007168458781362,
"grad_norm": 0.05224609375,
"learning_rate": 1.2452626342065702e-05,
"loss": 0.0639,
"num_input_tokens_seen": 2946696,
"step": 7820
},
{
"epoch": 7.011648745519714,
"grad_norm": 40.25,
"learning_rate": 1.2418824828222559e-05,
"loss": 0.1408,
"num_input_tokens_seen": 2948616,
"step": 7825
},
{
"epoch": 7.016129032258065,
"grad_norm": 0.1953125,
"learning_rate": 1.2385054085794361e-05,
"loss": 0.0507,
"num_input_tokens_seen": 2950472,
"step": 7830
},
{
"epoch": 7.020609318996415,
"grad_norm": 24.375,
"learning_rate": 1.2351314197378597e-05,
"loss": 0.0348,
"num_input_tokens_seen": 2952392,
"step": 7835
},
{
"epoch": 7.025089605734767,
"grad_norm": 0.058349609375,
"learning_rate": 1.2317605245497323e-05,
"loss": 0.1611,
"num_input_tokens_seen": 2954248,
"step": 7840
},
{
"epoch": 7.029569892473118,
"grad_norm": 156.0,
"learning_rate": 1.2283927312596874e-05,
"loss": 0.5455,
"num_input_tokens_seen": 2956072,
"step": 7845
},
{
"epoch": 7.034050179211469,
"grad_norm": 0.036376953125,
"learning_rate": 1.2250280481047746e-05,
"loss": 0.0454,
"num_input_tokens_seen": 2958024,
"step": 7850
},
{
"epoch": 7.038530465949821,
"grad_norm": 0.035888671875,
"learning_rate": 1.2216664833144386e-05,
"loss": 0.3266,
"num_input_tokens_seen": 2959816,
"step": 7855
},
{
"epoch": 7.043010752688172,
"grad_norm": 0.033447265625,
"learning_rate": 1.2183080451104937e-05,
"loss": 0.1749,
"num_input_tokens_seen": 2961672,
"step": 7860
},
{
"epoch": 7.047491039426523,
"grad_norm": 6.84375,
"learning_rate": 1.2149527417071107e-05,
"loss": 0.1901,
"num_input_tokens_seen": 2963688,
"step": 7865
},
{
"epoch": 7.051971326164875,
"grad_norm": 97.5,
"learning_rate": 1.2116005813107891e-05,
"loss": 0.4422,
"num_input_tokens_seen": 2965576,
"step": 7870
},
{
"epoch": 7.056451612903226,
"grad_norm": 0.1533203125,
"learning_rate": 1.2082515721203427e-05,
"loss": 0.3751,
"num_input_tokens_seen": 2967464,
"step": 7875
},
{
"epoch": 7.060931899641577,
"grad_norm": 0.076171875,
"learning_rate": 1.2049057223268807e-05,
"loss": 0.2435,
"num_input_tokens_seen": 2969416,
"step": 7880
},
{
"epoch": 7.065412186379929,
"grad_norm": 0.0625,
"learning_rate": 1.2015630401137812e-05,
"loss": 0.4678,
"num_input_tokens_seen": 2971304,
"step": 7885
},
{
"epoch": 7.06989247311828,
"grad_norm": 55.5,
"learning_rate": 1.198223533656676e-05,
"loss": 0.2309,
"num_input_tokens_seen": 2973096,
"step": 7890
},
{
"epoch": 7.07437275985663,
"grad_norm": 29.625,
"learning_rate": 1.1948872111234327e-05,
"loss": 0.1608,
"num_input_tokens_seen": 2975080,
"step": 7895
},
{
"epoch": 7.078853046594982,
"grad_norm": 0.32421875,
"learning_rate": 1.191554080674125e-05,
"loss": 0.1702,
"num_input_tokens_seen": 2977064,
"step": 7900
},
{
"epoch": 7.083333333333333,
"grad_norm": 59.0,
"learning_rate": 1.188224150461026e-05,
"loss": 0.5976,
"num_input_tokens_seen": 2979016,
"step": 7905
},
{
"epoch": 7.087813620071684,
"grad_norm": 0.095703125,
"learning_rate": 1.1848974286285774e-05,
"loss": 0.1999,
"num_input_tokens_seen": 2980904,
"step": 7910
},
{
"epoch": 7.092293906810036,
"grad_norm": 0.1220703125,
"learning_rate": 1.181573923313375e-05,
"loss": 0.2029,
"num_input_tokens_seen": 2982792,
"step": 7915
},
{
"epoch": 7.096774193548387,
"grad_norm": 89.0,
"learning_rate": 1.1782536426441498e-05,
"loss": 0.6841,
"num_input_tokens_seen": 2984552,
"step": 7920
},
{
"epoch": 7.101254480286738,
"grad_norm": 66.0,
"learning_rate": 1.17493659474174e-05,
"loss": 0.0641,
"num_input_tokens_seen": 2986280,
"step": 7925
},
{
"epoch": 7.10573476702509,
"grad_norm": 2.265625,
"learning_rate": 1.1716227877190839e-05,
"loss": 0.1772,
"num_input_tokens_seen": 2988200,
"step": 7930
},
{
"epoch": 7.110215053763441,
"grad_norm": 121.5,
"learning_rate": 1.1683122296811883e-05,
"loss": 0.0731,
"num_input_tokens_seen": 2989928,
"step": 7935
},
{
"epoch": 7.114695340501792,
"grad_norm": 112.0,
"learning_rate": 1.1650049287251147e-05,
"loss": 0.1095,
"num_input_tokens_seen": 2991752,
"step": 7940
},
{
"epoch": 7.119175627240144,
"grad_norm": 91.5,
"learning_rate": 1.1617008929399606e-05,
"loss": 0.2032,
"num_input_tokens_seen": 2993640,
"step": 7945
},
{
"epoch": 7.123655913978495,
"grad_norm": 0.53125,
"learning_rate": 1.1584001304068349e-05,
"loss": 0.1439,
"num_input_tokens_seen": 2995528,
"step": 7950
},
{
"epoch": 7.128136200716846,
"grad_norm": 52.0,
"learning_rate": 1.155102649198841e-05,
"loss": 0.1374,
"num_input_tokens_seen": 2997512,
"step": 7955
},
{
"epoch": 7.132616487455197,
"grad_norm": 0.87109375,
"learning_rate": 1.1518084573810575e-05,
"loss": 0.3792,
"num_input_tokens_seen": 2999272,
"step": 7960
},
{
"epoch": 7.137096774193548,
"grad_norm": 0.1416015625,
"learning_rate": 1.1485175630105163e-05,
"loss": 0.0356,
"num_input_tokens_seen": 3001160,
"step": 7965
},
{
"epoch": 7.141577060931899,
"grad_norm": 59.25,
"learning_rate": 1.1452299741361875e-05,
"loss": 0.2431,
"num_input_tokens_seen": 3003048,
"step": 7970
},
{
"epoch": 7.146057347670251,
"grad_norm": 0.63671875,
"learning_rate": 1.141945698798954e-05,
"loss": 0.0517,
"num_input_tokens_seen": 3005224,
"step": 7975
},
{
"epoch": 7.150537634408602,
"grad_norm": 38.75,
"learning_rate": 1.1386647450315924e-05,
"loss": 0.2001,
"num_input_tokens_seen": 3007112,
"step": 7980
},
{
"epoch": 7.155017921146953,
"grad_norm": 0.014404296875,
"learning_rate": 1.1353871208587602e-05,
"loss": 0.0008,
"num_input_tokens_seen": 3009000,
"step": 7985
},
{
"epoch": 7.159498207885305,
"grad_norm": 0.3515625,
"learning_rate": 1.132112834296967e-05,
"loss": 0.275,
"num_input_tokens_seen": 3010920,
"step": 7990
},
{
"epoch": 7.163978494623656,
"grad_norm": 75.0,
"learning_rate": 1.1288418933545624e-05,
"loss": 0.3288,
"num_input_tokens_seen": 3012936,
"step": 7995
},
{
"epoch": 7.168458781362007,
"grad_norm": 81.0,
"learning_rate": 1.1255743060317115e-05,
"loss": 0.1543,
"num_input_tokens_seen": 3014600,
"step": 8000
},
{
"epoch": 7.172939068100359,
"grad_norm": 0.376953125,
"learning_rate": 1.1223100803203767e-05,
"loss": 0.0025,
"num_input_tokens_seen": 3016552,
"step": 8005
},
{
"epoch": 7.17741935483871,
"grad_norm": 14.9375,
"learning_rate": 1.1190492242042989e-05,
"loss": 0.1856,
"num_input_tokens_seen": 3018376,
"step": 8010
},
{
"epoch": 7.181899641577061,
"grad_norm": 0.361328125,
"learning_rate": 1.1157917456589778e-05,
"loss": 0.2386,
"num_input_tokens_seen": 3020296,
"step": 8015
},
{
"epoch": 7.186379928315413,
"grad_norm": 150.0,
"learning_rate": 1.1125376526516511e-05,
"loss": 0.3946,
"num_input_tokens_seen": 3022120,
"step": 8020
},
{
"epoch": 7.190860215053763,
"grad_norm": 0.0185546875,
"learning_rate": 1.109286953141279e-05,
"loss": 0.0689,
"num_input_tokens_seen": 3023816,
"step": 8025
},
{
"epoch": 7.195340501792114,
"grad_norm": 52.25,
"learning_rate": 1.1060396550785182e-05,
"loss": 0.1724,
"num_input_tokens_seen": 3025672,
"step": 8030
},
{
"epoch": 7.199820788530466,
"grad_norm": 21.375,
"learning_rate": 1.1027957664057079e-05,
"loss": 0.0433,
"num_input_tokens_seen": 3027496,
"step": 8035
},
{
"epoch": 7.204301075268817,
"grad_norm": 0.0283203125,
"learning_rate": 1.099555295056848e-05,
"loss": 0.1029,
"num_input_tokens_seen": 3029288,
"step": 8040
},
{
"epoch": 7.208781362007168,
"grad_norm": 105.0,
"learning_rate": 1.0963182489575797e-05,
"loss": 0.1413,
"num_input_tokens_seen": 3031080,
"step": 8045
},
{
"epoch": 7.21326164874552,
"grad_norm": 173.0,
"learning_rate": 1.0930846360251684e-05,
"loss": 0.6562,
"num_input_tokens_seen": 3033128,
"step": 8050
},
{
"epoch": 7.217741935483871,
"grad_norm": 76.0,
"learning_rate": 1.0898544641684816e-05,
"loss": 0.2897,
"num_input_tokens_seen": 3035144,
"step": 8055
},
{
"epoch": 7.222222222222222,
"grad_norm": 27.5,
"learning_rate": 1.0866277412879695e-05,
"loss": 0.1863,
"num_input_tokens_seen": 3037032,
"step": 8060
},
{
"epoch": 7.226702508960574,
"grad_norm": 0.0264892578125,
"learning_rate": 1.0834044752756478e-05,
"loss": 0.1746,
"num_input_tokens_seen": 3038952,
"step": 8065
},
{
"epoch": 7.231182795698925,
"grad_norm": 0.1357421875,
"learning_rate": 1.0801846740150759e-05,
"loss": 0.4075,
"num_input_tokens_seen": 3041000,
"step": 8070
},
{
"epoch": 7.235663082437276,
"grad_norm": 0.32421875,
"learning_rate": 1.0769683453813426e-05,
"loss": 0.1792,
"num_input_tokens_seen": 3042824,
"step": 8075
},
{
"epoch": 7.240143369175628,
"grad_norm": 0.076171875,
"learning_rate": 1.0737554972410391e-05,
"loss": 0.0926,
"num_input_tokens_seen": 3044648,
"step": 8080
},
{
"epoch": 7.244623655913978,
"grad_norm": 5.96875,
"learning_rate": 1.0705461374522463e-05,
"loss": 0.0041,
"num_input_tokens_seen": 3046664,
"step": 8085
},
{
"epoch": 7.249103942652329,
"grad_norm": 60.25,
"learning_rate": 1.0673402738645116e-05,
"loss": 0.1855,
"num_input_tokens_seen": 3048456,
"step": 8090
},
{
"epoch": 7.253584229390681,
"grad_norm": 44.75,
"learning_rate": 1.0641379143188321e-05,
"loss": 0.0518,
"num_input_tokens_seen": 3050408,
"step": 8095
},
{
"epoch": 7.258064516129032,
"grad_norm": 4.3125,
"learning_rate": 1.060939066647636e-05,
"loss": 0.6442,
"num_input_tokens_seen": 3052264,
"step": 8100
},
{
"epoch": 7.262544802867383,
"grad_norm": 0.1787109375,
"learning_rate": 1.0577437386747601e-05,
"loss": 0.1546,
"num_input_tokens_seen": 3054184,
"step": 8105
},
{
"epoch": 7.267025089605735,
"grad_norm": 0.04541015625,
"learning_rate": 1.054551938215432e-05,
"loss": 0.1486,
"num_input_tokens_seen": 3056008,
"step": 8110
},
{
"epoch": 7.271505376344086,
"grad_norm": 0.1064453125,
"learning_rate": 1.0513636730762558e-05,
"loss": 0.1805,
"num_input_tokens_seen": 3057992,
"step": 8115
},
{
"epoch": 7.275985663082437,
"grad_norm": 0.95703125,
"learning_rate": 1.0481789510551821e-05,
"loss": 0.4512,
"num_input_tokens_seen": 3059720,
"step": 8120
},
{
"epoch": 7.280465949820789,
"grad_norm": 0.5859375,
"learning_rate": 1.044997779941502e-05,
"loss": 0.1827,
"num_input_tokens_seen": 3061576,
"step": 8125
},
{
"epoch": 7.28494623655914,
"grad_norm": 126.5,
"learning_rate": 1.0418201675158182e-05,
"loss": 0.0975,
"num_input_tokens_seen": 3063368,
"step": 8130
},
{
"epoch": 7.289426523297491,
"grad_norm": 0.345703125,
"learning_rate": 1.0386461215500296e-05,
"loss": 0.4467,
"num_input_tokens_seen": 3065128,
"step": 8135
},
{
"epoch": 7.293906810035843,
"grad_norm": 1.3125,
"learning_rate": 1.0354756498073156e-05,
"loss": 0.5098,
"num_input_tokens_seen": 3067144,
"step": 8140
},
{
"epoch": 7.298387096774194,
"grad_norm": 0.0625,
"learning_rate": 1.032308760042108e-05,
"loss": 0.1959,
"num_input_tokens_seen": 3069064,
"step": 8145
},
{
"epoch": 7.302867383512545,
"grad_norm": 0.0225830078125,
"learning_rate": 1.0291454600000805e-05,
"loss": 0.2824,
"num_input_tokens_seen": 3071048,
"step": 8150
},
{
"epoch": 7.307347670250896,
"grad_norm": 137.0,
"learning_rate": 1.0259857574181292e-05,
"loss": 0.4067,
"num_input_tokens_seen": 3073032,
"step": 8155
},
{
"epoch": 7.311827956989247,
"grad_norm": 1.4609375,
"learning_rate": 1.0228296600243483e-05,
"loss": 0.2129,
"num_input_tokens_seen": 3074824,
"step": 8160
},
{
"epoch": 7.316308243727598,
"grad_norm": 2.953125,
"learning_rate": 1.0196771755380145e-05,
"loss": 0.2166,
"num_input_tokens_seen": 3076712,
"step": 8165
},
{
"epoch": 7.32078853046595,
"grad_norm": 0.04443359375,
"learning_rate": 1.016528311669571e-05,
"loss": 0.1671,
"num_input_tokens_seen": 3078536,
"step": 8170
},
{
"epoch": 7.325268817204301,
"grad_norm": 0.515625,
"learning_rate": 1.0133830761206e-05,
"loss": 0.005,
"num_input_tokens_seen": 3080424,
"step": 8175
},
{
"epoch": 7.329749103942652,
"grad_norm": 0.0673828125,
"learning_rate": 1.0102414765838156e-05,
"loss": 0.1817,
"num_input_tokens_seen": 3082472,
"step": 8180
},
{
"epoch": 7.334229390681004,
"grad_norm": 0.6875,
"learning_rate": 1.0071035207430352e-05,
"loss": 0.002,
"num_input_tokens_seen": 3084328,
"step": 8185
},
{
"epoch": 7.338709677419355,
"grad_norm": 35.75,
"learning_rate": 1.0039692162731637e-05,
"loss": 0.469,
"num_input_tokens_seen": 3086120,
"step": 8190
},
{
"epoch": 7.343189964157706,
"grad_norm": 52.25,
"learning_rate": 1.0008385708401802e-05,
"loss": 0.0616,
"num_input_tokens_seen": 3087976,
"step": 8195
},
{
"epoch": 7.347670250896058,
"grad_norm": 0.298828125,
"learning_rate": 9.977115921011071e-06,
"loss": 0.107,
"num_input_tokens_seen": 3089864,
"step": 8200
},
{
"epoch": 7.352150537634409,
"grad_norm": 0.53125,
"learning_rate": 9.945882877040053e-06,
"loss": 0.2218,
"num_input_tokens_seen": 3091688,
"step": 8205
},
{
"epoch": 7.356630824372759,
"grad_norm": 21.125,
"learning_rate": 9.914686652879454e-06,
"loss": 0.2801,
"num_input_tokens_seen": 3093480,
"step": 8210
},
{
"epoch": 7.361111111111111,
"grad_norm": 0.044677734375,
"learning_rate": 9.883527324829925e-06,
"loss": 0.005,
"num_input_tokens_seen": 3095368,
"step": 8215
},
{
"epoch": 7.365591397849462,
"grad_norm": 0.52734375,
"learning_rate": 9.8524049691019e-06,
"loss": 0.079,
"num_input_tokens_seen": 3097224,
"step": 8220
},
{
"epoch": 7.370071684587813,
"grad_norm": 0.68359375,
"learning_rate": 9.821319661815359e-06,
"loss": 0.1452,
"num_input_tokens_seen": 3099016,
"step": 8225
},
{
"epoch": 7.374551971326165,
"grad_norm": 0.0311279296875,
"learning_rate": 9.790271478999677e-06,
"loss": 0.0638,
"num_input_tokens_seen": 3100904,
"step": 8230
},
{
"epoch": 7.379032258064516,
"grad_norm": 56.25,
"learning_rate": 9.759260496593434e-06,
"loss": 0.3638,
"num_input_tokens_seen": 3102696,
"step": 8235
},
{
"epoch": 7.383512544802867,
"grad_norm": 0.01611328125,
"learning_rate": 9.728286790444206e-06,
"loss": 0.016,
"num_input_tokens_seen": 3104488,
"step": 8240
},
{
"epoch": 7.387992831541219,
"grad_norm": 0.1943359375,
"learning_rate": 9.697350436308427e-06,
"loss": 0.1855,
"num_input_tokens_seen": 3106440,
"step": 8245
},
{
"epoch": 7.39247311827957,
"grad_norm": 0.01068115234375,
"learning_rate": 9.666451509851158e-06,
"loss": 0.0024,
"num_input_tokens_seen": 3108264,
"step": 8250
},
{
"epoch": 7.396953405017921,
"grad_norm": 90.0,
"learning_rate": 9.635590086645906e-06,
"loss": 0.1581,
"num_input_tokens_seen": 3110120,
"step": 8255
},
{
"epoch": 7.401433691756273,
"grad_norm": 28.625,
"learning_rate": 9.604766242174474e-06,
"loss": 0.0898,
"num_input_tokens_seen": 3111912,
"step": 8260
},
{
"epoch": 7.405913978494624,
"grad_norm": 0.09423828125,
"learning_rate": 9.573980051826731e-06,
"loss": 0.329,
"num_input_tokens_seen": 3113832,
"step": 8265
},
{
"epoch": 7.410394265232975,
"grad_norm": 4.9375,
"learning_rate": 9.54323159090048e-06,
"loss": 0.3856,
"num_input_tokens_seen": 3115624,
"step": 8270
},
{
"epoch": 7.414874551971327,
"grad_norm": 0.0849609375,
"learning_rate": 9.512520934601225e-06,
"loss": 0.0015,
"num_input_tokens_seen": 3117544,
"step": 8275
},
{
"epoch": 7.419354838709677,
"grad_norm": 0.0169677734375,
"learning_rate": 9.481848158041998e-06,
"loss": 0.1981,
"num_input_tokens_seen": 3119464,
"step": 8280
},
{
"epoch": 7.423835125448028,
"grad_norm": 58.0,
"learning_rate": 9.4512133362432e-06,
"loss": 0.3538,
"num_input_tokens_seen": 3121224,
"step": 8285
},
{
"epoch": 7.42831541218638,
"grad_norm": 1.078125,
"learning_rate": 9.4206165441324e-06,
"loss": 0.1745,
"num_input_tokens_seen": 3123080,
"step": 8290
},
{
"epoch": 7.432795698924731,
"grad_norm": 0.67578125,
"learning_rate": 9.390057856544129e-06,
"loss": 0.2105,
"num_input_tokens_seen": 3125000,
"step": 8295
},
{
"epoch": 7.437275985663082,
"grad_norm": 3.21875,
"learning_rate": 9.359537348219768e-06,
"loss": 0.204,
"num_input_tokens_seen": 3127080,
"step": 8300
},
{
"epoch": 7.441756272401434,
"grad_norm": 0.06982421875,
"learning_rate": 9.329055093807268e-06,
"loss": 0.3111,
"num_input_tokens_seen": 3129032,
"step": 8305
},
{
"epoch": 7.446236559139785,
"grad_norm": 0.68359375,
"learning_rate": 9.298611167861062e-06,
"loss": 0.0769,
"num_input_tokens_seen": 3130792,
"step": 8310
},
{
"epoch": 7.450716845878136,
"grad_norm": 0.0245361328125,
"learning_rate": 9.2682056448418e-06,
"loss": 0.0043,
"num_input_tokens_seen": 3132776,
"step": 8315
},
{
"epoch": 7.455197132616488,
"grad_norm": 68.5,
"learning_rate": 9.237838599116208e-06,
"loss": 0.0598,
"num_input_tokens_seen": 3134728,
"step": 8320
},
{
"epoch": 7.459677419354839,
"grad_norm": 1.28125,
"learning_rate": 9.207510104956944e-06,
"loss": 0.0609,
"num_input_tokens_seen": 3136616,
"step": 8325
},
{
"epoch": 7.46415770609319,
"grad_norm": 9.8125,
"learning_rate": 9.17722023654233e-06,
"loss": 0.0726,
"num_input_tokens_seen": 3138632,
"step": 8330
},
{
"epoch": 7.468637992831542,
"grad_norm": 61.75,
"learning_rate": 9.146969067956238e-06,
"loss": 0.4713,
"num_input_tokens_seen": 3140456,
"step": 8335
},
{
"epoch": 7.473118279569892,
"grad_norm": 0.04736328125,
"learning_rate": 9.116756673187878e-06,
"loss": 0.0042,
"num_input_tokens_seen": 3142312,
"step": 8340
},
{
"epoch": 7.477598566308243,
"grad_norm": 1.8203125,
"learning_rate": 9.08658312613163e-06,
"loss": 0.339,
"num_input_tokens_seen": 3144360,
"step": 8345
},
{
"epoch": 7.482078853046595,
"grad_norm": 55.5,
"learning_rate": 9.056448500586865e-06,
"loss": 0.64,
"num_input_tokens_seen": 3146152,
"step": 8350
},
{
"epoch": 7.486559139784946,
"grad_norm": 65.0,
"learning_rate": 9.026352870257748e-06,
"loss": 0.348,
"num_input_tokens_seen": 3148040,
"step": 8355
},
{
"epoch": 7.491039426523297,
"grad_norm": 18.125,
"learning_rate": 8.996296308753069e-06,
"loss": 0.0227,
"num_input_tokens_seen": 3149864,
"step": 8360
},
{
"epoch": 7.495519713261649,
"grad_norm": 0.87890625,
"learning_rate": 8.966278889586086e-06,
"loss": 0.1357,
"num_input_tokens_seen": 3151720,
"step": 8365
},
{
"epoch": 7.5,
"grad_norm": 0.73828125,
"learning_rate": 8.936300686174268e-06,
"loss": 0.2526,
"num_input_tokens_seen": 3153640,
"step": 8370
},
{
"epoch": 7.5,
"eval_loss": 0.2856603264808655,
"eval_runtime": 9.6772,
"eval_samples_per_second": 51.254,
"eval_steps_per_second": 12.814,
"num_input_tokens_seen": 3153640,
"step": 8370
},
{
"epoch": 7.504480286738351,
"grad_norm": 6.5,
"learning_rate": 8.906361771839227e-06,
"loss": 0.2075,
"num_input_tokens_seen": 3155496,
"step": 8375
},
{
"epoch": 7.508960573476703,
"grad_norm": 0.33984375,
"learning_rate": 8.876462219806456e-06,
"loss": 0.0004,
"num_input_tokens_seen": 3157448,
"step": 8380
},
{
"epoch": 7.513440860215054,
"grad_norm": 15.875,
"learning_rate": 8.846602103205157e-06,
"loss": 0.0432,
"num_input_tokens_seen": 3159496,
"step": 8385
},
{
"epoch": 7.517921146953405,
"grad_norm": 108.0,
"learning_rate": 8.816781495068125e-06,
"loss": 0.148,
"num_input_tokens_seen": 3161320,
"step": 8390
},
{
"epoch": 7.522401433691757,
"grad_norm": 0.2158203125,
"learning_rate": 8.787000468331463e-06,
"loss": 0.147,
"num_input_tokens_seen": 3163144,
"step": 8395
},
{
"epoch": 7.526881720430108,
"grad_norm": 71.5,
"learning_rate": 8.757259095834525e-06,
"loss": 0.3437,
"num_input_tokens_seen": 3164904,
"step": 8400
},
{
"epoch": 7.531362007168459,
"grad_norm": 7.40625,
"learning_rate": 8.72755745031964e-06,
"loss": 0.6944,
"num_input_tokens_seen": 3166696,
"step": 8405
},
{
"epoch": 7.53584229390681,
"grad_norm": 1.59375,
"learning_rate": 8.697895604431974e-06,
"loss": 0.0129,
"num_input_tokens_seen": 3168456,
"step": 8410
},
{
"epoch": 7.540322580645161,
"grad_norm": 90.5,
"learning_rate": 8.668273630719373e-06,
"loss": 0.0198,
"num_input_tokens_seen": 3170344,
"step": 8415
},
{
"epoch": 7.544802867383512,
"grad_norm": 0.8359375,
"learning_rate": 8.638691601632152e-06,
"loss": 0.2642,
"num_input_tokens_seen": 3172232,
"step": 8420
},
{
"epoch": 7.549283154121864,
"grad_norm": 2.15625,
"learning_rate": 8.609149589522894e-06,
"loss": 0.012,
"num_input_tokens_seen": 3174056,
"step": 8425
},
{
"epoch": 7.553763440860215,
"grad_norm": 0.3046875,
"learning_rate": 8.579647666646361e-06,
"loss": 0.1948,
"num_input_tokens_seen": 3175944,
"step": 8430
},
{
"epoch": 7.558243727598566,
"grad_norm": 73.5,
"learning_rate": 8.550185905159227e-06,
"loss": 0.2778,
"num_input_tokens_seen": 3177896,
"step": 8435
},
{
"epoch": 7.562724014336918,
"grad_norm": 0.515625,
"learning_rate": 8.520764377119964e-06,
"loss": 0.21,
"num_input_tokens_seen": 3179912,
"step": 8440
},
{
"epoch": 7.567204301075269,
"grad_norm": 59.25,
"learning_rate": 8.491383154488628e-06,
"loss": 0.1084,
"num_input_tokens_seen": 3181736,
"step": 8445
},
{
"epoch": 7.57168458781362,
"grad_norm": 0.4765625,
"learning_rate": 8.462042309126664e-06,
"loss": 0.0713,
"num_input_tokens_seen": 3183592,
"step": 8450
},
{
"epoch": 7.576164874551972,
"grad_norm": 0.298828125,
"learning_rate": 8.432741912796821e-06,
"loss": 0.1302,
"num_input_tokens_seen": 3185448,
"step": 8455
},
{
"epoch": 7.580645161290323,
"grad_norm": 5.40625,
"learning_rate": 8.403482037162873e-06,
"loss": 0.2815,
"num_input_tokens_seen": 3187368,
"step": 8460
},
{
"epoch": 7.585125448028673,
"grad_norm": 1.625,
"learning_rate": 8.374262753789493e-06,
"loss": 0.0022,
"num_input_tokens_seen": 3189192,
"step": 8465
},
{
"epoch": 7.589605734767025,
"grad_norm": 0.0242919921875,
"learning_rate": 8.345084134142098e-06,
"loss": 0.2046,
"num_input_tokens_seen": 3191112,
"step": 8470
},
{
"epoch": 7.594086021505376,
"grad_norm": 43.75,
"learning_rate": 8.31594624958662e-06,
"loss": 0.22,
"num_input_tokens_seen": 3192808,
"step": 8475
},
{
"epoch": 7.598566308243727,
"grad_norm": 69.0,
"learning_rate": 8.286849171389366e-06,
"loss": 0.139,
"num_input_tokens_seen": 3194632,
"step": 8480
},
{
"epoch": 7.603046594982079,
"grad_norm": 0.026611328125,
"learning_rate": 8.257792970716846e-06,
"loss": 0.0019,
"num_input_tokens_seen": 3196488,
"step": 8485
},
{
"epoch": 7.60752688172043,
"grad_norm": 9.25,
"learning_rate": 8.228777718635575e-06,
"loss": 0.1405,
"num_input_tokens_seen": 3198408,
"step": 8490
},
{
"epoch": 7.612007168458781,
"grad_norm": 39.25,
"learning_rate": 8.19980348611194e-06,
"loss": 0.2583,
"num_input_tokens_seen": 3200264,
"step": 8495
},
{
"epoch": 7.616487455197133,
"grad_norm": 84.5,
"learning_rate": 8.170870344011982e-06,
"loss": 0.0361,
"num_input_tokens_seen": 3202120,
"step": 8500
},
{
"epoch": 7.620967741935484,
"grad_norm": 6.4375,
"learning_rate": 8.141978363101243e-06,
"loss": 0.3324,
"num_input_tokens_seen": 3203976,
"step": 8505
},
{
"epoch": 7.625448028673835,
"grad_norm": 104.5,
"learning_rate": 8.1131276140446e-06,
"loss": 0.0315,
"num_input_tokens_seen": 3205832,
"step": 8510
},
{
"epoch": 7.629928315412187,
"grad_norm": 0.07666015625,
"learning_rate": 8.084318167406063e-06,
"loss": 0.0966,
"num_input_tokens_seen": 3207816,
"step": 8515
},
{
"epoch": 7.634408602150538,
"grad_norm": 0.421875,
"learning_rate": 8.055550093648665e-06,
"loss": 0.0647,
"num_input_tokens_seen": 3209768,
"step": 8520
},
{
"epoch": 7.638888888888889,
"grad_norm": 58.25,
"learning_rate": 8.026823463134206e-06,
"loss": 0.228,
"num_input_tokens_seen": 3211464,
"step": 8525
},
{
"epoch": 7.643369175627241,
"grad_norm": 1.4375,
"learning_rate": 7.99813834612314e-06,
"loss": 0.2794,
"num_input_tokens_seen": 3213320,
"step": 8530
},
{
"epoch": 7.647849462365591,
"grad_norm": 0.0223388671875,
"learning_rate": 7.969494812774392e-06,
"loss": 0.2487,
"num_input_tokens_seen": 3215272,
"step": 8535
},
{
"epoch": 7.652329749103942,
"grad_norm": 35.5,
"learning_rate": 7.940892933145156e-06,
"loss": 0.0383,
"num_input_tokens_seen": 3217256,
"step": 8540
},
{
"epoch": 7.656810035842294,
"grad_norm": 77.5,
"learning_rate": 7.91233277719079e-06,
"loss": 0.2694,
"num_input_tokens_seen": 3219016,
"step": 8545
},
{
"epoch": 7.661290322580645,
"grad_norm": 0.03369140625,
"learning_rate": 7.883814414764566e-06,
"loss": 0.119,
"num_input_tokens_seen": 3220680,
"step": 8550
},
{
"epoch": 7.665770609318996,
"grad_norm": 131.0,
"learning_rate": 7.855337915617548e-06,
"loss": 0.4811,
"num_input_tokens_seen": 3222344,
"step": 8555
},
{
"epoch": 7.670250896057348,
"grad_norm": 0.37890625,
"learning_rate": 7.82690334939841e-06,
"loss": 0.002,
"num_input_tokens_seen": 3224168,
"step": 8560
},
{
"epoch": 7.674731182795699,
"grad_norm": 47.25,
"learning_rate": 7.798510785653263e-06,
"loss": 0.2016,
"num_input_tokens_seen": 3225992,
"step": 8565
},
{
"epoch": 7.67921146953405,
"grad_norm": 0.06591796875,
"learning_rate": 7.770160293825498e-06,
"loss": 0.4631,
"num_input_tokens_seen": 3227912,
"step": 8570
},
{
"epoch": 7.683691756272402,
"grad_norm": 52.25,
"learning_rate": 7.741851943255596e-06,
"loss": 0.0176,
"num_input_tokens_seen": 3229736,
"step": 8575
},
{
"epoch": 7.688172043010753,
"grad_norm": 37.5,
"learning_rate": 7.713585803180956e-06,
"loss": 0.1768,
"num_input_tokens_seen": 3231720,
"step": 8580
},
{
"epoch": 7.692652329749104,
"grad_norm": 0.042236328125,
"learning_rate": 7.685361942735777e-06,
"loss": 0.1667,
"num_input_tokens_seen": 3233640,
"step": 8585
},
{
"epoch": 7.697132616487455,
"grad_norm": 69.0,
"learning_rate": 7.657180430950794e-06,
"loss": 0.1408,
"num_input_tokens_seen": 3235400,
"step": 8590
},
{
"epoch": 7.701612903225806,
"grad_norm": 14.4375,
"learning_rate": 7.629041336753193e-06,
"loss": 0.0034,
"num_input_tokens_seen": 3237384,
"step": 8595
},
{
"epoch": 7.706093189964157,
"grad_norm": 56.25,
"learning_rate": 7.600944728966433e-06,
"loss": 0.5304,
"num_input_tokens_seen": 3239496,
"step": 8600
},
{
"epoch": 7.710573476702509,
"grad_norm": 50.25,
"learning_rate": 7.572890676310026e-06,
"loss": 0.2329,
"num_input_tokens_seen": 3241128,
"step": 8605
},
{
"epoch": 7.71505376344086,
"grad_norm": 0.7578125,
"learning_rate": 7.544879247399417e-06,
"loss": 0.2943,
"num_input_tokens_seen": 3242920,
"step": 8610
},
{
"epoch": 7.719534050179211,
"grad_norm": 31.75,
"learning_rate": 7.516910510745795e-06,
"loss": 0.2259,
"num_input_tokens_seen": 3244680,
"step": 8615
},
{
"epoch": 7.724014336917563,
"grad_norm": 106.5,
"learning_rate": 7.48898453475593e-06,
"loss": 0.3288,
"num_input_tokens_seen": 3246728,
"step": 8620
},
{
"epoch": 7.728494623655914,
"grad_norm": 62.0,
"learning_rate": 7.46110138773202e-06,
"loss": 0.3365,
"num_input_tokens_seen": 3248712,
"step": 8625
},
{
"epoch": 7.732974910394265,
"grad_norm": 12.8125,
"learning_rate": 7.433261137871497e-06,
"loss": 0.2714,
"num_input_tokens_seen": 3250568,
"step": 8630
},
{
"epoch": 7.737455197132617,
"grad_norm": 0.193359375,
"learning_rate": 7.405463853266869e-06,
"loss": 0.2253,
"num_input_tokens_seen": 3252328,
"step": 8635
},
{
"epoch": 7.741935483870968,
"grad_norm": 0.177734375,
"learning_rate": 7.377709601905594e-06,
"loss": 0.001,
"num_input_tokens_seen": 3254248,
"step": 8640
},
{
"epoch": 7.746415770609319,
"grad_norm": 0.59375,
"learning_rate": 7.349998451669812e-06,
"loss": 0.2439,
"num_input_tokens_seen": 3256040,
"step": 8645
},
{
"epoch": 7.750896057347671,
"grad_norm": 0.10009765625,
"learning_rate": 7.3223304703363135e-06,
"loss": 0.1708,
"num_input_tokens_seen": 3257960,
"step": 8650
},
{
"epoch": 7.755376344086022,
"grad_norm": 0.07177734375,
"learning_rate": 7.294705725576267e-06,
"loss": 0.1292,
"num_input_tokens_seen": 3259880,
"step": 8655
},
{
"epoch": 7.759856630824372,
"grad_norm": 0.1796875,
"learning_rate": 7.2671242849550905e-06,
"loss": 0.5855,
"num_input_tokens_seen": 3261960,
"step": 8660
},
{
"epoch": 7.764336917562724,
"grad_norm": 1.03125,
"learning_rate": 7.239586215932323e-06,
"loss": 0.4413,
"num_input_tokens_seen": 3263784,
"step": 8665
},
{
"epoch": 7.768817204301075,
"grad_norm": 5.1875,
"learning_rate": 7.212091585861363e-06,
"loss": 0.2717,
"num_input_tokens_seen": 3265640,
"step": 8670
},
{
"epoch": 7.773297491039426,
"grad_norm": 0.3828125,
"learning_rate": 7.184640461989431e-06,
"loss": 0.148,
"num_input_tokens_seen": 3267368,
"step": 8675
},
{
"epoch": 7.777777777777778,
"grad_norm": 102.5,
"learning_rate": 7.157232911457293e-06,
"loss": 0.0435,
"num_input_tokens_seen": 3269096,
"step": 8680
},
{
"epoch": 7.782258064516129,
"grad_norm": 95.5,
"learning_rate": 7.12986900129915e-06,
"loss": 0.236,
"num_input_tokens_seen": 3270920,
"step": 8685
},
{
"epoch": 7.78673835125448,
"grad_norm": 4.65625,
"learning_rate": 7.10254879844249e-06,
"loss": 0.0039,
"num_input_tokens_seen": 3272840,
"step": 8690
},
{
"epoch": 7.791218637992832,
"grad_norm": 86.5,
"learning_rate": 7.075272369707878e-06,
"loss": 0.1494,
"num_input_tokens_seen": 3274824,
"step": 8695
},
{
"epoch": 7.795698924731183,
"grad_norm": 68.0,
"learning_rate": 7.048039781808816e-06,
"loss": 0.2415,
"num_input_tokens_seen": 3276808,
"step": 8700
},
{
"epoch": 7.800179211469534,
"grad_norm": 0.138671875,
"learning_rate": 7.020851101351583e-06,
"loss": 0.2595,
"num_input_tokens_seen": 3279144,
"step": 8705
},
{
"epoch": 7.804659498207886,
"grad_norm": 8.0625,
"learning_rate": 6.993706394835062e-06,
"loss": 0.0371,
"num_input_tokens_seen": 3281128,
"step": 8710
},
{
"epoch": 7.809139784946236,
"grad_norm": 0.039306640625,
"learning_rate": 6.966605728650602e-06,
"loss": 0.1929,
"num_input_tokens_seen": 3282952,
"step": 8715
},
{
"epoch": 7.813620071684587,
"grad_norm": 0.36328125,
"learning_rate": 6.939549169081827e-06,
"loss": 0.2407,
"num_input_tokens_seen": 3284904,
"step": 8720
},
{
"epoch": 7.818100358422939,
"grad_norm": 152.0,
"learning_rate": 6.912536782304454e-06,
"loss": 0.0573,
"num_input_tokens_seen": 3286760,
"step": 8725
},
{
"epoch": 7.82258064516129,
"grad_norm": 0.09716796875,
"learning_rate": 6.885568634386217e-06,
"loss": 0.0254,
"num_input_tokens_seen": 3288584,
"step": 8730
},
{
"epoch": 7.827060931899641,
"grad_norm": 0.023681640625,
"learning_rate": 6.858644791286603e-06,
"loss": 0.0906,
"num_input_tokens_seen": 3290632,
"step": 8735
},
{
"epoch": 7.831541218637993,
"grad_norm": 30.75,
"learning_rate": 6.83176531885675e-06,
"loss": 0.2341,
"num_input_tokens_seen": 3292488,
"step": 8740
},
{
"epoch": 7.836021505376344,
"grad_norm": 46.0,
"learning_rate": 6.804930282839295e-06,
"loss": 0.6291,
"num_input_tokens_seen": 3294376,
"step": 8745
},
{
"epoch": 7.840501792114695,
"grad_norm": 2.28125,
"learning_rate": 6.778139748868159e-06,
"loss": 0.2229,
"num_input_tokens_seen": 3296360,
"step": 8750
},
{
"epoch": 7.844982078853047,
"grad_norm": 56.25,
"learning_rate": 6.751393782468438e-06,
"loss": 0.6797,
"num_input_tokens_seen": 3298152,
"step": 8755
},
{
"epoch": 7.849462365591398,
"grad_norm": 1.3125,
"learning_rate": 6.7246924490562135e-06,
"loss": 0.1333,
"num_input_tokens_seen": 3300104,
"step": 8760
},
{
"epoch": 7.853942652329749,
"grad_norm": 45.75,
"learning_rate": 6.6980358139384e-06,
"loss": 0.0838,
"num_input_tokens_seen": 3301928,
"step": 8765
},
{
"epoch": 7.858422939068101,
"grad_norm": 53.5,
"learning_rate": 6.671423942312608e-06,
"loss": 0.3952,
"num_input_tokens_seen": 3303816,
"step": 8770
},
{
"epoch": 7.862903225806452,
"grad_norm": 382.0,
"learning_rate": 6.6448568992669434e-06,
"loss": 0.157,
"num_input_tokens_seen": 3305704,
"step": 8775
},
{
"epoch": 7.867383512544803,
"grad_norm": 1.4921875,
"learning_rate": 6.6183347497798755e-06,
"loss": 0.0965,
"num_input_tokens_seen": 3307656,
"step": 8780
},
{
"epoch": 7.871863799283155,
"grad_norm": 0.06787109375,
"learning_rate": 6.591857558720071e-06,
"loss": 0.2862,
"num_input_tokens_seen": 3309608,
"step": 8785
},
{
"epoch": 7.876344086021505,
"grad_norm": 100.5,
"learning_rate": 6.565425390846233e-06,
"loss": 0.3027,
"num_input_tokens_seen": 3311368,
"step": 8790
},
{
"epoch": 7.880824372759856,
"grad_norm": 0.134765625,
"learning_rate": 6.539038310806958e-06,
"loss": 0.3519,
"num_input_tokens_seen": 3313352,
"step": 8795
},
{
"epoch": 7.885304659498208,
"grad_norm": 0.032470703125,
"learning_rate": 6.512696383140551e-06,
"loss": 0.0014,
"num_input_tokens_seen": 3315240,
"step": 8800
},
{
"epoch": 7.889784946236559,
"grad_norm": 0.1640625,
"learning_rate": 6.48639967227489e-06,
"loss": 0.001,
"num_input_tokens_seen": 3317032,
"step": 8805
},
{
"epoch": 7.89426523297491,
"grad_norm": 52.0,
"learning_rate": 6.460148242527253e-06,
"loss": 0.4413,
"num_input_tokens_seen": 3319048,
"step": 8810
},
{
"epoch": 7.898745519713262,
"grad_norm": 81.0,
"learning_rate": 6.4339421581041725e-06,
"loss": 0.0716,
"num_input_tokens_seen": 3320936,
"step": 8815
},
{
"epoch": 7.903225806451613,
"grad_norm": 0.24609375,
"learning_rate": 6.407781483101283e-06,
"loss": 0.2411,
"num_input_tokens_seen": 3322760,
"step": 8820
},
{
"epoch": 7.907706093189964,
"grad_norm": 31.875,
"learning_rate": 6.38166628150314e-06,
"loss": 0.0124,
"num_input_tokens_seen": 3324584,
"step": 8825
},
{
"epoch": 7.912186379928316,
"grad_norm": 112.5,
"learning_rate": 6.355596617183091e-06,
"loss": 0.4751,
"num_input_tokens_seen": 3326600,
"step": 8830
},
{
"epoch": 7.916666666666667,
"grad_norm": 49.75,
"learning_rate": 6.329572553903096e-06,
"loss": 0.1337,
"num_input_tokens_seen": 3328456,
"step": 8835
},
{
"epoch": 7.921146953405018,
"grad_norm": 36.5,
"learning_rate": 6.303594155313583e-06,
"loss": 0.2799,
"num_input_tokens_seen": 3330472,
"step": 8840
},
{
"epoch": 7.925627240143369,
"grad_norm": 0.11474609375,
"learning_rate": 6.277661484953309e-06,
"loss": 0.5744,
"num_input_tokens_seen": 3332488,
"step": 8845
},
{
"epoch": 7.93010752688172,
"grad_norm": 0.248046875,
"learning_rate": 6.251774606249172e-06,
"loss": 0.2474,
"num_input_tokens_seen": 3334376,
"step": 8850
},
{
"epoch": 7.934587813620071,
"grad_norm": 3.421875,
"learning_rate": 6.225933582516069e-06,
"loss": 0.0445,
"num_input_tokens_seen": 3336264,
"step": 8855
},
{
"epoch": 7.939068100358423,
"grad_norm": 0.2333984375,
"learning_rate": 6.200138476956766e-06,
"loss": 0.0154,
"num_input_tokens_seen": 3338024,
"step": 8860
},
{
"epoch": 7.943548387096774,
"grad_norm": 3.53125,
"learning_rate": 6.174389352661686e-06,
"loss": 0.2053,
"num_input_tokens_seen": 3340008,
"step": 8865
},
{
"epoch": 7.948028673835125,
"grad_norm": 146.0,
"learning_rate": 6.148686272608809e-06,
"loss": 0.1777,
"num_input_tokens_seen": 3341864,
"step": 8870
},
{
"epoch": 7.952508960573477,
"grad_norm": 131.0,
"learning_rate": 6.12302929966351e-06,
"loss": 0.442,
"num_input_tokens_seen": 3343752,
"step": 8875
},
{
"epoch": 7.956989247311828,
"grad_norm": 55.0,
"learning_rate": 6.097418496578369e-06,
"loss": 0.523,
"num_input_tokens_seen": 3345672,
"step": 8880
},
{
"epoch": 7.961469534050179,
"grad_norm": 0.10302734375,
"learning_rate": 6.0718539259930766e-06,
"loss": 0.2768,
"num_input_tokens_seen": 3347624,
"step": 8885
},
{
"epoch": 7.965949820788531,
"grad_norm": 0.2412109375,
"learning_rate": 6.046335650434201e-06,
"loss": 0.0416,
"num_input_tokens_seen": 3349480,
"step": 8890
},
{
"epoch": 7.970430107526882,
"grad_norm": 40.5,
"learning_rate": 6.020863732315108e-06,
"loss": 0.2458,
"num_input_tokens_seen": 3351400,
"step": 8895
},
{
"epoch": 7.974910394265233,
"grad_norm": 0.047607421875,
"learning_rate": 5.9954382339357905e-06,
"loss": 0.0047,
"num_input_tokens_seen": 3353352,
"step": 8900
},
{
"epoch": 7.979390681003585,
"grad_norm": 73.0,
"learning_rate": 5.970059217482685e-06,
"loss": 0.081,
"num_input_tokens_seen": 3355176,
"step": 8905
},
{
"epoch": 7.983870967741936,
"grad_norm": 5.78125,
"learning_rate": 5.944726745028545e-06,
"loss": 0.0114,
"num_input_tokens_seen": 3357224,
"step": 8910
},
{
"epoch": 7.988351254480286,
"grad_norm": 53.25,
"learning_rate": 5.919440878532312e-06,
"loss": 0.3855,
"num_input_tokens_seen": 3358984,
"step": 8915
},
{
"epoch": 7.992831541218638,
"grad_norm": 48.75,
"learning_rate": 5.894201679838885e-06,
"loss": 0.6179,
"num_input_tokens_seen": 3361032,
"step": 8920
},
{
"epoch": 7.997311827956989,
"grad_norm": 91.0,
"learning_rate": 5.869009210679074e-06,
"loss": 0.3551,
"num_input_tokens_seen": 3363048,
"step": 8925
},
{
"epoch": 8.0,
"eval_loss": 0.2849765419960022,
"eval_runtime": 9.6404,
"eval_samples_per_second": 51.45,
"eval_steps_per_second": 12.862,
"num_input_tokens_seen": 3363864,
"step": 8928
},
{
"epoch": 8.001792114695341,
"grad_norm": 0.35546875,
"learning_rate": 5.8438635326693664e-06,
"loss": 0.0233,
"num_input_tokens_seen": 3364600,
"step": 8930
},
{
"epoch": 8.006272401433693,
"grad_norm": 6.40625,
"learning_rate": 5.818764707311811e-06,
"loss": 0.0159,
"num_input_tokens_seen": 3366360,
"step": 8935
},
{
"epoch": 8.010752688172044,
"grad_norm": 6.9375,
"learning_rate": 5.7937127959938806e-06,
"loss": 0.3514,
"num_input_tokens_seen": 3368312,
"step": 8940
},
{
"epoch": 8.015232974910393,
"grad_norm": 1.3203125,
"learning_rate": 5.768707859988267e-06,
"loss": 0.2599,
"num_input_tokens_seen": 3370200,
"step": 8945
},
{
"epoch": 8.019713261648745,
"grad_norm": 0.0157470703125,
"learning_rate": 5.7437499604528125e-06,
"loss": 0.0752,
"num_input_tokens_seen": 3372056,
"step": 8950
},
{
"epoch": 8.024193548387096,
"grad_norm": 3.40625,
"learning_rate": 5.7188391584302895e-06,
"loss": 0.0064,
"num_input_tokens_seen": 3373976,
"step": 8955
},
{
"epoch": 8.028673835125447,
"grad_norm": 17.75,
"learning_rate": 5.693975514848271e-06,
"loss": 0.062,
"num_input_tokens_seen": 3375960,
"step": 8960
},
{
"epoch": 8.033154121863799,
"grad_norm": 102.0,
"learning_rate": 5.669159090519019e-06,
"loss": 0.3412,
"num_input_tokens_seen": 3377880,
"step": 8965
},
{
"epoch": 8.03763440860215,
"grad_norm": 0.027099609375,
"learning_rate": 5.644389946139278e-06,
"loss": 0.4138,
"num_input_tokens_seen": 3379768,
"step": 8970
},
{
"epoch": 8.042114695340501,
"grad_norm": 4.0625,
"learning_rate": 5.6196681422901634e-06,
"loss": 0.035,
"num_input_tokens_seen": 3381560,
"step": 8975
},
{
"epoch": 8.046594982078853,
"grad_norm": 0.0634765625,
"learning_rate": 5.594993739437007e-06,
"loss": 0.2346,
"num_input_tokens_seen": 3383544,
"step": 8980
},
{
"epoch": 8.051075268817204,
"grad_norm": 0.12255859375,
"learning_rate": 5.5703667979291915e-06,
"loss": 0.2419,
"num_input_tokens_seen": 3385272,
"step": 8985
},
{
"epoch": 8.055555555555555,
"grad_norm": 92.0,
"learning_rate": 5.545787378000039e-06,
"loss": 0.076,
"num_input_tokens_seen": 3387256,
"step": 8990
},
{
"epoch": 8.060035842293907,
"grad_norm": 5.21875,
"learning_rate": 5.521255539766637e-06,
"loss": 0.197,
"num_input_tokens_seen": 3389144,
"step": 8995
},
{
"epoch": 8.064516129032258,
"grad_norm": 22.75,
"learning_rate": 5.4967713432296674e-06,
"loss": 0.1994,
"num_input_tokens_seen": 3390904,
"step": 9000
},
{
"epoch": 8.06899641577061,
"grad_norm": 39.75,
"learning_rate": 5.472334848273328e-06,
"loss": 0.2094,
"num_input_tokens_seen": 3392792,
"step": 9005
},
{
"epoch": 8.07347670250896,
"grad_norm": 166.0,
"learning_rate": 5.44794611466512e-06,
"loss": 0.5029,
"num_input_tokens_seen": 3394744,
"step": 9010
},
{
"epoch": 8.077956989247312,
"grad_norm": 14.1875,
"learning_rate": 5.4236052020557535e-06,
"loss": 0.5896,
"num_input_tokens_seen": 3396632,
"step": 9015
},
{
"epoch": 8.082437275985663,
"grad_norm": 69.0,
"learning_rate": 5.399312169978949e-06,
"loss": 0.1446,
"num_input_tokens_seen": 3398424,
"step": 9020
},
{
"epoch": 8.086917562724015,
"grad_norm": 0.263671875,
"learning_rate": 5.375067077851337e-06,
"loss": 0.1386,
"num_input_tokens_seen": 3400312,
"step": 9025
},
{
"epoch": 8.091397849462366,
"grad_norm": 64.0,
"learning_rate": 5.350869984972287e-06,
"loss": 0.2907,
"num_input_tokens_seen": 3402200,
"step": 9030
},
{
"epoch": 8.095878136200717,
"grad_norm": 0.048828125,
"learning_rate": 5.326720950523772e-06,
"loss": 0.1821,
"num_input_tokens_seen": 3404152,
"step": 9035
},
{
"epoch": 8.100358422939069,
"grad_norm": 5.75,
"learning_rate": 5.302620033570222e-06,
"loss": 0.1275,
"num_input_tokens_seen": 3405912,
"step": 9040
},
{
"epoch": 8.10483870967742,
"grad_norm": 2.921875,
"learning_rate": 5.27856729305839e-06,
"loss": 0.0059,
"num_input_tokens_seen": 3407672,
"step": 9045
},
{
"epoch": 8.109318996415771,
"grad_norm": 78.5,
"learning_rate": 5.254562787817183e-06,
"loss": 0.2426,
"num_input_tokens_seen": 3409496,
"step": 9050
},
{
"epoch": 8.113799283154123,
"grad_norm": 0.6015625,
"learning_rate": 5.23060657655754e-06,
"loss": 0.2228,
"num_input_tokens_seen": 3411352,
"step": 9055
},
{
"epoch": 8.118279569892474,
"grad_norm": 0.0208740234375,
"learning_rate": 5.206698717872277e-06,
"loss": 0.1006,
"num_input_tokens_seen": 3413432,
"step": 9060
},
{
"epoch": 8.122759856630825,
"grad_norm": 94.0,
"learning_rate": 5.1828392702359504e-06,
"loss": 0.4816,
"num_input_tokens_seen": 3415320,
"step": 9065
},
{
"epoch": 8.127240143369175,
"grad_norm": 0.01324462890625,
"learning_rate": 5.159028292004717e-06,
"loss": 0.2086,
"num_input_tokens_seen": 3417240,
"step": 9070
},
{
"epoch": 8.131720430107526,
"grad_norm": 1.9296875,
"learning_rate": 5.1352658414161785e-06,
"loss": 0.0816,
"num_input_tokens_seen": 3419192,
"step": 9075
},
{
"epoch": 8.136200716845877,
"grad_norm": 101.0,
"learning_rate": 5.111551976589249e-06,
"loss": 0.1281,
"num_input_tokens_seen": 3421208,
"step": 9080
},
{
"epoch": 8.140681003584229,
"grad_norm": 0.263671875,
"learning_rate": 5.087886755524005e-06,
"loss": 0.0338,
"num_input_tokens_seen": 3423064,
"step": 9085
},
{
"epoch": 8.14516129032258,
"grad_norm": 0.19140625,
"learning_rate": 5.064270236101548e-06,
"loss": 0.0008,
"num_input_tokens_seen": 3424984,
"step": 9090
},
{
"epoch": 8.149641577060931,
"grad_norm": 34.25,
"learning_rate": 5.040702476083883e-06,
"loss": 0.0065,
"num_input_tokens_seen": 3426936,
"step": 9095
},
{
"epoch": 8.154121863799283,
"grad_norm": 76.5,
"learning_rate": 5.0171835331137365e-06,
"loss": 0.1797,
"num_input_tokens_seen": 3428696,
"step": 9100
},
{
"epoch": 8.158602150537634,
"grad_norm": 0.470703125,
"learning_rate": 4.993713464714433e-06,
"loss": 0.0416,
"num_input_tokens_seen": 3430744,
"step": 9105
},
{
"epoch": 8.163082437275985,
"grad_norm": 7.375,
"learning_rate": 4.970292328289794e-06,
"loss": 0.0405,
"num_input_tokens_seen": 3432696,
"step": 9110
},
{
"epoch": 8.167562724014337,
"grad_norm": 116.5,
"learning_rate": 4.946920181123904e-06,
"loss": 0.2954,
"num_input_tokens_seen": 3434424,
"step": 9115
},
{
"epoch": 8.172043010752688,
"grad_norm": 0.107421875,
"learning_rate": 4.9235970803810845e-06,
"loss": 0.0951,
"num_input_tokens_seen": 3436312,
"step": 9120
},
{
"epoch": 8.17652329749104,
"grad_norm": 0.435546875,
"learning_rate": 4.900323083105668e-06,
"loss": 0.0406,
"num_input_tokens_seen": 3438328,
"step": 9125
},
{
"epoch": 8.18100358422939,
"grad_norm": 0.030517578125,
"learning_rate": 4.877098246221881e-06,
"loss": 0.1157,
"num_input_tokens_seen": 3440088,
"step": 9130
},
{
"epoch": 8.185483870967742,
"grad_norm": 41.5,
"learning_rate": 4.853922626533749e-06,
"loss": 0.0815,
"num_input_tokens_seen": 3441912,
"step": 9135
},
{
"epoch": 8.189964157706093,
"grad_norm": 0.0771484375,
"learning_rate": 4.830796280724873e-06,
"loss": 0.2556,
"num_input_tokens_seen": 3443832,
"step": 9140
},
{
"epoch": 8.194444444444445,
"grad_norm": 8.6875,
"learning_rate": 4.807719265358377e-06,
"loss": 0.1774,
"num_input_tokens_seen": 3445720,
"step": 9145
},
{
"epoch": 8.198924731182796,
"grad_norm": 32.5,
"learning_rate": 4.7846916368767094e-06,
"loss": 0.1127,
"num_input_tokens_seen": 3447544,
"step": 9150
},
{
"epoch": 8.203405017921147,
"grad_norm": 0.224609375,
"learning_rate": 4.761713451601532e-06,
"loss": 0.2225,
"num_input_tokens_seen": 3449400,
"step": 9155
},
{
"epoch": 8.207885304659499,
"grad_norm": 89.0,
"learning_rate": 4.738784765733586e-06,
"loss": 0.3904,
"num_input_tokens_seen": 3451256,
"step": 9160
},
{
"epoch": 8.21236559139785,
"grad_norm": 64.0,
"learning_rate": 4.715905635352541e-06,
"loss": 0.1328,
"num_input_tokens_seen": 3453240,
"step": 9165
},
{
"epoch": 8.216845878136201,
"grad_norm": 0.026611328125,
"learning_rate": 4.6930761164168395e-06,
"loss": 0.2755,
"num_input_tokens_seen": 3455064,
"step": 9170
},
{
"epoch": 8.221326164874553,
"grad_norm": 0.01287841796875,
"learning_rate": 4.670296264763618e-06,
"loss": 0.2185,
"num_input_tokens_seen": 3456888,
"step": 9175
},
{
"epoch": 8.225806451612904,
"grad_norm": 53.5,
"learning_rate": 4.6475661361085195e-06,
"loss": 0.3425,
"num_input_tokens_seen": 3458776,
"step": 9180
},
{
"epoch": 8.230286738351255,
"grad_norm": 64.0,
"learning_rate": 4.624885786045563e-06,
"loss": 0.5916,
"num_input_tokens_seen": 3460600,
"step": 9185
},
{
"epoch": 8.234767025089607,
"grad_norm": 0.71875,
"learning_rate": 4.602255270047048e-06,
"loss": 0.1208,
"num_input_tokens_seen": 3462552,
"step": 9190
},
{
"epoch": 8.239247311827956,
"grad_norm": 40.25,
"learning_rate": 4.579674643463341e-06,
"loss": 0.0165,
"num_input_tokens_seen": 3464568,
"step": 9195
},
{
"epoch": 8.243727598566307,
"grad_norm": 24.75,
"learning_rate": 4.557143961522836e-06,
"loss": 0.1902,
"num_input_tokens_seen": 3466328,
"step": 9200
},
{
"epoch": 8.248207885304659,
"grad_norm": 116.5,
"learning_rate": 4.534663279331744e-06,
"loss": 0.1515,
"num_input_tokens_seen": 3468248,
"step": 9205
},
{
"epoch": 8.25268817204301,
"grad_norm": 87.0,
"learning_rate": 4.512232651873982e-06,
"loss": 0.3549,
"num_input_tokens_seen": 3470200,
"step": 9210
},
{
"epoch": 8.257168458781361,
"grad_norm": 139.0,
"learning_rate": 4.489852134011061e-06,
"loss": 0.4197,
"num_input_tokens_seen": 3472184,
"step": 9215
},
{
"epoch": 8.261648745519713,
"grad_norm": 0.0966796875,
"learning_rate": 4.46752178048192e-06,
"loss": 0.4768,
"num_input_tokens_seen": 3474008,
"step": 9220
},
{
"epoch": 8.266129032258064,
"grad_norm": 0.04638671875,
"learning_rate": 4.445241645902804e-06,
"loss": 0.0017,
"num_input_tokens_seen": 3475896,
"step": 9225
},
{
"epoch": 8.270609318996415,
"grad_norm": 2.515625,
"learning_rate": 4.423011784767133e-06,
"loss": 0.2011,
"num_input_tokens_seen": 3477880,
"step": 9230
},
{
"epoch": 8.275089605734767,
"grad_norm": 0.1767578125,
"learning_rate": 4.400832251445361e-06,
"loss": 0.0009,
"num_input_tokens_seen": 3479832,
"step": 9235
},
{
"epoch": 8.279569892473118,
"grad_norm": 60.25,
"learning_rate": 4.378703100184869e-06,
"loss": 0.223,
"num_input_tokens_seen": 3481976,
"step": 9240
},
{
"epoch": 8.28405017921147,
"grad_norm": 88.5,
"learning_rate": 4.35662438510979e-06,
"loss": 0.7543,
"num_input_tokens_seen": 3483832,
"step": 9245
},
{
"epoch": 8.28853046594982,
"grad_norm": 1.2890625,
"learning_rate": 4.334596160220905e-06,
"loss": 0.3632,
"num_input_tokens_seen": 3485720,
"step": 9250
},
{
"epoch": 8.293010752688172,
"grad_norm": 0.8671875,
"learning_rate": 4.312618479395506e-06,
"loss": 0.0588,
"num_input_tokens_seen": 3487640,
"step": 9255
},
{
"epoch": 8.297491039426523,
"grad_norm": 52.5,
"learning_rate": 4.290691396387258e-06,
"loss": 0.3095,
"num_input_tokens_seen": 3489688,
"step": 9260
},
{
"epoch": 8.301971326164875,
"grad_norm": 0.328125,
"learning_rate": 4.268814964826093e-06,
"loss": 0.1791,
"num_input_tokens_seen": 3491512,
"step": 9265
},
{
"epoch": 8.306451612903226,
"grad_norm": 3.921875,
"learning_rate": 4.24698923821803e-06,
"loss": 0.0938,
"num_input_tokens_seen": 3493560,
"step": 9270
},
{
"epoch": 8.310931899641577,
"grad_norm": 145.0,
"learning_rate": 4.225214269945088e-06,
"loss": 0.4003,
"num_input_tokens_seen": 3495608,
"step": 9275
},
{
"epoch": 8.315412186379929,
"grad_norm": 44.75,
"learning_rate": 4.203490113265138e-06,
"loss": 0.5021,
"num_input_tokens_seen": 3497464,
"step": 9280
},
{
"epoch": 8.31989247311828,
"grad_norm": 40.25,
"learning_rate": 4.181816821311763e-06,
"loss": 0.5261,
"num_input_tokens_seen": 3499416,
"step": 9285
},
{
"epoch": 8.324372759856631,
"grad_norm": 93.5,
"learning_rate": 4.160194447094162e-06,
"loss": 0.3314,
"num_input_tokens_seen": 3501400,
"step": 9290
},
{
"epoch": 8.328853046594983,
"grad_norm": 0.016357421875,
"learning_rate": 4.138623043496981e-06,
"loss": 0.2029,
"num_input_tokens_seen": 3503160,
"step": 9295
},
{
"epoch": 8.333333333333334,
"grad_norm": 9.0625,
"learning_rate": 4.1171026632802035e-06,
"loss": 0.3204,
"num_input_tokens_seen": 3504952,
"step": 9300
},
{
"epoch": 8.337813620071685,
"grad_norm": 10.5625,
"learning_rate": 4.095633359079024e-06,
"loss": 0.0028,
"num_input_tokens_seen": 3506680,
"step": 9305
},
{
"epoch": 8.342293906810037,
"grad_norm": 4.8125,
"learning_rate": 4.074215183403701e-06,
"loss": 0.107,
"num_input_tokens_seen": 3508600,
"step": 9310
},
{
"epoch": 8.346774193548388,
"grad_norm": 172.0,
"learning_rate": 4.052848188639452e-06,
"loss": 0.2974,
"num_input_tokens_seen": 3510520,
"step": 9315
},
{
"epoch": 8.351254480286737,
"grad_norm": 0.3359375,
"learning_rate": 4.031532427046322e-06,
"loss": 0.0019,
"num_input_tokens_seen": 3512312,
"step": 9320
},
{
"epoch": 8.355734767025089,
"grad_norm": 0.1484375,
"learning_rate": 4.010267950759025e-06,
"loss": 0.1513,
"num_input_tokens_seen": 3514200,
"step": 9325
},
{
"epoch": 8.36021505376344,
"grad_norm": 49.25,
"learning_rate": 3.989054811786874e-06,
"loss": 0.2249,
"num_input_tokens_seen": 3516024,
"step": 9330
},
{
"epoch": 8.364695340501791,
"grad_norm": 9.0,
"learning_rate": 3.967893062013581e-06,
"loss": 0.1822,
"num_input_tokens_seen": 3517912,
"step": 9335
},
{
"epoch": 8.369175627240143,
"grad_norm": 87.0,
"learning_rate": 3.946782753197187e-06,
"loss": 0.2565,
"num_input_tokens_seen": 3519768,
"step": 9340
},
{
"epoch": 8.373655913978494,
"grad_norm": 41.0,
"learning_rate": 3.925723936969927e-06,
"loss": 0.0686,
"num_input_tokens_seen": 3521560,
"step": 9345
},
{
"epoch": 8.378136200716845,
"grad_norm": 42.75,
"learning_rate": 3.9047166648380844e-06,
"loss": 0.1601,
"num_input_tokens_seen": 3523448,
"step": 9350
},
{
"epoch": 8.382616487455197,
"grad_norm": 0.16796875,
"learning_rate": 3.883760988181867e-06,
"loss": 0.0841,
"num_input_tokens_seen": 3525176,
"step": 9355
},
{
"epoch": 8.387096774193548,
"grad_norm": 0.921875,
"learning_rate": 3.862856958255304e-06,
"loss": 0.0008,
"num_input_tokens_seen": 3527128,
"step": 9360
},
{
"epoch": 8.3915770609319,
"grad_norm": 75.0,
"learning_rate": 3.842004626186085e-06,
"loss": 0.1428,
"num_input_tokens_seen": 3528856,
"step": 9365
},
{
"epoch": 8.39605734767025,
"grad_norm": 68.0,
"learning_rate": 3.821204042975482e-06,
"loss": 0.2207,
"num_input_tokens_seen": 3530648,
"step": 9370
},
{
"epoch": 8.400537634408602,
"grad_norm": 36.0,
"learning_rate": 3.8004552594981815e-06,
"loss": 0.1581,
"num_input_tokens_seen": 3532376,
"step": 9375
},
{
"epoch": 8.405017921146953,
"grad_norm": 71.0,
"learning_rate": 3.77975832650217e-06,
"loss": 0.1216,
"num_input_tokens_seen": 3534200,
"step": 9380
},
{
"epoch": 8.409498207885305,
"grad_norm": 38.0,
"learning_rate": 3.7591132946086434e-06,
"loss": 0.0194,
"num_input_tokens_seen": 3536376,
"step": 9385
},
{
"epoch": 8.413978494623656,
"grad_norm": 0.01483154296875,
"learning_rate": 3.7385202143118192e-06,
"loss": 0.0428,
"num_input_tokens_seen": 3538392,
"step": 9390
},
{
"epoch": 8.418458781362007,
"grad_norm": 5.9375,
"learning_rate": 3.717979135978883e-06,
"loss": 0.4549,
"num_input_tokens_seen": 3540280,
"step": 9395
},
{
"epoch": 8.422939068100359,
"grad_norm": 0.45703125,
"learning_rate": 3.697490109849816e-06,
"loss": 0.0035,
"num_input_tokens_seen": 3542136,
"step": 9400
},
{
"epoch": 8.42741935483871,
"grad_norm": 1.0234375,
"learning_rate": 3.6770531860372853e-06,
"loss": 0.1015,
"num_input_tokens_seen": 3543992,
"step": 9405
},
{
"epoch": 8.431899641577061,
"grad_norm": 1.53125,
"learning_rate": 3.6566684145265483e-06,
"loss": 0.4238,
"num_input_tokens_seen": 3545784,
"step": 9410
},
{
"epoch": 8.436379928315413,
"grad_norm": 88.5,
"learning_rate": 3.636335845175265e-06,
"loss": 0.1988,
"num_input_tokens_seen": 3547800,
"step": 9415
},
{
"epoch": 8.440860215053764,
"grad_norm": 6.25,
"learning_rate": 3.616055527713463e-06,
"loss": 0.0709,
"num_input_tokens_seen": 3549528,
"step": 9420
},
{
"epoch": 8.445340501792115,
"grad_norm": 0.1611328125,
"learning_rate": 3.595827511743341e-06,
"loss": 0.0082,
"num_input_tokens_seen": 3551416,
"step": 9425
},
{
"epoch": 8.449820788530467,
"grad_norm": 0.06494140625,
"learning_rate": 3.575651846739181e-06,
"loss": 0.5459,
"num_input_tokens_seen": 3553336,
"step": 9430
},
{
"epoch": 8.454301075268818,
"grad_norm": 58.25,
"learning_rate": 3.5555285820472435e-06,
"loss": 0.4879,
"num_input_tokens_seen": 3555096,
"step": 9435
},
{
"epoch": 8.45878136200717,
"grad_norm": 0.1005859375,
"learning_rate": 3.5354577668856083e-06,
"loss": 0.1278,
"num_input_tokens_seen": 3556952,
"step": 9440
},
{
"epoch": 8.46326164874552,
"grad_norm": 37.0,
"learning_rate": 3.5154394503440576e-06,
"loss": 0.3865,
"num_input_tokens_seen": 3558776,
"step": 9445
},
{
"epoch": 8.46774193548387,
"grad_norm": 65.0,
"learning_rate": 3.4954736813840095e-06,
"loss": 0.6281,
"num_input_tokens_seen": 3560792,
"step": 9450
},
{
"epoch": 8.472222222222221,
"grad_norm": 89.5,
"learning_rate": 3.47556050883833e-06,
"loss": 0.0826,
"num_input_tokens_seen": 3562680,
"step": 9455
},
{
"epoch": 8.476702508960573,
"grad_norm": 15.3125,
"learning_rate": 3.455699981411259e-06,
"loss": 0.0845,
"num_input_tokens_seen": 3564696,
"step": 9460
},
{
"epoch": 8.481182795698924,
"grad_norm": 99.5,
"learning_rate": 3.4358921476782714e-06,
"loss": 0.1485,
"num_input_tokens_seen": 3566520,
"step": 9465
},
{
"epoch": 8.485663082437275,
"grad_norm": 0.0224609375,
"learning_rate": 3.416137056085944e-06,
"loss": 0.0039,
"num_input_tokens_seen": 3568536,
"step": 9470
},
{
"epoch": 8.490143369175627,
"grad_norm": 112.5,
"learning_rate": 3.3964347549518883e-06,
"loss": 0.154,
"num_input_tokens_seen": 3570360,
"step": 9475
},
{
"epoch": 8.494623655913978,
"grad_norm": 135.0,
"learning_rate": 3.376785292464574e-06,
"loss": 0.1192,
"num_input_tokens_seen": 3572280,
"step": 9480
},
{
"epoch": 8.49910394265233,
"grad_norm": 60.5,
"learning_rate": 3.3571887166832434e-06,
"loss": 0.3767,
"num_input_tokens_seen": 3574200,
"step": 9485
},
{
"epoch": 8.5,
"eval_loss": 0.28259721398353577,
"eval_runtime": 9.659,
"eval_samples_per_second": 51.351,
"eval_steps_per_second": 12.838,
"num_input_tokens_seen": 3574616,
"step": 9486
},
{
"epoch": 8.50358422939068,
"grad_norm": 135.0,
"learning_rate": 3.3376450755377958e-06,
"loss": 0.0602,
"num_input_tokens_seen": 3576344,
"step": 9490
},
{
"epoch": 8.508064516129032,
"grad_norm": 6.21875,
"learning_rate": 3.3181544168286503e-06,
"loss": 0.0091,
"num_input_tokens_seen": 3578296,
"step": 9495
},
{
"epoch": 8.512544802867383,
"grad_norm": 0.4140625,
"learning_rate": 3.298716788226644e-06,
"loss": 0.1587,
"num_input_tokens_seen": 3580216,
"step": 9500
},
{
"epoch": 8.517025089605735,
"grad_norm": 46.5,
"learning_rate": 3.2793322372729085e-06,
"loss": 0.2159,
"num_input_tokens_seen": 3582200,
"step": 9505
},
{
"epoch": 8.521505376344086,
"grad_norm": 3.875,
"learning_rate": 3.260000811378755e-06,
"loss": 0.1871,
"num_input_tokens_seen": 3584152,
"step": 9510
},
{
"epoch": 8.525985663082437,
"grad_norm": 169.0,
"learning_rate": 3.240722557825576e-06,
"loss": 0.2215,
"num_input_tokens_seen": 3585816,
"step": 9515
},
{
"epoch": 8.530465949820789,
"grad_norm": 7.84375,
"learning_rate": 3.2214975237646937e-06,
"loss": 0.1938,
"num_input_tokens_seen": 3587896,
"step": 9520
},
{
"epoch": 8.53494623655914,
"grad_norm": 0.609375,
"learning_rate": 3.2023257562172725e-06,
"loss": 0.0812,
"num_input_tokens_seen": 3589752,
"step": 9525
},
{
"epoch": 8.539426523297491,
"grad_norm": 70.0,
"learning_rate": 3.1832073020741983e-06,
"loss": 0.5314,
"num_input_tokens_seen": 3591672,
"step": 9530
},
{
"epoch": 8.543906810035843,
"grad_norm": 77.5,
"learning_rate": 3.1641422080959465e-06,
"loss": 0.4599,
"num_input_tokens_seen": 3593656,
"step": 9535
},
{
"epoch": 8.548387096774194,
"grad_norm": 0.2236328125,
"learning_rate": 3.145130520912515e-06,
"loss": 0.0115,
"num_input_tokens_seen": 3595832,
"step": 9540
},
{
"epoch": 8.552867383512545,
"grad_norm": 0.5859375,
"learning_rate": 3.1261722870232436e-06,
"loss": 0.1144,
"num_input_tokens_seen": 3597528,
"step": 9545
},
{
"epoch": 8.557347670250897,
"grad_norm": 0.03271484375,
"learning_rate": 3.1072675527967526e-06,
"loss": 0.1901,
"num_input_tokens_seen": 3599480,
"step": 9550
},
{
"epoch": 8.561827956989248,
"grad_norm": 7.8125,
"learning_rate": 3.0884163644708084e-06,
"loss": 0.0674,
"num_input_tokens_seen": 3601368,
"step": 9555
},
{
"epoch": 8.5663082437276,
"grad_norm": 0.01251220703125,
"learning_rate": 3.069618768152202e-06,
"loss": 0.3435,
"num_input_tokens_seen": 3603288,
"step": 9560
},
{
"epoch": 8.57078853046595,
"grad_norm": 69.0,
"learning_rate": 3.050874809816673e-06,
"loss": 0.1857,
"num_input_tokens_seen": 3605048,
"step": 9565
},
{
"epoch": 8.575268817204302,
"grad_norm": 176.0,
"learning_rate": 3.0321845353087463e-06,
"loss": 0.3279,
"num_input_tokens_seen": 3606840,
"step": 9570
},
{
"epoch": 8.579749103942653,
"grad_norm": 0.126953125,
"learning_rate": 3.0135479903416586e-06,
"loss": 0.1517,
"num_input_tokens_seen": 3608728,
"step": 9575
},
{
"epoch": 8.584229390681003,
"grad_norm": 19.125,
"learning_rate": 2.9949652204972254e-06,
"loss": 0.0061,
"num_input_tokens_seen": 3610552,
"step": 9580
},
{
"epoch": 8.588709677419354,
"grad_norm": 0.69140625,
"learning_rate": 2.976436271225741e-06,
"loss": 0.1394,
"num_input_tokens_seen": 3612472,
"step": 9585
},
{
"epoch": 8.593189964157705,
"grad_norm": 26.375,
"learning_rate": 2.95796118784587e-06,
"loss": 0.1967,
"num_input_tokens_seen": 3614360,
"step": 9590
},
{
"epoch": 8.597670250896057,
"grad_norm": 46.5,
"learning_rate": 2.939540015544523e-06,
"loss": 0.5062,
"num_input_tokens_seen": 3616216,
"step": 9595
},
{
"epoch": 8.602150537634408,
"grad_norm": 0.232421875,
"learning_rate": 2.9211727993767507e-06,
"loss": 0.3398,
"num_input_tokens_seen": 3617880,
"step": 9600
},
{
"epoch": 8.60663082437276,
"grad_norm": 0.40625,
"learning_rate": 2.902859584265649e-06,
"loss": 0.1545,
"num_input_tokens_seen": 3619736,
"step": 9605
},
{
"epoch": 8.61111111111111,
"grad_norm": 88.5,
"learning_rate": 2.88460041500222e-06,
"loss": 0.442,
"num_input_tokens_seen": 3621560,
"step": 9610
},
{
"epoch": 8.615591397849462,
"grad_norm": 117.5,
"learning_rate": 2.866395336245284e-06,
"loss": 0.1486,
"num_input_tokens_seen": 3623224,
"step": 9615
},
{
"epoch": 8.620071684587813,
"grad_norm": 102.0,
"learning_rate": 2.8482443925213765e-06,
"loss": 0.3985,
"num_input_tokens_seen": 3625208,
"step": 9620
},
{
"epoch": 8.624551971326165,
"grad_norm": 26.75,
"learning_rate": 2.8301476282246164e-06,
"loss": 0.3171,
"num_input_tokens_seen": 3627192,
"step": 9625
},
{
"epoch": 8.629032258064516,
"grad_norm": 98.0,
"learning_rate": 2.8121050876166096e-06,
"loss": 0.1645,
"num_input_tokens_seen": 3629112,
"step": 9630
},
{
"epoch": 8.633512544802867,
"grad_norm": 109.5,
"learning_rate": 2.794116814826342e-06,
"loss": 0.4167,
"num_input_tokens_seen": 3630808,
"step": 9635
},
{
"epoch": 8.637992831541219,
"grad_norm": 0.1494140625,
"learning_rate": 2.776182853850065e-06,
"loss": 0.1989,
"num_input_tokens_seen": 3632664,
"step": 9640
},
{
"epoch": 8.64247311827957,
"grad_norm": 0.08984375,
"learning_rate": 2.758303248551211e-06,
"loss": 0.3616,
"num_input_tokens_seen": 3634456,
"step": 9645
},
{
"epoch": 8.646953405017921,
"grad_norm": 1.03125,
"learning_rate": 2.740478042660244e-06,
"loss": 0.1579,
"num_input_tokens_seen": 3636216,
"step": 9650
},
{
"epoch": 8.651433691756273,
"grad_norm": 180.0,
"learning_rate": 2.7227072797745833e-06,
"loss": 0.7644,
"num_input_tokens_seen": 3638200,
"step": 9655
},
{
"epoch": 8.655913978494624,
"grad_norm": 0.0849609375,
"learning_rate": 2.7049910033585093e-06,
"loss": 0.0312,
"num_input_tokens_seen": 3640088,
"step": 9660
},
{
"epoch": 8.660394265232975,
"grad_norm": 0.2236328125,
"learning_rate": 2.6873292567429986e-06,
"loss": 0.1099,
"num_input_tokens_seen": 3641944,
"step": 9665
},
{
"epoch": 8.664874551971327,
"grad_norm": 0.01300048828125,
"learning_rate": 2.6697220831256974e-06,
"loss": 0.2019,
"num_input_tokens_seen": 3643896,
"step": 9670
},
{
"epoch": 8.669354838709678,
"grad_norm": 18.5,
"learning_rate": 2.6521695255707495e-06,
"loss": 0.2871,
"num_input_tokens_seen": 3645784,
"step": 9675
},
{
"epoch": 8.67383512544803,
"grad_norm": 50.75,
"learning_rate": 2.6346716270087253e-06,
"loss": 0.5444,
"num_input_tokens_seen": 3647800,
"step": 9680
},
{
"epoch": 8.67831541218638,
"grad_norm": 121.5,
"learning_rate": 2.617228430236521e-06,
"loss": 0.1972,
"num_input_tokens_seen": 3649624,
"step": 9685
},
{
"epoch": 8.682795698924732,
"grad_norm": 0.0849609375,
"learning_rate": 2.5998399779172123e-06,
"loss": 0.0517,
"num_input_tokens_seen": 3651416,
"step": 9690
},
{
"epoch": 8.687275985663083,
"grad_norm": 75.5,
"learning_rate": 2.5825063125800074e-06,
"loss": 0.3543,
"num_input_tokens_seen": 3653464,
"step": 9695
},
{
"epoch": 8.691756272401435,
"grad_norm": 49.75,
"learning_rate": 2.565227476620105e-06,
"loss": 0.4386,
"num_input_tokens_seen": 3655320,
"step": 9700
},
{
"epoch": 8.696236559139784,
"grad_norm": 0.1474609375,
"learning_rate": 2.5480035122985885e-06,
"loss": 0.2902,
"num_input_tokens_seen": 3657400,
"step": 9705
},
{
"epoch": 8.700716845878135,
"grad_norm": 0.328125,
"learning_rate": 2.530834461742357e-06,
"loss": 0.3138,
"num_input_tokens_seen": 3659256,
"step": 9710
},
{
"epoch": 8.705197132616487,
"grad_norm": 0.06103515625,
"learning_rate": 2.513720366943986e-06,
"loss": 0.1487,
"num_input_tokens_seen": 3661080,
"step": 9715
},
{
"epoch": 8.709677419354838,
"grad_norm": 6.09375,
"learning_rate": 2.4966612697616382e-06,
"loss": 0.1132,
"num_input_tokens_seen": 3663128,
"step": 9720
},
{
"epoch": 8.71415770609319,
"grad_norm": 0.0439453125,
"learning_rate": 2.4796572119189647e-06,
"loss": 0.0433,
"num_input_tokens_seen": 3665144,
"step": 9725
},
{
"epoch": 8.71863799283154,
"grad_norm": 92.5,
"learning_rate": 2.462708235004996e-06,
"loss": 0.3908,
"num_input_tokens_seen": 3666968,
"step": 9730
},
{
"epoch": 8.723118279569892,
"grad_norm": 122.0,
"learning_rate": 2.445814380474057e-06,
"loss": 0.2358,
"num_input_tokens_seen": 3668952,
"step": 9735
},
{
"epoch": 8.727598566308243,
"grad_norm": 6.03125,
"learning_rate": 2.4289756896456434e-06,
"loss": 0.1752,
"num_input_tokens_seen": 3670744,
"step": 9740
},
{
"epoch": 8.732078853046595,
"grad_norm": 0.072265625,
"learning_rate": 2.412192203704311e-06,
"loss": 0.2971,
"num_input_tokens_seen": 3672600,
"step": 9745
},
{
"epoch": 8.736559139784946,
"grad_norm": 0.64453125,
"learning_rate": 2.395463963699629e-06,
"loss": 0.5051,
"num_input_tokens_seen": 3674360,
"step": 9750
},
{
"epoch": 8.741039426523297,
"grad_norm": 4.8125,
"learning_rate": 2.3787910105460247e-06,
"loss": 0.2317,
"num_input_tokens_seen": 3676152,
"step": 9755
},
{
"epoch": 8.745519713261649,
"grad_norm": 0.054443359375,
"learning_rate": 2.362173385022701e-06,
"loss": 0.207,
"num_input_tokens_seen": 3678104,
"step": 9760
},
{
"epoch": 8.75,
"grad_norm": 0.4140625,
"learning_rate": 2.3456111277735506e-06,
"loss": 0.0069,
"num_input_tokens_seen": 3679864,
"step": 9765
},
{
"epoch": 8.754480286738351,
"grad_norm": 105.5,
"learning_rate": 2.3291042793070374e-06,
"loss": 0.0712,
"num_input_tokens_seen": 3681720,
"step": 9770
},
{
"epoch": 8.758960573476703,
"grad_norm": 0.01251220703125,
"learning_rate": 2.3126528799961024e-06,
"loss": 0.0368,
"num_input_tokens_seen": 3683832,
"step": 9775
},
{
"epoch": 8.763440860215054,
"grad_norm": 0.08154296875,
"learning_rate": 2.2962569700780726e-06,
"loss": 0.021,
"num_input_tokens_seen": 3685752,
"step": 9780
},
{
"epoch": 8.767921146953405,
"grad_norm": 4.09375,
"learning_rate": 2.279916589654549e-06,
"loss": 0.0501,
"num_input_tokens_seen": 3687704,
"step": 9785
},
{
"epoch": 8.772401433691757,
"grad_norm": 0.0830078125,
"learning_rate": 2.263631778691333e-06,
"loss": 0.0704,
"num_input_tokens_seen": 3689624,
"step": 9790
},
{
"epoch": 8.776881720430108,
"grad_norm": 44.25,
"learning_rate": 2.2474025770182982e-06,
"loss": 0.2067,
"num_input_tokens_seen": 3691544,
"step": 9795
},
{
"epoch": 8.78136200716846,
"grad_norm": 43.0,
"learning_rate": 2.2312290243293147e-06,
"loss": 0.2136,
"num_input_tokens_seen": 3693368,
"step": 9800
},
{
"epoch": 8.78584229390681,
"grad_norm": 8.6875,
"learning_rate": 2.21511116018214e-06,
"loss": 0.298,
"num_input_tokens_seen": 3695224,
"step": 9805
},
{
"epoch": 8.790322580645162,
"grad_norm": 8.9375,
"learning_rate": 2.199049023998323e-06,
"loss": 0.3969,
"num_input_tokens_seen": 3697048,
"step": 9810
},
{
"epoch": 8.794802867383513,
"grad_norm": 11.9375,
"learning_rate": 2.1830426550631276e-06,
"loss": 0.2088,
"num_input_tokens_seen": 3699032,
"step": 9815
},
{
"epoch": 8.799283154121865,
"grad_norm": 2.53125,
"learning_rate": 2.1670920925254053e-06,
"loss": 0.3633,
"num_input_tokens_seen": 3700888,
"step": 9820
},
{
"epoch": 8.803763440860216,
"grad_norm": 0.2060546875,
"learning_rate": 2.1511973753975208e-06,
"loss": 0.2102,
"num_input_tokens_seen": 3702680,
"step": 9825
},
{
"epoch": 8.808243727598565,
"grad_norm": 120.0,
"learning_rate": 2.1353585425552463e-06,
"loss": 0.0625,
"num_input_tokens_seen": 3704536,
"step": 9830
},
{
"epoch": 8.812724014336917,
"grad_norm": 0.2314453125,
"learning_rate": 2.1195756327376722e-06,
"loss": 0.3356,
"num_input_tokens_seen": 3706360,
"step": 9835
},
{
"epoch": 8.817204301075268,
"grad_norm": 94.5,
"learning_rate": 2.1038486845471215e-06,
"loss": 0.2435,
"num_input_tokens_seen": 3708088,
"step": 9840
},
{
"epoch": 8.82168458781362,
"grad_norm": 2.375,
"learning_rate": 2.0881777364490265e-06,
"loss": 0.4738,
"num_input_tokens_seen": 3710040,
"step": 9845
},
{
"epoch": 8.82616487455197,
"grad_norm": 3.140625,
"learning_rate": 2.0725628267718595e-06,
"loss": 0.2982,
"num_input_tokens_seen": 3711928,
"step": 9850
},
{
"epoch": 8.830645161290322,
"grad_norm": 0.2255859375,
"learning_rate": 2.0570039937070463e-06,
"loss": 0.2775,
"num_input_tokens_seen": 3713720,
"step": 9855
},
{
"epoch": 8.835125448028673,
"grad_norm": 0.166015625,
"learning_rate": 2.04150127530883e-06,
"loss": 0.0642,
"num_input_tokens_seen": 3715416,
"step": 9860
},
{
"epoch": 8.839605734767025,
"grad_norm": 0.2294921875,
"learning_rate": 2.026054709494235e-06,
"loss": 0.0004,
"num_input_tokens_seen": 3717208,
"step": 9865
},
{
"epoch": 8.844086021505376,
"grad_norm": 0.01287841796875,
"learning_rate": 2.0106643340429332e-06,
"loss": 0.0008,
"num_input_tokens_seen": 3718936,
"step": 9870
},
{
"epoch": 8.848566308243727,
"grad_norm": 62.0,
"learning_rate": 1.995330186597158e-06,
"loss": 0.4684,
"num_input_tokens_seen": 3720920,
"step": 9875
},
{
"epoch": 8.853046594982079,
"grad_norm": 0.578125,
"learning_rate": 1.980052304661642e-06,
"loss": 0.2382,
"num_input_tokens_seen": 3722776,
"step": 9880
},
{
"epoch": 8.85752688172043,
"grad_norm": 0.012451171875,
"learning_rate": 1.9648307256034697e-06,
"loss": 0.0395,
"num_input_tokens_seen": 3724792,
"step": 9885
},
{
"epoch": 8.862007168458781,
"grad_norm": 108.5,
"learning_rate": 1.9496654866520414e-06,
"loss": 0.0711,
"num_input_tokens_seen": 3726712,
"step": 9890
},
{
"epoch": 8.866487455197133,
"grad_norm": 0.99609375,
"learning_rate": 1.9345566248989534e-06,
"loss": 0.1354,
"num_input_tokens_seen": 3728696,
"step": 9895
},
{
"epoch": 8.870967741935484,
"grad_norm": 3.78125,
"learning_rate": 1.9195041772979093e-06,
"loss": 0.167,
"num_input_tokens_seen": 3730488,
"step": 9900
},
{
"epoch": 8.875448028673835,
"grad_norm": 2.90625,
"learning_rate": 1.9045081806646436e-06,
"loss": 0.109,
"num_input_tokens_seen": 3732440,
"step": 9905
},
{
"epoch": 8.879928315412187,
"grad_norm": 22.125,
"learning_rate": 1.8895686716768113e-06,
"loss": 0.1147,
"num_input_tokens_seen": 3734488,
"step": 9910
},
{
"epoch": 8.884408602150538,
"grad_norm": 0.46484375,
"learning_rate": 1.8746856868739004e-06,
"loss": 0.0518,
"num_input_tokens_seen": 3736472,
"step": 9915
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.9609375,
"learning_rate": 1.8598592626571737e-06,
"loss": 0.4138,
"num_input_tokens_seen": 3738264,
"step": 9920
},
{
"epoch": 8.89336917562724,
"grad_norm": 118.5,
"learning_rate": 1.8450894352895375e-06,
"loss": 0.0657,
"num_input_tokens_seen": 3740056,
"step": 9925
},
{
"epoch": 8.897849462365592,
"grad_norm": 17.5,
"learning_rate": 1.8303762408954761e-06,
"loss": 0.0402,
"num_input_tokens_seen": 3742008,
"step": 9930
},
{
"epoch": 8.902329749103943,
"grad_norm": 2.109375,
"learning_rate": 1.81571971546097e-06,
"loss": 0.0014,
"num_input_tokens_seen": 3743864,
"step": 9935
},
{
"epoch": 8.906810035842295,
"grad_norm": 120.0,
"learning_rate": 1.8011198948333751e-06,
"loss": 0.1664,
"num_input_tokens_seen": 3745752,
"step": 9940
},
{
"epoch": 8.911290322580646,
"grad_norm": 0.10009765625,
"learning_rate": 1.7865768147213802e-06,
"loss": 0.1013,
"num_input_tokens_seen": 3747576,
"step": 9945
},
{
"epoch": 8.915770609318997,
"grad_norm": 73.0,
"learning_rate": 1.7720905106948821e-06,
"loss": 0.0797,
"num_input_tokens_seen": 3749464,
"step": 9950
},
{
"epoch": 8.920250896057347,
"grad_norm": 2.078125,
"learning_rate": 1.7576610181849113e-06,
"loss": 0.0514,
"num_input_tokens_seen": 3751352,
"step": 9955
},
{
"epoch": 8.924731182795698,
"grad_norm": 0.373046875,
"learning_rate": 1.7432883724835646e-06,
"loss": 0.1992,
"num_input_tokens_seen": 3753208,
"step": 9960
},
{
"epoch": 8.92921146953405,
"grad_norm": 6.46875,
"learning_rate": 1.7289726087438813e-06,
"loss": 0.0692,
"num_input_tokens_seen": 3755000,
"step": 9965
},
{
"epoch": 8.9336917562724,
"grad_norm": 129.0,
"learning_rate": 1.7147137619797888e-06,
"loss": 0.0421,
"num_input_tokens_seen": 3756856,
"step": 9970
},
{
"epoch": 8.938172043010752,
"grad_norm": 33.5,
"learning_rate": 1.7005118670659987e-06,
"loss": 0.0228,
"num_input_tokens_seen": 3758616,
"step": 9975
},
{
"epoch": 8.942652329749103,
"grad_norm": 3.265625,
"learning_rate": 1.6863669587379282e-06,
"loss": 0.1781,
"num_input_tokens_seen": 3760344,
"step": 9980
},
{
"epoch": 8.947132616487455,
"grad_norm": 0.14453125,
"learning_rate": 1.6722790715916231e-06,
"loss": 0.4241,
"num_input_tokens_seen": 3762232,
"step": 9985
},
{
"epoch": 8.951612903225806,
"grad_norm": 0.01239013671875,
"learning_rate": 1.658248240083657e-06,
"loss": 0.2282,
"num_input_tokens_seen": 3764088,
"step": 9990
},
{
"epoch": 8.956093189964157,
"grad_norm": 0.0159912109375,
"learning_rate": 1.6442744985310593e-06,
"loss": 0.0163,
"num_input_tokens_seen": 3765656,
"step": 9995
},
{
"epoch": 8.960573476702509,
"grad_norm": 0.0311279296875,
"learning_rate": 1.6303578811112246e-06,
"loss": 0.2835,
"num_input_tokens_seen": 3767352,
"step": 10000
},
{
"epoch": 8.96505376344086,
"grad_norm": 40.25,
"learning_rate": 1.6164984218618285e-06,
"loss": 0.3269,
"num_input_tokens_seen": 3769240,
"step": 10005
},
{
"epoch": 8.969534050179211,
"grad_norm": 0.06640625,
"learning_rate": 1.6026961546807605e-06,
"loss": 0.1912,
"num_input_tokens_seen": 3771032,
"step": 10010
},
{
"epoch": 8.974014336917563,
"grad_norm": 80.5,
"learning_rate": 1.5889511133260121e-06,
"loss": 0.7157,
"num_input_tokens_seen": 3772952,
"step": 10015
},
{
"epoch": 8.978494623655914,
"grad_norm": 0.474609375,
"learning_rate": 1.575263331415619e-06,
"loss": 0.0071,
"num_input_tokens_seen": 3774904,
"step": 10020
},
{
"epoch": 8.982974910394265,
"grad_norm": 0.6328125,
"learning_rate": 1.5616328424275656e-06,
"loss": 0.0818,
"num_input_tokens_seen": 3776696,
"step": 10025
},
{
"epoch": 8.987455197132617,
"grad_norm": 47.25,
"learning_rate": 1.5480596796997094e-06,
"loss": 0.4153,
"num_input_tokens_seen": 3778648,
"step": 10030
},
{
"epoch": 8.991935483870968,
"grad_norm": 123.5,
"learning_rate": 1.534543876429706e-06,
"loss": 0.0445,
"num_input_tokens_seen": 3780568,
"step": 10035
},
{
"epoch": 8.99641577060932,
"grad_norm": 0.0244140625,
"learning_rate": 1.521085465674904e-06,
"loss": 0.0792,
"num_input_tokens_seen": 3782488,
"step": 10040
},
{
"epoch": 9.0,
"eval_loss": 0.28632766008377075,
"eval_runtime": 9.6616,
"eval_samples_per_second": 51.337,
"eval_steps_per_second": 12.834,
"num_input_tokens_seen": 3783840,
"step": 10044
},
{
"epoch": 9.00089605734767,
"grad_norm": 1.578125,
"learning_rate": 1.5076844803522922e-06,
"loss": 0.136,
"num_input_tokens_seen": 3784384,
"step": 10045
},
{
"epoch": 9.005376344086022,
"grad_norm": 145.0,
"learning_rate": 1.494340953238399e-06,
"loss": 0.0495,
"num_input_tokens_seen": 3786240,
"step": 10050
},
{
"epoch": 9.009856630824373,
"grad_norm": 0.0189208984375,
"learning_rate": 1.481054916969221e-06,
"loss": 0.0004,
"num_input_tokens_seen": 3788032,
"step": 10055
},
{
"epoch": 9.014336917562725,
"grad_norm": 0.455078125,
"learning_rate": 1.4678264040401458e-06,
"loss": 0.1736,
"num_input_tokens_seen": 3789920,
"step": 10060
},
{
"epoch": 9.018817204301076,
"grad_norm": 61.75,
"learning_rate": 1.4546554468058665e-06,
"loss": 0.3808,
"num_input_tokens_seen": 3791616,
"step": 10065
},
{
"epoch": 9.023297491039427,
"grad_norm": 1.265625,
"learning_rate": 1.441542077480304e-06,
"loss": 0.1838,
"num_input_tokens_seen": 3793472,
"step": 10070
},
{
"epoch": 9.027777777777779,
"grad_norm": 94.0,
"learning_rate": 1.428486328136533e-06,
"loss": 0.3062,
"num_input_tokens_seen": 3795200,
"step": 10075
},
{
"epoch": 9.03225806451613,
"grad_norm": 12.5,
"learning_rate": 1.4154882307066907e-06,
"loss": 0.3556,
"num_input_tokens_seen": 3797152,
"step": 10080
},
{
"epoch": 9.03673835125448,
"grad_norm": 72.0,
"learning_rate": 1.402547816981914e-06,
"loss": 0.3525,
"num_input_tokens_seen": 3799008,
"step": 10085
},
{
"epoch": 9.04121863799283,
"grad_norm": 0.0133056640625,
"learning_rate": 1.3896651186122573e-06,
"loss": 0.1817,
"num_input_tokens_seen": 3801024,
"step": 10090
},
{
"epoch": 9.045698924731182,
"grad_norm": 0.11279296875,
"learning_rate": 1.3768401671066105e-06,
"loss": 0.0024,
"num_input_tokens_seen": 3803008,
"step": 10095
},
{
"epoch": 9.050179211469533,
"grad_norm": 0.0216064453125,
"learning_rate": 1.3640729938326213e-06,
"loss": 0.0014,
"num_input_tokens_seen": 3804992,
"step": 10100
},
{
"epoch": 9.054659498207885,
"grad_norm": 0.197265625,
"learning_rate": 1.351363630016622e-06,
"loss": 0.1154,
"num_input_tokens_seen": 3806848,
"step": 10105
},
{
"epoch": 9.059139784946236,
"grad_norm": 4.0,
"learning_rate": 1.3387121067435588e-06,
"loss": 0.0607,
"num_input_tokens_seen": 3808704,
"step": 10110
},
{
"epoch": 9.063620071684587,
"grad_norm": 126.5,
"learning_rate": 1.3261184549569066e-06,
"loss": 0.4665,
"num_input_tokens_seen": 3810528,
"step": 10115
},
{
"epoch": 9.068100358422939,
"grad_norm": 65.5,
"learning_rate": 1.3135827054585964e-06,
"loss": 0.2013,
"num_input_tokens_seen": 3812288,
"step": 10120
},
{
"epoch": 9.07258064516129,
"grad_norm": 41.0,
"learning_rate": 1.3011048889089355e-06,
"loss": 0.0779,
"num_input_tokens_seen": 3814048,
"step": 10125
},
{
"epoch": 9.077060931899641,
"grad_norm": 0.1640625,
"learning_rate": 1.288685035826548e-06,
"loss": 0.7961,
"num_input_tokens_seen": 3815840,
"step": 10130
},
{
"epoch": 9.081541218637993,
"grad_norm": 0.443359375,
"learning_rate": 1.2763231765882732e-06,
"loss": 0.0011,
"num_input_tokens_seen": 3817632,
"step": 10135
},
{
"epoch": 9.086021505376344,
"grad_norm": 57.5,
"learning_rate": 1.2640193414291262e-06,
"loss": 0.2171,
"num_input_tokens_seen": 3819456,
"step": 10140
},
{
"epoch": 9.090501792114695,
"grad_norm": 75.0,
"learning_rate": 1.2517735604421904e-06,
"loss": 0.1493,
"num_input_tokens_seen": 3821344,
"step": 10145
},
{
"epoch": 9.094982078853047,
"grad_norm": 0.0732421875,
"learning_rate": 1.2395858635785602e-06,
"loss": 0.3443,
"num_input_tokens_seen": 3823296,
"step": 10150
},
{
"epoch": 9.099462365591398,
"grad_norm": 100.0,
"learning_rate": 1.2274562806472794e-06,
"loss": 0.0363,
"num_input_tokens_seen": 3825184,
"step": 10155
},
{
"epoch": 9.10394265232975,
"grad_norm": 2.53125,
"learning_rate": 1.2153848413152341e-06,
"loss": 0.2804,
"num_input_tokens_seen": 3827296,
"step": 10160
},
{
"epoch": 9.1084229390681,
"grad_norm": 0.166015625,
"learning_rate": 1.2033715751071206e-06,
"loss": 0.0676,
"num_input_tokens_seen": 3828992,
"step": 10165
},
{
"epoch": 9.112903225806452,
"grad_norm": 92.0,
"learning_rate": 1.191416511405341e-06,
"loss": 0.0348,
"num_input_tokens_seen": 3830880,
"step": 10170
},
{
"epoch": 9.117383512544803,
"grad_norm": 0.01324462890625,
"learning_rate": 1.1795196794499475e-06,
"loss": 0.0034,
"num_input_tokens_seen": 3832864,
"step": 10175
},
{
"epoch": 9.121863799283155,
"grad_norm": 0.01318359375,
"learning_rate": 1.1676811083385698e-06,
"loss": 0.4059,
"num_input_tokens_seen": 3834752,
"step": 10180
},
{
"epoch": 9.126344086021506,
"grad_norm": 0.490234375,
"learning_rate": 1.155900827026346e-06,
"loss": 0.0006,
"num_input_tokens_seen": 3836640,
"step": 10185
},
{
"epoch": 9.130824372759857,
"grad_norm": 69.0,
"learning_rate": 1.1441788643258233e-06,
"loss": 0.4361,
"num_input_tokens_seen": 3838528,
"step": 10190
},
{
"epoch": 9.135304659498209,
"grad_norm": 0.043212890625,
"learning_rate": 1.1325152489069457e-06,
"loss": 0.031,
"num_input_tokens_seen": 3840320,
"step": 10195
},
{
"epoch": 9.13978494623656,
"grad_norm": 87.0,
"learning_rate": 1.1209100092969244e-06,
"loss": 0.1341,
"num_input_tokens_seen": 3842496,
"step": 10200
},
{
"epoch": 9.144265232974911,
"grad_norm": 0.234375,
"learning_rate": 1.109363173880204e-06,
"loss": 0.0241,
"num_input_tokens_seen": 3844352,
"step": 10205
},
{
"epoch": 9.14874551971326,
"grad_norm": 91.5,
"learning_rate": 1.0978747708983854e-06,
"loss": 0.1806,
"num_input_tokens_seen": 3846304,
"step": 10210
},
{
"epoch": 9.153225806451612,
"grad_norm": 0.427734375,
"learning_rate": 1.0864448284501394e-06,
"loss": 0.3956,
"num_input_tokens_seen": 3848192,
"step": 10215
},
{
"epoch": 9.157706093189963,
"grad_norm": 72.0,
"learning_rate": 1.0750733744911674e-06,
"loss": 0.3326,
"num_input_tokens_seen": 3850016,
"step": 10220
},
{
"epoch": 9.162186379928315,
"grad_norm": 0.2216796875,
"learning_rate": 1.063760436834113e-06,
"loss": 0.038,
"num_input_tokens_seen": 3852000,
"step": 10225
},
{
"epoch": 9.166666666666666,
"grad_norm": 9.6875,
"learning_rate": 1.0525060431484907e-06,
"loss": 0.0108,
"num_input_tokens_seen": 3853760,
"step": 10230
},
{
"epoch": 9.171146953405017,
"grad_norm": 0.0322265625,
"learning_rate": 1.0413102209606424e-06,
"loss": 0.2548,
"num_input_tokens_seen": 3855488,
"step": 10235
},
{
"epoch": 9.175627240143369,
"grad_norm": 9.5625,
"learning_rate": 1.0301729976536417e-06,
"loss": 0.032,
"num_input_tokens_seen": 3857536,
"step": 10240
},
{
"epoch": 9.18010752688172,
"grad_norm": 38.0,
"learning_rate": 1.0190944004672409e-06,
"loss": 0.2028,
"num_input_tokens_seen": 3859424,
"step": 10245
},
{
"epoch": 9.184587813620071,
"grad_norm": 0.0703125,
"learning_rate": 1.0080744564978068e-06,
"loss": 0.3178,
"num_input_tokens_seen": 3861248,
"step": 10250
},
{
"epoch": 9.189068100358423,
"grad_norm": 108.5,
"learning_rate": 9.971131926982458e-07,
"loss": 0.1893,
"num_input_tokens_seen": 3863168,
"step": 10255
},
{
"epoch": 9.193548387096774,
"grad_norm": 86.0,
"learning_rate": 9.86210635877949e-07,
"loss": 0.0418,
"num_input_tokens_seen": 3865312,
"step": 10260
},
{
"epoch": 9.198028673835125,
"grad_norm": 132.0,
"learning_rate": 9.753668127027133e-07,
"loss": 0.0594,
"num_input_tokens_seen": 3867328,
"step": 10265
},
{
"epoch": 9.202508960573477,
"grad_norm": 5.96875,
"learning_rate": 9.645817496946903e-07,
"loss": 0.2032,
"num_input_tokens_seen": 3869056,
"step": 10270
},
{
"epoch": 9.206989247311828,
"grad_norm": 1.4765625,
"learning_rate": 9.538554732323041e-07,
"loss": 0.0051,
"num_input_tokens_seen": 3870976,
"step": 10275
},
{
"epoch": 9.21146953405018,
"grad_norm": 0.016845703125,
"learning_rate": 9.431880095502027e-07,
"loss": 0.1691,
"num_input_tokens_seen": 3872960,
"step": 10280
},
{
"epoch": 9.21594982078853,
"grad_norm": 45.75,
"learning_rate": 9.325793847391962e-07,
"loss": 0.5655,
"num_input_tokens_seen": 3874976,
"step": 10285
},
{
"epoch": 9.220430107526882,
"grad_norm": 1.203125,
"learning_rate": 9.220296247461707e-07,
"loss": 0.0074,
"num_input_tokens_seen": 3876800,
"step": 10290
},
{
"epoch": 9.224910394265233,
"grad_norm": 1.0,
"learning_rate": 9.115387553740473e-07,
"loss": 0.0028,
"num_input_tokens_seen": 3878560,
"step": 10295
},
{
"epoch": 9.229390681003585,
"grad_norm": 70.0,
"learning_rate": 9.011068022817065e-07,
"loss": 0.1634,
"num_input_tokens_seen": 3880544,
"step": 10300
},
{
"epoch": 9.233870967741936,
"grad_norm": 0.0147705078125,
"learning_rate": 8.907337909839275e-07,
"loss": 0.0151,
"num_input_tokens_seen": 3882368,
"step": 10305
},
{
"epoch": 9.238351254480287,
"grad_norm": 358.0,
"learning_rate": 8.804197468513436e-07,
"loss": 0.1654,
"num_input_tokens_seen": 3884288,
"step": 10310
},
{
"epoch": 9.242831541218639,
"grad_norm": 79.5,
"learning_rate": 8.701646951103425e-07,
"loss": 0.6861,
"num_input_tokens_seen": 3886176,
"step": 10315
},
{
"epoch": 9.24731182795699,
"grad_norm": 121.0,
"learning_rate": 8.599686608430413e-07,
"loss": 0.279,
"num_input_tokens_seen": 3888192,
"step": 10320
},
{
"epoch": 9.251792114695341,
"grad_norm": 0.02001953125,
"learning_rate": 8.498316689872055e-07,
"loss": 0.0169,
"num_input_tokens_seen": 3890048,
"step": 10325
},
{
"epoch": 9.256272401433693,
"grad_norm": 0.0654296875,
"learning_rate": 8.397537443361913e-07,
"loss": 0.0013,
"num_input_tokens_seen": 3891968,
"step": 10330
},
{
"epoch": 9.260752688172044,
"grad_norm": 0.0201416015625,
"learning_rate": 8.297349115388903e-07,
"loss": 0.001,
"num_input_tokens_seen": 3893696,
"step": 10335
},
{
"epoch": 9.265232974910393,
"grad_norm": 0.03662109375,
"learning_rate": 8.197751950996619e-07,
"loss": 0.528,
"num_input_tokens_seen": 3895616,
"step": 10340
},
{
"epoch": 9.269713261648745,
"grad_norm": 0.578125,
"learning_rate": 8.098746193782813e-07,
"loss": 0.2667,
"num_input_tokens_seen": 3897472,
"step": 10345
},
{
"epoch": 9.274193548387096,
"grad_norm": 0.016357421875,
"learning_rate": 8.00033208589876e-07,
"loss": 0.0188,
"num_input_tokens_seen": 3899264,
"step": 10350
},
{
"epoch": 9.278673835125447,
"grad_norm": 141.0,
"learning_rate": 7.902509868048552e-07,
"loss": 0.7039,
"num_input_tokens_seen": 3901376,
"step": 10355
},
{
"epoch": 9.283154121863799,
"grad_norm": 60.0,
"learning_rate": 7.805279779488722e-07,
"loss": 0.4773,
"num_input_tokens_seen": 3903328,
"step": 10360
},
{
"epoch": 9.28763440860215,
"grad_norm": 83.0,
"learning_rate": 7.708642058027571e-07,
"loss": 0.458,
"num_input_tokens_seen": 3905312,
"step": 10365
},
{
"epoch": 9.292114695340501,
"grad_norm": 0.047119140625,
"learning_rate": 7.61259694002453e-07,
"loss": 0.2556,
"num_input_tokens_seen": 3907200,
"step": 10370
},
{
"epoch": 9.296594982078853,
"grad_norm": 0.52734375,
"learning_rate": 7.51714466038958e-07,
"loss": 0.1282,
"num_input_tokens_seen": 3909248,
"step": 10375
},
{
"epoch": 9.301075268817204,
"grad_norm": 48.0,
"learning_rate": 7.422285452582805e-07,
"loss": 0.3351,
"num_input_tokens_seen": 3911168,
"step": 10380
},
{
"epoch": 9.305555555555555,
"grad_norm": 0.1474609375,
"learning_rate": 7.328019548613619e-07,
"loss": 0.0037,
"num_input_tokens_seen": 3912992,
"step": 10385
},
{
"epoch": 9.310035842293907,
"grad_norm": 0.00762939453125,
"learning_rate": 7.234347179040507e-07,
"loss": 0.1811,
"num_input_tokens_seen": 3914784,
"step": 10390
},
{
"epoch": 9.314516129032258,
"grad_norm": 0.53125,
"learning_rate": 7.141268572970094e-07,
"loss": 0.082,
"num_input_tokens_seen": 3916896,
"step": 10395
},
{
"epoch": 9.31899641577061,
"grad_norm": 46.5,
"learning_rate": 7.048783958056804e-07,
"loss": 0.2368,
"num_input_tokens_seen": 3918688,
"step": 10400
},
{
"epoch": 9.32347670250896,
"grad_norm": 103.5,
"learning_rate": 6.956893560502359e-07,
"loss": 0.8466,
"num_input_tokens_seen": 3920512,
"step": 10405
},
{
"epoch": 9.327956989247312,
"grad_norm": 52.25,
"learning_rate": 6.865597605054952e-07,
"loss": 0.5276,
"num_input_tokens_seen": 3922304,
"step": 10410
},
{
"epoch": 9.332437275985663,
"grad_norm": 2.171875,
"learning_rate": 6.774896315008994e-07,
"loss": 0.0663,
"num_input_tokens_seen": 3924384,
"step": 10415
},
{
"epoch": 9.336917562724015,
"grad_norm": 7.5,
"learning_rate": 6.68478991220442e-07,
"loss": 0.3007,
"num_input_tokens_seen": 3926368,
"step": 10420
},
{
"epoch": 9.341397849462366,
"grad_norm": 120.5,
"learning_rate": 6.595278617026163e-07,
"loss": 0.4648,
"num_input_tokens_seen": 3928288,
"step": 10425
},
{
"epoch": 9.345878136200717,
"grad_norm": 0.027587890625,
"learning_rate": 6.50636264840368e-07,
"loss": 0.0126,
"num_input_tokens_seen": 3930176,
"step": 10430
},
{
"epoch": 9.350358422939069,
"grad_norm": 0.07568359375,
"learning_rate": 6.418042223810234e-07,
"loss": 0.0108,
"num_input_tokens_seen": 3932224,
"step": 10435
},
{
"epoch": 9.35483870967742,
"grad_norm": 0.05029296875,
"learning_rate": 6.33031755926261e-07,
"loss": 0.0079,
"num_input_tokens_seen": 3934112,
"step": 10440
},
{
"epoch": 9.359318996415771,
"grad_norm": 7.0625,
"learning_rate": 6.243188869320377e-07,
"loss": 0.1784,
"num_input_tokens_seen": 3936096,
"step": 10445
},
{
"epoch": 9.363799283154123,
"grad_norm": 0.07666015625,
"learning_rate": 6.156656367085539e-07,
"loss": 0.2174,
"num_input_tokens_seen": 3937952,
"step": 10450
},
{
"epoch": 9.368279569892474,
"grad_norm": 8.875,
"learning_rate": 6.070720264201857e-07,
"loss": 0.0036,
"num_input_tokens_seen": 3939872,
"step": 10455
},
{
"epoch": 9.372759856630825,
"grad_norm": 1.3203125,
"learning_rate": 5.985380770854476e-07,
"loss": 0.101,
"num_input_tokens_seen": 3941920,
"step": 10460
},
{
"epoch": 9.377240143369175,
"grad_norm": 90.0,
"learning_rate": 5.900638095769185e-07,
"loss": 0.4119,
"num_input_tokens_seen": 3943712,
"step": 10465
},
{
"epoch": 9.381720430107526,
"grad_norm": 0.01336669921875,
"learning_rate": 5.816492446212213e-07,
"loss": 0.2044,
"num_input_tokens_seen": 3945440,
"step": 10470
},
{
"epoch": 9.386200716845877,
"grad_norm": 43.25,
"learning_rate": 5.732944027989518e-07,
"loss": 0.0522,
"num_input_tokens_seen": 3947296,
"step": 10475
},
{
"epoch": 9.390681003584229,
"grad_norm": 0.06982421875,
"learning_rate": 5.649993045446305e-07,
"loss": 0.419,
"num_input_tokens_seen": 3949152,
"step": 10480
},
{
"epoch": 9.39516129032258,
"grad_norm": 0.65625,
"learning_rate": 5.56763970146662e-07,
"loss": 0.2298,
"num_input_tokens_seen": 3951008,
"step": 10485
},
{
"epoch": 9.399641577060931,
"grad_norm": 10.8125,
"learning_rate": 5.485884197472646e-07,
"loss": 0.1155,
"num_input_tokens_seen": 3952960,
"step": 10490
},
{
"epoch": 9.404121863799283,
"grad_norm": 63.75,
"learning_rate": 5.404726733424514e-07,
"loss": 0.2225,
"num_input_tokens_seen": 3954752,
"step": 10495
},
{
"epoch": 9.408602150537634,
"grad_norm": 70.5,
"learning_rate": 5.324167507819555e-07,
"loss": 0.1945,
"num_input_tokens_seen": 3956736,
"step": 10500
},
{
"epoch": 9.413082437275985,
"grad_norm": 4.25,
"learning_rate": 5.244206717691908e-07,
"loss": 0.2786,
"num_input_tokens_seen": 3958528,
"step": 10505
},
{
"epoch": 9.417562724014337,
"grad_norm": 133.0,
"learning_rate": 5.164844558612131e-07,
"loss": 0.3995,
"num_input_tokens_seen": 3960672,
"step": 10510
},
{
"epoch": 9.422043010752688,
"grad_norm": 40.5,
"learning_rate": 5.086081224686512e-07,
"loss": 0.4284,
"num_input_tokens_seen": 3962752,
"step": 10515
},
{
"epoch": 9.42652329749104,
"grad_norm": 0.609375,
"learning_rate": 5.007916908556814e-07,
"loss": 0.0029,
"num_input_tokens_seen": 3964576,
"step": 10520
},
{
"epoch": 9.43100358422939,
"grad_norm": 70.5,
"learning_rate": 4.930351801399641e-07,
"loss": 0.0306,
"num_input_tokens_seen": 3966432,
"step": 10525
},
{
"epoch": 9.435483870967742,
"grad_norm": 0.0107421875,
"learning_rate": 4.853386092926044e-07,
"loss": 0.2041,
"num_input_tokens_seen": 3968256,
"step": 10530
},
{
"epoch": 9.439964157706093,
"grad_norm": 34.0,
"learning_rate": 4.77701997138108e-07,
"loss": 0.0903,
"num_input_tokens_seen": 3970048,
"step": 10535
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.23828125,
"learning_rate": 4.701253623543289e-07,
"loss": 0.3175,
"num_input_tokens_seen": 3971840,
"step": 10540
},
{
"epoch": 9.448924731182796,
"grad_norm": 71.5,
"learning_rate": 4.626087234724269e-07,
"loss": 0.1146,
"num_input_tokens_seen": 3973760,
"step": 10545
},
{
"epoch": 9.453405017921147,
"grad_norm": 5.78125,
"learning_rate": 4.5515209887682096e-07,
"loss": 0.2171,
"num_input_tokens_seen": 3975520,
"step": 10550
},
{
"epoch": 9.457885304659499,
"grad_norm": 41.0,
"learning_rate": 4.477555068051476e-07,
"loss": 0.0964,
"num_input_tokens_seen": 3977376,
"step": 10555
},
{
"epoch": 9.46236559139785,
"grad_norm": 2.203125,
"learning_rate": 4.40418965348216e-07,
"loss": 0.4259,
"num_input_tokens_seen": 3979232,
"step": 10560
},
{
"epoch": 9.466845878136201,
"grad_norm": 0.2275390625,
"learning_rate": 4.3314249244995884e-07,
"loss": 0.4276,
"num_input_tokens_seen": 3981024,
"step": 10565
},
{
"epoch": 9.471326164874553,
"grad_norm": 5.96875,
"learning_rate": 4.259261059073871e-07,
"loss": 0.2273,
"num_input_tokens_seen": 3982816,
"step": 10570
},
{
"epoch": 9.475806451612904,
"grad_norm": 0.0277099609375,
"learning_rate": 4.1876982337055725e-07,
"loss": 0.005,
"num_input_tokens_seen": 3984736,
"step": 10575
},
{
"epoch": 9.480286738351255,
"grad_norm": 36.5,
"learning_rate": 4.1167366234251824e-07,
"loss": 0.4583,
"num_input_tokens_seen": 3986816,
"step": 10580
},
{
"epoch": 9.484767025089607,
"grad_norm": 132.0,
"learning_rate": 4.0463764017927565e-07,
"loss": 0.7429,
"num_input_tokens_seen": 3988640,
"step": 10585
},
{
"epoch": 9.489247311827956,
"grad_norm": 0.034423828125,
"learning_rate": 3.976617740897415e-07,
"loss": 0.0079,
"num_input_tokens_seen": 3990560,
"step": 10590
},
{
"epoch": 9.493727598566307,
"grad_norm": 0.11865234375,
"learning_rate": 3.907460811356956e-07,
"loss": 0.1029,
"num_input_tokens_seen": 3992416,
"step": 10595
},
{
"epoch": 9.498207885304659,
"grad_norm": 10.5,
"learning_rate": 3.8389057823175754e-07,
"loss": 0.0945,
"num_input_tokens_seen": 3994240,
"step": 10600
},
{
"epoch": 9.5,
"eval_loss": 0.2852665185928345,
"eval_runtime": 9.677,
"eval_samples_per_second": 51.256,
"eval_steps_per_second": 12.814,
"num_input_tokens_seen": 3994976,
"step": 10602
},
{
"epoch": 9.50268817204301,
"grad_norm": 3.203125,
"learning_rate": 3.7709528214530664e-07,
"loss": 0.7232,
"num_input_tokens_seen": 3996192,
"step": 10605
},
{
"epoch": 9.507168458781361,
"grad_norm": 103.0,
"learning_rate": 3.7036020949648974e-07,
"loss": 0.5027,
"num_input_tokens_seen": 3998144,
"step": 10610
},
{
"epoch": 9.511648745519713,
"grad_norm": 134.0,
"learning_rate": 3.636853767581494e-07,
"loss": 0.2485,
"num_input_tokens_seen": 4000160,
"step": 10615
},
{
"epoch": 9.516129032258064,
"grad_norm": 62.0,
"learning_rate": 3.5707080025579045e-07,
"loss": 0.2088,
"num_input_tokens_seen": 4001888,
"step": 10620
},
{
"epoch": 9.520609318996415,
"grad_norm": 101.5,
"learning_rate": 3.5051649616754114e-07,
"loss": 0.0706,
"num_input_tokens_seen": 4003680,
"step": 10625
},
{
"epoch": 9.525089605734767,
"grad_norm": 95.5,
"learning_rate": 3.440224805241171e-07,
"loss": 0.3229,
"num_input_tokens_seen": 4005632,
"step": 10630
},
{
"epoch": 9.529569892473118,
"grad_norm": 69.5,
"learning_rate": 3.3758876920877147e-07,
"loss": 0.1617,
"num_input_tokens_seen": 4007296,
"step": 10635
},
{
"epoch": 9.53405017921147,
"grad_norm": 153.0,
"learning_rate": 3.312153779572724e-07,
"loss": 0.1274,
"num_input_tokens_seen": 4009120,
"step": 10640
},
{
"epoch": 9.53853046594982,
"grad_norm": 0.72265625,
"learning_rate": 3.249023223578479e-07,
"loss": 0.1744,
"num_input_tokens_seen": 4010944,
"step": 10645
},
{
"epoch": 9.543010752688172,
"grad_norm": 1.9140625,
"learning_rate": 3.1864961785116054e-07,
"loss": 0.0018,
"num_input_tokens_seen": 4012832,
"step": 10650
},
{
"epoch": 9.547491039426523,
"grad_norm": 0.01904296875,
"learning_rate": 3.124572797302661e-07,
"loss": 0.1629,
"num_input_tokens_seen": 4014688,
"step": 10655
},
{
"epoch": 9.551971326164875,
"grad_norm": 0.1904296875,
"learning_rate": 3.063253231405605e-07,
"loss": 0.0995,
"num_input_tokens_seen": 4016512,
"step": 10660
},
{
"epoch": 9.556451612903226,
"grad_norm": 5.9375,
"learning_rate": 3.002537630797747e-07,
"loss": 0.2074,
"num_input_tokens_seen": 4018368,
"step": 10665
},
{
"epoch": 9.560931899641577,
"grad_norm": 0.1171875,
"learning_rate": 2.9424261439791323e-07,
"loss": 0.0005,
"num_input_tokens_seen": 4020096,
"step": 10670
},
{
"epoch": 9.565412186379929,
"grad_norm": 18.875,
"learning_rate": 2.8829189179721547e-07,
"loss": 0.0873,
"num_input_tokens_seen": 4022048,
"step": 10675
},
{
"epoch": 9.56989247311828,
"grad_norm": 38.0,
"learning_rate": 2.824016098321447e-07,
"loss": 0.2068,
"num_input_tokens_seen": 4023936,
"step": 10680
},
{
"epoch": 9.574372759856631,
"grad_norm": 3.03125,
"learning_rate": 2.7657178290932396e-07,
"loss": 0.0048,
"num_input_tokens_seen": 4025984,
"step": 10685
},
{
"epoch": 9.578853046594983,
"grad_norm": 0.055419921875,
"learning_rate": 2.7080242528751964e-07,
"loss": 0.1648,
"num_input_tokens_seen": 4027808,
"step": 10690
},
{
"epoch": 9.583333333333334,
"grad_norm": 0.82421875,
"learning_rate": 2.650935510776026e-07,
"loss": 0.0646,
"num_input_tokens_seen": 4029632,
"step": 10695
},
{
"epoch": 9.587813620071685,
"grad_norm": 0.11865234375,
"learning_rate": 2.594451742425036e-07,
"loss": 0.0685,
"num_input_tokens_seen": 4031520,
"step": 10700
},
{
"epoch": 9.592293906810037,
"grad_norm": 67.5,
"learning_rate": 2.538573085971968e-07,
"loss": 0.5627,
"num_input_tokens_seen": 4033568,
"step": 10705
},
{
"epoch": 9.596774193548388,
"grad_norm": 0.011962890625,
"learning_rate": 2.4832996780864704e-07,
"loss": 0.2646,
"num_input_tokens_seen": 4035424,
"step": 10710
},
{
"epoch": 9.601254480286737,
"grad_norm": 0.0179443359375,
"learning_rate": 2.42863165395793e-07,
"loss": 0.1114,
"num_input_tokens_seen": 4037376,
"step": 10715
},
{
"epoch": 9.60573476702509,
"grad_norm": 0.1259765625,
"learning_rate": 2.3745691472950026e-07,
"loss": 0.2382,
"num_input_tokens_seen": 4039264,
"step": 10720
},
{
"epoch": 9.61021505376344,
"grad_norm": 40.0,
"learning_rate": 2.3211122903254167e-07,
"loss": 0.2182,
"num_input_tokens_seen": 4040992,
"step": 10725
},
{
"epoch": 9.614695340501791,
"grad_norm": 36.0,
"learning_rate": 2.2682612137955307e-07,
"loss": 0.4744,
"num_input_tokens_seen": 4042848,
"step": 10730
},
{
"epoch": 9.619175627240143,
"grad_norm": 0.0274658203125,
"learning_rate": 2.2160160469701097e-07,
"loss": 0.0505,
"num_input_tokens_seen": 4044608,
"step": 10735
},
{
"epoch": 9.623655913978494,
"grad_norm": 0.07958984375,
"learning_rate": 2.1643769176319385e-07,
"loss": 0.0015,
"num_input_tokens_seen": 4046528,
"step": 10740
},
{
"epoch": 9.628136200716845,
"grad_norm": 48.0,
"learning_rate": 2.1133439520815423e-07,
"loss": 0.4439,
"num_input_tokens_seen": 4048448,
"step": 10745
},
{
"epoch": 9.632616487455197,
"grad_norm": 0.69140625,
"learning_rate": 2.062917275136883e-07,
"loss": 0.0023,
"num_input_tokens_seen": 4050304,
"step": 10750
},
{
"epoch": 9.637096774193548,
"grad_norm": 20.0,
"learning_rate": 2.0130970101330527e-07,
"loss": 0.1473,
"num_input_tokens_seen": 4052224,
"step": 10755
},
{
"epoch": 9.6415770609319,
"grad_norm": 44.25,
"learning_rate": 1.963883278921913e-07,
"loss": 0.5517,
"num_input_tokens_seen": 4054208,
"step": 10760
},
{
"epoch": 9.64605734767025,
"grad_norm": 3.109375,
"learning_rate": 1.9152762018719017e-07,
"loss": 0.2484,
"num_input_tokens_seen": 4056192,
"step": 10765
},
{
"epoch": 9.650537634408602,
"grad_norm": 0.033447265625,
"learning_rate": 1.867275897867643e-07,
"loss": 0.006,
"num_input_tokens_seen": 4058208,
"step": 10770
},
{
"epoch": 9.655017921146953,
"grad_norm": 0.08349609375,
"learning_rate": 1.819882484309754e-07,
"loss": 0.3302,
"num_input_tokens_seen": 4060096,
"step": 10775
},
{
"epoch": 9.659498207885305,
"grad_norm": 0.01434326171875,
"learning_rate": 1.773096077114428e-07,
"loss": 0.3185,
"num_input_tokens_seen": 4062016,
"step": 10780
},
{
"epoch": 9.663978494623656,
"grad_norm": 0.0966796875,
"learning_rate": 1.7269167907132954e-07,
"loss": 0.1579,
"num_input_tokens_seen": 4063808,
"step": 10785
},
{
"epoch": 9.668458781362007,
"grad_norm": 109.0,
"learning_rate": 1.681344738053009e-07,
"loss": 0.2284,
"num_input_tokens_seen": 4065600,
"step": 10790
},
{
"epoch": 9.672939068100359,
"grad_norm": 0.039306640625,
"learning_rate": 1.636380030595075e-07,
"loss": 0.0009,
"num_input_tokens_seen": 4067488,
"step": 10795
},
{
"epoch": 9.67741935483871,
"grad_norm": 0.333984375,
"learning_rate": 1.5920227783155217e-07,
"loss": 0.0465,
"num_input_tokens_seen": 4069312,
"step": 10800
},
{
"epoch": 9.681899641577061,
"grad_norm": 9.9375,
"learning_rate": 1.5482730897046216e-07,
"loss": 0.0419,
"num_input_tokens_seen": 4071104,
"step": 10805
},
{
"epoch": 9.686379928315413,
"grad_norm": 21.875,
"learning_rate": 1.5051310717666967e-07,
"loss": 0.0167,
"num_input_tokens_seen": 4073184,
"step": 10810
},
{
"epoch": 9.690860215053764,
"grad_norm": 0.0267333984375,
"learning_rate": 1.4625968300197857e-07,
"loss": 0.0867,
"num_input_tokens_seen": 4075072,
"step": 10815
},
{
"epoch": 9.695340501792115,
"grad_norm": 68.5,
"learning_rate": 1.4206704684953943e-07,
"loss": 0.1641,
"num_input_tokens_seen": 4077024,
"step": 10820
},
{
"epoch": 9.699820788530467,
"grad_norm": 64.5,
"learning_rate": 1.3793520897383006e-07,
"loss": 0.028,
"num_input_tokens_seen": 4078944,
"step": 10825
},
{
"epoch": 9.704301075268818,
"grad_norm": 72.5,
"learning_rate": 1.3386417948061947e-07,
"loss": 0.4939,
"num_input_tokens_seen": 4080704,
"step": 10830
},
{
"epoch": 9.70878136200717,
"grad_norm": 0.0216064453125,
"learning_rate": 1.2985396832695674e-07,
"loss": 0.292,
"num_input_tokens_seen": 4082432,
"step": 10835
},
{
"epoch": 9.713261648745519,
"grad_norm": 0.042724609375,
"learning_rate": 1.259045853211349e-07,
"loss": 0.0025,
"num_input_tokens_seen": 4084320,
"step": 10840
},
{
"epoch": 9.717741935483872,
"grad_norm": 0.0179443359375,
"learning_rate": 1.2201604012267442e-07,
"loss": 0.3506,
"num_input_tokens_seen": 4086240,
"step": 10845
},
{
"epoch": 9.722222222222221,
"grad_norm": 122.0,
"learning_rate": 1.1818834224229525e-07,
"loss": 0.4556,
"num_input_tokens_seen": 4088096,
"step": 10850
},
{
"epoch": 9.726702508960573,
"grad_norm": 4.71875,
"learning_rate": 1.1442150104189198e-07,
"loss": 0.1001,
"num_input_tokens_seen": 4089888,
"step": 10855
},
{
"epoch": 9.731182795698924,
"grad_norm": 99.0,
"learning_rate": 1.1071552573452271e-07,
"loss": 0.0658,
"num_input_tokens_seen": 4091744,
"step": 10860
},
{
"epoch": 9.735663082437275,
"grad_norm": 0.380859375,
"learning_rate": 1.0707042538437018e-07,
"loss": 0.3701,
"num_input_tokens_seen": 4093408,
"step": 10865
},
{
"epoch": 9.740143369175627,
"grad_norm": 0.1982421875,
"learning_rate": 1.0348620890673067e-07,
"loss": 0.0144,
"num_input_tokens_seen": 4095392,
"step": 10870
},
{
"epoch": 9.744623655913978,
"grad_norm": 70.5,
"learning_rate": 9.9962885067989e-08,
"loss": 0.1781,
"num_input_tokens_seen": 4097280,
"step": 10875
},
{
"epoch": 9.74910394265233,
"grad_norm": 104.5,
"learning_rate": 9.650046248559363e-08,
"loss": 0.3003,
"num_input_tokens_seen": 4099360,
"step": 10880
},
{
"epoch": 9.75358422939068,
"grad_norm": 96.0,
"learning_rate": 9.309894962804267e-08,
"loss": 0.2663,
"num_input_tokens_seen": 4101376,
"step": 10885
},
{
"epoch": 9.758064516129032,
"grad_norm": 62.5,
"learning_rate": 8.975835481485895e-08,
"loss": 0.0598,
"num_input_tokens_seen": 4103296,
"step": 10890
},
{
"epoch": 9.762544802867383,
"grad_norm": 2.171875,
"learning_rate": 8.647868621656785e-08,
"loss": 0.0108,
"num_input_tokens_seen": 4105248,
"step": 10895
},
{
"epoch": 9.767025089605735,
"grad_norm": 0.022705078125,
"learning_rate": 8.325995185468339e-08,
"loss": 0.0891,
"num_input_tokens_seen": 4107072,
"step": 10900
},
{
"epoch": 9.771505376344086,
"grad_norm": 0.0157470703125,
"learning_rate": 8.010215960168044e-08,
"loss": 0.1834,
"num_input_tokens_seen": 4108768,
"step": 10905
},
{
"epoch": 9.775985663082437,
"grad_norm": 0.0201416015625,
"learning_rate": 7.700531718098092e-08,
"loss": 0.2153,
"num_input_tokens_seen": 4110624,
"step": 10910
},
{
"epoch": 9.780465949820789,
"grad_norm": 0.01458740234375,
"learning_rate": 7.396943216693708e-08,
"loss": 0.08,
"num_input_tokens_seen": 4112352,
"step": 10915
},
{
"epoch": 9.78494623655914,
"grad_norm": 0.099609375,
"learning_rate": 7.099451198480378e-08,
"loss": 0.0142,
"num_input_tokens_seen": 4114144,
"step": 10920
},
{
"epoch": 9.789426523297491,
"grad_norm": 55.5,
"learning_rate": 6.808056391073569e-08,
"loss": 0.5995,
"num_input_tokens_seen": 4115872,
"step": 10925
},
{
"epoch": 9.793906810035843,
"grad_norm": 152.0,
"learning_rate": 6.522759507175124e-08,
"loss": 0.6548,
"num_input_tokens_seen": 4117984,
"step": 10930
},
{
"epoch": 9.798387096774194,
"grad_norm": 0.00994873046875,
"learning_rate": 6.243561244572427e-08,
"loss": 0.3448,
"num_input_tokens_seen": 4119968,
"step": 10935
},
{
"epoch": 9.802867383512545,
"grad_norm": 118.5,
"learning_rate": 5.970462286137291e-08,
"loss": 0.2611,
"num_input_tokens_seen": 4122048,
"step": 10940
},
{
"epoch": 9.807347670250897,
"grad_norm": 147.0,
"learning_rate": 5.7034632998231865e-08,
"loss": 0.3511,
"num_input_tokens_seen": 4124032,
"step": 10945
},
{
"epoch": 9.811827956989248,
"grad_norm": 59.75,
"learning_rate": 5.4425649386644075e-08,
"loss": 0.1356,
"num_input_tokens_seen": 4125984,
"step": 10950
},
{
"epoch": 9.8163082437276,
"grad_norm": 6.84375,
"learning_rate": 5.187767840773849e-08,
"loss": 0.1335,
"num_input_tokens_seen": 4128032,
"step": 10955
},
{
"epoch": 9.82078853046595,
"grad_norm": 1.0390625,
"learning_rate": 4.939072629341901e-08,
"loss": 0.0812,
"num_input_tokens_seen": 4129792,
"step": 10960
},
{
"epoch": 9.825268817204302,
"grad_norm": 0.033203125,
"learning_rate": 4.696479912634499e-08,
"loss": 0.0422,
"num_input_tokens_seen": 4131808,
"step": 10965
},
{
"epoch": 9.829749103942653,
"grad_norm": 40.25,
"learning_rate": 4.459990283992577e-08,
"loss": 0.1873,
"num_input_tokens_seen": 4133696,
"step": 10970
},
{
"epoch": 9.834229390681003,
"grad_norm": 0.019775390625,
"learning_rate": 4.229604321829561e-08,
"loss": 0.1593,
"num_input_tokens_seen": 4135616,
"step": 10975
},
{
"epoch": 9.838709677419354,
"grad_norm": 75.0,
"learning_rate": 4.0053225896299894e-08,
"loss": 0.4229,
"num_input_tokens_seen": 4137472,
"step": 10980
},
{
"epoch": 9.843189964157705,
"grad_norm": 27.125,
"learning_rate": 3.787145635948952e-08,
"loss": 0.0399,
"num_input_tokens_seen": 4139328,
"step": 10985
},
{
"epoch": 9.847670250896057,
"grad_norm": 154.0,
"learning_rate": 3.575073994410427e-08,
"loss": 0.39,
"num_input_tokens_seen": 4141216,
"step": 10990
},
{
"epoch": 9.852150537634408,
"grad_norm": 9.6875,
"learning_rate": 3.369108183705339e-08,
"loss": 0.2978,
"num_input_tokens_seen": 4142976,
"step": 10995
},
{
"epoch": 9.85663082437276,
"grad_norm": 37.25,
"learning_rate": 3.169248707590999e-08,
"loss": 0.0482,
"num_input_tokens_seen": 4144672,
"step": 11000
},
{
"epoch": 9.86111111111111,
"grad_norm": 0.048828125,
"learning_rate": 2.975496054889726e-08,
"loss": 0.0058,
"num_input_tokens_seen": 4146496,
"step": 11005
},
{
"epoch": 9.865591397849462,
"grad_norm": 74.0,
"learning_rate": 2.7878506994877263e-08,
"loss": 0.0458,
"num_input_tokens_seen": 4148320,
"step": 11010
},
{
"epoch": 9.870071684587813,
"grad_norm": 0.169921875,
"learning_rate": 2.6063131003337126e-08,
"loss": 0.0688,
"num_input_tokens_seen": 4150176,
"step": 11015
},
{
"epoch": 9.874551971326165,
"grad_norm": 37.25,
"learning_rate": 2.4308837014372366e-08,
"loss": 0.3462,
"num_input_tokens_seen": 4152000,
"step": 11020
},
{
"epoch": 9.879032258064516,
"grad_norm": 3.6875,
"learning_rate": 2.2615629318692434e-08,
"loss": 0.2559,
"num_input_tokens_seen": 4153984,
"step": 11025
},
{
"epoch": 9.883512544802867,
"grad_norm": 0.419921875,
"learning_rate": 2.0983512057595743e-08,
"loss": 0.1873,
"num_input_tokens_seen": 4155904,
"step": 11030
},
{
"epoch": 9.887992831541219,
"grad_norm": 1.5546875,
"learning_rate": 1.941248922296135e-08,
"loss": 0.2147,
"num_input_tokens_seen": 4157760,
"step": 11035
},
{
"epoch": 9.89247311827957,
"grad_norm": 0.05712890625,
"learning_rate": 1.7902564657246158e-08,
"loss": 0.0044,
"num_input_tokens_seen": 4159584,
"step": 11040
},
{
"epoch": 9.896953405017921,
"grad_norm": 0.0888671875,
"learning_rate": 1.6453742053465504e-08,
"loss": 0.0026,
"num_input_tokens_seen": 4161472,
"step": 11045
},
{
"epoch": 9.901433691756273,
"grad_norm": 1.765625,
"learning_rate": 1.506602495519316e-08,
"loss": 0.2868,
"num_input_tokens_seen": 4163328,
"step": 11050
},
{
"epoch": 9.905913978494624,
"grad_norm": 0.040283203125,
"learning_rate": 1.3739416756555768e-08,
"loss": 0.0333,
"num_input_tokens_seen": 4165376,
"step": 11055
},
{
"epoch": 9.910394265232975,
"grad_norm": 0.0703125,
"learning_rate": 1.2473920702202325e-08,
"loss": 0.0824,
"num_input_tokens_seen": 4167168,
"step": 11060
},
{
"epoch": 9.914874551971327,
"grad_norm": 71.0,
"learning_rate": 1.126953988732915e-08,
"loss": 0.3288,
"num_input_tokens_seen": 4169056,
"step": 11065
},
{
"epoch": 9.919354838709678,
"grad_norm": 82.0,
"learning_rate": 1.0126277257641037e-08,
"loss": 0.6047,
"num_input_tokens_seen": 4170976,
"step": 11070
},
{
"epoch": 9.92383512544803,
"grad_norm": 0.11572265625,
"learning_rate": 9.044135609365124e-09,
"loss": 0.4591,
"num_input_tokens_seen": 4172704,
"step": 11075
},
{
"epoch": 9.92831541218638,
"grad_norm": 0.01251220703125,
"learning_rate": 8.023117589237017e-09,
"loss": 0.0314,
"num_input_tokens_seen": 4174688,
"step": 11080
},
{
"epoch": 9.932795698924732,
"grad_norm": 3.40625,
"learning_rate": 7.06322569449247e-09,
"loss": 0.2823,
"num_input_tokens_seen": 4176480,
"step": 11085
},
{
"epoch": 9.937275985663083,
"grad_norm": 82.0,
"learning_rate": 6.164462272864602e-09,
"loss": 0.2021,
"num_input_tokens_seen": 4178432,
"step": 11090
},
{
"epoch": 9.941756272401435,
"grad_norm": 58.0,
"learning_rate": 5.326829522578347e-09,
"loss": 0.2947,
"num_input_tokens_seen": 4180256,
"step": 11095
},
{
"epoch": 9.946236559139784,
"grad_norm": 0.060791015625,
"learning_rate": 4.5503294923338044e-09,
"loss": 0.1379,
"num_input_tokens_seen": 4182144,
"step": 11100
},
{
"epoch": 9.950716845878135,
"grad_norm": 0.08251953125,
"learning_rate": 3.834964081325665e-09,
"loss": 0.4357,
"num_input_tokens_seen": 4184064,
"step": 11105
},
{
"epoch": 9.955197132616487,
"grad_norm": 0.333984375,
"learning_rate": 3.1807350392099033e-09,
"loss": 0.1968,
"num_input_tokens_seen": 4185888,
"step": 11110
},
{
"epoch": 9.959677419354838,
"grad_norm": 191.0,
"learning_rate": 2.58764396612321e-09,
"loss": 0.0913,
"num_input_tokens_seen": 4187808,
"step": 11115
},
{
"epoch": 9.96415770609319,
"grad_norm": 20.25,
"learning_rate": 2.0556923126663353e-09,
"loss": 0.2123,
"num_input_tokens_seen": 4189440,
"step": 11120
},
{
"epoch": 9.96863799283154,
"grad_norm": 62.25,
"learning_rate": 1.5848813798985396e-09,
"loss": 0.3668,
"num_input_tokens_seen": 4191328,
"step": 11125
},
{
"epoch": 9.973118279569892,
"grad_norm": 1.7890625,
"learning_rate": 1.1752123193459197e-09,
"loss": 0.4359,
"num_input_tokens_seen": 4193120,
"step": 11130
},
{
"epoch": 9.977598566308243,
"grad_norm": 80.5,
"learning_rate": 8.266861329903064e-10,
"loss": 0.0269,
"num_input_tokens_seen": 4195200,
"step": 11135
},
{
"epoch": 9.982078853046595,
"grad_norm": 72.5,
"learning_rate": 5.393036732637136e-10,
"loss": 0.2249,
"num_input_tokens_seen": 4197024,
"step": 11140
},
{
"epoch": 9.986559139784946,
"grad_norm": 0.205078125,
"learning_rate": 3.130656430594403e-10,
"loss": 0.0266,
"num_input_tokens_seen": 4198976,
"step": 11145
},
{
"epoch": 9.991039426523297,
"grad_norm": 97.5,
"learning_rate": 1.4797259571541767e-10,
"loss": 0.0763,
"num_input_tokens_seen": 4200832,
"step": 11150
},
{
"epoch": 9.995519713261649,
"grad_norm": 0.244140625,
"learning_rate": 4.402493501975968e-11,
"loss": 0.1371,
"num_input_tokens_seen": 4202656,
"step": 11155
},
{
"epoch": 10.0,
"grad_norm": 243.0,
"learning_rate": 1.2229152107634533e-12,
"loss": 0.9347,
"num_input_tokens_seen": 4204168,
"step": 11160
},
{
"epoch": 10.0,
"eval_loss": 0.2850942015647888,
"eval_runtime": 9.6855,
"eval_samples_per_second": 51.21,
"eval_steps_per_second": 12.803,
"num_input_tokens_seen": 4204168,
"step": 11160
},
{
"epoch": 10.0,
"num_input_tokens_seen": 4204168,
"step": 11160,
"total_flos": 1.8931178489059738e+17,
"train_loss": 0.24860101528843367,
"train_runtime": 2175.1826,
"train_samples_per_second": 20.509,
"train_steps_per_second": 5.131
}
],
"logging_steps": 5,
"max_steps": 11160,
"num_input_tokens_seen": 4204168,
"num_train_epochs": 10,
"save_steps": 558,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8931178489059738e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}