train_conala_42_1760637549 / trainer_state.json
rbelanec's picture
End of training
52cb190 verified
{
"best_global_step": 3216,
"best_metric": 0.6421719789505005,
"best_model_checkpoint": "saves_multiple/p-tuning/llama-3-8b-instruct/train_conala_42_1760637549/checkpoint-3216",
"epoch": 20.0,
"eval_steps": 536,
"global_step": 10720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009328358208955223,
"grad_norm": 50.2962646484375,
"learning_rate": 3.7313432835820897e-06,
"loss": 5.9132,
"num_input_tokens_seen": 1504,
"step": 5
},
{
"epoch": 0.018656716417910446,
"grad_norm": 62.8128662109375,
"learning_rate": 8.395522388059701e-06,
"loss": 5.341,
"num_input_tokens_seen": 2976,
"step": 10
},
{
"epoch": 0.027985074626865673,
"grad_norm": 33.016021728515625,
"learning_rate": 1.3059701492537313e-05,
"loss": 4.1119,
"num_input_tokens_seen": 4480,
"step": 15
},
{
"epoch": 0.03731343283582089,
"grad_norm": 29.6210994720459,
"learning_rate": 1.7723880597014924e-05,
"loss": 2.4265,
"num_input_tokens_seen": 5984,
"step": 20
},
{
"epoch": 0.04664179104477612,
"grad_norm": 24.01149559020996,
"learning_rate": 2.2388059701492536e-05,
"loss": 1.7477,
"num_input_tokens_seen": 7392,
"step": 25
},
{
"epoch": 0.055970149253731345,
"grad_norm": 10.972054481506348,
"learning_rate": 2.7052238805970147e-05,
"loss": 1.2305,
"num_input_tokens_seen": 8864,
"step": 30
},
{
"epoch": 0.06529850746268656,
"grad_norm": 10.251397132873535,
"learning_rate": 3.171641791044776e-05,
"loss": 1.0503,
"num_input_tokens_seen": 10272,
"step": 35
},
{
"epoch": 0.07462686567164178,
"grad_norm": 10.270315170288086,
"learning_rate": 3.638059701492537e-05,
"loss": 1.1064,
"num_input_tokens_seen": 11584,
"step": 40
},
{
"epoch": 0.08395522388059702,
"grad_norm": 9.021138191223145,
"learning_rate": 4.104477611940299e-05,
"loss": 1.0329,
"num_input_tokens_seen": 13120,
"step": 45
},
{
"epoch": 0.09328358208955224,
"grad_norm": 8.116998672485352,
"learning_rate": 4.5708955223880595e-05,
"loss": 0.8612,
"num_input_tokens_seen": 14624,
"step": 50
},
{
"epoch": 0.10261194029850747,
"grad_norm": 7.769300937652588,
"learning_rate": 5.037313432835821e-05,
"loss": 0.8652,
"num_input_tokens_seen": 16224,
"step": 55
},
{
"epoch": 0.11194029850746269,
"grad_norm": 3.922929048538208,
"learning_rate": 5.503731343283582e-05,
"loss": 0.5218,
"num_input_tokens_seen": 17568,
"step": 60
},
{
"epoch": 0.12126865671641791,
"grad_norm": 2.2006962299346924,
"learning_rate": 5.9701492537313435e-05,
"loss": 1.0326,
"num_input_tokens_seen": 19072,
"step": 65
},
{
"epoch": 0.13059701492537312,
"grad_norm": 7.339227676391602,
"learning_rate": 6.436567164179105e-05,
"loss": 0.8905,
"num_input_tokens_seen": 20448,
"step": 70
},
{
"epoch": 0.13992537313432835,
"grad_norm": 6.341765403747559,
"learning_rate": 6.902985074626866e-05,
"loss": 0.8382,
"num_input_tokens_seen": 21920,
"step": 75
},
{
"epoch": 0.14925373134328357,
"grad_norm": 2.7121095657348633,
"learning_rate": 7.369402985074628e-05,
"loss": 0.6957,
"num_input_tokens_seen": 23360,
"step": 80
},
{
"epoch": 0.15858208955223882,
"grad_norm": 16.42344856262207,
"learning_rate": 7.835820895522389e-05,
"loss": 1.0716,
"num_input_tokens_seen": 24864,
"step": 85
},
{
"epoch": 0.16791044776119404,
"grad_norm": 4.9739580154418945,
"learning_rate": 8.30223880597015e-05,
"loss": 0.5727,
"num_input_tokens_seen": 26592,
"step": 90
},
{
"epoch": 0.17723880597014927,
"grad_norm": 6.865658283233643,
"learning_rate": 8.76865671641791e-05,
"loss": 0.7009,
"num_input_tokens_seen": 27936,
"step": 95
},
{
"epoch": 0.1865671641791045,
"grad_norm": 3.1465818881988525,
"learning_rate": 9.235074626865672e-05,
"loss": 0.9568,
"num_input_tokens_seen": 29440,
"step": 100
},
{
"epoch": 0.1958955223880597,
"grad_norm": 3.4791672229766846,
"learning_rate": 9.701492537313434e-05,
"loss": 1.0771,
"num_input_tokens_seen": 31040,
"step": 105
},
{
"epoch": 0.20522388059701493,
"grad_norm": 6.314712047576904,
"learning_rate": 0.00010167910447761195,
"loss": 0.9265,
"num_input_tokens_seen": 32512,
"step": 110
},
{
"epoch": 0.21455223880597016,
"grad_norm": 3.4345123767852783,
"learning_rate": 0.00010634328358208955,
"loss": 0.8327,
"num_input_tokens_seen": 33952,
"step": 115
},
{
"epoch": 0.22388059701492538,
"grad_norm": 1.887539267539978,
"learning_rate": 0.00011100746268656716,
"loss": 0.5377,
"num_input_tokens_seen": 35520,
"step": 120
},
{
"epoch": 0.2332089552238806,
"grad_norm": 3.8296196460723877,
"learning_rate": 0.00011567164179104479,
"loss": 1.0696,
"num_input_tokens_seen": 37024,
"step": 125
},
{
"epoch": 0.24253731343283583,
"grad_norm": 17.493654251098633,
"learning_rate": 0.0001203358208955224,
"loss": 0.8046,
"num_input_tokens_seen": 38752,
"step": 130
},
{
"epoch": 0.251865671641791,
"grad_norm": 3.7208962440490723,
"learning_rate": 0.000125,
"loss": 1.2193,
"num_input_tokens_seen": 40064,
"step": 135
},
{
"epoch": 0.26119402985074625,
"grad_norm": 3.16337513923645,
"learning_rate": 0.00012966417910447762,
"loss": 0.5599,
"num_input_tokens_seen": 41600,
"step": 140
},
{
"epoch": 0.27052238805970147,
"grad_norm": 2.210693120956421,
"learning_rate": 0.00013432835820895522,
"loss": 0.8478,
"num_input_tokens_seen": 42880,
"step": 145
},
{
"epoch": 0.2798507462686567,
"grad_norm": 3.8227005004882812,
"learning_rate": 0.00013899253731343284,
"loss": 0.8006,
"num_input_tokens_seen": 44320,
"step": 150
},
{
"epoch": 0.2891791044776119,
"grad_norm": 3.232940196990967,
"learning_rate": 0.00014365671641791044,
"loss": 0.8651,
"num_input_tokens_seen": 45664,
"step": 155
},
{
"epoch": 0.29850746268656714,
"grad_norm": 3.337759017944336,
"learning_rate": 0.00014832089552238806,
"loss": 1.3061,
"num_input_tokens_seen": 47008,
"step": 160
},
{
"epoch": 0.30783582089552236,
"grad_norm": 2.434713363647461,
"learning_rate": 0.00015298507462686568,
"loss": 0.6944,
"num_input_tokens_seen": 48416,
"step": 165
},
{
"epoch": 0.31716417910447764,
"grad_norm": 4.333398342132568,
"learning_rate": 0.00015764925373134328,
"loss": 0.8114,
"num_input_tokens_seen": 49984,
"step": 170
},
{
"epoch": 0.32649253731343286,
"grad_norm": 2.168666124343872,
"learning_rate": 0.0001623134328358209,
"loss": 0.7556,
"num_input_tokens_seen": 51264,
"step": 175
},
{
"epoch": 0.3358208955223881,
"grad_norm": 1.2355871200561523,
"learning_rate": 0.00016697761194029852,
"loss": 0.8644,
"num_input_tokens_seen": 52704,
"step": 180
},
{
"epoch": 0.3451492537313433,
"grad_norm": 1.1702848672866821,
"learning_rate": 0.00017164179104477612,
"loss": 0.6404,
"num_input_tokens_seen": 54112,
"step": 185
},
{
"epoch": 0.35447761194029853,
"grad_norm": 1.418177604675293,
"learning_rate": 0.00017630597014925374,
"loss": 0.8023,
"num_input_tokens_seen": 55456,
"step": 190
},
{
"epoch": 0.36380597014925375,
"grad_norm": 0.9577491283416748,
"learning_rate": 0.00018097014925373133,
"loss": 0.6875,
"num_input_tokens_seen": 56800,
"step": 195
},
{
"epoch": 0.373134328358209,
"grad_norm": 1.9909303188323975,
"learning_rate": 0.00018563432835820896,
"loss": 0.7119,
"num_input_tokens_seen": 58240,
"step": 200
},
{
"epoch": 0.3824626865671642,
"grad_norm": 5.038618087768555,
"learning_rate": 0.00019029850746268658,
"loss": 0.6525,
"num_input_tokens_seen": 59552,
"step": 205
},
{
"epoch": 0.3917910447761194,
"grad_norm": 1.5434777736663818,
"learning_rate": 0.00019496268656716417,
"loss": 0.6378,
"num_input_tokens_seen": 60832,
"step": 210
},
{
"epoch": 0.40111940298507465,
"grad_norm": 1.161812424659729,
"learning_rate": 0.0001996268656716418,
"loss": 0.95,
"num_input_tokens_seen": 62208,
"step": 215
},
{
"epoch": 0.41044776119402987,
"grad_norm": 0.9651714563369751,
"learning_rate": 0.0002042910447761194,
"loss": 0.7952,
"num_input_tokens_seen": 63680,
"step": 220
},
{
"epoch": 0.4197761194029851,
"grad_norm": 1.8524196147918701,
"learning_rate": 0.000208955223880597,
"loss": 0.6494,
"num_input_tokens_seen": 65120,
"step": 225
},
{
"epoch": 0.4291044776119403,
"grad_norm": 3.1311075687408447,
"learning_rate": 0.00021361940298507463,
"loss": 0.9792,
"num_input_tokens_seen": 66624,
"step": 230
},
{
"epoch": 0.43843283582089554,
"grad_norm": 1.3214956521987915,
"learning_rate": 0.00021828358208955223,
"loss": 0.8154,
"num_input_tokens_seen": 68192,
"step": 235
},
{
"epoch": 0.44776119402985076,
"grad_norm": 1.1524676084518433,
"learning_rate": 0.00022294776119402985,
"loss": 1.0075,
"num_input_tokens_seen": 69376,
"step": 240
},
{
"epoch": 0.457089552238806,
"grad_norm": 0.8888850212097168,
"learning_rate": 0.00022761194029850745,
"loss": 0.7885,
"num_input_tokens_seen": 70656,
"step": 245
},
{
"epoch": 0.4664179104477612,
"grad_norm": 0.7135015726089478,
"learning_rate": 0.00023227611940298507,
"loss": 0.7681,
"num_input_tokens_seen": 72032,
"step": 250
},
{
"epoch": 0.47574626865671643,
"grad_norm": 0.7615672945976257,
"learning_rate": 0.0002369402985074627,
"loss": 0.601,
"num_input_tokens_seen": 73600,
"step": 255
},
{
"epoch": 0.48507462686567165,
"grad_norm": 0.9831456542015076,
"learning_rate": 0.00024160447761194029,
"loss": 0.6599,
"num_input_tokens_seen": 74944,
"step": 260
},
{
"epoch": 0.4944029850746269,
"grad_norm": 0.826670229434967,
"learning_rate": 0.0002462686567164179,
"loss": 0.7404,
"num_input_tokens_seen": 76288,
"step": 265
},
{
"epoch": 0.503731343283582,
"grad_norm": 1.3727489709854126,
"learning_rate": 0.00025093283582089556,
"loss": 0.7172,
"num_input_tokens_seen": 77536,
"step": 270
},
{
"epoch": 0.5130597014925373,
"grad_norm": 0.6129507422447205,
"learning_rate": 0.00025559701492537315,
"loss": 0.6915,
"num_input_tokens_seen": 78944,
"step": 275
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.9328954219818115,
"learning_rate": 0.00026026119402985075,
"loss": 0.855,
"num_input_tokens_seen": 80224,
"step": 280
},
{
"epoch": 0.5317164179104478,
"grad_norm": 1.9826902151107788,
"learning_rate": 0.00026492537313432834,
"loss": 0.8697,
"num_input_tokens_seen": 81568,
"step": 285
},
{
"epoch": 0.5410447761194029,
"grad_norm": 1.385163426399231,
"learning_rate": 0.000269589552238806,
"loss": 0.7755,
"num_input_tokens_seen": 82912,
"step": 290
},
{
"epoch": 0.5503731343283582,
"grad_norm": 1.0601534843444824,
"learning_rate": 0.0002742537313432836,
"loss": 0.6186,
"num_input_tokens_seen": 84448,
"step": 295
},
{
"epoch": 0.5597014925373134,
"grad_norm": 0.6051679849624634,
"learning_rate": 0.00027891791044776124,
"loss": 0.7983,
"num_input_tokens_seen": 85792,
"step": 300
},
{
"epoch": 0.5690298507462687,
"grad_norm": 0.9398823380470276,
"learning_rate": 0.0002835820895522388,
"loss": 0.8253,
"num_input_tokens_seen": 87200,
"step": 305
},
{
"epoch": 0.5783582089552238,
"grad_norm": 0.9723435044288635,
"learning_rate": 0.0002882462686567164,
"loss": 0.7626,
"num_input_tokens_seen": 88608,
"step": 310
},
{
"epoch": 0.5876865671641791,
"grad_norm": 1.2982566356658936,
"learning_rate": 0.000292910447761194,
"loss": 0.9217,
"num_input_tokens_seen": 90176,
"step": 315
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.6757448315620422,
"learning_rate": 0.00029757462686567167,
"loss": 0.8694,
"num_input_tokens_seen": 91424,
"step": 320
},
{
"epoch": 0.6063432835820896,
"grad_norm": 0.8649131059646606,
"learning_rate": 0.00030223880597014926,
"loss": 0.8126,
"num_input_tokens_seen": 92832,
"step": 325
},
{
"epoch": 0.6156716417910447,
"grad_norm": 1.049835205078125,
"learning_rate": 0.00030690298507462686,
"loss": 0.9724,
"num_input_tokens_seen": 94240,
"step": 330
},
{
"epoch": 0.625,
"grad_norm": 0.9085893630981445,
"learning_rate": 0.00031156716417910445,
"loss": 0.9384,
"num_input_tokens_seen": 95584,
"step": 335
},
{
"epoch": 0.6343283582089553,
"grad_norm": 0.6684736013412476,
"learning_rate": 0.0003162313432835821,
"loss": 0.5239,
"num_input_tokens_seen": 97056,
"step": 340
},
{
"epoch": 0.6436567164179104,
"grad_norm": 0.5452622175216675,
"learning_rate": 0.0003208955223880597,
"loss": 0.7606,
"num_input_tokens_seen": 98464,
"step": 345
},
{
"epoch": 0.6529850746268657,
"grad_norm": 0.5732467770576477,
"learning_rate": 0.00032555970149253735,
"loss": 0.6014,
"num_input_tokens_seen": 99968,
"step": 350
},
{
"epoch": 0.6623134328358209,
"grad_norm": 1.0637229681015015,
"learning_rate": 0.0003302238805970149,
"loss": 0.858,
"num_input_tokens_seen": 101536,
"step": 355
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.574134349822998,
"learning_rate": 0.00033488805970149254,
"loss": 0.5917,
"num_input_tokens_seen": 102976,
"step": 360
},
{
"epoch": 0.6809701492537313,
"grad_norm": 0.9447924494743347,
"learning_rate": 0.00033955223880597013,
"loss": 0.6635,
"num_input_tokens_seen": 104512,
"step": 365
},
{
"epoch": 0.6902985074626866,
"grad_norm": 0.7179959416389465,
"learning_rate": 0.0003442164179104478,
"loss": 0.6203,
"num_input_tokens_seen": 105888,
"step": 370
},
{
"epoch": 0.6996268656716418,
"grad_norm": 1.140496015548706,
"learning_rate": 0.0003488805970149254,
"loss": 0.7066,
"num_input_tokens_seen": 107232,
"step": 375
},
{
"epoch": 0.7089552238805971,
"grad_norm": 0.5980051159858704,
"learning_rate": 0.000353544776119403,
"loss": 0.8239,
"num_input_tokens_seen": 108704,
"step": 380
},
{
"epoch": 0.7182835820895522,
"grad_norm": 0.7040757536888123,
"learning_rate": 0.00035820895522388057,
"loss": 0.6021,
"num_input_tokens_seen": 110016,
"step": 385
},
{
"epoch": 0.7276119402985075,
"grad_norm": 0.6738182902336121,
"learning_rate": 0.0003628731343283582,
"loss": 0.5962,
"num_input_tokens_seen": 111520,
"step": 390
},
{
"epoch": 0.7369402985074627,
"grad_norm": 0.8782476186752319,
"learning_rate": 0.0003675373134328358,
"loss": 0.8944,
"num_input_tokens_seen": 112800,
"step": 395
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.72857666015625,
"learning_rate": 0.00037220149253731346,
"loss": 0.8248,
"num_input_tokens_seen": 114304,
"step": 400
},
{
"epoch": 0.7555970149253731,
"grad_norm": 0.6835526823997498,
"learning_rate": 0.00037686567164179106,
"loss": 0.6764,
"num_input_tokens_seen": 115648,
"step": 405
},
{
"epoch": 0.7649253731343284,
"grad_norm": 0.6340111494064331,
"learning_rate": 0.00038152985074626865,
"loss": 0.4727,
"num_input_tokens_seen": 117088,
"step": 410
},
{
"epoch": 0.7742537313432836,
"grad_norm": 0.5367112159729004,
"learning_rate": 0.00038619402985074625,
"loss": 0.5831,
"num_input_tokens_seen": 118560,
"step": 415
},
{
"epoch": 0.7835820895522388,
"grad_norm": 0.8845934271812439,
"learning_rate": 0.0003908582089552239,
"loss": 0.6996,
"num_input_tokens_seen": 119968,
"step": 420
},
{
"epoch": 0.792910447761194,
"grad_norm": 0.5628993511199951,
"learning_rate": 0.0003955223880597015,
"loss": 0.525,
"num_input_tokens_seen": 121184,
"step": 425
},
{
"epoch": 0.8022388059701493,
"grad_norm": 0.44992223381996155,
"learning_rate": 0.00040018656716417914,
"loss": 0.6806,
"num_input_tokens_seen": 122592,
"step": 430
},
{
"epoch": 0.8115671641791045,
"grad_norm": 0.3055427670478821,
"learning_rate": 0.0004048507462686567,
"loss": 0.667,
"num_input_tokens_seen": 123936,
"step": 435
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.6931546926498413,
"learning_rate": 0.00040951492537313433,
"loss": 0.8071,
"num_input_tokens_seen": 125344,
"step": 440
},
{
"epoch": 0.8302238805970149,
"grad_norm": 0.5060365796089172,
"learning_rate": 0.0004141791044776119,
"loss": 0.632,
"num_input_tokens_seen": 126848,
"step": 445
},
{
"epoch": 0.8395522388059702,
"grad_norm": 0.4852692484855652,
"learning_rate": 0.0004188432835820896,
"loss": 0.7955,
"num_input_tokens_seen": 128160,
"step": 450
},
{
"epoch": 0.8488805970149254,
"grad_norm": 0.4757196009159088,
"learning_rate": 0.00042350746268656717,
"loss": 0.669,
"num_input_tokens_seen": 129728,
"step": 455
},
{
"epoch": 0.8582089552238806,
"grad_norm": 0.4002057611942291,
"learning_rate": 0.00042817164179104476,
"loss": 0.5291,
"num_input_tokens_seen": 131232,
"step": 460
},
{
"epoch": 0.8675373134328358,
"grad_norm": 1.0518105030059814,
"learning_rate": 0.00043283582089552236,
"loss": 0.9762,
"num_input_tokens_seen": 132576,
"step": 465
},
{
"epoch": 0.8768656716417911,
"grad_norm": 0.5036869645118713,
"learning_rate": 0.0004375,
"loss": 0.6403,
"num_input_tokens_seen": 134080,
"step": 470
},
{
"epoch": 0.8861940298507462,
"grad_norm": 0.6456478238105774,
"learning_rate": 0.00044216417910447766,
"loss": 0.5029,
"num_input_tokens_seen": 135584,
"step": 475
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.6417862772941589,
"learning_rate": 0.00044682835820895525,
"loss": 0.6198,
"num_input_tokens_seen": 137248,
"step": 480
},
{
"epoch": 0.9048507462686567,
"grad_norm": 0.6095438003540039,
"learning_rate": 0.00045149253731343285,
"loss": 0.7217,
"num_input_tokens_seen": 138656,
"step": 485
},
{
"epoch": 0.914179104477612,
"grad_norm": 0.5077698826789856,
"learning_rate": 0.00045615671641791044,
"loss": 0.6754,
"num_input_tokens_seen": 140224,
"step": 490
},
{
"epoch": 0.9235074626865671,
"grad_norm": 0.41700974106788635,
"learning_rate": 0.0004608208955223881,
"loss": 0.6219,
"num_input_tokens_seen": 141792,
"step": 495
},
{
"epoch": 0.9328358208955224,
"grad_norm": 0.33780136704444885,
"learning_rate": 0.0004654850746268657,
"loss": 0.5651,
"num_input_tokens_seen": 143200,
"step": 500
},
{
"epoch": 0.9421641791044776,
"grad_norm": 0.9682436585426331,
"learning_rate": 0.00047014925373134334,
"loss": 0.6506,
"num_input_tokens_seen": 144608,
"step": 505
},
{
"epoch": 0.9514925373134329,
"grad_norm": 0.5964924693107605,
"learning_rate": 0.0004748134328358209,
"loss": 0.5424,
"num_input_tokens_seen": 146048,
"step": 510
},
{
"epoch": 0.960820895522388,
"grad_norm": 0.6313313245773315,
"learning_rate": 0.00047947761194029853,
"loss": 0.8197,
"num_input_tokens_seen": 147520,
"step": 515
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.45782962441444397,
"learning_rate": 0.0004841417910447761,
"loss": 0.739,
"num_input_tokens_seen": 149184,
"step": 520
},
{
"epoch": 0.9794776119402985,
"grad_norm": 1.2939541339874268,
"learning_rate": 0.0004888059701492537,
"loss": 0.645,
"num_input_tokens_seen": 150592,
"step": 525
},
{
"epoch": 0.9888059701492538,
"grad_norm": 0.4261542558670044,
"learning_rate": 0.0004934701492537313,
"loss": 0.8326,
"num_input_tokens_seen": 151936,
"step": 530
},
{
"epoch": 0.9981343283582089,
"grad_norm": 0.6485226154327393,
"learning_rate": 0.000498134328358209,
"loss": 0.8366,
"num_input_tokens_seen": 153280,
"step": 535
},
{
"epoch": 1.0,
"eval_loss": 0.6933413147926331,
"eval_runtime": 4.1732,
"eval_samples_per_second": 57.031,
"eval_steps_per_second": 14.377,
"num_input_tokens_seen": 153352,
"step": 536
},
{
"epoch": 1.007462686567164,
"grad_norm": 0.599692165851593,
"learning_rate": 0.0005027985074626866,
"loss": 0.7255,
"num_input_tokens_seen": 154408,
"step": 540
},
{
"epoch": 1.0167910447761195,
"grad_norm": 0.867232620716095,
"learning_rate": 0.0005074626865671642,
"loss": 0.6433,
"num_input_tokens_seen": 155976,
"step": 545
},
{
"epoch": 1.0261194029850746,
"grad_norm": 0.3278610110282898,
"learning_rate": 0.0005121268656716418,
"loss": 0.4283,
"num_input_tokens_seen": 157320,
"step": 550
},
{
"epoch": 1.0354477611940298,
"grad_norm": 0.5500964522361755,
"learning_rate": 0.0005167910447761194,
"loss": 0.682,
"num_input_tokens_seen": 158920,
"step": 555
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.5103652477264404,
"learning_rate": 0.0005214552238805971,
"loss": 0.5198,
"num_input_tokens_seen": 160264,
"step": 560
},
{
"epoch": 1.0541044776119404,
"grad_norm": 0.47425708174705505,
"learning_rate": 0.0005261194029850747,
"loss": 0.7302,
"num_input_tokens_seen": 161768,
"step": 565
},
{
"epoch": 1.0634328358208955,
"grad_norm": 0.5512856245040894,
"learning_rate": 0.0005307835820895523,
"loss": 0.7863,
"num_input_tokens_seen": 163336,
"step": 570
},
{
"epoch": 1.0727611940298507,
"grad_norm": 1.0708633661270142,
"learning_rate": 0.0005354477611940298,
"loss": 0.7078,
"num_input_tokens_seen": 164808,
"step": 575
},
{
"epoch": 1.0820895522388059,
"grad_norm": 0.3448982536792755,
"learning_rate": 0.0005401119402985075,
"loss": 0.8767,
"num_input_tokens_seen": 166440,
"step": 580
},
{
"epoch": 1.0914179104477613,
"grad_norm": 0.5863126516342163,
"learning_rate": 0.0005447761194029851,
"loss": 0.5437,
"num_input_tokens_seen": 167784,
"step": 585
},
{
"epoch": 1.1007462686567164,
"grad_norm": 0.4399169981479645,
"learning_rate": 0.0005494402985074627,
"loss": 0.5878,
"num_input_tokens_seen": 169256,
"step": 590
},
{
"epoch": 1.1100746268656716,
"grad_norm": 0.5453768372535706,
"learning_rate": 0.0005541044776119403,
"loss": 0.6676,
"num_input_tokens_seen": 170824,
"step": 595
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.506891667842865,
"learning_rate": 0.000558768656716418,
"loss": 0.7238,
"num_input_tokens_seen": 172232,
"step": 600
},
{
"epoch": 1.1287313432835822,
"grad_norm": 1.1439929008483887,
"learning_rate": 0.0005634328358208956,
"loss": 0.8913,
"num_input_tokens_seen": 173576,
"step": 605
},
{
"epoch": 1.1380597014925373,
"grad_norm": 0.836676299571991,
"learning_rate": 0.0005680970149253732,
"loss": 0.6187,
"num_input_tokens_seen": 174824,
"step": 610
},
{
"epoch": 1.1473880597014925,
"grad_norm": 1.3345333337783813,
"learning_rate": 0.0005727611940298508,
"loss": 0.6908,
"num_input_tokens_seen": 176360,
"step": 615
},
{
"epoch": 1.1567164179104479,
"grad_norm": 0.5238152742385864,
"learning_rate": 0.0005774253731343285,
"loss": 0.7062,
"num_input_tokens_seen": 177864,
"step": 620
},
{
"epoch": 1.166044776119403,
"grad_norm": 0.42747148871421814,
"learning_rate": 0.0005820895522388059,
"loss": 0.6453,
"num_input_tokens_seen": 179304,
"step": 625
},
{
"epoch": 1.1753731343283582,
"grad_norm": 0.5538851618766785,
"learning_rate": 0.0005867537313432835,
"loss": 0.476,
"num_input_tokens_seen": 181000,
"step": 630
},
{
"epoch": 1.1847014925373134,
"grad_norm": 0.34586551785469055,
"learning_rate": 0.0005914179104477611,
"loss": 0.526,
"num_input_tokens_seen": 182376,
"step": 635
},
{
"epoch": 1.1940298507462686,
"grad_norm": 0.6016301512718201,
"learning_rate": 0.0005960820895522388,
"loss": 0.7885,
"num_input_tokens_seen": 183784,
"step": 640
},
{
"epoch": 1.203358208955224,
"grad_norm": 0.4861624240875244,
"learning_rate": 0.0006007462686567164,
"loss": 0.6053,
"num_input_tokens_seen": 185192,
"step": 645
},
{
"epoch": 1.212686567164179,
"grad_norm": 0.7675857543945312,
"learning_rate": 0.000605410447761194,
"loss": 0.4889,
"num_input_tokens_seen": 186504,
"step": 650
},
{
"epoch": 1.2220149253731343,
"grad_norm": 0.33078664541244507,
"learning_rate": 0.0006100746268656716,
"loss": 0.7418,
"num_input_tokens_seen": 187912,
"step": 655
},
{
"epoch": 1.2313432835820897,
"grad_norm": 0.4360729455947876,
"learning_rate": 0.0006147388059701493,
"loss": 0.7477,
"num_input_tokens_seen": 189256,
"step": 660
},
{
"epoch": 1.2406716417910448,
"grad_norm": 0.5418694615364075,
"learning_rate": 0.0006194029850746269,
"loss": 0.6469,
"num_input_tokens_seen": 190568,
"step": 665
},
{
"epoch": 1.25,
"grad_norm": 0.4630480408668518,
"learning_rate": 0.0006240671641791045,
"loss": 0.737,
"num_input_tokens_seen": 191880,
"step": 670
},
{
"epoch": 1.2593283582089552,
"grad_norm": 0.5126736164093018,
"learning_rate": 0.0006287313432835821,
"loss": 0.6417,
"num_input_tokens_seen": 193128,
"step": 675
},
{
"epoch": 1.2686567164179103,
"grad_norm": 0.47021928429603577,
"learning_rate": 0.0006333955223880597,
"loss": 0.4621,
"num_input_tokens_seen": 194376,
"step": 680
},
{
"epoch": 1.2779850746268657,
"grad_norm": 0.4174472987651825,
"learning_rate": 0.0006380597014925373,
"loss": 0.4478,
"num_input_tokens_seen": 195880,
"step": 685
},
{
"epoch": 1.287313432835821,
"grad_norm": 0.4990299344062805,
"learning_rate": 0.0006427238805970149,
"loss": 0.4875,
"num_input_tokens_seen": 197128,
"step": 690
},
{
"epoch": 1.296641791044776,
"grad_norm": 0.5962094664573669,
"learning_rate": 0.0006473880597014925,
"loss": 0.8478,
"num_input_tokens_seen": 198504,
"step": 695
},
{
"epoch": 1.3059701492537314,
"grad_norm": 0.7166761755943298,
"learning_rate": 0.0006520522388059702,
"loss": 0.9623,
"num_input_tokens_seen": 199848,
"step": 700
},
{
"epoch": 1.3152985074626866,
"grad_norm": 0.4672403633594513,
"learning_rate": 0.0006567164179104478,
"loss": 0.6889,
"num_input_tokens_seen": 201160,
"step": 705
},
{
"epoch": 1.3246268656716418,
"grad_norm": 0.2829459607601166,
"learning_rate": 0.0006613805970149254,
"loss": 0.7543,
"num_input_tokens_seen": 202504,
"step": 710
},
{
"epoch": 1.333955223880597,
"grad_norm": 0.6832309365272522,
"learning_rate": 0.000666044776119403,
"loss": 0.6392,
"num_input_tokens_seen": 203784,
"step": 715
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.31956401467323303,
"learning_rate": 0.0006707089552238807,
"loss": 0.7686,
"num_input_tokens_seen": 205256,
"step": 720
},
{
"epoch": 1.3526119402985075,
"grad_norm": 0.5048463940620422,
"learning_rate": 0.0006753731343283583,
"loss": 0.6293,
"num_input_tokens_seen": 206856,
"step": 725
},
{
"epoch": 1.3619402985074627,
"grad_norm": 0.45297038555145264,
"learning_rate": 0.0006800373134328358,
"loss": 0.4723,
"num_input_tokens_seen": 208232,
"step": 730
},
{
"epoch": 1.3712686567164178,
"grad_norm": 0.40795764327049255,
"learning_rate": 0.0006847014925373134,
"loss": 0.7259,
"num_input_tokens_seen": 209576,
"step": 735
},
{
"epoch": 1.3805970149253732,
"grad_norm": 0.4087425470352173,
"learning_rate": 0.0006893656716417911,
"loss": 0.8533,
"num_input_tokens_seen": 210920,
"step": 740
},
{
"epoch": 1.3899253731343284,
"grad_norm": 0.3699188828468323,
"learning_rate": 0.0006940298507462687,
"loss": 0.4452,
"num_input_tokens_seen": 212424,
"step": 745
},
{
"epoch": 1.3992537313432836,
"grad_norm": 0.5148809552192688,
"learning_rate": 0.0006986940298507463,
"loss": 0.6277,
"num_input_tokens_seen": 213736,
"step": 750
},
{
"epoch": 1.4085820895522387,
"grad_norm": 0.314248651266098,
"learning_rate": 0.0007033582089552238,
"loss": 0.8832,
"num_input_tokens_seen": 215048,
"step": 755
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.2668759226799011,
"learning_rate": 0.0007080223880597016,
"loss": 0.6837,
"num_input_tokens_seen": 216456,
"step": 760
},
{
"epoch": 1.4272388059701493,
"grad_norm": 0.5210652947425842,
"learning_rate": 0.0007126865671641791,
"loss": 0.5989,
"num_input_tokens_seen": 217896,
"step": 765
},
{
"epoch": 1.4365671641791045,
"grad_norm": 0.3388369083404541,
"learning_rate": 0.0007173507462686567,
"loss": 0.8363,
"num_input_tokens_seen": 219240,
"step": 770
},
{
"epoch": 1.4458955223880596,
"grad_norm": 0.45436587929725647,
"learning_rate": 0.0007220149253731343,
"loss": 0.5282,
"num_input_tokens_seen": 220904,
"step": 775
},
{
"epoch": 1.455223880597015,
"grad_norm": 0.6213339567184448,
"learning_rate": 0.0007266791044776119,
"loss": 0.8923,
"num_input_tokens_seen": 222184,
"step": 780
},
{
"epoch": 1.4645522388059702,
"grad_norm": 0.4394315183162689,
"learning_rate": 0.0007313432835820895,
"loss": 0.7169,
"num_input_tokens_seen": 223656,
"step": 785
},
{
"epoch": 1.4738805970149254,
"grad_norm": 0.3317413032054901,
"learning_rate": 0.0007360074626865671,
"loss": 0.4891,
"num_input_tokens_seen": 225064,
"step": 790
},
{
"epoch": 1.4832089552238805,
"grad_norm": 0.3865710496902466,
"learning_rate": 0.0007406716417910447,
"loss": 0.7401,
"num_input_tokens_seen": 226536,
"step": 795
},
{
"epoch": 1.4925373134328357,
"grad_norm": 0.28780487179756165,
"learning_rate": 0.0007453358208955224,
"loss": 0.5946,
"num_input_tokens_seen": 227816,
"step": 800
},
{
"epoch": 1.501865671641791,
"grad_norm": 0.3245728015899658,
"learning_rate": 0.00075,
"loss": 0.751,
"num_input_tokens_seen": 229288,
"step": 805
},
{
"epoch": 1.5111940298507462,
"grad_norm": 0.20647266507148743,
"learning_rate": 0.0007546641791044776,
"loss": 0.4726,
"num_input_tokens_seen": 230728,
"step": 810
},
{
"epoch": 1.5205223880597014,
"grad_norm": 0.41778045892715454,
"learning_rate": 0.0007593283582089553,
"loss": 0.4172,
"num_input_tokens_seen": 232104,
"step": 815
},
{
"epoch": 1.5298507462686568,
"grad_norm": 0.5311048626899719,
"learning_rate": 0.0007639925373134329,
"loss": 0.6319,
"num_input_tokens_seen": 233512,
"step": 820
},
{
"epoch": 1.539179104477612,
"grad_norm": 0.5205721855163574,
"learning_rate": 0.0007686567164179105,
"loss": 0.5626,
"num_input_tokens_seen": 234792,
"step": 825
},
{
"epoch": 1.5485074626865671,
"grad_norm": 0.43273499608039856,
"learning_rate": 0.0007733208955223881,
"loss": 0.6892,
"num_input_tokens_seen": 236168,
"step": 830
},
{
"epoch": 1.5578358208955225,
"grad_norm": 0.3638947606086731,
"learning_rate": 0.0007779850746268657,
"loss": 0.7235,
"num_input_tokens_seen": 237704,
"step": 835
},
{
"epoch": 1.5671641791044775,
"grad_norm": 0.26659175753593445,
"learning_rate": 0.0007826492537313433,
"loss": 0.6181,
"num_input_tokens_seen": 239272,
"step": 840
},
{
"epoch": 1.5764925373134329,
"grad_norm": 0.399054616689682,
"learning_rate": 0.0007873134328358209,
"loss": 0.6014,
"num_input_tokens_seen": 240552,
"step": 845
},
{
"epoch": 1.585820895522388,
"grad_norm": 0.8174265623092651,
"learning_rate": 0.0007919776119402985,
"loss": 0.6507,
"num_input_tokens_seen": 242024,
"step": 850
},
{
"epoch": 1.5951492537313432,
"grad_norm": 0.43701648712158203,
"learning_rate": 0.0007966417910447762,
"loss": 0.6974,
"num_input_tokens_seen": 243400,
"step": 855
},
{
"epoch": 1.6044776119402986,
"grad_norm": 0.5361718535423279,
"learning_rate": 0.0008013059701492538,
"loss": 1.2414,
"num_input_tokens_seen": 244744,
"step": 860
},
{
"epoch": 1.6138059701492538,
"grad_norm": 0.36682432889938354,
"learning_rate": 0.0008059701492537314,
"loss": 0.6403,
"num_input_tokens_seen": 246312,
"step": 865
},
{
"epoch": 1.623134328358209,
"grad_norm": 0.48592299222946167,
"learning_rate": 0.000810634328358209,
"loss": 0.6154,
"num_input_tokens_seen": 247944,
"step": 870
},
{
"epoch": 1.6324626865671643,
"grad_norm": 0.1437925547361374,
"learning_rate": 0.0008152985074626867,
"loss": 0.6444,
"num_input_tokens_seen": 249544,
"step": 875
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.3561725914478302,
"learning_rate": 0.0008199626865671643,
"loss": 0.637,
"num_input_tokens_seen": 250920,
"step": 880
},
{
"epoch": 1.6511194029850746,
"grad_norm": 0.36449792981147766,
"learning_rate": 0.0008246268656716418,
"loss": 0.6722,
"num_input_tokens_seen": 252392,
"step": 885
},
{
"epoch": 1.6604477611940298,
"grad_norm": 0.2657022178173065,
"learning_rate": 0.0008292910447761193,
"loss": 0.6982,
"num_input_tokens_seen": 254056,
"step": 890
},
{
"epoch": 1.669776119402985,
"grad_norm": 0.3086923658847809,
"learning_rate": 0.000833955223880597,
"loss": 0.6361,
"num_input_tokens_seen": 255528,
"step": 895
},
{
"epoch": 1.6791044776119404,
"grad_norm": 0.4550116956233978,
"learning_rate": 0.0008386194029850746,
"loss": 0.7733,
"num_input_tokens_seen": 257096,
"step": 900
},
{
"epoch": 1.6884328358208955,
"grad_norm": 0.3251428008079529,
"learning_rate": 0.0008432835820895522,
"loss": 0.4253,
"num_input_tokens_seen": 258536,
"step": 905
},
{
"epoch": 1.6977611940298507,
"grad_norm": 0.18625348806381226,
"learning_rate": 0.0008479477611940298,
"loss": 0.5241,
"num_input_tokens_seen": 260136,
"step": 910
},
{
"epoch": 1.707089552238806,
"grad_norm": 0.4147641062736511,
"learning_rate": 0.0008526119402985075,
"loss": 0.4974,
"num_input_tokens_seen": 261384,
"step": 915
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.4366629123687744,
"learning_rate": 0.0008572761194029851,
"loss": 0.5932,
"num_input_tokens_seen": 262952,
"step": 920
},
{
"epoch": 1.7257462686567164,
"grad_norm": 0.47915467619895935,
"learning_rate": 0.0008619402985074627,
"loss": 0.6001,
"num_input_tokens_seen": 264328,
"step": 925
},
{
"epoch": 1.7350746268656716,
"grad_norm": 0.2741464376449585,
"learning_rate": 0.0008666044776119403,
"loss": 0.8693,
"num_input_tokens_seen": 265672,
"step": 930
},
{
"epoch": 1.7444029850746268,
"grad_norm": 0.2753334939479828,
"learning_rate": 0.0008712686567164179,
"loss": 0.6336,
"num_input_tokens_seen": 267080,
"step": 935
},
{
"epoch": 1.7537313432835822,
"grad_norm": 0.4339545667171478,
"learning_rate": 0.0008759328358208955,
"loss": 0.5896,
"num_input_tokens_seen": 268648,
"step": 940
},
{
"epoch": 1.7630597014925373,
"grad_norm": 0.46501514315605164,
"learning_rate": 0.0008805970149253731,
"loss": 0.5796,
"num_input_tokens_seen": 269960,
"step": 945
},
{
"epoch": 1.7723880597014925,
"grad_norm": 0.31456464529037476,
"learning_rate": 0.0008852611940298507,
"loss": 0.7311,
"num_input_tokens_seen": 271272,
"step": 950
},
{
"epoch": 1.7817164179104479,
"grad_norm": 0.19265945255756378,
"learning_rate": 0.0008899253731343284,
"loss": 0.6366,
"num_input_tokens_seen": 272488,
"step": 955
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.3629007935523987,
"learning_rate": 0.000894589552238806,
"loss": 0.9754,
"num_input_tokens_seen": 274024,
"step": 960
},
{
"epoch": 1.8003731343283582,
"grad_norm": 0.3736506402492523,
"learning_rate": 0.0008992537313432836,
"loss": 0.5409,
"num_input_tokens_seen": 275336,
"step": 965
},
{
"epoch": 1.8097014925373134,
"grad_norm": 0.262616902589798,
"learning_rate": 0.0009039179104477612,
"loss": 0.5686,
"num_input_tokens_seen": 276584,
"step": 970
},
{
"epoch": 1.8190298507462686,
"grad_norm": 0.41028597950935364,
"learning_rate": 0.0009085820895522389,
"loss": 0.8302,
"num_input_tokens_seen": 277960,
"step": 975
},
{
"epoch": 1.828358208955224,
"grad_norm": 0.3309934139251709,
"learning_rate": 0.0009132462686567165,
"loss": 0.4458,
"num_input_tokens_seen": 279400,
"step": 980
},
{
"epoch": 1.837686567164179,
"grad_norm": 0.25409263372421265,
"learning_rate": 0.0009179104477611941,
"loss": 0.4202,
"num_input_tokens_seen": 281064,
"step": 985
},
{
"epoch": 1.8470149253731343,
"grad_norm": 0.48292016983032227,
"learning_rate": 0.0009225746268656716,
"loss": 0.7663,
"num_input_tokens_seen": 282312,
"step": 990
},
{
"epoch": 1.8563432835820897,
"grad_norm": 0.3051922023296356,
"learning_rate": 0.0009272388059701493,
"loss": 0.4506,
"num_input_tokens_seen": 283688,
"step": 995
},
{
"epoch": 1.8656716417910446,
"grad_norm": 0.20524144172668457,
"learning_rate": 0.0009319029850746269,
"loss": 0.7163,
"num_input_tokens_seen": 285192,
"step": 1000
},
{
"epoch": 1.875,
"grad_norm": 0.2460353970527649,
"learning_rate": 0.0009365671641791045,
"loss": 0.5831,
"num_input_tokens_seen": 286664,
"step": 1005
},
{
"epoch": 1.8843283582089554,
"grad_norm": 0.4078156650066376,
"learning_rate": 0.0009412313432835821,
"loss": 0.5654,
"num_input_tokens_seen": 287976,
"step": 1010
},
{
"epoch": 1.8936567164179103,
"grad_norm": 0.3027799427509308,
"learning_rate": 0.0009458955223880598,
"loss": 0.6781,
"num_input_tokens_seen": 289256,
"step": 1015
},
{
"epoch": 1.9029850746268657,
"grad_norm": 0.4628511965274811,
"learning_rate": 0.0009505597014925374,
"loss": 0.7969,
"num_input_tokens_seen": 290792,
"step": 1020
},
{
"epoch": 1.912313432835821,
"grad_norm": 0.29008105397224426,
"learning_rate": 0.000955223880597015,
"loss": 0.7692,
"num_input_tokens_seen": 292104,
"step": 1025
},
{
"epoch": 1.921641791044776,
"grad_norm": 0.2522449791431427,
"learning_rate": 0.0009598880597014926,
"loss": 0.6877,
"num_input_tokens_seen": 293672,
"step": 1030
},
{
"epoch": 1.9309701492537314,
"grad_norm": 0.28948765993118286,
"learning_rate": 0.0009645522388059703,
"loss": 0.6873,
"num_input_tokens_seen": 295048,
"step": 1035
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.6912420392036438,
"learning_rate": 0.0009692164179104477,
"loss": 0.6167,
"num_input_tokens_seen": 296584,
"step": 1040
},
{
"epoch": 1.9496268656716418,
"grad_norm": 0.17778247594833374,
"learning_rate": 0.0009738805970149253,
"loss": 0.5713,
"num_input_tokens_seen": 297928,
"step": 1045
},
{
"epoch": 1.9589552238805972,
"grad_norm": 1.7690346240997314,
"learning_rate": 0.000978544776119403,
"loss": 0.9329,
"num_input_tokens_seen": 299336,
"step": 1050
},
{
"epoch": 1.9682835820895521,
"grad_norm": 0.19372034072875977,
"learning_rate": 0.0009832089552238806,
"loss": 0.4327,
"num_input_tokens_seen": 300776,
"step": 1055
},
{
"epoch": 1.9776119402985075,
"grad_norm": 0.5918903350830078,
"learning_rate": 0.0009878731343283583,
"loss": 0.8278,
"num_input_tokens_seen": 302216,
"step": 1060
},
{
"epoch": 1.9869402985074627,
"grad_norm": 0.3756718635559082,
"learning_rate": 0.0009925373134328358,
"loss": 0.6511,
"num_input_tokens_seen": 303560,
"step": 1065
},
{
"epoch": 1.9962686567164178,
"grad_norm": 0.23910151422023773,
"learning_rate": 0.0009972014925373133,
"loss": 0.6292,
"num_input_tokens_seen": 305128,
"step": 1070
},
{
"epoch": 2.0,
"eval_loss": 0.6899347901344299,
"eval_runtime": 4.1904,
"eval_samples_per_second": 56.797,
"eval_steps_per_second": 14.318,
"num_input_tokens_seen": 305496,
"step": 1072
},
{
"epoch": 2.0055970149253732,
"grad_norm": 0.2828496992588043,
"learning_rate": 0.0009999998939708842,
"loss": 0.8802,
"num_input_tokens_seen": 306456,
"step": 1075
},
{
"epoch": 2.014925373134328,
"grad_norm": 0.32462286949157715,
"learning_rate": 0.0009999987011438459,
"loss": 0.5237,
"num_input_tokens_seen": 308216,
"step": 1080
},
{
"epoch": 2.0242537313432836,
"grad_norm": 0.27709144353866577,
"learning_rate": 0.0009999961829565468,
"loss": 0.5552,
"num_input_tokens_seen": 309688,
"step": 1085
},
{
"epoch": 2.033582089552239,
"grad_norm": 0.1804264932870865,
"learning_rate": 0.0009999923394156621,
"loss": 0.7655,
"num_input_tokens_seen": 311096,
"step": 1090
},
{
"epoch": 2.042910447761194,
"grad_norm": 0.40875178575515747,
"learning_rate": 0.0009999871705313795,
"loss": 0.6567,
"num_input_tokens_seen": 312728,
"step": 1095
},
{
"epoch": 2.0522388059701493,
"grad_norm": 0.4328378140926361,
"learning_rate": 0.0009999806763174009,
"loss": 0.6443,
"num_input_tokens_seen": 314200,
"step": 1100
},
{
"epoch": 2.0615671641791047,
"grad_norm": 0.28709733486175537,
"learning_rate": 0.0009999728567909403,
"loss": 0.5532,
"num_input_tokens_seen": 315608,
"step": 1105
},
{
"epoch": 2.0708955223880596,
"grad_norm": 0.3292154371738434,
"learning_rate": 0.0009999637119727251,
"loss": 0.5018,
"num_input_tokens_seen": 317080,
"step": 1110
},
{
"epoch": 2.080223880597015,
"grad_norm": 0.4961127042770386,
"learning_rate": 0.000999953241886996,
"loss": 0.5222,
"num_input_tokens_seen": 318648,
"step": 1115
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.30666232109069824,
"learning_rate": 0.0009999414465615062,
"loss": 0.6002,
"num_input_tokens_seen": 320248,
"step": 1120
},
{
"epoch": 2.0988805970149254,
"grad_norm": 0.37278977036476135,
"learning_rate": 0.0009999283260275218,
"loss": 0.5347,
"num_input_tokens_seen": 321656,
"step": 1125
},
{
"epoch": 2.1082089552238807,
"grad_norm": 0.206303671002388,
"learning_rate": 0.000999913880319822,
"loss": 0.6089,
"num_input_tokens_seen": 323256,
"step": 1130
},
{
"epoch": 2.1175373134328357,
"grad_norm": 0.23221929371356964,
"learning_rate": 0.000999898109476698,
"loss": 0.6503,
"num_input_tokens_seen": 324632,
"step": 1135
},
{
"epoch": 2.126865671641791,
"grad_norm": 0.36873659491539,
"learning_rate": 0.0009998810135399545,
"loss": 0.806,
"num_input_tokens_seen": 325816,
"step": 1140
},
{
"epoch": 2.1361940298507465,
"grad_norm": 0.21523557603359222,
"learning_rate": 0.000999862592554908,
"loss": 0.4014,
"num_input_tokens_seen": 327288,
"step": 1145
},
{
"epoch": 2.1455223880597014,
"grad_norm": 0.3459737002849579,
"learning_rate": 0.0009998428465703873,
"loss": 0.6605,
"num_input_tokens_seen": 328632,
"step": 1150
},
{
"epoch": 2.154850746268657,
"grad_norm": 0.22625266015529633,
"learning_rate": 0.000999821775638734,
"loss": 0.7785,
"num_input_tokens_seen": 329976,
"step": 1155
},
{
"epoch": 2.1641791044776117,
"grad_norm": 0.38777706027030945,
"learning_rate": 0.000999799379815801,
"loss": 0.647,
"num_input_tokens_seen": 331192,
"step": 1160
},
{
"epoch": 2.173507462686567,
"grad_norm": 0.7987584471702576,
"learning_rate": 0.0009997756591609537,
"loss": 0.6472,
"num_input_tokens_seen": 332504,
"step": 1165
},
{
"epoch": 2.1828358208955225,
"grad_norm": 0.2811796963214874,
"learning_rate": 0.0009997506137370692,
"loss": 0.5644,
"num_input_tokens_seen": 333816,
"step": 1170
},
{
"epoch": 2.1921641791044775,
"grad_norm": 0.350900262594223,
"learning_rate": 0.0009997242436105358,
"loss": 0.5828,
"num_input_tokens_seen": 335224,
"step": 1175
},
{
"epoch": 2.201492537313433,
"grad_norm": 0.24604664742946625,
"learning_rate": 0.000999696548851254,
"loss": 0.5264,
"num_input_tokens_seen": 336504,
"step": 1180
},
{
"epoch": 2.2108208955223883,
"grad_norm": 0.13010980188846588,
"learning_rate": 0.0009996675295326344,
"loss": 0.5302,
"num_input_tokens_seen": 337784,
"step": 1185
},
{
"epoch": 2.220149253731343,
"grad_norm": 0.30489736795425415,
"learning_rate": 0.0009996371857316,
"loss": 0.7664,
"num_input_tokens_seen": 339096,
"step": 1190
},
{
"epoch": 2.2294776119402986,
"grad_norm": 0.3614640533924103,
"learning_rate": 0.0009996055175285833,
"loss": 0.4912,
"num_input_tokens_seen": 340376,
"step": 1195
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.20211651921272278,
"learning_rate": 0.0009995725250075288,
"loss": 0.6022,
"num_input_tokens_seen": 341880,
"step": 1200
},
{
"epoch": 2.248134328358209,
"grad_norm": 0.1927761435508728,
"learning_rate": 0.0009995382082558899,
"loss": 0.5555,
"num_input_tokens_seen": 343416,
"step": 1205
},
{
"epoch": 2.2574626865671643,
"grad_norm": 0.33579009771347046,
"learning_rate": 0.0009995025673646314,
"loss": 0.8326,
"num_input_tokens_seen": 344920,
"step": 1210
},
{
"epoch": 2.2667910447761193,
"grad_norm": 0.13861723244190216,
"learning_rate": 0.0009994656024282277,
"loss": 0.5842,
"num_input_tokens_seen": 346424,
"step": 1215
},
{
"epoch": 2.2761194029850746,
"grad_norm": 0.2502031624317169,
"learning_rate": 0.0009994273135446622,
"loss": 0.621,
"num_input_tokens_seen": 347960,
"step": 1220
},
{
"epoch": 2.28544776119403,
"grad_norm": 0.17798258364200592,
"learning_rate": 0.000999387700815429,
"loss": 0.486,
"num_input_tokens_seen": 349400,
"step": 1225
},
{
"epoch": 2.294776119402985,
"grad_norm": 0.19423194229602814,
"learning_rate": 0.0009993467643455301,
"loss": 0.6475,
"num_input_tokens_seen": 350744,
"step": 1230
},
{
"epoch": 2.3041044776119404,
"grad_norm": 0.10956660658121109,
"learning_rate": 0.0009993045042434772,
"loss": 0.8843,
"num_input_tokens_seen": 352152,
"step": 1235
},
{
"epoch": 2.3134328358208958,
"grad_norm": 0.21084345877170563,
"learning_rate": 0.0009992609206212902,
"loss": 0.6028,
"num_input_tokens_seen": 353560,
"step": 1240
},
{
"epoch": 2.3227611940298507,
"grad_norm": 0.23705990612506866,
"learning_rate": 0.0009992160135944975,
"loss": 0.6813,
"num_input_tokens_seen": 354808,
"step": 1245
},
{
"epoch": 2.332089552238806,
"grad_norm": 0.26078465580940247,
"learning_rate": 0.0009991697832821354,
"loss": 0.7028,
"num_input_tokens_seen": 356120,
"step": 1250
},
{
"epoch": 2.341417910447761,
"grad_norm": 0.4466816186904907,
"learning_rate": 0.0009991222298067477,
"loss": 0.6778,
"num_input_tokens_seen": 357400,
"step": 1255
},
{
"epoch": 2.3507462686567164,
"grad_norm": 0.2676059603691101,
"learning_rate": 0.0009990733532943858,
"loss": 0.4555,
"num_input_tokens_seen": 358584,
"step": 1260
},
{
"epoch": 2.360074626865672,
"grad_norm": 0.28492385149002075,
"learning_rate": 0.0009990231538746079,
"loss": 0.8428,
"num_input_tokens_seen": 360088,
"step": 1265
},
{
"epoch": 2.3694029850746268,
"grad_norm": 0.324629545211792,
"learning_rate": 0.0009989716316804794,
"loss": 0.4129,
"num_input_tokens_seen": 361496,
"step": 1270
},
{
"epoch": 2.378731343283582,
"grad_norm": 0.13705921173095703,
"learning_rate": 0.000998918786848571,
"loss": 0.459,
"num_input_tokens_seen": 362968,
"step": 1275
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.2732636034488678,
"learning_rate": 0.0009988646195189601,
"loss": 0.7371,
"num_input_tokens_seen": 364312,
"step": 1280
},
{
"epoch": 2.3973880597014925,
"grad_norm": 0.21165116131305695,
"learning_rate": 0.00099880912983523,
"loss": 0.4329,
"num_input_tokens_seen": 365752,
"step": 1285
},
{
"epoch": 2.406716417910448,
"grad_norm": 0.2917303144931793,
"learning_rate": 0.0009987523179444682,
"loss": 0.5815,
"num_input_tokens_seen": 367576,
"step": 1290
},
{
"epoch": 2.416044776119403,
"grad_norm": 0.22703352570533752,
"learning_rate": 0.0009986941839972676,
"loss": 0.5994,
"num_input_tokens_seen": 369016,
"step": 1295
},
{
"epoch": 2.425373134328358,
"grad_norm": 0.2888859212398529,
"learning_rate": 0.0009986347281477257,
"loss": 0.5273,
"num_input_tokens_seen": 370584,
"step": 1300
},
{
"epoch": 2.4347014925373136,
"grad_norm": 0.25431835651397705,
"learning_rate": 0.0009985739505534437,
"loss": 0.6897,
"num_input_tokens_seen": 371992,
"step": 1305
},
{
"epoch": 2.4440298507462686,
"grad_norm": 0.28907614946365356,
"learning_rate": 0.000998511851375526,
"loss": 0.4778,
"num_input_tokens_seen": 373304,
"step": 1310
},
{
"epoch": 2.453358208955224,
"grad_norm": 0.2507428228855133,
"learning_rate": 0.000998448430778581,
"loss": 0.8246,
"num_input_tokens_seen": 374776,
"step": 1315
},
{
"epoch": 2.4626865671641793,
"grad_norm": 0.24830341339111328,
"learning_rate": 0.0009983836889307196,
"loss": 0.6537,
"num_input_tokens_seen": 376248,
"step": 1320
},
{
"epoch": 2.4720149253731343,
"grad_norm": 0.2691262364387512,
"learning_rate": 0.0009983176260035544,
"loss": 0.5673,
"num_input_tokens_seen": 377560,
"step": 1325
},
{
"epoch": 2.4813432835820897,
"grad_norm": 0.44198179244995117,
"learning_rate": 0.0009982502421722005,
"loss": 0.7322,
"num_input_tokens_seen": 378904,
"step": 1330
},
{
"epoch": 2.4906716417910446,
"grad_norm": 0.3275417983531952,
"learning_rate": 0.0009981815376152736,
"loss": 0.6478,
"num_input_tokens_seen": 380312,
"step": 1335
},
{
"epoch": 2.5,
"grad_norm": 0.2878626585006714,
"learning_rate": 0.000998111512514891,
"loss": 0.5703,
"num_input_tokens_seen": 381784,
"step": 1340
},
{
"epoch": 2.5093283582089554,
"grad_norm": 0.15107418596744537,
"learning_rate": 0.0009980401670566705,
"loss": 0.3825,
"num_input_tokens_seen": 383256,
"step": 1345
},
{
"epoch": 2.5186567164179103,
"grad_norm": 0.23873454332351685,
"learning_rate": 0.0009979675014297293,
"loss": 0.7657,
"num_input_tokens_seen": 384536,
"step": 1350
},
{
"epoch": 2.5279850746268657,
"grad_norm": 0.22512106597423553,
"learning_rate": 0.0009978935158266838,
"loss": 0.5868,
"num_input_tokens_seen": 385880,
"step": 1355
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.1494188755750656,
"learning_rate": 0.00099781821044365,
"loss": 0.5919,
"num_input_tokens_seen": 387576,
"step": 1360
},
{
"epoch": 2.546641791044776,
"grad_norm": 0.29194745421409607,
"learning_rate": 0.0009977415854802419,
"loss": 0.652,
"num_input_tokens_seen": 389112,
"step": 1365
},
{
"epoch": 2.5559701492537314,
"grad_norm": 0.25149044394493103,
"learning_rate": 0.000997663641139571,
"loss": 0.5783,
"num_input_tokens_seen": 390456,
"step": 1370
},
{
"epoch": 2.5652985074626864,
"grad_norm": 0.2535479962825775,
"learning_rate": 0.0009975843776282472,
"loss": 0.6898,
"num_input_tokens_seen": 392024,
"step": 1375
},
{
"epoch": 2.574626865671642,
"grad_norm": 0.3264197111129761,
"learning_rate": 0.0009975037951563761,
"loss": 0.6649,
"num_input_tokens_seen": 393176,
"step": 1380
},
{
"epoch": 2.583955223880597,
"grad_norm": 0.15236614644527435,
"learning_rate": 0.00099742189393756,
"loss": 0.4409,
"num_input_tokens_seen": 394680,
"step": 1385
},
{
"epoch": 2.593283582089552,
"grad_norm": 0.2783982455730438,
"learning_rate": 0.0009973386741888963,
"loss": 0.6724,
"num_input_tokens_seen": 396024,
"step": 1390
},
{
"epoch": 2.6026119402985075,
"grad_norm": 0.3226190507411957,
"learning_rate": 0.0009972541361309782,
"loss": 0.6453,
"num_input_tokens_seen": 397560,
"step": 1395
},
{
"epoch": 2.611940298507463,
"grad_norm": 0.3189884424209595,
"learning_rate": 0.000997168279987893,
"loss": 0.6401,
"num_input_tokens_seen": 398776,
"step": 1400
},
{
"epoch": 2.621268656716418,
"grad_norm": 0.1870376318693161,
"learning_rate": 0.000997081105987222,
"loss": 0.5131,
"num_input_tokens_seen": 400376,
"step": 1405
},
{
"epoch": 2.6305970149253732,
"grad_norm": 0.2934994697570801,
"learning_rate": 0.0009969926143600396,
"loss": 0.773,
"num_input_tokens_seen": 401624,
"step": 1410
},
{
"epoch": 2.6399253731343286,
"grad_norm": 0.27938956022262573,
"learning_rate": 0.0009969028053409131,
"loss": 0.7983,
"num_input_tokens_seen": 402968,
"step": 1415
},
{
"epoch": 2.6492537313432836,
"grad_norm": 0.3684418797492981,
"learning_rate": 0.0009968116791679014,
"loss": 0.7426,
"num_input_tokens_seen": 404440,
"step": 1420
},
{
"epoch": 2.658582089552239,
"grad_norm": 0.21254830062389374,
"learning_rate": 0.0009967192360825557,
"loss": 0.5771,
"num_input_tokens_seen": 405752,
"step": 1425
},
{
"epoch": 2.667910447761194,
"grad_norm": 0.3110397160053253,
"learning_rate": 0.000996625476329917,
"loss": 0.6822,
"num_input_tokens_seen": 407064,
"step": 1430
},
{
"epoch": 2.6772388059701493,
"grad_norm": 0.38691264390945435,
"learning_rate": 0.000996530400158517,
"loss": 0.5575,
"num_input_tokens_seen": 408536,
"step": 1435
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.28269466757774353,
"learning_rate": 0.0009964340078203765,
"loss": 0.817,
"num_input_tokens_seen": 409912,
"step": 1440
},
{
"epoch": 2.6958955223880596,
"grad_norm": 0.28892797231674194,
"learning_rate": 0.0009963362995710056,
"loss": 0.7048,
"num_input_tokens_seen": 411192,
"step": 1445
},
{
"epoch": 2.705223880597015,
"grad_norm": 0.4032808840274811,
"learning_rate": 0.0009962372756694023,
"loss": 0.5718,
"num_input_tokens_seen": 412632,
"step": 1450
},
{
"epoch": 2.71455223880597,
"grad_norm": 0.28502070903778076,
"learning_rate": 0.0009961369363780514,
"loss": 0.6646,
"num_input_tokens_seen": 413944,
"step": 1455
},
{
"epoch": 2.7238805970149254,
"grad_norm": 0.3142834007740021,
"learning_rate": 0.0009960352819629258,
"loss": 0.7401,
"num_input_tokens_seen": 415352,
"step": 1460
},
{
"epoch": 2.7332089552238807,
"grad_norm": 0.24872903525829315,
"learning_rate": 0.000995932312693483,
"loss": 0.6132,
"num_input_tokens_seen": 416536,
"step": 1465
},
{
"epoch": 2.7425373134328357,
"grad_norm": 0.2512160539627075,
"learning_rate": 0.0009958280288426668,
"loss": 0.6953,
"num_input_tokens_seen": 418040,
"step": 1470
},
{
"epoch": 2.751865671641791,
"grad_norm": 0.2220417857170105,
"learning_rate": 0.0009957224306869053,
"loss": 0.5511,
"num_input_tokens_seen": 419448,
"step": 1475
},
{
"epoch": 2.7611940298507465,
"grad_norm": 0.22836509346961975,
"learning_rate": 0.00099561551850611,
"loss": 0.5046,
"num_input_tokens_seen": 420856,
"step": 1480
},
{
"epoch": 2.7705223880597014,
"grad_norm": 0.2554475963115692,
"learning_rate": 0.0009955072925836765,
"loss": 0.7276,
"num_input_tokens_seen": 422264,
"step": 1485
},
{
"epoch": 2.779850746268657,
"grad_norm": 0.3000124990940094,
"learning_rate": 0.0009953977532064819,
"loss": 0.7986,
"num_input_tokens_seen": 423800,
"step": 1490
},
{
"epoch": 2.789179104477612,
"grad_norm": 0.34656092524528503,
"learning_rate": 0.0009952869006648853,
"loss": 0.5285,
"num_input_tokens_seen": 425496,
"step": 1495
},
{
"epoch": 2.798507462686567,
"grad_norm": 0.23471760749816895,
"learning_rate": 0.0009951747352527265,
"loss": 0.5433,
"num_input_tokens_seen": 426872,
"step": 1500
},
{
"epoch": 2.8078358208955225,
"grad_norm": 0.2542097270488739,
"learning_rate": 0.0009950612572673255,
"loss": 0.8049,
"num_input_tokens_seen": 428248,
"step": 1505
},
{
"epoch": 2.8171641791044775,
"grad_norm": 0.2707230746746063,
"learning_rate": 0.0009949464670094815,
"loss": 0.6151,
"num_input_tokens_seen": 429624,
"step": 1510
},
{
"epoch": 2.826492537313433,
"grad_norm": 0.6066590547561646,
"learning_rate": 0.0009948303647834722,
"loss": 0.6492,
"num_input_tokens_seen": 431000,
"step": 1515
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.24002781510353088,
"learning_rate": 0.000994712950897053,
"loss": 0.5351,
"num_input_tokens_seen": 432760,
"step": 1520
},
{
"epoch": 2.845149253731343,
"grad_norm": 0.2488313466310501,
"learning_rate": 0.000994594225661456,
"loss": 0.7667,
"num_input_tokens_seen": 434232,
"step": 1525
},
{
"epoch": 2.8544776119402986,
"grad_norm": 0.22874920070171356,
"learning_rate": 0.0009944741893913895,
"loss": 0.8033,
"num_input_tokens_seen": 435608,
"step": 1530
},
{
"epoch": 2.8638059701492535,
"grad_norm": 0.1811210960149765,
"learning_rate": 0.0009943528424050368,
"loss": 0.5726,
"num_input_tokens_seen": 436984,
"step": 1535
},
{
"epoch": 2.873134328358209,
"grad_norm": 0.1760341376066208,
"learning_rate": 0.000994230185024056,
"loss": 0.6519,
"num_input_tokens_seen": 438424,
"step": 1540
},
{
"epoch": 2.8824626865671643,
"grad_norm": 0.12924505770206451,
"learning_rate": 0.000994106217573578,
"loss": 0.4132,
"num_input_tokens_seen": 439928,
"step": 1545
},
{
"epoch": 2.8917910447761193,
"grad_norm": 0.3055063784122467,
"learning_rate": 0.0009939809403822068,
"loss": 0.5335,
"num_input_tokens_seen": 441496,
"step": 1550
},
{
"epoch": 2.9011194029850746,
"grad_norm": 0.2536128759384155,
"learning_rate": 0.0009938543537820184,
"loss": 0.5995,
"num_input_tokens_seen": 442872,
"step": 1555
},
{
"epoch": 2.91044776119403,
"grad_norm": 0.20395897328853607,
"learning_rate": 0.0009937264581085592,
"loss": 0.5315,
"num_input_tokens_seen": 444312,
"step": 1560
},
{
"epoch": 2.919776119402985,
"grad_norm": 0.20518085360527039,
"learning_rate": 0.0009935972537008456,
"loss": 0.5239,
"num_input_tokens_seen": 445592,
"step": 1565
},
{
"epoch": 2.9291044776119404,
"grad_norm": 0.21364372968673706,
"learning_rate": 0.0009934667409013634,
"loss": 0.8367,
"num_input_tokens_seen": 446968,
"step": 1570
},
{
"epoch": 2.9384328358208958,
"grad_norm": 0.24295471608638763,
"learning_rate": 0.0009933349200560665,
"loss": 0.4123,
"num_input_tokens_seen": 448408,
"step": 1575
},
{
"epoch": 2.9477611940298507,
"grad_norm": 0.13642080128192902,
"learning_rate": 0.0009932017915143757,
"loss": 0.6884,
"num_input_tokens_seen": 449816,
"step": 1580
},
{
"epoch": 2.957089552238806,
"grad_norm": 0.28196004033088684,
"learning_rate": 0.000993067355629179,
"loss": 0.4606,
"num_input_tokens_seen": 451128,
"step": 1585
},
{
"epoch": 2.966417910447761,
"grad_norm": 0.3152237832546234,
"learning_rate": 0.0009929316127568288,
"loss": 0.6885,
"num_input_tokens_seen": 452728,
"step": 1590
},
{
"epoch": 2.9757462686567164,
"grad_norm": 0.1925637274980545,
"learning_rate": 0.000992794563257143,
"loss": 0.5397,
"num_input_tokens_seen": 454328,
"step": 1595
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.3836488723754883,
"learning_rate": 0.0009926562074934018,
"loss": 0.4762,
"num_input_tokens_seen": 455800,
"step": 1600
},
{
"epoch": 2.9944029850746268,
"grad_norm": 0.17350491881370544,
"learning_rate": 0.000992516545832349,
"loss": 0.4186,
"num_input_tokens_seen": 457560,
"step": 1605
},
{
"epoch": 3.0,
"eval_loss": 0.6456666588783264,
"eval_runtime": 4.1922,
"eval_samples_per_second": 56.772,
"eval_steps_per_second": 14.312,
"num_input_tokens_seen": 458160,
"step": 1608
},
{
"epoch": 3.003731343283582,
"grad_norm": 0.23106440901756287,
"learning_rate": 0.0009923755786441896,
"loss": 0.6499,
"num_input_tokens_seen": 458736,
"step": 1610
},
{
"epoch": 3.013059701492537,
"grad_norm": 0.155120387673378,
"learning_rate": 0.0009922333063025893,
"loss": 0.6578,
"num_input_tokens_seen": 460080,
"step": 1615
},
{
"epoch": 3.0223880597014925,
"grad_norm": 0.23484502732753754,
"learning_rate": 0.0009920897291846732,
"loss": 0.5039,
"num_input_tokens_seen": 461488,
"step": 1620
},
{
"epoch": 3.031716417910448,
"grad_norm": 0.19638964533805847,
"learning_rate": 0.0009919448476710248,
"loss": 0.5332,
"num_input_tokens_seen": 462992,
"step": 1625
},
{
"epoch": 3.041044776119403,
"grad_norm": 0.1650891751050949,
"learning_rate": 0.0009917986621456856,
"loss": 0.6471,
"num_input_tokens_seen": 464304,
"step": 1630
},
{
"epoch": 3.050373134328358,
"grad_norm": 0.1567000150680542,
"learning_rate": 0.000991651172996154,
"loss": 0.318,
"num_input_tokens_seen": 465968,
"step": 1635
},
{
"epoch": 3.0597014925373136,
"grad_norm": 0.23875364661216736,
"learning_rate": 0.0009915023806133833,
"loss": 0.7304,
"num_input_tokens_seen": 467344,
"step": 1640
},
{
"epoch": 3.0690298507462686,
"grad_norm": 0.32178816199302673,
"learning_rate": 0.0009913522853917812,
"loss": 0.4617,
"num_input_tokens_seen": 468816,
"step": 1645
},
{
"epoch": 3.078358208955224,
"grad_norm": 0.24563711881637573,
"learning_rate": 0.0009912008877292096,
"loss": 0.6088,
"num_input_tokens_seen": 470416,
"step": 1650
},
{
"epoch": 3.0876865671641793,
"grad_norm": 0.26710382103919983,
"learning_rate": 0.0009910481880269825,
"loss": 0.4255,
"num_input_tokens_seen": 471760,
"step": 1655
},
{
"epoch": 3.0970149253731343,
"grad_norm": 0.25559887290000916,
"learning_rate": 0.0009908941866898647,
"loss": 0.4412,
"num_input_tokens_seen": 473104,
"step": 1660
},
{
"epoch": 3.1063432835820897,
"grad_norm": 0.2960032820701599,
"learning_rate": 0.0009907388841260722,
"loss": 0.6419,
"num_input_tokens_seen": 474320,
"step": 1665
},
{
"epoch": 3.1156716417910446,
"grad_norm": 0.25373607873916626,
"learning_rate": 0.0009905822807472699,
"loss": 0.5873,
"num_input_tokens_seen": 475632,
"step": 1670
},
{
"epoch": 3.125,
"grad_norm": 0.2537466287612915,
"learning_rate": 0.0009904243769685702,
"loss": 0.5751,
"num_input_tokens_seen": 477168,
"step": 1675
},
{
"epoch": 3.1343283582089554,
"grad_norm": 0.25388938188552856,
"learning_rate": 0.0009902651732085332,
"loss": 0.8109,
"num_input_tokens_seen": 478544,
"step": 1680
},
{
"epoch": 3.1436567164179103,
"grad_norm": 0.13722610473632812,
"learning_rate": 0.0009901046698891649,
"loss": 0.4441,
"num_input_tokens_seen": 479952,
"step": 1685
},
{
"epoch": 3.1529850746268657,
"grad_norm": 0.23463381826877594,
"learning_rate": 0.0009899428674359154,
"loss": 0.4207,
"num_input_tokens_seen": 481392,
"step": 1690
},
{
"epoch": 3.1623134328358207,
"grad_norm": 0.19632478058338165,
"learning_rate": 0.000989779766277679,
"loss": 0.4384,
"num_input_tokens_seen": 482992,
"step": 1695
},
{
"epoch": 3.171641791044776,
"grad_norm": 0.2727997899055481,
"learning_rate": 0.0009896153668467926,
"loss": 0.5026,
"num_input_tokens_seen": 484304,
"step": 1700
},
{
"epoch": 3.1809701492537314,
"grad_norm": 1.4269156455993652,
"learning_rate": 0.0009894496695790345,
"loss": 0.5492,
"num_input_tokens_seen": 485808,
"step": 1705
},
{
"epoch": 3.1902985074626864,
"grad_norm": 0.25974732637405396,
"learning_rate": 0.0009892826749136224,
"loss": 0.6403,
"num_input_tokens_seen": 487344,
"step": 1710
},
{
"epoch": 3.199626865671642,
"grad_norm": 0.2388739287853241,
"learning_rate": 0.000989114383293214,
"loss": 0.6338,
"num_input_tokens_seen": 488848,
"step": 1715
},
{
"epoch": 3.208955223880597,
"grad_norm": 0.21609841287136078,
"learning_rate": 0.0009889447951639044,
"loss": 0.6226,
"num_input_tokens_seen": 490288,
"step": 1720
},
{
"epoch": 3.218283582089552,
"grad_norm": 0.38346755504608154,
"learning_rate": 0.0009887739109752255,
"loss": 0.7313,
"num_input_tokens_seen": 491728,
"step": 1725
},
{
"epoch": 3.2276119402985075,
"grad_norm": 0.20293354988098145,
"learning_rate": 0.0009886017311801448,
"loss": 0.3952,
"num_input_tokens_seen": 493200,
"step": 1730
},
{
"epoch": 3.236940298507463,
"grad_norm": 0.26398634910583496,
"learning_rate": 0.000988428256235064,
"loss": 0.7915,
"num_input_tokens_seen": 494480,
"step": 1735
},
{
"epoch": 3.246268656716418,
"grad_norm": 0.505710244178772,
"learning_rate": 0.0009882534865998176,
"loss": 0.4743,
"num_input_tokens_seen": 495696,
"step": 1740
},
{
"epoch": 3.2555970149253732,
"grad_norm": 0.22293971478939056,
"learning_rate": 0.0009880774227376727,
"loss": 0.5284,
"num_input_tokens_seen": 497136,
"step": 1745
},
{
"epoch": 3.264925373134328,
"grad_norm": 0.2277284413576126,
"learning_rate": 0.0009879000651153262,
"loss": 0.3545,
"num_input_tokens_seen": 498672,
"step": 1750
},
{
"epoch": 3.2742537313432836,
"grad_norm": 0.24438872933387756,
"learning_rate": 0.0009877214142029053,
"loss": 0.595,
"num_input_tokens_seen": 500016,
"step": 1755
},
{
"epoch": 3.283582089552239,
"grad_norm": 0.2326008379459381,
"learning_rate": 0.0009875414704739645,
"loss": 0.5767,
"num_input_tokens_seen": 501360,
"step": 1760
},
{
"epoch": 3.292910447761194,
"grad_norm": 0.22996485233306885,
"learning_rate": 0.0009873602344054855,
"loss": 0.6522,
"num_input_tokens_seen": 502768,
"step": 1765
},
{
"epoch": 3.3022388059701493,
"grad_norm": 0.24486055970191956,
"learning_rate": 0.0009871777064778759,
"loss": 0.5828,
"num_input_tokens_seen": 504176,
"step": 1770
},
{
"epoch": 3.3115671641791042,
"grad_norm": 0.24386072158813477,
"learning_rate": 0.0009869938871749674,
"loss": 0.4539,
"num_input_tokens_seen": 505424,
"step": 1775
},
{
"epoch": 3.3208955223880596,
"grad_norm": 0.1616196483373642,
"learning_rate": 0.0009868087769840151,
"loss": 0.4915,
"num_input_tokens_seen": 507024,
"step": 1780
},
{
"epoch": 3.330223880597015,
"grad_norm": 0.18312668800354004,
"learning_rate": 0.0009866223763956954,
"loss": 0.8632,
"num_input_tokens_seen": 508432,
"step": 1785
},
{
"epoch": 3.33955223880597,
"grad_norm": 0.28520822525024414,
"learning_rate": 0.0009864346859041057,
"loss": 0.728,
"num_input_tokens_seen": 510000,
"step": 1790
},
{
"epoch": 3.3488805970149254,
"grad_norm": 0.19970983266830444,
"learning_rate": 0.0009862457060067617,
"loss": 0.9002,
"num_input_tokens_seen": 511376,
"step": 1795
},
{
"epoch": 3.3582089552238807,
"grad_norm": 0.23394840955734253,
"learning_rate": 0.0009860554372045985,
"loss": 0.5357,
"num_input_tokens_seen": 512688,
"step": 1800
},
{
"epoch": 3.3675373134328357,
"grad_norm": 0.23587185144424438,
"learning_rate": 0.000985863880001966,
"loss": 0.7726,
"num_input_tokens_seen": 514000,
"step": 1805
},
{
"epoch": 3.376865671641791,
"grad_norm": 0.25410711765289307,
"learning_rate": 0.0009856710349066308,
"loss": 0.5922,
"num_input_tokens_seen": 515312,
"step": 1810
},
{
"epoch": 3.3861940298507465,
"grad_norm": 0.3701179623603821,
"learning_rate": 0.000985476902429772,
"loss": 0.7119,
"num_input_tokens_seen": 516528,
"step": 1815
},
{
"epoch": 3.3955223880597014,
"grad_norm": 0.19945183396339417,
"learning_rate": 0.0009852814830859826,
"loss": 0.5055,
"num_input_tokens_seen": 517936,
"step": 1820
},
{
"epoch": 3.404850746268657,
"grad_norm": 0.40914735198020935,
"learning_rate": 0.0009850847773932656,
"loss": 0.4616,
"num_input_tokens_seen": 519376,
"step": 1825
},
{
"epoch": 3.4141791044776117,
"grad_norm": 0.244802787899971,
"learning_rate": 0.000984886785873034,
"loss": 0.6121,
"num_input_tokens_seen": 521168,
"step": 1830
},
{
"epoch": 3.423507462686567,
"grad_norm": 0.48571139574050903,
"learning_rate": 0.00098468750905011,
"loss": 0.9497,
"num_input_tokens_seen": 522512,
"step": 1835
},
{
"epoch": 3.4328358208955225,
"grad_norm": 0.284146249294281,
"learning_rate": 0.0009844869474527214,
"loss": 0.7253,
"num_input_tokens_seen": 523888,
"step": 1840
},
{
"epoch": 3.4421641791044775,
"grad_norm": 0.2026161402463913,
"learning_rate": 0.0009842851016125028,
"loss": 0.6453,
"num_input_tokens_seen": 525360,
"step": 1845
},
{
"epoch": 3.451492537313433,
"grad_norm": 0.26175108551979065,
"learning_rate": 0.0009840819720644922,
"loss": 0.6997,
"num_input_tokens_seen": 526928,
"step": 1850
},
{
"epoch": 3.4608208955223883,
"grad_norm": 0.18297512829303741,
"learning_rate": 0.0009838775593471309,
"loss": 0.561,
"num_input_tokens_seen": 528272,
"step": 1855
},
{
"epoch": 3.470149253731343,
"grad_norm": 0.24992622435092926,
"learning_rate": 0.0009836718640022612,
"loss": 0.3125,
"num_input_tokens_seen": 529584,
"step": 1860
},
{
"epoch": 3.4794776119402986,
"grad_norm": 0.3266121745109558,
"learning_rate": 0.0009834648865751252,
"loss": 0.6182,
"num_input_tokens_seen": 530896,
"step": 1865
},
{
"epoch": 3.4888059701492535,
"grad_norm": 0.24236100912094116,
"learning_rate": 0.0009832566276143642,
"loss": 0.6259,
"num_input_tokens_seen": 532304,
"step": 1870
},
{
"epoch": 3.498134328358209,
"grad_norm": 0.20327317714691162,
"learning_rate": 0.0009830470876720152,
"loss": 0.5588,
"num_input_tokens_seen": 533808,
"step": 1875
},
{
"epoch": 3.5074626865671643,
"grad_norm": 0.18417012691497803,
"learning_rate": 0.000982836267303512,
"loss": 0.5219,
"num_input_tokens_seen": 535216,
"step": 1880
},
{
"epoch": 3.5167910447761193,
"grad_norm": 0.32585784792900085,
"learning_rate": 0.0009826241670676816,
"loss": 0.5187,
"num_input_tokens_seen": 536720,
"step": 1885
},
{
"epoch": 3.5261194029850746,
"grad_norm": 0.31505173444747925,
"learning_rate": 0.0009824107875267443,
"loss": 0.6881,
"num_input_tokens_seen": 538064,
"step": 1890
},
{
"epoch": 3.53544776119403,
"grad_norm": 0.3329700529575348,
"learning_rate": 0.0009821961292463108,
"loss": 0.4131,
"num_input_tokens_seen": 539600,
"step": 1895
},
{
"epoch": 3.544776119402985,
"grad_norm": 0.1608249843120575,
"learning_rate": 0.0009819801927953816,
"loss": 0.6528,
"num_input_tokens_seen": 540976,
"step": 1900
},
{
"epoch": 3.5541044776119404,
"grad_norm": 0.24903835356235504,
"learning_rate": 0.0009817629787463456,
"loss": 0.6215,
"num_input_tokens_seen": 542416,
"step": 1905
},
{
"epoch": 3.5634328358208958,
"grad_norm": 0.21984891593456268,
"learning_rate": 0.0009815444876749779,
"loss": 0.5424,
"num_input_tokens_seen": 543792,
"step": 1910
},
{
"epoch": 3.5727611940298507,
"grad_norm": 0.37518739700317383,
"learning_rate": 0.0009813247201604389,
"loss": 0.6319,
"num_input_tokens_seen": 545328,
"step": 1915
},
{
"epoch": 3.582089552238806,
"grad_norm": 0.16793659329414368,
"learning_rate": 0.0009811036767852725,
"loss": 0.4634,
"num_input_tokens_seen": 546672,
"step": 1920
},
{
"epoch": 3.591417910447761,
"grad_norm": 0.17305253446102142,
"learning_rate": 0.000980881358135404,
"loss": 0.4199,
"num_input_tokens_seen": 548080,
"step": 1925
},
{
"epoch": 3.6007462686567164,
"grad_norm": 0.13368631899356842,
"learning_rate": 0.0009806577648001397,
"loss": 0.4275,
"num_input_tokens_seen": 549680,
"step": 1930
},
{
"epoch": 3.6100746268656714,
"grad_norm": 0.3820650577545166,
"learning_rate": 0.0009804328973721645,
"loss": 0.6723,
"num_input_tokens_seen": 550992,
"step": 1935
},
{
"epoch": 3.6194029850746268,
"grad_norm": 0.18812960386276245,
"learning_rate": 0.0009802067564475413,
"loss": 0.7015,
"num_input_tokens_seen": 552496,
"step": 1940
},
{
"epoch": 3.628731343283582,
"grad_norm": 0.31182393431663513,
"learning_rate": 0.000979979342625707,
"loss": 0.5029,
"num_input_tokens_seen": 553840,
"step": 1945
},
{
"epoch": 3.638059701492537,
"grad_norm": 0.1738317906856537,
"learning_rate": 0.0009797506565094745,
"loss": 0.733,
"num_input_tokens_seen": 555248,
"step": 1950
},
{
"epoch": 3.6473880597014925,
"grad_norm": 0.2425081580877304,
"learning_rate": 0.000979520698705028,
"loss": 0.5905,
"num_input_tokens_seen": 556624,
"step": 1955
},
{
"epoch": 3.656716417910448,
"grad_norm": 0.23325420916080475,
"learning_rate": 0.000979289469821923,
"loss": 1.0098,
"num_input_tokens_seen": 558096,
"step": 1960
},
{
"epoch": 3.666044776119403,
"grad_norm": 0.3119276463985443,
"learning_rate": 0.0009790569704730843,
"loss": 0.7166,
"num_input_tokens_seen": 559376,
"step": 1965
},
{
"epoch": 3.675373134328358,
"grad_norm": 0.26881784200668335,
"learning_rate": 0.0009788232012748043,
"loss": 0.7225,
"num_input_tokens_seen": 560784,
"step": 1970
},
{
"epoch": 3.6847014925373136,
"grad_norm": 0.3153090476989746,
"learning_rate": 0.0009785881628467412,
"loss": 0.3893,
"num_input_tokens_seen": 562416,
"step": 1975
},
{
"epoch": 3.6940298507462686,
"grad_norm": 0.26809364557266235,
"learning_rate": 0.0009783518558119182,
"loss": 0.6695,
"num_input_tokens_seen": 563920,
"step": 1980
},
{
"epoch": 3.703358208955224,
"grad_norm": 0.3261995017528534,
"learning_rate": 0.0009781142807967205,
"loss": 0.5614,
"num_input_tokens_seen": 565424,
"step": 1985
},
{
"epoch": 3.7126865671641793,
"grad_norm": 0.23048445582389832,
"learning_rate": 0.0009778754384308947,
"loss": 0.4495,
"num_input_tokens_seen": 566896,
"step": 1990
},
{
"epoch": 3.7220149253731343,
"grad_norm": 0.2346111238002777,
"learning_rate": 0.000977635329347547,
"loss": 0.7348,
"num_input_tokens_seen": 568176,
"step": 1995
},
{
"epoch": 3.7313432835820897,
"grad_norm": 0.4382629990577698,
"learning_rate": 0.000977393954183141,
"loss": 0.7043,
"num_input_tokens_seen": 569616,
"step": 2000
},
{
"epoch": 3.7406716417910446,
"grad_norm": 0.28776127099990845,
"learning_rate": 0.0009771513135774965,
"loss": 0.3675,
"num_input_tokens_seen": 571024,
"step": 2005
},
{
"epoch": 3.75,
"grad_norm": 0.24283809959888458,
"learning_rate": 0.0009769074081737877,
"loss": 0.7819,
"num_input_tokens_seen": 572432,
"step": 2010
},
{
"epoch": 3.7593283582089554,
"grad_norm": 0.299737811088562,
"learning_rate": 0.000976662238618541,
"loss": 0.5218,
"num_input_tokens_seen": 573712,
"step": 2015
},
{
"epoch": 3.7686567164179103,
"grad_norm": 0.26282361149787903,
"learning_rate": 0.0009764158055616346,
"loss": 0.6876,
"num_input_tokens_seen": 575184,
"step": 2020
},
{
"epoch": 3.7779850746268657,
"grad_norm": 0.13650095462799072,
"learning_rate": 0.0009761681096562949,
"loss": 0.5098,
"num_input_tokens_seen": 576688,
"step": 2025
},
{
"epoch": 3.7873134328358207,
"grad_norm": 0.2913561463356018,
"learning_rate": 0.0009759191515590963,
"loss": 0.8325,
"num_input_tokens_seen": 578000,
"step": 2030
},
{
"epoch": 3.796641791044776,
"grad_norm": 0.23576973378658295,
"learning_rate": 0.0009756689319299592,
"loss": 0.5476,
"num_input_tokens_seen": 579632,
"step": 2035
},
{
"epoch": 3.8059701492537314,
"grad_norm": 0.14105857908725739,
"learning_rate": 0.0009754174514321472,
"loss": 0.5916,
"num_input_tokens_seen": 581104,
"step": 2040
},
{
"epoch": 3.8152985074626864,
"grad_norm": 0.3116491436958313,
"learning_rate": 0.0009751647107322667,
"loss": 0.8013,
"num_input_tokens_seen": 582512,
"step": 2045
},
{
"epoch": 3.824626865671642,
"grad_norm": 0.3280572295188904,
"learning_rate": 0.0009749107105002646,
"loss": 0.5183,
"num_input_tokens_seen": 583920,
"step": 2050
},
{
"epoch": 3.833955223880597,
"grad_norm": 0.2682742178440094,
"learning_rate": 0.000974655451409426,
"loss": 0.7085,
"num_input_tokens_seen": 585328,
"step": 2055
},
{
"epoch": 3.843283582089552,
"grad_norm": 0.18694724142551422,
"learning_rate": 0.0009743989341363731,
"loss": 0.6983,
"num_input_tokens_seen": 586608,
"step": 2060
},
{
"epoch": 3.8526119402985075,
"grad_norm": 0.19222024083137512,
"learning_rate": 0.0009741411593610635,
"loss": 0.5223,
"num_input_tokens_seen": 588080,
"step": 2065
},
{
"epoch": 3.861940298507463,
"grad_norm": 0.19455499947071075,
"learning_rate": 0.0009738821277667878,
"loss": 0.4412,
"num_input_tokens_seen": 589552,
"step": 2070
},
{
"epoch": 3.871268656716418,
"grad_norm": 0.14917276799678802,
"learning_rate": 0.0009736218400401682,
"loss": 0.5749,
"num_input_tokens_seen": 590928,
"step": 2075
},
{
"epoch": 3.8805970149253732,
"grad_norm": 0.1883125603199005,
"learning_rate": 0.0009733602968711565,
"loss": 0.6998,
"num_input_tokens_seen": 592560,
"step": 2080
},
{
"epoch": 3.8899253731343286,
"grad_norm": 0.4482322335243225,
"learning_rate": 0.0009730974989530321,
"loss": 0.5617,
"num_input_tokens_seen": 594032,
"step": 2085
},
{
"epoch": 3.8992537313432836,
"grad_norm": 0.22975589334964752,
"learning_rate": 0.000972833446982401,
"loss": 0.6922,
"num_input_tokens_seen": 595472,
"step": 2090
},
{
"epoch": 3.908582089552239,
"grad_norm": 0.5198706388473511,
"learning_rate": 0.0009725681416591927,
"loss": 0.7359,
"num_input_tokens_seen": 596976,
"step": 2095
},
{
"epoch": 3.917910447761194,
"grad_norm": 0.2125498652458191,
"learning_rate": 0.0009723015836866595,
"loss": 0.5097,
"num_input_tokens_seen": 598256,
"step": 2100
},
{
"epoch": 3.9272388059701493,
"grad_norm": 0.1989123821258545,
"learning_rate": 0.0009720337737713739,
"loss": 0.5684,
"num_input_tokens_seen": 599568,
"step": 2105
},
{
"epoch": 3.9365671641791042,
"grad_norm": 0.2720445990562439,
"learning_rate": 0.000971764712623227,
"loss": 0.5866,
"num_input_tokens_seen": 601008,
"step": 2110
},
{
"epoch": 3.9458955223880596,
"grad_norm": 0.23459851741790771,
"learning_rate": 0.0009714944009554262,
"loss": 0.5404,
"num_input_tokens_seen": 602256,
"step": 2115
},
{
"epoch": 3.955223880597015,
"grad_norm": 0.17958712577819824,
"learning_rate": 0.0009712228394844945,
"loss": 0.4168,
"num_input_tokens_seen": 603760,
"step": 2120
},
{
"epoch": 3.96455223880597,
"grad_norm": 0.13713163137435913,
"learning_rate": 0.0009709500289302673,
"loss": 0.5269,
"num_input_tokens_seen": 605232,
"step": 2125
},
{
"epoch": 3.9738805970149254,
"grad_norm": 0.15691228210926056,
"learning_rate": 0.0009706759700158907,
"loss": 0.5853,
"num_input_tokens_seen": 606448,
"step": 2130
},
{
"epoch": 3.9832089552238807,
"grad_norm": 0.15463533997535706,
"learning_rate": 0.0009704006634678205,
"loss": 0.3447,
"num_input_tokens_seen": 608144,
"step": 2135
},
{
"epoch": 3.9925373134328357,
"grad_norm": 0.20043453574180603,
"learning_rate": 0.0009701241100158189,
"loss": 0.5617,
"num_input_tokens_seen": 609744,
"step": 2140
},
{
"epoch": 4.0,
"eval_loss": 0.64982670545578,
"eval_runtime": 4.1877,
"eval_samples_per_second": 56.834,
"eval_steps_per_second": 14.328,
"num_input_tokens_seen": 610584,
"step": 2144
},
{
"epoch": 4.001865671641791,
"grad_norm": 0.22188594937324524,
"learning_rate": 0.0009698463103929542,
"loss": 0.4556,
"num_input_tokens_seen": 610808,
"step": 2145
},
{
"epoch": 4.0111940298507465,
"grad_norm": 0.151460200548172,
"learning_rate": 0.0009695672653355972,
"loss": 0.387,
"num_input_tokens_seen": 612280,
"step": 2150
},
{
"epoch": 4.020522388059701,
"grad_norm": 0.1626012623310089,
"learning_rate": 0.0009692869755834203,
"loss": 0.6283,
"num_input_tokens_seen": 613816,
"step": 2155
},
{
"epoch": 4.029850746268656,
"grad_norm": 0.21569664776325226,
"learning_rate": 0.0009690054418793955,
"loss": 0.3275,
"num_input_tokens_seen": 615224,
"step": 2160
},
{
"epoch": 4.039179104477612,
"grad_norm": 0.2591455280780792,
"learning_rate": 0.0009687226649697915,
"loss": 0.5641,
"num_input_tokens_seen": 616568,
"step": 2165
},
{
"epoch": 4.048507462686567,
"grad_norm": 0.24344514310359955,
"learning_rate": 0.000968438645604173,
"loss": 0.3863,
"num_input_tokens_seen": 618008,
"step": 2170
},
{
"epoch": 4.057835820895522,
"grad_norm": 0.32639652490615845,
"learning_rate": 0.0009681533845353978,
"loss": 0.5677,
"num_input_tokens_seen": 619576,
"step": 2175
},
{
"epoch": 4.067164179104478,
"grad_norm": 0.3175128400325775,
"learning_rate": 0.0009678668825196154,
"loss": 0.6541,
"num_input_tokens_seen": 620920,
"step": 2180
},
{
"epoch": 4.076492537313433,
"grad_norm": 0.327414333820343,
"learning_rate": 0.0009675791403162645,
"loss": 0.5617,
"num_input_tokens_seen": 622296,
"step": 2185
},
{
"epoch": 4.085820895522388,
"grad_norm": 0.1868485063314438,
"learning_rate": 0.0009672901586880711,
"loss": 0.2805,
"num_input_tokens_seen": 623800,
"step": 2190
},
{
"epoch": 4.095149253731344,
"grad_norm": 0.2809881567955017,
"learning_rate": 0.000966999938401047,
"loss": 0.4671,
"num_input_tokens_seen": 625208,
"step": 2195
},
{
"epoch": 4.104477611940299,
"grad_norm": 0.21550977230072021,
"learning_rate": 0.0009667084802244868,
"loss": 0.585,
"num_input_tokens_seen": 626584,
"step": 2200
},
{
"epoch": 4.1138059701492535,
"grad_norm": 0.27427732944488525,
"learning_rate": 0.0009664157849309669,
"loss": 0.6408,
"num_input_tokens_seen": 627960,
"step": 2205
},
{
"epoch": 4.123134328358209,
"grad_norm": 0.23494300246238708,
"learning_rate": 0.0009661218532963426,
"loss": 0.5565,
"num_input_tokens_seen": 629368,
"step": 2210
},
{
"epoch": 4.132462686567164,
"grad_norm": 0.26595717668533325,
"learning_rate": 0.0009658266860997465,
"loss": 0.6724,
"num_input_tokens_seen": 630776,
"step": 2215
},
{
"epoch": 4.141791044776119,
"grad_norm": 0.18775132298469543,
"learning_rate": 0.0009655302841235865,
"loss": 0.4481,
"num_input_tokens_seen": 632120,
"step": 2220
},
{
"epoch": 4.151119402985074,
"grad_norm": 0.13259465992450714,
"learning_rate": 0.0009652326481535434,
"loss": 0.4325,
"num_input_tokens_seen": 633784,
"step": 2225
},
{
"epoch": 4.16044776119403,
"grad_norm": 0.1564403623342514,
"learning_rate": 0.0009649337789785688,
"loss": 0.6364,
"num_input_tokens_seen": 635224,
"step": 2230
},
{
"epoch": 4.169776119402985,
"grad_norm": 0.326425701379776,
"learning_rate": 0.000964633677390884,
"loss": 0.5656,
"num_input_tokens_seen": 636504,
"step": 2235
},
{
"epoch": 4.17910447761194,
"grad_norm": 0.18858228623867035,
"learning_rate": 0.0009643323441859757,
"loss": 0.4936,
"num_input_tokens_seen": 638232,
"step": 2240
},
{
"epoch": 4.188432835820896,
"grad_norm": 0.21565784513950348,
"learning_rate": 0.0009640297801625968,
"loss": 0.595,
"num_input_tokens_seen": 639608,
"step": 2245
},
{
"epoch": 4.197761194029851,
"grad_norm": 0.15167202055454254,
"learning_rate": 0.0009637259861227616,
"loss": 0.5027,
"num_input_tokens_seen": 641112,
"step": 2250
},
{
"epoch": 4.207089552238806,
"grad_norm": 0.24145975708961487,
"learning_rate": 0.0009634209628717455,
"loss": 0.577,
"num_input_tokens_seen": 642488,
"step": 2255
},
{
"epoch": 4.2164179104477615,
"grad_norm": 0.32112327218055725,
"learning_rate": 0.000963114711218082,
"loss": 0.6006,
"num_input_tokens_seen": 643800,
"step": 2260
},
{
"epoch": 4.225746268656716,
"grad_norm": 0.3594324290752411,
"learning_rate": 0.0009628072319735606,
"loss": 0.7394,
"num_input_tokens_seen": 645304,
"step": 2265
},
{
"epoch": 4.235074626865671,
"grad_norm": 0.26933395862579346,
"learning_rate": 0.0009624985259532251,
"loss": 0.5461,
"num_input_tokens_seen": 646648,
"step": 2270
},
{
"epoch": 4.244402985074627,
"grad_norm": 0.2904312014579773,
"learning_rate": 0.000962188593975371,
"loss": 0.6195,
"num_input_tokens_seen": 648216,
"step": 2275
},
{
"epoch": 4.253731343283582,
"grad_norm": 0.12113097310066223,
"learning_rate": 0.0009618774368615432,
"loss": 0.6924,
"num_input_tokens_seen": 649592,
"step": 2280
},
{
"epoch": 4.263059701492537,
"grad_norm": 0.32047057151794434,
"learning_rate": 0.000961565055436535,
"loss": 0.468,
"num_input_tokens_seen": 651096,
"step": 2285
},
{
"epoch": 4.272388059701493,
"grad_norm": 0.2290755957365036,
"learning_rate": 0.0009612514505283838,
"loss": 0.4576,
"num_input_tokens_seen": 652536,
"step": 2290
},
{
"epoch": 4.281716417910448,
"grad_norm": 0.19104692339897156,
"learning_rate": 0.000960936622968371,
"loss": 0.7115,
"num_input_tokens_seen": 653976,
"step": 2295
},
{
"epoch": 4.291044776119403,
"grad_norm": 0.2492237538099289,
"learning_rate": 0.0009606205735910186,
"loss": 0.5699,
"num_input_tokens_seen": 655320,
"step": 2300
},
{
"epoch": 4.300373134328359,
"grad_norm": 0.19649381935596466,
"learning_rate": 0.0009603033032340874,
"loss": 0.7391,
"num_input_tokens_seen": 656760,
"step": 2305
},
{
"epoch": 4.309701492537314,
"grad_norm": 0.20256049931049347,
"learning_rate": 0.0009599848127385747,
"loss": 0.6327,
"num_input_tokens_seen": 658200,
"step": 2310
},
{
"epoch": 4.3190298507462686,
"grad_norm": 0.22700995206832886,
"learning_rate": 0.0009596651029487116,
"loss": 0.6798,
"num_input_tokens_seen": 659544,
"step": 2315
},
{
"epoch": 4.3283582089552235,
"grad_norm": 0.1987559199333191,
"learning_rate": 0.000959344174711962,
"loss": 0.5871,
"num_input_tokens_seen": 660952,
"step": 2320
},
{
"epoch": 4.337686567164179,
"grad_norm": 0.21659596264362335,
"learning_rate": 0.0009590220288790191,
"loss": 0.5206,
"num_input_tokens_seen": 662424,
"step": 2325
},
{
"epoch": 4.347014925373134,
"grad_norm": 0.20847119390964508,
"learning_rate": 0.0009586986663038035,
"loss": 0.341,
"num_input_tokens_seen": 663832,
"step": 2330
},
{
"epoch": 4.356343283582089,
"grad_norm": 0.2969812750816345,
"learning_rate": 0.0009583740878434616,
"loss": 0.6927,
"num_input_tokens_seen": 665112,
"step": 2335
},
{
"epoch": 4.365671641791045,
"grad_norm": 0.33004283905029297,
"learning_rate": 0.0009580482943583621,
"loss": 0.7178,
"num_input_tokens_seen": 666712,
"step": 2340
},
{
"epoch": 4.375,
"grad_norm": 0.2885778248310089,
"learning_rate": 0.0009577212867120946,
"loss": 0.6155,
"num_input_tokens_seen": 668248,
"step": 2345
},
{
"epoch": 4.384328358208955,
"grad_norm": 0.15386725962162018,
"learning_rate": 0.0009573930657714678,
"loss": 0.5363,
"num_input_tokens_seen": 669816,
"step": 2350
},
{
"epoch": 4.393656716417911,
"grad_norm": 0.19172047078609467,
"learning_rate": 0.0009570636324065054,
"loss": 0.7379,
"num_input_tokens_seen": 671448,
"step": 2355
},
{
"epoch": 4.402985074626866,
"grad_norm": 0.14004682004451752,
"learning_rate": 0.0009567329874904456,
"loss": 0.5496,
"num_input_tokens_seen": 673016,
"step": 2360
},
{
"epoch": 4.412313432835821,
"grad_norm": 0.24465550482273102,
"learning_rate": 0.0009564011318997379,
"loss": 0.4256,
"num_input_tokens_seen": 674456,
"step": 2365
},
{
"epoch": 4.4216417910447765,
"grad_norm": 0.2554193437099457,
"learning_rate": 0.0009560680665140414,
"loss": 0.619,
"num_input_tokens_seen": 676056,
"step": 2370
},
{
"epoch": 4.4309701492537314,
"grad_norm": 0.16861599683761597,
"learning_rate": 0.0009557337922162211,
"loss": 0.327,
"num_input_tokens_seen": 677784,
"step": 2375
},
{
"epoch": 4.440298507462686,
"grad_norm": 0.21215523779392242,
"learning_rate": 0.0009553983098923473,
"loss": 0.5506,
"num_input_tokens_seen": 678968,
"step": 2380
},
{
"epoch": 4.449626865671641,
"grad_norm": 0.23762239515781403,
"learning_rate": 0.0009550616204316922,
"loss": 0.672,
"num_input_tokens_seen": 680504,
"step": 2385
},
{
"epoch": 4.458955223880597,
"grad_norm": 0.3079289197921753,
"learning_rate": 0.0009547237247267277,
"loss": 0.6254,
"num_input_tokens_seen": 681816,
"step": 2390
},
{
"epoch": 4.468283582089552,
"grad_norm": 0.37273162603378296,
"learning_rate": 0.0009543846236731234,
"loss": 0.6398,
"num_input_tokens_seen": 683256,
"step": 2395
},
{
"epoch": 4.477611940298507,
"grad_norm": 0.18848223984241486,
"learning_rate": 0.0009540443181697436,
"loss": 0.4413,
"num_input_tokens_seen": 684888,
"step": 2400
},
{
"epoch": 4.486940298507463,
"grad_norm": 0.22393417358398438,
"learning_rate": 0.0009537028091186453,
"loss": 0.3668,
"num_input_tokens_seen": 686296,
"step": 2405
},
{
"epoch": 4.496268656716418,
"grad_norm": 0.22919961810112,
"learning_rate": 0.000953360097425076,
"loss": 0.5036,
"num_input_tokens_seen": 687736,
"step": 2410
},
{
"epoch": 4.505597014925373,
"grad_norm": 0.15756945312023163,
"learning_rate": 0.0009530161839974711,
"loss": 0.6575,
"num_input_tokens_seen": 689240,
"step": 2415
},
{
"epoch": 4.514925373134329,
"grad_norm": 0.1780344396829605,
"learning_rate": 0.0009526710697474513,
"loss": 0.7309,
"num_input_tokens_seen": 690616,
"step": 2420
},
{
"epoch": 4.524253731343284,
"grad_norm": 0.2617693245410919,
"learning_rate": 0.0009523247555898205,
"loss": 0.6014,
"num_input_tokens_seen": 691960,
"step": 2425
},
{
"epoch": 4.5335820895522385,
"grad_norm": 0.23155321180820465,
"learning_rate": 0.0009519772424425628,
"loss": 0.5385,
"num_input_tokens_seen": 693592,
"step": 2430
},
{
"epoch": 4.542910447761194,
"grad_norm": 0.2970794141292572,
"learning_rate": 0.000951628531226841,
"loss": 0.513,
"num_input_tokens_seen": 695128,
"step": 2435
},
{
"epoch": 4.552238805970149,
"grad_norm": 0.14807343482971191,
"learning_rate": 0.0009512786228669936,
"loss": 0.3699,
"num_input_tokens_seen": 696504,
"step": 2440
},
{
"epoch": 4.561567164179104,
"grad_norm": 0.26394209265708923,
"learning_rate": 0.0009509275182905322,
"loss": 0.6715,
"num_input_tokens_seen": 697880,
"step": 2445
},
{
"epoch": 4.57089552238806,
"grad_norm": 0.29761648178100586,
"learning_rate": 0.0009505752184281391,
"loss": 0.6177,
"num_input_tokens_seen": 699352,
"step": 2450
},
{
"epoch": 4.580223880597015,
"grad_norm": 0.24216167628765106,
"learning_rate": 0.0009502217242136656,
"loss": 0.609,
"num_input_tokens_seen": 700824,
"step": 2455
},
{
"epoch": 4.58955223880597,
"grad_norm": 0.2688663899898529,
"learning_rate": 0.0009498670365841282,
"loss": 0.4861,
"num_input_tokens_seen": 702232,
"step": 2460
},
{
"epoch": 4.598880597014926,
"grad_norm": 0.22293132543563843,
"learning_rate": 0.0009495111564797073,
"loss": 0.345,
"num_input_tokens_seen": 703640,
"step": 2465
},
{
"epoch": 4.608208955223881,
"grad_norm": 0.21038322150707245,
"learning_rate": 0.000949154084843744,
"loss": 0.6915,
"num_input_tokens_seen": 704952,
"step": 2470
},
{
"epoch": 4.617537313432836,
"grad_norm": 0.21407586336135864,
"learning_rate": 0.0009487958226227378,
"loss": 0.4968,
"num_input_tokens_seen": 706328,
"step": 2475
},
{
"epoch": 4.6268656716417915,
"grad_norm": 0.2326938360929489,
"learning_rate": 0.0009484363707663442,
"loss": 0.5528,
"num_input_tokens_seen": 707800,
"step": 2480
},
{
"epoch": 4.6361940298507465,
"grad_norm": 0.23766377568244934,
"learning_rate": 0.0009480757302273721,
"loss": 0.5268,
"num_input_tokens_seen": 709144,
"step": 2485
},
{
"epoch": 4.645522388059701,
"grad_norm": 0.1962820440530777,
"learning_rate": 0.0009477139019617813,
"loss": 0.4697,
"num_input_tokens_seen": 710488,
"step": 2490
},
{
"epoch": 4.654850746268656,
"grad_norm": 0.29607799649238586,
"learning_rate": 0.00094735088692868,
"loss": 0.5732,
"num_input_tokens_seen": 711736,
"step": 2495
},
{
"epoch": 4.664179104477612,
"grad_norm": 0.26132312417030334,
"learning_rate": 0.0009469866860903217,
"loss": 0.7312,
"num_input_tokens_seen": 712888,
"step": 2500
},
{
"epoch": 4.673507462686567,
"grad_norm": 0.29316362738609314,
"learning_rate": 0.0009466213004121041,
"loss": 0.4659,
"num_input_tokens_seen": 714168,
"step": 2505
},
{
"epoch": 4.682835820895522,
"grad_norm": 0.21937410533428192,
"learning_rate": 0.0009462547308625647,
"loss": 0.2729,
"num_input_tokens_seen": 715544,
"step": 2510
},
{
"epoch": 4.692164179104478,
"grad_norm": 0.15683762729167938,
"learning_rate": 0.0009458869784133795,
"loss": 0.4786,
"num_input_tokens_seen": 716952,
"step": 2515
},
{
"epoch": 4.701492537313433,
"grad_norm": 0.377902626991272,
"learning_rate": 0.0009455180440393598,
"loss": 0.4655,
"num_input_tokens_seen": 718264,
"step": 2520
},
{
"epoch": 4.710820895522388,
"grad_norm": 0.21586859226226807,
"learning_rate": 0.0009451479287184505,
"loss": 0.4381,
"num_input_tokens_seen": 719704,
"step": 2525
},
{
"epoch": 4.720149253731344,
"grad_norm": 0.12033271789550781,
"learning_rate": 0.000944776633431726,
"loss": 0.5281,
"num_input_tokens_seen": 721112,
"step": 2530
},
{
"epoch": 4.729477611940299,
"grad_norm": 0.29399579763412476,
"learning_rate": 0.0009444041591633893,
"loss": 0.7112,
"num_input_tokens_seen": 722456,
"step": 2535
},
{
"epoch": 4.7388059701492535,
"grad_norm": 0.21166181564331055,
"learning_rate": 0.0009440305069007678,
"loss": 0.6444,
"num_input_tokens_seen": 723864,
"step": 2540
},
{
"epoch": 4.7481343283582085,
"grad_norm": 0.30777788162231445,
"learning_rate": 0.0009436556776343119,
"loss": 0.6277,
"num_input_tokens_seen": 725272,
"step": 2545
},
{
"epoch": 4.757462686567164,
"grad_norm": 0.5244273543357849,
"learning_rate": 0.0009432796723575918,
"loss": 0.6372,
"num_input_tokens_seen": 726648,
"step": 2550
},
{
"epoch": 4.766791044776119,
"grad_norm": 0.28162702918052673,
"learning_rate": 0.000942902492067295,
"loss": 0.7612,
"num_input_tokens_seen": 727928,
"step": 2555
},
{
"epoch": 4.776119402985074,
"grad_norm": 0.23690469563007355,
"learning_rate": 0.0009425241377632239,
"loss": 0.7744,
"num_input_tokens_seen": 729208,
"step": 2560
},
{
"epoch": 4.78544776119403,
"grad_norm": 0.26138120889663696,
"learning_rate": 0.0009421446104482923,
"loss": 0.6066,
"num_input_tokens_seen": 730616,
"step": 2565
},
{
"epoch": 4.794776119402985,
"grad_norm": 0.24936138093471527,
"learning_rate": 0.0009417639111285234,
"loss": 0.54,
"num_input_tokens_seen": 732120,
"step": 2570
},
{
"epoch": 4.80410447761194,
"grad_norm": 0.23255106806755066,
"learning_rate": 0.000941382040813048,
"loss": 0.4528,
"num_input_tokens_seen": 733560,
"step": 2575
},
{
"epoch": 4.813432835820896,
"grad_norm": 0.19354680180549622,
"learning_rate": 0.0009409990005140998,
"loss": 0.496,
"num_input_tokens_seen": 735000,
"step": 2580
},
{
"epoch": 4.822761194029851,
"grad_norm": 0.1485157459974289,
"learning_rate": 0.0009406147912470142,
"loss": 0.5483,
"num_input_tokens_seen": 736440,
"step": 2585
},
{
"epoch": 4.832089552238806,
"grad_norm": 0.3111797273159027,
"learning_rate": 0.0009402294140302255,
"loss": 0.8823,
"num_input_tokens_seen": 738008,
"step": 2590
},
{
"epoch": 4.8414179104477615,
"grad_norm": 0.26995381712913513,
"learning_rate": 0.0009398428698852632,
"loss": 0.6301,
"num_input_tokens_seen": 739320,
"step": 2595
},
{
"epoch": 4.850746268656716,
"grad_norm": 0.30793821811676025,
"learning_rate": 0.0009394551598367509,
"loss": 0.539,
"num_input_tokens_seen": 740792,
"step": 2600
},
{
"epoch": 4.860074626865671,
"grad_norm": 0.2184814214706421,
"learning_rate": 0.0009390662849124021,
"loss": 0.4266,
"num_input_tokens_seen": 742296,
"step": 2605
},
{
"epoch": 4.869402985074627,
"grad_norm": 0.26626819372177124,
"learning_rate": 0.0009386762461430182,
"loss": 0.7834,
"num_input_tokens_seen": 743928,
"step": 2610
},
{
"epoch": 4.878731343283582,
"grad_norm": 0.16901420056819916,
"learning_rate": 0.0009382850445624855,
"loss": 0.4878,
"num_input_tokens_seen": 745304,
"step": 2615
},
{
"epoch": 4.888059701492537,
"grad_norm": 0.16570930182933807,
"learning_rate": 0.0009378926812077732,
"loss": 0.3941,
"num_input_tokens_seen": 746648,
"step": 2620
},
{
"epoch": 4.897388059701493,
"grad_norm": 0.21604560315608978,
"learning_rate": 0.000937499157118929,
"loss": 0.6682,
"num_input_tokens_seen": 748152,
"step": 2625
},
{
"epoch": 4.906716417910448,
"grad_norm": 0.15624657273292542,
"learning_rate": 0.0009371044733390786,
"loss": 0.648,
"num_input_tokens_seen": 749560,
"step": 2630
},
{
"epoch": 4.916044776119403,
"grad_norm": 0.24081504344940186,
"learning_rate": 0.0009367086309144206,
"loss": 0.5101,
"num_input_tokens_seen": 751000,
"step": 2635
},
{
"epoch": 4.925373134328359,
"grad_norm": 0.17723239958286285,
"learning_rate": 0.0009363116308942256,
"loss": 0.547,
"num_input_tokens_seen": 752408,
"step": 2640
},
{
"epoch": 4.934701492537314,
"grad_norm": 0.26823535561561584,
"learning_rate": 0.0009359134743308324,
"loss": 0.5798,
"num_input_tokens_seen": 753784,
"step": 2645
},
{
"epoch": 4.9440298507462686,
"grad_norm": 0.2015918642282486,
"learning_rate": 0.0009355141622796455,
"loss": 0.757,
"num_input_tokens_seen": 755224,
"step": 2650
},
{
"epoch": 4.9533582089552235,
"grad_norm": 0.2711903154850006,
"learning_rate": 0.0009351136957991324,
"loss": 0.5665,
"num_input_tokens_seen": 756600,
"step": 2655
},
{
"epoch": 4.962686567164179,
"grad_norm": 0.3151332139968872,
"learning_rate": 0.0009347120759508205,
"loss": 0.7391,
"num_input_tokens_seen": 758008,
"step": 2660
},
{
"epoch": 4.972014925373134,
"grad_norm": 0.2473651021718979,
"learning_rate": 0.0009343093037992945,
"loss": 0.6037,
"num_input_tokens_seen": 759448,
"step": 2665
},
{
"epoch": 4.981343283582089,
"grad_norm": 0.1524503082036972,
"learning_rate": 0.0009339053804121936,
"loss": 0.6259,
"num_input_tokens_seen": 760696,
"step": 2670
},
{
"epoch": 4.990671641791045,
"grad_norm": 0.20363134145736694,
"learning_rate": 0.0009335003068602086,
"loss": 0.5679,
"num_input_tokens_seen": 762104,
"step": 2675
},
{
"epoch": 5.0,
"grad_norm": 0.4046460688114166,
"learning_rate": 0.0009330940842170789,
"loss": 0.5602,
"num_input_tokens_seen": 763216,
"step": 2680
},
{
"epoch": 5.0,
"eval_loss": 0.6487875580787659,
"eval_runtime": 4.2039,
"eval_samples_per_second": 56.614,
"eval_steps_per_second": 14.272,
"num_input_tokens_seen": 763216,
"step": 2680
},
{
"epoch": 5.009328358208955,
"grad_norm": 0.20314471423625946,
"learning_rate": 0.0009326867135595905,
"loss": 0.4152,
"num_input_tokens_seen": 764624,
"step": 2685
},
{
"epoch": 5.018656716417911,
"grad_norm": 0.11228202283382416,
"learning_rate": 0.0009322781959675714,
"loss": 0.4049,
"num_input_tokens_seen": 766128,
"step": 2690
},
{
"epoch": 5.027985074626866,
"grad_norm": 0.19546857476234436,
"learning_rate": 0.0009318685325238908,
"loss": 0.5314,
"num_input_tokens_seen": 767536,
"step": 2695
},
{
"epoch": 5.037313432835821,
"grad_norm": 0.1725783497095108,
"learning_rate": 0.0009314577243144546,
"loss": 0.3877,
"num_input_tokens_seen": 769008,
"step": 2700
},
{
"epoch": 5.0466417910447765,
"grad_norm": 0.2179887890815735,
"learning_rate": 0.0009310457724282034,
"loss": 0.4164,
"num_input_tokens_seen": 770544,
"step": 2705
},
{
"epoch": 5.0559701492537314,
"grad_norm": 0.27726832032203674,
"learning_rate": 0.0009306326779571092,
"loss": 0.5732,
"num_input_tokens_seen": 771792,
"step": 2710
},
{
"epoch": 5.065298507462686,
"grad_norm": 0.23128509521484375,
"learning_rate": 0.0009302184419961731,
"loss": 0.5697,
"num_input_tokens_seen": 773328,
"step": 2715
},
{
"epoch": 5.074626865671641,
"grad_norm": 0.2378668338060379,
"learning_rate": 0.0009298030656434216,
"loss": 0.5977,
"num_input_tokens_seen": 774992,
"step": 2720
},
{
"epoch": 5.083955223880597,
"grad_norm": 0.24161554872989655,
"learning_rate": 0.0009293865499999043,
"loss": 0.6412,
"num_input_tokens_seen": 776368,
"step": 2725
},
{
"epoch": 5.093283582089552,
"grad_norm": 0.16764408349990845,
"learning_rate": 0.0009289688961696904,
"loss": 0.4418,
"num_input_tokens_seen": 777808,
"step": 2730
},
{
"epoch": 5.102611940298507,
"grad_norm": 0.14279451966285706,
"learning_rate": 0.0009285501052598666,
"loss": 0.5859,
"num_input_tokens_seen": 779376,
"step": 2735
},
{
"epoch": 5.111940298507463,
"grad_norm": 0.2730609178543091,
"learning_rate": 0.0009281301783805331,
"loss": 0.8133,
"num_input_tokens_seen": 780688,
"step": 2740
},
{
"epoch": 5.121268656716418,
"grad_norm": 0.310469388961792,
"learning_rate": 0.0009277091166448022,
"loss": 0.5642,
"num_input_tokens_seen": 782000,
"step": 2745
},
{
"epoch": 5.130597014925373,
"grad_norm": 0.283130407333374,
"learning_rate": 0.0009272869211687931,
"loss": 0.5266,
"num_input_tokens_seen": 783216,
"step": 2750
},
{
"epoch": 5.139925373134329,
"grad_norm": 0.3301401138305664,
"learning_rate": 0.0009268635930716314,
"loss": 0.5039,
"num_input_tokens_seen": 784592,
"step": 2755
},
{
"epoch": 5.149253731343284,
"grad_norm": 0.31221431493759155,
"learning_rate": 0.0009264391334754441,
"loss": 0.5843,
"num_input_tokens_seen": 785968,
"step": 2760
},
{
"epoch": 5.1585820895522385,
"grad_norm": 0.19616518914699554,
"learning_rate": 0.0009260135435053583,
"loss": 0.3343,
"num_input_tokens_seen": 787536,
"step": 2765
},
{
"epoch": 5.167910447761194,
"grad_norm": 0.370902419090271,
"learning_rate": 0.0009255868242894967,
"loss": 0.7573,
"num_input_tokens_seen": 788880,
"step": 2770
},
{
"epoch": 5.177238805970149,
"grad_norm": 0.25199368596076965,
"learning_rate": 0.0009251589769589757,
"loss": 0.5911,
"num_input_tokens_seen": 790192,
"step": 2775
},
{
"epoch": 5.186567164179104,
"grad_norm": 0.24278584122657776,
"learning_rate": 0.000924730002647902,
"loss": 0.7491,
"num_input_tokens_seen": 791504,
"step": 2780
},
{
"epoch": 5.19589552238806,
"grad_norm": 0.33345553278923035,
"learning_rate": 0.0009242999024933694,
"loss": 0.6366,
"num_input_tokens_seen": 792880,
"step": 2785
},
{
"epoch": 5.205223880597015,
"grad_norm": 0.24725379049777985,
"learning_rate": 0.0009238686776354564,
"loss": 0.6177,
"num_input_tokens_seen": 794288,
"step": 2790
},
{
"epoch": 5.21455223880597,
"grad_norm": 0.25025907158851624,
"learning_rate": 0.0009234363292172224,
"loss": 0.6038,
"num_input_tokens_seen": 795536,
"step": 2795
},
{
"epoch": 5.223880597014926,
"grad_norm": 0.2058946043252945,
"learning_rate": 0.0009230028583847054,
"loss": 0.4427,
"num_input_tokens_seen": 796944,
"step": 2800
},
{
"epoch": 5.233208955223881,
"grad_norm": 0.31456539034843445,
"learning_rate": 0.000922568266286918,
"loss": 0.494,
"num_input_tokens_seen": 798384,
"step": 2805
},
{
"epoch": 5.242537313432836,
"grad_norm": 0.30209773778915405,
"learning_rate": 0.0009221325540758458,
"loss": 0.6301,
"num_input_tokens_seen": 799792,
"step": 2810
},
{
"epoch": 5.251865671641791,
"grad_norm": 0.25161316990852356,
"learning_rate": 0.0009216957229064429,
"loss": 0.7827,
"num_input_tokens_seen": 801200,
"step": 2815
},
{
"epoch": 5.2611940298507465,
"grad_norm": 0.3006390631198883,
"learning_rate": 0.0009212577739366297,
"loss": 0.4305,
"num_input_tokens_seen": 802640,
"step": 2820
},
{
"epoch": 5.270522388059701,
"grad_norm": 0.20886574685573578,
"learning_rate": 0.0009208187083272894,
"loss": 0.6315,
"num_input_tokens_seen": 804080,
"step": 2825
},
{
"epoch": 5.279850746268656,
"grad_norm": 0.2783293128013611,
"learning_rate": 0.0009203785272422656,
"loss": 0.3979,
"num_input_tokens_seen": 805616,
"step": 2830
},
{
"epoch": 5.289179104477612,
"grad_norm": 0.18563398718833923,
"learning_rate": 0.0009199372318483581,
"loss": 0.5115,
"num_input_tokens_seen": 807088,
"step": 2835
},
{
"epoch": 5.298507462686567,
"grad_norm": 0.265781432390213,
"learning_rate": 0.0009194948233153206,
"loss": 0.5374,
"num_input_tokens_seen": 808464,
"step": 2840
},
{
"epoch": 5.307835820895522,
"grad_norm": 0.3264658451080322,
"learning_rate": 0.0009190513028158578,
"loss": 0.5647,
"num_input_tokens_seen": 810064,
"step": 2845
},
{
"epoch": 5.317164179104478,
"grad_norm": 0.28654745221138,
"learning_rate": 0.0009186066715256213,
"loss": 0.5609,
"num_input_tokens_seen": 811312,
"step": 2850
},
{
"epoch": 5.326492537313433,
"grad_norm": 0.2565745413303375,
"learning_rate": 0.000918160930623208,
"loss": 0.4579,
"num_input_tokens_seen": 812656,
"step": 2855
},
{
"epoch": 5.335820895522388,
"grad_norm": 0.15798066556453705,
"learning_rate": 0.0009177140812901549,
"loss": 0.3762,
"num_input_tokens_seen": 814128,
"step": 2860
},
{
"epoch": 5.345149253731344,
"grad_norm": 0.23694878816604614,
"learning_rate": 0.0009172661247109382,
"loss": 0.7089,
"num_input_tokens_seen": 815440,
"step": 2865
},
{
"epoch": 5.354477611940299,
"grad_norm": 0.1648920327425003,
"learning_rate": 0.0009168170620729683,
"loss": 0.7812,
"num_input_tokens_seen": 816848,
"step": 2870
},
{
"epoch": 5.3638059701492535,
"grad_norm": 0.22573807835578918,
"learning_rate": 0.0009163668945665884,
"loss": 0.3872,
"num_input_tokens_seen": 818352,
"step": 2875
},
{
"epoch": 5.373134328358209,
"grad_norm": 0.2857038974761963,
"learning_rate": 0.0009159156233850693,
"loss": 0.6371,
"num_input_tokens_seen": 819696,
"step": 2880
},
{
"epoch": 5.382462686567164,
"grad_norm": 0.325327068567276,
"learning_rate": 0.0009154632497246081,
"loss": 0.461,
"num_input_tokens_seen": 821040,
"step": 2885
},
{
"epoch": 5.391791044776119,
"grad_norm": 0.10212195664644241,
"learning_rate": 0.0009150097747843242,
"loss": 0.4968,
"num_input_tokens_seen": 822832,
"step": 2890
},
{
"epoch": 5.401119402985074,
"grad_norm": 0.2704038619995117,
"learning_rate": 0.0009145551997662559,
"loss": 0.6225,
"num_input_tokens_seen": 824304,
"step": 2895
},
{
"epoch": 5.41044776119403,
"grad_norm": 0.14122045040130615,
"learning_rate": 0.0009140995258753577,
"loss": 0.4699,
"num_input_tokens_seen": 825968,
"step": 2900
},
{
"epoch": 5.419776119402985,
"grad_norm": 0.23320503532886505,
"learning_rate": 0.0009136427543194967,
"loss": 0.3793,
"num_input_tokens_seen": 827248,
"step": 2905
},
{
"epoch": 5.42910447761194,
"grad_norm": 0.20187321305274963,
"learning_rate": 0.0009131848863094501,
"loss": 0.7002,
"num_input_tokens_seen": 828656,
"step": 2910
},
{
"epoch": 5.438432835820896,
"grad_norm": 0.25212332606315613,
"learning_rate": 0.000912725923058901,
"loss": 0.6236,
"num_input_tokens_seen": 830160,
"step": 2915
},
{
"epoch": 5.447761194029851,
"grad_norm": 0.2680214047431946,
"learning_rate": 0.0009122658657844358,
"loss": 0.5077,
"num_input_tokens_seen": 831792,
"step": 2920
},
{
"epoch": 5.457089552238806,
"grad_norm": 0.19384333491325378,
"learning_rate": 0.0009118047157055412,
"loss": 0.4084,
"num_input_tokens_seen": 833424,
"step": 2925
},
{
"epoch": 5.4664179104477615,
"grad_norm": 0.2156229168176651,
"learning_rate": 0.0009113424740446,
"loss": 0.5477,
"num_input_tokens_seen": 834896,
"step": 2930
},
{
"epoch": 5.475746268656716,
"grad_norm": 0.11520472913980484,
"learning_rate": 0.0009108791420268891,
"loss": 0.379,
"num_input_tokens_seen": 836208,
"step": 2935
},
{
"epoch": 5.485074626865671,
"grad_norm": 0.29559290409088135,
"learning_rate": 0.0009104147208805753,
"loss": 0.6259,
"num_input_tokens_seen": 837712,
"step": 2940
},
{
"epoch": 5.494402985074627,
"grad_norm": 0.25696009397506714,
"learning_rate": 0.0009099492118367123,
"loss": 0.5686,
"num_input_tokens_seen": 838992,
"step": 2945
},
{
"epoch": 5.503731343283582,
"grad_norm": 0.2539997398853302,
"learning_rate": 0.000909482616129238,
"loss": 0.3924,
"num_input_tokens_seen": 840464,
"step": 2950
},
{
"epoch": 5.513059701492537,
"grad_norm": 0.1732897013425827,
"learning_rate": 0.0009090149349949701,
"loss": 0.6351,
"num_input_tokens_seen": 841968,
"step": 2955
},
{
"epoch": 5.522388059701493,
"grad_norm": 0.22714626789093018,
"learning_rate": 0.000908546169673604,
"loss": 0.6026,
"num_input_tokens_seen": 843280,
"step": 2960
},
{
"epoch": 5.531716417910448,
"grad_norm": 0.15474703907966614,
"learning_rate": 0.0009080763214077088,
"loss": 0.685,
"num_input_tokens_seen": 844848,
"step": 2965
},
{
"epoch": 5.541044776119403,
"grad_norm": 0.183040589094162,
"learning_rate": 0.0009076053914427242,
"loss": 0.5363,
"num_input_tokens_seen": 846224,
"step": 2970
},
{
"epoch": 5.550373134328359,
"grad_norm": 0.2342386096715927,
"learning_rate": 0.0009071333810269569,
"loss": 0.5128,
"num_input_tokens_seen": 847760,
"step": 2975
},
{
"epoch": 5.559701492537314,
"grad_norm": 0.3226813077926636,
"learning_rate": 0.0009066602914115781,
"loss": 0.5095,
"num_input_tokens_seen": 849232,
"step": 2980
},
{
"epoch": 5.5690298507462686,
"grad_norm": 0.17433519661426544,
"learning_rate": 0.0009061861238506193,
"loss": 0.4562,
"num_input_tokens_seen": 850480,
"step": 2985
},
{
"epoch": 5.5783582089552235,
"grad_norm": 0.2655240297317505,
"learning_rate": 0.0009057108796009696,
"loss": 0.539,
"num_input_tokens_seen": 851792,
"step": 2990
},
{
"epoch": 5.587686567164179,
"grad_norm": 0.23230163753032684,
"learning_rate": 0.0009052345599223719,
"loss": 0.6974,
"num_input_tokens_seen": 853136,
"step": 2995
},
{
"epoch": 5.597014925373134,
"grad_norm": 0.369926393032074,
"learning_rate": 0.0009047571660774197,
"loss": 0.95,
"num_input_tokens_seen": 854512,
"step": 3000
},
{
"epoch": 5.606343283582089,
"grad_norm": 0.18043026328086853,
"learning_rate": 0.0009042786993315539,
"loss": 0.4186,
"num_input_tokens_seen": 856112,
"step": 3005
},
{
"epoch": 5.615671641791045,
"grad_norm": 0.24363280832767487,
"learning_rate": 0.0009037991609530596,
"loss": 0.5985,
"num_input_tokens_seen": 857456,
"step": 3010
},
{
"epoch": 5.625,
"grad_norm": 0.19951696693897247,
"learning_rate": 0.0009033185522130622,
"loss": 0.5111,
"num_input_tokens_seen": 859056,
"step": 3015
},
{
"epoch": 5.634328358208955,
"grad_norm": 0.14212666451931,
"learning_rate": 0.0009028368743855247,
"loss": 0.7327,
"num_input_tokens_seen": 860560,
"step": 3020
},
{
"epoch": 5.643656716417911,
"grad_norm": 0.2475363165140152,
"learning_rate": 0.0009023541287472434,
"loss": 0.55,
"num_input_tokens_seen": 862064,
"step": 3025
},
{
"epoch": 5.652985074626866,
"grad_norm": 0.2791047990322113,
"learning_rate": 0.0009018703165778457,
"loss": 0.5895,
"num_input_tokens_seen": 863536,
"step": 3030
},
{
"epoch": 5.662313432835821,
"grad_norm": 0.28386175632476807,
"learning_rate": 0.0009013854391597856,
"loss": 0.4768,
"num_input_tokens_seen": 864880,
"step": 3035
},
{
"epoch": 5.6716417910447765,
"grad_norm": 0.274679571390152,
"learning_rate": 0.0009008994977783407,
"loss": 0.6087,
"num_input_tokens_seen": 866288,
"step": 3040
},
{
"epoch": 5.6809701492537314,
"grad_norm": 0.3296756148338318,
"learning_rate": 0.0009004124937216096,
"loss": 0.6623,
"num_input_tokens_seen": 867664,
"step": 3045
},
{
"epoch": 5.690298507462686,
"grad_norm": 0.24241182208061218,
"learning_rate": 0.0008999244282805072,
"loss": 0.6314,
"num_input_tokens_seen": 869040,
"step": 3050
},
{
"epoch": 5.699626865671641,
"grad_norm": 0.27372506260871887,
"learning_rate": 0.0008994353027487616,
"loss": 0.416,
"num_input_tokens_seen": 870608,
"step": 3055
},
{
"epoch": 5.708955223880597,
"grad_norm": 0.2752893269062042,
"learning_rate": 0.0008989451184229118,
"loss": 0.7361,
"num_input_tokens_seen": 871824,
"step": 3060
},
{
"epoch": 5.718283582089552,
"grad_norm": 0.3607504963874817,
"learning_rate": 0.0008984538766023024,
"loss": 0.5149,
"num_input_tokens_seen": 873168,
"step": 3065
},
{
"epoch": 5.727611940298507,
"grad_norm": 0.2767292559146881,
"learning_rate": 0.0008979615785890817,
"loss": 0.5195,
"num_input_tokens_seen": 874640,
"step": 3070
},
{
"epoch": 5.736940298507463,
"grad_norm": 0.2679109275341034,
"learning_rate": 0.0008974682256881974,
"loss": 0.4831,
"num_input_tokens_seen": 876080,
"step": 3075
},
{
"epoch": 5.746268656716418,
"grad_norm": 0.18482770025730133,
"learning_rate": 0.0008969738192073939,
"loss": 0.3598,
"num_input_tokens_seen": 877680,
"step": 3080
},
{
"epoch": 5.755597014925373,
"grad_norm": 0.23143534362316132,
"learning_rate": 0.0008964783604572076,
"loss": 0.4919,
"num_input_tokens_seen": 879056,
"step": 3085
},
{
"epoch": 5.764925373134329,
"grad_norm": 0.2972404360771179,
"learning_rate": 0.0008959818507509649,
"loss": 0.5199,
"num_input_tokens_seen": 880368,
"step": 3090
},
{
"epoch": 5.774253731343284,
"grad_norm": 0.31133216619491577,
"learning_rate": 0.0008954842914047776,
"loss": 0.6719,
"num_input_tokens_seen": 881648,
"step": 3095
},
{
"epoch": 5.7835820895522385,
"grad_norm": 0.16302241384983063,
"learning_rate": 0.0008949856837375397,
"loss": 0.4121,
"num_input_tokens_seen": 883024,
"step": 3100
},
{
"epoch": 5.792910447761194,
"grad_norm": 0.3164055049419403,
"learning_rate": 0.0008944860290709245,
"loss": 0.4792,
"num_input_tokens_seen": 884240,
"step": 3105
},
{
"epoch": 5.802238805970149,
"grad_norm": 0.23580315709114075,
"learning_rate": 0.0008939853287293802,
"loss": 0.5963,
"num_input_tokens_seen": 885744,
"step": 3110
},
{
"epoch": 5.811567164179104,
"grad_norm": 0.22375214099884033,
"learning_rate": 0.000893483584040127,
"loss": 0.5519,
"num_input_tokens_seen": 887440,
"step": 3115
},
{
"epoch": 5.82089552238806,
"grad_norm": 0.17884212732315063,
"learning_rate": 0.000892980796333153,
"loss": 0.5346,
"num_input_tokens_seen": 888720,
"step": 3120
},
{
"epoch": 5.830223880597015,
"grad_norm": 0.21421754360198975,
"learning_rate": 0.0008924769669412116,
"loss": 0.4967,
"num_input_tokens_seen": 890288,
"step": 3125
},
{
"epoch": 5.83955223880597,
"grad_norm": 0.21498635411262512,
"learning_rate": 0.0008919720971998172,
"loss": 0.681,
"num_input_tokens_seen": 891760,
"step": 3130
},
{
"epoch": 5.848880597014926,
"grad_norm": 0.30558252334594727,
"learning_rate": 0.0008914661884472418,
"loss": 0.5239,
"num_input_tokens_seen": 893072,
"step": 3135
},
{
"epoch": 5.858208955223881,
"grad_norm": 0.2122422456741333,
"learning_rate": 0.0008909592420245116,
"loss": 0.3839,
"num_input_tokens_seen": 894544,
"step": 3140
},
{
"epoch": 5.867537313432836,
"grad_norm": 0.23761402070522308,
"learning_rate": 0.0008904512592754033,
"loss": 0.5714,
"num_input_tokens_seen": 895824,
"step": 3145
},
{
"epoch": 5.8768656716417915,
"grad_norm": 0.26812535524368286,
"learning_rate": 0.0008899422415464408,
"loss": 0.4689,
"num_input_tokens_seen": 897296,
"step": 3150
},
{
"epoch": 5.8861940298507465,
"grad_norm": 0.2236464023590088,
"learning_rate": 0.0008894321901868915,
"loss": 0.4325,
"num_input_tokens_seen": 898608,
"step": 3155
},
{
"epoch": 5.895522388059701,
"grad_norm": 0.2537100315093994,
"learning_rate": 0.0008889211065487621,
"loss": 0.4664,
"num_input_tokens_seen": 899952,
"step": 3160
},
{
"epoch": 5.904850746268656,
"grad_norm": 0.2743018865585327,
"learning_rate": 0.0008884089919867963,
"loss": 0.5646,
"num_input_tokens_seen": 901296,
"step": 3165
},
{
"epoch": 5.914179104477612,
"grad_norm": 0.1934853196144104,
"learning_rate": 0.0008878958478584703,
"loss": 0.4589,
"num_input_tokens_seen": 902768,
"step": 3170
},
{
"epoch": 5.923507462686567,
"grad_norm": 0.28554612398147583,
"learning_rate": 0.000887381675523989,
"loss": 0.8204,
"num_input_tokens_seen": 904112,
"step": 3175
},
{
"epoch": 5.932835820895522,
"grad_norm": 0.2392246276140213,
"learning_rate": 0.0008868664763462832,
"loss": 0.3573,
"num_input_tokens_seen": 905680,
"step": 3180
},
{
"epoch": 5.942164179104478,
"grad_norm": 0.23579584062099457,
"learning_rate": 0.0008863502516910058,
"loss": 0.5725,
"num_input_tokens_seen": 907024,
"step": 3185
},
{
"epoch": 5.951492537313433,
"grad_norm": 0.1369100958108902,
"learning_rate": 0.0008858330029265271,
"loss": 0.5443,
"num_input_tokens_seen": 908368,
"step": 3190
},
{
"epoch": 5.960820895522388,
"grad_norm": 0.19882099330425262,
"learning_rate": 0.0008853147314239329,
"loss": 0.5867,
"num_input_tokens_seen": 909808,
"step": 3195
},
{
"epoch": 5.970149253731344,
"grad_norm": 0.23294775187969208,
"learning_rate": 0.0008847954385570198,
"loss": 0.6957,
"num_input_tokens_seen": 911344,
"step": 3200
},
{
"epoch": 5.979477611940299,
"grad_norm": 0.18562950193881989,
"learning_rate": 0.0008842751257022911,
"loss": 0.4195,
"num_input_tokens_seen": 912752,
"step": 3205
},
{
"epoch": 5.9888059701492535,
"grad_norm": 0.2487240731716156,
"learning_rate": 0.0008837537942389551,
"loss": 0.7351,
"num_input_tokens_seen": 914032,
"step": 3210
},
{
"epoch": 5.9981343283582085,
"grad_norm": 0.12646648287773132,
"learning_rate": 0.0008832314455489188,
"loss": 0.4184,
"num_input_tokens_seen": 915472,
"step": 3215
},
{
"epoch": 6.0,
"eval_loss": 0.6421719789505005,
"eval_runtime": 4.1775,
"eval_samples_per_second": 56.972,
"eval_steps_per_second": 14.363,
"num_input_tokens_seen": 915528,
"step": 3216
},
{
"epoch": 6.007462686567164,
"grad_norm": 0.2584002912044525,
"learning_rate": 0.0008827080810167864,
"loss": 0.5532,
"num_input_tokens_seen": 916680,
"step": 3220
},
{
"epoch": 6.016791044776119,
"grad_norm": 0.23565295338630676,
"learning_rate": 0.0008821837020298546,
"loss": 0.603,
"num_input_tokens_seen": 918120,
"step": 3225
},
{
"epoch": 6.026119402985074,
"grad_norm": 0.17623180150985718,
"learning_rate": 0.0008816583099781093,
"loss": 0.5479,
"num_input_tokens_seen": 919496,
"step": 3230
},
{
"epoch": 6.03544776119403,
"grad_norm": 0.22531640529632568,
"learning_rate": 0.0008811319062542214,
"loss": 0.6126,
"num_input_tokens_seen": 920808,
"step": 3235
},
{
"epoch": 6.044776119402985,
"grad_norm": 0.27955806255340576,
"learning_rate": 0.0008806044922535436,
"loss": 0.4747,
"num_input_tokens_seen": 922024,
"step": 3240
},
{
"epoch": 6.05410447761194,
"grad_norm": 0.20974354445934296,
"learning_rate": 0.0008800760693741068,
"loss": 0.5458,
"num_input_tokens_seen": 923432,
"step": 3245
},
{
"epoch": 6.063432835820896,
"grad_norm": 0.14437653124332428,
"learning_rate": 0.0008795466390166161,
"loss": 0.4203,
"num_input_tokens_seen": 924936,
"step": 3250
},
{
"epoch": 6.072761194029851,
"grad_norm": 0.2256515473127365,
"learning_rate": 0.000879016202584447,
"loss": 0.4165,
"num_input_tokens_seen": 926408,
"step": 3255
},
{
"epoch": 6.082089552238806,
"grad_norm": 0.16263511776924133,
"learning_rate": 0.0008784847614836418,
"loss": 0.4719,
"num_input_tokens_seen": 927944,
"step": 3260
},
{
"epoch": 6.0914179104477615,
"grad_norm": 0.19972696900367737,
"learning_rate": 0.000877952317122906,
"loss": 0.7397,
"num_input_tokens_seen": 929416,
"step": 3265
},
{
"epoch": 6.100746268656716,
"grad_norm": 0.2780134975910187,
"learning_rate": 0.0008774188709136045,
"loss": 0.5968,
"num_input_tokens_seen": 930632,
"step": 3270
},
{
"epoch": 6.110074626865671,
"grad_norm": 0.38921645283699036,
"learning_rate": 0.0008768844242697578,
"loss": 0.7472,
"num_input_tokens_seen": 932168,
"step": 3275
},
{
"epoch": 6.119402985074627,
"grad_norm": 0.2642660439014435,
"learning_rate": 0.0008763489786080383,
"loss": 0.5761,
"num_input_tokens_seen": 933416,
"step": 3280
},
{
"epoch": 6.128731343283582,
"grad_norm": 0.3693527579307556,
"learning_rate": 0.0008758125353477663,
"loss": 0.6748,
"num_input_tokens_seen": 934824,
"step": 3285
},
{
"epoch": 6.138059701492537,
"grad_norm": 0.15553449094295502,
"learning_rate": 0.000875275095910907,
"loss": 0.6219,
"num_input_tokens_seen": 936200,
"step": 3290
},
{
"epoch": 6.147388059701493,
"grad_norm": 0.16091737151145935,
"learning_rate": 0.0008747366617220656,
"loss": 0.4781,
"num_input_tokens_seen": 937704,
"step": 3295
},
{
"epoch": 6.156716417910448,
"grad_norm": 0.23516695201396942,
"learning_rate": 0.0008741972342084843,
"loss": 0.452,
"num_input_tokens_seen": 939272,
"step": 3300
},
{
"epoch": 6.166044776119403,
"grad_norm": 0.2708747684955597,
"learning_rate": 0.0008736568148000385,
"loss": 0.4932,
"num_input_tokens_seen": 940520,
"step": 3305
},
{
"epoch": 6.175373134328359,
"grad_norm": 0.3128451108932495,
"learning_rate": 0.0008731154049292329,
"loss": 0.3687,
"num_input_tokens_seen": 942120,
"step": 3310
},
{
"epoch": 6.184701492537314,
"grad_norm": 0.39994189143180847,
"learning_rate": 0.0008725730060311972,
"loss": 0.4669,
"num_input_tokens_seen": 943528,
"step": 3315
},
{
"epoch": 6.1940298507462686,
"grad_norm": 0.11239203810691833,
"learning_rate": 0.0008720296195436831,
"loss": 0.3533,
"num_input_tokens_seen": 945096,
"step": 3320
},
{
"epoch": 6.2033582089552235,
"grad_norm": 0.26304325461387634,
"learning_rate": 0.0008714852469070602,
"loss": 0.5458,
"num_input_tokens_seen": 946472,
"step": 3325
},
{
"epoch": 6.212686567164179,
"grad_norm": 0.13661259412765503,
"learning_rate": 0.0008709398895643117,
"loss": 0.4306,
"num_input_tokens_seen": 948072,
"step": 3330
},
{
"epoch": 6.222014925373134,
"grad_norm": 0.23913314938545227,
"learning_rate": 0.0008703935489610315,
"loss": 0.5885,
"num_input_tokens_seen": 949512,
"step": 3335
},
{
"epoch": 6.231343283582089,
"grad_norm": 0.4055419862270355,
"learning_rate": 0.0008698462265454197,
"loss": 0.5071,
"num_input_tokens_seen": 950888,
"step": 3340
},
{
"epoch": 6.240671641791045,
"grad_norm": 0.1768474131822586,
"learning_rate": 0.0008692979237682786,
"loss": 0.3251,
"num_input_tokens_seen": 952392,
"step": 3345
},
{
"epoch": 6.25,
"grad_norm": 0.30177706480026245,
"learning_rate": 0.0008687486420830093,
"loss": 0.756,
"num_input_tokens_seen": 953736,
"step": 3350
},
{
"epoch": 6.259328358208955,
"grad_norm": 0.23593708872795105,
"learning_rate": 0.000868198382945608,
"loss": 0.6532,
"num_input_tokens_seen": 955272,
"step": 3355
},
{
"epoch": 6.268656716417911,
"grad_norm": 0.2411566525697708,
"learning_rate": 0.0008676471478146617,
"loss": 0.3248,
"num_input_tokens_seen": 956648,
"step": 3360
},
{
"epoch": 6.277985074626866,
"grad_norm": 0.18759582936763763,
"learning_rate": 0.0008670949381513445,
"loss": 0.4224,
"num_input_tokens_seen": 958024,
"step": 3365
},
{
"epoch": 6.287313432835821,
"grad_norm": 0.2575382888317108,
"learning_rate": 0.0008665417554194135,
"loss": 0.5043,
"num_input_tokens_seen": 959400,
"step": 3370
},
{
"epoch": 6.2966417910447765,
"grad_norm": 0.32555249333381653,
"learning_rate": 0.0008659876010852055,
"loss": 0.6098,
"num_input_tokens_seen": 960872,
"step": 3375
},
{
"epoch": 6.3059701492537314,
"grad_norm": 0.24410775303840637,
"learning_rate": 0.0008654324766176325,
"loss": 0.7446,
"num_input_tokens_seen": 962184,
"step": 3380
},
{
"epoch": 6.315298507462686,
"grad_norm": 0.1657802164554596,
"learning_rate": 0.000864876383488178,
"loss": 0.5946,
"num_input_tokens_seen": 963496,
"step": 3385
},
{
"epoch": 6.324626865671641,
"grad_norm": 0.22825486958026886,
"learning_rate": 0.0008643193231708937,
"loss": 0.4938,
"num_input_tokens_seen": 965032,
"step": 3390
},
{
"epoch": 6.333955223880597,
"grad_norm": 0.11497091501951218,
"learning_rate": 0.0008637612971423943,
"loss": 0.4575,
"num_input_tokens_seen": 966696,
"step": 3395
},
{
"epoch": 6.343283582089552,
"grad_norm": 0.11938580870628357,
"learning_rate": 0.000863202306881855,
"loss": 0.3512,
"num_input_tokens_seen": 968200,
"step": 3400
},
{
"epoch": 6.352611940298507,
"grad_norm": 0.2507960796356201,
"learning_rate": 0.0008626423538710062,
"loss": 0.4401,
"num_input_tokens_seen": 969672,
"step": 3405
},
{
"epoch": 6.361940298507463,
"grad_norm": 0.1698283851146698,
"learning_rate": 0.000862081439594131,
"loss": 0.4485,
"num_input_tokens_seen": 971112,
"step": 3410
},
{
"epoch": 6.371268656716418,
"grad_norm": 0.12110260128974915,
"learning_rate": 0.00086151956553806,
"loss": 0.5036,
"num_input_tokens_seen": 972456,
"step": 3415
},
{
"epoch": 6.380597014925373,
"grad_norm": 0.27388572692871094,
"learning_rate": 0.0008609567331921684,
"loss": 0.4426,
"num_input_tokens_seen": 973992,
"step": 3420
},
{
"epoch": 6.389925373134329,
"grad_norm": 0.3161713778972626,
"learning_rate": 0.0008603929440483713,
"loss": 0.5486,
"num_input_tokens_seen": 975464,
"step": 3425
},
{
"epoch": 6.399253731343284,
"grad_norm": 0.35460421442985535,
"learning_rate": 0.0008598281996011199,
"loss": 0.5463,
"num_input_tokens_seen": 976840,
"step": 3430
},
{
"epoch": 6.4085820895522385,
"grad_norm": 0.29080653190612793,
"learning_rate": 0.0008592625013473978,
"loss": 0.6731,
"num_input_tokens_seen": 978344,
"step": 3435
},
{
"epoch": 6.417910447761194,
"grad_norm": 0.27483072876930237,
"learning_rate": 0.0008586958507867168,
"loss": 0.7566,
"num_input_tokens_seen": 979752,
"step": 3440
},
{
"epoch": 6.427238805970149,
"grad_norm": 0.22223156690597534,
"learning_rate": 0.0008581282494211134,
"loss": 0.4567,
"num_input_tokens_seen": 981192,
"step": 3445
},
{
"epoch": 6.436567164179104,
"grad_norm": 0.23036664724349976,
"learning_rate": 0.0008575596987551438,
"loss": 0.5825,
"num_input_tokens_seen": 982536,
"step": 3450
},
{
"epoch": 6.44589552238806,
"grad_norm": 0.19232529401779175,
"learning_rate": 0.000856990200295881,
"loss": 0.4377,
"num_input_tokens_seen": 984008,
"step": 3455
},
{
"epoch": 6.455223880597015,
"grad_norm": 0.28331199288368225,
"learning_rate": 0.00085641975555291,
"loss": 0.5706,
"num_input_tokens_seen": 985448,
"step": 3460
},
{
"epoch": 6.46455223880597,
"grad_norm": 0.24698172509670258,
"learning_rate": 0.0008558483660383245,
"loss": 0.4667,
"num_input_tokens_seen": 986920,
"step": 3465
},
{
"epoch": 6.473880597014926,
"grad_norm": 0.23428291082382202,
"learning_rate": 0.0008552760332667223,
"loss": 0.5937,
"num_input_tokens_seen": 988520,
"step": 3470
},
{
"epoch": 6.483208955223881,
"grad_norm": 0.32097363471984863,
"learning_rate": 0.0008547027587552012,
"loss": 0.6805,
"num_input_tokens_seen": 989928,
"step": 3475
},
{
"epoch": 6.492537313432836,
"grad_norm": 0.3465026915073395,
"learning_rate": 0.0008541285440233562,
"loss": 0.4623,
"num_input_tokens_seen": 991368,
"step": 3480
},
{
"epoch": 6.5018656716417915,
"grad_norm": 0.19996623694896698,
"learning_rate": 0.0008535533905932737,
"loss": 0.4461,
"num_input_tokens_seen": 992712,
"step": 3485
},
{
"epoch": 6.5111940298507465,
"grad_norm": 0.20951132476329803,
"learning_rate": 0.0008529772999895289,
"loss": 0.5706,
"num_input_tokens_seen": 994024,
"step": 3490
},
{
"epoch": 6.520522388059701,
"grad_norm": 0.2945927083492279,
"learning_rate": 0.0008524002737391807,
"loss": 0.67,
"num_input_tokens_seen": 995400,
"step": 3495
},
{
"epoch": 6.529850746268656,
"grad_norm": 0.23618575930595398,
"learning_rate": 0.0008518223133717687,
"loss": 0.4631,
"num_input_tokens_seen": 997032,
"step": 3500
},
{
"epoch": 6.539179104477612,
"grad_norm": 0.19505491852760315,
"learning_rate": 0.0008512434204193079,
"loss": 0.4377,
"num_input_tokens_seen": 998408,
"step": 3505
},
{
"epoch": 6.548507462686567,
"grad_norm": 0.26609039306640625,
"learning_rate": 0.000850663596416286,
"loss": 0.522,
"num_input_tokens_seen": 999944,
"step": 3510
},
{
"epoch": 6.557835820895522,
"grad_norm": 0.2637292444705963,
"learning_rate": 0.0008500828428996583,
"loss": 0.293,
"num_input_tokens_seen": 1001480,
"step": 3515
},
{
"epoch": 6.567164179104478,
"grad_norm": 0.3107680082321167,
"learning_rate": 0.0008495011614088439,
"loss": 0.6297,
"num_input_tokens_seen": 1002792,
"step": 3520
},
{
"epoch": 6.576492537313433,
"grad_norm": 0.20006173849105835,
"learning_rate": 0.0008489185534857223,
"loss": 0.3466,
"num_input_tokens_seen": 1004264,
"step": 3525
},
{
"epoch": 6.585820895522388,
"grad_norm": 0.19617074728012085,
"learning_rate": 0.0008483350206746278,
"loss": 0.4365,
"num_input_tokens_seen": 1005704,
"step": 3530
},
{
"epoch": 6.595149253731344,
"grad_norm": 0.10148897767066956,
"learning_rate": 0.000847750564522347,
"loss": 0.4371,
"num_input_tokens_seen": 1007240,
"step": 3535
},
{
"epoch": 6.604477611940299,
"grad_norm": 0.42178183794021606,
"learning_rate": 0.000847165186578114,
"loss": 0.4784,
"num_input_tokens_seen": 1008648,
"step": 3540
},
{
"epoch": 6.6138059701492535,
"grad_norm": 0.41911423206329346,
"learning_rate": 0.0008465788883936059,
"loss": 0.5173,
"num_input_tokens_seen": 1010088,
"step": 3545
},
{
"epoch": 6.6231343283582085,
"grad_norm": 0.1649949848651886,
"learning_rate": 0.0008459916715229396,
"loss": 0.39,
"num_input_tokens_seen": 1011432,
"step": 3550
},
{
"epoch": 6.632462686567164,
"grad_norm": 0.3317459225654602,
"learning_rate": 0.000845403537522667,
"loss": 0.6234,
"num_input_tokens_seen": 1012840,
"step": 3555
},
{
"epoch": 6.641791044776119,
"grad_norm": 0.16076388955116272,
"learning_rate": 0.0008448144879517705,
"loss": 0.5754,
"num_input_tokens_seen": 1014216,
"step": 3560
},
{
"epoch": 6.651119402985074,
"grad_norm": 0.2902985215187073,
"learning_rate": 0.0008442245243716606,
"loss": 0.5202,
"num_input_tokens_seen": 1015560,
"step": 3565
},
{
"epoch": 6.66044776119403,
"grad_norm": 0.22482368350028992,
"learning_rate": 0.0008436336483461695,
"loss": 0.6555,
"num_input_tokens_seen": 1017064,
"step": 3570
},
{
"epoch": 6.669776119402985,
"grad_norm": 0.2745371162891388,
"learning_rate": 0.0008430418614415487,
"loss": 0.496,
"num_input_tokens_seen": 1018312,
"step": 3575
},
{
"epoch": 6.67910447761194,
"grad_norm": 0.15380549430847168,
"learning_rate": 0.0008424491652264639,
"loss": 0.5961,
"num_input_tokens_seen": 1019688,
"step": 3580
},
{
"epoch": 6.688432835820896,
"grad_norm": 0.18027140200138092,
"learning_rate": 0.000841855561271991,
"loss": 0.6227,
"num_input_tokens_seen": 1021128,
"step": 3585
},
{
"epoch": 6.697761194029851,
"grad_norm": 0.2426953762769699,
"learning_rate": 0.0008412610511516125,
"loss": 0.4379,
"num_input_tokens_seen": 1022504,
"step": 3590
},
{
"epoch": 6.707089552238806,
"grad_norm": 0.23552139103412628,
"learning_rate": 0.0008406656364412128,
"loss": 0.7039,
"num_input_tokens_seen": 1023752,
"step": 3595
},
{
"epoch": 6.7164179104477615,
"grad_norm": 0.25078070163726807,
"learning_rate": 0.0008400693187190736,
"loss": 0.4357,
"num_input_tokens_seen": 1025224,
"step": 3600
},
{
"epoch": 6.725746268656716,
"grad_norm": 0.2658351957798004,
"learning_rate": 0.000839472099565871,
"loss": 0.8229,
"num_input_tokens_seen": 1026568,
"step": 3605
},
{
"epoch": 6.735074626865671,
"grad_norm": 0.25117790699005127,
"learning_rate": 0.00083887398056467,
"loss": 0.7487,
"num_input_tokens_seen": 1027912,
"step": 3610
},
{
"epoch": 6.744402985074627,
"grad_norm": 0.1720697283744812,
"learning_rate": 0.000838274963300921,
"loss": 0.4305,
"num_input_tokens_seen": 1029384,
"step": 3615
},
{
"epoch": 6.753731343283582,
"grad_norm": 0.21772412955760956,
"learning_rate": 0.0008376750493624555,
"loss": 0.5213,
"num_input_tokens_seen": 1030952,
"step": 3620
},
{
"epoch": 6.763059701492537,
"grad_norm": 0.21206441521644592,
"learning_rate": 0.000837074240339482,
"loss": 0.6029,
"num_input_tokens_seen": 1032264,
"step": 3625
},
{
"epoch": 6.772388059701493,
"grad_norm": 0.19085289537906647,
"learning_rate": 0.0008364725378245811,
"loss": 0.5669,
"num_input_tokens_seen": 1033544,
"step": 3630
},
{
"epoch": 6.781716417910448,
"grad_norm": 0.2896425724029541,
"learning_rate": 0.0008358699434127024,
"loss": 0.6797,
"num_input_tokens_seen": 1035112,
"step": 3635
},
{
"epoch": 6.791044776119403,
"grad_norm": 0.18138831853866577,
"learning_rate": 0.0008352664587011595,
"loss": 0.42,
"num_input_tokens_seen": 1036488,
"step": 3640
},
{
"epoch": 6.800373134328359,
"grad_norm": 0.36892759799957275,
"learning_rate": 0.0008346620852896256,
"loss": 0.6121,
"num_input_tokens_seen": 1037960,
"step": 3645
},
{
"epoch": 6.809701492537314,
"grad_norm": 0.24577677249908447,
"learning_rate": 0.00083405682478013,
"loss": 0.6551,
"num_input_tokens_seen": 1039304,
"step": 3650
},
{
"epoch": 6.8190298507462686,
"grad_norm": 0.2634584903717041,
"learning_rate": 0.0008334506787770532,
"loss": 0.5676,
"num_input_tokens_seen": 1040616,
"step": 3655
},
{
"epoch": 6.8283582089552235,
"grad_norm": 0.3486064076423645,
"learning_rate": 0.0008328436488871234,
"loss": 0.4678,
"num_input_tokens_seen": 1042056,
"step": 3660
},
{
"epoch": 6.837686567164179,
"grad_norm": 0.14382286369800568,
"learning_rate": 0.0008322357367194109,
"loss": 0.474,
"num_input_tokens_seen": 1043528,
"step": 3665
},
{
"epoch": 6.847014925373134,
"grad_norm": 0.19818229973316193,
"learning_rate": 0.0008316269438853255,
"loss": 0.5245,
"num_input_tokens_seen": 1044904,
"step": 3670
},
{
"epoch": 6.856343283582089,
"grad_norm": 0.36218205094337463,
"learning_rate": 0.0008310172719986108,
"loss": 0.5821,
"num_input_tokens_seen": 1046408,
"step": 3675
},
{
"epoch": 6.865671641791045,
"grad_norm": 0.22650587558746338,
"learning_rate": 0.0008304067226753408,
"loss": 0.4627,
"num_input_tokens_seen": 1047912,
"step": 3680
},
{
"epoch": 6.875,
"grad_norm": 0.23026876151561737,
"learning_rate": 0.0008297952975339155,
"loss": 0.7357,
"num_input_tokens_seen": 1049320,
"step": 3685
},
{
"epoch": 6.884328358208955,
"grad_norm": 0.2109512984752655,
"learning_rate": 0.0008291829981950562,
"loss": 0.4451,
"num_input_tokens_seen": 1050632,
"step": 3690
},
{
"epoch": 6.893656716417911,
"grad_norm": 0.1196155846118927,
"learning_rate": 0.0008285698262818016,
"loss": 0.4494,
"num_input_tokens_seen": 1051976,
"step": 3695
},
{
"epoch": 6.902985074626866,
"grad_norm": 0.1713961511850357,
"learning_rate": 0.0008279557834195031,
"loss": 0.6417,
"num_input_tokens_seen": 1053224,
"step": 3700
},
{
"epoch": 6.912313432835821,
"grad_norm": 0.3117183446884155,
"learning_rate": 0.000827340871235821,
"loss": 0.4689,
"num_input_tokens_seen": 1054568,
"step": 3705
},
{
"epoch": 6.9216417910447765,
"grad_norm": 0.2057737559080124,
"learning_rate": 0.00082672509136072,
"loss": 0.5804,
"num_input_tokens_seen": 1055944,
"step": 3710
},
{
"epoch": 6.9309701492537314,
"grad_norm": 0.21782958507537842,
"learning_rate": 0.0008261084454264647,
"loss": 0.3641,
"num_input_tokens_seen": 1057384,
"step": 3715
},
{
"epoch": 6.940298507462686,
"grad_norm": 0.33613821864128113,
"learning_rate": 0.0008254909350676151,
"loss": 0.6094,
"num_input_tokens_seen": 1058920,
"step": 3720
},
{
"epoch": 6.949626865671641,
"grad_norm": 0.480368435382843,
"learning_rate": 0.0008248725619210233,
"loss": 0.4046,
"num_input_tokens_seen": 1060424,
"step": 3725
},
{
"epoch": 6.958955223880597,
"grad_norm": 0.36097168922424316,
"learning_rate": 0.0008242533276258277,
"loss": 0.2856,
"num_input_tokens_seen": 1061832,
"step": 3730
},
{
"epoch": 6.968283582089552,
"grad_norm": 0.1994985193014145,
"learning_rate": 0.0008236332338234496,
"loss": 0.6103,
"num_input_tokens_seen": 1063176,
"step": 3735
},
{
"epoch": 6.977611940298507,
"grad_norm": 0.15217134356498718,
"learning_rate": 0.0008230122821575884,
"loss": 0.3072,
"num_input_tokens_seen": 1064616,
"step": 3740
},
{
"epoch": 6.986940298507463,
"grad_norm": 0.33062049746513367,
"learning_rate": 0.0008223904742742181,
"loss": 0.3627,
"num_input_tokens_seen": 1066152,
"step": 3745
},
{
"epoch": 6.996268656716418,
"grad_norm": 0.30312541127204895,
"learning_rate": 0.0008217678118215819,
"loss": 0.5468,
"num_input_tokens_seen": 1067592,
"step": 3750
},
{
"epoch": 7.0,
"eval_loss": 0.6684337258338928,
"eval_runtime": 4.1842,
"eval_samples_per_second": 56.881,
"eval_steps_per_second": 14.34,
"num_input_tokens_seen": 1067904,
"step": 3752
},
{
"epoch": 7.005597014925373,
"grad_norm": 0.30474305152893066,
"learning_rate": 0.0008211442964501879,
"loss": 0.5487,
"num_input_tokens_seen": 1068736,
"step": 3755
},
{
"epoch": 7.014925373134329,
"grad_norm": 0.44127199053764343,
"learning_rate": 0.0008205199298128055,
"loss": 0.6161,
"num_input_tokens_seen": 1070176,
"step": 3760
},
{
"epoch": 7.024253731343284,
"grad_norm": 0.2390872836112976,
"learning_rate": 0.0008198947135644606,
"loss": 0.5455,
"num_input_tokens_seen": 1071584,
"step": 3765
},
{
"epoch": 7.0335820895522385,
"grad_norm": 0.27098581194877625,
"learning_rate": 0.000819268649362431,
"loss": 0.6568,
"num_input_tokens_seen": 1072896,
"step": 3770
},
{
"epoch": 7.042910447761194,
"grad_norm": 0.3743234872817993,
"learning_rate": 0.0008186417388662421,
"loss": 0.4295,
"num_input_tokens_seen": 1074336,
"step": 3775
},
{
"epoch": 7.052238805970149,
"grad_norm": 0.16391004621982574,
"learning_rate": 0.000818013983737663,
"loss": 0.5123,
"num_input_tokens_seen": 1075712,
"step": 3780
},
{
"epoch": 7.061567164179104,
"grad_norm": 0.29341673851013184,
"learning_rate": 0.0008173853856407011,
"loss": 0.3093,
"num_input_tokens_seen": 1077152,
"step": 3785
},
{
"epoch": 7.07089552238806,
"grad_norm": 0.14143411815166473,
"learning_rate": 0.0008167559462415988,
"loss": 0.5145,
"num_input_tokens_seen": 1078496,
"step": 3790
},
{
"epoch": 7.080223880597015,
"grad_norm": 0.18919670581817627,
"learning_rate": 0.0008161256672088285,
"loss": 0.5812,
"num_input_tokens_seen": 1079680,
"step": 3795
},
{
"epoch": 7.08955223880597,
"grad_norm": 0.30277538299560547,
"learning_rate": 0.0008154945502130877,
"loss": 0.5403,
"num_input_tokens_seen": 1080864,
"step": 3800
},
{
"epoch": 7.098880597014926,
"grad_norm": 0.24299748241901398,
"learning_rate": 0.0008148625969272959,
"loss": 0.4128,
"num_input_tokens_seen": 1082208,
"step": 3805
},
{
"epoch": 7.108208955223881,
"grad_norm": 0.24601735174655914,
"learning_rate": 0.0008142298090265887,
"loss": 0.5396,
"num_input_tokens_seen": 1083680,
"step": 3810
},
{
"epoch": 7.117537313432836,
"grad_norm": 0.29024389386177063,
"learning_rate": 0.0008135961881883146,
"loss": 0.4562,
"num_input_tokens_seen": 1085152,
"step": 3815
},
{
"epoch": 7.126865671641791,
"grad_norm": 0.33937326073646545,
"learning_rate": 0.0008129617360920296,
"loss": 0.3553,
"num_input_tokens_seen": 1086752,
"step": 3820
},
{
"epoch": 7.1361940298507465,
"grad_norm": 0.20014850795269012,
"learning_rate": 0.0008123264544194933,
"loss": 0.4393,
"num_input_tokens_seen": 1088064,
"step": 3825
},
{
"epoch": 7.145522388059701,
"grad_norm": 0.2823147475719452,
"learning_rate": 0.0008116903448546639,
"loss": 0.5611,
"num_input_tokens_seen": 1089568,
"step": 3830
},
{
"epoch": 7.154850746268656,
"grad_norm": 0.2024276703596115,
"learning_rate": 0.0008110534090836951,
"loss": 0.5713,
"num_input_tokens_seen": 1091136,
"step": 3835
},
{
"epoch": 7.164179104477612,
"grad_norm": 0.2638244926929474,
"learning_rate": 0.0008104156487949297,
"loss": 0.3811,
"num_input_tokens_seen": 1092704,
"step": 3840
},
{
"epoch": 7.173507462686567,
"grad_norm": 0.21746188402175903,
"learning_rate": 0.000809777065678896,
"loss": 0.9308,
"num_input_tokens_seen": 1093952,
"step": 3845
},
{
"epoch": 7.182835820895522,
"grad_norm": 0.41835376620292664,
"learning_rate": 0.0008091376614283045,
"loss": 0.587,
"num_input_tokens_seen": 1095264,
"step": 3850
},
{
"epoch": 7.192164179104478,
"grad_norm": 0.41872432827949524,
"learning_rate": 0.0008084974377380409,
"loss": 0.4512,
"num_input_tokens_seen": 1096576,
"step": 3855
},
{
"epoch": 7.201492537313433,
"grad_norm": 0.19008269906044006,
"learning_rate": 0.0008078563963051642,
"loss": 0.6325,
"num_input_tokens_seen": 1097920,
"step": 3860
},
{
"epoch": 7.210820895522388,
"grad_norm": 0.19446614384651184,
"learning_rate": 0.0008072145388289,
"loss": 0.3391,
"num_input_tokens_seen": 1099424,
"step": 3865
},
{
"epoch": 7.220149253731344,
"grad_norm": 0.3054392635822296,
"learning_rate": 0.0008065718670106379,
"loss": 0.4429,
"num_input_tokens_seen": 1100864,
"step": 3870
},
{
"epoch": 7.229477611940299,
"grad_norm": 0.23451483249664307,
"learning_rate": 0.0008059283825539256,
"loss": 0.4245,
"num_input_tokens_seen": 1102272,
"step": 3875
},
{
"epoch": 7.2388059701492535,
"grad_norm": 0.30054786801338196,
"learning_rate": 0.0008052840871644649,
"loss": 0.4085,
"num_input_tokens_seen": 1103680,
"step": 3880
},
{
"epoch": 7.248134328358209,
"grad_norm": 0.2466873675584793,
"learning_rate": 0.0008046389825501072,
"loss": 0.3337,
"num_input_tokens_seen": 1104960,
"step": 3885
},
{
"epoch": 7.257462686567164,
"grad_norm": 0.1704123169183731,
"learning_rate": 0.0008039930704208492,
"loss": 0.6229,
"num_input_tokens_seen": 1106368,
"step": 3890
},
{
"epoch": 7.266791044776119,
"grad_norm": 0.19805479049682617,
"learning_rate": 0.0008033463524888278,
"loss": 0.4459,
"num_input_tokens_seen": 1107680,
"step": 3895
},
{
"epoch": 7.276119402985074,
"grad_norm": 0.3426274061203003,
"learning_rate": 0.0008026988304683158,
"loss": 0.5647,
"num_input_tokens_seen": 1109184,
"step": 3900
},
{
"epoch": 7.28544776119403,
"grad_norm": 0.18383286893367767,
"learning_rate": 0.0008020505060757178,
"loss": 0.3523,
"num_input_tokens_seen": 1110656,
"step": 3905
},
{
"epoch": 7.294776119402985,
"grad_norm": 0.22394180297851562,
"learning_rate": 0.0008014013810295649,
"loss": 0.4805,
"num_input_tokens_seen": 1112256,
"step": 3910
},
{
"epoch": 7.30410447761194,
"grad_norm": 0.1889072060585022,
"learning_rate": 0.0008007514570505107,
"loss": 0.4508,
"num_input_tokens_seen": 1113696,
"step": 3915
},
{
"epoch": 7.313432835820896,
"grad_norm": 0.1642770767211914,
"learning_rate": 0.0008001007358613263,
"loss": 0.5743,
"num_input_tokens_seen": 1114976,
"step": 3920
},
{
"epoch": 7.322761194029851,
"grad_norm": 0.32939183712005615,
"learning_rate": 0.0007994492191868965,
"loss": 0.4055,
"num_input_tokens_seen": 1116288,
"step": 3925
},
{
"epoch": 7.332089552238806,
"grad_norm": 0.32331785559654236,
"learning_rate": 0.0007987969087542142,
"loss": 0.4768,
"num_input_tokens_seen": 1117760,
"step": 3930
},
{
"epoch": 7.3414179104477615,
"grad_norm": 0.19969119131565094,
"learning_rate": 0.0007981438062923767,
"loss": 0.6326,
"num_input_tokens_seen": 1119072,
"step": 3935
},
{
"epoch": 7.350746268656716,
"grad_norm": 0.31672680377960205,
"learning_rate": 0.0007974899135325804,
"loss": 0.6273,
"num_input_tokens_seen": 1120448,
"step": 3940
},
{
"epoch": 7.360074626865671,
"grad_norm": 0.19286391139030457,
"learning_rate": 0.000796835232208117,
"loss": 0.3779,
"num_input_tokens_seen": 1121952,
"step": 3945
},
{
"epoch": 7.369402985074627,
"grad_norm": 0.1753414422273636,
"learning_rate": 0.0007961797640543678,
"loss": 0.4282,
"num_input_tokens_seen": 1123424,
"step": 3950
},
{
"epoch": 7.378731343283582,
"grad_norm": 0.1815296709537506,
"learning_rate": 0.0007955235108088008,
"loss": 0.4562,
"num_input_tokens_seen": 1124800,
"step": 3955
},
{
"epoch": 7.388059701492537,
"grad_norm": 0.22495396435260773,
"learning_rate": 0.0007948664742109639,
"loss": 0.5935,
"num_input_tokens_seen": 1126208,
"step": 3960
},
{
"epoch": 7.397388059701493,
"grad_norm": 0.2588635981082916,
"learning_rate": 0.0007942086560024826,
"loss": 0.5487,
"num_input_tokens_seen": 1127488,
"step": 3965
},
{
"epoch": 7.406716417910448,
"grad_norm": 0.459797203540802,
"learning_rate": 0.0007935500579270532,
"loss": 0.8277,
"num_input_tokens_seen": 1128864,
"step": 3970
},
{
"epoch": 7.416044776119403,
"grad_norm": 0.398291677236557,
"learning_rate": 0.0007928906817304397,
"loss": 0.5892,
"num_input_tokens_seen": 1130368,
"step": 3975
},
{
"epoch": 7.425373134328359,
"grad_norm": 0.20469245314598083,
"learning_rate": 0.0007922305291604687,
"loss": 0.4956,
"num_input_tokens_seen": 1131872,
"step": 3980
},
{
"epoch": 7.434701492537314,
"grad_norm": 0.36543020606040955,
"learning_rate": 0.0007915696019670248,
"loss": 0.6108,
"num_input_tokens_seen": 1133440,
"step": 3985
},
{
"epoch": 7.4440298507462686,
"grad_norm": 0.29691100120544434,
"learning_rate": 0.000790907901902046,
"loss": 0.6105,
"num_input_tokens_seen": 1134720,
"step": 3990
},
{
"epoch": 7.4533582089552235,
"grad_norm": 0.23733581602573395,
"learning_rate": 0.0007902454307195184,
"loss": 0.3708,
"num_input_tokens_seen": 1136224,
"step": 3995
},
{
"epoch": 7.462686567164179,
"grad_norm": 0.19936566054821014,
"learning_rate": 0.0007895821901754727,
"loss": 0.5386,
"num_input_tokens_seen": 1137600,
"step": 4000
},
{
"epoch": 7.472014925373134,
"grad_norm": 0.21452738344669342,
"learning_rate": 0.000788918182027979,
"loss": 0.5008,
"num_input_tokens_seen": 1139040,
"step": 4005
},
{
"epoch": 7.481343283582089,
"grad_norm": 0.2815008759498596,
"learning_rate": 0.0007882534080371414,
"loss": 0.3934,
"num_input_tokens_seen": 1140576,
"step": 4010
},
{
"epoch": 7.490671641791045,
"grad_norm": 0.28204086422920227,
"learning_rate": 0.000787587869965095,
"loss": 0.4209,
"num_input_tokens_seen": 1142080,
"step": 4015
},
{
"epoch": 7.5,
"grad_norm": 0.2625596523284912,
"learning_rate": 0.0007869215695759996,
"loss": 0.6684,
"num_input_tokens_seen": 1143680,
"step": 4020
},
{
"epoch": 7.509328358208955,
"grad_norm": 0.25634852051734924,
"learning_rate": 0.000786254508636036,
"loss": 0.5474,
"num_input_tokens_seen": 1145120,
"step": 4025
},
{
"epoch": 7.518656716417911,
"grad_norm": 0.2722490727901459,
"learning_rate": 0.0007855866889134008,
"loss": 0.848,
"num_input_tokens_seen": 1146496,
"step": 4030
},
{
"epoch": 7.527985074626866,
"grad_norm": 0.3221842348575592,
"learning_rate": 0.0007849181121783021,
"loss": 0.4516,
"num_input_tokens_seen": 1147808,
"step": 4035
},
{
"epoch": 7.537313432835821,
"grad_norm": 0.22305116057395935,
"learning_rate": 0.0007842487802029545,
"loss": 0.6108,
"num_input_tokens_seen": 1149344,
"step": 4040
},
{
"epoch": 7.5466417910447765,
"grad_norm": 0.23894543945789337,
"learning_rate": 0.0007835786947615748,
"loss": 0.3998,
"num_input_tokens_seen": 1150880,
"step": 4045
},
{
"epoch": 7.5559701492537314,
"grad_norm": 0.30003124475479126,
"learning_rate": 0.0007829078576303768,
"loss": 0.5354,
"num_input_tokens_seen": 1152352,
"step": 4050
},
{
"epoch": 7.565298507462686,
"grad_norm": 0.344744473695755,
"learning_rate": 0.0007822362705875667,
"loss": 0.6149,
"num_input_tokens_seen": 1153856,
"step": 4055
},
{
"epoch": 7.574626865671641,
"grad_norm": 0.20434407889842987,
"learning_rate": 0.0007815639354133388,
"loss": 0.3679,
"num_input_tokens_seen": 1155424,
"step": 4060
},
{
"epoch": 7.583955223880597,
"grad_norm": 0.3096279203891754,
"learning_rate": 0.0007808908538898703,
"loss": 0.4134,
"num_input_tokens_seen": 1156992,
"step": 4065
},
{
"epoch": 7.593283582089552,
"grad_norm": 0.12501931190490723,
"learning_rate": 0.000780217027801317,
"loss": 0.3932,
"num_input_tokens_seen": 1158624,
"step": 4070
},
{
"epoch": 7.602611940298507,
"grad_norm": 0.18204405903816223,
"learning_rate": 0.0007795424589338079,
"loss": 0.4619,
"num_input_tokens_seen": 1159968,
"step": 4075
},
{
"epoch": 7.611940298507463,
"grad_norm": 0.3062838912010193,
"learning_rate": 0.0007788671490754416,
"loss": 0.6422,
"num_input_tokens_seen": 1161216,
"step": 4080
},
{
"epoch": 7.621268656716418,
"grad_norm": 0.3464128375053406,
"learning_rate": 0.00077819110001628,
"loss": 0.4757,
"num_input_tokens_seen": 1162592,
"step": 4085
},
{
"epoch": 7.630597014925373,
"grad_norm": 0.1720285266637802,
"learning_rate": 0.0007775143135483451,
"loss": 0.3046,
"num_input_tokens_seen": 1164288,
"step": 4090
},
{
"epoch": 7.639925373134329,
"grad_norm": 0.32936620712280273,
"learning_rate": 0.0007768367914656135,
"loss": 0.4966,
"num_input_tokens_seen": 1165824,
"step": 4095
},
{
"epoch": 7.649253731343284,
"grad_norm": 0.3390771448612213,
"learning_rate": 0.0007761585355640112,
"loss": 0.5416,
"num_input_tokens_seen": 1167264,
"step": 4100
},
{
"epoch": 7.6585820895522385,
"grad_norm": 0.23222362995147705,
"learning_rate": 0.00077547954764141,
"loss": 0.4355,
"num_input_tokens_seen": 1168672,
"step": 4105
},
{
"epoch": 7.667910447761194,
"grad_norm": 0.2488616555929184,
"learning_rate": 0.0007747998294976216,
"loss": 0.4442,
"num_input_tokens_seen": 1170400,
"step": 4110
},
{
"epoch": 7.677238805970149,
"grad_norm": 0.33894675970077515,
"learning_rate": 0.0007741193829343937,
"loss": 0.5229,
"num_input_tokens_seen": 1172000,
"step": 4115
},
{
"epoch": 7.686567164179104,
"grad_norm": 0.1934259831905365,
"learning_rate": 0.0007734382097554044,
"loss": 0.3793,
"num_input_tokens_seen": 1173568,
"step": 4120
},
{
"epoch": 7.69589552238806,
"grad_norm": 0.3763267993927002,
"learning_rate": 0.0007727563117662584,
"loss": 0.5349,
"num_input_tokens_seen": 1175040,
"step": 4125
},
{
"epoch": 7.705223880597015,
"grad_norm": 0.28987762331962585,
"learning_rate": 0.0007720736907744811,
"loss": 0.402,
"num_input_tokens_seen": 1176480,
"step": 4130
},
{
"epoch": 7.71455223880597,
"grad_norm": 0.3133031129837036,
"learning_rate": 0.0007713903485895148,
"loss": 0.449,
"num_input_tokens_seen": 1177856,
"step": 4135
},
{
"epoch": 7.723880597014926,
"grad_norm": 0.26172906160354614,
"learning_rate": 0.0007707062870227136,
"loss": 0.341,
"num_input_tokens_seen": 1179072,
"step": 4140
},
{
"epoch": 7.733208955223881,
"grad_norm": 0.23562432825565338,
"learning_rate": 0.0007700215078873378,
"loss": 0.3482,
"num_input_tokens_seen": 1180640,
"step": 4145
},
{
"epoch": 7.742537313432836,
"grad_norm": 0.213342547416687,
"learning_rate": 0.0007693360129985507,
"loss": 0.5387,
"num_input_tokens_seen": 1181920,
"step": 4150
},
{
"epoch": 7.7518656716417915,
"grad_norm": 0.3449512720108032,
"learning_rate": 0.000768649804173412,
"loss": 0.3724,
"num_input_tokens_seen": 1183488,
"step": 4155
},
{
"epoch": 7.7611940298507465,
"grad_norm": 0.3592718243598938,
"learning_rate": 0.0007679628832308743,
"loss": 0.5064,
"num_input_tokens_seen": 1184864,
"step": 4160
},
{
"epoch": 7.770522388059701,
"grad_norm": 0.32927563786506653,
"learning_rate": 0.0007672752519917779,
"loss": 0.4903,
"num_input_tokens_seen": 1186240,
"step": 4165
},
{
"epoch": 7.779850746268656,
"grad_norm": 0.18627624213695526,
"learning_rate": 0.0007665869122788458,
"loss": 0.6339,
"num_input_tokens_seen": 1187488,
"step": 4170
},
{
"epoch": 7.789179104477612,
"grad_norm": 0.42838889360427856,
"learning_rate": 0.0007658978659166787,
"loss": 0.769,
"num_input_tokens_seen": 1188800,
"step": 4175
},
{
"epoch": 7.798507462686567,
"grad_norm": 0.21956242620944977,
"learning_rate": 0.0007652081147317509,
"loss": 0.4466,
"num_input_tokens_seen": 1190272,
"step": 4180
},
{
"epoch": 7.807835820895522,
"grad_norm": 0.40260300040245056,
"learning_rate": 0.0007645176605524049,
"loss": 0.5467,
"num_input_tokens_seen": 1191648,
"step": 4185
},
{
"epoch": 7.817164179104478,
"grad_norm": 0.34386584162712097,
"learning_rate": 0.000763826505208846,
"loss": 0.7075,
"num_input_tokens_seen": 1193024,
"step": 4190
},
{
"epoch": 7.826492537313433,
"grad_norm": 0.45418819785118103,
"learning_rate": 0.0007631346505331391,
"loss": 0.5423,
"num_input_tokens_seen": 1194272,
"step": 4195
},
{
"epoch": 7.835820895522388,
"grad_norm": 0.36769580841064453,
"learning_rate": 0.0007624420983592022,
"loss": 0.4375,
"num_input_tokens_seen": 1195680,
"step": 4200
},
{
"epoch": 7.845149253731344,
"grad_norm": 0.20569390058517456,
"learning_rate": 0.0007617488505228023,
"loss": 0.4004,
"num_input_tokens_seen": 1197280,
"step": 4205
},
{
"epoch": 7.854477611940299,
"grad_norm": 0.2689351439476013,
"learning_rate": 0.0007610549088615504,
"loss": 0.5249,
"num_input_tokens_seen": 1198656,
"step": 4210
},
{
"epoch": 7.8638059701492535,
"grad_norm": 0.2242366224527359,
"learning_rate": 0.0007603602752148968,
"loss": 0.5177,
"num_input_tokens_seen": 1200160,
"step": 4215
},
{
"epoch": 7.8731343283582085,
"grad_norm": 0.20943698287010193,
"learning_rate": 0.0007596649514241259,
"loss": 0.3971,
"num_input_tokens_seen": 1201536,
"step": 4220
},
{
"epoch": 7.882462686567164,
"grad_norm": 0.19763123989105225,
"learning_rate": 0.0007589689393323513,
"loss": 0.4378,
"num_input_tokens_seen": 1203104,
"step": 4225
},
{
"epoch": 7.891791044776119,
"grad_norm": 0.06624174118041992,
"learning_rate": 0.0007582722407845118,
"loss": 0.4475,
"num_input_tokens_seen": 1204640,
"step": 4230
},
{
"epoch": 7.901119402985074,
"grad_norm": 0.17526885867118835,
"learning_rate": 0.0007575748576273649,
"loss": 0.5821,
"num_input_tokens_seen": 1206176,
"step": 4235
},
{
"epoch": 7.91044776119403,
"grad_norm": 0.22568243741989136,
"learning_rate": 0.0007568767917094836,
"loss": 0.465,
"num_input_tokens_seen": 1207680,
"step": 4240
},
{
"epoch": 7.919776119402985,
"grad_norm": 0.17621439695358276,
"learning_rate": 0.0007561780448812501,
"loss": 0.4096,
"num_input_tokens_seen": 1208960,
"step": 4245
},
{
"epoch": 7.92910447761194,
"grad_norm": 0.26744747161865234,
"learning_rate": 0.0007554786189948518,
"loss": 0.5476,
"num_input_tokens_seen": 1210400,
"step": 4250
},
{
"epoch": 7.938432835820896,
"grad_norm": 0.12365785986185074,
"learning_rate": 0.0007547785159042761,
"loss": 0.2842,
"num_input_tokens_seen": 1211904,
"step": 4255
},
{
"epoch": 7.947761194029851,
"grad_norm": 0.21446040272712708,
"learning_rate": 0.0007540777374653056,
"loss": 0.524,
"num_input_tokens_seen": 1213344,
"step": 4260
},
{
"epoch": 7.957089552238806,
"grad_norm": 0.21890805661678314,
"learning_rate": 0.0007533762855355126,
"loss": 0.4233,
"num_input_tokens_seen": 1214592,
"step": 4265
},
{
"epoch": 7.9664179104477615,
"grad_norm": 0.284196674823761,
"learning_rate": 0.0007526741619742553,
"loss": 0.4797,
"num_input_tokens_seen": 1215936,
"step": 4270
},
{
"epoch": 7.975746268656716,
"grad_norm": 0.30007484555244446,
"learning_rate": 0.0007519713686426717,
"loss": 0.7494,
"num_input_tokens_seen": 1217504,
"step": 4275
},
{
"epoch": 7.985074626865671,
"grad_norm": 0.17375683784484863,
"learning_rate": 0.0007512679074036751,
"loss": 0.7121,
"num_input_tokens_seen": 1218880,
"step": 4280
},
{
"epoch": 7.994402985074627,
"grad_norm": 0.22325703501701355,
"learning_rate": 0.00075056378012195,
"loss": 0.429,
"num_input_tokens_seen": 1220320,
"step": 4285
},
{
"epoch": 8.0,
"eval_loss": 0.6669880747795105,
"eval_runtime": 4.1872,
"eval_samples_per_second": 56.84,
"eval_steps_per_second": 14.329,
"num_input_tokens_seen": 1221016,
"step": 4288
},
{
"epoch": 8.003731343283581,
"grad_norm": 0.34321409463882446,
"learning_rate": 0.0007498589886639457,
"loss": 0.5687,
"num_input_tokens_seen": 1221560,
"step": 4290
},
{
"epoch": 8.013059701492537,
"grad_norm": 0.2419702708721161,
"learning_rate": 0.0007491535348978719,
"loss": 0.2726,
"num_input_tokens_seen": 1223224,
"step": 4295
},
{
"epoch": 8.022388059701493,
"grad_norm": 0.25188514590263367,
"learning_rate": 0.0007484474206936947,
"loss": 0.5986,
"num_input_tokens_seen": 1224600,
"step": 4300
},
{
"epoch": 8.031716417910447,
"grad_norm": 0.30116868019104004,
"learning_rate": 0.0007477406479231299,
"loss": 0.339,
"num_input_tokens_seen": 1225880,
"step": 4305
},
{
"epoch": 8.041044776119403,
"grad_norm": 0.21877850592136383,
"learning_rate": 0.0007470332184596398,
"loss": 0.3826,
"num_input_tokens_seen": 1227288,
"step": 4310
},
{
"epoch": 8.050373134328359,
"grad_norm": 0.3433838486671448,
"learning_rate": 0.0007463251341784271,
"loss": 0.3924,
"num_input_tokens_seen": 1228600,
"step": 4315
},
{
"epoch": 8.059701492537313,
"grad_norm": 0.23504002392292023,
"learning_rate": 0.00074561639695643,
"loss": 0.4071,
"num_input_tokens_seen": 1230008,
"step": 4320
},
{
"epoch": 8.069029850746269,
"grad_norm": 0.344004362821579,
"learning_rate": 0.0007449070086723178,
"loss": 0.6863,
"num_input_tokens_seen": 1231320,
"step": 4325
},
{
"epoch": 8.078358208955224,
"grad_norm": 0.6055997610092163,
"learning_rate": 0.0007441969712064856,
"loss": 0.5183,
"num_input_tokens_seen": 1232536,
"step": 4330
},
{
"epoch": 8.087686567164178,
"grad_norm": 0.20780882239341736,
"learning_rate": 0.0007434862864410487,
"loss": 0.4671,
"num_input_tokens_seen": 1233880,
"step": 4335
},
{
"epoch": 8.097014925373134,
"grad_norm": 0.17353133857250214,
"learning_rate": 0.0007427749562598392,
"loss": 0.4412,
"num_input_tokens_seen": 1235416,
"step": 4340
},
{
"epoch": 8.10634328358209,
"grad_norm": 0.2616088390350342,
"learning_rate": 0.0007420629825483993,
"loss": 0.4073,
"num_input_tokens_seen": 1236664,
"step": 4345
},
{
"epoch": 8.115671641791044,
"grad_norm": 0.3216690123081207,
"learning_rate": 0.000741350367193977,
"loss": 0.4714,
"num_input_tokens_seen": 1238136,
"step": 4350
},
{
"epoch": 8.125,
"grad_norm": 0.19009575247764587,
"learning_rate": 0.000740637112085522,
"loss": 0.4795,
"num_input_tokens_seen": 1239768,
"step": 4355
},
{
"epoch": 8.134328358208956,
"grad_norm": 0.3063131868839264,
"learning_rate": 0.0007399232191136785,
"loss": 0.3927,
"num_input_tokens_seen": 1241240,
"step": 4360
},
{
"epoch": 8.14365671641791,
"grad_norm": 0.2866196930408478,
"learning_rate": 0.0007392086901707824,
"loss": 0.4687,
"num_input_tokens_seen": 1242712,
"step": 4365
},
{
"epoch": 8.152985074626866,
"grad_norm": 0.19563937187194824,
"learning_rate": 0.0007384935271508552,
"loss": 0.3187,
"num_input_tokens_seen": 1244088,
"step": 4370
},
{
"epoch": 8.162313432835822,
"grad_norm": 0.3950585722923279,
"learning_rate": 0.000737777731949599,
"loss": 0.3728,
"num_input_tokens_seen": 1245496,
"step": 4375
},
{
"epoch": 8.171641791044776,
"grad_norm": 0.18759894371032715,
"learning_rate": 0.0007370613064643921,
"loss": 0.3397,
"num_input_tokens_seen": 1247096,
"step": 4380
},
{
"epoch": 8.180970149253731,
"grad_norm": 0.2706758677959442,
"learning_rate": 0.0007363442525942826,
"loss": 0.4425,
"num_input_tokens_seen": 1248600,
"step": 4385
},
{
"epoch": 8.190298507462687,
"grad_norm": 0.1951591670513153,
"learning_rate": 0.0007356265722399854,
"loss": 0.4339,
"num_input_tokens_seen": 1250008,
"step": 4390
},
{
"epoch": 8.199626865671641,
"grad_norm": 0.2725299894809723,
"learning_rate": 0.0007349082673038752,
"loss": 0.4047,
"num_input_tokens_seen": 1251512,
"step": 4395
},
{
"epoch": 8.208955223880597,
"grad_norm": 0.38873162865638733,
"learning_rate": 0.0007341893396899825,
"loss": 0.4714,
"num_input_tokens_seen": 1252952,
"step": 4400
},
{
"epoch": 8.218283582089553,
"grad_norm": 0.24204622209072113,
"learning_rate": 0.0007334697913039885,
"loss": 0.3452,
"num_input_tokens_seen": 1254392,
"step": 4405
},
{
"epoch": 8.227611940298507,
"grad_norm": 0.3463059067726135,
"learning_rate": 0.0007327496240532201,
"loss": 0.5566,
"num_input_tokens_seen": 1255576,
"step": 4410
},
{
"epoch": 8.236940298507463,
"grad_norm": 0.2980581223964691,
"learning_rate": 0.0007320288398466442,
"loss": 0.4341,
"num_input_tokens_seen": 1257080,
"step": 4415
},
{
"epoch": 8.246268656716419,
"grad_norm": 0.18802469968795776,
"learning_rate": 0.0007313074405948629,
"loss": 0.5806,
"num_input_tokens_seen": 1258520,
"step": 4420
},
{
"epoch": 8.255597014925373,
"grad_norm": 0.20121391117572784,
"learning_rate": 0.0007305854282101097,
"loss": 0.3982,
"num_input_tokens_seen": 1259992,
"step": 4425
},
{
"epoch": 8.264925373134329,
"grad_norm": 0.28050583600997925,
"learning_rate": 0.0007298628046062416,
"loss": 0.5442,
"num_input_tokens_seen": 1261400,
"step": 4430
},
{
"epoch": 8.274253731343283,
"grad_norm": 0.218626469373703,
"learning_rate": 0.0007291395716987379,
"loss": 0.5104,
"num_input_tokens_seen": 1262776,
"step": 4435
},
{
"epoch": 8.283582089552239,
"grad_norm": 0.2403896003961563,
"learning_rate": 0.0007284157314046911,
"loss": 0.3303,
"num_input_tokens_seen": 1264184,
"step": 4440
},
{
"epoch": 8.292910447761194,
"grad_norm": 0.24433453381061554,
"learning_rate": 0.0007276912856428048,
"loss": 0.4662,
"num_input_tokens_seen": 1265528,
"step": 4445
},
{
"epoch": 8.302238805970148,
"grad_norm": 0.25048547983169556,
"learning_rate": 0.0007269662363333873,
"loss": 0.4738,
"num_input_tokens_seen": 1266808,
"step": 4450
},
{
"epoch": 8.311567164179104,
"grad_norm": 0.33556827902793884,
"learning_rate": 0.0007262405853983467,
"loss": 0.5143,
"num_input_tokens_seen": 1268120,
"step": 4455
},
{
"epoch": 8.32089552238806,
"grad_norm": 0.2557665705680847,
"learning_rate": 0.0007255143347611855,
"loss": 0.5515,
"num_input_tokens_seen": 1269464,
"step": 4460
},
{
"epoch": 8.330223880597014,
"grad_norm": 0.2448815554380417,
"learning_rate": 0.0007247874863469963,
"loss": 0.4604,
"num_input_tokens_seen": 1271032,
"step": 4465
},
{
"epoch": 8.33955223880597,
"grad_norm": 0.2030094712972641,
"learning_rate": 0.0007240600420824564,
"loss": 0.3656,
"num_input_tokens_seen": 1272280,
"step": 4470
},
{
"epoch": 8.348880597014926,
"grad_norm": 0.3827509582042694,
"learning_rate": 0.000723332003895822,
"loss": 0.5106,
"num_input_tokens_seen": 1273848,
"step": 4475
},
{
"epoch": 8.35820895522388,
"grad_norm": 0.2691296637058258,
"learning_rate": 0.000722603373716924,
"loss": 0.3502,
"num_input_tokens_seen": 1275384,
"step": 4480
},
{
"epoch": 8.367537313432836,
"grad_norm": 0.29246485233306885,
"learning_rate": 0.0007218741534771621,
"loss": 0.4031,
"num_input_tokens_seen": 1276728,
"step": 4485
},
{
"epoch": 8.376865671641792,
"grad_norm": 0.5716930031776428,
"learning_rate": 0.0007211443451095007,
"loss": 0.4505,
"num_input_tokens_seen": 1278328,
"step": 4490
},
{
"epoch": 8.386194029850746,
"grad_norm": 0.29046720266342163,
"learning_rate": 0.0007204139505484627,
"loss": 0.6272,
"num_input_tokens_seen": 1279832,
"step": 4495
},
{
"epoch": 8.395522388059701,
"grad_norm": 0.24031990766525269,
"learning_rate": 0.000719682971730125,
"loss": 0.498,
"num_input_tokens_seen": 1281176,
"step": 4500
},
{
"epoch": 8.404850746268657,
"grad_norm": 0.3898731470108032,
"learning_rate": 0.0007189514105921133,
"loss": 0.5826,
"num_input_tokens_seen": 1282648,
"step": 4505
},
{
"epoch": 8.414179104477611,
"grad_norm": 0.2248714119195938,
"learning_rate": 0.0007182192690735964,
"loss": 0.4553,
"num_input_tokens_seen": 1283928,
"step": 4510
},
{
"epoch": 8.423507462686567,
"grad_norm": 0.3099049925804138,
"learning_rate": 0.0007174865491152823,
"loss": 0.6155,
"num_input_tokens_seen": 1285272,
"step": 4515
},
{
"epoch": 8.432835820895523,
"grad_norm": 0.3284352719783783,
"learning_rate": 0.0007167532526594115,
"loss": 0.5621,
"num_input_tokens_seen": 1286616,
"step": 4520
},
{
"epoch": 8.442164179104477,
"grad_norm": 0.28215911984443665,
"learning_rate": 0.0007160193816497536,
"loss": 0.2902,
"num_input_tokens_seen": 1288088,
"step": 4525
},
{
"epoch": 8.451492537313433,
"grad_norm": 0.2650202810764313,
"learning_rate": 0.0007152849380315999,
"loss": 0.5968,
"num_input_tokens_seen": 1289528,
"step": 4530
},
{
"epoch": 8.460820895522389,
"grad_norm": 0.2579214572906494,
"learning_rate": 0.0007145499237517607,
"loss": 0.4103,
"num_input_tokens_seen": 1291032,
"step": 4535
},
{
"epoch": 8.470149253731343,
"grad_norm": 0.24751406908035278,
"learning_rate": 0.0007138143407585584,
"loss": 0.4289,
"num_input_tokens_seen": 1292472,
"step": 4540
},
{
"epoch": 8.479477611940299,
"grad_norm": 0.23992621898651123,
"learning_rate": 0.0007130781910018227,
"loss": 0.6279,
"num_input_tokens_seen": 1293944,
"step": 4545
},
{
"epoch": 8.488805970149254,
"grad_norm": 0.12086621671915054,
"learning_rate": 0.0007123414764328864,
"loss": 0.3467,
"num_input_tokens_seen": 1295384,
"step": 4550
},
{
"epoch": 8.498134328358208,
"grad_norm": 0.3661694824695587,
"learning_rate": 0.0007116041990045788,
"loss": 0.584,
"num_input_tokens_seen": 1296792,
"step": 4555
},
{
"epoch": 8.507462686567164,
"grad_norm": 0.44709137082099915,
"learning_rate": 0.0007108663606712214,
"loss": 0.4874,
"num_input_tokens_seen": 1298104,
"step": 4560
},
{
"epoch": 8.51679104477612,
"grad_norm": 0.47159019112586975,
"learning_rate": 0.0007101279633886222,
"loss": 0.5383,
"num_input_tokens_seen": 1299608,
"step": 4565
},
{
"epoch": 8.526119402985074,
"grad_norm": 0.3675750494003296,
"learning_rate": 0.0007093890091140716,
"loss": 0.4992,
"num_input_tokens_seen": 1300888,
"step": 4570
},
{
"epoch": 8.53544776119403,
"grad_norm": 0.4664657413959503,
"learning_rate": 0.0007086494998063357,
"loss": 0.5515,
"num_input_tokens_seen": 1302296,
"step": 4575
},
{
"epoch": 8.544776119402986,
"grad_norm": 0.25660377740859985,
"learning_rate": 0.0007079094374256521,
"loss": 0.5217,
"num_input_tokens_seen": 1303768,
"step": 4580
},
{
"epoch": 8.55410447761194,
"grad_norm": 0.21756531298160553,
"learning_rate": 0.0007071688239337244,
"loss": 0.5465,
"num_input_tokens_seen": 1305208,
"step": 4585
},
{
"epoch": 8.563432835820896,
"grad_norm": 0.3067342936992645,
"learning_rate": 0.0007064276612937172,
"loss": 0.5361,
"num_input_tokens_seen": 1306584,
"step": 4590
},
{
"epoch": 8.572761194029852,
"grad_norm": 0.169167622923851,
"learning_rate": 0.0007056859514702506,
"loss": 0.4106,
"num_input_tokens_seen": 1308056,
"step": 4595
},
{
"epoch": 8.582089552238806,
"grad_norm": 0.2943738102912903,
"learning_rate": 0.0007049436964293949,
"loss": 0.4275,
"num_input_tokens_seen": 1309624,
"step": 4600
},
{
"epoch": 8.591417910447761,
"grad_norm": 0.2510972023010254,
"learning_rate": 0.0007042008981386663,
"loss": 0.4938,
"num_input_tokens_seen": 1311160,
"step": 4605
},
{
"epoch": 8.600746268656717,
"grad_norm": 0.45341455936431885,
"learning_rate": 0.0007034575585670204,
"loss": 0.383,
"num_input_tokens_seen": 1312504,
"step": 4610
},
{
"epoch": 8.610074626865671,
"grad_norm": 0.2252267301082611,
"learning_rate": 0.0007027136796848477,
"loss": 0.3927,
"num_input_tokens_seen": 1314104,
"step": 4615
},
{
"epoch": 8.619402985074627,
"grad_norm": 0.2824556827545166,
"learning_rate": 0.0007019692634639683,
"loss": 0.4237,
"num_input_tokens_seen": 1315544,
"step": 4620
},
{
"epoch": 8.628731343283581,
"grad_norm": 0.2865005433559418,
"learning_rate": 0.0007012243118776269,
"loss": 0.4228,
"num_input_tokens_seen": 1317016,
"step": 4625
},
{
"epoch": 8.638059701492537,
"grad_norm": 0.32809343934059143,
"learning_rate": 0.0007004788269004869,
"loss": 0.4611,
"num_input_tokens_seen": 1318392,
"step": 4630
},
{
"epoch": 8.647388059701493,
"grad_norm": 0.38598981499671936,
"learning_rate": 0.0006997328105086257,
"loss": 0.5493,
"num_input_tokens_seen": 1319768,
"step": 4635
},
{
"epoch": 8.656716417910447,
"grad_norm": 0.32761213183403015,
"learning_rate": 0.0006989862646795298,
"loss": 0.4471,
"num_input_tokens_seen": 1321336,
"step": 4640
},
{
"epoch": 8.666044776119403,
"grad_norm": 0.21694421768188477,
"learning_rate": 0.0006982391913920883,
"loss": 0.2965,
"num_input_tokens_seen": 1322712,
"step": 4645
},
{
"epoch": 8.675373134328359,
"grad_norm": 0.21429555118083954,
"learning_rate": 0.0006974915926265889,
"loss": 0.5602,
"num_input_tokens_seen": 1324216,
"step": 4650
},
{
"epoch": 8.684701492537313,
"grad_norm": 0.4526868462562561,
"learning_rate": 0.0006967434703647122,
"loss": 0.7087,
"num_input_tokens_seen": 1325656,
"step": 4655
},
{
"epoch": 8.694029850746269,
"grad_norm": 0.28775081038475037,
"learning_rate": 0.0006959948265895264,
"loss": 0.5634,
"num_input_tokens_seen": 1327032,
"step": 4660
},
{
"epoch": 8.703358208955224,
"grad_norm": 0.4699382185935974,
"learning_rate": 0.000695245663285482,
"loss": 0.3949,
"num_input_tokens_seen": 1328184,
"step": 4665
},
{
"epoch": 8.712686567164178,
"grad_norm": 0.3626880645751953,
"learning_rate": 0.0006944959824384067,
"loss": 0.5689,
"num_input_tokens_seen": 1329592,
"step": 4670
},
{
"epoch": 8.722014925373134,
"grad_norm": 0.2578609883785248,
"learning_rate": 0.0006937457860355002,
"loss": 0.4704,
"num_input_tokens_seen": 1330904,
"step": 4675
},
{
"epoch": 8.73134328358209,
"grad_norm": 0.3120516538619995,
"learning_rate": 0.0006929950760653285,
"loss": 0.4856,
"num_input_tokens_seen": 1332440,
"step": 4680
},
{
"epoch": 8.740671641791044,
"grad_norm": 0.24091090261936188,
"learning_rate": 0.0006922438545178194,
"loss": 0.5936,
"num_input_tokens_seen": 1333784,
"step": 4685
},
{
"epoch": 8.75,
"grad_norm": 0.4123125970363617,
"learning_rate": 0.000691492123384256,
"loss": 0.5579,
"num_input_tokens_seen": 1335128,
"step": 4690
},
{
"epoch": 8.759328358208956,
"grad_norm": 0.19290226697921753,
"learning_rate": 0.0006907398846572728,
"loss": 0.396,
"num_input_tokens_seen": 1336568,
"step": 4695
},
{
"epoch": 8.76865671641791,
"grad_norm": 0.2879132032394409,
"learning_rate": 0.0006899871403308498,
"loss": 0.3578,
"num_input_tokens_seen": 1338040,
"step": 4700
},
{
"epoch": 8.777985074626866,
"grad_norm": 0.23420406877994537,
"learning_rate": 0.0006892338924003068,
"loss": 0.4722,
"num_input_tokens_seen": 1339608,
"step": 4705
},
{
"epoch": 8.787313432835822,
"grad_norm": 0.3197929561138153,
"learning_rate": 0.0006884801428622989,
"loss": 0.5515,
"num_input_tokens_seen": 1340920,
"step": 4710
},
{
"epoch": 8.796641791044776,
"grad_norm": 0.3196072578430176,
"learning_rate": 0.0006877258937148103,
"loss": 0.3438,
"num_input_tokens_seen": 1342328,
"step": 4715
},
{
"epoch": 8.805970149253731,
"grad_norm": 0.3184308409690857,
"learning_rate": 0.0006869711469571504,
"loss": 0.4048,
"num_input_tokens_seen": 1343672,
"step": 4720
},
{
"epoch": 8.815298507462687,
"grad_norm": 0.16743049025535583,
"learning_rate": 0.0006862159045899468,
"loss": 0.4086,
"num_input_tokens_seen": 1345176,
"step": 4725
},
{
"epoch": 8.824626865671641,
"grad_norm": 0.250658243894577,
"learning_rate": 0.0006854601686151412,
"loss": 0.4755,
"num_input_tokens_seen": 1346552,
"step": 4730
},
{
"epoch": 8.833955223880597,
"grad_norm": 0.3016541302204132,
"learning_rate": 0.0006847039410359837,
"loss": 0.4492,
"num_input_tokens_seen": 1348088,
"step": 4735
},
{
"epoch": 8.843283582089553,
"grad_norm": 0.1681380718946457,
"learning_rate": 0.0006839472238570273,
"loss": 0.4924,
"num_input_tokens_seen": 1349432,
"step": 4740
},
{
"epoch": 8.852611940298507,
"grad_norm": 0.32491400837898254,
"learning_rate": 0.0006831900190841231,
"loss": 0.5161,
"num_input_tokens_seen": 1350712,
"step": 4745
},
{
"epoch": 8.861940298507463,
"grad_norm": 0.4035620391368866,
"learning_rate": 0.0006824323287244146,
"loss": 0.6008,
"num_input_tokens_seen": 1351992,
"step": 4750
},
{
"epoch": 8.871268656716419,
"grad_norm": 0.3141079843044281,
"learning_rate": 0.0006816741547863324,
"loss": 0.3931,
"num_input_tokens_seen": 1353496,
"step": 4755
},
{
"epoch": 8.880597014925373,
"grad_norm": 0.23624996840953827,
"learning_rate": 0.0006809154992795887,
"loss": 0.4218,
"num_input_tokens_seen": 1354904,
"step": 4760
},
{
"epoch": 8.889925373134329,
"grad_norm": 0.3903117775917053,
"learning_rate": 0.0006801563642151729,
"loss": 0.6213,
"num_input_tokens_seen": 1356280,
"step": 4765
},
{
"epoch": 8.899253731343283,
"grad_norm": 0.4300483465194702,
"learning_rate": 0.0006793967516053448,
"loss": 0.6809,
"num_input_tokens_seen": 1357720,
"step": 4770
},
{
"epoch": 8.908582089552239,
"grad_norm": 0.2975708246231079,
"learning_rate": 0.0006786366634636303,
"loss": 0.9333,
"num_input_tokens_seen": 1359096,
"step": 4775
},
{
"epoch": 8.917910447761194,
"grad_norm": 0.2819536626338959,
"learning_rate": 0.0006778761018048161,
"loss": 0.6371,
"num_input_tokens_seen": 1360472,
"step": 4780
},
{
"epoch": 8.927238805970148,
"grad_norm": 0.3771061599254608,
"learning_rate": 0.0006771150686449435,
"loss": 0.5515,
"num_input_tokens_seen": 1362008,
"step": 4785
},
{
"epoch": 8.936567164179104,
"grad_norm": 0.37994271516799927,
"learning_rate": 0.0006763535660013044,
"loss": 0.4953,
"num_input_tokens_seen": 1363384,
"step": 4790
},
{
"epoch": 8.94589552238806,
"grad_norm": 0.2501105070114136,
"learning_rate": 0.0006755915958924344,
"loss": 0.4797,
"num_input_tokens_seen": 1364728,
"step": 4795
},
{
"epoch": 8.955223880597014,
"grad_norm": 0.2391573041677475,
"learning_rate": 0.0006748291603381087,
"loss": 0.3529,
"num_input_tokens_seen": 1366136,
"step": 4800
},
{
"epoch": 8.96455223880597,
"grad_norm": 0.19431819021701813,
"learning_rate": 0.000674066261359336,
"loss": 0.4215,
"num_input_tokens_seen": 1367448,
"step": 4805
},
{
"epoch": 8.973880597014926,
"grad_norm": 0.23345661163330078,
"learning_rate": 0.0006733029009783537,
"loss": 0.5026,
"num_input_tokens_seen": 1369208,
"step": 4810
},
{
"epoch": 8.98320895522388,
"grad_norm": 0.3160540759563446,
"learning_rate": 0.000672539081218622,
"loss": 0.2601,
"num_input_tokens_seen": 1370744,
"step": 4815
},
{
"epoch": 8.992537313432836,
"grad_norm": 0.2732608914375305,
"learning_rate": 0.0006717748041048187,
"loss": 0.4701,
"num_input_tokens_seen": 1372056,
"step": 4820
},
{
"epoch": 9.0,
"eval_loss": 0.6892353296279907,
"eval_runtime": 4.1958,
"eval_samples_per_second": 56.724,
"eval_steps_per_second": 14.3,
"num_input_tokens_seen": 1373032,
"step": 4824
},
{
"epoch": 9.001865671641792,
"grad_norm": 0.3725232779979706,
"learning_rate": 0.0006710100716628344,
"loss": 0.6341,
"num_input_tokens_seen": 1373416,
"step": 4825
},
{
"epoch": 9.011194029850746,
"grad_norm": 0.3368667960166931,
"learning_rate": 0.0006702448859197661,
"loss": 0.574,
"num_input_tokens_seen": 1374888,
"step": 4830
},
{
"epoch": 9.020522388059701,
"grad_norm": 0.21971359848976135,
"learning_rate": 0.0006694792489039128,
"loss": 0.4461,
"num_input_tokens_seen": 1376392,
"step": 4835
},
{
"epoch": 9.029850746268657,
"grad_norm": 0.36806461215019226,
"learning_rate": 0.0006687131626447694,
"loss": 0.5298,
"num_input_tokens_seen": 1377992,
"step": 4840
},
{
"epoch": 9.039179104477611,
"grad_norm": 0.24120855331420898,
"learning_rate": 0.0006679466291730218,
"loss": 0.4954,
"num_input_tokens_seen": 1379400,
"step": 4845
},
{
"epoch": 9.048507462686567,
"grad_norm": 0.16234050691127777,
"learning_rate": 0.0006671796505205414,
"loss": 0.4344,
"num_input_tokens_seen": 1380872,
"step": 4850
},
{
"epoch": 9.057835820895523,
"grad_norm": 0.2468133419752121,
"learning_rate": 0.0006664122287203791,
"loss": 0.6219,
"num_input_tokens_seen": 1382184,
"step": 4855
},
{
"epoch": 9.067164179104477,
"grad_norm": 0.288142591714859,
"learning_rate": 0.0006656443658067615,
"loss": 0.3348,
"num_input_tokens_seen": 1383816,
"step": 4860
},
{
"epoch": 9.076492537313433,
"grad_norm": 0.5281508564949036,
"learning_rate": 0.0006648760638150832,
"loss": 0.5361,
"num_input_tokens_seen": 1385128,
"step": 4865
},
{
"epoch": 9.085820895522389,
"grad_norm": 0.3075248897075653,
"learning_rate": 0.0006641073247819041,
"loss": 0.4782,
"num_input_tokens_seen": 1386440,
"step": 4870
},
{
"epoch": 9.095149253731343,
"grad_norm": 0.18158727884292603,
"learning_rate": 0.0006633381507449412,
"loss": 0.4621,
"num_input_tokens_seen": 1387944,
"step": 4875
},
{
"epoch": 9.104477611940299,
"grad_norm": 0.4284612238407135,
"learning_rate": 0.0006625685437430655,
"loss": 0.5046,
"num_input_tokens_seen": 1389416,
"step": 4880
},
{
"epoch": 9.113805970149254,
"grad_norm": 0.2957424521446228,
"learning_rate": 0.0006617985058162953,
"loss": 0.3839,
"num_input_tokens_seen": 1390632,
"step": 4885
},
{
"epoch": 9.123134328358208,
"grad_norm": 0.23130932450294495,
"learning_rate": 0.0006610280390057914,
"loss": 0.3038,
"num_input_tokens_seen": 1392200,
"step": 4890
},
{
"epoch": 9.132462686567164,
"grad_norm": 0.20099236071109772,
"learning_rate": 0.000660257145353851,
"loss": 0.2988,
"num_input_tokens_seen": 1393576,
"step": 4895
},
{
"epoch": 9.14179104477612,
"grad_norm": 0.37872520089149475,
"learning_rate": 0.0006594858269039032,
"loss": 0.3758,
"num_input_tokens_seen": 1394824,
"step": 4900
},
{
"epoch": 9.151119402985074,
"grad_norm": 0.28783118724823,
"learning_rate": 0.0006587140857005029,
"loss": 0.4344,
"num_input_tokens_seen": 1396136,
"step": 4905
},
{
"epoch": 9.16044776119403,
"grad_norm": 0.5245941877365112,
"learning_rate": 0.0006579419237893256,
"loss": 0.6692,
"num_input_tokens_seen": 1397384,
"step": 4910
},
{
"epoch": 9.169776119402986,
"grad_norm": 0.38943246006965637,
"learning_rate": 0.0006571693432171624,
"loss": 0.4255,
"num_input_tokens_seen": 1399016,
"step": 4915
},
{
"epoch": 9.17910447761194,
"grad_norm": 0.3359316289424896,
"learning_rate": 0.0006563963460319134,
"loss": 0.4888,
"num_input_tokens_seen": 1400424,
"step": 4920
},
{
"epoch": 9.188432835820896,
"grad_norm": 0.2700708210468292,
"learning_rate": 0.0006556229342825835,
"loss": 0.461,
"num_input_tokens_seen": 1401992,
"step": 4925
},
{
"epoch": 9.197761194029852,
"grad_norm": 0.2669214606285095,
"learning_rate": 0.0006548491100192763,
"loss": 0.4753,
"num_input_tokens_seen": 1403336,
"step": 4930
},
{
"epoch": 9.207089552238806,
"grad_norm": 0.3095604181289673,
"learning_rate": 0.0006540748752931894,
"loss": 0.4736,
"num_input_tokens_seen": 1404840,
"step": 4935
},
{
"epoch": 9.216417910447761,
"grad_norm": 0.18496310710906982,
"learning_rate": 0.0006533002321566078,
"loss": 0.4155,
"num_input_tokens_seen": 1406216,
"step": 4940
},
{
"epoch": 9.225746268656717,
"grad_norm": 0.34469074010849,
"learning_rate": 0.0006525251826628991,
"loss": 0.3456,
"num_input_tokens_seen": 1407656,
"step": 4945
},
{
"epoch": 9.235074626865671,
"grad_norm": 0.21673814952373505,
"learning_rate": 0.0006517497288665086,
"loss": 0.1967,
"num_input_tokens_seen": 1409448,
"step": 4950
},
{
"epoch": 9.244402985074627,
"grad_norm": 0.357761025428772,
"learning_rate": 0.0006509738728229525,
"loss": 0.4642,
"num_input_tokens_seen": 1410792,
"step": 4955
},
{
"epoch": 9.253731343283581,
"grad_norm": 0.3709312975406647,
"learning_rate": 0.000650197616588814,
"loss": 0.449,
"num_input_tokens_seen": 1412296,
"step": 4960
},
{
"epoch": 9.263059701492537,
"grad_norm": 0.4108433723449707,
"learning_rate": 0.0006494209622217365,
"loss": 0.4615,
"num_input_tokens_seen": 1413800,
"step": 4965
},
{
"epoch": 9.272388059701493,
"grad_norm": 0.26729562878608704,
"learning_rate": 0.0006486439117804195,
"loss": 0.495,
"num_input_tokens_seen": 1415176,
"step": 4970
},
{
"epoch": 9.281716417910447,
"grad_norm": 0.2769668996334076,
"learning_rate": 0.0006478664673246115,
"loss": 0.327,
"num_input_tokens_seen": 1416616,
"step": 4975
},
{
"epoch": 9.291044776119403,
"grad_norm": 0.31615519523620605,
"learning_rate": 0.0006470886309151058,
"loss": 0.5333,
"num_input_tokens_seen": 1417960,
"step": 4980
},
{
"epoch": 9.300373134328359,
"grad_norm": 0.39532470703125,
"learning_rate": 0.0006463104046137349,
"loss": 0.5181,
"num_input_tokens_seen": 1419176,
"step": 4985
},
{
"epoch": 9.309701492537313,
"grad_norm": 0.27754735946655273,
"learning_rate": 0.0006455317904833644,
"loss": 0.3109,
"num_input_tokens_seen": 1420584,
"step": 4990
},
{
"epoch": 9.319029850746269,
"grad_norm": 0.24021460115909576,
"learning_rate": 0.0006447527905878883,
"loss": 0.4653,
"num_input_tokens_seen": 1422024,
"step": 4995
},
{
"epoch": 9.328358208955224,
"grad_norm": 0.35687342286109924,
"learning_rate": 0.0006439734069922229,
"loss": 0.4767,
"num_input_tokens_seen": 1423336,
"step": 5000
},
{
"epoch": 9.337686567164178,
"grad_norm": 0.32762643694877625,
"learning_rate": 0.0006431936417623016,
"loss": 0.4609,
"num_input_tokens_seen": 1424680,
"step": 5005
},
{
"epoch": 9.347014925373134,
"grad_norm": 0.31598585844039917,
"learning_rate": 0.0006424134969650695,
"loss": 0.4259,
"num_input_tokens_seen": 1426024,
"step": 5010
},
{
"epoch": 9.35634328358209,
"grad_norm": 0.6430713534355164,
"learning_rate": 0.0006416329746684779,
"loss": 0.685,
"num_input_tokens_seen": 1427304,
"step": 5015
},
{
"epoch": 9.365671641791044,
"grad_norm": 0.25953567028045654,
"learning_rate": 0.0006408520769414785,
"loss": 0.3353,
"num_input_tokens_seen": 1428936,
"step": 5020
},
{
"epoch": 9.375,
"grad_norm": 0.22472409904003143,
"learning_rate": 0.0006400708058540182,
"loss": 0.3017,
"num_input_tokens_seen": 1430344,
"step": 5025
},
{
"epoch": 9.384328358208956,
"grad_norm": 0.32781997323036194,
"learning_rate": 0.0006392891634770341,
"loss": 0.3861,
"num_input_tokens_seen": 1431656,
"step": 5030
},
{
"epoch": 9.39365671641791,
"grad_norm": 0.3750348389148712,
"learning_rate": 0.0006385071518824467,
"loss": 0.5049,
"num_input_tokens_seen": 1433032,
"step": 5035
},
{
"epoch": 9.402985074626866,
"grad_norm": 0.3276551365852356,
"learning_rate": 0.0006377247731431557,
"loss": 0.4301,
"num_input_tokens_seen": 1434440,
"step": 5040
},
{
"epoch": 9.412313432835822,
"grad_norm": 0.3144753873348236,
"learning_rate": 0.0006369420293330338,
"loss": 0.5351,
"num_input_tokens_seen": 1436072,
"step": 5045
},
{
"epoch": 9.421641791044776,
"grad_norm": 0.21689414978027344,
"learning_rate": 0.0006361589225269216,
"loss": 0.2511,
"num_input_tokens_seen": 1437448,
"step": 5050
},
{
"epoch": 9.430970149253731,
"grad_norm": 0.2835860252380371,
"learning_rate": 0.0006353754548006215,
"loss": 0.4159,
"num_input_tokens_seen": 1438792,
"step": 5055
},
{
"epoch": 9.440298507462687,
"grad_norm": 0.2518909275531769,
"learning_rate": 0.0006345916282308932,
"loss": 0.4009,
"num_input_tokens_seen": 1440168,
"step": 5060
},
{
"epoch": 9.449626865671641,
"grad_norm": 0.2714730203151703,
"learning_rate": 0.0006338074448954472,
"loss": 0.4108,
"num_input_tokens_seen": 1441640,
"step": 5065
},
{
"epoch": 9.458955223880597,
"grad_norm": 0.4161335527896881,
"learning_rate": 0.0006330229068729396,
"loss": 0.3473,
"num_input_tokens_seen": 1443048,
"step": 5070
},
{
"epoch": 9.468283582089553,
"grad_norm": 0.38253721594810486,
"learning_rate": 0.0006322380162429671,
"loss": 0.48,
"num_input_tokens_seen": 1444360,
"step": 5075
},
{
"epoch": 9.477611940298507,
"grad_norm": 0.21055766940116882,
"learning_rate": 0.0006314527750860603,
"loss": 0.5905,
"num_input_tokens_seen": 1445864,
"step": 5080
},
{
"epoch": 9.486940298507463,
"grad_norm": 0.5116686820983887,
"learning_rate": 0.0006306671854836801,
"loss": 0.4005,
"num_input_tokens_seen": 1447240,
"step": 5085
},
{
"epoch": 9.496268656716419,
"grad_norm": 0.38486605882644653,
"learning_rate": 0.00062988124951821,
"loss": 0.6322,
"num_input_tokens_seen": 1448616,
"step": 5090
},
{
"epoch": 9.505597014925373,
"grad_norm": 0.3872508704662323,
"learning_rate": 0.0006290949692729521,
"loss": 0.3415,
"num_input_tokens_seen": 1450056,
"step": 5095
},
{
"epoch": 9.514925373134329,
"grad_norm": 0.21692124009132385,
"learning_rate": 0.000628308346832121,
"loss": 0.4015,
"num_input_tokens_seen": 1451496,
"step": 5100
},
{
"epoch": 9.524253731343283,
"grad_norm": 0.19488053023815155,
"learning_rate": 0.0006275213842808383,
"loss": 0.3756,
"num_input_tokens_seen": 1453288,
"step": 5105
},
{
"epoch": 9.533582089552239,
"grad_norm": 0.18054017424583435,
"learning_rate": 0.0006267340837051273,
"loss": 0.5908,
"num_input_tokens_seen": 1454792,
"step": 5110
},
{
"epoch": 9.542910447761194,
"grad_norm": 0.26882755756378174,
"learning_rate": 0.0006259464471919069,
"loss": 0.4475,
"num_input_tokens_seen": 1456136,
"step": 5115
},
{
"epoch": 9.552238805970148,
"grad_norm": 0.21234282851219177,
"learning_rate": 0.0006251584768289874,
"loss": 0.4061,
"num_input_tokens_seen": 1457672,
"step": 5120
},
{
"epoch": 9.561567164179104,
"grad_norm": 0.33642059564590454,
"learning_rate": 0.000624370174705063,
"loss": 0.7005,
"num_input_tokens_seen": 1459080,
"step": 5125
},
{
"epoch": 9.57089552238806,
"grad_norm": 0.1804080754518509,
"learning_rate": 0.000623581542909708,
"loss": 0.452,
"num_input_tokens_seen": 1460552,
"step": 5130
},
{
"epoch": 9.580223880597014,
"grad_norm": 0.3603133261203766,
"learning_rate": 0.0006227925835333699,
"loss": 0.4513,
"num_input_tokens_seen": 1461832,
"step": 5135
},
{
"epoch": 9.58955223880597,
"grad_norm": 0.18820270895957947,
"learning_rate": 0.0006220032986673652,
"loss": 0.3246,
"num_input_tokens_seen": 1463176,
"step": 5140
},
{
"epoch": 9.598880597014926,
"grad_norm": 0.4032508432865143,
"learning_rate": 0.000621213690403873,
"loss": 0.486,
"num_input_tokens_seen": 1464488,
"step": 5145
},
{
"epoch": 9.60820895522388,
"grad_norm": 0.4418434202671051,
"learning_rate": 0.0006204237608359296,
"loss": 0.4409,
"num_input_tokens_seen": 1465928,
"step": 5150
},
{
"epoch": 9.617537313432836,
"grad_norm": 0.4267770051956177,
"learning_rate": 0.0006196335120574227,
"loss": 0.3294,
"num_input_tokens_seen": 1467432,
"step": 5155
},
{
"epoch": 9.626865671641792,
"grad_norm": 0.2499537467956543,
"learning_rate": 0.0006188429461630866,
"loss": 0.3229,
"num_input_tokens_seen": 1468840,
"step": 5160
},
{
"epoch": 9.636194029850746,
"grad_norm": 0.3874378204345703,
"learning_rate": 0.000618052065248496,
"loss": 0.4701,
"num_input_tokens_seen": 1470280,
"step": 5165
},
{
"epoch": 9.645522388059701,
"grad_norm": 0.25871914625167847,
"learning_rate": 0.0006172608714100603,
"loss": 0.3273,
"num_input_tokens_seen": 1471880,
"step": 5170
},
{
"epoch": 9.654850746268657,
"grad_norm": 0.2216385304927826,
"learning_rate": 0.000616469366745019,
"loss": 0.4356,
"num_input_tokens_seen": 1473224,
"step": 5175
},
{
"epoch": 9.664179104477611,
"grad_norm": 0.26175910234451294,
"learning_rate": 0.0006156775533514353,
"loss": 0.4981,
"num_input_tokens_seen": 1474504,
"step": 5180
},
{
"epoch": 9.673507462686567,
"grad_norm": 0.22560495138168335,
"learning_rate": 0.0006148854333281905,
"loss": 0.386,
"num_input_tokens_seen": 1476008,
"step": 5185
},
{
"epoch": 9.682835820895523,
"grad_norm": 0.2675865888595581,
"learning_rate": 0.0006140930087749789,
"loss": 0.4949,
"num_input_tokens_seen": 1477416,
"step": 5190
},
{
"epoch": 9.692164179104477,
"grad_norm": 0.41903549432754517,
"learning_rate": 0.0006133002817923018,
"loss": 0.5435,
"num_input_tokens_seen": 1478696,
"step": 5195
},
{
"epoch": 9.701492537313433,
"grad_norm": 0.3527034819126129,
"learning_rate": 0.0006125072544814625,
"loss": 0.5474,
"num_input_tokens_seen": 1480040,
"step": 5200
},
{
"epoch": 9.710820895522389,
"grad_norm": 0.4132210910320282,
"learning_rate": 0.0006117139289445601,
"loss": 0.4877,
"num_input_tokens_seen": 1481384,
"step": 5205
},
{
"epoch": 9.720149253731343,
"grad_norm": 0.3663824498653412,
"learning_rate": 0.0006109203072844847,
"loss": 0.7451,
"num_input_tokens_seen": 1482760,
"step": 5210
},
{
"epoch": 9.729477611940299,
"grad_norm": 0.29538971185684204,
"learning_rate": 0.0006101263916049107,
"loss": 0.4493,
"num_input_tokens_seen": 1484136,
"step": 5215
},
{
"epoch": 9.738805970149254,
"grad_norm": 0.341337114572525,
"learning_rate": 0.0006093321840102921,
"loss": 0.5761,
"num_input_tokens_seen": 1485672,
"step": 5220
},
{
"epoch": 9.748134328358208,
"grad_norm": 0.17743420600891113,
"learning_rate": 0.0006085376866058568,
"loss": 0.2284,
"num_input_tokens_seen": 1487304,
"step": 5225
},
{
"epoch": 9.757462686567164,
"grad_norm": 0.2598201632499695,
"learning_rate": 0.000607742901497601,
"loss": 0.4622,
"num_input_tokens_seen": 1488808,
"step": 5230
},
{
"epoch": 9.76679104477612,
"grad_norm": 0.2551731467247009,
"learning_rate": 0.0006069478307922831,
"loss": 0.61,
"num_input_tokens_seen": 1490280,
"step": 5235
},
{
"epoch": 9.776119402985074,
"grad_norm": 0.4407395124435425,
"learning_rate": 0.000606152476597419,
"loss": 0.3799,
"num_input_tokens_seen": 1491656,
"step": 5240
},
{
"epoch": 9.78544776119403,
"grad_norm": 0.41758984327316284,
"learning_rate": 0.0006053568410212759,
"loss": 0.4126,
"num_input_tokens_seen": 1493160,
"step": 5245
},
{
"epoch": 9.794776119402986,
"grad_norm": 0.3324006497859955,
"learning_rate": 0.0006045609261728667,
"loss": 0.4813,
"num_input_tokens_seen": 1494504,
"step": 5250
},
{
"epoch": 9.80410447761194,
"grad_norm": 0.2460772544145584,
"learning_rate": 0.0006037647341619448,
"loss": 0.3861,
"num_input_tokens_seen": 1495816,
"step": 5255
},
{
"epoch": 9.813432835820896,
"grad_norm": 0.2575085759162903,
"learning_rate": 0.000602968267098998,
"loss": 0.4007,
"num_input_tokens_seen": 1497288,
"step": 5260
},
{
"epoch": 9.822761194029852,
"grad_norm": 0.2791326940059662,
"learning_rate": 0.0006021715270952435,
"loss": 0.4939,
"num_input_tokens_seen": 1498792,
"step": 5265
},
{
"epoch": 9.832089552238806,
"grad_norm": 0.38074228167533875,
"learning_rate": 0.000601374516262622,
"loss": 0.4303,
"num_input_tokens_seen": 1500520,
"step": 5270
},
{
"epoch": 9.841417910447761,
"grad_norm": 0.34724968671798706,
"learning_rate": 0.0006005772367137917,
"loss": 0.2854,
"num_input_tokens_seen": 1501832,
"step": 5275
},
{
"epoch": 9.850746268656717,
"grad_norm": 0.27803465723991394,
"learning_rate": 0.0005997796905621236,
"loss": 0.4868,
"num_input_tokens_seen": 1503112,
"step": 5280
},
{
"epoch": 9.860074626865671,
"grad_norm": 0.31478098034858704,
"learning_rate": 0.0005989818799216949,
"loss": 0.4081,
"num_input_tokens_seen": 1504392,
"step": 5285
},
{
"epoch": 9.869402985074627,
"grad_norm": 0.3727143108844757,
"learning_rate": 0.0005981838069072843,
"loss": 0.5072,
"num_input_tokens_seen": 1505640,
"step": 5290
},
{
"epoch": 9.878731343283581,
"grad_norm": 0.2286718338727951,
"learning_rate": 0.0005973854736343658,
"loss": 0.5946,
"num_input_tokens_seen": 1506824,
"step": 5295
},
{
"epoch": 9.888059701492537,
"grad_norm": 0.24771994352340698,
"learning_rate": 0.0005965868822191032,
"loss": 0.3597,
"num_input_tokens_seen": 1508264,
"step": 5300
},
{
"epoch": 9.897388059701493,
"grad_norm": 0.2050355076789856,
"learning_rate": 0.0005957880347783449,
"loss": 0.4493,
"num_input_tokens_seen": 1509704,
"step": 5305
},
{
"epoch": 9.906716417910447,
"grad_norm": 0.35616564750671387,
"learning_rate": 0.0005949889334296172,
"loss": 0.4813,
"num_input_tokens_seen": 1511304,
"step": 5310
},
{
"epoch": 9.916044776119403,
"grad_norm": 0.3519928455352783,
"learning_rate": 0.0005941895802911205,
"loss": 0.288,
"num_input_tokens_seen": 1512712,
"step": 5315
},
{
"epoch": 9.925373134328359,
"grad_norm": 0.31984153389930725,
"learning_rate": 0.0005933899774817216,
"loss": 0.4164,
"num_input_tokens_seen": 1514184,
"step": 5320
},
{
"epoch": 9.934701492537313,
"grad_norm": 0.34073641896247864,
"learning_rate": 0.00059259012712095,
"loss": 0.4098,
"num_input_tokens_seen": 1515464,
"step": 5325
},
{
"epoch": 9.944029850746269,
"grad_norm": 0.17610888183116913,
"learning_rate": 0.0005917900313289906,
"loss": 0.4047,
"num_input_tokens_seen": 1516968,
"step": 5330
},
{
"epoch": 9.953358208955224,
"grad_norm": 0.38012248277664185,
"learning_rate": 0.0005909896922266796,
"loss": 0.3608,
"num_input_tokens_seen": 1518504,
"step": 5335
},
{
"epoch": 9.962686567164178,
"grad_norm": 0.21262069046497345,
"learning_rate": 0.0005901891119354976,
"loss": 0.3746,
"num_input_tokens_seen": 1519880,
"step": 5340
},
{
"epoch": 9.972014925373134,
"grad_norm": 0.20688098669052124,
"learning_rate": 0.0005893882925775648,
"loss": 0.4088,
"num_input_tokens_seen": 1521320,
"step": 5345
},
{
"epoch": 9.98134328358209,
"grad_norm": 0.3373405635356903,
"learning_rate": 0.0005885872362756348,
"loss": 0.4424,
"num_input_tokens_seen": 1522696,
"step": 5350
},
{
"epoch": 9.990671641791044,
"grad_norm": 0.2352280467748642,
"learning_rate": 0.0005877859451530895,
"loss": 0.4812,
"num_input_tokens_seen": 1524072,
"step": 5355
},
{
"epoch": 10.0,
"grad_norm": 1.3264073133468628,
"learning_rate": 0.0005869844213339338,
"loss": 0.9467,
"num_input_tokens_seen": 1525104,
"step": 5360
},
{
"epoch": 10.0,
"eval_loss": 0.6917274594306946,
"eval_runtime": 4.1989,
"eval_samples_per_second": 56.681,
"eval_steps_per_second": 14.289,
"num_input_tokens_seen": 1525104,
"step": 5360
},
{
"epoch": 10.009328358208956,
"grad_norm": 0.391521155834198,
"learning_rate": 0.0005861826669427882,
"loss": 0.4883,
"num_input_tokens_seen": 1526608,
"step": 5365
},
{
"epoch": 10.01865671641791,
"grad_norm": 0.3746884763240814,
"learning_rate": 0.0005853806841048854,
"loss": 0.4612,
"num_input_tokens_seen": 1528048,
"step": 5370
},
{
"epoch": 10.027985074626866,
"grad_norm": 0.44840776920318604,
"learning_rate": 0.0005845784749460631,
"loss": 0.567,
"num_input_tokens_seen": 1529360,
"step": 5375
},
{
"epoch": 10.037313432835822,
"grad_norm": 0.4189889430999756,
"learning_rate": 0.0005837760415927593,
"loss": 0.4075,
"num_input_tokens_seen": 1530672,
"step": 5380
},
{
"epoch": 10.046641791044776,
"grad_norm": 0.37005850672721863,
"learning_rate": 0.0005829733861720059,
"loss": 0.4669,
"num_input_tokens_seen": 1532176,
"step": 5385
},
{
"epoch": 10.055970149253731,
"grad_norm": 0.3520726263523102,
"learning_rate": 0.0005821705108114236,
"loss": 0.348,
"num_input_tokens_seen": 1533680,
"step": 5390
},
{
"epoch": 10.065298507462687,
"grad_norm": 0.3111060559749603,
"learning_rate": 0.0005813674176392163,
"loss": 0.6895,
"num_input_tokens_seen": 1535024,
"step": 5395
},
{
"epoch": 10.074626865671641,
"grad_norm": 0.1819819211959839,
"learning_rate": 0.0005805641087841649,
"loss": 0.3598,
"num_input_tokens_seen": 1536624,
"step": 5400
},
{
"epoch": 10.083955223880597,
"grad_norm": 0.3409096896648407,
"learning_rate": 0.0005797605863756224,
"loss": 0.4161,
"num_input_tokens_seen": 1538000,
"step": 5405
},
{
"epoch": 10.093283582089553,
"grad_norm": 0.3817724883556366,
"learning_rate": 0.0005789568525435076,
"loss": 0.3222,
"num_input_tokens_seen": 1539440,
"step": 5410
},
{
"epoch": 10.102611940298507,
"grad_norm": 0.3032921254634857,
"learning_rate": 0.0005781529094182995,
"loss": 0.4074,
"num_input_tokens_seen": 1540848,
"step": 5415
},
{
"epoch": 10.111940298507463,
"grad_norm": 0.514035165309906,
"learning_rate": 0.0005773487591310328,
"loss": 0.4141,
"num_input_tokens_seen": 1542352,
"step": 5420
},
{
"epoch": 10.121268656716419,
"grad_norm": 0.46226271986961365,
"learning_rate": 0.0005765444038132901,
"loss": 0.3154,
"num_input_tokens_seen": 1543856,
"step": 5425
},
{
"epoch": 10.130597014925373,
"grad_norm": 0.39483413100242615,
"learning_rate": 0.0005757398455971984,
"loss": 0.2744,
"num_input_tokens_seen": 1545264,
"step": 5430
},
{
"epoch": 10.139925373134329,
"grad_norm": 0.3189958333969116,
"learning_rate": 0.000574935086615422,
"loss": 0.3179,
"num_input_tokens_seen": 1546544,
"step": 5435
},
{
"epoch": 10.149253731343283,
"grad_norm": 0.21321584284305573,
"learning_rate": 0.000574130129001158,
"loss": 0.3314,
"num_input_tokens_seen": 1547856,
"step": 5440
},
{
"epoch": 10.158582089552239,
"grad_norm": 0.30174720287323,
"learning_rate": 0.000573324974888129,
"loss": 0.303,
"num_input_tokens_seen": 1549264,
"step": 5445
},
{
"epoch": 10.167910447761194,
"grad_norm": 0.22235773503780365,
"learning_rate": 0.0005725196264105796,
"loss": 0.4078,
"num_input_tokens_seen": 1550576,
"step": 5450
},
{
"epoch": 10.177238805970148,
"grad_norm": 0.3746367394924164,
"learning_rate": 0.0005717140857032691,
"loss": 0.2964,
"num_input_tokens_seen": 1552080,
"step": 5455
},
{
"epoch": 10.186567164179104,
"grad_norm": 0.319317102432251,
"learning_rate": 0.0005709083549014658,
"loss": 0.2896,
"num_input_tokens_seen": 1553488,
"step": 5460
},
{
"epoch": 10.19589552238806,
"grad_norm": 0.534639298915863,
"learning_rate": 0.0005701024361409431,
"loss": 0.4107,
"num_input_tokens_seen": 1554960,
"step": 5465
},
{
"epoch": 10.205223880597014,
"grad_norm": 0.45749685168266296,
"learning_rate": 0.0005692963315579712,
"loss": 0.2126,
"num_input_tokens_seen": 1556464,
"step": 5470
},
{
"epoch": 10.21455223880597,
"grad_norm": 0.6257716417312622,
"learning_rate": 0.0005684900432893141,
"loss": 0.6183,
"num_input_tokens_seen": 1557872,
"step": 5475
},
{
"epoch": 10.223880597014926,
"grad_norm": 0.3536997437477112,
"learning_rate": 0.0005676835734722222,
"loss": 0.4375,
"num_input_tokens_seen": 1559152,
"step": 5480
},
{
"epoch": 10.23320895522388,
"grad_norm": 0.4197767376899719,
"learning_rate": 0.0005668769242444271,
"loss": 0.4315,
"num_input_tokens_seen": 1560592,
"step": 5485
},
{
"epoch": 10.242537313432836,
"grad_norm": 0.4398539662361145,
"learning_rate": 0.0005660700977441358,
"loss": 0.6232,
"num_input_tokens_seen": 1561872,
"step": 5490
},
{
"epoch": 10.251865671641792,
"grad_norm": 0.3463667035102844,
"learning_rate": 0.000565263096110026,
"loss": 0.3988,
"num_input_tokens_seen": 1563280,
"step": 5495
},
{
"epoch": 10.261194029850746,
"grad_norm": 0.3945913314819336,
"learning_rate": 0.0005644559214812382,
"loss": 0.4993,
"num_input_tokens_seen": 1564464,
"step": 5500
},
{
"epoch": 10.270522388059701,
"grad_norm": 0.34219738841056824,
"learning_rate": 0.0005636485759973729,
"loss": 0.4785,
"num_input_tokens_seen": 1565648,
"step": 5505
},
{
"epoch": 10.279850746268657,
"grad_norm": 0.2330724447965622,
"learning_rate": 0.0005628410617984828,
"loss": 0.4828,
"num_input_tokens_seen": 1567248,
"step": 5510
},
{
"epoch": 10.289179104477611,
"grad_norm": 0.4671902060508728,
"learning_rate": 0.0005620333810250678,
"loss": 0.4126,
"num_input_tokens_seen": 1568784,
"step": 5515
},
{
"epoch": 10.298507462686567,
"grad_norm": 0.2536733150482178,
"learning_rate": 0.0005612255358180698,
"loss": 0.3553,
"num_input_tokens_seen": 1570288,
"step": 5520
},
{
"epoch": 10.307835820895523,
"grad_norm": 0.3621693551540375,
"learning_rate": 0.0005604175283188658,
"loss": 0.4296,
"num_input_tokens_seen": 1571632,
"step": 5525
},
{
"epoch": 10.317164179104477,
"grad_norm": 0.42798948287963867,
"learning_rate": 0.0005596093606692635,
"loss": 0.4056,
"num_input_tokens_seen": 1572944,
"step": 5530
},
{
"epoch": 10.326492537313433,
"grad_norm": 0.23219768702983856,
"learning_rate": 0.0005588010350114953,
"loss": 0.6229,
"num_input_tokens_seen": 1574480,
"step": 5535
},
{
"epoch": 10.335820895522389,
"grad_norm": 0.4013402760028839,
"learning_rate": 0.0005579925534882117,
"loss": 0.4501,
"num_input_tokens_seen": 1575952,
"step": 5540
},
{
"epoch": 10.345149253731343,
"grad_norm": 0.39546915888786316,
"learning_rate": 0.0005571839182424775,
"loss": 0.4323,
"num_input_tokens_seen": 1577360,
"step": 5545
},
{
"epoch": 10.354477611940299,
"grad_norm": 0.42694559693336487,
"learning_rate": 0.0005563751314177638,
"loss": 0.5057,
"num_input_tokens_seen": 1578864,
"step": 5550
},
{
"epoch": 10.363805970149254,
"grad_norm": 0.46801120042800903,
"learning_rate": 0.0005555661951579442,
"loss": 0.4682,
"num_input_tokens_seen": 1580176,
"step": 5555
},
{
"epoch": 10.373134328358208,
"grad_norm": 0.5218855142593384,
"learning_rate": 0.000554757111607288,
"loss": 0.3993,
"num_input_tokens_seen": 1581776,
"step": 5560
},
{
"epoch": 10.382462686567164,
"grad_norm": 0.37519174814224243,
"learning_rate": 0.0005539478829104555,
"loss": 0.261,
"num_input_tokens_seen": 1583248,
"step": 5565
},
{
"epoch": 10.39179104477612,
"grad_norm": 0.5104201436042786,
"learning_rate": 0.0005531385112124912,
"loss": 0.3501,
"num_input_tokens_seen": 1584592,
"step": 5570
},
{
"epoch": 10.401119402985074,
"grad_norm": 0.25808826088905334,
"learning_rate": 0.0005523289986588188,
"loss": 0.295,
"num_input_tokens_seen": 1585840,
"step": 5575
},
{
"epoch": 10.41044776119403,
"grad_norm": 0.4015122652053833,
"learning_rate": 0.0005515193473952355,
"loss": 0.4941,
"num_input_tokens_seen": 1587216,
"step": 5580
},
{
"epoch": 10.419776119402986,
"grad_norm": 0.2906988859176636,
"learning_rate": 0.0005507095595679059,
"loss": 0.5156,
"num_input_tokens_seen": 1588560,
"step": 5585
},
{
"epoch": 10.42910447761194,
"grad_norm": 0.31299829483032227,
"learning_rate": 0.000549899637323357,
"loss": 0.3407,
"num_input_tokens_seen": 1590128,
"step": 5590
},
{
"epoch": 10.438432835820896,
"grad_norm": 0.31594422459602356,
"learning_rate": 0.0005490895828084717,
"loss": 0.528,
"num_input_tokens_seen": 1591568,
"step": 5595
},
{
"epoch": 10.447761194029852,
"grad_norm": 0.30748066306114197,
"learning_rate": 0.0005482793981704841,
"loss": 0.4311,
"num_input_tokens_seen": 1592944,
"step": 5600
},
{
"epoch": 10.457089552238806,
"grad_norm": 0.4373854100704193,
"learning_rate": 0.0005474690855569724,
"loss": 0.5543,
"num_input_tokens_seen": 1594416,
"step": 5605
},
{
"epoch": 10.466417910447761,
"grad_norm": 0.318352073431015,
"learning_rate": 0.0005466586471158548,
"loss": 0.618,
"num_input_tokens_seen": 1595824,
"step": 5610
},
{
"epoch": 10.475746268656717,
"grad_norm": 0.3729168772697449,
"learning_rate": 0.0005458480849953822,
"loss": 0.3934,
"num_input_tokens_seen": 1597168,
"step": 5615
},
{
"epoch": 10.485074626865671,
"grad_norm": 0.4165967106819153,
"learning_rate": 0.0005450374013441343,
"loss": 0.3776,
"num_input_tokens_seen": 1598768,
"step": 5620
},
{
"epoch": 10.494402985074627,
"grad_norm": 0.4932622015476227,
"learning_rate": 0.0005442265983110123,
"loss": 0.3508,
"num_input_tokens_seen": 1600592,
"step": 5625
},
{
"epoch": 10.503731343283581,
"grad_norm": 0.3454512059688568,
"learning_rate": 0.0005434156780452339,
"loss": 0.4834,
"num_input_tokens_seen": 1602064,
"step": 5630
},
{
"epoch": 10.513059701492537,
"grad_norm": 0.4346556067466736,
"learning_rate": 0.0005426046426963279,
"loss": 0.6184,
"num_input_tokens_seen": 1603568,
"step": 5635
},
{
"epoch": 10.522388059701493,
"grad_norm": 0.24719765782356262,
"learning_rate": 0.0005417934944141277,
"loss": 0.4562,
"num_input_tokens_seen": 1604912,
"step": 5640
},
{
"epoch": 10.531716417910447,
"grad_norm": 0.3545502722263336,
"learning_rate": 0.0005409822353487666,
"loss": 0.5075,
"num_input_tokens_seen": 1606288,
"step": 5645
},
{
"epoch": 10.541044776119403,
"grad_norm": 0.4384237825870514,
"learning_rate": 0.0005401708676506709,
"loss": 0.353,
"num_input_tokens_seen": 1607728,
"step": 5650
},
{
"epoch": 10.550373134328359,
"grad_norm": 0.22765681147575378,
"learning_rate": 0.0005393593934705553,
"loss": 0.4498,
"num_input_tokens_seen": 1609168,
"step": 5655
},
{
"epoch": 10.559701492537313,
"grad_norm": 0.37284964323043823,
"learning_rate": 0.0005385478149594168,
"loss": 0.4036,
"num_input_tokens_seen": 1610416,
"step": 5660
},
{
"epoch": 10.569029850746269,
"grad_norm": 0.376591295003891,
"learning_rate": 0.0005377361342685286,
"loss": 0.4248,
"num_input_tokens_seen": 1611824,
"step": 5665
},
{
"epoch": 10.578358208955224,
"grad_norm": 0.30000317096710205,
"learning_rate": 0.0005369243535494352,
"loss": 0.3881,
"num_input_tokens_seen": 1613200,
"step": 5670
},
{
"epoch": 10.587686567164178,
"grad_norm": 0.4292525351047516,
"learning_rate": 0.0005361124749539457,
"loss": 0.4555,
"num_input_tokens_seen": 1614480,
"step": 5675
},
{
"epoch": 10.597014925373134,
"grad_norm": 0.3540624678134918,
"learning_rate": 0.000535300500634129,
"loss": 0.4955,
"num_input_tokens_seen": 1615792,
"step": 5680
},
{
"epoch": 10.60634328358209,
"grad_norm": 0.361230731010437,
"learning_rate": 0.000534488432742308,
"loss": 0.4688,
"num_input_tokens_seen": 1617168,
"step": 5685
},
{
"epoch": 10.615671641791044,
"grad_norm": 0.4050374925136566,
"learning_rate": 0.0005336762734310529,
"loss": 0.325,
"num_input_tokens_seen": 1618800,
"step": 5690
},
{
"epoch": 10.625,
"grad_norm": 0.3878132700920105,
"learning_rate": 0.000532864024853177,
"loss": 0.2777,
"num_input_tokens_seen": 1620304,
"step": 5695
},
{
"epoch": 10.634328358208956,
"grad_norm": 0.3716028928756714,
"learning_rate": 0.0005320516891617296,
"loss": 0.5114,
"num_input_tokens_seen": 1621808,
"step": 5700
},
{
"epoch": 10.64365671641791,
"grad_norm": 0.3676183819770813,
"learning_rate": 0.0005312392685099914,
"loss": 0.3349,
"num_input_tokens_seen": 1623248,
"step": 5705
},
{
"epoch": 10.652985074626866,
"grad_norm": 0.4085880517959595,
"learning_rate": 0.0005304267650514678,
"loss": 0.3084,
"num_input_tokens_seen": 1624656,
"step": 5710
},
{
"epoch": 10.662313432835822,
"grad_norm": 0.49978184700012207,
"learning_rate": 0.0005296141809398844,
"loss": 0.456,
"num_input_tokens_seen": 1626032,
"step": 5715
},
{
"epoch": 10.671641791044776,
"grad_norm": 0.21887758374214172,
"learning_rate": 0.0005288015183291797,
"loss": 0.3885,
"num_input_tokens_seen": 1627440,
"step": 5720
},
{
"epoch": 10.680970149253731,
"grad_norm": 0.42853760719299316,
"learning_rate": 0.0005279887793735011,
"loss": 0.3648,
"num_input_tokens_seen": 1628784,
"step": 5725
},
{
"epoch": 10.690298507462687,
"grad_norm": 0.32524973154067993,
"learning_rate": 0.0005271759662271978,
"loss": 0.4533,
"num_input_tokens_seen": 1630032,
"step": 5730
},
{
"epoch": 10.699626865671641,
"grad_norm": 0.3890216052532196,
"learning_rate": 0.0005263630810448161,
"loss": 0.307,
"num_input_tokens_seen": 1631504,
"step": 5735
},
{
"epoch": 10.708955223880597,
"grad_norm": 0.37297531962394714,
"learning_rate": 0.0005255501259810929,
"loss": 0.3934,
"num_input_tokens_seen": 1632944,
"step": 5740
},
{
"epoch": 10.718283582089553,
"grad_norm": 0.28741270303726196,
"learning_rate": 0.0005247371031909505,
"loss": 0.2608,
"num_input_tokens_seen": 1634480,
"step": 5745
},
{
"epoch": 10.727611940298507,
"grad_norm": 0.23218028247356415,
"learning_rate": 0.0005239240148294907,
"loss": 0.4041,
"num_input_tokens_seen": 1636048,
"step": 5750
},
{
"epoch": 10.736940298507463,
"grad_norm": 0.47181567549705505,
"learning_rate": 0.0005231108630519891,
"loss": 0.6394,
"num_input_tokens_seen": 1637392,
"step": 5755
},
{
"epoch": 10.746268656716419,
"grad_norm": 0.4819324016571045,
"learning_rate": 0.0005222976500138894,
"loss": 0.4447,
"num_input_tokens_seen": 1638800,
"step": 5760
},
{
"epoch": 10.755597014925373,
"grad_norm": 0.3169451355934143,
"learning_rate": 0.0005214843778707977,
"loss": 0.4645,
"num_input_tokens_seen": 1640112,
"step": 5765
},
{
"epoch": 10.764925373134329,
"grad_norm": 0.2931522727012634,
"learning_rate": 0.0005206710487784767,
"loss": 0.4478,
"num_input_tokens_seen": 1641488,
"step": 5770
},
{
"epoch": 10.774253731343283,
"grad_norm": 0.4820563793182373,
"learning_rate": 0.0005198576648928402,
"loss": 0.3516,
"num_input_tokens_seen": 1642832,
"step": 5775
},
{
"epoch": 10.783582089552239,
"grad_norm": 0.3501931428909302,
"learning_rate": 0.0005190442283699472,
"loss": 0.2941,
"num_input_tokens_seen": 1644368,
"step": 5780
},
{
"epoch": 10.792910447761194,
"grad_norm": 0.32435059547424316,
"learning_rate": 0.000518230741365996,
"loss": 0.4281,
"num_input_tokens_seen": 1645840,
"step": 5785
},
{
"epoch": 10.802238805970148,
"grad_norm": 0.39015549421310425,
"learning_rate": 0.0005174172060373189,
"loss": 0.3721,
"num_input_tokens_seen": 1647408,
"step": 5790
},
{
"epoch": 10.811567164179104,
"grad_norm": 0.4212459325790405,
"learning_rate": 0.0005166036245403767,
"loss": 0.7593,
"num_input_tokens_seen": 1648688,
"step": 5795
},
{
"epoch": 10.82089552238806,
"grad_norm": 0.42889150977134705,
"learning_rate": 0.0005157899990317515,
"loss": 0.5395,
"num_input_tokens_seen": 1650224,
"step": 5800
},
{
"epoch": 10.830223880597014,
"grad_norm": 0.2492639124393463,
"learning_rate": 0.0005149763316681433,
"loss": 0.3117,
"num_input_tokens_seen": 1651696,
"step": 5805
},
{
"epoch": 10.83955223880597,
"grad_norm": 0.389493465423584,
"learning_rate": 0.0005141626246063622,
"loss": 0.4391,
"num_input_tokens_seen": 1653200,
"step": 5810
},
{
"epoch": 10.848880597014926,
"grad_norm": 0.1207047626376152,
"learning_rate": 0.0005133488800033241,
"loss": 0.2615,
"num_input_tokens_seen": 1654896,
"step": 5815
},
{
"epoch": 10.85820895522388,
"grad_norm": 0.36992502212524414,
"learning_rate": 0.0005125351000160438,
"loss": 0.367,
"num_input_tokens_seen": 1656240,
"step": 5820
},
{
"epoch": 10.867537313432836,
"grad_norm": 0.31403908133506775,
"learning_rate": 0.0005117212868016303,
"loss": 0.3378,
"num_input_tokens_seen": 1657744,
"step": 5825
},
{
"epoch": 10.876865671641792,
"grad_norm": 0.28204184770584106,
"learning_rate": 0.0005109074425172806,
"loss": 0.3401,
"num_input_tokens_seen": 1658960,
"step": 5830
},
{
"epoch": 10.886194029850746,
"grad_norm": 0.5115631222724915,
"learning_rate": 0.0005100935693202741,
"loss": 0.4407,
"num_input_tokens_seen": 1660240,
"step": 5835
},
{
"epoch": 10.895522388059701,
"grad_norm": 0.45001569390296936,
"learning_rate": 0.0005092796693679667,
"loss": 0.4221,
"num_input_tokens_seen": 1661680,
"step": 5840
},
{
"epoch": 10.904850746268657,
"grad_norm": 0.44775334000587463,
"learning_rate": 0.0005084657448177855,
"loss": 0.4631,
"num_input_tokens_seen": 1663056,
"step": 5845
},
{
"epoch": 10.914179104477611,
"grad_norm": 0.18757286667823792,
"learning_rate": 0.0005076517978272225,
"loss": 0.389,
"num_input_tokens_seen": 1664400,
"step": 5850
},
{
"epoch": 10.923507462686567,
"grad_norm": 0.3951946794986725,
"learning_rate": 0.0005068378305538292,
"loss": 0.3844,
"num_input_tokens_seen": 1665968,
"step": 5855
},
{
"epoch": 10.932835820895523,
"grad_norm": 0.23343099653720856,
"learning_rate": 0.0005060238451552111,
"loss": 0.4433,
"num_input_tokens_seen": 1667280,
"step": 5860
},
{
"epoch": 10.942164179104477,
"grad_norm": 0.3723389804363251,
"learning_rate": 0.0005052098437890215,
"loss": 0.547,
"num_input_tokens_seen": 1669008,
"step": 5865
},
{
"epoch": 10.951492537313433,
"grad_norm": 0.37627390027046204,
"learning_rate": 0.0005043958286129562,
"loss": 0.3503,
"num_input_tokens_seen": 1670512,
"step": 5870
},
{
"epoch": 10.960820895522389,
"grad_norm": 0.21539832651615143,
"learning_rate": 0.0005035818017847476,
"loss": 0.2571,
"num_input_tokens_seen": 1672080,
"step": 5875
},
{
"epoch": 10.970149253731343,
"grad_norm": 0.38906389474868774,
"learning_rate": 0.0005027677654621586,
"loss": 0.3463,
"num_input_tokens_seen": 1673552,
"step": 5880
},
{
"epoch": 10.979477611940299,
"grad_norm": 0.5205991268157959,
"learning_rate": 0.000501953721802978,
"loss": 0.3297,
"num_input_tokens_seen": 1674896,
"step": 5885
},
{
"epoch": 10.988805970149254,
"grad_norm": 0.2763025164604187,
"learning_rate": 0.0005011396729650135,
"loss": 0.4295,
"num_input_tokens_seen": 1676272,
"step": 5890
},
{
"epoch": 10.998134328358208,
"grad_norm": 0.32854244112968445,
"learning_rate": 0.0005003256211060866,
"loss": 0.5869,
"num_input_tokens_seen": 1677616,
"step": 5895
},
{
"epoch": 11.0,
"eval_loss": 0.7297408580780029,
"eval_runtime": 4.1905,
"eval_samples_per_second": 56.795,
"eval_steps_per_second": 14.318,
"num_input_tokens_seen": 1677680,
"step": 5896
},
{
"epoch": 11.007462686567164,
"grad_norm": 0.2975498139858246,
"learning_rate": 0.0004995115683840269,
"loss": 0.2877,
"num_input_tokens_seen": 1678960,
"step": 5900
},
{
"epoch": 11.01679104477612,
"grad_norm": 0.3614169955253601,
"learning_rate": 0.0004986975169566662,
"loss": 0.4449,
"num_input_tokens_seen": 1680496,
"step": 5905
},
{
"epoch": 11.026119402985074,
"grad_norm": 0.31577494740486145,
"learning_rate": 0.0004978834689818331,
"loss": 0.2121,
"num_input_tokens_seen": 1681744,
"step": 5910
},
{
"epoch": 11.03544776119403,
"grad_norm": 0.2219756692647934,
"learning_rate": 0.0004970694266173466,
"loss": 0.3117,
"num_input_tokens_seen": 1683248,
"step": 5915
},
{
"epoch": 11.044776119402986,
"grad_norm": 0.4874166250228882,
"learning_rate": 0.0004962553920210117,
"loss": 0.492,
"num_input_tokens_seen": 1684528,
"step": 5920
},
{
"epoch": 11.05410447761194,
"grad_norm": 0.4654216468334198,
"learning_rate": 0.0004954413673506114,
"loss": 0.299,
"num_input_tokens_seen": 1686064,
"step": 5925
},
{
"epoch": 11.063432835820896,
"grad_norm": 0.311125785112381,
"learning_rate": 0.0004946273547639039,
"loss": 0.2558,
"num_input_tokens_seen": 1687440,
"step": 5930
},
{
"epoch": 11.072761194029852,
"grad_norm": 0.26367080211639404,
"learning_rate": 0.0004938133564186141,
"loss": 0.3663,
"num_input_tokens_seen": 1688976,
"step": 5935
},
{
"epoch": 11.082089552238806,
"grad_norm": 0.5018680691719055,
"learning_rate": 0.00049299937447243,
"loss": 0.2736,
"num_input_tokens_seen": 1690512,
"step": 5940
},
{
"epoch": 11.091417910447761,
"grad_norm": 0.5152081847190857,
"learning_rate": 0.0004921854110829962,
"loss": 0.4058,
"num_input_tokens_seen": 1691920,
"step": 5945
},
{
"epoch": 11.100746268656716,
"grad_norm": 0.5324569344520569,
"learning_rate": 0.0004913714684079071,
"loss": 0.4961,
"num_input_tokens_seen": 1693392,
"step": 5950
},
{
"epoch": 11.110074626865671,
"grad_norm": 0.48775115609169006,
"learning_rate": 0.0004905575486047034,
"loss": 0.4429,
"num_input_tokens_seen": 1694768,
"step": 5955
},
{
"epoch": 11.119402985074627,
"grad_norm": 0.4510795474052429,
"learning_rate": 0.0004897436538308641,
"loss": 0.3833,
"num_input_tokens_seen": 1696272,
"step": 5960
},
{
"epoch": 11.128731343283581,
"grad_norm": 0.4744063913822174,
"learning_rate": 0.0004889297862438028,
"loss": 0.4359,
"num_input_tokens_seen": 1697584,
"step": 5965
},
{
"epoch": 11.138059701492537,
"grad_norm": 0.357310026884079,
"learning_rate": 0.00048811594800086066,
"loss": 0.266,
"num_input_tokens_seen": 1699216,
"step": 5970
},
{
"epoch": 11.147388059701493,
"grad_norm": 0.39351770281791687,
"learning_rate": 0.00048730214125930076,
"loss": 0.286,
"num_input_tokens_seen": 1700432,
"step": 5975
},
{
"epoch": 11.156716417910447,
"grad_norm": 0.3268272876739502,
"learning_rate": 0.0004864883681763032,
"loss": 0.4033,
"num_input_tokens_seen": 1701840,
"step": 5980
},
{
"epoch": 11.166044776119403,
"grad_norm": 0.31125694513320923,
"learning_rate": 0.0004856746309089582,
"loss": 0.3183,
"num_input_tokens_seen": 1703216,
"step": 5985
},
{
"epoch": 11.175373134328359,
"grad_norm": 0.5361759662628174,
"learning_rate": 0.0004848609316142618,
"loss": 0.4384,
"num_input_tokens_seen": 1704496,
"step": 5990
},
{
"epoch": 11.184701492537313,
"grad_norm": 0.3227105736732483,
"learning_rate": 0.00048404727244910883,
"loss": 0.3747,
"num_input_tokens_seen": 1705808,
"step": 5995
},
{
"epoch": 11.194029850746269,
"grad_norm": 0.5162398815155029,
"learning_rate": 0.000483233655570288,
"loss": 0.4446,
"num_input_tokens_seen": 1707280,
"step": 6000
},
{
"epoch": 11.203358208955224,
"grad_norm": 0.2446233034133911,
"learning_rate": 0.000482420083134476,
"loss": 0.357,
"num_input_tokens_seen": 1708848,
"step": 6005
},
{
"epoch": 11.212686567164178,
"grad_norm": 0.5056248307228088,
"learning_rate": 0.0004816065572982313,
"loss": 0.4111,
"num_input_tokens_seen": 1710288,
"step": 6010
},
{
"epoch": 11.222014925373134,
"grad_norm": 0.5253577828407288,
"learning_rate": 0.0004807930802179894,
"loss": 0.4007,
"num_input_tokens_seen": 1711600,
"step": 6015
},
{
"epoch": 11.23134328358209,
"grad_norm": 0.4653439521789551,
"learning_rate": 0.0004799796540500561,
"loss": 0.356,
"num_input_tokens_seen": 1712912,
"step": 6020
},
{
"epoch": 11.240671641791044,
"grad_norm": 0.3523794412612915,
"learning_rate": 0.0004791662809506025,
"loss": 0.2683,
"num_input_tokens_seen": 1714256,
"step": 6025
},
{
"epoch": 11.25,
"grad_norm": 0.5544191002845764,
"learning_rate": 0.00047835296307565913,
"loss": 0.7138,
"num_input_tokens_seen": 1715664,
"step": 6030
},
{
"epoch": 11.259328358208956,
"grad_norm": 0.42940109968185425,
"learning_rate": 0.0004775397025811097,
"loss": 0.5004,
"num_input_tokens_seen": 1717040,
"step": 6035
},
{
"epoch": 11.26865671641791,
"grad_norm": 0.48940351605415344,
"learning_rate": 0.0004767265016226863,
"loss": 0.6169,
"num_input_tokens_seen": 1718480,
"step": 6040
},
{
"epoch": 11.277985074626866,
"grad_norm": 0.4342253804206848,
"learning_rate": 0.0004759133623559628,
"loss": 0.4248,
"num_input_tokens_seen": 1719728,
"step": 6045
},
{
"epoch": 11.287313432835822,
"grad_norm": 0.30282506346702576,
"learning_rate": 0.00047510028693634995,
"loss": 0.4106,
"num_input_tokens_seen": 1721264,
"step": 6050
},
{
"epoch": 11.296641791044776,
"grad_norm": 0.3388311564922333,
"learning_rate": 0.0004742872775190889,
"loss": 0.552,
"num_input_tokens_seen": 1722896,
"step": 6055
},
{
"epoch": 11.305970149253731,
"grad_norm": 0.4148770868778229,
"learning_rate": 0.000473474336259246,
"loss": 0.306,
"num_input_tokens_seen": 1724240,
"step": 6060
},
{
"epoch": 11.315298507462687,
"grad_norm": 0.19478370249271393,
"learning_rate": 0.0004726614653117071,
"loss": 0.3752,
"num_input_tokens_seen": 1725680,
"step": 6065
},
{
"epoch": 11.324626865671641,
"grad_norm": 0.4652278423309326,
"learning_rate": 0.00047184866683117125,
"loss": 0.4032,
"num_input_tokens_seen": 1727056,
"step": 6070
},
{
"epoch": 11.333955223880597,
"grad_norm": 0.4905243217945099,
"learning_rate": 0.00047103594297214597,
"loss": 0.4775,
"num_input_tokens_seen": 1728496,
"step": 6075
},
{
"epoch": 11.343283582089553,
"grad_norm": 0.38213881850242615,
"learning_rate": 0.00047022329588894033,
"loss": 0.3003,
"num_input_tokens_seen": 1729936,
"step": 6080
},
{
"epoch": 11.352611940298507,
"grad_norm": 0.45340460538864136,
"learning_rate": 0.0004694107277356604,
"loss": 0.2487,
"num_input_tokens_seen": 1731536,
"step": 6085
},
{
"epoch": 11.361940298507463,
"grad_norm": 0.385991632938385,
"learning_rate": 0.00046859824066620287,
"loss": 0.3439,
"num_input_tokens_seen": 1733072,
"step": 6090
},
{
"epoch": 11.371268656716419,
"grad_norm": 0.36053866147994995,
"learning_rate": 0.00046778583683424943,
"loss": 0.3811,
"num_input_tokens_seen": 1734640,
"step": 6095
},
{
"epoch": 11.380597014925373,
"grad_norm": 0.4456652104854584,
"learning_rate": 0.0004669735183932613,
"loss": 0.3588,
"num_input_tokens_seen": 1735984,
"step": 6100
},
{
"epoch": 11.389925373134329,
"grad_norm": 0.21310646831989288,
"learning_rate": 0.00046616128749647296,
"loss": 0.2416,
"num_input_tokens_seen": 1737488,
"step": 6105
},
{
"epoch": 11.399253731343283,
"grad_norm": 0.36799484491348267,
"learning_rate": 0.00046534914629688747,
"loss": 0.5418,
"num_input_tokens_seen": 1738960,
"step": 6110
},
{
"epoch": 11.408582089552239,
"grad_norm": 0.30316370725631714,
"learning_rate": 0.00046453709694726944,
"loss": 0.3266,
"num_input_tokens_seen": 1740464,
"step": 6115
},
{
"epoch": 11.417910447761194,
"grad_norm": 0.3485303521156311,
"learning_rate": 0.00046372514160014037,
"loss": 0.3988,
"num_input_tokens_seen": 1741904,
"step": 6120
},
{
"epoch": 11.427238805970148,
"grad_norm": 0.41022706031799316,
"learning_rate": 0.00046291328240777297,
"loss": 0.4461,
"num_input_tokens_seen": 1743216,
"step": 6125
},
{
"epoch": 11.436567164179104,
"grad_norm": 0.4416004419326782,
"learning_rate": 0.00046210152152218397,
"loss": 0.2416,
"num_input_tokens_seen": 1744816,
"step": 6130
},
{
"epoch": 11.44589552238806,
"grad_norm": 0.541028618812561,
"learning_rate": 0.000461289861095131,
"loss": 0.4478,
"num_input_tokens_seen": 1746064,
"step": 6135
},
{
"epoch": 11.455223880597014,
"grad_norm": 0.2696289122104645,
"learning_rate": 0.0004604783032781039,
"loss": 0.3888,
"num_input_tokens_seen": 1747728,
"step": 6140
},
{
"epoch": 11.46455223880597,
"grad_norm": 0.2709604799747467,
"learning_rate": 0.00045966685022232143,
"loss": 0.4124,
"num_input_tokens_seen": 1749104,
"step": 6145
},
{
"epoch": 11.473880597014926,
"grad_norm": 0.3468097448348999,
"learning_rate": 0.00045885550407872476,
"loss": 0.3091,
"num_input_tokens_seen": 1750416,
"step": 6150
},
{
"epoch": 11.48320895522388,
"grad_norm": 0.38475626707077026,
"learning_rate": 0.0004580442669979708,
"loss": 0.4098,
"num_input_tokens_seen": 1751952,
"step": 6155
},
{
"epoch": 11.492537313432836,
"grad_norm": 0.4953322112560272,
"learning_rate": 0.00045723314113042856,
"loss": 0.4645,
"num_input_tokens_seen": 1753328,
"step": 6160
},
{
"epoch": 11.501865671641792,
"grad_norm": 0.6726153492927551,
"learning_rate": 0.00045642212862617086,
"loss": 0.5969,
"num_input_tokens_seen": 1754672,
"step": 6165
},
{
"epoch": 11.511194029850746,
"grad_norm": 0.33583176136016846,
"learning_rate": 0.0004556112316349716,
"loss": 0.4025,
"num_input_tokens_seen": 1756080,
"step": 6170
},
{
"epoch": 11.520522388059701,
"grad_norm": 0.29343482851982117,
"learning_rate": 0.0004548004523062968,
"loss": 0.2802,
"num_input_tokens_seen": 1757456,
"step": 6175
},
{
"epoch": 11.529850746268657,
"grad_norm": 0.2839649021625519,
"learning_rate": 0.000453989792789302,
"loss": 0.4035,
"num_input_tokens_seen": 1758960,
"step": 6180
},
{
"epoch": 11.539179104477611,
"grad_norm": 0.4980579614639282,
"learning_rate": 0.0004531792552328247,
"loss": 0.3924,
"num_input_tokens_seen": 1760368,
"step": 6185
},
{
"epoch": 11.548507462686567,
"grad_norm": 0.43809887766838074,
"learning_rate": 0.0004523688417853785,
"loss": 0.3805,
"num_input_tokens_seen": 1761744,
"step": 6190
},
{
"epoch": 11.557835820895523,
"grad_norm": 0.16008871793746948,
"learning_rate": 0.00045155855459514917,
"loss": 0.2367,
"num_input_tokens_seen": 1763472,
"step": 6195
},
{
"epoch": 11.567164179104477,
"grad_norm": 0.406055748462677,
"learning_rate": 0.00045074839580998646,
"loss": 0.5474,
"num_input_tokens_seen": 1764848,
"step": 6200
},
{
"epoch": 11.576492537313433,
"grad_norm": 0.5940313935279846,
"learning_rate": 0.00044993836757740096,
"loss": 0.4387,
"num_input_tokens_seen": 1766160,
"step": 6205
},
{
"epoch": 11.585820895522389,
"grad_norm": 0.5240064859390259,
"learning_rate": 0.0004491284720445567,
"loss": 0.2951,
"num_input_tokens_seen": 1767664,
"step": 6210
},
{
"epoch": 11.595149253731343,
"grad_norm": 0.4598220884799957,
"learning_rate": 0.00044831871135826576,
"loss": 0.4724,
"num_input_tokens_seen": 1769040,
"step": 6215
},
{
"epoch": 11.604477611940299,
"grad_norm": 0.40932193398475647,
"learning_rate": 0.0004475090876649831,
"loss": 0.3878,
"num_input_tokens_seen": 1770480,
"step": 6220
},
{
"epoch": 11.613805970149254,
"grad_norm": 0.2749176323413849,
"learning_rate": 0.0004466996031108004,
"loss": 0.2885,
"num_input_tokens_seen": 1771952,
"step": 6225
},
{
"epoch": 11.623134328358208,
"grad_norm": 0.6056463718414307,
"learning_rate": 0.00044589025984144063,
"loss": 0.2995,
"num_input_tokens_seen": 1773392,
"step": 6230
},
{
"epoch": 11.632462686567164,
"grad_norm": 0.3814938962459564,
"learning_rate": 0.0004450810600022519,
"loss": 0.4098,
"num_input_tokens_seen": 1774800,
"step": 6235
},
{
"epoch": 11.64179104477612,
"grad_norm": 0.41231343150138855,
"learning_rate": 0.0004442720057382027,
"loss": 0.5592,
"num_input_tokens_seen": 1776176,
"step": 6240
},
{
"epoch": 11.651119402985074,
"grad_norm": 0.25101757049560547,
"learning_rate": 0.0004434630991938754,
"loss": 0.2176,
"num_input_tokens_seen": 1778000,
"step": 6245
},
{
"epoch": 11.66044776119403,
"grad_norm": 0.4763393998146057,
"learning_rate": 0.0004426543425134604,
"loss": 0.2743,
"num_input_tokens_seen": 1779536,
"step": 6250
},
{
"epoch": 11.669776119402986,
"grad_norm": 0.3026311695575714,
"learning_rate": 0.0004418457378407516,
"loss": 0.3232,
"num_input_tokens_seen": 1780912,
"step": 6255
},
{
"epoch": 11.67910447761194,
"grad_norm": 0.5905582904815674,
"learning_rate": 0.00044103728731913916,
"loss": 0.3738,
"num_input_tokens_seen": 1782256,
"step": 6260
},
{
"epoch": 11.688432835820896,
"grad_norm": 0.2815220355987549,
"learning_rate": 0.0004402289930916053,
"loss": 0.3236,
"num_input_tokens_seen": 1783696,
"step": 6265
},
{
"epoch": 11.697761194029852,
"grad_norm": 0.5030792355537415,
"learning_rate": 0.0004394208573007177,
"loss": 0.5156,
"num_input_tokens_seen": 1785040,
"step": 6270
},
{
"epoch": 11.707089552238806,
"grad_norm": 0.4295044541358948,
"learning_rate": 0.00043861288208862394,
"loss": 0.3153,
"num_input_tokens_seen": 1786544,
"step": 6275
},
{
"epoch": 11.716417910447761,
"grad_norm": 0.49271804094314575,
"learning_rate": 0.00043780506959704616,
"loss": 0.4076,
"num_input_tokens_seen": 1788080,
"step": 6280
},
{
"epoch": 11.725746268656717,
"grad_norm": 0.3802368640899658,
"learning_rate": 0.0004369974219672748,
"loss": 0.4979,
"num_input_tokens_seen": 1789424,
"step": 6285
},
{
"epoch": 11.735074626865671,
"grad_norm": 0.4424251317977905,
"learning_rate": 0.000436189941340164,
"loss": 0.4163,
"num_input_tokens_seen": 1790992,
"step": 6290
},
{
"epoch": 11.744402985074627,
"grad_norm": 0.28739726543426514,
"learning_rate": 0.00043538262985612445,
"loss": 0.5469,
"num_input_tokens_seen": 1792464,
"step": 6295
},
{
"epoch": 11.753731343283581,
"grad_norm": 0.40592968463897705,
"learning_rate": 0.00043457548965511884,
"loss": 0.4155,
"num_input_tokens_seen": 1793872,
"step": 6300
},
{
"epoch": 11.763059701492537,
"grad_norm": 0.4930996000766754,
"learning_rate": 0.0004337685228766561,
"loss": 0.3816,
"num_input_tokens_seen": 1795344,
"step": 6305
},
{
"epoch": 11.772388059701493,
"grad_norm": 0.34781700372695923,
"learning_rate": 0.0004329617316597849,
"loss": 0.2512,
"num_input_tokens_seen": 1796848,
"step": 6310
},
{
"epoch": 11.781716417910447,
"grad_norm": 0.5414386987686157,
"learning_rate": 0.000432155118143089,
"loss": 0.4204,
"num_input_tokens_seen": 1798192,
"step": 6315
},
{
"epoch": 11.791044776119403,
"grad_norm": 0.41075921058654785,
"learning_rate": 0.0004313486844646808,
"loss": 0.5099,
"num_input_tokens_seen": 1799664,
"step": 6320
},
{
"epoch": 11.800373134328359,
"grad_norm": 0.40663543343544006,
"learning_rate": 0.0004305424327621962,
"loss": 0.4131,
"num_input_tokens_seen": 1801072,
"step": 6325
},
{
"epoch": 11.809701492537313,
"grad_norm": 0.47907477617263794,
"learning_rate": 0.00042973636517278893,
"loss": 0.4387,
"num_input_tokens_seen": 1802640,
"step": 6330
},
{
"epoch": 11.819029850746269,
"grad_norm": 0.510122537612915,
"learning_rate": 0.0004289304838331241,
"loss": 0.4236,
"num_input_tokens_seen": 1804048,
"step": 6335
},
{
"epoch": 11.828358208955224,
"grad_norm": 0.42072373628616333,
"learning_rate": 0.0004281247908793737,
"loss": 0.2858,
"num_input_tokens_seen": 1805296,
"step": 6340
},
{
"epoch": 11.837686567164178,
"grad_norm": 0.509661078453064,
"learning_rate": 0.0004273192884472099,
"loss": 0.5232,
"num_input_tokens_seen": 1806640,
"step": 6345
},
{
"epoch": 11.847014925373134,
"grad_norm": 0.24327515065670013,
"learning_rate": 0.0004265139786718004,
"loss": 0.2774,
"num_input_tokens_seen": 1808112,
"step": 6350
},
{
"epoch": 11.85634328358209,
"grad_norm": 0.38800907135009766,
"learning_rate": 0.0004257088636878015,
"loss": 0.2649,
"num_input_tokens_seen": 1809360,
"step": 6355
},
{
"epoch": 11.865671641791044,
"grad_norm": 0.3098767399787903,
"learning_rate": 0.0004249039456293537,
"loss": 0.3889,
"num_input_tokens_seen": 1810832,
"step": 6360
},
{
"epoch": 11.875,
"grad_norm": 0.4385940730571747,
"learning_rate": 0.0004240992266300757,
"loss": 0.3961,
"num_input_tokens_seen": 1812240,
"step": 6365
},
{
"epoch": 11.884328358208956,
"grad_norm": 0.30191636085510254,
"learning_rate": 0.00042329470882305765,
"loss": 0.2755,
"num_input_tokens_seen": 1813584,
"step": 6370
},
{
"epoch": 11.89365671641791,
"grad_norm": 0.4465119540691376,
"learning_rate": 0.00042249039434085747,
"loss": 0.7074,
"num_input_tokens_seen": 1815024,
"step": 6375
},
{
"epoch": 11.902985074626866,
"grad_norm": 0.7072200179100037,
"learning_rate": 0.0004216862853154932,
"loss": 0.52,
"num_input_tokens_seen": 1816400,
"step": 6380
},
{
"epoch": 11.912313432835822,
"grad_norm": 0.360170841217041,
"learning_rate": 0.0004208823838784386,
"loss": 0.4577,
"num_input_tokens_seen": 1817840,
"step": 6385
},
{
"epoch": 11.921641791044776,
"grad_norm": 0.37331485748291016,
"learning_rate": 0.0004200786921606179,
"loss": 0.4316,
"num_input_tokens_seen": 1819152,
"step": 6390
},
{
"epoch": 11.930970149253731,
"grad_norm": 0.5474173426628113,
"learning_rate": 0.00041927521229239795,
"loss": 0.5153,
"num_input_tokens_seen": 1820592,
"step": 6395
},
{
"epoch": 11.940298507462687,
"grad_norm": 0.38393107056617737,
"learning_rate": 0.0004184719464035856,
"loss": 0.3978,
"num_input_tokens_seen": 1821872,
"step": 6400
},
{
"epoch": 11.949626865671641,
"grad_norm": 0.17455793917179108,
"learning_rate": 0.00041766889662341907,
"loss": 0.3079,
"num_input_tokens_seen": 1823152,
"step": 6405
},
{
"epoch": 11.958955223880597,
"grad_norm": 0.6789186596870422,
"learning_rate": 0.000416866065080565,
"loss": 0.427,
"num_input_tokens_seen": 1824464,
"step": 6410
},
{
"epoch": 11.968283582089553,
"grad_norm": 0.44379931688308716,
"learning_rate": 0.0004160634539031105,
"loss": 0.3356,
"num_input_tokens_seen": 1825872,
"step": 6415
},
{
"epoch": 11.977611940298507,
"grad_norm": 0.37370961904525757,
"learning_rate": 0.0004152610652185592,
"loss": 0.463,
"num_input_tokens_seen": 1827280,
"step": 6420
},
{
"epoch": 11.986940298507463,
"grad_norm": 0.7289624810218811,
"learning_rate": 0.000414458901153825,
"loss": 0.4434,
"num_input_tokens_seen": 1828624,
"step": 6425
},
{
"epoch": 11.996268656716419,
"grad_norm": 0.41602692008018494,
"learning_rate": 0.00041365696383522586,
"loss": 0.3809,
"num_input_tokens_seen": 1829872,
"step": 6430
},
{
"epoch": 12.0,
"eval_loss": 0.7218378782272339,
"eval_runtime": 4.2039,
"eval_samples_per_second": 56.614,
"eval_steps_per_second": 14.272,
"num_input_tokens_seen": 1830200,
"step": 6432
},
{
"epoch": 12.005597014925373,
"grad_norm": 0.3600406348705292,
"learning_rate": 0.00041285525538847936,
"loss": 0.3121,
"num_input_tokens_seen": 1831032,
"step": 6435
},
{
"epoch": 12.014925373134329,
"grad_norm": 0.19030925631523132,
"learning_rate": 0.0004120537779386954,
"loss": 0.3586,
"num_input_tokens_seen": 1832472,
"step": 6440
},
{
"epoch": 12.024253731343284,
"grad_norm": 0.4201837182044983,
"learning_rate": 0.00041125253361037277,
"loss": 0.5458,
"num_input_tokens_seen": 1833752,
"step": 6445
},
{
"epoch": 12.033582089552239,
"grad_norm": 0.2953159809112549,
"learning_rate": 0.00041045152452739183,
"loss": 0.2556,
"num_input_tokens_seen": 1835192,
"step": 6450
},
{
"epoch": 12.042910447761194,
"grad_norm": 0.4765187203884125,
"learning_rate": 0.00040965075281300893,
"loss": 0.2821,
"num_input_tokens_seen": 1836632,
"step": 6455
},
{
"epoch": 12.052238805970148,
"grad_norm": 0.44841447472572327,
"learning_rate": 0.00040885022058985193,
"loss": 0.3791,
"num_input_tokens_seen": 1837976,
"step": 6460
},
{
"epoch": 12.061567164179104,
"grad_norm": 0.6252573728561401,
"learning_rate": 0.0004080499299799133,
"loss": 0.4758,
"num_input_tokens_seen": 1839384,
"step": 6465
},
{
"epoch": 12.07089552238806,
"grad_norm": 0.3894113004207611,
"learning_rate": 0.0004072498831045455,
"loss": 0.2906,
"num_input_tokens_seen": 1840696,
"step": 6470
},
{
"epoch": 12.080223880597014,
"grad_norm": 0.4687945544719696,
"learning_rate": 0.00040645008208445445,
"loss": 0.2959,
"num_input_tokens_seen": 1842328,
"step": 6475
},
{
"epoch": 12.08955223880597,
"grad_norm": 0.2961972653865814,
"learning_rate": 0.0004056505290396948,
"loss": 0.2436,
"num_input_tokens_seen": 1843992,
"step": 6480
},
{
"epoch": 12.098880597014926,
"grad_norm": 0.4359433948993683,
"learning_rate": 0.00040485122608966377,
"loss": 0.2145,
"num_input_tokens_seen": 1845464,
"step": 6485
},
{
"epoch": 12.10820895522388,
"grad_norm": 0.38550934195518494,
"learning_rate": 0.00040405217535309545,
"loss": 0.2177,
"num_input_tokens_seen": 1847000,
"step": 6490
},
{
"epoch": 12.117537313432836,
"grad_norm": 0.4485433101654053,
"learning_rate": 0.0004032533789480557,
"loss": 0.5714,
"num_input_tokens_seen": 1848344,
"step": 6495
},
{
"epoch": 12.126865671641792,
"grad_norm": 0.33935850858688354,
"learning_rate": 0.00040245483899193594,
"loss": 0.2848,
"num_input_tokens_seen": 1849784,
"step": 6500
},
{
"epoch": 12.136194029850746,
"grad_norm": 0.38977697491645813,
"learning_rate": 0.00040165655760144783,
"loss": 0.4378,
"num_input_tokens_seen": 1851256,
"step": 6505
},
{
"epoch": 12.145522388059701,
"grad_norm": 0.5800938606262207,
"learning_rate": 0.0004008585368926179,
"loss": 0.2898,
"num_input_tokens_seen": 1852760,
"step": 6510
},
{
"epoch": 12.154850746268657,
"grad_norm": 0.430899977684021,
"learning_rate": 0.0004000607789807814,
"loss": 0.5703,
"num_input_tokens_seen": 1854264,
"step": 6515
},
{
"epoch": 12.164179104477611,
"grad_norm": 0.45637640357017517,
"learning_rate": 0.0003992632859805773,
"loss": 0.3081,
"num_input_tokens_seen": 1855736,
"step": 6520
},
{
"epoch": 12.173507462686567,
"grad_norm": 0.4487372040748596,
"learning_rate": 0.0003984660600059418,
"loss": 0.3069,
"num_input_tokens_seen": 1857112,
"step": 6525
},
{
"epoch": 12.182835820895523,
"grad_norm": 0.24957291781902313,
"learning_rate": 0.00039766910317010377,
"loss": 0.4203,
"num_input_tokens_seen": 1858712,
"step": 6530
},
{
"epoch": 12.192164179104477,
"grad_norm": 0.44331094622612,
"learning_rate": 0.0003968724175855788,
"loss": 0.2864,
"num_input_tokens_seen": 1860280,
"step": 6535
},
{
"epoch": 12.201492537313433,
"grad_norm": 0.37186896800994873,
"learning_rate": 0.00039607600536416287,
"loss": 0.2347,
"num_input_tokens_seen": 1861816,
"step": 6540
},
{
"epoch": 12.210820895522389,
"grad_norm": 0.5465824604034424,
"learning_rate": 0.0003952798686169279,
"loss": 0.2855,
"num_input_tokens_seen": 1863160,
"step": 6545
},
{
"epoch": 12.220149253731343,
"grad_norm": 0.422227144241333,
"learning_rate": 0.0003944840094542152,
"loss": 0.3494,
"num_input_tokens_seen": 1864504,
"step": 6550
},
{
"epoch": 12.229477611940299,
"grad_norm": 0.31042227149009705,
"learning_rate": 0.00039368842998563065,
"loss": 0.2654,
"num_input_tokens_seen": 1866104,
"step": 6555
},
{
"epoch": 12.238805970149254,
"grad_norm": 0.6005743741989136,
"learning_rate": 0.0003928931323200384,
"loss": 0.3776,
"num_input_tokens_seen": 1867480,
"step": 6560
},
{
"epoch": 12.248134328358208,
"grad_norm": 0.39204689860343933,
"learning_rate": 0.00039209811856555566,
"loss": 0.332,
"num_input_tokens_seen": 1869144,
"step": 6565
},
{
"epoch": 12.257462686567164,
"grad_norm": 0.26447024941444397,
"learning_rate": 0.0003913033908295477,
"loss": 0.326,
"num_input_tokens_seen": 1870552,
"step": 6570
},
{
"epoch": 12.26679104477612,
"grad_norm": 0.4324093759059906,
"learning_rate": 0.00039050895121862055,
"loss": 0.2876,
"num_input_tokens_seen": 1872216,
"step": 6575
},
{
"epoch": 12.276119402985074,
"grad_norm": 0.5774677395820618,
"learning_rate": 0.0003897148018386174,
"loss": 0.345,
"num_input_tokens_seen": 1873432,
"step": 6580
},
{
"epoch": 12.28544776119403,
"grad_norm": 0.3903844356536865,
"learning_rate": 0.0003889209447946116,
"loss": 0.5506,
"num_input_tokens_seen": 1874776,
"step": 6585
},
{
"epoch": 12.294776119402986,
"grad_norm": 0.39230290055274963,
"learning_rate": 0.0003881273821909016,
"loss": 0.3959,
"num_input_tokens_seen": 1876216,
"step": 6590
},
{
"epoch": 12.30410447761194,
"grad_norm": 0.49180158972740173,
"learning_rate": 0.00038733411613100615,
"loss": 0.3796,
"num_input_tokens_seen": 1877528,
"step": 6595
},
{
"epoch": 12.313432835820896,
"grad_norm": 0.5546965003013611,
"learning_rate": 0.0003865411487176567,
"loss": 0.3909,
"num_input_tokens_seen": 1879064,
"step": 6600
},
{
"epoch": 12.322761194029852,
"grad_norm": 0.4393283724784851,
"learning_rate": 0.00038574848205279416,
"loss": 0.3867,
"num_input_tokens_seen": 1880504,
"step": 6605
},
{
"epoch": 12.332089552238806,
"grad_norm": 0.4588424563407898,
"learning_rate": 0.0003849561182375613,
"loss": 0.3882,
"num_input_tokens_seen": 1881944,
"step": 6610
},
{
"epoch": 12.341417910447761,
"grad_norm": 0.30848169326782227,
"learning_rate": 0.0003841640593722992,
"loss": 0.3721,
"num_input_tokens_seen": 1883352,
"step": 6615
},
{
"epoch": 12.350746268656717,
"grad_norm": 0.5776231288909912,
"learning_rate": 0.0003833723075565394,
"loss": 0.4433,
"num_input_tokens_seen": 1884888,
"step": 6620
},
{
"epoch": 12.360074626865671,
"grad_norm": 0.45106711983680725,
"learning_rate": 0.0003825808648890005,
"loss": 0.4721,
"num_input_tokens_seen": 1886456,
"step": 6625
},
{
"epoch": 12.369402985074627,
"grad_norm": 0.4222325086593628,
"learning_rate": 0.00038178973346758143,
"loss": 0.5654,
"num_input_tokens_seen": 1887832,
"step": 6630
},
{
"epoch": 12.378731343283581,
"grad_norm": 0.3949355185031891,
"learning_rate": 0.00038099891538935537,
"loss": 0.4811,
"num_input_tokens_seen": 1889144,
"step": 6635
},
{
"epoch": 12.388059701492537,
"grad_norm": 0.48827362060546875,
"learning_rate": 0.0003802084127505662,
"loss": 0.3642,
"num_input_tokens_seen": 1890616,
"step": 6640
},
{
"epoch": 12.397388059701493,
"grad_norm": 0.41914358735084534,
"learning_rate": 0.0003794182276466201,
"loss": 0.3448,
"num_input_tokens_seen": 1892248,
"step": 6645
},
{
"epoch": 12.406716417910447,
"grad_norm": 0.41748374700546265,
"learning_rate": 0.00037862836217208295,
"loss": 0.3126,
"num_input_tokens_seen": 1893688,
"step": 6650
},
{
"epoch": 12.416044776119403,
"grad_norm": 0.3538048565387726,
"learning_rate": 0.0003778388184206728,
"loss": 0.3595,
"num_input_tokens_seen": 1895096,
"step": 6655
},
{
"epoch": 12.425373134328359,
"grad_norm": 0.6328346729278564,
"learning_rate": 0.00037704959848525464,
"loss": 0.3776,
"num_input_tokens_seen": 1896312,
"step": 6660
},
{
"epoch": 12.434701492537313,
"grad_norm": 0.5902320146560669,
"learning_rate": 0.00037626070445783566,
"loss": 0.4868,
"num_input_tokens_seen": 1897496,
"step": 6665
},
{
"epoch": 12.444029850746269,
"grad_norm": 0.4881054162979126,
"learning_rate": 0.0003754721384295587,
"loss": 0.3676,
"num_input_tokens_seen": 1898968,
"step": 6670
},
{
"epoch": 12.453358208955224,
"grad_norm": 0.3948395848274231,
"learning_rate": 0.0003746839024906974,
"loss": 0.4843,
"num_input_tokens_seen": 1900248,
"step": 6675
},
{
"epoch": 12.462686567164178,
"grad_norm": 0.41477930545806885,
"learning_rate": 0.00037389599873065033,
"loss": 0.3943,
"num_input_tokens_seen": 1901592,
"step": 6680
},
{
"epoch": 12.472014925373134,
"grad_norm": 0.4800880253314972,
"learning_rate": 0.0003731084292379356,
"loss": 0.4224,
"num_input_tokens_seen": 1902904,
"step": 6685
},
{
"epoch": 12.48134328358209,
"grad_norm": 0.5596414804458618,
"learning_rate": 0.00037232119610018535,
"loss": 0.3543,
"num_input_tokens_seen": 1904248,
"step": 6690
},
{
"epoch": 12.490671641791044,
"grad_norm": 0.4646502435207367,
"learning_rate": 0.00037153430140413984,
"loss": 0.3899,
"num_input_tokens_seen": 1905688,
"step": 6695
},
{
"epoch": 12.5,
"grad_norm": 0.7304385304450989,
"learning_rate": 0.00037074774723564266,
"loss": 0.5413,
"num_input_tokens_seen": 1906840,
"step": 6700
},
{
"epoch": 12.509328358208956,
"grad_norm": 0.4171794056892395,
"learning_rate": 0.0003699615356796342,
"loss": 0.2608,
"num_input_tokens_seen": 1908184,
"step": 6705
},
{
"epoch": 12.51865671641791,
"grad_norm": 0.4009363055229187,
"learning_rate": 0.0003691756688201471,
"loss": 0.3174,
"num_input_tokens_seen": 1909528,
"step": 6710
},
{
"epoch": 12.527985074626866,
"grad_norm": 0.5148348808288574,
"learning_rate": 0.0003683901487403004,
"loss": 0.3957,
"num_input_tokens_seen": 1910968,
"step": 6715
},
{
"epoch": 12.537313432835822,
"grad_norm": 0.3038680851459503,
"learning_rate": 0.00036760497752229336,
"loss": 0.2928,
"num_input_tokens_seen": 1912472,
"step": 6720
},
{
"epoch": 12.546641791044776,
"grad_norm": 0.5594529509544373,
"learning_rate": 0.00036682015724740115,
"loss": 0.3539,
"num_input_tokens_seen": 1913912,
"step": 6725
},
{
"epoch": 12.555970149253731,
"grad_norm": 0.35502153635025024,
"learning_rate": 0.00036603568999596815,
"loss": 0.3463,
"num_input_tokens_seen": 1915512,
"step": 6730
},
{
"epoch": 12.565298507462687,
"grad_norm": 0.547687828540802,
"learning_rate": 0.00036525157784740337,
"loss": 0.4171,
"num_input_tokens_seen": 1916856,
"step": 6735
},
{
"epoch": 12.574626865671641,
"grad_norm": 0.3124009072780609,
"learning_rate": 0.0003644678228801742,
"loss": 0.2618,
"num_input_tokens_seen": 1918360,
"step": 6740
},
{
"epoch": 12.583955223880597,
"grad_norm": 0.5247648358345032,
"learning_rate": 0.00036368442717180154,
"loss": 0.3337,
"num_input_tokens_seen": 1919864,
"step": 6745
},
{
"epoch": 12.593283582089553,
"grad_norm": 0.41606518626213074,
"learning_rate": 0.00036290139279885394,
"loss": 0.3932,
"num_input_tokens_seen": 1921176,
"step": 6750
},
{
"epoch": 12.602611940298507,
"grad_norm": 0.4646458327770233,
"learning_rate": 0.0003621187218369418,
"loss": 0.5042,
"num_input_tokens_seen": 1922424,
"step": 6755
},
{
"epoch": 12.611940298507463,
"grad_norm": 0.322963148355484,
"learning_rate": 0.0003613364163607128,
"loss": 0.388,
"num_input_tokens_seen": 1923736,
"step": 6760
},
{
"epoch": 12.621268656716419,
"grad_norm": 0.4273810088634491,
"learning_rate": 0.00036055447844384527,
"loss": 0.3713,
"num_input_tokens_seen": 1925112,
"step": 6765
},
{
"epoch": 12.630597014925373,
"grad_norm": 0.4464828073978424,
"learning_rate": 0.0003597729101590436,
"loss": 0.3738,
"num_input_tokens_seen": 1926680,
"step": 6770
},
{
"epoch": 12.639925373134329,
"grad_norm": 0.5656095743179321,
"learning_rate": 0.0003589917135780323,
"loss": 0.2075,
"num_input_tokens_seen": 1928024,
"step": 6775
},
{
"epoch": 12.649253731343283,
"grad_norm": 0.5868489146232605,
"learning_rate": 0.00035821089077155046,
"loss": 0.2905,
"num_input_tokens_seen": 1929592,
"step": 6780
},
{
"epoch": 12.658582089552239,
"grad_norm": 0.5153266191482544,
"learning_rate": 0.00035743044380934653,
"loss": 0.4824,
"num_input_tokens_seen": 1930840,
"step": 6785
},
{
"epoch": 12.667910447761194,
"grad_norm": 0.632239818572998,
"learning_rate": 0.00035665037476017257,
"loss": 0.3472,
"num_input_tokens_seen": 1932312,
"step": 6790
},
{
"epoch": 12.677238805970148,
"grad_norm": 0.4063834547996521,
"learning_rate": 0.00035587068569177923,
"loss": 0.4525,
"num_input_tokens_seen": 1933784,
"step": 6795
},
{
"epoch": 12.686567164179104,
"grad_norm": 0.44790780544281006,
"learning_rate": 0.0003550913786709094,
"loss": 0.3497,
"num_input_tokens_seen": 1935224,
"step": 6800
},
{
"epoch": 12.69589552238806,
"grad_norm": 0.4837767481803894,
"learning_rate": 0.0003543124557632936,
"loss": 0.4892,
"num_input_tokens_seen": 1936504,
"step": 6805
},
{
"epoch": 12.705223880597014,
"grad_norm": 0.48883289098739624,
"learning_rate": 0.0003535339190336446,
"loss": 0.433,
"num_input_tokens_seen": 1937912,
"step": 6810
},
{
"epoch": 12.71455223880597,
"grad_norm": 0.3776569664478302,
"learning_rate": 0.00035275577054565046,
"loss": 0.3781,
"num_input_tokens_seen": 1939128,
"step": 6815
},
{
"epoch": 12.723880597014926,
"grad_norm": 0.4895813763141632,
"learning_rate": 0.0003519780123619709,
"loss": 0.3092,
"num_input_tokens_seen": 1940600,
"step": 6820
},
{
"epoch": 12.73320895522388,
"grad_norm": 0.3760162591934204,
"learning_rate": 0.0003512006465442309,
"loss": 0.4304,
"num_input_tokens_seen": 1942008,
"step": 6825
},
{
"epoch": 12.742537313432836,
"grad_norm": 0.31054630875587463,
"learning_rate": 0.0003504236751530152,
"loss": 0.2892,
"num_input_tokens_seen": 1943352,
"step": 6830
},
{
"epoch": 12.751865671641792,
"grad_norm": 0.36416080594062805,
"learning_rate": 0.0003496471002478635,
"loss": 0.2315,
"num_input_tokens_seen": 1944792,
"step": 6835
},
{
"epoch": 12.761194029850746,
"grad_norm": 0.22087052464485168,
"learning_rate": 0.0003488709238872637,
"loss": 0.194,
"num_input_tokens_seen": 1946392,
"step": 6840
},
{
"epoch": 12.770522388059701,
"grad_norm": 0.4641207754611969,
"learning_rate": 0.0003480951481286484,
"loss": 0.2972,
"num_input_tokens_seen": 1947896,
"step": 6845
},
{
"epoch": 12.779850746268657,
"grad_norm": 0.3787482678890228,
"learning_rate": 0.00034731977502838686,
"loss": 0.2635,
"num_input_tokens_seen": 1949304,
"step": 6850
},
{
"epoch": 12.789179104477611,
"grad_norm": 0.3138027787208557,
"learning_rate": 0.00034654480664178257,
"loss": 0.1856,
"num_input_tokens_seen": 1950744,
"step": 6855
},
{
"epoch": 12.798507462686567,
"grad_norm": 0.34298160672187805,
"learning_rate": 0.00034577024502306484,
"loss": 0.4447,
"num_input_tokens_seen": 1952184,
"step": 6860
},
{
"epoch": 12.807835820895523,
"grad_norm": 0.7594728469848633,
"learning_rate": 0.0003449960922253857,
"loss": 0.4111,
"num_input_tokens_seen": 1953400,
"step": 6865
},
{
"epoch": 12.817164179104477,
"grad_norm": 0.41920387744903564,
"learning_rate": 0.0003442223503008135,
"loss": 0.4234,
"num_input_tokens_seen": 1955032,
"step": 6870
},
{
"epoch": 12.826492537313433,
"grad_norm": 0.30781400203704834,
"learning_rate": 0.0003434490213003264,
"loss": 0.3907,
"num_input_tokens_seen": 1956600,
"step": 6875
},
{
"epoch": 12.835820895522389,
"grad_norm": 0.42326509952545166,
"learning_rate": 0.00034267610727380956,
"loss": 0.4245,
"num_input_tokens_seen": 1958392,
"step": 6880
},
{
"epoch": 12.845149253731343,
"grad_norm": 0.3201337456703186,
"learning_rate": 0.0003419036102700467,
"loss": 0.3024,
"num_input_tokens_seen": 1959864,
"step": 6885
},
{
"epoch": 12.854477611940299,
"grad_norm": 0.5356371998786926,
"learning_rate": 0.0003411315323367172,
"loss": 0.5257,
"num_input_tokens_seen": 1961240,
"step": 6890
},
{
"epoch": 12.863805970149254,
"grad_norm": 0.5199465751647949,
"learning_rate": 0.00034035987552038914,
"loss": 0.3615,
"num_input_tokens_seen": 1962584,
"step": 6895
},
{
"epoch": 12.873134328358208,
"grad_norm": 0.3426276445388794,
"learning_rate": 0.0003395886418665144,
"loss": 0.3051,
"num_input_tokens_seen": 1964056,
"step": 6900
},
{
"epoch": 12.882462686567164,
"grad_norm": 0.45455190539360046,
"learning_rate": 0.0003388178334194232,
"loss": 0.3607,
"num_input_tokens_seen": 1965560,
"step": 6905
},
{
"epoch": 12.89179104477612,
"grad_norm": 0.40248537063598633,
"learning_rate": 0.00033804745222231836,
"loss": 0.3731,
"num_input_tokens_seen": 1967000,
"step": 6910
},
{
"epoch": 12.901119402985074,
"grad_norm": 0.24892118573188782,
"learning_rate": 0.00033727750031727077,
"loss": 0.2977,
"num_input_tokens_seen": 1968376,
"step": 6915
},
{
"epoch": 12.91044776119403,
"grad_norm": 0.2123814970254898,
"learning_rate": 0.00033650797974521285,
"loss": 0.372,
"num_input_tokens_seen": 1969752,
"step": 6920
},
{
"epoch": 12.919776119402986,
"grad_norm": 0.6029353141784668,
"learning_rate": 0.00033573889254593384,
"loss": 0.6293,
"num_input_tokens_seen": 1971192,
"step": 6925
},
{
"epoch": 12.92910447761194,
"grad_norm": 0.32942214608192444,
"learning_rate": 0.0003349702407580745,
"loss": 0.4528,
"num_input_tokens_seen": 1972664,
"step": 6930
},
{
"epoch": 12.938432835820896,
"grad_norm": 0.40889158844947815,
"learning_rate": 0.0003342020264191208,
"loss": 0.4118,
"num_input_tokens_seen": 1974104,
"step": 6935
},
{
"epoch": 12.947761194029852,
"grad_norm": 0.38301944732666016,
"learning_rate": 0.0003334342515654,
"loss": 0.3288,
"num_input_tokens_seen": 1975512,
"step": 6940
},
{
"epoch": 12.957089552238806,
"grad_norm": 0.6120052337646484,
"learning_rate": 0.00033266691823207356,
"loss": 0.4245,
"num_input_tokens_seen": 1976760,
"step": 6945
},
{
"epoch": 12.966417910447761,
"grad_norm": 0.34194818139076233,
"learning_rate": 0.0003319000284531332,
"loss": 0.2904,
"num_input_tokens_seen": 1978072,
"step": 6950
},
{
"epoch": 12.975746268656717,
"grad_norm": 0.31473881006240845,
"learning_rate": 0.00033113358426139464,
"loss": 0.248,
"num_input_tokens_seen": 1979512,
"step": 6955
},
{
"epoch": 12.985074626865671,
"grad_norm": 0.6300244331359863,
"learning_rate": 0.0003303675876884923,
"loss": 0.4075,
"num_input_tokens_seen": 1980792,
"step": 6960
},
{
"epoch": 12.994402985074627,
"grad_norm": 0.463429719209671,
"learning_rate": 0.0003296020407648747,
"loss": 0.301,
"num_input_tokens_seen": 1982104,
"step": 6965
},
{
"epoch": 13.0,
"eval_loss": 0.763629674911499,
"eval_runtime": 4.2048,
"eval_samples_per_second": 56.602,
"eval_steps_per_second": 14.27,
"num_input_tokens_seen": 1982664,
"step": 6968
},
{
"epoch": 13.003731343283581,
"grad_norm": 0.36986637115478516,
"learning_rate": 0.00032883694551979765,
"loss": 0.4503,
"num_input_tokens_seen": 1983272,
"step": 6970
},
{
"epoch": 13.013059701492537,
"grad_norm": 0.5193461179733276,
"learning_rate": 0.00032807230398132037,
"loss": 0.3963,
"num_input_tokens_seen": 1984744,
"step": 6975
},
{
"epoch": 13.022388059701493,
"grad_norm": 0.4553769528865814,
"learning_rate": 0.0003273081181762989,
"loss": 0.2928,
"num_input_tokens_seen": 1986088,
"step": 6980
},
{
"epoch": 13.031716417910447,
"grad_norm": 0.30947038531303406,
"learning_rate": 0.0003265443901303816,
"loss": 0.2241,
"num_input_tokens_seen": 1987496,
"step": 6985
},
{
"epoch": 13.041044776119403,
"grad_norm": 0.4447937607765198,
"learning_rate": 0.0003257811218680035,
"loss": 0.3132,
"num_input_tokens_seen": 1988904,
"step": 6990
},
{
"epoch": 13.050373134328359,
"grad_norm": 0.7096430659294128,
"learning_rate": 0.00032501831541238046,
"loss": 0.5288,
"num_input_tokens_seen": 1990280,
"step": 6995
},
{
"epoch": 13.059701492537313,
"grad_norm": 0.4876936972141266,
"learning_rate": 0.0003242559727855047,
"loss": 0.3568,
"num_input_tokens_seen": 1991720,
"step": 7000
},
{
"epoch": 13.069029850746269,
"grad_norm": 0.5312484502792358,
"learning_rate": 0.0003234940960081384,
"loss": 0.4548,
"num_input_tokens_seen": 1993128,
"step": 7005
},
{
"epoch": 13.078358208955224,
"grad_norm": 0.4942055344581604,
"learning_rate": 0.00032273268709980934,
"loss": 0.2092,
"num_input_tokens_seen": 1994696,
"step": 7010
},
{
"epoch": 13.087686567164178,
"grad_norm": 0.5409295558929443,
"learning_rate": 0.0003219717480788052,
"loss": 0.4487,
"num_input_tokens_seen": 1996072,
"step": 7015
},
{
"epoch": 13.097014925373134,
"grad_norm": 0.49344658851623535,
"learning_rate": 0.0003212112809621676,
"loss": 0.365,
"num_input_tokens_seen": 1997448,
"step": 7020
},
{
"epoch": 13.10634328358209,
"grad_norm": 0.2911165654659271,
"learning_rate": 0.0003204512877656878,
"loss": 0.3687,
"num_input_tokens_seen": 1999016,
"step": 7025
},
{
"epoch": 13.115671641791044,
"grad_norm": 0.596234142780304,
"learning_rate": 0.0003196917705039004,
"loss": 0.3475,
"num_input_tokens_seen": 2000136,
"step": 7030
},
{
"epoch": 13.125,
"grad_norm": 0.4349397122859955,
"learning_rate": 0.0003189327311900788,
"loss": 0.333,
"num_input_tokens_seen": 2001576,
"step": 7035
},
{
"epoch": 13.134328358208956,
"grad_norm": 0.5499020218849182,
"learning_rate": 0.00031817417183622917,
"loss": 0.2503,
"num_input_tokens_seen": 2002920,
"step": 7040
},
{
"epoch": 13.14365671641791,
"grad_norm": 0.5799429416656494,
"learning_rate": 0.0003174160944530855,
"loss": 0.3933,
"num_input_tokens_seen": 2004424,
"step": 7045
},
{
"epoch": 13.152985074626866,
"grad_norm": 0.5225064754486084,
"learning_rate": 0.00031665850105010466,
"loss": 0.3231,
"num_input_tokens_seen": 2005864,
"step": 7050
},
{
"epoch": 13.162313432835822,
"grad_norm": 0.4829569160938263,
"learning_rate": 0.0003159013936354598,
"loss": 0.405,
"num_input_tokens_seen": 2007432,
"step": 7055
},
{
"epoch": 13.171641791044776,
"grad_norm": 0.47015833854675293,
"learning_rate": 0.00031514477421603677,
"loss": 0.3184,
"num_input_tokens_seen": 2008808,
"step": 7060
},
{
"epoch": 13.180970149253731,
"grad_norm": 0.5161189436912537,
"learning_rate": 0.0003143886447974269,
"loss": 0.3284,
"num_input_tokens_seen": 2010184,
"step": 7065
},
{
"epoch": 13.190298507462687,
"grad_norm": 0.5534188151359558,
"learning_rate": 0.0003136330073839233,
"loss": 0.2741,
"num_input_tokens_seen": 2011816,
"step": 7070
},
{
"epoch": 13.199626865671641,
"grad_norm": 0.37270939350128174,
"learning_rate": 0.00031287786397851523,
"loss": 0.3485,
"num_input_tokens_seen": 2013448,
"step": 7075
},
{
"epoch": 13.208955223880597,
"grad_norm": 0.35400187969207764,
"learning_rate": 0.0003121232165828813,
"loss": 0.3619,
"num_input_tokens_seen": 2014632,
"step": 7080
},
{
"epoch": 13.218283582089553,
"grad_norm": 0.7645230293273926,
"learning_rate": 0.0003113690671973867,
"loss": 0.335,
"num_input_tokens_seen": 2015944,
"step": 7085
},
{
"epoch": 13.227611940298507,
"grad_norm": 0.6307098865509033,
"learning_rate": 0.0003106154178210753,
"loss": 0.3954,
"num_input_tokens_seen": 2017288,
"step": 7090
},
{
"epoch": 13.236940298507463,
"grad_norm": 0.4696156680583954,
"learning_rate": 0.0003098622704516667,
"loss": 0.3492,
"num_input_tokens_seen": 2018728,
"step": 7095
},
{
"epoch": 13.246268656716419,
"grad_norm": 0.3370290994644165,
"learning_rate": 0.0003091096270855487,
"loss": 0.3904,
"num_input_tokens_seen": 2020136,
"step": 7100
},
{
"epoch": 13.255597014925373,
"grad_norm": 0.5213427543640137,
"learning_rate": 0.00030835748971777413,
"loss": 0.2854,
"num_input_tokens_seen": 2021384,
"step": 7105
},
{
"epoch": 13.264925373134329,
"grad_norm": 0.6416372060775757,
"learning_rate": 0.000307605860342054,
"loss": 0.2781,
"num_input_tokens_seen": 2022728,
"step": 7110
},
{
"epoch": 13.274253731343283,
"grad_norm": 0.40849006175994873,
"learning_rate": 0.0003068547409507528,
"loss": 0.2897,
"num_input_tokens_seen": 2024168,
"step": 7115
},
{
"epoch": 13.283582089552239,
"grad_norm": 0.4013388156890869,
"learning_rate": 0.0003061041335348837,
"loss": 0.2546,
"num_input_tokens_seen": 2025576,
"step": 7120
},
{
"epoch": 13.292910447761194,
"grad_norm": 0.3964715301990509,
"learning_rate": 0.00030535404008410165,
"loss": 0.227,
"num_input_tokens_seen": 2026984,
"step": 7125
},
{
"epoch": 13.302238805970148,
"grad_norm": 0.49855029582977295,
"learning_rate": 0.0003046044625867004,
"loss": 0.4356,
"num_input_tokens_seen": 2028584,
"step": 7130
},
{
"epoch": 13.311567164179104,
"grad_norm": 0.22731898725032806,
"learning_rate": 0.0003038554030296056,
"loss": 0.4009,
"num_input_tokens_seen": 2030088,
"step": 7135
},
{
"epoch": 13.32089552238806,
"grad_norm": 0.43297654390335083,
"learning_rate": 0.0003031068633983697,
"loss": 0.259,
"num_input_tokens_seen": 2031304,
"step": 7140
},
{
"epoch": 13.330223880597014,
"grad_norm": 0.3339049220085144,
"learning_rate": 0.00030235884567716737,
"loss": 0.2448,
"num_input_tokens_seen": 2032648,
"step": 7145
},
{
"epoch": 13.33955223880597,
"grad_norm": 0.4528850317001343,
"learning_rate": 0.00030161135184878955,
"loss": 0.4167,
"num_input_tokens_seen": 2034024,
"step": 7150
},
{
"epoch": 13.348880597014926,
"grad_norm": 0.367870032787323,
"learning_rate": 0.00030086438389463887,
"loss": 0.3501,
"num_input_tokens_seen": 2035656,
"step": 7155
},
{
"epoch": 13.35820895522388,
"grad_norm": 0.49384191632270813,
"learning_rate": 0.00030011794379472344,
"loss": 0.3107,
"num_input_tokens_seen": 2037256,
"step": 7160
},
{
"epoch": 13.367537313432836,
"grad_norm": 0.578643798828125,
"learning_rate": 0.00029937203352765267,
"loss": 0.3913,
"num_input_tokens_seen": 2038888,
"step": 7165
},
{
"epoch": 13.376865671641792,
"grad_norm": 0.4546909034252167,
"learning_rate": 0.00029862665507063144,
"loss": 0.2388,
"num_input_tokens_seen": 2040456,
"step": 7170
},
{
"epoch": 13.386194029850746,
"grad_norm": 0.39690783619880676,
"learning_rate": 0.00029788181039945463,
"loss": 0.2372,
"num_input_tokens_seen": 2041896,
"step": 7175
},
{
"epoch": 13.395522388059701,
"grad_norm": 0.3378133773803711,
"learning_rate": 0.0002971375014885026,
"loss": 0.3537,
"num_input_tokens_seen": 2043368,
"step": 7180
},
{
"epoch": 13.404850746268657,
"grad_norm": 0.5262777209281921,
"learning_rate": 0.0002963937303107352,
"loss": 0.3643,
"num_input_tokens_seen": 2044680,
"step": 7185
},
{
"epoch": 13.414179104477611,
"grad_norm": 0.506253182888031,
"learning_rate": 0.0002956504988376873,
"loss": 0.3046,
"num_input_tokens_seen": 2046120,
"step": 7190
},
{
"epoch": 13.423507462686567,
"grad_norm": 0.4321569502353668,
"learning_rate": 0.0002949078090394629,
"loss": 0.3583,
"num_input_tokens_seen": 2047528,
"step": 7195
},
{
"epoch": 13.432835820895523,
"grad_norm": 0.6814833879470825,
"learning_rate": 0.00029416566288472995,
"loss": 0.3434,
"num_input_tokens_seen": 2048968,
"step": 7200
},
{
"epoch": 13.442164179104477,
"grad_norm": 0.1856795698404312,
"learning_rate": 0.00029342406234071595,
"loss": 0.3247,
"num_input_tokens_seen": 2050312,
"step": 7205
},
{
"epoch": 13.451492537313433,
"grad_norm": 0.43723657727241516,
"learning_rate": 0.00029268300937320145,
"loss": 0.2912,
"num_input_tokens_seen": 2051848,
"step": 7210
},
{
"epoch": 13.460820895522389,
"grad_norm": 0.6841850280761719,
"learning_rate": 0.00029194250594651624,
"loss": 0.3761,
"num_input_tokens_seen": 2053160,
"step": 7215
},
{
"epoch": 13.470149253731343,
"grad_norm": 0.3979460895061493,
"learning_rate": 0.0002912025540235327,
"loss": 0.4691,
"num_input_tokens_seen": 2054600,
"step": 7220
},
{
"epoch": 13.479477611940299,
"grad_norm": 0.33426252007484436,
"learning_rate": 0.0002904631555656616,
"loss": 0.3804,
"num_input_tokens_seen": 2055944,
"step": 7225
},
{
"epoch": 13.488805970149254,
"grad_norm": 0.28705891966819763,
"learning_rate": 0.00028972431253284725,
"loss": 0.3625,
"num_input_tokens_seen": 2057384,
"step": 7230
},
{
"epoch": 13.498134328358208,
"grad_norm": 0.49933427572250366,
"learning_rate": 0.0002889860268835607,
"loss": 0.297,
"num_input_tokens_seen": 2058888,
"step": 7235
},
{
"epoch": 13.507462686567164,
"grad_norm": 0.5333276391029358,
"learning_rate": 0.00028824830057479613,
"loss": 0.1839,
"num_input_tokens_seen": 2060200,
"step": 7240
},
{
"epoch": 13.51679104477612,
"grad_norm": 0.3615100085735321,
"learning_rate": 0.00028751113556206456,
"loss": 0.244,
"num_input_tokens_seen": 2061640,
"step": 7245
},
{
"epoch": 13.526119402985074,
"grad_norm": 0.38745981454849243,
"learning_rate": 0.0002867745337993899,
"loss": 0.4342,
"num_input_tokens_seen": 2063112,
"step": 7250
},
{
"epoch": 13.53544776119403,
"grad_norm": 0.5390931367874146,
"learning_rate": 0.00028603849723930243,
"loss": 0.3696,
"num_input_tokens_seen": 2064488,
"step": 7255
},
{
"epoch": 13.544776119402986,
"grad_norm": 0.4872875511646271,
"learning_rate": 0.00028530302783283433,
"loss": 0.4132,
"num_input_tokens_seen": 2065864,
"step": 7260
},
{
"epoch": 13.55410447761194,
"grad_norm": 0.6258790493011475,
"learning_rate": 0.00028456812752951485,
"loss": 0.3688,
"num_input_tokens_seen": 2067304,
"step": 7265
},
{
"epoch": 13.563432835820896,
"grad_norm": 0.4587050974369049,
"learning_rate": 0.0002838337982773641,
"loss": 0.3483,
"num_input_tokens_seen": 2068680,
"step": 7270
},
{
"epoch": 13.572761194029852,
"grad_norm": 0.5654541850090027,
"learning_rate": 0.00028310004202288885,
"loss": 0.6093,
"num_input_tokens_seen": 2070120,
"step": 7275
},
{
"epoch": 13.582089552238806,
"grad_norm": 0.4311787784099579,
"learning_rate": 0.0002823668607110767,
"loss": 0.2606,
"num_input_tokens_seen": 2071496,
"step": 7280
},
{
"epoch": 13.591417910447761,
"grad_norm": 0.4221719205379486,
"learning_rate": 0.00028163425628539184,
"loss": 0.3933,
"num_input_tokens_seen": 2072840,
"step": 7285
},
{
"epoch": 13.600746268656717,
"grad_norm": 0.4586186110973358,
"learning_rate": 0.00028090223068776867,
"loss": 0.4825,
"num_input_tokens_seen": 2074152,
"step": 7290
},
{
"epoch": 13.610074626865671,
"grad_norm": 0.6966432332992554,
"learning_rate": 0.00028017078585860735,
"loss": 0.371,
"num_input_tokens_seen": 2075528,
"step": 7295
},
{
"epoch": 13.619402985074627,
"grad_norm": 0.5013579726219177,
"learning_rate": 0.000279439923736769,
"loss": 0.3927,
"num_input_tokens_seen": 2077000,
"step": 7300
},
{
"epoch": 13.628731343283581,
"grad_norm": 0.4000113904476166,
"learning_rate": 0.00027870964625956987,
"loss": 0.4186,
"num_input_tokens_seen": 2078440,
"step": 7305
},
{
"epoch": 13.638059701492537,
"grad_norm": 0.4626561105251312,
"learning_rate": 0.0002779799553627762,
"loss": 0.3657,
"num_input_tokens_seen": 2079848,
"step": 7310
},
{
"epoch": 13.647388059701493,
"grad_norm": 0.289797306060791,
"learning_rate": 0.00027725085298060004,
"loss": 0.2702,
"num_input_tokens_seen": 2081416,
"step": 7315
},
{
"epoch": 13.656716417910447,
"grad_norm": 0.23923040926456451,
"learning_rate": 0.0002765223410456929,
"loss": 0.3342,
"num_input_tokens_seen": 2082888,
"step": 7320
},
{
"epoch": 13.666044776119403,
"grad_norm": 0.5279592871665955,
"learning_rate": 0.0002757944214891412,
"loss": 0.2363,
"num_input_tokens_seen": 2084264,
"step": 7325
},
{
"epoch": 13.675373134328359,
"grad_norm": 0.4002044200897217,
"learning_rate": 0.00027506709624046133,
"loss": 0.2882,
"num_input_tokens_seen": 2085640,
"step": 7330
},
{
"epoch": 13.684701492537313,
"grad_norm": 0.4956214725971222,
"learning_rate": 0.00027434036722759434,
"loss": 0.3519,
"num_input_tokens_seen": 2087048,
"step": 7335
},
{
"epoch": 13.694029850746269,
"grad_norm": 0.4935523271560669,
"learning_rate": 0.00027361423637690073,
"loss": 0.4493,
"num_input_tokens_seen": 2088456,
"step": 7340
},
{
"epoch": 13.703358208955224,
"grad_norm": 0.5428826808929443,
"learning_rate": 0.00027288870561315525,
"loss": 0.2448,
"num_input_tokens_seen": 2090120,
"step": 7345
},
{
"epoch": 13.712686567164178,
"grad_norm": 0.5399242043495178,
"learning_rate": 0.00027216377685954253,
"loss": 0.4441,
"num_input_tokens_seen": 2091592,
"step": 7350
},
{
"epoch": 13.722014925373134,
"grad_norm": 0.397484689950943,
"learning_rate": 0.00027143945203765086,
"loss": 0.3017,
"num_input_tokens_seen": 2093192,
"step": 7355
},
{
"epoch": 13.73134328358209,
"grad_norm": 0.4469533860683441,
"learning_rate": 0.00027071573306746793,
"loss": 0.4668,
"num_input_tokens_seen": 2094632,
"step": 7360
},
{
"epoch": 13.740671641791044,
"grad_norm": 0.538241982460022,
"learning_rate": 0.0002699926218673753,
"loss": 0.2292,
"num_input_tokens_seen": 2096200,
"step": 7365
},
{
"epoch": 13.75,
"grad_norm": 0.6859297752380371,
"learning_rate": 0.00026927012035414397,
"loss": 0.2249,
"num_input_tokens_seen": 2097736,
"step": 7370
},
{
"epoch": 13.759328358208956,
"grad_norm": 0.5205817818641663,
"learning_rate": 0.0002685482304429283,
"loss": 0.3923,
"num_input_tokens_seen": 2099016,
"step": 7375
},
{
"epoch": 13.76865671641791,
"grad_norm": 0.5006594061851501,
"learning_rate": 0.00026782695404726153,
"loss": 0.2378,
"num_input_tokens_seen": 2100360,
"step": 7380
},
{
"epoch": 13.777985074626866,
"grad_norm": 0.45096078515052795,
"learning_rate": 0.00026710629307905107,
"loss": 0.2475,
"num_input_tokens_seen": 2101832,
"step": 7385
},
{
"epoch": 13.787313432835822,
"grad_norm": 0.3145253360271454,
"learning_rate": 0.0002663862494485727,
"loss": 0.2111,
"num_input_tokens_seen": 2103208,
"step": 7390
},
{
"epoch": 13.796641791044776,
"grad_norm": 0.739738404750824,
"learning_rate": 0.0002656668250644656,
"loss": 0.251,
"num_input_tokens_seen": 2104648,
"step": 7395
},
{
"epoch": 13.805970149253731,
"grad_norm": 0.4297608733177185,
"learning_rate": 0.0002649480218337276,
"loss": 0.3109,
"num_input_tokens_seen": 2105992,
"step": 7400
},
{
"epoch": 13.815298507462687,
"grad_norm": 0.4921467900276184,
"learning_rate": 0.0002642298416617102,
"loss": 0.3678,
"num_input_tokens_seen": 2107368,
"step": 7405
},
{
"epoch": 13.824626865671641,
"grad_norm": 0.35035428404808044,
"learning_rate": 0.0002635122864521138,
"loss": 0.3889,
"num_input_tokens_seen": 2108744,
"step": 7410
},
{
"epoch": 13.833955223880597,
"grad_norm": 0.467009961605072,
"learning_rate": 0.00026279535810698083,
"loss": 0.1599,
"num_input_tokens_seen": 2110184,
"step": 7415
},
{
"epoch": 13.843283582089553,
"grad_norm": 0.5431391000747681,
"learning_rate": 0.00026207905852669355,
"loss": 0.316,
"num_input_tokens_seen": 2111656,
"step": 7420
},
{
"epoch": 13.852611940298507,
"grad_norm": 0.5196968913078308,
"learning_rate": 0.00026136338960996666,
"loss": 0.3132,
"num_input_tokens_seen": 2113064,
"step": 7425
},
{
"epoch": 13.861940298507463,
"grad_norm": 0.8434855341911316,
"learning_rate": 0.00026064835325384305,
"loss": 0.3459,
"num_input_tokens_seen": 2114536,
"step": 7430
},
{
"epoch": 13.871268656716419,
"grad_norm": 0.6276556849479675,
"learning_rate": 0.0002599339513536897,
"loss": 0.3708,
"num_input_tokens_seen": 2115912,
"step": 7435
},
{
"epoch": 13.880597014925373,
"grad_norm": 0.5806488394737244,
"learning_rate": 0.000259220185803191,
"loss": 0.3929,
"num_input_tokens_seen": 2117352,
"step": 7440
},
{
"epoch": 13.889925373134329,
"grad_norm": 0.45292994379997253,
"learning_rate": 0.0002585070584943452,
"loss": 0.3945,
"num_input_tokens_seen": 2118664,
"step": 7445
},
{
"epoch": 13.899253731343283,
"grad_norm": 0.45331817865371704,
"learning_rate": 0.00025779457131745774,
"loss": 0.4171,
"num_input_tokens_seen": 2120008,
"step": 7450
},
{
"epoch": 13.908582089552239,
"grad_norm": 0.23430407047271729,
"learning_rate": 0.00025708272616113866,
"loss": 0.2375,
"num_input_tokens_seen": 2121480,
"step": 7455
},
{
"epoch": 13.917910447761194,
"grad_norm": 0.47137734293937683,
"learning_rate": 0.0002563715249122948,
"loss": 0.3517,
"num_input_tokens_seen": 2122920,
"step": 7460
},
{
"epoch": 13.927238805970148,
"grad_norm": 0.6656373739242554,
"learning_rate": 0.00025566096945612725,
"loss": 0.4133,
"num_input_tokens_seen": 2124264,
"step": 7465
},
{
"epoch": 13.936567164179104,
"grad_norm": 0.6758831739425659,
"learning_rate": 0.0002549510616761248,
"loss": 0.3163,
"num_input_tokens_seen": 2125832,
"step": 7470
},
{
"epoch": 13.94589552238806,
"grad_norm": 0.6634296178817749,
"learning_rate": 0.00025424180345405903,
"loss": 0.2261,
"num_input_tokens_seen": 2127336,
"step": 7475
},
{
"epoch": 13.955223880597014,
"grad_norm": 0.6755518317222595,
"learning_rate": 0.0002535331966699809,
"loss": 0.3299,
"num_input_tokens_seen": 2128616,
"step": 7480
},
{
"epoch": 13.96455223880597,
"grad_norm": 0.5297658443450928,
"learning_rate": 0.0002528252432022129,
"loss": 0.2549,
"num_input_tokens_seen": 2129928,
"step": 7485
},
{
"epoch": 13.973880597014926,
"grad_norm": 0.8608595132827759,
"learning_rate": 0.0002521179449273472,
"loss": 0.2852,
"num_input_tokens_seen": 2131304,
"step": 7490
},
{
"epoch": 13.98320895522388,
"grad_norm": 0.5724624395370483,
"learning_rate": 0.0002514113037202389,
"loss": 0.4458,
"num_input_tokens_seen": 2132808,
"step": 7495
},
{
"epoch": 13.992537313432836,
"grad_norm": 0.27308163046836853,
"learning_rate": 0.00025070532145400105,
"loss": 0.1776,
"num_input_tokens_seen": 2134184,
"step": 7500
},
{
"epoch": 14.0,
"eval_loss": 0.7891493439674377,
"eval_runtime": 4.213,
"eval_samples_per_second": 56.491,
"eval_steps_per_second": 14.242,
"num_input_tokens_seen": 2135168,
"step": 7504
},
{
"epoch": 14.001865671641792,
"grad_norm": 0.3923545777797699,
"learning_rate": 0.0002500000000000001,
"loss": 0.5102,
"num_input_tokens_seen": 2135456,
"step": 7505
},
{
"epoch": 14.011194029850746,
"grad_norm": 0.5965924263000488,
"learning_rate": 0.00024929534122785084,
"loss": 0.2351,
"num_input_tokens_seen": 2136960,
"step": 7510
},
{
"epoch": 14.020522388059701,
"grad_norm": 0.32224705815315247,
"learning_rate": 0.0002485913470054119,
"loss": 0.2127,
"num_input_tokens_seen": 2138272,
"step": 7515
},
{
"epoch": 14.029850746268657,
"grad_norm": 0.3885835111141205,
"learning_rate": 0.00024788801919878,
"loss": 0.2403,
"num_input_tokens_seen": 2139680,
"step": 7520
},
{
"epoch": 14.039179104477611,
"grad_norm": 0.42298731207847595,
"learning_rate": 0.0002471853596722851,
"loss": 0.2951,
"num_input_tokens_seen": 2141056,
"step": 7525
},
{
"epoch": 14.048507462686567,
"grad_norm": 0.46374988555908203,
"learning_rate": 0.00024648337028848654,
"loss": 0.1905,
"num_input_tokens_seen": 2142784,
"step": 7530
},
{
"epoch": 14.057835820895523,
"grad_norm": 0.5345138311386108,
"learning_rate": 0.00024578205290816656,
"loss": 0.2886,
"num_input_tokens_seen": 2144192,
"step": 7535
},
{
"epoch": 14.067164179104477,
"grad_norm": 0.5211803913116455,
"learning_rate": 0.00024508140939032646,
"loss": 0.3581,
"num_input_tokens_seen": 2145600,
"step": 7540
},
{
"epoch": 14.076492537313433,
"grad_norm": 0.365950345993042,
"learning_rate": 0.0002443814415921809,
"loss": 0.2361,
"num_input_tokens_seen": 2146976,
"step": 7545
},
{
"epoch": 14.085820895522389,
"grad_norm": 0.5437518954277039,
"learning_rate": 0.00024368215136915417,
"loss": 0.3863,
"num_input_tokens_seen": 2148352,
"step": 7550
},
{
"epoch": 14.095149253731343,
"grad_norm": 0.415061891078949,
"learning_rate": 0.00024298354057487382,
"loss": 0.3582,
"num_input_tokens_seen": 2149664,
"step": 7555
},
{
"epoch": 14.104477611940299,
"grad_norm": 0.6573102474212646,
"learning_rate": 0.00024228561106116647,
"loss": 0.2205,
"num_input_tokens_seen": 2151520,
"step": 7560
},
{
"epoch": 14.113805970149254,
"grad_norm": 0.3543594181537628,
"learning_rate": 0.00024158836467805334,
"loss": 0.2546,
"num_input_tokens_seen": 2152864,
"step": 7565
},
{
"epoch": 14.123134328358208,
"grad_norm": 0.43706467747688293,
"learning_rate": 0.0002408918032737444,
"loss": 0.1902,
"num_input_tokens_seen": 2154368,
"step": 7570
},
{
"epoch": 14.132462686567164,
"grad_norm": 0.6588314771652222,
"learning_rate": 0.00024019592869463374,
"loss": 0.1914,
"num_input_tokens_seen": 2155712,
"step": 7575
},
{
"epoch": 14.14179104477612,
"grad_norm": 0.3580341935157776,
"learning_rate": 0.00023950074278529567,
"loss": 0.2419,
"num_input_tokens_seen": 2157280,
"step": 7580
},
{
"epoch": 14.151119402985074,
"grad_norm": 0.7395253777503967,
"learning_rate": 0.00023880624738847835,
"loss": 0.34,
"num_input_tokens_seen": 2158656,
"step": 7585
},
{
"epoch": 14.16044776119403,
"grad_norm": 0.49231547117233276,
"learning_rate": 0.0002381124443450997,
"loss": 0.2136,
"num_input_tokens_seen": 2160160,
"step": 7590
},
{
"epoch": 14.169776119402986,
"grad_norm": 0.5264060497283936,
"learning_rate": 0.00023741933549424228,
"loss": 0.3088,
"num_input_tokens_seen": 2161600,
"step": 7595
},
{
"epoch": 14.17910447761194,
"grad_norm": 0.6888109445571899,
"learning_rate": 0.00023672692267314916,
"loss": 0.3481,
"num_input_tokens_seen": 2162912,
"step": 7600
},
{
"epoch": 14.188432835820896,
"grad_norm": 0.6409774422645569,
"learning_rate": 0.0002360352077172177,
"loss": 0.3394,
"num_input_tokens_seen": 2164320,
"step": 7605
},
{
"epoch": 14.197761194029852,
"grad_norm": 0.3647206127643585,
"learning_rate": 0.0002353441924599956,
"loss": 0.1627,
"num_input_tokens_seen": 2165888,
"step": 7610
},
{
"epoch": 14.207089552238806,
"grad_norm": 0.48043763637542725,
"learning_rate": 0.0002346538787331763,
"loss": 0.4696,
"num_input_tokens_seen": 2167296,
"step": 7615
},
{
"epoch": 14.216417910447761,
"grad_norm": 0.5464047193527222,
"learning_rate": 0.00023396426836659303,
"loss": 0.2985,
"num_input_tokens_seen": 2168672,
"step": 7620
},
{
"epoch": 14.225746268656717,
"grad_norm": 0.688077986240387,
"learning_rate": 0.00023327536318821495,
"loss": 0.3632,
"num_input_tokens_seen": 2170048,
"step": 7625
},
{
"epoch": 14.235074626865671,
"grad_norm": 0.6989650130271912,
"learning_rate": 0.0002325871650241418,
"loss": 0.3588,
"num_input_tokens_seen": 2171264,
"step": 7630
},
{
"epoch": 14.244402985074627,
"grad_norm": 0.999320387840271,
"learning_rate": 0.00023189967569859938,
"loss": 0.2911,
"num_input_tokens_seen": 2172800,
"step": 7635
},
{
"epoch": 14.253731343283581,
"grad_norm": 0.3599291443824768,
"learning_rate": 0.00023121289703393488,
"loss": 0.2441,
"num_input_tokens_seen": 2174208,
"step": 7640
},
{
"epoch": 14.263059701492537,
"grad_norm": 0.8607851266860962,
"learning_rate": 0.0002305268308506106,
"loss": 0.3268,
"num_input_tokens_seen": 2175584,
"step": 7645
},
{
"epoch": 14.272388059701493,
"grad_norm": 0.4545849859714508,
"learning_rate": 0.0002298414789672016,
"loss": 0.2989,
"num_input_tokens_seen": 2176960,
"step": 7650
},
{
"epoch": 14.281716417910447,
"grad_norm": 0.482704758644104,
"learning_rate": 0.00022915684320038836,
"loss": 0.3393,
"num_input_tokens_seen": 2178400,
"step": 7655
},
{
"epoch": 14.291044776119403,
"grad_norm": 0.49524471163749695,
"learning_rate": 0.00022847292536495447,
"loss": 0.3255,
"num_input_tokens_seen": 2179648,
"step": 7660
},
{
"epoch": 14.300373134328359,
"grad_norm": 0.39817991852760315,
"learning_rate": 0.00022778972727377866,
"loss": 0.3488,
"num_input_tokens_seen": 2181088,
"step": 7665
},
{
"epoch": 14.309701492537313,
"grad_norm": 0.3302770256996155,
"learning_rate": 0.00022710725073783346,
"loss": 0.2023,
"num_input_tokens_seen": 2182464,
"step": 7670
},
{
"epoch": 14.319029850746269,
"grad_norm": 0.6790076494216919,
"learning_rate": 0.00022642549756617835,
"loss": 0.2749,
"num_input_tokens_seen": 2183968,
"step": 7675
},
{
"epoch": 14.328358208955224,
"grad_norm": 0.4351758062839508,
"learning_rate": 0.00022574446956595445,
"loss": 0.2023,
"num_input_tokens_seen": 2185248,
"step": 7680
},
{
"epoch": 14.337686567164178,
"grad_norm": 0.3751530349254608,
"learning_rate": 0.00022506416854238187,
"loss": 0.2206,
"num_input_tokens_seen": 2186848,
"step": 7685
},
{
"epoch": 14.347014925373134,
"grad_norm": 0.3495856821537018,
"learning_rate": 0.00022438459629875291,
"loss": 0.2509,
"num_input_tokens_seen": 2188320,
"step": 7690
},
{
"epoch": 14.35634328358209,
"grad_norm": 0.6211624145507812,
"learning_rate": 0.00022370575463642856,
"loss": 0.3539,
"num_input_tokens_seen": 2189792,
"step": 7695
},
{
"epoch": 14.365671641791044,
"grad_norm": 0.56031733751297,
"learning_rate": 0.00022302764535483293,
"loss": 0.2784,
"num_input_tokens_seen": 2191232,
"step": 7700
},
{
"epoch": 14.375,
"grad_norm": 0.24593977630138397,
"learning_rate": 0.00022235027025144873,
"loss": 0.3143,
"num_input_tokens_seen": 2192672,
"step": 7705
},
{
"epoch": 14.384328358208956,
"grad_norm": 0.7664890885353088,
"learning_rate": 0.000221673631121813,
"loss": 0.4823,
"num_input_tokens_seen": 2193984,
"step": 7710
},
{
"epoch": 14.39365671641791,
"grad_norm": 0.4323963522911072,
"learning_rate": 0.00022099772975951143,
"loss": 0.3216,
"num_input_tokens_seen": 2195424,
"step": 7715
},
{
"epoch": 14.402985074626866,
"grad_norm": 0.6344655156135559,
"learning_rate": 0.00022032256795617434,
"loss": 0.4039,
"num_input_tokens_seen": 2196736,
"step": 7720
},
{
"epoch": 14.412313432835822,
"grad_norm": 0.4761766493320465,
"learning_rate": 0.00021964814750147143,
"loss": 0.3493,
"num_input_tokens_seen": 2198080,
"step": 7725
},
{
"epoch": 14.421641791044776,
"grad_norm": 0.5730311274528503,
"learning_rate": 0.00021897447018310784,
"loss": 0.326,
"num_input_tokens_seen": 2199328,
"step": 7730
},
{
"epoch": 14.430970149253731,
"grad_norm": 0.5259273648262024,
"learning_rate": 0.00021830153778681832,
"loss": 0.2736,
"num_input_tokens_seen": 2200768,
"step": 7735
},
{
"epoch": 14.440298507462687,
"grad_norm": 0.3467364013195038,
"learning_rate": 0.00021762935209636308,
"loss": 0.2123,
"num_input_tokens_seen": 2202240,
"step": 7740
},
{
"epoch": 14.449626865671641,
"grad_norm": 0.6609408259391785,
"learning_rate": 0.00021695791489352345,
"loss": 0.3752,
"num_input_tokens_seen": 2203584,
"step": 7745
},
{
"epoch": 14.458955223880597,
"grad_norm": 0.27350056171417236,
"learning_rate": 0.00021628722795809623,
"loss": 0.252,
"num_input_tokens_seen": 2204928,
"step": 7750
},
{
"epoch": 14.468283582089553,
"grad_norm": 0.7308428883552551,
"learning_rate": 0.00021561729306788957,
"loss": 0.3164,
"num_input_tokens_seen": 2206464,
"step": 7755
},
{
"epoch": 14.477611940298507,
"grad_norm": 0.4643590450286865,
"learning_rate": 0.00021494811199871856,
"loss": 0.3272,
"num_input_tokens_seen": 2207904,
"step": 7760
},
{
"epoch": 14.486940298507463,
"grad_norm": 0.49550262093544006,
"learning_rate": 0.00021427968652439956,
"loss": 0.2784,
"num_input_tokens_seen": 2209376,
"step": 7765
},
{
"epoch": 14.496268656716419,
"grad_norm": 0.5133547186851501,
"learning_rate": 0.00021361201841674639,
"loss": 0.3686,
"num_input_tokens_seen": 2210912,
"step": 7770
},
{
"epoch": 14.505597014925373,
"grad_norm": 0.275790810585022,
"learning_rate": 0.000212945109445565,
"loss": 0.3955,
"num_input_tokens_seen": 2212288,
"step": 7775
},
{
"epoch": 14.514925373134329,
"grad_norm": 0.3647710382938385,
"learning_rate": 0.0002122789613786496,
"loss": 0.2778,
"num_input_tokens_seen": 2213664,
"step": 7780
},
{
"epoch": 14.524253731343283,
"grad_norm": 0.5007681250572205,
"learning_rate": 0.00021161357598177693,
"loss": 0.3911,
"num_input_tokens_seen": 2214976,
"step": 7785
},
{
"epoch": 14.533582089552239,
"grad_norm": 0.4205113351345062,
"learning_rate": 0.0002109489550187022,
"loss": 0.2883,
"num_input_tokens_seen": 2216512,
"step": 7790
},
{
"epoch": 14.542910447761194,
"grad_norm": 0.3494776487350464,
"learning_rate": 0.00021028510025115476,
"loss": 0.2344,
"num_input_tokens_seen": 2217920,
"step": 7795
},
{
"epoch": 14.552238805970148,
"grad_norm": 0.7673720717430115,
"learning_rate": 0.00020962201343883237,
"loss": 0.4006,
"num_input_tokens_seen": 2219296,
"step": 7800
},
{
"epoch": 14.561567164179104,
"grad_norm": 0.5976295471191406,
"learning_rate": 0.00020895969633939748,
"loss": 0.3326,
"num_input_tokens_seen": 2220864,
"step": 7805
},
{
"epoch": 14.57089552238806,
"grad_norm": 0.5417135953903198,
"learning_rate": 0.00020829815070847203,
"loss": 0.3475,
"num_input_tokens_seen": 2222208,
"step": 7810
},
{
"epoch": 14.580223880597014,
"grad_norm": 0.8238497972488403,
"learning_rate": 0.00020763737829963347,
"loss": 0.4032,
"num_input_tokens_seen": 2223456,
"step": 7815
},
{
"epoch": 14.58955223880597,
"grad_norm": 0.2751303017139435,
"learning_rate": 0.00020697738086440914,
"loss": 0.2232,
"num_input_tokens_seen": 2224928,
"step": 7820
},
{
"epoch": 14.598880597014926,
"grad_norm": 0.6023114919662476,
"learning_rate": 0.00020631816015227218,
"loss": 0.37,
"num_input_tokens_seen": 2226464,
"step": 7825
},
{
"epoch": 14.60820895522388,
"grad_norm": 0.9885722994804382,
"learning_rate": 0.00020565971791063731,
"loss": 0.3756,
"num_input_tokens_seen": 2227968,
"step": 7830
},
{
"epoch": 14.617537313432836,
"grad_norm": 0.5508349537849426,
"learning_rate": 0.0002050020558848553,
"loss": 0.3164,
"num_input_tokens_seen": 2229312,
"step": 7835
},
{
"epoch": 14.626865671641792,
"grad_norm": 0.3190430998802185,
"learning_rate": 0.00020434517581820893,
"loss": 0.2365,
"num_input_tokens_seen": 2230752,
"step": 7840
},
{
"epoch": 14.636194029850746,
"grad_norm": 0.46599534153938293,
"learning_rate": 0.000203689079451908,
"loss": 0.219,
"num_input_tokens_seen": 2232096,
"step": 7845
},
{
"epoch": 14.645522388059701,
"grad_norm": 0.5045431852340698,
"learning_rate": 0.00020303376852508526,
"loss": 0.2736,
"num_input_tokens_seen": 2233504,
"step": 7850
},
{
"epoch": 14.654850746268657,
"grad_norm": 0.7143300771713257,
"learning_rate": 0.0002023792447747917,
"loss": 0.4204,
"num_input_tokens_seen": 2234848,
"step": 7855
},
{
"epoch": 14.664179104477611,
"grad_norm": 0.4074963927268982,
"learning_rate": 0.00020172550993599072,
"loss": 0.2737,
"num_input_tokens_seen": 2236192,
"step": 7860
},
{
"epoch": 14.673507462686567,
"grad_norm": 0.38568681478500366,
"learning_rate": 0.00020107256574155563,
"loss": 0.3055,
"num_input_tokens_seen": 2237696,
"step": 7865
},
{
"epoch": 14.682835820895523,
"grad_norm": 0.47734951972961426,
"learning_rate": 0.0002004204139222634,
"loss": 0.3397,
"num_input_tokens_seen": 2239104,
"step": 7870
},
{
"epoch": 14.692164179104477,
"grad_norm": 0.708835244178772,
"learning_rate": 0.00019976905620679053,
"loss": 0.5271,
"num_input_tokens_seen": 2240480,
"step": 7875
},
{
"epoch": 14.701492537313433,
"grad_norm": 0.508047342300415,
"learning_rate": 0.00019911849432170908,
"loss": 0.385,
"num_input_tokens_seen": 2241984,
"step": 7880
},
{
"epoch": 14.710820895522389,
"grad_norm": 0.7358197569847107,
"learning_rate": 0.0001984687299914809,
"loss": 0.4814,
"num_input_tokens_seen": 2243328,
"step": 7885
},
{
"epoch": 14.720149253731343,
"grad_norm": 0.5231989622116089,
"learning_rate": 0.00019781976493845477,
"loss": 0.2839,
"num_input_tokens_seen": 2244832,
"step": 7890
},
{
"epoch": 14.729477611940299,
"grad_norm": 0.4327712655067444,
"learning_rate": 0.00019717160088285928,
"loss": 0.1759,
"num_input_tokens_seen": 2246240,
"step": 7895
},
{
"epoch": 14.738805970149254,
"grad_norm": 0.5407418608665466,
"learning_rate": 0.0001965242395428013,
"loss": 0.3704,
"num_input_tokens_seen": 2247456,
"step": 7900
},
{
"epoch": 14.748134328358208,
"grad_norm": 0.5258449912071228,
"learning_rate": 0.00019587768263425886,
"loss": 0.3365,
"num_input_tokens_seen": 2248768,
"step": 7905
},
{
"epoch": 14.757462686567164,
"grad_norm": 0.29008418321609497,
"learning_rate": 0.00019523193187107846,
"loss": 0.2287,
"num_input_tokens_seen": 2250112,
"step": 7910
},
{
"epoch": 14.76679104477612,
"grad_norm": 0.4476899802684784,
"learning_rate": 0.00019458698896496917,
"loss": 0.3182,
"num_input_tokens_seen": 2251776,
"step": 7915
},
{
"epoch": 14.776119402985074,
"grad_norm": 0.35547712445259094,
"learning_rate": 0.00019394285562549863,
"loss": 0.2906,
"num_input_tokens_seen": 2253184,
"step": 7920
},
{
"epoch": 14.78544776119403,
"grad_norm": 0.5664058327674866,
"learning_rate": 0.00019329953356008928,
"loss": 0.3694,
"num_input_tokens_seen": 2254560,
"step": 7925
},
{
"epoch": 14.794776119402986,
"grad_norm": 0.6664396524429321,
"learning_rate": 0.00019265702447401184,
"loss": 0.4551,
"num_input_tokens_seen": 2255904,
"step": 7930
},
{
"epoch": 14.80410447761194,
"grad_norm": 0.39799797534942627,
"learning_rate": 0.00019201533007038308,
"loss": 0.2978,
"num_input_tokens_seen": 2257280,
"step": 7935
},
{
"epoch": 14.813432835820896,
"grad_norm": 0.5523400902748108,
"learning_rate": 0.00019137445205016018,
"loss": 0.2565,
"num_input_tokens_seen": 2258624,
"step": 7940
},
{
"epoch": 14.822761194029852,
"grad_norm": 0.5098326206207275,
"learning_rate": 0.00019073439211213589,
"loss": 0.2085,
"num_input_tokens_seen": 2260096,
"step": 7945
},
{
"epoch": 14.832089552238806,
"grad_norm": 0.6128082871437073,
"learning_rate": 0.0001900951519529346,
"loss": 0.2444,
"num_input_tokens_seen": 2261696,
"step": 7950
},
{
"epoch": 14.841417910447761,
"grad_norm": 0.6016358137130737,
"learning_rate": 0.0001894567332670075,
"loss": 0.3776,
"num_input_tokens_seen": 2263040,
"step": 7955
},
{
"epoch": 14.850746268656717,
"grad_norm": 0.5598629117012024,
"learning_rate": 0.0001888191377466289,
"loss": 0.3781,
"num_input_tokens_seen": 2264320,
"step": 7960
},
{
"epoch": 14.860074626865671,
"grad_norm": 0.39370617270469666,
"learning_rate": 0.00018818236708189058,
"loss": 0.2014,
"num_input_tokens_seen": 2265792,
"step": 7965
},
{
"epoch": 14.869402985074627,
"grad_norm": 0.5136460661888123,
"learning_rate": 0.0001875464229606978,
"loss": 0.3423,
"num_input_tokens_seen": 2267520,
"step": 7970
},
{
"epoch": 14.878731343283581,
"grad_norm": 0.44041234254837036,
"learning_rate": 0.00018691130706876535,
"loss": 0.2944,
"num_input_tokens_seen": 2268928,
"step": 7975
},
{
"epoch": 14.888059701492537,
"grad_norm": 0.6629489064216614,
"learning_rate": 0.00018627702108961225,
"loss": 0.2556,
"num_input_tokens_seen": 2270240,
"step": 7980
},
{
"epoch": 14.897388059701493,
"grad_norm": 0.4910067319869995,
"learning_rate": 0.00018564356670455767,
"loss": 0.3722,
"num_input_tokens_seen": 2271776,
"step": 7985
},
{
"epoch": 14.906716417910447,
"grad_norm": 0.5173795223236084,
"learning_rate": 0.00018501094559271637,
"loss": 0.2558,
"num_input_tokens_seen": 2273248,
"step": 7990
},
{
"epoch": 14.916044776119403,
"grad_norm": 0.589825451374054,
"learning_rate": 0.0001843791594309948,
"loss": 0.2848,
"num_input_tokens_seen": 2274816,
"step": 7995
},
{
"epoch": 14.925373134328359,
"grad_norm": 0.34794145822525024,
"learning_rate": 0.0001837482098940857,
"loss": 0.2368,
"num_input_tokens_seen": 2276192,
"step": 8000
},
{
"epoch": 14.934701492537313,
"grad_norm": 0.8483067750930786,
"learning_rate": 0.00018311809865446404,
"loss": 0.3701,
"num_input_tokens_seen": 2277408,
"step": 8005
},
{
"epoch": 14.944029850746269,
"grad_norm": 0.38415467739105225,
"learning_rate": 0.00018248882738238344,
"loss": 0.4439,
"num_input_tokens_seen": 2278784,
"step": 8010
},
{
"epoch": 14.953358208955224,
"grad_norm": 0.4851486384868622,
"learning_rate": 0.00018186039774587025,
"loss": 0.3857,
"num_input_tokens_seen": 2280416,
"step": 8015
},
{
"epoch": 14.962686567164178,
"grad_norm": 0.3671543002128601,
"learning_rate": 0.0001812328114107201,
"loss": 0.4152,
"num_input_tokens_seen": 2281856,
"step": 8020
},
{
"epoch": 14.972014925373134,
"grad_norm": 0.6492642760276794,
"learning_rate": 0.00018060607004049322,
"loss": 0.5156,
"num_input_tokens_seen": 2283200,
"step": 8025
},
{
"epoch": 14.98134328358209,
"grad_norm": 0.44069990515708923,
"learning_rate": 0.0001799801752965104,
"loss": 0.3393,
"num_input_tokens_seen": 2284736,
"step": 8030
},
{
"epoch": 14.990671641791044,
"grad_norm": 0.676559329032898,
"learning_rate": 0.00017935512883784788,
"loss": 0.4805,
"num_input_tokens_seen": 2285984,
"step": 8035
},
{
"epoch": 15.0,
"grad_norm": 1.6291553974151611,
"learning_rate": 0.0001787309323213332,
"loss": 0.3594,
"num_input_tokens_seen": 2287232,
"step": 8040
},
{
"epoch": 15.0,
"eval_loss": 0.8014525175094604,
"eval_runtime": 4.2002,
"eval_samples_per_second": 56.664,
"eval_steps_per_second": 14.285,
"num_input_tokens_seen": 2287232,
"step": 8040
},
{
"epoch": 15.009328358208956,
"grad_norm": 0.4468965232372284,
"learning_rate": 0.00017810758740154155,
"loss": 0.2567,
"num_input_tokens_seen": 2288576,
"step": 8045
},
{
"epoch": 15.01865671641791,
"grad_norm": 0.4501148760318756,
"learning_rate": 0.0001774850957307902,
"loss": 0.365,
"num_input_tokens_seen": 2289856,
"step": 8050
},
{
"epoch": 15.027985074626866,
"grad_norm": 0.8252508044242859,
"learning_rate": 0.00017686345895913475,
"loss": 0.3132,
"num_input_tokens_seen": 2291232,
"step": 8055
},
{
"epoch": 15.037313432835822,
"grad_norm": 0.631758987903595,
"learning_rate": 0.00017624267873436516,
"loss": 0.3493,
"num_input_tokens_seen": 2292576,
"step": 8060
},
{
"epoch": 15.046641791044776,
"grad_norm": 0.3204401731491089,
"learning_rate": 0.0001756227567020004,
"loss": 0.3052,
"num_input_tokens_seen": 2294368,
"step": 8065
},
{
"epoch": 15.055970149253731,
"grad_norm": 0.5686541199684143,
"learning_rate": 0.00017500369450528482,
"loss": 0.2585,
"num_input_tokens_seen": 2295936,
"step": 8070
},
{
"epoch": 15.065298507462687,
"grad_norm": 0.5454219579696655,
"learning_rate": 0.0001743854937851833,
"loss": 0.3738,
"num_input_tokens_seen": 2297280,
"step": 8075
},
{
"epoch": 15.074626865671641,
"grad_norm": 0.8047488927841187,
"learning_rate": 0.00017376815618037788,
"loss": 0.3455,
"num_input_tokens_seen": 2298720,
"step": 8080
},
{
"epoch": 15.083955223880597,
"grad_norm": 0.6854057312011719,
"learning_rate": 0.00017315168332726207,
"loss": 0.2792,
"num_input_tokens_seen": 2300192,
"step": 8085
},
{
"epoch": 15.093283582089553,
"grad_norm": 0.6454561352729797,
"learning_rate": 0.0001725360768599371,
"loss": 0.3133,
"num_input_tokens_seen": 2301472,
"step": 8090
},
{
"epoch": 15.102611940298507,
"grad_norm": 0.6675901412963867,
"learning_rate": 0.00017192133841020834,
"loss": 0.2859,
"num_input_tokens_seen": 2303104,
"step": 8095
},
{
"epoch": 15.111940298507463,
"grad_norm": 0.48817259073257446,
"learning_rate": 0.00017130746960757954,
"loss": 0.3838,
"num_input_tokens_seen": 2304480,
"step": 8100
},
{
"epoch": 15.121268656716419,
"grad_norm": 0.6950250864028931,
"learning_rate": 0.00017069447207924992,
"loss": 0.3662,
"num_input_tokens_seen": 2305792,
"step": 8105
},
{
"epoch": 15.130597014925373,
"grad_norm": 0.48446905612945557,
"learning_rate": 0.00017008234745010832,
"loss": 0.4478,
"num_input_tokens_seen": 2307328,
"step": 8110
},
{
"epoch": 15.139925373134329,
"grad_norm": 0.4077727198600769,
"learning_rate": 0.00016947109734273048,
"loss": 0.1662,
"num_input_tokens_seen": 2308896,
"step": 8115
},
{
"epoch": 15.149253731343283,
"grad_norm": 0.3715053200721741,
"learning_rate": 0.00016886072337737417,
"loss": 0.3857,
"num_input_tokens_seen": 2310464,
"step": 8120
},
{
"epoch": 15.158582089552239,
"grad_norm": 0.24879996478557587,
"learning_rate": 0.00016825122717197382,
"loss": 0.138,
"num_input_tokens_seen": 2311872,
"step": 8125
},
{
"epoch": 15.167910447761194,
"grad_norm": 0.46096736192703247,
"learning_rate": 0.00016764261034213812,
"loss": 0.3597,
"num_input_tokens_seen": 2313312,
"step": 8130
},
{
"epoch": 15.177238805970148,
"grad_norm": 0.5743706822395325,
"learning_rate": 0.00016703487450114407,
"loss": 0.3697,
"num_input_tokens_seen": 2314560,
"step": 8135
},
{
"epoch": 15.186567164179104,
"grad_norm": 0.6948251128196716,
"learning_rate": 0.00016642802125993428,
"loss": 0.2639,
"num_input_tokens_seen": 2315840,
"step": 8140
},
{
"epoch": 15.19589552238806,
"grad_norm": 0.37598899006843567,
"learning_rate": 0.0001658220522271105,
"loss": 0.3013,
"num_input_tokens_seen": 2317152,
"step": 8145
},
{
"epoch": 15.205223880597014,
"grad_norm": 0.5866940021514893,
"learning_rate": 0.00016521696900893192,
"loss": 0.273,
"num_input_tokens_seen": 2318720,
"step": 8150
},
{
"epoch": 15.21455223880597,
"grad_norm": 0.5363821983337402,
"learning_rate": 0.00016461277320930923,
"loss": 0.2352,
"num_input_tokens_seen": 2320192,
"step": 8155
},
{
"epoch": 15.223880597014926,
"grad_norm": 0.6275938749313354,
"learning_rate": 0.0001640094664298007,
"loss": 0.3144,
"num_input_tokens_seen": 2321536,
"step": 8160
},
{
"epoch": 15.23320895522388,
"grad_norm": 0.47853443026542664,
"learning_rate": 0.00016340705026960818,
"loss": 0.2924,
"num_input_tokens_seen": 2322976,
"step": 8165
},
{
"epoch": 15.242537313432836,
"grad_norm": 0.6668580770492554,
"learning_rate": 0.00016280552632557245,
"loss": 0.2733,
"num_input_tokens_seen": 2324448,
"step": 8170
},
{
"epoch": 15.251865671641792,
"grad_norm": 0.6049531102180481,
"learning_rate": 0.0001622048961921699,
"loss": 0.4943,
"num_input_tokens_seen": 2325760,
"step": 8175
},
{
"epoch": 15.261194029850746,
"grad_norm": 0.47579655051231384,
"learning_rate": 0.0001616051614615071,
"loss": 0.2544,
"num_input_tokens_seen": 2327328,
"step": 8180
},
{
"epoch": 15.270522388059701,
"grad_norm": 0.6176946759223938,
"learning_rate": 0.00016100632372331725,
"loss": 0.2634,
"num_input_tokens_seen": 2328672,
"step": 8185
},
{
"epoch": 15.279850746268657,
"grad_norm": 0.6476304531097412,
"learning_rate": 0.00016040838456495615,
"loss": 0.3383,
"num_input_tokens_seen": 2330176,
"step": 8190
},
{
"epoch": 15.289179104477611,
"grad_norm": 0.3113108277320862,
"learning_rate": 0.00015981134557139742,
"loss": 0.2696,
"num_input_tokens_seen": 2331584,
"step": 8195
},
{
"epoch": 15.298507462686567,
"grad_norm": 0.4137718379497528,
"learning_rate": 0.00015921520832522874,
"loss": 0.3759,
"num_input_tokens_seen": 2332832,
"step": 8200
},
{
"epoch": 15.307835820895523,
"grad_norm": 1.1625181436538696,
"learning_rate": 0.00015861997440664717,
"loss": 0.3662,
"num_input_tokens_seen": 2333984,
"step": 8205
},
{
"epoch": 15.317164179104477,
"grad_norm": 0.5340922474861145,
"learning_rate": 0.000158025645393456,
"loss": 0.2759,
"num_input_tokens_seen": 2335232,
"step": 8210
},
{
"epoch": 15.326492537313433,
"grad_norm": 0.46783941984176636,
"learning_rate": 0.0001574322228610592,
"loss": 0.2871,
"num_input_tokens_seen": 2336512,
"step": 8215
},
{
"epoch": 15.335820895522389,
"grad_norm": 0.6034365296363831,
"learning_rate": 0.00015683970838245798,
"loss": 0.305,
"num_input_tokens_seen": 2338112,
"step": 8220
},
{
"epoch": 15.345149253731343,
"grad_norm": 0.3258196711540222,
"learning_rate": 0.0001562481035282471,
"loss": 0.2678,
"num_input_tokens_seen": 2339744,
"step": 8225
},
{
"epoch": 15.354477611940299,
"grad_norm": 0.4759286344051361,
"learning_rate": 0.00015565740986660947,
"loss": 0.3317,
"num_input_tokens_seen": 2341088,
"step": 8230
},
{
"epoch": 15.363805970149254,
"grad_norm": 0.8320894241333008,
"learning_rate": 0.000155067628963313,
"loss": 0.2662,
"num_input_tokens_seen": 2342592,
"step": 8235
},
{
"epoch": 15.373134328358208,
"grad_norm": 0.5550680160522461,
"learning_rate": 0.00015447876238170626,
"loss": 0.2799,
"num_input_tokens_seen": 2344128,
"step": 8240
},
{
"epoch": 15.382462686567164,
"grad_norm": 0.5990989804267883,
"learning_rate": 0.0001538908116827139,
"loss": 0.2282,
"num_input_tokens_seen": 2345472,
"step": 8245
},
{
"epoch": 15.39179104477612,
"grad_norm": 0.791718602180481,
"learning_rate": 0.00015330377842483306,
"loss": 0.2224,
"num_input_tokens_seen": 2346944,
"step": 8250
},
{
"epoch": 15.401119402985074,
"grad_norm": 0.5611154437065125,
"learning_rate": 0.00015271766416412858,
"loss": 0.2715,
"num_input_tokens_seen": 2348288,
"step": 8255
},
{
"epoch": 15.41044776119403,
"grad_norm": 0.4410983920097351,
"learning_rate": 0.00015213247045422996,
"loss": 0.2581,
"num_input_tokens_seen": 2349728,
"step": 8260
},
{
"epoch": 15.419776119402986,
"grad_norm": 0.6476612091064453,
"learning_rate": 0.0001515481988463261,
"loss": 0.3037,
"num_input_tokens_seen": 2351008,
"step": 8265
},
{
"epoch": 15.42910447761194,
"grad_norm": 0.831028938293457,
"learning_rate": 0.00015096485088916155,
"loss": 0.2987,
"num_input_tokens_seen": 2352384,
"step": 8270
},
{
"epoch": 15.438432835820896,
"grad_norm": 0.22620081901550293,
"learning_rate": 0.00015038242812903313,
"loss": 0.3147,
"num_input_tokens_seen": 2353920,
"step": 8275
},
{
"epoch": 15.447761194029852,
"grad_norm": 0.4023994207382202,
"learning_rate": 0.00014980093210978452,
"loss": 0.3046,
"num_input_tokens_seen": 2355392,
"step": 8280
},
{
"epoch": 15.457089552238806,
"grad_norm": 0.6006656885147095,
"learning_rate": 0.00014922036437280324,
"loss": 0.3329,
"num_input_tokens_seen": 2356800,
"step": 8285
},
{
"epoch": 15.466417910447761,
"grad_norm": 0.6616522669792175,
"learning_rate": 0.00014864072645701592,
"loss": 0.205,
"num_input_tokens_seen": 2358208,
"step": 8290
},
{
"epoch": 15.475746268656717,
"grad_norm": 0.5346037745475769,
"learning_rate": 0.00014806201989888502,
"loss": 0.3307,
"num_input_tokens_seen": 2359552,
"step": 8295
},
{
"epoch": 15.485074626865671,
"grad_norm": 0.4655405282974243,
"learning_rate": 0.00014748424623240363,
"loss": 0.3352,
"num_input_tokens_seen": 2360960,
"step": 8300
},
{
"epoch": 15.494402985074627,
"grad_norm": 1.0552483797073364,
"learning_rate": 0.00014690740698909222,
"loss": 0.2739,
"num_input_tokens_seen": 2362400,
"step": 8305
},
{
"epoch": 15.503731343283581,
"grad_norm": 0.5363913178443909,
"learning_rate": 0.0001463315036979946,
"loss": 0.3144,
"num_input_tokens_seen": 2363808,
"step": 8310
},
{
"epoch": 15.513059701492537,
"grad_norm": 0.4912722408771515,
"learning_rate": 0.0001457565378856733,
"loss": 0.2818,
"num_input_tokens_seen": 2365344,
"step": 8315
},
{
"epoch": 15.522388059701493,
"grad_norm": 0.2873976230621338,
"learning_rate": 0.0001451825110762059,
"loss": 0.1968,
"num_input_tokens_seen": 2366976,
"step": 8320
},
{
"epoch": 15.531716417910447,
"grad_norm": 0.6075371503829956,
"learning_rate": 0.00014460942479118083,
"loss": 0.3125,
"num_input_tokens_seen": 2368384,
"step": 8325
},
{
"epoch": 15.541044776119403,
"grad_norm": 0.42490240931510925,
"learning_rate": 0.0001440372805496939,
"loss": 0.3007,
"num_input_tokens_seen": 2369952,
"step": 8330
},
{
"epoch": 15.550373134328359,
"grad_norm": 0.40874677896499634,
"learning_rate": 0.0001434660798683437,
"loss": 0.2063,
"num_input_tokens_seen": 2371360,
"step": 8335
},
{
"epoch": 15.559701492537313,
"grad_norm": 0.3459983766078949,
"learning_rate": 0.00014289582426122693,
"loss": 0.2497,
"num_input_tokens_seen": 2372928,
"step": 8340
},
{
"epoch": 15.569029850746269,
"grad_norm": 0.4452391564846039,
"learning_rate": 0.00014232651523993635,
"loss": 0.2508,
"num_input_tokens_seen": 2374400,
"step": 8345
},
{
"epoch": 15.578358208955224,
"grad_norm": 0.6100215911865234,
"learning_rate": 0.00014175815431355466,
"loss": 0.2734,
"num_input_tokens_seen": 2375904,
"step": 8350
},
{
"epoch": 15.587686567164178,
"grad_norm": 0.8004417419433594,
"learning_rate": 0.00014119074298865164,
"loss": 0.402,
"num_input_tokens_seen": 2377056,
"step": 8355
},
{
"epoch": 15.597014925373134,
"grad_norm": 0.3961009979248047,
"learning_rate": 0.00014062428276928046,
"loss": 0.1442,
"num_input_tokens_seen": 2378432,
"step": 8360
},
{
"epoch": 15.60634328358209,
"grad_norm": 0.6664312481880188,
"learning_rate": 0.0001400587751569723,
"loss": 0.3781,
"num_input_tokens_seen": 2379872,
"step": 8365
},
{
"epoch": 15.615671641791044,
"grad_norm": 0.46110233664512634,
"learning_rate": 0.00013949422165073421,
"loss": 0.1627,
"num_input_tokens_seen": 2381376,
"step": 8370
},
{
"epoch": 15.625,
"grad_norm": 0.5933917760848999,
"learning_rate": 0.00013893062374704308,
"loss": 0.4071,
"num_input_tokens_seen": 2382752,
"step": 8375
},
{
"epoch": 15.634328358208956,
"grad_norm": 0.5305347442626953,
"learning_rate": 0.00013836798293984364,
"loss": 0.2568,
"num_input_tokens_seen": 2384000,
"step": 8380
},
{
"epoch": 15.64365671641791,
"grad_norm": 0.3536977171897888,
"learning_rate": 0.00013780630072054311,
"loss": 0.1958,
"num_input_tokens_seen": 2385344,
"step": 8385
},
{
"epoch": 15.652985074626866,
"grad_norm": 0.46019840240478516,
"learning_rate": 0.00013724557857800824,
"loss": 0.2185,
"num_input_tokens_seen": 2386784,
"step": 8390
},
{
"epoch": 15.662313432835822,
"grad_norm": 0.7752018570899963,
"learning_rate": 0.0001366858179985604,
"loss": 0.2764,
"num_input_tokens_seen": 2388192,
"step": 8395
},
{
"epoch": 15.671641791044776,
"grad_norm": 0.6549847722053528,
"learning_rate": 0.0001361270204659721,
"loss": 0.2833,
"num_input_tokens_seen": 2389664,
"step": 8400
},
{
"epoch": 15.680970149253731,
"grad_norm": 0.2715495228767395,
"learning_rate": 0.0001355691874614638,
"loss": 0.3665,
"num_input_tokens_seen": 2391136,
"step": 8405
},
{
"epoch": 15.690298507462687,
"grad_norm": 0.46817731857299805,
"learning_rate": 0.00013501232046369811,
"loss": 0.235,
"num_input_tokens_seen": 2392448,
"step": 8410
},
{
"epoch": 15.699626865671641,
"grad_norm": 0.9094480276107788,
"learning_rate": 0.00013445642094877793,
"loss": 0.4955,
"num_input_tokens_seen": 2393888,
"step": 8415
},
{
"epoch": 15.708955223880597,
"grad_norm": 0.4685889780521393,
"learning_rate": 0.0001339014903902415,
"loss": 0.3565,
"num_input_tokens_seen": 2395264,
"step": 8420
},
{
"epoch": 15.718283582089553,
"grad_norm": 1.049347162246704,
"learning_rate": 0.0001333475302590584,
"loss": 0.3012,
"num_input_tokens_seen": 2396608,
"step": 8425
},
{
"epoch": 15.727611940298507,
"grad_norm": 0.5767874121665955,
"learning_rate": 0.00013279454202362573,
"loss": 0.2502,
"num_input_tokens_seen": 2398080,
"step": 8430
},
{
"epoch": 15.736940298507463,
"grad_norm": 0.589856743812561,
"learning_rate": 0.0001322425271497646,
"loss": 0.2351,
"num_input_tokens_seen": 2399616,
"step": 8435
},
{
"epoch": 15.746268656716419,
"grad_norm": 0.5481240749359131,
"learning_rate": 0.00013169148710071615,
"loss": 0.2441,
"num_input_tokens_seen": 2401056,
"step": 8440
},
{
"epoch": 15.755597014925373,
"grad_norm": 0.5046172738075256,
"learning_rate": 0.00013114142333713725,
"loss": 0.2152,
"num_input_tokens_seen": 2402528,
"step": 8445
},
{
"epoch": 15.764925373134329,
"grad_norm": 0.7722668647766113,
"learning_rate": 0.00013059233731709685,
"loss": 0.226,
"num_input_tokens_seen": 2403872,
"step": 8450
},
{
"epoch": 15.774253731343283,
"grad_norm": 0.6092769503593445,
"learning_rate": 0.00013004423049607256,
"loss": 0.2227,
"num_input_tokens_seen": 2405376,
"step": 8455
},
{
"epoch": 15.783582089552239,
"grad_norm": 0.6560283899307251,
"learning_rate": 0.000129497104326946,
"loss": 0.261,
"num_input_tokens_seen": 2406656,
"step": 8460
},
{
"epoch": 15.792910447761194,
"grad_norm": 0.4380224049091339,
"learning_rate": 0.00012895096025999957,
"loss": 0.4086,
"num_input_tokens_seen": 2408256,
"step": 8465
},
{
"epoch": 15.802238805970148,
"grad_norm": 0.28187674283981323,
"learning_rate": 0.00012840579974291217,
"loss": 0.3477,
"num_input_tokens_seen": 2409568,
"step": 8470
},
{
"epoch": 15.811567164179104,
"grad_norm": 0.4487137794494629,
"learning_rate": 0.00012786162422075598,
"loss": 0.1889,
"num_input_tokens_seen": 2410912,
"step": 8475
},
{
"epoch": 15.82089552238806,
"grad_norm": 0.5683954358100891,
"learning_rate": 0.0001273184351359918,
"loss": 0.2914,
"num_input_tokens_seen": 2412416,
"step": 8480
},
{
"epoch": 15.830223880597014,
"grad_norm": 0.37054821848869324,
"learning_rate": 0.00012677623392846565,
"loss": 0.2331,
"num_input_tokens_seen": 2413792,
"step": 8485
},
{
"epoch": 15.83955223880597,
"grad_norm": 0.5374351143836975,
"learning_rate": 0.00012623502203540555,
"loss": 0.2678,
"num_input_tokens_seen": 2415232,
"step": 8490
},
{
"epoch": 15.848880597014926,
"grad_norm": 0.6811636090278625,
"learning_rate": 0.0001256948008914165,
"loss": 0.2646,
"num_input_tokens_seen": 2416544,
"step": 8495
},
{
"epoch": 15.85820895522388,
"grad_norm": 0.5923631191253662,
"learning_rate": 0.00012515557192847737,
"loss": 0.3662,
"num_input_tokens_seen": 2418176,
"step": 8500
},
{
"epoch": 15.867537313432836,
"grad_norm": 0.5537749528884888,
"learning_rate": 0.00012461733657593722,
"loss": 0.2539,
"num_input_tokens_seen": 2419616,
"step": 8505
},
{
"epoch": 15.876865671641792,
"grad_norm": 0.5480448603630066,
"learning_rate": 0.00012408009626051135,
"loss": 0.2163,
"num_input_tokens_seen": 2420928,
"step": 8510
},
{
"epoch": 15.886194029850746,
"grad_norm": 0.8457586169242859,
"learning_rate": 0.00012354385240627736,
"loss": 0.1695,
"num_input_tokens_seen": 2422336,
"step": 8515
},
{
"epoch": 15.895522388059701,
"grad_norm": 0.3043023645877838,
"learning_rate": 0.00012300860643467133,
"loss": 0.2807,
"num_input_tokens_seen": 2423584,
"step": 8520
},
{
"epoch": 15.904850746268657,
"grad_norm": 0.822465717792511,
"learning_rate": 0.00012247435976448474,
"loss": 0.2682,
"num_input_tokens_seen": 2424928,
"step": 8525
},
{
"epoch": 15.914179104477611,
"grad_norm": 0.5391532182693481,
"learning_rate": 0.00012194111381185973,
"loss": 0.2548,
"num_input_tokens_seen": 2426400,
"step": 8530
},
{
"epoch": 15.923507462686567,
"grad_norm": 0.737484335899353,
"learning_rate": 0.00012140886999028583,
"loss": 0.1762,
"num_input_tokens_seen": 2427872,
"step": 8535
},
{
"epoch": 15.932835820895523,
"grad_norm": 0.6602703928947449,
"learning_rate": 0.00012087762971059663,
"loss": 0.2668,
"num_input_tokens_seen": 2429120,
"step": 8540
},
{
"epoch": 15.942164179104477,
"grad_norm": 0.5175513029098511,
"learning_rate": 0.00012034739438096509,
"loss": 0.2865,
"num_input_tokens_seen": 2430368,
"step": 8545
},
{
"epoch": 15.951492537313433,
"grad_norm": 0.7112910747528076,
"learning_rate": 0.0001198181654069006,
"loss": 0.3435,
"num_input_tokens_seen": 2431712,
"step": 8550
},
{
"epoch": 15.960820895522389,
"grad_norm": 0.46317917108535767,
"learning_rate": 0.00011928994419124467,
"loss": 0.2369,
"num_input_tokens_seen": 2433152,
"step": 8555
},
{
"epoch": 15.970149253731343,
"grad_norm": 0.8226314783096313,
"learning_rate": 0.000118762732134168,
"loss": 0.317,
"num_input_tokens_seen": 2434752,
"step": 8560
},
{
"epoch": 15.979477611940299,
"grad_norm": 0.6281651258468628,
"learning_rate": 0.00011823653063316631,
"loss": 0.3671,
"num_input_tokens_seen": 2436128,
"step": 8565
},
{
"epoch": 15.988805970149254,
"grad_norm": 0.4451799988746643,
"learning_rate": 0.0001177113410830557,
"loss": 0.3262,
"num_input_tokens_seen": 2437536,
"step": 8570
},
{
"epoch": 15.998134328358208,
"grad_norm": 0.43823346495628357,
"learning_rate": 0.00011718716487597098,
"loss": 0.2563,
"num_input_tokens_seen": 2438944,
"step": 8575
},
{
"epoch": 16.0,
"eval_loss": 0.8285130858421326,
"eval_runtime": 4.2024,
"eval_samples_per_second": 56.635,
"eval_steps_per_second": 14.278,
"num_input_tokens_seen": 2438992,
"step": 8576
},
{
"epoch": 16.007462686567163,
"grad_norm": 0.6276515126228333,
"learning_rate": 0.00011666400340136013,
"loss": 0.3762,
"num_input_tokens_seen": 2439984,
"step": 8580
},
{
"epoch": 16.01679104477612,
"grad_norm": 0.732320249080658,
"learning_rate": 0.00011614185804598199,
"loss": 0.3555,
"num_input_tokens_seen": 2441232,
"step": 8585
},
{
"epoch": 16.026119402985074,
"grad_norm": 0.41469866037368774,
"learning_rate": 0.00011562073019390096,
"loss": 0.225,
"num_input_tokens_seen": 2442512,
"step": 8590
},
{
"epoch": 16.03544776119403,
"grad_norm": 0.2994535565376282,
"learning_rate": 0.00011510062122648528,
"loss": 0.1708,
"num_input_tokens_seen": 2444176,
"step": 8595
},
{
"epoch": 16.044776119402986,
"grad_norm": 0.6539736986160278,
"learning_rate": 0.00011458153252240233,
"loss": 0.2227,
"num_input_tokens_seen": 2445584,
"step": 8600
},
{
"epoch": 16.05410447761194,
"grad_norm": 0.6162667274475098,
"learning_rate": 0.00011406346545761415,
"loss": 0.3296,
"num_input_tokens_seen": 2446928,
"step": 8605
},
{
"epoch": 16.063432835820894,
"grad_norm": 0.5158094763755798,
"learning_rate": 0.0001135464214053758,
"loss": 0.3577,
"num_input_tokens_seen": 2448272,
"step": 8610
},
{
"epoch": 16.07276119402985,
"grad_norm": 0.6340551376342773,
"learning_rate": 0.00011303040173622975,
"loss": 0.2312,
"num_input_tokens_seen": 2449744,
"step": 8615
},
{
"epoch": 16.082089552238806,
"grad_norm": 0.5297074913978577,
"learning_rate": 0.00011251540781800379,
"loss": 0.2522,
"num_input_tokens_seen": 2451184,
"step": 8620
},
{
"epoch": 16.09141791044776,
"grad_norm": 0.5866245031356812,
"learning_rate": 0.00011200144101580634,
"loss": 0.3074,
"num_input_tokens_seen": 2452496,
"step": 8625
},
{
"epoch": 16.100746268656717,
"grad_norm": 0.6043202877044678,
"learning_rate": 0.00011148850269202305,
"loss": 0.2295,
"num_input_tokens_seen": 2454000,
"step": 8630
},
{
"epoch": 16.11007462686567,
"grad_norm": 0.5400373935699463,
"learning_rate": 0.0001109765942063139,
"loss": 0.3557,
"num_input_tokens_seen": 2455440,
"step": 8635
},
{
"epoch": 16.119402985074625,
"grad_norm": 0.48669207096099854,
"learning_rate": 0.00011046571691560863,
"loss": 0.2135,
"num_input_tokens_seen": 2456784,
"step": 8640
},
{
"epoch": 16.128731343283583,
"grad_norm": 0.8610177636146545,
"learning_rate": 0.00010995587217410369,
"loss": 0.2616,
"num_input_tokens_seen": 2458224,
"step": 8645
},
{
"epoch": 16.138059701492537,
"grad_norm": 0.47535207867622375,
"learning_rate": 0.00010944706133325832,
"loss": 0.3128,
"num_input_tokens_seen": 2459824,
"step": 8650
},
{
"epoch": 16.14738805970149,
"grad_norm": 0.7235194444656372,
"learning_rate": 0.00010893928574179174,
"loss": 0.2185,
"num_input_tokens_seen": 2461264,
"step": 8655
},
{
"epoch": 16.15671641791045,
"grad_norm": 0.6055487990379333,
"learning_rate": 0.00010843254674567832,
"loss": 0.4397,
"num_input_tokens_seen": 2462576,
"step": 8660
},
{
"epoch": 16.166044776119403,
"grad_norm": 0.4858103096485138,
"learning_rate": 0.00010792684568814504,
"loss": 0.3052,
"num_input_tokens_seen": 2464144,
"step": 8665
},
{
"epoch": 16.175373134328357,
"grad_norm": 0.6048887372016907,
"learning_rate": 0.00010742218390966768,
"loss": 0.2574,
"num_input_tokens_seen": 2465584,
"step": 8670
},
{
"epoch": 16.184701492537314,
"grad_norm": 0.4468727707862854,
"learning_rate": 0.00010691856274796702,
"loss": 0.1812,
"num_input_tokens_seen": 2467056,
"step": 8675
},
{
"epoch": 16.19402985074627,
"grad_norm": 0.7549290657043457,
"learning_rate": 0.0001064159835380053,
"loss": 0.2291,
"num_input_tokens_seen": 2468464,
"step": 8680
},
{
"epoch": 16.203358208955223,
"grad_norm": 0.5962735414505005,
"learning_rate": 0.00010591444761198332,
"loss": 0.3033,
"num_input_tokens_seen": 2469968,
"step": 8685
},
{
"epoch": 16.21268656716418,
"grad_norm": 0.7374847531318665,
"learning_rate": 0.00010541395629933586,
"loss": 0.3931,
"num_input_tokens_seen": 2471280,
"step": 8690
},
{
"epoch": 16.222014925373134,
"grad_norm": 0.3907533884048462,
"learning_rate": 0.00010491451092672904,
"loss": 0.1632,
"num_input_tokens_seen": 2472560,
"step": 8695
},
{
"epoch": 16.23134328358209,
"grad_norm": 0.4274299740791321,
"learning_rate": 0.0001044161128180563,
"loss": 0.2025,
"num_input_tokens_seen": 2474032,
"step": 8700
},
{
"epoch": 16.240671641791046,
"grad_norm": 0.4550669491291046,
"learning_rate": 0.00010391876329443534,
"loss": 0.3309,
"num_input_tokens_seen": 2475472,
"step": 8705
},
{
"epoch": 16.25,
"grad_norm": 0.5084713697433472,
"learning_rate": 0.00010342246367420411,
"loss": 0.2977,
"num_input_tokens_seen": 2476912,
"step": 8710
},
{
"epoch": 16.259328358208954,
"grad_norm": 0.5619415044784546,
"learning_rate": 0.00010292721527291742,
"loss": 0.2917,
"num_input_tokens_seen": 2478256,
"step": 8715
},
{
"epoch": 16.26865671641791,
"grad_norm": 0.574165403842926,
"learning_rate": 0.00010243301940334415,
"loss": 0.2578,
"num_input_tokens_seen": 2479696,
"step": 8720
},
{
"epoch": 16.277985074626866,
"grad_norm": 0.6561511754989624,
"learning_rate": 0.00010193987737546262,
"loss": 0.2869,
"num_input_tokens_seen": 2481136,
"step": 8725
},
{
"epoch": 16.28731343283582,
"grad_norm": 0.5067024827003479,
"learning_rate": 0.00010144779049645792,
"loss": 0.1726,
"num_input_tokens_seen": 2482416,
"step": 8730
},
{
"epoch": 16.296641791044777,
"grad_norm": 0.6899192333221436,
"learning_rate": 0.00010095676007071808,
"loss": 0.2841,
"num_input_tokens_seen": 2483792,
"step": 8735
},
{
"epoch": 16.30597014925373,
"grad_norm": 0.6847012042999268,
"learning_rate": 0.00010046678739983129,
"loss": 0.2843,
"num_input_tokens_seen": 2485136,
"step": 8740
},
{
"epoch": 16.315298507462686,
"grad_norm": 0.4542698264122009,
"learning_rate": 9.997787378258122e-05,
"loss": 0.3108,
"num_input_tokens_seen": 2486640,
"step": 8745
},
{
"epoch": 16.324626865671643,
"grad_norm": 0.4527323842048645,
"learning_rate": 9.949002051494465e-05,
"loss": 0.2882,
"num_input_tokens_seen": 2488176,
"step": 8750
},
{
"epoch": 16.333955223880597,
"grad_norm": 0.596170961856842,
"learning_rate": 9.900322889008772e-05,
"loss": 0.2263,
"num_input_tokens_seen": 2489648,
"step": 8755
},
{
"epoch": 16.34328358208955,
"grad_norm": 0.9109885096549988,
"learning_rate": 9.851750019836231e-05,
"loss": 0.3221,
"num_input_tokens_seen": 2491120,
"step": 8760
},
{
"epoch": 16.35261194029851,
"grad_norm": 0.5357970595359802,
"learning_rate": 9.803283572730271e-05,
"loss": 0.2394,
"num_input_tokens_seen": 2492656,
"step": 8765
},
{
"epoch": 16.361940298507463,
"grad_norm": 0.705450177192688,
"learning_rate": 9.75492367616222e-05,
"loss": 0.2454,
"num_input_tokens_seen": 2493904,
"step": 8770
},
{
"epoch": 16.371268656716417,
"grad_norm": 0.6140018105506897,
"learning_rate": 9.706670458320993e-05,
"loss": 0.2821,
"num_input_tokens_seen": 2495536,
"step": 8775
},
{
"epoch": 16.380597014925375,
"grad_norm": 0.545140266418457,
"learning_rate": 9.658524047112749e-05,
"loss": 0.303,
"num_input_tokens_seen": 2496816,
"step": 8780
},
{
"epoch": 16.38992537313433,
"grad_norm": 0.6749919652938843,
"learning_rate": 9.610484570160444e-05,
"loss": 0.2499,
"num_input_tokens_seen": 2498224,
"step": 8785
},
{
"epoch": 16.399253731343283,
"grad_norm": 0.5334325432777405,
"learning_rate": 9.562552154803673e-05,
"loss": 0.3425,
"num_input_tokens_seen": 2499760,
"step": 8790
},
{
"epoch": 16.40858208955224,
"grad_norm": 0.3113941252231598,
"learning_rate": 9.514726928098189e-05,
"loss": 0.2433,
"num_input_tokens_seen": 2501040,
"step": 8795
},
{
"epoch": 16.417910447761194,
"grad_norm": 0.39453864097595215,
"learning_rate": 9.467009016815625e-05,
"loss": 0.2447,
"num_input_tokens_seen": 2502672,
"step": 8800
},
{
"epoch": 16.42723880597015,
"grad_norm": 0.7549194693565369,
"learning_rate": 9.419398547443175e-05,
"loss": 0.3651,
"num_input_tokens_seen": 2504048,
"step": 8805
},
{
"epoch": 16.436567164179106,
"grad_norm": 0.4813898801803589,
"learning_rate": 9.371895646183199e-05,
"loss": 0.4382,
"num_input_tokens_seen": 2505488,
"step": 8810
},
{
"epoch": 16.44589552238806,
"grad_norm": 0.6909576654434204,
"learning_rate": 9.324500438952965e-05,
"loss": 0.2031,
"num_input_tokens_seen": 2506800,
"step": 8815
},
{
"epoch": 16.455223880597014,
"grad_norm": 0.31445515155792236,
"learning_rate": 9.27721305138421e-05,
"loss": 0.1622,
"num_input_tokens_seen": 2508144,
"step": 8820
},
{
"epoch": 16.46455223880597,
"grad_norm": 0.4787421226501465,
"learning_rate": 9.23003360882293e-05,
"loss": 0.2795,
"num_input_tokens_seen": 2509456,
"step": 8825
},
{
"epoch": 16.473880597014926,
"grad_norm": 0.610919713973999,
"learning_rate": 9.182962236328957e-05,
"loss": 0.3738,
"num_input_tokens_seen": 2510864,
"step": 8830
},
{
"epoch": 16.48320895522388,
"grad_norm": 0.5633202791213989,
"learning_rate": 9.135999058675687e-05,
"loss": 0.1995,
"num_input_tokens_seen": 2512144,
"step": 8835
},
{
"epoch": 16.492537313432837,
"grad_norm": 0.5101869106292725,
"learning_rate": 9.089144200349685e-05,
"loss": 0.3134,
"num_input_tokens_seen": 2513488,
"step": 8840
},
{
"epoch": 16.50186567164179,
"grad_norm": 0.38828736543655396,
"learning_rate": 9.042397785550405e-05,
"loss": 0.1767,
"num_input_tokens_seen": 2514768,
"step": 8845
},
{
"epoch": 16.511194029850746,
"grad_norm": 0.5241112112998962,
"learning_rate": 8.995759938189884e-05,
"loss": 0.3342,
"num_input_tokens_seen": 2516560,
"step": 8850
},
{
"epoch": 16.520522388059703,
"grad_norm": 0.47178515791893005,
"learning_rate": 8.949230781892287e-05,
"loss": 0.1936,
"num_input_tokens_seen": 2518064,
"step": 8855
},
{
"epoch": 16.529850746268657,
"grad_norm": 0.7868510484695435,
"learning_rate": 8.902810439993752e-05,
"loss": 0.3258,
"num_input_tokens_seen": 2519568,
"step": 8860
},
{
"epoch": 16.53917910447761,
"grad_norm": 0.44424474239349365,
"learning_rate": 8.85649903554197e-05,
"loss": 0.177,
"num_input_tokens_seen": 2520912,
"step": 8865
},
{
"epoch": 16.548507462686565,
"grad_norm": 0.6200153231620789,
"learning_rate": 8.810296691295827e-05,
"loss": 0.3229,
"num_input_tokens_seen": 2522320,
"step": 8870
},
{
"epoch": 16.557835820895523,
"grad_norm": 0.6333581209182739,
"learning_rate": 8.764203529725152e-05,
"loss": 0.213,
"num_input_tokens_seen": 2523728,
"step": 8875
},
{
"epoch": 16.567164179104477,
"grad_norm": 0.40464335680007935,
"learning_rate": 8.71821967301033e-05,
"loss": 0.2427,
"num_input_tokens_seen": 2525104,
"step": 8880
},
{
"epoch": 16.576492537313435,
"grad_norm": 0.5083310604095459,
"learning_rate": 8.672345243042069e-05,
"loss": 0.2852,
"num_input_tokens_seen": 2526544,
"step": 8885
},
{
"epoch": 16.58582089552239,
"grad_norm": 0.3773159682750702,
"learning_rate": 8.626580361420955e-05,
"loss": 0.4596,
"num_input_tokens_seen": 2527952,
"step": 8890
},
{
"epoch": 16.595149253731343,
"grad_norm": 0.7448423504829407,
"learning_rate": 8.580925149457197e-05,
"loss": 0.1847,
"num_input_tokens_seen": 2529328,
"step": 8895
},
{
"epoch": 16.604477611940297,
"grad_norm": 0.6146315336227417,
"learning_rate": 8.535379728170356e-05,
"loss": 0.2781,
"num_input_tokens_seen": 2530736,
"step": 8900
},
{
"epoch": 16.613805970149254,
"grad_norm": 0.4092324376106262,
"learning_rate": 8.489944218288909e-05,
"loss": 0.2197,
"num_input_tokens_seen": 2532208,
"step": 8905
},
{
"epoch": 16.62313432835821,
"grad_norm": 0.4566566050052643,
"learning_rate": 8.444618740249998e-05,
"loss": 0.177,
"num_input_tokens_seen": 2533616,
"step": 8910
},
{
"epoch": 16.632462686567163,
"grad_norm": 0.7507036924362183,
"learning_rate": 8.399403414199114e-05,
"loss": 0.2945,
"num_input_tokens_seen": 2535024,
"step": 8915
},
{
"epoch": 16.64179104477612,
"grad_norm": 0.49269193410873413,
"learning_rate": 8.354298359989776e-05,
"loss": 0.2004,
"num_input_tokens_seen": 2536368,
"step": 8920
},
{
"epoch": 16.651119402985074,
"grad_norm": 0.5462902188301086,
"learning_rate": 8.309303697183179e-05,
"loss": 0.2524,
"num_input_tokens_seen": 2537712,
"step": 8925
},
{
"epoch": 16.66044776119403,
"grad_norm": 0.7016326785087585,
"learning_rate": 8.264419545047891e-05,
"loss": 0.2144,
"num_input_tokens_seen": 2539024,
"step": 8930
},
{
"epoch": 16.669776119402986,
"grad_norm": 0.8711559176445007,
"learning_rate": 8.219646022559597e-05,
"loss": 0.2867,
"num_input_tokens_seen": 2540240,
"step": 8935
},
{
"epoch": 16.67910447761194,
"grad_norm": 0.4443308115005493,
"learning_rate": 8.174983248400674e-05,
"loss": 0.3432,
"num_input_tokens_seen": 2541584,
"step": 8940
},
{
"epoch": 16.688432835820894,
"grad_norm": 0.44244229793548584,
"learning_rate": 8.130431340959982e-05,
"loss": 0.2659,
"num_input_tokens_seen": 2543024,
"step": 8945
},
{
"epoch": 16.69776119402985,
"grad_norm": 0.4805716574192047,
"learning_rate": 8.08599041833245e-05,
"loss": 0.274,
"num_input_tokens_seen": 2544528,
"step": 8950
},
{
"epoch": 16.707089552238806,
"grad_norm": 0.35907986760139465,
"learning_rate": 8.041660598318889e-05,
"loss": 0.238,
"num_input_tokens_seen": 2546064,
"step": 8955
},
{
"epoch": 16.71641791044776,
"grad_norm": 0.597048282623291,
"learning_rate": 7.997441998425553e-05,
"loss": 0.3557,
"num_input_tokens_seen": 2547472,
"step": 8960
},
{
"epoch": 16.725746268656717,
"grad_norm": 0.500659167766571,
"learning_rate": 7.953334735863881e-05,
"loss": 0.3487,
"num_input_tokens_seen": 2549040,
"step": 8965
},
{
"epoch": 16.73507462686567,
"grad_norm": 0.5837845206260681,
"learning_rate": 7.909338927550225e-05,
"loss": 0.1703,
"num_input_tokens_seen": 2550480,
"step": 8970
},
{
"epoch": 16.744402985074625,
"grad_norm": 0.4150075912475586,
"learning_rate": 7.865454690105473e-05,
"loss": 0.2964,
"num_input_tokens_seen": 2551888,
"step": 8975
},
{
"epoch": 16.753731343283583,
"grad_norm": 0.616115927696228,
"learning_rate": 7.821682139854758e-05,
"loss": 0.2685,
"num_input_tokens_seen": 2553232,
"step": 8980
},
{
"epoch": 16.763059701492537,
"grad_norm": 0.3962574899196625,
"learning_rate": 7.778021392827211e-05,
"loss": 0.3528,
"num_input_tokens_seen": 2554640,
"step": 8985
},
{
"epoch": 16.77238805970149,
"grad_norm": 0.5545009970664978,
"learning_rate": 7.734472564755551e-05,
"loss": 0.231,
"num_input_tokens_seen": 2556112,
"step": 8990
},
{
"epoch": 16.78171641791045,
"grad_norm": 0.46232175827026367,
"learning_rate": 7.691035771075855e-05,
"loss": 0.3658,
"num_input_tokens_seen": 2557584,
"step": 8995
},
{
"epoch": 16.791044776119403,
"grad_norm": 0.6394046545028687,
"learning_rate": 7.64771112692721e-05,
"loss": 0.2446,
"num_input_tokens_seen": 2559184,
"step": 9000
},
{
"epoch": 16.800373134328357,
"grad_norm": 0.34025850892066956,
"learning_rate": 7.604498747151456e-05,
"loss": 0.2375,
"num_input_tokens_seen": 2560688,
"step": 9005
},
{
"epoch": 16.809701492537314,
"grad_norm": 0.42679092288017273,
"learning_rate": 7.56139874629283e-05,
"loss": 0.2571,
"num_input_tokens_seen": 2562224,
"step": 9010
},
{
"epoch": 16.81902985074627,
"grad_norm": 0.512681245803833,
"learning_rate": 7.518411238597667e-05,
"loss": 0.1278,
"num_input_tokens_seen": 2563568,
"step": 9015
},
{
"epoch": 16.828358208955223,
"grad_norm": 0.8411496877670288,
"learning_rate": 7.475536338014155e-05,
"loss": 0.262,
"num_input_tokens_seen": 2565104,
"step": 9020
},
{
"epoch": 16.83768656716418,
"grad_norm": 0.45846620202064514,
"learning_rate": 7.432774158191946e-05,
"loss": 0.2383,
"num_input_tokens_seen": 2566512,
"step": 9025
},
{
"epoch": 16.847014925373134,
"grad_norm": 0.6833055019378662,
"learning_rate": 7.390124812481957e-05,
"loss": 0.2417,
"num_input_tokens_seen": 2568048,
"step": 9030
},
{
"epoch": 16.85634328358209,
"grad_norm": 0.5997596383094788,
"learning_rate": 7.347588413935935e-05,
"loss": 0.3637,
"num_input_tokens_seen": 2569392,
"step": 9035
},
{
"epoch": 16.865671641791046,
"grad_norm": 0.6235828995704651,
"learning_rate": 7.305165075306297e-05,
"loss": 0.3092,
"num_input_tokens_seen": 2570704,
"step": 9040
},
{
"epoch": 16.875,
"grad_norm": 0.5785413980484009,
"learning_rate": 7.262854909045774e-05,
"loss": 0.325,
"num_input_tokens_seen": 2572336,
"step": 9045
},
{
"epoch": 16.884328358208954,
"grad_norm": 0.4815627336502075,
"learning_rate": 7.220658027307036e-05,
"loss": 0.2405,
"num_input_tokens_seen": 2573680,
"step": 9050
},
{
"epoch": 16.89365671641791,
"grad_norm": 0.2667815089225769,
"learning_rate": 7.178574541942545e-05,
"loss": 0.3088,
"num_input_tokens_seen": 2575120,
"step": 9055
},
{
"epoch": 16.902985074626866,
"grad_norm": 0.5672833323478699,
"learning_rate": 7.136604564504134e-05,
"loss": 0.2136,
"num_input_tokens_seen": 2576400,
"step": 9060
},
{
"epoch": 16.91231343283582,
"grad_norm": 0.7024040818214417,
"learning_rate": 7.094748206242796e-05,
"loss": 0.316,
"num_input_tokens_seen": 2578320,
"step": 9065
},
{
"epoch": 16.921641791044777,
"grad_norm": 0.6449324488639832,
"learning_rate": 7.053005578108296e-05,
"loss": 0.2807,
"num_input_tokens_seen": 2579888,
"step": 9070
},
{
"epoch": 16.93097014925373,
"grad_norm": 0.6066656112670898,
"learning_rate": 7.01137679074897e-05,
"loss": 0.2749,
"num_input_tokens_seen": 2581296,
"step": 9075
},
{
"epoch": 16.940298507462686,
"grad_norm": 0.7407752871513367,
"learning_rate": 6.969861954511409e-05,
"loss": 0.2132,
"num_input_tokens_seen": 2582768,
"step": 9080
},
{
"epoch": 16.949626865671643,
"grad_norm": 0.3702264130115509,
"learning_rate": 6.928461179440109e-05,
"loss": 0.1839,
"num_input_tokens_seen": 2584144,
"step": 9085
},
{
"epoch": 16.958955223880597,
"grad_norm": 0.6591739654541016,
"learning_rate": 6.887174575277239e-05,
"loss": 0.2496,
"num_input_tokens_seen": 2585456,
"step": 9090
},
{
"epoch": 16.96828358208955,
"grad_norm": 0.8507482409477234,
"learning_rate": 6.846002251462324e-05,
"loss": 0.306,
"num_input_tokens_seen": 2586768,
"step": 9095
},
{
"epoch": 16.97761194029851,
"grad_norm": 0.6380784511566162,
"learning_rate": 6.804944317131995e-05,
"loss": 0.4314,
"num_input_tokens_seen": 2588016,
"step": 9100
},
{
"epoch": 16.986940298507463,
"grad_norm": 0.3737356662750244,
"learning_rate": 6.76400088111963e-05,
"loss": 0.3087,
"num_input_tokens_seen": 2589520,
"step": 9105
},
{
"epoch": 16.996268656716417,
"grad_norm": 0.6403619050979614,
"learning_rate": 6.723172051955101e-05,
"loss": 0.124,
"num_input_tokens_seen": 2591120,
"step": 9110
},
{
"epoch": 17.0,
"eval_loss": 0.8633560538291931,
"eval_runtime": 4.6195,
"eval_samples_per_second": 51.521,
"eval_steps_per_second": 12.988,
"num_input_tokens_seen": 2591432,
"step": 9112
},
{
"epoch": 17.005597014925375,
"grad_norm": 0.4501326084136963,
"learning_rate": 6.682457937864538e-05,
"loss": 0.3698,
"num_input_tokens_seen": 2592296,
"step": 9115
},
{
"epoch": 17.01492537313433,
"grad_norm": 0.48867860436439514,
"learning_rate": 6.641858646769938e-05,
"loss": 0.3062,
"num_input_tokens_seen": 2593704,
"step": 9120
},
{
"epoch": 17.024253731343283,
"grad_norm": 0.6671390533447266,
"learning_rate": 6.601374286288963e-05,
"loss": 0.2267,
"num_input_tokens_seen": 2595272,
"step": 9125
},
{
"epoch": 17.03358208955224,
"grad_norm": 0.4498516619205475,
"learning_rate": 6.561004963734595e-05,
"loss": 0.2633,
"num_input_tokens_seen": 2596712,
"step": 9130
},
{
"epoch": 17.042910447761194,
"grad_norm": 0.5411965250968933,
"learning_rate": 6.520750786114938e-05,
"loss": 0.3891,
"num_input_tokens_seen": 2598152,
"step": 9135
},
{
"epoch": 17.05223880597015,
"grad_norm": 0.28718677163124084,
"learning_rate": 6.480611860132824e-05,
"loss": 0.3361,
"num_input_tokens_seen": 2599624,
"step": 9140
},
{
"epoch": 17.061567164179106,
"grad_norm": 1.2912830114364624,
"learning_rate": 6.440588292185595e-05,
"loss": 0.211,
"num_input_tokens_seen": 2601160,
"step": 9145
},
{
"epoch": 17.07089552238806,
"grad_norm": 0.38549405336380005,
"learning_rate": 6.400680188364844e-05,
"loss": 0.2075,
"num_input_tokens_seen": 2602536,
"step": 9150
},
{
"epoch": 17.080223880597014,
"grad_norm": 0.8170197010040283,
"learning_rate": 6.360887654456065e-05,
"loss": 0.2086,
"num_input_tokens_seen": 2604040,
"step": 9155
},
{
"epoch": 17.08955223880597,
"grad_norm": 0.3383115530014038,
"learning_rate": 6.321210795938403e-05,
"loss": 0.1943,
"num_input_tokens_seen": 2605352,
"step": 9160
},
{
"epoch": 17.098880597014926,
"grad_norm": 0.7286571264266968,
"learning_rate": 6.281649717984417e-05,
"loss": 0.2104,
"num_input_tokens_seen": 2606696,
"step": 9165
},
{
"epoch": 17.10820895522388,
"grad_norm": 0.5768951773643494,
"learning_rate": 6.242204525459738e-05,
"loss": 0.2729,
"num_input_tokens_seen": 2608168,
"step": 9170
},
{
"epoch": 17.117537313432837,
"grad_norm": 0.5091217756271362,
"learning_rate": 6.202875322922808e-05,
"loss": 0.3402,
"num_input_tokens_seen": 2609608,
"step": 9175
},
{
"epoch": 17.12686567164179,
"grad_norm": 0.4321063160896301,
"learning_rate": 6.163662214624616e-05,
"loss": 0.2309,
"num_input_tokens_seen": 2610920,
"step": 9180
},
{
"epoch": 17.136194029850746,
"grad_norm": 0.7079524993896484,
"learning_rate": 6.12456530450844e-05,
"loss": 0.1885,
"num_input_tokens_seen": 2612200,
"step": 9185
},
{
"epoch": 17.145522388059703,
"grad_norm": 0.5813358426094055,
"learning_rate": 6.0855846962095285e-05,
"loss": 0.245,
"num_input_tokens_seen": 2613544,
"step": 9190
},
{
"epoch": 17.154850746268657,
"grad_norm": 0.5503816604614258,
"learning_rate": 6.0467204930548357e-05,
"loss": 0.2032,
"num_input_tokens_seen": 2614888,
"step": 9195
},
{
"epoch": 17.16417910447761,
"grad_norm": 0.43504875898361206,
"learning_rate": 6.007972798062783e-05,
"loss": 0.1711,
"num_input_tokens_seen": 2616488,
"step": 9200
},
{
"epoch": 17.17350746268657,
"grad_norm": 0.41562536358833313,
"learning_rate": 5.96934171394295e-05,
"loss": 0.2347,
"num_input_tokens_seen": 2617800,
"step": 9205
},
{
"epoch": 17.182835820895523,
"grad_norm": 0.33625268936157227,
"learning_rate": 5.930827343095801e-05,
"loss": 0.2238,
"num_input_tokens_seen": 2619080,
"step": 9210
},
{
"epoch": 17.192164179104477,
"grad_norm": 0.608429491519928,
"learning_rate": 5.8924297876124246e-05,
"loss": 0.3715,
"num_input_tokens_seen": 2620520,
"step": 9215
},
{
"epoch": 17.20149253731343,
"grad_norm": 0.8513853549957275,
"learning_rate": 5.854149149274296e-05,
"loss": 0.3546,
"num_input_tokens_seen": 2621896,
"step": 9220
},
{
"epoch": 17.21082089552239,
"grad_norm": 0.42540302872657776,
"learning_rate": 5.815985529552942e-05,
"loss": 0.2143,
"num_input_tokens_seen": 2623176,
"step": 9225
},
{
"epoch": 17.220149253731343,
"grad_norm": 0.7629372477531433,
"learning_rate": 5.777939029609708e-05,
"loss": 0.2484,
"num_input_tokens_seen": 2624584,
"step": 9230
},
{
"epoch": 17.229477611940297,
"grad_norm": 0.6999621391296387,
"learning_rate": 5.740009750295505e-05,
"loss": 0.3354,
"num_input_tokens_seen": 2626120,
"step": 9235
},
{
"epoch": 17.238805970149254,
"grad_norm": 1.2121412754058838,
"learning_rate": 5.7021977921505156e-05,
"loss": 0.1779,
"num_input_tokens_seen": 2627624,
"step": 9240
},
{
"epoch": 17.24813432835821,
"grad_norm": 0.6345969438552856,
"learning_rate": 5.664503255403924e-05,
"loss": 0.3501,
"num_input_tokens_seen": 2629000,
"step": 9245
},
{
"epoch": 17.257462686567163,
"grad_norm": 0.36961647868156433,
"learning_rate": 5.626926239973668e-05,
"loss": 0.1767,
"num_input_tokens_seen": 2630504,
"step": 9250
},
{
"epoch": 17.26679104477612,
"grad_norm": 0.4116383194923401,
"learning_rate": 5.589466845466179e-05,
"loss": 0.2798,
"num_input_tokens_seen": 2631880,
"step": 9255
},
{
"epoch": 17.276119402985074,
"grad_norm": 0.8709968328475952,
"learning_rate": 5.5521251711761256e-05,
"loss": 0.2116,
"num_input_tokens_seen": 2633288,
"step": 9260
},
{
"epoch": 17.28544776119403,
"grad_norm": 0.567836344242096,
"learning_rate": 5.5149013160860575e-05,
"loss": 0.2808,
"num_input_tokens_seen": 2634824,
"step": 9265
},
{
"epoch": 17.294776119402986,
"grad_norm": 0.6120123267173767,
"learning_rate": 5.477795378866307e-05,
"loss": 0.2145,
"num_input_tokens_seen": 2636136,
"step": 9270
},
{
"epoch": 17.30410447761194,
"grad_norm": 0.6449649930000305,
"learning_rate": 5.4408074578745806e-05,
"loss": 0.2289,
"num_input_tokens_seen": 2637480,
"step": 9275
},
{
"epoch": 17.313432835820894,
"grad_norm": 0.38337478041648865,
"learning_rate": 5.403937651155771e-05,
"loss": 0.229,
"num_input_tokens_seen": 2639080,
"step": 9280
},
{
"epoch": 17.32276119402985,
"grad_norm": 0.49158769845962524,
"learning_rate": 5.367186056441703e-05,
"loss": 0.2575,
"num_input_tokens_seen": 2640616,
"step": 9285
},
{
"epoch": 17.332089552238806,
"grad_norm": 0.3472841680049896,
"learning_rate": 5.3305527711508205e-05,
"loss": 0.1758,
"num_input_tokens_seen": 2642184,
"step": 9290
},
{
"epoch": 17.34141791044776,
"grad_norm": 0.29027411341667175,
"learning_rate": 5.294037892387998e-05,
"loss": 0.1465,
"num_input_tokens_seen": 2643784,
"step": 9295
},
{
"epoch": 17.350746268656717,
"grad_norm": 0.4464806616306305,
"learning_rate": 5.2576415169441895e-05,
"loss": 0.2601,
"num_input_tokens_seen": 2645160,
"step": 9300
},
{
"epoch": 17.36007462686567,
"grad_norm": 0.34677013754844666,
"learning_rate": 5.221363741296298e-05,
"loss": 0.2752,
"num_input_tokens_seen": 2646760,
"step": 9305
},
{
"epoch": 17.369402985074625,
"grad_norm": 0.7161241173744202,
"learning_rate": 5.18520466160679e-05,
"loss": 0.2292,
"num_input_tokens_seen": 2648264,
"step": 9310
},
{
"epoch": 17.378731343283583,
"grad_norm": 0.3472307622432709,
"learning_rate": 5.149164373723558e-05,
"loss": 0.1797,
"num_input_tokens_seen": 2649864,
"step": 9315
},
{
"epoch": 17.388059701492537,
"grad_norm": 0.6642903089523315,
"learning_rate": 5.113242973179566e-05,
"loss": 0.3569,
"num_input_tokens_seen": 2651272,
"step": 9320
},
{
"epoch": 17.39738805970149,
"grad_norm": 0.6346752047538757,
"learning_rate": 5.077440555192647e-05,
"loss": 0.2407,
"num_input_tokens_seen": 2652552,
"step": 9325
},
{
"epoch": 17.40671641791045,
"grad_norm": 0.7212795615196228,
"learning_rate": 5.0417572146652825e-05,
"loss": 0.1955,
"num_input_tokens_seen": 2654056,
"step": 9330
},
{
"epoch": 17.416044776119403,
"grad_norm": 0.5936789512634277,
"learning_rate": 5.0061930461842375e-05,
"loss": 0.2506,
"num_input_tokens_seen": 2655592,
"step": 9335
},
{
"epoch": 17.425373134328357,
"grad_norm": 0.45224127173423767,
"learning_rate": 4.9707481440204486e-05,
"loss": 0.1789,
"num_input_tokens_seen": 2657064,
"step": 9340
},
{
"epoch": 17.434701492537314,
"grad_norm": 0.6721286773681641,
"learning_rate": 4.9354226021286975e-05,
"loss": 0.2685,
"num_input_tokens_seen": 2658664,
"step": 9345
},
{
"epoch": 17.44402985074627,
"grad_norm": 0.2122386246919632,
"learning_rate": 4.900216514147365e-05,
"loss": 0.3693,
"num_input_tokens_seen": 2660232,
"step": 9350
},
{
"epoch": 17.453358208955223,
"grad_norm": 0.29205867648124695,
"learning_rate": 4.8651299733981855e-05,
"loss": 0.2109,
"num_input_tokens_seen": 2661704,
"step": 9355
},
{
"epoch": 17.46268656716418,
"grad_norm": 1.0239243507385254,
"learning_rate": 4.830163072886007e-05,
"loss": 0.2698,
"num_input_tokens_seen": 2662984,
"step": 9360
},
{
"epoch": 17.472014925373134,
"grad_norm": 0.6300740242004395,
"learning_rate": 4.7953159052985693e-05,
"loss": 0.3608,
"num_input_tokens_seen": 2664392,
"step": 9365
},
{
"epoch": 17.48134328358209,
"grad_norm": 0.7175711989402771,
"learning_rate": 4.760588563006207e-05,
"loss": 0.1967,
"num_input_tokens_seen": 2665960,
"step": 9370
},
{
"epoch": 17.490671641791046,
"grad_norm": 0.47732552886009216,
"learning_rate": 4.725981138061625e-05,
"loss": 0.257,
"num_input_tokens_seen": 2667528,
"step": 9375
},
{
"epoch": 17.5,
"grad_norm": 0.7046051025390625,
"learning_rate": 4.691493722199697e-05,
"loss": 0.2495,
"num_input_tokens_seen": 2668904,
"step": 9380
},
{
"epoch": 17.509328358208954,
"grad_norm": 0.5751326680183411,
"learning_rate": 4.657126406837148e-05,
"loss": 0.2772,
"num_input_tokens_seen": 2670536,
"step": 9385
},
{
"epoch": 17.51865671641791,
"grad_norm": 0.437429279088974,
"learning_rate": 4.622879283072368e-05,
"loss": 0.2323,
"num_input_tokens_seen": 2671816,
"step": 9390
},
{
"epoch": 17.527985074626866,
"grad_norm": 0.35734742879867554,
"learning_rate": 4.588752441685129e-05,
"loss": 0.1012,
"num_input_tokens_seen": 2673288,
"step": 9395
},
{
"epoch": 17.53731343283582,
"grad_norm": 0.6184195280075073,
"learning_rate": 4.554745973136409e-05,
"loss": 0.3341,
"num_input_tokens_seen": 2674664,
"step": 9400
},
{
"epoch": 17.546641791044777,
"grad_norm": 0.9945972561836243,
"learning_rate": 4.5208599675680754e-05,
"loss": 0.2572,
"num_input_tokens_seen": 2676040,
"step": 9405
},
{
"epoch": 17.55597014925373,
"grad_norm": 0.6488131880760193,
"learning_rate": 4.487094514802686e-05,
"loss": 0.1918,
"num_input_tokens_seen": 2677416,
"step": 9410
},
{
"epoch": 17.565298507462686,
"grad_norm": 0.5945156216621399,
"learning_rate": 4.4534497043432655e-05,
"loss": 0.2109,
"num_input_tokens_seen": 2678792,
"step": 9415
},
{
"epoch": 17.574626865671643,
"grad_norm": 0.5408649444580078,
"learning_rate": 4.419925625373028e-05,
"loss": 0.1932,
"num_input_tokens_seen": 2680232,
"step": 9420
},
{
"epoch": 17.583955223880597,
"grad_norm": 0.7728055119514465,
"learning_rate": 4.386522366755169e-05,
"loss": 0.3155,
"num_input_tokens_seen": 2681640,
"step": 9425
},
{
"epoch": 17.59328358208955,
"grad_norm": 0.6577746272087097,
"learning_rate": 4.353240017032611e-05,
"loss": 0.3126,
"num_input_tokens_seen": 2682984,
"step": 9430
},
{
"epoch": 17.60261194029851,
"grad_norm": 0.6137169599533081,
"learning_rate": 4.3200786644278064e-05,
"loss": 0.3288,
"num_input_tokens_seen": 2684456,
"step": 9435
},
{
"epoch": 17.611940298507463,
"grad_norm": 0.4128672778606415,
"learning_rate": 4.287038396842463e-05,
"loss": 0.1679,
"num_input_tokens_seen": 2685992,
"step": 9440
},
{
"epoch": 17.621268656716417,
"grad_norm": 0.5442803502082825,
"learning_rate": 4.254119301857301e-05,
"loss": 0.3345,
"num_input_tokens_seen": 2687368,
"step": 9445
},
{
"epoch": 17.630597014925375,
"grad_norm": 0.412265807390213,
"learning_rate": 4.2213214667318925e-05,
"loss": 0.1516,
"num_input_tokens_seen": 2688840,
"step": 9450
},
{
"epoch": 17.63992537313433,
"grad_norm": 0.7147823572158813,
"learning_rate": 4.188644978404349e-05,
"loss": 0.2244,
"num_input_tokens_seen": 2690280,
"step": 9455
},
{
"epoch": 17.649253731343283,
"grad_norm": 0.6293306350708008,
"learning_rate": 4.156089923491124e-05,
"loss": 0.2742,
"num_input_tokens_seen": 2691720,
"step": 9460
},
{
"epoch": 17.65858208955224,
"grad_norm": 0.7018571496009827,
"learning_rate": 4.1236563882868116e-05,
"loss": 0.2319,
"num_input_tokens_seen": 2693064,
"step": 9465
},
{
"epoch": 17.667910447761194,
"grad_norm": 0.5599743723869324,
"learning_rate": 4.091344458763863e-05,
"loss": 0.3065,
"num_input_tokens_seen": 2694344,
"step": 9470
},
{
"epoch": 17.67723880597015,
"grad_norm": 0.3867090046405792,
"learning_rate": 4.0591542205723975e-05,
"loss": 0.3248,
"num_input_tokens_seen": 2695816,
"step": 9475
},
{
"epoch": 17.686567164179106,
"grad_norm": 0.7426350712776184,
"learning_rate": 4.02708575903995e-05,
"loss": 0.3151,
"num_input_tokens_seen": 2697192,
"step": 9480
},
{
"epoch": 17.69589552238806,
"grad_norm": 0.5198386907577515,
"learning_rate": 3.995139159171296e-05,
"loss": 0.2335,
"num_input_tokens_seen": 2698696,
"step": 9485
},
{
"epoch": 17.705223880597014,
"grad_norm": 0.5265973210334778,
"learning_rate": 3.963314505648141e-05,
"loss": 0.3741,
"num_input_tokens_seen": 2700200,
"step": 9490
},
{
"epoch": 17.71455223880597,
"grad_norm": 0.4689077138900757,
"learning_rate": 3.931611882828967e-05,
"loss": 0.1904,
"num_input_tokens_seen": 2701800,
"step": 9495
},
{
"epoch": 17.723880597014926,
"grad_norm": 0.6163126826286316,
"learning_rate": 3.900031374748797e-05,
"loss": 0.1411,
"num_input_tokens_seen": 2703208,
"step": 9500
},
{
"epoch": 17.73320895522388,
"grad_norm": 0.7090937495231628,
"learning_rate": 3.868573065118935e-05,
"loss": 0.3838,
"num_input_tokens_seen": 2704744,
"step": 9505
},
{
"epoch": 17.742537313432837,
"grad_norm": 0.4497239589691162,
"learning_rate": 3.837237037326813e-05,
"loss": 0.3361,
"num_input_tokens_seen": 2706120,
"step": 9510
},
{
"epoch": 17.75186567164179,
"grad_norm": 0.4925253391265869,
"learning_rate": 3.806023374435663e-05,
"loss": 0.2392,
"num_input_tokens_seen": 2707656,
"step": 9515
},
{
"epoch": 17.761194029850746,
"grad_norm": 0.22657965123653412,
"learning_rate": 3.774932159184413e-05,
"loss": 0.156,
"num_input_tokens_seen": 2709320,
"step": 9520
},
{
"epoch": 17.770522388059703,
"grad_norm": 0.8829783201217651,
"learning_rate": 3.7439634739874165e-05,
"loss": 0.3334,
"num_input_tokens_seen": 2710728,
"step": 9525
},
{
"epoch": 17.779850746268657,
"grad_norm": 0.6137654781341553,
"learning_rate": 3.7131174009341794e-05,
"loss": 0.2255,
"num_input_tokens_seen": 2712136,
"step": 9530
},
{
"epoch": 17.78917910447761,
"grad_norm": 0.8342494964599609,
"learning_rate": 3.682394021789259e-05,
"loss": 0.3316,
"num_input_tokens_seen": 2713672,
"step": 9535
},
{
"epoch": 17.798507462686565,
"grad_norm": 0.3486193120479584,
"learning_rate": 3.65179341799195e-05,
"loss": 0.3042,
"num_input_tokens_seen": 2715048,
"step": 9540
},
{
"epoch": 17.807835820895523,
"grad_norm": 0.6700887680053711,
"learning_rate": 3.6213156706561166e-05,
"loss": 0.3171,
"num_input_tokens_seen": 2716360,
"step": 9545
},
{
"epoch": 17.817164179104477,
"grad_norm": 0.6130694150924683,
"learning_rate": 3.590960860569959e-05,
"loss": 0.3269,
"num_input_tokens_seen": 2717928,
"step": 9550
},
{
"epoch": 17.826492537313435,
"grad_norm": 0.5257990956306458,
"learning_rate": 3.5607290681957894e-05,
"loss": 0.2473,
"num_input_tokens_seen": 2719304,
"step": 9555
},
{
"epoch": 17.83582089552239,
"grad_norm": 0.7420773506164551,
"learning_rate": 3.5306203736698685e-05,
"loss": 0.3058,
"num_input_tokens_seen": 2720680,
"step": 9560
},
{
"epoch": 17.845149253731343,
"grad_norm": 0.6225281953811646,
"learning_rate": 3.500634856802132e-05,
"loss": 0.2302,
"num_input_tokens_seen": 2722056,
"step": 9565
},
{
"epoch": 17.854477611940297,
"grad_norm": 0.4176914095878601,
"learning_rate": 3.4707725970760054e-05,
"loss": 0.1842,
"num_input_tokens_seen": 2723592,
"step": 9570
},
{
"epoch": 17.863805970149254,
"grad_norm": 0.6423101425170898,
"learning_rate": 3.441033673648197e-05,
"loss": 0.1773,
"num_input_tokens_seen": 2724808,
"step": 9575
},
{
"epoch": 17.87313432835821,
"grad_norm": 0.6204359531402588,
"learning_rate": 3.411418165348501e-05,
"loss": 0.2692,
"num_input_tokens_seen": 2726216,
"step": 9580
},
{
"epoch": 17.882462686567163,
"grad_norm": 0.7418746948242188,
"learning_rate": 3.381926150679543e-05,
"loss": 0.2128,
"num_input_tokens_seen": 2727528,
"step": 9585
},
{
"epoch": 17.89179104477612,
"grad_norm": 0.49704796075820923,
"learning_rate": 3.352557707816617e-05,
"loss": 0.1486,
"num_input_tokens_seen": 2728936,
"step": 9590
},
{
"epoch": 17.901119402985074,
"grad_norm": 0.4309185743331909,
"learning_rate": 3.323312914607468e-05,
"loss": 0.1939,
"num_input_tokens_seen": 2730600,
"step": 9595
},
{
"epoch": 17.91044776119403,
"grad_norm": 0.7381911277770996,
"learning_rate": 3.294191848572059e-05,
"loss": 0.2325,
"num_input_tokens_seen": 2732072,
"step": 9600
},
{
"epoch": 17.919776119402986,
"grad_norm": 0.5855839848518372,
"learning_rate": 3.265194586902404e-05,
"loss": 0.3232,
"num_input_tokens_seen": 2733704,
"step": 9605
},
{
"epoch": 17.92910447761194,
"grad_norm": 0.5614606142044067,
"learning_rate": 3.236321206462339e-05,
"loss": 0.2282,
"num_input_tokens_seen": 2735016,
"step": 9610
},
{
"epoch": 17.938432835820894,
"grad_norm": 0.8269302248954773,
"learning_rate": 3.207571783787328e-05,
"loss": 0.1947,
"num_input_tokens_seen": 2736424,
"step": 9615
},
{
"epoch": 17.94776119402985,
"grad_norm": 0.8580751419067383,
"learning_rate": 3.1789463950842476e-05,
"loss": 0.2183,
"num_input_tokens_seen": 2737864,
"step": 9620
},
{
"epoch": 17.957089552238806,
"grad_norm": 0.8197803497314453,
"learning_rate": 3.1504451162311986e-05,
"loss": 0.2768,
"num_input_tokens_seen": 2739016,
"step": 9625
},
{
"epoch": 17.96641791044776,
"grad_norm": 0.5299542546272278,
"learning_rate": 3.122068022777313e-05,
"loss": 0.3969,
"num_input_tokens_seen": 2740520,
"step": 9630
},
{
"epoch": 17.975746268656717,
"grad_norm": 0.42745083570480347,
"learning_rate": 3.093815189942523e-05,
"loss": 0.2228,
"num_input_tokens_seen": 2741800,
"step": 9635
},
{
"epoch": 17.98507462686567,
"grad_norm": 0.6629648804664612,
"learning_rate": 3.065686692617381e-05,
"loss": 0.2194,
"num_input_tokens_seen": 2743208,
"step": 9640
},
{
"epoch": 17.994402985074625,
"grad_norm": 0.4685826897621155,
"learning_rate": 3.037682605362879e-05,
"loss": 0.3369,
"num_input_tokens_seen": 2744424,
"step": 9645
},
{
"epoch": 18.0,
"eval_loss": 0.8991429805755615,
"eval_runtime": 4.2043,
"eval_samples_per_second": 56.609,
"eval_steps_per_second": 14.271,
"num_input_tokens_seen": 2744944,
"step": 9648
},
{
"epoch": 18.003731343283583,
"grad_norm": 0.5331962704658508,
"learning_rate": 3.0098030024102107e-05,
"loss": 0.1665,
"num_input_tokens_seen": 2745456,
"step": 9650
},
{
"epoch": 18.013059701492537,
"grad_norm": 0.40830346941947937,
"learning_rate": 2.9820479576606054e-05,
"loss": 0.2144,
"num_input_tokens_seen": 2747120,
"step": 9655
},
{
"epoch": 18.02238805970149,
"grad_norm": 0.6302569508552551,
"learning_rate": 2.954417544685112e-05,
"loss": 0.1607,
"num_input_tokens_seen": 2748528,
"step": 9660
},
{
"epoch": 18.03171641791045,
"grad_norm": 0.5230251550674438,
"learning_rate": 2.9269118367244385e-05,
"loss": 0.2282,
"num_input_tokens_seen": 2749744,
"step": 9665
},
{
"epoch": 18.041044776119403,
"grad_norm": 0.31676530838012695,
"learning_rate": 2.8995309066887076e-05,
"loss": 0.2259,
"num_input_tokens_seen": 2751280,
"step": 9670
},
{
"epoch": 18.050373134328357,
"grad_norm": 0.7035130858421326,
"learning_rate": 2.8722748271573064e-05,
"loss": 0.2814,
"num_input_tokens_seen": 2752656,
"step": 9675
},
{
"epoch": 18.059701492537314,
"grad_norm": 0.6157044172286987,
"learning_rate": 2.845143670378675e-05,
"loss": 0.2271,
"num_input_tokens_seen": 2754000,
"step": 9680
},
{
"epoch": 18.06902985074627,
"grad_norm": 0.22312885522842407,
"learning_rate": 2.8181375082701077e-05,
"loss": 0.241,
"num_input_tokens_seen": 2755440,
"step": 9685
},
{
"epoch": 18.078358208955223,
"grad_norm": 0.7368862628936768,
"learning_rate": 2.7912564124175866e-05,
"loss": 0.2336,
"num_input_tokens_seen": 2757008,
"step": 9690
},
{
"epoch": 18.08768656716418,
"grad_norm": 0.48901888728141785,
"learning_rate": 2.7645004540755525e-05,
"loss": 0.3062,
"num_input_tokens_seen": 2758224,
"step": 9695
},
{
"epoch": 18.097014925373134,
"grad_norm": 0.5629950165748596,
"learning_rate": 2.7378697041667676e-05,
"loss": 0.2175,
"num_input_tokens_seen": 2759600,
"step": 9700
},
{
"epoch": 18.10634328358209,
"grad_norm": 0.5760655999183655,
"learning_rate": 2.7113642332821043e-05,
"loss": 0.2869,
"num_input_tokens_seen": 2761072,
"step": 9705
},
{
"epoch": 18.115671641791046,
"grad_norm": 0.5357734560966492,
"learning_rate": 2.6849841116803218e-05,
"loss": 0.2062,
"num_input_tokens_seen": 2762608,
"step": 9710
},
{
"epoch": 18.125,
"grad_norm": 0.4605900049209595,
"learning_rate": 2.6587294092879354e-05,
"loss": 0.206,
"num_input_tokens_seen": 2764144,
"step": 9715
},
{
"epoch": 18.134328358208954,
"grad_norm": 0.8400527834892273,
"learning_rate": 2.632600195699014e-05,
"loss": 0.2989,
"num_input_tokens_seen": 2765424,
"step": 9720
},
{
"epoch": 18.14365671641791,
"grad_norm": 0.7730468511581421,
"learning_rate": 2.6065965401749602e-05,
"loss": 0.2935,
"num_input_tokens_seen": 2767024,
"step": 9725
},
{
"epoch": 18.152985074626866,
"grad_norm": 0.6380898952484131,
"learning_rate": 2.5807185116444033e-05,
"loss": 0.4573,
"num_input_tokens_seen": 2768432,
"step": 9730
},
{
"epoch": 18.16231343283582,
"grad_norm": 0.5300564765930176,
"learning_rate": 2.5549661787029167e-05,
"loss": 0.2651,
"num_input_tokens_seen": 2769968,
"step": 9735
},
{
"epoch": 18.171641791044777,
"grad_norm": 0.5630916357040405,
"learning_rate": 2.5293396096129406e-05,
"loss": 0.2353,
"num_input_tokens_seen": 2771280,
"step": 9740
},
{
"epoch": 18.18097014925373,
"grad_norm": 0.4602615237236023,
"learning_rate": 2.503838872303493e-05,
"loss": 0.2609,
"num_input_tokens_seen": 2772784,
"step": 9745
},
{
"epoch": 18.190298507462686,
"grad_norm": 0.33134764432907104,
"learning_rate": 2.4784640343701094e-05,
"loss": 0.1409,
"num_input_tokens_seen": 2774256,
"step": 9750
},
{
"epoch": 18.199626865671643,
"grad_norm": 0.597190797328949,
"learning_rate": 2.4532151630745403e-05,
"loss": 0.2962,
"num_input_tokens_seen": 2775856,
"step": 9755
},
{
"epoch": 18.208955223880597,
"grad_norm": 0.30851173400878906,
"learning_rate": 2.428092325344683e-05,
"loss": 0.1645,
"num_input_tokens_seen": 2777456,
"step": 9760
},
{
"epoch": 18.21828358208955,
"grad_norm": 0.5781135559082031,
"learning_rate": 2.4030955877743232e-05,
"loss": 0.2795,
"num_input_tokens_seen": 2778896,
"step": 9765
},
{
"epoch": 18.22761194029851,
"grad_norm": 0.6469405293464661,
"learning_rate": 2.3782250166229925e-05,
"loss": 0.274,
"num_input_tokens_seen": 2780400,
"step": 9770
},
{
"epoch": 18.236940298507463,
"grad_norm": 0.2876856327056885,
"learning_rate": 2.3534806778158113e-05,
"loss": 0.2551,
"num_input_tokens_seen": 2781968,
"step": 9775
},
{
"epoch": 18.246268656716417,
"grad_norm": 0.4758549630641937,
"learning_rate": 2.328862636943252e-05,
"loss": 0.1693,
"num_input_tokens_seen": 2783376,
"step": 9780
},
{
"epoch": 18.255597014925375,
"grad_norm": 0.8363102078437805,
"learning_rate": 2.3043709592610483e-05,
"loss": 0.1628,
"num_input_tokens_seen": 2784624,
"step": 9785
},
{
"epoch": 18.26492537313433,
"grad_norm": 0.6684785485267639,
"learning_rate": 2.280005709689964e-05,
"loss": 0.2323,
"num_input_tokens_seen": 2786128,
"step": 9790
},
{
"epoch": 18.274253731343283,
"grad_norm": 0.5245328545570374,
"learning_rate": 2.2557669528156245e-05,
"loss": 0.1595,
"num_input_tokens_seen": 2787440,
"step": 9795
},
{
"epoch": 18.28358208955224,
"grad_norm": 0.839958667755127,
"learning_rate": 2.2316547528883734e-05,
"loss": 0.4074,
"num_input_tokens_seen": 2788848,
"step": 9800
},
{
"epoch": 18.292910447761194,
"grad_norm": 0.4217190444469452,
"learning_rate": 2.207669173823068e-05,
"loss": 0.1433,
"num_input_tokens_seen": 2790448,
"step": 9805
},
{
"epoch": 18.30223880597015,
"grad_norm": 0.745557963848114,
"learning_rate": 2.1838102791989557e-05,
"loss": 0.2321,
"num_input_tokens_seen": 2791984,
"step": 9810
},
{
"epoch": 18.311567164179106,
"grad_norm": 0.7656036019325256,
"learning_rate": 2.160078132259452e-05,
"loss": 0.2635,
"num_input_tokens_seen": 2793360,
"step": 9815
},
{
"epoch": 18.32089552238806,
"grad_norm": 0.6078481078147888,
"learning_rate": 2.1364727959120088e-05,
"loss": 0.3943,
"num_input_tokens_seen": 2794608,
"step": 9820
},
{
"epoch": 18.330223880597014,
"grad_norm": 0.6271487474441528,
"learning_rate": 2.112994332727952e-05,
"loss": 0.23,
"num_input_tokens_seen": 2796048,
"step": 9825
},
{
"epoch": 18.33955223880597,
"grad_norm": 0.9702181220054626,
"learning_rate": 2.0896428049422765e-05,
"loss": 0.208,
"num_input_tokens_seen": 2797488,
"step": 9830
},
{
"epoch": 18.348880597014926,
"grad_norm": 0.5267967581748962,
"learning_rate": 2.0664182744535132e-05,
"loss": 0.2428,
"num_input_tokens_seen": 2798864,
"step": 9835
},
{
"epoch": 18.35820895522388,
"grad_norm": 0.4196302890777588,
"learning_rate": 2.0433208028235672e-05,
"loss": 0.1731,
"num_input_tokens_seen": 2800304,
"step": 9840
},
{
"epoch": 18.367537313432837,
"grad_norm": 0.8147212862968445,
"learning_rate": 2.020350451277536e-05,
"loss": 0.1959,
"num_input_tokens_seen": 2801584,
"step": 9845
},
{
"epoch": 18.37686567164179,
"grad_norm": 1.0722787380218506,
"learning_rate": 1.99750728070357e-05,
"loss": 0.4208,
"num_input_tokens_seen": 2802960,
"step": 9850
},
{
"epoch": 18.386194029850746,
"grad_norm": 0.6142206788063049,
"learning_rate": 1.9747913516526715e-05,
"loss": 0.3148,
"num_input_tokens_seen": 2804272,
"step": 9855
},
{
"epoch": 18.395522388059703,
"grad_norm": 0.9456634521484375,
"learning_rate": 1.952202724338592e-05,
"loss": 0.3173,
"num_input_tokens_seen": 2805520,
"step": 9860
},
{
"epoch": 18.404850746268657,
"grad_norm": 0.8432918787002563,
"learning_rate": 1.9297414586376184e-05,
"loss": 0.2247,
"num_input_tokens_seen": 2806960,
"step": 9865
},
{
"epoch": 18.41417910447761,
"grad_norm": 0.683684766292572,
"learning_rate": 1.907407614088441e-05,
"loss": 0.2114,
"num_input_tokens_seen": 2808432,
"step": 9870
},
{
"epoch": 18.423507462686565,
"grad_norm": 0.9373521208763123,
"learning_rate": 1.885201249891988e-05,
"loss": 0.2357,
"num_input_tokens_seen": 2810000,
"step": 9875
},
{
"epoch": 18.432835820895523,
"grad_norm": 0.38543543219566345,
"learning_rate": 1.8631224249112953e-05,
"loss": 0.2834,
"num_input_tokens_seen": 2811568,
"step": 9880
},
{
"epoch": 18.442164179104477,
"grad_norm": 0.4837745428085327,
"learning_rate": 1.841171197671293e-05,
"loss": 0.2146,
"num_input_tokens_seen": 2812816,
"step": 9885
},
{
"epoch": 18.451492537313435,
"grad_norm": 0.30426064133644104,
"learning_rate": 1.8193476263587084e-05,
"loss": 0.2971,
"num_input_tokens_seen": 2814480,
"step": 9890
},
{
"epoch": 18.46082089552239,
"grad_norm": 0.5340366959571838,
"learning_rate": 1.7976517688218786e-05,
"loss": 0.1861,
"num_input_tokens_seen": 2815856,
"step": 9895
},
{
"epoch": 18.470149253731343,
"grad_norm": 0.672731876373291,
"learning_rate": 1.7760836825706117e-05,
"loss": 0.328,
"num_input_tokens_seen": 2817328,
"step": 9900
},
{
"epoch": 18.479477611940297,
"grad_norm": 0.6022140383720398,
"learning_rate": 1.7546434247760147e-05,
"loss": 0.2673,
"num_input_tokens_seen": 2818608,
"step": 9905
},
{
"epoch": 18.488805970149254,
"grad_norm": 0.6115740537643433,
"learning_rate": 1.7333310522703814e-05,
"loss": 0.3055,
"num_input_tokens_seen": 2820080,
"step": 9910
},
{
"epoch": 18.49813432835821,
"grad_norm": 0.43116679787635803,
"learning_rate": 1.7121466215469893e-05,
"loss": 0.1638,
"num_input_tokens_seen": 2821296,
"step": 9915
},
{
"epoch": 18.507462686567163,
"grad_norm": 0.8029485940933228,
"learning_rate": 1.6910901887599917e-05,
"loss": 0.3756,
"num_input_tokens_seen": 2822704,
"step": 9920
},
{
"epoch": 18.51679104477612,
"grad_norm": 0.3002271354198456,
"learning_rate": 1.6701618097242522e-05,
"loss": 0.2539,
"num_input_tokens_seen": 2824432,
"step": 9925
},
{
"epoch": 18.526119402985074,
"grad_norm": 0.35039186477661133,
"learning_rate": 1.649361539915206e-05,
"loss": 0.1985,
"num_input_tokens_seen": 2825872,
"step": 9930
},
{
"epoch": 18.53544776119403,
"grad_norm": 0.6990351676940918,
"learning_rate": 1.628689434468694e-05,
"loss": 0.2503,
"num_input_tokens_seen": 2827184,
"step": 9935
},
{
"epoch": 18.544776119402986,
"grad_norm": 0.5490748882293701,
"learning_rate": 1.6081455481808226e-05,
"loss": 0.2516,
"num_input_tokens_seen": 2828720,
"step": 9940
},
{
"epoch": 18.55410447761194,
"grad_norm": 0.48887062072753906,
"learning_rate": 1.5877299355078534e-05,
"loss": 0.2402,
"num_input_tokens_seen": 2829968,
"step": 9945
},
{
"epoch": 18.563432835820894,
"grad_norm": 0.36090975999832153,
"learning_rate": 1.567442650565998e-05,
"loss": 0.1997,
"num_input_tokens_seen": 2831504,
"step": 9950
},
{
"epoch": 18.57276119402985,
"grad_norm": 0.32836994528770447,
"learning_rate": 1.5472837471313174e-05,
"loss": 0.1798,
"num_input_tokens_seen": 2832816,
"step": 9955
},
{
"epoch": 18.582089552238806,
"grad_norm": 0.5255143642425537,
"learning_rate": 1.5272532786395733e-05,
"loss": 0.1834,
"num_input_tokens_seen": 2834352,
"step": 9960
},
{
"epoch": 18.59141791044776,
"grad_norm": 0.6501192450523376,
"learning_rate": 1.5073512981860716e-05,
"loss": 0.2873,
"num_input_tokens_seen": 2835856,
"step": 9965
},
{
"epoch": 18.600746268656717,
"grad_norm": 0.6325224041938782,
"learning_rate": 1.4875778585255572e-05,
"loss": 0.2008,
"num_input_tokens_seen": 2837200,
"step": 9970
},
{
"epoch": 18.61007462686567,
"grad_norm": 0.42452865839004517,
"learning_rate": 1.4679330120720036e-05,
"loss": 0.1948,
"num_input_tokens_seen": 2838512,
"step": 9975
},
{
"epoch": 18.619402985074625,
"grad_norm": 0.2303168773651123,
"learning_rate": 1.4484168108985619e-05,
"loss": 0.1488,
"num_input_tokens_seen": 2840048,
"step": 9980
},
{
"epoch": 18.628731343283583,
"grad_norm": 0.7124575972557068,
"learning_rate": 1.429029306737345e-05,
"loss": 0.2161,
"num_input_tokens_seen": 2841360,
"step": 9985
},
{
"epoch": 18.638059701492537,
"grad_norm": 0.6218158006668091,
"learning_rate": 1.4097705509793612e-05,
"loss": 0.1754,
"num_input_tokens_seen": 2842672,
"step": 9990
},
{
"epoch": 18.64738805970149,
"grad_norm": 0.4380490183830261,
"learning_rate": 1.3906405946743028e-05,
"loss": 0.1943,
"num_input_tokens_seen": 2844144,
"step": 9995
},
{
"epoch": 18.65671641791045,
"grad_norm": 0.7543335556983948,
"learning_rate": 1.371639488530474e-05,
"loss": 0.2378,
"num_input_tokens_seen": 2845616,
"step": 10000
},
{
"epoch": 18.666044776119403,
"grad_norm": 0.6214406490325928,
"learning_rate": 1.3527672829146465e-05,
"loss": 0.3568,
"num_input_tokens_seen": 2847152,
"step": 10005
},
{
"epoch": 18.675373134328357,
"grad_norm": 0.8499645590782166,
"learning_rate": 1.3340240278518656e-05,
"loss": 0.1902,
"num_input_tokens_seen": 2848464,
"step": 10010
},
{
"epoch": 18.684701492537314,
"grad_norm": 0.31233519315719604,
"learning_rate": 1.3154097730254055e-05,
"loss": 0.1963,
"num_input_tokens_seen": 2850000,
"step": 10015
},
{
"epoch": 18.69402985074627,
"grad_norm": 0.5519477725028992,
"learning_rate": 1.2969245677765806e-05,
"loss": 0.2531,
"num_input_tokens_seen": 2851376,
"step": 10020
},
{
"epoch": 18.703358208955223,
"grad_norm": 0.6598090529441833,
"learning_rate": 1.2785684611046344e-05,
"loss": 0.1737,
"num_input_tokens_seen": 2852816,
"step": 10025
},
{
"epoch": 18.71268656716418,
"grad_norm": 0.41588467359542847,
"learning_rate": 1.2603415016665954e-05,
"loss": 0.2888,
"num_input_tokens_seen": 2854224,
"step": 10030
},
{
"epoch": 18.722014925373134,
"grad_norm": 0.9295200109481812,
"learning_rate": 1.24224373777716e-05,
"loss": 0.2034,
"num_input_tokens_seen": 2855536,
"step": 10035
},
{
"epoch": 18.73134328358209,
"grad_norm": 0.4858807623386383,
"learning_rate": 1.2242752174085824e-05,
"loss": 0.2194,
"num_input_tokens_seen": 2856976,
"step": 10040
},
{
"epoch": 18.740671641791046,
"grad_norm": 0.8125934600830078,
"learning_rate": 1.2064359881905018e-05,
"loss": 0.3345,
"num_input_tokens_seen": 2858480,
"step": 10045
},
{
"epoch": 18.75,
"grad_norm": 0.5733120441436768,
"learning_rate": 1.188726097409859e-05,
"loss": 0.194,
"num_input_tokens_seen": 2859888,
"step": 10050
},
{
"epoch": 18.759328358208954,
"grad_norm": 0.4118148982524872,
"learning_rate": 1.1711455920107306e-05,
"loss": 0.2242,
"num_input_tokens_seen": 2861488,
"step": 10055
},
{
"epoch": 18.76865671641791,
"grad_norm": 0.5921308994293213,
"learning_rate": 1.1536945185942615e-05,
"loss": 0.2137,
"num_input_tokens_seen": 2862864,
"step": 10060
},
{
"epoch": 18.777985074626866,
"grad_norm": 0.41935235261917114,
"learning_rate": 1.1363729234184827e-05,
"loss": 0.2219,
"num_input_tokens_seen": 2864400,
"step": 10065
},
{
"epoch": 18.78731343283582,
"grad_norm": 0.9974322319030762,
"learning_rate": 1.1191808523982217e-05,
"loss": 0.2545,
"num_input_tokens_seen": 2865648,
"step": 10070
},
{
"epoch": 18.796641791044777,
"grad_norm": 0.46737658977508545,
"learning_rate": 1.1021183511049748e-05,
"loss": 0.2224,
"num_input_tokens_seen": 2867120,
"step": 10075
},
{
"epoch": 18.80597014925373,
"grad_norm": 0.35736414790153503,
"learning_rate": 1.0851854647667803e-05,
"loss": 0.2584,
"num_input_tokens_seen": 2868464,
"step": 10080
},
{
"epoch": 18.815298507462686,
"grad_norm": 0.7535758018493652,
"learning_rate": 1.0683822382681008e-05,
"loss": 0.275,
"num_input_tokens_seen": 2869904,
"step": 10085
},
{
"epoch": 18.824626865671643,
"grad_norm": 0.32048988342285156,
"learning_rate": 1.051708716149713e-05,
"loss": 0.1825,
"num_input_tokens_seen": 2871248,
"step": 10090
},
{
"epoch": 18.833955223880597,
"grad_norm": 0.43274596333503723,
"learning_rate": 1.0351649426085852e-05,
"loss": 0.2755,
"num_input_tokens_seen": 2872496,
"step": 10095
},
{
"epoch": 18.84328358208955,
"grad_norm": 0.6226127743721008,
"learning_rate": 1.0187509614977387e-05,
"loss": 0.3063,
"num_input_tokens_seen": 2873872,
"step": 10100
},
{
"epoch": 18.85261194029851,
"grad_norm": 0.39895033836364746,
"learning_rate": 1.0024668163261641e-05,
"loss": 0.2383,
"num_input_tokens_seen": 2875376,
"step": 10105
},
{
"epoch": 18.861940298507463,
"grad_norm": 0.6880218982696533,
"learning_rate": 9.863125502587056e-06,
"loss": 0.277,
"num_input_tokens_seen": 2876848,
"step": 10110
},
{
"epoch": 18.871268656716417,
"grad_norm": 0.7496880292892456,
"learning_rate": 9.702882061159046e-06,
"loss": 0.1987,
"num_input_tokens_seen": 2878128,
"step": 10115
},
{
"epoch": 18.880597014925375,
"grad_norm": 0.6792168617248535,
"learning_rate": 9.543938263739338e-06,
"loss": 0.1663,
"num_input_tokens_seen": 2879728,
"step": 10120
},
{
"epoch": 18.88992537313433,
"grad_norm": 0.8453506827354431,
"learning_rate": 9.386294531644634e-06,
"loss": 0.2542,
"num_input_tokens_seen": 2881104,
"step": 10125
},
{
"epoch": 18.899253731343283,
"grad_norm": 0.7367790937423706,
"learning_rate": 9.229951282745507e-06,
"loss": 0.1282,
"num_input_tokens_seen": 2882576,
"step": 10130
},
{
"epoch": 18.90858208955224,
"grad_norm": 0.5763761401176453,
"learning_rate": 9.07490893146523e-06,
"loss": 0.2398,
"num_input_tokens_seen": 2884112,
"step": 10135
},
{
"epoch": 18.917910447761194,
"grad_norm": 0.7230084538459778,
"learning_rate": 8.921167888778836e-06,
"loss": 0.2872,
"num_input_tokens_seen": 2885360,
"step": 10140
},
{
"epoch": 18.92723880597015,
"grad_norm": 0.5383572578430176,
"learning_rate": 8.768728562211947e-06,
"loss": 0.1634,
"num_input_tokens_seen": 2886896,
"step": 10145
},
{
"epoch": 18.936567164179106,
"grad_norm": 0.20805750787258148,
"learning_rate": 8.617591355839672e-06,
"loss": 0.1707,
"num_input_tokens_seen": 2888240,
"step": 10150
},
{
"epoch": 18.94589552238806,
"grad_norm": 0.4978165328502655,
"learning_rate": 8.467756670285432e-06,
"loss": 0.3313,
"num_input_tokens_seen": 2889616,
"step": 10155
},
{
"epoch": 18.955223880597014,
"grad_norm": 0.7477753162384033,
"learning_rate": 8.319224902720302e-06,
"loss": 0.2436,
"num_input_tokens_seen": 2890960,
"step": 10160
},
{
"epoch": 18.96455223880597,
"grad_norm": 0.6375783681869507,
"learning_rate": 8.171996446861396e-06,
"loss": 0.4074,
"num_input_tokens_seen": 2892368,
"step": 10165
},
{
"epoch": 18.973880597014926,
"grad_norm": 0.7193611860275269,
"learning_rate": 8.026071692971315e-06,
"loss": 0.248,
"num_input_tokens_seen": 2893808,
"step": 10170
},
{
"epoch": 18.98320895522388,
"grad_norm": 0.4070175886154175,
"learning_rate": 7.881451027856645e-06,
"loss": 0.2247,
"num_input_tokens_seen": 2895280,
"step": 10175
},
{
"epoch": 18.992537313432837,
"grad_norm": 0.9722506999969482,
"learning_rate": 7.738134834867461e-06,
"loss": 0.3073,
"num_input_tokens_seen": 2896720,
"step": 10180
},
{
"epoch": 19.0,
"eval_loss": 0.9123162627220154,
"eval_runtime": 4.1942,
"eval_samples_per_second": 56.745,
"eval_steps_per_second": 14.305,
"num_input_tokens_seen": 2897552,
"step": 10184
},
{
"epoch": 19.00186567164179,
"grad_norm": 0.4594847559928894,
"learning_rate": 7.59612349389599e-06,
"loss": 0.1506,
"num_input_tokens_seen": 2897840,
"step": 10185
},
{
"epoch": 19.011194029850746,
"grad_norm": 0.45773735642433167,
"learning_rate": 7.455417381375451e-06,
"loss": 0.1724,
"num_input_tokens_seen": 2899248,
"step": 10190
},
{
"epoch": 19.020522388059703,
"grad_norm": 0.9145358204841614,
"learning_rate": 7.316016870279441e-06,
"loss": 0.1997,
"num_input_tokens_seen": 2900464,
"step": 10195
},
{
"epoch": 19.029850746268657,
"grad_norm": 0.5567941069602966,
"learning_rate": 7.177922330120712e-06,
"loss": 0.1263,
"num_input_tokens_seen": 2901840,
"step": 10200
},
{
"epoch": 19.03917910447761,
"grad_norm": 0.6734683513641357,
"learning_rate": 7.041134126950233e-06,
"loss": 0.2547,
"num_input_tokens_seen": 2903248,
"step": 10205
},
{
"epoch": 19.04850746268657,
"grad_norm": 0.7478652000427246,
"learning_rate": 6.9056526233562955e-06,
"loss": 0.3607,
"num_input_tokens_seen": 2904944,
"step": 10210
},
{
"epoch": 19.057835820895523,
"grad_norm": 1.0115318298339844,
"learning_rate": 6.771478178463353e-06,
"loss": 0.2637,
"num_input_tokens_seen": 2906256,
"step": 10215
},
{
"epoch": 19.067164179104477,
"grad_norm": 0.5048120021820068,
"learning_rate": 6.638611147931406e-06,
"loss": 0.1724,
"num_input_tokens_seen": 2907760,
"step": 10220
},
{
"epoch": 19.07649253731343,
"grad_norm": 0.8928444385528564,
"learning_rate": 6.507051883954618e-06,
"loss": 0.2378,
"num_input_tokens_seen": 2909040,
"step": 10225
},
{
"epoch": 19.08582089552239,
"grad_norm": 0.5665672421455383,
"learning_rate": 6.376800735260757e-06,
"loss": 0.2593,
"num_input_tokens_seen": 2910448,
"step": 10230
},
{
"epoch": 19.095149253731343,
"grad_norm": 0.5140485763549805,
"learning_rate": 6.247858047110145e-06,
"loss": 0.1728,
"num_input_tokens_seen": 2911792,
"step": 10235
},
{
"epoch": 19.104477611940297,
"grad_norm": 0.5509104132652283,
"learning_rate": 6.1202241612947075e-06,
"loss": 0.2249,
"num_input_tokens_seen": 2913072,
"step": 10240
},
{
"epoch": 19.113805970149254,
"grad_norm": 0.6295739412307739,
"learning_rate": 5.993899416137039e-06,
"loss": 0.1709,
"num_input_tokens_seen": 2914480,
"step": 10245
},
{
"epoch": 19.12313432835821,
"grad_norm": 0.8722971081733704,
"learning_rate": 5.868884146489617e-06,
"loss": 0.5177,
"num_input_tokens_seen": 2915920,
"step": 10250
},
{
"epoch": 19.132462686567163,
"grad_norm": 0.5105932354927063,
"learning_rate": 5.7451786837339205e-06,
"loss": 0.2387,
"num_input_tokens_seen": 2917360,
"step": 10255
},
{
"epoch": 19.14179104477612,
"grad_norm": 0.5840516090393066,
"learning_rate": 5.622783355779315e-06,
"loss": 0.2359,
"num_input_tokens_seen": 2918896,
"step": 10260
},
{
"epoch": 19.151119402985074,
"grad_norm": 0.5876781940460205,
"learning_rate": 5.501698487062445e-06,
"loss": 0.1998,
"num_input_tokens_seen": 2920368,
"step": 10265
},
{
"epoch": 19.16044776119403,
"grad_norm": 0.38335317373275757,
"learning_rate": 5.3819243985463454e-06,
"loss": 0.1661,
"num_input_tokens_seen": 2921872,
"step": 10270
},
{
"epoch": 19.169776119402986,
"grad_norm": 0.7055832147598267,
"learning_rate": 5.263461407719438e-06,
"loss": 0.2888,
"num_input_tokens_seen": 2923312,
"step": 10275
},
{
"epoch": 19.17910447761194,
"grad_norm": 0.8590061664581299,
"learning_rate": 5.146309828594875e-06,
"loss": 0.3048,
"num_input_tokens_seen": 2924912,
"step": 10280
},
{
"epoch": 19.188432835820894,
"grad_norm": 0.5439656376838684,
"learning_rate": 5.030469971709472e-06,
"loss": 0.4523,
"num_input_tokens_seen": 2926416,
"step": 10285
},
{
"epoch": 19.19776119402985,
"grad_norm": 0.44218385219573975,
"learning_rate": 4.91594214412322e-06,
"loss": 0.1621,
"num_input_tokens_seen": 2927696,
"step": 10290
},
{
"epoch": 19.207089552238806,
"grad_norm": 0.523102879524231,
"learning_rate": 4.80272664941811e-06,
"loss": 0.1128,
"num_input_tokens_seen": 2929232,
"step": 10295
},
{
"epoch": 19.21641791044776,
"grad_norm": 0.8231872320175171,
"learning_rate": 4.690823787697473e-06,
"loss": 0.2353,
"num_input_tokens_seen": 2930608,
"step": 10300
},
{
"epoch": 19.225746268656717,
"grad_norm": 0.4466846287250519,
"learning_rate": 4.5802338555854254e-06,
"loss": 0.1663,
"num_input_tokens_seen": 2932240,
"step": 10305
},
{
"epoch": 19.23507462686567,
"grad_norm": 0.4112730920314789,
"learning_rate": 4.4709571462256956e-06,
"loss": 0.2029,
"num_input_tokens_seen": 2933712,
"step": 10310
},
{
"epoch": 19.244402985074625,
"grad_norm": 0.6375637650489807,
"learning_rate": 4.36299394928108e-06,
"loss": 0.173,
"num_input_tokens_seen": 2934960,
"step": 10315
},
{
"epoch": 19.253731343283583,
"grad_norm": 0.3967975080013275,
"learning_rate": 4.256344550932434e-06,
"loss": 0.2052,
"num_input_tokens_seen": 2936368,
"step": 10320
},
{
"epoch": 19.263059701492537,
"grad_norm": 0.41094884276390076,
"learning_rate": 4.1510092338784e-06,
"loss": 0.1815,
"num_input_tokens_seen": 2937936,
"step": 10325
},
{
"epoch": 19.27238805970149,
"grad_norm": 0.48285865783691406,
"learning_rate": 4.046988277334185e-06,
"loss": 0.3622,
"num_input_tokens_seen": 2939248,
"step": 10330
},
{
"epoch": 19.28171641791045,
"grad_norm": 0.43332603573799133,
"learning_rate": 3.944281957030893e-06,
"loss": 0.1298,
"num_input_tokens_seen": 2940688,
"step": 10335
},
{
"epoch": 19.291044776119403,
"grad_norm": 0.716364324092865,
"learning_rate": 3.842890545215028e-06,
"loss": 0.1903,
"num_input_tokens_seen": 2942032,
"step": 10340
},
{
"epoch": 19.300373134328357,
"grad_norm": 0.7156405448913574,
"learning_rate": 3.742814310647602e-06,
"loss": 0.2553,
"num_input_tokens_seen": 2943472,
"step": 10345
},
{
"epoch": 19.309701492537314,
"grad_norm": 0.7371711134910583,
"learning_rate": 3.6440535186034184e-06,
"loss": 0.3324,
"num_input_tokens_seen": 2944880,
"step": 10350
},
{
"epoch": 19.31902985074627,
"grad_norm": 0.6731292605400085,
"learning_rate": 3.5466084308704017e-06,
"loss": 0.2659,
"num_input_tokens_seen": 2946352,
"step": 10355
},
{
"epoch": 19.328358208955223,
"grad_norm": 0.7745460867881775,
"learning_rate": 3.4504793057489326e-06,
"loss": 0.3356,
"num_input_tokens_seen": 2947568,
"step": 10360
},
{
"epoch": 19.33768656716418,
"grad_norm": 0.33525073528289795,
"learning_rate": 3.3556663980511826e-06,
"loss": 0.1609,
"num_input_tokens_seen": 2949104,
"step": 10365
},
{
"epoch": 19.347014925373134,
"grad_norm": 0.3880627155303955,
"learning_rate": 3.2621699591001695e-06,
"loss": 0.2441,
"num_input_tokens_seen": 2950448,
"step": 10370
},
{
"epoch": 19.35634328358209,
"grad_norm": 0.885238766670227,
"learning_rate": 3.1699902367295917e-06,
"loss": 0.2596,
"num_input_tokens_seen": 2951760,
"step": 10375
},
{
"epoch": 19.365671641791046,
"grad_norm": 0.6533879041671753,
"learning_rate": 3.079127475282717e-06,
"loss": 0.2539,
"num_input_tokens_seen": 2953200,
"step": 10380
},
{
"epoch": 19.375,
"grad_norm": 0.48250091075897217,
"learning_rate": 2.9895819156119943e-06,
"loss": 0.1706,
"num_input_tokens_seen": 2954512,
"step": 10385
},
{
"epoch": 19.384328358208954,
"grad_norm": 0.6290906071662903,
"learning_rate": 2.9013537950782765e-06,
"loss": 0.238,
"num_input_tokens_seen": 2955856,
"step": 10390
},
{
"epoch": 19.39365671641791,
"grad_norm": 0.40463876724243164,
"learning_rate": 2.8144433475502105e-06,
"loss": 0.2206,
"num_input_tokens_seen": 2957328,
"step": 10395
},
{
"epoch": 19.402985074626866,
"grad_norm": 0.5499265193939209,
"learning_rate": 2.728850803403793e-06,
"loss": 0.1399,
"num_input_tokens_seen": 2958768,
"step": 10400
},
{
"epoch": 19.41231343283582,
"grad_norm": 0.7918092012405396,
"learning_rate": 2.644576389521425e-06,
"loss": 0.3264,
"num_input_tokens_seen": 2960272,
"step": 10405
},
{
"epoch": 19.421641791044777,
"grad_norm": 0.38886696100234985,
"learning_rate": 2.5616203292916916e-06,
"loss": 0.2346,
"num_input_tokens_seen": 2961648,
"step": 10410
},
{
"epoch": 19.43097014925373,
"grad_norm": 0.4961099624633789,
"learning_rate": 2.479982842608475e-06,
"loss": 0.091,
"num_input_tokens_seen": 2963088,
"step": 10415
},
{
"epoch": 19.440298507462686,
"grad_norm": 0.9479005336761475,
"learning_rate": 2.3996641458704504e-06,
"loss": 0.371,
"num_input_tokens_seen": 2964432,
"step": 10420
},
{
"epoch": 19.449626865671643,
"grad_norm": 0.568725049495697,
"learning_rate": 2.320664451980592e-06,
"loss": 0.2244,
"num_input_tokens_seen": 2965808,
"step": 10425
},
{
"epoch": 19.458955223880597,
"grad_norm": 0.46647632122039795,
"learning_rate": 2.2429839703456136e-06,
"loss": 0.2795,
"num_input_tokens_seen": 2967248,
"step": 10430
},
{
"epoch": 19.46828358208955,
"grad_norm": 0.4957118034362793,
"learning_rate": 2.1666229068753594e-06,
"loss": 0.2252,
"num_input_tokens_seen": 2968528,
"step": 10435
},
{
"epoch": 19.47761194029851,
"grad_norm": 0.8878422975540161,
"learning_rate": 2.091581463981973e-06,
"loss": 0.2318,
"num_input_tokens_seen": 2969968,
"step": 10440
},
{
"epoch": 19.486940298507463,
"grad_norm": 0.5628931522369385,
"learning_rate": 2.0178598405800606e-06,
"loss": 0.2825,
"num_input_tokens_seen": 2971472,
"step": 10445
},
{
"epoch": 19.496268656716417,
"grad_norm": 0.4205546975135803,
"learning_rate": 1.945458232085473e-06,
"loss": 0.228,
"num_input_tokens_seen": 2972880,
"step": 10450
},
{
"epoch": 19.505597014925375,
"grad_norm": 0.8694109320640564,
"learning_rate": 1.8743768304151366e-06,
"loss": 0.434,
"num_input_tokens_seen": 2974192,
"step": 10455
},
{
"epoch": 19.51492537313433,
"grad_norm": 0.941114604473114,
"learning_rate": 1.8046158239864996e-06,
"loss": 0.1921,
"num_input_tokens_seen": 2975632,
"step": 10460
},
{
"epoch": 19.524253731343283,
"grad_norm": 0.6050335764884949,
"learning_rate": 1.7361753977169215e-06,
"loss": 0.2097,
"num_input_tokens_seen": 2977168,
"step": 10465
},
{
"epoch": 19.53358208955224,
"grad_norm": 0.6744943857192993,
"learning_rate": 1.6690557330233947e-06,
"loss": 0.3264,
"num_input_tokens_seen": 2978736,
"step": 10470
},
{
"epoch": 19.542910447761194,
"grad_norm": 0.43842124938964844,
"learning_rate": 1.6032570078217678e-06,
"loss": 0.2519,
"num_input_tokens_seen": 2980176,
"step": 10475
},
{
"epoch": 19.55223880597015,
"grad_norm": 0.395082026720047,
"learning_rate": 1.5387793965265794e-06,
"loss": 0.2272,
"num_input_tokens_seen": 2981488,
"step": 10480
},
{
"epoch": 19.561567164179106,
"grad_norm": 0.28870466351509094,
"learning_rate": 1.4756230700503914e-06,
"loss": 0.1393,
"num_input_tokens_seen": 2982800,
"step": 10485
},
{
"epoch": 19.57089552238806,
"grad_norm": 0.4630226492881775,
"learning_rate": 1.4137881958034006e-06,
"loss": 0.1739,
"num_input_tokens_seen": 2984208,
"step": 10490
},
{
"epoch": 19.580223880597014,
"grad_norm": 0.7136488556861877,
"learning_rate": 1.3532749376929944e-06,
"loss": 0.3847,
"num_input_tokens_seen": 2985616,
"step": 10495
},
{
"epoch": 19.58955223880597,
"grad_norm": 0.5953083634376526,
"learning_rate": 1.2940834561233627e-06,
"loss": 0.2892,
"num_input_tokens_seen": 2987056,
"step": 10500
},
{
"epoch": 19.598880597014926,
"grad_norm": 0.5428503155708313,
"learning_rate": 1.236213907994943e-06,
"loss": 0.2513,
"num_input_tokens_seen": 2988592,
"step": 10505
},
{
"epoch": 19.60820895522388,
"grad_norm": 0.8605926632881165,
"learning_rate": 1.1796664467041973e-06,
"loss": 0.225,
"num_input_tokens_seen": 2989904,
"step": 10510
},
{
"epoch": 19.617537313432837,
"grad_norm": 0.4664938449859619,
"learning_rate": 1.1244412221429468e-06,
"loss": 0.1427,
"num_input_tokens_seen": 2991440,
"step": 10515
},
{
"epoch": 19.62686567164179,
"grad_norm": 0.7807358503341675,
"learning_rate": 1.0705383806982606e-06,
"loss": 0.2393,
"num_input_tokens_seen": 2992848,
"step": 10520
},
{
"epoch": 19.636194029850746,
"grad_norm": 0.6703472137451172,
"learning_rate": 1.017958065251845e-06,
"loss": 0.4173,
"num_input_tokens_seen": 2994096,
"step": 10525
},
{
"epoch": 19.645522388059703,
"grad_norm": 0.5756611227989197,
"learning_rate": 9.66700415179822e-07,
"loss": 0.2562,
"num_input_tokens_seen": 2995440,
"step": 10530
},
{
"epoch": 19.654850746268657,
"grad_norm": 0.7052452564239502,
"learning_rate": 9.16765566352229e-07,
"loss": 0.3594,
"num_input_tokens_seen": 2996816,
"step": 10535
},
{
"epoch": 19.66417910447761,
"grad_norm": 0.4399753212928772,
"learning_rate": 8.681536511327415e-07,
"loss": 0.2093,
"num_input_tokens_seen": 2998352,
"step": 10540
},
{
"epoch": 19.673507462686565,
"grad_norm": 0.439879834651947,
"learning_rate": 8.208647983782846e-07,
"loss": 0.2758,
"num_input_tokens_seen": 2999792,
"step": 10545
},
{
"epoch": 19.682835820895523,
"grad_norm": 0.4704674482345581,
"learning_rate": 7.748991334387557e-07,
"loss": 0.1757,
"num_input_tokens_seen": 3001072,
"step": 10550
},
{
"epoch": 19.692164179104477,
"grad_norm": 0.5811406970024109,
"learning_rate": 7.302567781565794e-07,
"loss": 0.2236,
"num_input_tokens_seen": 3002576,
"step": 10555
},
{
"epoch": 19.701492537313435,
"grad_norm": 0.28484269976615906,
"learning_rate": 6.869378508664315e-07,
"loss": 0.1104,
"num_input_tokens_seen": 3003984,
"step": 10560
},
{
"epoch": 19.71082089552239,
"grad_norm": 0.7560864686965942,
"learning_rate": 6.449424663950155e-07,
"loss": 0.1164,
"num_input_tokens_seen": 3005392,
"step": 10565
},
{
"epoch": 19.720149253731343,
"grad_norm": 0.6477477550506592,
"learning_rate": 6.042707360606192e-07,
"loss": 0.3706,
"num_input_tokens_seen": 3006896,
"step": 10570
},
{
"epoch": 19.729477611940297,
"grad_norm": 0.6175541281700134,
"learning_rate": 5.64922767673004e-07,
"loss": 0.2673,
"num_input_tokens_seen": 3008240,
"step": 10575
},
{
"epoch": 19.738805970149254,
"grad_norm": 0.3923323452472687,
"learning_rate": 5.268986655327934e-07,
"loss": 0.2049,
"num_input_tokens_seen": 3010000,
"step": 10580
},
{
"epoch": 19.74813432835821,
"grad_norm": 0.3566945195198059,
"learning_rate": 4.901985304315848e-07,
"loss": 0.2348,
"num_input_tokens_seen": 3011344,
"step": 10585
},
{
"epoch": 19.757462686567163,
"grad_norm": 0.57565838098526,
"learning_rate": 4.548224596513939e-07,
"loss": 0.3252,
"num_input_tokens_seen": 3012880,
"step": 10590
},
{
"epoch": 19.76679104477612,
"grad_norm": 0.3473455309867859,
"learning_rate": 4.207705469645995e-07,
"loss": 0.2089,
"num_input_tokens_seen": 3014448,
"step": 10595
},
{
"epoch": 19.776119402985074,
"grad_norm": 0.44248923659324646,
"learning_rate": 3.8804288263349917e-07,
"loss": 0.1524,
"num_input_tokens_seen": 3015984,
"step": 10600
},
{
"epoch": 19.78544776119403,
"grad_norm": 0.729060709476471,
"learning_rate": 3.56639553410143e-07,
"loss": 0.1925,
"num_input_tokens_seen": 3017392,
"step": 10605
},
{
"epoch": 19.794776119402986,
"grad_norm": 0.7432746291160583,
"learning_rate": 3.265606425363332e-07,
"loss": 0.3039,
"num_input_tokens_seen": 3018896,
"step": 10610
},
{
"epoch": 19.80410447761194,
"grad_norm": 0.34453150629997253,
"learning_rate": 2.97806229743014e-07,
"loss": 0.0832,
"num_input_tokens_seen": 3020496,
"step": 10615
},
{
"epoch": 19.813432835820894,
"grad_norm": 1.0268018245697021,
"learning_rate": 2.703763912502155e-07,
"loss": 0.1898,
"num_input_tokens_seen": 3021840,
"step": 10620
},
{
"epoch": 19.82276119402985,
"grad_norm": 0.6820981502532959,
"learning_rate": 2.4427119976705436e-07,
"loss": 0.2509,
"num_input_tokens_seen": 3023216,
"step": 10625
},
{
"epoch": 19.832089552238806,
"grad_norm": 0.4682190716266632,
"learning_rate": 2.1949072449123363e-07,
"loss": 0.1979,
"num_input_tokens_seen": 3024720,
"step": 10630
},
{
"epoch": 19.84141791044776,
"grad_norm": 0.5495747923851013,
"learning_rate": 1.9603503110904308e-07,
"loss": 0.268,
"num_input_tokens_seen": 3026160,
"step": 10635
},
{
"epoch": 19.850746268656717,
"grad_norm": 0.9342331886291504,
"learning_rate": 1.739041817951925e-07,
"loss": 0.3326,
"num_input_tokens_seen": 3027632,
"step": 10640
},
{
"epoch": 19.86007462686567,
"grad_norm": 0.5672308802604675,
"learning_rate": 1.5309823521242328e-07,
"loss": 0.1947,
"num_input_tokens_seen": 3028848,
"step": 10645
},
{
"epoch": 19.869402985074625,
"grad_norm": 0.5239127278327942,
"learning_rate": 1.3361724651167473e-07,
"loss": 0.2189,
"num_input_tokens_seen": 3030288,
"step": 10650
},
{
"epoch": 19.878731343283583,
"grad_norm": 0.5062621235847473,
"learning_rate": 1.1546126733180673e-07,
"loss": 0.1734,
"num_input_tokens_seen": 3031696,
"step": 10655
},
{
"epoch": 19.888059701492537,
"grad_norm": 0.8205133676528931,
"learning_rate": 9.863034579926655e-08,
"loss": 0.1619,
"num_input_tokens_seen": 3032976,
"step": 10660
},
{
"epoch": 19.89738805970149,
"grad_norm": 0.5459089875221252,
"learning_rate": 8.312452652831093e-08,
"loss": 0.2341,
"num_input_tokens_seen": 3034512,
"step": 10665
},
{
"epoch": 19.90671641791045,
"grad_norm": 0.5834176540374756,
"learning_rate": 6.894385062056197e-08,
"loss": 0.1828,
"num_input_tokens_seen": 3036016,
"step": 10670
},
{
"epoch": 19.916044776119403,
"grad_norm": 0.5604172348976135,
"learning_rate": 5.6088355665229187e-08,
"loss": 0.1543,
"num_input_tokens_seen": 3037296,
"step": 10675
},
{
"epoch": 19.925373134328357,
"grad_norm": 0.8893586993217468,
"learning_rate": 4.4558075738609926e-08,
"loss": 0.2943,
"num_input_tokens_seen": 3038736,
"step": 10680
},
{
"epoch": 19.934701492537314,
"grad_norm": 0.833718478679657,
"learning_rate": 3.4353041404477926e-08,
"loss": 0.2967,
"num_input_tokens_seen": 3040048,
"step": 10685
},
{
"epoch": 19.94402985074627,
"grad_norm": 0.4309261739253998,
"learning_rate": 2.5473279713472685e-08,
"loss": 0.1819,
"num_input_tokens_seen": 3041424,
"step": 10690
},
{
"epoch": 19.953358208955223,
"grad_norm": 0.5390483736991882,
"learning_rate": 1.7918814203432555e-08,
"loss": 0.2043,
"num_input_tokens_seen": 3042960,
"step": 10695
},
{
"epoch": 19.96268656716418,
"grad_norm": 0.4581194818019867,
"learning_rate": 1.1689664899283691e-08,
"loss": 0.2257,
"num_input_tokens_seen": 3044336,
"step": 10700
},
{
"epoch": 19.972014925373134,
"grad_norm": 0.27400490641593933,
"learning_rate": 6.78584831270701e-09,
"loss": 0.1333,
"num_input_tokens_seen": 3045904,
"step": 10705
},
{
"epoch": 19.98134328358209,
"grad_norm": 0.8952294588088989,
"learning_rate": 3.2073774424157263e-09,
"loss": 0.5205,
"num_input_tokens_seen": 3047248,
"step": 10710
},
{
"epoch": 19.990671641791046,
"grad_norm": 0.7515770196914673,
"learning_rate": 9.54261773933318e-10,
"loss": 0.2663,
"num_input_tokens_seen": 3048848,
"step": 10715
},
{
"epoch": 20.0,
"grad_norm": 0.20985476672649384,
"learning_rate": 2.650727970454625e-11,
"loss": 0.2325,
"num_input_tokens_seen": 3049984,
"step": 10720
},
{
"epoch": 20.0,
"eval_loss": 0.9178647398948669,
"eval_runtime": 4.1975,
"eval_samples_per_second": 56.701,
"eval_steps_per_second": 14.294,
"num_input_tokens_seen": 3049984,
"step": 10720
},
{
"epoch": 20.0,
"num_input_tokens_seen": 3049984,
"step": 10720,
"total_flos": 1.3733940102483149e+17,
"train_loss": 0.44957640529957726,
"train_runtime": 1754.1922,
"train_samples_per_second": 24.41,
"train_steps_per_second": 6.111
}
],
"logging_steps": 5,
"max_steps": 10720,
"num_input_tokens_seen": 3049984,
"num_train_epochs": 20,
"save_steps": 536,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3733940102483149e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}