{
"best_global_step": 2144,
"best_metric": 0.6239609122276306,
"best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_conala_1756729619/checkpoint-2144",
"epoch": 10.0,
"eval_steps": 536,
"global_step": 10710,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004668534080298786,
"grad_norm": 84.6258544921875,
"learning_rate": 1.8674136321195148e-07,
"loss": 8.5449,
"num_input_tokens_seen": 576,
"step": 5
},
{
"epoch": 0.009337068160597572,
"grad_norm": 76.01036834716797,
"learning_rate": 4.2016806722689076e-07,
"loss": 8.3941,
"num_input_tokens_seen": 1168,
"step": 10
},
{
"epoch": 0.014005602240896359,
"grad_norm": 99.43428802490234,
"learning_rate": 6.535947712418301e-07,
"loss": 8.6591,
"num_input_tokens_seen": 1824,
"step": 15
},
{
"epoch": 0.018674136321195144,
"grad_norm": 103.66761016845703,
"learning_rate": 8.870214752567694e-07,
"loss": 8.3524,
"num_input_tokens_seen": 2384,
"step": 20
},
{
"epoch": 0.02334267040149393,
"grad_norm": 55.7948112487793,
"learning_rate": 1.1204481792717088e-06,
"loss": 8.3068,
"num_input_tokens_seen": 3040,
"step": 25
},
{
"epoch": 0.028011204481792718,
"grad_norm": 88.3673095703125,
"learning_rate": 1.3538748832866481e-06,
"loss": 7.901,
"num_input_tokens_seen": 3776,
"step": 30
},
{
"epoch": 0.032679738562091505,
"grad_norm": 95.092041015625,
"learning_rate": 1.5873015873015873e-06,
"loss": 7.813,
"num_input_tokens_seen": 4336,
"step": 35
},
{
"epoch": 0.03734827264239029,
"grad_norm": 76.32391357421875,
"learning_rate": 1.820728291316527e-06,
"loss": 6.4782,
"num_input_tokens_seen": 4944,
"step": 40
},
{
"epoch": 0.04201680672268908,
"grad_norm": 52.776458740234375,
"learning_rate": 2.054154995331466e-06,
"loss": 6.6237,
"num_input_tokens_seen": 5536,
"step": 45
},
{
"epoch": 0.04668534080298786,
"grad_norm": 45.56707000732422,
"learning_rate": 2.2875816993464053e-06,
"loss": 5.8744,
"num_input_tokens_seen": 6208,
"step": 50
},
{
"epoch": 0.051353874883286646,
"grad_norm": 53.54768753051758,
"learning_rate": 2.521008403361345e-06,
"loss": 5.7237,
"num_input_tokens_seen": 6848,
"step": 55
},
{
"epoch": 0.056022408963585436,
"grad_norm": 46.92519760131836,
"learning_rate": 2.754435107376284e-06,
"loss": 4.7982,
"num_input_tokens_seen": 7584,
"step": 60
},
{
"epoch": 0.06069094304388422,
"grad_norm": 60.48017883300781,
"learning_rate": 2.9878618113912236e-06,
"loss": 5.0754,
"num_input_tokens_seen": 8176,
"step": 65
},
{
"epoch": 0.06535947712418301,
"grad_norm": 47.011898040771484,
"learning_rate": 3.2212885154061624e-06,
"loss": 5.0185,
"num_input_tokens_seen": 8720,
"step": 70
},
{
"epoch": 0.0700280112044818,
"grad_norm": 75.90699768066406,
"learning_rate": 3.454715219421102e-06,
"loss": 5.0108,
"num_input_tokens_seen": 9312,
"step": 75
},
{
"epoch": 0.07469654528478058,
"grad_norm": 47.10323715209961,
"learning_rate": 3.688141923436041e-06,
"loss": 5.2044,
"num_input_tokens_seen": 9888,
"step": 80
},
{
"epoch": 0.07936507936507936,
"grad_norm": 43.27751541137695,
"learning_rate": 3.92156862745098e-06,
"loss": 4.6655,
"num_input_tokens_seen": 10480,
"step": 85
},
{
"epoch": 0.08403361344537816,
"grad_norm": 49.48586654663086,
"learning_rate": 4.15499533146592e-06,
"loss": 4.8074,
"num_input_tokens_seen": 11088,
"step": 90
},
{
"epoch": 0.08870214752567694,
"grad_norm": 42.18975830078125,
"learning_rate": 4.388422035480859e-06,
"loss": 3.6381,
"num_input_tokens_seen": 11872,
"step": 95
},
{
"epoch": 0.09337068160597572,
"grad_norm": 43.712135314941406,
"learning_rate": 4.621848739495799e-06,
"loss": 4.0669,
"num_input_tokens_seen": 12608,
"step": 100
},
{
"epoch": 0.09803921568627451,
"grad_norm": 47.92484664916992,
"learning_rate": 4.855275443510738e-06,
"loss": 4.4186,
"num_input_tokens_seen": 13280,
"step": 105
},
{
"epoch": 0.10270774976657329,
"grad_norm": 42.59040069580078,
"learning_rate": 5.0887021475256775e-06,
"loss": 3.6303,
"num_input_tokens_seen": 13952,
"step": 110
},
{
"epoch": 0.10737628384687208,
"grad_norm": 41.3209342956543,
"learning_rate": 5.322128851540616e-06,
"loss": 3.4098,
"num_input_tokens_seen": 14592,
"step": 115
},
{
"epoch": 0.11204481792717087,
"grad_norm": 66.64546966552734,
"learning_rate": 5.555555555555556e-06,
"loss": 3.858,
"num_input_tokens_seen": 15248,
"step": 120
},
{
"epoch": 0.11671335200746966,
"grad_norm": 78.6478042602539,
"learning_rate": 5.788982259570495e-06,
"loss": 3.4176,
"num_input_tokens_seen": 15904,
"step": 125
},
{
"epoch": 0.12138188608776844,
"grad_norm": 44.221187591552734,
"learning_rate": 6.022408963585434e-06,
"loss": 3.6577,
"num_input_tokens_seen": 16576,
"step": 130
},
{
"epoch": 0.12605042016806722,
"grad_norm": 42.961875915527344,
"learning_rate": 6.255835667600374e-06,
"loss": 3.1459,
"num_input_tokens_seen": 17216,
"step": 135
},
{
"epoch": 0.13071895424836602,
"grad_norm": 42.52253341674805,
"learning_rate": 6.489262371615313e-06,
"loss": 3.5311,
"num_input_tokens_seen": 17888,
"step": 140
},
{
"epoch": 0.1353874883286648,
"grad_norm": 34.132728576660156,
"learning_rate": 6.722689075630252e-06,
"loss": 3.344,
"num_input_tokens_seen": 18544,
"step": 145
},
{
"epoch": 0.1400560224089636,
"grad_norm": 45.744293212890625,
"learning_rate": 6.956115779645192e-06,
"loss": 3.1912,
"num_input_tokens_seen": 19184,
"step": 150
},
{
"epoch": 0.14472455648926238,
"grad_norm": 55.98895263671875,
"learning_rate": 7.1895424836601305e-06,
"loss": 2.7634,
"num_input_tokens_seen": 19760,
"step": 155
},
{
"epoch": 0.14939309056956115,
"grad_norm": 48.209251403808594,
"learning_rate": 7.42296918767507e-06,
"loss": 2.8525,
"num_input_tokens_seen": 20432,
"step": 160
},
{
"epoch": 0.15406162464985995,
"grad_norm": 55.19756317138672,
"learning_rate": 7.65639589169001e-06,
"loss": 2.4636,
"num_input_tokens_seen": 21120,
"step": 165
},
{
"epoch": 0.15873015873015872,
"grad_norm": 39.187644958496094,
"learning_rate": 7.889822595704948e-06,
"loss": 2.7641,
"num_input_tokens_seen": 21744,
"step": 170
},
{
"epoch": 0.16339869281045752,
"grad_norm": 37.96087646484375,
"learning_rate": 8.123249299719889e-06,
"loss": 2.7158,
"num_input_tokens_seen": 22432,
"step": 175
},
{
"epoch": 0.16806722689075632,
"grad_norm": 41.540382385253906,
"learning_rate": 8.356676003734828e-06,
"loss": 2.1432,
"num_input_tokens_seen": 23216,
"step": 180
},
{
"epoch": 0.17273576097105509,
"grad_norm": 52.784114837646484,
"learning_rate": 8.590102707749766e-06,
"loss": 2.309,
"num_input_tokens_seen": 23856,
"step": 185
},
{
"epoch": 0.17740429505135388,
"grad_norm": 43.1025390625,
"learning_rate": 8.823529411764707e-06,
"loss": 2.2712,
"num_input_tokens_seen": 24480,
"step": 190
},
{
"epoch": 0.18207282913165265,
"grad_norm": 23.09410285949707,
"learning_rate": 9.056956115779646e-06,
"loss": 2.2288,
"num_input_tokens_seen": 25184,
"step": 195
},
{
"epoch": 0.18674136321195145,
"grad_norm": 43.10409164428711,
"learning_rate": 9.290382819794586e-06,
"loss": 2.5402,
"num_input_tokens_seen": 25872,
"step": 200
},
{
"epoch": 0.19140989729225025,
"grad_norm": 37.84174346923828,
"learning_rate": 9.523809523809523e-06,
"loss": 2.5288,
"num_input_tokens_seen": 26544,
"step": 205
},
{
"epoch": 0.19607843137254902,
"grad_norm": 50.47822570800781,
"learning_rate": 9.757236227824464e-06,
"loss": 2.9575,
"num_input_tokens_seen": 27152,
"step": 210
},
{
"epoch": 0.20074696545284781,
"grad_norm": 42.75927734375,
"learning_rate": 9.990662931839404e-06,
"loss": 2.4739,
"num_input_tokens_seen": 27744,
"step": 215
},
{
"epoch": 0.20541549953314658,
"grad_norm": 41.93454360961914,
"learning_rate": 1.0224089635854343e-05,
"loss": 2.2127,
"num_input_tokens_seen": 28368,
"step": 220
},
{
"epoch": 0.21008403361344538,
"grad_norm": 52.05291748046875,
"learning_rate": 1.0457516339869281e-05,
"loss": 2.1488,
"num_input_tokens_seen": 29008,
"step": 225
},
{
"epoch": 0.21475256769374415,
"grad_norm": 48.70277786254883,
"learning_rate": 1.069094304388422e-05,
"loss": 2.0124,
"num_input_tokens_seen": 29648,
"step": 230
},
{
"epoch": 0.21942110177404295,
"grad_norm": 58.86783981323242,
"learning_rate": 1.092436974789916e-05,
"loss": 2.3292,
"num_input_tokens_seen": 30288,
"step": 235
},
{
"epoch": 0.22408963585434175,
"grad_norm": 73.74400329589844,
"learning_rate": 1.11577964519141e-05,
"loss": 2.465,
"num_input_tokens_seen": 30960,
"step": 240
},
{
"epoch": 0.22875816993464052,
"grad_norm": 44.3712043762207,
"learning_rate": 1.1391223155929038e-05,
"loss": 2.0996,
"num_input_tokens_seen": 31648,
"step": 245
},
{
"epoch": 0.2334267040149393,
"grad_norm": 38.637699127197266,
"learning_rate": 1.1624649859943979e-05,
"loss": 2.0566,
"num_input_tokens_seen": 32304,
"step": 250
},
{
"epoch": 0.23809523809523808,
"grad_norm": 48.099578857421875,
"learning_rate": 1.1858076563958917e-05,
"loss": 2.099,
"num_input_tokens_seen": 32880,
"step": 255
},
{
"epoch": 0.24276377217553688,
"grad_norm": 32.05073547363281,
"learning_rate": 1.2091503267973856e-05,
"loss": 1.7378,
"num_input_tokens_seen": 33504,
"step": 260
},
{
"epoch": 0.24743230625583568,
"grad_norm": 38.621734619140625,
"learning_rate": 1.2324929971988797e-05,
"loss": 2.0952,
"num_input_tokens_seen": 34128,
"step": 265
},
{
"epoch": 0.25210084033613445,
"grad_norm": 31.620946884155273,
"learning_rate": 1.2558356676003735e-05,
"loss": 1.3152,
"num_input_tokens_seen": 34864,
"step": 270
},
{
"epoch": 0.2567693744164332,
"grad_norm": 36.46520233154297,
"learning_rate": 1.2791783380018674e-05,
"loss": 1.7895,
"num_input_tokens_seen": 35472,
"step": 275
},
{
"epoch": 0.26143790849673204,
"grad_norm": 60.55278396606445,
"learning_rate": 1.3025210084033614e-05,
"loss": 1.9903,
"num_input_tokens_seen": 36144,
"step": 280
},
{
"epoch": 0.2661064425770308,
"grad_norm": 26.2589111328125,
"learning_rate": 1.3258636788048553e-05,
"loss": 1.2714,
"num_input_tokens_seen": 36768,
"step": 285
},
{
"epoch": 0.2707749766573296,
"grad_norm": 47.84441375732422,
"learning_rate": 1.3492063492063492e-05,
"loss": 1.3887,
"num_input_tokens_seen": 37424,
"step": 290
},
{
"epoch": 0.2754435107376284,
"grad_norm": 36.61695098876953,
"learning_rate": 1.3725490196078432e-05,
"loss": 1.4811,
"num_input_tokens_seen": 38096,
"step": 295
},
{
"epoch": 0.2801120448179272,
"grad_norm": 38.53045654296875,
"learning_rate": 1.3958916900093371e-05,
"loss": 1.9248,
"num_input_tokens_seen": 38736,
"step": 300
},
{
"epoch": 0.28478057889822594,
"grad_norm": 44.79354476928711,
"learning_rate": 1.419234360410831e-05,
"loss": 1.134,
"num_input_tokens_seen": 39424,
"step": 305
},
{
"epoch": 0.28944911297852477,
"grad_norm": 46.52348709106445,
"learning_rate": 1.4425770308123249e-05,
"loss": 1.3332,
"num_input_tokens_seen": 40160,
"step": 310
},
{
"epoch": 0.29411764705882354,
"grad_norm": 29.020601272583008,
"learning_rate": 1.4659197012138189e-05,
"loss": 1.307,
"num_input_tokens_seen": 40768,
"step": 315
},
{
"epoch": 0.2987861811391223,
"grad_norm": 47.507301330566406,
"learning_rate": 1.4892623716153128e-05,
"loss": 1.7801,
"num_input_tokens_seen": 41376,
"step": 320
},
{
"epoch": 0.3034547152194211,
"grad_norm": 30.17751121520996,
"learning_rate": 1.5126050420168067e-05,
"loss": 1.1193,
"num_input_tokens_seen": 42144,
"step": 325
},
{
"epoch": 0.3081232492997199,
"grad_norm": 24.388837814331055,
"learning_rate": 1.5359477124183007e-05,
"loss": 1.0519,
"num_input_tokens_seen": 42880,
"step": 330
},
{
"epoch": 0.3127917833800187,
"grad_norm": 26.254653930664062,
"learning_rate": 1.5592903828197946e-05,
"loss": 1.2371,
"num_input_tokens_seen": 43472,
"step": 335
},
{
"epoch": 0.31746031746031744,
"grad_norm": 32.2711296081543,
"learning_rate": 1.5826330532212885e-05,
"loss": 1.2114,
"num_input_tokens_seen": 44064,
"step": 340
},
{
"epoch": 0.32212885154061627,
"grad_norm": 23.010696411132812,
"learning_rate": 1.6059757236227827e-05,
"loss": 1.5254,
"num_input_tokens_seen": 44688,
"step": 345
},
{
"epoch": 0.32679738562091504,
"grad_norm": 14.727700233459473,
"learning_rate": 1.6293183940242765e-05,
"loss": 1.0906,
"num_input_tokens_seen": 45296,
"step": 350
},
{
"epoch": 0.3314659197012138,
"grad_norm": 25.24241065979004,
"learning_rate": 1.6526610644257704e-05,
"loss": 1.4038,
"num_input_tokens_seen": 45920,
"step": 355
},
{
"epoch": 0.33613445378151263,
"grad_norm": 28.438600540161133,
"learning_rate": 1.6760037348272643e-05,
"loss": 1.1871,
"num_input_tokens_seen": 46496,
"step": 360
},
{
"epoch": 0.3408029878618114,
"grad_norm": 33.14903259277344,
"learning_rate": 1.6993464052287582e-05,
"loss": 1.6374,
"num_input_tokens_seen": 47088,
"step": 365
},
{
"epoch": 0.34547152194211017,
"grad_norm": 23.363435745239258,
"learning_rate": 1.722689075630252e-05,
"loss": 1.5303,
"num_input_tokens_seen": 47680,
"step": 370
},
{
"epoch": 0.35014005602240894,
"grad_norm": 19.318904876708984,
"learning_rate": 1.746031746031746e-05,
"loss": 0.7557,
"num_input_tokens_seen": 48320,
"step": 375
},
{
"epoch": 0.35480859010270777,
"grad_norm": 7.826172828674316,
"learning_rate": 1.76937441643324e-05,
"loss": 0.8215,
"num_input_tokens_seen": 48976,
"step": 380
},
{
"epoch": 0.35947712418300654,
"grad_norm": 17.366634368896484,
"learning_rate": 1.792717086834734e-05,
"loss": 1.5873,
"num_input_tokens_seen": 49536,
"step": 385
},
{
"epoch": 0.3641456582633053,
"grad_norm": 21.24698829650879,
"learning_rate": 1.816059757236228e-05,
"loss": 1.1401,
"num_input_tokens_seen": 50160,
"step": 390
},
{
"epoch": 0.36881419234360413,
"grad_norm": 12.986654281616211,
"learning_rate": 1.839402427637722e-05,
"loss": 0.6504,
"num_input_tokens_seen": 50864,
"step": 395
},
{
"epoch": 0.3734827264239029,
"grad_norm": 24.244192123413086,
"learning_rate": 1.862745098039216e-05,
"loss": 1.4523,
"num_input_tokens_seen": 51552,
"step": 400
},
{
"epoch": 0.37815126050420167,
"grad_norm": 12.188508987426758,
"learning_rate": 1.8860877684407095e-05,
"loss": 0.8425,
"num_input_tokens_seen": 52208,
"step": 405
},
{
"epoch": 0.3828197945845005,
"grad_norm": 37.367069244384766,
"learning_rate": 1.9094304388422034e-05,
"loss": 1.1623,
"num_input_tokens_seen": 52864,
"step": 410
},
{
"epoch": 0.38748832866479926,
"grad_norm": 39.64646530151367,
"learning_rate": 1.9327731092436976e-05,
"loss": 1.0445,
"num_input_tokens_seen": 53456,
"step": 415
},
{
"epoch": 0.39215686274509803,
"grad_norm": 23.681903839111328,
"learning_rate": 1.9561157796451915e-05,
"loss": 1.414,
"num_input_tokens_seen": 54080,
"step": 420
},
{
"epoch": 0.3968253968253968,
"grad_norm": 15.80068302154541,
"learning_rate": 1.9794584500466854e-05,
"loss": 0.842,
"num_input_tokens_seen": 54720,
"step": 425
},
{
"epoch": 0.40149393090569563,
"grad_norm": 23.32377815246582,
"learning_rate": 2.0028011204481796e-05,
"loss": 0.7494,
"num_input_tokens_seen": 55328,
"step": 430
},
{
"epoch": 0.4061624649859944,
"grad_norm": 6.138006687164307,
"learning_rate": 2.0261437908496734e-05,
"loss": 0.9735,
"num_input_tokens_seen": 55936,
"step": 435
},
{
"epoch": 0.41083099906629317,
"grad_norm": 124.19203186035156,
"learning_rate": 2.0494864612511673e-05,
"loss": 0.9148,
"num_input_tokens_seen": 56608,
"step": 440
},
{
"epoch": 0.415499533146592,
"grad_norm": 16.503829956054688,
"learning_rate": 2.0728291316526612e-05,
"loss": 0.9937,
"num_input_tokens_seen": 57264,
"step": 445
},
{
"epoch": 0.42016806722689076,
"grad_norm": 64.5467758178711,
"learning_rate": 2.096171802054155e-05,
"loss": 0.6676,
"num_input_tokens_seen": 57856,
"step": 450
},
{
"epoch": 0.42483660130718953,
"grad_norm": 9.891252517700195,
"learning_rate": 2.119514472455649e-05,
"loss": 0.5961,
"num_input_tokens_seen": 58448,
"step": 455
},
{
"epoch": 0.4295051353874883,
"grad_norm": 45.48982620239258,
"learning_rate": 2.1428571428571428e-05,
"loss": 1.1232,
"num_input_tokens_seen": 59072,
"step": 460
},
{
"epoch": 0.4341736694677871,
"grad_norm": 11.444414138793945,
"learning_rate": 2.166199813258637e-05,
"loss": 0.5722,
"num_input_tokens_seen": 59744,
"step": 465
},
{
"epoch": 0.4388422035480859,
"grad_norm": 19.90337371826172,
"learning_rate": 2.189542483660131e-05,
"loss": 1.4224,
"num_input_tokens_seen": 60336,
"step": 470
},
{
"epoch": 0.44351073762838467,
"grad_norm": 14.948549270629883,
"learning_rate": 2.2128851540616248e-05,
"loss": 0.5425,
"num_input_tokens_seen": 60896,
"step": 475
},
{
"epoch": 0.4481792717086835,
"grad_norm": 6.758205890655518,
"learning_rate": 2.2362278244631187e-05,
"loss": 1.067,
"num_input_tokens_seen": 61600,
"step": 480
},
{
"epoch": 0.45284780578898226,
"grad_norm": 12.607068061828613,
"learning_rate": 2.2595704948646125e-05,
"loss": 1.2737,
"num_input_tokens_seen": 62272,
"step": 485
},
{
"epoch": 0.45751633986928103,
"grad_norm": 15.035085678100586,
"learning_rate": 2.2829131652661064e-05,
"loss": 0.8138,
"num_input_tokens_seen": 62784,
"step": 490
},
{
"epoch": 0.46218487394957986,
"grad_norm": 14.877044677734375,
"learning_rate": 2.3062558356676006e-05,
"loss": 1.0438,
"num_input_tokens_seen": 63472,
"step": 495
},
{
"epoch": 0.4668534080298786,
"grad_norm": 14.44520092010498,
"learning_rate": 2.3295985060690945e-05,
"loss": 1.2706,
"num_input_tokens_seen": 64192,
"step": 500
},
{
"epoch": 0.4715219421101774,
"grad_norm": 11.816399574279785,
"learning_rate": 2.3529411764705884e-05,
"loss": 1.0394,
"num_input_tokens_seen": 64880,
"step": 505
},
{
"epoch": 0.47619047619047616,
"grad_norm": 13.24144172668457,
"learning_rate": 2.3762838468720822e-05,
"loss": 0.6415,
"num_input_tokens_seen": 65536,
"step": 510
},
{
"epoch": 0.480859010270775,
"grad_norm": 13.559804916381836,
"learning_rate": 2.3996265172735765e-05,
"loss": 1.3095,
"num_input_tokens_seen": 66176,
"step": 515
},
{
"epoch": 0.48552754435107376,
"grad_norm": 7.28280782699585,
"learning_rate": 2.42296918767507e-05,
"loss": 1.1768,
"num_input_tokens_seen": 66768,
"step": 520
},
{
"epoch": 0.49019607843137253,
"grad_norm": 5.257969856262207,
"learning_rate": 2.446311858076564e-05,
"loss": 0.4696,
"num_input_tokens_seen": 67520,
"step": 525
},
{
"epoch": 0.49486461251167135,
"grad_norm": 6.459392070770264,
"learning_rate": 2.469654528478058e-05,
"loss": 1.0809,
"num_input_tokens_seen": 68144,
"step": 530
},
{
"epoch": 0.4995331465919701,
"grad_norm": 8.934903144836426,
"learning_rate": 2.492997198879552e-05,
"loss": 0.9926,
"num_input_tokens_seen": 68752,
"step": 535
},
{
"epoch": 0.5004668534080299,
"eval_loss": 0.8520299792289734,
"eval_runtime": 3.8914,
"eval_samples_per_second": 61.161,
"eval_steps_per_second": 30.58,
"num_input_tokens_seen": 68880,
"step": 536
},
{
"epoch": 0.5042016806722689,
"grad_norm": 11.780537605285645,
"learning_rate": 2.516339869281046e-05,
"loss": 0.8823,
"num_input_tokens_seen": 69424,
"step": 540
},
{
"epoch": 0.5088702147525677,
"grad_norm": 8.576756477355957,
"learning_rate": 2.5396825396825397e-05,
"loss": 0.6315,
"num_input_tokens_seen": 70128,
"step": 545
},
{
"epoch": 0.5135387488328664,
"grad_norm": 6.6480584144592285,
"learning_rate": 2.5630252100840336e-05,
"loss": 0.7986,
"num_input_tokens_seen": 70784,
"step": 550
},
{
"epoch": 0.5182072829131653,
"grad_norm": 16.78489112854004,
"learning_rate": 2.5863678804855278e-05,
"loss": 0.679,
"num_input_tokens_seen": 71456,
"step": 555
},
{
"epoch": 0.5228758169934641,
"grad_norm": 14.316130638122559,
"learning_rate": 2.6097105508870217e-05,
"loss": 0.7054,
"num_input_tokens_seen": 72096,
"step": 560
},
{
"epoch": 0.5275443510737629,
"grad_norm": 13.169844627380371,
"learning_rate": 2.6330532212885155e-05,
"loss": 1.5116,
"num_input_tokens_seen": 72784,
"step": 565
},
{
"epoch": 0.5322128851540616,
"grad_norm": 16.326101303100586,
"learning_rate": 2.6563958916900094e-05,
"loss": 0.9846,
"num_input_tokens_seen": 73440,
"step": 570
},
{
"epoch": 0.5368814192343604,
"grad_norm": 13.680533409118652,
"learning_rate": 2.6797385620915033e-05,
"loss": 0.6246,
"num_input_tokens_seen": 74096,
"step": 575
},
{
"epoch": 0.5415499533146592,
"grad_norm": 19.503694534301758,
"learning_rate": 2.7030812324929972e-05,
"loss": 1.4243,
"num_input_tokens_seen": 74800,
"step": 580
},
{
"epoch": 0.5462184873949579,
"grad_norm": 7.272068977355957,
"learning_rate": 2.726423902894491e-05,
"loss": 0.5295,
"num_input_tokens_seen": 75344,
"step": 585
},
{
"epoch": 0.5508870214752568,
"grad_norm": 9.080270767211914,
"learning_rate": 2.7497665732959853e-05,
"loss": 0.7331,
"num_input_tokens_seen": 75920,
"step": 590
},
{
"epoch": 0.5555555555555556,
"grad_norm": 12.239612579345703,
"learning_rate": 2.773109243697479e-05,
"loss": 1.0243,
"num_input_tokens_seen": 76480,
"step": 595
},
{
"epoch": 0.5602240896358543,
"grad_norm": 5.2204790115356445,
"learning_rate": 2.796451914098973e-05,
"loss": 0.7178,
"num_input_tokens_seen": 77248,
"step": 600
},
{
"epoch": 0.5648926237161531,
"grad_norm": 23.551700592041016,
"learning_rate": 2.819794584500467e-05,
"loss": 0.7556,
"num_input_tokens_seen": 77920,
"step": 605
},
{
"epoch": 0.5695611577964519,
"grad_norm": 32.25157928466797,
"learning_rate": 2.8431372549019608e-05,
"loss": 0.8648,
"num_input_tokens_seen": 78560,
"step": 610
},
{
"epoch": 0.5742296918767507,
"grad_norm": 29.887779235839844,
"learning_rate": 2.8664799253034546e-05,
"loss": 0.8888,
"num_input_tokens_seen": 79184,
"step": 615
},
{
"epoch": 0.5788982259570495,
"grad_norm": 2.3339264392852783,
"learning_rate": 2.8898225957049485e-05,
"loss": 0.5831,
"num_input_tokens_seen": 79776,
"step": 620
},
{
"epoch": 0.5835667600373483,
"grad_norm": 7.388897895812988,
"learning_rate": 2.913165266106443e-05,
"loss": 0.8904,
"num_input_tokens_seen": 80368,
"step": 625
},
{
"epoch": 0.5882352941176471,
"grad_norm": 11.283287048339844,
"learning_rate": 2.9365079365079366e-05,
"loss": 0.6893,
"num_input_tokens_seen": 81040,
"step": 630
},
{
"epoch": 0.5929038281979458,
"grad_norm": 80.2661361694336,
"learning_rate": 2.9598506069094305e-05,
"loss": 0.7345,
"num_input_tokens_seen": 81648,
"step": 635
},
{
"epoch": 0.5975723622782446,
"grad_norm": 7.673534870147705,
"learning_rate": 2.9831932773109244e-05,
"loss": 0.8234,
"num_input_tokens_seen": 82208,
"step": 640
},
{
"epoch": 0.6022408963585434,
"grad_norm": 8.291051864624023,
"learning_rate": 3.0065359477124182e-05,
"loss": 0.9744,
"num_input_tokens_seen": 82784,
"step": 645
},
{
"epoch": 0.6069094304388422,
"grad_norm": 6.479700088500977,
"learning_rate": 3.029878618113912e-05,
"loss": 0.6968,
"num_input_tokens_seen": 83424,
"step": 650
},
{
"epoch": 0.611577964519141,
"grad_norm": 13.389955520629883,
"learning_rate": 3.053221288515406e-05,
"loss": 0.3775,
"num_input_tokens_seen": 84112,
"step": 655
},
{
"epoch": 0.6162464985994398,
"grad_norm": 9.457218170166016,
"learning_rate": 3.0765639589169e-05,
"loss": 0.919,
"num_input_tokens_seen": 84720,
"step": 660
},
{
"epoch": 0.6209150326797386,
"grad_norm": 8.425762176513672,
"learning_rate": 3.099906629318394e-05,
"loss": 1.012,
"num_input_tokens_seen": 85264,
"step": 665
},
{
"epoch": 0.6255835667600373,
"grad_norm": 8.807003021240234,
"learning_rate": 3.123249299719888e-05,
"loss": 0.6742,
"num_input_tokens_seen": 85840,
"step": 670
},
{
"epoch": 0.6302521008403361,
"grad_norm": 5.953861713409424,
"learning_rate": 3.146591970121382e-05,
"loss": 0.6991,
"num_input_tokens_seen": 86464,
"step": 675
},
{
"epoch": 0.6349206349206349,
"grad_norm": 6.615007400512695,
"learning_rate": 3.169934640522876e-05,
"loss": 0.5356,
"num_input_tokens_seen": 87104,
"step": 680
},
{
"epoch": 0.6395891690009337,
"grad_norm": 9.70020580291748,
"learning_rate": 3.1932773109243696e-05,
"loss": 0.6897,
"num_input_tokens_seen": 87792,
"step": 685
},
{
"epoch": 0.6442577030812325,
"grad_norm": 13.47252082824707,
"learning_rate": 3.216619981325864e-05,
"loss": 1.0909,
"num_input_tokens_seen": 88480,
"step": 690
},
{
"epoch": 0.6489262371615313,
"grad_norm": 11.896513938903809,
"learning_rate": 3.239962651727358e-05,
"loss": 1.3468,
"num_input_tokens_seen": 89024,
"step": 695
},
{
"epoch": 0.6535947712418301,
"grad_norm": 11.38027572631836,
"learning_rate": 3.263305322128852e-05,
"loss": 0.7407,
"num_input_tokens_seen": 89632,
"step": 700
},
{
"epoch": 0.6582633053221288,
"grad_norm": 8.730727195739746,
"learning_rate": 3.286647992530346e-05,
"loss": 0.866,
"num_input_tokens_seen": 90256,
"step": 705
},
{
"epoch": 0.6629318394024276,
"grad_norm": 13.638816833496094,
"learning_rate": 3.3099906629318396e-05,
"loss": 0.9243,
"num_input_tokens_seen": 90864,
"step": 710
},
{
"epoch": 0.6676003734827264,
"grad_norm": 4.575711250305176,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.8537,
"num_input_tokens_seen": 91584,
"step": 715
},
{
"epoch": 0.6722689075630253,
"grad_norm": 9.954397201538086,
"learning_rate": 3.3566760037348274e-05,
"loss": 0.5893,
"num_input_tokens_seen": 92192,
"step": 720
},
{
"epoch": 0.676937441643324,
"grad_norm": 29.623050689697266,
"learning_rate": 3.380018674136321e-05,
"loss": 0.7864,
"num_input_tokens_seen": 92832,
"step": 725
},
{
"epoch": 0.6816059757236228,
"grad_norm": 4.245872974395752,
"learning_rate": 3.403361344537815e-05,
"loss": 0.7591,
"num_input_tokens_seen": 93632,
"step": 730
},
{
"epoch": 0.6862745098039216,
"grad_norm": 18.07567024230957,
"learning_rate": 3.426704014939309e-05,
"loss": 0.8031,
"num_input_tokens_seen": 94256,
"step": 735
},
{
"epoch": 0.6909430438842203,
"grad_norm": 12.143633842468262,
"learning_rate": 3.450046685340803e-05,
"loss": 1.1466,
"num_input_tokens_seen": 94832,
"step": 740
},
{
"epoch": 0.6956115779645191,
"grad_norm": 32.405723571777344,
"learning_rate": 3.473389355742297e-05,
"loss": 0.9448,
"num_input_tokens_seen": 95440,
"step": 745
},
{
"epoch": 0.7002801120448179,
"grad_norm": 11.133152961730957,
"learning_rate": 3.4967320261437906e-05,
"loss": 0.6633,
"num_input_tokens_seen": 96208,
"step": 750
},
{
"epoch": 0.7049486461251168,
"grad_norm": 5.159026145935059,
"learning_rate": 3.520074696545285e-05,
"loss": 0.747,
"num_input_tokens_seen": 96992,
"step": 755
},
{
"epoch": 0.7096171802054155,
"grad_norm": 9.558073997497559,
"learning_rate": 3.543417366946779e-05,
"loss": 0.8285,
"num_input_tokens_seen": 97600,
"step": 760
},
{
"epoch": 0.7142857142857143,
"grad_norm": 7.429594993591309,
"learning_rate": 3.566760037348273e-05,
"loss": 0.6289,
"num_input_tokens_seen": 98384,
"step": 765
},
{
"epoch": 0.7189542483660131,
"grad_norm": 9.359416961669922,
"learning_rate": 3.590102707749767e-05,
"loss": 1.4207,
"num_input_tokens_seen": 99008,
"step": 770
},
{
"epoch": 0.7236227824463118,
"grad_norm": 57.07271957397461,
"learning_rate": 3.613445378151261e-05,
"loss": 1.0578,
"num_input_tokens_seen": 99648,
"step": 775
},
{
"epoch": 0.7282913165266106,
"grad_norm": 13.815699577331543,
"learning_rate": 3.6367880485527545e-05,
"loss": 0.9202,
"num_input_tokens_seen": 100368,
"step": 780
},
{
"epoch": 0.7329598506069094,
"grad_norm": 7.66619873046875,
"learning_rate": 3.6601307189542484e-05,
"loss": 0.9465,
"num_input_tokens_seen": 101056,
"step": 785
},
{
"epoch": 0.7376283846872083,
"grad_norm": 5.203883171081543,
"learning_rate": 3.683473389355743e-05,
"loss": 0.5311,
"num_input_tokens_seen": 101744,
"step": 790
},
{
"epoch": 0.742296918767507,
"grad_norm": 19.66509437561035,
"learning_rate": 3.706816059757237e-05,
"loss": 1.1779,
"num_input_tokens_seen": 102400,
"step": 795
},
{
"epoch": 0.7469654528478058,
"grad_norm": 10.233268737792969,
"learning_rate": 3.730158730158731e-05,
"loss": 0.7842,
"num_input_tokens_seen": 103120,
"step": 800
},
{
"epoch": 0.7516339869281046,
"grad_norm": 4.705903053283691,
"learning_rate": 3.753501400560224e-05,
"loss": 0.6265,
"num_input_tokens_seen": 103776,
"step": 805
},
{
"epoch": 0.7563025210084033,
"grad_norm": 6.988365173339844,
"learning_rate": 3.776844070961718e-05,
"loss": 0.7407,
"num_input_tokens_seen": 104432,
"step": 810
},
{
"epoch": 0.7609710550887021,
"grad_norm": 9.381958961486816,
"learning_rate": 3.800186741363212e-05,
"loss": 0.5331,
"num_input_tokens_seen": 105136,
"step": 815
},
{
"epoch": 0.765639589169001,
"grad_norm": 6.593667984008789,
"learning_rate": 3.8235294117647055e-05,
"loss": 0.6444,
"num_input_tokens_seen": 105760,
"step": 820
},
{
"epoch": 0.7703081232492998,
"grad_norm": 6.7614569664001465,
"learning_rate": 3.8468720821662e-05,
"loss": 0.9035,
"num_input_tokens_seen": 106416,
"step": 825
},
{
"epoch": 0.7749766573295985,
"grad_norm": 29.56807518005371,
"learning_rate": 3.870214752567694e-05,
"loss": 0.6458,
"num_input_tokens_seen": 107264,
"step": 830
},
{
"epoch": 0.7796451914098973,
"grad_norm": 2.848146915435791,
"learning_rate": 3.893557422969188e-05,
"loss": 0.5388,
"num_input_tokens_seen": 107952,
"step": 835
},
{
"epoch": 0.7843137254901961,
"grad_norm": 4.524416446685791,
"learning_rate": 3.916900093370682e-05,
"loss": 0.843,
"num_input_tokens_seen": 108560,
"step": 840
},
{
"epoch": 0.7889822595704948,
"grad_norm": 13.986185073852539,
"learning_rate": 3.9402427637721756e-05,
"loss": 0.8335,
"num_input_tokens_seen": 109216,
"step": 845
},
{
"epoch": 0.7936507936507936,
"grad_norm": 7.887524604797363,
"learning_rate": 3.9635854341736695e-05,
"loss": 1.1149,
"num_input_tokens_seen": 109824,
"step": 850
},
{
"epoch": 0.7983193277310925,
"grad_norm": 10.166050910949707,
"learning_rate": 3.986928104575164e-05,
"loss": 0.9014,
"num_input_tokens_seen": 110496,
"step": 855
},
{
"epoch": 0.8029878618113913,
"grad_norm": 3.036750555038452,
"learning_rate": 4.010270774976658e-05,
"loss": 0.4639,
"num_input_tokens_seen": 111248,
"step": 860
},
{
"epoch": 0.80765639589169,
"grad_norm": 8.926546096801758,
"learning_rate": 4.033613445378152e-05,
"loss": 0.8944,
"num_input_tokens_seen": 111888,
"step": 865
},
{
"epoch": 0.8123249299719888,
"grad_norm": 14.85969066619873,
"learning_rate": 4.0569561157796457e-05,
"loss": 0.6904,
"num_input_tokens_seen": 112496,
"step": 870
},
{
"epoch": 0.8169934640522876,
"grad_norm": 8.692270278930664,
"learning_rate": 4.0802987861811395e-05,
"loss": 1.0703,
"num_input_tokens_seen": 113152,
"step": 875
},
{
"epoch": 0.8216619981325863,
"grad_norm": 2.891441822052002,
"learning_rate": 4.1036414565826334e-05,
"loss": 0.6687,
"num_input_tokens_seen": 113856,
"step": 880
},
{
"epoch": 0.8263305322128851,
"grad_norm": 9.000094413757324,
"learning_rate": 4.126984126984127e-05,
"loss": 0.7608,
"num_input_tokens_seen": 114512,
"step": 885
},
{
"epoch": 0.830999066293184,
"grad_norm": 4.151757717132568,
"learning_rate": 4.150326797385621e-05,
"loss": 0.6469,
"num_input_tokens_seen": 115168,
"step": 890
},
{
"epoch": 0.8356676003734828,
"grad_norm": 16.637250900268555,
"learning_rate": 4.173669467787115e-05,
"loss": 0.5817,
"num_input_tokens_seen": 115728,
"step": 895
},
{
"epoch": 0.8403361344537815,
"grad_norm": 17.092973709106445,
"learning_rate": 4.197012138188609e-05,
"loss": 0.9298,
"num_input_tokens_seen": 116352,
"step": 900
},
{
"epoch": 0.8450046685340803,
"grad_norm": 3.194045305252075,
"learning_rate": 4.220354808590103e-05,
"loss": 0.8504,
"num_input_tokens_seen": 117008,
"step": 905
},
{
"epoch": 0.8496732026143791,
"grad_norm": 14.898492813110352,
"learning_rate": 4.2436974789915967e-05,
"loss": 0.5924,
"num_input_tokens_seen": 117776,
"step": 910
},
{
"epoch": 0.8543417366946778,
"grad_norm": 12.749673843383789,
"learning_rate": 4.2670401493930905e-05,
"loss": 0.7349,
"num_input_tokens_seen": 118368,
"step": 915
},
{
"epoch": 0.8590102707749766,
"grad_norm": 7.691690921783447,
"learning_rate": 4.2903828197945844e-05,
"loss": 0.7437,
"num_input_tokens_seen": 119024,
"step": 920
},
{
"epoch": 0.8636788048552755,
"grad_norm": 7.8026442527771,
"learning_rate": 4.313725490196079e-05,
"loss": 0.5653,
"num_input_tokens_seen": 119632,
"step": 925
},
{
"epoch": 0.8683473389355743,
"grad_norm": 2.568347692489624,
"learning_rate": 4.337068160597573e-05,
"loss": 0.9791,
"num_input_tokens_seen": 120240,
"step": 930
},
{
"epoch": 0.873015873015873,
"grad_norm": 5.230515480041504,
"learning_rate": 4.360410830999067e-05,
"loss": 0.976,
"num_input_tokens_seen": 120832,
"step": 935
},
{
"epoch": 0.8776844070961718,
"grad_norm": 14.007842063903809,
"learning_rate": 4.3837535014005606e-05,
"loss": 0.9204,
"num_input_tokens_seen": 121488,
"step": 940
},
{
"epoch": 0.8823529411764706,
"grad_norm": 7.546886920928955,
"learning_rate": 4.4070961718020545e-05,
"loss": 0.6922,
"num_input_tokens_seen": 122112,
"step": 945
},
{
"epoch": 0.8870214752567693,
"grad_norm": 6.620900630950928,
"learning_rate": 4.430438842203548e-05,
"loss": 0.6913,
"num_input_tokens_seen": 122736,
"step": 950
},
{
"epoch": 0.8916900093370682,
"grad_norm": 1.7955033779144287,
"learning_rate": 4.453781512605042e-05,
"loss": 1.0533,
"num_input_tokens_seen": 123328,
"step": 955
},
{
"epoch": 0.896358543417367,
"grad_norm": 16.979063034057617,
"learning_rate": 4.477124183006536e-05,
"loss": 0.7929,
"num_input_tokens_seen": 124016,
"step": 960
},
{
"epoch": 0.9010270774976658,
"grad_norm": 4.907767295837402,
"learning_rate": 4.50046685340803e-05,
"loss": 0.7584,
"num_input_tokens_seen": 124672,
"step": 965
},
{
"epoch": 0.9056956115779645,
"grad_norm": 10.652020454406738,
"learning_rate": 4.523809523809524e-05,
"loss": 0.9878,
"num_input_tokens_seen": 125376,
"step": 970
},
{
"epoch": 0.9103641456582633,
"grad_norm": 6.290508270263672,
"learning_rate": 4.547152194211018e-05,
"loss": 0.7448,
"num_input_tokens_seen": 126032,
"step": 975
},
{
"epoch": 0.9150326797385621,
"grad_norm": 9.92817497253418,
"learning_rate": 4.5704948646125116e-05,
"loss": 0.8184,
"num_input_tokens_seen": 126640,
"step": 980
},
{
"epoch": 0.9197012138188608,
"grad_norm": 3.918785333633423,
"learning_rate": 4.5938375350140055e-05,
"loss": 0.4832,
"num_input_tokens_seen": 127360,
"step": 985
},
{
"epoch": 0.9243697478991597,
"grad_norm": 7.656343936920166,
"learning_rate": 4.6171802054155e-05,
"loss": 0.7947,
"num_input_tokens_seen": 128000,
"step": 990
},
{
"epoch": 0.9290382819794585,
"grad_norm": 9.467264175415039,
"learning_rate": 4.640522875816994e-05,
"loss": 0.9679,
"num_input_tokens_seen": 128672,
"step": 995
},
{
"epoch": 0.9337068160597572,
"grad_norm": 8.756389617919922,
"learning_rate": 4.663865546218488e-05,
"loss": 0.5004,
"num_input_tokens_seen": 129328,
"step": 1000
},
{
"epoch": 0.938375350140056,
"grad_norm": 5.837770938873291,
"learning_rate": 4.6872082166199816e-05,
"loss": 0.6132,
"num_input_tokens_seen": 129984,
"step": 1005
},
{
"epoch": 0.9430438842203548,
"grad_norm": 4.566342830657959,
"learning_rate": 4.7105508870214755e-05,
"loss": 0.6422,
"num_input_tokens_seen": 130720,
"step": 1010
},
{
"epoch": 0.9477124183006536,
"grad_norm": 8.053881645202637,
"learning_rate": 4.7338935574229694e-05,
"loss": 0.419,
"num_input_tokens_seen": 131392,
"step": 1015
},
{
"epoch": 0.9523809523809523,
"grad_norm": 12.437357902526855,
"learning_rate": 4.757236227824463e-05,
"loss": 0.9272,
"num_input_tokens_seen": 132064,
"step": 1020
},
{
"epoch": 0.9570494864612512,
"grad_norm": 5.731832504272461,
"learning_rate": 4.780578898225958e-05,
"loss": 0.4189,
"num_input_tokens_seen": 132704,
"step": 1025
},
{
"epoch": 0.96171802054155,
"grad_norm": 5.499335289001465,
"learning_rate": 4.803921568627452e-05,
"loss": 0.5612,
"num_input_tokens_seen": 133344,
"step": 1030
},
{
"epoch": 0.9663865546218487,
"grad_norm": 7.233240604400635,
"learning_rate": 4.827264239028945e-05,
"loss": 0.4785,
"num_input_tokens_seen": 133952,
"step": 1035
},
{
"epoch": 0.9710550887021475,
"grad_norm": 38.50244903564453,
"learning_rate": 4.850606909430439e-05,
"loss": 0.5721,
"num_input_tokens_seen": 134512,
"step": 1040
},
{
"epoch": 0.9757236227824463,
"grad_norm": 8.803295135498047,
"learning_rate": 4.8739495798319326e-05,
"loss": 0.5325,
"num_input_tokens_seen": 135136,
"step": 1045
},
{
"epoch": 0.9803921568627451,
"grad_norm": 6.8154826164245605,
"learning_rate": 4.8972922502334265e-05,
"loss": 0.7822,
"num_input_tokens_seen": 135728,
"step": 1050
},
{
"epoch": 0.9850606909430439,
"grad_norm": 10.711194038391113,
"learning_rate": 4.9206349206349204e-05,
"loss": 1.1347,
"num_input_tokens_seen": 136336,
"step": 1055
},
{
"epoch": 0.9897292250233427,
"grad_norm": 4.25268030166626,
"learning_rate": 4.943977591036415e-05,
"loss": 0.8395,
"num_input_tokens_seen": 136976,
"step": 1060
},
{
"epoch": 0.9943977591036415,
"grad_norm": 14.6632661819458,
"learning_rate": 4.967320261437909e-05,
"loss": 0.9144,
"num_input_tokens_seen": 137584,
"step": 1065
},
{
"epoch": 0.9990662931839402,
"grad_norm": 21.839466094970703,
"learning_rate": 4.990662931839403e-05,
"loss": 1.0361,
"num_input_tokens_seen": 138160,
"step": 1070
},
{
"epoch": 1.0009337068160598,
"eval_loss": 0.7260501980781555,
"eval_runtime": 3.8833,
"eval_samples_per_second": 61.288,
"eval_steps_per_second": 30.644,
"num_input_tokens_seen": 138320,
"step": 1072
},
{
"epoch": 1.003734827264239,
"grad_norm": 4.944736480712891,
"learning_rate": 4.999998804943956e-05,
"loss": 0.7482,
"num_input_tokens_seen": 138640,
"step": 1075
},
{
"epoch": 1.0084033613445378,
"grad_norm": 14.909157752990723,
"learning_rate": 4.999991501827824e-05,
"loss": 0.6911,
"num_input_tokens_seen": 139360,
"step": 1080
},
{
"epoch": 1.0130718954248366,
"grad_norm": 3.459869146347046,
"learning_rate": 4.999977559534957e-05,
"loss": 0.5468,
"num_input_tokens_seen": 139984,
"step": 1085
},
{
"epoch": 1.0177404295051353,
"grad_norm": 4.126331329345703,
"learning_rate": 4.9999569781023795e-05,
"loss": 0.9195,
"num_input_tokens_seen": 140592,
"step": 1090
},
{
"epoch": 1.022408963585434,
"grad_norm": 13.112894058227539,
"learning_rate": 4.99992975758475e-05,
"loss": 1.0261,
"num_input_tokens_seen": 141248,
"step": 1095
},
{
"epoch": 1.0270774976657329,
"grad_norm": 46.097816467285156,
"learning_rate": 4.999895898054357e-05,
"loss": 1.0014,
"num_input_tokens_seen": 141760,
"step": 1100
},
{
"epoch": 1.0317460317460316,
"grad_norm": 9.920419692993164,
"learning_rate": 4.999855399601122e-05,
"loss": 0.9958,
"num_input_tokens_seen": 142416,
"step": 1105
},
{
"epoch": 1.0364145658263306,
"grad_norm": 5.019301891326904,
"learning_rate": 4.999808262332595e-05,
"loss": 0.5371,
"num_input_tokens_seen": 143136,
"step": 1110
},
{
"epoch": 1.0410830999066294,
"grad_norm": 9.689529418945312,
"learning_rate": 4.9997544863739565e-05,
"loss": 0.7149,
"num_input_tokens_seen": 143776,
"step": 1115
},
{
"epoch": 1.0457516339869282,
"grad_norm": 13.740153312683105,
"learning_rate": 4.999694071868019e-05,
"loss": 0.6224,
"num_input_tokens_seen": 144448,
"step": 1120
},
{
"epoch": 1.050420168067227,
"grad_norm": 12.397239685058594,
"learning_rate": 4.999627018975226e-05,
"loss": 0.7541,
"num_input_tokens_seen": 145056,
"step": 1125
},
{
"epoch": 1.0550887021475257,
"grad_norm": 8.042997360229492,
"learning_rate": 4.999553327873645e-05,
"loss": 0.6833,
"num_input_tokens_seen": 145728,
"step": 1130
},
{
"epoch": 1.0597572362278245,
"grad_norm": 4.10331392288208,
"learning_rate": 4.999472998758978e-05,
"loss": 0.5729,
"num_input_tokens_seen": 146400,
"step": 1135
},
{
"epoch": 1.0644257703081232,
"grad_norm": 11.920084953308105,
"learning_rate": 4.999386031844554e-05,
"loss": 1.217,
"num_input_tokens_seen": 146960,
"step": 1140
},
{
"epoch": 1.069094304388422,
"grad_norm": 7.711158275604248,
"learning_rate": 4.999292427361328e-05,
"loss": 0.7875,
"num_input_tokens_seen": 147584,
"step": 1145
},
{
"epoch": 1.0737628384687208,
"grad_norm": 5.348123073577881,
"learning_rate": 4.999192185557884e-05,
"loss": 0.6283,
"num_input_tokens_seen": 148224,
"step": 1150
},
{
"epoch": 1.0784313725490196,
"grad_norm": 7.719697952270508,
"learning_rate": 4.999085306700431e-05,
"loss": 1.2682,
"num_input_tokens_seen": 148816,
"step": 1155
},
{
"epoch": 1.0830999066293183,
"grad_norm": 2.1265199184417725,
"learning_rate": 4.998971791072807e-05,
"loss": 0.7327,
"num_input_tokens_seen": 149568,
"step": 1160
},
{
"epoch": 1.087768440709617,
"grad_norm": 12.70578670501709,
"learning_rate": 4.998851638976472e-05,
"loss": 0.5073,
"num_input_tokens_seen": 150192,
"step": 1165
},
{
"epoch": 1.092436974789916,
"grad_norm": 6.228635787963867,
"learning_rate": 4.9987248507305114e-05,
"loss": 0.7351,
"num_input_tokens_seen": 150848,
"step": 1170
},
{
"epoch": 1.0971055088702149,
"grad_norm": 9.919082641601562,
"learning_rate": 4.998591426671635e-05,
"loss": 0.4393,
"num_input_tokens_seen": 151440,
"step": 1175
},
{
"epoch": 1.1017740429505136,
"grad_norm": 2.2125799655914307,
"learning_rate": 4.998451367154173e-05,
"loss": 0.7022,
"num_input_tokens_seen": 152032,
"step": 1180
},
{
"epoch": 1.1064425770308124,
"grad_norm": 2.3273732662200928,
"learning_rate": 4.998304672550081e-05,
"loss": 0.5521,
"num_input_tokens_seen": 152688,
"step": 1185
},
{
"epoch": 1.1111111111111112,
"grad_norm": 8.704615592956543,
"learning_rate": 4.9981513432489295e-05,
"loss": 1.1852,
"num_input_tokens_seen": 153296,
"step": 1190
},
{
"epoch": 1.11577964519141,
"grad_norm": 6.8140645027160645,
"learning_rate": 4.9979913796579146e-05,
"loss": 0.5501,
"num_input_tokens_seen": 153936,
"step": 1195
},
{
"epoch": 1.1204481792717087,
"grad_norm": 6.674036026000977,
"learning_rate": 4.9978247822018476e-05,
"loss": 0.6271,
"num_input_tokens_seen": 154608,
"step": 1200
},
{
"epoch": 1.1251167133520075,
"grad_norm": 13.860675811767578,
"learning_rate": 4.997651551323158e-05,
"loss": 0.5194,
"num_input_tokens_seen": 155472,
"step": 1205
},
{
"epoch": 1.1297852474323062,
"grad_norm": 6.035935401916504,
"learning_rate": 4.997471687481892e-05,
"loss": 0.8798,
"num_input_tokens_seen": 156112,
"step": 1210
},
{
"epoch": 1.134453781512605,
"grad_norm": 5.109834671020508,
"learning_rate": 4.9972851911557095e-05,
"loss": 0.4041,
"num_input_tokens_seen": 156784,
"step": 1215
},
{
"epoch": 1.1391223155929038,
"grad_norm": 9.794246673583984,
"learning_rate": 4.997092062839885e-05,
"loss": 0.7887,
"num_input_tokens_seen": 157440,
"step": 1220
},
{
"epoch": 1.1437908496732025,
"grad_norm": 6.726025581359863,
"learning_rate": 4.996892303047306e-05,
"loss": 1.0743,
"num_input_tokens_seen": 157952,
"step": 1225
},
{
"epoch": 1.1484593837535013,
"grad_norm": 8.099092483520508,
"learning_rate": 4.996685912308471e-05,
"loss": 0.4954,
"num_input_tokens_seen": 158544,
"step": 1230
},
{
"epoch": 1.1531279178338,
"grad_norm": 5.0507893562316895,
"learning_rate": 4.9964728911714866e-05,
"loss": 0.6247,
"num_input_tokens_seen": 159232,
"step": 1235
},
{
"epoch": 1.1577964519140989,
"grad_norm": 3.901911497116089,
"learning_rate": 4.996253240202069e-05,
"loss": 0.4887,
"num_input_tokens_seen": 159824,
"step": 1240
},
{
"epoch": 1.1624649859943978,
"grad_norm": 9.426101684570312,
"learning_rate": 4.996026959983541e-05,
"loss": 0.6639,
"num_input_tokens_seen": 160544,
"step": 1245
},
{
"epoch": 1.1671335200746966,
"grad_norm": 3.252911329269409,
"learning_rate": 4.995794051116831e-05,
"loss": 0.551,
"num_input_tokens_seen": 161216,
"step": 1250
},
{
"epoch": 1.1718020541549954,
"grad_norm": 9.056844711303711,
"learning_rate": 4.99555451422047e-05,
"loss": 1.3325,
"num_input_tokens_seen": 161920,
"step": 1255
},
{
"epoch": 1.1764705882352942,
"grad_norm": 9.504724502563477,
"learning_rate": 4.99530834993059e-05,
"loss": 0.9963,
"num_input_tokens_seen": 162560,
"step": 1260
},
{
"epoch": 1.181139122315593,
"grad_norm": 4.32997989654541,
"learning_rate": 4.9950555589009255e-05,
"loss": 0.5409,
"num_input_tokens_seen": 163296,
"step": 1265
},
{
"epoch": 1.1858076563958917,
"grad_norm": 5.476011753082275,
"learning_rate": 4.994796141802809e-05,
"loss": 0.7473,
"num_input_tokens_seen": 163952,
"step": 1270
},
{
"epoch": 1.1904761904761905,
"grad_norm": 4.545778751373291,
"learning_rate": 4.994530099325169e-05,
"loss": 0.6341,
"num_input_tokens_seen": 164544,
"step": 1275
},
{
"epoch": 1.1951447245564892,
"grad_norm": 3.5914573669433594,
"learning_rate": 4.994257432174529e-05,
"loss": 0.4417,
"num_input_tokens_seen": 165216,
"step": 1280
},
{
"epoch": 1.199813258636788,
"grad_norm": 6.310541152954102,
"learning_rate": 4.9939781410750055e-05,
"loss": 0.699,
"num_input_tokens_seen": 165936,
"step": 1285
},
{
"epoch": 1.2044817927170868,
"grad_norm": 40.967552185058594,
"learning_rate": 4.993692226768306e-05,
"loss": 0.6085,
"num_input_tokens_seen": 166672,
"step": 1290
},
{
"epoch": 1.2091503267973855,
"grad_norm": 12.131630897521973,
"learning_rate": 4.993399690013727e-05,
"loss": 0.8548,
"num_input_tokens_seen": 167328,
"step": 1295
},
{
"epoch": 1.2138188608776843,
"grad_norm": 5.9977803230285645,
"learning_rate": 4.993100531588154e-05,
"loss": 0.4468,
"num_input_tokens_seen": 167920,
"step": 1300
},
{
"epoch": 1.2184873949579833,
"grad_norm": 6.294752597808838,
"learning_rate": 4.992794752286054e-05,
"loss": 0.5278,
"num_input_tokens_seen": 168496,
"step": 1305
},
{
"epoch": 1.223155929038282,
"grad_norm": 5.4937424659729,
"learning_rate": 4.99248235291948e-05,
"loss": 0.6388,
"num_input_tokens_seen": 169120,
"step": 1310
},
{
"epoch": 1.2278244631185808,
"grad_norm": 12.049308776855469,
"learning_rate": 4.9921633343180654e-05,
"loss": 0.4561,
"num_input_tokens_seen": 169808,
"step": 1315
},
{
"epoch": 1.2324929971988796,
"grad_norm": 2.764819860458374,
"learning_rate": 4.99183769732902e-05,
"loss": 0.6871,
"num_input_tokens_seen": 170432,
"step": 1320
},
{
"epoch": 1.2371615312791784,
"grad_norm": 4.406725883483887,
"learning_rate": 4.991505442817131e-05,
"loss": 0.5468,
"num_input_tokens_seen": 171152,
"step": 1325
},
{
"epoch": 1.2418300653594772,
"grad_norm": 3.939741611480713,
"learning_rate": 4.9911665716647624e-05,
"loss": 0.9366,
"num_input_tokens_seen": 171776,
"step": 1330
},
{
"epoch": 1.246498599439776,
"grad_norm": 3.4890053272247314,
"learning_rate": 4.990821084771845e-05,
"loss": 0.5881,
"num_input_tokens_seen": 172528,
"step": 1335
},
{
"epoch": 1.2511671335200747,
"grad_norm": 11.309175491333008,
"learning_rate": 4.990468983055883e-05,
"loss": 0.954,
"num_input_tokens_seen": 173200,
"step": 1340
},
{
"epoch": 1.2558356676003735,
"grad_norm": 1.9080860614776611,
"learning_rate": 4.990110267451944e-05,
"loss": 0.6237,
"num_input_tokens_seen": 173840,
"step": 1345
},
{
"epoch": 1.2605042016806722,
"grad_norm": 8.571795463562012,
"learning_rate": 4.989744938912663e-05,
"loss": 0.5249,
"num_input_tokens_seen": 174560,
"step": 1350
},
{
"epoch": 1.265172735760971,
"grad_norm": 7.339284896850586,
"learning_rate": 4.989372998408236e-05,
"loss": 1.1344,
"num_input_tokens_seen": 175248,
"step": 1355
},
{
"epoch": 1.2698412698412698,
"grad_norm": 4.928005695343018,
"learning_rate": 4.9889944469264166e-05,
"loss": 0.8068,
"num_input_tokens_seen": 175888,
"step": 1360
},
{
"epoch": 1.2745098039215685,
"grad_norm": 6.07223653793335,
"learning_rate": 4.988609285472517e-05,
"loss": 0.7017,
"num_input_tokens_seen": 176560,
"step": 1365
},
{
"epoch": 1.2791783380018673,
"grad_norm": 6.8295698165893555,
"learning_rate": 4.988217515069403e-05,
"loss": 0.9394,
"num_input_tokens_seen": 177136,
"step": 1370
},
{
"epoch": 1.283846872082166,
"grad_norm": 11.6112642288208,
"learning_rate": 4.98781913675749e-05,
"loss": 0.6165,
"num_input_tokens_seen": 177728,
"step": 1375
},
{
"epoch": 1.2885154061624648,
"grad_norm": 4.467996597290039,
"learning_rate": 4.9874141515947456e-05,
"loss": 0.3883,
"num_input_tokens_seen": 178352,
"step": 1380
},
{
"epoch": 1.2931839402427638,
"grad_norm": 3.5836374759674072,
"learning_rate": 4.987002560656678e-05,
"loss": 0.6349,
"num_input_tokens_seen": 178928,
"step": 1385
},
{
"epoch": 1.2978524743230626,
"grad_norm": 13.943968772888184,
"learning_rate": 4.986584365036343e-05,
"loss": 0.5884,
"num_input_tokens_seen": 179568,
"step": 1390
},
{
"epoch": 1.3025210084033614,
"grad_norm": 10.653841972351074,
"learning_rate": 4.986159565844333e-05,
"loss": 0.8191,
"num_input_tokens_seen": 180272,
"step": 1395
},
{
"epoch": 1.3071895424836601,
"grad_norm": 6.080850124359131,
"learning_rate": 4.9857281642087785e-05,
"loss": 0.7922,
"num_input_tokens_seen": 180912,
"step": 1400
},
{
"epoch": 1.311858076563959,
"grad_norm": 2.8319334983825684,
"learning_rate": 4.985290161275345e-05,
"loss": 0.769,
"num_input_tokens_seen": 181536,
"step": 1405
},
{
"epoch": 1.3165266106442577,
"grad_norm": 3.922654867172241,
"learning_rate": 4.9848455582072265e-05,
"loss": 0.4264,
"num_input_tokens_seen": 182128,
"step": 1410
},
{
"epoch": 1.3211951447245565,
"grad_norm": 2.9652132987976074,
"learning_rate": 4.984394356185148e-05,
"loss": 0.5366,
"num_input_tokens_seen": 182816,
"step": 1415
},
{
"epoch": 1.3258636788048552,
"grad_norm": 7.564930438995361,
"learning_rate": 4.983936556407357e-05,
"loss": 0.6557,
"num_input_tokens_seen": 183408,
"step": 1420
},
{
"epoch": 1.330532212885154,
"grad_norm": 3.050708770751953,
"learning_rate": 4.983472160089623e-05,
"loss": 0.7718,
"num_input_tokens_seen": 184064,
"step": 1425
},
{
"epoch": 1.3352007469654528,
"grad_norm": 8.317185401916504,
"learning_rate": 4.983001168465234e-05,
"loss": 0.547,
"num_input_tokens_seen": 184704,
"step": 1430
},
{
"epoch": 1.3398692810457518,
"grad_norm": 3.655439853668213,
"learning_rate": 4.982523582784992e-05,
"loss": 0.6278,
"num_input_tokens_seen": 185280,
"step": 1435
},
{
"epoch": 1.3445378151260505,
"grad_norm": 8.60561752319336,
"learning_rate": 4.9820394043172136e-05,
"loss": 0.7132,
"num_input_tokens_seen": 185872,
"step": 1440
},
{
"epoch": 1.3492063492063493,
"grad_norm": 8.540854454040527,
"learning_rate": 4.98154863434772e-05,
"loss": 0.7167,
"num_input_tokens_seen": 186432,
"step": 1445
},
{
"epoch": 1.353874883286648,
"grad_norm": 11.289070129394531,
"learning_rate": 4.98105127417984e-05,
"loss": 0.7061,
"num_input_tokens_seen": 187120,
"step": 1450
},
{
"epoch": 1.3585434173669468,
"grad_norm": 8.960957527160645,
"learning_rate": 4.980547325134401e-05,
"loss": 1.1768,
"num_input_tokens_seen": 187648,
"step": 1455
},
{
"epoch": 1.3632119514472456,
"grad_norm": 9.159289360046387,
"learning_rate": 4.980036788549733e-05,
"loss": 0.6238,
"num_input_tokens_seen": 188288,
"step": 1460
},
{
"epoch": 1.3678804855275444,
"grad_norm": 9.342123985290527,
"learning_rate": 4.9795196657816564e-05,
"loss": 0.6563,
"num_input_tokens_seen": 188880,
"step": 1465
},
{
"epoch": 1.3725490196078431,
"grad_norm": 5.995706081390381,
"learning_rate": 4.978995958203484e-05,
"loss": 0.6132,
"num_input_tokens_seen": 189488,
"step": 1470
},
{
"epoch": 1.377217553688142,
"grad_norm": 14.103466987609863,
"learning_rate": 4.978465667206015e-05,
"loss": 0.6597,
"num_input_tokens_seen": 190112,
"step": 1475
},
{
"epoch": 1.3818860877684407,
"grad_norm": 4.49061918258667,
"learning_rate": 4.977928794197532e-05,
"loss": 0.8417,
"num_input_tokens_seen": 190816,
"step": 1480
},
{
"epoch": 1.3865546218487395,
"grad_norm": 5.403022766113281,
"learning_rate": 4.977385340603798e-05,
"loss": 0.4915,
"num_input_tokens_seen": 191456,
"step": 1485
},
{
"epoch": 1.3912231559290382,
"grad_norm": 7.392147541046143,
"learning_rate": 4.976835307868053e-05,
"loss": 1.1056,
"num_input_tokens_seen": 192192,
"step": 1490
},
{
"epoch": 1.395891690009337,
"grad_norm": 2.5143346786499023,
"learning_rate": 4.976278697451006e-05,
"loss": 0.4763,
"num_input_tokens_seen": 192832,
"step": 1495
},
{
"epoch": 1.4005602240896358,
"grad_norm": 5.508999824523926,
"learning_rate": 4.975715510830837e-05,
"loss": 0.7842,
"num_input_tokens_seen": 193504,
"step": 1500
},
{
"epoch": 1.4052287581699345,
"grad_norm": 12.90170955657959,
"learning_rate": 4.9751457495031894e-05,
"loss": 0.7676,
"num_input_tokens_seen": 194112,
"step": 1505
},
{
"epoch": 1.4098972922502333,
"grad_norm": 4.344299793243408,
"learning_rate": 4.974569414981166e-05,
"loss": 0.9106,
"num_input_tokens_seen": 194784,
"step": 1510
},
{
"epoch": 1.4145658263305323,
"grad_norm": 3.9784064292907715,
"learning_rate": 4.973986508795327e-05,
"loss": 0.4855,
"num_input_tokens_seen": 195456,
"step": 1515
},
{
"epoch": 1.419234360410831,
"grad_norm": 8.706938743591309,
"learning_rate": 4.9733970324936855e-05,
"loss": 0.6186,
"num_input_tokens_seen": 196064,
"step": 1520
},
{
"epoch": 1.4239028944911298,
"grad_norm": 21.58633804321289,
"learning_rate": 4.9728009876416995e-05,
"loss": 0.7339,
"num_input_tokens_seen": 196688,
"step": 1525
},
{
"epoch": 1.4285714285714286,
"grad_norm": 3.153777599334717,
"learning_rate": 4.972198375822276e-05,
"loss": 0.9472,
"num_input_tokens_seen": 197360,
"step": 1530
},
{
"epoch": 1.4332399626517274,
"grad_norm": 5.13116455078125,
"learning_rate": 4.9715891986357566e-05,
"loss": 0.8951,
"num_input_tokens_seen": 198032,
"step": 1535
},
{
"epoch": 1.4379084967320261,
"grad_norm": 0.6041532754898071,
"learning_rate": 4.9709734576999226e-05,
"loss": 0.4999,
"num_input_tokens_seen": 198624,
"step": 1540
},
{
"epoch": 1.442577030812325,
"grad_norm": 7.3027143478393555,
"learning_rate": 4.9703511546499836e-05,
"loss": 0.7394,
"num_input_tokens_seen": 199312,
"step": 1545
},
{
"epoch": 1.4472455648926237,
"grad_norm": 9.439616203308105,
"learning_rate": 4.969722291138578e-05,
"loss": 0.9974,
"num_input_tokens_seen": 199968,
"step": 1550
},
{
"epoch": 1.4519140989729225,
"grad_norm": 12.853424072265625,
"learning_rate": 4.969086868835765e-05,
"loss": 0.5288,
"num_input_tokens_seen": 200800,
"step": 1555
},
{
"epoch": 1.4565826330532212,
"grad_norm": 1.419742226600647,
"learning_rate": 4.9684448894290236e-05,
"loss": 0.4857,
"num_input_tokens_seen": 201488,
"step": 1560
},
{
"epoch": 1.4612511671335202,
"grad_norm": 3.2149834632873535,
"learning_rate": 4.9677963546232445e-05,
"loss": 0.7109,
"num_input_tokens_seen": 202080,
"step": 1565
},
{
"epoch": 1.465919701213819,
"grad_norm": 3.134138584136963,
"learning_rate": 4.9671412661407296e-05,
"loss": 0.4495,
"num_input_tokens_seen": 202656,
"step": 1570
},
{
"epoch": 1.4705882352941178,
"grad_norm": 2.583625555038452,
"learning_rate": 4.966479625721183e-05,
"loss": 0.6337,
"num_input_tokens_seen": 203376,
"step": 1575
},
{
"epoch": 1.4752567693744165,
"grad_norm": 5.287656307220459,
"learning_rate": 4.9658114351217105e-05,
"loss": 0.5663,
"num_input_tokens_seen": 204016,
"step": 1580
},
{
"epoch": 1.4799253034547153,
"grad_norm": 7.850647926330566,
"learning_rate": 4.965136696116812e-05,
"loss": 0.8005,
"num_input_tokens_seen": 204640,
"step": 1585
},
{
"epoch": 1.484593837535014,
"grad_norm": 2.4759323596954346,
"learning_rate": 4.964455410498378e-05,
"loss": 0.4219,
"num_input_tokens_seen": 205264,
"step": 1590
},
{
"epoch": 1.4892623716153128,
"grad_norm": 1.871109962463379,
"learning_rate": 4.963767580075685e-05,
"loss": 0.506,
"num_input_tokens_seen": 205984,
"step": 1595
},
{
"epoch": 1.4939309056956116,
"grad_norm": 2.148587703704834,
"learning_rate": 4.9630732066753914e-05,
"loss": 0.7896,
"num_input_tokens_seen": 206672,
"step": 1600
},
{
"epoch": 1.4985994397759104,
"grad_norm": 8.374788284301758,
"learning_rate": 4.962372292141529e-05,
"loss": 0.6059,
"num_input_tokens_seen": 207360,
"step": 1605
},
{
"epoch": 1.5014005602240896,
"eval_loss": 0.6730406880378723,
"eval_runtime": 3.8743,
"eval_samples_per_second": 61.431,
"eval_steps_per_second": 30.716,
"num_input_tokens_seen": 207744,
"step": 1608
},
{
"epoch": 1.5032679738562091,
"grad_norm": 4.593263149261475,
"learning_rate": 4.9616648383355037e-05,
"loss": 0.5781,
"num_input_tokens_seen": 207936,
"step": 1610
},
{
"epoch": 1.507936507936508,
"grad_norm": 3.9901912212371826,
"learning_rate": 4.960950847136085e-05,
"loss": 0.6448,
"num_input_tokens_seen": 208528,
"step": 1615
},
{
"epoch": 1.5126050420168067,
"grad_norm": 12.342235565185547,
"learning_rate": 4.9602303204394044e-05,
"loss": 0.6921,
"num_input_tokens_seen": 209216,
"step": 1620
},
{
"epoch": 1.5172735760971054,
"grad_norm": 19.401485443115234,
"learning_rate": 4.9595032601589514e-05,
"loss": 0.4746,
"num_input_tokens_seen": 209856,
"step": 1625
},
{
"epoch": 1.5219421101774042,
"grad_norm": 5.1063971519470215,
"learning_rate": 4.958769668225565e-05,
"loss": 0.5231,
"num_input_tokens_seen": 210480,
"step": 1630
},
{
"epoch": 1.526610644257703,
"grad_norm": 3.45485258102417,
"learning_rate": 4.9580295465874304e-05,
"loss": 0.503,
"num_input_tokens_seen": 211152,
"step": 1635
},
{
"epoch": 1.5312791783380018,
"grad_norm": 8.241432189941406,
"learning_rate": 4.9572828972100734e-05,
"loss": 0.5253,
"num_input_tokens_seen": 211760,
"step": 1640
},
{
"epoch": 1.5359477124183005,
"grad_norm": 4.817863464355469,
"learning_rate": 4.956529722076355e-05,
"loss": 0.5521,
"num_input_tokens_seen": 212400,
"step": 1645
},
{
"epoch": 1.5406162464985993,
"grad_norm": 3.3362035751342773,
"learning_rate": 4.955770023186469e-05,
"loss": 0.4619,
"num_input_tokens_seen": 213040,
"step": 1650
},
{
"epoch": 1.545284780578898,
"grad_norm": 10.945075035095215,
"learning_rate": 4.9550038025579306e-05,
"loss": 1.012,
"num_input_tokens_seen": 213696,
"step": 1655
},
{
"epoch": 1.549953314659197,
"grad_norm": 10.770771980285645,
"learning_rate": 4.954231062225576e-05,
"loss": 0.4708,
"num_input_tokens_seen": 214336,
"step": 1660
},
{
"epoch": 1.5546218487394958,
"grad_norm": 1.7344568967819214,
"learning_rate": 4.9534518042415575e-05,
"loss": 0.3593,
"num_input_tokens_seen": 215040,
"step": 1665
},
{
"epoch": 1.5592903828197946,
"grad_norm": 7.169672012329102,
"learning_rate": 4.9526660306753346e-05,
"loss": 0.7011,
"num_input_tokens_seen": 215696,
"step": 1670
},
{
"epoch": 1.5639589169000934,
"grad_norm": 5.537058353424072,
"learning_rate": 4.95187374361367e-05,
"loss": 0.4733,
"num_input_tokens_seen": 216352,
"step": 1675
},
{
"epoch": 1.5686274509803921,
"grad_norm": 4.959131240844727,
"learning_rate": 4.951074945160623e-05,
"loss": 0.4652,
"num_input_tokens_seen": 216928,
"step": 1680
},
{
"epoch": 1.573295985060691,
"grad_norm": 13.092927932739258,
"learning_rate": 4.950269637437548e-05,
"loss": 0.701,
"num_input_tokens_seen": 217552,
"step": 1685
},
{
"epoch": 1.5779645191409897,
"grad_norm": 18.209341049194336,
"learning_rate": 4.949457822583085e-05,
"loss": 0.5557,
"num_input_tokens_seen": 218208,
"step": 1690
},
{
"epoch": 1.5826330532212887,
"grad_norm": 2.769484758377075,
"learning_rate": 4.9486395027531526e-05,
"loss": 0.5706,
"num_input_tokens_seen": 218784,
"step": 1695
},
{
"epoch": 1.5873015873015874,
"grad_norm": 5.670962810516357,
"learning_rate": 4.947814680120947e-05,
"loss": 0.7001,
"num_input_tokens_seen": 219424,
"step": 1700
},
{
"epoch": 1.5919701213818862,
"grad_norm": 8.110953330993652,
"learning_rate": 4.946983356876932e-05,
"loss": 0.739,
"num_input_tokens_seen": 220032,
"step": 1705
},
{
"epoch": 1.596638655462185,
"grad_norm": 1.141108512878418,
"learning_rate": 4.946145535228837e-05,
"loss": 0.4477,
"num_input_tokens_seen": 220752,
"step": 1710
},
{
"epoch": 1.6013071895424837,
"grad_norm": 6.900874137878418,
"learning_rate": 4.945301217401648e-05,
"loss": 0.6388,
"num_input_tokens_seen": 221376,
"step": 1715
},
{
"epoch": 1.6059757236227825,
"grad_norm": 5.326786518096924,
"learning_rate": 4.944450405637602e-05,
"loss": 0.5577,
"num_input_tokens_seen": 222032,
"step": 1720
},
{
"epoch": 1.6106442577030813,
"grad_norm": 15.054530143737793,
"learning_rate": 4.943593102196183e-05,
"loss": 0.8117,
"num_input_tokens_seen": 222608,
"step": 1725
},
{
"epoch": 1.61531279178338,
"grad_norm": 6.387323379516602,
"learning_rate": 4.942729309354115e-05,
"loss": 0.5248,
"num_input_tokens_seen": 223264,
"step": 1730
},
{
"epoch": 1.6199813258636788,
"grad_norm": 9.268798828125,
"learning_rate": 4.941859029405353e-05,
"loss": 0.6716,
"num_input_tokens_seen": 223904,
"step": 1735
},
{
"epoch": 1.6246498599439776,
"grad_norm": 10.223531723022461,
"learning_rate": 4.940982264661084e-05,
"loss": 0.8909,
"num_input_tokens_seen": 224496,
"step": 1740
},
{
"epoch": 1.6293183940242764,
"grad_norm": 5.692781925201416,
"learning_rate": 4.940099017449714e-05,
"loss": 0.7171,
"num_input_tokens_seen": 225232,
"step": 1745
},
{
"epoch": 1.6339869281045751,
"grad_norm": 4.1930084228515625,
"learning_rate": 4.9392092901168635e-05,
"loss": 0.4088,
"num_input_tokens_seen": 225872,
"step": 1750
},
{
"epoch": 1.638655462184874,
"grad_norm": 5.081546783447266,
"learning_rate": 4.9383130850253645e-05,
"loss": 0.6362,
"num_input_tokens_seen": 226448,
"step": 1755
},
{
"epoch": 1.6433239962651727,
"grad_norm": 4.499953269958496,
"learning_rate": 4.937410404555251e-05,
"loss": 0.658,
"num_input_tokens_seen": 227040,
"step": 1760
},
{
"epoch": 1.6479925303454714,
"grad_norm": 2.329761028289795,
"learning_rate": 4.9365012511037514e-05,
"loss": 0.5597,
"num_input_tokens_seen": 227664,
"step": 1765
},
{
"epoch": 1.6526610644257702,
"grad_norm": 2.832841396331787,
"learning_rate": 4.9355856270852865e-05,
"loss": 0.6842,
"num_input_tokens_seen": 228416,
"step": 1770
},
{
"epoch": 1.657329598506069,
"grad_norm": 2.6941893100738525,
"learning_rate": 4.934663534931462e-05,
"loss": 0.3806,
"num_input_tokens_seen": 229120,
"step": 1775
},
{
"epoch": 1.6619981325863677,
"grad_norm": 5.795032024383545,
"learning_rate": 4.933734977091059e-05,
"loss": 0.8215,
"num_input_tokens_seen": 229792,
"step": 1780
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.9985203742980957,
"learning_rate": 4.9327999560300284e-05,
"loss": 0.6302,
"num_input_tokens_seen": 230432,
"step": 1785
},
{
"epoch": 1.6713352007469653,
"grad_norm": 2.222291946411133,
"learning_rate": 4.931858474231488e-05,
"loss": 0.4667,
"num_input_tokens_seen": 231040,
"step": 1790
},
{
"epoch": 1.6760037348272643,
"grad_norm": 7.770413398742676,
"learning_rate": 4.930910534195712e-05,
"loss": 0.5139,
"num_input_tokens_seen": 231904,
"step": 1795
},
{
"epoch": 1.680672268907563,
"grad_norm": 4.517005443572998,
"learning_rate": 4.9299561384401236e-05,
"loss": 1.2908,
"num_input_tokens_seen": 232480,
"step": 1800
},
{
"epoch": 1.6853408029878618,
"grad_norm": 7.685630798339844,
"learning_rate": 4.928995289499294e-05,
"loss": 0.611,
"num_input_tokens_seen": 233072,
"step": 1805
},
{
"epoch": 1.6900093370681606,
"grad_norm": 9.455100059509277,
"learning_rate": 4.92802798992493e-05,
"loss": 0.5963,
"num_input_tokens_seen": 233712,
"step": 1810
},
{
"epoch": 1.6946778711484594,
"grad_norm": 7.285452842712402,
"learning_rate": 4.92705424228587e-05,
"loss": 0.5151,
"num_input_tokens_seen": 234384,
"step": 1815
},
{
"epoch": 1.6993464052287581,
"grad_norm": 2.7807321548461914,
"learning_rate": 4.926074049168074e-05,
"loss": 0.4471,
"num_input_tokens_seen": 234976,
"step": 1820
},
{
"epoch": 1.7040149393090571,
"grad_norm": 4.179620265960693,
"learning_rate": 4.9250874131746226e-05,
"loss": 0.439,
"num_input_tokens_seen": 235600,
"step": 1825
},
{
"epoch": 1.708683473389356,
"grad_norm": 8.957188606262207,
"learning_rate": 4.924094336925704e-05,
"loss": 0.755,
"num_input_tokens_seen": 236256,
"step": 1830
},
{
"epoch": 1.7133520074696547,
"grad_norm": 2.2546701431274414,
"learning_rate": 4.923094823058612e-05,
"loss": 0.6398,
"num_input_tokens_seen": 236896,
"step": 1835
},
{
"epoch": 1.7180205415499534,
"grad_norm": 3.878314971923828,
"learning_rate": 4.9220888742277336e-05,
"loss": 0.957,
"num_input_tokens_seen": 237552,
"step": 1840
},
{
"epoch": 1.7226890756302522,
"grad_norm": 4.878526210784912,
"learning_rate": 4.921076493104549e-05,
"loss": 0.6211,
"num_input_tokens_seen": 238224,
"step": 1845
},
{
"epoch": 1.727357609710551,
"grad_norm": 10.116549491882324,
"learning_rate": 4.920057682377616e-05,
"loss": 0.5877,
"num_input_tokens_seen": 238912,
"step": 1850
},
{
"epoch": 1.7320261437908497,
"grad_norm": 1.715928077697754,
"learning_rate": 4.9190324447525705e-05,
"loss": 0.4448,
"num_input_tokens_seen": 239616,
"step": 1855
},
{
"epoch": 1.7366946778711485,
"grad_norm": 5.732523441314697,
"learning_rate": 4.918000782952114e-05,
"loss": 0.6933,
"num_input_tokens_seen": 240304,
"step": 1860
},
{
"epoch": 1.7413632119514473,
"grad_norm": 4.6649322509765625,
"learning_rate": 4.916962699716013e-05,
"loss": 0.9146,
"num_input_tokens_seen": 240992,
"step": 1865
},
{
"epoch": 1.746031746031746,
"grad_norm": 5.096378326416016,
"learning_rate": 4.9159181978010814e-05,
"loss": 0.8553,
"num_input_tokens_seen": 241648,
"step": 1870
},
{
"epoch": 1.7507002801120448,
"grad_norm": 3.9121956825256348,
"learning_rate": 4.9148672799811825e-05,
"loss": 0.3869,
"num_input_tokens_seen": 242320,
"step": 1875
},
{
"epoch": 1.7553688141923436,
"grad_norm": 3.4306793212890625,
"learning_rate": 4.9138099490472165e-05,
"loss": 1.2696,
"num_input_tokens_seen": 242912,
"step": 1880
},
{
"epoch": 1.7600373482726424,
"grad_norm": 1.4592175483703613,
"learning_rate": 4.912746207807117e-05,
"loss": 0.3769,
"num_input_tokens_seen": 243616,
"step": 1885
},
{
"epoch": 1.7647058823529411,
"grad_norm": 2.9718809127807617,
"learning_rate": 4.9116760590858404e-05,
"loss": 0.4158,
"num_input_tokens_seen": 244224,
"step": 1890
},
{
"epoch": 1.76937441643324,
"grad_norm": 5.453405857086182,
"learning_rate": 4.9105995057253586e-05,
"loss": 0.5839,
"num_input_tokens_seen": 244832,
"step": 1895
},
{
"epoch": 1.7740429505135387,
"grad_norm": 7.693014621734619,
"learning_rate": 4.9095165505846505e-05,
"loss": 0.6206,
"num_input_tokens_seen": 245520,
"step": 1900
},
{
"epoch": 1.7787114845938374,
"grad_norm": 15.895974159240723,
"learning_rate": 4.9084271965397014e-05,
"loss": 0.7017,
"num_input_tokens_seen": 246176,
"step": 1905
},
{
"epoch": 1.7833800186741362,
"grad_norm": 3.057833433151245,
"learning_rate": 4.9073314464834844e-05,
"loss": 0.4313,
"num_input_tokens_seen": 246848,
"step": 1910
},
{
"epoch": 1.788048552754435,
"grad_norm": 17.901315689086914,
"learning_rate": 4.906229303325961e-05,
"loss": 0.6497,
"num_input_tokens_seen": 247472,
"step": 1915
},
{
"epoch": 1.7927170868347337,
"grad_norm": 4.0249810218811035,
"learning_rate": 4.905120769994072e-05,
"loss": 0.6531,
"num_input_tokens_seen": 248128,
"step": 1920
},
{
"epoch": 1.7973856209150327,
"grad_norm": 0.5758776068687439,
"learning_rate": 4.9040058494317244e-05,
"loss": 0.3771,
"num_input_tokens_seen": 248864,
"step": 1925
},
{
"epoch": 1.8020541549953315,
"grad_norm": 10.201847076416016,
"learning_rate": 4.902884544599792e-05,
"loss": 0.702,
"num_input_tokens_seen": 249616,
"step": 1930
},
{
"epoch": 1.8067226890756303,
"grad_norm": 4.930323600769043,
"learning_rate": 4.901756858476101e-05,
"loss": 0.7461,
"num_input_tokens_seen": 250256,
"step": 1935
},
{
"epoch": 1.811391223155929,
"grad_norm": 4.371157646179199,
"learning_rate": 4.900622794055424e-05,
"loss": 0.8322,
"num_input_tokens_seen": 250864,
"step": 1940
},
{
"epoch": 1.8160597572362278,
"grad_norm": 3.946460485458374,
"learning_rate": 4.899482354349473e-05,
"loss": 0.8725,
"num_input_tokens_seen": 251568,
"step": 1945
},
{
"epoch": 1.8207282913165266,
"grad_norm": 9.607342720031738,
"learning_rate": 4.8983355423868913e-05,
"loss": 1.0061,
"num_input_tokens_seen": 252192,
"step": 1950
},
{
"epoch": 1.8253968253968254,
"grad_norm": 3.2483971118927,
"learning_rate": 4.8971823612132436e-05,
"loss": 0.637,
"num_input_tokens_seen": 252832,
"step": 1955
},
{
"epoch": 1.8300653594771243,
"grad_norm": 3.698613166809082,
"learning_rate": 4.8960228138910106e-05,
"loss": 0.4496,
"num_input_tokens_seen": 253488,
"step": 1960
},
{
"epoch": 1.8347338935574231,
"grad_norm": 7.103636741638184,
"learning_rate": 4.8948569034995765e-05,
"loss": 0.6317,
"num_input_tokens_seen": 254176,
"step": 1965
},
{
"epoch": 1.8394024276377219,
"grad_norm": 8.10537052154541,
"learning_rate": 4.8936846331352284e-05,
"loss": 0.8247,
"num_input_tokens_seen": 254752,
"step": 1970
},
{
"epoch": 1.8440709617180207,
"grad_norm": 5.597965717315674,
"learning_rate": 4.8925060059111394e-05,
"loss": 0.7657,
"num_input_tokens_seen": 255328,
"step": 1975
},
{
"epoch": 1.8487394957983194,
"grad_norm": 3.3380625247955322,
"learning_rate": 4.891321024957366e-05,
"loss": 0.6026,
"num_input_tokens_seen": 255936,
"step": 1980
},
{
"epoch": 1.8534080298786182,
"grad_norm": 3.575791835784912,
"learning_rate": 4.890129693420839e-05,
"loss": 0.7018,
"num_input_tokens_seen": 256608,
"step": 1985
},
{
"epoch": 1.858076563958917,
"grad_norm": 5.416692733764648,
"learning_rate": 4.888932014465352e-05,
"loss": 0.4166,
"num_input_tokens_seen": 257216,
"step": 1990
},
{
"epoch": 1.8627450980392157,
"grad_norm": 9.988777160644531,
"learning_rate": 4.887727991271558e-05,
"loss": 0.6218,
"num_input_tokens_seen": 257952,
"step": 1995
},
{
"epoch": 1.8674136321195145,
"grad_norm": 6.201949596405029,
"learning_rate": 4.8865176270369565e-05,
"loss": 0.8124,
"num_input_tokens_seen": 258608,
"step": 2000
},
{
"epoch": 1.8720821661998133,
"grad_norm": 4.34901237487793,
"learning_rate": 4.885300924975887e-05,
"loss": 0.418,
"num_input_tokens_seen": 259264,
"step": 2005
},
{
"epoch": 1.876750700280112,
"grad_norm": 7.196844577789307,
"learning_rate": 4.884077888319522e-05,
"loss": 0.7026,
"num_input_tokens_seen": 259936,
"step": 2010
},
{
"epoch": 1.8814192343604108,
"grad_norm": 4.676304340362549,
"learning_rate": 4.882848520315852e-05,
"loss": 0.7609,
"num_input_tokens_seen": 260560,
"step": 2015
},
{
"epoch": 1.8860877684407096,
"grad_norm": 6.854541301727295,
"learning_rate": 4.8816128242296876e-05,
"loss": 0.495,
"num_input_tokens_seen": 261184,
"step": 2020
},
{
"epoch": 1.8907563025210083,
"grad_norm": 3.158414840698242,
"learning_rate": 4.8803708033426404e-05,
"loss": 0.5712,
"num_input_tokens_seen": 261808,
"step": 2025
},
{
"epoch": 1.8954248366013071,
"grad_norm": 11.574291229248047,
"learning_rate": 4.8791224609531204e-05,
"loss": 0.4681,
"num_input_tokens_seen": 262384,
"step": 2030
},
{
"epoch": 1.9000933706816059,
"grad_norm": 3.723761558532715,
"learning_rate": 4.877867800376325e-05,
"loss": 0.5733,
"num_input_tokens_seen": 263008,
"step": 2035
},
{
"epoch": 1.9047619047619047,
"grad_norm": 3.9397077560424805,
"learning_rate": 4.8766068249442326e-05,
"loss": 1.066,
"num_input_tokens_seen": 263664,
"step": 2040
},
{
"epoch": 1.9094304388422034,
"grad_norm": 4.511308670043945,
"learning_rate": 4.875339538005588e-05,
"loss": 1.0934,
"num_input_tokens_seen": 264320,
"step": 2045
},
{
"epoch": 1.9140989729225022,
"grad_norm": 7.311952590942383,
"learning_rate": 4.874065942925899e-05,
"loss": 0.4114,
"num_input_tokens_seen": 265072,
"step": 2050
},
{
"epoch": 1.918767507002801,
"grad_norm": 14.041531562805176,
"learning_rate": 4.8727860430874285e-05,
"loss": 0.5461,
"num_input_tokens_seen": 265680,
"step": 2055
},
{
"epoch": 1.9234360410831,
"grad_norm": 2.48492431640625,
"learning_rate": 4.871499841889179e-05,
"loss": 0.9028,
"num_input_tokens_seen": 266384,
"step": 2060
},
{
"epoch": 1.9281045751633987,
"grad_norm": 5.875805854797363,
"learning_rate": 4.870207342746889e-05,
"loss": 0.5761,
"num_input_tokens_seen": 267040,
"step": 2065
},
{
"epoch": 1.9327731092436975,
"grad_norm": 5.392336845397949,
"learning_rate": 4.868908549093022e-05,
"loss": 0.624,
"num_input_tokens_seen": 267712,
"step": 2070
},
{
"epoch": 1.9374416433239963,
"grad_norm": 6.086864948272705,
"learning_rate": 4.867603464376759e-05,
"loss": 0.4388,
"num_input_tokens_seen": 268320,
"step": 2075
},
{
"epoch": 1.942110177404295,
"grad_norm": 12.777872085571289,
"learning_rate": 4.8662920920639866e-05,
"loss": 0.9118,
"num_input_tokens_seen": 268880,
"step": 2080
},
{
"epoch": 1.9467787114845938,
"grad_norm": 5.037237167358398,
"learning_rate": 4.864974435637289e-05,
"loss": 0.5265,
"num_input_tokens_seen": 269552,
"step": 2085
},
{
"epoch": 1.9514472455648926,
"grad_norm": 7.752954006195068,
"learning_rate": 4.863650498595941e-05,
"loss": 0.4638,
"num_input_tokens_seen": 270144,
"step": 2090
},
{
"epoch": 1.9561157796451916,
"grad_norm": 2.958127021789551,
"learning_rate": 4.862320284455894e-05,
"loss": 0.4215,
"num_input_tokens_seen": 270800,
"step": 2095
},
{
"epoch": 1.9607843137254903,
"grad_norm": 15.195981979370117,
"learning_rate": 4.860983796749771e-05,
"loss": 0.811,
"num_input_tokens_seen": 271456,
"step": 2100
},
{
"epoch": 1.965452847805789,
"grad_norm": 1.6306458711624146,
"learning_rate": 4.859641039026856e-05,
"loss": 0.6108,
"num_input_tokens_seen": 272016,
"step": 2105
},
{
"epoch": 1.9701213818860879,
"grad_norm": 5.165258884429932,
"learning_rate": 4.858292014853083e-05,
"loss": 0.713,
"num_input_tokens_seen": 272544,
"step": 2110
},
{
"epoch": 1.9747899159663866,
"grad_norm": 4.176558971405029,
"learning_rate": 4.8569367278110284e-05,
"loss": 0.7166,
"num_input_tokens_seen": 273216,
"step": 2115
},
{
"epoch": 1.9794584500466854,
"grad_norm": 5.415503025054932,
"learning_rate": 4.8555751814998994e-05,
"loss": 0.5213,
"num_input_tokens_seen": 273856,
"step": 2120
},
{
"epoch": 1.9841269841269842,
"grad_norm": 5.2978105545043945,
"learning_rate": 4.8542073795355294e-05,
"loss": 0.7801,
"num_input_tokens_seen": 274496,
"step": 2125
},
{
"epoch": 1.988795518207283,
"grad_norm": 15.461605072021484,
"learning_rate": 4.85283332555036e-05,
"loss": 0.6693,
"num_input_tokens_seen": 275072,
"step": 2130
},
{
"epoch": 1.9934640522875817,
"grad_norm": 4.298515319824219,
"learning_rate": 4.8514530231934385e-05,
"loss": 0.8724,
"num_input_tokens_seen": 275744,
"step": 2135
},
{
"epoch": 1.9981325863678805,
"grad_norm": 9.890350341796875,
"learning_rate": 4.850066476130407e-05,
"loss": 0.4097,
"num_input_tokens_seen": 276368,
"step": 2140
},
{
"epoch": 2.0018674136321195,
"eval_loss": 0.6239609122276306,
"eval_runtime": 3.8759,
"eval_samples_per_second": 61.405,
"eval_steps_per_second": 30.702,
"num_input_tokens_seen": 276856,
"step": 2144
},
{
"epoch": 2.0028011204481793,
"grad_norm": 5.202558994293213,
"learning_rate": 4.84867368804349e-05,
"loss": 0.4272,
"num_input_tokens_seen": 276984,
"step": 2145
},
{
"epoch": 2.007469654528478,
"grad_norm": 5.1944732666015625,
"learning_rate": 4.847274662631487e-05,
"loss": 0.6998,
"num_input_tokens_seen": 277544,
"step": 2150
},
{
"epoch": 2.012138188608777,
"grad_norm": 6.575268268585205,
"learning_rate": 4.8458694036097604e-05,
"loss": 0.5162,
"num_input_tokens_seen": 278264,
"step": 2155
},
{
"epoch": 2.0168067226890756,
"grad_norm": 4.267121315002441,
"learning_rate": 4.84445791471023e-05,
"loss": 0.5913,
"num_input_tokens_seen": 278888,
"step": 2160
},
{
"epoch": 2.0214752567693743,
"grad_norm": 5.02635383605957,
"learning_rate": 4.843040199681356e-05,
"loss": 0.8597,
"num_input_tokens_seen": 279576,
"step": 2165
},
{
"epoch": 2.026143790849673,
"grad_norm": 8.290732383728027,
"learning_rate": 4.8416162622881367e-05,
"loss": 0.396,
"num_input_tokens_seen": 280216,
"step": 2170
},
{
"epoch": 2.030812324929972,
"grad_norm": 3.7890634536743164,
"learning_rate": 4.840186106312094e-05,
"loss": 0.4928,
"num_input_tokens_seen": 280936,
"step": 2175
},
{
"epoch": 2.0354808590102706,
"grad_norm": 5.653308391571045,
"learning_rate": 4.8387497355512625e-05,
"loss": 0.4655,
"num_input_tokens_seen": 281560,
"step": 2180
},
{
"epoch": 2.0401493930905694,
"grad_norm": 13.255468368530273,
"learning_rate": 4.837307153820184e-05,
"loss": 0.8693,
"num_input_tokens_seen": 282200,
"step": 2185
},
{
"epoch": 2.044817927170868,
"grad_norm": 8.845050811767578,
"learning_rate": 4.835858364949894e-05,
"loss": 0.4014,
"num_input_tokens_seen": 282840,
"step": 2190
},
{
"epoch": 2.049486461251167,
"grad_norm": 2.6028687953948975,
"learning_rate": 4.834403372787912e-05,
"loss": 0.4387,
"num_input_tokens_seen": 283496,
"step": 2195
},
{
"epoch": 2.0541549953314657,
"grad_norm": 19.733068466186523,
"learning_rate": 4.83294218119823e-05,
"loss": 0.7063,
"num_input_tokens_seen": 284136,
"step": 2200
},
{
"epoch": 2.0588235294117645,
"grad_norm": 4.527667999267578,
"learning_rate": 4.831474794061305e-05,
"loss": 0.694,
"num_input_tokens_seen": 284744,
"step": 2205
},
{
"epoch": 2.0634920634920633,
"grad_norm": 1.302569031715393,
"learning_rate": 4.830001215274048e-05,
"loss": 0.2824,
"num_input_tokens_seen": 285352,
"step": 2210
},
{
"epoch": 2.0681605975723625,
"grad_norm": 6.566271781921387,
"learning_rate": 4.828521448749812e-05,
"loss": 0.8398,
"num_input_tokens_seen": 285992,
"step": 2215
},
{
"epoch": 2.0728291316526612,
"grad_norm": 5.239344120025635,
"learning_rate": 4.827035498418382e-05,
"loss": 0.6146,
"num_input_tokens_seen": 286616,
"step": 2220
},
{
"epoch": 2.07749766573296,
"grad_norm": 3.5977797508239746,
"learning_rate": 4.8255433682259685e-05,
"loss": 0.4369,
"num_input_tokens_seen": 287256,
"step": 2225
},
{
"epoch": 2.082166199813259,
"grad_norm": 7.737928867340088,
"learning_rate": 4.824045062135189e-05,
"loss": 0.6933,
"num_input_tokens_seen": 287864,
"step": 2230
},
{
"epoch": 2.0868347338935576,
"grad_norm": 8.302957534790039,
"learning_rate": 4.822540584125066e-05,
"loss": 0.4181,
"num_input_tokens_seen": 288456,
"step": 2235
},
{
"epoch": 2.0915032679738563,
"grad_norm": 2.4553043842315674,
"learning_rate": 4.82102993819101e-05,
"loss": 0.4067,
"num_input_tokens_seen": 289128,
"step": 2240
},
{
"epoch": 2.096171802054155,
"grad_norm": 4.2813544273376465,
"learning_rate": 4.819513128344814e-05,
"loss": 0.5259,
"num_input_tokens_seen": 289704,
"step": 2245
},
{
"epoch": 2.100840336134454,
"grad_norm": 2.3208742141723633,
"learning_rate": 4.8179901586146385e-05,
"loss": 0.4714,
"num_input_tokens_seen": 290360,
"step": 2250
},
{
"epoch": 2.1055088702147526,
"grad_norm": 8.080585479736328,
"learning_rate": 4.816461033045004e-05,
"loss": 0.6754,
"num_input_tokens_seen": 291080,
"step": 2255
},
{
"epoch": 2.1101774042950514,
"grad_norm": 2.4208004474639893,
"learning_rate": 4.8149257556967774e-05,
"loss": 0.28,
"num_input_tokens_seen": 291752,
"step": 2260
},
{
"epoch": 2.11484593837535,
"grad_norm": 3.9821560382843018,
"learning_rate": 4.813384330647164e-05,
"loss": 0.4721,
"num_input_tokens_seen": 292408,
"step": 2265
},
{
"epoch": 2.119514472455649,
"grad_norm": 17.761463165283203,
"learning_rate": 4.8118367619896956e-05,
"loss": 0.5283,
"num_input_tokens_seen": 293112,
"step": 2270
},
{
"epoch": 2.1241830065359477,
"grad_norm": 3.123309373855591,
"learning_rate": 4.8102830538342176e-05,
"loss": 0.4104,
"num_input_tokens_seen": 293768,
"step": 2275
},
{
"epoch": 2.1288515406162465,
"grad_norm": 4.399372100830078,
"learning_rate": 4.808723210306882e-05,
"loss": 0.3421,
"num_input_tokens_seen": 294472,
"step": 2280
},
{
"epoch": 2.1335200746965453,
"grad_norm": 13.870634078979492,
"learning_rate": 4.807157235550134e-05,
"loss": 0.4928,
"num_input_tokens_seen": 295240,
"step": 2285
},
{
"epoch": 2.138188608776844,
"grad_norm": 9.316656112670898,
"learning_rate": 4.8055851337227006e-05,
"loss": 0.3461,
"num_input_tokens_seen": 295880,
"step": 2290
},
{
"epoch": 2.142857142857143,
"grad_norm": 6.0931596755981445,
"learning_rate": 4.804006908999581e-05,
"loss": 0.7741,
"num_input_tokens_seen": 296568,
"step": 2295
},
{
"epoch": 2.1475256769374416,
"grad_norm": 3.882946729660034,
"learning_rate": 4.802422565572034e-05,
"loss": 0.3851,
"num_input_tokens_seen": 297208,
"step": 2300
},
{
"epoch": 2.1521942110177403,
"grad_norm": 4.334148406982422,
"learning_rate": 4.8008321076475694e-05,
"loss": 0.481,
"num_input_tokens_seen": 297976,
"step": 2305
},
{
"epoch": 2.156862745098039,
"grad_norm": 8.217869758605957,
"learning_rate": 4.799235539449932e-05,
"loss": 0.6381,
"num_input_tokens_seen": 298552,
"step": 2310
},
{
"epoch": 2.161531279178338,
"grad_norm": 12.515053749084473,
"learning_rate": 4.797632865219098e-05,
"loss": 0.7755,
"num_input_tokens_seen": 299128,
"step": 2315
},
{
"epoch": 2.1661998132586366,
"grad_norm": 6.713847637176514,
"learning_rate": 4.7960240892112554e-05,
"loss": 0.4644,
"num_input_tokens_seen": 299784,
"step": 2320
},
{
"epoch": 2.1708683473389354,
"grad_norm": 1.149377465248108,
"learning_rate": 4.794409215698799e-05,
"loss": 0.2835,
"num_input_tokens_seen": 300408,
"step": 2325
},
{
"epoch": 2.175536881419234,
"grad_norm": 6.214423179626465,
"learning_rate": 4.792788248970314e-05,
"loss": 0.5038,
"num_input_tokens_seen": 301064,
"step": 2330
},
{
"epoch": 2.180205415499533,
"grad_norm": 1.930323600769043,
"learning_rate": 4.7911611933305707e-05,
"loss": 0.2548,
"num_input_tokens_seen": 301736,
"step": 2335
},
{
"epoch": 2.184873949579832,
"grad_norm": 7.587453842163086,
"learning_rate": 4.7895280531005064e-05,
"loss": 0.5286,
"num_input_tokens_seen": 302280,
"step": 2340
},
{
"epoch": 2.189542483660131,
"grad_norm": 6.7828192710876465,
"learning_rate": 4.78788883261722e-05,
"loss": 0.4586,
"num_input_tokens_seen": 302872,
"step": 2345
},
{
"epoch": 2.1942110177404297,
"grad_norm": 15.733732223510742,
"learning_rate": 4.786243536233954e-05,
"loss": 0.9646,
"num_input_tokens_seen": 303464,
"step": 2350
},
{
"epoch": 2.1988795518207285,
"grad_norm": 2.523688554763794,
"learning_rate": 4.7845921683200905e-05,
"loss": 0.3255,
"num_input_tokens_seen": 304200,
"step": 2355
},
{
"epoch": 2.2035480859010272,
"grad_norm": 4.354966640472412,
"learning_rate": 4.782934733261133e-05,
"loss": 0.3044,
"num_input_tokens_seen": 304824,
"step": 2360
},
{
"epoch": 2.208216619981326,
"grad_norm": 2.6396069526672363,
"learning_rate": 4.781271235458699e-05,
"loss": 0.6225,
"num_input_tokens_seen": 305416,
"step": 2365
},
{
"epoch": 2.212885154061625,
"grad_norm": 11.276773452758789,
"learning_rate": 4.779601679330504e-05,
"loss": 0.6085,
"num_input_tokens_seen": 305992,
"step": 2370
},
{
"epoch": 2.2175536881419236,
"grad_norm": 2.625945568084717,
"learning_rate": 4.7779260693103556e-05,
"loss": 0.5244,
"num_input_tokens_seen": 306728,
"step": 2375
},
{
"epoch": 2.2222222222222223,
"grad_norm": 4.601404666900635,
"learning_rate": 4.776244409848138e-05,
"loss": 0.4817,
"num_input_tokens_seen": 307336,
"step": 2380
},
{
"epoch": 2.226890756302521,
"grad_norm": 10.985107421875,
"learning_rate": 4.774556705409799e-05,
"loss": 0.79,
"num_input_tokens_seen": 307960,
"step": 2385
},
{
"epoch": 2.23155929038282,
"grad_norm": 2.6938464641571045,
"learning_rate": 4.7728629604773415e-05,
"loss": 0.5564,
"num_input_tokens_seen": 308664,
"step": 2390
},
{
"epoch": 2.2362278244631186,
"grad_norm": 7.201427459716797,
"learning_rate": 4.7711631795488096e-05,
"loss": 0.2981,
"num_input_tokens_seen": 309320,
"step": 2395
},
{
"epoch": 2.2408963585434174,
"grad_norm": 1.2306467294692993,
"learning_rate": 4.769457367138277e-05,
"loss": 0.3292,
"num_input_tokens_seen": 309944,
"step": 2400
},
{
"epoch": 2.245564892623716,
"grad_norm": 4.115130424499512,
"learning_rate": 4.767745527775834e-05,
"loss": 0.8866,
"num_input_tokens_seen": 310568,
"step": 2405
},
{
"epoch": 2.250233426704015,
"grad_norm": 4.4345526695251465,
"learning_rate": 4.7660276660075804e-05,
"loss": 0.5751,
"num_input_tokens_seen": 311240,
"step": 2410
},
{
"epoch": 2.2549019607843137,
"grad_norm": 10.08226490020752,
"learning_rate": 4.764303786395604e-05,
"loss": 0.9322,
"num_input_tokens_seen": 311832,
"step": 2415
},
{
"epoch": 2.2595704948646125,
"grad_norm": 2.849057912826538,
"learning_rate": 4.7625738935179794e-05,
"loss": 0.5432,
"num_input_tokens_seen": 312504,
"step": 2420
},
{
"epoch": 2.2642390289449112,
"grad_norm": 8.79856014251709,
"learning_rate": 4.760837991968746e-05,
"loss": 0.4157,
"num_input_tokens_seen": 313160,
"step": 2425
},
{
"epoch": 2.26890756302521,
"grad_norm": 6.586191654205322,
"learning_rate": 4.7590960863579034e-05,
"loss": 0.4689,
"num_input_tokens_seen": 313768,
"step": 2430
},
{
"epoch": 2.273576097105509,
"grad_norm": 5.84085750579834,
"learning_rate": 4.757348181311394e-05,
"loss": 0.5085,
"num_input_tokens_seen": 314376,
"step": 2435
},
{
"epoch": 2.2782446311858076,
"grad_norm": 4.015573501586914,
"learning_rate": 4.7555942814710954e-05,
"loss": 0.491,
"num_input_tokens_seen": 315032,
"step": 2440
},
{
"epoch": 2.2829131652661063,
"grad_norm": 4.170641899108887,
"learning_rate": 4.7538343914948025e-05,
"loss": 0.4183,
"num_input_tokens_seen": 315656,
"step": 2445
},
{
"epoch": 2.287581699346405,
"grad_norm": 3.4333198070526123,
"learning_rate": 4.75206851605622e-05,
"loss": 0.386,
"num_input_tokens_seen": 316216,
"step": 2450
},
{
"epoch": 2.292250233426704,
"grad_norm": 4.160677433013916,
"learning_rate": 4.7502966598449475e-05,
"loss": 0.5694,
"num_input_tokens_seen": 316840,
"step": 2455
},
{
"epoch": 2.2969187675070026,
"grad_norm": 3.7964909076690674,
"learning_rate": 4.748518827566468e-05,
"loss": 0.4163,
"num_input_tokens_seen": 317448,
"step": 2460
},
{
"epoch": 2.3015873015873014,
"grad_norm": 7.525023937225342,
"learning_rate": 4.746735023942134e-05,
"loss": 0.4436,
"num_input_tokens_seen": 318056,
"step": 2465
},
{
"epoch": 2.3062558356676,
"grad_norm": 13.83182430267334,
"learning_rate": 4.744945253709156e-05,
"loss": 0.6373,
"num_input_tokens_seen": 318728,
"step": 2470
},
{
"epoch": 2.310924369747899,
"grad_norm": 1.9470397233963013,
"learning_rate": 4.743149521620591e-05,
"loss": 1.0598,
"num_input_tokens_seen": 319320,
"step": 2475
},
{
"epoch": 2.3155929038281977,
"grad_norm": 3.6188297271728516,
"learning_rate": 4.7413478324453296e-05,
"loss": 0.4637,
"num_input_tokens_seen": 319928,
"step": 2480
},
{
"epoch": 2.3202614379084965,
"grad_norm": 9.640084266662598,
"learning_rate": 4.7395401909680805e-05,
"loss": 0.7186,
"num_input_tokens_seen": 320456,
"step": 2485
},
{
"epoch": 2.3249299719887957,
"grad_norm": 4.796760082244873,
"learning_rate": 4.737726601989359e-05,
"loss": 0.2771,
"num_input_tokens_seen": 321192,
"step": 2490
},
{
"epoch": 2.3295985060690945,
"grad_norm": 3.5131568908691406,
"learning_rate": 4.735907070325478e-05,
"loss": 0.6796,
"num_input_tokens_seen": 321816,
"step": 2495
},
{
"epoch": 2.3342670401493932,
"grad_norm": 9.193059921264648,
"learning_rate": 4.734081600808531e-05,
"loss": 0.6376,
"num_input_tokens_seen": 322440,
"step": 2500
},
{
"epoch": 2.338935574229692,
"grad_norm": 2.209547758102417,
"learning_rate": 4.73225019828638e-05,
"loss": 0.3292,
"num_input_tokens_seen": 323128,
"step": 2505
},
{
"epoch": 2.3436041083099908,
"grad_norm": 9.847227096557617,
"learning_rate": 4.7304128676226426e-05,
"loss": 0.366,
"num_input_tokens_seen": 323832,
"step": 2510
},
{
"epoch": 2.3482726423902895,
"grad_norm": 2.2758820056915283,
"learning_rate": 4.728569613696683e-05,
"loss": 0.3458,
"num_input_tokens_seen": 324376,
"step": 2515
},
{
"epoch": 2.3529411764705883,
"grad_norm": 4.512838363647461,
"learning_rate": 4.72672044140359e-05,
"loss": 0.3957,
"num_input_tokens_seen": 325048,
"step": 2520
},
{
"epoch": 2.357609710550887,
"grad_norm": 13.972112655639648,
"learning_rate": 4.724865355654176e-05,
"loss": 0.745,
"num_input_tokens_seen": 325624,
"step": 2525
},
{
"epoch": 2.362278244631186,
"grad_norm": 1.9871619939804077,
"learning_rate": 4.723004361374953e-05,
"loss": 0.3958,
"num_input_tokens_seen": 326312,
"step": 2530
},
{
"epoch": 2.3669467787114846,
"grad_norm": 12.324496269226074,
"learning_rate": 4.7211374635081264e-05,
"loss": 0.4743,
"num_input_tokens_seen": 326968,
"step": 2535
},
{
"epoch": 2.3716153127917834,
"grad_norm": 10.762685775756836,
"learning_rate": 4.719264667011578e-05,
"loss": 0.5992,
"num_input_tokens_seen": 327512,
"step": 2540
},
{
"epoch": 2.376283846872082,
"grad_norm": 6.492682456970215,
"learning_rate": 4.717385976858857e-05,
"loss": 0.4598,
"num_input_tokens_seen": 328296,
"step": 2545
},
{
"epoch": 2.380952380952381,
"grad_norm": 14.06905746459961,
"learning_rate": 4.715501398039162e-05,
"loss": 0.5516,
"num_input_tokens_seen": 328808,
"step": 2550
},
{
"epoch": 2.3856209150326797,
"grad_norm": 6.21990966796875,
"learning_rate": 4.71361093555733e-05,
"loss": 0.7209,
"num_input_tokens_seen": 329496,
"step": 2555
},
{
"epoch": 2.3902894491129785,
"grad_norm": 5.462920188903809,
"learning_rate": 4.711714594433825e-05,
"loss": 0.362,
"num_input_tokens_seen": 330216,
"step": 2560
},
{
"epoch": 2.3949579831932772,
"grad_norm": 2.444115161895752,
"learning_rate": 4.7098123797047214e-05,
"loss": 1.1321,
"num_input_tokens_seen": 330840,
"step": 2565
},
{
"epoch": 2.399626517273576,
"grad_norm": 3.1685636043548584,
"learning_rate": 4.7079042964216916e-05,
"loss": 0.4157,
"num_input_tokens_seen": 331512,
"step": 2570
},
{
"epoch": 2.404295051353875,
"grad_norm": 4.733340263366699,
"learning_rate": 4.705990349651994e-05,
"loss": 0.6675,
"num_input_tokens_seen": 332232,
"step": 2575
},
{
"epoch": 2.4089635854341735,
"grad_norm": 6.854113578796387,
"learning_rate": 4.704070544478459e-05,
"loss": 0.4659,
"num_input_tokens_seen": 332872,
"step": 2580
},
{
"epoch": 2.4136321195144723,
"grad_norm": 7.259758949279785,
"learning_rate": 4.7021448859994735e-05,
"loss": 0.7414,
"num_input_tokens_seen": 333560,
"step": 2585
},
{
"epoch": 2.418300653594771,
"grad_norm": 5.3515424728393555,
"learning_rate": 4.70021337932897e-05,
"loss": 0.6076,
"num_input_tokens_seen": 334168,
"step": 2590
},
{
"epoch": 2.42296918767507,
"grad_norm": 19.960798263549805,
"learning_rate": 4.698276029596411e-05,
"loss": 0.8932,
"num_input_tokens_seen": 334856,
"step": 2595
},
{
"epoch": 2.4276377217553686,
"grad_norm": 4.575296401977539,
"learning_rate": 4.696332841946778e-05,
"loss": 0.3443,
"num_input_tokens_seen": 335560,
"step": 2600
},
{
"epoch": 2.432306255835668,
"grad_norm": 2.0907132625579834,
"learning_rate": 4.694383821540555e-05,
"loss": 0.358,
"num_input_tokens_seen": 336216,
"step": 2605
},
{
"epoch": 2.4369747899159666,
"grad_norm": 2.8520407676696777,
"learning_rate": 4.6924289735537144e-05,
"loss": 0.3586,
"num_input_tokens_seen": 336808,
"step": 2610
},
{
"epoch": 2.4416433239962654,
"grad_norm": 8.168237686157227,
"learning_rate": 4.690468303177706e-05,
"loss": 0.5266,
"num_input_tokens_seen": 337480,
"step": 2615
},
{
"epoch": 2.446311858076564,
"grad_norm": 2.263343334197998,
"learning_rate": 4.688501815619446e-05,
"loss": 0.5147,
"num_input_tokens_seen": 338136,
"step": 2620
},
{
"epoch": 2.450980392156863,
"grad_norm": 6.954434871673584,
"learning_rate": 4.6865295161012926e-05,
"loss": 0.5564,
"num_input_tokens_seen": 338808,
"step": 2625
},
{
"epoch": 2.4556489262371617,
"grad_norm": 2.3235628604888916,
"learning_rate": 4.684551409861042e-05,
"loss": 0.4085,
"num_input_tokens_seen": 339528,
"step": 2630
},
{
"epoch": 2.4603174603174605,
"grad_norm": 4.111285209655762,
"learning_rate": 4.682567502151911e-05,
"loss": 0.7339,
"num_input_tokens_seen": 340264,
"step": 2635
},
{
"epoch": 2.4649859943977592,
"grad_norm": 5.659359455108643,
"learning_rate": 4.680577798242523e-05,
"loss": 0.5562,
"num_input_tokens_seen": 340888,
"step": 2640
},
{
"epoch": 2.469654528478058,
"grad_norm": 1.6552449464797974,
"learning_rate": 4.6785823034168955e-05,
"loss": 0.3952,
"num_input_tokens_seen": 341640,
"step": 2645
},
{
"epoch": 2.4743230625583568,
"grad_norm": 1.5371763706207275,
"learning_rate": 4.676581022974421e-05,
"loss": 0.4896,
"num_input_tokens_seen": 342280,
"step": 2650
},
{
"epoch": 2.4789915966386555,
"grad_norm": 0.7637246251106262,
"learning_rate": 4.674573962229862e-05,
"loss": 0.8027,
"num_input_tokens_seen": 342872,
"step": 2655
},
{
"epoch": 2.4836601307189543,
"grad_norm": 7.288429260253906,
"learning_rate": 4.672561126513328e-05,
"loss": 0.5183,
"num_input_tokens_seen": 343496,
"step": 2660
},
{
"epoch": 2.488328664799253,
"grad_norm": 7.098439693450928,
"learning_rate": 4.670542521170266e-05,
"loss": 0.6193,
"num_input_tokens_seen": 344136,
"step": 2665
},
{
"epoch": 2.492997198879552,
"grad_norm": 4.773173809051514,
"learning_rate": 4.6685181515614454e-05,
"loss": 0.5955,
"num_input_tokens_seen": 344808,
"step": 2670
},
{
"epoch": 2.4976657329598506,
"grad_norm": 3.8115127086639404,
"learning_rate": 4.666488023062943e-05,
"loss": 0.3639,
"num_input_tokens_seen": 345400,
"step": 2675
},
{
"epoch": 2.5023342670401494,
"grad_norm": 7.138146877288818,
"learning_rate": 4.664452141066131e-05,
"loss": 0.6925,
"num_input_tokens_seen": 346040,
"step": 2680
},
{
"epoch": 2.5023342670401494,
"eval_loss": 0.652177631855011,
"eval_runtime": 3.8707,
"eval_samples_per_second": 61.488,
"eval_steps_per_second": 30.744,
"num_input_tokens_seen": 346040,
"step": 2680
},
{
"epoch": 2.507002801120448,
"grad_norm": 3.9293606281280518,
"learning_rate": 4.662410510977659e-05,
"loss": 0.6155,
"num_input_tokens_seen": 346616,
"step": 2685
},
{
"epoch": 2.511671335200747,
"grad_norm": 10.7706880569458,
"learning_rate": 4.6603631382194426e-05,
"loss": 0.8976,
"num_input_tokens_seen": 347192,
"step": 2690
},
{
"epoch": 2.5163398692810457,
"grad_norm": 5.461066246032715,
"learning_rate": 4.658310028228649e-05,
"loss": 0.4815,
"num_input_tokens_seen": 347768,
"step": 2695
},
{
"epoch": 2.5210084033613445,
"grad_norm": 2.074761152267456,
"learning_rate": 4.65625118645768e-05,
"loss": 0.1902,
"num_input_tokens_seen": 348360,
"step": 2700
},
{
"epoch": 2.5256769374416432,
"grad_norm": 3.5420331954956055,
"learning_rate": 4.654186618374159e-05,
"loss": 0.5325,
"num_input_tokens_seen": 348968,
"step": 2705
},
{
"epoch": 2.530345471521942,
"grad_norm": 0.8925294876098633,
"learning_rate": 4.6521163294609196e-05,
"loss": 0.695,
"num_input_tokens_seen": 349672,
"step": 2710
},
{
"epoch": 2.5350140056022408,
"grad_norm": 3.142289400100708,
"learning_rate": 4.650040325215985e-05,
"loss": 0.415,
"num_input_tokens_seen": 350328,
"step": 2715
},
{
"epoch": 2.5396825396825395,
"grad_norm": 8.260872840881348,
"learning_rate": 4.647958611152557e-05,
"loss": 0.59,
"num_input_tokens_seen": 350984,
"step": 2720
},
{
"epoch": 2.5443510737628383,
"grad_norm": 5.627845287322998,
"learning_rate": 4.645871192799004e-05,
"loss": 0.5929,
"num_input_tokens_seen": 351624,
"step": 2725
},
{
"epoch": 2.549019607843137,
"grad_norm": 6.843557834625244,
"learning_rate": 4.643778075698838e-05,
"loss": 0.4834,
"num_input_tokens_seen": 352264,
"step": 2730
},
{
"epoch": 2.553688141923436,
"grad_norm": 7.715879440307617,
"learning_rate": 4.6416792654107076e-05,
"loss": 0.7563,
"num_input_tokens_seen": 352920,
"step": 2735
},
{
"epoch": 2.5583566760037346,
"grad_norm": 4.674366474151611,
"learning_rate": 4.6395747675083825e-05,
"loss": 0.4375,
"num_input_tokens_seen": 353688,
"step": 2740
},
{
"epoch": 2.5630252100840334,
"grad_norm": 6.640374660491943,
"learning_rate": 4.637464587580734e-05,
"loss": 0.3506,
"num_input_tokens_seen": 354312,
"step": 2745
},
{
"epoch": 2.567693744164332,
"grad_norm": 2.6401913166046143,
"learning_rate": 4.6353487312317237e-05,
"loss": 0.6531,
"num_input_tokens_seen": 354888,
"step": 2750
},
{
"epoch": 2.572362278244631,
"grad_norm": 6.281817436218262,
"learning_rate": 4.6332272040803895e-05,
"loss": 0.4135,
"num_input_tokens_seen": 355480,
"step": 2755
},
{
"epoch": 2.5770308123249297,
"grad_norm": 4.7368388175964355,
"learning_rate": 4.631100011760827e-05,
"loss": 0.657,
"num_input_tokens_seen": 356072,
"step": 2760
},
{
"epoch": 2.581699346405229,
"grad_norm": 7.262160301208496,
"learning_rate": 4.628967159922178e-05,
"loss": 0.6438,
"num_input_tokens_seen": 356664,
"step": 2765
},
{
"epoch": 2.5863678804855277,
"grad_norm": 17.83985137939453,
"learning_rate": 4.626828654228615e-05,
"loss": 0.5256,
"num_input_tokens_seen": 357352,
"step": 2770
},
{
"epoch": 2.5910364145658265,
"grad_norm": 4.219756126403809,
"learning_rate": 4.624684500359323e-05,
"loss": 0.3764,
"num_input_tokens_seen": 357960,
"step": 2775
},
{
"epoch": 2.595704948646125,
"grad_norm": 4.930896282196045,
"learning_rate": 4.622534704008489e-05,
"loss": 1.1571,
"num_input_tokens_seen": 358584,
"step": 2780
},
{
"epoch": 2.600373482726424,
"grad_norm": 8.239065170288086,
"learning_rate": 4.620379270885282e-05,
"loss": 0.5915,
"num_input_tokens_seen": 359272,
"step": 2785
},
{
"epoch": 2.6050420168067228,
"grad_norm": 5.721827983856201,
"learning_rate": 4.6182182067138424e-05,
"loss": 0.4863,
"num_input_tokens_seen": 359992,
"step": 2790
},
{
"epoch": 2.6097105508870215,
"grad_norm": 5.4295654296875,
"learning_rate": 4.6160515172332655e-05,
"loss": 0.6015,
"num_input_tokens_seen": 360568,
"step": 2795
},
{
"epoch": 2.6143790849673203,
"grad_norm": 3.9932007789611816,
"learning_rate": 4.6138792081975846e-05,
"loss": 0.491,
"num_input_tokens_seen": 361272,
"step": 2800
},
{
"epoch": 2.619047619047619,
"grad_norm": 7.950539588928223,
"learning_rate": 4.611701285375756e-05,
"loss": 0.6307,
"num_input_tokens_seen": 361992,
"step": 2805
},
{
"epoch": 2.623716153127918,
"grad_norm": 9.549372673034668,
"learning_rate": 4.609517754551644e-05,
"loss": 0.8141,
"num_input_tokens_seen": 362584,
"step": 2810
},
{
"epoch": 2.6283846872082166,
"grad_norm": 4.924779415130615,
"learning_rate": 4.6073286215240105e-05,
"loss": 0.7201,
"num_input_tokens_seen": 363208,
"step": 2815
},
{
"epoch": 2.6330532212885154,
"grad_norm": 6.979094505310059,
"learning_rate": 4.605133892106488e-05,
"loss": 0.4521,
"num_input_tokens_seen": 363816,
"step": 2820
},
{
"epoch": 2.637721755368814,
"grad_norm": 16.25713348388672,
"learning_rate": 4.602933572127578e-05,
"loss": 0.5296,
"num_input_tokens_seen": 364504,
"step": 2825
},
{
"epoch": 2.642390289449113,
"grad_norm": 1.5559110641479492,
"learning_rate": 4.600727667430624e-05,
"loss": 0.499,
"num_input_tokens_seen": 365176,
"step": 2830
},
{
"epoch": 2.6470588235294117,
"grad_norm": 3.178032875061035,
"learning_rate": 4.598516183873802e-05,
"loss": 0.6472,
"num_input_tokens_seen": 365800,
"step": 2835
},
{
"epoch": 2.6517273576097105,
"grad_norm": 30.63277816772461,
"learning_rate": 4.596299127330106e-05,
"loss": 0.7986,
"num_input_tokens_seen": 366392,
"step": 2840
},
{
"epoch": 2.6563958916900092,
"grad_norm": 0.5623080134391785,
"learning_rate": 4.594076503687326e-05,
"loss": 0.5632,
"num_input_tokens_seen": 366984,
"step": 2845
},
{
"epoch": 2.661064425770308,
"grad_norm": 18.52434539794922,
"learning_rate": 4.591848318848039e-05,
"loss": 0.6165,
"num_input_tokens_seen": 367576,
"step": 2850
},
{
"epoch": 2.6657329598506068,
"grad_norm": 8.841620445251465,
"learning_rate": 4.589614578729591e-05,
"loss": 0.3239,
"num_input_tokens_seen": 368360,
"step": 2855
},
{
"epoch": 2.6704014939309055,
"grad_norm": 0.7548959851264954,
"learning_rate": 4.5873752892640796e-05,
"loss": 0.3458,
"num_input_tokens_seen": 369032,
"step": 2860
},
{
"epoch": 2.6750700280112047,
"grad_norm": 11.500990867614746,
"learning_rate": 4.5851304563983414e-05,
"loss": 0.6321,
"num_input_tokens_seen": 369688,
"step": 2865
},
{
"epoch": 2.6797385620915035,
"grad_norm": 1.3931317329406738,
"learning_rate": 4.582880086093933e-05,
"loss": 0.3262,
"num_input_tokens_seen": 370392,
"step": 2870
},
{
"epoch": 2.6844070961718023,
"grad_norm": 9.6282320022583,
"learning_rate": 4.5806241843271166e-05,
"loss": 0.678,
"num_input_tokens_seen": 370984,
"step": 2875
},
{
"epoch": 2.689075630252101,
"grad_norm": 2.316354990005493,
"learning_rate": 4.578362757088846e-05,
"loss": 0.4045,
"num_input_tokens_seen": 371720,
"step": 2880
},
{
"epoch": 2.6937441643324,
"grad_norm": 8.465997695922852,
"learning_rate": 4.5760958103847455e-05,
"loss": 0.4269,
"num_input_tokens_seen": 372392,
"step": 2885
},
{
"epoch": 2.6984126984126986,
"grad_norm": 1.1068583726882935,
"learning_rate": 4.573823350235102e-05,
"loss": 0.2869,
"num_input_tokens_seen": 373000,
"step": 2890
},
{
"epoch": 2.7030812324929974,
"grad_norm": 5.867659091949463,
"learning_rate": 4.57154538267484e-05,
"loss": 0.4467,
"num_input_tokens_seen": 373672,
"step": 2895
},
{
"epoch": 2.707749766573296,
"grad_norm": 2.448972702026367,
"learning_rate": 4.5692619137535134e-05,
"loss": 0.5245,
"num_input_tokens_seen": 374344,
"step": 2900
},
{
"epoch": 2.712418300653595,
"grad_norm": 7.908166885375977,
"learning_rate": 4.566972949535283e-05,
"loss": 0.5952,
"num_input_tokens_seen": 375000,
"step": 2905
},
{
"epoch": 2.7170868347338937,
"grad_norm": 2.1407413482666016,
"learning_rate": 4.5646784960989054e-05,
"loss": 0.6988,
"num_input_tokens_seen": 375560,
"step": 2910
},
{
"epoch": 2.7217553688141924,
"grad_norm": 2.4962363243103027,
"learning_rate": 4.562378559537714e-05,
"loss": 0.6708,
"num_input_tokens_seen": 376280,
"step": 2915
},
{
"epoch": 2.726423902894491,
"grad_norm": 10.618359565734863,
"learning_rate": 4.560073145959602e-05,
"loss": 0.5289,
"num_input_tokens_seen": 376968,
"step": 2920
},
{
"epoch": 2.73109243697479,
"grad_norm": 3.9154977798461914,
"learning_rate": 4.557762261487013e-05,
"loss": 0.9578,
"num_input_tokens_seen": 377592,
"step": 2925
},
{
"epoch": 2.7357609710550888,
"grad_norm": 5.653745651245117,
"learning_rate": 4.5554459122569124e-05,
"loss": 0.4755,
"num_input_tokens_seen": 378376,
"step": 2930
},
{
"epoch": 2.7404295051353875,
"grad_norm": 4.4318976402282715,
"learning_rate": 4.553124104420784e-05,
"loss": 0.3741,
"num_input_tokens_seen": 378968,
"step": 2935
},
{
"epoch": 2.7450980392156863,
"grad_norm": 2.4929981231689453,
"learning_rate": 4.550796844144605e-05,
"loss": 0.3582,
"num_input_tokens_seen": 379608,
"step": 2940
},
{
"epoch": 2.749766573295985,
"grad_norm": 4.1619486808776855,
"learning_rate": 4.548464137608834e-05,
"loss": 0.6926,
"num_input_tokens_seen": 380232,
"step": 2945
},
{
"epoch": 2.754435107376284,
"grad_norm": 2.7385575771331787,
"learning_rate": 4.546125991008392e-05,
"loss": 0.5283,
"num_input_tokens_seen": 380936,
"step": 2950
},
{
"epoch": 2.7591036414565826,
"grad_norm": 2.4300732612609863,
"learning_rate": 4.5437824105526474e-05,
"loss": 0.5301,
"num_input_tokens_seen": 381560,
"step": 2955
},
{
"epoch": 2.7637721755368814,
"grad_norm": 4.369988441467285,
"learning_rate": 4.541433402465399e-05,
"loss": 0.5472,
"num_input_tokens_seen": 382264,
"step": 2960
},
{
"epoch": 2.76844070961718,
"grad_norm": 3.6987788677215576,
"learning_rate": 4.5390789729848605e-05,
"loss": 0.6572,
"num_input_tokens_seen": 382936,
"step": 2965
},
{
"epoch": 2.773109243697479,
"grad_norm": 7.901442050933838,
"learning_rate": 4.5367191283636426e-05,
"loss": 0.6978,
"num_input_tokens_seen": 383528,
"step": 2970
},
{
"epoch": 2.7777777777777777,
"grad_norm": 12.086589813232422,
"learning_rate": 4.534353874868736e-05,
"loss": 0.4766,
"num_input_tokens_seen": 384104,
"step": 2975
},
{
"epoch": 2.7824463118580764,
"grad_norm": 4.757928848266602,
"learning_rate": 4.531983218781498e-05,
"loss": 0.5683,
"num_input_tokens_seen": 384664,
"step": 2980
},
{
"epoch": 2.787114845938375,
"grad_norm": 2.6184375286102295,
"learning_rate": 4.52960716639763e-05,
"loss": 0.4537,
"num_input_tokens_seen": 385288,
"step": 2985
},
{
"epoch": 2.791783380018674,
"grad_norm": 3.0726981163024902,
"learning_rate": 4.5272257240271676e-05,
"loss": 0.4201,
"num_input_tokens_seen": 385928,
"step": 2990
},
{
"epoch": 2.7964519140989728,
"grad_norm": 7.38121223449707,
"learning_rate": 4.524838897994458e-05,
"loss": 0.4057,
"num_input_tokens_seen": 386552,
"step": 2995
},
{
"epoch": 2.8011204481792715,
"grad_norm": 9.7579927444458,
"learning_rate": 4.5224466946381476e-05,
"loss": 0.6116,
"num_input_tokens_seen": 387144,
"step": 3000
},
{
"epoch": 2.8057889822595703,
"grad_norm": 2.8062477111816406,
"learning_rate": 4.520049120311162e-05,
"loss": 0.4844,
"num_input_tokens_seen": 387704,
"step": 3005
},
{
"epoch": 2.810457516339869,
"grad_norm": 5.818384647369385,
"learning_rate": 4.5176461813806904e-05,
"loss": 0.2765,
"num_input_tokens_seen": 388248,
"step": 3010
},
{
"epoch": 2.815126050420168,
"grad_norm": 3.2478232383728027,
"learning_rate": 4.5152378842281694e-05,
"loss": 0.5931,
"num_input_tokens_seen": 388952,
"step": 3015
},
{
"epoch": 2.8197945845004666,
"grad_norm": 1.4422049522399902,
"learning_rate": 4.512824235249265e-05,
"loss": 0.4764,
"num_input_tokens_seen": 389768,
"step": 3020
},
{
"epoch": 2.8244631185807654,
"grad_norm": 3.1220462322235107,
"learning_rate": 4.510405240853854e-05,
"loss": 0.6353,
"num_input_tokens_seen": 390408,
"step": 3025
},
{
"epoch": 2.8291316526610646,
"grad_norm": 5.443966865539551,
"learning_rate": 4.507980907466014e-05,
"loss": 0.6038,
"num_input_tokens_seen": 391112,
"step": 3030
},
{
"epoch": 2.8338001867413634,
"grad_norm": 10.735891342163086,
"learning_rate": 4.505551241523996e-05,
"loss": 0.4195,
"num_input_tokens_seen": 391752,
"step": 3035
},
{
"epoch": 2.838468720821662,
"grad_norm": 1.7903181314468384,
"learning_rate": 4.503116249480215e-05,
"loss": 0.5118,
"num_input_tokens_seen": 392456,
"step": 3040
},
{
"epoch": 2.843137254901961,
"grad_norm": 2.6636769771575928,
"learning_rate": 4.500675937801229e-05,
"loss": 0.4882,
"num_input_tokens_seen": 393112,
"step": 3045
},
{
"epoch": 2.8478057889822597,
"grad_norm": 6.658154487609863,
"learning_rate": 4.498230312967726e-05,
"loss": 0.4428,
"num_input_tokens_seen": 393768,
"step": 3050
},
{
"epoch": 2.8524743230625584,
"grad_norm": 4.922386169433594,
"learning_rate": 4.495779381474502e-05,
"loss": 0.5097,
"num_input_tokens_seen": 394424,
"step": 3055
},
{
"epoch": 2.857142857142857,
"grad_norm": 11.311652183532715,
"learning_rate": 4.4933231498304445e-05,
"loss": 0.6931,
"num_input_tokens_seen": 395096,
"step": 3060
},
{
"epoch": 2.861811391223156,
"grad_norm": 4.589713096618652,
"learning_rate": 4.490861624558519e-05,
"loss": 0.2915,
"num_input_tokens_seen": 395880,
"step": 3065
},
{
"epoch": 2.8664799253034547,
"grad_norm": 3.9051504135131836,
"learning_rate": 4.488394812195749e-05,
"loss": 0.5947,
"num_input_tokens_seen": 396520,
"step": 3070
},
{
"epoch": 2.8711484593837535,
"grad_norm": 5.779489517211914,
"learning_rate": 4.4859227192931974e-05,
"loss": 0.7912,
"num_input_tokens_seen": 397240,
"step": 3075
},
{
"epoch": 2.8758169934640523,
"grad_norm": 8.4837064743042,
"learning_rate": 4.483445352415951e-05,
"loss": 0.6806,
"num_input_tokens_seen": 397896,
"step": 3080
},
{
"epoch": 2.880485527544351,
"grad_norm": 6.633236408233643,
"learning_rate": 4.480962718143102e-05,
"loss": 1.0596,
"num_input_tokens_seen": 398504,
"step": 3085
},
{
"epoch": 2.88515406162465,
"grad_norm": 4.59561824798584,
"learning_rate": 4.4784748230677344e-05,
"loss": 0.7187,
"num_input_tokens_seen": 399192,
"step": 3090
},
{
"epoch": 2.8898225957049486,
"grad_norm": 2.5471694469451904,
"learning_rate": 4.475981673796899e-05,
"loss": 0.543,
"num_input_tokens_seen": 399848,
"step": 3095
},
{
"epoch": 2.8944911297852474,
"grad_norm": 5.1371893882751465,
"learning_rate": 4.4734832769516014e-05,
"loss": 0.8593,
"num_input_tokens_seen": 400392,
"step": 3100
},
{
"epoch": 2.899159663865546,
"grad_norm": 4.461278438568115,
"learning_rate": 4.470979639166784e-05,
"loss": 0.5156,
"num_input_tokens_seen": 401016,
"step": 3105
},
{
"epoch": 2.903828197945845,
"grad_norm": 3.752699851989746,
"learning_rate": 4.468470767091306e-05,
"loss": 0.7429,
"num_input_tokens_seen": 401608,
"step": 3110
},
{
"epoch": 2.9084967320261437,
"grad_norm": 7.799002170562744,
"learning_rate": 4.46595666738793e-05,
"loss": 0.4073,
"num_input_tokens_seen": 402328,
"step": 3115
},
{
"epoch": 2.9131652661064424,
"grad_norm": 1.570083498954773,
"learning_rate": 4.4634373467332994e-05,
"loss": 0.6496,
"num_input_tokens_seen": 402984,
"step": 3120
},
{
"epoch": 2.917833800186741,
"grad_norm": 1.5849401950836182,
"learning_rate": 4.46091281181792e-05,
"loss": 0.5304,
"num_input_tokens_seen": 403592,
"step": 3125
},
{
"epoch": 2.9225023342670404,
"grad_norm": 5.008321762084961,
"learning_rate": 4.458383069346152e-05,
"loss": 0.4337,
"num_input_tokens_seen": 404360,
"step": 3130
},
{
"epoch": 2.927170868347339,
"grad_norm": 2.1844277381896973,
"learning_rate": 4.4558481260361785e-05,
"loss": 0.4136,
"num_input_tokens_seen": 404984,
"step": 3135
},
{
"epoch": 2.931839402427638,
"grad_norm": 12.90602970123291,
"learning_rate": 4.453307988619997e-05,
"loss": 0.8935,
"num_input_tokens_seen": 405528,
"step": 3140
},
{
"epoch": 2.9365079365079367,
"grad_norm": 10.970650672912598,
"learning_rate": 4.4507626638434006e-05,
"loss": 0.3612,
"num_input_tokens_seen": 406104,
"step": 3145
},
{
"epoch": 2.9411764705882355,
"grad_norm": 5.149006366729736,
"learning_rate": 4.448212158465956e-05,
"loss": 0.2854,
"num_input_tokens_seen": 406712,
"step": 3150
},
{
"epoch": 2.9458450046685343,
"grad_norm": 4.595794200897217,
"learning_rate": 4.4456564792609886e-05,
"loss": 0.3998,
"num_input_tokens_seen": 407336,
"step": 3155
},
{
"epoch": 2.950513538748833,
"grad_norm": 3.2409141063690186,
"learning_rate": 4.4430956330155636e-05,
"loss": 0.4878,
"num_input_tokens_seen": 408008,
"step": 3160
},
{
"epoch": 2.955182072829132,
"grad_norm": 20.001317977905273,
"learning_rate": 4.440529626530469e-05,
"loss": 0.4073,
"num_input_tokens_seen": 408632,
"step": 3165
},
{
"epoch": 2.9598506069094306,
"grad_norm": 5.227000713348389,
"learning_rate": 4.4379584666201944e-05,
"loss": 0.9487,
"num_input_tokens_seen": 409240,
"step": 3170
},
{
"epoch": 2.9645191409897294,
"grad_norm": 8.687257766723633,
"learning_rate": 4.43538216011292e-05,
"loss": 0.6452,
"num_input_tokens_seen": 409912,
"step": 3175
},
{
"epoch": 2.969187675070028,
"grad_norm": 3.1073365211486816,
"learning_rate": 4.432800713850488e-05,
"loss": 0.6379,
"num_input_tokens_seen": 410632,
"step": 3180
},
{
"epoch": 2.973856209150327,
"grad_norm": 6.412373065948486,
"learning_rate": 4.430214134688394e-05,
"loss": 0.6797,
"num_input_tokens_seen": 411224,
"step": 3185
},
{
"epoch": 2.9785247432306257,
"grad_norm": 2.9643750190734863,
"learning_rate": 4.427622429495765e-05,
"loss": 0.6146,
"num_input_tokens_seen": 411848,
"step": 3190
},
{
"epoch": 2.9831932773109244,
"grad_norm": 15.312832832336426,
"learning_rate": 4.425025605155337e-05,
"loss": 0.48,
"num_input_tokens_seen": 412536,
"step": 3195
},
{
"epoch": 2.987861811391223,
"grad_norm": 6.596396446228027,
"learning_rate": 4.4224236685634466e-05,
"loss": 0.5964,
"num_input_tokens_seen": 413144,
"step": 3200
},
{
"epoch": 2.992530345471522,
"grad_norm": 3.3958661556243896,
"learning_rate": 4.419816626630003e-05,
"loss": 0.2579,
"num_input_tokens_seen": 413752,
"step": 3205
},
{
"epoch": 2.9971988795518207,
"grad_norm": 5.963001251220703,
"learning_rate": 4.417204486278475e-05,
"loss": 0.7093,
"num_input_tokens_seen": 414488,
"step": 3210
},
{
"epoch": 3.0018674136321195,
"grad_norm": 4.949418067932129,
"learning_rate": 4.414587254445869e-05,
"loss": 0.5218,
"num_input_tokens_seen": 415056,
"step": 3215
},
{
"epoch": 3.0028011204481793,
"eval_loss": 0.6635326147079468,
"eval_runtime": 3.8704,
"eval_samples_per_second": 61.493,
"eval_steps_per_second": 30.746,
"num_input_tokens_seen": 415184,
"step": 3216
},
{
"epoch": 3.0065359477124183,
"grad_norm": 4.510260105133057,
"learning_rate": 4.411964938082717e-05,
"loss": 0.4789,
"num_input_tokens_seen": 415712,
"step": 3220
},
{
"epoch": 3.011204481792717,
"grad_norm": 2.7946648597717285,
"learning_rate": 4.409337544153049e-05,
"loss": 0.4034,
"num_input_tokens_seen": 416320,
"step": 3225
},
{
"epoch": 3.015873015873016,
"grad_norm": 1.9573955535888672,
"learning_rate": 4.406705079634384e-05,
"loss": 0.2833,
"num_input_tokens_seen": 416992,
"step": 3230
},
{
"epoch": 3.0205415499533146,
"grad_norm": 4.682679653167725,
"learning_rate": 4.404067551517703e-05,
"loss": 0.2534,
"num_input_tokens_seen": 417728,
"step": 3235
},
{
"epoch": 3.0252100840336134,
"grad_norm": 7.025947570800781,
"learning_rate": 4.401424966807438e-05,
"loss": 0.558,
"num_input_tokens_seen": 418336,
"step": 3240
},
{
"epoch": 3.029878618113912,
"grad_norm": 3.119868278503418,
"learning_rate": 4.398777332521444e-05,
"loss": 0.4241,
"num_input_tokens_seen": 418928,
"step": 3245
},
{
"epoch": 3.034547152194211,
"grad_norm": 16.719175338745117,
"learning_rate": 4.3961246556909934e-05,
"loss": 0.2493,
"num_input_tokens_seen": 419472,
"step": 3250
},
{
"epoch": 3.0392156862745097,
"grad_norm": 1.7161365747451782,
"learning_rate": 4.393466943360745e-05,
"loss": 0.3051,
"num_input_tokens_seen": 420192,
"step": 3255
},
{
"epoch": 3.0438842203548084,
"grad_norm": 3.6444671154022217,
"learning_rate": 4.39080420258873e-05,
"loss": 0.5382,
"num_input_tokens_seen": 420864,
"step": 3260
},
{
"epoch": 3.048552754435107,
"grad_norm": 2.0355281829833984,
"learning_rate": 4.388136440446337e-05,
"loss": 0.3762,
"num_input_tokens_seen": 421408,
"step": 3265
},
{
"epoch": 3.053221288515406,
"grad_norm": 4.12183141708374,
"learning_rate": 4.385463664018288e-05,
"loss": 0.6386,
"num_input_tokens_seen": 422032,
"step": 3270
},
{
"epoch": 3.0578898225957047,
"grad_norm": 6.405667781829834,
"learning_rate": 4.382785880402619e-05,
"loss": 0.4338,
"num_input_tokens_seen": 422704,
"step": 3275
},
{
"epoch": 3.0625583566760035,
"grad_norm": 2.73160982131958,
"learning_rate": 4.3801030967106676e-05,
"loss": 0.2651,
"num_input_tokens_seen": 423328,
"step": 3280
},
{
"epoch": 3.0672268907563027,
"grad_norm": 11.54159164428711,
"learning_rate": 4.377415320067048e-05,
"loss": 0.2931,
"num_input_tokens_seen": 423920,
"step": 3285
},
{
"epoch": 3.0718954248366015,
"grad_norm": 6.4927144050598145,
"learning_rate": 4.374722557609633e-05,
"loss": 0.3357,
"num_input_tokens_seen": 424752,
"step": 3290
},
{
"epoch": 3.0765639589169003,
"grad_norm": 6.1774091720581055,
"learning_rate": 4.372024816489537e-05,
"loss": 0.4242,
"num_input_tokens_seen": 425456,
"step": 3295
},
{
"epoch": 3.081232492997199,
"grad_norm": 3.846402645111084,
"learning_rate": 4.3693221038710986e-05,
"loss": 0.3312,
"num_input_tokens_seen": 426096,
"step": 3300
},
{
"epoch": 3.085901027077498,
"grad_norm": 4.391523361206055,
"learning_rate": 4.366614426931855e-05,
"loss": 0.4697,
"num_input_tokens_seen": 426720,
"step": 3305
},
{
"epoch": 3.0905695611577966,
"grad_norm": 5.942804336547852,
"learning_rate": 4.363901792862529e-05,
"loss": 0.6869,
"num_input_tokens_seen": 427344,
"step": 3310
},
{
"epoch": 3.0952380952380953,
"grad_norm": 7.151164531707764,
"learning_rate": 4.361184208867009e-05,
"loss": 0.4796,
"num_input_tokens_seen": 427904,
"step": 3315
},
{
"epoch": 3.099906629318394,
"grad_norm": 8.739173889160156,
"learning_rate": 4.3584616821623267e-05,
"loss": 0.3204,
"num_input_tokens_seen": 428480,
"step": 3320
},
{
"epoch": 3.104575163398693,
"grad_norm": 1.7101725339889526,
"learning_rate": 4.3557342199786414e-05,
"loss": 0.5556,
"num_input_tokens_seen": 429104,
"step": 3325
},
{
"epoch": 3.1092436974789917,
"grad_norm": 5.500372886657715,
"learning_rate": 4.353001829559219e-05,
"loss": 0.5862,
"num_input_tokens_seen": 429824,
"step": 3330
},
{
"epoch": 3.1139122315592904,
"grad_norm": 4.01257848739624,
"learning_rate": 4.350264518160414e-05,
"loss": 0.3249,
"num_input_tokens_seen": 430528,
"step": 3335
},
{
"epoch": 3.118580765639589,
"grad_norm": 5.113812446594238,
"learning_rate": 4.347522293051648e-05,
"loss": 0.6022,
"num_input_tokens_seen": 431120,
"step": 3340
},
{
"epoch": 3.123249299719888,
"grad_norm": 3.563166618347168,
"learning_rate": 4.344775161515393e-05,
"loss": 0.2757,
"num_input_tokens_seen": 431744,
"step": 3345
},
{
"epoch": 3.1279178338001867,
"grad_norm": 5.309462547302246,
"learning_rate": 4.3420231308471496e-05,
"loss": 0.4937,
"num_input_tokens_seen": 432432,
"step": 3350
},
{
"epoch": 3.1325863678804855,
"grad_norm": 6.111654758453369,
"learning_rate": 4.3392662083554316e-05,
"loss": 0.301,
"num_input_tokens_seen": 433104,
"step": 3355
},
{
"epoch": 3.1372549019607843,
"grad_norm": 3.6707801818847656,
"learning_rate": 4.3365044013617406e-05,
"loss": 0.3583,
"num_input_tokens_seen": 433728,
"step": 3360
},
{
"epoch": 3.141923436041083,
"grad_norm": 2.1480159759521484,
"learning_rate": 4.3337377172005524e-05,
"loss": 0.5164,
"num_input_tokens_seen": 434384,
"step": 3365
},
{
"epoch": 3.146591970121382,
"grad_norm": 1.8540209531784058,
"learning_rate": 4.330966163219293e-05,
"loss": 0.2326,
"num_input_tokens_seen": 435056,
"step": 3370
},
{
"epoch": 3.1512605042016806,
"grad_norm": 13.281586647033691,
"learning_rate": 4.328189746778323e-05,
"loss": 0.3451,
"num_input_tokens_seen": 435680,
"step": 3375
},
{
"epoch": 3.1559290382819793,
"grad_norm": 4.929521083831787,
"learning_rate": 4.3254084752509145e-05,
"loss": 0.3595,
"num_input_tokens_seen": 436336,
"step": 3380
},
{
"epoch": 3.160597572362278,
"grad_norm": 1.3814646005630493,
"learning_rate": 4.322622356023235e-05,
"loss": 0.4323,
"num_input_tokens_seen": 437008,
"step": 3385
},
{
"epoch": 3.165266106442577,
"grad_norm": 11.279054641723633,
"learning_rate": 4.319831396494324e-05,
"loss": 0.4234,
"num_input_tokens_seen": 437744,
"step": 3390
},
{
"epoch": 3.1699346405228757,
"grad_norm": 9.408576965332031,
"learning_rate": 4.317035604076076e-05,
"loss": 0.5266,
"num_input_tokens_seen": 438384,
"step": 3395
},
{
"epoch": 3.1746031746031744,
"grad_norm": 4.971055507659912,
"learning_rate": 4.3142349861932205e-05,
"loss": 0.237,
"num_input_tokens_seen": 439072,
"step": 3400
},
{
"epoch": 3.179271708683473,
"grad_norm": 13.090262413024902,
"learning_rate": 4.3114295502833026e-05,
"loss": 0.4381,
"num_input_tokens_seen": 439680,
"step": 3405
},
{
"epoch": 3.1839402427637724,
"grad_norm": 3.374798059463501,
"learning_rate": 4.3086193037966593e-05,
"loss": 0.8413,
"num_input_tokens_seen": 440320,
"step": 3410
},
{
"epoch": 3.188608776844071,
"grad_norm": 2.8687775135040283,
"learning_rate": 4.305804254196407e-05,
"loss": 0.3212,
"num_input_tokens_seen": 440944,
"step": 3415
},
{
"epoch": 3.19327731092437,
"grad_norm": 7.667285919189453,
"learning_rate": 4.302984408958416e-05,
"loss": 0.4842,
"num_input_tokens_seen": 441520,
"step": 3420
},
{
"epoch": 3.1979458450046687,
"grad_norm": 0.6867608428001404,
"learning_rate": 4.3001597755712906e-05,
"loss": 0.3884,
"num_input_tokens_seen": 442176,
"step": 3425
},
{
"epoch": 3.2026143790849675,
"grad_norm": 5.781373977661133,
"learning_rate": 4.297330361536354e-05,
"loss": 0.3297,
"num_input_tokens_seen": 442864,
"step": 3430
},
{
"epoch": 3.2072829131652663,
"grad_norm": 3.4922213554382324,
"learning_rate": 4.294496174367623e-05,
"loss": 0.4167,
"num_input_tokens_seen": 443504,
"step": 3435
},
{
"epoch": 3.211951447245565,
"grad_norm": 6.011383056640625,
"learning_rate": 4.2916572215917906e-05,
"loss": 0.4975,
"num_input_tokens_seen": 444160,
"step": 3440
},
{
"epoch": 3.216619981325864,
"grad_norm": 4.055123805999756,
"learning_rate": 4.2888135107482067e-05,
"loss": 0.7297,
"num_input_tokens_seen": 444768,
"step": 3445
},
{
"epoch": 3.2212885154061626,
"grad_norm": 14.692686080932617,
"learning_rate": 4.2859650493888556e-05,
"loss": 0.7433,
"num_input_tokens_seen": 445424,
"step": 3450
},
{
"epoch": 3.2259570494864613,
"grad_norm": 14.178847312927246,
"learning_rate": 4.283111845078339e-05,
"loss": 0.5043,
"num_input_tokens_seen": 446160,
"step": 3455
},
{
"epoch": 3.23062558356676,
"grad_norm": 8.43553638458252,
"learning_rate": 4.280253905393855e-05,
"loss": 0.5195,
"num_input_tokens_seen": 446768,
"step": 3460
},
{
"epoch": 3.235294117647059,
"grad_norm": 7.715728759765625,
"learning_rate": 4.277391237925174e-05,
"loss": 0.4626,
"num_input_tokens_seen": 447376,
"step": 3465
},
{
"epoch": 3.2399626517273576,
"grad_norm": 3.9560585021972656,
"learning_rate": 4.274523850274625e-05,
"loss": 0.4005,
"num_input_tokens_seen": 447952,
"step": 3470
},
{
"epoch": 3.2446311858076564,
"grad_norm": 3.0296170711517334,
"learning_rate": 4.2716517500570705e-05,
"loss": 0.3344,
"num_input_tokens_seen": 448624,
"step": 3475
},
{
"epoch": 3.249299719887955,
"grad_norm": 6.199625492095947,
"learning_rate": 4.2687749448998906e-05,
"loss": 0.5182,
"num_input_tokens_seen": 449184,
"step": 3480
},
{
"epoch": 3.253968253968254,
"grad_norm": 7.57280969619751,
"learning_rate": 4.265893442442957e-05,
"loss": 0.7696,
"num_input_tokens_seen": 449856,
"step": 3485
},
{
"epoch": 3.2586367880485527,
"grad_norm": 5.486474990844727,
"learning_rate": 4.2630072503386165e-05,
"loss": 0.3264,
"num_input_tokens_seen": 450464,
"step": 3490
},
{
"epoch": 3.2633053221288515,
"grad_norm": 6.274860382080078,
"learning_rate": 4.260116376251672e-05,
"loss": 0.6919,
"num_input_tokens_seen": 451104,
"step": 3495
},
{
"epoch": 3.2679738562091503,
"grad_norm": 2.639403820037842,
"learning_rate": 4.2572208278593596e-05,
"loss": 0.3676,
"num_input_tokens_seen": 451840,
"step": 3500
},
{
"epoch": 3.272642390289449,
"grad_norm": 8.113277435302734,
"learning_rate": 4.254320612851328e-05,
"loss": 0.5073,
"num_input_tokens_seen": 452528,
"step": 3505
},
{
"epoch": 3.277310924369748,
"grad_norm": 2.9319417476654053,
"learning_rate": 4.2514157389296196e-05,
"loss": 0.1423,
"num_input_tokens_seen": 453136,
"step": 3510
},
{
"epoch": 3.2819794584500466,
"grad_norm": 2.199583053588867,
"learning_rate": 4.248506213808648e-05,
"loss": 0.2884,
"num_input_tokens_seen": 453728,
"step": 3515
},
{
"epoch": 3.2866479925303453,
"grad_norm": 5.822009563446045,
"learning_rate": 4.245592045215182e-05,
"loss": 0.1555,
"num_input_tokens_seen": 454384,
"step": 3520
},
{
"epoch": 3.291316526610644,
"grad_norm": 2.6942152976989746,
"learning_rate": 4.242673240888319e-05,
"loss": 0.2941,
"num_input_tokens_seen": 454960,
"step": 3525
},
{
"epoch": 3.295985060690943,
"grad_norm": 16.42827606201172,
"learning_rate": 4.239749808579468e-05,
"loss": 0.5326,
"num_input_tokens_seen": 455648,
"step": 3530
},
{
"epoch": 3.3006535947712417,
"grad_norm": 6.735487937927246,
"learning_rate": 4.2368217560523306e-05,
"loss": 0.3945,
"num_input_tokens_seen": 456320,
"step": 3535
},
{
"epoch": 3.3053221288515404,
"grad_norm": 4.459935188293457,
"learning_rate": 4.233889091082874e-05,
"loss": 0.1235,
"num_input_tokens_seen": 456912,
"step": 3540
},
{
"epoch": 3.309990662931839,
"grad_norm": 7.533689975738525,
"learning_rate": 4.230951821459319e-05,
"loss": 0.392,
"num_input_tokens_seen": 457568,
"step": 3545
},
{
"epoch": 3.314659197012138,
"grad_norm": 5.718842029571533,
"learning_rate": 4.228009954982112e-05,
"loss": 0.2334,
"num_input_tokens_seen": 458416,
"step": 3550
},
{
"epoch": 3.3193277310924367,
"grad_norm": 6.39036750793457,
"learning_rate": 4.2250634994639095e-05,
"loss": 0.5383,
"num_input_tokens_seen": 459136,
"step": 3555
},
{
"epoch": 3.323996265172736,
"grad_norm": 4.211606979370117,
"learning_rate": 4.222112462729552e-05,
"loss": 0.3948,
"num_input_tokens_seen": 459824,
"step": 3560
},
{
"epoch": 3.3286647992530347,
"grad_norm": 4.4313273429870605,
"learning_rate": 4.2191568526160485e-05,
"loss": 0.3832,
"num_input_tokens_seen": 460544,
"step": 3565
},
{
"epoch": 3.3333333333333335,
"grad_norm": 4.849913597106934,
"learning_rate": 4.216196676972553e-05,
"loss": 0.6303,
"num_input_tokens_seen": 461120,
"step": 3570
},
{
"epoch": 3.3380018674136323,
"grad_norm": 6.789108753204346,
"learning_rate": 4.213231943660344e-05,
"loss": 0.5102,
"num_input_tokens_seen": 461712,
"step": 3575
},
{
"epoch": 3.342670401493931,
"grad_norm": 5.179975986480713,
"learning_rate": 4.210262660552804e-05,
"loss": 0.5964,
"num_input_tokens_seen": 462368,
"step": 3580
},
{
"epoch": 3.34733893557423,
"grad_norm": 2.2866899967193604,
"learning_rate": 4.2072888355353966e-05,
"loss": 0.3182,
"num_input_tokens_seen": 463088,
"step": 3585
},
{
"epoch": 3.3520074696545286,
"grad_norm": 14.321250915527344,
"learning_rate": 4.2043104765056504e-05,
"loss": 0.3753,
"num_input_tokens_seen": 463648,
"step": 3590
},
{
"epoch": 3.3566760037348273,
"grad_norm": 2.1171340942382812,
"learning_rate": 4.2013275913731315e-05,
"loss": 0.3841,
"num_input_tokens_seen": 464288,
"step": 3595
},
{
"epoch": 3.361344537815126,
"grad_norm": 4.470531940460205,
"learning_rate": 4.198340188059429e-05,
"loss": 0.3792,
"num_input_tokens_seen": 464992,
"step": 3600
},
{
"epoch": 3.366013071895425,
"grad_norm": 2.543765068054199,
"learning_rate": 4.1953482744981274e-05,
"loss": 0.2807,
"num_input_tokens_seen": 465712,
"step": 3605
},
{
"epoch": 3.3706816059757236,
"grad_norm": 3.711956262588501,
"learning_rate": 4.192351858634792e-05,
"loss": 0.3673,
"num_input_tokens_seen": 466320,
"step": 3610
},
{
"epoch": 3.3753501400560224,
"grad_norm": 6.865347385406494,
"learning_rate": 4.1893509484269443e-05,
"loss": 0.505,
"num_input_tokens_seen": 466944,
"step": 3615
},
{
"epoch": 3.380018674136321,
"grad_norm": 4.681259632110596,
"learning_rate": 4.186345551844039e-05,
"loss": 0.4728,
"num_input_tokens_seen": 467616,
"step": 3620
},
{
"epoch": 3.38468720821662,
"grad_norm": 4.106618881225586,
"learning_rate": 4.183335676867448e-05,
"loss": 0.3185,
"num_input_tokens_seen": 468208,
"step": 3625
},
{
"epoch": 3.3893557422969187,
"grad_norm": 1.4591264724731445,
"learning_rate": 4.180321331490436e-05,
"loss": 0.1859,
"num_input_tokens_seen": 468912,
"step": 3630
},
{
"epoch": 3.3940242763772175,
"grad_norm": 4.069035053253174,
"learning_rate": 4.1773025237181365e-05,
"loss": 0.4485,
"num_input_tokens_seen": 469536,
"step": 3635
},
{
"epoch": 3.3986928104575163,
"grad_norm": 3.73283314704895,
"learning_rate": 4.1742792615675385e-05,
"loss": 0.4666,
"num_input_tokens_seen": 470112,
"step": 3640
},
{
"epoch": 3.403361344537815,
"grad_norm": 12.443367004394531,
"learning_rate": 4.171251553067457e-05,
"loss": 0.4469,
"num_input_tokens_seen": 470784,
"step": 3645
},
{
"epoch": 3.408029878618114,
"grad_norm": 3.0841293334960938,
"learning_rate": 4.168219406258515e-05,
"loss": 0.493,
"num_input_tokens_seen": 471456,
"step": 3650
},
{
"epoch": 3.4126984126984126,
"grad_norm": 4.980912685394287,
"learning_rate": 4.1651828291931264e-05,
"loss": 0.3285,
"num_input_tokens_seen": 472144,
"step": 3655
},
{
"epoch": 3.4173669467787113,
"grad_norm": 3.9507060050964355,
"learning_rate": 4.1621418299354634e-05,
"loss": 0.5102,
"num_input_tokens_seen": 472848,
"step": 3660
},
{
"epoch": 3.42203548085901,
"grad_norm": 3.0940191745758057,
"learning_rate": 4.159096416561449e-05,
"loss": 0.5427,
"num_input_tokens_seen": 473392,
"step": 3665
},
{
"epoch": 3.426704014939309,
"grad_norm": 4.08584451675415,
"learning_rate": 4.156046597158724e-05,
"loss": 0.2202,
"num_input_tokens_seen": 474032,
"step": 3670
},
{
"epoch": 3.431372549019608,
"grad_norm": 9.69025707244873,
"learning_rate": 4.1529923798266326e-05,
"loss": 0.4965,
"num_input_tokens_seen": 474640,
"step": 3675
},
{
"epoch": 3.436041083099907,
"grad_norm": 3.4433095455169678,
"learning_rate": 4.149933772676198e-05,
"loss": 0.5427,
"num_input_tokens_seen": 475232,
"step": 3680
},
{
"epoch": 3.4407096171802056,
"grad_norm": 3.5992636680603027,
"learning_rate": 4.146870783830101e-05,
"loss": 0.4117,
"num_input_tokens_seen": 475824,
"step": 3685
},
{
"epoch": 3.4453781512605044,
"grad_norm": 2.926135778427124,
"learning_rate": 4.14380342142266e-05,
"loss": 0.4184,
"num_input_tokens_seen": 476448,
"step": 3690
},
{
"epoch": 3.450046685340803,
"grad_norm": 5.663197040557861,
"learning_rate": 4.140731693599805e-05,
"loss": 0.3798,
"num_input_tokens_seen": 477024,
"step": 3695
},
{
"epoch": 3.454715219421102,
"grad_norm": 4.990402698516846,
"learning_rate": 4.137655608519063e-05,
"loss": 0.3333,
"num_input_tokens_seen": 477664,
"step": 3700
},
{
"epoch": 3.4593837535014007,
"grad_norm": 5.383077621459961,
"learning_rate": 4.13457517434953e-05,
"loss": 0.6738,
"num_input_tokens_seen": 478272,
"step": 3705
},
{
"epoch": 3.4640522875816995,
"grad_norm": 4.2853803634643555,
"learning_rate": 4.131490399271852e-05,
"loss": 0.6813,
"num_input_tokens_seen": 478816,
"step": 3710
},
{
"epoch": 3.4687208216619982,
"grad_norm": 9.064924240112305,
"learning_rate": 4.128401291478206e-05,
"loss": 0.5682,
"num_input_tokens_seen": 479440,
"step": 3715
},
{
"epoch": 3.473389355742297,
"grad_norm": 2.432324171066284,
"learning_rate": 4.12530785917227e-05,
"loss": 0.3548,
"num_input_tokens_seen": 480080,
"step": 3720
},
{
"epoch": 3.478057889822596,
"grad_norm": 2.6238317489624023,
"learning_rate": 4.1222101105692116e-05,
"loss": 0.7767,
"num_input_tokens_seen": 480768,
"step": 3725
},
{
"epoch": 3.4827264239028946,
"grad_norm": 5.692093849182129,
"learning_rate": 4.1191080538956586e-05,
"loss": 0.4093,
"num_input_tokens_seen": 481392,
"step": 3730
},
{
"epoch": 3.4873949579831933,
"grad_norm": 5.617989540100098,
"learning_rate": 4.116001697389678e-05,
"loss": 0.3229,
"num_input_tokens_seen": 482064,
"step": 3735
},
{
"epoch": 3.492063492063492,
"grad_norm": 11.547221183776855,
"learning_rate": 4.11289104930076e-05,
"loss": 0.4888,
"num_input_tokens_seen": 482800,
"step": 3740
},
{
"epoch": 3.496732026143791,
"grad_norm": 3.141573429107666,
"learning_rate": 4.109776117889789e-05,
"loss": 0.2159,
"num_input_tokens_seen": 483472,
"step": 3745
},
{
"epoch": 3.5014005602240896,
"grad_norm": 14.447588920593262,
"learning_rate": 4.1066569114290257e-05,
"loss": 0.3896,
"num_input_tokens_seen": 484320,
"step": 3750
},
{
"epoch": 3.503267973856209,
"eval_loss": 0.6631819009780884,
"eval_runtime": 3.8657,
"eval_samples_per_second": 61.568,
"eval_steps_per_second": 30.784,
"num_input_tokens_seen": 484576,
"step": 3752
},
{
"epoch": 3.5060690943043884,
"grad_norm": 6.598292827606201,
"learning_rate": 4.103533438202082e-05,
"loss": 0.4745,
"num_input_tokens_seen": 484976,
"step": 3755
},
{
"epoch": 3.510737628384687,
"grad_norm": 2.7642271518707275,
"learning_rate": 4.100405706503904e-05,
"loss": 0.317,
"num_input_tokens_seen": 485568,
"step": 3760
},
{
"epoch": 3.515406162464986,
"grad_norm": 4.99310302734375,
"learning_rate": 4.0972737246407444e-05,
"loss": 0.5242,
"num_input_tokens_seen": 486256,
"step": 3765
},
{
"epoch": 3.5200746965452847,
"grad_norm": 12.541024208068848,
"learning_rate": 4.0941375009301444e-05,
"loss": 0.7703,
"num_input_tokens_seen": 486864,
"step": 3770
},
{
"epoch": 3.5247432306255835,
"grad_norm": 10.067193031311035,
"learning_rate": 4.0909970437009096e-05,
"loss": 0.635,
"num_input_tokens_seen": 487568,
"step": 3775
},
{
"epoch": 3.5294117647058822,
"grad_norm": 2.522975444793701,
"learning_rate": 4.087852361293088e-05,
"loss": 0.3526,
"num_input_tokens_seen": 488256,
"step": 3780
},
{
"epoch": 3.534080298786181,
"grad_norm": 4.470706939697266,
"learning_rate": 4.084703462057949e-05,
"loss": 0.3543,
"num_input_tokens_seen": 488880,
"step": 3785
},
{
"epoch": 3.53874883286648,
"grad_norm": 1.5186313390731812,
"learning_rate": 4.081550354357962e-05,
"loss": 0.2626,
"num_input_tokens_seen": 489712,
"step": 3790
},
{
"epoch": 3.5434173669467786,
"grad_norm": 8.709242820739746,
"learning_rate": 4.078393046566769e-05,
"loss": 0.5864,
"num_input_tokens_seen": 490352,
"step": 3795
},
{
"epoch": 3.5480859010270773,
"grad_norm": 8.166577339172363,
"learning_rate": 4.0752315470691696e-05,
"loss": 0.6285,
"num_input_tokens_seen": 490992,
"step": 3800
},
{
"epoch": 3.552754435107376,
"grad_norm": 1.631000280380249,
"learning_rate": 4.0720658642610934e-05,
"loss": 0.2448,
"num_input_tokens_seen": 491712,
"step": 3805
},
{
"epoch": 3.557422969187675,
"grad_norm": 7.412605285644531,
"learning_rate": 4.068896006549579e-05,
"loss": 0.5105,
"num_input_tokens_seen": 492352,
"step": 3810
},
{
"epoch": 3.5620915032679736,
"grad_norm": 2.645429849624634,
"learning_rate": 4.0657219823527566e-05,
"loss": 0.6244,
"num_input_tokens_seen": 492976,
"step": 3815
},
{
"epoch": 3.5667600373482724,
"grad_norm": 5.971198558807373,
"learning_rate": 4.0625438000998153e-05,
"loss": 0.2768,
"num_input_tokens_seen": 493712,
"step": 3820
},
{
"epoch": 3.571428571428571,
"grad_norm": 3.65240216255188,
"learning_rate": 4.059361468230989e-05,
"loss": 0.5011,
"num_input_tokens_seen": 494320,
"step": 3825
},
{
"epoch": 3.57609710550887,
"grad_norm": 2.6212239265441895,
"learning_rate": 4.0561749951975324e-05,
"loss": 0.5193,
"num_input_tokens_seen": 494960,
"step": 3830
},
{
"epoch": 3.580765639589169,
"grad_norm": 6.561978340148926,
"learning_rate": 4.052984389461698e-05,
"loss": 0.2862,
"num_input_tokens_seen": 495664,
"step": 3835
},
{
"epoch": 3.585434173669468,
"grad_norm": 3.486088275909424,
"learning_rate": 4.049789659496712e-05,
"loss": 0.3949,
"num_input_tokens_seen": 496320,
"step": 3840
},
{
"epoch": 3.5901027077497667,
"grad_norm": 8.017462730407715,
"learning_rate": 4.0465908137867545e-05,
"loss": 0.5328,
"num_input_tokens_seen": 496992,
"step": 3845
},
{
"epoch": 3.5947712418300655,
"grad_norm": 5.656188488006592,
"learning_rate": 4.043387860826936e-05,
"loss": 0.5002,
"num_input_tokens_seen": 497600,
"step": 3850
},
{
"epoch": 3.5994397759103642,
"grad_norm": 0.9717182517051697,
"learning_rate": 4.040180809123272e-05,
"loss": 0.564,
"num_input_tokens_seen": 498192,
"step": 3855
},
{
"epoch": 3.604108309990663,
"grad_norm": 8.40697956085205,
"learning_rate": 4.036969667192665e-05,
"loss": 0.2135,
"num_input_tokens_seen": 498928,
"step": 3860
},
{
"epoch": 3.6087768440709618,
"grad_norm": 1.4905232191085815,
"learning_rate": 4.03375444356288e-05,
"loss": 0.1885,
"num_input_tokens_seen": 499648,
"step": 3865
},
{
"epoch": 3.6134453781512605,
"grad_norm": 5.896583557128906,
"learning_rate": 4.030535146772521e-05,
"loss": 1.0008,
"num_input_tokens_seen": 500224,
"step": 3870
},
{
"epoch": 3.6181139122315593,
"grad_norm": 4.838075637817383,
"learning_rate": 4.027311785371009e-05,
"loss": 0.2981,
"num_input_tokens_seen": 500832,
"step": 3875
},
{
"epoch": 3.622782446311858,
"grad_norm": 4.020202159881592,
"learning_rate": 4.0240843679185603e-05,
"loss": 0.6731,
"num_input_tokens_seen": 501440,
"step": 3880
},
{
"epoch": 3.627450980392157,
"grad_norm": 4.53800630569458,
"learning_rate": 4.020852902986162e-05,
"loss": 0.3302,
"num_input_tokens_seen": 501968,
"step": 3885
},
{
"epoch": 3.6321195144724556,
"grad_norm": 4.003100872039795,
"learning_rate": 4.017617399155548e-05,
"loss": 0.5148,
"num_input_tokens_seen": 502576,
"step": 3890
},
{
"epoch": 3.6367880485527544,
"grad_norm": 5.846912860870361,
"learning_rate": 4.0143778650191835e-05,
"loss": 0.3139,
"num_input_tokens_seen": 503280,
"step": 3895
},
{
"epoch": 3.641456582633053,
"grad_norm": 5.185009956359863,
"learning_rate": 4.01113430918023e-05,
"loss": 0.4181,
"num_input_tokens_seen": 503920,
"step": 3900
},
{
"epoch": 3.646125116713352,
"grad_norm": 3.249129295349121,
"learning_rate": 4.0078867402525354e-05,
"loss": 0.3392,
"num_input_tokens_seen": 504608,
"step": 3905
},
{
"epoch": 3.6507936507936507,
"grad_norm": 3.677781343460083,
"learning_rate": 4.004635166860602e-05,
"loss": 0.3489,
"num_input_tokens_seen": 505152,
"step": 3910
},
{
"epoch": 3.6554621848739495,
"grad_norm": 1.4997936487197876,
"learning_rate": 4.0013795976395674e-05,
"loss": 0.2741,
"num_input_tokens_seen": 505856,
"step": 3915
},
{
"epoch": 3.6601307189542482,
"grad_norm": 2.245586633682251,
"learning_rate": 3.9981200412351816e-05,
"loss": 0.332,
"num_input_tokens_seen": 506480,
"step": 3920
},
{
"epoch": 3.664799253034547,
"grad_norm": 7.378620147705078,
"learning_rate": 3.99485650630378e-05,
"loss": 0.5586,
"num_input_tokens_seen": 507040,
"step": 3925
},
{
"epoch": 3.669467787114846,
"grad_norm": 43.491329193115234,
"learning_rate": 3.9915890015122683e-05,
"loss": 0.3228,
"num_input_tokens_seen": 507632,
"step": 3930
},
{
"epoch": 3.674136321195145,
"grad_norm": 4.064418315887451,
"learning_rate": 3.988317535538092e-05,
"loss": 0.3529,
"num_input_tokens_seen": 508208,
"step": 3935
},
{
"epoch": 3.6788048552754438,
"grad_norm": 7.0785627365112305,
"learning_rate": 3.985042117069217e-05,
"loss": 0.85,
"num_input_tokens_seen": 508832,
"step": 3940
},
{
"epoch": 3.6834733893557425,
"grad_norm": 1.5454710721969604,
"learning_rate": 3.981762754804107e-05,
"loss": 0.4017,
"num_input_tokens_seen": 509488,
"step": 3945
},
{
"epoch": 3.6881419234360413,
"grad_norm": 6.187710762023926,
"learning_rate": 3.9784794574516945e-05,
"loss": 0.4936,
"num_input_tokens_seen": 510112,
"step": 3950
},
{
"epoch": 3.69281045751634,
"grad_norm": 4.999000549316406,
"learning_rate": 3.975192233731369e-05,
"loss": 0.4316,
"num_input_tokens_seen": 510720,
"step": 3955
},
{
"epoch": 3.697478991596639,
"grad_norm": 2.906360149383545,
"learning_rate": 3.971901092372942e-05,
"loss": 0.3729,
"num_input_tokens_seen": 511344,
"step": 3960
},
{
"epoch": 3.7021475256769376,
"grad_norm": 2.675576686859131,
"learning_rate": 3.968606042116632e-05,
"loss": 0.483,
"num_input_tokens_seen": 512016,
"step": 3965
},
{
"epoch": 3.7068160597572364,
"grad_norm": 6.858824729919434,
"learning_rate": 3.965307091713037e-05,
"loss": 0.5243,
"num_input_tokens_seen": 512592,
"step": 3970
},
{
"epoch": 3.711484593837535,
"grad_norm": 14.2874174118042,
"learning_rate": 3.962004249923112e-05,
"loss": 0.5098,
"num_input_tokens_seen": 513248,
"step": 3975
},
{
"epoch": 3.716153127917834,
"grad_norm": 8.637978553771973,
"learning_rate": 3.958697525518148e-05,
"loss": 0.3194,
"num_input_tokens_seen": 513888,
"step": 3980
},
{
"epoch": 3.7208216619981327,
"grad_norm": 4.814327716827393,
"learning_rate": 3.955386927279744e-05,
"loss": 0.4633,
"num_input_tokens_seen": 514480,
"step": 3985
},
{
"epoch": 3.7254901960784315,
"grad_norm": 5.299570560455322,
"learning_rate": 3.952072463999791e-05,
"loss": 0.3609,
"num_input_tokens_seen": 515072,
"step": 3990
},
{
"epoch": 3.7301587301587302,
"grad_norm": 4.821151256561279,
"learning_rate": 3.94875414448044e-05,
"loss": 0.6235,
"num_input_tokens_seen": 515712,
"step": 3995
},
{
"epoch": 3.734827264239029,
"grad_norm": 4.8716254234313965,
"learning_rate": 3.945431977534086e-05,
"loss": 0.574,
"num_input_tokens_seen": 516464,
"step": 4000
},
{
"epoch": 3.7394957983193278,
"grad_norm": 13.349778175354004,
"learning_rate": 3.942105971983341e-05,
"loss": 0.4969,
"num_input_tokens_seen": 517168,
"step": 4005
},
{
"epoch": 3.7441643323996265,
"grad_norm": 4.25555419921875,
"learning_rate": 3.938776136661008e-05,
"loss": 0.6308,
"num_input_tokens_seen": 517792,
"step": 4010
},
{
"epoch": 3.7488328664799253,
"grad_norm": 2.265429735183716,
"learning_rate": 3.935442480410065e-05,
"loss": 0.5301,
"num_input_tokens_seen": 518480,
"step": 4015
},
{
"epoch": 3.753501400560224,
"grad_norm": 4.872378826141357,
"learning_rate": 3.932105012083637e-05,
"loss": 0.3686,
"num_input_tokens_seen": 519136,
"step": 4020
},
{
"epoch": 3.758169934640523,
"grad_norm": 2.909376382827759,
"learning_rate": 3.928763740544967e-05,
"loss": 0.3435,
"num_input_tokens_seen": 519696,
"step": 4025
},
{
"epoch": 3.7628384687208216,
"grad_norm": 2.4545042514801025,
"learning_rate": 3.925418674667405e-05,
"loss": 0.1534,
"num_input_tokens_seen": 520304,
"step": 4030
},
{
"epoch": 3.7675070028011204,
"grad_norm": 5.603614330291748,
"learning_rate": 3.922069823334373e-05,
"loss": 0.3479,
"num_input_tokens_seen": 520944,
"step": 4035
},
{
"epoch": 3.772175536881419,
"grad_norm": 5.503204822540283,
"learning_rate": 3.918717195439349e-05,
"loss": 0.6142,
"num_input_tokens_seen": 521584,
"step": 4040
},
{
"epoch": 3.776844070961718,
"grad_norm": 9.066848754882812,
"learning_rate": 3.915360799885837e-05,
"loss": 0.4284,
"num_input_tokens_seen": 522208,
"step": 4045
},
{
"epoch": 3.7815126050420167,
"grad_norm": 6.612861156463623,
"learning_rate": 3.9120006455873506e-05,
"loss": 0.417,
"num_input_tokens_seen": 522880,
"step": 4050
},
{
"epoch": 3.7861811391223155,
"grad_norm": 3.602097511291504,
"learning_rate": 3.908636741467382e-05,
"loss": 0.3534,
"num_input_tokens_seen": 523520,
"step": 4055
},
{
"epoch": 3.7908496732026142,
"grad_norm": 1.3662033081054688,
"learning_rate": 3.905269096459384e-05,
"loss": 0.2182,
"num_input_tokens_seen": 524208,
"step": 4060
},
{
"epoch": 3.795518207282913,
"grad_norm": 9.043498992919922,
"learning_rate": 3.901897719506743e-05,
"loss": 0.2156,
"num_input_tokens_seen": 524912,
"step": 4065
},
{
"epoch": 3.8001867413632118,
"grad_norm": 6.337942600250244,
"learning_rate": 3.8985226195627563e-05,
"loss": 0.3852,
"num_input_tokens_seen": 525488,
"step": 4070
},
{
"epoch": 3.8048552754435105,
"grad_norm": 0.5356036424636841,
"learning_rate": 3.8951438055906084e-05,
"loss": 0.3144,
"num_input_tokens_seen": 526096,
"step": 4075
},
{
"epoch": 3.8095238095238093,
"grad_norm": 7.135247707366943,
"learning_rate": 3.891761286563347e-05,
"loss": 0.8808,
"num_input_tokens_seen": 526752,
"step": 4080
},
{
"epoch": 3.814192343604108,
"grad_norm": 1.464228630065918,
"learning_rate": 3.88837507146386e-05,
"loss": 0.2911,
"num_input_tokens_seen": 527440,
"step": 4085
},
{
"epoch": 3.818860877684407,
"grad_norm": 3.206880569458008,
"learning_rate": 3.88498516928485e-05,
"loss": 0.6713,
"num_input_tokens_seen": 528032,
"step": 4090
},
{
"epoch": 3.8235294117647056,
"grad_norm": 5.08414363861084,
"learning_rate": 3.881591589028809e-05,
"loss": 0.3554,
"num_input_tokens_seen": 528688,
"step": 4095
},
{
"epoch": 3.828197945845005,
"grad_norm": 2.6961653232574463,
"learning_rate": 3.878194339708002e-05,
"loss": 0.2922,
"num_input_tokens_seen": 529344,
"step": 4100
},
{
"epoch": 3.8328664799253036,
"grad_norm": 4.350805759429932,
"learning_rate": 3.8747934303444344e-05,
"loss": 0.4911,
"num_input_tokens_seen": 529920,
"step": 4105
},
{
"epoch": 3.8375350140056024,
"grad_norm": 12.087987899780273,
"learning_rate": 3.871388869969833e-05,
"loss": 1.0404,
"num_input_tokens_seen": 530544,
"step": 4110
},
{
"epoch": 3.842203548085901,
"grad_norm": 6.6325883865356445,
"learning_rate": 3.867980667625618e-05,
"loss": 0.6221,
"num_input_tokens_seen": 531232,
"step": 4115
},
{
"epoch": 3.8468720821662,
"grad_norm": 4.015625953674316,
"learning_rate": 3.864568832362885e-05,
"loss": 0.4101,
"num_input_tokens_seen": 531872,
"step": 4120
},
{
"epoch": 3.8515406162464987,
"grad_norm": 4.436892986297607,
"learning_rate": 3.861153373242374e-05,
"loss": 0.2394,
"num_input_tokens_seen": 532576,
"step": 4125
},
{
"epoch": 3.8562091503267975,
"grad_norm": 2.150585651397705,
"learning_rate": 3.857734299334452e-05,
"loss": 0.3944,
"num_input_tokens_seen": 533216,
"step": 4130
},
{
"epoch": 3.860877684407096,
"grad_norm": 2.3524234294891357,
"learning_rate": 3.854311619719084e-05,
"loss": 0.2307,
"num_input_tokens_seen": 533856,
"step": 4135
},
{
"epoch": 3.865546218487395,
"grad_norm": 9.083573341369629,
"learning_rate": 3.850885343485811e-05,
"loss": 0.5951,
"num_input_tokens_seen": 534448,
"step": 4140
},
{
"epoch": 3.8702147525676938,
"grad_norm": 4.580225467681885,
"learning_rate": 3.847455479733724e-05,
"loss": 0.3928,
"num_input_tokens_seen": 535072,
"step": 4145
},
{
"epoch": 3.8748832866479925,
"grad_norm": 7.7241106033325195,
"learning_rate": 3.844022037571443e-05,
"loss": 0.2974,
"num_input_tokens_seen": 535760,
"step": 4150
},
{
"epoch": 3.8795518207282913,
"grad_norm": 6.325448989868164,
"learning_rate": 3.840585026117093e-05,
"loss": 0.3938,
"num_input_tokens_seen": 536368,
"step": 4155
},
{
"epoch": 3.88422035480859,
"grad_norm": 1.6497693061828613,
"learning_rate": 3.837144454498272e-05,
"loss": 0.2775,
"num_input_tokens_seen": 537104,
"step": 4160
},
{
"epoch": 3.888888888888889,
"grad_norm": 4.049694538116455,
"learning_rate": 3.8337003318520394e-05,
"loss": 0.3838,
"num_input_tokens_seen": 537776,
"step": 4165
},
{
"epoch": 3.8935574229691876,
"grad_norm": 9.887384414672852,
"learning_rate": 3.8302526673248796e-05,
"loss": 0.3456,
"num_input_tokens_seen": 538384,
"step": 4170
},
{
"epoch": 3.8982259570494864,
"grad_norm": 5.9591779708862305,
"learning_rate": 3.8268014700726876e-05,
"loss": 0.3213,
"num_input_tokens_seen": 539120,
"step": 4175
},
{
"epoch": 3.902894491129785,
"grad_norm": 5.987460613250732,
"learning_rate": 3.8233467492607354e-05,
"loss": 0.5887,
"num_input_tokens_seen": 539792,
"step": 4180
},
{
"epoch": 3.907563025210084,
"grad_norm": 5.307256698608398,
"learning_rate": 3.819888514063658e-05,
"loss": 0.4946,
"num_input_tokens_seen": 540496,
"step": 4185
},
{
"epoch": 3.9122315592903827,
"grad_norm": 5.681676387786865,
"learning_rate": 3.8164267736654166e-05,
"loss": 0.3623,
"num_input_tokens_seen": 541088,
"step": 4190
},
{
"epoch": 3.9169000933706815,
"grad_norm": 6.065107822418213,
"learning_rate": 3.812961537259289e-05,
"loss": 0.5706,
"num_input_tokens_seen": 541856,
"step": 4195
},
{
"epoch": 3.9215686274509802,
"grad_norm": 4.150299549102783,
"learning_rate": 3.809492814047831e-05,
"loss": 0.3745,
"num_input_tokens_seen": 542544,
"step": 4200
},
{
"epoch": 3.9262371615312794,
"grad_norm": 3.3918418884277344,
"learning_rate": 3.80602061324286e-05,
"loss": 0.6235,
"num_input_tokens_seen": 543152,
"step": 4205
},
{
"epoch": 3.930905695611578,
"grad_norm": 3.0744729042053223,
"learning_rate": 3.802544944065431e-05,
"loss": 0.4412,
"num_input_tokens_seen": 543760,
"step": 4210
},
{
"epoch": 3.935574229691877,
"grad_norm": 5.920441627502441,
"learning_rate": 3.799065815745808e-05,
"loss": 0.3594,
"num_input_tokens_seen": 544368,
"step": 4215
},
{
"epoch": 3.9402427637721757,
"grad_norm": 0.5871037244796753,
"learning_rate": 3.7955832375234404e-05,
"loss": 0.2189,
"num_input_tokens_seen": 544928,
"step": 4220
},
{
"epoch": 3.9449112978524745,
"grad_norm": 1.7833948135375977,
"learning_rate": 3.7920972186469406e-05,
"loss": 0.5074,
"num_input_tokens_seen": 545648,
"step": 4225
},
{
"epoch": 3.9495798319327733,
"grad_norm": 2.337198257446289,
"learning_rate": 3.788607768374059e-05,
"loss": 0.2963,
"num_input_tokens_seen": 546288,
"step": 4230
},
{
"epoch": 3.954248366013072,
"grad_norm": 2.6441376209259033,
"learning_rate": 3.785114895971658e-05,
"loss": 0.6003,
"num_input_tokens_seen": 546976,
"step": 4235
},
{
"epoch": 3.958916900093371,
"grad_norm": 10.86312198638916,
"learning_rate": 3.781618610715687e-05,
"loss": 0.725,
"num_input_tokens_seen": 547600,
"step": 4240
},
{
"epoch": 3.9635854341736696,
"grad_norm": 7.368520259857178,
"learning_rate": 3.77811892189116e-05,
"loss": 0.5826,
"num_input_tokens_seen": 548240,
"step": 4245
},
{
"epoch": 3.9682539682539684,
"grad_norm": 7.850865840911865,
"learning_rate": 3.774615838792131e-05,
"loss": 0.3734,
"num_input_tokens_seen": 548880,
"step": 4250
},
{
"epoch": 3.972922502334267,
"grad_norm": 6.826239585876465,
"learning_rate": 3.771109370721666e-05,
"loss": 0.5284,
"num_input_tokens_seen": 549456,
"step": 4255
},
{
"epoch": 3.977591036414566,
"grad_norm": 8.290151596069336,
"learning_rate": 3.7675995269918205e-05,
"loss": 0.4612,
"num_input_tokens_seen": 550064,
"step": 4260
},
{
"epoch": 3.9822595704948647,
"grad_norm": 7.847808361053467,
"learning_rate": 3.764086316923616e-05,
"loss": 0.3022,
"num_input_tokens_seen": 550784,
"step": 4265
},
{
"epoch": 3.9869281045751634,
"grad_norm": 4.398458957672119,
"learning_rate": 3.760569749847013e-05,
"loss": 0.1849,
"num_input_tokens_seen": 551472,
"step": 4270
},
{
"epoch": 3.991596638655462,
"grad_norm": 3.6097769737243652,
"learning_rate": 3.757049835100888e-05,
"loss": 0.2915,
"num_input_tokens_seen": 552080,
"step": 4275
},
{
"epoch": 3.996265172735761,
"grad_norm": 5.2276930809021,
"learning_rate": 3.753526582033007e-05,
"loss": 0.295,
"num_input_tokens_seen": 552688,
"step": 4280
},
{
"epoch": 4.00093370681606,
"grad_norm": 3.192323923110962,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.1977,
"num_input_tokens_seen": 553248,
"step": 4285
},
{
"epoch": 4.003734827264239,
"eval_loss": 0.6991814374923706,
"eval_runtime": 3.8602,
"eval_samples_per_second": 61.654,
"eval_steps_per_second": 30.827,
"num_input_tokens_seen": 553632,
"step": 4288
},
{
"epoch": 4.0056022408963585,
"grad_norm": 2.7682571411132812,
"learning_rate": 3.7464700983673416e-05,
"loss": 0.1532,
"num_input_tokens_seen": 553840,
"step": 4290
},
{
"epoch": 4.010270774976657,
"grad_norm": 7.069489479064941,
"learning_rate": 3.74293688650932e-05,
"loss": 0.1893,
"num_input_tokens_seen": 554464,
"step": 4295
},
{
"epoch": 4.014939309056956,
"grad_norm": 1.4517050981521606,
"learning_rate": 3.739400373809012e-05,
"loss": 0.3711,
"num_input_tokens_seen": 555072,
"step": 4300
},
{
"epoch": 4.019607843137255,
"grad_norm": 1.9523597955703735,
"learning_rate": 3.735860569658265e-05,
"loss": 0.1403,
"num_input_tokens_seen": 555696,
"step": 4305
},
{
"epoch": 4.024276377217554,
"grad_norm": 2.3467447757720947,
"learning_rate": 3.7323174834576634e-05,
"loss": 0.2017,
"num_input_tokens_seen": 556352,
"step": 4310
},
{
"epoch": 4.028944911297852,
"grad_norm": 4.454637050628662,
"learning_rate": 3.728771124616511e-05,
"loss": 0.3451,
"num_input_tokens_seen": 556912,
"step": 4315
},
{
"epoch": 4.033613445378151,
"grad_norm": 12.304389953613281,
"learning_rate": 3.7252215025528004e-05,
"loss": 0.507,
"num_input_tokens_seen": 557488,
"step": 4320
},
{
"epoch": 4.03828197945845,
"grad_norm": 6.556013584136963,
"learning_rate": 3.72166862669319e-05,
"loss": 0.3884,
"num_input_tokens_seen": 558128,
"step": 4325
},
{
"epoch": 4.042950513538749,
"grad_norm": 5.730507850646973,
"learning_rate": 3.7181125064729815e-05,
"loss": 0.1663,
"num_input_tokens_seen": 558800,
"step": 4330
},
{
"epoch": 4.0476190476190474,
"grad_norm": 24.123844146728516,
"learning_rate": 3.714553151336091e-05,
"loss": 0.4761,
"num_input_tokens_seen": 559472,
"step": 4335
},
{
"epoch": 4.052287581699346,
"grad_norm": 4.613626480102539,
"learning_rate": 3.710990570735025e-05,
"loss": 0.1796,
"num_input_tokens_seen": 560080,
"step": 4340
},
{
"epoch": 4.056956115779645,
"grad_norm": 48.392879486083984,
"learning_rate": 3.707424774130858e-05,
"loss": 0.1985,
"num_input_tokens_seen": 560752,
"step": 4345
},
{
"epoch": 4.061624649859944,
"grad_norm": 4.906088829040527,
"learning_rate": 3.703855770993201e-05,
"loss": 0.2424,
"num_input_tokens_seen": 561504,
"step": 4350
},
{
"epoch": 4.0662931839402425,
"grad_norm": 6.213633060455322,
"learning_rate": 3.700283570800187e-05,
"loss": 0.4731,
"num_input_tokens_seen": 562080,
"step": 4355
},
{
"epoch": 4.070961718020541,
"grad_norm": 2.7080910205841064,
"learning_rate": 3.696708183038432e-05,
"loss": 0.2403,
"num_input_tokens_seen": 562752,
"step": 4360
},
{
"epoch": 4.07563025210084,
"grad_norm": 1.7861884832382202,
"learning_rate": 3.6931296172030236e-05,
"loss": 0.3308,
"num_input_tokens_seen": 563392,
"step": 4365
},
{
"epoch": 4.080298786181139,
"grad_norm": 2.173994779586792,
"learning_rate": 3.689547882797485e-05,
"loss": 0.3537,
"num_input_tokens_seen": 564080,
"step": 4370
},
{
"epoch": 4.084967320261438,
"grad_norm": 6.196120738983154,
"learning_rate": 3.6859629893337556e-05,
"loss": 0.286,
"num_input_tokens_seen": 564768,
"step": 4375
},
{
"epoch": 4.089635854341736,
"grad_norm": 7.646449089050293,
"learning_rate": 3.682374946332165e-05,
"loss": 0.4532,
"num_input_tokens_seen": 565392,
"step": 4380
},
{
"epoch": 4.094304388422035,
"grad_norm": 6.959907531738281,
"learning_rate": 3.6787837633214064e-05,
"loss": 0.2021,
"num_input_tokens_seen": 565984,
"step": 4385
},
{
"epoch": 4.098972922502334,
"grad_norm": 4.212170124053955,
"learning_rate": 3.67518944983851e-05,
"loss": 0.2379,
"num_input_tokens_seen": 566608,
"step": 4390
},
{
"epoch": 4.103641456582633,
"grad_norm": 4.175776481628418,
"learning_rate": 3.671592015428823e-05,
"loss": 0.2317,
"num_input_tokens_seen": 567232,
"step": 4395
},
{
"epoch": 4.1083099906629315,
"grad_norm": 5.702651500701904,
"learning_rate": 3.667991469645979e-05,
"loss": 0.2823,
"num_input_tokens_seen": 567888,
"step": 4400
},
{
"epoch": 4.11297852474323,
"grad_norm": 3.9328668117523193,
"learning_rate": 3.6643878220518736e-05,
"loss": 0.3263,
"num_input_tokens_seen": 568480,
"step": 4405
},
{
"epoch": 4.117647058823529,
"grad_norm": 10.232110023498535,
"learning_rate": 3.6607810822166404e-05,
"loss": 0.3833,
"num_input_tokens_seen": 569120,
"step": 4410
},
{
"epoch": 4.122315592903828,
"grad_norm": 4.055398464202881,
"learning_rate": 3.657171259718626e-05,
"loss": 0.3163,
"num_input_tokens_seen": 569696,
"step": 4415
},
{
"epoch": 4.1269841269841265,
"grad_norm": 3.281766414642334,
"learning_rate": 3.6535583641443634e-05,
"loss": 0.1429,
"num_input_tokens_seen": 570528,
"step": 4420
},
{
"epoch": 4.131652661064426,
"grad_norm": 16.788419723510742,
"learning_rate": 3.649942405088544e-05,
"loss": 0.6168,
"num_input_tokens_seen": 571168,
"step": 4425
},
{
"epoch": 4.136321195144725,
"grad_norm": 3.544717311859131,
"learning_rate": 3.646323392153999e-05,
"loss": 0.2535,
"num_input_tokens_seen": 571776,
"step": 4430
},
{
"epoch": 4.140989729225024,
"grad_norm": 5.109630107879639,
"learning_rate": 3.6427013349516664e-05,
"loss": 0.1899,
"num_input_tokens_seen": 572368,
"step": 4435
},
{
"epoch": 4.1456582633053225,
"grad_norm": 4.163717269897461,
"learning_rate": 3.639076243100571e-05,
"loss": 0.4238,
"num_input_tokens_seen": 573072,
"step": 4440
},
{
"epoch": 4.150326797385621,
"grad_norm": 13.520872116088867,
"learning_rate": 3.635448126227795e-05,
"loss": 0.4113,
"num_input_tokens_seen": 573680,
"step": 4445
},
{
"epoch": 4.15499533146592,
"grad_norm": 3.852015733718872,
"learning_rate": 3.631816993968455e-05,
"loss": 0.2477,
"num_input_tokens_seen": 574352,
"step": 4450
},
{
"epoch": 4.159663865546219,
"grad_norm": 26.530088424682617,
"learning_rate": 3.628182855965676e-05,
"loss": 0.1845,
"num_input_tokens_seen": 575056,
"step": 4455
},
{
"epoch": 4.164332399626518,
"grad_norm": 1.7594077587127686,
"learning_rate": 3.624545721870563e-05,
"loss": 0.338,
"num_input_tokens_seen": 575792,
"step": 4460
},
{
"epoch": 4.169000933706816,
"grad_norm": 1.8824595212936401,
"learning_rate": 3.6209056013421805e-05,
"loss": 0.2513,
"num_input_tokens_seen": 576432,
"step": 4465
},
{
"epoch": 4.173669467787115,
"grad_norm": 7.08219575881958,
"learning_rate": 3.617262504047523e-05,
"loss": 0.3619,
"num_input_tokens_seen": 577024,
"step": 4470
},
{
"epoch": 4.178338001867414,
"grad_norm": 1.5463865995407104,
"learning_rate": 3.613616439661489e-05,
"loss": 0.3355,
"num_input_tokens_seen": 577632,
"step": 4475
},
{
"epoch": 4.183006535947713,
"grad_norm": 4.219307899475098,
"learning_rate": 3.6099674178668595e-05,
"loss": 0.1978,
"num_input_tokens_seen": 578224,
"step": 4480
},
{
"epoch": 4.187675070028011,
"grad_norm": 24.176820755004883,
"learning_rate": 3.606315448354265e-05,
"loss": 0.517,
"num_input_tokens_seen": 578816,
"step": 4485
},
{
"epoch": 4.19234360410831,
"grad_norm": 9.95002555847168,
"learning_rate": 3.60266054082217e-05,
"loss": 0.3464,
"num_input_tokens_seen": 579472,
"step": 4490
},
{
"epoch": 4.197012138188609,
"grad_norm": 4.038425922393799,
"learning_rate": 3.599002704976835e-05,
"loss": 0.2076,
"num_input_tokens_seen": 580032,
"step": 4495
},
{
"epoch": 4.201680672268908,
"grad_norm": 1.5917898416519165,
"learning_rate": 3.595341950532304e-05,
"loss": 0.3451,
"num_input_tokens_seen": 580624,
"step": 4500
},
{
"epoch": 4.2063492063492065,
"grad_norm": 4.347362995147705,
"learning_rate": 3.591678287210366e-05,
"loss": 0.2847,
"num_input_tokens_seen": 581312,
"step": 4505
},
{
"epoch": 4.211017740429505,
"grad_norm": 2.4160451889038086,
"learning_rate": 3.588011724740537e-05,
"loss": 0.2936,
"num_input_tokens_seen": 581984,
"step": 4510
},
{
"epoch": 4.215686274509804,
"grad_norm": 3.665546417236328,
"learning_rate": 3.584342272860034e-05,
"loss": 0.0873,
"num_input_tokens_seen": 582720,
"step": 4515
},
{
"epoch": 4.220354808590103,
"grad_norm": 5.41824197769165,
"learning_rate": 3.580669941313746e-05,
"loss": 0.3641,
"num_input_tokens_seen": 583360,
"step": 4520
},
{
"epoch": 4.225023342670402,
"grad_norm": 7.571991920471191,
"learning_rate": 3.5769947398542086e-05,
"loss": 0.4328,
"num_input_tokens_seen": 584032,
"step": 4525
},
{
"epoch": 4.2296918767507,
"grad_norm": 7.419412136077881,
"learning_rate": 3.5733166782415805e-05,
"loss": 0.435,
"num_input_tokens_seen": 584672,
"step": 4530
},
{
"epoch": 4.234360410830999,
"grad_norm": 3.60376238822937,
"learning_rate": 3.5696357662436145e-05,
"loss": 0.3029,
"num_input_tokens_seen": 585296,
"step": 4535
},
{
"epoch": 4.239028944911298,
"grad_norm": 0.17386208474636078,
"learning_rate": 3.565952013635635e-05,
"loss": 0.3092,
"num_input_tokens_seen": 585984,
"step": 4540
},
{
"epoch": 4.243697478991597,
"grad_norm": 0.5566981434822083,
"learning_rate": 3.562265430200508e-05,
"loss": 0.3977,
"num_input_tokens_seen": 586624,
"step": 4545
},
{
"epoch": 4.248366013071895,
"grad_norm": 7.640924453735352,
"learning_rate": 3.558576025728619e-05,
"loss": 0.2716,
"num_input_tokens_seen": 587216,
"step": 4550
},
{
"epoch": 4.253034547152194,
"grad_norm": 4.149720668792725,
"learning_rate": 3.554883810017844e-05,
"loss": 0.3383,
"num_input_tokens_seen": 587920,
"step": 4555
},
{
"epoch": 4.257703081232493,
"grad_norm": 12.189506530761719,
"learning_rate": 3.551188792873527e-05,
"loss": 0.4403,
"num_input_tokens_seen": 588528,
"step": 4560
},
{
"epoch": 4.262371615312792,
"grad_norm": 5.4155426025390625,
"learning_rate": 3.547490984108449e-05,
"loss": 0.5473,
"num_input_tokens_seen": 589136,
"step": 4565
},
{
"epoch": 4.2670401493930905,
"grad_norm": 5.073611736297607,
"learning_rate": 3.543790393542805e-05,
"loss": 0.325,
"num_input_tokens_seen": 589744,
"step": 4570
},
{
"epoch": 4.271708683473389,
"grad_norm": 8.7191801071167,
"learning_rate": 3.5400870310041794e-05,
"loss": 0.3047,
"num_input_tokens_seen": 590368,
"step": 4575
},
{
"epoch": 4.276377217553688,
"grad_norm": 7.070326328277588,
"learning_rate": 3.536380906327516e-05,
"loss": 0.2448,
"num_input_tokens_seen": 590992,
"step": 4580
},
{
"epoch": 4.281045751633987,
"grad_norm": 6.877079963684082,
"learning_rate": 3.532672029355097e-05,
"loss": 0.3595,
"num_input_tokens_seen": 591744,
"step": 4585
},
{
"epoch": 4.285714285714286,
"grad_norm": 2.179201364517212,
"learning_rate": 3.5289604099365096e-05,
"loss": 0.123,
"num_input_tokens_seen": 592368,
"step": 4590
},
{
"epoch": 4.290382819794584,
"grad_norm": 3.5357630252838135,
"learning_rate": 3.525246057928627e-05,
"loss": 0.1877,
"num_input_tokens_seen": 593056,
"step": 4595
},
{
"epoch": 4.295051353874883,
"grad_norm": 4.004199028015137,
"learning_rate": 3.5215289831955786e-05,
"loss": 0.3331,
"num_input_tokens_seen": 593632,
"step": 4600
},
{
"epoch": 4.299719887955182,
"grad_norm": 5.568762302398682,
"learning_rate": 3.517809195608725e-05,
"loss": 0.5129,
"num_input_tokens_seen": 594272,
"step": 4605
},
{
"epoch": 4.304388422035481,
"grad_norm": 17.591794967651367,
"learning_rate": 3.5140867050466295e-05,
"loss": 0.2957,
"num_input_tokens_seen": 594960,
"step": 4610
},
{
"epoch": 4.309056956115779,
"grad_norm": 10.526442527770996,
"learning_rate": 3.510361521395035e-05,
"loss": 0.2965,
"num_input_tokens_seen": 595680,
"step": 4615
},
{
"epoch": 4.313725490196078,
"grad_norm": 5.713769435882568,
"learning_rate": 3.506633654546837e-05,
"loss": 0.2579,
"num_input_tokens_seen": 596320,
"step": 4620
},
{
"epoch": 4.318394024276377,
"grad_norm": 3.8671867847442627,
"learning_rate": 3.502903114402055e-05,
"loss": 0.1889,
"num_input_tokens_seen": 596976,
"step": 4625
},
{
"epoch": 4.323062558356676,
"grad_norm": 2.351252794265747,
"learning_rate": 3.499169910867809e-05,
"loss": 0.1431,
"num_input_tokens_seen": 597648,
"step": 4630
},
{
"epoch": 4.3277310924369745,
"grad_norm": 8.63907527923584,
"learning_rate": 3.4954340538582926e-05,
"loss": 0.5485,
"num_input_tokens_seen": 598336,
"step": 4635
},
{
"epoch": 4.332399626517273,
"grad_norm": 4.3061299324035645,
"learning_rate": 3.491695553294745e-05,
"loss": 0.3966,
"num_input_tokens_seen": 598928,
"step": 4640
},
{
"epoch": 4.337068160597572,
"grad_norm": 5.864599227905273,
"learning_rate": 3.487954419105426e-05,
"loss": 0.4937,
"num_input_tokens_seen": 599600,
"step": 4645
},
{
"epoch": 4.341736694677871,
"grad_norm": 5.271458148956299,
"learning_rate": 3.484210661225591e-05,
"loss": 0.4097,
"num_input_tokens_seen": 600176,
"step": 4650
},
{
"epoch": 4.34640522875817,
"grad_norm": 5.326613426208496,
"learning_rate": 3.4804642895974596e-05,
"loss": 0.244,
"num_input_tokens_seen": 600896,
"step": 4655
},
{
"epoch": 4.351073762838468,
"grad_norm": 4.950856685638428,
"learning_rate": 3.476715314170198e-05,
"loss": 0.414,
"num_input_tokens_seen": 601472,
"step": 4660
},
{
"epoch": 4.355742296918767,
"grad_norm": 11.154131889343262,
"learning_rate": 3.472963744899882e-05,
"loss": 0.4395,
"num_input_tokens_seen": 602224,
"step": 4665
},
{
"epoch": 4.360410830999066,
"grad_norm": 10.789007186889648,
"learning_rate": 3.4692095917494785e-05,
"loss": 0.4422,
"num_input_tokens_seen": 602848,
"step": 4670
},
{
"epoch": 4.365079365079365,
"grad_norm": 3.9653983116149902,
"learning_rate": 3.465452864688816e-05,
"loss": 0.4116,
"num_input_tokens_seen": 603392,
"step": 4675
},
{
"epoch": 4.369747899159664,
"grad_norm": 3.1383543014526367,
"learning_rate": 3.461693573694558e-05,
"loss": 0.3692,
"num_input_tokens_seen": 604032,
"step": 4680
},
{
"epoch": 4.374416433239963,
"grad_norm": 1.6144094467163086,
"learning_rate": 3.457931728750179e-05,
"loss": 0.2329,
"num_input_tokens_seen": 604688,
"step": 4685
},
{
"epoch": 4.379084967320262,
"grad_norm": 7.495258331298828,
"learning_rate": 3.4541673398459316e-05,
"loss": 0.332,
"num_input_tokens_seen": 605296,
"step": 4690
},
{
"epoch": 4.383753501400561,
"grad_norm": 2.3249220848083496,
"learning_rate": 3.4504004169788294e-05,
"loss": 0.1417,
"num_input_tokens_seen": 605968,
"step": 4695
},
{
"epoch": 4.388422035480859,
"grad_norm": 4.117162227630615,
"learning_rate": 3.446630970152612e-05,
"loss": 0.2862,
"num_input_tokens_seen": 606704,
"step": 4700
},
{
"epoch": 4.393090569561158,
"grad_norm": 3.2297849655151367,
"learning_rate": 3.4428590093777244e-05,
"loss": 0.3799,
"num_input_tokens_seen": 607392,
"step": 4705
},
{
"epoch": 4.397759103641457,
"grad_norm": 2.561896800994873,
"learning_rate": 3.4390845446712836e-05,
"loss": 0.0899,
"num_input_tokens_seen": 608080,
"step": 4710
},
{
"epoch": 4.402427637721756,
"grad_norm": 2.327104091644287,
"learning_rate": 3.4353075860570614e-05,
"loss": 0.1184,
"num_input_tokens_seen": 608768,
"step": 4715
},
{
"epoch": 4.4070961718020545,
"grad_norm": 3.8246302604675293,
"learning_rate": 3.4315281435654484e-05,
"loss": 0.5739,
"num_input_tokens_seen": 609472,
"step": 4720
},
{
"epoch": 4.411764705882353,
"grad_norm": 8.933488845825195,
"learning_rate": 3.427746227233436e-05,
"loss": 0.7091,
"num_input_tokens_seen": 610144,
"step": 4725
},
{
"epoch": 4.416433239962652,
"grad_norm": 4.204476356506348,
"learning_rate": 3.4239618471045795e-05,
"loss": 0.2344,
"num_input_tokens_seen": 610704,
"step": 4730
},
{
"epoch": 4.421101774042951,
"grad_norm": 1.9256473779678345,
"learning_rate": 3.420175013228982e-05,
"loss": 0.2514,
"num_input_tokens_seen": 611408,
"step": 4735
},
{
"epoch": 4.42577030812325,
"grad_norm": 5.400211811065674,
"learning_rate": 3.416385735663262e-05,
"loss": 0.3381,
"num_input_tokens_seen": 612096,
"step": 4740
},
{
"epoch": 4.430438842203548,
"grad_norm": 5.82473611831665,
"learning_rate": 3.412594024470526e-05,
"loss": 0.6267,
"num_input_tokens_seen": 612656,
"step": 4745
},
{
"epoch": 4.435107376283847,
"grad_norm": 3.858933687210083,
"learning_rate": 3.408799889720345e-05,
"loss": 0.2105,
"num_input_tokens_seen": 613536,
"step": 4750
},
{
"epoch": 4.439775910364146,
"grad_norm": 2.8734130859375,
"learning_rate": 3.405003341488726e-05,
"loss": 0.3456,
"num_input_tokens_seen": 614112,
"step": 4755
},
{
"epoch": 4.444444444444445,
"grad_norm": 2.022538900375366,
"learning_rate": 3.401204389858085e-05,
"loss": 0.1767,
"num_input_tokens_seen": 614832,
"step": 4760
},
{
"epoch": 4.449112978524743,
"grad_norm": 1.868141531944275,
"learning_rate": 3.3974030449172206e-05,
"loss": 0.2623,
"num_input_tokens_seen": 615456,
"step": 4765
},
{
"epoch": 4.453781512605042,
"grad_norm": 9.049056053161621,
"learning_rate": 3.393599316761288e-05,
"loss": 0.5222,
"num_input_tokens_seen": 616048,
"step": 4770
},
{
"epoch": 4.458450046685341,
"grad_norm": 2.4421088695526123,
"learning_rate": 3.389793215491769e-05,
"loss": 0.2303,
"num_input_tokens_seen": 616704,
"step": 4775
},
{
"epoch": 4.46311858076564,
"grad_norm": 4.034763813018799,
"learning_rate": 3.385984751216452e-05,
"loss": 0.2149,
"num_input_tokens_seen": 617424,
"step": 4780
},
{
"epoch": 4.4677871148459385,
"grad_norm": 1.4706504344940186,
"learning_rate": 3.382173934049397e-05,
"loss": 0.2602,
"num_input_tokens_seen": 618176,
"step": 4785
},
{
"epoch": 4.472455648926237,
"grad_norm": 7.432245254516602,
"learning_rate": 3.378360774110916e-05,
"loss": 0.473,
"num_input_tokens_seen": 618768,
"step": 4790
},
{
"epoch": 4.477124183006536,
"grad_norm": 7.518472671508789,
"learning_rate": 3.374545281527538e-05,
"loss": 0.1864,
"num_input_tokens_seen": 619520,
"step": 4795
},
{
"epoch": 4.481792717086835,
"grad_norm": 10.482436180114746,
"learning_rate": 3.370727466431989e-05,
"loss": 0.4683,
"num_input_tokens_seen": 620192,
"step": 4800
},
{
"epoch": 4.486461251167134,
"grad_norm": 2.4239566326141357,
"learning_rate": 3.3669073389631644e-05,
"loss": 0.2869,
"num_input_tokens_seen": 620800,
"step": 4805
},
{
"epoch": 4.491129785247432,
"grad_norm": 2.9639928340911865,
"learning_rate": 3.3630849092661e-05,
"loss": 0.2356,
"num_input_tokens_seen": 621440,
"step": 4810
},
{
"epoch": 4.495798319327731,
"grad_norm": 3.342315435409546,
"learning_rate": 3.359260187491943e-05,
"loss": 0.289,
"num_input_tokens_seen": 622080,
"step": 4815
},
{
"epoch": 4.50046685340803,
"grad_norm": 9.140509605407715,
"learning_rate": 3.3554331837979307e-05,
"loss": 0.2292,
"num_input_tokens_seen": 622752,
"step": 4820
},
{
"epoch": 4.504201680672269,
"eval_loss": 0.7412708401679993,
"eval_runtime": 3.8742,
"eval_samples_per_second": 61.431,
"eval_steps_per_second": 30.716,
"num_input_tokens_seen": 623280,
"step": 4824
},
{
"epoch": 4.505135387488329,
"grad_norm": 1.1927273273468018,
"learning_rate": 3.3516039083473595e-05,
"loss": 0.4,
"num_input_tokens_seen": 623392,
"step": 4825
},
{
"epoch": 4.509803921568627,
"grad_norm": 3.2595295906066895,
"learning_rate": 3.347772371309557e-05,
"loss": 0.3292,
"num_input_tokens_seen": 624016,
"step": 4830
},
{
"epoch": 4.514472455648926,
"grad_norm": 8.653651237487793,
"learning_rate": 3.34393858285986e-05,
"loss": 0.4914,
"num_input_tokens_seen": 624752,
"step": 4835
},
{
"epoch": 4.519140989729225,
"grad_norm": 5.411193370819092,
"learning_rate": 3.340102553179581e-05,
"loss": 0.4863,
"num_input_tokens_seen": 625440,
"step": 4840
},
{
"epoch": 4.523809523809524,
"grad_norm": 6.883440017700195,
"learning_rate": 3.336264292455989e-05,
"loss": 0.3992,
"num_input_tokens_seen": 625984,
"step": 4845
},
{
"epoch": 4.5284780578898225,
"grad_norm": 4.110279083251953,
"learning_rate": 3.3324238108822726e-05,
"loss": 0.3535,
"num_input_tokens_seen": 626608,
"step": 4850
},
{
"epoch": 4.533146591970121,
"grad_norm": 2.576063394546509,
"learning_rate": 3.328581118657522e-05,
"loss": 0.2165,
"num_input_tokens_seen": 627280,
"step": 4855
},
{
"epoch": 4.53781512605042,
"grad_norm": 0.18222852051258087,
"learning_rate": 3.3247362259866956e-05,
"loss": 0.1554,
"num_input_tokens_seen": 627920,
"step": 4860
},
{
"epoch": 4.542483660130719,
"grad_norm": 10.023906707763672,
"learning_rate": 3.3208891430805994e-05,
"loss": 0.409,
"num_input_tokens_seen": 628496,
"step": 4865
},
{
"epoch": 4.547152194211018,
"grad_norm": 4.180869102478027,
"learning_rate": 3.317039880155852e-05,
"loss": 0.2893,
"num_input_tokens_seen": 629216,
"step": 4870
},
{
"epoch": 4.551820728291316,
"grad_norm": 9.3419189453125,
"learning_rate": 3.313188447434862e-05,
"loss": 0.1579,
"num_input_tokens_seen": 629904,
"step": 4875
},
{
"epoch": 4.556489262371615,
"grad_norm": 2.9429197311401367,
"learning_rate": 3.309334855145803e-05,
"loss": 0.1973,
"num_input_tokens_seen": 630544,
"step": 4880
},
{
"epoch": 4.561157796451914,
"grad_norm": 10.754371643066406,
"learning_rate": 3.3054791135225804e-05,
"loss": 0.371,
"num_input_tokens_seen": 631120,
"step": 4885
},
{
"epoch": 4.565826330532213,
"grad_norm": 6.133115291595459,
"learning_rate": 3.30162123280481e-05,
"loss": 0.3092,
"num_input_tokens_seen": 631760,
"step": 4890
},
{
"epoch": 4.570494864612511,
"grad_norm": 2.5016603469848633,
"learning_rate": 3.297761223237788e-05,
"loss": 0.2878,
"num_input_tokens_seen": 632384,
"step": 4895
},
{
"epoch": 4.57516339869281,
"grad_norm": 5.324434757232666,
"learning_rate": 3.293899095072461e-05,
"loss": 0.3063,
"num_input_tokens_seen": 633024,
"step": 4900
},
{
"epoch": 4.579831932773109,
"grad_norm": 8.173105239868164,
"learning_rate": 3.2900348585654076e-05,
"loss": 0.2217,
"num_input_tokens_seen": 633648,
"step": 4905
},
{
"epoch": 4.584500466853408,
"grad_norm": 2.823427677154541,
"learning_rate": 3.286168523978801e-05,
"loss": 0.3353,
"num_input_tokens_seen": 634272,
"step": 4910
},
{
"epoch": 4.5891690009337065,
"grad_norm": 16.714040756225586,
"learning_rate": 3.282300101580386e-05,
"loss": 0.1903,
"num_input_tokens_seen": 634976,
"step": 4915
},
{
"epoch": 4.593837535014005,
"grad_norm": 3.048602819442749,
"learning_rate": 3.278429601643456e-05,
"loss": 0.3076,
"num_input_tokens_seen": 635584,
"step": 4920
},
{
"epoch": 4.598506069094304,
"grad_norm": 7.008090972900391,
"learning_rate": 3.2745570344468166e-05,
"loss": 0.2682,
"num_input_tokens_seen": 636208,
"step": 4925
},
{
"epoch": 4.603174603174603,
"grad_norm": 4.961944103240967,
"learning_rate": 3.2706824102747694e-05,
"loss": 0.1542,
"num_input_tokens_seen": 636864,
"step": 4930
},
{
"epoch": 4.607843137254902,
"grad_norm": 8.771209716796875,
"learning_rate": 3.266805739417073e-05,
"loss": 0.5569,
"num_input_tokens_seen": 637536,
"step": 4935
},
{
"epoch": 4.6125116713352,
"grad_norm": 1.5831955671310425,
"learning_rate": 3.262927032168923e-05,
"loss": 0.3309,
"num_input_tokens_seen": 638176,
"step": 4940
},
{
"epoch": 4.617180205415499,
"grad_norm": 12.607647895812988,
"learning_rate": 3.259046298830924e-05,
"loss": 0.2684,
"num_input_tokens_seen": 638848,
"step": 4945
},
{
"epoch": 4.621848739495798,
"grad_norm": 7.591634750366211,
"learning_rate": 3.255163549709063e-05,
"loss": 0.3422,
"num_input_tokens_seen": 639520,
"step": 4950
},
{
"epoch": 4.626517273576097,
"grad_norm": 3.3472375869750977,
"learning_rate": 3.251278795114676e-05,
"loss": 0.236,
"num_input_tokens_seen": 640144,
"step": 4955
},
{
"epoch": 4.631185807656395,
"grad_norm": 6.409518241882324,
"learning_rate": 3.247392045364426e-05,
"loss": 0.2417,
"num_input_tokens_seen": 640800,
"step": 4960
},
{
"epoch": 4.635854341736694,
"grad_norm": 7.663181304931641,
"learning_rate": 3.243503310780278e-05,
"loss": 0.2252,
"num_input_tokens_seen": 641488,
"step": 4965
},
{
"epoch": 4.640522875816993,
"grad_norm": 4.352063179016113,
"learning_rate": 3.2396126016894646e-05,
"loss": 0.6517,
"num_input_tokens_seen": 642080,
"step": 4970
},
{
"epoch": 4.645191409897293,
"grad_norm": 3.6530802249908447,
"learning_rate": 3.2357199284244626e-05,
"loss": 0.1837,
"num_input_tokens_seen": 642752,
"step": 4975
},
{
"epoch": 4.649859943977591,
"grad_norm": 9.228134155273438,
"learning_rate": 3.231825301322966e-05,
"loss": 0.2263,
"num_input_tokens_seen": 643264,
"step": 4980
},
{
"epoch": 4.65452847805789,
"grad_norm": 4.053821086883545,
"learning_rate": 3.227928730727857e-05,
"loss": 0.3454,
"num_input_tokens_seen": 643984,
"step": 4985
},
{
"epoch": 4.659197012138189,
"grad_norm": 7.429311275482178,
"learning_rate": 3.224030226987179e-05,
"loss": 0.311,
"num_input_tokens_seen": 644640,
"step": 4990
},
{
"epoch": 4.663865546218488,
"grad_norm": 4.688385963439941,
"learning_rate": 3.220129800454108e-05,
"loss": 0.5501,
"num_input_tokens_seen": 645280,
"step": 4995
},
{
"epoch": 4.6685340802987865,
"grad_norm": 4.372518539428711,
"learning_rate": 3.21622746148693e-05,
"loss": 0.2034,
"num_input_tokens_seen": 645888,
"step": 5000
},
{
"epoch": 4.673202614379085,
"grad_norm": 6.792498588562012,
"learning_rate": 3.212323220449006e-05,
"loss": 0.2641,
"num_input_tokens_seen": 646560,
"step": 5005
},
{
"epoch": 4.677871148459384,
"grad_norm": 6.591304779052734,
"learning_rate": 3.2084170877087504e-05,
"loss": 0.2674,
"num_input_tokens_seen": 647120,
"step": 5010
},
{
"epoch": 4.682539682539683,
"grad_norm": 2.276726007461548,
"learning_rate": 3.2045090736396006e-05,
"loss": 0.265,
"num_input_tokens_seen": 647696,
"step": 5015
},
{
"epoch": 4.6872082166199815,
"grad_norm": 7.44296407699585,
"learning_rate": 3.200599188619989e-05,
"loss": 0.1148,
"num_input_tokens_seen": 648464,
"step": 5020
},
{
"epoch": 4.69187675070028,
"grad_norm": 6.232602596282959,
"learning_rate": 3.196687443033321e-05,
"loss": 0.3454,
"num_input_tokens_seen": 649008,
"step": 5025
},
{
"epoch": 4.696545284780579,
"grad_norm": 6.96764612197876,
"learning_rate": 3.192773847267937e-05,
"loss": 0.6691,
"num_input_tokens_seen": 649568,
"step": 5030
},
{
"epoch": 4.701213818860878,
"grad_norm": 19.437314987182617,
"learning_rate": 3.188858411717095e-05,
"loss": 0.4101,
"num_input_tokens_seen": 650224,
"step": 5035
},
{
"epoch": 4.705882352941177,
"grad_norm": 1.8838727474212646,
"learning_rate": 3.184941146778938e-05,
"loss": 0.3288,
"num_input_tokens_seen": 651152,
"step": 5040
},
{
"epoch": 4.710550887021475,
"grad_norm": 2.7591898441314697,
"learning_rate": 3.181022062856466e-05,
"loss": 0.1966,
"num_input_tokens_seen": 651712,
"step": 5045
},
{
"epoch": 4.715219421101774,
"grad_norm": 0.3163776397705078,
"learning_rate": 3.177101170357513e-05,
"loss": 0.2892,
"num_input_tokens_seen": 652384,
"step": 5050
},
{
"epoch": 4.719887955182073,
"grad_norm": 8.201074600219727,
"learning_rate": 3.173178479694712e-05,
"loss": 0.3812,
"num_input_tokens_seen": 653072,
"step": 5055
},
{
"epoch": 4.724556489262372,
"grad_norm": 2.460468292236328,
"learning_rate": 3.1692540012854726e-05,
"loss": 0.2103,
"num_input_tokens_seen": 653712,
"step": 5060
},
{
"epoch": 4.7292250233426705,
"grad_norm": 3.513843297958374,
"learning_rate": 3.165327745551954e-05,
"loss": 0.4444,
"num_input_tokens_seen": 654368,
"step": 5065
},
{
"epoch": 4.733893557422969,
"grad_norm": 4.301528453826904,
"learning_rate": 3.161399722921033e-05,
"loss": 0.5824,
"num_input_tokens_seen": 655040,
"step": 5070
},
{
"epoch": 4.738562091503268,
"grad_norm": 4.474994659423828,
"learning_rate": 3.1574699438242804e-05,
"loss": 0.4809,
"num_input_tokens_seen": 655600,
"step": 5075
},
{
"epoch": 4.743230625583567,
"grad_norm": 4.316887378692627,
"learning_rate": 3.15353841869793e-05,
"loss": 0.1618,
"num_input_tokens_seen": 656224,
"step": 5080
},
{
"epoch": 4.7478991596638656,
"grad_norm": 7.845536708831787,
"learning_rate": 3.149605157982852e-05,
"loss": 0.5412,
"num_input_tokens_seen": 656832,
"step": 5085
},
{
"epoch": 4.752567693744164,
"grad_norm": 9.774452209472656,
"learning_rate": 3.1456701721245305e-05,
"loss": 0.3414,
"num_input_tokens_seen": 657440,
"step": 5090
},
{
"epoch": 4.757236227824463,
"grad_norm": 3.0468742847442627,
"learning_rate": 3.1417334715730265e-05,
"loss": 0.2242,
"num_input_tokens_seen": 658032,
"step": 5095
},
{
"epoch": 4.761904761904762,
"grad_norm": 9.727741241455078,
"learning_rate": 3.137795066782954e-05,
"loss": 0.339,
"num_input_tokens_seen": 658704,
"step": 5100
},
{
"epoch": 4.766573295985061,
"grad_norm": 14.910123825073242,
"learning_rate": 3.1338549682134564e-05,
"loss": 0.3962,
"num_input_tokens_seen": 659328,
"step": 5105
},
{
"epoch": 4.771241830065359,
"grad_norm": 13.144892692565918,
"learning_rate": 3.1299131863281734e-05,
"loss": 0.5179,
"num_input_tokens_seen": 659968,
"step": 5110
},
{
"epoch": 4.775910364145658,
"grad_norm": 4.052852630615234,
"learning_rate": 3.125969731595215e-05,
"loss": 0.3517,
"num_input_tokens_seen": 660608,
"step": 5115
},
{
"epoch": 4.780578898225957,
"grad_norm": 2.2380478382110596,
"learning_rate": 3.1220246144871334e-05,
"loss": 0.1959,
"num_input_tokens_seen": 661184,
"step": 5120
},
{
"epoch": 4.785247432306256,
"grad_norm": 3.6989505290985107,
"learning_rate": 3.118077845480897e-05,
"loss": 0.2745,
"num_input_tokens_seen": 661808,
"step": 5125
},
{
"epoch": 4.7899159663865545,
"grad_norm": 3.7099645137786865,
"learning_rate": 3.11412943505786e-05,
"loss": 0.1971,
"num_input_tokens_seen": 662576,
"step": 5130
},
{
"epoch": 4.794584500466853,
"grad_norm": 8.147356986999512,
"learning_rate": 3.110179393703737e-05,
"loss": 0.3137,
"num_input_tokens_seen": 663264,
"step": 5135
},
{
"epoch": 4.799253034547152,
"grad_norm": 3.5122320652008057,
"learning_rate": 3.106227731908569e-05,
"loss": 0.168,
"num_input_tokens_seen": 663888,
"step": 5140
},
{
"epoch": 4.803921568627451,
"grad_norm": 1.2344309091567993,
"learning_rate": 3.1022744601667076e-05,
"loss": 0.2979,
"num_input_tokens_seen": 664512,
"step": 5145
},
{
"epoch": 4.80859010270775,
"grad_norm": 2.26531720161438,
"learning_rate": 3.0983195889767756e-05,
"loss": 0.5175,
"num_input_tokens_seen": 665040,
"step": 5150
},
{
"epoch": 4.813258636788048,
"grad_norm": 5.6935930252075195,
"learning_rate": 3.0943631288416444e-05,
"loss": 0.4906,
"num_input_tokens_seen": 665696,
"step": 5155
},
{
"epoch": 4.817927170868347,
"grad_norm": 6.8716912269592285,
"learning_rate": 3.0904050902684046e-05,
"loss": 0.4338,
"num_input_tokens_seen": 666320,
"step": 5160
},
{
"epoch": 4.822595704948646,
"grad_norm": 3.8555526733398438,
"learning_rate": 3.086445483768338e-05,
"loss": 0.3313,
"num_input_tokens_seen": 666944,
"step": 5165
},
{
"epoch": 4.827264239028945,
"grad_norm": 10.221288681030273,
"learning_rate": 3.082484319856893e-05,
"loss": 0.3209,
"num_input_tokens_seen": 667504,
"step": 5170
},
{
"epoch": 4.831932773109243,
"grad_norm": 4.305709362030029,
"learning_rate": 3.0785216090536514e-05,
"loss": 0.3736,
"num_input_tokens_seen": 668176,
"step": 5175
},
{
"epoch": 4.836601307189542,
"grad_norm": 3.446392774581909,
"learning_rate": 3.0745573618823046e-05,
"loss": 0.3752,
"num_input_tokens_seen": 668784,
"step": 5180
},
{
"epoch": 4.841269841269841,
"grad_norm": 2.5847973823547363,
"learning_rate": 3.070591588870622e-05,
"loss": 0.3039,
"num_input_tokens_seen": 669488,
"step": 5185
},
{
"epoch": 4.84593837535014,
"grad_norm": 17.622394561767578,
"learning_rate": 3.066624300550427e-05,
"loss": 0.8055,
"num_input_tokens_seen": 670112,
"step": 5190
},
{
"epoch": 4.8506069094304385,
"grad_norm": 4.8430705070495605,
"learning_rate": 3.062655507457569e-05,
"loss": 0.1342,
"num_input_tokens_seen": 670848,
"step": 5195
},
{
"epoch": 4.855275443510737,
"grad_norm": 4.713992118835449,
"learning_rate": 3.058685220131888e-05,
"loss": 0.4015,
"num_input_tokens_seen": 671648,
"step": 5200
},
{
"epoch": 4.859943977591037,
"grad_norm": 6.5015692710876465,
"learning_rate": 3.054713449117197e-05,
"loss": 0.3398,
"num_input_tokens_seen": 672352,
"step": 5205
},
{
"epoch": 4.864612511671336,
"grad_norm": 1.2088795900344849,
"learning_rate": 3.0507402049612482e-05,
"loss": 0.2334,
"num_input_tokens_seen": 672976,
"step": 5210
},
{
"epoch": 4.8692810457516345,
"grad_norm": 4.3346710205078125,
"learning_rate": 3.046765498215705e-05,
"loss": 0.3382,
"num_input_tokens_seen": 673504,
"step": 5215
},
{
"epoch": 4.873949579831933,
"grad_norm": 6.621752738952637,
"learning_rate": 3.042789339436116e-05,
"loss": 0.1954,
"num_input_tokens_seen": 674048,
"step": 5220
},
{
"epoch": 4.878618113912232,
"grad_norm": 9.04273796081543,
"learning_rate": 3.038811739181885e-05,
"loss": 0.3084,
"num_input_tokens_seen": 674688,
"step": 5225
},
{
"epoch": 4.883286647992531,
"grad_norm": 3.0526490211486816,
"learning_rate": 3.0348327080162435e-05,
"loss": 0.4238,
"num_input_tokens_seen": 675344,
"step": 5230
},
{
"epoch": 4.8879551820728295,
"grad_norm": 6.297845363616943,
"learning_rate": 3.0308522565062265e-05,
"loss": 0.2566,
"num_input_tokens_seen": 676064,
"step": 5235
},
{
"epoch": 4.892623716153128,
"grad_norm": 3.6078665256500244,
"learning_rate": 3.026870395222635e-05,
"loss": 0.2171,
"num_input_tokens_seen": 676704,
"step": 5240
},
{
"epoch": 4.897292250233427,
"grad_norm": 7.2723236083984375,
"learning_rate": 3.0228871347400194e-05,
"loss": 0.3802,
"num_input_tokens_seen": 677408,
"step": 5245
},
{
"epoch": 4.901960784313726,
"grad_norm": 16.52984046936035,
"learning_rate": 3.018902485636643e-05,
"loss": 0.437,
"num_input_tokens_seen": 677936,
"step": 5250
},
{
"epoch": 4.906629318394025,
"grad_norm": 7.225020885467529,
"learning_rate": 3.014916458494459e-05,
"loss": 0.388,
"num_input_tokens_seen": 678512,
"step": 5255
},
{
"epoch": 4.911297852474323,
"grad_norm": 0.8362892270088196,
"learning_rate": 3.0109290638990772e-05,
"loss": 0.2453,
"num_input_tokens_seen": 679104,
"step": 5260
},
{
"epoch": 4.915966386554622,
"grad_norm": 9.181543350219727,
"learning_rate": 3.0069403124397412e-05,
"loss": 0.3347,
"num_input_tokens_seen": 679712,
"step": 5265
},
{
"epoch": 4.920634920634921,
"grad_norm": 4.07673454284668,
"learning_rate": 3.002950214709297e-05,
"loss": 0.1877,
"num_input_tokens_seen": 680304,
"step": 5270
},
{
"epoch": 4.92530345471522,
"grad_norm": 4.580996513366699,
"learning_rate": 2.998958781304167e-05,
"loss": 0.1706,
"num_input_tokens_seen": 680992,
"step": 5275
},
{
"epoch": 4.9299719887955185,
"grad_norm": 3.018458604812622,
"learning_rate": 2.994966022824319e-05,
"loss": 0.2046,
"num_input_tokens_seen": 681600,
"step": 5280
},
{
"epoch": 4.934640522875817,
"grad_norm": 4.071977138519287,
"learning_rate": 2.9909719498732414e-05,
"loss": 0.3367,
"num_input_tokens_seen": 682336,
"step": 5285
},
{
"epoch": 4.939309056956116,
"grad_norm": 2.586224317550659,
"learning_rate": 2.9869765730579125e-05,
"loss": 0.2818,
"num_input_tokens_seen": 683040,
"step": 5290
},
{
"epoch": 4.943977591036415,
"grad_norm": 9.728919982910156,
"learning_rate": 2.9829799029887738e-05,
"loss": 0.4241,
"num_input_tokens_seen": 683632,
"step": 5295
},
{
"epoch": 4.9486461251167135,
"grad_norm": 5.366093635559082,
"learning_rate": 2.9789819502797012e-05,
"loss": 0.2647,
"num_input_tokens_seen": 684304,
"step": 5300
},
{
"epoch": 4.953314659197012,
"grad_norm": 2.941143751144409,
"learning_rate": 2.9749827255479755e-05,
"loss": 0.2741,
"num_input_tokens_seen": 684960,
"step": 5305
},
{
"epoch": 4.957983193277311,
"grad_norm": 2.2302086353302,
"learning_rate": 2.9709822394142572e-05,
"loss": 0.2222,
"num_input_tokens_seen": 685744,
"step": 5310
},
{
"epoch": 4.96265172735761,
"grad_norm": 6.357166767120361,
"learning_rate": 2.9669805025025567e-05,
"loss": 0.5765,
"num_input_tokens_seen": 686352,
"step": 5315
},
{
"epoch": 4.967320261437909,
"grad_norm": 6.87725305557251,
"learning_rate": 2.9629775254402053e-05,
"loss": 0.208,
"num_input_tokens_seen": 686960,
"step": 5320
},
{
"epoch": 4.971988795518207,
"grad_norm": 6.549231052398682,
"learning_rate": 2.958973318857827e-05,
"loss": 0.2937,
"num_input_tokens_seen": 687584,
"step": 5325
},
{
"epoch": 4.976657329598506,
"grad_norm": 1.1992058753967285,
"learning_rate": 2.9549678933893143e-05,
"loss": 0.3239,
"num_input_tokens_seen": 688272,
"step": 5330
},
{
"epoch": 4.981325863678805,
"grad_norm": 4.71303129196167,
"learning_rate": 2.950961259671793e-05,
"loss": 0.4471,
"num_input_tokens_seen": 688880,
"step": 5335
},
{
"epoch": 4.985994397759104,
"grad_norm": 4.6151933670043945,
"learning_rate": 2.946953428345598e-05,
"loss": 0.3116,
"num_input_tokens_seen": 689504,
"step": 5340
},
{
"epoch": 4.9906629318394025,
"grad_norm": 4.89030647277832,
"learning_rate": 2.942944410054248e-05,
"loss": 0.2833,
"num_input_tokens_seen": 690160,
"step": 5345
},
{
"epoch": 4.995331465919701,
"grad_norm": 14.496315956115723,
"learning_rate": 2.9389342154444093e-05,
"loss": 0.5057,
"num_input_tokens_seen": 690736,
"step": 5350
},
{
"epoch": 5.0,
"grad_norm": 11.446124076843262,
"learning_rate": 2.9349228551658766e-05,
"loss": 0.5066,
"num_input_tokens_seen": 691304,
"step": 5355
},
{
"epoch": 5.004668534080299,
"grad_norm": 4.030084609985352,
"learning_rate": 2.930910339871536e-05,
"loss": 0.2427,
"num_input_tokens_seen": 691912,
"step": 5360
},
{
"epoch": 5.004668534080299,
"eval_loss": 0.7145299911499023,
"eval_runtime": 3.8675,
"eval_samples_per_second": 61.539,
"eval_steps_per_second": 30.769,
"num_input_tokens_seen": 691912,
"step": 5360
},
{
"epoch": 5.0093370681605975,
"grad_norm": 2.417534351348877,
"learning_rate": 2.9268966802173436e-05,
"loss": 0.1194,
"num_input_tokens_seen": 692440,
"step": 5365
},
{
"epoch": 5.014005602240896,
"grad_norm": 3.961277484893799,
"learning_rate": 2.9228818868622953e-05,
"loss": 0.4708,
"num_input_tokens_seen": 693032,
"step": 5370
},
{
"epoch": 5.018674136321195,
"grad_norm": 4.8509721755981445,
"learning_rate": 2.9188659704683953e-05,
"loss": 0.3194,
"num_input_tokens_seen": 693704,
"step": 5375
},
{
"epoch": 5.023342670401494,
"grad_norm": 2.1123046875,
"learning_rate": 2.9148489417006308e-05,
"loss": 0.1934,
"num_input_tokens_seen": 694328,
"step": 5380
},
{
"epoch": 5.028011204481793,
"grad_norm": 0.24643854796886444,
"learning_rate": 2.910830811226944e-05,
"loss": 0.1006,
"num_input_tokens_seen": 695000,
"step": 5385
},
{
"epoch": 5.032679738562091,
"grad_norm": 1.844314455986023,
"learning_rate": 2.9068115897182036e-05,
"loss": 0.1476,
"num_input_tokens_seen": 695656,
"step": 5390
},
{
"epoch": 5.03734827264239,
"grad_norm": 9.13713550567627,
"learning_rate": 2.902791287848173e-05,
"loss": 0.2345,
"num_input_tokens_seen": 696264,
"step": 5395
},
{
"epoch": 5.042016806722689,
"grad_norm": 1.417795181274414,
"learning_rate": 2.898769916293488e-05,
"loss": 0.1987,
"num_input_tokens_seen": 696856,
"step": 5400
},
{
"epoch": 5.046685340802988,
"grad_norm": 8.42477798461914,
"learning_rate": 2.894747485733622e-05,
"loss": 0.2888,
"num_input_tokens_seen": 697512,
"step": 5405
},
{
"epoch": 5.0513538748832865,
"grad_norm": 7.067439079284668,
"learning_rate": 2.8907240068508627e-05,
"loss": 0.3286,
"num_input_tokens_seen": 698152,
"step": 5410
},
{
"epoch": 5.056022408963585,
"grad_norm": 3.5997512340545654,
"learning_rate": 2.8866994903302823e-05,
"loss": 0.1111,
"num_input_tokens_seen": 698824,
"step": 5415
},
{
"epoch": 5.060690943043884,
"grad_norm": 4.830616474151611,
"learning_rate": 2.8826739468597068e-05,
"loss": 0.1264,
"num_input_tokens_seen": 699544,
"step": 5420
},
{
"epoch": 5.065359477124183,
"grad_norm": 15.35937786102295,
"learning_rate": 2.87864738712969e-05,
"loss": 0.2375,
"num_input_tokens_seen": 700216,
"step": 5425
},
{
"epoch": 5.0700280112044815,
"grad_norm": 0.5134664177894592,
"learning_rate": 2.8746198218334862e-05,
"loss": 0.1763,
"num_input_tokens_seen": 700840,
"step": 5430
},
{
"epoch": 5.07469654528478,
"grad_norm": 9.345810890197754,
"learning_rate": 2.870591261667018e-05,
"loss": 0.1846,
"num_input_tokens_seen": 701544,
"step": 5435
},
{
"epoch": 5.079365079365079,
"grad_norm": 11.941632270812988,
"learning_rate": 2.8665617173288516e-05,
"loss": 0.2576,
"num_input_tokens_seen": 702104,
"step": 5440
},
{
"epoch": 5.084033613445378,
"grad_norm": 5.5290422439575195,
"learning_rate": 2.8625311995201648e-05,
"loss": 0.0805,
"num_input_tokens_seen": 702776,
"step": 5445
},
{
"epoch": 5.088702147525677,
"grad_norm": 4.16331672668457,
"learning_rate": 2.8584997189447226e-05,
"loss": 0.1695,
"num_input_tokens_seen": 703400,
"step": 5450
},
{
"epoch": 5.093370681605975,
"grad_norm": 14.67473030090332,
"learning_rate": 2.854467286308848e-05,
"loss": 0.3571,
"num_input_tokens_seen": 704024,
"step": 5455
},
{
"epoch": 5.098039215686274,
"grad_norm": 4.0992255210876465,
"learning_rate": 2.8504339123213886e-05,
"loss": 0.0809,
"num_input_tokens_seen": 704712,
"step": 5460
},
{
"epoch": 5.102707749766573,
"grad_norm": 10.043177604675293,
"learning_rate": 2.8463996076936944e-05,
"loss": 0.1856,
"num_input_tokens_seen": 705384,
"step": 5465
},
{
"epoch": 5.107376283846872,
"grad_norm": 10.882224082946777,
"learning_rate": 2.8423643831395856e-05,
"loss": 0.3468,
"num_input_tokens_seen": 706056,
"step": 5470
},
{
"epoch": 5.1120448179271705,
"grad_norm": 2.696959972381592,
"learning_rate": 2.8383282493753283e-05,
"loss": 0.0661,
"num_input_tokens_seen": 706664,
"step": 5475
},
{
"epoch": 5.116713352007469,
"grad_norm": 12.80626106262207,
"learning_rate": 2.834291217119599e-05,
"loss": 0.3852,
"num_input_tokens_seen": 707224,
"step": 5480
},
{
"epoch": 5.121381886087768,
"grad_norm": 0.5678945183753967,
"learning_rate": 2.830253297093463e-05,
"loss": 0.1757,
"num_input_tokens_seen": 707800,
"step": 5485
},
{
"epoch": 5.126050420168067,
"grad_norm": 3.3095643520355225,
"learning_rate": 2.826214500020344e-05,
"loss": 0.2569,
"num_input_tokens_seen": 708456,
"step": 5490
},
{
"epoch": 5.130718954248366,
"grad_norm": 75.66603088378906,
"learning_rate": 2.8221748366259915e-05,
"loss": 0.2313,
"num_input_tokens_seen": 709080,
"step": 5495
},
{
"epoch": 5.135387488328665,
"grad_norm": 8.143814086914062,
"learning_rate": 2.818134317638459e-05,
"loss": 0.2562,
"num_input_tokens_seen": 709720,
"step": 5500
},
{
"epoch": 5.140056022408964,
"grad_norm": 3.207918405532837,
"learning_rate": 2.81409295378807e-05,
"loss": 0.1835,
"num_input_tokens_seen": 710344,
"step": 5505
},
{
"epoch": 5.144724556489263,
"grad_norm": 2.8475122451782227,
"learning_rate": 2.8100507558073924e-05,
"loss": 0.2123,
"num_input_tokens_seen": 711000,
"step": 5510
},
{
"epoch": 5.1493930905695615,
"grad_norm": 1.4689849615097046,
"learning_rate": 2.8060077344312106e-05,
"loss": 0.1878,
"num_input_tokens_seen": 711592,
"step": 5515
},
{
"epoch": 5.15406162464986,
"grad_norm": 6.083222389221191,
"learning_rate": 2.8019639003964936e-05,
"loss": 0.2004,
"num_input_tokens_seen": 712264,
"step": 5520
},
{
"epoch": 5.158730158730159,
"grad_norm": 4.9259748458862305,
"learning_rate": 2.7979192644423703e-05,
"loss": 0.1988,
"num_input_tokens_seen": 712936,
"step": 5525
},
{
"epoch": 5.163398692810458,
"grad_norm": 2.0269927978515625,
"learning_rate": 2.7938738373100966e-05,
"loss": 0.1394,
"num_input_tokens_seen": 713624,
"step": 5530
},
{
"epoch": 5.168067226890757,
"grad_norm": 17.339391708374023,
"learning_rate": 2.789827629743032e-05,
"loss": 0.355,
"num_input_tokens_seen": 714312,
"step": 5535
},
{
"epoch": 5.172735760971055,
"grad_norm": 4.577376365661621,
"learning_rate": 2.78578065248661e-05,
"loss": 0.1229,
"num_input_tokens_seen": 714968,
"step": 5540
},
{
"epoch": 5.177404295051354,
"grad_norm": 3.624584197998047,
"learning_rate": 2.781732916288303e-05,
"loss": 0.09,
"num_input_tokens_seen": 715608,
"step": 5545
},
{
"epoch": 5.182072829131653,
"grad_norm": 3.7476773262023926,
"learning_rate": 2.7776844318976035e-05,
"loss": 0.2231,
"num_input_tokens_seen": 716232,
"step": 5550
},
{
"epoch": 5.186741363211952,
"grad_norm": 3.4681551456451416,
"learning_rate": 2.773635210065989e-05,
"loss": 0.1263,
"num_input_tokens_seen": 716936,
"step": 5555
},
{
"epoch": 5.19140989729225,
"grad_norm": 4.748959064483643,
"learning_rate": 2.769585261546897e-05,
"loss": 0.2337,
"num_input_tokens_seen": 717576,
"step": 5560
},
{
"epoch": 5.196078431372549,
"grad_norm": 6.100517272949219,
"learning_rate": 2.765534597095692e-05,
"loss": 0.1429,
"num_input_tokens_seen": 718248,
"step": 5565
},
{
"epoch": 5.200746965452848,
"grad_norm": 3.645913600921631,
"learning_rate": 2.7614832274696416e-05,
"loss": 0.3484,
"num_input_tokens_seen": 718904,
"step": 5570
},
{
"epoch": 5.205415499533147,
"grad_norm": 6.494524002075195,
"learning_rate": 2.7574311634278872e-05,
"loss": 0.2583,
"num_input_tokens_seen": 719640,
"step": 5575
},
{
"epoch": 5.2100840336134455,
"grad_norm": 14.866647720336914,
"learning_rate": 2.753378415731412e-05,
"loss": 0.3697,
"num_input_tokens_seen": 720344,
"step": 5580
},
{
"epoch": 5.214752567693744,
"grad_norm": 6.337793827056885,
"learning_rate": 2.749324995143016e-05,
"loss": 0.1749,
"num_input_tokens_seen": 721064,
"step": 5585
},
{
"epoch": 5.219421101774043,
"grad_norm": 3.1200544834136963,
"learning_rate": 2.7452709124272863e-05,
"loss": 0.1373,
"num_input_tokens_seen": 721672,
"step": 5590
},
{
"epoch": 5.224089635854342,
"grad_norm": 5.817122936248779,
"learning_rate": 2.741216178350568e-05,
"loss": 0.2363,
"num_input_tokens_seen": 722280,
"step": 5595
},
{
"epoch": 5.228758169934641,
"grad_norm": 0.6314363479614258,
"learning_rate": 2.7371608036809364e-05,
"loss": 0.457,
"num_input_tokens_seen": 722920,
"step": 5600
},
{
"epoch": 5.233426704014939,
"grad_norm": 8.405885696411133,
"learning_rate": 2.733104799188168e-05,
"loss": 0.2979,
"num_input_tokens_seen": 723560,
"step": 5605
},
{
"epoch": 5.238095238095238,
"grad_norm": 5.43946647644043,
"learning_rate": 2.7290481756437112e-05,
"loss": 0.1767,
"num_input_tokens_seen": 724136,
"step": 5610
},
{
"epoch": 5.242763772175537,
"grad_norm": 0.9292370080947876,
"learning_rate": 2.724990943820659e-05,
"loss": 0.1797,
"num_input_tokens_seen": 724728,
"step": 5615
},
{
"epoch": 5.247432306255836,
"grad_norm": 3.5616211891174316,
"learning_rate": 2.72093311449372e-05,
"loss": 0.1392,
"num_input_tokens_seen": 725368,
"step": 5620
},
{
"epoch": 5.2521008403361344,
"grad_norm": 6.952834606170654,
"learning_rate": 2.716874698439189e-05,
"loss": 0.3549,
"num_input_tokens_seen": 726072,
"step": 5625
},
{
"epoch": 5.256769374416433,
"grad_norm": 3.403163433074951,
"learning_rate": 2.7128157064349186e-05,
"loss": 0.2227,
"num_input_tokens_seen": 726744,
"step": 5630
},
{
"epoch": 5.261437908496732,
"grad_norm": 0.7634921669960022,
"learning_rate": 2.7087561492602925e-05,
"loss": 0.0984,
"num_input_tokens_seen": 727368,
"step": 5635
},
{
"epoch": 5.266106442577031,
"grad_norm": 0.6099923253059387,
"learning_rate": 2.7046960376961934e-05,
"loss": 0.6007,
"num_input_tokens_seen": 728008,
"step": 5640
},
{
"epoch": 5.2707749766573295,
"grad_norm": 2.206415891647339,
"learning_rate": 2.7006353825249792e-05,
"loss": 0.1308,
"num_input_tokens_seen": 728696,
"step": 5645
},
{
"epoch": 5.275443510737628,
"grad_norm": 6.221801280975342,
"learning_rate": 2.6965741945304467e-05,
"loss": 0.2882,
"num_input_tokens_seen": 729400,
"step": 5650
},
{
"epoch": 5.280112044817927,
"grad_norm": 3.225241184234619,
"learning_rate": 2.6925124844978126e-05,
"loss": 0.2439,
"num_input_tokens_seen": 730024,
"step": 5655
},
{
"epoch": 5.284780578898226,
"grad_norm": 8.841064453125,
"learning_rate": 2.6884502632136777e-05,
"loss": 0.1993,
"num_input_tokens_seen": 730664,
"step": 5660
},
{
"epoch": 5.289449112978525,
"grad_norm": 2.7826311588287354,
"learning_rate": 2.6843875414659996e-05,
"loss": 0.2607,
"num_input_tokens_seen": 731320,
"step": 5665
},
{
"epoch": 5.294117647058823,
"grad_norm": 4.322564601898193,
"learning_rate": 2.680324330044067e-05,
"loss": 0.1684,
"num_input_tokens_seen": 731992,
"step": 5670
},
{
"epoch": 5.298786181139122,
"grad_norm": 1.4055625200271606,
"learning_rate": 2.6762606397384677e-05,
"loss": 0.1244,
"num_input_tokens_seen": 732808,
"step": 5675
},
{
"epoch": 5.303454715219421,
"grad_norm": 3.7955009937286377,
"learning_rate": 2.6721964813410616e-05,
"loss": 0.2172,
"num_input_tokens_seen": 733448,
"step": 5680
},
{
"epoch": 5.30812324929972,
"grad_norm": 1.5904556512832642,
"learning_rate": 2.6681318656449522e-05,
"loss": 0.2895,
"num_input_tokens_seen": 734040,
"step": 5685
},
{
"epoch": 5.3127917833800185,
"grad_norm": 4.815072059631348,
"learning_rate": 2.664066803444456e-05,
"loss": 0.1865,
"num_input_tokens_seen": 734760,
"step": 5690
},
{
"epoch": 5.317460317460317,
"grad_norm": 11.906131744384766,
"learning_rate": 2.6600013055350776e-05,
"loss": 0.1374,
"num_input_tokens_seen": 735448,
"step": 5695
},
{
"epoch": 5.322128851540616,
"grad_norm": 2.0406405925750732,
"learning_rate": 2.6559353827134754e-05,
"loss": 0.1208,
"num_input_tokens_seen": 736104,
"step": 5700
},
{
"epoch": 5.326797385620915,
"grad_norm": 5.281521320343018,
"learning_rate": 2.651869045777441e-05,
"loss": 0.2245,
"num_input_tokens_seen": 736696,
"step": 5705
},
{
"epoch": 5.3314659197012135,
"grad_norm": 2.748307704925537,
"learning_rate": 2.6478023055258606e-05,
"loss": 0.2027,
"num_input_tokens_seen": 737336,
"step": 5710
},
{
"epoch": 5.336134453781512,
"grad_norm": 18.87708854675293,
"learning_rate": 2.643735172758694e-05,
"loss": 0.2526,
"num_input_tokens_seen": 737976,
"step": 5715
},
{
"epoch": 5.340802987861811,
"grad_norm": 3.8325388431549072,
"learning_rate": 2.6396676582769447e-05,
"loss": 0.2278,
"num_input_tokens_seen": 738648,
"step": 5720
},
{
"epoch": 5.34547152194211,
"grad_norm": 4.2706990242004395,
"learning_rate": 2.6355997728826276e-05,
"loss": 0.0846,
"num_input_tokens_seen": 739384,
"step": 5725
},
{
"epoch": 5.350140056022409,
"grad_norm": 2.1158320903778076,
"learning_rate": 2.6315315273787428e-05,
"loss": 0.2136,
"num_input_tokens_seen": 739992,
"step": 5730
},
{
"epoch": 5.354808590102707,
"grad_norm": 3.025188684463501,
"learning_rate": 2.627462932569248e-05,
"loss": 0.3116,
"num_input_tokens_seen": 740824,
"step": 5735
},
{
"epoch": 5.359477124183006,
"grad_norm": 3.6472253799438477,
"learning_rate": 2.6233939992590277e-05,
"loss": 0.3267,
"num_input_tokens_seen": 741448,
"step": 5740
},
{
"epoch": 5.364145658263305,
"grad_norm": 1.7700618505477905,
"learning_rate": 2.619324738253867e-05,
"loss": 0.1885,
"num_input_tokens_seen": 742184,
"step": 5745
},
{
"epoch": 5.368814192343605,
"grad_norm": 0.3386409878730774,
"learning_rate": 2.6152551603604176e-05,
"loss": 0.1666,
"num_input_tokens_seen": 742808,
"step": 5750
},
{
"epoch": 5.373482726423903,
"grad_norm": 1.589062213897705,
"learning_rate": 2.611185276386176e-05,
"loss": 0.3356,
"num_input_tokens_seen": 743384,
"step": 5755
},
{
"epoch": 5.378151260504202,
"grad_norm": 3.3072431087493896,
"learning_rate": 2.6071150971394503e-05,
"loss": 0.1891,
"num_input_tokens_seen": 743944,
"step": 5760
},
{
"epoch": 5.382819794584501,
"grad_norm": 12.216253280639648,
"learning_rate": 2.603044633429334e-05,
"loss": 0.5011,
"num_input_tokens_seen": 744520,
"step": 5765
},
{
"epoch": 5.3874883286648,
"grad_norm": 10.251158714294434,
"learning_rate": 2.598973896065674e-05,
"loss": 0.2908,
"num_input_tokens_seen": 745176,
"step": 5770
},
{
"epoch": 5.392156862745098,
"grad_norm": 2.105750560760498,
"learning_rate": 2.5949028958590447e-05,
"loss": 0.1305,
"num_input_tokens_seen": 745848,
"step": 5775
},
{
"epoch": 5.396825396825397,
"grad_norm": 3.5795063972473145,
"learning_rate": 2.5908316436207203e-05,
"loss": 0.198,
"num_input_tokens_seen": 746504,
"step": 5780
},
{
"epoch": 5.401493930905696,
"grad_norm": 7.272071361541748,
"learning_rate": 2.5867601501626415e-05,
"loss": 0.3211,
"num_input_tokens_seen": 747128,
"step": 5785
},
{
"epoch": 5.406162464985995,
"grad_norm": 3.216158866882324,
"learning_rate": 2.5826884262973906e-05,
"loss": 0.0894,
"num_input_tokens_seen": 747752,
"step": 5790
},
{
"epoch": 5.4108309990662935,
"grad_norm": 1.1698273420333862,
"learning_rate": 2.5786164828381633e-05,
"loss": 0.2741,
"num_input_tokens_seen": 748296,
"step": 5795
},
{
"epoch": 5.415499533146592,
"grad_norm": 6.914623260498047,
"learning_rate": 2.5745443305987366e-05,
"loss": 0.2973,
"num_input_tokens_seen": 748920,
"step": 5800
},
{
"epoch": 5.420168067226891,
"grad_norm": 8.555301666259766,
"learning_rate": 2.570471980393443e-05,
"loss": 0.2134,
"num_input_tokens_seen": 749672,
"step": 5805
},
{
"epoch": 5.42483660130719,
"grad_norm": 3.0789635181427,
"learning_rate": 2.5663994430371403e-05,
"loss": 0.2423,
"num_input_tokens_seen": 750344,
"step": 5810
},
{
"epoch": 5.429505135387489,
"grad_norm": 19.087554931640625,
"learning_rate": 2.5623267293451826e-05,
"loss": 0.1097,
"num_input_tokens_seen": 751000,
"step": 5815
},
{
"epoch": 5.434173669467787,
"grad_norm": 10.77205753326416,
"learning_rate": 2.5582538501333934e-05,
"loss": 0.2338,
"num_input_tokens_seen": 751752,
"step": 5820
},
{
"epoch": 5.438842203548086,
"grad_norm": 1.9925490617752075,
"learning_rate": 2.5541808162180364e-05,
"loss": 0.1922,
"num_input_tokens_seen": 752392,
"step": 5825
},
{
"epoch": 5.443510737628385,
"grad_norm": 11.317987442016602,
"learning_rate": 2.5501076384157848e-05,
"loss": 0.323,
"num_input_tokens_seen": 753112,
"step": 5830
},
{
"epoch": 5.448179271708684,
"grad_norm": 6.738025188446045,
"learning_rate": 2.5460343275436925e-05,
"loss": 0.1361,
"num_input_tokens_seen": 753704,
"step": 5835
},
{
"epoch": 5.452847805788982,
"grad_norm": 6.475763320922852,
"learning_rate": 2.5419608944191714e-05,
"loss": 0.2934,
"num_input_tokens_seen": 754344,
"step": 5840
},
{
"epoch": 5.457516339869281,
"grad_norm": 0.0828949436545372,
"learning_rate": 2.5378873498599535e-05,
"loss": 0.2367,
"num_input_tokens_seen": 755128,
"step": 5845
},
{
"epoch": 5.46218487394958,
"grad_norm": 0.22640591859817505,
"learning_rate": 2.5338137046840687e-05,
"loss": 0.1342,
"num_input_tokens_seen": 755800,
"step": 5850
},
{
"epoch": 5.466853408029879,
"grad_norm": 6.038141250610352,
"learning_rate": 2.529739969709814e-05,
"loss": 0.1208,
"num_input_tokens_seen": 756520,
"step": 5855
},
{
"epoch": 5.4715219421101775,
"grad_norm": 12.068619728088379,
"learning_rate": 2.5256661557557247e-05,
"loss": 0.5476,
"num_input_tokens_seen": 757128,
"step": 5860
},
{
"epoch": 5.476190476190476,
"grad_norm": 8.999595642089844,
"learning_rate": 2.5215922736405468e-05,
"loss": 0.1985,
"num_input_tokens_seen": 757864,
"step": 5865
},
{
"epoch": 5.480859010270775,
"grad_norm": 6.384941101074219,
"learning_rate": 2.5175183341832048e-05,
"loss": 0.3681,
"num_input_tokens_seen": 758472,
"step": 5870
},
{
"epoch": 5.485527544351074,
"grad_norm": 4.548037052154541,
"learning_rate": 2.513444348202778e-05,
"loss": 0.1332,
"num_input_tokens_seen": 759032,
"step": 5875
},
{
"epoch": 5.490196078431373,
"grad_norm": 3.7748093605041504,
"learning_rate": 2.5093703265184686e-05,
"loss": 0.1326,
"num_input_tokens_seen": 759624,
"step": 5880
},
{
"epoch": 5.494864612511671,
"grad_norm": 3.9037418365478516,
"learning_rate": 2.505296279949574e-05,
"loss": 0.1086,
"num_input_tokens_seen": 760264,
"step": 5885
},
{
"epoch": 5.49953314659197,
"grad_norm": 0.08623213320970535,
"learning_rate": 2.5012222193154548e-05,
"loss": 0.1744,
"num_input_tokens_seen": 761192,
"step": 5890
},
{
"epoch": 5.504201680672269,
"grad_norm": 7.6671013832092285,
"learning_rate": 2.4971481554355133e-05,
"loss": 0.2383,
"num_input_tokens_seen": 761864,
"step": 5895
},
{
"epoch": 5.505135387488329,
"eval_loss": 0.8534978032112122,
"eval_runtime": 3.8678,
"eval_samples_per_second": 61.533,
"eval_steps_per_second": 30.766,
"num_input_tokens_seen": 762008,
"step": 5896
},
{
"epoch": 5.508870214752568,
"grad_norm": 5.7819647789001465,
"learning_rate": 2.4930740991291567e-05,
"loss": 0.208,
"num_input_tokens_seen": 762552,
"step": 5900
},
{
"epoch": 5.513538748832866,
"grad_norm": 4.081792831420898,
"learning_rate": 2.489000061215775e-05,
"loss": 0.1272,
"num_input_tokens_seen": 763096,
"step": 5905
},
{
"epoch": 5.518207282913165,
"grad_norm": 7.347453594207764,
"learning_rate": 2.4849260525147078e-05,
"loss": 0.4156,
"num_input_tokens_seen": 763768,
"step": 5910
},
{
"epoch": 5.522875816993464,
"grad_norm": 10.584314346313477,
"learning_rate": 2.4808520838452168e-05,
"loss": 0.3707,
"num_input_tokens_seen": 764344,
"step": 5915
},
{
"epoch": 5.527544351073763,
"grad_norm": 10.097419738769531,
"learning_rate": 2.4767781660264596e-05,
"loss": 0.2649,
"num_input_tokens_seen": 764904,
"step": 5920
},
{
"epoch": 5.5322128851540615,
"grad_norm": 4.417219161987305,
"learning_rate": 2.4727043098774548e-05,
"loss": 0.1982,
"num_input_tokens_seen": 765512,
"step": 5925
},
{
"epoch": 5.53688141923436,
"grad_norm": 2.841074228286743,
"learning_rate": 2.4686305262170617e-05,
"loss": 0.4358,
"num_input_tokens_seen": 766072,
"step": 5930
},
{
"epoch": 5.541549953314659,
"grad_norm": 7.550325393676758,
"learning_rate": 2.4645568258639433e-05,
"loss": 0.2835,
"num_input_tokens_seen": 766728,
"step": 5935
},
{
"epoch": 5.546218487394958,
"grad_norm": 7.056575298309326,
"learning_rate": 2.4604832196365435e-05,
"loss": 0.2372,
"num_input_tokens_seen": 767336,
"step": 5940
},
{
"epoch": 5.550887021475257,
"grad_norm": 3.2988297939300537,
"learning_rate": 2.4564097183530572e-05,
"loss": 0.2944,
"num_input_tokens_seen": 768008,
"step": 5945
},
{
"epoch": 5.555555555555555,
"grad_norm": 7.315042018890381,
"learning_rate": 2.4523363328313974e-05,
"loss": 0.1725,
"num_input_tokens_seen": 768664,
"step": 5950
},
{
"epoch": 5.560224089635854,
"grad_norm": 21.145261764526367,
"learning_rate": 2.4482630738891713e-05,
"loss": 0.2216,
"num_input_tokens_seen": 769208,
"step": 5955
},
{
"epoch": 5.564892623716153,
"grad_norm": 2.414390802383423,
"learning_rate": 2.444189952343651e-05,
"loss": 0.1617,
"num_input_tokens_seen": 769864,
"step": 5960
},
{
"epoch": 5.569561157796452,
"grad_norm": 6.385523319244385,
"learning_rate": 2.4401169790117427e-05,
"loss": 0.1384,
"num_input_tokens_seen": 770472,
"step": 5965
},
{
"epoch": 5.57422969187675,
"grad_norm": 4.405248165130615,
"learning_rate": 2.4360441647099592e-05,
"loss": 0.1617,
"num_input_tokens_seen": 771128,
"step": 5970
},
{
"epoch": 5.578898225957049,
"grad_norm": 10.55893611907959,
"learning_rate": 2.4319715202543905e-05,
"loss": 0.2984,
"num_input_tokens_seen": 771784,
"step": 5975
},
{
"epoch": 5.583566760037348,
"grad_norm": 8.2843017578125,
"learning_rate": 2.4278990564606753e-05,
"loss": 0.1171,
"num_input_tokens_seen": 772472,
"step": 5980
},
{
"epoch": 5.588235294117647,
"grad_norm": 0.031208815053105354,
"learning_rate": 2.423826784143974e-05,
"loss": 0.1669,
"num_input_tokens_seen": 773192,
"step": 5985
},
{
"epoch": 5.5929038281979455,
"grad_norm": 14.423694610595703,
"learning_rate": 2.419754714118938e-05,
"loss": 0.2992,
"num_input_tokens_seen": 773800,
"step": 5990
},
{
"epoch": 5.597572362278244,
"grad_norm": 0.023451492190361023,
"learning_rate": 2.4156828571996808e-05,
"loss": 0.2251,
"num_input_tokens_seen": 774472,
"step": 5995
},
{
"epoch": 5.602240896358543,
"grad_norm": 3.960092306137085,
"learning_rate": 2.4116112241997486e-05,
"loss": 0.1467,
"num_input_tokens_seen": 775080,
"step": 6000
},
{
"epoch": 5.606909430438842,
"grad_norm": 4.254623889923096,
"learning_rate": 2.4075398259320964e-05,
"loss": 0.2805,
"num_input_tokens_seen": 775704,
"step": 6005
},
{
"epoch": 5.611577964519141,
"grad_norm": 2.3658244609832764,
"learning_rate": 2.403468673209054e-05,
"loss": 0.1634,
"num_input_tokens_seen": 776328,
"step": 6010
},
{
"epoch": 5.616246498599439,
"grad_norm": 5.901800632476807,
"learning_rate": 2.399397776842298e-05,
"loss": 0.4695,
"num_input_tokens_seen": 776936,
"step": 6015
},
{
"epoch": 5.620915032679738,
"grad_norm": 4.022805690765381,
"learning_rate": 2.3953271476428268e-05,
"loss": 0.1232,
"num_input_tokens_seen": 777576,
"step": 6020
},
{
"epoch": 5.625583566760037,
"grad_norm": 14.644888877868652,
"learning_rate": 2.3912567964209264e-05,
"loss": 0.3136,
"num_input_tokens_seen": 778168,
"step": 6025
},
{
"epoch": 5.630252100840336,
"grad_norm": 6.010149002075195,
"learning_rate": 2.387186733986147e-05,
"loss": 0.2141,
"num_input_tokens_seen": 778792,
"step": 6030
},
{
"epoch": 5.634920634920634,
"grad_norm": 4.33161735534668,
"learning_rate": 2.38311697114727e-05,
"loss": 0.2491,
"num_input_tokens_seen": 779448,
"step": 6035
},
{
"epoch": 5.639589169000933,
"grad_norm": 6.012837886810303,
"learning_rate": 2.3790475187122836e-05,
"loss": 0.2456,
"num_input_tokens_seen": 780168,
"step": 6040
},
{
"epoch": 5.644257703081233,
"grad_norm": 0.5816524624824524,
"learning_rate": 2.374978387488348e-05,
"loss": 0.2491,
"num_input_tokens_seen": 780728,
"step": 6045
},
{
"epoch": 5.648926237161532,
"grad_norm": 14.974713325500488,
"learning_rate": 2.3709095882817737e-05,
"loss": 0.2709,
"num_input_tokens_seen": 781368,
"step": 6050
},
{
"epoch": 5.65359477124183,
"grad_norm": 8.99953842163086,
"learning_rate": 2.3668411318979884e-05,
"loss": 0.3081,
"num_input_tokens_seen": 782072,
"step": 6055
},
{
"epoch": 5.658263305322129,
"grad_norm": 1.4049012660980225,
"learning_rate": 2.362773029141508e-05,
"loss": 0.1005,
"num_input_tokens_seen": 782744,
"step": 6060
},
{
"epoch": 5.662931839402428,
"grad_norm": 2.7593705654144287,
"learning_rate": 2.358705290815913e-05,
"loss": 0.1774,
"num_input_tokens_seen": 783384,
"step": 6065
},
{
"epoch": 5.667600373482727,
"grad_norm": 8.04900074005127,
"learning_rate": 2.3546379277238107e-05,
"loss": 0.1899,
"num_input_tokens_seen": 784008,
"step": 6070
},
{
"epoch": 5.6722689075630255,
"grad_norm": 5.208436012268066,
"learning_rate": 2.3505709506668165e-05,
"loss": 0.1833,
"num_input_tokens_seen": 784648,
"step": 6075
},
{
"epoch": 5.676937441643324,
"grad_norm": 3.1610107421875,
"learning_rate": 2.3465043704455182e-05,
"loss": 0.1554,
"num_input_tokens_seen": 785400,
"step": 6080
},
{
"epoch": 5.681605975723623,
"grad_norm": 6.8983988761901855,
"learning_rate": 2.3424381978594505e-05,
"loss": 0.0653,
"num_input_tokens_seen": 786056,
"step": 6085
},
{
"epoch": 5.686274509803922,
"grad_norm": 14.282424926757812,
"learning_rate": 2.3383724437070668e-05,
"loss": 0.1803,
"num_input_tokens_seen": 786648,
"step": 6090
},
{
"epoch": 5.690943043884221,
"grad_norm": 12.445168495178223,
"learning_rate": 2.3343071187857062e-05,
"loss": 0.1965,
"num_input_tokens_seen": 787272,
"step": 6095
},
{
"epoch": 5.695611577964519,
"grad_norm": 2.3133530616760254,
"learning_rate": 2.3302422338915696e-05,
"loss": 0.2115,
"num_input_tokens_seen": 787896,
"step": 6100
},
{
"epoch": 5.700280112044818,
"grad_norm": 2.805271863937378,
"learning_rate": 2.3261777998196905e-05,
"loss": 0.1056,
"num_input_tokens_seen": 788504,
"step": 6105
},
{
"epoch": 5.704948646125117,
"grad_norm": 4.6447248458862305,
"learning_rate": 2.322113827363904e-05,
"loss": 0.1847,
"num_input_tokens_seen": 789112,
"step": 6110
},
{
"epoch": 5.709617180205416,
"grad_norm": 8.13668155670166,
"learning_rate": 2.3180503273168194e-05,
"loss": 0.1741,
"num_input_tokens_seen": 789832,
"step": 6115
},
{
"epoch": 5.714285714285714,
"grad_norm": 2.305729389190674,
"learning_rate": 2.31398731046979e-05,
"loss": 0.3343,
"num_input_tokens_seen": 790424,
"step": 6120
},
{
"epoch": 5.718954248366013,
"grad_norm": 2.2352488040924072,
"learning_rate": 2.3099247876128877e-05,
"loss": 0.154,
"num_input_tokens_seen": 791128,
"step": 6125
},
{
"epoch": 5.723622782446312,
"grad_norm": 10.044025421142578,
"learning_rate": 2.3058627695348737e-05,
"loss": 0.2203,
"num_input_tokens_seen": 791896,
"step": 6130
},
{
"epoch": 5.728291316526611,
"grad_norm": 9.180370330810547,
"learning_rate": 2.3018012670231647e-05,
"loss": 0.131,
"num_input_tokens_seen": 792520,
"step": 6135
},
{
"epoch": 5.7329598506069095,
"grad_norm": 5.549483776092529,
"learning_rate": 2.2977402908638135e-05,
"loss": 0.1664,
"num_input_tokens_seen": 793192,
"step": 6140
},
{
"epoch": 5.737628384687208,
"grad_norm": 10.082823753356934,
"learning_rate": 2.2936798518414686e-05,
"loss": 0.2269,
"num_input_tokens_seen": 793816,
"step": 6145
},
{
"epoch": 5.742296918767507,
"grad_norm": 4.678831577301025,
"learning_rate": 2.2896199607393577e-05,
"loss": 0.1145,
"num_input_tokens_seen": 794520,
"step": 6150
},
{
"epoch": 5.746965452847806,
"grad_norm": 7.657114505767822,
"learning_rate": 2.2855606283392516e-05,
"loss": 0.1094,
"num_input_tokens_seen": 795176,
"step": 6155
},
{
"epoch": 5.751633986928105,
"grad_norm": 20.42408561706543,
"learning_rate": 2.281501865421436e-05,
"loss": 0.2962,
"num_input_tokens_seen": 795768,
"step": 6160
},
{
"epoch": 5.756302521008403,
"grad_norm": 5.679710388183594,
"learning_rate": 2.2774436827646865e-05,
"loss": 0.392,
"num_input_tokens_seen": 796408,
"step": 6165
},
{
"epoch": 5.760971055088702,
"grad_norm": 6.150409698486328,
"learning_rate": 2.2733860911462342e-05,
"loss": 0.2671,
"num_input_tokens_seen": 797048,
"step": 6170
},
{
"epoch": 5.765639589169001,
"grad_norm": 11.56528377532959,
"learning_rate": 2.2693291013417453e-05,
"loss": 0.2104,
"num_input_tokens_seen": 797672,
"step": 6175
},
{
"epoch": 5.7703081232493,
"grad_norm": 4.323744297027588,
"learning_rate": 2.265272724125284e-05,
"loss": 0.2423,
"num_input_tokens_seen": 798312,
"step": 6180
},
{
"epoch": 5.774976657329598,
"grad_norm": 6.345007419586182,
"learning_rate": 2.2612169702692887e-05,
"loss": 0.6142,
"num_input_tokens_seen": 798936,
"step": 6185
},
{
"epoch": 5.779645191409897,
"grad_norm": 7.959813117980957,
"learning_rate": 2.257161850544545e-05,
"loss": 0.3423,
"num_input_tokens_seen": 799496,
"step": 6190
},
{
"epoch": 5.784313725490196,
"grad_norm": 4.817144393920898,
"learning_rate": 2.25310737572015e-05,
"loss": 0.4931,
"num_input_tokens_seen": 800088,
"step": 6195
},
{
"epoch": 5.788982259570495,
"grad_norm": 3.499696969985962,
"learning_rate": 2.2490535565634897e-05,
"loss": 0.2209,
"num_input_tokens_seen": 800728,
"step": 6200
},
{
"epoch": 5.7936507936507935,
"grad_norm": 5.070218086242676,
"learning_rate": 2.2450004038402107e-05,
"loss": 0.1922,
"num_input_tokens_seen": 801272,
"step": 6205
},
{
"epoch": 5.798319327731092,
"grad_norm": 2.386444330215454,
"learning_rate": 2.2409479283141886e-05,
"loss": 0.1461,
"num_input_tokens_seen": 801992,
"step": 6210
},
{
"epoch": 5.802987861811391,
"grad_norm": 22.35248374938965,
"learning_rate": 2.236896140747501e-05,
"loss": 0.2301,
"num_input_tokens_seen": 802664,
"step": 6215
},
{
"epoch": 5.80765639589169,
"grad_norm": 12.163101196289062,
"learning_rate": 2.2328450519003963e-05,
"loss": 0.1354,
"num_input_tokens_seen": 803288,
"step": 6220
},
{
"epoch": 5.812324929971989,
"grad_norm": 9.302704811096191,
"learning_rate": 2.2287946725312693e-05,
"loss": 0.2459,
"num_input_tokens_seen": 803928,
"step": 6225
},
{
"epoch": 5.816993464052287,
"grad_norm": 1.620877742767334,
"learning_rate": 2.2247450133966317e-05,
"loss": 0.264,
"num_input_tokens_seen": 804568,
"step": 6230
},
{
"epoch": 5.821661998132586,
"grad_norm": 2.096247911453247,
"learning_rate": 2.2206960852510804e-05,
"loss": 0.1366,
"num_input_tokens_seen": 805272,
"step": 6235
},
{
"epoch": 5.826330532212885,
"grad_norm": 5.8347578048706055,
"learning_rate": 2.2166478988472716e-05,
"loss": 0.3308,
"num_input_tokens_seen": 805832,
"step": 6240
},
{
"epoch": 5.830999066293184,
"grad_norm": 5.4447550773620605,
"learning_rate": 2.2126004649358916e-05,
"loss": 0.1452,
"num_input_tokens_seen": 806440,
"step": 6245
},
{
"epoch": 5.835667600373482,
"grad_norm": 5.139708042144775,
"learning_rate": 2.2085537942656287e-05,
"loss": 0.1665,
"num_input_tokens_seen": 807112,
"step": 6250
},
{
"epoch": 5.840336134453781,
"grad_norm": 3.3073081970214844,
"learning_rate": 2.2045078975831452e-05,
"loss": 0.2967,
"num_input_tokens_seen": 807672,
"step": 6255
},
{
"epoch": 5.84500466853408,
"grad_norm": 7.739352226257324,
"learning_rate": 2.2004627856330462e-05,
"loss": 0.1503,
"num_input_tokens_seen": 808344,
"step": 6260
},
{
"epoch": 5.849673202614379,
"grad_norm": 2.465211868286133,
"learning_rate": 2.196418469157852e-05,
"loss": 0.1154,
"num_input_tokens_seen": 809144,
"step": 6265
},
{
"epoch": 5.8543417366946775,
"grad_norm": 3.8778762817382812,
"learning_rate": 2.1923749588979737e-05,
"loss": 0.2426,
"num_input_tokens_seen": 809816,
"step": 6270
},
{
"epoch": 5.859010270774976,
"grad_norm": 13.767936706542969,
"learning_rate": 2.1883322655916793e-05,
"loss": 0.2215,
"num_input_tokens_seen": 810520,
"step": 6275
},
{
"epoch": 5.863678804855276,
"grad_norm": 25.827281951904297,
"learning_rate": 2.1842903999750665e-05,
"loss": 0.4355,
"num_input_tokens_seen": 811080,
"step": 6280
},
{
"epoch": 5.868347338935575,
"grad_norm": 2.462376117706299,
"learning_rate": 2.180249372782038e-05,
"loss": 0.2118,
"num_input_tokens_seen": 811704,
"step": 6285
},
{
"epoch": 5.8730158730158735,
"grad_norm": 0.8857133388519287,
"learning_rate": 2.1762091947442643e-05,
"loss": 0.2445,
"num_input_tokens_seen": 812312,
"step": 6290
},
{
"epoch": 5.877684407096172,
"grad_norm": 3.9621355533599854,
"learning_rate": 2.1721698765911674e-05,
"loss": 0.1307,
"num_input_tokens_seen": 812984,
"step": 6295
},
{
"epoch": 5.882352941176471,
"grad_norm": 3.7414557933807373,
"learning_rate": 2.1681314290498806e-05,
"loss": 0.1606,
"num_input_tokens_seen": 813768,
"step": 6300
},
{
"epoch": 5.88702147525677,
"grad_norm": 2.700284719467163,
"learning_rate": 2.164093862845228e-05,
"loss": 0.3214,
"num_input_tokens_seen": 814424,
"step": 6305
},
{
"epoch": 5.8916900093370685,
"grad_norm": 4.355251789093018,
"learning_rate": 2.1600571886996933e-05,
"loss": 0.2542,
"num_input_tokens_seen": 815000,
"step": 6310
},
{
"epoch": 5.896358543417367,
"grad_norm": 0.26216188073158264,
"learning_rate": 2.156021417333388e-05,
"loss": 0.1463,
"num_input_tokens_seen": 815624,
"step": 6315
},
{
"epoch": 5.901027077497666,
"grad_norm": 5.650294780731201,
"learning_rate": 2.1519865594640302e-05,
"loss": 0.1264,
"num_input_tokens_seen": 816248,
"step": 6320
},
{
"epoch": 5.905695611577965,
"grad_norm": 4.275903224945068,
"learning_rate": 2.1479526258069087e-05,
"loss": 0.0928,
"num_input_tokens_seen": 816808,
"step": 6325
},
{
"epoch": 5.910364145658264,
"grad_norm": 6.359528064727783,
"learning_rate": 2.1439196270748598e-05,
"loss": 0.2651,
"num_input_tokens_seen": 817416,
"step": 6330
},
{
"epoch": 5.915032679738562,
"grad_norm": 3.161203145980835,
"learning_rate": 2.139887573978238e-05,
"loss": 0.0926,
"num_input_tokens_seen": 818104,
"step": 6335
},
{
"epoch": 5.919701213818861,
"grad_norm": 7.976051330566406,
"learning_rate": 2.1358564772248826e-05,
"loss": 0.2749,
"num_input_tokens_seen": 818744,
"step": 6340
},
{
"epoch": 5.92436974789916,
"grad_norm": 9.922679901123047,
"learning_rate": 2.131826347520096e-05,
"loss": 0.3568,
"num_input_tokens_seen": 819368,
"step": 6345
},
{
"epoch": 5.929038281979459,
"grad_norm": 2.597715377807617,
"learning_rate": 2.1277971955666125e-05,
"loss": 0.2073,
"num_input_tokens_seen": 820040,
"step": 6350
},
{
"epoch": 5.9337068160597575,
"grad_norm": 16.03790855407715,
"learning_rate": 2.1237690320645695e-05,
"loss": 0.2687,
"num_input_tokens_seen": 820664,
"step": 6355
},
{
"epoch": 5.938375350140056,
"grad_norm": 2.43839168548584,
"learning_rate": 2.1197418677114795e-05,
"loss": 0.257,
"num_input_tokens_seen": 821304,
"step": 6360
},
{
"epoch": 5.943043884220355,
"grad_norm": 5.4671220779418945,
"learning_rate": 2.1157157132021994e-05,
"loss": 0.1754,
"num_input_tokens_seen": 821928,
"step": 6365
},
{
"epoch": 5.947712418300654,
"grad_norm": 3.340067148208618,
"learning_rate": 2.1116905792289067e-05,
"loss": 0.3771,
"num_input_tokens_seen": 822536,
"step": 6370
},
{
"epoch": 5.9523809523809526,
"grad_norm": 8.478741645812988,
"learning_rate": 2.1076664764810693e-05,
"loss": 0.2832,
"num_input_tokens_seen": 823176,
"step": 6375
},
{
"epoch": 5.957049486461251,
"grad_norm": 8.456277847290039,
"learning_rate": 2.103643415645414e-05,
"loss": 0.2786,
"num_input_tokens_seen": 823864,
"step": 6380
},
{
"epoch": 5.96171802054155,
"grad_norm": 2.0959372520446777,
"learning_rate": 2.0996214074059034e-05,
"loss": 0.2735,
"num_input_tokens_seen": 824600,
"step": 6385
},
{
"epoch": 5.966386554621849,
"grad_norm": 1.7711430788040161,
"learning_rate": 2.0956004624437014e-05,
"loss": 0.1891,
"num_input_tokens_seen": 825192,
"step": 6390
},
{
"epoch": 5.971055088702148,
"grad_norm": 8.200021743774414,
"learning_rate": 2.091580591437151e-05,
"loss": 0.1611,
"num_input_tokens_seen": 825800,
"step": 6395
},
{
"epoch": 5.975723622782446,
"grad_norm": 1.9257704019546509,
"learning_rate": 2.087561805061741e-05,
"loss": 0.3492,
"num_input_tokens_seen": 826536,
"step": 6400
},
{
"epoch": 5.980392156862745,
"grad_norm": 11.712030410766602,
"learning_rate": 2.0835441139900836e-05,
"loss": 0.2512,
"num_input_tokens_seen": 827176,
"step": 6405
},
{
"epoch": 5.985060690943044,
"grad_norm": 8.402978897094727,
"learning_rate": 2.0795275288918763e-05,
"loss": 0.4235,
"num_input_tokens_seen": 827832,
"step": 6410
},
{
"epoch": 5.989729225023343,
"grad_norm": 5.503647804260254,
"learning_rate": 2.075512060433884e-05,
"loss": 0.2737,
"num_input_tokens_seen": 828504,
"step": 6415
},
{
"epoch": 5.9943977591036415,
"grad_norm": 3.2183749675750732,
"learning_rate": 2.0714977192799055e-05,
"loss": 0.1136,
"num_input_tokens_seen": 829176,
"step": 6420
},
{
"epoch": 5.99906629318394,
"grad_norm": 4.965052604675293,
"learning_rate": 2.067484516090744e-05,
"loss": 0.2705,
"num_input_tokens_seen": 829800,
"step": 6425
},
{
"epoch": 6.003734827264239,
"grad_norm": 2.7685673236846924,
"learning_rate": 2.063472461524184e-05,
"loss": 0.1354,
"num_input_tokens_seen": 830456,
"step": 6430
},
{
"epoch": 6.0056022408963585,
"eval_loss": 0.8559670448303223,
"eval_runtime": 3.8802,
"eval_samples_per_second": 61.337,
"eval_steps_per_second": 30.669,
"num_input_tokens_seen": 830744,
"step": 6432
},
{
"epoch": 6.008403361344538,
"grad_norm": 1.8242768049240112,
"learning_rate": 2.059461566234954e-05,
"loss": 0.1321,
"num_input_tokens_seen": 831144,
"step": 6435
},
{
"epoch": 6.0130718954248366,
"grad_norm": 1.6450865268707275,
"learning_rate": 2.0554518408747103e-05,
"loss": 0.0649,
"num_input_tokens_seen": 831768,
"step": 6440
},
{
"epoch": 6.017740429505135,
"grad_norm": 41.15055465698242,
"learning_rate": 2.051443296091998e-05,
"loss": 0.1878,
"num_input_tokens_seen": 832408,
"step": 6445
},
{
"epoch": 6.022408963585434,
"grad_norm": 1.7977114915847778,
"learning_rate": 2.0474359425322276e-05,
"loss": 0.2574,
"num_input_tokens_seen": 833096,
"step": 6450
},
{
"epoch": 6.027077497665733,
"grad_norm": 0.9406064748764038,
"learning_rate": 2.0434297908376486e-05,
"loss": 0.1256,
"num_input_tokens_seen": 833736,
"step": 6455
},
{
"epoch": 6.031746031746032,
"grad_norm": 1.812961220741272,
"learning_rate": 2.0394248516473156e-05,
"loss": 0.0935,
"num_input_tokens_seen": 834600,
"step": 6460
},
{
"epoch": 6.03641456582633,
"grad_norm": 8.429821968078613,
"learning_rate": 2.0354211355970633e-05,
"loss": 0.1205,
"num_input_tokens_seen": 835176,
"step": 6465
},
{
"epoch": 6.041083099906629,
"grad_norm": 7.432557106018066,
"learning_rate": 2.0314186533194807e-05,
"loss": 0.2049,
"num_input_tokens_seen": 835864,
"step": 6470
},
{
"epoch": 6.045751633986928,
"grad_norm": 8.452810287475586,
"learning_rate": 2.0274174154438787e-05,
"loss": 0.1331,
"num_input_tokens_seen": 836536,
"step": 6475
},
{
"epoch": 6.050420168067227,
"grad_norm": 1.3018027544021606,
"learning_rate": 2.0234174325962638e-05,
"loss": 0.0948,
"num_input_tokens_seen": 837320,
"step": 6480
},
{
"epoch": 6.0550887021475255,
"grad_norm": 5.2079057693481445,
"learning_rate": 2.0194187153993085e-05,
"loss": 0.0504,
"num_input_tokens_seen": 837912,
"step": 6485
},
{
"epoch": 6.059757236227824,
"grad_norm": 2.5622472763061523,
"learning_rate": 2.015421274472325e-05,
"loss": 0.0902,
"num_input_tokens_seen": 838584,
"step": 6490
},
{
"epoch": 6.064425770308123,
"grad_norm": 3.108729600906372,
"learning_rate": 2.0114251204312367e-05,
"loss": 0.1672,
"num_input_tokens_seen": 839192,
"step": 6495
},
{
"epoch": 6.069094304388422,
"grad_norm": 0.7695748805999756,
"learning_rate": 2.007430263888549e-05,
"loss": 0.063,
"num_input_tokens_seen": 839848,
"step": 6500
},
{
"epoch": 6.073762838468721,
"grad_norm": 3.6968908309936523,
"learning_rate": 2.003436715453321e-05,
"loss": 0.1887,
"num_input_tokens_seen": 840520,
"step": 6505
},
{
"epoch": 6.078431372549019,
"grad_norm": 4.965339183807373,
"learning_rate": 1.999444485731138e-05,
"loss": 0.3157,
"num_input_tokens_seen": 841064,
"step": 6510
},
{
"epoch": 6.083099906629318,
"grad_norm": 4.380035400390625,
"learning_rate": 1.9954535853240837e-05,
"loss": 0.1266,
"num_input_tokens_seen": 841688,
"step": 6515
},
{
"epoch": 6.087768440709617,
"grad_norm": 9.54501724243164,
"learning_rate": 1.991464024830712e-05,
"loss": 0.3874,
"num_input_tokens_seen": 842312,
"step": 6520
},
{
"epoch": 6.092436974789916,
"grad_norm": 2.171372652053833,
"learning_rate": 1.987475814846017e-05,
"loss": 0.1375,
"num_input_tokens_seen": 842888,
"step": 6525
},
{
"epoch": 6.097105508870214,
"grad_norm": 1.786184310913086,
"learning_rate": 1.9834889659614082e-05,
"loss": 0.0824,
"num_input_tokens_seen": 843416,
"step": 6530
},
{
"epoch": 6.101774042950513,
"grad_norm": 0.32185816764831543,
"learning_rate": 1.979503488764678e-05,
"loss": 0.2355,
"num_input_tokens_seen": 844008,
"step": 6535
},
{
"epoch": 6.106442577030812,
"grad_norm": 0.2377963364124298,
"learning_rate": 1.975519393839978e-05,
"loss": 0.1485,
"num_input_tokens_seen": 844600,
"step": 6540
},
{
"epoch": 6.111111111111111,
"grad_norm": 1.2736645936965942,
"learning_rate": 1.9715366917677878e-05,
"loss": 0.1354,
"num_input_tokens_seen": 845176,
"step": 6545
},
{
"epoch": 6.1157796451914095,
"grad_norm": 4.923788070678711,
"learning_rate": 1.967555393124889e-05,
"loss": 0.2031,
"num_input_tokens_seen": 845784,
"step": 6550
},
{
"epoch": 6.120448179271708,
"grad_norm": 10.04391860961914,
"learning_rate": 1.9635755084843366e-05,
"loss": 0.11,
"num_input_tokens_seen": 846424,
"step": 6555
},
{
"epoch": 6.125116713352007,
"grad_norm": 1.872023105621338,
"learning_rate": 1.959597048415428e-05,
"loss": 0.2214,
"num_input_tokens_seen": 847080,
"step": 6560
},
{
"epoch": 6.129785247432307,
"grad_norm": 3.639702081680298,
"learning_rate": 1.9556200234836792e-05,
"loss": 0.0403,
"num_input_tokens_seen": 847864,
"step": 6565
},
{
"epoch": 6.1344537815126055,
"grad_norm": 5.251626491546631,
"learning_rate": 1.9516444442507947e-05,
"loss": 0.1082,
"num_input_tokens_seen": 848488,
"step": 6570
},
{
"epoch": 6.139122315592904,
"grad_norm": 3.5228431224823,
"learning_rate": 1.9476703212746413e-05,
"loss": 0.058,
"num_input_tokens_seen": 849144,
"step": 6575
},
{
"epoch": 6.143790849673203,
"grad_norm": 4.5999650955200195,
"learning_rate": 1.9436976651092144e-05,
"loss": 0.1331,
"num_input_tokens_seen": 849768,
"step": 6580
},
{
"epoch": 6.148459383753502,
"grad_norm": 5.804399013519287,
"learning_rate": 1.9397264863046176e-05,
"loss": 0.3311,
"num_input_tokens_seen": 850472,
"step": 6585
},
{
"epoch": 6.1531279178338005,
"grad_norm": 2.8000004291534424,
"learning_rate": 1.9357567954070294e-05,
"loss": 0.1268,
"num_input_tokens_seen": 851176,
"step": 6590
},
{
"epoch": 6.157796451914099,
"grad_norm": 4.5097503662109375,
"learning_rate": 1.9317886029586778e-05,
"loss": 0.092,
"num_input_tokens_seen": 851944,
"step": 6595
},
{
"epoch": 6.162464985994398,
"grad_norm": 1.2983640432357788,
"learning_rate": 1.927821919497812e-05,
"loss": 0.0387,
"num_input_tokens_seen": 852552,
"step": 6600
},
{
"epoch": 6.167133520074697,
"grad_norm": 7.8906354904174805,
"learning_rate": 1.9238567555586714e-05,
"loss": 0.172,
"num_input_tokens_seen": 853224,
"step": 6605
},
{
"epoch": 6.171802054154996,
"grad_norm": 8.38399887084961,
"learning_rate": 1.9198931216714614e-05,
"loss": 0.2156,
"num_input_tokens_seen": 853736,
"step": 6610
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.570091724395752,
"learning_rate": 1.9159310283623245e-05,
"loss": 0.0985,
"num_input_tokens_seen": 854360,
"step": 6615
},
{
"epoch": 6.181139122315593,
"grad_norm": 0.9433493614196777,
"learning_rate": 1.911970486153312e-05,
"loss": 0.2101,
"num_input_tokens_seen": 855048,
"step": 6620
},
{
"epoch": 6.185807656395892,
"grad_norm": 12.825504302978516,
"learning_rate": 1.908011505562356e-05,
"loss": 0.1157,
"num_input_tokens_seen": 855720,
"step": 6625
},
{
"epoch": 6.190476190476191,
"grad_norm": 7.946544647216797,
"learning_rate": 1.9040540971032392e-05,
"loss": 0.1044,
"num_input_tokens_seen": 856264,
"step": 6630
},
{
"epoch": 6.1951447245564895,
"grad_norm": 6.417392253875732,
"learning_rate": 1.900098271285572e-05,
"loss": 0.2791,
"num_input_tokens_seen": 856856,
"step": 6635
},
{
"epoch": 6.199813258636788,
"grad_norm": 0.97379469871521,
"learning_rate": 1.896144038614761e-05,
"loss": 0.0588,
"num_input_tokens_seen": 857576,
"step": 6640
},
{
"epoch": 6.204481792717087,
"grad_norm": 0.8709614872932434,
"learning_rate": 1.8921914095919814e-05,
"loss": 0.098,
"num_input_tokens_seen": 858216,
"step": 6645
},
{
"epoch": 6.209150326797386,
"grad_norm": 17.944868087768555,
"learning_rate": 1.8882403947141507e-05,
"loss": 0.3388,
"num_input_tokens_seen": 858904,
"step": 6650
},
{
"epoch": 6.2138188608776845,
"grad_norm": 0.9572569727897644,
"learning_rate": 1.8842910044738975e-05,
"loss": 0.012,
"num_input_tokens_seen": 859656,
"step": 6655
},
{
"epoch": 6.218487394957983,
"grad_norm": 4.014954090118408,
"learning_rate": 1.8803432493595387e-05,
"loss": 0.1918,
"num_input_tokens_seen": 860264,
"step": 6660
},
{
"epoch": 6.223155929038282,
"grad_norm": 0.572869598865509,
"learning_rate": 1.876397139855047e-05,
"loss": 0.0631,
"num_input_tokens_seen": 860920,
"step": 6665
},
{
"epoch": 6.227824463118581,
"grad_norm": 3.1461665630340576,
"learning_rate": 1.8724526864400248e-05,
"loss": 0.0422,
"num_input_tokens_seen": 861528,
"step": 6670
},
{
"epoch": 6.23249299719888,
"grad_norm": 10.453712463378906,
"learning_rate": 1.8685098995896792e-05,
"loss": 0.1564,
"num_input_tokens_seen": 862104,
"step": 6675
},
{
"epoch": 6.237161531279178,
"grad_norm": 0.3828265070915222,
"learning_rate": 1.8645687897747864e-05,
"loss": 0.2306,
"num_input_tokens_seen": 862840,
"step": 6680
},
{
"epoch": 6.241830065359477,
"grad_norm": 8.274612426757812,
"learning_rate": 1.8606293674616737e-05,
"loss": 0.1782,
"num_input_tokens_seen": 863480,
"step": 6685
},
{
"epoch": 6.246498599439776,
"grad_norm": 3.889509677886963,
"learning_rate": 1.856691643112184e-05,
"loss": 0.1569,
"num_input_tokens_seen": 864152,
"step": 6690
},
{
"epoch": 6.251167133520075,
"grad_norm": 0.07747960835695267,
"learning_rate": 1.8527556271836524e-05,
"loss": 0.0345,
"num_input_tokens_seen": 864824,
"step": 6695
},
{
"epoch": 6.2558356676003735,
"grad_norm": 1.108803629875183,
"learning_rate": 1.848821330128878e-05,
"loss": 0.0478,
"num_input_tokens_seen": 865464,
"step": 6700
},
{
"epoch": 6.260504201680672,
"grad_norm": 1.7075055837631226,
"learning_rate": 1.844888762396092e-05,
"loss": 0.0651,
"num_input_tokens_seen": 866184,
"step": 6705
},
{
"epoch": 6.265172735760971,
"grad_norm": 3.643524169921875,
"learning_rate": 1.8409579344289342e-05,
"loss": 0.0464,
"num_input_tokens_seen": 866872,
"step": 6710
},
{
"epoch": 6.26984126984127,
"grad_norm": 13.78272533416748,
"learning_rate": 1.8370288566664262e-05,
"loss": 0.2371,
"num_input_tokens_seen": 867496,
"step": 6715
},
{
"epoch": 6.2745098039215685,
"grad_norm": 3.8864872455596924,
"learning_rate": 1.83310153954294e-05,
"loss": 0.4803,
"num_input_tokens_seen": 868072,
"step": 6720
},
{
"epoch": 6.279178338001867,
"grad_norm": 3.947277069091797,
"learning_rate": 1.829175993488172e-05,
"loss": 0.1295,
"num_input_tokens_seen": 868744,
"step": 6725
},
{
"epoch": 6.283846872082166,
"grad_norm": 11.612577438354492,
"learning_rate": 1.8252522289271142e-05,
"loss": 0.0762,
"num_input_tokens_seen": 869416,
"step": 6730
},
{
"epoch": 6.288515406162465,
"grad_norm": 4.163094520568848,
"learning_rate": 1.8213302562800294e-05,
"loss": 0.2386,
"num_input_tokens_seen": 870056,
"step": 6735
},
{
"epoch": 6.293183940242764,
"grad_norm": 8.031195640563965,
"learning_rate": 1.817410085962421e-05,
"loss": 0.206,
"num_input_tokens_seen": 870696,
"step": 6740
},
{
"epoch": 6.297852474323062,
"grad_norm": 5.002910614013672,
"learning_rate": 1.8134917283850053e-05,
"loss": 0.109,
"num_input_tokens_seen": 871272,
"step": 6745
},
{
"epoch": 6.302521008403361,
"grad_norm": 5.592998504638672,
"learning_rate": 1.8095751939536866e-05,
"loss": 0.1421,
"num_input_tokens_seen": 871896,
"step": 6750
},
{
"epoch": 6.30718954248366,
"grad_norm": 2.709061622619629,
"learning_rate": 1.8056604930695232e-05,
"loss": 0.0284,
"num_input_tokens_seen": 872616,
"step": 6755
},
{
"epoch": 6.311858076563959,
"grad_norm": 1.0777950286865234,
"learning_rate": 1.8017476361287087e-05,
"loss": 0.1222,
"num_input_tokens_seen": 873336,
"step": 6760
},
{
"epoch": 6.3165266106442575,
"grad_norm": 0.15798994898796082,
"learning_rate": 1.797836633522538e-05,
"loss": 0.1798,
"num_input_tokens_seen": 874008,
"step": 6765
},
{
"epoch": 6.321195144724556,
"grad_norm": 1.92898428440094,
"learning_rate": 1.7939274956373813e-05,
"loss": 0.1037,
"num_input_tokens_seen": 874696,
"step": 6770
},
{
"epoch": 6.325863678804855,
"grad_norm": 2.868745803833008,
"learning_rate": 1.7900202328546557e-05,
"loss": 0.0836,
"num_input_tokens_seen": 875336,
"step": 6775
},
{
"epoch": 6.330532212885154,
"grad_norm": 3.641486167907715,
"learning_rate": 1.7861148555508007e-05,
"loss": 0.2157,
"num_input_tokens_seen": 875960,
"step": 6780
},
{
"epoch": 6.3352007469654525,
"grad_norm": 1.8708469867706299,
"learning_rate": 1.7822113740972478e-05,
"loss": 0.0314,
"num_input_tokens_seen": 876712,
"step": 6785
},
{
"epoch": 6.339869281045751,
"grad_norm": 0.6900608539581299,
"learning_rate": 1.778309798860393e-05,
"loss": 0.09,
"num_input_tokens_seen": 877352,
"step": 6790
},
{
"epoch": 6.34453781512605,
"grad_norm": 10.317023277282715,
"learning_rate": 1.7744101402015716e-05,
"loss": 0.3264,
"num_input_tokens_seen": 877944,
"step": 6795
},
{
"epoch": 6.349206349206349,
"grad_norm": 3.3235373497009277,
"learning_rate": 1.770512408477026e-05,
"loss": 0.0738,
"num_input_tokens_seen": 878568,
"step": 6800
},
{
"epoch": 6.353874883286648,
"grad_norm": 4.6949334144592285,
"learning_rate": 1.7666166140378852e-05,
"loss": 0.2285,
"num_input_tokens_seen": 879128,
"step": 6805
},
{
"epoch": 6.358543417366946,
"grad_norm": 4.452512741088867,
"learning_rate": 1.7627227672301302e-05,
"loss": 0.2037,
"num_input_tokens_seen": 879784,
"step": 6810
},
{
"epoch": 6.363211951447245,
"grad_norm": 2.7044475078582764,
"learning_rate": 1.7588308783945717e-05,
"loss": 0.053,
"num_input_tokens_seen": 880536,
"step": 6815
},
{
"epoch": 6.367880485527545,
"grad_norm": 15.64172077178955,
"learning_rate": 1.7549409578668206e-05,
"loss": 0.2094,
"num_input_tokens_seen": 881128,
"step": 6820
},
{
"epoch": 6.372549019607844,
"grad_norm": 19.395587921142578,
"learning_rate": 1.7510530159772586e-05,
"loss": 0.281,
"num_input_tokens_seen": 881736,
"step": 6825
},
{
"epoch": 6.377217553688142,
"grad_norm": 2.9765985012054443,
"learning_rate": 1.7471670630510152e-05,
"loss": 0.1436,
"num_input_tokens_seen": 882424,
"step": 6830
},
{
"epoch": 6.381886087768441,
"grad_norm": 1.3620012998580933,
"learning_rate": 1.7432831094079355e-05,
"loss": 0.0742,
"num_input_tokens_seen": 883176,
"step": 6835
},
{
"epoch": 6.38655462184874,
"grad_norm": 3.0921950340270996,
"learning_rate": 1.739401165362557e-05,
"loss": 0.1604,
"num_input_tokens_seen": 883816,
"step": 6840
},
{
"epoch": 6.391223155929039,
"grad_norm": 0.9669636487960815,
"learning_rate": 1.7355212412240817e-05,
"loss": 0.137,
"num_input_tokens_seen": 884440,
"step": 6845
},
{
"epoch": 6.395891690009337,
"grad_norm": 2.259877920150757,
"learning_rate": 1.7316433472963427e-05,
"loss": 0.4171,
"num_input_tokens_seen": 885144,
"step": 6850
},
{
"epoch": 6.400560224089636,
"grad_norm": 1.4858440160751343,
"learning_rate": 1.7277674938777855e-05,
"loss": 0.0331,
"num_input_tokens_seen": 885928,
"step": 6855
},
{
"epoch": 6.405228758169935,
"grad_norm": 6.3448591232299805,
"learning_rate": 1.723893691261435e-05,
"loss": 0.2404,
"num_input_tokens_seen": 886568,
"step": 6860
},
{
"epoch": 6.409897292250234,
"grad_norm": 7.592992305755615,
"learning_rate": 1.7200219497348707e-05,
"loss": 0.192,
"num_input_tokens_seen": 887224,
"step": 6865
},
{
"epoch": 6.4145658263305325,
"grad_norm": 1.5566842555999756,
"learning_rate": 1.716152279580199e-05,
"loss": 0.1125,
"num_input_tokens_seen": 887864,
"step": 6870
},
{
"epoch": 6.419234360410831,
"grad_norm": 6.8202128410339355,
"learning_rate": 1.712284691074022e-05,
"loss": 0.1109,
"num_input_tokens_seen": 888552,
"step": 6875
},
{
"epoch": 6.42390289449113,
"grad_norm": 6.580783367156982,
"learning_rate": 1.7084191944874174e-05,
"loss": 0.2174,
"num_input_tokens_seen": 889160,
"step": 6880
},
{
"epoch": 6.428571428571429,
"grad_norm": 1.1786524057388306,
"learning_rate": 1.7045558000859068e-05,
"loss": 0.0866,
"num_input_tokens_seen": 889800,
"step": 6885
},
{
"epoch": 6.433239962651728,
"grad_norm": 0.5497421026229858,
"learning_rate": 1.7006945181294275e-05,
"loss": 0.0944,
"num_input_tokens_seen": 890440,
"step": 6890
},
{
"epoch": 6.437908496732026,
"grad_norm": 0.2169853150844574,
"learning_rate": 1.6968353588723084e-05,
"loss": 0.2037,
"num_input_tokens_seen": 891112,
"step": 6895
},
{
"epoch": 6.442577030812325,
"grad_norm": 2.8242037296295166,
"learning_rate": 1.6929783325632393e-05,
"loss": 0.0724,
"num_input_tokens_seen": 891816,
"step": 6900
},
{
"epoch": 6.447245564892624,
"grad_norm": 5.830324172973633,
"learning_rate": 1.6891234494452476e-05,
"loss": 0.2002,
"num_input_tokens_seen": 892408,
"step": 6905
},
{
"epoch": 6.451914098972923,
"grad_norm": 0.2872973084449768,
"learning_rate": 1.6852707197556677e-05,
"loss": 0.043,
"num_input_tokens_seen": 893048,
"step": 6910
},
{
"epoch": 6.456582633053221,
"grad_norm": 4.120421409606934,
"learning_rate": 1.6814201537261162e-05,
"loss": 0.187,
"num_input_tokens_seen": 893672,
"step": 6915
},
{
"epoch": 6.46125116713352,
"grad_norm": 5.557107448577881,
"learning_rate": 1.677571761582464e-05,
"loss": 0.1201,
"num_input_tokens_seen": 894344,
"step": 6920
},
{
"epoch": 6.465919701213819,
"grad_norm": 7.2985076904296875,
"learning_rate": 1.6737255535448063e-05,
"loss": 0.2039,
"num_input_tokens_seen": 894920,
"step": 6925
},
{
"epoch": 6.470588235294118,
"grad_norm": 0.9252108931541443,
"learning_rate": 1.669881539827441e-05,
"loss": 0.1017,
"num_input_tokens_seen": 895512,
"step": 6930
},
{
"epoch": 6.4752567693744165,
"grad_norm": 13.045816421508789,
"learning_rate": 1.6660397306388364e-05,
"loss": 0.1111,
"num_input_tokens_seen": 896216,
"step": 6935
},
{
"epoch": 6.479925303454715,
"grad_norm": 3.7154791355133057,
"learning_rate": 1.662200136181609e-05,
"loss": 0.1685,
"num_input_tokens_seen": 896856,
"step": 6940
},
{
"epoch": 6.484593837535014,
"grad_norm": 1.7174103260040283,
"learning_rate": 1.6583627666524902e-05,
"loss": 0.1315,
"num_input_tokens_seen": 897560,
"step": 6945
},
{
"epoch": 6.489262371615313,
"grad_norm": 2.6956374645233154,
"learning_rate": 1.6545276322423054e-05,
"loss": 0.102,
"num_input_tokens_seen": 898216,
"step": 6950
},
{
"epoch": 6.493930905695612,
"grad_norm": 2.201688766479492,
"learning_rate": 1.650694743135942e-05,
"loss": 0.065,
"num_input_tokens_seen": 898840,
"step": 6955
},
{
"epoch": 6.49859943977591,
"grad_norm": 7.494628429412842,
"learning_rate": 1.6468641095123273e-05,
"loss": 0.3311,
"num_input_tokens_seen": 899464,
"step": 6960
},
{
"epoch": 6.503267973856209,
"grad_norm": 3.3498306274414062,
"learning_rate": 1.643035741544398e-05,
"loss": 0.0319,
"num_input_tokens_seen": 900152,
"step": 6965
},
{
"epoch": 6.506069094304388,
"eval_loss": 0.9735978245735168,
"eval_runtime": 3.8751,
"eval_samples_per_second": 61.418,
"eval_steps_per_second": 30.709,
"num_input_tokens_seen": 900568,
"step": 6968
},
{
"epoch": 6.507936507936508,
"grad_norm": 3.840616464614868,
"learning_rate": 1.6392096493990713e-05,
"loss": 0.1514,
"num_input_tokens_seen": 900792,
"step": 6970
},
{
"epoch": 6.512605042016807,
"grad_norm": 0.730390191078186,
"learning_rate": 1.6353858432372228e-05,
"loss": 0.3106,
"num_input_tokens_seen": 901464,
"step": 6975
},
{
"epoch": 6.5172735760971054,
"grad_norm": 6.283236980438232,
"learning_rate": 1.631564333213658e-05,
"loss": 0.2399,
"num_input_tokens_seen": 902088,
"step": 6980
},
{
"epoch": 6.521942110177404,
"grad_norm": 5.225600242614746,
"learning_rate": 1.6277451294770834e-05,
"loss": 0.1521,
"num_input_tokens_seen": 902712,
"step": 6985
},
{
"epoch": 6.526610644257703,
"grad_norm": 0.35495224595069885,
"learning_rate": 1.6239282421700807e-05,
"loss": 0.2793,
"num_input_tokens_seen": 903448,
"step": 6990
},
{
"epoch": 6.531279178338002,
"grad_norm": 5.897226810455322,
"learning_rate": 1.6201136814290802e-05,
"loss": 0.0455,
"num_input_tokens_seen": 904088,
"step": 6995
},
{
"epoch": 6.5359477124183005,
"grad_norm": 9.795798301696777,
"learning_rate": 1.6163014573843323e-05,
"loss": 0.1692,
"num_input_tokens_seen": 904728,
"step": 7000
},
{
"epoch": 6.540616246498599,
"grad_norm": 5.636192321777344,
"learning_rate": 1.6124915801598852e-05,
"loss": 0.0688,
"num_input_tokens_seen": 905304,
"step": 7005
},
{
"epoch": 6.545284780578898,
"grad_norm": 14.28243350982666,
"learning_rate": 1.6086840598735507e-05,
"loss": 0.023,
"num_input_tokens_seen": 905896,
"step": 7010
},
{
"epoch": 6.549953314659197,
"grad_norm": 0.4303819537162781,
"learning_rate": 1.6048789066368858e-05,
"loss": 0.2272,
"num_input_tokens_seen": 906552,
"step": 7015
},
{
"epoch": 6.554621848739496,
"grad_norm": 8.530994415283203,
"learning_rate": 1.6010761305551553e-05,
"loss": 0.1954,
"num_input_tokens_seen": 907160,
"step": 7020
},
{
"epoch": 6.559290382819794,
"grad_norm": 0.3170902132987976,
"learning_rate": 1.5972757417273166e-05,
"loss": 0.1061,
"num_input_tokens_seen": 907784,
"step": 7025
},
{
"epoch": 6.563958916900093,
"grad_norm": 0.19078557193279266,
"learning_rate": 1.5934777502459855e-05,
"loss": 0.1826,
"num_input_tokens_seen": 908456,
"step": 7030
},
{
"epoch": 6.568627450980392,
"grad_norm": 3.0452566146850586,
"learning_rate": 1.5896821661974098e-05,
"loss": 0.116,
"num_input_tokens_seen": 909064,
"step": 7035
},
{
"epoch": 6.573295985060691,
"grad_norm": 3.596776008605957,
"learning_rate": 1.5858889996614468e-05,
"loss": 0.0836,
"num_input_tokens_seen": 909656,
"step": 7040
},
{
"epoch": 6.5779645191409895,
"grad_norm": 0.0865001380443573,
"learning_rate": 1.5820982607115297e-05,
"loss": 0.0932,
"num_input_tokens_seen": 910280,
"step": 7045
},
{
"epoch": 6.582633053221288,
"grad_norm": 6.569155216217041,
"learning_rate": 1.578309959414649e-05,
"loss": 0.1726,
"num_input_tokens_seen": 910872,
"step": 7050
},
{
"epoch": 6.587301587301587,
"grad_norm": 0.8266429305076599,
"learning_rate": 1.574524105831318e-05,
"loss": 0.1021,
"num_input_tokens_seen": 911560,
"step": 7055
},
{
"epoch": 6.591970121381886,
"grad_norm": 46.219268798828125,
"learning_rate": 1.5707407100155517e-05,
"loss": 0.2182,
"num_input_tokens_seen": 912184,
"step": 7060
},
{
"epoch": 6.5966386554621845,
"grad_norm": 14.47891902923584,
"learning_rate": 1.5669597820148398e-05,
"loss": 0.201,
"num_input_tokens_seen": 912824,
"step": 7065
},
{
"epoch": 6.601307189542483,
"grad_norm": 0.10131403058767319,
"learning_rate": 1.5631813318701138e-05,
"loss": 0.2368,
"num_input_tokens_seen": 913528,
"step": 7070
},
{
"epoch": 6.605975723622782,
"grad_norm": 4.317729949951172,
"learning_rate": 1.559405369615727e-05,
"loss": 0.1386,
"num_input_tokens_seen": 914168,
"step": 7075
},
{
"epoch": 6.610644257703081,
"grad_norm": 10.373050689697266,
"learning_rate": 1.5556319052794267e-05,
"loss": 0.2691,
"num_input_tokens_seen": 914856,
"step": 7080
},
{
"epoch": 6.61531279178338,
"grad_norm": 0.2939615845680237,
"learning_rate": 1.5518609488823258e-05,
"loss": 0.1503,
"num_input_tokens_seen": 915432,
"step": 7085
},
{
"epoch": 6.619981325863678,
"grad_norm": 2.4400196075439453,
"learning_rate": 1.5480925104388762e-05,
"loss": 0.0218,
"num_input_tokens_seen": 916072,
"step": 7090
},
{
"epoch": 6.624649859943977,
"grad_norm": 2.928807497024536,
"learning_rate": 1.544326599956844e-05,
"loss": 0.1635,
"num_input_tokens_seen": 916728,
"step": 7095
},
{
"epoch": 6.629318394024276,
"grad_norm": 0.10388700664043427,
"learning_rate": 1.54056322743728e-05,
"loss": 0.132,
"num_input_tokens_seen": 917304,
"step": 7100
},
{
"epoch": 6.633986928104575,
"grad_norm": 2.3083953857421875,
"learning_rate": 1.5368024028744976e-05,
"loss": 0.1171,
"num_input_tokens_seen": 918024,
"step": 7105
},
{
"epoch": 6.6386554621848735,
"grad_norm": 16.023496627807617,
"learning_rate": 1.5330441362560425e-05,
"loss": 0.1859,
"num_input_tokens_seen": 918760,
"step": 7110
},
{
"epoch": 6.643323996265173,
"grad_norm": 0.659245491027832,
"learning_rate": 1.5292884375626664e-05,
"loss": 0.0545,
"num_input_tokens_seen": 919400,
"step": 7115
},
{
"epoch": 6.647992530345472,
"grad_norm": 0.08721905946731567,
"learning_rate": 1.5255353167683017e-05,
"loss": 0.0533,
"num_input_tokens_seen": 919976,
"step": 7120
},
{
"epoch": 6.652661064425771,
"grad_norm": 7.850348949432373,
"learning_rate": 1.5217847838400362e-05,
"loss": 0.2076,
"num_input_tokens_seen": 920664,
"step": 7125
},
{
"epoch": 6.657329598506069,
"grad_norm": 1.92311692237854,
"learning_rate": 1.5180368487380839e-05,
"loss": 0.133,
"num_input_tokens_seen": 921336,
"step": 7130
},
{
"epoch": 6.661998132586368,
"grad_norm": 5.412630081176758,
"learning_rate": 1.5142915214157605e-05,
"loss": 0.2534,
"num_input_tokens_seen": 921976,
"step": 7135
},
{
"epoch": 6.666666666666667,
"grad_norm": 5.1437177658081055,
"learning_rate": 1.5105488118194544e-05,
"loss": 0.2151,
"num_input_tokens_seen": 922520,
"step": 7140
},
{
"epoch": 6.671335200746966,
"grad_norm": 2.109266996383667,
"learning_rate": 1.5068087298886041e-05,
"loss": 0.0364,
"num_input_tokens_seen": 923128,
"step": 7145
},
{
"epoch": 6.6760037348272645,
"grad_norm": 4.490639686584473,
"learning_rate": 1.5030712855556705e-05,
"loss": 0.136,
"num_input_tokens_seen": 923832,
"step": 7150
},
{
"epoch": 6.680672268907563,
"grad_norm": 5.7482008934021,
"learning_rate": 1.499336488746107e-05,
"loss": 0.1265,
"num_input_tokens_seen": 924552,
"step": 7155
},
{
"epoch": 6.685340802987862,
"grad_norm": 0.4535566568374634,
"learning_rate": 1.4956043493783401e-05,
"loss": 0.0958,
"num_input_tokens_seen": 925208,
"step": 7160
},
{
"epoch": 6.690009337068161,
"grad_norm": 1.7882232666015625,
"learning_rate": 1.4918748773637337e-05,
"loss": 0.0885,
"num_input_tokens_seen": 925976,
"step": 7165
},
{
"epoch": 6.69467787114846,
"grad_norm": 3.1727871894836426,
"learning_rate": 1.4881480826065736e-05,
"loss": 0.0629,
"num_input_tokens_seen": 926648,
"step": 7170
},
{
"epoch": 6.699346405228758,
"grad_norm": 0.20563183724880219,
"learning_rate": 1.4844239750040308e-05,
"loss": 0.0594,
"num_input_tokens_seen": 927272,
"step": 7175
},
{
"epoch": 6.704014939309057,
"grad_norm": 3.364445686340332,
"learning_rate": 1.4807025644461436e-05,
"loss": 0.0427,
"num_input_tokens_seen": 927880,
"step": 7180
},
{
"epoch": 6.708683473389356,
"grad_norm": 1.1256296634674072,
"learning_rate": 1.4769838608157877e-05,
"loss": 0.1734,
"num_input_tokens_seen": 928568,
"step": 7185
},
{
"epoch": 6.713352007469655,
"grad_norm": 0.5387191772460938,
"learning_rate": 1.4732678739886468e-05,
"loss": 0.0138,
"num_input_tokens_seen": 929224,
"step": 7190
},
{
"epoch": 6.718020541549953,
"grad_norm": 5.2028398513793945,
"learning_rate": 1.4695546138331928e-05,
"loss": 0.1926,
"num_input_tokens_seen": 929832,
"step": 7195
},
{
"epoch": 6.722689075630252,
"grad_norm": 0.20318341255187988,
"learning_rate": 1.465844090210655e-05,
"loss": 0.07,
"num_input_tokens_seen": 930488,
"step": 7200
},
{
"epoch": 6.727357609710551,
"grad_norm": 8.667532920837402,
"learning_rate": 1.4621363129749958e-05,
"loss": 0.098,
"num_input_tokens_seen": 931128,
"step": 7205
},
{
"epoch": 6.73202614379085,
"grad_norm": 3.0935397148132324,
"learning_rate": 1.4584312919728853e-05,
"loss": 0.0368,
"num_input_tokens_seen": 931688,
"step": 7210
},
{
"epoch": 6.7366946778711485,
"grad_norm": 3.340531349182129,
"learning_rate": 1.4547290370436707e-05,
"loss": 0.245,
"num_input_tokens_seen": 932360,
"step": 7215
},
{
"epoch": 6.741363211951447,
"grad_norm": 1.7328623533248901,
"learning_rate": 1.451029558019356e-05,
"loss": 0.0781,
"num_input_tokens_seen": 933000,
"step": 7220
},
{
"epoch": 6.746031746031746,
"grad_norm": 12.751051902770996,
"learning_rate": 1.4473328647245726e-05,
"loss": 0.0738,
"num_input_tokens_seen": 933672,
"step": 7225
},
{
"epoch": 6.750700280112045,
"grad_norm": 3.262699842453003,
"learning_rate": 1.4436389669765543e-05,
"loss": 0.1764,
"num_input_tokens_seen": 934296,
"step": 7230
},
{
"epoch": 6.755368814192344,
"grad_norm": 0.568540632724762,
"learning_rate": 1.4399478745851107e-05,
"loss": 0.0288,
"num_input_tokens_seen": 935048,
"step": 7235
},
{
"epoch": 6.760037348272642,
"grad_norm": 8.316237449645996,
"learning_rate": 1.4362595973526005e-05,
"loss": 0.1433,
"num_input_tokens_seen": 935720,
"step": 7240
},
{
"epoch": 6.764705882352941,
"grad_norm": 4.168196678161621,
"learning_rate": 1.4325741450739072e-05,
"loss": 0.1233,
"num_input_tokens_seen": 936360,
"step": 7245
},
{
"epoch": 6.76937441643324,
"grad_norm": 0.5175381302833557,
"learning_rate": 1.4288915275364107e-05,
"loss": 0.0847,
"num_input_tokens_seen": 937000,
"step": 7250
},
{
"epoch": 6.774042950513539,
"grad_norm": 0.7255039215087891,
"learning_rate": 1.425211754519964e-05,
"loss": 0.1635,
"num_input_tokens_seen": 937608,
"step": 7255
},
{
"epoch": 6.778711484593837,
"grad_norm": 1.7699940204620361,
"learning_rate": 1.4215348357968669e-05,
"loss": 0.0913,
"num_input_tokens_seen": 938200,
"step": 7260
},
{
"epoch": 6.783380018674136,
"grad_norm": 6.348759651184082,
"learning_rate": 1.4178607811318361e-05,
"loss": 0.2961,
"num_input_tokens_seen": 938824,
"step": 7265
},
{
"epoch": 6.788048552754435,
"grad_norm": 10.6443510055542,
"learning_rate": 1.4141896002819854e-05,
"loss": 0.0846,
"num_input_tokens_seen": 939528,
"step": 7270
},
{
"epoch": 6.792717086834734,
"grad_norm": 0.29247570037841797,
"learning_rate": 1.4105213029967945e-05,
"loss": 0.2019,
"num_input_tokens_seen": 940120,
"step": 7275
},
{
"epoch": 6.7973856209150325,
"grad_norm": 2.046889305114746,
"learning_rate": 1.4068558990180875e-05,
"loss": 0.2389,
"num_input_tokens_seen": 940792,
"step": 7280
},
{
"epoch": 6.802054154995331,
"grad_norm": 25.0539493560791,
"learning_rate": 1.4031933980800028e-05,
"loss": 0.5873,
"num_input_tokens_seen": 941416,
"step": 7285
},
{
"epoch": 6.80672268907563,
"grad_norm": 2.1702511310577393,
"learning_rate": 1.399533809908968e-05,
"loss": 0.0993,
"num_input_tokens_seen": 942024,
"step": 7290
},
{
"epoch": 6.811391223155929,
"grad_norm": 1.2021642923355103,
"learning_rate": 1.395877144223679e-05,
"loss": 0.1214,
"num_input_tokens_seen": 942712,
"step": 7295
},
{
"epoch": 6.816059757236228,
"grad_norm": 5.554089069366455,
"learning_rate": 1.3922234107350684e-05,
"loss": 0.0719,
"num_input_tokens_seen": 943368,
"step": 7300
},
{
"epoch": 6.820728291316526,
"grad_norm": 4.765249252319336,
"learning_rate": 1.388572619146283e-05,
"loss": 0.1859,
"num_input_tokens_seen": 944088,
"step": 7305
},
{
"epoch": 6.825396825396825,
"grad_norm": 3.2210237979888916,
"learning_rate": 1.3849247791526543e-05,
"loss": 0.165,
"num_input_tokens_seen": 944728,
"step": 7310
},
{
"epoch": 6.830065359477124,
"grad_norm": 0.09927883744239807,
"learning_rate": 1.3812799004416779e-05,
"loss": 0.0848,
"num_input_tokens_seen": 945352,
"step": 7315
},
{
"epoch": 6.834733893557423,
"grad_norm": 4.660610675811768,
"learning_rate": 1.3776379926929842e-05,
"loss": 0.1015,
"num_input_tokens_seen": 946008,
"step": 7320
},
{
"epoch": 6.839402427637721,
"grad_norm": 2.595057249069214,
"learning_rate": 1.3739990655783147e-05,
"loss": 0.0979,
"num_input_tokens_seen": 946584,
"step": 7325
},
{
"epoch": 6.84407096171802,
"grad_norm": 4.264656066894531,
"learning_rate": 1.3703631287614935e-05,
"loss": 0.2259,
"num_input_tokens_seen": 947224,
"step": 7330
},
{
"epoch": 6.848739495798319,
"grad_norm": 4.981330871582031,
"learning_rate": 1.3667301918984032e-05,
"loss": 0.3648,
"num_input_tokens_seen": 947896,
"step": 7335
},
{
"epoch": 6.853408029878618,
"grad_norm": 3.690455913543701,
"learning_rate": 1.3631002646369615e-05,
"loss": 0.0457,
"num_input_tokens_seen": 948552,
"step": 7340
},
{
"epoch": 6.8580765639589165,
"grad_norm": 1.3205879926681519,
"learning_rate": 1.3594733566170926e-05,
"loss": 0.0872,
"num_input_tokens_seen": 949112,
"step": 7345
},
{
"epoch": 6.862745098039216,
"grad_norm": 1.1191065311431885,
"learning_rate": 1.3558494774707026e-05,
"loss": 0.1099,
"num_input_tokens_seen": 949816,
"step": 7350
},
{
"epoch": 6.867413632119515,
"grad_norm": 6.446228504180908,
"learning_rate": 1.3522286368216553e-05,
"loss": 0.2311,
"num_input_tokens_seen": 950408,
"step": 7355
},
{
"epoch": 6.872082166199814,
"grad_norm": 2.946200370788574,
"learning_rate": 1.3486108442857412e-05,
"loss": 0.2259,
"num_input_tokens_seen": 951016,
"step": 7360
},
{
"epoch": 6.8767507002801125,
"grad_norm": 0.6405202746391296,
"learning_rate": 1.3449961094706606e-05,
"loss": 0.1807,
"num_input_tokens_seen": 951656,
"step": 7365
},
{
"epoch": 6.881419234360411,
"grad_norm": 3.961355209350586,
"learning_rate": 1.34138444197599e-05,
"loss": 0.1963,
"num_input_tokens_seen": 952328,
"step": 7370
},
{
"epoch": 6.88608776844071,
"grad_norm": 4.892948627471924,
"learning_rate": 1.3377758513931621e-05,
"loss": 0.1093,
"num_input_tokens_seen": 952968,
"step": 7375
},
{
"epoch": 6.890756302521009,
"grad_norm": 1.85221266746521,
"learning_rate": 1.3341703473054384e-05,
"loss": 0.2895,
"num_input_tokens_seen": 953640,
"step": 7380
},
{
"epoch": 6.895424836601308,
"grad_norm": 5.994509220123291,
"learning_rate": 1.3305679392878817e-05,
"loss": 0.2183,
"num_input_tokens_seen": 954248,
"step": 7385
},
{
"epoch": 6.900093370681606,
"grad_norm": 12.080351829528809,
"learning_rate": 1.3269686369073347e-05,
"loss": 0.2908,
"num_input_tokens_seen": 954856,
"step": 7390
},
{
"epoch": 6.904761904761905,
"grad_norm": 2.8995492458343506,
"learning_rate": 1.3233724497223914e-05,
"loss": 0.1119,
"num_input_tokens_seen": 955496,
"step": 7395
},
{
"epoch": 6.909430438842204,
"grad_norm": 3.4688098430633545,
"learning_rate": 1.3197793872833735e-05,
"loss": 0.1273,
"num_input_tokens_seen": 956072,
"step": 7400
},
{
"epoch": 6.914098972922503,
"grad_norm": 7.582507610321045,
"learning_rate": 1.316189459132305e-05,
"loss": 0.1408,
"num_input_tokens_seen": 956680,
"step": 7405
},
{
"epoch": 6.918767507002801,
"grad_norm": 9.396662712097168,
"learning_rate": 1.3126026748028843e-05,
"loss": 0.2068,
"num_input_tokens_seen": 957304,
"step": 7410
},
{
"epoch": 6.9234360410831,
"grad_norm": 1.258137822151184,
"learning_rate": 1.3090190438204607e-05,
"loss": 0.1737,
"num_input_tokens_seen": 957928,
"step": 7415
},
{
"epoch": 6.928104575163399,
"grad_norm": 8.436930656433105,
"learning_rate": 1.3054385757020119e-05,
"loss": 0.1514,
"num_input_tokens_seen": 958600,
"step": 7420
},
{
"epoch": 6.932773109243698,
"grad_norm": 3.9069759845733643,
"learning_rate": 1.3018612799561137e-05,
"loss": 0.0558,
"num_input_tokens_seen": 959240,
"step": 7425
},
{
"epoch": 6.9374416433239965,
"grad_norm": 13.299653053283691,
"learning_rate": 1.2982871660829191e-05,
"loss": 0.1571,
"num_input_tokens_seen": 959976,
"step": 7430
},
{
"epoch": 6.942110177404295,
"grad_norm": 2.508751630783081,
"learning_rate": 1.2947162435741278e-05,
"loss": 0.0605,
"num_input_tokens_seen": 960712,
"step": 7435
},
{
"epoch": 6.946778711484594,
"grad_norm": 9.50046443939209,
"learning_rate": 1.2911485219129677e-05,
"loss": 0.2175,
"num_input_tokens_seen": 961304,
"step": 7440
},
{
"epoch": 6.951447245564893,
"grad_norm": 0.7765032052993774,
"learning_rate": 1.2875840105741654e-05,
"loss": 0.2403,
"num_input_tokens_seen": 961976,
"step": 7445
},
{
"epoch": 6.956115779645192,
"grad_norm": 1.1209977865219116,
"learning_rate": 1.2840227190239195e-05,
"loss": 0.0854,
"num_input_tokens_seen": 962616,
"step": 7450
},
{
"epoch": 6.96078431372549,
"grad_norm": 1.1824790239334106,
"learning_rate": 1.2804646567198818e-05,
"loss": 0.0786,
"num_input_tokens_seen": 963224,
"step": 7455
},
{
"epoch": 6.965452847805789,
"grad_norm": 10.131546974182129,
"learning_rate": 1.2769098331111246e-05,
"loss": 0.1808,
"num_input_tokens_seen": 963736,
"step": 7460
},
{
"epoch": 6.970121381886088,
"grad_norm": 0.2978059649467468,
"learning_rate": 1.2733582576381211e-05,
"loss": 0.0198,
"num_input_tokens_seen": 964328,
"step": 7465
},
{
"epoch": 6.974789915966387,
"grad_norm": 0.7438709735870361,
"learning_rate": 1.269809939732719e-05,
"loss": 0.131,
"num_input_tokens_seen": 964952,
"step": 7470
},
{
"epoch": 6.979458450046685,
"grad_norm": 3.8623740673065186,
"learning_rate": 1.2662648888181145e-05,
"loss": 0.1148,
"num_input_tokens_seen": 965672,
"step": 7475
},
{
"epoch": 6.984126984126984,
"grad_norm": 2.075993061065674,
"learning_rate": 1.2627231143088259e-05,
"loss": 0.039,
"num_input_tokens_seen": 966408,
"step": 7480
},
{
"epoch": 6.988795518207283,
"grad_norm": 2.753314971923828,
"learning_rate": 1.2591846256106732e-05,
"loss": 0.1244,
"num_input_tokens_seen": 967032,
"step": 7485
},
{
"epoch": 6.993464052287582,
"grad_norm": 0.46315088868141174,
"learning_rate": 1.255649432120749e-05,
"loss": 0.1778,
"num_input_tokens_seen": 967592,
"step": 7490
},
{
"epoch": 6.9981325863678805,
"grad_norm": 4.949937343597412,
"learning_rate": 1.252117543227394e-05,
"loss": 0.0908,
"num_input_tokens_seen": 968200,
"step": 7495
},
{
"epoch": 7.002801120448179,
"grad_norm": 0.3900282382965088,
"learning_rate": 1.2485889683101758e-05,
"loss": 0.0472,
"num_input_tokens_seen": 968752,
"step": 7500
},
{
"epoch": 7.006535947712418,
"eval_loss": 0.969052255153656,
"eval_runtime": 3.8766,
"eval_samples_per_second": 61.394,
"eval_steps_per_second": 30.697,
"num_input_tokens_seen": 969200,
"step": 7504
},
{
"epoch": 7.007469654528478,
"grad_norm": 1.4273070096969604,
"learning_rate": 1.2450637167398571e-05,
"loss": 0.0418,
"num_input_tokens_seen": 969344,
"step": 7505
},
{
"epoch": 7.012138188608777,
"grad_norm": 16.452985763549805,
"learning_rate": 1.2415417978783777e-05,
"loss": 0.1801,
"num_input_tokens_seen": 969920,
"step": 7510
},
{
"epoch": 7.016806722689076,
"grad_norm": 0.042361099272966385,
"learning_rate": 1.2380232210788265e-05,
"loss": 0.0203,
"num_input_tokens_seen": 970560,
"step": 7515
},
{
"epoch": 7.021475256769374,
"grad_norm": 3.3039093017578125,
"learning_rate": 1.2345079956854164e-05,
"loss": 0.0695,
"num_input_tokens_seen": 971152,
"step": 7520
},
{
"epoch": 7.026143790849673,
"grad_norm": 1.6099435091018677,
"learning_rate": 1.2309961310334609e-05,
"loss": 0.0611,
"num_input_tokens_seen": 971728,
"step": 7525
},
{
"epoch": 7.030812324929972,
"grad_norm": 5.736599922180176,
"learning_rate": 1.2274876364493474e-05,
"loss": 0.0954,
"num_input_tokens_seen": 972384,
"step": 7530
},
{
"epoch": 7.035480859010271,
"grad_norm": 9.741619110107422,
"learning_rate": 1.2239825212505124e-05,
"loss": 0.0737,
"num_input_tokens_seen": 973120,
"step": 7535
},
{
"epoch": 7.040149393090569,
"grad_norm": 2.7313551902770996,
"learning_rate": 1.2204807947454203e-05,
"loss": 0.1562,
"num_input_tokens_seen": 973792,
"step": 7540
},
{
"epoch": 7.044817927170868,
"grad_norm": 4.553571701049805,
"learning_rate": 1.2169824662335352e-05,
"loss": 0.0597,
"num_input_tokens_seen": 974400,
"step": 7545
},
{
"epoch": 7.049486461251167,
"grad_norm": 0.03360239043831825,
"learning_rate": 1.2134875450052979e-05,
"loss": 0.0293,
"num_input_tokens_seen": 974992,
"step": 7550
},
{
"epoch": 7.054154995331466,
"grad_norm": 4.209261417388916,
"learning_rate": 1.2099960403420985e-05,
"loss": 0.0987,
"num_input_tokens_seen": 975568,
"step": 7555
},
{
"epoch": 7.0588235294117645,
"grad_norm": 9.991903305053711,
"learning_rate": 1.2065079615162559e-05,
"loss": 0.0483,
"num_input_tokens_seen": 976288,
"step": 7560
},
{
"epoch": 7.063492063492063,
"grad_norm": 2.156618356704712,
"learning_rate": 1.2030233177909896e-05,
"loss": 0.0854,
"num_input_tokens_seen": 976848,
"step": 7565
},
{
"epoch": 7.068160597572362,
"grad_norm": 5.080526351928711,
"learning_rate": 1.1995421184203992e-05,
"loss": 0.1016,
"num_input_tokens_seen": 977568,
"step": 7570
},
{
"epoch": 7.072829131652661,
"grad_norm": 1.6115537881851196,
"learning_rate": 1.196064372649434e-05,
"loss": 0.0218,
"num_input_tokens_seen": 978240,
"step": 7575
},
{
"epoch": 7.07749766573296,
"grad_norm": 1.5015766620635986,
"learning_rate": 1.1925900897138718e-05,
"loss": 0.034,
"num_input_tokens_seen": 978928,
"step": 7580
},
{
"epoch": 7.082166199813258,
"grad_norm": 2.4616377353668213,
"learning_rate": 1.189119278840296e-05,
"loss": 0.0496,
"num_input_tokens_seen": 979840,
"step": 7585
},
{
"epoch": 7.086834733893557,
"grad_norm": 18.41492462158203,
"learning_rate": 1.1856519492460694e-05,
"loss": 0.2292,
"num_input_tokens_seen": 980336,
"step": 7590
},
{
"epoch": 7.091503267973856,
"grad_norm": 1.4025119543075562,
"learning_rate": 1.1821881101393084e-05,
"loss": 0.0471,
"num_input_tokens_seen": 980992,
"step": 7595
},
{
"epoch": 7.096171802054155,
"grad_norm": 9.339607238769531,
"learning_rate": 1.1787277707188616e-05,
"loss": 0.1712,
"num_input_tokens_seen": 981760,
"step": 7600
},
{
"epoch": 7.100840336134453,
"grad_norm": 1.5936764478683472,
"learning_rate": 1.1752709401742799e-05,
"loss": 0.0083,
"num_input_tokens_seen": 982352,
"step": 7605
},
{
"epoch": 7.105508870214752,
"grad_norm": 0.4758903682231903,
"learning_rate": 1.1718176276858001e-05,
"loss": 0.0177,
"num_input_tokens_seen": 982960,
"step": 7610
},
{
"epoch": 7.110177404295051,
"grad_norm": 9.756832122802734,
"learning_rate": 1.1683678424243122e-05,
"loss": 0.0674,
"num_input_tokens_seen": 983520,
"step": 7615
},
{
"epoch": 7.11484593837535,
"grad_norm": 3.2350313663482666,
"learning_rate": 1.1649215935513422e-05,
"loss": 0.0395,
"num_input_tokens_seen": 984240,
"step": 7620
},
{
"epoch": 7.1195144724556485,
"grad_norm": 13.398826599121094,
"learning_rate": 1.161478890219024e-05,
"loss": 0.0716,
"num_input_tokens_seen": 984912,
"step": 7625
},
{
"epoch": 7.124183006535947,
"grad_norm": 0.20864218473434448,
"learning_rate": 1.1580397415700733e-05,
"loss": 0.0486,
"num_input_tokens_seen": 985520,
"step": 7630
},
{
"epoch": 7.128851540616246,
"grad_norm": 5.6734299659729,
"learning_rate": 1.1546041567377686e-05,
"loss": 0.0489,
"num_input_tokens_seen": 986176,
"step": 7635
},
{
"epoch": 7.133520074696546,
"grad_norm": 0.6356542110443115,
"learning_rate": 1.1511721448459223e-05,
"loss": 0.1587,
"num_input_tokens_seen": 986768,
"step": 7640
},
{
"epoch": 7.1381886087768445,
"grad_norm": 0.35997679829597473,
"learning_rate": 1.14774371500886e-05,
"loss": 0.0241,
"num_input_tokens_seen": 987376,
"step": 7645
},
{
"epoch": 7.142857142857143,
"grad_norm": 1.3711936473846436,
"learning_rate": 1.1443188763313915e-05,
"loss": 0.108,
"num_input_tokens_seen": 987936,
"step": 7650
},
{
"epoch": 7.147525676937442,
"grad_norm": 0.11039953678846359,
"learning_rate": 1.1408976379087932e-05,
"loss": 0.0807,
"num_input_tokens_seen": 988592,
"step": 7655
},
{
"epoch": 7.152194211017741,
"grad_norm": 3.1219189167022705,
"learning_rate": 1.1374800088267767e-05,
"loss": 0.2025,
"num_input_tokens_seen": 989168,
"step": 7660
},
{
"epoch": 7.1568627450980395,
"grad_norm": 1.7455995082855225,
"learning_rate": 1.1340659981614715e-05,
"loss": 0.1152,
"num_input_tokens_seen": 989696,
"step": 7665
},
{
"epoch": 7.161531279178338,
"grad_norm": 13.108963966369629,
"learning_rate": 1.1306556149793971e-05,
"loss": 0.0463,
"num_input_tokens_seen": 990288,
"step": 7670
},
{
"epoch": 7.166199813258637,
"grad_norm": 2.7943058013916016,
"learning_rate": 1.1272488683374369e-05,
"loss": 0.0585,
"num_input_tokens_seen": 990880,
"step": 7675
},
{
"epoch": 7.170868347338936,
"grad_norm": 3.618239641189575,
"learning_rate": 1.1238457672828204e-05,
"loss": 0.0628,
"num_input_tokens_seen": 991472,
"step": 7680
},
{
"epoch": 7.175536881419235,
"grad_norm": 1.0060943365097046,
"learning_rate": 1.1204463208530936e-05,
"loss": 0.1359,
"num_input_tokens_seen": 992048,
"step": 7685
},
{
"epoch": 7.180205415499533,
"grad_norm": 0.04717274010181427,
"learning_rate": 1.1170505380760984e-05,
"loss": 0.229,
"num_input_tokens_seen": 992704,
"step": 7690
},
{
"epoch": 7.184873949579832,
"grad_norm": 0.1487988978624344,
"learning_rate": 1.1136584279699458e-05,
"loss": 0.0174,
"num_input_tokens_seen": 993472,
"step": 7695
},
{
"epoch": 7.189542483660131,
"grad_norm": 11.722426414489746,
"learning_rate": 1.1102699995429921e-05,
"loss": 0.0752,
"num_input_tokens_seen": 994064,
"step": 7700
},
{
"epoch": 7.19421101774043,
"grad_norm": 9.349660873413086,
"learning_rate": 1.1068852617938196e-05,
"loss": 0.2765,
"num_input_tokens_seen": 994656,
"step": 7705
},
{
"epoch": 7.1988795518207285,
"grad_norm": 1.1677833795547485,
"learning_rate": 1.1035042237112076e-05,
"loss": 0.0618,
"num_input_tokens_seen": 995312,
"step": 7710
},
{
"epoch": 7.203548085901027,
"grad_norm": 0.6099132895469666,
"learning_rate": 1.1001268942741099e-05,
"loss": 0.0155,
"num_input_tokens_seen": 995984,
"step": 7715
},
{
"epoch": 7.208216619981326,
"grad_norm": 3.9095051288604736,
"learning_rate": 1.0967532824516334e-05,
"loss": 0.0812,
"num_input_tokens_seen": 996624,
"step": 7720
},
{
"epoch": 7.212885154061625,
"grad_norm": 0.8077293634414673,
"learning_rate": 1.0933833972030081e-05,
"loss": 0.0118,
"num_input_tokens_seen": 997184,
"step": 7725
},
{
"epoch": 7.2175536881419236,
"grad_norm": 0.04219364747405052,
"learning_rate": 1.0900172474775714e-05,
"loss": 0.0526,
"num_input_tokens_seen": 997888,
"step": 7730
},
{
"epoch": 7.222222222222222,
"grad_norm": 4.558381080627441,
"learning_rate": 1.086654842214739e-05,
"loss": 0.0981,
"num_input_tokens_seen": 998496,
"step": 7735
},
{
"epoch": 7.226890756302521,
"grad_norm": 4.1472086906433105,
"learning_rate": 1.0832961903439815e-05,
"loss": 0.1938,
"num_input_tokens_seen": 999136,
"step": 7740
},
{
"epoch": 7.23155929038282,
"grad_norm": 1.4830154180526733,
"learning_rate": 1.0799413007848039e-05,
"loss": 0.1462,
"num_input_tokens_seen": 999728,
"step": 7745
},
{
"epoch": 7.236227824463119,
"grad_norm": 0.15791846811771393,
"learning_rate": 1.0765901824467167e-05,
"loss": 0.0642,
"num_input_tokens_seen": 1000384,
"step": 7750
},
{
"epoch": 7.240896358543417,
"grad_norm": 7.790059566497803,
"learning_rate": 1.0732428442292174e-05,
"loss": 0.1602,
"num_input_tokens_seen": 1001088,
"step": 7755
},
{
"epoch": 7.245564892623716,
"grad_norm": 1.7139387130737305,
"learning_rate": 1.0698992950217649e-05,
"loss": 0.0208,
"num_input_tokens_seen": 1001808,
"step": 7760
},
{
"epoch": 7.250233426704015,
"grad_norm": 6.180329322814941,
"learning_rate": 1.0665595437037545e-05,
"loss": 0.5134,
"num_input_tokens_seen": 1002560,
"step": 7765
},
{
"epoch": 7.254901960784314,
"grad_norm": 0.14951446652412415,
"learning_rate": 1.0632235991444972e-05,
"loss": 0.0613,
"num_input_tokens_seen": 1003200,
"step": 7770
},
{
"epoch": 7.2595704948646125,
"grad_norm": 0.27165699005126953,
"learning_rate": 1.0598914702031923e-05,
"loss": 0.0963,
"num_input_tokens_seen": 1003776,
"step": 7775
},
{
"epoch": 7.264239028944911,
"grad_norm": 2.8281211853027344,
"learning_rate": 1.0565631657289064e-05,
"loss": 0.1009,
"num_input_tokens_seen": 1004480,
"step": 7780
},
{
"epoch": 7.26890756302521,
"grad_norm": 0.6824027895927429,
"learning_rate": 1.0532386945605508e-05,
"loss": 0.0515,
"num_input_tokens_seen": 1005120,
"step": 7785
},
{
"epoch": 7.273576097105509,
"grad_norm": 0.21875107288360596,
"learning_rate": 1.0499180655268562e-05,
"loss": 0.0814,
"num_input_tokens_seen": 1005712,
"step": 7790
},
{
"epoch": 7.278244631185808,
"grad_norm": 7.165660858154297,
"learning_rate": 1.0466012874463507e-05,
"loss": 0.1297,
"num_input_tokens_seen": 1006400,
"step": 7795
},
{
"epoch": 7.282913165266106,
"grad_norm": 0.6695095896720886,
"learning_rate": 1.0432883691273329e-05,
"loss": 0.0837,
"num_input_tokens_seen": 1007024,
"step": 7800
},
{
"epoch": 7.287581699346405,
"grad_norm": 1.1955631971359253,
"learning_rate": 1.039979319367854e-05,
"loss": 0.0841,
"num_input_tokens_seen": 1007744,
"step": 7805
},
{
"epoch": 7.292250233426704,
"grad_norm": 2.22499942779541,
"learning_rate": 1.0366741469556906e-05,
"loss": 0.0635,
"num_input_tokens_seen": 1008352,
"step": 7810
},
{
"epoch": 7.296918767507003,
"grad_norm": 2.4868786334991455,
"learning_rate": 1.0333728606683204e-05,
"loss": 0.0457,
"num_input_tokens_seen": 1009008,
"step": 7815
},
{
"epoch": 7.301587301587301,
"grad_norm": 5.993730068206787,
"learning_rate": 1.0300754692729047e-05,
"loss": 0.0993,
"num_input_tokens_seen": 1009600,
"step": 7820
},
{
"epoch": 7.3062558356676,
"grad_norm": 3.345144510269165,
"learning_rate": 1.026781981526257e-05,
"loss": 0.0906,
"num_input_tokens_seen": 1010224,
"step": 7825
},
{
"epoch": 7.310924369747899,
"grad_norm": 3.8627970218658447,
"learning_rate": 1.0234924061748263e-05,
"loss": 0.1081,
"num_input_tokens_seen": 1010896,
"step": 7830
},
{
"epoch": 7.315592903828198,
"grad_norm": 15.3854341506958,
"learning_rate": 1.0202067519546718e-05,
"loss": 0.1311,
"num_input_tokens_seen": 1011536,
"step": 7835
},
{
"epoch": 7.3202614379084965,
"grad_norm": 3.0586917400360107,
"learning_rate": 1.0169250275914394e-05,
"loss": 0.0298,
"num_input_tokens_seen": 1012208,
"step": 7840
},
{
"epoch": 7.324929971988795,
"grad_norm": 2.1076900959014893,
"learning_rate": 1.0136472418003362e-05,
"loss": 0.0336,
"num_input_tokens_seen": 1012944,
"step": 7845
},
{
"epoch": 7.329598506069094,
"grad_norm": 6.537927627563477,
"learning_rate": 1.0103734032861123e-05,
"loss": 0.0968,
"num_input_tokens_seen": 1013568,
"step": 7850
},
{
"epoch": 7.334267040149393,
"grad_norm": 4.479717254638672,
"learning_rate": 1.0071035207430352e-05,
"loss": 0.0535,
"num_input_tokens_seen": 1014240,
"step": 7855
},
{
"epoch": 7.338935574229692,
"grad_norm": 1.1037297248840332,
"learning_rate": 1.0038376028548637e-05,
"loss": 0.0067,
"num_input_tokens_seen": 1014864,
"step": 7860
},
{
"epoch": 7.34360410830999,
"grad_norm": 1.1650564670562744,
"learning_rate": 1.0005756582948316e-05,
"loss": 0.1159,
"num_input_tokens_seen": 1015520,
"step": 7865
},
{
"epoch": 7.348272642390289,
"grad_norm": 0.03593031316995621,
"learning_rate": 9.973176957256175e-06,
"loss": 0.145,
"num_input_tokens_seen": 1016112,
"step": 7870
},
{
"epoch": 7.352941176470588,
"grad_norm": 3.5073347091674805,
"learning_rate": 9.940637237993269e-06,
"loss": 0.0585,
"num_input_tokens_seen": 1016752,
"step": 7875
},
{
"epoch": 7.357609710550887,
"grad_norm": 1.1457678079605103,
"learning_rate": 9.908137511574675e-06,
"loss": 0.0701,
"num_input_tokens_seen": 1017344,
"step": 7880
},
{
"epoch": 7.362278244631185,
"grad_norm": 8.913501739501953,
"learning_rate": 9.875677864309255e-06,
"loss": 0.0607,
"num_input_tokens_seen": 1017984,
"step": 7885
},
{
"epoch": 7.366946778711484,
"grad_norm": 1.648511290550232,
"learning_rate": 9.843258382399442e-06,
"loss": 0.0465,
"num_input_tokens_seen": 1018592,
"step": 7890
},
{
"epoch": 7.371615312791784,
"grad_norm": 3.110186815261841,
"learning_rate": 9.810879151940982e-06,
"loss": 0.06,
"num_input_tokens_seen": 1019264,
"step": 7895
},
{
"epoch": 7.376283846872083,
"grad_norm": 4.371214866638184,
"learning_rate": 9.778540258922762e-06,
"loss": 0.1029,
"num_input_tokens_seen": 1019856,
"step": 7900
},
{
"epoch": 7.380952380952381,
"grad_norm": 20.6632137298584,
"learning_rate": 9.746241789226502e-06,
"loss": 0.1593,
"num_input_tokens_seen": 1020432,
"step": 7905
},
{
"epoch": 7.38562091503268,
"grad_norm": 38.212284088134766,
"learning_rate": 9.7139838286266e-06,
"loss": 0.6516,
"num_input_tokens_seen": 1021056,
"step": 7910
},
{
"epoch": 7.390289449112979,
"grad_norm": 0.10892492532730103,
"learning_rate": 9.681766462789883e-06,
"loss": 0.0418,
"num_input_tokens_seen": 1021696,
"step": 7915
},
{
"epoch": 7.394957983193278,
"grad_norm": 7.932071208953857,
"learning_rate": 9.649589777275334e-06,
"loss": 0.1172,
"num_input_tokens_seen": 1022304,
"step": 7920
},
{
"epoch": 7.3996265172735765,
"grad_norm": 3.0598278045654297,
"learning_rate": 9.617453857533934e-06,
"loss": 0.1303,
"num_input_tokens_seen": 1023040,
"step": 7925
},
{
"epoch": 7.404295051353875,
"grad_norm": 6.430777072906494,
"learning_rate": 9.585358788908394e-06,
"loss": 0.1392,
"num_input_tokens_seen": 1023632,
"step": 7930
},
{
"epoch": 7.408963585434174,
"grad_norm": 7.087793350219727,
"learning_rate": 9.553304656632944e-06,
"loss": 0.1354,
"num_input_tokens_seen": 1024304,
"step": 7935
},
{
"epoch": 7.413632119514473,
"grad_norm": 8.742671966552734,
"learning_rate": 9.521291545833086e-06,
"loss": 0.1999,
"num_input_tokens_seen": 1024960,
"step": 7940
},
{
"epoch": 7.4183006535947715,
"grad_norm": 6.230234622955322,
"learning_rate": 9.489319541525383e-06,
"loss": 0.1121,
"num_input_tokens_seen": 1025536,
"step": 7945
},
{
"epoch": 7.42296918767507,
"grad_norm": 10.568910598754883,
"learning_rate": 9.457388728617239e-06,
"loss": 0.106,
"num_input_tokens_seen": 1026352,
"step": 7950
},
{
"epoch": 7.427637721755369,
"grad_norm": 2.8674209117889404,
"learning_rate": 9.425499191906675e-06,
"loss": 0.1084,
"num_input_tokens_seen": 1026992,
"step": 7955
},
{
"epoch": 7.432306255835668,
"grad_norm": 1.4187448024749756,
"learning_rate": 9.393651016082083e-06,
"loss": 0.1247,
"num_input_tokens_seen": 1027600,
"step": 7960
},
{
"epoch": 7.436974789915967,
"grad_norm": 0.17856673896312714,
"learning_rate": 9.361844285722027e-06,
"loss": 0.0572,
"num_input_tokens_seen": 1028192,
"step": 7965
},
{
"epoch": 7.441643323996265,
"grad_norm": 0.6200303435325623,
"learning_rate": 9.33007908529498e-06,
"loss": 0.0672,
"num_input_tokens_seen": 1028736,
"step": 7970
},
{
"epoch": 7.446311858076564,
"grad_norm": 0.7025942802429199,
"learning_rate": 9.298355499159156e-06,
"loss": 0.0064,
"num_input_tokens_seen": 1029456,
"step": 7975
},
{
"epoch": 7.450980392156863,
"grad_norm": 13.140531539916992,
"learning_rate": 9.266673611562221e-06,
"loss": 0.0942,
"num_input_tokens_seen": 1030160,
"step": 7980
},
{
"epoch": 7.455648926237162,
"grad_norm": 4.14353084564209,
"learning_rate": 9.23503350664113e-06,
"loss": 0.1122,
"num_input_tokens_seen": 1030832,
"step": 7985
},
{
"epoch": 7.4603174603174605,
"grad_norm": 5.697788715362549,
"learning_rate": 9.203435268421881e-06,
"loss": 0.1073,
"num_input_tokens_seen": 1031456,
"step": 7990
},
{
"epoch": 7.464985994397759,
"grad_norm": 5.6900129318237305,
"learning_rate": 9.171878980819254e-06,
"loss": 0.0624,
"num_input_tokens_seen": 1032064,
"step": 7995
},
{
"epoch": 7.469654528478058,
"grad_norm": 2.6561965942382812,
"learning_rate": 9.140364727636651e-06,
"loss": 0.0618,
"num_input_tokens_seen": 1032656,
"step": 8000
},
{
"epoch": 7.474323062558357,
"grad_norm": 7.419023036956787,
"learning_rate": 9.108892592565837e-06,
"loss": 0.0905,
"num_input_tokens_seen": 1033280,
"step": 8005
},
{
"epoch": 7.4789915966386555,
"grad_norm": 0.4191058278083801,
"learning_rate": 9.077462659186728e-06,
"loss": 0.0261,
"num_input_tokens_seen": 1033920,
"step": 8010
},
{
"epoch": 7.483660130718954,
"grad_norm": 5.563292503356934,
"learning_rate": 9.046075010967145e-06,
"loss": 0.0657,
"num_input_tokens_seen": 1034560,
"step": 8015
},
{
"epoch": 7.488328664799253,
"grad_norm": 0.4257217049598694,
"learning_rate": 9.014729731262647e-06,
"loss": 0.0997,
"num_input_tokens_seen": 1035312,
"step": 8020
},
{
"epoch": 7.492997198879552,
"grad_norm": 0.023411273956298828,
"learning_rate": 8.983426903316242e-06,
"loss": 0.0257,
"num_input_tokens_seen": 1035888,
"step": 8025
},
{
"epoch": 7.497665732959851,
"grad_norm": 8.618074417114258,
"learning_rate": 8.95216661025822e-06,
"loss": 0.2303,
"num_input_tokens_seen": 1036512,
"step": 8030
},
{
"epoch": 7.502334267040149,
"grad_norm": 4.378950595855713,
"learning_rate": 8.92094893510592e-06,
"loss": 0.102,
"num_input_tokens_seen": 1037248,
"step": 8035
},
{
"epoch": 7.507002801120448,
"grad_norm": 8.568061828613281,
"learning_rate": 8.889773960763465e-06,
"loss": 0.1088,
"num_input_tokens_seen": 1037856,
"step": 8040
},
{
"epoch": 7.507002801120448,
"eval_loss": 1.077528476715088,
"eval_runtime": 3.8754,
"eval_samples_per_second": 61.414,
"eval_steps_per_second": 30.707,
"num_input_tokens_seen": 1037856,
"step": 8040
},
{
"epoch": 7.511671335200747,
"grad_norm": 9.698957443237305,
"learning_rate": 8.858641770021619e-06,
"loss": 0.1022,
"num_input_tokens_seen": 1038496,
"step": 8045
},
{
"epoch": 7.516339869281046,
"grad_norm": 4.305899620056152,
"learning_rate": 8.827552445557505e-06,
"loss": 0.2151,
"num_input_tokens_seen": 1039136,
"step": 8050
},
{
"epoch": 7.5210084033613445,
"grad_norm": 0.05098443478345871,
"learning_rate": 8.79650606993442e-06,
"loss": 0.0433,
"num_input_tokens_seen": 1039728,
"step": 8055
},
{
"epoch": 7.525676937441643,
"grad_norm": 3.4601404666900635,
"learning_rate": 8.765502725601582e-06,
"loss": 0.059,
"num_input_tokens_seen": 1040384,
"step": 8060
},
{
"epoch": 7.530345471521942,
"grad_norm": 0.7538869380950928,
"learning_rate": 8.734542494893955e-06,
"loss": 0.0568,
"num_input_tokens_seen": 1040976,
"step": 8065
},
{
"epoch": 7.535014005602241,
"grad_norm": 0.21339933574199677,
"learning_rate": 8.70362546003198e-06,
"loss": 0.0355,
"num_input_tokens_seen": 1041632,
"step": 8070
},
{
"epoch": 7.5396825396825395,
"grad_norm": 0.22579693794250488,
"learning_rate": 8.67275170312141e-06,
"loss": 0.1002,
"num_input_tokens_seen": 1042256,
"step": 8075
},
{
"epoch": 7.544351073762838,
"grad_norm": 15.217583656311035,
"learning_rate": 8.641921306153052e-06,
"loss": 0.0909,
"num_input_tokens_seen": 1042928,
"step": 8080
},
{
"epoch": 7.549019607843137,
"grad_norm": 4.866734981536865,
"learning_rate": 8.611134351002579e-06,
"loss": 0.0347,
"num_input_tokens_seen": 1043504,
"step": 8085
},
{
"epoch": 7.553688141923436,
"grad_norm": 3.1651668548583984,
"learning_rate": 8.580390919430264e-06,
"loss": 0.0467,
"num_input_tokens_seen": 1044192,
"step": 8090
},
{
"epoch": 7.558356676003735,
"grad_norm": 1.9329001903533936,
"learning_rate": 8.549691093080822e-06,
"loss": 0.0826,
"num_input_tokens_seen": 1044800,
"step": 8095
},
{
"epoch": 7.563025210084033,
"grad_norm": 9.483233451843262,
"learning_rate": 8.519034953483171e-06,
"loss": 0.1911,
"num_input_tokens_seen": 1045488,
"step": 8100
},
{
"epoch": 7.567693744164332,
"grad_norm": 6.546195030212402,
"learning_rate": 8.488422582050182e-06,
"loss": 0.0514,
"num_input_tokens_seen": 1046112,
"step": 8105
},
{
"epoch": 7.572362278244631,
"grad_norm": 6.966008186340332,
"learning_rate": 8.45785406007852e-06,
"loss": 0.0977,
"num_input_tokens_seen": 1046768,
"step": 8110
},
{
"epoch": 7.57703081232493,
"grad_norm": 0.8122055530548096,
"learning_rate": 8.42732946874838e-06,
"loss": 0.0274,
"num_input_tokens_seen": 1047424,
"step": 8115
},
{
"epoch": 7.5816993464052285,
"grad_norm": 0.033146731555461884,
"learning_rate": 8.396848889123304e-06,
"loss": 0.1072,
"num_input_tokens_seen": 1048048,
"step": 8120
},
{
"epoch": 7.586367880485527,
"grad_norm": 1.4818317890167236,
"learning_rate": 8.366412402149954e-06,
"loss": 0.0435,
"num_input_tokens_seen": 1048720,
"step": 8125
},
{
"epoch": 7.591036414565826,
"grad_norm": 1.447939395904541,
"learning_rate": 8.336020088657884e-06,
"loss": 0.1128,
"num_input_tokens_seen": 1049424,
"step": 8130
},
{
"epoch": 7.595704948646125,
"grad_norm": 10.602599143981934,
"learning_rate": 8.305672029359357e-06,
"loss": 0.0777,
"num_input_tokens_seen": 1050016,
"step": 8135
},
{
"epoch": 7.6003734827264235,
"grad_norm": 1.1130367517471313,
"learning_rate": 8.27536830484909e-06,
"loss": 0.1002,
"num_input_tokens_seen": 1050608,
"step": 8140
},
{
"epoch": 7.605042016806722,
"grad_norm": 0.2835767865180969,
"learning_rate": 8.245108995604061e-06,
"loss": 0.0834,
"num_input_tokens_seen": 1051216,
"step": 8145
},
{
"epoch": 7.609710550887021,
"grad_norm": 1.318534016609192,
"learning_rate": 8.214894181983314e-06,
"loss": 0.0129,
"num_input_tokens_seen": 1052000,
"step": 8150
},
{
"epoch": 7.61437908496732,
"grad_norm": 12.885905265808105,
"learning_rate": 8.184723944227717e-06,
"loss": 0.1562,
"num_input_tokens_seen": 1052624,
"step": 8155
},
{
"epoch": 7.619047619047619,
"grad_norm": 5.901303768157959,
"learning_rate": 8.154598362459765e-06,
"loss": 0.1439,
"num_input_tokens_seen": 1053344,
"step": 8160
},
{
"epoch": 7.623716153127917,
"grad_norm": 6.216315269470215,
"learning_rate": 8.124517516683337e-06,
"loss": 0.1417,
"num_input_tokens_seen": 1053920,
"step": 8165
},
{
"epoch": 7.628384687208216,
"grad_norm": 1.9915357828140259,
"learning_rate": 8.094481486783534e-06,
"loss": 0.108,
"num_input_tokens_seen": 1054656,
"step": 8170
},
{
"epoch": 7.633053221288515,
"grad_norm": 4.136562824249268,
"learning_rate": 8.064490352526432e-06,
"loss": 0.1449,
"num_input_tokens_seen": 1055328,
"step": 8175
},
{
"epoch": 7.637721755368814,
"grad_norm": 0.12521328032016754,
"learning_rate": 8.034544193558888e-06,
"loss": 0.0834,
"num_input_tokens_seen": 1055920,
"step": 8180
},
{
"epoch": 7.642390289449113,
"grad_norm": 2.289386034011841,
"learning_rate": 8.0046430894083e-06,
"loss": 0.0068,
"num_input_tokens_seen": 1056496,
"step": 8185
},
{
"epoch": 7.647058823529412,
"grad_norm": 8.30339527130127,
"learning_rate": 7.974787119482416e-06,
"loss": 0.1128,
"num_input_tokens_seen": 1057152,
"step": 8190
},
{
"epoch": 7.651727357609711,
"grad_norm": 4.268256187438965,
"learning_rate": 7.944976363069137e-06,
"loss": 0.1039,
"num_input_tokens_seen": 1057776,
"step": 8195
},
{
"epoch": 7.65639589169001,
"grad_norm": 0.635925829410553,
"learning_rate": 7.915210899336284e-06,
"loss": 0.0245,
"num_input_tokens_seen": 1058432,
"step": 8200
},
{
"epoch": 7.661064425770308,
"grad_norm": 0.12795297801494598,
"learning_rate": 7.885490807331405e-06,
"loss": 0.0494,
"num_input_tokens_seen": 1059216,
"step": 8205
},
{
"epoch": 7.665732959850607,
"grad_norm": 0.22869738936424255,
"learning_rate": 7.855816165981528e-06,
"loss": 0.0965,
"num_input_tokens_seen": 1059792,
"step": 8210
},
{
"epoch": 7.670401493930906,
"grad_norm": 3.5849406719207764,
"learning_rate": 7.826187054093004e-06,
"loss": 0.0409,
"num_input_tokens_seen": 1060464,
"step": 8215
},
{
"epoch": 7.675070028011205,
"grad_norm": 0.6717523336410522,
"learning_rate": 7.796603550351276e-06,
"loss": 0.0769,
"num_input_tokens_seen": 1061152,
"step": 8220
},
{
"epoch": 7.6797385620915035,
"grad_norm": 3.960289716720581,
"learning_rate": 7.767065733320636e-06,
"loss": 0.0305,
"num_input_tokens_seen": 1061792,
"step": 8225
},
{
"epoch": 7.684407096171802,
"grad_norm": 3.6754231452941895,
"learning_rate": 7.737573681444082e-06,
"loss": 0.0808,
"num_input_tokens_seen": 1062512,
"step": 8230
},
{
"epoch": 7.689075630252101,
"grad_norm": 0.45391157269477844,
"learning_rate": 7.708127473043044e-06,
"loss": 0.097,
"num_input_tokens_seen": 1063184,
"step": 8235
},
{
"epoch": 7.6937441643324,
"grad_norm": 12.55863094329834,
"learning_rate": 7.678727186317225e-06,
"loss": 0.1237,
"num_input_tokens_seen": 1063776,
"step": 8240
},
{
"epoch": 7.698412698412699,
"grad_norm": 17.174266815185547,
"learning_rate": 7.649372899344376e-06,
"loss": 0.2717,
"num_input_tokens_seen": 1064384,
"step": 8245
},
{
"epoch": 7.703081232492997,
"grad_norm": 0.3060591518878937,
"learning_rate": 7.620064690080076e-06,
"loss": 0.0221,
"num_input_tokens_seen": 1064944,
"step": 8250
},
{
"epoch": 7.707749766573296,
"grad_norm": 3.2448484897613525,
"learning_rate": 7.59080263635755e-06,
"loss": 0.077,
"num_input_tokens_seen": 1065584,
"step": 8255
},
{
"epoch": 7.712418300653595,
"grad_norm": 1.3086873292922974,
"learning_rate": 7.561586815887428e-06,
"loss": 0.0542,
"num_input_tokens_seen": 1066320,
"step": 8260
},
{
"epoch": 7.717086834733894,
"grad_norm": 4.001552581787109,
"learning_rate": 7.532417306257589e-06,
"loss": 0.1468,
"num_input_tokens_seen": 1066992,
"step": 8265
},
{
"epoch": 7.721755368814192,
"grad_norm": 0.9954704642295837,
"learning_rate": 7.503294184932888e-06,
"loss": 0.0911,
"num_input_tokens_seen": 1067664,
"step": 8270
},
{
"epoch": 7.726423902894491,
"grad_norm": 0.2005486637353897,
"learning_rate": 7.474217529255018e-06,
"loss": 0.0434,
"num_input_tokens_seen": 1068304,
"step": 8275
},
{
"epoch": 7.73109243697479,
"grad_norm": 1.2900813817977905,
"learning_rate": 7.44518741644227e-06,
"loss": 0.0335,
"num_input_tokens_seen": 1068928,
"step": 8280
},
{
"epoch": 7.735760971055089,
"grad_norm": 0.8297515511512756,
"learning_rate": 7.416203923589312e-06,
"loss": 0.0565,
"num_input_tokens_seen": 1069680,
"step": 8285
},
{
"epoch": 7.7404295051353875,
"grad_norm": 2.7593870162963867,
"learning_rate": 7.387267127667028e-06,
"loss": 0.1607,
"num_input_tokens_seen": 1070368,
"step": 8290
},
{
"epoch": 7.745098039215686,
"grad_norm": 1.2917249202728271,
"learning_rate": 7.358377105522276e-06,
"loss": 0.0402,
"num_input_tokens_seen": 1071056,
"step": 8295
},
{
"epoch": 7.749766573295985,
"grad_norm": 2.100257396697998,
"learning_rate": 7.329533933877713e-06,
"loss": 0.1192,
"num_input_tokens_seen": 1071728,
"step": 8300
},
{
"epoch": 7.754435107376284,
"grad_norm": 0.5423152446746826,
"learning_rate": 7.300737689331555e-06,
"loss": 0.1131,
"num_input_tokens_seen": 1072368,
"step": 8305
},
{
"epoch": 7.759103641456583,
"grad_norm": 0.27788621187210083,
"learning_rate": 7.2719884483573975e-06,
"loss": 0.0266,
"num_input_tokens_seen": 1073056,
"step": 8310
},
{
"epoch": 7.763772175536881,
"grad_norm": 3.2515792846679688,
"learning_rate": 7.243286287304024e-06,
"loss": 0.1105,
"num_input_tokens_seen": 1073680,
"step": 8315
},
{
"epoch": 7.76844070961718,
"grad_norm": 6.515419006347656,
"learning_rate": 7.214631282395184e-06,
"loss": 0.1254,
"num_input_tokens_seen": 1074288,
"step": 8320
},
{
"epoch": 7.773109243697479,
"grad_norm": 0.06773083657026291,
"learning_rate": 7.186023509729392e-06,
"loss": 0.1299,
"num_input_tokens_seen": 1074944,
"step": 8325
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.05192798748612404,
"learning_rate": 7.157463045279736e-06,
"loss": 0.1067,
"num_input_tokens_seen": 1075552,
"step": 8330
},
{
"epoch": 7.7824463118580764,
"grad_norm": 4.573253631591797,
"learning_rate": 7.128949964893647e-06,
"loss": 0.1099,
"num_input_tokens_seen": 1076192,
"step": 8335
},
{
"epoch": 7.787114845938375,
"grad_norm": 3.051204204559326,
"learning_rate": 7.100484344292743e-06,
"loss": 0.2794,
"num_input_tokens_seen": 1076880,
"step": 8340
},
{
"epoch": 7.791783380018674,
"grad_norm": 0.13514143228530884,
"learning_rate": 7.072066259072602e-06,
"loss": 0.0938,
"num_input_tokens_seen": 1077616,
"step": 8345
},
{
"epoch": 7.796451914098973,
"grad_norm": 2.7879085540771484,
"learning_rate": 7.043695784702553e-06,
"loss": 0.0664,
"num_input_tokens_seen": 1078288,
"step": 8350
},
{
"epoch": 7.8011204481792715,
"grad_norm": 1.6922277212142944,
"learning_rate": 7.015372996525477e-06,
"loss": 0.0457,
"num_input_tokens_seen": 1078896,
"step": 8355
},
{
"epoch": 7.80578898225957,
"grad_norm": 0.956597626209259,
"learning_rate": 6.987097969757636e-06,
"loss": 0.0687,
"num_input_tokens_seen": 1079664,
"step": 8360
},
{
"epoch": 7.810457516339869,
"grad_norm": 0.3348096013069153,
"learning_rate": 6.958870779488447e-06,
"loss": 0.0132,
"num_input_tokens_seen": 1080304,
"step": 8365
},
{
"epoch": 7.815126050420168,
"grad_norm": 10.991283416748047,
"learning_rate": 6.930691500680289e-06,
"loss": 0.2201,
"num_input_tokens_seen": 1080992,
"step": 8370
},
{
"epoch": 7.819794584500467,
"grad_norm": 0.2432507872581482,
"learning_rate": 6.902560208168304e-06,
"loss": 0.1369,
"num_input_tokens_seen": 1081664,
"step": 8375
},
{
"epoch": 7.824463118580765,
"grad_norm": 3.725372791290283,
"learning_rate": 6.8744769766601854e-06,
"loss": 0.2358,
"num_input_tokens_seen": 1082224,
"step": 8380
},
{
"epoch": 7.829131652661064,
"grad_norm": 0.3020375669002533,
"learning_rate": 6.8464418807360095e-06,
"loss": 0.0204,
"num_input_tokens_seen": 1082832,
"step": 8385
},
{
"epoch": 7.833800186741363,
"grad_norm": 0.009726503863930702,
"learning_rate": 6.818454994848006e-06,
"loss": 0.1,
"num_input_tokens_seen": 1083424,
"step": 8390
},
{
"epoch": 7.838468720821662,
"grad_norm": 10.553521156311035,
"learning_rate": 6.7905163933203785e-06,
"loss": 0.1198,
"num_input_tokens_seen": 1084144,
"step": 8395
},
{
"epoch": 7.8431372549019605,
"grad_norm": 0.8017410039901733,
"learning_rate": 6.762626150349119e-06,
"loss": 0.0439,
"num_input_tokens_seen": 1084816,
"step": 8400
},
{
"epoch": 7.847805788982259,
"grad_norm": 6.144360065460205,
"learning_rate": 6.7347843400017625e-06,
"loss": 0.1038,
"num_input_tokens_seen": 1085424,
"step": 8405
},
{
"epoch": 7.852474323062558,
"grad_norm": 6.126713275909424,
"learning_rate": 6.7069910362172474e-06,
"loss": 0.074,
"num_input_tokens_seen": 1086096,
"step": 8410
},
{
"epoch": 7.857142857142857,
"grad_norm": 1.0252084732055664,
"learning_rate": 6.679246312805687e-06,
"loss": 0.0591,
"num_input_tokens_seen": 1086816,
"step": 8415
},
{
"epoch": 7.861811391223156,
"grad_norm": 0.3098870813846588,
"learning_rate": 6.651550243448182e-06,
"loss": 0.0269,
"num_input_tokens_seen": 1087408,
"step": 8420
},
{
"epoch": 7.866479925303455,
"grad_norm": 1.8085873126983643,
"learning_rate": 6.62390290169663e-06,
"loss": 0.0183,
"num_input_tokens_seen": 1088112,
"step": 8425
},
{
"epoch": 7.871148459383754,
"grad_norm": 4.327649116516113,
"learning_rate": 6.596304360973504e-06,
"loss": 0.0375,
"num_input_tokens_seen": 1088704,
"step": 8430
},
{
"epoch": 7.875816993464053,
"grad_norm": 0.09986984729766846,
"learning_rate": 6.568754694571685e-06,
"loss": 0.0063,
"num_input_tokens_seen": 1089328,
"step": 8435
},
{
"epoch": 7.8804855275443515,
"grad_norm": 6.41303825378418,
"learning_rate": 6.541253975654271e-06,
"loss": 0.0485,
"num_input_tokens_seen": 1089984,
"step": 8440
},
{
"epoch": 7.88515406162465,
"grad_norm": 1.0045740604400635,
"learning_rate": 6.513802277254363e-06,
"loss": 0.1427,
"num_input_tokens_seen": 1090576,
"step": 8445
},
{
"epoch": 7.889822595704949,
"grad_norm": 0.6578733325004578,
"learning_rate": 6.48639967227489e-06,
"loss": 0.0215,
"num_input_tokens_seen": 1091296,
"step": 8450
},
{
"epoch": 7.894491129785248,
"grad_norm": 1.3396340608596802,
"learning_rate": 6.459046233488372e-06,
"loss": 0.0259,
"num_input_tokens_seen": 1091984,
"step": 8455
},
{
"epoch": 7.899159663865547,
"grad_norm": 4.178629398345947,
"learning_rate": 6.431742033536797e-06,
"loss": 0.101,
"num_input_tokens_seen": 1092608,
"step": 8460
},
{
"epoch": 7.903828197945845,
"grad_norm": 0.4377411901950836,
"learning_rate": 6.404487144931379e-06,
"loss": 0.0471,
"num_input_tokens_seen": 1093296,
"step": 8465
},
{
"epoch": 7.908496732026144,
"grad_norm": 11.185189247131348,
"learning_rate": 6.377281640052357e-06,
"loss": 0.1267,
"num_input_tokens_seen": 1093952,
"step": 8470
},
{
"epoch": 7.913165266106443,
"grad_norm": 0.6686238646507263,
"learning_rate": 6.3501255911488565e-06,
"loss": 0.2006,
"num_input_tokens_seen": 1094576,
"step": 8475
},
{
"epoch": 7.917833800186742,
"grad_norm": 3.336912155151367,
"learning_rate": 6.323019070338629e-06,
"loss": 0.0915,
"num_input_tokens_seen": 1095200,
"step": 8480
},
{
"epoch": 7.92250233426704,
"grad_norm": 0.2326798141002655,
"learning_rate": 6.29596214960792e-06,
"loss": 0.082,
"num_input_tokens_seen": 1095856,
"step": 8485
},
{
"epoch": 7.927170868347339,
"grad_norm": 0.3160500228404999,
"learning_rate": 6.2689549008112466e-06,
"loss": 0.1561,
"num_input_tokens_seen": 1096592,
"step": 8490
},
{
"epoch": 7.931839402427638,
"grad_norm": 0.7252269387245178,
"learning_rate": 6.241997395671209e-06,
"loss": 0.022,
"num_input_tokens_seen": 1097200,
"step": 8495
},
{
"epoch": 7.936507936507937,
"grad_norm": 5.833823204040527,
"learning_rate": 6.215089705778315e-06,
"loss": 0.0991,
"num_input_tokens_seen": 1097840,
"step": 8500
},
{
"epoch": 7.9411764705882355,
"grad_norm": 0.3958582282066345,
"learning_rate": 6.18823190259076e-06,
"loss": 0.0325,
"num_input_tokens_seen": 1098432,
"step": 8505
},
{
"epoch": 7.945845004668534,
"grad_norm": 0.03990139067173004,
"learning_rate": 6.161424057434278e-06,
"loss": 0.0129,
"num_input_tokens_seen": 1099136,
"step": 8510
},
{
"epoch": 7.950513538748833,
"grad_norm": 0.18915289640426636,
"learning_rate": 6.134666241501905e-06,
"loss": 0.2817,
"num_input_tokens_seen": 1099776,
"step": 8515
},
{
"epoch": 7.955182072829132,
"grad_norm": 1.3937067985534668,
"learning_rate": 6.107958525853838e-06,
"loss": 0.1055,
"num_input_tokens_seen": 1100464,
"step": 8520
},
{
"epoch": 7.959850606909431,
"grad_norm": 3.5351006984710693,
"learning_rate": 6.081300981417226e-06,
"loss": 0.0682,
"num_input_tokens_seen": 1101104,
"step": 8525
},
{
"epoch": 7.964519140989729,
"grad_norm": 3.100557804107666,
"learning_rate": 6.0546936789859505e-06,
"loss": 0.0532,
"num_input_tokens_seen": 1101744,
"step": 8530
},
{
"epoch": 7.969187675070028,
"grad_norm": 1.3666666746139526,
"learning_rate": 6.028136689220498e-06,
"loss": 0.1588,
"num_input_tokens_seen": 1102400,
"step": 8535
},
{
"epoch": 7.973856209150327,
"grad_norm": 5.621835231781006,
"learning_rate": 6.001630082647722e-06,
"loss": 0.0461,
"num_input_tokens_seen": 1102992,
"step": 8540
},
{
"epoch": 7.978524743230626,
"grad_norm": 2.5733768939971924,
"learning_rate": 5.975173929660688e-06,
"loss": 0.1164,
"num_input_tokens_seen": 1103632,
"step": 8545
},
{
"epoch": 7.983193277310924,
"grad_norm": 3.054447650909424,
"learning_rate": 5.948768300518459e-06,
"loss": 0.1701,
"num_input_tokens_seen": 1104320,
"step": 8550
},
{
"epoch": 7.987861811391223,
"grad_norm": 4.3491058349609375,
"learning_rate": 5.922413265345922e-06,
"loss": 0.1025,
"num_input_tokens_seen": 1104928,
"step": 8555
},
{
"epoch": 7.992530345471522,
"grad_norm": 11.961054801940918,
"learning_rate": 5.896108894133617e-06,
"loss": 0.1614,
"num_input_tokens_seen": 1105440,
"step": 8560
},
{
"epoch": 7.997198879551821,
"grad_norm": 11.302621841430664,
"learning_rate": 5.8698552567375275e-06,
"loss": 0.1423,
"num_input_tokens_seen": 1106112,
"step": 8565
},
{
"epoch": 8.00186741363212,
"grad_norm": 0.8292189240455627,
"learning_rate": 5.8436524228789145e-06,
"loss": 0.0762,
"num_input_tokens_seen": 1106664,
"step": 8570
},
{
"epoch": 8.006535947712418,
"grad_norm": 1.8794206380844116,
"learning_rate": 5.817500462144099e-06,
"loss": 0.0452,
"num_input_tokens_seen": 1107368,
"step": 8575
},
{
"epoch": 8.007469654528478,
"eval_loss": 1.052361011505127,
"eval_runtime": 3.8664,
"eval_samples_per_second": 61.556,
"eval_steps_per_second": 30.778,
"num_input_tokens_seen": 1107480,
"step": 8576
},
{
"epoch": 8.011204481792717,
"grad_norm": 1.9271084070205688,
"learning_rate": 5.791399443984319e-06,
"loss": 0.2034,
"num_input_tokens_seen": 1107992,
"step": 8580
},
{
"epoch": 8.015873015873016,
"grad_norm": 3.432539939880371,
"learning_rate": 5.76534943771552e-06,
"loss": 0.0288,
"num_input_tokens_seen": 1108664,
"step": 8585
},
{
"epoch": 8.020541549953315,
"grad_norm": 0.08454066514968872,
"learning_rate": 5.73935051251818e-06,
"loss": 0.0748,
"num_input_tokens_seen": 1109224,
"step": 8590
},
{
"epoch": 8.025210084033613,
"grad_norm": 1.4576191902160645,
"learning_rate": 5.71340273743711e-06,
"loss": 0.0179,
"num_input_tokens_seen": 1109976,
"step": 8595
},
{
"epoch": 8.029878618113912,
"grad_norm": 6.439827919006348,
"learning_rate": 5.687506181381286e-06,
"loss": 0.0867,
"num_input_tokens_seen": 1110648,
"step": 8600
},
{
"epoch": 8.034547152194211,
"grad_norm": 1.4698748588562012,
"learning_rate": 5.661660913123673e-06,
"loss": 0.053,
"num_input_tokens_seen": 1111320,
"step": 8605
},
{
"epoch": 8.03921568627451,
"grad_norm": 11.151979446411133,
"learning_rate": 5.635867001301026e-06,
"loss": 0.081,
"num_input_tokens_seen": 1111976,
"step": 8610
},
{
"epoch": 8.043884220354808,
"grad_norm": 0.18296192586421967,
"learning_rate": 5.610124514413714e-06,
"loss": 0.0617,
"num_input_tokens_seen": 1112712,
"step": 8615
},
{
"epoch": 8.048552754435107,
"grad_norm": 0.12693344056606293,
"learning_rate": 5.584433520825541e-06,
"loss": 0.0376,
"num_input_tokens_seen": 1113336,
"step": 8620
},
{
"epoch": 8.053221288515406,
"grad_norm": 1.491117238998413,
"learning_rate": 5.55879408876355e-06,
"loss": 0.03,
"num_input_tokens_seen": 1113960,
"step": 8625
},
{
"epoch": 8.057889822595705,
"grad_norm": 3.4405813217163086,
"learning_rate": 5.5332062863178685e-06,
"loss": 0.0393,
"num_input_tokens_seen": 1114600,
"step": 8630
},
{
"epoch": 8.062558356676004,
"grad_norm": 5.879638671875,
"learning_rate": 5.507670181441493e-06,
"loss": 0.1544,
"num_input_tokens_seen": 1115240,
"step": 8635
},
{
"epoch": 8.067226890756302,
"grad_norm": 8.053985595703125,
"learning_rate": 5.482185841950147e-06,
"loss": 0.0551,
"num_input_tokens_seen": 1115800,
"step": 8640
},
{
"epoch": 8.071895424836601,
"grad_norm": 5.818375587463379,
"learning_rate": 5.4567533355220804e-06,
"loss": 0.017,
"num_input_tokens_seen": 1116392,
"step": 8645
},
{
"epoch": 8.0765639589169,
"grad_norm": 0.13191650807857513,
"learning_rate": 5.43137272969787e-06,
"loss": 0.0075,
"num_input_tokens_seen": 1116984,
"step": 8650
},
{
"epoch": 8.081232492997199,
"grad_norm": 7.324196815490723,
"learning_rate": 5.406044091880285e-06,
"loss": 0.0568,
"num_input_tokens_seen": 1117544,
"step": 8655
},
{
"epoch": 8.085901027077497,
"grad_norm": 4.285675525665283,
"learning_rate": 5.380767489334076e-06,
"loss": 0.0771,
"num_input_tokens_seen": 1118248,
"step": 8660
},
{
"epoch": 8.090569561157796,
"grad_norm": 1.300607681274414,
"learning_rate": 5.3555429891858075e-06,
"loss": 0.0213,
"num_input_tokens_seen": 1118888,
"step": 8665
},
{
"epoch": 8.095238095238095,
"grad_norm": 1.639029860496521,
"learning_rate": 5.330370658423661e-06,
"loss": 0.0412,
"num_input_tokens_seen": 1119560,
"step": 8670
},
{
"epoch": 8.099906629318394,
"grad_norm": 0.05550481006503105,
"learning_rate": 5.305250563897299e-06,
"loss": 0.0482,
"num_input_tokens_seen": 1120184,
"step": 8675
},
{
"epoch": 8.104575163398692,
"grad_norm": 0.6807479858398438,
"learning_rate": 5.280182772317632e-06,
"loss": 0.0375,
"num_input_tokens_seen": 1120888,
"step": 8680
},
{
"epoch": 8.109243697478991,
"grad_norm": 1.5354772806167603,
"learning_rate": 5.255167350256693e-06,
"loss": 0.0708,
"num_input_tokens_seen": 1121496,
"step": 8685
},
{
"epoch": 8.11391223155929,
"grad_norm": 4.521956443786621,
"learning_rate": 5.230204364147432e-06,
"loss": 0.0741,
"num_input_tokens_seen": 1122120,
"step": 8690
},
{
"epoch": 8.118580765639589,
"grad_norm": 3.2341227531433105,
"learning_rate": 5.205293880283552e-06,
"loss": 0.0191,
"num_input_tokens_seen": 1122792,
"step": 8695
},
{
"epoch": 8.123249299719888,
"grad_norm": 1.803482174873352,
"learning_rate": 5.180435964819303e-06,
"loss": 0.046,
"num_input_tokens_seen": 1123336,
"step": 8700
},
{
"epoch": 8.127917833800186,
"grad_norm": 1.5913814306259155,
"learning_rate": 5.155630683769358e-06,
"loss": 0.0512,
"num_input_tokens_seen": 1123960,
"step": 8705
},
{
"epoch": 8.132586367880485,
"grad_norm": 0.06489665806293488,
"learning_rate": 5.130878103008604e-06,
"loss": 0.062,
"num_input_tokens_seen": 1124648,
"step": 8710
},
{
"epoch": 8.137254901960784,
"grad_norm": 0.5183777809143066,
"learning_rate": 5.106178288271962e-06,
"loss": 0.0055,
"num_input_tokens_seen": 1125352,
"step": 8715
},
{
"epoch": 8.141923436041083,
"grad_norm": 0.4233246147632599,
"learning_rate": 5.081531305154219e-06,
"loss": 0.0208,
"num_input_tokens_seen": 1125976,
"step": 8720
},
{
"epoch": 8.146591970121381,
"grad_norm": 1.313720464706421,
"learning_rate": 5.056937219109881e-06,
"loss": 0.0077,
"num_input_tokens_seen": 1126600,
"step": 8725
},
{
"epoch": 8.15126050420168,
"grad_norm": 2.7932381629943848,
"learning_rate": 5.032396095452957e-06,
"loss": 0.0248,
"num_input_tokens_seen": 1127240,
"step": 8730
},
{
"epoch": 8.155929038281979,
"grad_norm": 2.909801959991455,
"learning_rate": 5.007907999356814e-06,
"loss": 0.0111,
"num_input_tokens_seen": 1127944,
"step": 8735
},
{
"epoch": 8.160597572362278,
"grad_norm": 1.8586024045944214,
"learning_rate": 4.9834729958540025e-06,
"loss": 0.0489,
"num_input_tokens_seen": 1128632,
"step": 8740
},
{
"epoch": 8.165266106442576,
"grad_norm": 2.5531888008117676,
"learning_rate": 4.959091149836048e-06,
"loss": 0.0074,
"num_input_tokens_seen": 1129256,
"step": 8745
},
{
"epoch": 8.169934640522875,
"grad_norm": 0.19135218858718872,
"learning_rate": 4.934762526053333e-06,
"loss": 0.0356,
"num_input_tokens_seen": 1129896,
"step": 8750
},
{
"epoch": 8.174603174603174,
"grad_norm": 0.020777558907866478,
"learning_rate": 4.910487189114893e-06,
"loss": 0.0827,
"num_input_tokens_seen": 1130568,
"step": 8755
},
{
"epoch": 8.179271708683473,
"grad_norm": 4.427436828613281,
"learning_rate": 4.886265203488241e-06,
"loss": 0.089,
"num_input_tokens_seen": 1131208,
"step": 8760
},
{
"epoch": 8.183940242763772,
"grad_norm": 0.16428785026073456,
"learning_rate": 4.862096633499225e-06,
"loss": 0.0159,
"num_input_tokens_seen": 1131848,
"step": 8765
},
{
"epoch": 8.18860877684407,
"grad_norm": 3.0339784622192383,
"learning_rate": 4.83798154333181e-06,
"loss": 0.0792,
"num_input_tokens_seen": 1132536,
"step": 8770
},
{
"epoch": 8.193277310924369,
"grad_norm": 0.03558497503399849,
"learning_rate": 4.81391999702796e-06,
"loss": 0.0184,
"num_input_tokens_seen": 1133176,
"step": 8775
},
{
"epoch": 8.197945845004668,
"grad_norm": 0.018901566043496132,
"learning_rate": 4.789912058487436e-06,
"loss": 0.0375,
"num_input_tokens_seen": 1133832,
"step": 8780
},
{
"epoch": 8.202614379084967,
"grad_norm": 1.9881067276000977,
"learning_rate": 4.765957791467635e-06,
"loss": 0.2102,
"num_input_tokens_seen": 1134408,
"step": 8785
},
{
"epoch": 8.207282913165265,
"grad_norm": 4.089415550231934,
"learning_rate": 4.7420572595834185e-06,
"loss": 0.0369,
"num_input_tokens_seen": 1135128,
"step": 8790
},
{
"epoch": 8.211951447245564,
"grad_norm": 9.398062705993652,
"learning_rate": 4.7182105263069455e-06,
"loss": 0.0878,
"num_input_tokens_seen": 1135752,
"step": 8795
},
{
"epoch": 8.216619981325863,
"grad_norm": 3.531256914138794,
"learning_rate": 4.694417654967492e-06,
"loss": 0.0092,
"num_input_tokens_seen": 1136440,
"step": 8800
},
{
"epoch": 8.221288515406162,
"grad_norm": 7.09257698059082,
"learning_rate": 4.670678708751311e-06,
"loss": 0.129,
"num_input_tokens_seen": 1137000,
"step": 8805
},
{
"epoch": 8.22595704948646,
"grad_norm": 0.060900285840034485,
"learning_rate": 4.646993750701439e-06,
"loss": 0.104,
"num_input_tokens_seen": 1137608,
"step": 8810
},
{
"epoch": 8.23062558356676,
"grad_norm": 0.01634305715560913,
"learning_rate": 4.623362843717549e-06,
"loss": 0.008,
"num_input_tokens_seen": 1138184,
"step": 8815
},
{
"epoch": 8.235294117647058,
"grad_norm": 14.870940208435059,
"learning_rate": 4.599786050555746e-06,
"loss": 0.0601,
"num_input_tokens_seen": 1138776,
"step": 8820
},
{
"epoch": 8.239962651727357,
"grad_norm": 0.4319959282875061,
"learning_rate": 4.576263433828445e-06,
"loss": 0.0125,
"num_input_tokens_seen": 1139448,
"step": 8825
},
{
"epoch": 8.244631185807656,
"grad_norm": 4.329498767852783,
"learning_rate": 4.552795056004194e-06,
"loss": 0.0407,
"num_input_tokens_seen": 1140056,
"step": 8830
},
{
"epoch": 8.249299719887954,
"grad_norm": 4.342251777648926,
"learning_rate": 4.5293809794074744e-06,
"loss": 0.0548,
"num_input_tokens_seen": 1140600,
"step": 8835
},
{
"epoch": 8.253968253968253,
"grad_norm": 6.068799018859863,
"learning_rate": 4.506021266218582e-06,
"loss": 0.0323,
"num_input_tokens_seen": 1141224,
"step": 8840
},
{
"epoch": 8.258636788048554,
"grad_norm": 6.533751487731934,
"learning_rate": 4.482715978473428e-06,
"loss": 0.0266,
"num_input_tokens_seen": 1141896,
"step": 8845
},
{
"epoch": 8.263305322128852,
"grad_norm": 8.173686027526855,
"learning_rate": 4.459465178063396e-06,
"loss": 0.0754,
"num_input_tokens_seen": 1142536,
"step": 8850
},
{
"epoch": 8.267973856209151,
"grad_norm": 0.6834007501602173,
"learning_rate": 4.436268926735162e-06,
"loss": 0.0458,
"num_input_tokens_seen": 1143112,
"step": 8855
},
{
"epoch": 8.27264239028945,
"grad_norm": 1.0048269033432007,
"learning_rate": 4.4131272860905455e-06,
"loss": 0.0213,
"num_input_tokens_seen": 1143736,
"step": 8860
},
{
"epoch": 8.277310924369749,
"grad_norm": 1.5272879600524902,
"learning_rate": 4.390040317586336e-06,
"loss": 0.0805,
"num_input_tokens_seen": 1144456,
"step": 8865
},
{
"epoch": 8.281979458450047,
"grad_norm": 0.3423355221748352,
"learning_rate": 4.367008082534113e-06,
"loss": 0.03,
"num_input_tokens_seen": 1145208,
"step": 8870
},
{
"epoch": 8.286647992530346,
"grad_norm": 6.090906620025635,
"learning_rate": 4.344030642100133e-06,
"loss": 0.0183,
"num_input_tokens_seen": 1145864,
"step": 8875
},
{
"epoch": 8.291316526610645,
"grad_norm": 3.7591450214385986,
"learning_rate": 4.321108057305101e-06,
"loss": 0.1285,
"num_input_tokens_seen": 1146504,
"step": 8880
},
{
"epoch": 8.295985060690944,
"grad_norm": 5.501720905303955,
"learning_rate": 4.298240389024077e-06,
"loss": 0.0518,
"num_input_tokens_seen": 1147016,
"step": 8885
},
{
"epoch": 8.300653594771243,
"grad_norm": 7.5667266845703125,
"learning_rate": 4.2754276979862536e-06,
"loss": 0.0339,
"num_input_tokens_seen": 1147624,
"step": 8890
},
{
"epoch": 8.305322128851541,
"grad_norm": 0.4266260862350464,
"learning_rate": 4.252670044774831e-06,
"loss": 0.0054,
"num_input_tokens_seen": 1148168,
"step": 8895
},
{
"epoch": 8.30999066293184,
"grad_norm": 0.5307876467704773,
"learning_rate": 4.229967489826853e-06,
"loss": 0.0462,
"num_input_tokens_seen": 1148856,
"step": 8900
},
{
"epoch": 8.314659197012139,
"grad_norm": 6.373628616333008,
"learning_rate": 4.2073200934330315e-06,
"loss": 0.1144,
"num_input_tokens_seen": 1149496,
"step": 8905
},
{
"epoch": 8.319327731092438,
"grad_norm": 0.1435551941394806,
"learning_rate": 4.184727915737607e-06,
"loss": 0.0092,
"num_input_tokens_seen": 1150328,
"step": 8910
},
{
"epoch": 8.323996265172736,
"grad_norm": 0.4174332916736603,
"learning_rate": 4.162191016738151e-06,
"loss": 0.0188,
"num_input_tokens_seen": 1150920,
"step": 8915
},
{
"epoch": 8.328664799253035,
"grad_norm": 0.07925593107938766,
"learning_rate": 4.139709456285465e-06,
"loss": 0.0047,
"num_input_tokens_seen": 1151512,
"step": 8920
},
{
"epoch": 8.333333333333334,
"grad_norm": 4.719840049743652,
"learning_rate": 4.11728329408336e-06,
"loss": 0.0052,
"num_input_tokens_seen": 1152184,
"step": 8925
},
{
"epoch": 8.338001867413633,
"grad_norm": 0.5039065480232239,
"learning_rate": 4.094912589688546e-06,
"loss": 0.0378,
"num_input_tokens_seen": 1152872,
"step": 8930
},
{
"epoch": 8.342670401493931,
"grad_norm": 0.1591583490371704,
"learning_rate": 4.072597402510455e-06,
"loss": 0.0094,
"num_input_tokens_seen": 1153592,
"step": 8935
},
{
"epoch": 8.34733893557423,
"grad_norm": 1.2247728109359741,
"learning_rate": 4.050337791811068e-06,
"loss": 0.1042,
"num_input_tokens_seen": 1154168,
"step": 8940
},
{
"epoch": 8.352007469654529,
"grad_norm": 0.2649737596511841,
"learning_rate": 4.0281338167047825e-06,
"loss": 0.0668,
"num_input_tokens_seen": 1154872,
"step": 8945
},
{
"epoch": 8.356676003734828,
"grad_norm": 0.05071331188082695,
"learning_rate": 4.005985536158246e-06,
"loss": 0.0744,
"num_input_tokens_seen": 1155576,
"step": 8950
},
{
"epoch": 8.361344537815127,
"grad_norm": 3.964921712875366,
"learning_rate": 3.983893008990208e-06,
"loss": 0.0497,
"num_input_tokens_seen": 1156184,
"step": 8955
},
{
"epoch": 8.366013071895425,
"grad_norm": 0.3039446771144867,
"learning_rate": 3.961856293871336e-06,
"loss": 0.0033,
"num_input_tokens_seen": 1156792,
"step": 8960
},
{
"epoch": 8.370681605975724,
"grad_norm": 0.20161408185958862,
"learning_rate": 3.939875449324082e-06,
"loss": 0.1079,
"num_input_tokens_seen": 1157320,
"step": 8965
},
{
"epoch": 8.375350140056023,
"grad_norm": 0.36548763513565063,
"learning_rate": 3.917950533722534e-06,
"loss": 0.1091,
"num_input_tokens_seen": 1158024,
"step": 8970
},
{
"epoch": 8.380018674136322,
"grad_norm": 0.5267390608787537,
"learning_rate": 3.896081605292246e-06,
"loss": 0.0544,
"num_input_tokens_seen": 1158600,
"step": 8975
},
{
"epoch": 8.38468720821662,
"grad_norm": 0.9543160796165466,
"learning_rate": 3.874268722110089e-06,
"loss": 0.3445,
"num_input_tokens_seen": 1159192,
"step": 8980
},
{
"epoch": 8.38935574229692,
"grad_norm": 2.8803083896636963,
"learning_rate": 3.852511942104101e-06,
"loss": 0.0229,
"num_input_tokens_seen": 1159800,
"step": 8985
},
{
"epoch": 8.394024276377218,
"grad_norm": 5.321974277496338,
"learning_rate": 3.83081132305331e-06,
"loss": 0.1096,
"num_input_tokens_seen": 1160376,
"step": 8990
},
{
"epoch": 8.398692810457517,
"grad_norm": 0.6705957055091858,
"learning_rate": 3.8091669225876176e-06,
"loss": 0.0038,
"num_input_tokens_seen": 1161016,
"step": 8995
},
{
"epoch": 8.403361344537815,
"grad_norm": 3.0900211334228516,
"learning_rate": 3.7875787981876105e-06,
"loss": 0.0255,
"num_input_tokens_seen": 1161752,
"step": 9000
},
{
"epoch": 8.408029878618114,
"grad_norm": 1.3996245861053467,
"learning_rate": 3.7660470071844393e-06,
"loss": 0.1057,
"num_input_tokens_seen": 1162328,
"step": 9005
},
{
"epoch": 8.412698412698413,
"grad_norm": 0.35684734582901,
"learning_rate": 3.7445716067596503e-06,
"loss": 0.0695,
"num_input_tokens_seen": 1163000,
"step": 9010
},
{
"epoch": 8.417366946778712,
"grad_norm": 7.436266899108887,
"learning_rate": 3.7231526539450167e-06,
"loss": 0.054,
"num_input_tokens_seen": 1163672,
"step": 9015
},
{
"epoch": 8.42203548085901,
"grad_norm": 0.4128905236721039,
"learning_rate": 3.701790205622421e-06,
"loss": 0.0206,
"num_input_tokens_seen": 1164344,
"step": 9020
},
{
"epoch": 8.42670401493931,
"grad_norm": 6.121613025665283,
"learning_rate": 3.6804843185236885e-06,
"loss": 0.0515,
"num_input_tokens_seen": 1164936,
"step": 9025
},
{
"epoch": 8.431372549019608,
"grad_norm": 0.07441362738609314,
"learning_rate": 3.6592350492304277e-06,
"loss": 0.0027,
"num_input_tokens_seen": 1165496,
"step": 9030
},
{
"epoch": 8.436041083099907,
"grad_norm": 0.13714176416397095,
"learning_rate": 3.638042454173901e-06,
"loss": 0.0231,
"num_input_tokens_seen": 1166088,
"step": 9035
},
{
"epoch": 8.440709617180206,
"grad_norm": 3.3369691371917725,
"learning_rate": 3.616906589634844e-06,
"loss": 0.0825,
"num_input_tokens_seen": 1166680,
"step": 9040
},
{
"epoch": 8.445378151260504,
"grad_norm": 0.21978700160980225,
"learning_rate": 3.595827511743341e-06,
"loss": 0.1878,
"num_input_tokens_seen": 1167320,
"step": 9045
},
{
"epoch": 8.450046685340803,
"grad_norm": 1.6111265420913696,
"learning_rate": 3.5748052764786737e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1167992,
"step": 9050
},
{
"epoch": 8.454715219421102,
"grad_norm": 6.312107563018799,
"learning_rate": 3.5538399396691707e-06,
"loss": 0.0709,
"num_input_tokens_seen": 1168600,
"step": 9055
},
{
"epoch": 8.4593837535014,
"grad_norm": 0.629169225692749,
"learning_rate": 3.5329315569920558e-06,
"loss": 0.0143,
"num_input_tokens_seen": 1169192,
"step": 9060
},
{
"epoch": 8.4640522875817,
"grad_norm": 0.046670157462358475,
"learning_rate": 3.512080183973285e-06,
"loss": 0.0017,
"num_input_tokens_seen": 1169992,
"step": 9065
},
{
"epoch": 8.468720821661998,
"grad_norm": 0.28610295057296753,
"learning_rate": 3.4912858759874295e-06,
"loss": 0.0572,
"num_input_tokens_seen": 1170648,
"step": 9070
},
{
"epoch": 8.473389355742297,
"grad_norm": 0.05349961295723915,
"learning_rate": 3.470548688257522e-06,
"loss": 0.046,
"num_input_tokens_seen": 1171400,
"step": 9075
},
{
"epoch": 8.478057889822596,
"grad_norm": 12.51286506652832,
"learning_rate": 3.4498686758548784e-06,
"loss": 0.0764,
"num_input_tokens_seen": 1172120,
"step": 9080
},
{
"epoch": 8.482726423902895,
"grad_norm": 5.514620304107666,
"learning_rate": 3.4292458936989983e-06,
"loss": 0.0543,
"num_input_tokens_seen": 1172648,
"step": 9085
},
{
"epoch": 8.487394957983193,
"grad_norm": 1.6812564134597778,
"learning_rate": 3.408680396557376e-06,
"loss": 0.1259,
"num_input_tokens_seen": 1173320,
"step": 9090
},
{
"epoch": 8.492063492063492,
"grad_norm": 0.4996938407421112,
"learning_rate": 3.3881722390453923e-06,
"loss": 0.0785,
"num_input_tokens_seen": 1173928,
"step": 9095
},
{
"epoch": 8.49673202614379,
"grad_norm": 0.6753255724906921,
"learning_rate": 3.36772147562614e-06,
"loss": 0.07,
"num_input_tokens_seen": 1174632,
"step": 9100
},
{
"epoch": 8.50140056022409,
"grad_norm": 7.039217948913574,
"learning_rate": 3.3473281606103078e-06,
"loss": 0.0705,
"num_input_tokens_seen": 1175400,
"step": 9105
},
{
"epoch": 8.506069094304388,
"grad_norm": 6.246345520019531,
"learning_rate": 3.3269923481559966e-06,
"loss": 0.1948,
"num_input_tokens_seen": 1175960,
"step": 9110
},
{
"epoch": 8.507936507936508,
"eval_loss": 1.1914538145065308,
"eval_runtime": 3.8658,
"eval_samples_per_second": 61.565,
"eval_steps_per_second": 30.783,
"num_input_tokens_seen": 1176200,
"step": 9112
},
{
"epoch": 8.510737628384687,
"grad_norm": 2.0727744102478027,
"learning_rate": 3.3067140922686174e-06,
"loss": 0.0087,
"num_input_tokens_seen": 1176584,
"step": 9115
},
{
"epoch": 8.515406162464986,
"grad_norm": 4.565818786621094,
"learning_rate": 3.286493446800723e-06,
"loss": 0.0851,
"num_input_tokens_seen": 1177272,
"step": 9120
},
{
"epoch": 8.520074696545285,
"grad_norm": 18.623031616210938,
"learning_rate": 3.2663304654518695e-06,
"loss": 0.2342,
"num_input_tokens_seen": 1177864,
"step": 9125
},
{
"epoch": 8.524743230625583,
"grad_norm": 0.040356144309043884,
"learning_rate": 3.2462252017684797e-06,
"loss": 0.0861,
"num_input_tokens_seen": 1178504,
"step": 9130
},
{
"epoch": 8.529411764705882,
"grad_norm": 6.924943923950195,
"learning_rate": 3.2261777091436907e-06,
"loss": 0.0166,
"num_input_tokens_seen": 1179224,
"step": 9135
},
{
"epoch": 8.534080298786181,
"grad_norm": 9.759965896606445,
"learning_rate": 3.2061880408172235e-06,
"loss": 0.0224,
"num_input_tokens_seen": 1179816,
"step": 9140
},
{
"epoch": 8.53874883286648,
"grad_norm": 4.266696453094482,
"learning_rate": 3.1862562498752356e-06,
"loss": 0.1285,
"num_input_tokens_seen": 1180552,
"step": 9145
},
{
"epoch": 8.543417366946779,
"grad_norm": 3.827735424041748,
"learning_rate": 3.1663823892501803e-06,
"loss": 0.0228,
"num_input_tokens_seen": 1181160,
"step": 9150
},
{
"epoch": 8.548085901027077,
"grad_norm": 17.82668113708496,
"learning_rate": 3.146566511720675e-06,
"loss": 0.1993,
"num_input_tokens_seen": 1181784,
"step": 9155
},
{
"epoch": 8.552754435107376,
"grad_norm": 0.9619858264923096,
"learning_rate": 3.1268086699113324e-06,
"loss": 0.0923,
"num_input_tokens_seen": 1182392,
"step": 9160
},
{
"epoch": 8.557422969187675,
"grad_norm": 0.9562932848930359,
"learning_rate": 3.1071089162926503e-06,
"loss": 0.0069,
"num_input_tokens_seen": 1183080,
"step": 9165
},
{
"epoch": 8.562091503267974,
"grad_norm": 7.583783149719238,
"learning_rate": 3.0874673031808713e-06,
"loss": 0.0645,
"num_input_tokens_seen": 1183752,
"step": 9170
},
{
"epoch": 8.566760037348272,
"grad_norm": 5.784301280975342,
"learning_rate": 3.0678838827378263e-06,
"loss": 0.0224,
"num_input_tokens_seen": 1184344,
"step": 9175
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.48377174139022827,
"learning_rate": 3.0483587069708165e-06,
"loss": 0.0085,
"num_input_tokens_seen": 1185000,
"step": 9180
},
{
"epoch": 8.57609710550887,
"grad_norm": 3.4420366287231445,
"learning_rate": 3.0288918277324413e-06,
"loss": 0.0735,
"num_input_tokens_seen": 1185624,
"step": 9185
},
{
"epoch": 8.580765639589169,
"grad_norm": 0.18768535554409027,
"learning_rate": 3.009483296720503e-06,
"loss": 0.0257,
"num_input_tokens_seen": 1186280,
"step": 9190
},
{
"epoch": 8.585434173669467,
"grad_norm": 0.013358098454773426,
"learning_rate": 2.990133165477846e-06,
"loss": 0.1094,
"num_input_tokens_seen": 1186904,
"step": 9195
},
{
"epoch": 8.590102707749766,
"grad_norm": 3.7494723796844482,
"learning_rate": 2.970841485392223e-06,
"loss": 0.0293,
"num_input_tokens_seen": 1187544,
"step": 9200
},
{
"epoch": 8.594771241830065,
"grad_norm": 13.038296699523926,
"learning_rate": 2.9516083076961577e-06,
"loss": 0.2713,
"num_input_tokens_seen": 1188216,
"step": 9205
},
{
"epoch": 8.599439775910364,
"grad_norm": 0.1423317939043045,
"learning_rate": 2.932433683466801e-06,
"loss": 0.0192,
"num_input_tokens_seen": 1188840,
"step": 9210
},
{
"epoch": 8.604108309990663,
"grad_norm": 5.753176689147949,
"learning_rate": 2.9133176636258196e-06,
"loss": 0.0274,
"num_input_tokens_seen": 1189512,
"step": 9215
},
{
"epoch": 8.608776844070961,
"grad_norm": 0.27099505066871643,
"learning_rate": 2.8942602989392386e-06,
"loss": 0.1617,
"num_input_tokens_seen": 1190072,
"step": 9220
},
{
"epoch": 8.61344537815126,
"grad_norm": 0.08621136844158173,
"learning_rate": 2.8752616400173184e-06,
"loss": 0.0243,
"num_input_tokens_seen": 1190760,
"step": 9225
},
{
"epoch": 8.618113912231559,
"grad_norm": 11.92264461517334,
"learning_rate": 2.856321737314413e-06,
"loss": 0.0347,
"num_input_tokens_seen": 1191352,
"step": 9230
},
{
"epoch": 8.622782446311858,
"grad_norm": 2.3230783939361572,
"learning_rate": 2.83744064112883e-06,
"loss": 0.0859,
"num_input_tokens_seen": 1191928,
"step": 9235
},
{
"epoch": 8.627450980392156,
"grad_norm": 9.768386840820312,
"learning_rate": 2.8186184016027268e-06,
"loss": 0.1392,
"num_input_tokens_seen": 1192552,
"step": 9240
},
{
"epoch": 8.632119514472455,
"grad_norm": 14.89448356628418,
"learning_rate": 2.7998550687219267e-06,
"loss": 0.0993,
"num_input_tokens_seen": 1193272,
"step": 9245
},
{
"epoch": 8.636788048552754,
"grad_norm": 0.07315748929977417,
"learning_rate": 2.781150692315848e-06,
"loss": 0.0793,
"num_input_tokens_seen": 1193896,
"step": 9250
},
{
"epoch": 8.641456582633053,
"grad_norm": 5.044677257537842,
"learning_rate": 2.76250532205731e-06,
"loss": 0.045,
"num_input_tokens_seen": 1194616,
"step": 9255
},
{
"epoch": 8.646125116713351,
"grad_norm": 1.9704540967941284,
"learning_rate": 2.7439190074624505e-06,
"loss": 0.0558,
"num_input_tokens_seen": 1195352,
"step": 9260
},
{
"epoch": 8.65079365079365,
"grad_norm": 3.324946403503418,
"learning_rate": 2.7253917978905696e-06,
"loss": 0.0753,
"num_input_tokens_seen": 1196056,
"step": 9265
},
{
"epoch": 8.655462184873949,
"grad_norm": 3.511345386505127,
"learning_rate": 2.706923742544001e-06,
"loss": 0.062,
"num_input_tokens_seen": 1196760,
"step": 9270
},
{
"epoch": 8.660130718954248,
"grad_norm": 0.5469305515289307,
"learning_rate": 2.6885148904679914e-06,
"loss": 0.0147,
"num_input_tokens_seen": 1197432,
"step": 9275
},
{
"epoch": 8.664799253034547,
"grad_norm": 0.09258583188056946,
"learning_rate": 2.6701652905505443e-06,
"loss": 0.0376,
"num_input_tokens_seen": 1198056,
"step": 9280
},
{
"epoch": 8.669467787114845,
"grad_norm": 1.9686568975448608,
"learning_rate": 2.6518749915223296e-06,
"loss": 0.1191,
"num_input_tokens_seen": 1198696,
"step": 9285
},
{
"epoch": 8.674136321195144,
"grad_norm": 2.1283085346221924,
"learning_rate": 2.633644041956515e-06,
"loss": 0.0148,
"num_input_tokens_seen": 1199432,
"step": 9290
},
{
"epoch": 8.678804855275443,
"grad_norm": 0.5042722225189209,
"learning_rate": 2.6154724902686667e-06,
"loss": 0.0371,
"num_input_tokens_seen": 1200024,
"step": 9295
},
{
"epoch": 8.683473389355742,
"grad_norm": 0.5686901211738586,
"learning_rate": 2.597360384716613e-06,
"loss": 0.055,
"num_input_tokens_seen": 1200664,
"step": 9300
},
{
"epoch": 8.68814192343604,
"grad_norm": 0.9969847798347473,
"learning_rate": 2.579307773400294e-06,
"loss": 0.0884,
"num_input_tokens_seen": 1201272,
"step": 9305
},
{
"epoch": 8.69281045751634,
"grad_norm": 0.07653947919607162,
"learning_rate": 2.561314704261669e-06,
"loss": 0.0702,
"num_input_tokens_seen": 1201880,
"step": 9310
},
{
"epoch": 8.697478991596638,
"grad_norm": 2.5944416522979736,
"learning_rate": 2.543381225084568e-06,
"loss": 0.0872,
"num_input_tokens_seen": 1202520,
"step": 9315
},
{
"epoch": 8.702147525676937,
"grad_norm": 4.421728134155273,
"learning_rate": 2.5255073834945715e-06,
"loss": 0.0336,
"num_input_tokens_seen": 1203176,
"step": 9320
},
{
"epoch": 8.706816059757235,
"grad_norm": 1.0446518659591675,
"learning_rate": 2.507693226958871e-06,
"loss": 0.0191,
"num_input_tokens_seen": 1203784,
"step": 9325
},
{
"epoch": 8.711484593837534,
"grad_norm": 9.855815887451172,
"learning_rate": 2.4899388027861626e-06,
"loss": 0.1938,
"num_input_tokens_seen": 1204440,
"step": 9330
},
{
"epoch": 8.716153127917833,
"grad_norm": 0.242303729057312,
"learning_rate": 2.472244158126508e-06,
"loss": 0.0286,
"num_input_tokens_seen": 1205144,
"step": 9335
},
{
"epoch": 8.720821661998132,
"grad_norm": 0.00873336661607027,
"learning_rate": 2.45460933997122e-06,
"loss": 0.0911,
"num_input_tokens_seen": 1205816,
"step": 9340
},
{
"epoch": 8.72549019607843,
"grad_norm": 2.0310895442962646,
"learning_rate": 2.437034395152729e-06,
"loss": 0.0562,
"num_input_tokens_seen": 1206472,
"step": 9345
},
{
"epoch": 8.73015873015873,
"grad_norm": 1.3347821235656738,
"learning_rate": 2.4195193703444587e-06,
"loss": 0.1385,
"num_input_tokens_seen": 1207160,
"step": 9350
},
{
"epoch": 8.73482726423903,
"grad_norm": 1.2632123231887817,
"learning_rate": 2.4020643120607034e-06,
"loss": 0.0297,
"num_input_tokens_seen": 1207784,
"step": 9355
},
{
"epoch": 8.739495798319329,
"grad_norm": 0.7416768670082092,
"learning_rate": 2.3846692666565055e-06,
"loss": 0.0144,
"num_input_tokens_seen": 1208392,
"step": 9360
},
{
"epoch": 8.744164332399627,
"grad_norm": 1.5258066654205322,
"learning_rate": 2.3673342803275434e-06,
"loss": 0.0356,
"num_input_tokens_seen": 1209048,
"step": 9365
},
{
"epoch": 8.748832866479926,
"grad_norm": 0.5528998970985413,
"learning_rate": 2.3500593991099774e-06,
"loss": 0.0157,
"num_input_tokens_seen": 1209752,
"step": 9370
},
{
"epoch": 8.753501400560225,
"grad_norm": 3.4063894748687744,
"learning_rate": 2.3328446688803685e-06,
"loss": 0.0133,
"num_input_tokens_seen": 1210376,
"step": 9375
},
{
"epoch": 8.758169934640524,
"grad_norm": 0.1745508909225464,
"learning_rate": 2.3156901353555167e-06,
"loss": 0.0388,
"num_input_tokens_seen": 1211160,
"step": 9380
},
{
"epoch": 8.762838468720823,
"grad_norm": 2.1299688816070557,
"learning_rate": 2.298595844092377e-06,
"loss": 0.1209,
"num_input_tokens_seen": 1211784,
"step": 9385
},
{
"epoch": 8.767507002801121,
"grad_norm": 3.634575128555298,
"learning_rate": 2.2815618404879087e-06,
"loss": 0.0841,
"num_input_tokens_seen": 1212456,
"step": 9390
},
{
"epoch": 8.77217553688142,
"grad_norm": 6.291904926300049,
"learning_rate": 2.2645881697789697e-06,
"loss": 0.1058,
"num_input_tokens_seen": 1213032,
"step": 9395
},
{
"epoch": 8.776844070961719,
"grad_norm": 4.1127095222473145,
"learning_rate": 2.2476748770421995e-06,
"loss": 0.0461,
"num_input_tokens_seen": 1213800,
"step": 9400
},
{
"epoch": 8.781512605042018,
"grad_norm": 3.1000053882598877,
"learning_rate": 2.2308220071938805e-06,
"loss": 0.0455,
"num_input_tokens_seen": 1214424,
"step": 9405
},
{
"epoch": 8.786181139122316,
"grad_norm": 2.81400465965271,
"learning_rate": 2.214029604989834e-06,
"loss": 0.0553,
"num_input_tokens_seen": 1215016,
"step": 9410
},
{
"epoch": 8.790849673202615,
"grad_norm": 0.10587247461080551,
"learning_rate": 2.1972977150253064e-06,
"loss": 0.0247,
"num_input_tokens_seen": 1215672,
"step": 9415
},
{
"epoch": 8.795518207282914,
"grad_norm": 0.8938270211219788,
"learning_rate": 2.1806263817348432e-06,
"loss": 0.0609,
"num_input_tokens_seen": 1216328,
"step": 9420
},
{
"epoch": 8.800186741363213,
"grad_norm": 0.26707565784454346,
"learning_rate": 2.1640156493921566e-06,
"loss": 0.1118,
"num_input_tokens_seen": 1217032,
"step": 9425
},
{
"epoch": 8.804855275443511,
"grad_norm": 0.22385770082473755,
"learning_rate": 2.1474655621100347e-06,
"loss": 0.1078,
"num_input_tokens_seen": 1217784,
"step": 9430
},
{
"epoch": 8.80952380952381,
"grad_norm": 0.3320285379886627,
"learning_rate": 2.130976163840212e-06,
"loss": 0.0087,
"num_input_tokens_seen": 1218536,
"step": 9435
},
{
"epoch": 8.814192343604109,
"grad_norm": 4.328696250915527,
"learning_rate": 2.1145474983732484e-06,
"loss": 0.0654,
"num_input_tokens_seen": 1219128,
"step": 9440
},
{
"epoch": 8.818860877684408,
"grad_norm": 1.3271536827087402,
"learning_rate": 2.0981796093384216e-06,
"loss": 0.0678,
"num_input_tokens_seen": 1219720,
"step": 9445
},
{
"epoch": 8.823529411764707,
"grad_norm": 5.28490686416626,
"learning_rate": 2.0818725402035944e-06,
"loss": 0.0571,
"num_input_tokens_seen": 1220344,
"step": 9450
},
{
"epoch": 8.828197945845005,
"grad_norm": 3.1943273544311523,
"learning_rate": 2.06562633427512e-06,
"loss": 0.1072,
"num_input_tokens_seen": 1221096,
"step": 9455
},
{
"epoch": 8.832866479925304,
"grad_norm": 0.32759732007980347,
"learning_rate": 2.0494410346977216e-06,
"loss": 0.0025,
"num_input_tokens_seen": 1221720,
"step": 9460
},
{
"epoch": 8.837535014005603,
"grad_norm": 0.14161980152130127,
"learning_rate": 2.03331668445437e-06,
"loss": 0.0841,
"num_input_tokens_seen": 1222312,
"step": 9465
},
{
"epoch": 8.842203548085902,
"grad_norm": 0.1818762719631195,
"learning_rate": 2.017253326366181e-06,
"loss": 0.0305,
"num_input_tokens_seen": 1222920,
"step": 9470
},
{
"epoch": 8.8468720821662,
"grad_norm": 3.802173614501953,
"learning_rate": 2.0012510030922775e-06,
"loss": 0.0523,
"num_input_tokens_seen": 1223480,
"step": 9475
},
{
"epoch": 8.8515406162465,
"grad_norm": 1.9173213243484497,
"learning_rate": 1.985309757129711e-06,
"loss": 0.0463,
"num_input_tokens_seen": 1224056,
"step": 9480
},
{
"epoch": 8.856209150326798,
"grad_norm": 0.11994439363479614,
"learning_rate": 1.9694296308133298e-06,
"loss": 0.0162,
"num_input_tokens_seen": 1224680,
"step": 9485
},
{
"epoch": 8.860877684407097,
"grad_norm": 2.666987180709839,
"learning_rate": 1.9536106663156555e-06,
"loss": 0.0056,
"num_input_tokens_seen": 1225368,
"step": 9490
},
{
"epoch": 8.865546218487395,
"grad_norm": 5.9014410972595215,
"learning_rate": 1.9378529056467976e-06,
"loss": 0.0321,
"num_input_tokens_seen": 1226120,
"step": 9495
},
{
"epoch": 8.870214752567694,
"grad_norm": 1.6379077434539795,
"learning_rate": 1.9221563906543143e-06,
"loss": 0.0841,
"num_input_tokens_seen": 1226856,
"step": 9500
},
{
"epoch": 8.874883286647993,
"grad_norm": 0.10031027346849442,
"learning_rate": 1.9065211630231283e-06,
"loss": 0.0318,
"num_input_tokens_seen": 1227480,
"step": 9505
},
{
"epoch": 8.879551820728292,
"grad_norm": 5.307277202606201,
"learning_rate": 1.8909472642753917e-06,
"loss": 0.1524,
"num_input_tokens_seen": 1228120,
"step": 9510
},
{
"epoch": 8.88422035480859,
"grad_norm": 0.08876470476388931,
"learning_rate": 1.8754347357703955e-06,
"loss": 0.0114,
"num_input_tokens_seen": 1228744,
"step": 9515
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.21213771402835846,
"learning_rate": 1.85998361870445e-06,
"loss": 0.0057,
"num_input_tokens_seen": 1229336,
"step": 9520
},
{
"epoch": 8.893557422969188,
"grad_norm": 0.10798242688179016,
"learning_rate": 1.8445939541107654e-06,
"loss": 0.0121,
"num_input_tokens_seen": 1229912,
"step": 9525
},
{
"epoch": 8.898225957049487,
"grad_norm": 0.17130322754383087,
"learning_rate": 1.8292657828593712e-06,
"loss": 0.0869,
"num_input_tokens_seen": 1230648,
"step": 9530
},
{
"epoch": 8.902894491129786,
"grad_norm": 0.165962815284729,
"learning_rate": 1.8139991456569694e-06,
"loss": 0.0161,
"num_input_tokens_seen": 1231400,
"step": 9535
},
{
"epoch": 8.907563025210084,
"grad_norm": 1.7331267595291138,
"learning_rate": 1.7987940830468675e-06,
"loss": 0.0493,
"num_input_tokens_seen": 1232104,
"step": 9540
},
{
"epoch": 8.912231559290383,
"grad_norm": 0.1018749475479126,
"learning_rate": 1.7836506354088428e-06,
"loss": 0.1381,
"num_input_tokens_seen": 1232776,
"step": 9545
},
{
"epoch": 8.916900093370682,
"grad_norm": 0.2601299583911896,
"learning_rate": 1.768568842959037e-06,
"loss": 0.0917,
"num_input_tokens_seen": 1233352,
"step": 9550
},
{
"epoch": 8.92156862745098,
"grad_norm": 7.419183254241943,
"learning_rate": 1.7535487457498583e-06,
"loss": 0.0666,
"num_input_tokens_seen": 1234008,
"step": 9555
},
{
"epoch": 8.92623716153128,
"grad_norm": 4.737880706787109,
"learning_rate": 1.7385903836698802e-06,
"loss": 0.0894,
"num_input_tokens_seen": 1234664,
"step": 9560
},
{
"epoch": 8.930905695611578,
"grad_norm": 4.031538009643555,
"learning_rate": 1.723693796443726e-06,
"loss": 0.1108,
"num_input_tokens_seen": 1235272,
"step": 9565
},
{
"epoch": 8.935574229691877,
"grad_norm": 4.560408115386963,
"learning_rate": 1.7088590236319507e-06,
"loss": 0.1927,
"num_input_tokens_seen": 1235864,
"step": 9570
},
{
"epoch": 8.940242763772176,
"grad_norm": 4.4047088623046875,
"learning_rate": 1.6940861046309625e-06,
"loss": 0.0726,
"num_input_tokens_seen": 1236520,
"step": 9575
},
{
"epoch": 8.944911297852475,
"grad_norm": 0.03612064942717552,
"learning_rate": 1.6793750786729012e-06,
"loss": 0.0995,
"num_input_tokens_seen": 1237240,
"step": 9580
},
{
"epoch": 8.949579831932773,
"grad_norm": 1.356696605682373,
"learning_rate": 1.664725984825541e-06,
"loss": 0.0148,
"num_input_tokens_seen": 1237848,
"step": 9585
},
{
"epoch": 8.954248366013072,
"grad_norm": 1.6855072975158691,
"learning_rate": 1.650138861992187e-06,
"loss": 0.0804,
"num_input_tokens_seen": 1238456,
"step": 9590
},
{
"epoch": 8.95891690009337,
"grad_norm": 7.222161293029785,
"learning_rate": 1.6356137489115658e-06,
"loss": 0.0405,
"num_input_tokens_seen": 1239064,
"step": 9595
},
{
"epoch": 8.96358543417367,
"grad_norm": 0.48260605335235596,
"learning_rate": 1.6211506841577185e-06,
"loss": 0.0288,
"num_input_tokens_seen": 1239720,
"step": 9600
},
{
"epoch": 8.968253968253968,
"grad_norm": 0.33404096961021423,
"learning_rate": 1.6067497061399179e-06,
"loss": 0.0085,
"num_input_tokens_seen": 1240376,
"step": 9605
},
{
"epoch": 8.972922502334267,
"grad_norm": 0.10782495886087418,
"learning_rate": 1.592410853102555e-06,
"loss": 0.0324,
"num_input_tokens_seen": 1241000,
"step": 9610
},
{
"epoch": 8.977591036414566,
"grad_norm": 6.687010288238525,
"learning_rate": 1.5781341631250224e-06,
"loss": 0.0385,
"num_input_tokens_seen": 1241704,
"step": 9615
},
{
"epoch": 8.982259570494865,
"grad_norm": 5.4267659187316895,
"learning_rate": 1.563919674121636e-06,
"loss": 0.0529,
"num_input_tokens_seen": 1242312,
"step": 9620
},
{
"epoch": 8.986928104575163,
"grad_norm": 0.3334493637084961,
"learning_rate": 1.5497674238415277e-06,
"loss": 0.0312,
"num_input_tokens_seen": 1242888,
"step": 9625
},
{
"epoch": 8.991596638655462,
"grad_norm": 12.44144344329834,
"learning_rate": 1.5356774498685417e-06,
"loss": 0.1086,
"num_input_tokens_seen": 1243480,
"step": 9630
},
{
"epoch": 8.996265172735761,
"grad_norm": 1.71649169921875,
"learning_rate": 1.521649789621138e-06,
"loss": 0.02,
"num_input_tokens_seen": 1244120,
"step": 9635
},
{
"epoch": 9.00093370681606,
"grad_norm": 0.35185933113098145,
"learning_rate": 1.5076844803522922e-06,
"loss": 0.0347,
"num_input_tokens_seen": 1244768,
"step": 9640
},
{
"epoch": 9.005602240896359,
"grad_norm": 2.0746331214904785,
"learning_rate": 1.4937815591493848e-06,
"loss": 0.0369,
"num_input_tokens_seen": 1245392,
"step": 9645
},
{
"epoch": 9.008403361344538,
"eval_loss": 1.1851145029067993,
"eval_runtime": 3.8629,
"eval_samples_per_second": 61.611,
"eval_steps_per_second": 30.806,
"num_input_tokens_seen": 1245744,
"step": 9648
},
{
"epoch": 9.010270774976657,
"grad_norm": 2.7271130084991455,
"learning_rate": 1.4799410629341315e-06,
"loss": 0.0366,
"num_input_tokens_seen": 1245968,
"step": 9650
},
{
"epoch": 9.014939309056956,
"grad_norm": 0.29737892746925354,
"learning_rate": 1.4661630284624444e-06,
"loss": 0.0343,
"num_input_tokens_seen": 1246640,
"step": 9655
},
{
"epoch": 9.019607843137255,
"grad_norm": 0.14246688783168793,
"learning_rate": 1.4524474923243825e-06,
"loss": 0.0131,
"num_input_tokens_seen": 1247232,
"step": 9660
},
{
"epoch": 9.024276377217554,
"grad_norm": 0.5064313411712646,
"learning_rate": 1.438794490944012e-06,
"loss": 0.0371,
"num_input_tokens_seen": 1247872,
"step": 9665
},
{
"epoch": 9.028944911297852,
"grad_norm": 7.1344404220581055,
"learning_rate": 1.4252040605793327e-06,
"loss": 0.1093,
"num_input_tokens_seen": 1248528,
"step": 9670
},
{
"epoch": 9.033613445378151,
"grad_norm": 2.0987865924835205,
"learning_rate": 1.411676237322171e-06,
"loss": 0.0118,
"num_input_tokens_seen": 1249216,
"step": 9675
},
{
"epoch": 9.03828197945845,
"grad_norm": 0.6497586369514465,
"learning_rate": 1.3982110570980978e-06,
"loss": 0.0099,
"num_input_tokens_seen": 1249824,
"step": 9680
},
{
"epoch": 9.042950513538749,
"grad_norm": 2.457868814468384,
"learning_rate": 1.3848085556663198e-06,
"loss": 0.0787,
"num_input_tokens_seen": 1250512,
"step": 9685
},
{
"epoch": 9.047619047619047,
"grad_norm": 0.06839052587747574,
"learning_rate": 1.3714687686195827e-06,
"loss": 0.0144,
"num_input_tokens_seen": 1251248,
"step": 9690
},
{
"epoch": 9.052287581699346,
"grad_norm": 0.15281909704208374,
"learning_rate": 1.358191731384098e-06,
"loss": 0.0291,
"num_input_tokens_seen": 1251952,
"step": 9695
},
{
"epoch": 9.056956115779645,
"grad_norm": 1.8181029558181763,
"learning_rate": 1.3449774792194114e-06,
"loss": 0.0161,
"num_input_tokens_seen": 1252624,
"step": 9700
},
{
"epoch": 9.061624649859944,
"grad_norm": 2.0059690475463867,
"learning_rate": 1.3318260472183514e-06,
"loss": 0.0067,
"num_input_tokens_seen": 1253344,
"step": 9705
},
{
"epoch": 9.066293183940243,
"grad_norm": 0.31657326221466064,
"learning_rate": 1.3187374703069105e-06,
"loss": 0.0784,
"num_input_tokens_seen": 1253920,
"step": 9710
},
{
"epoch": 9.070961718020541,
"grad_norm": 1.749794840812683,
"learning_rate": 1.3057117832441567e-06,
"loss": 0.0428,
"num_input_tokens_seen": 1254560,
"step": 9715
},
{
"epoch": 9.07563025210084,
"grad_norm": 3.9665474891662598,
"learning_rate": 1.2927490206221388e-06,
"loss": 0.0354,
"num_input_tokens_seen": 1255168,
"step": 9720
},
{
"epoch": 9.080298786181139,
"grad_norm": 0.1431313157081604,
"learning_rate": 1.2798492168658083e-06,
"loss": 0.0189,
"num_input_tokens_seen": 1255888,
"step": 9725
},
{
"epoch": 9.084967320261438,
"grad_norm": 5.252475738525391,
"learning_rate": 1.267012406232909e-06,
"loss": 0.097,
"num_input_tokens_seen": 1256464,
"step": 9730
},
{
"epoch": 9.089635854341736,
"grad_norm": 0.1456415206193924,
"learning_rate": 1.2542386228138997e-06,
"loss": 0.0019,
"num_input_tokens_seen": 1257120,
"step": 9735
},
{
"epoch": 9.094304388422035,
"grad_norm": 0.6008406281471252,
"learning_rate": 1.241527900531858e-06,
"loss": 0.0025,
"num_input_tokens_seen": 1257776,
"step": 9740
},
{
"epoch": 9.098972922502334,
"grad_norm": 0.1887224167585373,
"learning_rate": 1.2288802731423883e-06,
"loss": 0.0828,
"num_input_tokens_seen": 1258464,
"step": 9745
},
{
"epoch": 9.103641456582633,
"grad_norm": 12.766477584838867,
"learning_rate": 1.2162957742335418e-06,
"loss": 0.1032,
"num_input_tokens_seen": 1259088,
"step": 9750
},
{
"epoch": 9.108309990662931,
"grad_norm": 6.654147148132324,
"learning_rate": 1.203774437225716e-06,
"loss": 0.0534,
"num_input_tokens_seen": 1259696,
"step": 9755
},
{
"epoch": 9.11297852474323,
"grad_norm": 0.9072001576423645,
"learning_rate": 1.1913162953715695e-06,
"loss": 0.0488,
"num_input_tokens_seen": 1260336,
"step": 9760
},
{
"epoch": 9.117647058823529,
"grad_norm": 0.38549622893333435,
"learning_rate": 1.1789213817559458e-06,
"loss": 0.2404,
"num_input_tokens_seen": 1260992,
"step": 9765
},
{
"epoch": 9.122315592903828,
"grad_norm": 1.1125404834747314,
"learning_rate": 1.1665897292957556e-06,
"loss": 0.0516,
"num_input_tokens_seen": 1261616,
"step": 9770
},
{
"epoch": 9.126984126984127,
"grad_norm": 0.5928016304969788,
"learning_rate": 1.154321370739922e-06,
"loss": 0.0345,
"num_input_tokens_seen": 1262288,
"step": 9775
},
{
"epoch": 9.131652661064425,
"grad_norm": 0.19205573201179504,
"learning_rate": 1.1421163386692719e-06,
"loss": 0.0079,
"num_input_tokens_seen": 1262944,
"step": 9780
},
{
"epoch": 9.136321195144724,
"grad_norm": 1.2952364683151245,
"learning_rate": 1.1299746654964721e-06,
"loss": 0.0172,
"num_input_tokens_seen": 1263632,
"step": 9785
},
{
"epoch": 9.140989729225023,
"grad_norm": 0.17227812111377716,
"learning_rate": 1.117896383465905e-06,
"loss": 0.0222,
"num_input_tokens_seen": 1264320,
"step": 9790
},
{
"epoch": 9.145658263305322,
"grad_norm": 0.37296009063720703,
"learning_rate": 1.1058815246536263e-06,
"loss": 0.0033,
"num_input_tokens_seen": 1264928,
"step": 9795
},
{
"epoch": 9.15032679738562,
"grad_norm": 4.000223159790039,
"learning_rate": 1.0939301209672543e-06,
"loss": 0.0457,
"num_input_tokens_seen": 1265456,
"step": 9800
},
{
"epoch": 9.15499533146592,
"grad_norm": 0.1993858367204666,
"learning_rate": 1.0820422041458834e-06,
"loss": 0.0229,
"num_input_tokens_seen": 1266064,
"step": 9805
},
{
"epoch": 9.159663865546218,
"grad_norm": 1.7776328325271606,
"learning_rate": 1.070217805760021e-06,
"loss": 0.1402,
"num_input_tokens_seen": 1266640,
"step": 9810
},
{
"epoch": 9.164332399626517,
"grad_norm": 0.3360730707645416,
"learning_rate": 1.0584569572114789e-06,
"loss": 0.0118,
"num_input_tokens_seen": 1267232,
"step": 9815
},
{
"epoch": 9.169000933706815,
"grad_norm": 4.538324356079102,
"learning_rate": 1.046759689733301e-06,
"loss": 0.0666,
"num_input_tokens_seen": 1267824,
"step": 9820
},
{
"epoch": 9.173669467787114,
"grad_norm": 4.662381649017334,
"learning_rate": 1.0351260343896828e-06,
"loss": 0.077,
"num_input_tokens_seen": 1268464,
"step": 9825
},
{
"epoch": 9.178338001867413,
"grad_norm": 5.578917026519775,
"learning_rate": 1.0235560220758916e-06,
"loss": 0.0666,
"num_input_tokens_seen": 1269120,
"step": 9830
},
{
"epoch": 9.183006535947712,
"grad_norm": 0.07224404066801071,
"learning_rate": 1.0120496835181764e-06,
"loss": 0.0034,
"num_input_tokens_seen": 1269792,
"step": 9835
},
{
"epoch": 9.18767507002801,
"grad_norm": 7.480001449584961,
"learning_rate": 1.0006070492736775e-06,
"loss": 0.1353,
"num_input_tokens_seen": 1270352,
"step": 9840
},
{
"epoch": 9.19234360410831,
"grad_norm": 0.06923279166221619,
"learning_rate": 9.892281497303757e-07,
"loss": 0.0125,
"num_input_tokens_seen": 1270896,
"step": 9845
},
{
"epoch": 9.197012138188608,
"grad_norm": 2.548983573913574,
"learning_rate": 9.77913015106982e-07,
"loss": 0.0086,
"num_input_tokens_seen": 1271520,
"step": 9850
},
{
"epoch": 9.201680672268907,
"grad_norm": 0.0795988142490387,
"learning_rate": 9.66661675452865e-07,
"loss": 0.0109,
"num_input_tokens_seen": 1272128,
"step": 9855
},
{
"epoch": 9.206349206349206,
"grad_norm": 0.8964292407035828,
"learning_rate": 9.554741606479845e-07,
"loss": 0.0683,
"num_input_tokens_seen": 1272896,
"step": 9860
},
{
"epoch": 9.211017740429504,
"grad_norm": 0.06357522308826447,
"learning_rate": 9.443505004027936e-07,
"loss": 0.071,
"num_input_tokens_seen": 1273584,
"step": 9865
},
{
"epoch": 9.215686274509803,
"grad_norm": 0.20794254541397095,
"learning_rate": 9.332907242581735e-07,
"loss": 0.0026,
"num_input_tokens_seen": 1274288,
"step": 9870
},
{
"epoch": 9.220354808590102,
"grad_norm": 7.484773635864258,
"learning_rate": 9.222948615853433e-07,
"loss": 0.0368,
"num_input_tokens_seen": 1274848,
"step": 9875
},
{
"epoch": 9.2250233426704,
"grad_norm": 4.599846839904785,
"learning_rate": 9.113629415857999e-07,
"loss": 0.1142,
"num_input_tokens_seen": 1275536,
"step": 9880
},
{
"epoch": 9.2296918767507,
"grad_norm": 0.17251808941364288,
"learning_rate": 9.004949932912177e-07,
"loss": 0.0396,
"num_input_tokens_seen": 1276112,
"step": 9885
},
{
"epoch": 9.234360410830998,
"grad_norm": 0.1543770283460617,
"learning_rate": 8.896910455633844e-07,
"loss": 0.0196,
"num_input_tokens_seen": 1276704,
"step": 9890
},
{
"epoch": 9.239028944911297,
"grad_norm": 2.0087528228759766,
"learning_rate": 8.78951127094127e-07,
"loss": 0.1253,
"num_input_tokens_seen": 1277312,
"step": 9895
},
{
"epoch": 9.243697478991596,
"grad_norm": 0.400070458650589,
"learning_rate": 8.682752664052302e-07,
"loss": 0.0722,
"num_input_tokens_seen": 1277920,
"step": 9900
},
{
"epoch": 9.248366013071895,
"grad_norm": 0.5074196457862854,
"learning_rate": 8.576634918483567e-07,
"loss": 0.0762,
"num_input_tokens_seen": 1278624,
"step": 9905
},
{
"epoch": 9.253034547152193,
"grad_norm": 0.04042162373661995,
"learning_rate": 8.47115831604986e-07,
"loss": 0.0647,
"num_input_tokens_seen": 1279248,
"step": 9910
},
{
"epoch": 9.257703081232492,
"grad_norm": 2.7458279132843018,
"learning_rate": 8.366323136863225e-07,
"loss": 0.0108,
"num_input_tokens_seen": 1279904,
"step": 9915
},
{
"epoch": 9.262371615312793,
"grad_norm": 2.4689865112304688,
"learning_rate": 8.262129659332346e-07,
"loss": 0.0362,
"num_input_tokens_seen": 1280496,
"step": 9920
},
{
"epoch": 9.267040149393091,
"grad_norm": 2.182461738586426,
"learning_rate": 8.15857816016169e-07,
"loss": 0.0638,
"num_input_tokens_seen": 1281200,
"step": 9925
},
{
"epoch": 9.27170868347339,
"grad_norm": 1.7279038429260254,
"learning_rate": 8.055668914350916e-07,
"loss": 0.0492,
"num_input_tokens_seen": 1281776,
"step": 9930
},
{
"epoch": 9.276377217553689,
"grad_norm": 0.5922943949699402,
"learning_rate": 7.953402195193999e-07,
"loss": 0.0432,
"num_input_tokens_seen": 1282416,
"step": 9935
},
{
"epoch": 9.281045751633988,
"grad_norm": 0.8308282494544983,
"learning_rate": 7.851778274278576e-07,
"loss": 0.0086,
"num_input_tokens_seen": 1283088,
"step": 9940
},
{
"epoch": 9.285714285714286,
"grad_norm": 0.3864976167678833,
"learning_rate": 7.750797421485267e-07,
"loss": 0.0211,
"num_input_tokens_seen": 1283664,
"step": 9945
},
{
"epoch": 9.290382819794585,
"grad_norm": 6.857438087463379,
"learning_rate": 7.650459904986834e-07,
"loss": 0.0093,
"num_input_tokens_seen": 1284320,
"step": 9950
},
{
"epoch": 9.295051353874884,
"grad_norm": 10.91110610961914,
"learning_rate": 7.550765991247654e-07,
"loss": 0.1058,
"num_input_tokens_seen": 1284880,
"step": 9955
},
{
"epoch": 9.299719887955183,
"grad_norm": 3.3910160064697266,
"learning_rate": 7.451715945022752e-07,
"loss": 0.0478,
"num_input_tokens_seen": 1285456,
"step": 9960
},
{
"epoch": 9.304388422035482,
"grad_norm": 0.13573530316352844,
"learning_rate": 7.353310029357352e-07,
"loss": 0.0075,
"num_input_tokens_seen": 1286144,
"step": 9965
},
{
"epoch": 9.30905695611578,
"grad_norm": 0.33091700077056885,
"learning_rate": 7.255548505586074e-07,
"loss": 0.066,
"num_input_tokens_seen": 1286752,
"step": 9970
},
{
"epoch": 9.313725490196079,
"grad_norm": 0.1710100620985031,
"learning_rate": 7.158431633332241e-07,
"loss": 0.0434,
"num_input_tokens_seen": 1287440,
"step": 9975
},
{
"epoch": 9.318394024276378,
"grad_norm": 1.2184041738510132,
"learning_rate": 7.061959670507102e-07,
"loss": 0.0267,
"num_input_tokens_seen": 1288048,
"step": 9980
},
{
"epoch": 9.323062558356677,
"grad_norm": 0.09887643158435822,
"learning_rate": 6.966132873309273e-07,
"loss": 0.0602,
"num_input_tokens_seen": 1288688,
"step": 9985
},
{
"epoch": 9.327731092436975,
"grad_norm": 0.12310995906591415,
"learning_rate": 6.870951496224076e-07,
"loss": 0.0142,
"num_input_tokens_seen": 1289328,
"step": 9990
},
{
"epoch": 9.332399626517274,
"grad_norm": 0.08117198199033737,
"learning_rate": 6.776415792022789e-07,
"loss": 0.0431,
"num_input_tokens_seen": 1289936,
"step": 9995
},
{
"epoch": 9.337068160597573,
"grad_norm": 0.21514607965946198,
"learning_rate": 6.682526011761919e-07,
"loss": 0.003,
"num_input_tokens_seen": 1290656,
"step": 10000
},
{
"epoch": 9.341736694677872,
"grad_norm": 0.2976192831993103,
"learning_rate": 6.589282404782682e-07,
"loss": 0.0577,
"num_input_tokens_seen": 1291184,
"step": 10005
},
{
"epoch": 9.34640522875817,
"grad_norm": 3.206972122192383,
"learning_rate": 6.496685218710219e-07,
"loss": 0.0448,
"num_input_tokens_seen": 1291808,
"step": 10010
},
{
"epoch": 9.35107376283847,
"grad_norm": 0.09685748815536499,
"learning_rate": 6.404734699453018e-07,
"loss": 0.0254,
"num_input_tokens_seen": 1292464,
"step": 10015
},
{
"epoch": 9.355742296918768,
"grad_norm": 0.07187190651893616,
"learning_rate": 6.313431091202165e-07,
"loss": 0.0107,
"num_input_tokens_seen": 1293072,
"step": 10020
},
{
"epoch": 9.360410830999067,
"grad_norm": 0.08758264780044556,
"learning_rate": 6.222774636430811e-07,
"loss": 0.0651,
"num_input_tokens_seen": 1293648,
"step": 10025
},
{
"epoch": 9.365079365079366,
"grad_norm": 0.6055055856704712,
"learning_rate": 6.13276557589354e-07,
"loss": 0.061,
"num_input_tokens_seen": 1294224,
"step": 10030
},
{
"epoch": 9.369747899159664,
"grad_norm": 0.9085772037506104,
"learning_rate": 6.043404148625503e-07,
"loss": 0.0291,
"num_input_tokens_seen": 1294912,
"step": 10035
},
{
"epoch": 9.374416433239963,
"grad_norm": 3.352837324142456,
"learning_rate": 5.954690591942036e-07,
"loss": 0.0216,
"num_input_tokens_seen": 1295616,
"step": 10040
},
{
"epoch": 9.379084967320262,
"grad_norm": 0.11360262334346771,
"learning_rate": 5.866625141437959e-07,
"loss": 0.0207,
"num_input_tokens_seen": 1296256,
"step": 10045
},
{
"epoch": 9.38375350140056,
"grad_norm": 0.08144768327474594,
"learning_rate": 5.779208030986916e-07,
"loss": 0.1058,
"num_input_tokens_seen": 1296896,
"step": 10050
},
{
"epoch": 9.38842203548086,
"grad_norm": 0.12814949452877045,
"learning_rate": 5.692439492740759e-07,
"loss": 0.0058,
"num_input_tokens_seen": 1297472,
"step": 10055
},
{
"epoch": 9.393090569561158,
"grad_norm": 0.12964555621147156,
"learning_rate": 5.606319757128914e-07,
"loss": 0.0269,
"num_input_tokens_seen": 1297984,
"step": 10060
},
{
"epoch": 9.397759103641457,
"grad_norm": 0.6443309783935547,
"learning_rate": 5.520849052857768e-07,
"loss": 0.0085,
"num_input_tokens_seen": 1298592,
"step": 10065
},
{
"epoch": 9.402427637721756,
"grad_norm": 0.11005096882581711,
"learning_rate": 5.436027606910199e-07,
"loss": 0.002,
"num_input_tokens_seen": 1299216,
"step": 10070
},
{
"epoch": 9.407096171802054,
"grad_norm": 0.4922139644622803,
"learning_rate": 5.351855644544796e-07,
"loss": 0.0396,
"num_input_tokens_seen": 1299808,
"step": 10075
},
{
"epoch": 9.411764705882353,
"grad_norm": 6.919455051422119,
"learning_rate": 5.26833338929536e-07,
"loss": 0.1592,
"num_input_tokens_seen": 1300416,
"step": 10080
},
{
"epoch": 9.416433239962652,
"grad_norm": 0.36009007692337036,
"learning_rate": 5.18546106297016e-07,
"loss": 0.0093,
"num_input_tokens_seen": 1301200,
"step": 10085
},
{
"epoch": 9.42110177404295,
"grad_norm": 1.9589595794677734,
"learning_rate": 5.103238885651618e-07,
"loss": 0.1431,
"num_input_tokens_seen": 1301936,
"step": 10090
},
{
"epoch": 9.42577030812325,
"grad_norm": 0.031739529222249985,
"learning_rate": 5.021667075695541e-07,
"loss": 0.0309,
"num_input_tokens_seen": 1302576,
"step": 10095
},
{
"epoch": 9.430438842203548,
"grad_norm": 0.13981761038303375,
"learning_rate": 4.940745849730421e-07,
"loss": 0.0134,
"num_input_tokens_seen": 1303104,
"step": 10100
},
{
"epoch": 9.435107376283847,
"grad_norm": 1.46657395362854,
"learning_rate": 4.860475422657218e-07,
"loss": 0.0052,
"num_input_tokens_seen": 1303712,
"step": 10105
},
{
"epoch": 9.439775910364146,
"grad_norm": 4.406033515930176,
"learning_rate": 4.780856007648437e-07,
"loss": 0.0946,
"num_input_tokens_seen": 1304320,
"step": 10110
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.8488879203796387,
"learning_rate": 4.701887816147721e-07,
"loss": 0.0124,
"num_input_tokens_seen": 1304960,
"step": 10115
},
{
"epoch": 9.449112978524743,
"grad_norm": 0.2057238519191742,
"learning_rate": 4.6235710578693135e-07,
"loss": 0.0745,
"num_input_tokens_seen": 1305632,
"step": 10120
},
{
"epoch": 9.453781512605042,
"grad_norm": 3.9333977699279785,
"learning_rate": 4.545905940797457e-07,
"loss": 0.0357,
"num_input_tokens_seen": 1306304,
"step": 10125
},
{
"epoch": 9.458450046685341,
"grad_norm": 0.4153412878513336,
"learning_rate": 4.468892671185831e-07,
"loss": 0.0199,
"num_input_tokens_seen": 1306960,
"step": 10130
},
{
"epoch": 9.46311858076564,
"grad_norm": 0.0644126683473587,
"learning_rate": 4.392531453556975e-07,
"loss": 0.0108,
"num_input_tokens_seen": 1307728,
"step": 10135
},
{
"epoch": 9.467787114845938,
"grad_norm": 1.2335668802261353,
"learning_rate": 4.316822490701866e-07,
"loss": 0.0149,
"num_input_tokens_seen": 1308288,
"step": 10140
},
{
"epoch": 9.472455648926237,
"grad_norm": 0.025622710585594177,
"learning_rate": 4.24176598367923e-07,
"loss": 0.051,
"num_input_tokens_seen": 1308912,
"step": 10145
},
{
"epoch": 9.477124183006536,
"grad_norm": 3.5109171867370605,
"learning_rate": 4.16736213181515e-07,
"loss": 0.0612,
"num_input_tokens_seen": 1309648,
"step": 10150
},
{
"epoch": 9.481792717086835,
"grad_norm": 0.17764407396316528,
"learning_rate": 4.0936111327024017e-07,
"loss": 0.034,
"num_input_tokens_seen": 1310240,
"step": 10155
},
{
"epoch": 9.486461251167134,
"grad_norm": 0.03151076287031174,
"learning_rate": 4.0205131822000087e-07,
"loss": 0.0195,
"num_input_tokens_seen": 1310944,
"step": 10160
},
{
"epoch": 9.491129785247432,
"grad_norm": 0.14204958081245422,
"learning_rate": 3.948068474432715e-07,
"loss": 0.0507,
"num_input_tokens_seen": 1311600,
"step": 10165
},
{
"epoch": 9.495798319327731,
"grad_norm": 2.0150723457336426,
"learning_rate": 3.876277201790485e-07,
"loss": 0.0336,
"num_input_tokens_seen": 1312160,
"step": 10170
},
{
"epoch": 9.50046685340803,
"grad_norm": 0.4125816226005554,
"learning_rate": 3.80513955492795e-07,
"loss": 0.0039,
"num_input_tokens_seen": 1312976,
"step": 10175
},
{
"epoch": 9.505135387488329,
"grad_norm": 3.8984930515289307,
"learning_rate": 3.7346557227639077e-07,
"loss": 0.0468,
"num_input_tokens_seen": 1313584,
"step": 10180
},
{
"epoch": 9.508870214752568,
"eval_loss": 1.2354722023010254,
"eval_runtime": 3.869,
"eval_samples_per_second": 61.515,
"eval_steps_per_second": 30.757,
"num_input_tokens_seen": 1314112,
"step": 10184
},
{
"epoch": 9.509803921568627,
"grad_norm": 0.03488253802061081,
"learning_rate": 3.6648258924807944e-07,
"loss": 0.0222,
"num_input_tokens_seen": 1314272,
"step": 10185
},
{
"epoch": 9.514472455648926,
"grad_norm": 0.47239649295806885,
"learning_rate": 3.5956502495243517e-07,
"loss": 0.0108,
"num_input_tokens_seen": 1315024,
"step": 10190
},
{
"epoch": 9.519140989729225,
"grad_norm": 0.2930271327495575,
"learning_rate": 3.5271289776028503e-07,
"loss": 0.3299,
"num_input_tokens_seen": 1315552,
"step": 10195
},
{
"epoch": 9.523809523809524,
"grad_norm": 10.508183479309082,
"learning_rate": 3.4592622586869517e-07,
"loss": 0.0492,
"num_input_tokens_seen": 1316192,
"step": 10200
},
{
"epoch": 9.528478057889822,
"grad_norm": 6.849645137786865,
"learning_rate": 3.3920502730088176e-07,
"loss": 0.0533,
"num_input_tokens_seen": 1316784,
"step": 10205
},
{
"epoch": 9.533146591970121,
"grad_norm": 1.7187350988388062,
"learning_rate": 3.3254931990620017e-07,
"loss": 0.038,
"num_input_tokens_seen": 1317344,
"step": 10210
},
{
"epoch": 9.53781512605042,
"grad_norm": 0.098175048828125,
"learning_rate": 3.2595912136007543e-07,
"loss": 0.052,
"num_input_tokens_seen": 1318048,
"step": 10215
},
{
"epoch": 9.542483660130719,
"grad_norm": 5.587791442871094,
"learning_rate": 3.1943444916396894e-07,
"loss": 0.0975,
"num_input_tokens_seen": 1318816,
"step": 10220
},
{
"epoch": 9.547152194211018,
"grad_norm": 0.12869498133659363,
"learning_rate": 3.129753206453201e-07,
"loss": 0.1018,
"num_input_tokens_seen": 1319456,
"step": 10225
},
{
"epoch": 9.551820728291316,
"grad_norm": 0.026377171277999878,
"learning_rate": 3.0658175295749656e-07,
"loss": 0.009,
"num_input_tokens_seen": 1320096,
"step": 10230
},
{
"epoch": 9.556489262371615,
"grad_norm": 1.9462381601333618,
"learning_rate": 3.002537630797747e-07,
"loss": 0.0132,
"num_input_tokens_seen": 1320704,
"step": 10235
},
{
"epoch": 9.561157796451914,
"grad_norm": 0.13072355091571808,
"learning_rate": 2.9399136781726735e-07,
"loss": 0.0283,
"num_input_tokens_seen": 1321408,
"step": 10240
},
{
"epoch": 9.565826330532213,
"grad_norm": 0.4914097487926483,
"learning_rate": 2.877945838008905e-07,
"loss": 0.0606,
"num_input_tokens_seen": 1322048,
"step": 10245
},
{
"epoch": 9.570494864612511,
"grad_norm": 0.8193666338920593,
"learning_rate": 2.816634274873192e-07,
"loss": 0.0388,
"num_input_tokens_seen": 1322720,
"step": 10250
},
{
"epoch": 9.57516339869281,
"grad_norm": 3.4000868797302246,
"learning_rate": 2.7559791515893717e-07,
"loss": 0.0181,
"num_input_tokens_seen": 1323376,
"step": 10255
},
{
"epoch": 9.579831932773109,
"grad_norm": 0.41794702410697937,
"learning_rate": 2.695980629238065e-07,
"loss": 0.0244,
"num_input_tokens_seen": 1324000,
"step": 10260
},
{
"epoch": 9.584500466853408,
"grad_norm": 1.3658123016357422,
"learning_rate": 2.6366388671560936e-07,
"loss": 0.0139,
"num_input_tokens_seen": 1324624,
"step": 10265
},
{
"epoch": 9.589169000933706,
"grad_norm": 1.1461269855499268,
"learning_rate": 2.5779540229361745e-07,
"loss": 0.0083,
"num_input_tokens_seen": 1325264,
"step": 10270
},
{
"epoch": 9.593837535014005,
"grad_norm": 2.883629083633423,
"learning_rate": 2.5199262524265023e-07,
"loss": 0.01,
"num_input_tokens_seen": 1325840,
"step": 10275
},
{
"epoch": 9.598506069094304,
"grad_norm": 0.3690735399723053,
"learning_rate": 2.462555709730197e-07,
"loss": 0.0113,
"num_input_tokens_seen": 1326432,
"step": 10280
},
{
"epoch": 9.603174603174603,
"grad_norm": 1.3626866340637207,
"learning_rate": 2.4058425472050785e-07,
"loss": 0.0555,
"num_input_tokens_seen": 1327072,
"step": 10285
},
{
"epoch": 9.607843137254902,
"grad_norm": 8.382253646850586,
"learning_rate": 2.3497869154631147e-07,
"loss": 0.0422,
"num_input_tokens_seen": 1327744,
"step": 10290
},
{
"epoch": 9.6125116713352,
"grad_norm": 2.498363494873047,
"learning_rate": 2.2943889633701698e-07,
"loss": 0.0125,
"num_input_tokens_seen": 1328368,
"step": 10295
},
{
"epoch": 9.6171802054155,
"grad_norm": 2.3475382328033447,
"learning_rate": 2.239648838045394e-07,
"loss": 0.0273,
"num_input_tokens_seen": 1329088,
"step": 10300
},
{
"epoch": 9.621848739495798,
"grad_norm": 0.2977651357650757,
"learning_rate": 2.1855666848610845e-07,
"loss": 0.0061,
"num_input_tokens_seen": 1329760,
"step": 10305
},
{
"epoch": 9.626517273576097,
"grad_norm": 0.7848034501075745,
"learning_rate": 2.132142647442048e-07,
"loss": 0.043,
"num_input_tokens_seen": 1330416,
"step": 10310
},
{
"epoch": 9.631185807656395,
"grad_norm": 0.05007505044341087,
"learning_rate": 2.079376867665489e-07,
"loss": 0.1435,
"num_input_tokens_seen": 1331072,
"step": 10315
},
{
"epoch": 9.635854341736694,
"grad_norm": 2.539809226989746,
"learning_rate": 2.0272694856603991e-07,
"loss": 0.0857,
"num_input_tokens_seen": 1331792,
"step": 10320
},
{
"epoch": 9.640522875816993,
"grad_norm": 4.598158359527588,
"learning_rate": 1.975820639807252e-07,
"loss": 0.0913,
"num_input_tokens_seen": 1332448,
"step": 10325
},
{
"epoch": 9.645191409897292,
"grad_norm": 0.18733039498329163,
"learning_rate": 1.925030466737754e-07,
"loss": 0.174,
"num_input_tokens_seen": 1333072,
"step": 10330
},
{
"epoch": 9.64985994397759,
"grad_norm": 0.0705503523349762,
"learning_rate": 1.8748991013343152e-07,
"loss": 0.063,
"num_input_tokens_seen": 1333680,
"step": 10335
},
{
"epoch": 9.65452847805789,
"grad_norm": 0.14228954911231995,
"learning_rate": 1.8254266767298023e-07,
"loss": 0.0043,
"num_input_tokens_seen": 1334368,
"step": 10340
},
{
"epoch": 9.659197012138188,
"grad_norm": 0.3317956328392029,
"learning_rate": 1.7766133243071192e-07,
"loss": 0.0349,
"num_input_tokens_seen": 1335072,
"step": 10345
},
{
"epoch": 9.663865546218487,
"grad_norm": 5.647100925445557,
"learning_rate": 1.7284591736989042e-07,
"loss": 0.0193,
"num_input_tokens_seen": 1335648,
"step": 10350
},
{
"epoch": 9.668534080298786,
"grad_norm": 3.1891496181488037,
"learning_rate": 1.6809643527871398e-07,
"loss": 0.0087,
"num_input_tokens_seen": 1336368,
"step": 10355
},
{
"epoch": 9.673202614379084,
"grad_norm": 0.18910200893878937,
"learning_rate": 1.6341289877028486e-07,
"loss": 0.0471,
"num_input_tokens_seen": 1336944,
"step": 10360
},
{
"epoch": 9.677871148459383,
"grad_norm": 0.21139132976531982,
"learning_rate": 1.5879532028258148e-07,
"loss": 0.006,
"num_input_tokens_seen": 1337584,
"step": 10365
},
{
"epoch": 9.682539682539682,
"grad_norm": 0.3320813775062561,
"learning_rate": 1.5424371207841127e-07,
"loss": 0.0732,
"num_input_tokens_seen": 1338272,
"step": 10370
},
{
"epoch": 9.68720821661998,
"grad_norm": 0.11079546809196472,
"learning_rate": 1.497580862453829e-07,
"loss": 0.0501,
"num_input_tokens_seen": 1338944,
"step": 10375
},
{
"epoch": 9.69187675070028,
"grad_norm": 0.1472056806087494,
"learning_rate": 1.453384546958869e-07,
"loss": 0.0039,
"num_input_tokens_seen": 1339552,
"step": 10380
},
{
"epoch": 9.696545284780578,
"grad_norm": 8.740482330322266,
"learning_rate": 1.4098482916705126e-07,
"loss": 0.0157,
"num_input_tokens_seen": 1340224,
"step": 10385
},
{
"epoch": 9.701213818860877,
"grad_norm": 3.24934458732605,
"learning_rate": 1.3669722122070516e-07,
"loss": 0.1227,
"num_input_tokens_seen": 1340768,
"step": 10390
},
{
"epoch": 9.705882352941176,
"grad_norm": 0.5039592385292053,
"learning_rate": 1.324756422433654e-07,
"loss": 0.0288,
"num_input_tokens_seen": 1341456,
"step": 10395
},
{
"epoch": 9.710550887021475,
"grad_norm": 5.067893981933594,
"learning_rate": 1.283201034461917e-07,
"loss": 0.0439,
"num_input_tokens_seen": 1342096,
"step": 10400
},
{
"epoch": 9.715219421101773,
"grad_norm": 0.04172622784972191,
"learning_rate": 1.2423061586496477e-07,
"loss": 0.0044,
"num_input_tokens_seen": 1342720,
"step": 10405
},
{
"epoch": 9.719887955182072,
"grad_norm": 0.34393176436424255,
"learning_rate": 1.2020719036005545e-07,
"loss": 0.0047,
"num_input_tokens_seen": 1343360,
"step": 10410
},
{
"epoch": 9.72455648926237,
"grad_norm": 1.6086621284484863,
"learning_rate": 1.1624983761639174e-07,
"loss": 0.0068,
"num_input_tokens_seen": 1344000,
"step": 10415
},
{
"epoch": 9.72922502334267,
"grad_norm": 0.15175510942935944,
"learning_rate": 1.1235856814343914e-07,
"loss": 0.0133,
"num_input_tokens_seen": 1344688,
"step": 10420
},
{
"epoch": 9.733893557422968,
"grad_norm": 0.01866198517382145,
"learning_rate": 1.0853339227515635e-07,
"loss": 0.086,
"num_input_tokens_seen": 1345392,
"step": 10425
},
{
"epoch": 9.738562091503269,
"grad_norm": 0.1492108702659607,
"learning_rate": 1.0477432016998967e-07,
"loss": 0.02,
"num_input_tokens_seen": 1346032,
"step": 10430
},
{
"epoch": 9.743230625583568,
"grad_norm": 2.7389090061187744,
"learning_rate": 1.0108136181082862e-07,
"loss": 0.0313,
"num_input_tokens_seen": 1346624,
"step": 10435
},
{
"epoch": 9.747899159663866,
"grad_norm": 0.3672071397304535,
"learning_rate": 9.745452700498925e-08,
"loss": 0.0049,
"num_input_tokens_seen": 1347248,
"step": 10440
},
{
"epoch": 9.752567693744165,
"grad_norm": 0.11658532172441483,
"learning_rate": 9.38938253841809e-08,
"loss": 0.0028,
"num_input_tokens_seen": 1347856,
"step": 10445
},
{
"epoch": 9.757236227824464,
"grad_norm": 0.11300468444824219,
"learning_rate": 9.039926640449226e-08,
"loss": 0.0454,
"num_input_tokens_seen": 1348480,
"step": 10450
},
{
"epoch": 9.761904761904763,
"grad_norm": 0.8278319239616394,
"learning_rate": 8.697085934634696e-08,
"loss": 0.0477,
"num_input_tokens_seen": 1349216,
"step": 10455
},
{
"epoch": 9.766573295985062,
"grad_norm": 0.8841646313667297,
"learning_rate": 8.36086133144981e-08,
"loss": 0.0395,
"num_input_tokens_seen": 1349856,
"step": 10460
},
{
"epoch": 9.77124183006536,
"grad_norm": 0.09983119368553162,
"learning_rate": 8.03125372379948e-08,
"loss": 0.0047,
"num_input_tokens_seen": 1350480,
"step": 10465
},
{
"epoch": 9.775910364145659,
"grad_norm": 0.1856643557548523,
"learning_rate": 7.70826398701574e-08,
"loss": 0.0027,
"num_input_tokens_seen": 1351152,
"step": 10470
},
{
"epoch": 9.780578898225958,
"grad_norm": 0.3595370352268219,
"learning_rate": 7.391892978856341e-08,
"loss": 0.0141,
"num_input_tokens_seen": 1351856,
"step": 10475
},
{
"epoch": 9.785247432306257,
"grad_norm": 0.802082896232605,
"learning_rate": 7.082141539500597e-08,
"loss": 0.032,
"num_input_tokens_seen": 1352560,
"step": 10480
},
{
"epoch": 9.789915966386555,
"grad_norm": 0.0799412950873375,
"learning_rate": 6.779010491549942e-08,
"loss": 0.0188,
"num_input_tokens_seen": 1353200,
"step": 10485
},
{
"epoch": 9.794584500466854,
"grad_norm": 0.3375213146209717,
"learning_rate": 6.482500640022926e-08,
"loss": 0.0975,
"num_input_tokens_seen": 1353792,
"step": 10490
},
{
"epoch": 9.799253034547153,
"grad_norm": 2.9539597034454346,
"learning_rate": 6.192612772354945e-08,
"loss": 0.0147,
"num_input_tokens_seen": 1354464,
"step": 10495
},
{
"epoch": 9.803921568627452,
"grad_norm": 0.9493117332458496,
"learning_rate": 5.909347658394904e-08,
"loss": 0.083,
"num_input_tokens_seen": 1355184,
"step": 10500
},
{
"epoch": 9.80859010270775,
"grad_norm": 0.3258625566959381,
"learning_rate": 5.632706050404668e-08,
"loss": 0.0032,
"num_input_tokens_seen": 1355856,
"step": 10505
},
{
"epoch": 9.81325863678805,
"grad_norm": 4.085659503936768,
"learning_rate": 5.3626886830557274e-08,
"loss": 0.0413,
"num_input_tokens_seen": 1356592,
"step": 10510
},
{
"epoch": 9.817927170868348,
"grad_norm": 0.2234000712633133,
"learning_rate": 5.099296273427534e-08,
"loss": 0.0795,
"num_input_tokens_seen": 1357120,
"step": 10515
},
{
"epoch": 9.822595704948647,
"grad_norm": 2.338752031326294,
"learning_rate": 4.8425295210058344e-08,
"loss": 0.0692,
"num_input_tokens_seen": 1357728,
"step": 10520
},
{
"epoch": 9.827264239028946,
"grad_norm": 0.4640015959739685,
"learning_rate": 4.592389107681283e-08,
"loss": 0.0085,
"num_input_tokens_seen": 1358496,
"step": 10525
},
{
"epoch": 9.831932773109244,
"grad_norm": 0.17430801689624786,
"learning_rate": 4.3488756977463906e-08,
"loss": 0.0424,
"num_input_tokens_seen": 1359072,
"step": 10530
},
{
"epoch": 9.836601307189543,
"grad_norm": 9.863551139831543,
"learning_rate": 4.111989937894967e-08,
"loss": 0.0857,
"num_input_tokens_seen": 1359744,
"step": 10535
},
{
"epoch": 9.841269841269842,
"grad_norm": 0.20236068964004517,
"learning_rate": 3.881732457219622e-08,
"loss": 0.0136,
"num_input_tokens_seen": 1360400,
"step": 10540
},
{
"epoch": 9.84593837535014,
"grad_norm": 2.3232908248901367,
"learning_rate": 3.65810386721066e-08,
"loss": 0.0933,
"num_input_tokens_seen": 1361040,
"step": 10545
},
{
"epoch": 9.85060690943044,
"grad_norm": 0.05601672828197479,
"learning_rate": 3.441104761753578e-08,
"loss": 0.0633,
"num_input_tokens_seen": 1361712,
"step": 10550
},
{
"epoch": 9.855275443510738,
"grad_norm": 0.11657658964395523,
"learning_rate": 3.230735717129063e-08,
"loss": 0.0287,
"num_input_tokens_seen": 1362304,
"step": 10555
},
{
"epoch": 9.859943977591037,
"grad_norm": 3.281459093093872,
"learning_rate": 3.026997292009392e-08,
"loss": 0.0783,
"num_input_tokens_seen": 1362912,
"step": 10560
},
{
"epoch": 9.864612511671336,
"grad_norm": 0.4836415946483612,
"learning_rate": 2.8298900274589813e-08,
"loss": 0.0312,
"num_input_tokens_seen": 1363552,
"step": 10565
},
{
"epoch": 9.869281045751634,
"grad_norm": 2.9066414833068848,
"learning_rate": 2.6394144469310543e-08,
"loss": 0.0595,
"num_input_tokens_seen": 1364240,
"step": 10570
},
{
"epoch": 9.873949579831933,
"grad_norm": 1.6726226806640625,
"learning_rate": 2.4555710562684796e-08,
"loss": 0.0522,
"num_input_tokens_seen": 1364864,
"step": 10575
},
{
"epoch": 9.878618113912232,
"grad_norm": 0.22271433472633362,
"learning_rate": 2.2783603436998813e-08,
"loss": 0.0326,
"num_input_tokens_seen": 1365488,
"step": 10580
},
{
"epoch": 9.88328664799253,
"grad_norm": 0.06830593198537827,
"learning_rate": 2.1077827798404726e-08,
"loss": 0.0053,
"num_input_tokens_seen": 1366192,
"step": 10585
},
{
"epoch": 9.88795518207283,
"grad_norm": 1.006521463394165,
"learning_rate": 1.943838817689281e-08,
"loss": 0.0255,
"num_input_tokens_seen": 1366928,
"step": 10590
},
{
"epoch": 9.892623716153128,
"grad_norm": 0.22948722541332245,
"learning_rate": 1.786528892629147e-08,
"loss": 0.0561,
"num_input_tokens_seen": 1367648,
"step": 10595
},
{
"epoch": 9.897292250233427,
"grad_norm": 0.5100280046463013,
"learning_rate": 1.6358534224250598e-08,
"loss": 0.1655,
"num_input_tokens_seen": 1368304,
"step": 10600
},
{
"epoch": 9.901960784313726,
"grad_norm": 0.16577644646167755,
"learning_rate": 1.4918128072224924e-08,
"loss": 0.0217,
"num_input_tokens_seen": 1368928,
"step": 10605
},
{
"epoch": 9.906629318394025,
"grad_norm": 1.0775461196899414,
"learning_rate": 1.3544074295473996e-08,
"loss": 0.0552,
"num_input_tokens_seen": 1369584,
"step": 10610
},
{
"epoch": 9.911297852474323,
"grad_norm": 0.1106322854757309,
"learning_rate": 1.2236376543042772e-08,
"loss": 0.0232,
"num_input_tokens_seen": 1370208,
"step": 10615
},
{
"epoch": 9.915966386554622,
"grad_norm": 1.7885626554489136,
"learning_rate": 1.099503828775883e-08,
"loss": 0.0076,
"num_input_tokens_seen": 1370896,
"step": 10620
},
{
"epoch": 9.920634920634921,
"grad_norm": 3.9994475841522217,
"learning_rate": 9.820062826218502e-09,
"loss": 0.0439,
"num_input_tokens_seen": 1371536,
"step": 10625
},
{
"epoch": 9.92530345471522,
"grad_norm": 0.17582900822162628,
"learning_rate": 8.711453278778536e-09,
"loss": 0.0037,
"num_input_tokens_seen": 1372144,
"step": 10630
},
{
"epoch": 9.929971988795518,
"grad_norm": 0.2206035703420639,
"learning_rate": 7.669212589556108e-09,
"loss": 0.0134,
"num_input_tokens_seen": 1372768,
"step": 10635
},
{
"epoch": 9.934640522875817,
"grad_norm": 0.08980569988489151,
"learning_rate": 6.693343526403828e-09,
"loss": 0.0234,
"num_input_tokens_seen": 1373472,
"step": 10640
},
{
"epoch": 9.939309056956116,
"grad_norm": 0.33472374081611633,
"learning_rate": 5.78384868091808e-09,
"loss": 0.0795,
"num_input_tokens_seen": 1374080,
"step": 10645
},
{
"epoch": 9.943977591036415,
"grad_norm": 0.08384134620428085,
"learning_rate": 4.940730468427912e-09,
"loss": 0.02,
"num_input_tokens_seen": 1374736,
"step": 10650
},
{
"epoch": 9.948646125116714,
"grad_norm": 0.3119681477546692,
"learning_rate": 4.163991127983935e-09,
"loss": 0.0369,
"num_input_tokens_seen": 1375456,
"step": 10655
},
{
"epoch": 9.953314659197012,
"grad_norm": 0.5931786894798279,
"learning_rate": 3.453632722358324e-09,
"loss": 0.023,
"num_input_tokens_seen": 1376032,
"step": 10660
},
{
"epoch": 9.957983193277311,
"grad_norm": 3.6127638816833496,
"learning_rate": 2.8096571380309413e-09,
"loss": 0.0376,
"num_input_tokens_seen": 1376720,
"step": 10665
},
{
"epoch": 9.96265172735761,
"grad_norm": 0.20840921998023987,
"learning_rate": 2.232066085200435e-09,
"loss": 0.0043,
"num_input_tokens_seen": 1377392,
"step": 10670
},
{
"epoch": 9.967320261437909,
"grad_norm": 0.34955111145973206,
"learning_rate": 1.7208610977620388e-09,
"loss": 0.0231,
"num_input_tokens_seen": 1378096,
"step": 10675
},
{
"epoch": 9.971988795518207,
"grad_norm": 6.421323776245117,
"learning_rate": 1.2760435333103448e-09,
"loss": 0.0984,
"num_input_tokens_seen": 1378800,
"step": 10680
},
{
"epoch": 9.976657329598506,
"grad_norm": 5.146061420440674,
"learning_rate": 8.976145731393049e-10,
"loss": 0.0277,
"num_input_tokens_seen": 1379520,
"step": 10685
},
{
"epoch": 9.981325863678805,
"grad_norm": 0.22845172882080078,
"learning_rate": 5.855752222366783e-10,
"loss": 0.0465,
"num_input_tokens_seen": 1380032,
"step": 10690
},
{
"epoch": 9.985994397759104,
"grad_norm": 1.768675684928894,
"learning_rate": 3.3992630927848213e-10,
"loss": 0.0314,
"num_input_tokens_seen": 1380720,
"step": 10695
},
{
"epoch": 9.990662931839402,
"grad_norm": 0.8772925734519958,
"learning_rate": 1.6066848662621426e-10,
"loss": 0.0195,
"num_input_tokens_seen": 1381328,
"step": 10700
},
{
"epoch": 9.995331465919701,
"grad_norm": 0.02560342662036419,
"learning_rate": 4.780223033795661e-11,
"loss": 0.0029,
"num_input_tokens_seen": 1382080,
"step": 10705
},
{
"epoch": 10.0,
"grad_norm": 0.5053896903991699,
"learning_rate": 1.3278401433947096e-12,
"loss": 0.0035,
"num_input_tokens_seen": 1382584,
"step": 10710
},
{
"epoch": 10.0,
"num_input_tokens_seen": 1382584,
"step": 10710,
"total_flos": 6.225713263627469e+16,
"train_loss": 0.4241082760738426,
"train_runtime": 967.9016,
"train_samples_per_second": 22.12,
"train_steps_per_second": 11.065
}
],
"logging_steps": 5,
"max_steps": 10710,
"num_input_tokens_seen": 1382584,
"num_train_epochs": 10,
"save_steps": 536,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.225713263627469e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}