{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.3014789180465245, "eval_steps": 500, "global_step": 470500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015518552429429383, "grad_norm": 3.45402455329895, "learning_rate": 4.999846366330949e-05, "loss": 2.2311, "step": 100 }, { "epoch": 0.0031037104858858766, "grad_norm": 3.8429577350616455, "learning_rate": 4.9996911808066544e-05, "loss": 1.9447, "step": 200 }, { "epoch": 0.0046555657288288145, "grad_norm": 2.976985454559326, "learning_rate": 4.99953599528236e-05, "loss": 1.8563, "step": 300 }, { "epoch": 0.006207420971771753, "grad_norm": 3.213278293609619, "learning_rate": 4.999380809758066e-05, "loss": 1.8092, "step": 400 }, { "epoch": 0.007759276214714691, "grad_norm": 3.1327388286590576, "learning_rate": 4.999225624233772e-05, "loss": 1.7633, "step": 500 }, { "epoch": 0.009311131457657629, "grad_norm": 2.9328699111938477, "learning_rate": 4.9990704387094775e-05, "loss": 1.7589, "step": 600 }, { "epoch": 0.010862986700600569, "grad_norm": 3.349161148071289, "learning_rate": 4.998915253185183e-05, "loss": 1.7504, "step": 700 }, { "epoch": 0.012414841943543507, "grad_norm": 3.1088201999664307, "learning_rate": 4.9987600676608884e-05, "loss": 1.6966, "step": 800 }, { "epoch": 0.013966697186486444, "grad_norm": 3.4292752742767334, "learning_rate": 4.998604882136594e-05, "loss": 1.7051, "step": 900 }, { "epoch": 0.015518552429429382, "grad_norm": 2.640695571899414, "learning_rate": 4.9984496966123e-05, "loss": 1.6682, "step": 1000 }, { "epoch": 0.01707040767237232, "grad_norm": 2.923456907272339, "learning_rate": 4.998294511088006e-05, "loss": 1.6522, "step": 1100 }, { "epoch": 0.018622262915315258, "grad_norm": 3.0905439853668213, "learning_rate": 4.9981393255637115e-05, "loss": 1.6395, "step": 1200 }, { "epoch": 0.020174118158258196, "grad_norm": 3.6706044673919678, "learning_rate": 4.997984140039417e-05, "loss": 1.6346, "step": 1300 }, { "epoch": 0.021725973401201137, "grad_norm": 2.609037399291992, "learning_rate": 4.997828954515123e-05, "loss": 1.607, "step": 1400 }, { "epoch": 0.023277828644144075, "grad_norm": 2.7228541374206543, "learning_rate": 4.997673768990829e-05, "loss": 1.6165, "step": 1500 }, { "epoch": 0.024829683887087013, "grad_norm": 2.900372266769409, "learning_rate": 4.9975185834665346e-05, "loss": 1.6259, "step": 1600 }, { "epoch": 0.02638153913002995, "grad_norm": 2.7823703289031982, "learning_rate": 4.9973633979422404e-05, "loss": 1.5951, "step": 1700 }, { "epoch": 0.02793339437297289, "grad_norm": 2.436638355255127, "learning_rate": 4.997208212417946e-05, "loss": 1.5733, "step": 1800 }, { "epoch": 0.029485249615915827, "grad_norm": 2.976652145385742, "learning_rate": 4.997053026893652e-05, "loss": 1.5745, "step": 1900 }, { "epoch": 0.031037104858858765, "grad_norm": 2.9422554969787598, "learning_rate": 4.996897841369358e-05, "loss": 1.5691, "step": 2000 }, { "epoch": 0.032588960101801706, "grad_norm": 2.7876274585723877, "learning_rate": 4.996742655845063e-05, "loss": 1.5458, "step": 2100 }, { "epoch": 0.03414081534474464, "grad_norm": 2.8115477561950684, "learning_rate": 4.9965874703207686e-05, "loss": 1.5562, "step": 2200 }, { "epoch": 0.03569267058768758, "grad_norm": 2.705639123916626, "learning_rate": 4.996432284796474e-05, "loss": 1.5358, "step": 2300 }, { "epoch": 0.037244525830630516, "grad_norm": 3.510756492614746, "learning_rate": 4.99627709927218e-05, "loss": 1.5405, "step": 2400 }, { "epoch": 0.03879638107357346, "grad_norm": 2.5096957683563232, "learning_rate": 4.996121913747886e-05, "loss": 1.5489, "step": 2500 }, { "epoch": 0.04034823631651639, "grad_norm": 2.9852073192596436, "learning_rate": 4.9959667282235917e-05, "loss": 1.5493, "step": 2600 }, { "epoch": 0.04190009155945933, "grad_norm": 2.7485008239746094, "learning_rate": 4.9958115426992974e-05, "loss": 1.5243, "step": 2700 }, { "epoch": 0.043451946802402275, "grad_norm": 3.1658973693847656, "learning_rate": 4.995656357175003e-05, "loss": 1.5096, "step": 2800 }, { "epoch": 0.04500380204534521, "grad_norm": 3.20015549659729, "learning_rate": 4.995501171650709e-05, "loss": 1.496, "step": 2900 }, { "epoch": 0.04655565728828815, "grad_norm": 3.0995285511016846, "learning_rate": 4.995345986126415e-05, "loss": 1.5095, "step": 3000 }, { "epoch": 0.048107512531231085, "grad_norm": 3.0172009468078613, "learning_rate": 4.9951908006021205e-05, "loss": 1.4852, "step": 3100 }, { "epoch": 0.049659367774174026, "grad_norm": 3.3135604858398438, "learning_rate": 4.995035615077826e-05, "loss": 1.4845, "step": 3200 }, { "epoch": 0.05121122301711696, "grad_norm": 3.2055881023406982, "learning_rate": 4.994880429553532e-05, "loss": 1.5104, "step": 3300 }, { "epoch": 0.0527630782600599, "grad_norm": 2.850691080093384, "learning_rate": 4.994725244029237e-05, "loss": 1.4877, "step": 3400 }, { "epoch": 0.05431493350300284, "grad_norm": 2.659492015838623, "learning_rate": 4.994570058504943e-05, "loss": 1.496, "step": 3500 }, { "epoch": 0.05586678874594578, "grad_norm": 3.2706923484802246, "learning_rate": 4.994414872980648e-05, "loss": 1.5075, "step": 3600 }, { "epoch": 0.05741864398888872, "grad_norm": 2.351057529449463, "learning_rate": 4.994259687456354e-05, "loss": 1.4646, "step": 3700 }, { "epoch": 0.058970499231831654, "grad_norm": 2.7702980041503906, "learning_rate": 4.9941045019320596e-05, "loss": 1.4716, "step": 3800 }, { "epoch": 0.060522354474774595, "grad_norm": 2.2205615043640137, "learning_rate": 4.9939493164077654e-05, "loss": 1.4822, "step": 3900 }, { "epoch": 0.06207420971771753, "grad_norm": 2.963843584060669, "learning_rate": 4.993794130883471e-05, "loss": 1.4838, "step": 4000 }, { "epoch": 0.06362606496066046, "grad_norm": 2.826446294784546, "learning_rate": 4.993638945359177e-05, "loss": 1.4826, "step": 4100 }, { "epoch": 0.06517792020360341, "grad_norm": 2.8247299194335938, "learning_rate": 4.993483759834883e-05, "loss": 1.4548, "step": 4200 }, { "epoch": 0.06672977544654635, "grad_norm": 2.6167891025543213, "learning_rate": 4.9933285743105885e-05, "loss": 1.4484, "step": 4300 }, { "epoch": 0.06828163068948928, "grad_norm": 3.185368061065674, "learning_rate": 4.993173388786294e-05, "loss": 1.451, "step": 4400 }, { "epoch": 0.06983348593243223, "grad_norm": 3.5228018760681152, "learning_rate": 4.993018203262e-05, "loss": 1.45, "step": 4500 }, { "epoch": 0.07138534117537516, "grad_norm": 2.5988821983337402, "learning_rate": 4.992863017737706e-05, "loss": 1.4408, "step": 4600 }, { "epoch": 0.0729371964183181, "grad_norm": 2.6914050579071045, "learning_rate": 4.9927078322134116e-05, "loss": 1.4487, "step": 4700 }, { "epoch": 0.07448905166126103, "grad_norm": 2.8541464805603027, "learning_rate": 4.9925526466891174e-05, "loss": 1.4307, "step": 4800 }, { "epoch": 0.07604090690420398, "grad_norm": 2.572587013244629, "learning_rate": 4.9923974611648225e-05, "loss": 1.4524, "step": 4900 }, { "epoch": 0.07759276214714692, "grad_norm": 3.3482115268707275, "learning_rate": 4.992242275640528e-05, "loss": 1.4292, "step": 5000 }, { "epoch": 0.07914461739008985, "grad_norm": 2.5461199283599854, "learning_rate": 4.992087090116234e-05, "loss": 1.4228, "step": 5100 }, { "epoch": 0.08069647263303278, "grad_norm": 2.8455793857574463, "learning_rate": 4.99193190459194e-05, "loss": 1.4195, "step": 5200 }, { "epoch": 0.08224832787597573, "grad_norm": 2.7463977336883545, "learning_rate": 4.9917767190676456e-05, "loss": 1.439, "step": 5300 }, { "epoch": 0.08380018311891867, "grad_norm": 2.151210308074951, "learning_rate": 4.991621533543351e-05, "loss": 1.4169, "step": 5400 }, { "epoch": 0.0853520383618616, "grad_norm": 2.944941520690918, "learning_rate": 4.991466348019057e-05, "loss": 1.4379, "step": 5500 }, { "epoch": 0.08690389360480455, "grad_norm": 3.146421194076538, "learning_rate": 4.991311162494763e-05, "loss": 1.43, "step": 5600 }, { "epoch": 0.08845574884774748, "grad_norm": 2.7587380409240723, "learning_rate": 4.9911559769704687e-05, "loss": 1.4324, "step": 5700 }, { "epoch": 0.09000760409069042, "grad_norm": 2.6391189098358154, "learning_rate": 4.9910007914461744e-05, "loss": 1.439, "step": 5800 }, { "epoch": 0.09155945933363335, "grad_norm": 3.000552177429199, "learning_rate": 4.99084560592188e-05, "loss": 1.4203, "step": 5900 }, { "epoch": 0.0931113145765763, "grad_norm": 3.165254831314087, "learning_rate": 4.990690420397586e-05, "loss": 1.4145, "step": 6000 }, { "epoch": 0.09466316981951924, "grad_norm": 3.03849196434021, "learning_rate": 4.990535234873292e-05, "loss": 1.4127, "step": 6100 }, { "epoch": 0.09621502506246217, "grad_norm": 2.2464165687561035, "learning_rate": 4.990380049348997e-05, "loss": 1.4034, "step": 6200 }, { "epoch": 0.09776688030540512, "grad_norm": 2.8351242542266846, "learning_rate": 4.9902248638247026e-05, "loss": 1.4171, "step": 6300 }, { "epoch": 0.09931873554834805, "grad_norm": 3.134185314178467, "learning_rate": 4.9900696783004084e-05, "loss": 1.4158, "step": 6400 }, { "epoch": 0.10087059079129099, "grad_norm": 2.591951847076416, "learning_rate": 4.989914492776114e-05, "loss": 1.4308, "step": 6500 }, { "epoch": 0.10242244603423392, "grad_norm": 2.6345736980438232, "learning_rate": 4.98975930725182e-05, "loss": 1.3935, "step": 6600 }, { "epoch": 0.10397430127717687, "grad_norm": 2.4368953704833984, "learning_rate": 4.989604121727525e-05, "loss": 1.4165, "step": 6700 }, { "epoch": 0.1055261565201198, "grad_norm": 2.444155693054199, "learning_rate": 4.989448936203231e-05, "loss": 1.4096, "step": 6800 }, { "epoch": 0.10707801176306274, "grad_norm": 3.1478230953216553, "learning_rate": 4.9892937506789366e-05, "loss": 1.4328, "step": 6900 }, { "epoch": 0.10862986700600567, "grad_norm": 2.6753594875335693, "learning_rate": 4.9891385651546424e-05, "loss": 1.3985, "step": 7000 }, { "epoch": 0.11018172224894862, "grad_norm": 2.5885367393493652, "learning_rate": 4.988983379630348e-05, "loss": 1.3757, "step": 7100 }, { "epoch": 0.11173357749189156, "grad_norm": 2.546741485595703, "learning_rate": 4.988828194106054e-05, "loss": 1.3861, "step": 7200 }, { "epoch": 0.11328543273483449, "grad_norm": 2.6451854705810547, "learning_rate": 4.98867300858176e-05, "loss": 1.3809, "step": 7300 }, { "epoch": 0.11483728797777744, "grad_norm": 2.460561513900757, "learning_rate": 4.9885178230574655e-05, "loss": 1.3943, "step": 7400 }, { "epoch": 0.11638914322072037, "grad_norm": 2.5842692852020264, "learning_rate": 4.988362637533171e-05, "loss": 1.3833, "step": 7500 }, { "epoch": 0.11794099846366331, "grad_norm": 2.862595796585083, "learning_rate": 4.988207452008877e-05, "loss": 1.3679, "step": 7600 }, { "epoch": 0.11949285370660624, "grad_norm": 2.5362164974212646, "learning_rate": 4.988052266484582e-05, "loss": 1.3763, "step": 7700 }, { "epoch": 0.12104470894954919, "grad_norm": 2.6035406589508057, "learning_rate": 4.987897080960288e-05, "loss": 1.3591, "step": 7800 }, { "epoch": 0.12259656419249212, "grad_norm": 2.9999115467071533, "learning_rate": 4.987741895435994e-05, "loss": 1.3984, "step": 7900 }, { "epoch": 0.12414841943543506, "grad_norm": 2.50899076461792, "learning_rate": 4.9875867099116995e-05, "loss": 1.3648, "step": 8000 }, { "epoch": 0.125700274678378, "grad_norm": 2.866330146789551, "learning_rate": 4.987431524387405e-05, "loss": 1.4131, "step": 8100 }, { "epoch": 0.12725212992132093, "grad_norm": 2.6132583618164062, "learning_rate": 4.987276338863111e-05, "loss": 1.3645, "step": 8200 }, { "epoch": 0.1288039851642639, "grad_norm": 2.6919543743133545, "learning_rate": 4.987121153338817e-05, "loss": 1.3926, "step": 8300 }, { "epoch": 0.13035584040720682, "grad_norm": 2.9526069164276123, "learning_rate": 4.9869659678145226e-05, "loss": 1.3552, "step": 8400 }, { "epoch": 0.13190769565014976, "grad_norm": 2.522690773010254, "learning_rate": 4.986810782290228e-05, "loss": 1.3846, "step": 8500 }, { "epoch": 0.1334595508930927, "grad_norm": 2.8384175300598145, "learning_rate": 4.986655596765934e-05, "loss": 1.3696, "step": 8600 }, { "epoch": 0.13501140613603563, "grad_norm": 2.4619386196136475, "learning_rate": 4.98650041124164e-05, "loss": 1.3572, "step": 8700 }, { "epoch": 0.13656326137897856, "grad_norm": 2.8175203800201416, "learning_rate": 4.9863452257173457e-05, "loss": 1.3758, "step": 8800 }, { "epoch": 0.1381151166219215, "grad_norm": 2.510261297225952, "learning_rate": 4.9861900401930514e-05, "loss": 1.3783, "step": 8900 }, { "epoch": 0.13966697186486446, "grad_norm": 2.833686113357544, "learning_rate": 4.9860348546687565e-05, "loss": 1.3693, "step": 9000 }, { "epoch": 0.1412188271078074, "grad_norm": 2.7892541885375977, "learning_rate": 4.985879669144462e-05, "loss": 1.3656, "step": 9100 }, { "epoch": 0.14277068235075033, "grad_norm": 2.4540627002716064, "learning_rate": 4.985724483620168e-05, "loss": 1.3573, "step": 9200 }, { "epoch": 0.14432253759369326, "grad_norm": 2.7814266681671143, "learning_rate": 4.985569298095874e-05, "loss": 1.347, "step": 9300 }, { "epoch": 0.1458743928366362, "grad_norm": 2.415938138961792, "learning_rate": 4.9854141125715796e-05, "loss": 1.3583, "step": 9400 }, { "epoch": 0.14742624807957913, "grad_norm": 2.5764036178588867, "learning_rate": 4.9852589270472854e-05, "loss": 1.3839, "step": 9500 }, { "epoch": 0.14897810332252207, "grad_norm": 2.4359829425811768, "learning_rate": 4.985103741522991e-05, "loss": 1.3321, "step": 9600 }, { "epoch": 0.150529958565465, "grad_norm": 2.5717275142669678, "learning_rate": 4.984948555998697e-05, "loss": 1.3743, "step": 9700 }, { "epoch": 0.15208181380840796, "grad_norm": 2.3378283977508545, "learning_rate": 4.984793370474403e-05, "loss": 1.3253, "step": 9800 }, { "epoch": 0.1536336690513509, "grad_norm": 2.5946924686431885, "learning_rate": 4.9846381849501085e-05, "loss": 1.3246, "step": 9900 }, { "epoch": 0.15518552429429383, "grad_norm": 2.87147855758667, "learning_rate": 4.9844829994258136e-05, "loss": 1.338, "step": 10000 }, { "epoch": 0.15673737953723677, "grad_norm": 2.368569850921631, "learning_rate": 4.9843278139015194e-05, "loss": 1.3304, "step": 10100 }, { "epoch": 0.1582892347801797, "grad_norm": 2.849220037460327, "learning_rate": 4.984172628377225e-05, "loss": 1.3644, "step": 10200 }, { "epoch": 0.15984109002312263, "grad_norm": 2.7294695377349854, "learning_rate": 4.984017442852931e-05, "loss": 1.3521, "step": 10300 }, { "epoch": 0.16139294526606557, "grad_norm": 2.3562920093536377, "learning_rate": 4.983862257328637e-05, "loss": 1.3154, "step": 10400 }, { "epoch": 0.16294480050900853, "grad_norm": 2.7919921875, "learning_rate": 4.9837070718043425e-05, "loss": 1.3754, "step": 10500 }, { "epoch": 0.16449665575195146, "grad_norm": 2.607933282852173, "learning_rate": 4.9835518862800476e-05, "loss": 1.3256, "step": 10600 }, { "epoch": 0.1660485109948944, "grad_norm": 2.602830410003662, "learning_rate": 4.9833967007557534e-05, "loss": 1.3509, "step": 10700 }, { "epoch": 0.16760036623783733, "grad_norm": 2.629920482635498, "learning_rate": 4.983241515231459e-05, "loss": 1.3251, "step": 10800 }, { "epoch": 0.16915222148078027, "grad_norm": 2.610398054122925, "learning_rate": 4.983086329707165e-05, "loss": 1.3515, "step": 10900 }, { "epoch": 0.1707040767237232, "grad_norm": 3.0583600997924805, "learning_rate": 4.982931144182871e-05, "loss": 1.3496, "step": 11000 }, { "epoch": 0.17225593196666614, "grad_norm": 2.51269268989563, "learning_rate": 4.9827759586585765e-05, "loss": 1.3538, "step": 11100 }, { "epoch": 0.1738077872096091, "grad_norm": 2.8590087890625, "learning_rate": 4.982620773134282e-05, "loss": 1.3555, "step": 11200 }, { "epoch": 0.17535964245255203, "grad_norm": 2.775425910949707, "learning_rate": 4.982465587609988e-05, "loss": 1.3261, "step": 11300 }, { "epoch": 0.17691149769549497, "grad_norm": 2.472468137741089, "learning_rate": 4.982310402085694e-05, "loss": 1.3233, "step": 11400 }, { "epoch": 0.1784633529384379, "grad_norm": 2.447803020477295, "learning_rate": 4.9821552165613996e-05, "loss": 1.3114, "step": 11500 }, { "epoch": 0.18001520818138084, "grad_norm": 2.7661869525909424, "learning_rate": 4.982000031037105e-05, "loss": 1.3327, "step": 11600 }, { "epoch": 0.18156706342432377, "grad_norm": 2.892381191253662, "learning_rate": 4.981844845512811e-05, "loss": 1.3134, "step": 11700 }, { "epoch": 0.1831189186672667, "grad_norm": 2.547635555267334, "learning_rate": 4.981689659988517e-05, "loss": 1.2913, "step": 11800 }, { "epoch": 0.18467077391020967, "grad_norm": 3.205932855606079, "learning_rate": 4.981534474464222e-05, "loss": 1.3398, "step": 11900 }, { "epoch": 0.1862226291531526, "grad_norm": 2.577988386154175, "learning_rate": 4.981379288939928e-05, "loss": 1.3344, "step": 12000 }, { "epoch": 0.18777448439609554, "grad_norm": 2.690061092376709, "learning_rate": 4.9812241034156335e-05, "loss": 1.315, "step": 12100 }, { "epoch": 0.18932633963903847, "grad_norm": 2.4334030151367188, "learning_rate": 4.981068917891339e-05, "loss": 1.3185, "step": 12200 }, { "epoch": 0.1908781948819814, "grad_norm": 2.706022024154663, "learning_rate": 4.980913732367045e-05, "loss": 1.3132, "step": 12300 }, { "epoch": 0.19243005012492434, "grad_norm": 2.400574207305908, "learning_rate": 4.980758546842751e-05, "loss": 1.3076, "step": 12400 }, { "epoch": 0.19398190536786727, "grad_norm": 2.7774713039398193, "learning_rate": 4.9806033613184566e-05, "loss": 1.3081, "step": 12500 }, { "epoch": 0.19553376061081024, "grad_norm": 2.392484664916992, "learning_rate": 4.9804481757941624e-05, "loss": 1.3146, "step": 12600 }, { "epoch": 0.19708561585375317, "grad_norm": 2.4213242530822754, "learning_rate": 4.980292990269868e-05, "loss": 1.3, "step": 12700 }, { "epoch": 0.1986374710966961, "grad_norm": 2.6262598037719727, "learning_rate": 4.980137804745574e-05, "loss": 1.3074, "step": 12800 }, { "epoch": 0.20018932633963904, "grad_norm": 2.6495511531829834, "learning_rate": 4.97998261922128e-05, "loss": 1.3134, "step": 12900 }, { "epoch": 0.20174118158258197, "grad_norm": 2.5671873092651367, "learning_rate": 4.9798274336969855e-05, "loss": 1.3102, "step": 13000 }, { "epoch": 0.2032930368255249, "grad_norm": 2.4559695720672607, "learning_rate": 4.979672248172691e-05, "loss": 1.2916, "step": 13100 }, { "epoch": 0.20484489206846784, "grad_norm": 2.3597123622894287, "learning_rate": 4.9795170626483964e-05, "loss": 1.2894, "step": 13200 }, { "epoch": 0.2063967473114108, "grad_norm": 2.7529051303863525, "learning_rate": 4.979361877124102e-05, "loss": 1.3136, "step": 13300 }, { "epoch": 0.20794860255435374, "grad_norm": 2.6413700580596924, "learning_rate": 4.979206691599807e-05, "loss": 1.2798, "step": 13400 }, { "epoch": 0.20950045779729667, "grad_norm": 2.500199794769287, "learning_rate": 4.979051506075513e-05, "loss": 1.2863, "step": 13500 }, { "epoch": 0.2110523130402396, "grad_norm": 2.5025570392608643, "learning_rate": 4.978896320551219e-05, "loss": 1.3186, "step": 13600 }, { "epoch": 0.21260416828318254, "grad_norm": 2.6481752395629883, "learning_rate": 4.9787411350269246e-05, "loss": 1.3022, "step": 13700 }, { "epoch": 0.21415602352612548, "grad_norm": 2.31376051902771, "learning_rate": 4.9785859495026304e-05, "loss": 1.3118, "step": 13800 }, { "epoch": 0.2157078787690684, "grad_norm": 2.443878412246704, "learning_rate": 4.978430763978336e-05, "loss": 1.3235, "step": 13900 }, { "epoch": 0.21725973401201135, "grad_norm": 2.792278528213501, "learning_rate": 4.978275578454042e-05, "loss": 1.322, "step": 14000 }, { "epoch": 0.2188115892549543, "grad_norm": 2.817594051361084, "learning_rate": 4.978120392929748e-05, "loss": 1.316, "step": 14100 }, { "epoch": 0.22036344449789724, "grad_norm": 2.3896937370300293, "learning_rate": 4.9779652074054535e-05, "loss": 1.3029, "step": 14200 }, { "epoch": 0.22191529974084018, "grad_norm": 2.849748373031616, "learning_rate": 4.977810021881159e-05, "loss": 1.3186, "step": 14300 }, { "epoch": 0.2234671549837831, "grad_norm": 2.8510918617248535, "learning_rate": 4.977654836356865e-05, "loss": 1.2806, "step": 14400 }, { "epoch": 0.22501901022672605, "grad_norm": 2.5479323863983154, "learning_rate": 4.977499650832571e-05, "loss": 1.2917, "step": 14500 }, { "epoch": 0.22657086546966898, "grad_norm": 2.897686243057251, "learning_rate": 4.9773444653082766e-05, "loss": 1.2832, "step": 14600 }, { "epoch": 0.22812272071261191, "grad_norm": 2.7339227199554443, "learning_rate": 4.9771892797839817e-05, "loss": 1.287, "step": 14700 }, { "epoch": 0.22967457595555488, "grad_norm": 2.701669931411743, "learning_rate": 4.9770340942596874e-05, "loss": 1.3081, "step": 14800 }, { "epoch": 0.2312264311984978, "grad_norm": 2.775510549545288, "learning_rate": 4.976878908735393e-05, "loss": 1.3007, "step": 14900 }, { "epoch": 0.23277828644144075, "grad_norm": 2.426663637161255, "learning_rate": 4.976723723211099e-05, "loss": 1.279, "step": 15000 }, { "epoch": 0.23433014168438368, "grad_norm": 2.715815305709839, "learning_rate": 4.976568537686805e-05, "loss": 1.3016, "step": 15100 }, { "epoch": 0.23588199692732661, "grad_norm": 2.6184160709381104, "learning_rate": 4.9764133521625105e-05, "loss": 1.286, "step": 15200 }, { "epoch": 0.23743385217026955, "grad_norm": 2.65220046043396, "learning_rate": 4.976258166638216e-05, "loss": 1.2953, "step": 15300 }, { "epoch": 0.23898570741321248, "grad_norm": 2.455944299697876, "learning_rate": 4.976102981113922e-05, "loss": 1.2863, "step": 15400 }, { "epoch": 0.24053756265615545, "grad_norm": 2.484191417694092, "learning_rate": 4.975947795589628e-05, "loss": 1.2715, "step": 15500 }, { "epoch": 0.24208941789909838, "grad_norm": 2.3590664863586426, "learning_rate": 4.9757926100653336e-05, "loss": 1.2865, "step": 15600 }, { "epoch": 0.24364127314204131, "grad_norm": 2.6133363246917725, "learning_rate": 4.9756374245410394e-05, "loss": 1.3089, "step": 15700 }, { "epoch": 0.24519312838498425, "grad_norm": 2.5714144706726074, "learning_rate": 4.975482239016745e-05, "loss": 1.2799, "step": 15800 }, { "epoch": 0.24674498362792718, "grad_norm": 2.655200242996216, "learning_rate": 4.975327053492451e-05, "loss": 1.2854, "step": 15900 }, { "epoch": 0.24829683887087012, "grad_norm": 2.5497395992279053, "learning_rate": 4.975171867968156e-05, "loss": 1.3059, "step": 16000 }, { "epoch": 0.24984869411381305, "grad_norm": 2.787501573562622, "learning_rate": 4.975016682443862e-05, "loss": 1.2996, "step": 16100 }, { "epoch": 0.251400549356756, "grad_norm": 2.7455899715423584, "learning_rate": 4.9748614969195676e-05, "loss": 1.2996, "step": 16200 }, { "epoch": 0.2529524045996989, "grad_norm": 2.803668737411499, "learning_rate": 4.9747063113952734e-05, "loss": 1.2888, "step": 16300 }, { "epoch": 0.25450425984264186, "grad_norm": 2.38700532913208, "learning_rate": 4.974551125870979e-05, "loss": 1.3025, "step": 16400 }, { "epoch": 0.2560561150855848, "grad_norm": 2.337942123413086, "learning_rate": 4.974395940346684e-05, "loss": 1.2914, "step": 16500 }, { "epoch": 0.2576079703285278, "grad_norm": 2.190355062484741, "learning_rate": 4.97424075482239e-05, "loss": 1.3007, "step": 16600 }, { "epoch": 0.2591598255714707, "grad_norm": 2.6558947563171387, "learning_rate": 4.974085569298096e-05, "loss": 1.2867, "step": 16700 }, { "epoch": 0.26071168081441365, "grad_norm": 2.452852964401245, "learning_rate": 4.9739303837738016e-05, "loss": 1.2921, "step": 16800 }, { "epoch": 0.2622635360573566, "grad_norm": 2.992396831512451, "learning_rate": 4.9737751982495074e-05, "loss": 1.2735, "step": 16900 }, { "epoch": 0.2638153913002995, "grad_norm": 1.9501643180847168, "learning_rate": 4.973620012725213e-05, "loss": 1.2719, "step": 17000 }, { "epoch": 0.26536724654324245, "grad_norm": 2.429457664489746, "learning_rate": 4.973464827200919e-05, "loss": 1.2981, "step": 17100 }, { "epoch": 0.2669191017861854, "grad_norm": 2.4714221954345703, "learning_rate": 4.973309641676625e-05, "loss": 1.2638, "step": 17200 }, { "epoch": 0.2684709570291283, "grad_norm": 2.4382991790771484, "learning_rate": 4.9731544561523305e-05, "loss": 1.2844, "step": 17300 }, { "epoch": 0.27002281227207126, "grad_norm": 2.2101175785064697, "learning_rate": 4.972999270628036e-05, "loss": 1.2809, "step": 17400 }, { "epoch": 0.2715746675150142, "grad_norm": 2.6227142810821533, "learning_rate": 4.972844085103742e-05, "loss": 1.3096, "step": 17500 }, { "epoch": 0.2731265227579571, "grad_norm": 2.3420772552490234, "learning_rate": 4.972688899579447e-05, "loss": 1.2689, "step": 17600 }, { "epoch": 0.27467837800090006, "grad_norm": 2.3048648834228516, "learning_rate": 4.972533714055153e-05, "loss": 1.2695, "step": 17700 }, { "epoch": 0.276230233243843, "grad_norm": 2.3735549449920654, "learning_rate": 4.9723785285308587e-05, "loss": 1.2774, "step": 17800 }, { "epoch": 0.2777820884867859, "grad_norm": 2.56894588470459, "learning_rate": 4.9722233430065644e-05, "loss": 1.2415, "step": 17900 }, { "epoch": 0.2793339437297289, "grad_norm": 2.6230642795562744, "learning_rate": 4.97206815748227e-05, "loss": 1.2775, "step": 18000 }, { "epoch": 0.28088579897267185, "grad_norm": 2.1623568534851074, "learning_rate": 4.971912971957976e-05, "loss": 1.2724, "step": 18100 }, { "epoch": 0.2824376542156148, "grad_norm": 2.7458627223968506, "learning_rate": 4.971757786433682e-05, "loss": 1.252, "step": 18200 }, { "epoch": 0.2839895094585577, "grad_norm": 2.631115198135376, "learning_rate": 4.9716026009093875e-05, "loss": 1.2952, "step": 18300 }, { "epoch": 0.28554136470150066, "grad_norm": 2.432504653930664, "learning_rate": 4.971447415385093e-05, "loss": 1.2697, "step": 18400 }, { "epoch": 0.2870932199444436, "grad_norm": 3.2598044872283936, "learning_rate": 4.971292229860799e-05, "loss": 1.2659, "step": 18500 }, { "epoch": 0.2886450751873865, "grad_norm": 2.5015132427215576, "learning_rate": 4.971137044336505e-05, "loss": 1.2515, "step": 18600 }, { "epoch": 0.29019693043032946, "grad_norm": 2.426882028579712, "learning_rate": 4.9709818588122106e-05, "loss": 1.2575, "step": 18700 }, { "epoch": 0.2917487856732724, "grad_norm": 2.1316773891448975, "learning_rate": 4.9708266732879164e-05, "loss": 1.2785, "step": 18800 }, { "epoch": 0.2933006409162153, "grad_norm": 2.2184412479400635, "learning_rate": 4.9706714877636215e-05, "loss": 1.2692, "step": 18900 }, { "epoch": 0.29485249615915826, "grad_norm": 2.5865681171417236, "learning_rate": 4.970516302239327e-05, "loss": 1.2864, "step": 19000 }, { "epoch": 0.2964043514021012, "grad_norm": 2.5837719440460205, "learning_rate": 4.970361116715033e-05, "loss": 1.2557, "step": 19100 }, { "epoch": 0.29795620664504413, "grad_norm": 2.6286470890045166, "learning_rate": 4.970205931190739e-05, "loss": 1.259, "step": 19200 }, { "epoch": 0.29950806188798706, "grad_norm": 2.8535425662994385, "learning_rate": 4.9700507456664446e-05, "loss": 1.2402, "step": 19300 }, { "epoch": 0.30105991713093, "grad_norm": 2.272538900375366, "learning_rate": 4.9698955601421504e-05, "loss": 1.2728, "step": 19400 }, { "epoch": 0.302611772373873, "grad_norm": 2.6719024181365967, "learning_rate": 4.969740374617856e-05, "loss": 1.269, "step": 19500 }, { "epoch": 0.3041636276168159, "grad_norm": 2.3608758449554443, "learning_rate": 4.969585189093562e-05, "loss": 1.2605, "step": 19600 }, { "epoch": 0.30571548285975886, "grad_norm": 2.552095890045166, "learning_rate": 4.969430003569267e-05, "loss": 1.2451, "step": 19700 }, { "epoch": 0.3072673381027018, "grad_norm": 2.9721288681030273, "learning_rate": 4.969274818044973e-05, "loss": 1.2786, "step": 19800 }, { "epoch": 0.3088191933456447, "grad_norm": 2.478731393814087, "learning_rate": 4.9691196325206786e-05, "loss": 1.2735, "step": 19900 }, { "epoch": 0.31037104858858766, "grad_norm": 2.8701961040496826, "learning_rate": 4.9689644469963844e-05, "loss": 1.2515, "step": 20000 }, { "epoch": 0.3119229038315306, "grad_norm": 2.821871519088745, "learning_rate": 4.96880926147209e-05, "loss": 1.3017, "step": 20100 }, { "epoch": 0.31347475907447353, "grad_norm": 2.907162666320801, "learning_rate": 4.968654075947796e-05, "loss": 1.2539, "step": 20200 }, { "epoch": 0.31502661431741646, "grad_norm": 2.221086025238037, "learning_rate": 4.968498890423502e-05, "loss": 1.2926, "step": 20300 }, { "epoch": 0.3165784695603594, "grad_norm": 2.498689889907837, "learning_rate": 4.968343704899207e-05, "loss": 1.2786, "step": 20400 }, { "epoch": 0.31813032480330233, "grad_norm": 2.6393258571624756, "learning_rate": 4.9681885193749126e-05, "loss": 1.2442, "step": 20500 }, { "epoch": 0.31968218004624527, "grad_norm": 2.6867501735687256, "learning_rate": 4.968033333850618e-05, "loss": 1.2567, "step": 20600 }, { "epoch": 0.3212340352891882, "grad_norm": 2.4331588745117188, "learning_rate": 4.967878148326324e-05, "loss": 1.2737, "step": 20700 }, { "epoch": 0.32278589053213114, "grad_norm": 2.5626893043518066, "learning_rate": 4.96772296280203e-05, "loss": 1.2608, "step": 20800 }, { "epoch": 0.3243377457750741, "grad_norm": 1.7633891105651855, "learning_rate": 4.9675677772777357e-05, "loss": 1.2545, "step": 20900 }, { "epoch": 0.32588960101801706, "grad_norm": 2.918287515640259, "learning_rate": 4.9674125917534414e-05, "loss": 1.2738, "step": 21000 }, { "epoch": 0.32744145626096, "grad_norm": 2.7387962341308594, "learning_rate": 4.967257406229147e-05, "loss": 1.2335, "step": 21100 }, { "epoch": 0.32899331150390293, "grad_norm": 2.6093668937683105, "learning_rate": 4.967102220704853e-05, "loss": 1.2529, "step": 21200 }, { "epoch": 0.33054516674684586, "grad_norm": 2.3680531978607178, "learning_rate": 4.966947035180559e-05, "loss": 1.2644, "step": 21300 }, { "epoch": 0.3320970219897888, "grad_norm": 2.770004987716675, "learning_rate": 4.9667918496562645e-05, "loss": 1.2372, "step": 21400 }, { "epoch": 0.33364887723273173, "grad_norm": 3.0637829303741455, "learning_rate": 4.96663666413197e-05, "loss": 1.2457, "step": 21500 }, { "epoch": 0.33520073247567467, "grad_norm": 2.536048173904419, "learning_rate": 4.966481478607676e-05, "loss": 1.241, "step": 21600 }, { "epoch": 0.3367525877186176, "grad_norm": 2.698406219482422, "learning_rate": 4.966326293083381e-05, "loss": 1.2588, "step": 21700 }, { "epoch": 0.33830444296156054, "grad_norm": 2.545354127883911, "learning_rate": 4.966171107559087e-05, "loss": 1.2393, "step": 21800 }, { "epoch": 0.33985629820450347, "grad_norm": 2.902768611907959, "learning_rate": 4.966015922034793e-05, "loss": 1.2622, "step": 21900 }, { "epoch": 0.3414081534474464, "grad_norm": 2.4308223724365234, "learning_rate": 4.9658607365104985e-05, "loss": 1.2451, "step": 22000 }, { "epoch": 0.34296000869038934, "grad_norm": 2.7511160373687744, "learning_rate": 4.965705550986204e-05, "loss": 1.265, "step": 22100 }, { "epoch": 0.3445118639333323, "grad_norm": 2.269733190536499, "learning_rate": 4.96555036546191e-05, "loss": 1.231, "step": 22200 }, { "epoch": 0.34606371917627526, "grad_norm": 2.5755834579467773, "learning_rate": 4.965395179937616e-05, "loss": 1.2449, "step": 22300 }, { "epoch": 0.3476155744192182, "grad_norm": 2.7427587509155273, "learning_rate": 4.9652399944133216e-05, "loss": 1.2315, "step": 22400 }, { "epoch": 0.34916742966216113, "grad_norm": 2.5260298252105713, "learning_rate": 4.9650848088890274e-05, "loss": 1.2448, "step": 22500 }, { "epoch": 0.35071928490510407, "grad_norm": 2.6664581298828125, "learning_rate": 4.964929623364733e-05, "loss": 1.2658, "step": 22600 }, { "epoch": 0.352271140148047, "grad_norm": 2.3417155742645264, "learning_rate": 4.964774437840439e-05, "loss": 1.2485, "step": 22700 }, { "epoch": 0.35382299539098994, "grad_norm": 2.4381089210510254, "learning_rate": 4.964619252316145e-05, "loss": 1.2271, "step": 22800 }, { "epoch": 0.35537485063393287, "grad_norm": 2.50130558013916, "learning_rate": 4.9644640667918505e-05, "loss": 1.2544, "step": 22900 }, { "epoch": 0.3569267058768758, "grad_norm": 2.5027225017547607, "learning_rate": 4.9643088812675556e-05, "loss": 1.2368, "step": 23000 }, { "epoch": 0.35847856111981874, "grad_norm": 2.594553232192993, "learning_rate": 4.9641536957432614e-05, "loss": 1.2731, "step": 23100 }, { "epoch": 0.3600304163627617, "grad_norm": 2.1037485599517822, "learning_rate": 4.9639985102189665e-05, "loss": 1.2262, "step": 23200 }, { "epoch": 0.3615822716057046, "grad_norm": 3.055147886276245, "learning_rate": 4.963843324694672e-05, "loss": 1.2429, "step": 23300 }, { "epoch": 0.36313412684864754, "grad_norm": 2.637697219848633, "learning_rate": 4.963688139170378e-05, "loss": 1.2274, "step": 23400 }, { "epoch": 0.3646859820915905, "grad_norm": 2.306586980819702, "learning_rate": 4.963532953646084e-05, "loss": 1.252, "step": 23500 }, { "epoch": 0.3662378373345334, "grad_norm": 2.6637966632843018, "learning_rate": 4.9633777681217896e-05, "loss": 1.2433, "step": 23600 }, { "epoch": 0.36778969257747635, "grad_norm": 2.3285157680511475, "learning_rate": 4.963222582597495e-05, "loss": 1.2429, "step": 23700 }, { "epoch": 0.36934154782041934, "grad_norm": 2.5495405197143555, "learning_rate": 4.963067397073201e-05, "loss": 1.2292, "step": 23800 }, { "epoch": 0.37089340306336227, "grad_norm": 2.127140760421753, "learning_rate": 4.962912211548907e-05, "loss": 1.2357, "step": 23900 }, { "epoch": 0.3724452583063052, "grad_norm": 2.2757773399353027, "learning_rate": 4.9627570260246127e-05, "loss": 1.2616, "step": 24000 }, { "epoch": 0.37399711354924814, "grad_norm": 2.529846668243408, "learning_rate": 4.9626018405003184e-05, "loss": 1.2387, "step": 24100 }, { "epoch": 0.3755489687921911, "grad_norm": 2.742988109588623, "learning_rate": 4.962446654976024e-05, "loss": 1.2411, "step": 24200 }, { "epoch": 0.377100824035134, "grad_norm": 2.7472355365753174, "learning_rate": 4.96229146945173e-05, "loss": 1.2726, "step": 24300 }, { "epoch": 0.37865267927807694, "grad_norm": 2.5658702850341797, "learning_rate": 4.962136283927436e-05, "loss": 1.2479, "step": 24400 }, { "epoch": 0.3802045345210199, "grad_norm": 2.6822404861450195, "learning_rate": 4.961981098403141e-05, "loss": 1.2324, "step": 24500 }, { "epoch": 0.3817563897639628, "grad_norm": 2.149616003036499, "learning_rate": 4.9618259128788466e-05, "loss": 1.2335, "step": 24600 }, { "epoch": 0.38330824500690575, "grad_norm": 2.515333890914917, "learning_rate": 4.9616707273545524e-05, "loss": 1.2274, "step": 24700 }, { "epoch": 0.3848601002498487, "grad_norm": 2.617628812789917, "learning_rate": 4.961515541830258e-05, "loss": 1.2316, "step": 24800 }, { "epoch": 0.3864119554927916, "grad_norm": 2.5379648208618164, "learning_rate": 4.961360356305964e-05, "loss": 1.2257, "step": 24900 }, { "epoch": 0.38796381073573455, "grad_norm": 2.619234323501587, "learning_rate": 4.96120517078167e-05, "loss": 1.2044, "step": 25000 }, { "epoch": 0.3895156659786775, "grad_norm": 2.523144483566284, "learning_rate": 4.9610499852573755e-05, "loss": 1.2506, "step": 25100 }, { "epoch": 0.3910675212216205, "grad_norm": 2.6998369693756104, "learning_rate": 4.960894799733081e-05, "loss": 1.2557, "step": 25200 }, { "epoch": 0.3926193764645634, "grad_norm": 2.4023473262786865, "learning_rate": 4.960739614208787e-05, "loss": 1.2526, "step": 25300 }, { "epoch": 0.39417123170750634, "grad_norm": 2.3050835132598877, "learning_rate": 4.960584428684493e-05, "loss": 1.2225, "step": 25400 }, { "epoch": 0.3957230869504493, "grad_norm": 2.1680538654327393, "learning_rate": 4.9604292431601986e-05, "loss": 1.2286, "step": 25500 }, { "epoch": 0.3972749421933922, "grad_norm": 2.603233814239502, "learning_rate": 4.9602740576359044e-05, "loss": 1.2253, "step": 25600 }, { "epoch": 0.39882679743633515, "grad_norm": 2.2895560264587402, "learning_rate": 4.96011887211161e-05, "loss": 1.2318, "step": 25700 }, { "epoch": 0.4003786526792781, "grad_norm": 2.3969969749450684, "learning_rate": 4.959963686587315e-05, "loss": 1.2381, "step": 25800 }, { "epoch": 0.401930507922221, "grad_norm": 2.4148831367492676, "learning_rate": 4.959808501063021e-05, "loss": 1.2117, "step": 25900 }, { "epoch": 0.40348236316516395, "grad_norm": 2.634455442428589, "learning_rate": 4.959653315538727e-05, "loss": 1.2233, "step": 26000 }, { "epoch": 0.4050342184081069, "grad_norm": 2.5249853134155273, "learning_rate": 4.9594981300144326e-05, "loss": 1.2234, "step": 26100 }, { "epoch": 0.4065860736510498, "grad_norm": 2.2868144512176514, "learning_rate": 4.959342944490138e-05, "loss": 1.2397, "step": 26200 }, { "epoch": 0.40813792889399275, "grad_norm": 2.3488306999206543, "learning_rate": 4.9591877589658435e-05, "loss": 1.2479, "step": 26300 }, { "epoch": 0.4096897841369357, "grad_norm": 2.9861233234405518, "learning_rate": 4.959032573441549e-05, "loss": 1.235, "step": 26400 }, { "epoch": 0.4112416393798786, "grad_norm": 2.7346441745758057, "learning_rate": 4.958877387917255e-05, "loss": 1.2238, "step": 26500 }, { "epoch": 0.4127934946228216, "grad_norm": 2.6701862812042236, "learning_rate": 4.958722202392961e-05, "loss": 1.2331, "step": 26600 }, { "epoch": 0.41434534986576455, "grad_norm": 2.398591995239258, "learning_rate": 4.9585670168686666e-05, "loss": 1.224, "step": 26700 }, { "epoch": 0.4158972051087075, "grad_norm": 2.4429726600646973, "learning_rate": 4.958411831344372e-05, "loss": 1.2276, "step": 26800 }, { "epoch": 0.4174490603516504, "grad_norm": 2.251915693283081, "learning_rate": 4.958256645820078e-05, "loss": 1.2405, "step": 26900 }, { "epoch": 0.41900091559459335, "grad_norm": 2.7813985347747803, "learning_rate": 4.958101460295784e-05, "loss": 1.223, "step": 27000 }, { "epoch": 0.4205527708375363, "grad_norm": 2.108531951904297, "learning_rate": 4.9579462747714897e-05, "loss": 1.2304, "step": 27100 }, { "epoch": 0.4221046260804792, "grad_norm": 2.433366060256958, "learning_rate": 4.9577910892471954e-05, "loss": 1.2246, "step": 27200 }, { "epoch": 0.42365648132342215, "grad_norm": 2.5393569469451904, "learning_rate": 4.957635903722901e-05, "loss": 1.2182, "step": 27300 }, { "epoch": 0.4252083365663651, "grad_norm": 2.4050772190093994, "learning_rate": 4.957480718198606e-05, "loss": 1.2049, "step": 27400 }, { "epoch": 0.426760191809308, "grad_norm": 2.2601988315582275, "learning_rate": 4.957325532674312e-05, "loss": 1.2286, "step": 27500 }, { "epoch": 0.42831204705225095, "grad_norm": 2.256404399871826, "learning_rate": 4.957170347150018e-05, "loss": 1.2019, "step": 27600 }, { "epoch": 0.4298639022951939, "grad_norm": 2.2497708797454834, "learning_rate": 4.9570151616257236e-05, "loss": 1.232, "step": 27700 }, { "epoch": 0.4314157575381368, "grad_norm": 2.4851603507995605, "learning_rate": 4.9568599761014294e-05, "loss": 1.239, "step": 27800 }, { "epoch": 0.43296761278107976, "grad_norm": 2.6804866790771484, "learning_rate": 4.956704790577135e-05, "loss": 1.2131, "step": 27900 }, { "epoch": 0.4345194680240227, "grad_norm": 2.489208221435547, "learning_rate": 4.956549605052841e-05, "loss": 1.2201, "step": 28000 }, { "epoch": 0.4360713232669657, "grad_norm": 2.5706231594085693, "learning_rate": 4.956394419528547e-05, "loss": 1.215, "step": 28100 }, { "epoch": 0.4376231785099086, "grad_norm": 2.389012575149536, "learning_rate": 4.9562392340042525e-05, "loss": 1.217, "step": 28200 }, { "epoch": 0.43917503375285155, "grad_norm": 2.303879976272583, "learning_rate": 4.956084048479958e-05, "loss": 1.2309, "step": 28300 }, { "epoch": 0.4407268889957945, "grad_norm": 2.1704635620117188, "learning_rate": 4.955928862955664e-05, "loss": 1.2183, "step": 28400 }, { "epoch": 0.4422787442387374, "grad_norm": 2.5813958644866943, "learning_rate": 4.95577367743137e-05, "loss": 1.2302, "step": 28500 }, { "epoch": 0.44383059948168035, "grad_norm": 2.516469717025757, "learning_rate": 4.9556184919070756e-05, "loss": 1.2101, "step": 28600 }, { "epoch": 0.4453824547246233, "grad_norm": 2.3752951622009277, "learning_rate": 4.955463306382781e-05, "loss": 1.2356, "step": 28700 }, { "epoch": 0.4469343099675662, "grad_norm": 2.4214041233062744, "learning_rate": 4.9553081208584865e-05, "loss": 1.2182, "step": 28800 }, { "epoch": 0.44848616521050916, "grad_norm": 2.519256830215454, "learning_rate": 4.955152935334192e-05, "loss": 1.2132, "step": 28900 }, { "epoch": 0.4500380204534521, "grad_norm": 2.636441707611084, "learning_rate": 4.954997749809898e-05, "loss": 1.2161, "step": 29000 }, { "epoch": 0.451589875696395, "grad_norm": 2.670457601547241, "learning_rate": 4.954842564285604e-05, "loss": 1.213, "step": 29100 }, { "epoch": 0.45314173093933796, "grad_norm": 2.537473201751709, "learning_rate": 4.9546873787613096e-05, "loss": 1.209, "step": 29200 }, { "epoch": 0.4546935861822809, "grad_norm": 2.7418036460876465, "learning_rate": 4.9545321932370154e-05, "loss": 1.2262, "step": 29300 }, { "epoch": 0.45624544142522383, "grad_norm": 2.5072531700134277, "learning_rate": 4.954377007712721e-05, "loss": 1.2484, "step": 29400 }, { "epoch": 0.4577972966681668, "grad_norm": 2.53163480758667, "learning_rate": 4.954221822188426e-05, "loss": 1.2302, "step": 29500 }, { "epoch": 0.45934915191110975, "grad_norm": 2.8376755714416504, "learning_rate": 4.954066636664132e-05, "loss": 1.2309, "step": 29600 }, { "epoch": 0.4609010071540527, "grad_norm": 2.69342303276062, "learning_rate": 4.953911451139838e-05, "loss": 1.1855, "step": 29700 }, { "epoch": 0.4624528623969956, "grad_norm": 1.7856605052947998, "learning_rate": 4.9537562656155436e-05, "loss": 1.2031, "step": 29800 }, { "epoch": 0.46400471763993856, "grad_norm": 2.4834160804748535, "learning_rate": 4.953601080091249e-05, "loss": 1.2339, "step": 29900 }, { "epoch": 0.4655565728828815, "grad_norm": 2.7890806198120117, "learning_rate": 4.953445894566955e-05, "loss": 1.2066, "step": 30000 }, { "epoch": 0.4671084281258244, "grad_norm": 2.3287105560302734, "learning_rate": 4.953290709042661e-05, "loss": 1.2008, "step": 30100 }, { "epoch": 0.46866028336876736, "grad_norm": 2.444023847579956, "learning_rate": 4.953135523518366e-05, "loss": 1.2253, "step": 30200 }, { "epoch": 0.4702121386117103, "grad_norm": 2.365927219390869, "learning_rate": 4.952980337994072e-05, "loss": 1.2339, "step": 30300 }, { "epoch": 0.47176399385465323, "grad_norm": 2.406822443008423, "learning_rate": 4.9528251524697775e-05, "loss": 1.2363, "step": 30400 }, { "epoch": 0.47331584909759616, "grad_norm": 2.8443877696990967, "learning_rate": 4.952669966945483e-05, "loss": 1.1863, "step": 30500 }, { "epoch": 0.4748677043405391, "grad_norm": 2.464362382888794, "learning_rate": 4.952514781421189e-05, "loss": 1.2202, "step": 30600 }, { "epoch": 0.47641955958348203, "grad_norm": 2.1502931118011475, "learning_rate": 4.952359595896895e-05, "loss": 1.2172, "step": 30700 }, { "epoch": 0.47797141482642497, "grad_norm": 2.2455809116363525, "learning_rate": 4.9522044103726006e-05, "loss": 1.1961, "step": 30800 }, { "epoch": 0.4795232700693679, "grad_norm": 2.384324789047241, "learning_rate": 4.9520492248483064e-05, "loss": 1.2245, "step": 30900 }, { "epoch": 0.4810751253123109, "grad_norm": 2.1174631118774414, "learning_rate": 4.951894039324012e-05, "loss": 1.231, "step": 31000 }, { "epoch": 0.4826269805552538, "grad_norm": 2.583064079284668, "learning_rate": 4.951738853799718e-05, "loss": 1.2121, "step": 31100 }, { "epoch": 0.48417883579819676, "grad_norm": 2.519484519958496, "learning_rate": 4.951583668275424e-05, "loss": 1.2247, "step": 31200 }, { "epoch": 0.4857306910411397, "grad_norm": 2.051809310913086, "learning_rate": 4.9514284827511295e-05, "loss": 1.2104, "step": 31300 }, { "epoch": 0.48728254628408263, "grad_norm": 2.2689340114593506, "learning_rate": 4.951273297226835e-05, "loss": 1.2178, "step": 31400 }, { "epoch": 0.48883440152702556, "grad_norm": 2.4677417278289795, "learning_rate": 4.9511181117025404e-05, "loss": 1.2253, "step": 31500 }, { "epoch": 0.4903862567699685, "grad_norm": 2.7817842960357666, "learning_rate": 4.950962926178246e-05, "loss": 1.1976, "step": 31600 }, { "epoch": 0.49193811201291143, "grad_norm": 2.4424521923065186, "learning_rate": 4.950807740653952e-05, "loss": 1.2278, "step": 31700 }, { "epoch": 0.49348996725585437, "grad_norm": 2.227247476577759, "learning_rate": 4.950652555129658e-05, "loss": 1.2264, "step": 31800 }, { "epoch": 0.4950418224987973, "grad_norm": 2.371188163757324, "learning_rate": 4.9504973696053635e-05, "loss": 1.2268, "step": 31900 }, { "epoch": 0.49659367774174024, "grad_norm": 2.4499406814575195, "learning_rate": 4.950342184081069e-05, "loss": 1.2268, "step": 32000 }, { "epoch": 0.49814553298468317, "grad_norm": 2.518974542617798, "learning_rate": 4.950186998556775e-05, "loss": 1.1959, "step": 32100 }, { "epoch": 0.4996973882276261, "grad_norm": 2.6548304557800293, "learning_rate": 4.950031813032481e-05, "loss": 1.1835, "step": 32200 }, { "epoch": 0.5012492434705691, "grad_norm": 2.252213478088379, "learning_rate": 4.9498766275081866e-05, "loss": 1.1971, "step": 32300 }, { "epoch": 0.502801098713512, "grad_norm": 2.8409087657928467, "learning_rate": 4.9497214419838924e-05, "loss": 1.2026, "step": 32400 }, { "epoch": 0.504352953956455, "grad_norm": 2.7140536308288574, "learning_rate": 4.949566256459598e-05, "loss": 1.2188, "step": 32500 }, { "epoch": 0.5059048091993978, "grad_norm": 2.8111579418182373, "learning_rate": 4.949411070935304e-05, "loss": 1.2012, "step": 32600 }, { "epoch": 0.5074566644423408, "grad_norm": 2.5844101905822754, "learning_rate": 4.94925588541101e-05, "loss": 1.233, "step": 32700 }, { "epoch": 0.5090085196852837, "grad_norm": 2.6166341304779053, "learning_rate": 4.949100699886715e-05, "loss": 1.2062, "step": 32800 }, { "epoch": 0.5105603749282267, "grad_norm": 2.949582099914551, "learning_rate": 4.9489455143624206e-05, "loss": 1.2164, "step": 32900 }, { "epoch": 0.5121122301711696, "grad_norm": 2.595062732696533, "learning_rate": 4.9487903288381257e-05, "loss": 1.1917, "step": 33000 }, { "epoch": 0.5136640854141126, "grad_norm": 2.294433832168579, "learning_rate": 4.9486351433138314e-05, "loss": 1.1904, "step": 33100 }, { "epoch": 0.5152159406570556, "grad_norm": 2.191683769226074, "learning_rate": 4.948479957789537e-05, "loss": 1.2262, "step": 33200 }, { "epoch": 0.5167677958999984, "grad_norm": 2.4490966796875, "learning_rate": 4.948324772265243e-05, "loss": 1.2134, "step": 33300 }, { "epoch": 0.5183196511429414, "grad_norm": 2.3449840545654297, "learning_rate": 4.948169586740949e-05, "loss": 1.2198, "step": 33400 }, { "epoch": 0.5198715063858843, "grad_norm": 2.706878423690796, "learning_rate": 4.9480144012166545e-05, "loss": 1.199, "step": 33500 }, { "epoch": 0.5214233616288273, "grad_norm": 2.4588654041290283, "learning_rate": 4.94785921569236e-05, "loss": 1.1981, "step": 33600 }, { "epoch": 0.5229752168717702, "grad_norm": 1.9115830659866333, "learning_rate": 4.947704030168066e-05, "loss": 1.1902, "step": 33700 }, { "epoch": 0.5245270721147132, "grad_norm": 2.573307752609253, "learning_rate": 4.947548844643772e-05, "loss": 1.2159, "step": 33800 }, { "epoch": 0.526078927357656, "grad_norm": 2.6550095081329346, "learning_rate": 4.9473936591194776e-05, "loss": 1.2299, "step": 33900 }, { "epoch": 0.527630782600599, "grad_norm": 2.6623427867889404, "learning_rate": 4.9472384735951834e-05, "loss": 1.1988, "step": 34000 }, { "epoch": 0.5291826378435419, "grad_norm": 2.2674150466918945, "learning_rate": 4.947083288070889e-05, "loss": 1.2129, "step": 34100 }, { "epoch": 0.5307344930864849, "grad_norm": 2.6334874629974365, "learning_rate": 4.946928102546595e-05, "loss": 1.207, "step": 34200 }, { "epoch": 0.5322863483294278, "grad_norm": 2.237846612930298, "learning_rate": 4.946772917022301e-05, "loss": 1.2161, "step": 34300 }, { "epoch": 0.5338382035723708, "grad_norm": 2.623734712600708, "learning_rate": 4.946617731498006e-05, "loss": 1.2111, "step": 34400 }, { "epoch": 0.5353900588153138, "grad_norm": 1.9834754467010498, "learning_rate": 4.9464625459737116e-05, "loss": 1.1908, "step": 34500 }, { "epoch": 0.5369419140582566, "grad_norm": 2.475658893585205, "learning_rate": 4.9463073604494174e-05, "loss": 1.2285, "step": 34600 }, { "epoch": 0.5384937693011996, "grad_norm": 2.4245009422302246, "learning_rate": 4.946152174925123e-05, "loss": 1.1966, "step": 34700 }, { "epoch": 0.5400456245441425, "grad_norm": 3.1218440532684326, "learning_rate": 4.945996989400829e-05, "loss": 1.1862, "step": 34800 }, { "epoch": 0.5415974797870855, "grad_norm": 2.3402438163757324, "learning_rate": 4.945841803876535e-05, "loss": 1.1826, "step": 34900 }, { "epoch": 0.5431493350300284, "grad_norm": 2.3297884464263916, "learning_rate": 4.9456866183522405e-05, "loss": 1.2174, "step": 35000 }, { "epoch": 0.5447011902729714, "grad_norm": 2.5114550590515137, "learning_rate": 4.945531432827946e-05, "loss": 1.1835, "step": 35100 }, { "epoch": 0.5462530455159142, "grad_norm": 2.144836187362671, "learning_rate": 4.945376247303652e-05, "loss": 1.1995, "step": 35200 }, { "epoch": 0.5478049007588572, "grad_norm": 2.517759323120117, "learning_rate": 4.945221061779358e-05, "loss": 1.1771, "step": 35300 }, { "epoch": 0.5493567560018001, "grad_norm": 2.3586151599884033, "learning_rate": 4.9450658762550636e-05, "loss": 1.2175, "step": 35400 }, { "epoch": 0.5509086112447431, "grad_norm": 2.2889606952667236, "learning_rate": 4.9449106907307694e-05, "loss": 1.2047, "step": 35500 }, { "epoch": 0.552460466487686, "grad_norm": 2.326240062713623, "learning_rate": 4.944755505206475e-05, "loss": 1.2015, "step": 35600 }, { "epoch": 0.554012321730629, "grad_norm": 2.626574754714966, "learning_rate": 4.94460031968218e-05, "loss": 1.1887, "step": 35700 }, { "epoch": 0.5555641769735719, "grad_norm": 2.350536823272705, "learning_rate": 4.944445134157886e-05, "loss": 1.1921, "step": 35800 }, { "epoch": 0.5571160322165148, "grad_norm": 2.2332732677459717, "learning_rate": 4.944289948633592e-05, "loss": 1.1849, "step": 35900 }, { "epoch": 0.5586678874594578, "grad_norm": 2.3713724613189697, "learning_rate": 4.944134763109297e-05, "loss": 1.2198, "step": 36000 }, { "epoch": 0.5602197427024007, "grad_norm": 2.2064812183380127, "learning_rate": 4.9439795775850027e-05, "loss": 1.1886, "step": 36100 }, { "epoch": 0.5617715979453437, "grad_norm": 2.26542329788208, "learning_rate": 4.9438243920607084e-05, "loss": 1.2246, "step": 36200 }, { "epoch": 0.5633234531882866, "grad_norm": 2.454099655151367, "learning_rate": 4.943669206536414e-05, "loss": 1.1883, "step": 36300 }, { "epoch": 0.5648753084312296, "grad_norm": 2.276982545852661, "learning_rate": 4.94351402101212e-05, "loss": 1.211, "step": 36400 }, { "epoch": 0.5664271636741725, "grad_norm": 5.26003885269165, "learning_rate": 4.943358835487826e-05, "loss": 1.2051, "step": 36500 }, { "epoch": 0.5679790189171154, "grad_norm": 2.4926297664642334, "learning_rate": 4.9432036499635315e-05, "loss": 1.2035, "step": 36600 }, { "epoch": 0.5695308741600583, "grad_norm": 2.156940221786499, "learning_rate": 4.943048464439237e-05, "loss": 1.1906, "step": 36700 }, { "epoch": 0.5710827294030013, "grad_norm": 2.564739465713501, "learning_rate": 4.942893278914943e-05, "loss": 1.1878, "step": 36800 }, { "epoch": 0.5726345846459442, "grad_norm": 2.4842190742492676, "learning_rate": 4.942738093390649e-05, "loss": 1.1958, "step": 36900 }, { "epoch": 0.5741864398888872, "grad_norm": 2.535102128982544, "learning_rate": 4.9425829078663546e-05, "loss": 1.1939, "step": 37000 }, { "epoch": 0.5757382951318301, "grad_norm": 1.9911900758743286, "learning_rate": 4.9424277223420604e-05, "loss": 1.2075, "step": 37100 }, { "epoch": 0.577290150374773, "grad_norm": 2.1373379230499268, "learning_rate": 4.9422725368177655e-05, "loss": 1.1974, "step": 37200 }, { "epoch": 0.5788420056177159, "grad_norm": 2.4389595985412598, "learning_rate": 4.942117351293471e-05, "loss": 1.1826, "step": 37300 }, { "epoch": 0.5803938608606589, "grad_norm": 2.045496940612793, "learning_rate": 4.941962165769177e-05, "loss": 1.1752, "step": 37400 }, { "epoch": 0.5819457161036019, "grad_norm": 2.981600046157837, "learning_rate": 4.941806980244883e-05, "loss": 1.1878, "step": 37500 }, { "epoch": 0.5834975713465448, "grad_norm": 1.9566932916641235, "learning_rate": 4.9416517947205886e-05, "loss": 1.1831, "step": 37600 }, { "epoch": 0.5850494265894878, "grad_norm": 2.4527249336242676, "learning_rate": 4.9414966091962944e-05, "loss": 1.1865, "step": 37700 }, { "epoch": 0.5866012818324307, "grad_norm": 2.4431509971618652, "learning_rate": 4.941341423672e-05, "loss": 1.1936, "step": 37800 }, { "epoch": 0.5881531370753736, "grad_norm": 2.659628391265869, "learning_rate": 4.941186238147706e-05, "loss": 1.201, "step": 37900 }, { "epoch": 0.5897049923183165, "grad_norm": 2.275744915008545, "learning_rate": 4.941031052623412e-05, "loss": 1.1947, "step": 38000 }, { "epoch": 0.5912568475612595, "grad_norm": 2.5866122245788574, "learning_rate": 4.9408758670991175e-05, "loss": 1.199, "step": 38100 }, { "epoch": 0.5928087028042024, "grad_norm": 2.1171488761901855, "learning_rate": 4.940720681574823e-05, "loss": 1.1872, "step": 38200 }, { "epoch": 0.5943605580471454, "grad_norm": 2.8767952919006348, "learning_rate": 4.940565496050529e-05, "loss": 1.1998, "step": 38300 }, { "epoch": 0.5959124132900883, "grad_norm": 2.4268555641174316, "learning_rate": 4.940410310526235e-05, "loss": 1.2072, "step": 38400 }, { "epoch": 0.5974642685330313, "grad_norm": 2.5592246055603027, "learning_rate": 4.94025512500194e-05, "loss": 1.1669, "step": 38500 }, { "epoch": 0.5990161237759741, "grad_norm": 2.657672643661499, "learning_rate": 4.940099939477646e-05, "loss": 1.1985, "step": 38600 }, { "epoch": 0.6005679790189171, "grad_norm": 2.3190295696258545, "learning_rate": 4.9399447539533515e-05, "loss": 1.2081, "step": 38700 }, { "epoch": 0.60211983426186, "grad_norm": 2.345186948776245, "learning_rate": 4.939789568429057e-05, "loss": 1.182, "step": 38800 }, { "epoch": 0.603671689504803, "grad_norm": 1.9601801633834839, "learning_rate": 4.939634382904763e-05, "loss": 1.2019, "step": 38900 }, { "epoch": 0.605223544747746, "grad_norm": 2.403519630432129, "learning_rate": 4.939479197380469e-05, "loss": 1.1669, "step": 39000 }, { "epoch": 0.6067753999906889, "grad_norm": 2.4064042568206787, "learning_rate": 4.9393240118561746e-05, "loss": 1.2144, "step": 39100 }, { "epoch": 0.6083272552336318, "grad_norm": 2.2512192726135254, "learning_rate": 4.93916882633188e-05, "loss": 1.1835, "step": 39200 }, { "epoch": 0.6098791104765747, "grad_norm": 2.9290380477905273, "learning_rate": 4.9390136408075854e-05, "loss": 1.2249, "step": 39300 }, { "epoch": 0.6114309657195177, "grad_norm": 2.927769184112549, "learning_rate": 4.938858455283291e-05, "loss": 1.1738, "step": 39400 }, { "epoch": 0.6129828209624606, "grad_norm": 2.0681636333465576, "learning_rate": 4.938703269758997e-05, "loss": 1.1852, "step": 39500 }, { "epoch": 0.6145346762054036, "grad_norm": 2.5736312866210938, "learning_rate": 4.938548084234703e-05, "loss": 1.1714, "step": 39600 }, { "epoch": 0.6160865314483465, "grad_norm": 2.8512165546417236, "learning_rate": 4.9383928987104085e-05, "loss": 1.2095, "step": 39700 }, { "epoch": 0.6176383866912895, "grad_norm": 2.954066276550293, "learning_rate": 4.938237713186114e-05, "loss": 1.17, "step": 39800 }, { "epoch": 0.6191902419342323, "grad_norm": 2.868504285812378, "learning_rate": 4.93808252766182e-05, "loss": 1.1838, "step": 39900 }, { "epoch": 0.6207420971771753, "grad_norm": 2.705115795135498, "learning_rate": 4.937927342137525e-05, "loss": 1.1723, "step": 40000 }, { "epoch": 0.6222939524201182, "grad_norm": 2.2740418910980225, "learning_rate": 4.937772156613231e-05, "loss": 1.1834, "step": 40100 }, { "epoch": 0.6238458076630612, "grad_norm": 2.30437970161438, "learning_rate": 4.937616971088937e-05, "loss": 1.1938, "step": 40200 }, { "epoch": 0.6253976629060042, "grad_norm": 2.5184919834136963, "learning_rate": 4.9374617855646425e-05, "loss": 1.1787, "step": 40300 }, { "epoch": 0.6269495181489471, "grad_norm": 2.4476919174194336, "learning_rate": 4.937306600040348e-05, "loss": 1.1752, "step": 40400 }, { "epoch": 0.62850137339189, "grad_norm": 2.5979440212249756, "learning_rate": 4.937151414516054e-05, "loss": 1.1858, "step": 40500 }, { "epoch": 0.6300532286348329, "grad_norm": 2.6650726795196533, "learning_rate": 4.93699622899176e-05, "loss": 1.1962, "step": 40600 }, { "epoch": 0.6316050838777759, "grad_norm": 2.5451135635375977, "learning_rate": 4.9368410434674656e-05, "loss": 1.1736, "step": 40700 }, { "epoch": 0.6331569391207188, "grad_norm": 2.4065561294555664, "learning_rate": 4.9366858579431714e-05, "loss": 1.1918, "step": 40800 }, { "epoch": 0.6347087943636618, "grad_norm": 2.2645041942596436, "learning_rate": 4.936530672418877e-05, "loss": 1.1868, "step": 40900 }, { "epoch": 0.6362606496066047, "grad_norm": 2.4546401500701904, "learning_rate": 4.936375486894583e-05, "loss": 1.1942, "step": 41000 }, { "epoch": 0.6378125048495477, "grad_norm": 2.800821304321289, "learning_rate": 4.936220301370289e-05, "loss": 1.2016, "step": 41100 }, { "epoch": 0.6393643600924905, "grad_norm": 2.5779294967651367, "learning_rate": 4.9360651158459945e-05, "loss": 1.194, "step": 41200 }, { "epoch": 0.6409162153354335, "grad_norm": 2.337620258331299, "learning_rate": 4.9359099303216996e-05, "loss": 1.185, "step": 41300 }, { "epoch": 0.6424680705783764, "grad_norm": 2.5104012489318848, "learning_rate": 4.9357547447974054e-05, "loss": 1.1907, "step": 41400 }, { "epoch": 0.6440199258213194, "grad_norm": 1.9994335174560547, "learning_rate": 4.935599559273111e-05, "loss": 1.1789, "step": 41500 }, { "epoch": 0.6455717810642623, "grad_norm": 2.1678597927093506, "learning_rate": 4.935444373748817e-05, "loss": 1.2008, "step": 41600 }, { "epoch": 0.6471236363072053, "grad_norm": 2.5696334838867188, "learning_rate": 4.935289188224523e-05, "loss": 1.1764, "step": 41700 }, { "epoch": 0.6486754915501483, "grad_norm": 1.8987754583358765, "learning_rate": 4.9351340027002285e-05, "loss": 1.189, "step": 41800 }, { "epoch": 0.6502273467930911, "grad_norm": 2.203730583190918, "learning_rate": 4.934978817175934e-05, "loss": 1.1696, "step": 41900 }, { "epoch": 0.6517792020360341, "grad_norm": 2.6431314945220947, "learning_rate": 4.93482363165164e-05, "loss": 1.1753, "step": 42000 }, { "epoch": 0.653331057278977, "grad_norm": 2.334282159805298, "learning_rate": 4.934668446127346e-05, "loss": 1.1855, "step": 42100 }, { "epoch": 0.65488291252192, "grad_norm": 2.236422538757324, "learning_rate": 4.9345132606030516e-05, "loss": 1.1647, "step": 42200 }, { "epoch": 0.6564347677648629, "grad_norm": 2.5629477500915527, "learning_rate": 4.934358075078757e-05, "loss": 1.1844, "step": 42300 }, { "epoch": 0.6579866230078059, "grad_norm": 2.0746262073516846, "learning_rate": 4.934202889554463e-05, "loss": 1.2069, "step": 42400 }, { "epoch": 0.6595384782507487, "grad_norm": 2.7469513416290283, "learning_rate": 4.934047704030168e-05, "loss": 1.1948, "step": 42500 }, { "epoch": 0.6610903334936917, "grad_norm": 2.3980820178985596, "learning_rate": 4.933892518505874e-05, "loss": 1.1709, "step": 42600 }, { "epoch": 0.6626421887366346, "grad_norm": 2.0644824504852295, "learning_rate": 4.93373733298158e-05, "loss": 1.1578, "step": 42700 }, { "epoch": 0.6641940439795776, "grad_norm": 2.0668458938598633, "learning_rate": 4.9335821474572855e-05, "loss": 1.1685, "step": 42800 }, { "epoch": 0.6657458992225205, "grad_norm": 2.388491153717041, "learning_rate": 4.9334269619329906e-05, "loss": 1.2028, "step": 42900 }, { "epoch": 0.6672977544654635, "grad_norm": 2.12837815284729, "learning_rate": 4.9332717764086964e-05, "loss": 1.1808, "step": 43000 }, { "epoch": 0.6688496097084063, "grad_norm": 2.2032430171966553, "learning_rate": 4.933116590884402e-05, "loss": 1.2024, "step": 43100 }, { "epoch": 0.6704014649513493, "grad_norm": 2.159217119216919, "learning_rate": 4.932961405360108e-05, "loss": 1.1655, "step": 43200 }, { "epoch": 0.6719533201942923, "grad_norm": 2.3776962757110596, "learning_rate": 4.932806219835814e-05, "loss": 1.1871, "step": 43300 }, { "epoch": 0.6735051754372352, "grad_norm": 2.2626845836639404, "learning_rate": 4.9326510343115195e-05, "loss": 1.1781, "step": 43400 }, { "epoch": 0.6750570306801782, "grad_norm": 2.6178290843963623, "learning_rate": 4.932495848787225e-05, "loss": 1.1559, "step": 43500 }, { "epoch": 0.6766088859231211, "grad_norm": 2.829810857772827, "learning_rate": 4.932340663262931e-05, "loss": 1.1873, "step": 43600 }, { "epoch": 0.6781607411660641, "grad_norm": 2.2946605682373047, "learning_rate": 4.932185477738637e-05, "loss": 1.1635, "step": 43700 }, { "epoch": 0.6797125964090069, "grad_norm": 2.151526927947998, "learning_rate": 4.9320302922143426e-05, "loss": 1.1584, "step": 43800 }, { "epoch": 0.6812644516519499, "grad_norm": 2.657257556915283, "learning_rate": 4.9318751066900484e-05, "loss": 1.1814, "step": 43900 }, { "epoch": 0.6828163068948928, "grad_norm": 2.409086227416992, "learning_rate": 4.931719921165754e-05, "loss": 1.1559, "step": 44000 }, { "epoch": 0.6843681621378358, "grad_norm": 2.9715707302093506, "learning_rate": 4.93156473564146e-05, "loss": 1.1686, "step": 44100 }, { "epoch": 0.6859200173807787, "grad_norm": 2.454049825668335, "learning_rate": 4.931409550117165e-05, "loss": 1.1941, "step": 44200 }, { "epoch": 0.6874718726237217, "grad_norm": 2.4585883617401123, "learning_rate": 4.931254364592871e-05, "loss": 1.1679, "step": 44300 }, { "epoch": 0.6890237278666645, "grad_norm": 2.2637085914611816, "learning_rate": 4.9310991790685766e-05, "loss": 1.1554, "step": 44400 }, { "epoch": 0.6905755831096075, "grad_norm": 2.476701498031616, "learning_rate": 4.9309439935442824e-05, "loss": 1.1643, "step": 44500 }, { "epoch": 0.6921274383525505, "grad_norm": 2.5342164039611816, "learning_rate": 4.930788808019988e-05, "loss": 1.1638, "step": 44600 }, { "epoch": 0.6936792935954934, "grad_norm": 2.2393851280212402, "learning_rate": 4.930633622495694e-05, "loss": 1.2029, "step": 44700 }, { "epoch": 0.6952311488384364, "grad_norm": 1.9192265272140503, "learning_rate": 4.9304784369714e-05, "loss": 1.164, "step": 44800 }, { "epoch": 0.6967830040813793, "grad_norm": 2.471797227859497, "learning_rate": 4.9303232514471055e-05, "loss": 1.192, "step": 44900 }, { "epoch": 0.6983348593243223, "grad_norm": 2.543172836303711, "learning_rate": 4.930168065922811e-05, "loss": 1.2015, "step": 45000 }, { "epoch": 0.6998867145672651, "grad_norm": 2.39199161529541, "learning_rate": 4.930012880398517e-05, "loss": 1.1897, "step": 45100 }, { "epoch": 0.7014385698102081, "grad_norm": 2.072190761566162, "learning_rate": 4.929857694874223e-05, "loss": 1.1632, "step": 45200 }, { "epoch": 0.702990425053151, "grad_norm": 2.616067886352539, "learning_rate": 4.9297025093499286e-05, "loss": 1.1706, "step": 45300 }, { "epoch": 0.704542280296094, "grad_norm": 2.564262628555298, "learning_rate": 4.929547323825634e-05, "loss": 1.1801, "step": 45400 }, { "epoch": 0.7060941355390369, "grad_norm": 2.7784082889556885, "learning_rate": 4.9293921383013394e-05, "loss": 1.1692, "step": 45500 }, { "epoch": 0.7076459907819799, "grad_norm": 2.6110994815826416, "learning_rate": 4.929236952777045e-05, "loss": 1.1739, "step": 45600 }, { "epoch": 0.7091978460249228, "grad_norm": 2.4873533248901367, "learning_rate": 4.929081767252751e-05, "loss": 1.1768, "step": 45700 }, { "epoch": 0.7107497012678657, "grad_norm": 2.507765293121338, "learning_rate": 4.928926581728456e-05, "loss": 1.1507, "step": 45800 }, { "epoch": 0.7123015565108086, "grad_norm": 2.48184871673584, "learning_rate": 4.928771396204162e-05, "loss": 1.1544, "step": 45900 }, { "epoch": 0.7138534117537516, "grad_norm": 2.669926404953003, "learning_rate": 4.9286162106798676e-05, "loss": 1.1586, "step": 46000 }, { "epoch": 0.7154052669966946, "grad_norm": 2.7033112049102783, "learning_rate": 4.9284610251555734e-05, "loss": 1.1434, "step": 46100 }, { "epoch": 0.7169571222396375, "grad_norm": 2.89276385307312, "learning_rate": 4.928305839631279e-05, "loss": 1.1791, "step": 46200 }, { "epoch": 0.7185089774825805, "grad_norm": 2.1775076389312744, "learning_rate": 4.928150654106985e-05, "loss": 1.1713, "step": 46300 }, { "epoch": 0.7200608327255233, "grad_norm": 1.9711344242095947, "learning_rate": 4.927995468582691e-05, "loss": 1.1589, "step": 46400 }, { "epoch": 0.7216126879684663, "grad_norm": 2.7642791271209717, "learning_rate": 4.9278402830583965e-05, "loss": 1.1848, "step": 46500 }, { "epoch": 0.7231645432114092, "grad_norm": 2.3391270637512207, "learning_rate": 4.927685097534102e-05, "loss": 1.1541, "step": 46600 }, { "epoch": 0.7247163984543522, "grad_norm": 2.115884304046631, "learning_rate": 4.927529912009808e-05, "loss": 1.1777, "step": 46700 }, { "epoch": 0.7262682536972951, "grad_norm": 2.2775306701660156, "learning_rate": 4.927374726485514e-05, "loss": 1.1581, "step": 46800 }, { "epoch": 0.7278201089402381, "grad_norm": 2.1826443672180176, "learning_rate": 4.9272195409612196e-05, "loss": 1.1632, "step": 46900 }, { "epoch": 0.729371964183181, "grad_norm": 2.4894049167633057, "learning_rate": 4.927064355436925e-05, "loss": 1.1399, "step": 47000 }, { "epoch": 0.7309238194261239, "grad_norm": 2.074448347091675, "learning_rate": 4.9269091699126305e-05, "loss": 1.1723, "step": 47100 }, { "epoch": 0.7324756746690668, "grad_norm": 2.058256149291992, "learning_rate": 4.926753984388336e-05, "loss": 1.154, "step": 47200 }, { "epoch": 0.7340275299120098, "grad_norm": 2.3936984539031982, "learning_rate": 4.926598798864042e-05, "loss": 1.1684, "step": 47300 }, { "epoch": 0.7355793851549527, "grad_norm": 2.1671347618103027, "learning_rate": 4.926443613339748e-05, "loss": 1.1699, "step": 47400 }, { "epoch": 0.7371312403978957, "grad_norm": 2.8088905811309814, "learning_rate": 4.9262884278154536e-05, "loss": 1.1851, "step": 47500 }, { "epoch": 0.7386830956408387, "grad_norm": 2.416687488555908, "learning_rate": 4.9261332422911594e-05, "loss": 1.1686, "step": 47600 }, { "epoch": 0.7402349508837816, "grad_norm": 2.3578453063964844, "learning_rate": 4.925978056766865e-05, "loss": 1.1633, "step": 47700 }, { "epoch": 0.7417868061267245, "grad_norm": 2.4843294620513916, "learning_rate": 4.925822871242571e-05, "loss": 1.193, "step": 47800 }, { "epoch": 0.7433386613696674, "grad_norm": 2.8165409564971924, "learning_rate": 4.925667685718277e-05, "loss": 1.1511, "step": 47900 }, { "epoch": 0.7448905166126104, "grad_norm": 2.834263801574707, "learning_rate": 4.9255125001939825e-05, "loss": 1.1584, "step": 48000 }, { "epoch": 0.7464423718555533, "grad_norm": 2.8275694847106934, "learning_rate": 4.925357314669688e-05, "loss": 1.1521, "step": 48100 }, { "epoch": 0.7479942270984963, "grad_norm": 2.2874298095703125, "learning_rate": 4.925202129145394e-05, "loss": 1.1639, "step": 48200 }, { "epoch": 0.7495460823414392, "grad_norm": 2.553553819656372, "learning_rate": 4.925046943621099e-05, "loss": 1.1721, "step": 48300 }, { "epoch": 0.7510979375843821, "grad_norm": 2.650235891342163, "learning_rate": 4.924891758096805e-05, "loss": 1.1823, "step": 48400 }, { "epoch": 0.752649792827325, "grad_norm": 2.454193115234375, "learning_rate": 4.9247365725725107e-05, "loss": 1.1605, "step": 48500 }, { "epoch": 0.754201648070268, "grad_norm": 1.828253149986267, "learning_rate": 4.9245813870482164e-05, "loss": 1.1657, "step": 48600 }, { "epoch": 0.7557535033132109, "grad_norm": 2.834845781326294, "learning_rate": 4.924426201523922e-05, "loss": 1.164, "step": 48700 }, { "epoch": 0.7573053585561539, "grad_norm": 2.4240832328796387, "learning_rate": 4.924271015999628e-05, "loss": 1.1809, "step": 48800 }, { "epoch": 0.7588572137990969, "grad_norm": 2.1284701824188232, "learning_rate": 4.924115830475334e-05, "loss": 1.1779, "step": 48900 }, { "epoch": 0.7604090690420398, "grad_norm": 2.523000478744507, "learning_rate": 4.923960644951039e-05, "loss": 1.1723, "step": 49000 }, { "epoch": 0.7619609242849827, "grad_norm": 3.0821704864501953, "learning_rate": 4.9238054594267446e-05, "loss": 1.1751, "step": 49100 }, { "epoch": 0.7635127795279256, "grad_norm": 2.4138882160186768, "learning_rate": 4.9236502739024504e-05, "loss": 1.1896, "step": 49200 }, { "epoch": 0.7650646347708686, "grad_norm": 2.178921699523926, "learning_rate": 4.923495088378156e-05, "loss": 1.1525, "step": 49300 }, { "epoch": 0.7666164900138115, "grad_norm": 2.6186108589172363, "learning_rate": 4.923339902853862e-05, "loss": 1.1462, "step": 49400 }, { "epoch": 0.7681683452567545, "grad_norm": 2.2610292434692383, "learning_rate": 4.923184717329568e-05, "loss": 1.144, "step": 49500 }, { "epoch": 0.7697202004996974, "grad_norm": 2.416614532470703, "learning_rate": 4.9230295318052735e-05, "loss": 1.1613, "step": 49600 }, { "epoch": 0.7712720557426404, "grad_norm": 2.1591925621032715, "learning_rate": 4.922874346280979e-05, "loss": 1.1597, "step": 49700 }, { "epoch": 0.7728239109855832, "grad_norm": 2.146529197692871, "learning_rate": 4.9227191607566844e-05, "loss": 1.1813, "step": 49800 }, { "epoch": 0.7743757662285262, "grad_norm": 2.157388687133789, "learning_rate": 4.92256397523239e-05, "loss": 1.1822, "step": 49900 }, { "epoch": 0.7759276214714691, "grad_norm": 2.572301149368286, "learning_rate": 4.922408789708096e-05, "loss": 1.1529, "step": 50000 }, { "epoch": 0.7774794767144121, "grad_norm": 2.306133508682251, "learning_rate": 4.922253604183802e-05, "loss": 1.159, "step": 50100 }, { "epoch": 0.779031331957355, "grad_norm": 2.453583002090454, "learning_rate": 4.9220984186595075e-05, "loss": 1.1693, "step": 50200 }, { "epoch": 0.780583187200298, "grad_norm": 2.0289504528045654, "learning_rate": 4.921943233135213e-05, "loss": 1.1689, "step": 50300 }, { "epoch": 0.782135042443241, "grad_norm": 1.9087867736816406, "learning_rate": 4.921788047610919e-05, "loss": 1.1394, "step": 50400 }, { "epoch": 0.7836868976861838, "grad_norm": 2.9248099327087402, "learning_rate": 4.921632862086625e-05, "loss": 1.2043, "step": 50500 }, { "epoch": 0.7852387529291268, "grad_norm": 2.4978187084198, "learning_rate": 4.9214776765623306e-05, "loss": 1.1652, "step": 50600 }, { "epoch": 0.7867906081720697, "grad_norm": 2.174706220626831, "learning_rate": 4.9213224910380364e-05, "loss": 1.1464, "step": 50700 }, { "epoch": 0.7883424634150127, "grad_norm": 2.3342819213867188, "learning_rate": 4.921167305513742e-05, "loss": 1.1577, "step": 50800 }, { "epoch": 0.7898943186579556, "grad_norm": 2.4408538341522217, "learning_rate": 4.921012119989448e-05, "loss": 1.1765, "step": 50900 }, { "epoch": 0.7914461739008986, "grad_norm": 2.0617897510528564, "learning_rate": 4.920856934465154e-05, "loss": 1.1725, "step": 51000 }, { "epoch": 0.7929980291438414, "grad_norm": 2.025838613510132, "learning_rate": 4.920701748940859e-05, "loss": 1.1499, "step": 51100 }, { "epoch": 0.7945498843867844, "grad_norm": 2.2229199409484863, "learning_rate": 4.9205465634165645e-05, "loss": 1.1594, "step": 51200 }, { "epoch": 0.7961017396297273, "grad_norm": 2.204599618911743, "learning_rate": 4.92039137789227e-05, "loss": 1.1582, "step": 51300 }, { "epoch": 0.7976535948726703, "grad_norm": 2.3283772468566895, "learning_rate": 4.920236192367976e-05, "loss": 1.1775, "step": 51400 }, { "epoch": 0.7992054501156132, "grad_norm": 2.6537115573883057, "learning_rate": 4.920081006843682e-05, "loss": 1.1585, "step": 51500 }, { "epoch": 0.8007573053585562, "grad_norm": 2.701730966567993, "learning_rate": 4.9199258213193877e-05, "loss": 1.1458, "step": 51600 }, { "epoch": 0.802309160601499, "grad_norm": 2.5185375213623047, "learning_rate": 4.9197706357950934e-05, "loss": 1.1652, "step": 51700 }, { "epoch": 0.803861015844442, "grad_norm": 2.5204620361328125, "learning_rate": 4.919615450270799e-05, "loss": 1.1552, "step": 51800 }, { "epoch": 0.805412871087385, "grad_norm": 2.1909873485565186, "learning_rate": 4.919460264746505e-05, "loss": 1.1573, "step": 51900 }, { "epoch": 0.8069647263303279, "grad_norm": 2.4740612506866455, "learning_rate": 4.919305079222211e-05, "loss": 1.1607, "step": 52000 }, { "epoch": 0.8085165815732709, "grad_norm": 2.4286158084869385, "learning_rate": 4.9191498936979165e-05, "loss": 1.1464, "step": 52100 }, { "epoch": 0.8100684368162138, "grad_norm": 3.082249402999878, "learning_rate": 4.918994708173622e-05, "loss": 1.1656, "step": 52200 }, { "epoch": 0.8116202920591568, "grad_norm": 2.3874146938323975, "learning_rate": 4.9188395226493274e-05, "loss": 1.1537, "step": 52300 }, { "epoch": 0.8131721473020996, "grad_norm": 1.971757411956787, "learning_rate": 4.918684337125033e-05, "loss": 1.147, "step": 52400 }, { "epoch": 0.8147240025450426, "grad_norm": 2.264465808868408, "learning_rate": 4.918529151600739e-05, "loss": 1.1783, "step": 52500 }, { "epoch": 0.8162758577879855, "grad_norm": 2.1094117164611816, "learning_rate": 4.918373966076445e-05, "loss": 1.1577, "step": 52600 }, { "epoch": 0.8178277130309285, "grad_norm": 2.7979469299316406, "learning_rate": 4.91821878055215e-05, "loss": 1.1858, "step": 52700 }, { "epoch": 0.8193795682738714, "grad_norm": 2.128664255142212, "learning_rate": 4.9180635950278556e-05, "loss": 1.1684, "step": 52800 }, { "epoch": 0.8209314235168144, "grad_norm": 2.249389886856079, "learning_rate": 4.9179084095035614e-05, "loss": 1.1516, "step": 52900 }, { "epoch": 0.8224832787597572, "grad_norm": 2.815535306930542, "learning_rate": 4.917753223979267e-05, "loss": 1.1731, "step": 53000 }, { "epoch": 0.8240351340027002, "grad_norm": 2.402355670928955, "learning_rate": 4.917598038454973e-05, "loss": 1.1582, "step": 53100 }, { "epoch": 0.8255869892456432, "grad_norm": 2.3408007621765137, "learning_rate": 4.917442852930679e-05, "loss": 1.1628, "step": 53200 }, { "epoch": 0.8271388444885861, "grad_norm": 1.9599260091781616, "learning_rate": 4.9172876674063845e-05, "loss": 1.1622, "step": 53300 }, { "epoch": 0.8286906997315291, "grad_norm": 2.192831039428711, "learning_rate": 4.91713248188209e-05, "loss": 1.1531, "step": 53400 }, { "epoch": 0.830242554974472, "grad_norm": 2.5755300521850586, "learning_rate": 4.916977296357796e-05, "loss": 1.1222, "step": 53500 }, { "epoch": 0.831794410217415, "grad_norm": 2.547351598739624, "learning_rate": 4.916822110833502e-05, "loss": 1.1501, "step": 53600 }, { "epoch": 0.8333462654603578, "grad_norm": 2.4684717655181885, "learning_rate": 4.9166669253092076e-05, "loss": 1.1566, "step": 53700 }, { "epoch": 0.8348981207033008, "grad_norm": 2.467470169067383, "learning_rate": 4.9165117397849134e-05, "loss": 1.1623, "step": 53800 }, { "epoch": 0.8364499759462437, "grad_norm": 2.0983071327209473, "learning_rate": 4.916356554260619e-05, "loss": 1.1362, "step": 53900 }, { "epoch": 0.8380018311891867, "grad_norm": 2.5323948860168457, "learning_rate": 4.916201368736324e-05, "loss": 1.154, "step": 54000 }, { "epoch": 0.8395536864321296, "grad_norm": 2.034572124481201, "learning_rate": 4.91604618321203e-05, "loss": 1.1444, "step": 54100 }, { "epoch": 0.8411055416750726, "grad_norm": 2.1224417686462402, "learning_rate": 4.915890997687736e-05, "loss": 1.154, "step": 54200 }, { "epoch": 0.8426573969180154, "grad_norm": 3.1934731006622314, "learning_rate": 4.9157358121634415e-05, "loss": 1.1572, "step": 54300 }, { "epoch": 0.8442092521609584, "grad_norm": 2.6752889156341553, "learning_rate": 4.915580626639147e-05, "loss": 1.1566, "step": 54400 }, { "epoch": 0.8457611074039013, "grad_norm": 2.5010483264923096, "learning_rate": 4.915425441114853e-05, "loss": 1.137, "step": 54500 }, { "epoch": 0.8473129626468443, "grad_norm": 2.677424669265747, "learning_rate": 4.915270255590559e-05, "loss": 1.1516, "step": 54600 }, { "epoch": 0.8488648178897873, "grad_norm": 2.1494462490081787, "learning_rate": 4.9151150700662647e-05, "loss": 1.1595, "step": 54700 }, { "epoch": 0.8504166731327302, "grad_norm": 2.7015750408172607, "learning_rate": 4.9149598845419704e-05, "loss": 1.1254, "step": 54800 }, { "epoch": 0.8519685283756732, "grad_norm": 2.3260157108306885, "learning_rate": 4.914804699017676e-05, "loss": 1.1585, "step": 54900 }, { "epoch": 0.853520383618616, "grad_norm": 2.4272515773773193, "learning_rate": 4.914649513493382e-05, "loss": 1.1311, "step": 55000 }, { "epoch": 0.855072238861559, "grad_norm": 2.377215623855591, "learning_rate": 4.914494327969088e-05, "loss": 1.124, "step": 55100 }, { "epoch": 0.8566240941045019, "grad_norm": 2.164163827896118, "learning_rate": 4.9143391424447935e-05, "loss": 1.1517, "step": 55200 }, { "epoch": 0.8581759493474449, "grad_norm": 2.0977590084075928, "learning_rate": 4.9141839569204986e-05, "loss": 1.1502, "step": 55300 }, { "epoch": 0.8597278045903878, "grad_norm": 2.5076000690460205, "learning_rate": 4.9140287713962044e-05, "loss": 1.1643, "step": 55400 }, { "epoch": 0.8612796598333308, "grad_norm": 2.3000075817108154, "learning_rate": 4.91387358587191e-05, "loss": 1.1459, "step": 55500 }, { "epoch": 0.8628315150762736, "grad_norm": 2.194390296936035, "learning_rate": 4.913718400347615e-05, "loss": 1.1429, "step": 55600 }, { "epoch": 0.8643833703192166, "grad_norm": 2.2268261909484863, "learning_rate": 4.913563214823321e-05, "loss": 1.1535, "step": 55700 }, { "epoch": 0.8659352255621595, "grad_norm": 2.369276285171509, "learning_rate": 4.913408029299027e-05, "loss": 1.1481, "step": 55800 }, { "epoch": 0.8674870808051025, "grad_norm": 2.3929903507232666, "learning_rate": 4.9132528437747326e-05, "loss": 1.1272, "step": 55900 }, { "epoch": 0.8690389360480454, "grad_norm": 2.2794313430786133, "learning_rate": 4.9130976582504384e-05, "loss": 1.1344, "step": 56000 }, { "epoch": 0.8705907912909884, "grad_norm": 2.3763296604156494, "learning_rate": 4.912942472726144e-05, "loss": 1.1352, "step": 56100 }, { "epoch": 0.8721426465339314, "grad_norm": 1.988471508026123, "learning_rate": 4.91278728720185e-05, "loss": 1.1746, "step": 56200 }, { "epoch": 0.8736945017768742, "grad_norm": 2.535183906555176, "learning_rate": 4.912632101677556e-05, "loss": 1.1507, "step": 56300 }, { "epoch": 0.8752463570198172, "grad_norm": 2.1881957054138184, "learning_rate": 4.9124769161532615e-05, "loss": 1.1351, "step": 56400 }, { "epoch": 0.8767982122627601, "grad_norm": 2.5924620628356934, "learning_rate": 4.912321730628967e-05, "loss": 1.1247, "step": 56500 }, { "epoch": 0.8783500675057031, "grad_norm": 2.0894575119018555, "learning_rate": 4.912166545104673e-05, "loss": 1.1498, "step": 56600 }, { "epoch": 0.879901922748646, "grad_norm": 2.1576178073883057, "learning_rate": 4.912011359580379e-05, "loss": 1.1684, "step": 56700 }, { "epoch": 0.881453777991589, "grad_norm": 2.6149630546569824, "learning_rate": 4.911856174056084e-05, "loss": 1.1556, "step": 56800 }, { "epoch": 0.8830056332345319, "grad_norm": 2.472132682800293, "learning_rate": 4.91170098853179e-05, "loss": 1.1493, "step": 56900 }, { "epoch": 0.8845574884774748, "grad_norm": 2.693777322769165, "learning_rate": 4.9115458030074954e-05, "loss": 1.1607, "step": 57000 }, { "epoch": 0.8861093437204177, "grad_norm": 2.4241716861724854, "learning_rate": 4.911390617483201e-05, "loss": 1.157, "step": 57100 }, { "epoch": 0.8876611989633607, "grad_norm": 2.381190061569214, "learning_rate": 4.911235431958907e-05, "loss": 1.1493, "step": 57200 }, { "epoch": 0.8892130542063036, "grad_norm": 2.6688320636749268, "learning_rate": 4.911080246434613e-05, "loss": 1.1479, "step": 57300 }, { "epoch": 0.8907649094492466, "grad_norm": 2.5460402965545654, "learning_rate": 4.9109250609103185e-05, "loss": 1.1582, "step": 57400 }, { "epoch": 0.8923167646921896, "grad_norm": 2.1748390197753906, "learning_rate": 4.910769875386024e-05, "loss": 1.156, "step": 57500 }, { "epoch": 0.8938686199351324, "grad_norm": 2.1711413860321045, "learning_rate": 4.91061468986173e-05, "loss": 1.1302, "step": 57600 }, { "epoch": 0.8954204751780754, "grad_norm": 2.1466925144195557, "learning_rate": 4.910459504337436e-05, "loss": 1.1266, "step": 57700 }, { "epoch": 0.8969723304210183, "grad_norm": 2.3548076152801514, "learning_rate": 4.9103043188131417e-05, "loss": 1.1514, "step": 57800 }, { "epoch": 0.8985241856639613, "grad_norm": 2.4585494995117188, "learning_rate": 4.9101491332888474e-05, "loss": 1.1548, "step": 57900 }, { "epoch": 0.9000760409069042, "grad_norm": 2.0424818992614746, "learning_rate": 4.909993947764553e-05, "loss": 1.1562, "step": 58000 }, { "epoch": 0.9016278961498472, "grad_norm": 2.4888811111450195, "learning_rate": 4.909838762240258e-05, "loss": 1.1365, "step": 58100 }, { "epoch": 0.90317975139279, "grad_norm": 2.359149694442749, "learning_rate": 4.909683576715964e-05, "loss": 1.1652, "step": 58200 }, { "epoch": 0.904731606635733, "grad_norm": 2.323132038116455, "learning_rate": 4.90952839119167e-05, "loss": 1.1558, "step": 58300 }, { "epoch": 0.9062834618786759, "grad_norm": 2.355215311050415, "learning_rate": 4.9093732056673756e-05, "loss": 1.1366, "step": 58400 }, { "epoch": 0.9078353171216189, "grad_norm": 2.5962116718292236, "learning_rate": 4.9092180201430814e-05, "loss": 1.1229, "step": 58500 }, { "epoch": 0.9093871723645618, "grad_norm": 2.4846644401550293, "learning_rate": 4.909062834618787e-05, "loss": 1.1543, "step": 58600 }, { "epoch": 0.9109390276075048, "grad_norm": 1.9102394580841064, "learning_rate": 4.908907649094493e-05, "loss": 1.1381, "step": 58700 }, { "epoch": 0.9124908828504477, "grad_norm": 2.333954095840454, "learning_rate": 4.908752463570198e-05, "loss": 1.1396, "step": 58800 }, { "epoch": 0.9140427380933906, "grad_norm": 2.653343915939331, "learning_rate": 4.908597278045904e-05, "loss": 1.1265, "step": 58900 }, { "epoch": 0.9155945933363336, "grad_norm": 2.3244781494140625, "learning_rate": 4.9084420925216096e-05, "loss": 1.1486, "step": 59000 }, { "epoch": 0.9171464485792765, "grad_norm": 2.522923469543457, "learning_rate": 4.9082869069973154e-05, "loss": 1.1305, "step": 59100 }, { "epoch": 0.9186983038222195, "grad_norm": 2.374894618988037, "learning_rate": 4.908131721473021e-05, "loss": 1.134, "step": 59200 }, { "epoch": 0.9202501590651624, "grad_norm": 2.0529372692108154, "learning_rate": 4.907976535948727e-05, "loss": 1.1355, "step": 59300 }, { "epoch": 0.9218020143081054, "grad_norm": 2.5219955444335938, "learning_rate": 4.907821350424433e-05, "loss": 1.1363, "step": 59400 }, { "epoch": 0.9233538695510483, "grad_norm": 2.275097370147705, "learning_rate": 4.9076661649001385e-05, "loss": 1.1624, "step": 59500 }, { "epoch": 0.9249057247939912, "grad_norm": 2.720305919647217, "learning_rate": 4.907510979375844e-05, "loss": 1.1596, "step": 59600 }, { "epoch": 0.9264575800369341, "grad_norm": 2.3368608951568604, "learning_rate": 4.9073557938515493e-05, "loss": 1.1253, "step": 59700 }, { "epoch": 0.9280094352798771, "grad_norm": 2.565573215484619, "learning_rate": 4.907200608327255e-05, "loss": 1.1385, "step": 59800 }, { "epoch": 0.92956129052282, "grad_norm": 2.826049327850342, "learning_rate": 4.907045422802961e-05, "loss": 1.1543, "step": 59900 }, { "epoch": 0.931113145765763, "grad_norm": 2.1456282138824463, "learning_rate": 4.906890237278667e-05, "loss": 1.1624, "step": 60000 }, { "epoch": 0.9326650010087059, "grad_norm": 1.9751325845718384, "learning_rate": 4.9067350517543724e-05, "loss": 1.1451, "step": 60100 }, { "epoch": 0.9342168562516489, "grad_norm": 2.428513526916504, "learning_rate": 4.906579866230078e-05, "loss": 1.1496, "step": 60200 }, { "epoch": 0.9357687114945917, "grad_norm": 2.0277485847473145, "learning_rate": 4.906424680705784e-05, "loss": 1.1404, "step": 60300 }, { "epoch": 0.9373205667375347, "grad_norm": 2.4008748531341553, "learning_rate": 4.90626949518149e-05, "loss": 1.1345, "step": 60400 }, { "epoch": 0.9388724219804777, "grad_norm": 2.1889076232910156, "learning_rate": 4.9061143096571955e-05, "loss": 1.1351, "step": 60500 }, { "epoch": 0.9404242772234206, "grad_norm": 2.0435073375701904, "learning_rate": 4.905959124132901e-05, "loss": 1.1466, "step": 60600 }, { "epoch": 0.9419761324663636, "grad_norm": 2.1171984672546387, "learning_rate": 4.905803938608607e-05, "loss": 1.117, "step": 60700 }, { "epoch": 0.9435279877093065, "grad_norm": 2.4131886959075928, "learning_rate": 4.905648753084313e-05, "loss": 1.1528, "step": 60800 }, { "epoch": 0.9450798429522494, "grad_norm": 2.251917839050293, "learning_rate": 4.9054935675600187e-05, "loss": 1.1577, "step": 60900 }, { "epoch": 0.9466316981951923, "grad_norm": 2.514467716217041, "learning_rate": 4.905338382035724e-05, "loss": 1.1501, "step": 61000 }, { "epoch": 0.9481835534381353, "grad_norm": 2.3124213218688965, "learning_rate": 4.9051831965114295e-05, "loss": 1.1464, "step": 61100 }, { "epoch": 0.9497354086810782, "grad_norm": 2.8343303203582764, "learning_rate": 4.905028010987135e-05, "loss": 1.1506, "step": 61200 }, { "epoch": 0.9512872639240212, "grad_norm": 2.353212356567383, "learning_rate": 4.904872825462841e-05, "loss": 1.1598, "step": 61300 }, { "epoch": 0.9528391191669641, "grad_norm": 2.13781476020813, "learning_rate": 4.904717639938547e-05, "loss": 1.1614, "step": 61400 }, { "epoch": 0.9543909744099071, "grad_norm": 2.4410207271575928, "learning_rate": 4.9045624544142526e-05, "loss": 1.1465, "step": 61500 }, { "epoch": 0.9559428296528499, "grad_norm": 2.4736881256103516, "learning_rate": 4.9044072688899584e-05, "loss": 1.142, "step": 61600 }, { "epoch": 0.9574946848957929, "grad_norm": 2.5588748455047607, "learning_rate": 4.904252083365664e-05, "loss": 1.142, "step": 61700 }, { "epoch": 0.9590465401387358, "grad_norm": 2.3748042583465576, "learning_rate": 4.90409689784137e-05, "loss": 1.1198, "step": 61800 }, { "epoch": 0.9605983953816788, "grad_norm": 2.3566460609436035, "learning_rate": 4.903941712317076e-05, "loss": 1.1478, "step": 61900 }, { "epoch": 0.9621502506246218, "grad_norm": 2.16545033454895, "learning_rate": 4.9037865267927815e-05, "loss": 1.1251, "step": 62000 }, { "epoch": 0.9637021058675647, "grad_norm": 2.2538864612579346, "learning_rate": 4.9036313412684866e-05, "loss": 1.1182, "step": 62100 }, { "epoch": 0.9652539611105077, "grad_norm": 2.1791608333587646, "learning_rate": 4.9034761557441924e-05, "loss": 1.1415, "step": 62200 }, { "epoch": 0.9668058163534505, "grad_norm": 2.3963756561279297, "learning_rate": 4.903320970219898e-05, "loss": 1.1554, "step": 62300 }, { "epoch": 0.9683576715963935, "grad_norm": 2.318190813064575, "learning_rate": 4.903165784695604e-05, "loss": 1.1583, "step": 62400 }, { "epoch": 0.9699095268393364, "grad_norm": 2.463308572769165, "learning_rate": 4.903010599171309e-05, "loss": 1.1356, "step": 62500 }, { "epoch": 0.9714613820822794, "grad_norm": 2.5972485542297363, "learning_rate": 4.902855413647015e-05, "loss": 1.1615, "step": 62600 }, { "epoch": 0.9730132373252223, "grad_norm": 2.307711362838745, "learning_rate": 4.9027002281227206e-05, "loss": 1.136, "step": 62700 }, { "epoch": 0.9745650925681653, "grad_norm": 2.5087623596191406, "learning_rate": 4.9025450425984263e-05, "loss": 1.1111, "step": 62800 }, { "epoch": 0.9761169478111081, "grad_norm": 2.0486855506896973, "learning_rate": 4.902389857074132e-05, "loss": 1.1293, "step": 62900 }, { "epoch": 0.9776688030540511, "grad_norm": 2.275099039077759, "learning_rate": 4.902234671549838e-05, "loss": 1.1552, "step": 63000 }, { "epoch": 0.979220658296994, "grad_norm": 2.1463348865509033, "learning_rate": 4.902079486025544e-05, "loss": 1.1491, "step": 63100 }, { "epoch": 0.980772513539937, "grad_norm": 2.0790605545043945, "learning_rate": 4.9019243005012494e-05, "loss": 1.1463, "step": 63200 }, { "epoch": 0.98232436878288, "grad_norm": 2.1598682403564453, "learning_rate": 4.901769114976955e-05, "loss": 1.15, "step": 63300 }, { "epoch": 0.9838762240258229, "grad_norm": 2.1825709342956543, "learning_rate": 4.901613929452661e-05, "loss": 1.1327, "step": 63400 }, { "epoch": 0.9854280792687659, "grad_norm": 2.0131757259368896, "learning_rate": 4.901458743928367e-05, "loss": 1.1529, "step": 63500 }, { "epoch": 0.9869799345117087, "grad_norm": 2.467144012451172, "learning_rate": 4.9013035584040725e-05, "loss": 1.148, "step": 63600 }, { "epoch": 0.9885317897546517, "grad_norm": 2.1347553730010986, "learning_rate": 4.901148372879778e-05, "loss": 1.1521, "step": 63700 }, { "epoch": 0.9900836449975946, "grad_norm": 1.8552659749984741, "learning_rate": 4.9009931873554834e-05, "loss": 1.1246, "step": 63800 }, { "epoch": 0.9916355002405376, "grad_norm": 2.1700820922851562, "learning_rate": 4.900838001831189e-05, "loss": 1.1601, "step": 63900 }, { "epoch": 0.9931873554834805, "grad_norm": 2.525789499282837, "learning_rate": 4.900682816306895e-05, "loss": 1.1187, "step": 64000 }, { "epoch": 0.9947392107264235, "grad_norm": 2.2519590854644775, "learning_rate": 4.900527630782601e-05, "loss": 1.1405, "step": 64100 }, { "epoch": 0.9962910659693663, "grad_norm": 2.1992874145507812, "learning_rate": 4.9003724452583065e-05, "loss": 1.1021, "step": 64200 }, { "epoch": 0.9978429212123093, "grad_norm": 2.641643762588501, "learning_rate": 4.900217259734012e-05, "loss": 1.1421, "step": 64300 }, { "epoch": 0.9993947764552522, "grad_norm": 2.3939990997314453, "learning_rate": 4.900062074209718e-05, "loss": 1.1421, "step": 64400 }, { "epoch": 1.0009466316981952, "grad_norm": 2.475574016571045, "learning_rate": 4.899906888685424e-05, "loss": 1.1344, "step": 64500 }, { "epoch": 1.0024984869411382, "grad_norm": 2.404494047164917, "learning_rate": 4.8997517031611296e-05, "loss": 1.1433, "step": 64600 }, { "epoch": 1.0040503421840812, "grad_norm": 2.2565083503723145, "learning_rate": 4.8995965176368354e-05, "loss": 1.1195, "step": 64700 }, { "epoch": 1.005602197427024, "grad_norm": 2.1716673374176025, "learning_rate": 4.899441332112541e-05, "loss": 1.126, "step": 64800 }, { "epoch": 1.007154052669967, "grad_norm": 2.1893420219421387, "learning_rate": 4.899286146588247e-05, "loss": 1.1386, "step": 64900 }, { "epoch": 1.00870590791291, "grad_norm": 2.226348876953125, "learning_rate": 4.899130961063953e-05, "loss": 1.1257, "step": 65000 }, { "epoch": 1.010257763155853, "grad_norm": 2.3709893226623535, "learning_rate": 4.898975775539658e-05, "loss": 1.1219, "step": 65100 }, { "epoch": 1.0118096183987957, "grad_norm": 2.157656669616699, "learning_rate": 4.8988205900153636e-05, "loss": 1.1369, "step": 65200 }, { "epoch": 1.0133614736417387, "grad_norm": 2.7935526371002197, "learning_rate": 4.898665404491069e-05, "loss": 1.1407, "step": 65300 }, { "epoch": 1.0149133288846817, "grad_norm": 2.5528039932250977, "learning_rate": 4.8985102189667745e-05, "loss": 1.1347, "step": 65400 }, { "epoch": 1.0164651841276247, "grad_norm": 2.5023515224456787, "learning_rate": 4.89835503344248e-05, "loss": 1.1333, "step": 65500 }, { "epoch": 1.0180170393705674, "grad_norm": 2.134716272354126, "learning_rate": 4.898199847918186e-05, "loss": 1.1342, "step": 65600 }, { "epoch": 1.0195688946135104, "grad_norm": 2.233942747116089, "learning_rate": 4.898044662393892e-05, "loss": 1.1213, "step": 65700 }, { "epoch": 1.0211207498564534, "grad_norm": 2.079174280166626, "learning_rate": 4.8978894768695976e-05, "loss": 1.1175, "step": 65800 }, { "epoch": 1.0226726050993964, "grad_norm": 2.600405693054199, "learning_rate": 4.8977342913453033e-05, "loss": 1.124, "step": 65900 }, { "epoch": 1.0242244603423392, "grad_norm": 2.2721426486968994, "learning_rate": 4.897579105821009e-05, "loss": 1.1279, "step": 66000 }, { "epoch": 1.0257763155852822, "grad_norm": 2.5375139713287354, "learning_rate": 4.897423920296715e-05, "loss": 1.1233, "step": 66100 }, { "epoch": 1.0273281708282251, "grad_norm": 2.007460355758667, "learning_rate": 4.897268734772421e-05, "loss": 1.1239, "step": 66200 }, { "epoch": 1.0288800260711681, "grad_norm": 2.4504077434539795, "learning_rate": 4.8971135492481264e-05, "loss": 1.1412, "step": 66300 }, { "epoch": 1.0304318813141111, "grad_norm": 2.283846616744995, "learning_rate": 4.896958363723832e-05, "loss": 1.1463, "step": 66400 }, { "epoch": 1.0319837365570539, "grad_norm": 2.5684947967529297, "learning_rate": 4.896803178199538e-05, "loss": 1.1288, "step": 66500 }, { "epoch": 1.0335355917999969, "grad_norm": 2.768786907196045, "learning_rate": 4.896647992675243e-05, "loss": 1.1344, "step": 66600 }, { "epoch": 1.0350874470429399, "grad_norm": 2.195347309112549, "learning_rate": 4.896492807150949e-05, "loss": 1.1032, "step": 66700 }, { "epoch": 1.0366393022858829, "grad_norm": 2.4260852336883545, "learning_rate": 4.8963376216266546e-05, "loss": 1.1314, "step": 66800 }, { "epoch": 1.0381911575288256, "grad_norm": 2.155688524246216, "learning_rate": 4.8961824361023604e-05, "loss": 1.1276, "step": 66900 }, { "epoch": 1.0397430127717686, "grad_norm": 2.5889296531677246, "learning_rate": 4.896027250578066e-05, "loss": 1.1214, "step": 67000 }, { "epoch": 1.0412948680147116, "grad_norm": 1.9910494089126587, "learning_rate": 4.895872065053772e-05, "loss": 1.1284, "step": 67100 }, { "epoch": 1.0428467232576546, "grad_norm": 2.4980008602142334, "learning_rate": 4.895716879529478e-05, "loss": 1.1312, "step": 67200 }, { "epoch": 1.0443985785005974, "grad_norm": 2.067190647125244, "learning_rate": 4.8955616940051835e-05, "loss": 1.1239, "step": 67300 }, { "epoch": 1.0459504337435404, "grad_norm": 2.1788415908813477, "learning_rate": 4.895406508480889e-05, "loss": 1.1316, "step": 67400 }, { "epoch": 1.0475022889864833, "grad_norm": 2.1426210403442383, "learning_rate": 4.895251322956595e-05, "loss": 1.1665, "step": 67500 }, { "epoch": 1.0490541442294263, "grad_norm": 2.366445779800415, "learning_rate": 4.895096137432301e-05, "loss": 1.1554, "step": 67600 }, { "epoch": 1.0506059994723693, "grad_norm": 2.381641149520874, "learning_rate": 4.8949409519080066e-05, "loss": 1.1363, "step": 67700 }, { "epoch": 1.052157854715312, "grad_norm": 2.5074310302734375, "learning_rate": 4.8947857663837124e-05, "loss": 1.127, "step": 67800 }, { "epoch": 1.053709709958255, "grad_norm": 2.143097162246704, "learning_rate": 4.8946305808594175e-05, "loss": 1.1321, "step": 67900 }, { "epoch": 1.055261565201198, "grad_norm": 2.051079511642456, "learning_rate": 4.894475395335123e-05, "loss": 1.1248, "step": 68000 }, { "epoch": 1.056813420444141, "grad_norm": 2.2431132793426514, "learning_rate": 4.894320209810829e-05, "loss": 1.1023, "step": 68100 }, { "epoch": 1.0583652756870838, "grad_norm": 2.1707863807678223, "learning_rate": 4.894165024286535e-05, "loss": 1.1428, "step": 68200 }, { "epoch": 1.0599171309300268, "grad_norm": 2.3224987983703613, "learning_rate": 4.8940098387622406e-05, "loss": 1.1147, "step": 68300 }, { "epoch": 1.0614689861729698, "grad_norm": 2.5109481811523438, "learning_rate": 4.8938546532379464e-05, "loss": 1.1263, "step": 68400 }, { "epoch": 1.0630208414159128, "grad_norm": 2.5767602920532227, "learning_rate": 4.893699467713652e-05, "loss": 1.1385, "step": 68500 }, { "epoch": 1.0645726966588556, "grad_norm": 2.0844085216522217, "learning_rate": 4.893544282189357e-05, "loss": 1.131, "step": 68600 }, { "epoch": 1.0661245519017986, "grad_norm": 2.0231311321258545, "learning_rate": 4.893389096665063e-05, "loss": 1.1136, "step": 68700 }, { "epoch": 1.0676764071447415, "grad_norm": 2.810903787612915, "learning_rate": 4.893233911140769e-05, "loss": 1.1221, "step": 68800 }, { "epoch": 1.0692282623876845, "grad_norm": 2.2353179454803467, "learning_rate": 4.8930787256164746e-05, "loss": 1.1242, "step": 68900 }, { "epoch": 1.0707801176306275, "grad_norm": 2.5603678226470947, "learning_rate": 4.8929235400921803e-05, "loss": 1.1207, "step": 69000 }, { "epoch": 1.0723319728735703, "grad_norm": 2.389472723007202, "learning_rate": 4.892768354567886e-05, "loss": 1.1246, "step": 69100 }, { "epoch": 1.0738838281165133, "grad_norm": 2.076622486114502, "learning_rate": 4.892613169043592e-05, "loss": 1.1193, "step": 69200 }, { "epoch": 1.0754356833594563, "grad_norm": 2.4756710529327393, "learning_rate": 4.892457983519298e-05, "loss": 1.1344, "step": 69300 }, { "epoch": 1.0769875386023993, "grad_norm": 2.454780101776123, "learning_rate": 4.8923027979950034e-05, "loss": 1.1415, "step": 69400 }, { "epoch": 1.078539393845342, "grad_norm": 2.361328125, "learning_rate": 4.8921476124707085e-05, "loss": 1.0997, "step": 69500 }, { "epoch": 1.080091249088285, "grad_norm": 2.194244623184204, "learning_rate": 4.891992426946414e-05, "loss": 1.1327, "step": 69600 }, { "epoch": 1.081643104331228, "grad_norm": 2.312431573867798, "learning_rate": 4.89183724142212e-05, "loss": 1.1287, "step": 69700 }, { "epoch": 1.083194959574171, "grad_norm": 2.243638038635254, "learning_rate": 4.891682055897826e-05, "loss": 1.1074, "step": 69800 }, { "epoch": 1.0847468148171138, "grad_norm": 2.416748046875, "learning_rate": 4.8915268703735316e-05, "loss": 1.0981, "step": 69900 }, { "epoch": 1.0862986700600568, "grad_norm": 2.2611000537872314, "learning_rate": 4.8913716848492374e-05, "loss": 1.1052, "step": 70000 }, { "epoch": 1.0878505253029997, "grad_norm": 2.5724949836730957, "learning_rate": 4.891216499324943e-05, "loss": 1.1139, "step": 70100 }, { "epoch": 1.0894023805459427, "grad_norm": 2.333221197128296, "learning_rate": 4.891061313800649e-05, "loss": 1.1286, "step": 70200 }, { "epoch": 1.0909542357888857, "grad_norm": 2.2546133995056152, "learning_rate": 4.890906128276355e-05, "loss": 1.1359, "step": 70300 }, { "epoch": 1.0925060910318285, "grad_norm": 2.327009916305542, "learning_rate": 4.8907509427520605e-05, "loss": 1.1243, "step": 70400 }, { "epoch": 1.0940579462747715, "grad_norm": 2.5289971828460693, "learning_rate": 4.890595757227766e-05, "loss": 1.1238, "step": 70500 }, { "epoch": 1.0956098015177145, "grad_norm": 2.1109981536865234, "learning_rate": 4.890440571703472e-05, "loss": 1.1208, "step": 70600 }, { "epoch": 1.0971616567606575, "grad_norm": 2.357865810394287, "learning_rate": 4.890285386179178e-05, "loss": 1.0835, "step": 70700 }, { "epoch": 1.0987135120036002, "grad_norm": 1.9613914489746094, "learning_rate": 4.890130200654883e-05, "loss": 1.1382, "step": 70800 }, { "epoch": 1.1002653672465432, "grad_norm": 2.448016405105591, "learning_rate": 4.889975015130589e-05, "loss": 1.1443, "step": 70900 }, { "epoch": 1.1018172224894862, "grad_norm": 2.627293586730957, "learning_rate": 4.8898198296062945e-05, "loss": 1.1332, "step": 71000 }, { "epoch": 1.1033690777324292, "grad_norm": 2.505042552947998, "learning_rate": 4.889664644082e-05, "loss": 1.1255, "step": 71100 }, { "epoch": 1.104920932975372, "grad_norm": 2.3451147079467773, "learning_rate": 4.889509458557706e-05, "loss": 1.1246, "step": 71200 }, { "epoch": 1.106472788218315, "grad_norm": 2.861656904220581, "learning_rate": 4.889354273033412e-05, "loss": 1.1213, "step": 71300 }, { "epoch": 1.108024643461258, "grad_norm": 1.72523033618927, "learning_rate": 4.8891990875091176e-05, "loss": 1.1387, "step": 71400 }, { "epoch": 1.109576498704201, "grad_norm": 2.3678364753723145, "learning_rate": 4.8890439019848234e-05, "loss": 1.1343, "step": 71500 }, { "epoch": 1.1111283539471437, "grad_norm": 2.4903464317321777, "learning_rate": 4.888888716460529e-05, "loss": 1.1297, "step": 71600 }, { "epoch": 1.1126802091900867, "grad_norm": 2.282066822052002, "learning_rate": 4.888733530936235e-05, "loss": 1.1206, "step": 71700 }, { "epoch": 1.1142320644330297, "grad_norm": 2.2484939098358154, "learning_rate": 4.888578345411941e-05, "loss": 1.1301, "step": 71800 }, { "epoch": 1.1157839196759727, "grad_norm": 2.3275198936462402, "learning_rate": 4.888423159887646e-05, "loss": 1.1381, "step": 71900 }, { "epoch": 1.1173357749189154, "grad_norm": 2.6546790599823, "learning_rate": 4.8882679743633516e-05, "loss": 1.1092, "step": 72000 }, { "epoch": 1.1188876301618584, "grad_norm": 2.0898914337158203, "learning_rate": 4.8881127888390573e-05, "loss": 1.1235, "step": 72100 }, { "epoch": 1.1204394854048014, "grad_norm": 2.0487396717071533, "learning_rate": 4.887957603314763e-05, "loss": 1.1279, "step": 72200 }, { "epoch": 1.1219913406477444, "grad_norm": 2.117793321609497, "learning_rate": 4.887802417790468e-05, "loss": 1.1277, "step": 72300 }, { "epoch": 1.1235431958906874, "grad_norm": 2.5232748985290527, "learning_rate": 4.887647232266174e-05, "loss": 1.1214, "step": 72400 }, { "epoch": 1.1250950511336302, "grad_norm": 2.1948370933532715, "learning_rate": 4.88749204674188e-05, "loss": 1.13, "step": 72500 }, { "epoch": 1.1266469063765732, "grad_norm": 2.259174108505249, "learning_rate": 4.8873368612175855e-05, "loss": 1.0967, "step": 72600 }, { "epoch": 1.1281987616195162, "grad_norm": 2.6604952812194824, "learning_rate": 4.887181675693291e-05, "loss": 1.0942, "step": 72700 }, { "epoch": 1.1297506168624591, "grad_norm": 3.044663906097412, "learning_rate": 4.887026490168997e-05, "loss": 1.1424, "step": 72800 }, { "epoch": 1.131302472105402, "grad_norm": 2.2361817359924316, "learning_rate": 4.886871304644703e-05, "loss": 1.1246, "step": 72900 }, { "epoch": 1.132854327348345, "grad_norm": 2.3046019077301025, "learning_rate": 4.8867161191204086e-05, "loss": 1.1244, "step": 73000 }, { "epoch": 1.134406182591288, "grad_norm": 2.2907626628875732, "learning_rate": 4.8865609335961144e-05, "loss": 1.1059, "step": 73100 }, { "epoch": 1.1359580378342309, "grad_norm": 2.809164047241211, "learning_rate": 4.88640574807182e-05, "loss": 1.1126, "step": 73200 }, { "epoch": 1.1375098930771737, "grad_norm": 2.2181551456451416, "learning_rate": 4.886250562547526e-05, "loss": 1.1062, "step": 73300 }, { "epoch": 1.1390617483201166, "grad_norm": 2.2118139266967773, "learning_rate": 4.886095377023232e-05, "loss": 1.1329, "step": 73400 }, { "epoch": 1.1406136035630596, "grad_norm": 2.2996928691864014, "learning_rate": 4.8859401914989375e-05, "loss": 1.1187, "step": 73500 }, { "epoch": 1.1421654588060026, "grad_norm": 2.208285093307495, "learning_rate": 4.8857850059746426e-05, "loss": 1.1083, "step": 73600 }, { "epoch": 1.1437173140489456, "grad_norm": 2.3561384677886963, "learning_rate": 4.8856298204503484e-05, "loss": 1.14, "step": 73700 }, { "epoch": 1.1452691692918884, "grad_norm": 2.080418586730957, "learning_rate": 4.885474634926054e-05, "loss": 1.096, "step": 73800 }, { "epoch": 1.1468210245348314, "grad_norm": 2.51823091506958, "learning_rate": 4.88531944940176e-05, "loss": 1.1224, "step": 73900 }, { "epoch": 1.1483728797777744, "grad_norm": 2.5477023124694824, "learning_rate": 4.885164263877466e-05, "loss": 1.1156, "step": 74000 }, { "epoch": 1.1499247350207173, "grad_norm": 2.4625790119171143, "learning_rate": 4.8850090783531715e-05, "loss": 1.114, "step": 74100 }, { "epoch": 1.1514765902636601, "grad_norm": 1.9517508745193481, "learning_rate": 4.884853892828877e-05, "loss": 1.1223, "step": 74200 }, { "epoch": 1.153028445506603, "grad_norm": 2.3685731887817383, "learning_rate": 4.884698707304583e-05, "loss": 1.1175, "step": 74300 }, { "epoch": 1.154580300749546, "grad_norm": 2.2436087131500244, "learning_rate": 4.884543521780289e-05, "loss": 1.1158, "step": 74400 }, { "epoch": 1.156132155992489, "grad_norm": 1.794718861579895, "learning_rate": 4.8843883362559946e-05, "loss": 1.108, "step": 74500 }, { "epoch": 1.1576840112354319, "grad_norm": 2.3092503547668457, "learning_rate": 4.8842331507317004e-05, "loss": 1.1249, "step": 74600 }, { "epoch": 1.1592358664783748, "grad_norm": 2.821805238723755, "learning_rate": 4.884077965207406e-05, "loss": 1.1062, "step": 74700 }, { "epoch": 1.1607877217213178, "grad_norm": 2.215714454650879, "learning_rate": 4.883922779683112e-05, "loss": 1.1163, "step": 74800 }, { "epoch": 1.1623395769642608, "grad_norm": 2.593536615371704, "learning_rate": 4.883767594158817e-05, "loss": 1.1088, "step": 74900 }, { "epoch": 1.1638914322072038, "grad_norm": 2.684123992919922, "learning_rate": 4.883612408634523e-05, "loss": 1.1392, "step": 75000 }, { "epoch": 1.1654432874501466, "grad_norm": 2.0079808235168457, "learning_rate": 4.8834572231102286e-05, "loss": 1.1066, "step": 75100 }, { "epoch": 1.1669951426930896, "grad_norm": 2.3010594844818115, "learning_rate": 4.883302037585934e-05, "loss": 1.1097, "step": 75200 }, { "epoch": 1.1685469979360326, "grad_norm": 1.901662826538086, "learning_rate": 4.8831468520616394e-05, "loss": 1.1249, "step": 75300 }, { "epoch": 1.1700988531789756, "grad_norm": 2.384251594543457, "learning_rate": 4.882991666537345e-05, "loss": 1.1023, "step": 75400 }, { "epoch": 1.1716507084219183, "grad_norm": 2.661597490310669, "learning_rate": 4.882836481013051e-05, "loss": 1.1202, "step": 75500 }, { "epoch": 1.1732025636648613, "grad_norm": 2.466740608215332, "learning_rate": 4.882681295488757e-05, "loss": 1.1092, "step": 75600 }, { "epoch": 1.1747544189078043, "grad_norm": 2.1879634857177734, "learning_rate": 4.8825261099644625e-05, "loss": 1.1207, "step": 75700 }, { "epoch": 1.1763062741507473, "grad_norm": 2.2805233001708984, "learning_rate": 4.882370924440168e-05, "loss": 1.1154, "step": 75800 }, { "epoch": 1.17785812939369, "grad_norm": 2.2602341175079346, "learning_rate": 4.882215738915874e-05, "loss": 1.1194, "step": 75900 }, { "epoch": 1.179409984636633, "grad_norm": 2.6319894790649414, "learning_rate": 4.88206055339158e-05, "loss": 1.1203, "step": 76000 }, { "epoch": 1.180961839879576, "grad_norm": 1.7584298849105835, "learning_rate": 4.8819053678672856e-05, "loss": 1.1068, "step": 76100 }, { "epoch": 1.182513695122519, "grad_norm": 2.8603458404541016, "learning_rate": 4.8817501823429914e-05, "loss": 1.1076, "step": 76200 }, { "epoch": 1.184065550365462, "grad_norm": 2.515455484390259, "learning_rate": 4.881594996818697e-05, "loss": 1.1081, "step": 76300 }, { "epoch": 1.1856174056084048, "grad_norm": 2.2006876468658447, "learning_rate": 4.881439811294403e-05, "loss": 1.116, "step": 76400 }, { "epoch": 1.1871692608513478, "grad_norm": 2.294879674911499, "learning_rate": 4.881284625770108e-05, "loss": 1.1128, "step": 76500 }, { "epoch": 1.1887211160942908, "grad_norm": 2.5908255577087402, "learning_rate": 4.881129440245814e-05, "loss": 1.1319, "step": 76600 }, { "epoch": 1.1902729713372338, "grad_norm": 2.358715057373047, "learning_rate": 4.8809742547215196e-05, "loss": 1.1311, "step": 76700 }, { "epoch": 1.1918248265801765, "grad_norm": 2.575032949447632, "learning_rate": 4.8808190691972254e-05, "loss": 1.1051, "step": 76800 }, { "epoch": 1.1933766818231195, "grad_norm": 1.993265151977539, "learning_rate": 4.880663883672931e-05, "loss": 1.1194, "step": 76900 }, { "epoch": 1.1949285370660625, "grad_norm": 1.9357131719589233, "learning_rate": 4.880508698148637e-05, "loss": 1.1034, "step": 77000 }, { "epoch": 1.1964803923090055, "grad_norm": 2.110600233078003, "learning_rate": 4.880353512624343e-05, "loss": 1.1222, "step": 77100 }, { "epoch": 1.1980322475519483, "grad_norm": 2.849364757537842, "learning_rate": 4.8801983271000485e-05, "loss": 1.1082, "step": 77200 }, { "epoch": 1.1995841027948912, "grad_norm": 2.535916805267334, "learning_rate": 4.880043141575754e-05, "loss": 1.1171, "step": 77300 }, { "epoch": 1.2011359580378342, "grad_norm": 2.435857057571411, "learning_rate": 4.87988795605146e-05, "loss": 1.1251, "step": 77400 }, { "epoch": 1.2026878132807772, "grad_norm": 2.0573973655700684, "learning_rate": 4.879732770527166e-05, "loss": 1.1083, "step": 77500 }, { "epoch": 1.2042396685237202, "grad_norm": 2.450852394104004, "learning_rate": 4.8795775850028716e-05, "loss": 1.1121, "step": 77600 }, { "epoch": 1.205791523766663, "grad_norm": 2.6014044284820557, "learning_rate": 4.8794223994785774e-05, "loss": 1.1065, "step": 77700 }, { "epoch": 1.207343379009606, "grad_norm": 1.7768484354019165, "learning_rate": 4.8792672139542825e-05, "loss": 1.1166, "step": 77800 }, { "epoch": 1.208895234252549, "grad_norm": 2.511260986328125, "learning_rate": 4.879112028429988e-05, "loss": 1.1028, "step": 77900 }, { "epoch": 1.210447089495492, "grad_norm": 2.6407852172851562, "learning_rate": 4.878956842905694e-05, "loss": 1.1105, "step": 78000 }, { "epoch": 1.2119989447384347, "grad_norm": 2.5305416584014893, "learning_rate": 4.8788016573814e-05, "loss": 1.1078, "step": 78100 }, { "epoch": 1.2135507999813777, "grad_norm": 1.7494454383850098, "learning_rate": 4.8786464718571056e-05, "loss": 1.1014, "step": 78200 }, { "epoch": 1.2151026552243207, "grad_norm": 2.407849073410034, "learning_rate": 4.8784912863328113e-05, "loss": 1.1084, "step": 78300 }, { "epoch": 1.2166545104672637, "grad_norm": 2.26851224899292, "learning_rate": 4.8783361008085164e-05, "loss": 1.1219, "step": 78400 }, { "epoch": 1.2182063657102065, "grad_norm": 2.121371030807495, "learning_rate": 4.878180915284222e-05, "loss": 1.1023, "step": 78500 }, { "epoch": 1.2197582209531495, "grad_norm": 2.2187118530273438, "learning_rate": 4.878025729759928e-05, "loss": 1.0841, "step": 78600 }, { "epoch": 1.2213100761960924, "grad_norm": 2.2014241218566895, "learning_rate": 4.877870544235634e-05, "loss": 1.1137, "step": 78700 }, { "epoch": 1.2228619314390354, "grad_norm": 2.2093615531921387, "learning_rate": 4.8777153587113395e-05, "loss": 1.1093, "step": 78800 }, { "epoch": 1.2244137866819784, "grad_norm": 1.9585342407226562, "learning_rate": 4.877560173187045e-05, "loss": 1.1248, "step": 78900 }, { "epoch": 1.2259656419249212, "grad_norm": 2.365720272064209, "learning_rate": 4.877404987662751e-05, "loss": 1.1303, "step": 79000 }, { "epoch": 1.2275174971678642, "grad_norm": 2.6853487491607666, "learning_rate": 4.877249802138457e-05, "loss": 1.1003, "step": 79100 }, { "epoch": 1.2290693524108072, "grad_norm": 2.127527952194214, "learning_rate": 4.8770946166141626e-05, "loss": 1.1123, "step": 79200 }, { "epoch": 1.23062120765375, "grad_norm": 2.4248695373535156, "learning_rate": 4.876939431089868e-05, "loss": 1.1194, "step": 79300 }, { "epoch": 1.232173062896693, "grad_norm": 2.3345465660095215, "learning_rate": 4.8767842455655735e-05, "loss": 1.1159, "step": 79400 }, { "epoch": 1.233724918139636, "grad_norm": 2.347710371017456, "learning_rate": 4.876629060041279e-05, "loss": 1.0867, "step": 79500 }, { "epoch": 1.235276773382579, "grad_norm": 2.7182693481445312, "learning_rate": 4.876473874516985e-05, "loss": 1.0936, "step": 79600 }, { "epoch": 1.236828628625522, "grad_norm": 2.3278050422668457, "learning_rate": 4.876318688992691e-05, "loss": 1.1071, "step": 79700 }, { "epoch": 1.2383804838684647, "grad_norm": 2.615981340408325, "learning_rate": 4.8761635034683966e-05, "loss": 1.1483, "step": 79800 }, { "epoch": 1.2399323391114077, "grad_norm": 2.1018478870391846, "learning_rate": 4.8760083179441024e-05, "loss": 1.1049, "step": 79900 }, { "epoch": 1.2414841943543506, "grad_norm": 2.3852782249450684, "learning_rate": 4.875853132419808e-05, "loss": 1.1416, "step": 80000 }, { "epoch": 1.2430360495972936, "grad_norm": 2.2100820541381836, "learning_rate": 4.875697946895514e-05, "loss": 1.1111, "step": 80100 }, { "epoch": 1.2445879048402366, "grad_norm": 2.3852789402008057, "learning_rate": 4.87554276137122e-05, "loss": 1.1309, "step": 80200 }, { "epoch": 1.2461397600831794, "grad_norm": 2.390979766845703, "learning_rate": 4.8753875758469255e-05, "loss": 1.1189, "step": 80300 }, { "epoch": 1.2476916153261224, "grad_norm": 2.1343493461608887, "learning_rate": 4.875232390322631e-05, "loss": 1.1065, "step": 80400 }, { "epoch": 1.2492434705690654, "grad_norm": 2.163029670715332, "learning_rate": 4.875077204798337e-05, "loss": 1.1176, "step": 80500 }, { "epoch": 1.2507953258120081, "grad_norm": 2.0536952018737793, "learning_rate": 4.874922019274042e-05, "loss": 1.0989, "step": 80600 }, { "epoch": 1.2523471810549511, "grad_norm": 2.367103099822998, "learning_rate": 4.874766833749748e-05, "loss": 1.1156, "step": 80700 }, { "epoch": 1.2538990362978941, "grad_norm": 2.1768832206726074, "learning_rate": 4.874611648225454e-05, "loss": 1.1134, "step": 80800 }, { "epoch": 1.255450891540837, "grad_norm": 1.9017785787582397, "learning_rate": 4.8744564627011595e-05, "loss": 1.109, "step": 80900 }, { "epoch": 1.25700274678378, "grad_norm": 2.1839749813079834, "learning_rate": 4.874301277176865e-05, "loss": 1.1015, "step": 81000 }, { "epoch": 1.2585546020267229, "grad_norm": 2.312918186187744, "learning_rate": 4.874146091652571e-05, "loss": 1.1079, "step": 81100 }, { "epoch": 1.2601064572696659, "grad_norm": 2.8707101345062256, "learning_rate": 4.873990906128277e-05, "loss": 1.1289, "step": 81200 }, { "epoch": 1.2616583125126088, "grad_norm": 2.3373329639434814, "learning_rate": 4.8738357206039826e-05, "loss": 1.1078, "step": 81300 }, { "epoch": 1.2632101677555518, "grad_norm": 2.244046926498413, "learning_rate": 4.8736805350796883e-05, "loss": 1.1323, "step": 81400 }, { "epoch": 1.2647620229984948, "grad_norm": 2.097012519836426, "learning_rate": 4.873525349555394e-05, "loss": 1.0956, "step": 81500 }, { "epoch": 1.2663138782414376, "grad_norm": 2.4968135356903076, "learning_rate": 4.873370164031099e-05, "loss": 1.1182, "step": 81600 }, { "epoch": 1.2678657334843806, "grad_norm": 2.7941203117370605, "learning_rate": 4.873214978506805e-05, "loss": 1.1228, "step": 81700 }, { "epoch": 1.2694175887273236, "grad_norm": 2.2232351303100586, "learning_rate": 4.873059792982511e-05, "loss": 1.1, "step": 81800 }, { "epoch": 1.2709694439702663, "grad_norm": 1.9244650602340698, "learning_rate": 4.8729046074582165e-05, "loss": 1.1156, "step": 81900 }, { "epoch": 1.2725212992132093, "grad_norm": 2.22092866897583, "learning_rate": 4.872749421933922e-05, "loss": 1.1156, "step": 82000 }, { "epoch": 1.2740731544561523, "grad_norm": 2.7483675479888916, "learning_rate": 4.8725942364096274e-05, "loss": 1.1184, "step": 82100 }, { "epoch": 1.2756250096990953, "grad_norm": 2.1573240756988525, "learning_rate": 4.872439050885333e-05, "loss": 1.1166, "step": 82200 }, { "epoch": 1.2771768649420383, "grad_norm": 2.4324727058410645, "learning_rate": 4.872283865361039e-05, "loss": 1.1138, "step": 82300 }, { "epoch": 1.278728720184981, "grad_norm": 2.7243478298187256, "learning_rate": 4.872128679836745e-05, "loss": 1.0961, "step": 82400 }, { "epoch": 1.280280575427924, "grad_norm": 1.8685250282287598, "learning_rate": 4.8719734943124505e-05, "loss": 1.1104, "step": 82500 }, { "epoch": 1.281832430670867, "grad_norm": 2.515303373336792, "learning_rate": 4.871818308788156e-05, "loss": 1.1209, "step": 82600 }, { "epoch": 1.28338428591381, "grad_norm": 2.028463125228882, "learning_rate": 4.871663123263862e-05, "loss": 1.0991, "step": 82700 }, { "epoch": 1.284936141156753, "grad_norm": 2.218553304672241, "learning_rate": 4.871507937739568e-05, "loss": 1.1049, "step": 82800 }, { "epoch": 1.2864879963996958, "grad_norm": 2.079744577407837, "learning_rate": 4.8713527522152736e-05, "loss": 1.0811, "step": 82900 }, { "epoch": 1.2880398516426388, "grad_norm": 2.152311086654663, "learning_rate": 4.8711975666909794e-05, "loss": 1.111, "step": 83000 }, { "epoch": 1.2895917068855818, "grad_norm": 2.2847890853881836, "learning_rate": 4.871042381166685e-05, "loss": 1.1185, "step": 83100 }, { "epoch": 1.2911435621285245, "grad_norm": 2.5103321075439453, "learning_rate": 4.870887195642391e-05, "loss": 1.1029, "step": 83200 }, { "epoch": 1.2926954173714675, "grad_norm": 2.4194531440734863, "learning_rate": 4.870732010118097e-05, "loss": 1.1068, "step": 83300 }, { "epoch": 1.2942472726144105, "grad_norm": 2.6363720893859863, "learning_rate": 4.870576824593802e-05, "loss": 1.1173, "step": 83400 }, { "epoch": 1.2957991278573535, "grad_norm": 2.5055174827575684, "learning_rate": 4.8704216390695076e-05, "loss": 1.1149, "step": 83500 }, { "epoch": 1.2973509831002965, "grad_norm": 2.532381057739258, "learning_rate": 4.8702664535452134e-05, "loss": 1.129, "step": 83600 }, { "epoch": 1.2989028383432393, "grad_norm": 1.9616467952728271, "learning_rate": 4.870111268020919e-05, "loss": 1.1239, "step": 83700 }, { "epoch": 1.3004546935861823, "grad_norm": 2.455014228820801, "learning_rate": 4.869956082496625e-05, "loss": 1.1165, "step": 83800 }, { "epoch": 1.3020065488291253, "grad_norm": 1.827030062675476, "learning_rate": 4.869800896972331e-05, "loss": 1.1004, "step": 83900 }, { "epoch": 1.303558404072068, "grad_norm": 2.5139052867889404, "learning_rate": 4.8696457114480365e-05, "loss": 1.1081, "step": 84000 }, { "epoch": 1.3051102593150112, "grad_norm": 2.587278366088867, "learning_rate": 4.869490525923742e-05, "loss": 1.1236, "step": 84100 }, { "epoch": 1.306662114557954, "grad_norm": 2.0649638175964355, "learning_rate": 4.869335340399448e-05, "loss": 1.1001, "step": 84200 }, { "epoch": 1.308213969800897, "grad_norm": 2.487438201904297, "learning_rate": 4.869180154875154e-05, "loss": 1.0837, "step": 84300 }, { "epoch": 1.30976582504384, "grad_norm": 2.7843523025512695, "learning_rate": 4.8690249693508596e-05, "loss": 1.1157, "step": 84400 }, { "epoch": 1.3113176802867827, "grad_norm": 2.4103662967681885, "learning_rate": 4.8688697838265653e-05, "loss": 1.1223, "step": 84500 }, { "epoch": 1.3128695355297257, "grad_norm": 2.319493532180786, "learning_rate": 4.868714598302271e-05, "loss": 1.0921, "step": 84600 }, { "epoch": 1.3144213907726687, "grad_norm": 2.398345947265625, "learning_rate": 4.868559412777976e-05, "loss": 1.0854, "step": 84700 }, { "epoch": 1.3159732460156117, "grad_norm": 2.239546060562134, "learning_rate": 4.868404227253682e-05, "loss": 1.0861, "step": 84800 }, { "epoch": 1.3175251012585547, "grad_norm": 2.4055612087249756, "learning_rate": 4.868249041729388e-05, "loss": 1.1129, "step": 84900 }, { "epoch": 1.3190769565014975, "grad_norm": 2.7176923751831055, "learning_rate": 4.868093856205093e-05, "loss": 1.1187, "step": 85000 }, { "epoch": 1.3206288117444405, "grad_norm": 2.2004525661468506, "learning_rate": 4.8679386706807986e-05, "loss": 1.0869, "step": 85100 }, { "epoch": 1.3221806669873835, "grad_norm": 2.2112913131713867, "learning_rate": 4.8677834851565044e-05, "loss": 1.1104, "step": 85200 }, { "epoch": 1.3237325222303262, "grad_norm": 2.4282455444335938, "learning_rate": 4.86762829963221e-05, "loss": 1.1057, "step": 85300 }, { "epoch": 1.3252843774732692, "grad_norm": 2.6111245155334473, "learning_rate": 4.867473114107916e-05, "loss": 1.0764, "step": 85400 }, { "epoch": 1.3268362327162122, "grad_norm": 2.459329605102539, "learning_rate": 4.867317928583622e-05, "loss": 1.1173, "step": 85500 }, { "epoch": 1.3283880879591552, "grad_norm": 2.4787330627441406, "learning_rate": 4.8671627430593275e-05, "loss": 1.1068, "step": 85600 }, { "epoch": 1.3299399432020982, "grad_norm": 2.342802047729492, "learning_rate": 4.867007557535033e-05, "loss": 1.1067, "step": 85700 }, { "epoch": 1.331491798445041, "grad_norm": 2.576122283935547, "learning_rate": 4.866852372010739e-05, "loss": 1.1048, "step": 85800 }, { "epoch": 1.333043653687984, "grad_norm": 2.542020559310913, "learning_rate": 4.866697186486445e-05, "loss": 1.0989, "step": 85900 }, { "epoch": 1.334595508930927, "grad_norm": 2.414774179458618, "learning_rate": 4.8665420009621506e-05, "loss": 1.0741, "step": 86000 }, { "epoch": 1.33614736417387, "grad_norm": 2.438695192337036, "learning_rate": 4.8663868154378564e-05, "loss": 1.1096, "step": 86100 }, { "epoch": 1.337699219416813, "grad_norm": 2.066688299179077, "learning_rate": 4.866231629913562e-05, "loss": 1.1083, "step": 86200 }, { "epoch": 1.3392510746597557, "grad_norm": 2.383652448654175, "learning_rate": 4.866076444389267e-05, "loss": 1.1034, "step": 86300 }, { "epoch": 1.3408029299026987, "grad_norm": 2.4665942192077637, "learning_rate": 4.865921258864973e-05, "loss": 1.1141, "step": 86400 }, { "epoch": 1.3423547851456417, "grad_norm": 2.3365814685821533, "learning_rate": 4.865766073340679e-05, "loss": 1.1207, "step": 86500 }, { "epoch": 1.3439066403885844, "grad_norm": 2.1258933544158936, "learning_rate": 4.8656108878163846e-05, "loss": 1.1238, "step": 86600 }, { "epoch": 1.3454584956315274, "grad_norm": 2.463226318359375, "learning_rate": 4.8654557022920904e-05, "loss": 1.0907, "step": 86700 }, { "epoch": 1.3470103508744704, "grad_norm": 2.3676583766937256, "learning_rate": 4.865300516767796e-05, "loss": 1.0928, "step": 86800 }, { "epoch": 1.3485622061174134, "grad_norm": 2.4078824520111084, "learning_rate": 4.865145331243502e-05, "loss": 1.1046, "step": 86900 }, { "epoch": 1.3501140613603564, "grad_norm": 2.385486602783203, "learning_rate": 4.864990145719208e-05, "loss": 1.1336, "step": 87000 }, { "epoch": 1.3516659166032992, "grad_norm": 2.1174564361572266, "learning_rate": 4.8648349601949135e-05, "loss": 1.1081, "step": 87100 }, { "epoch": 1.3532177718462421, "grad_norm": 2.3165135383605957, "learning_rate": 4.864679774670619e-05, "loss": 1.1124, "step": 87200 }, { "epoch": 1.3547696270891851, "grad_norm": 2.289062976837158, "learning_rate": 4.864524589146325e-05, "loss": 1.1163, "step": 87300 }, { "epoch": 1.3563214823321281, "grad_norm": 2.558507204055786, "learning_rate": 4.864369403622031e-05, "loss": 1.114, "step": 87400 }, { "epoch": 1.3578733375750711, "grad_norm": 2.215275526046753, "learning_rate": 4.8642142180977366e-05, "loss": 1.1004, "step": 87500 }, { "epoch": 1.3594251928180139, "grad_norm": 2.6778178215026855, "learning_rate": 4.864059032573442e-05, "loss": 1.1195, "step": 87600 }, { "epoch": 1.3609770480609569, "grad_norm": 2.3095626831054688, "learning_rate": 4.8639038470491474e-05, "loss": 1.0885, "step": 87700 }, { "epoch": 1.3625289033038999, "grad_norm": 2.2825675010681152, "learning_rate": 4.863748661524853e-05, "loss": 1.0986, "step": 87800 }, { "epoch": 1.3640807585468426, "grad_norm": 2.4111886024475098, "learning_rate": 4.863593476000559e-05, "loss": 1.1198, "step": 87900 }, { "epoch": 1.3656326137897856, "grad_norm": 2.4561874866485596, "learning_rate": 4.863438290476265e-05, "loss": 1.1111, "step": 88000 }, { "epoch": 1.3671844690327286, "grad_norm": 2.816755533218384, "learning_rate": 4.86328310495197e-05, "loss": 1.0981, "step": 88100 }, { "epoch": 1.3687363242756716, "grad_norm": 2.6150989532470703, "learning_rate": 4.8631279194276756e-05, "loss": 1.1013, "step": 88200 }, { "epoch": 1.3702881795186146, "grad_norm": 2.587946891784668, "learning_rate": 4.8629727339033814e-05, "loss": 1.1291, "step": 88300 }, { "epoch": 1.3718400347615574, "grad_norm": 2.921314239501953, "learning_rate": 4.862817548379087e-05, "loss": 1.1209, "step": 88400 }, { "epoch": 1.3733918900045003, "grad_norm": 1.9959654808044434, "learning_rate": 4.862662362854793e-05, "loss": 1.1078, "step": 88500 }, { "epoch": 1.3749437452474433, "grad_norm": 2.308357000350952, "learning_rate": 4.862507177330499e-05, "loss": 1.1152, "step": 88600 }, { "epoch": 1.3764956004903863, "grad_norm": 2.9256973266601562, "learning_rate": 4.8623519918062045e-05, "loss": 1.0843, "step": 88700 }, { "epoch": 1.3780474557333293, "grad_norm": 2.4615092277526855, "learning_rate": 4.86219680628191e-05, "loss": 1.0818, "step": 88800 }, { "epoch": 1.379599310976272, "grad_norm": 2.446812391281128, "learning_rate": 4.862041620757616e-05, "loss": 1.0925, "step": 88900 }, { "epoch": 1.381151166219215, "grad_norm": 2.7313692569732666, "learning_rate": 4.861886435233322e-05, "loss": 1.0902, "step": 89000 }, { "epoch": 1.382703021462158, "grad_norm": 2.348445177078247, "learning_rate": 4.861731249709027e-05, "loss": 1.1012, "step": 89100 }, { "epoch": 1.3842548767051008, "grad_norm": 2.393333673477173, "learning_rate": 4.861576064184733e-05, "loss": 1.1085, "step": 89200 }, { "epoch": 1.3858067319480438, "grad_norm": 2.2570338249206543, "learning_rate": 4.8614208786604385e-05, "loss": 1.1059, "step": 89300 }, { "epoch": 1.3873585871909868, "grad_norm": 2.2027170658111572, "learning_rate": 4.861265693136144e-05, "loss": 1.0874, "step": 89400 }, { "epoch": 1.3889104424339298, "grad_norm": 1.8995881080627441, "learning_rate": 4.86111050761185e-05, "loss": 1.1106, "step": 89500 }, { "epoch": 1.3904622976768728, "grad_norm": 2.977391481399536, "learning_rate": 4.860955322087556e-05, "loss": 1.1068, "step": 89600 }, { "epoch": 1.3920141529198156, "grad_norm": 2.1075680255889893, "learning_rate": 4.8608001365632616e-05, "loss": 1.1045, "step": 89700 }, { "epoch": 1.3935660081627586, "grad_norm": 2.444673776626587, "learning_rate": 4.8606449510389674e-05, "loss": 1.1307, "step": 89800 }, { "epoch": 1.3951178634057015, "grad_norm": 2.2788937091827393, "learning_rate": 4.860489765514673e-05, "loss": 1.0991, "step": 89900 }, { "epoch": 1.3966697186486445, "grad_norm": 2.2362778186798096, "learning_rate": 4.860334579990379e-05, "loss": 1.0801, "step": 90000 }, { "epoch": 1.3982215738915875, "grad_norm": 2.303495168685913, "learning_rate": 4.860179394466085e-05, "loss": 1.1049, "step": 90100 }, { "epoch": 1.3997734291345303, "grad_norm": 2.496016025543213, "learning_rate": 4.8600242089417905e-05, "loss": 1.0875, "step": 90200 }, { "epoch": 1.4013252843774733, "grad_norm": 2.088832378387451, "learning_rate": 4.859869023417496e-05, "loss": 1.1205, "step": 90300 }, { "epoch": 1.4028771396204163, "grad_norm": 2.3098864555358887, "learning_rate": 4.8597138378932013e-05, "loss": 1.1103, "step": 90400 }, { "epoch": 1.404428994863359, "grad_norm": 2.163699150085449, "learning_rate": 4.859558652368907e-05, "loss": 1.1078, "step": 90500 }, { "epoch": 1.405980850106302, "grad_norm": 2.240849733352661, "learning_rate": 4.859403466844613e-05, "loss": 1.0926, "step": 90600 }, { "epoch": 1.407532705349245, "grad_norm": 2.175450563430786, "learning_rate": 4.859248281320319e-05, "loss": 1.0944, "step": 90700 }, { "epoch": 1.409084560592188, "grad_norm": 2.29375958442688, "learning_rate": 4.8590930957960244e-05, "loss": 1.1037, "step": 90800 }, { "epoch": 1.410636415835131, "grad_norm": 2.3014724254608154, "learning_rate": 4.85893791027173e-05, "loss": 1.0915, "step": 90900 }, { "epoch": 1.4121882710780738, "grad_norm": 2.7147958278656006, "learning_rate": 4.858782724747436e-05, "loss": 1.1222, "step": 91000 }, { "epoch": 1.4137401263210168, "grad_norm": 2.025317668914795, "learning_rate": 4.858627539223142e-05, "loss": 1.0864, "step": 91100 }, { "epoch": 1.4152919815639597, "grad_norm": 2.586599826812744, "learning_rate": 4.8584723536988475e-05, "loss": 1.1084, "step": 91200 }, { "epoch": 1.4168438368069027, "grad_norm": 2.205930471420288, "learning_rate": 4.858317168174553e-05, "loss": 1.0884, "step": 91300 }, { "epoch": 1.4183956920498457, "grad_norm": 2.263949394226074, "learning_rate": 4.8581619826502584e-05, "loss": 1.0906, "step": 91400 }, { "epoch": 1.4199475472927885, "grad_norm": 2.1652019023895264, "learning_rate": 4.858006797125964e-05, "loss": 1.0805, "step": 91500 }, { "epoch": 1.4214994025357315, "grad_norm": 2.4087634086608887, "learning_rate": 4.85785161160167e-05, "loss": 1.1166, "step": 91600 }, { "epoch": 1.4230512577786745, "grad_norm": 2.43849515914917, "learning_rate": 4.857696426077376e-05, "loss": 1.0994, "step": 91700 }, { "epoch": 1.4246031130216172, "grad_norm": 2.8346004486083984, "learning_rate": 4.8575412405530815e-05, "loss": 1.0919, "step": 91800 }, { "epoch": 1.4261549682645602, "grad_norm": 2.0506465435028076, "learning_rate": 4.857386055028787e-05, "loss": 1.1118, "step": 91900 }, { "epoch": 1.4277068235075032, "grad_norm": 2.5766263008117676, "learning_rate": 4.8572308695044924e-05, "loss": 1.1151, "step": 92000 }, { "epoch": 1.4292586787504462, "grad_norm": 2.260166645050049, "learning_rate": 4.857075683980198e-05, "loss": 1.0907, "step": 92100 }, { "epoch": 1.4308105339933892, "grad_norm": 2.205695390701294, "learning_rate": 4.856920498455904e-05, "loss": 1.1179, "step": 92200 }, { "epoch": 1.432362389236332, "grad_norm": 2.001887559890747, "learning_rate": 4.85676531293161e-05, "loss": 1.0972, "step": 92300 }, { "epoch": 1.433914244479275, "grad_norm": 2.280686140060425, "learning_rate": 4.8566101274073155e-05, "loss": 1.1089, "step": 92400 }, { "epoch": 1.435466099722218, "grad_norm": 1.7954293489456177, "learning_rate": 4.856454941883021e-05, "loss": 1.0893, "step": 92500 }, { "epoch": 1.4370179549651607, "grad_norm": 2.0910868644714355, "learning_rate": 4.856299756358727e-05, "loss": 1.1146, "step": 92600 }, { "epoch": 1.4385698102081037, "grad_norm": 2.3523671627044678, "learning_rate": 4.856144570834433e-05, "loss": 1.105, "step": 92700 }, { "epoch": 1.4401216654510467, "grad_norm": 2.5177364349365234, "learning_rate": 4.8559893853101386e-05, "loss": 1.1026, "step": 92800 }, { "epoch": 1.4416735206939897, "grad_norm": 2.642850399017334, "learning_rate": 4.8558341997858444e-05, "loss": 1.0924, "step": 92900 }, { "epoch": 1.4432253759369327, "grad_norm": 2.4015111923217773, "learning_rate": 4.85567901426155e-05, "loss": 1.0824, "step": 93000 }, { "epoch": 1.4447772311798754, "grad_norm": 2.785917282104492, "learning_rate": 4.855523828737256e-05, "loss": 1.087, "step": 93100 }, { "epoch": 1.4463290864228184, "grad_norm": 2.587714672088623, "learning_rate": 4.855368643212962e-05, "loss": 1.1315, "step": 93200 }, { "epoch": 1.4478809416657614, "grad_norm": 2.5632598400115967, "learning_rate": 4.855213457688667e-05, "loss": 1.1101, "step": 93300 }, { "epoch": 1.4494327969087044, "grad_norm": 1.9527606964111328, "learning_rate": 4.8550582721643726e-05, "loss": 1.0916, "step": 93400 }, { "epoch": 1.4509846521516474, "grad_norm": 2.3354501724243164, "learning_rate": 4.8549030866400783e-05, "loss": 1.0842, "step": 93500 }, { "epoch": 1.4525365073945902, "grad_norm": 2.434691905975342, "learning_rate": 4.854747901115784e-05, "loss": 1.1153, "step": 93600 }, { "epoch": 1.4540883626375332, "grad_norm": 2.458353042602539, "learning_rate": 4.85459271559149e-05, "loss": 1.106, "step": 93700 }, { "epoch": 1.4556402178804762, "grad_norm": 2.31325364112854, "learning_rate": 4.854437530067196e-05, "loss": 1.1061, "step": 93800 }, { "epoch": 1.457192073123419, "grad_norm": 2.5093953609466553, "learning_rate": 4.8542823445429014e-05, "loss": 1.0818, "step": 93900 }, { "epoch": 1.458743928366362, "grad_norm": 2.224592447280884, "learning_rate": 4.854127159018607e-05, "loss": 1.1263, "step": 94000 }, { "epoch": 1.460295783609305, "grad_norm": 2.1180953979492188, "learning_rate": 4.853971973494313e-05, "loss": 1.0958, "step": 94100 }, { "epoch": 1.4618476388522479, "grad_norm": 2.6123318672180176, "learning_rate": 4.853816787970019e-05, "loss": 1.1093, "step": 94200 }, { "epoch": 1.4633994940951909, "grad_norm": 2.0120418071746826, "learning_rate": 4.8536616024457245e-05, "loss": 1.0914, "step": 94300 }, { "epoch": 1.4649513493381336, "grad_norm": 2.1120519638061523, "learning_rate": 4.85350641692143e-05, "loss": 1.1289, "step": 94400 }, { "epoch": 1.4665032045810766, "grad_norm": 2.2654669284820557, "learning_rate": 4.853351231397136e-05, "loss": 1.0939, "step": 94500 }, { "epoch": 1.4680550598240196, "grad_norm": 2.5117177963256836, "learning_rate": 4.853196045872841e-05, "loss": 1.0839, "step": 94600 }, { "epoch": 1.4696069150669626, "grad_norm": 2.5389342308044434, "learning_rate": 4.853040860348547e-05, "loss": 1.0958, "step": 94700 }, { "epoch": 1.4711587703099056, "grad_norm": 2.5889949798583984, "learning_rate": 4.852885674824252e-05, "loss": 1.107, "step": 94800 }, { "epoch": 1.4727106255528484, "grad_norm": 2.972501516342163, "learning_rate": 4.852730489299958e-05, "loss": 1.107, "step": 94900 }, { "epoch": 1.4742624807957914, "grad_norm": 2.1755499839782715, "learning_rate": 4.8525753037756636e-05, "loss": 1.0804, "step": 95000 }, { "epoch": 1.4758143360387344, "grad_norm": 2.343222141265869, "learning_rate": 4.8524201182513694e-05, "loss": 1.1132, "step": 95100 }, { "epoch": 1.4773661912816771, "grad_norm": 2.0029778480529785, "learning_rate": 4.852264932727075e-05, "loss": 1.0648, "step": 95200 }, { "epoch": 1.4789180465246201, "grad_norm": 2.5098717212677, "learning_rate": 4.852109747202781e-05, "loss": 1.0883, "step": 95300 }, { "epoch": 1.480469901767563, "grad_norm": 2.020259380340576, "learning_rate": 4.851954561678487e-05, "loss": 1.0841, "step": 95400 }, { "epoch": 1.482021757010506, "grad_norm": 2.137216091156006, "learning_rate": 4.8517993761541925e-05, "loss": 1.0916, "step": 95500 }, { "epoch": 1.483573612253449, "grad_norm": 2.2599966526031494, "learning_rate": 4.851644190629898e-05, "loss": 1.1158, "step": 95600 }, { "epoch": 1.4851254674963918, "grad_norm": 2.5027835369110107, "learning_rate": 4.851489005105604e-05, "loss": 1.1076, "step": 95700 }, { "epoch": 1.4866773227393348, "grad_norm": 2.1276371479034424, "learning_rate": 4.85133381958131e-05, "loss": 1.0956, "step": 95800 }, { "epoch": 1.4882291779822778, "grad_norm": 2.3590199947357178, "learning_rate": 4.8511786340570156e-05, "loss": 1.0832, "step": 95900 }, { "epoch": 1.4897810332252208, "grad_norm": 2.2994654178619385, "learning_rate": 4.8510234485327214e-05, "loss": 1.096, "step": 96000 }, { "epoch": 1.4913328884681638, "grad_norm": 2.2821874618530273, "learning_rate": 4.8508682630084265e-05, "loss": 1.1047, "step": 96100 }, { "epoch": 1.4928847437111066, "grad_norm": 2.377063274383545, "learning_rate": 4.850713077484132e-05, "loss": 1.095, "step": 96200 }, { "epoch": 1.4944365989540496, "grad_norm": 2.467970848083496, "learning_rate": 4.850557891959838e-05, "loss": 1.0818, "step": 96300 }, { "epoch": 1.4959884541969926, "grad_norm": 2.613098382949829, "learning_rate": 4.850402706435544e-05, "loss": 1.1014, "step": 96400 }, { "epoch": 1.4975403094399353, "grad_norm": 2.434502601623535, "learning_rate": 4.8502475209112496e-05, "loss": 1.1125, "step": 96500 }, { "epoch": 1.4990921646828783, "grad_norm": 2.31904935836792, "learning_rate": 4.8500923353869553e-05, "loss": 1.1075, "step": 96600 }, { "epoch": 1.5006440199258213, "grad_norm": 2.3460516929626465, "learning_rate": 4.849937149862661e-05, "loss": 1.0989, "step": 96700 }, { "epoch": 1.5021958751687643, "grad_norm": 2.311793804168701, "learning_rate": 4.849781964338367e-05, "loss": 1.0994, "step": 96800 }, { "epoch": 1.5037477304117073, "grad_norm": 3.030339479446411, "learning_rate": 4.849626778814073e-05, "loss": 1.0758, "step": 96900 }, { "epoch": 1.50529958565465, "grad_norm": 2.2871413230895996, "learning_rate": 4.8494715932897784e-05, "loss": 1.1092, "step": 97000 }, { "epoch": 1.506851440897593, "grad_norm": 2.2784247398376465, "learning_rate": 4.849316407765484e-05, "loss": 1.0695, "step": 97100 }, { "epoch": 1.508403296140536, "grad_norm": 2.4598681926727295, "learning_rate": 4.84916122224119e-05, "loss": 1.1045, "step": 97200 }, { "epoch": 1.5099551513834788, "grad_norm": 2.7960808277130127, "learning_rate": 4.849006036716896e-05, "loss": 1.1138, "step": 97300 }, { "epoch": 1.511507006626422, "grad_norm": 2.394219398498535, "learning_rate": 4.848850851192601e-05, "loss": 1.1138, "step": 97400 }, { "epoch": 1.5130588618693648, "grad_norm": 2.312546968460083, "learning_rate": 4.8486956656683066e-05, "loss": 1.089, "step": 97500 }, { "epoch": 1.5146107171123078, "grad_norm": 2.181861162185669, "learning_rate": 4.8485404801440124e-05, "loss": 1.1244, "step": 97600 }, { "epoch": 1.5161625723552508, "grad_norm": 2.3561577796936035, "learning_rate": 4.848385294619718e-05, "loss": 1.0938, "step": 97700 }, { "epoch": 1.5177144275981935, "grad_norm": 1.8671692609786987, "learning_rate": 4.848230109095424e-05, "loss": 1.08, "step": 97800 }, { "epoch": 1.5192662828411367, "grad_norm": 2.1992411613464355, "learning_rate": 4.848074923571129e-05, "loss": 1.1053, "step": 97900 }, { "epoch": 1.5208181380840795, "grad_norm": 2.339897632598877, "learning_rate": 4.847919738046835e-05, "loss": 1.0964, "step": 98000 }, { "epoch": 1.5223699933270225, "grad_norm": 2.0564520359039307, "learning_rate": 4.8477645525225406e-05, "loss": 1.092, "step": 98100 }, { "epoch": 1.5239218485699655, "grad_norm": 2.5134778022766113, "learning_rate": 4.8476093669982464e-05, "loss": 1.1088, "step": 98200 }, { "epoch": 1.5254737038129083, "grad_norm": 2.253459930419922, "learning_rate": 4.847454181473952e-05, "loss": 1.0826, "step": 98300 }, { "epoch": 1.5270255590558512, "grad_norm": 2.5602986812591553, "learning_rate": 4.847298995949658e-05, "loss": 1.0843, "step": 98400 }, { "epoch": 1.5285774142987942, "grad_norm": 2.2525172233581543, "learning_rate": 4.847143810425364e-05, "loss": 1.079, "step": 98500 }, { "epoch": 1.530129269541737, "grad_norm": 2.373265266418457, "learning_rate": 4.8469886249010695e-05, "loss": 1.1158, "step": 98600 }, { "epoch": 1.5316811247846802, "grad_norm": 2.2877092361450195, "learning_rate": 4.846833439376775e-05, "loss": 1.1057, "step": 98700 }, { "epoch": 1.533232980027623, "grad_norm": 2.0692262649536133, "learning_rate": 4.846678253852481e-05, "loss": 1.0738, "step": 98800 }, { "epoch": 1.534784835270566, "grad_norm": 2.559347629547119, "learning_rate": 4.846523068328186e-05, "loss": 1.0809, "step": 98900 }, { "epoch": 1.536336690513509, "grad_norm": 2.3724586963653564, "learning_rate": 4.846367882803892e-05, "loss": 1.0862, "step": 99000 }, { "epoch": 1.5378885457564517, "grad_norm": 2.499943256378174, "learning_rate": 4.846212697279598e-05, "loss": 1.0969, "step": 99100 }, { "epoch": 1.539440400999395, "grad_norm": 2.0188519954681396, "learning_rate": 4.8460575117553035e-05, "loss": 1.0934, "step": 99200 }, { "epoch": 1.5409922562423377, "grad_norm": 2.1920857429504395, "learning_rate": 4.845902326231009e-05, "loss": 1.0842, "step": 99300 }, { "epoch": 1.5425441114852807, "grad_norm": 2.47813081741333, "learning_rate": 4.845747140706715e-05, "loss": 1.0966, "step": 99400 }, { "epoch": 1.5440959667282237, "grad_norm": 2.5390424728393555, "learning_rate": 4.845591955182421e-05, "loss": 1.0873, "step": 99500 }, { "epoch": 1.5456478219711665, "grad_norm": 2.214303731918335, "learning_rate": 4.8454367696581266e-05, "loss": 1.0945, "step": 99600 }, { "epoch": 1.5471996772141094, "grad_norm": 2.6074516773223877, "learning_rate": 4.8452815841338323e-05, "loss": 1.067, "step": 99700 }, { "epoch": 1.5487515324570524, "grad_norm": 2.670158863067627, "learning_rate": 4.845126398609538e-05, "loss": 1.074, "step": 99800 }, { "epoch": 1.5503033876999952, "grad_norm": 2.4149551391601562, "learning_rate": 4.844971213085244e-05, "loss": 1.103, "step": 99900 }, { "epoch": 1.5518552429429384, "grad_norm": 2.4299428462982178, "learning_rate": 4.84481602756095e-05, "loss": 1.0943, "step": 100000 }, { "epoch": 1.5534070981858812, "grad_norm": 2.447786331176758, "learning_rate": 4.8446608420366554e-05, "loss": 1.0866, "step": 100100 }, { "epoch": 1.5549589534288242, "grad_norm": 2.346238374710083, "learning_rate": 4.8445056565123605e-05, "loss": 1.1272, "step": 100200 }, { "epoch": 1.5565108086717672, "grad_norm": 2.3809621334075928, "learning_rate": 4.844350470988066e-05, "loss": 1.1039, "step": 100300 }, { "epoch": 1.55806266391471, "grad_norm": 2.2030515670776367, "learning_rate": 4.844195285463772e-05, "loss": 1.0865, "step": 100400 }, { "epoch": 1.559614519157653, "grad_norm": 2.4423165321350098, "learning_rate": 4.844040099939478e-05, "loss": 1.0883, "step": 100500 }, { "epoch": 1.561166374400596, "grad_norm": 1.9698563814163208, "learning_rate": 4.8438849144151836e-05, "loss": 1.0872, "step": 100600 }, { "epoch": 1.562718229643539, "grad_norm": 2.018266201019287, "learning_rate": 4.8437297288908894e-05, "loss": 1.0915, "step": 100700 }, { "epoch": 1.564270084886482, "grad_norm": 2.2007317543029785, "learning_rate": 4.843574543366595e-05, "loss": 1.0931, "step": 100800 }, { "epoch": 1.5658219401294247, "grad_norm": 2.259827136993408, "learning_rate": 4.843419357842301e-05, "loss": 1.1038, "step": 100900 }, { "epoch": 1.5673737953723677, "grad_norm": 2.568490982055664, "learning_rate": 4.843264172318007e-05, "loss": 1.0814, "step": 101000 }, { "epoch": 1.5689256506153106, "grad_norm": 2.2101008892059326, "learning_rate": 4.8431089867937125e-05, "loss": 1.102, "step": 101100 }, { "epoch": 1.5704775058582534, "grad_norm": 2.0225656032562256, "learning_rate": 4.8429538012694176e-05, "loss": 1.1161, "step": 101200 }, { "epoch": 1.5720293611011966, "grad_norm": 2.544382333755493, "learning_rate": 4.8427986157451234e-05, "loss": 1.0915, "step": 101300 }, { "epoch": 1.5735812163441394, "grad_norm": 2.1352312564849854, "learning_rate": 4.842643430220829e-05, "loss": 1.0963, "step": 101400 }, { "epoch": 1.5751330715870824, "grad_norm": 2.783745527267456, "learning_rate": 4.842488244696535e-05, "loss": 1.0833, "step": 101500 }, { "epoch": 1.5766849268300254, "grad_norm": 2.208181858062744, "learning_rate": 4.842333059172241e-05, "loss": 1.1058, "step": 101600 }, { "epoch": 1.5782367820729681, "grad_norm": 2.9489893913269043, "learning_rate": 4.8421778736479465e-05, "loss": 1.0939, "step": 101700 }, { "epoch": 1.5797886373159111, "grad_norm": 2.812354564666748, "learning_rate": 4.8420226881236516e-05, "loss": 1.0871, "step": 101800 }, { "epoch": 1.5813404925588541, "grad_norm": 2.402590274810791, "learning_rate": 4.8418675025993574e-05, "loss": 1.0897, "step": 101900 }, { "epoch": 1.5828923478017969, "grad_norm": 2.327899694442749, "learning_rate": 4.841712317075063e-05, "loss": 1.0848, "step": 102000 }, { "epoch": 1.58444420304474, "grad_norm": 2.308974504470825, "learning_rate": 4.841557131550769e-05, "loss": 1.0889, "step": 102100 }, { "epoch": 1.5859960582876829, "grad_norm": 1.8988031148910522, "learning_rate": 4.841401946026475e-05, "loss": 1.0916, "step": 102200 }, { "epoch": 1.5875479135306259, "grad_norm": 2.443079710006714, "learning_rate": 4.8412467605021805e-05, "loss": 1.0783, "step": 102300 }, { "epoch": 1.5890997687735688, "grad_norm": 2.450003147125244, "learning_rate": 4.841091574977886e-05, "loss": 1.0788, "step": 102400 }, { "epoch": 1.5906516240165116, "grad_norm": 2.2849245071411133, "learning_rate": 4.840936389453592e-05, "loss": 1.0927, "step": 102500 }, { "epoch": 1.5922034792594548, "grad_norm": 2.752744436264038, "learning_rate": 4.840781203929298e-05, "loss": 1.086, "step": 102600 }, { "epoch": 1.5937553345023976, "grad_norm": 2.229478597640991, "learning_rate": 4.8406260184050036e-05, "loss": 1.0857, "step": 102700 }, { "epoch": 1.5953071897453406, "grad_norm": 1.991997480392456, "learning_rate": 4.8404708328807093e-05, "loss": 1.0987, "step": 102800 }, { "epoch": 1.5968590449882836, "grad_norm": 2.213447093963623, "learning_rate": 4.840315647356415e-05, "loss": 1.0723, "step": 102900 }, { "epoch": 1.5984109002312263, "grad_norm": 2.0651183128356934, "learning_rate": 4.840160461832121e-05, "loss": 1.0883, "step": 103000 }, { "epoch": 1.5999627554741693, "grad_norm": 2.27315092086792, "learning_rate": 4.840005276307826e-05, "loss": 1.0964, "step": 103100 }, { "epoch": 1.6015146107171123, "grad_norm": 2.5370280742645264, "learning_rate": 4.839850090783532e-05, "loss": 1.0791, "step": 103200 }, { "epoch": 1.603066465960055, "grad_norm": 2.3335041999816895, "learning_rate": 4.8396949052592375e-05, "loss": 1.1005, "step": 103300 }, { "epoch": 1.6046183212029983, "grad_norm": 2.0514893531799316, "learning_rate": 4.839539719734943e-05, "loss": 1.1146, "step": 103400 }, { "epoch": 1.606170176445941, "grad_norm": 2.582895040512085, "learning_rate": 4.839384534210649e-05, "loss": 1.091, "step": 103500 }, { "epoch": 1.607722031688884, "grad_norm": 2.004833936691284, "learning_rate": 4.839229348686355e-05, "loss": 1.0627, "step": 103600 }, { "epoch": 1.609273886931827, "grad_norm": 2.5738303661346436, "learning_rate": 4.8390741631620606e-05, "loss": 1.0775, "step": 103700 }, { "epoch": 1.6108257421747698, "grad_norm": 2.5076212882995605, "learning_rate": 4.8389189776377664e-05, "loss": 1.1139, "step": 103800 }, { "epoch": 1.612377597417713, "grad_norm": 2.3125298023223877, "learning_rate": 4.838763792113472e-05, "loss": 1.072, "step": 103900 }, { "epoch": 1.6139294526606558, "grad_norm": 2.3345038890838623, "learning_rate": 4.838608606589178e-05, "loss": 1.0998, "step": 104000 }, { "epoch": 1.6154813079035988, "grad_norm": 2.6464548110961914, "learning_rate": 4.838453421064884e-05, "loss": 1.0903, "step": 104100 }, { "epoch": 1.6170331631465418, "grad_norm": 2.383648633956909, "learning_rate": 4.8382982355405895e-05, "loss": 1.0996, "step": 104200 }, { "epoch": 1.6185850183894845, "grad_norm": 2.0716552734375, "learning_rate": 4.838143050016295e-05, "loss": 1.0979, "step": 104300 }, { "epoch": 1.6201368736324275, "grad_norm": 2.2472150325775146, "learning_rate": 4.8379878644920004e-05, "loss": 1.0753, "step": 104400 }, { "epoch": 1.6216887288753705, "grad_norm": 2.413191318511963, "learning_rate": 4.837832678967706e-05, "loss": 1.0789, "step": 104500 }, { "epoch": 1.6232405841183133, "grad_norm": 2.343839168548584, "learning_rate": 4.837677493443411e-05, "loss": 1.1059, "step": 104600 }, { "epoch": 1.6247924393612565, "grad_norm": 2.3699560165405273, "learning_rate": 4.837522307919117e-05, "loss": 1.102, "step": 104700 }, { "epoch": 1.6263442946041993, "grad_norm": 2.4309887886047363, "learning_rate": 4.837367122394823e-05, "loss": 1.0849, "step": 104800 }, { "epoch": 1.6278961498471423, "grad_norm": 2.464973211288452, "learning_rate": 4.8372119368705286e-05, "loss": 1.1039, "step": 104900 }, { "epoch": 1.6294480050900852, "grad_norm": 2.468033790588379, "learning_rate": 4.8370567513462344e-05, "loss": 1.0626, "step": 105000 }, { "epoch": 1.630999860333028, "grad_norm": 2.6809568405151367, "learning_rate": 4.83690156582194e-05, "loss": 1.0831, "step": 105100 }, { "epoch": 1.6325517155759712, "grad_norm": 2.0503652095794678, "learning_rate": 4.836746380297646e-05, "loss": 1.1011, "step": 105200 }, { "epoch": 1.634103570818914, "grad_norm": 2.310662031173706, "learning_rate": 4.836591194773352e-05, "loss": 1.0995, "step": 105300 }, { "epoch": 1.635655426061857, "grad_norm": 2.173369884490967, "learning_rate": 4.8364360092490575e-05, "loss": 1.0826, "step": 105400 }, { "epoch": 1.6372072813048, "grad_norm": 1.8807876110076904, "learning_rate": 4.836280823724763e-05, "loss": 1.0779, "step": 105500 }, { "epoch": 1.6387591365477427, "grad_norm": 2.493671417236328, "learning_rate": 4.836125638200469e-05, "loss": 1.1023, "step": 105600 }, { "epoch": 1.6403109917906857, "grad_norm": 2.055972099304199, "learning_rate": 4.835970452676175e-05, "loss": 1.0805, "step": 105700 }, { "epoch": 1.6418628470336287, "grad_norm": 2.361999750137329, "learning_rate": 4.8358152671518806e-05, "loss": 1.0922, "step": 105800 }, { "epoch": 1.6434147022765715, "grad_norm": 2.321727991104126, "learning_rate": 4.835660081627586e-05, "loss": 1.0765, "step": 105900 }, { "epoch": 1.6449665575195147, "grad_norm": 2.269177198410034, "learning_rate": 4.8355048961032914e-05, "loss": 1.0961, "step": 106000 }, { "epoch": 1.6465184127624575, "grad_norm": 2.213909387588501, "learning_rate": 4.835349710578997e-05, "loss": 1.0659, "step": 106100 }, { "epoch": 1.6480702680054005, "grad_norm": 2.539454221725464, "learning_rate": 4.835194525054703e-05, "loss": 1.0631, "step": 106200 }, { "epoch": 1.6496221232483435, "grad_norm": 2.369426727294922, "learning_rate": 4.835039339530409e-05, "loss": 1.1118, "step": 106300 }, { "epoch": 1.6511739784912862, "grad_norm": 2.130342483520508, "learning_rate": 4.8348841540061145e-05, "loss": 1.0777, "step": 106400 }, { "epoch": 1.6527258337342294, "grad_norm": 2.4684133529663086, "learning_rate": 4.83472896848182e-05, "loss": 1.0849, "step": 106500 }, { "epoch": 1.6542776889771722, "grad_norm": 2.468327045440674, "learning_rate": 4.834573782957526e-05, "loss": 1.0841, "step": 106600 }, { "epoch": 1.6558295442201152, "grad_norm": 2.0428032875061035, "learning_rate": 4.834418597433232e-05, "loss": 1.0714, "step": 106700 }, { "epoch": 1.6573813994630582, "grad_norm": 2.2243077754974365, "learning_rate": 4.8342634119089376e-05, "loss": 1.1006, "step": 106800 }, { "epoch": 1.658933254706001, "grad_norm": 2.0093777179718018, "learning_rate": 4.8341082263846434e-05, "loss": 1.0677, "step": 106900 }, { "epoch": 1.660485109948944, "grad_norm": 2.3485684394836426, "learning_rate": 4.833953040860349e-05, "loss": 1.095, "step": 107000 }, { "epoch": 1.662036965191887, "grad_norm": 2.494544744491577, "learning_rate": 4.833797855336055e-05, "loss": 1.1014, "step": 107100 }, { "epoch": 1.6635888204348297, "grad_norm": 2.3687689304351807, "learning_rate": 4.83364266981176e-05, "loss": 1.0776, "step": 107200 }, { "epoch": 1.665140675677773, "grad_norm": 2.286187171936035, "learning_rate": 4.833487484287466e-05, "loss": 1.1119, "step": 107300 }, { "epoch": 1.6666925309207157, "grad_norm": 2.225325584411621, "learning_rate": 4.8333322987631716e-05, "loss": 1.0674, "step": 107400 }, { "epoch": 1.6682443861636587, "grad_norm": 2.416855573654175, "learning_rate": 4.8331771132388774e-05, "loss": 1.0843, "step": 107500 }, { "epoch": 1.6697962414066017, "grad_norm": 2.224277973175049, "learning_rate": 4.833021927714583e-05, "loss": 1.0832, "step": 107600 }, { "epoch": 1.6713480966495444, "grad_norm": 2.3071978092193604, "learning_rate": 4.832866742190288e-05, "loss": 1.1245, "step": 107700 }, { "epoch": 1.6728999518924874, "grad_norm": 2.6478264331817627, "learning_rate": 4.832711556665994e-05, "loss": 1.086, "step": 107800 }, { "epoch": 1.6744518071354304, "grad_norm": 2.273693799972534, "learning_rate": 4.8325563711417e-05, "loss": 1.1043, "step": 107900 }, { "epoch": 1.6760036623783734, "grad_norm": 2.275265693664551, "learning_rate": 4.8324011856174056e-05, "loss": 1.1134, "step": 108000 }, { "epoch": 1.6775555176213164, "grad_norm": 2.539862871170044, "learning_rate": 4.8322460000931114e-05, "loss": 1.0871, "step": 108100 }, { "epoch": 1.6791073728642592, "grad_norm": 2.447202205657959, "learning_rate": 4.832090814568817e-05, "loss": 1.1026, "step": 108200 }, { "epoch": 1.6806592281072021, "grad_norm": 2.1919965744018555, "learning_rate": 4.831935629044523e-05, "loss": 1.074, "step": 108300 }, { "epoch": 1.6822110833501451, "grad_norm": 2.276486396789551, "learning_rate": 4.831780443520229e-05, "loss": 1.0973, "step": 108400 }, { "epoch": 1.683762938593088, "grad_norm": 2.577850341796875, "learning_rate": 4.8316252579959345e-05, "loss": 1.0961, "step": 108500 }, { "epoch": 1.685314793836031, "grad_norm": 2.4232072830200195, "learning_rate": 4.83147007247164e-05, "loss": 1.0824, "step": 108600 }, { "epoch": 1.6868666490789739, "grad_norm": 2.321362257003784, "learning_rate": 4.831314886947346e-05, "loss": 1.0833, "step": 108700 }, { "epoch": 1.6884185043219169, "grad_norm": 2.5468544960021973, "learning_rate": 4.831159701423051e-05, "loss": 1.0847, "step": 108800 }, { "epoch": 1.6899703595648599, "grad_norm": 2.3139944076538086, "learning_rate": 4.831004515898757e-05, "loss": 1.0887, "step": 108900 }, { "epoch": 1.6915222148078026, "grad_norm": 2.2984142303466797, "learning_rate": 4.830849330374463e-05, "loss": 1.0979, "step": 109000 }, { "epoch": 1.6930740700507456, "grad_norm": 2.5074856281280518, "learning_rate": 4.8306941448501684e-05, "loss": 1.074, "step": 109100 }, { "epoch": 1.6946259252936886, "grad_norm": 2.1612424850463867, "learning_rate": 4.830538959325874e-05, "loss": 1.0839, "step": 109200 }, { "epoch": 1.6961777805366316, "grad_norm": 2.457338809967041, "learning_rate": 4.83038377380158e-05, "loss": 1.0883, "step": 109300 }, { "epoch": 1.6977296357795746, "grad_norm": 2.4035794734954834, "learning_rate": 4.830228588277286e-05, "loss": 1.0619, "step": 109400 }, { "epoch": 1.6992814910225174, "grad_norm": 2.3478317260742188, "learning_rate": 4.8300734027529915e-05, "loss": 1.0911, "step": 109500 }, { "epoch": 1.7008333462654603, "grad_norm": 2.210148572921753, "learning_rate": 4.829918217228697e-05, "loss": 1.086, "step": 109600 }, { "epoch": 1.7023852015084033, "grad_norm": 2.5271687507629395, "learning_rate": 4.829763031704403e-05, "loss": 1.0951, "step": 109700 }, { "epoch": 1.703937056751346, "grad_norm": 2.0551509857177734, "learning_rate": 4.829607846180109e-05, "loss": 1.1064, "step": 109800 }, { "epoch": 1.7054889119942893, "grad_norm": 2.443232297897339, "learning_rate": 4.8294526606558146e-05, "loss": 1.0767, "step": 109900 }, { "epoch": 1.707040767237232, "grad_norm": 2.0984151363372803, "learning_rate": 4.8292974751315204e-05, "loss": 1.0801, "step": 110000 }, { "epoch": 1.708592622480175, "grad_norm": 2.4197680950164795, "learning_rate": 4.8291422896072255e-05, "loss": 1.105, "step": 110100 }, { "epoch": 1.710144477723118, "grad_norm": 2.436518430709839, "learning_rate": 4.828987104082931e-05, "loss": 1.0684, "step": 110200 }, { "epoch": 1.7116963329660608, "grad_norm": 2.4050610065460205, "learning_rate": 4.828831918558637e-05, "loss": 1.0832, "step": 110300 }, { "epoch": 1.7132481882090038, "grad_norm": 2.0894031524658203, "learning_rate": 4.828676733034343e-05, "loss": 1.0764, "step": 110400 }, { "epoch": 1.7148000434519468, "grad_norm": 2.1264595985412598, "learning_rate": 4.8285215475100486e-05, "loss": 1.0866, "step": 110500 }, { "epoch": 1.7163518986948896, "grad_norm": 2.6174960136413574, "learning_rate": 4.8283663619857544e-05, "loss": 1.0887, "step": 110600 }, { "epoch": 1.7179037539378328, "grad_norm": 2.2942001819610596, "learning_rate": 4.82821117646146e-05, "loss": 1.0831, "step": 110700 }, { "epoch": 1.7194556091807756, "grad_norm": 1.934415340423584, "learning_rate": 4.828055990937166e-05, "loss": 1.0837, "step": 110800 }, { "epoch": 1.7210074644237185, "grad_norm": 2.1386795043945312, "learning_rate": 4.827900805412871e-05, "loss": 1.0718, "step": 110900 }, { "epoch": 1.7225593196666615, "grad_norm": 2.057543992996216, "learning_rate": 4.827745619888577e-05, "loss": 1.0997, "step": 111000 }, { "epoch": 1.7241111749096043, "grad_norm": 2.408341407775879, "learning_rate": 4.8275904343642826e-05, "loss": 1.1036, "step": 111100 }, { "epoch": 1.7256630301525475, "grad_norm": 2.4271554946899414, "learning_rate": 4.8274352488399884e-05, "loss": 1.0797, "step": 111200 }, { "epoch": 1.7272148853954903, "grad_norm": 2.4379501342773438, "learning_rate": 4.827280063315694e-05, "loss": 1.0766, "step": 111300 }, { "epoch": 1.7287667406384333, "grad_norm": 2.436361312866211, "learning_rate": 4.8271248777914e-05, "loss": 1.0903, "step": 111400 }, { "epoch": 1.7303185958813763, "grad_norm": 2.2575366497039795, "learning_rate": 4.826969692267106e-05, "loss": 1.0968, "step": 111500 }, { "epoch": 1.731870451124319, "grad_norm": 2.1207995414733887, "learning_rate": 4.826814506742811e-05, "loss": 1.085, "step": 111600 }, { "epoch": 1.733422306367262, "grad_norm": 2.482725143432617, "learning_rate": 4.8266593212185166e-05, "loss": 1.0599, "step": 111700 }, { "epoch": 1.734974161610205, "grad_norm": 2.253748655319214, "learning_rate": 4.8265041356942223e-05, "loss": 1.0496, "step": 111800 }, { "epoch": 1.7365260168531478, "grad_norm": 2.0366814136505127, "learning_rate": 4.826348950169928e-05, "loss": 1.0767, "step": 111900 }, { "epoch": 1.738077872096091, "grad_norm": 2.260399341583252, "learning_rate": 4.826193764645634e-05, "loss": 1.0844, "step": 112000 }, { "epoch": 1.7396297273390338, "grad_norm": 2.378796339035034, "learning_rate": 4.82603857912134e-05, "loss": 1.0899, "step": 112100 }, { "epoch": 1.7411815825819768, "grad_norm": 2.3033385276794434, "learning_rate": 4.8258833935970454e-05, "loss": 1.0573, "step": 112200 }, { "epoch": 1.7427334378249197, "grad_norm": 2.3621747493743896, "learning_rate": 4.825728208072751e-05, "loss": 1.0694, "step": 112300 }, { "epoch": 1.7442852930678625, "grad_norm": 2.5841023921966553, "learning_rate": 4.825573022548457e-05, "loss": 1.0558, "step": 112400 }, { "epoch": 1.7458371483108057, "grad_norm": 2.4916257858276367, "learning_rate": 4.825417837024163e-05, "loss": 1.1038, "step": 112500 }, { "epoch": 1.7473890035537485, "grad_norm": 1.9130871295928955, "learning_rate": 4.8252626514998685e-05, "loss": 1.0835, "step": 112600 }, { "epoch": 1.7489408587966915, "grad_norm": 2.4349403381347656, "learning_rate": 4.825107465975574e-05, "loss": 1.0776, "step": 112700 }, { "epoch": 1.7504927140396345, "grad_norm": 2.358518123626709, "learning_rate": 4.82495228045128e-05, "loss": 1.0856, "step": 112800 }, { "epoch": 1.7520445692825772, "grad_norm": 1.981557011604309, "learning_rate": 4.824797094926985e-05, "loss": 1.0957, "step": 112900 }, { "epoch": 1.7535964245255202, "grad_norm": 2.483365297317505, "learning_rate": 4.824641909402691e-05, "loss": 1.0798, "step": 113000 }, { "epoch": 1.7551482797684632, "grad_norm": 2.0990121364593506, "learning_rate": 4.824486723878397e-05, "loss": 1.0944, "step": 113100 }, { "epoch": 1.756700135011406, "grad_norm": 2.4795525074005127, "learning_rate": 4.8243315383541025e-05, "loss": 1.0656, "step": 113200 }, { "epoch": 1.7582519902543492, "grad_norm": 2.1153392791748047, "learning_rate": 4.824176352829808e-05, "loss": 1.0887, "step": 113300 }, { "epoch": 1.759803845497292, "grad_norm": 1.9288995265960693, "learning_rate": 4.824021167305514e-05, "loss": 1.0884, "step": 113400 }, { "epoch": 1.761355700740235, "grad_norm": 2.1801743507385254, "learning_rate": 4.82386598178122e-05, "loss": 1.072, "step": 113500 }, { "epoch": 1.762907555983178, "grad_norm": 2.234731674194336, "learning_rate": 4.8237107962569256e-05, "loss": 1.0774, "step": 113600 }, { "epoch": 1.7644594112261207, "grad_norm": 2.6526927947998047, "learning_rate": 4.8235556107326314e-05, "loss": 1.0804, "step": 113700 }, { "epoch": 1.766011266469064, "grad_norm": 2.3809196949005127, "learning_rate": 4.823400425208337e-05, "loss": 1.1025, "step": 113800 }, { "epoch": 1.7675631217120067, "grad_norm": 2.5246968269348145, "learning_rate": 4.823245239684043e-05, "loss": 1.0804, "step": 113900 }, { "epoch": 1.7691149769549497, "grad_norm": 2.3987340927124023, "learning_rate": 4.823090054159749e-05, "loss": 1.0853, "step": 114000 }, { "epoch": 1.7706668321978927, "grad_norm": 2.483727216720581, "learning_rate": 4.8229348686354545e-05, "loss": 1.0808, "step": 114100 }, { "epoch": 1.7722186874408354, "grad_norm": 2.440681219100952, "learning_rate": 4.8227796831111596e-05, "loss": 1.0799, "step": 114200 }, { "epoch": 1.7737705426837784, "grad_norm": 2.3732333183288574, "learning_rate": 4.8226244975868654e-05, "loss": 1.0963, "step": 114300 }, { "epoch": 1.7753223979267214, "grad_norm": 2.1413209438323975, "learning_rate": 4.8224693120625705e-05, "loss": 1.0618, "step": 114400 }, { "epoch": 1.7768742531696642, "grad_norm": 2.341684103012085, "learning_rate": 4.822314126538276e-05, "loss": 1.0748, "step": 114500 }, { "epoch": 1.7784261084126074, "grad_norm": 2.107227325439453, "learning_rate": 4.822158941013982e-05, "loss": 1.0923, "step": 114600 }, { "epoch": 1.7799779636555502, "grad_norm": 2.0643272399902344, "learning_rate": 4.822003755489688e-05, "loss": 1.1075, "step": 114700 }, { "epoch": 1.7815298188984932, "grad_norm": 2.261986494064331, "learning_rate": 4.8218485699653936e-05, "loss": 1.0811, "step": 114800 }, { "epoch": 1.7830816741414361, "grad_norm": 2.0926873683929443, "learning_rate": 4.8216933844410993e-05, "loss": 1.0665, "step": 114900 }, { "epoch": 1.784633529384379, "grad_norm": 2.431126832962036, "learning_rate": 4.821538198916805e-05, "loss": 1.0766, "step": 115000 }, { "epoch": 1.7861853846273221, "grad_norm": 2.0504894256591797, "learning_rate": 4.821383013392511e-05, "loss": 1.1035, "step": 115100 }, { "epoch": 1.787737239870265, "grad_norm": 2.1101536750793457, "learning_rate": 4.821227827868217e-05, "loss": 1.0799, "step": 115200 }, { "epoch": 1.7892890951132079, "grad_norm": 2.618450880050659, "learning_rate": 4.8210726423439224e-05, "loss": 1.081, "step": 115300 }, { "epoch": 1.7908409503561509, "grad_norm": 2.42140793800354, "learning_rate": 4.820917456819628e-05, "loss": 1.0586, "step": 115400 }, { "epoch": 1.7923928055990936, "grad_norm": 2.140634059906006, "learning_rate": 4.820762271295334e-05, "loss": 1.0696, "step": 115500 }, { "epoch": 1.7939446608420366, "grad_norm": 2.527614116668701, "learning_rate": 4.82060708577104e-05, "loss": 1.0794, "step": 115600 }, { "epoch": 1.7954965160849796, "grad_norm": 2.8312926292419434, "learning_rate": 4.820451900246745e-05, "loss": 1.0781, "step": 115700 }, { "epoch": 1.7970483713279224, "grad_norm": 2.3439109325408936, "learning_rate": 4.8202967147224506e-05, "loss": 1.0636, "step": 115800 }, { "epoch": 1.7986002265708656, "grad_norm": 2.349926471710205, "learning_rate": 4.8201415291981564e-05, "loss": 1.0682, "step": 115900 }, { "epoch": 1.8001520818138084, "grad_norm": 2.3758840560913086, "learning_rate": 4.819986343673862e-05, "loss": 1.0961, "step": 116000 }, { "epoch": 1.8017039370567514, "grad_norm": 2.378147840499878, "learning_rate": 4.819831158149568e-05, "loss": 1.0963, "step": 116100 }, { "epoch": 1.8032557922996943, "grad_norm": 2.429334878921509, "learning_rate": 4.819675972625274e-05, "loss": 1.0804, "step": 116200 }, { "epoch": 1.8048076475426371, "grad_norm": 2.3186795711517334, "learning_rate": 4.8195207871009795e-05, "loss": 1.0972, "step": 116300 }, { "epoch": 1.80635950278558, "grad_norm": 2.449737071990967, "learning_rate": 4.819365601576685e-05, "loss": 1.072, "step": 116400 }, { "epoch": 1.807911358028523, "grad_norm": 2.0092339515686035, "learning_rate": 4.819210416052391e-05, "loss": 1.0762, "step": 116500 }, { "epoch": 1.809463213271466, "grad_norm": 2.553809642791748, "learning_rate": 4.819055230528097e-05, "loss": 1.0738, "step": 116600 }, { "epoch": 1.811015068514409, "grad_norm": 2.23128080368042, "learning_rate": 4.8189000450038026e-05, "loss": 1.0816, "step": 116700 }, { "epoch": 1.8125669237573518, "grad_norm": 2.235153913497925, "learning_rate": 4.8187448594795084e-05, "loss": 1.0561, "step": 116800 }, { "epoch": 1.8141187790002948, "grad_norm": 2.4133265018463135, "learning_rate": 4.818589673955214e-05, "loss": 1.0759, "step": 116900 }, { "epoch": 1.8156706342432378, "grad_norm": 2.5476932525634766, "learning_rate": 4.818434488430919e-05, "loss": 1.0983, "step": 117000 }, { "epoch": 1.8172224894861806, "grad_norm": 2.037759304046631, "learning_rate": 4.818279302906625e-05, "loss": 1.0796, "step": 117100 }, { "epoch": 1.8187743447291238, "grad_norm": 1.8041654825210571, "learning_rate": 4.818124117382331e-05, "loss": 1.0743, "step": 117200 }, { "epoch": 1.8203261999720666, "grad_norm": 2.1287336349487305, "learning_rate": 4.8179689318580366e-05, "loss": 1.0491, "step": 117300 }, { "epoch": 1.8218780552150096, "grad_norm": 2.779341220855713, "learning_rate": 4.8178137463337424e-05, "loss": 1.0649, "step": 117400 }, { "epoch": 1.8234299104579526, "grad_norm": 2.8096201419830322, "learning_rate": 4.8176585608094475e-05, "loss": 1.1033, "step": 117500 }, { "epoch": 1.8249817657008953, "grad_norm": 2.444505453109741, "learning_rate": 4.817503375285153e-05, "loss": 1.0688, "step": 117600 }, { "epoch": 1.8265336209438383, "grad_norm": 2.2384262084960938, "learning_rate": 4.817348189760859e-05, "loss": 1.0746, "step": 117700 }, { "epoch": 1.8280854761867813, "grad_norm": 2.4848833084106445, "learning_rate": 4.817193004236565e-05, "loss": 1.0785, "step": 117800 }, { "epoch": 1.8296373314297243, "grad_norm": 1.9756971597671509, "learning_rate": 4.8170378187122706e-05, "loss": 1.0819, "step": 117900 }, { "epoch": 1.8311891866726673, "grad_norm": 2.0762622356414795, "learning_rate": 4.8168826331879763e-05, "loss": 1.084, "step": 118000 }, { "epoch": 1.83274104191561, "grad_norm": 2.1194162368774414, "learning_rate": 4.816727447663682e-05, "loss": 1.0658, "step": 118100 }, { "epoch": 1.834292897158553, "grad_norm": 2.2249464988708496, "learning_rate": 4.816572262139388e-05, "loss": 1.0682, "step": 118200 }, { "epoch": 1.835844752401496, "grad_norm": 2.7198054790496826, "learning_rate": 4.816417076615094e-05, "loss": 1.0685, "step": 118300 }, { "epoch": 1.8373966076444388, "grad_norm": 2.395421266555786, "learning_rate": 4.8162618910907994e-05, "loss": 1.0476, "step": 118400 }, { "epoch": 1.838948462887382, "grad_norm": 2.5260422229766846, "learning_rate": 4.816106705566505e-05, "loss": 1.0814, "step": 118500 }, { "epoch": 1.8405003181303248, "grad_norm": 2.381462574005127, "learning_rate": 4.81595152004221e-05, "loss": 1.0948, "step": 118600 }, { "epoch": 1.8420521733732678, "grad_norm": 2.6957783699035645, "learning_rate": 4.815796334517916e-05, "loss": 1.0594, "step": 118700 }, { "epoch": 1.8436040286162108, "grad_norm": 2.6098296642303467, "learning_rate": 4.815641148993622e-05, "loss": 1.0691, "step": 118800 }, { "epoch": 1.8451558838591535, "grad_norm": 2.336134672164917, "learning_rate": 4.8154859634693276e-05, "loss": 1.0754, "step": 118900 }, { "epoch": 1.8467077391020965, "grad_norm": 2.165764093399048, "learning_rate": 4.8153307779450334e-05, "loss": 1.0785, "step": 119000 }, { "epoch": 1.8482595943450395, "grad_norm": 2.52502179145813, "learning_rate": 4.815175592420739e-05, "loss": 1.0789, "step": 119100 }, { "epoch": 1.8498114495879823, "grad_norm": 2.594719409942627, "learning_rate": 4.815020406896445e-05, "loss": 1.0751, "step": 119200 }, { "epoch": 1.8513633048309255, "grad_norm": 2.4303109645843506, "learning_rate": 4.814865221372151e-05, "loss": 1.075, "step": 119300 }, { "epoch": 1.8529151600738683, "grad_norm": 2.4006989002227783, "learning_rate": 4.8147100358478565e-05, "loss": 1.0862, "step": 119400 }, { "epoch": 1.8544670153168112, "grad_norm": 2.044224977493286, "learning_rate": 4.814554850323562e-05, "loss": 1.0807, "step": 119500 }, { "epoch": 1.8560188705597542, "grad_norm": 2.192030668258667, "learning_rate": 4.814399664799268e-05, "loss": 1.0972, "step": 119600 }, { "epoch": 1.857570725802697, "grad_norm": 2.3220582008361816, "learning_rate": 4.814244479274974e-05, "loss": 1.0908, "step": 119700 }, { "epoch": 1.8591225810456402, "grad_norm": 2.401507616043091, "learning_rate": 4.8140892937506796e-05, "loss": 1.091, "step": 119800 }, { "epoch": 1.860674436288583, "grad_norm": 2.05117130279541, "learning_rate": 4.813934108226385e-05, "loss": 1.081, "step": 119900 }, { "epoch": 1.862226291531526, "grad_norm": 2.2260279655456543, "learning_rate": 4.8137789227020905e-05, "loss": 1.0739, "step": 120000 }, { "epoch": 1.863778146774469, "grad_norm": 2.2794127464294434, "learning_rate": 4.813623737177796e-05, "loss": 1.0681, "step": 120100 }, { "epoch": 1.8653300020174117, "grad_norm": 2.2540292739868164, "learning_rate": 4.813468551653502e-05, "loss": 1.0758, "step": 120200 }, { "epoch": 1.8668818572603547, "grad_norm": 2.580883026123047, "learning_rate": 4.813313366129208e-05, "loss": 1.1091, "step": 120300 }, { "epoch": 1.8684337125032977, "grad_norm": 2.353062152862549, "learning_rate": 4.8131581806049136e-05, "loss": 1.0861, "step": 120400 }, { "epoch": 1.8699855677462405, "grad_norm": 2.2393531799316406, "learning_rate": 4.8130029950806194e-05, "loss": 1.0746, "step": 120500 }, { "epoch": 1.8715374229891837, "grad_norm": 2.7816336154937744, "learning_rate": 4.812847809556325e-05, "loss": 1.0866, "step": 120600 }, { "epoch": 1.8730892782321265, "grad_norm": 2.420135259628296, "learning_rate": 4.81269262403203e-05, "loss": 1.0895, "step": 120700 }, { "epoch": 1.8746411334750694, "grad_norm": 2.068148136138916, "learning_rate": 4.812537438507736e-05, "loss": 1.0727, "step": 120800 }, { "epoch": 1.8761929887180124, "grad_norm": 2.162074327468872, "learning_rate": 4.812382252983442e-05, "loss": 1.0858, "step": 120900 }, { "epoch": 1.8777448439609552, "grad_norm": 2.095033645629883, "learning_rate": 4.8122270674591476e-05, "loss": 1.0673, "step": 121000 }, { "epoch": 1.8792966992038984, "grad_norm": 2.1828792095184326, "learning_rate": 4.8120718819348533e-05, "loss": 1.0894, "step": 121100 }, { "epoch": 1.8808485544468412, "grad_norm": 2.5313050746917725, "learning_rate": 4.811916696410559e-05, "loss": 1.0648, "step": 121200 }, { "epoch": 1.8824004096897842, "grad_norm": 2.1740057468414307, "learning_rate": 4.811761510886265e-05, "loss": 1.0775, "step": 121300 }, { "epoch": 1.8839522649327272, "grad_norm": 2.04083251953125, "learning_rate": 4.81160632536197e-05, "loss": 1.0887, "step": 121400 }, { "epoch": 1.88550412017567, "grad_norm": 2.138786792755127, "learning_rate": 4.811451139837676e-05, "loss": 1.0649, "step": 121500 }, { "epoch": 1.887055975418613, "grad_norm": 2.0873091220855713, "learning_rate": 4.8112959543133815e-05, "loss": 1.0792, "step": 121600 }, { "epoch": 1.888607830661556, "grad_norm": 2.4566869735717773, "learning_rate": 4.811140768789087e-05, "loss": 1.0782, "step": 121700 }, { "epoch": 1.8901596859044987, "grad_norm": 2.4272701740264893, "learning_rate": 4.810985583264793e-05, "loss": 1.0624, "step": 121800 }, { "epoch": 1.8917115411474419, "grad_norm": 2.4703564643859863, "learning_rate": 4.810830397740499e-05, "loss": 1.0885, "step": 121900 }, { "epoch": 1.8932633963903847, "grad_norm": 2.5849311351776123, "learning_rate": 4.8106752122162046e-05, "loss": 1.0674, "step": 122000 }, { "epoch": 1.8948152516333276, "grad_norm": 2.0134143829345703, "learning_rate": 4.8105200266919104e-05, "loss": 1.0808, "step": 122100 }, { "epoch": 1.8963671068762706, "grad_norm": 2.4470725059509277, "learning_rate": 4.810364841167616e-05, "loss": 1.0833, "step": 122200 }, { "epoch": 1.8979189621192134, "grad_norm": 2.008669137954712, "learning_rate": 4.810209655643322e-05, "loss": 1.0646, "step": 122300 }, { "epoch": 1.8994708173621566, "grad_norm": 2.218724489212036, "learning_rate": 4.810054470119028e-05, "loss": 1.0711, "step": 122400 }, { "epoch": 1.9010226726050994, "grad_norm": 2.7357943058013916, "learning_rate": 4.8098992845947335e-05, "loss": 1.07, "step": 122500 }, { "epoch": 1.9025745278480424, "grad_norm": 1.8966476917266846, "learning_rate": 4.809744099070439e-05, "loss": 1.0899, "step": 122600 }, { "epoch": 1.9041263830909854, "grad_norm": 2.341580629348755, "learning_rate": 4.8095889135461444e-05, "loss": 1.0754, "step": 122700 }, { "epoch": 1.9056782383339281, "grad_norm": 2.0863394737243652, "learning_rate": 4.80943372802185e-05, "loss": 1.075, "step": 122800 }, { "epoch": 1.9072300935768711, "grad_norm": 2.4585351943969727, "learning_rate": 4.809278542497556e-05, "loss": 1.0641, "step": 122900 }, { "epoch": 1.9087819488198141, "grad_norm": 2.6105740070343018, "learning_rate": 4.809123356973262e-05, "loss": 1.0848, "step": 123000 }, { "epoch": 1.9103338040627569, "grad_norm": 2.510451316833496, "learning_rate": 4.8089681714489675e-05, "loss": 1.0702, "step": 123100 }, { "epoch": 1.9118856593057, "grad_norm": 2.0170397758483887, "learning_rate": 4.808812985924673e-05, "loss": 1.0634, "step": 123200 }, { "epoch": 1.9134375145486429, "grad_norm": 2.2080132961273193, "learning_rate": 4.808657800400379e-05, "loss": 1.0771, "step": 123300 }, { "epoch": 1.9149893697915858, "grad_norm": 2.509495735168457, "learning_rate": 4.808502614876085e-05, "loss": 1.0737, "step": 123400 }, { "epoch": 1.9165412250345288, "grad_norm": 2.6319026947021484, "learning_rate": 4.8083474293517906e-05, "loss": 1.0771, "step": 123500 }, { "epoch": 1.9180930802774716, "grad_norm": 2.0826144218444824, "learning_rate": 4.8081922438274964e-05, "loss": 1.0698, "step": 123600 }, { "epoch": 1.9196449355204148, "grad_norm": 2.6355018615722656, "learning_rate": 4.808037058303202e-05, "loss": 1.0792, "step": 123700 }, { "epoch": 1.9211967907633576, "grad_norm": 1.8507258892059326, "learning_rate": 4.807881872778908e-05, "loss": 1.073, "step": 123800 }, { "epoch": 1.9227486460063006, "grad_norm": 2.1909666061401367, "learning_rate": 4.807726687254614e-05, "loss": 1.0626, "step": 123900 }, { "epoch": 1.9243005012492436, "grad_norm": 2.3938145637512207, "learning_rate": 4.807571501730319e-05, "loss": 1.0549, "step": 124000 }, { "epoch": 1.9258523564921863, "grad_norm": 2.001340389251709, "learning_rate": 4.8074163162060246e-05, "loss": 1.0693, "step": 124100 }, { "epoch": 1.9274042117351293, "grad_norm": 2.752593994140625, "learning_rate": 4.8072611306817303e-05, "loss": 1.0608, "step": 124200 }, { "epoch": 1.9289560669780723, "grad_norm": 1.8566434383392334, "learning_rate": 4.8071059451574354e-05, "loss": 1.0636, "step": 124300 }, { "epoch": 1.930507922221015, "grad_norm": 2.8065900802612305, "learning_rate": 4.806950759633141e-05, "loss": 1.0877, "step": 124400 }, { "epoch": 1.9320597774639583, "grad_norm": 1.9071614742279053, "learning_rate": 4.806795574108847e-05, "loss": 1.0696, "step": 124500 }, { "epoch": 1.933611632706901, "grad_norm": 2.7064669132232666, "learning_rate": 4.806640388584553e-05, "loss": 1.0633, "step": 124600 }, { "epoch": 1.935163487949844, "grad_norm": 2.330824613571167, "learning_rate": 4.8064852030602585e-05, "loss": 1.0996, "step": 124700 }, { "epoch": 1.936715343192787, "grad_norm": 1.9912841320037842, "learning_rate": 4.806330017535964e-05, "loss": 1.0613, "step": 124800 }, { "epoch": 1.9382671984357298, "grad_norm": 2.084195613861084, "learning_rate": 4.80617483201167e-05, "loss": 1.0548, "step": 124900 }, { "epoch": 1.9398190536786728, "grad_norm": 2.2805652618408203, "learning_rate": 4.806019646487376e-05, "loss": 1.053, "step": 125000 }, { "epoch": 1.9413709089216158, "grad_norm": 2.013556957244873, "learning_rate": 4.8058644609630816e-05, "loss": 1.0651, "step": 125100 }, { "epoch": 1.9429227641645588, "grad_norm": 2.333754301071167, "learning_rate": 4.8057092754387874e-05, "loss": 1.0716, "step": 125200 }, { "epoch": 1.9444746194075018, "grad_norm": 2.6845860481262207, "learning_rate": 4.805554089914493e-05, "loss": 1.0646, "step": 125300 }, { "epoch": 1.9460264746504445, "grad_norm": 2.2428903579711914, "learning_rate": 4.805398904390199e-05, "loss": 1.0818, "step": 125400 }, { "epoch": 1.9475783298933875, "grad_norm": 2.3152270317077637, "learning_rate": 4.805243718865905e-05, "loss": 1.0688, "step": 125500 }, { "epoch": 1.9491301851363305, "grad_norm": 2.0691685676574707, "learning_rate": 4.80508853334161e-05, "loss": 1.077, "step": 125600 }, { "epoch": 1.9506820403792733, "grad_norm": 2.0317060947418213, "learning_rate": 4.8049333478173156e-05, "loss": 1.0657, "step": 125700 }, { "epoch": 1.9522338956222165, "grad_norm": 1.8157519102096558, "learning_rate": 4.8047781622930214e-05, "loss": 1.0762, "step": 125800 }, { "epoch": 1.9537857508651593, "grad_norm": 2.2983384132385254, "learning_rate": 4.804622976768727e-05, "loss": 1.063, "step": 125900 }, { "epoch": 1.9553376061081023, "grad_norm": 1.9815373420715332, "learning_rate": 4.804467791244433e-05, "loss": 1.0817, "step": 126000 }, { "epoch": 1.9568894613510452, "grad_norm": 2.76365065574646, "learning_rate": 4.804312605720139e-05, "loss": 1.0603, "step": 126100 }, { "epoch": 1.958441316593988, "grad_norm": 1.856594204902649, "learning_rate": 4.8041574201958445e-05, "loss": 1.0891, "step": 126200 }, { "epoch": 1.959993171836931, "grad_norm": 2.3030471801757812, "learning_rate": 4.80400223467155e-05, "loss": 1.0517, "step": 126300 }, { "epoch": 1.961545027079874, "grad_norm": 2.2358100414276123, "learning_rate": 4.803847049147256e-05, "loss": 1.0746, "step": 126400 }, { "epoch": 1.963096882322817, "grad_norm": 2.43802547454834, "learning_rate": 4.803691863622962e-05, "loss": 1.0627, "step": 126500 }, { "epoch": 1.96464873756576, "grad_norm": 2.9986813068389893, "learning_rate": 4.8035366780986676e-05, "loss": 1.0766, "step": 126600 }, { "epoch": 1.9662005928087027, "grad_norm": 1.985355019569397, "learning_rate": 4.8033814925743734e-05, "loss": 1.051, "step": 126700 }, { "epoch": 1.9677524480516457, "grad_norm": 2.38588809967041, "learning_rate": 4.803226307050079e-05, "loss": 1.0497, "step": 126800 }, { "epoch": 1.9693043032945887, "grad_norm": 2.0955090522766113, "learning_rate": 4.803071121525784e-05, "loss": 1.0722, "step": 126900 }, { "epoch": 1.9708561585375315, "grad_norm": 2.1312978267669678, "learning_rate": 4.80291593600149e-05, "loss": 1.0849, "step": 127000 }, { "epoch": 1.9724080137804747, "grad_norm": 2.509342670440674, "learning_rate": 4.802760750477196e-05, "loss": 1.0661, "step": 127100 }, { "epoch": 1.9739598690234175, "grad_norm": 2.378692626953125, "learning_rate": 4.802605564952901e-05, "loss": 1.084, "step": 127200 }, { "epoch": 1.9755117242663605, "grad_norm": 2.361170530319214, "learning_rate": 4.802450379428607e-05, "loss": 1.0692, "step": 127300 }, { "epoch": 1.9770635795093034, "grad_norm": 2.198575258255005, "learning_rate": 4.8022951939043124e-05, "loss": 1.0548, "step": 127400 }, { "epoch": 1.9786154347522462, "grad_norm": 1.966523289680481, "learning_rate": 4.802140008380018e-05, "loss": 1.0658, "step": 127500 }, { "epoch": 1.9801672899951892, "grad_norm": 2.093989610671997, "learning_rate": 4.801984822855724e-05, "loss": 1.0226, "step": 127600 }, { "epoch": 1.9817191452381322, "grad_norm": 2.2246530055999756, "learning_rate": 4.80182963733143e-05, "loss": 1.0453, "step": 127700 }, { "epoch": 1.983271000481075, "grad_norm": 2.239438533782959, "learning_rate": 4.8016744518071355e-05, "loss": 1.0663, "step": 127800 }, { "epoch": 1.9848228557240182, "grad_norm": 2.4696121215820312, "learning_rate": 4.801519266282841e-05, "loss": 1.0621, "step": 127900 }, { "epoch": 1.986374710966961, "grad_norm": 2.4301156997680664, "learning_rate": 4.801364080758547e-05, "loss": 1.0658, "step": 128000 }, { "epoch": 1.987926566209904, "grad_norm": 2.231473445892334, "learning_rate": 4.801208895234253e-05, "loss": 1.0761, "step": 128100 }, { "epoch": 1.989478421452847, "grad_norm": 2.036868095397949, "learning_rate": 4.8010537097099586e-05, "loss": 1.0813, "step": 128200 }, { "epoch": 1.9910302766957897, "grad_norm": 2.6127278804779053, "learning_rate": 4.8008985241856644e-05, "loss": 1.0674, "step": 128300 }, { "epoch": 1.992582131938733, "grad_norm": 2.0679049491882324, "learning_rate": 4.8007433386613695e-05, "loss": 1.0546, "step": 128400 }, { "epoch": 1.9941339871816757, "grad_norm": 2.076267957687378, "learning_rate": 4.800588153137075e-05, "loss": 1.0843, "step": 128500 }, { "epoch": 1.9956858424246187, "grad_norm": 2.563143253326416, "learning_rate": 4.800432967612781e-05, "loss": 1.0841, "step": 128600 }, { "epoch": 1.9972376976675617, "grad_norm": 2.4184048175811768, "learning_rate": 4.800277782088487e-05, "loss": 1.0774, "step": 128700 }, { "epoch": 1.9987895529105044, "grad_norm": 2.103375196456909, "learning_rate": 4.8001225965641926e-05, "loss": 1.0844, "step": 128800 }, { "epoch": 2.0003414081534476, "grad_norm": 2.2401986122131348, "learning_rate": 4.7999674110398984e-05, "loss": 1.0518, "step": 128900 }, { "epoch": 2.0018932633963904, "grad_norm": 2.137556314468384, "learning_rate": 4.799812225515604e-05, "loss": 1.0793, "step": 129000 }, { "epoch": 2.003445118639333, "grad_norm": 2.2270452976226807, "learning_rate": 4.79965703999131e-05, "loss": 1.0871, "step": 129100 }, { "epoch": 2.0049969738822764, "grad_norm": 2.251593589782715, "learning_rate": 4.799501854467016e-05, "loss": 1.0646, "step": 129200 }, { "epoch": 2.006548829125219, "grad_norm": 2.5513393878936768, "learning_rate": 4.7993466689427215e-05, "loss": 1.0721, "step": 129300 }, { "epoch": 2.0081006843681624, "grad_norm": 2.4913833141326904, "learning_rate": 4.799191483418427e-05, "loss": 1.0505, "step": 129400 }, { "epoch": 2.009652539611105, "grad_norm": 2.5230865478515625, "learning_rate": 4.799036297894133e-05, "loss": 1.081, "step": 129500 }, { "epoch": 2.011204394854048, "grad_norm": 2.1321935653686523, "learning_rate": 4.798881112369839e-05, "loss": 1.0728, "step": 129600 }, { "epoch": 2.012756250096991, "grad_norm": 2.1135218143463135, "learning_rate": 4.798725926845544e-05, "loss": 1.0559, "step": 129700 }, { "epoch": 2.014308105339934, "grad_norm": 2.455204725265503, "learning_rate": 4.79857074132125e-05, "loss": 1.0686, "step": 129800 }, { "epoch": 2.0158599605828766, "grad_norm": 2.4912171363830566, "learning_rate": 4.7984155557969555e-05, "loss": 1.0658, "step": 129900 }, { "epoch": 2.01741181582582, "grad_norm": 2.3110179901123047, "learning_rate": 4.798260370272661e-05, "loss": 1.0688, "step": 130000 }, { "epoch": 2.0189636710687626, "grad_norm": 2.387244939804077, "learning_rate": 4.798105184748367e-05, "loss": 1.0802, "step": 130100 }, { "epoch": 2.020515526311706, "grad_norm": 2.1166555881500244, "learning_rate": 4.797949999224073e-05, "loss": 1.0673, "step": 130200 }, { "epoch": 2.0220673815546486, "grad_norm": 2.5111243724823, "learning_rate": 4.7977948136997786e-05, "loss": 1.0664, "step": 130300 }, { "epoch": 2.0236192367975914, "grad_norm": 2.174515724182129, "learning_rate": 4.7976396281754843e-05, "loss": 1.0653, "step": 130400 }, { "epoch": 2.0251710920405346, "grad_norm": 2.346059560775757, "learning_rate": 4.7974844426511894e-05, "loss": 1.0811, "step": 130500 }, { "epoch": 2.0267229472834773, "grad_norm": 2.2295713424682617, "learning_rate": 4.797329257126895e-05, "loss": 1.058, "step": 130600 }, { "epoch": 2.02827480252642, "grad_norm": 1.9734086990356445, "learning_rate": 4.797174071602601e-05, "loss": 1.0757, "step": 130700 }, { "epoch": 2.0298266577693633, "grad_norm": 2.238551139831543, "learning_rate": 4.797018886078307e-05, "loss": 1.0426, "step": 130800 }, { "epoch": 2.031378513012306, "grad_norm": 2.4086573123931885, "learning_rate": 4.7968637005540125e-05, "loss": 1.0721, "step": 130900 }, { "epoch": 2.0329303682552493, "grad_norm": 2.3574037551879883, "learning_rate": 4.796708515029718e-05, "loss": 1.0734, "step": 131000 }, { "epoch": 2.034482223498192, "grad_norm": 2.0901331901550293, "learning_rate": 4.796553329505424e-05, "loss": 1.069, "step": 131100 }, { "epoch": 2.036034078741135, "grad_norm": 2.7794599533081055, "learning_rate": 4.796398143981129e-05, "loss": 1.0334, "step": 131200 }, { "epoch": 2.037585933984078, "grad_norm": 2.2324187755584717, "learning_rate": 4.796242958456835e-05, "loss": 1.0765, "step": 131300 }, { "epoch": 2.039137789227021, "grad_norm": 2.6240456104278564, "learning_rate": 4.796087772932541e-05, "loss": 1.06, "step": 131400 }, { "epoch": 2.040689644469964, "grad_norm": 2.271939754486084, "learning_rate": 4.7959325874082465e-05, "loss": 1.0617, "step": 131500 }, { "epoch": 2.042241499712907, "grad_norm": 2.1382699012756348, "learning_rate": 4.795777401883952e-05, "loss": 1.0835, "step": 131600 }, { "epoch": 2.0437933549558496, "grad_norm": 2.61220645904541, "learning_rate": 4.795622216359658e-05, "loss": 1.0743, "step": 131700 }, { "epoch": 2.045345210198793, "grad_norm": 2.5175349712371826, "learning_rate": 4.795467030835364e-05, "loss": 1.0408, "step": 131800 }, { "epoch": 2.0468970654417356, "grad_norm": 1.8807514905929565, "learning_rate": 4.7953118453110696e-05, "loss": 1.0403, "step": 131900 }, { "epoch": 2.0484489206846783, "grad_norm": 2.0968101024627686, "learning_rate": 4.7951566597867754e-05, "loss": 1.0435, "step": 132000 }, { "epoch": 2.0500007759276215, "grad_norm": 2.2672133445739746, "learning_rate": 4.795001474262481e-05, "loss": 1.0406, "step": 132100 }, { "epoch": 2.0515526311705643, "grad_norm": 2.366468906402588, "learning_rate": 4.794846288738187e-05, "loss": 1.0459, "step": 132200 }, { "epoch": 2.0531044864135075, "grad_norm": 2.247690200805664, "learning_rate": 4.794691103213893e-05, "loss": 1.0539, "step": 132300 }, { "epoch": 2.0546563416564503, "grad_norm": 1.8925704956054688, "learning_rate": 4.7945359176895985e-05, "loss": 1.0493, "step": 132400 }, { "epoch": 2.056208196899393, "grad_norm": 1.930070161819458, "learning_rate": 4.7943807321653036e-05, "loss": 1.0664, "step": 132500 }, { "epoch": 2.0577600521423363, "grad_norm": 1.8710217475891113, "learning_rate": 4.7942255466410094e-05, "loss": 1.0624, "step": 132600 }, { "epoch": 2.059311907385279, "grad_norm": 2.365832805633545, "learning_rate": 4.794070361116715e-05, "loss": 1.056, "step": 132700 }, { "epoch": 2.0608637626282222, "grad_norm": 2.0719170570373535, "learning_rate": 4.793915175592421e-05, "loss": 1.0608, "step": 132800 }, { "epoch": 2.062415617871165, "grad_norm": 3.2959883213043213, "learning_rate": 4.793759990068127e-05, "loss": 1.0539, "step": 132900 }, { "epoch": 2.0639674731141078, "grad_norm": 2.6179490089416504, "learning_rate": 4.7936048045438325e-05, "loss": 1.06, "step": 133000 }, { "epoch": 2.065519328357051, "grad_norm": 1.979307770729065, "learning_rate": 4.793449619019538e-05, "loss": 1.0611, "step": 133100 }, { "epoch": 2.0670711835999938, "grad_norm": 2.7358052730560303, "learning_rate": 4.793294433495244e-05, "loss": 1.0437, "step": 133200 }, { "epoch": 2.0686230388429365, "grad_norm": 2.2443244457244873, "learning_rate": 4.79313924797095e-05, "loss": 1.0703, "step": 133300 }, { "epoch": 2.0701748940858797, "grad_norm": 1.9455335140228271, "learning_rate": 4.7929840624466556e-05, "loss": 1.07, "step": 133400 }, { "epoch": 2.0717267493288225, "grad_norm": 2.095489501953125, "learning_rate": 4.7928288769223613e-05, "loss": 1.0783, "step": 133500 }, { "epoch": 2.0732786045717657, "grad_norm": 2.3454062938690186, "learning_rate": 4.792673691398067e-05, "loss": 1.0612, "step": 133600 }, { "epoch": 2.0748304598147085, "grad_norm": 1.7862906455993652, "learning_rate": 4.792518505873773e-05, "loss": 1.0449, "step": 133700 }, { "epoch": 2.0763823150576513, "grad_norm": 1.7941592931747437, "learning_rate": 4.792363320349478e-05, "loss": 1.083, "step": 133800 }, { "epoch": 2.0779341703005945, "grad_norm": 2.3605029582977295, "learning_rate": 4.792208134825184e-05, "loss": 1.0819, "step": 133900 }, { "epoch": 2.0794860255435372, "grad_norm": 3.062452793121338, "learning_rate": 4.7920529493008895e-05, "loss": 1.0524, "step": 134000 }, { "epoch": 2.0810378807864804, "grad_norm": 2.369102954864502, "learning_rate": 4.7918977637765946e-05, "loss": 1.0747, "step": 134100 }, { "epoch": 2.082589736029423, "grad_norm": 4.67025899887085, "learning_rate": 4.7917425782523004e-05, "loss": 1.0761, "step": 134200 }, { "epoch": 2.084141591272366, "grad_norm": 2.3892180919647217, "learning_rate": 4.791587392728006e-05, "loss": 1.0776, "step": 134300 }, { "epoch": 2.085693446515309, "grad_norm": 2.34405255317688, "learning_rate": 4.791432207203712e-05, "loss": 1.0763, "step": 134400 }, { "epoch": 2.087245301758252, "grad_norm": 2.067636251449585, "learning_rate": 4.791277021679418e-05, "loss": 1.0402, "step": 134500 }, { "epoch": 2.0887971570011947, "grad_norm": 2.8550288677215576, "learning_rate": 4.7911218361551235e-05, "loss": 1.0499, "step": 134600 }, { "epoch": 2.090349012244138, "grad_norm": 2.4426872730255127, "learning_rate": 4.790966650630829e-05, "loss": 1.0637, "step": 134700 }, { "epoch": 2.0919008674870807, "grad_norm": 2.5001494884490967, "learning_rate": 4.790811465106535e-05, "loss": 1.085, "step": 134800 }, { "epoch": 2.093452722730024, "grad_norm": 2.5100629329681396, "learning_rate": 4.790656279582241e-05, "loss": 1.0563, "step": 134900 }, { "epoch": 2.0950045779729667, "grad_norm": 2.1518192291259766, "learning_rate": 4.7905010940579466e-05, "loss": 1.0575, "step": 135000 }, { "epoch": 2.0965564332159095, "grad_norm": 1.876930594444275, "learning_rate": 4.7903459085336524e-05, "loss": 1.0619, "step": 135100 }, { "epoch": 2.0981082884588527, "grad_norm": 2.4290332794189453, "learning_rate": 4.790190723009358e-05, "loss": 1.0562, "step": 135200 }, { "epoch": 2.0996601437017954, "grad_norm": 2.172323226928711, "learning_rate": 4.790035537485064e-05, "loss": 1.0765, "step": 135300 }, { "epoch": 2.1012119989447386, "grad_norm": 2.4881210327148438, "learning_rate": 4.789880351960769e-05, "loss": 1.0433, "step": 135400 }, { "epoch": 2.1027638541876814, "grad_norm": 2.0730140209198, "learning_rate": 4.789725166436475e-05, "loss": 1.0639, "step": 135500 }, { "epoch": 2.104315709430624, "grad_norm": 2.4232640266418457, "learning_rate": 4.7895699809121806e-05, "loss": 1.0418, "step": 135600 }, { "epoch": 2.1058675646735674, "grad_norm": 2.2779343128204346, "learning_rate": 4.7894147953878864e-05, "loss": 1.0515, "step": 135700 }, { "epoch": 2.10741941991651, "grad_norm": 2.540282726287842, "learning_rate": 4.789259609863592e-05, "loss": 1.0798, "step": 135800 }, { "epoch": 2.108971275159453, "grad_norm": 2.4380288124084473, "learning_rate": 4.789104424339298e-05, "loss": 1.0632, "step": 135900 }, { "epoch": 2.110523130402396, "grad_norm": 2.7196033000946045, "learning_rate": 4.788949238815004e-05, "loss": 1.0826, "step": 136000 }, { "epoch": 2.112074985645339, "grad_norm": 2.6508028507232666, "learning_rate": 4.7887940532907095e-05, "loss": 1.0594, "step": 136100 }, { "epoch": 2.113626840888282, "grad_norm": 2.185214042663574, "learning_rate": 4.788638867766415e-05, "loss": 1.069, "step": 136200 }, { "epoch": 2.115178696131225, "grad_norm": 2.311896562576294, "learning_rate": 4.788483682242121e-05, "loss": 1.0512, "step": 136300 }, { "epoch": 2.1167305513741677, "grad_norm": 2.380798101425171, "learning_rate": 4.788328496717827e-05, "loss": 1.0558, "step": 136400 }, { "epoch": 2.118282406617111, "grad_norm": 2.257411479949951, "learning_rate": 4.7881733111935326e-05, "loss": 1.0274, "step": 136500 }, { "epoch": 2.1198342618600536, "grad_norm": 2.556018829345703, "learning_rate": 4.7880181256692383e-05, "loss": 1.0364, "step": 136600 }, { "epoch": 2.121386117102997, "grad_norm": 2.053537368774414, "learning_rate": 4.7878629401449434e-05, "loss": 1.0501, "step": 136700 }, { "epoch": 2.1229379723459396, "grad_norm": 2.075845718383789, "learning_rate": 4.787707754620649e-05, "loss": 1.0751, "step": 136800 }, { "epoch": 2.1244898275888824, "grad_norm": 1.9716755151748657, "learning_rate": 4.787552569096355e-05, "loss": 1.0498, "step": 136900 }, { "epoch": 2.1260416828318256, "grad_norm": 2.6828722953796387, "learning_rate": 4.78739738357206e-05, "loss": 1.0683, "step": 137000 }, { "epoch": 2.1275935380747684, "grad_norm": 2.434020519256592, "learning_rate": 4.787242198047766e-05, "loss": 1.0791, "step": 137100 }, { "epoch": 2.129145393317711, "grad_norm": 2.335705280303955, "learning_rate": 4.7870870125234716e-05, "loss": 1.0379, "step": 137200 }, { "epoch": 2.1306972485606543, "grad_norm": 2.5148603916168213, "learning_rate": 4.7869318269991774e-05, "loss": 1.0629, "step": 137300 }, { "epoch": 2.132249103803597, "grad_norm": 2.0944297313690186, "learning_rate": 4.786776641474883e-05, "loss": 1.0474, "step": 137400 }, { "epoch": 2.1338009590465403, "grad_norm": 2.3225951194763184, "learning_rate": 4.786621455950589e-05, "loss": 1.0318, "step": 137500 }, { "epoch": 2.135352814289483, "grad_norm": 2.3951828479766846, "learning_rate": 4.786466270426295e-05, "loss": 1.067, "step": 137600 }, { "epoch": 2.136904669532426, "grad_norm": 2.2558112144470215, "learning_rate": 4.7863110849020005e-05, "loss": 1.0404, "step": 137700 }, { "epoch": 2.138456524775369, "grad_norm": 2.2582268714904785, "learning_rate": 4.786155899377706e-05, "loss": 1.0716, "step": 137800 }, { "epoch": 2.140008380018312, "grad_norm": 2.364276885986328, "learning_rate": 4.786000713853412e-05, "loss": 1.0602, "step": 137900 }, { "epoch": 2.141560235261255, "grad_norm": 2.464024305343628, "learning_rate": 4.785845528329118e-05, "loss": 1.0571, "step": 138000 }, { "epoch": 2.143112090504198, "grad_norm": 2.0748398303985596, "learning_rate": 4.7856903428048236e-05, "loss": 1.0695, "step": 138100 }, { "epoch": 2.1446639457471406, "grad_norm": 1.9313088655471802, "learning_rate": 4.785535157280529e-05, "loss": 1.0603, "step": 138200 }, { "epoch": 2.146215800990084, "grad_norm": 2.4812843799591064, "learning_rate": 4.7853799717562345e-05, "loss": 1.047, "step": 138300 }, { "epoch": 2.1477676562330266, "grad_norm": 2.34519100189209, "learning_rate": 4.78522478623194e-05, "loss": 1.081, "step": 138400 }, { "epoch": 2.1493195114759693, "grad_norm": 2.3789329528808594, "learning_rate": 4.785069600707646e-05, "loss": 1.0548, "step": 138500 }, { "epoch": 2.1508713667189125, "grad_norm": 2.2253458499908447, "learning_rate": 4.784914415183352e-05, "loss": 1.0848, "step": 138600 }, { "epoch": 2.1524232219618553, "grad_norm": 2.3522236347198486, "learning_rate": 4.7847592296590576e-05, "loss": 1.0632, "step": 138700 }, { "epoch": 2.1539750772047985, "grad_norm": 2.1813430786132812, "learning_rate": 4.7846040441347634e-05, "loss": 1.0576, "step": 138800 }, { "epoch": 2.1555269324477413, "grad_norm": 2.080888271331787, "learning_rate": 4.784448858610469e-05, "loss": 1.0775, "step": 138900 }, { "epoch": 2.157078787690684, "grad_norm": 2.6952641010284424, "learning_rate": 4.784293673086175e-05, "loss": 1.053, "step": 139000 }, { "epoch": 2.1586306429336273, "grad_norm": 2.1119320392608643, "learning_rate": 4.784138487561881e-05, "loss": 1.0453, "step": 139100 }, { "epoch": 2.16018249817657, "grad_norm": 2.2789254188537598, "learning_rate": 4.7839833020375865e-05, "loss": 1.0487, "step": 139200 }, { "epoch": 2.1617343534195133, "grad_norm": 2.819838523864746, "learning_rate": 4.783828116513292e-05, "loss": 1.0581, "step": 139300 }, { "epoch": 2.163286208662456, "grad_norm": 2.0911495685577393, "learning_rate": 4.783672930988998e-05, "loss": 1.0586, "step": 139400 }, { "epoch": 2.164838063905399, "grad_norm": 2.501739501953125, "learning_rate": 4.783517745464703e-05, "loss": 1.0686, "step": 139500 }, { "epoch": 2.166389919148342, "grad_norm": 2.484483003616333, "learning_rate": 4.783362559940409e-05, "loss": 1.0483, "step": 139600 }, { "epoch": 2.1679417743912848, "grad_norm": 1.9543863534927368, "learning_rate": 4.783207374416115e-05, "loss": 1.0493, "step": 139700 }, { "epoch": 2.1694936296342275, "grad_norm": 2.141782283782959, "learning_rate": 4.7830521888918204e-05, "loss": 1.0655, "step": 139800 }, { "epoch": 2.1710454848771708, "grad_norm": 2.1450002193450928, "learning_rate": 4.782897003367526e-05, "loss": 1.0814, "step": 139900 }, { "epoch": 2.1725973401201135, "grad_norm": 2.242635726928711, "learning_rate": 4.782741817843232e-05, "loss": 1.05, "step": 140000 }, { "epoch": 2.1741491953630567, "grad_norm": 2.4195568561553955, "learning_rate": 4.782586632318938e-05, "loss": 1.039, "step": 140100 }, { "epoch": 2.1757010506059995, "grad_norm": 2.152660608291626, "learning_rate": 4.7824314467946435e-05, "loss": 1.046, "step": 140200 }, { "epoch": 2.1772529058489423, "grad_norm": 2.4316248893737793, "learning_rate": 4.7822762612703486e-05, "loss": 1.057, "step": 140300 }, { "epoch": 2.1788047610918855, "grad_norm": 2.2033448219299316, "learning_rate": 4.7821210757460544e-05, "loss": 1.0719, "step": 140400 }, { "epoch": 2.1803566163348282, "grad_norm": 2.146704912185669, "learning_rate": 4.78196589022176e-05, "loss": 1.0602, "step": 140500 }, { "epoch": 2.1819084715777715, "grad_norm": 3.163799524307251, "learning_rate": 4.781810704697466e-05, "loss": 1.0271, "step": 140600 }, { "epoch": 2.1834603268207142, "grad_norm": 2.3232812881469727, "learning_rate": 4.781655519173172e-05, "loss": 1.026, "step": 140700 }, { "epoch": 2.185012182063657, "grad_norm": 2.2287683486938477, "learning_rate": 4.7815003336488775e-05, "loss": 1.0637, "step": 140800 }, { "epoch": 2.1865640373066, "grad_norm": 2.2433431148529053, "learning_rate": 4.781345148124583e-05, "loss": 1.028, "step": 140900 }, { "epoch": 2.188115892549543, "grad_norm": 1.9653691053390503, "learning_rate": 4.781189962600289e-05, "loss": 1.0513, "step": 141000 }, { "epoch": 2.1896677477924857, "grad_norm": 2.280937910079956, "learning_rate": 4.781034777075994e-05, "loss": 1.0474, "step": 141100 }, { "epoch": 2.191219603035429, "grad_norm": 2.641557216644287, "learning_rate": 4.7808795915517e-05, "loss": 1.0453, "step": 141200 }, { "epoch": 2.1927714582783717, "grad_norm": 2.0779197216033936, "learning_rate": 4.780724406027406e-05, "loss": 1.0412, "step": 141300 }, { "epoch": 2.194323313521315, "grad_norm": 2.32072377204895, "learning_rate": 4.7805692205031115e-05, "loss": 1.0731, "step": 141400 }, { "epoch": 2.1958751687642577, "grad_norm": 2.3527817726135254, "learning_rate": 4.780414034978817e-05, "loss": 1.0712, "step": 141500 }, { "epoch": 2.1974270240072005, "grad_norm": 2.154773712158203, "learning_rate": 4.780258849454523e-05, "loss": 1.0474, "step": 141600 }, { "epoch": 2.1989788792501437, "grad_norm": 2.3761403560638428, "learning_rate": 4.780103663930229e-05, "loss": 1.0418, "step": 141700 }, { "epoch": 2.2005307344930864, "grad_norm": 2.551168918609619, "learning_rate": 4.7799484784059346e-05, "loss": 1.0588, "step": 141800 }, { "epoch": 2.202082589736029, "grad_norm": 2.4025044441223145, "learning_rate": 4.7797932928816404e-05, "loss": 1.0558, "step": 141900 }, { "epoch": 2.2036344449789724, "grad_norm": 2.3074281215667725, "learning_rate": 4.779638107357346e-05, "loss": 1.0601, "step": 142000 }, { "epoch": 2.205186300221915, "grad_norm": 1.9330724477767944, "learning_rate": 4.779482921833052e-05, "loss": 1.0652, "step": 142100 }, { "epoch": 2.2067381554648584, "grad_norm": 2.2744085788726807, "learning_rate": 4.779327736308758e-05, "loss": 1.0801, "step": 142200 }, { "epoch": 2.208290010707801, "grad_norm": 2.1365199089050293, "learning_rate": 4.7791725507844635e-05, "loss": 1.062, "step": 142300 }, { "epoch": 2.209841865950744, "grad_norm": 2.1086266040802, "learning_rate": 4.7790173652601686e-05, "loss": 1.0703, "step": 142400 }, { "epoch": 2.211393721193687, "grad_norm": 2.3713245391845703, "learning_rate": 4.778862179735874e-05, "loss": 1.0495, "step": 142500 }, { "epoch": 2.21294557643663, "grad_norm": 2.628063678741455, "learning_rate": 4.77870699421158e-05, "loss": 1.0703, "step": 142600 }, { "epoch": 2.214497431679573, "grad_norm": 2.2222647666931152, "learning_rate": 4.778551808687286e-05, "loss": 1.0607, "step": 142700 }, { "epoch": 2.216049286922516, "grad_norm": 2.295289993286133, "learning_rate": 4.778396623162992e-05, "loss": 1.061, "step": 142800 }, { "epoch": 2.2176011421654587, "grad_norm": 2.6212644577026367, "learning_rate": 4.7782414376386974e-05, "loss": 1.0627, "step": 142900 }, { "epoch": 2.219152997408402, "grad_norm": 2.4660825729370117, "learning_rate": 4.778086252114403e-05, "loss": 1.0458, "step": 143000 }, { "epoch": 2.2207048526513447, "grad_norm": 2.1224730014801025, "learning_rate": 4.777931066590109e-05, "loss": 1.0422, "step": 143100 }, { "epoch": 2.2222567078942874, "grad_norm": 2.405404567718506, "learning_rate": 4.777775881065815e-05, "loss": 1.0339, "step": 143200 }, { "epoch": 2.2238085631372306, "grad_norm": 1.9070557355880737, "learning_rate": 4.7776206955415205e-05, "loss": 1.0727, "step": 143300 }, { "epoch": 2.2253604183801734, "grad_norm": 2.354372501373291, "learning_rate": 4.777465510017226e-05, "loss": 1.0542, "step": 143400 }, { "epoch": 2.2269122736231166, "grad_norm": 2.394808769226074, "learning_rate": 4.7773103244929314e-05, "loss": 1.0594, "step": 143500 }, { "epoch": 2.2284641288660594, "grad_norm": 2.148226022720337, "learning_rate": 4.777155138968637e-05, "loss": 1.0533, "step": 143600 }, { "epoch": 2.230015984109002, "grad_norm": 2.3424553871154785, "learning_rate": 4.776999953444343e-05, "loss": 1.0554, "step": 143700 }, { "epoch": 2.2315678393519454, "grad_norm": 2.2306673526763916, "learning_rate": 4.776844767920049e-05, "loss": 1.0297, "step": 143800 }, { "epoch": 2.233119694594888, "grad_norm": 2.1870059967041016, "learning_rate": 4.776689582395754e-05, "loss": 1.0707, "step": 143900 }, { "epoch": 2.234671549837831, "grad_norm": 1.9275952577590942, "learning_rate": 4.7765343968714596e-05, "loss": 1.0456, "step": 144000 }, { "epoch": 2.236223405080774, "grad_norm": 2.018854856491089, "learning_rate": 4.7763792113471654e-05, "loss": 1.0543, "step": 144100 }, { "epoch": 2.237775260323717, "grad_norm": 2.496302843093872, "learning_rate": 4.776224025822871e-05, "loss": 1.0546, "step": 144200 }, { "epoch": 2.23932711556666, "grad_norm": 2.695812702178955, "learning_rate": 4.776068840298577e-05, "loss": 1.0696, "step": 144300 }, { "epoch": 2.240878970809603, "grad_norm": 2.6274967193603516, "learning_rate": 4.775913654774283e-05, "loss": 1.068, "step": 144400 }, { "epoch": 2.2424308260525456, "grad_norm": 2.355994701385498, "learning_rate": 4.7757584692499885e-05, "loss": 1.0713, "step": 144500 }, { "epoch": 2.243982681295489, "grad_norm": 2.223212957382202, "learning_rate": 4.775603283725694e-05, "loss": 1.0568, "step": 144600 }, { "epoch": 2.2455345365384316, "grad_norm": 1.9176734685897827, "learning_rate": 4.7754480982014e-05, "loss": 1.0566, "step": 144700 }, { "epoch": 2.247086391781375, "grad_norm": 2.3282992839813232, "learning_rate": 4.775292912677106e-05, "loss": 1.0693, "step": 144800 }, { "epoch": 2.2486382470243176, "grad_norm": 2.0825443267822266, "learning_rate": 4.7751377271528116e-05, "loss": 1.0731, "step": 144900 }, { "epoch": 2.2501901022672604, "grad_norm": 2.376631736755371, "learning_rate": 4.7749825416285174e-05, "loss": 1.0633, "step": 145000 }, { "epoch": 2.2517419575102036, "grad_norm": 2.3078227043151855, "learning_rate": 4.774827356104223e-05, "loss": 1.0818, "step": 145100 }, { "epoch": 2.2532938127531463, "grad_norm": 2.1128923892974854, "learning_rate": 4.774672170579928e-05, "loss": 1.044, "step": 145200 }, { "epoch": 2.254845667996089, "grad_norm": 1.8535137176513672, "learning_rate": 4.774516985055634e-05, "loss": 1.0272, "step": 145300 }, { "epoch": 2.2563975232390323, "grad_norm": 2.605168342590332, "learning_rate": 4.77436179953134e-05, "loss": 1.0724, "step": 145400 }, { "epoch": 2.257949378481975, "grad_norm": 2.319013833999634, "learning_rate": 4.7742066140070456e-05, "loss": 1.0656, "step": 145500 }, { "epoch": 2.2595012337249183, "grad_norm": 2.407966375350952, "learning_rate": 4.774051428482751e-05, "loss": 1.0547, "step": 145600 }, { "epoch": 2.261053088967861, "grad_norm": 2.496340751647949, "learning_rate": 4.773896242958457e-05, "loss": 1.0663, "step": 145700 }, { "epoch": 2.262604944210804, "grad_norm": 2.252469301223755, "learning_rate": 4.773741057434163e-05, "loss": 1.061, "step": 145800 }, { "epoch": 2.264156799453747, "grad_norm": 2.1885874271392822, "learning_rate": 4.773585871909869e-05, "loss": 1.0475, "step": 145900 }, { "epoch": 2.26570865469669, "grad_norm": 2.185015916824341, "learning_rate": 4.7734306863855744e-05, "loss": 1.0393, "step": 146000 }, { "epoch": 2.267260509939633, "grad_norm": 2.1869454383850098, "learning_rate": 4.77327550086128e-05, "loss": 1.0415, "step": 146100 }, { "epoch": 2.268812365182576, "grad_norm": 2.1967248916625977, "learning_rate": 4.773120315336986e-05, "loss": 1.0583, "step": 146200 }, { "epoch": 2.2703642204255186, "grad_norm": 2.3129613399505615, "learning_rate": 4.772965129812692e-05, "loss": 1.053, "step": 146300 }, { "epoch": 2.2719160756684618, "grad_norm": 2.094412088394165, "learning_rate": 4.7728099442883975e-05, "loss": 1.0623, "step": 146400 }, { "epoch": 2.2734679309114045, "grad_norm": 2.1780972480773926, "learning_rate": 4.7726547587641026e-05, "loss": 1.0633, "step": 146500 }, { "epoch": 2.2750197861543473, "grad_norm": 2.5608599185943604, "learning_rate": 4.7724995732398084e-05, "loss": 1.045, "step": 146600 }, { "epoch": 2.2765716413972905, "grad_norm": 2.292449712753296, "learning_rate": 4.772344387715514e-05, "loss": 1.0416, "step": 146700 }, { "epoch": 2.2781234966402333, "grad_norm": 2.345844268798828, "learning_rate": 4.772189202191219e-05, "loss": 1.0562, "step": 146800 }, { "epoch": 2.2796753518831765, "grad_norm": 2.333442449569702, "learning_rate": 4.772034016666925e-05, "loss": 1.0507, "step": 146900 }, { "epoch": 2.2812272071261193, "grad_norm": 2.647368907928467, "learning_rate": 4.771878831142631e-05, "loss": 1.0524, "step": 147000 }, { "epoch": 2.282779062369062, "grad_norm": 2.471221923828125, "learning_rate": 4.7717236456183366e-05, "loss": 1.0581, "step": 147100 }, { "epoch": 2.2843309176120052, "grad_norm": 2.3910906314849854, "learning_rate": 4.7715684600940424e-05, "loss": 1.0873, "step": 147200 }, { "epoch": 2.285882772854948, "grad_norm": 2.331425666809082, "learning_rate": 4.771413274569748e-05, "loss": 1.0553, "step": 147300 }, { "epoch": 2.287434628097891, "grad_norm": 2.3420097827911377, "learning_rate": 4.771258089045454e-05, "loss": 1.0707, "step": 147400 }, { "epoch": 2.288986483340834, "grad_norm": 1.8440392017364502, "learning_rate": 4.77110290352116e-05, "loss": 1.0706, "step": 147500 }, { "epoch": 2.2905383385837768, "grad_norm": 2.0832135677337646, "learning_rate": 4.7709477179968655e-05, "loss": 1.0724, "step": 147600 }, { "epoch": 2.29209019382672, "grad_norm": 1.790666937828064, "learning_rate": 4.770792532472571e-05, "loss": 1.04, "step": 147700 }, { "epoch": 2.2936420490696627, "grad_norm": 2.523416757583618, "learning_rate": 4.770637346948277e-05, "loss": 1.035, "step": 147800 }, { "epoch": 2.2951939043126055, "grad_norm": 2.341848373413086, "learning_rate": 4.770482161423983e-05, "loss": 1.0577, "step": 147900 }, { "epoch": 2.2967457595555487, "grad_norm": 2.3899857997894287, "learning_rate": 4.770326975899688e-05, "loss": 1.0477, "step": 148000 }, { "epoch": 2.2982976147984915, "grad_norm": 2.122985363006592, "learning_rate": 4.770171790375394e-05, "loss": 1.0548, "step": 148100 }, { "epoch": 2.2998494700414347, "grad_norm": 1.9313324689865112, "learning_rate": 4.7700166048510995e-05, "loss": 1.0584, "step": 148200 }, { "epoch": 2.3014013252843775, "grad_norm": 2.4943015575408936, "learning_rate": 4.769861419326805e-05, "loss": 1.0351, "step": 148300 }, { "epoch": 2.3029531805273202, "grad_norm": 2.653111457824707, "learning_rate": 4.769706233802511e-05, "loss": 1.0483, "step": 148400 }, { "epoch": 2.3045050357702634, "grad_norm": 2.2838118076324463, "learning_rate": 4.769551048278217e-05, "loss": 1.0485, "step": 148500 }, { "epoch": 2.306056891013206, "grad_norm": 2.3461153507232666, "learning_rate": 4.7693958627539226e-05, "loss": 1.0508, "step": 148600 }, { "epoch": 2.3076087462561494, "grad_norm": 2.42330002784729, "learning_rate": 4.769240677229628e-05, "loss": 1.0796, "step": 148700 }, { "epoch": 2.309160601499092, "grad_norm": 2.3097708225250244, "learning_rate": 4.769085491705334e-05, "loss": 1.0496, "step": 148800 }, { "epoch": 2.310712456742035, "grad_norm": 2.0397136211395264, "learning_rate": 4.76893030618104e-05, "loss": 1.0509, "step": 148900 }, { "epoch": 2.312264311984978, "grad_norm": 2.363084316253662, "learning_rate": 4.768775120656746e-05, "loss": 1.0342, "step": 149000 }, { "epoch": 2.313816167227921, "grad_norm": 2.190420389175415, "learning_rate": 4.7686199351324514e-05, "loss": 1.0668, "step": 149100 }, { "epoch": 2.3153680224708637, "grad_norm": 2.466182231903076, "learning_rate": 4.768464749608157e-05, "loss": 1.0614, "step": 149200 }, { "epoch": 2.316919877713807, "grad_norm": 2.198457956314087, "learning_rate": 4.768309564083862e-05, "loss": 1.0568, "step": 149300 }, { "epoch": 2.3184717329567497, "grad_norm": 2.161055088043213, "learning_rate": 4.768154378559568e-05, "loss": 1.0534, "step": 149400 }, { "epoch": 2.320023588199693, "grad_norm": 2.147515296936035, "learning_rate": 4.767999193035274e-05, "loss": 1.0281, "step": 149500 }, { "epoch": 2.3215754434426357, "grad_norm": 2.2802913188934326, "learning_rate": 4.7678440075109796e-05, "loss": 1.048, "step": 149600 }, { "epoch": 2.3231272986855784, "grad_norm": 2.000180721282959, "learning_rate": 4.7676888219866854e-05, "loss": 1.043, "step": 149700 }, { "epoch": 2.3246791539285216, "grad_norm": 2.777062177658081, "learning_rate": 4.767533636462391e-05, "loss": 1.0575, "step": 149800 }, { "epoch": 2.3262310091714644, "grad_norm": 2.241762399673462, "learning_rate": 4.767378450938097e-05, "loss": 1.0437, "step": 149900 }, { "epoch": 2.3277828644144076, "grad_norm": 2.5319266319274902, "learning_rate": 4.767223265413802e-05, "loss": 1.0556, "step": 150000 }, { "epoch": 2.3293347196573504, "grad_norm": 2.237144708633423, "learning_rate": 4.767068079889508e-05, "loss": 1.0476, "step": 150100 }, { "epoch": 2.330886574900293, "grad_norm": 2.430987596511841, "learning_rate": 4.7669128943652136e-05, "loss": 1.0488, "step": 150200 }, { "epoch": 2.3324384301432364, "grad_norm": 2.464299440383911, "learning_rate": 4.7667577088409194e-05, "loss": 1.0383, "step": 150300 }, { "epoch": 2.333990285386179, "grad_norm": 2.4423747062683105, "learning_rate": 4.766602523316625e-05, "loss": 1.0516, "step": 150400 }, { "epoch": 2.335542140629122, "grad_norm": 2.6412713527679443, "learning_rate": 4.766447337792331e-05, "loss": 1.079, "step": 150500 }, { "epoch": 2.337093995872065, "grad_norm": 2.2306251525878906, "learning_rate": 4.766292152268037e-05, "loss": 1.0465, "step": 150600 }, { "epoch": 2.338645851115008, "grad_norm": 2.1720101833343506, "learning_rate": 4.7661369667437425e-05, "loss": 1.0469, "step": 150700 }, { "epoch": 2.340197706357951, "grad_norm": 2.2147459983825684, "learning_rate": 4.765981781219448e-05, "loss": 1.0604, "step": 150800 }, { "epoch": 2.341749561600894, "grad_norm": 2.310157299041748, "learning_rate": 4.7658265956951534e-05, "loss": 1.0717, "step": 150900 }, { "epoch": 2.3433014168438366, "grad_norm": 2.3886160850524902, "learning_rate": 4.765671410170859e-05, "loss": 1.0552, "step": 151000 }, { "epoch": 2.34485327208678, "grad_norm": 2.4885671138763428, "learning_rate": 4.765516224646565e-05, "loss": 1.0506, "step": 151100 }, { "epoch": 2.3464051273297226, "grad_norm": 2.5518290996551514, "learning_rate": 4.765361039122271e-05, "loss": 1.0614, "step": 151200 }, { "epoch": 2.347956982572666, "grad_norm": 1.9272632598876953, "learning_rate": 4.7652058535979765e-05, "loss": 1.0401, "step": 151300 }, { "epoch": 2.3495088378156086, "grad_norm": 2.372792959213257, "learning_rate": 4.765050668073682e-05, "loss": 1.0477, "step": 151400 }, { "epoch": 2.3510606930585514, "grad_norm": 1.8132848739624023, "learning_rate": 4.764895482549388e-05, "loss": 1.0298, "step": 151500 }, { "epoch": 2.3526125483014946, "grad_norm": 3.2817459106445312, "learning_rate": 4.764740297025094e-05, "loss": 1.0396, "step": 151600 }, { "epoch": 2.3541644035444373, "grad_norm": 2.5971763134002686, "learning_rate": 4.7645851115007996e-05, "loss": 1.0474, "step": 151700 }, { "epoch": 2.35571625878738, "grad_norm": 1.8911468982696533, "learning_rate": 4.764429925976505e-05, "loss": 1.0416, "step": 151800 }, { "epoch": 2.3572681140303233, "grad_norm": 1.9852266311645508, "learning_rate": 4.764274740452211e-05, "loss": 1.0501, "step": 151900 }, { "epoch": 2.358819969273266, "grad_norm": 2.1220972537994385, "learning_rate": 4.764119554927917e-05, "loss": 1.0519, "step": 152000 }, { "epoch": 2.3603718245162093, "grad_norm": 1.8463906049728394, "learning_rate": 4.763964369403623e-05, "loss": 1.0459, "step": 152100 }, { "epoch": 2.361923679759152, "grad_norm": 2.333005905151367, "learning_rate": 4.763809183879328e-05, "loss": 1.0372, "step": 152200 }, { "epoch": 2.363475535002095, "grad_norm": 2.468670606613159, "learning_rate": 4.7636539983550335e-05, "loss": 1.0335, "step": 152300 }, { "epoch": 2.365027390245038, "grad_norm": 2.165095329284668, "learning_rate": 4.763498812830739e-05, "loss": 1.0566, "step": 152400 }, { "epoch": 2.366579245487981, "grad_norm": 2.281447410583496, "learning_rate": 4.763343627306445e-05, "loss": 1.0557, "step": 152500 }, { "epoch": 2.368131100730924, "grad_norm": 2.539764642715454, "learning_rate": 4.763188441782151e-05, "loss": 1.0475, "step": 152600 }, { "epoch": 2.369682955973867, "grad_norm": 2.510406017303467, "learning_rate": 4.7630332562578566e-05, "loss": 1.0271, "step": 152700 }, { "epoch": 2.3712348112168096, "grad_norm": 2.220874309539795, "learning_rate": 4.7628780707335624e-05, "loss": 1.0429, "step": 152800 }, { "epoch": 2.372786666459753, "grad_norm": 2.0885510444641113, "learning_rate": 4.762722885209268e-05, "loss": 1.0473, "step": 152900 }, { "epoch": 2.3743385217026955, "grad_norm": 2.1182026863098145, "learning_rate": 4.762567699684974e-05, "loss": 1.036, "step": 153000 }, { "epoch": 2.3758903769456383, "grad_norm": 2.2428579330444336, "learning_rate": 4.76241251416068e-05, "loss": 1.0657, "step": 153100 }, { "epoch": 2.3774422321885815, "grad_norm": 2.295353412628174, "learning_rate": 4.7622573286363855e-05, "loss": 1.0885, "step": 153200 }, { "epoch": 2.3789940874315243, "grad_norm": 2.3112423419952393, "learning_rate": 4.7621021431120906e-05, "loss": 1.0391, "step": 153300 }, { "epoch": 2.3805459426744675, "grad_norm": 2.634587049484253, "learning_rate": 4.7619469575877964e-05, "loss": 1.0579, "step": 153400 }, { "epoch": 2.3820977979174103, "grad_norm": 2.181894540786743, "learning_rate": 4.761791772063502e-05, "loss": 1.0518, "step": 153500 }, { "epoch": 2.383649653160353, "grad_norm": 2.0486185550689697, "learning_rate": 4.761636586539208e-05, "loss": 1.0723, "step": 153600 }, { "epoch": 2.3852015084032963, "grad_norm": 2.4686059951782227, "learning_rate": 4.761481401014913e-05, "loss": 1.0413, "step": 153700 }, { "epoch": 2.386753363646239, "grad_norm": 2.0938847064971924, "learning_rate": 4.761326215490619e-05, "loss": 1.0697, "step": 153800 }, { "epoch": 2.3883052188891822, "grad_norm": 2.0420076847076416, "learning_rate": 4.7611710299663246e-05, "loss": 1.0519, "step": 153900 }, { "epoch": 2.389857074132125, "grad_norm": 2.1430273056030273, "learning_rate": 4.7610158444420304e-05, "loss": 1.0558, "step": 154000 }, { "epoch": 2.3914089293750678, "grad_norm": 2.037324905395508, "learning_rate": 4.760860658917736e-05, "loss": 1.0506, "step": 154100 }, { "epoch": 2.392960784618011, "grad_norm": 2.1305556297302246, "learning_rate": 4.760705473393442e-05, "loss": 1.0267, "step": 154200 }, { "epoch": 2.3945126398609538, "grad_norm": 2.3603744506835938, "learning_rate": 4.760550287869148e-05, "loss": 1.033, "step": 154300 }, { "epoch": 2.3960644951038965, "grad_norm": 2.3444020748138428, "learning_rate": 4.7603951023448535e-05, "loss": 1.0543, "step": 154400 }, { "epoch": 2.3976163503468397, "grad_norm": 2.0857348442077637, "learning_rate": 4.760239916820559e-05, "loss": 1.0576, "step": 154500 }, { "epoch": 2.3991682055897825, "grad_norm": 2.5950815677642822, "learning_rate": 4.760084731296265e-05, "loss": 1.0375, "step": 154600 }, { "epoch": 2.4007200608327257, "grad_norm": 1.9735974073410034, "learning_rate": 4.759929545771971e-05, "loss": 1.0626, "step": 154700 }, { "epoch": 2.4022719160756685, "grad_norm": 1.823829174041748, "learning_rate": 4.7597743602476766e-05, "loss": 1.0564, "step": 154800 }, { "epoch": 2.4038237713186112, "grad_norm": 2.4093971252441406, "learning_rate": 4.759619174723382e-05, "loss": 1.0457, "step": 154900 }, { "epoch": 2.4053756265615545, "grad_norm": 2.218122720718384, "learning_rate": 4.7594639891990874e-05, "loss": 1.0778, "step": 155000 }, { "epoch": 2.4069274818044972, "grad_norm": 2.118809461593628, "learning_rate": 4.759308803674793e-05, "loss": 1.055, "step": 155100 }, { "epoch": 2.4084793370474404, "grad_norm": 2.1717329025268555, "learning_rate": 4.759153618150499e-05, "loss": 1.0446, "step": 155200 }, { "epoch": 2.410031192290383, "grad_norm": 2.3796184062957764, "learning_rate": 4.758998432626205e-05, "loss": 1.0475, "step": 155300 }, { "epoch": 2.411583047533326, "grad_norm": 2.2108540534973145, "learning_rate": 4.7588432471019105e-05, "loss": 1.0752, "step": 155400 }, { "epoch": 2.413134902776269, "grad_norm": 2.346567153930664, "learning_rate": 4.758688061577616e-05, "loss": 1.0578, "step": 155500 }, { "epoch": 2.414686758019212, "grad_norm": 2.4406721591949463, "learning_rate": 4.758532876053322e-05, "loss": 1.0492, "step": 155600 }, { "epoch": 2.4162386132621547, "grad_norm": 2.3978302478790283, "learning_rate": 4.758377690529028e-05, "loss": 1.041, "step": 155700 }, { "epoch": 2.417790468505098, "grad_norm": 2.165689468383789, "learning_rate": 4.7582225050047336e-05, "loss": 1.0386, "step": 155800 }, { "epoch": 2.4193423237480407, "grad_norm": 2.235287666320801, "learning_rate": 4.7580673194804394e-05, "loss": 1.0632, "step": 155900 }, { "epoch": 2.420894178990984, "grad_norm": 2.384352207183838, "learning_rate": 4.757912133956145e-05, "loss": 1.0468, "step": 156000 }, { "epoch": 2.4224460342339267, "grad_norm": 1.9341685771942139, "learning_rate": 4.757756948431851e-05, "loss": 1.0413, "step": 156100 }, { "epoch": 2.4239978894768694, "grad_norm": 2.594360113143921, "learning_rate": 4.757601762907557e-05, "loss": 1.0726, "step": 156200 }, { "epoch": 2.4255497447198127, "grad_norm": 2.6018567085266113, "learning_rate": 4.757446577383262e-05, "loss": 1.0286, "step": 156300 }, { "epoch": 2.4271015999627554, "grad_norm": 2.0682055950164795, "learning_rate": 4.7572913918589676e-05, "loss": 1.0471, "step": 156400 }, { "epoch": 2.4286534552056986, "grad_norm": 2.3083670139312744, "learning_rate": 4.7571362063346734e-05, "loss": 1.0546, "step": 156500 }, { "epoch": 2.4302053104486414, "grad_norm": 2.548862934112549, "learning_rate": 4.7569810208103785e-05, "loss": 1.0537, "step": 156600 }, { "epoch": 2.431757165691584, "grad_norm": 2.381843090057373, "learning_rate": 4.756825835286084e-05, "loss": 1.0489, "step": 156700 }, { "epoch": 2.4333090209345274, "grad_norm": 2.127727508544922, "learning_rate": 4.75667064976179e-05, "loss": 1.0771, "step": 156800 }, { "epoch": 2.43486087617747, "grad_norm": 2.370854139328003, "learning_rate": 4.756515464237496e-05, "loss": 1.048, "step": 156900 }, { "epoch": 2.436412731420413, "grad_norm": 2.1895341873168945, "learning_rate": 4.7563602787132016e-05, "loss": 1.0281, "step": 157000 }, { "epoch": 2.437964586663356, "grad_norm": 1.860305666923523, "learning_rate": 4.7562050931889074e-05, "loss": 1.0371, "step": 157100 }, { "epoch": 2.439516441906299, "grad_norm": 2.2114458084106445, "learning_rate": 4.756049907664613e-05, "loss": 1.0743, "step": 157200 }, { "epoch": 2.4410682971492417, "grad_norm": 2.7622485160827637, "learning_rate": 4.755894722140319e-05, "loss": 1.0435, "step": 157300 }, { "epoch": 2.442620152392185, "grad_norm": 2.377544403076172, "learning_rate": 4.755739536616025e-05, "loss": 1.0459, "step": 157400 }, { "epoch": 2.4441720076351277, "grad_norm": 2.1643855571746826, "learning_rate": 4.7555843510917305e-05, "loss": 1.0751, "step": 157500 }, { "epoch": 2.445723862878071, "grad_norm": 2.5202760696411133, "learning_rate": 4.755429165567436e-05, "loss": 1.0466, "step": 157600 }, { "epoch": 2.4472757181210136, "grad_norm": 2.090636730194092, "learning_rate": 4.755273980043142e-05, "loss": 1.0601, "step": 157700 }, { "epoch": 2.448827573363957, "grad_norm": 2.1124720573425293, "learning_rate": 4.755118794518848e-05, "loss": 1.0396, "step": 157800 }, { "epoch": 2.4503794286068996, "grad_norm": 2.507014274597168, "learning_rate": 4.754963608994553e-05, "loss": 1.0599, "step": 157900 }, { "epoch": 2.4519312838498424, "grad_norm": 2.0493221282958984, "learning_rate": 4.7548084234702587e-05, "loss": 1.0426, "step": 158000 }, { "epoch": 2.4534831390927856, "grad_norm": 2.2023959159851074, "learning_rate": 4.7546532379459644e-05, "loss": 1.0645, "step": 158100 }, { "epoch": 2.4550349943357284, "grad_norm": 2.1573829650878906, "learning_rate": 4.75449805242167e-05, "loss": 1.0651, "step": 158200 }, { "epoch": 2.456586849578671, "grad_norm": 2.179300308227539, "learning_rate": 4.754342866897376e-05, "loss": 1.0585, "step": 158300 }, { "epoch": 2.4581387048216143, "grad_norm": 2.278463363647461, "learning_rate": 4.754187681373082e-05, "loss": 1.0429, "step": 158400 }, { "epoch": 2.459690560064557, "grad_norm": 1.7499622106552124, "learning_rate": 4.7540324958487875e-05, "loss": 1.0415, "step": 158500 }, { "epoch": 2.4612424153075, "grad_norm": 2.3482213020324707, "learning_rate": 4.753877310324493e-05, "loss": 1.0441, "step": 158600 }, { "epoch": 2.462794270550443, "grad_norm": 2.2545645236968994, "learning_rate": 4.753722124800199e-05, "loss": 1.0574, "step": 158700 }, { "epoch": 2.464346125793386, "grad_norm": 2.5652778148651123, "learning_rate": 4.753566939275905e-05, "loss": 1.0747, "step": 158800 }, { "epoch": 2.465897981036329, "grad_norm": 2.000574827194214, "learning_rate": 4.7534117537516106e-05, "loss": 1.0419, "step": 158900 }, { "epoch": 2.467449836279272, "grad_norm": 2.224050998687744, "learning_rate": 4.7532565682273164e-05, "loss": 1.0384, "step": 159000 }, { "epoch": 2.469001691522215, "grad_norm": 2.237063407897949, "learning_rate": 4.753101382703022e-05, "loss": 1.0713, "step": 159100 }, { "epoch": 2.470553546765158, "grad_norm": 2.1370975971221924, "learning_rate": 4.752946197178727e-05, "loss": 1.0346, "step": 159200 }, { "epoch": 2.4721054020081006, "grad_norm": 2.3766438961029053, "learning_rate": 4.752791011654433e-05, "loss": 1.0487, "step": 159300 }, { "epoch": 2.473657257251044, "grad_norm": 2.182241678237915, "learning_rate": 4.752635826130139e-05, "loss": 1.0299, "step": 159400 }, { "epoch": 2.4752091124939866, "grad_norm": 2.0750985145568848, "learning_rate": 4.7524806406058446e-05, "loss": 1.0341, "step": 159500 }, { "epoch": 2.4767609677369293, "grad_norm": 2.2225234508514404, "learning_rate": 4.7523254550815504e-05, "loss": 1.0465, "step": 159600 }, { "epoch": 2.4783128229798725, "grad_norm": 2.247349500656128, "learning_rate": 4.752170269557256e-05, "loss": 1.0326, "step": 159700 }, { "epoch": 2.4798646782228153, "grad_norm": 2.508463144302368, "learning_rate": 4.752015084032961e-05, "loss": 1.072, "step": 159800 }, { "epoch": 2.481416533465758, "grad_norm": 2.166019916534424, "learning_rate": 4.751859898508667e-05, "loss": 1.042, "step": 159900 }, { "epoch": 2.4829683887087013, "grad_norm": 2.0317604541778564, "learning_rate": 4.751704712984373e-05, "loss": 1.0407, "step": 160000 }, { "epoch": 2.484520243951644, "grad_norm": 2.2698965072631836, "learning_rate": 4.7515495274600786e-05, "loss": 1.0577, "step": 160100 }, { "epoch": 2.4860720991945873, "grad_norm": 2.5224087238311768, "learning_rate": 4.7513943419357844e-05, "loss": 1.0455, "step": 160200 }, { "epoch": 2.48762395443753, "grad_norm": 2.213451385498047, "learning_rate": 4.75123915641149e-05, "loss": 1.0589, "step": 160300 }, { "epoch": 2.4891758096804733, "grad_norm": 2.409013509750366, "learning_rate": 4.751083970887196e-05, "loss": 1.0606, "step": 160400 }, { "epoch": 2.490727664923416, "grad_norm": 2.1013107299804688, "learning_rate": 4.750928785362902e-05, "loss": 1.0599, "step": 160500 }, { "epoch": 2.492279520166359, "grad_norm": 2.3335561752319336, "learning_rate": 4.7507735998386075e-05, "loss": 1.0314, "step": 160600 }, { "epoch": 2.493831375409302, "grad_norm": 2.0536320209503174, "learning_rate": 4.7506184143143126e-05, "loss": 1.0572, "step": 160700 }, { "epoch": 2.4953832306522448, "grad_norm": 2.5244200229644775, "learning_rate": 4.750463228790018e-05, "loss": 1.05, "step": 160800 }, { "epoch": 2.4969350858951875, "grad_norm": 1.8753924369812012, "learning_rate": 4.750308043265724e-05, "loss": 1.0463, "step": 160900 }, { "epoch": 2.4984869411381307, "grad_norm": 2.6177942752838135, "learning_rate": 4.75015285774143e-05, "loss": 1.0485, "step": 161000 }, { "epoch": 2.5000387963810735, "grad_norm": 2.049344301223755, "learning_rate": 4.7499976722171357e-05, "loss": 1.058, "step": 161100 }, { "epoch": 2.5015906516240163, "grad_norm": 2.446894407272339, "learning_rate": 4.7498424866928414e-05, "loss": 1.0801, "step": 161200 }, { "epoch": 2.5031425068669595, "grad_norm": 2.094968557357788, "learning_rate": 4.749687301168547e-05, "loss": 1.045, "step": 161300 }, { "epoch": 2.5046943621099023, "grad_norm": 2.4712958335876465, "learning_rate": 4.749532115644253e-05, "loss": 1.0631, "step": 161400 }, { "epoch": 2.5062462173528455, "grad_norm": 1.9819856882095337, "learning_rate": 4.749376930119959e-05, "loss": 1.0383, "step": 161500 }, { "epoch": 2.5077980725957882, "grad_norm": 2.4259374141693115, "learning_rate": 4.7492217445956645e-05, "loss": 1.0396, "step": 161600 }, { "epoch": 2.5093499278387315, "grad_norm": 2.312303304672241, "learning_rate": 4.74906655907137e-05, "loss": 1.064, "step": 161700 }, { "epoch": 2.510901783081674, "grad_norm": 2.351372718811035, "learning_rate": 4.748911373547076e-05, "loss": 1.0341, "step": 161800 }, { "epoch": 2.512453638324617, "grad_norm": 2.074159860610962, "learning_rate": 4.748756188022782e-05, "loss": 1.0159, "step": 161900 }, { "epoch": 2.51400549356756, "grad_norm": 2.592299461364746, "learning_rate": 4.748601002498487e-05, "loss": 1.0596, "step": 162000 }, { "epoch": 2.515557348810503, "grad_norm": 2.0001986026763916, "learning_rate": 4.748445816974193e-05, "loss": 1.0548, "step": 162100 }, { "epoch": 2.5171092040534457, "grad_norm": 1.8096919059753418, "learning_rate": 4.7482906314498985e-05, "loss": 1.0709, "step": 162200 }, { "epoch": 2.518661059296389, "grad_norm": 1.8577805757522583, "learning_rate": 4.748135445925604e-05, "loss": 1.0426, "step": 162300 }, { "epoch": 2.5202129145393317, "grad_norm": 2.1133625507354736, "learning_rate": 4.74798026040131e-05, "loss": 1.0508, "step": 162400 }, { "epoch": 2.5217647697822745, "grad_norm": 2.109609842300415, "learning_rate": 4.747825074877016e-05, "loss": 1.0408, "step": 162500 }, { "epoch": 2.5233166250252177, "grad_norm": 2.1927883625030518, "learning_rate": 4.7476698893527216e-05, "loss": 1.0313, "step": 162600 }, { "epoch": 2.5248684802681605, "grad_norm": 2.262117385864258, "learning_rate": 4.7475147038284274e-05, "loss": 1.043, "step": 162700 }, { "epoch": 2.5264203355111037, "grad_norm": 2.3188188076019287, "learning_rate": 4.747359518304133e-05, "loss": 1.039, "step": 162800 }, { "epoch": 2.5279721907540464, "grad_norm": 2.0471930503845215, "learning_rate": 4.747204332779839e-05, "loss": 1.0363, "step": 162900 }, { "epoch": 2.5295240459969897, "grad_norm": 1.9992185831069946, "learning_rate": 4.747049147255545e-05, "loss": 1.0326, "step": 163000 }, { "epoch": 2.5310759012399324, "grad_norm": 2.333908796310425, "learning_rate": 4.74689396173125e-05, "loss": 1.0544, "step": 163100 }, { "epoch": 2.532627756482875, "grad_norm": 1.9311968088150024, "learning_rate": 4.7467387762069556e-05, "loss": 1.0583, "step": 163200 }, { "epoch": 2.5341796117258184, "grad_norm": 2.388669490814209, "learning_rate": 4.7465835906826614e-05, "loss": 1.0747, "step": 163300 }, { "epoch": 2.535731466968761, "grad_norm": 2.2378740310668945, "learning_rate": 4.746428405158367e-05, "loss": 1.0475, "step": 163400 }, { "epoch": 2.537283322211704, "grad_norm": 2.5829851627349854, "learning_rate": 4.746273219634072e-05, "loss": 1.0354, "step": 163500 }, { "epoch": 2.538835177454647, "grad_norm": 2.3610262870788574, "learning_rate": 4.746118034109778e-05, "loss": 1.0336, "step": 163600 }, { "epoch": 2.54038703269759, "grad_norm": 2.523416757583618, "learning_rate": 4.745962848585484e-05, "loss": 1.0621, "step": 163700 }, { "epoch": 2.5419388879405327, "grad_norm": 2.299258232116699, "learning_rate": 4.7458076630611896e-05, "loss": 1.0601, "step": 163800 }, { "epoch": 2.543490743183476, "grad_norm": 2.02618408203125, "learning_rate": 4.745652477536895e-05, "loss": 1.0595, "step": 163900 }, { "epoch": 2.5450425984264187, "grad_norm": 2.161907196044922, "learning_rate": 4.745497292012601e-05, "loss": 1.0603, "step": 164000 }, { "epoch": 2.546594453669362, "grad_norm": 2.0180680751800537, "learning_rate": 4.745342106488307e-05, "loss": 1.0441, "step": 164100 }, { "epoch": 2.5481463089123046, "grad_norm": 1.9091253280639648, "learning_rate": 4.7451869209640127e-05, "loss": 1.0356, "step": 164200 }, { "epoch": 2.549698164155248, "grad_norm": 2.026906967163086, "learning_rate": 4.7450317354397184e-05, "loss": 1.04, "step": 164300 }, { "epoch": 2.5512500193981906, "grad_norm": 2.476252555847168, "learning_rate": 4.744876549915424e-05, "loss": 1.0475, "step": 164400 }, { "epoch": 2.5528018746411334, "grad_norm": 2.367262125015259, "learning_rate": 4.74472136439113e-05, "loss": 1.0256, "step": 164500 }, { "epoch": 2.5543537298840766, "grad_norm": 2.2345073223114014, "learning_rate": 4.744566178866836e-05, "loss": 1.0441, "step": 164600 }, { "epoch": 2.5559055851270194, "grad_norm": 2.453378438949585, "learning_rate": 4.7444109933425415e-05, "loss": 1.0463, "step": 164700 }, { "epoch": 2.557457440369962, "grad_norm": 2.4341559410095215, "learning_rate": 4.7442558078182466e-05, "loss": 1.0524, "step": 164800 }, { "epoch": 2.5590092956129054, "grad_norm": 2.466679573059082, "learning_rate": 4.7441006222939524e-05, "loss": 1.0519, "step": 164900 }, { "epoch": 2.560561150855848, "grad_norm": 2.243474006652832, "learning_rate": 4.743945436769658e-05, "loss": 1.0588, "step": 165000 }, { "epoch": 2.562113006098791, "grad_norm": 2.1142807006835938, "learning_rate": 4.743790251245364e-05, "loss": 1.0335, "step": 165100 }, { "epoch": 2.563664861341734, "grad_norm": 1.6868747472763062, "learning_rate": 4.74363506572107e-05, "loss": 1.0388, "step": 165200 }, { "epoch": 2.565216716584677, "grad_norm": 2.5993452072143555, "learning_rate": 4.7434798801967755e-05, "loss": 1.0413, "step": 165300 }, { "epoch": 2.56676857182762, "grad_norm": 2.2801432609558105, "learning_rate": 4.743324694672481e-05, "loss": 1.0637, "step": 165400 }, { "epoch": 2.568320427070563, "grad_norm": 2.533191442489624, "learning_rate": 4.743169509148187e-05, "loss": 1.0147, "step": 165500 }, { "epoch": 2.569872282313506, "grad_norm": 2.1488561630249023, "learning_rate": 4.743014323623893e-05, "loss": 1.0315, "step": 165600 }, { "epoch": 2.571424137556449, "grad_norm": 1.9489185810089111, "learning_rate": 4.7428591380995986e-05, "loss": 1.0396, "step": 165700 }, { "epoch": 2.5729759927993916, "grad_norm": 2.29132080078125, "learning_rate": 4.7427039525753044e-05, "loss": 1.0544, "step": 165800 }, { "epoch": 2.574527848042335, "grad_norm": 2.374694585800171, "learning_rate": 4.74254876705101e-05, "loss": 1.0409, "step": 165900 }, { "epoch": 2.5760797032852776, "grad_norm": 1.9349309206008911, "learning_rate": 4.742393581526716e-05, "loss": 1.0522, "step": 166000 }, { "epoch": 2.5776315585282203, "grad_norm": 2.0343072414398193, "learning_rate": 4.742238396002421e-05, "loss": 1.0376, "step": 166100 }, { "epoch": 2.5791834137711636, "grad_norm": 2.2302796840667725, "learning_rate": 4.742083210478127e-05, "loss": 1.0438, "step": 166200 }, { "epoch": 2.5807352690141063, "grad_norm": 2.174546241760254, "learning_rate": 4.7419280249538326e-05, "loss": 1.0515, "step": 166300 }, { "epoch": 2.582287124257049, "grad_norm": 2.080028533935547, "learning_rate": 4.741772839429538e-05, "loss": 1.0383, "step": 166400 }, { "epoch": 2.5838389794999923, "grad_norm": 1.830137014389038, "learning_rate": 4.7416176539052435e-05, "loss": 1.052, "step": 166500 }, { "epoch": 2.585390834742935, "grad_norm": 1.9905749559402466, "learning_rate": 4.741462468380949e-05, "loss": 1.0238, "step": 166600 }, { "epoch": 2.586942689985878, "grad_norm": 2.3765504360198975, "learning_rate": 4.741307282856655e-05, "loss": 1.0491, "step": 166700 }, { "epoch": 2.588494545228821, "grad_norm": 2.989565134048462, "learning_rate": 4.741152097332361e-05, "loss": 1.034, "step": 166800 }, { "epoch": 2.5900464004717643, "grad_norm": 2.3565328121185303, "learning_rate": 4.7409969118080666e-05, "loss": 1.0505, "step": 166900 }, { "epoch": 2.591598255714707, "grad_norm": 2.297006368637085, "learning_rate": 4.740841726283772e-05, "loss": 1.0521, "step": 167000 }, { "epoch": 2.59315011095765, "grad_norm": 3.0977842807769775, "learning_rate": 4.740686540759478e-05, "loss": 1.0333, "step": 167100 }, { "epoch": 2.594701966200593, "grad_norm": 2.3634941577911377, "learning_rate": 4.740531355235184e-05, "loss": 1.0339, "step": 167200 }, { "epoch": 2.596253821443536, "grad_norm": 2.169315814971924, "learning_rate": 4.7403761697108897e-05, "loss": 1.0569, "step": 167300 }, { "epoch": 2.5978056766864785, "grad_norm": 2.4711475372314453, "learning_rate": 4.7402209841865954e-05, "loss": 1.0451, "step": 167400 }, { "epoch": 2.5993575319294218, "grad_norm": 2.0106093883514404, "learning_rate": 4.740065798662301e-05, "loss": 1.0715, "step": 167500 }, { "epoch": 2.6009093871723645, "grad_norm": 2.1006646156311035, "learning_rate": 4.739910613138007e-05, "loss": 1.0461, "step": 167600 }, { "epoch": 2.6024612424153073, "grad_norm": 2.278226137161255, "learning_rate": 4.739755427613712e-05, "loss": 1.0325, "step": 167700 }, { "epoch": 2.6040130976582505, "grad_norm": 2.365501880645752, "learning_rate": 4.739600242089418e-05, "loss": 1.0643, "step": 167800 }, { "epoch": 2.6055649529011933, "grad_norm": 2.0916943550109863, "learning_rate": 4.7394450565651236e-05, "loss": 1.0432, "step": 167900 }, { "epoch": 2.607116808144136, "grad_norm": 2.1060738563537598, "learning_rate": 4.7392898710408294e-05, "loss": 1.0385, "step": 168000 }, { "epoch": 2.6086686633870793, "grad_norm": 2.485832691192627, "learning_rate": 4.739134685516535e-05, "loss": 1.0429, "step": 168100 }, { "epoch": 2.6102205186300225, "grad_norm": 2.1706671714782715, "learning_rate": 4.738979499992241e-05, "loss": 1.0489, "step": 168200 }, { "epoch": 2.6117723738729652, "grad_norm": 2.2199583053588867, "learning_rate": 4.738824314467947e-05, "loss": 1.0531, "step": 168300 }, { "epoch": 2.613324229115908, "grad_norm": 2.1484732627868652, "learning_rate": 4.7386691289436525e-05, "loss": 1.0306, "step": 168400 }, { "epoch": 2.614876084358851, "grad_norm": 2.219813823699951, "learning_rate": 4.738513943419358e-05, "loss": 1.0625, "step": 168500 }, { "epoch": 2.616427939601794, "grad_norm": 2.367612838745117, "learning_rate": 4.738358757895064e-05, "loss": 1.0371, "step": 168600 }, { "epoch": 2.6179797948447368, "grad_norm": 2.052858591079712, "learning_rate": 4.73820357237077e-05, "loss": 1.0409, "step": 168700 }, { "epoch": 2.61953165008768, "grad_norm": 2.7634003162384033, "learning_rate": 4.7380483868464756e-05, "loss": 1.0595, "step": 168800 }, { "epoch": 2.6210835053306227, "grad_norm": 2.206904172897339, "learning_rate": 4.7378932013221814e-05, "loss": 1.0602, "step": 168900 }, { "epoch": 2.6226353605735655, "grad_norm": 2.2186241149902344, "learning_rate": 4.7377380157978865e-05, "loss": 1.0545, "step": 169000 }, { "epoch": 2.6241872158165087, "grad_norm": 2.3548686504364014, "learning_rate": 4.737582830273592e-05, "loss": 1.034, "step": 169100 }, { "epoch": 2.6257390710594515, "grad_norm": 2.368831157684326, "learning_rate": 4.737427644749298e-05, "loss": 1.0459, "step": 169200 }, { "epoch": 2.6272909263023942, "grad_norm": 2.3844170570373535, "learning_rate": 4.737272459225004e-05, "loss": 1.0716, "step": 169300 }, { "epoch": 2.6288427815453375, "grad_norm": 2.102041482925415, "learning_rate": 4.7371172737007096e-05, "loss": 1.0371, "step": 169400 }, { "epoch": 2.6303946367882802, "grad_norm": 2.22416090965271, "learning_rate": 4.7369620881764154e-05, "loss": 1.0717, "step": 169500 }, { "epoch": 2.6319464920312234, "grad_norm": 2.3140690326690674, "learning_rate": 4.7368069026521205e-05, "loss": 1.0494, "step": 169600 }, { "epoch": 2.633498347274166, "grad_norm": 2.264110803604126, "learning_rate": 4.736651717127826e-05, "loss": 1.0289, "step": 169700 }, { "epoch": 2.6350502025171094, "grad_norm": 2.4174273014068604, "learning_rate": 4.736496531603532e-05, "loss": 1.0373, "step": 169800 }, { "epoch": 2.636602057760052, "grad_norm": 2.5184743404388428, "learning_rate": 4.736341346079238e-05, "loss": 1.037, "step": 169900 }, { "epoch": 2.638153913002995, "grad_norm": 2.5399415493011475, "learning_rate": 4.7361861605549436e-05, "loss": 1.0347, "step": 170000 }, { "epoch": 2.639705768245938, "grad_norm": 2.6229746341705322, "learning_rate": 4.736030975030649e-05, "loss": 1.045, "step": 170100 }, { "epoch": 2.641257623488881, "grad_norm": 2.6230993270874023, "learning_rate": 4.735875789506355e-05, "loss": 1.047, "step": 170200 }, { "epoch": 2.6428094787318237, "grad_norm": 2.6295454502105713, "learning_rate": 4.735720603982061e-05, "loss": 1.0492, "step": 170300 }, { "epoch": 2.644361333974767, "grad_norm": 2.217151165008545, "learning_rate": 4.7355654184577667e-05, "loss": 1.0374, "step": 170400 }, { "epoch": 2.6459131892177097, "grad_norm": 2.2779507637023926, "learning_rate": 4.735410232933472e-05, "loss": 1.0543, "step": 170500 }, { "epoch": 2.6474650444606525, "grad_norm": 2.2059710025787354, "learning_rate": 4.7352550474091775e-05, "loss": 1.0694, "step": 170600 }, { "epoch": 2.6490168997035957, "grad_norm": 2.124436378479004, "learning_rate": 4.735099861884883e-05, "loss": 1.034, "step": 170700 }, { "epoch": 2.6505687549465384, "grad_norm": 2.118906259536743, "learning_rate": 4.734944676360589e-05, "loss": 1.0401, "step": 170800 }, { "epoch": 2.6521206101894816, "grad_norm": 1.9342942237854004, "learning_rate": 4.734789490836295e-05, "loss": 1.0378, "step": 170900 }, { "epoch": 2.6536724654324244, "grad_norm": 2.6422057151794434, "learning_rate": 4.7346343053120006e-05, "loss": 1.0462, "step": 171000 }, { "epoch": 2.6552243206753676, "grad_norm": 2.120544195175171, "learning_rate": 4.7344791197877064e-05, "loss": 1.0325, "step": 171100 }, { "epoch": 2.6567761759183104, "grad_norm": 2.004960298538208, "learning_rate": 4.734323934263412e-05, "loss": 1.0694, "step": 171200 }, { "epoch": 2.658328031161253, "grad_norm": 2.4182827472686768, "learning_rate": 4.734168748739118e-05, "loss": 1.0352, "step": 171300 }, { "epoch": 2.6598798864041964, "grad_norm": 2.2767324447631836, "learning_rate": 4.734013563214824e-05, "loss": 1.0336, "step": 171400 }, { "epoch": 2.661431741647139, "grad_norm": 2.1682827472686768, "learning_rate": 4.7338583776905295e-05, "loss": 1.0514, "step": 171500 }, { "epoch": 2.662983596890082, "grad_norm": 2.4701051712036133, "learning_rate": 4.733703192166235e-05, "loss": 1.0517, "step": 171600 }, { "epoch": 2.664535452133025, "grad_norm": 2.312225103378296, "learning_rate": 4.733548006641941e-05, "loss": 1.0084, "step": 171700 }, { "epoch": 2.666087307375968, "grad_norm": 2.305318832397461, "learning_rate": 4.733392821117646e-05, "loss": 1.0604, "step": 171800 }, { "epoch": 2.6676391626189107, "grad_norm": 2.0694854259490967, "learning_rate": 4.733237635593352e-05, "loss": 1.0294, "step": 171900 }, { "epoch": 2.669191017861854, "grad_norm": 2.2506048679351807, "learning_rate": 4.733082450069058e-05, "loss": 1.0588, "step": 172000 }, { "epoch": 2.6707428731047966, "grad_norm": 2.3620123863220215, "learning_rate": 4.7329272645447635e-05, "loss": 1.0617, "step": 172100 }, { "epoch": 2.67229472834774, "grad_norm": 2.18591570854187, "learning_rate": 4.732772079020469e-05, "loss": 1.0646, "step": 172200 }, { "epoch": 2.6738465835906826, "grad_norm": 2.3200643062591553, "learning_rate": 4.732616893496175e-05, "loss": 1.0587, "step": 172300 }, { "epoch": 2.675398438833626, "grad_norm": 1.9267265796661377, "learning_rate": 4.732461707971881e-05, "loss": 1.0508, "step": 172400 }, { "epoch": 2.6769502940765686, "grad_norm": 2.3770933151245117, "learning_rate": 4.7323065224475866e-05, "loss": 1.0508, "step": 172500 }, { "epoch": 2.6785021493195114, "grad_norm": 2.2253923416137695, "learning_rate": 4.7321513369232924e-05, "loss": 1.0602, "step": 172600 }, { "epoch": 2.6800540045624546, "grad_norm": 2.6522648334503174, "learning_rate": 4.731996151398998e-05, "loss": 1.0377, "step": 172700 }, { "epoch": 2.6816058598053973, "grad_norm": 3.115670919418335, "learning_rate": 4.731840965874703e-05, "loss": 1.0315, "step": 172800 }, { "epoch": 2.68315771504834, "grad_norm": 2.218207597732544, "learning_rate": 4.731685780350409e-05, "loss": 1.0584, "step": 172900 }, { "epoch": 2.6847095702912833, "grad_norm": 2.2877306938171387, "learning_rate": 4.731530594826115e-05, "loss": 1.0474, "step": 173000 }, { "epoch": 2.686261425534226, "grad_norm": 2.023756504058838, "learning_rate": 4.7313754093018206e-05, "loss": 1.0345, "step": 173100 }, { "epoch": 2.687813280777169, "grad_norm": 2.463364362716675, "learning_rate": 4.731220223777526e-05, "loss": 1.0645, "step": 173200 }, { "epoch": 2.689365136020112, "grad_norm": 2.195937156677246, "learning_rate": 4.731065038253232e-05, "loss": 1.0433, "step": 173300 }, { "epoch": 2.690916991263055, "grad_norm": 2.1433589458465576, "learning_rate": 4.730909852728937e-05, "loss": 1.0198, "step": 173400 }, { "epoch": 2.692468846505998, "grad_norm": 2.3581550121307373, "learning_rate": 4.730754667204643e-05, "loss": 1.0508, "step": 173500 }, { "epoch": 2.694020701748941, "grad_norm": 2.3441619873046875, "learning_rate": 4.730599481680349e-05, "loss": 1.0425, "step": 173600 }, { "epoch": 2.695572556991884, "grad_norm": 2.695829153060913, "learning_rate": 4.7304442961560545e-05, "loss": 1.0162, "step": 173700 }, { "epoch": 2.697124412234827, "grad_norm": 2.4131481647491455, "learning_rate": 4.73028911063176e-05, "loss": 1.04, "step": 173800 }, { "epoch": 2.6986762674777696, "grad_norm": 2.350926160812378, "learning_rate": 4.730133925107466e-05, "loss": 1.0613, "step": 173900 }, { "epoch": 2.7002281227207128, "grad_norm": 2.3652498722076416, "learning_rate": 4.729978739583172e-05, "loss": 1.0539, "step": 174000 }, { "epoch": 2.7017799779636555, "grad_norm": 2.51181697845459, "learning_rate": 4.7298235540588776e-05, "loss": 1.0507, "step": 174100 }, { "epoch": 2.7033318332065983, "grad_norm": 2.526365041732788, "learning_rate": 4.7296683685345834e-05, "loss": 1.0507, "step": 174200 }, { "epoch": 2.7048836884495415, "grad_norm": 1.77544367313385, "learning_rate": 4.729513183010289e-05, "loss": 1.0542, "step": 174300 }, { "epoch": 2.7064355436924843, "grad_norm": 2.2926077842712402, "learning_rate": 4.729357997485995e-05, "loss": 1.0477, "step": 174400 }, { "epoch": 2.707987398935427, "grad_norm": 2.35168719291687, "learning_rate": 4.729202811961701e-05, "loss": 1.056, "step": 174500 }, { "epoch": 2.7095392541783703, "grad_norm": 1.8608139753341675, "learning_rate": 4.7290476264374065e-05, "loss": 1.0387, "step": 174600 }, { "epoch": 2.711091109421313, "grad_norm": 2.592602252960205, "learning_rate": 4.7288924409131116e-05, "loss": 1.0377, "step": 174700 }, { "epoch": 2.7126429646642563, "grad_norm": 2.336280345916748, "learning_rate": 4.7287372553888174e-05, "loss": 1.0474, "step": 174800 }, { "epoch": 2.714194819907199, "grad_norm": 1.7845664024353027, "learning_rate": 4.728582069864523e-05, "loss": 1.0581, "step": 174900 }, { "epoch": 2.7157466751501422, "grad_norm": 1.9279907941818237, "learning_rate": 4.728426884340229e-05, "loss": 1.0308, "step": 175000 }, { "epoch": 2.717298530393085, "grad_norm": 2.0942225456237793, "learning_rate": 4.728271698815935e-05, "loss": 1.0505, "step": 175100 }, { "epoch": 2.7188503856360278, "grad_norm": 2.2798030376434326, "learning_rate": 4.7281165132916405e-05, "loss": 1.056, "step": 175200 }, { "epoch": 2.720402240878971, "grad_norm": 2.1082921028137207, "learning_rate": 4.727961327767346e-05, "loss": 1.0372, "step": 175300 }, { "epoch": 2.7219540961219137, "grad_norm": 2.104358434677124, "learning_rate": 4.727806142243052e-05, "loss": 1.0314, "step": 175400 }, { "epoch": 2.7235059513648565, "grad_norm": 2.410003900527954, "learning_rate": 4.727650956718758e-05, "loss": 1.0316, "step": 175500 }, { "epoch": 2.7250578066077997, "grad_norm": 2.8375027179718018, "learning_rate": 4.7274957711944636e-05, "loss": 1.0782, "step": 175600 }, { "epoch": 2.7266096618507425, "grad_norm": 2.205787420272827, "learning_rate": 4.7273405856701694e-05, "loss": 1.0152, "step": 175700 }, { "epoch": 2.7281615170936853, "grad_norm": 2.4399054050445557, "learning_rate": 4.727185400145875e-05, "loss": 1.0432, "step": 175800 }, { "epoch": 2.7297133723366285, "grad_norm": 2.211919069290161, "learning_rate": 4.727030214621581e-05, "loss": 1.0566, "step": 175900 }, { "epoch": 2.7312652275795712, "grad_norm": 2.1625940799713135, "learning_rate": 4.726875029097286e-05, "loss": 1.0448, "step": 176000 }, { "epoch": 2.7328170828225145, "grad_norm": 2.0947794914245605, "learning_rate": 4.726719843572992e-05, "loss": 1.0334, "step": 176100 }, { "epoch": 2.734368938065457, "grad_norm": 2.0673089027404785, "learning_rate": 4.726564658048697e-05, "loss": 1.0504, "step": 176200 }, { "epoch": 2.7359207933084004, "grad_norm": 1.955876350402832, "learning_rate": 4.7264094725244027e-05, "loss": 1.0269, "step": 176300 }, { "epoch": 2.737472648551343, "grad_norm": 2.182792901992798, "learning_rate": 4.7262542870001084e-05, "loss": 1.0259, "step": 176400 }, { "epoch": 2.739024503794286, "grad_norm": 2.2036147117614746, "learning_rate": 4.726099101475814e-05, "loss": 1.0469, "step": 176500 }, { "epoch": 2.740576359037229, "grad_norm": 2.2865328788757324, "learning_rate": 4.72594391595152e-05, "loss": 1.0268, "step": 176600 }, { "epoch": 2.742128214280172, "grad_norm": 2.1760189533233643, "learning_rate": 4.725788730427226e-05, "loss": 1.0346, "step": 176700 }, { "epoch": 2.7436800695231147, "grad_norm": 2.034281015396118, "learning_rate": 4.7256335449029315e-05, "loss": 1.0414, "step": 176800 }, { "epoch": 2.745231924766058, "grad_norm": 2.272381544113159, "learning_rate": 4.725478359378637e-05, "loss": 1.0493, "step": 176900 }, { "epoch": 2.7467837800090007, "grad_norm": 2.412069082260132, "learning_rate": 4.725323173854343e-05, "loss": 1.0474, "step": 177000 }, { "epoch": 2.7483356352519435, "grad_norm": 2.2948646545410156, "learning_rate": 4.725167988330049e-05, "loss": 1.0282, "step": 177100 }, { "epoch": 2.7498874904948867, "grad_norm": 1.870371699333191, "learning_rate": 4.7250128028057546e-05, "loss": 1.0349, "step": 177200 }, { "epoch": 2.7514393457378294, "grad_norm": 2.3925862312316895, "learning_rate": 4.7248576172814604e-05, "loss": 1.0319, "step": 177300 }, { "epoch": 2.7529912009807727, "grad_norm": 2.6727235317230225, "learning_rate": 4.724702431757166e-05, "loss": 1.035, "step": 177400 }, { "epoch": 2.7545430562237154, "grad_norm": 2.3208634853363037, "learning_rate": 4.724547246232871e-05, "loss": 1.041, "step": 177500 }, { "epoch": 2.7560949114666586, "grad_norm": 2.103750705718994, "learning_rate": 4.724392060708577e-05, "loss": 1.0714, "step": 177600 }, { "epoch": 2.7576467667096014, "grad_norm": 2.2714037895202637, "learning_rate": 4.724236875184283e-05, "loss": 1.0486, "step": 177700 }, { "epoch": 2.759198621952544, "grad_norm": 2.135613441467285, "learning_rate": 4.7240816896599886e-05, "loss": 1.0447, "step": 177800 }, { "epoch": 2.7607504771954874, "grad_norm": 2.8759498596191406, "learning_rate": 4.7239265041356944e-05, "loss": 1.0645, "step": 177900 }, { "epoch": 2.76230233243843, "grad_norm": 2.096858024597168, "learning_rate": 4.7237713186114e-05, "loss": 1.0624, "step": 178000 }, { "epoch": 2.763854187681373, "grad_norm": 2.065045118331909, "learning_rate": 4.723616133087106e-05, "loss": 1.0235, "step": 178100 }, { "epoch": 2.765406042924316, "grad_norm": 2.4445083141326904, "learning_rate": 4.723460947562812e-05, "loss": 1.0471, "step": 178200 }, { "epoch": 2.766957898167259, "grad_norm": 2.600390911102295, "learning_rate": 4.7233057620385175e-05, "loss": 1.0201, "step": 178300 }, { "epoch": 2.7685097534102017, "grad_norm": 2.073873281478882, "learning_rate": 4.723150576514223e-05, "loss": 1.0305, "step": 178400 }, { "epoch": 2.770061608653145, "grad_norm": 1.985986590385437, "learning_rate": 4.722995390989929e-05, "loss": 1.0285, "step": 178500 }, { "epoch": 2.7716134638960876, "grad_norm": 2.2954366207122803, "learning_rate": 4.722840205465635e-05, "loss": 1.0419, "step": 178600 }, { "epoch": 2.773165319139031, "grad_norm": 2.4491689205169678, "learning_rate": 4.7226850199413406e-05, "loss": 1.0235, "step": 178700 }, { "epoch": 2.7747171743819736, "grad_norm": 2.285947799682617, "learning_rate": 4.722529834417046e-05, "loss": 1.0585, "step": 178800 }, { "epoch": 2.776269029624917, "grad_norm": 2.216391086578369, "learning_rate": 4.7223746488927515e-05, "loss": 1.0451, "step": 178900 }, { "epoch": 2.7778208848678596, "grad_norm": 2.3627991676330566, "learning_rate": 4.722219463368457e-05, "loss": 1.0553, "step": 179000 }, { "epoch": 2.7793727401108024, "grad_norm": 2.602961778640747, "learning_rate": 4.722064277844163e-05, "loss": 1.0311, "step": 179100 }, { "epoch": 2.7809245953537456, "grad_norm": 2.1210010051727295, "learning_rate": 4.721909092319869e-05, "loss": 1.029, "step": 179200 }, { "epoch": 2.7824764505966884, "grad_norm": 2.0719997882843018, "learning_rate": 4.7217539067955746e-05, "loss": 1.0385, "step": 179300 }, { "epoch": 2.784028305839631, "grad_norm": 2.9635632038116455, "learning_rate": 4.7215987212712797e-05, "loss": 1.0255, "step": 179400 }, { "epoch": 2.7855801610825743, "grad_norm": 2.32891583442688, "learning_rate": 4.7214435357469854e-05, "loss": 1.0463, "step": 179500 }, { "epoch": 2.787132016325517, "grad_norm": 1.9537440538406372, "learning_rate": 4.721288350222691e-05, "loss": 1.009, "step": 179600 }, { "epoch": 2.78868387156846, "grad_norm": 1.9633527994155884, "learning_rate": 4.721133164698397e-05, "loss": 1.0566, "step": 179700 }, { "epoch": 2.790235726811403, "grad_norm": 2.7678892612457275, "learning_rate": 4.720977979174103e-05, "loss": 1.0455, "step": 179800 }, { "epoch": 2.791787582054346, "grad_norm": 2.4441540241241455, "learning_rate": 4.7208227936498085e-05, "loss": 1.0414, "step": 179900 }, { "epoch": 2.793339437297289, "grad_norm": 2.1662371158599854, "learning_rate": 4.720667608125514e-05, "loss": 1.0222, "step": 180000 }, { "epoch": 2.794891292540232, "grad_norm": 1.887628197669983, "learning_rate": 4.72051242260122e-05, "loss": 1.0427, "step": 180100 }, { "epoch": 2.796443147783175, "grad_norm": 2.570028305053711, "learning_rate": 4.720357237076926e-05, "loss": 1.0462, "step": 180200 }, { "epoch": 2.797995003026118, "grad_norm": 1.7297128438949585, "learning_rate": 4.720202051552631e-05, "loss": 1.045, "step": 180300 }, { "epoch": 2.7995468582690606, "grad_norm": 2.0217483043670654, "learning_rate": 4.720046866028337e-05, "loss": 1.0563, "step": 180400 }, { "epoch": 2.801098713512004, "grad_norm": 2.2368738651275635, "learning_rate": 4.7198916805040425e-05, "loss": 1.0444, "step": 180500 }, { "epoch": 2.8026505687549466, "grad_norm": 2.3537797927856445, "learning_rate": 4.719736494979748e-05, "loss": 1.0314, "step": 180600 }, { "epoch": 2.8042024239978893, "grad_norm": 2.481429100036621, "learning_rate": 4.719581309455454e-05, "loss": 1.0197, "step": 180700 }, { "epoch": 2.8057542792408325, "grad_norm": 2.388336181640625, "learning_rate": 4.71942612393116e-05, "loss": 1.0475, "step": 180800 }, { "epoch": 2.8073061344837753, "grad_norm": 2.736994504928589, "learning_rate": 4.7192709384068656e-05, "loss": 1.0312, "step": 180900 }, { "epoch": 2.808857989726718, "grad_norm": 2.5262064933776855, "learning_rate": 4.7191157528825714e-05, "loss": 1.0531, "step": 181000 }, { "epoch": 2.8104098449696613, "grad_norm": 1.9915307760238647, "learning_rate": 4.718960567358277e-05, "loss": 1.0462, "step": 181100 }, { "epoch": 2.811961700212604, "grad_norm": 2.344263792037964, "learning_rate": 4.718805381833983e-05, "loss": 1.0091, "step": 181200 }, { "epoch": 2.8135135554555473, "grad_norm": 2.3820552825927734, "learning_rate": 4.718650196309689e-05, "loss": 1.0361, "step": 181300 }, { "epoch": 2.81506541069849, "grad_norm": 2.1658577919006348, "learning_rate": 4.7184950107853945e-05, "loss": 1.0436, "step": 181400 }, { "epoch": 2.8166172659414332, "grad_norm": 2.4356305599212646, "learning_rate": 4.7183398252611e-05, "loss": 1.035, "step": 181500 }, { "epoch": 2.818169121184376, "grad_norm": 2.114675760269165, "learning_rate": 4.7181846397368054e-05, "loss": 1.0465, "step": 181600 }, { "epoch": 2.819720976427319, "grad_norm": 2.0706875324249268, "learning_rate": 4.718029454212511e-05, "loss": 1.057, "step": 181700 }, { "epoch": 2.821272831670262, "grad_norm": 2.4009530544281006, "learning_rate": 4.717874268688217e-05, "loss": 1.0285, "step": 181800 }, { "epoch": 2.8228246869132048, "grad_norm": 2.498262643814087, "learning_rate": 4.717719083163923e-05, "loss": 1.036, "step": 181900 }, { "epoch": 2.8243765421561475, "grad_norm": 2.347062587738037, "learning_rate": 4.7175638976396285e-05, "loss": 1.0438, "step": 182000 }, { "epoch": 2.8259283973990907, "grad_norm": 2.0232503414154053, "learning_rate": 4.717408712115334e-05, "loss": 1.0382, "step": 182100 }, { "epoch": 2.8274802526420335, "grad_norm": 2.401533842086792, "learning_rate": 4.71725352659104e-05, "loss": 1.0261, "step": 182200 }, { "epoch": 2.8290321078849763, "grad_norm": 1.9756814241409302, "learning_rate": 4.717098341066746e-05, "loss": 1.0485, "step": 182300 }, { "epoch": 2.8305839631279195, "grad_norm": 2.5054469108581543, "learning_rate": 4.7169431555424516e-05, "loss": 1.0553, "step": 182400 }, { "epoch": 2.8321358183708623, "grad_norm": 2.3585667610168457, "learning_rate": 4.716787970018157e-05, "loss": 1.0382, "step": 182500 }, { "epoch": 2.8336876736138055, "grad_norm": 1.8390778303146362, "learning_rate": 4.7166327844938624e-05, "loss": 1.0374, "step": 182600 }, { "epoch": 2.8352395288567482, "grad_norm": 2.443948745727539, "learning_rate": 4.716477598969568e-05, "loss": 1.026, "step": 182700 }, { "epoch": 2.8367913840996914, "grad_norm": 2.17437744140625, "learning_rate": 4.716322413445274e-05, "loss": 1.0449, "step": 182800 }, { "epoch": 2.838343239342634, "grad_norm": 2.456134557723999, "learning_rate": 4.71616722792098e-05, "loss": 1.0381, "step": 182900 }, { "epoch": 2.839895094585577, "grad_norm": 1.9899870157241821, "learning_rate": 4.7160120423966855e-05, "loss": 1.019, "step": 183000 }, { "epoch": 2.84144694982852, "grad_norm": 2.103511333465576, "learning_rate": 4.715856856872391e-05, "loss": 1.035, "step": 183100 }, { "epoch": 2.842998805071463, "grad_norm": 2.0619964599609375, "learning_rate": 4.7157016713480964e-05, "loss": 1.0415, "step": 183200 }, { "epoch": 2.8445506603144057, "grad_norm": 2.082120656967163, "learning_rate": 4.715546485823802e-05, "loss": 1.0491, "step": 183300 }, { "epoch": 2.846102515557349, "grad_norm": 2.3282582759857178, "learning_rate": 4.715391300299508e-05, "loss": 1.053, "step": 183400 }, { "epoch": 2.8476543708002917, "grad_norm": 2.5951757431030273, "learning_rate": 4.715236114775214e-05, "loss": 1.038, "step": 183500 }, { "epoch": 2.8492062260432345, "grad_norm": 2.437112331390381, "learning_rate": 4.7150809292509195e-05, "loss": 1.0142, "step": 183600 }, { "epoch": 2.8507580812861777, "grad_norm": 2.447910785675049, "learning_rate": 4.714925743726625e-05, "loss": 1.0344, "step": 183700 }, { "epoch": 2.8523099365291205, "grad_norm": 2.774915933609009, "learning_rate": 4.714770558202331e-05, "loss": 1.0377, "step": 183800 }, { "epoch": 2.8538617917720632, "grad_norm": 1.8265098333358765, "learning_rate": 4.714615372678037e-05, "loss": 1.0396, "step": 183900 }, { "epoch": 2.8554136470150064, "grad_norm": 2.375612497329712, "learning_rate": 4.7144601871537426e-05, "loss": 1.0392, "step": 184000 }, { "epoch": 2.8569655022579497, "grad_norm": 2.156926155090332, "learning_rate": 4.7143050016294484e-05, "loss": 1.0469, "step": 184100 }, { "epoch": 2.8585173575008924, "grad_norm": 2.006354331970215, "learning_rate": 4.714149816105154e-05, "loss": 1.0334, "step": 184200 }, { "epoch": 2.860069212743835, "grad_norm": 2.555560827255249, "learning_rate": 4.71399463058086e-05, "loss": 1.0473, "step": 184300 }, { "epoch": 2.8616210679867784, "grad_norm": 2.1230671405792236, "learning_rate": 4.713839445056566e-05, "loss": 1.0402, "step": 184400 }, { "epoch": 2.863172923229721, "grad_norm": 2.0263772010803223, "learning_rate": 4.713684259532271e-05, "loss": 1.031, "step": 184500 }, { "epoch": 2.864724778472664, "grad_norm": 1.9387094974517822, "learning_rate": 4.7135290740079766e-05, "loss": 1.0022, "step": 184600 }, { "epoch": 2.866276633715607, "grad_norm": 2.5151021480560303, "learning_rate": 4.7133738884836824e-05, "loss": 1.0422, "step": 184700 }, { "epoch": 2.86782848895855, "grad_norm": 1.9107468128204346, "learning_rate": 4.713218702959388e-05, "loss": 1.0337, "step": 184800 }, { "epoch": 2.8693803442014927, "grad_norm": 2.1471190452575684, "learning_rate": 4.713063517435094e-05, "loss": 1.0074, "step": 184900 }, { "epoch": 2.870932199444436, "grad_norm": 2.553988456726074, "learning_rate": 4.7129083319108e-05, "loss": 1.0299, "step": 185000 }, { "epoch": 2.8724840546873787, "grad_norm": 2.2979228496551514, "learning_rate": 4.7127531463865055e-05, "loss": 1.036, "step": 185100 }, { "epoch": 2.8740359099303214, "grad_norm": 2.090477466583252, "learning_rate": 4.712597960862211e-05, "loss": 1.025, "step": 185200 }, { "epoch": 2.8755877651732646, "grad_norm": 2.046459913253784, "learning_rate": 4.712442775337917e-05, "loss": 1.0495, "step": 185300 }, { "epoch": 2.8771396204162074, "grad_norm": 2.2629613876342773, "learning_rate": 4.712287589813623e-05, "loss": 1.0419, "step": 185400 }, { "epoch": 2.8786914756591506, "grad_norm": 1.9247848987579346, "learning_rate": 4.7121324042893286e-05, "loss": 1.018, "step": 185500 }, { "epoch": 2.8802433309020934, "grad_norm": 2.1730329990386963, "learning_rate": 4.711977218765034e-05, "loss": 1.0186, "step": 185600 }, { "epoch": 2.8817951861450366, "grad_norm": 2.3092572689056396, "learning_rate": 4.71182203324074e-05, "loss": 1.0235, "step": 185700 }, { "epoch": 2.8833470413879794, "grad_norm": 2.4735634326934814, "learning_rate": 4.711666847716445e-05, "loss": 1.0264, "step": 185800 }, { "epoch": 2.884898896630922, "grad_norm": 1.7579586505889893, "learning_rate": 4.711511662192151e-05, "loss": 1.0437, "step": 185900 }, { "epoch": 2.8864507518738654, "grad_norm": 2.1527419090270996, "learning_rate": 4.711356476667856e-05, "loss": 1.0413, "step": 186000 }, { "epoch": 2.888002607116808, "grad_norm": 2.3001890182495117, "learning_rate": 4.711201291143562e-05, "loss": 1.0335, "step": 186100 }, { "epoch": 2.889554462359751, "grad_norm": 3.010706663131714, "learning_rate": 4.7110461056192676e-05, "loss": 1.0345, "step": 186200 }, { "epoch": 2.891106317602694, "grad_norm": 2.3039863109588623, "learning_rate": 4.7108909200949734e-05, "loss": 1.0476, "step": 186300 }, { "epoch": 2.892658172845637, "grad_norm": 2.2111706733703613, "learning_rate": 4.710735734570679e-05, "loss": 1.0293, "step": 186400 }, { "epoch": 2.8942100280885796, "grad_norm": 2.36498761177063, "learning_rate": 4.710580549046385e-05, "loss": 1.031, "step": 186500 }, { "epoch": 2.895761883331523, "grad_norm": 1.7939813137054443, "learning_rate": 4.710425363522091e-05, "loss": 1.0265, "step": 186600 }, { "epoch": 2.8973137385744656, "grad_norm": 2.330825090408325, "learning_rate": 4.7102701779977965e-05, "loss": 1.0625, "step": 186700 }, { "epoch": 2.898865593817409, "grad_norm": 2.151470184326172, "learning_rate": 4.710114992473502e-05, "loss": 1.0237, "step": 186800 }, { "epoch": 2.9004174490603516, "grad_norm": 2.305598258972168, "learning_rate": 4.709959806949208e-05, "loss": 1.0594, "step": 186900 }, { "epoch": 2.901969304303295, "grad_norm": 2.183980941772461, "learning_rate": 4.709804621424914e-05, "loss": 1.0406, "step": 187000 }, { "epoch": 2.9035211595462376, "grad_norm": 2.6724209785461426, "learning_rate": 4.7096494359006196e-05, "loss": 1.0574, "step": 187100 }, { "epoch": 2.9050730147891803, "grad_norm": 2.159895181655884, "learning_rate": 4.7094942503763254e-05, "loss": 1.0236, "step": 187200 }, { "epoch": 2.9066248700321236, "grad_norm": 2.602503538131714, "learning_rate": 4.7093390648520305e-05, "loss": 1.0191, "step": 187300 }, { "epoch": 2.9081767252750663, "grad_norm": 2.2885754108428955, "learning_rate": 4.709183879327736e-05, "loss": 1.0375, "step": 187400 }, { "epoch": 2.909728580518009, "grad_norm": 2.3764402866363525, "learning_rate": 4.709028693803442e-05, "loss": 1.0421, "step": 187500 }, { "epoch": 2.9112804357609523, "grad_norm": 2.0926246643066406, "learning_rate": 4.708873508279148e-05, "loss": 1.0243, "step": 187600 }, { "epoch": 2.912832291003895, "grad_norm": 2.254619836807251, "learning_rate": 4.7087183227548536e-05, "loss": 1.0133, "step": 187700 }, { "epoch": 2.914384146246838, "grad_norm": 2.3661041259765625, "learning_rate": 4.7085631372305594e-05, "loss": 1.0287, "step": 187800 }, { "epoch": 2.915936001489781, "grad_norm": 2.4754600524902344, "learning_rate": 4.708407951706265e-05, "loss": 1.0391, "step": 187900 }, { "epoch": 2.917487856732724, "grad_norm": 2.4904284477233887, "learning_rate": 4.708252766181971e-05, "loss": 1.0278, "step": 188000 }, { "epoch": 2.919039711975667, "grad_norm": 2.2915358543395996, "learning_rate": 4.708097580657677e-05, "loss": 1.0371, "step": 188100 }, { "epoch": 2.92059156721861, "grad_norm": 2.5260331630706787, "learning_rate": 4.7079423951333825e-05, "loss": 1.0438, "step": 188200 }, { "epoch": 2.922143422461553, "grad_norm": 2.7706286907196045, "learning_rate": 4.707787209609088e-05, "loss": 1.0525, "step": 188300 }, { "epoch": 2.9236952777044958, "grad_norm": 2.768744945526123, "learning_rate": 4.707632024084794e-05, "loss": 1.0359, "step": 188400 }, { "epoch": 2.9252471329474385, "grad_norm": 2.4445366859436035, "learning_rate": 4.7074768385605e-05, "loss": 1.0659, "step": 188500 }, { "epoch": 2.9267989881903818, "grad_norm": 2.203753709793091, "learning_rate": 4.707321653036205e-05, "loss": 1.0294, "step": 188600 }, { "epoch": 2.9283508434333245, "grad_norm": 2.6416361331939697, "learning_rate": 4.7071664675119107e-05, "loss": 1.0164, "step": 188700 }, { "epoch": 2.9299026986762673, "grad_norm": 2.0499093532562256, "learning_rate": 4.7070112819876164e-05, "loss": 1.0113, "step": 188800 }, { "epoch": 2.9314545539192105, "grad_norm": 2.1293187141418457, "learning_rate": 4.706856096463322e-05, "loss": 1.0468, "step": 188900 }, { "epoch": 2.9330064091621533, "grad_norm": 2.1990184783935547, "learning_rate": 4.706700910939028e-05, "loss": 1.0318, "step": 189000 }, { "epoch": 2.934558264405096, "grad_norm": 2.421170473098755, "learning_rate": 4.706545725414733e-05, "loss": 1.0362, "step": 189100 }, { "epoch": 2.9361101196480393, "grad_norm": 1.8146953582763672, "learning_rate": 4.706390539890439e-05, "loss": 1.0267, "step": 189200 }, { "epoch": 2.937661974890982, "grad_norm": 2.159708023071289, "learning_rate": 4.7062353543661446e-05, "loss": 1.0154, "step": 189300 }, { "epoch": 2.9392138301339252, "grad_norm": 2.5605711936950684, "learning_rate": 4.7060801688418504e-05, "loss": 1.0293, "step": 189400 }, { "epoch": 2.940765685376868, "grad_norm": 2.3768041133880615, "learning_rate": 4.705924983317556e-05, "loss": 1.019, "step": 189500 }, { "epoch": 2.942317540619811, "grad_norm": 2.371068000793457, "learning_rate": 4.705769797793262e-05, "loss": 1.0269, "step": 189600 }, { "epoch": 2.943869395862754, "grad_norm": 2.5876991748809814, "learning_rate": 4.705614612268968e-05, "loss": 1.0401, "step": 189700 }, { "epoch": 2.9454212511056967, "grad_norm": 2.299680233001709, "learning_rate": 4.7054594267446735e-05, "loss": 1.041, "step": 189800 }, { "epoch": 2.94697310634864, "grad_norm": 2.342620372772217, "learning_rate": 4.705304241220379e-05, "loss": 1.0326, "step": 189900 }, { "epoch": 2.9485249615915827, "grad_norm": 2.1082839965820312, "learning_rate": 4.705149055696085e-05, "loss": 1.0386, "step": 190000 }, { "epoch": 2.9500768168345255, "grad_norm": 1.9586697816848755, "learning_rate": 4.704993870171791e-05, "loss": 1.0479, "step": 190100 }, { "epoch": 2.9516286720774687, "grad_norm": 2.053753137588501, "learning_rate": 4.704838684647496e-05, "loss": 1.0584, "step": 190200 }, { "epoch": 2.9531805273204115, "grad_norm": 2.466909885406494, "learning_rate": 4.704683499123202e-05, "loss": 1.0305, "step": 190300 }, { "epoch": 2.9547323825633542, "grad_norm": 1.973644733428955, "learning_rate": 4.7045283135989075e-05, "loss": 1.0163, "step": 190400 }, { "epoch": 2.9562842378062975, "grad_norm": 2.056447982788086, "learning_rate": 4.704373128074613e-05, "loss": 1.0215, "step": 190500 }, { "epoch": 2.9578360930492402, "grad_norm": 2.1640326976776123, "learning_rate": 4.704217942550319e-05, "loss": 1.0026, "step": 190600 }, { "epoch": 2.9593879482921834, "grad_norm": 2.4542548656463623, "learning_rate": 4.704062757026025e-05, "loss": 1.0527, "step": 190700 }, { "epoch": 2.960939803535126, "grad_norm": 2.8114395141601562, "learning_rate": 4.7039075715017306e-05, "loss": 1.0339, "step": 190800 }, { "epoch": 2.9624916587780694, "grad_norm": 1.7165937423706055, "learning_rate": 4.7037523859774364e-05, "loss": 1.0486, "step": 190900 }, { "epoch": 2.964043514021012, "grad_norm": 1.8165018558502197, "learning_rate": 4.703597200453142e-05, "loss": 1.0484, "step": 191000 }, { "epoch": 2.965595369263955, "grad_norm": 2.210017681121826, "learning_rate": 4.703442014928848e-05, "loss": 1.0208, "step": 191100 }, { "epoch": 2.967147224506898, "grad_norm": 1.9644007682800293, "learning_rate": 4.703286829404554e-05, "loss": 1.03, "step": 191200 }, { "epoch": 2.968699079749841, "grad_norm": 2.1285908222198486, "learning_rate": 4.7031316438802595e-05, "loss": 1.0425, "step": 191300 }, { "epoch": 2.9702509349927837, "grad_norm": 2.1437644958496094, "learning_rate": 4.702976458355965e-05, "loss": 1.0337, "step": 191400 }, { "epoch": 2.971802790235727, "grad_norm": 2.3303661346435547, "learning_rate": 4.70282127283167e-05, "loss": 1.0239, "step": 191500 }, { "epoch": 2.9733546454786697, "grad_norm": 2.347890615463257, "learning_rate": 4.702666087307376e-05, "loss": 1.0488, "step": 191600 }, { "epoch": 2.9749065007216124, "grad_norm": 2.1174182891845703, "learning_rate": 4.702510901783082e-05, "loss": 1.0281, "step": 191700 }, { "epoch": 2.9764583559645557, "grad_norm": 2.1659739017486572, "learning_rate": 4.7023557162587877e-05, "loss": 1.026, "step": 191800 }, { "epoch": 2.9780102112074984, "grad_norm": 2.4842588901519775, "learning_rate": 4.7022005307344934e-05, "loss": 1.0445, "step": 191900 }, { "epoch": 2.9795620664504416, "grad_norm": 2.3452415466308594, "learning_rate": 4.702045345210199e-05, "loss": 1.037, "step": 192000 }, { "epoch": 2.9811139216933844, "grad_norm": 1.9928044080734253, "learning_rate": 4.701890159685905e-05, "loss": 1.0123, "step": 192100 }, { "epoch": 2.9826657769363276, "grad_norm": 2.5669846534729004, "learning_rate": 4.701734974161611e-05, "loss": 1.0552, "step": 192200 }, { "epoch": 2.9842176321792704, "grad_norm": 2.2527923583984375, "learning_rate": 4.7015797886373165e-05, "loss": 1.0273, "step": 192300 }, { "epoch": 2.985769487422213, "grad_norm": 2.880582094192505, "learning_rate": 4.7014246031130216e-05, "loss": 1.0294, "step": 192400 }, { "epoch": 2.9873213426651564, "grad_norm": 2.2426044940948486, "learning_rate": 4.7012694175887274e-05, "loss": 1.0365, "step": 192500 }, { "epoch": 2.988873197908099, "grad_norm": 2.2233119010925293, "learning_rate": 4.701114232064433e-05, "loss": 1.0293, "step": 192600 }, { "epoch": 2.990425053151042, "grad_norm": 2.3352088928222656, "learning_rate": 4.700959046540139e-05, "loss": 1.0327, "step": 192700 }, { "epoch": 2.991976908393985, "grad_norm": 2.204885482788086, "learning_rate": 4.700803861015845e-05, "loss": 1.0384, "step": 192800 }, { "epoch": 2.993528763636928, "grad_norm": 2.6361398696899414, "learning_rate": 4.7006486754915505e-05, "loss": 1.0477, "step": 192900 }, { "epoch": 2.9950806188798706, "grad_norm": 1.9669020175933838, "learning_rate": 4.7004934899672556e-05, "loss": 1.0282, "step": 193000 }, { "epoch": 2.996632474122814, "grad_norm": 2.1864845752716064, "learning_rate": 4.7003383044429614e-05, "loss": 1.0192, "step": 193100 }, { "epoch": 2.9981843293657566, "grad_norm": 2.3880720138549805, "learning_rate": 4.700183118918667e-05, "loss": 1.0402, "step": 193200 }, { "epoch": 2.9997361846087, "grad_norm": 2.177459478378296, "learning_rate": 4.700027933394373e-05, "loss": 1.0148, "step": 193300 }, { "epoch": 3.0012880398516426, "grad_norm": 2.38643217086792, "learning_rate": 4.699872747870079e-05, "loss": 1.0377, "step": 193400 }, { "epoch": 3.0028398950945854, "grad_norm": 2.6639795303344727, "learning_rate": 4.6997175623457845e-05, "loss": 1.0563, "step": 193500 }, { "epoch": 3.0043917503375286, "grad_norm": 2.132826328277588, "learning_rate": 4.69956237682149e-05, "loss": 1.0268, "step": 193600 }, { "epoch": 3.0059436055804714, "grad_norm": 2.4893951416015625, "learning_rate": 4.699407191297196e-05, "loss": 1.0565, "step": 193700 }, { "epoch": 3.0074954608234146, "grad_norm": 2.316396474838257, "learning_rate": 4.699252005772902e-05, "loss": 1.0274, "step": 193800 }, { "epoch": 3.0090473160663573, "grad_norm": 2.286466360092163, "learning_rate": 4.6990968202486076e-05, "loss": 1.036, "step": 193900 }, { "epoch": 3.0105991713093, "grad_norm": 2.2694833278656006, "learning_rate": 4.6989416347243134e-05, "loss": 1.025, "step": 194000 }, { "epoch": 3.0121510265522433, "grad_norm": 2.5719926357269287, "learning_rate": 4.698786449200019e-05, "loss": 1.0261, "step": 194100 }, { "epoch": 3.013702881795186, "grad_norm": 5.869794845581055, "learning_rate": 4.698631263675725e-05, "loss": 1.0307, "step": 194200 }, { "epoch": 3.0152547370381293, "grad_norm": 2.239471912384033, "learning_rate": 4.69847607815143e-05, "loss": 1.0107, "step": 194300 }, { "epoch": 3.016806592281072, "grad_norm": 2.3318614959716797, "learning_rate": 4.698320892627136e-05, "loss": 1.0316, "step": 194400 }, { "epoch": 3.018358447524015, "grad_norm": 2.550088882446289, "learning_rate": 4.6981657071028416e-05, "loss": 1.0412, "step": 194500 }, { "epoch": 3.019910302766958, "grad_norm": 2.599668502807617, "learning_rate": 4.698010521578547e-05, "loss": 1.0343, "step": 194600 }, { "epoch": 3.021462158009901, "grad_norm": 2.2850258350372314, "learning_rate": 4.697855336054253e-05, "loss": 1.0558, "step": 194700 }, { "epoch": 3.0230140132528436, "grad_norm": 2.204380512237549, "learning_rate": 4.697700150529959e-05, "loss": 1.0128, "step": 194800 }, { "epoch": 3.024565868495787, "grad_norm": 2.282245397567749, "learning_rate": 4.6975449650056647e-05, "loss": 1.0511, "step": 194900 }, { "epoch": 3.0261177237387296, "grad_norm": 2.1987249851226807, "learning_rate": 4.6973897794813704e-05, "loss": 1.0328, "step": 195000 }, { "epoch": 3.0276695789816728, "grad_norm": 2.3703179359436035, "learning_rate": 4.697234593957076e-05, "loss": 1.031, "step": 195100 }, { "epoch": 3.0292214342246155, "grad_norm": 2.0956859588623047, "learning_rate": 4.697079408432782e-05, "loss": 1.0068, "step": 195200 }, { "epoch": 3.0307732894675583, "grad_norm": 2.2914812564849854, "learning_rate": 4.696924222908488e-05, "loss": 1.0327, "step": 195300 }, { "epoch": 3.0323251447105015, "grad_norm": 2.428166627883911, "learning_rate": 4.6967690373841935e-05, "loss": 1.0435, "step": 195400 }, { "epoch": 3.0338769999534443, "grad_norm": 2.140897750854492, "learning_rate": 4.696613851859899e-05, "loss": 1.041, "step": 195500 }, { "epoch": 3.0354288551963875, "grad_norm": 2.000777244567871, "learning_rate": 4.6964586663356044e-05, "loss": 1.0155, "step": 195600 }, { "epoch": 3.0369807104393303, "grad_norm": 2.3136231899261475, "learning_rate": 4.69630348081131e-05, "loss": 1.0425, "step": 195700 }, { "epoch": 3.038532565682273, "grad_norm": 2.113478183746338, "learning_rate": 4.696148295287015e-05, "loss": 1.0389, "step": 195800 }, { "epoch": 3.0400844209252162, "grad_norm": 2.2906436920166016, "learning_rate": 4.695993109762721e-05, "loss": 1.0313, "step": 195900 }, { "epoch": 3.041636276168159, "grad_norm": 2.1110403537750244, "learning_rate": 4.695837924238427e-05, "loss": 1.028, "step": 196000 }, { "epoch": 3.043188131411102, "grad_norm": 2.0373849868774414, "learning_rate": 4.6956827387141326e-05, "loss": 1.0144, "step": 196100 }, { "epoch": 3.044739986654045, "grad_norm": 2.5317904949188232, "learning_rate": 4.6955275531898384e-05, "loss": 1.0362, "step": 196200 }, { "epoch": 3.0462918418969878, "grad_norm": 1.8818609714508057, "learning_rate": 4.695372367665544e-05, "loss": 1.0294, "step": 196300 }, { "epoch": 3.047843697139931, "grad_norm": 2.0409302711486816, "learning_rate": 4.69521718214125e-05, "loss": 1.0188, "step": 196400 }, { "epoch": 3.0493955523828737, "grad_norm": 2.394728899002075, "learning_rate": 4.695061996616956e-05, "loss": 1.0306, "step": 196500 }, { "epoch": 3.0509474076258165, "grad_norm": 2.4261462688446045, "learning_rate": 4.6949068110926615e-05, "loss": 1.0447, "step": 196600 }, { "epoch": 3.0524992628687597, "grad_norm": 1.7856899499893188, "learning_rate": 4.694751625568367e-05, "loss": 1.0403, "step": 196700 }, { "epoch": 3.0540511181117025, "grad_norm": 2.1757800579071045, "learning_rate": 4.694596440044073e-05, "loss": 1.0305, "step": 196800 }, { "epoch": 3.0556029733546453, "grad_norm": 2.5751469135284424, "learning_rate": 4.694441254519779e-05, "loss": 1.0433, "step": 196900 }, { "epoch": 3.0571548285975885, "grad_norm": 2.0382914543151855, "learning_rate": 4.6942860689954846e-05, "loss": 1.0157, "step": 197000 }, { "epoch": 3.0587066838405312, "grad_norm": 2.3272671699523926, "learning_rate": 4.69413088347119e-05, "loss": 1.0261, "step": 197100 }, { "epoch": 3.0602585390834744, "grad_norm": 2.044356346130371, "learning_rate": 4.6939756979468955e-05, "loss": 1.0429, "step": 197200 }, { "epoch": 3.061810394326417, "grad_norm": 1.988494873046875, "learning_rate": 4.693820512422601e-05, "loss": 1.0252, "step": 197300 }, { "epoch": 3.06336224956936, "grad_norm": 2.347303867340088, "learning_rate": 4.693665326898307e-05, "loss": 1.0427, "step": 197400 }, { "epoch": 3.064914104812303, "grad_norm": 2.1885454654693604, "learning_rate": 4.693510141374013e-05, "loss": 1.0323, "step": 197500 }, { "epoch": 3.066465960055246, "grad_norm": 2.6395416259765625, "learning_rate": 4.6933549558497186e-05, "loss": 1.0268, "step": 197600 }, { "epoch": 3.068017815298189, "grad_norm": 2.166287422180176, "learning_rate": 4.693199770325424e-05, "loss": 1.012, "step": 197700 }, { "epoch": 3.069569670541132, "grad_norm": 2.3830583095550537, "learning_rate": 4.69304458480113e-05, "loss": 1.0399, "step": 197800 }, { "epoch": 3.0711215257840747, "grad_norm": 1.889291524887085, "learning_rate": 4.692889399276836e-05, "loss": 1.0247, "step": 197900 }, { "epoch": 3.072673381027018, "grad_norm": 2.4712934494018555, "learning_rate": 4.6927342137525417e-05, "loss": 1.0245, "step": 198000 }, { "epoch": 3.0742252362699607, "grad_norm": 2.633347988128662, "learning_rate": 4.6925790282282474e-05, "loss": 1.0305, "step": 198100 }, { "epoch": 3.0757770915129035, "grad_norm": 2.4769883155822754, "learning_rate": 4.692423842703953e-05, "loss": 1.0298, "step": 198200 }, { "epoch": 3.0773289467558467, "grad_norm": 1.9867851734161377, "learning_rate": 4.692268657179659e-05, "loss": 1.0044, "step": 198300 }, { "epoch": 3.0788808019987894, "grad_norm": 2.213604688644409, "learning_rate": 4.692113471655364e-05, "loss": 1.0302, "step": 198400 }, { "epoch": 3.0804326572417327, "grad_norm": 2.464210271835327, "learning_rate": 4.69195828613107e-05, "loss": 1.0357, "step": 198500 }, { "epoch": 3.0819845124846754, "grad_norm": 2.238145112991333, "learning_rate": 4.6918031006067756e-05, "loss": 1.0173, "step": 198600 }, { "epoch": 3.083536367727618, "grad_norm": 2.5813677310943604, "learning_rate": 4.6916479150824814e-05, "loss": 1.0095, "step": 198700 }, { "epoch": 3.0850882229705614, "grad_norm": 2.3975701332092285, "learning_rate": 4.691492729558187e-05, "loss": 1.0407, "step": 198800 }, { "epoch": 3.086640078213504, "grad_norm": 2.134744644165039, "learning_rate": 4.691337544033892e-05, "loss": 1.0186, "step": 198900 }, { "epoch": 3.0881919334564474, "grad_norm": 2.562459707260132, "learning_rate": 4.691182358509598e-05, "loss": 1.0278, "step": 199000 }, { "epoch": 3.08974378869939, "grad_norm": 2.1110596656799316, "learning_rate": 4.691027172985304e-05, "loss": 1.0094, "step": 199100 }, { "epoch": 3.091295643942333, "grad_norm": 2.220705032348633, "learning_rate": 4.6908719874610096e-05, "loss": 1.0193, "step": 199200 }, { "epoch": 3.092847499185276, "grad_norm": 2.34887957572937, "learning_rate": 4.6907168019367154e-05, "loss": 1.0276, "step": 199300 }, { "epoch": 3.094399354428219, "grad_norm": 2.3114545345306396, "learning_rate": 4.690561616412421e-05, "loss": 1.0492, "step": 199400 }, { "epoch": 3.0959512096711617, "grad_norm": 2.16153621673584, "learning_rate": 4.690406430888127e-05, "loss": 1.0264, "step": 199500 }, { "epoch": 3.097503064914105, "grad_norm": 2.1943531036376953, "learning_rate": 4.690251245363833e-05, "loss": 1.0291, "step": 199600 }, { "epoch": 3.0990549201570476, "grad_norm": 2.46353816986084, "learning_rate": 4.6900960598395385e-05, "loss": 1.0239, "step": 199700 }, { "epoch": 3.100606775399991, "grad_norm": 2.054356098175049, "learning_rate": 4.689940874315244e-05, "loss": 1.0399, "step": 199800 }, { "epoch": 3.1021586306429336, "grad_norm": 2.286006450653076, "learning_rate": 4.68978568879095e-05, "loss": 1.0342, "step": 199900 }, { "epoch": 3.1037104858858764, "grad_norm": 2.0622918605804443, "learning_rate": 4.689630503266655e-05, "loss": 1.0129, "step": 200000 }, { "epoch": 3.1052623411288196, "grad_norm": 2.433764696121216, "learning_rate": 4.689475317742361e-05, "loss": 1.0577, "step": 200100 }, { "epoch": 3.1068141963717624, "grad_norm": 2.215843915939331, "learning_rate": 4.689320132218067e-05, "loss": 1.0293, "step": 200200 }, { "epoch": 3.1083660516147056, "grad_norm": 2.349459409713745, "learning_rate": 4.6891649466937725e-05, "loss": 1.034, "step": 200300 }, { "epoch": 3.1099179068576484, "grad_norm": 2.0848443508148193, "learning_rate": 4.689009761169478e-05, "loss": 1.0095, "step": 200400 }, { "epoch": 3.111469762100591, "grad_norm": 2.5262529850006104, "learning_rate": 4.688854575645184e-05, "loss": 1.0339, "step": 200500 }, { "epoch": 3.1130216173435343, "grad_norm": 2.55330491065979, "learning_rate": 4.68869939012089e-05, "loss": 1.0273, "step": 200600 }, { "epoch": 3.114573472586477, "grad_norm": 2.2021408081054688, "learning_rate": 4.6885442045965956e-05, "loss": 1.0336, "step": 200700 }, { "epoch": 3.11612532782942, "grad_norm": 2.430277109146118, "learning_rate": 4.688389019072301e-05, "loss": 1.0361, "step": 200800 }, { "epoch": 3.117677183072363, "grad_norm": 2.0476455688476562, "learning_rate": 4.688233833548007e-05, "loss": 1.0085, "step": 200900 }, { "epoch": 3.119229038315306, "grad_norm": 2.5166189670562744, "learning_rate": 4.688078648023713e-05, "loss": 1.0314, "step": 201000 }, { "epoch": 3.120780893558249, "grad_norm": 2.221287488937378, "learning_rate": 4.6879234624994187e-05, "loss": 1.0317, "step": 201100 }, { "epoch": 3.122332748801192, "grad_norm": 3.0448193550109863, "learning_rate": 4.6877682769751244e-05, "loss": 1.0127, "step": 201200 }, { "epoch": 3.1238846040441346, "grad_norm": 2.4233529567718506, "learning_rate": 4.6876130914508295e-05, "loss": 1.0354, "step": 201300 }, { "epoch": 3.125436459287078, "grad_norm": 2.2048444747924805, "learning_rate": 4.687457905926535e-05, "loss": 1.0269, "step": 201400 }, { "epoch": 3.1269883145300206, "grad_norm": 2.5136120319366455, "learning_rate": 4.687302720402241e-05, "loss": 1.022, "step": 201500 }, { "epoch": 3.128540169772964, "grad_norm": 2.1857991218566895, "learning_rate": 4.687147534877947e-05, "loss": 1.0294, "step": 201600 }, { "epoch": 3.1300920250159066, "grad_norm": 2.1854701042175293, "learning_rate": 4.6869923493536526e-05, "loss": 0.983, "step": 201700 }, { "epoch": 3.1316438802588493, "grad_norm": 2.0431981086730957, "learning_rate": 4.6868371638293584e-05, "loss": 1.0318, "step": 201800 }, { "epoch": 3.1331957355017925, "grad_norm": 2.220003843307495, "learning_rate": 4.686681978305064e-05, "loss": 1.03, "step": 201900 }, { "epoch": 3.1347475907447353, "grad_norm": 2.234104871749878, "learning_rate": 4.68652679278077e-05, "loss": 1.0193, "step": 202000 }, { "epoch": 3.136299445987678, "grad_norm": 2.0799477100372314, "learning_rate": 4.686371607256476e-05, "loss": 1.0456, "step": 202100 }, { "epoch": 3.1378513012306213, "grad_norm": 2.006730556488037, "learning_rate": 4.686216421732181e-05, "loss": 1.0284, "step": 202200 }, { "epoch": 3.139403156473564, "grad_norm": 1.9896783828735352, "learning_rate": 4.6860612362078866e-05, "loss": 1.0332, "step": 202300 }, { "epoch": 3.1409550117165073, "grad_norm": 2.3062851428985596, "learning_rate": 4.6859060506835924e-05, "loss": 1.0093, "step": 202400 }, { "epoch": 3.14250686695945, "grad_norm": 2.223362922668457, "learning_rate": 4.685750865159298e-05, "loss": 1.0238, "step": 202500 }, { "epoch": 3.144058722202393, "grad_norm": 2.6560285091400146, "learning_rate": 4.685595679635004e-05, "loss": 1.0299, "step": 202600 }, { "epoch": 3.145610577445336, "grad_norm": 2.1673824787139893, "learning_rate": 4.68544049411071e-05, "loss": 1.0113, "step": 202700 }, { "epoch": 3.1471624326882788, "grad_norm": 2.15631103515625, "learning_rate": 4.685285308586415e-05, "loss": 1.0066, "step": 202800 }, { "epoch": 3.148714287931222, "grad_norm": 2.0604913234710693, "learning_rate": 4.6851301230621206e-05, "loss": 1.029, "step": 202900 }, { "epoch": 3.1502661431741648, "grad_norm": 2.4676761627197266, "learning_rate": 4.6849749375378264e-05, "loss": 1.0442, "step": 203000 }, { "epoch": 3.1518179984171075, "grad_norm": 2.266402006149292, "learning_rate": 4.684819752013532e-05, "loss": 1.0117, "step": 203100 }, { "epoch": 3.1533698536600507, "grad_norm": 2.346513032913208, "learning_rate": 4.684664566489238e-05, "loss": 1.0354, "step": 203200 }, { "epoch": 3.1549217089029935, "grad_norm": 2.32383394241333, "learning_rate": 4.684509380964944e-05, "loss": 1.0368, "step": 203300 }, { "epoch": 3.1564735641459363, "grad_norm": 2.427879571914673, "learning_rate": 4.6843541954406495e-05, "loss": 1.0413, "step": 203400 }, { "epoch": 3.1580254193888795, "grad_norm": 2.173727512359619, "learning_rate": 4.684199009916355e-05, "loss": 1.0365, "step": 203500 }, { "epoch": 3.1595772746318223, "grad_norm": 2.270315408706665, "learning_rate": 4.684043824392061e-05, "loss": 1.0329, "step": 203600 }, { "epoch": 3.1611291298747655, "grad_norm": 1.945071816444397, "learning_rate": 4.683888638867767e-05, "loss": 1.0324, "step": 203700 }, { "epoch": 3.1626809851177082, "grad_norm": 2.603261947631836, "learning_rate": 4.6837334533434726e-05, "loss": 1.025, "step": 203800 }, { "epoch": 3.164232840360651, "grad_norm": 2.2213709354400635, "learning_rate": 4.683578267819178e-05, "loss": 1.0304, "step": 203900 }, { "epoch": 3.165784695603594, "grad_norm": 2.2088725566864014, "learning_rate": 4.683423082294884e-05, "loss": 1.0174, "step": 204000 }, { "epoch": 3.167336550846537, "grad_norm": 2.2640299797058105, "learning_rate": 4.683267896770589e-05, "loss": 1.0359, "step": 204100 }, { "epoch": 3.16888840608948, "grad_norm": 2.0523431301116943, "learning_rate": 4.683112711246295e-05, "loss": 1.0167, "step": 204200 }, { "epoch": 3.170440261332423, "grad_norm": 2.373732089996338, "learning_rate": 4.682957525722001e-05, "loss": 1.0365, "step": 204300 }, { "epoch": 3.1719921165753657, "grad_norm": 2.410597801208496, "learning_rate": 4.6828023401977065e-05, "loss": 0.9916, "step": 204400 }, { "epoch": 3.173543971818309, "grad_norm": 2.2707905769348145, "learning_rate": 4.682647154673412e-05, "loss": 1.0386, "step": 204500 }, { "epoch": 3.1750958270612517, "grad_norm": 2.3058300018310547, "learning_rate": 4.682491969149118e-05, "loss": 1.015, "step": 204600 }, { "epoch": 3.1766476823041945, "grad_norm": 2.025825262069702, "learning_rate": 4.682336783624824e-05, "loss": 1.0513, "step": 204700 }, { "epoch": 3.1781995375471377, "grad_norm": 2.966294527053833, "learning_rate": 4.6821815981005296e-05, "loss": 1.0269, "step": 204800 }, { "epoch": 3.1797513927900805, "grad_norm": 2.1126255989074707, "learning_rate": 4.6820264125762354e-05, "loss": 1.0018, "step": 204900 }, { "epoch": 3.1813032480330237, "grad_norm": 1.8233951330184937, "learning_rate": 4.681871227051941e-05, "loss": 1.0126, "step": 205000 }, { "epoch": 3.1828551032759664, "grad_norm": 2.4195187091827393, "learning_rate": 4.681716041527647e-05, "loss": 1.0121, "step": 205100 }, { "epoch": 3.184406958518909, "grad_norm": 2.181208848953247, "learning_rate": 4.681560856003353e-05, "loss": 1.0281, "step": 205200 }, { "epoch": 3.1859588137618524, "grad_norm": 2.162034511566162, "learning_rate": 4.6814056704790585e-05, "loss": 1.0346, "step": 205300 }, { "epoch": 3.187510669004795, "grad_norm": 2.2806320190429688, "learning_rate": 4.6812504849547636e-05, "loss": 1.0002, "step": 205400 }, { "epoch": 3.1890625242477384, "grad_norm": 2.4645237922668457, "learning_rate": 4.6810952994304694e-05, "loss": 1.0427, "step": 205500 }, { "epoch": 3.190614379490681, "grad_norm": 2.1412951946258545, "learning_rate": 4.680940113906175e-05, "loss": 1.0107, "step": 205600 }, { "epoch": 3.192166234733624, "grad_norm": 2.173511028289795, "learning_rate": 4.68078492838188e-05, "loss": 1.0407, "step": 205700 }, { "epoch": 3.193718089976567, "grad_norm": 2.1099634170532227, "learning_rate": 4.680629742857586e-05, "loss": 1.0116, "step": 205800 }, { "epoch": 3.19526994521951, "grad_norm": 2.658093214035034, "learning_rate": 4.680474557333292e-05, "loss": 1.01, "step": 205900 }, { "epoch": 3.1968218004624527, "grad_norm": 1.9747809171676636, "learning_rate": 4.6803193718089976e-05, "loss": 1.0344, "step": 206000 }, { "epoch": 3.198373655705396, "grad_norm": 1.9452918767929077, "learning_rate": 4.6801641862847034e-05, "loss": 1.033, "step": 206100 }, { "epoch": 3.1999255109483387, "grad_norm": 2.2257845401763916, "learning_rate": 4.680009000760409e-05, "loss": 1.0129, "step": 206200 }, { "epoch": 3.201477366191282, "grad_norm": 2.123009443283081, "learning_rate": 4.679853815236115e-05, "loss": 1.0366, "step": 206300 }, { "epoch": 3.2030292214342246, "grad_norm": 2.2326512336730957, "learning_rate": 4.679698629711821e-05, "loss": 1.0508, "step": 206400 }, { "epoch": 3.2045810766771674, "grad_norm": 1.9837554693222046, "learning_rate": 4.6795434441875265e-05, "loss": 1.0286, "step": 206500 }, { "epoch": 3.2061329319201106, "grad_norm": 2.502143621444702, "learning_rate": 4.679388258663232e-05, "loss": 1.0204, "step": 206600 }, { "epoch": 3.2076847871630534, "grad_norm": 2.494403839111328, "learning_rate": 4.679233073138938e-05, "loss": 1.0308, "step": 206700 }, { "epoch": 3.2092366424059966, "grad_norm": 1.7361814975738525, "learning_rate": 4.679077887614644e-05, "loss": 1.0272, "step": 206800 }, { "epoch": 3.2107884976489394, "grad_norm": 2.041317939758301, "learning_rate": 4.6789227020903496e-05, "loss": 1.0396, "step": 206900 }, { "epoch": 3.212340352891882, "grad_norm": 2.230377435684204, "learning_rate": 4.6787675165660547e-05, "loss": 1.008, "step": 207000 }, { "epoch": 3.2138922081348253, "grad_norm": 2.4669148921966553, "learning_rate": 4.6786123310417604e-05, "loss": 0.9976, "step": 207100 }, { "epoch": 3.215444063377768, "grad_norm": 2.2005035877227783, "learning_rate": 4.678457145517466e-05, "loss": 1.0517, "step": 207200 }, { "epoch": 3.216995918620711, "grad_norm": 1.968554973602295, "learning_rate": 4.678301959993172e-05, "loss": 1.0371, "step": 207300 }, { "epoch": 3.218547773863654, "grad_norm": 2.0097222328186035, "learning_rate": 4.678146774468878e-05, "loss": 1.0256, "step": 207400 }, { "epoch": 3.220099629106597, "grad_norm": 2.425882577896118, "learning_rate": 4.6779915889445835e-05, "loss": 1.0254, "step": 207500 }, { "epoch": 3.22165148434954, "grad_norm": 2.139723539352417, "learning_rate": 4.677836403420289e-05, "loss": 1.0321, "step": 207600 }, { "epoch": 3.223203339592483, "grad_norm": 2.0133137702941895, "learning_rate": 4.677681217895995e-05, "loss": 1.0072, "step": 207700 }, { "epoch": 3.2247551948354256, "grad_norm": 2.0798697471618652, "learning_rate": 4.677526032371701e-05, "loss": 1.0206, "step": 207800 }, { "epoch": 3.226307050078369, "grad_norm": 2.5051324367523193, "learning_rate": 4.6773708468474066e-05, "loss": 1.0218, "step": 207900 }, { "epoch": 3.2278589053213116, "grad_norm": 2.0431697368621826, "learning_rate": 4.6772156613231124e-05, "loss": 1.0069, "step": 208000 }, { "epoch": 3.229410760564255, "grad_norm": 2.1214516162872314, "learning_rate": 4.677060475798818e-05, "loss": 1.0173, "step": 208100 }, { "epoch": 3.2309626158071976, "grad_norm": 2.190187931060791, "learning_rate": 4.676905290274524e-05, "loss": 0.9937, "step": 208200 }, { "epoch": 3.2325144710501403, "grad_norm": 2.0025250911712646, "learning_rate": 4.676750104750229e-05, "loss": 0.9958, "step": 208300 }, { "epoch": 3.2340663262930835, "grad_norm": 2.4372048377990723, "learning_rate": 4.676594919225935e-05, "loss": 1.0241, "step": 208400 }, { "epoch": 3.2356181815360263, "grad_norm": 2.207690954208374, "learning_rate": 4.6764397337016406e-05, "loss": 1.002, "step": 208500 }, { "epoch": 3.237170036778969, "grad_norm": 2.0978829860687256, "learning_rate": 4.6762845481773464e-05, "loss": 1.0188, "step": 208600 }, { "epoch": 3.2387218920219123, "grad_norm": 2.116551399230957, "learning_rate": 4.6761293626530515e-05, "loss": 1.0255, "step": 208700 }, { "epoch": 3.240273747264855, "grad_norm": 2.3361663818359375, "learning_rate": 4.675974177128757e-05, "loss": 1.0149, "step": 208800 }, { "epoch": 3.2418256025077983, "grad_norm": 2.535315990447998, "learning_rate": 4.675818991604463e-05, "loss": 1.0124, "step": 208900 }, { "epoch": 3.243377457750741, "grad_norm": 2.7970681190490723, "learning_rate": 4.675663806080169e-05, "loss": 1.0383, "step": 209000 }, { "epoch": 3.244929312993684, "grad_norm": 2.3555800914764404, "learning_rate": 4.6755086205558746e-05, "loss": 1.0425, "step": 209100 }, { "epoch": 3.246481168236627, "grad_norm": 2.4953675270080566, "learning_rate": 4.6753534350315804e-05, "loss": 1.0364, "step": 209200 }, { "epoch": 3.24803302347957, "grad_norm": 2.2091288566589355, "learning_rate": 4.675198249507286e-05, "loss": 1.0468, "step": 209300 }, { "epoch": 3.2495848787225126, "grad_norm": 2.506892681121826, "learning_rate": 4.675043063982992e-05, "loss": 1.0261, "step": 209400 }, { "epoch": 3.2511367339654558, "grad_norm": 2.3813135623931885, "learning_rate": 4.674887878458698e-05, "loss": 1.0191, "step": 209500 }, { "epoch": 3.2526885892083985, "grad_norm": 2.2571282386779785, "learning_rate": 4.6747326929344035e-05, "loss": 1.0314, "step": 209600 }, { "epoch": 3.2542404444513418, "grad_norm": 2.226078510284424, "learning_rate": 4.674577507410109e-05, "loss": 0.9957, "step": 209700 }, { "epoch": 3.2557922996942845, "grad_norm": 2.2413437366485596, "learning_rate": 4.674422321885814e-05, "loss": 1.0197, "step": 209800 }, { "epoch": 3.2573441549372273, "grad_norm": 2.10233211517334, "learning_rate": 4.67426713636152e-05, "loss": 1.0135, "step": 209900 }, { "epoch": 3.2588960101801705, "grad_norm": 2.1843860149383545, "learning_rate": 4.674111950837226e-05, "loss": 1.0263, "step": 210000 }, { "epoch": 3.2604478654231133, "grad_norm": 2.4360811710357666, "learning_rate": 4.6739567653129317e-05, "loss": 1.0159, "step": 210100 }, { "epoch": 3.261999720666056, "grad_norm": 2.196308135986328, "learning_rate": 4.6738015797886374e-05, "loss": 1.0226, "step": 210200 }, { "epoch": 3.2635515759089992, "grad_norm": 2.0656545162200928, "learning_rate": 4.673646394264343e-05, "loss": 0.9997, "step": 210300 }, { "epoch": 3.265103431151942, "grad_norm": 2.1971232891082764, "learning_rate": 4.673491208740049e-05, "loss": 1.0189, "step": 210400 }, { "epoch": 3.2666552863948852, "grad_norm": 2.175978183746338, "learning_rate": 4.673336023215755e-05, "loss": 1.0117, "step": 210500 }, { "epoch": 3.268207141637828, "grad_norm": 2.458247423171997, "learning_rate": 4.6731808376914605e-05, "loss": 1.0195, "step": 210600 }, { "epoch": 3.269758996880771, "grad_norm": 2.230919361114502, "learning_rate": 4.673025652167166e-05, "loss": 1.0377, "step": 210700 }, { "epoch": 3.271310852123714, "grad_norm": 2.2012929916381836, "learning_rate": 4.672870466642872e-05, "loss": 1.0167, "step": 210800 }, { "epoch": 3.2728627073666567, "grad_norm": 1.9774216413497925, "learning_rate": 4.672715281118578e-05, "loss": 1.0122, "step": 210900 }, { "epoch": 3.2744145626096, "grad_norm": 2.210353136062622, "learning_rate": 4.6725600955942836e-05, "loss": 1.0282, "step": 211000 }, { "epoch": 3.2759664178525427, "grad_norm": 2.970851421356201, "learning_rate": 4.672404910069989e-05, "loss": 1.0533, "step": 211100 }, { "epoch": 3.2775182730954855, "grad_norm": 2.132545232772827, "learning_rate": 4.6722497245456945e-05, "loss": 1.0053, "step": 211200 }, { "epoch": 3.2790701283384287, "grad_norm": 2.40109920501709, "learning_rate": 4.6720945390214e-05, "loss": 1.0051, "step": 211300 }, { "epoch": 3.2806219835813715, "grad_norm": 2.6015913486480713, "learning_rate": 4.671939353497106e-05, "loss": 1.0452, "step": 211400 }, { "epoch": 3.2821738388243142, "grad_norm": 1.8698290586471558, "learning_rate": 4.671784167972812e-05, "loss": 1.0321, "step": 211500 }, { "epoch": 3.2837256940672575, "grad_norm": 2.216176748275757, "learning_rate": 4.6716289824485176e-05, "loss": 1.0381, "step": 211600 }, { "epoch": 3.2852775493102, "grad_norm": 2.0425329208374023, "learning_rate": 4.6714737969242234e-05, "loss": 1.0353, "step": 211700 }, { "epoch": 3.2868294045531434, "grad_norm": 2.206223726272583, "learning_rate": 4.671318611399929e-05, "loss": 1.0242, "step": 211800 }, { "epoch": 3.288381259796086, "grad_norm": 2.5931057929992676, "learning_rate": 4.671163425875634e-05, "loss": 1.0188, "step": 211900 }, { "epoch": 3.289933115039029, "grad_norm": 2.1801204681396484, "learning_rate": 4.67100824035134e-05, "loss": 1.0166, "step": 212000 }, { "epoch": 3.291484970281972, "grad_norm": 2.1543045043945312, "learning_rate": 4.670853054827046e-05, "loss": 1.0175, "step": 212100 }, { "epoch": 3.293036825524915, "grad_norm": 2.411215305328369, "learning_rate": 4.6706978693027516e-05, "loss": 1.0257, "step": 212200 }, { "epoch": 3.294588680767858, "grad_norm": 1.8291096687316895, "learning_rate": 4.6705426837784574e-05, "loss": 1.0232, "step": 212300 }, { "epoch": 3.296140536010801, "grad_norm": 2.4392714500427246, "learning_rate": 4.670387498254163e-05, "loss": 1.0129, "step": 212400 }, { "epoch": 3.2976923912537437, "grad_norm": 2.2798540592193604, "learning_rate": 4.670232312729869e-05, "loss": 1.0058, "step": 212500 }, { "epoch": 3.299244246496687, "grad_norm": 2.0743155479431152, "learning_rate": 4.670077127205574e-05, "loss": 1.0177, "step": 212600 }, { "epoch": 3.3007961017396297, "grad_norm": 2.104858875274658, "learning_rate": 4.66992194168128e-05, "loss": 1.0276, "step": 212700 }, { "epoch": 3.3023479569825724, "grad_norm": 2.282646656036377, "learning_rate": 4.6697667561569855e-05, "loss": 1.0191, "step": 212800 }, { "epoch": 3.3038998122255157, "grad_norm": 2.202138662338257, "learning_rate": 4.669611570632691e-05, "loss": 1.0253, "step": 212900 }, { "epoch": 3.3054516674684584, "grad_norm": 2.2599270343780518, "learning_rate": 4.669456385108397e-05, "loss": 1.0213, "step": 213000 }, { "epoch": 3.3070035227114016, "grad_norm": 1.8559143543243408, "learning_rate": 4.669301199584103e-05, "loss": 1.0341, "step": 213100 }, { "epoch": 3.3085553779543444, "grad_norm": 2.231807231903076, "learning_rate": 4.6691460140598087e-05, "loss": 1.0194, "step": 213200 }, { "epoch": 3.310107233197287, "grad_norm": 2.157139301300049, "learning_rate": 4.6689908285355144e-05, "loss": 1.0181, "step": 213300 }, { "epoch": 3.3116590884402304, "grad_norm": 2.08923077583313, "learning_rate": 4.66883564301122e-05, "loss": 1.0445, "step": 213400 }, { "epoch": 3.313210943683173, "grad_norm": 2.389971971511841, "learning_rate": 4.668680457486926e-05, "loss": 1.021, "step": 213500 }, { "epoch": 3.3147627989261164, "grad_norm": 2.368169069290161, "learning_rate": 4.668525271962632e-05, "loss": 1.0075, "step": 213600 }, { "epoch": 3.316314654169059, "grad_norm": 2.2019619941711426, "learning_rate": 4.6683700864383375e-05, "loss": 1.009, "step": 213700 }, { "epoch": 3.317866509412002, "grad_norm": 1.9306806325912476, "learning_rate": 4.668214900914043e-05, "loss": 1.0335, "step": 213800 }, { "epoch": 3.319418364654945, "grad_norm": 2.281291961669922, "learning_rate": 4.6680597153897484e-05, "loss": 1.0319, "step": 213900 }, { "epoch": 3.320970219897888, "grad_norm": 2.270554542541504, "learning_rate": 4.667904529865454e-05, "loss": 1.0277, "step": 214000 }, { "epoch": 3.3225220751408306, "grad_norm": 1.9186290502548218, "learning_rate": 4.66774934434116e-05, "loss": 1.0241, "step": 214100 }, { "epoch": 3.324073930383774, "grad_norm": 2.196626901626587, "learning_rate": 4.667594158816866e-05, "loss": 1.0353, "step": 214200 }, { "epoch": 3.3256257856267166, "grad_norm": 2.2863614559173584, "learning_rate": 4.6674389732925715e-05, "loss": 1.0148, "step": 214300 }, { "epoch": 3.32717764086966, "grad_norm": 2.506890296936035, "learning_rate": 4.667283787768277e-05, "loss": 1.0237, "step": 214400 }, { "epoch": 3.3287294961126026, "grad_norm": 2.128335475921631, "learning_rate": 4.667128602243983e-05, "loss": 1.0238, "step": 214500 }, { "epoch": 3.3302813513555454, "grad_norm": 2.2486555576324463, "learning_rate": 4.666973416719689e-05, "loss": 1.0266, "step": 214600 }, { "epoch": 3.3318332065984886, "grad_norm": 2.162100076675415, "learning_rate": 4.6668182311953946e-05, "loss": 1.0295, "step": 214700 }, { "epoch": 3.3333850618414314, "grad_norm": 2.2312748432159424, "learning_rate": 4.6666630456711004e-05, "loss": 1.0357, "step": 214800 }, { "epoch": 3.3349369170843746, "grad_norm": 2.4596164226531982, "learning_rate": 4.666507860146806e-05, "loss": 1.023, "step": 214900 }, { "epoch": 3.3364887723273173, "grad_norm": 2.076465606689453, "learning_rate": 4.666352674622512e-05, "loss": 1.0182, "step": 215000 }, { "epoch": 3.33804062757026, "grad_norm": 2.1031620502471924, "learning_rate": 4.666197489098218e-05, "loss": 1.0178, "step": 215100 }, { "epoch": 3.3395924828132033, "grad_norm": 2.3582725524902344, "learning_rate": 4.666042303573923e-05, "loss": 1.029, "step": 215200 }, { "epoch": 3.341144338056146, "grad_norm": 1.918215274810791, "learning_rate": 4.6658871180496286e-05, "loss": 1.0107, "step": 215300 }, { "epoch": 3.342696193299089, "grad_norm": 2.1029980182647705, "learning_rate": 4.6657319325253344e-05, "loss": 1.0183, "step": 215400 }, { "epoch": 3.344248048542032, "grad_norm": 2.1243014335632324, "learning_rate": 4.6655767470010394e-05, "loss": 1.0259, "step": 215500 }, { "epoch": 3.345799903784975, "grad_norm": 2.0930585861206055, "learning_rate": 4.665421561476745e-05, "loss": 1.0275, "step": 215600 }, { "epoch": 3.347351759027918, "grad_norm": 1.947841763496399, "learning_rate": 4.665266375952451e-05, "loss": 1.0033, "step": 215700 }, { "epoch": 3.348903614270861, "grad_norm": 2.280519723892212, "learning_rate": 4.665111190428157e-05, "loss": 1.024, "step": 215800 }, { "epoch": 3.3504554695138036, "grad_norm": 2.247267007827759, "learning_rate": 4.6649560049038625e-05, "loss": 1.0385, "step": 215900 }, { "epoch": 3.352007324756747, "grad_norm": 1.7428474426269531, "learning_rate": 4.664800819379568e-05, "loss": 1.0163, "step": 216000 }, { "epoch": 3.3535591799996896, "grad_norm": 2.2367842197418213, "learning_rate": 4.664645633855274e-05, "loss": 1.0246, "step": 216100 }, { "epoch": 3.3551110352426328, "grad_norm": 2.297909736633301, "learning_rate": 4.66449044833098e-05, "loss": 1.0191, "step": 216200 }, { "epoch": 3.3566628904855755, "grad_norm": 2.3930795192718506, "learning_rate": 4.6643352628066857e-05, "loss": 1.0169, "step": 216300 }, { "epoch": 3.3582147457285183, "grad_norm": 2.3360438346862793, "learning_rate": 4.6641800772823914e-05, "loss": 1.0062, "step": 216400 }, { "epoch": 3.3597666009714615, "grad_norm": 2.2535812854766846, "learning_rate": 4.664024891758097e-05, "loss": 1.0024, "step": 216500 }, { "epoch": 3.3613184562144043, "grad_norm": 2.942143678665161, "learning_rate": 4.663869706233803e-05, "loss": 1.0224, "step": 216600 }, { "epoch": 3.362870311457347, "grad_norm": 2.323899269104004, "learning_rate": 4.663714520709509e-05, "loss": 1.0049, "step": 216700 }, { "epoch": 3.3644221667002903, "grad_norm": 2.870634078979492, "learning_rate": 4.663559335185214e-05, "loss": 1.0287, "step": 216800 }, { "epoch": 3.365974021943233, "grad_norm": 2.5708534717559814, "learning_rate": 4.6634041496609196e-05, "loss": 1.0235, "step": 216900 }, { "epoch": 3.3675258771861762, "grad_norm": 2.0997581481933594, "learning_rate": 4.6632489641366254e-05, "loss": 1.0314, "step": 217000 }, { "epoch": 3.369077732429119, "grad_norm": 2.034555435180664, "learning_rate": 4.663093778612331e-05, "loss": 1.0264, "step": 217100 }, { "epoch": 3.3706295876720618, "grad_norm": 1.9683167934417725, "learning_rate": 4.662938593088037e-05, "loss": 1.0338, "step": 217200 }, { "epoch": 3.372181442915005, "grad_norm": 2.3981857299804688, "learning_rate": 4.662783407563743e-05, "loss": 1.0279, "step": 217300 }, { "epoch": 3.3737332981579478, "grad_norm": 2.1136646270751953, "learning_rate": 4.6626282220394485e-05, "loss": 1.0274, "step": 217400 }, { "epoch": 3.375285153400891, "grad_norm": 2.747699737548828, "learning_rate": 4.662473036515154e-05, "loss": 1.0276, "step": 217500 }, { "epoch": 3.3768370086438337, "grad_norm": 2.3531923294067383, "learning_rate": 4.66231785099086e-05, "loss": 1.0039, "step": 217600 }, { "epoch": 3.3783888638867765, "grad_norm": 2.1912336349487305, "learning_rate": 4.662162665466566e-05, "loss": 1.0195, "step": 217700 }, { "epoch": 3.3799407191297197, "grad_norm": 2.2018327713012695, "learning_rate": 4.6620074799422716e-05, "loss": 1.0117, "step": 217800 }, { "epoch": 3.3814925743726625, "grad_norm": 2.1897764205932617, "learning_rate": 4.6618522944179774e-05, "loss": 1.0378, "step": 217900 }, { "epoch": 3.3830444296156053, "grad_norm": 2.4949533939361572, "learning_rate": 4.661697108893683e-05, "loss": 1.0223, "step": 218000 }, { "epoch": 3.3845962848585485, "grad_norm": 2.1576850414276123, "learning_rate": 4.661541923369388e-05, "loss": 1.0393, "step": 218100 }, { "epoch": 3.3861481401014912, "grad_norm": 2.3476812839508057, "learning_rate": 4.661386737845094e-05, "loss": 1.005, "step": 218200 }, { "epoch": 3.3876999953444344, "grad_norm": 2.248135805130005, "learning_rate": 4.6612315523208e-05, "loss": 1.0058, "step": 218300 }, { "epoch": 3.389251850587377, "grad_norm": 2.3704135417938232, "learning_rate": 4.6610763667965056e-05, "loss": 1.0044, "step": 218400 }, { "epoch": 3.39080370583032, "grad_norm": 2.0789783000946045, "learning_rate": 4.660921181272211e-05, "loss": 1.021, "step": 218500 }, { "epoch": 3.392355561073263, "grad_norm": 2.1760096549987793, "learning_rate": 4.6607659957479164e-05, "loss": 1.0199, "step": 218600 }, { "epoch": 3.393907416316206, "grad_norm": 2.3323814868927, "learning_rate": 4.660610810223622e-05, "loss": 1.0276, "step": 218700 }, { "epoch": 3.395459271559149, "grad_norm": 2.5076801776885986, "learning_rate": 4.660455624699328e-05, "loss": 1.0278, "step": 218800 }, { "epoch": 3.397011126802092, "grad_norm": 2.9879953861236572, "learning_rate": 4.660300439175034e-05, "loss": 1.0225, "step": 218900 }, { "epoch": 3.3985629820450347, "grad_norm": 1.9574092626571655, "learning_rate": 4.6601452536507395e-05, "loss": 1.0232, "step": 219000 }, { "epoch": 3.400114837287978, "grad_norm": 2.3615403175354004, "learning_rate": 4.659990068126445e-05, "loss": 1.0165, "step": 219100 }, { "epoch": 3.4016666925309207, "grad_norm": 2.209057092666626, "learning_rate": 4.659834882602151e-05, "loss": 1.0245, "step": 219200 }, { "epoch": 3.4032185477738635, "grad_norm": 1.8723456859588623, "learning_rate": 4.659679697077857e-05, "loss": 1.0185, "step": 219300 }, { "epoch": 3.4047704030168067, "grad_norm": 2.1733956336975098, "learning_rate": 4.6595245115535627e-05, "loss": 1.0173, "step": 219400 }, { "epoch": 3.4063222582597494, "grad_norm": 2.328073024749756, "learning_rate": 4.6593693260292684e-05, "loss": 1.0395, "step": 219500 }, { "epoch": 3.4078741135026926, "grad_norm": 2.7424774169921875, "learning_rate": 4.6592141405049735e-05, "loss": 1.0367, "step": 219600 }, { "epoch": 3.4094259687456354, "grad_norm": 2.1806962490081787, "learning_rate": 4.659058954980679e-05, "loss": 1.0174, "step": 219700 }, { "epoch": 3.410977823988578, "grad_norm": 2.260183811187744, "learning_rate": 4.658903769456385e-05, "loss": 1.0326, "step": 219800 }, { "epoch": 3.4125296792315214, "grad_norm": 2.6208088397979736, "learning_rate": 4.658748583932091e-05, "loss": 1.0161, "step": 219900 }, { "epoch": 3.414081534474464, "grad_norm": 2.407381772994995, "learning_rate": 4.6585933984077966e-05, "loss": 1.0169, "step": 220000 }, { "epoch": 3.4156333897174074, "grad_norm": 1.9718668460845947, "learning_rate": 4.6584382128835024e-05, "loss": 1.0085, "step": 220100 }, { "epoch": 3.41718524496035, "grad_norm": 1.956365942955017, "learning_rate": 4.658283027359208e-05, "loss": 1.0055, "step": 220200 }, { "epoch": 3.418737100203293, "grad_norm": 2.448484182357788, "learning_rate": 4.658127841834914e-05, "loss": 1.0261, "step": 220300 }, { "epoch": 3.420288955446236, "grad_norm": 2.324747085571289, "learning_rate": 4.65797265631062e-05, "loss": 1.0318, "step": 220400 }, { "epoch": 3.421840810689179, "grad_norm": 2.905900239944458, "learning_rate": 4.6578174707863255e-05, "loss": 1.01, "step": 220500 }, { "epoch": 3.4233926659321217, "grad_norm": 2.10194993019104, "learning_rate": 4.657662285262031e-05, "loss": 1.0312, "step": 220600 }, { "epoch": 3.424944521175065, "grad_norm": 2.0558583736419678, "learning_rate": 4.657507099737737e-05, "loss": 1.0044, "step": 220700 }, { "epoch": 3.4264963764180076, "grad_norm": 2.0191500186920166, "learning_rate": 4.657351914213443e-05, "loss": 1.0145, "step": 220800 }, { "epoch": 3.428048231660951, "grad_norm": 2.301386594772339, "learning_rate": 4.657196728689148e-05, "loss": 1.0343, "step": 220900 }, { "epoch": 3.4296000869038936, "grad_norm": 2.0982818603515625, "learning_rate": 4.657041543164854e-05, "loss": 1.0261, "step": 221000 }, { "epoch": 3.4311519421468364, "grad_norm": 2.562927007675171, "learning_rate": 4.6568863576405595e-05, "loss": 1.018, "step": 221100 }, { "epoch": 3.4327037973897796, "grad_norm": 2.371718406677246, "learning_rate": 4.656731172116265e-05, "loss": 1.012, "step": 221200 }, { "epoch": 3.4342556526327224, "grad_norm": 2.111654043197632, "learning_rate": 4.656575986591971e-05, "loss": 1.0206, "step": 221300 }, { "epoch": 3.4358075078756656, "grad_norm": 2.294008731842041, "learning_rate": 4.656420801067677e-05, "loss": 1.0145, "step": 221400 }, { "epoch": 3.4373593631186083, "grad_norm": 2.5183184146881104, "learning_rate": 4.6562656155433826e-05, "loss": 1.0184, "step": 221500 }, { "epoch": 3.438911218361551, "grad_norm": 2.5621912479400635, "learning_rate": 4.6561104300190884e-05, "loss": 1.0196, "step": 221600 }, { "epoch": 3.4404630736044943, "grad_norm": 2.4350814819335938, "learning_rate": 4.6559552444947934e-05, "loss": 1.0086, "step": 221700 }, { "epoch": 3.442014928847437, "grad_norm": 2.164003610610962, "learning_rate": 4.655800058970499e-05, "loss": 1.0002, "step": 221800 }, { "epoch": 3.44356678409038, "grad_norm": 2.2738611698150635, "learning_rate": 4.655644873446205e-05, "loss": 1.0305, "step": 221900 }, { "epoch": 3.445118639333323, "grad_norm": 1.9323899745941162, "learning_rate": 4.655489687921911e-05, "loss": 0.9929, "step": 222000 }, { "epoch": 3.446670494576266, "grad_norm": 2.3244051933288574, "learning_rate": 4.6553345023976165e-05, "loss": 1.0242, "step": 222100 }, { "epoch": 3.448222349819209, "grad_norm": 2.2730369567871094, "learning_rate": 4.655179316873322e-05, "loss": 1.0208, "step": 222200 }, { "epoch": 3.449774205062152, "grad_norm": 2.7483091354370117, "learning_rate": 4.655024131349028e-05, "loss": 1.0215, "step": 222300 }, { "epoch": 3.4513260603050946, "grad_norm": 2.1802148818969727, "learning_rate": 4.654868945824734e-05, "loss": 1.0119, "step": 222400 }, { "epoch": 3.452877915548038, "grad_norm": 2.256516218185425, "learning_rate": 4.654713760300439e-05, "loss": 1.0251, "step": 222500 }, { "epoch": 3.4544297707909806, "grad_norm": 2.4180619716644287, "learning_rate": 4.654558574776145e-05, "loss": 1.012, "step": 222600 }, { "epoch": 3.455981626033924, "grad_norm": 2.606107711791992, "learning_rate": 4.6544033892518505e-05, "loss": 1.0223, "step": 222700 }, { "epoch": 3.4575334812768665, "grad_norm": 2.804283857345581, "learning_rate": 4.654248203727556e-05, "loss": 1.0081, "step": 222800 }, { "epoch": 3.4590853365198093, "grad_norm": 2.062540292739868, "learning_rate": 4.654093018203262e-05, "loss": 1.0194, "step": 222900 }, { "epoch": 3.4606371917627525, "grad_norm": 2.3496198654174805, "learning_rate": 4.653937832678968e-05, "loss": 1.0011, "step": 223000 }, { "epoch": 3.4621890470056953, "grad_norm": 2.4609780311584473, "learning_rate": 4.6537826471546736e-05, "loss": 1.0262, "step": 223100 }, { "epoch": 3.463740902248638, "grad_norm": 2.257575035095215, "learning_rate": 4.6536274616303794e-05, "loss": 1.014, "step": 223200 }, { "epoch": 3.4652927574915813, "grad_norm": 2.069559097290039, "learning_rate": 4.653472276106085e-05, "loss": 1.0254, "step": 223300 }, { "epoch": 3.466844612734524, "grad_norm": 2.2982141971588135, "learning_rate": 4.653317090581791e-05, "loss": 1.0238, "step": 223400 }, { "epoch": 3.468396467977467, "grad_norm": 2.3976995944976807, "learning_rate": 4.653161905057497e-05, "loss": 1.0073, "step": 223500 }, { "epoch": 3.46994832322041, "grad_norm": 2.147540330886841, "learning_rate": 4.6530067195332025e-05, "loss": 0.9989, "step": 223600 }, { "epoch": 3.471500178463353, "grad_norm": 2.3275585174560547, "learning_rate": 4.652851534008908e-05, "loss": 1.0209, "step": 223700 }, { "epoch": 3.473052033706296, "grad_norm": 2.6353914737701416, "learning_rate": 4.6526963484846134e-05, "loss": 1.0261, "step": 223800 }, { "epoch": 3.4746038889492388, "grad_norm": 1.9287621974945068, "learning_rate": 4.652541162960319e-05, "loss": 1.0214, "step": 223900 }, { "epoch": 3.476155744192182, "grad_norm": 2.2961585521698, "learning_rate": 4.652385977436025e-05, "loss": 0.9991, "step": 224000 }, { "epoch": 3.4777075994351248, "grad_norm": 2.4120821952819824, "learning_rate": 4.652230791911731e-05, "loss": 1.0176, "step": 224100 }, { "epoch": 3.4792594546780675, "grad_norm": 2.458669424057007, "learning_rate": 4.6520756063874365e-05, "loss": 1.0125, "step": 224200 }, { "epoch": 3.4808113099210107, "grad_norm": 1.9512863159179688, "learning_rate": 4.651920420863142e-05, "loss": 1.0156, "step": 224300 }, { "epoch": 3.4823631651639535, "grad_norm": 2.217475414276123, "learning_rate": 4.651765235338848e-05, "loss": 1.0182, "step": 224400 }, { "epoch": 3.4839150204068963, "grad_norm": 2.7593281269073486, "learning_rate": 4.651610049814554e-05, "loss": 1.0153, "step": 224500 }, { "epoch": 3.4854668756498395, "grad_norm": 2.3587729930877686, "learning_rate": 4.6514548642902596e-05, "loss": 1.027, "step": 224600 }, { "epoch": 3.4870187308927822, "grad_norm": 2.3511438369750977, "learning_rate": 4.6512996787659654e-05, "loss": 1.0253, "step": 224700 }, { "epoch": 3.488570586135725, "grad_norm": 2.1176085472106934, "learning_rate": 4.651144493241671e-05, "loss": 1.0204, "step": 224800 }, { "epoch": 3.4901224413786682, "grad_norm": 2.0496795177459717, "learning_rate": 4.650989307717377e-05, "loss": 1.0193, "step": 224900 }, { "epoch": 3.491674296621611, "grad_norm": 2.0768377780914307, "learning_rate": 4.650834122193082e-05, "loss": 1.0156, "step": 225000 }, { "epoch": 3.493226151864554, "grad_norm": 2.1627848148345947, "learning_rate": 4.650678936668788e-05, "loss": 1.0214, "step": 225100 }, { "epoch": 3.494778007107497, "grad_norm": 2.2670319080352783, "learning_rate": 4.6505237511444935e-05, "loss": 1.0252, "step": 225200 }, { "epoch": 3.49632986235044, "grad_norm": 2.3662946224212646, "learning_rate": 4.6503685656201986e-05, "loss": 1.0254, "step": 225300 }, { "epoch": 3.497881717593383, "grad_norm": 2.097546100616455, "learning_rate": 4.6502133800959044e-05, "loss": 1.0247, "step": 225400 }, { "epoch": 3.4994335728363257, "grad_norm": 2.059945821762085, "learning_rate": 4.65005819457161e-05, "loss": 1.0339, "step": 225500 }, { "epoch": 3.500985428079269, "grad_norm": 1.7830817699432373, "learning_rate": 4.649903009047316e-05, "loss": 1.0145, "step": 225600 }, { "epoch": 3.5025372833222117, "grad_norm": 2.035282611846924, "learning_rate": 4.649747823523022e-05, "loss": 0.9986, "step": 225700 }, { "epoch": 3.5040891385651545, "grad_norm": 2.0063650608062744, "learning_rate": 4.6495926379987275e-05, "loss": 1.0023, "step": 225800 }, { "epoch": 3.5056409938080977, "grad_norm": 2.157651901245117, "learning_rate": 4.649437452474433e-05, "loss": 1.0343, "step": 225900 }, { "epoch": 3.5071928490510405, "grad_norm": 2.2376537322998047, "learning_rate": 4.649282266950139e-05, "loss": 1.0417, "step": 226000 }, { "epoch": 3.508744704293983, "grad_norm": 2.8245861530303955, "learning_rate": 4.649127081425845e-05, "loss": 1.007, "step": 226100 }, { "epoch": 3.5102965595369264, "grad_norm": 2.3284294605255127, "learning_rate": 4.6489718959015506e-05, "loss": 1.0312, "step": 226200 }, { "epoch": 3.511848414779869, "grad_norm": 2.2699787616729736, "learning_rate": 4.6488167103772564e-05, "loss": 1.0055, "step": 226300 }, { "epoch": 3.5134002700228124, "grad_norm": 2.213945150375366, "learning_rate": 4.648661524852962e-05, "loss": 1.0086, "step": 226400 }, { "epoch": 3.514952125265755, "grad_norm": 1.8312371969223022, "learning_rate": 4.648506339328668e-05, "loss": 1.0208, "step": 226500 }, { "epoch": 3.5165039805086984, "grad_norm": 2.0561037063598633, "learning_rate": 4.648351153804373e-05, "loss": 1.0229, "step": 226600 }, { "epoch": 3.518055835751641, "grad_norm": 2.158634662628174, "learning_rate": 4.648195968280079e-05, "loss": 1.025, "step": 226700 }, { "epoch": 3.519607690994584, "grad_norm": 2.0920844078063965, "learning_rate": 4.6480407827557846e-05, "loss": 1.0109, "step": 226800 }, { "epoch": 3.521159546237527, "grad_norm": 2.116969585418701, "learning_rate": 4.6478855972314904e-05, "loss": 1.0109, "step": 226900 }, { "epoch": 3.52271140148047, "grad_norm": 2.1268246173858643, "learning_rate": 4.647730411707196e-05, "loss": 1.0194, "step": 227000 }, { "epoch": 3.5242632567234127, "grad_norm": 2.363027811050415, "learning_rate": 4.647575226182902e-05, "loss": 1.0197, "step": 227100 }, { "epoch": 3.525815111966356, "grad_norm": 2.2287521362304688, "learning_rate": 4.647420040658608e-05, "loss": 1.0287, "step": 227200 }, { "epoch": 3.5273669672092987, "grad_norm": 2.2710227966308594, "learning_rate": 4.6472648551343135e-05, "loss": 1.0459, "step": 227300 }, { "epoch": 3.5289188224522414, "grad_norm": 2.393834352493286, "learning_rate": 4.647109669610019e-05, "loss": 1.0146, "step": 227400 }, { "epoch": 3.5304706776951846, "grad_norm": 2.3930397033691406, "learning_rate": 4.646954484085725e-05, "loss": 1.0115, "step": 227500 }, { "epoch": 3.5320225329381274, "grad_norm": 2.3626060485839844, "learning_rate": 4.646799298561431e-05, "loss": 1.0267, "step": 227600 }, { "epoch": 3.5335743881810706, "grad_norm": 1.9492485523223877, "learning_rate": 4.6466441130371366e-05, "loss": 1.0082, "step": 227700 }, { "epoch": 3.5351262434240134, "grad_norm": 2.1411900520324707, "learning_rate": 4.6464889275128424e-05, "loss": 1.0173, "step": 227800 }, { "epoch": 3.5366780986669566, "grad_norm": 2.3113627433776855, "learning_rate": 4.6463337419885474e-05, "loss": 1.0376, "step": 227900 }, { "epoch": 3.5382299539098994, "grad_norm": 2.434312105178833, "learning_rate": 4.646178556464253e-05, "loss": 1.0231, "step": 228000 }, { "epoch": 3.539781809152842, "grad_norm": 2.245375871658325, "learning_rate": 4.646023370939959e-05, "loss": 1.0328, "step": 228100 }, { "epoch": 3.5413336643957853, "grad_norm": 2.077577829360962, "learning_rate": 4.645868185415664e-05, "loss": 1.0284, "step": 228200 }, { "epoch": 3.542885519638728, "grad_norm": 2.1776371002197266, "learning_rate": 4.64571299989137e-05, "loss": 1.0213, "step": 228300 }, { "epoch": 3.544437374881671, "grad_norm": 2.1184732913970947, "learning_rate": 4.6455578143670756e-05, "loss": 1.0225, "step": 228400 }, { "epoch": 3.545989230124614, "grad_norm": 2.2448039054870605, "learning_rate": 4.6454026288427814e-05, "loss": 1.0352, "step": 228500 }, { "epoch": 3.547541085367557, "grad_norm": 2.0606822967529297, "learning_rate": 4.645247443318487e-05, "loss": 1.0368, "step": 228600 }, { "epoch": 3.5490929406104996, "grad_norm": 2.523189067840576, "learning_rate": 4.645092257794193e-05, "loss": 1.0158, "step": 228700 }, { "epoch": 3.550644795853443, "grad_norm": 2.3495540618896484, "learning_rate": 4.644937072269899e-05, "loss": 1.0067, "step": 228800 }, { "epoch": 3.5521966510963856, "grad_norm": 2.143821954727173, "learning_rate": 4.6447818867456045e-05, "loss": 1.0398, "step": 228900 }, { "epoch": 3.553748506339329, "grad_norm": 2.1730520725250244, "learning_rate": 4.64462670122131e-05, "loss": 1.0001, "step": 229000 }, { "epoch": 3.5553003615822716, "grad_norm": 2.2635035514831543, "learning_rate": 4.644471515697016e-05, "loss": 1.041, "step": 229100 }, { "epoch": 3.556852216825215, "grad_norm": 2.173370838165283, "learning_rate": 4.644316330172722e-05, "loss": 1.0091, "step": 229200 }, { "epoch": 3.5584040720681576, "grad_norm": 2.0543220043182373, "learning_rate": 4.6441611446484276e-05, "loss": 1.0162, "step": 229300 }, { "epoch": 3.5599559273111003, "grad_norm": 2.084263563156128, "learning_rate": 4.644005959124133e-05, "loss": 1.0374, "step": 229400 }, { "epoch": 3.5615077825540435, "grad_norm": 2.394080877304077, "learning_rate": 4.6438507735998385e-05, "loss": 1.006, "step": 229500 }, { "epoch": 3.5630596377969863, "grad_norm": 2.039065361022949, "learning_rate": 4.643695588075544e-05, "loss": 1.0138, "step": 229600 }, { "epoch": 3.564611493039929, "grad_norm": 1.911515712738037, "learning_rate": 4.64354040255125e-05, "loss": 1.0535, "step": 229700 }, { "epoch": 3.5661633482828723, "grad_norm": 2.2423110008239746, "learning_rate": 4.643385217026956e-05, "loss": 1.0588, "step": 229800 }, { "epoch": 3.567715203525815, "grad_norm": 1.9198731184005737, "learning_rate": 4.6432300315026616e-05, "loss": 1.0115, "step": 229900 }, { "epoch": 3.569267058768758, "grad_norm": 2.2048838138580322, "learning_rate": 4.6430748459783674e-05, "loss": 1.0021, "step": 230000 }, { "epoch": 3.570818914011701, "grad_norm": 2.0658535957336426, "learning_rate": 4.642919660454073e-05, "loss": 1.0241, "step": 230100 }, { "epoch": 3.572370769254644, "grad_norm": 2.2744975090026855, "learning_rate": 4.642764474929779e-05, "loss": 0.9956, "step": 230200 }, { "epoch": 3.573922624497587, "grad_norm": 2.0728373527526855, "learning_rate": 4.642609289405485e-05, "loss": 0.9953, "step": 230300 }, { "epoch": 3.57547447974053, "grad_norm": 2.233289957046509, "learning_rate": 4.6424541038811905e-05, "loss": 1.0301, "step": 230400 }, { "epoch": 3.577026334983473, "grad_norm": 1.983513593673706, "learning_rate": 4.642298918356896e-05, "loss": 1.0154, "step": 230500 }, { "epoch": 3.5785781902264158, "grad_norm": 2.523077964782715, "learning_rate": 4.642143732832602e-05, "loss": 1.0509, "step": 230600 }, { "epoch": 3.5801300454693585, "grad_norm": 2.6744301319122314, "learning_rate": 4.641988547308307e-05, "loss": 1.0164, "step": 230700 }, { "epoch": 3.5816819007123017, "grad_norm": 2.7268240451812744, "learning_rate": 4.641833361784013e-05, "loss": 1.0061, "step": 230800 }, { "epoch": 3.5832337559552445, "grad_norm": 1.968333125114441, "learning_rate": 4.641678176259719e-05, "loss": 1.0106, "step": 230900 }, { "epoch": 3.5847856111981873, "grad_norm": 2.2497141361236572, "learning_rate": 4.6415229907354244e-05, "loss": 1.0256, "step": 231000 }, { "epoch": 3.5863374664411305, "grad_norm": 1.9975284337997437, "learning_rate": 4.64136780521113e-05, "loss": 1.0294, "step": 231100 }, { "epoch": 3.5878893216840733, "grad_norm": 2.34073543548584, "learning_rate": 4.641212619686836e-05, "loss": 1.0312, "step": 231200 }, { "epoch": 3.589441176927016, "grad_norm": 2.346445083618164, "learning_rate": 4.641057434162542e-05, "loss": 1.0243, "step": 231300 }, { "epoch": 3.5909930321699592, "grad_norm": 2.0903470516204834, "learning_rate": 4.6409022486382475e-05, "loss": 1.0323, "step": 231400 }, { "epoch": 3.592544887412902, "grad_norm": 2.795607328414917, "learning_rate": 4.6407470631139526e-05, "loss": 1.02, "step": 231500 }, { "epoch": 3.5940967426558452, "grad_norm": 1.859204888343811, "learning_rate": 4.6405918775896584e-05, "loss": 1.0239, "step": 231600 }, { "epoch": 3.595648597898788, "grad_norm": 2.015591621398926, "learning_rate": 4.640436692065364e-05, "loss": 1.0566, "step": 231700 }, { "epoch": 3.597200453141731, "grad_norm": 2.3207833766937256, "learning_rate": 4.64028150654107e-05, "loss": 1.0178, "step": 231800 }, { "epoch": 3.598752308384674, "grad_norm": 2.4403696060180664, "learning_rate": 4.640126321016776e-05, "loss": 1.0204, "step": 231900 }, { "epoch": 3.6003041636276167, "grad_norm": 2.2235662937164307, "learning_rate": 4.6399711354924815e-05, "loss": 1.0201, "step": 232000 }, { "epoch": 3.60185601887056, "grad_norm": 2.2444968223571777, "learning_rate": 4.639815949968187e-05, "loss": 1.0405, "step": 232100 }, { "epoch": 3.6034078741135027, "grad_norm": 2.5562655925750732, "learning_rate": 4.639660764443893e-05, "loss": 1.0084, "step": 232200 }, { "epoch": 3.6049597293564455, "grad_norm": 2.244760036468506, "learning_rate": 4.639505578919598e-05, "loss": 1.0146, "step": 232300 }, { "epoch": 3.6065115845993887, "grad_norm": 2.4502768516540527, "learning_rate": 4.639350393395304e-05, "loss": 1.0334, "step": 232400 }, { "epoch": 3.6080634398423315, "grad_norm": 1.9223589897155762, "learning_rate": 4.63919520787101e-05, "loss": 1.0364, "step": 232500 }, { "epoch": 3.6096152950852742, "grad_norm": 2.419853448867798, "learning_rate": 4.6390400223467155e-05, "loss": 1.0369, "step": 232600 }, { "epoch": 3.6111671503282174, "grad_norm": 1.8432646989822388, "learning_rate": 4.638884836822421e-05, "loss": 1.0242, "step": 232700 }, { "epoch": 3.61271900557116, "grad_norm": 2.2077646255493164, "learning_rate": 4.638729651298127e-05, "loss": 0.9861, "step": 232800 }, { "epoch": 3.6142708608141034, "grad_norm": 2.41416597366333, "learning_rate": 4.638574465773833e-05, "loss": 1.0404, "step": 232900 }, { "epoch": 3.615822716057046, "grad_norm": 2.2876474857330322, "learning_rate": 4.6384192802495386e-05, "loss": 1.0186, "step": 233000 }, { "epoch": 3.6173745712999894, "grad_norm": 2.0390963554382324, "learning_rate": 4.6382640947252444e-05, "loss": 1.0331, "step": 233100 }, { "epoch": 3.618926426542932, "grad_norm": 2.240492105484009, "learning_rate": 4.63810890920095e-05, "loss": 1.0172, "step": 233200 }, { "epoch": 3.620478281785875, "grad_norm": 2.3397746086120605, "learning_rate": 4.637953723676656e-05, "loss": 0.999, "step": 233300 }, { "epoch": 3.622030137028818, "grad_norm": 2.0867655277252197, "learning_rate": 4.637798538152362e-05, "loss": 1.0126, "step": 233400 }, { "epoch": 3.623581992271761, "grad_norm": 2.0293049812316895, "learning_rate": 4.6376433526280675e-05, "loss": 0.9968, "step": 233500 }, { "epoch": 3.6251338475147037, "grad_norm": 2.438554286956787, "learning_rate": 4.6374881671037726e-05, "loss": 1.0078, "step": 233600 }, { "epoch": 3.626685702757647, "grad_norm": 2.5825388431549072, "learning_rate": 4.6373329815794783e-05, "loss": 1.0145, "step": 233700 }, { "epoch": 3.6282375580005897, "grad_norm": 2.400360107421875, "learning_rate": 4.637177796055184e-05, "loss": 1.0311, "step": 233800 }, { "epoch": 3.6297894132435324, "grad_norm": 2.0948238372802734, "learning_rate": 4.63702261053089e-05, "loss": 1.0182, "step": 233900 }, { "epoch": 3.6313412684864756, "grad_norm": 1.9506988525390625, "learning_rate": 4.636867425006596e-05, "loss": 1.0212, "step": 234000 }, { "epoch": 3.6328931237294184, "grad_norm": 2.3656206130981445, "learning_rate": 4.6367122394823014e-05, "loss": 1.015, "step": 234100 }, { "epoch": 3.634444978972361, "grad_norm": 2.086054801940918, "learning_rate": 4.636557053958007e-05, "loss": 1.0172, "step": 234200 }, { "epoch": 3.6359968342153044, "grad_norm": 1.9722099304199219, "learning_rate": 4.636401868433713e-05, "loss": 1.016, "step": 234300 }, { "epoch": 3.6375486894582476, "grad_norm": 2.556325912475586, "learning_rate": 4.636246682909419e-05, "loss": 1.0331, "step": 234400 }, { "epoch": 3.6391005447011904, "grad_norm": 1.857521414756775, "learning_rate": 4.6360914973851245e-05, "loss": 0.9999, "step": 234500 }, { "epoch": 3.640652399944133, "grad_norm": 2.031285047531128, "learning_rate": 4.63593631186083e-05, "loss": 1.001, "step": 234600 }, { "epoch": 3.6422042551870764, "grad_norm": 2.178656816482544, "learning_rate": 4.6357811263365354e-05, "loss": 1.0213, "step": 234700 }, { "epoch": 3.643756110430019, "grad_norm": 2.235032081604004, "learning_rate": 4.635625940812241e-05, "loss": 1.032, "step": 234800 }, { "epoch": 3.645307965672962, "grad_norm": 2.5598344802856445, "learning_rate": 4.635470755287947e-05, "loss": 1.0078, "step": 234900 }, { "epoch": 3.646859820915905, "grad_norm": 2.3724629878997803, "learning_rate": 4.635315569763653e-05, "loss": 1.035, "step": 235000 }, { "epoch": 3.648411676158848, "grad_norm": 2.4333388805389404, "learning_rate": 4.635160384239358e-05, "loss": 0.9835, "step": 235100 }, { "epoch": 3.6499635314017906, "grad_norm": 1.9867881536483765, "learning_rate": 4.6350051987150636e-05, "loss": 1.0154, "step": 235200 }, { "epoch": 3.651515386644734, "grad_norm": 2.417407751083374, "learning_rate": 4.6348500131907694e-05, "loss": 1.019, "step": 235300 }, { "epoch": 3.6530672418876766, "grad_norm": 2.040393590927124, "learning_rate": 4.634694827666475e-05, "loss": 1.0201, "step": 235400 }, { "epoch": 3.6546190971306194, "grad_norm": 2.0924699306488037, "learning_rate": 4.634539642142181e-05, "loss": 1.0296, "step": 235500 }, { "epoch": 3.6561709523735626, "grad_norm": 2.062406301498413, "learning_rate": 4.634384456617887e-05, "loss": 1.0092, "step": 235600 }, { "epoch": 3.6577228076165054, "grad_norm": 2.349290132522583, "learning_rate": 4.6342292710935925e-05, "loss": 1.0281, "step": 235700 }, { "epoch": 3.6592746628594486, "grad_norm": 2.181644916534424, "learning_rate": 4.634074085569298e-05, "loss": 1.0143, "step": 235800 }, { "epoch": 3.6608265181023913, "grad_norm": 2.0875437259674072, "learning_rate": 4.633918900045004e-05, "loss": 1.0099, "step": 235900 }, { "epoch": 3.6623783733453346, "grad_norm": 2.418869733810425, "learning_rate": 4.63376371452071e-05, "loss": 1.0195, "step": 236000 }, { "epoch": 3.6639302285882773, "grad_norm": 2.2838144302368164, "learning_rate": 4.6336085289964156e-05, "loss": 1.0136, "step": 236100 }, { "epoch": 3.66548208383122, "grad_norm": 2.2078473567962646, "learning_rate": 4.6334533434721214e-05, "loss": 1.022, "step": 236200 }, { "epoch": 3.6670339390741633, "grad_norm": 2.5008809566497803, "learning_rate": 4.633298157947827e-05, "loss": 1.0229, "step": 236300 }, { "epoch": 3.668585794317106, "grad_norm": 2.4339613914489746, "learning_rate": 4.633142972423532e-05, "loss": 1.0245, "step": 236400 }, { "epoch": 3.670137649560049, "grad_norm": 2.0810017585754395, "learning_rate": 4.632987786899238e-05, "loss": 1.0142, "step": 236500 }, { "epoch": 3.671689504802992, "grad_norm": 2.1632440090179443, "learning_rate": 4.632832601374944e-05, "loss": 1.0463, "step": 236600 }, { "epoch": 3.673241360045935, "grad_norm": 2.486035108566284, "learning_rate": 4.6326774158506496e-05, "loss": 1.0124, "step": 236700 }, { "epoch": 3.6747932152888776, "grad_norm": 2.093404769897461, "learning_rate": 4.6325222303263553e-05, "loss": 0.9985, "step": 236800 }, { "epoch": 3.676345070531821, "grad_norm": 2.04958176612854, "learning_rate": 4.632367044802061e-05, "loss": 0.996, "step": 236900 }, { "epoch": 3.6778969257747636, "grad_norm": 1.9408975839614868, "learning_rate": 4.632211859277767e-05, "loss": 1.0338, "step": 237000 }, { "epoch": 3.679448781017707, "grad_norm": 2.4448330402374268, "learning_rate": 4.632056673753473e-05, "loss": 1.0033, "step": 237100 }, { "epoch": 3.6810006362606496, "grad_norm": 2.017882823944092, "learning_rate": 4.6319014882291784e-05, "loss": 1.0175, "step": 237200 }, { "epoch": 3.6825524915035928, "grad_norm": 2.3582608699798584, "learning_rate": 4.631746302704884e-05, "loss": 1.0086, "step": 237300 }, { "epoch": 3.6841043467465355, "grad_norm": 2.182378053665161, "learning_rate": 4.63159111718059e-05, "loss": 1.01, "step": 237400 }, { "epoch": 3.6856562019894783, "grad_norm": 2.016113519668579, "learning_rate": 4.631435931656296e-05, "loss": 1.0044, "step": 237500 }, { "epoch": 3.6872080572324215, "grad_norm": 3.188378095626831, "learning_rate": 4.6312807461320015e-05, "loss": 1.0112, "step": 237600 }, { "epoch": 3.6887599124753643, "grad_norm": 2.140146017074585, "learning_rate": 4.6311255606077066e-05, "loss": 1.0087, "step": 237700 }, { "epoch": 3.690311767718307, "grad_norm": 2.254169464111328, "learning_rate": 4.6309703750834124e-05, "loss": 1.0109, "step": 237800 }, { "epoch": 3.6918636229612503, "grad_norm": 1.9383188486099243, "learning_rate": 4.630815189559118e-05, "loss": 1.0115, "step": 237900 }, { "epoch": 3.693415478204193, "grad_norm": 2.0068345069885254, "learning_rate": 4.630660004034823e-05, "loss": 1.0154, "step": 238000 }, { "epoch": 3.694967333447136, "grad_norm": 1.9899184703826904, "learning_rate": 4.630504818510529e-05, "loss": 1.0055, "step": 238100 }, { "epoch": 3.696519188690079, "grad_norm": 2.1594367027282715, "learning_rate": 4.630349632986235e-05, "loss": 0.9956, "step": 238200 }, { "epoch": 3.6980710439330218, "grad_norm": 2.3453996181488037, "learning_rate": 4.6301944474619406e-05, "loss": 1.0253, "step": 238300 }, { "epoch": 3.699622899175965, "grad_norm": 1.785625696182251, "learning_rate": 4.6300392619376464e-05, "loss": 1.0078, "step": 238400 }, { "epoch": 3.7011747544189078, "grad_norm": 2.0943689346313477, "learning_rate": 4.629884076413352e-05, "loss": 0.9961, "step": 238500 }, { "epoch": 3.702726609661851, "grad_norm": 1.874801754951477, "learning_rate": 4.629728890889058e-05, "loss": 1.0289, "step": 238600 }, { "epoch": 3.7042784649047937, "grad_norm": 2.128554105758667, "learning_rate": 4.629573705364764e-05, "loss": 1.0201, "step": 238700 }, { "epoch": 3.7058303201477365, "grad_norm": 2.1019725799560547, "learning_rate": 4.6294185198404695e-05, "loss": 1.013, "step": 238800 }, { "epoch": 3.7073821753906797, "grad_norm": 1.6486116647720337, "learning_rate": 4.629263334316175e-05, "loss": 1.0029, "step": 238900 }, { "epoch": 3.7089340306336225, "grad_norm": 1.9009175300598145, "learning_rate": 4.629108148791881e-05, "loss": 1.0143, "step": 239000 }, { "epoch": 3.7104858858765652, "grad_norm": 2.5965166091918945, "learning_rate": 4.628952963267587e-05, "loss": 1.0241, "step": 239100 }, { "epoch": 3.7120377411195085, "grad_norm": 2.3161373138427734, "learning_rate": 4.6287977777432926e-05, "loss": 1.0172, "step": 239200 }, { "epoch": 3.7135895963624512, "grad_norm": 1.735445261001587, "learning_rate": 4.628642592218998e-05, "loss": 0.9958, "step": 239300 }, { "epoch": 3.715141451605394, "grad_norm": 2.472639799118042, "learning_rate": 4.6284874066947035e-05, "loss": 1.0172, "step": 239400 }, { "epoch": 3.716693306848337, "grad_norm": 2.109117031097412, "learning_rate": 4.628332221170409e-05, "loss": 1.0195, "step": 239500 }, { "epoch": 3.71824516209128, "grad_norm": 2.0460574626922607, "learning_rate": 4.628177035646115e-05, "loss": 1.004, "step": 239600 }, { "epoch": 3.719797017334223, "grad_norm": 2.4595510959625244, "learning_rate": 4.628021850121821e-05, "loss": 0.9977, "step": 239700 }, { "epoch": 3.721348872577166, "grad_norm": 2.2073729038238525, "learning_rate": 4.6278666645975266e-05, "loss": 1.0023, "step": 239800 }, { "epoch": 3.722900727820109, "grad_norm": 2.154287815093994, "learning_rate": 4.6277114790732323e-05, "loss": 1.0148, "step": 239900 }, { "epoch": 3.724452583063052, "grad_norm": 2.0740292072296143, "learning_rate": 4.627556293548938e-05, "loss": 1.0131, "step": 240000 }, { "epoch": 3.7260044383059947, "grad_norm": 2.6240620613098145, "learning_rate": 4.627401108024644e-05, "loss": 1.0124, "step": 240100 }, { "epoch": 3.727556293548938, "grad_norm": 1.9416288137435913, "learning_rate": 4.62724592250035e-05, "loss": 1.0119, "step": 240200 }, { "epoch": 3.7291081487918807, "grad_norm": 2.1373040676116943, "learning_rate": 4.6270907369760554e-05, "loss": 1.009, "step": 240300 }, { "epoch": 3.7306600040348235, "grad_norm": 1.6925129890441895, "learning_rate": 4.626935551451761e-05, "loss": 1.0065, "step": 240400 }, { "epoch": 3.7322118592777667, "grad_norm": 2.3495755195617676, "learning_rate": 4.626780365927467e-05, "loss": 1.0094, "step": 240500 }, { "epoch": 3.7337637145207094, "grad_norm": 2.175574779510498, "learning_rate": 4.626625180403172e-05, "loss": 1.036, "step": 240600 }, { "epoch": 3.735315569763652, "grad_norm": 2.241285562515259, "learning_rate": 4.626469994878878e-05, "loss": 1.0111, "step": 240700 }, { "epoch": 3.7368674250065954, "grad_norm": 2.4279415607452393, "learning_rate": 4.6263148093545836e-05, "loss": 1.007, "step": 240800 }, { "epoch": 3.738419280249538, "grad_norm": 2.5062997341156006, "learning_rate": 4.6261596238302894e-05, "loss": 1.0106, "step": 240900 }, { "epoch": 3.7399711354924814, "grad_norm": 1.7721853256225586, "learning_rate": 4.626004438305995e-05, "loss": 1.0039, "step": 241000 }, { "epoch": 3.741522990735424, "grad_norm": 2.4409706592559814, "learning_rate": 4.625849252781701e-05, "loss": 0.9988, "step": 241100 }, { "epoch": 3.7430748459783674, "grad_norm": 2.2143893241882324, "learning_rate": 4.625694067257407e-05, "loss": 1.0211, "step": 241200 }, { "epoch": 3.74462670122131, "grad_norm": 2.529294490814209, "learning_rate": 4.625538881733112e-05, "loss": 1.0369, "step": 241300 }, { "epoch": 3.746178556464253, "grad_norm": 2.6420693397521973, "learning_rate": 4.6253836962088176e-05, "loss": 1.0154, "step": 241400 }, { "epoch": 3.747730411707196, "grad_norm": 2.7493693828582764, "learning_rate": 4.6252285106845234e-05, "loss": 0.9988, "step": 241500 }, { "epoch": 3.749282266950139, "grad_norm": 2.006983757019043, "learning_rate": 4.625073325160229e-05, "loss": 1.0263, "step": 241600 }, { "epoch": 3.7508341221930817, "grad_norm": 2.709113597869873, "learning_rate": 4.624918139635935e-05, "loss": 1.0013, "step": 241700 }, { "epoch": 3.752385977436025, "grad_norm": 2.125389575958252, "learning_rate": 4.624762954111641e-05, "loss": 1.0224, "step": 241800 }, { "epoch": 3.7539378326789676, "grad_norm": 2.0216705799102783, "learning_rate": 4.6246077685873465e-05, "loss": 1.0338, "step": 241900 }, { "epoch": 3.7554896879219104, "grad_norm": 2.261094093322754, "learning_rate": 4.624452583063052e-05, "loss": 1.0214, "step": 242000 }, { "epoch": 3.7570415431648536, "grad_norm": 2.273343086242676, "learning_rate": 4.6242973975387574e-05, "loss": 1.0174, "step": 242100 }, { "epoch": 3.7585933984077964, "grad_norm": 2.352085828781128, "learning_rate": 4.624142212014463e-05, "loss": 1.0171, "step": 242200 }, { "epoch": 3.7601452536507396, "grad_norm": 2.17459774017334, "learning_rate": 4.623987026490169e-05, "loss": 1.0293, "step": 242300 }, { "epoch": 3.7616971088936824, "grad_norm": 2.500917434692383, "learning_rate": 4.623831840965875e-05, "loss": 1.0251, "step": 242400 }, { "epoch": 3.7632489641366256, "grad_norm": 2.330687999725342, "learning_rate": 4.6236766554415805e-05, "loss": 1.0277, "step": 242500 }, { "epoch": 3.7648008193795683, "grad_norm": 2.0285775661468506, "learning_rate": 4.623521469917286e-05, "loss": 1.0147, "step": 242600 }, { "epoch": 3.766352674622511, "grad_norm": 2.1773195266723633, "learning_rate": 4.623366284392992e-05, "loss": 1.0226, "step": 242700 }, { "epoch": 3.7679045298654543, "grad_norm": 2.0629265308380127, "learning_rate": 4.623211098868698e-05, "loss": 0.9957, "step": 242800 }, { "epoch": 3.769456385108397, "grad_norm": 2.370910882949829, "learning_rate": 4.6230559133444036e-05, "loss": 0.9989, "step": 242900 }, { "epoch": 3.77100824035134, "grad_norm": 2.348848819732666, "learning_rate": 4.6229007278201093e-05, "loss": 1.0077, "step": 243000 }, { "epoch": 3.772560095594283, "grad_norm": 2.2534232139587402, "learning_rate": 4.622745542295815e-05, "loss": 1.0189, "step": 243100 }, { "epoch": 3.774111950837226, "grad_norm": 2.028949022293091, "learning_rate": 4.622590356771521e-05, "loss": 1.0374, "step": 243200 }, { "epoch": 3.7756638060801686, "grad_norm": 2.0232532024383545, "learning_rate": 4.622435171247227e-05, "loss": 1.0097, "step": 243300 }, { "epoch": 3.777215661323112, "grad_norm": 1.9040672779083252, "learning_rate": 4.622279985722932e-05, "loss": 1.0093, "step": 243400 }, { "epoch": 3.7787675165660546, "grad_norm": 2.019620418548584, "learning_rate": 4.6221248001986375e-05, "loss": 1.0186, "step": 243500 }, { "epoch": 3.780319371808998, "grad_norm": 3.3194851875305176, "learning_rate": 4.621969614674343e-05, "loss": 1.0146, "step": 243600 }, { "epoch": 3.7818712270519406, "grad_norm": 2.1685357093811035, "learning_rate": 4.621814429150049e-05, "loss": 0.9949, "step": 243700 }, { "epoch": 3.7834230822948838, "grad_norm": 2.325676202774048, "learning_rate": 4.621659243625755e-05, "loss": 1.0073, "step": 243800 }, { "epoch": 3.7849749375378265, "grad_norm": 2.2917325496673584, "learning_rate": 4.6215040581014606e-05, "loss": 1.0292, "step": 243900 }, { "epoch": 3.7865267927807693, "grad_norm": 2.2997236251831055, "learning_rate": 4.6213488725771664e-05, "loss": 1.0146, "step": 244000 }, { "epoch": 3.7880786480237125, "grad_norm": 1.9001855850219727, "learning_rate": 4.621193687052872e-05, "loss": 1.0333, "step": 244100 }, { "epoch": 3.7896305032666553, "grad_norm": 1.9370079040527344, "learning_rate": 4.621038501528578e-05, "loss": 1.0064, "step": 244200 }, { "epoch": 3.791182358509598, "grad_norm": 1.9764480590820312, "learning_rate": 4.620883316004284e-05, "loss": 1.0031, "step": 244300 }, { "epoch": 3.7927342137525413, "grad_norm": 2.7530527114868164, "learning_rate": 4.6207281304799895e-05, "loss": 1.0009, "step": 244400 }, { "epoch": 3.794286068995484, "grad_norm": 2.693455696105957, "learning_rate": 4.6205729449556946e-05, "loss": 1.007, "step": 244500 }, { "epoch": 3.795837924238427, "grad_norm": 2.3306798934936523, "learning_rate": 4.6204177594314004e-05, "loss": 0.9902, "step": 244600 }, { "epoch": 3.79738977948137, "grad_norm": 1.8344670534133911, "learning_rate": 4.620262573907106e-05, "loss": 1.0159, "step": 244700 }, { "epoch": 3.798941634724313, "grad_norm": 2.201124429702759, "learning_rate": 4.620107388382812e-05, "loss": 0.9813, "step": 244800 }, { "epoch": 3.800493489967256, "grad_norm": 2.0185108184814453, "learning_rate": 4.619952202858517e-05, "loss": 1.0284, "step": 244900 }, { "epoch": 3.8020453452101988, "grad_norm": 2.5429863929748535, "learning_rate": 4.619797017334223e-05, "loss": 1.001, "step": 245000 }, { "epoch": 3.803597200453142, "grad_norm": 2.29585337638855, "learning_rate": 4.6196418318099286e-05, "loss": 1.0062, "step": 245100 }, { "epoch": 3.8051490556960847, "grad_norm": 2.349848747253418, "learning_rate": 4.6194866462856344e-05, "loss": 1.0047, "step": 245200 }, { "epoch": 3.8067009109390275, "grad_norm": 2.093092918395996, "learning_rate": 4.61933146076134e-05, "loss": 1.0143, "step": 245300 }, { "epoch": 3.8082527661819707, "grad_norm": 1.997149109840393, "learning_rate": 4.619176275237046e-05, "loss": 1.0163, "step": 245400 }, { "epoch": 3.8098046214249135, "grad_norm": 2.0725667476654053, "learning_rate": 4.619021089712752e-05, "loss": 1.0231, "step": 245500 }, { "epoch": 3.8113564766678563, "grad_norm": 2.260093927383423, "learning_rate": 4.6188659041884575e-05, "loss": 1.0143, "step": 245600 }, { "epoch": 3.8129083319107995, "grad_norm": 2.2428982257843018, "learning_rate": 4.618710718664163e-05, "loss": 1.0054, "step": 245700 }, { "epoch": 3.8144601871537422, "grad_norm": 2.4710702896118164, "learning_rate": 4.618555533139869e-05, "loss": 1.0173, "step": 245800 }, { "epoch": 3.816012042396685, "grad_norm": 1.989723563194275, "learning_rate": 4.618400347615575e-05, "loss": 1.0017, "step": 245900 }, { "epoch": 3.8175638976396282, "grad_norm": 2.8177998065948486, "learning_rate": 4.6182451620912806e-05, "loss": 0.9993, "step": 246000 }, { "epoch": 3.819115752882571, "grad_norm": 2.377488136291504, "learning_rate": 4.6180899765669863e-05, "loss": 1.0073, "step": 246100 }, { "epoch": 3.820667608125514, "grad_norm": 2.229889392852783, "learning_rate": 4.6179347910426914e-05, "loss": 1.0073, "step": 246200 }, { "epoch": 3.822219463368457, "grad_norm": 2.1102564334869385, "learning_rate": 4.617779605518397e-05, "loss": 1.0158, "step": 246300 }, { "epoch": 3.8237713186114, "grad_norm": 2.011847734451294, "learning_rate": 4.617624419994103e-05, "loss": 1.0139, "step": 246400 }, { "epoch": 3.825323173854343, "grad_norm": 2.3082711696624756, "learning_rate": 4.617469234469809e-05, "loss": 1.015, "step": 246500 }, { "epoch": 3.8268750290972857, "grad_norm": 2.7512242794036865, "learning_rate": 4.6173140489455145e-05, "loss": 1.0104, "step": 246600 }, { "epoch": 3.828426884340229, "grad_norm": 2.3987085819244385, "learning_rate": 4.61715886342122e-05, "loss": 1.023, "step": 246700 }, { "epoch": 3.8299787395831717, "grad_norm": 2.2878665924072266, "learning_rate": 4.617003677896926e-05, "loss": 1.0251, "step": 246800 }, { "epoch": 3.8315305948261145, "grad_norm": 2.284198522567749, "learning_rate": 4.616848492372632e-05, "loss": 1.0123, "step": 246900 }, { "epoch": 3.8330824500690577, "grad_norm": 2.4234066009521484, "learning_rate": 4.6166933068483376e-05, "loss": 1.0232, "step": 247000 }, { "epoch": 3.8346343053120004, "grad_norm": 2.310163974761963, "learning_rate": 4.6165381213240434e-05, "loss": 1.025, "step": 247100 }, { "epoch": 3.836186160554943, "grad_norm": 2.2584455013275146, "learning_rate": 4.616382935799749e-05, "loss": 1.0408, "step": 247200 }, { "epoch": 3.8377380157978864, "grad_norm": 1.9721930027008057, "learning_rate": 4.616227750275455e-05, "loss": 0.9959, "step": 247300 }, { "epoch": 3.839289871040829, "grad_norm": 2.072402000427246, "learning_rate": 4.616072564751161e-05, "loss": 1.0124, "step": 247400 }, { "epoch": 3.8408417262837724, "grad_norm": 2.4587271213531494, "learning_rate": 4.615917379226866e-05, "loss": 1.0163, "step": 247500 }, { "epoch": 3.842393581526715, "grad_norm": 2.407766103744507, "learning_rate": 4.6157621937025716e-05, "loss": 1.012, "step": 247600 }, { "epoch": 3.8439454367696584, "grad_norm": 2.135498285293579, "learning_rate": 4.6156070081782774e-05, "loss": 1.0076, "step": 247700 }, { "epoch": 3.845497292012601, "grad_norm": 2.279269218444824, "learning_rate": 4.6154518226539825e-05, "loss": 1.019, "step": 247800 }, { "epoch": 3.847049147255544, "grad_norm": 2.4332151412963867, "learning_rate": 4.615296637129688e-05, "loss": 1.0035, "step": 247900 }, { "epoch": 3.848601002498487, "grad_norm": 2.711205244064331, "learning_rate": 4.615141451605394e-05, "loss": 1.0026, "step": 248000 }, { "epoch": 3.85015285774143, "grad_norm": 1.9770572185516357, "learning_rate": 4.6149862660811e-05, "loss": 1.0148, "step": 248100 }, { "epoch": 3.8517047129843727, "grad_norm": 2.3384222984313965, "learning_rate": 4.6148310805568056e-05, "loss": 1.0205, "step": 248200 }, { "epoch": 3.853256568227316, "grad_norm": 2.100019931793213, "learning_rate": 4.6146758950325114e-05, "loss": 1.0121, "step": 248300 }, { "epoch": 3.8548084234702586, "grad_norm": 2.0259838104248047, "learning_rate": 4.614520709508217e-05, "loss": 1.0121, "step": 248400 }, { "epoch": 3.8563602787132014, "grad_norm": 2.5189080238342285, "learning_rate": 4.614365523983923e-05, "loss": 1.0095, "step": 248500 }, { "epoch": 3.8579121339561446, "grad_norm": 2.0332727432250977, "learning_rate": 4.614210338459629e-05, "loss": 1.0201, "step": 248600 }, { "epoch": 3.8594639891990874, "grad_norm": 2.2269084453582764, "learning_rate": 4.6140551529353345e-05, "loss": 1.0274, "step": 248700 }, { "epoch": 3.8610158444420306, "grad_norm": 1.881072759628296, "learning_rate": 4.61389996741104e-05, "loss": 1.0157, "step": 248800 }, { "epoch": 3.8625676996849734, "grad_norm": 2.1389148235321045, "learning_rate": 4.613744781886746e-05, "loss": 0.9943, "step": 248900 }, { "epoch": 3.8641195549279166, "grad_norm": 2.409604072570801, "learning_rate": 4.613589596362452e-05, "loss": 1.0225, "step": 249000 }, { "epoch": 3.8656714101708594, "grad_norm": 2.4702587127685547, "learning_rate": 4.613434410838157e-05, "loss": 0.9991, "step": 249100 }, { "epoch": 3.867223265413802, "grad_norm": 2.048389196395874, "learning_rate": 4.613279225313863e-05, "loss": 1.0358, "step": 249200 }, { "epoch": 3.8687751206567453, "grad_norm": 3.0865375995635986, "learning_rate": 4.6131240397895684e-05, "loss": 1.0141, "step": 249300 }, { "epoch": 3.870326975899688, "grad_norm": 2.165404796600342, "learning_rate": 4.612968854265274e-05, "loss": 1.0115, "step": 249400 }, { "epoch": 3.871878831142631, "grad_norm": 2.406437873840332, "learning_rate": 4.61281366874098e-05, "loss": 1.0022, "step": 249500 }, { "epoch": 3.873430686385574, "grad_norm": 2.142561912536621, "learning_rate": 4.612658483216686e-05, "loss": 1.0028, "step": 249600 }, { "epoch": 3.874982541628517, "grad_norm": 2.099102258682251, "learning_rate": 4.6125032976923915e-05, "loss": 1.0243, "step": 249700 }, { "epoch": 3.8765343968714596, "grad_norm": 1.9429491758346558, "learning_rate": 4.612348112168097e-05, "loss": 1.0147, "step": 249800 }, { "epoch": 3.878086252114403, "grad_norm": 2.4290895462036133, "learning_rate": 4.612192926643803e-05, "loss": 1.0241, "step": 249900 }, { "epoch": 3.8796381073573456, "grad_norm": 1.9997496604919434, "learning_rate": 4.612037741119509e-05, "loss": 1.007, "step": 250000 }, { "epoch": 3.8811899626002884, "grad_norm": 2.474494218826294, "learning_rate": 4.6118825555952146e-05, "loss": 0.992, "step": 250100 }, { "epoch": 3.8827418178432316, "grad_norm": 2.500391721725464, "learning_rate": 4.6117273700709204e-05, "loss": 1.0045, "step": 250200 }, { "epoch": 3.884293673086175, "grad_norm": 2.389751672744751, "learning_rate": 4.611572184546626e-05, "loss": 1.0108, "step": 250300 }, { "epoch": 3.8858455283291176, "grad_norm": 2.125541925430298, "learning_rate": 4.611416999022331e-05, "loss": 1.0037, "step": 250400 }, { "epoch": 3.8873973835720603, "grad_norm": 1.850882649421692, "learning_rate": 4.611261813498037e-05, "loss": 1.0017, "step": 250500 }, { "epoch": 3.8889492388150035, "grad_norm": 2.331622362136841, "learning_rate": 4.611106627973743e-05, "loss": 1.0239, "step": 250600 }, { "epoch": 3.8905010940579463, "grad_norm": 2.4492251873016357, "learning_rate": 4.6109514424494486e-05, "loss": 1.0074, "step": 250700 }, { "epoch": 3.892052949300889, "grad_norm": 2.2282145023345947, "learning_rate": 4.6107962569251544e-05, "loss": 1.0033, "step": 250800 }, { "epoch": 3.8936048045438323, "grad_norm": 2.3184473514556885, "learning_rate": 4.61064107140086e-05, "loss": 1.0039, "step": 250900 }, { "epoch": 3.895156659786775, "grad_norm": 2.4361066818237305, "learning_rate": 4.610485885876565e-05, "loss": 0.9914, "step": 251000 }, { "epoch": 3.896708515029718, "grad_norm": 2.44022798538208, "learning_rate": 4.610330700352271e-05, "loss": 1.0096, "step": 251100 }, { "epoch": 3.898260370272661, "grad_norm": 2.6742916107177734, "learning_rate": 4.610175514827977e-05, "loss": 1.0153, "step": 251200 }, { "epoch": 3.899812225515604, "grad_norm": 2.2265617847442627, "learning_rate": 4.6100203293036826e-05, "loss": 1.0184, "step": 251300 }, { "epoch": 3.9013640807585466, "grad_norm": 2.047741651535034, "learning_rate": 4.6098651437793884e-05, "loss": 0.9966, "step": 251400 }, { "epoch": 3.90291593600149, "grad_norm": 1.9516500234603882, "learning_rate": 4.609709958255094e-05, "loss": 1.021, "step": 251500 }, { "epoch": 3.904467791244433, "grad_norm": 2.1483633518218994, "learning_rate": 4.6095547727308e-05, "loss": 1.0045, "step": 251600 }, { "epoch": 3.9060196464873758, "grad_norm": 2.2886312007904053, "learning_rate": 4.609399587206506e-05, "loss": 1.0047, "step": 251700 }, { "epoch": 3.9075715017303185, "grad_norm": 2.082888603210449, "learning_rate": 4.6092444016822115e-05, "loss": 1.0131, "step": 251800 }, { "epoch": 3.9091233569732617, "grad_norm": 2.2767488956451416, "learning_rate": 4.6090892161579166e-05, "loss": 0.9936, "step": 251900 }, { "epoch": 3.9106752122162045, "grad_norm": 2.366884231567383, "learning_rate": 4.6089340306336223e-05, "loss": 0.9991, "step": 252000 }, { "epoch": 3.9122270674591473, "grad_norm": 2.117441177368164, "learning_rate": 4.608778845109328e-05, "loss": 1.0047, "step": 252100 }, { "epoch": 3.9137789227020905, "grad_norm": 2.2630763053894043, "learning_rate": 4.608623659585034e-05, "loss": 0.989, "step": 252200 }, { "epoch": 3.9153307779450333, "grad_norm": 2.3058600425720215, "learning_rate": 4.60846847406074e-05, "loss": 0.9771, "step": 252300 }, { "epoch": 3.916882633187976, "grad_norm": 2.203831911087036, "learning_rate": 4.6083132885364454e-05, "loss": 0.9869, "step": 252400 }, { "epoch": 3.9184344884309192, "grad_norm": 2.3880887031555176, "learning_rate": 4.608158103012151e-05, "loss": 0.9929, "step": 252500 }, { "epoch": 3.919986343673862, "grad_norm": 2.3750267028808594, "learning_rate": 4.608002917487857e-05, "loss": 1.0143, "step": 252600 }, { "epoch": 3.9215381989168048, "grad_norm": 2.575727701187134, "learning_rate": 4.607847731963563e-05, "loss": 1.0076, "step": 252700 }, { "epoch": 3.923090054159748, "grad_norm": 2.406012773513794, "learning_rate": 4.6076925464392685e-05, "loss": 1.0205, "step": 252800 }, { "epoch": 3.9246419094026908, "grad_norm": 2.0970282554626465, "learning_rate": 4.607537360914974e-05, "loss": 1.0122, "step": 252900 }, { "epoch": 3.926193764645634, "grad_norm": 2.0307211875915527, "learning_rate": 4.60738217539068e-05, "loss": 1.0216, "step": 253000 }, { "epoch": 3.9277456198885767, "grad_norm": 1.9782108068466187, "learning_rate": 4.607226989866386e-05, "loss": 1.0109, "step": 253100 }, { "epoch": 3.92929747513152, "grad_norm": 2.2451000213623047, "learning_rate": 4.607071804342091e-05, "loss": 1.0028, "step": 253200 }, { "epoch": 3.9308493303744627, "grad_norm": 2.2572503089904785, "learning_rate": 4.606916618817797e-05, "loss": 0.9944, "step": 253300 }, { "epoch": 3.9324011856174055, "grad_norm": 2.0190742015838623, "learning_rate": 4.6067614332935025e-05, "loss": 1.0097, "step": 253400 }, { "epoch": 3.9339530408603487, "grad_norm": 2.285947322845459, "learning_rate": 4.606606247769208e-05, "loss": 1.0194, "step": 253500 }, { "epoch": 3.9355048961032915, "grad_norm": 2.201874017715454, "learning_rate": 4.606451062244914e-05, "loss": 1.0209, "step": 253600 }, { "epoch": 3.9370567513462342, "grad_norm": 2.6722114086151123, "learning_rate": 4.60629587672062e-05, "loss": 0.9902, "step": 253700 }, { "epoch": 3.9386086065891774, "grad_norm": 2.5566859245300293, "learning_rate": 4.6061406911963256e-05, "loss": 0.9832, "step": 253800 }, { "epoch": 3.94016046183212, "grad_norm": 2.0157370567321777, "learning_rate": 4.6059855056720314e-05, "loss": 0.9794, "step": 253900 }, { "epoch": 3.941712317075063, "grad_norm": 2.0200371742248535, "learning_rate": 4.605830320147737e-05, "loss": 0.9939, "step": 254000 }, { "epoch": 3.943264172318006, "grad_norm": 2.292102336883545, "learning_rate": 4.605675134623443e-05, "loss": 1.0105, "step": 254100 }, { "epoch": 3.944816027560949, "grad_norm": 2.0531704425811768, "learning_rate": 4.605519949099149e-05, "loss": 1.0091, "step": 254200 }, { "epoch": 3.946367882803892, "grad_norm": 1.9320237636566162, "learning_rate": 4.605364763574854e-05, "loss": 0.9875, "step": 254300 }, { "epoch": 3.947919738046835, "grad_norm": 2.305433988571167, "learning_rate": 4.6052095780505596e-05, "loss": 1.0017, "step": 254400 }, { "epoch": 3.949471593289778, "grad_norm": 2.343123435974121, "learning_rate": 4.6050543925262654e-05, "loss": 1.0057, "step": 254500 }, { "epoch": 3.951023448532721, "grad_norm": 2.5635619163513184, "learning_rate": 4.604899207001971e-05, "loss": 0.9993, "step": 254600 }, { "epoch": 3.9525753037756637, "grad_norm": 2.2476115226745605, "learning_rate": 4.604744021477677e-05, "loss": 1.0299, "step": 254700 }, { "epoch": 3.954127159018607, "grad_norm": 2.1379034519195557, "learning_rate": 4.604588835953382e-05, "loss": 0.9934, "step": 254800 }, { "epoch": 3.9556790142615497, "grad_norm": 2.1756467819213867, "learning_rate": 4.604433650429088e-05, "loss": 1.0148, "step": 254900 }, { "epoch": 3.9572308695044924, "grad_norm": 2.511732339859009, "learning_rate": 4.6042784649047936e-05, "loss": 0.9982, "step": 255000 }, { "epoch": 3.9587827247474356, "grad_norm": 2.0807454586029053, "learning_rate": 4.6041232793804993e-05, "loss": 1.005, "step": 255100 }, { "epoch": 3.9603345799903784, "grad_norm": 2.1180360317230225, "learning_rate": 4.603968093856205e-05, "loss": 1.003, "step": 255200 }, { "epoch": 3.961886435233321, "grad_norm": 2.301769733428955, "learning_rate": 4.603812908331911e-05, "loss": 1.0037, "step": 255300 }, { "epoch": 3.9634382904762644, "grad_norm": 2.085273504257202, "learning_rate": 4.603657722807617e-05, "loss": 1.0402, "step": 255400 }, { "epoch": 3.964990145719207, "grad_norm": 2.3542139530181885, "learning_rate": 4.6035025372833224e-05, "loss": 0.9969, "step": 255500 }, { "epoch": 3.9665420009621504, "grad_norm": 2.3570122718811035, "learning_rate": 4.603347351759028e-05, "loss": 0.9882, "step": 255600 }, { "epoch": 3.968093856205093, "grad_norm": 2.451073408126831, "learning_rate": 4.603192166234734e-05, "loss": 1.0217, "step": 255700 }, { "epoch": 3.9696457114480364, "grad_norm": 2.239896297454834, "learning_rate": 4.60303698071044e-05, "loss": 1.0256, "step": 255800 }, { "epoch": 3.971197566690979, "grad_norm": 2.051710844039917, "learning_rate": 4.6028817951861455e-05, "loss": 0.9869, "step": 255900 }, { "epoch": 3.972749421933922, "grad_norm": 2.2589364051818848, "learning_rate": 4.602726609661851e-05, "loss": 1.014, "step": 256000 }, { "epoch": 3.974301277176865, "grad_norm": 2.065429210662842, "learning_rate": 4.6025714241375564e-05, "loss": 0.9957, "step": 256100 }, { "epoch": 3.975853132419808, "grad_norm": 2.291639804840088, "learning_rate": 4.602416238613262e-05, "loss": 1.0029, "step": 256200 }, { "epoch": 3.9774049876627506, "grad_norm": 1.8173938989639282, "learning_rate": 4.602261053088968e-05, "loss": 0.9764, "step": 256300 }, { "epoch": 3.978956842905694, "grad_norm": 1.8184092044830322, "learning_rate": 4.602105867564674e-05, "loss": 1.012, "step": 256400 }, { "epoch": 3.9805086981486366, "grad_norm": 2.0670435428619385, "learning_rate": 4.6019506820403795e-05, "loss": 0.9894, "step": 256500 }, { "epoch": 3.9820605533915794, "grad_norm": 2.2796432971954346, "learning_rate": 4.601795496516085e-05, "loss": 0.998, "step": 256600 }, { "epoch": 3.9836124086345226, "grad_norm": 2.5332934856414795, "learning_rate": 4.601640310991791e-05, "loss": 0.9865, "step": 256700 }, { "epoch": 3.9851642638774654, "grad_norm": 2.750664472579956, "learning_rate": 4.601485125467497e-05, "loss": 0.998, "step": 256800 }, { "epoch": 3.9867161191204086, "grad_norm": 2.2038350105285645, "learning_rate": 4.6013299399432026e-05, "loss": 1.0146, "step": 256900 }, { "epoch": 3.9882679743633513, "grad_norm": 2.1637487411499023, "learning_rate": 4.6011747544189084e-05, "loss": 1.0171, "step": 257000 }, { "epoch": 3.9898198296062946, "grad_norm": 2.032949447631836, "learning_rate": 4.601019568894614e-05, "loss": 0.9905, "step": 257100 }, { "epoch": 3.9913716848492373, "grad_norm": 1.9132201671600342, "learning_rate": 4.60086438337032e-05, "loss": 1.0105, "step": 257200 }, { "epoch": 3.99292354009218, "grad_norm": 2.50614070892334, "learning_rate": 4.600709197846026e-05, "loss": 1.032, "step": 257300 }, { "epoch": 3.9944753953351233, "grad_norm": 1.8686712980270386, "learning_rate": 4.600554012321731e-05, "loss": 1.0212, "step": 257400 }, { "epoch": 3.996027250578066, "grad_norm": 1.7639912366867065, "learning_rate": 4.6003988267974366e-05, "loss": 1.02, "step": 257500 }, { "epoch": 3.997579105821009, "grad_norm": 1.807686448097229, "learning_rate": 4.600243641273142e-05, "loss": 1.0066, "step": 257600 }, { "epoch": 3.999130961063952, "grad_norm": 2.4362223148345947, "learning_rate": 4.6000884557488475e-05, "loss": 0.9818, "step": 257700 }, { "epoch": 4.000682816306895, "grad_norm": 2.0881383419036865, "learning_rate": 4.599933270224553e-05, "loss": 1.0072, "step": 257800 }, { "epoch": 4.002234671549838, "grad_norm": 2.0971968173980713, "learning_rate": 4.599778084700259e-05, "loss": 0.996, "step": 257900 }, { "epoch": 4.003786526792781, "grad_norm": 2.6785244941711426, "learning_rate": 4.599622899175965e-05, "loss": 1.0064, "step": 258000 }, { "epoch": 4.005338382035724, "grad_norm": 2.0060083866119385, "learning_rate": 4.5994677136516706e-05, "loss": 0.9939, "step": 258100 }, { "epoch": 4.006890237278666, "grad_norm": 2.1636555194854736, "learning_rate": 4.5993125281273763e-05, "loss": 1.0001, "step": 258200 }, { "epoch": 4.0084420925216095, "grad_norm": 2.343156099319458, "learning_rate": 4.599157342603082e-05, "loss": 1.0007, "step": 258300 }, { "epoch": 4.009993947764553, "grad_norm": 2.4066388607025146, "learning_rate": 4.599002157078788e-05, "loss": 1.0014, "step": 258400 }, { "epoch": 4.011545803007495, "grad_norm": 2.094865560531616, "learning_rate": 4.598846971554494e-05, "loss": 1.0128, "step": 258500 }, { "epoch": 4.013097658250438, "grad_norm": 1.8208328485488892, "learning_rate": 4.5986917860301994e-05, "loss": 0.9886, "step": 258600 }, { "epoch": 4.0146495134933815, "grad_norm": 2.268063545227051, "learning_rate": 4.598536600505905e-05, "loss": 0.999, "step": 258700 }, { "epoch": 4.016201368736325, "grad_norm": 2.788959264755249, "learning_rate": 4.598381414981611e-05, "loss": 1.0116, "step": 258800 }, { "epoch": 4.017753223979267, "grad_norm": 2.5502994060516357, "learning_rate": 4.598226229457316e-05, "loss": 1.0105, "step": 258900 }, { "epoch": 4.01930507922221, "grad_norm": 2.312920331954956, "learning_rate": 4.598071043933022e-05, "loss": 1.0157, "step": 259000 }, { "epoch": 4.0208569344651535, "grad_norm": 2.3428311347961426, "learning_rate": 4.5979158584087276e-05, "loss": 1.0139, "step": 259100 }, { "epoch": 4.022408789708096, "grad_norm": 2.463869333267212, "learning_rate": 4.5977606728844334e-05, "loss": 1.0156, "step": 259200 }, { "epoch": 4.023960644951039, "grad_norm": 2.294227123260498, "learning_rate": 4.597605487360139e-05, "loss": 1.0073, "step": 259300 }, { "epoch": 4.025512500193982, "grad_norm": 2.375056266784668, "learning_rate": 4.597450301835845e-05, "loss": 1.0253, "step": 259400 }, { "epoch": 4.0270643554369245, "grad_norm": 2.5101332664489746, "learning_rate": 4.597295116311551e-05, "loss": 1.0256, "step": 259500 }, { "epoch": 4.028616210679868, "grad_norm": 2.167787790298462, "learning_rate": 4.5971399307872565e-05, "loss": 0.9979, "step": 259600 }, { "epoch": 4.030168065922811, "grad_norm": 1.9655741453170776, "learning_rate": 4.596984745262962e-05, "loss": 1.0224, "step": 259700 }, { "epoch": 4.031719921165753, "grad_norm": 2.8912274837493896, "learning_rate": 4.596829559738668e-05, "loss": 1.006, "step": 259800 }, { "epoch": 4.0332717764086965, "grad_norm": 2.127323627471924, "learning_rate": 4.596674374214374e-05, "loss": 1.0002, "step": 259900 }, { "epoch": 4.03482363165164, "grad_norm": 2.308802843093872, "learning_rate": 4.5965191886900796e-05, "loss": 1.0108, "step": 260000 }, { "epoch": 4.036375486894583, "grad_norm": 2.542405128479004, "learning_rate": 4.5963640031657854e-05, "loss": 1.0331, "step": 260100 }, { "epoch": 4.037927342137525, "grad_norm": 1.9634498357772827, "learning_rate": 4.5962088176414905e-05, "loss": 0.9962, "step": 260200 }, { "epoch": 4.0394791973804685, "grad_norm": 2.1191060543060303, "learning_rate": 4.596053632117196e-05, "loss": 0.9932, "step": 260300 }, { "epoch": 4.041031052623412, "grad_norm": 2.114780902862549, "learning_rate": 4.595898446592902e-05, "loss": 0.9979, "step": 260400 }, { "epoch": 4.042582907866354, "grad_norm": 2.1260523796081543, "learning_rate": 4.595743261068608e-05, "loss": 0.9973, "step": 260500 }, { "epoch": 4.044134763109297, "grad_norm": 2.372663974761963, "learning_rate": 4.5955880755443136e-05, "loss": 0.9875, "step": 260600 }, { "epoch": 4.04568661835224, "grad_norm": 2.4009511470794678, "learning_rate": 4.5954328900200194e-05, "loss": 0.9994, "step": 260700 }, { "epoch": 4.047238473595183, "grad_norm": 1.8778742551803589, "learning_rate": 4.5952777044957245e-05, "loss": 1.0148, "step": 260800 }, { "epoch": 4.048790328838126, "grad_norm": 2.228926181793213, "learning_rate": 4.59512251897143e-05, "loss": 0.989, "step": 260900 }, { "epoch": 4.050342184081069, "grad_norm": 2.1706323623657227, "learning_rate": 4.594967333447136e-05, "loss": 1.0027, "step": 261000 }, { "epoch": 4.0518940393240115, "grad_norm": 2.4426918029785156, "learning_rate": 4.594812147922842e-05, "loss": 0.9911, "step": 261100 }, { "epoch": 4.053445894566955, "grad_norm": 2.161242723464966, "learning_rate": 4.5946569623985476e-05, "loss": 1.0056, "step": 261200 }, { "epoch": 4.054997749809898, "grad_norm": 1.8238660097122192, "learning_rate": 4.5945017768742533e-05, "loss": 1.0124, "step": 261300 }, { "epoch": 4.05654960505284, "grad_norm": 2.6080873012542725, "learning_rate": 4.594346591349959e-05, "loss": 1.0104, "step": 261400 }, { "epoch": 4.0581014602957834, "grad_norm": 2.3893115520477295, "learning_rate": 4.594191405825665e-05, "loss": 0.9913, "step": 261500 }, { "epoch": 4.059653315538727, "grad_norm": 2.1053595542907715, "learning_rate": 4.594036220301371e-05, "loss": 0.9868, "step": 261600 }, { "epoch": 4.06120517078167, "grad_norm": 2.303849935531616, "learning_rate": 4.593881034777076e-05, "loss": 1.0127, "step": 261700 }, { "epoch": 4.062757026024612, "grad_norm": 1.7979545593261719, "learning_rate": 4.5937258492527815e-05, "loss": 0.9976, "step": 261800 }, { "epoch": 4.064308881267555, "grad_norm": 2.176025152206421, "learning_rate": 4.593570663728487e-05, "loss": 1.0364, "step": 261900 }, { "epoch": 4.065860736510499, "grad_norm": 2.457047939300537, "learning_rate": 4.593415478204193e-05, "loss": 1.0102, "step": 262000 }, { "epoch": 4.067412591753441, "grad_norm": 2.103670120239258, "learning_rate": 4.593260292679899e-05, "loss": 1.0176, "step": 262100 }, { "epoch": 4.068964446996384, "grad_norm": 2.3812785148620605, "learning_rate": 4.5931051071556046e-05, "loss": 0.9834, "step": 262200 }, { "epoch": 4.070516302239327, "grad_norm": 2.1732730865478516, "learning_rate": 4.5929499216313104e-05, "loss": 0.9911, "step": 262300 }, { "epoch": 4.07206815748227, "grad_norm": 2.4105000495910645, "learning_rate": 4.592794736107016e-05, "loss": 1.0138, "step": 262400 }, { "epoch": 4.073620012725213, "grad_norm": 2.6054818630218506, "learning_rate": 4.592639550582722e-05, "loss": 1.0001, "step": 262500 }, { "epoch": 4.075171867968156, "grad_norm": 1.9678267240524292, "learning_rate": 4.592484365058428e-05, "loss": 1.005, "step": 262600 }, { "epoch": 4.076723723211098, "grad_norm": 2.3858137130737305, "learning_rate": 4.5923291795341335e-05, "loss": 0.9852, "step": 262700 }, { "epoch": 4.078275578454042, "grad_norm": 1.9260298013687134, "learning_rate": 4.592173994009839e-05, "loss": 0.9995, "step": 262800 }, { "epoch": 4.079827433696985, "grad_norm": 2.237461566925049, "learning_rate": 4.592018808485545e-05, "loss": 1.002, "step": 262900 }, { "epoch": 4.081379288939928, "grad_norm": 1.8933922052383423, "learning_rate": 4.59186362296125e-05, "loss": 1.0063, "step": 263000 }, { "epoch": 4.08293114418287, "grad_norm": 2.222546339035034, "learning_rate": 4.591708437436956e-05, "loss": 0.9858, "step": 263100 }, { "epoch": 4.084482999425814, "grad_norm": 2.262171745300293, "learning_rate": 4.591553251912662e-05, "loss": 0.9998, "step": 263200 }, { "epoch": 4.086034854668757, "grad_norm": 2.122223377227783, "learning_rate": 4.5913980663883675e-05, "loss": 0.9885, "step": 263300 }, { "epoch": 4.087586709911699, "grad_norm": 2.471280097961426, "learning_rate": 4.591242880864073e-05, "loss": 1.0067, "step": 263400 }, { "epoch": 4.089138565154642, "grad_norm": 2.067857027053833, "learning_rate": 4.591087695339779e-05, "loss": 0.9984, "step": 263500 }, { "epoch": 4.090690420397586, "grad_norm": 2.6935949325561523, "learning_rate": 4.590932509815485e-05, "loss": 1.0045, "step": 263600 }, { "epoch": 4.092242275640528, "grad_norm": 1.9788835048675537, "learning_rate": 4.5907773242911906e-05, "loss": 1.0129, "step": 263700 }, { "epoch": 4.093794130883471, "grad_norm": 2.317003011703491, "learning_rate": 4.5906221387668964e-05, "loss": 1.0158, "step": 263800 }, { "epoch": 4.095345986126414, "grad_norm": 2.620793104171753, "learning_rate": 4.590466953242602e-05, "loss": 1.0176, "step": 263900 }, { "epoch": 4.096897841369357, "grad_norm": 2.2395946979522705, "learning_rate": 4.590311767718308e-05, "loss": 1.0171, "step": 264000 }, { "epoch": 4.0984496966123, "grad_norm": 2.3391828536987305, "learning_rate": 4.590156582194013e-05, "loss": 1.0139, "step": 264100 }, { "epoch": 4.100001551855243, "grad_norm": 2.5829570293426514, "learning_rate": 4.590001396669719e-05, "loss": 1.004, "step": 264200 }, { "epoch": 4.101553407098186, "grad_norm": 1.987902283668518, "learning_rate": 4.5898462111454246e-05, "loss": 0.9995, "step": 264300 }, { "epoch": 4.103105262341129, "grad_norm": 2.1167943477630615, "learning_rate": 4.5896910256211303e-05, "loss": 1.0284, "step": 264400 }, { "epoch": 4.104657117584072, "grad_norm": 2.583871364593506, "learning_rate": 4.589535840096836e-05, "loss": 1.0008, "step": 264500 }, { "epoch": 4.106208972827015, "grad_norm": 2.3861241340637207, "learning_rate": 4.589380654572541e-05, "loss": 1.014, "step": 264600 }, { "epoch": 4.107760828069957, "grad_norm": 2.2229995727539062, "learning_rate": 4.589225469048247e-05, "loss": 1.0051, "step": 264700 }, { "epoch": 4.109312683312901, "grad_norm": 1.8920927047729492, "learning_rate": 4.589070283523953e-05, "loss": 1.0078, "step": 264800 }, { "epoch": 4.110864538555844, "grad_norm": 2.1924002170562744, "learning_rate": 4.5889150979996585e-05, "loss": 0.9962, "step": 264900 }, { "epoch": 4.112416393798786, "grad_norm": 2.237762212753296, "learning_rate": 4.588759912475364e-05, "loss": 1.0097, "step": 265000 }, { "epoch": 4.113968249041729, "grad_norm": 2.4558346271514893, "learning_rate": 4.58860472695107e-05, "loss": 1.0142, "step": 265100 }, { "epoch": 4.1155201042846725, "grad_norm": 2.175586700439453, "learning_rate": 4.588449541426776e-05, "loss": 1.0048, "step": 265200 }, { "epoch": 4.117071959527615, "grad_norm": 2.0444562435150146, "learning_rate": 4.5882943559024816e-05, "loss": 1.0, "step": 265300 }, { "epoch": 4.118623814770558, "grad_norm": 2.110651731491089, "learning_rate": 4.5881391703781874e-05, "loss": 1.0183, "step": 265400 }, { "epoch": 4.120175670013501, "grad_norm": 1.7949814796447754, "learning_rate": 4.587983984853893e-05, "loss": 1.0086, "step": 265500 }, { "epoch": 4.1217275252564445, "grad_norm": 1.9625879526138306, "learning_rate": 4.587828799329599e-05, "loss": 1.0165, "step": 265600 }, { "epoch": 4.123279380499387, "grad_norm": 1.9224804639816284, "learning_rate": 4.587673613805305e-05, "loss": 0.9843, "step": 265700 }, { "epoch": 4.12483123574233, "grad_norm": 2.26285982131958, "learning_rate": 4.5875184282810105e-05, "loss": 1.0028, "step": 265800 }, { "epoch": 4.126383090985273, "grad_norm": 2.680020809173584, "learning_rate": 4.5873632427567156e-05, "loss": 1.0128, "step": 265900 }, { "epoch": 4.1279349462282156, "grad_norm": 2.2132368087768555, "learning_rate": 4.5872080572324214e-05, "loss": 0.9966, "step": 266000 }, { "epoch": 4.129486801471159, "grad_norm": 2.2778048515319824, "learning_rate": 4.587052871708127e-05, "loss": 1.0001, "step": 266100 }, { "epoch": 4.131038656714102, "grad_norm": 2.3839142322540283, "learning_rate": 4.586897686183833e-05, "loss": 1.005, "step": 266200 }, { "epoch": 4.132590511957044, "grad_norm": 2.1487812995910645, "learning_rate": 4.586742500659539e-05, "loss": 1.0014, "step": 266300 }, { "epoch": 4.1341423671999875, "grad_norm": 2.3841116428375244, "learning_rate": 4.5865873151352445e-05, "loss": 0.9951, "step": 266400 }, { "epoch": 4.135694222442931, "grad_norm": 2.0241899490356445, "learning_rate": 4.58643212961095e-05, "loss": 0.9868, "step": 266500 }, { "epoch": 4.137246077685873, "grad_norm": 2.3595705032348633, "learning_rate": 4.586276944086656e-05, "loss": 1.0066, "step": 266600 }, { "epoch": 4.138797932928816, "grad_norm": 2.4611387252807617, "learning_rate": 4.586121758562362e-05, "loss": 0.9933, "step": 266700 }, { "epoch": 4.1403497881717595, "grad_norm": 2.13482666015625, "learning_rate": 4.5859665730380676e-05, "loss": 1.022, "step": 266800 }, { "epoch": 4.141901643414703, "grad_norm": 1.967960238456726, "learning_rate": 4.5858113875137734e-05, "loss": 1.0124, "step": 266900 }, { "epoch": 4.143453498657645, "grad_norm": 2.3471012115478516, "learning_rate": 4.585656201989479e-05, "loss": 1.0231, "step": 267000 }, { "epoch": 4.145005353900588, "grad_norm": 2.416116237640381, "learning_rate": 4.585501016465185e-05, "loss": 1.011, "step": 267100 }, { "epoch": 4.146557209143531, "grad_norm": 2.276646852493286, "learning_rate": 4.58534583094089e-05, "loss": 0.9942, "step": 267200 }, { "epoch": 4.148109064386474, "grad_norm": 2.473435878753662, "learning_rate": 4.585190645416596e-05, "loss": 1.0015, "step": 267300 }, { "epoch": 4.149660919629417, "grad_norm": 2.810506582260132, "learning_rate": 4.585035459892301e-05, "loss": 1.0061, "step": 267400 }, { "epoch": 4.15121277487236, "grad_norm": 2.3710923194885254, "learning_rate": 4.584880274368007e-05, "loss": 1.0058, "step": 267500 }, { "epoch": 4.1527646301153025, "grad_norm": 2.1666676998138428, "learning_rate": 4.5847250888437124e-05, "loss": 0.9779, "step": 267600 }, { "epoch": 4.154316485358246, "grad_norm": 2.4987688064575195, "learning_rate": 4.584569903319418e-05, "loss": 1.0247, "step": 267700 }, { "epoch": 4.155868340601189, "grad_norm": 2.13677978515625, "learning_rate": 4.584414717795124e-05, "loss": 0.9948, "step": 267800 }, { "epoch": 4.157420195844131, "grad_norm": 1.9034311771392822, "learning_rate": 4.58425953227083e-05, "loss": 0.987, "step": 267900 }, { "epoch": 4.1589720510870745, "grad_norm": 2.2017834186553955, "learning_rate": 4.5841043467465355e-05, "loss": 0.9963, "step": 268000 }, { "epoch": 4.160523906330018, "grad_norm": 1.7424348592758179, "learning_rate": 4.583949161222241e-05, "loss": 1.0013, "step": 268100 }, { "epoch": 4.162075761572961, "grad_norm": 2.314422369003296, "learning_rate": 4.583793975697947e-05, "loss": 1.0111, "step": 268200 }, { "epoch": 4.163627616815903, "grad_norm": 2.1232736110687256, "learning_rate": 4.583638790173653e-05, "loss": 1.0087, "step": 268300 }, { "epoch": 4.165179472058846, "grad_norm": 2.373929738998413, "learning_rate": 4.5834836046493586e-05, "loss": 0.9955, "step": 268400 }, { "epoch": 4.16673132730179, "grad_norm": 2.3503201007843018, "learning_rate": 4.5833284191250644e-05, "loss": 0.9906, "step": 268500 }, { "epoch": 4.168283182544732, "grad_norm": 2.191762924194336, "learning_rate": 4.58317323360077e-05, "loss": 0.9964, "step": 268600 }, { "epoch": 4.169835037787675, "grad_norm": 1.852673888206482, "learning_rate": 4.583018048076475e-05, "loss": 1.0132, "step": 268700 }, { "epoch": 4.171386893030618, "grad_norm": 2.5372464656829834, "learning_rate": 4.582862862552181e-05, "loss": 1.0212, "step": 268800 }, { "epoch": 4.172938748273561, "grad_norm": 2.0085675716400146, "learning_rate": 4.582707677027887e-05, "loss": 0.9946, "step": 268900 }, { "epoch": 4.174490603516504, "grad_norm": 2.2607789039611816, "learning_rate": 4.5825524915035926e-05, "loss": 1.0125, "step": 269000 }, { "epoch": 4.176042458759447, "grad_norm": 2.4367029666900635, "learning_rate": 4.5823973059792984e-05, "loss": 1.0167, "step": 269100 }, { "epoch": 4.1775943140023895, "grad_norm": 2.8601229190826416, "learning_rate": 4.582242120455004e-05, "loss": 0.9838, "step": 269200 }, { "epoch": 4.179146169245333, "grad_norm": 2.394378662109375, "learning_rate": 4.58208693493071e-05, "loss": 1.004, "step": 269300 }, { "epoch": 4.180698024488276, "grad_norm": 2.2384700775146484, "learning_rate": 4.581931749406416e-05, "loss": 0.9967, "step": 269400 }, { "epoch": 4.182249879731219, "grad_norm": 2.6478142738342285, "learning_rate": 4.5817765638821215e-05, "loss": 1.0079, "step": 269500 }, { "epoch": 4.183801734974161, "grad_norm": 2.617509365081787, "learning_rate": 4.581621378357827e-05, "loss": 0.9938, "step": 269600 }, { "epoch": 4.185353590217105, "grad_norm": 2.2071099281311035, "learning_rate": 4.581466192833533e-05, "loss": 1.0259, "step": 269700 }, { "epoch": 4.186905445460048, "grad_norm": 1.9501489400863647, "learning_rate": 4.581311007309239e-05, "loss": 0.9782, "step": 269800 }, { "epoch": 4.18845730070299, "grad_norm": 2.1013565063476562, "learning_rate": 4.5811558217849446e-05, "loss": 0.9917, "step": 269900 }, { "epoch": 4.190009155945933, "grad_norm": 2.2761118412017822, "learning_rate": 4.58100063626065e-05, "loss": 1.008, "step": 270000 }, { "epoch": 4.191561011188877, "grad_norm": 2.5501368045806885, "learning_rate": 4.5808454507363555e-05, "loss": 1.005, "step": 270100 }, { "epoch": 4.193112866431819, "grad_norm": 2.3103885650634766, "learning_rate": 4.580690265212061e-05, "loss": 0.9972, "step": 270200 }, { "epoch": 4.194664721674762, "grad_norm": 2.1419904232025146, "learning_rate": 4.580535079687767e-05, "loss": 1.004, "step": 270300 }, { "epoch": 4.196216576917705, "grad_norm": 2.082042694091797, "learning_rate": 4.580379894163473e-05, "loss": 1.0084, "step": 270400 }, { "epoch": 4.197768432160648, "grad_norm": 2.577298402786255, "learning_rate": 4.5802247086391786e-05, "loss": 0.9993, "step": 270500 }, { "epoch": 4.199320287403591, "grad_norm": 2.507154703140259, "learning_rate": 4.580069523114884e-05, "loss": 1.0134, "step": 270600 }, { "epoch": 4.200872142646534, "grad_norm": 2.416017770767212, "learning_rate": 4.5799143375905894e-05, "loss": 1.0128, "step": 270700 }, { "epoch": 4.202423997889477, "grad_norm": 1.842239499092102, "learning_rate": 4.579759152066295e-05, "loss": 1.0115, "step": 270800 }, { "epoch": 4.20397585313242, "grad_norm": 1.9943252801895142, "learning_rate": 4.579603966542001e-05, "loss": 0.9845, "step": 270900 }, { "epoch": 4.205527708375363, "grad_norm": 1.9691641330718994, "learning_rate": 4.579448781017707e-05, "loss": 1.0245, "step": 271000 }, { "epoch": 4.207079563618306, "grad_norm": 2.22060227394104, "learning_rate": 4.5792935954934125e-05, "loss": 1.0037, "step": 271100 }, { "epoch": 4.208631418861248, "grad_norm": 2.158911943435669, "learning_rate": 4.579138409969118e-05, "loss": 0.9987, "step": 271200 }, { "epoch": 4.210183274104192, "grad_norm": 2.005728244781494, "learning_rate": 4.578983224444824e-05, "loss": 0.9958, "step": 271300 }, { "epoch": 4.211735129347135, "grad_norm": 2.0646462440490723, "learning_rate": 4.57882803892053e-05, "loss": 0.99, "step": 271400 }, { "epoch": 4.213286984590077, "grad_norm": 2.2937633991241455, "learning_rate": 4.5786728533962356e-05, "loss": 1.0077, "step": 271500 }, { "epoch": 4.21483883983302, "grad_norm": 2.073124885559082, "learning_rate": 4.578517667871941e-05, "loss": 1.0018, "step": 271600 }, { "epoch": 4.2163906950759635, "grad_norm": 2.7373855113983154, "learning_rate": 4.5783624823476465e-05, "loss": 0.9897, "step": 271700 }, { "epoch": 4.217942550318906, "grad_norm": 2.8275275230407715, "learning_rate": 4.578207296823352e-05, "loss": 1.0004, "step": 271800 }, { "epoch": 4.219494405561849, "grad_norm": 2.43312931060791, "learning_rate": 4.578052111299058e-05, "loss": 1.0084, "step": 271900 }, { "epoch": 4.221046260804792, "grad_norm": 2.6437008380889893, "learning_rate": 4.577896925774764e-05, "loss": 1.006, "step": 272000 }, { "epoch": 4.2225981160477355, "grad_norm": 2.0690057277679443, "learning_rate": 4.5777417402504696e-05, "loss": 1.0017, "step": 272100 }, { "epoch": 4.224149971290678, "grad_norm": 2.295323133468628, "learning_rate": 4.5775865547261754e-05, "loss": 0.9817, "step": 272200 }, { "epoch": 4.225701826533621, "grad_norm": 2.1846346855163574, "learning_rate": 4.577431369201881e-05, "loss": 0.9995, "step": 272300 }, { "epoch": 4.227253681776564, "grad_norm": 2.26727032661438, "learning_rate": 4.577276183677587e-05, "loss": 1.0049, "step": 272400 }, { "epoch": 4.228805537019507, "grad_norm": 1.7309058904647827, "learning_rate": 4.577120998153293e-05, "loss": 1.0125, "step": 272500 }, { "epoch": 4.23035739226245, "grad_norm": 2.034524440765381, "learning_rate": 4.5769658126289985e-05, "loss": 0.994, "step": 272600 }, { "epoch": 4.231909247505393, "grad_norm": 2.2921416759490967, "learning_rate": 4.576810627104704e-05, "loss": 0.9927, "step": 272700 }, { "epoch": 4.233461102748335, "grad_norm": 2.246654987335205, "learning_rate": 4.57665544158041e-05, "loss": 1.0003, "step": 272800 }, { "epoch": 4.2350129579912785, "grad_norm": 2.1161704063415527, "learning_rate": 4.576500256056115e-05, "loss": 0.9972, "step": 272900 }, { "epoch": 4.236564813234222, "grad_norm": 2.2752392292022705, "learning_rate": 4.576345070531821e-05, "loss": 0.9959, "step": 273000 }, { "epoch": 4.238116668477164, "grad_norm": 2.1410627365112305, "learning_rate": 4.576189885007527e-05, "loss": 1.0011, "step": 273100 }, { "epoch": 4.239668523720107, "grad_norm": 2.3855819702148438, "learning_rate": 4.5760346994832325e-05, "loss": 1.0142, "step": 273200 }, { "epoch": 4.2412203789630505, "grad_norm": 2.5783579349517822, "learning_rate": 4.575879513958938e-05, "loss": 1.0061, "step": 273300 }, { "epoch": 4.242772234205994, "grad_norm": 2.3426921367645264, "learning_rate": 4.575724328434644e-05, "loss": 1.0072, "step": 273400 }, { "epoch": 4.244324089448936, "grad_norm": 1.7720756530761719, "learning_rate": 4.57556914291035e-05, "loss": 0.9977, "step": 273500 }, { "epoch": 4.245875944691879, "grad_norm": 2.1338977813720703, "learning_rate": 4.5754139573860556e-05, "loss": 1.0035, "step": 273600 }, { "epoch": 4.2474277999348224, "grad_norm": 2.3656187057495117, "learning_rate": 4.5752587718617613e-05, "loss": 0.9986, "step": 273700 }, { "epoch": 4.248979655177765, "grad_norm": 2.384273052215576, "learning_rate": 4.5751035863374664e-05, "loss": 0.9988, "step": 273800 }, { "epoch": 4.250531510420708, "grad_norm": 2.058539390563965, "learning_rate": 4.574948400813172e-05, "loss": 0.9896, "step": 273900 }, { "epoch": 4.252083365663651, "grad_norm": 1.7449809312820435, "learning_rate": 4.574793215288878e-05, "loss": 0.9869, "step": 274000 }, { "epoch": 4.2536352209065935, "grad_norm": 2.3793601989746094, "learning_rate": 4.574638029764584e-05, "loss": 1.0008, "step": 274100 }, { "epoch": 4.255187076149537, "grad_norm": 2.0956764221191406, "learning_rate": 4.5744828442402895e-05, "loss": 1.0047, "step": 274200 }, { "epoch": 4.25673893139248, "grad_norm": 2.55118727684021, "learning_rate": 4.574327658715995e-05, "loss": 0.9955, "step": 274300 }, { "epoch": 4.258290786635422, "grad_norm": 2.321115255355835, "learning_rate": 4.5741724731917004e-05, "loss": 0.9962, "step": 274400 }, { "epoch": 4.2598426418783655, "grad_norm": 2.1780483722686768, "learning_rate": 4.574017287667406e-05, "loss": 0.9975, "step": 274500 }, { "epoch": 4.261394497121309, "grad_norm": 2.335296630859375, "learning_rate": 4.573862102143112e-05, "loss": 1.0023, "step": 274600 }, { "epoch": 4.262946352364251, "grad_norm": 2.26134991645813, "learning_rate": 4.573706916618818e-05, "loss": 0.9976, "step": 274700 }, { "epoch": 4.264498207607194, "grad_norm": 2.400446891784668, "learning_rate": 4.5735517310945235e-05, "loss": 0.9804, "step": 274800 }, { "epoch": 4.266050062850137, "grad_norm": 2.2282209396362305, "learning_rate": 4.573396545570229e-05, "loss": 1.0262, "step": 274900 }, { "epoch": 4.267601918093081, "grad_norm": 2.1054229736328125, "learning_rate": 4.573241360045935e-05, "loss": 0.9946, "step": 275000 }, { "epoch": 4.269153773336023, "grad_norm": 2.3565878868103027, "learning_rate": 4.573086174521641e-05, "loss": 1.0141, "step": 275100 }, { "epoch": 4.270705628578966, "grad_norm": 2.2007217407226562, "learning_rate": 4.5729309889973466e-05, "loss": 0.9891, "step": 275200 }, { "epoch": 4.272257483821909, "grad_norm": 1.6125657558441162, "learning_rate": 4.5727758034730524e-05, "loss": 0.9873, "step": 275300 }, { "epoch": 4.273809339064852, "grad_norm": 2.150374412536621, "learning_rate": 4.572620617948758e-05, "loss": 0.9911, "step": 275400 }, { "epoch": 4.275361194307795, "grad_norm": 2.2643942832946777, "learning_rate": 4.572465432424464e-05, "loss": 0.9994, "step": 275500 }, { "epoch": 4.276913049550738, "grad_norm": 2.4527463912963867, "learning_rate": 4.57231024690017e-05, "loss": 1.0159, "step": 275600 }, { "epoch": 4.2784649047936805, "grad_norm": 2.031367540359497, "learning_rate": 4.572155061375875e-05, "loss": 0.977, "step": 275700 }, { "epoch": 4.280016760036624, "grad_norm": 2.356466054916382, "learning_rate": 4.5719998758515806e-05, "loss": 1.0154, "step": 275800 }, { "epoch": 4.281568615279567, "grad_norm": 2.7374279499053955, "learning_rate": 4.5718446903272864e-05, "loss": 0.9855, "step": 275900 }, { "epoch": 4.28312047052251, "grad_norm": 1.8986940383911133, "learning_rate": 4.571689504802992e-05, "loss": 0.9983, "step": 276000 }, { "epoch": 4.284672325765452, "grad_norm": 2.3046984672546387, "learning_rate": 4.571534319278698e-05, "loss": 0.994, "step": 276100 }, { "epoch": 4.286224181008396, "grad_norm": 2.569552421569824, "learning_rate": 4.571379133754404e-05, "loss": 0.9994, "step": 276200 }, { "epoch": 4.287776036251339, "grad_norm": 2.2096495628356934, "learning_rate": 4.5712239482301095e-05, "loss": 1.0082, "step": 276300 }, { "epoch": 4.289327891494281, "grad_norm": 2.335878372192383, "learning_rate": 4.571068762705815e-05, "loss": 0.9885, "step": 276400 }, { "epoch": 4.290879746737224, "grad_norm": 2.1121490001678467, "learning_rate": 4.570913577181521e-05, "loss": 1.0162, "step": 276500 }, { "epoch": 4.292431601980168, "grad_norm": 3.2806036472320557, "learning_rate": 4.570758391657227e-05, "loss": 0.9987, "step": 276600 }, { "epoch": 4.29398345722311, "grad_norm": 2.2483205795288086, "learning_rate": 4.5706032061329326e-05, "loss": 1.0087, "step": 276700 }, { "epoch": 4.295535312466053, "grad_norm": 2.3774726390838623, "learning_rate": 4.5704480206086383e-05, "loss": 1.0039, "step": 276800 }, { "epoch": 4.297087167708996, "grad_norm": 2.5040836334228516, "learning_rate": 4.570292835084344e-05, "loss": 1.0008, "step": 276900 }, { "epoch": 4.298639022951939, "grad_norm": 2.234832525253296, "learning_rate": 4.570137649560049e-05, "loss": 0.9856, "step": 277000 }, { "epoch": 4.300190878194882, "grad_norm": 2.3915202617645264, "learning_rate": 4.569982464035755e-05, "loss": 1.0277, "step": 277100 }, { "epoch": 4.301742733437825, "grad_norm": 2.0684545040130615, "learning_rate": 4.56982727851146e-05, "loss": 0.992, "step": 277200 }, { "epoch": 4.303294588680767, "grad_norm": 2.7259724140167236, "learning_rate": 4.569672092987166e-05, "loss": 0.9959, "step": 277300 }, { "epoch": 4.304846443923711, "grad_norm": 2.1833019256591797, "learning_rate": 4.5695169074628716e-05, "loss": 1.0151, "step": 277400 }, { "epoch": 4.306398299166654, "grad_norm": 2.460402011871338, "learning_rate": 4.5693617219385774e-05, "loss": 1.0078, "step": 277500 }, { "epoch": 4.307950154409597, "grad_norm": 2.342869520187378, "learning_rate": 4.569206536414283e-05, "loss": 0.9922, "step": 277600 }, { "epoch": 4.309502009652539, "grad_norm": 2.3955397605895996, "learning_rate": 4.569051350889989e-05, "loss": 0.981, "step": 277700 }, { "epoch": 4.311053864895483, "grad_norm": 2.3654611110687256, "learning_rate": 4.568896165365695e-05, "loss": 0.9956, "step": 277800 }, { "epoch": 4.312605720138426, "grad_norm": 2.312753915786743, "learning_rate": 4.5687409798414005e-05, "loss": 0.9896, "step": 277900 }, { "epoch": 4.314157575381368, "grad_norm": 3.2461416721343994, "learning_rate": 4.568585794317106e-05, "loss": 0.9784, "step": 278000 }, { "epoch": 4.315709430624311, "grad_norm": 2.4776813983917236, "learning_rate": 4.568430608792812e-05, "loss": 0.9959, "step": 278100 }, { "epoch": 4.3172612858672545, "grad_norm": 2.307433605194092, "learning_rate": 4.568275423268518e-05, "loss": 1.01, "step": 278200 }, { "epoch": 4.318813141110197, "grad_norm": 2.2424583435058594, "learning_rate": 4.5681202377442236e-05, "loss": 0.9998, "step": 278300 }, { "epoch": 4.32036499635314, "grad_norm": 2.229107141494751, "learning_rate": 4.5679650522199294e-05, "loss": 1.0093, "step": 278400 }, { "epoch": 4.321916851596083, "grad_norm": 2.090376853942871, "learning_rate": 4.5678098666956345e-05, "loss": 0.986, "step": 278500 }, { "epoch": 4.3234687068390265, "grad_norm": 2.31658673286438, "learning_rate": 4.56765468117134e-05, "loss": 0.9978, "step": 278600 }, { "epoch": 4.325020562081969, "grad_norm": 2.3370983600616455, "learning_rate": 4.567499495647046e-05, "loss": 1.0239, "step": 278700 }, { "epoch": 4.326572417324912, "grad_norm": 1.6979576349258423, "learning_rate": 4.567344310122752e-05, "loss": 1.0087, "step": 278800 }, { "epoch": 4.328124272567855, "grad_norm": 2.154461145401001, "learning_rate": 4.5671891245984576e-05, "loss": 0.9954, "step": 278900 }, { "epoch": 4.329676127810798, "grad_norm": 2.2807297706604004, "learning_rate": 4.5670339390741634e-05, "loss": 0.9911, "step": 279000 }, { "epoch": 4.331227983053741, "grad_norm": 2.143202066421509, "learning_rate": 4.566878753549869e-05, "loss": 1.0176, "step": 279100 }, { "epoch": 4.332779838296684, "grad_norm": 1.9185640811920166, "learning_rate": 4.566723568025575e-05, "loss": 0.9845, "step": 279200 }, { "epoch": 4.334331693539626, "grad_norm": 2.1047873497009277, "learning_rate": 4.566568382501281e-05, "loss": 0.9939, "step": 279300 }, { "epoch": 4.3358835487825695, "grad_norm": 2.0561420917510986, "learning_rate": 4.5664131969769865e-05, "loss": 1.0122, "step": 279400 }, { "epoch": 4.337435404025513, "grad_norm": 2.3428125381469727, "learning_rate": 4.566258011452692e-05, "loss": 0.9925, "step": 279500 }, { "epoch": 4.338987259268455, "grad_norm": 2.0884201526641846, "learning_rate": 4.566102825928398e-05, "loss": 1.0204, "step": 279600 }, { "epoch": 4.340539114511398, "grad_norm": 2.344045639038086, "learning_rate": 4.565947640404104e-05, "loss": 1.0065, "step": 279700 }, { "epoch": 4.3420909697543415, "grad_norm": 2.1906960010528564, "learning_rate": 4.565792454879809e-05, "loss": 0.998, "step": 279800 }, { "epoch": 4.343642824997284, "grad_norm": 2.172529935836792, "learning_rate": 4.565637269355515e-05, "loss": 0.9874, "step": 279900 }, { "epoch": 4.345194680240227, "grad_norm": 2.2087390422821045, "learning_rate": 4.5654820838312204e-05, "loss": 1.0095, "step": 280000 }, { "epoch": 4.34674653548317, "grad_norm": 2.0882227420806885, "learning_rate": 4.565326898306926e-05, "loss": 0.9951, "step": 280100 }, { "epoch": 4.3482983907261135, "grad_norm": 2.5011677742004395, "learning_rate": 4.565171712782632e-05, "loss": 1.0165, "step": 280200 }, { "epoch": 4.349850245969056, "grad_norm": 2.17232084274292, "learning_rate": 4.565016527258338e-05, "loss": 0.9878, "step": 280300 }, { "epoch": 4.351402101211999, "grad_norm": 2.1016626358032227, "learning_rate": 4.564861341734043e-05, "loss": 0.9951, "step": 280400 }, { "epoch": 4.352953956454942, "grad_norm": 2.209808588027954, "learning_rate": 4.5647061562097486e-05, "loss": 1.0023, "step": 280500 }, { "epoch": 4.3545058116978845, "grad_norm": 2.129488706588745, "learning_rate": 4.5645509706854544e-05, "loss": 1.003, "step": 280600 }, { "epoch": 4.356057666940828, "grad_norm": 2.3624589443206787, "learning_rate": 4.56439578516116e-05, "loss": 1.0079, "step": 280700 }, { "epoch": 4.357609522183771, "grad_norm": 2.0404021739959717, "learning_rate": 4.564240599636866e-05, "loss": 1.02, "step": 280800 }, { "epoch": 4.359161377426713, "grad_norm": 1.9726099967956543, "learning_rate": 4.564085414112572e-05, "loss": 1.0063, "step": 280900 }, { "epoch": 4.3607132326696565, "grad_norm": 2.4254438877105713, "learning_rate": 4.5639302285882775e-05, "loss": 1.004, "step": 281000 }, { "epoch": 4.3622650879126, "grad_norm": 2.4621245861053467, "learning_rate": 4.563775043063983e-05, "loss": 1.0091, "step": 281100 }, { "epoch": 4.363816943155543, "grad_norm": 2.6106221675872803, "learning_rate": 4.563619857539689e-05, "loss": 0.9977, "step": 281200 }, { "epoch": 4.365368798398485, "grad_norm": 2.1493639945983887, "learning_rate": 4.563464672015395e-05, "loss": 1.0162, "step": 281300 }, { "epoch": 4.3669206536414285, "grad_norm": 2.2255125045776367, "learning_rate": 4.5633094864911e-05, "loss": 1.0134, "step": 281400 }, { "epoch": 4.368472508884372, "grad_norm": 2.635594129562378, "learning_rate": 4.563154300966806e-05, "loss": 1.012, "step": 281500 }, { "epoch": 4.370024364127314, "grad_norm": 2.3431122303009033, "learning_rate": 4.5629991154425115e-05, "loss": 0.997, "step": 281600 }, { "epoch": 4.371576219370257, "grad_norm": 2.3153669834136963, "learning_rate": 4.562843929918217e-05, "loss": 0.9958, "step": 281700 }, { "epoch": 4.3731280746132, "grad_norm": 4.537783622741699, "learning_rate": 4.562688744393923e-05, "loss": 0.9947, "step": 281800 }, { "epoch": 4.374679929856143, "grad_norm": 2.447627544403076, "learning_rate": 4.562533558869629e-05, "loss": 1.006, "step": 281900 }, { "epoch": 4.376231785099086, "grad_norm": 2.10001277923584, "learning_rate": 4.5623783733453346e-05, "loss": 0.9723, "step": 282000 }, { "epoch": 4.377783640342029, "grad_norm": 2.1870005130767822, "learning_rate": 4.5622231878210404e-05, "loss": 0.9942, "step": 282100 }, { "epoch": 4.3793354955849715, "grad_norm": 2.346949815750122, "learning_rate": 4.562068002296746e-05, "loss": 1.0051, "step": 282200 }, { "epoch": 4.380887350827915, "grad_norm": 2.1735942363739014, "learning_rate": 4.561912816772452e-05, "loss": 0.9875, "step": 282300 }, { "epoch": 4.382439206070858, "grad_norm": 2.3719325065612793, "learning_rate": 4.561757631248158e-05, "loss": 1.0064, "step": 282400 }, { "epoch": 4.3839910613138, "grad_norm": 2.253920555114746, "learning_rate": 4.5616024457238635e-05, "loss": 1.0154, "step": 282500 }, { "epoch": 4.385542916556743, "grad_norm": 2.520620822906494, "learning_rate": 4.561447260199569e-05, "loss": 1.0141, "step": 282600 }, { "epoch": 4.387094771799687, "grad_norm": 2.1080615520477295, "learning_rate": 4.5612920746752743e-05, "loss": 1.0136, "step": 282700 }, { "epoch": 4.38864662704263, "grad_norm": 2.0734851360321045, "learning_rate": 4.56113688915098e-05, "loss": 0.9874, "step": 282800 }, { "epoch": 4.390198482285572, "grad_norm": 2.5511300563812256, "learning_rate": 4.560981703626686e-05, "loss": 1.001, "step": 282900 }, { "epoch": 4.391750337528515, "grad_norm": 2.716226577758789, "learning_rate": 4.560826518102392e-05, "loss": 0.9855, "step": 283000 }, { "epoch": 4.393302192771459, "grad_norm": 2.402078151702881, "learning_rate": 4.5606713325780974e-05, "loss": 1.0057, "step": 283100 }, { "epoch": 4.394854048014401, "grad_norm": 2.090461015701294, "learning_rate": 4.560516147053803e-05, "loss": 0.9867, "step": 283200 }, { "epoch": 4.396405903257344, "grad_norm": 2.0088253021240234, "learning_rate": 4.560360961529509e-05, "loss": 0.9929, "step": 283300 }, { "epoch": 4.397957758500287, "grad_norm": 2.3430347442626953, "learning_rate": 4.560205776005215e-05, "loss": 1.0067, "step": 283400 }, { "epoch": 4.39950961374323, "grad_norm": 1.9524778127670288, "learning_rate": 4.5600505904809205e-05, "loss": 1.0175, "step": 283500 }, { "epoch": 4.401061468986173, "grad_norm": 2.400179147720337, "learning_rate": 4.5598954049566256e-05, "loss": 1.0102, "step": 283600 }, { "epoch": 4.402613324229116, "grad_norm": 1.9491347074508667, "learning_rate": 4.5597402194323314e-05, "loss": 0.9908, "step": 283700 }, { "epoch": 4.404165179472058, "grad_norm": 2.32865571975708, "learning_rate": 4.559585033908037e-05, "loss": 0.9896, "step": 283800 }, { "epoch": 4.405717034715002, "grad_norm": 1.8991413116455078, "learning_rate": 4.559429848383743e-05, "loss": 1.0114, "step": 283900 }, { "epoch": 4.407268889957945, "grad_norm": 2.652299404144287, "learning_rate": 4.559274662859449e-05, "loss": 1.0057, "step": 284000 }, { "epoch": 4.408820745200888, "grad_norm": 2.5429272651672363, "learning_rate": 4.5591194773351545e-05, "loss": 0.9826, "step": 284100 }, { "epoch": 4.41037260044383, "grad_norm": 2.3713974952697754, "learning_rate": 4.5589642918108596e-05, "loss": 1.003, "step": 284200 }, { "epoch": 4.411924455686774, "grad_norm": 1.9270364046096802, "learning_rate": 4.5588091062865654e-05, "loss": 1.0092, "step": 284300 }, { "epoch": 4.413476310929717, "grad_norm": 2.2040717601776123, "learning_rate": 4.558653920762271e-05, "loss": 1.0128, "step": 284400 }, { "epoch": 4.415028166172659, "grad_norm": 2.1370856761932373, "learning_rate": 4.558498735237977e-05, "loss": 0.9874, "step": 284500 }, { "epoch": 4.416580021415602, "grad_norm": 1.8606032133102417, "learning_rate": 4.558343549713683e-05, "loss": 0.9766, "step": 284600 }, { "epoch": 4.418131876658546, "grad_norm": 2.4526243209838867, "learning_rate": 4.5581883641893885e-05, "loss": 0.9875, "step": 284700 }, { "epoch": 4.419683731901488, "grad_norm": 2.1099183559417725, "learning_rate": 4.558033178665094e-05, "loss": 0.9844, "step": 284800 }, { "epoch": 4.421235587144431, "grad_norm": 2.3032143115997314, "learning_rate": 4.5578779931408e-05, "loss": 0.9873, "step": 284900 }, { "epoch": 4.422787442387374, "grad_norm": 2.948617935180664, "learning_rate": 4.557722807616506e-05, "loss": 0.9921, "step": 285000 }, { "epoch": 4.424339297630317, "grad_norm": 1.72019362449646, "learning_rate": 4.5575676220922116e-05, "loss": 0.9849, "step": 285100 }, { "epoch": 4.42589115287326, "grad_norm": 2.4202182292938232, "learning_rate": 4.5574124365679174e-05, "loss": 1.0031, "step": 285200 }, { "epoch": 4.427443008116203, "grad_norm": 2.4255611896514893, "learning_rate": 4.557257251043623e-05, "loss": 0.9786, "step": 285300 }, { "epoch": 4.428994863359146, "grad_norm": 2.403902769088745, "learning_rate": 4.557102065519329e-05, "loss": 0.9986, "step": 285400 }, { "epoch": 4.430546718602089, "grad_norm": 2.456495761871338, "learning_rate": 4.556946879995034e-05, "loss": 0.987, "step": 285500 }, { "epoch": 4.432098573845032, "grad_norm": 2.1072821617126465, "learning_rate": 4.55679169447074e-05, "loss": 0.9976, "step": 285600 }, { "epoch": 4.433650429087975, "grad_norm": 1.931612491607666, "learning_rate": 4.5566365089464456e-05, "loss": 1.0216, "step": 285700 }, { "epoch": 4.435202284330917, "grad_norm": 2.1282880306243896, "learning_rate": 4.5564813234221513e-05, "loss": 0.9973, "step": 285800 }, { "epoch": 4.4367541395738606, "grad_norm": 2.056551456451416, "learning_rate": 4.556326137897857e-05, "loss": 1.0079, "step": 285900 }, { "epoch": 4.438305994816804, "grad_norm": 2.656358242034912, "learning_rate": 4.556170952373563e-05, "loss": 0.9771, "step": 286000 }, { "epoch": 4.439857850059746, "grad_norm": 2.1979901790618896, "learning_rate": 4.556015766849269e-05, "loss": 0.9827, "step": 286100 }, { "epoch": 4.441409705302689, "grad_norm": 1.9258511066436768, "learning_rate": 4.5558605813249744e-05, "loss": 0.9825, "step": 286200 }, { "epoch": 4.4429615605456325, "grad_norm": 2.113699197769165, "learning_rate": 4.55570539580068e-05, "loss": 0.9846, "step": 286300 }, { "epoch": 4.444513415788575, "grad_norm": 2.060818910598755, "learning_rate": 4.555550210276386e-05, "loss": 0.9953, "step": 286400 }, { "epoch": 4.446065271031518, "grad_norm": 2.09224534034729, "learning_rate": 4.555395024752092e-05, "loss": 0.9928, "step": 286500 }, { "epoch": 4.447617126274461, "grad_norm": 3.010362148284912, "learning_rate": 4.5552398392277975e-05, "loss": 0.994, "step": 286600 }, { "epoch": 4.4491689815174045, "grad_norm": 2.243070125579834, "learning_rate": 4.555084653703503e-05, "loss": 1.0119, "step": 286700 }, { "epoch": 4.450720836760347, "grad_norm": 2.002566337585449, "learning_rate": 4.5549294681792084e-05, "loss": 0.9866, "step": 286800 }, { "epoch": 4.45227269200329, "grad_norm": 2.2695510387420654, "learning_rate": 4.554774282654914e-05, "loss": 0.9989, "step": 286900 }, { "epoch": 4.453824547246233, "grad_norm": 1.9779748916625977, "learning_rate": 4.55461909713062e-05, "loss": 0.98, "step": 287000 }, { "epoch": 4.4553764024891755, "grad_norm": 2.309352397918701, "learning_rate": 4.554463911606325e-05, "loss": 0.9898, "step": 287100 }, { "epoch": 4.456928257732119, "grad_norm": 2.298234701156616, "learning_rate": 4.554308726082031e-05, "loss": 1.014, "step": 287200 }, { "epoch": 4.458480112975062, "grad_norm": 2.090150833129883, "learning_rate": 4.5541535405577366e-05, "loss": 0.9757, "step": 287300 }, { "epoch": 4.460031968218004, "grad_norm": 2.5438733100891113, "learning_rate": 4.5539983550334424e-05, "loss": 0.9778, "step": 287400 }, { "epoch": 4.4615838234609475, "grad_norm": 2.3629579544067383, "learning_rate": 4.553843169509148e-05, "loss": 0.997, "step": 287500 }, { "epoch": 4.463135678703891, "grad_norm": 2.2515103816986084, "learning_rate": 4.553687983984854e-05, "loss": 0.9896, "step": 287600 }, { "epoch": 4.464687533946833, "grad_norm": 2.4853055477142334, "learning_rate": 4.55353279846056e-05, "loss": 1.0019, "step": 287700 }, { "epoch": 4.466239389189776, "grad_norm": 1.969152569770813, "learning_rate": 4.5533776129362655e-05, "loss": 0.9939, "step": 287800 }, { "epoch": 4.4677912444327195, "grad_norm": 2.290210723876953, "learning_rate": 4.553222427411971e-05, "loss": 0.9918, "step": 287900 }, { "epoch": 4.469343099675662, "grad_norm": 2.0896894931793213, "learning_rate": 4.553067241887677e-05, "loss": 0.9968, "step": 288000 }, { "epoch": 4.470894954918605, "grad_norm": 2.698840856552124, "learning_rate": 4.552912056363383e-05, "loss": 1.0068, "step": 288100 }, { "epoch": 4.472446810161548, "grad_norm": 2.261298894882202, "learning_rate": 4.5527568708390886e-05, "loss": 1.0201, "step": 288200 }, { "epoch": 4.473998665404491, "grad_norm": 2.2422170639038086, "learning_rate": 4.5526016853147944e-05, "loss": 0.9831, "step": 288300 }, { "epoch": 4.475550520647434, "grad_norm": 2.165275812149048, "learning_rate": 4.5524464997904995e-05, "loss": 0.9842, "step": 288400 }, { "epoch": 4.477102375890377, "grad_norm": 1.9195927381515503, "learning_rate": 4.552291314266205e-05, "loss": 0.9799, "step": 288500 }, { "epoch": 4.47865423113332, "grad_norm": 2.3623716831207275, "learning_rate": 4.552136128741911e-05, "loss": 0.9923, "step": 288600 }, { "epoch": 4.4802060863762625, "grad_norm": 2.3932673931121826, "learning_rate": 4.551980943217617e-05, "loss": 0.9865, "step": 288700 }, { "epoch": 4.481757941619206, "grad_norm": 1.7156976461410522, "learning_rate": 4.5518257576933226e-05, "loss": 0.9932, "step": 288800 }, { "epoch": 4.483309796862149, "grad_norm": 2.1037440299987793, "learning_rate": 4.5516705721690283e-05, "loss": 1.0139, "step": 288900 }, { "epoch": 4.484861652105091, "grad_norm": 2.0449819564819336, "learning_rate": 4.551515386644734e-05, "loss": 1.0049, "step": 289000 }, { "epoch": 4.4864135073480345, "grad_norm": 2.0198733806610107, "learning_rate": 4.55136020112044e-05, "loss": 0.9872, "step": 289100 }, { "epoch": 4.487965362590978, "grad_norm": 2.188941240310669, "learning_rate": 4.551205015596146e-05, "loss": 0.9831, "step": 289200 }, { "epoch": 4.489517217833921, "grad_norm": 2.4358768463134766, "learning_rate": 4.5510498300718514e-05, "loss": 0.9982, "step": 289300 }, { "epoch": 4.491069073076863, "grad_norm": 2.595031976699829, "learning_rate": 4.550894644547557e-05, "loss": 0.9948, "step": 289400 }, { "epoch": 4.492620928319806, "grad_norm": 1.8055930137634277, "learning_rate": 4.550739459023263e-05, "loss": 0.9916, "step": 289500 }, { "epoch": 4.49417278356275, "grad_norm": 2.531336784362793, "learning_rate": 4.550584273498969e-05, "loss": 0.9974, "step": 289600 }, { "epoch": 4.495724638805692, "grad_norm": 1.9998985528945923, "learning_rate": 4.550429087974674e-05, "loss": 1.0086, "step": 289700 }, { "epoch": 4.497276494048635, "grad_norm": 2.4632577896118164, "learning_rate": 4.5502739024503796e-05, "loss": 1.0142, "step": 289800 }, { "epoch": 4.498828349291578, "grad_norm": 1.9779094457626343, "learning_rate": 4.5501187169260854e-05, "loss": 0.9987, "step": 289900 }, { "epoch": 4.500380204534521, "grad_norm": 2.818877696990967, "learning_rate": 4.549963531401791e-05, "loss": 0.9816, "step": 290000 }, { "epoch": 4.501932059777464, "grad_norm": 1.737345576286316, "learning_rate": 4.549808345877496e-05, "loss": 0.9997, "step": 290100 }, { "epoch": 4.503483915020407, "grad_norm": 2.178837776184082, "learning_rate": 4.549653160353202e-05, "loss": 1.0328, "step": 290200 }, { "epoch": 4.5050357702633494, "grad_norm": 2.073700428009033, "learning_rate": 4.549497974828908e-05, "loss": 0.9848, "step": 290300 }, { "epoch": 4.506587625506293, "grad_norm": 2.192124843597412, "learning_rate": 4.5493427893046136e-05, "loss": 0.984, "step": 290400 }, { "epoch": 4.508139480749236, "grad_norm": 2.240753650665283, "learning_rate": 4.5491876037803194e-05, "loss": 0.9918, "step": 290500 }, { "epoch": 4.509691335992178, "grad_norm": 2.257843017578125, "learning_rate": 4.549032418256025e-05, "loss": 0.9898, "step": 290600 }, { "epoch": 4.511243191235121, "grad_norm": 1.9702281951904297, "learning_rate": 4.548877232731731e-05, "loss": 1.0081, "step": 290700 }, { "epoch": 4.512795046478065, "grad_norm": 2.0653622150421143, "learning_rate": 4.548722047207437e-05, "loss": 0.9936, "step": 290800 }, { "epoch": 4.514346901721008, "grad_norm": 2.2016284465789795, "learning_rate": 4.5485668616831425e-05, "loss": 1.0005, "step": 290900 }, { "epoch": 4.51589875696395, "grad_norm": 1.957107424736023, "learning_rate": 4.548411676158848e-05, "loss": 0.9952, "step": 291000 }, { "epoch": 4.517450612206893, "grad_norm": 2.218761682510376, "learning_rate": 4.548256490634554e-05, "loss": 0.9957, "step": 291100 }, { "epoch": 4.519002467449837, "grad_norm": 2.4304659366607666, "learning_rate": 4.548101305110259e-05, "loss": 0.9989, "step": 291200 }, { "epoch": 4.520554322692779, "grad_norm": 2.1235270500183105, "learning_rate": 4.547946119585965e-05, "loss": 1.0112, "step": 291300 }, { "epoch": 4.522106177935722, "grad_norm": 2.281203508377075, "learning_rate": 4.547790934061671e-05, "loss": 0.9907, "step": 291400 }, { "epoch": 4.523658033178665, "grad_norm": 1.9063180685043335, "learning_rate": 4.5476357485373765e-05, "loss": 0.9974, "step": 291500 }, { "epoch": 4.525209888421608, "grad_norm": 2.5643959045410156, "learning_rate": 4.547480563013082e-05, "loss": 1.0075, "step": 291600 }, { "epoch": 4.526761743664551, "grad_norm": 2.390983819961548, "learning_rate": 4.547325377488788e-05, "loss": 0.9906, "step": 291700 }, { "epoch": 4.528313598907494, "grad_norm": 2.0381689071655273, "learning_rate": 4.547170191964494e-05, "loss": 1.0249, "step": 291800 }, { "epoch": 4.529865454150437, "grad_norm": 2.567638874053955, "learning_rate": 4.5470150064401996e-05, "loss": 1.0053, "step": 291900 }, { "epoch": 4.53141730939338, "grad_norm": 2.4526500701904297, "learning_rate": 4.5468598209159053e-05, "loss": 0.9843, "step": 292000 }, { "epoch": 4.532969164636323, "grad_norm": 2.254894256591797, "learning_rate": 4.546704635391611e-05, "loss": 1.0166, "step": 292100 }, { "epoch": 4.534521019879266, "grad_norm": 2.157649517059326, "learning_rate": 4.546549449867317e-05, "loss": 0.9917, "step": 292200 }, { "epoch": 4.536072875122208, "grad_norm": 2.449471950531006, "learning_rate": 4.546394264343023e-05, "loss": 0.9843, "step": 292300 }, { "epoch": 4.537624730365152, "grad_norm": 2.03277850151062, "learning_rate": 4.5462390788187284e-05, "loss": 1.0109, "step": 292400 }, { "epoch": 4.539176585608095, "grad_norm": 2.191074848175049, "learning_rate": 4.5460838932944335e-05, "loss": 1.0053, "step": 292500 }, { "epoch": 4.540728440851037, "grad_norm": 2.225010395050049, "learning_rate": 4.545928707770139e-05, "loss": 0.9716, "step": 292600 }, { "epoch": 4.54228029609398, "grad_norm": 1.9364932775497437, "learning_rate": 4.545773522245845e-05, "loss": 0.9762, "step": 292700 }, { "epoch": 4.5438321513369235, "grad_norm": 2.308445692062378, "learning_rate": 4.545618336721551e-05, "loss": 0.9831, "step": 292800 }, { "epoch": 4.545384006579866, "grad_norm": 2.489295244216919, "learning_rate": 4.5454631511972566e-05, "loss": 0.9944, "step": 292900 }, { "epoch": 4.546935861822809, "grad_norm": 2.206125259399414, "learning_rate": 4.5453079656729624e-05, "loss": 1.0031, "step": 293000 }, { "epoch": 4.548487717065752, "grad_norm": 2.2450289726257324, "learning_rate": 4.545152780148668e-05, "loss": 0.9916, "step": 293100 }, { "epoch": 4.550039572308695, "grad_norm": 2.4696760177612305, "learning_rate": 4.544997594624374e-05, "loss": 1.0163, "step": 293200 }, { "epoch": 4.551591427551638, "grad_norm": 2.165466547012329, "learning_rate": 4.54484240910008e-05, "loss": 0.9767, "step": 293300 }, { "epoch": 4.553143282794581, "grad_norm": 2.521078109741211, "learning_rate": 4.544687223575785e-05, "loss": 0.9852, "step": 293400 }, { "epoch": 4.554695138037524, "grad_norm": 2.1630663871765137, "learning_rate": 4.5445320380514906e-05, "loss": 0.9939, "step": 293500 }, { "epoch": 4.556246993280467, "grad_norm": 2.229306221008301, "learning_rate": 4.5443768525271964e-05, "loss": 1.0157, "step": 293600 }, { "epoch": 4.55779884852341, "grad_norm": 2.200291633605957, "learning_rate": 4.544221667002902e-05, "loss": 1.0082, "step": 293700 }, { "epoch": 4.559350703766353, "grad_norm": 2.1165928840637207, "learning_rate": 4.544066481478608e-05, "loss": 1.0013, "step": 293800 }, { "epoch": 4.560902559009295, "grad_norm": 1.8286538124084473, "learning_rate": 4.543911295954314e-05, "loss": 0.9902, "step": 293900 }, { "epoch": 4.5624544142522385, "grad_norm": 2.3261590003967285, "learning_rate": 4.543756110430019e-05, "loss": 1.0035, "step": 294000 }, { "epoch": 4.564006269495182, "grad_norm": 2.176161289215088, "learning_rate": 4.5436009249057246e-05, "loss": 0.9872, "step": 294100 }, { "epoch": 4.565558124738124, "grad_norm": 2.6515414714813232, "learning_rate": 4.5434457393814304e-05, "loss": 0.9776, "step": 294200 }, { "epoch": 4.567109979981067, "grad_norm": 2.072274684906006, "learning_rate": 4.543290553857136e-05, "loss": 0.9843, "step": 294300 }, { "epoch": 4.5686618352240105, "grad_norm": 2.182770013809204, "learning_rate": 4.543135368332842e-05, "loss": 0.9804, "step": 294400 }, { "epoch": 4.570213690466954, "grad_norm": 2.168555974960327, "learning_rate": 4.542980182808548e-05, "loss": 0.9833, "step": 294500 }, { "epoch": 4.571765545709896, "grad_norm": 1.6700162887573242, "learning_rate": 4.5428249972842535e-05, "loss": 1.0007, "step": 294600 }, { "epoch": 4.573317400952839, "grad_norm": 1.9702121019363403, "learning_rate": 4.542669811759959e-05, "loss": 0.9926, "step": 294700 }, { "epoch": 4.574869256195782, "grad_norm": 2.87471342086792, "learning_rate": 4.542514626235665e-05, "loss": 1.0119, "step": 294800 }, { "epoch": 4.576421111438725, "grad_norm": 2.0131750106811523, "learning_rate": 4.542359440711371e-05, "loss": 1.0124, "step": 294900 }, { "epoch": 4.577972966681668, "grad_norm": 2.314908981323242, "learning_rate": 4.5422042551870766e-05, "loss": 1.0003, "step": 295000 }, { "epoch": 4.579524821924611, "grad_norm": 2.1337711811065674, "learning_rate": 4.5420490696627823e-05, "loss": 1.0033, "step": 295100 }, { "epoch": 4.5810766771675535, "grad_norm": 2.309758186340332, "learning_rate": 4.541893884138488e-05, "loss": 1.01, "step": 295200 }, { "epoch": 4.582628532410497, "grad_norm": 2.365941047668457, "learning_rate": 4.541738698614193e-05, "loss": 1.0149, "step": 295300 }, { "epoch": 4.58418038765344, "grad_norm": 2.3478477001190186, "learning_rate": 4.541583513089899e-05, "loss": 0.9849, "step": 295400 }, { "epoch": 4.585732242896382, "grad_norm": 2.2278852462768555, "learning_rate": 4.541428327565605e-05, "loss": 1.0097, "step": 295500 }, { "epoch": 4.5872840981393255, "grad_norm": 2.4725451469421387, "learning_rate": 4.5412731420413105e-05, "loss": 1.0085, "step": 295600 }, { "epoch": 4.588835953382269, "grad_norm": 1.7953201532363892, "learning_rate": 4.541117956517016e-05, "loss": 0.9874, "step": 295700 }, { "epoch": 4.590387808625211, "grad_norm": 2.429874897003174, "learning_rate": 4.540962770992722e-05, "loss": 0.9995, "step": 295800 }, { "epoch": 4.591939663868154, "grad_norm": 2.4995932579040527, "learning_rate": 4.540807585468428e-05, "loss": 0.9981, "step": 295900 }, { "epoch": 4.593491519111097, "grad_norm": 2.5319671630859375, "learning_rate": 4.5406523999441336e-05, "loss": 0.9713, "step": 296000 }, { "epoch": 4.59504337435404, "grad_norm": 2.855954885482788, "learning_rate": 4.5404972144198394e-05, "loss": 0.992, "step": 296100 }, { "epoch": 4.596595229596983, "grad_norm": 1.8545739650726318, "learning_rate": 4.540342028895545e-05, "loss": 0.9736, "step": 296200 }, { "epoch": 4.598147084839926, "grad_norm": 2.542313575744629, "learning_rate": 4.540186843371251e-05, "loss": 1.0157, "step": 296300 }, { "epoch": 4.599698940082869, "grad_norm": 2.090695381164551, "learning_rate": 4.540031657846957e-05, "loss": 0.9859, "step": 296400 }, { "epoch": 4.601250795325812, "grad_norm": 2.1615982055664062, "learning_rate": 4.5398764723226625e-05, "loss": 0.9832, "step": 296500 }, { "epoch": 4.602802650568755, "grad_norm": 2.0939669609069824, "learning_rate": 4.5397212867983676e-05, "loss": 1.0131, "step": 296600 }, { "epoch": 4.604354505811698, "grad_norm": 2.218007802963257, "learning_rate": 4.5395661012740734e-05, "loss": 0.9836, "step": 296700 }, { "epoch": 4.6059063610546405, "grad_norm": 2.3938162326812744, "learning_rate": 4.539410915749779e-05, "loss": 0.9996, "step": 296800 }, { "epoch": 4.607458216297584, "grad_norm": 2.6263668537139893, "learning_rate": 4.539255730225484e-05, "loss": 0.9872, "step": 296900 }, { "epoch": 4.609010071540527, "grad_norm": 2.299152374267578, "learning_rate": 4.53910054470119e-05, "loss": 1.0188, "step": 297000 }, { "epoch": 4.61056192678347, "grad_norm": 2.205808162689209, "learning_rate": 4.538945359176896e-05, "loss": 1.0027, "step": 297100 }, { "epoch": 4.612113782026412, "grad_norm": 2.102097988128662, "learning_rate": 4.5387901736526016e-05, "loss": 0.9848, "step": 297200 }, { "epoch": 4.613665637269356, "grad_norm": 1.9192564487457275, "learning_rate": 4.5386349881283074e-05, "loss": 1.0018, "step": 297300 }, { "epoch": 4.615217492512299, "grad_norm": 2.2630560398101807, "learning_rate": 4.538479802604013e-05, "loss": 1.0156, "step": 297400 }, { "epoch": 4.616769347755241, "grad_norm": 2.279552936553955, "learning_rate": 4.538324617079719e-05, "loss": 0.9843, "step": 297500 }, { "epoch": 4.618321202998184, "grad_norm": 2.186372995376587, "learning_rate": 4.538169431555425e-05, "loss": 1.0196, "step": 297600 }, { "epoch": 4.619873058241128, "grad_norm": 2.1136887073516846, "learning_rate": 4.5380142460311305e-05, "loss": 0.9971, "step": 297700 }, { "epoch": 4.62142491348407, "grad_norm": 2.56472110748291, "learning_rate": 4.537859060506836e-05, "loss": 1.0058, "step": 297800 }, { "epoch": 4.622976768727013, "grad_norm": 1.9326163530349731, "learning_rate": 4.537703874982542e-05, "loss": 0.9791, "step": 297900 }, { "epoch": 4.624528623969956, "grad_norm": 2.8529694080352783, "learning_rate": 4.537548689458248e-05, "loss": 1.0055, "step": 298000 }, { "epoch": 4.626080479212899, "grad_norm": 1.6828869581222534, "learning_rate": 4.5373935039339536e-05, "loss": 0.9932, "step": 298100 }, { "epoch": 4.627632334455842, "grad_norm": 2.295825958251953, "learning_rate": 4.537238318409659e-05, "loss": 1.0028, "step": 298200 }, { "epoch": 4.629184189698785, "grad_norm": 2.0858960151672363, "learning_rate": 4.5370831328853644e-05, "loss": 0.9802, "step": 298300 }, { "epoch": 4.630736044941727, "grad_norm": 2.1927192211151123, "learning_rate": 4.53692794736107e-05, "loss": 0.9858, "step": 298400 }, { "epoch": 4.632287900184671, "grad_norm": 2.8608124256134033, "learning_rate": 4.536772761836776e-05, "loss": 0.9913, "step": 298500 }, { "epoch": 4.633839755427614, "grad_norm": 2.372511863708496, "learning_rate": 4.536617576312482e-05, "loss": 0.9922, "step": 298600 }, { "epoch": 4.635391610670556, "grad_norm": 2.015420913696289, "learning_rate": 4.5364623907881875e-05, "loss": 0.9957, "step": 298700 }, { "epoch": 4.636943465913499, "grad_norm": 2.1719632148742676, "learning_rate": 4.536307205263893e-05, "loss": 0.9994, "step": 298800 }, { "epoch": 4.638495321156443, "grad_norm": 2.063201665878296, "learning_rate": 4.536152019739599e-05, "loss": 1.0103, "step": 298900 }, { "epoch": 4.640047176399386, "grad_norm": 2.673893928527832, "learning_rate": 4.535996834215305e-05, "loss": 1.006, "step": 299000 }, { "epoch": 4.641599031642328, "grad_norm": 2.41896915435791, "learning_rate": 4.5358416486910106e-05, "loss": 1.0095, "step": 299100 }, { "epoch": 4.643150886885271, "grad_norm": 1.9028549194335938, "learning_rate": 4.5356864631667164e-05, "loss": 1.0122, "step": 299200 }, { "epoch": 4.6447027421282145, "grad_norm": 2.820322036743164, "learning_rate": 4.535531277642422e-05, "loss": 1.0017, "step": 299300 }, { "epoch": 4.646254597371157, "grad_norm": 2.2285239696502686, "learning_rate": 4.535376092118128e-05, "loss": 0.9867, "step": 299400 }, { "epoch": 4.6478064526141, "grad_norm": 2.2062690258026123, "learning_rate": 4.535220906593833e-05, "loss": 1.0204, "step": 299500 }, { "epoch": 4.649358307857043, "grad_norm": 2.4299566745758057, "learning_rate": 4.535065721069539e-05, "loss": 1.018, "step": 299600 }, { "epoch": 4.6509101630999865, "grad_norm": 2.242126703262329, "learning_rate": 4.5349105355452446e-05, "loss": 1.0052, "step": 299700 }, { "epoch": 4.652462018342929, "grad_norm": 2.1155760288238525, "learning_rate": 4.5347553500209504e-05, "loss": 0.9945, "step": 299800 }, { "epoch": 4.654013873585872, "grad_norm": 2.1999547481536865, "learning_rate": 4.5346001644966555e-05, "loss": 0.9897, "step": 299900 }, { "epoch": 4.655565728828815, "grad_norm": 2.0622591972351074, "learning_rate": 4.534444978972361e-05, "loss": 1.0064, "step": 300000 }, { "epoch": 4.657117584071758, "grad_norm": 2.2337169647216797, "learning_rate": 4.534289793448067e-05, "loss": 1.0082, "step": 300100 }, { "epoch": 4.658669439314701, "grad_norm": 2.4536898136138916, "learning_rate": 4.534134607923773e-05, "loss": 0.984, "step": 300200 }, { "epoch": 4.660221294557644, "grad_norm": 2.200178384780884, "learning_rate": 4.5339794223994786e-05, "loss": 0.9946, "step": 300300 }, { "epoch": 4.661773149800586, "grad_norm": 2.038768768310547, "learning_rate": 4.5338242368751844e-05, "loss": 0.9959, "step": 300400 }, { "epoch": 4.6633250050435295, "grad_norm": 2.1221811771392822, "learning_rate": 4.53366905135089e-05, "loss": 0.9891, "step": 300500 }, { "epoch": 4.664876860286473, "grad_norm": 2.423968553543091, "learning_rate": 4.533513865826596e-05, "loss": 0.997, "step": 300600 }, { "epoch": 4.666428715529415, "grad_norm": 2.2653868198394775, "learning_rate": 4.533358680302302e-05, "loss": 0.9694, "step": 300700 }, { "epoch": 4.667980570772358, "grad_norm": 2.121770143508911, "learning_rate": 4.5332034947780075e-05, "loss": 0.9759, "step": 300800 }, { "epoch": 4.6695324260153015, "grad_norm": 2.3002452850341797, "learning_rate": 4.533048309253713e-05, "loss": 1.0008, "step": 300900 }, { "epoch": 4.671084281258244, "grad_norm": 2.14780330657959, "learning_rate": 4.532893123729418e-05, "loss": 0.9991, "step": 301000 }, { "epoch": 4.672636136501187, "grad_norm": 2.5276637077331543, "learning_rate": 4.532737938205124e-05, "loss": 1.0057, "step": 301100 }, { "epoch": 4.67418799174413, "grad_norm": 2.351923704147339, "learning_rate": 4.53258275268083e-05, "loss": 0.9895, "step": 301200 }, { "epoch": 4.675739846987073, "grad_norm": 2.697972536087036, "learning_rate": 4.532427567156536e-05, "loss": 0.9825, "step": 301300 }, { "epoch": 4.677291702230016, "grad_norm": 2.461207628250122, "learning_rate": 4.5322723816322414e-05, "loss": 0.9975, "step": 301400 }, { "epoch": 4.678843557472959, "grad_norm": 2.165151834487915, "learning_rate": 4.532117196107947e-05, "loss": 1.0008, "step": 301500 }, { "epoch": 4.680395412715902, "grad_norm": 2.336306095123291, "learning_rate": 4.531962010583653e-05, "loss": 0.9773, "step": 301600 }, { "epoch": 4.6819472679588445, "grad_norm": 2.433595895767212, "learning_rate": 4.531806825059359e-05, "loss": 0.9846, "step": 301700 }, { "epoch": 4.683499123201788, "grad_norm": 2.1923928260803223, "learning_rate": 4.5316516395350645e-05, "loss": 0.9881, "step": 301800 }, { "epoch": 4.685050978444731, "grad_norm": 2.0921261310577393, "learning_rate": 4.53149645401077e-05, "loss": 0.9812, "step": 301900 }, { "epoch": 4.686602833687673, "grad_norm": 2.3394947052001953, "learning_rate": 4.531341268486476e-05, "loss": 0.9821, "step": 302000 }, { "epoch": 4.6881546889306165, "grad_norm": 2.5028738975524902, "learning_rate": 4.531186082962182e-05, "loss": 0.9903, "step": 302100 }, { "epoch": 4.68970654417356, "grad_norm": 2.1756536960601807, "learning_rate": 4.5310308974378876e-05, "loss": 0.9802, "step": 302200 }, { "epoch": 4.691258399416503, "grad_norm": 1.8686116933822632, "learning_rate": 4.530875711913593e-05, "loss": 1.0064, "step": 302300 }, { "epoch": 4.692810254659445, "grad_norm": 2.3257672786712646, "learning_rate": 4.5307205263892985e-05, "loss": 1.0109, "step": 302400 }, { "epoch": 4.6943621099023884, "grad_norm": 2.293214797973633, "learning_rate": 4.530565340865004e-05, "loss": 0.9729, "step": 302500 }, { "epoch": 4.695913965145332, "grad_norm": 2.150447130203247, "learning_rate": 4.53041015534071e-05, "loss": 0.9702, "step": 302600 }, { "epoch": 4.697465820388274, "grad_norm": 2.234912395477295, "learning_rate": 4.530254969816416e-05, "loss": 0.9876, "step": 302700 }, { "epoch": 4.699017675631217, "grad_norm": 1.9642179012298584, "learning_rate": 4.5300997842921216e-05, "loss": 0.989, "step": 302800 }, { "epoch": 4.70056953087416, "grad_norm": 2.5795674324035645, "learning_rate": 4.5299445987678274e-05, "loss": 0.9802, "step": 302900 }, { "epoch": 4.702121386117103, "grad_norm": 1.7124474048614502, "learning_rate": 4.529789413243533e-05, "loss": 0.9993, "step": 303000 }, { "epoch": 4.703673241360046, "grad_norm": 2.22861647605896, "learning_rate": 4.529634227719239e-05, "loss": 1.0023, "step": 303100 }, { "epoch": 4.705225096602989, "grad_norm": 2.579798460006714, "learning_rate": 4.529479042194944e-05, "loss": 0.9942, "step": 303200 }, { "epoch": 4.7067769518459315, "grad_norm": 2.3989596366882324, "learning_rate": 4.52932385667065e-05, "loss": 0.9941, "step": 303300 }, { "epoch": 4.708328807088875, "grad_norm": 2.2754008769989014, "learning_rate": 4.5291686711463556e-05, "loss": 0.9984, "step": 303400 }, { "epoch": 4.709880662331818, "grad_norm": 2.5523416996002197, "learning_rate": 4.5290134856220614e-05, "loss": 1.0024, "step": 303500 }, { "epoch": 4.71143251757476, "grad_norm": 2.5854275226593018, "learning_rate": 4.528858300097767e-05, "loss": 1.0118, "step": 303600 }, { "epoch": 4.712984372817703, "grad_norm": 2.160099983215332, "learning_rate": 4.528703114573473e-05, "loss": 0.993, "step": 303700 }, { "epoch": 4.714536228060647, "grad_norm": 2.3430426120758057, "learning_rate": 4.528547929049179e-05, "loss": 0.9646, "step": 303800 }, { "epoch": 4.716088083303589, "grad_norm": 2.329451322555542, "learning_rate": 4.528392743524884e-05, "loss": 0.9812, "step": 303900 }, { "epoch": 4.717639938546532, "grad_norm": 2.225872755050659, "learning_rate": 4.5282375580005896e-05, "loss": 0.9924, "step": 304000 }, { "epoch": 4.719191793789475, "grad_norm": 1.8935221433639526, "learning_rate": 4.528082372476295e-05, "loss": 0.9887, "step": 304100 }, { "epoch": 4.720743649032419, "grad_norm": 1.9854744672775269, "learning_rate": 4.527927186952001e-05, "loss": 1.0062, "step": 304200 }, { "epoch": 4.722295504275361, "grad_norm": 2.467484712600708, "learning_rate": 4.527772001427707e-05, "loss": 1.0174, "step": 304300 }, { "epoch": 4.723847359518304, "grad_norm": 1.760667085647583, "learning_rate": 4.527616815903413e-05, "loss": 0.9869, "step": 304400 }, { "epoch": 4.725399214761247, "grad_norm": 2.415112257003784, "learning_rate": 4.5274616303791184e-05, "loss": 0.9997, "step": 304500 }, { "epoch": 4.72695107000419, "grad_norm": 2.0632479190826416, "learning_rate": 4.527306444854824e-05, "loss": 0.9813, "step": 304600 }, { "epoch": 4.728502925247133, "grad_norm": 1.8340388536453247, "learning_rate": 4.52715125933053e-05, "loss": 0.9965, "step": 304700 }, { "epoch": 4.730054780490076, "grad_norm": 2.0908021926879883, "learning_rate": 4.526996073806236e-05, "loss": 1.0083, "step": 304800 }, { "epoch": 4.731606635733019, "grad_norm": 2.1623146533966064, "learning_rate": 4.5268408882819415e-05, "loss": 1.0052, "step": 304900 }, { "epoch": 4.733158490975962, "grad_norm": 2.4359967708587646, "learning_rate": 4.526685702757647e-05, "loss": 0.9861, "step": 305000 }, { "epoch": 4.734710346218905, "grad_norm": 2.168280839920044, "learning_rate": 4.526530517233353e-05, "loss": 0.9995, "step": 305100 }, { "epoch": 4.736262201461848, "grad_norm": 2.062199831008911, "learning_rate": 4.526375331709058e-05, "loss": 0.9858, "step": 305200 }, { "epoch": 4.73781405670479, "grad_norm": 1.9365390539169312, "learning_rate": 4.526220146184764e-05, "loss": 1.0009, "step": 305300 }, { "epoch": 4.739365911947734, "grad_norm": 2.0393035411834717, "learning_rate": 4.52606496066047e-05, "loss": 0.9709, "step": 305400 }, { "epoch": 4.740917767190677, "grad_norm": 2.06400728225708, "learning_rate": 4.5259097751361755e-05, "loss": 1.001, "step": 305500 }, { "epoch": 4.742469622433619, "grad_norm": 3.0484304428100586, "learning_rate": 4.525754589611881e-05, "loss": 0.9995, "step": 305600 }, { "epoch": 4.744021477676562, "grad_norm": 2.1534829139709473, "learning_rate": 4.525599404087587e-05, "loss": 0.9965, "step": 305700 }, { "epoch": 4.745573332919506, "grad_norm": 2.2801458835601807, "learning_rate": 4.525444218563293e-05, "loss": 0.9937, "step": 305800 }, { "epoch": 4.747125188162448, "grad_norm": 2.2503480911254883, "learning_rate": 4.5252890330389986e-05, "loss": 0.9841, "step": 305900 }, { "epoch": 4.748677043405391, "grad_norm": 2.228114128112793, "learning_rate": 4.5251338475147044e-05, "loss": 1.007, "step": 306000 }, { "epoch": 4.750228898648334, "grad_norm": 2.0279667377471924, "learning_rate": 4.52497866199041e-05, "loss": 0.9897, "step": 306100 }, { "epoch": 4.751780753891277, "grad_norm": 2.4756104946136475, "learning_rate": 4.524823476466116e-05, "loss": 0.9842, "step": 306200 }, { "epoch": 4.75333260913422, "grad_norm": 2.138491630554199, "learning_rate": 4.524668290941822e-05, "loss": 0.9799, "step": 306300 }, { "epoch": 4.754884464377163, "grad_norm": 2.3243930339813232, "learning_rate": 4.524513105417527e-05, "loss": 0.9912, "step": 306400 }, { "epoch": 4.756436319620105, "grad_norm": 2.1239287853240967, "learning_rate": 4.5243579198932326e-05, "loss": 0.9935, "step": 306500 }, { "epoch": 4.757988174863049, "grad_norm": 2.103210926055908, "learning_rate": 4.5242027343689384e-05, "loss": 0.9969, "step": 306600 }, { "epoch": 4.759540030105992, "grad_norm": 2.1429154872894287, "learning_rate": 4.5240475488446435e-05, "loss": 0.9772, "step": 306700 }, { "epoch": 4.761091885348935, "grad_norm": 2.703026533126831, "learning_rate": 4.523892363320349e-05, "loss": 0.9794, "step": 306800 }, { "epoch": 4.762643740591877, "grad_norm": 2.5382027626037598, "learning_rate": 4.523737177796055e-05, "loss": 0.9771, "step": 306900 }, { "epoch": 4.7641955958348206, "grad_norm": 2.189344882965088, "learning_rate": 4.523581992271761e-05, "loss": 0.9942, "step": 307000 }, { "epoch": 4.765747451077764, "grad_norm": 2.2709391117095947, "learning_rate": 4.5234268067474666e-05, "loss": 1.01, "step": 307100 }, { "epoch": 4.767299306320706, "grad_norm": 2.155897378921509, "learning_rate": 4.523271621223172e-05, "loss": 0.9796, "step": 307200 }, { "epoch": 4.768851161563649, "grad_norm": 2.1065542697906494, "learning_rate": 4.523116435698878e-05, "loss": 0.9707, "step": 307300 }, { "epoch": 4.7704030168065925, "grad_norm": 2.1619253158569336, "learning_rate": 4.522961250174584e-05, "loss": 0.9895, "step": 307400 }, { "epoch": 4.771954872049535, "grad_norm": 2.0945324897766113, "learning_rate": 4.52280606465029e-05, "loss": 0.9718, "step": 307500 }, { "epoch": 4.773506727292478, "grad_norm": 2.4745171070098877, "learning_rate": 4.5226508791259954e-05, "loss": 0.9794, "step": 307600 }, { "epoch": 4.775058582535421, "grad_norm": 2.1260809898376465, "learning_rate": 4.522495693601701e-05, "loss": 0.9886, "step": 307700 }, { "epoch": 4.7766104377783645, "grad_norm": 2.019519805908203, "learning_rate": 4.522340508077407e-05, "loss": 0.9939, "step": 307800 }, { "epoch": 4.778162293021307, "grad_norm": 1.9292490482330322, "learning_rate": 4.522185322553113e-05, "loss": 0.9848, "step": 307900 }, { "epoch": 4.77971414826425, "grad_norm": 2.8044991493225098, "learning_rate": 4.522030137028818e-05, "loss": 0.9788, "step": 308000 }, { "epoch": 4.781266003507193, "grad_norm": 2.1083643436431885, "learning_rate": 4.5218749515045236e-05, "loss": 1.001, "step": 308100 }, { "epoch": 4.7828178587501355, "grad_norm": 2.6458473205566406, "learning_rate": 4.5217197659802294e-05, "loss": 0.983, "step": 308200 }, { "epoch": 4.784369713993079, "grad_norm": 2.059161901473999, "learning_rate": 4.521564580455935e-05, "loss": 0.985, "step": 308300 }, { "epoch": 4.785921569236022, "grad_norm": 2.3176801204681396, "learning_rate": 4.521409394931641e-05, "loss": 0.9939, "step": 308400 }, { "epoch": 4.787473424478964, "grad_norm": 2.340977191925049, "learning_rate": 4.521254209407347e-05, "loss": 0.9588, "step": 308500 }, { "epoch": 4.7890252797219075, "grad_norm": 1.9410039186477661, "learning_rate": 4.5210990238830525e-05, "loss": 0.9828, "step": 308600 }, { "epoch": 4.790577134964851, "grad_norm": 2.3338623046875, "learning_rate": 4.520943838358758e-05, "loss": 0.9828, "step": 308700 }, { "epoch": 4.792128990207793, "grad_norm": 1.9343339204788208, "learning_rate": 4.520788652834464e-05, "loss": 0.9921, "step": 308800 }, { "epoch": 4.793680845450736, "grad_norm": 2.3858683109283447, "learning_rate": 4.52063346731017e-05, "loss": 1.0177, "step": 308900 }, { "epoch": 4.7952327006936795, "grad_norm": 2.2674553394317627, "learning_rate": 4.5204782817858756e-05, "loss": 0.9824, "step": 309000 }, { "epoch": 4.796784555936622, "grad_norm": 2.558554172515869, "learning_rate": 4.5203230962615814e-05, "loss": 1.0095, "step": 309100 }, { "epoch": 4.798336411179565, "grad_norm": 2.6413729190826416, "learning_rate": 4.520167910737287e-05, "loss": 0.9757, "step": 309200 }, { "epoch": 4.799888266422508, "grad_norm": 1.7679721117019653, "learning_rate": 4.520012725212992e-05, "loss": 0.9771, "step": 309300 }, { "epoch": 4.801440121665451, "grad_norm": 1.9725315570831299, "learning_rate": 4.519857539688698e-05, "loss": 1.0053, "step": 309400 }, { "epoch": 4.802991976908394, "grad_norm": 2.084390640258789, "learning_rate": 4.519702354164404e-05, "loss": 0.9903, "step": 309500 }, { "epoch": 4.804543832151337, "grad_norm": 2.53524112701416, "learning_rate": 4.5195471686401096e-05, "loss": 0.9825, "step": 309600 }, { "epoch": 4.80609568739428, "grad_norm": 2.4231560230255127, "learning_rate": 4.519391983115815e-05, "loss": 0.9813, "step": 309700 }, { "epoch": 4.8076475426372225, "grad_norm": 2.319136619567871, "learning_rate": 4.5192367975915205e-05, "loss": 1.0041, "step": 309800 }, { "epoch": 4.809199397880166, "grad_norm": 2.1990528106689453, "learning_rate": 4.519081612067226e-05, "loss": 0.9864, "step": 309900 }, { "epoch": 4.810751253123109, "grad_norm": 2.3455920219421387, "learning_rate": 4.518926426542932e-05, "loss": 0.9708, "step": 310000 }, { "epoch": 4.812303108366051, "grad_norm": 2.130215883255005, "learning_rate": 4.518771241018638e-05, "loss": 0.9891, "step": 310100 }, { "epoch": 4.8138549636089945, "grad_norm": 2.2608296871185303, "learning_rate": 4.5186160554943436e-05, "loss": 0.9923, "step": 310200 }, { "epoch": 4.815406818851938, "grad_norm": 2.3929078578948975, "learning_rate": 4.518460869970049e-05, "loss": 0.9903, "step": 310300 }, { "epoch": 4.816958674094881, "grad_norm": 2.420306444168091, "learning_rate": 4.518305684445755e-05, "loss": 0.9769, "step": 310400 }, { "epoch": 4.818510529337823, "grad_norm": 2.4495632648468018, "learning_rate": 4.518150498921461e-05, "loss": 0.9902, "step": 310500 }, { "epoch": 4.820062384580766, "grad_norm": 2.5356087684631348, "learning_rate": 4.517995313397167e-05, "loss": 1.0062, "step": 310600 }, { "epoch": 4.82161423982371, "grad_norm": 2.158783435821533, "learning_rate": 4.5178401278728724e-05, "loss": 0.9685, "step": 310700 }, { "epoch": 4.823166095066652, "grad_norm": 1.907422423362732, "learning_rate": 4.5176849423485775e-05, "loss": 0.9689, "step": 310800 }, { "epoch": 4.824717950309595, "grad_norm": 2.1155383586883545, "learning_rate": 4.517529756824283e-05, "loss": 0.9939, "step": 310900 }, { "epoch": 4.826269805552538, "grad_norm": 2.372523546218872, "learning_rate": 4.517374571299989e-05, "loss": 1.0008, "step": 311000 }, { "epoch": 4.827821660795481, "grad_norm": 2.369626760482788, "learning_rate": 4.517219385775695e-05, "loss": 0.982, "step": 311100 }, { "epoch": 4.829373516038424, "grad_norm": 1.8950207233428955, "learning_rate": 4.5170642002514006e-05, "loss": 0.9838, "step": 311200 }, { "epoch": 4.830925371281367, "grad_norm": 2.445340871810913, "learning_rate": 4.5169090147271064e-05, "loss": 0.9776, "step": 311300 }, { "epoch": 4.832477226524309, "grad_norm": 2.245357036590576, "learning_rate": 4.516753829202812e-05, "loss": 1.0046, "step": 311400 }, { "epoch": 4.834029081767253, "grad_norm": 1.8052308559417725, "learning_rate": 4.516598643678518e-05, "loss": 0.9806, "step": 311500 }, { "epoch": 4.835580937010196, "grad_norm": 2.458707332611084, "learning_rate": 4.516443458154224e-05, "loss": 0.9934, "step": 311600 }, { "epoch": 4.837132792253138, "grad_norm": 2.2918500900268555, "learning_rate": 4.5162882726299295e-05, "loss": 0.9905, "step": 311700 }, { "epoch": 4.838684647496081, "grad_norm": 2.3220841884613037, "learning_rate": 4.516133087105635e-05, "loss": 0.995, "step": 311800 }, { "epoch": 4.840236502739025, "grad_norm": 2.166215181350708, "learning_rate": 4.515977901581341e-05, "loss": 0.9792, "step": 311900 }, { "epoch": 4.841788357981968, "grad_norm": 2.400585412979126, "learning_rate": 4.515822716057047e-05, "loss": 0.9908, "step": 312000 }, { "epoch": 4.84334021322491, "grad_norm": 2.4989497661590576, "learning_rate": 4.515667530532752e-05, "loss": 0.9835, "step": 312100 }, { "epoch": 4.844892068467853, "grad_norm": 2.049469232559204, "learning_rate": 4.515512345008458e-05, "loss": 0.9979, "step": 312200 }, { "epoch": 4.846443923710797, "grad_norm": 2.144580602645874, "learning_rate": 4.5153571594841635e-05, "loss": 0.9944, "step": 312300 }, { "epoch": 4.847995778953739, "grad_norm": 2.1540298461914062, "learning_rate": 4.515201973959869e-05, "loss": 0.9827, "step": 312400 }, { "epoch": 4.849547634196682, "grad_norm": 2.292189121246338, "learning_rate": 4.515046788435575e-05, "loss": 0.9862, "step": 312500 }, { "epoch": 4.851099489439625, "grad_norm": 2.9079971313476562, "learning_rate": 4.514891602911281e-05, "loss": 0.9729, "step": 312600 }, { "epoch": 4.852651344682568, "grad_norm": 2.5702126026153564, "learning_rate": 4.5147364173869866e-05, "loss": 1.001, "step": 312700 }, { "epoch": 4.854203199925511, "grad_norm": 1.8238567113876343, "learning_rate": 4.5145812318626924e-05, "loss": 0.9971, "step": 312800 }, { "epoch": 4.855755055168454, "grad_norm": 2.1982083320617676, "learning_rate": 4.5144260463383975e-05, "loss": 0.9819, "step": 312900 }, { "epoch": 4.857306910411397, "grad_norm": 2.0572688579559326, "learning_rate": 4.514270860814103e-05, "loss": 1.0122, "step": 313000 }, { "epoch": 4.85885876565434, "grad_norm": 2.3243939876556396, "learning_rate": 4.514115675289809e-05, "loss": 0.9688, "step": 313100 }, { "epoch": 4.860410620897283, "grad_norm": 1.879243016242981, "learning_rate": 4.513960489765515e-05, "loss": 0.9859, "step": 313200 }, { "epoch": 4.861962476140226, "grad_norm": 2.0839011669158936, "learning_rate": 4.5138053042412206e-05, "loss": 0.9712, "step": 313300 }, { "epoch": 4.863514331383168, "grad_norm": 1.9096519947052002, "learning_rate": 4.513650118716926e-05, "loss": 1.0063, "step": 313400 }, { "epoch": 4.865066186626112, "grad_norm": 2.4051766395568848, "learning_rate": 4.513494933192632e-05, "loss": 0.9932, "step": 313500 }, { "epoch": 4.866618041869055, "grad_norm": 1.9783592224121094, "learning_rate": 4.513339747668338e-05, "loss": 1.021, "step": 313600 }, { "epoch": 4.868169897111997, "grad_norm": 2.3197271823883057, "learning_rate": 4.513184562144043e-05, "loss": 0.9703, "step": 313700 }, { "epoch": 4.86972175235494, "grad_norm": 2.0943002700805664, "learning_rate": 4.513029376619749e-05, "loss": 0.9897, "step": 313800 }, { "epoch": 4.8712736075978835, "grad_norm": 2.539102792739868, "learning_rate": 4.5128741910954545e-05, "loss": 0.9885, "step": 313900 }, { "epoch": 4.872825462840826, "grad_norm": 2.569833517074585, "learning_rate": 4.51271900557116e-05, "loss": 1.0052, "step": 314000 }, { "epoch": 4.874377318083769, "grad_norm": 2.141819477081299, "learning_rate": 4.512563820046866e-05, "loss": 0.9849, "step": 314100 }, { "epoch": 4.875929173326712, "grad_norm": 2.1459314823150635, "learning_rate": 4.512408634522572e-05, "loss": 0.9939, "step": 314200 }, { "epoch": 4.877481028569655, "grad_norm": 2.138211965560913, "learning_rate": 4.5122534489982776e-05, "loss": 1.0196, "step": 314300 }, { "epoch": 4.879032883812598, "grad_norm": 2.1440787315368652, "learning_rate": 4.5120982634739834e-05, "loss": 0.9893, "step": 314400 }, { "epoch": 4.880584739055541, "grad_norm": 1.9738491773605347, "learning_rate": 4.511943077949689e-05, "loss": 0.9883, "step": 314500 }, { "epoch": 4.882136594298483, "grad_norm": 2.0031113624572754, "learning_rate": 4.511787892425395e-05, "loss": 0.9808, "step": 314600 }, { "epoch": 4.883688449541427, "grad_norm": 1.838338851928711, "learning_rate": 4.511632706901101e-05, "loss": 0.9773, "step": 314700 }, { "epoch": 4.88524030478437, "grad_norm": 2.5889651775360107, "learning_rate": 4.5114775213768065e-05, "loss": 0.9903, "step": 314800 }, { "epoch": 4.886792160027313, "grad_norm": 1.8924591541290283, "learning_rate": 4.511322335852512e-05, "loss": 0.9823, "step": 314900 }, { "epoch": 4.888344015270255, "grad_norm": 2.262155771255493, "learning_rate": 4.5111671503282174e-05, "loss": 0.9868, "step": 315000 }, { "epoch": 4.8898958705131985, "grad_norm": 2.004237413406372, "learning_rate": 4.511011964803923e-05, "loss": 0.9847, "step": 315100 }, { "epoch": 4.891447725756142, "grad_norm": 1.9922564029693604, "learning_rate": 4.510856779279629e-05, "loss": 0.9897, "step": 315200 }, { "epoch": 4.892999580999084, "grad_norm": 2.1039512157440186, "learning_rate": 4.510701593755335e-05, "loss": 0.9976, "step": 315300 }, { "epoch": 4.894551436242027, "grad_norm": 2.3522932529449463, "learning_rate": 4.5105464082310405e-05, "loss": 0.9902, "step": 315400 }, { "epoch": 4.8961032914849705, "grad_norm": 3.199152946472168, "learning_rate": 4.510391222706746e-05, "loss": 1.0005, "step": 315500 }, { "epoch": 4.897655146727914, "grad_norm": 2.3306031227111816, "learning_rate": 4.510236037182452e-05, "loss": 0.9912, "step": 315600 }, { "epoch": 4.899207001970856, "grad_norm": 1.844973087310791, "learning_rate": 4.510080851658158e-05, "loss": 0.9901, "step": 315700 }, { "epoch": 4.900758857213799, "grad_norm": 2.1430327892303467, "learning_rate": 4.5099256661338636e-05, "loss": 0.9988, "step": 315800 }, { "epoch": 4.902310712456742, "grad_norm": 1.9988670349121094, "learning_rate": 4.5097704806095694e-05, "loss": 0.9826, "step": 315900 }, { "epoch": 4.903862567699685, "grad_norm": 1.9529697895050049, "learning_rate": 4.509615295085275e-05, "loss": 1.0013, "step": 316000 }, { "epoch": 4.905414422942628, "grad_norm": 2.120429754257202, "learning_rate": 4.509460109560981e-05, "loss": 0.9941, "step": 316100 }, { "epoch": 4.906966278185571, "grad_norm": 1.7236740589141846, "learning_rate": 4.509304924036686e-05, "loss": 0.9939, "step": 316200 }, { "epoch": 4.9085181334285135, "grad_norm": 2.757526159286499, "learning_rate": 4.509149738512392e-05, "loss": 0.9928, "step": 316300 }, { "epoch": 4.910069988671457, "grad_norm": 2.2114367485046387, "learning_rate": 4.5089945529880976e-05, "loss": 1.0006, "step": 316400 }, { "epoch": 4.9116218439144, "grad_norm": 2.275763750076294, "learning_rate": 4.5088393674638027e-05, "loss": 0.9995, "step": 316500 }, { "epoch": 4.913173699157342, "grad_norm": 2.070420026779175, "learning_rate": 4.5086841819395084e-05, "loss": 0.9977, "step": 316600 }, { "epoch": 4.9147255544002855, "grad_norm": 2.078160285949707, "learning_rate": 4.508528996415214e-05, "loss": 0.9926, "step": 316700 }, { "epoch": 4.916277409643229, "grad_norm": 2.0535004138946533, "learning_rate": 4.50837381089092e-05, "loss": 1.0036, "step": 316800 }, { "epoch": 4.917829264886171, "grad_norm": 1.8921369314193726, "learning_rate": 4.508218625366626e-05, "loss": 0.9813, "step": 316900 }, { "epoch": 4.919381120129114, "grad_norm": 2.2783751487731934, "learning_rate": 4.5080634398423315e-05, "loss": 0.9746, "step": 317000 }, { "epoch": 4.920932975372057, "grad_norm": 2.1465771198272705, "learning_rate": 4.507908254318037e-05, "loss": 0.9771, "step": 317100 }, { "epoch": 4.922484830615, "grad_norm": 2.3580241203308105, "learning_rate": 4.507753068793743e-05, "loss": 0.9956, "step": 317200 }, { "epoch": 4.924036685857943, "grad_norm": 2.3010239601135254, "learning_rate": 4.507597883269449e-05, "loss": 0.9863, "step": 317300 }, { "epoch": 4.925588541100886, "grad_norm": 1.9766231775283813, "learning_rate": 4.5074426977451546e-05, "loss": 1.0005, "step": 317400 }, { "epoch": 4.927140396343829, "grad_norm": 2.148529052734375, "learning_rate": 4.5072875122208604e-05, "loss": 0.9843, "step": 317500 }, { "epoch": 4.928692251586772, "grad_norm": 2.9304866790771484, "learning_rate": 4.507132326696566e-05, "loss": 1.0141, "step": 317600 }, { "epoch": 4.930244106829715, "grad_norm": 2.2647006511688232, "learning_rate": 4.506977141172272e-05, "loss": 0.9937, "step": 317700 }, { "epoch": 4.931795962072658, "grad_norm": 2.278210401535034, "learning_rate": 4.506821955647977e-05, "loss": 1.0097, "step": 317800 }, { "epoch": 4.9333478173156005, "grad_norm": 1.9699627161026, "learning_rate": 4.506666770123683e-05, "loss": 0.9968, "step": 317900 }, { "epoch": 4.934899672558544, "grad_norm": 2.1486823558807373, "learning_rate": 4.5065115845993886e-05, "loss": 1.0099, "step": 318000 }, { "epoch": 4.936451527801487, "grad_norm": 1.8996237516403198, "learning_rate": 4.5063563990750944e-05, "loss": 0.9992, "step": 318100 }, { "epoch": 4.93800338304443, "grad_norm": 2.353362798690796, "learning_rate": 4.5062012135508e-05, "loss": 0.9961, "step": 318200 }, { "epoch": 4.939555238287372, "grad_norm": 2.2171120643615723, "learning_rate": 4.506046028026506e-05, "loss": 0.9911, "step": 318300 }, { "epoch": 4.941107093530316, "grad_norm": 2.42317533493042, "learning_rate": 4.505890842502212e-05, "loss": 0.9772, "step": 318400 }, { "epoch": 4.942658948773259, "grad_norm": 2.350301504135132, "learning_rate": 4.5057356569779175e-05, "loss": 0.9862, "step": 318500 }, { "epoch": 4.944210804016201, "grad_norm": 2.3713793754577637, "learning_rate": 4.505580471453623e-05, "loss": 1.0027, "step": 318600 }, { "epoch": 4.945762659259144, "grad_norm": 1.9676238298416138, "learning_rate": 4.505425285929329e-05, "loss": 0.9957, "step": 318700 }, { "epoch": 4.947314514502088, "grad_norm": 2.1496734619140625, "learning_rate": 4.505270100405035e-05, "loss": 0.9971, "step": 318800 }, { "epoch": 4.94886636974503, "grad_norm": 2.2096948623657227, "learning_rate": 4.5051149148807406e-05, "loss": 0.9802, "step": 318900 }, { "epoch": 4.950418224987973, "grad_norm": 1.9896800518035889, "learning_rate": 4.5049597293564464e-05, "loss": 1.0068, "step": 319000 }, { "epoch": 4.951970080230916, "grad_norm": 2.3184118270874023, "learning_rate": 4.5048045438321515e-05, "loss": 0.9628, "step": 319100 }, { "epoch": 4.953521935473859, "grad_norm": 2.2848899364471436, "learning_rate": 4.504649358307857e-05, "loss": 0.9781, "step": 319200 }, { "epoch": 4.955073790716802, "grad_norm": 2.2715554237365723, "learning_rate": 4.504494172783563e-05, "loss": 0.9828, "step": 319300 }, { "epoch": 4.956625645959745, "grad_norm": 2.0597498416900635, "learning_rate": 4.504338987259268e-05, "loss": 0.9878, "step": 319400 }, { "epoch": 4.958177501202687, "grad_norm": 2.066211700439453, "learning_rate": 4.504183801734974e-05, "loss": 0.9954, "step": 319500 }, { "epoch": 4.959729356445631, "grad_norm": 2.5709476470947266, "learning_rate": 4.5040286162106797e-05, "loss": 0.9929, "step": 319600 }, { "epoch": 4.961281211688574, "grad_norm": 2.1759355068206787, "learning_rate": 4.5038734306863854e-05, "loss": 0.9693, "step": 319700 }, { "epoch": 4.962833066931516, "grad_norm": 2.348320245742798, "learning_rate": 4.503718245162091e-05, "loss": 1.0064, "step": 319800 }, { "epoch": 4.964384922174459, "grad_norm": 2.4137532711029053, "learning_rate": 4.503563059637797e-05, "loss": 1.0024, "step": 319900 }, { "epoch": 4.965936777417403, "grad_norm": 2.276843547821045, "learning_rate": 4.503407874113503e-05, "loss": 1.0082, "step": 320000 }, { "epoch": 4.967488632660346, "grad_norm": 2.1787757873535156, "learning_rate": 4.5032526885892085e-05, "loss": 0.9911, "step": 320100 }, { "epoch": 4.969040487903288, "grad_norm": 2.136810779571533, "learning_rate": 4.503097503064914e-05, "loss": 0.9857, "step": 320200 }, { "epoch": 4.970592343146231, "grad_norm": 2.3838462829589844, "learning_rate": 4.50294231754062e-05, "loss": 0.9849, "step": 320300 }, { "epoch": 4.9721441983891745, "grad_norm": 2.494656801223755, "learning_rate": 4.502787132016326e-05, "loss": 1.0124, "step": 320400 }, { "epoch": 4.973696053632117, "grad_norm": 2.2673838138580322, "learning_rate": 4.5026319464920316e-05, "loss": 0.967, "step": 320500 }, { "epoch": 4.97524790887506, "grad_norm": 2.2103631496429443, "learning_rate": 4.5024767609677374e-05, "loss": 0.9964, "step": 320600 }, { "epoch": 4.976799764118003, "grad_norm": 2.6784791946411133, "learning_rate": 4.5023215754434425e-05, "loss": 0.9971, "step": 320700 }, { "epoch": 4.9783516193609465, "grad_norm": 2.480079174041748, "learning_rate": 4.502166389919148e-05, "loss": 0.9589, "step": 320800 }, { "epoch": 4.979903474603889, "grad_norm": 2.04945707321167, "learning_rate": 4.502011204394854e-05, "loss": 1.0014, "step": 320900 }, { "epoch": 4.981455329846832, "grad_norm": 2.558013677597046, "learning_rate": 4.50185601887056e-05, "loss": 0.9928, "step": 321000 }, { "epoch": 4.983007185089775, "grad_norm": 2.3200182914733887, "learning_rate": 4.5017008333462656e-05, "loss": 0.9969, "step": 321100 }, { "epoch": 4.984559040332718, "grad_norm": 2.510084629058838, "learning_rate": 4.5015456478219714e-05, "loss": 0.998, "step": 321200 }, { "epoch": 4.986110895575661, "grad_norm": 2.0798165798187256, "learning_rate": 4.501390462297677e-05, "loss": 0.996, "step": 321300 }, { "epoch": 4.987662750818604, "grad_norm": 2.3555374145507812, "learning_rate": 4.501235276773383e-05, "loss": 0.9795, "step": 321400 }, { "epoch": 4.989214606061546, "grad_norm": 2.3246071338653564, "learning_rate": 4.501080091249089e-05, "loss": 0.992, "step": 321500 }, { "epoch": 4.9907664613044895, "grad_norm": 2.4454610347747803, "learning_rate": 4.5009249057247945e-05, "loss": 1.0054, "step": 321600 }, { "epoch": 4.992318316547433, "grad_norm": 2.497229814529419, "learning_rate": 4.5007697202005e-05, "loss": 1.0186, "step": 321700 }, { "epoch": 4.993870171790375, "grad_norm": 2.51166033744812, "learning_rate": 4.500614534676206e-05, "loss": 0.9986, "step": 321800 }, { "epoch": 4.995422027033318, "grad_norm": 2.4567017555236816, "learning_rate": 4.500459349151912e-05, "loss": 0.9972, "step": 321900 }, { "epoch": 4.9969738822762615, "grad_norm": 2.2930448055267334, "learning_rate": 4.500304163627617e-05, "loss": 0.9825, "step": 322000 }, { "epoch": 4.998525737519204, "grad_norm": 2.1482913494110107, "learning_rate": 4.500148978103323e-05, "loss": 0.9983, "step": 322100 }, { "epoch": 5.000077592762147, "grad_norm": 2.2424023151397705, "learning_rate": 4.4999937925790285e-05, "loss": 0.9822, "step": 322200 }, { "epoch": 5.00162944800509, "grad_norm": 2.2823832035064697, "learning_rate": 4.499838607054734e-05, "loss": 0.9885, "step": 322300 }, { "epoch": 5.0031813032480335, "grad_norm": 2.415766716003418, "learning_rate": 4.49968342153044e-05, "loss": 0.9916, "step": 322400 }, { "epoch": 5.004733158490976, "grad_norm": 1.97372567653656, "learning_rate": 4.499528236006146e-05, "loss": 0.9873, "step": 322500 }, { "epoch": 5.006285013733919, "grad_norm": 1.8922631740570068, "learning_rate": 4.4993730504818516e-05, "loss": 0.9904, "step": 322600 }, { "epoch": 5.007836868976862, "grad_norm": 2.2126123905181885, "learning_rate": 4.4992178649575567e-05, "loss": 0.9724, "step": 322700 }, { "epoch": 5.0093887242198045, "grad_norm": 2.1099987030029297, "learning_rate": 4.4990626794332624e-05, "loss": 0.9924, "step": 322800 }, { "epoch": 5.010940579462748, "grad_norm": 2.4179370403289795, "learning_rate": 4.498907493908968e-05, "loss": 0.981, "step": 322900 }, { "epoch": 5.012492434705691, "grad_norm": 2.197739839553833, "learning_rate": 4.498752308384674e-05, "loss": 0.9809, "step": 323000 }, { "epoch": 5.014044289948633, "grad_norm": 2.29628324508667, "learning_rate": 4.49859712286038e-05, "loss": 0.9741, "step": 323100 }, { "epoch": 5.0155961451915765, "grad_norm": 2.441187620162964, "learning_rate": 4.4984419373360855e-05, "loss": 0.9729, "step": 323200 }, { "epoch": 5.01714800043452, "grad_norm": 1.7777624130249023, "learning_rate": 4.498286751811791e-05, "loss": 0.9646, "step": 323300 }, { "epoch": 5.018699855677462, "grad_norm": 2.440723180770874, "learning_rate": 4.498131566287497e-05, "loss": 0.9771, "step": 323400 }, { "epoch": 5.020251710920405, "grad_norm": 2.075007915496826, "learning_rate": 4.497976380763202e-05, "loss": 0.9746, "step": 323500 }, { "epoch": 5.021803566163348, "grad_norm": 1.9657238721847534, "learning_rate": 4.497821195238908e-05, "loss": 0.973, "step": 323600 }, { "epoch": 5.023355421406292, "grad_norm": 2.0091726779937744, "learning_rate": 4.497666009714614e-05, "loss": 0.9853, "step": 323700 }, { "epoch": 5.024907276649234, "grad_norm": 2.3106119632720947, "learning_rate": 4.4975108241903195e-05, "loss": 0.9977, "step": 323800 }, { "epoch": 5.026459131892177, "grad_norm": 2.1008052825927734, "learning_rate": 4.497355638666025e-05, "loss": 0.9742, "step": 323900 }, { "epoch": 5.02801098713512, "grad_norm": 2.4200737476348877, "learning_rate": 4.497200453141731e-05, "loss": 0.9889, "step": 324000 }, { "epoch": 5.029562842378063, "grad_norm": 2.3959174156188965, "learning_rate": 4.497045267617437e-05, "loss": 0.9626, "step": 324100 }, { "epoch": 5.031114697621006, "grad_norm": 2.0759968757629395, "learning_rate": 4.4968900820931426e-05, "loss": 0.9723, "step": 324200 }, { "epoch": 5.032666552863949, "grad_norm": 2.0733158588409424, "learning_rate": 4.4967348965688484e-05, "loss": 1.005, "step": 324300 }, { "epoch": 5.0342184081068915, "grad_norm": 2.3548550605773926, "learning_rate": 4.496579711044554e-05, "loss": 0.9703, "step": 324400 }, { "epoch": 5.035770263349835, "grad_norm": 2.2286078929901123, "learning_rate": 4.49642452552026e-05, "loss": 0.9645, "step": 324500 }, { "epoch": 5.037322118592778, "grad_norm": 2.5244295597076416, "learning_rate": 4.496269339995966e-05, "loss": 0.9677, "step": 324600 }, { "epoch": 5.03887397383572, "grad_norm": 2.2448983192443848, "learning_rate": 4.4961141544716715e-05, "loss": 0.9954, "step": 324700 }, { "epoch": 5.040425829078663, "grad_norm": 2.460890531539917, "learning_rate": 4.4959589689473766e-05, "loss": 0.9627, "step": 324800 }, { "epoch": 5.041977684321607, "grad_norm": 2.4765453338623047, "learning_rate": 4.4958037834230824e-05, "loss": 1.0006, "step": 324900 }, { "epoch": 5.04352953956455, "grad_norm": 2.5767853260040283, "learning_rate": 4.495648597898788e-05, "loss": 0.9935, "step": 325000 }, { "epoch": 5.045081394807492, "grad_norm": 3.2556047439575195, "learning_rate": 4.495493412374494e-05, "loss": 0.9818, "step": 325100 }, { "epoch": 5.046633250050435, "grad_norm": 2.1471121311187744, "learning_rate": 4.4953382268502e-05, "loss": 0.9956, "step": 325200 }, { "epoch": 5.048185105293379, "grad_norm": 2.146810531616211, "learning_rate": 4.4951830413259055e-05, "loss": 0.9869, "step": 325300 }, { "epoch": 5.049736960536321, "grad_norm": 1.9521751403808594, "learning_rate": 4.495027855801611e-05, "loss": 0.9755, "step": 325400 }, { "epoch": 5.051288815779264, "grad_norm": 2.013068675994873, "learning_rate": 4.494872670277317e-05, "loss": 0.985, "step": 325500 }, { "epoch": 5.052840671022207, "grad_norm": 2.042806386947632, "learning_rate": 4.494717484753023e-05, "loss": 0.9754, "step": 325600 }, { "epoch": 5.05439252626515, "grad_norm": 2.262981653213501, "learning_rate": 4.4945622992287286e-05, "loss": 0.9974, "step": 325700 }, { "epoch": 5.055944381508093, "grad_norm": 2.10441255569458, "learning_rate": 4.494407113704434e-05, "loss": 0.9566, "step": 325800 }, { "epoch": 5.057496236751036, "grad_norm": 2.24804425239563, "learning_rate": 4.49425192818014e-05, "loss": 0.9934, "step": 325900 }, { "epoch": 5.059048091993978, "grad_norm": 2.182786703109741, "learning_rate": 4.494096742655845e-05, "loss": 0.9887, "step": 326000 }, { "epoch": 5.060599947236922, "grad_norm": 2.216348171234131, "learning_rate": 4.493941557131551e-05, "loss": 0.9928, "step": 326100 }, { "epoch": 5.062151802479865, "grad_norm": 2.180921792984009, "learning_rate": 4.493786371607257e-05, "loss": 0.9664, "step": 326200 }, { "epoch": 5.063703657722808, "grad_norm": 2.5766446590423584, "learning_rate": 4.493631186082962e-05, "loss": 0.9823, "step": 326300 }, { "epoch": 5.06525551296575, "grad_norm": 2.099266290664673, "learning_rate": 4.4934760005586676e-05, "loss": 0.973, "step": 326400 }, { "epoch": 5.066807368208694, "grad_norm": 2.3833377361297607, "learning_rate": 4.4933208150343734e-05, "loss": 0.9837, "step": 326500 }, { "epoch": 5.068359223451637, "grad_norm": 2.3511757850646973, "learning_rate": 4.493165629510079e-05, "loss": 1.0028, "step": 326600 }, { "epoch": 5.069911078694579, "grad_norm": 2.3296661376953125, "learning_rate": 4.493010443985785e-05, "loss": 0.973, "step": 326700 }, { "epoch": 5.071462933937522, "grad_norm": 2.1355676651000977, "learning_rate": 4.492855258461491e-05, "loss": 0.9714, "step": 326800 }, { "epoch": 5.0730147891804656, "grad_norm": 2.3457136154174805, "learning_rate": 4.4927000729371965e-05, "loss": 0.9647, "step": 326900 }, { "epoch": 5.074566644423408, "grad_norm": 2.4499151706695557, "learning_rate": 4.492544887412902e-05, "loss": 0.9798, "step": 327000 }, { "epoch": 5.076118499666351, "grad_norm": 2.6530094146728516, "learning_rate": 4.492389701888608e-05, "loss": 1.0072, "step": 327100 }, { "epoch": 5.077670354909294, "grad_norm": 2.5040955543518066, "learning_rate": 4.492234516364314e-05, "loss": 1.0017, "step": 327200 }, { "epoch": 5.079222210152237, "grad_norm": 2.2727019786834717, "learning_rate": 4.4920793308400196e-05, "loss": 0.9797, "step": 327300 }, { "epoch": 5.08077406539518, "grad_norm": 2.231516122817993, "learning_rate": 4.4919241453157254e-05, "loss": 0.9766, "step": 327400 }, { "epoch": 5.082325920638123, "grad_norm": 2.1343374252319336, "learning_rate": 4.491768959791431e-05, "loss": 0.9679, "step": 327500 }, { "epoch": 5.083877775881065, "grad_norm": 1.9527649879455566, "learning_rate": 4.491613774267136e-05, "loss": 0.9785, "step": 327600 }, { "epoch": 5.085429631124009, "grad_norm": 2.5343494415283203, "learning_rate": 4.491458588742842e-05, "loss": 0.999, "step": 327700 }, { "epoch": 5.086981486366952, "grad_norm": 2.1045212745666504, "learning_rate": 4.491303403218548e-05, "loss": 1.0005, "step": 327800 }, { "epoch": 5.088533341609895, "grad_norm": 2.259221076965332, "learning_rate": 4.4911482176942536e-05, "loss": 0.9895, "step": 327900 }, { "epoch": 5.090085196852837, "grad_norm": 2.301227331161499, "learning_rate": 4.4909930321699594e-05, "loss": 0.9904, "step": 328000 }, { "epoch": 5.0916370520957805, "grad_norm": 2.032723903656006, "learning_rate": 4.490837846645665e-05, "loss": 0.9837, "step": 328100 }, { "epoch": 5.093188907338724, "grad_norm": 2.1664624214172363, "learning_rate": 4.490682661121371e-05, "loss": 1.0095, "step": 328200 }, { "epoch": 5.094740762581666, "grad_norm": 2.5637974739074707, "learning_rate": 4.490527475597077e-05, "loss": 1.0016, "step": 328300 }, { "epoch": 5.096292617824609, "grad_norm": 2.775192975997925, "learning_rate": 4.4903722900727825e-05, "loss": 1.0008, "step": 328400 }, { "epoch": 5.0978444730675525, "grad_norm": 2.0807552337646484, "learning_rate": 4.490217104548488e-05, "loss": 0.9837, "step": 328500 }, { "epoch": 5.099396328310495, "grad_norm": 2.2242581844329834, "learning_rate": 4.490061919024194e-05, "loss": 0.9981, "step": 328600 }, { "epoch": 5.100948183553438, "grad_norm": 1.8216568231582642, "learning_rate": 4.4899067334999e-05, "loss": 1.0021, "step": 328700 }, { "epoch": 5.102500038796381, "grad_norm": 2.2259035110473633, "learning_rate": 4.4897515479756056e-05, "loss": 1.0082, "step": 328800 }, { "epoch": 5.104051894039324, "grad_norm": 2.189532518386841, "learning_rate": 4.4895963624513107e-05, "loss": 1.008, "step": 328900 }, { "epoch": 5.105603749282267, "grad_norm": 2.4402806758880615, "learning_rate": 4.4894411769270164e-05, "loss": 1.0065, "step": 329000 }, { "epoch": 5.10715560452521, "grad_norm": 2.2367091178894043, "learning_rate": 4.489285991402722e-05, "loss": 0.9872, "step": 329100 }, { "epoch": 5.108707459768153, "grad_norm": 2.6189193725585938, "learning_rate": 4.489130805878427e-05, "loss": 0.9856, "step": 329200 }, { "epoch": 5.1102593150110955, "grad_norm": 2.663487195968628, "learning_rate": 4.488975620354133e-05, "loss": 0.975, "step": 329300 }, { "epoch": 5.111811170254039, "grad_norm": 2.0868868827819824, "learning_rate": 4.488820434829839e-05, "loss": 0.9888, "step": 329400 }, { "epoch": 5.113363025496982, "grad_norm": 2.2460601329803467, "learning_rate": 4.4886652493055446e-05, "loss": 1.0017, "step": 329500 }, { "epoch": 5.114914880739924, "grad_norm": 2.796231746673584, "learning_rate": 4.4885100637812504e-05, "loss": 0.973, "step": 329600 }, { "epoch": 5.1164667359828675, "grad_norm": 2.2700040340423584, "learning_rate": 4.488354878256956e-05, "loss": 0.9873, "step": 329700 }, { "epoch": 5.118018591225811, "grad_norm": 2.3378028869628906, "learning_rate": 4.488199692732662e-05, "loss": 0.9736, "step": 329800 }, { "epoch": 5.119570446468753, "grad_norm": 1.8950952291488647, "learning_rate": 4.488044507208368e-05, "loss": 0.9791, "step": 329900 }, { "epoch": 5.121122301711696, "grad_norm": 2.1657330989837646, "learning_rate": 4.4878893216840735e-05, "loss": 0.9752, "step": 330000 }, { "epoch": 5.1226741569546395, "grad_norm": 2.277442455291748, "learning_rate": 4.487734136159779e-05, "loss": 0.9884, "step": 330100 }, { "epoch": 5.124226012197582, "grad_norm": 2.029282331466675, "learning_rate": 4.487578950635485e-05, "loss": 0.99, "step": 330200 }, { "epoch": 5.125777867440525, "grad_norm": 2.328399896621704, "learning_rate": 4.487423765111191e-05, "loss": 0.985, "step": 330300 }, { "epoch": 5.127329722683468, "grad_norm": 2.042090892791748, "learning_rate": 4.4872685795868966e-05, "loss": 0.9752, "step": 330400 }, { "epoch": 5.128881577926411, "grad_norm": 2.092154026031494, "learning_rate": 4.487113394062602e-05, "loss": 0.9834, "step": 330500 }, { "epoch": 5.130433433169354, "grad_norm": 2.164201259613037, "learning_rate": 4.4869582085383075e-05, "loss": 1.006, "step": 330600 }, { "epoch": 5.131985288412297, "grad_norm": 2.257042646408081, "learning_rate": 4.486803023014013e-05, "loss": 1.0192, "step": 330700 }, { "epoch": 5.13353714365524, "grad_norm": 2.147996187210083, "learning_rate": 4.486647837489719e-05, "loss": 0.9702, "step": 330800 }, { "epoch": 5.1350889988981825, "grad_norm": 2.2125768661499023, "learning_rate": 4.486492651965425e-05, "loss": 0.9834, "step": 330900 }, { "epoch": 5.136640854141126, "grad_norm": 2.200608015060425, "learning_rate": 4.4863374664411306e-05, "loss": 0.9925, "step": 331000 }, { "epoch": 5.138192709384069, "grad_norm": 2.172070026397705, "learning_rate": 4.4861822809168364e-05, "loss": 0.9818, "step": 331100 }, { "epoch": 5.139744564627011, "grad_norm": 1.8601043224334717, "learning_rate": 4.486027095392542e-05, "loss": 0.9737, "step": 331200 }, { "epoch": 5.1412964198699544, "grad_norm": 2.188028335571289, "learning_rate": 4.485871909868248e-05, "loss": 0.9903, "step": 331300 }, { "epoch": 5.142848275112898, "grad_norm": 2.4077775478363037, "learning_rate": 4.485716724343954e-05, "loss": 0.9788, "step": 331400 }, { "epoch": 5.14440013035584, "grad_norm": 1.905815601348877, "learning_rate": 4.4855615388196595e-05, "loss": 0.9895, "step": 331500 }, { "epoch": 5.145951985598783, "grad_norm": 2.180434226989746, "learning_rate": 4.485406353295365e-05, "loss": 1.0077, "step": 331600 }, { "epoch": 5.147503840841726, "grad_norm": 2.543748617172241, "learning_rate": 4.485251167771071e-05, "loss": 0.9815, "step": 331700 }, { "epoch": 5.14905569608467, "grad_norm": 1.920924186706543, "learning_rate": 4.485095982246776e-05, "loss": 1.0053, "step": 331800 }, { "epoch": 5.150607551327612, "grad_norm": 2.1416079998016357, "learning_rate": 4.484940796722482e-05, "loss": 1.0021, "step": 331900 }, { "epoch": 5.152159406570555, "grad_norm": 2.3306236267089844, "learning_rate": 4.4847856111981877e-05, "loss": 0.9761, "step": 332000 }, { "epoch": 5.153711261813498, "grad_norm": 2.3076112270355225, "learning_rate": 4.4846304256738934e-05, "loss": 0.9958, "step": 332100 }, { "epoch": 5.155263117056441, "grad_norm": 1.98060142993927, "learning_rate": 4.484475240149599e-05, "loss": 0.9937, "step": 332200 }, { "epoch": 5.156814972299384, "grad_norm": 2.002647876739502, "learning_rate": 4.484320054625305e-05, "loss": 0.9718, "step": 332300 }, { "epoch": 5.158366827542327, "grad_norm": 2.1483147144317627, "learning_rate": 4.484164869101011e-05, "loss": 0.966, "step": 332400 }, { "epoch": 5.159918682785269, "grad_norm": 2.1069324016571045, "learning_rate": 4.484009683576716e-05, "loss": 0.9874, "step": 332500 }, { "epoch": 5.161470538028213, "grad_norm": 2.287614345550537, "learning_rate": 4.4838544980524216e-05, "loss": 0.9804, "step": 332600 }, { "epoch": 5.163022393271156, "grad_norm": 2.2705307006835938, "learning_rate": 4.4836993125281274e-05, "loss": 0.9806, "step": 332700 }, { "epoch": 5.164574248514098, "grad_norm": 2.164897918701172, "learning_rate": 4.483544127003833e-05, "loss": 1.0, "step": 332800 }, { "epoch": 5.166126103757041, "grad_norm": 2.4844210147857666, "learning_rate": 4.483388941479539e-05, "loss": 0.956, "step": 332900 }, { "epoch": 5.167677958999985, "grad_norm": 2.5571064949035645, "learning_rate": 4.483233755955245e-05, "loss": 0.9946, "step": 333000 }, { "epoch": 5.169229814242928, "grad_norm": 1.5777077674865723, "learning_rate": 4.4830785704309505e-05, "loss": 0.9976, "step": 333100 }, { "epoch": 5.17078166948587, "grad_norm": 2.184669017791748, "learning_rate": 4.482923384906656e-05, "loss": 0.9631, "step": 333200 }, { "epoch": 5.172333524728813, "grad_norm": 2.055509328842163, "learning_rate": 4.4827681993823614e-05, "loss": 0.9849, "step": 333300 }, { "epoch": 5.173885379971757, "grad_norm": 2.57855224609375, "learning_rate": 4.482613013858067e-05, "loss": 0.981, "step": 333400 }, { "epoch": 5.175437235214699, "grad_norm": 2.0482442378997803, "learning_rate": 4.482457828333773e-05, "loss": 0.9696, "step": 333500 }, { "epoch": 5.176989090457642, "grad_norm": 2.0093798637390137, "learning_rate": 4.482302642809479e-05, "loss": 1.0008, "step": 333600 }, { "epoch": 5.178540945700585, "grad_norm": 2.007098913192749, "learning_rate": 4.4821474572851845e-05, "loss": 0.9702, "step": 333700 }, { "epoch": 5.180092800943528, "grad_norm": 2.0783870220184326, "learning_rate": 4.48199227176089e-05, "loss": 0.9866, "step": 333800 }, { "epoch": 5.181644656186471, "grad_norm": 5.008114337921143, "learning_rate": 4.481837086236596e-05, "loss": 0.9755, "step": 333900 }, { "epoch": 5.183196511429414, "grad_norm": 2.196830987930298, "learning_rate": 4.481681900712302e-05, "loss": 1.0013, "step": 334000 }, { "epoch": 5.184748366672356, "grad_norm": 2.244760274887085, "learning_rate": 4.4815267151880076e-05, "loss": 0.9681, "step": 334100 }, { "epoch": 5.1863002219153, "grad_norm": 2.2426247596740723, "learning_rate": 4.4813715296637134e-05, "loss": 0.9665, "step": 334200 }, { "epoch": 5.187852077158243, "grad_norm": 2.5527238845825195, "learning_rate": 4.481216344139419e-05, "loss": 0.9829, "step": 334300 }, { "epoch": 5.189403932401186, "grad_norm": 2.1004483699798584, "learning_rate": 4.481061158615125e-05, "loss": 0.9783, "step": 334400 }, { "epoch": 5.190955787644128, "grad_norm": 2.1624903678894043, "learning_rate": 4.480905973090831e-05, "loss": 1.0078, "step": 334500 }, { "epoch": 5.192507642887072, "grad_norm": 2.4121296405792236, "learning_rate": 4.480750787566536e-05, "loss": 0.9972, "step": 334600 }, { "epoch": 5.194059498130015, "grad_norm": 2.3691892623901367, "learning_rate": 4.4805956020422416e-05, "loss": 1.0261, "step": 334700 }, { "epoch": 5.195611353372957, "grad_norm": 2.145981788635254, "learning_rate": 4.480440416517947e-05, "loss": 0.989, "step": 334800 }, { "epoch": 5.1971632086159, "grad_norm": 2.2127740383148193, "learning_rate": 4.480285230993653e-05, "loss": 0.9894, "step": 334900 }, { "epoch": 5.1987150638588435, "grad_norm": 2.211792469024658, "learning_rate": 4.480130045469359e-05, "loss": 0.9869, "step": 335000 }, { "epoch": 5.200266919101786, "grad_norm": 1.8884594440460205, "learning_rate": 4.4799748599450647e-05, "loss": 0.9848, "step": 335100 }, { "epoch": 5.201818774344729, "grad_norm": 2.436602830886841, "learning_rate": 4.4798196744207704e-05, "loss": 0.9926, "step": 335200 }, { "epoch": 5.203370629587672, "grad_norm": 2.0790979862213135, "learning_rate": 4.479664488896476e-05, "loss": 0.9883, "step": 335300 }, { "epoch": 5.204922484830615, "grad_norm": 2.403123378753662, "learning_rate": 4.479509303372182e-05, "loss": 0.9989, "step": 335400 }, { "epoch": 5.206474340073558, "grad_norm": 2.6842238903045654, "learning_rate": 4.479354117847888e-05, "loss": 0.9815, "step": 335500 }, { "epoch": 5.208026195316501, "grad_norm": 2.16825008392334, "learning_rate": 4.4791989323235935e-05, "loss": 0.9827, "step": 335600 }, { "epoch": 5.209578050559444, "grad_norm": 2.311147689819336, "learning_rate": 4.4790437467992986e-05, "loss": 0.9878, "step": 335700 }, { "epoch": 5.2111299058023866, "grad_norm": 2.3077635765075684, "learning_rate": 4.4788885612750044e-05, "loss": 0.9787, "step": 335800 }, { "epoch": 5.21268176104533, "grad_norm": 2.1991968154907227, "learning_rate": 4.47873337575071e-05, "loss": 0.9862, "step": 335900 }, { "epoch": 5.214233616288273, "grad_norm": 2.2159273624420166, "learning_rate": 4.478578190226416e-05, "loss": 0.9948, "step": 336000 }, { "epoch": 5.215785471531215, "grad_norm": 2.11722469329834, "learning_rate": 4.478423004702121e-05, "loss": 0.9928, "step": 336100 }, { "epoch": 5.2173373267741585, "grad_norm": 2.129912853240967, "learning_rate": 4.478267819177827e-05, "loss": 0.9837, "step": 336200 }, { "epoch": 5.218889182017102, "grad_norm": 2.3540561199188232, "learning_rate": 4.4781126336535326e-05, "loss": 0.969, "step": 336300 }, { "epoch": 5.220441037260044, "grad_norm": 2.9386038780212402, "learning_rate": 4.4779574481292384e-05, "loss": 1.0009, "step": 336400 }, { "epoch": 5.221992892502987, "grad_norm": 2.277980089187622, "learning_rate": 4.477802262604944e-05, "loss": 0.9833, "step": 336500 }, { "epoch": 5.2235447477459305, "grad_norm": 2.2846670150756836, "learning_rate": 4.47764707708065e-05, "loss": 0.9867, "step": 336600 }, { "epoch": 5.225096602988873, "grad_norm": 2.242232322692871, "learning_rate": 4.477491891556356e-05, "loss": 1.0026, "step": 336700 }, { "epoch": 5.226648458231816, "grad_norm": 2.33859920501709, "learning_rate": 4.4773367060320615e-05, "loss": 0.9743, "step": 336800 }, { "epoch": 5.228200313474759, "grad_norm": 2.392976760864258, "learning_rate": 4.477181520507767e-05, "loss": 0.9821, "step": 336900 }, { "epoch": 5.229752168717702, "grad_norm": 2.31386399269104, "learning_rate": 4.477026334983473e-05, "loss": 0.9777, "step": 337000 }, { "epoch": 5.231304023960645, "grad_norm": 1.855766773223877, "learning_rate": 4.476871149459179e-05, "loss": 0.9878, "step": 337100 }, { "epoch": 5.232855879203588, "grad_norm": 2.1748948097229004, "learning_rate": 4.4767159639348846e-05, "loss": 0.9918, "step": 337200 }, { "epoch": 5.234407734446531, "grad_norm": 2.3267738819122314, "learning_rate": 4.4765607784105904e-05, "loss": 0.9866, "step": 337300 }, { "epoch": 5.2359595896894735, "grad_norm": 2.455652952194214, "learning_rate": 4.476405592886296e-05, "loss": 0.9869, "step": 337400 }, { "epoch": 5.237511444932417, "grad_norm": 2.5350844860076904, "learning_rate": 4.476250407362001e-05, "loss": 0.9847, "step": 337500 }, { "epoch": 5.23906330017536, "grad_norm": 1.9691929817199707, "learning_rate": 4.476095221837707e-05, "loss": 1.0111, "step": 337600 }, { "epoch": 5.240615155418302, "grad_norm": 2.290410280227661, "learning_rate": 4.475940036313413e-05, "loss": 0.9729, "step": 337700 }, { "epoch": 5.2421670106612455, "grad_norm": 2.3395233154296875, "learning_rate": 4.4757848507891186e-05, "loss": 0.9821, "step": 337800 }, { "epoch": 5.243718865904189, "grad_norm": 2.4305055141448975, "learning_rate": 4.475629665264824e-05, "loss": 0.9782, "step": 337900 }, { "epoch": 5.245270721147131, "grad_norm": 2.369328022003174, "learning_rate": 4.47547447974053e-05, "loss": 0.988, "step": 338000 }, { "epoch": 5.246822576390074, "grad_norm": 2.1362602710723877, "learning_rate": 4.475319294216236e-05, "loss": 0.984, "step": 338100 }, { "epoch": 5.248374431633017, "grad_norm": 1.7460198402404785, "learning_rate": 4.4751641086919417e-05, "loss": 0.9447, "step": 338200 }, { "epoch": 5.249926286875961, "grad_norm": 1.9936199188232422, "learning_rate": 4.4750089231676474e-05, "loss": 0.9798, "step": 338300 }, { "epoch": 5.251478142118903, "grad_norm": 2.407552480697632, "learning_rate": 4.474853737643353e-05, "loss": 0.9769, "step": 338400 }, { "epoch": 5.253029997361846, "grad_norm": 1.9618628025054932, "learning_rate": 4.474698552119059e-05, "loss": 0.9798, "step": 338500 }, { "epoch": 5.254581852604789, "grad_norm": 2.495535373687744, "learning_rate": 4.474543366594765e-05, "loss": 0.9669, "step": 338600 }, { "epoch": 5.256133707847732, "grad_norm": 2.052490234375, "learning_rate": 4.4743881810704705e-05, "loss": 0.9913, "step": 338700 }, { "epoch": 5.257685563090675, "grad_norm": 2.4209463596343994, "learning_rate": 4.4742329955461756e-05, "loss": 0.9773, "step": 338800 }, { "epoch": 5.259237418333618, "grad_norm": 2.619950771331787, "learning_rate": 4.4740778100218814e-05, "loss": 0.9861, "step": 338900 }, { "epoch": 5.2607892735765605, "grad_norm": 2.135354518890381, "learning_rate": 4.4739226244975865e-05, "loss": 0.9784, "step": 339000 }, { "epoch": 5.262341128819504, "grad_norm": 2.1868016719818115, "learning_rate": 4.473767438973292e-05, "loss": 0.9905, "step": 339100 }, { "epoch": 5.263892984062447, "grad_norm": 2.497985601425171, "learning_rate": 4.473612253448998e-05, "loss": 0.9944, "step": 339200 }, { "epoch": 5.265444839305389, "grad_norm": 2.0293495655059814, "learning_rate": 4.473457067924704e-05, "loss": 0.9879, "step": 339300 }, { "epoch": 5.266996694548332, "grad_norm": 2.726874589920044, "learning_rate": 4.4733018824004096e-05, "loss": 0.9737, "step": 339400 }, { "epoch": 5.268548549791276, "grad_norm": 2.117095470428467, "learning_rate": 4.4731466968761154e-05, "loss": 0.9873, "step": 339500 }, { "epoch": 5.270100405034219, "grad_norm": 2.224587917327881, "learning_rate": 4.472991511351821e-05, "loss": 0.9789, "step": 339600 }, { "epoch": 5.271652260277161, "grad_norm": 2.0570099353790283, "learning_rate": 4.472836325827527e-05, "loss": 0.9839, "step": 339700 }, { "epoch": 5.273204115520104, "grad_norm": 2.1905441284179688, "learning_rate": 4.472681140303233e-05, "loss": 0.9822, "step": 339800 }, { "epoch": 5.274755970763048, "grad_norm": 2.166013240814209, "learning_rate": 4.4725259547789385e-05, "loss": 0.9869, "step": 339900 }, { "epoch": 5.27630782600599, "grad_norm": 2.6489365100860596, "learning_rate": 4.472370769254644e-05, "loss": 0.9815, "step": 340000 }, { "epoch": 5.277859681248933, "grad_norm": 2.2155096530914307, "learning_rate": 4.47221558373035e-05, "loss": 0.9828, "step": 340100 }, { "epoch": 5.279411536491876, "grad_norm": 2.6348648071289062, "learning_rate": 4.472060398206056e-05, "loss": 0.9677, "step": 340200 }, { "epoch": 5.280963391734819, "grad_norm": 2.451829671859741, "learning_rate": 4.471905212681761e-05, "loss": 0.9748, "step": 340300 }, { "epoch": 5.282515246977762, "grad_norm": 2.5996105670928955, "learning_rate": 4.471750027157467e-05, "loss": 0.9853, "step": 340400 }, { "epoch": 5.284067102220705, "grad_norm": 2.399988889694214, "learning_rate": 4.4715948416331725e-05, "loss": 0.9943, "step": 340500 }, { "epoch": 5.285618957463647, "grad_norm": 2.194737195968628, "learning_rate": 4.471439656108878e-05, "loss": 0.9532, "step": 340600 }, { "epoch": 5.287170812706591, "grad_norm": 2.5447442531585693, "learning_rate": 4.471284470584584e-05, "loss": 0.9994, "step": 340700 }, { "epoch": 5.288722667949534, "grad_norm": 2.354578733444214, "learning_rate": 4.47112928506029e-05, "loss": 0.9734, "step": 340800 }, { "epoch": 5.290274523192476, "grad_norm": 2.04677414894104, "learning_rate": 4.4709740995359956e-05, "loss": 0.9934, "step": 340900 }, { "epoch": 5.291826378435419, "grad_norm": 1.9037381410598755, "learning_rate": 4.470818914011701e-05, "loss": 0.9914, "step": 341000 }, { "epoch": 5.293378233678363, "grad_norm": 2.4629368782043457, "learning_rate": 4.470663728487407e-05, "loss": 0.9707, "step": 341100 }, { "epoch": 5.294930088921306, "grad_norm": 2.1286580562591553, "learning_rate": 4.470508542963113e-05, "loss": 0.9607, "step": 341200 }, { "epoch": 5.296481944164248, "grad_norm": 2.3212268352508545, "learning_rate": 4.4703533574388187e-05, "loss": 0.9821, "step": 341300 }, { "epoch": 5.298033799407191, "grad_norm": 2.7215399742126465, "learning_rate": 4.4701981719145244e-05, "loss": 0.9894, "step": 341400 }, { "epoch": 5.2995856546501345, "grad_norm": 2.2249298095703125, "learning_rate": 4.47004298639023e-05, "loss": 0.998, "step": 341500 }, { "epoch": 5.301137509893077, "grad_norm": 2.135056257247925, "learning_rate": 4.469887800865935e-05, "loss": 0.9642, "step": 341600 }, { "epoch": 5.30268936513602, "grad_norm": 2.1030986309051514, "learning_rate": 4.469732615341641e-05, "loss": 0.993, "step": 341700 }, { "epoch": 5.304241220378963, "grad_norm": 2.1474967002868652, "learning_rate": 4.469577429817347e-05, "loss": 0.9791, "step": 341800 }, { "epoch": 5.305793075621906, "grad_norm": 2.359597682952881, "learning_rate": 4.4694222442930526e-05, "loss": 0.9841, "step": 341900 }, { "epoch": 5.307344930864849, "grad_norm": 2.608402729034424, "learning_rate": 4.4692670587687584e-05, "loss": 0.9792, "step": 342000 }, { "epoch": 5.308896786107792, "grad_norm": 2.196988821029663, "learning_rate": 4.469111873244464e-05, "loss": 0.9683, "step": 342100 }, { "epoch": 5.310448641350735, "grad_norm": 2.070812225341797, "learning_rate": 4.468956687720169e-05, "loss": 0.9758, "step": 342200 }, { "epoch": 5.312000496593678, "grad_norm": 2.317857027053833, "learning_rate": 4.468801502195875e-05, "loss": 0.978, "step": 342300 }, { "epoch": 5.313552351836621, "grad_norm": 2.209301710128784, "learning_rate": 4.468646316671581e-05, "loss": 0.9912, "step": 342400 }, { "epoch": 5.315104207079564, "grad_norm": 2.183950424194336, "learning_rate": 4.4684911311472866e-05, "loss": 0.9834, "step": 342500 }, { "epoch": 5.316656062322506, "grad_norm": 2.5670201778411865, "learning_rate": 4.4683359456229924e-05, "loss": 0.9767, "step": 342600 }, { "epoch": 5.3182079175654495, "grad_norm": 2.189917802810669, "learning_rate": 4.468180760098698e-05, "loss": 0.9861, "step": 342700 }, { "epoch": 5.319759772808393, "grad_norm": 1.9095392227172852, "learning_rate": 4.468025574574404e-05, "loss": 0.9823, "step": 342800 }, { "epoch": 5.321311628051335, "grad_norm": 1.9903500080108643, "learning_rate": 4.46787038905011e-05, "loss": 0.9746, "step": 342900 }, { "epoch": 5.322863483294278, "grad_norm": 2.349290370941162, "learning_rate": 4.4677152035258155e-05, "loss": 0.9814, "step": 343000 }, { "epoch": 5.3244153385372215, "grad_norm": 2.532093048095703, "learning_rate": 4.4675600180015206e-05, "loss": 0.9785, "step": 343100 }, { "epoch": 5.325967193780164, "grad_norm": 1.9769519567489624, "learning_rate": 4.4674048324772264e-05, "loss": 0.9691, "step": 343200 }, { "epoch": 5.327519049023107, "grad_norm": 1.9752399921417236, "learning_rate": 4.467249646952932e-05, "loss": 0.9798, "step": 343300 }, { "epoch": 5.32907090426605, "grad_norm": 2.3799397945404053, "learning_rate": 4.467094461428638e-05, "loss": 0.9785, "step": 343400 }, { "epoch": 5.330622759508993, "grad_norm": 2.1321828365325928, "learning_rate": 4.466939275904344e-05, "loss": 0.9723, "step": 343500 }, { "epoch": 5.332174614751936, "grad_norm": 2.1118037700653076, "learning_rate": 4.4667840903800495e-05, "loss": 0.9844, "step": 343600 }, { "epoch": 5.333726469994879, "grad_norm": 2.0471959114074707, "learning_rate": 4.466628904855755e-05, "loss": 0.9906, "step": 343700 }, { "epoch": 5.335278325237822, "grad_norm": 1.8844975233078003, "learning_rate": 4.466473719331461e-05, "loss": 0.9718, "step": 343800 }, { "epoch": 5.3368301804807645, "grad_norm": 2.0539329051971436, "learning_rate": 4.466318533807167e-05, "loss": 0.9823, "step": 343900 }, { "epoch": 5.338382035723708, "grad_norm": 2.500180721282959, "learning_rate": 4.4661633482828726e-05, "loss": 0.9889, "step": 344000 }, { "epoch": 5.339933890966651, "grad_norm": 2.1466572284698486, "learning_rate": 4.466008162758578e-05, "loss": 0.9845, "step": 344100 }, { "epoch": 5.341485746209593, "grad_norm": 2.273542881011963, "learning_rate": 4.465852977234284e-05, "loss": 0.9841, "step": 344200 }, { "epoch": 5.3430376014525365, "grad_norm": 1.9769847393035889, "learning_rate": 4.46569779170999e-05, "loss": 0.9833, "step": 344300 }, { "epoch": 5.34458945669548, "grad_norm": 2.615112543106079, "learning_rate": 4.465542606185695e-05, "loss": 0.9932, "step": 344400 }, { "epoch": 5.346141311938422, "grad_norm": 2.338524341583252, "learning_rate": 4.465387420661401e-05, "loss": 0.9795, "step": 344500 }, { "epoch": 5.347693167181365, "grad_norm": 1.9836337566375732, "learning_rate": 4.4652322351371065e-05, "loss": 0.9838, "step": 344600 }, { "epoch": 5.349245022424308, "grad_norm": 2.2555506229400635, "learning_rate": 4.465077049612812e-05, "loss": 0.9708, "step": 344700 }, { "epoch": 5.350796877667252, "grad_norm": 2.740187168121338, "learning_rate": 4.464921864088518e-05, "loss": 0.9723, "step": 344800 }, { "epoch": 5.352348732910194, "grad_norm": 1.9778735637664795, "learning_rate": 4.464766678564224e-05, "loss": 0.9842, "step": 344900 }, { "epoch": 5.353900588153137, "grad_norm": 1.9008959531784058, "learning_rate": 4.4646114930399296e-05, "loss": 0.9826, "step": 345000 }, { "epoch": 5.35545244339608, "grad_norm": 2.404672861099243, "learning_rate": 4.4644563075156354e-05, "loss": 0.9902, "step": 345100 }, { "epoch": 5.357004298639023, "grad_norm": 2.307003974914551, "learning_rate": 4.464301121991341e-05, "loss": 0.9809, "step": 345200 }, { "epoch": 5.358556153881966, "grad_norm": 1.891075611114502, "learning_rate": 4.464145936467047e-05, "loss": 0.9686, "step": 345300 }, { "epoch": 5.360108009124909, "grad_norm": 2.324019432067871, "learning_rate": 4.463990750942753e-05, "loss": 0.9853, "step": 345400 }, { "epoch": 5.3616598643678515, "grad_norm": 2.3114638328552246, "learning_rate": 4.463835565418458e-05, "loss": 0.9648, "step": 345500 }, { "epoch": 5.363211719610795, "grad_norm": 1.9445880651474, "learning_rate": 4.4636803798941636e-05, "loss": 0.9707, "step": 345600 }, { "epoch": 5.364763574853738, "grad_norm": 2.129798173904419, "learning_rate": 4.4635251943698694e-05, "loss": 0.9935, "step": 345700 }, { "epoch": 5.36631543009668, "grad_norm": 1.9039580821990967, "learning_rate": 4.463370008845575e-05, "loss": 0.9712, "step": 345800 }, { "epoch": 5.367867285339623, "grad_norm": 2.0943281650543213, "learning_rate": 4.463214823321281e-05, "loss": 0.9799, "step": 345900 }, { "epoch": 5.369419140582567, "grad_norm": 2.4390809535980225, "learning_rate": 4.463059637796986e-05, "loss": 0.9815, "step": 346000 }, { "epoch": 5.370970995825509, "grad_norm": 2.434919834136963, "learning_rate": 4.462904452272692e-05, "loss": 0.9927, "step": 346100 }, { "epoch": 5.372522851068452, "grad_norm": 2.5707101821899414, "learning_rate": 4.4627492667483976e-05, "loss": 0.9515, "step": 346200 }, { "epoch": 5.374074706311395, "grad_norm": 2.235417604446411, "learning_rate": 4.4625940812241034e-05, "loss": 0.9654, "step": 346300 }, { "epoch": 5.375626561554339, "grad_norm": 2.514286518096924, "learning_rate": 4.462438895699809e-05, "loss": 0.9883, "step": 346400 }, { "epoch": 5.377178416797281, "grad_norm": 2.027215003967285, "learning_rate": 4.462283710175515e-05, "loss": 0.9851, "step": 346500 }, { "epoch": 5.378730272040224, "grad_norm": 2.1994903087615967, "learning_rate": 4.462128524651221e-05, "loss": 0.9754, "step": 346600 }, { "epoch": 5.380282127283167, "grad_norm": 2.5119285583496094, "learning_rate": 4.4619733391269265e-05, "loss": 0.9877, "step": 346700 }, { "epoch": 5.38183398252611, "grad_norm": 2.0300240516662598, "learning_rate": 4.461818153602632e-05, "loss": 0.9654, "step": 346800 }, { "epoch": 5.383385837769053, "grad_norm": 2.0062859058380127, "learning_rate": 4.461662968078338e-05, "loss": 0.9852, "step": 346900 }, { "epoch": 5.384937693011996, "grad_norm": 2.1682381629943848, "learning_rate": 4.461507782554044e-05, "loss": 0.9736, "step": 347000 }, { "epoch": 5.386489548254938, "grad_norm": 2.5994319915771484, "learning_rate": 4.4613525970297496e-05, "loss": 0.9849, "step": 347100 }, { "epoch": 5.388041403497882, "grad_norm": 2.322558641433716, "learning_rate": 4.461197411505455e-05, "loss": 0.9813, "step": 347200 }, { "epoch": 5.389593258740825, "grad_norm": 1.945400595664978, "learning_rate": 4.4610422259811604e-05, "loss": 0.9582, "step": 347300 }, { "epoch": 5.391145113983768, "grad_norm": 2.2319095134735107, "learning_rate": 4.460887040456866e-05, "loss": 0.9601, "step": 347400 }, { "epoch": 5.39269696922671, "grad_norm": 1.8645366430282593, "learning_rate": 4.460731854932572e-05, "loss": 0.9896, "step": 347500 }, { "epoch": 5.394248824469654, "grad_norm": 2.1026902198791504, "learning_rate": 4.460576669408278e-05, "loss": 0.9909, "step": 347600 }, { "epoch": 5.395800679712597, "grad_norm": 1.818767786026001, "learning_rate": 4.4604214838839835e-05, "loss": 0.9796, "step": 347700 }, { "epoch": 5.397352534955539, "grad_norm": 2.3829264640808105, "learning_rate": 4.460266298359689e-05, "loss": 0.9986, "step": 347800 }, { "epoch": 5.398904390198482, "grad_norm": 2.27321720123291, "learning_rate": 4.460111112835395e-05, "loss": 0.982, "step": 347900 }, { "epoch": 5.4004562454414256, "grad_norm": 2.143763303756714, "learning_rate": 4.459955927311101e-05, "loss": 0.9623, "step": 348000 }, { "epoch": 5.402008100684368, "grad_norm": 2.14553165435791, "learning_rate": 4.4598007417868066e-05, "loss": 0.958, "step": 348100 }, { "epoch": 5.403559955927311, "grad_norm": 1.955831527709961, "learning_rate": 4.4596455562625124e-05, "loss": 0.9947, "step": 348200 }, { "epoch": 5.405111811170254, "grad_norm": 2.215580940246582, "learning_rate": 4.459490370738218e-05, "loss": 0.9639, "step": 348300 }, { "epoch": 5.406663666413197, "grad_norm": 2.009143590927124, "learning_rate": 4.459335185213924e-05, "loss": 0.953, "step": 348400 }, { "epoch": 5.40821552165614, "grad_norm": 2.26324462890625, "learning_rate": 4.45917999968963e-05, "loss": 0.9666, "step": 348500 }, { "epoch": 5.409767376899083, "grad_norm": 1.974342703819275, "learning_rate": 4.459024814165335e-05, "loss": 0.9733, "step": 348600 }, { "epoch": 5.411319232142025, "grad_norm": 2.0890650749206543, "learning_rate": 4.4588696286410406e-05, "loss": 0.9782, "step": 348700 }, { "epoch": 5.412871087384969, "grad_norm": 1.9821761846542358, "learning_rate": 4.458714443116746e-05, "loss": 0.9828, "step": 348800 }, { "epoch": 5.414422942627912, "grad_norm": 1.9559098482131958, "learning_rate": 4.4585592575924515e-05, "loss": 0.9915, "step": 348900 }, { "epoch": 5.415974797870855, "grad_norm": 2.713721990585327, "learning_rate": 4.458404072068157e-05, "loss": 0.9794, "step": 349000 }, { "epoch": 5.417526653113797, "grad_norm": 2.756824254989624, "learning_rate": 4.458248886543863e-05, "loss": 0.9858, "step": 349100 }, { "epoch": 5.4190785083567405, "grad_norm": 1.7667917013168335, "learning_rate": 4.458093701019569e-05, "loss": 0.9872, "step": 349200 }, { "epoch": 5.420630363599684, "grad_norm": 2.0684385299682617, "learning_rate": 4.4579385154952746e-05, "loss": 1.0042, "step": 349300 }, { "epoch": 5.422182218842626, "grad_norm": 2.0621657371520996, "learning_rate": 4.4577833299709804e-05, "loss": 0.9711, "step": 349400 }, { "epoch": 5.423734074085569, "grad_norm": 2.46278715133667, "learning_rate": 4.457628144446686e-05, "loss": 0.9818, "step": 349500 }, { "epoch": 5.4252859293285125, "grad_norm": 1.9518847465515137, "learning_rate": 4.457472958922392e-05, "loss": 0.9832, "step": 349600 }, { "epoch": 5.426837784571455, "grad_norm": 2.389207601547241, "learning_rate": 4.457317773398098e-05, "loss": 0.9957, "step": 349700 }, { "epoch": 5.428389639814398, "grad_norm": 2.028789520263672, "learning_rate": 4.4571625878738035e-05, "loss": 0.979, "step": 349800 }, { "epoch": 5.429941495057341, "grad_norm": 1.9984185695648193, "learning_rate": 4.457007402349509e-05, "loss": 0.9547, "step": 349900 }, { "epoch": 5.431493350300284, "grad_norm": 2.103748083114624, "learning_rate": 4.456852216825215e-05, "loss": 0.9807, "step": 350000 }, { "epoch": 5.433045205543227, "grad_norm": 2.00935959815979, "learning_rate": 4.45669703130092e-05, "loss": 0.9786, "step": 350100 }, { "epoch": 5.43459706078617, "grad_norm": 2.1263985633850098, "learning_rate": 4.456541845776626e-05, "loss": 0.9679, "step": 350200 }, { "epoch": 5.436148916029113, "grad_norm": 2.2835018634796143, "learning_rate": 4.4563866602523317e-05, "loss": 0.9793, "step": 350300 }, { "epoch": 5.4377007712720555, "grad_norm": 2.894418478012085, "learning_rate": 4.4562314747280374e-05, "loss": 0.9889, "step": 350400 }, { "epoch": 5.439252626514999, "grad_norm": 2.039304733276367, "learning_rate": 4.456076289203743e-05, "loss": 0.9978, "step": 350500 }, { "epoch": 5.440804481757942, "grad_norm": 2.1859359741210938, "learning_rate": 4.455921103679449e-05, "loss": 0.9801, "step": 350600 }, { "epoch": 5.442356337000884, "grad_norm": 1.9913524389266968, "learning_rate": 4.455765918155155e-05, "loss": 0.9743, "step": 350700 }, { "epoch": 5.4439081922438275, "grad_norm": 1.6992727518081665, "learning_rate": 4.4556107326308605e-05, "loss": 1.0108, "step": 350800 }, { "epoch": 5.445460047486771, "grad_norm": 2.630546808242798, "learning_rate": 4.455455547106566e-05, "loss": 0.9704, "step": 350900 }, { "epoch": 5.447011902729713, "grad_norm": 2.0170834064483643, "learning_rate": 4.455300361582272e-05, "loss": 1.0016, "step": 351000 }, { "epoch": 5.448563757972656, "grad_norm": 2.492382049560547, "learning_rate": 4.455145176057978e-05, "loss": 0.984, "step": 351100 }, { "epoch": 5.4501156132155995, "grad_norm": 2.2098348140716553, "learning_rate": 4.4549899905336836e-05, "loss": 0.9668, "step": 351200 }, { "epoch": 5.451667468458542, "grad_norm": 2.287720203399658, "learning_rate": 4.4548348050093894e-05, "loss": 0.9897, "step": 351300 }, { "epoch": 5.453219323701485, "grad_norm": 2.221672534942627, "learning_rate": 4.4546796194850945e-05, "loss": 0.9638, "step": 351400 }, { "epoch": 5.454771178944428, "grad_norm": 2.2400195598602295, "learning_rate": 4.4545244339608e-05, "loss": 0.9848, "step": 351500 }, { "epoch": 5.456323034187371, "grad_norm": 2.104464530944824, "learning_rate": 4.454369248436506e-05, "loss": 0.9622, "step": 351600 }, { "epoch": 5.457874889430314, "grad_norm": 2.383382558822632, "learning_rate": 4.454214062912212e-05, "loss": 0.9837, "step": 351700 }, { "epoch": 5.459426744673257, "grad_norm": 2.198345422744751, "learning_rate": 4.4540588773879176e-05, "loss": 0.9813, "step": 351800 }, { "epoch": 5.4609785999162, "grad_norm": 2.3290531635284424, "learning_rate": 4.4539036918636234e-05, "loss": 0.9767, "step": 351900 }, { "epoch": 5.4625304551591425, "grad_norm": 2.2324042320251465, "learning_rate": 4.4537485063393285e-05, "loss": 0.9879, "step": 352000 }, { "epoch": 5.464082310402086, "grad_norm": 1.9537746906280518, "learning_rate": 4.453593320815034e-05, "loss": 0.9793, "step": 352100 }, { "epoch": 5.465634165645029, "grad_norm": 2.270908832550049, "learning_rate": 4.45343813529074e-05, "loss": 0.9824, "step": 352200 }, { "epoch": 5.467186020887971, "grad_norm": 2.5710361003875732, "learning_rate": 4.453282949766446e-05, "loss": 0.9831, "step": 352300 }, { "epoch": 5.468737876130914, "grad_norm": 2.1779885292053223, "learning_rate": 4.4531277642421516e-05, "loss": 0.9743, "step": 352400 }, { "epoch": 5.470289731373858, "grad_norm": 2.0371127128601074, "learning_rate": 4.4529725787178574e-05, "loss": 0.9924, "step": 352500 }, { "epoch": 5.4718415866168, "grad_norm": 1.9643186330795288, "learning_rate": 4.452817393193563e-05, "loss": 0.9823, "step": 352600 }, { "epoch": 5.473393441859743, "grad_norm": 2.0076684951782227, "learning_rate": 4.452662207669269e-05, "loss": 0.9836, "step": 352700 }, { "epoch": 5.474945297102686, "grad_norm": 2.449479579925537, "learning_rate": 4.452507022144975e-05, "loss": 0.9852, "step": 352800 }, { "epoch": 5.47649715234563, "grad_norm": 2.2750444412231445, "learning_rate": 4.45235183662068e-05, "loss": 0.9882, "step": 352900 }, { "epoch": 5.478049007588572, "grad_norm": 2.4435880184173584, "learning_rate": 4.4521966510963856e-05, "loss": 0.9808, "step": 353000 }, { "epoch": 5.479600862831515, "grad_norm": 2.2818408012390137, "learning_rate": 4.452041465572091e-05, "loss": 0.991, "step": 353100 }, { "epoch": 5.481152718074458, "grad_norm": 2.2061777114868164, "learning_rate": 4.451886280047797e-05, "loss": 0.9875, "step": 353200 }, { "epoch": 5.482704573317401, "grad_norm": 2.0868592262268066, "learning_rate": 4.451731094523503e-05, "loss": 0.9636, "step": 353300 }, { "epoch": 5.484256428560344, "grad_norm": 2.4877538681030273, "learning_rate": 4.4515759089992087e-05, "loss": 0.9684, "step": 353400 }, { "epoch": 5.485808283803287, "grad_norm": 1.973732829093933, "learning_rate": 4.4514207234749144e-05, "loss": 0.9731, "step": 353500 }, { "epoch": 5.487360139046229, "grad_norm": 2.477951765060425, "learning_rate": 4.45126553795062e-05, "loss": 0.9804, "step": 353600 }, { "epoch": 5.488911994289173, "grad_norm": 2.750450372695923, "learning_rate": 4.451110352426326e-05, "loss": 0.9763, "step": 353700 }, { "epoch": 5.490463849532116, "grad_norm": 2.3875765800476074, "learning_rate": 4.450955166902032e-05, "loss": 0.9838, "step": 353800 }, { "epoch": 5.492015704775058, "grad_norm": 2.0958762168884277, "learning_rate": 4.4507999813777375e-05, "loss": 0.9634, "step": 353900 }, { "epoch": 5.493567560018001, "grad_norm": 2.1959280967712402, "learning_rate": 4.450644795853443e-05, "loss": 0.9785, "step": 354000 }, { "epoch": 5.495119415260945, "grad_norm": 1.7072805166244507, "learning_rate": 4.450489610329149e-05, "loss": 0.9595, "step": 354100 }, { "epoch": 5.496671270503887, "grad_norm": 2.3442811965942383, "learning_rate": 4.450334424804855e-05, "loss": 0.9803, "step": 354200 }, { "epoch": 5.49822312574683, "grad_norm": 1.561077356338501, "learning_rate": 4.45017923928056e-05, "loss": 0.9713, "step": 354300 }, { "epoch": 5.499774980989773, "grad_norm": 2.1972949504852295, "learning_rate": 4.450024053756266e-05, "loss": 0.9754, "step": 354400 }, { "epoch": 5.501326836232717, "grad_norm": 1.9204434156417847, "learning_rate": 4.4498688682319715e-05, "loss": 0.9673, "step": 354500 }, { "epoch": 5.502878691475659, "grad_norm": 2.437697410583496, "learning_rate": 4.449713682707677e-05, "loss": 0.9978, "step": 354600 }, { "epoch": 5.504430546718602, "grad_norm": 2.310438632965088, "learning_rate": 4.449558497183383e-05, "loss": 0.9803, "step": 354700 }, { "epoch": 5.505982401961545, "grad_norm": 1.9296797513961792, "learning_rate": 4.449403311659089e-05, "loss": 0.9878, "step": 354800 }, { "epoch": 5.507534257204488, "grad_norm": 2.261770248413086, "learning_rate": 4.4492481261347946e-05, "loss": 0.986, "step": 354900 }, { "epoch": 5.509086112447431, "grad_norm": 1.9454330205917358, "learning_rate": 4.4490929406105004e-05, "loss": 0.9931, "step": 355000 }, { "epoch": 5.510637967690374, "grad_norm": 2.4823288917541504, "learning_rate": 4.448937755086206e-05, "loss": 0.9976, "step": 355100 }, { "epoch": 5.512189822933317, "grad_norm": 2.0919179916381836, "learning_rate": 4.448782569561912e-05, "loss": 0.9805, "step": 355200 }, { "epoch": 5.51374167817626, "grad_norm": 2.2404212951660156, "learning_rate": 4.448627384037617e-05, "loss": 0.9941, "step": 355300 }, { "epoch": 5.515293533419203, "grad_norm": 2.0819284915924072, "learning_rate": 4.448472198513323e-05, "loss": 0.9837, "step": 355400 }, { "epoch": 5.516845388662146, "grad_norm": 2.157360076904297, "learning_rate": 4.4483170129890286e-05, "loss": 0.9622, "step": 355500 }, { "epoch": 5.518397243905088, "grad_norm": 1.9269564151763916, "learning_rate": 4.4481618274647344e-05, "loss": 0.9624, "step": 355600 }, { "epoch": 5.519949099148032, "grad_norm": 2.549567461013794, "learning_rate": 4.44800664194044e-05, "loss": 0.989, "step": 355700 }, { "epoch": 5.521500954390975, "grad_norm": 2.4770264625549316, "learning_rate": 4.447851456416145e-05, "loss": 0.9737, "step": 355800 }, { "epoch": 5.523052809633917, "grad_norm": 1.941084623336792, "learning_rate": 4.447696270891851e-05, "loss": 0.9558, "step": 355900 }, { "epoch": 5.52460466487686, "grad_norm": 2.5409600734710693, "learning_rate": 4.447541085367557e-05, "loss": 0.9725, "step": 356000 }, { "epoch": 5.5261565201198035, "grad_norm": 2.013537883758545, "learning_rate": 4.4473858998432626e-05, "loss": 0.9894, "step": 356100 }, { "epoch": 5.527708375362746, "grad_norm": 2.299778699874878, "learning_rate": 4.447230714318968e-05, "loss": 0.9958, "step": 356200 }, { "epoch": 5.529260230605689, "grad_norm": 2.6637446880340576, "learning_rate": 4.447075528794674e-05, "loss": 0.9867, "step": 356300 }, { "epoch": 5.530812085848632, "grad_norm": 2.1968934535980225, "learning_rate": 4.44692034327038e-05, "loss": 0.9809, "step": 356400 }, { "epoch": 5.532363941091575, "grad_norm": 2.3023293018341064, "learning_rate": 4.4467651577460857e-05, "loss": 1.0052, "step": 356500 }, { "epoch": 5.533915796334518, "grad_norm": 1.9183952808380127, "learning_rate": 4.4466099722217914e-05, "loss": 0.9762, "step": 356600 }, { "epoch": 5.535467651577461, "grad_norm": 1.8733736276626587, "learning_rate": 4.446454786697497e-05, "loss": 0.9859, "step": 356700 }, { "epoch": 5.537019506820403, "grad_norm": 2.0433640480041504, "learning_rate": 4.446299601173203e-05, "loss": 0.9673, "step": 356800 }, { "epoch": 5.5385713620633465, "grad_norm": 2.3340017795562744, "learning_rate": 4.446144415648909e-05, "loss": 0.9684, "step": 356900 }, { "epoch": 5.54012321730629, "grad_norm": 2.0930838584899902, "learning_rate": 4.4459892301246145e-05, "loss": 0.9661, "step": 357000 }, { "epoch": 5.541675072549233, "grad_norm": 2.2251813411712646, "learning_rate": 4.4458340446003196e-05, "loss": 0.9914, "step": 357100 }, { "epoch": 5.543226927792175, "grad_norm": 2.4446849822998047, "learning_rate": 4.4456788590760254e-05, "loss": 0.9913, "step": 357200 }, { "epoch": 5.5447787830351185, "grad_norm": 2.2096974849700928, "learning_rate": 4.445523673551731e-05, "loss": 0.9937, "step": 357300 }, { "epoch": 5.546330638278062, "grad_norm": 2.4166781902313232, "learning_rate": 4.445368488027437e-05, "loss": 0.974, "step": 357400 }, { "epoch": 5.547882493521004, "grad_norm": 2.3222157955169678, "learning_rate": 4.445213302503143e-05, "loss": 0.9783, "step": 357500 }, { "epoch": 5.549434348763947, "grad_norm": 2.3722078800201416, "learning_rate": 4.4450581169788485e-05, "loss": 0.9772, "step": 357600 }, { "epoch": 5.5509862040068905, "grad_norm": 2.143782615661621, "learning_rate": 4.444902931454554e-05, "loss": 0.9675, "step": 357700 }, { "epoch": 5.552538059249833, "grad_norm": 2.000396490097046, "learning_rate": 4.44474774593026e-05, "loss": 0.9852, "step": 357800 }, { "epoch": 5.554089914492776, "grad_norm": 2.1700656414031982, "learning_rate": 4.444592560405966e-05, "loss": 0.9755, "step": 357900 }, { "epoch": 5.555641769735719, "grad_norm": 2.1937971115112305, "learning_rate": 4.4444373748816716e-05, "loss": 0.971, "step": 358000 }, { "epoch": 5.557193624978662, "grad_norm": 2.1302905082702637, "learning_rate": 4.4442821893573774e-05, "loss": 0.9704, "step": 358100 }, { "epoch": 5.558745480221605, "grad_norm": 2.1540932655334473, "learning_rate": 4.444127003833083e-05, "loss": 0.9635, "step": 358200 }, { "epoch": 5.560297335464548, "grad_norm": 2.2759764194488525, "learning_rate": 4.443971818308789e-05, "loss": 0.9748, "step": 358300 }, { "epoch": 5.561849190707491, "grad_norm": 2.4250035285949707, "learning_rate": 4.443816632784494e-05, "loss": 0.9848, "step": 358400 }, { "epoch": 5.5634010459504335, "grad_norm": 2.174387216567993, "learning_rate": 4.4436614472602e-05, "loss": 0.9906, "step": 358500 }, { "epoch": 5.564952901193377, "grad_norm": 2.1169731616973877, "learning_rate": 4.443506261735905e-05, "loss": 0.9907, "step": 358600 }, { "epoch": 5.56650475643632, "grad_norm": 2.1221742630004883, "learning_rate": 4.443351076211611e-05, "loss": 0.9863, "step": 358700 }, { "epoch": 5.568056611679262, "grad_norm": 2.291260242462158, "learning_rate": 4.4431958906873165e-05, "loss": 0.962, "step": 358800 }, { "epoch": 5.5696084669222055, "grad_norm": 2.050210952758789, "learning_rate": 4.443040705163022e-05, "loss": 0.9817, "step": 358900 }, { "epoch": 5.571160322165149, "grad_norm": 2.361135959625244, "learning_rate": 4.442885519638728e-05, "loss": 0.976, "step": 359000 }, { "epoch": 5.572712177408091, "grad_norm": 2.499438524246216, "learning_rate": 4.442730334114434e-05, "loss": 0.9792, "step": 359100 }, { "epoch": 5.574264032651034, "grad_norm": 2.4830102920532227, "learning_rate": 4.4425751485901396e-05, "loss": 0.9988, "step": 359200 }, { "epoch": 5.575815887893977, "grad_norm": 2.354757785797119, "learning_rate": 4.442419963065845e-05, "loss": 0.9965, "step": 359300 }, { "epoch": 5.57736774313692, "grad_norm": 1.9289546012878418, "learning_rate": 4.442264777541551e-05, "loss": 0.9778, "step": 359400 }, { "epoch": 5.578919598379863, "grad_norm": 2.037522554397583, "learning_rate": 4.442109592017257e-05, "loss": 0.9672, "step": 359500 }, { "epoch": 5.580471453622806, "grad_norm": 2.0955095291137695, "learning_rate": 4.4419544064929627e-05, "loss": 0.9784, "step": 359600 }, { "epoch": 5.582023308865749, "grad_norm": 1.8834164142608643, "learning_rate": 4.4417992209686684e-05, "loss": 0.9851, "step": 359700 }, { "epoch": 5.583575164108692, "grad_norm": 2.162356376647949, "learning_rate": 4.441644035444374e-05, "loss": 0.9835, "step": 359800 }, { "epoch": 5.585127019351635, "grad_norm": 2.5465216636657715, "learning_rate": 4.441488849920079e-05, "loss": 0.9587, "step": 359900 }, { "epoch": 5.586678874594578, "grad_norm": 1.9492729902267456, "learning_rate": 4.441333664395785e-05, "loss": 0.9641, "step": 360000 }, { "epoch": 5.5882307298375204, "grad_norm": 2.319061756134033, "learning_rate": 4.441178478871491e-05, "loss": 1.0023, "step": 360100 }, { "epoch": 5.589782585080464, "grad_norm": 2.4109034538269043, "learning_rate": 4.4410232933471966e-05, "loss": 0.9946, "step": 360200 }, { "epoch": 5.591334440323407, "grad_norm": 2.2688701152801514, "learning_rate": 4.4408681078229024e-05, "loss": 1.013, "step": 360300 }, { "epoch": 5.592886295566349, "grad_norm": 2.359365224838257, "learning_rate": 4.440712922298608e-05, "loss": 0.9804, "step": 360400 }, { "epoch": 5.594438150809292, "grad_norm": 2.4496281147003174, "learning_rate": 4.440557736774314e-05, "loss": 0.9953, "step": 360500 }, { "epoch": 5.595990006052236, "grad_norm": 2.2714340686798096, "learning_rate": 4.44040255125002e-05, "loss": 0.9713, "step": 360600 }, { "epoch": 5.597541861295179, "grad_norm": 1.8760069608688354, "learning_rate": 4.4402473657257255e-05, "loss": 0.9873, "step": 360700 }, { "epoch": 5.599093716538121, "grad_norm": 2.1799817085266113, "learning_rate": 4.440092180201431e-05, "loss": 0.9997, "step": 360800 }, { "epoch": 5.600645571781064, "grad_norm": 1.8943510055541992, "learning_rate": 4.439936994677137e-05, "loss": 0.9841, "step": 360900 }, { "epoch": 5.602197427024008, "grad_norm": 2.2320775985717773, "learning_rate": 4.439781809152843e-05, "loss": 0.9752, "step": 361000 }, { "epoch": 5.60374928226695, "grad_norm": 2.5199897289276123, "learning_rate": 4.4396266236285486e-05, "loss": 0.988, "step": 361100 }, { "epoch": 5.605301137509893, "grad_norm": 2.2784907817840576, "learning_rate": 4.439471438104254e-05, "loss": 0.9715, "step": 361200 }, { "epoch": 5.606852992752836, "grad_norm": 2.2994918823242188, "learning_rate": 4.4393162525799595e-05, "loss": 0.9844, "step": 361300 }, { "epoch": 5.608404847995779, "grad_norm": 2.062472105026245, "learning_rate": 4.439161067055665e-05, "loss": 0.9573, "step": 361400 }, { "epoch": 5.609956703238722, "grad_norm": 2.3974623680114746, "learning_rate": 4.439005881531371e-05, "loss": 0.9629, "step": 361500 }, { "epoch": 5.611508558481665, "grad_norm": 1.936026692390442, "learning_rate": 4.438850696007077e-05, "loss": 0.9653, "step": 361600 }, { "epoch": 5.613060413724607, "grad_norm": 2.03934907913208, "learning_rate": 4.4386955104827826e-05, "loss": 0.9722, "step": 361700 }, { "epoch": 5.614612268967551, "grad_norm": 1.9631593227386475, "learning_rate": 4.438540324958488e-05, "loss": 0.9788, "step": 361800 }, { "epoch": 5.616164124210494, "grad_norm": 2.4654765129089355, "learning_rate": 4.4383851394341935e-05, "loss": 0.9649, "step": 361900 }, { "epoch": 5.617715979453436, "grad_norm": 1.980949878692627, "learning_rate": 4.438229953909899e-05, "loss": 0.9955, "step": 362000 }, { "epoch": 5.619267834696379, "grad_norm": 2.2959792613983154, "learning_rate": 4.438074768385605e-05, "loss": 0.9915, "step": 362100 }, { "epoch": 5.620819689939323, "grad_norm": 1.780027151107788, "learning_rate": 4.437919582861311e-05, "loss": 0.9695, "step": 362200 }, { "epoch": 5.622371545182266, "grad_norm": 1.9942281246185303, "learning_rate": 4.4377643973370166e-05, "loss": 1.0067, "step": 362300 }, { "epoch": 5.623923400425208, "grad_norm": 2.60976243019104, "learning_rate": 4.437609211812722e-05, "loss": 0.9921, "step": 362400 }, { "epoch": 5.625475255668151, "grad_norm": 2.156275987625122, "learning_rate": 4.437454026288428e-05, "loss": 0.9991, "step": 362500 }, { "epoch": 5.6270271109110945, "grad_norm": 2.118537664413452, "learning_rate": 4.437298840764134e-05, "loss": 0.9871, "step": 362600 }, { "epoch": 5.628578966154037, "grad_norm": 2.250690221786499, "learning_rate": 4.4371436552398397e-05, "loss": 0.9847, "step": 362700 }, { "epoch": 5.63013082139698, "grad_norm": 1.9881632328033447, "learning_rate": 4.436988469715545e-05, "loss": 0.9814, "step": 362800 }, { "epoch": 5.631682676639923, "grad_norm": 2.4273569583892822, "learning_rate": 4.4368332841912505e-05, "loss": 0.9885, "step": 362900 }, { "epoch": 5.633234531882866, "grad_norm": 2.7762176990509033, "learning_rate": 4.436678098666956e-05, "loss": 0.9843, "step": 363000 }, { "epoch": 5.634786387125809, "grad_norm": 2.532621145248413, "learning_rate": 4.436522913142662e-05, "loss": 0.9717, "step": 363100 }, { "epoch": 5.636338242368752, "grad_norm": 2.637486696243286, "learning_rate": 4.436367727618368e-05, "loss": 0.986, "step": 363200 }, { "epoch": 5.637890097611695, "grad_norm": 2.3964123725891113, "learning_rate": 4.4362125420940736e-05, "loss": 0.9781, "step": 363300 }, { "epoch": 5.639441952854638, "grad_norm": 2.11099910736084, "learning_rate": 4.4360573565697794e-05, "loss": 0.9705, "step": 363400 }, { "epoch": 5.640993808097581, "grad_norm": 2.423428773880005, "learning_rate": 4.435902171045485e-05, "loss": 0.9805, "step": 363500 }, { "epoch": 5.642545663340524, "grad_norm": 2.279287338256836, "learning_rate": 4.435746985521191e-05, "loss": 0.9682, "step": 363600 }, { "epoch": 5.644097518583466, "grad_norm": 2.2667236328125, "learning_rate": 4.435591799996897e-05, "loss": 1.0055, "step": 363700 }, { "epoch": 5.6456493738264095, "grad_norm": 2.0388057231903076, "learning_rate": 4.4354366144726025e-05, "loss": 0.9951, "step": 363800 }, { "epoch": 5.647201229069353, "grad_norm": 2.438066005706787, "learning_rate": 4.435281428948308e-05, "loss": 0.9715, "step": 363900 }, { "epoch": 5.648753084312295, "grad_norm": 2.158094882965088, "learning_rate": 4.435126243424014e-05, "loss": 0.9785, "step": 364000 }, { "epoch": 5.650304939555238, "grad_norm": 2.2734158039093018, "learning_rate": 4.434971057899719e-05, "loss": 0.9798, "step": 364100 }, { "epoch": 5.6518567947981815, "grad_norm": 2.5884833335876465, "learning_rate": 4.434815872375425e-05, "loss": 0.968, "step": 364200 }, { "epoch": 5.653408650041124, "grad_norm": 2.0443115234375, "learning_rate": 4.434660686851131e-05, "loss": 0.9735, "step": 364300 }, { "epoch": 5.654960505284067, "grad_norm": 2.1035728454589844, "learning_rate": 4.4345055013268365e-05, "loss": 0.959, "step": 364400 }, { "epoch": 5.65651236052701, "grad_norm": 2.3837780952453613, "learning_rate": 4.434350315802542e-05, "loss": 0.9839, "step": 364500 }, { "epoch": 5.6580642157699526, "grad_norm": 2.4859204292297363, "learning_rate": 4.434195130278248e-05, "loss": 0.9855, "step": 364600 }, { "epoch": 5.659616071012896, "grad_norm": 2.1114308834075928, "learning_rate": 4.434039944753954e-05, "loss": 0.9803, "step": 364700 }, { "epoch": 5.661167926255839, "grad_norm": 2.7344343662261963, "learning_rate": 4.4338847592296596e-05, "loss": 0.9818, "step": 364800 }, { "epoch": 5.662719781498781, "grad_norm": 2.4254872798919678, "learning_rate": 4.4337295737053654e-05, "loss": 0.9571, "step": 364900 }, { "epoch": 5.6642716367417245, "grad_norm": 2.118703842163086, "learning_rate": 4.433574388181071e-05, "loss": 0.9859, "step": 365000 }, { "epoch": 5.665823491984668, "grad_norm": 2.064786911010742, "learning_rate": 4.433419202656776e-05, "loss": 0.9677, "step": 365100 }, { "epoch": 5.667375347227611, "grad_norm": 2.073610544204712, "learning_rate": 4.433264017132482e-05, "loss": 0.9748, "step": 365200 }, { "epoch": 5.668927202470553, "grad_norm": 1.9572569131851196, "learning_rate": 4.433108831608188e-05, "loss": 0.9938, "step": 365300 }, { "epoch": 5.6704790577134965, "grad_norm": 2.6718883514404297, "learning_rate": 4.4329536460838936e-05, "loss": 0.9724, "step": 365400 }, { "epoch": 5.67203091295644, "grad_norm": 2.709984064102173, "learning_rate": 4.432798460559599e-05, "loss": 0.9761, "step": 365500 }, { "epoch": 5.673582768199382, "grad_norm": 1.9509752988815308, "learning_rate": 4.4326432750353044e-05, "loss": 0.9813, "step": 365600 }, { "epoch": 5.675134623442325, "grad_norm": 1.991582989692688, "learning_rate": 4.43248808951101e-05, "loss": 0.973, "step": 365700 }, { "epoch": 5.676686478685268, "grad_norm": 2.279745101928711, "learning_rate": 4.432332903986716e-05, "loss": 0.9754, "step": 365800 }, { "epoch": 5.678238333928212, "grad_norm": 2.0734293460845947, "learning_rate": 4.432177718462422e-05, "loss": 0.9706, "step": 365900 }, { "epoch": 5.679790189171154, "grad_norm": 2.3504891395568848, "learning_rate": 4.4320225329381275e-05, "loss": 0.9653, "step": 366000 }, { "epoch": 5.681342044414097, "grad_norm": 2.5119516849517822, "learning_rate": 4.431867347413833e-05, "loss": 0.9888, "step": 366100 }, { "epoch": 5.68289389965704, "grad_norm": 1.8904026746749878, "learning_rate": 4.431712161889539e-05, "loss": 0.9722, "step": 366200 }, { "epoch": 5.684445754899983, "grad_norm": 2.247629404067993, "learning_rate": 4.431556976365245e-05, "loss": 0.9781, "step": 366300 }, { "epoch": 5.685997610142926, "grad_norm": 2.233731985092163, "learning_rate": 4.4314017908409506e-05, "loss": 0.9756, "step": 366400 }, { "epoch": 5.687549465385869, "grad_norm": 1.6566970348358154, "learning_rate": 4.4312466053166564e-05, "loss": 0.9755, "step": 366500 }, { "epoch": 5.6891013206288115, "grad_norm": 2.4865779876708984, "learning_rate": 4.431091419792362e-05, "loss": 0.9735, "step": 366600 }, { "epoch": 5.690653175871755, "grad_norm": 2.3704440593719482, "learning_rate": 4.430936234268068e-05, "loss": 0.9986, "step": 366700 }, { "epoch": 5.692205031114698, "grad_norm": 2.21870493888855, "learning_rate": 4.430781048743774e-05, "loss": 0.9656, "step": 366800 }, { "epoch": 5.69375688635764, "grad_norm": 1.976405143737793, "learning_rate": 4.430625863219479e-05, "loss": 0.9802, "step": 366900 }, { "epoch": 5.695308741600583, "grad_norm": 2.2283897399902344, "learning_rate": 4.4304706776951846e-05, "loss": 1.0038, "step": 367000 }, { "epoch": 5.696860596843527, "grad_norm": 2.641090154647827, "learning_rate": 4.4303154921708904e-05, "loss": 0.9895, "step": 367100 }, { "epoch": 5.698412452086469, "grad_norm": 2.4293911457061768, "learning_rate": 4.430160306646596e-05, "loss": 0.9811, "step": 367200 }, { "epoch": 5.699964307329412, "grad_norm": 1.9061189889907837, "learning_rate": 4.430005121122302e-05, "loss": 0.9638, "step": 367300 }, { "epoch": 5.701516162572355, "grad_norm": 2.30849552154541, "learning_rate": 4.429849935598008e-05, "loss": 0.9808, "step": 367400 }, { "epoch": 5.703068017815298, "grad_norm": 2.3032681941986084, "learning_rate": 4.4296947500737135e-05, "loss": 0.9599, "step": 367500 }, { "epoch": 5.704619873058241, "grad_norm": 1.7757885456085205, "learning_rate": 4.429539564549419e-05, "loss": 0.9832, "step": 367600 }, { "epoch": 5.706171728301184, "grad_norm": 2.3700315952301025, "learning_rate": 4.429384379025125e-05, "loss": 0.9867, "step": 367700 }, { "epoch": 5.707723583544127, "grad_norm": 2.00622296333313, "learning_rate": 4.429229193500831e-05, "loss": 0.9851, "step": 367800 }, { "epoch": 5.70927543878707, "grad_norm": 2.456026554107666, "learning_rate": 4.4290740079765366e-05, "loss": 0.9851, "step": 367900 }, { "epoch": 5.710827294030013, "grad_norm": 2.2019221782684326, "learning_rate": 4.4289188224522424e-05, "loss": 0.9873, "step": 368000 }, { "epoch": 5.712379149272956, "grad_norm": 2.2395827770233154, "learning_rate": 4.428763636927948e-05, "loss": 0.978, "step": 368100 }, { "epoch": 5.713931004515898, "grad_norm": 1.8059297800064087, "learning_rate": 4.428608451403653e-05, "loss": 0.9671, "step": 368200 }, { "epoch": 5.715482859758842, "grad_norm": 1.5375856161117554, "learning_rate": 4.428453265879359e-05, "loss": 0.9501, "step": 368300 }, { "epoch": 5.717034715001785, "grad_norm": 1.846731185913086, "learning_rate": 4.428298080355064e-05, "loss": 0.9916, "step": 368400 }, { "epoch": 5.718586570244728, "grad_norm": 2.528812885284424, "learning_rate": 4.42814289483077e-05, "loss": 0.9642, "step": 368500 }, { "epoch": 5.72013842548767, "grad_norm": 2.11910343170166, "learning_rate": 4.4279877093064757e-05, "loss": 0.977, "step": 368600 }, { "epoch": 5.721690280730614, "grad_norm": 2.360161781311035, "learning_rate": 4.4278325237821814e-05, "loss": 0.9816, "step": 368700 }, { "epoch": 5.723242135973557, "grad_norm": 2.2701621055603027, "learning_rate": 4.427677338257887e-05, "loss": 0.9824, "step": 368800 }, { "epoch": 5.724793991216499, "grad_norm": 2.580899715423584, "learning_rate": 4.427522152733593e-05, "loss": 0.9567, "step": 368900 }, { "epoch": 5.726345846459442, "grad_norm": 2.189096212387085, "learning_rate": 4.427366967209299e-05, "loss": 0.9943, "step": 369000 }, { "epoch": 5.7278977017023855, "grad_norm": 2.4974076747894287, "learning_rate": 4.4272117816850045e-05, "loss": 1.0001, "step": 369100 }, { "epoch": 5.729449556945328, "grad_norm": 2.4761641025543213, "learning_rate": 4.42705659616071e-05, "loss": 0.9907, "step": 369200 }, { "epoch": 5.731001412188271, "grad_norm": 1.7815697193145752, "learning_rate": 4.426901410636416e-05, "loss": 1.0, "step": 369300 }, { "epoch": 5.732553267431214, "grad_norm": 2.054739475250244, "learning_rate": 4.426746225112122e-05, "loss": 0.9767, "step": 369400 }, { "epoch": 5.734105122674157, "grad_norm": 1.956700086593628, "learning_rate": 4.4265910395878276e-05, "loss": 0.9688, "step": 369500 }, { "epoch": 5.7356569779171, "grad_norm": 2.280845880508423, "learning_rate": 4.4264358540635334e-05, "loss": 0.994, "step": 369600 }, { "epoch": 5.737208833160043, "grad_norm": 2.2923591136932373, "learning_rate": 4.4262806685392385e-05, "loss": 0.9911, "step": 369700 }, { "epoch": 5.738760688402985, "grad_norm": 2.2898237705230713, "learning_rate": 4.426125483014944e-05, "loss": 0.9652, "step": 369800 }, { "epoch": 5.740312543645929, "grad_norm": 2.1933066844940186, "learning_rate": 4.42597029749065e-05, "loss": 0.978, "step": 369900 }, { "epoch": 5.741864398888872, "grad_norm": 2.2684850692749023, "learning_rate": 4.425815111966356e-05, "loss": 0.9722, "step": 370000 }, { "epoch": 5.743416254131814, "grad_norm": 2.007585048675537, "learning_rate": 4.4256599264420616e-05, "loss": 0.9852, "step": 370100 }, { "epoch": 5.744968109374757, "grad_norm": 3.4612252712249756, "learning_rate": 4.4255047409177674e-05, "loss": 0.9962, "step": 370200 }, { "epoch": 5.7465199646177005, "grad_norm": 2.068007469177246, "learning_rate": 4.425349555393473e-05, "loss": 0.9944, "step": 370300 }, { "epoch": 5.748071819860644, "grad_norm": 2.2374267578125, "learning_rate": 4.425194369869179e-05, "loss": 0.9684, "step": 370400 }, { "epoch": 5.749623675103586, "grad_norm": 2.325625419616699, "learning_rate": 4.425039184344885e-05, "loss": 0.9633, "step": 370500 }, { "epoch": 5.751175530346529, "grad_norm": 2.280376672744751, "learning_rate": 4.4248839988205905e-05, "loss": 0.9766, "step": 370600 }, { "epoch": 5.7527273855894725, "grad_norm": 1.8664424419403076, "learning_rate": 4.424728813296296e-05, "loss": 0.9654, "step": 370700 }, { "epoch": 5.754279240832415, "grad_norm": 1.771437406539917, "learning_rate": 4.424573627772002e-05, "loss": 0.9706, "step": 370800 }, { "epoch": 5.755831096075358, "grad_norm": 2.1752688884735107, "learning_rate": 4.424418442247708e-05, "loss": 0.9998, "step": 370900 }, { "epoch": 5.757382951318301, "grad_norm": 2.3009305000305176, "learning_rate": 4.4242632567234136e-05, "loss": 0.9593, "step": 371000 }, { "epoch": 5.7589348065612445, "grad_norm": 2.323765516281128, "learning_rate": 4.424108071199119e-05, "loss": 0.9644, "step": 371100 }, { "epoch": 5.760486661804187, "grad_norm": 2.078720808029175, "learning_rate": 4.4239528856748245e-05, "loss": 0.9851, "step": 371200 }, { "epoch": 5.76203851704713, "grad_norm": 2.0161325931549072, "learning_rate": 4.42379770015053e-05, "loss": 0.9686, "step": 371300 }, { "epoch": 5.763590372290073, "grad_norm": 2.4879724979400635, "learning_rate": 4.423642514626236e-05, "loss": 0.9712, "step": 371400 }, { "epoch": 5.7651422275330155, "grad_norm": 2.339918851852417, "learning_rate": 4.423487329101942e-05, "loss": 0.9813, "step": 371500 }, { "epoch": 5.766694082775959, "grad_norm": 2.619237184524536, "learning_rate": 4.423332143577647e-05, "loss": 0.9816, "step": 371600 }, { "epoch": 5.768245938018902, "grad_norm": 2.4511849880218506, "learning_rate": 4.4231769580533527e-05, "loss": 0.9873, "step": 371700 }, { "epoch": 5.769797793261844, "grad_norm": 2.32576322555542, "learning_rate": 4.4230217725290584e-05, "loss": 0.973, "step": 371800 }, { "epoch": 5.7713496485047875, "grad_norm": 2.0943193435668945, "learning_rate": 4.422866587004764e-05, "loss": 0.984, "step": 371900 }, { "epoch": 5.772901503747731, "grad_norm": 2.5125226974487305, "learning_rate": 4.42271140148047e-05, "loss": 0.9834, "step": 372000 }, { "epoch": 5.774453358990673, "grad_norm": 1.7996180057525635, "learning_rate": 4.422556215956176e-05, "loss": 0.9573, "step": 372100 }, { "epoch": 5.776005214233616, "grad_norm": 2.6592886447906494, "learning_rate": 4.4224010304318815e-05, "loss": 0.9756, "step": 372200 }, { "epoch": 5.7775570694765594, "grad_norm": 2.104086399078369, "learning_rate": 4.422245844907587e-05, "loss": 0.9862, "step": 372300 }, { "epoch": 5.779108924719502, "grad_norm": 2.2881596088409424, "learning_rate": 4.422090659383293e-05, "loss": 0.9776, "step": 372400 }, { "epoch": 5.780660779962445, "grad_norm": 2.197622060775757, "learning_rate": 4.421935473858999e-05, "loss": 0.9878, "step": 372500 }, { "epoch": 5.782212635205388, "grad_norm": 2.0641188621520996, "learning_rate": 4.421780288334704e-05, "loss": 0.9907, "step": 372600 }, { "epoch": 5.7837644904483305, "grad_norm": 2.9450650215148926, "learning_rate": 4.42162510281041e-05, "loss": 0.9992, "step": 372700 }, { "epoch": 5.785316345691274, "grad_norm": 2.5186665058135986, "learning_rate": 4.4214699172861155e-05, "loss": 0.987, "step": 372800 }, { "epoch": 5.786868200934217, "grad_norm": 2.3204572200775146, "learning_rate": 4.421314731761821e-05, "loss": 0.9616, "step": 372900 }, { "epoch": 5.78842005617716, "grad_norm": 2.0976014137268066, "learning_rate": 4.421159546237527e-05, "loss": 0.9595, "step": 373000 }, { "epoch": 5.7899719114201025, "grad_norm": 2.344311237335205, "learning_rate": 4.421004360713233e-05, "loss": 0.9817, "step": 373100 }, { "epoch": 5.791523766663046, "grad_norm": 2.5947647094726562, "learning_rate": 4.4208491751889386e-05, "loss": 0.9839, "step": 373200 }, { "epoch": 5.793075621905989, "grad_norm": 2.466034173965454, "learning_rate": 4.4206939896646444e-05, "loss": 0.9705, "step": 373300 }, { "epoch": 5.794627477148931, "grad_norm": 2.138929843902588, "learning_rate": 4.42053880414035e-05, "loss": 0.9637, "step": 373400 }, { "epoch": 5.796179332391874, "grad_norm": 2.1223342418670654, "learning_rate": 4.420383618616056e-05, "loss": 0.9927, "step": 373500 }, { "epoch": 5.797731187634818, "grad_norm": 2.2264058589935303, "learning_rate": 4.420228433091762e-05, "loss": 0.9862, "step": 373600 }, { "epoch": 5.79928304287776, "grad_norm": 1.9103014469146729, "learning_rate": 4.4200732475674675e-05, "loss": 0.9733, "step": 373700 }, { "epoch": 5.800834898120703, "grad_norm": 2.2136528491973877, "learning_rate": 4.419918062043173e-05, "loss": 0.9758, "step": 373800 }, { "epoch": 5.802386753363646, "grad_norm": 1.8761736154556274, "learning_rate": 4.4197628765188784e-05, "loss": 0.9641, "step": 373900 }, { "epoch": 5.80393860860659, "grad_norm": 2.298898935317993, "learning_rate": 4.419607690994584e-05, "loss": 0.9767, "step": 374000 }, { "epoch": 5.805490463849532, "grad_norm": 2.324634075164795, "learning_rate": 4.41945250547029e-05, "loss": 0.9727, "step": 374100 }, { "epoch": 5.807042319092475, "grad_norm": 2.0672354698181152, "learning_rate": 4.419297319945996e-05, "loss": 0.9526, "step": 374200 }, { "epoch": 5.808594174335418, "grad_norm": 2.249347448348999, "learning_rate": 4.4191421344217015e-05, "loss": 0.9805, "step": 374300 }, { "epoch": 5.810146029578361, "grad_norm": 1.9111438989639282, "learning_rate": 4.418986948897407e-05, "loss": 0.9811, "step": 374400 }, { "epoch": 5.811697884821304, "grad_norm": 2.5725488662719727, "learning_rate": 4.418831763373113e-05, "loss": 0.9623, "step": 374500 }, { "epoch": 5.813249740064247, "grad_norm": 1.861810326576233, "learning_rate": 4.418676577848819e-05, "loss": 0.9737, "step": 374600 }, { "epoch": 5.814801595307189, "grad_norm": 2.182191848754883, "learning_rate": 4.4185213923245246e-05, "loss": 0.9779, "step": 374700 }, { "epoch": 5.816353450550133, "grad_norm": 2.236327886581421, "learning_rate": 4.4183662068002297e-05, "loss": 0.9689, "step": 374800 }, { "epoch": 5.817905305793076, "grad_norm": 1.841439962387085, "learning_rate": 4.4182110212759354e-05, "loss": 0.9819, "step": 374900 }, { "epoch": 5.819457161036018, "grad_norm": 2.1086463928222656, "learning_rate": 4.418055835751641e-05, "loss": 0.9807, "step": 375000 }, { "epoch": 5.821009016278961, "grad_norm": 2.0292575359344482, "learning_rate": 4.417900650227347e-05, "loss": 0.966, "step": 375100 }, { "epoch": 5.822560871521905, "grad_norm": 2.2531540393829346, "learning_rate": 4.417745464703053e-05, "loss": 0.9733, "step": 375200 }, { "epoch": 5.824112726764847, "grad_norm": 2.0251357555389404, "learning_rate": 4.4175902791787585e-05, "loss": 0.9733, "step": 375300 }, { "epoch": 5.82566458200779, "grad_norm": 2.3481085300445557, "learning_rate": 4.4174350936544636e-05, "loss": 0.9778, "step": 375400 }, { "epoch": 5.827216437250733, "grad_norm": 2.078108072280884, "learning_rate": 4.4172799081301694e-05, "loss": 0.9829, "step": 375500 }, { "epoch": 5.828768292493677, "grad_norm": 2.5529396533966064, "learning_rate": 4.417124722605875e-05, "loss": 0.9915, "step": 375600 }, { "epoch": 5.830320147736619, "grad_norm": 2.742966413497925, "learning_rate": 4.416969537081581e-05, "loss": 0.9655, "step": 375700 }, { "epoch": 5.831872002979562, "grad_norm": 2.233820915222168, "learning_rate": 4.416814351557287e-05, "loss": 1.0054, "step": 375800 }, { "epoch": 5.833423858222505, "grad_norm": 2.059288501739502, "learning_rate": 4.4166591660329925e-05, "loss": 0.9575, "step": 375900 }, { "epoch": 5.834975713465448, "grad_norm": 2.4791860580444336, "learning_rate": 4.416503980508698e-05, "loss": 0.9981, "step": 376000 }, { "epoch": 5.836527568708391, "grad_norm": 1.7062909603118896, "learning_rate": 4.416348794984404e-05, "loss": 0.9753, "step": 376100 }, { "epoch": 5.838079423951334, "grad_norm": 2.218681573867798, "learning_rate": 4.41619360946011e-05, "loss": 0.9926, "step": 376200 }, { "epoch": 5.839631279194276, "grad_norm": 2.3979766368865967, "learning_rate": 4.4160384239358156e-05, "loss": 0.9836, "step": 376300 }, { "epoch": 5.84118313443722, "grad_norm": 2.0802865028381348, "learning_rate": 4.4158832384115214e-05, "loss": 0.9766, "step": 376400 }, { "epoch": 5.842734989680163, "grad_norm": 2.42814302444458, "learning_rate": 4.415728052887227e-05, "loss": 0.9783, "step": 376500 }, { "epoch": 5.844286844923106, "grad_norm": 2.0447919368743896, "learning_rate": 4.415572867362933e-05, "loss": 0.9684, "step": 376600 }, { "epoch": 5.845838700166048, "grad_norm": 1.781868815422058, "learning_rate": 4.415417681838638e-05, "loss": 0.9806, "step": 376700 }, { "epoch": 5.8473905554089916, "grad_norm": 2.0194475650787354, "learning_rate": 4.415262496314344e-05, "loss": 0.9821, "step": 376800 }, { "epoch": 5.848942410651935, "grad_norm": 2.089266300201416, "learning_rate": 4.4151073107900496e-05, "loss": 0.9828, "step": 376900 }, { "epoch": 5.850494265894877, "grad_norm": 2.3462977409362793, "learning_rate": 4.4149521252657554e-05, "loss": 0.9866, "step": 377000 }, { "epoch": 5.85204612113782, "grad_norm": 2.3346357345581055, "learning_rate": 4.414796939741461e-05, "loss": 0.9712, "step": 377100 }, { "epoch": 5.8535979763807635, "grad_norm": 2.413771152496338, "learning_rate": 4.414641754217167e-05, "loss": 0.9934, "step": 377200 }, { "epoch": 5.855149831623706, "grad_norm": 1.955029845237732, "learning_rate": 4.414486568692873e-05, "loss": 0.9634, "step": 377300 }, { "epoch": 5.856701686866649, "grad_norm": 2.580462694168091, "learning_rate": 4.4143313831685785e-05, "loss": 0.9718, "step": 377400 }, { "epoch": 5.858253542109592, "grad_norm": 2.2274577617645264, "learning_rate": 4.414176197644284e-05, "loss": 0.9646, "step": 377500 }, { "epoch": 5.859805397352535, "grad_norm": 2.120365858078003, "learning_rate": 4.41402101211999e-05, "loss": 0.9967, "step": 377600 }, { "epoch": 5.861357252595478, "grad_norm": 2.3951311111450195, "learning_rate": 4.413865826595696e-05, "loss": 0.9752, "step": 377700 }, { "epoch": 5.862909107838421, "grad_norm": 2.3355135917663574, "learning_rate": 4.4137106410714016e-05, "loss": 0.9774, "step": 377800 }, { "epoch": 5.864460963081363, "grad_norm": 1.7190966606140137, "learning_rate": 4.413555455547107e-05, "loss": 0.9802, "step": 377900 }, { "epoch": 5.8660128183243065, "grad_norm": 1.934046983718872, "learning_rate": 4.4134002700228124e-05, "loss": 0.9496, "step": 378000 }, { "epoch": 5.86756467356725, "grad_norm": 2.399535894393921, "learning_rate": 4.413245084498518e-05, "loss": 0.9811, "step": 378100 }, { "epoch": 5.869116528810193, "grad_norm": 2.241931676864624, "learning_rate": 4.413089898974224e-05, "loss": 0.9815, "step": 378200 }, { "epoch": 5.870668384053135, "grad_norm": 2.0503292083740234, "learning_rate": 4.412934713449929e-05, "loss": 0.9734, "step": 378300 }, { "epoch": 5.8722202392960785, "grad_norm": 2.3869497776031494, "learning_rate": 4.412779527925635e-05, "loss": 0.9763, "step": 378400 }, { "epoch": 5.873772094539022, "grad_norm": 2.2284083366394043, "learning_rate": 4.4126243424013406e-05, "loss": 0.9717, "step": 378500 }, { "epoch": 5.875323949781964, "grad_norm": 2.028057813644409, "learning_rate": 4.4124691568770464e-05, "loss": 0.9794, "step": 378600 }, { "epoch": 5.876875805024907, "grad_norm": 1.99371337890625, "learning_rate": 4.412313971352752e-05, "loss": 0.9655, "step": 378700 }, { "epoch": 5.8784276602678505, "grad_norm": 2.3860819339752197, "learning_rate": 4.412158785828458e-05, "loss": 0.9851, "step": 378800 }, { "epoch": 5.879979515510793, "grad_norm": 2.0741422176361084, "learning_rate": 4.412003600304164e-05, "loss": 0.9856, "step": 378900 }, { "epoch": 5.881531370753736, "grad_norm": 2.468414783477783, "learning_rate": 4.4118484147798695e-05, "loss": 0.9746, "step": 379000 }, { "epoch": 5.883083225996679, "grad_norm": 2.512202262878418, "learning_rate": 4.411693229255575e-05, "loss": 0.9681, "step": 379100 }, { "epoch": 5.884635081239622, "grad_norm": 2.6756885051727295, "learning_rate": 4.411538043731281e-05, "loss": 0.9735, "step": 379200 }, { "epoch": 5.886186936482565, "grad_norm": 2.1895599365234375, "learning_rate": 4.411382858206987e-05, "loss": 0.9814, "step": 379300 }, { "epoch": 5.887738791725508, "grad_norm": 2.8431382179260254, "learning_rate": 4.4112276726826926e-05, "loss": 0.9715, "step": 379400 }, { "epoch": 5.889290646968451, "grad_norm": 2.049410581588745, "learning_rate": 4.4110724871583984e-05, "loss": 0.9832, "step": 379500 }, { "epoch": 5.8908425022113935, "grad_norm": 2.2480385303497314, "learning_rate": 4.4109173016341035e-05, "loss": 0.9764, "step": 379600 }, { "epoch": 5.892394357454337, "grad_norm": 2.210885524749756, "learning_rate": 4.410762116109809e-05, "loss": 0.9729, "step": 379700 }, { "epoch": 5.89394621269728, "grad_norm": 1.8914847373962402, "learning_rate": 4.410606930585515e-05, "loss": 0.9836, "step": 379800 }, { "epoch": 5.895498067940222, "grad_norm": 2.6204538345336914, "learning_rate": 4.410451745061221e-05, "loss": 0.9681, "step": 379900 }, { "epoch": 5.8970499231831655, "grad_norm": 2.115401268005371, "learning_rate": 4.4102965595369266e-05, "loss": 0.98, "step": 380000 }, { "epoch": 5.898601778426109, "grad_norm": 2.5905754566192627, "learning_rate": 4.4101413740126324e-05, "loss": 0.981, "step": 380100 }, { "epoch": 5.900153633669051, "grad_norm": 2.018876552581787, "learning_rate": 4.409986188488338e-05, "loss": 0.9667, "step": 380200 }, { "epoch": 5.901705488911994, "grad_norm": 2.142106533050537, "learning_rate": 4.409831002964044e-05, "loss": 0.9735, "step": 380300 }, { "epoch": 5.903257344154937, "grad_norm": 2.063349962234497, "learning_rate": 4.40967581743975e-05, "loss": 0.9653, "step": 380400 }, { "epoch": 5.90480919939788, "grad_norm": 2.105029344558716, "learning_rate": 4.4095206319154555e-05, "loss": 0.9664, "step": 380500 }, { "epoch": 5.906361054640823, "grad_norm": 1.9629491567611694, "learning_rate": 4.409365446391161e-05, "loss": 0.9836, "step": 380600 }, { "epoch": 5.907912909883766, "grad_norm": 2.533064126968384, "learning_rate": 4.409210260866867e-05, "loss": 0.9727, "step": 380700 }, { "epoch": 5.9094647651267085, "grad_norm": 2.4336819648742676, "learning_rate": 4.409055075342573e-05, "loss": 0.9861, "step": 380800 }, { "epoch": 5.911016620369652, "grad_norm": 2.2445414066314697, "learning_rate": 4.408899889818278e-05, "loss": 0.9875, "step": 380900 }, { "epoch": 5.912568475612595, "grad_norm": 1.9899882078170776, "learning_rate": 4.4087447042939837e-05, "loss": 0.997, "step": 381000 }, { "epoch": 5.914120330855538, "grad_norm": 2.161224603652954, "learning_rate": 4.4085895187696894e-05, "loss": 0.9705, "step": 381100 }, { "epoch": 5.9156721860984804, "grad_norm": 1.702340841293335, "learning_rate": 4.408434333245395e-05, "loss": 0.9906, "step": 381200 }, { "epoch": 5.917224041341424, "grad_norm": 2.1645522117614746, "learning_rate": 4.4082791477211e-05, "loss": 0.9715, "step": 381300 }, { "epoch": 5.918775896584367, "grad_norm": 1.9506837129592896, "learning_rate": 4.408123962196806e-05, "loss": 0.9542, "step": 381400 }, { "epoch": 5.920327751827309, "grad_norm": 2.4919285774230957, "learning_rate": 4.407968776672512e-05, "loss": 0.9472, "step": 381500 }, { "epoch": 5.921879607070252, "grad_norm": 2.1404054164886475, "learning_rate": 4.4078135911482176e-05, "loss": 0.98, "step": 381600 }, { "epoch": 5.923431462313196, "grad_norm": 2.1776788234710693, "learning_rate": 4.4076584056239234e-05, "loss": 0.9793, "step": 381700 }, { "epoch": 5.924983317556139, "grad_norm": 2.2884573936462402, "learning_rate": 4.407503220099629e-05, "loss": 0.9855, "step": 381800 }, { "epoch": 5.926535172799081, "grad_norm": 2.4969139099121094, "learning_rate": 4.407348034575335e-05, "loss": 0.9622, "step": 381900 }, { "epoch": 5.928087028042024, "grad_norm": 2.166203260421753, "learning_rate": 4.407192849051041e-05, "loss": 0.9908, "step": 382000 }, { "epoch": 5.929638883284968, "grad_norm": 2.4165701866149902, "learning_rate": 4.4070376635267465e-05, "loss": 0.9571, "step": 382100 }, { "epoch": 5.93119073852791, "grad_norm": 1.8546074628829956, "learning_rate": 4.406882478002452e-05, "loss": 0.9829, "step": 382200 }, { "epoch": 5.932742593770853, "grad_norm": 2.0221145153045654, "learning_rate": 4.406727292478158e-05, "loss": 0.9814, "step": 382300 }, { "epoch": 5.934294449013796, "grad_norm": 1.9089550971984863, "learning_rate": 4.406572106953863e-05, "loss": 1.0114, "step": 382400 }, { "epoch": 5.935846304256739, "grad_norm": 2.201709270477295, "learning_rate": 4.406416921429569e-05, "loss": 0.9635, "step": 382500 }, { "epoch": 5.937398159499682, "grad_norm": 1.7766344547271729, "learning_rate": 4.406261735905275e-05, "loss": 0.9902, "step": 382600 }, { "epoch": 5.938950014742625, "grad_norm": 2.4231464862823486, "learning_rate": 4.4061065503809805e-05, "loss": 0.9776, "step": 382700 }, { "epoch": 5.940501869985567, "grad_norm": 1.9196124076843262, "learning_rate": 4.405951364856686e-05, "loss": 0.955, "step": 382800 }, { "epoch": 5.942053725228511, "grad_norm": 2.0492331981658936, "learning_rate": 4.405796179332392e-05, "loss": 0.9819, "step": 382900 }, { "epoch": 5.943605580471454, "grad_norm": 2.2187132835388184, "learning_rate": 4.405640993808098e-05, "loss": 0.9615, "step": 383000 }, { "epoch": 5.945157435714396, "grad_norm": 2.631035327911377, "learning_rate": 4.4054858082838036e-05, "loss": 0.9849, "step": 383100 }, { "epoch": 5.946709290957339, "grad_norm": 2.206575632095337, "learning_rate": 4.4053306227595094e-05, "loss": 0.9638, "step": 383200 }, { "epoch": 5.948261146200283, "grad_norm": 2.0412869453430176, "learning_rate": 4.405175437235215e-05, "loss": 0.9709, "step": 383300 }, { "epoch": 5.949813001443225, "grad_norm": 2.055488109588623, "learning_rate": 4.405020251710921e-05, "loss": 0.9785, "step": 383400 }, { "epoch": 5.951364856686168, "grad_norm": 2.452118396759033, "learning_rate": 4.404865066186627e-05, "loss": 1.0, "step": 383500 }, { "epoch": 5.952916711929111, "grad_norm": 2.1359150409698486, "learning_rate": 4.4047098806623325e-05, "loss": 0.9879, "step": 383600 }, { "epoch": 5.9544685671720545, "grad_norm": 1.9746674299240112, "learning_rate": 4.4045546951380376e-05, "loss": 0.9708, "step": 383700 }, { "epoch": 5.956020422414997, "grad_norm": 2.2791800498962402, "learning_rate": 4.404399509613743e-05, "loss": 0.9781, "step": 383800 }, { "epoch": 5.95757227765794, "grad_norm": 1.9660913944244385, "learning_rate": 4.404244324089449e-05, "loss": 0.9687, "step": 383900 }, { "epoch": 5.959124132900883, "grad_norm": 2.287830114364624, "learning_rate": 4.404089138565155e-05, "loss": 0.9561, "step": 384000 }, { "epoch": 5.960675988143826, "grad_norm": 2.1479904651641846, "learning_rate": 4.4039339530408607e-05, "loss": 0.9654, "step": 384100 }, { "epoch": 5.962227843386769, "grad_norm": 2.3872644901275635, "learning_rate": 4.4037787675165664e-05, "loss": 0.9656, "step": 384200 }, { "epoch": 5.963779698629712, "grad_norm": 2.462561845779419, "learning_rate": 4.403623581992272e-05, "loss": 0.9917, "step": 384300 }, { "epoch": 5.965331553872655, "grad_norm": 2.225314140319824, "learning_rate": 4.403468396467978e-05, "loss": 0.986, "step": 384400 }, { "epoch": 5.966883409115598, "grad_norm": 2.2320430278778076, "learning_rate": 4.403313210943684e-05, "loss": 0.9701, "step": 384500 }, { "epoch": 5.968435264358541, "grad_norm": 2.099632740020752, "learning_rate": 4.403158025419389e-05, "loss": 0.9824, "step": 384600 }, { "epoch": 5.969987119601484, "grad_norm": 2.051234245300293, "learning_rate": 4.4030028398950946e-05, "loss": 0.9818, "step": 384700 }, { "epoch": 5.971538974844426, "grad_norm": 1.902363657951355, "learning_rate": 4.4028476543708004e-05, "loss": 0.9866, "step": 384800 }, { "epoch": 5.9730908300873695, "grad_norm": 2.5692336559295654, "learning_rate": 4.402692468846506e-05, "loss": 0.9729, "step": 384900 }, { "epoch": 5.974642685330313, "grad_norm": 2.3582212924957275, "learning_rate": 4.402537283322212e-05, "loss": 0.9598, "step": 385000 }, { "epoch": 5.976194540573255, "grad_norm": 1.9450781345367432, "learning_rate": 4.402382097797918e-05, "loss": 0.9704, "step": 385100 }, { "epoch": 5.977746395816198, "grad_norm": 2.6561570167541504, "learning_rate": 4.402226912273623e-05, "loss": 0.9858, "step": 385200 }, { "epoch": 5.9792982510591415, "grad_norm": 1.969224452972412, "learning_rate": 4.4020717267493286e-05, "loss": 0.9733, "step": 385300 }, { "epoch": 5.980850106302084, "grad_norm": 2.226437568664551, "learning_rate": 4.4019165412250344e-05, "loss": 0.9774, "step": 385400 }, { "epoch": 5.982401961545027, "grad_norm": 2.0637199878692627, "learning_rate": 4.40176135570074e-05, "loss": 0.976, "step": 385500 }, { "epoch": 5.98395381678797, "grad_norm": 2.163114309310913, "learning_rate": 4.401606170176446e-05, "loss": 0.972, "step": 385600 }, { "epoch": 5.9855056720309125, "grad_norm": 2.5656440258026123, "learning_rate": 4.401450984652152e-05, "loss": 0.9693, "step": 385700 }, { "epoch": 5.987057527273856, "grad_norm": 2.2277262210845947, "learning_rate": 4.4012957991278575e-05, "loss": 0.9789, "step": 385800 }, { "epoch": 5.988609382516799, "grad_norm": 2.257556200027466, "learning_rate": 4.401140613603563e-05, "loss": 0.984, "step": 385900 }, { "epoch": 5.990161237759741, "grad_norm": 2.409364700317383, "learning_rate": 4.400985428079269e-05, "loss": 0.9887, "step": 386000 }, { "epoch": 5.9917130930026845, "grad_norm": 1.9965325593948364, "learning_rate": 4.400830242554975e-05, "loss": 0.9736, "step": 386100 }, { "epoch": 5.993264948245628, "grad_norm": 2.191364288330078, "learning_rate": 4.4006750570306806e-05, "loss": 0.9752, "step": 386200 }, { "epoch": 5.994816803488571, "grad_norm": 2.4621355533599854, "learning_rate": 4.4005198715063864e-05, "loss": 0.9787, "step": 386300 }, { "epoch": 5.996368658731513, "grad_norm": 2.0752222537994385, "learning_rate": 4.400364685982092e-05, "loss": 0.9741, "step": 386400 }, { "epoch": 5.9979205139744565, "grad_norm": 2.313493490219116, "learning_rate": 4.400209500457797e-05, "loss": 0.9662, "step": 386500 }, { "epoch": 5.9994723692174, "grad_norm": 2.3873605728149414, "learning_rate": 4.400054314933503e-05, "loss": 0.9745, "step": 386600 }, { "epoch": 6.001024224460342, "grad_norm": 2.000333070755005, "learning_rate": 4.399899129409209e-05, "loss": 0.9511, "step": 386700 }, { "epoch": 6.002576079703285, "grad_norm": 2.271426200866699, "learning_rate": 4.3997439438849146e-05, "loss": 0.9531, "step": 386800 }, { "epoch": 6.004127934946228, "grad_norm": 2.0502657890319824, "learning_rate": 4.39958875836062e-05, "loss": 0.962, "step": 386900 }, { "epoch": 6.005679790189171, "grad_norm": 2.212423801422119, "learning_rate": 4.399433572836326e-05, "loss": 0.9593, "step": 387000 }, { "epoch": 6.007231645432114, "grad_norm": 2.1144490242004395, "learning_rate": 4.399278387312032e-05, "loss": 0.9611, "step": 387100 }, { "epoch": 6.008783500675057, "grad_norm": 2.4727673530578613, "learning_rate": 4.3991232017877377e-05, "loss": 0.9381, "step": 387200 }, { "epoch": 6.010335355918, "grad_norm": 1.871916651725769, "learning_rate": 4.3989680162634434e-05, "loss": 0.9543, "step": 387300 }, { "epoch": 6.011887211160943, "grad_norm": 2.118690252304077, "learning_rate": 4.398812830739149e-05, "loss": 0.9445, "step": 387400 }, { "epoch": 6.013439066403886, "grad_norm": 2.5680811405181885, "learning_rate": 4.398657645214855e-05, "loss": 0.9319, "step": 387500 }, { "epoch": 6.014990921646829, "grad_norm": 2.3992414474487305, "learning_rate": 4.398502459690561e-05, "loss": 0.9731, "step": 387600 }, { "epoch": 6.0165427768897715, "grad_norm": 2.5568087100982666, "learning_rate": 4.3983472741662665e-05, "loss": 0.9698, "step": 387700 }, { "epoch": 6.018094632132715, "grad_norm": 2.3998708724975586, "learning_rate": 4.3981920886419716e-05, "loss": 0.9699, "step": 387800 }, { "epoch": 6.019646487375658, "grad_norm": 2.113197088241577, "learning_rate": 4.3980369031176774e-05, "loss": 0.9643, "step": 387900 }, { "epoch": 6.0211983426186, "grad_norm": 2.3096909523010254, "learning_rate": 4.397881717593383e-05, "loss": 0.954, "step": 388000 }, { "epoch": 6.022750197861543, "grad_norm": 2.312952756881714, "learning_rate": 4.397726532069088e-05, "loss": 0.974, "step": 388100 }, { "epoch": 6.024302053104487, "grad_norm": 2.1558468341827393, "learning_rate": 4.397571346544794e-05, "loss": 0.9675, "step": 388200 }, { "epoch": 6.025853908347429, "grad_norm": 2.493319272994995, "learning_rate": 4.3974161610205e-05, "loss": 0.9674, "step": 388300 }, { "epoch": 6.027405763590372, "grad_norm": 2.3087003231048584, "learning_rate": 4.3972609754962056e-05, "loss": 0.9808, "step": 388400 }, { "epoch": 6.028957618833315, "grad_norm": 2.2011890411376953, "learning_rate": 4.3971057899719114e-05, "loss": 0.9704, "step": 388500 }, { "epoch": 6.030509474076259, "grad_norm": 2.37614369392395, "learning_rate": 4.396950604447617e-05, "loss": 0.9536, "step": 388600 }, { "epoch": 6.032061329319201, "grad_norm": 2.337226390838623, "learning_rate": 4.396795418923323e-05, "loss": 0.9571, "step": 388700 }, { "epoch": 6.033613184562144, "grad_norm": 2.3683433532714844, "learning_rate": 4.396640233399029e-05, "loss": 0.9488, "step": 388800 }, { "epoch": 6.035165039805087, "grad_norm": 1.9313045740127563, "learning_rate": 4.3964850478747345e-05, "loss": 0.9855, "step": 388900 }, { "epoch": 6.03671689504803, "grad_norm": 2.032167911529541, "learning_rate": 4.39632986235044e-05, "loss": 0.9681, "step": 389000 }, { "epoch": 6.038268750290973, "grad_norm": 2.1099116802215576, "learning_rate": 4.396174676826146e-05, "loss": 0.9495, "step": 389100 }, { "epoch": 6.039820605533916, "grad_norm": 2.557539224624634, "learning_rate": 4.396019491301852e-05, "loss": 0.9694, "step": 389200 }, { "epoch": 6.041372460776858, "grad_norm": 1.9979205131530762, "learning_rate": 4.3958643057775576e-05, "loss": 0.9522, "step": 389300 }, { "epoch": 6.042924316019802, "grad_norm": 1.9996343851089478, "learning_rate": 4.395709120253263e-05, "loss": 0.96, "step": 389400 }, { "epoch": 6.044476171262745, "grad_norm": 2.359251022338867, "learning_rate": 4.3955539347289684e-05, "loss": 0.9539, "step": 389500 }, { "epoch": 6.046028026505687, "grad_norm": 2.0301082134246826, "learning_rate": 4.395398749204674e-05, "loss": 0.9822, "step": 389600 }, { "epoch": 6.04757988174863, "grad_norm": 1.966744065284729, "learning_rate": 4.39524356368038e-05, "loss": 0.981, "step": 389700 }, { "epoch": 6.049131736991574, "grad_norm": 2.130514144897461, "learning_rate": 4.395088378156086e-05, "loss": 0.9745, "step": 389800 }, { "epoch": 6.050683592234517, "grad_norm": 2.057539701461792, "learning_rate": 4.3949331926317916e-05, "loss": 0.9573, "step": 389900 }, { "epoch": 6.052235447477459, "grad_norm": 2.3234894275665283, "learning_rate": 4.394778007107497e-05, "loss": 0.945, "step": 390000 }, { "epoch": 6.053787302720402, "grad_norm": 2.1992385387420654, "learning_rate": 4.394622821583203e-05, "loss": 0.9687, "step": 390100 }, { "epoch": 6.0553391579633455, "grad_norm": 1.8857234716415405, "learning_rate": 4.394467636058909e-05, "loss": 0.9773, "step": 390200 }, { "epoch": 6.056891013206288, "grad_norm": 2.4906578063964844, "learning_rate": 4.3943124505346147e-05, "loss": 0.9675, "step": 390300 }, { "epoch": 6.058442868449231, "grad_norm": 2.2270302772521973, "learning_rate": 4.3941572650103204e-05, "loss": 0.9451, "step": 390400 }, { "epoch": 6.059994723692174, "grad_norm": 2.8498144149780273, "learning_rate": 4.394002079486026e-05, "loss": 0.9737, "step": 390500 }, { "epoch": 6.061546578935117, "grad_norm": 2.224904775619507, "learning_rate": 4.393846893961732e-05, "loss": 0.9733, "step": 390600 }, { "epoch": 6.06309843417806, "grad_norm": 2.4438703060150146, "learning_rate": 4.393691708437437e-05, "loss": 0.9609, "step": 390700 }, { "epoch": 6.064650289421003, "grad_norm": 2.195847988128662, "learning_rate": 4.393536522913143e-05, "loss": 0.9534, "step": 390800 }, { "epoch": 6.066202144663945, "grad_norm": 2.295825242996216, "learning_rate": 4.3933813373888486e-05, "loss": 0.9491, "step": 390900 }, { "epoch": 6.067753999906889, "grad_norm": 2.180386543273926, "learning_rate": 4.3932261518645544e-05, "loss": 0.9645, "step": 391000 }, { "epoch": 6.069305855149832, "grad_norm": 2.033557415008545, "learning_rate": 4.3930709663402595e-05, "loss": 0.9631, "step": 391100 }, { "epoch": 6.070857710392775, "grad_norm": 2.0717227458953857, "learning_rate": 4.392915780815965e-05, "loss": 0.9841, "step": 391200 }, { "epoch": 6.072409565635717, "grad_norm": 2.347292184829712, "learning_rate": 4.392760595291671e-05, "loss": 0.9644, "step": 391300 }, { "epoch": 6.0739614208786605, "grad_norm": 2.1732914447784424, "learning_rate": 4.392605409767377e-05, "loss": 0.9853, "step": 391400 }, { "epoch": 6.075513276121604, "grad_norm": 2.3305435180664062, "learning_rate": 4.3924502242430826e-05, "loss": 0.9797, "step": 391500 }, { "epoch": 6.077065131364546, "grad_norm": 2.158534049987793, "learning_rate": 4.3922950387187884e-05, "loss": 0.991, "step": 391600 }, { "epoch": 6.078616986607489, "grad_norm": 2.0103869438171387, "learning_rate": 4.392139853194494e-05, "loss": 0.9674, "step": 391700 }, { "epoch": 6.0801688418504325, "grad_norm": 2.5109951496124268, "learning_rate": 4.3919846676702e-05, "loss": 0.9547, "step": 391800 }, { "epoch": 6.081720697093375, "grad_norm": 1.924285650253296, "learning_rate": 4.391829482145906e-05, "loss": 0.9509, "step": 391900 }, { "epoch": 6.083272552336318, "grad_norm": 2.2401115894317627, "learning_rate": 4.3916742966216115e-05, "loss": 0.9695, "step": 392000 }, { "epoch": 6.084824407579261, "grad_norm": 2.2518539428710938, "learning_rate": 4.391519111097317e-05, "loss": 0.973, "step": 392100 }, { "epoch": 6.086376262822204, "grad_norm": 2.5874836444854736, "learning_rate": 4.3913639255730223e-05, "loss": 0.9701, "step": 392200 }, { "epoch": 6.087928118065147, "grad_norm": 2.1396758556365967, "learning_rate": 4.391208740048728e-05, "loss": 0.9588, "step": 392300 }, { "epoch": 6.08947997330809, "grad_norm": 2.413935661315918, "learning_rate": 4.391053554524434e-05, "loss": 0.9735, "step": 392400 }, { "epoch": 6.091031828551033, "grad_norm": 2.398164987564087, "learning_rate": 4.39089836900014e-05, "loss": 0.9579, "step": 392500 }, { "epoch": 6.0925836837939755, "grad_norm": 2.415902853012085, "learning_rate": 4.3907431834758454e-05, "loss": 0.968, "step": 392600 }, { "epoch": 6.094135539036919, "grad_norm": 2.1242220401763916, "learning_rate": 4.390587997951551e-05, "loss": 0.9603, "step": 392700 }, { "epoch": 6.095687394279862, "grad_norm": 1.976881504058838, "learning_rate": 4.390432812427257e-05, "loss": 0.962, "step": 392800 }, { "epoch": 6.097239249522804, "grad_norm": 2.1509594917297363, "learning_rate": 4.390277626902963e-05, "loss": 0.9571, "step": 392900 }, { "epoch": 6.0987911047657475, "grad_norm": 1.9936940670013428, "learning_rate": 4.3901224413786686e-05, "loss": 0.9584, "step": 393000 }, { "epoch": 6.100342960008691, "grad_norm": 2.323820114135742, "learning_rate": 4.389967255854374e-05, "loss": 0.9719, "step": 393100 }, { "epoch": 6.101894815251633, "grad_norm": 2.1440021991729736, "learning_rate": 4.38981207033008e-05, "loss": 0.9842, "step": 393200 }, { "epoch": 6.103446670494576, "grad_norm": 2.216322422027588, "learning_rate": 4.389656884805786e-05, "loss": 0.9483, "step": 393300 }, { "epoch": 6.104998525737519, "grad_norm": 1.8636776208877563, "learning_rate": 4.3895016992814917e-05, "loss": 0.9722, "step": 393400 }, { "epoch": 6.106550380980462, "grad_norm": 2.2130370140075684, "learning_rate": 4.389346513757197e-05, "loss": 0.9786, "step": 393500 }, { "epoch": 6.108102236223405, "grad_norm": 2.094633102416992, "learning_rate": 4.3891913282329025e-05, "loss": 0.9726, "step": 393600 }, { "epoch": 6.109654091466348, "grad_norm": 2.696074962615967, "learning_rate": 4.389036142708608e-05, "loss": 0.9711, "step": 393700 }, { "epoch": 6.1112059467092905, "grad_norm": 2.100592613220215, "learning_rate": 4.388880957184314e-05, "loss": 0.9471, "step": 393800 }, { "epoch": 6.112757801952234, "grad_norm": 2.4279162883758545, "learning_rate": 4.38872577166002e-05, "loss": 0.9809, "step": 393900 }, { "epoch": 6.114309657195177, "grad_norm": 1.997194528579712, "learning_rate": 4.3885705861357256e-05, "loss": 0.9669, "step": 394000 }, { "epoch": 6.11586151243812, "grad_norm": 2.1823298931121826, "learning_rate": 4.3884154006114314e-05, "loss": 0.9617, "step": 394100 }, { "epoch": 6.1174133676810625, "grad_norm": 2.295586347579956, "learning_rate": 4.388260215087137e-05, "loss": 0.9735, "step": 394200 }, { "epoch": 6.118965222924006, "grad_norm": 2.3986918926239014, "learning_rate": 4.388105029562843e-05, "loss": 0.9708, "step": 394300 }, { "epoch": 6.120517078166949, "grad_norm": 2.3040828704833984, "learning_rate": 4.387949844038548e-05, "loss": 0.9671, "step": 394400 }, { "epoch": 6.122068933409891, "grad_norm": 1.8620610237121582, "learning_rate": 4.387794658514254e-05, "loss": 0.9661, "step": 394500 }, { "epoch": 6.123620788652834, "grad_norm": 2.3233370780944824, "learning_rate": 4.3876394729899596e-05, "loss": 0.9647, "step": 394600 }, { "epoch": 6.125172643895778, "grad_norm": 2.0119471549987793, "learning_rate": 4.3874842874656654e-05, "loss": 0.9744, "step": 394700 }, { "epoch": 6.12672449913872, "grad_norm": 2.443408966064453, "learning_rate": 4.387329101941371e-05, "loss": 0.9617, "step": 394800 }, { "epoch": 6.128276354381663, "grad_norm": 2.2621512413024902, "learning_rate": 4.387173916417077e-05, "loss": 0.9535, "step": 394900 }, { "epoch": 6.129828209624606, "grad_norm": 2.188690185546875, "learning_rate": 4.387018730892783e-05, "loss": 0.966, "step": 395000 }, { "epoch": 6.13138006486755, "grad_norm": 2.1281745433807373, "learning_rate": 4.386863545368488e-05, "loss": 0.9743, "step": 395100 }, { "epoch": 6.132931920110492, "grad_norm": 2.5643653869628906, "learning_rate": 4.3867083598441936e-05, "loss": 0.9816, "step": 395200 }, { "epoch": 6.134483775353435, "grad_norm": 2.4219088554382324, "learning_rate": 4.3865531743198993e-05, "loss": 0.9659, "step": 395300 }, { "epoch": 6.136035630596378, "grad_norm": 2.4001505374908447, "learning_rate": 4.386397988795605e-05, "loss": 0.965, "step": 395400 }, { "epoch": 6.137587485839321, "grad_norm": 2.4284939765930176, "learning_rate": 4.386242803271311e-05, "loss": 0.9867, "step": 395500 }, { "epoch": 6.139139341082264, "grad_norm": 2.2324256896972656, "learning_rate": 4.386087617747017e-05, "loss": 0.9635, "step": 395600 }, { "epoch": 6.140691196325207, "grad_norm": 2.049727201461792, "learning_rate": 4.3859324322227224e-05, "loss": 0.9631, "step": 395700 }, { "epoch": 6.142243051568149, "grad_norm": 2.241694688796997, "learning_rate": 4.385777246698428e-05, "loss": 0.9973, "step": 395800 }, { "epoch": 6.143794906811093, "grad_norm": 2.153416156768799, "learning_rate": 4.385622061174134e-05, "loss": 0.9601, "step": 395900 }, { "epoch": 6.145346762054036, "grad_norm": 2.5247418880462646, "learning_rate": 4.38546687564984e-05, "loss": 0.9885, "step": 396000 }, { "epoch": 6.146898617296978, "grad_norm": 2.599604368209839, "learning_rate": 4.3853116901255456e-05, "loss": 0.9429, "step": 396100 }, { "epoch": 6.148450472539921, "grad_norm": 1.9845662117004395, "learning_rate": 4.385156504601251e-05, "loss": 0.9642, "step": 396200 }, { "epoch": 6.150002327782865, "grad_norm": 2.245769500732422, "learning_rate": 4.385001319076957e-05, "loss": 0.9803, "step": 396300 }, { "epoch": 6.151554183025807, "grad_norm": 2.1591033935546875, "learning_rate": 4.384846133552662e-05, "loss": 0.9838, "step": 396400 }, { "epoch": 6.15310603826875, "grad_norm": 1.9319617748260498, "learning_rate": 4.384690948028368e-05, "loss": 0.9481, "step": 396500 }, { "epoch": 6.154657893511693, "grad_norm": 2.1349427700042725, "learning_rate": 4.384535762504074e-05, "loss": 0.9625, "step": 396600 }, { "epoch": 6.156209748754637, "grad_norm": 2.028501510620117, "learning_rate": 4.3843805769797795e-05, "loss": 0.9644, "step": 396700 }, { "epoch": 6.157761603997579, "grad_norm": 2.086404323577881, "learning_rate": 4.384225391455485e-05, "loss": 0.9894, "step": 396800 }, { "epoch": 6.159313459240522, "grad_norm": 2.086738348007202, "learning_rate": 4.384070205931191e-05, "loss": 0.969, "step": 396900 }, { "epoch": 6.160865314483465, "grad_norm": 2.644864320755005, "learning_rate": 4.383915020406897e-05, "loss": 0.9802, "step": 397000 }, { "epoch": 6.162417169726408, "grad_norm": 2.4811015129089355, "learning_rate": 4.3837598348826026e-05, "loss": 0.9568, "step": 397100 }, { "epoch": 6.163969024969351, "grad_norm": 2.1142282485961914, "learning_rate": 4.3836046493583084e-05, "loss": 0.9766, "step": 397200 }, { "epoch": 6.165520880212294, "grad_norm": 2.1053740978240967, "learning_rate": 4.383449463834014e-05, "loss": 0.9574, "step": 397300 }, { "epoch": 6.167072735455236, "grad_norm": 2.153528928756714, "learning_rate": 4.38329427830972e-05, "loss": 0.954, "step": 397400 }, { "epoch": 6.16862459069818, "grad_norm": 2.322955369949341, "learning_rate": 4.383139092785426e-05, "loss": 0.9665, "step": 397500 }, { "epoch": 6.170176445941123, "grad_norm": 2.08217716217041, "learning_rate": 4.382983907261131e-05, "loss": 0.98, "step": 397600 }, { "epoch": 6.171728301184065, "grad_norm": 2.3591394424438477, "learning_rate": 4.3828287217368366e-05, "loss": 0.9548, "step": 397700 }, { "epoch": 6.173280156427008, "grad_norm": 2.25370192527771, "learning_rate": 4.3826735362125424e-05, "loss": 0.9598, "step": 397800 }, { "epoch": 6.1748320116699515, "grad_norm": 1.865186095237732, "learning_rate": 4.3825183506882475e-05, "loss": 0.9732, "step": 397900 }, { "epoch": 6.176383866912895, "grad_norm": 2.2095091342926025, "learning_rate": 4.382363165163953e-05, "loss": 0.9627, "step": 398000 }, { "epoch": 6.177935722155837, "grad_norm": 1.9672092199325562, "learning_rate": 4.382207979639659e-05, "loss": 0.9669, "step": 398100 }, { "epoch": 6.17948757739878, "grad_norm": 2.8939859867095947, "learning_rate": 4.382052794115365e-05, "loss": 0.9858, "step": 398200 }, { "epoch": 6.1810394326417235, "grad_norm": 2.215050458908081, "learning_rate": 4.3818976085910706e-05, "loss": 0.9568, "step": 398300 }, { "epoch": 6.182591287884666, "grad_norm": 2.344846725463867, "learning_rate": 4.3817424230667763e-05, "loss": 0.9555, "step": 398400 }, { "epoch": 6.184143143127609, "grad_norm": 1.8797385692596436, "learning_rate": 4.381587237542482e-05, "loss": 0.984, "step": 398500 }, { "epoch": 6.185694998370552, "grad_norm": 2.302672863006592, "learning_rate": 4.381432052018188e-05, "loss": 0.9754, "step": 398600 }, { "epoch": 6.187246853613495, "grad_norm": 2.176729202270508, "learning_rate": 4.381276866493894e-05, "loss": 0.9676, "step": 398700 }, { "epoch": 6.188798708856438, "grad_norm": 2.451247215270996, "learning_rate": 4.3811216809695994e-05, "loss": 0.9502, "step": 398800 }, { "epoch": 6.190350564099381, "grad_norm": 2.0738155841827393, "learning_rate": 4.380966495445305e-05, "loss": 0.9857, "step": 398900 }, { "epoch": 6.191902419342323, "grad_norm": 2.0178303718566895, "learning_rate": 4.380811309921011e-05, "loss": 0.9761, "step": 399000 }, { "epoch": 6.1934542745852665, "grad_norm": 2.6101417541503906, "learning_rate": 4.380656124396717e-05, "loss": 0.9568, "step": 399100 }, { "epoch": 6.19500612982821, "grad_norm": 2.6814277172088623, "learning_rate": 4.380500938872422e-05, "loss": 0.9716, "step": 399200 }, { "epoch": 6.196557985071153, "grad_norm": 1.9939662218093872, "learning_rate": 4.3803457533481276e-05, "loss": 0.9692, "step": 399300 }, { "epoch": 6.198109840314095, "grad_norm": 2.08327054977417, "learning_rate": 4.3801905678238334e-05, "loss": 0.9518, "step": 399400 }, { "epoch": 6.1996616955570385, "grad_norm": 2.3259949684143066, "learning_rate": 4.380035382299539e-05, "loss": 0.9645, "step": 399500 }, { "epoch": 6.201213550799982, "grad_norm": 2.0609633922576904, "learning_rate": 4.379880196775245e-05, "loss": 0.9632, "step": 399600 }, { "epoch": 6.202765406042924, "grad_norm": 2.267864465713501, "learning_rate": 4.379725011250951e-05, "loss": 0.9697, "step": 399700 }, { "epoch": 6.204317261285867, "grad_norm": 2.342073440551758, "learning_rate": 4.3795698257266565e-05, "loss": 0.9816, "step": 399800 }, { "epoch": 6.2058691165288105, "grad_norm": 1.8497778177261353, "learning_rate": 4.379414640202362e-05, "loss": 0.9784, "step": 399900 }, { "epoch": 6.207420971771753, "grad_norm": 2.0065464973449707, "learning_rate": 4.379259454678068e-05, "loss": 0.9537, "step": 400000 }, { "epoch": 6.208972827014696, "grad_norm": 1.79105806350708, "learning_rate": 4.379104269153774e-05, "loss": 0.9547, "step": 400100 }, { "epoch": 6.210524682257639, "grad_norm": 2.1997838020324707, "learning_rate": 4.3789490836294796e-05, "loss": 0.9706, "step": 400200 }, { "epoch": 6.2120765375005815, "grad_norm": 3.2123444080352783, "learning_rate": 4.3787938981051854e-05, "loss": 0.9797, "step": 400300 }, { "epoch": 6.213628392743525, "grad_norm": 2.3421761989593506, "learning_rate": 4.378638712580891e-05, "loss": 0.944, "step": 400400 }, { "epoch": 6.215180247986468, "grad_norm": 2.1467018127441406, "learning_rate": 4.378483527056596e-05, "loss": 0.965, "step": 400500 }, { "epoch": 6.216732103229411, "grad_norm": 2.2525482177734375, "learning_rate": 4.378328341532302e-05, "loss": 0.9683, "step": 400600 }, { "epoch": 6.2182839584723535, "grad_norm": 2.026404857635498, "learning_rate": 4.378173156008008e-05, "loss": 0.9648, "step": 400700 }, { "epoch": 6.219835813715297, "grad_norm": 2.476609468460083, "learning_rate": 4.3780179704837136e-05, "loss": 0.9824, "step": 400800 }, { "epoch": 6.22138766895824, "grad_norm": 3.2324390411376953, "learning_rate": 4.377862784959419e-05, "loss": 0.9878, "step": 400900 }, { "epoch": 6.222939524201182, "grad_norm": 2.266871213912964, "learning_rate": 4.3777075994351245e-05, "loss": 0.9638, "step": 401000 }, { "epoch": 6.2244913794441254, "grad_norm": 2.1155683994293213, "learning_rate": 4.37755241391083e-05, "loss": 0.9689, "step": 401100 }, { "epoch": 6.226043234687069, "grad_norm": 2.209477663040161, "learning_rate": 4.377397228386536e-05, "loss": 0.9707, "step": 401200 }, { "epoch": 6.227595089930011, "grad_norm": 2.6555047035217285, "learning_rate": 4.377242042862242e-05, "loss": 0.9812, "step": 401300 }, { "epoch": 6.229146945172954, "grad_norm": 2.175626039505005, "learning_rate": 4.3770868573379476e-05, "loss": 0.9736, "step": 401400 }, { "epoch": 6.230698800415897, "grad_norm": 2.1035797595977783, "learning_rate": 4.3769316718136533e-05, "loss": 0.9798, "step": 401500 }, { "epoch": 6.23225065565884, "grad_norm": 2.43630313873291, "learning_rate": 4.376776486289359e-05, "loss": 0.9693, "step": 401600 }, { "epoch": 6.233802510901783, "grad_norm": 2.7505812644958496, "learning_rate": 4.376621300765065e-05, "loss": 0.982, "step": 401700 }, { "epoch": 6.235354366144726, "grad_norm": 2.1097216606140137, "learning_rate": 4.376466115240771e-05, "loss": 0.9735, "step": 401800 }, { "epoch": 6.236906221387669, "grad_norm": 2.1382088661193848, "learning_rate": 4.3763109297164764e-05, "loss": 0.9906, "step": 401900 }, { "epoch": 6.238458076630612, "grad_norm": 2.587602376937866, "learning_rate": 4.3761557441921815e-05, "loss": 0.9753, "step": 402000 }, { "epoch": 6.240009931873555, "grad_norm": 2.404689073562622, "learning_rate": 4.376000558667887e-05, "loss": 0.9768, "step": 402100 }, { "epoch": 6.241561787116498, "grad_norm": 2.5374016761779785, "learning_rate": 4.375845373143593e-05, "loss": 0.9573, "step": 402200 }, { "epoch": 6.24311364235944, "grad_norm": 2.259457588195801, "learning_rate": 4.375690187619299e-05, "loss": 0.9714, "step": 402300 }, { "epoch": 6.244665497602384, "grad_norm": 2.333061695098877, "learning_rate": 4.3755350020950046e-05, "loss": 0.9598, "step": 402400 }, { "epoch": 6.246217352845327, "grad_norm": 1.965182900428772, "learning_rate": 4.3753798165707104e-05, "loss": 0.9852, "step": 402500 }, { "epoch": 6.247769208088269, "grad_norm": 2.0629656314849854, "learning_rate": 4.375224631046416e-05, "loss": 0.9785, "step": 402600 }, { "epoch": 6.249321063331212, "grad_norm": 2.192657232284546, "learning_rate": 4.375069445522122e-05, "loss": 0.9797, "step": 402700 }, { "epoch": 6.250872918574156, "grad_norm": 2.3315765857696533, "learning_rate": 4.374914259997828e-05, "loss": 0.975, "step": 402800 }, { "epoch": 6.252424773817098, "grad_norm": 2.2813796997070312, "learning_rate": 4.3747590744735335e-05, "loss": 0.9629, "step": 402900 }, { "epoch": 6.253976629060041, "grad_norm": 2.268996000289917, "learning_rate": 4.374603888949239e-05, "loss": 0.9624, "step": 403000 }, { "epoch": 6.255528484302984, "grad_norm": 2.316983461380005, "learning_rate": 4.374448703424945e-05, "loss": 0.9691, "step": 403100 }, { "epoch": 6.257080339545928, "grad_norm": 2.075517416000366, "learning_rate": 4.374293517900651e-05, "loss": 0.9547, "step": 403200 }, { "epoch": 6.25863219478887, "grad_norm": 2.1352179050445557, "learning_rate": 4.374138332376356e-05, "loss": 0.9812, "step": 403300 }, { "epoch": 6.260184050031813, "grad_norm": 2.2294087409973145, "learning_rate": 4.373983146852062e-05, "loss": 0.9477, "step": 403400 }, { "epoch": 6.261735905274756, "grad_norm": 1.9272500276565552, "learning_rate": 4.3738279613277675e-05, "loss": 0.9628, "step": 403500 }, { "epoch": 6.263287760517699, "grad_norm": 2.7708661556243896, "learning_rate": 4.373672775803473e-05, "loss": 0.9695, "step": 403600 }, { "epoch": 6.264839615760642, "grad_norm": 2.3564934730529785, "learning_rate": 4.373517590279179e-05, "loss": 0.9656, "step": 403700 }, { "epoch": 6.266391471003585, "grad_norm": 1.877121090888977, "learning_rate": 4.373362404754885e-05, "loss": 0.9798, "step": 403800 }, { "epoch": 6.267943326246527, "grad_norm": 1.9194297790527344, "learning_rate": 4.3732072192305906e-05, "loss": 0.9925, "step": 403900 }, { "epoch": 6.269495181489471, "grad_norm": 2.463531255722046, "learning_rate": 4.3730520337062964e-05, "loss": 0.9706, "step": 404000 }, { "epoch": 6.271047036732414, "grad_norm": 2.00711989402771, "learning_rate": 4.3728968481820015e-05, "loss": 0.9541, "step": 404100 }, { "epoch": 6.272598891975356, "grad_norm": 2.0191845893859863, "learning_rate": 4.372741662657707e-05, "loss": 0.9604, "step": 404200 }, { "epoch": 6.274150747218299, "grad_norm": 1.917065978050232, "learning_rate": 4.372586477133413e-05, "loss": 0.9763, "step": 404300 }, { "epoch": 6.275702602461243, "grad_norm": 2.2994229793548584, "learning_rate": 4.372431291609119e-05, "loss": 0.9739, "step": 404400 }, { "epoch": 6.277254457704185, "grad_norm": 2.306817054748535, "learning_rate": 4.3722761060848246e-05, "loss": 0.9642, "step": 404500 }, { "epoch": 6.278806312947128, "grad_norm": 2.211359977722168, "learning_rate": 4.3721209205605303e-05, "loss": 0.9832, "step": 404600 }, { "epoch": 6.280358168190071, "grad_norm": 2.2913103103637695, "learning_rate": 4.371965735036236e-05, "loss": 0.9585, "step": 404700 }, { "epoch": 6.2819100234330145, "grad_norm": 2.4069390296936035, "learning_rate": 4.371810549511942e-05, "loss": 0.9838, "step": 404800 }, { "epoch": 6.283461878675957, "grad_norm": 1.8786569833755493, "learning_rate": 4.371655363987647e-05, "loss": 0.9647, "step": 404900 }, { "epoch": 6.2850137339189, "grad_norm": 2.4478113651275635, "learning_rate": 4.371500178463353e-05, "loss": 0.9512, "step": 405000 }, { "epoch": 6.286565589161843, "grad_norm": 2.4815709590911865, "learning_rate": 4.3713449929390585e-05, "loss": 0.9589, "step": 405100 }, { "epoch": 6.288117444404786, "grad_norm": 1.9261494874954224, "learning_rate": 4.371189807414764e-05, "loss": 0.9694, "step": 405200 }, { "epoch": 6.289669299647729, "grad_norm": 1.9647706747055054, "learning_rate": 4.37103462189047e-05, "loss": 0.9729, "step": 405300 }, { "epoch": 6.291221154890672, "grad_norm": 2.7827138900756836, "learning_rate": 4.370879436366176e-05, "loss": 0.9714, "step": 405400 }, { "epoch": 6.292773010133614, "grad_norm": 2.246864080429077, "learning_rate": 4.3707242508418816e-05, "loss": 0.978, "step": 405500 }, { "epoch": 6.2943248653765576, "grad_norm": 1.8927825689315796, "learning_rate": 4.3705690653175874e-05, "loss": 0.965, "step": 405600 }, { "epoch": 6.295876720619501, "grad_norm": 2.4421863555908203, "learning_rate": 4.370413879793293e-05, "loss": 0.9768, "step": 405700 }, { "epoch": 6.297428575862444, "grad_norm": 2.244401216506958, "learning_rate": 4.370258694268999e-05, "loss": 0.9602, "step": 405800 }, { "epoch": 6.298980431105386, "grad_norm": 2.523974895477295, "learning_rate": 4.370103508744705e-05, "loss": 0.9771, "step": 405900 }, { "epoch": 6.3005322863483295, "grad_norm": 2.228769063949585, "learning_rate": 4.3699483232204105e-05, "loss": 0.9668, "step": 406000 }, { "epoch": 6.302084141591273, "grad_norm": 2.35884690284729, "learning_rate": 4.369793137696116e-05, "loss": 0.9701, "step": 406100 }, { "epoch": 6.303635996834215, "grad_norm": 2.2261874675750732, "learning_rate": 4.3696379521718214e-05, "loss": 0.9796, "step": 406200 }, { "epoch": 6.305187852077158, "grad_norm": 2.264378070831299, "learning_rate": 4.369482766647527e-05, "loss": 0.9812, "step": 406300 }, { "epoch": 6.3067397073201015, "grad_norm": 2.2602195739746094, "learning_rate": 4.369327581123233e-05, "loss": 0.9765, "step": 406400 }, { "epoch": 6.308291562563044, "grad_norm": 2.4524857997894287, "learning_rate": 4.369172395598939e-05, "loss": 0.9625, "step": 406500 }, { "epoch": 6.309843417805987, "grad_norm": 2.772489070892334, "learning_rate": 4.3690172100746445e-05, "loss": 0.959, "step": 406600 }, { "epoch": 6.31139527304893, "grad_norm": 2.6323797702789307, "learning_rate": 4.36886202455035e-05, "loss": 0.9792, "step": 406700 }, { "epoch": 6.3129471282918725, "grad_norm": 1.8720982074737549, "learning_rate": 4.368706839026056e-05, "loss": 0.9622, "step": 406800 }, { "epoch": 6.314498983534816, "grad_norm": 1.8062094449996948, "learning_rate": 4.368551653501762e-05, "loss": 0.9811, "step": 406900 }, { "epoch": 6.316050838777759, "grad_norm": 1.9324839115142822, "learning_rate": 4.3683964679774676e-05, "loss": 0.9562, "step": 407000 }, { "epoch": 6.317602694020701, "grad_norm": 2.0377235412597656, "learning_rate": 4.3682412824531734e-05, "loss": 0.96, "step": 407100 }, { "epoch": 6.3191545492636445, "grad_norm": 2.7106664180755615, "learning_rate": 4.368086096928879e-05, "loss": 0.9773, "step": 407200 }, { "epoch": 6.320706404506588, "grad_norm": 3.068756341934204, "learning_rate": 4.367930911404585e-05, "loss": 0.9746, "step": 407300 }, { "epoch": 6.322258259749531, "grad_norm": 2.437727928161621, "learning_rate": 4.36777572588029e-05, "loss": 0.9823, "step": 407400 }, { "epoch": 6.323810114992473, "grad_norm": 2.0479278564453125, "learning_rate": 4.367620540355996e-05, "loss": 0.979, "step": 407500 }, { "epoch": 6.3253619702354165, "grad_norm": 1.9924557209014893, "learning_rate": 4.3674653548317016e-05, "loss": 0.9723, "step": 407600 }, { "epoch": 6.32691382547836, "grad_norm": 2.3464393615722656, "learning_rate": 4.367310169307407e-05, "loss": 0.9752, "step": 407700 }, { "epoch": 6.328465680721302, "grad_norm": 2.749913454055786, "learning_rate": 4.3671549837831124e-05, "loss": 0.9754, "step": 407800 }, { "epoch": 6.330017535964245, "grad_norm": 2.1107990741729736, "learning_rate": 4.366999798258818e-05, "loss": 0.9559, "step": 407900 }, { "epoch": 6.331569391207188, "grad_norm": 2.4505043029785156, "learning_rate": 4.366844612734524e-05, "loss": 0.9591, "step": 408000 }, { "epoch": 6.333121246450131, "grad_norm": 2.0991528034210205, "learning_rate": 4.36668942721023e-05, "loss": 0.9817, "step": 408100 }, { "epoch": 6.334673101693074, "grad_norm": 2.3653876781463623, "learning_rate": 4.3665342416859355e-05, "loss": 0.9517, "step": 408200 }, { "epoch": 6.336224956936017, "grad_norm": 2.2427282333374023, "learning_rate": 4.366379056161641e-05, "loss": 0.9762, "step": 408300 }, { "epoch": 6.33777681217896, "grad_norm": 2.1615517139434814, "learning_rate": 4.366223870637347e-05, "loss": 0.9494, "step": 408400 }, { "epoch": 6.339328667421903, "grad_norm": 2.4494051933288574, "learning_rate": 4.366068685113053e-05, "loss": 0.9653, "step": 408500 }, { "epoch": 6.340880522664846, "grad_norm": 1.9103201627731323, "learning_rate": 4.3659134995887586e-05, "loss": 0.9591, "step": 408600 }, { "epoch": 6.342432377907789, "grad_norm": 2.474471092224121, "learning_rate": 4.3657583140644644e-05, "loss": 0.9669, "step": 408700 }, { "epoch": 6.3439842331507315, "grad_norm": 1.961702823638916, "learning_rate": 4.36560312854017e-05, "loss": 0.9592, "step": 408800 }, { "epoch": 6.345536088393675, "grad_norm": 2.607668876647949, "learning_rate": 4.365447943015876e-05, "loss": 0.9961, "step": 408900 }, { "epoch": 6.347087943636618, "grad_norm": 1.994560956954956, "learning_rate": 4.365292757491581e-05, "loss": 0.9599, "step": 409000 }, { "epoch": 6.34863979887956, "grad_norm": 2.195416212081909, "learning_rate": 4.365137571967287e-05, "loss": 0.9673, "step": 409100 }, { "epoch": 6.350191654122503, "grad_norm": 2.1197633743286133, "learning_rate": 4.3649823864429926e-05, "loss": 0.9782, "step": 409200 }, { "epoch": 6.351743509365447, "grad_norm": 2.4562883377075195, "learning_rate": 4.3648272009186984e-05, "loss": 0.9688, "step": 409300 }, { "epoch": 6.353295364608389, "grad_norm": 2.2004032135009766, "learning_rate": 4.364672015394404e-05, "loss": 0.9737, "step": 409400 }, { "epoch": 6.354847219851332, "grad_norm": 2.0319318771362305, "learning_rate": 4.36451682987011e-05, "loss": 0.9703, "step": 409500 }, { "epoch": 6.356399075094275, "grad_norm": 2.3951470851898193, "learning_rate": 4.364361644345816e-05, "loss": 0.9604, "step": 409600 }, { "epoch": 6.357950930337218, "grad_norm": 2.9845194816589355, "learning_rate": 4.3642064588215215e-05, "loss": 0.9767, "step": 409700 }, { "epoch": 6.359502785580161, "grad_norm": 2.3163154125213623, "learning_rate": 4.364051273297227e-05, "loss": 0.9768, "step": 409800 }, { "epoch": 6.361054640823104, "grad_norm": 2.6817493438720703, "learning_rate": 4.363896087772933e-05, "loss": 0.9551, "step": 409900 }, { "epoch": 6.362606496066047, "grad_norm": 2.1725223064422607, "learning_rate": 4.363740902248639e-05, "loss": 0.9699, "step": 410000 }, { "epoch": 6.36415835130899, "grad_norm": 2.197974920272827, "learning_rate": 4.3635857167243446e-05, "loss": 0.9663, "step": 410100 }, { "epoch": 6.365710206551933, "grad_norm": 2.164172410964966, "learning_rate": 4.3634305312000504e-05, "loss": 0.9615, "step": 410200 }, { "epoch": 6.367262061794876, "grad_norm": 2.543231964111328, "learning_rate": 4.3632753456757555e-05, "loss": 0.9589, "step": 410300 }, { "epoch": 6.368813917037818, "grad_norm": 2.252706289291382, "learning_rate": 4.363120160151461e-05, "loss": 0.9395, "step": 410400 }, { "epoch": 6.370365772280762, "grad_norm": 2.0108518600463867, "learning_rate": 4.362964974627167e-05, "loss": 0.9963, "step": 410500 }, { "epoch": 6.371917627523705, "grad_norm": 2.201422929763794, "learning_rate": 4.362809789102873e-05, "loss": 0.9398, "step": 410600 }, { "epoch": 6.373469482766647, "grad_norm": 2.2797775268554688, "learning_rate": 4.362654603578578e-05, "loss": 0.9753, "step": 410700 }, { "epoch": 6.37502133800959, "grad_norm": 2.103529214859009, "learning_rate": 4.362499418054284e-05, "loss": 0.9501, "step": 410800 }, { "epoch": 6.376573193252534, "grad_norm": 2.6470530033111572, "learning_rate": 4.3623442325299894e-05, "loss": 0.9791, "step": 410900 }, { "epoch": 6.378125048495477, "grad_norm": 2.4140331745147705, "learning_rate": 4.362189047005695e-05, "loss": 0.9911, "step": 411000 }, { "epoch": 6.379676903738419, "grad_norm": 2.3171579837799072, "learning_rate": 4.362033861481401e-05, "loss": 0.9565, "step": 411100 }, { "epoch": 6.381228758981362, "grad_norm": 2.5742545127868652, "learning_rate": 4.361878675957107e-05, "loss": 0.9718, "step": 411200 }, { "epoch": 6.3827806142243055, "grad_norm": 2.818953037261963, "learning_rate": 4.3617234904328125e-05, "loss": 0.9776, "step": 411300 }, { "epoch": 6.384332469467248, "grad_norm": 2.123659133911133, "learning_rate": 4.361568304908518e-05, "loss": 0.9656, "step": 411400 }, { "epoch": 6.385884324710191, "grad_norm": 2.3287277221679688, "learning_rate": 4.361413119384224e-05, "loss": 0.9774, "step": 411500 }, { "epoch": 6.387436179953134, "grad_norm": 2.071604013442993, "learning_rate": 4.36125793385993e-05, "loss": 0.9758, "step": 411600 }, { "epoch": 6.388988035196077, "grad_norm": 2.004335880279541, "learning_rate": 4.3611027483356356e-05, "loss": 0.9845, "step": 411700 }, { "epoch": 6.39053989043902, "grad_norm": 2.0574488639831543, "learning_rate": 4.3609475628113414e-05, "loss": 0.952, "step": 411800 }, { "epoch": 6.392091745681963, "grad_norm": 2.0608198642730713, "learning_rate": 4.3607923772870465e-05, "loss": 0.9804, "step": 411900 }, { "epoch": 6.393643600924905, "grad_norm": 1.768845796585083, "learning_rate": 4.360637191762752e-05, "loss": 0.9759, "step": 412000 }, { "epoch": 6.395195456167849, "grad_norm": 2.503519058227539, "learning_rate": 4.360482006238458e-05, "loss": 0.9682, "step": 412100 }, { "epoch": 6.396747311410792, "grad_norm": 2.4141995906829834, "learning_rate": 4.360326820714164e-05, "loss": 0.9832, "step": 412200 }, { "epoch": 6.398299166653734, "grad_norm": 2.4085330963134766, "learning_rate": 4.3601716351898696e-05, "loss": 0.9728, "step": 412300 }, { "epoch": 6.399851021896677, "grad_norm": 2.5037903785705566, "learning_rate": 4.3600164496655754e-05, "loss": 0.977, "step": 412400 }, { "epoch": 6.4014028771396205, "grad_norm": 2.4910197257995605, "learning_rate": 4.359861264141281e-05, "loss": 0.9379, "step": 412500 }, { "epoch": 6.402954732382564, "grad_norm": 2.4563679695129395, "learning_rate": 4.359706078616987e-05, "loss": 0.9758, "step": 412600 }, { "epoch": 6.404506587625506, "grad_norm": 2.0692555904388428, "learning_rate": 4.359550893092693e-05, "loss": 0.9809, "step": 412700 }, { "epoch": 6.406058442868449, "grad_norm": 2.471853494644165, "learning_rate": 4.3593957075683985e-05, "loss": 0.9795, "step": 412800 }, { "epoch": 6.4076102981113925, "grad_norm": 2.385113000869751, "learning_rate": 4.359240522044104e-05, "loss": 0.9644, "step": 412900 }, { "epoch": 6.409162153354335, "grad_norm": 2.337942123413086, "learning_rate": 4.35908533651981e-05, "loss": 0.9622, "step": 413000 }, { "epoch": 6.410714008597278, "grad_norm": 2.487560749053955, "learning_rate": 4.358930150995516e-05, "loss": 0.9557, "step": 413100 }, { "epoch": 6.412265863840221, "grad_norm": 2.2363314628601074, "learning_rate": 4.358774965471221e-05, "loss": 0.9741, "step": 413200 }, { "epoch": 6.413817719083164, "grad_norm": 2.336608409881592, "learning_rate": 4.358619779946927e-05, "loss": 0.9842, "step": 413300 }, { "epoch": 6.415369574326107, "grad_norm": 2.480377197265625, "learning_rate": 4.3584645944226325e-05, "loss": 0.9542, "step": 413400 }, { "epoch": 6.41692142956905, "grad_norm": 2.4966518878936768, "learning_rate": 4.358309408898338e-05, "loss": 0.992, "step": 413500 }, { "epoch": 6.418473284811993, "grad_norm": 2.2411019802093506, "learning_rate": 4.358154223374044e-05, "loss": 0.9564, "step": 413600 }, { "epoch": 6.4200251400549355, "grad_norm": 2.3539044857025146, "learning_rate": 4.35799903784975e-05, "loss": 0.9619, "step": 413700 }, { "epoch": 6.421576995297879, "grad_norm": 2.5996665954589844, "learning_rate": 4.3578438523254556e-05, "loss": 0.9857, "step": 413800 }, { "epoch": 6.423128850540822, "grad_norm": 2.0330357551574707, "learning_rate": 4.357688666801161e-05, "loss": 0.9635, "step": 413900 }, { "epoch": 6.424680705783764, "grad_norm": 2.0502448081970215, "learning_rate": 4.3575334812768664e-05, "loss": 0.9655, "step": 414000 }, { "epoch": 6.4262325610267075, "grad_norm": 2.112494707107544, "learning_rate": 4.357378295752572e-05, "loss": 0.9791, "step": 414100 }, { "epoch": 6.427784416269651, "grad_norm": 2.4739649295806885, "learning_rate": 4.357223110228278e-05, "loss": 0.9709, "step": 414200 }, { "epoch": 6.429336271512593, "grad_norm": 2.0352835655212402, "learning_rate": 4.357067924703984e-05, "loss": 0.9496, "step": 414300 }, { "epoch": 6.430888126755536, "grad_norm": 2.2815184593200684, "learning_rate": 4.3569127391796895e-05, "loss": 0.9564, "step": 414400 }, { "epoch": 6.432439981998479, "grad_norm": 2.576735734939575, "learning_rate": 4.356757553655395e-05, "loss": 0.9646, "step": 414500 }, { "epoch": 6.433991837241422, "grad_norm": 2.3162832260131836, "learning_rate": 4.356602368131101e-05, "loss": 0.9592, "step": 414600 }, { "epoch": 6.435543692484365, "grad_norm": 2.7075114250183105, "learning_rate": 4.356447182606806e-05, "loss": 0.977, "step": 414700 }, { "epoch": 6.437095547727308, "grad_norm": 2.3292689323425293, "learning_rate": 4.356291997082512e-05, "loss": 0.9722, "step": 414800 }, { "epoch": 6.4386474029702505, "grad_norm": 2.404096841812134, "learning_rate": 4.356136811558218e-05, "loss": 0.9467, "step": 414900 }, { "epoch": 6.440199258213194, "grad_norm": 2.202815055847168, "learning_rate": 4.3559816260339235e-05, "loss": 0.9692, "step": 415000 }, { "epoch": 6.441751113456137, "grad_norm": 2.2449398040771484, "learning_rate": 4.355826440509629e-05, "loss": 0.9729, "step": 415100 }, { "epoch": 6.44330296869908, "grad_norm": 2.9412636756896973, "learning_rate": 4.355671254985335e-05, "loss": 0.9859, "step": 415200 }, { "epoch": 6.4448548239420225, "grad_norm": 2.2181241512298584, "learning_rate": 4.355516069461041e-05, "loss": 0.974, "step": 415300 }, { "epoch": 6.446406679184966, "grad_norm": 2.460430145263672, "learning_rate": 4.3553608839367466e-05, "loss": 0.9481, "step": 415400 }, { "epoch": 6.447958534427909, "grad_norm": 2.7513928413391113, "learning_rate": 4.3552056984124524e-05, "loss": 0.9648, "step": 415500 }, { "epoch": 6.449510389670851, "grad_norm": 2.3180179595947266, "learning_rate": 4.355050512888158e-05, "loss": 0.979, "step": 415600 }, { "epoch": 6.451062244913794, "grad_norm": 2.396193027496338, "learning_rate": 4.354895327363864e-05, "loss": 0.9736, "step": 415700 }, { "epoch": 6.452614100156738, "grad_norm": 2.209988832473755, "learning_rate": 4.35474014183957e-05, "loss": 0.9683, "step": 415800 }, { "epoch": 6.45416595539968, "grad_norm": 2.1172661781311035, "learning_rate": 4.3545849563152755e-05, "loss": 0.9637, "step": 415900 }, { "epoch": 6.455717810642623, "grad_norm": 2.075331687927246, "learning_rate": 4.3544297707909806e-05, "loss": 0.9823, "step": 416000 }, { "epoch": 6.457269665885566, "grad_norm": 2.2483606338500977, "learning_rate": 4.3542745852666864e-05, "loss": 0.9736, "step": 416100 }, { "epoch": 6.45882152112851, "grad_norm": 2.4009900093078613, "learning_rate": 4.354119399742392e-05, "loss": 0.9689, "step": 416200 }, { "epoch": 6.460373376371452, "grad_norm": 2.197854518890381, "learning_rate": 4.353964214218098e-05, "loss": 0.9753, "step": 416300 }, { "epoch": 6.461925231614395, "grad_norm": 2.52730655670166, "learning_rate": 4.353809028693804e-05, "loss": 0.955, "step": 416400 }, { "epoch": 6.463477086857338, "grad_norm": 2.1753857135772705, "learning_rate": 4.3536538431695095e-05, "loss": 0.9755, "step": 416500 }, { "epoch": 6.465028942100281, "grad_norm": 2.1001245975494385, "learning_rate": 4.353498657645215e-05, "loss": 0.9516, "step": 416600 }, { "epoch": 6.466580797343224, "grad_norm": 2.1102499961853027, "learning_rate": 4.353343472120921e-05, "loss": 0.9537, "step": 416700 }, { "epoch": 6.468132652586167, "grad_norm": 2.656090259552002, "learning_rate": 4.353188286596627e-05, "loss": 0.9995, "step": 416800 }, { "epoch": 6.469684507829109, "grad_norm": 2.182312488555908, "learning_rate": 4.3530331010723326e-05, "loss": 0.9695, "step": 416900 }, { "epoch": 6.471236363072053, "grad_norm": 2.736952781677246, "learning_rate": 4.3528779155480383e-05, "loss": 0.9663, "step": 417000 }, { "epoch": 6.472788218314996, "grad_norm": 2.4398269653320312, "learning_rate": 4.352722730023744e-05, "loss": 0.9536, "step": 417100 }, { "epoch": 6.474340073557938, "grad_norm": 2.5215742588043213, "learning_rate": 4.352567544499449e-05, "loss": 0.9841, "step": 417200 }, { "epoch": 6.475891928800881, "grad_norm": 1.9735440015792847, "learning_rate": 4.352412358975155e-05, "loss": 0.9538, "step": 417300 }, { "epoch": 6.477443784043825, "grad_norm": 2.09611177444458, "learning_rate": 4.352257173450861e-05, "loss": 0.9554, "step": 417400 }, { "epoch": 6.478995639286767, "grad_norm": 2.1164262294769287, "learning_rate": 4.352101987926566e-05, "loss": 0.9651, "step": 417500 }, { "epoch": 6.48054749452971, "grad_norm": 2.137082099914551, "learning_rate": 4.3519468024022716e-05, "loss": 0.9688, "step": 417600 }, { "epoch": 6.482099349772653, "grad_norm": 2.5171310901641846, "learning_rate": 4.3517916168779774e-05, "loss": 0.9744, "step": 417700 }, { "epoch": 6.4836512050155966, "grad_norm": 2.2302048206329346, "learning_rate": 4.351636431353683e-05, "loss": 0.9542, "step": 417800 }, { "epoch": 6.485203060258539, "grad_norm": 2.363054037094116, "learning_rate": 4.351481245829389e-05, "loss": 0.9592, "step": 417900 }, { "epoch": 6.486754915501482, "grad_norm": 2.129586935043335, "learning_rate": 4.351326060305095e-05, "loss": 0.9648, "step": 418000 }, { "epoch": 6.488306770744425, "grad_norm": 2.0864787101745605, "learning_rate": 4.3511708747808005e-05, "loss": 0.9477, "step": 418100 }, { "epoch": 6.489858625987368, "grad_norm": 2.5064265727996826, "learning_rate": 4.351015689256506e-05, "loss": 0.9688, "step": 418200 }, { "epoch": 6.491410481230311, "grad_norm": 2.431048631668091, "learning_rate": 4.350860503732212e-05, "loss": 0.9731, "step": 418300 }, { "epoch": 6.492962336473254, "grad_norm": 2.903663396835327, "learning_rate": 4.350705318207918e-05, "loss": 0.9534, "step": 418400 }, { "epoch": 6.494514191716196, "grad_norm": 2.116795778274536, "learning_rate": 4.3505501326836236e-05, "loss": 0.9817, "step": 418500 }, { "epoch": 6.49606604695914, "grad_norm": 2.5668110847473145, "learning_rate": 4.3503949471593294e-05, "loss": 0.9706, "step": 418600 }, { "epoch": 6.497617902202083, "grad_norm": 2.0160510540008545, "learning_rate": 4.350239761635035e-05, "loss": 0.9851, "step": 418700 }, { "epoch": 6.499169757445025, "grad_norm": 1.8833749294281006, "learning_rate": 4.35008457611074e-05, "loss": 0.959, "step": 418800 }, { "epoch": 6.500721612687968, "grad_norm": 2.4650609493255615, "learning_rate": 4.349929390586446e-05, "loss": 0.9372, "step": 418900 }, { "epoch": 6.5022734679309115, "grad_norm": 2.117232322692871, "learning_rate": 4.349774205062152e-05, "loss": 0.9827, "step": 419000 }, { "epoch": 6.503825323173855, "grad_norm": 2.1824660301208496, "learning_rate": 4.3496190195378576e-05, "loss": 0.9566, "step": 419100 }, { "epoch": 6.505377178416797, "grad_norm": 2.1648807525634766, "learning_rate": 4.3494638340135634e-05, "loss": 0.9667, "step": 419200 }, { "epoch": 6.50692903365974, "grad_norm": 2.1830403804779053, "learning_rate": 4.349308648489269e-05, "loss": 0.9748, "step": 419300 }, { "epoch": 6.5084808889026835, "grad_norm": 2.006662368774414, "learning_rate": 4.349153462964975e-05, "loss": 0.9716, "step": 419400 }, { "epoch": 6.510032744145626, "grad_norm": 2.364398956298828, "learning_rate": 4.348998277440681e-05, "loss": 0.9601, "step": 419500 }, { "epoch": 6.511584599388569, "grad_norm": 1.81155526638031, "learning_rate": 4.3488430919163865e-05, "loss": 0.9727, "step": 419600 }, { "epoch": 6.513136454631512, "grad_norm": 1.618113398551941, "learning_rate": 4.348687906392092e-05, "loss": 0.9601, "step": 419700 }, { "epoch": 6.514688309874455, "grad_norm": 2.53412127494812, "learning_rate": 4.348532720867798e-05, "loss": 0.9627, "step": 419800 }, { "epoch": 6.516240165117398, "grad_norm": 2.1579275131225586, "learning_rate": 4.348377535343504e-05, "loss": 0.9653, "step": 419900 }, { "epoch": 6.517792020360341, "grad_norm": 2.228121519088745, "learning_rate": 4.3482223498192096e-05, "loss": 0.9528, "step": 420000 }, { "epoch": 6.519343875603283, "grad_norm": 2.453296184539795, "learning_rate": 4.348067164294915e-05, "loss": 0.9689, "step": 420100 }, { "epoch": 6.5208957308462265, "grad_norm": 2.2692201137542725, "learning_rate": 4.3479119787706204e-05, "loss": 0.9582, "step": 420200 }, { "epoch": 6.52244758608917, "grad_norm": 1.6923892498016357, "learning_rate": 4.347756793246326e-05, "loss": 0.9693, "step": 420300 }, { "epoch": 6.523999441332112, "grad_norm": 2.295419931411743, "learning_rate": 4.347601607722031e-05, "loss": 0.961, "step": 420400 }, { "epoch": 6.525551296575055, "grad_norm": 2.2035574913024902, "learning_rate": 4.347446422197737e-05, "loss": 0.9802, "step": 420500 }, { "epoch": 6.5271031518179985, "grad_norm": 2.311668872833252, "learning_rate": 4.347291236673443e-05, "loss": 0.9567, "step": 420600 }, { "epoch": 6.528655007060942, "grad_norm": 2.2198069095611572, "learning_rate": 4.3471360511491486e-05, "loss": 0.9622, "step": 420700 }, { "epoch": 6.530206862303884, "grad_norm": 1.9989866018295288, "learning_rate": 4.3469808656248544e-05, "loss": 0.9552, "step": 420800 }, { "epoch": 6.531758717546827, "grad_norm": 2.0765514373779297, "learning_rate": 4.34682568010056e-05, "loss": 0.9663, "step": 420900 }, { "epoch": 6.5333105727897705, "grad_norm": 1.9403561353683472, "learning_rate": 4.346670494576266e-05, "loss": 0.9447, "step": 421000 }, { "epoch": 6.534862428032713, "grad_norm": 2.1181282997131348, "learning_rate": 4.346515309051972e-05, "loss": 0.9549, "step": 421100 }, { "epoch": 6.536414283275656, "grad_norm": 2.55145263671875, "learning_rate": 4.3463601235276775e-05, "loss": 0.9608, "step": 421200 }, { "epoch": 6.537966138518599, "grad_norm": 1.966909408569336, "learning_rate": 4.346204938003383e-05, "loss": 0.9511, "step": 421300 }, { "epoch": 6.539517993761542, "grad_norm": 2.0743229389190674, "learning_rate": 4.346049752479089e-05, "loss": 0.9773, "step": 421400 }, { "epoch": 6.541069849004485, "grad_norm": 2.037116289138794, "learning_rate": 4.345894566954795e-05, "loss": 0.9605, "step": 421500 }, { "epoch": 6.542621704247428, "grad_norm": 2.333439588546753, "learning_rate": 4.3457393814305006e-05, "loss": 0.9628, "step": 421600 }, { "epoch": 6.544173559490371, "grad_norm": 2.0217251777648926, "learning_rate": 4.345584195906206e-05, "loss": 0.9599, "step": 421700 }, { "epoch": 6.5457254147333135, "grad_norm": 1.8909906148910522, "learning_rate": 4.3454290103819115e-05, "loss": 0.9803, "step": 421800 }, { "epoch": 6.547277269976257, "grad_norm": 2.204542398452759, "learning_rate": 4.345273824857617e-05, "loss": 0.9688, "step": 421900 }, { "epoch": 6.5488291252192, "grad_norm": 2.45114803314209, "learning_rate": 4.345118639333323e-05, "loss": 1.004, "step": 422000 }, { "epoch": 6.550380980462142, "grad_norm": 2.446262836456299, "learning_rate": 4.344963453809029e-05, "loss": 0.9702, "step": 422100 }, { "epoch": 6.5519328357050854, "grad_norm": 2.1272571086883545, "learning_rate": 4.3448082682847346e-05, "loss": 0.9928, "step": 422200 }, { "epoch": 6.553484690948029, "grad_norm": 2.0498876571655273, "learning_rate": 4.3446530827604404e-05, "loss": 0.9567, "step": 422300 }, { "epoch": 6.555036546190971, "grad_norm": 2.039031744003296, "learning_rate": 4.344497897236146e-05, "loss": 0.96, "step": 422400 }, { "epoch": 6.556588401433914, "grad_norm": 1.8312418460845947, "learning_rate": 4.344342711711852e-05, "loss": 0.9648, "step": 422500 }, { "epoch": 6.558140256676857, "grad_norm": 2.2084076404571533, "learning_rate": 4.344187526187558e-05, "loss": 0.9759, "step": 422600 }, { "epoch": 6.5596921119198, "grad_norm": 2.29850697517395, "learning_rate": 4.3440323406632635e-05, "loss": 0.9838, "step": 422700 }, { "epoch": 6.561243967162743, "grad_norm": 2.330251932144165, "learning_rate": 4.343877155138969e-05, "loss": 0.9489, "step": 422800 }, { "epoch": 6.562795822405686, "grad_norm": 2.4128081798553467, "learning_rate": 4.343721969614675e-05, "loss": 0.9499, "step": 422900 }, { "epoch": 6.5643476776486285, "grad_norm": 2.6081063747406006, "learning_rate": 4.34356678409038e-05, "loss": 0.9723, "step": 423000 }, { "epoch": 6.565899532891572, "grad_norm": 2.206106424331665, "learning_rate": 4.343411598566086e-05, "loss": 0.9606, "step": 423100 }, { "epoch": 6.567451388134515, "grad_norm": 2.5828254222869873, "learning_rate": 4.343256413041792e-05, "loss": 0.9711, "step": 423200 }, { "epoch": 6.569003243377458, "grad_norm": 2.2720930576324463, "learning_rate": 4.3431012275174974e-05, "loss": 0.9587, "step": 423300 }, { "epoch": 6.5705550986204, "grad_norm": 2.2720694541931152, "learning_rate": 4.342946041993203e-05, "loss": 0.9602, "step": 423400 }, { "epoch": 6.572106953863344, "grad_norm": 2.7443814277648926, "learning_rate": 4.342790856468909e-05, "loss": 0.9489, "step": 423500 }, { "epoch": 6.573658809106287, "grad_norm": 2.089165687561035, "learning_rate": 4.342635670944615e-05, "loss": 0.953, "step": 423600 }, { "epoch": 6.575210664349229, "grad_norm": 2.6473116874694824, "learning_rate": 4.34248048542032e-05, "loss": 0.9593, "step": 423700 }, { "epoch": 6.576762519592172, "grad_norm": 2.5216798782348633, "learning_rate": 4.3423252998960256e-05, "loss": 0.9502, "step": 423800 }, { "epoch": 6.578314374835116, "grad_norm": 2.220024585723877, "learning_rate": 4.3421701143717314e-05, "loss": 0.9571, "step": 423900 }, { "epoch": 6.579866230078058, "grad_norm": 2.1186959743499756, "learning_rate": 4.342014928847437e-05, "loss": 0.9581, "step": 424000 }, { "epoch": 6.581418085321001, "grad_norm": 2.213369131088257, "learning_rate": 4.341859743323143e-05, "loss": 0.9729, "step": 424100 }, { "epoch": 6.582969940563944, "grad_norm": 2.0517101287841797, "learning_rate": 4.341704557798849e-05, "loss": 0.9773, "step": 424200 }, { "epoch": 6.584521795806888, "grad_norm": 2.7900636196136475, "learning_rate": 4.3415493722745545e-05, "loss": 0.959, "step": 424300 }, { "epoch": 6.58607365104983, "grad_norm": 2.2764623165130615, "learning_rate": 4.34139418675026e-05, "loss": 0.9689, "step": 424400 }, { "epoch": 6.587625506292773, "grad_norm": 2.284311056137085, "learning_rate": 4.3412390012259654e-05, "loss": 0.9698, "step": 424500 }, { "epoch": 6.589177361535716, "grad_norm": 2.0992631912231445, "learning_rate": 4.341083815701671e-05, "loss": 0.9773, "step": 424600 }, { "epoch": 6.590729216778659, "grad_norm": 2.812044858932495, "learning_rate": 4.340928630177377e-05, "loss": 0.9632, "step": 424700 }, { "epoch": 6.592281072021602, "grad_norm": 2.621793031692505, "learning_rate": 4.340773444653083e-05, "loss": 0.9627, "step": 424800 }, { "epoch": 6.593832927264545, "grad_norm": 2.128485679626465, "learning_rate": 4.3406182591287885e-05, "loss": 0.9609, "step": 424900 }, { "epoch": 6.595384782507487, "grad_norm": 2.5162482261657715, "learning_rate": 4.340463073604494e-05, "loss": 0.9492, "step": 425000 }, { "epoch": 6.596936637750431, "grad_norm": 2.232084035873413, "learning_rate": 4.3403078880802e-05, "loss": 0.9496, "step": 425100 }, { "epoch": 6.598488492993374, "grad_norm": 2.37396502494812, "learning_rate": 4.340152702555906e-05, "loss": 0.964, "step": 425200 }, { "epoch": 6.600040348236316, "grad_norm": 2.530705213546753, "learning_rate": 4.3399975170316116e-05, "loss": 0.9633, "step": 425300 }, { "epoch": 6.601592203479259, "grad_norm": 2.438948154449463, "learning_rate": 4.3398423315073174e-05, "loss": 0.9635, "step": 425400 }, { "epoch": 6.603144058722203, "grad_norm": 1.9976087808609009, "learning_rate": 4.339687145983023e-05, "loss": 0.9708, "step": 425500 }, { "epoch": 6.604695913965145, "grad_norm": 2.3859241008758545, "learning_rate": 4.339531960458729e-05, "loss": 0.9753, "step": 425600 }, { "epoch": 6.606247769208088, "grad_norm": 2.56014084815979, "learning_rate": 4.339376774934435e-05, "loss": 0.9589, "step": 425700 }, { "epoch": 6.607799624451031, "grad_norm": 2.2402284145355225, "learning_rate": 4.33922158941014e-05, "loss": 0.9566, "step": 425800 }, { "epoch": 6.6093514796939745, "grad_norm": 1.9381129741668701, "learning_rate": 4.3390664038858456e-05, "loss": 0.9678, "step": 425900 }, { "epoch": 6.610903334936917, "grad_norm": 2.5849661827087402, "learning_rate": 4.3389112183615513e-05, "loss": 0.964, "step": 426000 }, { "epoch": 6.61245519017986, "grad_norm": 2.0342535972595215, "learning_rate": 4.338756032837257e-05, "loss": 0.9654, "step": 426100 }, { "epoch": 6.614007045422803, "grad_norm": 2.233546257019043, "learning_rate": 4.338600847312963e-05, "loss": 0.9747, "step": 426200 }, { "epoch": 6.615558900665746, "grad_norm": 2.464162826538086, "learning_rate": 4.338445661788669e-05, "loss": 0.9636, "step": 426300 }, { "epoch": 6.617110755908689, "grad_norm": 1.985101580619812, "learning_rate": 4.3382904762643744e-05, "loss": 0.9635, "step": 426400 }, { "epoch": 6.618662611151632, "grad_norm": 1.6616899967193604, "learning_rate": 4.33813529074008e-05, "loss": 0.9625, "step": 426500 }, { "epoch": 6.620214466394574, "grad_norm": 1.789625883102417, "learning_rate": 4.337980105215786e-05, "loss": 0.9757, "step": 426600 }, { "epoch": 6.6217663216375175, "grad_norm": 2.6055991649627686, "learning_rate": 4.337824919691492e-05, "loss": 0.9679, "step": 426700 }, { "epoch": 6.623318176880461, "grad_norm": 1.6995948553085327, "learning_rate": 4.3376697341671975e-05, "loss": 0.9599, "step": 426800 }, { "epoch": 6.624870032123404, "grad_norm": 2.0643959045410156, "learning_rate": 4.337514548642903e-05, "loss": 0.9689, "step": 426900 }, { "epoch": 6.626421887366346, "grad_norm": 2.1341381072998047, "learning_rate": 4.3373593631186084e-05, "loss": 0.9667, "step": 427000 }, { "epoch": 6.6279737426092895, "grad_norm": 1.8998945951461792, "learning_rate": 4.337204177594314e-05, "loss": 0.9709, "step": 427100 }, { "epoch": 6.629525597852233, "grad_norm": 2.09203839302063, "learning_rate": 4.33704899207002e-05, "loss": 0.9689, "step": 427200 }, { "epoch": 6.631077453095175, "grad_norm": 2.0841968059539795, "learning_rate": 4.336893806545726e-05, "loss": 0.966, "step": 427300 }, { "epoch": 6.632629308338118, "grad_norm": 1.915770411491394, "learning_rate": 4.336738621021431e-05, "loss": 0.9363, "step": 427400 }, { "epoch": 6.6341811635810615, "grad_norm": 2.285073757171631, "learning_rate": 4.3365834354971366e-05, "loss": 0.963, "step": 427500 }, { "epoch": 6.635733018824004, "grad_norm": 2.538499355316162, "learning_rate": 4.3364282499728424e-05, "loss": 0.9612, "step": 427600 }, { "epoch": 6.637284874066947, "grad_norm": 2.306152105331421, "learning_rate": 4.336273064448548e-05, "loss": 0.9468, "step": 427700 }, { "epoch": 6.63883672930989, "grad_norm": 2.27289080619812, "learning_rate": 4.336117878924254e-05, "loss": 0.9669, "step": 427800 }, { "epoch": 6.6403885845528325, "grad_norm": 1.9609386920928955, "learning_rate": 4.33596269339996e-05, "loss": 0.9427, "step": 427900 }, { "epoch": 6.641940439795776, "grad_norm": 2.150015115737915, "learning_rate": 4.3358075078756655e-05, "loss": 0.9592, "step": 428000 }, { "epoch": 6.643492295038719, "grad_norm": 2.085994005203247, "learning_rate": 4.335652322351371e-05, "loss": 0.9576, "step": 428100 }, { "epoch": 6.645044150281661, "grad_norm": 2.246272563934326, "learning_rate": 4.335497136827077e-05, "loss": 0.9792, "step": 428200 }, { "epoch": 6.6465960055246045, "grad_norm": 2.502819538116455, "learning_rate": 4.335341951302783e-05, "loss": 0.9841, "step": 428300 }, { "epoch": 6.648147860767548, "grad_norm": 2.2759687900543213, "learning_rate": 4.3351867657784886e-05, "loss": 0.981, "step": 428400 }, { "epoch": 6.649699716010491, "grad_norm": 1.9931080341339111, "learning_rate": 4.3350315802541944e-05, "loss": 0.9718, "step": 428500 }, { "epoch": 6.651251571253433, "grad_norm": 2.1190154552459717, "learning_rate": 4.3348763947299e-05, "loss": 0.96, "step": 428600 }, { "epoch": 6.6528034264963765, "grad_norm": 2.188507556915283, "learning_rate": 4.334721209205605e-05, "loss": 0.9706, "step": 428700 }, { "epoch": 6.65435528173932, "grad_norm": 2.1184189319610596, "learning_rate": 4.334566023681311e-05, "loss": 0.9589, "step": 428800 }, { "epoch": 6.655907136982262, "grad_norm": 2.58760929107666, "learning_rate": 4.334410838157017e-05, "loss": 0.9551, "step": 428900 }, { "epoch": 6.657458992225205, "grad_norm": 2.420637845993042, "learning_rate": 4.3342556526327226e-05, "loss": 0.952, "step": 429000 }, { "epoch": 6.659010847468148, "grad_norm": 2.369595527648926, "learning_rate": 4.3341004671084283e-05, "loss": 0.9683, "step": 429100 }, { "epoch": 6.660562702711091, "grad_norm": 2.055168628692627, "learning_rate": 4.333945281584134e-05, "loss": 0.9641, "step": 429200 }, { "epoch": 6.662114557954034, "grad_norm": 2.1766114234924316, "learning_rate": 4.33379009605984e-05, "loss": 0.9614, "step": 429300 }, { "epoch": 6.663666413196977, "grad_norm": 2.3143973350524902, "learning_rate": 4.333634910535546e-05, "loss": 0.9803, "step": 429400 }, { "epoch": 6.66521826843992, "grad_norm": 2.3827552795410156, "learning_rate": 4.3334797250112514e-05, "loss": 0.9738, "step": 429500 }, { "epoch": 6.666770123682863, "grad_norm": 2.6563448905944824, "learning_rate": 4.333324539486957e-05, "loss": 0.9801, "step": 429600 }, { "epoch": 6.668321978925806, "grad_norm": 2.5449228286743164, "learning_rate": 4.333169353962663e-05, "loss": 0.9633, "step": 429700 }, { "epoch": 6.669873834168749, "grad_norm": 1.899495005607605, "learning_rate": 4.333014168438369e-05, "loss": 0.9524, "step": 429800 }, { "epoch": 6.6714256894116915, "grad_norm": 2.0042564868927, "learning_rate": 4.3328589829140745e-05, "loss": 0.9818, "step": 429900 }, { "epoch": 6.672977544654635, "grad_norm": 2.081190824508667, "learning_rate": 4.3327037973897796e-05, "loss": 0.9743, "step": 430000 }, { "epoch": 6.674529399897578, "grad_norm": 2.2775936126708984, "learning_rate": 4.3325486118654854e-05, "loss": 0.9717, "step": 430100 }, { "epoch": 6.67608125514052, "grad_norm": 2.2846286296844482, "learning_rate": 4.3323934263411905e-05, "loss": 0.958, "step": 430200 }, { "epoch": 6.677633110383463, "grad_norm": 2.0574283599853516, "learning_rate": 4.332238240816896e-05, "loss": 0.9641, "step": 430300 }, { "epoch": 6.679184965626407, "grad_norm": 2.179159164428711, "learning_rate": 4.332083055292602e-05, "loss": 0.9428, "step": 430400 }, { "epoch": 6.680736820869349, "grad_norm": 1.985873818397522, "learning_rate": 4.331927869768308e-05, "loss": 0.9519, "step": 430500 }, { "epoch": 6.682288676112292, "grad_norm": 2.1224048137664795, "learning_rate": 4.3317726842440136e-05, "loss": 0.9807, "step": 430600 }, { "epoch": 6.683840531355235, "grad_norm": 2.1241307258605957, "learning_rate": 4.3316174987197194e-05, "loss": 0.9715, "step": 430700 }, { "epoch": 6.685392386598178, "grad_norm": 2.251311779022217, "learning_rate": 4.331462313195425e-05, "loss": 0.9776, "step": 430800 }, { "epoch": 6.686944241841121, "grad_norm": 1.9129951000213623, "learning_rate": 4.331307127671131e-05, "loss": 0.9647, "step": 430900 }, { "epoch": 6.688496097084064, "grad_norm": 2.598257064819336, "learning_rate": 4.331151942146837e-05, "loss": 0.9893, "step": 431000 }, { "epoch": 6.690047952327006, "grad_norm": 2.1570096015930176, "learning_rate": 4.3309967566225425e-05, "loss": 0.967, "step": 431100 }, { "epoch": 6.69159980756995, "grad_norm": 1.9991873502731323, "learning_rate": 4.330841571098248e-05, "loss": 0.9677, "step": 431200 }, { "epoch": 6.693151662812893, "grad_norm": 2.108656167984009, "learning_rate": 4.330686385573954e-05, "loss": 0.9805, "step": 431300 }, { "epoch": 6.694703518055836, "grad_norm": 2.5301196575164795, "learning_rate": 4.33053120004966e-05, "loss": 0.9821, "step": 431400 }, { "epoch": 6.696255373298778, "grad_norm": 2.2393667697906494, "learning_rate": 4.330376014525365e-05, "loss": 0.9449, "step": 431500 }, { "epoch": 6.697807228541722, "grad_norm": 2.1805450916290283, "learning_rate": 4.330220829001071e-05, "loss": 0.9795, "step": 431600 }, { "epoch": 6.699359083784665, "grad_norm": 2.4003820419311523, "learning_rate": 4.3300656434767765e-05, "loss": 0.9792, "step": 431700 }, { "epoch": 6.700910939027607, "grad_norm": 2.1959033012390137, "learning_rate": 4.329910457952482e-05, "loss": 0.9577, "step": 431800 }, { "epoch": 6.70246279427055, "grad_norm": 2.564363479614258, "learning_rate": 4.329755272428188e-05, "loss": 0.9637, "step": 431900 }, { "epoch": 6.704014649513494, "grad_norm": 2.331374406814575, "learning_rate": 4.329600086903894e-05, "loss": 0.9616, "step": 432000 }, { "epoch": 6.705566504756437, "grad_norm": 1.8874894380569458, "learning_rate": 4.3294449013795996e-05, "loss": 0.971, "step": 432100 }, { "epoch": 6.707118359999379, "grad_norm": 2.1836538314819336, "learning_rate": 4.3292897158553053e-05, "loss": 0.969, "step": 432200 }, { "epoch": 6.708670215242322, "grad_norm": 2.307040214538574, "learning_rate": 4.329134530331011e-05, "loss": 0.9627, "step": 432300 }, { "epoch": 6.7102220704852655, "grad_norm": 2.0233044624328613, "learning_rate": 4.328979344806717e-05, "loss": 0.9515, "step": 432400 }, { "epoch": 6.711773925728208, "grad_norm": 2.0123188495635986, "learning_rate": 4.328824159282423e-05, "loss": 0.9861, "step": 432500 }, { "epoch": 6.713325780971151, "grad_norm": 2.378725051879883, "learning_rate": 4.3286689737581284e-05, "loss": 0.9651, "step": 432600 }, { "epoch": 6.714877636214094, "grad_norm": 2.286808729171753, "learning_rate": 4.328513788233834e-05, "loss": 0.9613, "step": 432700 }, { "epoch": 6.716429491457037, "grad_norm": 2.237541675567627, "learning_rate": 4.328358602709539e-05, "loss": 0.9666, "step": 432800 }, { "epoch": 6.71798134669998, "grad_norm": 2.449968099594116, "learning_rate": 4.328203417185245e-05, "loss": 0.9755, "step": 432900 }, { "epoch": 6.719533201942923, "grad_norm": 2.467799425125122, "learning_rate": 4.328048231660951e-05, "loss": 0.9691, "step": 433000 }, { "epoch": 6.721085057185865, "grad_norm": 2.7805891036987305, "learning_rate": 4.3278930461366566e-05, "loss": 0.9504, "step": 433100 }, { "epoch": 6.722636912428809, "grad_norm": 1.891644835472107, "learning_rate": 4.3277378606123624e-05, "loss": 0.9625, "step": 433200 }, { "epoch": 6.724188767671752, "grad_norm": 2.1541478633880615, "learning_rate": 4.327582675088068e-05, "loss": 0.9613, "step": 433300 }, { "epoch": 6.725740622914694, "grad_norm": 2.3072755336761475, "learning_rate": 4.327427489563774e-05, "loss": 0.9424, "step": 433400 }, { "epoch": 6.727292478157637, "grad_norm": 2.7168760299682617, "learning_rate": 4.327272304039479e-05, "loss": 0.9745, "step": 433500 }, { "epoch": 6.7288443334005805, "grad_norm": 2.453251838684082, "learning_rate": 4.327117118515185e-05, "loss": 0.9873, "step": 433600 }, { "epoch": 6.730396188643523, "grad_norm": 2.4342517852783203, "learning_rate": 4.3269619329908906e-05, "loss": 0.959, "step": 433700 }, { "epoch": 6.731948043886466, "grad_norm": 2.4158682823181152, "learning_rate": 4.3268067474665964e-05, "loss": 0.956, "step": 433800 }, { "epoch": 6.733499899129409, "grad_norm": 2.526888370513916, "learning_rate": 4.326651561942302e-05, "loss": 0.9602, "step": 433900 }, { "epoch": 6.7350517543723525, "grad_norm": 2.4079947471618652, "learning_rate": 4.326496376418008e-05, "loss": 0.9749, "step": 434000 }, { "epoch": 6.736603609615295, "grad_norm": 2.1496877670288086, "learning_rate": 4.326341190893714e-05, "loss": 0.9712, "step": 434100 }, { "epoch": 6.738155464858238, "grad_norm": 2.4161407947540283, "learning_rate": 4.3261860053694195e-05, "loss": 0.9587, "step": 434200 }, { "epoch": 6.739707320101181, "grad_norm": 2.3657705783843994, "learning_rate": 4.3260308198451246e-05, "loss": 0.9619, "step": 434300 }, { "epoch": 6.7412591753441236, "grad_norm": 2.353041172027588, "learning_rate": 4.3258756343208304e-05, "loss": 0.9697, "step": 434400 }, { "epoch": 6.742811030587067, "grad_norm": 2.350092649459839, "learning_rate": 4.325720448796536e-05, "loss": 0.9607, "step": 434500 }, { "epoch": 6.74436288583001, "grad_norm": 2.28694486618042, "learning_rate": 4.325565263272242e-05, "loss": 0.9629, "step": 434600 }, { "epoch": 6.745914741072953, "grad_norm": 2.063788890838623, "learning_rate": 4.325410077747948e-05, "loss": 0.9716, "step": 434700 }, { "epoch": 6.7474665963158955, "grad_norm": 2.3966639041900635, "learning_rate": 4.3252548922236535e-05, "loss": 0.9543, "step": 434800 }, { "epoch": 6.749018451558839, "grad_norm": 2.3289926052093506, "learning_rate": 4.325099706699359e-05, "loss": 0.962, "step": 434900 }, { "epoch": 6.750570306801782, "grad_norm": 2.1907060146331787, "learning_rate": 4.324944521175065e-05, "loss": 0.9762, "step": 435000 }, { "epoch": 6.752122162044724, "grad_norm": 1.9231843948364258, "learning_rate": 4.324789335650771e-05, "loss": 0.9681, "step": 435100 }, { "epoch": 6.7536740172876675, "grad_norm": 2.1809346675872803, "learning_rate": 4.3246341501264766e-05, "loss": 0.9539, "step": 435200 }, { "epoch": 6.755225872530611, "grad_norm": 2.1257243156433105, "learning_rate": 4.3244789646021823e-05, "loss": 0.9708, "step": 435300 }, { "epoch": 6.756777727773553, "grad_norm": 2.094426155090332, "learning_rate": 4.324323779077888e-05, "loss": 0.9741, "step": 435400 }, { "epoch": 6.758329583016496, "grad_norm": 2.3985161781311035, "learning_rate": 4.324168593553594e-05, "loss": 0.9677, "step": 435500 }, { "epoch": 6.759881438259439, "grad_norm": 2.131499767303467, "learning_rate": 4.324013408029299e-05, "loss": 0.9576, "step": 435600 }, { "epoch": 6.761433293502382, "grad_norm": 2.861760139465332, "learning_rate": 4.323858222505005e-05, "loss": 0.9631, "step": 435700 }, { "epoch": 6.762985148745325, "grad_norm": 1.9125932455062866, "learning_rate": 4.3237030369807105e-05, "loss": 0.9623, "step": 435800 }, { "epoch": 6.764537003988268, "grad_norm": 2.1935696601867676, "learning_rate": 4.323547851456416e-05, "loss": 0.9452, "step": 435900 }, { "epoch": 6.7660888592312105, "grad_norm": 2.0657761096954346, "learning_rate": 4.323392665932122e-05, "loss": 0.9628, "step": 436000 }, { "epoch": 6.767640714474154, "grad_norm": 2.278221845626831, "learning_rate": 4.323237480407828e-05, "loss": 0.9688, "step": 436100 }, { "epoch": 6.769192569717097, "grad_norm": 2.1598312854766846, "learning_rate": 4.3230822948835336e-05, "loss": 0.9691, "step": 436200 }, { "epoch": 6.770744424960039, "grad_norm": 1.6591007709503174, "learning_rate": 4.3229271093592394e-05, "loss": 0.963, "step": 436300 }, { "epoch": 6.7722962802029825, "grad_norm": 2.2094757556915283, "learning_rate": 4.322771923834945e-05, "loss": 0.9561, "step": 436400 }, { "epoch": 6.773848135445926, "grad_norm": 2.10581374168396, "learning_rate": 4.322616738310651e-05, "loss": 0.9623, "step": 436500 }, { "epoch": 6.775399990688869, "grad_norm": 2.179748773574829, "learning_rate": 4.322461552786357e-05, "loss": 0.9918, "step": 436600 }, { "epoch": 6.776951845931811, "grad_norm": 2.3291804790496826, "learning_rate": 4.322306367262062e-05, "loss": 0.9712, "step": 436700 }, { "epoch": 6.778503701174754, "grad_norm": 2.1765499114990234, "learning_rate": 4.3221511817377676e-05, "loss": 0.9502, "step": 436800 }, { "epoch": 6.780055556417698, "grad_norm": 2.0082015991210938, "learning_rate": 4.3219959962134734e-05, "loss": 0.9732, "step": 436900 }, { "epoch": 6.78160741166064, "grad_norm": 2.4658210277557373, "learning_rate": 4.321840810689179e-05, "loss": 0.9522, "step": 437000 }, { "epoch": 6.783159266903583, "grad_norm": 2.3163704872131348, "learning_rate": 4.321685625164885e-05, "loss": 0.9536, "step": 437100 }, { "epoch": 6.784711122146526, "grad_norm": 2.341322183609009, "learning_rate": 4.32153043964059e-05, "loss": 0.9562, "step": 437200 }, { "epoch": 6.78626297738947, "grad_norm": 2.8513011932373047, "learning_rate": 4.321375254116296e-05, "loss": 0.981, "step": 437300 }, { "epoch": 6.787814832632412, "grad_norm": 2.0854692459106445, "learning_rate": 4.3212200685920016e-05, "loss": 0.971, "step": 437400 }, { "epoch": 6.789366687875355, "grad_norm": 2.393674612045288, "learning_rate": 4.3210648830677074e-05, "loss": 0.9685, "step": 437500 }, { "epoch": 6.790918543118298, "grad_norm": 2.2793149948120117, "learning_rate": 4.320909697543413e-05, "loss": 0.986, "step": 437600 }, { "epoch": 6.792470398361241, "grad_norm": 2.5252761840820312, "learning_rate": 4.320754512019119e-05, "loss": 0.94, "step": 437700 }, { "epoch": 6.794022253604184, "grad_norm": 2.4589061737060547, "learning_rate": 4.320599326494825e-05, "loss": 0.9562, "step": 437800 }, { "epoch": 6.795574108847127, "grad_norm": 2.3959240913391113, "learning_rate": 4.3204441409705305e-05, "loss": 0.9604, "step": 437900 }, { "epoch": 6.797125964090069, "grad_norm": 2.208648681640625, "learning_rate": 4.320288955446236e-05, "loss": 0.9622, "step": 438000 }, { "epoch": 6.798677819333013, "grad_norm": 1.9950900077819824, "learning_rate": 4.320133769921942e-05, "loss": 0.9511, "step": 438100 }, { "epoch": 6.800229674575956, "grad_norm": 1.8762975931167603, "learning_rate": 4.319978584397648e-05, "loss": 0.9541, "step": 438200 }, { "epoch": 6.801781529818898, "grad_norm": 2.114628553390503, "learning_rate": 4.3198233988733536e-05, "loss": 0.9444, "step": 438300 }, { "epoch": 6.803333385061841, "grad_norm": 2.45674991607666, "learning_rate": 4.3196682133490593e-05, "loss": 0.9856, "step": 438400 }, { "epoch": 6.804885240304785, "grad_norm": 1.924883484840393, "learning_rate": 4.3195130278247644e-05, "loss": 0.96, "step": 438500 }, { "epoch": 6.806437095547727, "grad_norm": 2.017695665359497, "learning_rate": 4.31935784230047e-05, "loss": 0.9738, "step": 438600 }, { "epoch": 6.80798895079067, "grad_norm": 1.899924635887146, "learning_rate": 4.319202656776176e-05, "loss": 0.9702, "step": 438700 }, { "epoch": 6.809540806033613, "grad_norm": 2.0267887115478516, "learning_rate": 4.319047471251882e-05, "loss": 0.9567, "step": 438800 }, { "epoch": 6.811092661276556, "grad_norm": 2.2755370140075684, "learning_rate": 4.3188922857275875e-05, "loss": 0.9492, "step": 438900 }, { "epoch": 6.812644516519499, "grad_norm": 2.377073287963867, "learning_rate": 4.318737100203293e-05, "loss": 0.9425, "step": 439000 }, { "epoch": 6.814196371762442, "grad_norm": 2.4295871257781982, "learning_rate": 4.318581914678999e-05, "loss": 0.956, "step": 439100 }, { "epoch": 6.815748227005385, "grad_norm": 2.429774761199951, "learning_rate": 4.318426729154705e-05, "loss": 0.9779, "step": 439200 }, { "epoch": 6.817300082248328, "grad_norm": 2.101464033126831, "learning_rate": 4.3182715436304106e-05, "loss": 0.9613, "step": 439300 }, { "epoch": 6.818851937491271, "grad_norm": 2.6782474517822266, "learning_rate": 4.3181163581061164e-05, "loss": 0.9687, "step": 439400 }, { "epoch": 6.820403792734214, "grad_norm": 2.628763437271118, "learning_rate": 4.317961172581822e-05, "loss": 0.9579, "step": 439500 }, { "epoch": 6.821955647977156, "grad_norm": 2.305074691772461, "learning_rate": 4.317805987057528e-05, "loss": 0.9732, "step": 439600 }, { "epoch": 6.8235075032201, "grad_norm": 1.7819008827209473, "learning_rate": 4.317650801533234e-05, "loss": 0.9546, "step": 439700 }, { "epoch": 6.825059358463043, "grad_norm": 2.134722948074341, "learning_rate": 4.317495616008939e-05, "loss": 0.9604, "step": 439800 }, { "epoch": 6.826611213705986, "grad_norm": 2.064683675765991, "learning_rate": 4.3173404304846446e-05, "loss": 0.9579, "step": 439900 }, { "epoch": 6.828163068948928, "grad_norm": 1.8675159215927124, "learning_rate": 4.31718524496035e-05, "loss": 0.9522, "step": 440000 }, { "epoch": 6.8297149241918715, "grad_norm": 2.1733498573303223, "learning_rate": 4.3170300594360555e-05, "loss": 0.9696, "step": 440100 }, { "epoch": 6.831266779434815, "grad_norm": 2.1657094955444336, "learning_rate": 4.316874873911761e-05, "loss": 0.9693, "step": 440200 }, { "epoch": 6.832818634677757, "grad_norm": 2.0879313945770264, "learning_rate": 4.316719688387467e-05, "loss": 0.9507, "step": 440300 }, { "epoch": 6.8343704899207, "grad_norm": 1.994692325592041, "learning_rate": 4.316564502863173e-05, "loss": 0.9719, "step": 440400 }, { "epoch": 6.8359223451636435, "grad_norm": 2.3186893463134766, "learning_rate": 4.3164093173388786e-05, "loss": 0.9487, "step": 440500 }, { "epoch": 6.837474200406586, "grad_norm": 2.3696393966674805, "learning_rate": 4.3162541318145844e-05, "loss": 0.9543, "step": 440600 }, { "epoch": 6.839026055649529, "grad_norm": 2.0898354053497314, "learning_rate": 4.31609894629029e-05, "loss": 0.9659, "step": 440700 }, { "epoch": 6.840577910892472, "grad_norm": 2.858240842819214, "learning_rate": 4.315943760765996e-05, "loss": 0.9523, "step": 440800 }, { "epoch": 6.842129766135415, "grad_norm": 2.2911605834960938, "learning_rate": 4.315788575241702e-05, "loss": 0.9499, "step": 440900 }, { "epoch": 6.843681621378358, "grad_norm": 2.49033260345459, "learning_rate": 4.3156333897174075e-05, "loss": 0.9646, "step": 441000 }, { "epoch": 6.845233476621301, "grad_norm": 2.080137014389038, "learning_rate": 4.315478204193113e-05, "loss": 0.9512, "step": 441100 }, { "epoch": 6.846785331864243, "grad_norm": 2.1112749576568604, "learning_rate": 4.315323018668819e-05, "loss": 0.9716, "step": 441200 }, { "epoch": 6.8483371871071865, "grad_norm": 2.253533363342285, "learning_rate": 4.315167833144524e-05, "loss": 0.9776, "step": 441300 }, { "epoch": 6.84988904235013, "grad_norm": 2.4929633140563965, "learning_rate": 4.31501264762023e-05, "loss": 0.972, "step": 441400 }, { "epoch": 6.851440897593072, "grad_norm": 1.8048045635223389, "learning_rate": 4.314857462095936e-05, "loss": 0.964, "step": 441500 }, { "epoch": 6.852992752836015, "grad_norm": 1.9912177324295044, "learning_rate": 4.3147022765716414e-05, "loss": 0.9504, "step": 441600 }, { "epoch": 6.8545446080789585, "grad_norm": 2.2173547744750977, "learning_rate": 4.314547091047347e-05, "loss": 0.9557, "step": 441700 }, { "epoch": 6.856096463321902, "grad_norm": 2.3875458240509033, "learning_rate": 4.314391905523053e-05, "loss": 0.944, "step": 441800 }, { "epoch": 6.857648318564844, "grad_norm": 2.2754809856414795, "learning_rate": 4.314236719998759e-05, "loss": 0.9621, "step": 441900 }, { "epoch": 6.859200173807787, "grad_norm": 2.4385297298431396, "learning_rate": 4.3140815344744645e-05, "loss": 0.9583, "step": 442000 }, { "epoch": 6.8607520290507304, "grad_norm": 1.9793715476989746, "learning_rate": 4.31392634895017e-05, "loss": 0.9733, "step": 442100 }, { "epoch": 6.862303884293673, "grad_norm": 2.110994815826416, "learning_rate": 4.313771163425876e-05, "loss": 0.9617, "step": 442200 }, { "epoch": 6.863855739536616, "grad_norm": 2.355104446411133, "learning_rate": 4.313615977901582e-05, "loss": 0.9593, "step": 442300 }, { "epoch": 6.865407594779559, "grad_norm": 2.1024646759033203, "learning_rate": 4.3134607923772876e-05, "loss": 0.9553, "step": 442400 }, { "epoch": 6.8669594500225015, "grad_norm": 1.7912112474441528, "learning_rate": 4.3133056068529934e-05, "loss": 0.9434, "step": 442500 }, { "epoch": 6.868511305265445, "grad_norm": 2.832334280014038, "learning_rate": 4.3131504213286985e-05, "loss": 0.9479, "step": 442600 }, { "epoch": 6.870063160508388, "grad_norm": 1.8377877473831177, "learning_rate": 4.312995235804404e-05, "loss": 0.9565, "step": 442700 }, { "epoch": 6.871615015751331, "grad_norm": 2.196176290512085, "learning_rate": 4.31284005028011e-05, "loss": 0.9678, "step": 442800 }, { "epoch": 6.8731668709942735, "grad_norm": 2.351644277572632, "learning_rate": 4.312684864755816e-05, "loss": 0.9679, "step": 442900 }, { "epoch": 6.874718726237217, "grad_norm": 2.0880608558654785, "learning_rate": 4.3125296792315216e-05, "loss": 0.9696, "step": 443000 }, { "epoch": 6.87627058148016, "grad_norm": 2.1842544078826904, "learning_rate": 4.3123744937072274e-05, "loss": 0.9511, "step": 443100 }, { "epoch": 6.877822436723102, "grad_norm": 2.013806104660034, "learning_rate": 4.3122193081829325e-05, "loss": 0.944, "step": 443200 }, { "epoch": 6.879374291966045, "grad_norm": 2.2502963542938232, "learning_rate": 4.312064122658638e-05, "loss": 0.9625, "step": 443300 }, { "epoch": 6.880926147208989, "grad_norm": 2.0300559997558594, "learning_rate": 4.311908937134344e-05, "loss": 0.9589, "step": 443400 }, { "epoch": 6.882478002451931, "grad_norm": 2.4204795360565186, "learning_rate": 4.31175375161005e-05, "loss": 0.9768, "step": 443500 }, { "epoch": 6.884029857694874, "grad_norm": 2.3040518760681152, "learning_rate": 4.3115985660857556e-05, "loss": 0.9474, "step": 443600 }, { "epoch": 6.885581712937817, "grad_norm": 2.5396111011505127, "learning_rate": 4.3114433805614614e-05, "loss": 0.9639, "step": 443700 }, { "epoch": 6.88713356818076, "grad_norm": 2.6410367488861084, "learning_rate": 4.311288195037167e-05, "loss": 0.9557, "step": 443800 }, { "epoch": 6.888685423423703, "grad_norm": 2.078646659851074, "learning_rate": 4.311133009512873e-05, "loss": 0.9605, "step": 443900 }, { "epoch": 6.890237278666646, "grad_norm": 2.4347574710845947, "learning_rate": 4.310977823988579e-05, "loss": 0.9662, "step": 444000 }, { "epoch": 6.8917891339095885, "grad_norm": 2.726022720336914, "learning_rate": 4.3108226384642845e-05, "loss": 0.9647, "step": 444100 }, { "epoch": 6.893340989152532, "grad_norm": 1.7981305122375488, "learning_rate": 4.3106674529399896e-05, "loss": 0.965, "step": 444200 }, { "epoch": 6.894892844395475, "grad_norm": 2.1100399494171143, "learning_rate": 4.3105122674156953e-05, "loss": 0.9623, "step": 444300 }, { "epoch": 6.896444699638418, "grad_norm": 2.584831476211548, "learning_rate": 4.310357081891401e-05, "loss": 0.9745, "step": 444400 }, { "epoch": 6.89799655488136, "grad_norm": 2.6576380729675293, "learning_rate": 4.310201896367107e-05, "loss": 0.9648, "step": 444500 }, { "epoch": 6.899548410124304, "grad_norm": 1.8713663816452026, "learning_rate": 4.310046710842813e-05, "loss": 0.959, "step": 444600 }, { "epoch": 6.901100265367247, "grad_norm": 2.099529981613159, "learning_rate": 4.3098915253185184e-05, "loss": 0.9598, "step": 444700 }, { "epoch": 6.902652120610189, "grad_norm": 2.174076557159424, "learning_rate": 4.309736339794224e-05, "loss": 0.9506, "step": 444800 }, { "epoch": 6.904203975853132, "grad_norm": 2.304044485092163, "learning_rate": 4.30958115426993e-05, "loss": 0.9724, "step": 444900 }, { "epoch": 6.905755831096076, "grad_norm": 2.2297749519348145, "learning_rate": 4.309425968745636e-05, "loss": 0.9604, "step": 445000 }, { "epoch": 6.907307686339018, "grad_norm": 2.4866652488708496, "learning_rate": 4.3092707832213415e-05, "loss": 0.9699, "step": 445100 }, { "epoch": 6.908859541581961, "grad_norm": 2.327308416366577, "learning_rate": 4.309115597697047e-05, "loss": 0.9636, "step": 445200 }, { "epoch": 6.910411396824904, "grad_norm": 2.3069162368774414, "learning_rate": 4.308960412172753e-05, "loss": 0.9493, "step": 445300 }, { "epoch": 6.911963252067848, "grad_norm": 1.8557943105697632, "learning_rate": 4.308805226648459e-05, "loss": 0.9565, "step": 445400 }, { "epoch": 6.91351510731079, "grad_norm": 2.817706823348999, "learning_rate": 4.308650041124164e-05, "loss": 0.9686, "step": 445500 }, { "epoch": 6.915066962553733, "grad_norm": 2.60986590385437, "learning_rate": 4.30849485559987e-05, "loss": 0.9816, "step": 445600 }, { "epoch": 6.916618817796676, "grad_norm": 2.1873059272766113, "learning_rate": 4.3083396700755755e-05, "loss": 0.9683, "step": 445700 }, { "epoch": 6.918170673039619, "grad_norm": 2.1571836471557617, "learning_rate": 4.308184484551281e-05, "loss": 0.9534, "step": 445800 }, { "epoch": 6.919722528282562, "grad_norm": 2.315070390701294, "learning_rate": 4.308029299026987e-05, "loss": 0.9517, "step": 445900 }, { "epoch": 6.921274383525505, "grad_norm": 2.337183952331543, "learning_rate": 4.307874113502693e-05, "loss": 0.9658, "step": 446000 }, { "epoch": 6.922826238768447, "grad_norm": 2.563176155090332, "learning_rate": 4.3077189279783986e-05, "loss": 0.9382, "step": 446100 }, { "epoch": 6.924378094011391, "grad_norm": 2.0938093662261963, "learning_rate": 4.3075637424541044e-05, "loss": 0.9577, "step": 446200 }, { "epoch": 6.925929949254334, "grad_norm": 2.144468069076538, "learning_rate": 4.30740855692981e-05, "loss": 0.9499, "step": 446300 }, { "epoch": 6.927481804497276, "grad_norm": 2.0903782844543457, "learning_rate": 4.307253371405516e-05, "loss": 0.9626, "step": 446400 }, { "epoch": 6.929033659740219, "grad_norm": 2.0059690475463867, "learning_rate": 4.307098185881221e-05, "loss": 0.9745, "step": 446500 }, { "epoch": 6.9305855149831626, "grad_norm": 1.8438208103179932, "learning_rate": 4.306943000356927e-05, "loss": 0.9837, "step": 446600 }, { "epoch": 6.932137370226105, "grad_norm": 2.2943475246429443, "learning_rate": 4.3067878148326326e-05, "loss": 0.9596, "step": 446700 }, { "epoch": 6.933689225469048, "grad_norm": 2.3373594284057617, "learning_rate": 4.3066326293083384e-05, "loss": 0.9573, "step": 446800 }, { "epoch": 6.935241080711991, "grad_norm": 1.9522019624710083, "learning_rate": 4.306477443784044e-05, "loss": 0.9578, "step": 446900 }, { "epoch": 6.936792935954934, "grad_norm": 1.945548415184021, "learning_rate": 4.306322258259749e-05, "loss": 0.9521, "step": 447000 }, { "epoch": 6.938344791197877, "grad_norm": 2.0586867332458496, "learning_rate": 4.306167072735455e-05, "loss": 0.9589, "step": 447100 }, { "epoch": 6.93989664644082, "grad_norm": 2.0932538509368896, "learning_rate": 4.306011887211161e-05, "loss": 0.9719, "step": 447200 }, { "epoch": 6.941448501683763, "grad_norm": 2.5355565547943115, "learning_rate": 4.3058567016868666e-05, "loss": 0.9657, "step": 447300 }, { "epoch": 6.943000356926706, "grad_norm": 2.374173164367676, "learning_rate": 4.3057015161625723e-05, "loss": 0.9668, "step": 447400 }, { "epoch": 6.944552212169649, "grad_norm": 2.0808701515197754, "learning_rate": 4.305546330638278e-05, "loss": 0.9353, "step": 447500 }, { "epoch": 6.946104067412592, "grad_norm": 2.030280828475952, "learning_rate": 4.305391145113984e-05, "loss": 0.9427, "step": 447600 }, { "epoch": 6.947655922655534, "grad_norm": 2.1320133209228516, "learning_rate": 4.30523595958969e-05, "loss": 0.9676, "step": 447700 }, { "epoch": 6.9492077778984775, "grad_norm": 2.4068048000335693, "learning_rate": 4.3050807740653954e-05, "loss": 0.97, "step": 447800 }, { "epoch": 6.950759633141421, "grad_norm": 2.26277232170105, "learning_rate": 4.304925588541101e-05, "loss": 0.9914, "step": 447900 }, { "epoch": 6.952311488384364, "grad_norm": 2.5127828121185303, "learning_rate": 4.304770403016807e-05, "loss": 0.9751, "step": 448000 }, { "epoch": 6.953863343627306, "grad_norm": 2.4725217819213867, "learning_rate": 4.304615217492513e-05, "loss": 0.9739, "step": 448100 }, { "epoch": 6.9554151988702495, "grad_norm": 2.240823745727539, "learning_rate": 4.3044600319682185e-05, "loss": 0.9753, "step": 448200 }, { "epoch": 6.956967054113193, "grad_norm": 2.3082690238952637, "learning_rate": 4.3043048464439236e-05, "loss": 0.9785, "step": 448300 }, { "epoch": 6.958518909356135, "grad_norm": 2.2240993976593018, "learning_rate": 4.3041496609196294e-05, "loss": 0.964, "step": 448400 }, { "epoch": 6.960070764599078, "grad_norm": 2.344449758529663, "learning_rate": 4.303994475395335e-05, "loss": 0.9565, "step": 448500 }, { "epoch": 6.9616226198420215, "grad_norm": 2.5229413509368896, "learning_rate": 4.303839289871041e-05, "loss": 0.9606, "step": 448600 }, { "epoch": 6.963174475084964, "grad_norm": 2.295170307159424, "learning_rate": 4.303684104346747e-05, "loss": 0.9737, "step": 448700 }, { "epoch": 6.964726330327907, "grad_norm": 2.2986700534820557, "learning_rate": 4.3035289188224525e-05, "loss": 0.9665, "step": 448800 }, { "epoch": 6.96627818557085, "grad_norm": 2.0462729930877686, "learning_rate": 4.303373733298158e-05, "loss": 0.9652, "step": 448900 }, { "epoch": 6.9678300408137925, "grad_norm": 2.406602621078491, "learning_rate": 4.303218547773864e-05, "loss": 0.9565, "step": 449000 }, { "epoch": 6.969381896056736, "grad_norm": 2.062739610671997, "learning_rate": 4.30306336224957e-05, "loss": 0.9551, "step": 449100 }, { "epoch": 6.970933751299679, "grad_norm": 2.230595827102661, "learning_rate": 4.3029081767252756e-05, "loss": 0.9674, "step": 449200 }, { "epoch": 6.972485606542621, "grad_norm": 1.9942388534545898, "learning_rate": 4.3027529912009814e-05, "loss": 0.9728, "step": 449300 }, { "epoch": 6.9740374617855645, "grad_norm": 2.3794138431549072, "learning_rate": 4.302597805676687e-05, "loss": 0.966, "step": 449400 }, { "epoch": 6.975589317028508, "grad_norm": 2.215716600418091, "learning_rate": 4.302442620152393e-05, "loss": 0.9835, "step": 449500 }, { "epoch": 6.97714117227145, "grad_norm": 2.5608744621276855, "learning_rate": 4.302287434628098e-05, "loss": 0.9754, "step": 449600 }, { "epoch": 6.978693027514393, "grad_norm": 2.413003444671631, "learning_rate": 4.302132249103804e-05, "loss": 0.9318, "step": 449700 }, { "epoch": 6.9802448827573365, "grad_norm": 2.1478822231292725, "learning_rate": 4.301977063579509e-05, "loss": 0.9624, "step": 449800 }, { "epoch": 6.98179673800028, "grad_norm": 2.1789090633392334, "learning_rate": 4.301821878055215e-05, "loss": 0.9451, "step": 449900 }, { "epoch": 6.983348593243222, "grad_norm": 2.1605465412139893, "learning_rate": 4.3016666925309205e-05, "loss": 0.9582, "step": 450000 }, { "epoch": 6.984900448486165, "grad_norm": 3.094702959060669, "learning_rate": 4.301511507006626e-05, "loss": 0.9774, "step": 450100 }, { "epoch": 6.986452303729108, "grad_norm": 2.4087531566619873, "learning_rate": 4.301356321482332e-05, "loss": 0.9514, "step": 450200 }, { "epoch": 6.988004158972051, "grad_norm": 2.0128543376922607, "learning_rate": 4.301201135958038e-05, "loss": 0.9566, "step": 450300 }, { "epoch": 6.989556014214994, "grad_norm": 2.319578170776367, "learning_rate": 4.3010459504337436e-05, "loss": 0.9657, "step": 450400 }, { "epoch": 6.991107869457937, "grad_norm": 2.412431001663208, "learning_rate": 4.3008907649094493e-05, "loss": 0.9625, "step": 450500 }, { "epoch": 6.99265972470088, "grad_norm": 2.422497034072876, "learning_rate": 4.300735579385155e-05, "loss": 0.9687, "step": 450600 }, { "epoch": 6.994211579943823, "grad_norm": 3.001018762588501, "learning_rate": 4.300580393860861e-05, "loss": 0.9675, "step": 450700 }, { "epoch": 6.995763435186766, "grad_norm": 2.0682923793792725, "learning_rate": 4.300425208336567e-05, "loss": 0.958, "step": 450800 }, { "epoch": 6.997315290429709, "grad_norm": 2.0898776054382324, "learning_rate": 4.3002700228122724e-05, "loss": 0.9673, "step": 450900 }, { "epoch": 6.9988671456726514, "grad_norm": 2.364039659500122, "learning_rate": 4.300114837287978e-05, "loss": 0.9613, "step": 451000 }, { "epoch": 7.000419000915595, "grad_norm": 1.913339614868164, "learning_rate": 4.299959651763683e-05, "loss": 0.9494, "step": 451100 }, { "epoch": 7.001970856158538, "grad_norm": 2.6814982891082764, "learning_rate": 4.299804466239389e-05, "loss": 0.9445, "step": 451200 }, { "epoch": 7.00352271140148, "grad_norm": 2.17089581489563, "learning_rate": 4.299649280715095e-05, "loss": 0.9586, "step": 451300 }, { "epoch": 7.005074566644423, "grad_norm": 2.1242733001708984, "learning_rate": 4.2994940951908006e-05, "loss": 0.949, "step": 451400 }, { "epoch": 7.006626421887367, "grad_norm": 2.1148810386657715, "learning_rate": 4.2993389096665064e-05, "loss": 0.9479, "step": 451500 }, { "epoch": 7.008178277130309, "grad_norm": 2.0669050216674805, "learning_rate": 4.299183724142212e-05, "loss": 0.9489, "step": 451600 }, { "epoch": 7.009730132373252, "grad_norm": 2.0324018001556396, "learning_rate": 4.299028538617918e-05, "loss": 0.9454, "step": 451700 }, { "epoch": 7.011281987616195, "grad_norm": 1.8813183307647705, "learning_rate": 4.298873353093624e-05, "loss": 0.9759, "step": 451800 }, { "epoch": 7.012833842859138, "grad_norm": 2.76200270652771, "learning_rate": 4.2987181675693295e-05, "loss": 0.9507, "step": 451900 }, { "epoch": 7.014385698102081, "grad_norm": 2.1816463470458984, "learning_rate": 4.298562982045035e-05, "loss": 0.9451, "step": 452000 }, { "epoch": 7.015937553345024, "grad_norm": 2.047480344772339, "learning_rate": 4.298407796520741e-05, "loss": 0.9484, "step": 452100 }, { "epoch": 7.017489408587967, "grad_norm": 2.437556505203247, "learning_rate": 4.298252610996447e-05, "loss": 0.9544, "step": 452200 }, { "epoch": 7.01904126383091, "grad_norm": 2.104931354522705, "learning_rate": 4.2980974254721526e-05, "loss": 0.9447, "step": 452300 }, { "epoch": 7.020593119073853, "grad_norm": 2.3649117946624756, "learning_rate": 4.297942239947858e-05, "loss": 0.941, "step": 452400 }, { "epoch": 7.022144974316796, "grad_norm": 2.2500951290130615, "learning_rate": 4.2977870544235635e-05, "loss": 0.9468, "step": 452500 }, { "epoch": 7.023696829559738, "grad_norm": 2.018099784851074, "learning_rate": 4.297631868899269e-05, "loss": 0.9468, "step": 452600 }, { "epoch": 7.025248684802682, "grad_norm": 2.2592520713806152, "learning_rate": 4.297476683374975e-05, "loss": 0.9489, "step": 452700 }, { "epoch": 7.026800540045625, "grad_norm": 2.210730791091919, "learning_rate": 4.297321497850681e-05, "loss": 0.9601, "step": 452800 }, { "epoch": 7.028352395288567, "grad_norm": 2.3604366779327393, "learning_rate": 4.2971663123263866e-05, "loss": 0.9544, "step": 452900 }, { "epoch": 7.02990425053151, "grad_norm": 2.5241575241088867, "learning_rate": 4.297011126802092e-05, "loss": 0.9599, "step": 453000 }, { "epoch": 7.031456105774454, "grad_norm": 2.678372859954834, "learning_rate": 4.2968559412777975e-05, "loss": 0.9859, "step": 453100 }, { "epoch": 7.033007961017396, "grad_norm": 2.160053014755249, "learning_rate": 4.296700755753503e-05, "loss": 0.9674, "step": 453200 }, { "epoch": 7.034559816260339, "grad_norm": 1.896772861480713, "learning_rate": 4.296545570229209e-05, "loss": 0.9677, "step": 453300 }, { "epoch": 7.036111671503282, "grad_norm": 2.3262500762939453, "learning_rate": 4.296390384704915e-05, "loss": 0.9679, "step": 453400 }, { "epoch": 7.0376635267462255, "grad_norm": 1.9697010517120361, "learning_rate": 4.2962351991806206e-05, "loss": 0.949, "step": 453500 }, { "epoch": 7.039215381989168, "grad_norm": 2.1979610919952393, "learning_rate": 4.2960800136563263e-05, "loss": 0.973, "step": 453600 }, { "epoch": 7.040767237232111, "grad_norm": 2.1452407836914062, "learning_rate": 4.295924828132032e-05, "loss": 0.9551, "step": 453700 }, { "epoch": 7.042319092475054, "grad_norm": 2.3135056495666504, "learning_rate": 4.295769642607738e-05, "loss": 0.959, "step": 453800 }, { "epoch": 7.043870947717997, "grad_norm": 2.273770809173584, "learning_rate": 4.295614457083444e-05, "loss": 0.9623, "step": 453900 }, { "epoch": 7.04542280296094, "grad_norm": 2.3189713954925537, "learning_rate": 4.295459271559149e-05, "loss": 0.9538, "step": 454000 }, { "epoch": 7.046974658203883, "grad_norm": 2.1957008838653564, "learning_rate": 4.2953040860348545e-05, "loss": 0.9677, "step": 454100 }, { "epoch": 7.048526513446825, "grad_norm": 2.5264923572540283, "learning_rate": 4.29514890051056e-05, "loss": 0.9583, "step": 454200 }, { "epoch": 7.050078368689769, "grad_norm": 2.3417556285858154, "learning_rate": 4.294993714986266e-05, "loss": 0.9661, "step": 454300 }, { "epoch": 7.051630223932712, "grad_norm": 2.3444340229034424, "learning_rate": 4.294838529461972e-05, "loss": 0.9454, "step": 454400 }, { "epoch": 7.053182079175654, "grad_norm": 2.1473686695098877, "learning_rate": 4.2946833439376776e-05, "loss": 0.9663, "step": 454500 }, { "epoch": 7.054733934418597, "grad_norm": 2.139326572418213, "learning_rate": 4.2945281584133834e-05, "loss": 0.957, "step": 454600 }, { "epoch": 7.0562857896615405, "grad_norm": 2.1516165733337402, "learning_rate": 4.294372972889089e-05, "loss": 0.9535, "step": 454700 }, { "epoch": 7.057837644904484, "grad_norm": 2.4904158115386963, "learning_rate": 4.294217787364795e-05, "loss": 0.9421, "step": 454800 }, { "epoch": 7.059389500147426, "grad_norm": 2.3278799057006836, "learning_rate": 4.294062601840501e-05, "loss": 0.95, "step": 454900 }, { "epoch": 7.060941355390369, "grad_norm": 2.6046054363250732, "learning_rate": 4.2939074163162065e-05, "loss": 0.965, "step": 455000 }, { "epoch": 7.0624932106333125, "grad_norm": 2.363619565963745, "learning_rate": 4.293752230791912e-05, "loss": 0.9654, "step": 455100 }, { "epoch": 7.064045065876255, "grad_norm": 2.2985658645629883, "learning_rate": 4.293597045267618e-05, "loss": 0.9579, "step": 455200 }, { "epoch": 7.065596921119198, "grad_norm": 2.117795944213867, "learning_rate": 4.293441859743323e-05, "loss": 0.9368, "step": 455300 }, { "epoch": 7.067148776362141, "grad_norm": 1.7555428743362427, "learning_rate": 4.293286674219029e-05, "loss": 0.9348, "step": 455400 }, { "epoch": 7.0687006316050836, "grad_norm": 2.2877862453460693, "learning_rate": 4.293131488694735e-05, "loss": 0.9638, "step": 455500 }, { "epoch": 7.070252486848027, "grad_norm": 1.8652008771896362, "learning_rate": 4.2929763031704405e-05, "loss": 0.9378, "step": 455600 }, { "epoch": 7.07180434209097, "grad_norm": 2.0074143409729004, "learning_rate": 4.292821117646146e-05, "loss": 0.9574, "step": 455700 }, { "epoch": 7.073356197333912, "grad_norm": 2.6291511058807373, "learning_rate": 4.292665932121852e-05, "loss": 0.935, "step": 455800 }, { "epoch": 7.0749080525768555, "grad_norm": 1.9829723834991455, "learning_rate": 4.292510746597558e-05, "loss": 0.9676, "step": 455900 }, { "epoch": 7.076459907819799, "grad_norm": 2.560279369354248, "learning_rate": 4.2923555610732636e-05, "loss": 0.9617, "step": 456000 }, { "epoch": 7.078011763062742, "grad_norm": 2.0156502723693848, "learning_rate": 4.2922003755489694e-05, "loss": 0.9656, "step": 456100 }, { "epoch": 7.079563618305684, "grad_norm": 2.2178444862365723, "learning_rate": 4.292045190024675e-05, "loss": 0.9655, "step": 456200 }, { "epoch": 7.0811154735486275, "grad_norm": 1.987246036529541, "learning_rate": 4.29189000450038e-05, "loss": 0.9515, "step": 456300 }, { "epoch": 7.082667328791571, "grad_norm": 2.313192129135132, "learning_rate": 4.291734818976086e-05, "loss": 0.9611, "step": 456400 }, { "epoch": 7.084219184034513, "grad_norm": 1.8505592346191406, "learning_rate": 4.291579633451792e-05, "loss": 0.9526, "step": 456500 }, { "epoch": 7.085771039277456, "grad_norm": 1.95948326587677, "learning_rate": 4.2914244479274976e-05, "loss": 0.9511, "step": 456600 }, { "epoch": 7.087322894520399, "grad_norm": 2.267521381378174, "learning_rate": 4.2912692624032033e-05, "loss": 0.9578, "step": 456700 }, { "epoch": 7.088874749763342, "grad_norm": 2.2177438735961914, "learning_rate": 4.2911140768789084e-05, "loss": 0.9555, "step": 456800 }, { "epoch": 7.090426605006285, "grad_norm": 2.168781280517578, "learning_rate": 4.290958891354614e-05, "loss": 0.9789, "step": 456900 }, { "epoch": 7.091978460249228, "grad_norm": 2.333838701248169, "learning_rate": 4.29080370583032e-05, "loss": 0.9346, "step": 457000 }, { "epoch": 7.0935303154921705, "grad_norm": 2.0888421535491943, "learning_rate": 4.290648520306026e-05, "loss": 0.9695, "step": 457100 }, { "epoch": 7.095082170735114, "grad_norm": 2.3843302726745605, "learning_rate": 4.2904933347817315e-05, "loss": 0.9627, "step": 457200 }, { "epoch": 7.096634025978057, "grad_norm": 1.9824546575546265, "learning_rate": 4.290338149257437e-05, "loss": 0.9751, "step": 457300 }, { "epoch": 7.098185881221, "grad_norm": 2.068843364715576, "learning_rate": 4.290182963733143e-05, "loss": 0.9623, "step": 457400 }, { "epoch": 7.0997377364639425, "grad_norm": 2.5886178016662598, "learning_rate": 4.290027778208849e-05, "loss": 0.9735, "step": 457500 }, { "epoch": 7.101289591706886, "grad_norm": 2.1320242881774902, "learning_rate": 4.2898725926845546e-05, "loss": 0.9405, "step": 457600 }, { "epoch": 7.102841446949829, "grad_norm": 2.2554919719696045, "learning_rate": 4.2897174071602604e-05, "loss": 0.9605, "step": 457700 }, { "epoch": 7.104393302192771, "grad_norm": 1.943920373916626, "learning_rate": 4.289562221635966e-05, "loss": 0.9636, "step": 457800 }, { "epoch": 7.105945157435714, "grad_norm": 2.3253748416900635, "learning_rate": 4.289407036111672e-05, "loss": 0.9788, "step": 457900 }, { "epoch": 7.107497012678658, "grad_norm": 2.5349743366241455, "learning_rate": 4.289251850587378e-05, "loss": 0.9612, "step": 458000 }, { "epoch": 7.1090488679216, "grad_norm": 2.255227565765381, "learning_rate": 4.289096665063083e-05, "loss": 0.927, "step": 458100 }, { "epoch": 7.110600723164543, "grad_norm": 2.362051010131836, "learning_rate": 4.2889414795387886e-05, "loss": 0.9808, "step": 458200 }, { "epoch": 7.112152578407486, "grad_norm": 2.2470040321350098, "learning_rate": 4.2887862940144944e-05, "loss": 0.9588, "step": 458300 }, { "epoch": 7.113704433650429, "grad_norm": 2.225729465484619, "learning_rate": 4.2886311084902e-05, "loss": 0.953, "step": 458400 }, { "epoch": 7.115256288893372, "grad_norm": 2.5031213760375977, "learning_rate": 4.288475922965906e-05, "loss": 0.9447, "step": 458500 }, { "epoch": 7.116808144136315, "grad_norm": 2.101773500442505, "learning_rate": 4.288320737441612e-05, "loss": 0.9688, "step": 458600 }, { "epoch": 7.118359999379258, "grad_norm": 1.8641526699066162, "learning_rate": 4.2881655519173175e-05, "loss": 0.9641, "step": 458700 }, { "epoch": 7.119911854622201, "grad_norm": 2.222161293029785, "learning_rate": 4.288010366393023e-05, "loss": 0.9718, "step": 458800 }, { "epoch": 7.121463709865144, "grad_norm": 2.2137527465820312, "learning_rate": 4.287855180868729e-05, "loss": 0.9474, "step": 458900 }, { "epoch": 7.123015565108087, "grad_norm": 2.1790454387664795, "learning_rate": 4.287699995344435e-05, "loss": 0.9333, "step": 459000 }, { "epoch": 7.124567420351029, "grad_norm": 2.2381160259246826, "learning_rate": 4.2875448098201406e-05, "loss": 0.9751, "step": 459100 }, { "epoch": 7.126119275593973, "grad_norm": 2.3690028190612793, "learning_rate": 4.2873896242958464e-05, "loss": 0.9811, "step": 459200 }, { "epoch": 7.127671130836916, "grad_norm": 2.355292558670044, "learning_rate": 4.287234438771552e-05, "loss": 0.9444, "step": 459300 }, { "epoch": 7.129222986079858, "grad_norm": 2.025624990463257, "learning_rate": 4.287079253247257e-05, "loss": 0.9545, "step": 459400 }, { "epoch": 7.130774841322801, "grad_norm": 2.651099681854248, "learning_rate": 4.286924067722963e-05, "loss": 0.9552, "step": 459500 }, { "epoch": 7.132326696565745, "grad_norm": 2.292207717895508, "learning_rate": 4.286768882198669e-05, "loss": 0.9602, "step": 459600 }, { "epoch": 7.133878551808687, "grad_norm": 2.2821760177612305, "learning_rate": 4.286613696674374e-05, "loss": 0.9359, "step": 459700 }, { "epoch": 7.13543040705163, "grad_norm": 2.2225759029388428, "learning_rate": 4.28645851115008e-05, "loss": 0.9521, "step": 459800 }, { "epoch": 7.136982262294573, "grad_norm": 2.255540370941162, "learning_rate": 4.2863033256257854e-05, "loss": 0.9668, "step": 459900 }, { "epoch": 7.138534117537516, "grad_norm": 2.313821792602539, "learning_rate": 4.286148140101491e-05, "loss": 0.9569, "step": 460000 }, { "epoch": 7.140085972780459, "grad_norm": 2.455730676651001, "learning_rate": 4.285992954577197e-05, "loss": 0.9472, "step": 460100 }, { "epoch": 7.141637828023402, "grad_norm": 2.7190046310424805, "learning_rate": 4.285837769052903e-05, "loss": 0.953, "step": 460200 }, { "epoch": 7.143189683266345, "grad_norm": 2.2392213344573975, "learning_rate": 4.2856825835286085e-05, "loss": 0.9594, "step": 460300 }, { "epoch": 7.144741538509288, "grad_norm": 2.6095943450927734, "learning_rate": 4.285527398004314e-05, "loss": 0.9718, "step": 460400 }, { "epoch": 7.146293393752231, "grad_norm": 1.970005989074707, "learning_rate": 4.28537221248002e-05, "loss": 0.9666, "step": 460500 }, { "epoch": 7.147845248995174, "grad_norm": 2.335620403289795, "learning_rate": 4.285217026955726e-05, "loss": 0.9588, "step": 460600 }, { "epoch": 7.149397104238116, "grad_norm": 2.791313648223877, "learning_rate": 4.2850618414314316e-05, "loss": 0.9664, "step": 460700 }, { "epoch": 7.15094895948106, "grad_norm": 2.0274670124053955, "learning_rate": 4.2849066559071374e-05, "loss": 0.9676, "step": 460800 }, { "epoch": 7.152500814724003, "grad_norm": 2.9864020347595215, "learning_rate": 4.284751470382843e-05, "loss": 0.9761, "step": 460900 }, { "epoch": 7.154052669966945, "grad_norm": 1.9967398643493652, "learning_rate": 4.284596284858548e-05, "loss": 0.9525, "step": 461000 }, { "epoch": 7.155604525209888, "grad_norm": 2.2921247482299805, "learning_rate": 4.284441099334254e-05, "loss": 0.9651, "step": 461100 }, { "epoch": 7.1571563804528315, "grad_norm": 2.121875762939453, "learning_rate": 4.28428591380996e-05, "loss": 0.9486, "step": 461200 }, { "epoch": 7.158708235695775, "grad_norm": 2.1138126850128174, "learning_rate": 4.2841307282856656e-05, "loss": 0.9671, "step": 461300 }, { "epoch": 7.160260090938717, "grad_norm": 2.4395244121551514, "learning_rate": 4.2839755427613714e-05, "loss": 0.9476, "step": 461400 }, { "epoch": 7.16181194618166, "grad_norm": 1.7443664073944092, "learning_rate": 4.283820357237077e-05, "loss": 0.923, "step": 461500 }, { "epoch": 7.1633638014246035, "grad_norm": 2.550983428955078, "learning_rate": 4.283665171712783e-05, "loss": 0.9596, "step": 461600 }, { "epoch": 7.164915656667546, "grad_norm": 2.354785203933716, "learning_rate": 4.283509986188489e-05, "loss": 0.9641, "step": 461700 }, { "epoch": 7.166467511910489, "grad_norm": 2.1780269145965576, "learning_rate": 4.2833548006641945e-05, "loss": 0.9571, "step": 461800 }, { "epoch": 7.168019367153432, "grad_norm": 2.2425289154052734, "learning_rate": 4.2831996151399e-05, "loss": 0.9576, "step": 461900 }, { "epoch": 7.169571222396375, "grad_norm": 2.303395986557007, "learning_rate": 4.283044429615606e-05, "loss": 0.9689, "step": 462000 }, { "epoch": 7.171123077639318, "grad_norm": 2.4535417556762695, "learning_rate": 4.282889244091312e-05, "loss": 0.9642, "step": 462100 }, { "epoch": 7.172674932882261, "grad_norm": 2.597946882247925, "learning_rate": 4.2827340585670176e-05, "loss": 0.95, "step": 462200 }, { "epoch": 7.174226788125203, "grad_norm": 2.0032639503479004, "learning_rate": 4.282578873042723e-05, "loss": 0.9519, "step": 462300 }, { "epoch": 7.1757786433681465, "grad_norm": 2.6025397777557373, "learning_rate": 4.2824236875184285e-05, "loss": 0.9444, "step": 462400 }, { "epoch": 7.17733049861109, "grad_norm": 2.4813663959503174, "learning_rate": 4.282268501994134e-05, "loss": 0.9633, "step": 462500 }, { "epoch": 7.178882353854032, "grad_norm": 2.078444242477417, "learning_rate": 4.28211331646984e-05, "loss": 0.9542, "step": 462600 }, { "epoch": 7.180434209096975, "grad_norm": 2.0881171226501465, "learning_rate": 4.281958130945546e-05, "loss": 0.951, "step": 462700 }, { "epoch": 7.1819860643399185, "grad_norm": 2.452178955078125, "learning_rate": 4.281802945421251e-05, "loss": 0.9504, "step": 462800 }, { "epoch": 7.183537919582862, "grad_norm": 2.191159248352051, "learning_rate": 4.281647759896957e-05, "loss": 0.9551, "step": 462900 }, { "epoch": 7.185089774825804, "grad_norm": 2.599208116531372, "learning_rate": 4.2814925743726624e-05, "loss": 0.9507, "step": 463000 }, { "epoch": 7.186641630068747, "grad_norm": 2.333869457244873, "learning_rate": 4.281337388848368e-05, "loss": 0.9521, "step": 463100 }, { "epoch": 7.1881934853116904, "grad_norm": 2.291795015335083, "learning_rate": 4.281182203324074e-05, "loss": 0.949, "step": 463200 }, { "epoch": 7.189745340554633, "grad_norm": 2.338108539581299, "learning_rate": 4.28102701779978e-05, "loss": 0.9403, "step": 463300 }, { "epoch": 7.191297195797576, "grad_norm": 2.32407546043396, "learning_rate": 4.2808718322754855e-05, "loss": 0.9594, "step": 463400 }, { "epoch": 7.192849051040519, "grad_norm": 2.1662728786468506, "learning_rate": 4.280716646751191e-05, "loss": 0.9642, "step": 463500 }, { "epoch": 7.1944009062834615, "grad_norm": 2.087822675704956, "learning_rate": 4.280561461226897e-05, "loss": 0.9596, "step": 463600 }, { "epoch": 7.195952761526405, "grad_norm": 1.9519245624542236, "learning_rate": 4.280406275702603e-05, "loss": 0.9539, "step": 463700 }, { "epoch": 7.197504616769348, "grad_norm": 2.13206148147583, "learning_rate": 4.280251090178308e-05, "loss": 0.9594, "step": 463800 }, { "epoch": 7.19905647201229, "grad_norm": 2.1854920387268066, "learning_rate": 4.280095904654014e-05, "loss": 0.9621, "step": 463900 }, { "epoch": 7.2006083272552335, "grad_norm": 2.114339828491211, "learning_rate": 4.2799407191297195e-05, "loss": 0.9519, "step": 464000 }, { "epoch": 7.202160182498177, "grad_norm": 2.2599399089813232, "learning_rate": 4.279785533605425e-05, "loss": 0.9689, "step": 464100 }, { "epoch": 7.20371203774112, "grad_norm": 2.068944215774536, "learning_rate": 4.279630348081131e-05, "loss": 0.9428, "step": 464200 }, { "epoch": 7.205263892984062, "grad_norm": 2.29356050491333, "learning_rate": 4.279475162556837e-05, "loss": 0.9715, "step": 464300 }, { "epoch": 7.206815748227005, "grad_norm": 2.2453391551971436, "learning_rate": 4.2793199770325426e-05, "loss": 0.9506, "step": 464400 }, { "epoch": 7.208367603469949, "grad_norm": 1.9269475936889648, "learning_rate": 4.2791647915082484e-05, "loss": 0.9453, "step": 464500 }, { "epoch": 7.209919458712891, "grad_norm": 1.9869329929351807, "learning_rate": 4.279009605983954e-05, "loss": 0.9686, "step": 464600 }, { "epoch": 7.211471313955834, "grad_norm": 2.1967082023620605, "learning_rate": 4.27885442045966e-05, "loss": 0.9489, "step": 464700 }, { "epoch": 7.213023169198777, "grad_norm": 2.396719455718994, "learning_rate": 4.278699234935366e-05, "loss": 0.9577, "step": 464800 }, { "epoch": 7.21457502444172, "grad_norm": 2.17808198928833, "learning_rate": 4.2785440494110715e-05, "loss": 0.9668, "step": 464900 }, { "epoch": 7.216126879684663, "grad_norm": 1.9736754894256592, "learning_rate": 4.278388863886777e-05, "loss": 0.954, "step": 465000 }, { "epoch": 7.217678734927606, "grad_norm": 2.349987030029297, "learning_rate": 4.2782336783624824e-05, "loss": 0.9665, "step": 465100 }, { "epoch": 7.2192305901705485, "grad_norm": 2.635378360748291, "learning_rate": 4.278078492838188e-05, "loss": 0.9681, "step": 465200 }, { "epoch": 7.220782445413492, "grad_norm": 2.589090347290039, "learning_rate": 4.277923307313894e-05, "loss": 0.9449, "step": 465300 }, { "epoch": 7.222334300656435, "grad_norm": 2.449859619140625, "learning_rate": 4.2777681217896e-05, "loss": 0.9462, "step": 465400 }, { "epoch": 7.223886155899378, "grad_norm": 1.8678507804870605, "learning_rate": 4.2776129362653055e-05, "loss": 0.9416, "step": 465500 }, { "epoch": 7.22543801114232, "grad_norm": 2.215139627456665, "learning_rate": 4.277457750741011e-05, "loss": 0.9444, "step": 465600 }, { "epoch": 7.226989866385264, "grad_norm": 1.9364641904830933, "learning_rate": 4.277302565216717e-05, "loss": 0.9492, "step": 465700 }, { "epoch": 7.228541721628207, "grad_norm": 2.338667392730713, "learning_rate": 4.277147379692423e-05, "loss": 0.9456, "step": 465800 }, { "epoch": 7.230093576871149, "grad_norm": 2.1778111457824707, "learning_rate": 4.2769921941681286e-05, "loss": 0.953, "step": 465900 }, { "epoch": 7.231645432114092, "grad_norm": 2.0356953144073486, "learning_rate": 4.276837008643834e-05, "loss": 0.9602, "step": 466000 }, { "epoch": 7.233197287357036, "grad_norm": 2.1283013820648193, "learning_rate": 4.2766818231195394e-05, "loss": 0.9508, "step": 466100 }, { "epoch": 7.234749142599978, "grad_norm": 2.4937241077423096, "learning_rate": 4.276526637595245e-05, "loss": 0.9532, "step": 466200 }, { "epoch": 7.236300997842921, "grad_norm": 2.469480276107788, "learning_rate": 4.276371452070951e-05, "loss": 0.9607, "step": 466300 }, { "epoch": 7.237852853085864, "grad_norm": 1.7941803932189941, "learning_rate": 4.276216266546657e-05, "loss": 0.9565, "step": 466400 }, { "epoch": 7.239404708328807, "grad_norm": 2.239121198654175, "learning_rate": 4.2760610810223625e-05, "loss": 0.9771, "step": 466500 }, { "epoch": 7.24095656357175, "grad_norm": 2.150179862976074, "learning_rate": 4.2759058954980676e-05, "loss": 0.9421, "step": 466600 }, { "epoch": 7.242508418814693, "grad_norm": 2.1357245445251465, "learning_rate": 4.2757507099737734e-05, "loss": 0.945, "step": 466700 }, { "epoch": 7.244060274057636, "grad_norm": 2.290334463119507, "learning_rate": 4.275595524449479e-05, "loss": 0.9515, "step": 466800 }, { "epoch": 7.245612129300579, "grad_norm": 1.9361125230789185, "learning_rate": 4.275440338925185e-05, "loss": 0.9516, "step": 466900 }, { "epoch": 7.247163984543522, "grad_norm": 2.115213394165039, "learning_rate": 4.275285153400891e-05, "loss": 0.9475, "step": 467000 }, { "epoch": 7.248715839786465, "grad_norm": 2.2862818241119385, "learning_rate": 4.2751299678765965e-05, "loss": 0.9559, "step": 467100 }, { "epoch": 7.250267695029407, "grad_norm": 2.427246570587158, "learning_rate": 4.274974782352302e-05, "loss": 0.9839, "step": 467200 }, { "epoch": 7.251819550272351, "grad_norm": 2.161555528640747, "learning_rate": 4.274819596828008e-05, "loss": 0.9609, "step": 467300 }, { "epoch": 7.253371405515294, "grad_norm": 1.9250975847244263, "learning_rate": 4.274664411303714e-05, "loss": 0.9483, "step": 467400 }, { "epoch": 7.254923260758236, "grad_norm": 2.2785122394561768, "learning_rate": 4.2745092257794196e-05, "loss": 0.9362, "step": 467500 }, { "epoch": 7.256475116001179, "grad_norm": 2.565977096557617, "learning_rate": 4.2743540402551254e-05, "loss": 0.9722, "step": 467600 }, { "epoch": 7.2580269712441225, "grad_norm": 2.245760917663574, "learning_rate": 4.274198854730831e-05, "loss": 0.9745, "step": 467700 }, { "epoch": 7.259578826487065, "grad_norm": 2.2956316471099854, "learning_rate": 4.274043669206537e-05, "loss": 0.964, "step": 467800 }, { "epoch": 7.261130681730008, "grad_norm": 2.3138315677642822, "learning_rate": 4.273888483682242e-05, "loss": 0.9591, "step": 467900 }, { "epoch": 7.262682536972951, "grad_norm": 2.2791948318481445, "learning_rate": 4.273733298157948e-05, "loss": 0.9391, "step": 468000 }, { "epoch": 7.2642343922158945, "grad_norm": 2.307992458343506, "learning_rate": 4.2735781126336536e-05, "loss": 0.9594, "step": 468100 }, { "epoch": 7.265786247458837, "grad_norm": 2.3418588638305664, "learning_rate": 4.2734229271093594e-05, "loss": 0.951, "step": 468200 }, { "epoch": 7.26733810270178, "grad_norm": 2.0843400955200195, "learning_rate": 4.273267741585065e-05, "loss": 0.9619, "step": 468300 }, { "epoch": 7.268889957944723, "grad_norm": 2.0552427768707275, "learning_rate": 4.273112556060771e-05, "loss": 0.9624, "step": 468400 }, { "epoch": 7.270441813187666, "grad_norm": 2.0975191593170166, "learning_rate": 4.272957370536477e-05, "loss": 0.9532, "step": 468500 }, { "epoch": 7.271993668430609, "grad_norm": 2.250615119934082, "learning_rate": 4.2728021850121825e-05, "loss": 0.9532, "step": 468600 }, { "epoch": 7.273545523673552, "grad_norm": 1.945852518081665, "learning_rate": 4.272646999487888e-05, "loss": 0.9457, "step": 468700 }, { "epoch": 7.275097378916494, "grad_norm": 1.880228877067566, "learning_rate": 4.272491813963594e-05, "loss": 0.956, "step": 468800 }, { "epoch": 7.2766492341594375, "grad_norm": 2.206754207611084, "learning_rate": 4.2723366284393e-05, "loss": 0.9604, "step": 468900 }, { "epoch": 7.278201089402381, "grad_norm": 1.9625083208084106, "learning_rate": 4.2721814429150056e-05, "loss": 0.9507, "step": 469000 }, { "epoch": 7.279752944645323, "grad_norm": 1.8727223873138428, "learning_rate": 4.2720262573907113e-05, "loss": 0.9683, "step": 469100 }, { "epoch": 7.281304799888266, "grad_norm": 1.9759860038757324, "learning_rate": 4.2718710718664164e-05, "loss": 0.9434, "step": 469200 }, { "epoch": 7.2828566551312095, "grad_norm": 1.8300950527191162, "learning_rate": 4.271715886342122e-05, "loss": 0.9447, "step": 469300 }, { "epoch": 7.284408510374153, "grad_norm": 2.4198191165924072, "learning_rate": 4.271560700817828e-05, "loss": 0.9581, "step": 469400 }, { "epoch": 7.285960365617095, "grad_norm": 2.22871732711792, "learning_rate": 4.271405515293533e-05, "loss": 0.978, "step": 469500 }, { "epoch": 7.287512220860038, "grad_norm": 2.2410459518432617, "learning_rate": 4.271250329769239e-05, "loss": 0.9611, "step": 469600 }, { "epoch": 7.2890640761029815, "grad_norm": 2.4857144355773926, "learning_rate": 4.2710951442449446e-05, "loss": 0.9556, "step": 469700 }, { "epoch": 7.290615931345924, "grad_norm": 2.3882670402526855, "learning_rate": 4.2709399587206504e-05, "loss": 0.9589, "step": 469800 }, { "epoch": 7.292167786588867, "grad_norm": 1.9593956470489502, "learning_rate": 4.270784773196356e-05, "loss": 0.9565, "step": 469900 }, { "epoch": 7.29371964183181, "grad_norm": 2.2318294048309326, "learning_rate": 4.270629587672062e-05, "loss": 0.9595, "step": 470000 }, { "epoch": 7.2952714970747525, "grad_norm": 2.3668556213378906, "learning_rate": 4.270474402147768e-05, "loss": 0.9644, "step": 470100 }, { "epoch": 7.296823352317696, "grad_norm": 1.8320237398147583, "learning_rate": 4.2703192166234735e-05, "loss": 0.9551, "step": 470200 }, { "epoch": 7.298375207560639, "grad_norm": 2.304394006729126, "learning_rate": 4.270164031099179e-05, "loss": 0.9422, "step": 470300 }, { "epoch": 7.299927062803581, "grad_norm": 1.8527318239212036, "learning_rate": 4.270008845574885e-05, "loss": 0.9504, "step": 470400 }, { "epoch": 7.3014789180465245, "grad_norm": 1.8852194547653198, "learning_rate": 4.269853660050591e-05, "loss": 0.9305, "step": 470500 } ], "logging_steps": 100, "max_steps": 3221950, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1888326280962714e+19, "train_batch_size": 96, "trial_name": null, "trial_params": null }