diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,32969 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.3014789180465245, + "eval_steps": 500, + "global_step": 470500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015518552429429383, + "grad_norm": 3.45402455329895, + "learning_rate": 4.999846366330949e-05, + "loss": 2.2311, + "step": 100 + }, + { + "epoch": 0.0031037104858858766, + "grad_norm": 3.8429577350616455, + "learning_rate": 4.9996911808066544e-05, + "loss": 1.9447, + "step": 200 + }, + { + "epoch": 0.0046555657288288145, + "grad_norm": 2.976985454559326, + "learning_rate": 4.99953599528236e-05, + "loss": 1.8563, + "step": 300 + }, + { + "epoch": 0.006207420971771753, + "grad_norm": 3.213278293609619, + "learning_rate": 4.999380809758066e-05, + "loss": 1.8092, + "step": 400 + }, + { + "epoch": 0.007759276214714691, + "grad_norm": 3.1327388286590576, + "learning_rate": 4.999225624233772e-05, + "loss": 1.7633, + "step": 500 + }, + { + "epoch": 0.009311131457657629, + "grad_norm": 2.9328699111938477, + "learning_rate": 4.9990704387094775e-05, + "loss": 1.7589, + "step": 600 + }, + { + "epoch": 0.010862986700600569, + "grad_norm": 3.349161148071289, + "learning_rate": 4.998915253185183e-05, + "loss": 1.7504, + "step": 700 + }, + { + "epoch": 0.012414841943543507, + "grad_norm": 3.1088201999664307, + "learning_rate": 4.9987600676608884e-05, + "loss": 1.6966, + "step": 800 + }, + { + "epoch": 0.013966697186486444, + "grad_norm": 3.4292752742767334, + "learning_rate": 4.998604882136594e-05, + "loss": 1.7051, + "step": 900 + }, + { + "epoch": 0.015518552429429382, + "grad_norm": 2.640695571899414, + "learning_rate": 4.9984496966123e-05, + "loss": 1.6682, + "step": 1000 + }, + { + "epoch": 0.01707040767237232, + "grad_norm": 2.923456907272339, + "learning_rate": 4.998294511088006e-05, + "loss": 1.6522, + "step": 1100 + }, + { + "epoch": 0.018622262915315258, + "grad_norm": 3.0905439853668213, + "learning_rate": 4.9981393255637115e-05, + "loss": 1.6395, + "step": 1200 + }, + { + "epoch": 0.020174118158258196, + "grad_norm": 3.6706044673919678, + "learning_rate": 4.997984140039417e-05, + "loss": 1.6346, + "step": 1300 + }, + { + "epoch": 0.021725973401201137, + "grad_norm": 2.609037399291992, + "learning_rate": 4.997828954515123e-05, + "loss": 1.607, + "step": 1400 + }, + { + "epoch": 0.023277828644144075, + "grad_norm": 2.7228541374206543, + "learning_rate": 4.997673768990829e-05, + "loss": 1.6165, + "step": 1500 + }, + { + "epoch": 0.024829683887087013, + "grad_norm": 2.900372266769409, + "learning_rate": 4.9975185834665346e-05, + "loss": 1.6259, + "step": 1600 + }, + { + "epoch": 0.02638153913002995, + "grad_norm": 2.7823703289031982, + "learning_rate": 4.9973633979422404e-05, + "loss": 1.5951, + "step": 1700 + }, + { + "epoch": 0.02793339437297289, + "grad_norm": 2.436638355255127, + "learning_rate": 4.997208212417946e-05, + "loss": 1.5733, + "step": 1800 + }, + { + "epoch": 0.029485249615915827, + "grad_norm": 2.976652145385742, + "learning_rate": 4.997053026893652e-05, + "loss": 1.5745, + "step": 1900 + }, + { + "epoch": 0.031037104858858765, + "grad_norm": 2.9422554969787598, + "learning_rate": 4.996897841369358e-05, + "loss": 1.5691, + "step": 2000 + }, + { + "epoch": 0.032588960101801706, + "grad_norm": 2.7876274585723877, + "learning_rate": 4.996742655845063e-05, + "loss": 1.5458, + "step": 2100 + }, + { + "epoch": 0.03414081534474464, + "grad_norm": 2.8115477561950684, + "learning_rate": 4.9965874703207686e-05, + "loss": 1.5562, + "step": 2200 + }, + { + "epoch": 0.03569267058768758, + "grad_norm": 2.705639123916626, + "learning_rate": 4.996432284796474e-05, + "loss": 1.5358, + "step": 2300 + }, + { + "epoch": 0.037244525830630516, + "grad_norm": 3.510756492614746, + "learning_rate": 4.99627709927218e-05, + "loss": 1.5405, + "step": 2400 + }, + { + "epoch": 0.03879638107357346, + "grad_norm": 2.5096957683563232, + "learning_rate": 4.996121913747886e-05, + "loss": 1.5489, + "step": 2500 + }, + { + "epoch": 0.04034823631651639, + "grad_norm": 2.9852073192596436, + "learning_rate": 4.9959667282235917e-05, + "loss": 1.5493, + "step": 2600 + }, + { + "epoch": 0.04190009155945933, + "grad_norm": 2.7485008239746094, + "learning_rate": 4.9958115426992974e-05, + "loss": 1.5243, + "step": 2700 + }, + { + "epoch": 0.043451946802402275, + "grad_norm": 3.1658973693847656, + "learning_rate": 4.995656357175003e-05, + "loss": 1.5096, + "step": 2800 + }, + { + "epoch": 0.04500380204534521, + "grad_norm": 3.20015549659729, + "learning_rate": 4.995501171650709e-05, + "loss": 1.496, + "step": 2900 + }, + { + "epoch": 0.04655565728828815, + "grad_norm": 3.0995285511016846, + "learning_rate": 4.995345986126415e-05, + "loss": 1.5095, + "step": 3000 + }, + { + "epoch": 0.048107512531231085, + "grad_norm": 3.0172009468078613, + "learning_rate": 4.9951908006021205e-05, + "loss": 1.4852, + "step": 3100 + }, + { + "epoch": 0.049659367774174026, + "grad_norm": 3.3135604858398438, + "learning_rate": 4.995035615077826e-05, + "loss": 1.4845, + "step": 3200 + }, + { + "epoch": 0.05121122301711696, + "grad_norm": 3.2055881023406982, + "learning_rate": 4.994880429553532e-05, + "loss": 1.5104, + "step": 3300 + }, + { + "epoch": 0.0527630782600599, + "grad_norm": 2.850691080093384, + "learning_rate": 4.994725244029237e-05, + "loss": 1.4877, + "step": 3400 + }, + { + "epoch": 0.05431493350300284, + "grad_norm": 2.659492015838623, + "learning_rate": 4.994570058504943e-05, + "loss": 1.496, + "step": 3500 + }, + { + "epoch": 0.05586678874594578, + "grad_norm": 3.2706923484802246, + "learning_rate": 4.994414872980648e-05, + "loss": 1.5075, + "step": 3600 + }, + { + "epoch": 0.05741864398888872, + "grad_norm": 2.351057529449463, + "learning_rate": 4.994259687456354e-05, + "loss": 1.4646, + "step": 3700 + }, + { + "epoch": 0.058970499231831654, + "grad_norm": 2.7702980041503906, + "learning_rate": 4.9941045019320596e-05, + "loss": 1.4716, + "step": 3800 + }, + { + "epoch": 0.060522354474774595, + "grad_norm": 2.2205615043640137, + "learning_rate": 4.9939493164077654e-05, + "loss": 1.4822, + "step": 3900 + }, + { + "epoch": 0.06207420971771753, + "grad_norm": 2.963843584060669, + "learning_rate": 4.993794130883471e-05, + "loss": 1.4838, + "step": 4000 + }, + { + "epoch": 0.06362606496066046, + "grad_norm": 2.826446294784546, + "learning_rate": 4.993638945359177e-05, + "loss": 1.4826, + "step": 4100 + }, + { + "epoch": 0.06517792020360341, + "grad_norm": 2.8247299194335938, + "learning_rate": 4.993483759834883e-05, + "loss": 1.4548, + "step": 4200 + }, + { + "epoch": 0.06672977544654635, + "grad_norm": 2.6167891025543213, + "learning_rate": 4.9933285743105885e-05, + "loss": 1.4484, + "step": 4300 + }, + { + "epoch": 0.06828163068948928, + "grad_norm": 3.185368061065674, + "learning_rate": 4.993173388786294e-05, + "loss": 1.451, + "step": 4400 + }, + { + "epoch": 0.06983348593243223, + "grad_norm": 3.5228018760681152, + "learning_rate": 4.993018203262e-05, + "loss": 1.45, + "step": 4500 + }, + { + "epoch": 0.07138534117537516, + "grad_norm": 2.5988821983337402, + "learning_rate": 4.992863017737706e-05, + "loss": 1.4408, + "step": 4600 + }, + { + "epoch": 0.0729371964183181, + "grad_norm": 2.6914050579071045, + "learning_rate": 4.9927078322134116e-05, + "loss": 1.4487, + "step": 4700 + }, + { + "epoch": 0.07448905166126103, + "grad_norm": 2.8541464805603027, + "learning_rate": 4.9925526466891174e-05, + "loss": 1.4307, + "step": 4800 + }, + { + "epoch": 0.07604090690420398, + "grad_norm": 2.572587013244629, + "learning_rate": 4.9923974611648225e-05, + "loss": 1.4524, + "step": 4900 + }, + { + "epoch": 0.07759276214714692, + "grad_norm": 3.3482115268707275, + "learning_rate": 4.992242275640528e-05, + "loss": 1.4292, + "step": 5000 + }, + { + "epoch": 0.07914461739008985, + "grad_norm": 2.5461199283599854, + "learning_rate": 4.992087090116234e-05, + "loss": 1.4228, + "step": 5100 + }, + { + "epoch": 0.08069647263303278, + "grad_norm": 2.8455793857574463, + "learning_rate": 4.99193190459194e-05, + "loss": 1.4195, + "step": 5200 + }, + { + "epoch": 0.08224832787597573, + "grad_norm": 2.7463977336883545, + "learning_rate": 4.9917767190676456e-05, + "loss": 1.439, + "step": 5300 + }, + { + "epoch": 0.08380018311891867, + "grad_norm": 2.151210308074951, + "learning_rate": 4.991621533543351e-05, + "loss": 1.4169, + "step": 5400 + }, + { + "epoch": 0.0853520383618616, + "grad_norm": 2.944941520690918, + "learning_rate": 4.991466348019057e-05, + "loss": 1.4379, + "step": 5500 + }, + { + "epoch": 0.08690389360480455, + "grad_norm": 3.146421194076538, + "learning_rate": 4.991311162494763e-05, + "loss": 1.43, + "step": 5600 + }, + { + "epoch": 0.08845574884774748, + "grad_norm": 2.7587380409240723, + "learning_rate": 4.9911559769704687e-05, + "loss": 1.4324, + "step": 5700 + }, + { + "epoch": 0.09000760409069042, + "grad_norm": 2.6391189098358154, + "learning_rate": 4.9910007914461744e-05, + "loss": 1.439, + "step": 5800 + }, + { + "epoch": 0.09155945933363335, + "grad_norm": 3.000552177429199, + "learning_rate": 4.99084560592188e-05, + "loss": 1.4203, + "step": 5900 + }, + { + "epoch": 0.0931113145765763, + "grad_norm": 3.165254831314087, + "learning_rate": 4.990690420397586e-05, + "loss": 1.4145, + "step": 6000 + }, + { + "epoch": 0.09466316981951924, + "grad_norm": 3.03849196434021, + "learning_rate": 4.990535234873292e-05, + "loss": 1.4127, + "step": 6100 + }, + { + "epoch": 0.09621502506246217, + "grad_norm": 2.2464165687561035, + "learning_rate": 4.990380049348997e-05, + "loss": 1.4034, + "step": 6200 + }, + { + "epoch": 0.09776688030540512, + "grad_norm": 2.8351242542266846, + "learning_rate": 4.9902248638247026e-05, + "loss": 1.4171, + "step": 6300 + }, + { + "epoch": 0.09931873554834805, + "grad_norm": 3.134185314178467, + "learning_rate": 4.9900696783004084e-05, + "loss": 1.4158, + "step": 6400 + }, + { + "epoch": 0.10087059079129099, + "grad_norm": 2.591951847076416, + "learning_rate": 4.989914492776114e-05, + "loss": 1.4308, + "step": 6500 + }, + { + "epoch": 0.10242244603423392, + "grad_norm": 2.6345736980438232, + "learning_rate": 4.98975930725182e-05, + "loss": 1.3935, + "step": 6600 + }, + { + "epoch": 0.10397430127717687, + "grad_norm": 2.4368953704833984, + "learning_rate": 4.989604121727525e-05, + "loss": 1.4165, + "step": 6700 + }, + { + "epoch": 0.1055261565201198, + "grad_norm": 2.444155693054199, + "learning_rate": 4.989448936203231e-05, + "loss": 1.4096, + "step": 6800 + }, + { + "epoch": 0.10707801176306274, + "grad_norm": 3.1478230953216553, + "learning_rate": 4.9892937506789366e-05, + "loss": 1.4328, + "step": 6900 + }, + { + "epoch": 0.10862986700600567, + "grad_norm": 2.6753594875335693, + "learning_rate": 4.9891385651546424e-05, + "loss": 1.3985, + "step": 7000 + }, + { + "epoch": 0.11018172224894862, + "grad_norm": 2.5885367393493652, + "learning_rate": 4.988983379630348e-05, + "loss": 1.3757, + "step": 7100 + }, + { + "epoch": 0.11173357749189156, + "grad_norm": 2.546741485595703, + "learning_rate": 4.988828194106054e-05, + "loss": 1.3861, + "step": 7200 + }, + { + "epoch": 0.11328543273483449, + "grad_norm": 2.6451854705810547, + "learning_rate": 4.98867300858176e-05, + "loss": 1.3809, + "step": 7300 + }, + { + "epoch": 0.11483728797777744, + "grad_norm": 2.460561513900757, + "learning_rate": 4.9885178230574655e-05, + "loss": 1.3943, + "step": 7400 + }, + { + "epoch": 0.11638914322072037, + "grad_norm": 2.5842692852020264, + "learning_rate": 4.988362637533171e-05, + "loss": 1.3833, + "step": 7500 + }, + { + "epoch": 0.11794099846366331, + "grad_norm": 2.862595796585083, + "learning_rate": 4.988207452008877e-05, + "loss": 1.3679, + "step": 7600 + }, + { + "epoch": 0.11949285370660624, + "grad_norm": 2.5362164974212646, + "learning_rate": 4.988052266484582e-05, + "loss": 1.3763, + "step": 7700 + }, + { + "epoch": 0.12104470894954919, + "grad_norm": 2.6035406589508057, + "learning_rate": 4.987897080960288e-05, + "loss": 1.3591, + "step": 7800 + }, + { + "epoch": 0.12259656419249212, + "grad_norm": 2.9999115467071533, + "learning_rate": 4.987741895435994e-05, + "loss": 1.3984, + "step": 7900 + }, + { + "epoch": 0.12414841943543506, + "grad_norm": 2.50899076461792, + "learning_rate": 4.9875867099116995e-05, + "loss": 1.3648, + "step": 8000 + }, + { + "epoch": 0.125700274678378, + "grad_norm": 2.866330146789551, + "learning_rate": 4.987431524387405e-05, + "loss": 1.4131, + "step": 8100 + }, + { + "epoch": 0.12725212992132093, + "grad_norm": 2.6132583618164062, + "learning_rate": 4.987276338863111e-05, + "loss": 1.3645, + "step": 8200 + }, + { + "epoch": 0.1288039851642639, + "grad_norm": 2.6919543743133545, + "learning_rate": 4.987121153338817e-05, + "loss": 1.3926, + "step": 8300 + }, + { + "epoch": 0.13035584040720682, + "grad_norm": 2.9526069164276123, + "learning_rate": 4.9869659678145226e-05, + "loss": 1.3552, + "step": 8400 + }, + { + "epoch": 0.13190769565014976, + "grad_norm": 2.522690773010254, + "learning_rate": 4.986810782290228e-05, + "loss": 1.3846, + "step": 8500 + }, + { + "epoch": 0.1334595508930927, + "grad_norm": 2.8384175300598145, + "learning_rate": 4.986655596765934e-05, + "loss": 1.3696, + "step": 8600 + }, + { + "epoch": 0.13501140613603563, + "grad_norm": 2.4619386196136475, + "learning_rate": 4.98650041124164e-05, + "loss": 1.3572, + "step": 8700 + }, + { + "epoch": 0.13656326137897856, + "grad_norm": 2.8175203800201416, + "learning_rate": 4.9863452257173457e-05, + "loss": 1.3758, + "step": 8800 + }, + { + "epoch": 0.1381151166219215, + "grad_norm": 2.510261297225952, + "learning_rate": 4.9861900401930514e-05, + "loss": 1.3783, + "step": 8900 + }, + { + "epoch": 0.13966697186486446, + "grad_norm": 2.833686113357544, + "learning_rate": 4.9860348546687565e-05, + "loss": 1.3693, + "step": 9000 + }, + { + "epoch": 0.1412188271078074, + "grad_norm": 2.7892541885375977, + "learning_rate": 4.985879669144462e-05, + "loss": 1.3656, + "step": 9100 + }, + { + "epoch": 0.14277068235075033, + "grad_norm": 2.4540627002716064, + "learning_rate": 4.985724483620168e-05, + "loss": 1.3573, + "step": 9200 + }, + { + "epoch": 0.14432253759369326, + "grad_norm": 2.7814266681671143, + "learning_rate": 4.985569298095874e-05, + "loss": 1.347, + "step": 9300 + }, + { + "epoch": 0.1458743928366362, + "grad_norm": 2.415938138961792, + "learning_rate": 4.9854141125715796e-05, + "loss": 1.3583, + "step": 9400 + }, + { + "epoch": 0.14742624807957913, + "grad_norm": 2.5764036178588867, + "learning_rate": 4.9852589270472854e-05, + "loss": 1.3839, + "step": 9500 + }, + { + "epoch": 0.14897810332252207, + "grad_norm": 2.4359829425811768, + "learning_rate": 4.985103741522991e-05, + "loss": 1.3321, + "step": 9600 + }, + { + "epoch": 0.150529958565465, + "grad_norm": 2.5717275142669678, + "learning_rate": 4.984948555998697e-05, + "loss": 1.3743, + "step": 9700 + }, + { + "epoch": 0.15208181380840796, + "grad_norm": 2.3378283977508545, + "learning_rate": 4.984793370474403e-05, + "loss": 1.3253, + "step": 9800 + }, + { + "epoch": 0.1536336690513509, + "grad_norm": 2.5946924686431885, + "learning_rate": 4.9846381849501085e-05, + "loss": 1.3246, + "step": 9900 + }, + { + "epoch": 0.15518552429429383, + "grad_norm": 2.87147855758667, + "learning_rate": 4.9844829994258136e-05, + "loss": 1.338, + "step": 10000 + }, + { + "epoch": 0.15673737953723677, + "grad_norm": 2.368569850921631, + "learning_rate": 4.9843278139015194e-05, + "loss": 1.3304, + "step": 10100 + }, + { + "epoch": 0.1582892347801797, + "grad_norm": 2.849220037460327, + "learning_rate": 4.984172628377225e-05, + "loss": 1.3644, + "step": 10200 + }, + { + "epoch": 0.15984109002312263, + "grad_norm": 2.7294695377349854, + "learning_rate": 4.984017442852931e-05, + "loss": 1.3521, + "step": 10300 + }, + { + "epoch": 0.16139294526606557, + "grad_norm": 2.3562920093536377, + "learning_rate": 4.983862257328637e-05, + "loss": 1.3154, + "step": 10400 + }, + { + "epoch": 0.16294480050900853, + "grad_norm": 2.7919921875, + "learning_rate": 4.9837070718043425e-05, + "loss": 1.3754, + "step": 10500 + }, + { + "epoch": 0.16449665575195146, + "grad_norm": 2.607933282852173, + "learning_rate": 4.9835518862800476e-05, + "loss": 1.3256, + "step": 10600 + }, + { + "epoch": 0.1660485109948944, + "grad_norm": 2.602830410003662, + "learning_rate": 4.9833967007557534e-05, + "loss": 1.3509, + "step": 10700 + }, + { + "epoch": 0.16760036623783733, + "grad_norm": 2.629920482635498, + "learning_rate": 4.983241515231459e-05, + "loss": 1.3251, + "step": 10800 + }, + { + "epoch": 0.16915222148078027, + "grad_norm": 2.610398054122925, + "learning_rate": 4.983086329707165e-05, + "loss": 1.3515, + "step": 10900 + }, + { + "epoch": 0.1707040767237232, + "grad_norm": 3.0583600997924805, + "learning_rate": 4.982931144182871e-05, + "loss": 1.3496, + "step": 11000 + }, + { + "epoch": 0.17225593196666614, + "grad_norm": 2.51269268989563, + "learning_rate": 4.9827759586585765e-05, + "loss": 1.3538, + "step": 11100 + }, + { + "epoch": 0.1738077872096091, + "grad_norm": 2.8590087890625, + "learning_rate": 4.982620773134282e-05, + "loss": 1.3555, + "step": 11200 + }, + { + "epoch": 0.17535964245255203, + "grad_norm": 2.775425910949707, + "learning_rate": 4.982465587609988e-05, + "loss": 1.3261, + "step": 11300 + }, + { + "epoch": 0.17691149769549497, + "grad_norm": 2.472468137741089, + "learning_rate": 4.982310402085694e-05, + "loss": 1.3233, + "step": 11400 + }, + { + "epoch": 0.1784633529384379, + "grad_norm": 2.447803020477295, + "learning_rate": 4.9821552165613996e-05, + "loss": 1.3114, + "step": 11500 + }, + { + "epoch": 0.18001520818138084, + "grad_norm": 2.7661869525909424, + "learning_rate": 4.982000031037105e-05, + "loss": 1.3327, + "step": 11600 + }, + { + "epoch": 0.18156706342432377, + "grad_norm": 2.892381191253662, + "learning_rate": 4.981844845512811e-05, + "loss": 1.3134, + "step": 11700 + }, + { + "epoch": 0.1831189186672667, + "grad_norm": 2.547635555267334, + "learning_rate": 4.981689659988517e-05, + "loss": 1.2913, + "step": 11800 + }, + { + "epoch": 0.18467077391020967, + "grad_norm": 3.205932855606079, + "learning_rate": 4.981534474464222e-05, + "loss": 1.3398, + "step": 11900 + }, + { + "epoch": 0.1862226291531526, + "grad_norm": 2.577988386154175, + "learning_rate": 4.981379288939928e-05, + "loss": 1.3344, + "step": 12000 + }, + { + "epoch": 0.18777448439609554, + "grad_norm": 2.690061092376709, + "learning_rate": 4.9812241034156335e-05, + "loss": 1.315, + "step": 12100 + }, + { + "epoch": 0.18932633963903847, + "grad_norm": 2.4334030151367188, + "learning_rate": 4.981068917891339e-05, + "loss": 1.3185, + "step": 12200 + }, + { + "epoch": 0.1908781948819814, + "grad_norm": 2.706022024154663, + "learning_rate": 4.980913732367045e-05, + "loss": 1.3132, + "step": 12300 + }, + { + "epoch": 0.19243005012492434, + "grad_norm": 2.400574207305908, + "learning_rate": 4.980758546842751e-05, + "loss": 1.3076, + "step": 12400 + }, + { + "epoch": 0.19398190536786727, + "grad_norm": 2.7774713039398193, + "learning_rate": 4.9806033613184566e-05, + "loss": 1.3081, + "step": 12500 + }, + { + "epoch": 0.19553376061081024, + "grad_norm": 2.392484664916992, + "learning_rate": 4.9804481757941624e-05, + "loss": 1.3146, + "step": 12600 + }, + { + "epoch": 0.19708561585375317, + "grad_norm": 2.4213242530822754, + "learning_rate": 4.980292990269868e-05, + "loss": 1.3, + "step": 12700 + }, + { + "epoch": 0.1986374710966961, + "grad_norm": 2.6262598037719727, + "learning_rate": 4.980137804745574e-05, + "loss": 1.3074, + "step": 12800 + }, + { + "epoch": 0.20018932633963904, + "grad_norm": 2.6495511531829834, + "learning_rate": 4.97998261922128e-05, + "loss": 1.3134, + "step": 12900 + }, + { + "epoch": 0.20174118158258197, + "grad_norm": 2.5671873092651367, + "learning_rate": 4.9798274336969855e-05, + "loss": 1.3102, + "step": 13000 + }, + { + "epoch": 0.2032930368255249, + "grad_norm": 2.4559695720672607, + "learning_rate": 4.979672248172691e-05, + "loss": 1.2916, + "step": 13100 + }, + { + "epoch": 0.20484489206846784, + "grad_norm": 2.3597123622894287, + "learning_rate": 4.9795170626483964e-05, + "loss": 1.2894, + "step": 13200 + }, + { + "epoch": 0.2063967473114108, + "grad_norm": 2.7529051303863525, + "learning_rate": 4.979361877124102e-05, + "loss": 1.3136, + "step": 13300 + }, + { + "epoch": 0.20794860255435374, + "grad_norm": 2.6413700580596924, + "learning_rate": 4.979206691599807e-05, + "loss": 1.2798, + "step": 13400 + }, + { + "epoch": 0.20950045779729667, + "grad_norm": 2.500199794769287, + "learning_rate": 4.979051506075513e-05, + "loss": 1.2863, + "step": 13500 + }, + { + "epoch": 0.2110523130402396, + "grad_norm": 2.5025570392608643, + "learning_rate": 4.978896320551219e-05, + "loss": 1.3186, + "step": 13600 + }, + { + "epoch": 0.21260416828318254, + "grad_norm": 2.6481752395629883, + "learning_rate": 4.9787411350269246e-05, + "loss": 1.3022, + "step": 13700 + }, + { + "epoch": 0.21415602352612548, + "grad_norm": 2.31376051902771, + "learning_rate": 4.9785859495026304e-05, + "loss": 1.3118, + "step": 13800 + }, + { + "epoch": 0.2157078787690684, + "grad_norm": 2.443878412246704, + "learning_rate": 4.978430763978336e-05, + "loss": 1.3235, + "step": 13900 + }, + { + "epoch": 0.21725973401201135, + "grad_norm": 2.792278528213501, + "learning_rate": 4.978275578454042e-05, + "loss": 1.322, + "step": 14000 + }, + { + "epoch": 0.2188115892549543, + "grad_norm": 2.817594051361084, + "learning_rate": 4.978120392929748e-05, + "loss": 1.316, + "step": 14100 + }, + { + "epoch": 0.22036344449789724, + "grad_norm": 2.3896937370300293, + "learning_rate": 4.9779652074054535e-05, + "loss": 1.3029, + "step": 14200 + }, + { + "epoch": 0.22191529974084018, + "grad_norm": 2.849748373031616, + "learning_rate": 4.977810021881159e-05, + "loss": 1.3186, + "step": 14300 + }, + { + "epoch": 0.2234671549837831, + "grad_norm": 2.8510918617248535, + "learning_rate": 4.977654836356865e-05, + "loss": 1.2806, + "step": 14400 + }, + { + "epoch": 0.22501901022672605, + "grad_norm": 2.5479323863983154, + "learning_rate": 4.977499650832571e-05, + "loss": 1.2917, + "step": 14500 + }, + { + "epoch": 0.22657086546966898, + "grad_norm": 2.897686243057251, + "learning_rate": 4.9773444653082766e-05, + "loss": 1.2832, + "step": 14600 + }, + { + "epoch": 0.22812272071261191, + "grad_norm": 2.7339227199554443, + "learning_rate": 4.9771892797839817e-05, + "loss": 1.287, + "step": 14700 + }, + { + "epoch": 0.22967457595555488, + "grad_norm": 2.701669931411743, + "learning_rate": 4.9770340942596874e-05, + "loss": 1.3081, + "step": 14800 + }, + { + "epoch": 0.2312264311984978, + "grad_norm": 2.775510549545288, + "learning_rate": 4.976878908735393e-05, + "loss": 1.3007, + "step": 14900 + }, + { + "epoch": 0.23277828644144075, + "grad_norm": 2.426663637161255, + "learning_rate": 4.976723723211099e-05, + "loss": 1.279, + "step": 15000 + }, + { + "epoch": 0.23433014168438368, + "grad_norm": 2.715815305709839, + "learning_rate": 4.976568537686805e-05, + "loss": 1.3016, + "step": 15100 + }, + { + "epoch": 0.23588199692732661, + "grad_norm": 2.6184160709381104, + "learning_rate": 4.9764133521625105e-05, + "loss": 1.286, + "step": 15200 + }, + { + "epoch": 0.23743385217026955, + "grad_norm": 2.65220046043396, + "learning_rate": 4.976258166638216e-05, + "loss": 1.2953, + "step": 15300 + }, + { + "epoch": 0.23898570741321248, + "grad_norm": 2.455944299697876, + "learning_rate": 4.976102981113922e-05, + "loss": 1.2863, + "step": 15400 + }, + { + "epoch": 0.24053756265615545, + "grad_norm": 2.484191417694092, + "learning_rate": 4.975947795589628e-05, + "loss": 1.2715, + "step": 15500 + }, + { + "epoch": 0.24208941789909838, + "grad_norm": 2.3590664863586426, + "learning_rate": 4.9757926100653336e-05, + "loss": 1.2865, + "step": 15600 + }, + { + "epoch": 0.24364127314204131, + "grad_norm": 2.6133363246917725, + "learning_rate": 4.9756374245410394e-05, + "loss": 1.3089, + "step": 15700 + }, + { + "epoch": 0.24519312838498425, + "grad_norm": 2.5714144706726074, + "learning_rate": 4.975482239016745e-05, + "loss": 1.2799, + "step": 15800 + }, + { + "epoch": 0.24674498362792718, + "grad_norm": 2.655200242996216, + "learning_rate": 4.975327053492451e-05, + "loss": 1.2854, + "step": 15900 + }, + { + "epoch": 0.24829683887087012, + "grad_norm": 2.5497395992279053, + "learning_rate": 4.975171867968156e-05, + "loss": 1.3059, + "step": 16000 + }, + { + "epoch": 0.24984869411381305, + "grad_norm": 2.787501573562622, + "learning_rate": 4.975016682443862e-05, + "loss": 1.2996, + "step": 16100 + }, + { + "epoch": 0.251400549356756, + "grad_norm": 2.7455899715423584, + "learning_rate": 4.9748614969195676e-05, + "loss": 1.2996, + "step": 16200 + }, + { + "epoch": 0.2529524045996989, + "grad_norm": 2.803668737411499, + "learning_rate": 4.9747063113952734e-05, + "loss": 1.2888, + "step": 16300 + }, + { + "epoch": 0.25450425984264186, + "grad_norm": 2.38700532913208, + "learning_rate": 4.974551125870979e-05, + "loss": 1.3025, + "step": 16400 + }, + { + "epoch": 0.2560561150855848, + "grad_norm": 2.337942123413086, + "learning_rate": 4.974395940346684e-05, + "loss": 1.2914, + "step": 16500 + }, + { + "epoch": 0.2576079703285278, + "grad_norm": 2.190355062484741, + "learning_rate": 4.97424075482239e-05, + "loss": 1.3007, + "step": 16600 + }, + { + "epoch": 0.2591598255714707, + "grad_norm": 2.6558947563171387, + "learning_rate": 4.974085569298096e-05, + "loss": 1.2867, + "step": 16700 + }, + { + "epoch": 0.26071168081441365, + "grad_norm": 2.452852964401245, + "learning_rate": 4.9739303837738016e-05, + "loss": 1.2921, + "step": 16800 + }, + { + "epoch": 0.2622635360573566, + "grad_norm": 2.992396831512451, + "learning_rate": 4.9737751982495074e-05, + "loss": 1.2735, + "step": 16900 + }, + { + "epoch": 0.2638153913002995, + "grad_norm": 1.9501643180847168, + "learning_rate": 4.973620012725213e-05, + "loss": 1.2719, + "step": 17000 + }, + { + "epoch": 0.26536724654324245, + "grad_norm": 2.429457664489746, + "learning_rate": 4.973464827200919e-05, + "loss": 1.2981, + "step": 17100 + }, + { + "epoch": 0.2669191017861854, + "grad_norm": 2.4714221954345703, + "learning_rate": 4.973309641676625e-05, + "loss": 1.2638, + "step": 17200 + }, + { + "epoch": 0.2684709570291283, + "grad_norm": 2.4382991790771484, + "learning_rate": 4.9731544561523305e-05, + "loss": 1.2844, + "step": 17300 + }, + { + "epoch": 0.27002281227207126, + "grad_norm": 2.2101175785064697, + "learning_rate": 4.972999270628036e-05, + "loss": 1.2809, + "step": 17400 + }, + { + "epoch": 0.2715746675150142, + "grad_norm": 2.6227142810821533, + "learning_rate": 4.972844085103742e-05, + "loss": 1.3096, + "step": 17500 + }, + { + "epoch": 0.2731265227579571, + "grad_norm": 2.3420772552490234, + "learning_rate": 4.972688899579447e-05, + "loss": 1.2689, + "step": 17600 + }, + { + "epoch": 0.27467837800090006, + "grad_norm": 2.3048648834228516, + "learning_rate": 4.972533714055153e-05, + "loss": 1.2695, + "step": 17700 + }, + { + "epoch": 0.276230233243843, + "grad_norm": 2.3735549449920654, + "learning_rate": 4.9723785285308587e-05, + "loss": 1.2774, + "step": 17800 + }, + { + "epoch": 0.2777820884867859, + "grad_norm": 2.56894588470459, + "learning_rate": 4.9722233430065644e-05, + "loss": 1.2415, + "step": 17900 + }, + { + "epoch": 0.2793339437297289, + "grad_norm": 2.6230642795562744, + "learning_rate": 4.97206815748227e-05, + "loss": 1.2775, + "step": 18000 + }, + { + "epoch": 0.28088579897267185, + "grad_norm": 2.1623568534851074, + "learning_rate": 4.971912971957976e-05, + "loss": 1.2724, + "step": 18100 + }, + { + "epoch": 0.2824376542156148, + "grad_norm": 2.7458627223968506, + "learning_rate": 4.971757786433682e-05, + "loss": 1.252, + "step": 18200 + }, + { + "epoch": 0.2839895094585577, + "grad_norm": 2.631115198135376, + "learning_rate": 4.9716026009093875e-05, + "loss": 1.2952, + "step": 18300 + }, + { + "epoch": 0.28554136470150066, + "grad_norm": 2.432504653930664, + "learning_rate": 4.971447415385093e-05, + "loss": 1.2697, + "step": 18400 + }, + { + "epoch": 0.2870932199444436, + "grad_norm": 3.2598044872283936, + "learning_rate": 4.971292229860799e-05, + "loss": 1.2659, + "step": 18500 + }, + { + "epoch": 0.2886450751873865, + "grad_norm": 2.5015132427215576, + "learning_rate": 4.971137044336505e-05, + "loss": 1.2515, + "step": 18600 + }, + { + "epoch": 0.29019693043032946, + "grad_norm": 2.426882028579712, + "learning_rate": 4.9709818588122106e-05, + "loss": 1.2575, + "step": 18700 + }, + { + "epoch": 0.2917487856732724, + "grad_norm": 2.1316773891448975, + "learning_rate": 4.9708266732879164e-05, + "loss": 1.2785, + "step": 18800 + }, + { + "epoch": 0.2933006409162153, + "grad_norm": 2.2184412479400635, + "learning_rate": 4.9706714877636215e-05, + "loss": 1.2692, + "step": 18900 + }, + { + "epoch": 0.29485249615915826, + "grad_norm": 2.5865681171417236, + "learning_rate": 4.970516302239327e-05, + "loss": 1.2864, + "step": 19000 + }, + { + "epoch": 0.2964043514021012, + "grad_norm": 2.5837719440460205, + "learning_rate": 4.970361116715033e-05, + "loss": 1.2557, + "step": 19100 + }, + { + "epoch": 0.29795620664504413, + "grad_norm": 2.6286470890045166, + "learning_rate": 4.970205931190739e-05, + "loss": 1.259, + "step": 19200 + }, + { + "epoch": 0.29950806188798706, + "grad_norm": 2.8535425662994385, + "learning_rate": 4.9700507456664446e-05, + "loss": 1.2402, + "step": 19300 + }, + { + "epoch": 0.30105991713093, + "grad_norm": 2.272538900375366, + "learning_rate": 4.9698955601421504e-05, + "loss": 1.2728, + "step": 19400 + }, + { + "epoch": 0.302611772373873, + "grad_norm": 2.6719024181365967, + "learning_rate": 4.969740374617856e-05, + "loss": 1.269, + "step": 19500 + }, + { + "epoch": 0.3041636276168159, + "grad_norm": 2.3608758449554443, + "learning_rate": 4.969585189093562e-05, + "loss": 1.2605, + "step": 19600 + }, + { + "epoch": 0.30571548285975886, + "grad_norm": 2.552095890045166, + "learning_rate": 4.969430003569267e-05, + "loss": 1.2451, + "step": 19700 + }, + { + "epoch": 0.3072673381027018, + "grad_norm": 2.9721288681030273, + "learning_rate": 4.969274818044973e-05, + "loss": 1.2786, + "step": 19800 + }, + { + "epoch": 0.3088191933456447, + "grad_norm": 2.478731393814087, + "learning_rate": 4.9691196325206786e-05, + "loss": 1.2735, + "step": 19900 + }, + { + "epoch": 0.31037104858858766, + "grad_norm": 2.8701961040496826, + "learning_rate": 4.9689644469963844e-05, + "loss": 1.2515, + "step": 20000 + }, + { + "epoch": 0.3119229038315306, + "grad_norm": 2.821871519088745, + "learning_rate": 4.96880926147209e-05, + "loss": 1.3017, + "step": 20100 + }, + { + "epoch": 0.31347475907447353, + "grad_norm": 2.907162666320801, + "learning_rate": 4.968654075947796e-05, + "loss": 1.2539, + "step": 20200 + }, + { + "epoch": 0.31502661431741646, + "grad_norm": 2.221086025238037, + "learning_rate": 4.968498890423502e-05, + "loss": 1.2926, + "step": 20300 + }, + { + "epoch": 0.3165784695603594, + "grad_norm": 2.498689889907837, + "learning_rate": 4.968343704899207e-05, + "loss": 1.2786, + "step": 20400 + }, + { + "epoch": 0.31813032480330233, + "grad_norm": 2.6393258571624756, + "learning_rate": 4.9681885193749126e-05, + "loss": 1.2442, + "step": 20500 + }, + { + "epoch": 0.31968218004624527, + "grad_norm": 2.6867501735687256, + "learning_rate": 4.968033333850618e-05, + "loss": 1.2567, + "step": 20600 + }, + { + "epoch": 0.3212340352891882, + "grad_norm": 2.4331588745117188, + "learning_rate": 4.967878148326324e-05, + "loss": 1.2737, + "step": 20700 + }, + { + "epoch": 0.32278589053213114, + "grad_norm": 2.5626893043518066, + "learning_rate": 4.96772296280203e-05, + "loss": 1.2608, + "step": 20800 + }, + { + "epoch": 0.3243377457750741, + "grad_norm": 1.7633891105651855, + "learning_rate": 4.9675677772777357e-05, + "loss": 1.2545, + "step": 20900 + }, + { + "epoch": 0.32588960101801706, + "grad_norm": 2.918287515640259, + "learning_rate": 4.9674125917534414e-05, + "loss": 1.2738, + "step": 21000 + }, + { + "epoch": 0.32744145626096, + "grad_norm": 2.7387962341308594, + "learning_rate": 4.967257406229147e-05, + "loss": 1.2335, + "step": 21100 + }, + { + "epoch": 0.32899331150390293, + "grad_norm": 2.6093668937683105, + "learning_rate": 4.967102220704853e-05, + "loss": 1.2529, + "step": 21200 + }, + { + "epoch": 0.33054516674684586, + "grad_norm": 2.3680531978607178, + "learning_rate": 4.966947035180559e-05, + "loss": 1.2644, + "step": 21300 + }, + { + "epoch": 0.3320970219897888, + "grad_norm": 2.770004987716675, + "learning_rate": 4.9667918496562645e-05, + "loss": 1.2372, + "step": 21400 + }, + { + "epoch": 0.33364887723273173, + "grad_norm": 3.0637829303741455, + "learning_rate": 4.96663666413197e-05, + "loss": 1.2457, + "step": 21500 + }, + { + "epoch": 0.33520073247567467, + "grad_norm": 2.536048173904419, + "learning_rate": 4.966481478607676e-05, + "loss": 1.241, + "step": 21600 + }, + { + "epoch": 0.3367525877186176, + "grad_norm": 2.698406219482422, + "learning_rate": 4.966326293083381e-05, + "loss": 1.2588, + "step": 21700 + }, + { + "epoch": 0.33830444296156054, + "grad_norm": 2.545354127883911, + "learning_rate": 4.966171107559087e-05, + "loss": 1.2393, + "step": 21800 + }, + { + "epoch": 0.33985629820450347, + "grad_norm": 2.902768611907959, + "learning_rate": 4.966015922034793e-05, + "loss": 1.2622, + "step": 21900 + }, + { + "epoch": 0.3414081534474464, + "grad_norm": 2.4308223724365234, + "learning_rate": 4.9658607365104985e-05, + "loss": 1.2451, + "step": 22000 + }, + { + "epoch": 0.34296000869038934, + "grad_norm": 2.7511160373687744, + "learning_rate": 4.965705550986204e-05, + "loss": 1.265, + "step": 22100 + }, + { + "epoch": 0.3445118639333323, + "grad_norm": 2.269733190536499, + "learning_rate": 4.96555036546191e-05, + "loss": 1.231, + "step": 22200 + }, + { + "epoch": 0.34606371917627526, + "grad_norm": 2.5755834579467773, + "learning_rate": 4.965395179937616e-05, + "loss": 1.2449, + "step": 22300 + }, + { + "epoch": 0.3476155744192182, + "grad_norm": 2.7427587509155273, + "learning_rate": 4.9652399944133216e-05, + "loss": 1.2315, + "step": 22400 + }, + { + "epoch": 0.34916742966216113, + "grad_norm": 2.5260298252105713, + "learning_rate": 4.9650848088890274e-05, + "loss": 1.2448, + "step": 22500 + }, + { + "epoch": 0.35071928490510407, + "grad_norm": 2.6664581298828125, + "learning_rate": 4.964929623364733e-05, + "loss": 1.2658, + "step": 22600 + }, + { + "epoch": 0.352271140148047, + "grad_norm": 2.3417155742645264, + "learning_rate": 4.964774437840439e-05, + "loss": 1.2485, + "step": 22700 + }, + { + "epoch": 0.35382299539098994, + "grad_norm": 2.4381089210510254, + "learning_rate": 4.964619252316145e-05, + "loss": 1.2271, + "step": 22800 + }, + { + "epoch": 0.35537485063393287, + "grad_norm": 2.50130558013916, + "learning_rate": 4.9644640667918505e-05, + "loss": 1.2544, + "step": 22900 + }, + { + "epoch": 0.3569267058768758, + "grad_norm": 2.5027225017547607, + "learning_rate": 4.9643088812675556e-05, + "loss": 1.2368, + "step": 23000 + }, + { + "epoch": 0.35847856111981874, + "grad_norm": 2.594553232192993, + "learning_rate": 4.9641536957432614e-05, + "loss": 1.2731, + "step": 23100 + }, + { + "epoch": 0.3600304163627617, + "grad_norm": 2.1037485599517822, + "learning_rate": 4.9639985102189665e-05, + "loss": 1.2262, + "step": 23200 + }, + { + "epoch": 0.3615822716057046, + "grad_norm": 3.055147886276245, + "learning_rate": 4.963843324694672e-05, + "loss": 1.2429, + "step": 23300 + }, + { + "epoch": 0.36313412684864754, + "grad_norm": 2.637697219848633, + "learning_rate": 4.963688139170378e-05, + "loss": 1.2274, + "step": 23400 + }, + { + "epoch": 0.3646859820915905, + "grad_norm": 2.306586980819702, + "learning_rate": 4.963532953646084e-05, + "loss": 1.252, + "step": 23500 + }, + { + "epoch": 0.3662378373345334, + "grad_norm": 2.6637966632843018, + "learning_rate": 4.9633777681217896e-05, + "loss": 1.2433, + "step": 23600 + }, + { + "epoch": 0.36778969257747635, + "grad_norm": 2.3285157680511475, + "learning_rate": 4.963222582597495e-05, + "loss": 1.2429, + "step": 23700 + }, + { + "epoch": 0.36934154782041934, + "grad_norm": 2.5495405197143555, + "learning_rate": 4.963067397073201e-05, + "loss": 1.2292, + "step": 23800 + }, + { + "epoch": 0.37089340306336227, + "grad_norm": 2.127140760421753, + "learning_rate": 4.962912211548907e-05, + "loss": 1.2357, + "step": 23900 + }, + { + "epoch": 0.3724452583063052, + "grad_norm": 2.2757773399353027, + "learning_rate": 4.9627570260246127e-05, + "loss": 1.2616, + "step": 24000 + }, + { + "epoch": 0.37399711354924814, + "grad_norm": 2.529846668243408, + "learning_rate": 4.9626018405003184e-05, + "loss": 1.2387, + "step": 24100 + }, + { + "epoch": 0.3755489687921911, + "grad_norm": 2.742988109588623, + "learning_rate": 4.962446654976024e-05, + "loss": 1.2411, + "step": 24200 + }, + { + "epoch": 0.377100824035134, + "grad_norm": 2.7472355365753174, + "learning_rate": 4.96229146945173e-05, + "loss": 1.2726, + "step": 24300 + }, + { + "epoch": 0.37865267927807694, + "grad_norm": 2.5658702850341797, + "learning_rate": 4.962136283927436e-05, + "loss": 1.2479, + "step": 24400 + }, + { + "epoch": 0.3802045345210199, + "grad_norm": 2.6822404861450195, + "learning_rate": 4.961981098403141e-05, + "loss": 1.2324, + "step": 24500 + }, + { + "epoch": 0.3817563897639628, + "grad_norm": 2.149616003036499, + "learning_rate": 4.9618259128788466e-05, + "loss": 1.2335, + "step": 24600 + }, + { + "epoch": 0.38330824500690575, + "grad_norm": 2.515333890914917, + "learning_rate": 4.9616707273545524e-05, + "loss": 1.2274, + "step": 24700 + }, + { + "epoch": 0.3848601002498487, + "grad_norm": 2.617628812789917, + "learning_rate": 4.961515541830258e-05, + "loss": 1.2316, + "step": 24800 + }, + { + "epoch": 0.3864119554927916, + "grad_norm": 2.5379648208618164, + "learning_rate": 4.961360356305964e-05, + "loss": 1.2257, + "step": 24900 + }, + { + "epoch": 0.38796381073573455, + "grad_norm": 2.619234323501587, + "learning_rate": 4.96120517078167e-05, + "loss": 1.2044, + "step": 25000 + }, + { + "epoch": 0.3895156659786775, + "grad_norm": 2.523144483566284, + "learning_rate": 4.9610499852573755e-05, + "loss": 1.2506, + "step": 25100 + }, + { + "epoch": 0.3910675212216205, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.960894799733081e-05, + "loss": 1.2557, + "step": 25200 + }, + { + "epoch": 0.3926193764645634, + "grad_norm": 2.4023473262786865, + "learning_rate": 4.960739614208787e-05, + "loss": 1.2526, + "step": 25300 + }, + { + "epoch": 0.39417123170750634, + "grad_norm": 2.3050835132598877, + "learning_rate": 4.960584428684493e-05, + "loss": 1.2225, + "step": 25400 + }, + { + "epoch": 0.3957230869504493, + "grad_norm": 2.1680538654327393, + "learning_rate": 4.9604292431601986e-05, + "loss": 1.2286, + "step": 25500 + }, + { + "epoch": 0.3972749421933922, + "grad_norm": 2.603233814239502, + "learning_rate": 4.9602740576359044e-05, + "loss": 1.2253, + "step": 25600 + }, + { + "epoch": 0.39882679743633515, + "grad_norm": 2.2895560264587402, + "learning_rate": 4.96011887211161e-05, + "loss": 1.2318, + "step": 25700 + }, + { + "epoch": 0.4003786526792781, + "grad_norm": 2.3969969749450684, + "learning_rate": 4.959963686587315e-05, + "loss": 1.2381, + "step": 25800 + }, + { + "epoch": 0.401930507922221, + "grad_norm": 2.4148831367492676, + "learning_rate": 4.959808501063021e-05, + "loss": 1.2117, + "step": 25900 + }, + { + "epoch": 0.40348236316516395, + "grad_norm": 2.634455442428589, + "learning_rate": 4.959653315538727e-05, + "loss": 1.2233, + "step": 26000 + }, + { + "epoch": 0.4050342184081069, + "grad_norm": 2.5249853134155273, + "learning_rate": 4.9594981300144326e-05, + "loss": 1.2234, + "step": 26100 + }, + { + "epoch": 0.4065860736510498, + "grad_norm": 2.2868144512176514, + "learning_rate": 4.959342944490138e-05, + "loss": 1.2397, + "step": 26200 + }, + { + "epoch": 0.40813792889399275, + "grad_norm": 2.3488306999206543, + "learning_rate": 4.9591877589658435e-05, + "loss": 1.2479, + "step": 26300 + }, + { + "epoch": 0.4096897841369357, + "grad_norm": 2.9861233234405518, + "learning_rate": 4.959032573441549e-05, + "loss": 1.235, + "step": 26400 + }, + { + "epoch": 0.4112416393798786, + "grad_norm": 2.7346441745758057, + "learning_rate": 4.958877387917255e-05, + "loss": 1.2238, + "step": 26500 + }, + { + "epoch": 0.4127934946228216, + "grad_norm": 2.6701862812042236, + "learning_rate": 4.958722202392961e-05, + "loss": 1.2331, + "step": 26600 + }, + { + "epoch": 0.41434534986576455, + "grad_norm": 2.398591995239258, + "learning_rate": 4.9585670168686666e-05, + "loss": 1.224, + "step": 26700 + }, + { + "epoch": 0.4158972051087075, + "grad_norm": 2.4429726600646973, + "learning_rate": 4.958411831344372e-05, + "loss": 1.2276, + "step": 26800 + }, + { + "epoch": 0.4174490603516504, + "grad_norm": 2.251915693283081, + "learning_rate": 4.958256645820078e-05, + "loss": 1.2405, + "step": 26900 + }, + { + "epoch": 0.41900091559459335, + "grad_norm": 2.7813985347747803, + "learning_rate": 4.958101460295784e-05, + "loss": 1.223, + "step": 27000 + }, + { + "epoch": 0.4205527708375363, + "grad_norm": 2.108531951904297, + "learning_rate": 4.9579462747714897e-05, + "loss": 1.2304, + "step": 27100 + }, + { + "epoch": 0.4221046260804792, + "grad_norm": 2.433366060256958, + "learning_rate": 4.9577910892471954e-05, + "loss": 1.2246, + "step": 27200 + }, + { + "epoch": 0.42365648132342215, + "grad_norm": 2.5393569469451904, + "learning_rate": 4.957635903722901e-05, + "loss": 1.2182, + "step": 27300 + }, + { + "epoch": 0.4252083365663651, + "grad_norm": 2.4050772190093994, + "learning_rate": 4.957480718198606e-05, + "loss": 1.2049, + "step": 27400 + }, + { + "epoch": 0.426760191809308, + "grad_norm": 2.2601988315582275, + "learning_rate": 4.957325532674312e-05, + "loss": 1.2286, + "step": 27500 + }, + { + "epoch": 0.42831204705225095, + "grad_norm": 2.256404399871826, + "learning_rate": 4.957170347150018e-05, + "loss": 1.2019, + "step": 27600 + }, + { + "epoch": 0.4298639022951939, + "grad_norm": 2.2497708797454834, + "learning_rate": 4.9570151616257236e-05, + "loss": 1.232, + "step": 27700 + }, + { + "epoch": 0.4314157575381368, + "grad_norm": 2.4851603507995605, + "learning_rate": 4.9568599761014294e-05, + "loss": 1.239, + "step": 27800 + }, + { + "epoch": 0.43296761278107976, + "grad_norm": 2.6804866790771484, + "learning_rate": 4.956704790577135e-05, + "loss": 1.2131, + "step": 27900 + }, + { + "epoch": 0.4345194680240227, + "grad_norm": 2.489208221435547, + "learning_rate": 4.956549605052841e-05, + "loss": 1.2201, + "step": 28000 + }, + { + "epoch": 0.4360713232669657, + "grad_norm": 2.5706231594085693, + "learning_rate": 4.956394419528547e-05, + "loss": 1.215, + "step": 28100 + }, + { + "epoch": 0.4376231785099086, + "grad_norm": 2.389012575149536, + "learning_rate": 4.9562392340042525e-05, + "loss": 1.217, + "step": 28200 + }, + { + "epoch": 0.43917503375285155, + "grad_norm": 2.303879976272583, + "learning_rate": 4.956084048479958e-05, + "loss": 1.2309, + "step": 28300 + }, + { + "epoch": 0.4407268889957945, + "grad_norm": 2.1704635620117188, + "learning_rate": 4.955928862955664e-05, + "loss": 1.2183, + "step": 28400 + }, + { + "epoch": 0.4422787442387374, + "grad_norm": 2.5813958644866943, + "learning_rate": 4.95577367743137e-05, + "loss": 1.2302, + "step": 28500 + }, + { + "epoch": 0.44383059948168035, + "grad_norm": 2.516469717025757, + "learning_rate": 4.9556184919070756e-05, + "loss": 1.2101, + "step": 28600 + }, + { + "epoch": 0.4453824547246233, + "grad_norm": 2.3752951622009277, + "learning_rate": 4.955463306382781e-05, + "loss": 1.2356, + "step": 28700 + }, + { + "epoch": 0.4469343099675662, + "grad_norm": 2.4214041233062744, + "learning_rate": 4.9553081208584865e-05, + "loss": 1.2182, + "step": 28800 + }, + { + "epoch": 0.44848616521050916, + "grad_norm": 2.519256830215454, + "learning_rate": 4.955152935334192e-05, + "loss": 1.2132, + "step": 28900 + }, + { + "epoch": 0.4500380204534521, + "grad_norm": 2.636441707611084, + "learning_rate": 4.954997749809898e-05, + "loss": 1.2161, + "step": 29000 + }, + { + "epoch": 0.451589875696395, + "grad_norm": 2.670457601547241, + "learning_rate": 4.954842564285604e-05, + "loss": 1.213, + "step": 29100 + }, + { + "epoch": 0.45314173093933796, + "grad_norm": 2.537473201751709, + "learning_rate": 4.9546873787613096e-05, + "loss": 1.209, + "step": 29200 + }, + { + "epoch": 0.4546935861822809, + "grad_norm": 2.7418036460876465, + "learning_rate": 4.9545321932370154e-05, + "loss": 1.2262, + "step": 29300 + }, + { + "epoch": 0.45624544142522383, + "grad_norm": 2.5072531700134277, + "learning_rate": 4.954377007712721e-05, + "loss": 1.2484, + "step": 29400 + }, + { + "epoch": 0.4577972966681668, + "grad_norm": 2.53163480758667, + "learning_rate": 4.954221822188426e-05, + "loss": 1.2302, + "step": 29500 + }, + { + "epoch": 0.45934915191110975, + "grad_norm": 2.8376755714416504, + "learning_rate": 4.954066636664132e-05, + "loss": 1.2309, + "step": 29600 + }, + { + "epoch": 0.4609010071540527, + "grad_norm": 2.69342303276062, + "learning_rate": 4.953911451139838e-05, + "loss": 1.1855, + "step": 29700 + }, + { + "epoch": 0.4624528623969956, + "grad_norm": 1.7856605052947998, + "learning_rate": 4.9537562656155436e-05, + "loss": 1.2031, + "step": 29800 + }, + { + "epoch": 0.46400471763993856, + "grad_norm": 2.4834160804748535, + "learning_rate": 4.953601080091249e-05, + "loss": 1.2339, + "step": 29900 + }, + { + "epoch": 0.4655565728828815, + "grad_norm": 2.7890806198120117, + "learning_rate": 4.953445894566955e-05, + "loss": 1.2066, + "step": 30000 + }, + { + "epoch": 0.4671084281258244, + "grad_norm": 2.3287105560302734, + "learning_rate": 4.953290709042661e-05, + "loss": 1.2008, + "step": 30100 + }, + { + "epoch": 0.46866028336876736, + "grad_norm": 2.444023847579956, + "learning_rate": 4.953135523518366e-05, + "loss": 1.2253, + "step": 30200 + }, + { + "epoch": 0.4702121386117103, + "grad_norm": 2.365927219390869, + "learning_rate": 4.952980337994072e-05, + "loss": 1.2339, + "step": 30300 + }, + { + "epoch": 0.47176399385465323, + "grad_norm": 2.406822443008423, + "learning_rate": 4.9528251524697775e-05, + "loss": 1.2363, + "step": 30400 + }, + { + "epoch": 0.47331584909759616, + "grad_norm": 2.8443877696990967, + "learning_rate": 4.952669966945483e-05, + "loss": 1.1863, + "step": 30500 + }, + { + "epoch": 0.4748677043405391, + "grad_norm": 2.464362382888794, + "learning_rate": 4.952514781421189e-05, + "loss": 1.2202, + "step": 30600 + }, + { + "epoch": 0.47641955958348203, + "grad_norm": 2.1502931118011475, + "learning_rate": 4.952359595896895e-05, + "loss": 1.2172, + "step": 30700 + }, + { + "epoch": 0.47797141482642497, + "grad_norm": 2.2455809116363525, + "learning_rate": 4.9522044103726006e-05, + "loss": 1.1961, + "step": 30800 + }, + { + "epoch": 0.4795232700693679, + "grad_norm": 2.384324789047241, + "learning_rate": 4.9520492248483064e-05, + "loss": 1.2245, + "step": 30900 + }, + { + "epoch": 0.4810751253123109, + "grad_norm": 2.1174631118774414, + "learning_rate": 4.951894039324012e-05, + "loss": 1.231, + "step": 31000 + }, + { + "epoch": 0.4826269805552538, + "grad_norm": 2.583064079284668, + "learning_rate": 4.951738853799718e-05, + "loss": 1.2121, + "step": 31100 + }, + { + "epoch": 0.48417883579819676, + "grad_norm": 2.519484519958496, + "learning_rate": 4.951583668275424e-05, + "loss": 1.2247, + "step": 31200 + }, + { + "epoch": 0.4857306910411397, + "grad_norm": 2.051809310913086, + "learning_rate": 4.9514284827511295e-05, + "loss": 1.2104, + "step": 31300 + }, + { + "epoch": 0.48728254628408263, + "grad_norm": 2.2689340114593506, + "learning_rate": 4.951273297226835e-05, + "loss": 1.2178, + "step": 31400 + }, + { + "epoch": 0.48883440152702556, + "grad_norm": 2.4677417278289795, + "learning_rate": 4.9511181117025404e-05, + "loss": 1.2253, + "step": 31500 + }, + { + "epoch": 0.4903862567699685, + "grad_norm": 2.7817842960357666, + "learning_rate": 4.950962926178246e-05, + "loss": 1.1976, + "step": 31600 + }, + { + "epoch": 0.49193811201291143, + "grad_norm": 2.4424521923065186, + "learning_rate": 4.950807740653952e-05, + "loss": 1.2278, + "step": 31700 + }, + { + "epoch": 0.49348996725585437, + "grad_norm": 2.227247476577759, + "learning_rate": 4.950652555129658e-05, + "loss": 1.2264, + "step": 31800 + }, + { + "epoch": 0.4950418224987973, + "grad_norm": 2.371188163757324, + "learning_rate": 4.9504973696053635e-05, + "loss": 1.2268, + "step": 31900 + }, + { + "epoch": 0.49659367774174024, + "grad_norm": 2.4499406814575195, + "learning_rate": 4.950342184081069e-05, + "loss": 1.2268, + "step": 32000 + }, + { + "epoch": 0.49814553298468317, + "grad_norm": 2.518974542617798, + "learning_rate": 4.950186998556775e-05, + "loss": 1.1959, + "step": 32100 + }, + { + "epoch": 0.4996973882276261, + "grad_norm": 2.6548304557800293, + "learning_rate": 4.950031813032481e-05, + "loss": 1.1835, + "step": 32200 + }, + { + "epoch": 0.5012492434705691, + "grad_norm": 2.252213478088379, + "learning_rate": 4.9498766275081866e-05, + "loss": 1.1971, + "step": 32300 + }, + { + "epoch": 0.502801098713512, + "grad_norm": 2.8409087657928467, + "learning_rate": 4.9497214419838924e-05, + "loss": 1.2026, + "step": 32400 + }, + { + "epoch": 0.504352953956455, + "grad_norm": 2.7140536308288574, + "learning_rate": 4.949566256459598e-05, + "loss": 1.2188, + "step": 32500 + }, + { + "epoch": 0.5059048091993978, + "grad_norm": 2.8111579418182373, + "learning_rate": 4.949411070935304e-05, + "loss": 1.2012, + "step": 32600 + }, + { + "epoch": 0.5074566644423408, + "grad_norm": 2.5844101905822754, + "learning_rate": 4.94925588541101e-05, + "loss": 1.233, + "step": 32700 + }, + { + "epoch": 0.5090085196852837, + "grad_norm": 2.6166341304779053, + "learning_rate": 4.949100699886715e-05, + "loss": 1.2062, + "step": 32800 + }, + { + "epoch": 0.5105603749282267, + "grad_norm": 2.949582099914551, + "learning_rate": 4.9489455143624206e-05, + "loss": 1.2164, + "step": 32900 + }, + { + "epoch": 0.5121122301711696, + "grad_norm": 2.595062732696533, + "learning_rate": 4.9487903288381257e-05, + "loss": 1.1917, + "step": 33000 + }, + { + "epoch": 0.5136640854141126, + "grad_norm": 2.294433832168579, + "learning_rate": 4.9486351433138314e-05, + "loss": 1.1904, + "step": 33100 + }, + { + "epoch": 0.5152159406570556, + "grad_norm": 2.191683769226074, + "learning_rate": 4.948479957789537e-05, + "loss": 1.2262, + "step": 33200 + }, + { + "epoch": 0.5167677958999984, + "grad_norm": 2.4490966796875, + "learning_rate": 4.948324772265243e-05, + "loss": 1.2134, + "step": 33300 + }, + { + "epoch": 0.5183196511429414, + "grad_norm": 2.3449840545654297, + "learning_rate": 4.948169586740949e-05, + "loss": 1.2198, + "step": 33400 + }, + { + "epoch": 0.5198715063858843, + "grad_norm": 2.706878423690796, + "learning_rate": 4.9480144012166545e-05, + "loss": 1.199, + "step": 33500 + }, + { + "epoch": 0.5214233616288273, + "grad_norm": 2.4588654041290283, + "learning_rate": 4.94785921569236e-05, + "loss": 1.1981, + "step": 33600 + }, + { + "epoch": 0.5229752168717702, + "grad_norm": 1.9115830659866333, + "learning_rate": 4.947704030168066e-05, + "loss": 1.1902, + "step": 33700 + }, + { + "epoch": 0.5245270721147132, + "grad_norm": 2.573307752609253, + "learning_rate": 4.947548844643772e-05, + "loss": 1.2159, + "step": 33800 + }, + { + "epoch": 0.526078927357656, + "grad_norm": 2.6550095081329346, + "learning_rate": 4.9473936591194776e-05, + "loss": 1.2299, + "step": 33900 + }, + { + "epoch": 0.527630782600599, + "grad_norm": 2.6623427867889404, + "learning_rate": 4.9472384735951834e-05, + "loss": 1.1988, + "step": 34000 + }, + { + "epoch": 0.5291826378435419, + "grad_norm": 2.2674150466918945, + "learning_rate": 4.947083288070889e-05, + "loss": 1.2129, + "step": 34100 + }, + { + "epoch": 0.5307344930864849, + "grad_norm": 2.6334874629974365, + "learning_rate": 4.946928102546595e-05, + "loss": 1.207, + "step": 34200 + }, + { + "epoch": 0.5322863483294278, + "grad_norm": 2.237846612930298, + "learning_rate": 4.946772917022301e-05, + "loss": 1.2161, + "step": 34300 + }, + { + "epoch": 0.5338382035723708, + "grad_norm": 2.623734712600708, + "learning_rate": 4.946617731498006e-05, + "loss": 1.2111, + "step": 34400 + }, + { + "epoch": 0.5353900588153138, + "grad_norm": 1.9834754467010498, + "learning_rate": 4.9464625459737116e-05, + "loss": 1.1908, + "step": 34500 + }, + { + "epoch": 0.5369419140582566, + "grad_norm": 2.475658893585205, + "learning_rate": 4.9463073604494174e-05, + "loss": 1.2285, + "step": 34600 + }, + { + "epoch": 0.5384937693011996, + "grad_norm": 2.4245009422302246, + "learning_rate": 4.946152174925123e-05, + "loss": 1.1966, + "step": 34700 + }, + { + "epoch": 0.5400456245441425, + "grad_norm": 3.1218440532684326, + "learning_rate": 4.945996989400829e-05, + "loss": 1.1862, + "step": 34800 + }, + { + "epoch": 0.5415974797870855, + "grad_norm": 2.3402438163757324, + "learning_rate": 4.945841803876535e-05, + "loss": 1.1826, + "step": 34900 + }, + { + "epoch": 0.5431493350300284, + "grad_norm": 2.3297884464263916, + "learning_rate": 4.9456866183522405e-05, + "loss": 1.2174, + "step": 35000 + }, + { + "epoch": 0.5447011902729714, + "grad_norm": 2.5114550590515137, + "learning_rate": 4.945531432827946e-05, + "loss": 1.1835, + "step": 35100 + }, + { + "epoch": 0.5462530455159142, + "grad_norm": 2.144836187362671, + "learning_rate": 4.945376247303652e-05, + "loss": 1.1995, + "step": 35200 + }, + { + "epoch": 0.5478049007588572, + "grad_norm": 2.517759323120117, + "learning_rate": 4.945221061779358e-05, + "loss": 1.1771, + "step": 35300 + }, + { + "epoch": 0.5493567560018001, + "grad_norm": 2.3586151599884033, + "learning_rate": 4.9450658762550636e-05, + "loss": 1.2175, + "step": 35400 + }, + { + "epoch": 0.5509086112447431, + "grad_norm": 2.2889606952667236, + "learning_rate": 4.9449106907307694e-05, + "loss": 1.2047, + "step": 35500 + }, + { + "epoch": 0.552460466487686, + "grad_norm": 2.326240062713623, + "learning_rate": 4.944755505206475e-05, + "loss": 1.2015, + "step": 35600 + }, + { + "epoch": 0.554012321730629, + "grad_norm": 2.626574754714966, + "learning_rate": 4.94460031968218e-05, + "loss": 1.1887, + "step": 35700 + }, + { + "epoch": 0.5555641769735719, + "grad_norm": 2.350536823272705, + "learning_rate": 4.944445134157886e-05, + "loss": 1.1921, + "step": 35800 + }, + { + "epoch": 0.5571160322165148, + "grad_norm": 2.2332732677459717, + "learning_rate": 4.944289948633592e-05, + "loss": 1.1849, + "step": 35900 + }, + { + "epoch": 0.5586678874594578, + "grad_norm": 2.3713724613189697, + "learning_rate": 4.944134763109297e-05, + "loss": 1.2198, + "step": 36000 + }, + { + "epoch": 0.5602197427024007, + "grad_norm": 2.2064812183380127, + "learning_rate": 4.9439795775850027e-05, + "loss": 1.1886, + "step": 36100 + }, + { + "epoch": 0.5617715979453437, + "grad_norm": 2.26542329788208, + "learning_rate": 4.9438243920607084e-05, + "loss": 1.2246, + "step": 36200 + }, + { + "epoch": 0.5633234531882866, + "grad_norm": 2.454099655151367, + "learning_rate": 4.943669206536414e-05, + "loss": 1.1883, + "step": 36300 + }, + { + "epoch": 0.5648753084312296, + "grad_norm": 2.276982545852661, + "learning_rate": 4.94351402101212e-05, + "loss": 1.211, + "step": 36400 + }, + { + "epoch": 0.5664271636741725, + "grad_norm": 5.26003885269165, + "learning_rate": 4.943358835487826e-05, + "loss": 1.2051, + "step": 36500 + }, + { + "epoch": 0.5679790189171154, + "grad_norm": 2.4926297664642334, + "learning_rate": 4.9432036499635315e-05, + "loss": 1.2035, + "step": 36600 + }, + { + "epoch": 0.5695308741600583, + "grad_norm": 2.156940221786499, + "learning_rate": 4.943048464439237e-05, + "loss": 1.1906, + "step": 36700 + }, + { + "epoch": 0.5710827294030013, + "grad_norm": 2.564739465713501, + "learning_rate": 4.942893278914943e-05, + "loss": 1.1878, + "step": 36800 + }, + { + "epoch": 0.5726345846459442, + "grad_norm": 2.4842190742492676, + "learning_rate": 4.942738093390649e-05, + "loss": 1.1958, + "step": 36900 + }, + { + "epoch": 0.5741864398888872, + "grad_norm": 2.535102128982544, + "learning_rate": 4.9425829078663546e-05, + "loss": 1.1939, + "step": 37000 + }, + { + "epoch": 0.5757382951318301, + "grad_norm": 1.9911900758743286, + "learning_rate": 4.9424277223420604e-05, + "loss": 1.2075, + "step": 37100 + }, + { + "epoch": 0.577290150374773, + "grad_norm": 2.1373379230499268, + "learning_rate": 4.9422725368177655e-05, + "loss": 1.1974, + "step": 37200 + }, + { + "epoch": 0.5788420056177159, + "grad_norm": 2.4389595985412598, + "learning_rate": 4.942117351293471e-05, + "loss": 1.1826, + "step": 37300 + }, + { + "epoch": 0.5803938608606589, + "grad_norm": 2.045496940612793, + "learning_rate": 4.941962165769177e-05, + "loss": 1.1752, + "step": 37400 + }, + { + "epoch": 0.5819457161036019, + "grad_norm": 2.981600046157837, + "learning_rate": 4.941806980244883e-05, + "loss": 1.1878, + "step": 37500 + }, + { + "epoch": 0.5834975713465448, + "grad_norm": 1.9566932916641235, + "learning_rate": 4.9416517947205886e-05, + "loss": 1.1831, + "step": 37600 + }, + { + "epoch": 0.5850494265894878, + "grad_norm": 2.4527249336242676, + "learning_rate": 4.9414966091962944e-05, + "loss": 1.1865, + "step": 37700 + }, + { + "epoch": 0.5866012818324307, + "grad_norm": 2.4431509971618652, + "learning_rate": 4.941341423672e-05, + "loss": 1.1936, + "step": 37800 + }, + { + "epoch": 0.5881531370753736, + "grad_norm": 2.659628391265869, + "learning_rate": 4.941186238147706e-05, + "loss": 1.201, + "step": 37900 + }, + { + "epoch": 0.5897049923183165, + "grad_norm": 2.275744915008545, + "learning_rate": 4.941031052623412e-05, + "loss": 1.1947, + "step": 38000 + }, + { + "epoch": 0.5912568475612595, + "grad_norm": 2.5866122245788574, + "learning_rate": 4.9408758670991175e-05, + "loss": 1.199, + "step": 38100 + }, + { + "epoch": 0.5928087028042024, + "grad_norm": 2.1171488761901855, + "learning_rate": 4.940720681574823e-05, + "loss": 1.1872, + "step": 38200 + }, + { + "epoch": 0.5943605580471454, + "grad_norm": 2.8767952919006348, + "learning_rate": 4.940565496050529e-05, + "loss": 1.1998, + "step": 38300 + }, + { + "epoch": 0.5959124132900883, + "grad_norm": 2.4268555641174316, + "learning_rate": 4.940410310526235e-05, + "loss": 1.2072, + "step": 38400 + }, + { + "epoch": 0.5974642685330313, + "grad_norm": 2.5592246055603027, + "learning_rate": 4.94025512500194e-05, + "loss": 1.1669, + "step": 38500 + }, + { + "epoch": 0.5990161237759741, + "grad_norm": 2.657672643661499, + "learning_rate": 4.940099939477646e-05, + "loss": 1.1985, + "step": 38600 + }, + { + "epoch": 0.6005679790189171, + "grad_norm": 2.3190295696258545, + "learning_rate": 4.9399447539533515e-05, + "loss": 1.2081, + "step": 38700 + }, + { + "epoch": 0.60211983426186, + "grad_norm": 2.345186948776245, + "learning_rate": 4.939789568429057e-05, + "loss": 1.182, + "step": 38800 + }, + { + "epoch": 0.603671689504803, + "grad_norm": 1.9601801633834839, + "learning_rate": 4.939634382904763e-05, + "loss": 1.2019, + "step": 38900 + }, + { + "epoch": 0.605223544747746, + "grad_norm": 2.403519630432129, + "learning_rate": 4.939479197380469e-05, + "loss": 1.1669, + "step": 39000 + }, + { + "epoch": 0.6067753999906889, + "grad_norm": 2.4064042568206787, + "learning_rate": 4.9393240118561746e-05, + "loss": 1.2144, + "step": 39100 + }, + { + "epoch": 0.6083272552336318, + "grad_norm": 2.2512192726135254, + "learning_rate": 4.93916882633188e-05, + "loss": 1.1835, + "step": 39200 + }, + { + "epoch": 0.6098791104765747, + "grad_norm": 2.9290380477905273, + "learning_rate": 4.9390136408075854e-05, + "loss": 1.2249, + "step": 39300 + }, + { + "epoch": 0.6114309657195177, + "grad_norm": 2.927769184112549, + "learning_rate": 4.938858455283291e-05, + "loss": 1.1738, + "step": 39400 + }, + { + "epoch": 0.6129828209624606, + "grad_norm": 2.0681636333465576, + "learning_rate": 4.938703269758997e-05, + "loss": 1.1852, + "step": 39500 + }, + { + "epoch": 0.6145346762054036, + "grad_norm": 2.5736312866210938, + "learning_rate": 4.938548084234703e-05, + "loss": 1.1714, + "step": 39600 + }, + { + "epoch": 0.6160865314483465, + "grad_norm": 2.8512165546417236, + "learning_rate": 4.9383928987104085e-05, + "loss": 1.2095, + "step": 39700 + }, + { + "epoch": 0.6176383866912895, + "grad_norm": 2.954066276550293, + "learning_rate": 4.938237713186114e-05, + "loss": 1.17, + "step": 39800 + }, + { + "epoch": 0.6191902419342323, + "grad_norm": 2.868504285812378, + "learning_rate": 4.93808252766182e-05, + "loss": 1.1838, + "step": 39900 + }, + { + "epoch": 0.6207420971771753, + "grad_norm": 2.705115795135498, + "learning_rate": 4.937927342137525e-05, + "loss": 1.1723, + "step": 40000 + }, + { + "epoch": 0.6222939524201182, + "grad_norm": 2.2740418910980225, + "learning_rate": 4.937772156613231e-05, + "loss": 1.1834, + "step": 40100 + }, + { + "epoch": 0.6238458076630612, + "grad_norm": 2.30437970161438, + "learning_rate": 4.937616971088937e-05, + "loss": 1.1938, + "step": 40200 + }, + { + "epoch": 0.6253976629060042, + "grad_norm": 2.5184919834136963, + "learning_rate": 4.9374617855646425e-05, + "loss": 1.1787, + "step": 40300 + }, + { + "epoch": 0.6269495181489471, + "grad_norm": 2.4476919174194336, + "learning_rate": 4.937306600040348e-05, + "loss": 1.1752, + "step": 40400 + }, + { + "epoch": 0.62850137339189, + "grad_norm": 2.5979440212249756, + "learning_rate": 4.937151414516054e-05, + "loss": 1.1858, + "step": 40500 + }, + { + "epoch": 0.6300532286348329, + "grad_norm": 2.6650726795196533, + "learning_rate": 4.93699622899176e-05, + "loss": 1.1962, + "step": 40600 + }, + { + "epoch": 0.6316050838777759, + "grad_norm": 2.5451135635375977, + "learning_rate": 4.9368410434674656e-05, + "loss": 1.1736, + "step": 40700 + }, + { + "epoch": 0.6331569391207188, + "grad_norm": 2.4065561294555664, + "learning_rate": 4.9366858579431714e-05, + "loss": 1.1918, + "step": 40800 + }, + { + "epoch": 0.6347087943636618, + "grad_norm": 2.2645041942596436, + "learning_rate": 4.936530672418877e-05, + "loss": 1.1868, + "step": 40900 + }, + { + "epoch": 0.6362606496066047, + "grad_norm": 2.4546401500701904, + "learning_rate": 4.936375486894583e-05, + "loss": 1.1942, + "step": 41000 + }, + { + "epoch": 0.6378125048495477, + "grad_norm": 2.800821304321289, + "learning_rate": 4.936220301370289e-05, + "loss": 1.2016, + "step": 41100 + }, + { + "epoch": 0.6393643600924905, + "grad_norm": 2.5779294967651367, + "learning_rate": 4.9360651158459945e-05, + "loss": 1.194, + "step": 41200 + }, + { + "epoch": 0.6409162153354335, + "grad_norm": 2.337620258331299, + "learning_rate": 4.9359099303216996e-05, + "loss": 1.185, + "step": 41300 + }, + { + "epoch": 0.6424680705783764, + "grad_norm": 2.5104012489318848, + "learning_rate": 4.9357547447974054e-05, + "loss": 1.1907, + "step": 41400 + }, + { + "epoch": 0.6440199258213194, + "grad_norm": 1.9994335174560547, + "learning_rate": 4.935599559273111e-05, + "loss": 1.1789, + "step": 41500 + }, + { + "epoch": 0.6455717810642623, + "grad_norm": 2.1678597927093506, + "learning_rate": 4.935444373748817e-05, + "loss": 1.2008, + "step": 41600 + }, + { + "epoch": 0.6471236363072053, + "grad_norm": 2.5696334838867188, + "learning_rate": 4.935289188224523e-05, + "loss": 1.1764, + "step": 41700 + }, + { + "epoch": 0.6486754915501483, + "grad_norm": 1.8987754583358765, + "learning_rate": 4.9351340027002285e-05, + "loss": 1.189, + "step": 41800 + }, + { + "epoch": 0.6502273467930911, + "grad_norm": 2.203730583190918, + "learning_rate": 4.934978817175934e-05, + "loss": 1.1696, + "step": 41900 + }, + { + "epoch": 0.6517792020360341, + "grad_norm": 2.6431314945220947, + "learning_rate": 4.93482363165164e-05, + "loss": 1.1753, + "step": 42000 + }, + { + "epoch": 0.653331057278977, + "grad_norm": 2.334282159805298, + "learning_rate": 4.934668446127346e-05, + "loss": 1.1855, + "step": 42100 + }, + { + "epoch": 0.65488291252192, + "grad_norm": 2.236422538757324, + "learning_rate": 4.9345132606030516e-05, + "loss": 1.1647, + "step": 42200 + }, + { + "epoch": 0.6564347677648629, + "grad_norm": 2.5629477500915527, + "learning_rate": 4.934358075078757e-05, + "loss": 1.1844, + "step": 42300 + }, + { + "epoch": 0.6579866230078059, + "grad_norm": 2.0746262073516846, + "learning_rate": 4.934202889554463e-05, + "loss": 1.2069, + "step": 42400 + }, + { + "epoch": 0.6595384782507487, + "grad_norm": 2.7469513416290283, + "learning_rate": 4.934047704030168e-05, + "loss": 1.1948, + "step": 42500 + }, + { + "epoch": 0.6610903334936917, + "grad_norm": 2.3980820178985596, + "learning_rate": 4.933892518505874e-05, + "loss": 1.1709, + "step": 42600 + }, + { + "epoch": 0.6626421887366346, + "grad_norm": 2.0644824504852295, + "learning_rate": 4.93373733298158e-05, + "loss": 1.1578, + "step": 42700 + }, + { + "epoch": 0.6641940439795776, + "grad_norm": 2.0668458938598633, + "learning_rate": 4.9335821474572855e-05, + "loss": 1.1685, + "step": 42800 + }, + { + "epoch": 0.6657458992225205, + "grad_norm": 2.388491153717041, + "learning_rate": 4.9334269619329906e-05, + "loss": 1.2028, + "step": 42900 + }, + { + "epoch": 0.6672977544654635, + "grad_norm": 2.12837815284729, + "learning_rate": 4.9332717764086964e-05, + "loss": 1.1808, + "step": 43000 + }, + { + "epoch": 0.6688496097084063, + "grad_norm": 2.2032430171966553, + "learning_rate": 4.933116590884402e-05, + "loss": 1.2024, + "step": 43100 + }, + { + "epoch": 0.6704014649513493, + "grad_norm": 2.159217119216919, + "learning_rate": 4.932961405360108e-05, + "loss": 1.1655, + "step": 43200 + }, + { + "epoch": 0.6719533201942923, + "grad_norm": 2.3776962757110596, + "learning_rate": 4.932806219835814e-05, + "loss": 1.1871, + "step": 43300 + }, + { + "epoch": 0.6735051754372352, + "grad_norm": 2.2626845836639404, + "learning_rate": 4.9326510343115195e-05, + "loss": 1.1781, + "step": 43400 + }, + { + "epoch": 0.6750570306801782, + "grad_norm": 2.6178290843963623, + "learning_rate": 4.932495848787225e-05, + "loss": 1.1559, + "step": 43500 + }, + { + "epoch": 0.6766088859231211, + "grad_norm": 2.829810857772827, + "learning_rate": 4.932340663262931e-05, + "loss": 1.1873, + "step": 43600 + }, + { + "epoch": 0.6781607411660641, + "grad_norm": 2.2946605682373047, + "learning_rate": 4.932185477738637e-05, + "loss": 1.1635, + "step": 43700 + }, + { + "epoch": 0.6797125964090069, + "grad_norm": 2.151526927947998, + "learning_rate": 4.9320302922143426e-05, + "loss": 1.1584, + "step": 43800 + }, + { + "epoch": 0.6812644516519499, + "grad_norm": 2.657257556915283, + "learning_rate": 4.9318751066900484e-05, + "loss": 1.1814, + "step": 43900 + }, + { + "epoch": 0.6828163068948928, + "grad_norm": 2.409086227416992, + "learning_rate": 4.931719921165754e-05, + "loss": 1.1559, + "step": 44000 + }, + { + "epoch": 0.6843681621378358, + "grad_norm": 2.9715707302093506, + "learning_rate": 4.93156473564146e-05, + "loss": 1.1686, + "step": 44100 + }, + { + "epoch": 0.6859200173807787, + "grad_norm": 2.454049825668335, + "learning_rate": 4.931409550117165e-05, + "loss": 1.1941, + "step": 44200 + }, + { + "epoch": 0.6874718726237217, + "grad_norm": 2.4585883617401123, + "learning_rate": 4.931254364592871e-05, + "loss": 1.1679, + "step": 44300 + }, + { + "epoch": 0.6890237278666645, + "grad_norm": 2.2637085914611816, + "learning_rate": 4.9310991790685766e-05, + "loss": 1.1554, + "step": 44400 + }, + { + "epoch": 0.6905755831096075, + "grad_norm": 2.476701498031616, + "learning_rate": 4.9309439935442824e-05, + "loss": 1.1643, + "step": 44500 + }, + { + "epoch": 0.6921274383525505, + "grad_norm": 2.5342164039611816, + "learning_rate": 4.930788808019988e-05, + "loss": 1.1638, + "step": 44600 + }, + { + "epoch": 0.6936792935954934, + "grad_norm": 2.2393851280212402, + "learning_rate": 4.930633622495694e-05, + "loss": 1.2029, + "step": 44700 + }, + { + "epoch": 0.6952311488384364, + "grad_norm": 1.9192265272140503, + "learning_rate": 4.9304784369714e-05, + "loss": 1.164, + "step": 44800 + }, + { + "epoch": 0.6967830040813793, + "grad_norm": 2.471797227859497, + "learning_rate": 4.9303232514471055e-05, + "loss": 1.192, + "step": 44900 + }, + { + "epoch": 0.6983348593243223, + "grad_norm": 2.543172836303711, + "learning_rate": 4.930168065922811e-05, + "loss": 1.2015, + "step": 45000 + }, + { + "epoch": 0.6998867145672651, + "grad_norm": 2.39199161529541, + "learning_rate": 4.930012880398517e-05, + "loss": 1.1897, + "step": 45100 + }, + { + "epoch": 0.7014385698102081, + "grad_norm": 2.072190761566162, + "learning_rate": 4.929857694874223e-05, + "loss": 1.1632, + "step": 45200 + }, + { + "epoch": 0.702990425053151, + "grad_norm": 2.616067886352539, + "learning_rate": 4.9297025093499286e-05, + "loss": 1.1706, + "step": 45300 + }, + { + "epoch": 0.704542280296094, + "grad_norm": 2.564262628555298, + "learning_rate": 4.929547323825634e-05, + "loss": 1.1801, + "step": 45400 + }, + { + "epoch": 0.7060941355390369, + "grad_norm": 2.7784082889556885, + "learning_rate": 4.9293921383013394e-05, + "loss": 1.1692, + "step": 45500 + }, + { + "epoch": 0.7076459907819799, + "grad_norm": 2.6110994815826416, + "learning_rate": 4.929236952777045e-05, + "loss": 1.1739, + "step": 45600 + }, + { + "epoch": 0.7091978460249228, + "grad_norm": 2.4873533248901367, + "learning_rate": 4.929081767252751e-05, + "loss": 1.1768, + "step": 45700 + }, + { + "epoch": 0.7107497012678657, + "grad_norm": 2.507765293121338, + "learning_rate": 4.928926581728456e-05, + "loss": 1.1507, + "step": 45800 + }, + { + "epoch": 0.7123015565108086, + "grad_norm": 2.48184871673584, + "learning_rate": 4.928771396204162e-05, + "loss": 1.1544, + "step": 45900 + }, + { + "epoch": 0.7138534117537516, + "grad_norm": 2.669926404953003, + "learning_rate": 4.9286162106798676e-05, + "loss": 1.1586, + "step": 46000 + }, + { + "epoch": 0.7154052669966946, + "grad_norm": 2.7033112049102783, + "learning_rate": 4.9284610251555734e-05, + "loss": 1.1434, + "step": 46100 + }, + { + "epoch": 0.7169571222396375, + "grad_norm": 2.89276385307312, + "learning_rate": 4.928305839631279e-05, + "loss": 1.1791, + "step": 46200 + }, + { + "epoch": 0.7185089774825805, + "grad_norm": 2.1775076389312744, + "learning_rate": 4.928150654106985e-05, + "loss": 1.1713, + "step": 46300 + }, + { + "epoch": 0.7200608327255233, + "grad_norm": 1.9711344242095947, + "learning_rate": 4.927995468582691e-05, + "loss": 1.1589, + "step": 46400 + }, + { + "epoch": 0.7216126879684663, + "grad_norm": 2.7642791271209717, + "learning_rate": 4.9278402830583965e-05, + "loss": 1.1848, + "step": 46500 + }, + { + "epoch": 0.7231645432114092, + "grad_norm": 2.3391270637512207, + "learning_rate": 4.927685097534102e-05, + "loss": 1.1541, + "step": 46600 + }, + { + "epoch": 0.7247163984543522, + "grad_norm": 2.115884304046631, + "learning_rate": 4.927529912009808e-05, + "loss": 1.1777, + "step": 46700 + }, + { + "epoch": 0.7262682536972951, + "grad_norm": 2.2775306701660156, + "learning_rate": 4.927374726485514e-05, + "loss": 1.1581, + "step": 46800 + }, + { + "epoch": 0.7278201089402381, + "grad_norm": 2.1826443672180176, + "learning_rate": 4.9272195409612196e-05, + "loss": 1.1632, + "step": 46900 + }, + { + "epoch": 0.729371964183181, + "grad_norm": 2.4894049167633057, + "learning_rate": 4.927064355436925e-05, + "loss": 1.1399, + "step": 47000 + }, + { + "epoch": 0.7309238194261239, + "grad_norm": 2.074448347091675, + "learning_rate": 4.9269091699126305e-05, + "loss": 1.1723, + "step": 47100 + }, + { + "epoch": 0.7324756746690668, + "grad_norm": 2.058256149291992, + "learning_rate": 4.926753984388336e-05, + "loss": 1.154, + "step": 47200 + }, + { + "epoch": 0.7340275299120098, + "grad_norm": 2.3936984539031982, + "learning_rate": 4.926598798864042e-05, + "loss": 1.1684, + "step": 47300 + }, + { + "epoch": 0.7355793851549527, + "grad_norm": 2.1671347618103027, + "learning_rate": 4.926443613339748e-05, + "loss": 1.1699, + "step": 47400 + }, + { + "epoch": 0.7371312403978957, + "grad_norm": 2.8088905811309814, + "learning_rate": 4.9262884278154536e-05, + "loss": 1.1851, + "step": 47500 + }, + { + "epoch": 0.7386830956408387, + "grad_norm": 2.416687488555908, + "learning_rate": 4.9261332422911594e-05, + "loss": 1.1686, + "step": 47600 + }, + { + "epoch": 0.7402349508837816, + "grad_norm": 2.3578453063964844, + "learning_rate": 4.925978056766865e-05, + "loss": 1.1633, + "step": 47700 + }, + { + "epoch": 0.7417868061267245, + "grad_norm": 2.4843294620513916, + "learning_rate": 4.925822871242571e-05, + "loss": 1.193, + "step": 47800 + }, + { + "epoch": 0.7433386613696674, + "grad_norm": 2.8165409564971924, + "learning_rate": 4.925667685718277e-05, + "loss": 1.1511, + "step": 47900 + }, + { + "epoch": 0.7448905166126104, + "grad_norm": 2.834263801574707, + "learning_rate": 4.9255125001939825e-05, + "loss": 1.1584, + "step": 48000 + }, + { + "epoch": 0.7464423718555533, + "grad_norm": 2.8275694847106934, + "learning_rate": 4.925357314669688e-05, + "loss": 1.1521, + "step": 48100 + }, + { + "epoch": 0.7479942270984963, + "grad_norm": 2.2874298095703125, + "learning_rate": 4.925202129145394e-05, + "loss": 1.1639, + "step": 48200 + }, + { + "epoch": 0.7495460823414392, + "grad_norm": 2.553553819656372, + "learning_rate": 4.925046943621099e-05, + "loss": 1.1721, + "step": 48300 + }, + { + "epoch": 0.7510979375843821, + "grad_norm": 2.650235891342163, + "learning_rate": 4.924891758096805e-05, + "loss": 1.1823, + "step": 48400 + }, + { + "epoch": 0.752649792827325, + "grad_norm": 2.454193115234375, + "learning_rate": 4.9247365725725107e-05, + "loss": 1.1605, + "step": 48500 + }, + { + "epoch": 0.754201648070268, + "grad_norm": 1.828253149986267, + "learning_rate": 4.9245813870482164e-05, + "loss": 1.1657, + "step": 48600 + }, + { + "epoch": 0.7557535033132109, + "grad_norm": 2.834845781326294, + "learning_rate": 4.924426201523922e-05, + "loss": 1.164, + "step": 48700 + }, + { + "epoch": 0.7573053585561539, + "grad_norm": 2.4240832328796387, + "learning_rate": 4.924271015999628e-05, + "loss": 1.1809, + "step": 48800 + }, + { + "epoch": 0.7588572137990969, + "grad_norm": 2.1284701824188232, + "learning_rate": 4.924115830475334e-05, + "loss": 1.1779, + "step": 48900 + }, + { + "epoch": 0.7604090690420398, + "grad_norm": 2.523000478744507, + "learning_rate": 4.923960644951039e-05, + "loss": 1.1723, + "step": 49000 + }, + { + "epoch": 0.7619609242849827, + "grad_norm": 3.0821704864501953, + "learning_rate": 4.9238054594267446e-05, + "loss": 1.1751, + "step": 49100 + }, + { + "epoch": 0.7635127795279256, + "grad_norm": 2.4138882160186768, + "learning_rate": 4.9236502739024504e-05, + "loss": 1.1896, + "step": 49200 + }, + { + "epoch": 0.7650646347708686, + "grad_norm": 2.178921699523926, + "learning_rate": 4.923495088378156e-05, + "loss": 1.1525, + "step": 49300 + }, + { + "epoch": 0.7666164900138115, + "grad_norm": 2.6186108589172363, + "learning_rate": 4.923339902853862e-05, + "loss": 1.1462, + "step": 49400 + }, + { + "epoch": 0.7681683452567545, + "grad_norm": 2.2610292434692383, + "learning_rate": 4.923184717329568e-05, + "loss": 1.144, + "step": 49500 + }, + { + "epoch": 0.7697202004996974, + "grad_norm": 2.416614532470703, + "learning_rate": 4.9230295318052735e-05, + "loss": 1.1613, + "step": 49600 + }, + { + "epoch": 0.7712720557426404, + "grad_norm": 2.1591925621032715, + "learning_rate": 4.922874346280979e-05, + "loss": 1.1597, + "step": 49700 + }, + { + "epoch": 0.7728239109855832, + "grad_norm": 2.146529197692871, + "learning_rate": 4.9227191607566844e-05, + "loss": 1.1813, + "step": 49800 + }, + { + "epoch": 0.7743757662285262, + "grad_norm": 2.157388687133789, + "learning_rate": 4.92256397523239e-05, + "loss": 1.1822, + "step": 49900 + }, + { + "epoch": 0.7759276214714691, + "grad_norm": 2.572301149368286, + "learning_rate": 4.922408789708096e-05, + "loss": 1.1529, + "step": 50000 + }, + { + "epoch": 0.7774794767144121, + "grad_norm": 2.306133508682251, + "learning_rate": 4.922253604183802e-05, + "loss": 1.159, + "step": 50100 + }, + { + "epoch": 0.779031331957355, + "grad_norm": 2.453583002090454, + "learning_rate": 4.9220984186595075e-05, + "loss": 1.1693, + "step": 50200 + }, + { + "epoch": 0.780583187200298, + "grad_norm": 2.0289504528045654, + "learning_rate": 4.921943233135213e-05, + "loss": 1.1689, + "step": 50300 + }, + { + "epoch": 0.782135042443241, + "grad_norm": 1.9087867736816406, + "learning_rate": 4.921788047610919e-05, + "loss": 1.1394, + "step": 50400 + }, + { + "epoch": 0.7836868976861838, + "grad_norm": 2.9248099327087402, + "learning_rate": 4.921632862086625e-05, + "loss": 1.2043, + "step": 50500 + }, + { + "epoch": 0.7852387529291268, + "grad_norm": 2.4978187084198, + "learning_rate": 4.9214776765623306e-05, + "loss": 1.1652, + "step": 50600 + }, + { + "epoch": 0.7867906081720697, + "grad_norm": 2.174706220626831, + "learning_rate": 4.9213224910380364e-05, + "loss": 1.1464, + "step": 50700 + }, + { + "epoch": 0.7883424634150127, + "grad_norm": 2.3342819213867188, + "learning_rate": 4.921167305513742e-05, + "loss": 1.1577, + "step": 50800 + }, + { + "epoch": 0.7898943186579556, + "grad_norm": 2.4408538341522217, + "learning_rate": 4.921012119989448e-05, + "loss": 1.1765, + "step": 50900 + }, + { + "epoch": 0.7914461739008986, + "grad_norm": 2.0617897510528564, + "learning_rate": 4.920856934465154e-05, + "loss": 1.1725, + "step": 51000 + }, + { + "epoch": 0.7929980291438414, + "grad_norm": 2.025838613510132, + "learning_rate": 4.920701748940859e-05, + "loss": 1.1499, + "step": 51100 + }, + { + "epoch": 0.7945498843867844, + "grad_norm": 2.2229199409484863, + "learning_rate": 4.9205465634165645e-05, + "loss": 1.1594, + "step": 51200 + }, + { + "epoch": 0.7961017396297273, + "grad_norm": 2.204599618911743, + "learning_rate": 4.92039137789227e-05, + "loss": 1.1582, + "step": 51300 + }, + { + "epoch": 0.7976535948726703, + "grad_norm": 2.3283772468566895, + "learning_rate": 4.920236192367976e-05, + "loss": 1.1775, + "step": 51400 + }, + { + "epoch": 0.7992054501156132, + "grad_norm": 2.6537115573883057, + "learning_rate": 4.920081006843682e-05, + "loss": 1.1585, + "step": 51500 + }, + { + "epoch": 0.8007573053585562, + "grad_norm": 2.701730966567993, + "learning_rate": 4.9199258213193877e-05, + "loss": 1.1458, + "step": 51600 + }, + { + "epoch": 0.802309160601499, + "grad_norm": 2.5185375213623047, + "learning_rate": 4.9197706357950934e-05, + "loss": 1.1652, + "step": 51700 + }, + { + "epoch": 0.803861015844442, + "grad_norm": 2.5204620361328125, + "learning_rate": 4.919615450270799e-05, + "loss": 1.1552, + "step": 51800 + }, + { + "epoch": 0.805412871087385, + "grad_norm": 2.1909873485565186, + "learning_rate": 4.919460264746505e-05, + "loss": 1.1573, + "step": 51900 + }, + { + "epoch": 0.8069647263303279, + "grad_norm": 2.4740612506866455, + "learning_rate": 4.919305079222211e-05, + "loss": 1.1607, + "step": 52000 + }, + { + "epoch": 0.8085165815732709, + "grad_norm": 2.4286158084869385, + "learning_rate": 4.9191498936979165e-05, + "loss": 1.1464, + "step": 52100 + }, + { + "epoch": 0.8100684368162138, + "grad_norm": 3.082249402999878, + "learning_rate": 4.918994708173622e-05, + "loss": 1.1656, + "step": 52200 + }, + { + "epoch": 0.8116202920591568, + "grad_norm": 2.3874146938323975, + "learning_rate": 4.9188395226493274e-05, + "loss": 1.1537, + "step": 52300 + }, + { + "epoch": 0.8131721473020996, + "grad_norm": 1.971757411956787, + "learning_rate": 4.918684337125033e-05, + "loss": 1.147, + "step": 52400 + }, + { + "epoch": 0.8147240025450426, + "grad_norm": 2.264465808868408, + "learning_rate": 4.918529151600739e-05, + "loss": 1.1783, + "step": 52500 + }, + { + "epoch": 0.8162758577879855, + "grad_norm": 2.1094117164611816, + "learning_rate": 4.918373966076445e-05, + "loss": 1.1577, + "step": 52600 + }, + { + "epoch": 0.8178277130309285, + "grad_norm": 2.7979469299316406, + "learning_rate": 4.91821878055215e-05, + "loss": 1.1858, + "step": 52700 + }, + { + "epoch": 0.8193795682738714, + "grad_norm": 2.128664255142212, + "learning_rate": 4.9180635950278556e-05, + "loss": 1.1684, + "step": 52800 + }, + { + "epoch": 0.8209314235168144, + "grad_norm": 2.249389886856079, + "learning_rate": 4.9179084095035614e-05, + "loss": 1.1516, + "step": 52900 + }, + { + "epoch": 0.8224832787597572, + "grad_norm": 2.815535306930542, + "learning_rate": 4.917753223979267e-05, + "loss": 1.1731, + "step": 53000 + }, + { + "epoch": 0.8240351340027002, + "grad_norm": 2.402355670928955, + "learning_rate": 4.917598038454973e-05, + "loss": 1.1582, + "step": 53100 + }, + { + "epoch": 0.8255869892456432, + "grad_norm": 2.3408007621765137, + "learning_rate": 4.917442852930679e-05, + "loss": 1.1628, + "step": 53200 + }, + { + "epoch": 0.8271388444885861, + "grad_norm": 1.9599260091781616, + "learning_rate": 4.9172876674063845e-05, + "loss": 1.1622, + "step": 53300 + }, + { + "epoch": 0.8286906997315291, + "grad_norm": 2.192831039428711, + "learning_rate": 4.91713248188209e-05, + "loss": 1.1531, + "step": 53400 + }, + { + "epoch": 0.830242554974472, + "grad_norm": 2.5755300521850586, + "learning_rate": 4.916977296357796e-05, + "loss": 1.1222, + "step": 53500 + }, + { + "epoch": 0.831794410217415, + "grad_norm": 2.547351598739624, + "learning_rate": 4.916822110833502e-05, + "loss": 1.1501, + "step": 53600 + }, + { + "epoch": 0.8333462654603578, + "grad_norm": 2.4684717655181885, + "learning_rate": 4.9166669253092076e-05, + "loss": 1.1566, + "step": 53700 + }, + { + "epoch": 0.8348981207033008, + "grad_norm": 2.467470169067383, + "learning_rate": 4.9165117397849134e-05, + "loss": 1.1623, + "step": 53800 + }, + { + "epoch": 0.8364499759462437, + "grad_norm": 2.0983071327209473, + "learning_rate": 4.916356554260619e-05, + "loss": 1.1362, + "step": 53900 + }, + { + "epoch": 0.8380018311891867, + "grad_norm": 2.5323948860168457, + "learning_rate": 4.916201368736324e-05, + "loss": 1.154, + "step": 54000 + }, + { + "epoch": 0.8395536864321296, + "grad_norm": 2.034572124481201, + "learning_rate": 4.91604618321203e-05, + "loss": 1.1444, + "step": 54100 + }, + { + "epoch": 0.8411055416750726, + "grad_norm": 2.1224417686462402, + "learning_rate": 4.915890997687736e-05, + "loss": 1.154, + "step": 54200 + }, + { + "epoch": 0.8426573969180154, + "grad_norm": 3.1934731006622314, + "learning_rate": 4.9157358121634415e-05, + "loss": 1.1572, + "step": 54300 + }, + { + "epoch": 0.8442092521609584, + "grad_norm": 2.6752889156341553, + "learning_rate": 4.915580626639147e-05, + "loss": 1.1566, + "step": 54400 + }, + { + "epoch": 0.8457611074039013, + "grad_norm": 2.5010483264923096, + "learning_rate": 4.915425441114853e-05, + "loss": 1.137, + "step": 54500 + }, + { + "epoch": 0.8473129626468443, + "grad_norm": 2.677424669265747, + "learning_rate": 4.915270255590559e-05, + "loss": 1.1516, + "step": 54600 + }, + { + "epoch": 0.8488648178897873, + "grad_norm": 2.1494462490081787, + "learning_rate": 4.9151150700662647e-05, + "loss": 1.1595, + "step": 54700 + }, + { + "epoch": 0.8504166731327302, + "grad_norm": 2.7015750408172607, + "learning_rate": 4.9149598845419704e-05, + "loss": 1.1254, + "step": 54800 + }, + { + "epoch": 0.8519685283756732, + "grad_norm": 2.3260157108306885, + "learning_rate": 4.914804699017676e-05, + "loss": 1.1585, + "step": 54900 + }, + { + "epoch": 0.853520383618616, + "grad_norm": 2.4272515773773193, + "learning_rate": 4.914649513493382e-05, + "loss": 1.1311, + "step": 55000 + }, + { + "epoch": 0.855072238861559, + "grad_norm": 2.377215623855591, + "learning_rate": 4.914494327969088e-05, + "loss": 1.124, + "step": 55100 + }, + { + "epoch": 0.8566240941045019, + "grad_norm": 2.164163827896118, + "learning_rate": 4.9143391424447935e-05, + "loss": 1.1517, + "step": 55200 + }, + { + "epoch": 0.8581759493474449, + "grad_norm": 2.0977590084075928, + "learning_rate": 4.9141839569204986e-05, + "loss": 1.1502, + "step": 55300 + }, + { + "epoch": 0.8597278045903878, + "grad_norm": 2.5076000690460205, + "learning_rate": 4.9140287713962044e-05, + "loss": 1.1643, + "step": 55400 + }, + { + "epoch": 0.8612796598333308, + "grad_norm": 2.3000075817108154, + "learning_rate": 4.91387358587191e-05, + "loss": 1.1459, + "step": 55500 + }, + { + "epoch": 0.8628315150762736, + "grad_norm": 2.194390296936035, + "learning_rate": 4.913718400347615e-05, + "loss": 1.1429, + "step": 55600 + }, + { + "epoch": 0.8643833703192166, + "grad_norm": 2.2268261909484863, + "learning_rate": 4.913563214823321e-05, + "loss": 1.1535, + "step": 55700 + }, + { + "epoch": 0.8659352255621595, + "grad_norm": 2.369276285171509, + "learning_rate": 4.913408029299027e-05, + "loss": 1.1481, + "step": 55800 + }, + { + "epoch": 0.8674870808051025, + "grad_norm": 2.3929903507232666, + "learning_rate": 4.9132528437747326e-05, + "loss": 1.1272, + "step": 55900 + }, + { + "epoch": 0.8690389360480454, + "grad_norm": 2.2794313430786133, + "learning_rate": 4.9130976582504384e-05, + "loss": 1.1344, + "step": 56000 + }, + { + "epoch": 0.8705907912909884, + "grad_norm": 2.3763296604156494, + "learning_rate": 4.912942472726144e-05, + "loss": 1.1352, + "step": 56100 + }, + { + "epoch": 0.8721426465339314, + "grad_norm": 1.988471508026123, + "learning_rate": 4.91278728720185e-05, + "loss": 1.1746, + "step": 56200 + }, + { + "epoch": 0.8736945017768742, + "grad_norm": 2.535183906555176, + "learning_rate": 4.912632101677556e-05, + "loss": 1.1507, + "step": 56300 + }, + { + "epoch": 0.8752463570198172, + "grad_norm": 2.1881957054138184, + "learning_rate": 4.9124769161532615e-05, + "loss": 1.1351, + "step": 56400 + }, + { + "epoch": 0.8767982122627601, + "grad_norm": 2.5924620628356934, + "learning_rate": 4.912321730628967e-05, + "loss": 1.1247, + "step": 56500 + }, + { + "epoch": 0.8783500675057031, + "grad_norm": 2.0894575119018555, + "learning_rate": 4.912166545104673e-05, + "loss": 1.1498, + "step": 56600 + }, + { + "epoch": 0.879901922748646, + "grad_norm": 2.1576178073883057, + "learning_rate": 4.912011359580379e-05, + "loss": 1.1684, + "step": 56700 + }, + { + "epoch": 0.881453777991589, + "grad_norm": 2.6149630546569824, + "learning_rate": 4.911856174056084e-05, + "loss": 1.1556, + "step": 56800 + }, + { + "epoch": 0.8830056332345319, + "grad_norm": 2.472132682800293, + "learning_rate": 4.91170098853179e-05, + "loss": 1.1493, + "step": 56900 + }, + { + "epoch": 0.8845574884774748, + "grad_norm": 2.693777322769165, + "learning_rate": 4.9115458030074954e-05, + "loss": 1.1607, + "step": 57000 + }, + { + "epoch": 0.8861093437204177, + "grad_norm": 2.4241716861724854, + "learning_rate": 4.911390617483201e-05, + "loss": 1.157, + "step": 57100 + }, + { + "epoch": 0.8876611989633607, + "grad_norm": 2.381190061569214, + "learning_rate": 4.911235431958907e-05, + "loss": 1.1493, + "step": 57200 + }, + { + "epoch": 0.8892130542063036, + "grad_norm": 2.6688320636749268, + "learning_rate": 4.911080246434613e-05, + "loss": 1.1479, + "step": 57300 + }, + { + "epoch": 0.8907649094492466, + "grad_norm": 2.5460402965545654, + "learning_rate": 4.9109250609103185e-05, + "loss": 1.1582, + "step": 57400 + }, + { + "epoch": 0.8923167646921896, + "grad_norm": 2.1748390197753906, + "learning_rate": 4.910769875386024e-05, + "loss": 1.156, + "step": 57500 + }, + { + "epoch": 0.8938686199351324, + "grad_norm": 2.1711413860321045, + "learning_rate": 4.91061468986173e-05, + "loss": 1.1302, + "step": 57600 + }, + { + "epoch": 0.8954204751780754, + "grad_norm": 2.1466925144195557, + "learning_rate": 4.910459504337436e-05, + "loss": 1.1266, + "step": 57700 + }, + { + "epoch": 0.8969723304210183, + "grad_norm": 2.3548076152801514, + "learning_rate": 4.9103043188131417e-05, + "loss": 1.1514, + "step": 57800 + }, + { + "epoch": 0.8985241856639613, + "grad_norm": 2.4585494995117188, + "learning_rate": 4.9101491332888474e-05, + "loss": 1.1548, + "step": 57900 + }, + { + "epoch": 0.9000760409069042, + "grad_norm": 2.0424818992614746, + "learning_rate": 4.909993947764553e-05, + "loss": 1.1562, + "step": 58000 + }, + { + "epoch": 0.9016278961498472, + "grad_norm": 2.4888811111450195, + "learning_rate": 4.909838762240258e-05, + "loss": 1.1365, + "step": 58100 + }, + { + "epoch": 0.90317975139279, + "grad_norm": 2.359149694442749, + "learning_rate": 4.909683576715964e-05, + "loss": 1.1652, + "step": 58200 + }, + { + "epoch": 0.904731606635733, + "grad_norm": 2.323132038116455, + "learning_rate": 4.90952839119167e-05, + "loss": 1.1558, + "step": 58300 + }, + { + "epoch": 0.9062834618786759, + "grad_norm": 2.355215311050415, + "learning_rate": 4.9093732056673756e-05, + "loss": 1.1366, + "step": 58400 + }, + { + "epoch": 0.9078353171216189, + "grad_norm": 2.5962116718292236, + "learning_rate": 4.9092180201430814e-05, + "loss": 1.1229, + "step": 58500 + }, + { + "epoch": 0.9093871723645618, + "grad_norm": 2.4846644401550293, + "learning_rate": 4.909062834618787e-05, + "loss": 1.1543, + "step": 58600 + }, + { + "epoch": 0.9109390276075048, + "grad_norm": 1.9102394580841064, + "learning_rate": 4.908907649094493e-05, + "loss": 1.1381, + "step": 58700 + }, + { + "epoch": 0.9124908828504477, + "grad_norm": 2.333954095840454, + "learning_rate": 4.908752463570198e-05, + "loss": 1.1396, + "step": 58800 + }, + { + "epoch": 0.9140427380933906, + "grad_norm": 2.653343915939331, + "learning_rate": 4.908597278045904e-05, + "loss": 1.1265, + "step": 58900 + }, + { + "epoch": 0.9155945933363336, + "grad_norm": 2.3244781494140625, + "learning_rate": 4.9084420925216096e-05, + "loss": 1.1486, + "step": 59000 + }, + { + "epoch": 0.9171464485792765, + "grad_norm": 2.522923469543457, + "learning_rate": 4.9082869069973154e-05, + "loss": 1.1305, + "step": 59100 + }, + { + "epoch": 0.9186983038222195, + "grad_norm": 2.374894618988037, + "learning_rate": 4.908131721473021e-05, + "loss": 1.134, + "step": 59200 + }, + { + "epoch": 0.9202501590651624, + "grad_norm": 2.0529372692108154, + "learning_rate": 4.907976535948727e-05, + "loss": 1.1355, + "step": 59300 + }, + { + "epoch": 0.9218020143081054, + "grad_norm": 2.5219955444335938, + "learning_rate": 4.907821350424433e-05, + "loss": 1.1363, + "step": 59400 + }, + { + "epoch": 0.9233538695510483, + "grad_norm": 2.275097370147705, + "learning_rate": 4.9076661649001385e-05, + "loss": 1.1624, + "step": 59500 + }, + { + "epoch": 0.9249057247939912, + "grad_norm": 2.720305919647217, + "learning_rate": 4.907510979375844e-05, + "loss": 1.1596, + "step": 59600 + }, + { + "epoch": 0.9264575800369341, + "grad_norm": 2.3368608951568604, + "learning_rate": 4.9073557938515493e-05, + "loss": 1.1253, + "step": 59700 + }, + { + "epoch": 0.9280094352798771, + "grad_norm": 2.565573215484619, + "learning_rate": 4.907200608327255e-05, + "loss": 1.1385, + "step": 59800 + }, + { + "epoch": 0.92956129052282, + "grad_norm": 2.826049327850342, + "learning_rate": 4.907045422802961e-05, + "loss": 1.1543, + "step": 59900 + }, + { + "epoch": 0.931113145765763, + "grad_norm": 2.1456282138824463, + "learning_rate": 4.906890237278667e-05, + "loss": 1.1624, + "step": 60000 + }, + { + "epoch": 0.9326650010087059, + "grad_norm": 1.9751325845718384, + "learning_rate": 4.9067350517543724e-05, + "loss": 1.1451, + "step": 60100 + }, + { + "epoch": 0.9342168562516489, + "grad_norm": 2.428513526916504, + "learning_rate": 4.906579866230078e-05, + "loss": 1.1496, + "step": 60200 + }, + { + "epoch": 0.9357687114945917, + "grad_norm": 2.0277485847473145, + "learning_rate": 4.906424680705784e-05, + "loss": 1.1404, + "step": 60300 + }, + { + "epoch": 0.9373205667375347, + "grad_norm": 2.4008748531341553, + "learning_rate": 4.90626949518149e-05, + "loss": 1.1345, + "step": 60400 + }, + { + "epoch": 0.9388724219804777, + "grad_norm": 2.1889076232910156, + "learning_rate": 4.9061143096571955e-05, + "loss": 1.1351, + "step": 60500 + }, + { + "epoch": 0.9404242772234206, + "grad_norm": 2.0435073375701904, + "learning_rate": 4.905959124132901e-05, + "loss": 1.1466, + "step": 60600 + }, + { + "epoch": 0.9419761324663636, + "grad_norm": 2.1171984672546387, + "learning_rate": 4.905803938608607e-05, + "loss": 1.117, + "step": 60700 + }, + { + "epoch": 0.9435279877093065, + "grad_norm": 2.4131886959075928, + "learning_rate": 4.905648753084313e-05, + "loss": 1.1528, + "step": 60800 + }, + { + "epoch": 0.9450798429522494, + "grad_norm": 2.251917839050293, + "learning_rate": 4.9054935675600187e-05, + "loss": 1.1577, + "step": 60900 + }, + { + "epoch": 0.9466316981951923, + "grad_norm": 2.514467716217041, + "learning_rate": 4.905338382035724e-05, + "loss": 1.1501, + "step": 61000 + }, + { + "epoch": 0.9481835534381353, + "grad_norm": 2.3124213218688965, + "learning_rate": 4.9051831965114295e-05, + "loss": 1.1464, + "step": 61100 + }, + { + "epoch": 0.9497354086810782, + "grad_norm": 2.8343303203582764, + "learning_rate": 4.905028010987135e-05, + "loss": 1.1506, + "step": 61200 + }, + { + "epoch": 0.9512872639240212, + "grad_norm": 2.353212356567383, + "learning_rate": 4.904872825462841e-05, + "loss": 1.1598, + "step": 61300 + }, + { + "epoch": 0.9528391191669641, + "grad_norm": 2.13781476020813, + "learning_rate": 4.904717639938547e-05, + "loss": 1.1614, + "step": 61400 + }, + { + "epoch": 0.9543909744099071, + "grad_norm": 2.4410207271575928, + "learning_rate": 4.9045624544142526e-05, + "loss": 1.1465, + "step": 61500 + }, + { + "epoch": 0.9559428296528499, + "grad_norm": 2.4736881256103516, + "learning_rate": 4.9044072688899584e-05, + "loss": 1.142, + "step": 61600 + }, + { + "epoch": 0.9574946848957929, + "grad_norm": 2.5588748455047607, + "learning_rate": 4.904252083365664e-05, + "loss": 1.142, + "step": 61700 + }, + { + "epoch": 0.9590465401387358, + "grad_norm": 2.3748042583465576, + "learning_rate": 4.90409689784137e-05, + "loss": 1.1198, + "step": 61800 + }, + { + "epoch": 0.9605983953816788, + "grad_norm": 2.3566460609436035, + "learning_rate": 4.903941712317076e-05, + "loss": 1.1478, + "step": 61900 + }, + { + "epoch": 0.9621502506246218, + "grad_norm": 2.16545033454895, + "learning_rate": 4.9037865267927815e-05, + "loss": 1.1251, + "step": 62000 + }, + { + "epoch": 0.9637021058675647, + "grad_norm": 2.2538864612579346, + "learning_rate": 4.9036313412684866e-05, + "loss": 1.1182, + "step": 62100 + }, + { + "epoch": 0.9652539611105077, + "grad_norm": 2.1791608333587646, + "learning_rate": 4.9034761557441924e-05, + "loss": 1.1415, + "step": 62200 + }, + { + "epoch": 0.9668058163534505, + "grad_norm": 2.3963756561279297, + "learning_rate": 4.903320970219898e-05, + "loss": 1.1554, + "step": 62300 + }, + { + "epoch": 0.9683576715963935, + "grad_norm": 2.318190813064575, + "learning_rate": 4.903165784695604e-05, + "loss": 1.1583, + "step": 62400 + }, + { + "epoch": 0.9699095268393364, + "grad_norm": 2.463308572769165, + "learning_rate": 4.903010599171309e-05, + "loss": 1.1356, + "step": 62500 + }, + { + "epoch": 0.9714613820822794, + "grad_norm": 2.5972485542297363, + "learning_rate": 4.902855413647015e-05, + "loss": 1.1615, + "step": 62600 + }, + { + "epoch": 0.9730132373252223, + "grad_norm": 2.307711362838745, + "learning_rate": 4.9027002281227206e-05, + "loss": 1.136, + "step": 62700 + }, + { + "epoch": 0.9745650925681653, + "grad_norm": 2.5087623596191406, + "learning_rate": 4.9025450425984263e-05, + "loss": 1.1111, + "step": 62800 + }, + { + "epoch": 0.9761169478111081, + "grad_norm": 2.0486855506896973, + "learning_rate": 4.902389857074132e-05, + "loss": 1.1293, + "step": 62900 + }, + { + "epoch": 0.9776688030540511, + "grad_norm": 2.275099039077759, + "learning_rate": 4.902234671549838e-05, + "loss": 1.1552, + "step": 63000 + }, + { + "epoch": 0.979220658296994, + "grad_norm": 2.1463348865509033, + "learning_rate": 4.902079486025544e-05, + "loss": 1.1491, + "step": 63100 + }, + { + "epoch": 0.980772513539937, + "grad_norm": 2.0790605545043945, + "learning_rate": 4.9019243005012494e-05, + "loss": 1.1463, + "step": 63200 + }, + { + "epoch": 0.98232436878288, + "grad_norm": 2.1598682403564453, + "learning_rate": 4.901769114976955e-05, + "loss": 1.15, + "step": 63300 + }, + { + "epoch": 0.9838762240258229, + "grad_norm": 2.1825709342956543, + "learning_rate": 4.901613929452661e-05, + "loss": 1.1327, + "step": 63400 + }, + { + "epoch": 0.9854280792687659, + "grad_norm": 2.0131757259368896, + "learning_rate": 4.901458743928367e-05, + "loss": 1.1529, + "step": 63500 + }, + { + "epoch": 0.9869799345117087, + "grad_norm": 2.467144012451172, + "learning_rate": 4.9013035584040725e-05, + "loss": 1.148, + "step": 63600 + }, + { + "epoch": 0.9885317897546517, + "grad_norm": 2.1347553730010986, + "learning_rate": 4.901148372879778e-05, + "loss": 1.1521, + "step": 63700 + }, + { + "epoch": 0.9900836449975946, + "grad_norm": 1.8552659749984741, + "learning_rate": 4.9009931873554834e-05, + "loss": 1.1246, + "step": 63800 + }, + { + "epoch": 0.9916355002405376, + "grad_norm": 2.1700820922851562, + "learning_rate": 4.900838001831189e-05, + "loss": 1.1601, + "step": 63900 + }, + { + "epoch": 0.9931873554834805, + "grad_norm": 2.525789499282837, + "learning_rate": 4.900682816306895e-05, + "loss": 1.1187, + "step": 64000 + }, + { + "epoch": 0.9947392107264235, + "grad_norm": 2.2519590854644775, + "learning_rate": 4.900527630782601e-05, + "loss": 1.1405, + "step": 64100 + }, + { + "epoch": 0.9962910659693663, + "grad_norm": 2.1992874145507812, + "learning_rate": 4.9003724452583065e-05, + "loss": 1.1021, + "step": 64200 + }, + { + "epoch": 0.9978429212123093, + "grad_norm": 2.641643762588501, + "learning_rate": 4.900217259734012e-05, + "loss": 1.1421, + "step": 64300 + }, + { + "epoch": 0.9993947764552522, + "grad_norm": 2.3939990997314453, + "learning_rate": 4.900062074209718e-05, + "loss": 1.1421, + "step": 64400 + }, + { + "epoch": 1.0009466316981952, + "grad_norm": 2.475574016571045, + "learning_rate": 4.899906888685424e-05, + "loss": 1.1344, + "step": 64500 + }, + { + "epoch": 1.0024984869411382, + "grad_norm": 2.404494047164917, + "learning_rate": 4.8997517031611296e-05, + "loss": 1.1433, + "step": 64600 + }, + { + "epoch": 1.0040503421840812, + "grad_norm": 2.2565083503723145, + "learning_rate": 4.8995965176368354e-05, + "loss": 1.1195, + "step": 64700 + }, + { + "epoch": 1.005602197427024, + "grad_norm": 2.1716673374176025, + "learning_rate": 4.899441332112541e-05, + "loss": 1.126, + "step": 64800 + }, + { + "epoch": 1.007154052669967, + "grad_norm": 2.1893420219421387, + "learning_rate": 4.899286146588247e-05, + "loss": 1.1386, + "step": 64900 + }, + { + "epoch": 1.00870590791291, + "grad_norm": 2.226348876953125, + "learning_rate": 4.899130961063953e-05, + "loss": 1.1257, + "step": 65000 + }, + { + "epoch": 1.010257763155853, + "grad_norm": 2.3709893226623535, + "learning_rate": 4.898975775539658e-05, + "loss": 1.1219, + "step": 65100 + }, + { + "epoch": 1.0118096183987957, + "grad_norm": 2.157656669616699, + "learning_rate": 4.8988205900153636e-05, + "loss": 1.1369, + "step": 65200 + }, + { + "epoch": 1.0133614736417387, + "grad_norm": 2.7935526371002197, + "learning_rate": 4.898665404491069e-05, + "loss": 1.1407, + "step": 65300 + }, + { + "epoch": 1.0149133288846817, + "grad_norm": 2.5528039932250977, + "learning_rate": 4.8985102189667745e-05, + "loss": 1.1347, + "step": 65400 + }, + { + "epoch": 1.0164651841276247, + "grad_norm": 2.5023515224456787, + "learning_rate": 4.89835503344248e-05, + "loss": 1.1333, + "step": 65500 + }, + { + "epoch": 1.0180170393705674, + "grad_norm": 2.134716272354126, + "learning_rate": 4.898199847918186e-05, + "loss": 1.1342, + "step": 65600 + }, + { + "epoch": 1.0195688946135104, + "grad_norm": 2.233942747116089, + "learning_rate": 4.898044662393892e-05, + "loss": 1.1213, + "step": 65700 + }, + { + "epoch": 1.0211207498564534, + "grad_norm": 2.079174280166626, + "learning_rate": 4.8978894768695976e-05, + "loss": 1.1175, + "step": 65800 + }, + { + "epoch": 1.0226726050993964, + "grad_norm": 2.600405693054199, + "learning_rate": 4.8977342913453033e-05, + "loss": 1.124, + "step": 65900 + }, + { + "epoch": 1.0242244603423392, + "grad_norm": 2.2721426486968994, + "learning_rate": 4.897579105821009e-05, + "loss": 1.1279, + "step": 66000 + }, + { + "epoch": 1.0257763155852822, + "grad_norm": 2.5375139713287354, + "learning_rate": 4.897423920296715e-05, + "loss": 1.1233, + "step": 66100 + }, + { + "epoch": 1.0273281708282251, + "grad_norm": 2.007460355758667, + "learning_rate": 4.897268734772421e-05, + "loss": 1.1239, + "step": 66200 + }, + { + "epoch": 1.0288800260711681, + "grad_norm": 2.4504077434539795, + "learning_rate": 4.8971135492481264e-05, + "loss": 1.1412, + "step": 66300 + }, + { + "epoch": 1.0304318813141111, + "grad_norm": 2.283846616744995, + "learning_rate": 4.896958363723832e-05, + "loss": 1.1463, + "step": 66400 + }, + { + "epoch": 1.0319837365570539, + "grad_norm": 2.5684947967529297, + "learning_rate": 4.896803178199538e-05, + "loss": 1.1288, + "step": 66500 + }, + { + "epoch": 1.0335355917999969, + "grad_norm": 2.768786907196045, + "learning_rate": 4.896647992675243e-05, + "loss": 1.1344, + "step": 66600 + }, + { + "epoch": 1.0350874470429399, + "grad_norm": 2.195347309112549, + "learning_rate": 4.896492807150949e-05, + "loss": 1.1032, + "step": 66700 + }, + { + "epoch": 1.0366393022858829, + "grad_norm": 2.4260852336883545, + "learning_rate": 4.8963376216266546e-05, + "loss": 1.1314, + "step": 66800 + }, + { + "epoch": 1.0381911575288256, + "grad_norm": 2.155688524246216, + "learning_rate": 4.8961824361023604e-05, + "loss": 1.1276, + "step": 66900 + }, + { + "epoch": 1.0397430127717686, + "grad_norm": 2.5889296531677246, + "learning_rate": 4.896027250578066e-05, + "loss": 1.1214, + "step": 67000 + }, + { + "epoch": 1.0412948680147116, + "grad_norm": 1.9910494089126587, + "learning_rate": 4.895872065053772e-05, + "loss": 1.1284, + "step": 67100 + }, + { + "epoch": 1.0428467232576546, + "grad_norm": 2.4980008602142334, + "learning_rate": 4.895716879529478e-05, + "loss": 1.1312, + "step": 67200 + }, + { + "epoch": 1.0443985785005974, + "grad_norm": 2.067190647125244, + "learning_rate": 4.8955616940051835e-05, + "loss": 1.1239, + "step": 67300 + }, + { + "epoch": 1.0459504337435404, + "grad_norm": 2.1788415908813477, + "learning_rate": 4.895406508480889e-05, + "loss": 1.1316, + "step": 67400 + }, + { + "epoch": 1.0475022889864833, + "grad_norm": 2.1426210403442383, + "learning_rate": 4.895251322956595e-05, + "loss": 1.1665, + "step": 67500 + }, + { + "epoch": 1.0490541442294263, + "grad_norm": 2.366445779800415, + "learning_rate": 4.895096137432301e-05, + "loss": 1.1554, + "step": 67600 + }, + { + "epoch": 1.0506059994723693, + "grad_norm": 2.381641149520874, + "learning_rate": 4.8949409519080066e-05, + "loss": 1.1363, + "step": 67700 + }, + { + "epoch": 1.052157854715312, + "grad_norm": 2.5074310302734375, + "learning_rate": 4.8947857663837124e-05, + "loss": 1.127, + "step": 67800 + }, + { + "epoch": 1.053709709958255, + "grad_norm": 2.143097162246704, + "learning_rate": 4.8946305808594175e-05, + "loss": 1.1321, + "step": 67900 + }, + { + "epoch": 1.055261565201198, + "grad_norm": 2.051079511642456, + "learning_rate": 4.894475395335123e-05, + "loss": 1.1248, + "step": 68000 + }, + { + "epoch": 1.056813420444141, + "grad_norm": 2.2431132793426514, + "learning_rate": 4.894320209810829e-05, + "loss": 1.1023, + "step": 68100 + }, + { + "epoch": 1.0583652756870838, + "grad_norm": 2.1707863807678223, + "learning_rate": 4.894165024286535e-05, + "loss": 1.1428, + "step": 68200 + }, + { + "epoch": 1.0599171309300268, + "grad_norm": 2.3224987983703613, + "learning_rate": 4.8940098387622406e-05, + "loss": 1.1147, + "step": 68300 + }, + { + "epoch": 1.0614689861729698, + "grad_norm": 2.5109481811523438, + "learning_rate": 4.8938546532379464e-05, + "loss": 1.1263, + "step": 68400 + }, + { + "epoch": 1.0630208414159128, + "grad_norm": 2.5767602920532227, + "learning_rate": 4.893699467713652e-05, + "loss": 1.1385, + "step": 68500 + }, + { + "epoch": 1.0645726966588556, + "grad_norm": 2.0844085216522217, + "learning_rate": 4.893544282189357e-05, + "loss": 1.131, + "step": 68600 + }, + { + "epoch": 1.0661245519017986, + "grad_norm": 2.0231311321258545, + "learning_rate": 4.893389096665063e-05, + "loss": 1.1136, + "step": 68700 + }, + { + "epoch": 1.0676764071447415, + "grad_norm": 2.810903787612915, + "learning_rate": 4.893233911140769e-05, + "loss": 1.1221, + "step": 68800 + }, + { + "epoch": 1.0692282623876845, + "grad_norm": 2.2353179454803467, + "learning_rate": 4.8930787256164746e-05, + "loss": 1.1242, + "step": 68900 + }, + { + "epoch": 1.0707801176306275, + "grad_norm": 2.5603678226470947, + "learning_rate": 4.8929235400921803e-05, + "loss": 1.1207, + "step": 69000 + }, + { + "epoch": 1.0723319728735703, + "grad_norm": 2.389472723007202, + "learning_rate": 4.892768354567886e-05, + "loss": 1.1246, + "step": 69100 + }, + { + "epoch": 1.0738838281165133, + "grad_norm": 2.076622486114502, + "learning_rate": 4.892613169043592e-05, + "loss": 1.1193, + "step": 69200 + }, + { + "epoch": 1.0754356833594563, + "grad_norm": 2.4756710529327393, + "learning_rate": 4.892457983519298e-05, + "loss": 1.1344, + "step": 69300 + }, + { + "epoch": 1.0769875386023993, + "grad_norm": 2.454780101776123, + "learning_rate": 4.8923027979950034e-05, + "loss": 1.1415, + "step": 69400 + }, + { + "epoch": 1.078539393845342, + "grad_norm": 2.361328125, + "learning_rate": 4.8921476124707085e-05, + "loss": 1.0997, + "step": 69500 + }, + { + "epoch": 1.080091249088285, + "grad_norm": 2.194244623184204, + "learning_rate": 4.891992426946414e-05, + "loss": 1.1327, + "step": 69600 + }, + { + "epoch": 1.081643104331228, + "grad_norm": 2.312431573867798, + "learning_rate": 4.89183724142212e-05, + "loss": 1.1287, + "step": 69700 + }, + { + "epoch": 1.083194959574171, + "grad_norm": 2.243638038635254, + "learning_rate": 4.891682055897826e-05, + "loss": 1.1074, + "step": 69800 + }, + { + "epoch": 1.0847468148171138, + "grad_norm": 2.416748046875, + "learning_rate": 4.8915268703735316e-05, + "loss": 1.0981, + "step": 69900 + }, + { + "epoch": 1.0862986700600568, + "grad_norm": 2.2611000537872314, + "learning_rate": 4.8913716848492374e-05, + "loss": 1.1052, + "step": 70000 + }, + { + "epoch": 1.0878505253029997, + "grad_norm": 2.5724949836730957, + "learning_rate": 4.891216499324943e-05, + "loss": 1.1139, + "step": 70100 + }, + { + "epoch": 1.0894023805459427, + "grad_norm": 2.333221197128296, + "learning_rate": 4.891061313800649e-05, + "loss": 1.1286, + "step": 70200 + }, + { + "epoch": 1.0909542357888857, + "grad_norm": 2.2546133995056152, + "learning_rate": 4.890906128276355e-05, + "loss": 1.1359, + "step": 70300 + }, + { + "epoch": 1.0925060910318285, + "grad_norm": 2.327009916305542, + "learning_rate": 4.8907509427520605e-05, + "loss": 1.1243, + "step": 70400 + }, + { + "epoch": 1.0940579462747715, + "grad_norm": 2.5289971828460693, + "learning_rate": 4.890595757227766e-05, + "loss": 1.1238, + "step": 70500 + }, + { + "epoch": 1.0956098015177145, + "grad_norm": 2.1109981536865234, + "learning_rate": 4.890440571703472e-05, + "loss": 1.1208, + "step": 70600 + }, + { + "epoch": 1.0971616567606575, + "grad_norm": 2.357865810394287, + "learning_rate": 4.890285386179178e-05, + "loss": 1.0835, + "step": 70700 + }, + { + "epoch": 1.0987135120036002, + "grad_norm": 1.9613914489746094, + "learning_rate": 4.890130200654883e-05, + "loss": 1.1382, + "step": 70800 + }, + { + "epoch": 1.1002653672465432, + "grad_norm": 2.448016405105591, + "learning_rate": 4.889975015130589e-05, + "loss": 1.1443, + "step": 70900 + }, + { + "epoch": 1.1018172224894862, + "grad_norm": 2.627293586730957, + "learning_rate": 4.8898198296062945e-05, + "loss": 1.1332, + "step": 71000 + }, + { + "epoch": 1.1033690777324292, + "grad_norm": 2.505042552947998, + "learning_rate": 4.889664644082e-05, + "loss": 1.1255, + "step": 71100 + }, + { + "epoch": 1.104920932975372, + "grad_norm": 2.3451147079467773, + "learning_rate": 4.889509458557706e-05, + "loss": 1.1246, + "step": 71200 + }, + { + "epoch": 1.106472788218315, + "grad_norm": 2.861656904220581, + "learning_rate": 4.889354273033412e-05, + "loss": 1.1213, + "step": 71300 + }, + { + "epoch": 1.108024643461258, + "grad_norm": 1.72523033618927, + "learning_rate": 4.8891990875091176e-05, + "loss": 1.1387, + "step": 71400 + }, + { + "epoch": 1.109576498704201, + "grad_norm": 2.3678364753723145, + "learning_rate": 4.8890439019848234e-05, + "loss": 1.1343, + "step": 71500 + }, + { + "epoch": 1.1111283539471437, + "grad_norm": 2.4903464317321777, + "learning_rate": 4.888888716460529e-05, + "loss": 1.1297, + "step": 71600 + }, + { + "epoch": 1.1126802091900867, + "grad_norm": 2.282066822052002, + "learning_rate": 4.888733530936235e-05, + "loss": 1.1206, + "step": 71700 + }, + { + "epoch": 1.1142320644330297, + "grad_norm": 2.2484939098358154, + "learning_rate": 4.888578345411941e-05, + "loss": 1.1301, + "step": 71800 + }, + { + "epoch": 1.1157839196759727, + "grad_norm": 2.3275198936462402, + "learning_rate": 4.888423159887646e-05, + "loss": 1.1381, + "step": 71900 + }, + { + "epoch": 1.1173357749189154, + "grad_norm": 2.6546790599823, + "learning_rate": 4.8882679743633516e-05, + "loss": 1.1092, + "step": 72000 + }, + { + "epoch": 1.1188876301618584, + "grad_norm": 2.0898914337158203, + "learning_rate": 4.8881127888390573e-05, + "loss": 1.1235, + "step": 72100 + }, + { + "epoch": 1.1204394854048014, + "grad_norm": 2.0487396717071533, + "learning_rate": 4.887957603314763e-05, + "loss": 1.1279, + "step": 72200 + }, + { + "epoch": 1.1219913406477444, + "grad_norm": 2.117793321609497, + "learning_rate": 4.887802417790468e-05, + "loss": 1.1277, + "step": 72300 + }, + { + "epoch": 1.1235431958906874, + "grad_norm": 2.5232748985290527, + "learning_rate": 4.887647232266174e-05, + "loss": 1.1214, + "step": 72400 + }, + { + "epoch": 1.1250950511336302, + "grad_norm": 2.1948370933532715, + "learning_rate": 4.88749204674188e-05, + "loss": 1.13, + "step": 72500 + }, + { + "epoch": 1.1266469063765732, + "grad_norm": 2.259174108505249, + "learning_rate": 4.8873368612175855e-05, + "loss": 1.0967, + "step": 72600 + }, + { + "epoch": 1.1281987616195162, + "grad_norm": 2.6604952812194824, + "learning_rate": 4.887181675693291e-05, + "loss": 1.0942, + "step": 72700 + }, + { + "epoch": 1.1297506168624591, + "grad_norm": 3.044663906097412, + "learning_rate": 4.887026490168997e-05, + "loss": 1.1424, + "step": 72800 + }, + { + "epoch": 1.131302472105402, + "grad_norm": 2.2361817359924316, + "learning_rate": 4.886871304644703e-05, + "loss": 1.1246, + "step": 72900 + }, + { + "epoch": 1.132854327348345, + "grad_norm": 2.3046019077301025, + "learning_rate": 4.8867161191204086e-05, + "loss": 1.1244, + "step": 73000 + }, + { + "epoch": 1.134406182591288, + "grad_norm": 2.2907626628875732, + "learning_rate": 4.8865609335961144e-05, + "loss": 1.1059, + "step": 73100 + }, + { + "epoch": 1.1359580378342309, + "grad_norm": 2.809164047241211, + "learning_rate": 4.88640574807182e-05, + "loss": 1.1126, + "step": 73200 + }, + { + "epoch": 1.1375098930771737, + "grad_norm": 2.2181551456451416, + "learning_rate": 4.886250562547526e-05, + "loss": 1.1062, + "step": 73300 + }, + { + "epoch": 1.1390617483201166, + "grad_norm": 2.2118139266967773, + "learning_rate": 4.886095377023232e-05, + "loss": 1.1329, + "step": 73400 + }, + { + "epoch": 1.1406136035630596, + "grad_norm": 2.2996928691864014, + "learning_rate": 4.8859401914989375e-05, + "loss": 1.1187, + "step": 73500 + }, + { + "epoch": 1.1421654588060026, + "grad_norm": 2.208285093307495, + "learning_rate": 4.8857850059746426e-05, + "loss": 1.1083, + "step": 73600 + }, + { + "epoch": 1.1437173140489456, + "grad_norm": 2.3561384677886963, + "learning_rate": 4.8856298204503484e-05, + "loss": 1.14, + "step": 73700 + }, + { + "epoch": 1.1452691692918884, + "grad_norm": 2.080418586730957, + "learning_rate": 4.885474634926054e-05, + "loss": 1.096, + "step": 73800 + }, + { + "epoch": 1.1468210245348314, + "grad_norm": 2.51823091506958, + "learning_rate": 4.88531944940176e-05, + "loss": 1.1224, + "step": 73900 + }, + { + "epoch": 1.1483728797777744, + "grad_norm": 2.5477023124694824, + "learning_rate": 4.885164263877466e-05, + "loss": 1.1156, + "step": 74000 + }, + { + "epoch": 1.1499247350207173, + "grad_norm": 2.4625790119171143, + "learning_rate": 4.8850090783531715e-05, + "loss": 1.114, + "step": 74100 + }, + { + "epoch": 1.1514765902636601, + "grad_norm": 1.9517508745193481, + "learning_rate": 4.884853892828877e-05, + "loss": 1.1223, + "step": 74200 + }, + { + "epoch": 1.153028445506603, + "grad_norm": 2.3685731887817383, + "learning_rate": 4.884698707304583e-05, + "loss": 1.1175, + "step": 74300 + }, + { + "epoch": 1.154580300749546, + "grad_norm": 2.2436087131500244, + "learning_rate": 4.884543521780289e-05, + "loss": 1.1158, + "step": 74400 + }, + { + "epoch": 1.156132155992489, + "grad_norm": 1.794718861579895, + "learning_rate": 4.8843883362559946e-05, + "loss": 1.108, + "step": 74500 + }, + { + "epoch": 1.1576840112354319, + "grad_norm": 2.3092503547668457, + "learning_rate": 4.8842331507317004e-05, + "loss": 1.1249, + "step": 74600 + }, + { + "epoch": 1.1592358664783748, + "grad_norm": 2.821805238723755, + "learning_rate": 4.884077965207406e-05, + "loss": 1.1062, + "step": 74700 + }, + { + "epoch": 1.1607877217213178, + "grad_norm": 2.215714454650879, + "learning_rate": 4.883922779683112e-05, + "loss": 1.1163, + "step": 74800 + }, + { + "epoch": 1.1623395769642608, + "grad_norm": 2.593536615371704, + "learning_rate": 4.883767594158817e-05, + "loss": 1.1088, + "step": 74900 + }, + { + "epoch": 1.1638914322072038, + "grad_norm": 2.684123992919922, + "learning_rate": 4.883612408634523e-05, + "loss": 1.1392, + "step": 75000 + }, + { + "epoch": 1.1654432874501466, + "grad_norm": 2.0079808235168457, + "learning_rate": 4.8834572231102286e-05, + "loss": 1.1066, + "step": 75100 + }, + { + "epoch": 1.1669951426930896, + "grad_norm": 2.3010594844818115, + "learning_rate": 4.883302037585934e-05, + "loss": 1.1097, + "step": 75200 + }, + { + "epoch": 1.1685469979360326, + "grad_norm": 1.901662826538086, + "learning_rate": 4.8831468520616394e-05, + "loss": 1.1249, + "step": 75300 + }, + { + "epoch": 1.1700988531789756, + "grad_norm": 2.384251594543457, + "learning_rate": 4.882991666537345e-05, + "loss": 1.1023, + "step": 75400 + }, + { + "epoch": 1.1716507084219183, + "grad_norm": 2.661597490310669, + "learning_rate": 4.882836481013051e-05, + "loss": 1.1202, + "step": 75500 + }, + { + "epoch": 1.1732025636648613, + "grad_norm": 2.466740608215332, + "learning_rate": 4.882681295488757e-05, + "loss": 1.1092, + "step": 75600 + }, + { + "epoch": 1.1747544189078043, + "grad_norm": 2.1879634857177734, + "learning_rate": 4.8825261099644625e-05, + "loss": 1.1207, + "step": 75700 + }, + { + "epoch": 1.1763062741507473, + "grad_norm": 2.2805233001708984, + "learning_rate": 4.882370924440168e-05, + "loss": 1.1154, + "step": 75800 + }, + { + "epoch": 1.17785812939369, + "grad_norm": 2.2602341175079346, + "learning_rate": 4.882215738915874e-05, + "loss": 1.1194, + "step": 75900 + }, + { + "epoch": 1.179409984636633, + "grad_norm": 2.6319894790649414, + "learning_rate": 4.88206055339158e-05, + "loss": 1.1203, + "step": 76000 + }, + { + "epoch": 1.180961839879576, + "grad_norm": 1.7584298849105835, + "learning_rate": 4.8819053678672856e-05, + "loss": 1.1068, + "step": 76100 + }, + { + "epoch": 1.182513695122519, + "grad_norm": 2.8603458404541016, + "learning_rate": 4.8817501823429914e-05, + "loss": 1.1076, + "step": 76200 + }, + { + "epoch": 1.184065550365462, + "grad_norm": 2.515455484390259, + "learning_rate": 4.881594996818697e-05, + "loss": 1.1081, + "step": 76300 + }, + { + "epoch": 1.1856174056084048, + "grad_norm": 2.2006876468658447, + "learning_rate": 4.881439811294403e-05, + "loss": 1.116, + "step": 76400 + }, + { + "epoch": 1.1871692608513478, + "grad_norm": 2.294879674911499, + "learning_rate": 4.881284625770108e-05, + "loss": 1.1128, + "step": 76500 + }, + { + "epoch": 1.1887211160942908, + "grad_norm": 2.5908255577087402, + "learning_rate": 4.881129440245814e-05, + "loss": 1.1319, + "step": 76600 + }, + { + "epoch": 1.1902729713372338, + "grad_norm": 2.358715057373047, + "learning_rate": 4.8809742547215196e-05, + "loss": 1.1311, + "step": 76700 + }, + { + "epoch": 1.1918248265801765, + "grad_norm": 2.575032949447632, + "learning_rate": 4.8808190691972254e-05, + "loss": 1.1051, + "step": 76800 + }, + { + "epoch": 1.1933766818231195, + "grad_norm": 1.993265151977539, + "learning_rate": 4.880663883672931e-05, + "loss": 1.1194, + "step": 76900 + }, + { + "epoch": 1.1949285370660625, + "grad_norm": 1.9357131719589233, + "learning_rate": 4.880508698148637e-05, + "loss": 1.1034, + "step": 77000 + }, + { + "epoch": 1.1964803923090055, + "grad_norm": 2.110600233078003, + "learning_rate": 4.880353512624343e-05, + "loss": 1.1222, + "step": 77100 + }, + { + "epoch": 1.1980322475519483, + "grad_norm": 2.849364757537842, + "learning_rate": 4.8801983271000485e-05, + "loss": 1.1082, + "step": 77200 + }, + { + "epoch": 1.1995841027948912, + "grad_norm": 2.535916805267334, + "learning_rate": 4.880043141575754e-05, + "loss": 1.1171, + "step": 77300 + }, + { + "epoch": 1.2011359580378342, + "grad_norm": 2.435857057571411, + "learning_rate": 4.87988795605146e-05, + "loss": 1.1251, + "step": 77400 + }, + { + "epoch": 1.2026878132807772, + "grad_norm": 2.0573973655700684, + "learning_rate": 4.879732770527166e-05, + "loss": 1.1083, + "step": 77500 + }, + { + "epoch": 1.2042396685237202, + "grad_norm": 2.450852394104004, + "learning_rate": 4.8795775850028716e-05, + "loss": 1.1121, + "step": 77600 + }, + { + "epoch": 1.205791523766663, + "grad_norm": 2.6014044284820557, + "learning_rate": 4.8794223994785774e-05, + "loss": 1.1065, + "step": 77700 + }, + { + "epoch": 1.207343379009606, + "grad_norm": 1.7768484354019165, + "learning_rate": 4.8792672139542825e-05, + "loss": 1.1166, + "step": 77800 + }, + { + "epoch": 1.208895234252549, + "grad_norm": 2.511260986328125, + "learning_rate": 4.879112028429988e-05, + "loss": 1.1028, + "step": 77900 + }, + { + "epoch": 1.210447089495492, + "grad_norm": 2.6407852172851562, + "learning_rate": 4.878956842905694e-05, + "loss": 1.1105, + "step": 78000 + }, + { + "epoch": 1.2119989447384347, + "grad_norm": 2.5305416584014893, + "learning_rate": 4.8788016573814e-05, + "loss": 1.1078, + "step": 78100 + }, + { + "epoch": 1.2135507999813777, + "grad_norm": 1.7494454383850098, + "learning_rate": 4.8786464718571056e-05, + "loss": 1.1014, + "step": 78200 + }, + { + "epoch": 1.2151026552243207, + "grad_norm": 2.407849073410034, + "learning_rate": 4.8784912863328113e-05, + "loss": 1.1084, + "step": 78300 + }, + { + "epoch": 1.2166545104672637, + "grad_norm": 2.26851224899292, + "learning_rate": 4.8783361008085164e-05, + "loss": 1.1219, + "step": 78400 + }, + { + "epoch": 1.2182063657102065, + "grad_norm": 2.121371030807495, + "learning_rate": 4.878180915284222e-05, + "loss": 1.1023, + "step": 78500 + }, + { + "epoch": 1.2197582209531495, + "grad_norm": 2.2187118530273438, + "learning_rate": 4.878025729759928e-05, + "loss": 1.0841, + "step": 78600 + }, + { + "epoch": 1.2213100761960924, + "grad_norm": 2.2014241218566895, + "learning_rate": 4.877870544235634e-05, + "loss": 1.1137, + "step": 78700 + }, + { + "epoch": 1.2228619314390354, + "grad_norm": 2.2093615531921387, + "learning_rate": 4.8777153587113395e-05, + "loss": 1.1093, + "step": 78800 + }, + { + "epoch": 1.2244137866819784, + "grad_norm": 1.9585342407226562, + "learning_rate": 4.877560173187045e-05, + "loss": 1.1248, + "step": 78900 + }, + { + "epoch": 1.2259656419249212, + "grad_norm": 2.365720272064209, + "learning_rate": 4.877404987662751e-05, + "loss": 1.1303, + "step": 79000 + }, + { + "epoch": 1.2275174971678642, + "grad_norm": 2.6853487491607666, + "learning_rate": 4.877249802138457e-05, + "loss": 1.1003, + "step": 79100 + }, + { + "epoch": 1.2290693524108072, + "grad_norm": 2.127527952194214, + "learning_rate": 4.8770946166141626e-05, + "loss": 1.1123, + "step": 79200 + }, + { + "epoch": 1.23062120765375, + "grad_norm": 2.4248695373535156, + "learning_rate": 4.876939431089868e-05, + "loss": 1.1194, + "step": 79300 + }, + { + "epoch": 1.232173062896693, + "grad_norm": 2.3345465660095215, + "learning_rate": 4.8767842455655735e-05, + "loss": 1.1159, + "step": 79400 + }, + { + "epoch": 1.233724918139636, + "grad_norm": 2.347710371017456, + "learning_rate": 4.876629060041279e-05, + "loss": 1.0867, + "step": 79500 + }, + { + "epoch": 1.235276773382579, + "grad_norm": 2.7182693481445312, + "learning_rate": 4.876473874516985e-05, + "loss": 1.0936, + "step": 79600 + }, + { + "epoch": 1.236828628625522, + "grad_norm": 2.3278050422668457, + "learning_rate": 4.876318688992691e-05, + "loss": 1.1071, + "step": 79700 + }, + { + "epoch": 1.2383804838684647, + "grad_norm": 2.615981340408325, + "learning_rate": 4.8761635034683966e-05, + "loss": 1.1483, + "step": 79800 + }, + { + "epoch": 1.2399323391114077, + "grad_norm": 2.1018478870391846, + "learning_rate": 4.8760083179441024e-05, + "loss": 1.1049, + "step": 79900 + }, + { + "epoch": 1.2414841943543506, + "grad_norm": 2.3852782249450684, + "learning_rate": 4.875853132419808e-05, + "loss": 1.1416, + "step": 80000 + }, + { + "epoch": 1.2430360495972936, + "grad_norm": 2.2100820541381836, + "learning_rate": 4.875697946895514e-05, + "loss": 1.1111, + "step": 80100 + }, + { + "epoch": 1.2445879048402366, + "grad_norm": 2.3852789402008057, + "learning_rate": 4.87554276137122e-05, + "loss": 1.1309, + "step": 80200 + }, + { + "epoch": 1.2461397600831794, + "grad_norm": 2.390979766845703, + "learning_rate": 4.8753875758469255e-05, + "loss": 1.1189, + "step": 80300 + }, + { + "epoch": 1.2476916153261224, + "grad_norm": 2.1343493461608887, + "learning_rate": 4.875232390322631e-05, + "loss": 1.1065, + "step": 80400 + }, + { + "epoch": 1.2492434705690654, + "grad_norm": 2.163029670715332, + "learning_rate": 4.875077204798337e-05, + "loss": 1.1176, + "step": 80500 + }, + { + "epoch": 1.2507953258120081, + "grad_norm": 2.0536952018737793, + "learning_rate": 4.874922019274042e-05, + "loss": 1.0989, + "step": 80600 + }, + { + "epoch": 1.2523471810549511, + "grad_norm": 2.367103099822998, + "learning_rate": 4.874766833749748e-05, + "loss": 1.1156, + "step": 80700 + }, + { + "epoch": 1.2538990362978941, + "grad_norm": 2.1768832206726074, + "learning_rate": 4.874611648225454e-05, + "loss": 1.1134, + "step": 80800 + }, + { + "epoch": 1.255450891540837, + "grad_norm": 1.9017785787582397, + "learning_rate": 4.8744564627011595e-05, + "loss": 1.109, + "step": 80900 + }, + { + "epoch": 1.25700274678378, + "grad_norm": 2.1839749813079834, + "learning_rate": 4.874301277176865e-05, + "loss": 1.1015, + "step": 81000 + }, + { + "epoch": 1.2585546020267229, + "grad_norm": 2.312918186187744, + "learning_rate": 4.874146091652571e-05, + "loss": 1.1079, + "step": 81100 + }, + { + "epoch": 1.2601064572696659, + "grad_norm": 2.8707101345062256, + "learning_rate": 4.873990906128277e-05, + "loss": 1.1289, + "step": 81200 + }, + { + "epoch": 1.2616583125126088, + "grad_norm": 2.3373329639434814, + "learning_rate": 4.8738357206039826e-05, + "loss": 1.1078, + "step": 81300 + }, + { + "epoch": 1.2632101677555518, + "grad_norm": 2.244046926498413, + "learning_rate": 4.8736805350796883e-05, + "loss": 1.1323, + "step": 81400 + }, + { + "epoch": 1.2647620229984948, + "grad_norm": 2.097012519836426, + "learning_rate": 4.873525349555394e-05, + "loss": 1.0956, + "step": 81500 + }, + { + "epoch": 1.2663138782414376, + "grad_norm": 2.4968135356903076, + "learning_rate": 4.873370164031099e-05, + "loss": 1.1182, + "step": 81600 + }, + { + "epoch": 1.2678657334843806, + "grad_norm": 2.7941203117370605, + "learning_rate": 4.873214978506805e-05, + "loss": 1.1228, + "step": 81700 + }, + { + "epoch": 1.2694175887273236, + "grad_norm": 2.2232351303100586, + "learning_rate": 4.873059792982511e-05, + "loss": 1.1, + "step": 81800 + }, + { + "epoch": 1.2709694439702663, + "grad_norm": 1.9244650602340698, + "learning_rate": 4.8729046074582165e-05, + "loss": 1.1156, + "step": 81900 + }, + { + "epoch": 1.2725212992132093, + "grad_norm": 2.22092866897583, + "learning_rate": 4.872749421933922e-05, + "loss": 1.1156, + "step": 82000 + }, + { + "epoch": 1.2740731544561523, + "grad_norm": 2.7483675479888916, + "learning_rate": 4.8725942364096274e-05, + "loss": 1.1184, + "step": 82100 + }, + { + "epoch": 1.2756250096990953, + "grad_norm": 2.1573240756988525, + "learning_rate": 4.872439050885333e-05, + "loss": 1.1166, + "step": 82200 + }, + { + "epoch": 1.2771768649420383, + "grad_norm": 2.4324727058410645, + "learning_rate": 4.872283865361039e-05, + "loss": 1.1138, + "step": 82300 + }, + { + "epoch": 1.278728720184981, + "grad_norm": 2.7243478298187256, + "learning_rate": 4.872128679836745e-05, + "loss": 1.0961, + "step": 82400 + }, + { + "epoch": 1.280280575427924, + "grad_norm": 1.8685250282287598, + "learning_rate": 4.8719734943124505e-05, + "loss": 1.1104, + "step": 82500 + }, + { + "epoch": 1.281832430670867, + "grad_norm": 2.515303373336792, + "learning_rate": 4.871818308788156e-05, + "loss": 1.1209, + "step": 82600 + }, + { + "epoch": 1.28338428591381, + "grad_norm": 2.028463125228882, + "learning_rate": 4.871663123263862e-05, + "loss": 1.0991, + "step": 82700 + }, + { + "epoch": 1.284936141156753, + "grad_norm": 2.218553304672241, + "learning_rate": 4.871507937739568e-05, + "loss": 1.1049, + "step": 82800 + }, + { + "epoch": 1.2864879963996958, + "grad_norm": 2.079744577407837, + "learning_rate": 4.8713527522152736e-05, + "loss": 1.0811, + "step": 82900 + }, + { + "epoch": 1.2880398516426388, + "grad_norm": 2.152311086654663, + "learning_rate": 4.8711975666909794e-05, + "loss": 1.111, + "step": 83000 + }, + { + "epoch": 1.2895917068855818, + "grad_norm": 2.2847890853881836, + "learning_rate": 4.871042381166685e-05, + "loss": 1.1185, + "step": 83100 + }, + { + "epoch": 1.2911435621285245, + "grad_norm": 2.5103321075439453, + "learning_rate": 4.870887195642391e-05, + "loss": 1.1029, + "step": 83200 + }, + { + "epoch": 1.2926954173714675, + "grad_norm": 2.4194531440734863, + "learning_rate": 4.870732010118097e-05, + "loss": 1.1068, + "step": 83300 + }, + { + "epoch": 1.2942472726144105, + "grad_norm": 2.6363720893859863, + "learning_rate": 4.870576824593802e-05, + "loss": 1.1173, + "step": 83400 + }, + { + "epoch": 1.2957991278573535, + "grad_norm": 2.5055174827575684, + "learning_rate": 4.8704216390695076e-05, + "loss": 1.1149, + "step": 83500 + }, + { + "epoch": 1.2973509831002965, + "grad_norm": 2.532381057739258, + "learning_rate": 4.8702664535452134e-05, + "loss": 1.129, + "step": 83600 + }, + { + "epoch": 1.2989028383432393, + "grad_norm": 1.9616467952728271, + "learning_rate": 4.870111268020919e-05, + "loss": 1.1239, + "step": 83700 + }, + { + "epoch": 1.3004546935861823, + "grad_norm": 2.455014228820801, + "learning_rate": 4.869956082496625e-05, + "loss": 1.1165, + "step": 83800 + }, + { + "epoch": 1.3020065488291253, + "grad_norm": 1.827030062675476, + "learning_rate": 4.869800896972331e-05, + "loss": 1.1004, + "step": 83900 + }, + { + "epoch": 1.303558404072068, + "grad_norm": 2.5139052867889404, + "learning_rate": 4.8696457114480365e-05, + "loss": 1.1081, + "step": 84000 + }, + { + "epoch": 1.3051102593150112, + "grad_norm": 2.587278366088867, + "learning_rate": 4.869490525923742e-05, + "loss": 1.1236, + "step": 84100 + }, + { + "epoch": 1.306662114557954, + "grad_norm": 2.0649638175964355, + "learning_rate": 4.869335340399448e-05, + "loss": 1.1001, + "step": 84200 + }, + { + "epoch": 1.308213969800897, + "grad_norm": 2.487438201904297, + "learning_rate": 4.869180154875154e-05, + "loss": 1.0837, + "step": 84300 + }, + { + "epoch": 1.30976582504384, + "grad_norm": 2.7843523025512695, + "learning_rate": 4.8690249693508596e-05, + "loss": 1.1157, + "step": 84400 + }, + { + "epoch": 1.3113176802867827, + "grad_norm": 2.4103662967681885, + "learning_rate": 4.8688697838265653e-05, + "loss": 1.1223, + "step": 84500 + }, + { + "epoch": 1.3128695355297257, + "grad_norm": 2.319493532180786, + "learning_rate": 4.868714598302271e-05, + "loss": 1.0921, + "step": 84600 + }, + { + "epoch": 1.3144213907726687, + "grad_norm": 2.398345947265625, + "learning_rate": 4.868559412777976e-05, + "loss": 1.0854, + "step": 84700 + }, + { + "epoch": 1.3159732460156117, + "grad_norm": 2.239546060562134, + "learning_rate": 4.868404227253682e-05, + "loss": 1.0861, + "step": 84800 + }, + { + "epoch": 1.3175251012585547, + "grad_norm": 2.4055612087249756, + "learning_rate": 4.868249041729388e-05, + "loss": 1.1129, + "step": 84900 + }, + { + "epoch": 1.3190769565014975, + "grad_norm": 2.7176923751831055, + "learning_rate": 4.868093856205093e-05, + "loss": 1.1187, + "step": 85000 + }, + { + "epoch": 1.3206288117444405, + "grad_norm": 2.2004525661468506, + "learning_rate": 4.8679386706807986e-05, + "loss": 1.0869, + "step": 85100 + }, + { + "epoch": 1.3221806669873835, + "grad_norm": 2.2112913131713867, + "learning_rate": 4.8677834851565044e-05, + "loss": 1.1104, + "step": 85200 + }, + { + "epoch": 1.3237325222303262, + "grad_norm": 2.4282455444335938, + "learning_rate": 4.86762829963221e-05, + "loss": 1.1057, + "step": 85300 + }, + { + "epoch": 1.3252843774732692, + "grad_norm": 2.6111245155334473, + "learning_rate": 4.867473114107916e-05, + "loss": 1.0764, + "step": 85400 + }, + { + "epoch": 1.3268362327162122, + "grad_norm": 2.459329605102539, + "learning_rate": 4.867317928583622e-05, + "loss": 1.1173, + "step": 85500 + }, + { + "epoch": 1.3283880879591552, + "grad_norm": 2.4787330627441406, + "learning_rate": 4.8671627430593275e-05, + "loss": 1.1068, + "step": 85600 + }, + { + "epoch": 1.3299399432020982, + "grad_norm": 2.342802047729492, + "learning_rate": 4.867007557535033e-05, + "loss": 1.1067, + "step": 85700 + }, + { + "epoch": 1.331491798445041, + "grad_norm": 2.576122283935547, + "learning_rate": 4.866852372010739e-05, + "loss": 1.1048, + "step": 85800 + }, + { + "epoch": 1.333043653687984, + "grad_norm": 2.542020559310913, + "learning_rate": 4.866697186486445e-05, + "loss": 1.0989, + "step": 85900 + }, + { + "epoch": 1.334595508930927, + "grad_norm": 2.414774179458618, + "learning_rate": 4.8665420009621506e-05, + "loss": 1.0741, + "step": 86000 + }, + { + "epoch": 1.33614736417387, + "grad_norm": 2.438695192337036, + "learning_rate": 4.8663868154378564e-05, + "loss": 1.1096, + "step": 86100 + }, + { + "epoch": 1.337699219416813, + "grad_norm": 2.066688299179077, + "learning_rate": 4.866231629913562e-05, + "loss": 1.1083, + "step": 86200 + }, + { + "epoch": 1.3392510746597557, + "grad_norm": 2.383652448654175, + "learning_rate": 4.866076444389267e-05, + "loss": 1.1034, + "step": 86300 + }, + { + "epoch": 1.3408029299026987, + "grad_norm": 2.4665942192077637, + "learning_rate": 4.865921258864973e-05, + "loss": 1.1141, + "step": 86400 + }, + { + "epoch": 1.3423547851456417, + "grad_norm": 2.3365814685821533, + "learning_rate": 4.865766073340679e-05, + "loss": 1.1207, + "step": 86500 + }, + { + "epoch": 1.3439066403885844, + "grad_norm": 2.1258933544158936, + "learning_rate": 4.8656108878163846e-05, + "loss": 1.1238, + "step": 86600 + }, + { + "epoch": 1.3454584956315274, + "grad_norm": 2.463226318359375, + "learning_rate": 4.8654557022920904e-05, + "loss": 1.0907, + "step": 86700 + }, + { + "epoch": 1.3470103508744704, + "grad_norm": 2.3676583766937256, + "learning_rate": 4.865300516767796e-05, + "loss": 1.0928, + "step": 86800 + }, + { + "epoch": 1.3485622061174134, + "grad_norm": 2.4078824520111084, + "learning_rate": 4.865145331243502e-05, + "loss": 1.1046, + "step": 86900 + }, + { + "epoch": 1.3501140613603564, + "grad_norm": 2.385486602783203, + "learning_rate": 4.864990145719208e-05, + "loss": 1.1336, + "step": 87000 + }, + { + "epoch": 1.3516659166032992, + "grad_norm": 2.1174564361572266, + "learning_rate": 4.8648349601949135e-05, + "loss": 1.1081, + "step": 87100 + }, + { + "epoch": 1.3532177718462421, + "grad_norm": 2.3165135383605957, + "learning_rate": 4.864679774670619e-05, + "loss": 1.1124, + "step": 87200 + }, + { + "epoch": 1.3547696270891851, + "grad_norm": 2.289062976837158, + "learning_rate": 4.864524589146325e-05, + "loss": 1.1163, + "step": 87300 + }, + { + "epoch": 1.3563214823321281, + "grad_norm": 2.558507204055786, + "learning_rate": 4.864369403622031e-05, + "loss": 1.114, + "step": 87400 + }, + { + "epoch": 1.3578733375750711, + "grad_norm": 2.215275526046753, + "learning_rate": 4.8642142180977366e-05, + "loss": 1.1004, + "step": 87500 + }, + { + "epoch": 1.3594251928180139, + "grad_norm": 2.6778178215026855, + "learning_rate": 4.864059032573442e-05, + "loss": 1.1195, + "step": 87600 + }, + { + "epoch": 1.3609770480609569, + "grad_norm": 2.3095626831054688, + "learning_rate": 4.8639038470491474e-05, + "loss": 1.0885, + "step": 87700 + }, + { + "epoch": 1.3625289033038999, + "grad_norm": 2.2825675010681152, + "learning_rate": 4.863748661524853e-05, + "loss": 1.0986, + "step": 87800 + }, + { + "epoch": 1.3640807585468426, + "grad_norm": 2.4111886024475098, + "learning_rate": 4.863593476000559e-05, + "loss": 1.1198, + "step": 87900 + }, + { + "epoch": 1.3656326137897856, + "grad_norm": 2.4561874866485596, + "learning_rate": 4.863438290476265e-05, + "loss": 1.1111, + "step": 88000 + }, + { + "epoch": 1.3671844690327286, + "grad_norm": 2.816755533218384, + "learning_rate": 4.86328310495197e-05, + "loss": 1.0981, + "step": 88100 + }, + { + "epoch": 1.3687363242756716, + "grad_norm": 2.6150989532470703, + "learning_rate": 4.8631279194276756e-05, + "loss": 1.1013, + "step": 88200 + }, + { + "epoch": 1.3702881795186146, + "grad_norm": 2.587946891784668, + "learning_rate": 4.8629727339033814e-05, + "loss": 1.1291, + "step": 88300 + }, + { + "epoch": 1.3718400347615574, + "grad_norm": 2.921314239501953, + "learning_rate": 4.862817548379087e-05, + "loss": 1.1209, + "step": 88400 + }, + { + "epoch": 1.3733918900045003, + "grad_norm": 1.9959654808044434, + "learning_rate": 4.862662362854793e-05, + "loss": 1.1078, + "step": 88500 + }, + { + "epoch": 1.3749437452474433, + "grad_norm": 2.308357000350952, + "learning_rate": 4.862507177330499e-05, + "loss": 1.1152, + "step": 88600 + }, + { + "epoch": 1.3764956004903863, + "grad_norm": 2.9256973266601562, + "learning_rate": 4.8623519918062045e-05, + "loss": 1.0843, + "step": 88700 + }, + { + "epoch": 1.3780474557333293, + "grad_norm": 2.4615092277526855, + "learning_rate": 4.86219680628191e-05, + "loss": 1.0818, + "step": 88800 + }, + { + "epoch": 1.379599310976272, + "grad_norm": 2.446812391281128, + "learning_rate": 4.862041620757616e-05, + "loss": 1.0925, + "step": 88900 + }, + { + "epoch": 1.381151166219215, + "grad_norm": 2.7313692569732666, + "learning_rate": 4.861886435233322e-05, + "loss": 1.0902, + "step": 89000 + }, + { + "epoch": 1.382703021462158, + "grad_norm": 2.348445177078247, + "learning_rate": 4.861731249709027e-05, + "loss": 1.1012, + "step": 89100 + }, + { + "epoch": 1.3842548767051008, + "grad_norm": 2.393333673477173, + "learning_rate": 4.861576064184733e-05, + "loss": 1.1085, + "step": 89200 + }, + { + "epoch": 1.3858067319480438, + "grad_norm": 2.2570338249206543, + "learning_rate": 4.8614208786604385e-05, + "loss": 1.1059, + "step": 89300 + }, + { + "epoch": 1.3873585871909868, + "grad_norm": 2.2027170658111572, + "learning_rate": 4.861265693136144e-05, + "loss": 1.0874, + "step": 89400 + }, + { + "epoch": 1.3889104424339298, + "grad_norm": 1.8995881080627441, + "learning_rate": 4.86111050761185e-05, + "loss": 1.1106, + "step": 89500 + }, + { + "epoch": 1.3904622976768728, + "grad_norm": 2.977391481399536, + "learning_rate": 4.860955322087556e-05, + "loss": 1.1068, + "step": 89600 + }, + { + "epoch": 1.3920141529198156, + "grad_norm": 2.1075680255889893, + "learning_rate": 4.8608001365632616e-05, + "loss": 1.1045, + "step": 89700 + }, + { + "epoch": 1.3935660081627586, + "grad_norm": 2.444673776626587, + "learning_rate": 4.8606449510389674e-05, + "loss": 1.1307, + "step": 89800 + }, + { + "epoch": 1.3951178634057015, + "grad_norm": 2.2788937091827393, + "learning_rate": 4.860489765514673e-05, + "loss": 1.0991, + "step": 89900 + }, + { + "epoch": 1.3966697186486445, + "grad_norm": 2.2362778186798096, + "learning_rate": 4.860334579990379e-05, + "loss": 1.0801, + "step": 90000 + }, + { + "epoch": 1.3982215738915875, + "grad_norm": 2.303495168685913, + "learning_rate": 4.860179394466085e-05, + "loss": 1.1049, + "step": 90100 + }, + { + "epoch": 1.3997734291345303, + "grad_norm": 2.496016025543213, + "learning_rate": 4.8600242089417905e-05, + "loss": 1.0875, + "step": 90200 + }, + { + "epoch": 1.4013252843774733, + "grad_norm": 2.088832378387451, + "learning_rate": 4.859869023417496e-05, + "loss": 1.1205, + "step": 90300 + }, + { + "epoch": 1.4028771396204163, + "grad_norm": 2.3098864555358887, + "learning_rate": 4.8597138378932013e-05, + "loss": 1.1103, + "step": 90400 + }, + { + "epoch": 1.404428994863359, + "grad_norm": 2.163699150085449, + "learning_rate": 4.859558652368907e-05, + "loss": 1.1078, + "step": 90500 + }, + { + "epoch": 1.405980850106302, + "grad_norm": 2.240849733352661, + "learning_rate": 4.859403466844613e-05, + "loss": 1.0926, + "step": 90600 + }, + { + "epoch": 1.407532705349245, + "grad_norm": 2.175450563430786, + "learning_rate": 4.859248281320319e-05, + "loss": 1.0944, + "step": 90700 + }, + { + "epoch": 1.409084560592188, + "grad_norm": 2.29375958442688, + "learning_rate": 4.8590930957960244e-05, + "loss": 1.1037, + "step": 90800 + }, + { + "epoch": 1.410636415835131, + "grad_norm": 2.3014724254608154, + "learning_rate": 4.85893791027173e-05, + "loss": 1.0915, + "step": 90900 + }, + { + "epoch": 1.4121882710780738, + "grad_norm": 2.7147958278656006, + "learning_rate": 4.858782724747436e-05, + "loss": 1.1222, + "step": 91000 + }, + { + "epoch": 1.4137401263210168, + "grad_norm": 2.025317668914795, + "learning_rate": 4.858627539223142e-05, + "loss": 1.0864, + "step": 91100 + }, + { + "epoch": 1.4152919815639597, + "grad_norm": 2.586599826812744, + "learning_rate": 4.8584723536988475e-05, + "loss": 1.1084, + "step": 91200 + }, + { + "epoch": 1.4168438368069027, + "grad_norm": 2.205930471420288, + "learning_rate": 4.858317168174553e-05, + "loss": 1.0884, + "step": 91300 + }, + { + "epoch": 1.4183956920498457, + "grad_norm": 2.263949394226074, + "learning_rate": 4.8581619826502584e-05, + "loss": 1.0906, + "step": 91400 + }, + { + "epoch": 1.4199475472927885, + "grad_norm": 2.1652019023895264, + "learning_rate": 4.858006797125964e-05, + "loss": 1.0805, + "step": 91500 + }, + { + "epoch": 1.4214994025357315, + "grad_norm": 2.4087634086608887, + "learning_rate": 4.85785161160167e-05, + "loss": 1.1166, + "step": 91600 + }, + { + "epoch": 1.4230512577786745, + "grad_norm": 2.43849515914917, + "learning_rate": 4.857696426077376e-05, + "loss": 1.0994, + "step": 91700 + }, + { + "epoch": 1.4246031130216172, + "grad_norm": 2.8346004486083984, + "learning_rate": 4.8575412405530815e-05, + "loss": 1.0919, + "step": 91800 + }, + { + "epoch": 1.4261549682645602, + "grad_norm": 2.0506465435028076, + "learning_rate": 4.857386055028787e-05, + "loss": 1.1118, + "step": 91900 + }, + { + "epoch": 1.4277068235075032, + "grad_norm": 2.5766263008117676, + "learning_rate": 4.8572308695044924e-05, + "loss": 1.1151, + "step": 92000 + }, + { + "epoch": 1.4292586787504462, + "grad_norm": 2.260166645050049, + "learning_rate": 4.857075683980198e-05, + "loss": 1.0907, + "step": 92100 + }, + { + "epoch": 1.4308105339933892, + "grad_norm": 2.205695390701294, + "learning_rate": 4.856920498455904e-05, + "loss": 1.1179, + "step": 92200 + }, + { + "epoch": 1.432362389236332, + "grad_norm": 2.001887559890747, + "learning_rate": 4.85676531293161e-05, + "loss": 1.0972, + "step": 92300 + }, + { + "epoch": 1.433914244479275, + "grad_norm": 2.280686140060425, + "learning_rate": 4.8566101274073155e-05, + "loss": 1.1089, + "step": 92400 + }, + { + "epoch": 1.435466099722218, + "grad_norm": 1.7954293489456177, + "learning_rate": 4.856454941883021e-05, + "loss": 1.0893, + "step": 92500 + }, + { + "epoch": 1.4370179549651607, + "grad_norm": 2.0910868644714355, + "learning_rate": 4.856299756358727e-05, + "loss": 1.1146, + "step": 92600 + }, + { + "epoch": 1.4385698102081037, + "grad_norm": 2.3523671627044678, + "learning_rate": 4.856144570834433e-05, + "loss": 1.105, + "step": 92700 + }, + { + "epoch": 1.4401216654510467, + "grad_norm": 2.5177364349365234, + "learning_rate": 4.8559893853101386e-05, + "loss": 1.1026, + "step": 92800 + }, + { + "epoch": 1.4416735206939897, + "grad_norm": 2.642850399017334, + "learning_rate": 4.8558341997858444e-05, + "loss": 1.0924, + "step": 92900 + }, + { + "epoch": 1.4432253759369327, + "grad_norm": 2.4015111923217773, + "learning_rate": 4.85567901426155e-05, + "loss": 1.0824, + "step": 93000 + }, + { + "epoch": 1.4447772311798754, + "grad_norm": 2.785917282104492, + "learning_rate": 4.855523828737256e-05, + "loss": 1.087, + "step": 93100 + }, + { + "epoch": 1.4463290864228184, + "grad_norm": 2.587714672088623, + "learning_rate": 4.855368643212962e-05, + "loss": 1.1315, + "step": 93200 + }, + { + "epoch": 1.4478809416657614, + "grad_norm": 2.5632598400115967, + "learning_rate": 4.855213457688667e-05, + "loss": 1.1101, + "step": 93300 + }, + { + "epoch": 1.4494327969087044, + "grad_norm": 1.9527606964111328, + "learning_rate": 4.8550582721643726e-05, + "loss": 1.0916, + "step": 93400 + }, + { + "epoch": 1.4509846521516474, + "grad_norm": 2.3354501724243164, + "learning_rate": 4.8549030866400783e-05, + "loss": 1.0842, + "step": 93500 + }, + { + "epoch": 1.4525365073945902, + "grad_norm": 2.434691905975342, + "learning_rate": 4.854747901115784e-05, + "loss": 1.1153, + "step": 93600 + }, + { + "epoch": 1.4540883626375332, + "grad_norm": 2.458353042602539, + "learning_rate": 4.85459271559149e-05, + "loss": 1.106, + "step": 93700 + }, + { + "epoch": 1.4556402178804762, + "grad_norm": 2.31325364112854, + "learning_rate": 4.854437530067196e-05, + "loss": 1.1061, + "step": 93800 + }, + { + "epoch": 1.457192073123419, + "grad_norm": 2.5093953609466553, + "learning_rate": 4.8542823445429014e-05, + "loss": 1.0818, + "step": 93900 + }, + { + "epoch": 1.458743928366362, + "grad_norm": 2.224592447280884, + "learning_rate": 4.854127159018607e-05, + "loss": 1.1263, + "step": 94000 + }, + { + "epoch": 1.460295783609305, + "grad_norm": 2.1180953979492188, + "learning_rate": 4.853971973494313e-05, + "loss": 1.0958, + "step": 94100 + }, + { + "epoch": 1.4618476388522479, + "grad_norm": 2.6123318672180176, + "learning_rate": 4.853816787970019e-05, + "loss": 1.1093, + "step": 94200 + }, + { + "epoch": 1.4633994940951909, + "grad_norm": 2.0120418071746826, + "learning_rate": 4.8536616024457245e-05, + "loss": 1.0914, + "step": 94300 + }, + { + "epoch": 1.4649513493381336, + "grad_norm": 2.1120519638061523, + "learning_rate": 4.85350641692143e-05, + "loss": 1.1289, + "step": 94400 + }, + { + "epoch": 1.4665032045810766, + "grad_norm": 2.2654669284820557, + "learning_rate": 4.853351231397136e-05, + "loss": 1.0939, + "step": 94500 + }, + { + "epoch": 1.4680550598240196, + "grad_norm": 2.5117177963256836, + "learning_rate": 4.853196045872841e-05, + "loss": 1.0839, + "step": 94600 + }, + { + "epoch": 1.4696069150669626, + "grad_norm": 2.5389342308044434, + "learning_rate": 4.853040860348547e-05, + "loss": 1.0958, + "step": 94700 + }, + { + "epoch": 1.4711587703099056, + "grad_norm": 2.5889949798583984, + "learning_rate": 4.852885674824252e-05, + "loss": 1.107, + "step": 94800 + }, + { + "epoch": 1.4727106255528484, + "grad_norm": 2.972501516342163, + "learning_rate": 4.852730489299958e-05, + "loss": 1.107, + "step": 94900 + }, + { + "epoch": 1.4742624807957914, + "grad_norm": 2.1755499839782715, + "learning_rate": 4.8525753037756636e-05, + "loss": 1.0804, + "step": 95000 + }, + { + "epoch": 1.4758143360387344, + "grad_norm": 2.343222141265869, + "learning_rate": 4.8524201182513694e-05, + "loss": 1.1132, + "step": 95100 + }, + { + "epoch": 1.4773661912816771, + "grad_norm": 2.0029778480529785, + "learning_rate": 4.852264932727075e-05, + "loss": 1.0648, + "step": 95200 + }, + { + "epoch": 1.4789180465246201, + "grad_norm": 2.5098717212677, + "learning_rate": 4.852109747202781e-05, + "loss": 1.0883, + "step": 95300 + }, + { + "epoch": 1.480469901767563, + "grad_norm": 2.020259380340576, + "learning_rate": 4.851954561678487e-05, + "loss": 1.0841, + "step": 95400 + }, + { + "epoch": 1.482021757010506, + "grad_norm": 2.137216091156006, + "learning_rate": 4.8517993761541925e-05, + "loss": 1.0916, + "step": 95500 + }, + { + "epoch": 1.483573612253449, + "grad_norm": 2.2599966526031494, + "learning_rate": 4.851644190629898e-05, + "loss": 1.1158, + "step": 95600 + }, + { + "epoch": 1.4851254674963918, + "grad_norm": 2.5027835369110107, + "learning_rate": 4.851489005105604e-05, + "loss": 1.1076, + "step": 95700 + }, + { + "epoch": 1.4866773227393348, + "grad_norm": 2.1276371479034424, + "learning_rate": 4.85133381958131e-05, + "loss": 1.0956, + "step": 95800 + }, + { + "epoch": 1.4882291779822778, + "grad_norm": 2.3590199947357178, + "learning_rate": 4.8511786340570156e-05, + "loss": 1.0832, + "step": 95900 + }, + { + "epoch": 1.4897810332252208, + "grad_norm": 2.2994654178619385, + "learning_rate": 4.8510234485327214e-05, + "loss": 1.096, + "step": 96000 + }, + { + "epoch": 1.4913328884681638, + "grad_norm": 2.2821874618530273, + "learning_rate": 4.8508682630084265e-05, + "loss": 1.1047, + "step": 96100 + }, + { + "epoch": 1.4928847437111066, + "grad_norm": 2.377063274383545, + "learning_rate": 4.850713077484132e-05, + "loss": 1.095, + "step": 96200 + }, + { + "epoch": 1.4944365989540496, + "grad_norm": 2.467970848083496, + "learning_rate": 4.850557891959838e-05, + "loss": 1.0818, + "step": 96300 + }, + { + "epoch": 1.4959884541969926, + "grad_norm": 2.613098382949829, + "learning_rate": 4.850402706435544e-05, + "loss": 1.1014, + "step": 96400 + }, + { + "epoch": 1.4975403094399353, + "grad_norm": 2.434502601623535, + "learning_rate": 4.8502475209112496e-05, + "loss": 1.1125, + "step": 96500 + }, + { + "epoch": 1.4990921646828783, + "grad_norm": 2.31904935836792, + "learning_rate": 4.8500923353869553e-05, + "loss": 1.1075, + "step": 96600 + }, + { + "epoch": 1.5006440199258213, + "grad_norm": 2.3460516929626465, + "learning_rate": 4.849937149862661e-05, + "loss": 1.0989, + "step": 96700 + }, + { + "epoch": 1.5021958751687643, + "grad_norm": 2.311793804168701, + "learning_rate": 4.849781964338367e-05, + "loss": 1.0994, + "step": 96800 + }, + { + "epoch": 1.5037477304117073, + "grad_norm": 3.030339479446411, + "learning_rate": 4.849626778814073e-05, + "loss": 1.0758, + "step": 96900 + }, + { + "epoch": 1.50529958565465, + "grad_norm": 2.2871413230895996, + "learning_rate": 4.8494715932897784e-05, + "loss": 1.1092, + "step": 97000 + }, + { + "epoch": 1.506851440897593, + "grad_norm": 2.2784247398376465, + "learning_rate": 4.849316407765484e-05, + "loss": 1.0695, + "step": 97100 + }, + { + "epoch": 1.508403296140536, + "grad_norm": 2.4598681926727295, + "learning_rate": 4.84916122224119e-05, + "loss": 1.1045, + "step": 97200 + }, + { + "epoch": 1.5099551513834788, + "grad_norm": 2.7960808277130127, + "learning_rate": 4.849006036716896e-05, + "loss": 1.1138, + "step": 97300 + }, + { + "epoch": 1.511507006626422, + "grad_norm": 2.394219398498535, + "learning_rate": 4.848850851192601e-05, + "loss": 1.1138, + "step": 97400 + }, + { + "epoch": 1.5130588618693648, + "grad_norm": 2.312546968460083, + "learning_rate": 4.8486956656683066e-05, + "loss": 1.089, + "step": 97500 + }, + { + "epoch": 1.5146107171123078, + "grad_norm": 2.181861162185669, + "learning_rate": 4.8485404801440124e-05, + "loss": 1.1244, + "step": 97600 + }, + { + "epoch": 1.5161625723552508, + "grad_norm": 2.3561577796936035, + "learning_rate": 4.848385294619718e-05, + "loss": 1.0938, + "step": 97700 + }, + { + "epoch": 1.5177144275981935, + "grad_norm": 1.8671692609786987, + "learning_rate": 4.848230109095424e-05, + "loss": 1.08, + "step": 97800 + }, + { + "epoch": 1.5192662828411367, + "grad_norm": 2.1992411613464355, + "learning_rate": 4.848074923571129e-05, + "loss": 1.1053, + "step": 97900 + }, + { + "epoch": 1.5208181380840795, + "grad_norm": 2.339897632598877, + "learning_rate": 4.847919738046835e-05, + "loss": 1.0964, + "step": 98000 + }, + { + "epoch": 1.5223699933270225, + "grad_norm": 2.0564520359039307, + "learning_rate": 4.8477645525225406e-05, + "loss": 1.092, + "step": 98100 + }, + { + "epoch": 1.5239218485699655, + "grad_norm": 2.5134778022766113, + "learning_rate": 4.8476093669982464e-05, + "loss": 1.1088, + "step": 98200 + }, + { + "epoch": 1.5254737038129083, + "grad_norm": 2.253459930419922, + "learning_rate": 4.847454181473952e-05, + "loss": 1.0826, + "step": 98300 + }, + { + "epoch": 1.5270255590558512, + "grad_norm": 2.5602986812591553, + "learning_rate": 4.847298995949658e-05, + "loss": 1.0843, + "step": 98400 + }, + { + "epoch": 1.5285774142987942, + "grad_norm": 2.2525172233581543, + "learning_rate": 4.847143810425364e-05, + "loss": 1.079, + "step": 98500 + }, + { + "epoch": 1.530129269541737, + "grad_norm": 2.373265266418457, + "learning_rate": 4.8469886249010695e-05, + "loss": 1.1158, + "step": 98600 + }, + { + "epoch": 1.5316811247846802, + "grad_norm": 2.2877092361450195, + "learning_rate": 4.846833439376775e-05, + "loss": 1.1057, + "step": 98700 + }, + { + "epoch": 1.533232980027623, + "grad_norm": 2.0692262649536133, + "learning_rate": 4.846678253852481e-05, + "loss": 1.0738, + "step": 98800 + }, + { + "epoch": 1.534784835270566, + "grad_norm": 2.559347629547119, + "learning_rate": 4.846523068328186e-05, + "loss": 1.0809, + "step": 98900 + }, + { + "epoch": 1.536336690513509, + "grad_norm": 2.3724586963653564, + "learning_rate": 4.846367882803892e-05, + "loss": 1.0862, + "step": 99000 + }, + { + "epoch": 1.5378885457564517, + "grad_norm": 2.499943256378174, + "learning_rate": 4.846212697279598e-05, + "loss": 1.0969, + "step": 99100 + }, + { + "epoch": 1.539440400999395, + "grad_norm": 2.0188519954681396, + "learning_rate": 4.8460575117553035e-05, + "loss": 1.0934, + "step": 99200 + }, + { + "epoch": 1.5409922562423377, + "grad_norm": 2.1920857429504395, + "learning_rate": 4.845902326231009e-05, + "loss": 1.0842, + "step": 99300 + }, + { + "epoch": 1.5425441114852807, + "grad_norm": 2.47813081741333, + "learning_rate": 4.845747140706715e-05, + "loss": 1.0966, + "step": 99400 + }, + { + "epoch": 1.5440959667282237, + "grad_norm": 2.5390424728393555, + "learning_rate": 4.845591955182421e-05, + "loss": 1.0873, + "step": 99500 + }, + { + "epoch": 1.5456478219711665, + "grad_norm": 2.214303731918335, + "learning_rate": 4.8454367696581266e-05, + "loss": 1.0945, + "step": 99600 + }, + { + "epoch": 1.5471996772141094, + "grad_norm": 2.6074516773223877, + "learning_rate": 4.8452815841338323e-05, + "loss": 1.067, + "step": 99700 + }, + { + "epoch": 1.5487515324570524, + "grad_norm": 2.670158863067627, + "learning_rate": 4.845126398609538e-05, + "loss": 1.074, + "step": 99800 + }, + { + "epoch": 1.5503033876999952, + "grad_norm": 2.4149551391601562, + "learning_rate": 4.844971213085244e-05, + "loss": 1.103, + "step": 99900 + }, + { + "epoch": 1.5518552429429384, + "grad_norm": 2.4299428462982178, + "learning_rate": 4.84481602756095e-05, + "loss": 1.0943, + "step": 100000 + }, + { + "epoch": 1.5534070981858812, + "grad_norm": 2.447786331176758, + "learning_rate": 4.8446608420366554e-05, + "loss": 1.0866, + "step": 100100 + }, + { + "epoch": 1.5549589534288242, + "grad_norm": 2.346238374710083, + "learning_rate": 4.8445056565123605e-05, + "loss": 1.1272, + "step": 100200 + }, + { + "epoch": 1.5565108086717672, + "grad_norm": 2.3809621334075928, + "learning_rate": 4.844350470988066e-05, + "loss": 1.1039, + "step": 100300 + }, + { + "epoch": 1.55806266391471, + "grad_norm": 2.2030515670776367, + "learning_rate": 4.844195285463772e-05, + "loss": 1.0865, + "step": 100400 + }, + { + "epoch": 1.559614519157653, + "grad_norm": 2.4423165321350098, + "learning_rate": 4.844040099939478e-05, + "loss": 1.0883, + "step": 100500 + }, + { + "epoch": 1.561166374400596, + "grad_norm": 1.9698563814163208, + "learning_rate": 4.8438849144151836e-05, + "loss": 1.0872, + "step": 100600 + }, + { + "epoch": 1.562718229643539, + "grad_norm": 2.018266201019287, + "learning_rate": 4.8437297288908894e-05, + "loss": 1.0915, + "step": 100700 + }, + { + "epoch": 1.564270084886482, + "grad_norm": 2.2007317543029785, + "learning_rate": 4.843574543366595e-05, + "loss": 1.0931, + "step": 100800 + }, + { + "epoch": 1.5658219401294247, + "grad_norm": 2.259827136993408, + "learning_rate": 4.843419357842301e-05, + "loss": 1.1038, + "step": 100900 + }, + { + "epoch": 1.5673737953723677, + "grad_norm": 2.568490982055664, + "learning_rate": 4.843264172318007e-05, + "loss": 1.0814, + "step": 101000 + }, + { + "epoch": 1.5689256506153106, + "grad_norm": 2.2101008892059326, + "learning_rate": 4.8431089867937125e-05, + "loss": 1.102, + "step": 101100 + }, + { + "epoch": 1.5704775058582534, + "grad_norm": 2.0225656032562256, + "learning_rate": 4.8429538012694176e-05, + "loss": 1.1161, + "step": 101200 + }, + { + "epoch": 1.5720293611011966, + "grad_norm": 2.544382333755493, + "learning_rate": 4.8427986157451234e-05, + "loss": 1.0915, + "step": 101300 + }, + { + "epoch": 1.5735812163441394, + "grad_norm": 2.1352312564849854, + "learning_rate": 4.842643430220829e-05, + "loss": 1.0963, + "step": 101400 + }, + { + "epoch": 1.5751330715870824, + "grad_norm": 2.783745527267456, + "learning_rate": 4.842488244696535e-05, + "loss": 1.0833, + "step": 101500 + }, + { + "epoch": 1.5766849268300254, + "grad_norm": 2.208181858062744, + "learning_rate": 4.842333059172241e-05, + "loss": 1.1058, + "step": 101600 + }, + { + "epoch": 1.5782367820729681, + "grad_norm": 2.9489893913269043, + "learning_rate": 4.8421778736479465e-05, + "loss": 1.0939, + "step": 101700 + }, + { + "epoch": 1.5797886373159111, + "grad_norm": 2.812354564666748, + "learning_rate": 4.8420226881236516e-05, + "loss": 1.0871, + "step": 101800 + }, + { + "epoch": 1.5813404925588541, + "grad_norm": 2.402590274810791, + "learning_rate": 4.8418675025993574e-05, + "loss": 1.0897, + "step": 101900 + }, + { + "epoch": 1.5828923478017969, + "grad_norm": 2.327899694442749, + "learning_rate": 4.841712317075063e-05, + "loss": 1.0848, + "step": 102000 + }, + { + "epoch": 1.58444420304474, + "grad_norm": 2.308974504470825, + "learning_rate": 4.841557131550769e-05, + "loss": 1.0889, + "step": 102100 + }, + { + "epoch": 1.5859960582876829, + "grad_norm": 1.8988031148910522, + "learning_rate": 4.841401946026475e-05, + "loss": 1.0916, + "step": 102200 + }, + { + "epoch": 1.5875479135306259, + "grad_norm": 2.443079710006714, + "learning_rate": 4.8412467605021805e-05, + "loss": 1.0783, + "step": 102300 + }, + { + "epoch": 1.5890997687735688, + "grad_norm": 2.450003147125244, + "learning_rate": 4.841091574977886e-05, + "loss": 1.0788, + "step": 102400 + }, + { + "epoch": 1.5906516240165116, + "grad_norm": 2.2849245071411133, + "learning_rate": 4.840936389453592e-05, + "loss": 1.0927, + "step": 102500 + }, + { + "epoch": 1.5922034792594548, + "grad_norm": 2.752744436264038, + "learning_rate": 4.840781203929298e-05, + "loss": 1.086, + "step": 102600 + }, + { + "epoch": 1.5937553345023976, + "grad_norm": 2.229478597640991, + "learning_rate": 4.8406260184050036e-05, + "loss": 1.0857, + "step": 102700 + }, + { + "epoch": 1.5953071897453406, + "grad_norm": 1.991997480392456, + "learning_rate": 4.8404708328807093e-05, + "loss": 1.0987, + "step": 102800 + }, + { + "epoch": 1.5968590449882836, + "grad_norm": 2.213447093963623, + "learning_rate": 4.840315647356415e-05, + "loss": 1.0723, + "step": 102900 + }, + { + "epoch": 1.5984109002312263, + "grad_norm": 2.0651183128356934, + "learning_rate": 4.840160461832121e-05, + "loss": 1.0883, + "step": 103000 + }, + { + "epoch": 1.5999627554741693, + "grad_norm": 2.27315092086792, + "learning_rate": 4.840005276307826e-05, + "loss": 1.0964, + "step": 103100 + }, + { + "epoch": 1.6015146107171123, + "grad_norm": 2.5370280742645264, + "learning_rate": 4.839850090783532e-05, + "loss": 1.0791, + "step": 103200 + }, + { + "epoch": 1.603066465960055, + "grad_norm": 2.3335041999816895, + "learning_rate": 4.8396949052592375e-05, + "loss": 1.1005, + "step": 103300 + }, + { + "epoch": 1.6046183212029983, + "grad_norm": 2.0514893531799316, + "learning_rate": 4.839539719734943e-05, + "loss": 1.1146, + "step": 103400 + }, + { + "epoch": 1.606170176445941, + "grad_norm": 2.582895040512085, + "learning_rate": 4.839384534210649e-05, + "loss": 1.091, + "step": 103500 + }, + { + "epoch": 1.607722031688884, + "grad_norm": 2.004833936691284, + "learning_rate": 4.839229348686355e-05, + "loss": 1.0627, + "step": 103600 + }, + { + "epoch": 1.609273886931827, + "grad_norm": 2.5738303661346436, + "learning_rate": 4.8390741631620606e-05, + "loss": 1.0775, + "step": 103700 + }, + { + "epoch": 1.6108257421747698, + "grad_norm": 2.5076212882995605, + "learning_rate": 4.8389189776377664e-05, + "loss": 1.1139, + "step": 103800 + }, + { + "epoch": 1.612377597417713, + "grad_norm": 2.3125298023223877, + "learning_rate": 4.838763792113472e-05, + "loss": 1.072, + "step": 103900 + }, + { + "epoch": 1.6139294526606558, + "grad_norm": 2.3345038890838623, + "learning_rate": 4.838608606589178e-05, + "loss": 1.0998, + "step": 104000 + }, + { + "epoch": 1.6154813079035988, + "grad_norm": 2.6464548110961914, + "learning_rate": 4.838453421064884e-05, + "loss": 1.0903, + "step": 104100 + }, + { + "epoch": 1.6170331631465418, + "grad_norm": 2.383648633956909, + "learning_rate": 4.8382982355405895e-05, + "loss": 1.0996, + "step": 104200 + }, + { + "epoch": 1.6185850183894845, + "grad_norm": 2.0716552734375, + "learning_rate": 4.838143050016295e-05, + "loss": 1.0979, + "step": 104300 + }, + { + "epoch": 1.6201368736324275, + "grad_norm": 2.2472150325775146, + "learning_rate": 4.8379878644920004e-05, + "loss": 1.0753, + "step": 104400 + }, + { + "epoch": 1.6216887288753705, + "grad_norm": 2.413191318511963, + "learning_rate": 4.837832678967706e-05, + "loss": 1.0789, + "step": 104500 + }, + { + "epoch": 1.6232405841183133, + "grad_norm": 2.343839168548584, + "learning_rate": 4.837677493443411e-05, + "loss": 1.1059, + "step": 104600 + }, + { + "epoch": 1.6247924393612565, + "grad_norm": 2.3699560165405273, + "learning_rate": 4.837522307919117e-05, + "loss": 1.102, + "step": 104700 + }, + { + "epoch": 1.6263442946041993, + "grad_norm": 2.4309887886047363, + "learning_rate": 4.837367122394823e-05, + "loss": 1.0849, + "step": 104800 + }, + { + "epoch": 1.6278961498471423, + "grad_norm": 2.464973211288452, + "learning_rate": 4.8372119368705286e-05, + "loss": 1.1039, + "step": 104900 + }, + { + "epoch": 1.6294480050900852, + "grad_norm": 2.468033790588379, + "learning_rate": 4.8370567513462344e-05, + "loss": 1.0626, + "step": 105000 + }, + { + "epoch": 1.630999860333028, + "grad_norm": 2.6809568405151367, + "learning_rate": 4.83690156582194e-05, + "loss": 1.0831, + "step": 105100 + }, + { + "epoch": 1.6325517155759712, + "grad_norm": 2.0503652095794678, + "learning_rate": 4.836746380297646e-05, + "loss": 1.1011, + "step": 105200 + }, + { + "epoch": 1.634103570818914, + "grad_norm": 2.310662031173706, + "learning_rate": 4.836591194773352e-05, + "loss": 1.0995, + "step": 105300 + }, + { + "epoch": 1.635655426061857, + "grad_norm": 2.173369884490967, + "learning_rate": 4.8364360092490575e-05, + "loss": 1.0826, + "step": 105400 + }, + { + "epoch": 1.6372072813048, + "grad_norm": 1.8807876110076904, + "learning_rate": 4.836280823724763e-05, + "loss": 1.0779, + "step": 105500 + }, + { + "epoch": 1.6387591365477427, + "grad_norm": 2.493671417236328, + "learning_rate": 4.836125638200469e-05, + "loss": 1.1023, + "step": 105600 + }, + { + "epoch": 1.6403109917906857, + "grad_norm": 2.055972099304199, + "learning_rate": 4.835970452676175e-05, + "loss": 1.0805, + "step": 105700 + }, + { + "epoch": 1.6418628470336287, + "grad_norm": 2.361999750137329, + "learning_rate": 4.8358152671518806e-05, + "loss": 1.0922, + "step": 105800 + }, + { + "epoch": 1.6434147022765715, + "grad_norm": 2.321727991104126, + "learning_rate": 4.835660081627586e-05, + "loss": 1.0765, + "step": 105900 + }, + { + "epoch": 1.6449665575195147, + "grad_norm": 2.269177198410034, + "learning_rate": 4.8355048961032914e-05, + "loss": 1.0961, + "step": 106000 + }, + { + "epoch": 1.6465184127624575, + "grad_norm": 2.213909387588501, + "learning_rate": 4.835349710578997e-05, + "loss": 1.0659, + "step": 106100 + }, + { + "epoch": 1.6480702680054005, + "grad_norm": 2.539454221725464, + "learning_rate": 4.835194525054703e-05, + "loss": 1.0631, + "step": 106200 + }, + { + "epoch": 1.6496221232483435, + "grad_norm": 2.369426727294922, + "learning_rate": 4.835039339530409e-05, + "loss": 1.1118, + "step": 106300 + }, + { + "epoch": 1.6511739784912862, + "grad_norm": 2.130342483520508, + "learning_rate": 4.8348841540061145e-05, + "loss": 1.0777, + "step": 106400 + }, + { + "epoch": 1.6527258337342294, + "grad_norm": 2.4684133529663086, + "learning_rate": 4.83472896848182e-05, + "loss": 1.0849, + "step": 106500 + }, + { + "epoch": 1.6542776889771722, + "grad_norm": 2.468327045440674, + "learning_rate": 4.834573782957526e-05, + "loss": 1.0841, + "step": 106600 + }, + { + "epoch": 1.6558295442201152, + "grad_norm": 2.0428032875061035, + "learning_rate": 4.834418597433232e-05, + "loss": 1.0714, + "step": 106700 + }, + { + "epoch": 1.6573813994630582, + "grad_norm": 2.2243077754974365, + "learning_rate": 4.8342634119089376e-05, + "loss": 1.1006, + "step": 106800 + }, + { + "epoch": 1.658933254706001, + "grad_norm": 2.0093777179718018, + "learning_rate": 4.8341082263846434e-05, + "loss": 1.0677, + "step": 106900 + }, + { + "epoch": 1.660485109948944, + "grad_norm": 2.3485684394836426, + "learning_rate": 4.833953040860349e-05, + "loss": 1.095, + "step": 107000 + }, + { + "epoch": 1.662036965191887, + "grad_norm": 2.494544744491577, + "learning_rate": 4.833797855336055e-05, + "loss": 1.1014, + "step": 107100 + }, + { + "epoch": 1.6635888204348297, + "grad_norm": 2.3687689304351807, + "learning_rate": 4.83364266981176e-05, + "loss": 1.0776, + "step": 107200 + }, + { + "epoch": 1.665140675677773, + "grad_norm": 2.286187171936035, + "learning_rate": 4.833487484287466e-05, + "loss": 1.1119, + "step": 107300 + }, + { + "epoch": 1.6666925309207157, + "grad_norm": 2.225325584411621, + "learning_rate": 4.8333322987631716e-05, + "loss": 1.0674, + "step": 107400 + }, + { + "epoch": 1.6682443861636587, + "grad_norm": 2.416855573654175, + "learning_rate": 4.8331771132388774e-05, + "loss": 1.0843, + "step": 107500 + }, + { + "epoch": 1.6697962414066017, + "grad_norm": 2.224277973175049, + "learning_rate": 4.833021927714583e-05, + "loss": 1.0832, + "step": 107600 + }, + { + "epoch": 1.6713480966495444, + "grad_norm": 2.3071978092193604, + "learning_rate": 4.832866742190288e-05, + "loss": 1.1245, + "step": 107700 + }, + { + "epoch": 1.6728999518924874, + "grad_norm": 2.6478264331817627, + "learning_rate": 4.832711556665994e-05, + "loss": 1.086, + "step": 107800 + }, + { + "epoch": 1.6744518071354304, + "grad_norm": 2.273693799972534, + "learning_rate": 4.8325563711417e-05, + "loss": 1.1043, + "step": 107900 + }, + { + "epoch": 1.6760036623783734, + "grad_norm": 2.275265693664551, + "learning_rate": 4.8324011856174056e-05, + "loss": 1.1134, + "step": 108000 + }, + { + "epoch": 1.6775555176213164, + "grad_norm": 2.539862871170044, + "learning_rate": 4.8322460000931114e-05, + "loss": 1.0871, + "step": 108100 + }, + { + "epoch": 1.6791073728642592, + "grad_norm": 2.447202205657959, + "learning_rate": 4.832090814568817e-05, + "loss": 1.1026, + "step": 108200 + }, + { + "epoch": 1.6806592281072021, + "grad_norm": 2.1919965744018555, + "learning_rate": 4.831935629044523e-05, + "loss": 1.074, + "step": 108300 + }, + { + "epoch": 1.6822110833501451, + "grad_norm": 2.276486396789551, + "learning_rate": 4.831780443520229e-05, + "loss": 1.0973, + "step": 108400 + }, + { + "epoch": 1.683762938593088, + "grad_norm": 2.577850341796875, + "learning_rate": 4.8316252579959345e-05, + "loss": 1.0961, + "step": 108500 + }, + { + "epoch": 1.685314793836031, + "grad_norm": 2.4232072830200195, + "learning_rate": 4.83147007247164e-05, + "loss": 1.0824, + "step": 108600 + }, + { + "epoch": 1.6868666490789739, + "grad_norm": 2.321362257003784, + "learning_rate": 4.831314886947346e-05, + "loss": 1.0833, + "step": 108700 + }, + { + "epoch": 1.6884185043219169, + "grad_norm": 2.5468544960021973, + "learning_rate": 4.831159701423051e-05, + "loss": 1.0847, + "step": 108800 + }, + { + "epoch": 1.6899703595648599, + "grad_norm": 2.3139944076538086, + "learning_rate": 4.831004515898757e-05, + "loss": 1.0887, + "step": 108900 + }, + { + "epoch": 1.6915222148078026, + "grad_norm": 2.2984142303466797, + "learning_rate": 4.830849330374463e-05, + "loss": 1.0979, + "step": 109000 + }, + { + "epoch": 1.6930740700507456, + "grad_norm": 2.5074856281280518, + "learning_rate": 4.8306941448501684e-05, + "loss": 1.074, + "step": 109100 + }, + { + "epoch": 1.6946259252936886, + "grad_norm": 2.1612424850463867, + "learning_rate": 4.830538959325874e-05, + "loss": 1.0839, + "step": 109200 + }, + { + "epoch": 1.6961777805366316, + "grad_norm": 2.457338809967041, + "learning_rate": 4.83038377380158e-05, + "loss": 1.0883, + "step": 109300 + }, + { + "epoch": 1.6977296357795746, + "grad_norm": 2.4035794734954834, + "learning_rate": 4.830228588277286e-05, + "loss": 1.0619, + "step": 109400 + }, + { + "epoch": 1.6992814910225174, + "grad_norm": 2.3478317260742188, + "learning_rate": 4.8300734027529915e-05, + "loss": 1.0911, + "step": 109500 + }, + { + "epoch": 1.7008333462654603, + "grad_norm": 2.210148572921753, + "learning_rate": 4.829918217228697e-05, + "loss": 1.086, + "step": 109600 + }, + { + "epoch": 1.7023852015084033, + "grad_norm": 2.5271687507629395, + "learning_rate": 4.829763031704403e-05, + "loss": 1.0951, + "step": 109700 + }, + { + "epoch": 1.703937056751346, + "grad_norm": 2.0551509857177734, + "learning_rate": 4.829607846180109e-05, + "loss": 1.1064, + "step": 109800 + }, + { + "epoch": 1.7054889119942893, + "grad_norm": 2.443232297897339, + "learning_rate": 4.8294526606558146e-05, + "loss": 1.0767, + "step": 109900 + }, + { + "epoch": 1.707040767237232, + "grad_norm": 2.0984151363372803, + "learning_rate": 4.8292974751315204e-05, + "loss": 1.0801, + "step": 110000 + }, + { + "epoch": 1.708592622480175, + "grad_norm": 2.4197680950164795, + "learning_rate": 4.8291422896072255e-05, + "loss": 1.105, + "step": 110100 + }, + { + "epoch": 1.710144477723118, + "grad_norm": 2.436518430709839, + "learning_rate": 4.828987104082931e-05, + "loss": 1.0684, + "step": 110200 + }, + { + "epoch": 1.7116963329660608, + "grad_norm": 2.4050610065460205, + "learning_rate": 4.828831918558637e-05, + "loss": 1.0832, + "step": 110300 + }, + { + "epoch": 1.7132481882090038, + "grad_norm": 2.0894031524658203, + "learning_rate": 4.828676733034343e-05, + "loss": 1.0764, + "step": 110400 + }, + { + "epoch": 1.7148000434519468, + "grad_norm": 2.1264595985412598, + "learning_rate": 4.8285215475100486e-05, + "loss": 1.0866, + "step": 110500 + }, + { + "epoch": 1.7163518986948896, + "grad_norm": 2.6174960136413574, + "learning_rate": 4.8283663619857544e-05, + "loss": 1.0887, + "step": 110600 + }, + { + "epoch": 1.7179037539378328, + "grad_norm": 2.2942001819610596, + "learning_rate": 4.82821117646146e-05, + "loss": 1.0831, + "step": 110700 + }, + { + "epoch": 1.7194556091807756, + "grad_norm": 1.934415340423584, + "learning_rate": 4.828055990937166e-05, + "loss": 1.0837, + "step": 110800 + }, + { + "epoch": 1.7210074644237185, + "grad_norm": 2.1386795043945312, + "learning_rate": 4.827900805412871e-05, + "loss": 1.0718, + "step": 110900 + }, + { + "epoch": 1.7225593196666615, + "grad_norm": 2.057543992996216, + "learning_rate": 4.827745619888577e-05, + "loss": 1.0997, + "step": 111000 + }, + { + "epoch": 1.7241111749096043, + "grad_norm": 2.408341407775879, + "learning_rate": 4.8275904343642826e-05, + "loss": 1.1036, + "step": 111100 + }, + { + "epoch": 1.7256630301525475, + "grad_norm": 2.4271554946899414, + "learning_rate": 4.8274352488399884e-05, + "loss": 1.0797, + "step": 111200 + }, + { + "epoch": 1.7272148853954903, + "grad_norm": 2.4379501342773438, + "learning_rate": 4.827280063315694e-05, + "loss": 1.0766, + "step": 111300 + }, + { + "epoch": 1.7287667406384333, + "grad_norm": 2.436361312866211, + "learning_rate": 4.8271248777914e-05, + "loss": 1.0903, + "step": 111400 + }, + { + "epoch": 1.7303185958813763, + "grad_norm": 2.2575366497039795, + "learning_rate": 4.826969692267106e-05, + "loss": 1.0968, + "step": 111500 + }, + { + "epoch": 1.731870451124319, + "grad_norm": 2.1207995414733887, + "learning_rate": 4.826814506742811e-05, + "loss": 1.085, + "step": 111600 + }, + { + "epoch": 1.733422306367262, + "grad_norm": 2.482725143432617, + "learning_rate": 4.8266593212185166e-05, + "loss": 1.0599, + "step": 111700 + }, + { + "epoch": 1.734974161610205, + "grad_norm": 2.253748655319214, + "learning_rate": 4.8265041356942223e-05, + "loss": 1.0496, + "step": 111800 + }, + { + "epoch": 1.7365260168531478, + "grad_norm": 2.0366814136505127, + "learning_rate": 4.826348950169928e-05, + "loss": 1.0767, + "step": 111900 + }, + { + "epoch": 1.738077872096091, + "grad_norm": 2.260399341583252, + "learning_rate": 4.826193764645634e-05, + "loss": 1.0844, + "step": 112000 + }, + { + "epoch": 1.7396297273390338, + "grad_norm": 2.378796339035034, + "learning_rate": 4.82603857912134e-05, + "loss": 1.0899, + "step": 112100 + }, + { + "epoch": 1.7411815825819768, + "grad_norm": 2.3033385276794434, + "learning_rate": 4.8258833935970454e-05, + "loss": 1.0573, + "step": 112200 + }, + { + "epoch": 1.7427334378249197, + "grad_norm": 2.3621747493743896, + "learning_rate": 4.825728208072751e-05, + "loss": 1.0694, + "step": 112300 + }, + { + "epoch": 1.7442852930678625, + "grad_norm": 2.5841023921966553, + "learning_rate": 4.825573022548457e-05, + "loss": 1.0558, + "step": 112400 + }, + { + "epoch": 1.7458371483108057, + "grad_norm": 2.4916257858276367, + "learning_rate": 4.825417837024163e-05, + "loss": 1.1038, + "step": 112500 + }, + { + "epoch": 1.7473890035537485, + "grad_norm": 1.9130871295928955, + "learning_rate": 4.8252626514998685e-05, + "loss": 1.0835, + "step": 112600 + }, + { + "epoch": 1.7489408587966915, + "grad_norm": 2.4349403381347656, + "learning_rate": 4.825107465975574e-05, + "loss": 1.0776, + "step": 112700 + }, + { + "epoch": 1.7504927140396345, + "grad_norm": 2.358518123626709, + "learning_rate": 4.82495228045128e-05, + "loss": 1.0856, + "step": 112800 + }, + { + "epoch": 1.7520445692825772, + "grad_norm": 1.981557011604309, + "learning_rate": 4.824797094926985e-05, + "loss": 1.0957, + "step": 112900 + }, + { + "epoch": 1.7535964245255202, + "grad_norm": 2.483365297317505, + "learning_rate": 4.824641909402691e-05, + "loss": 1.0798, + "step": 113000 + }, + { + "epoch": 1.7551482797684632, + "grad_norm": 2.0990121364593506, + "learning_rate": 4.824486723878397e-05, + "loss": 1.0944, + "step": 113100 + }, + { + "epoch": 1.756700135011406, + "grad_norm": 2.4795525074005127, + "learning_rate": 4.8243315383541025e-05, + "loss": 1.0656, + "step": 113200 + }, + { + "epoch": 1.7582519902543492, + "grad_norm": 2.1153392791748047, + "learning_rate": 4.824176352829808e-05, + "loss": 1.0887, + "step": 113300 + }, + { + "epoch": 1.759803845497292, + "grad_norm": 1.9288995265960693, + "learning_rate": 4.824021167305514e-05, + "loss": 1.0884, + "step": 113400 + }, + { + "epoch": 1.761355700740235, + "grad_norm": 2.1801743507385254, + "learning_rate": 4.82386598178122e-05, + "loss": 1.072, + "step": 113500 + }, + { + "epoch": 1.762907555983178, + "grad_norm": 2.234731674194336, + "learning_rate": 4.8237107962569256e-05, + "loss": 1.0774, + "step": 113600 + }, + { + "epoch": 1.7644594112261207, + "grad_norm": 2.6526927947998047, + "learning_rate": 4.8235556107326314e-05, + "loss": 1.0804, + "step": 113700 + }, + { + "epoch": 1.766011266469064, + "grad_norm": 2.3809196949005127, + "learning_rate": 4.823400425208337e-05, + "loss": 1.1025, + "step": 113800 + }, + { + "epoch": 1.7675631217120067, + "grad_norm": 2.5246968269348145, + "learning_rate": 4.823245239684043e-05, + "loss": 1.0804, + "step": 113900 + }, + { + "epoch": 1.7691149769549497, + "grad_norm": 2.3987340927124023, + "learning_rate": 4.823090054159749e-05, + "loss": 1.0853, + "step": 114000 + }, + { + "epoch": 1.7706668321978927, + "grad_norm": 2.483727216720581, + "learning_rate": 4.8229348686354545e-05, + "loss": 1.0808, + "step": 114100 + }, + { + "epoch": 1.7722186874408354, + "grad_norm": 2.440681219100952, + "learning_rate": 4.8227796831111596e-05, + "loss": 1.0799, + "step": 114200 + }, + { + "epoch": 1.7737705426837784, + "grad_norm": 2.3732333183288574, + "learning_rate": 4.8226244975868654e-05, + "loss": 1.0963, + "step": 114300 + }, + { + "epoch": 1.7753223979267214, + "grad_norm": 2.1413209438323975, + "learning_rate": 4.8224693120625705e-05, + "loss": 1.0618, + "step": 114400 + }, + { + "epoch": 1.7768742531696642, + "grad_norm": 2.341684103012085, + "learning_rate": 4.822314126538276e-05, + "loss": 1.0748, + "step": 114500 + }, + { + "epoch": 1.7784261084126074, + "grad_norm": 2.107227325439453, + "learning_rate": 4.822158941013982e-05, + "loss": 1.0923, + "step": 114600 + }, + { + "epoch": 1.7799779636555502, + "grad_norm": 2.0643272399902344, + "learning_rate": 4.822003755489688e-05, + "loss": 1.1075, + "step": 114700 + }, + { + "epoch": 1.7815298188984932, + "grad_norm": 2.261986494064331, + "learning_rate": 4.8218485699653936e-05, + "loss": 1.0811, + "step": 114800 + }, + { + "epoch": 1.7830816741414361, + "grad_norm": 2.0926873683929443, + "learning_rate": 4.8216933844410993e-05, + "loss": 1.0665, + "step": 114900 + }, + { + "epoch": 1.784633529384379, + "grad_norm": 2.431126832962036, + "learning_rate": 4.821538198916805e-05, + "loss": 1.0766, + "step": 115000 + }, + { + "epoch": 1.7861853846273221, + "grad_norm": 2.0504894256591797, + "learning_rate": 4.821383013392511e-05, + "loss": 1.1035, + "step": 115100 + }, + { + "epoch": 1.787737239870265, + "grad_norm": 2.1101536750793457, + "learning_rate": 4.821227827868217e-05, + "loss": 1.0799, + "step": 115200 + }, + { + "epoch": 1.7892890951132079, + "grad_norm": 2.618450880050659, + "learning_rate": 4.8210726423439224e-05, + "loss": 1.081, + "step": 115300 + }, + { + "epoch": 1.7908409503561509, + "grad_norm": 2.42140793800354, + "learning_rate": 4.820917456819628e-05, + "loss": 1.0586, + "step": 115400 + }, + { + "epoch": 1.7923928055990936, + "grad_norm": 2.140634059906006, + "learning_rate": 4.820762271295334e-05, + "loss": 1.0696, + "step": 115500 + }, + { + "epoch": 1.7939446608420366, + "grad_norm": 2.527614116668701, + "learning_rate": 4.82060708577104e-05, + "loss": 1.0794, + "step": 115600 + }, + { + "epoch": 1.7954965160849796, + "grad_norm": 2.8312926292419434, + "learning_rate": 4.820451900246745e-05, + "loss": 1.0781, + "step": 115700 + }, + { + "epoch": 1.7970483713279224, + "grad_norm": 2.3439109325408936, + "learning_rate": 4.8202967147224506e-05, + "loss": 1.0636, + "step": 115800 + }, + { + "epoch": 1.7986002265708656, + "grad_norm": 2.349926471710205, + "learning_rate": 4.8201415291981564e-05, + "loss": 1.0682, + "step": 115900 + }, + { + "epoch": 1.8001520818138084, + "grad_norm": 2.3758840560913086, + "learning_rate": 4.819986343673862e-05, + "loss": 1.0961, + "step": 116000 + }, + { + "epoch": 1.8017039370567514, + "grad_norm": 2.378147840499878, + "learning_rate": 4.819831158149568e-05, + "loss": 1.0963, + "step": 116100 + }, + { + "epoch": 1.8032557922996943, + "grad_norm": 2.429334878921509, + "learning_rate": 4.819675972625274e-05, + "loss": 1.0804, + "step": 116200 + }, + { + "epoch": 1.8048076475426371, + "grad_norm": 2.3186795711517334, + "learning_rate": 4.8195207871009795e-05, + "loss": 1.0972, + "step": 116300 + }, + { + "epoch": 1.80635950278558, + "grad_norm": 2.449737071990967, + "learning_rate": 4.819365601576685e-05, + "loss": 1.072, + "step": 116400 + }, + { + "epoch": 1.807911358028523, + "grad_norm": 2.0092339515686035, + "learning_rate": 4.819210416052391e-05, + "loss": 1.0762, + "step": 116500 + }, + { + "epoch": 1.809463213271466, + "grad_norm": 2.553809642791748, + "learning_rate": 4.819055230528097e-05, + "loss": 1.0738, + "step": 116600 + }, + { + "epoch": 1.811015068514409, + "grad_norm": 2.23128080368042, + "learning_rate": 4.8189000450038026e-05, + "loss": 1.0816, + "step": 116700 + }, + { + "epoch": 1.8125669237573518, + "grad_norm": 2.235153913497925, + "learning_rate": 4.8187448594795084e-05, + "loss": 1.0561, + "step": 116800 + }, + { + "epoch": 1.8141187790002948, + "grad_norm": 2.4133265018463135, + "learning_rate": 4.818589673955214e-05, + "loss": 1.0759, + "step": 116900 + }, + { + "epoch": 1.8156706342432378, + "grad_norm": 2.5476932525634766, + "learning_rate": 4.818434488430919e-05, + "loss": 1.0983, + "step": 117000 + }, + { + "epoch": 1.8172224894861806, + "grad_norm": 2.037759304046631, + "learning_rate": 4.818279302906625e-05, + "loss": 1.0796, + "step": 117100 + }, + { + "epoch": 1.8187743447291238, + "grad_norm": 1.8041654825210571, + "learning_rate": 4.818124117382331e-05, + "loss": 1.0743, + "step": 117200 + }, + { + "epoch": 1.8203261999720666, + "grad_norm": 2.1287336349487305, + "learning_rate": 4.8179689318580366e-05, + "loss": 1.0491, + "step": 117300 + }, + { + "epoch": 1.8218780552150096, + "grad_norm": 2.779341220855713, + "learning_rate": 4.8178137463337424e-05, + "loss": 1.0649, + "step": 117400 + }, + { + "epoch": 1.8234299104579526, + "grad_norm": 2.8096201419830322, + "learning_rate": 4.8176585608094475e-05, + "loss": 1.1033, + "step": 117500 + }, + { + "epoch": 1.8249817657008953, + "grad_norm": 2.444505453109741, + "learning_rate": 4.817503375285153e-05, + "loss": 1.0688, + "step": 117600 + }, + { + "epoch": 1.8265336209438383, + "grad_norm": 2.2384262084960938, + "learning_rate": 4.817348189760859e-05, + "loss": 1.0746, + "step": 117700 + }, + { + "epoch": 1.8280854761867813, + "grad_norm": 2.4848833084106445, + "learning_rate": 4.817193004236565e-05, + "loss": 1.0785, + "step": 117800 + }, + { + "epoch": 1.8296373314297243, + "grad_norm": 1.9756971597671509, + "learning_rate": 4.8170378187122706e-05, + "loss": 1.0819, + "step": 117900 + }, + { + "epoch": 1.8311891866726673, + "grad_norm": 2.0762622356414795, + "learning_rate": 4.8168826331879763e-05, + "loss": 1.084, + "step": 118000 + }, + { + "epoch": 1.83274104191561, + "grad_norm": 2.1194162368774414, + "learning_rate": 4.816727447663682e-05, + "loss": 1.0658, + "step": 118100 + }, + { + "epoch": 1.834292897158553, + "grad_norm": 2.2249464988708496, + "learning_rate": 4.816572262139388e-05, + "loss": 1.0682, + "step": 118200 + }, + { + "epoch": 1.835844752401496, + "grad_norm": 2.7198054790496826, + "learning_rate": 4.816417076615094e-05, + "loss": 1.0685, + "step": 118300 + }, + { + "epoch": 1.8373966076444388, + "grad_norm": 2.395421266555786, + "learning_rate": 4.8162618910907994e-05, + "loss": 1.0476, + "step": 118400 + }, + { + "epoch": 1.838948462887382, + "grad_norm": 2.5260422229766846, + "learning_rate": 4.816106705566505e-05, + "loss": 1.0814, + "step": 118500 + }, + { + "epoch": 1.8405003181303248, + "grad_norm": 2.381462574005127, + "learning_rate": 4.81595152004221e-05, + "loss": 1.0948, + "step": 118600 + }, + { + "epoch": 1.8420521733732678, + "grad_norm": 2.6957783699035645, + "learning_rate": 4.815796334517916e-05, + "loss": 1.0594, + "step": 118700 + }, + { + "epoch": 1.8436040286162108, + "grad_norm": 2.6098296642303467, + "learning_rate": 4.815641148993622e-05, + "loss": 1.0691, + "step": 118800 + }, + { + "epoch": 1.8451558838591535, + "grad_norm": 2.336134672164917, + "learning_rate": 4.8154859634693276e-05, + "loss": 1.0754, + "step": 118900 + }, + { + "epoch": 1.8467077391020965, + "grad_norm": 2.165764093399048, + "learning_rate": 4.8153307779450334e-05, + "loss": 1.0785, + "step": 119000 + }, + { + "epoch": 1.8482595943450395, + "grad_norm": 2.52502179145813, + "learning_rate": 4.815175592420739e-05, + "loss": 1.0789, + "step": 119100 + }, + { + "epoch": 1.8498114495879823, + "grad_norm": 2.594719409942627, + "learning_rate": 4.815020406896445e-05, + "loss": 1.0751, + "step": 119200 + }, + { + "epoch": 1.8513633048309255, + "grad_norm": 2.4303109645843506, + "learning_rate": 4.814865221372151e-05, + "loss": 1.075, + "step": 119300 + }, + { + "epoch": 1.8529151600738683, + "grad_norm": 2.4006989002227783, + "learning_rate": 4.8147100358478565e-05, + "loss": 1.0862, + "step": 119400 + }, + { + "epoch": 1.8544670153168112, + "grad_norm": 2.044224977493286, + "learning_rate": 4.814554850323562e-05, + "loss": 1.0807, + "step": 119500 + }, + { + "epoch": 1.8560188705597542, + "grad_norm": 2.192030668258667, + "learning_rate": 4.814399664799268e-05, + "loss": 1.0972, + "step": 119600 + }, + { + "epoch": 1.857570725802697, + "grad_norm": 2.3220582008361816, + "learning_rate": 4.814244479274974e-05, + "loss": 1.0908, + "step": 119700 + }, + { + "epoch": 1.8591225810456402, + "grad_norm": 2.401507616043091, + "learning_rate": 4.8140892937506796e-05, + "loss": 1.091, + "step": 119800 + }, + { + "epoch": 1.860674436288583, + "grad_norm": 2.05117130279541, + "learning_rate": 4.813934108226385e-05, + "loss": 1.081, + "step": 119900 + }, + { + "epoch": 1.862226291531526, + "grad_norm": 2.2260279655456543, + "learning_rate": 4.8137789227020905e-05, + "loss": 1.0739, + "step": 120000 + }, + { + "epoch": 1.863778146774469, + "grad_norm": 2.2794127464294434, + "learning_rate": 4.813623737177796e-05, + "loss": 1.0681, + "step": 120100 + }, + { + "epoch": 1.8653300020174117, + "grad_norm": 2.2540292739868164, + "learning_rate": 4.813468551653502e-05, + "loss": 1.0758, + "step": 120200 + }, + { + "epoch": 1.8668818572603547, + "grad_norm": 2.580883026123047, + "learning_rate": 4.813313366129208e-05, + "loss": 1.1091, + "step": 120300 + }, + { + "epoch": 1.8684337125032977, + "grad_norm": 2.353062152862549, + "learning_rate": 4.8131581806049136e-05, + "loss": 1.0861, + "step": 120400 + }, + { + "epoch": 1.8699855677462405, + "grad_norm": 2.2393531799316406, + "learning_rate": 4.8130029950806194e-05, + "loss": 1.0746, + "step": 120500 + }, + { + "epoch": 1.8715374229891837, + "grad_norm": 2.7816336154937744, + "learning_rate": 4.812847809556325e-05, + "loss": 1.0866, + "step": 120600 + }, + { + "epoch": 1.8730892782321265, + "grad_norm": 2.420135259628296, + "learning_rate": 4.81269262403203e-05, + "loss": 1.0895, + "step": 120700 + }, + { + "epoch": 1.8746411334750694, + "grad_norm": 2.068148136138916, + "learning_rate": 4.812537438507736e-05, + "loss": 1.0727, + "step": 120800 + }, + { + "epoch": 1.8761929887180124, + "grad_norm": 2.162074327468872, + "learning_rate": 4.812382252983442e-05, + "loss": 1.0858, + "step": 120900 + }, + { + "epoch": 1.8777448439609552, + "grad_norm": 2.095033645629883, + "learning_rate": 4.8122270674591476e-05, + "loss": 1.0673, + "step": 121000 + }, + { + "epoch": 1.8792966992038984, + "grad_norm": 2.1828792095184326, + "learning_rate": 4.8120718819348533e-05, + "loss": 1.0894, + "step": 121100 + }, + { + "epoch": 1.8808485544468412, + "grad_norm": 2.5313050746917725, + "learning_rate": 4.811916696410559e-05, + "loss": 1.0648, + "step": 121200 + }, + { + "epoch": 1.8824004096897842, + "grad_norm": 2.1740057468414307, + "learning_rate": 4.811761510886265e-05, + "loss": 1.0775, + "step": 121300 + }, + { + "epoch": 1.8839522649327272, + "grad_norm": 2.04083251953125, + "learning_rate": 4.81160632536197e-05, + "loss": 1.0887, + "step": 121400 + }, + { + "epoch": 1.88550412017567, + "grad_norm": 2.138786792755127, + "learning_rate": 4.811451139837676e-05, + "loss": 1.0649, + "step": 121500 + }, + { + "epoch": 1.887055975418613, + "grad_norm": 2.0873091220855713, + "learning_rate": 4.8112959543133815e-05, + "loss": 1.0792, + "step": 121600 + }, + { + "epoch": 1.888607830661556, + "grad_norm": 2.4566869735717773, + "learning_rate": 4.811140768789087e-05, + "loss": 1.0782, + "step": 121700 + }, + { + "epoch": 1.8901596859044987, + "grad_norm": 2.4272701740264893, + "learning_rate": 4.810985583264793e-05, + "loss": 1.0624, + "step": 121800 + }, + { + "epoch": 1.8917115411474419, + "grad_norm": 2.4703564643859863, + "learning_rate": 4.810830397740499e-05, + "loss": 1.0885, + "step": 121900 + }, + { + "epoch": 1.8932633963903847, + "grad_norm": 2.5849311351776123, + "learning_rate": 4.8106752122162046e-05, + "loss": 1.0674, + "step": 122000 + }, + { + "epoch": 1.8948152516333276, + "grad_norm": 2.0134143829345703, + "learning_rate": 4.8105200266919104e-05, + "loss": 1.0808, + "step": 122100 + }, + { + "epoch": 1.8963671068762706, + "grad_norm": 2.4470725059509277, + "learning_rate": 4.810364841167616e-05, + "loss": 1.0833, + "step": 122200 + }, + { + "epoch": 1.8979189621192134, + "grad_norm": 2.008669137954712, + "learning_rate": 4.810209655643322e-05, + "loss": 1.0646, + "step": 122300 + }, + { + "epoch": 1.8994708173621566, + "grad_norm": 2.218724489212036, + "learning_rate": 4.810054470119028e-05, + "loss": 1.0711, + "step": 122400 + }, + { + "epoch": 1.9010226726050994, + "grad_norm": 2.7357943058013916, + "learning_rate": 4.8098992845947335e-05, + "loss": 1.07, + "step": 122500 + }, + { + "epoch": 1.9025745278480424, + "grad_norm": 1.8966476917266846, + "learning_rate": 4.809744099070439e-05, + "loss": 1.0899, + "step": 122600 + }, + { + "epoch": 1.9041263830909854, + "grad_norm": 2.341580629348755, + "learning_rate": 4.8095889135461444e-05, + "loss": 1.0754, + "step": 122700 + }, + { + "epoch": 1.9056782383339281, + "grad_norm": 2.0863394737243652, + "learning_rate": 4.80943372802185e-05, + "loss": 1.075, + "step": 122800 + }, + { + "epoch": 1.9072300935768711, + "grad_norm": 2.4585351943969727, + "learning_rate": 4.809278542497556e-05, + "loss": 1.0641, + "step": 122900 + }, + { + "epoch": 1.9087819488198141, + "grad_norm": 2.6105740070343018, + "learning_rate": 4.809123356973262e-05, + "loss": 1.0848, + "step": 123000 + }, + { + "epoch": 1.9103338040627569, + "grad_norm": 2.510451316833496, + "learning_rate": 4.8089681714489675e-05, + "loss": 1.0702, + "step": 123100 + }, + { + "epoch": 1.9118856593057, + "grad_norm": 2.0170397758483887, + "learning_rate": 4.808812985924673e-05, + "loss": 1.0634, + "step": 123200 + }, + { + "epoch": 1.9134375145486429, + "grad_norm": 2.2080132961273193, + "learning_rate": 4.808657800400379e-05, + "loss": 1.0771, + "step": 123300 + }, + { + "epoch": 1.9149893697915858, + "grad_norm": 2.509495735168457, + "learning_rate": 4.808502614876085e-05, + "loss": 1.0737, + "step": 123400 + }, + { + "epoch": 1.9165412250345288, + "grad_norm": 2.6319026947021484, + "learning_rate": 4.8083474293517906e-05, + "loss": 1.0771, + "step": 123500 + }, + { + "epoch": 1.9180930802774716, + "grad_norm": 2.0826144218444824, + "learning_rate": 4.8081922438274964e-05, + "loss": 1.0698, + "step": 123600 + }, + { + "epoch": 1.9196449355204148, + "grad_norm": 2.6355018615722656, + "learning_rate": 4.808037058303202e-05, + "loss": 1.0792, + "step": 123700 + }, + { + "epoch": 1.9211967907633576, + "grad_norm": 1.8507258892059326, + "learning_rate": 4.807881872778908e-05, + "loss": 1.073, + "step": 123800 + }, + { + "epoch": 1.9227486460063006, + "grad_norm": 2.1909666061401367, + "learning_rate": 4.807726687254614e-05, + "loss": 1.0626, + "step": 123900 + }, + { + "epoch": 1.9243005012492436, + "grad_norm": 2.3938145637512207, + "learning_rate": 4.807571501730319e-05, + "loss": 1.0549, + "step": 124000 + }, + { + "epoch": 1.9258523564921863, + "grad_norm": 2.001340389251709, + "learning_rate": 4.8074163162060246e-05, + "loss": 1.0693, + "step": 124100 + }, + { + "epoch": 1.9274042117351293, + "grad_norm": 2.752593994140625, + "learning_rate": 4.8072611306817303e-05, + "loss": 1.0608, + "step": 124200 + }, + { + "epoch": 1.9289560669780723, + "grad_norm": 1.8566434383392334, + "learning_rate": 4.8071059451574354e-05, + "loss": 1.0636, + "step": 124300 + }, + { + "epoch": 1.930507922221015, + "grad_norm": 2.8065900802612305, + "learning_rate": 4.806950759633141e-05, + "loss": 1.0877, + "step": 124400 + }, + { + "epoch": 1.9320597774639583, + "grad_norm": 1.9071614742279053, + "learning_rate": 4.806795574108847e-05, + "loss": 1.0696, + "step": 124500 + }, + { + "epoch": 1.933611632706901, + "grad_norm": 2.7064669132232666, + "learning_rate": 4.806640388584553e-05, + "loss": 1.0633, + "step": 124600 + }, + { + "epoch": 1.935163487949844, + "grad_norm": 2.330824613571167, + "learning_rate": 4.8064852030602585e-05, + "loss": 1.0996, + "step": 124700 + }, + { + "epoch": 1.936715343192787, + "grad_norm": 1.9912841320037842, + "learning_rate": 4.806330017535964e-05, + "loss": 1.0613, + "step": 124800 + }, + { + "epoch": 1.9382671984357298, + "grad_norm": 2.084195613861084, + "learning_rate": 4.80617483201167e-05, + "loss": 1.0548, + "step": 124900 + }, + { + "epoch": 1.9398190536786728, + "grad_norm": 2.2805652618408203, + "learning_rate": 4.806019646487376e-05, + "loss": 1.053, + "step": 125000 + }, + { + "epoch": 1.9413709089216158, + "grad_norm": 2.013556957244873, + "learning_rate": 4.8058644609630816e-05, + "loss": 1.0651, + "step": 125100 + }, + { + "epoch": 1.9429227641645588, + "grad_norm": 2.333754301071167, + "learning_rate": 4.8057092754387874e-05, + "loss": 1.0716, + "step": 125200 + }, + { + "epoch": 1.9444746194075018, + "grad_norm": 2.6845860481262207, + "learning_rate": 4.805554089914493e-05, + "loss": 1.0646, + "step": 125300 + }, + { + "epoch": 1.9460264746504445, + "grad_norm": 2.2428903579711914, + "learning_rate": 4.805398904390199e-05, + "loss": 1.0818, + "step": 125400 + }, + { + "epoch": 1.9475783298933875, + "grad_norm": 2.3152270317077637, + "learning_rate": 4.805243718865905e-05, + "loss": 1.0688, + "step": 125500 + }, + { + "epoch": 1.9491301851363305, + "grad_norm": 2.0691685676574707, + "learning_rate": 4.80508853334161e-05, + "loss": 1.077, + "step": 125600 + }, + { + "epoch": 1.9506820403792733, + "grad_norm": 2.0317060947418213, + "learning_rate": 4.8049333478173156e-05, + "loss": 1.0657, + "step": 125700 + }, + { + "epoch": 1.9522338956222165, + "grad_norm": 1.8157519102096558, + "learning_rate": 4.8047781622930214e-05, + "loss": 1.0762, + "step": 125800 + }, + { + "epoch": 1.9537857508651593, + "grad_norm": 2.2983384132385254, + "learning_rate": 4.804622976768727e-05, + "loss": 1.063, + "step": 125900 + }, + { + "epoch": 1.9553376061081023, + "grad_norm": 1.9815373420715332, + "learning_rate": 4.804467791244433e-05, + "loss": 1.0817, + "step": 126000 + }, + { + "epoch": 1.9568894613510452, + "grad_norm": 2.76365065574646, + "learning_rate": 4.804312605720139e-05, + "loss": 1.0603, + "step": 126100 + }, + { + "epoch": 1.958441316593988, + "grad_norm": 1.856594204902649, + "learning_rate": 4.8041574201958445e-05, + "loss": 1.0891, + "step": 126200 + }, + { + "epoch": 1.959993171836931, + "grad_norm": 2.3030471801757812, + "learning_rate": 4.80400223467155e-05, + "loss": 1.0517, + "step": 126300 + }, + { + "epoch": 1.961545027079874, + "grad_norm": 2.2358100414276123, + "learning_rate": 4.803847049147256e-05, + "loss": 1.0746, + "step": 126400 + }, + { + "epoch": 1.963096882322817, + "grad_norm": 2.43802547454834, + "learning_rate": 4.803691863622962e-05, + "loss": 1.0627, + "step": 126500 + }, + { + "epoch": 1.96464873756576, + "grad_norm": 2.9986813068389893, + "learning_rate": 4.8035366780986676e-05, + "loss": 1.0766, + "step": 126600 + }, + { + "epoch": 1.9662005928087027, + "grad_norm": 1.985355019569397, + "learning_rate": 4.8033814925743734e-05, + "loss": 1.051, + "step": 126700 + }, + { + "epoch": 1.9677524480516457, + "grad_norm": 2.38588809967041, + "learning_rate": 4.803226307050079e-05, + "loss": 1.0497, + "step": 126800 + }, + { + "epoch": 1.9693043032945887, + "grad_norm": 2.0955090522766113, + "learning_rate": 4.803071121525784e-05, + "loss": 1.0722, + "step": 126900 + }, + { + "epoch": 1.9708561585375315, + "grad_norm": 2.1312978267669678, + "learning_rate": 4.80291593600149e-05, + "loss": 1.0849, + "step": 127000 + }, + { + "epoch": 1.9724080137804747, + "grad_norm": 2.509342670440674, + "learning_rate": 4.802760750477196e-05, + "loss": 1.0661, + "step": 127100 + }, + { + "epoch": 1.9739598690234175, + "grad_norm": 2.378692626953125, + "learning_rate": 4.802605564952901e-05, + "loss": 1.084, + "step": 127200 + }, + { + "epoch": 1.9755117242663605, + "grad_norm": 2.361170530319214, + "learning_rate": 4.802450379428607e-05, + "loss": 1.0692, + "step": 127300 + }, + { + "epoch": 1.9770635795093034, + "grad_norm": 2.198575258255005, + "learning_rate": 4.8022951939043124e-05, + "loss": 1.0548, + "step": 127400 + }, + { + "epoch": 1.9786154347522462, + "grad_norm": 1.966523289680481, + "learning_rate": 4.802140008380018e-05, + "loss": 1.0658, + "step": 127500 + }, + { + "epoch": 1.9801672899951892, + "grad_norm": 2.093989610671997, + "learning_rate": 4.801984822855724e-05, + "loss": 1.0226, + "step": 127600 + }, + { + "epoch": 1.9817191452381322, + "grad_norm": 2.2246530055999756, + "learning_rate": 4.80182963733143e-05, + "loss": 1.0453, + "step": 127700 + }, + { + "epoch": 1.983271000481075, + "grad_norm": 2.239438533782959, + "learning_rate": 4.8016744518071355e-05, + "loss": 1.0663, + "step": 127800 + }, + { + "epoch": 1.9848228557240182, + "grad_norm": 2.4696121215820312, + "learning_rate": 4.801519266282841e-05, + "loss": 1.0621, + "step": 127900 + }, + { + "epoch": 1.986374710966961, + "grad_norm": 2.4301156997680664, + "learning_rate": 4.801364080758547e-05, + "loss": 1.0658, + "step": 128000 + }, + { + "epoch": 1.987926566209904, + "grad_norm": 2.231473445892334, + "learning_rate": 4.801208895234253e-05, + "loss": 1.0761, + "step": 128100 + }, + { + "epoch": 1.989478421452847, + "grad_norm": 2.036868095397949, + "learning_rate": 4.8010537097099586e-05, + "loss": 1.0813, + "step": 128200 + }, + { + "epoch": 1.9910302766957897, + "grad_norm": 2.6127278804779053, + "learning_rate": 4.8008985241856644e-05, + "loss": 1.0674, + "step": 128300 + }, + { + "epoch": 1.992582131938733, + "grad_norm": 2.0679049491882324, + "learning_rate": 4.8007433386613695e-05, + "loss": 1.0546, + "step": 128400 + }, + { + "epoch": 1.9941339871816757, + "grad_norm": 2.076267957687378, + "learning_rate": 4.800588153137075e-05, + "loss": 1.0843, + "step": 128500 + }, + { + "epoch": 1.9956858424246187, + "grad_norm": 2.563143253326416, + "learning_rate": 4.800432967612781e-05, + "loss": 1.0841, + "step": 128600 + }, + { + "epoch": 1.9972376976675617, + "grad_norm": 2.4184048175811768, + "learning_rate": 4.800277782088487e-05, + "loss": 1.0774, + "step": 128700 + }, + { + "epoch": 1.9987895529105044, + "grad_norm": 2.103375196456909, + "learning_rate": 4.8001225965641926e-05, + "loss": 1.0844, + "step": 128800 + }, + { + "epoch": 2.0003414081534476, + "grad_norm": 2.2401986122131348, + "learning_rate": 4.7999674110398984e-05, + "loss": 1.0518, + "step": 128900 + }, + { + "epoch": 2.0018932633963904, + "grad_norm": 2.137556314468384, + "learning_rate": 4.799812225515604e-05, + "loss": 1.0793, + "step": 129000 + }, + { + "epoch": 2.003445118639333, + "grad_norm": 2.2270452976226807, + "learning_rate": 4.79965703999131e-05, + "loss": 1.0871, + "step": 129100 + }, + { + "epoch": 2.0049969738822764, + "grad_norm": 2.251593589782715, + "learning_rate": 4.799501854467016e-05, + "loss": 1.0646, + "step": 129200 + }, + { + "epoch": 2.006548829125219, + "grad_norm": 2.5513393878936768, + "learning_rate": 4.7993466689427215e-05, + "loss": 1.0721, + "step": 129300 + }, + { + "epoch": 2.0081006843681624, + "grad_norm": 2.4913833141326904, + "learning_rate": 4.799191483418427e-05, + "loss": 1.0505, + "step": 129400 + }, + { + "epoch": 2.009652539611105, + "grad_norm": 2.5230865478515625, + "learning_rate": 4.799036297894133e-05, + "loss": 1.081, + "step": 129500 + }, + { + "epoch": 2.011204394854048, + "grad_norm": 2.1321935653686523, + "learning_rate": 4.798881112369839e-05, + "loss": 1.0728, + "step": 129600 + }, + { + "epoch": 2.012756250096991, + "grad_norm": 2.1135218143463135, + "learning_rate": 4.798725926845544e-05, + "loss": 1.0559, + "step": 129700 + }, + { + "epoch": 2.014308105339934, + "grad_norm": 2.455204725265503, + "learning_rate": 4.79857074132125e-05, + "loss": 1.0686, + "step": 129800 + }, + { + "epoch": 2.0158599605828766, + "grad_norm": 2.4912171363830566, + "learning_rate": 4.7984155557969555e-05, + "loss": 1.0658, + "step": 129900 + }, + { + "epoch": 2.01741181582582, + "grad_norm": 2.3110179901123047, + "learning_rate": 4.798260370272661e-05, + "loss": 1.0688, + "step": 130000 + }, + { + "epoch": 2.0189636710687626, + "grad_norm": 2.387244939804077, + "learning_rate": 4.798105184748367e-05, + "loss": 1.0802, + "step": 130100 + }, + { + "epoch": 2.020515526311706, + "grad_norm": 2.1166555881500244, + "learning_rate": 4.797949999224073e-05, + "loss": 1.0673, + "step": 130200 + }, + { + "epoch": 2.0220673815546486, + "grad_norm": 2.5111243724823, + "learning_rate": 4.7977948136997786e-05, + "loss": 1.0664, + "step": 130300 + }, + { + "epoch": 2.0236192367975914, + "grad_norm": 2.174515724182129, + "learning_rate": 4.7976396281754843e-05, + "loss": 1.0653, + "step": 130400 + }, + { + "epoch": 2.0251710920405346, + "grad_norm": 2.346059560775757, + "learning_rate": 4.7974844426511894e-05, + "loss": 1.0811, + "step": 130500 + }, + { + "epoch": 2.0267229472834773, + "grad_norm": 2.2295713424682617, + "learning_rate": 4.797329257126895e-05, + "loss": 1.058, + "step": 130600 + }, + { + "epoch": 2.02827480252642, + "grad_norm": 1.9734086990356445, + "learning_rate": 4.797174071602601e-05, + "loss": 1.0757, + "step": 130700 + }, + { + "epoch": 2.0298266577693633, + "grad_norm": 2.238551139831543, + "learning_rate": 4.797018886078307e-05, + "loss": 1.0426, + "step": 130800 + }, + { + "epoch": 2.031378513012306, + "grad_norm": 2.4086573123931885, + "learning_rate": 4.7968637005540125e-05, + "loss": 1.0721, + "step": 130900 + }, + { + "epoch": 2.0329303682552493, + "grad_norm": 2.3574037551879883, + "learning_rate": 4.796708515029718e-05, + "loss": 1.0734, + "step": 131000 + }, + { + "epoch": 2.034482223498192, + "grad_norm": 2.0901331901550293, + "learning_rate": 4.796553329505424e-05, + "loss": 1.069, + "step": 131100 + }, + { + "epoch": 2.036034078741135, + "grad_norm": 2.7794599533081055, + "learning_rate": 4.796398143981129e-05, + "loss": 1.0334, + "step": 131200 + }, + { + "epoch": 2.037585933984078, + "grad_norm": 2.2324187755584717, + "learning_rate": 4.796242958456835e-05, + "loss": 1.0765, + "step": 131300 + }, + { + "epoch": 2.039137789227021, + "grad_norm": 2.6240456104278564, + "learning_rate": 4.796087772932541e-05, + "loss": 1.06, + "step": 131400 + }, + { + "epoch": 2.040689644469964, + "grad_norm": 2.271939754486084, + "learning_rate": 4.7959325874082465e-05, + "loss": 1.0617, + "step": 131500 + }, + { + "epoch": 2.042241499712907, + "grad_norm": 2.1382699012756348, + "learning_rate": 4.795777401883952e-05, + "loss": 1.0835, + "step": 131600 + }, + { + "epoch": 2.0437933549558496, + "grad_norm": 2.61220645904541, + "learning_rate": 4.795622216359658e-05, + "loss": 1.0743, + "step": 131700 + }, + { + "epoch": 2.045345210198793, + "grad_norm": 2.5175349712371826, + "learning_rate": 4.795467030835364e-05, + "loss": 1.0408, + "step": 131800 + }, + { + "epoch": 2.0468970654417356, + "grad_norm": 1.8807514905929565, + "learning_rate": 4.7953118453110696e-05, + "loss": 1.0403, + "step": 131900 + }, + { + "epoch": 2.0484489206846783, + "grad_norm": 2.0968101024627686, + "learning_rate": 4.7951566597867754e-05, + "loss": 1.0435, + "step": 132000 + }, + { + "epoch": 2.0500007759276215, + "grad_norm": 2.2672133445739746, + "learning_rate": 4.795001474262481e-05, + "loss": 1.0406, + "step": 132100 + }, + { + "epoch": 2.0515526311705643, + "grad_norm": 2.366468906402588, + "learning_rate": 4.794846288738187e-05, + "loss": 1.0459, + "step": 132200 + }, + { + "epoch": 2.0531044864135075, + "grad_norm": 2.247690200805664, + "learning_rate": 4.794691103213893e-05, + "loss": 1.0539, + "step": 132300 + }, + { + "epoch": 2.0546563416564503, + "grad_norm": 1.8925704956054688, + "learning_rate": 4.7945359176895985e-05, + "loss": 1.0493, + "step": 132400 + }, + { + "epoch": 2.056208196899393, + "grad_norm": 1.930070161819458, + "learning_rate": 4.7943807321653036e-05, + "loss": 1.0664, + "step": 132500 + }, + { + "epoch": 2.0577600521423363, + "grad_norm": 1.8710217475891113, + "learning_rate": 4.7942255466410094e-05, + "loss": 1.0624, + "step": 132600 + }, + { + "epoch": 2.059311907385279, + "grad_norm": 2.365832805633545, + "learning_rate": 4.794070361116715e-05, + "loss": 1.056, + "step": 132700 + }, + { + "epoch": 2.0608637626282222, + "grad_norm": 2.0719170570373535, + "learning_rate": 4.793915175592421e-05, + "loss": 1.0608, + "step": 132800 + }, + { + "epoch": 2.062415617871165, + "grad_norm": 3.2959883213043213, + "learning_rate": 4.793759990068127e-05, + "loss": 1.0539, + "step": 132900 + }, + { + "epoch": 2.0639674731141078, + "grad_norm": 2.6179490089416504, + "learning_rate": 4.7936048045438325e-05, + "loss": 1.06, + "step": 133000 + }, + { + "epoch": 2.065519328357051, + "grad_norm": 1.979307770729065, + "learning_rate": 4.793449619019538e-05, + "loss": 1.0611, + "step": 133100 + }, + { + "epoch": 2.0670711835999938, + "grad_norm": 2.7358052730560303, + "learning_rate": 4.793294433495244e-05, + "loss": 1.0437, + "step": 133200 + }, + { + "epoch": 2.0686230388429365, + "grad_norm": 2.2443244457244873, + "learning_rate": 4.79313924797095e-05, + "loss": 1.0703, + "step": 133300 + }, + { + "epoch": 2.0701748940858797, + "grad_norm": 1.9455335140228271, + "learning_rate": 4.7929840624466556e-05, + "loss": 1.07, + "step": 133400 + }, + { + "epoch": 2.0717267493288225, + "grad_norm": 2.095489501953125, + "learning_rate": 4.7928288769223613e-05, + "loss": 1.0783, + "step": 133500 + }, + { + "epoch": 2.0732786045717657, + "grad_norm": 2.3454062938690186, + "learning_rate": 4.792673691398067e-05, + "loss": 1.0612, + "step": 133600 + }, + { + "epoch": 2.0748304598147085, + "grad_norm": 1.7862906455993652, + "learning_rate": 4.792518505873773e-05, + "loss": 1.0449, + "step": 133700 + }, + { + "epoch": 2.0763823150576513, + "grad_norm": 1.7941592931747437, + "learning_rate": 4.792363320349478e-05, + "loss": 1.083, + "step": 133800 + }, + { + "epoch": 2.0779341703005945, + "grad_norm": 2.3605029582977295, + "learning_rate": 4.792208134825184e-05, + "loss": 1.0819, + "step": 133900 + }, + { + "epoch": 2.0794860255435372, + "grad_norm": 3.062452793121338, + "learning_rate": 4.7920529493008895e-05, + "loss": 1.0524, + "step": 134000 + }, + { + "epoch": 2.0810378807864804, + "grad_norm": 2.369102954864502, + "learning_rate": 4.7918977637765946e-05, + "loss": 1.0747, + "step": 134100 + }, + { + "epoch": 2.082589736029423, + "grad_norm": 4.67025899887085, + "learning_rate": 4.7917425782523004e-05, + "loss": 1.0761, + "step": 134200 + }, + { + "epoch": 2.084141591272366, + "grad_norm": 2.3892180919647217, + "learning_rate": 4.791587392728006e-05, + "loss": 1.0776, + "step": 134300 + }, + { + "epoch": 2.085693446515309, + "grad_norm": 2.34405255317688, + "learning_rate": 4.791432207203712e-05, + "loss": 1.0763, + "step": 134400 + }, + { + "epoch": 2.087245301758252, + "grad_norm": 2.067636251449585, + "learning_rate": 4.791277021679418e-05, + "loss": 1.0402, + "step": 134500 + }, + { + "epoch": 2.0887971570011947, + "grad_norm": 2.8550288677215576, + "learning_rate": 4.7911218361551235e-05, + "loss": 1.0499, + "step": 134600 + }, + { + "epoch": 2.090349012244138, + "grad_norm": 2.4426872730255127, + "learning_rate": 4.790966650630829e-05, + "loss": 1.0637, + "step": 134700 + }, + { + "epoch": 2.0919008674870807, + "grad_norm": 2.5001494884490967, + "learning_rate": 4.790811465106535e-05, + "loss": 1.085, + "step": 134800 + }, + { + "epoch": 2.093452722730024, + "grad_norm": 2.5100629329681396, + "learning_rate": 4.790656279582241e-05, + "loss": 1.0563, + "step": 134900 + }, + { + "epoch": 2.0950045779729667, + "grad_norm": 2.1518192291259766, + "learning_rate": 4.7905010940579466e-05, + "loss": 1.0575, + "step": 135000 + }, + { + "epoch": 2.0965564332159095, + "grad_norm": 1.876930594444275, + "learning_rate": 4.7903459085336524e-05, + "loss": 1.0619, + "step": 135100 + }, + { + "epoch": 2.0981082884588527, + "grad_norm": 2.4290332794189453, + "learning_rate": 4.790190723009358e-05, + "loss": 1.0562, + "step": 135200 + }, + { + "epoch": 2.0996601437017954, + "grad_norm": 2.172323226928711, + "learning_rate": 4.790035537485064e-05, + "loss": 1.0765, + "step": 135300 + }, + { + "epoch": 2.1012119989447386, + "grad_norm": 2.4881210327148438, + "learning_rate": 4.789880351960769e-05, + "loss": 1.0433, + "step": 135400 + }, + { + "epoch": 2.1027638541876814, + "grad_norm": 2.0730140209198, + "learning_rate": 4.789725166436475e-05, + "loss": 1.0639, + "step": 135500 + }, + { + "epoch": 2.104315709430624, + "grad_norm": 2.4232640266418457, + "learning_rate": 4.7895699809121806e-05, + "loss": 1.0418, + "step": 135600 + }, + { + "epoch": 2.1058675646735674, + "grad_norm": 2.2779343128204346, + "learning_rate": 4.7894147953878864e-05, + "loss": 1.0515, + "step": 135700 + }, + { + "epoch": 2.10741941991651, + "grad_norm": 2.540282726287842, + "learning_rate": 4.789259609863592e-05, + "loss": 1.0798, + "step": 135800 + }, + { + "epoch": 2.108971275159453, + "grad_norm": 2.4380288124084473, + "learning_rate": 4.789104424339298e-05, + "loss": 1.0632, + "step": 135900 + }, + { + "epoch": 2.110523130402396, + "grad_norm": 2.7196033000946045, + "learning_rate": 4.788949238815004e-05, + "loss": 1.0826, + "step": 136000 + }, + { + "epoch": 2.112074985645339, + "grad_norm": 2.6508028507232666, + "learning_rate": 4.7887940532907095e-05, + "loss": 1.0594, + "step": 136100 + }, + { + "epoch": 2.113626840888282, + "grad_norm": 2.185214042663574, + "learning_rate": 4.788638867766415e-05, + "loss": 1.069, + "step": 136200 + }, + { + "epoch": 2.115178696131225, + "grad_norm": 2.311896562576294, + "learning_rate": 4.788483682242121e-05, + "loss": 1.0512, + "step": 136300 + }, + { + "epoch": 2.1167305513741677, + "grad_norm": 2.380798101425171, + "learning_rate": 4.788328496717827e-05, + "loss": 1.0558, + "step": 136400 + }, + { + "epoch": 2.118282406617111, + "grad_norm": 2.257411479949951, + "learning_rate": 4.7881733111935326e-05, + "loss": 1.0274, + "step": 136500 + }, + { + "epoch": 2.1198342618600536, + "grad_norm": 2.556018829345703, + "learning_rate": 4.7880181256692383e-05, + "loss": 1.0364, + "step": 136600 + }, + { + "epoch": 2.121386117102997, + "grad_norm": 2.053537368774414, + "learning_rate": 4.7878629401449434e-05, + "loss": 1.0501, + "step": 136700 + }, + { + "epoch": 2.1229379723459396, + "grad_norm": 2.075845718383789, + "learning_rate": 4.787707754620649e-05, + "loss": 1.0751, + "step": 136800 + }, + { + "epoch": 2.1244898275888824, + "grad_norm": 1.9716755151748657, + "learning_rate": 4.787552569096355e-05, + "loss": 1.0498, + "step": 136900 + }, + { + "epoch": 2.1260416828318256, + "grad_norm": 2.6828722953796387, + "learning_rate": 4.78739738357206e-05, + "loss": 1.0683, + "step": 137000 + }, + { + "epoch": 2.1275935380747684, + "grad_norm": 2.434020519256592, + "learning_rate": 4.787242198047766e-05, + "loss": 1.0791, + "step": 137100 + }, + { + "epoch": 2.129145393317711, + "grad_norm": 2.335705280303955, + "learning_rate": 4.7870870125234716e-05, + "loss": 1.0379, + "step": 137200 + }, + { + "epoch": 2.1306972485606543, + "grad_norm": 2.5148603916168213, + "learning_rate": 4.7869318269991774e-05, + "loss": 1.0629, + "step": 137300 + }, + { + "epoch": 2.132249103803597, + "grad_norm": 2.0944297313690186, + "learning_rate": 4.786776641474883e-05, + "loss": 1.0474, + "step": 137400 + }, + { + "epoch": 2.1338009590465403, + "grad_norm": 2.3225951194763184, + "learning_rate": 4.786621455950589e-05, + "loss": 1.0318, + "step": 137500 + }, + { + "epoch": 2.135352814289483, + "grad_norm": 2.3951828479766846, + "learning_rate": 4.786466270426295e-05, + "loss": 1.067, + "step": 137600 + }, + { + "epoch": 2.136904669532426, + "grad_norm": 2.2558112144470215, + "learning_rate": 4.7863110849020005e-05, + "loss": 1.0404, + "step": 137700 + }, + { + "epoch": 2.138456524775369, + "grad_norm": 2.2582268714904785, + "learning_rate": 4.786155899377706e-05, + "loss": 1.0716, + "step": 137800 + }, + { + "epoch": 2.140008380018312, + "grad_norm": 2.364276885986328, + "learning_rate": 4.786000713853412e-05, + "loss": 1.0602, + "step": 137900 + }, + { + "epoch": 2.141560235261255, + "grad_norm": 2.464024305343628, + "learning_rate": 4.785845528329118e-05, + "loss": 1.0571, + "step": 138000 + }, + { + "epoch": 2.143112090504198, + "grad_norm": 2.0748398303985596, + "learning_rate": 4.7856903428048236e-05, + "loss": 1.0695, + "step": 138100 + }, + { + "epoch": 2.1446639457471406, + "grad_norm": 1.9313088655471802, + "learning_rate": 4.785535157280529e-05, + "loss": 1.0603, + "step": 138200 + }, + { + "epoch": 2.146215800990084, + "grad_norm": 2.4812843799591064, + "learning_rate": 4.7853799717562345e-05, + "loss": 1.047, + "step": 138300 + }, + { + "epoch": 2.1477676562330266, + "grad_norm": 2.34519100189209, + "learning_rate": 4.78522478623194e-05, + "loss": 1.081, + "step": 138400 + }, + { + "epoch": 2.1493195114759693, + "grad_norm": 2.3789329528808594, + "learning_rate": 4.785069600707646e-05, + "loss": 1.0548, + "step": 138500 + }, + { + "epoch": 2.1508713667189125, + "grad_norm": 2.2253458499908447, + "learning_rate": 4.784914415183352e-05, + "loss": 1.0848, + "step": 138600 + }, + { + "epoch": 2.1524232219618553, + "grad_norm": 2.3522236347198486, + "learning_rate": 4.7847592296590576e-05, + "loss": 1.0632, + "step": 138700 + }, + { + "epoch": 2.1539750772047985, + "grad_norm": 2.1813430786132812, + "learning_rate": 4.7846040441347634e-05, + "loss": 1.0576, + "step": 138800 + }, + { + "epoch": 2.1555269324477413, + "grad_norm": 2.080888271331787, + "learning_rate": 4.784448858610469e-05, + "loss": 1.0775, + "step": 138900 + }, + { + "epoch": 2.157078787690684, + "grad_norm": 2.6952641010284424, + "learning_rate": 4.784293673086175e-05, + "loss": 1.053, + "step": 139000 + }, + { + "epoch": 2.1586306429336273, + "grad_norm": 2.1119320392608643, + "learning_rate": 4.784138487561881e-05, + "loss": 1.0453, + "step": 139100 + }, + { + "epoch": 2.16018249817657, + "grad_norm": 2.2789254188537598, + "learning_rate": 4.7839833020375865e-05, + "loss": 1.0487, + "step": 139200 + }, + { + "epoch": 2.1617343534195133, + "grad_norm": 2.819838523864746, + "learning_rate": 4.783828116513292e-05, + "loss": 1.0581, + "step": 139300 + }, + { + "epoch": 2.163286208662456, + "grad_norm": 2.0911495685577393, + "learning_rate": 4.783672930988998e-05, + "loss": 1.0586, + "step": 139400 + }, + { + "epoch": 2.164838063905399, + "grad_norm": 2.501739501953125, + "learning_rate": 4.783517745464703e-05, + "loss": 1.0686, + "step": 139500 + }, + { + "epoch": 2.166389919148342, + "grad_norm": 2.484483003616333, + "learning_rate": 4.783362559940409e-05, + "loss": 1.0483, + "step": 139600 + }, + { + "epoch": 2.1679417743912848, + "grad_norm": 1.9543863534927368, + "learning_rate": 4.783207374416115e-05, + "loss": 1.0493, + "step": 139700 + }, + { + "epoch": 2.1694936296342275, + "grad_norm": 2.141782283782959, + "learning_rate": 4.7830521888918204e-05, + "loss": 1.0655, + "step": 139800 + }, + { + "epoch": 2.1710454848771708, + "grad_norm": 2.1450002193450928, + "learning_rate": 4.782897003367526e-05, + "loss": 1.0814, + "step": 139900 + }, + { + "epoch": 2.1725973401201135, + "grad_norm": 2.242635726928711, + "learning_rate": 4.782741817843232e-05, + "loss": 1.05, + "step": 140000 + }, + { + "epoch": 2.1741491953630567, + "grad_norm": 2.4195568561553955, + "learning_rate": 4.782586632318938e-05, + "loss": 1.039, + "step": 140100 + }, + { + "epoch": 2.1757010506059995, + "grad_norm": 2.152660608291626, + "learning_rate": 4.7824314467946435e-05, + "loss": 1.046, + "step": 140200 + }, + { + "epoch": 2.1772529058489423, + "grad_norm": 2.4316248893737793, + "learning_rate": 4.7822762612703486e-05, + "loss": 1.057, + "step": 140300 + }, + { + "epoch": 2.1788047610918855, + "grad_norm": 2.2033448219299316, + "learning_rate": 4.7821210757460544e-05, + "loss": 1.0719, + "step": 140400 + }, + { + "epoch": 2.1803566163348282, + "grad_norm": 2.146704912185669, + "learning_rate": 4.78196589022176e-05, + "loss": 1.0602, + "step": 140500 + }, + { + "epoch": 2.1819084715777715, + "grad_norm": 3.163799524307251, + "learning_rate": 4.781810704697466e-05, + "loss": 1.0271, + "step": 140600 + }, + { + "epoch": 2.1834603268207142, + "grad_norm": 2.3232812881469727, + "learning_rate": 4.781655519173172e-05, + "loss": 1.026, + "step": 140700 + }, + { + "epoch": 2.185012182063657, + "grad_norm": 2.2287683486938477, + "learning_rate": 4.7815003336488775e-05, + "loss": 1.0637, + "step": 140800 + }, + { + "epoch": 2.1865640373066, + "grad_norm": 2.2433431148529053, + "learning_rate": 4.781345148124583e-05, + "loss": 1.028, + "step": 140900 + }, + { + "epoch": 2.188115892549543, + "grad_norm": 1.9653691053390503, + "learning_rate": 4.781189962600289e-05, + "loss": 1.0513, + "step": 141000 + }, + { + "epoch": 2.1896677477924857, + "grad_norm": 2.280937910079956, + "learning_rate": 4.781034777075994e-05, + "loss": 1.0474, + "step": 141100 + }, + { + "epoch": 2.191219603035429, + "grad_norm": 2.641557216644287, + "learning_rate": 4.7808795915517e-05, + "loss": 1.0453, + "step": 141200 + }, + { + "epoch": 2.1927714582783717, + "grad_norm": 2.0779197216033936, + "learning_rate": 4.780724406027406e-05, + "loss": 1.0412, + "step": 141300 + }, + { + "epoch": 2.194323313521315, + "grad_norm": 2.32072377204895, + "learning_rate": 4.7805692205031115e-05, + "loss": 1.0731, + "step": 141400 + }, + { + "epoch": 2.1958751687642577, + "grad_norm": 2.3527817726135254, + "learning_rate": 4.780414034978817e-05, + "loss": 1.0712, + "step": 141500 + }, + { + "epoch": 2.1974270240072005, + "grad_norm": 2.154773712158203, + "learning_rate": 4.780258849454523e-05, + "loss": 1.0474, + "step": 141600 + }, + { + "epoch": 2.1989788792501437, + "grad_norm": 2.3761403560638428, + "learning_rate": 4.780103663930229e-05, + "loss": 1.0418, + "step": 141700 + }, + { + "epoch": 2.2005307344930864, + "grad_norm": 2.551168918609619, + "learning_rate": 4.7799484784059346e-05, + "loss": 1.0588, + "step": 141800 + }, + { + "epoch": 2.202082589736029, + "grad_norm": 2.4025044441223145, + "learning_rate": 4.7797932928816404e-05, + "loss": 1.0558, + "step": 141900 + }, + { + "epoch": 2.2036344449789724, + "grad_norm": 2.3074281215667725, + "learning_rate": 4.779638107357346e-05, + "loss": 1.0601, + "step": 142000 + }, + { + "epoch": 2.205186300221915, + "grad_norm": 1.9330724477767944, + "learning_rate": 4.779482921833052e-05, + "loss": 1.0652, + "step": 142100 + }, + { + "epoch": 2.2067381554648584, + "grad_norm": 2.2744085788726807, + "learning_rate": 4.779327736308758e-05, + "loss": 1.0801, + "step": 142200 + }, + { + "epoch": 2.208290010707801, + "grad_norm": 2.1365199089050293, + "learning_rate": 4.7791725507844635e-05, + "loss": 1.062, + "step": 142300 + }, + { + "epoch": 2.209841865950744, + "grad_norm": 2.1086266040802, + "learning_rate": 4.7790173652601686e-05, + "loss": 1.0703, + "step": 142400 + }, + { + "epoch": 2.211393721193687, + "grad_norm": 2.3713245391845703, + "learning_rate": 4.778862179735874e-05, + "loss": 1.0495, + "step": 142500 + }, + { + "epoch": 2.21294557643663, + "grad_norm": 2.628063678741455, + "learning_rate": 4.77870699421158e-05, + "loss": 1.0703, + "step": 142600 + }, + { + "epoch": 2.214497431679573, + "grad_norm": 2.2222647666931152, + "learning_rate": 4.778551808687286e-05, + "loss": 1.0607, + "step": 142700 + }, + { + "epoch": 2.216049286922516, + "grad_norm": 2.295289993286133, + "learning_rate": 4.778396623162992e-05, + "loss": 1.061, + "step": 142800 + }, + { + "epoch": 2.2176011421654587, + "grad_norm": 2.6212644577026367, + "learning_rate": 4.7782414376386974e-05, + "loss": 1.0627, + "step": 142900 + }, + { + "epoch": 2.219152997408402, + "grad_norm": 2.4660825729370117, + "learning_rate": 4.778086252114403e-05, + "loss": 1.0458, + "step": 143000 + }, + { + "epoch": 2.2207048526513447, + "grad_norm": 2.1224730014801025, + "learning_rate": 4.777931066590109e-05, + "loss": 1.0422, + "step": 143100 + }, + { + "epoch": 2.2222567078942874, + "grad_norm": 2.405404567718506, + "learning_rate": 4.777775881065815e-05, + "loss": 1.0339, + "step": 143200 + }, + { + "epoch": 2.2238085631372306, + "grad_norm": 1.9070557355880737, + "learning_rate": 4.7776206955415205e-05, + "loss": 1.0727, + "step": 143300 + }, + { + "epoch": 2.2253604183801734, + "grad_norm": 2.354372501373291, + "learning_rate": 4.777465510017226e-05, + "loss": 1.0542, + "step": 143400 + }, + { + "epoch": 2.2269122736231166, + "grad_norm": 2.394808769226074, + "learning_rate": 4.7773103244929314e-05, + "loss": 1.0594, + "step": 143500 + }, + { + "epoch": 2.2284641288660594, + "grad_norm": 2.148226022720337, + "learning_rate": 4.777155138968637e-05, + "loss": 1.0533, + "step": 143600 + }, + { + "epoch": 2.230015984109002, + "grad_norm": 2.3424553871154785, + "learning_rate": 4.776999953444343e-05, + "loss": 1.0554, + "step": 143700 + }, + { + "epoch": 2.2315678393519454, + "grad_norm": 2.2306673526763916, + "learning_rate": 4.776844767920049e-05, + "loss": 1.0297, + "step": 143800 + }, + { + "epoch": 2.233119694594888, + "grad_norm": 2.1870059967041016, + "learning_rate": 4.776689582395754e-05, + "loss": 1.0707, + "step": 143900 + }, + { + "epoch": 2.234671549837831, + "grad_norm": 1.9275952577590942, + "learning_rate": 4.7765343968714596e-05, + "loss": 1.0456, + "step": 144000 + }, + { + "epoch": 2.236223405080774, + "grad_norm": 2.018854856491089, + "learning_rate": 4.7763792113471654e-05, + "loss": 1.0543, + "step": 144100 + }, + { + "epoch": 2.237775260323717, + "grad_norm": 2.496302843093872, + "learning_rate": 4.776224025822871e-05, + "loss": 1.0546, + "step": 144200 + }, + { + "epoch": 2.23932711556666, + "grad_norm": 2.695812702178955, + "learning_rate": 4.776068840298577e-05, + "loss": 1.0696, + "step": 144300 + }, + { + "epoch": 2.240878970809603, + "grad_norm": 2.6274967193603516, + "learning_rate": 4.775913654774283e-05, + "loss": 1.068, + "step": 144400 + }, + { + "epoch": 2.2424308260525456, + "grad_norm": 2.355994701385498, + "learning_rate": 4.7757584692499885e-05, + "loss": 1.0713, + "step": 144500 + }, + { + "epoch": 2.243982681295489, + "grad_norm": 2.223212957382202, + "learning_rate": 4.775603283725694e-05, + "loss": 1.0568, + "step": 144600 + }, + { + "epoch": 2.2455345365384316, + "grad_norm": 1.9176734685897827, + "learning_rate": 4.7754480982014e-05, + "loss": 1.0566, + "step": 144700 + }, + { + "epoch": 2.247086391781375, + "grad_norm": 2.3282992839813232, + "learning_rate": 4.775292912677106e-05, + "loss": 1.0693, + "step": 144800 + }, + { + "epoch": 2.2486382470243176, + "grad_norm": 2.0825443267822266, + "learning_rate": 4.7751377271528116e-05, + "loss": 1.0731, + "step": 144900 + }, + { + "epoch": 2.2501901022672604, + "grad_norm": 2.376631736755371, + "learning_rate": 4.7749825416285174e-05, + "loss": 1.0633, + "step": 145000 + }, + { + "epoch": 2.2517419575102036, + "grad_norm": 2.3078227043151855, + "learning_rate": 4.774827356104223e-05, + "loss": 1.0818, + "step": 145100 + }, + { + "epoch": 2.2532938127531463, + "grad_norm": 2.1128923892974854, + "learning_rate": 4.774672170579928e-05, + "loss": 1.044, + "step": 145200 + }, + { + "epoch": 2.254845667996089, + "grad_norm": 1.8535137176513672, + "learning_rate": 4.774516985055634e-05, + "loss": 1.0272, + "step": 145300 + }, + { + "epoch": 2.2563975232390323, + "grad_norm": 2.605168342590332, + "learning_rate": 4.77436179953134e-05, + "loss": 1.0724, + "step": 145400 + }, + { + "epoch": 2.257949378481975, + "grad_norm": 2.319013833999634, + "learning_rate": 4.7742066140070456e-05, + "loss": 1.0656, + "step": 145500 + }, + { + "epoch": 2.2595012337249183, + "grad_norm": 2.407966375350952, + "learning_rate": 4.774051428482751e-05, + "loss": 1.0547, + "step": 145600 + }, + { + "epoch": 2.261053088967861, + "grad_norm": 2.496340751647949, + "learning_rate": 4.773896242958457e-05, + "loss": 1.0663, + "step": 145700 + }, + { + "epoch": 2.262604944210804, + "grad_norm": 2.252469301223755, + "learning_rate": 4.773741057434163e-05, + "loss": 1.061, + "step": 145800 + }, + { + "epoch": 2.264156799453747, + "grad_norm": 2.1885874271392822, + "learning_rate": 4.773585871909869e-05, + "loss": 1.0475, + "step": 145900 + }, + { + "epoch": 2.26570865469669, + "grad_norm": 2.185015916824341, + "learning_rate": 4.7734306863855744e-05, + "loss": 1.0393, + "step": 146000 + }, + { + "epoch": 2.267260509939633, + "grad_norm": 2.1869454383850098, + "learning_rate": 4.77327550086128e-05, + "loss": 1.0415, + "step": 146100 + }, + { + "epoch": 2.268812365182576, + "grad_norm": 2.1967248916625977, + "learning_rate": 4.773120315336986e-05, + "loss": 1.0583, + "step": 146200 + }, + { + "epoch": 2.2703642204255186, + "grad_norm": 2.3129613399505615, + "learning_rate": 4.772965129812692e-05, + "loss": 1.053, + "step": 146300 + }, + { + "epoch": 2.2719160756684618, + "grad_norm": 2.094412088394165, + "learning_rate": 4.7728099442883975e-05, + "loss": 1.0623, + "step": 146400 + }, + { + "epoch": 2.2734679309114045, + "grad_norm": 2.1780972480773926, + "learning_rate": 4.7726547587641026e-05, + "loss": 1.0633, + "step": 146500 + }, + { + "epoch": 2.2750197861543473, + "grad_norm": 2.5608599185943604, + "learning_rate": 4.7724995732398084e-05, + "loss": 1.045, + "step": 146600 + }, + { + "epoch": 2.2765716413972905, + "grad_norm": 2.292449712753296, + "learning_rate": 4.772344387715514e-05, + "loss": 1.0416, + "step": 146700 + }, + { + "epoch": 2.2781234966402333, + "grad_norm": 2.345844268798828, + "learning_rate": 4.772189202191219e-05, + "loss": 1.0562, + "step": 146800 + }, + { + "epoch": 2.2796753518831765, + "grad_norm": 2.333442449569702, + "learning_rate": 4.772034016666925e-05, + "loss": 1.0507, + "step": 146900 + }, + { + "epoch": 2.2812272071261193, + "grad_norm": 2.647368907928467, + "learning_rate": 4.771878831142631e-05, + "loss": 1.0524, + "step": 147000 + }, + { + "epoch": 2.282779062369062, + "grad_norm": 2.471221923828125, + "learning_rate": 4.7717236456183366e-05, + "loss": 1.0581, + "step": 147100 + }, + { + "epoch": 2.2843309176120052, + "grad_norm": 2.3910906314849854, + "learning_rate": 4.7715684600940424e-05, + "loss": 1.0873, + "step": 147200 + }, + { + "epoch": 2.285882772854948, + "grad_norm": 2.331425666809082, + "learning_rate": 4.771413274569748e-05, + "loss": 1.0553, + "step": 147300 + }, + { + "epoch": 2.287434628097891, + "grad_norm": 2.3420097827911377, + "learning_rate": 4.771258089045454e-05, + "loss": 1.0707, + "step": 147400 + }, + { + "epoch": 2.288986483340834, + "grad_norm": 1.8440392017364502, + "learning_rate": 4.77110290352116e-05, + "loss": 1.0706, + "step": 147500 + }, + { + "epoch": 2.2905383385837768, + "grad_norm": 2.0832135677337646, + "learning_rate": 4.7709477179968655e-05, + "loss": 1.0724, + "step": 147600 + }, + { + "epoch": 2.29209019382672, + "grad_norm": 1.790666937828064, + "learning_rate": 4.770792532472571e-05, + "loss": 1.04, + "step": 147700 + }, + { + "epoch": 2.2936420490696627, + "grad_norm": 2.523416757583618, + "learning_rate": 4.770637346948277e-05, + "loss": 1.035, + "step": 147800 + }, + { + "epoch": 2.2951939043126055, + "grad_norm": 2.341848373413086, + "learning_rate": 4.770482161423983e-05, + "loss": 1.0577, + "step": 147900 + }, + { + "epoch": 2.2967457595555487, + "grad_norm": 2.3899857997894287, + "learning_rate": 4.770326975899688e-05, + "loss": 1.0477, + "step": 148000 + }, + { + "epoch": 2.2982976147984915, + "grad_norm": 2.122985363006592, + "learning_rate": 4.770171790375394e-05, + "loss": 1.0548, + "step": 148100 + }, + { + "epoch": 2.2998494700414347, + "grad_norm": 1.9313324689865112, + "learning_rate": 4.7700166048510995e-05, + "loss": 1.0584, + "step": 148200 + }, + { + "epoch": 2.3014013252843775, + "grad_norm": 2.4943015575408936, + "learning_rate": 4.769861419326805e-05, + "loss": 1.0351, + "step": 148300 + }, + { + "epoch": 2.3029531805273202, + "grad_norm": 2.653111457824707, + "learning_rate": 4.769706233802511e-05, + "loss": 1.0483, + "step": 148400 + }, + { + "epoch": 2.3045050357702634, + "grad_norm": 2.2838118076324463, + "learning_rate": 4.769551048278217e-05, + "loss": 1.0485, + "step": 148500 + }, + { + "epoch": 2.306056891013206, + "grad_norm": 2.3461153507232666, + "learning_rate": 4.7693958627539226e-05, + "loss": 1.0508, + "step": 148600 + }, + { + "epoch": 2.3076087462561494, + "grad_norm": 2.42330002784729, + "learning_rate": 4.769240677229628e-05, + "loss": 1.0796, + "step": 148700 + }, + { + "epoch": 2.309160601499092, + "grad_norm": 2.3097708225250244, + "learning_rate": 4.769085491705334e-05, + "loss": 1.0496, + "step": 148800 + }, + { + "epoch": 2.310712456742035, + "grad_norm": 2.0397136211395264, + "learning_rate": 4.76893030618104e-05, + "loss": 1.0509, + "step": 148900 + }, + { + "epoch": 2.312264311984978, + "grad_norm": 2.363084316253662, + "learning_rate": 4.768775120656746e-05, + "loss": 1.0342, + "step": 149000 + }, + { + "epoch": 2.313816167227921, + "grad_norm": 2.190420389175415, + "learning_rate": 4.7686199351324514e-05, + "loss": 1.0668, + "step": 149100 + }, + { + "epoch": 2.3153680224708637, + "grad_norm": 2.466182231903076, + "learning_rate": 4.768464749608157e-05, + "loss": 1.0614, + "step": 149200 + }, + { + "epoch": 2.316919877713807, + "grad_norm": 2.198457956314087, + "learning_rate": 4.768309564083862e-05, + "loss": 1.0568, + "step": 149300 + }, + { + "epoch": 2.3184717329567497, + "grad_norm": 2.161055088043213, + "learning_rate": 4.768154378559568e-05, + "loss": 1.0534, + "step": 149400 + }, + { + "epoch": 2.320023588199693, + "grad_norm": 2.147515296936035, + "learning_rate": 4.767999193035274e-05, + "loss": 1.0281, + "step": 149500 + }, + { + "epoch": 2.3215754434426357, + "grad_norm": 2.2802913188934326, + "learning_rate": 4.7678440075109796e-05, + "loss": 1.048, + "step": 149600 + }, + { + "epoch": 2.3231272986855784, + "grad_norm": 2.000180721282959, + "learning_rate": 4.7676888219866854e-05, + "loss": 1.043, + "step": 149700 + }, + { + "epoch": 2.3246791539285216, + "grad_norm": 2.777062177658081, + "learning_rate": 4.767533636462391e-05, + "loss": 1.0575, + "step": 149800 + }, + { + "epoch": 2.3262310091714644, + "grad_norm": 2.241762399673462, + "learning_rate": 4.767378450938097e-05, + "loss": 1.0437, + "step": 149900 + }, + { + "epoch": 2.3277828644144076, + "grad_norm": 2.5319266319274902, + "learning_rate": 4.767223265413802e-05, + "loss": 1.0556, + "step": 150000 + }, + { + "epoch": 2.3293347196573504, + "grad_norm": 2.237144708633423, + "learning_rate": 4.767068079889508e-05, + "loss": 1.0476, + "step": 150100 + }, + { + "epoch": 2.330886574900293, + "grad_norm": 2.430987596511841, + "learning_rate": 4.7669128943652136e-05, + "loss": 1.0488, + "step": 150200 + }, + { + "epoch": 2.3324384301432364, + "grad_norm": 2.464299440383911, + "learning_rate": 4.7667577088409194e-05, + "loss": 1.0383, + "step": 150300 + }, + { + "epoch": 2.333990285386179, + "grad_norm": 2.4423747062683105, + "learning_rate": 4.766602523316625e-05, + "loss": 1.0516, + "step": 150400 + }, + { + "epoch": 2.335542140629122, + "grad_norm": 2.6412713527679443, + "learning_rate": 4.766447337792331e-05, + "loss": 1.079, + "step": 150500 + }, + { + "epoch": 2.337093995872065, + "grad_norm": 2.2306251525878906, + "learning_rate": 4.766292152268037e-05, + "loss": 1.0465, + "step": 150600 + }, + { + "epoch": 2.338645851115008, + "grad_norm": 2.1720101833343506, + "learning_rate": 4.7661369667437425e-05, + "loss": 1.0469, + "step": 150700 + }, + { + "epoch": 2.340197706357951, + "grad_norm": 2.2147459983825684, + "learning_rate": 4.765981781219448e-05, + "loss": 1.0604, + "step": 150800 + }, + { + "epoch": 2.341749561600894, + "grad_norm": 2.310157299041748, + "learning_rate": 4.7658265956951534e-05, + "loss": 1.0717, + "step": 150900 + }, + { + "epoch": 2.3433014168438366, + "grad_norm": 2.3886160850524902, + "learning_rate": 4.765671410170859e-05, + "loss": 1.0552, + "step": 151000 + }, + { + "epoch": 2.34485327208678, + "grad_norm": 2.4885671138763428, + "learning_rate": 4.765516224646565e-05, + "loss": 1.0506, + "step": 151100 + }, + { + "epoch": 2.3464051273297226, + "grad_norm": 2.5518290996551514, + "learning_rate": 4.765361039122271e-05, + "loss": 1.0614, + "step": 151200 + }, + { + "epoch": 2.347956982572666, + "grad_norm": 1.9272632598876953, + "learning_rate": 4.7652058535979765e-05, + "loss": 1.0401, + "step": 151300 + }, + { + "epoch": 2.3495088378156086, + "grad_norm": 2.372792959213257, + "learning_rate": 4.765050668073682e-05, + "loss": 1.0477, + "step": 151400 + }, + { + "epoch": 2.3510606930585514, + "grad_norm": 1.8132848739624023, + "learning_rate": 4.764895482549388e-05, + "loss": 1.0298, + "step": 151500 + }, + { + "epoch": 2.3526125483014946, + "grad_norm": 3.2817459106445312, + "learning_rate": 4.764740297025094e-05, + "loss": 1.0396, + "step": 151600 + }, + { + "epoch": 2.3541644035444373, + "grad_norm": 2.5971763134002686, + "learning_rate": 4.7645851115007996e-05, + "loss": 1.0474, + "step": 151700 + }, + { + "epoch": 2.35571625878738, + "grad_norm": 1.8911468982696533, + "learning_rate": 4.764429925976505e-05, + "loss": 1.0416, + "step": 151800 + }, + { + "epoch": 2.3572681140303233, + "grad_norm": 1.9852266311645508, + "learning_rate": 4.764274740452211e-05, + "loss": 1.0501, + "step": 151900 + }, + { + "epoch": 2.358819969273266, + "grad_norm": 2.1220972537994385, + "learning_rate": 4.764119554927917e-05, + "loss": 1.0519, + "step": 152000 + }, + { + "epoch": 2.3603718245162093, + "grad_norm": 1.8463906049728394, + "learning_rate": 4.763964369403623e-05, + "loss": 1.0459, + "step": 152100 + }, + { + "epoch": 2.361923679759152, + "grad_norm": 2.333005905151367, + "learning_rate": 4.763809183879328e-05, + "loss": 1.0372, + "step": 152200 + }, + { + "epoch": 2.363475535002095, + "grad_norm": 2.468670606613159, + "learning_rate": 4.7636539983550335e-05, + "loss": 1.0335, + "step": 152300 + }, + { + "epoch": 2.365027390245038, + "grad_norm": 2.165095329284668, + "learning_rate": 4.763498812830739e-05, + "loss": 1.0566, + "step": 152400 + }, + { + "epoch": 2.366579245487981, + "grad_norm": 2.281447410583496, + "learning_rate": 4.763343627306445e-05, + "loss": 1.0557, + "step": 152500 + }, + { + "epoch": 2.368131100730924, + "grad_norm": 2.539764642715454, + "learning_rate": 4.763188441782151e-05, + "loss": 1.0475, + "step": 152600 + }, + { + "epoch": 2.369682955973867, + "grad_norm": 2.510406017303467, + "learning_rate": 4.7630332562578566e-05, + "loss": 1.0271, + "step": 152700 + }, + { + "epoch": 2.3712348112168096, + "grad_norm": 2.220874309539795, + "learning_rate": 4.7628780707335624e-05, + "loss": 1.0429, + "step": 152800 + }, + { + "epoch": 2.372786666459753, + "grad_norm": 2.0885510444641113, + "learning_rate": 4.762722885209268e-05, + "loss": 1.0473, + "step": 152900 + }, + { + "epoch": 2.3743385217026955, + "grad_norm": 2.1182026863098145, + "learning_rate": 4.762567699684974e-05, + "loss": 1.036, + "step": 153000 + }, + { + "epoch": 2.3758903769456383, + "grad_norm": 2.2428579330444336, + "learning_rate": 4.76241251416068e-05, + "loss": 1.0657, + "step": 153100 + }, + { + "epoch": 2.3774422321885815, + "grad_norm": 2.295353412628174, + "learning_rate": 4.7622573286363855e-05, + "loss": 1.0885, + "step": 153200 + }, + { + "epoch": 2.3789940874315243, + "grad_norm": 2.3112423419952393, + "learning_rate": 4.7621021431120906e-05, + "loss": 1.0391, + "step": 153300 + }, + { + "epoch": 2.3805459426744675, + "grad_norm": 2.634587049484253, + "learning_rate": 4.7619469575877964e-05, + "loss": 1.0579, + "step": 153400 + }, + { + "epoch": 2.3820977979174103, + "grad_norm": 2.181894540786743, + "learning_rate": 4.761791772063502e-05, + "loss": 1.0518, + "step": 153500 + }, + { + "epoch": 2.383649653160353, + "grad_norm": 2.0486185550689697, + "learning_rate": 4.761636586539208e-05, + "loss": 1.0723, + "step": 153600 + }, + { + "epoch": 2.3852015084032963, + "grad_norm": 2.4686059951782227, + "learning_rate": 4.761481401014913e-05, + "loss": 1.0413, + "step": 153700 + }, + { + "epoch": 2.386753363646239, + "grad_norm": 2.0938847064971924, + "learning_rate": 4.761326215490619e-05, + "loss": 1.0697, + "step": 153800 + }, + { + "epoch": 2.3883052188891822, + "grad_norm": 2.0420076847076416, + "learning_rate": 4.7611710299663246e-05, + "loss": 1.0519, + "step": 153900 + }, + { + "epoch": 2.389857074132125, + "grad_norm": 2.1430273056030273, + "learning_rate": 4.7610158444420304e-05, + "loss": 1.0558, + "step": 154000 + }, + { + "epoch": 2.3914089293750678, + "grad_norm": 2.037324905395508, + "learning_rate": 4.760860658917736e-05, + "loss": 1.0506, + "step": 154100 + }, + { + "epoch": 2.392960784618011, + "grad_norm": 2.1305556297302246, + "learning_rate": 4.760705473393442e-05, + "loss": 1.0267, + "step": 154200 + }, + { + "epoch": 2.3945126398609538, + "grad_norm": 2.3603744506835938, + "learning_rate": 4.760550287869148e-05, + "loss": 1.033, + "step": 154300 + }, + { + "epoch": 2.3960644951038965, + "grad_norm": 2.3444020748138428, + "learning_rate": 4.7603951023448535e-05, + "loss": 1.0543, + "step": 154400 + }, + { + "epoch": 2.3976163503468397, + "grad_norm": 2.0857348442077637, + "learning_rate": 4.760239916820559e-05, + "loss": 1.0576, + "step": 154500 + }, + { + "epoch": 2.3991682055897825, + "grad_norm": 2.5950815677642822, + "learning_rate": 4.760084731296265e-05, + "loss": 1.0375, + "step": 154600 + }, + { + "epoch": 2.4007200608327257, + "grad_norm": 1.9735974073410034, + "learning_rate": 4.759929545771971e-05, + "loss": 1.0626, + "step": 154700 + }, + { + "epoch": 2.4022719160756685, + "grad_norm": 1.823829174041748, + "learning_rate": 4.7597743602476766e-05, + "loss": 1.0564, + "step": 154800 + }, + { + "epoch": 2.4038237713186112, + "grad_norm": 2.4093971252441406, + "learning_rate": 4.759619174723382e-05, + "loss": 1.0457, + "step": 154900 + }, + { + "epoch": 2.4053756265615545, + "grad_norm": 2.218122720718384, + "learning_rate": 4.7594639891990874e-05, + "loss": 1.0778, + "step": 155000 + }, + { + "epoch": 2.4069274818044972, + "grad_norm": 2.118809461593628, + "learning_rate": 4.759308803674793e-05, + "loss": 1.055, + "step": 155100 + }, + { + "epoch": 2.4084793370474404, + "grad_norm": 2.1717329025268555, + "learning_rate": 4.759153618150499e-05, + "loss": 1.0446, + "step": 155200 + }, + { + "epoch": 2.410031192290383, + "grad_norm": 2.3796184062957764, + "learning_rate": 4.758998432626205e-05, + "loss": 1.0475, + "step": 155300 + }, + { + "epoch": 2.411583047533326, + "grad_norm": 2.2108540534973145, + "learning_rate": 4.7588432471019105e-05, + "loss": 1.0752, + "step": 155400 + }, + { + "epoch": 2.413134902776269, + "grad_norm": 2.346567153930664, + "learning_rate": 4.758688061577616e-05, + "loss": 1.0578, + "step": 155500 + }, + { + "epoch": 2.414686758019212, + "grad_norm": 2.4406721591949463, + "learning_rate": 4.758532876053322e-05, + "loss": 1.0492, + "step": 155600 + }, + { + "epoch": 2.4162386132621547, + "grad_norm": 2.3978302478790283, + "learning_rate": 4.758377690529028e-05, + "loss": 1.041, + "step": 155700 + }, + { + "epoch": 2.417790468505098, + "grad_norm": 2.165689468383789, + "learning_rate": 4.7582225050047336e-05, + "loss": 1.0386, + "step": 155800 + }, + { + "epoch": 2.4193423237480407, + "grad_norm": 2.235287666320801, + "learning_rate": 4.7580673194804394e-05, + "loss": 1.0632, + "step": 155900 + }, + { + "epoch": 2.420894178990984, + "grad_norm": 2.384352207183838, + "learning_rate": 4.757912133956145e-05, + "loss": 1.0468, + "step": 156000 + }, + { + "epoch": 2.4224460342339267, + "grad_norm": 1.9341685771942139, + "learning_rate": 4.757756948431851e-05, + "loss": 1.0413, + "step": 156100 + }, + { + "epoch": 2.4239978894768694, + "grad_norm": 2.594360113143921, + "learning_rate": 4.757601762907557e-05, + "loss": 1.0726, + "step": 156200 + }, + { + "epoch": 2.4255497447198127, + "grad_norm": 2.6018567085266113, + "learning_rate": 4.757446577383262e-05, + "loss": 1.0286, + "step": 156300 + }, + { + "epoch": 2.4271015999627554, + "grad_norm": 2.0682055950164795, + "learning_rate": 4.7572913918589676e-05, + "loss": 1.0471, + "step": 156400 + }, + { + "epoch": 2.4286534552056986, + "grad_norm": 2.3083670139312744, + "learning_rate": 4.7571362063346734e-05, + "loss": 1.0546, + "step": 156500 + }, + { + "epoch": 2.4302053104486414, + "grad_norm": 2.548862934112549, + "learning_rate": 4.7569810208103785e-05, + "loss": 1.0537, + "step": 156600 + }, + { + "epoch": 2.431757165691584, + "grad_norm": 2.381843090057373, + "learning_rate": 4.756825835286084e-05, + "loss": 1.0489, + "step": 156700 + }, + { + "epoch": 2.4333090209345274, + "grad_norm": 2.127727508544922, + "learning_rate": 4.75667064976179e-05, + "loss": 1.0771, + "step": 156800 + }, + { + "epoch": 2.43486087617747, + "grad_norm": 2.370854139328003, + "learning_rate": 4.756515464237496e-05, + "loss": 1.048, + "step": 156900 + }, + { + "epoch": 2.436412731420413, + "grad_norm": 2.1895341873168945, + "learning_rate": 4.7563602787132016e-05, + "loss": 1.0281, + "step": 157000 + }, + { + "epoch": 2.437964586663356, + "grad_norm": 1.860305666923523, + "learning_rate": 4.7562050931889074e-05, + "loss": 1.0371, + "step": 157100 + }, + { + "epoch": 2.439516441906299, + "grad_norm": 2.2114458084106445, + "learning_rate": 4.756049907664613e-05, + "loss": 1.0743, + "step": 157200 + }, + { + "epoch": 2.4410682971492417, + "grad_norm": 2.7622485160827637, + "learning_rate": 4.755894722140319e-05, + "loss": 1.0435, + "step": 157300 + }, + { + "epoch": 2.442620152392185, + "grad_norm": 2.377544403076172, + "learning_rate": 4.755739536616025e-05, + "loss": 1.0459, + "step": 157400 + }, + { + "epoch": 2.4441720076351277, + "grad_norm": 2.1643855571746826, + "learning_rate": 4.7555843510917305e-05, + "loss": 1.0751, + "step": 157500 + }, + { + "epoch": 2.445723862878071, + "grad_norm": 2.5202760696411133, + "learning_rate": 4.755429165567436e-05, + "loss": 1.0466, + "step": 157600 + }, + { + "epoch": 2.4472757181210136, + "grad_norm": 2.090636730194092, + "learning_rate": 4.755273980043142e-05, + "loss": 1.0601, + "step": 157700 + }, + { + "epoch": 2.448827573363957, + "grad_norm": 2.1124720573425293, + "learning_rate": 4.755118794518848e-05, + "loss": 1.0396, + "step": 157800 + }, + { + "epoch": 2.4503794286068996, + "grad_norm": 2.507014274597168, + "learning_rate": 4.754963608994553e-05, + "loss": 1.0599, + "step": 157900 + }, + { + "epoch": 2.4519312838498424, + "grad_norm": 2.0493221282958984, + "learning_rate": 4.7548084234702587e-05, + "loss": 1.0426, + "step": 158000 + }, + { + "epoch": 2.4534831390927856, + "grad_norm": 2.2023959159851074, + "learning_rate": 4.7546532379459644e-05, + "loss": 1.0645, + "step": 158100 + }, + { + "epoch": 2.4550349943357284, + "grad_norm": 2.1573829650878906, + "learning_rate": 4.75449805242167e-05, + "loss": 1.0651, + "step": 158200 + }, + { + "epoch": 2.456586849578671, + "grad_norm": 2.179300308227539, + "learning_rate": 4.754342866897376e-05, + "loss": 1.0585, + "step": 158300 + }, + { + "epoch": 2.4581387048216143, + "grad_norm": 2.278463363647461, + "learning_rate": 4.754187681373082e-05, + "loss": 1.0429, + "step": 158400 + }, + { + "epoch": 2.459690560064557, + "grad_norm": 1.7499622106552124, + "learning_rate": 4.7540324958487875e-05, + "loss": 1.0415, + "step": 158500 + }, + { + "epoch": 2.4612424153075, + "grad_norm": 2.3482213020324707, + "learning_rate": 4.753877310324493e-05, + "loss": 1.0441, + "step": 158600 + }, + { + "epoch": 2.462794270550443, + "grad_norm": 2.2545645236968994, + "learning_rate": 4.753722124800199e-05, + "loss": 1.0574, + "step": 158700 + }, + { + "epoch": 2.464346125793386, + "grad_norm": 2.5652778148651123, + "learning_rate": 4.753566939275905e-05, + "loss": 1.0747, + "step": 158800 + }, + { + "epoch": 2.465897981036329, + "grad_norm": 2.000574827194214, + "learning_rate": 4.7534117537516106e-05, + "loss": 1.0419, + "step": 158900 + }, + { + "epoch": 2.467449836279272, + "grad_norm": 2.224050998687744, + "learning_rate": 4.7532565682273164e-05, + "loss": 1.0384, + "step": 159000 + }, + { + "epoch": 2.469001691522215, + "grad_norm": 2.237063407897949, + "learning_rate": 4.753101382703022e-05, + "loss": 1.0713, + "step": 159100 + }, + { + "epoch": 2.470553546765158, + "grad_norm": 2.1370975971221924, + "learning_rate": 4.752946197178727e-05, + "loss": 1.0346, + "step": 159200 + }, + { + "epoch": 2.4721054020081006, + "grad_norm": 2.3766438961029053, + "learning_rate": 4.752791011654433e-05, + "loss": 1.0487, + "step": 159300 + }, + { + "epoch": 2.473657257251044, + "grad_norm": 2.182241678237915, + "learning_rate": 4.752635826130139e-05, + "loss": 1.0299, + "step": 159400 + }, + { + "epoch": 2.4752091124939866, + "grad_norm": 2.0750985145568848, + "learning_rate": 4.7524806406058446e-05, + "loss": 1.0341, + "step": 159500 + }, + { + "epoch": 2.4767609677369293, + "grad_norm": 2.2225234508514404, + "learning_rate": 4.7523254550815504e-05, + "loss": 1.0465, + "step": 159600 + }, + { + "epoch": 2.4783128229798725, + "grad_norm": 2.247349500656128, + "learning_rate": 4.752170269557256e-05, + "loss": 1.0326, + "step": 159700 + }, + { + "epoch": 2.4798646782228153, + "grad_norm": 2.508463144302368, + "learning_rate": 4.752015084032961e-05, + "loss": 1.072, + "step": 159800 + }, + { + "epoch": 2.481416533465758, + "grad_norm": 2.166019916534424, + "learning_rate": 4.751859898508667e-05, + "loss": 1.042, + "step": 159900 + }, + { + "epoch": 2.4829683887087013, + "grad_norm": 2.0317604541778564, + "learning_rate": 4.751704712984373e-05, + "loss": 1.0407, + "step": 160000 + }, + { + "epoch": 2.484520243951644, + "grad_norm": 2.2698965072631836, + "learning_rate": 4.7515495274600786e-05, + "loss": 1.0577, + "step": 160100 + }, + { + "epoch": 2.4860720991945873, + "grad_norm": 2.5224087238311768, + "learning_rate": 4.7513943419357844e-05, + "loss": 1.0455, + "step": 160200 + }, + { + "epoch": 2.48762395443753, + "grad_norm": 2.213451385498047, + "learning_rate": 4.75123915641149e-05, + "loss": 1.0589, + "step": 160300 + }, + { + "epoch": 2.4891758096804733, + "grad_norm": 2.409013509750366, + "learning_rate": 4.751083970887196e-05, + "loss": 1.0606, + "step": 160400 + }, + { + "epoch": 2.490727664923416, + "grad_norm": 2.1013107299804688, + "learning_rate": 4.750928785362902e-05, + "loss": 1.0599, + "step": 160500 + }, + { + "epoch": 2.492279520166359, + "grad_norm": 2.3335561752319336, + "learning_rate": 4.7507735998386075e-05, + "loss": 1.0314, + "step": 160600 + }, + { + "epoch": 2.493831375409302, + "grad_norm": 2.0536320209503174, + "learning_rate": 4.7506184143143126e-05, + "loss": 1.0572, + "step": 160700 + }, + { + "epoch": 2.4953832306522448, + "grad_norm": 2.5244200229644775, + "learning_rate": 4.750463228790018e-05, + "loss": 1.05, + "step": 160800 + }, + { + "epoch": 2.4969350858951875, + "grad_norm": 1.8753924369812012, + "learning_rate": 4.750308043265724e-05, + "loss": 1.0463, + "step": 160900 + }, + { + "epoch": 2.4984869411381307, + "grad_norm": 2.6177942752838135, + "learning_rate": 4.75015285774143e-05, + "loss": 1.0485, + "step": 161000 + }, + { + "epoch": 2.5000387963810735, + "grad_norm": 2.049344301223755, + "learning_rate": 4.7499976722171357e-05, + "loss": 1.058, + "step": 161100 + }, + { + "epoch": 2.5015906516240163, + "grad_norm": 2.446894407272339, + "learning_rate": 4.7498424866928414e-05, + "loss": 1.0801, + "step": 161200 + }, + { + "epoch": 2.5031425068669595, + "grad_norm": 2.094968557357788, + "learning_rate": 4.749687301168547e-05, + "loss": 1.045, + "step": 161300 + }, + { + "epoch": 2.5046943621099023, + "grad_norm": 2.4712958335876465, + "learning_rate": 4.749532115644253e-05, + "loss": 1.0631, + "step": 161400 + }, + { + "epoch": 2.5062462173528455, + "grad_norm": 1.9819856882095337, + "learning_rate": 4.749376930119959e-05, + "loss": 1.0383, + "step": 161500 + }, + { + "epoch": 2.5077980725957882, + "grad_norm": 2.4259374141693115, + "learning_rate": 4.7492217445956645e-05, + "loss": 1.0396, + "step": 161600 + }, + { + "epoch": 2.5093499278387315, + "grad_norm": 2.312303304672241, + "learning_rate": 4.74906655907137e-05, + "loss": 1.064, + "step": 161700 + }, + { + "epoch": 2.510901783081674, + "grad_norm": 2.351372718811035, + "learning_rate": 4.748911373547076e-05, + "loss": 1.0341, + "step": 161800 + }, + { + "epoch": 2.512453638324617, + "grad_norm": 2.074159860610962, + "learning_rate": 4.748756188022782e-05, + "loss": 1.0159, + "step": 161900 + }, + { + "epoch": 2.51400549356756, + "grad_norm": 2.592299461364746, + "learning_rate": 4.748601002498487e-05, + "loss": 1.0596, + "step": 162000 + }, + { + "epoch": 2.515557348810503, + "grad_norm": 2.0001986026763916, + "learning_rate": 4.748445816974193e-05, + "loss": 1.0548, + "step": 162100 + }, + { + "epoch": 2.5171092040534457, + "grad_norm": 1.8096919059753418, + "learning_rate": 4.7482906314498985e-05, + "loss": 1.0709, + "step": 162200 + }, + { + "epoch": 2.518661059296389, + "grad_norm": 1.8577805757522583, + "learning_rate": 4.748135445925604e-05, + "loss": 1.0426, + "step": 162300 + }, + { + "epoch": 2.5202129145393317, + "grad_norm": 2.1133625507354736, + "learning_rate": 4.74798026040131e-05, + "loss": 1.0508, + "step": 162400 + }, + { + "epoch": 2.5217647697822745, + "grad_norm": 2.109609842300415, + "learning_rate": 4.747825074877016e-05, + "loss": 1.0408, + "step": 162500 + }, + { + "epoch": 2.5233166250252177, + "grad_norm": 2.1927883625030518, + "learning_rate": 4.7476698893527216e-05, + "loss": 1.0313, + "step": 162600 + }, + { + "epoch": 2.5248684802681605, + "grad_norm": 2.262117385864258, + "learning_rate": 4.7475147038284274e-05, + "loss": 1.043, + "step": 162700 + }, + { + "epoch": 2.5264203355111037, + "grad_norm": 2.3188188076019287, + "learning_rate": 4.747359518304133e-05, + "loss": 1.039, + "step": 162800 + }, + { + "epoch": 2.5279721907540464, + "grad_norm": 2.0471930503845215, + "learning_rate": 4.747204332779839e-05, + "loss": 1.0363, + "step": 162900 + }, + { + "epoch": 2.5295240459969897, + "grad_norm": 1.9992185831069946, + "learning_rate": 4.747049147255545e-05, + "loss": 1.0326, + "step": 163000 + }, + { + "epoch": 2.5310759012399324, + "grad_norm": 2.333908796310425, + "learning_rate": 4.74689396173125e-05, + "loss": 1.0544, + "step": 163100 + }, + { + "epoch": 2.532627756482875, + "grad_norm": 1.9311968088150024, + "learning_rate": 4.7467387762069556e-05, + "loss": 1.0583, + "step": 163200 + }, + { + "epoch": 2.5341796117258184, + "grad_norm": 2.388669490814209, + "learning_rate": 4.7465835906826614e-05, + "loss": 1.0747, + "step": 163300 + }, + { + "epoch": 2.535731466968761, + "grad_norm": 2.2378740310668945, + "learning_rate": 4.746428405158367e-05, + "loss": 1.0475, + "step": 163400 + }, + { + "epoch": 2.537283322211704, + "grad_norm": 2.5829851627349854, + "learning_rate": 4.746273219634072e-05, + "loss": 1.0354, + "step": 163500 + }, + { + "epoch": 2.538835177454647, + "grad_norm": 2.3610262870788574, + "learning_rate": 4.746118034109778e-05, + "loss": 1.0336, + "step": 163600 + }, + { + "epoch": 2.54038703269759, + "grad_norm": 2.523416757583618, + "learning_rate": 4.745962848585484e-05, + "loss": 1.0621, + "step": 163700 + }, + { + "epoch": 2.5419388879405327, + "grad_norm": 2.299258232116699, + "learning_rate": 4.7458076630611896e-05, + "loss": 1.0601, + "step": 163800 + }, + { + "epoch": 2.543490743183476, + "grad_norm": 2.02618408203125, + "learning_rate": 4.745652477536895e-05, + "loss": 1.0595, + "step": 163900 + }, + { + "epoch": 2.5450425984264187, + "grad_norm": 2.161907196044922, + "learning_rate": 4.745497292012601e-05, + "loss": 1.0603, + "step": 164000 + }, + { + "epoch": 2.546594453669362, + "grad_norm": 2.0180680751800537, + "learning_rate": 4.745342106488307e-05, + "loss": 1.0441, + "step": 164100 + }, + { + "epoch": 2.5481463089123046, + "grad_norm": 1.9091253280639648, + "learning_rate": 4.7451869209640127e-05, + "loss": 1.0356, + "step": 164200 + }, + { + "epoch": 2.549698164155248, + "grad_norm": 2.026906967163086, + "learning_rate": 4.7450317354397184e-05, + "loss": 1.04, + "step": 164300 + }, + { + "epoch": 2.5512500193981906, + "grad_norm": 2.476252555847168, + "learning_rate": 4.744876549915424e-05, + "loss": 1.0475, + "step": 164400 + }, + { + "epoch": 2.5528018746411334, + "grad_norm": 2.367262125015259, + "learning_rate": 4.74472136439113e-05, + "loss": 1.0256, + "step": 164500 + }, + { + "epoch": 2.5543537298840766, + "grad_norm": 2.2345073223114014, + "learning_rate": 4.744566178866836e-05, + "loss": 1.0441, + "step": 164600 + }, + { + "epoch": 2.5559055851270194, + "grad_norm": 2.453378438949585, + "learning_rate": 4.7444109933425415e-05, + "loss": 1.0463, + "step": 164700 + }, + { + "epoch": 2.557457440369962, + "grad_norm": 2.4341559410095215, + "learning_rate": 4.7442558078182466e-05, + "loss": 1.0524, + "step": 164800 + }, + { + "epoch": 2.5590092956129054, + "grad_norm": 2.466679573059082, + "learning_rate": 4.7441006222939524e-05, + "loss": 1.0519, + "step": 164900 + }, + { + "epoch": 2.560561150855848, + "grad_norm": 2.243474006652832, + "learning_rate": 4.743945436769658e-05, + "loss": 1.0588, + "step": 165000 + }, + { + "epoch": 2.562113006098791, + "grad_norm": 2.1142807006835938, + "learning_rate": 4.743790251245364e-05, + "loss": 1.0335, + "step": 165100 + }, + { + "epoch": 2.563664861341734, + "grad_norm": 1.6868747472763062, + "learning_rate": 4.74363506572107e-05, + "loss": 1.0388, + "step": 165200 + }, + { + "epoch": 2.565216716584677, + "grad_norm": 2.5993452072143555, + "learning_rate": 4.7434798801967755e-05, + "loss": 1.0413, + "step": 165300 + }, + { + "epoch": 2.56676857182762, + "grad_norm": 2.2801432609558105, + "learning_rate": 4.743324694672481e-05, + "loss": 1.0637, + "step": 165400 + }, + { + "epoch": 2.568320427070563, + "grad_norm": 2.533191442489624, + "learning_rate": 4.743169509148187e-05, + "loss": 1.0147, + "step": 165500 + }, + { + "epoch": 2.569872282313506, + "grad_norm": 2.1488561630249023, + "learning_rate": 4.743014323623893e-05, + "loss": 1.0315, + "step": 165600 + }, + { + "epoch": 2.571424137556449, + "grad_norm": 1.9489185810089111, + "learning_rate": 4.7428591380995986e-05, + "loss": 1.0396, + "step": 165700 + }, + { + "epoch": 2.5729759927993916, + "grad_norm": 2.29132080078125, + "learning_rate": 4.7427039525753044e-05, + "loss": 1.0544, + "step": 165800 + }, + { + "epoch": 2.574527848042335, + "grad_norm": 2.374694585800171, + "learning_rate": 4.74254876705101e-05, + "loss": 1.0409, + "step": 165900 + }, + { + "epoch": 2.5760797032852776, + "grad_norm": 1.9349309206008911, + "learning_rate": 4.742393581526716e-05, + "loss": 1.0522, + "step": 166000 + }, + { + "epoch": 2.5776315585282203, + "grad_norm": 2.0343072414398193, + "learning_rate": 4.742238396002421e-05, + "loss": 1.0376, + "step": 166100 + }, + { + "epoch": 2.5791834137711636, + "grad_norm": 2.2302796840667725, + "learning_rate": 4.742083210478127e-05, + "loss": 1.0438, + "step": 166200 + }, + { + "epoch": 2.5807352690141063, + "grad_norm": 2.174546241760254, + "learning_rate": 4.7419280249538326e-05, + "loss": 1.0515, + "step": 166300 + }, + { + "epoch": 2.582287124257049, + "grad_norm": 2.080028533935547, + "learning_rate": 4.741772839429538e-05, + "loss": 1.0383, + "step": 166400 + }, + { + "epoch": 2.5838389794999923, + "grad_norm": 1.830137014389038, + "learning_rate": 4.7416176539052435e-05, + "loss": 1.052, + "step": 166500 + }, + { + "epoch": 2.585390834742935, + "grad_norm": 1.9905749559402466, + "learning_rate": 4.741462468380949e-05, + "loss": 1.0238, + "step": 166600 + }, + { + "epoch": 2.586942689985878, + "grad_norm": 2.3765504360198975, + "learning_rate": 4.741307282856655e-05, + "loss": 1.0491, + "step": 166700 + }, + { + "epoch": 2.588494545228821, + "grad_norm": 2.989565134048462, + "learning_rate": 4.741152097332361e-05, + "loss": 1.034, + "step": 166800 + }, + { + "epoch": 2.5900464004717643, + "grad_norm": 2.3565328121185303, + "learning_rate": 4.7409969118080666e-05, + "loss": 1.0505, + "step": 166900 + }, + { + "epoch": 2.591598255714707, + "grad_norm": 2.297006368637085, + "learning_rate": 4.740841726283772e-05, + "loss": 1.0521, + "step": 167000 + }, + { + "epoch": 2.59315011095765, + "grad_norm": 3.0977842807769775, + "learning_rate": 4.740686540759478e-05, + "loss": 1.0333, + "step": 167100 + }, + { + "epoch": 2.594701966200593, + "grad_norm": 2.3634941577911377, + "learning_rate": 4.740531355235184e-05, + "loss": 1.0339, + "step": 167200 + }, + { + "epoch": 2.596253821443536, + "grad_norm": 2.169315814971924, + "learning_rate": 4.7403761697108897e-05, + "loss": 1.0569, + "step": 167300 + }, + { + "epoch": 2.5978056766864785, + "grad_norm": 2.4711475372314453, + "learning_rate": 4.7402209841865954e-05, + "loss": 1.0451, + "step": 167400 + }, + { + "epoch": 2.5993575319294218, + "grad_norm": 2.0106093883514404, + "learning_rate": 4.740065798662301e-05, + "loss": 1.0715, + "step": 167500 + }, + { + "epoch": 2.6009093871723645, + "grad_norm": 2.1006646156311035, + "learning_rate": 4.739910613138007e-05, + "loss": 1.0461, + "step": 167600 + }, + { + "epoch": 2.6024612424153073, + "grad_norm": 2.278226137161255, + "learning_rate": 4.739755427613712e-05, + "loss": 1.0325, + "step": 167700 + }, + { + "epoch": 2.6040130976582505, + "grad_norm": 2.365501880645752, + "learning_rate": 4.739600242089418e-05, + "loss": 1.0643, + "step": 167800 + }, + { + "epoch": 2.6055649529011933, + "grad_norm": 2.0916943550109863, + "learning_rate": 4.7394450565651236e-05, + "loss": 1.0432, + "step": 167900 + }, + { + "epoch": 2.607116808144136, + "grad_norm": 2.1060738563537598, + "learning_rate": 4.7392898710408294e-05, + "loss": 1.0385, + "step": 168000 + }, + { + "epoch": 2.6086686633870793, + "grad_norm": 2.485832691192627, + "learning_rate": 4.739134685516535e-05, + "loss": 1.0429, + "step": 168100 + }, + { + "epoch": 2.6102205186300225, + "grad_norm": 2.1706671714782715, + "learning_rate": 4.738979499992241e-05, + "loss": 1.0489, + "step": 168200 + }, + { + "epoch": 2.6117723738729652, + "grad_norm": 2.2199583053588867, + "learning_rate": 4.738824314467947e-05, + "loss": 1.0531, + "step": 168300 + }, + { + "epoch": 2.613324229115908, + "grad_norm": 2.1484732627868652, + "learning_rate": 4.7386691289436525e-05, + "loss": 1.0306, + "step": 168400 + }, + { + "epoch": 2.614876084358851, + "grad_norm": 2.219813823699951, + "learning_rate": 4.738513943419358e-05, + "loss": 1.0625, + "step": 168500 + }, + { + "epoch": 2.616427939601794, + "grad_norm": 2.367612838745117, + "learning_rate": 4.738358757895064e-05, + "loss": 1.0371, + "step": 168600 + }, + { + "epoch": 2.6179797948447368, + "grad_norm": 2.052858591079712, + "learning_rate": 4.73820357237077e-05, + "loss": 1.0409, + "step": 168700 + }, + { + "epoch": 2.61953165008768, + "grad_norm": 2.7634003162384033, + "learning_rate": 4.7380483868464756e-05, + "loss": 1.0595, + "step": 168800 + }, + { + "epoch": 2.6210835053306227, + "grad_norm": 2.206904172897339, + "learning_rate": 4.7378932013221814e-05, + "loss": 1.0602, + "step": 168900 + }, + { + "epoch": 2.6226353605735655, + "grad_norm": 2.2186241149902344, + "learning_rate": 4.7377380157978865e-05, + "loss": 1.0545, + "step": 169000 + }, + { + "epoch": 2.6241872158165087, + "grad_norm": 2.3548686504364014, + "learning_rate": 4.737582830273592e-05, + "loss": 1.034, + "step": 169100 + }, + { + "epoch": 2.6257390710594515, + "grad_norm": 2.368831157684326, + "learning_rate": 4.737427644749298e-05, + "loss": 1.0459, + "step": 169200 + }, + { + "epoch": 2.6272909263023942, + "grad_norm": 2.3844170570373535, + "learning_rate": 4.737272459225004e-05, + "loss": 1.0716, + "step": 169300 + }, + { + "epoch": 2.6288427815453375, + "grad_norm": 2.102041482925415, + "learning_rate": 4.7371172737007096e-05, + "loss": 1.0371, + "step": 169400 + }, + { + "epoch": 2.6303946367882802, + "grad_norm": 2.22416090965271, + "learning_rate": 4.7369620881764154e-05, + "loss": 1.0717, + "step": 169500 + }, + { + "epoch": 2.6319464920312234, + "grad_norm": 2.3140690326690674, + "learning_rate": 4.7368069026521205e-05, + "loss": 1.0494, + "step": 169600 + }, + { + "epoch": 2.633498347274166, + "grad_norm": 2.264110803604126, + "learning_rate": 4.736651717127826e-05, + "loss": 1.0289, + "step": 169700 + }, + { + "epoch": 2.6350502025171094, + "grad_norm": 2.4174273014068604, + "learning_rate": 4.736496531603532e-05, + "loss": 1.0373, + "step": 169800 + }, + { + "epoch": 2.636602057760052, + "grad_norm": 2.5184743404388428, + "learning_rate": 4.736341346079238e-05, + "loss": 1.037, + "step": 169900 + }, + { + "epoch": 2.638153913002995, + "grad_norm": 2.5399415493011475, + "learning_rate": 4.7361861605549436e-05, + "loss": 1.0347, + "step": 170000 + }, + { + "epoch": 2.639705768245938, + "grad_norm": 2.6229746341705322, + "learning_rate": 4.736030975030649e-05, + "loss": 1.045, + "step": 170100 + }, + { + "epoch": 2.641257623488881, + "grad_norm": 2.6230993270874023, + "learning_rate": 4.735875789506355e-05, + "loss": 1.047, + "step": 170200 + }, + { + "epoch": 2.6428094787318237, + "grad_norm": 2.6295454502105713, + "learning_rate": 4.735720603982061e-05, + "loss": 1.0492, + "step": 170300 + }, + { + "epoch": 2.644361333974767, + "grad_norm": 2.217151165008545, + "learning_rate": 4.7355654184577667e-05, + "loss": 1.0374, + "step": 170400 + }, + { + "epoch": 2.6459131892177097, + "grad_norm": 2.2779507637023926, + "learning_rate": 4.735410232933472e-05, + "loss": 1.0543, + "step": 170500 + }, + { + "epoch": 2.6474650444606525, + "grad_norm": 2.2059710025787354, + "learning_rate": 4.7352550474091775e-05, + "loss": 1.0694, + "step": 170600 + }, + { + "epoch": 2.6490168997035957, + "grad_norm": 2.124436378479004, + "learning_rate": 4.735099861884883e-05, + "loss": 1.034, + "step": 170700 + }, + { + "epoch": 2.6505687549465384, + "grad_norm": 2.118906259536743, + "learning_rate": 4.734944676360589e-05, + "loss": 1.0401, + "step": 170800 + }, + { + "epoch": 2.6521206101894816, + "grad_norm": 1.9342942237854004, + "learning_rate": 4.734789490836295e-05, + "loss": 1.0378, + "step": 170900 + }, + { + "epoch": 2.6536724654324244, + "grad_norm": 2.6422057151794434, + "learning_rate": 4.7346343053120006e-05, + "loss": 1.0462, + "step": 171000 + }, + { + "epoch": 2.6552243206753676, + "grad_norm": 2.120544195175171, + "learning_rate": 4.7344791197877064e-05, + "loss": 1.0325, + "step": 171100 + }, + { + "epoch": 2.6567761759183104, + "grad_norm": 2.004960298538208, + "learning_rate": 4.734323934263412e-05, + "loss": 1.0694, + "step": 171200 + }, + { + "epoch": 2.658328031161253, + "grad_norm": 2.4182827472686768, + "learning_rate": 4.734168748739118e-05, + "loss": 1.0352, + "step": 171300 + }, + { + "epoch": 2.6598798864041964, + "grad_norm": 2.2767324447631836, + "learning_rate": 4.734013563214824e-05, + "loss": 1.0336, + "step": 171400 + }, + { + "epoch": 2.661431741647139, + "grad_norm": 2.1682827472686768, + "learning_rate": 4.7338583776905295e-05, + "loss": 1.0514, + "step": 171500 + }, + { + "epoch": 2.662983596890082, + "grad_norm": 2.4701051712036133, + "learning_rate": 4.733703192166235e-05, + "loss": 1.0517, + "step": 171600 + }, + { + "epoch": 2.664535452133025, + "grad_norm": 2.312225103378296, + "learning_rate": 4.733548006641941e-05, + "loss": 1.0084, + "step": 171700 + }, + { + "epoch": 2.666087307375968, + "grad_norm": 2.305318832397461, + "learning_rate": 4.733392821117646e-05, + "loss": 1.0604, + "step": 171800 + }, + { + "epoch": 2.6676391626189107, + "grad_norm": 2.0694854259490967, + "learning_rate": 4.733237635593352e-05, + "loss": 1.0294, + "step": 171900 + }, + { + "epoch": 2.669191017861854, + "grad_norm": 2.2506048679351807, + "learning_rate": 4.733082450069058e-05, + "loss": 1.0588, + "step": 172000 + }, + { + "epoch": 2.6707428731047966, + "grad_norm": 2.3620123863220215, + "learning_rate": 4.7329272645447635e-05, + "loss": 1.0617, + "step": 172100 + }, + { + "epoch": 2.67229472834774, + "grad_norm": 2.18591570854187, + "learning_rate": 4.732772079020469e-05, + "loss": 1.0646, + "step": 172200 + }, + { + "epoch": 2.6738465835906826, + "grad_norm": 2.3200643062591553, + "learning_rate": 4.732616893496175e-05, + "loss": 1.0587, + "step": 172300 + }, + { + "epoch": 2.675398438833626, + "grad_norm": 1.9267265796661377, + "learning_rate": 4.732461707971881e-05, + "loss": 1.0508, + "step": 172400 + }, + { + "epoch": 2.6769502940765686, + "grad_norm": 2.3770933151245117, + "learning_rate": 4.7323065224475866e-05, + "loss": 1.0508, + "step": 172500 + }, + { + "epoch": 2.6785021493195114, + "grad_norm": 2.2253923416137695, + "learning_rate": 4.7321513369232924e-05, + "loss": 1.0602, + "step": 172600 + }, + { + "epoch": 2.6800540045624546, + "grad_norm": 2.6522648334503174, + "learning_rate": 4.731996151398998e-05, + "loss": 1.0377, + "step": 172700 + }, + { + "epoch": 2.6816058598053973, + "grad_norm": 3.115670919418335, + "learning_rate": 4.731840965874703e-05, + "loss": 1.0315, + "step": 172800 + }, + { + "epoch": 2.68315771504834, + "grad_norm": 2.218207597732544, + "learning_rate": 4.731685780350409e-05, + "loss": 1.0584, + "step": 172900 + }, + { + "epoch": 2.6847095702912833, + "grad_norm": 2.2877306938171387, + "learning_rate": 4.731530594826115e-05, + "loss": 1.0474, + "step": 173000 + }, + { + "epoch": 2.686261425534226, + "grad_norm": 2.023756504058838, + "learning_rate": 4.7313754093018206e-05, + "loss": 1.0345, + "step": 173100 + }, + { + "epoch": 2.687813280777169, + "grad_norm": 2.463364362716675, + "learning_rate": 4.731220223777526e-05, + "loss": 1.0645, + "step": 173200 + }, + { + "epoch": 2.689365136020112, + "grad_norm": 2.195937156677246, + "learning_rate": 4.731065038253232e-05, + "loss": 1.0433, + "step": 173300 + }, + { + "epoch": 2.690916991263055, + "grad_norm": 2.1433589458465576, + "learning_rate": 4.730909852728937e-05, + "loss": 1.0198, + "step": 173400 + }, + { + "epoch": 2.692468846505998, + "grad_norm": 2.3581550121307373, + "learning_rate": 4.730754667204643e-05, + "loss": 1.0508, + "step": 173500 + }, + { + "epoch": 2.694020701748941, + "grad_norm": 2.3441619873046875, + "learning_rate": 4.730599481680349e-05, + "loss": 1.0425, + "step": 173600 + }, + { + "epoch": 2.695572556991884, + "grad_norm": 2.695829153060913, + "learning_rate": 4.7304442961560545e-05, + "loss": 1.0162, + "step": 173700 + }, + { + "epoch": 2.697124412234827, + "grad_norm": 2.4131481647491455, + "learning_rate": 4.73028911063176e-05, + "loss": 1.04, + "step": 173800 + }, + { + "epoch": 2.6986762674777696, + "grad_norm": 2.350926160812378, + "learning_rate": 4.730133925107466e-05, + "loss": 1.0613, + "step": 173900 + }, + { + "epoch": 2.7002281227207128, + "grad_norm": 2.3652498722076416, + "learning_rate": 4.729978739583172e-05, + "loss": 1.0539, + "step": 174000 + }, + { + "epoch": 2.7017799779636555, + "grad_norm": 2.51181697845459, + "learning_rate": 4.7298235540588776e-05, + "loss": 1.0507, + "step": 174100 + }, + { + "epoch": 2.7033318332065983, + "grad_norm": 2.526365041732788, + "learning_rate": 4.7296683685345834e-05, + "loss": 1.0507, + "step": 174200 + }, + { + "epoch": 2.7048836884495415, + "grad_norm": 1.77544367313385, + "learning_rate": 4.729513183010289e-05, + "loss": 1.0542, + "step": 174300 + }, + { + "epoch": 2.7064355436924843, + "grad_norm": 2.2926077842712402, + "learning_rate": 4.729357997485995e-05, + "loss": 1.0477, + "step": 174400 + }, + { + "epoch": 2.707987398935427, + "grad_norm": 2.35168719291687, + "learning_rate": 4.729202811961701e-05, + "loss": 1.056, + "step": 174500 + }, + { + "epoch": 2.7095392541783703, + "grad_norm": 1.8608139753341675, + "learning_rate": 4.7290476264374065e-05, + "loss": 1.0387, + "step": 174600 + }, + { + "epoch": 2.711091109421313, + "grad_norm": 2.592602252960205, + "learning_rate": 4.7288924409131116e-05, + "loss": 1.0377, + "step": 174700 + }, + { + "epoch": 2.7126429646642563, + "grad_norm": 2.336280345916748, + "learning_rate": 4.7287372553888174e-05, + "loss": 1.0474, + "step": 174800 + }, + { + "epoch": 2.714194819907199, + "grad_norm": 1.7845664024353027, + "learning_rate": 4.728582069864523e-05, + "loss": 1.0581, + "step": 174900 + }, + { + "epoch": 2.7157466751501422, + "grad_norm": 1.9279907941818237, + "learning_rate": 4.728426884340229e-05, + "loss": 1.0308, + "step": 175000 + }, + { + "epoch": 2.717298530393085, + "grad_norm": 2.0942225456237793, + "learning_rate": 4.728271698815935e-05, + "loss": 1.0505, + "step": 175100 + }, + { + "epoch": 2.7188503856360278, + "grad_norm": 2.2798030376434326, + "learning_rate": 4.7281165132916405e-05, + "loss": 1.056, + "step": 175200 + }, + { + "epoch": 2.720402240878971, + "grad_norm": 2.1082921028137207, + "learning_rate": 4.727961327767346e-05, + "loss": 1.0372, + "step": 175300 + }, + { + "epoch": 2.7219540961219137, + "grad_norm": 2.104358434677124, + "learning_rate": 4.727806142243052e-05, + "loss": 1.0314, + "step": 175400 + }, + { + "epoch": 2.7235059513648565, + "grad_norm": 2.410003900527954, + "learning_rate": 4.727650956718758e-05, + "loss": 1.0316, + "step": 175500 + }, + { + "epoch": 2.7250578066077997, + "grad_norm": 2.8375027179718018, + "learning_rate": 4.7274957711944636e-05, + "loss": 1.0782, + "step": 175600 + }, + { + "epoch": 2.7266096618507425, + "grad_norm": 2.205787420272827, + "learning_rate": 4.7273405856701694e-05, + "loss": 1.0152, + "step": 175700 + }, + { + "epoch": 2.7281615170936853, + "grad_norm": 2.4399054050445557, + "learning_rate": 4.727185400145875e-05, + "loss": 1.0432, + "step": 175800 + }, + { + "epoch": 2.7297133723366285, + "grad_norm": 2.211919069290161, + "learning_rate": 4.727030214621581e-05, + "loss": 1.0566, + "step": 175900 + }, + { + "epoch": 2.7312652275795712, + "grad_norm": 2.1625940799713135, + "learning_rate": 4.726875029097286e-05, + "loss": 1.0448, + "step": 176000 + }, + { + "epoch": 2.7328170828225145, + "grad_norm": 2.0947794914245605, + "learning_rate": 4.726719843572992e-05, + "loss": 1.0334, + "step": 176100 + }, + { + "epoch": 2.734368938065457, + "grad_norm": 2.0673089027404785, + "learning_rate": 4.726564658048697e-05, + "loss": 1.0504, + "step": 176200 + }, + { + "epoch": 2.7359207933084004, + "grad_norm": 1.955876350402832, + "learning_rate": 4.7264094725244027e-05, + "loss": 1.0269, + "step": 176300 + }, + { + "epoch": 2.737472648551343, + "grad_norm": 2.182792901992798, + "learning_rate": 4.7262542870001084e-05, + "loss": 1.0259, + "step": 176400 + }, + { + "epoch": 2.739024503794286, + "grad_norm": 2.2036147117614746, + "learning_rate": 4.726099101475814e-05, + "loss": 1.0469, + "step": 176500 + }, + { + "epoch": 2.740576359037229, + "grad_norm": 2.2865328788757324, + "learning_rate": 4.72594391595152e-05, + "loss": 1.0268, + "step": 176600 + }, + { + "epoch": 2.742128214280172, + "grad_norm": 2.1760189533233643, + "learning_rate": 4.725788730427226e-05, + "loss": 1.0346, + "step": 176700 + }, + { + "epoch": 2.7436800695231147, + "grad_norm": 2.034281015396118, + "learning_rate": 4.7256335449029315e-05, + "loss": 1.0414, + "step": 176800 + }, + { + "epoch": 2.745231924766058, + "grad_norm": 2.272381544113159, + "learning_rate": 4.725478359378637e-05, + "loss": 1.0493, + "step": 176900 + }, + { + "epoch": 2.7467837800090007, + "grad_norm": 2.412069082260132, + "learning_rate": 4.725323173854343e-05, + "loss": 1.0474, + "step": 177000 + }, + { + "epoch": 2.7483356352519435, + "grad_norm": 2.2948646545410156, + "learning_rate": 4.725167988330049e-05, + "loss": 1.0282, + "step": 177100 + }, + { + "epoch": 2.7498874904948867, + "grad_norm": 1.870371699333191, + "learning_rate": 4.7250128028057546e-05, + "loss": 1.0349, + "step": 177200 + }, + { + "epoch": 2.7514393457378294, + "grad_norm": 2.3925862312316895, + "learning_rate": 4.7248576172814604e-05, + "loss": 1.0319, + "step": 177300 + }, + { + "epoch": 2.7529912009807727, + "grad_norm": 2.6727235317230225, + "learning_rate": 4.724702431757166e-05, + "loss": 1.035, + "step": 177400 + }, + { + "epoch": 2.7545430562237154, + "grad_norm": 2.3208634853363037, + "learning_rate": 4.724547246232871e-05, + "loss": 1.041, + "step": 177500 + }, + { + "epoch": 2.7560949114666586, + "grad_norm": 2.103750705718994, + "learning_rate": 4.724392060708577e-05, + "loss": 1.0714, + "step": 177600 + }, + { + "epoch": 2.7576467667096014, + "grad_norm": 2.2714037895202637, + "learning_rate": 4.724236875184283e-05, + "loss": 1.0486, + "step": 177700 + }, + { + "epoch": 2.759198621952544, + "grad_norm": 2.135613441467285, + "learning_rate": 4.7240816896599886e-05, + "loss": 1.0447, + "step": 177800 + }, + { + "epoch": 2.7607504771954874, + "grad_norm": 2.8759498596191406, + "learning_rate": 4.7239265041356944e-05, + "loss": 1.0645, + "step": 177900 + }, + { + "epoch": 2.76230233243843, + "grad_norm": 2.096858024597168, + "learning_rate": 4.7237713186114e-05, + "loss": 1.0624, + "step": 178000 + }, + { + "epoch": 2.763854187681373, + "grad_norm": 2.065045118331909, + "learning_rate": 4.723616133087106e-05, + "loss": 1.0235, + "step": 178100 + }, + { + "epoch": 2.765406042924316, + "grad_norm": 2.4445083141326904, + "learning_rate": 4.723460947562812e-05, + "loss": 1.0471, + "step": 178200 + }, + { + "epoch": 2.766957898167259, + "grad_norm": 2.600390911102295, + "learning_rate": 4.7233057620385175e-05, + "loss": 1.0201, + "step": 178300 + }, + { + "epoch": 2.7685097534102017, + "grad_norm": 2.073873281478882, + "learning_rate": 4.723150576514223e-05, + "loss": 1.0305, + "step": 178400 + }, + { + "epoch": 2.770061608653145, + "grad_norm": 1.985986590385437, + "learning_rate": 4.722995390989929e-05, + "loss": 1.0285, + "step": 178500 + }, + { + "epoch": 2.7716134638960876, + "grad_norm": 2.2954366207122803, + "learning_rate": 4.722840205465635e-05, + "loss": 1.0419, + "step": 178600 + }, + { + "epoch": 2.773165319139031, + "grad_norm": 2.4491689205169678, + "learning_rate": 4.7226850199413406e-05, + "loss": 1.0235, + "step": 178700 + }, + { + "epoch": 2.7747171743819736, + "grad_norm": 2.285947799682617, + "learning_rate": 4.722529834417046e-05, + "loss": 1.0585, + "step": 178800 + }, + { + "epoch": 2.776269029624917, + "grad_norm": 2.216391086578369, + "learning_rate": 4.7223746488927515e-05, + "loss": 1.0451, + "step": 178900 + }, + { + "epoch": 2.7778208848678596, + "grad_norm": 2.3627991676330566, + "learning_rate": 4.722219463368457e-05, + "loss": 1.0553, + "step": 179000 + }, + { + "epoch": 2.7793727401108024, + "grad_norm": 2.602961778640747, + "learning_rate": 4.722064277844163e-05, + "loss": 1.0311, + "step": 179100 + }, + { + "epoch": 2.7809245953537456, + "grad_norm": 2.1210010051727295, + "learning_rate": 4.721909092319869e-05, + "loss": 1.029, + "step": 179200 + }, + { + "epoch": 2.7824764505966884, + "grad_norm": 2.0719997882843018, + "learning_rate": 4.7217539067955746e-05, + "loss": 1.0385, + "step": 179300 + }, + { + "epoch": 2.784028305839631, + "grad_norm": 2.9635632038116455, + "learning_rate": 4.7215987212712797e-05, + "loss": 1.0255, + "step": 179400 + }, + { + "epoch": 2.7855801610825743, + "grad_norm": 2.32891583442688, + "learning_rate": 4.7214435357469854e-05, + "loss": 1.0463, + "step": 179500 + }, + { + "epoch": 2.787132016325517, + "grad_norm": 1.9537440538406372, + "learning_rate": 4.721288350222691e-05, + "loss": 1.009, + "step": 179600 + }, + { + "epoch": 2.78868387156846, + "grad_norm": 1.9633527994155884, + "learning_rate": 4.721133164698397e-05, + "loss": 1.0566, + "step": 179700 + }, + { + "epoch": 2.790235726811403, + "grad_norm": 2.7678892612457275, + "learning_rate": 4.720977979174103e-05, + "loss": 1.0455, + "step": 179800 + }, + { + "epoch": 2.791787582054346, + "grad_norm": 2.4441540241241455, + "learning_rate": 4.7208227936498085e-05, + "loss": 1.0414, + "step": 179900 + }, + { + "epoch": 2.793339437297289, + "grad_norm": 2.1662371158599854, + "learning_rate": 4.720667608125514e-05, + "loss": 1.0222, + "step": 180000 + }, + { + "epoch": 2.794891292540232, + "grad_norm": 1.887628197669983, + "learning_rate": 4.72051242260122e-05, + "loss": 1.0427, + "step": 180100 + }, + { + "epoch": 2.796443147783175, + "grad_norm": 2.570028305053711, + "learning_rate": 4.720357237076926e-05, + "loss": 1.0462, + "step": 180200 + }, + { + "epoch": 2.797995003026118, + "grad_norm": 1.7297128438949585, + "learning_rate": 4.720202051552631e-05, + "loss": 1.045, + "step": 180300 + }, + { + "epoch": 2.7995468582690606, + "grad_norm": 2.0217483043670654, + "learning_rate": 4.720046866028337e-05, + "loss": 1.0563, + "step": 180400 + }, + { + "epoch": 2.801098713512004, + "grad_norm": 2.2368738651275635, + "learning_rate": 4.7198916805040425e-05, + "loss": 1.0444, + "step": 180500 + }, + { + "epoch": 2.8026505687549466, + "grad_norm": 2.3537797927856445, + "learning_rate": 4.719736494979748e-05, + "loss": 1.0314, + "step": 180600 + }, + { + "epoch": 2.8042024239978893, + "grad_norm": 2.481429100036621, + "learning_rate": 4.719581309455454e-05, + "loss": 1.0197, + "step": 180700 + }, + { + "epoch": 2.8057542792408325, + "grad_norm": 2.388336181640625, + "learning_rate": 4.71942612393116e-05, + "loss": 1.0475, + "step": 180800 + }, + { + "epoch": 2.8073061344837753, + "grad_norm": 2.736994504928589, + "learning_rate": 4.7192709384068656e-05, + "loss": 1.0312, + "step": 180900 + }, + { + "epoch": 2.808857989726718, + "grad_norm": 2.5262064933776855, + "learning_rate": 4.7191157528825714e-05, + "loss": 1.0531, + "step": 181000 + }, + { + "epoch": 2.8104098449696613, + "grad_norm": 1.9915307760238647, + "learning_rate": 4.718960567358277e-05, + "loss": 1.0462, + "step": 181100 + }, + { + "epoch": 2.811961700212604, + "grad_norm": 2.344263792037964, + "learning_rate": 4.718805381833983e-05, + "loss": 1.0091, + "step": 181200 + }, + { + "epoch": 2.8135135554555473, + "grad_norm": 2.3820552825927734, + "learning_rate": 4.718650196309689e-05, + "loss": 1.0361, + "step": 181300 + }, + { + "epoch": 2.81506541069849, + "grad_norm": 2.1658577919006348, + "learning_rate": 4.7184950107853945e-05, + "loss": 1.0436, + "step": 181400 + }, + { + "epoch": 2.8166172659414332, + "grad_norm": 2.4356305599212646, + "learning_rate": 4.7183398252611e-05, + "loss": 1.035, + "step": 181500 + }, + { + "epoch": 2.818169121184376, + "grad_norm": 2.114675760269165, + "learning_rate": 4.7181846397368054e-05, + "loss": 1.0465, + "step": 181600 + }, + { + "epoch": 2.819720976427319, + "grad_norm": 2.0706875324249268, + "learning_rate": 4.718029454212511e-05, + "loss": 1.057, + "step": 181700 + }, + { + "epoch": 2.821272831670262, + "grad_norm": 2.4009530544281006, + "learning_rate": 4.717874268688217e-05, + "loss": 1.0285, + "step": 181800 + }, + { + "epoch": 2.8228246869132048, + "grad_norm": 2.498262643814087, + "learning_rate": 4.717719083163923e-05, + "loss": 1.036, + "step": 181900 + }, + { + "epoch": 2.8243765421561475, + "grad_norm": 2.347062587738037, + "learning_rate": 4.7175638976396285e-05, + "loss": 1.0438, + "step": 182000 + }, + { + "epoch": 2.8259283973990907, + "grad_norm": 2.0232503414154053, + "learning_rate": 4.717408712115334e-05, + "loss": 1.0382, + "step": 182100 + }, + { + "epoch": 2.8274802526420335, + "grad_norm": 2.401533842086792, + "learning_rate": 4.71725352659104e-05, + "loss": 1.0261, + "step": 182200 + }, + { + "epoch": 2.8290321078849763, + "grad_norm": 1.9756814241409302, + "learning_rate": 4.717098341066746e-05, + "loss": 1.0485, + "step": 182300 + }, + { + "epoch": 2.8305839631279195, + "grad_norm": 2.5054469108581543, + "learning_rate": 4.7169431555424516e-05, + "loss": 1.0553, + "step": 182400 + }, + { + "epoch": 2.8321358183708623, + "grad_norm": 2.3585667610168457, + "learning_rate": 4.716787970018157e-05, + "loss": 1.0382, + "step": 182500 + }, + { + "epoch": 2.8336876736138055, + "grad_norm": 1.8390778303146362, + "learning_rate": 4.7166327844938624e-05, + "loss": 1.0374, + "step": 182600 + }, + { + "epoch": 2.8352395288567482, + "grad_norm": 2.443948745727539, + "learning_rate": 4.716477598969568e-05, + "loss": 1.026, + "step": 182700 + }, + { + "epoch": 2.8367913840996914, + "grad_norm": 2.17437744140625, + "learning_rate": 4.716322413445274e-05, + "loss": 1.0449, + "step": 182800 + }, + { + "epoch": 2.838343239342634, + "grad_norm": 2.456134557723999, + "learning_rate": 4.71616722792098e-05, + "loss": 1.0381, + "step": 182900 + }, + { + "epoch": 2.839895094585577, + "grad_norm": 1.9899870157241821, + "learning_rate": 4.7160120423966855e-05, + "loss": 1.019, + "step": 183000 + }, + { + "epoch": 2.84144694982852, + "grad_norm": 2.103511333465576, + "learning_rate": 4.715856856872391e-05, + "loss": 1.035, + "step": 183100 + }, + { + "epoch": 2.842998805071463, + "grad_norm": 2.0619964599609375, + "learning_rate": 4.7157016713480964e-05, + "loss": 1.0415, + "step": 183200 + }, + { + "epoch": 2.8445506603144057, + "grad_norm": 2.082120656967163, + "learning_rate": 4.715546485823802e-05, + "loss": 1.0491, + "step": 183300 + }, + { + "epoch": 2.846102515557349, + "grad_norm": 2.3282582759857178, + "learning_rate": 4.715391300299508e-05, + "loss": 1.053, + "step": 183400 + }, + { + "epoch": 2.8476543708002917, + "grad_norm": 2.5951757431030273, + "learning_rate": 4.715236114775214e-05, + "loss": 1.038, + "step": 183500 + }, + { + "epoch": 2.8492062260432345, + "grad_norm": 2.437112331390381, + "learning_rate": 4.7150809292509195e-05, + "loss": 1.0142, + "step": 183600 + }, + { + "epoch": 2.8507580812861777, + "grad_norm": 2.447910785675049, + "learning_rate": 4.714925743726625e-05, + "loss": 1.0344, + "step": 183700 + }, + { + "epoch": 2.8523099365291205, + "grad_norm": 2.774915933609009, + "learning_rate": 4.714770558202331e-05, + "loss": 1.0377, + "step": 183800 + }, + { + "epoch": 2.8538617917720632, + "grad_norm": 1.8265098333358765, + "learning_rate": 4.714615372678037e-05, + "loss": 1.0396, + "step": 183900 + }, + { + "epoch": 2.8554136470150064, + "grad_norm": 2.375612497329712, + "learning_rate": 4.7144601871537426e-05, + "loss": 1.0392, + "step": 184000 + }, + { + "epoch": 2.8569655022579497, + "grad_norm": 2.156926155090332, + "learning_rate": 4.7143050016294484e-05, + "loss": 1.0469, + "step": 184100 + }, + { + "epoch": 2.8585173575008924, + "grad_norm": 2.006354331970215, + "learning_rate": 4.714149816105154e-05, + "loss": 1.0334, + "step": 184200 + }, + { + "epoch": 2.860069212743835, + "grad_norm": 2.555560827255249, + "learning_rate": 4.71399463058086e-05, + "loss": 1.0473, + "step": 184300 + }, + { + "epoch": 2.8616210679867784, + "grad_norm": 2.1230671405792236, + "learning_rate": 4.713839445056566e-05, + "loss": 1.0402, + "step": 184400 + }, + { + "epoch": 2.863172923229721, + "grad_norm": 2.0263772010803223, + "learning_rate": 4.713684259532271e-05, + "loss": 1.031, + "step": 184500 + }, + { + "epoch": 2.864724778472664, + "grad_norm": 1.9387094974517822, + "learning_rate": 4.7135290740079766e-05, + "loss": 1.0022, + "step": 184600 + }, + { + "epoch": 2.866276633715607, + "grad_norm": 2.5151021480560303, + "learning_rate": 4.7133738884836824e-05, + "loss": 1.0422, + "step": 184700 + }, + { + "epoch": 2.86782848895855, + "grad_norm": 1.9107468128204346, + "learning_rate": 4.713218702959388e-05, + "loss": 1.0337, + "step": 184800 + }, + { + "epoch": 2.8693803442014927, + "grad_norm": 2.1471190452575684, + "learning_rate": 4.713063517435094e-05, + "loss": 1.0074, + "step": 184900 + }, + { + "epoch": 2.870932199444436, + "grad_norm": 2.553988456726074, + "learning_rate": 4.7129083319108e-05, + "loss": 1.0299, + "step": 185000 + }, + { + "epoch": 2.8724840546873787, + "grad_norm": 2.2979228496551514, + "learning_rate": 4.7127531463865055e-05, + "loss": 1.036, + "step": 185100 + }, + { + "epoch": 2.8740359099303214, + "grad_norm": 2.090477466583252, + "learning_rate": 4.712597960862211e-05, + "loss": 1.025, + "step": 185200 + }, + { + "epoch": 2.8755877651732646, + "grad_norm": 2.046459913253784, + "learning_rate": 4.712442775337917e-05, + "loss": 1.0495, + "step": 185300 + }, + { + "epoch": 2.8771396204162074, + "grad_norm": 2.2629613876342773, + "learning_rate": 4.712287589813623e-05, + "loss": 1.0419, + "step": 185400 + }, + { + "epoch": 2.8786914756591506, + "grad_norm": 1.9247848987579346, + "learning_rate": 4.7121324042893286e-05, + "loss": 1.018, + "step": 185500 + }, + { + "epoch": 2.8802433309020934, + "grad_norm": 2.1730329990386963, + "learning_rate": 4.711977218765034e-05, + "loss": 1.0186, + "step": 185600 + }, + { + "epoch": 2.8817951861450366, + "grad_norm": 2.3092572689056396, + "learning_rate": 4.71182203324074e-05, + "loss": 1.0235, + "step": 185700 + }, + { + "epoch": 2.8833470413879794, + "grad_norm": 2.4735634326934814, + "learning_rate": 4.711666847716445e-05, + "loss": 1.0264, + "step": 185800 + }, + { + "epoch": 2.884898896630922, + "grad_norm": 1.7579586505889893, + "learning_rate": 4.711511662192151e-05, + "loss": 1.0437, + "step": 185900 + }, + { + "epoch": 2.8864507518738654, + "grad_norm": 2.1527419090270996, + "learning_rate": 4.711356476667856e-05, + "loss": 1.0413, + "step": 186000 + }, + { + "epoch": 2.888002607116808, + "grad_norm": 2.3001890182495117, + "learning_rate": 4.711201291143562e-05, + "loss": 1.0335, + "step": 186100 + }, + { + "epoch": 2.889554462359751, + "grad_norm": 3.010706663131714, + "learning_rate": 4.7110461056192676e-05, + "loss": 1.0345, + "step": 186200 + }, + { + "epoch": 2.891106317602694, + "grad_norm": 2.3039863109588623, + "learning_rate": 4.7108909200949734e-05, + "loss": 1.0476, + "step": 186300 + }, + { + "epoch": 2.892658172845637, + "grad_norm": 2.2111706733703613, + "learning_rate": 4.710735734570679e-05, + "loss": 1.0293, + "step": 186400 + }, + { + "epoch": 2.8942100280885796, + "grad_norm": 2.36498761177063, + "learning_rate": 4.710580549046385e-05, + "loss": 1.031, + "step": 186500 + }, + { + "epoch": 2.895761883331523, + "grad_norm": 1.7939813137054443, + "learning_rate": 4.710425363522091e-05, + "loss": 1.0265, + "step": 186600 + }, + { + "epoch": 2.8973137385744656, + "grad_norm": 2.330825090408325, + "learning_rate": 4.7102701779977965e-05, + "loss": 1.0625, + "step": 186700 + }, + { + "epoch": 2.898865593817409, + "grad_norm": 2.151470184326172, + "learning_rate": 4.710114992473502e-05, + "loss": 1.0237, + "step": 186800 + }, + { + "epoch": 2.9004174490603516, + "grad_norm": 2.305598258972168, + "learning_rate": 4.709959806949208e-05, + "loss": 1.0594, + "step": 186900 + }, + { + "epoch": 2.901969304303295, + "grad_norm": 2.183980941772461, + "learning_rate": 4.709804621424914e-05, + "loss": 1.0406, + "step": 187000 + }, + { + "epoch": 2.9035211595462376, + "grad_norm": 2.6724209785461426, + "learning_rate": 4.7096494359006196e-05, + "loss": 1.0574, + "step": 187100 + }, + { + "epoch": 2.9050730147891803, + "grad_norm": 2.159895181655884, + "learning_rate": 4.7094942503763254e-05, + "loss": 1.0236, + "step": 187200 + }, + { + "epoch": 2.9066248700321236, + "grad_norm": 2.602503538131714, + "learning_rate": 4.7093390648520305e-05, + "loss": 1.0191, + "step": 187300 + }, + { + "epoch": 2.9081767252750663, + "grad_norm": 2.2885754108428955, + "learning_rate": 4.709183879327736e-05, + "loss": 1.0375, + "step": 187400 + }, + { + "epoch": 2.909728580518009, + "grad_norm": 2.3764402866363525, + "learning_rate": 4.709028693803442e-05, + "loss": 1.0421, + "step": 187500 + }, + { + "epoch": 2.9112804357609523, + "grad_norm": 2.0926246643066406, + "learning_rate": 4.708873508279148e-05, + "loss": 1.0243, + "step": 187600 + }, + { + "epoch": 2.912832291003895, + "grad_norm": 2.254619836807251, + "learning_rate": 4.7087183227548536e-05, + "loss": 1.0133, + "step": 187700 + }, + { + "epoch": 2.914384146246838, + "grad_norm": 2.3661041259765625, + "learning_rate": 4.7085631372305594e-05, + "loss": 1.0287, + "step": 187800 + }, + { + "epoch": 2.915936001489781, + "grad_norm": 2.4754600524902344, + "learning_rate": 4.708407951706265e-05, + "loss": 1.0391, + "step": 187900 + }, + { + "epoch": 2.917487856732724, + "grad_norm": 2.4904284477233887, + "learning_rate": 4.708252766181971e-05, + "loss": 1.0278, + "step": 188000 + }, + { + "epoch": 2.919039711975667, + "grad_norm": 2.2915358543395996, + "learning_rate": 4.708097580657677e-05, + "loss": 1.0371, + "step": 188100 + }, + { + "epoch": 2.92059156721861, + "grad_norm": 2.5260331630706787, + "learning_rate": 4.7079423951333825e-05, + "loss": 1.0438, + "step": 188200 + }, + { + "epoch": 2.922143422461553, + "grad_norm": 2.7706286907196045, + "learning_rate": 4.707787209609088e-05, + "loss": 1.0525, + "step": 188300 + }, + { + "epoch": 2.9236952777044958, + "grad_norm": 2.768744945526123, + "learning_rate": 4.707632024084794e-05, + "loss": 1.0359, + "step": 188400 + }, + { + "epoch": 2.9252471329474385, + "grad_norm": 2.4445366859436035, + "learning_rate": 4.7074768385605e-05, + "loss": 1.0659, + "step": 188500 + }, + { + "epoch": 2.9267989881903818, + "grad_norm": 2.203753709793091, + "learning_rate": 4.707321653036205e-05, + "loss": 1.0294, + "step": 188600 + }, + { + "epoch": 2.9283508434333245, + "grad_norm": 2.6416361331939697, + "learning_rate": 4.7071664675119107e-05, + "loss": 1.0164, + "step": 188700 + }, + { + "epoch": 2.9299026986762673, + "grad_norm": 2.0499093532562256, + "learning_rate": 4.7070112819876164e-05, + "loss": 1.0113, + "step": 188800 + }, + { + "epoch": 2.9314545539192105, + "grad_norm": 2.1293187141418457, + "learning_rate": 4.706856096463322e-05, + "loss": 1.0468, + "step": 188900 + }, + { + "epoch": 2.9330064091621533, + "grad_norm": 2.1990184783935547, + "learning_rate": 4.706700910939028e-05, + "loss": 1.0318, + "step": 189000 + }, + { + "epoch": 2.934558264405096, + "grad_norm": 2.421170473098755, + "learning_rate": 4.706545725414733e-05, + "loss": 1.0362, + "step": 189100 + }, + { + "epoch": 2.9361101196480393, + "grad_norm": 1.8146953582763672, + "learning_rate": 4.706390539890439e-05, + "loss": 1.0267, + "step": 189200 + }, + { + "epoch": 2.937661974890982, + "grad_norm": 2.159708023071289, + "learning_rate": 4.7062353543661446e-05, + "loss": 1.0154, + "step": 189300 + }, + { + "epoch": 2.9392138301339252, + "grad_norm": 2.5605711936950684, + "learning_rate": 4.7060801688418504e-05, + "loss": 1.0293, + "step": 189400 + }, + { + "epoch": 2.940765685376868, + "grad_norm": 2.3768041133880615, + "learning_rate": 4.705924983317556e-05, + "loss": 1.019, + "step": 189500 + }, + { + "epoch": 2.942317540619811, + "grad_norm": 2.371068000793457, + "learning_rate": 4.705769797793262e-05, + "loss": 1.0269, + "step": 189600 + }, + { + "epoch": 2.943869395862754, + "grad_norm": 2.5876991748809814, + "learning_rate": 4.705614612268968e-05, + "loss": 1.0401, + "step": 189700 + }, + { + "epoch": 2.9454212511056967, + "grad_norm": 2.299680233001709, + "learning_rate": 4.7054594267446735e-05, + "loss": 1.041, + "step": 189800 + }, + { + "epoch": 2.94697310634864, + "grad_norm": 2.342620372772217, + "learning_rate": 4.705304241220379e-05, + "loss": 1.0326, + "step": 189900 + }, + { + "epoch": 2.9485249615915827, + "grad_norm": 2.1082839965820312, + "learning_rate": 4.705149055696085e-05, + "loss": 1.0386, + "step": 190000 + }, + { + "epoch": 2.9500768168345255, + "grad_norm": 1.9586697816848755, + "learning_rate": 4.704993870171791e-05, + "loss": 1.0479, + "step": 190100 + }, + { + "epoch": 2.9516286720774687, + "grad_norm": 2.053753137588501, + "learning_rate": 4.704838684647496e-05, + "loss": 1.0584, + "step": 190200 + }, + { + "epoch": 2.9531805273204115, + "grad_norm": 2.466909885406494, + "learning_rate": 4.704683499123202e-05, + "loss": 1.0305, + "step": 190300 + }, + { + "epoch": 2.9547323825633542, + "grad_norm": 1.973644733428955, + "learning_rate": 4.7045283135989075e-05, + "loss": 1.0163, + "step": 190400 + }, + { + "epoch": 2.9562842378062975, + "grad_norm": 2.056447982788086, + "learning_rate": 4.704373128074613e-05, + "loss": 1.0215, + "step": 190500 + }, + { + "epoch": 2.9578360930492402, + "grad_norm": 2.1640326976776123, + "learning_rate": 4.704217942550319e-05, + "loss": 1.0026, + "step": 190600 + }, + { + "epoch": 2.9593879482921834, + "grad_norm": 2.4542548656463623, + "learning_rate": 4.704062757026025e-05, + "loss": 1.0527, + "step": 190700 + }, + { + "epoch": 2.960939803535126, + "grad_norm": 2.8114395141601562, + "learning_rate": 4.7039075715017306e-05, + "loss": 1.0339, + "step": 190800 + }, + { + "epoch": 2.9624916587780694, + "grad_norm": 1.7165937423706055, + "learning_rate": 4.7037523859774364e-05, + "loss": 1.0486, + "step": 190900 + }, + { + "epoch": 2.964043514021012, + "grad_norm": 1.8165018558502197, + "learning_rate": 4.703597200453142e-05, + "loss": 1.0484, + "step": 191000 + }, + { + "epoch": 2.965595369263955, + "grad_norm": 2.210017681121826, + "learning_rate": 4.703442014928848e-05, + "loss": 1.0208, + "step": 191100 + }, + { + "epoch": 2.967147224506898, + "grad_norm": 1.9644007682800293, + "learning_rate": 4.703286829404554e-05, + "loss": 1.03, + "step": 191200 + }, + { + "epoch": 2.968699079749841, + "grad_norm": 2.1285908222198486, + "learning_rate": 4.7031316438802595e-05, + "loss": 1.0425, + "step": 191300 + }, + { + "epoch": 2.9702509349927837, + "grad_norm": 2.1437644958496094, + "learning_rate": 4.702976458355965e-05, + "loss": 1.0337, + "step": 191400 + }, + { + "epoch": 2.971802790235727, + "grad_norm": 2.3303661346435547, + "learning_rate": 4.70282127283167e-05, + "loss": 1.0239, + "step": 191500 + }, + { + "epoch": 2.9733546454786697, + "grad_norm": 2.347890615463257, + "learning_rate": 4.702666087307376e-05, + "loss": 1.0488, + "step": 191600 + }, + { + "epoch": 2.9749065007216124, + "grad_norm": 2.1174182891845703, + "learning_rate": 4.702510901783082e-05, + "loss": 1.0281, + "step": 191700 + }, + { + "epoch": 2.9764583559645557, + "grad_norm": 2.1659739017486572, + "learning_rate": 4.7023557162587877e-05, + "loss": 1.026, + "step": 191800 + }, + { + "epoch": 2.9780102112074984, + "grad_norm": 2.4842588901519775, + "learning_rate": 4.7022005307344934e-05, + "loss": 1.0445, + "step": 191900 + }, + { + "epoch": 2.9795620664504416, + "grad_norm": 2.3452415466308594, + "learning_rate": 4.702045345210199e-05, + "loss": 1.037, + "step": 192000 + }, + { + "epoch": 2.9811139216933844, + "grad_norm": 1.9928044080734253, + "learning_rate": 4.701890159685905e-05, + "loss": 1.0123, + "step": 192100 + }, + { + "epoch": 2.9826657769363276, + "grad_norm": 2.5669846534729004, + "learning_rate": 4.701734974161611e-05, + "loss": 1.0552, + "step": 192200 + }, + { + "epoch": 2.9842176321792704, + "grad_norm": 2.2527923583984375, + "learning_rate": 4.7015797886373165e-05, + "loss": 1.0273, + "step": 192300 + }, + { + "epoch": 2.985769487422213, + "grad_norm": 2.880582094192505, + "learning_rate": 4.7014246031130216e-05, + "loss": 1.0294, + "step": 192400 + }, + { + "epoch": 2.9873213426651564, + "grad_norm": 2.2426044940948486, + "learning_rate": 4.7012694175887274e-05, + "loss": 1.0365, + "step": 192500 + }, + { + "epoch": 2.988873197908099, + "grad_norm": 2.2233119010925293, + "learning_rate": 4.701114232064433e-05, + "loss": 1.0293, + "step": 192600 + }, + { + "epoch": 2.990425053151042, + "grad_norm": 2.3352088928222656, + "learning_rate": 4.700959046540139e-05, + "loss": 1.0327, + "step": 192700 + }, + { + "epoch": 2.991976908393985, + "grad_norm": 2.204885482788086, + "learning_rate": 4.700803861015845e-05, + "loss": 1.0384, + "step": 192800 + }, + { + "epoch": 2.993528763636928, + "grad_norm": 2.6361398696899414, + "learning_rate": 4.7006486754915505e-05, + "loss": 1.0477, + "step": 192900 + }, + { + "epoch": 2.9950806188798706, + "grad_norm": 1.9669020175933838, + "learning_rate": 4.7004934899672556e-05, + "loss": 1.0282, + "step": 193000 + }, + { + "epoch": 2.996632474122814, + "grad_norm": 2.1864845752716064, + "learning_rate": 4.7003383044429614e-05, + "loss": 1.0192, + "step": 193100 + }, + { + "epoch": 2.9981843293657566, + "grad_norm": 2.3880720138549805, + "learning_rate": 4.700183118918667e-05, + "loss": 1.0402, + "step": 193200 + }, + { + "epoch": 2.9997361846087, + "grad_norm": 2.177459478378296, + "learning_rate": 4.700027933394373e-05, + "loss": 1.0148, + "step": 193300 + }, + { + "epoch": 3.0012880398516426, + "grad_norm": 2.38643217086792, + "learning_rate": 4.699872747870079e-05, + "loss": 1.0377, + "step": 193400 + }, + { + "epoch": 3.0028398950945854, + "grad_norm": 2.6639795303344727, + "learning_rate": 4.6997175623457845e-05, + "loss": 1.0563, + "step": 193500 + }, + { + "epoch": 3.0043917503375286, + "grad_norm": 2.132826328277588, + "learning_rate": 4.69956237682149e-05, + "loss": 1.0268, + "step": 193600 + }, + { + "epoch": 3.0059436055804714, + "grad_norm": 2.4893951416015625, + "learning_rate": 4.699407191297196e-05, + "loss": 1.0565, + "step": 193700 + }, + { + "epoch": 3.0074954608234146, + "grad_norm": 2.316396474838257, + "learning_rate": 4.699252005772902e-05, + "loss": 1.0274, + "step": 193800 + }, + { + "epoch": 3.0090473160663573, + "grad_norm": 2.286466360092163, + "learning_rate": 4.6990968202486076e-05, + "loss": 1.036, + "step": 193900 + }, + { + "epoch": 3.0105991713093, + "grad_norm": 2.2694833278656006, + "learning_rate": 4.6989416347243134e-05, + "loss": 1.025, + "step": 194000 + }, + { + "epoch": 3.0121510265522433, + "grad_norm": 2.5719926357269287, + "learning_rate": 4.698786449200019e-05, + "loss": 1.0261, + "step": 194100 + }, + { + "epoch": 3.013702881795186, + "grad_norm": 5.869794845581055, + "learning_rate": 4.698631263675725e-05, + "loss": 1.0307, + "step": 194200 + }, + { + "epoch": 3.0152547370381293, + "grad_norm": 2.239471912384033, + "learning_rate": 4.69847607815143e-05, + "loss": 1.0107, + "step": 194300 + }, + { + "epoch": 3.016806592281072, + "grad_norm": 2.3318614959716797, + "learning_rate": 4.698320892627136e-05, + "loss": 1.0316, + "step": 194400 + }, + { + "epoch": 3.018358447524015, + "grad_norm": 2.550088882446289, + "learning_rate": 4.6981657071028416e-05, + "loss": 1.0412, + "step": 194500 + }, + { + "epoch": 3.019910302766958, + "grad_norm": 2.599668502807617, + "learning_rate": 4.698010521578547e-05, + "loss": 1.0343, + "step": 194600 + }, + { + "epoch": 3.021462158009901, + "grad_norm": 2.2850258350372314, + "learning_rate": 4.697855336054253e-05, + "loss": 1.0558, + "step": 194700 + }, + { + "epoch": 3.0230140132528436, + "grad_norm": 2.204380512237549, + "learning_rate": 4.697700150529959e-05, + "loss": 1.0128, + "step": 194800 + }, + { + "epoch": 3.024565868495787, + "grad_norm": 2.282245397567749, + "learning_rate": 4.6975449650056647e-05, + "loss": 1.0511, + "step": 194900 + }, + { + "epoch": 3.0261177237387296, + "grad_norm": 2.1987249851226807, + "learning_rate": 4.6973897794813704e-05, + "loss": 1.0328, + "step": 195000 + }, + { + "epoch": 3.0276695789816728, + "grad_norm": 2.3703179359436035, + "learning_rate": 4.697234593957076e-05, + "loss": 1.031, + "step": 195100 + }, + { + "epoch": 3.0292214342246155, + "grad_norm": 2.0956859588623047, + "learning_rate": 4.697079408432782e-05, + "loss": 1.0068, + "step": 195200 + }, + { + "epoch": 3.0307732894675583, + "grad_norm": 2.2914812564849854, + "learning_rate": 4.696924222908488e-05, + "loss": 1.0327, + "step": 195300 + }, + { + "epoch": 3.0323251447105015, + "grad_norm": 2.428166627883911, + "learning_rate": 4.6967690373841935e-05, + "loss": 1.0435, + "step": 195400 + }, + { + "epoch": 3.0338769999534443, + "grad_norm": 2.140897750854492, + "learning_rate": 4.696613851859899e-05, + "loss": 1.041, + "step": 195500 + }, + { + "epoch": 3.0354288551963875, + "grad_norm": 2.000777244567871, + "learning_rate": 4.6964586663356044e-05, + "loss": 1.0155, + "step": 195600 + }, + { + "epoch": 3.0369807104393303, + "grad_norm": 2.3136231899261475, + "learning_rate": 4.69630348081131e-05, + "loss": 1.0425, + "step": 195700 + }, + { + "epoch": 3.038532565682273, + "grad_norm": 2.113478183746338, + "learning_rate": 4.696148295287015e-05, + "loss": 1.0389, + "step": 195800 + }, + { + "epoch": 3.0400844209252162, + "grad_norm": 2.2906436920166016, + "learning_rate": 4.695993109762721e-05, + "loss": 1.0313, + "step": 195900 + }, + { + "epoch": 3.041636276168159, + "grad_norm": 2.1110403537750244, + "learning_rate": 4.695837924238427e-05, + "loss": 1.028, + "step": 196000 + }, + { + "epoch": 3.043188131411102, + "grad_norm": 2.0373849868774414, + "learning_rate": 4.6956827387141326e-05, + "loss": 1.0144, + "step": 196100 + }, + { + "epoch": 3.044739986654045, + "grad_norm": 2.5317904949188232, + "learning_rate": 4.6955275531898384e-05, + "loss": 1.0362, + "step": 196200 + }, + { + "epoch": 3.0462918418969878, + "grad_norm": 1.8818609714508057, + "learning_rate": 4.695372367665544e-05, + "loss": 1.0294, + "step": 196300 + }, + { + "epoch": 3.047843697139931, + "grad_norm": 2.0409302711486816, + "learning_rate": 4.69521718214125e-05, + "loss": 1.0188, + "step": 196400 + }, + { + "epoch": 3.0493955523828737, + "grad_norm": 2.394728899002075, + "learning_rate": 4.695061996616956e-05, + "loss": 1.0306, + "step": 196500 + }, + { + "epoch": 3.0509474076258165, + "grad_norm": 2.4261462688446045, + "learning_rate": 4.6949068110926615e-05, + "loss": 1.0447, + "step": 196600 + }, + { + "epoch": 3.0524992628687597, + "grad_norm": 1.7856899499893188, + "learning_rate": 4.694751625568367e-05, + "loss": 1.0403, + "step": 196700 + }, + { + "epoch": 3.0540511181117025, + "grad_norm": 2.1757800579071045, + "learning_rate": 4.694596440044073e-05, + "loss": 1.0305, + "step": 196800 + }, + { + "epoch": 3.0556029733546453, + "grad_norm": 2.5751469135284424, + "learning_rate": 4.694441254519779e-05, + "loss": 1.0433, + "step": 196900 + }, + { + "epoch": 3.0571548285975885, + "grad_norm": 2.0382914543151855, + "learning_rate": 4.6942860689954846e-05, + "loss": 1.0157, + "step": 197000 + }, + { + "epoch": 3.0587066838405312, + "grad_norm": 2.3272671699523926, + "learning_rate": 4.69413088347119e-05, + "loss": 1.0261, + "step": 197100 + }, + { + "epoch": 3.0602585390834744, + "grad_norm": 2.044356346130371, + "learning_rate": 4.6939756979468955e-05, + "loss": 1.0429, + "step": 197200 + }, + { + "epoch": 3.061810394326417, + "grad_norm": 1.988494873046875, + "learning_rate": 4.693820512422601e-05, + "loss": 1.0252, + "step": 197300 + }, + { + "epoch": 3.06336224956936, + "grad_norm": 2.347303867340088, + "learning_rate": 4.693665326898307e-05, + "loss": 1.0427, + "step": 197400 + }, + { + "epoch": 3.064914104812303, + "grad_norm": 2.1885454654693604, + "learning_rate": 4.693510141374013e-05, + "loss": 1.0323, + "step": 197500 + }, + { + "epoch": 3.066465960055246, + "grad_norm": 2.6395416259765625, + "learning_rate": 4.6933549558497186e-05, + "loss": 1.0268, + "step": 197600 + }, + { + "epoch": 3.068017815298189, + "grad_norm": 2.166287422180176, + "learning_rate": 4.693199770325424e-05, + "loss": 1.012, + "step": 197700 + }, + { + "epoch": 3.069569670541132, + "grad_norm": 2.3830583095550537, + "learning_rate": 4.69304458480113e-05, + "loss": 1.0399, + "step": 197800 + }, + { + "epoch": 3.0711215257840747, + "grad_norm": 1.889291524887085, + "learning_rate": 4.692889399276836e-05, + "loss": 1.0247, + "step": 197900 + }, + { + "epoch": 3.072673381027018, + "grad_norm": 2.4712934494018555, + "learning_rate": 4.6927342137525417e-05, + "loss": 1.0245, + "step": 198000 + }, + { + "epoch": 3.0742252362699607, + "grad_norm": 2.633347988128662, + "learning_rate": 4.6925790282282474e-05, + "loss": 1.0305, + "step": 198100 + }, + { + "epoch": 3.0757770915129035, + "grad_norm": 2.4769883155822754, + "learning_rate": 4.692423842703953e-05, + "loss": 1.0298, + "step": 198200 + }, + { + "epoch": 3.0773289467558467, + "grad_norm": 1.9867851734161377, + "learning_rate": 4.692268657179659e-05, + "loss": 1.0044, + "step": 198300 + }, + { + "epoch": 3.0788808019987894, + "grad_norm": 2.213604688644409, + "learning_rate": 4.692113471655364e-05, + "loss": 1.0302, + "step": 198400 + }, + { + "epoch": 3.0804326572417327, + "grad_norm": 2.464210271835327, + "learning_rate": 4.69195828613107e-05, + "loss": 1.0357, + "step": 198500 + }, + { + "epoch": 3.0819845124846754, + "grad_norm": 2.238145112991333, + "learning_rate": 4.6918031006067756e-05, + "loss": 1.0173, + "step": 198600 + }, + { + "epoch": 3.083536367727618, + "grad_norm": 2.5813677310943604, + "learning_rate": 4.6916479150824814e-05, + "loss": 1.0095, + "step": 198700 + }, + { + "epoch": 3.0850882229705614, + "grad_norm": 2.3975701332092285, + "learning_rate": 4.691492729558187e-05, + "loss": 1.0407, + "step": 198800 + }, + { + "epoch": 3.086640078213504, + "grad_norm": 2.134744644165039, + "learning_rate": 4.691337544033892e-05, + "loss": 1.0186, + "step": 198900 + }, + { + "epoch": 3.0881919334564474, + "grad_norm": 2.562459707260132, + "learning_rate": 4.691182358509598e-05, + "loss": 1.0278, + "step": 199000 + }, + { + "epoch": 3.08974378869939, + "grad_norm": 2.1110596656799316, + "learning_rate": 4.691027172985304e-05, + "loss": 1.0094, + "step": 199100 + }, + { + "epoch": 3.091295643942333, + "grad_norm": 2.220705032348633, + "learning_rate": 4.6908719874610096e-05, + "loss": 1.0193, + "step": 199200 + }, + { + "epoch": 3.092847499185276, + "grad_norm": 2.34887957572937, + "learning_rate": 4.6907168019367154e-05, + "loss": 1.0276, + "step": 199300 + }, + { + "epoch": 3.094399354428219, + "grad_norm": 2.3114545345306396, + "learning_rate": 4.690561616412421e-05, + "loss": 1.0492, + "step": 199400 + }, + { + "epoch": 3.0959512096711617, + "grad_norm": 2.16153621673584, + "learning_rate": 4.690406430888127e-05, + "loss": 1.0264, + "step": 199500 + }, + { + "epoch": 3.097503064914105, + "grad_norm": 2.1943531036376953, + "learning_rate": 4.690251245363833e-05, + "loss": 1.0291, + "step": 199600 + }, + { + "epoch": 3.0990549201570476, + "grad_norm": 2.46353816986084, + "learning_rate": 4.6900960598395385e-05, + "loss": 1.0239, + "step": 199700 + }, + { + "epoch": 3.100606775399991, + "grad_norm": 2.054356098175049, + "learning_rate": 4.689940874315244e-05, + "loss": 1.0399, + "step": 199800 + }, + { + "epoch": 3.1021586306429336, + "grad_norm": 2.286006450653076, + "learning_rate": 4.68978568879095e-05, + "loss": 1.0342, + "step": 199900 + }, + { + "epoch": 3.1037104858858764, + "grad_norm": 2.0622918605804443, + "learning_rate": 4.689630503266655e-05, + "loss": 1.0129, + "step": 200000 + }, + { + "epoch": 3.1052623411288196, + "grad_norm": 2.433764696121216, + "learning_rate": 4.689475317742361e-05, + "loss": 1.0577, + "step": 200100 + }, + { + "epoch": 3.1068141963717624, + "grad_norm": 2.215843915939331, + "learning_rate": 4.689320132218067e-05, + "loss": 1.0293, + "step": 200200 + }, + { + "epoch": 3.1083660516147056, + "grad_norm": 2.349459409713745, + "learning_rate": 4.6891649466937725e-05, + "loss": 1.034, + "step": 200300 + }, + { + "epoch": 3.1099179068576484, + "grad_norm": 2.0848443508148193, + "learning_rate": 4.689009761169478e-05, + "loss": 1.0095, + "step": 200400 + }, + { + "epoch": 3.111469762100591, + "grad_norm": 2.5262529850006104, + "learning_rate": 4.688854575645184e-05, + "loss": 1.0339, + "step": 200500 + }, + { + "epoch": 3.1130216173435343, + "grad_norm": 2.55330491065979, + "learning_rate": 4.68869939012089e-05, + "loss": 1.0273, + "step": 200600 + }, + { + "epoch": 3.114573472586477, + "grad_norm": 2.2021408081054688, + "learning_rate": 4.6885442045965956e-05, + "loss": 1.0336, + "step": 200700 + }, + { + "epoch": 3.11612532782942, + "grad_norm": 2.430277109146118, + "learning_rate": 4.688389019072301e-05, + "loss": 1.0361, + "step": 200800 + }, + { + "epoch": 3.117677183072363, + "grad_norm": 2.0476455688476562, + "learning_rate": 4.688233833548007e-05, + "loss": 1.0085, + "step": 200900 + }, + { + "epoch": 3.119229038315306, + "grad_norm": 2.5166189670562744, + "learning_rate": 4.688078648023713e-05, + "loss": 1.0314, + "step": 201000 + }, + { + "epoch": 3.120780893558249, + "grad_norm": 2.221287488937378, + "learning_rate": 4.6879234624994187e-05, + "loss": 1.0317, + "step": 201100 + }, + { + "epoch": 3.122332748801192, + "grad_norm": 3.0448193550109863, + "learning_rate": 4.6877682769751244e-05, + "loss": 1.0127, + "step": 201200 + }, + { + "epoch": 3.1238846040441346, + "grad_norm": 2.4233529567718506, + "learning_rate": 4.6876130914508295e-05, + "loss": 1.0354, + "step": 201300 + }, + { + "epoch": 3.125436459287078, + "grad_norm": 2.2048444747924805, + "learning_rate": 4.687457905926535e-05, + "loss": 1.0269, + "step": 201400 + }, + { + "epoch": 3.1269883145300206, + "grad_norm": 2.5136120319366455, + "learning_rate": 4.687302720402241e-05, + "loss": 1.022, + "step": 201500 + }, + { + "epoch": 3.128540169772964, + "grad_norm": 2.1857991218566895, + "learning_rate": 4.687147534877947e-05, + "loss": 1.0294, + "step": 201600 + }, + { + "epoch": 3.1300920250159066, + "grad_norm": 2.1854701042175293, + "learning_rate": 4.6869923493536526e-05, + "loss": 0.983, + "step": 201700 + }, + { + "epoch": 3.1316438802588493, + "grad_norm": 2.0431981086730957, + "learning_rate": 4.6868371638293584e-05, + "loss": 1.0318, + "step": 201800 + }, + { + "epoch": 3.1331957355017925, + "grad_norm": 2.220003843307495, + "learning_rate": 4.686681978305064e-05, + "loss": 1.03, + "step": 201900 + }, + { + "epoch": 3.1347475907447353, + "grad_norm": 2.234104871749878, + "learning_rate": 4.68652679278077e-05, + "loss": 1.0193, + "step": 202000 + }, + { + "epoch": 3.136299445987678, + "grad_norm": 2.0799477100372314, + "learning_rate": 4.686371607256476e-05, + "loss": 1.0456, + "step": 202100 + }, + { + "epoch": 3.1378513012306213, + "grad_norm": 2.006730556488037, + "learning_rate": 4.686216421732181e-05, + "loss": 1.0284, + "step": 202200 + }, + { + "epoch": 3.139403156473564, + "grad_norm": 1.9896783828735352, + "learning_rate": 4.6860612362078866e-05, + "loss": 1.0332, + "step": 202300 + }, + { + "epoch": 3.1409550117165073, + "grad_norm": 2.3062851428985596, + "learning_rate": 4.6859060506835924e-05, + "loss": 1.0093, + "step": 202400 + }, + { + "epoch": 3.14250686695945, + "grad_norm": 2.223362922668457, + "learning_rate": 4.685750865159298e-05, + "loss": 1.0238, + "step": 202500 + }, + { + "epoch": 3.144058722202393, + "grad_norm": 2.6560285091400146, + "learning_rate": 4.685595679635004e-05, + "loss": 1.0299, + "step": 202600 + }, + { + "epoch": 3.145610577445336, + "grad_norm": 2.1673824787139893, + "learning_rate": 4.68544049411071e-05, + "loss": 1.0113, + "step": 202700 + }, + { + "epoch": 3.1471624326882788, + "grad_norm": 2.15631103515625, + "learning_rate": 4.685285308586415e-05, + "loss": 1.0066, + "step": 202800 + }, + { + "epoch": 3.148714287931222, + "grad_norm": 2.0604913234710693, + "learning_rate": 4.6851301230621206e-05, + "loss": 1.029, + "step": 202900 + }, + { + "epoch": 3.1502661431741648, + "grad_norm": 2.4676761627197266, + "learning_rate": 4.6849749375378264e-05, + "loss": 1.0442, + "step": 203000 + }, + { + "epoch": 3.1518179984171075, + "grad_norm": 2.266402006149292, + "learning_rate": 4.684819752013532e-05, + "loss": 1.0117, + "step": 203100 + }, + { + "epoch": 3.1533698536600507, + "grad_norm": 2.346513032913208, + "learning_rate": 4.684664566489238e-05, + "loss": 1.0354, + "step": 203200 + }, + { + "epoch": 3.1549217089029935, + "grad_norm": 2.32383394241333, + "learning_rate": 4.684509380964944e-05, + "loss": 1.0368, + "step": 203300 + }, + { + "epoch": 3.1564735641459363, + "grad_norm": 2.427879571914673, + "learning_rate": 4.6843541954406495e-05, + "loss": 1.0413, + "step": 203400 + }, + { + "epoch": 3.1580254193888795, + "grad_norm": 2.173727512359619, + "learning_rate": 4.684199009916355e-05, + "loss": 1.0365, + "step": 203500 + }, + { + "epoch": 3.1595772746318223, + "grad_norm": 2.270315408706665, + "learning_rate": 4.684043824392061e-05, + "loss": 1.0329, + "step": 203600 + }, + { + "epoch": 3.1611291298747655, + "grad_norm": 1.945071816444397, + "learning_rate": 4.683888638867767e-05, + "loss": 1.0324, + "step": 203700 + }, + { + "epoch": 3.1626809851177082, + "grad_norm": 2.603261947631836, + "learning_rate": 4.6837334533434726e-05, + "loss": 1.025, + "step": 203800 + }, + { + "epoch": 3.164232840360651, + "grad_norm": 2.2213709354400635, + "learning_rate": 4.683578267819178e-05, + "loss": 1.0304, + "step": 203900 + }, + { + "epoch": 3.165784695603594, + "grad_norm": 2.2088725566864014, + "learning_rate": 4.683423082294884e-05, + "loss": 1.0174, + "step": 204000 + }, + { + "epoch": 3.167336550846537, + "grad_norm": 2.2640299797058105, + "learning_rate": 4.683267896770589e-05, + "loss": 1.0359, + "step": 204100 + }, + { + "epoch": 3.16888840608948, + "grad_norm": 2.0523431301116943, + "learning_rate": 4.683112711246295e-05, + "loss": 1.0167, + "step": 204200 + }, + { + "epoch": 3.170440261332423, + "grad_norm": 2.373732089996338, + "learning_rate": 4.682957525722001e-05, + "loss": 1.0365, + "step": 204300 + }, + { + "epoch": 3.1719921165753657, + "grad_norm": 2.410597801208496, + "learning_rate": 4.6828023401977065e-05, + "loss": 0.9916, + "step": 204400 + }, + { + "epoch": 3.173543971818309, + "grad_norm": 2.2707905769348145, + "learning_rate": 4.682647154673412e-05, + "loss": 1.0386, + "step": 204500 + }, + { + "epoch": 3.1750958270612517, + "grad_norm": 2.3058300018310547, + "learning_rate": 4.682491969149118e-05, + "loss": 1.015, + "step": 204600 + }, + { + "epoch": 3.1766476823041945, + "grad_norm": 2.025825262069702, + "learning_rate": 4.682336783624824e-05, + "loss": 1.0513, + "step": 204700 + }, + { + "epoch": 3.1781995375471377, + "grad_norm": 2.966294527053833, + "learning_rate": 4.6821815981005296e-05, + "loss": 1.0269, + "step": 204800 + }, + { + "epoch": 3.1797513927900805, + "grad_norm": 2.1126255989074707, + "learning_rate": 4.6820264125762354e-05, + "loss": 1.0018, + "step": 204900 + }, + { + "epoch": 3.1813032480330237, + "grad_norm": 1.8233951330184937, + "learning_rate": 4.681871227051941e-05, + "loss": 1.0126, + "step": 205000 + }, + { + "epoch": 3.1828551032759664, + "grad_norm": 2.4195187091827393, + "learning_rate": 4.681716041527647e-05, + "loss": 1.0121, + "step": 205100 + }, + { + "epoch": 3.184406958518909, + "grad_norm": 2.181208848953247, + "learning_rate": 4.681560856003353e-05, + "loss": 1.0281, + "step": 205200 + }, + { + "epoch": 3.1859588137618524, + "grad_norm": 2.162034511566162, + "learning_rate": 4.6814056704790585e-05, + "loss": 1.0346, + "step": 205300 + }, + { + "epoch": 3.187510669004795, + "grad_norm": 2.2806320190429688, + "learning_rate": 4.6812504849547636e-05, + "loss": 1.0002, + "step": 205400 + }, + { + "epoch": 3.1890625242477384, + "grad_norm": 2.4645237922668457, + "learning_rate": 4.6810952994304694e-05, + "loss": 1.0427, + "step": 205500 + }, + { + "epoch": 3.190614379490681, + "grad_norm": 2.1412951946258545, + "learning_rate": 4.680940113906175e-05, + "loss": 1.0107, + "step": 205600 + }, + { + "epoch": 3.192166234733624, + "grad_norm": 2.173511028289795, + "learning_rate": 4.68078492838188e-05, + "loss": 1.0407, + "step": 205700 + }, + { + "epoch": 3.193718089976567, + "grad_norm": 2.1099634170532227, + "learning_rate": 4.680629742857586e-05, + "loss": 1.0116, + "step": 205800 + }, + { + "epoch": 3.19526994521951, + "grad_norm": 2.658093214035034, + "learning_rate": 4.680474557333292e-05, + "loss": 1.01, + "step": 205900 + }, + { + "epoch": 3.1968218004624527, + "grad_norm": 1.9747809171676636, + "learning_rate": 4.6803193718089976e-05, + "loss": 1.0344, + "step": 206000 + }, + { + "epoch": 3.198373655705396, + "grad_norm": 1.9452918767929077, + "learning_rate": 4.6801641862847034e-05, + "loss": 1.033, + "step": 206100 + }, + { + "epoch": 3.1999255109483387, + "grad_norm": 2.2257845401763916, + "learning_rate": 4.680009000760409e-05, + "loss": 1.0129, + "step": 206200 + }, + { + "epoch": 3.201477366191282, + "grad_norm": 2.123009443283081, + "learning_rate": 4.679853815236115e-05, + "loss": 1.0366, + "step": 206300 + }, + { + "epoch": 3.2030292214342246, + "grad_norm": 2.2326512336730957, + "learning_rate": 4.679698629711821e-05, + "loss": 1.0508, + "step": 206400 + }, + { + "epoch": 3.2045810766771674, + "grad_norm": 1.9837554693222046, + "learning_rate": 4.6795434441875265e-05, + "loss": 1.0286, + "step": 206500 + }, + { + "epoch": 3.2061329319201106, + "grad_norm": 2.502143621444702, + "learning_rate": 4.679388258663232e-05, + "loss": 1.0204, + "step": 206600 + }, + { + "epoch": 3.2076847871630534, + "grad_norm": 2.494403839111328, + "learning_rate": 4.679233073138938e-05, + "loss": 1.0308, + "step": 206700 + }, + { + "epoch": 3.2092366424059966, + "grad_norm": 1.7361814975738525, + "learning_rate": 4.679077887614644e-05, + "loss": 1.0272, + "step": 206800 + }, + { + "epoch": 3.2107884976489394, + "grad_norm": 2.041317939758301, + "learning_rate": 4.6789227020903496e-05, + "loss": 1.0396, + "step": 206900 + }, + { + "epoch": 3.212340352891882, + "grad_norm": 2.230377435684204, + "learning_rate": 4.6787675165660547e-05, + "loss": 1.008, + "step": 207000 + }, + { + "epoch": 3.2138922081348253, + "grad_norm": 2.4669148921966553, + "learning_rate": 4.6786123310417604e-05, + "loss": 0.9976, + "step": 207100 + }, + { + "epoch": 3.215444063377768, + "grad_norm": 2.2005035877227783, + "learning_rate": 4.678457145517466e-05, + "loss": 1.0517, + "step": 207200 + }, + { + "epoch": 3.216995918620711, + "grad_norm": 1.968554973602295, + "learning_rate": 4.678301959993172e-05, + "loss": 1.0371, + "step": 207300 + }, + { + "epoch": 3.218547773863654, + "grad_norm": 2.0097222328186035, + "learning_rate": 4.678146774468878e-05, + "loss": 1.0256, + "step": 207400 + }, + { + "epoch": 3.220099629106597, + "grad_norm": 2.425882577896118, + "learning_rate": 4.6779915889445835e-05, + "loss": 1.0254, + "step": 207500 + }, + { + "epoch": 3.22165148434954, + "grad_norm": 2.139723539352417, + "learning_rate": 4.677836403420289e-05, + "loss": 1.0321, + "step": 207600 + }, + { + "epoch": 3.223203339592483, + "grad_norm": 2.0133137702941895, + "learning_rate": 4.677681217895995e-05, + "loss": 1.0072, + "step": 207700 + }, + { + "epoch": 3.2247551948354256, + "grad_norm": 2.0798697471618652, + "learning_rate": 4.677526032371701e-05, + "loss": 1.0206, + "step": 207800 + }, + { + "epoch": 3.226307050078369, + "grad_norm": 2.5051324367523193, + "learning_rate": 4.6773708468474066e-05, + "loss": 1.0218, + "step": 207900 + }, + { + "epoch": 3.2278589053213116, + "grad_norm": 2.0431697368621826, + "learning_rate": 4.6772156613231124e-05, + "loss": 1.0069, + "step": 208000 + }, + { + "epoch": 3.229410760564255, + "grad_norm": 2.1214516162872314, + "learning_rate": 4.677060475798818e-05, + "loss": 1.0173, + "step": 208100 + }, + { + "epoch": 3.2309626158071976, + "grad_norm": 2.190187931060791, + "learning_rate": 4.676905290274524e-05, + "loss": 0.9937, + "step": 208200 + }, + { + "epoch": 3.2325144710501403, + "grad_norm": 2.0025250911712646, + "learning_rate": 4.676750104750229e-05, + "loss": 0.9958, + "step": 208300 + }, + { + "epoch": 3.2340663262930835, + "grad_norm": 2.4372048377990723, + "learning_rate": 4.676594919225935e-05, + "loss": 1.0241, + "step": 208400 + }, + { + "epoch": 3.2356181815360263, + "grad_norm": 2.207690954208374, + "learning_rate": 4.6764397337016406e-05, + "loss": 1.002, + "step": 208500 + }, + { + "epoch": 3.237170036778969, + "grad_norm": 2.0978829860687256, + "learning_rate": 4.6762845481773464e-05, + "loss": 1.0188, + "step": 208600 + }, + { + "epoch": 3.2387218920219123, + "grad_norm": 2.116551399230957, + "learning_rate": 4.6761293626530515e-05, + "loss": 1.0255, + "step": 208700 + }, + { + "epoch": 3.240273747264855, + "grad_norm": 2.3361663818359375, + "learning_rate": 4.675974177128757e-05, + "loss": 1.0149, + "step": 208800 + }, + { + "epoch": 3.2418256025077983, + "grad_norm": 2.535315990447998, + "learning_rate": 4.675818991604463e-05, + "loss": 1.0124, + "step": 208900 + }, + { + "epoch": 3.243377457750741, + "grad_norm": 2.7970681190490723, + "learning_rate": 4.675663806080169e-05, + "loss": 1.0383, + "step": 209000 + }, + { + "epoch": 3.244929312993684, + "grad_norm": 2.3555800914764404, + "learning_rate": 4.6755086205558746e-05, + "loss": 1.0425, + "step": 209100 + }, + { + "epoch": 3.246481168236627, + "grad_norm": 2.4953675270080566, + "learning_rate": 4.6753534350315804e-05, + "loss": 1.0364, + "step": 209200 + }, + { + "epoch": 3.24803302347957, + "grad_norm": 2.2091288566589355, + "learning_rate": 4.675198249507286e-05, + "loss": 1.0468, + "step": 209300 + }, + { + "epoch": 3.2495848787225126, + "grad_norm": 2.506892681121826, + "learning_rate": 4.675043063982992e-05, + "loss": 1.0261, + "step": 209400 + }, + { + "epoch": 3.2511367339654558, + "grad_norm": 2.3813135623931885, + "learning_rate": 4.674887878458698e-05, + "loss": 1.0191, + "step": 209500 + }, + { + "epoch": 3.2526885892083985, + "grad_norm": 2.2571282386779785, + "learning_rate": 4.6747326929344035e-05, + "loss": 1.0314, + "step": 209600 + }, + { + "epoch": 3.2542404444513418, + "grad_norm": 2.226078510284424, + "learning_rate": 4.674577507410109e-05, + "loss": 0.9957, + "step": 209700 + }, + { + "epoch": 3.2557922996942845, + "grad_norm": 2.2413437366485596, + "learning_rate": 4.674422321885814e-05, + "loss": 1.0197, + "step": 209800 + }, + { + "epoch": 3.2573441549372273, + "grad_norm": 2.10233211517334, + "learning_rate": 4.67426713636152e-05, + "loss": 1.0135, + "step": 209900 + }, + { + "epoch": 3.2588960101801705, + "grad_norm": 2.1843860149383545, + "learning_rate": 4.674111950837226e-05, + "loss": 1.0263, + "step": 210000 + }, + { + "epoch": 3.2604478654231133, + "grad_norm": 2.4360811710357666, + "learning_rate": 4.6739567653129317e-05, + "loss": 1.0159, + "step": 210100 + }, + { + "epoch": 3.261999720666056, + "grad_norm": 2.196308135986328, + "learning_rate": 4.6738015797886374e-05, + "loss": 1.0226, + "step": 210200 + }, + { + "epoch": 3.2635515759089992, + "grad_norm": 2.0656545162200928, + "learning_rate": 4.673646394264343e-05, + "loss": 0.9997, + "step": 210300 + }, + { + "epoch": 3.265103431151942, + "grad_norm": 2.1971232891082764, + "learning_rate": 4.673491208740049e-05, + "loss": 1.0189, + "step": 210400 + }, + { + "epoch": 3.2666552863948852, + "grad_norm": 2.175978183746338, + "learning_rate": 4.673336023215755e-05, + "loss": 1.0117, + "step": 210500 + }, + { + "epoch": 3.268207141637828, + "grad_norm": 2.458247423171997, + "learning_rate": 4.6731808376914605e-05, + "loss": 1.0195, + "step": 210600 + }, + { + "epoch": 3.269758996880771, + "grad_norm": 2.230919361114502, + "learning_rate": 4.673025652167166e-05, + "loss": 1.0377, + "step": 210700 + }, + { + "epoch": 3.271310852123714, + "grad_norm": 2.2012929916381836, + "learning_rate": 4.672870466642872e-05, + "loss": 1.0167, + "step": 210800 + }, + { + "epoch": 3.2728627073666567, + "grad_norm": 1.9774216413497925, + "learning_rate": 4.672715281118578e-05, + "loss": 1.0122, + "step": 210900 + }, + { + "epoch": 3.2744145626096, + "grad_norm": 2.210353136062622, + "learning_rate": 4.6725600955942836e-05, + "loss": 1.0282, + "step": 211000 + }, + { + "epoch": 3.2759664178525427, + "grad_norm": 2.970851421356201, + "learning_rate": 4.672404910069989e-05, + "loss": 1.0533, + "step": 211100 + }, + { + "epoch": 3.2775182730954855, + "grad_norm": 2.132545232772827, + "learning_rate": 4.6722497245456945e-05, + "loss": 1.0053, + "step": 211200 + }, + { + "epoch": 3.2790701283384287, + "grad_norm": 2.40109920501709, + "learning_rate": 4.6720945390214e-05, + "loss": 1.0051, + "step": 211300 + }, + { + "epoch": 3.2806219835813715, + "grad_norm": 2.6015913486480713, + "learning_rate": 4.671939353497106e-05, + "loss": 1.0452, + "step": 211400 + }, + { + "epoch": 3.2821738388243142, + "grad_norm": 1.8698290586471558, + "learning_rate": 4.671784167972812e-05, + "loss": 1.0321, + "step": 211500 + }, + { + "epoch": 3.2837256940672575, + "grad_norm": 2.216176748275757, + "learning_rate": 4.6716289824485176e-05, + "loss": 1.0381, + "step": 211600 + }, + { + "epoch": 3.2852775493102, + "grad_norm": 2.0425329208374023, + "learning_rate": 4.6714737969242234e-05, + "loss": 1.0353, + "step": 211700 + }, + { + "epoch": 3.2868294045531434, + "grad_norm": 2.206223726272583, + "learning_rate": 4.671318611399929e-05, + "loss": 1.0242, + "step": 211800 + }, + { + "epoch": 3.288381259796086, + "grad_norm": 2.5931057929992676, + "learning_rate": 4.671163425875634e-05, + "loss": 1.0188, + "step": 211900 + }, + { + "epoch": 3.289933115039029, + "grad_norm": 2.1801204681396484, + "learning_rate": 4.67100824035134e-05, + "loss": 1.0166, + "step": 212000 + }, + { + "epoch": 3.291484970281972, + "grad_norm": 2.1543045043945312, + "learning_rate": 4.670853054827046e-05, + "loss": 1.0175, + "step": 212100 + }, + { + "epoch": 3.293036825524915, + "grad_norm": 2.411215305328369, + "learning_rate": 4.6706978693027516e-05, + "loss": 1.0257, + "step": 212200 + }, + { + "epoch": 3.294588680767858, + "grad_norm": 1.8291096687316895, + "learning_rate": 4.6705426837784574e-05, + "loss": 1.0232, + "step": 212300 + }, + { + "epoch": 3.296140536010801, + "grad_norm": 2.4392714500427246, + "learning_rate": 4.670387498254163e-05, + "loss": 1.0129, + "step": 212400 + }, + { + "epoch": 3.2976923912537437, + "grad_norm": 2.2798540592193604, + "learning_rate": 4.670232312729869e-05, + "loss": 1.0058, + "step": 212500 + }, + { + "epoch": 3.299244246496687, + "grad_norm": 2.0743155479431152, + "learning_rate": 4.670077127205574e-05, + "loss": 1.0177, + "step": 212600 + }, + { + "epoch": 3.3007961017396297, + "grad_norm": 2.104858875274658, + "learning_rate": 4.66992194168128e-05, + "loss": 1.0276, + "step": 212700 + }, + { + "epoch": 3.3023479569825724, + "grad_norm": 2.282646656036377, + "learning_rate": 4.6697667561569855e-05, + "loss": 1.0191, + "step": 212800 + }, + { + "epoch": 3.3038998122255157, + "grad_norm": 2.202138662338257, + "learning_rate": 4.669611570632691e-05, + "loss": 1.0253, + "step": 212900 + }, + { + "epoch": 3.3054516674684584, + "grad_norm": 2.2599270343780518, + "learning_rate": 4.669456385108397e-05, + "loss": 1.0213, + "step": 213000 + }, + { + "epoch": 3.3070035227114016, + "grad_norm": 1.8559143543243408, + "learning_rate": 4.669301199584103e-05, + "loss": 1.0341, + "step": 213100 + }, + { + "epoch": 3.3085553779543444, + "grad_norm": 2.231807231903076, + "learning_rate": 4.6691460140598087e-05, + "loss": 1.0194, + "step": 213200 + }, + { + "epoch": 3.310107233197287, + "grad_norm": 2.157139301300049, + "learning_rate": 4.6689908285355144e-05, + "loss": 1.0181, + "step": 213300 + }, + { + "epoch": 3.3116590884402304, + "grad_norm": 2.08923077583313, + "learning_rate": 4.66883564301122e-05, + "loss": 1.0445, + "step": 213400 + }, + { + "epoch": 3.313210943683173, + "grad_norm": 2.389971971511841, + "learning_rate": 4.668680457486926e-05, + "loss": 1.021, + "step": 213500 + }, + { + "epoch": 3.3147627989261164, + "grad_norm": 2.368169069290161, + "learning_rate": 4.668525271962632e-05, + "loss": 1.0075, + "step": 213600 + }, + { + "epoch": 3.316314654169059, + "grad_norm": 2.2019619941711426, + "learning_rate": 4.6683700864383375e-05, + "loss": 1.009, + "step": 213700 + }, + { + "epoch": 3.317866509412002, + "grad_norm": 1.9306806325912476, + "learning_rate": 4.668214900914043e-05, + "loss": 1.0335, + "step": 213800 + }, + { + "epoch": 3.319418364654945, + "grad_norm": 2.281291961669922, + "learning_rate": 4.6680597153897484e-05, + "loss": 1.0319, + "step": 213900 + }, + { + "epoch": 3.320970219897888, + "grad_norm": 2.270554542541504, + "learning_rate": 4.667904529865454e-05, + "loss": 1.0277, + "step": 214000 + }, + { + "epoch": 3.3225220751408306, + "grad_norm": 1.9186290502548218, + "learning_rate": 4.66774934434116e-05, + "loss": 1.0241, + "step": 214100 + }, + { + "epoch": 3.324073930383774, + "grad_norm": 2.196626901626587, + "learning_rate": 4.667594158816866e-05, + "loss": 1.0353, + "step": 214200 + }, + { + "epoch": 3.3256257856267166, + "grad_norm": 2.2863614559173584, + "learning_rate": 4.6674389732925715e-05, + "loss": 1.0148, + "step": 214300 + }, + { + "epoch": 3.32717764086966, + "grad_norm": 2.506890296936035, + "learning_rate": 4.667283787768277e-05, + "loss": 1.0237, + "step": 214400 + }, + { + "epoch": 3.3287294961126026, + "grad_norm": 2.128335475921631, + "learning_rate": 4.667128602243983e-05, + "loss": 1.0238, + "step": 214500 + }, + { + "epoch": 3.3302813513555454, + "grad_norm": 2.2486555576324463, + "learning_rate": 4.666973416719689e-05, + "loss": 1.0266, + "step": 214600 + }, + { + "epoch": 3.3318332065984886, + "grad_norm": 2.162100076675415, + "learning_rate": 4.6668182311953946e-05, + "loss": 1.0295, + "step": 214700 + }, + { + "epoch": 3.3333850618414314, + "grad_norm": 2.2312748432159424, + "learning_rate": 4.6666630456711004e-05, + "loss": 1.0357, + "step": 214800 + }, + { + "epoch": 3.3349369170843746, + "grad_norm": 2.4596164226531982, + "learning_rate": 4.666507860146806e-05, + "loss": 1.023, + "step": 214900 + }, + { + "epoch": 3.3364887723273173, + "grad_norm": 2.076465606689453, + "learning_rate": 4.666352674622512e-05, + "loss": 1.0182, + "step": 215000 + }, + { + "epoch": 3.33804062757026, + "grad_norm": 2.1031620502471924, + "learning_rate": 4.666197489098218e-05, + "loss": 1.0178, + "step": 215100 + }, + { + "epoch": 3.3395924828132033, + "grad_norm": 2.3582725524902344, + "learning_rate": 4.666042303573923e-05, + "loss": 1.029, + "step": 215200 + }, + { + "epoch": 3.341144338056146, + "grad_norm": 1.918215274810791, + "learning_rate": 4.6658871180496286e-05, + "loss": 1.0107, + "step": 215300 + }, + { + "epoch": 3.342696193299089, + "grad_norm": 2.1029980182647705, + "learning_rate": 4.6657319325253344e-05, + "loss": 1.0183, + "step": 215400 + }, + { + "epoch": 3.344248048542032, + "grad_norm": 2.1243014335632324, + "learning_rate": 4.6655767470010394e-05, + "loss": 1.0259, + "step": 215500 + }, + { + "epoch": 3.345799903784975, + "grad_norm": 2.0930585861206055, + "learning_rate": 4.665421561476745e-05, + "loss": 1.0275, + "step": 215600 + }, + { + "epoch": 3.347351759027918, + "grad_norm": 1.947841763496399, + "learning_rate": 4.665266375952451e-05, + "loss": 1.0033, + "step": 215700 + }, + { + "epoch": 3.348903614270861, + "grad_norm": 2.280519723892212, + "learning_rate": 4.665111190428157e-05, + "loss": 1.024, + "step": 215800 + }, + { + "epoch": 3.3504554695138036, + "grad_norm": 2.247267007827759, + "learning_rate": 4.6649560049038625e-05, + "loss": 1.0385, + "step": 215900 + }, + { + "epoch": 3.352007324756747, + "grad_norm": 1.7428474426269531, + "learning_rate": 4.664800819379568e-05, + "loss": 1.0163, + "step": 216000 + }, + { + "epoch": 3.3535591799996896, + "grad_norm": 2.2367842197418213, + "learning_rate": 4.664645633855274e-05, + "loss": 1.0246, + "step": 216100 + }, + { + "epoch": 3.3551110352426328, + "grad_norm": 2.297909736633301, + "learning_rate": 4.66449044833098e-05, + "loss": 1.0191, + "step": 216200 + }, + { + "epoch": 3.3566628904855755, + "grad_norm": 2.3930795192718506, + "learning_rate": 4.6643352628066857e-05, + "loss": 1.0169, + "step": 216300 + }, + { + "epoch": 3.3582147457285183, + "grad_norm": 2.3360438346862793, + "learning_rate": 4.6641800772823914e-05, + "loss": 1.0062, + "step": 216400 + }, + { + "epoch": 3.3597666009714615, + "grad_norm": 2.2535812854766846, + "learning_rate": 4.664024891758097e-05, + "loss": 1.0024, + "step": 216500 + }, + { + "epoch": 3.3613184562144043, + "grad_norm": 2.942143678665161, + "learning_rate": 4.663869706233803e-05, + "loss": 1.0224, + "step": 216600 + }, + { + "epoch": 3.362870311457347, + "grad_norm": 2.323899269104004, + "learning_rate": 4.663714520709509e-05, + "loss": 1.0049, + "step": 216700 + }, + { + "epoch": 3.3644221667002903, + "grad_norm": 2.870634078979492, + "learning_rate": 4.663559335185214e-05, + "loss": 1.0287, + "step": 216800 + }, + { + "epoch": 3.365974021943233, + "grad_norm": 2.5708534717559814, + "learning_rate": 4.6634041496609196e-05, + "loss": 1.0235, + "step": 216900 + }, + { + "epoch": 3.3675258771861762, + "grad_norm": 2.0997581481933594, + "learning_rate": 4.6632489641366254e-05, + "loss": 1.0314, + "step": 217000 + }, + { + "epoch": 3.369077732429119, + "grad_norm": 2.034555435180664, + "learning_rate": 4.663093778612331e-05, + "loss": 1.0264, + "step": 217100 + }, + { + "epoch": 3.3706295876720618, + "grad_norm": 1.9683167934417725, + "learning_rate": 4.662938593088037e-05, + "loss": 1.0338, + "step": 217200 + }, + { + "epoch": 3.372181442915005, + "grad_norm": 2.3981857299804688, + "learning_rate": 4.662783407563743e-05, + "loss": 1.0279, + "step": 217300 + }, + { + "epoch": 3.3737332981579478, + "grad_norm": 2.1136646270751953, + "learning_rate": 4.6626282220394485e-05, + "loss": 1.0274, + "step": 217400 + }, + { + "epoch": 3.375285153400891, + "grad_norm": 2.747699737548828, + "learning_rate": 4.662473036515154e-05, + "loss": 1.0276, + "step": 217500 + }, + { + "epoch": 3.3768370086438337, + "grad_norm": 2.3531923294067383, + "learning_rate": 4.66231785099086e-05, + "loss": 1.0039, + "step": 217600 + }, + { + "epoch": 3.3783888638867765, + "grad_norm": 2.1912336349487305, + "learning_rate": 4.662162665466566e-05, + "loss": 1.0195, + "step": 217700 + }, + { + "epoch": 3.3799407191297197, + "grad_norm": 2.2018327713012695, + "learning_rate": 4.6620074799422716e-05, + "loss": 1.0117, + "step": 217800 + }, + { + "epoch": 3.3814925743726625, + "grad_norm": 2.1897764205932617, + "learning_rate": 4.6618522944179774e-05, + "loss": 1.0378, + "step": 217900 + }, + { + "epoch": 3.3830444296156053, + "grad_norm": 2.4949533939361572, + "learning_rate": 4.661697108893683e-05, + "loss": 1.0223, + "step": 218000 + }, + { + "epoch": 3.3845962848585485, + "grad_norm": 2.1576850414276123, + "learning_rate": 4.661541923369388e-05, + "loss": 1.0393, + "step": 218100 + }, + { + "epoch": 3.3861481401014912, + "grad_norm": 2.3476812839508057, + "learning_rate": 4.661386737845094e-05, + "loss": 1.005, + "step": 218200 + }, + { + "epoch": 3.3876999953444344, + "grad_norm": 2.248135805130005, + "learning_rate": 4.6612315523208e-05, + "loss": 1.0058, + "step": 218300 + }, + { + "epoch": 3.389251850587377, + "grad_norm": 2.3704135417938232, + "learning_rate": 4.6610763667965056e-05, + "loss": 1.0044, + "step": 218400 + }, + { + "epoch": 3.39080370583032, + "grad_norm": 2.0789783000946045, + "learning_rate": 4.660921181272211e-05, + "loss": 1.021, + "step": 218500 + }, + { + "epoch": 3.392355561073263, + "grad_norm": 2.1760096549987793, + "learning_rate": 4.6607659957479164e-05, + "loss": 1.0199, + "step": 218600 + }, + { + "epoch": 3.393907416316206, + "grad_norm": 2.3323814868927, + "learning_rate": 4.660610810223622e-05, + "loss": 1.0276, + "step": 218700 + }, + { + "epoch": 3.395459271559149, + "grad_norm": 2.5076801776885986, + "learning_rate": 4.660455624699328e-05, + "loss": 1.0278, + "step": 218800 + }, + { + "epoch": 3.397011126802092, + "grad_norm": 2.9879953861236572, + "learning_rate": 4.660300439175034e-05, + "loss": 1.0225, + "step": 218900 + }, + { + "epoch": 3.3985629820450347, + "grad_norm": 1.9574092626571655, + "learning_rate": 4.6601452536507395e-05, + "loss": 1.0232, + "step": 219000 + }, + { + "epoch": 3.400114837287978, + "grad_norm": 2.3615403175354004, + "learning_rate": 4.659990068126445e-05, + "loss": 1.0165, + "step": 219100 + }, + { + "epoch": 3.4016666925309207, + "grad_norm": 2.209057092666626, + "learning_rate": 4.659834882602151e-05, + "loss": 1.0245, + "step": 219200 + }, + { + "epoch": 3.4032185477738635, + "grad_norm": 1.8723456859588623, + "learning_rate": 4.659679697077857e-05, + "loss": 1.0185, + "step": 219300 + }, + { + "epoch": 3.4047704030168067, + "grad_norm": 2.1733956336975098, + "learning_rate": 4.6595245115535627e-05, + "loss": 1.0173, + "step": 219400 + }, + { + "epoch": 3.4063222582597494, + "grad_norm": 2.328073024749756, + "learning_rate": 4.6593693260292684e-05, + "loss": 1.0395, + "step": 219500 + }, + { + "epoch": 3.4078741135026926, + "grad_norm": 2.7424774169921875, + "learning_rate": 4.6592141405049735e-05, + "loss": 1.0367, + "step": 219600 + }, + { + "epoch": 3.4094259687456354, + "grad_norm": 2.1806962490081787, + "learning_rate": 4.659058954980679e-05, + "loss": 1.0174, + "step": 219700 + }, + { + "epoch": 3.410977823988578, + "grad_norm": 2.260183811187744, + "learning_rate": 4.658903769456385e-05, + "loss": 1.0326, + "step": 219800 + }, + { + "epoch": 3.4125296792315214, + "grad_norm": 2.6208088397979736, + "learning_rate": 4.658748583932091e-05, + "loss": 1.0161, + "step": 219900 + }, + { + "epoch": 3.414081534474464, + "grad_norm": 2.407381772994995, + "learning_rate": 4.6585933984077966e-05, + "loss": 1.0169, + "step": 220000 + }, + { + "epoch": 3.4156333897174074, + "grad_norm": 1.9718668460845947, + "learning_rate": 4.6584382128835024e-05, + "loss": 1.0085, + "step": 220100 + }, + { + "epoch": 3.41718524496035, + "grad_norm": 1.956365942955017, + "learning_rate": 4.658283027359208e-05, + "loss": 1.0055, + "step": 220200 + }, + { + "epoch": 3.418737100203293, + "grad_norm": 2.448484182357788, + "learning_rate": 4.658127841834914e-05, + "loss": 1.0261, + "step": 220300 + }, + { + "epoch": 3.420288955446236, + "grad_norm": 2.324747085571289, + "learning_rate": 4.65797265631062e-05, + "loss": 1.0318, + "step": 220400 + }, + { + "epoch": 3.421840810689179, + "grad_norm": 2.905900239944458, + "learning_rate": 4.6578174707863255e-05, + "loss": 1.01, + "step": 220500 + }, + { + "epoch": 3.4233926659321217, + "grad_norm": 2.10194993019104, + "learning_rate": 4.657662285262031e-05, + "loss": 1.0312, + "step": 220600 + }, + { + "epoch": 3.424944521175065, + "grad_norm": 2.0558583736419678, + "learning_rate": 4.657507099737737e-05, + "loss": 1.0044, + "step": 220700 + }, + { + "epoch": 3.4264963764180076, + "grad_norm": 2.0191500186920166, + "learning_rate": 4.657351914213443e-05, + "loss": 1.0145, + "step": 220800 + }, + { + "epoch": 3.428048231660951, + "grad_norm": 2.301386594772339, + "learning_rate": 4.657196728689148e-05, + "loss": 1.0343, + "step": 220900 + }, + { + "epoch": 3.4296000869038936, + "grad_norm": 2.0982818603515625, + "learning_rate": 4.657041543164854e-05, + "loss": 1.0261, + "step": 221000 + }, + { + "epoch": 3.4311519421468364, + "grad_norm": 2.562927007675171, + "learning_rate": 4.6568863576405595e-05, + "loss": 1.018, + "step": 221100 + }, + { + "epoch": 3.4327037973897796, + "grad_norm": 2.371718406677246, + "learning_rate": 4.656731172116265e-05, + "loss": 1.012, + "step": 221200 + }, + { + "epoch": 3.4342556526327224, + "grad_norm": 2.111654043197632, + "learning_rate": 4.656575986591971e-05, + "loss": 1.0206, + "step": 221300 + }, + { + "epoch": 3.4358075078756656, + "grad_norm": 2.294008731842041, + "learning_rate": 4.656420801067677e-05, + "loss": 1.0145, + "step": 221400 + }, + { + "epoch": 3.4373593631186083, + "grad_norm": 2.5183184146881104, + "learning_rate": 4.6562656155433826e-05, + "loss": 1.0184, + "step": 221500 + }, + { + "epoch": 3.438911218361551, + "grad_norm": 2.5621912479400635, + "learning_rate": 4.6561104300190884e-05, + "loss": 1.0196, + "step": 221600 + }, + { + "epoch": 3.4404630736044943, + "grad_norm": 2.4350814819335938, + "learning_rate": 4.6559552444947934e-05, + "loss": 1.0086, + "step": 221700 + }, + { + "epoch": 3.442014928847437, + "grad_norm": 2.164003610610962, + "learning_rate": 4.655800058970499e-05, + "loss": 1.0002, + "step": 221800 + }, + { + "epoch": 3.44356678409038, + "grad_norm": 2.2738611698150635, + "learning_rate": 4.655644873446205e-05, + "loss": 1.0305, + "step": 221900 + }, + { + "epoch": 3.445118639333323, + "grad_norm": 1.9323899745941162, + "learning_rate": 4.655489687921911e-05, + "loss": 0.9929, + "step": 222000 + }, + { + "epoch": 3.446670494576266, + "grad_norm": 2.3244051933288574, + "learning_rate": 4.6553345023976165e-05, + "loss": 1.0242, + "step": 222100 + }, + { + "epoch": 3.448222349819209, + "grad_norm": 2.2730369567871094, + "learning_rate": 4.655179316873322e-05, + "loss": 1.0208, + "step": 222200 + }, + { + "epoch": 3.449774205062152, + "grad_norm": 2.7483091354370117, + "learning_rate": 4.655024131349028e-05, + "loss": 1.0215, + "step": 222300 + }, + { + "epoch": 3.4513260603050946, + "grad_norm": 2.1802148818969727, + "learning_rate": 4.654868945824734e-05, + "loss": 1.0119, + "step": 222400 + }, + { + "epoch": 3.452877915548038, + "grad_norm": 2.256516218185425, + "learning_rate": 4.654713760300439e-05, + "loss": 1.0251, + "step": 222500 + }, + { + "epoch": 3.4544297707909806, + "grad_norm": 2.4180619716644287, + "learning_rate": 4.654558574776145e-05, + "loss": 1.012, + "step": 222600 + }, + { + "epoch": 3.455981626033924, + "grad_norm": 2.606107711791992, + "learning_rate": 4.6544033892518505e-05, + "loss": 1.0223, + "step": 222700 + }, + { + "epoch": 3.4575334812768665, + "grad_norm": 2.804283857345581, + "learning_rate": 4.654248203727556e-05, + "loss": 1.0081, + "step": 222800 + }, + { + "epoch": 3.4590853365198093, + "grad_norm": 2.062540292739868, + "learning_rate": 4.654093018203262e-05, + "loss": 1.0194, + "step": 222900 + }, + { + "epoch": 3.4606371917627525, + "grad_norm": 2.3496198654174805, + "learning_rate": 4.653937832678968e-05, + "loss": 1.0011, + "step": 223000 + }, + { + "epoch": 3.4621890470056953, + "grad_norm": 2.4609780311584473, + "learning_rate": 4.6537826471546736e-05, + "loss": 1.0262, + "step": 223100 + }, + { + "epoch": 3.463740902248638, + "grad_norm": 2.257575035095215, + "learning_rate": 4.6536274616303794e-05, + "loss": 1.014, + "step": 223200 + }, + { + "epoch": 3.4652927574915813, + "grad_norm": 2.069559097290039, + "learning_rate": 4.653472276106085e-05, + "loss": 1.0254, + "step": 223300 + }, + { + "epoch": 3.466844612734524, + "grad_norm": 2.2982141971588135, + "learning_rate": 4.653317090581791e-05, + "loss": 1.0238, + "step": 223400 + }, + { + "epoch": 3.468396467977467, + "grad_norm": 2.3976995944976807, + "learning_rate": 4.653161905057497e-05, + "loss": 1.0073, + "step": 223500 + }, + { + "epoch": 3.46994832322041, + "grad_norm": 2.147540330886841, + "learning_rate": 4.6530067195332025e-05, + "loss": 0.9989, + "step": 223600 + }, + { + "epoch": 3.471500178463353, + "grad_norm": 2.3275585174560547, + "learning_rate": 4.652851534008908e-05, + "loss": 1.0209, + "step": 223700 + }, + { + "epoch": 3.473052033706296, + "grad_norm": 2.6353914737701416, + "learning_rate": 4.6526963484846134e-05, + "loss": 1.0261, + "step": 223800 + }, + { + "epoch": 3.4746038889492388, + "grad_norm": 1.9287621974945068, + "learning_rate": 4.652541162960319e-05, + "loss": 1.0214, + "step": 223900 + }, + { + "epoch": 3.476155744192182, + "grad_norm": 2.2961585521698, + "learning_rate": 4.652385977436025e-05, + "loss": 0.9991, + "step": 224000 + }, + { + "epoch": 3.4777075994351248, + "grad_norm": 2.4120821952819824, + "learning_rate": 4.652230791911731e-05, + "loss": 1.0176, + "step": 224100 + }, + { + "epoch": 3.4792594546780675, + "grad_norm": 2.458669424057007, + "learning_rate": 4.6520756063874365e-05, + "loss": 1.0125, + "step": 224200 + }, + { + "epoch": 3.4808113099210107, + "grad_norm": 1.9512863159179688, + "learning_rate": 4.651920420863142e-05, + "loss": 1.0156, + "step": 224300 + }, + { + "epoch": 3.4823631651639535, + "grad_norm": 2.217475414276123, + "learning_rate": 4.651765235338848e-05, + "loss": 1.0182, + "step": 224400 + }, + { + "epoch": 3.4839150204068963, + "grad_norm": 2.7593281269073486, + "learning_rate": 4.651610049814554e-05, + "loss": 1.0153, + "step": 224500 + }, + { + "epoch": 3.4854668756498395, + "grad_norm": 2.3587729930877686, + "learning_rate": 4.6514548642902596e-05, + "loss": 1.027, + "step": 224600 + }, + { + "epoch": 3.4870187308927822, + "grad_norm": 2.3511438369750977, + "learning_rate": 4.6512996787659654e-05, + "loss": 1.0253, + "step": 224700 + }, + { + "epoch": 3.488570586135725, + "grad_norm": 2.1176085472106934, + "learning_rate": 4.651144493241671e-05, + "loss": 1.0204, + "step": 224800 + }, + { + "epoch": 3.4901224413786682, + "grad_norm": 2.0496795177459717, + "learning_rate": 4.650989307717377e-05, + "loss": 1.0193, + "step": 224900 + }, + { + "epoch": 3.491674296621611, + "grad_norm": 2.0768377780914307, + "learning_rate": 4.650834122193082e-05, + "loss": 1.0156, + "step": 225000 + }, + { + "epoch": 3.493226151864554, + "grad_norm": 2.1627848148345947, + "learning_rate": 4.650678936668788e-05, + "loss": 1.0214, + "step": 225100 + }, + { + "epoch": 3.494778007107497, + "grad_norm": 2.2670319080352783, + "learning_rate": 4.6505237511444935e-05, + "loss": 1.0252, + "step": 225200 + }, + { + "epoch": 3.49632986235044, + "grad_norm": 2.3662946224212646, + "learning_rate": 4.6503685656201986e-05, + "loss": 1.0254, + "step": 225300 + }, + { + "epoch": 3.497881717593383, + "grad_norm": 2.097546100616455, + "learning_rate": 4.6502133800959044e-05, + "loss": 1.0247, + "step": 225400 + }, + { + "epoch": 3.4994335728363257, + "grad_norm": 2.059945821762085, + "learning_rate": 4.65005819457161e-05, + "loss": 1.0339, + "step": 225500 + }, + { + "epoch": 3.500985428079269, + "grad_norm": 1.7830817699432373, + "learning_rate": 4.649903009047316e-05, + "loss": 1.0145, + "step": 225600 + }, + { + "epoch": 3.5025372833222117, + "grad_norm": 2.035282611846924, + "learning_rate": 4.649747823523022e-05, + "loss": 0.9986, + "step": 225700 + }, + { + "epoch": 3.5040891385651545, + "grad_norm": 2.0063650608062744, + "learning_rate": 4.6495926379987275e-05, + "loss": 1.0023, + "step": 225800 + }, + { + "epoch": 3.5056409938080977, + "grad_norm": 2.157651901245117, + "learning_rate": 4.649437452474433e-05, + "loss": 1.0343, + "step": 225900 + }, + { + "epoch": 3.5071928490510405, + "grad_norm": 2.2376537322998047, + "learning_rate": 4.649282266950139e-05, + "loss": 1.0417, + "step": 226000 + }, + { + "epoch": 3.508744704293983, + "grad_norm": 2.8245861530303955, + "learning_rate": 4.649127081425845e-05, + "loss": 1.007, + "step": 226100 + }, + { + "epoch": 3.5102965595369264, + "grad_norm": 2.3284294605255127, + "learning_rate": 4.6489718959015506e-05, + "loss": 1.0312, + "step": 226200 + }, + { + "epoch": 3.511848414779869, + "grad_norm": 2.2699787616729736, + "learning_rate": 4.6488167103772564e-05, + "loss": 1.0055, + "step": 226300 + }, + { + "epoch": 3.5134002700228124, + "grad_norm": 2.213945150375366, + "learning_rate": 4.648661524852962e-05, + "loss": 1.0086, + "step": 226400 + }, + { + "epoch": 3.514952125265755, + "grad_norm": 1.8312371969223022, + "learning_rate": 4.648506339328668e-05, + "loss": 1.0208, + "step": 226500 + }, + { + "epoch": 3.5165039805086984, + "grad_norm": 2.0561037063598633, + "learning_rate": 4.648351153804373e-05, + "loss": 1.0229, + "step": 226600 + }, + { + "epoch": 3.518055835751641, + "grad_norm": 2.158634662628174, + "learning_rate": 4.648195968280079e-05, + "loss": 1.025, + "step": 226700 + }, + { + "epoch": 3.519607690994584, + "grad_norm": 2.0920844078063965, + "learning_rate": 4.6480407827557846e-05, + "loss": 1.0109, + "step": 226800 + }, + { + "epoch": 3.521159546237527, + "grad_norm": 2.116969585418701, + "learning_rate": 4.6478855972314904e-05, + "loss": 1.0109, + "step": 226900 + }, + { + "epoch": 3.52271140148047, + "grad_norm": 2.1268246173858643, + "learning_rate": 4.647730411707196e-05, + "loss": 1.0194, + "step": 227000 + }, + { + "epoch": 3.5242632567234127, + "grad_norm": 2.363027811050415, + "learning_rate": 4.647575226182902e-05, + "loss": 1.0197, + "step": 227100 + }, + { + "epoch": 3.525815111966356, + "grad_norm": 2.2287521362304688, + "learning_rate": 4.647420040658608e-05, + "loss": 1.0287, + "step": 227200 + }, + { + "epoch": 3.5273669672092987, + "grad_norm": 2.2710227966308594, + "learning_rate": 4.6472648551343135e-05, + "loss": 1.0459, + "step": 227300 + }, + { + "epoch": 3.5289188224522414, + "grad_norm": 2.393834352493286, + "learning_rate": 4.647109669610019e-05, + "loss": 1.0146, + "step": 227400 + }, + { + "epoch": 3.5304706776951846, + "grad_norm": 2.3930397033691406, + "learning_rate": 4.646954484085725e-05, + "loss": 1.0115, + "step": 227500 + }, + { + "epoch": 3.5320225329381274, + "grad_norm": 2.3626060485839844, + "learning_rate": 4.646799298561431e-05, + "loss": 1.0267, + "step": 227600 + }, + { + "epoch": 3.5335743881810706, + "grad_norm": 1.9492485523223877, + "learning_rate": 4.6466441130371366e-05, + "loss": 1.0082, + "step": 227700 + }, + { + "epoch": 3.5351262434240134, + "grad_norm": 2.1411900520324707, + "learning_rate": 4.6464889275128424e-05, + "loss": 1.0173, + "step": 227800 + }, + { + "epoch": 3.5366780986669566, + "grad_norm": 2.3113627433776855, + "learning_rate": 4.6463337419885474e-05, + "loss": 1.0376, + "step": 227900 + }, + { + "epoch": 3.5382299539098994, + "grad_norm": 2.434312105178833, + "learning_rate": 4.646178556464253e-05, + "loss": 1.0231, + "step": 228000 + }, + { + "epoch": 3.539781809152842, + "grad_norm": 2.245375871658325, + "learning_rate": 4.646023370939959e-05, + "loss": 1.0328, + "step": 228100 + }, + { + "epoch": 3.5413336643957853, + "grad_norm": 2.077577829360962, + "learning_rate": 4.645868185415664e-05, + "loss": 1.0284, + "step": 228200 + }, + { + "epoch": 3.542885519638728, + "grad_norm": 2.1776371002197266, + "learning_rate": 4.64571299989137e-05, + "loss": 1.0213, + "step": 228300 + }, + { + "epoch": 3.544437374881671, + "grad_norm": 2.1184732913970947, + "learning_rate": 4.6455578143670756e-05, + "loss": 1.0225, + "step": 228400 + }, + { + "epoch": 3.545989230124614, + "grad_norm": 2.2448039054870605, + "learning_rate": 4.6454026288427814e-05, + "loss": 1.0352, + "step": 228500 + }, + { + "epoch": 3.547541085367557, + "grad_norm": 2.0606822967529297, + "learning_rate": 4.645247443318487e-05, + "loss": 1.0368, + "step": 228600 + }, + { + "epoch": 3.5490929406104996, + "grad_norm": 2.523189067840576, + "learning_rate": 4.645092257794193e-05, + "loss": 1.0158, + "step": 228700 + }, + { + "epoch": 3.550644795853443, + "grad_norm": 2.3495540618896484, + "learning_rate": 4.644937072269899e-05, + "loss": 1.0067, + "step": 228800 + }, + { + "epoch": 3.5521966510963856, + "grad_norm": 2.143821954727173, + "learning_rate": 4.6447818867456045e-05, + "loss": 1.0398, + "step": 228900 + }, + { + "epoch": 3.553748506339329, + "grad_norm": 2.1730520725250244, + "learning_rate": 4.64462670122131e-05, + "loss": 1.0001, + "step": 229000 + }, + { + "epoch": 3.5553003615822716, + "grad_norm": 2.2635035514831543, + "learning_rate": 4.644471515697016e-05, + "loss": 1.041, + "step": 229100 + }, + { + "epoch": 3.556852216825215, + "grad_norm": 2.173370838165283, + "learning_rate": 4.644316330172722e-05, + "loss": 1.0091, + "step": 229200 + }, + { + "epoch": 3.5584040720681576, + "grad_norm": 2.0543220043182373, + "learning_rate": 4.6441611446484276e-05, + "loss": 1.0162, + "step": 229300 + }, + { + "epoch": 3.5599559273111003, + "grad_norm": 2.084263563156128, + "learning_rate": 4.644005959124133e-05, + "loss": 1.0374, + "step": 229400 + }, + { + "epoch": 3.5615077825540435, + "grad_norm": 2.394080877304077, + "learning_rate": 4.6438507735998385e-05, + "loss": 1.006, + "step": 229500 + }, + { + "epoch": 3.5630596377969863, + "grad_norm": 2.039065361022949, + "learning_rate": 4.643695588075544e-05, + "loss": 1.0138, + "step": 229600 + }, + { + "epoch": 3.564611493039929, + "grad_norm": 1.911515712738037, + "learning_rate": 4.64354040255125e-05, + "loss": 1.0535, + "step": 229700 + }, + { + "epoch": 3.5661633482828723, + "grad_norm": 2.2423110008239746, + "learning_rate": 4.643385217026956e-05, + "loss": 1.0588, + "step": 229800 + }, + { + "epoch": 3.567715203525815, + "grad_norm": 1.9198731184005737, + "learning_rate": 4.6432300315026616e-05, + "loss": 1.0115, + "step": 229900 + }, + { + "epoch": 3.569267058768758, + "grad_norm": 2.2048838138580322, + "learning_rate": 4.6430748459783674e-05, + "loss": 1.0021, + "step": 230000 + }, + { + "epoch": 3.570818914011701, + "grad_norm": 2.0658535957336426, + "learning_rate": 4.642919660454073e-05, + "loss": 1.0241, + "step": 230100 + }, + { + "epoch": 3.572370769254644, + "grad_norm": 2.2744975090026855, + "learning_rate": 4.642764474929779e-05, + "loss": 0.9956, + "step": 230200 + }, + { + "epoch": 3.573922624497587, + "grad_norm": 2.0728373527526855, + "learning_rate": 4.642609289405485e-05, + "loss": 0.9953, + "step": 230300 + }, + { + "epoch": 3.57547447974053, + "grad_norm": 2.233289957046509, + "learning_rate": 4.6424541038811905e-05, + "loss": 1.0301, + "step": 230400 + }, + { + "epoch": 3.577026334983473, + "grad_norm": 1.983513593673706, + "learning_rate": 4.642298918356896e-05, + "loss": 1.0154, + "step": 230500 + }, + { + "epoch": 3.5785781902264158, + "grad_norm": 2.523077964782715, + "learning_rate": 4.642143732832602e-05, + "loss": 1.0509, + "step": 230600 + }, + { + "epoch": 3.5801300454693585, + "grad_norm": 2.6744301319122314, + "learning_rate": 4.641988547308307e-05, + "loss": 1.0164, + "step": 230700 + }, + { + "epoch": 3.5816819007123017, + "grad_norm": 2.7268240451812744, + "learning_rate": 4.641833361784013e-05, + "loss": 1.0061, + "step": 230800 + }, + { + "epoch": 3.5832337559552445, + "grad_norm": 1.968333125114441, + "learning_rate": 4.641678176259719e-05, + "loss": 1.0106, + "step": 230900 + }, + { + "epoch": 3.5847856111981873, + "grad_norm": 2.2497141361236572, + "learning_rate": 4.6415229907354244e-05, + "loss": 1.0256, + "step": 231000 + }, + { + "epoch": 3.5863374664411305, + "grad_norm": 1.9975284337997437, + "learning_rate": 4.64136780521113e-05, + "loss": 1.0294, + "step": 231100 + }, + { + "epoch": 3.5878893216840733, + "grad_norm": 2.34073543548584, + "learning_rate": 4.641212619686836e-05, + "loss": 1.0312, + "step": 231200 + }, + { + "epoch": 3.589441176927016, + "grad_norm": 2.346445083618164, + "learning_rate": 4.641057434162542e-05, + "loss": 1.0243, + "step": 231300 + }, + { + "epoch": 3.5909930321699592, + "grad_norm": 2.0903470516204834, + "learning_rate": 4.6409022486382475e-05, + "loss": 1.0323, + "step": 231400 + }, + { + "epoch": 3.592544887412902, + "grad_norm": 2.795607328414917, + "learning_rate": 4.6407470631139526e-05, + "loss": 1.02, + "step": 231500 + }, + { + "epoch": 3.5940967426558452, + "grad_norm": 1.859204888343811, + "learning_rate": 4.6405918775896584e-05, + "loss": 1.0239, + "step": 231600 + }, + { + "epoch": 3.595648597898788, + "grad_norm": 2.015591621398926, + "learning_rate": 4.640436692065364e-05, + "loss": 1.0566, + "step": 231700 + }, + { + "epoch": 3.597200453141731, + "grad_norm": 2.3207833766937256, + "learning_rate": 4.64028150654107e-05, + "loss": 1.0178, + "step": 231800 + }, + { + "epoch": 3.598752308384674, + "grad_norm": 2.4403696060180664, + "learning_rate": 4.640126321016776e-05, + "loss": 1.0204, + "step": 231900 + }, + { + "epoch": 3.6003041636276167, + "grad_norm": 2.2235662937164307, + "learning_rate": 4.6399711354924815e-05, + "loss": 1.0201, + "step": 232000 + }, + { + "epoch": 3.60185601887056, + "grad_norm": 2.2444968223571777, + "learning_rate": 4.639815949968187e-05, + "loss": 1.0405, + "step": 232100 + }, + { + "epoch": 3.6034078741135027, + "grad_norm": 2.5562655925750732, + "learning_rate": 4.639660764443893e-05, + "loss": 1.0084, + "step": 232200 + }, + { + "epoch": 3.6049597293564455, + "grad_norm": 2.244760036468506, + "learning_rate": 4.639505578919598e-05, + "loss": 1.0146, + "step": 232300 + }, + { + "epoch": 3.6065115845993887, + "grad_norm": 2.4502768516540527, + "learning_rate": 4.639350393395304e-05, + "loss": 1.0334, + "step": 232400 + }, + { + "epoch": 3.6080634398423315, + "grad_norm": 1.9223589897155762, + "learning_rate": 4.63919520787101e-05, + "loss": 1.0364, + "step": 232500 + }, + { + "epoch": 3.6096152950852742, + "grad_norm": 2.419853448867798, + "learning_rate": 4.6390400223467155e-05, + "loss": 1.0369, + "step": 232600 + }, + { + "epoch": 3.6111671503282174, + "grad_norm": 1.8432646989822388, + "learning_rate": 4.638884836822421e-05, + "loss": 1.0242, + "step": 232700 + }, + { + "epoch": 3.61271900557116, + "grad_norm": 2.2077646255493164, + "learning_rate": 4.638729651298127e-05, + "loss": 0.9861, + "step": 232800 + }, + { + "epoch": 3.6142708608141034, + "grad_norm": 2.41416597366333, + "learning_rate": 4.638574465773833e-05, + "loss": 1.0404, + "step": 232900 + }, + { + "epoch": 3.615822716057046, + "grad_norm": 2.2876474857330322, + "learning_rate": 4.6384192802495386e-05, + "loss": 1.0186, + "step": 233000 + }, + { + "epoch": 3.6173745712999894, + "grad_norm": 2.0390963554382324, + "learning_rate": 4.6382640947252444e-05, + "loss": 1.0331, + "step": 233100 + }, + { + "epoch": 3.618926426542932, + "grad_norm": 2.240492105484009, + "learning_rate": 4.63810890920095e-05, + "loss": 1.0172, + "step": 233200 + }, + { + "epoch": 3.620478281785875, + "grad_norm": 2.3397746086120605, + "learning_rate": 4.637953723676656e-05, + "loss": 0.999, + "step": 233300 + }, + { + "epoch": 3.622030137028818, + "grad_norm": 2.0867655277252197, + "learning_rate": 4.637798538152362e-05, + "loss": 1.0126, + "step": 233400 + }, + { + "epoch": 3.623581992271761, + "grad_norm": 2.0293049812316895, + "learning_rate": 4.6376433526280675e-05, + "loss": 0.9968, + "step": 233500 + }, + { + "epoch": 3.6251338475147037, + "grad_norm": 2.438554286956787, + "learning_rate": 4.6374881671037726e-05, + "loss": 1.0078, + "step": 233600 + }, + { + "epoch": 3.626685702757647, + "grad_norm": 2.5825388431549072, + "learning_rate": 4.6373329815794783e-05, + "loss": 1.0145, + "step": 233700 + }, + { + "epoch": 3.6282375580005897, + "grad_norm": 2.400360107421875, + "learning_rate": 4.637177796055184e-05, + "loss": 1.0311, + "step": 233800 + }, + { + "epoch": 3.6297894132435324, + "grad_norm": 2.0948238372802734, + "learning_rate": 4.63702261053089e-05, + "loss": 1.0182, + "step": 233900 + }, + { + "epoch": 3.6313412684864756, + "grad_norm": 1.9506988525390625, + "learning_rate": 4.636867425006596e-05, + "loss": 1.0212, + "step": 234000 + }, + { + "epoch": 3.6328931237294184, + "grad_norm": 2.3656206130981445, + "learning_rate": 4.6367122394823014e-05, + "loss": 1.015, + "step": 234100 + }, + { + "epoch": 3.634444978972361, + "grad_norm": 2.086054801940918, + "learning_rate": 4.636557053958007e-05, + "loss": 1.0172, + "step": 234200 + }, + { + "epoch": 3.6359968342153044, + "grad_norm": 1.9722099304199219, + "learning_rate": 4.636401868433713e-05, + "loss": 1.016, + "step": 234300 + }, + { + "epoch": 3.6375486894582476, + "grad_norm": 2.556325912475586, + "learning_rate": 4.636246682909419e-05, + "loss": 1.0331, + "step": 234400 + }, + { + "epoch": 3.6391005447011904, + "grad_norm": 1.857521414756775, + "learning_rate": 4.6360914973851245e-05, + "loss": 0.9999, + "step": 234500 + }, + { + "epoch": 3.640652399944133, + "grad_norm": 2.031285047531128, + "learning_rate": 4.63593631186083e-05, + "loss": 1.001, + "step": 234600 + }, + { + "epoch": 3.6422042551870764, + "grad_norm": 2.178656816482544, + "learning_rate": 4.6357811263365354e-05, + "loss": 1.0213, + "step": 234700 + }, + { + "epoch": 3.643756110430019, + "grad_norm": 2.235032081604004, + "learning_rate": 4.635625940812241e-05, + "loss": 1.032, + "step": 234800 + }, + { + "epoch": 3.645307965672962, + "grad_norm": 2.5598344802856445, + "learning_rate": 4.635470755287947e-05, + "loss": 1.0078, + "step": 234900 + }, + { + "epoch": 3.646859820915905, + "grad_norm": 2.3724629878997803, + "learning_rate": 4.635315569763653e-05, + "loss": 1.035, + "step": 235000 + }, + { + "epoch": 3.648411676158848, + "grad_norm": 2.4333388805389404, + "learning_rate": 4.635160384239358e-05, + "loss": 0.9835, + "step": 235100 + }, + { + "epoch": 3.6499635314017906, + "grad_norm": 1.9867881536483765, + "learning_rate": 4.6350051987150636e-05, + "loss": 1.0154, + "step": 235200 + }, + { + "epoch": 3.651515386644734, + "grad_norm": 2.417407751083374, + "learning_rate": 4.6348500131907694e-05, + "loss": 1.019, + "step": 235300 + }, + { + "epoch": 3.6530672418876766, + "grad_norm": 2.040393590927124, + "learning_rate": 4.634694827666475e-05, + "loss": 1.0201, + "step": 235400 + }, + { + "epoch": 3.6546190971306194, + "grad_norm": 2.0924699306488037, + "learning_rate": 4.634539642142181e-05, + "loss": 1.0296, + "step": 235500 + }, + { + "epoch": 3.6561709523735626, + "grad_norm": 2.062406301498413, + "learning_rate": 4.634384456617887e-05, + "loss": 1.0092, + "step": 235600 + }, + { + "epoch": 3.6577228076165054, + "grad_norm": 2.349290132522583, + "learning_rate": 4.6342292710935925e-05, + "loss": 1.0281, + "step": 235700 + }, + { + "epoch": 3.6592746628594486, + "grad_norm": 2.181644916534424, + "learning_rate": 4.634074085569298e-05, + "loss": 1.0143, + "step": 235800 + }, + { + "epoch": 3.6608265181023913, + "grad_norm": 2.0875437259674072, + "learning_rate": 4.633918900045004e-05, + "loss": 1.0099, + "step": 235900 + }, + { + "epoch": 3.6623783733453346, + "grad_norm": 2.418869733810425, + "learning_rate": 4.63376371452071e-05, + "loss": 1.0195, + "step": 236000 + }, + { + "epoch": 3.6639302285882773, + "grad_norm": 2.2838144302368164, + "learning_rate": 4.6336085289964156e-05, + "loss": 1.0136, + "step": 236100 + }, + { + "epoch": 3.66548208383122, + "grad_norm": 2.2078473567962646, + "learning_rate": 4.6334533434721214e-05, + "loss": 1.022, + "step": 236200 + }, + { + "epoch": 3.6670339390741633, + "grad_norm": 2.5008809566497803, + "learning_rate": 4.633298157947827e-05, + "loss": 1.0229, + "step": 236300 + }, + { + "epoch": 3.668585794317106, + "grad_norm": 2.4339613914489746, + "learning_rate": 4.633142972423532e-05, + "loss": 1.0245, + "step": 236400 + }, + { + "epoch": 3.670137649560049, + "grad_norm": 2.0810017585754395, + "learning_rate": 4.632987786899238e-05, + "loss": 1.0142, + "step": 236500 + }, + { + "epoch": 3.671689504802992, + "grad_norm": 2.1632440090179443, + "learning_rate": 4.632832601374944e-05, + "loss": 1.0463, + "step": 236600 + }, + { + "epoch": 3.673241360045935, + "grad_norm": 2.486035108566284, + "learning_rate": 4.6326774158506496e-05, + "loss": 1.0124, + "step": 236700 + }, + { + "epoch": 3.6747932152888776, + "grad_norm": 2.093404769897461, + "learning_rate": 4.6325222303263553e-05, + "loss": 0.9985, + "step": 236800 + }, + { + "epoch": 3.676345070531821, + "grad_norm": 2.04958176612854, + "learning_rate": 4.632367044802061e-05, + "loss": 0.996, + "step": 236900 + }, + { + "epoch": 3.6778969257747636, + "grad_norm": 1.9408975839614868, + "learning_rate": 4.632211859277767e-05, + "loss": 1.0338, + "step": 237000 + }, + { + "epoch": 3.679448781017707, + "grad_norm": 2.4448330402374268, + "learning_rate": 4.632056673753473e-05, + "loss": 1.0033, + "step": 237100 + }, + { + "epoch": 3.6810006362606496, + "grad_norm": 2.017882823944092, + "learning_rate": 4.6319014882291784e-05, + "loss": 1.0175, + "step": 237200 + }, + { + "epoch": 3.6825524915035928, + "grad_norm": 2.3582608699798584, + "learning_rate": 4.631746302704884e-05, + "loss": 1.0086, + "step": 237300 + }, + { + "epoch": 3.6841043467465355, + "grad_norm": 2.182378053665161, + "learning_rate": 4.63159111718059e-05, + "loss": 1.01, + "step": 237400 + }, + { + "epoch": 3.6856562019894783, + "grad_norm": 2.016113519668579, + "learning_rate": 4.631435931656296e-05, + "loss": 1.0044, + "step": 237500 + }, + { + "epoch": 3.6872080572324215, + "grad_norm": 3.188378095626831, + "learning_rate": 4.6312807461320015e-05, + "loss": 1.0112, + "step": 237600 + }, + { + "epoch": 3.6887599124753643, + "grad_norm": 2.140146017074585, + "learning_rate": 4.6311255606077066e-05, + "loss": 1.0087, + "step": 237700 + }, + { + "epoch": 3.690311767718307, + "grad_norm": 2.254169464111328, + "learning_rate": 4.6309703750834124e-05, + "loss": 1.0109, + "step": 237800 + }, + { + "epoch": 3.6918636229612503, + "grad_norm": 1.9383188486099243, + "learning_rate": 4.630815189559118e-05, + "loss": 1.0115, + "step": 237900 + }, + { + "epoch": 3.693415478204193, + "grad_norm": 2.0068345069885254, + "learning_rate": 4.630660004034823e-05, + "loss": 1.0154, + "step": 238000 + }, + { + "epoch": 3.694967333447136, + "grad_norm": 1.9899184703826904, + "learning_rate": 4.630504818510529e-05, + "loss": 1.0055, + "step": 238100 + }, + { + "epoch": 3.696519188690079, + "grad_norm": 2.1594367027282715, + "learning_rate": 4.630349632986235e-05, + "loss": 0.9956, + "step": 238200 + }, + { + "epoch": 3.6980710439330218, + "grad_norm": 2.3453996181488037, + "learning_rate": 4.6301944474619406e-05, + "loss": 1.0253, + "step": 238300 + }, + { + "epoch": 3.699622899175965, + "grad_norm": 1.785625696182251, + "learning_rate": 4.6300392619376464e-05, + "loss": 1.0078, + "step": 238400 + }, + { + "epoch": 3.7011747544189078, + "grad_norm": 2.0943689346313477, + "learning_rate": 4.629884076413352e-05, + "loss": 0.9961, + "step": 238500 + }, + { + "epoch": 3.702726609661851, + "grad_norm": 1.874801754951477, + "learning_rate": 4.629728890889058e-05, + "loss": 1.0289, + "step": 238600 + }, + { + "epoch": 3.7042784649047937, + "grad_norm": 2.128554105758667, + "learning_rate": 4.629573705364764e-05, + "loss": 1.0201, + "step": 238700 + }, + { + "epoch": 3.7058303201477365, + "grad_norm": 2.1019725799560547, + "learning_rate": 4.6294185198404695e-05, + "loss": 1.013, + "step": 238800 + }, + { + "epoch": 3.7073821753906797, + "grad_norm": 1.6486116647720337, + "learning_rate": 4.629263334316175e-05, + "loss": 1.0029, + "step": 238900 + }, + { + "epoch": 3.7089340306336225, + "grad_norm": 1.9009175300598145, + "learning_rate": 4.629108148791881e-05, + "loss": 1.0143, + "step": 239000 + }, + { + "epoch": 3.7104858858765652, + "grad_norm": 2.5965166091918945, + "learning_rate": 4.628952963267587e-05, + "loss": 1.0241, + "step": 239100 + }, + { + "epoch": 3.7120377411195085, + "grad_norm": 2.3161373138427734, + "learning_rate": 4.6287977777432926e-05, + "loss": 1.0172, + "step": 239200 + }, + { + "epoch": 3.7135895963624512, + "grad_norm": 1.735445261001587, + "learning_rate": 4.628642592218998e-05, + "loss": 0.9958, + "step": 239300 + }, + { + "epoch": 3.715141451605394, + "grad_norm": 2.472639799118042, + "learning_rate": 4.6284874066947035e-05, + "loss": 1.0172, + "step": 239400 + }, + { + "epoch": 3.716693306848337, + "grad_norm": 2.109117031097412, + "learning_rate": 4.628332221170409e-05, + "loss": 1.0195, + "step": 239500 + }, + { + "epoch": 3.71824516209128, + "grad_norm": 2.0460574626922607, + "learning_rate": 4.628177035646115e-05, + "loss": 1.004, + "step": 239600 + }, + { + "epoch": 3.719797017334223, + "grad_norm": 2.4595510959625244, + "learning_rate": 4.628021850121821e-05, + "loss": 0.9977, + "step": 239700 + }, + { + "epoch": 3.721348872577166, + "grad_norm": 2.2073729038238525, + "learning_rate": 4.6278666645975266e-05, + "loss": 1.0023, + "step": 239800 + }, + { + "epoch": 3.722900727820109, + "grad_norm": 2.154287815093994, + "learning_rate": 4.6277114790732323e-05, + "loss": 1.0148, + "step": 239900 + }, + { + "epoch": 3.724452583063052, + "grad_norm": 2.0740292072296143, + "learning_rate": 4.627556293548938e-05, + "loss": 1.0131, + "step": 240000 + }, + { + "epoch": 3.7260044383059947, + "grad_norm": 2.6240620613098145, + "learning_rate": 4.627401108024644e-05, + "loss": 1.0124, + "step": 240100 + }, + { + "epoch": 3.727556293548938, + "grad_norm": 1.9416288137435913, + "learning_rate": 4.62724592250035e-05, + "loss": 1.0119, + "step": 240200 + }, + { + "epoch": 3.7291081487918807, + "grad_norm": 2.1373040676116943, + "learning_rate": 4.6270907369760554e-05, + "loss": 1.009, + "step": 240300 + }, + { + "epoch": 3.7306600040348235, + "grad_norm": 1.6925129890441895, + "learning_rate": 4.626935551451761e-05, + "loss": 1.0065, + "step": 240400 + }, + { + "epoch": 3.7322118592777667, + "grad_norm": 2.3495755195617676, + "learning_rate": 4.626780365927467e-05, + "loss": 1.0094, + "step": 240500 + }, + { + "epoch": 3.7337637145207094, + "grad_norm": 2.175574779510498, + "learning_rate": 4.626625180403172e-05, + "loss": 1.036, + "step": 240600 + }, + { + "epoch": 3.735315569763652, + "grad_norm": 2.241285562515259, + "learning_rate": 4.626469994878878e-05, + "loss": 1.0111, + "step": 240700 + }, + { + "epoch": 3.7368674250065954, + "grad_norm": 2.4279415607452393, + "learning_rate": 4.6263148093545836e-05, + "loss": 1.007, + "step": 240800 + }, + { + "epoch": 3.738419280249538, + "grad_norm": 2.5062997341156006, + "learning_rate": 4.6261596238302894e-05, + "loss": 1.0106, + "step": 240900 + }, + { + "epoch": 3.7399711354924814, + "grad_norm": 1.7721853256225586, + "learning_rate": 4.626004438305995e-05, + "loss": 1.0039, + "step": 241000 + }, + { + "epoch": 3.741522990735424, + "grad_norm": 2.4409706592559814, + "learning_rate": 4.625849252781701e-05, + "loss": 0.9988, + "step": 241100 + }, + { + "epoch": 3.7430748459783674, + "grad_norm": 2.2143893241882324, + "learning_rate": 4.625694067257407e-05, + "loss": 1.0211, + "step": 241200 + }, + { + "epoch": 3.74462670122131, + "grad_norm": 2.529294490814209, + "learning_rate": 4.625538881733112e-05, + "loss": 1.0369, + "step": 241300 + }, + { + "epoch": 3.746178556464253, + "grad_norm": 2.6420693397521973, + "learning_rate": 4.6253836962088176e-05, + "loss": 1.0154, + "step": 241400 + }, + { + "epoch": 3.747730411707196, + "grad_norm": 2.7493693828582764, + "learning_rate": 4.6252285106845234e-05, + "loss": 0.9988, + "step": 241500 + }, + { + "epoch": 3.749282266950139, + "grad_norm": 2.006983757019043, + "learning_rate": 4.625073325160229e-05, + "loss": 1.0263, + "step": 241600 + }, + { + "epoch": 3.7508341221930817, + "grad_norm": 2.709113597869873, + "learning_rate": 4.624918139635935e-05, + "loss": 1.0013, + "step": 241700 + }, + { + "epoch": 3.752385977436025, + "grad_norm": 2.125389575958252, + "learning_rate": 4.624762954111641e-05, + "loss": 1.0224, + "step": 241800 + }, + { + "epoch": 3.7539378326789676, + "grad_norm": 2.0216705799102783, + "learning_rate": 4.6246077685873465e-05, + "loss": 1.0338, + "step": 241900 + }, + { + "epoch": 3.7554896879219104, + "grad_norm": 2.261094093322754, + "learning_rate": 4.624452583063052e-05, + "loss": 1.0214, + "step": 242000 + }, + { + "epoch": 3.7570415431648536, + "grad_norm": 2.273343086242676, + "learning_rate": 4.6242973975387574e-05, + "loss": 1.0174, + "step": 242100 + }, + { + "epoch": 3.7585933984077964, + "grad_norm": 2.352085828781128, + "learning_rate": 4.624142212014463e-05, + "loss": 1.0171, + "step": 242200 + }, + { + "epoch": 3.7601452536507396, + "grad_norm": 2.17459774017334, + "learning_rate": 4.623987026490169e-05, + "loss": 1.0293, + "step": 242300 + }, + { + "epoch": 3.7616971088936824, + "grad_norm": 2.500917434692383, + "learning_rate": 4.623831840965875e-05, + "loss": 1.0251, + "step": 242400 + }, + { + "epoch": 3.7632489641366256, + "grad_norm": 2.330687999725342, + "learning_rate": 4.6236766554415805e-05, + "loss": 1.0277, + "step": 242500 + }, + { + "epoch": 3.7648008193795683, + "grad_norm": 2.0285775661468506, + "learning_rate": 4.623521469917286e-05, + "loss": 1.0147, + "step": 242600 + }, + { + "epoch": 3.766352674622511, + "grad_norm": 2.1773195266723633, + "learning_rate": 4.623366284392992e-05, + "loss": 1.0226, + "step": 242700 + }, + { + "epoch": 3.7679045298654543, + "grad_norm": 2.0629265308380127, + "learning_rate": 4.623211098868698e-05, + "loss": 0.9957, + "step": 242800 + }, + { + "epoch": 3.769456385108397, + "grad_norm": 2.370910882949829, + "learning_rate": 4.6230559133444036e-05, + "loss": 0.9989, + "step": 242900 + }, + { + "epoch": 3.77100824035134, + "grad_norm": 2.348848819732666, + "learning_rate": 4.6229007278201093e-05, + "loss": 1.0077, + "step": 243000 + }, + { + "epoch": 3.772560095594283, + "grad_norm": 2.2534232139587402, + "learning_rate": 4.622745542295815e-05, + "loss": 1.0189, + "step": 243100 + }, + { + "epoch": 3.774111950837226, + "grad_norm": 2.028949022293091, + "learning_rate": 4.622590356771521e-05, + "loss": 1.0374, + "step": 243200 + }, + { + "epoch": 3.7756638060801686, + "grad_norm": 2.0232532024383545, + "learning_rate": 4.622435171247227e-05, + "loss": 1.0097, + "step": 243300 + }, + { + "epoch": 3.777215661323112, + "grad_norm": 1.9040672779083252, + "learning_rate": 4.622279985722932e-05, + "loss": 1.0093, + "step": 243400 + }, + { + "epoch": 3.7787675165660546, + "grad_norm": 2.019620418548584, + "learning_rate": 4.6221248001986375e-05, + "loss": 1.0186, + "step": 243500 + }, + { + "epoch": 3.780319371808998, + "grad_norm": 3.3194851875305176, + "learning_rate": 4.621969614674343e-05, + "loss": 1.0146, + "step": 243600 + }, + { + "epoch": 3.7818712270519406, + "grad_norm": 2.1685357093811035, + "learning_rate": 4.621814429150049e-05, + "loss": 0.9949, + "step": 243700 + }, + { + "epoch": 3.7834230822948838, + "grad_norm": 2.325676202774048, + "learning_rate": 4.621659243625755e-05, + "loss": 1.0073, + "step": 243800 + }, + { + "epoch": 3.7849749375378265, + "grad_norm": 2.2917325496673584, + "learning_rate": 4.6215040581014606e-05, + "loss": 1.0292, + "step": 243900 + }, + { + "epoch": 3.7865267927807693, + "grad_norm": 2.2997236251831055, + "learning_rate": 4.6213488725771664e-05, + "loss": 1.0146, + "step": 244000 + }, + { + "epoch": 3.7880786480237125, + "grad_norm": 1.9001855850219727, + "learning_rate": 4.621193687052872e-05, + "loss": 1.0333, + "step": 244100 + }, + { + "epoch": 3.7896305032666553, + "grad_norm": 1.9370079040527344, + "learning_rate": 4.621038501528578e-05, + "loss": 1.0064, + "step": 244200 + }, + { + "epoch": 3.791182358509598, + "grad_norm": 1.9764480590820312, + "learning_rate": 4.620883316004284e-05, + "loss": 1.0031, + "step": 244300 + }, + { + "epoch": 3.7927342137525413, + "grad_norm": 2.7530527114868164, + "learning_rate": 4.6207281304799895e-05, + "loss": 1.0009, + "step": 244400 + }, + { + "epoch": 3.794286068995484, + "grad_norm": 2.693455696105957, + "learning_rate": 4.6205729449556946e-05, + "loss": 1.007, + "step": 244500 + }, + { + "epoch": 3.795837924238427, + "grad_norm": 2.3306798934936523, + "learning_rate": 4.6204177594314004e-05, + "loss": 0.9902, + "step": 244600 + }, + { + "epoch": 3.79738977948137, + "grad_norm": 1.8344670534133911, + "learning_rate": 4.620262573907106e-05, + "loss": 1.0159, + "step": 244700 + }, + { + "epoch": 3.798941634724313, + "grad_norm": 2.201124429702759, + "learning_rate": 4.620107388382812e-05, + "loss": 0.9813, + "step": 244800 + }, + { + "epoch": 3.800493489967256, + "grad_norm": 2.0185108184814453, + "learning_rate": 4.619952202858517e-05, + "loss": 1.0284, + "step": 244900 + }, + { + "epoch": 3.8020453452101988, + "grad_norm": 2.5429863929748535, + "learning_rate": 4.619797017334223e-05, + "loss": 1.001, + "step": 245000 + }, + { + "epoch": 3.803597200453142, + "grad_norm": 2.29585337638855, + "learning_rate": 4.6196418318099286e-05, + "loss": 1.0062, + "step": 245100 + }, + { + "epoch": 3.8051490556960847, + "grad_norm": 2.349848747253418, + "learning_rate": 4.6194866462856344e-05, + "loss": 1.0047, + "step": 245200 + }, + { + "epoch": 3.8067009109390275, + "grad_norm": 2.093092918395996, + "learning_rate": 4.61933146076134e-05, + "loss": 1.0143, + "step": 245300 + }, + { + "epoch": 3.8082527661819707, + "grad_norm": 1.997149109840393, + "learning_rate": 4.619176275237046e-05, + "loss": 1.0163, + "step": 245400 + }, + { + "epoch": 3.8098046214249135, + "grad_norm": 2.0725667476654053, + "learning_rate": 4.619021089712752e-05, + "loss": 1.0231, + "step": 245500 + }, + { + "epoch": 3.8113564766678563, + "grad_norm": 2.260093927383423, + "learning_rate": 4.6188659041884575e-05, + "loss": 1.0143, + "step": 245600 + }, + { + "epoch": 3.8129083319107995, + "grad_norm": 2.2428982257843018, + "learning_rate": 4.618710718664163e-05, + "loss": 1.0054, + "step": 245700 + }, + { + "epoch": 3.8144601871537422, + "grad_norm": 2.4710702896118164, + "learning_rate": 4.618555533139869e-05, + "loss": 1.0173, + "step": 245800 + }, + { + "epoch": 3.816012042396685, + "grad_norm": 1.989723563194275, + "learning_rate": 4.618400347615575e-05, + "loss": 1.0017, + "step": 245900 + }, + { + "epoch": 3.8175638976396282, + "grad_norm": 2.8177998065948486, + "learning_rate": 4.6182451620912806e-05, + "loss": 0.9993, + "step": 246000 + }, + { + "epoch": 3.819115752882571, + "grad_norm": 2.377488136291504, + "learning_rate": 4.6180899765669863e-05, + "loss": 1.0073, + "step": 246100 + }, + { + "epoch": 3.820667608125514, + "grad_norm": 2.229889392852783, + "learning_rate": 4.6179347910426914e-05, + "loss": 1.0073, + "step": 246200 + }, + { + "epoch": 3.822219463368457, + "grad_norm": 2.1102564334869385, + "learning_rate": 4.617779605518397e-05, + "loss": 1.0158, + "step": 246300 + }, + { + "epoch": 3.8237713186114, + "grad_norm": 2.011847734451294, + "learning_rate": 4.617624419994103e-05, + "loss": 1.0139, + "step": 246400 + }, + { + "epoch": 3.825323173854343, + "grad_norm": 2.3082711696624756, + "learning_rate": 4.617469234469809e-05, + "loss": 1.015, + "step": 246500 + }, + { + "epoch": 3.8268750290972857, + "grad_norm": 2.7512242794036865, + "learning_rate": 4.6173140489455145e-05, + "loss": 1.0104, + "step": 246600 + }, + { + "epoch": 3.828426884340229, + "grad_norm": 2.3987085819244385, + "learning_rate": 4.61715886342122e-05, + "loss": 1.023, + "step": 246700 + }, + { + "epoch": 3.8299787395831717, + "grad_norm": 2.2878665924072266, + "learning_rate": 4.617003677896926e-05, + "loss": 1.0251, + "step": 246800 + }, + { + "epoch": 3.8315305948261145, + "grad_norm": 2.284198522567749, + "learning_rate": 4.616848492372632e-05, + "loss": 1.0123, + "step": 246900 + }, + { + "epoch": 3.8330824500690577, + "grad_norm": 2.4234066009521484, + "learning_rate": 4.6166933068483376e-05, + "loss": 1.0232, + "step": 247000 + }, + { + "epoch": 3.8346343053120004, + "grad_norm": 2.310163974761963, + "learning_rate": 4.6165381213240434e-05, + "loss": 1.025, + "step": 247100 + }, + { + "epoch": 3.836186160554943, + "grad_norm": 2.2584455013275146, + "learning_rate": 4.616382935799749e-05, + "loss": 1.0408, + "step": 247200 + }, + { + "epoch": 3.8377380157978864, + "grad_norm": 1.9721930027008057, + "learning_rate": 4.616227750275455e-05, + "loss": 0.9959, + "step": 247300 + }, + { + "epoch": 3.839289871040829, + "grad_norm": 2.072402000427246, + "learning_rate": 4.616072564751161e-05, + "loss": 1.0124, + "step": 247400 + }, + { + "epoch": 3.8408417262837724, + "grad_norm": 2.4587271213531494, + "learning_rate": 4.615917379226866e-05, + "loss": 1.0163, + "step": 247500 + }, + { + "epoch": 3.842393581526715, + "grad_norm": 2.407766103744507, + "learning_rate": 4.6157621937025716e-05, + "loss": 1.012, + "step": 247600 + }, + { + "epoch": 3.8439454367696584, + "grad_norm": 2.135498285293579, + "learning_rate": 4.6156070081782774e-05, + "loss": 1.0076, + "step": 247700 + }, + { + "epoch": 3.845497292012601, + "grad_norm": 2.279269218444824, + "learning_rate": 4.6154518226539825e-05, + "loss": 1.019, + "step": 247800 + }, + { + "epoch": 3.847049147255544, + "grad_norm": 2.4332151412963867, + "learning_rate": 4.615296637129688e-05, + "loss": 1.0035, + "step": 247900 + }, + { + "epoch": 3.848601002498487, + "grad_norm": 2.711205244064331, + "learning_rate": 4.615141451605394e-05, + "loss": 1.0026, + "step": 248000 + }, + { + "epoch": 3.85015285774143, + "grad_norm": 1.9770572185516357, + "learning_rate": 4.6149862660811e-05, + "loss": 1.0148, + "step": 248100 + }, + { + "epoch": 3.8517047129843727, + "grad_norm": 2.3384222984313965, + "learning_rate": 4.6148310805568056e-05, + "loss": 1.0205, + "step": 248200 + }, + { + "epoch": 3.853256568227316, + "grad_norm": 2.100019931793213, + "learning_rate": 4.6146758950325114e-05, + "loss": 1.0121, + "step": 248300 + }, + { + "epoch": 3.8548084234702586, + "grad_norm": 2.0259838104248047, + "learning_rate": 4.614520709508217e-05, + "loss": 1.0121, + "step": 248400 + }, + { + "epoch": 3.8563602787132014, + "grad_norm": 2.5189080238342285, + "learning_rate": 4.614365523983923e-05, + "loss": 1.0095, + "step": 248500 + }, + { + "epoch": 3.8579121339561446, + "grad_norm": 2.0332727432250977, + "learning_rate": 4.614210338459629e-05, + "loss": 1.0201, + "step": 248600 + }, + { + "epoch": 3.8594639891990874, + "grad_norm": 2.2269084453582764, + "learning_rate": 4.6140551529353345e-05, + "loss": 1.0274, + "step": 248700 + }, + { + "epoch": 3.8610158444420306, + "grad_norm": 1.881072759628296, + "learning_rate": 4.61389996741104e-05, + "loss": 1.0157, + "step": 248800 + }, + { + "epoch": 3.8625676996849734, + "grad_norm": 2.1389148235321045, + "learning_rate": 4.613744781886746e-05, + "loss": 0.9943, + "step": 248900 + }, + { + "epoch": 3.8641195549279166, + "grad_norm": 2.409604072570801, + "learning_rate": 4.613589596362452e-05, + "loss": 1.0225, + "step": 249000 + }, + { + "epoch": 3.8656714101708594, + "grad_norm": 2.4702587127685547, + "learning_rate": 4.613434410838157e-05, + "loss": 0.9991, + "step": 249100 + }, + { + "epoch": 3.867223265413802, + "grad_norm": 2.048389196395874, + "learning_rate": 4.613279225313863e-05, + "loss": 1.0358, + "step": 249200 + }, + { + "epoch": 3.8687751206567453, + "grad_norm": 3.0865375995635986, + "learning_rate": 4.6131240397895684e-05, + "loss": 1.0141, + "step": 249300 + }, + { + "epoch": 3.870326975899688, + "grad_norm": 2.165404796600342, + "learning_rate": 4.612968854265274e-05, + "loss": 1.0115, + "step": 249400 + }, + { + "epoch": 3.871878831142631, + "grad_norm": 2.406437873840332, + "learning_rate": 4.61281366874098e-05, + "loss": 1.0022, + "step": 249500 + }, + { + "epoch": 3.873430686385574, + "grad_norm": 2.142561912536621, + "learning_rate": 4.612658483216686e-05, + "loss": 1.0028, + "step": 249600 + }, + { + "epoch": 3.874982541628517, + "grad_norm": 2.099102258682251, + "learning_rate": 4.6125032976923915e-05, + "loss": 1.0243, + "step": 249700 + }, + { + "epoch": 3.8765343968714596, + "grad_norm": 1.9429491758346558, + "learning_rate": 4.612348112168097e-05, + "loss": 1.0147, + "step": 249800 + }, + { + "epoch": 3.878086252114403, + "grad_norm": 2.4290895462036133, + "learning_rate": 4.612192926643803e-05, + "loss": 1.0241, + "step": 249900 + }, + { + "epoch": 3.8796381073573456, + "grad_norm": 1.9997496604919434, + "learning_rate": 4.612037741119509e-05, + "loss": 1.007, + "step": 250000 + }, + { + "epoch": 3.8811899626002884, + "grad_norm": 2.474494218826294, + "learning_rate": 4.6118825555952146e-05, + "loss": 0.992, + "step": 250100 + }, + { + "epoch": 3.8827418178432316, + "grad_norm": 2.500391721725464, + "learning_rate": 4.6117273700709204e-05, + "loss": 1.0045, + "step": 250200 + }, + { + "epoch": 3.884293673086175, + "grad_norm": 2.389751672744751, + "learning_rate": 4.611572184546626e-05, + "loss": 1.0108, + "step": 250300 + }, + { + "epoch": 3.8858455283291176, + "grad_norm": 2.125541925430298, + "learning_rate": 4.611416999022331e-05, + "loss": 1.0037, + "step": 250400 + }, + { + "epoch": 3.8873973835720603, + "grad_norm": 1.850882649421692, + "learning_rate": 4.611261813498037e-05, + "loss": 1.0017, + "step": 250500 + }, + { + "epoch": 3.8889492388150035, + "grad_norm": 2.331622362136841, + "learning_rate": 4.611106627973743e-05, + "loss": 1.0239, + "step": 250600 + }, + { + "epoch": 3.8905010940579463, + "grad_norm": 2.4492251873016357, + "learning_rate": 4.6109514424494486e-05, + "loss": 1.0074, + "step": 250700 + }, + { + "epoch": 3.892052949300889, + "grad_norm": 2.2282145023345947, + "learning_rate": 4.6107962569251544e-05, + "loss": 1.0033, + "step": 250800 + }, + { + "epoch": 3.8936048045438323, + "grad_norm": 2.3184473514556885, + "learning_rate": 4.61064107140086e-05, + "loss": 1.0039, + "step": 250900 + }, + { + "epoch": 3.895156659786775, + "grad_norm": 2.4361066818237305, + "learning_rate": 4.610485885876565e-05, + "loss": 0.9914, + "step": 251000 + }, + { + "epoch": 3.896708515029718, + "grad_norm": 2.44022798538208, + "learning_rate": 4.610330700352271e-05, + "loss": 1.0096, + "step": 251100 + }, + { + "epoch": 3.898260370272661, + "grad_norm": 2.6742916107177734, + "learning_rate": 4.610175514827977e-05, + "loss": 1.0153, + "step": 251200 + }, + { + "epoch": 3.899812225515604, + "grad_norm": 2.2265617847442627, + "learning_rate": 4.6100203293036826e-05, + "loss": 1.0184, + "step": 251300 + }, + { + "epoch": 3.9013640807585466, + "grad_norm": 2.047741651535034, + "learning_rate": 4.6098651437793884e-05, + "loss": 0.9966, + "step": 251400 + }, + { + "epoch": 3.90291593600149, + "grad_norm": 1.9516500234603882, + "learning_rate": 4.609709958255094e-05, + "loss": 1.021, + "step": 251500 + }, + { + "epoch": 3.904467791244433, + "grad_norm": 2.1483633518218994, + "learning_rate": 4.6095547727308e-05, + "loss": 1.0045, + "step": 251600 + }, + { + "epoch": 3.9060196464873758, + "grad_norm": 2.2886312007904053, + "learning_rate": 4.609399587206506e-05, + "loss": 1.0047, + "step": 251700 + }, + { + "epoch": 3.9075715017303185, + "grad_norm": 2.082888603210449, + "learning_rate": 4.6092444016822115e-05, + "loss": 1.0131, + "step": 251800 + }, + { + "epoch": 3.9091233569732617, + "grad_norm": 2.2767488956451416, + "learning_rate": 4.6090892161579166e-05, + "loss": 0.9936, + "step": 251900 + }, + { + "epoch": 3.9106752122162045, + "grad_norm": 2.366884231567383, + "learning_rate": 4.6089340306336223e-05, + "loss": 0.9991, + "step": 252000 + }, + { + "epoch": 3.9122270674591473, + "grad_norm": 2.117441177368164, + "learning_rate": 4.608778845109328e-05, + "loss": 1.0047, + "step": 252100 + }, + { + "epoch": 3.9137789227020905, + "grad_norm": 2.2630763053894043, + "learning_rate": 4.608623659585034e-05, + "loss": 0.989, + "step": 252200 + }, + { + "epoch": 3.9153307779450333, + "grad_norm": 2.3058600425720215, + "learning_rate": 4.60846847406074e-05, + "loss": 0.9771, + "step": 252300 + }, + { + "epoch": 3.916882633187976, + "grad_norm": 2.203831911087036, + "learning_rate": 4.6083132885364454e-05, + "loss": 0.9869, + "step": 252400 + }, + { + "epoch": 3.9184344884309192, + "grad_norm": 2.3880887031555176, + "learning_rate": 4.608158103012151e-05, + "loss": 0.9929, + "step": 252500 + }, + { + "epoch": 3.919986343673862, + "grad_norm": 2.3750267028808594, + "learning_rate": 4.608002917487857e-05, + "loss": 1.0143, + "step": 252600 + }, + { + "epoch": 3.9215381989168048, + "grad_norm": 2.575727701187134, + "learning_rate": 4.607847731963563e-05, + "loss": 1.0076, + "step": 252700 + }, + { + "epoch": 3.923090054159748, + "grad_norm": 2.406012773513794, + "learning_rate": 4.6076925464392685e-05, + "loss": 1.0205, + "step": 252800 + }, + { + "epoch": 3.9246419094026908, + "grad_norm": 2.0970282554626465, + "learning_rate": 4.607537360914974e-05, + "loss": 1.0122, + "step": 252900 + }, + { + "epoch": 3.926193764645634, + "grad_norm": 2.0307211875915527, + "learning_rate": 4.60738217539068e-05, + "loss": 1.0216, + "step": 253000 + }, + { + "epoch": 3.9277456198885767, + "grad_norm": 1.9782108068466187, + "learning_rate": 4.607226989866386e-05, + "loss": 1.0109, + "step": 253100 + }, + { + "epoch": 3.92929747513152, + "grad_norm": 2.2451000213623047, + "learning_rate": 4.607071804342091e-05, + "loss": 1.0028, + "step": 253200 + }, + { + "epoch": 3.9308493303744627, + "grad_norm": 2.2572503089904785, + "learning_rate": 4.606916618817797e-05, + "loss": 0.9944, + "step": 253300 + }, + { + "epoch": 3.9324011856174055, + "grad_norm": 2.0190742015838623, + "learning_rate": 4.6067614332935025e-05, + "loss": 1.0097, + "step": 253400 + }, + { + "epoch": 3.9339530408603487, + "grad_norm": 2.285947322845459, + "learning_rate": 4.606606247769208e-05, + "loss": 1.0194, + "step": 253500 + }, + { + "epoch": 3.9355048961032915, + "grad_norm": 2.201874017715454, + "learning_rate": 4.606451062244914e-05, + "loss": 1.0209, + "step": 253600 + }, + { + "epoch": 3.9370567513462342, + "grad_norm": 2.6722114086151123, + "learning_rate": 4.60629587672062e-05, + "loss": 0.9902, + "step": 253700 + }, + { + "epoch": 3.9386086065891774, + "grad_norm": 2.5566859245300293, + "learning_rate": 4.6061406911963256e-05, + "loss": 0.9832, + "step": 253800 + }, + { + "epoch": 3.94016046183212, + "grad_norm": 2.0157370567321777, + "learning_rate": 4.6059855056720314e-05, + "loss": 0.9794, + "step": 253900 + }, + { + "epoch": 3.941712317075063, + "grad_norm": 2.0200371742248535, + "learning_rate": 4.605830320147737e-05, + "loss": 0.9939, + "step": 254000 + }, + { + "epoch": 3.943264172318006, + "grad_norm": 2.292102336883545, + "learning_rate": 4.605675134623443e-05, + "loss": 1.0105, + "step": 254100 + }, + { + "epoch": 3.944816027560949, + "grad_norm": 2.0531704425811768, + "learning_rate": 4.605519949099149e-05, + "loss": 1.0091, + "step": 254200 + }, + { + "epoch": 3.946367882803892, + "grad_norm": 1.9320237636566162, + "learning_rate": 4.605364763574854e-05, + "loss": 0.9875, + "step": 254300 + }, + { + "epoch": 3.947919738046835, + "grad_norm": 2.305433988571167, + "learning_rate": 4.6052095780505596e-05, + "loss": 1.0017, + "step": 254400 + }, + { + "epoch": 3.949471593289778, + "grad_norm": 2.343123435974121, + "learning_rate": 4.6050543925262654e-05, + "loss": 1.0057, + "step": 254500 + }, + { + "epoch": 3.951023448532721, + "grad_norm": 2.5635619163513184, + "learning_rate": 4.604899207001971e-05, + "loss": 0.9993, + "step": 254600 + }, + { + "epoch": 3.9525753037756637, + "grad_norm": 2.2476115226745605, + "learning_rate": 4.604744021477677e-05, + "loss": 1.0299, + "step": 254700 + }, + { + "epoch": 3.954127159018607, + "grad_norm": 2.1379034519195557, + "learning_rate": 4.604588835953382e-05, + "loss": 0.9934, + "step": 254800 + }, + { + "epoch": 3.9556790142615497, + "grad_norm": 2.1756467819213867, + "learning_rate": 4.604433650429088e-05, + "loss": 1.0148, + "step": 254900 + }, + { + "epoch": 3.9572308695044924, + "grad_norm": 2.511732339859009, + "learning_rate": 4.6042784649047936e-05, + "loss": 0.9982, + "step": 255000 + }, + { + "epoch": 3.9587827247474356, + "grad_norm": 2.0807454586029053, + "learning_rate": 4.6041232793804993e-05, + "loss": 1.005, + "step": 255100 + }, + { + "epoch": 3.9603345799903784, + "grad_norm": 2.1180360317230225, + "learning_rate": 4.603968093856205e-05, + "loss": 1.003, + "step": 255200 + }, + { + "epoch": 3.961886435233321, + "grad_norm": 2.301769733428955, + "learning_rate": 4.603812908331911e-05, + "loss": 1.0037, + "step": 255300 + }, + { + "epoch": 3.9634382904762644, + "grad_norm": 2.085273504257202, + "learning_rate": 4.603657722807617e-05, + "loss": 1.0402, + "step": 255400 + }, + { + "epoch": 3.964990145719207, + "grad_norm": 2.3542139530181885, + "learning_rate": 4.6035025372833224e-05, + "loss": 0.9969, + "step": 255500 + }, + { + "epoch": 3.9665420009621504, + "grad_norm": 2.3570122718811035, + "learning_rate": 4.603347351759028e-05, + "loss": 0.9882, + "step": 255600 + }, + { + "epoch": 3.968093856205093, + "grad_norm": 2.451073408126831, + "learning_rate": 4.603192166234734e-05, + "loss": 1.0217, + "step": 255700 + }, + { + "epoch": 3.9696457114480364, + "grad_norm": 2.239896297454834, + "learning_rate": 4.60303698071044e-05, + "loss": 1.0256, + "step": 255800 + }, + { + "epoch": 3.971197566690979, + "grad_norm": 2.051710844039917, + "learning_rate": 4.6028817951861455e-05, + "loss": 0.9869, + "step": 255900 + }, + { + "epoch": 3.972749421933922, + "grad_norm": 2.2589364051818848, + "learning_rate": 4.602726609661851e-05, + "loss": 1.014, + "step": 256000 + }, + { + "epoch": 3.974301277176865, + "grad_norm": 2.065429210662842, + "learning_rate": 4.6025714241375564e-05, + "loss": 0.9957, + "step": 256100 + }, + { + "epoch": 3.975853132419808, + "grad_norm": 2.291639804840088, + "learning_rate": 4.602416238613262e-05, + "loss": 1.0029, + "step": 256200 + }, + { + "epoch": 3.9774049876627506, + "grad_norm": 1.8173938989639282, + "learning_rate": 4.602261053088968e-05, + "loss": 0.9764, + "step": 256300 + }, + { + "epoch": 3.978956842905694, + "grad_norm": 1.8184092044830322, + "learning_rate": 4.602105867564674e-05, + "loss": 1.012, + "step": 256400 + }, + { + "epoch": 3.9805086981486366, + "grad_norm": 2.0670435428619385, + "learning_rate": 4.6019506820403795e-05, + "loss": 0.9894, + "step": 256500 + }, + { + "epoch": 3.9820605533915794, + "grad_norm": 2.2796432971954346, + "learning_rate": 4.601795496516085e-05, + "loss": 0.998, + "step": 256600 + }, + { + "epoch": 3.9836124086345226, + "grad_norm": 2.5332934856414795, + "learning_rate": 4.601640310991791e-05, + "loss": 0.9865, + "step": 256700 + }, + { + "epoch": 3.9851642638774654, + "grad_norm": 2.750664472579956, + "learning_rate": 4.601485125467497e-05, + "loss": 0.998, + "step": 256800 + }, + { + "epoch": 3.9867161191204086, + "grad_norm": 2.2038350105285645, + "learning_rate": 4.6013299399432026e-05, + "loss": 1.0146, + "step": 256900 + }, + { + "epoch": 3.9882679743633513, + "grad_norm": 2.1637487411499023, + "learning_rate": 4.6011747544189084e-05, + "loss": 1.0171, + "step": 257000 + }, + { + "epoch": 3.9898198296062946, + "grad_norm": 2.032949447631836, + "learning_rate": 4.601019568894614e-05, + "loss": 0.9905, + "step": 257100 + }, + { + "epoch": 3.9913716848492373, + "grad_norm": 1.9132201671600342, + "learning_rate": 4.60086438337032e-05, + "loss": 1.0105, + "step": 257200 + }, + { + "epoch": 3.99292354009218, + "grad_norm": 2.50614070892334, + "learning_rate": 4.600709197846026e-05, + "loss": 1.032, + "step": 257300 + }, + { + "epoch": 3.9944753953351233, + "grad_norm": 1.8686712980270386, + "learning_rate": 4.600554012321731e-05, + "loss": 1.0212, + "step": 257400 + }, + { + "epoch": 3.996027250578066, + "grad_norm": 1.7639912366867065, + "learning_rate": 4.6003988267974366e-05, + "loss": 1.02, + "step": 257500 + }, + { + "epoch": 3.997579105821009, + "grad_norm": 1.807686448097229, + "learning_rate": 4.600243641273142e-05, + "loss": 1.0066, + "step": 257600 + }, + { + "epoch": 3.999130961063952, + "grad_norm": 2.4362223148345947, + "learning_rate": 4.6000884557488475e-05, + "loss": 0.9818, + "step": 257700 + }, + { + "epoch": 4.000682816306895, + "grad_norm": 2.0881383419036865, + "learning_rate": 4.599933270224553e-05, + "loss": 1.0072, + "step": 257800 + }, + { + "epoch": 4.002234671549838, + "grad_norm": 2.0971968173980713, + "learning_rate": 4.599778084700259e-05, + "loss": 0.996, + "step": 257900 + }, + { + "epoch": 4.003786526792781, + "grad_norm": 2.6785244941711426, + "learning_rate": 4.599622899175965e-05, + "loss": 1.0064, + "step": 258000 + }, + { + "epoch": 4.005338382035724, + "grad_norm": 2.0060083866119385, + "learning_rate": 4.5994677136516706e-05, + "loss": 0.9939, + "step": 258100 + }, + { + "epoch": 4.006890237278666, + "grad_norm": 2.1636555194854736, + "learning_rate": 4.5993125281273763e-05, + "loss": 1.0001, + "step": 258200 + }, + { + "epoch": 4.0084420925216095, + "grad_norm": 2.343156099319458, + "learning_rate": 4.599157342603082e-05, + "loss": 1.0007, + "step": 258300 + }, + { + "epoch": 4.009993947764553, + "grad_norm": 2.4066388607025146, + "learning_rate": 4.599002157078788e-05, + "loss": 1.0014, + "step": 258400 + }, + { + "epoch": 4.011545803007495, + "grad_norm": 2.094865560531616, + "learning_rate": 4.598846971554494e-05, + "loss": 1.0128, + "step": 258500 + }, + { + "epoch": 4.013097658250438, + "grad_norm": 1.8208328485488892, + "learning_rate": 4.5986917860301994e-05, + "loss": 0.9886, + "step": 258600 + }, + { + "epoch": 4.0146495134933815, + "grad_norm": 2.268063545227051, + "learning_rate": 4.598536600505905e-05, + "loss": 0.999, + "step": 258700 + }, + { + "epoch": 4.016201368736325, + "grad_norm": 2.788959264755249, + "learning_rate": 4.598381414981611e-05, + "loss": 1.0116, + "step": 258800 + }, + { + "epoch": 4.017753223979267, + "grad_norm": 2.5502994060516357, + "learning_rate": 4.598226229457316e-05, + "loss": 1.0105, + "step": 258900 + }, + { + "epoch": 4.01930507922221, + "grad_norm": 2.312920331954956, + "learning_rate": 4.598071043933022e-05, + "loss": 1.0157, + "step": 259000 + }, + { + "epoch": 4.0208569344651535, + "grad_norm": 2.3428311347961426, + "learning_rate": 4.5979158584087276e-05, + "loss": 1.0139, + "step": 259100 + }, + { + "epoch": 4.022408789708096, + "grad_norm": 2.463869333267212, + "learning_rate": 4.5977606728844334e-05, + "loss": 1.0156, + "step": 259200 + }, + { + "epoch": 4.023960644951039, + "grad_norm": 2.294227123260498, + "learning_rate": 4.597605487360139e-05, + "loss": 1.0073, + "step": 259300 + }, + { + "epoch": 4.025512500193982, + "grad_norm": 2.375056266784668, + "learning_rate": 4.597450301835845e-05, + "loss": 1.0253, + "step": 259400 + }, + { + "epoch": 4.0270643554369245, + "grad_norm": 2.5101332664489746, + "learning_rate": 4.597295116311551e-05, + "loss": 1.0256, + "step": 259500 + }, + { + "epoch": 4.028616210679868, + "grad_norm": 2.167787790298462, + "learning_rate": 4.5971399307872565e-05, + "loss": 0.9979, + "step": 259600 + }, + { + "epoch": 4.030168065922811, + "grad_norm": 1.9655741453170776, + "learning_rate": 4.596984745262962e-05, + "loss": 1.0224, + "step": 259700 + }, + { + "epoch": 4.031719921165753, + "grad_norm": 2.8912274837493896, + "learning_rate": 4.596829559738668e-05, + "loss": 1.006, + "step": 259800 + }, + { + "epoch": 4.0332717764086965, + "grad_norm": 2.127323627471924, + "learning_rate": 4.596674374214374e-05, + "loss": 1.0002, + "step": 259900 + }, + { + "epoch": 4.03482363165164, + "grad_norm": 2.308802843093872, + "learning_rate": 4.5965191886900796e-05, + "loss": 1.0108, + "step": 260000 + }, + { + "epoch": 4.036375486894583, + "grad_norm": 2.542405128479004, + "learning_rate": 4.5963640031657854e-05, + "loss": 1.0331, + "step": 260100 + }, + { + "epoch": 4.037927342137525, + "grad_norm": 1.9634498357772827, + "learning_rate": 4.5962088176414905e-05, + "loss": 0.9962, + "step": 260200 + }, + { + "epoch": 4.0394791973804685, + "grad_norm": 2.1191060543060303, + "learning_rate": 4.596053632117196e-05, + "loss": 0.9932, + "step": 260300 + }, + { + "epoch": 4.041031052623412, + "grad_norm": 2.114780902862549, + "learning_rate": 4.595898446592902e-05, + "loss": 0.9979, + "step": 260400 + }, + { + "epoch": 4.042582907866354, + "grad_norm": 2.1260523796081543, + "learning_rate": 4.595743261068608e-05, + "loss": 0.9973, + "step": 260500 + }, + { + "epoch": 4.044134763109297, + "grad_norm": 2.372663974761963, + "learning_rate": 4.5955880755443136e-05, + "loss": 0.9875, + "step": 260600 + }, + { + "epoch": 4.04568661835224, + "grad_norm": 2.4009511470794678, + "learning_rate": 4.5954328900200194e-05, + "loss": 0.9994, + "step": 260700 + }, + { + "epoch": 4.047238473595183, + "grad_norm": 1.8778742551803589, + "learning_rate": 4.5952777044957245e-05, + "loss": 1.0148, + "step": 260800 + }, + { + "epoch": 4.048790328838126, + "grad_norm": 2.228926181793213, + "learning_rate": 4.59512251897143e-05, + "loss": 0.989, + "step": 260900 + }, + { + "epoch": 4.050342184081069, + "grad_norm": 2.1706323623657227, + "learning_rate": 4.594967333447136e-05, + "loss": 1.0027, + "step": 261000 + }, + { + "epoch": 4.0518940393240115, + "grad_norm": 2.4426918029785156, + "learning_rate": 4.594812147922842e-05, + "loss": 0.9911, + "step": 261100 + }, + { + "epoch": 4.053445894566955, + "grad_norm": 2.161242723464966, + "learning_rate": 4.5946569623985476e-05, + "loss": 1.0056, + "step": 261200 + }, + { + "epoch": 4.054997749809898, + "grad_norm": 1.8238660097122192, + "learning_rate": 4.5945017768742533e-05, + "loss": 1.0124, + "step": 261300 + }, + { + "epoch": 4.05654960505284, + "grad_norm": 2.6080873012542725, + "learning_rate": 4.594346591349959e-05, + "loss": 1.0104, + "step": 261400 + }, + { + "epoch": 4.0581014602957834, + "grad_norm": 2.3893115520477295, + "learning_rate": 4.594191405825665e-05, + "loss": 0.9913, + "step": 261500 + }, + { + "epoch": 4.059653315538727, + "grad_norm": 2.1053595542907715, + "learning_rate": 4.594036220301371e-05, + "loss": 0.9868, + "step": 261600 + }, + { + "epoch": 4.06120517078167, + "grad_norm": 2.303849935531616, + "learning_rate": 4.593881034777076e-05, + "loss": 1.0127, + "step": 261700 + }, + { + "epoch": 4.062757026024612, + "grad_norm": 1.7979545593261719, + "learning_rate": 4.5937258492527815e-05, + "loss": 0.9976, + "step": 261800 + }, + { + "epoch": 4.064308881267555, + "grad_norm": 2.176025152206421, + "learning_rate": 4.593570663728487e-05, + "loss": 1.0364, + "step": 261900 + }, + { + "epoch": 4.065860736510499, + "grad_norm": 2.457047939300537, + "learning_rate": 4.593415478204193e-05, + "loss": 1.0102, + "step": 262000 + }, + { + "epoch": 4.067412591753441, + "grad_norm": 2.103670120239258, + "learning_rate": 4.593260292679899e-05, + "loss": 1.0176, + "step": 262100 + }, + { + "epoch": 4.068964446996384, + "grad_norm": 2.3812785148620605, + "learning_rate": 4.5931051071556046e-05, + "loss": 0.9834, + "step": 262200 + }, + { + "epoch": 4.070516302239327, + "grad_norm": 2.1732730865478516, + "learning_rate": 4.5929499216313104e-05, + "loss": 0.9911, + "step": 262300 + }, + { + "epoch": 4.07206815748227, + "grad_norm": 2.4105000495910645, + "learning_rate": 4.592794736107016e-05, + "loss": 1.0138, + "step": 262400 + }, + { + "epoch": 4.073620012725213, + "grad_norm": 2.6054818630218506, + "learning_rate": 4.592639550582722e-05, + "loss": 1.0001, + "step": 262500 + }, + { + "epoch": 4.075171867968156, + "grad_norm": 1.9678267240524292, + "learning_rate": 4.592484365058428e-05, + "loss": 1.005, + "step": 262600 + }, + { + "epoch": 4.076723723211098, + "grad_norm": 2.3858137130737305, + "learning_rate": 4.5923291795341335e-05, + "loss": 0.9852, + "step": 262700 + }, + { + "epoch": 4.078275578454042, + "grad_norm": 1.9260298013687134, + "learning_rate": 4.592173994009839e-05, + "loss": 0.9995, + "step": 262800 + }, + { + "epoch": 4.079827433696985, + "grad_norm": 2.237461566925049, + "learning_rate": 4.592018808485545e-05, + "loss": 1.002, + "step": 262900 + }, + { + "epoch": 4.081379288939928, + "grad_norm": 1.8933922052383423, + "learning_rate": 4.59186362296125e-05, + "loss": 1.0063, + "step": 263000 + }, + { + "epoch": 4.08293114418287, + "grad_norm": 2.222546339035034, + "learning_rate": 4.591708437436956e-05, + "loss": 0.9858, + "step": 263100 + }, + { + "epoch": 4.084482999425814, + "grad_norm": 2.262171745300293, + "learning_rate": 4.591553251912662e-05, + "loss": 0.9998, + "step": 263200 + }, + { + "epoch": 4.086034854668757, + "grad_norm": 2.122223377227783, + "learning_rate": 4.5913980663883675e-05, + "loss": 0.9885, + "step": 263300 + }, + { + "epoch": 4.087586709911699, + "grad_norm": 2.471280097961426, + "learning_rate": 4.591242880864073e-05, + "loss": 1.0067, + "step": 263400 + }, + { + "epoch": 4.089138565154642, + "grad_norm": 2.067857027053833, + "learning_rate": 4.591087695339779e-05, + "loss": 0.9984, + "step": 263500 + }, + { + "epoch": 4.090690420397586, + "grad_norm": 2.6935949325561523, + "learning_rate": 4.590932509815485e-05, + "loss": 1.0045, + "step": 263600 + }, + { + "epoch": 4.092242275640528, + "grad_norm": 1.9788835048675537, + "learning_rate": 4.5907773242911906e-05, + "loss": 1.0129, + "step": 263700 + }, + { + "epoch": 4.093794130883471, + "grad_norm": 2.317003011703491, + "learning_rate": 4.5906221387668964e-05, + "loss": 1.0158, + "step": 263800 + }, + { + "epoch": 4.095345986126414, + "grad_norm": 2.620793104171753, + "learning_rate": 4.590466953242602e-05, + "loss": 1.0176, + "step": 263900 + }, + { + "epoch": 4.096897841369357, + "grad_norm": 2.2395946979522705, + "learning_rate": 4.590311767718308e-05, + "loss": 1.0171, + "step": 264000 + }, + { + "epoch": 4.0984496966123, + "grad_norm": 2.3391828536987305, + "learning_rate": 4.590156582194013e-05, + "loss": 1.0139, + "step": 264100 + }, + { + "epoch": 4.100001551855243, + "grad_norm": 2.5829570293426514, + "learning_rate": 4.590001396669719e-05, + "loss": 1.004, + "step": 264200 + }, + { + "epoch": 4.101553407098186, + "grad_norm": 1.987902283668518, + "learning_rate": 4.5898462111454246e-05, + "loss": 0.9995, + "step": 264300 + }, + { + "epoch": 4.103105262341129, + "grad_norm": 2.1167943477630615, + "learning_rate": 4.5896910256211303e-05, + "loss": 1.0284, + "step": 264400 + }, + { + "epoch": 4.104657117584072, + "grad_norm": 2.583871364593506, + "learning_rate": 4.589535840096836e-05, + "loss": 1.0008, + "step": 264500 + }, + { + "epoch": 4.106208972827015, + "grad_norm": 2.3861241340637207, + "learning_rate": 4.589380654572541e-05, + "loss": 1.014, + "step": 264600 + }, + { + "epoch": 4.107760828069957, + "grad_norm": 2.2229995727539062, + "learning_rate": 4.589225469048247e-05, + "loss": 1.0051, + "step": 264700 + }, + { + "epoch": 4.109312683312901, + "grad_norm": 1.8920927047729492, + "learning_rate": 4.589070283523953e-05, + "loss": 1.0078, + "step": 264800 + }, + { + "epoch": 4.110864538555844, + "grad_norm": 2.1924002170562744, + "learning_rate": 4.5889150979996585e-05, + "loss": 0.9962, + "step": 264900 + }, + { + "epoch": 4.112416393798786, + "grad_norm": 2.237762212753296, + "learning_rate": 4.588759912475364e-05, + "loss": 1.0097, + "step": 265000 + }, + { + "epoch": 4.113968249041729, + "grad_norm": 2.4558346271514893, + "learning_rate": 4.58860472695107e-05, + "loss": 1.0142, + "step": 265100 + }, + { + "epoch": 4.1155201042846725, + "grad_norm": 2.175586700439453, + "learning_rate": 4.588449541426776e-05, + "loss": 1.0048, + "step": 265200 + }, + { + "epoch": 4.117071959527615, + "grad_norm": 2.0444562435150146, + "learning_rate": 4.5882943559024816e-05, + "loss": 1.0, + "step": 265300 + }, + { + "epoch": 4.118623814770558, + "grad_norm": 2.110651731491089, + "learning_rate": 4.5881391703781874e-05, + "loss": 1.0183, + "step": 265400 + }, + { + "epoch": 4.120175670013501, + "grad_norm": 1.7949814796447754, + "learning_rate": 4.587983984853893e-05, + "loss": 1.0086, + "step": 265500 + }, + { + "epoch": 4.1217275252564445, + "grad_norm": 1.9625879526138306, + "learning_rate": 4.587828799329599e-05, + "loss": 1.0165, + "step": 265600 + }, + { + "epoch": 4.123279380499387, + "grad_norm": 1.9224804639816284, + "learning_rate": 4.587673613805305e-05, + "loss": 0.9843, + "step": 265700 + }, + { + "epoch": 4.12483123574233, + "grad_norm": 2.26285982131958, + "learning_rate": 4.5875184282810105e-05, + "loss": 1.0028, + "step": 265800 + }, + { + "epoch": 4.126383090985273, + "grad_norm": 2.680020809173584, + "learning_rate": 4.5873632427567156e-05, + "loss": 1.0128, + "step": 265900 + }, + { + "epoch": 4.1279349462282156, + "grad_norm": 2.2132368087768555, + "learning_rate": 4.5872080572324214e-05, + "loss": 0.9966, + "step": 266000 + }, + { + "epoch": 4.129486801471159, + "grad_norm": 2.2778048515319824, + "learning_rate": 4.587052871708127e-05, + "loss": 1.0001, + "step": 266100 + }, + { + "epoch": 4.131038656714102, + "grad_norm": 2.3839142322540283, + "learning_rate": 4.586897686183833e-05, + "loss": 1.005, + "step": 266200 + }, + { + "epoch": 4.132590511957044, + "grad_norm": 2.1487812995910645, + "learning_rate": 4.586742500659539e-05, + "loss": 1.0014, + "step": 266300 + }, + { + "epoch": 4.1341423671999875, + "grad_norm": 2.3841116428375244, + "learning_rate": 4.5865873151352445e-05, + "loss": 0.9951, + "step": 266400 + }, + { + "epoch": 4.135694222442931, + "grad_norm": 2.0241899490356445, + "learning_rate": 4.58643212961095e-05, + "loss": 0.9868, + "step": 266500 + }, + { + "epoch": 4.137246077685873, + "grad_norm": 2.3595705032348633, + "learning_rate": 4.586276944086656e-05, + "loss": 1.0066, + "step": 266600 + }, + { + "epoch": 4.138797932928816, + "grad_norm": 2.4611387252807617, + "learning_rate": 4.586121758562362e-05, + "loss": 0.9933, + "step": 266700 + }, + { + "epoch": 4.1403497881717595, + "grad_norm": 2.13482666015625, + "learning_rate": 4.5859665730380676e-05, + "loss": 1.022, + "step": 266800 + }, + { + "epoch": 4.141901643414703, + "grad_norm": 1.967960238456726, + "learning_rate": 4.5858113875137734e-05, + "loss": 1.0124, + "step": 266900 + }, + { + "epoch": 4.143453498657645, + "grad_norm": 2.3471012115478516, + "learning_rate": 4.585656201989479e-05, + "loss": 1.0231, + "step": 267000 + }, + { + "epoch": 4.145005353900588, + "grad_norm": 2.416116237640381, + "learning_rate": 4.585501016465185e-05, + "loss": 1.011, + "step": 267100 + }, + { + "epoch": 4.146557209143531, + "grad_norm": 2.276646852493286, + "learning_rate": 4.58534583094089e-05, + "loss": 0.9942, + "step": 267200 + }, + { + "epoch": 4.148109064386474, + "grad_norm": 2.473435878753662, + "learning_rate": 4.585190645416596e-05, + "loss": 1.0015, + "step": 267300 + }, + { + "epoch": 4.149660919629417, + "grad_norm": 2.810506582260132, + "learning_rate": 4.585035459892301e-05, + "loss": 1.0061, + "step": 267400 + }, + { + "epoch": 4.15121277487236, + "grad_norm": 2.3710923194885254, + "learning_rate": 4.584880274368007e-05, + "loss": 1.0058, + "step": 267500 + }, + { + "epoch": 4.1527646301153025, + "grad_norm": 2.1666676998138428, + "learning_rate": 4.5847250888437124e-05, + "loss": 0.9779, + "step": 267600 + }, + { + "epoch": 4.154316485358246, + "grad_norm": 2.4987688064575195, + "learning_rate": 4.584569903319418e-05, + "loss": 1.0247, + "step": 267700 + }, + { + "epoch": 4.155868340601189, + "grad_norm": 2.13677978515625, + "learning_rate": 4.584414717795124e-05, + "loss": 0.9948, + "step": 267800 + }, + { + "epoch": 4.157420195844131, + "grad_norm": 1.9034311771392822, + "learning_rate": 4.58425953227083e-05, + "loss": 0.987, + "step": 267900 + }, + { + "epoch": 4.1589720510870745, + "grad_norm": 2.2017834186553955, + "learning_rate": 4.5841043467465355e-05, + "loss": 0.9963, + "step": 268000 + }, + { + "epoch": 4.160523906330018, + "grad_norm": 1.7424348592758179, + "learning_rate": 4.583949161222241e-05, + "loss": 1.0013, + "step": 268100 + }, + { + "epoch": 4.162075761572961, + "grad_norm": 2.314422369003296, + "learning_rate": 4.583793975697947e-05, + "loss": 1.0111, + "step": 268200 + }, + { + "epoch": 4.163627616815903, + "grad_norm": 2.1232736110687256, + "learning_rate": 4.583638790173653e-05, + "loss": 1.0087, + "step": 268300 + }, + { + "epoch": 4.165179472058846, + "grad_norm": 2.373929738998413, + "learning_rate": 4.5834836046493586e-05, + "loss": 0.9955, + "step": 268400 + }, + { + "epoch": 4.16673132730179, + "grad_norm": 2.3503201007843018, + "learning_rate": 4.5833284191250644e-05, + "loss": 0.9906, + "step": 268500 + }, + { + "epoch": 4.168283182544732, + "grad_norm": 2.191762924194336, + "learning_rate": 4.58317323360077e-05, + "loss": 0.9964, + "step": 268600 + }, + { + "epoch": 4.169835037787675, + "grad_norm": 1.852673888206482, + "learning_rate": 4.583018048076475e-05, + "loss": 1.0132, + "step": 268700 + }, + { + "epoch": 4.171386893030618, + "grad_norm": 2.5372464656829834, + "learning_rate": 4.582862862552181e-05, + "loss": 1.0212, + "step": 268800 + }, + { + "epoch": 4.172938748273561, + "grad_norm": 2.0085675716400146, + "learning_rate": 4.582707677027887e-05, + "loss": 0.9946, + "step": 268900 + }, + { + "epoch": 4.174490603516504, + "grad_norm": 2.2607789039611816, + "learning_rate": 4.5825524915035926e-05, + "loss": 1.0125, + "step": 269000 + }, + { + "epoch": 4.176042458759447, + "grad_norm": 2.4367029666900635, + "learning_rate": 4.5823973059792984e-05, + "loss": 1.0167, + "step": 269100 + }, + { + "epoch": 4.1775943140023895, + "grad_norm": 2.8601229190826416, + "learning_rate": 4.582242120455004e-05, + "loss": 0.9838, + "step": 269200 + }, + { + "epoch": 4.179146169245333, + "grad_norm": 2.394378662109375, + "learning_rate": 4.58208693493071e-05, + "loss": 1.004, + "step": 269300 + }, + { + "epoch": 4.180698024488276, + "grad_norm": 2.2384700775146484, + "learning_rate": 4.581931749406416e-05, + "loss": 0.9967, + "step": 269400 + }, + { + "epoch": 4.182249879731219, + "grad_norm": 2.6478142738342285, + "learning_rate": 4.5817765638821215e-05, + "loss": 1.0079, + "step": 269500 + }, + { + "epoch": 4.183801734974161, + "grad_norm": 2.617509365081787, + "learning_rate": 4.581621378357827e-05, + "loss": 0.9938, + "step": 269600 + }, + { + "epoch": 4.185353590217105, + "grad_norm": 2.2071099281311035, + "learning_rate": 4.581466192833533e-05, + "loss": 1.0259, + "step": 269700 + }, + { + "epoch": 4.186905445460048, + "grad_norm": 1.9501489400863647, + "learning_rate": 4.581311007309239e-05, + "loss": 0.9782, + "step": 269800 + }, + { + "epoch": 4.18845730070299, + "grad_norm": 2.1013565063476562, + "learning_rate": 4.5811558217849446e-05, + "loss": 0.9917, + "step": 269900 + }, + { + "epoch": 4.190009155945933, + "grad_norm": 2.2761118412017822, + "learning_rate": 4.58100063626065e-05, + "loss": 1.008, + "step": 270000 + }, + { + "epoch": 4.191561011188877, + "grad_norm": 2.5501368045806885, + "learning_rate": 4.5808454507363555e-05, + "loss": 1.005, + "step": 270100 + }, + { + "epoch": 4.193112866431819, + "grad_norm": 2.3103885650634766, + "learning_rate": 4.580690265212061e-05, + "loss": 0.9972, + "step": 270200 + }, + { + "epoch": 4.194664721674762, + "grad_norm": 2.1419904232025146, + "learning_rate": 4.580535079687767e-05, + "loss": 1.004, + "step": 270300 + }, + { + "epoch": 4.196216576917705, + "grad_norm": 2.082042694091797, + "learning_rate": 4.580379894163473e-05, + "loss": 1.0084, + "step": 270400 + }, + { + "epoch": 4.197768432160648, + "grad_norm": 2.577298402786255, + "learning_rate": 4.5802247086391786e-05, + "loss": 0.9993, + "step": 270500 + }, + { + "epoch": 4.199320287403591, + "grad_norm": 2.507154703140259, + "learning_rate": 4.580069523114884e-05, + "loss": 1.0134, + "step": 270600 + }, + { + "epoch": 4.200872142646534, + "grad_norm": 2.416017770767212, + "learning_rate": 4.5799143375905894e-05, + "loss": 1.0128, + "step": 270700 + }, + { + "epoch": 4.202423997889477, + "grad_norm": 1.842239499092102, + "learning_rate": 4.579759152066295e-05, + "loss": 1.0115, + "step": 270800 + }, + { + "epoch": 4.20397585313242, + "grad_norm": 1.9943252801895142, + "learning_rate": 4.579603966542001e-05, + "loss": 0.9845, + "step": 270900 + }, + { + "epoch": 4.205527708375363, + "grad_norm": 1.9691641330718994, + "learning_rate": 4.579448781017707e-05, + "loss": 1.0245, + "step": 271000 + }, + { + "epoch": 4.207079563618306, + "grad_norm": 2.22060227394104, + "learning_rate": 4.5792935954934125e-05, + "loss": 1.0037, + "step": 271100 + }, + { + "epoch": 4.208631418861248, + "grad_norm": 2.158911943435669, + "learning_rate": 4.579138409969118e-05, + "loss": 0.9987, + "step": 271200 + }, + { + "epoch": 4.210183274104192, + "grad_norm": 2.005728244781494, + "learning_rate": 4.578983224444824e-05, + "loss": 0.9958, + "step": 271300 + }, + { + "epoch": 4.211735129347135, + "grad_norm": 2.0646462440490723, + "learning_rate": 4.57882803892053e-05, + "loss": 0.99, + "step": 271400 + }, + { + "epoch": 4.213286984590077, + "grad_norm": 2.2937633991241455, + "learning_rate": 4.5786728533962356e-05, + "loss": 1.0077, + "step": 271500 + }, + { + "epoch": 4.21483883983302, + "grad_norm": 2.073124885559082, + "learning_rate": 4.578517667871941e-05, + "loss": 1.0018, + "step": 271600 + }, + { + "epoch": 4.2163906950759635, + "grad_norm": 2.7373855113983154, + "learning_rate": 4.5783624823476465e-05, + "loss": 0.9897, + "step": 271700 + }, + { + "epoch": 4.217942550318906, + "grad_norm": 2.8275275230407715, + "learning_rate": 4.578207296823352e-05, + "loss": 1.0004, + "step": 271800 + }, + { + "epoch": 4.219494405561849, + "grad_norm": 2.43312931060791, + "learning_rate": 4.578052111299058e-05, + "loss": 1.0084, + "step": 271900 + }, + { + "epoch": 4.221046260804792, + "grad_norm": 2.6437008380889893, + "learning_rate": 4.577896925774764e-05, + "loss": 1.006, + "step": 272000 + }, + { + "epoch": 4.2225981160477355, + "grad_norm": 2.0690057277679443, + "learning_rate": 4.5777417402504696e-05, + "loss": 1.0017, + "step": 272100 + }, + { + "epoch": 4.224149971290678, + "grad_norm": 2.295323133468628, + "learning_rate": 4.5775865547261754e-05, + "loss": 0.9817, + "step": 272200 + }, + { + "epoch": 4.225701826533621, + "grad_norm": 2.1846346855163574, + "learning_rate": 4.577431369201881e-05, + "loss": 0.9995, + "step": 272300 + }, + { + "epoch": 4.227253681776564, + "grad_norm": 2.26727032661438, + "learning_rate": 4.577276183677587e-05, + "loss": 1.0049, + "step": 272400 + }, + { + "epoch": 4.228805537019507, + "grad_norm": 1.7309058904647827, + "learning_rate": 4.577120998153293e-05, + "loss": 1.0125, + "step": 272500 + }, + { + "epoch": 4.23035739226245, + "grad_norm": 2.034524440765381, + "learning_rate": 4.5769658126289985e-05, + "loss": 0.994, + "step": 272600 + }, + { + "epoch": 4.231909247505393, + "grad_norm": 2.2921416759490967, + "learning_rate": 4.576810627104704e-05, + "loss": 0.9927, + "step": 272700 + }, + { + "epoch": 4.233461102748335, + "grad_norm": 2.246654987335205, + "learning_rate": 4.57665544158041e-05, + "loss": 1.0003, + "step": 272800 + }, + { + "epoch": 4.2350129579912785, + "grad_norm": 2.1161704063415527, + "learning_rate": 4.576500256056115e-05, + "loss": 0.9972, + "step": 272900 + }, + { + "epoch": 4.236564813234222, + "grad_norm": 2.2752392292022705, + "learning_rate": 4.576345070531821e-05, + "loss": 0.9959, + "step": 273000 + }, + { + "epoch": 4.238116668477164, + "grad_norm": 2.1410627365112305, + "learning_rate": 4.576189885007527e-05, + "loss": 1.0011, + "step": 273100 + }, + { + "epoch": 4.239668523720107, + "grad_norm": 2.3855819702148438, + "learning_rate": 4.5760346994832325e-05, + "loss": 1.0142, + "step": 273200 + }, + { + "epoch": 4.2412203789630505, + "grad_norm": 2.5783579349517822, + "learning_rate": 4.575879513958938e-05, + "loss": 1.0061, + "step": 273300 + }, + { + "epoch": 4.242772234205994, + "grad_norm": 2.3426921367645264, + "learning_rate": 4.575724328434644e-05, + "loss": 1.0072, + "step": 273400 + }, + { + "epoch": 4.244324089448936, + "grad_norm": 1.7720756530761719, + "learning_rate": 4.57556914291035e-05, + "loss": 0.9977, + "step": 273500 + }, + { + "epoch": 4.245875944691879, + "grad_norm": 2.1338977813720703, + "learning_rate": 4.5754139573860556e-05, + "loss": 1.0035, + "step": 273600 + }, + { + "epoch": 4.2474277999348224, + "grad_norm": 2.3656187057495117, + "learning_rate": 4.5752587718617613e-05, + "loss": 0.9986, + "step": 273700 + }, + { + "epoch": 4.248979655177765, + "grad_norm": 2.384273052215576, + "learning_rate": 4.5751035863374664e-05, + "loss": 0.9988, + "step": 273800 + }, + { + "epoch": 4.250531510420708, + "grad_norm": 2.058539390563965, + "learning_rate": 4.574948400813172e-05, + "loss": 0.9896, + "step": 273900 + }, + { + "epoch": 4.252083365663651, + "grad_norm": 1.7449809312820435, + "learning_rate": 4.574793215288878e-05, + "loss": 0.9869, + "step": 274000 + }, + { + "epoch": 4.2536352209065935, + "grad_norm": 2.3793601989746094, + "learning_rate": 4.574638029764584e-05, + "loss": 1.0008, + "step": 274100 + }, + { + "epoch": 4.255187076149537, + "grad_norm": 2.0956764221191406, + "learning_rate": 4.5744828442402895e-05, + "loss": 1.0047, + "step": 274200 + }, + { + "epoch": 4.25673893139248, + "grad_norm": 2.55118727684021, + "learning_rate": 4.574327658715995e-05, + "loss": 0.9955, + "step": 274300 + }, + { + "epoch": 4.258290786635422, + "grad_norm": 2.321115255355835, + "learning_rate": 4.5741724731917004e-05, + "loss": 0.9962, + "step": 274400 + }, + { + "epoch": 4.2598426418783655, + "grad_norm": 2.1780483722686768, + "learning_rate": 4.574017287667406e-05, + "loss": 0.9975, + "step": 274500 + }, + { + "epoch": 4.261394497121309, + "grad_norm": 2.335296630859375, + "learning_rate": 4.573862102143112e-05, + "loss": 1.0023, + "step": 274600 + }, + { + "epoch": 4.262946352364251, + "grad_norm": 2.26134991645813, + "learning_rate": 4.573706916618818e-05, + "loss": 0.9976, + "step": 274700 + }, + { + "epoch": 4.264498207607194, + "grad_norm": 2.400446891784668, + "learning_rate": 4.5735517310945235e-05, + "loss": 0.9804, + "step": 274800 + }, + { + "epoch": 4.266050062850137, + "grad_norm": 2.2282209396362305, + "learning_rate": 4.573396545570229e-05, + "loss": 1.0262, + "step": 274900 + }, + { + "epoch": 4.267601918093081, + "grad_norm": 2.1054229736328125, + "learning_rate": 4.573241360045935e-05, + "loss": 0.9946, + "step": 275000 + }, + { + "epoch": 4.269153773336023, + "grad_norm": 2.3565878868103027, + "learning_rate": 4.573086174521641e-05, + "loss": 1.0141, + "step": 275100 + }, + { + "epoch": 4.270705628578966, + "grad_norm": 2.2007217407226562, + "learning_rate": 4.5729309889973466e-05, + "loss": 0.9891, + "step": 275200 + }, + { + "epoch": 4.272257483821909, + "grad_norm": 1.6125657558441162, + "learning_rate": 4.5727758034730524e-05, + "loss": 0.9873, + "step": 275300 + }, + { + "epoch": 4.273809339064852, + "grad_norm": 2.150374412536621, + "learning_rate": 4.572620617948758e-05, + "loss": 0.9911, + "step": 275400 + }, + { + "epoch": 4.275361194307795, + "grad_norm": 2.2643942832946777, + "learning_rate": 4.572465432424464e-05, + "loss": 0.9994, + "step": 275500 + }, + { + "epoch": 4.276913049550738, + "grad_norm": 2.4527463912963867, + "learning_rate": 4.57231024690017e-05, + "loss": 1.0159, + "step": 275600 + }, + { + "epoch": 4.2784649047936805, + "grad_norm": 2.031367540359497, + "learning_rate": 4.572155061375875e-05, + "loss": 0.977, + "step": 275700 + }, + { + "epoch": 4.280016760036624, + "grad_norm": 2.356466054916382, + "learning_rate": 4.5719998758515806e-05, + "loss": 1.0154, + "step": 275800 + }, + { + "epoch": 4.281568615279567, + "grad_norm": 2.7374279499053955, + "learning_rate": 4.5718446903272864e-05, + "loss": 0.9855, + "step": 275900 + }, + { + "epoch": 4.28312047052251, + "grad_norm": 1.8986940383911133, + "learning_rate": 4.571689504802992e-05, + "loss": 0.9983, + "step": 276000 + }, + { + "epoch": 4.284672325765452, + "grad_norm": 2.3046984672546387, + "learning_rate": 4.571534319278698e-05, + "loss": 0.994, + "step": 276100 + }, + { + "epoch": 4.286224181008396, + "grad_norm": 2.569552421569824, + "learning_rate": 4.571379133754404e-05, + "loss": 0.9994, + "step": 276200 + }, + { + "epoch": 4.287776036251339, + "grad_norm": 2.2096495628356934, + "learning_rate": 4.5712239482301095e-05, + "loss": 1.0082, + "step": 276300 + }, + { + "epoch": 4.289327891494281, + "grad_norm": 2.335878372192383, + "learning_rate": 4.571068762705815e-05, + "loss": 0.9885, + "step": 276400 + }, + { + "epoch": 4.290879746737224, + "grad_norm": 2.1121490001678467, + "learning_rate": 4.570913577181521e-05, + "loss": 1.0162, + "step": 276500 + }, + { + "epoch": 4.292431601980168, + "grad_norm": 3.2806036472320557, + "learning_rate": 4.570758391657227e-05, + "loss": 0.9987, + "step": 276600 + }, + { + "epoch": 4.29398345722311, + "grad_norm": 2.2483205795288086, + "learning_rate": 4.5706032061329326e-05, + "loss": 1.0087, + "step": 276700 + }, + { + "epoch": 4.295535312466053, + "grad_norm": 2.3774726390838623, + "learning_rate": 4.5704480206086383e-05, + "loss": 1.0039, + "step": 276800 + }, + { + "epoch": 4.297087167708996, + "grad_norm": 2.5040836334228516, + "learning_rate": 4.570292835084344e-05, + "loss": 1.0008, + "step": 276900 + }, + { + "epoch": 4.298639022951939, + "grad_norm": 2.234832525253296, + "learning_rate": 4.570137649560049e-05, + "loss": 0.9856, + "step": 277000 + }, + { + "epoch": 4.300190878194882, + "grad_norm": 2.3915202617645264, + "learning_rate": 4.569982464035755e-05, + "loss": 1.0277, + "step": 277100 + }, + { + "epoch": 4.301742733437825, + "grad_norm": 2.0684545040130615, + "learning_rate": 4.56982727851146e-05, + "loss": 0.992, + "step": 277200 + }, + { + "epoch": 4.303294588680767, + "grad_norm": 2.7259724140167236, + "learning_rate": 4.569672092987166e-05, + "loss": 0.9959, + "step": 277300 + }, + { + "epoch": 4.304846443923711, + "grad_norm": 2.1833019256591797, + "learning_rate": 4.5695169074628716e-05, + "loss": 1.0151, + "step": 277400 + }, + { + "epoch": 4.306398299166654, + "grad_norm": 2.460402011871338, + "learning_rate": 4.5693617219385774e-05, + "loss": 1.0078, + "step": 277500 + }, + { + "epoch": 4.307950154409597, + "grad_norm": 2.342869520187378, + "learning_rate": 4.569206536414283e-05, + "loss": 0.9922, + "step": 277600 + }, + { + "epoch": 4.309502009652539, + "grad_norm": 2.3955397605895996, + "learning_rate": 4.569051350889989e-05, + "loss": 0.981, + "step": 277700 + }, + { + "epoch": 4.311053864895483, + "grad_norm": 2.3654611110687256, + "learning_rate": 4.568896165365695e-05, + "loss": 0.9956, + "step": 277800 + }, + { + "epoch": 4.312605720138426, + "grad_norm": 2.312753915786743, + "learning_rate": 4.5687409798414005e-05, + "loss": 0.9896, + "step": 277900 + }, + { + "epoch": 4.314157575381368, + "grad_norm": 3.2461416721343994, + "learning_rate": 4.568585794317106e-05, + "loss": 0.9784, + "step": 278000 + }, + { + "epoch": 4.315709430624311, + "grad_norm": 2.4776813983917236, + "learning_rate": 4.568430608792812e-05, + "loss": 0.9959, + "step": 278100 + }, + { + "epoch": 4.3172612858672545, + "grad_norm": 2.307433605194092, + "learning_rate": 4.568275423268518e-05, + "loss": 1.01, + "step": 278200 + }, + { + "epoch": 4.318813141110197, + "grad_norm": 2.2424583435058594, + "learning_rate": 4.5681202377442236e-05, + "loss": 0.9998, + "step": 278300 + }, + { + "epoch": 4.32036499635314, + "grad_norm": 2.229107141494751, + "learning_rate": 4.5679650522199294e-05, + "loss": 1.0093, + "step": 278400 + }, + { + "epoch": 4.321916851596083, + "grad_norm": 2.090376853942871, + "learning_rate": 4.5678098666956345e-05, + "loss": 0.986, + "step": 278500 + }, + { + "epoch": 4.3234687068390265, + "grad_norm": 2.31658673286438, + "learning_rate": 4.56765468117134e-05, + "loss": 0.9978, + "step": 278600 + }, + { + "epoch": 4.325020562081969, + "grad_norm": 2.3370983600616455, + "learning_rate": 4.567499495647046e-05, + "loss": 1.0239, + "step": 278700 + }, + { + "epoch": 4.326572417324912, + "grad_norm": 1.6979576349258423, + "learning_rate": 4.567344310122752e-05, + "loss": 1.0087, + "step": 278800 + }, + { + "epoch": 4.328124272567855, + "grad_norm": 2.154461145401001, + "learning_rate": 4.5671891245984576e-05, + "loss": 0.9954, + "step": 278900 + }, + { + "epoch": 4.329676127810798, + "grad_norm": 2.2807297706604004, + "learning_rate": 4.5670339390741634e-05, + "loss": 0.9911, + "step": 279000 + }, + { + "epoch": 4.331227983053741, + "grad_norm": 2.143202066421509, + "learning_rate": 4.566878753549869e-05, + "loss": 1.0176, + "step": 279100 + }, + { + "epoch": 4.332779838296684, + "grad_norm": 1.9185640811920166, + "learning_rate": 4.566723568025575e-05, + "loss": 0.9845, + "step": 279200 + }, + { + "epoch": 4.334331693539626, + "grad_norm": 2.1047873497009277, + "learning_rate": 4.566568382501281e-05, + "loss": 0.9939, + "step": 279300 + }, + { + "epoch": 4.3358835487825695, + "grad_norm": 2.0561420917510986, + "learning_rate": 4.5664131969769865e-05, + "loss": 1.0122, + "step": 279400 + }, + { + "epoch": 4.337435404025513, + "grad_norm": 2.3428125381469727, + "learning_rate": 4.566258011452692e-05, + "loss": 0.9925, + "step": 279500 + }, + { + "epoch": 4.338987259268455, + "grad_norm": 2.0884201526641846, + "learning_rate": 4.566102825928398e-05, + "loss": 1.0204, + "step": 279600 + }, + { + "epoch": 4.340539114511398, + "grad_norm": 2.344045639038086, + "learning_rate": 4.565947640404104e-05, + "loss": 1.0065, + "step": 279700 + }, + { + "epoch": 4.3420909697543415, + "grad_norm": 2.1906960010528564, + "learning_rate": 4.565792454879809e-05, + "loss": 0.998, + "step": 279800 + }, + { + "epoch": 4.343642824997284, + "grad_norm": 2.172529935836792, + "learning_rate": 4.565637269355515e-05, + "loss": 0.9874, + "step": 279900 + }, + { + "epoch": 4.345194680240227, + "grad_norm": 2.2087390422821045, + "learning_rate": 4.5654820838312204e-05, + "loss": 1.0095, + "step": 280000 + }, + { + "epoch": 4.34674653548317, + "grad_norm": 2.0882227420806885, + "learning_rate": 4.565326898306926e-05, + "loss": 0.9951, + "step": 280100 + }, + { + "epoch": 4.3482983907261135, + "grad_norm": 2.5011677742004395, + "learning_rate": 4.565171712782632e-05, + "loss": 1.0165, + "step": 280200 + }, + { + "epoch": 4.349850245969056, + "grad_norm": 2.17232084274292, + "learning_rate": 4.565016527258338e-05, + "loss": 0.9878, + "step": 280300 + }, + { + "epoch": 4.351402101211999, + "grad_norm": 2.1016626358032227, + "learning_rate": 4.564861341734043e-05, + "loss": 0.9951, + "step": 280400 + }, + { + "epoch": 4.352953956454942, + "grad_norm": 2.209808588027954, + "learning_rate": 4.5647061562097486e-05, + "loss": 1.0023, + "step": 280500 + }, + { + "epoch": 4.3545058116978845, + "grad_norm": 2.129488706588745, + "learning_rate": 4.5645509706854544e-05, + "loss": 1.003, + "step": 280600 + }, + { + "epoch": 4.356057666940828, + "grad_norm": 2.3624589443206787, + "learning_rate": 4.56439578516116e-05, + "loss": 1.0079, + "step": 280700 + }, + { + "epoch": 4.357609522183771, + "grad_norm": 2.0404021739959717, + "learning_rate": 4.564240599636866e-05, + "loss": 1.02, + "step": 280800 + }, + { + "epoch": 4.359161377426713, + "grad_norm": 1.9726099967956543, + "learning_rate": 4.564085414112572e-05, + "loss": 1.0063, + "step": 280900 + }, + { + "epoch": 4.3607132326696565, + "grad_norm": 2.4254438877105713, + "learning_rate": 4.5639302285882775e-05, + "loss": 1.004, + "step": 281000 + }, + { + "epoch": 4.3622650879126, + "grad_norm": 2.4621245861053467, + "learning_rate": 4.563775043063983e-05, + "loss": 1.0091, + "step": 281100 + }, + { + "epoch": 4.363816943155543, + "grad_norm": 2.6106221675872803, + "learning_rate": 4.563619857539689e-05, + "loss": 0.9977, + "step": 281200 + }, + { + "epoch": 4.365368798398485, + "grad_norm": 2.1493639945983887, + "learning_rate": 4.563464672015395e-05, + "loss": 1.0162, + "step": 281300 + }, + { + "epoch": 4.3669206536414285, + "grad_norm": 2.2255125045776367, + "learning_rate": 4.5633094864911e-05, + "loss": 1.0134, + "step": 281400 + }, + { + "epoch": 4.368472508884372, + "grad_norm": 2.635594129562378, + "learning_rate": 4.563154300966806e-05, + "loss": 1.012, + "step": 281500 + }, + { + "epoch": 4.370024364127314, + "grad_norm": 2.3431122303009033, + "learning_rate": 4.5629991154425115e-05, + "loss": 0.997, + "step": 281600 + }, + { + "epoch": 4.371576219370257, + "grad_norm": 2.3153669834136963, + "learning_rate": 4.562843929918217e-05, + "loss": 0.9958, + "step": 281700 + }, + { + "epoch": 4.3731280746132, + "grad_norm": 4.537783622741699, + "learning_rate": 4.562688744393923e-05, + "loss": 0.9947, + "step": 281800 + }, + { + "epoch": 4.374679929856143, + "grad_norm": 2.447627544403076, + "learning_rate": 4.562533558869629e-05, + "loss": 1.006, + "step": 281900 + }, + { + "epoch": 4.376231785099086, + "grad_norm": 2.10001277923584, + "learning_rate": 4.5623783733453346e-05, + "loss": 0.9723, + "step": 282000 + }, + { + "epoch": 4.377783640342029, + "grad_norm": 2.1870005130767822, + "learning_rate": 4.5622231878210404e-05, + "loss": 0.9942, + "step": 282100 + }, + { + "epoch": 4.3793354955849715, + "grad_norm": 2.346949815750122, + "learning_rate": 4.562068002296746e-05, + "loss": 1.0051, + "step": 282200 + }, + { + "epoch": 4.380887350827915, + "grad_norm": 2.1735942363739014, + "learning_rate": 4.561912816772452e-05, + "loss": 0.9875, + "step": 282300 + }, + { + "epoch": 4.382439206070858, + "grad_norm": 2.3719325065612793, + "learning_rate": 4.561757631248158e-05, + "loss": 1.0064, + "step": 282400 + }, + { + "epoch": 4.3839910613138, + "grad_norm": 2.253920555114746, + "learning_rate": 4.5616024457238635e-05, + "loss": 1.0154, + "step": 282500 + }, + { + "epoch": 4.385542916556743, + "grad_norm": 2.520620822906494, + "learning_rate": 4.561447260199569e-05, + "loss": 1.0141, + "step": 282600 + }, + { + "epoch": 4.387094771799687, + "grad_norm": 2.1080615520477295, + "learning_rate": 4.5612920746752743e-05, + "loss": 1.0136, + "step": 282700 + }, + { + "epoch": 4.38864662704263, + "grad_norm": 2.0734851360321045, + "learning_rate": 4.56113688915098e-05, + "loss": 0.9874, + "step": 282800 + }, + { + "epoch": 4.390198482285572, + "grad_norm": 2.5511300563812256, + "learning_rate": 4.560981703626686e-05, + "loss": 1.001, + "step": 282900 + }, + { + "epoch": 4.391750337528515, + "grad_norm": 2.716226577758789, + "learning_rate": 4.560826518102392e-05, + "loss": 0.9855, + "step": 283000 + }, + { + "epoch": 4.393302192771459, + "grad_norm": 2.402078151702881, + "learning_rate": 4.5606713325780974e-05, + "loss": 1.0057, + "step": 283100 + }, + { + "epoch": 4.394854048014401, + "grad_norm": 2.090461015701294, + "learning_rate": 4.560516147053803e-05, + "loss": 0.9867, + "step": 283200 + }, + { + "epoch": 4.396405903257344, + "grad_norm": 2.0088253021240234, + "learning_rate": 4.560360961529509e-05, + "loss": 0.9929, + "step": 283300 + }, + { + "epoch": 4.397957758500287, + "grad_norm": 2.3430347442626953, + "learning_rate": 4.560205776005215e-05, + "loss": 1.0067, + "step": 283400 + }, + { + "epoch": 4.39950961374323, + "grad_norm": 1.9524778127670288, + "learning_rate": 4.5600505904809205e-05, + "loss": 1.0175, + "step": 283500 + }, + { + "epoch": 4.401061468986173, + "grad_norm": 2.400179147720337, + "learning_rate": 4.5598954049566256e-05, + "loss": 1.0102, + "step": 283600 + }, + { + "epoch": 4.402613324229116, + "grad_norm": 1.9491347074508667, + "learning_rate": 4.5597402194323314e-05, + "loss": 0.9908, + "step": 283700 + }, + { + "epoch": 4.404165179472058, + "grad_norm": 2.32865571975708, + "learning_rate": 4.559585033908037e-05, + "loss": 0.9896, + "step": 283800 + }, + { + "epoch": 4.405717034715002, + "grad_norm": 1.8991413116455078, + "learning_rate": 4.559429848383743e-05, + "loss": 1.0114, + "step": 283900 + }, + { + "epoch": 4.407268889957945, + "grad_norm": 2.652299404144287, + "learning_rate": 4.559274662859449e-05, + "loss": 1.0057, + "step": 284000 + }, + { + "epoch": 4.408820745200888, + "grad_norm": 2.5429272651672363, + "learning_rate": 4.5591194773351545e-05, + "loss": 0.9826, + "step": 284100 + }, + { + "epoch": 4.41037260044383, + "grad_norm": 2.3713974952697754, + "learning_rate": 4.5589642918108596e-05, + "loss": 1.003, + "step": 284200 + }, + { + "epoch": 4.411924455686774, + "grad_norm": 1.9270364046096802, + "learning_rate": 4.5588091062865654e-05, + "loss": 1.0092, + "step": 284300 + }, + { + "epoch": 4.413476310929717, + "grad_norm": 2.2040717601776123, + "learning_rate": 4.558653920762271e-05, + "loss": 1.0128, + "step": 284400 + }, + { + "epoch": 4.415028166172659, + "grad_norm": 2.1370856761932373, + "learning_rate": 4.558498735237977e-05, + "loss": 0.9874, + "step": 284500 + }, + { + "epoch": 4.416580021415602, + "grad_norm": 1.8606032133102417, + "learning_rate": 4.558343549713683e-05, + "loss": 0.9766, + "step": 284600 + }, + { + "epoch": 4.418131876658546, + "grad_norm": 2.4526243209838867, + "learning_rate": 4.5581883641893885e-05, + "loss": 0.9875, + "step": 284700 + }, + { + "epoch": 4.419683731901488, + "grad_norm": 2.1099183559417725, + "learning_rate": 4.558033178665094e-05, + "loss": 0.9844, + "step": 284800 + }, + { + "epoch": 4.421235587144431, + "grad_norm": 2.3032143115997314, + "learning_rate": 4.5578779931408e-05, + "loss": 0.9873, + "step": 284900 + }, + { + "epoch": 4.422787442387374, + "grad_norm": 2.948617935180664, + "learning_rate": 4.557722807616506e-05, + "loss": 0.9921, + "step": 285000 + }, + { + "epoch": 4.424339297630317, + "grad_norm": 1.72019362449646, + "learning_rate": 4.5575676220922116e-05, + "loss": 0.9849, + "step": 285100 + }, + { + "epoch": 4.42589115287326, + "grad_norm": 2.4202182292938232, + "learning_rate": 4.5574124365679174e-05, + "loss": 1.0031, + "step": 285200 + }, + { + "epoch": 4.427443008116203, + "grad_norm": 2.4255611896514893, + "learning_rate": 4.557257251043623e-05, + "loss": 0.9786, + "step": 285300 + }, + { + "epoch": 4.428994863359146, + "grad_norm": 2.403902769088745, + "learning_rate": 4.557102065519329e-05, + "loss": 0.9986, + "step": 285400 + }, + { + "epoch": 4.430546718602089, + "grad_norm": 2.456495761871338, + "learning_rate": 4.556946879995034e-05, + "loss": 0.987, + "step": 285500 + }, + { + "epoch": 4.432098573845032, + "grad_norm": 2.1072821617126465, + "learning_rate": 4.55679169447074e-05, + "loss": 0.9976, + "step": 285600 + }, + { + "epoch": 4.433650429087975, + "grad_norm": 1.931612491607666, + "learning_rate": 4.5566365089464456e-05, + "loss": 1.0216, + "step": 285700 + }, + { + "epoch": 4.435202284330917, + "grad_norm": 2.1282880306243896, + "learning_rate": 4.5564813234221513e-05, + "loss": 0.9973, + "step": 285800 + }, + { + "epoch": 4.4367541395738606, + "grad_norm": 2.056551456451416, + "learning_rate": 4.556326137897857e-05, + "loss": 1.0079, + "step": 285900 + }, + { + "epoch": 4.438305994816804, + "grad_norm": 2.656358242034912, + "learning_rate": 4.556170952373563e-05, + "loss": 0.9771, + "step": 286000 + }, + { + "epoch": 4.439857850059746, + "grad_norm": 2.1979901790618896, + "learning_rate": 4.556015766849269e-05, + "loss": 0.9827, + "step": 286100 + }, + { + "epoch": 4.441409705302689, + "grad_norm": 1.9258511066436768, + "learning_rate": 4.5558605813249744e-05, + "loss": 0.9825, + "step": 286200 + }, + { + "epoch": 4.4429615605456325, + "grad_norm": 2.113699197769165, + "learning_rate": 4.55570539580068e-05, + "loss": 0.9846, + "step": 286300 + }, + { + "epoch": 4.444513415788575, + "grad_norm": 2.060818910598755, + "learning_rate": 4.555550210276386e-05, + "loss": 0.9953, + "step": 286400 + }, + { + "epoch": 4.446065271031518, + "grad_norm": 2.09224534034729, + "learning_rate": 4.555395024752092e-05, + "loss": 0.9928, + "step": 286500 + }, + { + "epoch": 4.447617126274461, + "grad_norm": 3.010362148284912, + "learning_rate": 4.5552398392277975e-05, + "loss": 0.994, + "step": 286600 + }, + { + "epoch": 4.4491689815174045, + "grad_norm": 2.243070125579834, + "learning_rate": 4.555084653703503e-05, + "loss": 1.0119, + "step": 286700 + }, + { + "epoch": 4.450720836760347, + "grad_norm": 2.002566337585449, + "learning_rate": 4.5549294681792084e-05, + "loss": 0.9866, + "step": 286800 + }, + { + "epoch": 4.45227269200329, + "grad_norm": 2.2695510387420654, + "learning_rate": 4.554774282654914e-05, + "loss": 0.9989, + "step": 286900 + }, + { + "epoch": 4.453824547246233, + "grad_norm": 1.9779748916625977, + "learning_rate": 4.55461909713062e-05, + "loss": 0.98, + "step": 287000 + }, + { + "epoch": 4.4553764024891755, + "grad_norm": 2.309352397918701, + "learning_rate": 4.554463911606325e-05, + "loss": 0.9898, + "step": 287100 + }, + { + "epoch": 4.456928257732119, + "grad_norm": 2.298234701156616, + "learning_rate": 4.554308726082031e-05, + "loss": 1.014, + "step": 287200 + }, + { + "epoch": 4.458480112975062, + "grad_norm": 2.090150833129883, + "learning_rate": 4.5541535405577366e-05, + "loss": 0.9757, + "step": 287300 + }, + { + "epoch": 4.460031968218004, + "grad_norm": 2.5438733100891113, + "learning_rate": 4.5539983550334424e-05, + "loss": 0.9778, + "step": 287400 + }, + { + "epoch": 4.4615838234609475, + "grad_norm": 2.3629579544067383, + "learning_rate": 4.553843169509148e-05, + "loss": 0.997, + "step": 287500 + }, + { + "epoch": 4.463135678703891, + "grad_norm": 2.2515103816986084, + "learning_rate": 4.553687983984854e-05, + "loss": 0.9896, + "step": 287600 + }, + { + "epoch": 4.464687533946833, + "grad_norm": 2.4853055477142334, + "learning_rate": 4.55353279846056e-05, + "loss": 1.0019, + "step": 287700 + }, + { + "epoch": 4.466239389189776, + "grad_norm": 1.969152569770813, + "learning_rate": 4.5533776129362655e-05, + "loss": 0.9939, + "step": 287800 + }, + { + "epoch": 4.4677912444327195, + "grad_norm": 2.290210723876953, + "learning_rate": 4.553222427411971e-05, + "loss": 0.9918, + "step": 287900 + }, + { + "epoch": 4.469343099675662, + "grad_norm": 2.0896894931793213, + "learning_rate": 4.553067241887677e-05, + "loss": 0.9968, + "step": 288000 + }, + { + "epoch": 4.470894954918605, + "grad_norm": 2.698840856552124, + "learning_rate": 4.552912056363383e-05, + "loss": 1.0068, + "step": 288100 + }, + { + "epoch": 4.472446810161548, + "grad_norm": 2.261298894882202, + "learning_rate": 4.5527568708390886e-05, + "loss": 1.0201, + "step": 288200 + }, + { + "epoch": 4.473998665404491, + "grad_norm": 2.2422170639038086, + "learning_rate": 4.5526016853147944e-05, + "loss": 0.9831, + "step": 288300 + }, + { + "epoch": 4.475550520647434, + "grad_norm": 2.165275812149048, + "learning_rate": 4.5524464997904995e-05, + "loss": 0.9842, + "step": 288400 + }, + { + "epoch": 4.477102375890377, + "grad_norm": 1.9195927381515503, + "learning_rate": 4.552291314266205e-05, + "loss": 0.9799, + "step": 288500 + }, + { + "epoch": 4.47865423113332, + "grad_norm": 2.3623716831207275, + "learning_rate": 4.552136128741911e-05, + "loss": 0.9923, + "step": 288600 + }, + { + "epoch": 4.4802060863762625, + "grad_norm": 2.3932673931121826, + "learning_rate": 4.551980943217617e-05, + "loss": 0.9865, + "step": 288700 + }, + { + "epoch": 4.481757941619206, + "grad_norm": 1.7156976461410522, + "learning_rate": 4.5518257576933226e-05, + "loss": 0.9932, + "step": 288800 + }, + { + "epoch": 4.483309796862149, + "grad_norm": 2.1037440299987793, + "learning_rate": 4.5516705721690283e-05, + "loss": 1.0139, + "step": 288900 + }, + { + "epoch": 4.484861652105091, + "grad_norm": 2.0449819564819336, + "learning_rate": 4.551515386644734e-05, + "loss": 1.0049, + "step": 289000 + }, + { + "epoch": 4.4864135073480345, + "grad_norm": 2.0198733806610107, + "learning_rate": 4.55136020112044e-05, + "loss": 0.9872, + "step": 289100 + }, + { + "epoch": 4.487965362590978, + "grad_norm": 2.188941240310669, + "learning_rate": 4.551205015596146e-05, + "loss": 0.9831, + "step": 289200 + }, + { + "epoch": 4.489517217833921, + "grad_norm": 2.4358768463134766, + "learning_rate": 4.5510498300718514e-05, + "loss": 0.9982, + "step": 289300 + }, + { + "epoch": 4.491069073076863, + "grad_norm": 2.595031976699829, + "learning_rate": 4.550894644547557e-05, + "loss": 0.9948, + "step": 289400 + }, + { + "epoch": 4.492620928319806, + "grad_norm": 1.8055930137634277, + "learning_rate": 4.550739459023263e-05, + "loss": 0.9916, + "step": 289500 + }, + { + "epoch": 4.49417278356275, + "grad_norm": 2.531336784362793, + "learning_rate": 4.550584273498969e-05, + "loss": 0.9974, + "step": 289600 + }, + { + "epoch": 4.495724638805692, + "grad_norm": 1.9998985528945923, + "learning_rate": 4.550429087974674e-05, + "loss": 1.0086, + "step": 289700 + }, + { + "epoch": 4.497276494048635, + "grad_norm": 2.4632577896118164, + "learning_rate": 4.5502739024503796e-05, + "loss": 1.0142, + "step": 289800 + }, + { + "epoch": 4.498828349291578, + "grad_norm": 1.9779094457626343, + "learning_rate": 4.5501187169260854e-05, + "loss": 0.9987, + "step": 289900 + }, + { + "epoch": 4.500380204534521, + "grad_norm": 2.818877696990967, + "learning_rate": 4.549963531401791e-05, + "loss": 0.9816, + "step": 290000 + }, + { + "epoch": 4.501932059777464, + "grad_norm": 1.737345576286316, + "learning_rate": 4.549808345877496e-05, + "loss": 0.9997, + "step": 290100 + }, + { + "epoch": 4.503483915020407, + "grad_norm": 2.178837776184082, + "learning_rate": 4.549653160353202e-05, + "loss": 1.0328, + "step": 290200 + }, + { + "epoch": 4.5050357702633494, + "grad_norm": 2.073700428009033, + "learning_rate": 4.549497974828908e-05, + "loss": 0.9848, + "step": 290300 + }, + { + "epoch": 4.506587625506293, + "grad_norm": 2.192124843597412, + "learning_rate": 4.5493427893046136e-05, + "loss": 0.984, + "step": 290400 + }, + { + "epoch": 4.508139480749236, + "grad_norm": 2.240753650665283, + "learning_rate": 4.5491876037803194e-05, + "loss": 0.9918, + "step": 290500 + }, + { + "epoch": 4.509691335992178, + "grad_norm": 2.257843017578125, + "learning_rate": 4.549032418256025e-05, + "loss": 0.9898, + "step": 290600 + }, + { + "epoch": 4.511243191235121, + "grad_norm": 1.9702281951904297, + "learning_rate": 4.548877232731731e-05, + "loss": 1.0081, + "step": 290700 + }, + { + "epoch": 4.512795046478065, + "grad_norm": 2.0653622150421143, + "learning_rate": 4.548722047207437e-05, + "loss": 0.9936, + "step": 290800 + }, + { + "epoch": 4.514346901721008, + "grad_norm": 2.2016284465789795, + "learning_rate": 4.5485668616831425e-05, + "loss": 1.0005, + "step": 290900 + }, + { + "epoch": 4.51589875696395, + "grad_norm": 1.957107424736023, + "learning_rate": 4.548411676158848e-05, + "loss": 0.9952, + "step": 291000 + }, + { + "epoch": 4.517450612206893, + "grad_norm": 2.218761682510376, + "learning_rate": 4.548256490634554e-05, + "loss": 0.9957, + "step": 291100 + }, + { + "epoch": 4.519002467449837, + "grad_norm": 2.4304659366607666, + "learning_rate": 4.548101305110259e-05, + "loss": 0.9989, + "step": 291200 + }, + { + "epoch": 4.520554322692779, + "grad_norm": 2.1235270500183105, + "learning_rate": 4.547946119585965e-05, + "loss": 1.0112, + "step": 291300 + }, + { + "epoch": 4.522106177935722, + "grad_norm": 2.281203508377075, + "learning_rate": 4.547790934061671e-05, + "loss": 0.9907, + "step": 291400 + }, + { + "epoch": 4.523658033178665, + "grad_norm": 1.9063180685043335, + "learning_rate": 4.5476357485373765e-05, + "loss": 0.9974, + "step": 291500 + }, + { + "epoch": 4.525209888421608, + "grad_norm": 2.5643959045410156, + "learning_rate": 4.547480563013082e-05, + "loss": 1.0075, + "step": 291600 + }, + { + "epoch": 4.526761743664551, + "grad_norm": 2.390983819961548, + "learning_rate": 4.547325377488788e-05, + "loss": 0.9906, + "step": 291700 + }, + { + "epoch": 4.528313598907494, + "grad_norm": 2.0381689071655273, + "learning_rate": 4.547170191964494e-05, + "loss": 1.0249, + "step": 291800 + }, + { + "epoch": 4.529865454150437, + "grad_norm": 2.567638874053955, + "learning_rate": 4.5470150064401996e-05, + "loss": 1.0053, + "step": 291900 + }, + { + "epoch": 4.53141730939338, + "grad_norm": 2.4526500701904297, + "learning_rate": 4.5468598209159053e-05, + "loss": 0.9843, + "step": 292000 + }, + { + "epoch": 4.532969164636323, + "grad_norm": 2.254894256591797, + "learning_rate": 4.546704635391611e-05, + "loss": 1.0166, + "step": 292100 + }, + { + "epoch": 4.534521019879266, + "grad_norm": 2.157649517059326, + "learning_rate": 4.546549449867317e-05, + "loss": 0.9917, + "step": 292200 + }, + { + "epoch": 4.536072875122208, + "grad_norm": 2.449471950531006, + "learning_rate": 4.546394264343023e-05, + "loss": 0.9843, + "step": 292300 + }, + { + "epoch": 4.537624730365152, + "grad_norm": 2.03277850151062, + "learning_rate": 4.5462390788187284e-05, + "loss": 1.0109, + "step": 292400 + }, + { + "epoch": 4.539176585608095, + "grad_norm": 2.191074848175049, + "learning_rate": 4.5460838932944335e-05, + "loss": 1.0053, + "step": 292500 + }, + { + "epoch": 4.540728440851037, + "grad_norm": 2.225010395050049, + "learning_rate": 4.545928707770139e-05, + "loss": 0.9716, + "step": 292600 + }, + { + "epoch": 4.54228029609398, + "grad_norm": 1.9364932775497437, + "learning_rate": 4.545773522245845e-05, + "loss": 0.9762, + "step": 292700 + }, + { + "epoch": 4.5438321513369235, + "grad_norm": 2.308445692062378, + "learning_rate": 4.545618336721551e-05, + "loss": 0.9831, + "step": 292800 + }, + { + "epoch": 4.545384006579866, + "grad_norm": 2.489295244216919, + "learning_rate": 4.5454631511972566e-05, + "loss": 0.9944, + "step": 292900 + }, + { + "epoch": 4.546935861822809, + "grad_norm": 2.206125259399414, + "learning_rate": 4.5453079656729624e-05, + "loss": 1.0031, + "step": 293000 + }, + { + "epoch": 4.548487717065752, + "grad_norm": 2.2450289726257324, + "learning_rate": 4.545152780148668e-05, + "loss": 0.9916, + "step": 293100 + }, + { + "epoch": 4.550039572308695, + "grad_norm": 2.4696760177612305, + "learning_rate": 4.544997594624374e-05, + "loss": 1.0163, + "step": 293200 + }, + { + "epoch": 4.551591427551638, + "grad_norm": 2.165466547012329, + "learning_rate": 4.54484240910008e-05, + "loss": 0.9767, + "step": 293300 + }, + { + "epoch": 4.553143282794581, + "grad_norm": 2.521078109741211, + "learning_rate": 4.544687223575785e-05, + "loss": 0.9852, + "step": 293400 + }, + { + "epoch": 4.554695138037524, + "grad_norm": 2.1630663871765137, + "learning_rate": 4.5445320380514906e-05, + "loss": 0.9939, + "step": 293500 + }, + { + "epoch": 4.556246993280467, + "grad_norm": 2.229306221008301, + "learning_rate": 4.5443768525271964e-05, + "loss": 1.0157, + "step": 293600 + }, + { + "epoch": 4.55779884852341, + "grad_norm": 2.200291633605957, + "learning_rate": 4.544221667002902e-05, + "loss": 1.0082, + "step": 293700 + }, + { + "epoch": 4.559350703766353, + "grad_norm": 2.1165928840637207, + "learning_rate": 4.544066481478608e-05, + "loss": 1.0013, + "step": 293800 + }, + { + "epoch": 4.560902559009295, + "grad_norm": 1.8286538124084473, + "learning_rate": 4.543911295954314e-05, + "loss": 0.9902, + "step": 293900 + }, + { + "epoch": 4.5624544142522385, + "grad_norm": 2.3261590003967285, + "learning_rate": 4.543756110430019e-05, + "loss": 1.0035, + "step": 294000 + }, + { + "epoch": 4.564006269495182, + "grad_norm": 2.176161289215088, + "learning_rate": 4.5436009249057246e-05, + "loss": 0.9872, + "step": 294100 + }, + { + "epoch": 4.565558124738124, + "grad_norm": 2.6515414714813232, + "learning_rate": 4.5434457393814304e-05, + "loss": 0.9776, + "step": 294200 + }, + { + "epoch": 4.567109979981067, + "grad_norm": 2.072274684906006, + "learning_rate": 4.543290553857136e-05, + "loss": 0.9843, + "step": 294300 + }, + { + "epoch": 4.5686618352240105, + "grad_norm": 2.182770013809204, + "learning_rate": 4.543135368332842e-05, + "loss": 0.9804, + "step": 294400 + }, + { + "epoch": 4.570213690466954, + "grad_norm": 2.168555974960327, + "learning_rate": 4.542980182808548e-05, + "loss": 0.9833, + "step": 294500 + }, + { + "epoch": 4.571765545709896, + "grad_norm": 1.6700162887573242, + "learning_rate": 4.5428249972842535e-05, + "loss": 1.0007, + "step": 294600 + }, + { + "epoch": 4.573317400952839, + "grad_norm": 1.9702121019363403, + "learning_rate": 4.542669811759959e-05, + "loss": 0.9926, + "step": 294700 + }, + { + "epoch": 4.574869256195782, + "grad_norm": 2.87471342086792, + "learning_rate": 4.542514626235665e-05, + "loss": 1.0119, + "step": 294800 + }, + { + "epoch": 4.576421111438725, + "grad_norm": 2.0131750106811523, + "learning_rate": 4.542359440711371e-05, + "loss": 1.0124, + "step": 294900 + }, + { + "epoch": 4.577972966681668, + "grad_norm": 2.314908981323242, + "learning_rate": 4.5422042551870766e-05, + "loss": 1.0003, + "step": 295000 + }, + { + "epoch": 4.579524821924611, + "grad_norm": 2.1337711811065674, + "learning_rate": 4.5420490696627823e-05, + "loss": 1.0033, + "step": 295100 + }, + { + "epoch": 4.5810766771675535, + "grad_norm": 2.309758186340332, + "learning_rate": 4.541893884138488e-05, + "loss": 1.01, + "step": 295200 + }, + { + "epoch": 4.582628532410497, + "grad_norm": 2.365941047668457, + "learning_rate": 4.541738698614193e-05, + "loss": 1.0149, + "step": 295300 + }, + { + "epoch": 4.58418038765344, + "grad_norm": 2.3478477001190186, + "learning_rate": 4.541583513089899e-05, + "loss": 0.9849, + "step": 295400 + }, + { + "epoch": 4.585732242896382, + "grad_norm": 2.2278852462768555, + "learning_rate": 4.541428327565605e-05, + "loss": 1.0097, + "step": 295500 + }, + { + "epoch": 4.5872840981393255, + "grad_norm": 2.4725451469421387, + "learning_rate": 4.5412731420413105e-05, + "loss": 1.0085, + "step": 295600 + }, + { + "epoch": 4.588835953382269, + "grad_norm": 1.7953201532363892, + "learning_rate": 4.541117956517016e-05, + "loss": 0.9874, + "step": 295700 + }, + { + "epoch": 4.590387808625211, + "grad_norm": 2.429874897003174, + "learning_rate": 4.540962770992722e-05, + "loss": 0.9995, + "step": 295800 + }, + { + "epoch": 4.591939663868154, + "grad_norm": 2.4995932579040527, + "learning_rate": 4.540807585468428e-05, + "loss": 0.9981, + "step": 295900 + }, + { + "epoch": 4.593491519111097, + "grad_norm": 2.5319671630859375, + "learning_rate": 4.5406523999441336e-05, + "loss": 0.9713, + "step": 296000 + }, + { + "epoch": 4.59504337435404, + "grad_norm": 2.855954885482788, + "learning_rate": 4.5404972144198394e-05, + "loss": 0.992, + "step": 296100 + }, + { + "epoch": 4.596595229596983, + "grad_norm": 1.8545739650726318, + "learning_rate": 4.540342028895545e-05, + "loss": 0.9736, + "step": 296200 + }, + { + "epoch": 4.598147084839926, + "grad_norm": 2.542313575744629, + "learning_rate": 4.540186843371251e-05, + "loss": 1.0157, + "step": 296300 + }, + { + "epoch": 4.599698940082869, + "grad_norm": 2.090695381164551, + "learning_rate": 4.540031657846957e-05, + "loss": 0.9859, + "step": 296400 + }, + { + "epoch": 4.601250795325812, + "grad_norm": 2.1615982055664062, + "learning_rate": 4.5398764723226625e-05, + "loss": 0.9832, + "step": 296500 + }, + { + "epoch": 4.602802650568755, + "grad_norm": 2.0939669609069824, + "learning_rate": 4.5397212867983676e-05, + "loss": 1.0131, + "step": 296600 + }, + { + "epoch": 4.604354505811698, + "grad_norm": 2.218007802963257, + "learning_rate": 4.5395661012740734e-05, + "loss": 0.9836, + "step": 296700 + }, + { + "epoch": 4.6059063610546405, + "grad_norm": 2.3938162326812744, + "learning_rate": 4.539410915749779e-05, + "loss": 0.9996, + "step": 296800 + }, + { + "epoch": 4.607458216297584, + "grad_norm": 2.6263668537139893, + "learning_rate": 4.539255730225484e-05, + "loss": 0.9872, + "step": 296900 + }, + { + "epoch": 4.609010071540527, + "grad_norm": 2.299152374267578, + "learning_rate": 4.53910054470119e-05, + "loss": 1.0188, + "step": 297000 + }, + { + "epoch": 4.61056192678347, + "grad_norm": 2.205808162689209, + "learning_rate": 4.538945359176896e-05, + "loss": 1.0027, + "step": 297100 + }, + { + "epoch": 4.612113782026412, + "grad_norm": 2.102097988128662, + "learning_rate": 4.5387901736526016e-05, + "loss": 0.9848, + "step": 297200 + }, + { + "epoch": 4.613665637269356, + "grad_norm": 1.9192564487457275, + "learning_rate": 4.5386349881283074e-05, + "loss": 1.0018, + "step": 297300 + }, + { + "epoch": 4.615217492512299, + "grad_norm": 2.2630560398101807, + "learning_rate": 4.538479802604013e-05, + "loss": 1.0156, + "step": 297400 + }, + { + "epoch": 4.616769347755241, + "grad_norm": 2.279552936553955, + "learning_rate": 4.538324617079719e-05, + "loss": 0.9843, + "step": 297500 + }, + { + "epoch": 4.618321202998184, + "grad_norm": 2.186372995376587, + "learning_rate": 4.538169431555425e-05, + "loss": 1.0196, + "step": 297600 + }, + { + "epoch": 4.619873058241128, + "grad_norm": 2.1136887073516846, + "learning_rate": 4.5380142460311305e-05, + "loss": 0.9971, + "step": 297700 + }, + { + "epoch": 4.62142491348407, + "grad_norm": 2.56472110748291, + "learning_rate": 4.537859060506836e-05, + "loss": 1.0058, + "step": 297800 + }, + { + "epoch": 4.622976768727013, + "grad_norm": 1.9326163530349731, + "learning_rate": 4.537703874982542e-05, + "loss": 0.9791, + "step": 297900 + }, + { + "epoch": 4.624528623969956, + "grad_norm": 2.8529694080352783, + "learning_rate": 4.537548689458248e-05, + "loss": 1.0055, + "step": 298000 + }, + { + "epoch": 4.626080479212899, + "grad_norm": 1.6828869581222534, + "learning_rate": 4.5373935039339536e-05, + "loss": 0.9932, + "step": 298100 + }, + { + "epoch": 4.627632334455842, + "grad_norm": 2.295825958251953, + "learning_rate": 4.537238318409659e-05, + "loss": 1.0028, + "step": 298200 + }, + { + "epoch": 4.629184189698785, + "grad_norm": 2.0858960151672363, + "learning_rate": 4.5370831328853644e-05, + "loss": 0.9802, + "step": 298300 + }, + { + "epoch": 4.630736044941727, + "grad_norm": 2.1927192211151123, + "learning_rate": 4.53692794736107e-05, + "loss": 0.9858, + "step": 298400 + }, + { + "epoch": 4.632287900184671, + "grad_norm": 2.8608124256134033, + "learning_rate": 4.536772761836776e-05, + "loss": 0.9913, + "step": 298500 + }, + { + "epoch": 4.633839755427614, + "grad_norm": 2.372511863708496, + "learning_rate": 4.536617576312482e-05, + "loss": 0.9922, + "step": 298600 + }, + { + "epoch": 4.635391610670556, + "grad_norm": 2.015420913696289, + "learning_rate": 4.5364623907881875e-05, + "loss": 0.9957, + "step": 298700 + }, + { + "epoch": 4.636943465913499, + "grad_norm": 2.1719632148742676, + "learning_rate": 4.536307205263893e-05, + "loss": 0.9994, + "step": 298800 + }, + { + "epoch": 4.638495321156443, + "grad_norm": 2.063201665878296, + "learning_rate": 4.536152019739599e-05, + "loss": 1.0103, + "step": 298900 + }, + { + "epoch": 4.640047176399386, + "grad_norm": 2.673893928527832, + "learning_rate": 4.535996834215305e-05, + "loss": 1.006, + "step": 299000 + }, + { + "epoch": 4.641599031642328, + "grad_norm": 2.41896915435791, + "learning_rate": 4.5358416486910106e-05, + "loss": 1.0095, + "step": 299100 + }, + { + "epoch": 4.643150886885271, + "grad_norm": 1.9028549194335938, + "learning_rate": 4.5356864631667164e-05, + "loss": 1.0122, + "step": 299200 + }, + { + "epoch": 4.6447027421282145, + "grad_norm": 2.820322036743164, + "learning_rate": 4.535531277642422e-05, + "loss": 1.0017, + "step": 299300 + }, + { + "epoch": 4.646254597371157, + "grad_norm": 2.2285239696502686, + "learning_rate": 4.535376092118128e-05, + "loss": 0.9867, + "step": 299400 + }, + { + "epoch": 4.6478064526141, + "grad_norm": 2.2062690258026123, + "learning_rate": 4.535220906593833e-05, + "loss": 1.0204, + "step": 299500 + }, + { + "epoch": 4.649358307857043, + "grad_norm": 2.4299566745758057, + "learning_rate": 4.535065721069539e-05, + "loss": 1.018, + "step": 299600 + }, + { + "epoch": 4.6509101630999865, + "grad_norm": 2.242126703262329, + "learning_rate": 4.5349105355452446e-05, + "loss": 1.0052, + "step": 299700 + }, + { + "epoch": 4.652462018342929, + "grad_norm": 2.1155760288238525, + "learning_rate": 4.5347553500209504e-05, + "loss": 0.9945, + "step": 299800 + }, + { + "epoch": 4.654013873585872, + "grad_norm": 2.1999547481536865, + "learning_rate": 4.5346001644966555e-05, + "loss": 0.9897, + "step": 299900 + }, + { + "epoch": 4.655565728828815, + "grad_norm": 2.0622591972351074, + "learning_rate": 4.534444978972361e-05, + "loss": 1.0064, + "step": 300000 + }, + { + "epoch": 4.657117584071758, + "grad_norm": 2.2337169647216797, + "learning_rate": 4.534289793448067e-05, + "loss": 1.0082, + "step": 300100 + }, + { + "epoch": 4.658669439314701, + "grad_norm": 2.4536898136138916, + "learning_rate": 4.534134607923773e-05, + "loss": 0.984, + "step": 300200 + }, + { + "epoch": 4.660221294557644, + "grad_norm": 2.200178384780884, + "learning_rate": 4.5339794223994786e-05, + "loss": 0.9946, + "step": 300300 + }, + { + "epoch": 4.661773149800586, + "grad_norm": 2.038768768310547, + "learning_rate": 4.5338242368751844e-05, + "loss": 0.9959, + "step": 300400 + }, + { + "epoch": 4.6633250050435295, + "grad_norm": 2.1221811771392822, + "learning_rate": 4.53366905135089e-05, + "loss": 0.9891, + "step": 300500 + }, + { + "epoch": 4.664876860286473, + "grad_norm": 2.423968553543091, + "learning_rate": 4.533513865826596e-05, + "loss": 0.997, + "step": 300600 + }, + { + "epoch": 4.666428715529415, + "grad_norm": 2.2653868198394775, + "learning_rate": 4.533358680302302e-05, + "loss": 0.9694, + "step": 300700 + }, + { + "epoch": 4.667980570772358, + "grad_norm": 2.121770143508911, + "learning_rate": 4.5332034947780075e-05, + "loss": 0.9759, + "step": 300800 + }, + { + "epoch": 4.6695324260153015, + "grad_norm": 2.3002452850341797, + "learning_rate": 4.533048309253713e-05, + "loss": 1.0008, + "step": 300900 + }, + { + "epoch": 4.671084281258244, + "grad_norm": 2.14780330657959, + "learning_rate": 4.532893123729418e-05, + "loss": 0.9991, + "step": 301000 + }, + { + "epoch": 4.672636136501187, + "grad_norm": 2.5276637077331543, + "learning_rate": 4.532737938205124e-05, + "loss": 1.0057, + "step": 301100 + }, + { + "epoch": 4.67418799174413, + "grad_norm": 2.351923704147339, + "learning_rate": 4.53258275268083e-05, + "loss": 0.9895, + "step": 301200 + }, + { + "epoch": 4.675739846987073, + "grad_norm": 2.697972536087036, + "learning_rate": 4.532427567156536e-05, + "loss": 0.9825, + "step": 301300 + }, + { + "epoch": 4.677291702230016, + "grad_norm": 2.461207628250122, + "learning_rate": 4.5322723816322414e-05, + "loss": 0.9975, + "step": 301400 + }, + { + "epoch": 4.678843557472959, + "grad_norm": 2.165151834487915, + "learning_rate": 4.532117196107947e-05, + "loss": 1.0008, + "step": 301500 + }, + { + "epoch": 4.680395412715902, + "grad_norm": 2.336306095123291, + "learning_rate": 4.531962010583653e-05, + "loss": 0.9773, + "step": 301600 + }, + { + "epoch": 4.6819472679588445, + "grad_norm": 2.433595895767212, + "learning_rate": 4.531806825059359e-05, + "loss": 0.9846, + "step": 301700 + }, + { + "epoch": 4.683499123201788, + "grad_norm": 2.1923928260803223, + "learning_rate": 4.5316516395350645e-05, + "loss": 0.9881, + "step": 301800 + }, + { + "epoch": 4.685050978444731, + "grad_norm": 2.0921261310577393, + "learning_rate": 4.53149645401077e-05, + "loss": 0.9812, + "step": 301900 + }, + { + "epoch": 4.686602833687673, + "grad_norm": 2.3394947052001953, + "learning_rate": 4.531341268486476e-05, + "loss": 0.9821, + "step": 302000 + }, + { + "epoch": 4.6881546889306165, + "grad_norm": 2.5028738975524902, + "learning_rate": 4.531186082962182e-05, + "loss": 0.9903, + "step": 302100 + }, + { + "epoch": 4.68970654417356, + "grad_norm": 2.1756536960601807, + "learning_rate": 4.5310308974378876e-05, + "loss": 0.9802, + "step": 302200 + }, + { + "epoch": 4.691258399416503, + "grad_norm": 1.8686116933822632, + "learning_rate": 4.530875711913593e-05, + "loss": 1.0064, + "step": 302300 + }, + { + "epoch": 4.692810254659445, + "grad_norm": 2.3257672786712646, + "learning_rate": 4.5307205263892985e-05, + "loss": 1.0109, + "step": 302400 + }, + { + "epoch": 4.6943621099023884, + "grad_norm": 2.293214797973633, + "learning_rate": 4.530565340865004e-05, + "loss": 0.9729, + "step": 302500 + }, + { + "epoch": 4.695913965145332, + "grad_norm": 2.150447130203247, + "learning_rate": 4.53041015534071e-05, + "loss": 0.9702, + "step": 302600 + }, + { + "epoch": 4.697465820388274, + "grad_norm": 2.234912395477295, + "learning_rate": 4.530254969816416e-05, + "loss": 0.9876, + "step": 302700 + }, + { + "epoch": 4.699017675631217, + "grad_norm": 1.9642179012298584, + "learning_rate": 4.5300997842921216e-05, + "loss": 0.989, + "step": 302800 + }, + { + "epoch": 4.70056953087416, + "grad_norm": 2.5795674324035645, + "learning_rate": 4.5299445987678274e-05, + "loss": 0.9802, + "step": 302900 + }, + { + "epoch": 4.702121386117103, + "grad_norm": 1.7124474048614502, + "learning_rate": 4.529789413243533e-05, + "loss": 0.9993, + "step": 303000 + }, + { + "epoch": 4.703673241360046, + "grad_norm": 2.22861647605896, + "learning_rate": 4.529634227719239e-05, + "loss": 1.0023, + "step": 303100 + }, + { + "epoch": 4.705225096602989, + "grad_norm": 2.579798460006714, + "learning_rate": 4.529479042194944e-05, + "loss": 0.9942, + "step": 303200 + }, + { + "epoch": 4.7067769518459315, + "grad_norm": 2.3989596366882324, + "learning_rate": 4.52932385667065e-05, + "loss": 0.9941, + "step": 303300 + }, + { + "epoch": 4.708328807088875, + "grad_norm": 2.2754008769989014, + "learning_rate": 4.5291686711463556e-05, + "loss": 0.9984, + "step": 303400 + }, + { + "epoch": 4.709880662331818, + "grad_norm": 2.5523416996002197, + "learning_rate": 4.5290134856220614e-05, + "loss": 1.0024, + "step": 303500 + }, + { + "epoch": 4.71143251757476, + "grad_norm": 2.5854275226593018, + "learning_rate": 4.528858300097767e-05, + "loss": 1.0118, + "step": 303600 + }, + { + "epoch": 4.712984372817703, + "grad_norm": 2.160099983215332, + "learning_rate": 4.528703114573473e-05, + "loss": 0.993, + "step": 303700 + }, + { + "epoch": 4.714536228060647, + "grad_norm": 2.3430426120758057, + "learning_rate": 4.528547929049179e-05, + "loss": 0.9646, + "step": 303800 + }, + { + "epoch": 4.716088083303589, + "grad_norm": 2.329451322555542, + "learning_rate": 4.528392743524884e-05, + "loss": 0.9812, + "step": 303900 + }, + { + "epoch": 4.717639938546532, + "grad_norm": 2.225872755050659, + "learning_rate": 4.5282375580005896e-05, + "loss": 0.9924, + "step": 304000 + }, + { + "epoch": 4.719191793789475, + "grad_norm": 1.8935221433639526, + "learning_rate": 4.528082372476295e-05, + "loss": 0.9887, + "step": 304100 + }, + { + "epoch": 4.720743649032419, + "grad_norm": 1.9854744672775269, + "learning_rate": 4.527927186952001e-05, + "loss": 1.0062, + "step": 304200 + }, + { + "epoch": 4.722295504275361, + "grad_norm": 2.467484712600708, + "learning_rate": 4.527772001427707e-05, + "loss": 1.0174, + "step": 304300 + }, + { + "epoch": 4.723847359518304, + "grad_norm": 1.760667085647583, + "learning_rate": 4.527616815903413e-05, + "loss": 0.9869, + "step": 304400 + }, + { + "epoch": 4.725399214761247, + "grad_norm": 2.415112257003784, + "learning_rate": 4.5274616303791184e-05, + "loss": 0.9997, + "step": 304500 + }, + { + "epoch": 4.72695107000419, + "grad_norm": 2.0632479190826416, + "learning_rate": 4.527306444854824e-05, + "loss": 0.9813, + "step": 304600 + }, + { + "epoch": 4.728502925247133, + "grad_norm": 1.8340388536453247, + "learning_rate": 4.52715125933053e-05, + "loss": 0.9965, + "step": 304700 + }, + { + "epoch": 4.730054780490076, + "grad_norm": 2.0908021926879883, + "learning_rate": 4.526996073806236e-05, + "loss": 1.0083, + "step": 304800 + }, + { + "epoch": 4.731606635733019, + "grad_norm": 2.1623146533966064, + "learning_rate": 4.5268408882819415e-05, + "loss": 1.0052, + "step": 304900 + }, + { + "epoch": 4.733158490975962, + "grad_norm": 2.4359967708587646, + "learning_rate": 4.526685702757647e-05, + "loss": 0.9861, + "step": 305000 + }, + { + "epoch": 4.734710346218905, + "grad_norm": 2.168280839920044, + "learning_rate": 4.526530517233353e-05, + "loss": 0.9995, + "step": 305100 + }, + { + "epoch": 4.736262201461848, + "grad_norm": 2.062199831008911, + "learning_rate": 4.526375331709058e-05, + "loss": 0.9858, + "step": 305200 + }, + { + "epoch": 4.73781405670479, + "grad_norm": 1.9365390539169312, + "learning_rate": 4.526220146184764e-05, + "loss": 1.0009, + "step": 305300 + }, + { + "epoch": 4.739365911947734, + "grad_norm": 2.0393035411834717, + "learning_rate": 4.52606496066047e-05, + "loss": 0.9709, + "step": 305400 + }, + { + "epoch": 4.740917767190677, + "grad_norm": 2.06400728225708, + "learning_rate": 4.5259097751361755e-05, + "loss": 1.001, + "step": 305500 + }, + { + "epoch": 4.742469622433619, + "grad_norm": 3.0484304428100586, + "learning_rate": 4.525754589611881e-05, + "loss": 0.9995, + "step": 305600 + }, + { + "epoch": 4.744021477676562, + "grad_norm": 2.1534829139709473, + "learning_rate": 4.525599404087587e-05, + "loss": 0.9965, + "step": 305700 + }, + { + "epoch": 4.745573332919506, + "grad_norm": 2.2801458835601807, + "learning_rate": 4.525444218563293e-05, + "loss": 0.9937, + "step": 305800 + }, + { + "epoch": 4.747125188162448, + "grad_norm": 2.2503480911254883, + "learning_rate": 4.5252890330389986e-05, + "loss": 0.9841, + "step": 305900 + }, + { + "epoch": 4.748677043405391, + "grad_norm": 2.228114128112793, + "learning_rate": 4.5251338475147044e-05, + "loss": 1.007, + "step": 306000 + }, + { + "epoch": 4.750228898648334, + "grad_norm": 2.0279667377471924, + "learning_rate": 4.52497866199041e-05, + "loss": 0.9897, + "step": 306100 + }, + { + "epoch": 4.751780753891277, + "grad_norm": 2.4756104946136475, + "learning_rate": 4.524823476466116e-05, + "loss": 0.9842, + "step": 306200 + }, + { + "epoch": 4.75333260913422, + "grad_norm": 2.138491630554199, + "learning_rate": 4.524668290941822e-05, + "loss": 0.9799, + "step": 306300 + }, + { + "epoch": 4.754884464377163, + "grad_norm": 2.3243930339813232, + "learning_rate": 4.524513105417527e-05, + "loss": 0.9912, + "step": 306400 + }, + { + "epoch": 4.756436319620105, + "grad_norm": 2.1239287853240967, + "learning_rate": 4.5243579198932326e-05, + "loss": 0.9935, + "step": 306500 + }, + { + "epoch": 4.757988174863049, + "grad_norm": 2.103210926055908, + "learning_rate": 4.5242027343689384e-05, + "loss": 0.9969, + "step": 306600 + }, + { + "epoch": 4.759540030105992, + "grad_norm": 2.1429154872894287, + "learning_rate": 4.5240475488446435e-05, + "loss": 0.9772, + "step": 306700 + }, + { + "epoch": 4.761091885348935, + "grad_norm": 2.703026533126831, + "learning_rate": 4.523892363320349e-05, + "loss": 0.9794, + "step": 306800 + }, + { + "epoch": 4.762643740591877, + "grad_norm": 2.5382027626037598, + "learning_rate": 4.523737177796055e-05, + "loss": 0.9771, + "step": 306900 + }, + { + "epoch": 4.7641955958348206, + "grad_norm": 2.189344882965088, + "learning_rate": 4.523581992271761e-05, + "loss": 0.9942, + "step": 307000 + }, + { + "epoch": 4.765747451077764, + "grad_norm": 2.2709391117095947, + "learning_rate": 4.5234268067474666e-05, + "loss": 1.01, + "step": 307100 + }, + { + "epoch": 4.767299306320706, + "grad_norm": 2.155897378921509, + "learning_rate": 4.523271621223172e-05, + "loss": 0.9796, + "step": 307200 + }, + { + "epoch": 4.768851161563649, + "grad_norm": 2.1065542697906494, + "learning_rate": 4.523116435698878e-05, + "loss": 0.9707, + "step": 307300 + }, + { + "epoch": 4.7704030168065925, + "grad_norm": 2.1619253158569336, + "learning_rate": 4.522961250174584e-05, + "loss": 0.9895, + "step": 307400 + }, + { + "epoch": 4.771954872049535, + "grad_norm": 2.0945324897766113, + "learning_rate": 4.52280606465029e-05, + "loss": 0.9718, + "step": 307500 + }, + { + "epoch": 4.773506727292478, + "grad_norm": 2.4745171070098877, + "learning_rate": 4.5226508791259954e-05, + "loss": 0.9794, + "step": 307600 + }, + { + "epoch": 4.775058582535421, + "grad_norm": 2.1260809898376465, + "learning_rate": 4.522495693601701e-05, + "loss": 0.9886, + "step": 307700 + }, + { + "epoch": 4.7766104377783645, + "grad_norm": 2.019519805908203, + "learning_rate": 4.522340508077407e-05, + "loss": 0.9939, + "step": 307800 + }, + { + "epoch": 4.778162293021307, + "grad_norm": 1.9292490482330322, + "learning_rate": 4.522185322553113e-05, + "loss": 0.9848, + "step": 307900 + }, + { + "epoch": 4.77971414826425, + "grad_norm": 2.8044991493225098, + "learning_rate": 4.522030137028818e-05, + "loss": 0.9788, + "step": 308000 + }, + { + "epoch": 4.781266003507193, + "grad_norm": 2.1083643436431885, + "learning_rate": 4.5218749515045236e-05, + "loss": 1.001, + "step": 308100 + }, + { + "epoch": 4.7828178587501355, + "grad_norm": 2.6458473205566406, + "learning_rate": 4.5217197659802294e-05, + "loss": 0.983, + "step": 308200 + }, + { + "epoch": 4.784369713993079, + "grad_norm": 2.059161901473999, + "learning_rate": 4.521564580455935e-05, + "loss": 0.985, + "step": 308300 + }, + { + "epoch": 4.785921569236022, + "grad_norm": 2.3176801204681396, + "learning_rate": 4.521409394931641e-05, + "loss": 0.9939, + "step": 308400 + }, + { + "epoch": 4.787473424478964, + "grad_norm": 2.340977191925049, + "learning_rate": 4.521254209407347e-05, + "loss": 0.9588, + "step": 308500 + }, + { + "epoch": 4.7890252797219075, + "grad_norm": 1.9410039186477661, + "learning_rate": 4.5210990238830525e-05, + "loss": 0.9828, + "step": 308600 + }, + { + "epoch": 4.790577134964851, + "grad_norm": 2.3338623046875, + "learning_rate": 4.520943838358758e-05, + "loss": 0.9828, + "step": 308700 + }, + { + "epoch": 4.792128990207793, + "grad_norm": 1.9343339204788208, + "learning_rate": 4.520788652834464e-05, + "loss": 0.9921, + "step": 308800 + }, + { + "epoch": 4.793680845450736, + "grad_norm": 2.3858683109283447, + "learning_rate": 4.52063346731017e-05, + "loss": 1.0177, + "step": 308900 + }, + { + "epoch": 4.7952327006936795, + "grad_norm": 2.2674553394317627, + "learning_rate": 4.5204782817858756e-05, + "loss": 0.9824, + "step": 309000 + }, + { + "epoch": 4.796784555936622, + "grad_norm": 2.558554172515869, + "learning_rate": 4.5203230962615814e-05, + "loss": 1.0095, + "step": 309100 + }, + { + "epoch": 4.798336411179565, + "grad_norm": 2.6413729190826416, + "learning_rate": 4.520167910737287e-05, + "loss": 0.9757, + "step": 309200 + }, + { + "epoch": 4.799888266422508, + "grad_norm": 1.7679721117019653, + "learning_rate": 4.520012725212992e-05, + "loss": 0.9771, + "step": 309300 + }, + { + "epoch": 4.801440121665451, + "grad_norm": 1.9725315570831299, + "learning_rate": 4.519857539688698e-05, + "loss": 1.0053, + "step": 309400 + }, + { + "epoch": 4.802991976908394, + "grad_norm": 2.084390640258789, + "learning_rate": 4.519702354164404e-05, + "loss": 0.9903, + "step": 309500 + }, + { + "epoch": 4.804543832151337, + "grad_norm": 2.53524112701416, + "learning_rate": 4.5195471686401096e-05, + "loss": 0.9825, + "step": 309600 + }, + { + "epoch": 4.80609568739428, + "grad_norm": 2.4231560230255127, + "learning_rate": 4.519391983115815e-05, + "loss": 0.9813, + "step": 309700 + }, + { + "epoch": 4.8076475426372225, + "grad_norm": 2.319136619567871, + "learning_rate": 4.5192367975915205e-05, + "loss": 1.0041, + "step": 309800 + }, + { + "epoch": 4.809199397880166, + "grad_norm": 2.1990528106689453, + "learning_rate": 4.519081612067226e-05, + "loss": 0.9864, + "step": 309900 + }, + { + "epoch": 4.810751253123109, + "grad_norm": 2.3455920219421387, + "learning_rate": 4.518926426542932e-05, + "loss": 0.9708, + "step": 310000 + }, + { + "epoch": 4.812303108366051, + "grad_norm": 2.130215883255005, + "learning_rate": 4.518771241018638e-05, + "loss": 0.9891, + "step": 310100 + }, + { + "epoch": 4.8138549636089945, + "grad_norm": 2.2608296871185303, + "learning_rate": 4.5186160554943436e-05, + "loss": 0.9923, + "step": 310200 + }, + { + "epoch": 4.815406818851938, + "grad_norm": 2.3929078578948975, + "learning_rate": 4.518460869970049e-05, + "loss": 0.9903, + "step": 310300 + }, + { + "epoch": 4.816958674094881, + "grad_norm": 2.420306444168091, + "learning_rate": 4.518305684445755e-05, + "loss": 0.9769, + "step": 310400 + }, + { + "epoch": 4.818510529337823, + "grad_norm": 2.4495632648468018, + "learning_rate": 4.518150498921461e-05, + "loss": 0.9902, + "step": 310500 + }, + { + "epoch": 4.820062384580766, + "grad_norm": 2.5356087684631348, + "learning_rate": 4.517995313397167e-05, + "loss": 1.0062, + "step": 310600 + }, + { + "epoch": 4.82161423982371, + "grad_norm": 2.158783435821533, + "learning_rate": 4.5178401278728724e-05, + "loss": 0.9685, + "step": 310700 + }, + { + "epoch": 4.823166095066652, + "grad_norm": 1.907422423362732, + "learning_rate": 4.5176849423485775e-05, + "loss": 0.9689, + "step": 310800 + }, + { + "epoch": 4.824717950309595, + "grad_norm": 2.1155383586883545, + "learning_rate": 4.517529756824283e-05, + "loss": 0.9939, + "step": 310900 + }, + { + "epoch": 4.826269805552538, + "grad_norm": 2.372523546218872, + "learning_rate": 4.517374571299989e-05, + "loss": 1.0008, + "step": 311000 + }, + { + "epoch": 4.827821660795481, + "grad_norm": 2.369626760482788, + "learning_rate": 4.517219385775695e-05, + "loss": 0.982, + "step": 311100 + }, + { + "epoch": 4.829373516038424, + "grad_norm": 1.8950207233428955, + "learning_rate": 4.5170642002514006e-05, + "loss": 0.9838, + "step": 311200 + }, + { + "epoch": 4.830925371281367, + "grad_norm": 2.445340871810913, + "learning_rate": 4.5169090147271064e-05, + "loss": 0.9776, + "step": 311300 + }, + { + "epoch": 4.832477226524309, + "grad_norm": 2.245357036590576, + "learning_rate": 4.516753829202812e-05, + "loss": 1.0046, + "step": 311400 + }, + { + "epoch": 4.834029081767253, + "grad_norm": 1.8052308559417725, + "learning_rate": 4.516598643678518e-05, + "loss": 0.9806, + "step": 311500 + }, + { + "epoch": 4.835580937010196, + "grad_norm": 2.458707332611084, + "learning_rate": 4.516443458154224e-05, + "loss": 0.9934, + "step": 311600 + }, + { + "epoch": 4.837132792253138, + "grad_norm": 2.2918500900268555, + "learning_rate": 4.5162882726299295e-05, + "loss": 0.9905, + "step": 311700 + }, + { + "epoch": 4.838684647496081, + "grad_norm": 2.3220841884613037, + "learning_rate": 4.516133087105635e-05, + "loss": 0.995, + "step": 311800 + }, + { + "epoch": 4.840236502739025, + "grad_norm": 2.166215181350708, + "learning_rate": 4.515977901581341e-05, + "loss": 0.9792, + "step": 311900 + }, + { + "epoch": 4.841788357981968, + "grad_norm": 2.400585412979126, + "learning_rate": 4.515822716057047e-05, + "loss": 0.9908, + "step": 312000 + }, + { + "epoch": 4.84334021322491, + "grad_norm": 2.4989497661590576, + "learning_rate": 4.515667530532752e-05, + "loss": 0.9835, + "step": 312100 + }, + { + "epoch": 4.844892068467853, + "grad_norm": 2.049469232559204, + "learning_rate": 4.515512345008458e-05, + "loss": 0.9979, + "step": 312200 + }, + { + "epoch": 4.846443923710797, + "grad_norm": 2.144580602645874, + "learning_rate": 4.5153571594841635e-05, + "loss": 0.9944, + "step": 312300 + }, + { + "epoch": 4.847995778953739, + "grad_norm": 2.1540298461914062, + "learning_rate": 4.515201973959869e-05, + "loss": 0.9827, + "step": 312400 + }, + { + "epoch": 4.849547634196682, + "grad_norm": 2.292189121246338, + "learning_rate": 4.515046788435575e-05, + "loss": 0.9862, + "step": 312500 + }, + { + "epoch": 4.851099489439625, + "grad_norm": 2.9079971313476562, + "learning_rate": 4.514891602911281e-05, + "loss": 0.9729, + "step": 312600 + }, + { + "epoch": 4.852651344682568, + "grad_norm": 2.5702126026153564, + "learning_rate": 4.5147364173869866e-05, + "loss": 1.001, + "step": 312700 + }, + { + "epoch": 4.854203199925511, + "grad_norm": 1.8238567113876343, + "learning_rate": 4.5145812318626924e-05, + "loss": 0.9971, + "step": 312800 + }, + { + "epoch": 4.855755055168454, + "grad_norm": 2.1982083320617676, + "learning_rate": 4.5144260463383975e-05, + "loss": 0.9819, + "step": 312900 + }, + { + "epoch": 4.857306910411397, + "grad_norm": 2.0572688579559326, + "learning_rate": 4.514270860814103e-05, + "loss": 1.0122, + "step": 313000 + }, + { + "epoch": 4.85885876565434, + "grad_norm": 2.3243939876556396, + "learning_rate": 4.514115675289809e-05, + "loss": 0.9688, + "step": 313100 + }, + { + "epoch": 4.860410620897283, + "grad_norm": 1.879243016242981, + "learning_rate": 4.513960489765515e-05, + "loss": 0.9859, + "step": 313200 + }, + { + "epoch": 4.861962476140226, + "grad_norm": 2.0839011669158936, + "learning_rate": 4.5138053042412206e-05, + "loss": 0.9712, + "step": 313300 + }, + { + "epoch": 4.863514331383168, + "grad_norm": 1.9096519947052002, + "learning_rate": 4.513650118716926e-05, + "loss": 1.0063, + "step": 313400 + }, + { + "epoch": 4.865066186626112, + "grad_norm": 2.4051766395568848, + "learning_rate": 4.513494933192632e-05, + "loss": 0.9932, + "step": 313500 + }, + { + "epoch": 4.866618041869055, + "grad_norm": 1.9783592224121094, + "learning_rate": 4.513339747668338e-05, + "loss": 1.021, + "step": 313600 + }, + { + "epoch": 4.868169897111997, + "grad_norm": 2.3197271823883057, + "learning_rate": 4.513184562144043e-05, + "loss": 0.9703, + "step": 313700 + }, + { + "epoch": 4.86972175235494, + "grad_norm": 2.0943002700805664, + "learning_rate": 4.513029376619749e-05, + "loss": 0.9897, + "step": 313800 + }, + { + "epoch": 4.8712736075978835, + "grad_norm": 2.539102792739868, + "learning_rate": 4.5128741910954545e-05, + "loss": 0.9885, + "step": 313900 + }, + { + "epoch": 4.872825462840826, + "grad_norm": 2.569833517074585, + "learning_rate": 4.51271900557116e-05, + "loss": 1.0052, + "step": 314000 + }, + { + "epoch": 4.874377318083769, + "grad_norm": 2.141819477081299, + "learning_rate": 4.512563820046866e-05, + "loss": 0.9849, + "step": 314100 + }, + { + "epoch": 4.875929173326712, + "grad_norm": 2.1459314823150635, + "learning_rate": 4.512408634522572e-05, + "loss": 0.9939, + "step": 314200 + }, + { + "epoch": 4.877481028569655, + "grad_norm": 2.138211965560913, + "learning_rate": 4.5122534489982776e-05, + "loss": 1.0196, + "step": 314300 + }, + { + "epoch": 4.879032883812598, + "grad_norm": 2.1440787315368652, + "learning_rate": 4.5120982634739834e-05, + "loss": 0.9893, + "step": 314400 + }, + { + "epoch": 4.880584739055541, + "grad_norm": 1.9738491773605347, + "learning_rate": 4.511943077949689e-05, + "loss": 0.9883, + "step": 314500 + }, + { + "epoch": 4.882136594298483, + "grad_norm": 2.0031113624572754, + "learning_rate": 4.511787892425395e-05, + "loss": 0.9808, + "step": 314600 + }, + { + "epoch": 4.883688449541427, + "grad_norm": 1.838338851928711, + "learning_rate": 4.511632706901101e-05, + "loss": 0.9773, + "step": 314700 + }, + { + "epoch": 4.88524030478437, + "grad_norm": 2.5889651775360107, + "learning_rate": 4.5114775213768065e-05, + "loss": 0.9903, + "step": 314800 + }, + { + "epoch": 4.886792160027313, + "grad_norm": 1.8924591541290283, + "learning_rate": 4.511322335852512e-05, + "loss": 0.9823, + "step": 314900 + }, + { + "epoch": 4.888344015270255, + "grad_norm": 2.262155771255493, + "learning_rate": 4.5111671503282174e-05, + "loss": 0.9868, + "step": 315000 + }, + { + "epoch": 4.8898958705131985, + "grad_norm": 2.004237413406372, + "learning_rate": 4.511011964803923e-05, + "loss": 0.9847, + "step": 315100 + }, + { + "epoch": 4.891447725756142, + "grad_norm": 1.9922564029693604, + "learning_rate": 4.510856779279629e-05, + "loss": 0.9897, + "step": 315200 + }, + { + "epoch": 4.892999580999084, + "grad_norm": 2.1039512157440186, + "learning_rate": 4.510701593755335e-05, + "loss": 0.9976, + "step": 315300 + }, + { + "epoch": 4.894551436242027, + "grad_norm": 2.3522932529449463, + "learning_rate": 4.5105464082310405e-05, + "loss": 0.9902, + "step": 315400 + }, + { + "epoch": 4.8961032914849705, + "grad_norm": 3.199152946472168, + "learning_rate": 4.510391222706746e-05, + "loss": 1.0005, + "step": 315500 + }, + { + "epoch": 4.897655146727914, + "grad_norm": 2.3306031227111816, + "learning_rate": 4.510236037182452e-05, + "loss": 0.9912, + "step": 315600 + }, + { + "epoch": 4.899207001970856, + "grad_norm": 1.844973087310791, + "learning_rate": 4.510080851658158e-05, + "loss": 0.9901, + "step": 315700 + }, + { + "epoch": 4.900758857213799, + "grad_norm": 2.1430327892303467, + "learning_rate": 4.5099256661338636e-05, + "loss": 0.9988, + "step": 315800 + }, + { + "epoch": 4.902310712456742, + "grad_norm": 1.9988670349121094, + "learning_rate": 4.5097704806095694e-05, + "loss": 0.9826, + "step": 315900 + }, + { + "epoch": 4.903862567699685, + "grad_norm": 1.9529697895050049, + "learning_rate": 4.509615295085275e-05, + "loss": 1.0013, + "step": 316000 + }, + { + "epoch": 4.905414422942628, + "grad_norm": 2.120429754257202, + "learning_rate": 4.509460109560981e-05, + "loss": 0.9941, + "step": 316100 + }, + { + "epoch": 4.906966278185571, + "grad_norm": 1.7236740589141846, + "learning_rate": 4.509304924036686e-05, + "loss": 0.9939, + "step": 316200 + }, + { + "epoch": 4.9085181334285135, + "grad_norm": 2.757526159286499, + "learning_rate": 4.509149738512392e-05, + "loss": 0.9928, + "step": 316300 + }, + { + "epoch": 4.910069988671457, + "grad_norm": 2.2114367485046387, + "learning_rate": 4.5089945529880976e-05, + "loss": 1.0006, + "step": 316400 + }, + { + "epoch": 4.9116218439144, + "grad_norm": 2.275763750076294, + "learning_rate": 4.5088393674638027e-05, + "loss": 0.9995, + "step": 316500 + }, + { + "epoch": 4.913173699157342, + "grad_norm": 2.070420026779175, + "learning_rate": 4.5086841819395084e-05, + "loss": 0.9977, + "step": 316600 + }, + { + "epoch": 4.9147255544002855, + "grad_norm": 2.078160285949707, + "learning_rate": 4.508528996415214e-05, + "loss": 0.9926, + "step": 316700 + }, + { + "epoch": 4.916277409643229, + "grad_norm": 2.0535004138946533, + "learning_rate": 4.50837381089092e-05, + "loss": 1.0036, + "step": 316800 + }, + { + "epoch": 4.917829264886171, + "grad_norm": 1.8921369314193726, + "learning_rate": 4.508218625366626e-05, + "loss": 0.9813, + "step": 316900 + }, + { + "epoch": 4.919381120129114, + "grad_norm": 2.2783751487731934, + "learning_rate": 4.5080634398423315e-05, + "loss": 0.9746, + "step": 317000 + }, + { + "epoch": 4.920932975372057, + "grad_norm": 2.1465771198272705, + "learning_rate": 4.507908254318037e-05, + "loss": 0.9771, + "step": 317100 + }, + { + "epoch": 4.922484830615, + "grad_norm": 2.3580241203308105, + "learning_rate": 4.507753068793743e-05, + "loss": 0.9956, + "step": 317200 + }, + { + "epoch": 4.924036685857943, + "grad_norm": 2.3010239601135254, + "learning_rate": 4.507597883269449e-05, + "loss": 0.9863, + "step": 317300 + }, + { + "epoch": 4.925588541100886, + "grad_norm": 1.9766231775283813, + "learning_rate": 4.5074426977451546e-05, + "loss": 1.0005, + "step": 317400 + }, + { + "epoch": 4.927140396343829, + "grad_norm": 2.148529052734375, + "learning_rate": 4.5072875122208604e-05, + "loss": 0.9843, + "step": 317500 + }, + { + "epoch": 4.928692251586772, + "grad_norm": 2.9304866790771484, + "learning_rate": 4.507132326696566e-05, + "loss": 1.0141, + "step": 317600 + }, + { + "epoch": 4.930244106829715, + "grad_norm": 2.2647006511688232, + "learning_rate": 4.506977141172272e-05, + "loss": 0.9937, + "step": 317700 + }, + { + "epoch": 4.931795962072658, + "grad_norm": 2.278210401535034, + "learning_rate": 4.506821955647977e-05, + "loss": 1.0097, + "step": 317800 + }, + { + "epoch": 4.9333478173156005, + "grad_norm": 1.9699627161026, + "learning_rate": 4.506666770123683e-05, + "loss": 0.9968, + "step": 317900 + }, + { + "epoch": 4.934899672558544, + "grad_norm": 2.1486823558807373, + "learning_rate": 4.5065115845993886e-05, + "loss": 1.0099, + "step": 318000 + }, + { + "epoch": 4.936451527801487, + "grad_norm": 1.8996237516403198, + "learning_rate": 4.5063563990750944e-05, + "loss": 0.9992, + "step": 318100 + }, + { + "epoch": 4.93800338304443, + "grad_norm": 2.353362798690796, + "learning_rate": 4.5062012135508e-05, + "loss": 0.9961, + "step": 318200 + }, + { + "epoch": 4.939555238287372, + "grad_norm": 2.2171120643615723, + "learning_rate": 4.506046028026506e-05, + "loss": 0.9911, + "step": 318300 + }, + { + "epoch": 4.941107093530316, + "grad_norm": 2.42317533493042, + "learning_rate": 4.505890842502212e-05, + "loss": 0.9772, + "step": 318400 + }, + { + "epoch": 4.942658948773259, + "grad_norm": 2.350301504135132, + "learning_rate": 4.5057356569779175e-05, + "loss": 0.9862, + "step": 318500 + }, + { + "epoch": 4.944210804016201, + "grad_norm": 2.3713793754577637, + "learning_rate": 4.505580471453623e-05, + "loss": 1.0027, + "step": 318600 + }, + { + "epoch": 4.945762659259144, + "grad_norm": 1.9676238298416138, + "learning_rate": 4.505425285929329e-05, + "loss": 0.9957, + "step": 318700 + }, + { + "epoch": 4.947314514502088, + "grad_norm": 2.1496734619140625, + "learning_rate": 4.505270100405035e-05, + "loss": 0.9971, + "step": 318800 + }, + { + "epoch": 4.94886636974503, + "grad_norm": 2.2096948623657227, + "learning_rate": 4.5051149148807406e-05, + "loss": 0.9802, + "step": 318900 + }, + { + "epoch": 4.950418224987973, + "grad_norm": 1.9896800518035889, + "learning_rate": 4.5049597293564464e-05, + "loss": 1.0068, + "step": 319000 + }, + { + "epoch": 4.951970080230916, + "grad_norm": 2.3184118270874023, + "learning_rate": 4.5048045438321515e-05, + "loss": 0.9628, + "step": 319100 + }, + { + "epoch": 4.953521935473859, + "grad_norm": 2.2848899364471436, + "learning_rate": 4.504649358307857e-05, + "loss": 0.9781, + "step": 319200 + }, + { + "epoch": 4.955073790716802, + "grad_norm": 2.2715554237365723, + "learning_rate": 4.504494172783563e-05, + "loss": 0.9828, + "step": 319300 + }, + { + "epoch": 4.956625645959745, + "grad_norm": 2.0597498416900635, + "learning_rate": 4.504338987259268e-05, + "loss": 0.9878, + "step": 319400 + }, + { + "epoch": 4.958177501202687, + "grad_norm": 2.066211700439453, + "learning_rate": 4.504183801734974e-05, + "loss": 0.9954, + "step": 319500 + }, + { + "epoch": 4.959729356445631, + "grad_norm": 2.5709476470947266, + "learning_rate": 4.5040286162106797e-05, + "loss": 0.9929, + "step": 319600 + }, + { + "epoch": 4.961281211688574, + "grad_norm": 2.1759355068206787, + "learning_rate": 4.5038734306863854e-05, + "loss": 0.9693, + "step": 319700 + }, + { + "epoch": 4.962833066931516, + "grad_norm": 2.348320245742798, + "learning_rate": 4.503718245162091e-05, + "loss": 1.0064, + "step": 319800 + }, + { + "epoch": 4.964384922174459, + "grad_norm": 2.4137532711029053, + "learning_rate": 4.503563059637797e-05, + "loss": 1.0024, + "step": 319900 + }, + { + "epoch": 4.965936777417403, + "grad_norm": 2.276843547821045, + "learning_rate": 4.503407874113503e-05, + "loss": 1.0082, + "step": 320000 + }, + { + "epoch": 4.967488632660346, + "grad_norm": 2.1787757873535156, + "learning_rate": 4.5032526885892085e-05, + "loss": 0.9911, + "step": 320100 + }, + { + "epoch": 4.969040487903288, + "grad_norm": 2.136810779571533, + "learning_rate": 4.503097503064914e-05, + "loss": 0.9857, + "step": 320200 + }, + { + "epoch": 4.970592343146231, + "grad_norm": 2.3838462829589844, + "learning_rate": 4.50294231754062e-05, + "loss": 0.9849, + "step": 320300 + }, + { + "epoch": 4.9721441983891745, + "grad_norm": 2.494656801223755, + "learning_rate": 4.502787132016326e-05, + "loss": 1.0124, + "step": 320400 + }, + { + "epoch": 4.973696053632117, + "grad_norm": 2.2673838138580322, + "learning_rate": 4.5026319464920316e-05, + "loss": 0.967, + "step": 320500 + }, + { + "epoch": 4.97524790887506, + "grad_norm": 2.2103631496429443, + "learning_rate": 4.5024767609677374e-05, + "loss": 0.9964, + "step": 320600 + }, + { + "epoch": 4.976799764118003, + "grad_norm": 2.6784791946411133, + "learning_rate": 4.5023215754434425e-05, + "loss": 0.9971, + "step": 320700 + }, + { + "epoch": 4.9783516193609465, + "grad_norm": 2.480079174041748, + "learning_rate": 4.502166389919148e-05, + "loss": 0.9589, + "step": 320800 + }, + { + "epoch": 4.979903474603889, + "grad_norm": 2.04945707321167, + "learning_rate": 4.502011204394854e-05, + "loss": 1.0014, + "step": 320900 + }, + { + "epoch": 4.981455329846832, + "grad_norm": 2.558013677597046, + "learning_rate": 4.50185601887056e-05, + "loss": 0.9928, + "step": 321000 + }, + { + "epoch": 4.983007185089775, + "grad_norm": 2.3200182914733887, + "learning_rate": 4.5017008333462656e-05, + "loss": 0.9969, + "step": 321100 + }, + { + "epoch": 4.984559040332718, + "grad_norm": 2.510084629058838, + "learning_rate": 4.5015456478219714e-05, + "loss": 0.998, + "step": 321200 + }, + { + "epoch": 4.986110895575661, + "grad_norm": 2.0798165798187256, + "learning_rate": 4.501390462297677e-05, + "loss": 0.996, + "step": 321300 + }, + { + "epoch": 4.987662750818604, + "grad_norm": 2.3555374145507812, + "learning_rate": 4.501235276773383e-05, + "loss": 0.9795, + "step": 321400 + }, + { + "epoch": 4.989214606061546, + "grad_norm": 2.3246071338653564, + "learning_rate": 4.501080091249089e-05, + "loss": 0.992, + "step": 321500 + }, + { + "epoch": 4.9907664613044895, + "grad_norm": 2.4454610347747803, + "learning_rate": 4.5009249057247945e-05, + "loss": 1.0054, + "step": 321600 + }, + { + "epoch": 4.992318316547433, + "grad_norm": 2.497229814529419, + "learning_rate": 4.5007697202005e-05, + "loss": 1.0186, + "step": 321700 + }, + { + "epoch": 4.993870171790375, + "grad_norm": 2.51166033744812, + "learning_rate": 4.500614534676206e-05, + "loss": 0.9986, + "step": 321800 + }, + { + "epoch": 4.995422027033318, + "grad_norm": 2.4567017555236816, + "learning_rate": 4.500459349151912e-05, + "loss": 0.9972, + "step": 321900 + }, + { + "epoch": 4.9969738822762615, + "grad_norm": 2.2930448055267334, + "learning_rate": 4.500304163627617e-05, + "loss": 0.9825, + "step": 322000 + }, + { + "epoch": 4.998525737519204, + "grad_norm": 2.1482913494110107, + "learning_rate": 4.500148978103323e-05, + "loss": 0.9983, + "step": 322100 + }, + { + "epoch": 5.000077592762147, + "grad_norm": 2.2424023151397705, + "learning_rate": 4.4999937925790285e-05, + "loss": 0.9822, + "step": 322200 + }, + { + "epoch": 5.00162944800509, + "grad_norm": 2.2823832035064697, + "learning_rate": 4.499838607054734e-05, + "loss": 0.9885, + "step": 322300 + }, + { + "epoch": 5.0031813032480335, + "grad_norm": 2.415766716003418, + "learning_rate": 4.49968342153044e-05, + "loss": 0.9916, + "step": 322400 + }, + { + "epoch": 5.004733158490976, + "grad_norm": 1.97372567653656, + "learning_rate": 4.499528236006146e-05, + "loss": 0.9873, + "step": 322500 + }, + { + "epoch": 5.006285013733919, + "grad_norm": 1.8922631740570068, + "learning_rate": 4.4993730504818516e-05, + "loss": 0.9904, + "step": 322600 + }, + { + "epoch": 5.007836868976862, + "grad_norm": 2.2126123905181885, + "learning_rate": 4.4992178649575567e-05, + "loss": 0.9724, + "step": 322700 + }, + { + "epoch": 5.0093887242198045, + "grad_norm": 2.1099987030029297, + "learning_rate": 4.4990626794332624e-05, + "loss": 0.9924, + "step": 322800 + }, + { + "epoch": 5.010940579462748, + "grad_norm": 2.4179370403289795, + "learning_rate": 4.498907493908968e-05, + "loss": 0.981, + "step": 322900 + }, + { + "epoch": 5.012492434705691, + "grad_norm": 2.197739839553833, + "learning_rate": 4.498752308384674e-05, + "loss": 0.9809, + "step": 323000 + }, + { + "epoch": 5.014044289948633, + "grad_norm": 2.29628324508667, + "learning_rate": 4.49859712286038e-05, + "loss": 0.9741, + "step": 323100 + }, + { + "epoch": 5.0155961451915765, + "grad_norm": 2.441187620162964, + "learning_rate": 4.4984419373360855e-05, + "loss": 0.9729, + "step": 323200 + }, + { + "epoch": 5.01714800043452, + "grad_norm": 1.7777624130249023, + "learning_rate": 4.498286751811791e-05, + "loss": 0.9646, + "step": 323300 + }, + { + "epoch": 5.018699855677462, + "grad_norm": 2.440723180770874, + "learning_rate": 4.498131566287497e-05, + "loss": 0.9771, + "step": 323400 + }, + { + "epoch": 5.020251710920405, + "grad_norm": 2.075007915496826, + "learning_rate": 4.497976380763202e-05, + "loss": 0.9746, + "step": 323500 + }, + { + "epoch": 5.021803566163348, + "grad_norm": 1.9657238721847534, + "learning_rate": 4.497821195238908e-05, + "loss": 0.973, + "step": 323600 + }, + { + "epoch": 5.023355421406292, + "grad_norm": 2.0091726779937744, + "learning_rate": 4.497666009714614e-05, + "loss": 0.9853, + "step": 323700 + }, + { + "epoch": 5.024907276649234, + "grad_norm": 2.3106119632720947, + "learning_rate": 4.4975108241903195e-05, + "loss": 0.9977, + "step": 323800 + }, + { + "epoch": 5.026459131892177, + "grad_norm": 2.1008052825927734, + "learning_rate": 4.497355638666025e-05, + "loss": 0.9742, + "step": 323900 + }, + { + "epoch": 5.02801098713512, + "grad_norm": 2.4200737476348877, + "learning_rate": 4.497200453141731e-05, + "loss": 0.9889, + "step": 324000 + }, + { + "epoch": 5.029562842378063, + "grad_norm": 2.3959174156188965, + "learning_rate": 4.497045267617437e-05, + "loss": 0.9626, + "step": 324100 + }, + { + "epoch": 5.031114697621006, + "grad_norm": 2.0759968757629395, + "learning_rate": 4.4968900820931426e-05, + "loss": 0.9723, + "step": 324200 + }, + { + "epoch": 5.032666552863949, + "grad_norm": 2.0733158588409424, + "learning_rate": 4.4967348965688484e-05, + "loss": 1.005, + "step": 324300 + }, + { + "epoch": 5.0342184081068915, + "grad_norm": 2.3548550605773926, + "learning_rate": 4.496579711044554e-05, + "loss": 0.9703, + "step": 324400 + }, + { + "epoch": 5.035770263349835, + "grad_norm": 2.2286078929901123, + "learning_rate": 4.49642452552026e-05, + "loss": 0.9645, + "step": 324500 + }, + { + "epoch": 5.037322118592778, + "grad_norm": 2.5244295597076416, + "learning_rate": 4.496269339995966e-05, + "loss": 0.9677, + "step": 324600 + }, + { + "epoch": 5.03887397383572, + "grad_norm": 2.2448983192443848, + "learning_rate": 4.4961141544716715e-05, + "loss": 0.9954, + "step": 324700 + }, + { + "epoch": 5.040425829078663, + "grad_norm": 2.460890531539917, + "learning_rate": 4.4959589689473766e-05, + "loss": 0.9627, + "step": 324800 + }, + { + "epoch": 5.041977684321607, + "grad_norm": 2.4765453338623047, + "learning_rate": 4.4958037834230824e-05, + "loss": 1.0006, + "step": 324900 + }, + { + "epoch": 5.04352953956455, + "grad_norm": 2.5767853260040283, + "learning_rate": 4.495648597898788e-05, + "loss": 0.9935, + "step": 325000 + }, + { + "epoch": 5.045081394807492, + "grad_norm": 3.2556047439575195, + "learning_rate": 4.495493412374494e-05, + "loss": 0.9818, + "step": 325100 + }, + { + "epoch": 5.046633250050435, + "grad_norm": 2.1471121311187744, + "learning_rate": 4.4953382268502e-05, + "loss": 0.9956, + "step": 325200 + }, + { + "epoch": 5.048185105293379, + "grad_norm": 2.146810531616211, + "learning_rate": 4.4951830413259055e-05, + "loss": 0.9869, + "step": 325300 + }, + { + "epoch": 5.049736960536321, + "grad_norm": 1.9521751403808594, + "learning_rate": 4.495027855801611e-05, + "loss": 0.9755, + "step": 325400 + }, + { + "epoch": 5.051288815779264, + "grad_norm": 2.013068675994873, + "learning_rate": 4.494872670277317e-05, + "loss": 0.985, + "step": 325500 + }, + { + "epoch": 5.052840671022207, + "grad_norm": 2.042806386947632, + "learning_rate": 4.494717484753023e-05, + "loss": 0.9754, + "step": 325600 + }, + { + "epoch": 5.05439252626515, + "grad_norm": 2.262981653213501, + "learning_rate": 4.4945622992287286e-05, + "loss": 0.9974, + "step": 325700 + }, + { + "epoch": 5.055944381508093, + "grad_norm": 2.10441255569458, + "learning_rate": 4.494407113704434e-05, + "loss": 0.9566, + "step": 325800 + }, + { + "epoch": 5.057496236751036, + "grad_norm": 2.24804425239563, + "learning_rate": 4.49425192818014e-05, + "loss": 0.9934, + "step": 325900 + }, + { + "epoch": 5.059048091993978, + "grad_norm": 2.182786703109741, + "learning_rate": 4.494096742655845e-05, + "loss": 0.9887, + "step": 326000 + }, + { + "epoch": 5.060599947236922, + "grad_norm": 2.216348171234131, + "learning_rate": 4.493941557131551e-05, + "loss": 0.9928, + "step": 326100 + }, + { + "epoch": 5.062151802479865, + "grad_norm": 2.180921792984009, + "learning_rate": 4.493786371607257e-05, + "loss": 0.9664, + "step": 326200 + }, + { + "epoch": 5.063703657722808, + "grad_norm": 2.5766446590423584, + "learning_rate": 4.493631186082962e-05, + "loss": 0.9823, + "step": 326300 + }, + { + "epoch": 5.06525551296575, + "grad_norm": 2.099266290664673, + "learning_rate": 4.4934760005586676e-05, + "loss": 0.973, + "step": 326400 + }, + { + "epoch": 5.066807368208694, + "grad_norm": 2.3833377361297607, + "learning_rate": 4.4933208150343734e-05, + "loss": 0.9837, + "step": 326500 + }, + { + "epoch": 5.068359223451637, + "grad_norm": 2.3511757850646973, + "learning_rate": 4.493165629510079e-05, + "loss": 1.0028, + "step": 326600 + }, + { + "epoch": 5.069911078694579, + "grad_norm": 2.3296661376953125, + "learning_rate": 4.493010443985785e-05, + "loss": 0.973, + "step": 326700 + }, + { + "epoch": 5.071462933937522, + "grad_norm": 2.1355676651000977, + "learning_rate": 4.492855258461491e-05, + "loss": 0.9714, + "step": 326800 + }, + { + "epoch": 5.0730147891804656, + "grad_norm": 2.3457136154174805, + "learning_rate": 4.4927000729371965e-05, + "loss": 0.9647, + "step": 326900 + }, + { + "epoch": 5.074566644423408, + "grad_norm": 2.4499151706695557, + "learning_rate": 4.492544887412902e-05, + "loss": 0.9798, + "step": 327000 + }, + { + "epoch": 5.076118499666351, + "grad_norm": 2.6530094146728516, + "learning_rate": 4.492389701888608e-05, + "loss": 1.0072, + "step": 327100 + }, + { + "epoch": 5.077670354909294, + "grad_norm": 2.5040955543518066, + "learning_rate": 4.492234516364314e-05, + "loss": 1.0017, + "step": 327200 + }, + { + "epoch": 5.079222210152237, + "grad_norm": 2.2727019786834717, + "learning_rate": 4.4920793308400196e-05, + "loss": 0.9797, + "step": 327300 + }, + { + "epoch": 5.08077406539518, + "grad_norm": 2.231516122817993, + "learning_rate": 4.4919241453157254e-05, + "loss": 0.9766, + "step": 327400 + }, + { + "epoch": 5.082325920638123, + "grad_norm": 2.1343374252319336, + "learning_rate": 4.491768959791431e-05, + "loss": 0.9679, + "step": 327500 + }, + { + "epoch": 5.083877775881065, + "grad_norm": 1.9527649879455566, + "learning_rate": 4.491613774267136e-05, + "loss": 0.9785, + "step": 327600 + }, + { + "epoch": 5.085429631124009, + "grad_norm": 2.5343494415283203, + "learning_rate": 4.491458588742842e-05, + "loss": 0.999, + "step": 327700 + }, + { + "epoch": 5.086981486366952, + "grad_norm": 2.1045212745666504, + "learning_rate": 4.491303403218548e-05, + "loss": 1.0005, + "step": 327800 + }, + { + "epoch": 5.088533341609895, + "grad_norm": 2.259221076965332, + "learning_rate": 4.4911482176942536e-05, + "loss": 0.9895, + "step": 327900 + }, + { + "epoch": 5.090085196852837, + "grad_norm": 2.301227331161499, + "learning_rate": 4.4909930321699594e-05, + "loss": 0.9904, + "step": 328000 + }, + { + "epoch": 5.0916370520957805, + "grad_norm": 2.032723903656006, + "learning_rate": 4.490837846645665e-05, + "loss": 0.9837, + "step": 328100 + }, + { + "epoch": 5.093188907338724, + "grad_norm": 2.1664624214172363, + "learning_rate": 4.490682661121371e-05, + "loss": 1.0095, + "step": 328200 + }, + { + "epoch": 5.094740762581666, + "grad_norm": 2.5637974739074707, + "learning_rate": 4.490527475597077e-05, + "loss": 1.0016, + "step": 328300 + }, + { + "epoch": 5.096292617824609, + "grad_norm": 2.775192975997925, + "learning_rate": 4.4903722900727825e-05, + "loss": 1.0008, + "step": 328400 + }, + { + "epoch": 5.0978444730675525, + "grad_norm": 2.0807552337646484, + "learning_rate": 4.490217104548488e-05, + "loss": 0.9837, + "step": 328500 + }, + { + "epoch": 5.099396328310495, + "grad_norm": 2.2242581844329834, + "learning_rate": 4.490061919024194e-05, + "loss": 0.9981, + "step": 328600 + }, + { + "epoch": 5.100948183553438, + "grad_norm": 1.8216568231582642, + "learning_rate": 4.4899067334999e-05, + "loss": 1.0021, + "step": 328700 + }, + { + "epoch": 5.102500038796381, + "grad_norm": 2.2259035110473633, + "learning_rate": 4.4897515479756056e-05, + "loss": 1.0082, + "step": 328800 + }, + { + "epoch": 5.104051894039324, + "grad_norm": 2.189532518386841, + "learning_rate": 4.4895963624513107e-05, + "loss": 1.008, + "step": 328900 + }, + { + "epoch": 5.105603749282267, + "grad_norm": 2.4402806758880615, + "learning_rate": 4.4894411769270164e-05, + "loss": 1.0065, + "step": 329000 + }, + { + "epoch": 5.10715560452521, + "grad_norm": 2.2367091178894043, + "learning_rate": 4.489285991402722e-05, + "loss": 0.9872, + "step": 329100 + }, + { + "epoch": 5.108707459768153, + "grad_norm": 2.6189193725585938, + "learning_rate": 4.489130805878427e-05, + "loss": 0.9856, + "step": 329200 + }, + { + "epoch": 5.1102593150110955, + "grad_norm": 2.663487195968628, + "learning_rate": 4.488975620354133e-05, + "loss": 0.975, + "step": 329300 + }, + { + "epoch": 5.111811170254039, + "grad_norm": 2.0868868827819824, + "learning_rate": 4.488820434829839e-05, + "loss": 0.9888, + "step": 329400 + }, + { + "epoch": 5.113363025496982, + "grad_norm": 2.2460601329803467, + "learning_rate": 4.4886652493055446e-05, + "loss": 1.0017, + "step": 329500 + }, + { + "epoch": 5.114914880739924, + "grad_norm": 2.796231746673584, + "learning_rate": 4.4885100637812504e-05, + "loss": 0.973, + "step": 329600 + }, + { + "epoch": 5.1164667359828675, + "grad_norm": 2.2700040340423584, + "learning_rate": 4.488354878256956e-05, + "loss": 0.9873, + "step": 329700 + }, + { + "epoch": 5.118018591225811, + "grad_norm": 2.3378028869628906, + "learning_rate": 4.488199692732662e-05, + "loss": 0.9736, + "step": 329800 + }, + { + "epoch": 5.119570446468753, + "grad_norm": 1.8950952291488647, + "learning_rate": 4.488044507208368e-05, + "loss": 0.9791, + "step": 329900 + }, + { + "epoch": 5.121122301711696, + "grad_norm": 2.1657330989837646, + "learning_rate": 4.4878893216840735e-05, + "loss": 0.9752, + "step": 330000 + }, + { + "epoch": 5.1226741569546395, + "grad_norm": 2.277442455291748, + "learning_rate": 4.487734136159779e-05, + "loss": 0.9884, + "step": 330100 + }, + { + "epoch": 5.124226012197582, + "grad_norm": 2.029282331466675, + "learning_rate": 4.487578950635485e-05, + "loss": 0.99, + "step": 330200 + }, + { + "epoch": 5.125777867440525, + "grad_norm": 2.328399896621704, + "learning_rate": 4.487423765111191e-05, + "loss": 0.985, + "step": 330300 + }, + { + "epoch": 5.127329722683468, + "grad_norm": 2.042090892791748, + "learning_rate": 4.4872685795868966e-05, + "loss": 0.9752, + "step": 330400 + }, + { + "epoch": 5.128881577926411, + "grad_norm": 2.092154026031494, + "learning_rate": 4.487113394062602e-05, + "loss": 0.9834, + "step": 330500 + }, + { + "epoch": 5.130433433169354, + "grad_norm": 2.164201259613037, + "learning_rate": 4.4869582085383075e-05, + "loss": 1.006, + "step": 330600 + }, + { + "epoch": 5.131985288412297, + "grad_norm": 2.257042646408081, + "learning_rate": 4.486803023014013e-05, + "loss": 1.0192, + "step": 330700 + }, + { + "epoch": 5.13353714365524, + "grad_norm": 2.147996187210083, + "learning_rate": 4.486647837489719e-05, + "loss": 0.9702, + "step": 330800 + }, + { + "epoch": 5.1350889988981825, + "grad_norm": 2.2125768661499023, + "learning_rate": 4.486492651965425e-05, + "loss": 0.9834, + "step": 330900 + }, + { + "epoch": 5.136640854141126, + "grad_norm": 2.200608015060425, + "learning_rate": 4.4863374664411306e-05, + "loss": 0.9925, + "step": 331000 + }, + { + "epoch": 5.138192709384069, + "grad_norm": 2.172070026397705, + "learning_rate": 4.4861822809168364e-05, + "loss": 0.9818, + "step": 331100 + }, + { + "epoch": 5.139744564627011, + "grad_norm": 1.8601043224334717, + "learning_rate": 4.486027095392542e-05, + "loss": 0.9737, + "step": 331200 + }, + { + "epoch": 5.1412964198699544, + "grad_norm": 2.188028335571289, + "learning_rate": 4.485871909868248e-05, + "loss": 0.9903, + "step": 331300 + }, + { + "epoch": 5.142848275112898, + "grad_norm": 2.4077775478363037, + "learning_rate": 4.485716724343954e-05, + "loss": 0.9788, + "step": 331400 + }, + { + "epoch": 5.14440013035584, + "grad_norm": 1.905815601348877, + "learning_rate": 4.4855615388196595e-05, + "loss": 0.9895, + "step": 331500 + }, + { + "epoch": 5.145951985598783, + "grad_norm": 2.180434226989746, + "learning_rate": 4.485406353295365e-05, + "loss": 1.0077, + "step": 331600 + }, + { + "epoch": 5.147503840841726, + "grad_norm": 2.543748617172241, + "learning_rate": 4.485251167771071e-05, + "loss": 0.9815, + "step": 331700 + }, + { + "epoch": 5.14905569608467, + "grad_norm": 1.920924186706543, + "learning_rate": 4.485095982246776e-05, + "loss": 1.0053, + "step": 331800 + }, + { + "epoch": 5.150607551327612, + "grad_norm": 2.1416079998016357, + "learning_rate": 4.484940796722482e-05, + "loss": 1.0021, + "step": 331900 + }, + { + "epoch": 5.152159406570555, + "grad_norm": 2.3306236267089844, + "learning_rate": 4.4847856111981877e-05, + "loss": 0.9761, + "step": 332000 + }, + { + "epoch": 5.153711261813498, + "grad_norm": 2.3076112270355225, + "learning_rate": 4.4846304256738934e-05, + "loss": 0.9958, + "step": 332100 + }, + { + "epoch": 5.155263117056441, + "grad_norm": 1.98060142993927, + "learning_rate": 4.484475240149599e-05, + "loss": 0.9937, + "step": 332200 + }, + { + "epoch": 5.156814972299384, + "grad_norm": 2.002647876739502, + "learning_rate": 4.484320054625305e-05, + "loss": 0.9718, + "step": 332300 + }, + { + "epoch": 5.158366827542327, + "grad_norm": 2.1483147144317627, + "learning_rate": 4.484164869101011e-05, + "loss": 0.966, + "step": 332400 + }, + { + "epoch": 5.159918682785269, + "grad_norm": 2.1069324016571045, + "learning_rate": 4.484009683576716e-05, + "loss": 0.9874, + "step": 332500 + }, + { + "epoch": 5.161470538028213, + "grad_norm": 2.287614345550537, + "learning_rate": 4.4838544980524216e-05, + "loss": 0.9804, + "step": 332600 + }, + { + "epoch": 5.163022393271156, + "grad_norm": 2.2705307006835938, + "learning_rate": 4.4836993125281274e-05, + "loss": 0.9806, + "step": 332700 + }, + { + "epoch": 5.164574248514098, + "grad_norm": 2.164897918701172, + "learning_rate": 4.483544127003833e-05, + "loss": 1.0, + "step": 332800 + }, + { + "epoch": 5.166126103757041, + "grad_norm": 2.4844210147857666, + "learning_rate": 4.483388941479539e-05, + "loss": 0.956, + "step": 332900 + }, + { + "epoch": 5.167677958999985, + "grad_norm": 2.5571064949035645, + "learning_rate": 4.483233755955245e-05, + "loss": 0.9946, + "step": 333000 + }, + { + "epoch": 5.169229814242928, + "grad_norm": 1.5777077674865723, + "learning_rate": 4.4830785704309505e-05, + "loss": 0.9976, + "step": 333100 + }, + { + "epoch": 5.17078166948587, + "grad_norm": 2.184669017791748, + "learning_rate": 4.482923384906656e-05, + "loss": 0.9631, + "step": 333200 + }, + { + "epoch": 5.172333524728813, + "grad_norm": 2.055509328842163, + "learning_rate": 4.4827681993823614e-05, + "loss": 0.9849, + "step": 333300 + }, + { + "epoch": 5.173885379971757, + "grad_norm": 2.57855224609375, + "learning_rate": 4.482613013858067e-05, + "loss": 0.981, + "step": 333400 + }, + { + "epoch": 5.175437235214699, + "grad_norm": 2.0482442378997803, + "learning_rate": 4.482457828333773e-05, + "loss": 0.9696, + "step": 333500 + }, + { + "epoch": 5.176989090457642, + "grad_norm": 2.0093798637390137, + "learning_rate": 4.482302642809479e-05, + "loss": 1.0008, + "step": 333600 + }, + { + "epoch": 5.178540945700585, + "grad_norm": 2.007098913192749, + "learning_rate": 4.4821474572851845e-05, + "loss": 0.9702, + "step": 333700 + }, + { + "epoch": 5.180092800943528, + "grad_norm": 2.0783870220184326, + "learning_rate": 4.48199227176089e-05, + "loss": 0.9866, + "step": 333800 + }, + { + "epoch": 5.181644656186471, + "grad_norm": 5.008114337921143, + "learning_rate": 4.481837086236596e-05, + "loss": 0.9755, + "step": 333900 + }, + { + "epoch": 5.183196511429414, + "grad_norm": 2.196830987930298, + "learning_rate": 4.481681900712302e-05, + "loss": 1.0013, + "step": 334000 + }, + { + "epoch": 5.184748366672356, + "grad_norm": 2.244760274887085, + "learning_rate": 4.4815267151880076e-05, + "loss": 0.9681, + "step": 334100 + }, + { + "epoch": 5.1863002219153, + "grad_norm": 2.2426247596740723, + "learning_rate": 4.4813715296637134e-05, + "loss": 0.9665, + "step": 334200 + }, + { + "epoch": 5.187852077158243, + "grad_norm": 2.5527238845825195, + "learning_rate": 4.481216344139419e-05, + "loss": 0.9829, + "step": 334300 + }, + { + "epoch": 5.189403932401186, + "grad_norm": 2.1004483699798584, + "learning_rate": 4.481061158615125e-05, + "loss": 0.9783, + "step": 334400 + }, + { + "epoch": 5.190955787644128, + "grad_norm": 2.1624903678894043, + "learning_rate": 4.480905973090831e-05, + "loss": 1.0078, + "step": 334500 + }, + { + "epoch": 5.192507642887072, + "grad_norm": 2.4121296405792236, + "learning_rate": 4.480750787566536e-05, + "loss": 0.9972, + "step": 334600 + }, + { + "epoch": 5.194059498130015, + "grad_norm": 2.3691892623901367, + "learning_rate": 4.4805956020422416e-05, + "loss": 1.0261, + "step": 334700 + }, + { + "epoch": 5.195611353372957, + "grad_norm": 2.145981788635254, + "learning_rate": 4.480440416517947e-05, + "loss": 0.989, + "step": 334800 + }, + { + "epoch": 5.1971632086159, + "grad_norm": 2.2127740383148193, + "learning_rate": 4.480285230993653e-05, + "loss": 0.9894, + "step": 334900 + }, + { + "epoch": 5.1987150638588435, + "grad_norm": 2.211792469024658, + "learning_rate": 4.480130045469359e-05, + "loss": 0.9869, + "step": 335000 + }, + { + "epoch": 5.200266919101786, + "grad_norm": 1.8884594440460205, + "learning_rate": 4.4799748599450647e-05, + "loss": 0.9848, + "step": 335100 + }, + { + "epoch": 5.201818774344729, + "grad_norm": 2.436602830886841, + "learning_rate": 4.4798196744207704e-05, + "loss": 0.9926, + "step": 335200 + }, + { + "epoch": 5.203370629587672, + "grad_norm": 2.0790979862213135, + "learning_rate": 4.479664488896476e-05, + "loss": 0.9883, + "step": 335300 + }, + { + "epoch": 5.204922484830615, + "grad_norm": 2.403123378753662, + "learning_rate": 4.479509303372182e-05, + "loss": 0.9989, + "step": 335400 + }, + { + "epoch": 5.206474340073558, + "grad_norm": 2.6842238903045654, + "learning_rate": 4.479354117847888e-05, + "loss": 0.9815, + "step": 335500 + }, + { + "epoch": 5.208026195316501, + "grad_norm": 2.16825008392334, + "learning_rate": 4.4791989323235935e-05, + "loss": 0.9827, + "step": 335600 + }, + { + "epoch": 5.209578050559444, + "grad_norm": 2.311147689819336, + "learning_rate": 4.4790437467992986e-05, + "loss": 0.9878, + "step": 335700 + }, + { + "epoch": 5.2111299058023866, + "grad_norm": 2.3077635765075684, + "learning_rate": 4.4788885612750044e-05, + "loss": 0.9787, + "step": 335800 + }, + { + "epoch": 5.21268176104533, + "grad_norm": 2.1991968154907227, + "learning_rate": 4.47873337575071e-05, + "loss": 0.9862, + "step": 335900 + }, + { + "epoch": 5.214233616288273, + "grad_norm": 2.2159273624420166, + "learning_rate": 4.478578190226416e-05, + "loss": 0.9948, + "step": 336000 + }, + { + "epoch": 5.215785471531215, + "grad_norm": 2.11722469329834, + "learning_rate": 4.478423004702121e-05, + "loss": 0.9928, + "step": 336100 + }, + { + "epoch": 5.2173373267741585, + "grad_norm": 2.129912853240967, + "learning_rate": 4.478267819177827e-05, + "loss": 0.9837, + "step": 336200 + }, + { + "epoch": 5.218889182017102, + "grad_norm": 2.3540561199188232, + "learning_rate": 4.4781126336535326e-05, + "loss": 0.969, + "step": 336300 + }, + { + "epoch": 5.220441037260044, + "grad_norm": 2.9386038780212402, + "learning_rate": 4.4779574481292384e-05, + "loss": 1.0009, + "step": 336400 + }, + { + "epoch": 5.221992892502987, + "grad_norm": 2.277980089187622, + "learning_rate": 4.477802262604944e-05, + "loss": 0.9833, + "step": 336500 + }, + { + "epoch": 5.2235447477459305, + "grad_norm": 2.2846670150756836, + "learning_rate": 4.47764707708065e-05, + "loss": 0.9867, + "step": 336600 + }, + { + "epoch": 5.225096602988873, + "grad_norm": 2.242232322692871, + "learning_rate": 4.477491891556356e-05, + "loss": 1.0026, + "step": 336700 + }, + { + "epoch": 5.226648458231816, + "grad_norm": 2.33859920501709, + "learning_rate": 4.4773367060320615e-05, + "loss": 0.9743, + "step": 336800 + }, + { + "epoch": 5.228200313474759, + "grad_norm": 2.392976760864258, + "learning_rate": 4.477181520507767e-05, + "loss": 0.9821, + "step": 336900 + }, + { + "epoch": 5.229752168717702, + "grad_norm": 2.31386399269104, + "learning_rate": 4.477026334983473e-05, + "loss": 0.9777, + "step": 337000 + }, + { + "epoch": 5.231304023960645, + "grad_norm": 1.855766773223877, + "learning_rate": 4.476871149459179e-05, + "loss": 0.9878, + "step": 337100 + }, + { + "epoch": 5.232855879203588, + "grad_norm": 2.1748948097229004, + "learning_rate": 4.4767159639348846e-05, + "loss": 0.9918, + "step": 337200 + }, + { + "epoch": 5.234407734446531, + "grad_norm": 2.3267738819122314, + "learning_rate": 4.4765607784105904e-05, + "loss": 0.9866, + "step": 337300 + }, + { + "epoch": 5.2359595896894735, + "grad_norm": 2.455652952194214, + "learning_rate": 4.476405592886296e-05, + "loss": 0.9869, + "step": 337400 + }, + { + "epoch": 5.237511444932417, + "grad_norm": 2.5350844860076904, + "learning_rate": 4.476250407362001e-05, + "loss": 0.9847, + "step": 337500 + }, + { + "epoch": 5.23906330017536, + "grad_norm": 1.9691929817199707, + "learning_rate": 4.476095221837707e-05, + "loss": 1.0111, + "step": 337600 + }, + { + "epoch": 5.240615155418302, + "grad_norm": 2.290410280227661, + "learning_rate": 4.475940036313413e-05, + "loss": 0.9729, + "step": 337700 + }, + { + "epoch": 5.2421670106612455, + "grad_norm": 2.3395233154296875, + "learning_rate": 4.4757848507891186e-05, + "loss": 0.9821, + "step": 337800 + }, + { + "epoch": 5.243718865904189, + "grad_norm": 2.4305055141448975, + "learning_rate": 4.475629665264824e-05, + "loss": 0.9782, + "step": 337900 + }, + { + "epoch": 5.245270721147131, + "grad_norm": 2.369328022003174, + "learning_rate": 4.47547447974053e-05, + "loss": 0.988, + "step": 338000 + }, + { + "epoch": 5.246822576390074, + "grad_norm": 2.1362602710723877, + "learning_rate": 4.475319294216236e-05, + "loss": 0.984, + "step": 338100 + }, + { + "epoch": 5.248374431633017, + "grad_norm": 1.7460198402404785, + "learning_rate": 4.4751641086919417e-05, + "loss": 0.9447, + "step": 338200 + }, + { + "epoch": 5.249926286875961, + "grad_norm": 1.9936199188232422, + "learning_rate": 4.4750089231676474e-05, + "loss": 0.9798, + "step": 338300 + }, + { + "epoch": 5.251478142118903, + "grad_norm": 2.407552480697632, + "learning_rate": 4.474853737643353e-05, + "loss": 0.9769, + "step": 338400 + }, + { + "epoch": 5.253029997361846, + "grad_norm": 1.9618628025054932, + "learning_rate": 4.474698552119059e-05, + "loss": 0.9798, + "step": 338500 + }, + { + "epoch": 5.254581852604789, + "grad_norm": 2.495535373687744, + "learning_rate": 4.474543366594765e-05, + "loss": 0.9669, + "step": 338600 + }, + { + "epoch": 5.256133707847732, + "grad_norm": 2.052490234375, + "learning_rate": 4.4743881810704705e-05, + "loss": 0.9913, + "step": 338700 + }, + { + "epoch": 5.257685563090675, + "grad_norm": 2.4209463596343994, + "learning_rate": 4.4742329955461756e-05, + "loss": 0.9773, + "step": 338800 + }, + { + "epoch": 5.259237418333618, + "grad_norm": 2.619950771331787, + "learning_rate": 4.4740778100218814e-05, + "loss": 0.9861, + "step": 338900 + }, + { + "epoch": 5.2607892735765605, + "grad_norm": 2.135354518890381, + "learning_rate": 4.4739226244975865e-05, + "loss": 0.9784, + "step": 339000 + }, + { + "epoch": 5.262341128819504, + "grad_norm": 2.1868016719818115, + "learning_rate": 4.473767438973292e-05, + "loss": 0.9905, + "step": 339100 + }, + { + "epoch": 5.263892984062447, + "grad_norm": 2.497985601425171, + "learning_rate": 4.473612253448998e-05, + "loss": 0.9944, + "step": 339200 + }, + { + "epoch": 5.265444839305389, + "grad_norm": 2.0293495655059814, + "learning_rate": 4.473457067924704e-05, + "loss": 0.9879, + "step": 339300 + }, + { + "epoch": 5.266996694548332, + "grad_norm": 2.726874589920044, + "learning_rate": 4.4733018824004096e-05, + "loss": 0.9737, + "step": 339400 + }, + { + "epoch": 5.268548549791276, + "grad_norm": 2.117095470428467, + "learning_rate": 4.4731466968761154e-05, + "loss": 0.9873, + "step": 339500 + }, + { + "epoch": 5.270100405034219, + "grad_norm": 2.224587917327881, + "learning_rate": 4.472991511351821e-05, + "loss": 0.9789, + "step": 339600 + }, + { + "epoch": 5.271652260277161, + "grad_norm": 2.0570099353790283, + "learning_rate": 4.472836325827527e-05, + "loss": 0.9839, + "step": 339700 + }, + { + "epoch": 5.273204115520104, + "grad_norm": 2.1905441284179688, + "learning_rate": 4.472681140303233e-05, + "loss": 0.9822, + "step": 339800 + }, + { + "epoch": 5.274755970763048, + "grad_norm": 2.166013240814209, + "learning_rate": 4.4725259547789385e-05, + "loss": 0.9869, + "step": 339900 + }, + { + "epoch": 5.27630782600599, + "grad_norm": 2.6489365100860596, + "learning_rate": 4.472370769254644e-05, + "loss": 0.9815, + "step": 340000 + }, + { + "epoch": 5.277859681248933, + "grad_norm": 2.2155096530914307, + "learning_rate": 4.47221558373035e-05, + "loss": 0.9828, + "step": 340100 + }, + { + "epoch": 5.279411536491876, + "grad_norm": 2.6348648071289062, + "learning_rate": 4.472060398206056e-05, + "loss": 0.9677, + "step": 340200 + }, + { + "epoch": 5.280963391734819, + "grad_norm": 2.451829671859741, + "learning_rate": 4.471905212681761e-05, + "loss": 0.9748, + "step": 340300 + }, + { + "epoch": 5.282515246977762, + "grad_norm": 2.5996105670928955, + "learning_rate": 4.471750027157467e-05, + "loss": 0.9853, + "step": 340400 + }, + { + "epoch": 5.284067102220705, + "grad_norm": 2.399988889694214, + "learning_rate": 4.4715948416331725e-05, + "loss": 0.9943, + "step": 340500 + }, + { + "epoch": 5.285618957463647, + "grad_norm": 2.194737195968628, + "learning_rate": 4.471439656108878e-05, + "loss": 0.9532, + "step": 340600 + }, + { + "epoch": 5.287170812706591, + "grad_norm": 2.5447442531585693, + "learning_rate": 4.471284470584584e-05, + "loss": 0.9994, + "step": 340700 + }, + { + "epoch": 5.288722667949534, + "grad_norm": 2.354578733444214, + "learning_rate": 4.47112928506029e-05, + "loss": 0.9734, + "step": 340800 + }, + { + "epoch": 5.290274523192476, + "grad_norm": 2.04677414894104, + "learning_rate": 4.4709740995359956e-05, + "loss": 0.9934, + "step": 340900 + }, + { + "epoch": 5.291826378435419, + "grad_norm": 1.9037381410598755, + "learning_rate": 4.470818914011701e-05, + "loss": 0.9914, + "step": 341000 + }, + { + "epoch": 5.293378233678363, + "grad_norm": 2.4629368782043457, + "learning_rate": 4.470663728487407e-05, + "loss": 0.9707, + "step": 341100 + }, + { + "epoch": 5.294930088921306, + "grad_norm": 2.1286580562591553, + "learning_rate": 4.470508542963113e-05, + "loss": 0.9607, + "step": 341200 + }, + { + "epoch": 5.296481944164248, + "grad_norm": 2.3212268352508545, + "learning_rate": 4.4703533574388187e-05, + "loss": 0.9821, + "step": 341300 + }, + { + "epoch": 5.298033799407191, + "grad_norm": 2.7215399742126465, + "learning_rate": 4.4701981719145244e-05, + "loss": 0.9894, + "step": 341400 + }, + { + "epoch": 5.2995856546501345, + "grad_norm": 2.2249298095703125, + "learning_rate": 4.47004298639023e-05, + "loss": 0.998, + "step": 341500 + }, + { + "epoch": 5.301137509893077, + "grad_norm": 2.135056257247925, + "learning_rate": 4.469887800865935e-05, + "loss": 0.9642, + "step": 341600 + }, + { + "epoch": 5.30268936513602, + "grad_norm": 2.1030986309051514, + "learning_rate": 4.469732615341641e-05, + "loss": 0.993, + "step": 341700 + }, + { + "epoch": 5.304241220378963, + "grad_norm": 2.1474967002868652, + "learning_rate": 4.469577429817347e-05, + "loss": 0.9791, + "step": 341800 + }, + { + "epoch": 5.305793075621906, + "grad_norm": 2.359597682952881, + "learning_rate": 4.4694222442930526e-05, + "loss": 0.9841, + "step": 341900 + }, + { + "epoch": 5.307344930864849, + "grad_norm": 2.608402729034424, + "learning_rate": 4.4692670587687584e-05, + "loss": 0.9792, + "step": 342000 + }, + { + "epoch": 5.308896786107792, + "grad_norm": 2.196988821029663, + "learning_rate": 4.469111873244464e-05, + "loss": 0.9683, + "step": 342100 + }, + { + "epoch": 5.310448641350735, + "grad_norm": 2.070812225341797, + "learning_rate": 4.468956687720169e-05, + "loss": 0.9758, + "step": 342200 + }, + { + "epoch": 5.312000496593678, + "grad_norm": 2.317857027053833, + "learning_rate": 4.468801502195875e-05, + "loss": 0.978, + "step": 342300 + }, + { + "epoch": 5.313552351836621, + "grad_norm": 2.209301710128784, + "learning_rate": 4.468646316671581e-05, + "loss": 0.9912, + "step": 342400 + }, + { + "epoch": 5.315104207079564, + "grad_norm": 2.183950424194336, + "learning_rate": 4.4684911311472866e-05, + "loss": 0.9834, + "step": 342500 + }, + { + "epoch": 5.316656062322506, + "grad_norm": 2.5670201778411865, + "learning_rate": 4.4683359456229924e-05, + "loss": 0.9767, + "step": 342600 + }, + { + "epoch": 5.3182079175654495, + "grad_norm": 2.189917802810669, + "learning_rate": 4.468180760098698e-05, + "loss": 0.9861, + "step": 342700 + }, + { + "epoch": 5.319759772808393, + "grad_norm": 1.9095392227172852, + "learning_rate": 4.468025574574404e-05, + "loss": 0.9823, + "step": 342800 + }, + { + "epoch": 5.321311628051335, + "grad_norm": 1.9903500080108643, + "learning_rate": 4.46787038905011e-05, + "loss": 0.9746, + "step": 342900 + }, + { + "epoch": 5.322863483294278, + "grad_norm": 2.349290370941162, + "learning_rate": 4.4677152035258155e-05, + "loss": 0.9814, + "step": 343000 + }, + { + "epoch": 5.3244153385372215, + "grad_norm": 2.532093048095703, + "learning_rate": 4.4675600180015206e-05, + "loss": 0.9785, + "step": 343100 + }, + { + "epoch": 5.325967193780164, + "grad_norm": 1.9769519567489624, + "learning_rate": 4.4674048324772264e-05, + "loss": 0.9691, + "step": 343200 + }, + { + "epoch": 5.327519049023107, + "grad_norm": 1.9752399921417236, + "learning_rate": 4.467249646952932e-05, + "loss": 0.9798, + "step": 343300 + }, + { + "epoch": 5.32907090426605, + "grad_norm": 2.3799397945404053, + "learning_rate": 4.467094461428638e-05, + "loss": 0.9785, + "step": 343400 + }, + { + "epoch": 5.330622759508993, + "grad_norm": 2.1321828365325928, + "learning_rate": 4.466939275904344e-05, + "loss": 0.9723, + "step": 343500 + }, + { + "epoch": 5.332174614751936, + "grad_norm": 2.1118037700653076, + "learning_rate": 4.4667840903800495e-05, + "loss": 0.9844, + "step": 343600 + }, + { + "epoch": 5.333726469994879, + "grad_norm": 2.0471959114074707, + "learning_rate": 4.466628904855755e-05, + "loss": 0.9906, + "step": 343700 + }, + { + "epoch": 5.335278325237822, + "grad_norm": 1.8844975233078003, + "learning_rate": 4.466473719331461e-05, + "loss": 0.9718, + "step": 343800 + }, + { + "epoch": 5.3368301804807645, + "grad_norm": 2.0539329051971436, + "learning_rate": 4.466318533807167e-05, + "loss": 0.9823, + "step": 343900 + }, + { + "epoch": 5.338382035723708, + "grad_norm": 2.500180721282959, + "learning_rate": 4.4661633482828726e-05, + "loss": 0.9889, + "step": 344000 + }, + { + "epoch": 5.339933890966651, + "grad_norm": 2.1466572284698486, + "learning_rate": 4.466008162758578e-05, + "loss": 0.9845, + "step": 344100 + }, + { + "epoch": 5.341485746209593, + "grad_norm": 2.273542881011963, + "learning_rate": 4.465852977234284e-05, + "loss": 0.9841, + "step": 344200 + }, + { + "epoch": 5.3430376014525365, + "grad_norm": 1.9769847393035889, + "learning_rate": 4.46569779170999e-05, + "loss": 0.9833, + "step": 344300 + }, + { + "epoch": 5.34458945669548, + "grad_norm": 2.615112543106079, + "learning_rate": 4.465542606185695e-05, + "loss": 0.9932, + "step": 344400 + }, + { + "epoch": 5.346141311938422, + "grad_norm": 2.338524341583252, + "learning_rate": 4.465387420661401e-05, + "loss": 0.9795, + "step": 344500 + }, + { + "epoch": 5.347693167181365, + "grad_norm": 1.9836337566375732, + "learning_rate": 4.4652322351371065e-05, + "loss": 0.9838, + "step": 344600 + }, + { + "epoch": 5.349245022424308, + "grad_norm": 2.2555506229400635, + "learning_rate": 4.465077049612812e-05, + "loss": 0.9708, + "step": 344700 + }, + { + "epoch": 5.350796877667252, + "grad_norm": 2.740187168121338, + "learning_rate": 4.464921864088518e-05, + "loss": 0.9723, + "step": 344800 + }, + { + "epoch": 5.352348732910194, + "grad_norm": 1.9778735637664795, + "learning_rate": 4.464766678564224e-05, + "loss": 0.9842, + "step": 344900 + }, + { + "epoch": 5.353900588153137, + "grad_norm": 1.9008959531784058, + "learning_rate": 4.4646114930399296e-05, + "loss": 0.9826, + "step": 345000 + }, + { + "epoch": 5.35545244339608, + "grad_norm": 2.404672861099243, + "learning_rate": 4.4644563075156354e-05, + "loss": 0.9902, + "step": 345100 + }, + { + "epoch": 5.357004298639023, + "grad_norm": 2.307003974914551, + "learning_rate": 4.464301121991341e-05, + "loss": 0.9809, + "step": 345200 + }, + { + "epoch": 5.358556153881966, + "grad_norm": 1.891075611114502, + "learning_rate": 4.464145936467047e-05, + "loss": 0.9686, + "step": 345300 + }, + { + "epoch": 5.360108009124909, + "grad_norm": 2.324019432067871, + "learning_rate": 4.463990750942753e-05, + "loss": 0.9853, + "step": 345400 + }, + { + "epoch": 5.3616598643678515, + "grad_norm": 2.3114638328552246, + "learning_rate": 4.463835565418458e-05, + "loss": 0.9648, + "step": 345500 + }, + { + "epoch": 5.363211719610795, + "grad_norm": 1.9445880651474, + "learning_rate": 4.4636803798941636e-05, + "loss": 0.9707, + "step": 345600 + }, + { + "epoch": 5.364763574853738, + "grad_norm": 2.129798173904419, + "learning_rate": 4.4635251943698694e-05, + "loss": 0.9935, + "step": 345700 + }, + { + "epoch": 5.36631543009668, + "grad_norm": 1.9039580821990967, + "learning_rate": 4.463370008845575e-05, + "loss": 0.9712, + "step": 345800 + }, + { + "epoch": 5.367867285339623, + "grad_norm": 2.0943281650543213, + "learning_rate": 4.463214823321281e-05, + "loss": 0.9799, + "step": 345900 + }, + { + "epoch": 5.369419140582567, + "grad_norm": 2.4390809535980225, + "learning_rate": 4.463059637796986e-05, + "loss": 0.9815, + "step": 346000 + }, + { + "epoch": 5.370970995825509, + "grad_norm": 2.434919834136963, + "learning_rate": 4.462904452272692e-05, + "loss": 0.9927, + "step": 346100 + }, + { + "epoch": 5.372522851068452, + "grad_norm": 2.5707101821899414, + "learning_rate": 4.4627492667483976e-05, + "loss": 0.9515, + "step": 346200 + }, + { + "epoch": 5.374074706311395, + "grad_norm": 2.235417604446411, + "learning_rate": 4.4625940812241034e-05, + "loss": 0.9654, + "step": 346300 + }, + { + "epoch": 5.375626561554339, + "grad_norm": 2.514286518096924, + "learning_rate": 4.462438895699809e-05, + "loss": 0.9883, + "step": 346400 + }, + { + "epoch": 5.377178416797281, + "grad_norm": 2.027215003967285, + "learning_rate": 4.462283710175515e-05, + "loss": 0.9851, + "step": 346500 + }, + { + "epoch": 5.378730272040224, + "grad_norm": 2.1994903087615967, + "learning_rate": 4.462128524651221e-05, + "loss": 0.9754, + "step": 346600 + }, + { + "epoch": 5.380282127283167, + "grad_norm": 2.5119285583496094, + "learning_rate": 4.4619733391269265e-05, + "loss": 0.9877, + "step": 346700 + }, + { + "epoch": 5.38183398252611, + "grad_norm": 2.0300240516662598, + "learning_rate": 4.461818153602632e-05, + "loss": 0.9654, + "step": 346800 + }, + { + "epoch": 5.383385837769053, + "grad_norm": 2.0062859058380127, + "learning_rate": 4.461662968078338e-05, + "loss": 0.9852, + "step": 346900 + }, + { + "epoch": 5.384937693011996, + "grad_norm": 2.1682381629943848, + "learning_rate": 4.461507782554044e-05, + "loss": 0.9736, + "step": 347000 + }, + { + "epoch": 5.386489548254938, + "grad_norm": 2.5994319915771484, + "learning_rate": 4.4613525970297496e-05, + "loss": 0.9849, + "step": 347100 + }, + { + "epoch": 5.388041403497882, + "grad_norm": 2.322558641433716, + "learning_rate": 4.461197411505455e-05, + "loss": 0.9813, + "step": 347200 + }, + { + "epoch": 5.389593258740825, + "grad_norm": 1.945400595664978, + "learning_rate": 4.4610422259811604e-05, + "loss": 0.9582, + "step": 347300 + }, + { + "epoch": 5.391145113983768, + "grad_norm": 2.2319095134735107, + "learning_rate": 4.460887040456866e-05, + "loss": 0.9601, + "step": 347400 + }, + { + "epoch": 5.39269696922671, + "grad_norm": 1.8645366430282593, + "learning_rate": 4.460731854932572e-05, + "loss": 0.9896, + "step": 347500 + }, + { + "epoch": 5.394248824469654, + "grad_norm": 2.1026902198791504, + "learning_rate": 4.460576669408278e-05, + "loss": 0.9909, + "step": 347600 + }, + { + "epoch": 5.395800679712597, + "grad_norm": 1.818767786026001, + "learning_rate": 4.4604214838839835e-05, + "loss": 0.9796, + "step": 347700 + }, + { + "epoch": 5.397352534955539, + "grad_norm": 2.3829264640808105, + "learning_rate": 4.460266298359689e-05, + "loss": 0.9986, + "step": 347800 + }, + { + "epoch": 5.398904390198482, + "grad_norm": 2.27321720123291, + "learning_rate": 4.460111112835395e-05, + "loss": 0.982, + "step": 347900 + }, + { + "epoch": 5.4004562454414256, + "grad_norm": 2.143763303756714, + "learning_rate": 4.459955927311101e-05, + "loss": 0.9623, + "step": 348000 + }, + { + "epoch": 5.402008100684368, + "grad_norm": 2.14553165435791, + "learning_rate": 4.4598007417868066e-05, + "loss": 0.958, + "step": 348100 + }, + { + "epoch": 5.403559955927311, + "grad_norm": 1.955831527709961, + "learning_rate": 4.4596455562625124e-05, + "loss": 0.9947, + "step": 348200 + }, + { + "epoch": 5.405111811170254, + "grad_norm": 2.215580940246582, + "learning_rate": 4.459490370738218e-05, + "loss": 0.9639, + "step": 348300 + }, + { + "epoch": 5.406663666413197, + "grad_norm": 2.009143590927124, + "learning_rate": 4.459335185213924e-05, + "loss": 0.953, + "step": 348400 + }, + { + "epoch": 5.40821552165614, + "grad_norm": 2.26324462890625, + "learning_rate": 4.45917999968963e-05, + "loss": 0.9666, + "step": 348500 + }, + { + "epoch": 5.409767376899083, + "grad_norm": 1.974342703819275, + "learning_rate": 4.459024814165335e-05, + "loss": 0.9733, + "step": 348600 + }, + { + "epoch": 5.411319232142025, + "grad_norm": 2.0890650749206543, + "learning_rate": 4.4588696286410406e-05, + "loss": 0.9782, + "step": 348700 + }, + { + "epoch": 5.412871087384969, + "grad_norm": 1.9821761846542358, + "learning_rate": 4.458714443116746e-05, + "loss": 0.9828, + "step": 348800 + }, + { + "epoch": 5.414422942627912, + "grad_norm": 1.9559098482131958, + "learning_rate": 4.4585592575924515e-05, + "loss": 0.9915, + "step": 348900 + }, + { + "epoch": 5.415974797870855, + "grad_norm": 2.713721990585327, + "learning_rate": 4.458404072068157e-05, + "loss": 0.9794, + "step": 349000 + }, + { + "epoch": 5.417526653113797, + "grad_norm": 2.756824254989624, + "learning_rate": 4.458248886543863e-05, + "loss": 0.9858, + "step": 349100 + }, + { + "epoch": 5.4190785083567405, + "grad_norm": 1.7667917013168335, + "learning_rate": 4.458093701019569e-05, + "loss": 0.9872, + "step": 349200 + }, + { + "epoch": 5.420630363599684, + "grad_norm": 2.0684385299682617, + "learning_rate": 4.4579385154952746e-05, + "loss": 1.0042, + "step": 349300 + }, + { + "epoch": 5.422182218842626, + "grad_norm": 2.0621657371520996, + "learning_rate": 4.4577833299709804e-05, + "loss": 0.9711, + "step": 349400 + }, + { + "epoch": 5.423734074085569, + "grad_norm": 2.46278715133667, + "learning_rate": 4.457628144446686e-05, + "loss": 0.9818, + "step": 349500 + }, + { + "epoch": 5.4252859293285125, + "grad_norm": 1.9518847465515137, + "learning_rate": 4.457472958922392e-05, + "loss": 0.9832, + "step": 349600 + }, + { + "epoch": 5.426837784571455, + "grad_norm": 2.389207601547241, + "learning_rate": 4.457317773398098e-05, + "loss": 0.9957, + "step": 349700 + }, + { + "epoch": 5.428389639814398, + "grad_norm": 2.028789520263672, + "learning_rate": 4.4571625878738035e-05, + "loss": 0.979, + "step": 349800 + }, + { + "epoch": 5.429941495057341, + "grad_norm": 1.9984185695648193, + "learning_rate": 4.457007402349509e-05, + "loss": 0.9547, + "step": 349900 + }, + { + "epoch": 5.431493350300284, + "grad_norm": 2.103748083114624, + "learning_rate": 4.456852216825215e-05, + "loss": 0.9807, + "step": 350000 + }, + { + "epoch": 5.433045205543227, + "grad_norm": 2.00935959815979, + "learning_rate": 4.45669703130092e-05, + "loss": 0.9786, + "step": 350100 + }, + { + "epoch": 5.43459706078617, + "grad_norm": 2.1263985633850098, + "learning_rate": 4.456541845776626e-05, + "loss": 0.9679, + "step": 350200 + }, + { + "epoch": 5.436148916029113, + "grad_norm": 2.2835018634796143, + "learning_rate": 4.4563866602523317e-05, + "loss": 0.9793, + "step": 350300 + }, + { + "epoch": 5.4377007712720555, + "grad_norm": 2.894418478012085, + "learning_rate": 4.4562314747280374e-05, + "loss": 0.9889, + "step": 350400 + }, + { + "epoch": 5.439252626514999, + "grad_norm": 2.039304733276367, + "learning_rate": 4.456076289203743e-05, + "loss": 0.9978, + "step": 350500 + }, + { + "epoch": 5.440804481757942, + "grad_norm": 2.1859359741210938, + "learning_rate": 4.455921103679449e-05, + "loss": 0.9801, + "step": 350600 + }, + { + "epoch": 5.442356337000884, + "grad_norm": 1.9913524389266968, + "learning_rate": 4.455765918155155e-05, + "loss": 0.9743, + "step": 350700 + }, + { + "epoch": 5.4439081922438275, + "grad_norm": 1.6992727518081665, + "learning_rate": 4.4556107326308605e-05, + "loss": 1.0108, + "step": 350800 + }, + { + "epoch": 5.445460047486771, + "grad_norm": 2.630546808242798, + "learning_rate": 4.455455547106566e-05, + "loss": 0.9704, + "step": 350900 + }, + { + "epoch": 5.447011902729713, + "grad_norm": 2.0170834064483643, + "learning_rate": 4.455300361582272e-05, + "loss": 1.0016, + "step": 351000 + }, + { + "epoch": 5.448563757972656, + "grad_norm": 2.492382049560547, + "learning_rate": 4.455145176057978e-05, + "loss": 0.984, + "step": 351100 + }, + { + "epoch": 5.4501156132155995, + "grad_norm": 2.2098348140716553, + "learning_rate": 4.4549899905336836e-05, + "loss": 0.9668, + "step": 351200 + }, + { + "epoch": 5.451667468458542, + "grad_norm": 2.287720203399658, + "learning_rate": 4.4548348050093894e-05, + "loss": 0.9897, + "step": 351300 + }, + { + "epoch": 5.453219323701485, + "grad_norm": 2.221672534942627, + "learning_rate": 4.4546796194850945e-05, + "loss": 0.9638, + "step": 351400 + }, + { + "epoch": 5.454771178944428, + "grad_norm": 2.2400195598602295, + "learning_rate": 4.4545244339608e-05, + "loss": 0.9848, + "step": 351500 + }, + { + "epoch": 5.456323034187371, + "grad_norm": 2.104464530944824, + "learning_rate": 4.454369248436506e-05, + "loss": 0.9622, + "step": 351600 + }, + { + "epoch": 5.457874889430314, + "grad_norm": 2.383382558822632, + "learning_rate": 4.454214062912212e-05, + "loss": 0.9837, + "step": 351700 + }, + { + "epoch": 5.459426744673257, + "grad_norm": 2.198345422744751, + "learning_rate": 4.4540588773879176e-05, + "loss": 0.9813, + "step": 351800 + }, + { + "epoch": 5.4609785999162, + "grad_norm": 2.3290531635284424, + "learning_rate": 4.4539036918636234e-05, + "loss": 0.9767, + "step": 351900 + }, + { + "epoch": 5.4625304551591425, + "grad_norm": 2.2324042320251465, + "learning_rate": 4.4537485063393285e-05, + "loss": 0.9879, + "step": 352000 + }, + { + "epoch": 5.464082310402086, + "grad_norm": 1.9537746906280518, + "learning_rate": 4.453593320815034e-05, + "loss": 0.9793, + "step": 352100 + }, + { + "epoch": 5.465634165645029, + "grad_norm": 2.270908832550049, + "learning_rate": 4.45343813529074e-05, + "loss": 0.9824, + "step": 352200 + }, + { + "epoch": 5.467186020887971, + "grad_norm": 2.5710361003875732, + "learning_rate": 4.453282949766446e-05, + "loss": 0.9831, + "step": 352300 + }, + { + "epoch": 5.468737876130914, + "grad_norm": 2.1779885292053223, + "learning_rate": 4.4531277642421516e-05, + "loss": 0.9743, + "step": 352400 + }, + { + "epoch": 5.470289731373858, + "grad_norm": 2.0371127128601074, + "learning_rate": 4.4529725787178574e-05, + "loss": 0.9924, + "step": 352500 + }, + { + "epoch": 5.4718415866168, + "grad_norm": 1.9643186330795288, + "learning_rate": 4.452817393193563e-05, + "loss": 0.9823, + "step": 352600 + }, + { + "epoch": 5.473393441859743, + "grad_norm": 2.0076684951782227, + "learning_rate": 4.452662207669269e-05, + "loss": 0.9836, + "step": 352700 + }, + { + "epoch": 5.474945297102686, + "grad_norm": 2.449479579925537, + "learning_rate": 4.452507022144975e-05, + "loss": 0.9852, + "step": 352800 + }, + { + "epoch": 5.47649715234563, + "grad_norm": 2.2750444412231445, + "learning_rate": 4.45235183662068e-05, + "loss": 0.9882, + "step": 352900 + }, + { + "epoch": 5.478049007588572, + "grad_norm": 2.4435880184173584, + "learning_rate": 4.4521966510963856e-05, + "loss": 0.9808, + "step": 353000 + }, + { + "epoch": 5.479600862831515, + "grad_norm": 2.2818408012390137, + "learning_rate": 4.452041465572091e-05, + "loss": 0.991, + "step": 353100 + }, + { + "epoch": 5.481152718074458, + "grad_norm": 2.2061777114868164, + "learning_rate": 4.451886280047797e-05, + "loss": 0.9875, + "step": 353200 + }, + { + "epoch": 5.482704573317401, + "grad_norm": 2.0868592262268066, + "learning_rate": 4.451731094523503e-05, + "loss": 0.9636, + "step": 353300 + }, + { + "epoch": 5.484256428560344, + "grad_norm": 2.4877538681030273, + "learning_rate": 4.4515759089992087e-05, + "loss": 0.9684, + "step": 353400 + }, + { + "epoch": 5.485808283803287, + "grad_norm": 1.973732829093933, + "learning_rate": 4.4514207234749144e-05, + "loss": 0.9731, + "step": 353500 + }, + { + "epoch": 5.487360139046229, + "grad_norm": 2.477951765060425, + "learning_rate": 4.45126553795062e-05, + "loss": 0.9804, + "step": 353600 + }, + { + "epoch": 5.488911994289173, + "grad_norm": 2.750450372695923, + "learning_rate": 4.451110352426326e-05, + "loss": 0.9763, + "step": 353700 + }, + { + "epoch": 5.490463849532116, + "grad_norm": 2.3875765800476074, + "learning_rate": 4.450955166902032e-05, + "loss": 0.9838, + "step": 353800 + }, + { + "epoch": 5.492015704775058, + "grad_norm": 2.0958762168884277, + "learning_rate": 4.4507999813777375e-05, + "loss": 0.9634, + "step": 353900 + }, + { + "epoch": 5.493567560018001, + "grad_norm": 2.1959280967712402, + "learning_rate": 4.450644795853443e-05, + "loss": 0.9785, + "step": 354000 + }, + { + "epoch": 5.495119415260945, + "grad_norm": 1.7072805166244507, + "learning_rate": 4.450489610329149e-05, + "loss": 0.9595, + "step": 354100 + }, + { + "epoch": 5.496671270503887, + "grad_norm": 2.3442811965942383, + "learning_rate": 4.450334424804855e-05, + "loss": 0.9803, + "step": 354200 + }, + { + "epoch": 5.49822312574683, + "grad_norm": 1.561077356338501, + "learning_rate": 4.45017923928056e-05, + "loss": 0.9713, + "step": 354300 + }, + { + "epoch": 5.499774980989773, + "grad_norm": 2.1972949504852295, + "learning_rate": 4.450024053756266e-05, + "loss": 0.9754, + "step": 354400 + }, + { + "epoch": 5.501326836232717, + "grad_norm": 1.9204434156417847, + "learning_rate": 4.4498688682319715e-05, + "loss": 0.9673, + "step": 354500 + }, + { + "epoch": 5.502878691475659, + "grad_norm": 2.437697410583496, + "learning_rate": 4.449713682707677e-05, + "loss": 0.9978, + "step": 354600 + }, + { + "epoch": 5.504430546718602, + "grad_norm": 2.310438632965088, + "learning_rate": 4.449558497183383e-05, + "loss": 0.9803, + "step": 354700 + }, + { + "epoch": 5.505982401961545, + "grad_norm": 1.9296797513961792, + "learning_rate": 4.449403311659089e-05, + "loss": 0.9878, + "step": 354800 + }, + { + "epoch": 5.507534257204488, + "grad_norm": 2.261770248413086, + "learning_rate": 4.4492481261347946e-05, + "loss": 0.986, + "step": 354900 + }, + { + "epoch": 5.509086112447431, + "grad_norm": 1.9454330205917358, + "learning_rate": 4.4490929406105004e-05, + "loss": 0.9931, + "step": 355000 + }, + { + "epoch": 5.510637967690374, + "grad_norm": 2.4823288917541504, + "learning_rate": 4.448937755086206e-05, + "loss": 0.9976, + "step": 355100 + }, + { + "epoch": 5.512189822933317, + "grad_norm": 2.0919179916381836, + "learning_rate": 4.448782569561912e-05, + "loss": 0.9805, + "step": 355200 + }, + { + "epoch": 5.51374167817626, + "grad_norm": 2.2404212951660156, + "learning_rate": 4.448627384037617e-05, + "loss": 0.9941, + "step": 355300 + }, + { + "epoch": 5.515293533419203, + "grad_norm": 2.0819284915924072, + "learning_rate": 4.448472198513323e-05, + "loss": 0.9837, + "step": 355400 + }, + { + "epoch": 5.516845388662146, + "grad_norm": 2.157360076904297, + "learning_rate": 4.4483170129890286e-05, + "loss": 0.9622, + "step": 355500 + }, + { + "epoch": 5.518397243905088, + "grad_norm": 1.9269564151763916, + "learning_rate": 4.4481618274647344e-05, + "loss": 0.9624, + "step": 355600 + }, + { + "epoch": 5.519949099148032, + "grad_norm": 2.549567461013794, + "learning_rate": 4.44800664194044e-05, + "loss": 0.989, + "step": 355700 + }, + { + "epoch": 5.521500954390975, + "grad_norm": 2.4770264625549316, + "learning_rate": 4.447851456416145e-05, + "loss": 0.9737, + "step": 355800 + }, + { + "epoch": 5.523052809633917, + "grad_norm": 1.941084623336792, + "learning_rate": 4.447696270891851e-05, + "loss": 0.9558, + "step": 355900 + }, + { + "epoch": 5.52460466487686, + "grad_norm": 2.5409600734710693, + "learning_rate": 4.447541085367557e-05, + "loss": 0.9725, + "step": 356000 + }, + { + "epoch": 5.5261565201198035, + "grad_norm": 2.013537883758545, + "learning_rate": 4.4473858998432626e-05, + "loss": 0.9894, + "step": 356100 + }, + { + "epoch": 5.527708375362746, + "grad_norm": 2.299778699874878, + "learning_rate": 4.447230714318968e-05, + "loss": 0.9958, + "step": 356200 + }, + { + "epoch": 5.529260230605689, + "grad_norm": 2.6637446880340576, + "learning_rate": 4.447075528794674e-05, + "loss": 0.9867, + "step": 356300 + }, + { + "epoch": 5.530812085848632, + "grad_norm": 2.1968934535980225, + "learning_rate": 4.44692034327038e-05, + "loss": 0.9809, + "step": 356400 + }, + { + "epoch": 5.532363941091575, + "grad_norm": 2.3023293018341064, + "learning_rate": 4.4467651577460857e-05, + "loss": 1.0052, + "step": 356500 + }, + { + "epoch": 5.533915796334518, + "grad_norm": 1.9183952808380127, + "learning_rate": 4.4466099722217914e-05, + "loss": 0.9762, + "step": 356600 + }, + { + "epoch": 5.535467651577461, + "grad_norm": 1.8733736276626587, + "learning_rate": 4.446454786697497e-05, + "loss": 0.9859, + "step": 356700 + }, + { + "epoch": 5.537019506820403, + "grad_norm": 2.0433640480041504, + "learning_rate": 4.446299601173203e-05, + "loss": 0.9673, + "step": 356800 + }, + { + "epoch": 5.5385713620633465, + "grad_norm": 2.3340017795562744, + "learning_rate": 4.446144415648909e-05, + "loss": 0.9684, + "step": 356900 + }, + { + "epoch": 5.54012321730629, + "grad_norm": 2.0930838584899902, + "learning_rate": 4.4459892301246145e-05, + "loss": 0.9661, + "step": 357000 + }, + { + "epoch": 5.541675072549233, + "grad_norm": 2.2251813411712646, + "learning_rate": 4.4458340446003196e-05, + "loss": 0.9914, + "step": 357100 + }, + { + "epoch": 5.543226927792175, + "grad_norm": 2.4446849822998047, + "learning_rate": 4.4456788590760254e-05, + "loss": 0.9913, + "step": 357200 + }, + { + "epoch": 5.5447787830351185, + "grad_norm": 2.2096974849700928, + "learning_rate": 4.445523673551731e-05, + "loss": 0.9937, + "step": 357300 + }, + { + "epoch": 5.546330638278062, + "grad_norm": 2.4166781902313232, + "learning_rate": 4.445368488027437e-05, + "loss": 0.974, + "step": 357400 + }, + { + "epoch": 5.547882493521004, + "grad_norm": 2.3222157955169678, + "learning_rate": 4.445213302503143e-05, + "loss": 0.9783, + "step": 357500 + }, + { + "epoch": 5.549434348763947, + "grad_norm": 2.3722078800201416, + "learning_rate": 4.4450581169788485e-05, + "loss": 0.9772, + "step": 357600 + }, + { + "epoch": 5.5509862040068905, + "grad_norm": 2.143782615661621, + "learning_rate": 4.444902931454554e-05, + "loss": 0.9675, + "step": 357700 + }, + { + "epoch": 5.552538059249833, + "grad_norm": 2.000396490097046, + "learning_rate": 4.44474774593026e-05, + "loss": 0.9852, + "step": 357800 + }, + { + "epoch": 5.554089914492776, + "grad_norm": 2.1700656414031982, + "learning_rate": 4.444592560405966e-05, + "loss": 0.9755, + "step": 357900 + }, + { + "epoch": 5.555641769735719, + "grad_norm": 2.1937971115112305, + "learning_rate": 4.4444373748816716e-05, + "loss": 0.971, + "step": 358000 + }, + { + "epoch": 5.557193624978662, + "grad_norm": 2.1302905082702637, + "learning_rate": 4.4442821893573774e-05, + "loss": 0.9704, + "step": 358100 + }, + { + "epoch": 5.558745480221605, + "grad_norm": 2.1540932655334473, + "learning_rate": 4.444127003833083e-05, + "loss": 0.9635, + "step": 358200 + }, + { + "epoch": 5.560297335464548, + "grad_norm": 2.2759764194488525, + "learning_rate": 4.443971818308789e-05, + "loss": 0.9748, + "step": 358300 + }, + { + "epoch": 5.561849190707491, + "grad_norm": 2.4250035285949707, + "learning_rate": 4.443816632784494e-05, + "loss": 0.9848, + "step": 358400 + }, + { + "epoch": 5.5634010459504335, + "grad_norm": 2.174387216567993, + "learning_rate": 4.4436614472602e-05, + "loss": 0.9906, + "step": 358500 + }, + { + "epoch": 5.564952901193377, + "grad_norm": 2.1169731616973877, + "learning_rate": 4.443506261735905e-05, + "loss": 0.9907, + "step": 358600 + }, + { + "epoch": 5.56650475643632, + "grad_norm": 2.1221742630004883, + "learning_rate": 4.443351076211611e-05, + "loss": 0.9863, + "step": 358700 + }, + { + "epoch": 5.568056611679262, + "grad_norm": 2.291260242462158, + "learning_rate": 4.4431958906873165e-05, + "loss": 0.962, + "step": 358800 + }, + { + "epoch": 5.5696084669222055, + "grad_norm": 2.050210952758789, + "learning_rate": 4.443040705163022e-05, + "loss": 0.9817, + "step": 358900 + }, + { + "epoch": 5.571160322165149, + "grad_norm": 2.361135959625244, + "learning_rate": 4.442885519638728e-05, + "loss": 0.976, + "step": 359000 + }, + { + "epoch": 5.572712177408091, + "grad_norm": 2.499438524246216, + "learning_rate": 4.442730334114434e-05, + "loss": 0.9792, + "step": 359100 + }, + { + "epoch": 5.574264032651034, + "grad_norm": 2.4830102920532227, + "learning_rate": 4.4425751485901396e-05, + "loss": 0.9988, + "step": 359200 + }, + { + "epoch": 5.575815887893977, + "grad_norm": 2.354757785797119, + "learning_rate": 4.442419963065845e-05, + "loss": 0.9965, + "step": 359300 + }, + { + "epoch": 5.57736774313692, + "grad_norm": 1.9289546012878418, + "learning_rate": 4.442264777541551e-05, + "loss": 0.9778, + "step": 359400 + }, + { + "epoch": 5.578919598379863, + "grad_norm": 2.037522554397583, + "learning_rate": 4.442109592017257e-05, + "loss": 0.9672, + "step": 359500 + }, + { + "epoch": 5.580471453622806, + "grad_norm": 2.0955095291137695, + "learning_rate": 4.4419544064929627e-05, + "loss": 0.9784, + "step": 359600 + }, + { + "epoch": 5.582023308865749, + "grad_norm": 1.8834164142608643, + "learning_rate": 4.4417992209686684e-05, + "loss": 0.9851, + "step": 359700 + }, + { + "epoch": 5.583575164108692, + "grad_norm": 2.162356376647949, + "learning_rate": 4.441644035444374e-05, + "loss": 0.9835, + "step": 359800 + }, + { + "epoch": 5.585127019351635, + "grad_norm": 2.5465216636657715, + "learning_rate": 4.441488849920079e-05, + "loss": 0.9587, + "step": 359900 + }, + { + "epoch": 5.586678874594578, + "grad_norm": 1.9492729902267456, + "learning_rate": 4.441333664395785e-05, + "loss": 0.9641, + "step": 360000 + }, + { + "epoch": 5.5882307298375204, + "grad_norm": 2.319061756134033, + "learning_rate": 4.441178478871491e-05, + "loss": 1.0023, + "step": 360100 + }, + { + "epoch": 5.589782585080464, + "grad_norm": 2.4109034538269043, + "learning_rate": 4.4410232933471966e-05, + "loss": 0.9946, + "step": 360200 + }, + { + "epoch": 5.591334440323407, + "grad_norm": 2.2688701152801514, + "learning_rate": 4.4408681078229024e-05, + "loss": 1.013, + "step": 360300 + }, + { + "epoch": 5.592886295566349, + "grad_norm": 2.359365224838257, + "learning_rate": 4.440712922298608e-05, + "loss": 0.9804, + "step": 360400 + }, + { + "epoch": 5.594438150809292, + "grad_norm": 2.4496281147003174, + "learning_rate": 4.440557736774314e-05, + "loss": 0.9953, + "step": 360500 + }, + { + "epoch": 5.595990006052236, + "grad_norm": 2.2714340686798096, + "learning_rate": 4.44040255125002e-05, + "loss": 0.9713, + "step": 360600 + }, + { + "epoch": 5.597541861295179, + "grad_norm": 1.8760069608688354, + "learning_rate": 4.4402473657257255e-05, + "loss": 0.9873, + "step": 360700 + }, + { + "epoch": 5.599093716538121, + "grad_norm": 2.1799817085266113, + "learning_rate": 4.440092180201431e-05, + "loss": 0.9997, + "step": 360800 + }, + { + "epoch": 5.600645571781064, + "grad_norm": 1.8943510055541992, + "learning_rate": 4.439936994677137e-05, + "loss": 0.9841, + "step": 360900 + }, + { + "epoch": 5.602197427024008, + "grad_norm": 2.2320775985717773, + "learning_rate": 4.439781809152843e-05, + "loss": 0.9752, + "step": 361000 + }, + { + "epoch": 5.60374928226695, + "grad_norm": 2.5199897289276123, + "learning_rate": 4.4396266236285486e-05, + "loss": 0.988, + "step": 361100 + }, + { + "epoch": 5.605301137509893, + "grad_norm": 2.2784907817840576, + "learning_rate": 4.439471438104254e-05, + "loss": 0.9715, + "step": 361200 + }, + { + "epoch": 5.606852992752836, + "grad_norm": 2.2994918823242188, + "learning_rate": 4.4393162525799595e-05, + "loss": 0.9844, + "step": 361300 + }, + { + "epoch": 5.608404847995779, + "grad_norm": 2.062472105026245, + "learning_rate": 4.439161067055665e-05, + "loss": 0.9573, + "step": 361400 + }, + { + "epoch": 5.609956703238722, + "grad_norm": 2.3974623680114746, + "learning_rate": 4.439005881531371e-05, + "loss": 0.9629, + "step": 361500 + }, + { + "epoch": 5.611508558481665, + "grad_norm": 1.936026692390442, + "learning_rate": 4.438850696007077e-05, + "loss": 0.9653, + "step": 361600 + }, + { + "epoch": 5.613060413724607, + "grad_norm": 2.03934907913208, + "learning_rate": 4.4386955104827826e-05, + "loss": 0.9722, + "step": 361700 + }, + { + "epoch": 5.614612268967551, + "grad_norm": 1.9631593227386475, + "learning_rate": 4.438540324958488e-05, + "loss": 0.9788, + "step": 361800 + }, + { + "epoch": 5.616164124210494, + "grad_norm": 2.4654765129089355, + "learning_rate": 4.4383851394341935e-05, + "loss": 0.9649, + "step": 361900 + }, + { + "epoch": 5.617715979453436, + "grad_norm": 1.980949878692627, + "learning_rate": 4.438229953909899e-05, + "loss": 0.9955, + "step": 362000 + }, + { + "epoch": 5.619267834696379, + "grad_norm": 2.2959792613983154, + "learning_rate": 4.438074768385605e-05, + "loss": 0.9915, + "step": 362100 + }, + { + "epoch": 5.620819689939323, + "grad_norm": 1.780027151107788, + "learning_rate": 4.437919582861311e-05, + "loss": 0.9695, + "step": 362200 + }, + { + "epoch": 5.622371545182266, + "grad_norm": 1.9942281246185303, + "learning_rate": 4.4377643973370166e-05, + "loss": 1.0067, + "step": 362300 + }, + { + "epoch": 5.623923400425208, + "grad_norm": 2.60976243019104, + "learning_rate": 4.437609211812722e-05, + "loss": 0.9921, + "step": 362400 + }, + { + "epoch": 5.625475255668151, + "grad_norm": 2.156275987625122, + "learning_rate": 4.437454026288428e-05, + "loss": 0.9991, + "step": 362500 + }, + { + "epoch": 5.6270271109110945, + "grad_norm": 2.118537664413452, + "learning_rate": 4.437298840764134e-05, + "loss": 0.9871, + "step": 362600 + }, + { + "epoch": 5.628578966154037, + "grad_norm": 2.250690221786499, + "learning_rate": 4.4371436552398397e-05, + "loss": 0.9847, + "step": 362700 + }, + { + "epoch": 5.63013082139698, + "grad_norm": 1.9881632328033447, + "learning_rate": 4.436988469715545e-05, + "loss": 0.9814, + "step": 362800 + }, + { + "epoch": 5.631682676639923, + "grad_norm": 2.4273569583892822, + "learning_rate": 4.4368332841912505e-05, + "loss": 0.9885, + "step": 362900 + }, + { + "epoch": 5.633234531882866, + "grad_norm": 2.7762176990509033, + "learning_rate": 4.436678098666956e-05, + "loss": 0.9843, + "step": 363000 + }, + { + "epoch": 5.634786387125809, + "grad_norm": 2.532621145248413, + "learning_rate": 4.436522913142662e-05, + "loss": 0.9717, + "step": 363100 + }, + { + "epoch": 5.636338242368752, + "grad_norm": 2.637486696243286, + "learning_rate": 4.436367727618368e-05, + "loss": 0.986, + "step": 363200 + }, + { + "epoch": 5.637890097611695, + "grad_norm": 2.3964123725891113, + "learning_rate": 4.4362125420940736e-05, + "loss": 0.9781, + "step": 363300 + }, + { + "epoch": 5.639441952854638, + "grad_norm": 2.11099910736084, + "learning_rate": 4.4360573565697794e-05, + "loss": 0.9705, + "step": 363400 + }, + { + "epoch": 5.640993808097581, + "grad_norm": 2.423428773880005, + "learning_rate": 4.435902171045485e-05, + "loss": 0.9805, + "step": 363500 + }, + { + "epoch": 5.642545663340524, + "grad_norm": 2.279287338256836, + "learning_rate": 4.435746985521191e-05, + "loss": 0.9682, + "step": 363600 + }, + { + "epoch": 5.644097518583466, + "grad_norm": 2.2667236328125, + "learning_rate": 4.435591799996897e-05, + "loss": 1.0055, + "step": 363700 + }, + { + "epoch": 5.6456493738264095, + "grad_norm": 2.0388057231903076, + "learning_rate": 4.4354366144726025e-05, + "loss": 0.9951, + "step": 363800 + }, + { + "epoch": 5.647201229069353, + "grad_norm": 2.438066005706787, + "learning_rate": 4.435281428948308e-05, + "loss": 0.9715, + "step": 363900 + }, + { + "epoch": 5.648753084312295, + "grad_norm": 2.158094882965088, + "learning_rate": 4.435126243424014e-05, + "loss": 0.9785, + "step": 364000 + }, + { + "epoch": 5.650304939555238, + "grad_norm": 2.2734158039093018, + "learning_rate": 4.434971057899719e-05, + "loss": 0.9798, + "step": 364100 + }, + { + "epoch": 5.6518567947981815, + "grad_norm": 2.5884833335876465, + "learning_rate": 4.434815872375425e-05, + "loss": 0.968, + "step": 364200 + }, + { + "epoch": 5.653408650041124, + "grad_norm": 2.0443115234375, + "learning_rate": 4.434660686851131e-05, + "loss": 0.9735, + "step": 364300 + }, + { + "epoch": 5.654960505284067, + "grad_norm": 2.1035728454589844, + "learning_rate": 4.4345055013268365e-05, + "loss": 0.959, + "step": 364400 + }, + { + "epoch": 5.65651236052701, + "grad_norm": 2.3837780952453613, + "learning_rate": 4.434350315802542e-05, + "loss": 0.9839, + "step": 364500 + }, + { + "epoch": 5.6580642157699526, + "grad_norm": 2.4859204292297363, + "learning_rate": 4.434195130278248e-05, + "loss": 0.9855, + "step": 364600 + }, + { + "epoch": 5.659616071012896, + "grad_norm": 2.1114308834075928, + "learning_rate": 4.434039944753954e-05, + "loss": 0.9803, + "step": 364700 + }, + { + "epoch": 5.661167926255839, + "grad_norm": 2.7344343662261963, + "learning_rate": 4.4338847592296596e-05, + "loss": 0.9818, + "step": 364800 + }, + { + "epoch": 5.662719781498781, + "grad_norm": 2.4254872798919678, + "learning_rate": 4.4337295737053654e-05, + "loss": 0.9571, + "step": 364900 + }, + { + "epoch": 5.6642716367417245, + "grad_norm": 2.118703842163086, + "learning_rate": 4.433574388181071e-05, + "loss": 0.9859, + "step": 365000 + }, + { + "epoch": 5.665823491984668, + "grad_norm": 2.064786911010742, + "learning_rate": 4.433419202656776e-05, + "loss": 0.9677, + "step": 365100 + }, + { + "epoch": 5.667375347227611, + "grad_norm": 2.073610544204712, + "learning_rate": 4.433264017132482e-05, + "loss": 0.9748, + "step": 365200 + }, + { + "epoch": 5.668927202470553, + "grad_norm": 1.9572569131851196, + "learning_rate": 4.433108831608188e-05, + "loss": 0.9938, + "step": 365300 + }, + { + "epoch": 5.6704790577134965, + "grad_norm": 2.6718883514404297, + "learning_rate": 4.4329536460838936e-05, + "loss": 0.9724, + "step": 365400 + }, + { + "epoch": 5.67203091295644, + "grad_norm": 2.709984064102173, + "learning_rate": 4.432798460559599e-05, + "loss": 0.9761, + "step": 365500 + }, + { + "epoch": 5.673582768199382, + "grad_norm": 1.9509752988815308, + "learning_rate": 4.4326432750353044e-05, + "loss": 0.9813, + "step": 365600 + }, + { + "epoch": 5.675134623442325, + "grad_norm": 1.991582989692688, + "learning_rate": 4.43248808951101e-05, + "loss": 0.973, + "step": 365700 + }, + { + "epoch": 5.676686478685268, + "grad_norm": 2.279745101928711, + "learning_rate": 4.432332903986716e-05, + "loss": 0.9754, + "step": 365800 + }, + { + "epoch": 5.678238333928212, + "grad_norm": 2.0734293460845947, + "learning_rate": 4.432177718462422e-05, + "loss": 0.9706, + "step": 365900 + }, + { + "epoch": 5.679790189171154, + "grad_norm": 2.3504891395568848, + "learning_rate": 4.4320225329381275e-05, + "loss": 0.9653, + "step": 366000 + }, + { + "epoch": 5.681342044414097, + "grad_norm": 2.5119516849517822, + "learning_rate": 4.431867347413833e-05, + "loss": 0.9888, + "step": 366100 + }, + { + "epoch": 5.68289389965704, + "grad_norm": 1.8904026746749878, + "learning_rate": 4.431712161889539e-05, + "loss": 0.9722, + "step": 366200 + }, + { + "epoch": 5.684445754899983, + "grad_norm": 2.247629404067993, + "learning_rate": 4.431556976365245e-05, + "loss": 0.9781, + "step": 366300 + }, + { + "epoch": 5.685997610142926, + "grad_norm": 2.233731985092163, + "learning_rate": 4.4314017908409506e-05, + "loss": 0.9756, + "step": 366400 + }, + { + "epoch": 5.687549465385869, + "grad_norm": 1.6566970348358154, + "learning_rate": 4.4312466053166564e-05, + "loss": 0.9755, + "step": 366500 + }, + { + "epoch": 5.6891013206288115, + "grad_norm": 2.4865779876708984, + "learning_rate": 4.431091419792362e-05, + "loss": 0.9735, + "step": 366600 + }, + { + "epoch": 5.690653175871755, + "grad_norm": 2.3704440593719482, + "learning_rate": 4.430936234268068e-05, + "loss": 0.9986, + "step": 366700 + }, + { + "epoch": 5.692205031114698, + "grad_norm": 2.21870493888855, + "learning_rate": 4.430781048743774e-05, + "loss": 0.9656, + "step": 366800 + }, + { + "epoch": 5.69375688635764, + "grad_norm": 1.976405143737793, + "learning_rate": 4.430625863219479e-05, + "loss": 0.9802, + "step": 366900 + }, + { + "epoch": 5.695308741600583, + "grad_norm": 2.2283897399902344, + "learning_rate": 4.4304706776951846e-05, + "loss": 1.0038, + "step": 367000 + }, + { + "epoch": 5.696860596843527, + "grad_norm": 2.641090154647827, + "learning_rate": 4.4303154921708904e-05, + "loss": 0.9895, + "step": 367100 + }, + { + "epoch": 5.698412452086469, + "grad_norm": 2.4293911457061768, + "learning_rate": 4.430160306646596e-05, + "loss": 0.9811, + "step": 367200 + }, + { + "epoch": 5.699964307329412, + "grad_norm": 1.9061189889907837, + "learning_rate": 4.430005121122302e-05, + "loss": 0.9638, + "step": 367300 + }, + { + "epoch": 5.701516162572355, + "grad_norm": 2.30849552154541, + "learning_rate": 4.429849935598008e-05, + "loss": 0.9808, + "step": 367400 + }, + { + "epoch": 5.703068017815298, + "grad_norm": 2.3032681941986084, + "learning_rate": 4.4296947500737135e-05, + "loss": 0.9599, + "step": 367500 + }, + { + "epoch": 5.704619873058241, + "grad_norm": 1.7757885456085205, + "learning_rate": 4.429539564549419e-05, + "loss": 0.9832, + "step": 367600 + }, + { + "epoch": 5.706171728301184, + "grad_norm": 2.3700315952301025, + "learning_rate": 4.429384379025125e-05, + "loss": 0.9867, + "step": 367700 + }, + { + "epoch": 5.707723583544127, + "grad_norm": 2.00622296333313, + "learning_rate": 4.429229193500831e-05, + "loss": 0.9851, + "step": 367800 + }, + { + "epoch": 5.70927543878707, + "grad_norm": 2.456026554107666, + "learning_rate": 4.4290740079765366e-05, + "loss": 0.9851, + "step": 367900 + }, + { + "epoch": 5.710827294030013, + "grad_norm": 2.2019221782684326, + "learning_rate": 4.4289188224522424e-05, + "loss": 0.9873, + "step": 368000 + }, + { + "epoch": 5.712379149272956, + "grad_norm": 2.2395827770233154, + "learning_rate": 4.428763636927948e-05, + "loss": 0.978, + "step": 368100 + }, + { + "epoch": 5.713931004515898, + "grad_norm": 1.8059297800064087, + "learning_rate": 4.428608451403653e-05, + "loss": 0.9671, + "step": 368200 + }, + { + "epoch": 5.715482859758842, + "grad_norm": 1.5375856161117554, + "learning_rate": 4.428453265879359e-05, + "loss": 0.9501, + "step": 368300 + }, + { + "epoch": 5.717034715001785, + "grad_norm": 1.846731185913086, + "learning_rate": 4.428298080355064e-05, + "loss": 0.9916, + "step": 368400 + }, + { + "epoch": 5.718586570244728, + "grad_norm": 2.528812885284424, + "learning_rate": 4.42814289483077e-05, + "loss": 0.9642, + "step": 368500 + }, + { + "epoch": 5.72013842548767, + "grad_norm": 2.11910343170166, + "learning_rate": 4.4279877093064757e-05, + "loss": 0.977, + "step": 368600 + }, + { + "epoch": 5.721690280730614, + "grad_norm": 2.360161781311035, + "learning_rate": 4.4278325237821814e-05, + "loss": 0.9816, + "step": 368700 + }, + { + "epoch": 5.723242135973557, + "grad_norm": 2.2701621055603027, + "learning_rate": 4.427677338257887e-05, + "loss": 0.9824, + "step": 368800 + }, + { + "epoch": 5.724793991216499, + "grad_norm": 2.580899715423584, + "learning_rate": 4.427522152733593e-05, + "loss": 0.9567, + "step": 368900 + }, + { + "epoch": 5.726345846459442, + "grad_norm": 2.189096212387085, + "learning_rate": 4.427366967209299e-05, + "loss": 0.9943, + "step": 369000 + }, + { + "epoch": 5.7278977017023855, + "grad_norm": 2.4974076747894287, + "learning_rate": 4.4272117816850045e-05, + "loss": 1.0001, + "step": 369100 + }, + { + "epoch": 5.729449556945328, + "grad_norm": 2.4761641025543213, + "learning_rate": 4.42705659616071e-05, + "loss": 0.9907, + "step": 369200 + }, + { + "epoch": 5.731001412188271, + "grad_norm": 1.7815697193145752, + "learning_rate": 4.426901410636416e-05, + "loss": 1.0, + "step": 369300 + }, + { + "epoch": 5.732553267431214, + "grad_norm": 2.054739475250244, + "learning_rate": 4.426746225112122e-05, + "loss": 0.9767, + "step": 369400 + }, + { + "epoch": 5.734105122674157, + "grad_norm": 1.956700086593628, + "learning_rate": 4.4265910395878276e-05, + "loss": 0.9688, + "step": 369500 + }, + { + "epoch": 5.7356569779171, + "grad_norm": 2.280845880508423, + "learning_rate": 4.4264358540635334e-05, + "loss": 0.994, + "step": 369600 + }, + { + "epoch": 5.737208833160043, + "grad_norm": 2.2923591136932373, + "learning_rate": 4.4262806685392385e-05, + "loss": 0.9911, + "step": 369700 + }, + { + "epoch": 5.738760688402985, + "grad_norm": 2.2898237705230713, + "learning_rate": 4.426125483014944e-05, + "loss": 0.9652, + "step": 369800 + }, + { + "epoch": 5.740312543645929, + "grad_norm": 2.1933066844940186, + "learning_rate": 4.42597029749065e-05, + "loss": 0.978, + "step": 369900 + }, + { + "epoch": 5.741864398888872, + "grad_norm": 2.2684850692749023, + "learning_rate": 4.425815111966356e-05, + "loss": 0.9722, + "step": 370000 + }, + { + "epoch": 5.743416254131814, + "grad_norm": 2.007585048675537, + "learning_rate": 4.4256599264420616e-05, + "loss": 0.9852, + "step": 370100 + }, + { + "epoch": 5.744968109374757, + "grad_norm": 3.4612252712249756, + "learning_rate": 4.4255047409177674e-05, + "loss": 0.9962, + "step": 370200 + }, + { + "epoch": 5.7465199646177005, + "grad_norm": 2.068007469177246, + "learning_rate": 4.425349555393473e-05, + "loss": 0.9944, + "step": 370300 + }, + { + "epoch": 5.748071819860644, + "grad_norm": 2.2374267578125, + "learning_rate": 4.425194369869179e-05, + "loss": 0.9684, + "step": 370400 + }, + { + "epoch": 5.749623675103586, + "grad_norm": 2.325625419616699, + "learning_rate": 4.425039184344885e-05, + "loss": 0.9633, + "step": 370500 + }, + { + "epoch": 5.751175530346529, + "grad_norm": 2.280376672744751, + "learning_rate": 4.4248839988205905e-05, + "loss": 0.9766, + "step": 370600 + }, + { + "epoch": 5.7527273855894725, + "grad_norm": 1.8664424419403076, + "learning_rate": 4.424728813296296e-05, + "loss": 0.9654, + "step": 370700 + }, + { + "epoch": 5.754279240832415, + "grad_norm": 1.771437406539917, + "learning_rate": 4.424573627772002e-05, + "loss": 0.9706, + "step": 370800 + }, + { + "epoch": 5.755831096075358, + "grad_norm": 2.1752688884735107, + "learning_rate": 4.424418442247708e-05, + "loss": 0.9998, + "step": 370900 + }, + { + "epoch": 5.757382951318301, + "grad_norm": 2.3009305000305176, + "learning_rate": 4.4242632567234136e-05, + "loss": 0.9593, + "step": 371000 + }, + { + "epoch": 5.7589348065612445, + "grad_norm": 2.323765516281128, + "learning_rate": 4.424108071199119e-05, + "loss": 0.9644, + "step": 371100 + }, + { + "epoch": 5.760486661804187, + "grad_norm": 2.078720808029175, + "learning_rate": 4.4239528856748245e-05, + "loss": 0.9851, + "step": 371200 + }, + { + "epoch": 5.76203851704713, + "grad_norm": 2.0161325931549072, + "learning_rate": 4.42379770015053e-05, + "loss": 0.9686, + "step": 371300 + }, + { + "epoch": 5.763590372290073, + "grad_norm": 2.4879724979400635, + "learning_rate": 4.423642514626236e-05, + "loss": 0.9712, + "step": 371400 + }, + { + "epoch": 5.7651422275330155, + "grad_norm": 2.339918851852417, + "learning_rate": 4.423487329101942e-05, + "loss": 0.9813, + "step": 371500 + }, + { + "epoch": 5.766694082775959, + "grad_norm": 2.619237184524536, + "learning_rate": 4.423332143577647e-05, + "loss": 0.9816, + "step": 371600 + }, + { + "epoch": 5.768245938018902, + "grad_norm": 2.4511849880218506, + "learning_rate": 4.4231769580533527e-05, + "loss": 0.9873, + "step": 371700 + }, + { + "epoch": 5.769797793261844, + "grad_norm": 2.32576322555542, + "learning_rate": 4.4230217725290584e-05, + "loss": 0.973, + "step": 371800 + }, + { + "epoch": 5.7713496485047875, + "grad_norm": 2.0943193435668945, + "learning_rate": 4.422866587004764e-05, + "loss": 0.984, + "step": 371900 + }, + { + "epoch": 5.772901503747731, + "grad_norm": 2.5125226974487305, + "learning_rate": 4.42271140148047e-05, + "loss": 0.9834, + "step": 372000 + }, + { + "epoch": 5.774453358990673, + "grad_norm": 1.7996180057525635, + "learning_rate": 4.422556215956176e-05, + "loss": 0.9573, + "step": 372100 + }, + { + "epoch": 5.776005214233616, + "grad_norm": 2.6592886447906494, + "learning_rate": 4.4224010304318815e-05, + "loss": 0.9756, + "step": 372200 + }, + { + "epoch": 5.7775570694765594, + "grad_norm": 2.104086399078369, + "learning_rate": 4.422245844907587e-05, + "loss": 0.9862, + "step": 372300 + }, + { + "epoch": 5.779108924719502, + "grad_norm": 2.2881596088409424, + "learning_rate": 4.422090659383293e-05, + "loss": 0.9776, + "step": 372400 + }, + { + "epoch": 5.780660779962445, + "grad_norm": 2.197622060775757, + "learning_rate": 4.421935473858999e-05, + "loss": 0.9878, + "step": 372500 + }, + { + "epoch": 5.782212635205388, + "grad_norm": 2.0641188621520996, + "learning_rate": 4.421780288334704e-05, + "loss": 0.9907, + "step": 372600 + }, + { + "epoch": 5.7837644904483305, + "grad_norm": 2.9450650215148926, + "learning_rate": 4.42162510281041e-05, + "loss": 0.9992, + "step": 372700 + }, + { + "epoch": 5.785316345691274, + "grad_norm": 2.5186665058135986, + "learning_rate": 4.4214699172861155e-05, + "loss": 0.987, + "step": 372800 + }, + { + "epoch": 5.786868200934217, + "grad_norm": 2.3204572200775146, + "learning_rate": 4.421314731761821e-05, + "loss": 0.9616, + "step": 372900 + }, + { + "epoch": 5.78842005617716, + "grad_norm": 2.0976014137268066, + "learning_rate": 4.421159546237527e-05, + "loss": 0.9595, + "step": 373000 + }, + { + "epoch": 5.7899719114201025, + "grad_norm": 2.344311237335205, + "learning_rate": 4.421004360713233e-05, + "loss": 0.9817, + "step": 373100 + }, + { + "epoch": 5.791523766663046, + "grad_norm": 2.5947647094726562, + "learning_rate": 4.4208491751889386e-05, + "loss": 0.9839, + "step": 373200 + }, + { + "epoch": 5.793075621905989, + "grad_norm": 2.466034173965454, + "learning_rate": 4.4206939896646444e-05, + "loss": 0.9705, + "step": 373300 + }, + { + "epoch": 5.794627477148931, + "grad_norm": 2.138929843902588, + "learning_rate": 4.42053880414035e-05, + "loss": 0.9637, + "step": 373400 + }, + { + "epoch": 5.796179332391874, + "grad_norm": 2.1223342418670654, + "learning_rate": 4.420383618616056e-05, + "loss": 0.9927, + "step": 373500 + }, + { + "epoch": 5.797731187634818, + "grad_norm": 2.2264058589935303, + "learning_rate": 4.420228433091762e-05, + "loss": 0.9862, + "step": 373600 + }, + { + "epoch": 5.79928304287776, + "grad_norm": 1.9103014469146729, + "learning_rate": 4.4200732475674675e-05, + "loss": 0.9733, + "step": 373700 + }, + { + "epoch": 5.800834898120703, + "grad_norm": 2.2136528491973877, + "learning_rate": 4.419918062043173e-05, + "loss": 0.9758, + "step": 373800 + }, + { + "epoch": 5.802386753363646, + "grad_norm": 1.8761736154556274, + "learning_rate": 4.4197628765188784e-05, + "loss": 0.9641, + "step": 373900 + }, + { + "epoch": 5.80393860860659, + "grad_norm": 2.298898935317993, + "learning_rate": 4.419607690994584e-05, + "loss": 0.9767, + "step": 374000 + }, + { + "epoch": 5.805490463849532, + "grad_norm": 2.324634075164795, + "learning_rate": 4.41945250547029e-05, + "loss": 0.9727, + "step": 374100 + }, + { + "epoch": 5.807042319092475, + "grad_norm": 2.0672354698181152, + "learning_rate": 4.419297319945996e-05, + "loss": 0.9526, + "step": 374200 + }, + { + "epoch": 5.808594174335418, + "grad_norm": 2.249347448348999, + "learning_rate": 4.4191421344217015e-05, + "loss": 0.9805, + "step": 374300 + }, + { + "epoch": 5.810146029578361, + "grad_norm": 1.9111438989639282, + "learning_rate": 4.418986948897407e-05, + "loss": 0.9811, + "step": 374400 + }, + { + "epoch": 5.811697884821304, + "grad_norm": 2.5725488662719727, + "learning_rate": 4.418831763373113e-05, + "loss": 0.9623, + "step": 374500 + }, + { + "epoch": 5.813249740064247, + "grad_norm": 1.861810326576233, + "learning_rate": 4.418676577848819e-05, + "loss": 0.9737, + "step": 374600 + }, + { + "epoch": 5.814801595307189, + "grad_norm": 2.182191848754883, + "learning_rate": 4.4185213923245246e-05, + "loss": 0.9779, + "step": 374700 + }, + { + "epoch": 5.816353450550133, + "grad_norm": 2.236327886581421, + "learning_rate": 4.4183662068002297e-05, + "loss": 0.9689, + "step": 374800 + }, + { + "epoch": 5.817905305793076, + "grad_norm": 1.841439962387085, + "learning_rate": 4.4182110212759354e-05, + "loss": 0.9819, + "step": 374900 + }, + { + "epoch": 5.819457161036018, + "grad_norm": 2.1086463928222656, + "learning_rate": 4.418055835751641e-05, + "loss": 0.9807, + "step": 375000 + }, + { + "epoch": 5.821009016278961, + "grad_norm": 2.0292575359344482, + "learning_rate": 4.417900650227347e-05, + "loss": 0.966, + "step": 375100 + }, + { + "epoch": 5.822560871521905, + "grad_norm": 2.2531540393829346, + "learning_rate": 4.417745464703053e-05, + "loss": 0.9733, + "step": 375200 + }, + { + "epoch": 5.824112726764847, + "grad_norm": 2.0251357555389404, + "learning_rate": 4.4175902791787585e-05, + "loss": 0.9733, + "step": 375300 + }, + { + "epoch": 5.82566458200779, + "grad_norm": 2.3481085300445557, + "learning_rate": 4.4174350936544636e-05, + "loss": 0.9778, + "step": 375400 + }, + { + "epoch": 5.827216437250733, + "grad_norm": 2.078108072280884, + "learning_rate": 4.4172799081301694e-05, + "loss": 0.9829, + "step": 375500 + }, + { + "epoch": 5.828768292493677, + "grad_norm": 2.5529396533966064, + "learning_rate": 4.417124722605875e-05, + "loss": 0.9915, + "step": 375600 + }, + { + "epoch": 5.830320147736619, + "grad_norm": 2.742966413497925, + "learning_rate": 4.416969537081581e-05, + "loss": 0.9655, + "step": 375700 + }, + { + "epoch": 5.831872002979562, + "grad_norm": 2.233820915222168, + "learning_rate": 4.416814351557287e-05, + "loss": 1.0054, + "step": 375800 + }, + { + "epoch": 5.833423858222505, + "grad_norm": 2.059288501739502, + "learning_rate": 4.4166591660329925e-05, + "loss": 0.9575, + "step": 375900 + }, + { + "epoch": 5.834975713465448, + "grad_norm": 2.4791860580444336, + "learning_rate": 4.416503980508698e-05, + "loss": 0.9981, + "step": 376000 + }, + { + "epoch": 5.836527568708391, + "grad_norm": 1.7062909603118896, + "learning_rate": 4.416348794984404e-05, + "loss": 0.9753, + "step": 376100 + }, + { + "epoch": 5.838079423951334, + "grad_norm": 2.218681573867798, + "learning_rate": 4.41619360946011e-05, + "loss": 0.9926, + "step": 376200 + }, + { + "epoch": 5.839631279194276, + "grad_norm": 2.3979766368865967, + "learning_rate": 4.4160384239358156e-05, + "loss": 0.9836, + "step": 376300 + }, + { + "epoch": 5.84118313443722, + "grad_norm": 2.0802865028381348, + "learning_rate": 4.4158832384115214e-05, + "loss": 0.9766, + "step": 376400 + }, + { + "epoch": 5.842734989680163, + "grad_norm": 2.42814302444458, + "learning_rate": 4.415728052887227e-05, + "loss": 0.9783, + "step": 376500 + }, + { + "epoch": 5.844286844923106, + "grad_norm": 2.0447919368743896, + "learning_rate": 4.415572867362933e-05, + "loss": 0.9684, + "step": 376600 + }, + { + "epoch": 5.845838700166048, + "grad_norm": 1.781868815422058, + "learning_rate": 4.415417681838638e-05, + "loss": 0.9806, + "step": 376700 + }, + { + "epoch": 5.8473905554089916, + "grad_norm": 2.0194475650787354, + "learning_rate": 4.415262496314344e-05, + "loss": 0.9821, + "step": 376800 + }, + { + "epoch": 5.848942410651935, + "grad_norm": 2.089266300201416, + "learning_rate": 4.4151073107900496e-05, + "loss": 0.9828, + "step": 376900 + }, + { + "epoch": 5.850494265894877, + "grad_norm": 2.3462977409362793, + "learning_rate": 4.4149521252657554e-05, + "loss": 0.9866, + "step": 377000 + }, + { + "epoch": 5.85204612113782, + "grad_norm": 2.3346357345581055, + "learning_rate": 4.414796939741461e-05, + "loss": 0.9712, + "step": 377100 + }, + { + "epoch": 5.8535979763807635, + "grad_norm": 2.413771152496338, + "learning_rate": 4.414641754217167e-05, + "loss": 0.9934, + "step": 377200 + }, + { + "epoch": 5.855149831623706, + "grad_norm": 1.955029845237732, + "learning_rate": 4.414486568692873e-05, + "loss": 0.9634, + "step": 377300 + }, + { + "epoch": 5.856701686866649, + "grad_norm": 2.580462694168091, + "learning_rate": 4.4143313831685785e-05, + "loss": 0.9718, + "step": 377400 + }, + { + "epoch": 5.858253542109592, + "grad_norm": 2.2274577617645264, + "learning_rate": 4.414176197644284e-05, + "loss": 0.9646, + "step": 377500 + }, + { + "epoch": 5.859805397352535, + "grad_norm": 2.120365858078003, + "learning_rate": 4.41402101211999e-05, + "loss": 0.9967, + "step": 377600 + }, + { + "epoch": 5.861357252595478, + "grad_norm": 2.3951311111450195, + "learning_rate": 4.413865826595696e-05, + "loss": 0.9752, + "step": 377700 + }, + { + "epoch": 5.862909107838421, + "grad_norm": 2.3355135917663574, + "learning_rate": 4.4137106410714016e-05, + "loss": 0.9774, + "step": 377800 + }, + { + "epoch": 5.864460963081363, + "grad_norm": 1.7190966606140137, + "learning_rate": 4.413555455547107e-05, + "loss": 0.9802, + "step": 377900 + }, + { + "epoch": 5.8660128183243065, + "grad_norm": 1.934046983718872, + "learning_rate": 4.4134002700228124e-05, + "loss": 0.9496, + "step": 378000 + }, + { + "epoch": 5.86756467356725, + "grad_norm": 2.399535894393921, + "learning_rate": 4.413245084498518e-05, + "loss": 0.9811, + "step": 378100 + }, + { + "epoch": 5.869116528810193, + "grad_norm": 2.241931676864624, + "learning_rate": 4.413089898974224e-05, + "loss": 0.9815, + "step": 378200 + }, + { + "epoch": 5.870668384053135, + "grad_norm": 2.0503292083740234, + "learning_rate": 4.412934713449929e-05, + "loss": 0.9734, + "step": 378300 + }, + { + "epoch": 5.8722202392960785, + "grad_norm": 2.3869497776031494, + "learning_rate": 4.412779527925635e-05, + "loss": 0.9763, + "step": 378400 + }, + { + "epoch": 5.873772094539022, + "grad_norm": 2.2284083366394043, + "learning_rate": 4.4126243424013406e-05, + "loss": 0.9717, + "step": 378500 + }, + { + "epoch": 5.875323949781964, + "grad_norm": 2.028057813644409, + "learning_rate": 4.4124691568770464e-05, + "loss": 0.9794, + "step": 378600 + }, + { + "epoch": 5.876875805024907, + "grad_norm": 1.99371337890625, + "learning_rate": 4.412313971352752e-05, + "loss": 0.9655, + "step": 378700 + }, + { + "epoch": 5.8784276602678505, + "grad_norm": 2.3860819339752197, + "learning_rate": 4.412158785828458e-05, + "loss": 0.9851, + "step": 378800 + }, + { + "epoch": 5.879979515510793, + "grad_norm": 2.0741422176361084, + "learning_rate": 4.412003600304164e-05, + "loss": 0.9856, + "step": 378900 + }, + { + "epoch": 5.881531370753736, + "grad_norm": 2.468414783477783, + "learning_rate": 4.4118484147798695e-05, + "loss": 0.9746, + "step": 379000 + }, + { + "epoch": 5.883083225996679, + "grad_norm": 2.512202262878418, + "learning_rate": 4.411693229255575e-05, + "loss": 0.9681, + "step": 379100 + }, + { + "epoch": 5.884635081239622, + "grad_norm": 2.6756885051727295, + "learning_rate": 4.411538043731281e-05, + "loss": 0.9735, + "step": 379200 + }, + { + "epoch": 5.886186936482565, + "grad_norm": 2.1895599365234375, + "learning_rate": 4.411382858206987e-05, + "loss": 0.9814, + "step": 379300 + }, + { + "epoch": 5.887738791725508, + "grad_norm": 2.8431382179260254, + "learning_rate": 4.4112276726826926e-05, + "loss": 0.9715, + "step": 379400 + }, + { + "epoch": 5.889290646968451, + "grad_norm": 2.049410581588745, + "learning_rate": 4.4110724871583984e-05, + "loss": 0.9832, + "step": 379500 + }, + { + "epoch": 5.8908425022113935, + "grad_norm": 2.2480385303497314, + "learning_rate": 4.4109173016341035e-05, + "loss": 0.9764, + "step": 379600 + }, + { + "epoch": 5.892394357454337, + "grad_norm": 2.210885524749756, + "learning_rate": 4.410762116109809e-05, + "loss": 0.9729, + "step": 379700 + }, + { + "epoch": 5.89394621269728, + "grad_norm": 1.8914847373962402, + "learning_rate": 4.410606930585515e-05, + "loss": 0.9836, + "step": 379800 + }, + { + "epoch": 5.895498067940222, + "grad_norm": 2.6204538345336914, + "learning_rate": 4.410451745061221e-05, + "loss": 0.9681, + "step": 379900 + }, + { + "epoch": 5.8970499231831655, + "grad_norm": 2.115401268005371, + "learning_rate": 4.4102965595369266e-05, + "loss": 0.98, + "step": 380000 + }, + { + "epoch": 5.898601778426109, + "grad_norm": 2.5905754566192627, + "learning_rate": 4.4101413740126324e-05, + "loss": 0.981, + "step": 380100 + }, + { + "epoch": 5.900153633669051, + "grad_norm": 2.018876552581787, + "learning_rate": 4.409986188488338e-05, + "loss": 0.9667, + "step": 380200 + }, + { + "epoch": 5.901705488911994, + "grad_norm": 2.142106533050537, + "learning_rate": 4.409831002964044e-05, + "loss": 0.9735, + "step": 380300 + }, + { + "epoch": 5.903257344154937, + "grad_norm": 2.063349962234497, + "learning_rate": 4.40967581743975e-05, + "loss": 0.9653, + "step": 380400 + }, + { + "epoch": 5.90480919939788, + "grad_norm": 2.105029344558716, + "learning_rate": 4.4095206319154555e-05, + "loss": 0.9664, + "step": 380500 + }, + { + "epoch": 5.906361054640823, + "grad_norm": 1.9629491567611694, + "learning_rate": 4.409365446391161e-05, + "loss": 0.9836, + "step": 380600 + }, + { + "epoch": 5.907912909883766, + "grad_norm": 2.533064126968384, + "learning_rate": 4.409210260866867e-05, + "loss": 0.9727, + "step": 380700 + }, + { + "epoch": 5.9094647651267085, + "grad_norm": 2.4336819648742676, + "learning_rate": 4.409055075342573e-05, + "loss": 0.9861, + "step": 380800 + }, + { + "epoch": 5.911016620369652, + "grad_norm": 2.2445414066314697, + "learning_rate": 4.408899889818278e-05, + "loss": 0.9875, + "step": 380900 + }, + { + "epoch": 5.912568475612595, + "grad_norm": 1.9899882078170776, + "learning_rate": 4.4087447042939837e-05, + "loss": 0.997, + "step": 381000 + }, + { + "epoch": 5.914120330855538, + "grad_norm": 2.161224603652954, + "learning_rate": 4.4085895187696894e-05, + "loss": 0.9705, + "step": 381100 + }, + { + "epoch": 5.9156721860984804, + "grad_norm": 1.702340841293335, + "learning_rate": 4.408434333245395e-05, + "loss": 0.9906, + "step": 381200 + }, + { + "epoch": 5.917224041341424, + "grad_norm": 2.1645522117614746, + "learning_rate": 4.4082791477211e-05, + "loss": 0.9715, + "step": 381300 + }, + { + "epoch": 5.918775896584367, + "grad_norm": 1.9506837129592896, + "learning_rate": 4.408123962196806e-05, + "loss": 0.9542, + "step": 381400 + }, + { + "epoch": 5.920327751827309, + "grad_norm": 2.4919285774230957, + "learning_rate": 4.407968776672512e-05, + "loss": 0.9472, + "step": 381500 + }, + { + "epoch": 5.921879607070252, + "grad_norm": 2.1404054164886475, + "learning_rate": 4.4078135911482176e-05, + "loss": 0.98, + "step": 381600 + }, + { + "epoch": 5.923431462313196, + "grad_norm": 2.1776788234710693, + "learning_rate": 4.4076584056239234e-05, + "loss": 0.9793, + "step": 381700 + }, + { + "epoch": 5.924983317556139, + "grad_norm": 2.2884573936462402, + "learning_rate": 4.407503220099629e-05, + "loss": 0.9855, + "step": 381800 + }, + { + "epoch": 5.926535172799081, + "grad_norm": 2.4969139099121094, + "learning_rate": 4.407348034575335e-05, + "loss": 0.9622, + "step": 381900 + }, + { + "epoch": 5.928087028042024, + "grad_norm": 2.166203260421753, + "learning_rate": 4.407192849051041e-05, + "loss": 0.9908, + "step": 382000 + }, + { + "epoch": 5.929638883284968, + "grad_norm": 2.4165701866149902, + "learning_rate": 4.4070376635267465e-05, + "loss": 0.9571, + "step": 382100 + }, + { + "epoch": 5.93119073852791, + "grad_norm": 1.8546074628829956, + "learning_rate": 4.406882478002452e-05, + "loss": 0.9829, + "step": 382200 + }, + { + "epoch": 5.932742593770853, + "grad_norm": 2.0221145153045654, + "learning_rate": 4.406727292478158e-05, + "loss": 0.9814, + "step": 382300 + }, + { + "epoch": 5.934294449013796, + "grad_norm": 1.9089550971984863, + "learning_rate": 4.406572106953863e-05, + "loss": 1.0114, + "step": 382400 + }, + { + "epoch": 5.935846304256739, + "grad_norm": 2.201709270477295, + "learning_rate": 4.406416921429569e-05, + "loss": 0.9635, + "step": 382500 + }, + { + "epoch": 5.937398159499682, + "grad_norm": 1.7766344547271729, + "learning_rate": 4.406261735905275e-05, + "loss": 0.9902, + "step": 382600 + }, + { + "epoch": 5.938950014742625, + "grad_norm": 2.4231464862823486, + "learning_rate": 4.4061065503809805e-05, + "loss": 0.9776, + "step": 382700 + }, + { + "epoch": 5.940501869985567, + "grad_norm": 1.9196124076843262, + "learning_rate": 4.405951364856686e-05, + "loss": 0.955, + "step": 382800 + }, + { + "epoch": 5.942053725228511, + "grad_norm": 2.0492331981658936, + "learning_rate": 4.405796179332392e-05, + "loss": 0.9819, + "step": 382900 + }, + { + "epoch": 5.943605580471454, + "grad_norm": 2.2187132835388184, + "learning_rate": 4.405640993808098e-05, + "loss": 0.9615, + "step": 383000 + }, + { + "epoch": 5.945157435714396, + "grad_norm": 2.631035327911377, + "learning_rate": 4.4054858082838036e-05, + "loss": 0.9849, + "step": 383100 + }, + { + "epoch": 5.946709290957339, + "grad_norm": 2.206575632095337, + "learning_rate": 4.4053306227595094e-05, + "loss": 0.9638, + "step": 383200 + }, + { + "epoch": 5.948261146200283, + "grad_norm": 2.0412869453430176, + "learning_rate": 4.405175437235215e-05, + "loss": 0.9709, + "step": 383300 + }, + { + "epoch": 5.949813001443225, + "grad_norm": 2.055488109588623, + "learning_rate": 4.405020251710921e-05, + "loss": 0.9785, + "step": 383400 + }, + { + "epoch": 5.951364856686168, + "grad_norm": 2.452118396759033, + "learning_rate": 4.404865066186627e-05, + "loss": 1.0, + "step": 383500 + }, + { + "epoch": 5.952916711929111, + "grad_norm": 2.1359150409698486, + "learning_rate": 4.4047098806623325e-05, + "loss": 0.9879, + "step": 383600 + }, + { + "epoch": 5.9544685671720545, + "grad_norm": 1.9746674299240112, + "learning_rate": 4.4045546951380376e-05, + "loss": 0.9708, + "step": 383700 + }, + { + "epoch": 5.956020422414997, + "grad_norm": 2.2791800498962402, + "learning_rate": 4.404399509613743e-05, + "loss": 0.9781, + "step": 383800 + }, + { + "epoch": 5.95757227765794, + "grad_norm": 1.9660913944244385, + "learning_rate": 4.404244324089449e-05, + "loss": 0.9687, + "step": 383900 + }, + { + "epoch": 5.959124132900883, + "grad_norm": 2.287830114364624, + "learning_rate": 4.404089138565155e-05, + "loss": 0.9561, + "step": 384000 + }, + { + "epoch": 5.960675988143826, + "grad_norm": 2.1479904651641846, + "learning_rate": 4.4039339530408607e-05, + "loss": 0.9654, + "step": 384100 + }, + { + "epoch": 5.962227843386769, + "grad_norm": 2.3872644901275635, + "learning_rate": 4.4037787675165664e-05, + "loss": 0.9656, + "step": 384200 + }, + { + "epoch": 5.963779698629712, + "grad_norm": 2.462561845779419, + "learning_rate": 4.403623581992272e-05, + "loss": 0.9917, + "step": 384300 + }, + { + "epoch": 5.965331553872655, + "grad_norm": 2.225314140319824, + "learning_rate": 4.403468396467978e-05, + "loss": 0.986, + "step": 384400 + }, + { + "epoch": 5.966883409115598, + "grad_norm": 2.2320430278778076, + "learning_rate": 4.403313210943684e-05, + "loss": 0.9701, + "step": 384500 + }, + { + "epoch": 5.968435264358541, + "grad_norm": 2.099632740020752, + "learning_rate": 4.403158025419389e-05, + "loss": 0.9824, + "step": 384600 + }, + { + "epoch": 5.969987119601484, + "grad_norm": 2.051234245300293, + "learning_rate": 4.4030028398950946e-05, + "loss": 0.9818, + "step": 384700 + }, + { + "epoch": 5.971538974844426, + "grad_norm": 1.902363657951355, + "learning_rate": 4.4028476543708004e-05, + "loss": 0.9866, + "step": 384800 + }, + { + "epoch": 5.9730908300873695, + "grad_norm": 2.5692336559295654, + "learning_rate": 4.402692468846506e-05, + "loss": 0.9729, + "step": 384900 + }, + { + "epoch": 5.974642685330313, + "grad_norm": 2.3582212924957275, + "learning_rate": 4.402537283322212e-05, + "loss": 0.9598, + "step": 385000 + }, + { + "epoch": 5.976194540573255, + "grad_norm": 1.9450781345367432, + "learning_rate": 4.402382097797918e-05, + "loss": 0.9704, + "step": 385100 + }, + { + "epoch": 5.977746395816198, + "grad_norm": 2.6561570167541504, + "learning_rate": 4.402226912273623e-05, + "loss": 0.9858, + "step": 385200 + }, + { + "epoch": 5.9792982510591415, + "grad_norm": 1.969224452972412, + "learning_rate": 4.4020717267493286e-05, + "loss": 0.9733, + "step": 385300 + }, + { + "epoch": 5.980850106302084, + "grad_norm": 2.226437568664551, + "learning_rate": 4.4019165412250344e-05, + "loss": 0.9774, + "step": 385400 + }, + { + "epoch": 5.982401961545027, + "grad_norm": 2.0637199878692627, + "learning_rate": 4.40176135570074e-05, + "loss": 0.976, + "step": 385500 + }, + { + "epoch": 5.98395381678797, + "grad_norm": 2.163114309310913, + "learning_rate": 4.401606170176446e-05, + "loss": 0.972, + "step": 385600 + }, + { + "epoch": 5.9855056720309125, + "grad_norm": 2.5656440258026123, + "learning_rate": 4.401450984652152e-05, + "loss": 0.9693, + "step": 385700 + }, + { + "epoch": 5.987057527273856, + "grad_norm": 2.2277262210845947, + "learning_rate": 4.4012957991278575e-05, + "loss": 0.9789, + "step": 385800 + }, + { + "epoch": 5.988609382516799, + "grad_norm": 2.257556200027466, + "learning_rate": 4.401140613603563e-05, + "loss": 0.984, + "step": 385900 + }, + { + "epoch": 5.990161237759741, + "grad_norm": 2.409364700317383, + "learning_rate": 4.400985428079269e-05, + "loss": 0.9887, + "step": 386000 + }, + { + "epoch": 5.9917130930026845, + "grad_norm": 1.9965325593948364, + "learning_rate": 4.400830242554975e-05, + "loss": 0.9736, + "step": 386100 + }, + { + "epoch": 5.993264948245628, + "grad_norm": 2.191364288330078, + "learning_rate": 4.4006750570306806e-05, + "loss": 0.9752, + "step": 386200 + }, + { + "epoch": 5.994816803488571, + "grad_norm": 2.4621355533599854, + "learning_rate": 4.4005198715063864e-05, + "loss": 0.9787, + "step": 386300 + }, + { + "epoch": 5.996368658731513, + "grad_norm": 2.0752222537994385, + "learning_rate": 4.400364685982092e-05, + "loss": 0.9741, + "step": 386400 + }, + { + "epoch": 5.9979205139744565, + "grad_norm": 2.313493490219116, + "learning_rate": 4.400209500457797e-05, + "loss": 0.9662, + "step": 386500 + }, + { + "epoch": 5.9994723692174, + "grad_norm": 2.3873605728149414, + "learning_rate": 4.400054314933503e-05, + "loss": 0.9745, + "step": 386600 + }, + { + "epoch": 6.001024224460342, + "grad_norm": 2.000333070755005, + "learning_rate": 4.399899129409209e-05, + "loss": 0.9511, + "step": 386700 + }, + { + "epoch": 6.002576079703285, + "grad_norm": 2.271426200866699, + "learning_rate": 4.3997439438849146e-05, + "loss": 0.9531, + "step": 386800 + }, + { + "epoch": 6.004127934946228, + "grad_norm": 2.0502657890319824, + "learning_rate": 4.39958875836062e-05, + "loss": 0.962, + "step": 386900 + }, + { + "epoch": 6.005679790189171, + "grad_norm": 2.212423801422119, + "learning_rate": 4.399433572836326e-05, + "loss": 0.9593, + "step": 387000 + }, + { + "epoch": 6.007231645432114, + "grad_norm": 2.1144490242004395, + "learning_rate": 4.399278387312032e-05, + "loss": 0.9611, + "step": 387100 + }, + { + "epoch": 6.008783500675057, + "grad_norm": 2.4727673530578613, + "learning_rate": 4.3991232017877377e-05, + "loss": 0.9381, + "step": 387200 + }, + { + "epoch": 6.010335355918, + "grad_norm": 1.871916651725769, + "learning_rate": 4.3989680162634434e-05, + "loss": 0.9543, + "step": 387300 + }, + { + "epoch": 6.011887211160943, + "grad_norm": 2.118690252304077, + "learning_rate": 4.398812830739149e-05, + "loss": 0.9445, + "step": 387400 + }, + { + "epoch": 6.013439066403886, + "grad_norm": 2.5680811405181885, + "learning_rate": 4.398657645214855e-05, + "loss": 0.9319, + "step": 387500 + }, + { + "epoch": 6.014990921646829, + "grad_norm": 2.3992414474487305, + "learning_rate": 4.398502459690561e-05, + "loss": 0.9731, + "step": 387600 + }, + { + "epoch": 6.0165427768897715, + "grad_norm": 2.5568087100982666, + "learning_rate": 4.3983472741662665e-05, + "loss": 0.9698, + "step": 387700 + }, + { + "epoch": 6.018094632132715, + "grad_norm": 2.3998708724975586, + "learning_rate": 4.3981920886419716e-05, + "loss": 0.9699, + "step": 387800 + }, + { + "epoch": 6.019646487375658, + "grad_norm": 2.113197088241577, + "learning_rate": 4.3980369031176774e-05, + "loss": 0.9643, + "step": 387900 + }, + { + "epoch": 6.0211983426186, + "grad_norm": 2.3096909523010254, + "learning_rate": 4.397881717593383e-05, + "loss": 0.954, + "step": 388000 + }, + { + "epoch": 6.022750197861543, + "grad_norm": 2.312952756881714, + "learning_rate": 4.397726532069088e-05, + "loss": 0.974, + "step": 388100 + }, + { + "epoch": 6.024302053104487, + "grad_norm": 2.1558468341827393, + "learning_rate": 4.397571346544794e-05, + "loss": 0.9675, + "step": 388200 + }, + { + "epoch": 6.025853908347429, + "grad_norm": 2.493319272994995, + "learning_rate": 4.3974161610205e-05, + "loss": 0.9674, + "step": 388300 + }, + { + "epoch": 6.027405763590372, + "grad_norm": 2.3087003231048584, + "learning_rate": 4.3972609754962056e-05, + "loss": 0.9808, + "step": 388400 + }, + { + "epoch": 6.028957618833315, + "grad_norm": 2.2011890411376953, + "learning_rate": 4.3971057899719114e-05, + "loss": 0.9704, + "step": 388500 + }, + { + "epoch": 6.030509474076259, + "grad_norm": 2.37614369392395, + "learning_rate": 4.396950604447617e-05, + "loss": 0.9536, + "step": 388600 + }, + { + "epoch": 6.032061329319201, + "grad_norm": 2.337226390838623, + "learning_rate": 4.396795418923323e-05, + "loss": 0.9571, + "step": 388700 + }, + { + "epoch": 6.033613184562144, + "grad_norm": 2.3683433532714844, + "learning_rate": 4.396640233399029e-05, + "loss": 0.9488, + "step": 388800 + }, + { + "epoch": 6.035165039805087, + "grad_norm": 1.9313045740127563, + "learning_rate": 4.3964850478747345e-05, + "loss": 0.9855, + "step": 388900 + }, + { + "epoch": 6.03671689504803, + "grad_norm": 2.032167911529541, + "learning_rate": 4.39632986235044e-05, + "loss": 0.9681, + "step": 389000 + }, + { + "epoch": 6.038268750290973, + "grad_norm": 2.1099116802215576, + "learning_rate": 4.396174676826146e-05, + "loss": 0.9495, + "step": 389100 + }, + { + "epoch": 6.039820605533916, + "grad_norm": 2.557539224624634, + "learning_rate": 4.396019491301852e-05, + "loss": 0.9694, + "step": 389200 + }, + { + "epoch": 6.041372460776858, + "grad_norm": 1.9979205131530762, + "learning_rate": 4.3958643057775576e-05, + "loss": 0.9522, + "step": 389300 + }, + { + "epoch": 6.042924316019802, + "grad_norm": 1.9996343851089478, + "learning_rate": 4.395709120253263e-05, + "loss": 0.96, + "step": 389400 + }, + { + "epoch": 6.044476171262745, + "grad_norm": 2.359251022338867, + "learning_rate": 4.3955539347289684e-05, + "loss": 0.9539, + "step": 389500 + }, + { + "epoch": 6.046028026505687, + "grad_norm": 2.0301082134246826, + "learning_rate": 4.395398749204674e-05, + "loss": 0.9822, + "step": 389600 + }, + { + "epoch": 6.04757988174863, + "grad_norm": 1.966744065284729, + "learning_rate": 4.39524356368038e-05, + "loss": 0.981, + "step": 389700 + }, + { + "epoch": 6.049131736991574, + "grad_norm": 2.130514144897461, + "learning_rate": 4.395088378156086e-05, + "loss": 0.9745, + "step": 389800 + }, + { + "epoch": 6.050683592234517, + "grad_norm": 2.057539701461792, + "learning_rate": 4.3949331926317916e-05, + "loss": 0.9573, + "step": 389900 + }, + { + "epoch": 6.052235447477459, + "grad_norm": 2.3234894275665283, + "learning_rate": 4.394778007107497e-05, + "loss": 0.945, + "step": 390000 + }, + { + "epoch": 6.053787302720402, + "grad_norm": 2.1992385387420654, + "learning_rate": 4.394622821583203e-05, + "loss": 0.9687, + "step": 390100 + }, + { + "epoch": 6.0553391579633455, + "grad_norm": 1.8857234716415405, + "learning_rate": 4.394467636058909e-05, + "loss": 0.9773, + "step": 390200 + }, + { + "epoch": 6.056891013206288, + "grad_norm": 2.4906578063964844, + "learning_rate": 4.3943124505346147e-05, + "loss": 0.9675, + "step": 390300 + }, + { + "epoch": 6.058442868449231, + "grad_norm": 2.2270302772521973, + "learning_rate": 4.3941572650103204e-05, + "loss": 0.9451, + "step": 390400 + }, + { + "epoch": 6.059994723692174, + "grad_norm": 2.8498144149780273, + "learning_rate": 4.394002079486026e-05, + "loss": 0.9737, + "step": 390500 + }, + { + "epoch": 6.061546578935117, + "grad_norm": 2.224904775619507, + "learning_rate": 4.393846893961732e-05, + "loss": 0.9733, + "step": 390600 + }, + { + "epoch": 6.06309843417806, + "grad_norm": 2.4438703060150146, + "learning_rate": 4.393691708437437e-05, + "loss": 0.9609, + "step": 390700 + }, + { + "epoch": 6.064650289421003, + "grad_norm": 2.195847988128662, + "learning_rate": 4.393536522913143e-05, + "loss": 0.9534, + "step": 390800 + }, + { + "epoch": 6.066202144663945, + "grad_norm": 2.295825242996216, + "learning_rate": 4.3933813373888486e-05, + "loss": 0.9491, + "step": 390900 + }, + { + "epoch": 6.067753999906889, + "grad_norm": 2.180386543273926, + "learning_rate": 4.3932261518645544e-05, + "loss": 0.9645, + "step": 391000 + }, + { + "epoch": 6.069305855149832, + "grad_norm": 2.033557415008545, + "learning_rate": 4.3930709663402595e-05, + "loss": 0.9631, + "step": 391100 + }, + { + "epoch": 6.070857710392775, + "grad_norm": 2.0717227458953857, + "learning_rate": 4.392915780815965e-05, + "loss": 0.9841, + "step": 391200 + }, + { + "epoch": 6.072409565635717, + "grad_norm": 2.347292184829712, + "learning_rate": 4.392760595291671e-05, + "loss": 0.9644, + "step": 391300 + }, + { + "epoch": 6.0739614208786605, + "grad_norm": 2.1732914447784424, + "learning_rate": 4.392605409767377e-05, + "loss": 0.9853, + "step": 391400 + }, + { + "epoch": 6.075513276121604, + "grad_norm": 2.3305435180664062, + "learning_rate": 4.3924502242430826e-05, + "loss": 0.9797, + "step": 391500 + }, + { + "epoch": 6.077065131364546, + "grad_norm": 2.158534049987793, + "learning_rate": 4.3922950387187884e-05, + "loss": 0.991, + "step": 391600 + }, + { + "epoch": 6.078616986607489, + "grad_norm": 2.0103869438171387, + "learning_rate": 4.392139853194494e-05, + "loss": 0.9674, + "step": 391700 + }, + { + "epoch": 6.0801688418504325, + "grad_norm": 2.5109951496124268, + "learning_rate": 4.3919846676702e-05, + "loss": 0.9547, + "step": 391800 + }, + { + "epoch": 6.081720697093375, + "grad_norm": 1.924285650253296, + "learning_rate": 4.391829482145906e-05, + "loss": 0.9509, + "step": 391900 + }, + { + "epoch": 6.083272552336318, + "grad_norm": 2.2401115894317627, + "learning_rate": 4.3916742966216115e-05, + "loss": 0.9695, + "step": 392000 + }, + { + "epoch": 6.084824407579261, + "grad_norm": 2.2518539428710938, + "learning_rate": 4.391519111097317e-05, + "loss": 0.973, + "step": 392100 + }, + { + "epoch": 6.086376262822204, + "grad_norm": 2.5874836444854736, + "learning_rate": 4.3913639255730223e-05, + "loss": 0.9701, + "step": 392200 + }, + { + "epoch": 6.087928118065147, + "grad_norm": 2.1396758556365967, + "learning_rate": 4.391208740048728e-05, + "loss": 0.9588, + "step": 392300 + }, + { + "epoch": 6.08947997330809, + "grad_norm": 2.413935661315918, + "learning_rate": 4.391053554524434e-05, + "loss": 0.9735, + "step": 392400 + }, + { + "epoch": 6.091031828551033, + "grad_norm": 2.398164987564087, + "learning_rate": 4.39089836900014e-05, + "loss": 0.9579, + "step": 392500 + }, + { + "epoch": 6.0925836837939755, + "grad_norm": 2.415902853012085, + "learning_rate": 4.3907431834758454e-05, + "loss": 0.968, + "step": 392600 + }, + { + "epoch": 6.094135539036919, + "grad_norm": 2.1242220401763916, + "learning_rate": 4.390587997951551e-05, + "loss": 0.9603, + "step": 392700 + }, + { + "epoch": 6.095687394279862, + "grad_norm": 1.976881504058838, + "learning_rate": 4.390432812427257e-05, + "loss": 0.962, + "step": 392800 + }, + { + "epoch": 6.097239249522804, + "grad_norm": 2.1509594917297363, + "learning_rate": 4.390277626902963e-05, + "loss": 0.9571, + "step": 392900 + }, + { + "epoch": 6.0987911047657475, + "grad_norm": 1.9936940670013428, + "learning_rate": 4.3901224413786686e-05, + "loss": 0.9584, + "step": 393000 + }, + { + "epoch": 6.100342960008691, + "grad_norm": 2.323820114135742, + "learning_rate": 4.389967255854374e-05, + "loss": 0.9719, + "step": 393100 + }, + { + "epoch": 6.101894815251633, + "grad_norm": 2.1440021991729736, + "learning_rate": 4.38981207033008e-05, + "loss": 0.9842, + "step": 393200 + }, + { + "epoch": 6.103446670494576, + "grad_norm": 2.216322422027588, + "learning_rate": 4.389656884805786e-05, + "loss": 0.9483, + "step": 393300 + }, + { + "epoch": 6.104998525737519, + "grad_norm": 1.8636776208877563, + "learning_rate": 4.3895016992814917e-05, + "loss": 0.9722, + "step": 393400 + }, + { + "epoch": 6.106550380980462, + "grad_norm": 2.2130370140075684, + "learning_rate": 4.389346513757197e-05, + "loss": 0.9786, + "step": 393500 + }, + { + "epoch": 6.108102236223405, + "grad_norm": 2.094633102416992, + "learning_rate": 4.3891913282329025e-05, + "loss": 0.9726, + "step": 393600 + }, + { + "epoch": 6.109654091466348, + "grad_norm": 2.696074962615967, + "learning_rate": 4.389036142708608e-05, + "loss": 0.9711, + "step": 393700 + }, + { + "epoch": 6.1112059467092905, + "grad_norm": 2.100592613220215, + "learning_rate": 4.388880957184314e-05, + "loss": 0.9471, + "step": 393800 + }, + { + "epoch": 6.112757801952234, + "grad_norm": 2.4279162883758545, + "learning_rate": 4.38872577166002e-05, + "loss": 0.9809, + "step": 393900 + }, + { + "epoch": 6.114309657195177, + "grad_norm": 1.997194528579712, + "learning_rate": 4.3885705861357256e-05, + "loss": 0.9669, + "step": 394000 + }, + { + "epoch": 6.11586151243812, + "grad_norm": 2.1823298931121826, + "learning_rate": 4.3884154006114314e-05, + "loss": 0.9617, + "step": 394100 + }, + { + "epoch": 6.1174133676810625, + "grad_norm": 2.295586347579956, + "learning_rate": 4.388260215087137e-05, + "loss": 0.9735, + "step": 394200 + }, + { + "epoch": 6.118965222924006, + "grad_norm": 2.3986918926239014, + "learning_rate": 4.388105029562843e-05, + "loss": 0.9708, + "step": 394300 + }, + { + "epoch": 6.120517078166949, + "grad_norm": 2.3040828704833984, + "learning_rate": 4.387949844038548e-05, + "loss": 0.9671, + "step": 394400 + }, + { + "epoch": 6.122068933409891, + "grad_norm": 1.8620610237121582, + "learning_rate": 4.387794658514254e-05, + "loss": 0.9661, + "step": 394500 + }, + { + "epoch": 6.123620788652834, + "grad_norm": 2.3233370780944824, + "learning_rate": 4.3876394729899596e-05, + "loss": 0.9647, + "step": 394600 + }, + { + "epoch": 6.125172643895778, + "grad_norm": 2.0119471549987793, + "learning_rate": 4.3874842874656654e-05, + "loss": 0.9744, + "step": 394700 + }, + { + "epoch": 6.12672449913872, + "grad_norm": 2.443408966064453, + "learning_rate": 4.387329101941371e-05, + "loss": 0.9617, + "step": 394800 + }, + { + "epoch": 6.128276354381663, + "grad_norm": 2.2621512413024902, + "learning_rate": 4.387173916417077e-05, + "loss": 0.9535, + "step": 394900 + }, + { + "epoch": 6.129828209624606, + "grad_norm": 2.188690185546875, + "learning_rate": 4.387018730892783e-05, + "loss": 0.966, + "step": 395000 + }, + { + "epoch": 6.13138006486755, + "grad_norm": 2.1281745433807373, + "learning_rate": 4.386863545368488e-05, + "loss": 0.9743, + "step": 395100 + }, + { + "epoch": 6.132931920110492, + "grad_norm": 2.5643653869628906, + "learning_rate": 4.3867083598441936e-05, + "loss": 0.9816, + "step": 395200 + }, + { + "epoch": 6.134483775353435, + "grad_norm": 2.4219088554382324, + "learning_rate": 4.3865531743198993e-05, + "loss": 0.9659, + "step": 395300 + }, + { + "epoch": 6.136035630596378, + "grad_norm": 2.4001505374908447, + "learning_rate": 4.386397988795605e-05, + "loss": 0.965, + "step": 395400 + }, + { + "epoch": 6.137587485839321, + "grad_norm": 2.4284939765930176, + "learning_rate": 4.386242803271311e-05, + "loss": 0.9867, + "step": 395500 + }, + { + "epoch": 6.139139341082264, + "grad_norm": 2.2324256896972656, + "learning_rate": 4.386087617747017e-05, + "loss": 0.9635, + "step": 395600 + }, + { + "epoch": 6.140691196325207, + "grad_norm": 2.049727201461792, + "learning_rate": 4.3859324322227224e-05, + "loss": 0.9631, + "step": 395700 + }, + { + "epoch": 6.142243051568149, + "grad_norm": 2.241694688796997, + "learning_rate": 4.385777246698428e-05, + "loss": 0.9973, + "step": 395800 + }, + { + "epoch": 6.143794906811093, + "grad_norm": 2.153416156768799, + "learning_rate": 4.385622061174134e-05, + "loss": 0.9601, + "step": 395900 + }, + { + "epoch": 6.145346762054036, + "grad_norm": 2.5247418880462646, + "learning_rate": 4.38546687564984e-05, + "loss": 0.9885, + "step": 396000 + }, + { + "epoch": 6.146898617296978, + "grad_norm": 2.599604368209839, + "learning_rate": 4.3853116901255456e-05, + "loss": 0.9429, + "step": 396100 + }, + { + "epoch": 6.148450472539921, + "grad_norm": 1.9845662117004395, + "learning_rate": 4.385156504601251e-05, + "loss": 0.9642, + "step": 396200 + }, + { + "epoch": 6.150002327782865, + "grad_norm": 2.245769500732422, + "learning_rate": 4.385001319076957e-05, + "loss": 0.9803, + "step": 396300 + }, + { + "epoch": 6.151554183025807, + "grad_norm": 2.1591033935546875, + "learning_rate": 4.384846133552662e-05, + "loss": 0.9838, + "step": 396400 + }, + { + "epoch": 6.15310603826875, + "grad_norm": 1.9319617748260498, + "learning_rate": 4.384690948028368e-05, + "loss": 0.9481, + "step": 396500 + }, + { + "epoch": 6.154657893511693, + "grad_norm": 2.1349427700042725, + "learning_rate": 4.384535762504074e-05, + "loss": 0.9625, + "step": 396600 + }, + { + "epoch": 6.156209748754637, + "grad_norm": 2.028501510620117, + "learning_rate": 4.3843805769797795e-05, + "loss": 0.9644, + "step": 396700 + }, + { + "epoch": 6.157761603997579, + "grad_norm": 2.086404323577881, + "learning_rate": 4.384225391455485e-05, + "loss": 0.9894, + "step": 396800 + }, + { + "epoch": 6.159313459240522, + "grad_norm": 2.086738348007202, + "learning_rate": 4.384070205931191e-05, + "loss": 0.969, + "step": 396900 + }, + { + "epoch": 6.160865314483465, + "grad_norm": 2.644864320755005, + "learning_rate": 4.383915020406897e-05, + "loss": 0.9802, + "step": 397000 + }, + { + "epoch": 6.162417169726408, + "grad_norm": 2.4811015129089355, + "learning_rate": 4.3837598348826026e-05, + "loss": 0.9568, + "step": 397100 + }, + { + "epoch": 6.163969024969351, + "grad_norm": 2.1142282485961914, + "learning_rate": 4.3836046493583084e-05, + "loss": 0.9766, + "step": 397200 + }, + { + "epoch": 6.165520880212294, + "grad_norm": 2.1053740978240967, + "learning_rate": 4.383449463834014e-05, + "loss": 0.9574, + "step": 397300 + }, + { + "epoch": 6.167072735455236, + "grad_norm": 2.153528928756714, + "learning_rate": 4.38329427830972e-05, + "loss": 0.954, + "step": 397400 + }, + { + "epoch": 6.16862459069818, + "grad_norm": 2.322955369949341, + "learning_rate": 4.383139092785426e-05, + "loss": 0.9665, + "step": 397500 + }, + { + "epoch": 6.170176445941123, + "grad_norm": 2.08217716217041, + "learning_rate": 4.382983907261131e-05, + "loss": 0.98, + "step": 397600 + }, + { + "epoch": 6.171728301184065, + "grad_norm": 2.3591394424438477, + "learning_rate": 4.3828287217368366e-05, + "loss": 0.9548, + "step": 397700 + }, + { + "epoch": 6.173280156427008, + "grad_norm": 2.25370192527771, + "learning_rate": 4.3826735362125424e-05, + "loss": 0.9598, + "step": 397800 + }, + { + "epoch": 6.1748320116699515, + "grad_norm": 1.865186095237732, + "learning_rate": 4.3825183506882475e-05, + "loss": 0.9732, + "step": 397900 + }, + { + "epoch": 6.176383866912895, + "grad_norm": 2.2095091342926025, + "learning_rate": 4.382363165163953e-05, + "loss": 0.9627, + "step": 398000 + }, + { + "epoch": 6.177935722155837, + "grad_norm": 1.9672092199325562, + "learning_rate": 4.382207979639659e-05, + "loss": 0.9669, + "step": 398100 + }, + { + "epoch": 6.17948757739878, + "grad_norm": 2.8939859867095947, + "learning_rate": 4.382052794115365e-05, + "loss": 0.9858, + "step": 398200 + }, + { + "epoch": 6.1810394326417235, + "grad_norm": 2.215050458908081, + "learning_rate": 4.3818976085910706e-05, + "loss": 0.9568, + "step": 398300 + }, + { + "epoch": 6.182591287884666, + "grad_norm": 2.344846725463867, + "learning_rate": 4.3817424230667763e-05, + "loss": 0.9555, + "step": 398400 + }, + { + "epoch": 6.184143143127609, + "grad_norm": 1.8797385692596436, + "learning_rate": 4.381587237542482e-05, + "loss": 0.984, + "step": 398500 + }, + { + "epoch": 6.185694998370552, + "grad_norm": 2.302672863006592, + "learning_rate": 4.381432052018188e-05, + "loss": 0.9754, + "step": 398600 + }, + { + "epoch": 6.187246853613495, + "grad_norm": 2.176729202270508, + "learning_rate": 4.381276866493894e-05, + "loss": 0.9676, + "step": 398700 + }, + { + "epoch": 6.188798708856438, + "grad_norm": 2.451247215270996, + "learning_rate": 4.3811216809695994e-05, + "loss": 0.9502, + "step": 398800 + }, + { + "epoch": 6.190350564099381, + "grad_norm": 2.0738155841827393, + "learning_rate": 4.380966495445305e-05, + "loss": 0.9857, + "step": 398900 + }, + { + "epoch": 6.191902419342323, + "grad_norm": 2.0178303718566895, + "learning_rate": 4.380811309921011e-05, + "loss": 0.9761, + "step": 399000 + }, + { + "epoch": 6.1934542745852665, + "grad_norm": 2.6101417541503906, + "learning_rate": 4.380656124396717e-05, + "loss": 0.9568, + "step": 399100 + }, + { + "epoch": 6.19500612982821, + "grad_norm": 2.6814277172088623, + "learning_rate": 4.380500938872422e-05, + "loss": 0.9716, + "step": 399200 + }, + { + "epoch": 6.196557985071153, + "grad_norm": 1.9939662218093872, + "learning_rate": 4.3803457533481276e-05, + "loss": 0.9692, + "step": 399300 + }, + { + "epoch": 6.198109840314095, + "grad_norm": 2.08327054977417, + "learning_rate": 4.3801905678238334e-05, + "loss": 0.9518, + "step": 399400 + }, + { + "epoch": 6.1996616955570385, + "grad_norm": 2.3259949684143066, + "learning_rate": 4.380035382299539e-05, + "loss": 0.9645, + "step": 399500 + }, + { + "epoch": 6.201213550799982, + "grad_norm": 2.0609633922576904, + "learning_rate": 4.379880196775245e-05, + "loss": 0.9632, + "step": 399600 + }, + { + "epoch": 6.202765406042924, + "grad_norm": 2.267864465713501, + "learning_rate": 4.379725011250951e-05, + "loss": 0.9697, + "step": 399700 + }, + { + "epoch": 6.204317261285867, + "grad_norm": 2.342073440551758, + "learning_rate": 4.3795698257266565e-05, + "loss": 0.9816, + "step": 399800 + }, + { + "epoch": 6.2058691165288105, + "grad_norm": 1.8497778177261353, + "learning_rate": 4.379414640202362e-05, + "loss": 0.9784, + "step": 399900 + }, + { + "epoch": 6.207420971771753, + "grad_norm": 2.0065464973449707, + "learning_rate": 4.379259454678068e-05, + "loss": 0.9537, + "step": 400000 + }, + { + "epoch": 6.208972827014696, + "grad_norm": 1.79105806350708, + "learning_rate": 4.379104269153774e-05, + "loss": 0.9547, + "step": 400100 + }, + { + "epoch": 6.210524682257639, + "grad_norm": 2.1997838020324707, + "learning_rate": 4.3789490836294796e-05, + "loss": 0.9706, + "step": 400200 + }, + { + "epoch": 6.2120765375005815, + "grad_norm": 3.2123444080352783, + "learning_rate": 4.3787938981051854e-05, + "loss": 0.9797, + "step": 400300 + }, + { + "epoch": 6.213628392743525, + "grad_norm": 2.3421761989593506, + "learning_rate": 4.378638712580891e-05, + "loss": 0.944, + "step": 400400 + }, + { + "epoch": 6.215180247986468, + "grad_norm": 2.1467018127441406, + "learning_rate": 4.378483527056596e-05, + "loss": 0.965, + "step": 400500 + }, + { + "epoch": 6.216732103229411, + "grad_norm": 2.2525482177734375, + "learning_rate": 4.378328341532302e-05, + "loss": 0.9683, + "step": 400600 + }, + { + "epoch": 6.2182839584723535, + "grad_norm": 2.026404857635498, + "learning_rate": 4.378173156008008e-05, + "loss": 0.9648, + "step": 400700 + }, + { + "epoch": 6.219835813715297, + "grad_norm": 2.476609468460083, + "learning_rate": 4.3780179704837136e-05, + "loss": 0.9824, + "step": 400800 + }, + { + "epoch": 6.22138766895824, + "grad_norm": 3.2324390411376953, + "learning_rate": 4.377862784959419e-05, + "loss": 0.9878, + "step": 400900 + }, + { + "epoch": 6.222939524201182, + "grad_norm": 2.266871213912964, + "learning_rate": 4.3777075994351245e-05, + "loss": 0.9638, + "step": 401000 + }, + { + "epoch": 6.2244913794441254, + "grad_norm": 2.1155683994293213, + "learning_rate": 4.37755241391083e-05, + "loss": 0.9689, + "step": 401100 + }, + { + "epoch": 6.226043234687069, + "grad_norm": 2.209477663040161, + "learning_rate": 4.377397228386536e-05, + "loss": 0.9707, + "step": 401200 + }, + { + "epoch": 6.227595089930011, + "grad_norm": 2.6555047035217285, + "learning_rate": 4.377242042862242e-05, + "loss": 0.9812, + "step": 401300 + }, + { + "epoch": 6.229146945172954, + "grad_norm": 2.175626039505005, + "learning_rate": 4.3770868573379476e-05, + "loss": 0.9736, + "step": 401400 + }, + { + "epoch": 6.230698800415897, + "grad_norm": 2.1035797595977783, + "learning_rate": 4.3769316718136533e-05, + "loss": 0.9798, + "step": 401500 + }, + { + "epoch": 6.23225065565884, + "grad_norm": 2.43630313873291, + "learning_rate": 4.376776486289359e-05, + "loss": 0.9693, + "step": 401600 + }, + { + "epoch": 6.233802510901783, + "grad_norm": 2.7505812644958496, + "learning_rate": 4.376621300765065e-05, + "loss": 0.982, + "step": 401700 + }, + { + "epoch": 6.235354366144726, + "grad_norm": 2.1097216606140137, + "learning_rate": 4.376466115240771e-05, + "loss": 0.9735, + "step": 401800 + }, + { + "epoch": 6.236906221387669, + "grad_norm": 2.1382088661193848, + "learning_rate": 4.3763109297164764e-05, + "loss": 0.9906, + "step": 401900 + }, + { + "epoch": 6.238458076630612, + "grad_norm": 2.587602376937866, + "learning_rate": 4.3761557441921815e-05, + "loss": 0.9753, + "step": 402000 + }, + { + "epoch": 6.240009931873555, + "grad_norm": 2.404689073562622, + "learning_rate": 4.376000558667887e-05, + "loss": 0.9768, + "step": 402100 + }, + { + "epoch": 6.241561787116498, + "grad_norm": 2.5374016761779785, + "learning_rate": 4.375845373143593e-05, + "loss": 0.9573, + "step": 402200 + }, + { + "epoch": 6.24311364235944, + "grad_norm": 2.259457588195801, + "learning_rate": 4.375690187619299e-05, + "loss": 0.9714, + "step": 402300 + }, + { + "epoch": 6.244665497602384, + "grad_norm": 2.333061695098877, + "learning_rate": 4.3755350020950046e-05, + "loss": 0.9598, + "step": 402400 + }, + { + "epoch": 6.246217352845327, + "grad_norm": 1.965182900428772, + "learning_rate": 4.3753798165707104e-05, + "loss": 0.9852, + "step": 402500 + }, + { + "epoch": 6.247769208088269, + "grad_norm": 2.0629656314849854, + "learning_rate": 4.375224631046416e-05, + "loss": 0.9785, + "step": 402600 + }, + { + "epoch": 6.249321063331212, + "grad_norm": 2.192657232284546, + "learning_rate": 4.375069445522122e-05, + "loss": 0.9797, + "step": 402700 + }, + { + "epoch": 6.250872918574156, + "grad_norm": 2.3315765857696533, + "learning_rate": 4.374914259997828e-05, + "loss": 0.975, + "step": 402800 + }, + { + "epoch": 6.252424773817098, + "grad_norm": 2.2813796997070312, + "learning_rate": 4.3747590744735335e-05, + "loss": 0.9629, + "step": 402900 + }, + { + "epoch": 6.253976629060041, + "grad_norm": 2.268996000289917, + "learning_rate": 4.374603888949239e-05, + "loss": 0.9624, + "step": 403000 + }, + { + "epoch": 6.255528484302984, + "grad_norm": 2.316983461380005, + "learning_rate": 4.374448703424945e-05, + "loss": 0.9691, + "step": 403100 + }, + { + "epoch": 6.257080339545928, + "grad_norm": 2.075517416000366, + "learning_rate": 4.374293517900651e-05, + "loss": 0.9547, + "step": 403200 + }, + { + "epoch": 6.25863219478887, + "grad_norm": 2.1352179050445557, + "learning_rate": 4.374138332376356e-05, + "loss": 0.9812, + "step": 403300 + }, + { + "epoch": 6.260184050031813, + "grad_norm": 2.2294087409973145, + "learning_rate": 4.373983146852062e-05, + "loss": 0.9477, + "step": 403400 + }, + { + "epoch": 6.261735905274756, + "grad_norm": 1.9272500276565552, + "learning_rate": 4.3738279613277675e-05, + "loss": 0.9628, + "step": 403500 + }, + { + "epoch": 6.263287760517699, + "grad_norm": 2.7708661556243896, + "learning_rate": 4.373672775803473e-05, + "loss": 0.9695, + "step": 403600 + }, + { + "epoch": 6.264839615760642, + "grad_norm": 2.3564934730529785, + "learning_rate": 4.373517590279179e-05, + "loss": 0.9656, + "step": 403700 + }, + { + "epoch": 6.266391471003585, + "grad_norm": 1.877121090888977, + "learning_rate": 4.373362404754885e-05, + "loss": 0.9798, + "step": 403800 + }, + { + "epoch": 6.267943326246527, + "grad_norm": 1.9194297790527344, + "learning_rate": 4.3732072192305906e-05, + "loss": 0.9925, + "step": 403900 + }, + { + "epoch": 6.269495181489471, + "grad_norm": 2.463531255722046, + "learning_rate": 4.3730520337062964e-05, + "loss": 0.9706, + "step": 404000 + }, + { + "epoch": 6.271047036732414, + "grad_norm": 2.00711989402771, + "learning_rate": 4.3728968481820015e-05, + "loss": 0.9541, + "step": 404100 + }, + { + "epoch": 6.272598891975356, + "grad_norm": 2.0191845893859863, + "learning_rate": 4.372741662657707e-05, + "loss": 0.9604, + "step": 404200 + }, + { + "epoch": 6.274150747218299, + "grad_norm": 1.917065978050232, + "learning_rate": 4.372586477133413e-05, + "loss": 0.9763, + "step": 404300 + }, + { + "epoch": 6.275702602461243, + "grad_norm": 2.2994229793548584, + "learning_rate": 4.372431291609119e-05, + "loss": 0.9739, + "step": 404400 + }, + { + "epoch": 6.277254457704185, + "grad_norm": 2.306817054748535, + "learning_rate": 4.3722761060848246e-05, + "loss": 0.9642, + "step": 404500 + }, + { + "epoch": 6.278806312947128, + "grad_norm": 2.211359977722168, + "learning_rate": 4.3721209205605303e-05, + "loss": 0.9832, + "step": 404600 + }, + { + "epoch": 6.280358168190071, + "grad_norm": 2.2913103103637695, + "learning_rate": 4.371965735036236e-05, + "loss": 0.9585, + "step": 404700 + }, + { + "epoch": 6.2819100234330145, + "grad_norm": 2.4069390296936035, + "learning_rate": 4.371810549511942e-05, + "loss": 0.9838, + "step": 404800 + }, + { + "epoch": 6.283461878675957, + "grad_norm": 1.8786569833755493, + "learning_rate": 4.371655363987647e-05, + "loss": 0.9647, + "step": 404900 + }, + { + "epoch": 6.2850137339189, + "grad_norm": 2.4478113651275635, + "learning_rate": 4.371500178463353e-05, + "loss": 0.9512, + "step": 405000 + }, + { + "epoch": 6.286565589161843, + "grad_norm": 2.4815709590911865, + "learning_rate": 4.3713449929390585e-05, + "loss": 0.9589, + "step": 405100 + }, + { + "epoch": 6.288117444404786, + "grad_norm": 1.9261494874954224, + "learning_rate": 4.371189807414764e-05, + "loss": 0.9694, + "step": 405200 + }, + { + "epoch": 6.289669299647729, + "grad_norm": 1.9647706747055054, + "learning_rate": 4.37103462189047e-05, + "loss": 0.9729, + "step": 405300 + }, + { + "epoch": 6.291221154890672, + "grad_norm": 2.7827138900756836, + "learning_rate": 4.370879436366176e-05, + "loss": 0.9714, + "step": 405400 + }, + { + "epoch": 6.292773010133614, + "grad_norm": 2.246864080429077, + "learning_rate": 4.3707242508418816e-05, + "loss": 0.978, + "step": 405500 + }, + { + "epoch": 6.2943248653765576, + "grad_norm": 1.8927825689315796, + "learning_rate": 4.3705690653175874e-05, + "loss": 0.965, + "step": 405600 + }, + { + "epoch": 6.295876720619501, + "grad_norm": 2.4421863555908203, + "learning_rate": 4.370413879793293e-05, + "loss": 0.9768, + "step": 405700 + }, + { + "epoch": 6.297428575862444, + "grad_norm": 2.244401216506958, + "learning_rate": 4.370258694268999e-05, + "loss": 0.9602, + "step": 405800 + }, + { + "epoch": 6.298980431105386, + "grad_norm": 2.523974895477295, + "learning_rate": 4.370103508744705e-05, + "loss": 0.9771, + "step": 405900 + }, + { + "epoch": 6.3005322863483295, + "grad_norm": 2.228769063949585, + "learning_rate": 4.3699483232204105e-05, + "loss": 0.9668, + "step": 406000 + }, + { + "epoch": 6.302084141591273, + "grad_norm": 2.35884690284729, + "learning_rate": 4.369793137696116e-05, + "loss": 0.9701, + "step": 406100 + }, + { + "epoch": 6.303635996834215, + "grad_norm": 2.2261874675750732, + "learning_rate": 4.3696379521718214e-05, + "loss": 0.9796, + "step": 406200 + }, + { + "epoch": 6.305187852077158, + "grad_norm": 2.264378070831299, + "learning_rate": 4.369482766647527e-05, + "loss": 0.9812, + "step": 406300 + }, + { + "epoch": 6.3067397073201015, + "grad_norm": 2.2602195739746094, + "learning_rate": 4.369327581123233e-05, + "loss": 0.9765, + "step": 406400 + }, + { + "epoch": 6.308291562563044, + "grad_norm": 2.4524857997894287, + "learning_rate": 4.369172395598939e-05, + "loss": 0.9625, + "step": 406500 + }, + { + "epoch": 6.309843417805987, + "grad_norm": 2.772489070892334, + "learning_rate": 4.3690172100746445e-05, + "loss": 0.959, + "step": 406600 + }, + { + "epoch": 6.31139527304893, + "grad_norm": 2.6323797702789307, + "learning_rate": 4.36886202455035e-05, + "loss": 0.9792, + "step": 406700 + }, + { + "epoch": 6.3129471282918725, + "grad_norm": 1.8720982074737549, + "learning_rate": 4.368706839026056e-05, + "loss": 0.9622, + "step": 406800 + }, + { + "epoch": 6.314498983534816, + "grad_norm": 1.8062094449996948, + "learning_rate": 4.368551653501762e-05, + "loss": 0.9811, + "step": 406900 + }, + { + "epoch": 6.316050838777759, + "grad_norm": 1.9324839115142822, + "learning_rate": 4.3683964679774676e-05, + "loss": 0.9562, + "step": 407000 + }, + { + "epoch": 6.317602694020701, + "grad_norm": 2.0377235412597656, + "learning_rate": 4.3682412824531734e-05, + "loss": 0.96, + "step": 407100 + }, + { + "epoch": 6.3191545492636445, + "grad_norm": 2.7106664180755615, + "learning_rate": 4.368086096928879e-05, + "loss": 0.9773, + "step": 407200 + }, + { + "epoch": 6.320706404506588, + "grad_norm": 3.068756341934204, + "learning_rate": 4.367930911404585e-05, + "loss": 0.9746, + "step": 407300 + }, + { + "epoch": 6.322258259749531, + "grad_norm": 2.437727928161621, + "learning_rate": 4.36777572588029e-05, + "loss": 0.9823, + "step": 407400 + }, + { + "epoch": 6.323810114992473, + "grad_norm": 2.0479278564453125, + "learning_rate": 4.367620540355996e-05, + "loss": 0.979, + "step": 407500 + }, + { + "epoch": 6.3253619702354165, + "grad_norm": 1.9924557209014893, + "learning_rate": 4.3674653548317016e-05, + "loss": 0.9723, + "step": 407600 + }, + { + "epoch": 6.32691382547836, + "grad_norm": 2.3464393615722656, + "learning_rate": 4.367310169307407e-05, + "loss": 0.9752, + "step": 407700 + }, + { + "epoch": 6.328465680721302, + "grad_norm": 2.749913454055786, + "learning_rate": 4.3671549837831124e-05, + "loss": 0.9754, + "step": 407800 + }, + { + "epoch": 6.330017535964245, + "grad_norm": 2.1107990741729736, + "learning_rate": 4.366999798258818e-05, + "loss": 0.9559, + "step": 407900 + }, + { + "epoch": 6.331569391207188, + "grad_norm": 2.4505043029785156, + "learning_rate": 4.366844612734524e-05, + "loss": 0.9591, + "step": 408000 + }, + { + "epoch": 6.333121246450131, + "grad_norm": 2.0991528034210205, + "learning_rate": 4.36668942721023e-05, + "loss": 0.9817, + "step": 408100 + }, + { + "epoch": 6.334673101693074, + "grad_norm": 2.3653876781463623, + "learning_rate": 4.3665342416859355e-05, + "loss": 0.9517, + "step": 408200 + }, + { + "epoch": 6.336224956936017, + "grad_norm": 2.2427282333374023, + "learning_rate": 4.366379056161641e-05, + "loss": 0.9762, + "step": 408300 + }, + { + "epoch": 6.33777681217896, + "grad_norm": 2.1615517139434814, + "learning_rate": 4.366223870637347e-05, + "loss": 0.9494, + "step": 408400 + }, + { + "epoch": 6.339328667421903, + "grad_norm": 2.4494051933288574, + "learning_rate": 4.366068685113053e-05, + "loss": 0.9653, + "step": 408500 + }, + { + "epoch": 6.340880522664846, + "grad_norm": 1.9103201627731323, + "learning_rate": 4.3659134995887586e-05, + "loss": 0.9591, + "step": 408600 + }, + { + "epoch": 6.342432377907789, + "grad_norm": 2.474471092224121, + "learning_rate": 4.3657583140644644e-05, + "loss": 0.9669, + "step": 408700 + }, + { + "epoch": 6.3439842331507315, + "grad_norm": 1.961702823638916, + "learning_rate": 4.36560312854017e-05, + "loss": 0.9592, + "step": 408800 + }, + { + "epoch": 6.345536088393675, + "grad_norm": 2.607668876647949, + "learning_rate": 4.365447943015876e-05, + "loss": 0.9961, + "step": 408900 + }, + { + "epoch": 6.347087943636618, + "grad_norm": 1.994560956954956, + "learning_rate": 4.365292757491581e-05, + "loss": 0.9599, + "step": 409000 + }, + { + "epoch": 6.34863979887956, + "grad_norm": 2.195416212081909, + "learning_rate": 4.365137571967287e-05, + "loss": 0.9673, + "step": 409100 + }, + { + "epoch": 6.350191654122503, + "grad_norm": 2.1197633743286133, + "learning_rate": 4.3649823864429926e-05, + "loss": 0.9782, + "step": 409200 + }, + { + "epoch": 6.351743509365447, + "grad_norm": 2.4562883377075195, + "learning_rate": 4.3648272009186984e-05, + "loss": 0.9688, + "step": 409300 + }, + { + "epoch": 6.353295364608389, + "grad_norm": 2.2004032135009766, + "learning_rate": 4.364672015394404e-05, + "loss": 0.9737, + "step": 409400 + }, + { + "epoch": 6.354847219851332, + "grad_norm": 2.0319318771362305, + "learning_rate": 4.36451682987011e-05, + "loss": 0.9703, + "step": 409500 + }, + { + "epoch": 6.356399075094275, + "grad_norm": 2.3951470851898193, + "learning_rate": 4.364361644345816e-05, + "loss": 0.9604, + "step": 409600 + }, + { + "epoch": 6.357950930337218, + "grad_norm": 2.9845194816589355, + "learning_rate": 4.3642064588215215e-05, + "loss": 0.9767, + "step": 409700 + }, + { + "epoch": 6.359502785580161, + "grad_norm": 2.3163154125213623, + "learning_rate": 4.364051273297227e-05, + "loss": 0.9768, + "step": 409800 + }, + { + "epoch": 6.361054640823104, + "grad_norm": 2.6817493438720703, + "learning_rate": 4.363896087772933e-05, + "loss": 0.9551, + "step": 409900 + }, + { + "epoch": 6.362606496066047, + "grad_norm": 2.1725223064422607, + "learning_rate": 4.363740902248639e-05, + "loss": 0.9699, + "step": 410000 + }, + { + "epoch": 6.36415835130899, + "grad_norm": 2.197974920272827, + "learning_rate": 4.3635857167243446e-05, + "loss": 0.9663, + "step": 410100 + }, + { + "epoch": 6.365710206551933, + "grad_norm": 2.164172410964966, + "learning_rate": 4.3634305312000504e-05, + "loss": 0.9615, + "step": 410200 + }, + { + "epoch": 6.367262061794876, + "grad_norm": 2.543231964111328, + "learning_rate": 4.3632753456757555e-05, + "loss": 0.9589, + "step": 410300 + }, + { + "epoch": 6.368813917037818, + "grad_norm": 2.252706289291382, + "learning_rate": 4.363120160151461e-05, + "loss": 0.9395, + "step": 410400 + }, + { + "epoch": 6.370365772280762, + "grad_norm": 2.0108518600463867, + "learning_rate": 4.362964974627167e-05, + "loss": 0.9963, + "step": 410500 + }, + { + "epoch": 6.371917627523705, + "grad_norm": 2.201422929763794, + "learning_rate": 4.362809789102873e-05, + "loss": 0.9398, + "step": 410600 + }, + { + "epoch": 6.373469482766647, + "grad_norm": 2.2797775268554688, + "learning_rate": 4.362654603578578e-05, + "loss": 0.9753, + "step": 410700 + }, + { + "epoch": 6.37502133800959, + "grad_norm": 2.103529214859009, + "learning_rate": 4.362499418054284e-05, + "loss": 0.9501, + "step": 410800 + }, + { + "epoch": 6.376573193252534, + "grad_norm": 2.6470530033111572, + "learning_rate": 4.3623442325299894e-05, + "loss": 0.9791, + "step": 410900 + }, + { + "epoch": 6.378125048495477, + "grad_norm": 2.4140331745147705, + "learning_rate": 4.362189047005695e-05, + "loss": 0.9911, + "step": 411000 + }, + { + "epoch": 6.379676903738419, + "grad_norm": 2.3171579837799072, + "learning_rate": 4.362033861481401e-05, + "loss": 0.9565, + "step": 411100 + }, + { + "epoch": 6.381228758981362, + "grad_norm": 2.5742545127868652, + "learning_rate": 4.361878675957107e-05, + "loss": 0.9718, + "step": 411200 + }, + { + "epoch": 6.3827806142243055, + "grad_norm": 2.818953037261963, + "learning_rate": 4.3617234904328125e-05, + "loss": 0.9776, + "step": 411300 + }, + { + "epoch": 6.384332469467248, + "grad_norm": 2.123659133911133, + "learning_rate": 4.361568304908518e-05, + "loss": 0.9656, + "step": 411400 + }, + { + "epoch": 6.385884324710191, + "grad_norm": 2.3287277221679688, + "learning_rate": 4.361413119384224e-05, + "loss": 0.9774, + "step": 411500 + }, + { + "epoch": 6.387436179953134, + "grad_norm": 2.071604013442993, + "learning_rate": 4.36125793385993e-05, + "loss": 0.9758, + "step": 411600 + }, + { + "epoch": 6.388988035196077, + "grad_norm": 2.004335880279541, + "learning_rate": 4.3611027483356356e-05, + "loss": 0.9845, + "step": 411700 + }, + { + "epoch": 6.39053989043902, + "grad_norm": 2.0574488639831543, + "learning_rate": 4.3609475628113414e-05, + "loss": 0.952, + "step": 411800 + }, + { + "epoch": 6.392091745681963, + "grad_norm": 2.0608198642730713, + "learning_rate": 4.3607923772870465e-05, + "loss": 0.9804, + "step": 411900 + }, + { + "epoch": 6.393643600924905, + "grad_norm": 1.768845796585083, + "learning_rate": 4.360637191762752e-05, + "loss": 0.9759, + "step": 412000 + }, + { + "epoch": 6.395195456167849, + "grad_norm": 2.503519058227539, + "learning_rate": 4.360482006238458e-05, + "loss": 0.9682, + "step": 412100 + }, + { + "epoch": 6.396747311410792, + "grad_norm": 2.4141995906829834, + "learning_rate": 4.360326820714164e-05, + "loss": 0.9832, + "step": 412200 + }, + { + "epoch": 6.398299166653734, + "grad_norm": 2.4085330963134766, + "learning_rate": 4.3601716351898696e-05, + "loss": 0.9728, + "step": 412300 + }, + { + "epoch": 6.399851021896677, + "grad_norm": 2.5037903785705566, + "learning_rate": 4.3600164496655754e-05, + "loss": 0.977, + "step": 412400 + }, + { + "epoch": 6.4014028771396205, + "grad_norm": 2.4910197257995605, + "learning_rate": 4.359861264141281e-05, + "loss": 0.9379, + "step": 412500 + }, + { + "epoch": 6.402954732382564, + "grad_norm": 2.4563679695129395, + "learning_rate": 4.359706078616987e-05, + "loss": 0.9758, + "step": 412600 + }, + { + "epoch": 6.404506587625506, + "grad_norm": 2.0692555904388428, + "learning_rate": 4.359550893092693e-05, + "loss": 0.9809, + "step": 412700 + }, + { + "epoch": 6.406058442868449, + "grad_norm": 2.471853494644165, + "learning_rate": 4.3593957075683985e-05, + "loss": 0.9795, + "step": 412800 + }, + { + "epoch": 6.4076102981113925, + "grad_norm": 2.385113000869751, + "learning_rate": 4.359240522044104e-05, + "loss": 0.9644, + "step": 412900 + }, + { + "epoch": 6.409162153354335, + "grad_norm": 2.337942123413086, + "learning_rate": 4.35908533651981e-05, + "loss": 0.9622, + "step": 413000 + }, + { + "epoch": 6.410714008597278, + "grad_norm": 2.487560749053955, + "learning_rate": 4.358930150995516e-05, + "loss": 0.9557, + "step": 413100 + }, + { + "epoch": 6.412265863840221, + "grad_norm": 2.2363314628601074, + "learning_rate": 4.358774965471221e-05, + "loss": 0.9741, + "step": 413200 + }, + { + "epoch": 6.413817719083164, + "grad_norm": 2.336608409881592, + "learning_rate": 4.358619779946927e-05, + "loss": 0.9842, + "step": 413300 + }, + { + "epoch": 6.415369574326107, + "grad_norm": 2.480377197265625, + "learning_rate": 4.3584645944226325e-05, + "loss": 0.9542, + "step": 413400 + }, + { + "epoch": 6.41692142956905, + "grad_norm": 2.4966518878936768, + "learning_rate": 4.358309408898338e-05, + "loss": 0.992, + "step": 413500 + }, + { + "epoch": 6.418473284811993, + "grad_norm": 2.2411019802093506, + "learning_rate": 4.358154223374044e-05, + "loss": 0.9564, + "step": 413600 + }, + { + "epoch": 6.4200251400549355, + "grad_norm": 2.3539044857025146, + "learning_rate": 4.35799903784975e-05, + "loss": 0.9619, + "step": 413700 + }, + { + "epoch": 6.421576995297879, + "grad_norm": 2.5996665954589844, + "learning_rate": 4.3578438523254556e-05, + "loss": 0.9857, + "step": 413800 + }, + { + "epoch": 6.423128850540822, + "grad_norm": 2.0330357551574707, + "learning_rate": 4.357688666801161e-05, + "loss": 0.9635, + "step": 413900 + }, + { + "epoch": 6.424680705783764, + "grad_norm": 2.0502448081970215, + "learning_rate": 4.3575334812768664e-05, + "loss": 0.9655, + "step": 414000 + }, + { + "epoch": 6.4262325610267075, + "grad_norm": 2.112494707107544, + "learning_rate": 4.357378295752572e-05, + "loss": 0.9791, + "step": 414100 + }, + { + "epoch": 6.427784416269651, + "grad_norm": 2.4739649295806885, + "learning_rate": 4.357223110228278e-05, + "loss": 0.9709, + "step": 414200 + }, + { + "epoch": 6.429336271512593, + "grad_norm": 2.0352835655212402, + "learning_rate": 4.357067924703984e-05, + "loss": 0.9496, + "step": 414300 + }, + { + "epoch": 6.430888126755536, + "grad_norm": 2.2815184593200684, + "learning_rate": 4.3569127391796895e-05, + "loss": 0.9564, + "step": 414400 + }, + { + "epoch": 6.432439981998479, + "grad_norm": 2.576735734939575, + "learning_rate": 4.356757553655395e-05, + "loss": 0.9646, + "step": 414500 + }, + { + "epoch": 6.433991837241422, + "grad_norm": 2.3162832260131836, + "learning_rate": 4.356602368131101e-05, + "loss": 0.9592, + "step": 414600 + }, + { + "epoch": 6.435543692484365, + "grad_norm": 2.7075114250183105, + "learning_rate": 4.356447182606806e-05, + "loss": 0.977, + "step": 414700 + }, + { + "epoch": 6.437095547727308, + "grad_norm": 2.3292689323425293, + "learning_rate": 4.356291997082512e-05, + "loss": 0.9722, + "step": 414800 + }, + { + "epoch": 6.4386474029702505, + "grad_norm": 2.404096841812134, + "learning_rate": 4.356136811558218e-05, + "loss": 0.9467, + "step": 414900 + }, + { + "epoch": 6.440199258213194, + "grad_norm": 2.202815055847168, + "learning_rate": 4.3559816260339235e-05, + "loss": 0.9692, + "step": 415000 + }, + { + "epoch": 6.441751113456137, + "grad_norm": 2.2449398040771484, + "learning_rate": 4.355826440509629e-05, + "loss": 0.9729, + "step": 415100 + }, + { + "epoch": 6.44330296869908, + "grad_norm": 2.9412636756896973, + "learning_rate": 4.355671254985335e-05, + "loss": 0.9859, + "step": 415200 + }, + { + "epoch": 6.4448548239420225, + "grad_norm": 2.2181241512298584, + "learning_rate": 4.355516069461041e-05, + "loss": 0.974, + "step": 415300 + }, + { + "epoch": 6.446406679184966, + "grad_norm": 2.460430145263672, + "learning_rate": 4.3553608839367466e-05, + "loss": 0.9481, + "step": 415400 + }, + { + "epoch": 6.447958534427909, + "grad_norm": 2.7513928413391113, + "learning_rate": 4.3552056984124524e-05, + "loss": 0.9648, + "step": 415500 + }, + { + "epoch": 6.449510389670851, + "grad_norm": 2.3180179595947266, + "learning_rate": 4.355050512888158e-05, + "loss": 0.979, + "step": 415600 + }, + { + "epoch": 6.451062244913794, + "grad_norm": 2.396193027496338, + "learning_rate": 4.354895327363864e-05, + "loss": 0.9736, + "step": 415700 + }, + { + "epoch": 6.452614100156738, + "grad_norm": 2.209988832473755, + "learning_rate": 4.35474014183957e-05, + "loss": 0.9683, + "step": 415800 + }, + { + "epoch": 6.45416595539968, + "grad_norm": 2.1172661781311035, + "learning_rate": 4.3545849563152755e-05, + "loss": 0.9637, + "step": 415900 + }, + { + "epoch": 6.455717810642623, + "grad_norm": 2.075331687927246, + "learning_rate": 4.3544297707909806e-05, + "loss": 0.9823, + "step": 416000 + }, + { + "epoch": 6.457269665885566, + "grad_norm": 2.2483606338500977, + "learning_rate": 4.3542745852666864e-05, + "loss": 0.9736, + "step": 416100 + }, + { + "epoch": 6.45882152112851, + "grad_norm": 2.4009900093078613, + "learning_rate": 4.354119399742392e-05, + "loss": 0.9689, + "step": 416200 + }, + { + "epoch": 6.460373376371452, + "grad_norm": 2.197854518890381, + "learning_rate": 4.353964214218098e-05, + "loss": 0.9753, + "step": 416300 + }, + { + "epoch": 6.461925231614395, + "grad_norm": 2.52730655670166, + "learning_rate": 4.353809028693804e-05, + "loss": 0.955, + "step": 416400 + }, + { + "epoch": 6.463477086857338, + "grad_norm": 2.1753857135772705, + "learning_rate": 4.3536538431695095e-05, + "loss": 0.9755, + "step": 416500 + }, + { + "epoch": 6.465028942100281, + "grad_norm": 2.1001245975494385, + "learning_rate": 4.353498657645215e-05, + "loss": 0.9516, + "step": 416600 + }, + { + "epoch": 6.466580797343224, + "grad_norm": 2.1102499961853027, + "learning_rate": 4.353343472120921e-05, + "loss": 0.9537, + "step": 416700 + }, + { + "epoch": 6.468132652586167, + "grad_norm": 2.656090259552002, + "learning_rate": 4.353188286596627e-05, + "loss": 0.9995, + "step": 416800 + }, + { + "epoch": 6.469684507829109, + "grad_norm": 2.182312488555908, + "learning_rate": 4.3530331010723326e-05, + "loss": 0.9695, + "step": 416900 + }, + { + "epoch": 6.471236363072053, + "grad_norm": 2.736952781677246, + "learning_rate": 4.3528779155480383e-05, + "loss": 0.9663, + "step": 417000 + }, + { + "epoch": 6.472788218314996, + "grad_norm": 2.4398269653320312, + "learning_rate": 4.352722730023744e-05, + "loss": 0.9536, + "step": 417100 + }, + { + "epoch": 6.474340073557938, + "grad_norm": 2.5215742588043213, + "learning_rate": 4.352567544499449e-05, + "loss": 0.9841, + "step": 417200 + }, + { + "epoch": 6.475891928800881, + "grad_norm": 1.9735440015792847, + "learning_rate": 4.352412358975155e-05, + "loss": 0.9538, + "step": 417300 + }, + { + "epoch": 6.477443784043825, + "grad_norm": 2.09611177444458, + "learning_rate": 4.352257173450861e-05, + "loss": 0.9554, + "step": 417400 + }, + { + "epoch": 6.478995639286767, + "grad_norm": 2.1164262294769287, + "learning_rate": 4.352101987926566e-05, + "loss": 0.9651, + "step": 417500 + }, + { + "epoch": 6.48054749452971, + "grad_norm": 2.137082099914551, + "learning_rate": 4.3519468024022716e-05, + "loss": 0.9688, + "step": 417600 + }, + { + "epoch": 6.482099349772653, + "grad_norm": 2.5171310901641846, + "learning_rate": 4.3517916168779774e-05, + "loss": 0.9744, + "step": 417700 + }, + { + "epoch": 6.4836512050155966, + "grad_norm": 2.2302048206329346, + "learning_rate": 4.351636431353683e-05, + "loss": 0.9542, + "step": 417800 + }, + { + "epoch": 6.485203060258539, + "grad_norm": 2.363054037094116, + "learning_rate": 4.351481245829389e-05, + "loss": 0.9592, + "step": 417900 + }, + { + "epoch": 6.486754915501482, + "grad_norm": 2.129586935043335, + "learning_rate": 4.351326060305095e-05, + "loss": 0.9648, + "step": 418000 + }, + { + "epoch": 6.488306770744425, + "grad_norm": 2.0864787101745605, + "learning_rate": 4.3511708747808005e-05, + "loss": 0.9477, + "step": 418100 + }, + { + "epoch": 6.489858625987368, + "grad_norm": 2.5064265727996826, + "learning_rate": 4.351015689256506e-05, + "loss": 0.9688, + "step": 418200 + }, + { + "epoch": 6.491410481230311, + "grad_norm": 2.431048631668091, + "learning_rate": 4.350860503732212e-05, + "loss": 0.9731, + "step": 418300 + }, + { + "epoch": 6.492962336473254, + "grad_norm": 2.903663396835327, + "learning_rate": 4.350705318207918e-05, + "loss": 0.9534, + "step": 418400 + }, + { + "epoch": 6.494514191716196, + "grad_norm": 2.116795778274536, + "learning_rate": 4.3505501326836236e-05, + "loss": 0.9817, + "step": 418500 + }, + { + "epoch": 6.49606604695914, + "grad_norm": 2.5668110847473145, + "learning_rate": 4.3503949471593294e-05, + "loss": 0.9706, + "step": 418600 + }, + { + "epoch": 6.497617902202083, + "grad_norm": 2.0160510540008545, + "learning_rate": 4.350239761635035e-05, + "loss": 0.9851, + "step": 418700 + }, + { + "epoch": 6.499169757445025, + "grad_norm": 1.8833749294281006, + "learning_rate": 4.35008457611074e-05, + "loss": 0.959, + "step": 418800 + }, + { + "epoch": 6.500721612687968, + "grad_norm": 2.4650609493255615, + "learning_rate": 4.349929390586446e-05, + "loss": 0.9372, + "step": 418900 + }, + { + "epoch": 6.5022734679309115, + "grad_norm": 2.117232322692871, + "learning_rate": 4.349774205062152e-05, + "loss": 0.9827, + "step": 419000 + }, + { + "epoch": 6.503825323173855, + "grad_norm": 2.1824660301208496, + "learning_rate": 4.3496190195378576e-05, + "loss": 0.9566, + "step": 419100 + }, + { + "epoch": 6.505377178416797, + "grad_norm": 2.1648807525634766, + "learning_rate": 4.3494638340135634e-05, + "loss": 0.9667, + "step": 419200 + }, + { + "epoch": 6.50692903365974, + "grad_norm": 2.1830403804779053, + "learning_rate": 4.349308648489269e-05, + "loss": 0.9748, + "step": 419300 + }, + { + "epoch": 6.5084808889026835, + "grad_norm": 2.006662368774414, + "learning_rate": 4.349153462964975e-05, + "loss": 0.9716, + "step": 419400 + }, + { + "epoch": 6.510032744145626, + "grad_norm": 2.364398956298828, + "learning_rate": 4.348998277440681e-05, + "loss": 0.9601, + "step": 419500 + }, + { + "epoch": 6.511584599388569, + "grad_norm": 1.81155526638031, + "learning_rate": 4.3488430919163865e-05, + "loss": 0.9727, + "step": 419600 + }, + { + "epoch": 6.513136454631512, + "grad_norm": 1.618113398551941, + "learning_rate": 4.348687906392092e-05, + "loss": 0.9601, + "step": 419700 + }, + { + "epoch": 6.514688309874455, + "grad_norm": 2.53412127494812, + "learning_rate": 4.348532720867798e-05, + "loss": 0.9627, + "step": 419800 + }, + { + "epoch": 6.516240165117398, + "grad_norm": 2.1579275131225586, + "learning_rate": 4.348377535343504e-05, + "loss": 0.9653, + "step": 419900 + }, + { + "epoch": 6.517792020360341, + "grad_norm": 2.228121519088745, + "learning_rate": 4.3482223498192096e-05, + "loss": 0.9528, + "step": 420000 + }, + { + "epoch": 6.519343875603283, + "grad_norm": 2.453296184539795, + "learning_rate": 4.348067164294915e-05, + "loss": 0.9689, + "step": 420100 + }, + { + "epoch": 6.5208957308462265, + "grad_norm": 2.2692201137542725, + "learning_rate": 4.3479119787706204e-05, + "loss": 0.9582, + "step": 420200 + }, + { + "epoch": 6.52244758608917, + "grad_norm": 1.6923892498016357, + "learning_rate": 4.347756793246326e-05, + "loss": 0.9693, + "step": 420300 + }, + { + "epoch": 6.523999441332112, + "grad_norm": 2.295419931411743, + "learning_rate": 4.347601607722031e-05, + "loss": 0.961, + "step": 420400 + }, + { + "epoch": 6.525551296575055, + "grad_norm": 2.2035574913024902, + "learning_rate": 4.347446422197737e-05, + "loss": 0.9802, + "step": 420500 + }, + { + "epoch": 6.5271031518179985, + "grad_norm": 2.311668872833252, + "learning_rate": 4.347291236673443e-05, + "loss": 0.9567, + "step": 420600 + }, + { + "epoch": 6.528655007060942, + "grad_norm": 2.2198069095611572, + "learning_rate": 4.3471360511491486e-05, + "loss": 0.9622, + "step": 420700 + }, + { + "epoch": 6.530206862303884, + "grad_norm": 1.9989866018295288, + "learning_rate": 4.3469808656248544e-05, + "loss": 0.9552, + "step": 420800 + }, + { + "epoch": 6.531758717546827, + "grad_norm": 2.0765514373779297, + "learning_rate": 4.34682568010056e-05, + "loss": 0.9663, + "step": 420900 + }, + { + "epoch": 6.5333105727897705, + "grad_norm": 1.9403561353683472, + "learning_rate": 4.346670494576266e-05, + "loss": 0.9447, + "step": 421000 + }, + { + "epoch": 6.534862428032713, + "grad_norm": 2.1181282997131348, + "learning_rate": 4.346515309051972e-05, + "loss": 0.9549, + "step": 421100 + }, + { + "epoch": 6.536414283275656, + "grad_norm": 2.55145263671875, + "learning_rate": 4.3463601235276775e-05, + "loss": 0.9608, + "step": 421200 + }, + { + "epoch": 6.537966138518599, + "grad_norm": 1.966909408569336, + "learning_rate": 4.346204938003383e-05, + "loss": 0.9511, + "step": 421300 + }, + { + "epoch": 6.539517993761542, + "grad_norm": 2.0743229389190674, + "learning_rate": 4.346049752479089e-05, + "loss": 0.9773, + "step": 421400 + }, + { + "epoch": 6.541069849004485, + "grad_norm": 2.037116289138794, + "learning_rate": 4.345894566954795e-05, + "loss": 0.9605, + "step": 421500 + }, + { + "epoch": 6.542621704247428, + "grad_norm": 2.333439588546753, + "learning_rate": 4.3457393814305006e-05, + "loss": 0.9628, + "step": 421600 + }, + { + "epoch": 6.544173559490371, + "grad_norm": 2.0217251777648926, + "learning_rate": 4.345584195906206e-05, + "loss": 0.9599, + "step": 421700 + }, + { + "epoch": 6.5457254147333135, + "grad_norm": 1.8909906148910522, + "learning_rate": 4.3454290103819115e-05, + "loss": 0.9803, + "step": 421800 + }, + { + "epoch": 6.547277269976257, + "grad_norm": 2.204542398452759, + "learning_rate": 4.345273824857617e-05, + "loss": 0.9688, + "step": 421900 + }, + { + "epoch": 6.5488291252192, + "grad_norm": 2.45114803314209, + "learning_rate": 4.345118639333323e-05, + "loss": 1.004, + "step": 422000 + }, + { + "epoch": 6.550380980462142, + "grad_norm": 2.446262836456299, + "learning_rate": 4.344963453809029e-05, + "loss": 0.9702, + "step": 422100 + }, + { + "epoch": 6.5519328357050854, + "grad_norm": 2.1272571086883545, + "learning_rate": 4.3448082682847346e-05, + "loss": 0.9928, + "step": 422200 + }, + { + "epoch": 6.553484690948029, + "grad_norm": 2.0498876571655273, + "learning_rate": 4.3446530827604404e-05, + "loss": 0.9567, + "step": 422300 + }, + { + "epoch": 6.555036546190971, + "grad_norm": 2.039031744003296, + "learning_rate": 4.344497897236146e-05, + "loss": 0.96, + "step": 422400 + }, + { + "epoch": 6.556588401433914, + "grad_norm": 1.8312418460845947, + "learning_rate": 4.344342711711852e-05, + "loss": 0.9648, + "step": 422500 + }, + { + "epoch": 6.558140256676857, + "grad_norm": 2.2084076404571533, + "learning_rate": 4.344187526187558e-05, + "loss": 0.9759, + "step": 422600 + }, + { + "epoch": 6.5596921119198, + "grad_norm": 2.29850697517395, + "learning_rate": 4.3440323406632635e-05, + "loss": 0.9838, + "step": 422700 + }, + { + "epoch": 6.561243967162743, + "grad_norm": 2.330251932144165, + "learning_rate": 4.343877155138969e-05, + "loss": 0.9489, + "step": 422800 + }, + { + "epoch": 6.562795822405686, + "grad_norm": 2.4128081798553467, + "learning_rate": 4.343721969614675e-05, + "loss": 0.9499, + "step": 422900 + }, + { + "epoch": 6.5643476776486285, + "grad_norm": 2.6081063747406006, + "learning_rate": 4.34356678409038e-05, + "loss": 0.9723, + "step": 423000 + }, + { + "epoch": 6.565899532891572, + "grad_norm": 2.206106424331665, + "learning_rate": 4.343411598566086e-05, + "loss": 0.9606, + "step": 423100 + }, + { + "epoch": 6.567451388134515, + "grad_norm": 2.5828254222869873, + "learning_rate": 4.343256413041792e-05, + "loss": 0.9711, + "step": 423200 + }, + { + "epoch": 6.569003243377458, + "grad_norm": 2.2720930576324463, + "learning_rate": 4.3431012275174974e-05, + "loss": 0.9587, + "step": 423300 + }, + { + "epoch": 6.5705550986204, + "grad_norm": 2.2720694541931152, + "learning_rate": 4.342946041993203e-05, + "loss": 0.9602, + "step": 423400 + }, + { + "epoch": 6.572106953863344, + "grad_norm": 2.7443814277648926, + "learning_rate": 4.342790856468909e-05, + "loss": 0.9489, + "step": 423500 + }, + { + "epoch": 6.573658809106287, + "grad_norm": 2.089165687561035, + "learning_rate": 4.342635670944615e-05, + "loss": 0.953, + "step": 423600 + }, + { + "epoch": 6.575210664349229, + "grad_norm": 2.6473116874694824, + "learning_rate": 4.34248048542032e-05, + "loss": 0.9593, + "step": 423700 + }, + { + "epoch": 6.576762519592172, + "grad_norm": 2.5216798782348633, + "learning_rate": 4.3423252998960256e-05, + "loss": 0.9502, + "step": 423800 + }, + { + "epoch": 6.578314374835116, + "grad_norm": 2.220024585723877, + "learning_rate": 4.3421701143717314e-05, + "loss": 0.9571, + "step": 423900 + }, + { + "epoch": 6.579866230078058, + "grad_norm": 2.1186959743499756, + "learning_rate": 4.342014928847437e-05, + "loss": 0.9581, + "step": 424000 + }, + { + "epoch": 6.581418085321001, + "grad_norm": 2.213369131088257, + "learning_rate": 4.341859743323143e-05, + "loss": 0.9729, + "step": 424100 + }, + { + "epoch": 6.582969940563944, + "grad_norm": 2.0517101287841797, + "learning_rate": 4.341704557798849e-05, + "loss": 0.9773, + "step": 424200 + }, + { + "epoch": 6.584521795806888, + "grad_norm": 2.7900636196136475, + "learning_rate": 4.3415493722745545e-05, + "loss": 0.959, + "step": 424300 + }, + { + "epoch": 6.58607365104983, + "grad_norm": 2.2764623165130615, + "learning_rate": 4.34139418675026e-05, + "loss": 0.9689, + "step": 424400 + }, + { + "epoch": 6.587625506292773, + "grad_norm": 2.284311056137085, + "learning_rate": 4.3412390012259654e-05, + "loss": 0.9698, + "step": 424500 + }, + { + "epoch": 6.589177361535716, + "grad_norm": 2.0992631912231445, + "learning_rate": 4.341083815701671e-05, + "loss": 0.9773, + "step": 424600 + }, + { + "epoch": 6.590729216778659, + "grad_norm": 2.812044858932495, + "learning_rate": 4.340928630177377e-05, + "loss": 0.9632, + "step": 424700 + }, + { + "epoch": 6.592281072021602, + "grad_norm": 2.621793031692505, + "learning_rate": 4.340773444653083e-05, + "loss": 0.9627, + "step": 424800 + }, + { + "epoch": 6.593832927264545, + "grad_norm": 2.128485679626465, + "learning_rate": 4.3406182591287885e-05, + "loss": 0.9609, + "step": 424900 + }, + { + "epoch": 6.595384782507487, + "grad_norm": 2.5162482261657715, + "learning_rate": 4.340463073604494e-05, + "loss": 0.9492, + "step": 425000 + }, + { + "epoch": 6.596936637750431, + "grad_norm": 2.232084035873413, + "learning_rate": 4.3403078880802e-05, + "loss": 0.9496, + "step": 425100 + }, + { + "epoch": 6.598488492993374, + "grad_norm": 2.37396502494812, + "learning_rate": 4.340152702555906e-05, + "loss": 0.964, + "step": 425200 + }, + { + "epoch": 6.600040348236316, + "grad_norm": 2.530705213546753, + "learning_rate": 4.3399975170316116e-05, + "loss": 0.9633, + "step": 425300 + }, + { + "epoch": 6.601592203479259, + "grad_norm": 2.438948154449463, + "learning_rate": 4.3398423315073174e-05, + "loss": 0.9635, + "step": 425400 + }, + { + "epoch": 6.603144058722203, + "grad_norm": 1.9976087808609009, + "learning_rate": 4.339687145983023e-05, + "loss": 0.9708, + "step": 425500 + }, + { + "epoch": 6.604695913965145, + "grad_norm": 2.3859241008758545, + "learning_rate": 4.339531960458729e-05, + "loss": 0.9753, + "step": 425600 + }, + { + "epoch": 6.606247769208088, + "grad_norm": 2.56014084815979, + "learning_rate": 4.339376774934435e-05, + "loss": 0.9589, + "step": 425700 + }, + { + "epoch": 6.607799624451031, + "grad_norm": 2.2402284145355225, + "learning_rate": 4.33922158941014e-05, + "loss": 0.9566, + "step": 425800 + }, + { + "epoch": 6.6093514796939745, + "grad_norm": 1.9381129741668701, + "learning_rate": 4.3390664038858456e-05, + "loss": 0.9678, + "step": 425900 + }, + { + "epoch": 6.610903334936917, + "grad_norm": 2.5849661827087402, + "learning_rate": 4.3389112183615513e-05, + "loss": 0.964, + "step": 426000 + }, + { + "epoch": 6.61245519017986, + "grad_norm": 2.0342535972595215, + "learning_rate": 4.338756032837257e-05, + "loss": 0.9654, + "step": 426100 + }, + { + "epoch": 6.614007045422803, + "grad_norm": 2.233546257019043, + "learning_rate": 4.338600847312963e-05, + "loss": 0.9747, + "step": 426200 + }, + { + "epoch": 6.615558900665746, + "grad_norm": 2.464162826538086, + "learning_rate": 4.338445661788669e-05, + "loss": 0.9636, + "step": 426300 + }, + { + "epoch": 6.617110755908689, + "grad_norm": 1.985101580619812, + "learning_rate": 4.3382904762643744e-05, + "loss": 0.9635, + "step": 426400 + }, + { + "epoch": 6.618662611151632, + "grad_norm": 1.6616899967193604, + "learning_rate": 4.33813529074008e-05, + "loss": 0.9625, + "step": 426500 + }, + { + "epoch": 6.620214466394574, + "grad_norm": 1.789625883102417, + "learning_rate": 4.337980105215786e-05, + "loss": 0.9757, + "step": 426600 + }, + { + "epoch": 6.6217663216375175, + "grad_norm": 2.6055991649627686, + "learning_rate": 4.337824919691492e-05, + "loss": 0.9679, + "step": 426700 + }, + { + "epoch": 6.623318176880461, + "grad_norm": 1.6995948553085327, + "learning_rate": 4.3376697341671975e-05, + "loss": 0.9599, + "step": 426800 + }, + { + "epoch": 6.624870032123404, + "grad_norm": 2.0643959045410156, + "learning_rate": 4.337514548642903e-05, + "loss": 0.9689, + "step": 426900 + }, + { + "epoch": 6.626421887366346, + "grad_norm": 2.1341381072998047, + "learning_rate": 4.3373593631186084e-05, + "loss": 0.9667, + "step": 427000 + }, + { + "epoch": 6.6279737426092895, + "grad_norm": 1.8998945951461792, + "learning_rate": 4.337204177594314e-05, + "loss": 0.9709, + "step": 427100 + }, + { + "epoch": 6.629525597852233, + "grad_norm": 2.09203839302063, + "learning_rate": 4.33704899207002e-05, + "loss": 0.9689, + "step": 427200 + }, + { + "epoch": 6.631077453095175, + "grad_norm": 2.0841968059539795, + "learning_rate": 4.336893806545726e-05, + "loss": 0.966, + "step": 427300 + }, + { + "epoch": 6.632629308338118, + "grad_norm": 1.915770411491394, + "learning_rate": 4.336738621021431e-05, + "loss": 0.9363, + "step": 427400 + }, + { + "epoch": 6.6341811635810615, + "grad_norm": 2.285073757171631, + "learning_rate": 4.3365834354971366e-05, + "loss": 0.963, + "step": 427500 + }, + { + "epoch": 6.635733018824004, + "grad_norm": 2.538499355316162, + "learning_rate": 4.3364282499728424e-05, + "loss": 0.9612, + "step": 427600 + }, + { + "epoch": 6.637284874066947, + "grad_norm": 2.306152105331421, + "learning_rate": 4.336273064448548e-05, + "loss": 0.9468, + "step": 427700 + }, + { + "epoch": 6.63883672930989, + "grad_norm": 2.27289080619812, + "learning_rate": 4.336117878924254e-05, + "loss": 0.9669, + "step": 427800 + }, + { + "epoch": 6.6403885845528325, + "grad_norm": 1.9609386920928955, + "learning_rate": 4.33596269339996e-05, + "loss": 0.9427, + "step": 427900 + }, + { + "epoch": 6.641940439795776, + "grad_norm": 2.150015115737915, + "learning_rate": 4.3358075078756655e-05, + "loss": 0.9592, + "step": 428000 + }, + { + "epoch": 6.643492295038719, + "grad_norm": 2.085994005203247, + "learning_rate": 4.335652322351371e-05, + "loss": 0.9576, + "step": 428100 + }, + { + "epoch": 6.645044150281661, + "grad_norm": 2.246272563934326, + "learning_rate": 4.335497136827077e-05, + "loss": 0.9792, + "step": 428200 + }, + { + "epoch": 6.6465960055246045, + "grad_norm": 2.502819538116455, + "learning_rate": 4.335341951302783e-05, + "loss": 0.9841, + "step": 428300 + }, + { + "epoch": 6.648147860767548, + "grad_norm": 2.2759687900543213, + "learning_rate": 4.3351867657784886e-05, + "loss": 0.981, + "step": 428400 + }, + { + "epoch": 6.649699716010491, + "grad_norm": 1.9931080341339111, + "learning_rate": 4.3350315802541944e-05, + "loss": 0.9718, + "step": 428500 + }, + { + "epoch": 6.651251571253433, + "grad_norm": 2.1190154552459717, + "learning_rate": 4.3348763947299e-05, + "loss": 0.96, + "step": 428600 + }, + { + "epoch": 6.6528034264963765, + "grad_norm": 2.188507556915283, + "learning_rate": 4.334721209205605e-05, + "loss": 0.9706, + "step": 428700 + }, + { + "epoch": 6.65435528173932, + "grad_norm": 2.1184189319610596, + "learning_rate": 4.334566023681311e-05, + "loss": 0.9589, + "step": 428800 + }, + { + "epoch": 6.655907136982262, + "grad_norm": 2.58760929107666, + "learning_rate": 4.334410838157017e-05, + "loss": 0.9551, + "step": 428900 + }, + { + "epoch": 6.657458992225205, + "grad_norm": 2.420637845993042, + "learning_rate": 4.3342556526327226e-05, + "loss": 0.952, + "step": 429000 + }, + { + "epoch": 6.659010847468148, + "grad_norm": 2.369595527648926, + "learning_rate": 4.3341004671084283e-05, + "loss": 0.9683, + "step": 429100 + }, + { + "epoch": 6.660562702711091, + "grad_norm": 2.055168628692627, + "learning_rate": 4.333945281584134e-05, + "loss": 0.9641, + "step": 429200 + }, + { + "epoch": 6.662114557954034, + "grad_norm": 2.1766114234924316, + "learning_rate": 4.33379009605984e-05, + "loss": 0.9614, + "step": 429300 + }, + { + "epoch": 6.663666413196977, + "grad_norm": 2.3143973350524902, + "learning_rate": 4.333634910535546e-05, + "loss": 0.9803, + "step": 429400 + }, + { + "epoch": 6.66521826843992, + "grad_norm": 2.3827552795410156, + "learning_rate": 4.3334797250112514e-05, + "loss": 0.9738, + "step": 429500 + }, + { + "epoch": 6.666770123682863, + "grad_norm": 2.6563448905944824, + "learning_rate": 4.333324539486957e-05, + "loss": 0.9801, + "step": 429600 + }, + { + "epoch": 6.668321978925806, + "grad_norm": 2.5449228286743164, + "learning_rate": 4.333169353962663e-05, + "loss": 0.9633, + "step": 429700 + }, + { + "epoch": 6.669873834168749, + "grad_norm": 1.899495005607605, + "learning_rate": 4.333014168438369e-05, + "loss": 0.9524, + "step": 429800 + }, + { + "epoch": 6.6714256894116915, + "grad_norm": 2.0042564868927, + "learning_rate": 4.3328589829140745e-05, + "loss": 0.9818, + "step": 429900 + }, + { + "epoch": 6.672977544654635, + "grad_norm": 2.081190824508667, + "learning_rate": 4.3327037973897796e-05, + "loss": 0.9743, + "step": 430000 + }, + { + "epoch": 6.674529399897578, + "grad_norm": 2.2775936126708984, + "learning_rate": 4.3325486118654854e-05, + "loss": 0.9717, + "step": 430100 + }, + { + "epoch": 6.67608125514052, + "grad_norm": 2.2846286296844482, + "learning_rate": 4.3323934263411905e-05, + "loss": 0.958, + "step": 430200 + }, + { + "epoch": 6.677633110383463, + "grad_norm": 2.0574283599853516, + "learning_rate": 4.332238240816896e-05, + "loss": 0.9641, + "step": 430300 + }, + { + "epoch": 6.679184965626407, + "grad_norm": 2.179159164428711, + "learning_rate": 4.332083055292602e-05, + "loss": 0.9428, + "step": 430400 + }, + { + "epoch": 6.680736820869349, + "grad_norm": 1.985873818397522, + "learning_rate": 4.331927869768308e-05, + "loss": 0.9519, + "step": 430500 + }, + { + "epoch": 6.682288676112292, + "grad_norm": 2.1224048137664795, + "learning_rate": 4.3317726842440136e-05, + "loss": 0.9807, + "step": 430600 + }, + { + "epoch": 6.683840531355235, + "grad_norm": 2.1241307258605957, + "learning_rate": 4.3316174987197194e-05, + "loss": 0.9715, + "step": 430700 + }, + { + "epoch": 6.685392386598178, + "grad_norm": 2.251311779022217, + "learning_rate": 4.331462313195425e-05, + "loss": 0.9776, + "step": 430800 + }, + { + "epoch": 6.686944241841121, + "grad_norm": 1.9129951000213623, + "learning_rate": 4.331307127671131e-05, + "loss": 0.9647, + "step": 430900 + }, + { + "epoch": 6.688496097084064, + "grad_norm": 2.598257064819336, + "learning_rate": 4.331151942146837e-05, + "loss": 0.9893, + "step": 431000 + }, + { + "epoch": 6.690047952327006, + "grad_norm": 2.1570096015930176, + "learning_rate": 4.3309967566225425e-05, + "loss": 0.967, + "step": 431100 + }, + { + "epoch": 6.69159980756995, + "grad_norm": 1.9991873502731323, + "learning_rate": 4.330841571098248e-05, + "loss": 0.9677, + "step": 431200 + }, + { + "epoch": 6.693151662812893, + "grad_norm": 2.108656167984009, + "learning_rate": 4.330686385573954e-05, + "loss": 0.9805, + "step": 431300 + }, + { + "epoch": 6.694703518055836, + "grad_norm": 2.5301196575164795, + "learning_rate": 4.33053120004966e-05, + "loss": 0.9821, + "step": 431400 + }, + { + "epoch": 6.696255373298778, + "grad_norm": 2.2393667697906494, + "learning_rate": 4.330376014525365e-05, + "loss": 0.9449, + "step": 431500 + }, + { + "epoch": 6.697807228541722, + "grad_norm": 2.1805450916290283, + "learning_rate": 4.330220829001071e-05, + "loss": 0.9795, + "step": 431600 + }, + { + "epoch": 6.699359083784665, + "grad_norm": 2.4003820419311523, + "learning_rate": 4.3300656434767765e-05, + "loss": 0.9792, + "step": 431700 + }, + { + "epoch": 6.700910939027607, + "grad_norm": 2.1959033012390137, + "learning_rate": 4.329910457952482e-05, + "loss": 0.9577, + "step": 431800 + }, + { + "epoch": 6.70246279427055, + "grad_norm": 2.564363479614258, + "learning_rate": 4.329755272428188e-05, + "loss": 0.9637, + "step": 431900 + }, + { + "epoch": 6.704014649513494, + "grad_norm": 2.331374406814575, + "learning_rate": 4.329600086903894e-05, + "loss": 0.9616, + "step": 432000 + }, + { + "epoch": 6.705566504756437, + "grad_norm": 1.8874894380569458, + "learning_rate": 4.3294449013795996e-05, + "loss": 0.971, + "step": 432100 + }, + { + "epoch": 6.707118359999379, + "grad_norm": 2.1836538314819336, + "learning_rate": 4.3292897158553053e-05, + "loss": 0.969, + "step": 432200 + }, + { + "epoch": 6.708670215242322, + "grad_norm": 2.307040214538574, + "learning_rate": 4.329134530331011e-05, + "loss": 0.9627, + "step": 432300 + }, + { + "epoch": 6.7102220704852655, + "grad_norm": 2.0233044624328613, + "learning_rate": 4.328979344806717e-05, + "loss": 0.9515, + "step": 432400 + }, + { + "epoch": 6.711773925728208, + "grad_norm": 2.0123188495635986, + "learning_rate": 4.328824159282423e-05, + "loss": 0.9861, + "step": 432500 + }, + { + "epoch": 6.713325780971151, + "grad_norm": 2.378725051879883, + "learning_rate": 4.3286689737581284e-05, + "loss": 0.9651, + "step": 432600 + }, + { + "epoch": 6.714877636214094, + "grad_norm": 2.286808729171753, + "learning_rate": 4.328513788233834e-05, + "loss": 0.9613, + "step": 432700 + }, + { + "epoch": 6.716429491457037, + "grad_norm": 2.237541675567627, + "learning_rate": 4.328358602709539e-05, + "loss": 0.9666, + "step": 432800 + }, + { + "epoch": 6.71798134669998, + "grad_norm": 2.449968099594116, + "learning_rate": 4.328203417185245e-05, + "loss": 0.9755, + "step": 432900 + }, + { + "epoch": 6.719533201942923, + "grad_norm": 2.467799425125122, + "learning_rate": 4.328048231660951e-05, + "loss": 0.9691, + "step": 433000 + }, + { + "epoch": 6.721085057185865, + "grad_norm": 2.7805891036987305, + "learning_rate": 4.3278930461366566e-05, + "loss": 0.9504, + "step": 433100 + }, + { + "epoch": 6.722636912428809, + "grad_norm": 1.891644835472107, + "learning_rate": 4.3277378606123624e-05, + "loss": 0.9625, + "step": 433200 + }, + { + "epoch": 6.724188767671752, + "grad_norm": 2.1541478633880615, + "learning_rate": 4.327582675088068e-05, + "loss": 0.9613, + "step": 433300 + }, + { + "epoch": 6.725740622914694, + "grad_norm": 2.3072755336761475, + "learning_rate": 4.327427489563774e-05, + "loss": 0.9424, + "step": 433400 + }, + { + "epoch": 6.727292478157637, + "grad_norm": 2.7168760299682617, + "learning_rate": 4.327272304039479e-05, + "loss": 0.9745, + "step": 433500 + }, + { + "epoch": 6.7288443334005805, + "grad_norm": 2.453251838684082, + "learning_rate": 4.327117118515185e-05, + "loss": 0.9873, + "step": 433600 + }, + { + "epoch": 6.730396188643523, + "grad_norm": 2.4342517852783203, + "learning_rate": 4.3269619329908906e-05, + "loss": 0.959, + "step": 433700 + }, + { + "epoch": 6.731948043886466, + "grad_norm": 2.4158682823181152, + "learning_rate": 4.3268067474665964e-05, + "loss": 0.956, + "step": 433800 + }, + { + "epoch": 6.733499899129409, + "grad_norm": 2.526888370513916, + "learning_rate": 4.326651561942302e-05, + "loss": 0.9602, + "step": 433900 + }, + { + "epoch": 6.7350517543723525, + "grad_norm": 2.4079947471618652, + "learning_rate": 4.326496376418008e-05, + "loss": 0.9749, + "step": 434000 + }, + { + "epoch": 6.736603609615295, + "grad_norm": 2.1496877670288086, + "learning_rate": 4.326341190893714e-05, + "loss": 0.9712, + "step": 434100 + }, + { + "epoch": 6.738155464858238, + "grad_norm": 2.4161407947540283, + "learning_rate": 4.3261860053694195e-05, + "loss": 0.9587, + "step": 434200 + }, + { + "epoch": 6.739707320101181, + "grad_norm": 2.3657705783843994, + "learning_rate": 4.3260308198451246e-05, + "loss": 0.9619, + "step": 434300 + }, + { + "epoch": 6.7412591753441236, + "grad_norm": 2.353041172027588, + "learning_rate": 4.3258756343208304e-05, + "loss": 0.9697, + "step": 434400 + }, + { + "epoch": 6.742811030587067, + "grad_norm": 2.350092649459839, + "learning_rate": 4.325720448796536e-05, + "loss": 0.9607, + "step": 434500 + }, + { + "epoch": 6.74436288583001, + "grad_norm": 2.28694486618042, + "learning_rate": 4.325565263272242e-05, + "loss": 0.9629, + "step": 434600 + }, + { + "epoch": 6.745914741072953, + "grad_norm": 2.063788890838623, + "learning_rate": 4.325410077747948e-05, + "loss": 0.9716, + "step": 434700 + }, + { + "epoch": 6.7474665963158955, + "grad_norm": 2.3966639041900635, + "learning_rate": 4.3252548922236535e-05, + "loss": 0.9543, + "step": 434800 + }, + { + "epoch": 6.749018451558839, + "grad_norm": 2.3289926052093506, + "learning_rate": 4.325099706699359e-05, + "loss": 0.962, + "step": 434900 + }, + { + "epoch": 6.750570306801782, + "grad_norm": 2.1907060146331787, + "learning_rate": 4.324944521175065e-05, + "loss": 0.9762, + "step": 435000 + }, + { + "epoch": 6.752122162044724, + "grad_norm": 1.9231843948364258, + "learning_rate": 4.324789335650771e-05, + "loss": 0.9681, + "step": 435100 + }, + { + "epoch": 6.7536740172876675, + "grad_norm": 2.1809346675872803, + "learning_rate": 4.3246341501264766e-05, + "loss": 0.9539, + "step": 435200 + }, + { + "epoch": 6.755225872530611, + "grad_norm": 2.1257243156433105, + "learning_rate": 4.3244789646021823e-05, + "loss": 0.9708, + "step": 435300 + }, + { + "epoch": 6.756777727773553, + "grad_norm": 2.094426155090332, + "learning_rate": 4.324323779077888e-05, + "loss": 0.9741, + "step": 435400 + }, + { + "epoch": 6.758329583016496, + "grad_norm": 2.3985161781311035, + "learning_rate": 4.324168593553594e-05, + "loss": 0.9677, + "step": 435500 + }, + { + "epoch": 6.759881438259439, + "grad_norm": 2.131499767303467, + "learning_rate": 4.324013408029299e-05, + "loss": 0.9576, + "step": 435600 + }, + { + "epoch": 6.761433293502382, + "grad_norm": 2.861760139465332, + "learning_rate": 4.323858222505005e-05, + "loss": 0.9631, + "step": 435700 + }, + { + "epoch": 6.762985148745325, + "grad_norm": 1.9125932455062866, + "learning_rate": 4.3237030369807105e-05, + "loss": 0.9623, + "step": 435800 + }, + { + "epoch": 6.764537003988268, + "grad_norm": 2.1935696601867676, + "learning_rate": 4.323547851456416e-05, + "loss": 0.9452, + "step": 435900 + }, + { + "epoch": 6.7660888592312105, + "grad_norm": 2.0657761096954346, + "learning_rate": 4.323392665932122e-05, + "loss": 0.9628, + "step": 436000 + }, + { + "epoch": 6.767640714474154, + "grad_norm": 2.278221845626831, + "learning_rate": 4.323237480407828e-05, + "loss": 0.9688, + "step": 436100 + }, + { + "epoch": 6.769192569717097, + "grad_norm": 2.1598312854766846, + "learning_rate": 4.3230822948835336e-05, + "loss": 0.9691, + "step": 436200 + }, + { + "epoch": 6.770744424960039, + "grad_norm": 1.6591007709503174, + "learning_rate": 4.3229271093592394e-05, + "loss": 0.963, + "step": 436300 + }, + { + "epoch": 6.7722962802029825, + "grad_norm": 2.2094757556915283, + "learning_rate": 4.322771923834945e-05, + "loss": 0.9561, + "step": 436400 + }, + { + "epoch": 6.773848135445926, + "grad_norm": 2.10581374168396, + "learning_rate": 4.322616738310651e-05, + "loss": 0.9623, + "step": 436500 + }, + { + "epoch": 6.775399990688869, + "grad_norm": 2.179748773574829, + "learning_rate": 4.322461552786357e-05, + "loss": 0.9918, + "step": 436600 + }, + { + "epoch": 6.776951845931811, + "grad_norm": 2.3291804790496826, + "learning_rate": 4.322306367262062e-05, + "loss": 0.9712, + "step": 436700 + }, + { + "epoch": 6.778503701174754, + "grad_norm": 2.1765499114990234, + "learning_rate": 4.3221511817377676e-05, + "loss": 0.9502, + "step": 436800 + }, + { + "epoch": 6.780055556417698, + "grad_norm": 2.0082015991210938, + "learning_rate": 4.3219959962134734e-05, + "loss": 0.9732, + "step": 436900 + }, + { + "epoch": 6.78160741166064, + "grad_norm": 2.4658210277557373, + "learning_rate": 4.321840810689179e-05, + "loss": 0.9522, + "step": 437000 + }, + { + "epoch": 6.783159266903583, + "grad_norm": 2.3163704872131348, + "learning_rate": 4.321685625164885e-05, + "loss": 0.9536, + "step": 437100 + }, + { + "epoch": 6.784711122146526, + "grad_norm": 2.341322183609009, + "learning_rate": 4.32153043964059e-05, + "loss": 0.9562, + "step": 437200 + }, + { + "epoch": 6.78626297738947, + "grad_norm": 2.8513011932373047, + "learning_rate": 4.321375254116296e-05, + "loss": 0.981, + "step": 437300 + }, + { + "epoch": 6.787814832632412, + "grad_norm": 2.0854692459106445, + "learning_rate": 4.3212200685920016e-05, + "loss": 0.971, + "step": 437400 + }, + { + "epoch": 6.789366687875355, + "grad_norm": 2.393674612045288, + "learning_rate": 4.3210648830677074e-05, + "loss": 0.9685, + "step": 437500 + }, + { + "epoch": 6.790918543118298, + "grad_norm": 2.2793149948120117, + "learning_rate": 4.320909697543413e-05, + "loss": 0.986, + "step": 437600 + }, + { + "epoch": 6.792470398361241, + "grad_norm": 2.5252761840820312, + "learning_rate": 4.320754512019119e-05, + "loss": 0.94, + "step": 437700 + }, + { + "epoch": 6.794022253604184, + "grad_norm": 2.4589061737060547, + "learning_rate": 4.320599326494825e-05, + "loss": 0.9562, + "step": 437800 + }, + { + "epoch": 6.795574108847127, + "grad_norm": 2.3959240913391113, + "learning_rate": 4.3204441409705305e-05, + "loss": 0.9604, + "step": 437900 + }, + { + "epoch": 6.797125964090069, + "grad_norm": 2.208648681640625, + "learning_rate": 4.320288955446236e-05, + "loss": 0.9622, + "step": 438000 + }, + { + "epoch": 6.798677819333013, + "grad_norm": 1.9950900077819824, + "learning_rate": 4.320133769921942e-05, + "loss": 0.9511, + "step": 438100 + }, + { + "epoch": 6.800229674575956, + "grad_norm": 1.8762975931167603, + "learning_rate": 4.319978584397648e-05, + "loss": 0.9541, + "step": 438200 + }, + { + "epoch": 6.801781529818898, + "grad_norm": 2.114628553390503, + "learning_rate": 4.3198233988733536e-05, + "loss": 0.9444, + "step": 438300 + }, + { + "epoch": 6.803333385061841, + "grad_norm": 2.45674991607666, + "learning_rate": 4.3196682133490593e-05, + "loss": 0.9856, + "step": 438400 + }, + { + "epoch": 6.804885240304785, + "grad_norm": 1.924883484840393, + "learning_rate": 4.3195130278247644e-05, + "loss": 0.96, + "step": 438500 + }, + { + "epoch": 6.806437095547727, + "grad_norm": 2.017695665359497, + "learning_rate": 4.31935784230047e-05, + "loss": 0.9738, + "step": 438600 + }, + { + "epoch": 6.80798895079067, + "grad_norm": 1.899924635887146, + "learning_rate": 4.319202656776176e-05, + "loss": 0.9702, + "step": 438700 + }, + { + "epoch": 6.809540806033613, + "grad_norm": 2.0267887115478516, + "learning_rate": 4.319047471251882e-05, + "loss": 0.9567, + "step": 438800 + }, + { + "epoch": 6.811092661276556, + "grad_norm": 2.2755370140075684, + "learning_rate": 4.3188922857275875e-05, + "loss": 0.9492, + "step": 438900 + }, + { + "epoch": 6.812644516519499, + "grad_norm": 2.377073287963867, + "learning_rate": 4.318737100203293e-05, + "loss": 0.9425, + "step": 439000 + }, + { + "epoch": 6.814196371762442, + "grad_norm": 2.4295871257781982, + "learning_rate": 4.318581914678999e-05, + "loss": 0.956, + "step": 439100 + }, + { + "epoch": 6.815748227005385, + "grad_norm": 2.429774761199951, + "learning_rate": 4.318426729154705e-05, + "loss": 0.9779, + "step": 439200 + }, + { + "epoch": 6.817300082248328, + "grad_norm": 2.101464033126831, + "learning_rate": 4.3182715436304106e-05, + "loss": 0.9613, + "step": 439300 + }, + { + "epoch": 6.818851937491271, + "grad_norm": 2.6782474517822266, + "learning_rate": 4.3181163581061164e-05, + "loss": 0.9687, + "step": 439400 + }, + { + "epoch": 6.820403792734214, + "grad_norm": 2.628763437271118, + "learning_rate": 4.317961172581822e-05, + "loss": 0.9579, + "step": 439500 + }, + { + "epoch": 6.821955647977156, + "grad_norm": 2.305074691772461, + "learning_rate": 4.317805987057528e-05, + "loss": 0.9732, + "step": 439600 + }, + { + "epoch": 6.8235075032201, + "grad_norm": 1.7819008827209473, + "learning_rate": 4.317650801533234e-05, + "loss": 0.9546, + "step": 439700 + }, + { + "epoch": 6.825059358463043, + "grad_norm": 2.134722948074341, + "learning_rate": 4.317495616008939e-05, + "loss": 0.9604, + "step": 439800 + }, + { + "epoch": 6.826611213705986, + "grad_norm": 2.064683675765991, + "learning_rate": 4.3173404304846446e-05, + "loss": 0.9579, + "step": 439900 + }, + { + "epoch": 6.828163068948928, + "grad_norm": 1.8675159215927124, + "learning_rate": 4.31718524496035e-05, + "loss": 0.9522, + "step": 440000 + }, + { + "epoch": 6.8297149241918715, + "grad_norm": 2.1733498573303223, + "learning_rate": 4.3170300594360555e-05, + "loss": 0.9696, + "step": 440100 + }, + { + "epoch": 6.831266779434815, + "grad_norm": 2.1657094955444336, + "learning_rate": 4.316874873911761e-05, + "loss": 0.9693, + "step": 440200 + }, + { + "epoch": 6.832818634677757, + "grad_norm": 2.0879313945770264, + "learning_rate": 4.316719688387467e-05, + "loss": 0.9507, + "step": 440300 + }, + { + "epoch": 6.8343704899207, + "grad_norm": 1.994692325592041, + "learning_rate": 4.316564502863173e-05, + "loss": 0.9719, + "step": 440400 + }, + { + "epoch": 6.8359223451636435, + "grad_norm": 2.3186893463134766, + "learning_rate": 4.3164093173388786e-05, + "loss": 0.9487, + "step": 440500 + }, + { + "epoch": 6.837474200406586, + "grad_norm": 2.3696393966674805, + "learning_rate": 4.3162541318145844e-05, + "loss": 0.9543, + "step": 440600 + }, + { + "epoch": 6.839026055649529, + "grad_norm": 2.0898354053497314, + "learning_rate": 4.31609894629029e-05, + "loss": 0.9659, + "step": 440700 + }, + { + "epoch": 6.840577910892472, + "grad_norm": 2.858240842819214, + "learning_rate": 4.315943760765996e-05, + "loss": 0.9523, + "step": 440800 + }, + { + "epoch": 6.842129766135415, + "grad_norm": 2.2911605834960938, + "learning_rate": 4.315788575241702e-05, + "loss": 0.9499, + "step": 440900 + }, + { + "epoch": 6.843681621378358, + "grad_norm": 2.49033260345459, + "learning_rate": 4.3156333897174075e-05, + "loss": 0.9646, + "step": 441000 + }, + { + "epoch": 6.845233476621301, + "grad_norm": 2.080137014389038, + "learning_rate": 4.315478204193113e-05, + "loss": 0.9512, + "step": 441100 + }, + { + "epoch": 6.846785331864243, + "grad_norm": 2.1112749576568604, + "learning_rate": 4.315323018668819e-05, + "loss": 0.9716, + "step": 441200 + }, + { + "epoch": 6.8483371871071865, + "grad_norm": 2.253533363342285, + "learning_rate": 4.315167833144524e-05, + "loss": 0.9776, + "step": 441300 + }, + { + "epoch": 6.84988904235013, + "grad_norm": 2.4929633140563965, + "learning_rate": 4.31501264762023e-05, + "loss": 0.972, + "step": 441400 + }, + { + "epoch": 6.851440897593072, + "grad_norm": 1.8048045635223389, + "learning_rate": 4.314857462095936e-05, + "loss": 0.964, + "step": 441500 + }, + { + "epoch": 6.852992752836015, + "grad_norm": 1.9912177324295044, + "learning_rate": 4.3147022765716414e-05, + "loss": 0.9504, + "step": 441600 + }, + { + "epoch": 6.8545446080789585, + "grad_norm": 2.2173547744750977, + "learning_rate": 4.314547091047347e-05, + "loss": 0.9557, + "step": 441700 + }, + { + "epoch": 6.856096463321902, + "grad_norm": 2.3875458240509033, + "learning_rate": 4.314391905523053e-05, + "loss": 0.944, + "step": 441800 + }, + { + "epoch": 6.857648318564844, + "grad_norm": 2.2754809856414795, + "learning_rate": 4.314236719998759e-05, + "loss": 0.9621, + "step": 441900 + }, + { + "epoch": 6.859200173807787, + "grad_norm": 2.4385297298431396, + "learning_rate": 4.3140815344744645e-05, + "loss": 0.9583, + "step": 442000 + }, + { + "epoch": 6.8607520290507304, + "grad_norm": 1.9793715476989746, + "learning_rate": 4.31392634895017e-05, + "loss": 0.9733, + "step": 442100 + }, + { + "epoch": 6.862303884293673, + "grad_norm": 2.110994815826416, + "learning_rate": 4.313771163425876e-05, + "loss": 0.9617, + "step": 442200 + }, + { + "epoch": 6.863855739536616, + "grad_norm": 2.355104446411133, + "learning_rate": 4.313615977901582e-05, + "loss": 0.9593, + "step": 442300 + }, + { + "epoch": 6.865407594779559, + "grad_norm": 2.1024646759033203, + "learning_rate": 4.3134607923772876e-05, + "loss": 0.9553, + "step": 442400 + }, + { + "epoch": 6.8669594500225015, + "grad_norm": 1.7912112474441528, + "learning_rate": 4.3133056068529934e-05, + "loss": 0.9434, + "step": 442500 + }, + { + "epoch": 6.868511305265445, + "grad_norm": 2.832334280014038, + "learning_rate": 4.3131504213286985e-05, + "loss": 0.9479, + "step": 442600 + }, + { + "epoch": 6.870063160508388, + "grad_norm": 1.8377877473831177, + "learning_rate": 4.312995235804404e-05, + "loss": 0.9565, + "step": 442700 + }, + { + "epoch": 6.871615015751331, + "grad_norm": 2.196176290512085, + "learning_rate": 4.31284005028011e-05, + "loss": 0.9678, + "step": 442800 + }, + { + "epoch": 6.8731668709942735, + "grad_norm": 2.351644277572632, + "learning_rate": 4.312684864755816e-05, + "loss": 0.9679, + "step": 442900 + }, + { + "epoch": 6.874718726237217, + "grad_norm": 2.0880608558654785, + "learning_rate": 4.3125296792315216e-05, + "loss": 0.9696, + "step": 443000 + }, + { + "epoch": 6.87627058148016, + "grad_norm": 2.1842544078826904, + "learning_rate": 4.3123744937072274e-05, + "loss": 0.9511, + "step": 443100 + }, + { + "epoch": 6.877822436723102, + "grad_norm": 2.013806104660034, + "learning_rate": 4.3122193081829325e-05, + "loss": 0.944, + "step": 443200 + }, + { + "epoch": 6.879374291966045, + "grad_norm": 2.2502963542938232, + "learning_rate": 4.312064122658638e-05, + "loss": 0.9625, + "step": 443300 + }, + { + "epoch": 6.880926147208989, + "grad_norm": 2.0300559997558594, + "learning_rate": 4.311908937134344e-05, + "loss": 0.9589, + "step": 443400 + }, + { + "epoch": 6.882478002451931, + "grad_norm": 2.4204795360565186, + "learning_rate": 4.31175375161005e-05, + "loss": 0.9768, + "step": 443500 + }, + { + "epoch": 6.884029857694874, + "grad_norm": 2.3040518760681152, + "learning_rate": 4.3115985660857556e-05, + "loss": 0.9474, + "step": 443600 + }, + { + "epoch": 6.885581712937817, + "grad_norm": 2.5396111011505127, + "learning_rate": 4.3114433805614614e-05, + "loss": 0.9639, + "step": 443700 + }, + { + "epoch": 6.88713356818076, + "grad_norm": 2.6410367488861084, + "learning_rate": 4.311288195037167e-05, + "loss": 0.9557, + "step": 443800 + }, + { + "epoch": 6.888685423423703, + "grad_norm": 2.078646659851074, + "learning_rate": 4.311133009512873e-05, + "loss": 0.9605, + "step": 443900 + }, + { + "epoch": 6.890237278666646, + "grad_norm": 2.4347574710845947, + "learning_rate": 4.310977823988579e-05, + "loss": 0.9662, + "step": 444000 + }, + { + "epoch": 6.8917891339095885, + "grad_norm": 2.726022720336914, + "learning_rate": 4.3108226384642845e-05, + "loss": 0.9647, + "step": 444100 + }, + { + "epoch": 6.893340989152532, + "grad_norm": 1.7981305122375488, + "learning_rate": 4.3106674529399896e-05, + "loss": 0.965, + "step": 444200 + }, + { + "epoch": 6.894892844395475, + "grad_norm": 2.1100399494171143, + "learning_rate": 4.3105122674156953e-05, + "loss": 0.9623, + "step": 444300 + }, + { + "epoch": 6.896444699638418, + "grad_norm": 2.584831476211548, + "learning_rate": 4.310357081891401e-05, + "loss": 0.9745, + "step": 444400 + }, + { + "epoch": 6.89799655488136, + "grad_norm": 2.6576380729675293, + "learning_rate": 4.310201896367107e-05, + "loss": 0.9648, + "step": 444500 + }, + { + "epoch": 6.899548410124304, + "grad_norm": 1.8713663816452026, + "learning_rate": 4.310046710842813e-05, + "loss": 0.959, + "step": 444600 + }, + { + "epoch": 6.901100265367247, + "grad_norm": 2.099529981613159, + "learning_rate": 4.3098915253185184e-05, + "loss": 0.9598, + "step": 444700 + }, + { + "epoch": 6.902652120610189, + "grad_norm": 2.174076557159424, + "learning_rate": 4.309736339794224e-05, + "loss": 0.9506, + "step": 444800 + }, + { + "epoch": 6.904203975853132, + "grad_norm": 2.304044485092163, + "learning_rate": 4.30958115426993e-05, + "loss": 0.9724, + "step": 444900 + }, + { + "epoch": 6.905755831096076, + "grad_norm": 2.2297749519348145, + "learning_rate": 4.309425968745636e-05, + "loss": 0.9604, + "step": 445000 + }, + { + "epoch": 6.907307686339018, + "grad_norm": 2.4866652488708496, + "learning_rate": 4.3092707832213415e-05, + "loss": 0.9699, + "step": 445100 + }, + { + "epoch": 6.908859541581961, + "grad_norm": 2.327308416366577, + "learning_rate": 4.309115597697047e-05, + "loss": 0.9636, + "step": 445200 + }, + { + "epoch": 6.910411396824904, + "grad_norm": 2.3069162368774414, + "learning_rate": 4.308960412172753e-05, + "loss": 0.9493, + "step": 445300 + }, + { + "epoch": 6.911963252067848, + "grad_norm": 1.8557943105697632, + "learning_rate": 4.308805226648459e-05, + "loss": 0.9565, + "step": 445400 + }, + { + "epoch": 6.91351510731079, + "grad_norm": 2.817706823348999, + "learning_rate": 4.308650041124164e-05, + "loss": 0.9686, + "step": 445500 + }, + { + "epoch": 6.915066962553733, + "grad_norm": 2.60986590385437, + "learning_rate": 4.30849485559987e-05, + "loss": 0.9816, + "step": 445600 + }, + { + "epoch": 6.916618817796676, + "grad_norm": 2.1873059272766113, + "learning_rate": 4.3083396700755755e-05, + "loss": 0.9683, + "step": 445700 + }, + { + "epoch": 6.918170673039619, + "grad_norm": 2.1571836471557617, + "learning_rate": 4.308184484551281e-05, + "loss": 0.9534, + "step": 445800 + }, + { + "epoch": 6.919722528282562, + "grad_norm": 2.315070390701294, + "learning_rate": 4.308029299026987e-05, + "loss": 0.9517, + "step": 445900 + }, + { + "epoch": 6.921274383525505, + "grad_norm": 2.337183952331543, + "learning_rate": 4.307874113502693e-05, + "loss": 0.9658, + "step": 446000 + }, + { + "epoch": 6.922826238768447, + "grad_norm": 2.563176155090332, + "learning_rate": 4.3077189279783986e-05, + "loss": 0.9382, + "step": 446100 + }, + { + "epoch": 6.924378094011391, + "grad_norm": 2.0938093662261963, + "learning_rate": 4.3075637424541044e-05, + "loss": 0.9577, + "step": 446200 + }, + { + "epoch": 6.925929949254334, + "grad_norm": 2.144468069076538, + "learning_rate": 4.30740855692981e-05, + "loss": 0.9499, + "step": 446300 + }, + { + "epoch": 6.927481804497276, + "grad_norm": 2.0903782844543457, + "learning_rate": 4.307253371405516e-05, + "loss": 0.9626, + "step": 446400 + }, + { + "epoch": 6.929033659740219, + "grad_norm": 2.0059690475463867, + "learning_rate": 4.307098185881221e-05, + "loss": 0.9745, + "step": 446500 + }, + { + "epoch": 6.9305855149831626, + "grad_norm": 1.8438208103179932, + "learning_rate": 4.306943000356927e-05, + "loss": 0.9837, + "step": 446600 + }, + { + "epoch": 6.932137370226105, + "grad_norm": 2.2943475246429443, + "learning_rate": 4.3067878148326326e-05, + "loss": 0.9596, + "step": 446700 + }, + { + "epoch": 6.933689225469048, + "grad_norm": 2.3373594284057617, + "learning_rate": 4.3066326293083384e-05, + "loss": 0.9573, + "step": 446800 + }, + { + "epoch": 6.935241080711991, + "grad_norm": 1.9522019624710083, + "learning_rate": 4.306477443784044e-05, + "loss": 0.9578, + "step": 446900 + }, + { + "epoch": 6.936792935954934, + "grad_norm": 1.945548415184021, + "learning_rate": 4.306322258259749e-05, + "loss": 0.9521, + "step": 447000 + }, + { + "epoch": 6.938344791197877, + "grad_norm": 2.0586867332458496, + "learning_rate": 4.306167072735455e-05, + "loss": 0.9589, + "step": 447100 + }, + { + "epoch": 6.93989664644082, + "grad_norm": 2.0932538509368896, + "learning_rate": 4.306011887211161e-05, + "loss": 0.9719, + "step": 447200 + }, + { + "epoch": 6.941448501683763, + "grad_norm": 2.5355565547943115, + "learning_rate": 4.3058567016868666e-05, + "loss": 0.9657, + "step": 447300 + }, + { + "epoch": 6.943000356926706, + "grad_norm": 2.374173164367676, + "learning_rate": 4.3057015161625723e-05, + "loss": 0.9668, + "step": 447400 + }, + { + "epoch": 6.944552212169649, + "grad_norm": 2.0808701515197754, + "learning_rate": 4.305546330638278e-05, + "loss": 0.9353, + "step": 447500 + }, + { + "epoch": 6.946104067412592, + "grad_norm": 2.030280828475952, + "learning_rate": 4.305391145113984e-05, + "loss": 0.9427, + "step": 447600 + }, + { + "epoch": 6.947655922655534, + "grad_norm": 2.1320133209228516, + "learning_rate": 4.30523595958969e-05, + "loss": 0.9676, + "step": 447700 + }, + { + "epoch": 6.9492077778984775, + "grad_norm": 2.4068048000335693, + "learning_rate": 4.3050807740653954e-05, + "loss": 0.97, + "step": 447800 + }, + { + "epoch": 6.950759633141421, + "grad_norm": 2.26277232170105, + "learning_rate": 4.304925588541101e-05, + "loss": 0.9914, + "step": 447900 + }, + { + "epoch": 6.952311488384364, + "grad_norm": 2.5127828121185303, + "learning_rate": 4.304770403016807e-05, + "loss": 0.9751, + "step": 448000 + }, + { + "epoch": 6.953863343627306, + "grad_norm": 2.4725217819213867, + "learning_rate": 4.304615217492513e-05, + "loss": 0.9739, + "step": 448100 + }, + { + "epoch": 6.9554151988702495, + "grad_norm": 2.240823745727539, + "learning_rate": 4.3044600319682185e-05, + "loss": 0.9753, + "step": 448200 + }, + { + "epoch": 6.956967054113193, + "grad_norm": 2.3082690238952637, + "learning_rate": 4.3043048464439236e-05, + "loss": 0.9785, + "step": 448300 + }, + { + "epoch": 6.958518909356135, + "grad_norm": 2.2240993976593018, + "learning_rate": 4.3041496609196294e-05, + "loss": 0.964, + "step": 448400 + }, + { + "epoch": 6.960070764599078, + "grad_norm": 2.344449758529663, + "learning_rate": 4.303994475395335e-05, + "loss": 0.9565, + "step": 448500 + }, + { + "epoch": 6.9616226198420215, + "grad_norm": 2.5229413509368896, + "learning_rate": 4.303839289871041e-05, + "loss": 0.9606, + "step": 448600 + }, + { + "epoch": 6.963174475084964, + "grad_norm": 2.295170307159424, + "learning_rate": 4.303684104346747e-05, + "loss": 0.9737, + "step": 448700 + }, + { + "epoch": 6.964726330327907, + "grad_norm": 2.2986700534820557, + "learning_rate": 4.3035289188224525e-05, + "loss": 0.9665, + "step": 448800 + }, + { + "epoch": 6.96627818557085, + "grad_norm": 2.0462729930877686, + "learning_rate": 4.303373733298158e-05, + "loss": 0.9652, + "step": 448900 + }, + { + "epoch": 6.9678300408137925, + "grad_norm": 2.406602621078491, + "learning_rate": 4.303218547773864e-05, + "loss": 0.9565, + "step": 449000 + }, + { + "epoch": 6.969381896056736, + "grad_norm": 2.062739610671997, + "learning_rate": 4.30306336224957e-05, + "loss": 0.9551, + "step": 449100 + }, + { + "epoch": 6.970933751299679, + "grad_norm": 2.230595827102661, + "learning_rate": 4.3029081767252756e-05, + "loss": 0.9674, + "step": 449200 + }, + { + "epoch": 6.972485606542621, + "grad_norm": 1.9942388534545898, + "learning_rate": 4.3027529912009814e-05, + "loss": 0.9728, + "step": 449300 + }, + { + "epoch": 6.9740374617855645, + "grad_norm": 2.3794138431549072, + "learning_rate": 4.302597805676687e-05, + "loss": 0.966, + "step": 449400 + }, + { + "epoch": 6.975589317028508, + "grad_norm": 2.215716600418091, + "learning_rate": 4.302442620152393e-05, + "loss": 0.9835, + "step": 449500 + }, + { + "epoch": 6.97714117227145, + "grad_norm": 2.5608744621276855, + "learning_rate": 4.302287434628098e-05, + "loss": 0.9754, + "step": 449600 + }, + { + "epoch": 6.978693027514393, + "grad_norm": 2.413003444671631, + "learning_rate": 4.302132249103804e-05, + "loss": 0.9318, + "step": 449700 + }, + { + "epoch": 6.9802448827573365, + "grad_norm": 2.1478822231292725, + "learning_rate": 4.301977063579509e-05, + "loss": 0.9624, + "step": 449800 + }, + { + "epoch": 6.98179673800028, + "grad_norm": 2.1789090633392334, + "learning_rate": 4.301821878055215e-05, + "loss": 0.9451, + "step": 449900 + }, + { + "epoch": 6.983348593243222, + "grad_norm": 2.1605465412139893, + "learning_rate": 4.3016666925309205e-05, + "loss": 0.9582, + "step": 450000 + }, + { + "epoch": 6.984900448486165, + "grad_norm": 3.094702959060669, + "learning_rate": 4.301511507006626e-05, + "loss": 0.9774, + "step": 450100 + }, + { + "epoch": 6.986452303729108, + "grad_norm": 2.4087531566619873, + "learning_rate": 4.301356321482332e-05, + "loss": 0.9514, + "step": 450200 + }, + { + "epoch": 6.988004158972051, + "grad_norm": 2.0128543376922607, + "learning_rate": 4.301201135958038e-05, + "loss": 0.9566, + "step": 450300 + }, + { + "epoch": 6.989556014214994, + "grad_norm": 2.319578170776367, + "learning_rate": 4.3010459504337436e-05, + "loss": 0.9657, + "step": 450400 + }, + { + "epoch": 6.991107869457937, + "grad_norm": 2.412431001663208, + "learning_rate": 4.3008907649094493e-05, + "loss": 0.9625, + "step": 450500 + }, + { + "epoch": 6.99265972470088, + "grad_norm": 2.422497034072876, + "learning_rate": 4.300735579385155e-05, + "loss": 0.9687, + "step": 450600 + }, + { + "epoch": 6.994211579943823, + "grad_norm": 3.001018762588501, + "learning_rate": 4.300580393860861e-05, + "loss": 0.9675, + "step": 450700 + }, + { + "epoch": 6.995763435186766, + "grad_norm": 2.0682923793792725, + "learning_rate": 4.300425208336567e-05, + "loss": 0.958, + "step": 450800 + }, + { + "epoch": 6.997315290429709, + "grad_norm": 2.0898776054382324, + "learning_rate": 4.3002700228122724e-05, + "loss": 0.9673, + "step": 450900 + }, + { + "epoch": 6.9988671456726514, + "grad_norm": 2.364039659500122, + "learning_rate": 4.300114837287978e-05, + "loss": 0.9613, + "step": 451000 + }, + { + "epoch": 7.000419000915595, + "grad_norm": 1.913339614868164, + "learning_rate": 4.299959651763683e-05, + "loss": 0.9494, + "step": 451100 + }, + { + "epoch": 7.001970856158538, + "grad_norm": 2.6814982891082764, + "learning_rate": 4.299804466239389e-05, + "loss": 0.9445, + "step": 451200 + }, + { + "epoch": 7.00352271140148, + "grad_norm": 2.17089581489563, + "learning_rate": 4.299649280715095e-05, + "loss": 0.9586, + "step": 451300 + }, + { + "epoch": 7.005074566644423, + "grad_norm": 2.1242733001708984, + "learning_rate": 4.2994940951908006e-05, + "loss": 0.949, + "step": 451400 + }, + { + "epoch": 7.006626421887367, + "grad_norm": 2.1148810386657715, + "learning_rate": 4.2993389096665064e-05, + "loss": 0.9479, + "step": 451500 + }, + { + "epoch": 7.008178277130309, + "grad_norm": 2.0669050216674805, + "learning_rate": 4.299183724142212e-05, + "loss": 0.9489, + "step": 451600 + }, + { + "epoch": 7.009730132373252, + "grad_norm": 2.0324018001556396, + "learning_rate": 4.299028538617918e-05, + "loss": 0.9454, + "step": 451700 + }, + { + "epoch": 7.011281987616195, + "grad_norm": 1.8813183307647705, + "learning_rate": 4.298873353093624e-05, + "loss": 0.9759, + "step": 451800 + }, + { + "epoch": 7.012833842859138, + "grad_norm": 2.76200270652771, + "learning_rate": 4.2987181675693295e-05, + "loss": 0.9507, + "step": 451900 + }, + { + "epoch": 7.014385698102081, + "grad_norm": 2.1816463470458984, + "learning_rate": 4.298562982045035e-05, + "loss": 0.9451, + "step": 452000 + }, + { + "epoch": 7.015937553345024, + "grad_norm": 2.047480344772339, + "learning_rate": 4.298407796520741e-05, + "loss": 0.9484, + "step": 452100 + }, + { + "epoch": 7.017489408587967, + "grad_norm": 2.437556505203247, + "learning_rate": 4.298252610996447e-05, + "loss": 0.9544, + "step": 452200 + }, + { + "epoch": 7.01904126383091, + "grad_norm": 2.104931354522705, + "learning_rate": 4.2980974254721526e-05, + "loss": 0.9447, + "step": 452300 + }, + { + "epoch": 7.020593119073853, + "grad_norm": 2.3649117946624756, + "learning_rate": 4.297942239947858e-05, + "loss": 0.941, + "step": 452400 + }, + { + "epoch": 7.022144974316796, + "grad_norm": 2.2500951290130615, + "learning_rate": 4.2977870544235635e-05, + "loss": 0.9468, + "step": 452500 + }, + { + "epoch": 7.023696829559738, + "grad_norm": 2.018099784851074, + "learning_rate": 4.297631868899269e-05, + "loss": 0.9468, + "step": 452600 + }, + { + "epoch": 7.025248684802682, + "grad_norm": 2.2592520713806152, + "learning_rate": 4.297476683374975e-05, + "loss": 0.9489, + "step": 452700 + }, + { + "epoch": 7.026800540045625, + "grad_norm": 2.210730791091919, + "learning_rate": 4.297321497850681e-05, + "loss": 0.9601, + "step": 452800 + }, + { + "epoch": 7.028352395288567, + "grad_norm": 2.3604366779327393, + "learning_rate": 4.2971663123263866e-05, + "loss": 0.9544, + "step": 452900 + }, + { + "epoch": 7.02990425053151, + "grad_norm": 2.5241575241088867, + "learning_rate": 4.297011126802092e-05, + "loss": 0.9599, + "step": 453000 + }, + { + "epoch": 7.031456105774454, + "grad_norm": 2.678372859954834, + "learning_rate": 4.2968559412777975e-05, + "loss": 0.9859, + "step": 453100 + }, + { + "epoch": 7.033007961017396, + "grad_norm": 2.160053014755249, + "learning_rate": 4.296700755753503e-05, + "loss": 0.9674, + "step": 453200 + }, + { + "epoch": 7.034559816260339, + "grad_norm": 1.896772861480713, + "learning_rate": 4.296545570229209e-05, + "loss": 0.9677, + "step": 453300 + }, + { + "epoch": 7.036111671503282, + "grad_norm": 2.3262500762939453, + "learning_rate": 4.296390384704915e-05, + "loss": 0.9679, + "step": 453400 + }, + { + "epoch": 7.0376635267462255, + "grad_norm": 1.9697010517120361, + "learning_rate": 4.2962351991806206e-05, + "loss": 0.949, + "step": 453500 + }, + { + "epoch": 7.039215381989168, + "grad_norm": 2.1979610919952393, + "learning_rate": 4.2960800136563263e-05, + "loss": 0.973, + "step": 453600 + }, + { + "epoch": 7.040767237232111, + "grad_norm": 2.1452407836914062, + "learning_rate": 4.295924828132032e-05, + "loss": 0.9551, + "step": 453700 + }, + { + "epoch": 7.042319092475054, + "grad_norm": 2.3135056495666504, + "learning_rate": 4.295769642607738e-05, + "loss": 0.959, + "step": 453800 + }, + { + "epoch": 7.043870947717997, + "grad_norm": 2.273770809173584, + "learning_rate": 4.295614457083444e-05, + "loss": 0.9623, + "step": 453900 + }, + { + "epoch": 7.04542280296094, + "grad_norm": 2.3189713954925537, + "learning_rate": 4.295459271559149e-05, + "loss": 0.9538, + "step": 454000 + }, + { + "epoch": 7.046974658203883, + "grad_norm": 2.1957008838653564, + "learning_rate": 4.2953040860348545e-05, + "loss": 0.9677, + "step": 454100 + }, + { + "epoch": 7.048526513446825, + "grad_norm": 2.5264923572540283, + "learning_rate": 4.29514890051056e-05, + "loss": 0.9583, + "step": 454200 + }, + { + "epoch": 7.050078368689769, + "grad_norm": 2.3417556285858154, + "learning_rate": 4.294993714986266e-05, + "loss": 0.9661, + "step": 454300 + }, + { + "epoch": 7.051630223932712, + "grad_norm": 2.3444340229034424, + "learning_rate": 4.294838529461972e-05, + "loss": 0.9454, + "step": 454400 + }, + { + "epoch": 7.053182079175654, + "grad_norm": 2.1473686695098877, + "learning_rate": 4.2946833439376776e-05, + "loss": 0.9663, + "step": 454500 + }, + { + "epoch": 7.054733934418597, + "grad_norm": 2.139326572418213, + "learning_rate": 4.2945281584133834e-05, + "loss": 0.957, + "step": 454600 + }, + { + "epoch": 7.0562857896615405, + "grad_norm": 2.1516165733337402, + "learning_rate": 4.294372972889089e-05, + "loss": 0.9535, + "step": 454700 + }, + { + "epoch": 7.057837644904484, + "grad_norm": 2.4904158115386963, + "learning_rate": 4.294217787364795e-05, + "loss": 0.9421, + "step": 454800 + }, + { + "epoch": 7.059389500147426, + "grad_norm": 2.3278799057006836, + "learning_rate": 4.294062601840501e-05, + "loss": 0.95, + "step": 454900 + }, + { + "epoch": 7.060941355390369, + "grad_norm": 2.6046054363250732, + "learning_rate": 4.2939074163162065e-05, + "loss": 0.965, + "step": 455000 + }, + { + "epoch": 7.0624932106333125, + "grad_norm": 2.363619565963745, + "learning_rate": 4.293752230791912e-05, + "loss": 0.9654, + "step": 455100 + }, + { + "epoch": 7.064045065876255, + "grad_norm": 2.2985658645629883, + "learning_rate": 4.293597045267618e-05, + "loss": 0.9579, + "step": 455200 + }, + { + "epoch": 7.065596921119198, + "grad_norm": 2.117795944213867, + "learning_rate": 4.293441859743323e-05, + "loss": 0.9368, + "step": 455300 + }, + { + "epoch": 7.067148776362141, + "grad_norm": 1.7555428743362427, + "learning_rate": 4.293286674219029e-05, + "loss": 0.9348, + "step": 455400 + }, + { + "epoch": 7.0687006316050836, + "grad_norm": 2.2877862453460693, + "learning_rate": 4.293131488694735e-05, + "loss": 0.9638, + "step": 455500 + }, + { + "epoch": 7.070252486848027, + "grad_norm": 1.8652008771896362, + "learning_rate": 4.2929763031704405e-05, + "loss": 0.9378, + "step": 455600 + }, + { + "epoch": 7.07180434209097, + "grad_norm": 2.0074143409729004, + "learning_rate": 4.292821117646146e-05, + "loss": 0.9574, + "step": 455700 + }, + { + "epoch": 7.073356197333912, + "grad_norm": 2.6291511058807373, + "learning_rate": 4.292665932121852e-05, + "loss": 0.935, + "step": 455800 + }, + { + "epoch": 7.0749080525768555, + "grad_norm": 1.9829723834991455, + "learning_rate": 4.292510746597558e-05, + "loss": 0.9676, + "step": 455900 + }, + { + "epoch": 7.076459907819799, + "grad_norm": 2.560279369354248, + "learning_rate": 4.2923555610732636e-05, + "loss": 0.9617, + "step": 456000 + }, + { + "epoch": 7.078011763062742, + "grad_norm": 2.0156502723693848, + "learning_rate": 4.2922003755489694e-05, + "loss": 0.9656, + "step": 456100 + }, + { + "epoch": 7.079563618305684, + "grad_norm": 2.2178444862365723, + "learning_rate": 4.292045190024675e-05, + "loss": 0.9655, + "step": 456200 + }, + { + "epoch": 7.0811154735486275, + "grad_norm": 1.987246036529541, + "learning_rate": 4.29189000450038e-05, + "loss": 0.9515, + "step": 456300 + }, + { + "epoch": 7.082667328791571, + "grad_norm": 2.313192129135132, + "learning_rate": 4.291734818976086e-05, + "loss": 0.9611, + "step": 456400 + }, + { + "epoch": 7.084219184034513, + "grad_norm": 1.8505592346191406, + "learning_rate": 4.291579633451792e-05, + "loss": 0.9526, + "step": 456500 + }, + { + "epoch": 7.085771039277456, + "grad_norm": 1.95948326587677, + "learning_rate": 4.2914244479274976e-05, + "loss": 0.9511, + "step": 456600 + }, + { + "epoch": 7.087322894520399, + "grad_norm": 2.267521381378174, + "learning_rate": 4.2912692624032033e-05, + "loss": 0.9578, + "step": 456700 + }, + { + "epoch": 7.088874749763342, + "grad_norm": 2.2177438735961914, + "learning_rate": 4.2911140768789084e-05, + "loss": 0.9555, + "step": 456800 + }, + { + "epoch": 7.090426605006285, + "grad_norm": 2.168781280517578, + "learning_rate": 4.290958891354614e-05, + "loss": 0.9789, + "step": 456900 + }, + { + "epoch": 7.091978460249228, + "grad_norm": 2.333838701248169, + "learning_rate": 4.29080370583032e-05, + "loss": 0.9346, + "step": 457000 + }, + { + "epoch": 7.0935303154921705, + "grad_norm": 2.0888421535491943, + "learning_rate": 4.290648520306026e-05, + "loss": 0.9695, + "step": 457100 + }, + { + "epoch": 7.095082170735114, + "grad_norm": 2.3843302726745605, + "learning_rate": 4.2904933347817315e-05, + "loss": 0.9627, + "step": 457200 + }, + { + "epoch": 7.096634025978057, + "grad_norm": 1.9824546575546265, + "learning_rate": 4.290338149257437e-05, + "loss": 0.9751, + "step": 457300 + }, + { + "epoch": 7.098185881221, + "grad_norm": 2.068843364715576, + "learning_rate": 4.290182963733143e-05, + "loss": 0.9623, + "step": 457400 + }, + { + "epoch": 7.0997377364639425, + "grad_norm": 2.5886178016662598, + "learning_rate": 4.290027778208849e-05, + "loss": 0.9735, + "step": 457500 + }, + { + "epoch": 7.101289591706886, + "grad_norm": 2.1320242881774902, + "learning_rate": 4.2898725926845546e-05, + "loss": 0.9405, + "step": 457600 + }, + { + "epoch": 7.102841446949829, + "grad_norm": 2.2554919719696045, + "learning_rate": 4.2897174071602604e-05, + "loss": 0.9605, + "step": 457700 + }, + { + "epoch": 7.104393302192771, + "grad_norm": 1.943920373916626, + "learning_rate": 4.289562221635966e-05, + "loss": 0.9636, + "step": 457800 + }, + { + "epoch": 7.105945157435714, + "grad_norm": 2.3253748416900635, + "learning_rate": 4.289407036111672e-05, + "loss": 0.9788, + "step": 457900 + }, + { + "epoch": 7.107497012678658, + "grad_norm": 2.5349743366241455, + "learning_rate": 4.289251850587378e-05, + "loss": 0.9612, + "step": 458000 + }, + { + "epoch": 7.1090488679216, + "grad_norm": 2.255227565765381, + "learning_rate": 4.289096665063083e-05, + "loss": 0.927, + "step": 458100 + }, + { + "epoch": 7.110600723164543, + "grad_norm": 2.362051010131836, + "learning_rate": 4.2889414795387886e-05, + "loss": 0.9808, + "step": 458200 + }, + { + "epoch": 7.112152578407486, + "grad_norm": 2.2470040321350098, + "learning_rate": 4.2887862940144944e-05, + "loss": 0.9588, + "step": 458300 + }, + { + "epoch": 7.113704433650429, + "grad_norm": 2.225729465484619, + "learning_rate": 4.2886311084902e-05, + "loss": 0.953, + "step": 458400 + }, + { + "epoch": 7.115256288893372, + "grad_norm": 2.5031213760375977, + "learning_rate": 4.288475922965906e-05, + "loss": 0.9447, + "step": 458500 + }, + { + "epoch": 7.116808144136315, + "grad_norm": 2.101773500442505, + "learning_rate": 4.288320737441612e-05, + "loss": 0.9688, + "step": 458600 + }, + { + "epoch": 7.118359999379258, + "grad_norm": 1.8641526699066162, + "learning_rate": 4.2881655519173175e-05, + "loss": 0.9641, + "step": 458700 + }, + { + "epoch": 7.119911854622201, + "grad_norm": 2.222161293029785, + "learning_rate": 4.288010366393023e-05, + "loss": 0.9718, + "step": 458800 + }, + { + "epoch": 7.121463709865144, + "grad_norm": 2.2137527465820312, + "learning_rate": 4.287855180868729e-05, + "loss": 0.9474, + "step": 458900 + }, + { + "epoch": 7.123015565108087, + "grad_norm": 2.1790454387664795, + "learning_rate": 4.287699995344435e-05, + "loss": 0.9333, + "step": 459000 + }, + { + "epoch": 7.124567420351029, + "grad_norm": 2.2381160259246826, + "learning_rate": 4.2875448098201406e-05, + "loss": 0.9751, + "step": 459100 + }, + { + "epoch": 7.126119275593973, + "grad_norm": 2.3690028190612793, + "learning_rate": 4.2873896242958464e-05, + "loss": 0.9811, + "step": 459200 + }, + { + "epoch": 7.127671130836916, + "grad_norm": 2.355292558670044, + "learning_rate": 4.287234438771552e-05, + "loss": 0.9444, + "step": 459300 + }, + { + "epoch": 7.129222986079858, + "grad_norm": 2.025624990463257, + "learning_rate": 4.287079253247257e-05, + "loss": 0.9545, + "step": 459400 + }, + { + "epoch": 7.130774841322801, + "grad_norm": 2.651099681854248, + "learning_rate": 4.286924067722963e-05, + "loss": 0.9552, + "step": 459500 + }, + { + "epoch": 7.132326696565745, + "grad_norm": 2.292207717895508, + "learning_rate": 4.286768882198669e-05, + "loss": 0.9602, + "step": 459600 + }, + { + "epoch": 7.133878551808687, + "grad_norm": 2.2821760177612305, + "learning_rate": 4.286613696674374e-05, + "loss": 0.9359, + "step": 459700 + }, + { + "epoch": 7.13543040705163, + "grad_norm": 2.2225759029388428, + "learning_rate": 4.28645851115008e-05, + "loss": 0.9521, + "step": 459800 + }, + { + "epoch": 7.136982262294573, + "grad_norm": 2.255540370941162, + "learning_rate": 4.2863033256257854e-05, + "loss": 0.9668, + "step": 459900 + }, + { + "epoch": 7.138534117537516, + "grad_norm": 2.313821792602539, + "learning_rate": 4.286148140101491e-05, + "loss": 0.9569, + "step": 460000 + }, + { + "epoch": 7.140085972780459, + "grad_norm": 2.455730676651001, + "learning_rate": 4.285992954577197e-05, + "loss": 0.9472, + "step": 460100 + }, + { + "epoch": 7.141637828023402, + "grad_norm": 2.7190046310424805, + "learning_rate": 4.285837769052903e-05, + "loss": 0.953, + "step": 460200 + }, + { + "epoch": 7.143189683266345, + "grad_norm": 2.2392213344573975, + "learning_rate": 4.2856825835286085e-05, + "loss": 0.9594, + "step": 460300 + }, + { + "epoch": 7.144741538509288, + "grad_norm": 2.6095943450927734, + "learning_rate": 4.285527398004314e-05, + "loss": 0.9718, + "step": 460400 + }, + { + "epoch": 7.146293393752231, + "grad_norm": 1.970005989074707, + "learning_rate": 4.28537221248002e-05, + "loss": 0.9666, + "step": 460500 + }, + { + "epoch": 7.147845248995174, + "grad_norm": 2.335620403289795, + "learning_rate": 4.285217026955726e-05, + "loss": 0.9588, + "step": 460600 + }, + { + "epoch": 7.149397104238116, + "grad_norm": 2.791313648223877, + "learning_rate": 4.2850618414314316e-05, + "loss": 0.9664, + "step": 460700 + }, + { + "epoch": 7.15094895948106, + "grad_norm": 2.0274670124053955, + "learning_rate": 4.2849066559071374e-05, + "loss": 0.9676, + "step": 460800 + }, + { + "epoch": 7.152500814724003, + "grad_norm": 2.9864020347595215, + "learning_rate": 4.284751470382843e-05, + "loss": 0.9761, + "step": 460900 + }, + { + "epoch": 7.154052669966945, + "grad_norm": 1.9967398643493652, + "learning_rate": 4.284596284858548e-05, + "loss": 0.9525, + "step": 461000 + }, + { + "epoch": 7.155604525209888, + "grad_norm": 2.2921247482299805, + "learning_rate": 4.284441099334254e-05, + "loss": 0.9651, + "step": 461100 + }, + { + "epoch": 7.1571563804528315, + "grad_norm": 2.121875762939453, + "learning_rate": 4.28428591380996e-05, + "loss": 0.9486, + "step": 461200 + }, + { + "epoch": 7.158708235695775, + "grad_norm": 2.1138126850128174, + "learning_rate": 4.2841307282856656e-05, + "loss": 0.9671, + "step": 461300 + }, + { + "epoch": 7.160260090938717, + "grad_norm": 2.4395244121551514, + "learning_rate": 4.2839755427613714e-05, + "loss": 0.9476, + "step": 461400 + }, + { + "epoch": 7.16181194618166, + "grad_norm": 1.7443664073944092, + "learning_rate": 4.283820357237077e-05, + "loss": 0.923, + "step": 461500 + }, + { + "epoch": 7.1633638014246035, + "grad_norm": 2.550983428955078, + "learning_rate": 4.283665171712783e-05, + "loss": 0.9596, + "step": 461600 + }, + { + "epoch": 7.164915656667546, + "grad_norm": 2.354785203933716, + "learning_rate": 4.283509986188489e-05, + "loss": 0.9641, + "step": 461700 + }, + { + "epoch": 7.166467511910489, + "grad_norm": 2.1780269145965576, + "learning_rate": 4.2833548006641945e-05, + "loss": 0.9571, + "step": 461800 + }, + { + "epoch": 7.168019367153432, + "grad_norm": 2.2425289154052734, + "learning_rate": 4.2831996151399e-05, + "loss": 0.9576, + "step": 461900 + }, + { + "epoch": 7.169571222396375, + "grad_norm": 2.303395986557007, + "learning_rate": 4.283044429615606e-05, + "loss": 0.9689, + "step": 462000 + }, + { + "epoch": 7.171123077639318, + "grad_norm": 2.4535417556762695, + "learning_rate": 4.282889244091312e-05, + "loss": 0.9642, + "step": 462100 + }, + { + "epoch": 7.172674932882261, + "grad_norm": 2.597946882247925, + "learning_rate": 4.2827340585670176e-05, + "loss": 0.95, + "step": 462200 + }, + { + "epoch": 7.174226788125203, + "grad_norm": 2.0032639503479004, + "learning_rate": 4.282578873042723e-05, + "loss": 0.9519, + "step": 462300 + }, + { + "epoch": 7.1757786433681465, + "grad_norm": 2.6025397777557373, + "learning_rate": 4.2824236875184285e-05, + "loss": 0.9444, + "step": 462400 + }, + { + "epoch": 7.17733049861109, + "grad_norm": 2.4813663959503174, + "learning_rate": 4.282268501994134e-05, + "loss": 0.9633, + "step": 462500 + }, + { + "epoch": 7.178882353854032, + "grad_norm": 2.078444242477417, + "learning_rate": 4.28211331646984e-05, + "loss": 0.9542, + "step": 462600 + }, + { + "epoch": 7.180434209096975, + "grad_norm": 2.0881171226501465, + "learning_rate": 4.281958130945546e-05, + "loss": 0.951, + "step": 462700 + }, + { + "epoch": 7.1819860643399185, + "grad_norm": 2.452178955078125, + "learning_rate": 4.281802945421251e-05, + "loss": 0.9504, + "step": 462800 + }, + { + "epoch": 7.183537919582862, + "grad_norm": 2.191159248352051, + "learning_rate": 4.281647759896957e-05, + "loss": 0.9551, + "step": 462900 + }, + { + "epoch": 7.185089774825804, + "grad_norm": 2.599208116531372, + "learning_rate": 4.2814925743726624e-05, + "loss": 0.9507, + "step": 463000 + }, + { + "epoch": 7.186641630068747, + "grad_norm": 2.333869457244873, + "learning_rate": 4.281337388848368e-05, + "loss": 0.9521, + "step": 463100 + }, + { + "epoch": 7.1881934853116904, + "grad_norm": 2.291795015335083, + "learning_rate": 4.281182203324074e-05, + "loss": 0.949, + "step": 463200 + }, + { + "epoch": 7.189745340554633, + "grad_norm": 2.338108539581299, + "learning_rate": 4.28102701779978e-05, + "loss": 0.9403, + "step": 463300 + }, + { + "epoch": 7.191297195797576, + "grad_norm": 2.32407546043396, + "learning_rate": 4.2808718322754855e-05, + "loss": 0.9594, + "step": 463400 + }, + { + "epoch": 7.192849051040519, + "grad_norm": 2.1662728786468506, + "learning_rate": 4.280716646751191e-05, + "loss": 0.9642, + "step": 463500 + }, + { + "epoch": 7.1944009062834615, + "grad_norm": 2.087822675704956, + "learning_rate": 4.280561461226897e-05, + "loss": 0.9596, + "step": 463600 + }, + { + "epoch": 7.195952761526405, + "grad_norm": 1.9519245624542236, + "learning_rate": 4.280406275702603e-05, + "loss": 0.9539, + "step": 463700 + }, + { + "epoch": 7.197504616769348, + "grad_norm": 2.13206148147583, + "learning_rate": 4.280251090178308e-05, + "loss": 0.9594, + "step": 463800 + }, + { + "epoch": 7.19905647201229, + "grad_norm": 2.1854920387268066, + "learning_rate": 4.280095904654014e-05, + "loss": 0.9621, + "step": 463900 + }, + { + "epoch": 7.2006083272552335, + "grad_norm": 2.114339828491211, + "learning_rate": 4.2799407191297195e-05, + "loss": 0.9519, + "step": 464000 + }, + { + "epoch": 7.202160182498177, + "grad_norm": 2.2599399089813232, + "learning_rate": 4.279785533605425e-05, + "loss": 0.9689, + "step": 464100 + }, + { + "epoch": 7.20371203774112, + "grad_norm": 2.068944215774536, + "learning_rate": 4.279630348081131e-05, + "loss": 0.9428, + "step": 464200 + }, + { + "epoch": 7.205263892984062, + "grad_norm": 2.29356050491333, + "learning_rate": 4.279475162556837e-05, + "loss": 0.9715, + "step": 464300 + }, + { + "epoch": 7.206815748227005, + "grad_norm": 2.2453391551971436, + "learning_rate": 4.2793199770325426e-05, + "loss": 0.9506, + "step": 464400 + }, + { + "epoch": 7.208367603469949, + "grad_norm": 1.9269475936889648, + "learning_rate": 4.2791647915082484e-05, + "loss": 0.9453, + "step": 464500 + }, + { + "epoch": 7.209919458712891, + "grad_norm": 1.9869329929351807, + "learning_rate": 4.279009605983954e-05, + "loss": 0.9686, + "step": 464600 + }, + { + "epoch": 7.211471313955834, + "grad_norm": 2.1967082023620605, + "learning_rate": 4.27885442045966e-05, + "loss": 0.9489, + "step": 464700 + }, + { + "epoch": 7.213023169198777, + "grad_norm": 2.396719455718994, + "learning_rate": 4.278699234935366e-05, + "loss": 0.9577, + "step": 464800 + }, + { + "epoch": 7.21457502444172, + "grad_norm": 2.17808198928833, + "learning_rate": 4.2785440494110715e-05, + "loss": 0.9668, + "step": 464900 + }, + { + "epoch": 7.216126879684663, + "grad_norm": 1.9736754894256592, + "learning_rate": 4.278388863886777e-05, + "loss": 0.954, + "step": 465000 + }, + { + "epoch": 7.217678734927606, + "grad_norm": 2.349987030029297, + "learning_rate": 4.2782336783624824e-05, + "loss": 0.9665, + "step": 465100 + }, + { + "epoch": 7.2192305901705485, + "grad_norm": 2.635378360748291, + "learning_rate": 4.278078492838188e-05, + "loss": 0.9681, + "step": 465200 + }, + { + "epoch": 7.220782445413492, + "grad_norm": 2.589090347290039, + "learning_rate": 4.277923307313894e-05, + "loss": 0.9449, + "step": 465300 + }, + { + "epoch": 7.222334300656435, + "grad_norm": 2.449859619140625, + "learning_rate": 4.2777681217896e-05, + "loss": 0.9462, + "step": 465400 + }, + { + "epoch": 7.223886155899378, + "grad_norm": 1.8678507804870605, + "learning_rate": 4.2776129362653055e-05, + "loss": 0.9416, + "step": 465500 + }, + { + "epoch": 7.22543801114232, + "grad_norm": 2.215139627456665, + "learning_rate": 4.277457750741011e-05, + "loss": 0.9444, + "step": 465600 + }, + { + "epoch": 7.226989866385264, + "grad_norm": 1.9364641904830933, + "learning_rate": 4.277302565216717e-05, + "loss": 0.9492, + "step": 465700 + }, + { + "epoch": 7.228541721628207, + "grad_norm": 2.338667392730713, + "learning_rate": 4.277147379692423e-05, + "loss": 0.9456, + "step": 465800 + }, + { + "epoch": 7.230093576871149, + "grad_norm": 2.1778111457824707, + "learning_rate": 4.2769921941681286e-05, + "loss": 0.953, + "step": 465900 + }, + { + "epoch": 7.231645432114092, + "grad_norm": 2.0356953144073486, + "learning_rate": 4.276837008643834e-05, + "loss": 0.9602, + "step": 466000 + }, + { + "epoch": 7.233197287357036, + "grad_norm": 2.1283013820648193, + "learning_rate": 4.2766818231195394e-05, + "loss": 0.9508, + "step": 466100 + }, + { + "epoch": 7.234749142599978, + "grad_norm": 2.4937241077423096, + "learning_rate": 4.276526637595245e-05, + "loss": 0.9532, + "step": 466200 + }, + { + "epoch": 7.236300997842921, + "grad_norm": 2.469480276107788, + "learning_rate": 4.276371452070951e-05, + "loss": 0.9607, + "step": 466300 + }, + { + "epoch": 7.237852853085864, + "grad_norm": 1.7941803932189941, + "learning_rate": 4.276216266546657e-05, + "loss": 0.9565, + "step": 466400 + }, + { + "epoch": 7.239404708328807, + "grad_norm": 2.239121198654175, + "learning_rate": 4.2760610810223625e-05, + "loss": 0.9771, + "step": 466500 + }, + { + "epoch": 7.24095656357175, + "grad_norm": 2.150179862976074, + "learning_rate": 4.2759058954980676e-05, + "loss": 0.9421, + "step": 466600 + }, + { + "epoch": 7.242508418814693, + "grad_norm": 2.1357245445251465, + "learning_rate": 4.2757507099737734e-05, + "loss": 0.945, + "step": 466700 + }, + { + "epoch": 7.244060274057636, + "grad_norm": 2.290334463119507, + "learning_rate": 4.275595524449479e-05, + "loss": 0.9515, + "step": 466800 + }, + { + "epoch": 7.245612129300579, + "grad_norm": 1.9361125230789185, + "learning_rate": 4.275440338925185e-05, + "loss": 0.9516, + "step": 466900 + }, + { + "epoch": 7.247163984543522, + "grad_norm": 2.115213394165039, + "learning_rate": 4.275285153400891e-05, + "loss": 0.9475, + "step": 467000 + }, + { + "epoch": 7.248715839786465, + "grad_norm": 2.2862818241119385, + "learning_rate": 4.2751299678765965e-05, + "loss": 0.9559, + "step": 467100 + }, + { + "epoch": 7.250267695029407, + "grad_norm": 2.427246570587158, + "learning_rate": 4.274974782352302e-05, + "loss": 0.9839, + "step": 467200 + }, + { + "epoch": 7.251819550272351, + "grad_norm": 2.161555528640747, + "learning_rate": 4.274819596828008e-05, + "loss": 0.9609, + "step": 467300 + }, + { + "epoch": 7.253371405515294, + "grad_norm": 1.9250975847244263, + "learning_rate": 4.274664411303714e-05, + "loss": 0.9483, + "step": 467400 + }, + { + "epoch": 7.254923260758236, + "grad_norm": 2.2785122394561768, + "learning_rate": 4.2745092257794196e-05, + "loss": 0.9362, + "step": 467500 + }, + { + "epoch": 7.256475116001179, + "grad_norm": 2.565977096557617, + "learning_rate": 4.2743540402551254e-05, + "loss": 0.9722, + "step": 467600 + }, + { + "epoch": 7.2580269712441225, + "grad_norm": 2.245760917663574, + "learning_rate": 4.274198854730831e-05, + "loss": 0.9745, + "step": 467700 + }, + { + "epoch": 7.259578826487065, + "grad_norm": 2.2956316471099854, + "learning_rate": 4.274043669206537e-05, + "loss": 0.964, + "step": 467800 + }, + { + "epoch": 7.261130681730008, + "grad_norm": 2.3138315677642822, + "learning_rate": 4.273888483682242e-05, + "loss": 0.9591, + "step": 467900 + }, + { + "epoch": 7.262682536972951, + "grad_norm": 2.2791948318481445, + "learning_rate": 4.273733298157948e-05, + "loss": 0.9391, + "step": 468000 + }, + { + "epoch": 7.2642343922158945, + "grad_norm": 2.307992458343506, + "learning_rate": 4.2735781126336536e-05, + "loss": 0.9594, + "step": 468100 + }, + { + "epoch": 7.265786247458837, + "grad_norm": 2.3418588638305664, + "learning_rate": 4.2734229271093594e-05, + "loss": 0.951, + "step": 468200 + }, + { + "epoch": 7.26733810270178, + "grad_norm": 2.0843400955200195, + "learning_rate": 4.273267741585065e-05, + "loss": 0.9619, + "step": 468300 + }, + { + "epoch": 7.268889957944723, + "grad_norm": 2.0552427768707275, + "learning_rate": 4.273112556060771e-05, + "loss": 0.9624, + "step": 468400 + }, + { + "epoch": 7.270441813187666, + "grad_norm": 2.0975191593170166, + "learning_rate": 4.272957370536477e-05, + "loss": 0.9532, + "step": 468500 + }, + { + "epoch": 7.271993668430609, + "grad_norm": 2.250615119934082, + "learning_rate": 4.2728021850121825e-05, + "loss": 0.9532, + "step": 468600 + }, + { + "epoch": 7.273545523673552, + "grad_norm": 1.945852518081665, + "learning_rate": 4.272646999487888e-05, + "loss": 0.9457, + "step": 468700 + }, + { + "epoch": 7.275097378916494, + "grad_norm": 1.880228877067566, + "learning_rate": 4.272491813963594e-05, + "loss": 0.956, + "step": 468800 + }, + { + "epoch": 7.2766492341594375, + "grad_norm": 2.206754207611084, + "learning_rate": 4.2723366284393e-05, + "loss": 0.9604, + "step": 468900 + }, + { + "epoch": 7.278201089402381, + "grad_norm": 1.9625083208084106, + "learning_rate": 4.2721814429150056e-05, + "loss": 0.9507, + "step": 469000 + }, + { + "epoch": 7.279752944645323, + "grad_norm": 1.8727223873138428, + "learning_rate": 4.2720262573907113e-05, + "loss": 0.9683, + "step": 469100 + }, + { + "epoch": 7.281304799888266, + "grad_norm": 1.9759860038757324, + "learning_rate": 4.2718710718664164e-05, + "loss": 0.9434, + "step": 469200 + }, + { + "epoch": 7.2828566551312095, + "grad_norm": 1.8300950527191162, + "learning_rate": 4.271715886342122e-05, + "loss": 0.9447, + "step": 469300 + }, + { + "epoch": 7.284408510374153, + "grad_norm": 2.4198191165924072, + "learning_rate": 4.271560700817828e-05, + "loss": 0.9581, + "step": 469400 + }, + { + "epoch": 7.285960365617095, + "grad_norm": 2.22871732711792, + "learning_rate": 4.271405515293533e-05, + "loss": 0.978, + "step": 469500 + }, + { + "epoch": 7.287512220860038, + "grad_norm": 2.2410459518432617, + "learning_rate": 4.271250329769239e-05, + "loss": 0.9611, + "step": 469600 + }, + { + "epoch": 7.2890640761029815, + "grad_norm": 2.4857144355773926, + "learning_rate": 4.2710951442449446e-05, + "loss": 0.9556, + "step": 469700 + }, + { + "epoch": 7.290615931345924, + "grad_norm": 2.3882670402526855, + "learning_rate": 4.2709399587206504e-05, + "loss": 0.9589, + "step": 469800 + }, + { + "epoch": 7.292167786588867, + "grad_norm": 1.9593956470489502, + "learning_rate": 4.270784773196356e-05, + "loss": 0.9565, + "step": 469900 + }, + { + "epoch": 7.29371964183181, + "grad_norm": 2.2318294048309326, + "learning_rate": 4.270629587672062e-05, + "loss": 0.9595, + "step": 470000 + }, + { + "epoch": 7.2952714970747525, + "grad_norm": 2.3668556213378906, + "learning_rate": 4.270474402147768e-05, + "loss": 0.9644, + "step": 470100 + }, + { + "epoch": 7.296823352317696, + "grad_norm": 1.8320237398147583, + "learning_rate": 4.2703192166234735e-05, + "loss": 0.9551, + "step": 470200 + }, + { + "epoch": 7.298375207560639, + "grad_norm": 2.304394006729126, + "learning_rate": 4.270164031099179e-05, + "loss": 0.9422, + "step": 470300 + }, + { + "epoch": 7.299927062803581, + "grad_norm": 1.8527318239212036, + "learning_rate": 4.270008845574885e-05, + "loss": 0.9504, + "step": 470400 + }, + { + "epoch": 7.3014789180465245, + "grad_norm": 1.8852194547653198, + "learning_rate": 4.269853660050591e-05, + "loss": 0.9305, + "step": 470500 + } + ], + "logging_steps": 100, + "max_steps": 3221950, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1888326280962714e+19, + "train_batch_size": 96, + "trial_name": null, + "trial_params": null +}