{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.013439066403886, "eval_steps": 500, "global_step": 387500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015518552429429383, "grad_norm": 4.051200866699219, "learning_rate": 4.999846366330949e-05, "loss": 2.0462, "step": 100 }, { "epoch": 0.0031037104858858766, "grad_norm": 3.673067808151245, "learning_rate": 4.9996911808066544e-05, "loss": 1.8074, "step": 200 }, { "epoch": 0.0046555657288288145, "grad_norm": 4.040717124938965, "learning_rate": 4.99953599528236e-05, "loss": 1.7323, "step": 300 }, { "epoch": 0.006207420971771753, "grad_norm": 3.5554556846618652, "learning_rate": 4.999380809758066e-05, "loss": 1.685, "step": 400 }, { "epoch": 0.007759276214714691, "grad_norm": 3.334359884262085, "learning_rate": 4.999225624233772e-05, "loss": 1.6494, "step": 500 }, { "epoch": 0.009311131457657629, "grad_norm": 3.0186429023742676, "learning_rate": 4.9990704387094775e-05, "loss": 1.6367, "step": 600 }, { "epoch": 0.010862986700600569, "grad_norm": 3.666841506958008, "learning_rate": 4.998915253185183e-05, "loss": 1.6388, "step": 700 }, { "epoch": 0.012414841943543507, "grad_norm": 3.388770580291748, "learning_rate": 4.9987600676608884e-05, "loss": 1.5903, "step": 800 }, { "epoch": 0.013966697186486444, "grad_norm": 3.3341407775878906, "learning_rate": 4.998604882136594e-05, "loss": 1.5918, "step": 900 }, { "epoch": 0.015518552429429382, "grad_norm": 3.2501940727233887, "learning_rate": 4.9984496966123e-05, "loss": 1.557, "step": 1000 }, { "epoch": 0.01707040767237232, "grad_norm": 3.2041587829589844, "learning_rate": 4.998294511088006e-05, "loss": 1.5438, "step": 1100 }, { "epoch": 0.018622262915315258, "grad_norm": 3.018267869949341, "learning_rate": 4.9981393255637115e-05, "loss": 1.5246, "step": 1200 }, { "epoch": 0.020174118158258196, "grad_norm": 3.312126398086548, "learning_rate": 4.997984140039417e-05, "loss": 1.5262, "step": 1300 }, { "epoch": 0.021725973401201137, "grad_norm": 2.716785192489624, "learning_rate": 4.997828954515123e-05, "loss": 1.5058, "step": 1400 }, { "epoch": 0.023277828644144075, "grad_norm": 2.906491279602051, "learning_rate": 4.997673768990829e-05, "loss": 1.5188, "step": 1500 }, { "epoch": 0.024829683887087013, "grad_norm": 3.170187473297119, "learning_rate": 4.9975185834665346e-05, "loss": 1.5317, "step": 1600 }, { "epoch": 0.02638153913002995, "grad_norm": 2.971956968307495, "learning_rate": 4.9973633979422404e-05, "loss": 1.4999, "step": 1700 }, { "epoch": 0.02793339437297289, "grad_norm": 2.7195184230804443, "learning_rate": 4.997208212417946e-05, "loss": 1.4824, "step": 1800 }, { "epoch": 0.029485249615915827, "grad_norm": 3.998793125152588, "learning_rate": 4.997053026893652e-05, "loss": 1.4891, "step": 1900 }, { "epoch": 0.031037104858858765, "grad_norm": 3.1330697536468506, "learning_rate": 4.996897841369358e-05, "loss": 1.4763, "step": 2000 }, { "epoch": 0.032588960101801706, "grad_norm": 4.046468257904053, "learning_rate": 4.996742655845063e-05, "loss": 1.4514, "step": 2100 }, { "epoch": 0.03414081534474464, "grad_norm": 3.3552005290985107, "learning_rate": 4.9965874703207686e-05, "loss": 1.4633, "step": 2200 }, { "epoch": 0.03569267058768758, "grad_norm": 3.3621249198913574, "learning_rate": 4.996432284796474e-05, "loss": 1.4458, "step": 2300 }, { "epoch": 0.037244525830630516, "grad_norm": 3.8857712745666504, "learning_rate": 4.99627709927218e-05, "loss": 1.4464, "step": 2400 }, { "epoch": 0.03879638107357346, "grad_norm": 3.8447365760803223, "learning_rate": 4.996121913747886e-05, "loss": 1.4584, "step": 2500 }, { "epoch": 0.04034823631651639, "grad_norm": 3.052961587905884, "learning_rate": 4.9959667282235917e-05, "loss": 1.4558, "step": 2600 }, { "epoch": 0.04190009155945933, "grad_norm": 15.13032054901123, "learning_rate": 4.9958115426992974e-05, "loss": 1.4365, "step": 2700 }, { "epoch": 0.043451946802402275, "grad_norm": 3.53932785987854, "learning_rate": 4.995656357175003e-05, "loss": 1.4213, "step": 2800 }, { "epoch": 0.04500380204534521, "grad_norm": 3.461416244506836, "learning_rate": 4.995501171650709e-05, "loss": 1.4179, "step": 2900 }, { "epoch": 0.04655565728828815, "grad_norm": 3.9530489444732666, "learning_rate": 4.995345986126415e-05, "loss": 1.4265, "step": 3000 }, { "epoch": 0.048107512531231085, "grad_norm": 3.240056276321411, "learning_rate": 4.9951908006021205e-05, "loss": 1.4028, "step": 3100 }, { "epoch": 0.049659367774174026, "grad_norm": 3.736490249633789, "learning_rate": 4.995035615077826e-05, "loss": 1.3973, "step": 3200 }, { "epoch": 0.05121122301711696, "grad_norm": 3.3111519813537598, "learning_rate": 4.994880429553532e-05, "loss": 1.4156, "step": 3300 }, { "epoch": 0.0527630782600599, "grad_norm": 3.969008207321167, "learning_rate": 4.994725244029237e-05, "loss": 1.4056, "step": 3400 }, { "epoch": 0.05431493350300284, "grad_norm": 2.995002508163452, "learning_rate": 4.994570058504943e-05, "loss": 1.4122, "step": 3500 }, { "epoch": 0.05586678874594578, "grad_norm": 3.688455581665039, "learning_rate": 4.994414872980648e-05, "loss": 1.4197, "step": 3600 }, { "epoch": 0.05741864398888872, "grad_norm": 2.7155747413635254, "learning_rate": 4.994259687456354e-05, "loss": 1.3802, "step": 3700 }, { "epoch": 0.058970499231831654, "grad_norm": 2.8598766326904297, "learning_rate": 4.9941045019320596e-05, "loss": 1.3909, "step": 3800 }, { "epoch": 0.060522354474774595, "grad_norm": 2.5686147212982178, "learning_rate": 4.9939493164077654e-05, "loss": 1.402, "step": 3900 }, { "epoch": 0.06207420971771753, "grad_norm": 3.489210844039917, "learning_rate": 4.993794130883471e-05, "loss": 1.3909, "step": 4000 }, { "epoch": 0.06362606496066046, "grad_norm": 3.14016056060791, "learning_rate": 4.993638945359177e-05, "loss": 1.3955, "step": 4100 }, { "epoch": 0.06517792020360341, "grad_norm": 3.6264193058013916, "learning_rate": 4.993483759834883e-05, "loss": 1.3717, "step": 4200 }, { "epoch": 0.06672977544654635, "grad_norm": 2.6619110107421875, "learning_rate": 4.9933285743105885e-05, "loss": 1.3583, "step": 4300 }, { "epoch": 0.06828163068948928, "grad_norm": 3.4178626537323, "learning_rate": 4.993173388786294e-05, "loss": 1.3607, "step": 4400 }, { "epoch": 0.06983348593243223, "grad_norm": 3.5453226566314697, "learning_rate": 4.993018203262e-05, "loss": 1.3683, "step": 4500 }, { "epoch": 0.07138534117537516, "grad_norm": 2.7841808795928955, "learning_rate": 4.992863017737706e-05, "loss": 1.3532, "step": 4600 }, { "epoch": 0.0729371964183181, "grad_norm": 2.96911883354187, "learning_rate": 4.9927078322134116e-05, "loss": 1.36, "step": 4700 }, { "epoch": 0.07448905166126103, "grad_norm": 3.166738986968994, "learning_rate": 4.9925526466891174e-05, "loss": 1.3475, "step": 4800 }, { "epoch": 0.07604090690420398, "grad_norm": 2.981917381286621, "learning_rate": 4.9923974611648225e-05, "loss": 1.3656, "step": 4900 }, { "epoch": 0.07759276214714692, "grad_norm": 3.3914477825164795, "learning_rate": 4.992242275640528e-05, "loss": 1.3393, "step": 5000 }, { "epoch": 0.07914461739008985, "grad_norm": 3.348331928253174, "learning_rate": 4.992087090116234e-05, "loss": 1.3401, "step": 5100 }, { "epoch": 0.08069647263303278, "grad_norm": 3.9917657375335693, "learning_rate": 4.99193190459194e-05, "loss": 1.3356, "step": 5200 }, { "epoch": 0.08224832787597573, "grad_norm": 2.8803024291992188, "learning_rate": 4.9917767190676456e-05, "loss": 1.3556, "step": 5300 }, { "epoch": 0.08380018311891867, "grad_norm": 2.2128746509552, "learning_rate": 4.991621533543351e-05, "loss": 1.3314, "step": 5400 }, { "epoch": 0.0853520383618616, "grad_norm": 2.7473807334899902, "learning_rate": 4.991466348019057e-05, "loss": 1.3515, "step": 5500 }, { "epoch": 0.08690389360480455, "grad_norm": 3.842038869857788, "learning_rate": 4.991311162494763e-05, "loss": 1.3445, "step": 5600 }, { "epoch": 0.08845574884774748, "grad_norm": 2.893623113632202, "learning_rate": 4.9911559769704687e-05, "loss": 1.3438, "step": 5700 }, { "epoch": 0.09000760409069042, "grad_norm": 2.750718116760254, "learning_rate": 4.9910007914461744e-05, "loss": 1.3584, "step": 5800 }, { "epoch": 0.09155945933363335, "grad_norm": 2.7798516750335693, "learning_rate": 4.99084560592188e-05, "loss": 1.332, "step": 5900 }, { "epoch": 0.0931113145765763, "grad_norm": 2.9709818363189697, "learning_rate": 4.990690420397586e-05, "loss": 1.3265, "step": 6000 }, { "epoch": 0.09466316981951924, "grad_norm": 3.5495290756225586, "learning_rate": 4.990535234873292e-05, "loss": 1.3278, "step": 6100 }, { "epoch": 0.09621502506246217, "grad_norm": 2.8627617359161377, "learning_rate": 4.990380049348997e-05, "loss": 1.3152, "step": 6200 }, { "epoch": 0.09776688030540512, "grad_norm": 2.852050304412842, "learning_rate": 4.9902248638247026e-05, "loss": 1.3292, "step": 6300 }, { "epoch": 0.09931873554834805, "grad_norm": 3.29921293258667, "learning_rate": 4.9900696783004084e-05, "loss": 1.3308, "step": 6400 }, { "epoch": 0.10087059079129099, "grad_norm": 2.7258365154266357, "learning_rate": 4.989914492776114e-05, "loss": 1.3399, "step": 6500 }, { "epoch": 0.10242244603423392, "grad_norm": 2.5750861167907715, "learning_rate": 4.98975930725182e-05, "loss": 1.3075, "step": 6600 }, { "epoch": 0.10397430127717687, "grad_norm": 2.8146610260009766, "learning_rate": 4.989604121727525e-05, "loss": 1.3305, "step": 6700 }, { "epoch": 0.1055261565201198, "grad_norm": 2.5887491703033447, "learning_rate": 4.989448936203231e-05, "loss": 1.3231, "step": 6800 }, { "epoch": 0.10707801176306274, "grad_norm": 2.894197702407837, "learning_rate": 4.9892937506789366e-05, "loss": 1.3523, "step": 6900 }, { "epoch": 0.10862986700600567, "grad_norm": 2.6501145362854004, "learning_rate": 4.9891385651546424e-05, "loss": 1.3181, "step": 7000 }, { "epoch": 0.11018172224894862, "grad_norm": 2.640371799468994, "learning_rate": 4.988983379630348e-05, "loss": 1.2944, "step": 7100 }, { "epoch": 0.11173357749189156, "grad_norm": 2.6818764209747314, "learning_rate": 4.988828194106054e-05, "loss": 1.3006, "step": 7200 }, { "epoch": 0.11328543273483449, "grad_norm": 2.823805570602417, "learning_rate": 4.98867300858176e-05, "loss": 1.298, "step": 7300 }, { "epoch": 0.11483728797777744, "grad_norm": 8.9484281539917, "learning_rate": 4.9885178230574655e-05, "loss": 1.3129, "step": 7400 }, { "epoch": 0.11638914322072037, "grad_norm": 3.8987064361572266, "learning_rate": 4.988362637533171e-05, "loss": 1.2938, "step": 7500 }, { "epoch": 0.11794099846366331, "grad_norm": 3.1523895263671875, "learning_rate": 4.988207452008877e-05, "loss": 1.2802, "step": 7600 }, { "epoch": 0.11949285370660624, "grad_norm": 2.759547472000122, "learning_rate": 4.988052266484582e-05, "loss": 1.2949, "step": 7700 }, { "epoch": 0.12104470894954919, "grad_norm": 2.8270411491394043, "learning_rate": 4.987897080960288e-05, "loss": 1.2746, "step": 7800 }, { "epoch": 0.12259656419249212, "grad_norm": 3.1175129413604736, "learning_rate": 4.987741895435994e-05, "loss": 1.3144, "step": 7900 }, { "epoch": 0.12414841943543506, "grad_norm": 3.2414727210998535, "learning_rate": 4.9875867099116995e-05, "loss": 1.2801, "step": 8000 }, { "epoch": 0.125700274678378, "grad_norm": 3.0709431171417236, "learning_rate": 4.987431524387405e-05, "loss": 1.3275, "step": 8100 }, { "epoch": 0.12725212992132093, "grad_norm": 2.7845780849456787, "learning_rate": 4.987276338863111e-05, "loss": 1.2853, "step": 8200 }, { "epoch": 0.1288039851642639, "grad_norm": 3.027421712875366, "learning_rate": 4.987121153338817e-05, "loss": 1.3048, "step": 8300 }, { "epoch": 0.13035584040720682, "grad_norm": 3.0614867210388184, "learning_rate": 4.9869659678145226e-05, "loss": 1.2765, "step": 8400 }, { "epoch": 0.13190769565014976, "grad_norm": 2.841259241104126, "learning_rate": 4.986810782290228e-05, "loss": 1.298, "step": 8500 }, { "epoch": 0.1334595508930927, "grad_norm": 2.861579656600952, "learning_rate": 4.986655596765934e-05, "loss": 1.2838, "step": 8600 }, { "epoch": 0.13501140613603563, "grad_norm": 9.477072715759277, "learning_rate": 4.98650041124164e-05, "loss": 1.2718, "step": 8700 }, { "epoch": 0.13656326137897856, "grad_norm": 2.924326181411743, "learning_rate": 4.9863452257173457e-05, "loss": 1.292, "step": 8800 }, { "epoch": 0.1381151166219215, "grad_norm": 2.870556354522705, "learning_rate": 4.9861900401930514e-05, "loss": 1.2963, "step": 8900 }, { "epoch": 0.13966697186486446, "grad_norm": 3.039274215698242, "learning_rate": 4.9860348546687565e-05, "loss": 1.285, "step": 9000 }, { "epoch": 0.1412188271078074, "grad_norm": 2.7829339504241943, "learning_rate": 4.985879669144462e-05, "loss": 1.2799, "step": 9100 }, { "epoch": 0.14277068235075033, "grad_norm": 2.954824447631836, "learning_rate": 4.985724483620168e-05, "loss": 1.272, "step": 9200 }, { "epoch": 0.14432253759369326, "grad_norm": 2.7100729942321777, "learning_rate": 4.985569298095874e-05, "loss": 1.2611, "step": 9300 }, { "epoch": 0.1458743928366362, "grad_norm": 2.522920608520508, "learning_rate": 4.9854141125715796e-05, "loss": 1.2809, "step": 9400 }, { "epoch": 0.14742624807957913, "grad_norm": 2.7318480014801025, "learning_rate": 4.9852589270472854e-05, "loss": 1.2941, "step": 9500 }, { "epoch": 0.14897810332252207, "grad_norm": 2.396790027618408, "learning_rate": 4.985103741522991e-05, "loss": 1.2505, "step": 9600 }, { "epoch": 0.150529958565465, "grad_norm": 2.7038660049438477, "learning_rate": 4.984948555998697e-05, "loss": 1.289, "step": 9700 }, { "epoch": 0.15208181380840796, "grad_norm": 2.7483737468719482, "learning_rate": 4.984793370474403e-05, "loss": 1.2443, "step": 9800 }, { "epoch": 0.1536336690513509, "grad_norm": 2.591855525970459, "learning_rate": 4.9846381849501085e-05, "loss": 1.2408, "step": 9900 }, { "epoch": 0.15518552429429383, "grad_norm": 3.110058307647705, "learning_rate": 4.9844829994258136e-05, "loss": 1.2598, "step": 10000 }, { "epoch": 0.15673737953723677, "grad_norm": 2.8000688552856445, "learning_rate": 4.9843278139015194e-05, "loss": 1.2473, "step": 10100 }, { "epoch": 0.1582892347801797, "grad_norm": 2.793288230895996, "learning_rate": 4.984172628377225e-05, "loss": 1.2827, "step": 10200 }, { "epoch": 0.15984109002312263, "grad_norm": 2.920198678970337, "learning_rate": 4.984017442852931e-05, "loss": 1.2707, "step": 10300 }, { "epoch": 0.16139294526606557, "grad_norm": 48.21563720703125, "learning_rate": 4.983862257328637e-05, "loss": 1.2347, "step": 10400 }, { "epoch": 0.16294480050900853, "grad_norm": 3.2240467071533203, "learning_rate": 4.9837070718043425e-05, "loss": 1.2936, "step": 10500 }, { "epoch": 0.16449665575195146, "grad_norm": 3.241227149963379, "learning_rate": 4.9835518862800476e-05, "loss": 1.2432, "step": 10600 }, { "epoch": 0.1660485109948944, "grad_norm": 2.5175390243530273, "learning_rate": 4.9833967007557534e-05, "loss": 1.2699, "step": 10700 }, { "epoch": 0.16760036623783733, "grad_norm": 2.6832706928253174, "learning_rate": 4.983241515231459e-05, "loss": 1.2456, "step": 10800 }, { "epoch": 0.16915222148078027, "grad_norm": 2.9327988624572754, "learning_rate": 4.983086329707165e-05, "loss": 1.2676, "step": 10900 }, { "epoch": 0.1707040767237232, "grad_norm": 3.1567325592041016, "learning_rate": 4.982931144182871e-05, "loss": 1.2721, "step": 11000 }, { "epoch": 0.17225593196666614, "grad_norm": 2.5393593311309814, "learning_rate": 4.9827759586585765e-05, "loss": 1.2699, "step": 11100 }, { "epoch": 0.1738077872096091, "grad_norm": 4.114811897277832, "learning_rate": 4.982620773134282e-05, "loss": 1.275, "step": 11200 }, { "epoch": 0.17535964245255203, "grad_norm": 2.9810516834259033, "learning_rate": 4.982465587609988e-05, "loss": 1.2433, "step": 11300 }, { "epoch": 0.17691149769549497, "grad_norm": 2.4732532501220703, "learning_rate": 4.982310402085694e-05, "loss": 1.2375, "step": 11400 }, { "epoch": 0.1784633529384379, "grad_norm": 2.371781826019287, "learning_rate": 4.9821552165613996e-05, "loss": 1.2318, "step": 11500 }, { "epoch": 0.18001520818138084, "grad_norm": 2.9060535430908203, "learning_rate": 4.982000031037105e-05, "loss": 1.2525, "step": 11600 }, { "epoch": 0.18156706342432377, "grad_norm": 2.883779287338257, "learning_rate": 4.981844845512811e-05, "loss": 1.2312, "step": 11700 }, { "epoch": 0.1831189186672667, "grad_norm": 2.6672604084014893, "learning_rate": 4.981689659988517e-05, "loss": 1.2144, "step": 11800 }, { "epoch": 0.18467077391020967, "grad_norm": 2.889042615890503, "learning_rate": 4.981534474464222e-05, "loss": 1.2627, "step": 11900 }, { "epoch": 0.1862226291531526, "grad_norm": 2.337663173675537, "learning_rate": 4.981379288939928e-05, "loss": 1.2533, "step": 12000 }, { "epoch": 0.18777448439609554, "grad_norm": 2.6485629081726074, "learning_rate": 4.9812241034156335e-05, "loss": 1.2386, "step": 12100 }, { "epoch": 0.18932633963903847, "grad_norm": 2.67083740234375, "learning_rate": 4.981068917891339e-05, "loss": 1.2387, "step": 12200 }, { "epoch": 0.1908781948819814, "grad_norm": 2.9738054275512695, "learning_rate": 4.980913732367045e-05, "loss": 1.2386, "step": 12300 }, { "epoch": 0.19243005012492434, "grad_norm": 2.654750347137451, "learning_rate": 4.980758546842751e-05, "loss": 1.2308, "step": 12400 }, { "epoch": 0.19398190536786727, "grad_norm": 2.7812435626983643, "learning_rate": 4.9806033613184566e-05, "loss": 1.2366, "step": 12500 }, { "epoch": 0.19553376061081024, "grad_norm": 2.52889347076416, "learning_rate": 4.9804481757941624e-05, "loss": 1.2362, "step": 12600 }, { "epoch": 0.19708561585375317, "grad_norm": 2.2839808464050293, "learning_rate": 4.980292990269868e-05, "loss": 1.2229, "step": 12700 }, { "epoch": 0.1986374710966961, "grad_norm": 2.951615571975708, "learning_rate": 4.980137804745574e-05, "loss": 1.2331, "step": 12800 }, { "epoch": 0.20018932633963904, "grad_norm": 2.770116090774536, "learning_rate": 4.97998261922128e-05, "loss": 1.2415, "step": 12900 }, { "epoch": 0.20174118158258197, "grad_norm": 4.46579647064209, "learning_rate": 4.9798274336969855e-05, "loss": 1.229, "step": 13000 }, { "epoch": 0.2032930368255249, "grad_norm": 2.887786865234375, "learning_rate": 4.979672248172691e-05, "loss": 1.2132, "step": 13100 }, { "epoch": 0.20484489206846784, "grad_norm": 2.517394542694092, "learning_rate": 4.9795170626483964e-05, "loss": 1.2195, "step": 13200 }, { "epoch": 0.2063967473114108, "grad_norm": 2.68794322013855, "learning_rate": 4.979361877124102e-05, "loss": 1.2334, "step": 13300 }, { "epoch": 0.20794860255435374, "grad_norm": 2.680004119873047, "learning_rate": 4.979206691599807e-05, "loss": 1.204, "step": 13400 }, { "epoch": 0.20950045779729667, "grad_norm": 2.5410916805267334, "learning_rate": 4.979051506075513e-05, "loss": 1.2093, "step": 13500 }, { "epoch": 0.2110523130402396, "grad_norm": 2.6248254776000977, "learning_rate": 4.978896320551219e-05, "loss": 1.2395, "step": 13600 }, { "epoch": 0.21260416828318254, "grad_norm": 2.945784330368042, "learning_rate": 4.9787411350269246e-05, "loss": 1.223, "step": 13700 }, { "epoch": 0.21415602352612548, "grad_norm": 2.2903714179992676, "learning_rate": 4.9785859495026304e-05, "loss": 1.2346, "step": 13800 }, { "epoch": 0.2157078787690684, "grad_norm": 2.4133968353271484, "learning_rate": 4.978430763978336e-05, "loss": 1.245, "step": 13900 }, { "epoch": 0.21725973401201135, "grad_norm": 2.8150992393493652, "learning_rate": 4.978275578454042e-05, "loss": 1.2469, "step": 14000 }, { "epoch": 0.2188115892549543, "grad_norm": 2.829127788543701, "learning_rate": 4.978120392929748e-05, "loss": 1.2432, "step": 14100 }, { "epoch": 0.22036344449789724, "grad_norm": 2.4632649421691895, "learning_rate": 4.9779652074054535e-05, "loss": 1.224, "step": 14200 }, { "epoch": 0.22191529974084018, "grad_norm": 2.9762189388275146, "learning_rate": 4.977810021881159e-05, "loss": 1.239, "step": 14300 }, { "epoch": 0.2234671549837831, "grad_norm": 2.984637498855591, "learning_rate": 4.977654836356865e-05, "loss": 1.2047, "step": 14400 }, { "epoch": 0.22501901022672605, "grad_norm": 2.882006883621216, "learning_rate": 4.977499650832571e-05, "loss": 1.2193, "step": 14500 }, { "epoch": 0.22657086546966898, "grad_norm": 2.9959402084350586, "learning_rate": 4.9773444653082766e-05, "loss": 1.2037, "step": 14600 }, { "epoch": 0.22812272071261191, "grad_norm": 2.7421374320983887, "learning_rate": 4.9771892797839817e-05, "loss": 1.2136, "step": 14700 }, { "epoch": 0.22967457595555488, "grad_norm": 2.8233425617218018, "learning_rate": 4.9770340942596874e-05, "loss": 1.2309, "step": 14800 }, { "epoch": 0.2312264311984978, "grad_norm": 2.753596067428589, "learning_rate": 4.976878908735393e-05, "loss": 1.2316, "step": 14900 }, { "epoch": 0.23277828644144075, "grad_norm": 2.545809507369995, "learning_rate": 4.976723723211099e-05, "loss": 1.208, "step": 15000 }, { "epoch": 0.23433014168438368, "grad_norm": 2.952648401260376, "learning_rate": 4.976568537686805e-05, "loss": 1.2229, "step": 15100 }, { "epoch": 0.23588199692732661, "grad_norm": 3.081340789794922, "learning_rate": 4.9764133521625105e-05, "loss": 1.2068, "step": 15200 }, { "epoch": 0.23743385217026955, "grad_norm": 2.7662861347198486, "learning_rate": 4.976258166638216e-05, "loss": 1.2197, "step": 15300 }, { "epoch": 0.23898570741321248, "grad_norm": 2.161072254180908, "learning_rate": 4.976102981113922e-05, "loss": 1.2117, "step": 15400 }, { "epoch": 0.24053756265615545, "grad_norm": 3.2673802375793457, "learning_rate": 4.975947795589628e-05, "loss": 1.2018, "step": 15500 }, { "epoch": 0.24208941789909838, "grad_norm": 2.9241068363189697, "learning_rate": 4.9757926100653336e-05, "loss": 1.2103, "step": 15600 }, { "epoch": 0.24364127314204131, "grad_norm": 2.9034130573272705, "learning_rate": 4.9756374245410394e-05, "loss": 1.233, "step": 15700 }, { "epoch": 0.24519312838498425, "grad_norm": 2.767993211746216, "learning_rate": 4.975482239016745e-05, "loss": 1.2037, "step": 15800 }, { "epoch": 0.24674498362792718, "grad_norm": 2.975708484649658, "learning_rate": 4.975327053492451e-05, "loss": 1.2133, "step": 15900 }, { "epoch": 0.24829683887087012, "grad_norm": 2.7778007984161377, "learning_rate": 4.975171867968156e-05, "loss": 1.2315, "step": 16000 }, { "epoch": 0.24984869411381305, "grad_norm": 3.052387237548828, "learning_rate": 4.975016682443862e-05, "loss": 1.2295, "step": 16100 }, { "epoch": 0.251400549356756, "grad_norm": 2.9833662509918213, "learning_rate": 4.9748614969195676e-05, "loss": 1.2228, "step": 16200 }, { "epoch": 0.2529524045996989, "grad_norm": 2.916350841522217, "learning_rate": 4.9747063113952734e-05, "loss": 1.2129, "step": 16300 }, { "epoch": 0.25450425984264186, "grad_norm": 2.476494312286377, "learning_rate": 4.974551125870979e-05, "loss": 1.2301, "step": 16400 }, { "epoch": 0.2560561150855848, "grad_norm": 2.690457344055176, "learning_rate": 4.974395940346684e-05, "loss": 1.2177, "step": 16500 }, { "epoch": 0.2576079703285278, "grad_norm": 2.43115496635437, "learning_rate": 4.97424075482239e-05, "loss": 1.2253, "step": 16600 }, { "epoch": 0.2591598255714707, "grad_norm": 2.7018213272094727, "learning_rate": 4.974085569298096e-05, "loss": 1.2123, "step": 16700 }, { "epoch": 0.26071168081441365, "grad_norm": 2.511507987976074, "learning_rate": 4.9739303837738016e-05, "loss": 1.2142, "step": 16800 }, { "epoch": 0.2622635360573566, "grad_norm": 3.345106363296509, "learning_rate": 4.9737751982495074e-05, "loss": 1.1978, "step": 16900 }, { "epoch": 0.2638153913002995, "grad_norm": 2.168949842453003, "learning_rate": 4.973620012725213e-05, "loss": 1.1957, "step": 17000 }, { "epoch": 0.26536724654324245, "grad_norm": 4.0209879875183105, "learning_rate": 4.973464827200919e-05, "loss": 1.2217, "step": 17100 }, { "epoch": 0.2669191017861854, "grad_norm": 2.452793836593628, "learning_rate": 4.973309641676625e-05, "loss": 1.1888, "step": 17200 }, { "epoch": 0.2684709570291283, "grad_norm": 2.43424916267395, "learning_rate": 4.9731544561523305e-05, "loss": 1.2125, "step": 17300 }, { "epoch": 0.27002281227207126, "grad_norm": 2.4367752075195312, "learning_rate": 4.972999270628036e-05, "loss": 1.2021, "step": 17400 }, { "epoch": 0.2715746675150142, "grad_norm": 3.776613235473633, "learning_rate": 4.972844085103742e-05, "loss": 1.2292, "step": 17500 }, { "epoch": 0.2731265227579571, "grad_norm": 2.380244255065918, "learning_rate": 4.972688899579447e-05, "loss": 1.1997, "step": 17600 }, { "epoch": 0.27467837800090006, "grad_norm": 2.215346097946167, "learning_rate": 4.972533714055153e-05, "loss": 1.1902, "step": 17700 }, { "epoch": 0.276230233243843, "grad_norm": 2.5526316165924072, "learning_rate": 4.9723785285308587e-05, "loss": 1.202, "step": 17800 }, { "epoch": 0.2777820884867859, "grad_norm": 2.6490514278411865, "learning_rate": 4.9722233430065644e-05, "loss": 1.1679, "step": 17900 }, { "epoch": 0.2793339437297289, "grad_norm": 2.5968635082244873, "learning_rate": 4.97206815748227e-05, "loss": 1.1953, "step": 18000 }, { "epoch": 0.28088579897267185, "grad_norm": 2.3844001293182373, "learning_rate": 4.971912971957976e-05, "loss": 1.1978, "step": 18100 }, { "epoch": 0.2824376542156148, "grad_norm": 2.9162464141845703, "learning_rate": 4.971757786433682e-05, "loss": 1.1812, "step": 18200 }, { "epoch": 0.2839895094585577, "grad_norm": 2.8039488792419434, "learning_rate": 4.9716026009093875e-05, "loss": 1.2201, "step": 18300 }, { "epoch": 0.28554136470150066, "grad_norm": 2.6949081420898438, "learning_rate": 4.971447415385093e-05, "loss": 1.1923, "step": 18400 }, { "epoch": 0.2870932199444436, "grad_norm": 3.427868127822876, "learning_rate": 4.971292229860799e-05, "loss": 1.1928, "step": 18500 }, { "epoch": 0.2886450751873865, "grad_norm": 2.7664973735809326, "learning_rate": 4.971137044336505e-05, "loss": 1.1776, "step": 18600 }, { "epoch": 0.29019693043032946, "grad_norm": 2.5804457664489746, "learning_rate": 4.9709818588122106e-05, "loss": 1.1825, "step": 18700 }, { "epoch": 0.2917487856732724, "grad_norm": 2.0826754570007324, "learning_rate": 4.9708266732879164e-05, "loss": 1.2046, "step": 18800 }, { "epoch": 0.2933006409162153, "grad_norm": 2.0791783332824707, "learning_rate": 4.9706714877636215e-05, "loss": 1.1936, "step": 18900 }, { "epoch": 0.29485249615915826, "grad_norm": 2.9935243129730225, "learning_rate": 4.970516302239327e-05, "loss": 1.2117, "step": 19000 }, { "epoch": 0.2964043514021012, "grad_norm": 2.7947821617126465, "learning_rate": 4.970361116715033e-05, "loss": 1.1847, "step": 19100 }, { "epoch": 0.29795620664504413, "grad_norm": 3.322248697280884, "learning_rate": 4.970205931190739e-05, "loss": 1.1828, "step": 19200 }, { "epoch": 0.29950806188798706, "grad_norm": 2.94177508354187, "learning_rate": 4.9700507456664446e-05, "loss": 1.169, "step": 19300 }, { "epoch": 0.30105991713093, "grad_norm": 2.214639902114868, "learning_rate": 4.9698955601421504e-05, "loss": 1.1985, "step": 19400 }, { "epoch": 0.302611772373873, "grad_norm": 2.946751117706299, "learning_rate": 4.969740374617856e-05, "loss": 1.2027, "step": 19500 }, { "epoch": 0.3041636276168159, "grad_norm": 2.311455011367798, "learning_rate": 4.969585189093562e-05, "loss": 1.1907, "step": 19600 }, { "epoch": 0.30571548285975886, "grad_norm": 2.5364186763763428, "learning_rate": 4.969430003569267e-05, "loss": 1.172, "step": 19700 }, { "epoch": 0.3072673381027018, "grad_norm": 3.111441135406494, "learning_rate": 4.969274818044973e-05, "loss": 1.2063, "step": 19800 }, { "epoch": 0.3088191933456447, "grad_norm": 2.5485990047454834, "learning_rate": 4.9691196325206786e-05, "loss": 1.1993, "step": 19900 }, { "epoch": 0.31037104858858766, "grad_norm": 2.9023211002349854, "learning_rate": 4.9689644469963844e-05, "loss": 1.1784, "step": 20000 }, { "epoch": 0.3119229038315306, "grad_norm": 2.788357973098755, "learning_rate": 4.96880926147209e-05, "loss": 1.2242, "step": 20100 }, { "epoch": 0.31347475907447353, "grad_norm": 2.841526746749878, "learning_rate": 4.968654075947796e-05, "loss": 1.182, "step": 20200 }, { "epoch": 0.31502661431741646, "grad_norm": 2.5089986324310303, "learning_rate": 4.968498890423502e-05, "loss": 1.2189, "step": 20300 }, { "epoch": 0.3165784695603594, "grad_norm": 2.714428424835205, "learning_rate": 4.968343704899207e-05, "loss": 1.2012, "step": 20400 }, { "epoch": 0.31813032480330233, "grad_norm": 2.687601327896118, "learning_rate": 4.9681885193749126e-05, "loss": 1.1655, "step": 20500 }, { "epoch": 0.31968218004624527, "grad_norm": 3.012909412384033, "learning_rate": 4.968033333850618e-05, "loss": 1.1781, "step": 20600 }, { "epoch": 0.3212340352891882, "grad_norm": 2.6074788570404053, "learning_rate": 4.967878148326324e-05, "loss": 1.1976, "step": 20700 }, { "epoch": 0.32278589053213114, "grad_norm": 2.6663897037506104, "learning_rate": 4.96772296280203e-05, "loss": 1.1855, "step": 20800 }, { "epoch": 0.3243377457750741, "grad_norm": 1.7635912895202637, "learning_rate": 4.9675677772777357e-05, "loss": 1.1823, "step": 20900 }, { "epoch": 0.32588960101801706, "grad_norm": 2.90279483795166, "learning_rate": 4.9674125917534414e-05, "loss": 1.2018, "step": 21000 }, { "epoch": 0.32744145626096, "grad_norm": 2.875469923019409, "learning_rate": 4.967257406229147e-05, "loss": 1.1677, "step": 21100 }, { "epoch": 0.32899331150390293, "grad_norm": 2.621666193008423, "learning_rate": 4.967102220704853e-05, "loss": 1.177, "step": 21200 }, { "epoch": 0.33054516674684586, "grad_norm": 2.2737293243408203, "learning_rate": 4.966947035180559e-05, "loss": 1.1937, "step": 21300 }, { "epoch": 0.3320970219897888, "grad_norm": 2.7947998046875, "learning_rate": 4.9667918496562645e-05, "loss": 1.1636, "step": 21400 }, { "epoch": 0.33364887723273173, "grad_norm": 3.27683162689209, "learning_rate": 4.96663666413197e-05, "loss": 1.1737, "step": 21500 }, { "epoch": 0.33520073247567467, "grad_norm": 26.9603328704834, "learning_rate": 4.966481478607676e-05, "loss": 1.1693, "step": 21600 }, { "epoch": 0.3367525877186176, "grad_norm": 2.9683430194854736, "learning_rate": 4.966326293083381e-05, "loss": 1.1793, "step": 21700 }, { "epoch": 0.33830444296156054, "grad_norm": 2.7414071559906006, "learning_rate": 4.966171107559087e-05, "loss": 1.1704, "step": 21800 }, { "epoch": 0.33985629820450347, "grad_norm": 2.71217679977417, "learning_rate": 4.966015922034793e-05, "loss": 1.1848, "step": 21900 }, { "epoch": 0.3414081534474464, "grad_norm": 2.903348684310913, "learning_rate": 4.9658607365104985e-05, "loss": 1.175, "step": 22000 }, { "epoch": 0.34296000869038934, "grad_norm": 2.9217734336853027, "learning_rate": 4.965705550986204e-05, "loss": 1.192, "step": 22100 }, { "epoch": 0.3445118639333323, "grad_norm": 2.362107992172241, "learning_rate": 4.96555036546191e-05, "loss": 1.1597, "step": 22200 }, { "epoch": 0.34606371917627526, "grad_norm": 2.730642557144165, "learning_rate": 4.965395179937616e-05, "loss": 1.171, "step": 22300 }, { "epoch": 0.3476155744192182, "grad_norm": 3.0440003871917725, "learning_rate": 4.9652399944133216e-05, "loss": 1.1621, "step": 22400 }, { "epoch": 0.34916742966216113, "grad_norm": 2.230546712875366, "learning_rate": 4.9650848088890274e-05, "loss": 1.172, "step": 22500 }, { "epoch": 0.35071928490510407, "grad_norm": 2.7254559993743896, "learning_rate": 4.964929623364733e-05, "loss": 1.1925, "step": 22600 }, { "epoch": 0.352271140148047, "grad_norm": 2.2841336727142334, "learning_rate": 4.964774437840439e-05, "loss": 1.1719, "step": 22700 }, { "epoch": 0.35382299539098994, "grad_norm": 2.6718087196350098, "learning_rate": 4.964619252316145e-05, "loss": 1.1562, "step": 22800 }, { "epoch": 0.35537485063393287, "grad_norm": 2.660179615020752, "learning_rate": 4.9644640667918505e-05, "loss": 1.1798, "step": 22900 }, { "epoch": 0.3569267058768758, "grad_norm": 2.939154863357544, "learning_rate": 4.9643088812675556e-05, "loss": 1.1639, "step": 23000 }, { "epoch": 0.35847856111981874, "grad_norm": 2.88409423828125, "learning_rate": 4.9641536957432614e-05, "loss": 1.1984, "step": 23100 }, { "epoch": 0.3600304163627617, "grad_norm": 2.2982733249664307, "learning_rate": 4.9639985102189665e-05, "loss": 1.1564, "step": 23200 }, { "epoch": 0.3615822716057046, "grad_norm": 3.5217339992523193, "learning_rate": 4.963843324694672e-05, "loss": 1.1676, "step": 23300 }, { "epoch": 0.36313412684864754, "grad_norm": 2.7142276763916016, "learning_rate": 4.963688139170378e-05, "loss": 1.1563, "step": 23400 }, { "epoch": 0.3646859820915905, "grad_norm": 2.4074008464813232, "learning_rate": 4.963532953646084e-05, "loss": 1.1753, "step": 23500 }, { "epoch": 0.3662378373345334, "grad_norm": 2.583975076675415, "learning_rate": 4.9633777681217896e-05, "loss": 1.1711, "step": 23600 }, { "epoch": 0.36778969257747635, "grad_norm": 2.313282012939453, "learning_rate": 4.963222582597495e-05, "loss": 1.1722, "step": 23700 }, { "epoch": 0.36934154782041934, "grad_norm": 2.736443519592285, "learning_rate": 4.963067397073201e-05, "loss": 1.1558, "step": 23800 }, { "epoch": 0.37089340306336227, "grad_norm": 2.2908873558044434, "learning_rate": 4.962912211548907e-05, "loss": 1.1671, "step": 23900 }, { "epoch": 0.3724452583063052, "grad_norm": 2.5124282836914062, "learning_rate": 4.9627570260246127e-05, "loss": 1.1858, "step": 24000 }, { "epoch": 0.37399711354924814, "grad_norm": 2.4411725997924805, "learning_rate": 4.9626018405003184e-05, "loss": 1.1705, "step": 24100 }, { "epoch": 0.3755489687921911, "grad_norm": 2.7962143421173096, "learning_rate": 4.962446654976024e-05, "loss": 1.173, "step": 24200 }, { "epoch": 0.377100824035134, "grad_norm": 3.041498899459839, "learning_rate": 4.96229146945173e-05, "loss": 1.199, "step": 24300 }, { "epoch": 0.37865267927807694, "grad_norm": 2.606754779815674, "learning_rate": 4.962136283927436e-05, "loss": 1.1739, "step": 24400 }, { "epoch": 0.3802045345210199, "grad_norm": 2.7982420921325684, "learning_rate": 4.961981098403141e-05, "loss": 1.16, "step": 24500 }, { "epoch": 0.3817563897639628, "grad_norm": 2.406071901321411, "learning_rate": 4.9618259128788466e-05, "loss": 1.1612, "step": 24600 }, { "epoch": 0.38330824500690575, "grad_norm": 2.8252360820770264, "learning_rate": 4.9616707273545524e-05, "loss": 1.1523, "step": 24700 }, { "epoch": 0.3848601002498487, "grad_norm": 3.584843158721924, "learning_rate": 4.961515541830258e-05, "loss": 1.1571, "step": 24800 }, { "epoch": 0.3864119554927916, "grad_norm": 2.9453184604644775, "learning_rate": 4.961360356305964e-05, "loss": 1.158, "step": 24900 }, { "epoch": 0.38796381073573455, "grad_norm": 3.0717360973358154, "learning_rate": 4.96120517078167e-05, "loss": 1.133, "step": 25000 }, { "epoch": 0.3895156659786775, "grad_norm": 2.912917137145996, "learning_rate": 4.9610499852573755e-05, "loss": 1.1833, "step": 25100 }, { "epoch": 0.3910675212216205, "grad_norm": 2.908015489578247, "learning_rate": 4.960894799733081e-05, "loss": 1.1832, "step": 25200 }, { "epoch": 0.3926193764645634, "grad_norm": 2.8290598392486572, "learning_rate": 4.960739614208787e-05, "loss": 1.1833, "step": 25300 }, { "epoch": 0.39417123170750634, "grad_norm": 2.5732614994049072, "learning_rate": 4.960584428684493e-05, "loss": 1.1492, "step": 25400 }, { "epoch": 0.3957230869504493, "grad_norm": 2.2321372032165527, "learning_rate": 4.9604292431601986e-05, "loss": 1.159, "step": 25500 }, { "epoch": 0.3972749421933922, "grad_norm": 2.465810775756836, "learning_rate": 4.9602740576359044e-05, "loss": 1.1573, "step": 25600 }, { "epoch": 0.39882679743633515, "grad_norm": 2.4903128147125244, "learning_rate": 4.96011887211161e-05, "loss": 1.1616, "step": 25700 }, { "epoch": 0.4003786526792781, "grad_norm": 2.489631414413452, "learning_rate": 4.959963686587315e-05, "loss": 1.1683, "step": 25800 }, { "epoch": 0.401930507922221, "grad_norm": 2.5110340118408203, "learning_rate": 4.959808501063021e-05, "loss": 1.1402, "step": 25900 }, { "epoch": 0.40348236316516395, "grad_norm": 2.531416893005371, "learning_rate": 4.959653315538727e-05, "loss": 1.1502, "step": 26000 }, { "epoch": 0.4050342184081069, "grad_norm": 2.8293380737304688, "learning_rate": 4.9594981300144326e-05, "loss": 1.1535, "step": 26100 }, { "epoch": 0.4065860736510498, "grad_norm": 2.5488884449005127, "learning_rate": 4.959342944490138e-05, "loss": 1.1686, "step": 26200 }, { "epoch": 0.40813792889399275, "grad_norm": 2.620323657989502, "learning_rate": 4.9591877589658435e-05, "loss": 1.1791, "step": 26300 }, { "epoch": 0.4096897841369357, "grad_norm": 2.8866984844207764, "learning_rate": 4.959032573441549e-05, "loss": 1.1604, "step": 26400 }, { "epoch": 0.4112416393798786, "grad_norm": 2.5552427768707275, "learning_rate": 4.958877387917255e-05, "loss": 1.1514, "step": 26500 }, { "epoch": 0.4127934946228216, "grad_norm": 2.891822099685669, "learning_rate": 4.958722202392961e-05, "loss": 1.1579, "step": 26600 }, { "epoch": 0.41434534986576455, "grad_norm": 2.2839558124542236, "learning_rate": 4.9585670168686666e-05, "loss": 1.1527, "step": 26700 }, { "epoch": 0.4158972051087075, "grad_norm": 2.4230873584747314, "learning_rate": 4.958411831344372e-05, "loss": 1.1567, "step": 26800 }, { "epoch": 0.4174490603516504, "grad_norm": 2.495814323425293, "learning_rate": 4.958256645820078e-05, "loss": 1.1679, "step": 26900 }, { "epoch": 0.41900091559459335, "grad_norm": 2.8364436626434326, "learning_rate": 4.958101460295784e-05, "loss": 1.1509, "step": 27000 }, { "epoch": 0.4205527708375363, "grad_norm": 2.4355123043060303, "learning_rate": 4.9579462747714897e-05, "loss": 1.1582, "step": 27100 }, { "epoch": 0.4221046260804792, "grad_norm": 2.454505443572998, "learning_rate": 4.9577910892471954e-05, "loss": 1.1545, "step": 27200 }, { "epoch": 0.42365648132342215, "grad_norm": 2.3640785217285156, "learning_rate": 4.957635903722901e-05, "loss": 1.1564, "step": 27300 }, { "epoch": 0.4252083365663651, "grad_norm": 2.8007636070251465, "learning_rate": 4.957480718198606e-05, "loss": 1.1349, "step": 27400 }, { "epoch": 0.426760191809308, "grad_norm": 2.4087722301483154, "learning_rate": 4.957325532674312e-05, "loss": 1.1556, "step": 27500 }, { "epoch": 0.42831204705225095, "grad_norm": 2.6242592334747314, "learning_rate": 4.957170347150018e-05, "loss": 1.1377, "step": 27600 }, { "epoch": 0.4298639022951939, "grad_norm": 2.378854513168335, "learning_rate": 4.9570151616257236e-05, "loss": 1.158, "step": 27700 }, { "epoch": 0.4314157575381368, "grad_norm": 2.6766738891601562, "learning_rate": 4.9568599761014294e-05, "loss": 1.1671, "step": 27800 }, { "epoch": 0.43296761278107976, "grad_norm": 2.683157444000244, "learning_rate": 4.956704790577135e-05, "loss": 1.1448, "step": 27900 }, { "epoch": 0.4345194680240227, "grad_norm": 2.928833484649658, "learning_rate": 4.956549605052841e-05, "loss": 1.1536, "step": 28000 }, { "epoch": 0.4360713232669657, "grad_norm": 2.8657729625701904, "learning_rate": 4.956394419528547e-05, "loss": 1.141, "step": 28100 }, { "epoch": 0.4376231785099086, "grad_norm": 2.5009307861328125, "learning_rate": 4.9562392340042525e-05, "loss": 1.1488, "step": 28200 }, { "epoch": 0.43917503375285155, "grad_norm": 2.4944400787353516, "learning_rate": 4.956084048479958e-05, "loss": 1.1624, "step": 28300 }, { "epoch": 0.4407268889957945, "grad_norm": 2.1814956665039062, "learning_rate": 4.955928862955664e-05, "loss": 1.1487, "step": 28400 }, { "epoch": 0.4422787442387374, "grad_norm": 3.0433692932128906, "learning_rate": 4.95577367743137e-05, "loss": 1.1624, "step": 28500 }, { "epoch": 0.44383059948168035, "grad_norm": 2.600383758544922, "learning_rate": 4.9556184919070756e-05, "loss": 1.1378, "step": 28600 }, { "epoch": 0.4453824547246233, "grad_norm": 2.874927043914795, "learning_rate": 4.955463306382781e-05, "loss": 1.1637, "step": 28700 }, { "epoch": 0.4469343099675662, "grad_norm": 2.6033880710601807, "learning_rate": 4.9553081208584865e-05, "loss": 1.1498, "step": 28800 }, { "epoch": 0.44848616521050916, "grad_norm": 2.7181801795959473, "learning_rate": 4.955152935334192e-05, "loss": 1.1433, "step": 28900 }, { "epoch": 0.4500380204534521, "grad_norm": 3.674260139465332, "learning_rate": 4.954997749809898e-05, "loss": 1.142, "step": 29000 }, { "epoch": 0.451589875696395, "grad_norm": 2.7757997512817383, "learning_rate": 4.954842564285604e-05, "loss": 1.1451, "step": 29100 }, { "epoch": 0.45314173093933796, "grad_norm": 2.6837732791900635, "learning_rate": 4.9546873787613096e-05, "loss": 1.1456, "step": 29200 }, { "epoch": 0.4546935861822809, "grad_norm": 2.720156192779541, "learning_rate": 4.9545321932370154e-05, "loss": 1.1625, "step": 29300 }, { "epoch": 0.45624544142522383, "grad_norm": 2.381976842880249, "learning_rate": 4.954377007712721e-05, "loss": 1.178, "step": 29400 }, { "epoch": 0.4577972966681668, "grad_norm": 2.576415538787842, "learning_rate": 4.954221822188426e-05, "loss": 1.1538, "step": 29500 }, { "epoch": 0.45934915191110975, "grad_norm": 3.0559499263763428, "learning_rate": 4.954066636664132e-05, "loss": 1.1645, "step": 29600 }, { "epoch": 0.4609010071540527, "grad_norm": 3.302246332168579, "learning_rate": 4.953911451139838e-05, "loss": 1.1166, "step": 29700 }, { "epoch": 0.4624528623969956, "grad_norm": 1.8563324213027954, "learning_rate": 4.9537562656155436e-05, "loss": 1.132, "step": 29800 }, { "epoch": 0.46400471763993856, "grad_norm": 2.5362658500671387, "learning_rate": 4.953601080091249e-05, "loss": 1.1632, "step": 29900 }, { "epoch": 0.4655565728828815, "grad_norm": 3.1611690521240234, "learning_rate": 4.953445894566955e-05, "loss": 1.1381, "step": 30000 }, { "epoch": 0.4671084281258244, "grad_norm": 2.5425455570220947, "learning_rate": 4.953290709042661e-05, "loss": 1.1347, "step": 30100 }, { "epoch": 0.46866028336876736, "grad_norm": 2.4715189933776855, "learning_rate": 4.953135523518366e-05, "loss": 1.1552, "step": 30200 }, { "epoch": 0.4702121386117103, "grad_norm": 2.5517332553863525, "learning_rate": 4.952980337994072e-05, "loss": 1.1627, "step": 30300 }, { "epoch": 0.47176399385465323, "grad_norm": 2.395547389984131, "learning_rate": 4.9528251524697775e-05, "loss": 1.1632, "step": 30400 }, { "epoch": 0.47331584909759616, "grad_norm": 2.9471380710601807, "learning_rate": 4.952669966945483e-05, "loss": 1.1188, "step": 30500 }, { "epoch": 0.4748677043405391, "grad_norm": 2.8382833003997803, "learning_rate": 4.952514781421189e-05, "loss": 1.152, "step": 30600 }, { "epoch": 0.47641955958348203, "grad_norm": 2.099393844604492, "learning_rate": 4.952359595896895e-05, "loss": 1.1497, "step": 30700 }, { "epoch": 0.47797141482642497, "grad_norm": 2.370145559310913, "learning_rate": 4.9522044103726006e-05, "loss": 1.1263, "step": 30800 }, { "epoch": 0.4795232700693679, "grad_norm": 2.344712495803833, "learning_rate": 4.9520492248483064e-05, "loss": 1.1525, "step": 30900 }, { "epoch": 0.4810751253123109, "grad_norm": 2.09525203704834, "learning_rate": 4.951894039324012e-05, "loss": 1.1598, "step": 31000 }, { "epoch": 0.4826269805552538, "grad_norm": 2.7542600631713867, "learning_rate": 4.951738853799718e-05, "loss": 1.1388, "step": 31100 }, { "epoch": 0.48417883579819676, "grad_norm": 2.4297680854797363, "learning_rate": 4.951583668275424e-05, "loss": 1.1593, "step": 31200 }, { "epoch": 0.4857306910411397, "grad_norm": 2.206874132156372, "learning_rate": 4.9514284827511295e-05, "loss": 1.1439, "step": 31300 }, { "epoch": 0.48728254628408263, "grad_norm": 2.691545248031616, "learning_rate": 4.951273297226835e-05, "loss": 1.1464, "step": 31400 }, { "epoch": 0.48883440152702556, "grad_norm": 2.7827885150909424, "learning_rate": 4.9511181117025404e-05, "loss": 1.1553, "step": 31500 }, { "epoch": 0.4903862567699685, "grad_norm": 3.0572757720947266, "learning_rate": 4.950962926178246e-05, "loss": 1.1316, "step": 31600 }, { "epoch": 0.49193811201291143, "grad_norm": 2.358288526535034, "learning_rate": 4.950807740653952e-05, "loss": 1.1541, "step": 31700 }, { "epoch": 0.49348996725585437, "grad_norm": 2.3115744590759277, "learning_rate": 4.950652555129658e-05, "loss": 1.1576, "step": 31800 }, { "epoch": 0.4950418224987973, "grad_norm": 2.6620938777923584, "learning_rate": 4.9504973696053635e-05, "loss": 1.1593, "step": 31900 }, { "epoch": 0.49659367774174024, "grad_norm": 2.2709712982177734, "learning_rate": 4.950342184081069e-05, "loss": 1.1588, "step": 32000 }, { "epoch": 0.49814553298468317, "grad_norm": 2.778697967529297, "learning_rate": 4.950186998556775e-05, "loss": 1.1311, "step": 32100 }, { "epoch": 0.4996973882276261, "grad_norm": 3.074932098388672, "learning_rate": 4.950031813032481e-05, "loss": 1.1136, "step": 32200 }, { "epoch": 0.5012492434705691, "grad_norm": 2.1349308490753174, "learning_rate": 4.9498766275081866e-05, "loss": 1.1291, "step": 32300 }, { "epoch": 0.502801098713512, "grad_norm": 3.155304193496704, "learning_rate": 4.9497214419838924e-05, "loss": 1.1354, "step": 32400 }, { "epoch": 0.504352953956455, "grad_norm": 2.6743218898773193, "learning_rate": 4.949566256459598e-05, "loss": 1.1479, "step": 32500 }, { "epoch": 0.5059048091993978, "grad_norm": 3.021655559539795, "learning_rate": 4.949411070935304e-05, "loss": 1.1327, "step": 32600 }, { "epoch": 0.5074566644423408, "grad_norm": 2.8575494289398193, "learning_rate": 4.94925588541101e-05, "loss": 1.1634, "step": 32700 }, { "epoch": 0.5090085196852837, "grad_norm": 2.7251968383789062, "learning_rate": 4.949100699886715e-05, "loss": 1.1361, "step": 32800 }, { "epoch": 0.5105603749282267, "grad_norm": 3.285295009613037, "learning_rate": 4.9489455143624206e-05, "loss": 1.1445, "step": 32900 }, { "epoch": 0.5121122301711696, "grad_norm": 2.538975715637207, "learning_rate": 4.9487903288381257e-05, "loss": 1.1303, "step": 33000 }, { "epoch": 0.5136640854141126, "grad_norm": 2.4272913932800293, "learning_rate": 4.9486351433138314e-05, "loss": 1.128, "step": 33100 }, { "epoch": 0.5152159406570556, "grad_norm": 2.3741321563720703, "learning_rate": 4.948479957789537e-05, "loss": 1.1539, "step": 33200 }, { "epoch": 0.5167677958999984, "grad_norm": 7.188132286071777, "learning_rate": 4.948324772265243e-05, "loss": 1.144, "step": 33300 }, { "epoch": 0.5183196511429414, "grad_norm": 2.5718352794647217, "learning_rate": 4.948169586740949e-05, "loss": 1.1495, "step": 33400 }, { "epoch": 0.5198715063858843, "grad_norm": 2.825176239013672, "learning_rate": 4.9480144012166545e-05, "loss": 1.132, "step": 33500 }, { "epoch": 0.5214233616288273, "grad_norm": 2.5730903148651123, "learning_rate": 4.94785921569236e-05, "loss": 1.1296, "step": 33600 }, { "epoch": 0.5229752168717702, "grad_norm": 2.0883777141571045, "learning_rate": 4.947704030168066e-05, "loss": 1.1228, "step": 33700 }, { "epoch": 0.5245270721147132, "grad_norm": 2.6520566940307617, "learning_rate": 4.947548844643772e-05, "loss": 1.1494, "step": 33800 }, { "epoch": 0.526078927357656, "grad_norm": 2.6946041584014893, "learning_rate": 4.9473936591194776e-05, "loss": 1.1582, "step": 33900 }, { "epoch": 0.527630782600599, "grad_norm": 2.82041597366333, "learning_rate": 4.9472384735951834e-05, "loss": 1.1349, "step": 34000 }, { "epoch": 0.5291826378435419, "grad_norm": 2.5415875911712646, "learning_rate": 4.947083288070889e-05, "loss": 1.1415, "step": 34100 }, { "epoch": 0.5307344930864849, "grad_norm": 6.711753845214844, "learning_rate": 4.946928102546595e-05, "loss": 1.1377, "step": 34200 }, { "epoch": 0.5322863483294278, "grad_norm": 2.3514139652252197, "learning_rate": 4.946772917022301e-05, "loss": 1.1434, "step": 34300 }, { "epoch": 0.5338382035723708, "grad_norm": 2.9995036125183105, "learning_rate": 4.946617731498006e-05, "loss": 1.1386, "step": 34400 }, { "epoch": 0.5353900588153138, "grad_norm": 2.06797194480896, "learning_rate": 4.9464625459737116e-05, "loss": 1.1229, "step": 34500 }, { "epoch": 0.5369419140582566, "grad_norm": 2.6438021659851074, "learning_rate": 4.9463073604494174e-05, "loss": 1.1575, "step": 34600 }, { "epoch": 0.5384937693011996, "grad_norm": 2.378782272338867, "learning_rate": 4.946152174925123e-05, "loss": 1.1301, "step": 34700 }, { "epoch": 0.5400456245441425, "grad_norm": 3.369473695755005, "learning_rate": 4.945996989400829e-05, "loss": 1.1232, "step": 34800 }, { "epoch": 0.5415974797870855, "grad_norm": 2.5553507804870605, "learning_rate": 4.945841803876535e-05, "loss": 1.1191, "step": 34900 }, { "epoch": 0.5431493350300284, "grad_norm": 2.4268648624420166, "learning_rate": 4.9456866183522405e-05, "loss": 1.146, "step": 35000 }, { "epoch": 0.5447011902729714, "grad_norm": 3.1870791912078857, "learning_rate": 4.945531432827946e-05, "loss": 1.1126, "step": 35100 }, { "epoch": 0.5462530455159142, "grad_norm": 2.4038939476013184, "learning_rate": 4.945376247303652e-05, "loss": 1.1303, "step": 35200 }, { "epoch": 0.5478049007588572, "grad_norm": 2.4393744468688965, "learning_rate": 4.945221061779358e-05, "loss": 1.1089, "step": 35300 }, { "epoch": 0.5493567560018001, "grad_norm": 2.5866446495056152, "learning_rate": 4.9450658762550636e-05, "loss": 1.1433, "step": 35400 }, { "epoch": 0.5509086112447431, "grad_norm": 2.4120492935180664, "learning_rate": 4.9449106907307694e-05, "loss": 1.1392, "step": 35500 }, { "epoch": 0.552460466487686, "grad_norm": 2.380028247833252, "learning_rate": 4.944755505206475e-05, "loss": 1.1361, "step": 35600 }, { "epoch": 0.554012321730629, "grad_norm": 2.7485151290893555, "learning_rate": 4.94460031968218e-05, "loss": 1.1232, "step": 35700 }, { "epoch": 0.5555641769735719, "grad_norm": 2.8064818382263184, "learning_rate": 4.944445134157886e-05, "loss": 1.1247, "step": 35800 }, { "epoch": 0.5571160322165148, "grad_norm": 2.3744118213653564, "learning_rate": 4.944289948633592e-05, "loss": 1.1186, "step": 35900 }, { "epoch": 0.5586678874594578, "grad_norm": 2.564366579055786, "learning_rate": 4.944134763109297e-05, "loss": 1.1538, "step": 36000 }, { "epoch": 0.5602197427024007, "grad_norm": 4.201200485229492, "learning_rate": 4.9439795775850027e-05, "loss": 1.125, "step": 36100 }, { "epoch": 0.5617715979453437, "grad_norm": 2.363288402557373, "learning_rate": 4.9438243920607084e-05, "loss": 1.1569, "step": 36200 }, { "epoch": 0.5633234531882866, "grad_norm": 2.488767147064209, "learning_rate": 4.943669206536414e-05, "loss": 1.1213, "step": 36300 }, { "epoch": 0.5648753084312296, "grad_norm": 2.190563678741455, "learning_rate": 4.94351402101212e-05, "loss": 1.1409, "step": 36400 }, { "epoch": 0.5664271636741725, "grad_norm": 2.2449207305908203, "learning_rate": 4.943358835487826e-05, "loss": 1.1357, "step": 36500 }, { "epoch": 0.5679790189171154, "grad_norm": 2.7575862407684326, "learning_rate": 4.9432036499635315e-05, "loss": 1.1372, "step": 36600 }, { "epoch": 0.5695308741600583, "grad_norm": 2.2040743827819824, "learning_rate": 4.943048464439237e-05, "loss": 1.1247, "step": 36700 }, { "epoch": 0.5710827294030013, "grad_norm": 2.739124298095703, "learning_rate": 4.942893278914943e-05, "loss": 1.116, "step": 36800 }, { "epoch": 0.5726345846459442, "grad_norm": 2.419942855834961, "learning_rate": 4.942738093390649e-05, "loss": 1.1328, "step": 36900 }, { "epoch": 0.5741864398888872, "grad_norm": 2.9630236625671387, "learning_rate": 4.9425829078663546e-05, "loss": 1.1298, "step": 37000 }, { "epoch": 0.5757382951318301, "grad_norm": 2.126621961593628, "learning_rate": 4.9424277223420604e-05, "loss": 1.1367, "step": 37100 }, { "epoch": 0.577290150374773, "grad_norm": 2.2420880794525146, "learning_rate": 4.9422725368177655e-05, "loss": 1.1288, "step": 37200 }, { "epoch": 0.5788420056177159, "grad_norm": 2.6294736862182617, "learning_rate": 4.942117351293471e-05, "loss": 1.1103, "step": 37300 }, { "epoch": 0.5803938608606589, "grad_norm": 1.9476730823516846, "learning_rate": 4.941962165769177e-05, "loss": 1.1109, "step": 37400 }, { "epoch": 0.5819457161036019, "grad_norm": 2.9728081226348877, "learning_rate": 4.941806980244883e-05, "loss": 1.1235, "step": 37500 }, { "epoch": 0.5834975713465448, "grad_norm": 2.017223834991455, "learning_rate": 4.9416517947205886e-05, "loss": 1.1178, "step": 37600 }, { "epoch": 0.5850494265894878, "grad_norm": 2.5615036487579346, "learning_rate": 4.9414966091962944e-05, "loss": 1.1249, "step": 37700 }, { "epoch": 0.5866012818324307, "grad_norm": 2.712923049926758, "learning_rate": 4.941341423672e-05, "loss": 1.1243, "step": 37800 }, { "epoch": 0.5881531370753736, "grad_norm": 3.0648932456970215, "learning_rate": 4.941186238147706e-05, "loss": 1.1314, "step": 37900 }, { "epoch": 0.5897049923183165, "grad_norm": 2.3586535453796387, "learning_rate": 4.941031052623412e-05, "loss": 1.1296, "step": 38000 }, { "epoch": 0.5912568475612595, "grad_norm": 2.650059461593628, "learning_rate": 4.9408758670991175e-05, "loss": 1.1336, "step": 38100 }, { "epoch": 0.5928087028042024, "grad_norm": 2.508286476135254, "learning_rate": 4.940720681574823e-05, "loss": 1.122, "step": 38200 }, { "epoch": 0.5943605580471454, "grad_norm": 3.0738933086395264, "learning_rate": 4.940565496050529e-05, "loss": 1.1277, "step": 38300 }, { "epoch": 0.5959124132900883, "grad_norm": 2.4957597255706787, "learning_rate": 4.940410310526235e-05, "loss": 1.1437, "step": 38400 }, { "epoch": 0.5974642685330313, "grad_norm": 2.923720121383667, "learning_rate": 4.94025512500194e-05, "loss": 1.0993, "step": 38500 }, { "epoch": 0.5990161237759741, "grad_norm": 2.633573055267334, "learning_rate": 4.940099939477646e-05, "loss": 1.125, "step": 38600 }, { "epoch": 0.6005679790189171, "grad_norm": 2.9147140979766846, "learning_rate": 4.9399447539533515e-05, "loss": 1.1352, "step": 38700 }, { "epoch": 0.60211983426186, "grad_norm": 2.4714407920837402, "learning_rate": 4.939789568429057e-05, "loss": 1.1179, "step": 38800 }, { "epoch": 0.603671689504803, "grad_norm": 1.9474995136260986, "learning_rate": 4.939634382904763e-05, "loss": 1.1358, "step": 38900 }, { "epoch": 0.605223544747746, "grad_norm": 2.541145086288452, "learning_rate": 4.939479197380469e-05, "loss": 1.1022, "step": 39000 }, { "epoch": 0.6067753999906889, "grad_norm": 2.412468671798706, "learning_rate": 4.9393240118561746e-05, "loss": 1.1407, "step": 39100 }, { "epoch": 0.6083272552336318, "grad_norm": 2.4639394283294678, "learning_rate": 4.93916882633188e-05, "loss": 1.1148, "step": 39200 }, { "epoch": 0.6098791104765747, "grad_norm": 2.705397844314575, "learning_rate": 4.9390136408075854e-05, "loss": 1.1571, "step": 39300 }, { "epoch": 0.6114309657195177, "grad_norm": 2.449930429458618, "learning_rate": 4.938858455283291e-05, "loss": 1.1069, "step": 39400 }, { "epoch": 0.6129828209624606, "grad_norm": 2.239381790161133, "learning_rate": 4.938703269758997e-05, "loss": 1.1131, "step": 39500 }, { "epoch": 0.6145346762054036, "grad_norm": 2.6832222938537598, "learning_rate": 4.938548084234703e-05, "loss": 1.1032, "step": 39600 }, { "epoch": 0.6160865314483465, "grad_norm": 3.0176517963409424, "learning_rate": 4.9383928987104085e-05, "loss": 1.1411, "step": 39700 }, { "epoch": 0.6176383866912895, "grad_norm": 2.8559181690216064, "learning_rate": 4.938237713186114e-05, "loss": 1.1024, "step": 39800 }, { "epoch": 0.6191902419342323, "grad_norm": 3.0682384967803955, "learning_rate": 4.93808252766182e-05, "loss": 1.1149, "step": 39900 }, { "epoch": 0.6207420971771753, "grad_norm": 2.580442190170288, "learning_rate": 4.937927342137525e-05, "loss": 1.1085, "step": 40000 }, { "epoch": 0.6222939524201182, "grad_norm": 2.4026238918304443, "learning_rate": 4.937772156613231e-05, "loss": 1.1156, "step": 40100 }, { "epoch": 0.6238458076630612, "grad_norm": 2.4882469177246094, "learning_rate": 4.937616971088937e-05, "loss": 1.1303, "step": 40200 }, { "epoch": 0.6253976629060042, "grad_norm": 2.6450412273406982, "learning_rate": 4.9374617855646425e-05, "loss": 1.1108, "step": 40300 }, { "epoch": 0.6269495181489471, "grad_norm": 2.4239213466644287, "learning_rate": 4.937306600040348e-05, "loss": 1.1059, "step": 40400 }, { "epoch": 0.62850137339189, "grad_norm": 2.912869691848755, "learning_rate": 4.937151414516054e-05, "loss": 1.1213, "step": 40500 }, { "epoch": 0.6300532286348329, "grad_norm": 2.7472879886627197, "learning_rate": 4.93699622899176e-05, "loss": 1.1243, "step": 40600 }, { "epoch": 0.6316050838777759, "grad_norm": 2.6630377769470215, "learning_rate": 4.9368410434674656e-05, "loss": 1.104, "step": 40700 }, { "epoch": 0.6331569391207188, "grad_norm": 2.353990316390991, "learning_rate": 4.9366858579431714e-05, "loss": 1.1273, "step": 40800 }, { "epoch": 0.6347087943636618, "grad_norm": 2.4834976196289062, "learning_rate": 4.936530672418877e-05, "loss": 1.1142, "step": 40900 }, { "epoch": 0.6362606496066047, "grad_norm": 2.686234712600708, "learning_rate": 4.936375486894583e-05, "loss": 1.1274, "step": 41000 }, { "epoch": 0.6378125048495477, "grad_norm": 2.862837314605713, "learning_rate": 4.936220301370289e-05, "loss": 1.1338, "step": 41100 }, { "epoch": 0.6393643600924905, "grad_norm": 2.9612321853637695, "learning_rate": 4.9360651158459945e-05, "loss": 1.126, "step": 41200 }, { "epoch": 0.6409162153354335, "grad_norm": 2.650003671646118, "learning_rate": 4.9359099303216996e-05, "loss": 1.1191, "step": 41300 }, { "epoch": 0.6424680705783764, "grad_norm": 16.57432746887207, "learning_rate": 4.9357547447974054e-05, "loss": 1.1275, "step": 41400 }, { "epoch": 0.6440199258213194, "grad_norm": 2.125028371810913, "learning_rate": 4.935599559273111e-05, "loss": 1.1134, "step": 41500 }, { "epoch": 0.6455717810642623, "grad_norm": 2.225489854812622, "learning_rate": 4.935444373748817e-05, "loss": 1.1353, "step": 41600 }, { "epoch": 0.6471236363072053, "grad_norm": 2.8373870849609375, "learning_rate": 4.935289188224523e-05, "loss": 1.1109, "step": 41700 }, { "epoch": 0.6486754915501483, "grad_norm": 2.0210273265838623, "learning_rate": 4.9351340027002285e-05, "loss": 1.1252, "step": 41800 }, { "epoch": 0.6502273467930911, "grad_norm": 2.2721107006073, "learning_rate": 4.934978817175934e-05, "loss": 1.1046, "step": 41900 }, { "epoch": 0.6517792020360341, "grad_norm": 2.9533329010009766, "learning_rate": 4.93482363165164e-05, "loss": 1.1079, "step": 42000 }, { "epoch": 0.653331057278977, "grad_norm": 2.634375810623169, "learning_rate": 4.934668446127346e-05, "loss": 1.1156, "step": 42100 }, { "epoch": 0.65488291252192, "grad_norm": 2.975827217102051, "learning_rate": 4.9345132606030516e-05, "loss": 1.102, "step": 42200 }, { "epoch": 0.6564347677648629, "grad_norm": 2.6720640659332275, "learning_rate": 4.934358075078757e-05, "loss": 1.1162, "step": 42300 }, { "epoch": 0.6579866230078059, "grad_norm": 2.710780620574951, "learning_rate": 4.934202889554463e-05, "loss": 1.1404, "step": 42400 }, { "epoch": 0.6595384782507487, "grad_norm": 2.8515689373016357, "learning_rate": 4.934047704030168e-05, "loss": 1.1306, "step": 42500 }, { "epoch": 0.6610903334936917, "grad_norm": 2.520266532897949, "learning_rate": 4.933892518505874e-05, "loss": 1.1057, "step": 42600 }, { "epoch": 0.6626421887366346, "grad_norm": 2.664156436920166, "learning_rate": 4.93373733298158e-05, "loss": 1.094, "step": 42700 }, { "epoch": 0.6641940439795776, "grad_norm": 2.537891387939453, "learning_rate": 4.9335821474572855e-05, "loss": 1.1022, "step": 42800 }, { "epoch": 0.6657458992225205, "grad_norm": 2.7152788639068604, "learning_rate": 4.9334269619329906e-05, "loss": 1.1345, "step": 42900 }, { "epoch": 0.6672977544654635, "grad_norm": 2.616636276245117, "learning_rate": 4.9332717764086964e-05, "loss": 1.1147, "step": 43000 }, { "epoch": 0.6688496097084063, "grad_norm": 2.602280616760254, "learning_rate": 4.933116590884402e-05, "loss": 1.1339, "step": 43100 }, { "epoch": 0.6704014649513493, "grad_norm": 2.2743542194366455, "learning_rate": 4.932961405360108e-05, "loss": 1.1038, "step": 43200 }, { "epoch": 0.6719533201942923, "grad_norm": 2.635343074798584, "learning_rate": 4.932806219835814e-05, "loss": 1.1241, "step": 43300 }, { "epoch": 0.6735051754372352, "grad_norm": 2.385859966278076, "learning_rate": 4.9326510343115195e-05, "loss": 1.1114, "step": 43400 }, { "epoch": 0.6750570306801782, "grad_norm": 2.608365535736084, "learning_rate": 4.932495848787225e-05, "loss": 1.0898, "step": 43500 }, { "epoch": 0.6766088859231211, "grad_norm": 3.1450886726379395, "learning_rate": 4.932340663262931e-05, "loss": 1.1139, "step": 43600 }, { "epoch": 0.6781607411660641, "grad_norm": 2.4519405364990234, "learning_rate": 4.932185477738637e-05, "loss": 1.0961, "step": 43700 }, { "epoch": 0.6797125964090069, "grad_norm": 2.2999653816223145, "learning_rate": 4.9320302922143426e-05, "loss": 1.0965, "step": 43800 }, { "epoch": 0.6812644516519499, "grad_norm": 2.7036595344543457, "learning_rate": 4.9318751066900484e-05, "loss": 1.1185, "step": 43900 }, { "epoch": 0.6828163068948928, "grad_norm": 2.5707831382751465, "learning_rate": 4.931719921165754e-05, "loss": 1.0946, "step": 44000 }, { "epoch": 0.6843681621378358, "grad_norm": 2.936333656311035, "learning_rate": 4.93156473564146e-05, "loss": 1.1, "step": 44100 }, { "epoch": 0.6859200173807787, "grad_norm": 2.6156225204467773, "learning_rate": 4.931409550117165e-05, "loss": 1.1224, "step": 44200 }, { "epoch": 0.6874718726237217, "grad_norm": 2.8167076110839844, "learning_rate": 4.931254364592871e-05, "loss": 1.105, "step": 44300 }, { "epoch": 0.6890237278666645, "grad_norm": 2.7051713466644287, "learning_rate": 4.9310991790685766e-05, "loss": 1.0886, "step": 44400 }, { "epoch": 0.6905755831096075, "grad_norm": 2.4460763931274414, "learning_rate": 4.9309439935442824e-05, "loss": 1.097, "step": 44500 }, { "epoch": 0.6921274383525505, "grad_norm": 2.5376358032226562, "learning_rate": 4.930788808019988e-05, "loss": 1.099, "step": 44600 }, { "epoch": 0.6936792935954934, "grad_norm": 2.4448776245117188, "learning_rate": 4.930633622495694e-05, "loss": 1.1347, "step": 44700 }, { "epoch": 0.6952311488384364, "grad_norm": 1.99077308177948, "learning_rate": 4.9304784369714e-05, "loss": 1.1042, "step": 44800 }, { "epoch": 0.6967830040813793, "grad_norm": 2.4918293952941895, "learning_rate": 4.9303232514471055e-05, "loss": 1.1227, "step": 44900 }, { "epoch": 0.6983348593243223, "grad_norm": 2.7672348022460938, "learning_rate": 4.930168065922811e-05, "loss": 1.136, "step": 45000 }, { "epoch": 0.6998867145672651, "grad_norm": 2.535501718521118, "learning_rate": 4.930012880398517e-05, "loss": 1.1228, "step": 45100 }, { "epoch": 0.7014385698102081, "grad_norm": 2.0237233638763428, "learning_rate": 4.929857694874223e-05, "loss": 1.0964, "step": 45200 }, { "epoch": 0.702990425053151, "grad_norm": 2.909148931503296, "learning_rate": 4.9297025093499286e-05, "loss": 1.1025, "step": 45300 }, { "epoch": 0.704542280296094, "grad_norm": 3.0733165740966797, "learning_rate": 4.929547323825634e-05, "loss": 1.1145, "step": 45400 }, { "epoch": 0.7060941355390369, "grad_norm": 3.0011839866638184, "learning_rate": 4.9293921383013394e-05, "loss": 1.1016, "step": 45500 }, { "epoch": 0.7076459907819799, "grad_norm": 2.568920135498047, "learning_rate": 4.929236952777045e-05, "loss": 1.1107, "step": 45600 }, { "epoch": 0.7091978460249228, "grad_norm": 2.4683704376220703, "learning_rate": 4.929081767252751e-05, "loss": 1.1101, "step": 45700 }, { "epoch": 0.7107497012678657, "grad_norm": 2.7163403034210205, "learning_rate": 4.928926581728456e-05, "loss": 1.0887, "step": 45800 }, { "epoch": 0.7123015565108086, "grad_norm": 2.433684825897217, "learning_rate": 4.928771396204162e-05, "loss": 1.0913, "step": 45900 }, { "epoch": 0.7138534117537516, "grad_norm": 2.5377068519592285, "learning_rate": 4.9286162106798676e-05, "loss": 1.098, "step": 46000 }, { "epoch": 0.7154052669966946, "grad_norm": 2.807605028152466, "learning_rate": 4.9284610251555734e-05, "loss": 1.081, "step": 46100 }, { "epoch": 0.7169571222396375, "grad_norm": 3.076146364212036, "learning_rate": 4.928305839631279e-05, "loss": 1.1156, "step": 46200 }, { "epoch": 0.7185089774825805, "grad_norm": 2.714756727218628, "learning_rate": 4.928150654106985e-05, "loss": 1.1016, "step": 46300 }, { "epoch": 0.7200608327255233, "grad_norm": 3.621466875076294, "learning_rate": 4.927995468582691e-05, "loss": 1.0972, "step": 46400 }, { "epoch": 0.7216126879684663, "grad_norm": 3.0042779445648193, "learning_rate": 4.9278402830583965e-05, "loss": 1.1164, "step": 46500 }, { "epoch": 0.7231645432114092, "grad_norm": 2.5364694595336914, "learning_rate": 4.927685097534102e-05, "loss": 1.0905, "step": 46600 }, { "epoch": 0.7247163984543522, "grad_norm": 2.1452085971832275, "learning_rate": 4.927529912009808e-05, "loss": 1.1139, "step": 46700 }, { "epoch": 0.7262682536972951, "grad_norm": 2.639059543609619, "learning_rate": 4.927374726485514e-05, "loss": 1.0918, "step": 46800 }, { "epoch": 0.7278201089402381, "grad_norm": 2.3126943111419678, "learning_rate": 4.9272195409612196e-05, "loss": 1.0972, "step": 46900 }, { "epoch": 0.729371964183181, "grad_norm": 2.7618236541748047, "learning_rate": 4.927064355436925e-05, "loss": 1.0768, "step": 47000 }, { "epoch": 0.7309238194261239, "grad_norm": 9.789801597595215, "learning_rate": 4.9269091699126305e-05, "loss": 1.1057, "step": 47100 }, { "epoch": 0.7324756746690668, "grad_norm": 2.24517560005188, "learning_rate": 4.926753984388336e-05, "loss": 1.0896, "step": 47200 }, { "epoch": 0.7340275299120098, "grad_norm": 2.4580423831939697, "learning_rate": 4.926598798864042e-05, "loss": 1.1055, "step": 47300 }, { "epoch": 0.7355793851549527, "grad_norm": 2.4072463512420654, "learning_rate": 4.926443613339748e-05, "loss": 1.1033, "step": 47400 }, { "epoch": 0.7371312403978957, "grad_norm": 2.8368380069732666, "learning_rate": 4.9262884278154536e-05, "loss": 1.1266, "step": 47500 }, { "epoch": 0.7386830956408387, "grad_norm": 2.576231002807617, "learning_rate": 4.9261332422911594e-05, "loss": 1.1043, "step": 47600 }, { "epoch": 0.7402349508837816, "grad_norm": 2.436286211013794, "learning_rate": 4.925978056766865e-05, "loss": 1.0973, "step": 47700 }, { "epoch": 0.7417868061267245, "grad_norm": 2.8838682174682617, "learning_rate": 4.925822871242571e-05, "loss": 1.1301, "step": 47800 }, { "epoch": 0.7433386613696674, "grad_norm": 2.8294007778167725, "learning_rate": 4.925667685718277e-05, "loss": 1.0882, "step": 47900 }, { "epoch": 0.7448905166126104, "grad_norm": 2.9646756649017334, "learning_rate": 4.9255125001939825e-05, "loss": 1.0941, "step": 48000 }, { "epoch": 0.7464423718555533, "grad_norm": 2.8057403564453125, "learning_rate": 4.925357314669688e-05, "loss": 1.0894, "step": 48100 }, { "epoch": 0.7479942270984963, "grad_norm": 2.371818780899048, "learning_rate": 4.925202129145394e-05, "loss": 1.0965, "step": 48200 }, { "epoch": 0.7495460823414392, "grad_norm": 2.658454179763794, "learning_rate": 4.925046943621099e-05, "loss": 1.1081, "step": 48300 }, { "epoch": 0.7510979375843821, "grad_norm": 2.69348406791687, "learning_rate": 4.924891758096805e-05, "loss": 1.1165, "step": 48400 }, { "epoch": 0.752649792827325, "grad_norm": 2.6682300567626953, "learning_rate": 4.9247365725725107e-05, "loss": 1.0968, "step": 48500 }, { "epoch": 0.754201648070268, "grad_norm": 1.8431501388549805, "learning_rate": 4.9245813870482164e-05, "loss": 1.1011, "step": 48600 }, { "epoch": 0.7557535033132109, "grad_norm": 2.798457622528076, "learning_rate": 4.924426201523922e-05, "loss": 1.1017, "step": 48700 }, { "epoch": 0.7573053585561539, "grad_norm": 2.3896312713623047, "learning_rate": 4.924271015999628e-05, "loss": 1.1128, "step": 48800 }, { "epoch": 0.7588572137990969, "grad_norm": 2.278090238571167, "learning_rate": 4.924115830475334e-05, "loss": 1.1092, "step": 48900 }, { "epoch": 0.7604090690420398, "grad_norm": 2.4131150245666504, "learning_rate": 4.923960644951039e-05, "loss": 1.1079, "step": 49000 }, { "epoch": 0.7619609242849827, "grad_norm": 2.814453363418579, "learning_rate": 4.9238054594267446e-05, "loss": 1.1094, "step": 49100 }, { "epoch": 0.7635127795279256, "grad_norm": 2.977782964706421, "learning_rate": 4.9236502739024504e-05, "loss": 1.1179, "step": 49200 }, { "epoch": 0.7650646347708686, "grad_norm": 2.3262228965759277, "learning_rate": 4.923495088378156e-05, "loss": 1.0911, "step": 49300 }, { "epoch": 0.7666164900138115, "grad_norm": 2.804374933242798, "learning_rate": 4.923339902853862e-05, "loss": 1.0852, "step": 49400 }, { "epoch": 0.7681683452567545, "grad_norm": 2.4713189601898193, "learning_rate": 4.923184717329568e-05, "loss": 1.0854, "step": 49500 }, { "epoch": 0.7697202004996974, "grad_norm": 2.4610402584075928, "learning_rate": 4.9230295318052735e-05, "loss": 1.0938, "step": 49600 }, { "epoch": 0.7712720557426404, "grad_norm": 2.283956527709961, "learning_rate": 4.922874346280979e-05, "loss": 1.0978, "step": 49700 }, { "epoch": 0.7728239109855832, "grad_norm": 2.3379831314086914, "learning_rate": 4.9227191607566844e-05, "loss": 1.1143, "step": 49800 }, { "epoch": 0.7743757662285262, "grad_norm": 2.246830940246582, "learning_rate": 4.92256397523239e-05, "loss": 1.1188, "step": 49900 }, { "epoch": 0.7759276214714691, "grad_norm": 2.4980430603027344, "learning_rate": 4.922408789708096e-05, "loss": 1.0927, "step": 50000 }, { "epoch": 0.7774794767144121, "grad_norm": 3.645878791809082, "learning_rate": 4.922253604183802e-05, "loss": 1.0943, "step": 50100 }, { "epoch": 0.779031331957355, "grad_norm": 2.5591325759887695, "learning_rate": 4.9220984186595075e-05, "loss": 1.1, "step": 50200 }, { "epoch": 0.780583187200298, "grad_norm": 2.2028520107269287, "learning_rate": 4.921943233135213e-05, "loss": 1.0993, "step": 50300 }, { "epoch": 0.782135042443241, "grad_norm": 2.039440870285034, "learning_rate": 4.921788047610919e-05, "loss": 1.0794, "step": 50400 }, { "epoch": 0.7836868976861838, "grad_norm": 3.0587997436523438, "learning_rate": 4.921632862086625e-05, "loss": 1.1356, "step": 50500 }, { "epoch": 0.7852387529291268, "grad_norm": 2.4542486667633057, "learning_rate": 4.9214776765623306e-05, "loss": 1.1023, "step": 50600 }, { "epoch": 0.7867906081720697, "grad_norm": 2.439394235610962, "learning_rate": 4.9213224910380364e-05, "loss": 1.0803, "step": 50700 }, { "epoch": 0.7883424634150127, "grad_norm": 2.7680699825286865, "learning_rate": 4.921167305513742e-05, "loss": 1.0917, "step": 50800 }, { "epoch": 0.7898943186579556, "grad_norm": 2.598273515701294, "learning_rate": 4.921012119989448e-05, "loss": 1.108, "step": 50900 }, { "epoch": 0.7914461739008986, "grad_norm": 2.2090206146240234, "learning_rate": 4.920856934465154e-05, "loss": 1.1154, "step": 51000 }, { "epoch": 0.7929980291438414, "grad_norm": 1.9663876295089722, "learning_rate": 4.920701748940859e-05, "loss": 1.0909, "step": 51100 }, { "epoch": 0.7945498843867844, "grad_norm": 2.366976499557495, "learning_rate": 4.9205465634165645e-05, "loss": 1.0957, "step": 51200 }, { "epoch": 0.7961017396297273, "grad_norm": 2.138845443725586, "learning_rate": 4.92039137789227e-05, "loss": 1.0918, "step": 51300 }, { "epoch": 0.7976535948726703, "grad_norm": 2.3530983924865723, "learning_rate": 4.920236192367976e-05, "loss": 1.1071, "step": 51400 }, { "epoch": 0.7992054501156132, "grad_norm": 2.620974540710449, "learning_rate": 4.920081006843682e-05, "loss": 1.0942, "step": 51500 }, { "epoch": 0.8007573053585562, "grad_norm": 2.6806552410125732, "learning_rate": 4.9199258213193877e-05, "loss": 1.0875, "step": 51600 }, { "epoch": 0.802309160601499, "grad_norm": 2.469524383544922, "learning_rate": 4.9197706357950934e-05, "loss": 1.0963, "step": 51700 }, { "epoch": 0.803861015844442, "grad_norm": 2.3020830154418945, "learning_rate": 4.919615450270799e-05, "loss": 1.0935, "step": 51800 }, { "epoch": 0.805412871087385, "grad_norm": 2.2846367359161377, "learning_rate": 4.919460264746505e-05, "loss": 1.0968, "step": 51900 }, { "epoch": 0.8069647263303279, "grad_norm": 2.6463048458099365, "learning_rate": 4.919305079222211e-05, "loss": 1.0987, "step": 52000 }, { "epoch": 0.8085165815732709, "grad_norm": 2.5185651779174805, "learning_rate": 4.9191498936979165e-05, "loss": 1.0765, "step": 52100 }, { "epoch": 0.8100684368162138, "grad_norm": 3.434648036956787, "learning_rate": 4.918994708173622e-05, "loss": 1.102, "step": 52200 }, { "epoch": 0.8116202920591568, "grad_norm": 2.5872232913970947, "learning_rate": 4.9188395226493274e-05, "loss": 1.0869, "step": 52300 }, { "epoch": 0.8131721473020996, "grad_norm": 1.964666724205017, "learning_rate": 4.918684337125033e-05, "loss": 1.0828, "step": 52400 }, { "epoch": 0.8147240025450426, "grad_norm": 2.618225336074829, "learning_rate": 4.918529151600739e-05, "loss": 1.1136, "step": 52500 }, { "epoch": 0.8162758577879855, "grad_norm": 2.1805386543273926, "learning_rate": 4.918373966076445e-05, "loss": 1.0913, "step": 52600 }, { "epoch": 0.8178277130309285, "grad_norm": 2.7604925632476807, "learning_rate": 4.91821878055215e-05, "loss": 1.125, "step": 52700 }, { "epoch": 0.8193795682738714, "grad_norm": 12.929091453552246, "learning_rate": 4.9180635950278556e-05, "loss": 1.1067, "step": 52800 }, { "epoch": 0.8209314235168144, "grad_norm": 2.4290385246276855, "learning_rate": 4.9179084095035614e-05, "loss": 1.0902, "step": 52900 }, { "epoch": 0.8224832787597572, "grad_norm": 2.8086464405059814, "learning_rate": 4.917753223979267e-05, "loss": 1.1104, "step": 53000 }, { "epoch": 0.8240351340027002, "grad_norm": 5.885090351104736, "learning_rate": 4.917598038454973e-05, "loss": 1.0916, "step": 53100 }, { "epoch": 0.8255869892456432, "grad_norm": 2.3767476081848145, "learning_rate": 4.917442852930679e-05, "loss": 1.0985, "step": 53200 }, { "epoch": 0.8271388444885861, "grad_norm": 2.060594081878662, "learning_rate": 4.9172876674063845e-05, "loss": 1.104, "step": 53300 }, { "epoch": 0.8286906997315291, "grad_norm": 2.3929853439331055, "learning_rate": 4.91713248188209e-05, "loss": 1.0909, "step": 53400 }, { "epoch": 0.830242554974472, "grad_norm": 2.8813841342926025, "learning_rate": 4.916977296357796e-05, "loss": 1.0602, "step": 53500 }, { "epoch": 0.831794410217415, "grad_norm": 2.7756216526031494, "learning_rate": 4.916822110833502e-05, "loss": 1.0911, "step": 53600 }, { "epoch": 0.8333462654603578, "grad_norm": 2.5732462406158447, "learning_rate": 4.9166669253092076e-05, "loss": 1.0939, "step": 53700 }, { "epoch": 0.8348981207033008, "grad_norm": 2.803236246109009, "learning_rate": 4.9165117397849134e-05, "loss": 1.0935, "step": 53800 }, { "epoch": 0.8364499759462437, "grad_norm": 2.252997875213623, "learning_rate": 4.916356554260619e-05, "loss": 1.0756, "step": 53900 }, { "epoch": 0.8380018311891867, "grad_norm": 2.7027902603149414, "learning_rate": 4.916201368736324e-05, "loss": 1.0892, "step": 54000 }, { "epoch": 0.8395536864321296, "grad_norm": 2.035360336303711, "learning_rate": 4.91604618321203e-05, "loss": 1.0812, "step": 54100 }, { "epoch": 0.8411055416750726, "grad_norm": 2.3151769638061523, "learning_rate": 4.915890997687736e-05, "loss": 1.0873, "step": 54200 }, { "epoch": 0.8426573969180154, "grad_norm": 3.9385781288146973, "learning_rate": 4.9157358121634415e-05, "loss": 1.0966, "step": 54300 }, { "epoch": 0.8442092521609584, "grad_norm": 2.9618349075317383, "learning_rate": 4.915580626639147e-05, "loss": 1.0913, "step": 54400 }, { "epoch": 0.8457611074039013, "grad_norm": 2.8915460109710693, "learning_rate": 4.915425441114853e-05, "loss": 1.0743, "step": 54500 }, { "epoch": 0.8473129626468443, "grad_norm": 2.759511947631836, "learning_rate": 4.915270255590559e-05, "loss": 1.0928, "step": 54600 }, { "epoch": 0.8488648178897873, "grad_norm": 2.203329086303711, "learning_rate": 4.9151150700662647e-05, "loss": 1.1004, "step": 54700 }, { "epoch": 0.8504166731327302, "grad_norm": 2.9016613960266113, "learning_rate": 4.9149598845419704e-05, "loss": 1.0641, "step": 54800 }, { "epoch": 0.8519685283756732, "grad_norm": 2.415769100189209, "learning_rate": 4.914804699017676e-05, "loss": 1.0949, "step": 54900 }, { "epoch": 0.853520383618616, "grad_norm": 3.1354901790618896, "learning_rate": 4.914649513493382e-05, "loss": 1.071, "step": 55000 }, { "epoch": 0.855072238861559, "grad_norm": 2.558598279953003, "learning_rate": 4.914494327969088e-05, "loss": 1.0624, "step": 55100 }, { "epoch": 0.8566240941045019, "grad_norm": 2.051996946334839, "learning_rate": 4.9143391424447935e-05, "loss": 1.0872, "step": 55200 }, { "epoch": 0.8581759493474449, "grad_norm": 2.6188838481903076, "learning_rate": 4.9141839569204986e-05, "loss": 1.0885, "step": 55300 }, { "epoch": 0.8597278045903878, "grad_norm": 2.5522494316101074, "learning_rate": 4.9140287713962044e-05, "loss": 1.0978, "step": 55400 }, { "epoch": 0.8612796598333308, "grad_norm": 2.4020657539367676, "learning_rate": 4.91387358587191e-05, "loss": 1.0757, "step": 55500 }, { "epoch": 0.8628315150762736, "grad_norm": 2.410759687423706, "learning_rate": 4.913718400347615e-05, "loss": 1.0808, "step": 55600 }, { "epoch": 0.8643833703192166, "grad_norm": 2.209291934967041, "learning_rate": 4.913563214823321e-05, "loss": 1.0956, "step": 55700 }, { "epoch": 0.8659352255621595, "grad_norm": 2.711432933807373, "learning_rate": 4.913408029299027e-05, "loss": 1.0863, "step": 55800 }, { "epoch": 0.8674870808051025, "grad_norm": 2.5124526023864746, "learning_rate": 4.9132528437747326e-05, "loss": 1.06, "step": 55900 }, { "epoch": 0.8690389360480454, "grad_norm": 2.5127146244049072, "learning_rate": 4.9130976582504384e-05, "loss": 1.0692, "step": 56000 }, { "epoch": 0.8705907912909884, "grad_norm": 2.892432451248169, "learning_rate": 4.912942472726144e-05, "loss": 1.0699, "step": 56100 }, { "epoch": 0.8721426465339314, "grad_norm": 2.1323468685150146, "learning_rate": 4.91278728720185e-05, "loss": 1.1097, "step": 56200 }, { "epoch": 0.8736945017768742, "grad_norm": 2.7046706676483154, "learning_rate": 4.912632101677556e-05, "loss": 1.0827, "step": 56300 }, { "epoch": 0.8752463570198172, "grad_norm": 2.281324863433838, "learning_rate": 4.9124769161532615e-05, "loss": 1.0727, "step": 56400 }, { "epoch": 0.8767982122627601, "grad_norm": 2.8358213901519775, "learning_rate": 4.912321730628967e-05, "loss": 1.062, "step": 56500 }, { "epoch": 0.8783500675057031, "grad_norm": 2.171729803085327, "learning_rate": 4.912166545104673e-05, "loss": 1.0896, "step": 56600 }, { "epoch": 0.879901922748646, "grad_norm": 2.282668113708496, "learning_rate": 4.912011359580379e-05, "loss": 1.1087, "step": 56700 }, { "epoch": 0.881453777991589, "grad_norm": 2.4975550174713135, "learning_rate": 4.911856174056084e-05, "loss": 1.0889, "step": 56800 }, { "epoch": 0.8830056332345319, "grad_norm": 2.586796760559082, "learning_rate": 4.91170098853179e-05, "loss": 1.0823, "step": 56900 }, { "epoch": 0.8845574884774748, "grad_norm": 2.6647982597351074, "learning_rate": 4.9115458030074954e-05, "loss": 1.0937, "step": 57000 }, { "epoch": 0.8861093437204177, "grad_norm": 2.5157837867736816, "learning_rate": 4.911390617483201e-05, "loss": 1.093, "step": 57100 }, { "epoch": 0.8876611989633607, "grad_norm": 2.413398027420044, "learning_rate": 4.911235431958907e-05, "loss": 1.0889, "step": 57200 }, { "epoch": 0.8892130542063036, "grad_norm": 2.898711919784546, "learning_rate": 4.911080246434613e-05, "loss": 1.0864, "step": 57300 }, { "epoch": 0.8907649094492466, "grad_norm": 2.6997451782226562, "learning_rate": 4.9109250609103185e-05, "loss": 1.0943, "step": 57400 }, { "epoch": 0.8923167646921896, "grad_norm": 2.222785472869873, "learning_rate": 4.910769875386024e-05, "loss": 1.0899, "step": 57500 }, { "epoch": 0.8938686199351324, "grad_norm": 2.6953938007354736, "learning_rate": 4.91061468986173e-05, "loss": 1.0689, "step": 57600 }, { "epoch": 0.8954204751780754, "grad_norm": 2.217766523361206, "learning_rate": 4.910459504337436e-05, "loss": 1.0623, "step": 57700 }, { "epoch": 0.8969723304210183, "grad_norm": 2.39754319190979, "learning_rate": 4.9103043188131417e-05, "loss": 1.0919, "step": 57800 }, { "epoch": 0.8985241856639613, "grad_norm": 2.7976224422454834, "learning_rate": 4.9101491332888474e-05, "loss": 1.0888, "step": 57900 }, { "epoch": 0.9000760409069042, "grad_norm": 2.1985130310058594, "learning_rate": 4.909993947764553e-05, "loss": 1.0884, "step": 58000 }, { "epoch": 0.9016278961498472, "grad_norm": 2.687382221221924, "learning_rate": 4.909838762240258e-05, "loss": 1.0753, "step": 58100 }, { "epoch": 0.90317975139279, "grad_norm": 2.2054173946380615, "learning_rate": 4.909683576715964e-05, "loss": 1.0996, "step": 58200 }, { "epoch": 0.904731606635733, "grad_norm": 2.4016077518463135, "learning_rate": 4.90952839119167e-05, "loss": 1.093, "step": 58300 }, { "epoch": 0.9062834618786759, "grad_norm": 2.39854097366333, "learning_rate": 4.9093732056673756e-05, "loss": 1.0757, "step": 58400 }, { "epoch": 0.9078353171216189, "grad_norm": 2.659069776535034, "learning_rate": 4.9092180201430814e-05, "loss": 1.059, "step": 58500 }, { "epoch": 0.9093871723645618, "grad_norm": 2.614173650741577, "learning_rate": 4.909062834618787e-05, "loss": 1.0917, "step": 58600 }, { "epoch": 0.9109390276075048, "grad_norm": 2.1619582176208496, "learning_rate": 4.908907649094493e-05, "loss": 1.0787, "step": 58700 }, { "epoch": 0.9124908828504477, "grad_norm": 2.587413787841797, "learning_rate": 4.908752463570198e-05, "loss": 1.0792, "step": 58800 }, { "epoch": 0.9140427380933906, "grad_norm": 2.9070513248443604, "learning_rate": 4.908597278045904e-05, "loss": 1.0644, "step": 58900 }, { "epoch": 0.9155945933363336, "grad_norm": 2.3392417430877686, "learning_rate": 4.9084420925216096e-05, "loss": 1.0841, "step": 59000 }, { "epoch": 0.9171464485792765, "grad_norm": 2.254016399383545, "learning_rate": 4.9082869069973154e-05, "loss": 1.0667, "step": 59100 }, { "epoch": 0.9186983038222195, "grad_norm": 2.332127571105957, "learning_rate": 4.908131721473021e-05, "loss": 1.0742, "step": 59200 }, { "epoch": 0.9202501590651624, "grad_norm": 2.302788734436035, "learning_rate": 4.907976535948727e-05, "loss": 1.0746, "step": 59300 }, { "epoch": 0.9218020143081054, "grad_norm": 2.5131382942199707, "learning_rate": 4.907821350424433e-05, "loss": 1.0744, "step": 59400 }, { "epoch": 0.9233538695510483, "grad_norm": 2.5971508026123047, "learning_rate": 4.9076661649001385e-05, "loss": 1.1016, "step": 59500 }, { "epoch": 0.9249057247939912, "grad_norm": 2.4397974014282227, "learning_rate": 4.907510979375844e-05, "loss": 1.0977, "step": 59600 }, { "epoch": 0.9264575800369341, "grad_norm": 2.3344008922576904, "learning_rate": 4.9073557938515493e-05, "loss": 1.061, "step": 59700 }, { "epoch": 0.9280094352798771, "grad_norm": 2.8250203132629395, "learning_rate": 4.907200608327255e-05, "loss": 1.0743, "step": 59800 }, { "epoch": 0.92956129052282, "grad_norm": 2.7472126483917236, "learning_rate": 4.907045422802961e-05, "loss": 1.0865, "step": 59900 }, { "epoch": 0.931113145765763, "grad_norm": 2.347679615020752, "learning_rate": 4.906890237278667e-05, "loss": 1.1001, "step": 60000 }, { "epoch": 0.9326650010087059, "grad_norm": 1.923721194267273, "learning_rate": 4.9067350517543724e-05, "loss": 1.0876, "step": 60100 }, { "epoch": 0.9342168562516489, "grad_norm": 2.4793779850006104, "learning_rate": 4.906579866230078e-05, "loss": 1.0894, "step": 60200 }, { "epoch": 0.9357687114945917, "grad_norm": 2.128314733505249, "learning_rate": 4.906424680705784e-05, "loss": 1.0775, "step": 60300 }, { "epoch": 0.9373205667375347, "grad_norm": 2.659546136856079, "learning_rate": 4.90626949518149e-05, "loss": 1.0679, "step": 60400 }, { "epoch": 0.9388724219804777, "grad_norm": 2.3349568843841553, "learning_rate": 4.9061143096571955e-05, "loss": 1.0693, "step": 60500 }, { "epoch": 0.9404242772234206, "grad_norm": 2.2482144832611084, "learning_rate": 4.905959124132901e-05, "loss": 1.0861, "step": 60600 }, { "epoch": 0.9419761324663636, "grad_norm": 2.140422821044922, "learning_rate": 4.905803938608607e-05, "loss": 1.0576, "step": 60700 }, { "epoch": 0.9435279877093065, "grad_norm": 2.3438034057617188, "learning_rate": 4.905648753084313e-05, "loss": 1.0934, "step": 60800 }, { "epoch": 0.9450798429522494, "grad_norm": 2.406639575958252, "learning_rate": 4.9054935675600187e-05, "loss": 1.0917, "step": 60900 }, { "epoch": 0.9466316981951923, "grad_norm": 2.5059359073638916, "learning_rate": 4.905338382035724e-05, "loss": 1.0858, "step": 61000 }, { "epoch": 0.9481835534381353, "grad_norm": 2.721864938735962, "learning_rate": 4.9051831965114295e-05, "loss": 1.085, "step": 61100 }, { "epoch": 0.9497354086810782, "grad_norm": 3.301384687423706, "learning_rate": 4.905028010987135e-05, "loss": 1.0838, "step": 61200 }, { "epoch": 0.9512872639240212, "grad_norm": 2.563225507736206, "learning_rate": 4.904872825462841e-05, "loss": 1.098, "step": 61300 }, { "epoch": 0.9528391191669641, "grad_norm": 2.531721353530884, "learning_rate": 4.904717639938547e-05, "loss": 1.1011, "step": 61400 }, { "epoch": 0.9543909744099071, "grad_norm": 2.638995885848999, "learning_rate": 4.9045624544142526e-05, "loss": 1.092, "step": 61500 }, { "epoch": 0.9559428296528499, "grad_norm": 2.5860416889190674, "learning_rate": 4.9044072688899584e-05, "loss": 1.0788, "step": 61600 }, { "epoch": 0.9574946848957929, "grad_norm": 2.9228742122650146, "learning_rate": 4.904252083365664e-05, "loss": 1.0795, "step": 61700 }, { "epoch": 0.9590465401387358, "grad_norm": 2.354112148284912, "learning_rate": 4.90409689784137e-05, "loss": 1.0591, "step": 61800 }, { "epoch": 0.9605983953816788, "grad_norm": 2.4461817741394043, "learning_rate": 4.903941712317076e-05, "loss": 1.0893, "step": 61900 }, { "epoch": 0.9621502506246218, "grad_norm": 2.5140132904052734, "learning_rate": 4.9037865267927815e-05, "loss": 1.0596, "step": 62000 }, { "epoch": 0.9637021058675647, "grad_norm": 2.414177656173706, "learning_rate": 4.9036313412684866e-05, "loss": 1.0607, "step": 62100 }, { "epoch": 0.9652539611105077, "grad_norm": 2.311681032180786, "learning_rate": 4.9034761557441924e-05, "loss": 1.0803, "step": 62200 }, { "epoch": 0.9668058163534505, "grad_norm": 2.5589518547058105, "learning_rate": 4.903320970219898e-05, "loss": 1.0913, "step": 62300 }, { "epoch": 0.9683576715963935, "grad_norm": 2.4336955547332764, "learning_rate": 4.903165784695604e-05, "loss": 1.0952, "step": 62400 }, { "epoch": 0.9699095268393364, "grad_norm": 2.896681070327759, "learning_rate": 4.903010599171309e-05, "loss": 1.0725, "step": 62500 }, { "epoch": 0.9714613820822794, "grad_norm": 2.791332960128784, "learning_rate": 4.902855413647015e-05, "loss": 1.0961, "step": 62600 }, { "epoch": 0.9730132373252223, "grad_norm": 2.6748030185699463, "learning_rate": 4.9027002281227206e-05, "loss": 1.0788, "step": 62700 }, { "epoch": 0.9745650925681653, "grad_norm": 3.002559185028076, "learning_rate": 4.9025450425984263e-05, "loss": 1.0497, "step": 62800 }, { "epoch": 0.9761169478111081, "grad_norm": 2.1950531005859375, "learning_rate": 4.902389857074132e-05, "loss": 1.0717, "step": 62900 }, { "epoch": 0.9776688030540511, "grad_norm": 2.224412202835083, "learning_rate": 4.902234671549838e-05, "loss": 1.0913, "step": 63000 }, { "epoch": 0.979220658296994, "grad_norm": 2.4054319858551025, "learning_rate": 4.902079486025544e-05, "loss": 1.0839, "step": 63100 }, { "epoch": 0.980772513539937, "grad_norm": 2.336530923843384, "learning_rate": 4.9019243005012494e-05, "loss": 1.0839, "step": 63200 }, { "epoch": 0.98232436878288, "grad_norm": 2.1968419551849365, "learning_rate": 4.901769114976955e-05, "loss": 1.0937, "step": 63300 }, { "epoch": 0.9838762240258229, "grad_norm": 2.358640432357788, "learning_rate": 4.901613929452661e-05, "loss": 1.0706, "step": 63400 }, { "epoch": 0.9854280792687659, "grad_norm": 2.25819993019104, "learning_rate": 4.901458743928367e-05, "loss": 1.0896, "step": 63500 }, { "epoch": 0.9869799345117087, "grad_norm": 2.4529812335968018, "learning_rate": 4.9013035584040725e-05, "loss": 1.0864, "step": 63600 }, { "epoch": 0.9885317897546517, "grad_norm": 2.1838877201080322, "learning_rate": 4.901148372879778e-05, "loss": 1.0924, "step": 63700 }, { "epoch": 0.9900836449975946, "grad_norm": 1.9184544086456299, "learning_rate": 4.9009931873554834e-05, "loss": 1.0656, "step": 63800 }, { "epoch": 0.9916355002405376, "grad_norm": 2.27970552444458, "learning_rate": 4.900838001831189e-05, "loss": 1.0997, "step": 63900 }, { "epoch": 0.9931873554834805, "grad_norm": 2.437575340270996, "learning_rate": 4.900682816306895e-05, "loss": 1.0563, "step": 64000 }, { "epoch": 0.9947392107264235, "grad_norm": 2.3042783737182617, "learning_rate": 4.900527630782601e-05, "loss": 1.0847, "step": 64100 }, { "epoch": 0.9962910659693663, "grad_norm": 2.1939148902893066, "learning_rate": 4.9003724452583065e-05, "loss": 1.0435, "step": 64200 }, { "epoch": 0.9978429212123093, "grad_norm": 2.6042773723602295, "learning_rate": 4.900217259734012e-05, "loss": 1.0759, "step": 64300 }, { "epoch": 0.9993947764552522, "grad_norm": 2.367938995361328, "learning_rate": 4.900062074209718e-05, "loss": 1.0792, "step": 64400 }, { "epoch": 1.0009466316981952, "grad_norm": 2.6519479751586914, "learning_rate": 4.899906888685424e-05, "loss": 1.0797, "step": 64500 }, { "epoch": 1.0024984869411382, "grad_norm": 2.51983642578125, "learning_rate": 4.8997517031611296e-05, "loss": 1.0767, "step": 64600 }, { "epoch": 1.0040503421840812, "grad_norm": 2.5465996265411377, "learning_rate": 4.8995965176368354e-05, "loss": 1.0571, "step": 64700 }, { "epoch": 1.005602197427024, "grad_norm": 2.162459135055542, "learning_rate": 4.899441332112541e-05, "loss": 1.0664, "step": 64800 }, { "epoch": 1.007154052669967, "grad_norm": 2.2916128635406494, "learning_rate": 4.899286146588247e-05, "loss": 1.0749, "step": 64900 }, { "epoch": 1.00870590791291, "grad_norm": 4.030702114105225, "learning_rate": 4.899130961063953e-05, "loss": 1.0606, "step": 65000 }, { "epoch": 1.010257763155853, "grad_norm": 2.2635715007781982, "learning_rate": 4.898975775539658e-05, "loss": 1.0619, "step": 65100 }, { "epoch": 1.0118096183987957, "grad_norm": 2.387512683868408, "learning_rate": 4.8988205900153636e-05, "loss": 1.0763, "step": 65200 }, { "epoch": 1.0133614736417387, "grad_norm": 2.5903921127319336, "learning_rate": 4.898665404491069e-05, "loss": 1.0737, "step": 65300 }, { "epoch": 1.0149133288846817, "grad_norm": 2.6451869010925293, "learning_rate": 4.8985102189667745e-05, "loss": 1.0727, "step": 65400 }, { "epoch": 1.0164651841276247, "grad_norm": 2.7714970111846924, "learning_rate": 4.89835503344248e-05, "loss": 1.0701, "step": 65500 }, { "epoch": 1.0180170393705674, "grad_norm": 2.2142317295074463, "learning_rate": 4.898199847918186e-05, "loss": 1.0678, "step": 65600 }, { "epoch": 1.0195688946135104, "grad_norm": 2.509476900100708, "learning_rate": 4.898044662393892e-05, "loss": 1.0608, "step": 65700 }, { "epoch": 1.0211207498564534, "grad_norm": 2.1629464626312256, "learning_rate": 4.8978894768695976e-05, "loss": 1.0574, "step": 65800 }, { "epoch": 1.0226726050993964, "grad_norm": 2.6267380714416504, "learning_rate": 4.8977342913453033e-05, "loss": 1.0599, "step": 65900 }, { "epoch": 1.0242244603423392, "grad_norm": 2.3453283309936523, "learning_rate": 4.897579105821009e-05, "loss": 1.0667, "step": 66000 }, { "epoch": 1.0257763155852822, "grad_norm": 2.494065284729004, "learning_rate": 4.897423920296715e-05, "loss": 1.0607, "step": 66100 }, { "epoch": 1.0273281708282251, "grad_norm": 2.2472991943359375, "learning_rate": 4.897268734772421e-05, "loss": 1.0656, "step": 66200 }, { "epoch": 1.0288800260711681, "grad_norm": 2.5747063159942627, "learning_rate": 4.8971135492481264e-05, "loss": 1.0763, "step": 66300 }, { "epoch": 1.0304318813141111, "grad_norm": 2.5225722789764404, "learning_rate": 4.896958363723832e-05, "loss": 1.0889, "step": 66400 }, { "epoch": 1.0319837365570539, "grad_norm": 2.925856590270996, "learning_rate": 4.896803178199538e-05, "loss": 1.0686, "step": 66500 }, { "epoch": 1.0335355917999969, "grad_norm": 2.5085785388946533, "learning_rate": 4.896647992675243e-05, "loss": 1.0747, "step": 66600 }, { "epoch": 1.0350874470429399, "grad_norm": 2.308800220489502, "learning_rate": 4.896492807150949e-05, "loss": 1.0381, "step": 66700 }, { "epoch": 1.0366393022858829, "grad_norm": 2.865907669067383, "learning_rate": 4.8963376216266546e-05, "loss": 1.0709, "step": 66800 }, { "epoch": 1.0381911575288256, "grad_norm": 2.3193445205688477, "learning_rate": 4.8961824361023604e-05, "loss": 1.0681, "step": 66900 }, { "epoch": 1.0397430127717686, "grad_norm": 2.579780340194702, "learning_rate": 4.896027250578066e-05, "loss": 1.0636, "step": 67000 }, { "epoch": 1.0412948680147116, "grad_norm": 2.015373945236206, "learning_rate": 4.895872065053772e-05, "loss": 1.0691, "step": 67100 }, { "epoch": 1.0428467232576546, "grad_norm": 2.585635185241699, "learning_rate": 4.895716879529478e-05, "loss": 1.0689, "step": 67200 }, { "epoch": 1.0443985785005974, "grad_norm": 2.0512821674346924, "learning_rate": 4.8955616940051835e-05, "loss": 1.0597, "step": 67300 }, { "epoch": 1.0459504337435404, "grad_norm": 2.310270309448242, "learning_rate": 4.895406508480889e-05, "loss": 1.0652, "step": 67400 }, { "epoch": 1.0475022889864833, "grad_norm": 3.0121684074401855, "learning_rate": 4.895251322956595e-05, "loss": 1.1025, "step": 67500 }, { "epoch": 1.0490541442294263, "grad_norm": 2.791961431503296, "learning_rate": 4.895096137432301e-05, "loss": 1.0936, "step": 67600 }, { "epoch": 1.0506059994723693, "grad_norm": 2.321652889251709, "learning_rate": 4.8949409519080066e-05, "loss": 1.0744, "step": 67700 }, { "epoch": 1.052157854715312, "grad_norm": 2.584559917449951, "learning_rate": 4.8947857663837124e-05, "loss": 1.0693, "step": 67800 }, { "epoch": 1.053709709958255, "grad_norm": 2.0683956146240234, "learning_rate": 4.8946305808594175e-05, "loss": 1.0665, "step": 67900 }, { "epoch": 1.055261565201198, "grad_norm": 2.237478494644165, "learning_rate": 4.894475395335123e-05, "loss": 1.0665, "step": 68000 }, { "epoch": 1.056813420444141, "grad_norm": 2.586841344833374, "learning_rate": 4.894320209810829e-05, "loss": 1.0396, "step": 68100 }, { "epoch": 1.0583652756870838, "grad_norm": 2.3526418209075928, "learning_rate": 4.894165024286535e-05, "loss": 1.0811, "step": 68200 }, { "epoch": 1.0599171309300268, "grad_norm": 2.3497989177703857, "learning_rate": 4.8940098387622406e-05, "loss": 1.0533, "step": 68300 }, { "epoch": 1.0614689861729698, "grad_norm": 2.36147403717041, "learning_rate": 4.8938546532379464e-05, "loss": 1.0658, "step": 68400 }, { "epoch": 1.0630208414159128, "grad_norm": 2.5031540393829346, "learning_rate": 4.893699467713652e-05, "loss": 1.0752, "step": 68500 }, { "epoch": 1.0645726966588556, "grad_norm": 2.3838536739349365, "learning_rate": 4.893544282189357e-05, "loss": 1.0692, "step": 68600 }, { "epoch": 1.0661245519017986, "grad_norm": 2.539668560028076, "learning_rate": 4.893389096665063e-05, "loss": 1.0492, "step": 68700 }, { "epoch": 1.0676764071447415, "grad_norm": 2.6182823181152344, "learning_rate": 4.893233911140769e-05, "loss": 1.0663, "step": 68800 }, { "epoch": 1.0692282623876845, "grad_norm": 2.2515602111816406, "learning_rate": 4.8930787256164746e-05, "loss": 1.0658, "step": 68900 }, { "epoch": 1.0707801176306275, "grad_norm": 2.6888792514801025, "learning_rate": 4.8929235400921803e-05, "loss": 1.0568, "step": 69000 }, { "epoch": 1.0723319728735703, "grad_norm": 2.3641738891601562, "learning_rate": 4.892768354567886e-05, "loss": 1.0659, "step": 69100 }, { "epoch": 1.0738838281165133, "grad_norm": 2.176417589187622, "learning_rate": 4.892613169043592e-05, "loss": 1.0646, "step": 69200 }, { "epoch": 1.0754356833594563, "grad_norm": 2.678765296936035, "learning_rate": 4.892457983519298e-05, "loss": 1.0718, "step": 69300 }, { "epoch": 1.0769875386023993, "grad_norm": 2.706892728805542, "learning_rate": 4.8923027979950034e-05, "loss": 1.0809, "step": 69400 }, { "epoch": 1.078539393845342, "grad_norm": 2.530580520629883, "learning_rate": 4.8921476124707085e-05, "loss": 1.0382, "step": 69500 }, { "epoch": 1.080091249088285, "grad_norm": 2.3500490188598633, "learning_rate": 4.891992426946414e-05, "loss": 1.0702, "step": 69600 }, { "epoch": 1.081643104331228, "grad_norm": 2.5857555866241455, "learning_rate": 4.89183724142212e-05, "loss": 1.0671, "step": 69700 }, { "epoch": 1.083194959574171, "grad_norm": 2.4008495807647705, "learning_rate": 4.891682055897826e-05, "loss": 1.0453, "step": 69800 }, { "epoch": 1.0847468148171138, "grad_norm": 2.541311502456665, "learning_rate": 4.8915268703735316e-05, "loss": 1.0416, "step": 69900 }, { "epoch": 1.0862986700600568, "grad_norm": 2.5525197982788086, "learning_rate": 4.8913716848492374e-05, "loss": 1.0459, "step": 70000 }, { "epoch": 1.0878505253029997, "grad_norm": 2.490386724472046, "learning_rate": 4.891216499324943e-05, "loss": 1.0507, "step": 70100 }, { "epoch": 1.0894023805459427, "grad_norm": 2.361264228820801, "learning_rate": 4.891061313800649e-05, "loss": 1.0643, "step": 70200 }, { "epoch": 1.0909542357888857, "grad_norm": 2.252572774887085, "learning_rate": 4.890906128276355e-05, "loss": 1.074, "step": 70300 }, { "epoch": 1.0925060910318285, "grad_norm": 2.4451234340667725, "learning_rate": 4.8907509427520605e-05, "loss": 1.0668, "step": 70400 }, { "epoch": 1.0940579462747715, "grad_norm": 2.539808750152588, "learning_rate": 4.890595757227766e-05, "loss": 1.0628, "step": 70500 }, { "epoch": 1.0956098015177145, "grad_norm": 2.6056342124938965, "learning_rate": 4.890440571703472e-05, "loss": 1.0652, "step": 70600 }, { "epoch": 1.0971616567606575, "grad_norm": 2.517812728881836, "learning_rate": 4.890285386179178e-05, "loss": 1.0223, "step": 70700 }, { "epoch": 1.0987135120036002, "grad_norm": 2.015601873397827, "learning_rate": 4.890130200654883e-05, "loss": 1.078, "step": 70800 }, { "epoch": 1.1002653672465432, "grad_norm": 2.611905097961426, "learning_rate": 4.889975015130589e-05, "loss": 1.0842, "step": 70900 }, { "epoch": 1.1018172224894862, "grad_norm": 2.6933865547180176, "learning_rate": 4.8898198296062945e-05, "loss": 1.0733, "step": 71000 }, { "epoch": 1.1033690777324292, "grad_norm": 2.8610610961914062, "learning_rate": 4.889664644082e-05, "loss": 1.0646, "step": 71100 }, { "epoch": 1.104920932975372, "grad_norm": 3.7753584384918213, "learning_rate": 4.889509458557706e-05, "loss": 1.0633, "step": 71200 }, { "epoch": 1.106472788218315, "grad_norm": 3.0731160640716553, "learning_rate": 4.889354273033412e-05, "loss": 1.0598, "step": 71300 }, { "epoch": 1.108024643461258, "grad_norm": 1.8618810176849365, "learning_rate": 4.8891990875091176e-05, "loss": 1.0801, "step": 71400 }, { "epoch": 1.109576498704201, "grad_norm": 2.592750072479248, "learning_rate": 4.8890439019848234e-05, "loss": 1.0684, "step": 71500 }, { "epoch": 1.1111283539471437, "grad_norm": 2.5291922092437744, "learning_rate": 4.888888716460529e-05, "loss": 1.0694, "step": 71600 }, { "epoch": 1.1126802091900867, "grad_norm": 2.3694732189178467, "learning_rate": 4.888733530936235e-05, "loss": 1.0622, "step": 71700 }, { "epoch": 1.1142320644330297, "grad_norm": 2.5593791007995605, "learning_rate": 4.888578345411941e-05, "loss": 1.0693, "step": 71800 }, { "epoch": 1.1157839196759727, "grad_norm": 2.3732500076293945, "learning_rate": 4.888423159887646e-05, "loss": 1.0776, "step": 71900 }, { "epoch": 1.1173357749189154, "grad_norm": 2.921720504760742, "learning_rate": 4.8882679743633516e-05, "loss": 1.054, "step": 72000 }, { "epoch": 1.1188876301618584, "grad_norm": 2.41079044342041, "learning_rate": 4.8881127888390573e-05, "loss": 1.0658, "step": 72100 }, { "epoch": 1.1204394854048014, "grad_norm": 2.1347107887268066, "learning_rate": 4.887957603314763e-05, "loss": 1.0659, "step": 72200 }, { "epoch": 1.1219913406477444, "grad_norm": 2.14314866065979, "learning_rate": 4.887802417790468e-05, "loss": 1.0666, "step": 72300 }, { "epoch": 1.1235431958906874, "grad_norm": 2.482321262359619, "learning_rate": 4.887647232266174e-05, "loss": 1.0596, "step": 72400 }, { "epoch": 1.1250950511336302, "grad_norm": 2.320482015609741, "learning_rate": 4.88749204674188e-05, "loss": 1.0684, "step": 72500 }, { "epoch": 1.1266469063765732, "grad_norm": 2.2989742755889893, "learning_rate": 4.8873368612175855e-05, "loss": 1.0396, "step": 72600 }, { "epoch": 1.1281987616195162, "grad_norm": 2.606377601623535, "learning_rate": 4.887181675693291e-05, "loss": 1.0322, "step": 72700 }, { "epoch": 1.1297506168624591, "grad_norm": 3.1968111991882324, "learning_rate": 4.887026490168997e-05, "loss": 1.0816, "step": 72800 }, { "epoch": 1.131302472105402, "grad_norm": 2.2303757667541504, "learning_rate": 4.886871304644703e-05, "loss": 1.0654, "step": 72900 }, { "epoch": 1.132854327348345, "grad_norm": 2.5193209648132324, "learning_rate": 4.8867161191204086e-05, "loss": 1.0625, "step": 73000 }, { "epoch": 1.134406182591288, "grad_norm": 2.3429617881774902, "learning_rate": 4.8865609335961144e-05, "loss": 1.0467, "step": 73100 }, { "epoch": 1.1359580378342309, "grad_norm": 2.758798122406006, "learning_rate": 4.88640574807182e-05, "loss": 1.054, "step": 73200 }, { "epoch": 1.1375098930771737, "grad_norm": 2.435607671737671, "learning_rate": 4.886250562547526e-05, "loss": 1.044, "step": 73300 }, { "epoch": 1.1390617483201166, "grad_norm": 2.3277082443237305, "learning_rate": 4.886095377023232e-05, "loss": 1.0705, "step": 73400 }, { "epoch": 1.1406136035630596, "grad_norm": 2.5127971172332764, "learning_rate": 4.8859401914989375e-05, "loss": 1.0642, "step": 73500 }, { "epoch": 1.1421654588060026, "grad_norm": 2.4961650371551514, "learning_rate": 4.8857850059746426e-05, "loss": 1.0433, "step": 73600 }, { "epoch": 1.1437173140489456, "grad_norm": 2.786979913711548, "learning_rate": 4.8856298204503484e-05, "loss": 1.0776, "step": 73700 }, { "epoch": 1.1452691692918884, "grad_norm": 2.2501378059387207, "learning_rate": 4.885474634926054e-05, "loss": 1.0364, "step": 73800 }, { "epoch": 1.1468210245348314, "grad_norm": 2.5842654705047607, "learning_rate": 4.88531944940176e-05, "loss": 1.0581, "step": 73900 }, { "epoch": 1.1483728797777744, "grad_norm": 2.6005351543426514, "learning_rate": 4.885164263877466e-05, "loss": 1.0515, "step": 74000 }, { "epoch": 1.1499247350207173, "grad_norm": 2.484531879425049, "learning_rate": 4.8850090783531715e-05, "loss": 1.0578, "step": 74100 }, { "epoch": 1.1514765902636601, "grad_norm": 1.9249801635742188, "learning_rate": 4.884853892828877e-05, "loss": 1.0633, "step": 74200 }, { "epoch": 1.153028445506603, "grad_norm": 2.5429112911224365, "learning_rate": 4.884698707304583e-05, "loss": 1.0575, "step": 74300 }, { "epoch": 1.154580300749546, "grad_norm": 2.355851888656616, "learning_rate": 4.884543521780289e-05, "loss": 1.052, "step": 74400 }, { "epoch": 1.156132155992489, "grad_norm": 1.9789249897003174, "learning_rate": 4.8843883362559946e-05, "loss": 1.0489, "step": 74500 }, { "epoch": 1.1576840112354319, "grad_norm": 2.592796564102173, "learning_rate": 4.8842331507317004e-05, "loss": 1.0658, "step": 74600 }, { "epoch": 1.1592358664783748, "grad_norm": 2.7423107624053955, "learning_rate": 4.884077965207406e-05, "loss": 1.0481, "step": 74700 }, { "epoch": 1.1607877217213178, "grad_norm": 2.4083638191223145, "learning_rate": 4.883922779683112e-05, "loss": 1.0553, "step": 74800 }, { "epoch": 1.1623395769642608, "grad_norm": 2.876737594604492, "learning_rate": 4.883767594158817e-05, "loss": 1.0548, "step": 74900 }, { "epoch": 1.1638914322072038, "grad_norm": 2.7064003944396973, "learning_rate": 4.883612408634523e-05, "loss": 1.0753, "step": 75000 }, { "epoch": 1.1654432874501466, "grad_norm": 1.967431664466858, "learning_rate": 4.8834572231102286e-05, "loss": 1.041, "step": 75100 }, { "epoch": 1.1669951426930896, "grad_norm": 2.364841938018799, "learning_rate": 4.883302037585934e-05, "loss": 1.049, "step": 75200 }, { "epoch": 1.1685469979360326, "grad_norm": 1.9645235538482666, "learning_rate": 4.8831468520616394e-05, "loss": 1.0673, "step": 75300 }, { "epoch": 1.1700988531789756, "grad_norm": 2.3362956047058105, "learning_rate": 4.882991666537345e-05, "loss": 1.0416, "step": 75400 }, { "epoch": 1.1716507084219183, "grad_norm": 2.7678020000457764, "learning_rate": 4.882836481013051e-05, "loss": 1.0572, "step": 75500 }, { "epoch": 1.1732025636648613, "grad_norm": 2.458146810531616, "learning_rate": 4.882681295488757e-05, "loss": 1.0486, "step": 75600 }, { "epoch": 1.1747544189078043, "grad_norm": 2.4043965339660645, "learning_rate": 4.8825261099644625e-05, "loss": 1.0616, "step": 75700 }, { "epoch": 1.1763062741507473, "grad_norm": 2.180474042892456, "learning_rate": 4.882370924440168e-05, "loss": 1.0565, "step": 75800 }, { "epoch": 1.17785812939369, "grad_norm": 2.3034558296203613, "learning_rate": 4.882215738915874e-05, "loss": 1.06, "step": 75900 }, { "epoch": 1.179409984636633, "grad_norm": 2.419978618621826, "learning_rate": 4.88206055339158e-05, "loss": 1.0611, "step": 76000 }, { "epoch": 1.180961839879576, "grad_norm": 2.010608673095703, "learning_rate": 4.8819053678672856e-05, "loss": 1.0456, "step": 76100 }, { "epoch": 1.182513695122519, "grad_norm": 6.079827785491943, "learning_rate": 4.8817501823429914e-05, "loss": 1.047, "step": 76200 }, { "epoch": 1.184065550365462, "grad_norm": 2.345449447631836, "learning_rate": 4.881594996818697e-05, "loss": 1.0501, "step": 76300 }, { "epoch": 1.1856174056084048, "grad_norm": 2.4975788593292236, "learning_rate": 4.881439811294403e-05, "loss": 1.0537, "step": 76400 }, { "epoch": 1.1871692608513478, "grad_norm": 2.6138932704925537, "learning_rate": 4.881284625770108e-05, "loss": 1.0501, "step": 76500 }, { "epoch": 1.1887211160942908, "grad_norm": 2.7621638774871826, "learning_rate": 4.881129440245814e-05, "loss": 1.0653, "step": 76600 }, { "epoch": 1.1902729713372338, "grad_norm": 2.3397762775421143, "learning_rate": 4.8809742547215196e-05, "loss": 1.0707, "step": 76700 }, { "epoch": 1.1918248265801765, "grad_norm": 2.786191940307617, "learning_rate": 4.8808190691972254e-05, "loss": 1.0433, "step": 76800 }, { "epoch": 1.1933766818231195, "grad_norm": 2.018888473510742, "learning_rate": 4.880663883672931e-05, "loss": 1.0586, "step": 76900 }, { "epoch": 1.1949285370660625, "grad_norm": 2.099963426589966, "learning_rate": 4.880508698148637e-05, "loss": 1.0451, "step": 77000 }, { "epoch": 1.1964803923090055, "grad_norm": 2.318004608154297, "learning_rate": 4.880353512624343e-05, "loss": 1.0554, "step": 77100 }, { "epoch": 1.1980322475519483, "grad_norm": 2.8001456260681152, "learning_rate": 4.8801983271000485e-05, "loss": 1.0461, "step": 77200 }, { "epoch": 1.1995841027948912, "grad_norm": 2.64302659034729, "learning_rate": 4.880043141575754e-05, "loss": 1.0581, "step": 77300 }, { "epoch": 1.2011359580378342, "grad_norm": 2.7271790504455566, "learning_rate": 4.87988795605146e-05, "loss": 1.0642, "step": 77400 }, { "epoch": 1.2026878132807772, "grad_norm": 2.226454257965088, "learning_rate": 4.879732770527166e-05, "loss": 1.0471, "step": 77500 }, { "epoch": 1.2042396685237202, "grad_norm": 2.440675973892212, "learning_rate": 4.8795775850028716e-05, "loss": 1.0541, "step": 77600 }, { "epoch": 1.205791523766663, "grad_norm": 2.59090518951416, "learning_rate": 4.8794223994785774e-05, "loss": 1.0448, "step": 77700 }, { "epoch": 1.207343379009606, "grad_norm": 1.9012759923934937, "learning_rate": 4.8792672139542825e-05, "loss": 1.0536, "step": 77800 }, { "epoch": 1.208895234252549, "grad_norm": 2.7634406089782715, "learning_rate": 4.879112028429988e-05, "loss": 1.0453, "step": 77900 }, { "epoch": 1.210447089495492, "grad_norm": 2.5996384620666504, "learning_rate": 4.878956842905694e-05, "loss": 1.0503, "step": 78000 }, { "epoch": 1.2119989447384347, "grad_norm": 2.9367330074310303, "learning_rate": 4.8788016573814e-05, "loss": 1.0489, "step": 78100 }, { "epoch": 1.2135507999813777, "grad_norm": 1.9382590055465698, "learning_rate": 4.8786464718571056e-05, "loss": 1.038, "step": 78200 }, { "epoch": 1.2151026552243207, "grad_norm": 2.6499252319335938, "learning_rate": 4.8784912863328113e-05, "loss": 1.0499, "step": 78300 }, { "epoch": 1.2166545104672637, "grad_norm": 2.6489076614379883, "learning_rate": 4.8783361008085164e-05, "loss": 1.0631, "step": 78400 }, { "epoch": 1.2182063657102065, "grad_norm": 2.2367167472839355, "learning_rate": 4.878180915284222e-05, "loss": 1.0452, "step": 78500 }, { "epoch": 1.2197582209531495, "grad_norm": 2.3587329387664795, "learning_rate": 4.878025729759928e-05, "loss": 1.0281, "step": 78600 }, { "epoch": 1.2213100761960924, "grad_norm": 2.2231717109680176, "learning_rate": 4.877870544235634e-05, "loss": 1.0557, "step": 78700 }, { "epoch": 1.2228619314390354, "grad_norm": 2.2742161750793457, "learning_rate": 4.8777153587113395e-05, "loss": 1.049, "step": 78800 }, { "epoch": 1.2244137866819784, "grad_norm": 2.03485369682312, "learning_rate": 4.877560173187045e-05, "loss": 1.0665, "step": 78900 }, { "epoch": 1.2259656419249212, "grad_norm": 2.4765071868896484, "learning_rate": 4.877404987662751e-05, "loss": 1.0658, "step": 79000 }, { "epoch": 1.2275174971678642, "grad_norm": 2.009545087814331, "learning_rate": 4.877249802138457e-05, "loss": 1.0402, "step": 79100 }, { "epoch": 1.2290693524108072, "grad_norm": 2.3208465576171875, "learning_rate": 4.8770946166141626e-05, "loss": 1.0498, "step": 79200 }, { "epoch": 1.23062120765375, "grad_norm": 2.6699910163879395, "learning_rate": 4.876939431089868e-05, "loss": 1.0597, "step": 79300 }, { "epoch": 1.232173062896693, "grad_norm": 2.575530529022217, "learning_rate": 4.8767842455655735e-05, "loss": 1.0558, "step": 79400 }, { "epoch": 1.233724918139636, "grad_norm": 2.331651210784912, "learning_rate": 4.876629060041279e-05, "loss": 1.0279, "step": 79500 }, { "epoch": 1.235276773382579, "grad_norm": 2.9355666637420654, "learning_rate": 4.876473874516985e-05, "loss": 1.0357, "step": 79600 }, { "epoch": 1.236828628625522, "grad_norm": 2.6148974895477295, "learning_rate": 4.876318688992691e-05, "loss": 1.0474, "step": 79700 }, { "epoch": 1.2383804838684647, "grad_norm": 3.1479880809783936, "learning_rate": 4.8761635034683966e-05, "loss": 1.0875, "step": 79800 }, { "epoch": 1.2399323391114077, "grad_norm": 2.236736297607422, "learning_rate": 4.8760083179441024e-05, "loss": 1.0441, "step": 79900 }, { "epoch": 1.2414841943543506, "grad_norm": 2.532207489013672, "learning_rate": 4.875853132419808e-05, "loss": 1.0789, "step": 80000 }, { "epoch": 1.2430360495972936, "grad_norm": 2.430551528930664, "learning_rate": 4.875697946895514e-05, "loss": 1.0521, "step": 80100 }, { "epoch": 1.2445879048402366, "grad_norm": 2.441737651824951, "learning_rate": 4.87554276137122e-05, "loss": 1.0743, "step": 80200 }, { "epoch": 1.2461397600831794, "grad_norm": 2.2907378673553467, "learning_rate": 4.8753875758469255e-05, "loss": 1.0624, "step": 80300 }, { "epoch": 1.2476916153261224, "grad_norm": 2.010080337524414, "learning_rate": 4.875232390322631e-05, "loss": 1.0423, "step": 80400 }, { "epoch": 1.2492434705690654, "grad_norm": 2.2400338649749756, "learning_rate": 4.875077204798337e-05, "loss": 1.0558, "step": 80500 }, { "epoch": 1.2507953258120081, "grad_norm": 2.2525060176849365, "learning_rate": 4.874922019274042e-05, "loss": 1.0434, "step": 80600 }, { "epoch": 1.2523471810549511, "grad_norm": 2.4823508262634277, "learning_rate": 4.874766833749748e-05, "loss": 1.0521, "step": 80700 }, { "epoch": 1.2538990362978941, "grad_norm": 2.5446553230285645, "learning_rate": 4.874611648225454e-05, "loss": 1.0553, "step": 80800 }, { "epoch": 1.255450891540837, "grad_norm": 2.0295300483703613, "learning_rate": 4.8744564627011595e-05, "loss": 1.0551, "step": 80900 }, { "epoch": 1.25700274678378, "grad_norm": 2.375016689300537, "learning_rate": 4.874301277176865e-05, "loss": 1.0458, "step": 81000 }, { "epoch": 1.2585546020267229, "grad_norm": 2.148250102996826, "learning_rate": 4.874146091652571e-05, "loss": 1.0501, "step": 81100 }, { "epoch": 1.2601064572696659, "grad_norm": 2.9783730506896973, "learning_rate": 4.873990906128277e-05, "loss": 1.0616, "step": 81200 }, { "epoch": 1.2616583125126088, "grad_norm": 2.9982056617736816, "learning_rate": 4.8738357206039826e-05, "loss": 1.0476, "step": 81300 }, { "epoch": 1.2632101677555518, "grad_norm": 2.2925729751586914, "learning_rate": 4.8736805350796883e-05, "loss": 1.0732, "step": 81400 }, { "epoch": 1.2647620229984948, "grad_norm": 2.2186686992645264, "learning_rate": 4.873525349555394e-05, "loss": 1.0344, "step": 81500 }, { "epoch": 1.2663138782414376, "grad_norm": 2.780932664871216, "learning_rate": 4.873370164031099e-05, "loss": 1.0573, "step": 81600 }, { "epoch": 1.2678657334843806, "grad_norm": 2.6456449031829834, "learning_rate": 4.873214978506805e-05, "loss": 1.065, "step": 81700 }, { "epoch": 1.2694175887273236, "grad_norm": 2.3711929321289062, "learning_rate": 4.873059792982511e-05, "loss": 1.0431, "step": 81800 }, { "epoch": 1.2709694439702663, "grad_norm": 1.937865138053894, "learning_rate": 4.8729046074582165e-05, "loss": 1.0556, "step": 81900 }, { "epoch": 1.2725212992132093, "grad_norm": 2.439114809036255, "learning_rate": 4.872749421933922e-05, "loss": 1.0504, "step": 82000 }, { "epoch": 1.2740731544561523, "grad_norm": 2.512678384780884, "learning_rate": 4.8725942364096274e-05, "loss": 1.0597, "step": 82100 }, { "epoch": 1.2756250096990953, "grad_norm": 2.648059844970703, "learning_rate": 4.872439050885333e-05, "loss": 1.0553, "step": 82200 }, { "epoch": 1.2771768649420383, "grad_norm": 2.5972201824188232, "learning_rate": 4.872283865361039e-05, "loss": 1.0495, "step": 82300 }, { "epoch": 1.278728720184981, "grad_norm": 2.959404945373535, "learning_rate": 4.872128679836745e-05, "loss": 1.0374, "step": 82400 }, { "epoch": 1.280280575427924, "grad_norm": 1.9427341222763062, "learning_rate": 4.8719734943124505e-05, "loss": 1.0508, "step": 82500 }, { "epoch": 1.281832430670867, "grad_norm": 2.5581164360046387, "learning_rate": 4.871818308788156e-05, "loss": 1.062, "step": 82600 }, { "epoch": 1.28338428591381, "grad_norm": 2.1727750301361084, "learning_rate": 4.871663123263862e-05, "loss": 1.0395, "step": 82700 }, { "epoch": 1.284936141156753, "grad_norm": 2.438131332397461, "learning_rate": 4.871507937739568e-05, "loss": 1.0424, "step": 82800 }, { "epoch": 1.2864879963996958, "grad_norm": 2.377946376800537, "learning_rate": 4.8713527522152736e-05, "loss": 1.0228, "step": 82900 }, { "epoch": 1.2880398516426388, "grad_norm": 2.323432445526123, "learning_rate": 4.8711975666909794e-05, "loss": 1.0503, "step": 83000 }, { "epoch": 1.2895917068855818, "grad_norm": 2.2485122680664062, "learning_rate": 4.871042381166685e-05, "loss": 1.0584, "step": 83100 }, { "epoch": 1.2911435621285245, "grad_norm": 2.608797073364258, "learning_rate": 4.870887195642391e-05, "loss": 1.0385, "step": 83200 }, { "epoch": 1.2926954173714675, "grad_norm": 2.274207830429077, "learning_rate": 4.870732010118097e-05, "loss": 1.0451, "step": 83300 }, { "epoch": 1.2942472726144105, "grad_norm": 3.038950204849243, "learning_rate": 4.870576824593802e-05, "loss": 1.0591, "step": 83400 }, { "epoch": 1.2957991278573535, "grad_norm": 2.5448522567749023, "learning_rate": 4.8704216390695076e-05, "loss": 1.0537, "step": 83500 }, { "epoch": 1.2973509831002965, "grad_norm": 2.5872418880462646, "learning_rate": 4.8702664535452134e-05, "loss": 1.0655, "step": 83600 }, { "epoch": 1.2989028383432393, "grad_norm": 2.0736887454986572, "learning_rate": 4.870111268020919e-05, "loss": 1.0635, "step": 83700 }, { "epoch": 1.3004546935861823, "grad_norm": 2.5537025928497314, "learning_rate": 4.869956082496625e-05, "loss": 1.0569, "step": 83800 }, { "epoch": 1.3020065488291253, "grad_norm": 1.91986882686615, "learning_rate": 4.869800896972331e-05, "loss": 1.0387, "step": 83900 }, { "epoch": 1.303558404072068, "grad_norm": 2.568892240524292, "learning_rate": 4.8696457114480365e-05, "loss": 1.0527, "step": 84000 }, { "epoch": 1.3051102593150112, "grad_norm": 2.581442356109619, "learning_rate": 4.869490525923742e-05, "loss": 1.0596, "step": 84100 }, { "epoch": 1.306662114557954, "grad_norm": 2.2258124351501465, "learning_rate": 4.869335340399448e-05, "loss": 1.0392, "step": 84200 }, { "epoch": 1.308213969800897, "grad_norm": 2.6356661319732666, "learning_rate": 4.869180154875154e-05, "loss": 1.0237, "step": 84300 }, { "epoch": 1.30976582504384, "grad_norm": 2.6342976093292236, "learning_rate": 4.8690249693508596e-05, "loss": 1.0583, "step": 84400 }, { "epoch": 1.3113176802867827, "grad_norm": 2.647239923477173, "learning_rate": 4.8688697838265653e-05, "loss": 1.0648, "step": 84500 }, { "epoch": 1.3128695355297257, "grad_norm": 2.4482600688934326, "learning_rate": 4.868714598302271e-05, "loss": 1.0365, "step": 84600 }, { "epoch": 1.3144213907726687, "grad_norm": 2.5061986446380615, "learning_rate": 4.868559412777976e-05, "loss": 1.027, "step": 84700 }, { "epoch": 1.3159732460156117, "grad_norm": 2.247843027114868, "learning_rate": 4.868404227253682e-05, "loss": 1.0275, "step": 84800 }, { "epoch": 1.3175251012585547, "grad_norm": 2.4595468044281006, "learning_rate": 4.868249041729388e-05, "loss": 1.0543, "step": 84900 }, { "epoch": 1.3190769565014975, "grad_norm": 2.7042787075042725, "learning_rate": 4.868093856205093e-05, "loss": 1.0589, "step": 85000 }, { "epoch": 1.3206288117444405, "grad_norm": 2.322070598602295, "learning_rate": 4.8679386706807986e-05, "loss": 1.0266, "step": 85100 }, { "epoch": 1.3221806669873835, "grad_norm": 2.3571321964263916, "learning_rate": 4.8677834851565044e-05, "loss": 1.0555, "step": 85200 }, { "epoch": 1.3237325222303262, "grad_norm": 2.5896072387695312, "learning_rate": 4.86762829963221e-05, "loss": 1.0495, "step": 85300 }, { "epoch": 1.3252843774732692, "grad_norm": 2.556939125061035, "learning_rate": 4.867473114107916e-05, "loss": 1.0181, "step": 85400 }, { "epoch": 1.3268362327162122, "grad_norm": 2.522552728652954, "learning_rate": 4.867317928583622e-05, "loss": 1.0542, "step": 85500 }, { "epoch": 1.3283880879591552, "grad_norm": 2.74289870262146, "learning_rate": 4.8671627430593275e-05, "loss": 1.0494, "step": 85600 }, { "epoch": 1.3299399432020982, "grad_norm": 2.4885756969451904, "learning_rate": 4.867007557535033e-05, "loss": 1.0485, "step": 85700 }, { "epoch": 1.331491798445041, "grad_norm": 2.515862226486206, "learning_rate": 4.866852372010739e-05, "loss": 1.0478, "step": 85800 }, { "epoch": 1.333043653687984, "grad_norm": 2.5960795879364014, "learning_rate": 4.866697186486445e-05, "loss": 1.0389, "step": 85900 }, { "epoch": 1.334595508930927, "grad_norm": 2.548943519592285, "learning_rate": 4.8665420009621506e-05, "loss": 1.0185, "step": 86000 }, { "epoch": 1.33614736417387, "grad_norm": 2.428471565246582, "learning_rate": 4.8663868154378564e-05, "loss": 1.0531, "step": 86100 }, { "epoch": 1.337699219416813, "grad_norm": 2.0549609661102295, "learning_rate": 4.866231629913562e-05, "loss": 1.0493, "step": 86200 }, { "epoch": 1.3392510746597557, "grad_norm": 6.152304172515869, "learning_rate": 4.866076444389267e-05, "loss": 1.0463, "step": 86300 }, { "epoch": 1.3408029299026987, "grad_norm": 2.526155471801758, "learning_rate": 4.865921258864973e-05, "loss": 1.0509, "step": 86400 }, { "epoch": 1.3423547851456417, "grad_norm": 2.3234591484069824, "learning_rate": 4.865766073340679e-05, "loss": 1.0581, "step": 86500 }, { "epoch": 1.3439066403885844, "grad_norm": 2.320425033569336, "learning_rate": 4.8656108878163846e-05, "loss": 1.0614, "step": 86600 }, { "epoch": 1.3454584956315274, "grad_norm": 2.7503199577331543, "learning_rate": 4.8654557022920904e-05, "loss": 1.0295, "step": 86700 }, { "epoch": 1.3470103508744704, "grad_norm": 2.619347095489502, "learning_rate": 4.865300516767796e-05, "loss": 1.0313, "step": 86800 }, { "epoch": 1.3485622061174134, "grad_norm": 2.5443243980407715, "learning_rate": 4.865145331243502e-05, "loss": 1.047, "step": 86900 }, { "epoch": 1.3501140613603564, "grad_norm": 2.669553279876709, "learning_rate": 4.864990145719208e-05, "loss": 1.0742, "step": 87000 }, { "epoch": 1.3516659166032992, "grad_norm": 2.082939624786377, "learning_rate": 4.8648349601949135e-05, "loss": 1.0531, "step": 87100 }, { "epoch": 1.3532177718462421, "grad_norm": 2.3499057292938232, "learning_rate": 4.864679774670619e-05, "loss": 1.0548, "step": 87200 }, { "epoch": 1.3547696270891851, "grad_norm": 2.5062954425811768, "learning_rate": 4.864524589146325e-05, "loss": 1.0551, "step": 87300 }, { "epoch": 1.3563214823321281, "grad_norm": 2.4551949501037598, "learning_rate": 4.864369403622031e-05, "loss": 1.0525, "step": 87400 }, { "epoch": 1.3578733375750711, "grad_norm": 2.462491035461426, "learning_rate": 4.8642142180977366e-05, "loss": 1.0435, "step": 87500 }, { "epoch": 1.3594251928180139, "grad_norm": 2.7389044761657715, "learning_rate": 4.864059032573442e-05, "loss": 1.0553, "step": 87600 }, { "epoch": 1.3609770480609569, "grad_norm": 2.4587864875793457, "learning_rate": 4.8639038470491474e-05, "loss": 1.0296, "step": 87700 }, { "epoch": 1.3625289033038999, "grad_norm": 2.245816469192505, "learning_rate": 4.863748661524853e-05, "loss": 1.041, "step": 87800 }, { "epoch": 1.3640807585468426, "grad_norm": 2.0975582599639893, "learning_rate": 4.863593476000559e-05, "loss": 1.0602, "step": 87900 }, { "epoch": 1.3656326137897856, "grad_norm": 2.4069764614105225, "learning_rate": 4.863438290476265e-05, "loss": 1.0495, "step": 88000 }, { "epoch": 1.3671844690327286, "grad_norm": 3.73746919631958, "learning_rate": 4.86328310495197e-05, "loss": 1.0383, "step": 88100 }, { "epoch": 1.3687363242756716, "grad_norm": 2.78701114654541, "learning_rate": 4.8631279194276756e-05, "loss": 1.0456, "step": 88200 }, { "epoch": 1.3702881795186146, "grad_norm": 2.7328758239746094, "learning_rate": 4.8629727339033814e-05, "loss": 1.0717, "step": 88300 }, { "epoch": 1.3718400347615574, "grad_norm": 2.896998405456543, "learning_rate": 4.862817548379087e-05, "loss": 1.0599, "step": 88400 }, { "epoch": 1.3733918900045003, "grad_norm": 2.270195960998535, "learning_rate": 4.862662362854793e-05, "loss": 1.0491, "step": 88500 }, { "epoch": 1.3749437452474433, "grad_norm": 2.6813406944274902, "learning_rate": 4.862507177330499e-05, "loss": 1.0568, "step": 88600 }, { "epoch": 1.3764956004903863, "grad_norm": 2.588879346847534, "learning_rate": 4.8623519918062045e-05, "loss": 1.0236, "step": 88700 }, { "epoch": 1.3780474557333293, "grad_norm": 2.502317190170288, "learning_rate": 4.86219680628191e-05, "loss": 1.0243, "step": 88800 }, { "epoch": 1.379599310976272, "grad_norm": 2.631701946258545, "learning_rate": 4.862041620757616e-05, "loss": 1.0341, "step": 88900 }, { "epoch": 1.381151166219215, "grad_norm": 2.96972393989563, "learning_rate": 4.861886435233322e-05, "loss": 1.0364, "step": 89000 }, { "epoch": 1.382703021462158, "grad_norm": 2.1305992603302, "learning_rate": 4.861731249709027e-05, "loss": 1.0408, "step": 89100 }, { "epoch": 1.3842548767051008, "grad_norm": 2.458451747894287, "learning_rate": 4.861576064184733e-05, "loss": 1.0458, "step": 89200 }, { "epoch": 1.3858067319480438, "grad_norm": 2.3903863430023193, "learning_rate": 4.8614208786604385e-05, "loss": 1.0456, "step": 89300 }, { "epoch": 1.3873585871909868, "grad_norm": 2.647390365600586, "learning_rate": 4.861265693136144e-05, "loss": 1.0299, "step": 89400 }, { "epoch": 1.3889104424339298, "grad_norm": 1.9346131086349487, "learning_rate": 4.86111050761185e-05, "loss": 1.0479, "step": 89500 }, { "epoch": 1.3904622976768728, "grad_norm": 2.754869222640991, "learning_rate": 4.860955322087556e-05, "loss": 1.0417, "step": 89600 }, { "epoch": 1.3920141529198156, "grad_norm": 2.407757043838501, "learning_rate": 4.8608001365632616e-05, "loss": 1.0442, "step": 89700 }, { "epoch": 1.3935660081627586, "grad_norm": 2.588167190551758, "learning_rate": 4.8606449510389674e-05, "loss": 1.0732, "step": 89800 }, { "epoch": 1.3951178634057015, "grad_norm": 2.5167994499206543, "learning_rate": 4.860489765514673e-05, "loss": 1.0456, "step": 89900 }, { "epoch": 1.3966697186486445, "grad_norm": 2.544070243835449, "learning_rate": 4.860334579990379e-05, "loss": 1.0241, "step": 90000 }, { "epoch": 1.3982215738915875, "grad_norm": 2.3652281761169434, "learning_rate": 4.860179394466085e-05, "loss": 1.0512, "step": 90100 }, { "epoch": 1.3997734291345303, "grad_norm": 2.6142187118530273, "learning_rate": 4.8600242089417905e-05, "loss": 1.0281, "step": 90200 }, { "epoch": 1.4013252843774733, "grad_norm": 2.3808839321136475, "learning_rate": 4.859869023417496e-05, "loss": 1.0617, "step": 90300 }, { "epoch": 1.4028771396204163, "grad_norm": 2.7062933444976807, "learning_rate": 4.8597138378932013e-05, "loss": 1.0508, "step": 90400 }, { "epoch": 1.404428994863359, "grad_norm": 2.225489377975464, "learning_rate": 4.859558652368907e-05, "loss": 1.0519, "step": 90500 }, { "epoch": 1.405980850106302, "grad_norm": 2.1889023780822754, "learning_rate": 4.859403466844613e-05, "loss": 1.038, "step": 90600 }, { "epoch": 1.407532705349245, "grad_norm": 2.074084997177124, "learning_rate": 4.859248281320319e-05, "loss": 1.0368, "step": 90700 }, { "epoch": 1.409084560592188, "grad_norm": 2.378923177719116, "learning_rate": 4.8590930957960244e-05, "loss": 1.0471, "step": 90800 }, { "epoch": 1.410636415835131, "grad_norm": 2.370481491088867, "learning_rate": 4.85893791027173e-05, "loss": 1.0318, "step": 90900 }, { "epoch": 1.4121882710780738, "grad_norm": 2.6096673011779785, "learning_rate": 4.858782724747436e-05, "loss": 1.0647, "step": 91000 }, { "epoch": 1.4137401263210168, "grad_norm": 2.072784423828125, "learning_rate": 4.858627539223142e-05, "loss": 1.0854, "step": 91100 }, { "epoch": 1.4152919815639597, "grad_norm": 2.6431357860565186, "learning_rate": 4.8584723536988475e-05, "loss": 1.0503, "step": 91200 }, { "epoch": 1.4168438368069027, "grad_norm": 2.3734118938446045, "learning_rate": 4.858317168174553e-05, "loss": 1.0294, "step": 91300 }, { "epoch": 1.4183956920498457, "grad_norm": 2.892489433288574, "learning_rate": 4.8581619826502584e-05, "loss": 1.0366, "step": 91400 }, { "epoch": 1.4199475472927885, "grad_norm": 2.4368677139282227, "learning_rate": 4.858006797125964e-05, "loss": 1.0249, "step": 91500 }, { "epoch": 1.4214994025357315, "grad_norm": 2.4559972286224365, "learning_rate": 4.85785161160167e-05, "loss": 1.0591, "step": 91600 }, { "epoch": 1.4230512577786745, "grad_norm": 2.695283889770508, "learning_rate": 4.857696426077376e-05, "loss": 1.0401, "step": 91700 }, { "epoch": 1.4246031130216172, "grad_norm": 3.2021265029907227, "learning_rate": 4.8575412405530815e-05, "loss": 1.0336, "step": 91800 }, { "epoch": 1.4261549682645602, "grad_norm": 2.3150668144226074, "learning_rate": 4.857386055028787e-05, "loss": 1.0555, "step": 91900 }, { "epoch": 1.4277068235075032, "grad_norm": 2.712170362472534, "learning_rate": 4.8572308695044924e-05, "loss": 1.0575, "step": 92000 }, { "epoch": 1.4292586787504462, "grad_norm": 2.304283380508423, "learning_rate": 4.857075683980198e-05, "loss": 1.0324, "step": 92100 }, { "epoch": 1.4308105339933892, "grad_norm": 2.0092875957489014, "learning_rate": 4.856920498455904e-05, "loss": 1.0613, "step": 92200 }, { "epoch": 1.432362389236332, "grad_norm": 2.1521353721618652, "learning_rate": 4.85676531293161e-05, "loss": 1.0398, "step": 92300 }, { "epoch": 1.433914244479275, "grad_norm": 2.455341100692749, "learning_rate": 4.8566101274073155e-05, "loss": 1.0523, "step": 92400 }, { "epoch": 1.435466099722218, "grad_norm": 2.065932273864746, "learning_rate": 4.856454941883021e-05, "loss": 1.0326, "step": 92500 }, { "epoch": 1.4370179549651607, "grad_norm": 2.363285779953003, "learning_rate": 4.856299756358727e-05, "loss": 1.0546, "step": 92600 }, { "epoch": 1.4385698102081037, "grad_norm": 2.4742491245269775, "learning_rate": 4.856144570834433e-05, "loss": 1.0497, "step": 92700 }, { "epoch": 1.4401216654510467, "grad_norm": 2.696601629257202, "learning_rate": 4.8559893853101386e-05, "loss": 1.0476, "step": 92800 }, { "epoch": 1.4416735206939897, "grad_norm": 4.366289138793945, "learning_rate": 4.8558341997858444e-05, "loss": 1.0371, "step": 92900 }, { "epoch": 1.4432253759369327, "grad_norm": 2.484636068344116, "learning_rate": 4.85567901426155e-05, "loss": 1.0237, "step": 93000 }, { "epoch": 1.4447772311798754, "grad_norm": 2.702249050140381, "learning_rate": 4.855523828737256e-05, "loss": 1.0276, "step": 93100 }, { "epoch": 1.4463290864228184, "grad_norm": 2.628918409347534, "learning_rate": 4.855368643212962e-05, "loss": 1.0716, "step": 93200 }, { "epoch": 1.4478809416657614, "grad_norm": 2.4079902172088623, "learning_rate": 4.855213457688667e-05, "loss": 1.0537, "step": 93300 }, { "epoch": 1.4494327969087044, "grad_norm": 2.2383265495300293, "learning_rate": 4.8550582721643726e-05, "loss": 1.0287, "step": 93400 }, { "epoch": 1.4509846521516474, "grad_norm": 2.9221346378326416, "learning_rate": 4.8549030866400783e-05, "loss": 1.0248, "step": 93500 }, { "epoch": 1.4525365073945902, "grad_norm": 2.4500772953033447, "learning_rate": 4.854747901115784e-05, "loss": 1.0557, "step": 93600 }, { "epoch": 1.4540883626375332, "grad_norm": 2.4966182708740234, "learning_rate": 4.85459271559149e-05, "loss": 1.0411, "step": 93700 }, { "epoch": 1.4556402178804762, "grad_norm": 2.4531257152557373, "learning_rate": 4.854437530067196e-05, "loss": 1.0476, "step": 93800 }, { "epoch": 1.457192073123419, "grad_norm": 2.508915901184082, "learning_rate": 4.8542823445429014e-05, "loss": 1.0235, "step": 93900 }, { "epoch": 1.458743928366362, "grad_norm": 2.495760202407837, "learning_rate": 4.854127159018607e-05, "loss": 1.061, "step": 94000 }, { "epoch": 1.460295783609305, "grad_norm": 2.259650468826294, "learning_rate": 4.853971973494313e-05, "loss": 1.0383, "step": 94100 }, { "epoch": 1.4618476388522479, "grad_norm": 2.700319290161133, "learning_rate": 4.853816787970019e-05, "loss": 1.048, "step": 94200 }, { "epoch": 1.4633994940951909, "grad_norm": 2.201087236404419, "learning_rate": 4.8536616024457245e-05, "loss": 1.0334, "step": 94300 }, { "epoch": 1.4649513493381336, "grad_norm": 2.2648532390594482, "learning_rate": 4.85350641692143e-05, "loss": 1.0719, "step": 94400 }, { "epoch": 1.4665032045810766, "grad_norm": 2.356703042984009, "learning_rate": 4.853351231397136e-05, "loss": 1.0367, "step": 94500 }, { "epoch": 1.4680550598240196, "grad_norm": 2.51196551322937, "learning_rate": 4.853196045872841e-05, "loss": 1.0263, "step": 94600 }, { "epoch": 1.4696069150669626, "grad_norm": 2.6832783222198486, "learning_rate": 4.853040860348547e-05, "loss": 1.0398, "step": 94700 }, { "epoch": 1.4711587703099056, "grad_norm": 2.966886043548584, "learning_rate": 4.852885674824252e-05, "loss": 1.0476, "step": 94800 }, { "epoch": 1.4727106255528484, "grad_norm": 3.2574920654296875, "learning_rate": 4.852730489299958e-05, "loss": 1.0511, "step": 94900 }, { "epoch": 1.4742624807957914, "grad_norm": 2.14552903175354, "learning_rate": 4.8525753037756636e-05, "loss": 1.0261, "step": 95000 }, { "epoch": 1.4758143360387344, "grad_norm": 2.6332345008850098, "learning_rate": 4.8524201182513694e-05, "loss": 1.0506, "step": 95100 }, { "epoch": 1.4773661912816771, "grad_norm": 2.074132204055786, "learning_rate": 4.852264932727075e-05, "loss": 1.0107, "step": 95200 }, { "epoch": 1.4789180465246201, "grad_norm": 2.341028928756714, "learning_rate": 4.852109747202781e-05, "loss": 1.0292, "step": 95300 }, { "epoch": 1.480469901767563, "grad_norm": 2.1405141353607178, "learning_rate": 4.851954561678487e-05, "loss": 1.0284, "step": 95400 }, { "epoch": 1.482021757010506, "grad_norm": 2.192636489868164, "learning_rate": 4.8517993761541925e-05, "loss": 1.0356, "step": 95500 }, { "epoch": 1.483573612253449, "grad_norm": 2.1814465522766113, "learning_rate": 4.851644190629898e-05, "loss": 1.0624, "step": 95600 }, { "epoch": 1.4851254674963918, "grad_norm": 2.721266031265259, "learning_rate": 4.851489005105604e-05, "loss": 1.0539, "step": 95700 }, { "epoch": 1.4866773227393348, "grad_norm": 2.576841354370117, "learning_rate": 4.85133381958131e-05, "loss": 1.0364, "step": 95800 }, { "epoch": 1.4882291779822778, "grad_norm": 2.71665620803833, "learning_rate": 4.8511786340570156e-05, "loss": 1.0309, "step": 95900 }, { "epoch": 1.4897810332252208, "grad_norm": 2.2186105251312256, "learning_rate": 4.8510234485327214e-05, "loss": 1.0386, "step": 96000 }, { "epoch": 1.4913328884681638, "grad_norm": 2.465268850326538, "learning_rate": 4.8508682630084265e-05, "loss": 1.0432, "step": 96100 }, { "epoch": 1.4928847437111066, "grad_norm": 2.4179365634918213, "learning_rate": 4.850713077484132e-05, "loss": 1.0359, "step": 96200 }, { "epoch": 1.4944365989540496, "grad_norm": 2.218862533569336, "learning_rate": 4.850557891959838e-05, "loss": 1.0271, "step": 96300 }, { "epoch": 1.4959884541969926, "grad_norm": 2.8809542655944824, "learning_rate": 4.850402706435544e-05, "loss": 1.0445, "step": 96400 }, { "epoch": 1.4975403094399353, "grad_norm": 2.712141513824463, "learning_rate": 4.8502475209112496e-05, "loss": 1.0553, "step": 96500 }, { "epoch": 1.4990921646828783, "grad_norm": 2.47284197807312, "learning_rate": 4.8500923353869553e-05, "loss": 1.0487, "step": 96600 }, { "epoch": 1.5006440199258213, "grad_norm": 2.436793327331543, "learning_rate": 4.849937149862661e-05, "loss": 1.0382, "step": 96700 }, { "epoch": 1.5021958751687643, "grad_norm": 2.403978109359741, "learning_rate": 4.849781964338367e-05, "loss": 1.0415, "step": 96800 }, { "epoch": 1.5037477304117073, "grad_norm": 3.166785955429077, "learning_rate": 4.849626778814073e-05, "loss": 1.0196, "step": 96900 }, { "epoch": 1.50529958565465, "grad_norm": 2.512316942214966, "learning_rate": 4.8494715932897784e-05, "loss": 1.0469, "step": 97000 }, { "epoch": 1.506851440897593, "grad_norm": 2.60292649269104, "learning_rate": 4.849316407765484e-05, "loss": 1.014, "step": 97100 }, { "epoch": 1.508403296140536, "grad_norm": 2.820655345916748, "learning_rate": 4.84916122224119e-05, "loss": 1.0498, "step": 97200 }, { "epoch": 1.5099551513834788, "grad_norm": 2.67410945892334, "learning_rate": 4.849006036716896e-05, "loss": 1.0496, "step": 97300 }, { "epoch": 1.511507006626422, "grad_norm": 2.8473763465881348, "learning_rate": 4.848850851192601e-05, "loss": 1.057, "step": 97400 }, { "epoch": 1.5130588618693648, "grad_norm": 2.317537546157837, "learning_rate": 4.8486956656683066e-05, "loss": 1.0322, "step": 97500 }, { "epoch": 1.5146107171123078, "grad_norm": 4.24163818359375, "learning_rate": 4.8485404801440124e-05, "loss": 1.0686, "step": 97600 }, { "epoch": 1.5161625723552508, "grad_norm": 2.571455478668213, "learning_rate": 4.848385294619718e-05, "loss": 1.0347, "step": 97700 }, { "epoch": 1.5177144275981935, "grad_norm": 2.1442370414733887, "learning_rate": 4.848230109095424e-05, "loss": 1.02, "step": 97800 }, { "epoch": 1.5192662828411367, "grad_norm": 2.101008653640747, "learning_rate": 4.848074923571129e-05, "loss": 1.0503, "step": 97900 }, { "epoch": 1.5208181380840795, "grad_norm": 2.4342355728149414, "learning_rate": 4.847919738046835e-05, "loss": 1.0389, "step": 98000 }, { "epoch": 1.5223699933270225, "grad_norm": 2.130798816680908, "learning_rate": 4.8477645525225406e-05, "loss": 1.0464, "step": 98100 }, { "epoch": 1.5239218485699655, "grad_norm": 2.71527099609375, "learning_rate": 4.8476093669982464e-05, "loss": 1.0515, "step": 98200 }, { "epoch": 1.5254737038129083, "grad_norm": 5.213271141052246, "learning_rate": 4.847454181473952e-05, "loss": 1.0294, "step": 98300 }, { "epoch": 1.5270255590558512, "grad_norm": 4.145285129547119, "learning_rate": 4.847298995949658e-05, "loss": 1.0274, "step": 98400 }, { "epoch": 1.5285774142987942, "grad_norm": 2.578350782394409, "learning_rate": 4.847143810425364e-05, "loss": 1.0232, "step": 98500 }, { "epoch": 1.530129269541737, "grad_norm": 2.7888824939727783, "learning_rate": 4.8469886249010695e-05, "loss": 1.0566, "step": 98600 }, { "epoch": 1.5316811247846802, "grad_norm": 2.8916525840759277, "learning_rate": 4.846833439376775e-05, "loss": 1.0472, "step": 98700 }, { "epoch": 1.533232980027623, "grad_norm": 2.306682825088501, "learning_rate": 4.846678253852481e-05, "loss": 1.0176, "step": 98800 }, { "epoch": 1.534784835270566, "grad_norm": 2.6601696014404297, "learning_rate": 4.846523068328186e-05, "loss": 1.0228, "step": 98900 }, { "epoch": 1.536336690513509, "grad_norm": 2.6579089164733887, "learning_rate": 4.846367882803892e-05, "loss": 1.0347, "step": 99000 }, { "epoch": 1.5378885457564517, "grad_norm": 2.85756778717041, "learning_rate": 4.846212697279598e-05, "loss": 1.0412, "step": 99100 }, { "epoch": 1.539440400999395, "grad_norm": 2.507534980773926, "learning_rate": 4.8460575117553035e-05, "loss": 1.0382, "step": 99200 }, { "epoch": 1.5409922562423377, "grad_norm": 2.2480547428131104, "learning_rate": 4.845902326231009e-05, "loss": 1.0258, "step": 99300 }, { "epoch": 1.5425441114852807, "grad_norm": 2.4130470752716064, "learning_rate": 4.845747140706715e-05, "loss": 1.0421, "step": 99400 }, { "epoch": 1.5440959667282237, "grad_norm": 2.6892507076263428, "learning_rate": 4.845591955182421e-05, "loss": 1.0347, "step": 99500 }, { "epoch": 1.5456478219711665, "grad_norm": 2.2742438316345215, "learning_rate": 4.8454367696581266e-05, "loss": 1.0397, "step": 99600 }, { "epoch": 1.5471996772141094, "grad_norm": 2.398554801940918, "learning_rate": 4.8452815841338323e-05, "loss": 1.0112, "step": 99700 }, { "epoch": 1.5487515324570524, "grad_norm": 2.680093288421631, "learning_rate": 4.845126398609538e-05, "loss": 1.0183, "step": 99800 }, { "epoch": 1.5503033876999952, "grad_norm": 2.536892890930176, "learning_rate": 4.844971213085244e-05, "loss": 1.0455, "step": 99900 }, { "epoch": 1.5518552429429384, "grad_norm": 3.957472085952759, "learning_rate": 4.84481602756095e-05, "loss": 1.0469, "step": 100000 }, { "epoch": 1.5534070981858812, "grad_norm": 2.6657824516296387, "learning_rate": 4.8446608420366554e-05, "loss": 1.0305, "step": 100100 }, { "epoch": 1.5549589534288242, "grad_norm": 2.403201103210449, "learning_rate": 4.8445056565123605e-05, "loss": 1.0715, "step": 100200 }, { "epoch": 1.5565108086717672, "grad_norm": 2.65813946723938, "learning_rate": 4.844350470988066e-05, "loss": 1.0458, "step": 100300 }, { "epoch": 1.55806266391471, "grad_norm": 2.3557276725769043, "learning_rate": 4.844195285463772e-05, "loss": 1.0268, "step": 100400 }, { "epoch": 1.559614519157653, "grad_norm": 2.609313488006592, "learning_rate": 4.844040099939478e-05, "loss": 1.0294, "step": 100500 }, { "epoch": 1.561166374400596, "grad_norm": 2.591097116470337, "learning_rate": 4.8438849144151836e-05, "loss": 1.0302, "step": 100600 }, { "epoch": 1.562718229643539, "grad_norm": 2.490663766860962, "learning_rate": 4.8437297288908894e-05, "loss": 1.0337, "step": 100700 }, { "epoch": 1.564270084886482, "grad_norm": 2.336423873901367, "learning_rate": 4.843574543366595e-05, "loss": 1.0344, "step": 100800 }, { "epoch": 1.5658219401294247, "grad_norm": 2.2479634284973145, "learning_rate": 4.843419357842301e-05, "loss": 1.0472, "step": 100900 }, { "epoch": 1.5673737953723677, "grad_norm": 2.462336301803589, "learning_rate": 4.843264172318007e-05, "loss": 1.0228, "step": 101000 }, { "epoch": 1.5689256506153106, "grad_norm": 2.606776237487793, "learning_rate": 4.8431089867937125e-05, "loss": 1.0478, "step": 101100 }, { "epoch": 1.5704775058582534, "grad_norm": 2.0719971656799316, "learning_rate": 4.8429538012694176e-05, "loss": 1.0557, "step": 101200 }, { "epoch": 1.5720293611011966, "grad_norm": 2.6997807025909424, "learning_rate": 4.8427986157451234e-05, "loss": 1.0321, "step": 101300 }, { "epoch": 1.5735812163441394, "grad_norm": 2.132478952407837, "learning_rate": 4.842643430220829e-05, "loss": 1.0438, "step": 101400 }, { "epoch": 1.5751330715870824, "grad_norm": 2.4815993309020996, "learning_rate": 4.842488244696535e-05, "loss": 1.0226, "step": 101500 }, { "epoch": 1.5766849268300254, "grad_norm": 2.361069917678833, "learning_rate": 4.842333059172241e-05, "loss": 1.05, "step": 101600 }, { "epoch": 1.5782367820729681, "grad_norm": 2.4454362392425537, "learning_rate": 4.8421778736479465e-05, "loss": 1.0335, "step": 101700 }, { "epoch": 1.5797886373159111, "grad_norm": 2.5617315769195557, "learning_rate": 4.8420226881236516e-05, "loss": 1.0297, "step": 101800 }, { "epoch": 1.5813404925588541, "grad_norm": 2.6035072803497314, "learning_rate": 4.8418675025993574e-05, "loss": 1.0303, "step": 101900 }, { "epoch": 1.5828923478017969, "grad_norm": 2.7591607570648193, "learning_rate": 4.841712317075063e-05, "loss": 1.0272, "step": 102000 }, { "epoch": 1.58444420304474, "grad_norm": 2.4730143547058105, "learning_rate": 4.841557131550769e-05, "loss": 1.0306, "step": 102100 }, { "epoch": 1.5859960582876829, "grad_norm": 2.3496601581573486, "learning_rate": 4.841401946026475e-05, "loss": 1.0358, "step": 102200 }, { "epoch": 1.5875479135306259, "grad_norm": 2.5361745357513428, "learning_rate": 4.8412467605021805e-05, "loss": 1.0259, "step": 102300 }, { "epoch": 1.5890997687735688, "grad_norm": 2.463172197341919, "learning_rate": 4.841091574977886e-05, "loss": 1.0243, "step": 102400 }, { "epoch": 1.5906516240165116, "grad_norm": 2.413402795791626, "learning_rate": 4.840936389453592e-05, "loss": 1.0369, "step": 102500 }, { "epoch": 1.5922034792594548, "grad_norm": 2.9052634239196777, "learning_rate": 4.840781203929298e-05, "loss": 1.0269, "step": 102600 }, { "epoch": 1.5937553345023976, "grad_norm": 2.3175466060638428, "learning_rate": 4.8406260184050036e-05, "loss": 1.0305, "step": 102700 }, { "epoch": 1.5953071897453406, "grad_norm": 2.1093995571136475, "learning_rate": 4.8404708328807093e-05, "loss": 1.0362, "step": 102800 }, { "epoch": 1.5968590449882836, "grad_norm": 2.177419424057007, "learning_rate": 4.840315647356415e-05, "loss": 1.0177, "step": 102900 }, { "epoch": 1.5984109002312263, "grad_norm": 2.163776397705078, "learning_rate": 4.840160461832121e-05, "loss": 1.0379, "step": 103000 }, { "epoch": 1.5999627554741693, "grad_norm": 2.4889352321624756, "learning_rate": 4.840005276307826e-05, "loss": 1.0323, "step": 103100 }, { "epoch": 1.6015146107171123, "grad_norm": 2.6706483364105225, "learning_rate": 4.839850090783532e-05, "loss": 1.0263, "step": 103200 }, { "epoch": 1.603066465960055, "grad_norm": 2.4786291122436523, "learning_rate": 4.8396949052592375e-05, "loss": 1.0424, "step": 103300 }, { "epoch": 1.6046183212029983, "grad_norm": 2.155407667160034, "learning_rate": 4.839539719734943e-05, "loss": 1.0551, "step": 103400 }, { "epoch": 1.606170176445941, "grad_norm": 2.686784029006958, "learning_rate": 4.839384534210649e-05, "loss": 1.0375, "step": 103500 }, { "epoch": 1.607722031688884, "grad_norm": 2.1338021755218506, "learning_rate": 4.839229348686355e-05, "loss": 1.0064, "step": 103600 }, { "epoch": 1.609273886931827, "grad_norm": 2.6391828060150146, "learning_rate": 4.8390741631620606e-05, "loss": 1.0261, "step": 103700 }, { "epoch": 1.6108257421747698, "grad_norm": 2.5809531211853027, "learning_rate": 4.8389189776377664e-05, "loss": 1.0588, "step": 103800 }, { "epoch": 1.612377597417713, "grad_norm": 2.5263278484344482, "learning_rate": 4.838763792113472e-05, "loss": 1.0176, "step": 103900 }, { "epoch": 1.6139294526606558, "grad_norm": 2.4641308784484863, "learning_rate": 4.838608606589178e-05, "loss": 1.0379, "step": 104000 }, { "epoch": 1.6154813079035988, "grad_norm": 2.978569746017456, "learning_rate": 4.838453421064884e-05, "loss": 1.0312, "step": 104100 }, { "epoch": 1.6170331631465418, "grad_norm": 2.499652862548828, "learning_rate": 4.8382982355405895e-05, "loss": 1.0388, "step": 104200 }, { "epoch": 1.6185850183894845, "grad_norm": 2.189666748046875, "learning_rate": 4.838143050016295e-05, "loss": 1.0413, "step": 104300 }, { "epoch": 1.6201368736324275, "grad_norm": 2.5846946239471436, "learning_rate": 4.8379878644920004e-05, "loss": 1.0176, "step": 104400 }, { "epoch": 1.6216887288753705, "grad_norm": 2.51509428024292, "learning_rate": 4.837832678967706e-05, "loss": 1.0201, "step": 104500 }, { "epoch": 1.6232405841183133, "grad_norm": 2.597632884979248, "learning_rate": 4.837677493443411e-05, "loss": 1.0478, "step": 104600 }, { "epoch": 1.6247924393612565, "grad_norm": 3.05515456199646, "learning_rate": 4.837522307919117e-05, "loss": 1.0424, "step": 104700 }, { "epoch": 1.6263442946041993, "grad_norm": 2.3954620361328125, "learning_rate": 4.837367122394823e-05, "loss": 1.0277, "step": 104800 }, { "epoch": 1.6278961498471423, "grad_norm": 2.5021378993988037, "learning_rate": 4.8372119368705286e-05, "loss": 1.051, "step": 104900 }, { "epoch": 1.6294480050900852, "grad_norm": 3.2989766597747803, "learning_rate": 4.8370567513462344e-05, "loss": 1.0088, "step": 105000 }, { "epoch": 1.630999860333028, "grad_norm": 2.7341039180755615, "learning_rate": 4.83690156582194e-05, "loss": 1.0213, "step": 105100 }, { "epoch": 1.6325517155759712, "grad_norm": 2.1372978687286377, "learning_rate": 4.836746380297646e-05, "loss": 1.0492, "step": 105200 }, { "epoch": 1.634103570818914, "grad_norm": 2.2881431579589844, "learning_rate": 4.836591194773352e-05, "loss": 1.0452, "step": 105300 }, { "epoch": 1.635655426061857, "grad_norm": 2.4673640727996826, "learning_rate": 4.8364360092490575e-05, "loss": 1.0239, "step": 105400 }, { "epoch": 1.6372072813048, "grad_norm": 2.0236401557922363, "learning_rate": 4.836280823724763e-05, "loss": 1.0199, "step": 105500 }, { "epoch": 1.6387591365477427, "grad_norm": 3.1825478076934814, "learning_rate": 4.836125638200469e-05, "loss": 1.0455, "step": 105600 }, { "epoch": 1.6403109917906857, "grad_norm": 2.1954190731048584, "learning_rate": 4.835970452676175e-05, "loss": 1.0237, "step": 105700 }, { "epoch": 1.6418628470336287, "grad_norm": 2.4303133487701416, "learning_rate": 4.8358152671518806e-05, "loss": 1.0334, "step": 105800 }, { "epoch": 1.6434147022765715, "grad_norm": 2.295259714126587, "learning_rate": 4.835660081627586e-05, "loss": 1.0185, "step": 105900 }, { "epoch": 1.6449665575195147, "grad_norm": 2.5711405277252197, "learning_rate": 4.8355048961032914e-05, "loss": 1.0425, "step": 106000 }, { "epoch": 1.6465184127624575, "grad_norm": 3.9194374084472656, "learning_rate": 4.835349710578997e-05, "loss": 1.0132, "step": 106100 }, { "epoch": 1.6480702680054005, "grad_norm": 2.5271852016448975, "learning_rate": 4.835194525054703e-05, "loss": 1.0073, "step": 106200 }, { "epoch": 1.6496221232483435, "grad_norm": 2.4699437618255615, "learning_rate": 4.835039339530409e-05, "loss": 1.0557, "step": 106300 }, { "epoch": 1.6511739784912862, "grad_norm": 2.36885666847229, "learning_rate": 4.8348841540061145e-05, "loss": 1.0209, "step": 106400 }, { "epoch": 1.6527258337342294, "grad_norm": 2.550990104675293, "learning_rate": 4.83472896848182e-05, "loss": 1.0297, "step": 106500 }, { "epoch": 1.6542776889771722, "grad_norm": 2.3559253215789795, "learning_rate": 4.834573782957526e-05, "loss": 1.0315, "step": 106600 }, { "epoch": 1.6558295442201152, "grad_norm": 2.1715376377105713, "learning_rate": 4.834418597433232e-05, "loss": 1.0126, "step": 106700 }, { "epoch": 1.6573813994630582, "grad_norm": 2.351818323135376, "learning_rate": 4.8342634119089376e-05, "loss": 1.044, "step": 106800 }, { "epoch": 1.658933254706001, "grad_norm": 2.068754196166992, "learning_rate": 4.8341082263846434e-05, "loss": 1.01, "step": 106900 }, { "epoch": 1.660485109948944, "grad_norm": 2.365856885910034, "learning_rate": 4.833953040860349e-05, "loss": 1.0359, "step": 107000 }, { "epoch": 1.662036965191887, "grad_norm": 2.6751911640167236, "learning_rate": 4.833797855336055e-05, "loss": 1.0476, "step": 107100 }, { "epoch": 1.6635888204348297, "grad_norm": 2.587235689163208, "learning_rate": 4.83364266981176e-05, "loss": 1.0232, "step": 107200 }, { "epoch": 1.665140675677773, "grad_norm": 2.331709146499634, "learning_rate": 4.833487484287466e-05, "loss": 1.0556, "step": 107300 }, { "epoch": 1.6666925309207157, "grad_norm": 2.683762550354004, "learning_rate": 4.8333322987631716e-05, "loss": 1.0096, "step": 107400 }, { "epoch": 1.6682443861636587, "grad_norm": 2.47930908203125, "learning_rate": 4.8331771132388774e-05, "loss": 1.0326, "step": 107500 }, { "epoch": 1.6697962414066017, "grad_norm": 2.2835793495178223, "learning_rate": 4.833021927714583e-05, "loss": 1.0237, "step": 107600 }, { "epoch": 1.6713480966495444, "grad_norm": 2.4262642860412598, "learning_rate": 4.832866742190288e-05, "loss": 1.0647, "step": 107700 }, { "epoch": 1.6728999518924874, "grad_norm": 2.9424259662628174, "learning_rate": 4.832711556665994e-05, "loss": 1.0272, "step": 107800 }, { "epoch": 1.6744518071354304, "grad_norm": 2.6113853454589844, "learning_rate": 4.8325563711417e-05, "loss": 1.0448, "step": 107900 }, { "epoch": 1.6760036623783734, "grad_norm": 2.581740140914917, "learning_rate": 4.8324011856174056e-05, "loss": 1.0578, "step": 108000 }, { "epoch": 1.6775555176213164, "grad_norm": 2.6260592937469482, "learning_rate": 4.8322460000931114e-05, "loss": 1.0273, "step": 108100 }, { "epoch": 1.6791073728642592, "grad_norm": 2.41333270072937, "learning_rate": 4.832090814568817e-05, "loss": 1.0457, "step": 108200 }, { "epoch": 1.6806592281072021, "grad_norm": 2.339979887008667, "learning_rate": 4.831935629044523e-05, "loss": 1.0179, "step": 108300 }, { "epoch": 1.6822110833501451, "grad_norm": 2.3533267974853516, "learning_rate": 4.831780443520229e-05, "loss": 1.0399, "step": 108400 }, { "epoch": 1.683762938593088, "grad_norm": 2.730045795440674, "learning_rate": 4.8316252579959345e-05, "loss": 1.0386, "step": 108500 }, { "epoch": 1.685314793836031, "grad_norm": 2.608795404434204, "learning_rate": 4.83147007247164e-05, "loss": 1.0239, "step": 108600 }, { "epoch": 1.6868666490789739, "grad_norm": 2.3437068462371826, "learning_rate": 4.831314886947346e-05, "loss": 1.0273, "step": 108700 }, { "epoch": 1.6884185043219169, "grad_norm": 2.5793745517730713, "learning_rate": 4.831159701423051e-05, "loss": 1.0239, "step": 108800 }, { "epoch": 1.6899703595648599, "grad_norm": 2.4704103469848633, "learning_rate": 4.831004515898757e-05, "loss": 1.0356, "step": 108900 }, { "epoch": 1.6915222148078026, "grad_norm": 2.25103497505188, "learning_rate": 4.830849330374463e-05, "loss": 1.036, "step": 109000 }, { "epoch": 1.6930740700507456, "grad_norm": 2.8936445713043213, "learning_rate": 4.8306941448501684e-05, "loss": 1.0196, "step": 109100 }, { "epoch": 1.6946259252936886, "grad_norm": 2.198181629180908, "learning_rate": 4.830538959325874e-05, "loss": 1.0246, "step": 109200 }, { "epoch": 1.6961777805366316, "grad_norm": 2.4963974952697754, "learning_rate": 4.83038377380158e-05, "loss": 1.034, "step": 109300 }, { "epoch": 1.6977296357795746, "grad_norm": 2.2657971382141113, "learning_rate": 4.830228588277286e-05, "loss": 1.0073, "step": 109400 }, { "epoch": 1.6992814910225174, "grad_norm": 2.8547563552856445, "learning_rate": 4.8300734027529915e-05, "loss": 1.0407, "step": 109500 }, { "epoch": 1.7008333462654603, "grad_norm": 2.440886974334717, "learning_rate": 4.829918217228697e-05, "loss": 1.0334, "step": 109600 }, { "epoch": 1.7023852015084033, "grad_norm": 2.567873239517212, "learning_rate": 4.829763031704403e-05, "loss": 1.0421, "step": 109700 }, { "epoch": 1.703937056751346, "grad_norm": 2.1132235527038574, "learning_rate": 4.829607846180109e-05, "loss": 1.0457, "step": 109800 }, { "epoch": 1.7054889119942893, "grad_norm": 2.423190116882324, "learning_rate": 4.8294526606558146e-05, "loss": 1.0177, "step": 109900 }, { "epoch": 1.707040767237232, "grad_norm": 2.1305267810821533, "learning_rate": 4.8292974751315204e-05, "loss": 1.0294, "step": 110000 }, { "epoch": 1.708592622480175, "grad_norm": 2.3493754863739014, "learning_rate": 4.8291422896072255e-05, "loss": 1.0464, "step": 110100 }, { "epoch": 1.710144477723118, "grad_norm": 2.6269371509552, "learning_rate": 4.828987104082931e-05, "loss": 1.0114, "step": 110200 }, { "epoch": 1.7116963329660608, "grad_norm": 2.30163311958313, "learning_rate": 4.828831918558637e-05, "loss": 1.026, "step": 110300 }, { "epoch": 1.7132481882090038, "grad_norm": 1.9489164352416992, "learning_rate": 4.828676733034343e-05, "loss": 1.0176, "step": 110400 }, { "epoch": 1.7148000434519468, "grad_norm": 8.252035140991211, "learning_rate": 4.8285215475100486e-05, "loss": 1.0285, "step": 110500 }, { "epoch": 1.7163518986948896, "grad_norm": 4.504483222961426, "learning_rate": 4.8283663619857544e-05, "loss": 1.0291, "step": 110600 }, { "epoch": 1.7179037539378328, "grad_norm": 2.4104626178741455, "learning_rate": 4.82821117646146e-05, "loss": 1.0248, "step": 110700 }, { "epoch": 1.7194556091807756, "grad_norm": 2.067858934402466, "learning_rate": 4.828055990937166e-05, "loss": 1.0272, "step": 110800 }, { "epoch": 1.7210074644237185, "grad_norm": 2.3199551105499268, "learning_rate": 4.827900805412871e-05, "loss": 1.0189, "step": 110900 }, { "epoch": 1.7225593196666615, "grad_norm": 2.232020854949951, "learning_rate": 4.827745619888577e-05, "loss": 1.0428, "step": 111000 }, { "epoch": 1.7241111749096043, "grad_norm": 2.935426712036133, "learning_rate": 4.8275904343642826e-05, "loss": 1.0456, "step": 111100 }, { "epoch": 1.7256630301525475, "grad_norm": 2.4509568214416504, "learning_rate": 4.8274352488399884e-05, "loss": 1.0224, "step": 111200 }, { "epoch": 1.7272148853954903, "grad_norm": 2.489701271057129, "learning_rate": 4.827280063315694e-05, "loss": 1.022, "step": 111300 }, { "epoch": 1.7287667406384333, "grad_norm": 2.576862096786499, "learning_rate": 4.8271248777914e-05, "loss": 1.035, "step": 111400 }, { "epoch": 1.7303185958813763, "grad_norm": 2.5274600982666016, "learning_rate": 4.826969692267106e-05, "loss": 1.0352, "step": 111500 }, { "epoch": 1.731870451124319, "grad_norm": 2.096816062927246, "learning_rate": 4.826814506742811e-05, "loss": 1.028, "step": 111600 }, { "epoch": 1.733422306367262, "grad_norm": 2.3782708644866943, "learning_rate": 4.8266593212185166e-05, "loss": 1.0097, "step": 111700 }, { "epoch": 1.734974161610205, "grad_norm": 2.1873621940612793, "learning_rate": 4.8265041356942223e-05, "loss": 0.9994, "step": 111800 }, { "epoch": 1.7365260168531478, "grad_norm": 2.199803352355957, "learning_rate": 4.826348950169928e-05, "loss": 1.0162, "step": 111900 }, { "epoch": 1.738077872096091, "grad_norm": 2.373566150665283, "learning_rate": 4.826193764645634e-05, "loss": 1.0211, "step": 112000 }, { "epoch": 1.7396297273390338, "grad_norm": 2.428107261657715, "learning_rate": 4.82603857912134e-05, "loss": 1.0344, "step": 112100 }, { "epoch": 1.7411815825819768, "grad_norm": 3.2218306064605713, "learning_rate": 4.8258833935970454e-05, "loss": 1.008, "step": 112200 }, { "epoch": 1.7427334378249197, "grad_norm": 2.475245237350464, "learning_rate": 4.825728208072751e-05, "loss": 1.01, "step": 112300 }, { "epoch": 1.7442852930678625, "grad_norm": 2.771214008331299, "learning_rate": 4.825573022548457e-05, "loss": 1.0018, "step": 112400 }, { "epoch": 1.7458371483108057, "grad_norm": 2.7797138690948486, "learning_rate": 4.825417837024163e-05, "loss": 1.0424, "step": 112500 }, { "epoch": 1.7473890035537485, "grad_norm": 2.1806671619415283, "learning_rate": 4.8252626514998685e-05, "loss": 1.0325, "step": 112600 }, { "epoch": 1.7489408587966915, "grad_norm": 2.1175684928894043, "learning_rate": 4.825107465975574e-05, "loss": 1.0233, "step": 112700 }, { "epoch": 1.7504927140396345, "grad_norm": 2.4249267578125, "learning_rate": 4.82495228045128e-05, "loss": 1.0292, "step": 112800 }, { "epoch": 1.7520445692825772, "grad_norm": 2.007124185562134, "learning_rate": 4.824797094926985e-05, "loss": 1.0377, "step": 112900 }, { "epoch": 1.7535964245255202, "grad_norm": 2.4387528896331787, "learning_rate": 4.824641909402691e-05, "loss": 1.0251, "step": 113000 }, { "epoch": 1.7551482797684632, "grad_norm": 2.363665819168091, "learning_rate": 4.824486723878397e-05, "loss": 1.0411, "step": 113100 }, { "epoch": 1.756700135011406, "grad_norm": 2.501901388168335, "learning_rate": 4.8243315383541025e-05, "loss": 1.0112, "step": 113200 }, { "epoch": 1.7582519902543492, "grad_norm": 2.154752016067505, "learning_rate": 4.824176352829808e-05, "loss": 1.0327, "step": 113300 }, { "epoch": 1.759803845497292, "grad_norm": 2.0281460285186768, "learning_rate": 4.824021167305514e-05, "loss": 1.0308, "step": 113400 }, { "epoch": 1.761355700740235, "grad_norm": 2.351263999938965, "learning_rate": 4.82386598178122e-05, "loss": 1.0149, "step": 113500 }, { "epoch": 1.762907555983178, "grad_norm": 2.7321178913116455, "learning_rate": 4.8237107962569256e-05, "loss": 1.025, "step": 113600 }, { "epoch": 1.7644594112261207, "grad_norm": 2.8011810779571533, "learning_rate": 4.8235556107326314e-05, "loss": 1.0228, "step": 113700 }, { "epoch": 1.766011266469064, "grad_norm": 2.897516965866089, "learning_rate": 4.823400425208337e-05, "loss": 1.0437, "step": 113800 }, { "epoch": 1.7675631217120067, "grad_norm": 2.767461061477661, "learning_rate": 4.823245239684043e-05, "loss": 1.0235, "step": 113900 }, { "epoch": 1.7691149769549497, "grad_norm": 2.3777735233306885, "learning_rate": 4.823090054159749e-05, "loss": 1.0267, "step": 114000 }, { "epoch": 1.7706668321978927, "grad_norm": 2.8053927421569824, "learning_rate": 4.8229348686354545e-05, "loss": 1.0294, "step": 114100 }, { "epoch": 1.7722186874408354, "grad_norm": 3.1602888107299805, "learning_rate": 4.8227796831111596e-05, "loss": 1.0169, "step": 114200 }, { "epoch": 1.7737705426837784, "grad_norm": 2.4116344451904297, "learning_rate": 4.8226244975868654e-05, "loss": 1.0407, "step": 114300 }, { "epoch": 1.7753223979267214, "grad_norm": 2.565544366836548, "learning_rate": 4.8224693120625705e-05, "loss": 1.0026, "step": 114400 }, { "epoch": 1.7768742531696642, "grad_norm": 2.553924560546875, "learning_rate": 4.822314126538276e-05, "loss": 1.024, "step": 114500 }, { "epoch": 1.7784261084126074, "grad_norm": 2.206970453262329, "learning_rate": 4.822158941013982e-05, "loss": 1.0381, "step": 114600 }, { "epoch": 1.7799779636555502, "grad_norm": 2.2536020278930664, "learning_rate": 4.822003755489688e-05, "loss": 1.0488, "step": 114700 }, { "epoch": 1.7815298188984932, "grad_norm": 2.2535243034362793, "learning_rate": 4.8218485699653936e-05, "loss": 1.0215, "step": 114800 }, { "epoch": 1.7830816741414361, "grad_norm": 2.9539706707000732, "learning_rate": 4.8216933844410993e-05, "loss": 1.0125, "step": 114900 }, { "epoch": 1.784633529384379, "grad_norm": 2.5208640098571777, "learning_rate": 4.821538198916805e-05, "loss": 1.0234, "step": 115000 }, { "epoch": 1.7861853846273221, "grad_norm": 2.206538438796997, "learning_rate": 4.821383013392511e-05, "loss": 1.0418, "step": 115100 }, { "epoch": 1.787737239870265, "grad_norm": 2.0503034591674805, "learning_rate": 4.821227827868217e-05, "loss": 1.0225, "step": 115200 }, { "epoch": 1.7892890951132079, "grad_norm": 2.7764992713928223, "learning_rate": 4.8210726423439224e-05, "loss": 1.0271, "step": 115300 }, { "epoch": 1.7908409503561509, "grad_norm": 2.52107834815979, "learning_rate": 4.820917456819628e-05, "loss": 1.0066, "step": 115400 }, { "epoch": 1.7923928055990936, "grad_norm": 2.4581079483032227, "learning_rate": 4.820762271295334e-05, "loss": 1.0144, "step": 115500 }, { "epoch": 1.7939446608420366, "grad_norm": 2.7939162254333496, "learning_rate": 4.82060708577104e-05, "loss": 1.0254, "step": 115600 }, { "epoch": 1.7954965160849796, "grad_norm": 3.1606225967407227, "learning_rate": 4.820451900246745e-05, "loss": 1.0279, "step": 115700 }, { "epoch": 1.7970483713279224, "grad_norm": 2.539543390274048, "learning_rate": 4.8202967147224506e-05, "loss": 1.0078, "step": 115800 }, { "epoch": 1.7986002265708656, "grad_norm": 2.4680018424987793, "learning_rate": 4.8201415291981564e-05, "loss": 1.0116, "step": 115900 }, { "epoch": 1.8001520818138084, "grad_norm": 2.548048973083496, "learning_rate": 4.819986343673862e-05, "loss": 1.0434, "step": 116000 }, { "epoch": 1.8017039370567514, "grad_norm": 2.2374303340911865, "learning_rate": 4.819831158149568e-05, "loss": 1.0396, "step": 116100 }, { "epoch": 1.8032557922996943, "grad_norm": 2.7799503803253174, "learning_rate": 4.819675972625274e-05, "loss": 1.0208, "step": 116200 }, { "epoch": 1.8048076475426371, "grad_norm": 2.2386441230773926, "learning_rate": 4.8195207871009795e-05, "loss": 1.0374, "step": 116300 }, { "epoch": 1.80635950278558, "grad_norm": 2.3718347549438477, "learning_rate": 4.819365601576685e-05, "loss": 1.0163, "step": 116400 }, { "epoch": 1.807911358028523, "grad_norm": 2.115276336669922, "learning_rate": 4.819210416052391e-05, "loss": 1.0207, "step": 116500 }, { "epoch": 1.809463213271466, "grad_norm": 3.004133462905884, "learning_rate": 4.819055230528097e-05, "loss": 1.0204, "step": 116600 }, { "epoch": 1.811015068514409, "grad_norm": 2.4428024291992188, "learning_rate": 4.8189000450038026e-05, "loss": 1.024, "step": 116700 }, { "epoch": 1.8125669237573518, "grad_norm": 3.419023036956787, "learning_rate": 4.8187448594795084e-05, "loss": 0.9967, "step": 116800 }, { "epoch": 1.8141187790002948, "grad_norm": 2.4446609020233154, "learning_rate": 4.818589673955214e-05, "loss": 1.0159, "step": 116900 }, { "epoch": 1.8156706342432378, "grad_norm": 2.733894109725952, "learning_rate": 4.818434488430919e-05, "loss": 1.038, "step": 117000 }, { "epoch": 1.8172224894861806, "grad_norm": 2.1228413581848145, "learning_rate": 4.818279302906625e-05, "loss": 1.0277, "step": 117100 }, { "epoch": 1.8187743447291238, "grad_norm": 1.866918683052063, "learning_rate": 4.818124117382331e-05, "loss": 1.0192, "step": 117200 }, { "epoch": 1.8203261999720666, "grad_norm": 2.2304766178131104, "learning_rate": 4.8179689318580366e-05, "loss": 0.996, "step": 117300 }, { "epoch": 1.8218780552150096, "grad_norm": 2.963873863220215, "learning_rate": 4.8178137463337424e-05, "loss": 1.0079, "step": 117400 }, { "epoch": 1.8234299104579526, "grad_norm": 2.7369155883789062, "learning_rate": 4.8176585608094475e-05, "loss": 1.0437, "step": 117500 }, { "epoch": 1.8249817657008953, "grad_norm": 2.7096636295318604, "learning_rate": 4.817503375285153e-05, "loss": 1.0154, "step": 117600 }, { "epoch": 1.8265336209438383, "grad_norm": 2.4085943698883057, "learning_rate": 4.817348189760859e-05, "loss": 1.0214, "step": 117700 }, { "epoch": 1.8280854761867813, "grad_norm": 2.7866759300231934, "learning_rate": 4.817193004236565e-05, "loss": 1.0232, "step": 117800 }, { "epoch": 1.8296373314297243, "grad_norm": 2.1857497692108154, "learning_rate": 4.8170378187122706e-05, "loss": 1.0249, "step": 117900 }, { "epoch": 1.8311891866726673, "grad_norm": 2.0040228366851807, "learning_rate": 4.8168826331879763e-05, "loss": 1.0275, "step": 118000 }, { "epoch": 1.83274104191561, "grad_norm": 2.047473669052124, "learning_rate": 4.816727447663682e-05, "loss": 1.0104, "step": 118100 }, { "epoch": 1.834292897158553, "grad_norm": 3.4671640396118164, "learning_rate": 4.816572262139388e-05, "loss": 1.0126, "step": 118200 }, { "epoch": 1.835844752401496, "grad_norm": 2.7492518424987793, "learning_rate": 4.816417076615094e-05, "loss": 1.0136, "step": 118300 }, { "epoch": 1.8373966076444388, "grad_norm": 2.570439338684082, "learning_rate": 4.8162618910907994e-05, "loss": 0.9988, "step": 118400 }, { "epoch": 1.838948462887382, "grad_norm": 2.8855667114257812, "learning_rate": 4.816106705566505e-05, "loss": 1.0268, "step": 118500 }, { "epoch": 1.8405003181303248, "grad_norm": 2.581472635269165, "learning_rate": 4.81595152004221e-05, "loss": 1.037, "step": 118600 }, { "epoch": 1.8420521733732678, "grad_norm": 2.7675156593322754, "learning_rate": 4.815796334517916e-05, "loss": 1.0079, "step": 118700 }, { "epoch": 1.8436040286162108, "grad_norm": 2.866234064102173, "learning_rate": 4.815641148993622e-05, "loss": 1.0174, "step": 118800 }, { "epoch": 1.8451558838591535, "grad_norm": 2.2983806133270264, "learning_rate": 4.8154859634693276e-05, "loss": 1.0236, "step": 118900 }, { "epoch": 1.8467077391020965, "grad_norm": 2.2907536029815674, "learning_rate": 4.8153307779450334e-05, "loss": 1.0194, "step": 119000 }, { "epoch": 1.8482595943450395, "grad_norm": 2.7146682739257812, "learning_rate": 4.815175592420739e-05, "loss": 1.0233, "step": 119100 }, { "epoch": 1.8498114495879823, "grad_norm": 2.5846409797668457, "learning_rate": 4.815020406896445e-05, "loss": 1.0222, "step": 119200 }, { "epoch": 1.8513633048309255, "grad_norm": 2.5325047969818115, "learning_rate": 4.814865221372151e-05, "loss": 1.0162, "step": 119300 }, { "epoch": 1.8529151600738683, "grad_norm": 2.6054675579071045, "learning_rate": 4.8147100358478565e-05, "loss": 1.0315, "step": 119400 }, { "epoch": 1.8544670153168112, "grad_norm": 2.5377357006073, "learning_rate": 4.814554850323562e-05, "loss": 1.0242, "step": 119500 }, { "epoch": 1.8560188705597542, "grad_norm": 2.346802234649658, "learning_rate": 4.814399664799268e-05, "loss": 1.0393, "step": 119600 }, { "epoch": 1.857570725802697, "grad_norm": 2.3707704544067383, "learning_rate": 4.814244479274974e-05, "loss": 1.0346, "step": 119700 }, { "epoch": 1.8591225810456402, "grad_norm": 2.6467478275299072, "learning_rate": 4.8140892937506796e-05, "loss": 1.0325, "step": 119800 }, { "epoch": 1.860674436288583, "grad_norm": 2.196178436279297, "learning_rate": 4.813934108226385e-05, "loss": 1.0238, "step": 119900 }, { "epoch": 1.862226291531526, "grad_norm": 2.223439931869507, "learning_rate": 4.8137789227020905e-05, "loss": 1.0228, "step": 120000 }, { "epoch": 1.863778146774469, "grad_norm": 2.972374439239502, "learning_rate": 4.813623737177796e-05, "loss": 1.0138, "step": 120100 }, { "epoch": 1.8653300020174117, "grad_norm": 2.626208543777466, "learning_rate": 4.813468551653502e-05, "loss": 1.0197, "step": 120200 }, { "epoch": 1.8668818572603547, "grad_norm": 2.8805160522460938, "learning_rate": 4.813313366129208e-05, "loss": 1.0511, "step": 120300 }, { "epoch": 1.8684337125032977, "grad_norm": 2.5027639865875244, "learning_rate": 4.8131581806049136e-05, "loss": 1.0286, "step": 120400 }, { "epoch": 1.8699855677462405, "grad_norm": 2.304419755935669, "learning_rate": 4.8130029950806194e-05, "loss": 1.0218, "step": 120500 }, { "epoch": 1.8715374229891837, "grad_norm": 2.8290162086486816, "learning_rate": 4.812847809556325e-05, "loss": 1.0278, "step": 120600 }, { "epoch": 1.8730892782321265, "grad_norm": 2.5237464904785156, "learning_rate": 4.81269262403203e-05, "loss": 1.0306, "step": 120700 }, { "epoch": 1.8746411334750694, "grad_norm": 2.3679933547973633, "learning_rate": 4.812537438507736e-05, "loss": 1.0192, "step": 120800 }, { "epoch": 1.8761929887180124, "grad_norm": 2.672912359237671, "learning_rate": 4.812382252983442e-05, "loss": 1.0279, "step": 120900 }, { "epoch": 1.8777448439609552, "grad_norm": 2.0581037998199463, "learning_rate": 4.8122270674591476e-05, "loss": 1.0104, "step": 121000 }, { "epoch": 1.8792966992038984, "grad_norm": 2.238527297973633, "learning_rate": 4.8120718819348533e-05, "loss": 1.0379, "step": 121100 }, { "epoch": 1.8808485544468412, "grad_norm": 2.6540327072143555, "learning_rate": 4.811916696410559e-05, "loss": 1.0097, "step": 121200 }, { "epoch": 1.8824004096897842, "grad_norm": 2.565361499786377, "learning_rate": 4.811761510886265e-05, "loss": 1.0202, "step": 121300 }, { "epoch": 1.8839522649327272, "grad_norm": 2.155156373977661, "learning_rate": 4.81160632536197e-05, "loss": 1.0323, "step": 121400 }, { "epoch": 1.88550412017567, "grad_norm": 2.3520073890686035, "learning_rate": 4.811451139837676e-05, "loss": 1.0088, "step": 121500 }, { "epoch": 1.887055975418613, "grad_norm": 2.345146894454956, "learning_rate": 4.8112959543133815e-05, "loss": 1.0218, "step": 121600 }, { "epoch": 1.888607830661556, "grad_norm": 3.163705348968506, "learning_rate": 4.811140768789087e-05, "loss": 1.0233, "step": 121700 }, { "epoch": 1.8901596859044987, "grad_norm": 2.520773410797119, "learning_rate": 4.810985583264793e-05, "loss": 1.0077, "step": 121800 }, { "epoch": 1.8917115411474419, "grad_norm": 2.3323748111724854, "learning_rate": 4.810830397740499e-05, "loss": 1.0377, "step": 121900 }, { "epoch": 1.8932633963903847, "grad_norm": 2.542151689529419, "learning_rate": 4.8106752122162046e-05, "loss": 1.0148, "step": 122000 }, { "epoch": 1.8948152516333276, "grad_norm": 2.202765941619873, "learning_rate": 4.8105200266919104e-05, "loss": 1.0289, "step": 122100 }, { "epoch": 1.8963671068762706, "grad_norm": 2.4793877601623535, "learning_rate": 4.810364841167616e-05, "loss": 1.0297, "step": 122200 }, { "epoch": 1.8979189621192134, "grad_norm": 2.698054313659668, "learning_rate": 4.810209655643322e-05, "loss": 1.0091, "step": 122300 }, { "epoch": 1.8994708173621566, "grad_norm": 2.4055309295654297, "learning_rate": 4.810054470119028e-05, "loss": 1.0166, "step": 122400 }, { "epoch": 1.9010226726050994, "grad_norm": 3.0389952659606934, "learning_rate": 4.8098992845947335e-05, "loss": 1.0132, "step": 122500 }, { "epoch": 1.9025745278480424, "grad_norm": 1.877671241760254, "learning_rate": 4.809744099070439e-05, "loss": 1.034, "step": 122600 }, { "epoch": 1.9041263830909854, "grad_norm": 2.2840425968170166, "learning_rate": 4.8095889135461444e-05, "loss": 1.0196, "step": 122700 }, { "epoch": 1.9056782383339281, "grad_norm": 2.1240336894989014, "learning_rate": 4.80943372802185e-05, "loss": 1.0249, "step": 122800 }, { "epoch": 1.9072300935768711, "grad_norm": 2.5354321002960205, "learning_rate": 4.809278542497556e-05, "loss": 1.0102, "step": 122900 }, { "epoch": 1.9087819488198141, "grad_norm": 2.7257163524627686, "learning_rate": 4.809123356973262e-05, "loss": 1.0338, "step": 123000 }, { "epoch": 1.9103338040627569, "grad_norm": 2.945620059967041, "learning_rate": 4.8089681714489675e-05, "loss": 1.0148, "step": 123100 }, { "epoch": 1.9118856593057, "grad_norm": 2.2530574798583984, "learning_rate": 4.808812985924673e-05, "loss": 1.0092, "step": 123200 }, { "epoch": 1.9134375145486429, "grad_norm": 2.3718366622924805, "learning_rate": 4.808657800400379e-05, "loss": 1.0253, "step": 123300 }, { "epoch": 1.9149893697915858, "grad_norm": 2.5912930965423584, "learning_rate": 4.808502614876085e-05, "loss": 1.0186, "step": 123400 }, { "epoch": 1.9165412250345288, "grad_norm": 3.1436502933502197, "learning_rate": 4.8083474293517906e-05, "loss": 1.0241, "step": 123500 }, { "epoch": 1.9180930802774716, "grad_norm": 2.161839008331299, "learning_rate": 4.8081922438274964e-05, "loss": 1.0207, "step": 123600 }, { "epoch": 1.9196449355204148, "grad_norm": 2.7813308238983154, "learning_rate": 4.808037058303202e-05, "loss": 1.0289, "step": 123700 }, { "epoch": 1.9211967907633576, "grad_norm": 2.096066474914551, "learning_rate": 4.807881872778908e-05, "loss": 1.0192, "step": 123800 }, { "epoch": 1.9227486460063006, "grad_norm": 2.3731138706207275, "learning_rate": 4.807726687254614e-05, "loss": 1.0097, "step": 123900 }, { "epoch": 1.9243005012492436, "grad_norm": 2.39955735206604, "learning_rate": 4.807571501730319e-05, "loss": 1.0066, "step": 124000 }, { "epoch": 1.9258523564921863, "grad_norm": 2.5131313800811768, "learning_rate": 4.8074163162060246e-05, "loss": 1.016, "step": 124100 }, { "epoch": 1.9274042117351293, "grad_norm": 2.7201948165893555, "learning_rate": 4.8072611306817303e-05, "loss": 1.0065, "step": 124200 }, { "epoch": 1.9289560669780723, "grad_norm": 2.1090707778930664, "learning_rate": 4.8071059451574354e-05, "loss": 1.0116, "step": 124300 }, { "epoch": 1.930507922221015, "grad_norm": 2.869464635848999, "learning_rate": 4.806950759633141e-05, "loss": 1.034, "step": 124400 }, { "epoch": 1.9320597774639583, "grad_norm": 1.9806116819381714, "learning_rate": 4.806795574108847e-05, "loss": 1.0147, "step": 124500 }, { "epoch": 1.933611632706901, "grad_norm": 2.8213675022125244, "learning_rate": 4.806640388584553e-05, "loss": 1.0116, "step": 124600 }, { "epoch": 1.935163487949844, "grad_norm": 2.6346912384033203, "learning_rate": 4.8064852030602585e-05, "loss": 1.0426, "step": 124700 }, { "epoch": 1.936715343192787, "grad_norm": 2.074028730392456, "learning_rate": 4.806330017535964e-05, "loss": 1.007, "step": 124800 }, { "epoch": 1.9382671984357298, "grad_norm": 2.0715694427490234, "learning_rate": 4.80617483201167e-05, "loss": 1.0001, "step": 124900 }, { "epoch": 1.9398190536786728, "grad_norm": 2.4837756156921387, "learning_rate": 4.806019646487376e-05, "loss": 1.001, "step": 125000 }, { "epoch": 1.9413709089216158, "grad_norm": 1.995121955871582, "learning_rate": 4.8058644609630816e-05, "loss": 1.0072, "step": 125100 }, { "epoch": 1.9429227641645588, "grad_norm": 2.409510612487793, "learning_rate": 4.8057092754387874e-05, "loss": 1.0169, "step": 125200 }, { "epoch": 1.9444746194075018, "grad_norm": 2.352543592453003, "learning_rate": 4.805554089914493e-05, "loss": 1.0094, "step": 125300 }, { "epoch": 1.9460264746504445, "grad_norm": 2.454559803009033, "learning_rate": 4.805398904390199e-05, "loss": 1.0221, "step": 125400 }, { "epoch": 1.9475783298933875, "grad_norm": 2.3507015705108643, "learning_rate": 4.805243718865905e-05, "loss": 1.0143, "step": 125500 }, { "epoch": 1.9491301851363305, "grad_norm": 2.504887342453003, "learning_rate": 4.80508853334161e-05, "loss": 1.0212, "step": 125600 }, { "epoch": 1.9506820403792733, "grad_norm": 2.1493654251098633, "learning_rate": 4.8049333478173156e-05, "loss": 1.0124, "step": 125700 }, { "epoch": 1.9522338956222165, "grad_norm": 2.0082032680511475, "learning_rate": 4.8047781622930214e-05, "loss": 1.0255, "step": 125800 }, { "epoch": 1.9537857508651593, "grad_norm": 2.716933488845825, "learning_rate": 4.804622976768727e-05, "loss": 1.008, "step": 125900 }, { "epoch": 1.9553376061081023, "grad_norm": 2.308375120162964, "learning_rate": 4.804467791244433e-05, "loss": 1.023, "step": 126000 }, { "epoch": 1.9568894613510452, "grad_norm": 2.7682197093963623, "learning_rate": 4.804312605720139e-05, "loss": 1.0072, "step": 126100 }, { "epoch": 1.958441316593988, "grad_norm": 2.179842710494995, "learning_rate": 4.8041574201958445e-05, "loss": 1.0376, "step": 126200 }, { "epoch": 1.959993171836931, "grad_norm": 2.3493030071258545, "learning_rate": 4.80400223467155e-05, "loss": 1.0019, "step": 126300 }, { "epoch": 1.961545027079874, "grad_norm": 2.295003890991211, "learning_rate": 4.803847049147256e-05, "loss": 1.0201, "step": 126400 }, { "epoch": 1.963096882322817, "grad_norm": 2.513681173324585, "learning_rate": 4.803691863622962e-05, "loss": 1.01, "step": 126500 }, { "epoch": 1.96464873756576, "grad_norm": 2.9557945728302, "learning_rate": 4.8035366780986676e-05, "loss": 1.0234, "step": 126600 }, { "epoch": 1.9662005928087027, "grad_norm": 1.9602794647216797, "learning_rate": 4.8033814925743734e-05, "loss": 0.9991, "step": 126700 }, { "epoch": 1.9677524480516457, "grad_norm": 2.613243818283081, "learning_rate": 4.803226307050079e-05, "loss": 1.0002, "step": 126800 }, { "epoch": 1.9693043032945887, "grad_norm": 2.2246649265289307, "learning_rate": 4.803071121525784e-05, "loss": 1.0177, "step": 126900 }, { "epoch": 1.9708561585375315, "grad_norm": 2.2543396949768066, "learning_rate": 4.80291593600149e-05, "loss": 1.032, "step": 127000 }, { "epoch": 1.9724080137804747, "grad_norm": 2.5448849201202393, "learning_rate": 4.802760750477196e-05, "loss": 1.012, "step": 127100 }, { "epoch": 1.9739598690234175, "grad_norm": 2.3876867294311523, "learning_rate": 4.802605564952901e-05, "loss": 1.0276, "step": 127200 }, { "epoch": 1.9755117242663605, "grad_norm": 2.6352028846740723, "learning_rate": 4.802450379428607e-05, "loss": 1.0117, "step": 127300 }, { "epoch": 1.9770635795093034, "grad_norm": 2.306156635284424, "learning_rate": 4.8022951939043124e-05, "loss": 1.0015, "step": 127400 }, { "epoch": 1.9786154347522462, "grad_norm": 2.1620559692382812, "learning_rate": 4.802140008380018e-05, "loss": 1.0132, "step": 127500 }, { "epoch": 1.9801672899951892, "grad_norm": 2.2716236114501953, "learning_rate": 4.801984822855724e-05, "loss": 0.9717, "step": 127600 }, { "epoch": 1.9817191452381322, "grad_norm": 2.2961597442626953, "learning_rate": 4.80182963733143e-05, "loss": 0.9948, "step": 127700 }, { "epoch": 1.983271000481075, "grad_norm": 2.165555000305176, "learning_rate": 4.8016744518071355e-05, "loss": 1.0137, "step": 127800 }, { "epoch": 1.9848228557240182, "grad_norm": 2.616580009460449, "learning_rate": 4.801519266282841e-05, "loss": 1.0091, "step": 127900 }, { "epoch": 1.986374710966961, "grad_norm": 2.6117653846740723, "learning_rate": 4.801364080758547e-05, "loss": 1.0083, "step": 128000 }, { "epoch": 1.987926566209904, "grad_norm": 2.256971836090088, "learning_rate": 4.801208895234253e-05, "loss": 1.0224, "step": 128100 }, { "epoch": 1.989478421452847, "grad_norm": 2.2237319946289062, "learning_rate": 4.8010537097099586e-05, "loss": 1.0271, "step": 128200 }, { "epoch": 1.9910302766957897, "grad_norm": 2.903693914413452, "learning_rate": 4.8008985241856644e-05, "loss": 1.0105, "step": 128300 }, { "epoch": 1.992582131938733, "grad_norm": 2.233490467071533, "learning_rate": 4.8007433386613695e-05, "loss": 1.0025, "step": 128400 }, { "epoch": 1.9941339871816757, "grad_norm": 2.316995859146118, "learning_rate": 4.800588153137075e-05, "loss": 1.0254, "step": 128500 }, { "epoch": 1.9956858424246187, "grad_norm": 2.6368143558502197, "learning_rate": 4.800432967612781e-05, "loss": 1.0341, "step": 128600 }, { "epoch": 1.9972376976675617, "grad_norm": 5.31099271774292, "learning_rate": 4.800277782088487e-05, "loss": 1.0237, "step": 128700 }, { "epoch": 1.9987895529105044, "grad_norm": 2.646951198577881, "learning_rate": 4.8001225965641926e-05, "loss": 1.0312, "step": 128800 }, { "epoch": 2.0003414081534476, "grad_norm": 2.3072469234466553, "learning_rate": 4.7999674110398984e-05, "loss": 0.9997, "step": 128900 }, { "epoch": 2.0018932633963904, "grad_norm": 2.3450260162353516, "learning_rate": 4.799812225515604e-05, "loss": 1.0237, "step": 129000 }, { "epoch": 2.003445118639333, "grad_norm": 2.921640396118164, "learning_rate": 4.79965703999131e-05, "loss": 1.0305, "step": 129100 }, { "epoch": 2.0049969738822764, "grad_norm": 2.3614583015441895, "learning_rate": 4.799501854467016e-05, "loss": 1.0106, "step": 129200 }, { "epoch": 2.006548829125219, "grad_norm": 2.8055689334869385, "learning_rate": 4.7993466689427215e-05, "loss": 1.019, "step": 129300 }, { "epoch": 2.0081006843681624, "grad_norm": 2.5535237789154053, "learning_rate": 4.799191483418427e-05, "loss": 0.9919, "step": 129400 }, { "epoch": 2.009652539611105, "grad_norm": 2.602146863937378, "learning_rate": 4.799036297894133e-05, "loss": 1.0241, "step": 129500 }, { "epoch": 2.011204394854048, "grad_norm": 2.1555957794189453, "learning_rate": 4.798881112369839e-05, "loss": 1.0152, "step": 129600 }, { "epoch": 2.012756250096991, "grad_norm": 2.2155966758728027, "learning_rate": 4.798725926845544e-05, "loss": 1.0003, "step": 129700 }, { "epoch": 2.014308105339934, "grad_norm": 9.139165878295898, "learning_rate": 4.79857074132125e-05, "loss": 1.0141, "step": 129800 }, { "epoch": 2.0158599605828766, "grad_norm": 2.898930311203003, "learning_rate": 4.7984155557969555e-05, "loss": 1.0125, "step": 129900 }, { "epoch": 2.01741181582582, "grad_norm": 2.3774921894073486, "learning_rate": 4.798260370272661e-05, "loss": 1.0169, "step": 130000 }, { "epoch": 2.0189636710687626, "grad_norm": 2.478914260864258, "learning_rate": 4.798105184748367e-05, "loss": 1.0234, "step": 130100 }, { "epoch": 2.020515526311706, "grad_norm": 2.1113388538360596, "learning_rate": 4.797949999224073e-05, "loss": 1.0114, "step": 130200 }, { "epoch": 2.0220673815546486, "grad_norm": 3.8178200721740723, "learning_rate": 4.7977948136997786e-05, "loss": 1.0103, "step": 130300 }, { "epoch": 2.0236192367975914, "grad_norm": 2.328853130340576, "learning_rate": 4.7976396281754843e-05, "loss": 1.0106, "step": 130400 }, { "epoch": 2.0251710920405346, "grad_norm": 2.419834852218628, "learning_rate": 4.7974844426511894e-05, "loss": 1.0267, "step": 130500 }, { "epoch": 2.0267229472834773, "grad_norm": 2.3748600482940674, "learning_rate": 4.797329257126895e-05, "loss": 1.0034, "step": 130600 }, { "epoch": 2.02827480252642, "grad_norm": 1.918850064277649, "learning_rate": 4.797174071602601e-05, "loss": 1.0228, "step": 130700 }, { "epoch": 2.0298266577693633, "grad_norm": 2.585904836654663, "learning_rate": 4.797018886078307e-05, "loss": 0.9886, "step": 130800 }, { "epoch": 2.031378513012306, "grad_norm": 2.5708611011505127, "learning_rate": 4.7968637005540125e-05, "loss": 1.017, "step": 130900 }, { "epoch": 2.0329303682552493, "grad_norm": 2.5384292602539062, "learning_rate": 4.796708515029718e-05, "loss": 1.0192, "step": 131000 }, { "epoch": 2.034482223498192, "grad_norm": 2.157093048095703, "learning_rate": 4.796553329505424e-05, "loss": 1.0123, "step": 131100 }, { "epoch": 2.036034078741135, "grad_norm": 2.9992048740386963, "learning_rate": 4.796398143981129e-05, "loss": 0.9822, "step": 131200 }, { "epoch": 2.037585933984078, "grad_norm": 2.6075832843780518, "learning_rate": 4.796242958456835e-05, "loss": 1.0167, "step": 131300 }, { "epoch": 2.039137789227021, "grad_norm": 3.7947583198547363, "learning_rate": 4.796087772932541e-05, "loss": 1.0081, "step": 131400 }, { "epoch": 2.040689644469964, "grad_norm": 2.5098185539245605, "learning_rate": 4.7959325874082465e-05, "loss": 1.0026, "step": 131500 }, { "epoch": 2.042241499712907, "grad_norm": 2.2005245685577393, "learning_rate": 4.795777401883952e-05, "loss": 1.0294, "step": 131600 }, { "epoch": 2.0437933549558496, "grad_norm": 2.9244186878204346, "learning_rate": 4.795622216359658e-05, "loss": 1.0106, "step": 131700 }, { "epoch": 2.045345210198793, "grad_norm": 2.651183843612671, "learning_rate": 4.795467030835364e-05, "loss": 0.9858, "step": 131800 }, { "epoch": 2.0468970654417356, "grad_norm": 2.1205084323883057, "learning_rate": 4.7953118453110696e-05, "loss": 0.9852, "step": 131900 }, { "epoch": 2.0484489206846783, "grad_norm": 2.498283863067627, "learning_rate": 4.7951566597867754e-05, "loss": 0.9944, "step": 132000 }, { "epoch": 2.0500007759276215, "grad_norm": 2.428384304046631, "learning_rate": 4.795001474262481e-05, "loss": 0.9869, "step": 132100 }, { "epoch": 2.0515526311705643, "grad_norm": 2.430588960647583, "learning_rate": 4.794846288738187e-05, "loss": 0.9935, "step": 132200 }, { "epoch": 2.0531044864135075, "grad_norm": 2.5057432651519775, "learning_rate": 4.794691103213893e-05, "loss": 0.9977, "step": 132300 }, { "epoch": 2.0546563416564503, "grad_norm": 2.1144509315490723, "learning_rate": 4.7945359176895985e-05, "loss": 0.9966, "step": 132400 }, { "epoch": 2.056208196899393, "grad_norm": 2.1549110412597656, "learning_rate": 4.7943807321653036e-05, "loss": 1.2924, "step": 132500 }, { "epoch": 2.0577600521423363, "grad_norm": 2.189424514770508, "learning_rate": 4.7942255466410094e-05, "loss": 1.0051, "step": 132600 }, { "epoch": 2.059311907385279, "grad_norm": 2.462399482727051, "learning_rate": 4.794070361116715e-05, "loss": 1.0039, "step": 132700 }, { "epoch": 2.0608637626282222, "grad_norm": 2.4399964809417725, "learning_rate": 4.793915175592421e-05, "loss": 1.0028, "step": 132800 }, { "epoch": 2.062415617871165, "grad_norm": 2.493495464324951, "learning_rate": 4.793759990068127e-05, "loss": 0.9986, "step": 132900 }, { "epoch": 2.0639674731141078, "grad_norm": 2.8317437171936035, "learning_rate": 4.7936048045438325e-05, "loss": 1.0034, "step": 133000 }, { "epoch": 2.065519328357051, "grad_norm": 2.0797858238220215, "learning_rate": 4.793449619019538e-05, "loss": 1.0071, "step": 133100 }, { "epoch": 2.0670711835999938, "grad_norm": 2.738300323486328, "learning_rate": 4.793294433495244e-05, "loss": 0.991, "step": 133200 }, { "epoch": 2.0686230388429365, "grad_norm": 3.162799596786499, "learning_rate": 4.79313924797095e-05, "loss": 1.0107, "step": 133300 }, { "epoch": 2.0701748940858797, "grad_norm": 1.8889572620391846, "learning_rate": 4.7929840624466556e-05, "loss": 1.0168, "step": 133400 }, { "epoch": 2.0717267493288225, "grad_norm": 2.1232211589813232, "learning_rate": 4.7928288769223613e-05, "loss": 1.0239, "step": 133500 }, { "epoch": 2.0732786045717657, "grad_norm": 2.3762943744659424, "learning_rate": 4.792673691398067e-05, "loss": 1.0084, "step": 133600 }, { "epoch": 2.0748304598147085, "grad_norm": 1.8906770944595337, "learning_rate": 4.792518505873773e-05, "loss": 0.9917, "step": 133700 }, { "epoch": 2.0763823150576513, "grad_norm": 1.9917376041412354, "learning_rate": 4.792363320349478e-05, "loss": 1.0298, "step": 133800 }, { "epoch": 2.0779341703005945, "grad_norm": 2.4325904846191406, "learning_rate": 4.792208134825184e-05, "loss": 1.027, "step": 133900 }, { "epoch": 2.0794860255435372, "grad_norm": 3.2038519382476807, "learning_rate": 4.7920529493008895e-05, "loss": 0.9985, "step": 134000 }, { "epoch": 2.0810378807864804, "grad_norm": 2.392306089401245, "learning_rate": 4.7918977637765946e-05, "loss": 1.018, "step": 134100 }, { "epoch": 2.082589736029423, "grad_norm": 2.383042097091675, "learning_rate": 4.7917425782523004e-05, "loss": 1.0217, "step": 134200 }, { "epoch": 2.084141591272366, "grad_norm": 2.5467097759246826, "learning_rate": 4.791587392728006e-05, "loss": 1.0197, "step": 134300 }, { "epoch": 2.085693446515309, "grad_norm": 2.2228469848632812, "learning_rate": 4.791432207203712e-05, "loss": 1.0235, "step": 134400 }, { "epoch": 2.087245301758252, "grad_norm": 2.3047292232513428, "learning_rate": 4.791277021679418e-05, "loss": 0.9863, "step": 134500 }, { "epoch": 2.0887971570011947, "grad_norm": 3.135500907897949, "learning_rate": 4.7911218361551235e-05, "loss": 1.0001, "step": 134600 }, { "epoch": 2.090349012244138, "grad_norm": 2.5520179271698, "learning_rate": 4.790966650630829e-05, "loss": 1.0108, "step": 134700 }, { "epoch": 2.0919008674870807, "grad_norm": 2.6624205112457275, "learning_rate": 4.790811465106535e-05, "loss": 1.027, "step": 134800 }, { "epoch": 2.093452722730024, "grad_norm": 3.056994676589966, "learning_rate": 4.790656279582241e-05, "loss": 0.9993, "step": 134900 }, { "epoch": 2.0950045779729667, "grad_norm": 2.3717095851898193, "learning_rate": 4.7905010940579466e-05, "loss": 1.0044, "step": 135000 }, { "epoch": 2.0965564332159095, "grad_norm": 1.935117483139038, "learning_rate": 4.7903459085336524e-05, "loss": 1.0038, "step": 135100 }, { "epoch": 2.0981082884588527, "grad_norm": 3.0761327743530273, "learning_rate": 4.790190723009358e-05, "loss": 1.0051, "step": 135200 }, { "epoch": 2.0996601437017954, "grad_norm": 2.3484625816345215, "learning_rate": 4.790035537485064e-05, "loss": 1.0158, "step": 135300 }, { "epoch": 2.1012119989447386, "grad_norm": 3.408252000808716, "learning_rate": 4.789880351960769e-05, "loss": 0.9912, "step": 135400 }, { "epoch": 2.1027638541876814, "grad_norm": 2.280179023742676, "learning_rate": 4.789725166436475e-05, "loss": 0.9985, "step": 135500 }, { "epoch": 2.104315709430624, "grad_norm": 3.068838357925415, "learning_rate": 4.7895699809121806e-05, "loss": 0.9859, "step": 135600 }, { "epoch": 2.1058675646735674, "grad_norm": 2.6211862564086914, "learning_rate": 4.7894147953878864e-05, "loss": 0.9963, "step": 135700 }, { "epoch": 2.10741941991651, "grad_norm": 2.5835139751434326, "learning_rate": 4.789259609863592e-05, "loss": 1.0224, "step": 135800 }, { "epoch": 2.108971275159453, "grad_norm": 2.4263370037078857, "learning_rate": 4.789104424339298e-05, "loss": 1.0055, "step": 135900 }, { "epoch": 2.110523130402396, "grad_norm": 2.9172561168670654, "learning_rate": 4.788949238815004e-05, "loss": 1.0259, "step": 136000 }, { "epoch": 2.112074985645339, "grad_norm": 2.827183246612549, "learning_rate": 4.7887940532907095e-05, "loss": 1.0025, "step": 136100 }, { "epoch": 2.113626840888282, "grad_norm": 2.408953905105591, "learning_rate": 4.788638867766415e-05, "loss": 1.016, "step": 136200 }, { "epoch": 2.115178696131225, "grad_norm": 2.2993712425231934, "learning_rate": 4.788483682242121e-05, "loss": 1.0253, "step": 136300 }, { "epoch": 2.1167305513741677, "grad_norm": 2.495478868484497, "learning_rate": 4.788328496717827e-05, "loss": 0.9993, "step": 136400 }, { "epoch": 2.118282406617111, "grad_norm": 2.5333378314971924, "learning_rate": 4.7881733111935326e-05, "loss": 0.975, "step": 136500 }, { "epoch": 2.1198342618600536, "grad_norm": 2.5637290477752686, "learning_rate": 4.7880181256692383e-05, "loss": 0.9868, "step": 136600 }, { "epoch": 2.121386117102997, "grad_norm": 2.183947801589966, "learning_rate": 4.7878629401449434e-05, "loss": 0.9955, "step": 136700 }, { "epoch": 2.1229379723459396, "grad_norm": 2.114889144897461, "learning_rate": 4.787707754620649e-05, "loss": 1.0157, "step": 136800 }, { "epoch": 2.1244898275888824, "grad_norm": 2.116473913192749, "learning_rate": 4.787552569096355e-05, "loss": 0.9915, "step": 136900 }, { "epoch": 2.1260416828318256, "grad_norm": 2.6938939094543457, "learning_rate": 4.78739738357206e-05, "loss": 1.0103, "step": 137000 }, { "epoch": 2.1275935380747684, "grad_norm": 2.548093795776367, "learning_rate": 4.787242198047766e-05, "loss": 1.0201, "step": 137100 }, { "epoch": 2.129145393317711, "grad_norm": 2.3260018825531006, "learning_rate": 4.7870870125234716e-05, "loss": 0.9816, "step": 137200 }, { "epoch": 2.1306972485606543, "grad_norm": 2.73062801361084, "learning_rate": 4.7869318269991774e-05, "loss": 1.0061, "step": 137300 }, { "epoch": 2.132249103803597, "grad_norm": 2.2050113677978516, "learning_rate": 4.786776641474883e-05, "loss": 0.9948, "step": 137400 }, { "epoch": 2.1338009590465403, "grad_norm": 2.4298362731933594, "learning_rate": 4.786621455950589e-05, "loss": 0.9842, "step": 137500 }, { "epoch": 2.135352814289483, "grad_norm": 2.3699915409088135, "learning_rate": 4.786466270426295e-05, "loss": 1.009, "step": 137600 }, { "epoch": 2.136904669532426, "grad_norm": 2.5699164867401123, "learning_rate": 4.7863110849020005e-05, "loss": 0.9813, "step": 137700 }, { "epoch": 2.138456524775369, "grad_norm": 2.3024682998657227, "learning_rate": 4.786155899377706e-05, "loss": 1.0216, "step": 137800 }, { "epoch": 2.140008380018312, "grad_norm": 2.36860728263855, "learning_rate": 4.786000713853412e-05, "loss": 1.0044, "step": 137900 }, { "epoch": 2.141560235261255, "grad_norm": 2.538660764694214, "learning_rate": 4.785845528329118e-05, "loss": 0.999, "step": 138000 }, { "epoch": 2.143112090504198, "grad_norm": 2.2634620666503906, "learning_rate": 4.7856903428048236e-05, "loss": 1.0174, "step": 138100 }, { "epoch": 2.1446639457471406, "grad_norm": 2.0777719020843506, "learning_rate": 4.785535157280529e-05, "loss": 1.0082, "step": 138200 }, { "epoch": 2.146215800990084, "grad_norm": 2.396221876144409, "learning_rate": 4.7853799717562345e-05, "loss": 0.9934, "step": 138300 }, { "epoch": 2.1477676562330266, "grad_norm": 2.5391297340393066, "learning_rate": 4.78522478623194e-05, "loss": 1.0276, "step": 138400 }, { "epoch": 2.1493195114759693, "grad_norm": 2.3594343662261963, "learning_rate": 4.785069600707646e-05, "loss": 1.0027, "step": 138500 }, { "epoch": 2.1508713667189125, "grad_norm": 3.577491521835327, "learning_rate": 4.784914415183352e-05, "loss": 1.0312, "step": 138600 }, { "epoch": 2.1524232219618553, "grad_norm": 2.737467050552368, "learning_rate": 4.7847592296590576e-05, "loss": 1.0108, "step": 138700 }, { "epoch": 2.1539750772047985, "grad_norm": 2.334174633026123, "learning_rate": 4.7846040441347634e-05, "loss": 1.0039, "step": 138800 }, { "epoch": 2.1555269324477413, "grad_norm": 2.3640003204345703, "learning_rate": 4.784448858610469e-05, "loss": 1.0225, "step": 138900 }, { "epoch": 2.157078787690684, "grad_norm": 2.760650396347046, "learning_rate": 4.784293673086175e-05, "loss": 0.9952, "step": 139000 }, { "epoch": 2.1586306429336273, "grad_norm": 2.153960943222046, "learning_rate": 4.784138487561881e-05, "loss": 0.9912, "step": 139100 }, { "epoch": 2.16018249817657, "grad_norm": 2.402894973754883, "learning_rate": 4.7839833020375865e-05, "loss": 0.9976, "step": 139200 }, { "epoch": 2.1617343534195133, "grad_norm": 2.8789963722229004, "learning_rate": 4.783828116513292e-05, "loss": 1.0077, "step": 139300 }, { "epoch": 2.163286208662456, "grad_norm": 2.187281847000122, "learning_rate": 4.783672930988998e-05, "loss": 1.005, "step": 139400 }, { "epoch": 2.164838063905399, "grad_norm": 2.513023614883423, "learning_rate": 4.783517745464703e-05, "loss": 1.0126, "step": 139500 }, { "epoch": 2.166389919148342, "grad_norm": 2.672801971435547, "learning_rate": 4.783362559940409e-05, "loss": 0.9905, "step": 139600 }, { "epoch": 2.1679417743912848, "grad_norm": 2.132051467895508, "learning_rate": 4.783207374416115e-05, "loss": 0.9935, "step": 139700 }, { "epoch": 2.1694936296342275, "grad_norm": 2.3468332290649414, "learning_rate": 4.7830521888918204e-05, "loss": 1.0146, "step": 139800 }, { "epoch": 2.1710454848771708, "grad_norm": 2.4264650344848633, "learning_rate": 4.782897003367526e-05, "loss": 1.0237, "step": 139900 }, { "epoch": 2.1725973401201135, "grad_norm": 2.3317036628723145, "learning_rate": 4.782741817843232e-05, "loss": 0.9937, "step": 140000 }, { "epoch": 2.1741491953630567, "grad_norm": 2.5268824100494385, "learning_rate": 4.782586632318938e-05, "loss": 0.9864, "step": 140100 }, { "epoch": 2.1757010506059995, "grad_norm": 2.32130765914917, "learning_rate": 4.7824314467946435e-05, "loss": 0.9962, "step": 140200 }, { "epoch": 2.1772529058489423, "grad_norm": 2.561051607131958, "learning_rate": 4.7822762612703486e-05, "loss": 1.0015, "step": 140300 }, { "epoch": 2.1788047610918855, "grad_norm": 2.6600279808044434, "learning_rate": 4.7821210757460544e-05, "loss": 1.0166, "step": 140400 }, { "epoch": 2.1803566163348282, "grad_norm": 2.0839040279388428, "learning_rate": 4.78196589022176e-05, "loss": 1.0083, "step": 140500 }, { "epoch": 2.1819084715777715, "grad_norm": 2.2701878547668457, "learning_rate": 4.781810704697466e-05, "loss": 0.9703, "step": 140600 }, { "epoch": 2.1834603268207142, "grad_norm": 2.466879367828369, "learning_rate": 4.781655519173172e-05, "loss": 0.9741, "step": 140700 }, { "epoch": 2.185012182063657, "grad_norm": 2.248739004135132, "learning_rate": 4.7815003336488775e-05, "loss": 1.01, "step": 140800 }, { "epoch": 2.1865640373066, "grad_norm": 2.257504463195801, "learning_rate": 4.781345148124583e-05, "loss": 0.976, "step": 140900 }, { "epoch": 2.188115892549543, "grad_norm": 2.2243640422821045, "learning_rate": 4.781189962600289e-05, "loss": 0.99, "step": 141000 }, { "epoch": 2.1896677477924857, "grad_norm": 2.4987125396728516, "learning_rate": 4.781034777075994e-05, "loss": 0.9894, "step": 141100 }, { "epoch": 2.191219603035429, "grad_norm": 2.1721813678741455, "learning_rate": 4.7808795915517e-05, "loss": 0.9922, "step": 141200 }, { "epoch": 2.1927714582783717, "grad_norm": 2.120647430419922, "learning_rate": 4.780724406027406e-05, "loss": 0.9862, "step": 141300 }, { "epoch": 2.194323313521315, "grad_norm": 2.350365161895752, "learning_rate": 4.7805692205031115e-05, "loss": 1.0212, "step": 141400 }, { "epoch": 2.1958751687642577, "grad_norm": 2.500053882598877, "learning_rate": 4.780414034978817e-05, "loss": 1.0146, "step": 141500 }, { "epoch": 2.1974270240072005, "grad_norm": 2.2952253818511963, "learning_rate": 4.780258849454523e-05, "loss": 0.9946, "step": 141600 }, { "epoch": 2.1989788792501437, "grad_norm": 2.337035655975342, "learning_rate": 4.780103663930229e-05, "loss": 0.9915, "step": 141700 }, { "epoch": 2.2005307344930864, "grad_norm": 2.662219285964966, "learning_rate": 4.7799484784059346e-05, "loss": 1.0034, "step": 141800 }, { "epoch": 2.202082589736029, "grad_norm": 2.9376132488250732, "learning_rate": 4.7797932928816404e-05, "loss": 1.0031, "step": 141900 }, { "epoch": 2.2036344449789724, "grad_norm": 2.247138261795044, "learning_rate": 4.779638107357346e-05, "loss": 1.006, "step": 142000 }, { "epoch": 2.205186300221915, "grad_norm": 2.1985185146331787, "learning_rate": 4.779482921833052e-05, "loss": 1.0144, "step": 142100 }, { "epoch": 2.2067381554648584, "grad_norm": 2.32684326171875, "learning_rate": 4.779327736308758e-05, "loss": 1.0257, "step": 142200 }, { "epoch": 2.208290010707801, "grad_norm": 2.274444341659546, "learning_rate": 4.7791725507844635e-05, "loss": 1.0087, "step": 142300 }, { "epoch": 2.209841865950744, "grad_norm": 2.280369520187378, "learning_rate": 4.7790173652601686e-05, "loss": 1.0134, "step": 142400 }, { "epoch": 2.211393721193687, "grad_norm": 2.8536410331726074, "learning_rate": 4.778862179735874e-05, "loss": 0.9982, "step": 142500 }, { "epoch": 2.21294557643663, "grad_norm": 2.6610255241394043, "learning_rate": 4.77870699421158e-05, "loss": 1.0194, "step": 142600 }, { "epoch": 2.214497431679573, "grad_norm": 2.5938422679901123, "learning_rate": 4.778551808687286e-05, "loss": 0.9994, "step": 142700 }, { "epoch": 2.216049286922516, "grad_norm": 2.414273262023926, "learning_rate": 4.778396623162992e-05, "loss": 1.0026, "step": 142800 }, { "epoch": 2.2176011421654587, "grad_norm": 2.5789294242858887, "learning_rate": 4.7782414376386974e-05, "loss": 1.0045, "step": 142900 }, { "epoch": 2.219152997408402, "grad_norm": 2.2323696613311768, "learning_rate": 4.778086252114403e-05, "loss": 0.9919, "step": 143000 }, { "epoch": 2.2207048526513447, "grad_norm": 2.253960371017456, "learning_rate": 4.777931066590109e-05, "loss": 0.9897, "step": 143100 }, { "epoch": 2.2222567078942874, "grad_norm": 2.6455960273742676, "learning_rate": 4.777775881065815e-05, "loss": 0.9832, "step": 143200 }, { "epoch": 2.2238085631372306, "grad_norm": 2.2151122093200684, "learning_rate": 4.7776206955415205e-05, "loss": 1.018, "step": 143300 }, { "epoch": 2.2253604183801734, "grad_norm": 2.5561559200286865, "learning_rate": 4.777465510017226e-05, "loss": 1.0025, "step": 143400 }, { "epoch": 2.2269122736231166, "grad_norm": 2.523472309112549, "learning_rate": 4.7773103244929314e-05, "loss": 1.0057, "step": 143500 }, { "epoch": 2.2284641288660594, "grad_norm": 2.1763851642608643, "learning_rate": 4.777155138968637e-05, "loss": 0.9979, "step": 143600 }, { "epoch": 2.230015984109002, "grad_norm": 3.3345046043395996, "learning_rate": 4.776999953444343e-05, "loss": 1.0041, "step": 143700 }, { "epoch": 2.2315678393519454, "grad_norm": 2.192248821258545, "learning_rate": 4.776844767920049e-05, "loss": 0.9775, "step": 143800 }, { "epoch": 2.233119694594888, "grad_norm": 2.3951237201690674, "learning_rate": 4.776689582395754e-05, "loss": 1.0076, "step": 143900 }, { "epoch": 2.234671549837831, "grad_norm": 2.1646876335144043, "learning_rate": 4.7765343968714596e-05, "loss": 0.9938, "step": 144000 }, { "epoch": 2.236223405080774, "grad_norm": 2.2766425609588623, "learning_rate": 4.7763792113471654e-05, "loss": 1.0017, "step": 144100 }, { "epoch": 2.237775260323717, "grad_norm": 2.6504530906677246, "learning_rate": 4.776224025822871e-05, "loss": 1.0022, "step": 144200 }, { "epoch": 2.23932711556666, "grad_norm": 3.0217010974884033, "learning_rate": 4.776068840298577e-05, "loss": 1.0148, "step": 144300 }, { "epoch": 2.240878970809603, "grad_norm": 2.8178629875183105, "learning_rate": 4.775913654774283e-05, "loss": 1.02, "step": 144400 }, { "epoch": 2.2424308260525456, "grad_norm": 2.6072585582733154, "learning_rate": 4.7757584692499885e-05, "loss": 1.0177, "step": 144500 }, { "epoch": 2.243982681295489, "grad_norm": 2.3502373695373535, "learning_rate": 4.775603283725694e-05, "loss": 1.0026, "step": 144600 }, { "epoch": 2.2455345365384316, "grad_norm": 2.450171947479248, "learning_rate": 4.7754480982014e-05, "loss": 0.9975, "step": 144700 }, { "epoch": 2.247086391781375, "grad_norm": 2.5710248947143555, "learning_rate": 4.775292912677106e-05, "loss": 1.0117, "step": 144800 }, { "epoch": 2.2486382470243176, "grad_norm": 2.116594076156616, "learning_rate": 4.7751377271528116e-05, "loss": 1.0202, "step": 144900 }, { "epoch": 2.2501901022672604, "grad_norm": 2.718474864959717, "learning_rate": 4.7749825416285174e-05, "loss": 1.0141, "step": 145000 }, { "epoch": 2.2517419575102036, "grad_norm": 2.305295944213867, "learning_rate": 4.774827356104223e-05, "loss": 1.0296, "step": 145100 }, { "epoch": 2.2532938127531463, "grad_norm": 6.266622543334961, "learning_rate": 4.774672170579928e-05, "loss": 0.9919, "step": 145200 }, { "epoch": 2.254845667996089, "grad_norm": 1.9756531715393066, "learning_rate": 4.774516985055634e-05, "loss": 0.976, "step": 145300 }, { "epoch": 2.2563975232390323, "grad_norm": 2.556570291519165, "learning_rate": 4.77436179953134e-05, "loss": 1.0203, "step": 145400 }, { "epoch": 2.257949378481975, "grad_norm": 2.4479446411132812, "learning_rate": 4.7742066140070456e-05, "loss": 1.014, "step": 145500 }, { "epoch": 2.2595012337249183, "grad_norm": 2.5070762634277344, "learning_rate": 4.774051428482751e-05, "loss": 0.9972, "step": 145600 }, { "epoch": 2.261053088967861, "grad_norm": 2.503878116607666, "learning_rate": 4.773896242958457e-05, "loss": 1.0158, "step": 145700 }, { "epoch": 2.262604944210804, "grad_norm": 2.372098207473755, "learning_rate": 4.773741057434163e-05, "loss": 1.0068, "step": 145800 }, { "epoch": 2.264156799453747, "grad_norm": 2.3035755157470703, "learning_rate": 4.773585871909869e-05, "loss": 0.994, "step": 145900 }, { "epoch": 2.26570865469669, "grad_norm": 2.3845462799072266, "learning_rate": 4.7734306863855744e-05, "loss": 0.9898, "step": 146000 }, { "epoch": 2.267260509939633, "grad_norm": 2.3504083156585693, "learning_rate": 4.77327550086128e-05, "loss": 0.9857, "step": 146100 }, { "epoch": 2.268812365182576, "grad_norm": 2.4137001037597656, "learning_rate": 4.773120315336986e-05, "loss": 1.0012, "step": 146200 }, { "epoch": 2.2703642204255186, "grad_norm": 2.4122140407562256, "learning_rate": 4.772965129812692e-05, "loss": 0.9964, "step": 146300 }, { "epoch": 2.2719160756684618, "grad_norm": 2.2957944869995117, "learning_rate": 4.7728099442883975e-05, "loss": 1.0131, "step": 146400 }, { "epoch": 2.2734679309114045, "grad_norm": 2.146494150161743, "learning_rate": 4.7726547587641026e-05, "loss": 1.0023, "step": 146500 }, { "epoch": 2.2750197861543473, "grad_norm": 2.592254400253296, "learning_rate": 4.7724995732398084e-05, "loss": 0.995, "step": 146600 }, { "epoch": 2.2765716413972905, "grad_norm": 2.6088461875915527, "learning_rate": 4.772344387715514e-05, "loss": 0.9936, "step": 146700 }, { "epoch": 2.2781234966402333, "grad_norm": 2.528661012649536, "learning_rate": 4.772189202191219e-05, "loss": 1.0011, "step": 146800 }, { "epoch": 2.2796753518831765, "grad_norm": 3.4407155513763428, "learning_rate": 4.772034016666925e-05, "loss": 1.0015, "step": 146900 }, { "epoch": 2.2812272071261193, "grad_norm": 3.058441638946533, "learning_rate": 4.771878831142631e-05, "loss": 1.0003, "step": 147000 }, { "epoch": 2.282779062369062, "grad_norm": 2.543264627456665, "learning_rate": 4.7717236456183366e-05, "loss": 1.0049, "step": 147100 }, { "epoch": 2.2843309176120052, "grad_norm": 3.144533634185791, "learning_rate": 4.7715684600940424e-05, "loss": 1.0333, "step": 147200 }, { "epoch": 2.285882772854948, "grad_norm": 2.186021327972412, "learning_rate": 4.771413274569748e-05, "loss": 1.0034, "step": 147300 }, { "epoch": 2.287434628097891, "grad_norm": 2.1624252796173096, "learning_rate": 4.771258089045454e-05, "loss": 1.015, "step": 147400 }, { "epoch": 2.288986483340834, "grad_norm": 1.8287031650543213, "learning_rate": 4.77110290352116e-05, "loss": 1.0136, "step": 147500 }, { "epoch": 2.2905383385837768, "grad_norm": 3.1081199645996094, "learning_rate": 4.7709477179968655e-05, "loss": 1.0185, "step": 147600 }, { "epoch": 2.29209019382672, "grad_norm": 1.7574325799942017, "learning_rate": 4.770792532472571e-05, "loss": 0.9902, "step": 147700 }, { "epoch": 2.2936420490696627, "grad_norm": 2.86403226852417, "learning_rate": 4.770637346948277e-05, "loss": 0.9831, "step": 147800 }, { "epoch": 2.2951939043126055, "grad_norm": 2.686337947845459, "learning_rate": 4.770482161423983e-05, "loss": 1.0066, "step": 147900 }, { "epoch": 2.2967457595555487, "grad_norm": 2.4498326778411865, "learning_rate": 4.770326975899688e-05, "loss": 0.9911, "step": 148000 }, { "epoch": 2.2982976147984915, "grad_norm": 2.215968370437622, "learning_rate": 4.770171790375394e-05, "loss": 1.002, "step": 148100 }, { "epoch": 2.2998494700414347, "grad_norm": 2.0193228721618652, "learning_rate": 4.7700166048510995e-05, "loss": 1.0031, "step": 148200 }, { "epoch": 2.3014013252843775, "grad_norm": 2.5438108444213867, "learning_rate": 4.769861419326805e-05, "loss": 0.9845, "step": 148300 }, { "epoch": 2.3029531805273202, "grad_norm": 3.3894643783569336, "learning_rate": 4.769706233802511e-05, "loss": 0.9935, "step": 148400 }, { "epoch": 2.3045050357702634, "grad_norm": 2.523437023162842, "learning_rate": 4.769551048278217e-05, "loss": 0.9943, "step": 148500 }, { "epoch": 2.306056891013206, "grad_norm": 2.58048415184021, "learning_rate": 4.7693958627539226e-05, "loss": 0.9985, "step": 148600 }, { "epoch": 2.3076087462561494, "grad_norm": 2.502012252807617, "learning_rate": 4.769240677229628e-05, "loss": 1.0239, "step": 148700 }, { "epoch": 2.309160601499092, "grad_norm": 2.3334977626800537, "learning_rate": 4.769085491705334e-05, "loss": 0.9969, "step": 148800 }, { "epoch": 2.310712456742035, "grad_norm": 2.2678017616271973, "learning_rate": 4.76893030618104e-05, "loss": 0.9937, "step": 148900 }, { "epoch": 2.312264311984978, "grad_norm": 2.857862710952759, "learning_rate": 4.768775120656746e-05, "loss": 0.983, "step": 149000 }, { "epoch": 2.313816167227921, "grad_norm": 2.147540330886841, "learning_rate": 4.7686199351324514e-05, "loss": 1.0089, "step": 149100 }, { "epoch": 2.3153680224708637, "grad_norm": 2.5315401554107666, "learning_rate": 4.768464749608157e-05, "loss": 1.0108, "step": 149200 }, { "epoch": 2.316919877713807, "grad_norm": 2.637763738632202, "learning_rate": 4.768309564083862e-05, "loss": 1.004, "step": 149300 }, { "epoch": 2.3184717329567497, "grad_norm": 2.235074281692505, "learning_rate": 4.768154378559568e-05, "loss": 0.9954, "step": 149400 }, { "epoch": 2.320023588199693, "grad_norm": 2.1634392738342285, "learning_rate": 4.767999193035274e-05, "loss": 0.9768, "step": 149500 }, { "epoch": 2.3215754434426357, "grad_norm": 2.311239719390869, "learning_rate": 4.7678440075109796e-05, "loss": 0.9965, "step": 149600 }, { "epoch": 2.3231272986855784, "grad_norm": 2.194880485534668, "learning_rate": 4.7676888219866854e-05, "loss": 0.9875, "step": 149700 }, { "epoch": 2.3246791539285216, "grad_norm": 3.011655807495117, "learning_rate": 4.767533636462391e-05, "loss": 1.0017, "step": 149800 }, { "epoch": 2.3262310091714644, "grad_norm": 2.231142044067383, "learning_rate": 4.767378450938097e-05, "loss": 0.9944, "step": 149900 }, { "epoch": 2.3277828644144076, "grad_norm": 2.445693254470825, "learning_rate": 4.767223265413802e-05, "loss": 1.0015, "step": 150000 }, { "epoch": 2.3293347196573504, "grad_norm": 2.415174961090088, "learning_rate": 4.767068079889508e-05, "loss": 0.9943, "step": 150100 }, { "epoch": 2.330886574900293, "grad_norm": 2.783594846725464, "learning_rate": 4.7669128943652136e-05, "loss": 0.9973, "step": 150200 }, { "epoch": 2.3324384301432364, "grad_norm": 2.58579683303833, "learning_rate": 4.7667577088409194e-05, "loss": 0.9835, "step": 150300 }, { "epoch": 2.333990285386179, "grad_norm": 2.750359058380127, "learning_rate": 4.766602523316625e-05, "loss": 1.0012, "step": 150400 }, { "epoch": 2.335542140629122, "grad_norm": 2.8054516315460205, "learning_rate": 4.766447337792331e-05, "loss": 1.0235, "step": 150500 }, { "epoch": 2.337093995872065, "grad_norm": 2.2936134338378906, "learning_rate": 4.766292152268037e-05, "loss": 0.9913, "step": 150600 }, { "epoch": 2.338645851115008, "grad_norm": 2.3516228199005127, "learning_rate": 4.7661369667437425e-05, "loss": 0.9905, "step": 150700 }, { "epoch": 2.340197706357951, "grad_norm": 2.1796133518218994, "learning_rate": 4.765981781219448e-05, "loss": 1.0127, "step": 150800 }, { "epoch": 2.341749561600894, "grad_norm": 2.4065890312194824, "learning_rate": 4.7658265956951534e-05, "loss": 1.0177, "step": 150900 }, { "epoch": 2.3433014168438366, "grad_norm": 2.4029288291931152, "learning_rate": 4.765671410170859e-05, "loss": 1.0065, "step": 151000 }, { "epoch": 2.34485327208678, "grad_norm": 2.463568687438965, "learning_rate": 4.765516224646565e-05, "loss": 0.9965, "step": 151100 }, { "epoch": 2.3464051273297226, "grad_norm": 2.595303535461426, "learning_rate": 4.765361039122271e-05, "loss": 1.0088, "step": 151200 }, { "epoch": 2.347956982572666, "grad_norm": 2.0020644664764404, "learning_rate": 4.7652058535979765e-05, "loss": 0.9924, "step": 151300 }, { "epoch": 2.3495088378156086, "grad_norm": 2.6140546798706055, "learning_rate": 4.765050668073682e-05, "loss": 0.9945, "step": 151400 }, { "epoch": 2.3510606930585514, "grad_norm": 2.068150281906128, "learning_rate": 4.764895482549388e-05, "loss": 0.9755, "step": 151500 }, { "epoch": 2.3526125483014946, "grad_norm": 2.3829824924468994, "learning_rate": 4.764740297025094e-05, "loss": 0.9909, "step": 151600 }, { "epoch": 2.3541644035444373, "grad_norm": 2.7408833503723145, "learning_rate": 4.7645851115007996e-05, "loss": 0.993, "step": 151700 }, { "epoch": 2.35571625878738, "grad_norm": 1.8140374422073364, "learning_rate": 4.764429925976505e-05, "loss": 0.9897, "step": 151800 }, { "epoch": 2.3572681140303233, "grad_norm": 2.085921049118042, "learning_rate": 4.764274740452211e-05, "loss": 0.9969, "step": 151900 }, { "epoch": 2.358819969273266, "grad_norm": 2.2533583641052246, "learning_rate": 4.764119554927917e-05, "loss": 1.0018, "step": 152000 }, { "epoch": 2.3603718245162093, "grad_norm": 1.941748857498169, "learning_rate": 4.763964369403623e-05, "loss": 0.996, "step": 152100 }, { "epoch": 2.361923679759152, "grad_norm": 2.417353868484497, "learning_rate": 4.763809183879328e-05, "loss": 0.984, "step": 152200 }, { "epoch": 2.363475535002095, "grad_norm": 2.459218740463257, "learning_rate": 4.7636539983550335e-05, "loss": 0.9842, "step": 152300 }, { "epoch": 2.365027390245038, "grad_norm": 2.3075473308563232, "learning_rate": 4.763498812830739e-05, "loss": 1.002, "step": 152400 }, { "epoch": 2.366579245487981, "grad_norm": 2.5026724338531494, "learning_rate": 4.763343627306445e-05, "loss": 1.0043, "step": 152500 }, { "epoch": 2.368131100730924, "grad_norm": 2.4595234394073486, "learning_rate": 4.763188441782151e-05, "loss": 0.9962, "step": 152600 }, { "epoch": 2.369682955973867, "grad_norm": 2.402949810028076, "learning_rate": 4.7630332562578566e-05, "loss": 0.9777, "step": 152700 }, { "epoch": 2.3712348112168096, "grad_norm": 2.4516587257385254, "learning_rate": 4.7628780707335624e-05, "loss": 0.9911, "step": 152800 }, { "epoch": 2.372786666459753, "grad_norm": 2.3362479209899902, "learning_rate": 4.762722885209268e-05, "loss": 0.9932, "step": 152900 }, { "epoch": 2.3743385217026955, "grad_norm": 2.202928304672241, "learning_rate": 4.762567699684974e-05, "loss": 0.983, "step": 153000 }, { "epoch": 2.3758903769456383, "grad_norm": 2.3659350872039795, "learning_rate": 4.76241251416068e-05, "loss": 1.0099, "step": 153100 }, { "epoch": 2.3774422321885815, "grad_norm": 2.3668930530548096, "learning_rate": 4.7622573286363855e-05, "loss": 1.0316, "step": 153200 }, { "epoch": 2.3789940874315243, "grad_norm": 2.766289710998535, "learning_rate": 4.7621021431120906e-05, "loss": 0.9851, "step": 153300 }, { "epoch": 2.3805459426744675, "grad_norm": 2.908571720123291, "learning_rate": 4.7619469575877964e-05, "loss": 1.0015, "step": 153400 }, { "epoch": 2.3820977979174103, "grad_norm": 2.3693299293518066, "learning_rate": 4.761791772063502e-05, "loss": 0.9964, "step": 153500 }, { "epoch": 2.383649653160353, "grad_norm": 2.210409164428711, "learning_rate": 4.761636586539208e-05, "loss": 1.0186, "step": 153600 }, { "epoch": 2.3852015084032963, "grad_norm": 2.742485523223877, "learning_rate": 4.761481401014913e-05, "loss": 0.9826, "step": 153700 }, { "epoch": 2.386753363646239, "grad_norm": 2.046387195587158, "learning_rate": 4.761326215490619e-05, "loss": 1.0159, "step": 153800 }, { "epoch": 2.3883052188891822, "grad_norm": 2.3415839672088623, "learning_rate": 4.7611710299663246e-05, "loss": 0.9965, "step": 153900 }, { "epoch": 2.389857074132125, "grad_norm": 2.359767198562622, "learning_rate": 4.7610158444420304e-05, "loss": 1.0032, "step": 154000 }, { "epoch": 2.3914089293750678, "grad_norm": 2.0531094074249268, "learning_rate": 4.760860658917736e-05, "loss": 1.0007, "step": 154100 }, { "epoch": 2.392960784618011, "grad_norm": 2.425177812576294, "learning_rate": 4.760705473393442e-05, "loss": 0.975, "step": 154200 }, { "epoch": 2.3945126398609538, "grad_norm": 2.610118865966797, "learning_rate": 4.760550287869148e-05, "loss": 0.9824, "step": 154300 }, { "epoch": 2.3960644951038965, "grad_norm": 2.551523208618164, "learning_rate": 4.7603951023448535e-05, "loss": 1.0032, "step": 154400 }, { "epoch": 2.3976163503468397, "grad_norm": 2.420907974243164, "learning_rate": 4.760239916820559e-05, "loss": 1.008, "step": 154500 }, { "epoch": 2.3991682055897825, "grad_norm": 2.630902051925659, "learning_rate": 4.760084731296265e-05, "loss": 0.987, "step": 154600 }, { "epoch": 2.4007200608327257, "grad_norm": 2.0467114448547363, "learning_rate": 4.759929545771971e-05, "loss": 1.0077, "step": 154700 }, { "epoch": 2.4022719160756685, "grad_norm": 2.7211358547210693, "learning_rate": 4.7597743602476766e-05, "loss": 1.0007, "step": 154800 }, { "epoch": 2.4038237713186112, "grad_norm": 2.443418502807617, "learning_rate": 4.759619174723382e-05, "loss": 0.9931, "step": 154900 }, { "epoch": 2.4053756265615545, "grad_norm": 2.4109535217285156, "learning_rate": 4.7594639891990874e-05, "loss": 1.0209, "step": 155000 }, { "epoch": 2.4069274818044972, "grad_norm": 2.2265148162841797, "learning_rate": 4.759308803674793e-05, "loss": 0.997, "step": 155100 }, { "epoch": 2.4084793370474404, "grad_norm": 2.185681104660034, "learning_rate": 4.759153618150499e-05, "loss": 0.9923, "step": 155200 }, { "epoch": 2.410031192290383, "grad_norm": 2.834728240966797, "learning_rate": 4.758998432626205e-05, "loss": 0.9939, "step": 155300 }, { "epoch": 2.411583047533326, "grad_norm": 2.704516649246216, "learning_rate": 4.7588432471019105e-05, "loss": 1.0206, "step": 155400 }, { "epoch": 2.413134902776269, "grad_norm": 2.5190107822418213, "learning_rate": 4.758688061577616e-05, "loss": 1.0032, "step": 155500 }, { "epoch": 2.414686758019212, "grad_norm": 2.7848029136657715, "learning_rate": 4.758532876053322e-05, "loss": 0.9977, "step": 155600 }, { "epoch": 2.4162386132621547, "grad_norm": 2.3758962154388428, "learning_rate": 4.758377690529028e-05, "loss": 0.9925, "step": 155700 }, { "epoch": 2.417790468505098, "grad_norm": 2.2400784492492676, "learning_rate": 4.7582225050047336e-05, "loss": 0.9817, "step": 155800 }, { "epoch": 2.4193423237480407, "grad_norm": 2.186558961868286, "learning_rate": 4.7580673194804394e-05, "loss": 1.0113, "step": 155900 }, { "epoch": 2.420894178990984, "grad_norm": 2.487929582595825, "learning_rate": 4.757912133956145e-05, "loss": 0.9933, "step": 156000 }, { "epoch": 2.4224460342339267, "grad_norm": 2.0820751190185547, "learning_rate": 4.757756948431851e-05, "loss": 0.9868, "step": 156100 }, { "epoch": 2.4239978894768694, "grad_norm": 2.587728261947632, "learning_rate": 4.757601762907557e-05, "loss": 1.0166, "step": 156200 }, { "epoch": 2.4255497447198127, "grad_norm": 2.767045497894287, "learning_rate": 4.757446577383262e-05, "loss": 0.9798, "step": 156300 }, { "epoch": 2.4271015999627554, "grad_norm": 2.1324193477630615, "learning_rate": 4.7572913918589676e-05, "loss": 0.9914, "step": 156400 }, { "epoch": 2.4286534552056986, "grad_norm": 2.427401065826416, "learning_rate": 4.7571362063346734e-05, "loss": 1.0002, "step": 156500 }, { "epoch": 2.4302053104486414, "grad_norm": 2.809774160385132, "learning_rate": 4.7569810208103785e-05, "loss": 0.9992, "step": 156600 }, { "epoch": 2.431757165691584, "grad_norm": 2.6254732608795166, "learning_rate": 4.756825835286084e-05, "loss": 0.9962, "step": 156700 }, { "epoch": 2.4333090209345274, "grad_norm": 2.359229803085327, "learning_rate": 4.75667064976179e-05, "loss": 1.0253, "step": 156800 }, { "epoch": 2.43486087617747, "grad_norm": 2.6869516372680664, "learning_rate": 4.756515464237496e-05, "loss": 0.9944, "step": 156900 }, { "epoch": 2.436412731420413, "grad_norm": 2.500349283218384, "learning_rate": 4.7563602787132016e-05, "loss": 0.9798, "step": 157000 }, { "epoch": 2.437964586663356, "grad_norm": 2.036566734313965, "learning_rate": 4.7562050931889074e-05, "loss": 0.9836, "step": 157100 }, { "epoch": 2.439516441906299, "grad_norm": 2.5115389823913574, "learning_rate": 4.756049907664613e-05, "loss": 1.0161, "step": 157200 }, { "epoch": 2.4410682971492417, "grad_norm": 2.8797643184661865, "learning_rate": 4.755894722140319e-05, "loss": 0.9902, "step": 157300 }, { "epoch": 2.442620152392185, "grad_norm": 2.6357691287994385, "learning_rate": 4.755739536616025e-05, "loss": 0.9919, "step": 157400 }, { "epoch": 2.4441720076351277, "grad_norm": 2.369741916656494, "learning_rate": 4.7555843510917305e-05, "loss": 1.023, "step": 157500 }, { "epoch": 2.445723862878071, "grad_norm": 2.9061992168426514, "learning_rate": 4.755429165567436e-05, "loss": 0.9938, "step": 157600 }, { "epoch": 2.4472757181210136, "grad_norm": 2.2318854331970215, "learning_rate": 4.755273980043142e-05, "loss": 1.0064, "step": 157700 }, { "epoch": 2.448827573363957, "grad_norm": 2.3017547130584717, "learning_rate": 4.755118794518848e-05, "loss": 0.9919, "step": 157800 }, { "epoch": 2.4503794286068996, "grad_norm": 2.810620069503784, "learning_rate": 4.754963608994553e-05, "loss": 1.0075, "step": 157900 }, { "epoch": 2.4519312838498424, "grad_norm": 2.3018581867218018, "learning_rate": 4.7548084234702587e-05, "loss": 0.9896, "step": 158000 }, { "epoch": 2.4534831390927856, "grad_norm": 2.1516337394714355, "learning_rate": 4.7546532379459644e-05, "loss": 1.0106, "step": 158100 }, { "epoch": 2.4550349943357284, "grad_norm": 2.2663421630859375, "learning_rate": 4.75449805242167e-05, "loss": 1.0095, "step": 158200 }, { "epoch": 2.456586849578671, "grad_norm": 2.744828224182129, "learning_rate": 4.754342866897376e-05, "loss": 1.0062, "step": 158300 }, { "epoch": 2.4581387048216143, "grad_norm": 2.409578800201416, "learning_rate": 4.754187681373082e-05, "loss": 0.9875, "step": 158400 }, { "epoch": 2.459690560064557, "grad_norm": 1.8594049215316772, "learning_rate": 4.7540324958487875e-05, "loss": 0.9906, "step": 158500 }, { "epoch": 2.4612424153075, "grad_norm": 2.703629970550537, "learning_rate": 4.753877310324493e-05, "loss": 0.9883, "step": 158600 }, { "epoch": 2.462794270550443, "grad_norm": 2.3703091144561768, "learning_rate": 4.753722124800199e-05, "loss": 1.0038, "step": 158700 }, { "epoch": 2.464346125793386, "grad_norm": 2.9878735542297363, "learning_rate": 4.753566939275905e-05, "loss": 1.0237, "step": 158800 }, { "epoch": 2.465897981036329, "grad_norm": 2.070500612258911, "learning_rate": 4.7534117537516106e-05, "loss": 0.9887, "step": 158900 }, { "epoch": 2.467449836279272, "grad_norm": 2.292306900024414, "learning_rate": 4.7532565682273164e-05, "loss": 0.983, "step": 159000 }, { "epoch": 2.469001691522215, "grad_norm": 2.33115816116333, "learning_rate": 4.753101382703022e-05, "loss": 1.0165, "step": 159100 }, { "epoch": 2.470553546765158, "grad_norm": 2.223109245300293, "learning_rate": 4.752946197178727e-05, "loss": 0.9824, "step": 159200 }, { "epoch": 2.4721054020081006, "grad_norm": 2.5745558738708496, "learning_rate": 4.752791011654433e-05, "loss": 0.9955, "step": 159300 }, { "epoch": 2.473657257251044, "grad_norm": 2.2125587463378906, "learning_rate": 4.752635826130139e-05, "loss": 0.9783, "step": 159400 }, { "epoch": 2.4752091124939866, "grad_norm": 2.246924638748169, "learning_rate": 4.7524806406058446e-05, "loss": 0.9819, "step": 159500 }, { "epoch": 2.4767609677369293, "grad_norm": 2.1735005378723145, "learning_rate": 4.7523254550815504e-05, "loss": 0.9891, "step": 159600 }, { "epoch": 2.4783128229798725, "grad_norm": 2.4652254581451416, "learning_rate": 4.752170269557256e-05, "loss": 0.9813, "step": 159700 }, { "epoch": 2.4798646782228153, "grad_norm": 2.622347116470337, "learning_rate": 4.752015084032961e-05, "loss": 1.0217, "step": 159800 }, { "epoch": 2.481416533465758, "grad_norm": 2.185213804244995, "learning_rate": 4.751859898508667e-05, "loss": 0.9916, "step": 159900 }, { "epoch": 2.4829683887087013, "grad_norm": 2.0847904682159424, "learning_rate": 4.751704712984373e-05, "loss": 0.9904, "step": 160000 }, { "epoch": 2.484520243951644, "grad_norm": 2.4268147945404053, "learning_rate": 4.7515495274600786e-05, "loss": 1.0084, "step": 160100 }, { "epoch": 2.4860720991945873, "grad_norm": 2.824099540710449, "learning_rate": 4.7513943419357844e-05, "loss": 0.9893, "step": 160200 }, { "epoch": 2.48762395443753, "grad_norm": 2.1871109008789062, "learning_rate": 4.75123915641149e-05, "loss": 1.0058, "step": 160300 }, { "epoch": 2.4891758096804733, "grad_norm": 2.4703660011291504, "learning_rate": 4.751083970887196e-05, "loss": 1.0073, "step": 160400 }, { "epoch": 2.490727664923416, "grad_norm": 2.665696859359741, "learning_rate": 4.750928785362902e-05, "loss": 1.0053, "step": 160500 }, { "epoch": 2.492279520166359, "grad_norm": 2.5043516159057617, "learning_rate": 4.7507735998386075e-05, "loss": 0.9805, "step": 160600 }, { "epoch": 2.493831375409302, "grad_norm": 2.1357104778289795, "learning_rate": 4.7506184143143126e-05, "loss": 1.0068, "step": 160700 }, { "epoch": 2.4953832306522448, "grad_norm": 2.7104854583740234, "learning_rate": 4.750463228790018e-05, "loss": 0.9991, "step": 160800 }, { "epoch": 2.4969350858951875, "grad_norm": 2.0449440479278564, "learning_rate": 4.750308043265724e-05, "loss": 0.9918, "step": 160900 }, { "epoch": 2.4984869411381307, "grad_norm": 2.989248275756836, "learning_rate": 4.75015285774143e-05, "loss": 0.9984, "step": 161000 }, { "epoch": 2.5000387963810735, "grad_norm": 2.3520569801330566, "learning_rate": 4.7499976722171357e-05, "loss": 0.999, "step": 161100 }, { "epoch": 2.5015906516240163, "grad_norm": 2.656609296798706, "learning_rate": 4.7498424866928414e-05, "loss": 1.0286, "step": 161200 }, { "epoch": 2.5031425068669595, "grad_norm": 2.2921128273010254, "learning_rate": 4.749687301168547e-05, "loss": 0.9949, "step": 161300 }, { "epoch": 2.5046943621099023, "grad_norm": 2.5775489807128906, "learning_rate": 4.749532115644253e-05, "loss": 1.0086, "step": 161400 }, { "epoch": 2.5062462173528455, "grad_norm": 2.184391975402832, "learning_rate": 4.749376930119959e-05, "loss": 0.9894, "step": 161500 }, { "epoch": 2.5077980725957882, "grad_norm": 2.3546206951141357, "learning_rate": 4.7492217445956645e-05, "loss": 0.9868, "step": 161600 }, { "epoch": 2.5093499278387315, "grad_norm": 2.73968505859375, "learning_rate": 4.74906655907137e-05, "loss": 1.0056, "step": 161700 }, { "epoch": 2.510901783081674, "grad_norm": 2.4706084728240967, "learning_rate": 4.748911373547076e-05, "loss": 0.9812, "step": 161800 }, { "epoch": 2.512453638324617, "grad_norm": 2.1051104068756104, "learning_rate": 4.748756188022782e-05, "loss": 0.9673, "step": 161900 }, { "epoch": 2.51400549356756, "grad_norm": 2.4008686542510986, "learning_rate": 4.748601002498487e-05, "loss": 1.0106, "step": 162000 }, { "epoch": 2.515557348810503, "grad_norm": 2.02608060836792, "learning_rate": 4.748445816974193e-05, "loss": 1.0016, "step": 162100 }, { "epoch": 2.5171092040534457, "grad_norm": 2.208484411239624, "learning_rate": 4.7482906314498985e-05, "loss": 1.0162, "step": 162200 }, { "epoch": 2.518661059296389, "grad_norm": 1.7745296955108643, "learning_rate": 4.748135445925604e-05, "loss": 0.9952, "step": 162300 }, { "epoch": 2.5202129145393317, "grad_norm": 2.1560864448547363, "learning_rate": 4.74798026040131e-05, "loss": 0.9958, "step": 162400 }, { "epoch": 2.5217647697822745, "grad_norm": 2.0452051162719727, "learning_rate": 4.747825074877016e-05, "loss": 0.9885, "step": 162500 }, { "epoch": 2.5233166250252177, "grad_norm": 2.3455212116241455, "learning_rate": 4.7476698893527216e-05, "loss": 0.9799, "step": 162600 }, { "epoch": 2.5248684802681605, "grad_norm": 2.6644575595855713, "learning_rate": 4.7475147038284274e-05, "loss": 0.99, "step": 162700 }, { "epoch": 2.5264203355111037, "grad_norm": 2.5845577716827393, "learning_rate": 4.747359518304133e-05, "loss": 0.9892, "step": 162800 }, { "epoch": 2.5279721907540464, "grad_norm": 2.0698750019073486, "learning_rate": 4.747204332779839e-05, "loss": 0.9861, "step": 162900 }, { "epoch": 2.5295240459969897, "grad_norm": 2.323091506958008, "learning_rate": 4.747049147255545e-05, "loss": 0.9869, "step": 163000 }, { "epoch": 2.5310759012399324, "grad_norm": 2.3332982063293457, "learning_rate": 4.74689396173125e-05, "loss": 1.0066, "step": 163100 }, { "epoch": 2.532627756482875, "grad_norm": 1.9896697998046875, "learning_rate": 4.7467387762069556e-05, "loss": 1.0055, "step": 163200 }, { "epoch": 2.5341796117258184, "grad_norm": 2.44010329246521, "learning_rate": 4.7465835906826614e-05, "loss": 1.0212, "step": 163300 }, { "epoch": 2.535731466968761, "grad_norm": 2.540306329727173, "learning_rate": 4.746428405158367e-05, "loss": 0.9946, "step": 163400 }, { "epoch": 2.537283322211704, "grad_norm": 2.677839517593384, "learning_rate": 4.746273219634072e-05, "loss": 0.9837, "step": 163500 }, { "epoch": 2.538835177454647, "grad_norm": 2.7422428131103516, "learning_rate": 4.746118034109778e-05, "loss": 0.9776, "step": 163600 }, { "epoch": 2.54038703269759, "grad_norm": 2.7356631755828857, "learning_rate": 4.745962848585484e-05, "loss": 1.0103, "step": 163700 }, { "epoch": 2.5419388879405327, "grad_norm": 2.4267399311065674, "learning_rate": 4.7458076630611896e-05, "loss": 1.0048, "step": 163800 }, { "epoch": 2.543490743183476, "grad_norm": 2.1737022399902344, "learning_rate": 4.745652477536895e-05, "loss": 1.0093, "step": 163900 }, { "epoch": 2.5450425984264187, "grad_norm": 2.1940906047821045, "learning_rate": 4.745497292012601e-05, "loss": 1.0065, "step": 164000 }, { "epoch": 2.546594453669362, "grad_norm": 2.1881892681121826, "learning_rate": 4.745342106488307e-05, "loss": 0.9911, "step": 164100 }, { "epoch": 2.5481463089123046, "grad_norm": 1.7818078994750977, "learning_rate": 4.7451869209640127e-05, "loss": 0.9838, "step": 164200 }, { "epoch": 2.549698164155248, "grad_norm": 2.4145798683166504, "learning_rate": 4.7450317354397184e-05, "loss": 0.9879, "step": 164300 }, { "epoch": 2.5512500193981906, "grad_norm": 2.940197706222534, "learning_rate": 4.744876549915424e-05, "loss": 0.9962, "step": 164400 }, { "epoch": 2.5528018746411334, "grad_norm": 2.556788682937622, "learning_rate": 4.74472136439113e-05, "loss": 0.9711, "step": 164500 }, { "epoch": 2.5543537298840766, "grad_norm": 3.122790813446045, "learning_rate": 4.744566178866836e-05, "loss": 0.9932, "step": 164600 }, { "epoch": 2.5559055851270194, "grad_norm": 2.5534470081329346, "learning_rate": 4.7444109933425415e-05, "loss": 0.9976, "step": 164700 }, { "epoch": 2.557457440369962, "grad_norm": 2.4225311279296875, "learning_rate": 4.7442558078182466e-05, "loss": 0.994, "step": 164800 }, { "epoch": 2.5590092956129054, "grad_norm": 2.438121795654297, "learning_rate": 4.7441006222939524e-05, "loss": 1.0001, "step": 164900 }, { "epoch": 2.560561150855848, "grad_norm": 2.6318442821502686, "learning_rate": 4.743945436769658e-05, "loss": 1.009, "step": 165000 }, { "epoch": 2.562113006098791, "grad_norm": 2.131669521331787, "learning_rate": 4.743790251245364e-05, "loss": 0.9794, "step": 165100 }, { "epoch": 2.563664861341734, "grad_norm": 1.6646571159362793, "learning_rate": 4.74363506572107e-05, "loss": 0.9842, "step": 165200 }, { "epoch": 2.565216716584677, "grad_norm": 3.072004795074463, "learning_rate": 4.7434798801967755e-05, "loss": 0.9867, "step": 165300 }, { "epoch": 2.56676857182762, "grad_norm": 2.429774761199951, "learning_rate": 4.743324694672481e-05, "loss": 1.0096, "step": 165400 }, { "epoch": 2.568320427070563, "grad_norm": 2.388366937637329, "learning_rate": 4.743169509148187e-05, "loss": 0.9678, "step": 165500 }, { "epoch": 2.569872282313506, "grad_norm": 2.5240445137023926, "learning_rate": 4.743014323623893e-05, "loss": 0.9842, "step": 165600 }, { "epoch": 2.571424137556449, "grad_norm": 2.2264091968536377, "learning_rate": 4.7428591380995986e-05, "loss": 0.984, "step": 165700 }, { "epoch": 2.5729759927993916, "grad_norm": 2.1459665298461914, "learning_rate": 4.7427039525753044e-05, "loss": 0.9984, "step": 165800 }, { "epoch": 2.574527848042335, "grad_norm": 2.6881089210510254, "learning_rate": 4.74254876705101e-05, "loss": 0.989, "step": 165900 }, { "epoch": 2.5760797032852776, "grad_norm": 2.0024867057800293, "learning_rate": 4.742393581526716e-05, "loss": 0.9984, "step": 166000 }, { "epoch": 2.5776315585282203, "grad_norm": 2.0165743827819824, "learning_rate": 4.742238396002421e-05, "loss": 0.9819, "step": 166100 }, { "epoch": 2.5791834137711636, "grad_norm": 2.4074063301086426, "learning_rate": 4.742083210478127e-05, "loss": 0.9898, "step": 166200 }, { "epoch": 2.5807352690141063, "grad_norm": 2.484448194503784, "learning_rate": 4.7419280249538326e-05, "loss": 0.9979, "step": 166300 }, { "epoch": 2.582287124257049, "grad_norm": 2.0884323120117188, "learning_rate": 4.741772839429538e-05, "loss": 0.9867, "step": 166400 }, { "epoch": 2.5838389794999923, "grad_norm": 2.725853681564331, "learning_rate": 4.7416176539052435e-05, "loss": 0.9979, "step": 166500 }, { "epoch": 2.585390834742935, "grad_norm": 2.089369058609009, "learning_rate": 4.741462468380949e-05, "loss": 0.973, "step": 166600 }, { "epoch": 2.586942689985878, "grad_norm": 7.116908073425293, "learning_rate": 4.741307282856655e-05, "loss": 0.9947, "step": 166700 }, { "epoch": 2.588494545228821, "grad_norm": 2.9478468894958496, "learning_rate": 4.741152097332361e-05, "loss": 0.9747, "step": 166800 }, { "epoch": 2.5900464004717643, "grad_norm": 2.5385630130767822, "learning_rate": 4.7409969118080666e-05, "loss": 0.9935, "step": 166900 }, { "epoch": 2.591598255714707, "grad_norm": 2.3281638622283936, "learning_rate": 4.740841726283772e-05, "loss": 0.9979, "step": 167000 }, { "epoch": 2.59315011095765, "grad_norm": 2.7799441814422607, "learning_rate": 4.740686540759478e-05, "loss": 0.9841, "step": 167100 }, { "epoch": 2.594701966200593, "grad_norm": 2.4771296977996826, "learning_rate": 4.740531355235184e-05, "loss": 0.9842, "step": 167200 }, { "epoch": 2.596253821443536, "grad_norm": 2.2049593925476074, "learning_rate": 4.7403761697108897e-05, "loss": 1.0019, "step": 167300 }, { "epoch": 2.5978056766864785, "grad_norm": 2.5260963439941406, "learning_rate": 4.7402209841865954e-05, "loss": 0.9886, "step": 167400 }, { "epoch": 2.5993575319294218, "grad_norm": 3.12412166595459, "learning_rate": 4.740065798662301e-05, "loss": 1.0141, "step": 167500 }, { "epoch": 2.6009093871723645, "grad_norm": 2.21675181388855, "learning_rate": 4.739910613138007e-05, "loss": 0.9912, "step": 167600 }, { "epoch": 2.6024612424153073, "grad_norm": 2.28782320022583, "learning_rate": 4.739755427613712e-05, "loss": 0.9784, "step": 167700 }, { "epoch": 2.6040130976582505, "grad_norm": 2.7144248485565186, "learning_rate": 4.739600242089418e-05, "loss": 1.0071, "step": 167800 }, { "epoch": 2.6055649529011933, "grad_norm": 2.3493473529815674, "learning_rate": 4.7394450565651236e-05, "loss": 0.9913, "step": 167900 }, { "epoch": 2.607116808144136, "grad_norm": 2.1676876544952393, "learning_rate": 4.7392898710408294e-05, "loss": 0.9831, "step": 168000 }, { "epoch": 2.6086686633870793, "grad_norm": 2.646960973739624, "learning_rate": 4.739134685516535e-05, "loss": 0.9915, "step": 168100 }, { "epoch": 2.6102205186300225, "grad_norm": 2.486538887023926, "learning_rate": 4.738979499992241e-05, "loss": 0.9945, "step": 168200 }, { "epoch": 2.6117723738729652, "grad_norm": 2.5518722534179688, "learning_rate": 4.738824314467947e-05, "loss": 1.0, "step": 168300 }, { "epoch": 2.613324229115908, "grad_norm": 2.252894163131714, "learning_rate": 4.7386691289436525e-05, "loss": 0.9783, "step": 168400 }, { "epoch": 2.614876084358851, "grad_norm": 2.3624937534332275, "learning_rate": 4.738513943419358e-05, "loss": 1.0096, "step": 168500 }, { "epoch": 2.616427939601794, "grad_norm": 2.4612812995910645, "learning_rate": 4.738358757895064e-05, "loss": 0.9853, "step": 168600 }, { "epoch": 2.6179797948447368, "grad_norm": 2.0819272994995117, "learning_rate": 4.73820357237077e-05, "loss": 0.9905, "step": 168700 }, { "epoch": 2.61953165008768, "grad_norm": 2.363706350326538, "learning_rate": 4.7380483868464756e-05, "loss": 1.0019, "step": 168800 }, { "epoch": 2.6210835053306227, "grad_norm": 2.1568329334259033, "learning_rate": 4.7378932013221814e-05, "loss": 1.0051, "step": 168900 }, { "epoch": 2.6226353605735655, "grad_norm": 2.276188611984253, "learning_rate": 4.7377380157978865e-05, "loss": 1.0047, "step": 169000 }, { "epoch": 2.6241872158165087, "grad_norm": 2.274855375289917, "learning_rate": 4.737582830273592e-05, "loss": 0.9818, "step": 169100 }, { "epoch": 2.6257390710594515, "grad_norm": 2.509441375732422, "learning_rate": 4.737427644749298e-05, "loss": 0.989, "step": 169200 }, { "epoch": 2.6272909263023942, "grad_norm": 2.3149781227111816, "learning_rate": 4.737272459225004e-05, "loss": 1.022, "step": 169300 }, { "epoch": 2.6288427815453375, "grad_norm": 2.175208806991577, "learning_rate": 4.7371172737007096e-05, "loss": 0.984, "step": 169400 }, { "epoch": 2.6303946367882802, "grad_norm": 2.4652843475341797, "learning_rate": 4.7369620881764154e-05, "loss": 1.0178, "step": 169500 }, { "epoch": 2.6319464920312234, "grad_norm": 2.5660815238952637, "learning_rate": 4.7368069026521205e-05, "loss": 0.9946, "step": 169600 }, { "epoch": 2.633498347274166, "grad_norm": 2.2193174362182617, "learning_rate": 4.736651717127826e-05, "loss": 0.9817, "step": 169700 }, { "epoch": 2.6350502025171094, "grad_norm": 2.9725096225738525, "learning_rate": 4.736496531603532e-05, "loss": 0.9878, "step": 169800 }, { "epoch": 2.636602057760052, "grad_norm": 2.5576412677764893, "learning_rate": 4.736341346079238e-05, "loss": 0.983, "step": 169900 }, { "epoch": 2.638153913002995, "grad_norm": 2.844352960586548, "learning_rate": 4.7361861605549436e-05, "loss": 0.9823, "step": 170000 }, { "epoch": 2.639705768245938, "grad_norm": 2.755368232727051, "learning_rate": 4.736030975030649e-05, "loss": 0.9959, "step": 170100 }, { "epoch": 2.641257623488881, "grad_norm": 2.64593768119812, "learning_rate": 4.735875789506355e-05, "loss": 0.9943, "step": 170200 }, { "epoch": 2.6428094787318237, "grad_norm": 2.880136728286743, "learning_rate": 4.735720603982061e-05, "loss": 0.9986, "step": 170300 }, { "epoch": 2.644361333974767, "grad_norm": 2.2513651847839355, "learning_rate": 4.7355654184577667e-05, "loss": 0.9872, "step": 170400 }, { "epoch": 2.6459131892177097, "grad_norm": 2.71427583694458, "learning_rate": 4.735410232933472e-05, "loss": 1.0026, "step": 170500 }, { "epoch": 2.6474650444606525, "grad_norm": 3.1854612827301025, "learning_rate": 4.7352550474091775e-05, "loss": 1.0136, "step": 170600 }, { "epoch": 2.6490168997035957, "grad_norm": 2.0370569229125977, "learning_rate": 4.735099861884883e-05, "loss": 0.98, "step": 170700 }, { "epoch": 2.6505687549465384, "grad_norm": 2.118340492248535, "learning_rate": 4.734944676360589e-05, "loss": 0.9873, "step": 170800 }, { "epoch": 2.6521206101894816, "grad_norm": 2.3475341796875, "learning_rate": 4.734789490836295e-05, "loss": 0.9856, "step": 170900 }, { "epoch": 2.6536724654324244, "grad_norm": 5.047443866729736, "learning_rate": 4.7346343053120006e-05, "loss": 0.992, "step": 171000 }, { "epoch": 2.6552243206753676, "grad_norm": 2.0929479598999023, "learning_rate": 4.7344791197877064e-05, "loss": 0.98, "step": 171100 }, { "epoch": 2.6567761759183104, "grad_norm": 2.3174355030059814, "learning_rate": 4.734323934263412e-05, "loss": 1.0152, "step": 171200 }, { "epoch": 2.658328031161253, "grad_norm": 2.5048956871032715, "learning_rate": 4.734168748739118e-05, "loss": 0.9802, "step": 171300 }, { "epoch": 2.6598798864041964, "grad_norm": 2.222971200942993, "learning_rate": 4.734013563214824e-05, "loss": 0.9795, "step": 171400 }, { "epoch": 2.661431741647139, "grad_norm": 2.5094597339630127, "learning_rate": 4.7338583776905295e-05, "loss": 1.0036, "step": 171500 }, { "epoch": 2.662983596890082, "grad_norm": 2.6034655570983887, "learning_rate": 4.733703192166235e-05, "loss": 0.9963, "step": 171600 }, { "epoch": 2.664535452133025, "grad_norm": 2.8701984882354736, "learning_rate": 4.733548006641941e-05, "loss": 0.9615, "step": 171700 }, { "epoch": 2.666087307375968, "grad_norm": 2.3495841026306152, "learning_rate": 4.733392821117646e-05, "loss": 1.0093, "step": 171800 }, { "epoch": 2.6676391626189107, "grad_norm": 2.1495566368103027, "learning_rate": 4.733237635593352e-05, "loss": 0.9749, "step": 171900 }, { "epoch": 2.669191017861854, "grad_norm": 2.361732244491577, "learning_rate": 4.733082450069058e-05, "loss": 1.0012, "step": 172000 }, { "epoch": 2.6707428731047966, "grad_norm": 2.4878969192504883, "learning_rate": 4.7329272645447635e-05, "loss": 1.0113, "step": 172100 }, { "epoch": 2.67229472834774, "grad_norm": 2.0869884490966797, "learning_rate": 4.732772079020469e-05, "loss": 1.0117, "step": 172200 }, { "epoch": 2.6738465835906826, "grad_norm": 3.075629472732544, "learning_rate": 4.732616893496175e-05, "loss": 1.006, "step": 172300 }, { "epoch": 2.675398438833626, "grad_norm": 2.0754053592681885, "learning_rate": 4.732461707971881e-05, "loss": 1.0, "step": 172400 }, { "epoch": 2.6769502940765686, "grad_norm": 2.5641214847564697, "learning_rate": 4.7323065224475866e-05, "loss": 0.9972, "step": 172500 }, { "epoch": 2.6785021493195114, "grad_norm": 2.230884313583374, "learning_rate": 4.7321513369232924e-05, "loss": 1.0067, "step": 172600 }, { "epoch": 2.6800540045624546, "grad_norm": 2.2756619453430176, "learning_rate": 4.731996151398998e-05, "loss": 0.9826, "step": 172700 }, { "epoch": 2.6816058598053973, "grad_norm": 2.6662137508392334, "learning_rate": 4.731840965874703e-05, "loss": 0.979, "step": 172800 }, { "epoch": 2.68315771504834, "grad_norm": 2.2045912742614746, "learning_rate": 4.731685780350409e-05, "loss": 1.0047, "step": 172900 }, { "epoch": 2.6847095702912833, "grad_norm": 2.341425657272339, "learning_rate": 4.731530594826115e-05, "loss": 0.9916, "step": 173000 }, { "epoch": 2.686261425534226, "grad_norm": 2.1368792057037354, "learning_rate": 4.7313754093018206e-05, "loss": 0.9822, "step": 173100 }, { "epoch": 2.687813280777169, "grad_norm": 2.745054244995117, "learning_rate": 4.731220223777526e-05, "loss": 1.0111, "step": 173200 }, { "epoch": 2.689365136020112, "grad_norm": 2.165926694869995, "learning_rate": 4.731065038253232e-05, "loss": 0.9955, "step": 173300 }, { "epoch": 2.690916991263055, "grad_norm": 2.4122159481048584, "learning_rate": 4.730909852728937e-05, "loss": 0.9694, "step": 173400 }, { "epoch": 2.692468846505998, "grad_norm": 2.880859851837158, "learning_rate": 4.730754667204643e-05, "loss": 1.0002, "step": 173500 }, { "epoch": 2.694020701748941, "grad_norm": 2.6584131717681885, "learning_rate": 4.730599481680349e-05, "loss": 0.9911, "step": 173600 }, { "epoch": 2.695572556991884, "grad_norm": 2.46101713180542, "learning_rate": 4.7304442961560545e-05, "loss": 0.9662, "step": 173700 }, { "epoch": 2.697124412234827, "grad_norm": 2.4239580631256104, "learning_rate": 4.73028911063176e-05, "loss": 0.9864, "step": 173800 }, { "epoch": 2.6986762674777696, "grad_norm": 2.637930154800415, "learning_rate": 4.730133925107466e-05, "loss": 1.0129, "step": 173900 }, { "epoch": 2.7002281227207128, "grad_norm": 2.4188449382781982, "learning_rate": 4.729978739583172e-05, "loss": 1.0017, "step": 174000 }, { "epoch": 2.7017799779636555, "grad_norm": 2.6869704723358154, "learning_rate": 4.7298235540588776e-05, "loss": 0.9996, "step": 174100 }, { "epoch": 2.7033318332065983, "grad_norm": 2.5650875568389893, "learning_rate": 4.7296683685345834e-05, "loss": 1.0007, "step": 174200 }, { "epoch": 2.7048836884495415, "grad_norm": 1.8697642087936401, "learning_rate": 4.729513183010289e-05, "loss": 1.0022, "step": 174300 }, { "epoch": 2.7064355436924843, "grad_norm": 2.651636838912964, "learning_rate": 4.729357997485995e-05, "loss": 0.9978, "step": 174400 }, { "epoch": 2.707987398935427, "grad_norm": 2.5349197387695312, "learning_rate": 4.729202811961701e-05, "loss": 0.9999, "step": 174500 }, { "epoch": 2.7095392541783703, "grad_norm": 1.9953054189682007, "learning_rate": 4.7290476264374065e-05, "loss": 0.9896, "step": 174600 }, { "epoch": 2.711091109421313, "grad_norm": 5.942043304443359, "learning_rate": 4.7288924409131116e-05, "loss": 0.985, "step": 174700 }, { "epoch": 2.7126429646642563, "grad_norm": 2.3352549076080322, "learning_rate": 4.7287372553888174e-05, "loss": 0.9939, "step": 174800 }, { "epoch": 2.714194819907199, "grad_norm": 1.9461196660995483, "learning_rate": 4.728582069864523e-05, "loss": 1.0062, "step": 174900 }, { "epoch": 2.7157466751501422, "grad_norm": 2.0776076316833496, "learning_rate": 4.728426884340229e-05, "loss": 0.9867, "step": 175000 }, { "epoch": 2.717298530393085, "grad_norm": 2.3396692276000977, "learning_rate": 4.728271698815935e-05, "loss": 1.0026, "step": 175100 }, { "epoch": 2.7188503856360278, "grad_norm": 2.4012486934661865, "learning_rate": 4.7281165132916405e-05, "loss": 1.0056, "step": 175200 }, { "epoch": 2.720402240878971, "grad_norm": 3.1229465007781982, "learning_rate": 4.727961327767346e-05, "loss": 0.9878, "step": 175300 }, { "epoch": 2.7219540961219137, "grad_norm": 2.413496971130371, "learning_rate": 4.727806142243052e-05, "loss": 0.9821, "step": 175400 }, { "epoch": 2.7235059513648565, "grad_norm": 2.672443151473999, "learning_rate": 4.727650956718758e-05, "loss": 0.9856, "step": 175500 }, { "epoch": 2.7250578066077997, "grad_norm": 3.0402121543884277, "learning_rate": 4.7274957711944636e-05, "loss": 1.0253, "step": 175600 }, { "epoch": 2.7266096618507425, "grad_norm": 2.5521719455718994, "learning_rate": 4.7273405856701694e-05, "loss": 0.9625, "step": 175700 }, { "epoch": 2.7281615170936853, "grad_norm": 2.7688469886779785, "learning_rate": 4.727185400145875e-05, "loss": 0.9886, "step": 175800 }, { "epoch": 2.7297133723366285, "grad_norm": 2.4066896438598633, "learning_rate": 4.727030214621581e-05, "loss": 1.0047, "step": 175900 }, { "epoch": 2.7312652275795712, "grad_norm": 2.470064401626587, "learning_rate": 4.726875029097286e-05, "loss": 0.9926, "step": 176000 }, { "epoch": 2.7328170828225145, "grad_norm": 2.272587299346924, "learning_rate": 4.726719843572992e-05, "loss": 0.983, "step": 176100 }, { "epoch": 2.734368938065457, "grad_norm": 2.1352832317352295, "learning_rate": 4.726564658048697e-05, "loss": 0.9929, "step": 176200 }, { "epoch": 2.7359207933084004, "grad_norm": 2.4178695678710938, "learning_rate": 4.7264094725244027e-05, "loss": 0.9797, "step": 176300 }, { "epoch": 2.737472648551343, "grad_norm": 2.5879135131835938, "learning_rate": 4.7262542870001084e-05, "loss": 0.9795, "step": 176400 }, { "epoch": 2.739024503794286, "grad_norm": 2.309870719909668, "learning_rate": 4.726099101475814e-05, "loss": 0.9967, "step": 176500 }, { "epoch": 2.740576359037229, "grad_norm": 2.5933077335357666, "learning_rate": 4.72594391595152e-05, "loss": 0.978, "step": 176600 }, { "epoch": 2.742128214280172, "grad_norm": 2.5511372089385986, "learning_rate": 4.725788730427226e-05, "loss": 0.9851, "step": 176700 }, { "epoch": 2.7436800695231147, "grad_norm": 2.139542818069458, "learning_rate": 4.7256335449029315e-05, "loss": 0.9846, "step": 176800 }, { "epoch": 2.745231924766058, "grad_norm": 2.372938871383667, "learning_rate": 4.725478359378637e-05, "loss": 0.9978, "step": 176900 }, { "epoch": 2.7467837800090007, "grad_norm": 2.665036201477051, "learning_rate": 4.725323173854343e-05, "loss": 0.9961, "step": 177000 }, { "epoch": 2.7483356352519435, "grad_norm": 2.3020882606506348, "learning_rate": 4.725167988330049e-05, "loss": 0.9753, "step": 177100 }, { "epoch": 2.7498874904948867, "grad_norm": 2.1583404541015625, "learning_rate": 4.7250128028057546e-05, "loss": 0.9866, "step": 177200 }, { "epoch": 2.7514393457378294, "grad_norm": 2.4504685401916504, "learning_rate": 4.7248576172814604e-05, "loss": 0.9794, "step": 177300 }, { "epoch": 2.7529912009807727, "grad_norm": 2.4960124492645264, "learning_rate": 4.724702431757166e-05, "loss": 0.9796, "step": 177400 }, { "epoch": 2.7545430562237154, "grad_norm": 2.4604580402374268, "learning_rate": 4.724547246232871e-05, "loss": 0.9869, "step": 177500 }, { "epoch": 2.7560949114666586, "grad_norm": 2.2762703895568848, "learning_rate": 4.724392060708577e-05, "loss": 1.022, "step": 177600 }, { "epoch": 2.7576467667096014, "grad_norm": 2.2202060222625732, "learning_rate": 4.724236875184283e-05, "loss": 0.9929, "step": 177700 }, { "epoch": 2.759198621952544, "grad_norm": 2.3492422103881836, "learning_rate": 4.7240816896599886e-05, "loss": 0.9909, "step": 177800 }, { "epoch": 2.7607504771954874, "grad_norm": 2.8247437477111816, "learning_rate": 4.7239265041356944e-05, "loss": 1.0156, "step": 177900 }, { "epoch": 2.76230233243843, "grad_norm": 2.1378419399261475, "learning_rate": 4.7237713186114e-05, "loss": 1.0119, "step": 178000 }, { "epoch": 2.763854187681373, "grad_norm": 2.154428243637085, "learning_rate": 4.723616133087106e-05, "loss": 0.9688, "step": 178100 }, { "epoch": 2.765406042924316, "grad_norm": 2.594407320022583, "learning_rate": 4.723460947562812e-05, "loss": 0.9966, "step": 178200 }, { "epoch": 2.766957898167259, "grad_norm": 2.6553072929382324, "learning_rate": 4.7233057620385175e-05, "loss": 0.972, "step": 178300 }, { "epoch": 2.7685097534102017, "grad_norm": 2.4965643882751465, "learning_rate": 4.723150576514223e-05, "loss": 0.9836, "step": 178400 }, { "epoch": 2.770061608653145, "grad_norm": 2.0333499908447266, "learning_rate": 4.722995390989929e-05, "loss": 0.9775, "step": 178500 }, { "epoch": 2.7716134638960876, "grad_norm": 2.33737850189209, "learning_rate": 4.722840205465635e-05, "loss": 0.9925, "step": 178600 }, { "epoch": 2.773165319139031, "grad_norm": 2.651899814605713, "learning_rate": 4.7226850199413406e-05, "loss": 0.969, "step": 178700 }, { "epoch": 2.7747171743819736, "grad_norm": 2.3534445762634277, "learning_rate": 4.722529834417046e-05, "loss": 1.0032, "step": 178800 }, { "epoch": 2.776269029624917, "grad_norm": 2.1851696968078613, "learning_rate": 4.7223746488927515e-05, "loss": 0.9909, "step": 178900 }, { "epoch": 2.7778208848678596, "grad_norm": 2.7513070106506348, "learning_rate": 4.722219463368457e-05, "loss": 1.0007, "step": 179000 }, { "epoch": 2.7793727401108024, "grad_norm": 2.6350746154785156, "learning_rate": 4.722064277844163e-05, "loss": 0.9814, "step": 179100 }, { "epoch": 2.7809245953537456, "grad_norm": 2.2522132396698, "learning_rate": 4.721909092319869e-05, "loss": 0.9758, "step": 179200 }, { "epoch": 2.7824764505966884, "grad_norm": 2.214057445526123, "learning_rate": 4.7217539067955746e-05, "loss": 0.9859, "step": 179300 }, { "epoch": 2.784028305839631, "grad_norm": 3.0641214847564697, "learning_rate": 4.7215987212712797e-05, "loss": 0.9745, "step": 179400 }, { "epoch": 2.7855801610825743, "grad_norm": 2.7389159202575684, "learning_rate": 4.7214435357469854e-05, "loss": 0.9934, "step": 179500 }, { "epoch": 2.787132016325517, "grad_norm": 2.199221134185791, "learning_rate": 4.721288350222691e-05, "loss": 0.9571, "step": 179600 }, { "epoch": 2.78868387156846, "grad_norm": 2.0574522018432617, "learning_rate": 4.721133164698397e-05, "loss": 1.004, "step": 179700 }, { "epoch": 2.790235726811403, "grad_norm": 2.952160596847534, "learning_rate": 4.720977979174103e-05, "loss": 0.9928, "step": 179800 }, { "epoch": 2.791787582054346, "grad_norm": 2.52586030960083, "learning_rate": 4.7208227936498085e-05, "loss": 0.9928, "step": 179900 }, { "epoch": 2.793339437297289, "grad_norm": 2.313163995742798, "learning_rate": 4.720667608125514e-05, "loss": 0.9708, "step": 180000 }, { "epoch": 2.794891292540232, "grad_norm": 2.1042230129241943, "learning_rate": 4.72051242260122e-05, "loss": 0.9884, "step": 180100 }, { "epoch": 2.796443147783175, "grad_norm": 2.5428500175476074, "learning_rate": 4.720357237076926e-05, "loss": 0.9924, "step": 180200 }, { "epoch": 2.797995003026118, "grad_norm": 1.7176594734191895, "learning_rate": 4.720202051552631e-05, "loss": 0.9913, "step": 180300 }, { "epoch": 2.7995468582690606, "grad_norm": 2.2201623916625977, "learning_rate": 4.720046866028337e-05, "loss": 1.2305, "step": 180400 }, { "epoch": 2.801098713512004, "grad_norm": 2.445692539215088, "learning_rate": 4.7198916805040425e-05, "loss": 0.9964, "step": 180500 }, { "epoch": 2.8026505687549466, "grad_norm": 2.393535852432251, "learning_rate": 4.719736494979748e-05, "loss": 0.9813, "step": 180600 }, { "epoch": 2.8042024239978893, "grad_norm": 3.0491392612457275, "learning_rate": 4.719581309455454e-05, "loss": 0.9678, "step": 180700 }, { "epoch": 2.8057542792408325, "grad_norm": 2.4926960468292236, "learning_rate": 4.71942612393116e-05, "loss": 0.9914, "step": 180800 }, { "epoch": 2.8073061344837753, "grad_norm": 2.792595624923706, "learning_rate": 4.7192709384068656e-05, "loss": 0.9843, "step": 180900 }, { "epoch": 2.808857989726718, "grad_norm": 2.99853515625, "learning_rate": 4.7191157528825714e-05, "loss": 1.0038, "step": 181000 }, { "epoch": 2.8104098449696613, "grad_norm": 2.5047404766082764, "learning_rate": 4.718960567358277e-05, "loss": 1.0207, "step": 181100 }, { "epoch": 2.811961700212604, "grad_norm": 2.323244094848633, "learning_rate": 4.718805381833983e-05, "loss": 0.9523, "step": 181200 }, { "epoch": 2.8135135554555473, "grad_norm": 2.707174062728882, "learning_rate": 4.718650196309689e-05, "loss": 0.9852, "step": 181300 }, { "epoch": 2.81506541069849, "grad_norm": 2.2547144889831543, "learning_rate": 4.7184950107853945e-05, "loss": 0.9926, "step": 181400 }, { "epoch": 2.8166172659414332, "grad_norm": 2.337104320526123, "learning_rate": 4.7183398252611e-05, "loss": 0.9833, "step": 181500 }, { "epoch": 2.818169121184376, "grad_norm": 2.428063154220581, "learning_rate": 4.7181846397368054e-05, "loss": 0.9952, "step": 181600 }, { "epoch": 2.819720976427319, "grad_norm": 2.4293065071105957, "learning_rate": 4.718029454212511e-05, "loss": 1.0072, "step": 181700 }, { "epoch": 2.821272831670262, "grad_norm": 3.1902127265930176, "learning_rate": 4.717874268688217e-05, "loss": 0.9764, "step": 181800 }, { "epoch": 2.8228246869132048, "grad_norm": 2.965871810913086, "learning_rate": 4.717719083163923e-05, "loss": 0.985, "step": 181900 }, { "epoch": 2.8243765421561475, "grad_norm": 2.6800999641418457, "learning_rate": 4.7175638976396285e-05, "loss": 0.995, "step": 182000 }, { "epoch": 2.8259283973990907, "grad_norm": 5.90096378326416, "learning_rate": 4.717408712115334e-05, "loss": 0.988, "step": 182100 }, { "epoch": 2.8274802526420335, "grad_norm": 2.396225929260254, "learning_rate": 4.71725352659104e-05, "loss": 0.975, "step": 182200 }, { "epoch": 2.8290321078849763, "grad_norm": 2.1389522552490234, "learning_rate": 4.717098341066746e-05, "loss": 1.0007, "step": 182300 }, { "epoch": 2.8305839631279195, "grad_norm": 2.8134517669677734, "learning_rate": 4.7169431555424516e-05, "loss": 1.0052, "step": 182400 }, { "epoch": 2.8321358183708623, "grad_norm": 2.4114389419555664, "learning_rate": 4.716787970018157e-05, "loss": 0.9851, "step": 182500 }, { "epoch": 2.8336876736138055, "grad_norm": 1.8384065628051758, "learning_rate": 4.7166327844938624e-05, "loss": 0.9874, "step": 182600 }, { "epoch": 2.8352395288567482, "grad_norm": 2.622605562210083, "learning_rate": 4.716477598969568e-05, "loss": 0.9793, "step": 182700 }, { "epoch": 2.8367913840996914, "grad_norm": 2.2892489433288574, "learning_rate": 4.716322413445274e-05, "loss": 0.9925, "step": 182800 }, { "epoch": 2.838343239342634, "grad_norm": 2.62580943107605, "learning_rate": 4.71616722792098e-05, "loss": 0.9877, "step": 182900 }, { "epoch": 2.839895094585577, "grad_norm": 2.1189937591552734, "learning_rate": 4.7160120423966855e-05, "loss": 0.9669, "step": 183000 }, { "epoch": 2.84144694982852, "grad_norm": 2.388594150543213, "learning_rate": 4.715856856872391e-05, "loss": 0.9845, "step": 183100 }, { "epoch": 2.842998805071463, "grad_norm": 2.1766397953033447, "learning_rate": 4.7157016713480964e-05, "loss": 0.9917, "step": 183200 }, { "epoch": 2.8445506603144057, "grad_norm": 2.2372965812683105, "learning_rate": 4.715546485823802e-05, "loss": 0.9965, "step": 183300 }, { "epoch": 2.846102515557349, "grad_norm": 2.5037407875061035, "learning_rate": 4.715391300299508e-05, "loss": 1.0012, "step": 183400 }, { "epoch": 2.8476543708002917, "grad_norm": 2.8681583404541016, "learning_rate": 4.715236114775214e-05, "loss": 0.9943, "step": 183500 }, { "epoch": 2.8492062260432345, "grad_norm": 2.420591354370117, "learning_rate": 4.7150809292509195e-05, "loss": 0.9664, "step": 183600 }, { "epoch": 2.8507580812861777, "grad_norm": 2.633864402770996, "learning_rate": 4.714925743726625e-05, "loss": 0.9851, "step": 183700 }, { "epoch": 2.8523099365291205, "grad_norm": 2.613577127456665, "learning_rate": 4.714770558202331e-05, "loss": 0.9844, "step": 183800 }, { "epoch": 2.8538617917720632, "grad_norm": 1.9283421039581299, "learning_rate": 4.714615372678037e-05, "loss": 0.9884, "step": 183900 }, { "epoch": 2.8554136470150064, "grad_norm": 2.695903778076172, "learning_rate": 4.7144601871537426e-05, "loss": 0.9865, "step": 184000 }, { "epoch": 2.8569655022579497, "grad_norm": 2.2344579696655273, "learning_rate": 4.7143050016294484e-05, "loss": 0.9973, "step": 184100 }, { "epoch": 2.8585173575008924, "grad_norm": 2.056467294692993, "learning_rate": 4.714149816105154e-05, "loss": 0.9826, "step": 184200 }, { "epoch": 2.860069212743835, "grad_norm": 2.838608741760254, "learning_rate": 4.71399463058086e-05, "loss": 0.9972, "step": 184300 }, { "epoch": 2.8616210679867784, "grad_norm": 2.208022117614746, "learning_rate": 4.713839445056566e-05, "loss": 0.9868, "step": 184400 }, { "epoch": 2.863172923229721, "grad_norm": 2.1760849952697754, "learning_rate": 4.713684259532271e-05, "loss": 0.9808, "step": 184500 }, { "epoch": 2.864724778472664, "grad_norm": 5.9134931564331055, "learning_rate": 4.7135290740079766e-05, "loss": 0.9534, "step": 184600 }, { "epoch": 2.866276633715607, "grad_norm": 2.5556282997131348, "learning_rate": 4.7133738884836824e-05, "loss": 0.9941, "step": 184700 }, { "epoch": 2.86782848895855, "grad_norm": 2.077636241912842, "learning_rate": 4.713218702959388e-05, "loss": 0.9847, "step": 184800 }, { "epoch": 2.8693803442014927, "grad_norm": 2.276919364929199, "learning_rate": 4.713063517435094e-05, "loss": 0.9599, "step": 184900 }, { "epoch": 2.870932199444436, "grad_norm": 2.562828779220581, "learning_rate": 4.7129083319108e-05, "loss": 0.9794, "step": 185000 }, { "epoch": 2.8724840546873787, "grad_norm": 2.4456658363342285, "learning_rate": 4.7127531463865055e-05, "loss": 0.9828, "step": 185100 }, { "epoch": 2.8740359099303214, "grad_norm": 2.291468381881714, "learning_rate": 4.712597960862211e-05, "loss": 0.9724, "step": 185200 }, { "epoch": 2.8755877651732646, "grad_norm": 2.1348204612731934, "learning_rate": 4.712442775337917e-05, "loss": 1.0041, "step": 185300 }, { "epoch": 2.8771396204162074, "grad_norm": 2.3041205406188965, "learning_rate": 4.712287589813623e-05, "loss": 0.9899, "step": 185400 }, { "epoch": 2.8786914756591506, "grad_norm": 2.1616082191467285, "learning_rate": 4.7121324042893286e-05, "loss": 0.9668, "step": 185500 }, { "epoch": 2.8802433309020934, "grad_norm": 2.1785802841186523, "learning_rate": 4.711977218765034e-05, "loss": 0.9662, "step": 185600 }, { "epoch": 2.8817951861450366, "grad_norm": 2.327796697616577, "learning_rate": 4.71182203324074e-05, "loss": 1.0506, "step": 185700 }, { "epoch": 2.8833470413879794, "grad_norm": 2.327085018157959, "learning_rate": 4.711666847716445e-05, "loss": 0.9769, "step": 185800 }, { "epoch": 2.884898896630922, "grad_norm": 1.8716803789138794, "learning_rate": 4.711511662192151e-05, "loss": 0.9916, "step": 185900 }, { "epoch": 2.8864507518738654, "grad_norm": 2.145085573196411, "learning_rate": 4.711356476667856e-05, "loss": 0.9868, "step": 186000 }, { "epoch": 2.888002607116808, "grad_norm": 2.582432746887207, "learning_rate": 4.711201291143562e-05, "loss": 0.9815, "step": 186100 }, { "epoch": 2.889554462359751, "grad_norm": 2.7264928817749023, "learning_rate": 4.7110461056192676e-05, "loss": 0.9845, "step": 186200 }, { "epoch": 2.891106317602694, "grad_norm": 2.591337203979492, "learning_rate": 4.7108909200949734e-05, "loss": 0.9997, "step": 186300 }, { "epoch": 2.892658172845637, "grad_norm": 2.2320847511291504, "learning_rate": 4.710735734570679e-05, "loss": 0.9793, "step": 186400 }, { "epoch": 2.8942100280885796, "grad_norm": 2.429320812225342, "learning_rate": 4.710580549046385e-05, "loss": 0.9813, "step": 186500 }, { "epoch": 2.895761883331523, "grad_norm": 2.2250733375549316, "learning_rate": 4.710425363522091e-05, "loss": 0.9781, "step": 186600 }, { "epoch": 2.8973137385744656, "grad_norm": 2.420982837677002, "learning_rate": 4.7102701779977965e-05, "loss": 1.0103, "step": 186700 }, { "epoch": 2.898865593817409, "grad_norm": 2.534336805343628, "learning_rate": 4.710114992473502e-05, "loss": 0.9712, "step": 186800 }, { "epoch": 2.9004174490603516, "grad_norm": 2.2477493286132812, "learning_rate": 4.709959806949208e-05, "loss": 1.0067, "step": 186900 }, { "epoch": 2.901969304303295, "grad_norm": 2.078195095062256, "learning_rate": 4.709804621424914e-05, "loss": 0.9897, "step": 187000 }, { "epoch": 2.9035211595462376, "grad_norm": 2.63346791267395, "learning_rate": 4.7096494359006196e-05, "loss": 1.0064, "step": 187100 }, { "epoch": 2.9050730147891803, "grad_norm": 2.4181761741638184, "learning_rate": 4.7094942503763254e-05, "loss": 0.9772, "step": 187200 }, { "epoch": 2.9066248700321236, "grad_norm": 2.8212714195251465, "learning_rate": 4.7093390648520305e-05, "loss": 0.9645, "step": 187300 }, { "epoch": 2.9081767252750663, "grad_norm": 2.40470290184021, "learning_rate": 4.709183879327736e-05, "loss": 0.9896, "step": 187400 }, { "epoch": 2.909728580518009, "grad_norm": 2.4414870738983154, "learning_rate": 4.709028693803442e-05, "loss": 0.9906, "step": 187500 }, { "epoch": 2.9112804357609523, "grad_norm": 2.5339202880859375, "learning_rate": 4.708873508279148e-05, "loss": 0.9726, "step": 187600 }, { "epoch": 2.912832291003895, "grad_norm": 2.471184253692627, "learning_rate": 4.7087183227548536e-05, "loss": 0.9607, "step": 187700 }, { "epoch": 2.914384146246838, "grad_norm": 3.3048973083496094, "learning_rate": 4.7085631372305594e-05, "loss": 0.9752, "step": 187800 }, { "epoch": 2.915936001489781, "grad_norm": 3.715627908706665, "learning_rate": 4.708407951706265e-05, "loss": 0.9897, "step": 187900 }, { "epoch": 2.917487856732724, "grad_norm": 3.9594545364379883, "learning_rate": 4.708252766181971e-05, "loss": 0.9795, "step": 188000 }, { "epoch": 2.919039711975667, "grad_norm": 2.4421803951263428, "learning_rate": 4.708097580657677e-05, "loss": 0.9861, "step": 188100 }, { "epoch": 2.92059156721861, "grad_norm": 2.7301816940307617, "learning_rate": 4.7079423951333825e-05, "loss": 0.9954, "step": 188200 }, { "epoch": 2.922143422461553, "grad_norm": 2.9197609424591064, "learning_rate": 4.707787209609088e-05, "loss": 1.0025, "step": 188300 }, { "epoch": 2.9236952777044958, "grad_norm": 3.304325580596924, "learning_rate": 4.707632024084794e-05, "loss": 0.9864, "step": 188400 }, { "epoch": 2.9252471329474385, "grad_norm": 2.532344341278076, "learning_rate": 4.7074768385605e-05, "loss": 1.013, "step": 188500 }, { "epoch": 2.9267989881903818, "grad_norm": 2.355011463165283, "learning_rate": 4.707321653036205e-05, "loss": 0.9789, "step": 188600 }, { "epoch": 2.9283508434333245, "grad_norm": 2.5637922286987305, "learning_rate": 4.7071664675119107e-05, "loss": 0.9639, "step": 188700 }, { "epoch": 2.9299026986762673, "grad_norm": 2.3940951824188232, "learning_rate": 4.7070112819876164e-05, "loss": 0.9641, "step": 188800 }, { "epoch": 2.9314545539192105, "grad_norm": 2.126951217651367, "learning_rate": 4.706856096463322e-05, "loss": 0.9934, "step": 188900 }, { "epoch": 2.9330064091621533, "grad_norm": 2.6418213844299316, "learning_rate": 4.706700910939028e-05, "loss": 0.9807, "step": 189000 }, { "epoch": 2.934558264405096, "grad_norm": 2.4075067043304443, "learning_rate": 4.706545725414733e-05, "loss": 0.985, "step": 189100 }, { "epoch": 2.9361101196480393, "grad_norm": 3.074343681335449, "learning_rate": 4.706390539890439e-05, "loss": 0.9735, "step": 189200 }, { "epoch": 2.937661974890982, "grad_norm": 2.015557050704956, "learning_rate": 4.7062353543661446e-05, "loss": 0.9714, "step": 189300 }, { "epoch": 2.9392138301339252, "grad_norm": 2.6027753353118896, "learning_rate": 4.7060801688418504e-05, "loss": 0.9778, "step": 189400 }, { "epoch": 2.940765685376868, "grad_norm": 2.508507013320923, "learning_rate": 4.705924983317556e-05, "loss": 0.9712, "step": 189500 }, { "epoch": 2.942317540619811, "grad_norm": 2.535879135131836, "learning_rate": 4.705769797793262e-05, "loss": 0.979, "step": 189600 }, { "epoch": 2.943869395862754, "grad_norm": 2.550832509994507, "learning_rate": 4.705614612268968e-05, "loss": 0.9844, "step": 189700 }, { "epoch": 2.9454212511056967, "grad_norm": 2.275442123413086, "learning_rate": 4.7054594267446735e-05, "loss": 0.9863, "step": 189800 }, { "epoch": 2.94697310634864, "grad_norm": 2.6009528636932373, "learning_rate": 4.705304241220379e-05, "loss": 0.9795, "step": 189900 }, { "epoch": 2.9485249615915827, "grad_norm": 2.1449105739593506, "learning_rate": 4.705149055696085e-05, "loss": 0.9899, "step": 190000 }, { "epoch": 2.9500768168345255, "grad_norm": 1.9790186882019043, "learning_rate": 4.704993870171791e-05, "loss": 0.9956, "step": 190100 }, { "epoch": 2.9516286720774687, "grad_norm": 1.9937858581542969, "learning_rate": 4.704838684647496e-05, "loss": 1.0062, "step": 190200 }, { "epoch": 2.9531805273204115, "grad_norm": 2.633288860321045, "learning_rate": 4.704683499123202e-05, "loss": 0.9807, "step": 190300 }, { "epoch": 2.9547323825633542, "grad_norm": 2.0682358741760254, "learning_rate": 4.7045283135989075e-05, "loss": 0.9656, "step": 190400 }, { "epoch": 2.9562842378062975, "grad_norm": 1.9631118774414062, "learning_rate": 4.704373128074613e-05, "loss": 0.9743, "step": 190500 }, { "epoch": 2.9578360930492402, "grad_norm": 2.243213176727295, "learning_rate": 4.704217942550319e-05, "loss": 0.9541, "step": 190600 }, { "epoch": 2.9593879482921834, "grad_norm": 2.67138671875, "learning_rate": 4.704062757026025e-05, "loss": 0.9996, "step": 190700 }, { "epoch": 2.960939803535126, "grad_norm": 2.994114398956299, "learning_rate": 4.7039075715017306e-05, "loss": 0.9804, "step": 190800 }, { "epoch": 2.9624916587780694, "grad_norm": 2.3190019130706787, "learning_rate": 4.7037523859774364e-05, "loss": 0.9916, "step": 190900 }, { "epoch": 2.964043514021012, "grad_norm": 2.0402817726135254, "learning_rate": 4.703597200453142e-05, "loss": 0.9952, "step": 191000 }, { "epoch": 2.965595369263955, "grad_norm": 2.3033385276794434, "learning_rate": 4.703442014928848e-05, "loss": 0.9685, "step": 191100 }, { "epoch": 2.967147224506898, "grad_norm": 2.0917930603027344, "learning_rate": 4.703286829404554e-05, "loss": 0.9782, "step": 191200 }, { "epoch": 2.968699079749841, "grad_norm": 3.5316965579986572, "learning_rate": 4.7031316438802595e-05, "loss": 0.9928, "step": 191300 }, { "epoch": 2.9702509349927837, "grad_norm": 2.6135449409484863, "learning_rate": 4.702976458355965e-05, "loss": 0.9783, "step": 191400 }, { "epoch": 2.971802790235727, "grad_norm": 2.495619058609009, "learning_rate": 4.70282127283167e-05, "loss": 0.9671, "step": 191500 }, { "epoch": 2.9733546454786697, "grad_norm": 2.468709707260132, "learning_rate": 4.702666087307376e-05, "loss": 0.9968, "step": 191600 }, { "epoch": 2.9749065007216124, "grad_norm": 2.1931636333465576, "learning_rate": 4.702510901783082e-05, "loss": 0.9782, "step": 191700 }, { "epoch": 2.9764583559645557, "grad_norm": 2.253790855407715, "learning_rate": 4.7023557162587877e-05, "loss": 0.9775, "step": 191800 }, { "epoch": 2.9780102112074984, "grad_norm": 2.77315616607666, "learning_rate": 4.7022005307344934e-05, "loss": 0.9991, "step": 191900 }, { "epoch": 2.9795620664504416, "grad_norm": 2.249595880508423, "learning_rate": 4.702045345210199e-05, "loss": 0.9856, "step": 192000 }, { "epoch": 2.9811139216933844, "grad_norm": 2.2015538215637207, "learning_rate": 4.701890159685905e-05, "loss": 0.9611, "step": 192100 }, { "epoch": 2.9826657769363276, "grad_norm": 2.511009931564331, "learning_rate": 4.701734974161611e-05, "loss": 1.0019, "step": 192200 }, { "epoch": 2.9842176321792704, "grad_norm": 2.7516844272613525, "learning_rate": 4.7015797886373165e-05, "loss": 0.9778, "step": 192300 }, { "epoch": 2.985769487422213, "grad_norm": 2.7935068607330322, "learning_rate": 4.7014246031130216e-05, "loss": 0.9843, "step": 192400 }, { "epoch": 2.9873213426651564, "grad_norm": 12.373126029968262, "learning_rate": 4.7012694175887274e-05, "loss": 0.9879, "step": 192500 }, { "epoch": 2.988873197908099, "grad_norm": 2.2637534141540527, "learning_rate": 4.701114232064433e-05, "loss": 0.9786, "step": 192600 }, { "epoch": 2.990425053151042, "grad_norm": 2.595670461654663, "learning_rate": 4.700959046540139e-05, "loss": 1.0302, "step": 192700 }, { "epoch": 2.991976908393985, "grad_norm": 2.5891036987304688, "learning_rate": 4.700803861015845e-05, "loss": 0.9895, "step": 192800 }, { "epoch": 2.993528763636928, "grad_norm": 2.626511812210083, "learning_rate": 4.7006486754915505e-05, "loss": 1.0101, "step": 192900 }, { "epoch": 2.9950806188798706, "grad_norm": 2.105192184448242, "learning_rate": 4.7004934899672556e-05, "loss": 0.9789, "step": 193000 }, { "epoch": 2.996632474122814, "grad_norm": 2.2023699283599854, "learning_rate": 4.7003383044429614e-05, "loss": 0.9724, "step": 193100 }, { "epoch": 2.9981843293657566, "grad_norm": 3.483053207397461, "learning_rate": 4.700183118918667e-05, "loss": 0.9987, "step": 193200 }, { "epoch": 2.9997361846087, "grad_norm": 2.3124778270721436, "learning_rate": 4.700027933394373e-05, "loss": 0.9681, "step": 193300 }, { "epoch": 3.0012880398516426, "grad_norm": 2.3458285331726074, "learning_rate": 4.699872747870079e-05, "loss": 0.9898, "step": 193400 }, { "epoch": 3.0028398950945854, "grad_norm": 2.796020984649658, "learning_rate": 4.6997175623457845e-05, "loss": 1.0042, "step": 193500 }, { "epoch": 3.0043917503375286, "grad_norm": 2.36063289642334, "learning_rate": 4.69956237682149e-05, "loss": 0.9811, "step": 193600 }, { "epoch": 3.0059436055804714, "grad_norm": 2.694326400756836, "learning_rate": 4.699407191297196e-05, "loss": 1.0066, "step": 193700 }, { "epoch": 3.0074954608234146, "grad_norm": 2.334594249725342, "learning_rate": 4.699252005772902e-05, "loss": 0.9766, "step": 193800 }, { "epoch": 3.0090473160663573, "grad_norm": 2.325331926345825, "learning_rate": 4.6990968202486076e-05, "loss": 0.9814, "step": 193900 }, { "epoch": 3.0105991713093, "grad_norm": 2.487588405609131, "learning_rate": 4.6989416347243134e-05, "loss": 0.9735, "step": 194000 }, { "epoch": 3.0121510265522433, "grad_norm": 2.3206117153167725, "learning_rate": 4.698786449200019e-05, "loss": 0.9734, "step": 194100 }, { "epoch": 3.013702881795186, "grad_norm": 2.617901563644409, "learning_rate": 4.698631263675725e-05, "loss": 0.9794, "step": 194200 }, { "epoch": 3.0152547370381293, "grad_norm": 2.2743725776672363, "learning_rate": 4.69847607815143e-05, "loss": 0.9629, "step": 194300 }, { "epoch": 3.016806592281072, "grad_norm": 2.2770254611968994, "learning_rate": 4.698320892627136e-05, "loss": 0.9787, "step": 194400 }, { "epoch": 3.018358447524015, "grad_norm": 2.2435994148254395, "learning_rate": 4.6981657071028416e-05, "loss": 0.9865, "step": 194500 }, { "epoch": 3.019910302766958, "grad_norm": 2.6161680221557617, "learning_rate": 4.698010521578547e-05, "loss": 0.9851, "step": 194600 }, { "epoch": 3.021462158009901, "grad_norm": 2.418020486831665, "learning_rate": 4.697855336054253e-05, "loss": 1.005, "step": 194700 }, { "epoch": 3.0230140132528436, "grad_norm": 2.3490285873413086, "learning_rate": 4.697700150529959e-05, "loss": 0.962, "step": 194800 }, { "epoch": 3.024565868495787, "grad_norm": 2.7330353260040283, "learning_rate": 4.6975449650056647e-05, "loss": 0.9994, "step": 194900 }, { "epoch": 3.0261177237387296, "grad_norm": 4.525585174560547, "learning_rate": 4.6973897794813704e-05, "loss": 0.9775, "step": 195000 }, { "epoch": 3.0276695789816728, "grad_norm": 2.649479389190674, "learning_rate": 4.697234593957076e-05, "loss": 0.9754, "step": 195100 }, { "epoch": 3.0292214342246155, "grad_norm": 2.4138944149017334, "learning_rate": 4.697079408432782e-05, "loss": 0.9576, "step": 195200 }, { "epoch": 3.0307732894675583, "grad_norm": 2.4383480548858643, "learning_rate": 4.696924222908488e-05, "loss": 0.9771, "step": 195300 }, { "epoch": 3.0323251447105015, "grad_norm": 2.810619831085205, "learning_rate": 4.6967690373841935e-05, "loss": 0.9931, "step": 195400 }, { "epoch": 3.0338769999534443, "grad_norm": 2.350973606109619, "learning_rate": 4.696613851859899e-05, "loss": 0.995, "step": 195500 }, { "epoch": 3.0354288551963875, "grad_norm": 2.091646909713745, "learning_rate": 4.6964586663356044e-05, "loss": 0.9676, "step": 195600 }, { "epoch": 3.0369807104393303, "grad_norm": 2.3971426486968994, "learning_rate": 4.69630348081131e-05, "loss": 0.9889, "step": 195700 }, { "epoch": 3.038532565682273, "grad_norm": 2.2291464805603027, "learning_rate": 4.696148295287015e-05, "loss": 0.9918, "step": 195800 }, { "epoch": 3.0400844209252162, "grad_norm": 2.5356991291046143, "learning_rate": 4.695993109762721e-05, "loss": 0.9786, "step": 195900 }, { "epoch": 3.041636276168159, "grad_norm": 2.3369204998016357, "learning_rate": 4.695837924238427e-05, "loss": 0.978, "step": 196000 }, { "epoch": 3.043188131411102, "grad_norm": 2.1558306217193604, "learning_rate": 4.6956827387141326e-05, "loss": 0.9634, "step": 196100 }, { "epoch": 3.044739986654045, "grad_norm": 2.7331583499908447, "learning_rate": 4.6955275531898384e-05, "loss": 0.9866, "step": 196200 }, { "epoch": 3.0462918418969878, "grad_norm": 1.9750306606292725, "learning_rate": 4.695372367665544e-05, "loss": 0.974, "step": 196300 }, { "epoch": 3.047843697139931, "grad_norm": 1.9855722188949585, "learning_rate": 4.69521718214125e-05, "loss": 0.9673, "step": 196400 }, { "epoch": 3.0493955523828737, "grad_norm": 2.3334333896636963, "learning_rate": 4.695061996616956e-05, "loss": 0.9792, "step": 196500 }, { "epoch": 3.0509474076258165, "grad_norm": 2.5820510387420654, "learning_rate": 4.6949068110926615e-05, "loss": 0.9917, "step": 196600 }, { "epoch": 3.0524992628687597, "grad_norm": 7.742116451263428, "learning_rate": 4.694751625568367e-05, "loss": 0.9915, "step": 196700 }, { "epoch": 3.0540511181117025, "grad_norm": 2.3722169399261475, "learning_rate": 4.694596440044073e-05, "loss": 0.9811, "step": 196800 }, { "epoch": 3.0556029733546453, "grad_norm": 2.6898815631866455, "learning_rate": 4.694441254519779e-05, "loss": 0.9887, "step": 196900 }, { "epoch": 3.0571548285975885, "grad_norm": 2.280363082885742, "learning_rate": 4.6942860689954846e-05, "loss": 0.9612, "step": 197000 }, { "epoch": 3.0587066838405312, "grad_norm": 2.384667158126831, "learning_rate": 4.69413088347119e-05, "loss": 0.9757, "step": 197100 }, { "epoch": 3.0602585390834744, "grad_norm": 2.4577221870422363, "learning_rate": 4.6939756979468955e-05, "loss": 0.9873, "step": 197200 }, { "epoch": 3.061810394326417, "grad_norm": 2.2722411155700684, "learning_rate": 4.693820512422601e-05, "loss": 0.9716, "step": 197300 }, { "epoch": 3.06336224956936, "grad_norm": 2.266305446624756, "learning_rate": 4.693665326898307e-05, "loss": 0.9879, "step": 197400 }, { "epoch": 3.064914104812303, "grad_norm": 2.3182260990142822, "learning_rate": 4.693510141374013e-05, "loss": 0.9788, "step": 197500 }, { "epoch": 3.066465960055246, "grad_norm": 2.787478446960449, "learning_rate": 4.6933549558497186e-05, "loss": 0.977, "step": 197600 }, { "epoch": 3.068017815298189, "grad_norm": 2.2585747241973877, "learning_rate": 4.693199770325424e-05, "loss": 0.9582, "step": 197700 }, { "epoch": 3.069569670541132, "grad_norm": 2.5008699893951416, "learning_rate": 4.69304458480113e-05, "loss": 0.9888, "step": 197800 }, { "epoch": 3.0711215257840747, "grad_norm": 1.8762156963348389, "learning_rate": 4.692889399276836e-05, "loss": 0.9745, "step": 197900 }, { "epoch": 3.072673381027018, "grad_norm": 2.6718082427978516, "learning_rate": 4.6927342137525417e-05, "loss": 0.9739, "step": 198000 }, { "epoch": 3.0742252362699607, "grad_norm": 2.9084866046905518, "learning_rate": 4.6925790282282474e-05, "loss": 0.9811, "step": 198100 }, { "epoch": 3.0757770915129035, "grad_norm": 2.586825132369995, "learning_rate": 4.692423842703953e-05, "loss": 0.9812, "step": 198200 }, { "epoch": 3.0773289467558467, "grad_norm": 2.147603988647461, "learning_rate": 4.692268657179659e-05, "loss": 0.9544, "step": 198300 }, { "epoch": 3.0788808019987894, "grad_norm": 2.179543972015381, "learning_rate": 4.692113471655364e-05, "loss": 0.9821, "step": 198400 }, { "epoch": 3.0804326572417327, "grad_norm": 2.9318699836730957, "learning_rate": 4.69195828613107e-05, "loss": 0.9839, "step": 198500 }, { "epoch": 3.0819845124846754, "grad_norm": 2.433058500289917, "learning_rate": 4.6918031006067756e-05, "loss": 0.968, "step": 198600 }, { "epoch": 3.083536367727618, "grad_norm": 2.8908143043518066, "learning_rate": 4.6916479150824814e-05, "loss": 0.9579, "step": 198700 }, { "epoch": 3.0850882229705614, "grad_norm": 2.4483296871185303, "learning_rate": 4.691492729558187e-05, "loss": 0.9898, "step": 198800 }, { "epoch": 3.086640078213504, "grad_norm": 2.367967128753662, "learning_rate": 4.691337544033892e-05, "loss": 0.9686, "step": 198900 }, { "epoch": 3.0881919334564474, "grad_norm": 2.814093828201294, "learning_rate": 4.691182358509598e-05, "loss": 0.9731, "step": 199000 }, { "epoch": 3.08974378869939, "grad_norm": 2.2390553951263428, "learning_rate": 4.691027172985304e-05, "loss": 0.9598, "step": 199100 }, { "epoch": 3.091295643942333, "grad_norm": 2.3798186779022217, "learning_rate": 4.6908719874610096e-05, "loss": 0.9678, "step": 199200 }, { "epoch": 3.092847499185276, "grad_norm": 2.569826126098633, "learning_rate": 4.6907168019367154e-05, "loss": 0.9747, "step": 199300 }, { "epoch": 3.094399354428219, "grad_norm": 2.639374017715454, "learning_rate": 4.690561616412421e-05, "loss": 0.9981, "step": 199400 }, { "epoch": 3.0959512096711617, "grad_norm": 2.136263847351074, "learning_rate": 4.690406430888127e-05, "loss": 0.974, "step": 199500 }, { "epoch": 3.097503064914105, "grad_norm": 2.464301824569702, "learning_rate": 4.690251245363833e-05, "loss": 0.972, "step": 199600 }, { "epoch": 3.0990549201570476, "grad_norm": 2.5139918327331543, "learning_rate": 4.6900960598395385e-05, "loss": 0.9689, "step": 199700 }, { "epoch": 3.100606775399991, "grad_norm": 2.318319797515869, "learning_rate": 4.689940874315244e-05, "loss": 0.9826, "step": 199800 }, { "epoch": 3.1021586306429336, "grad_norm": 2.6354269981384277, "learning_rate": 4.68978568879095e-05, "loss": 0.9826, "step": 199900 }, { "epoch": 3.1037104858858764, "grad_norm": 2.307537794113159, "learning_rate": 4.689630503266655e-05, "loss": 0.9639, "step": 200000 }, { "epoch": 3.1052623411288196, "grad_norm": 2.477670907974243, "learning_rate": 4.689475317742361e-05, "loss": 1.0061, "step": 200100 }, { "epoch": 3.1068141963717624, "grad_norm": 2.597285747528076, "learning_rate": 4.689320132218067e-05, "loss": 0.9763, "step": 200200 }, { "epoch": 3.1083660516147056, "grad_norm": 2.3619768619537354, "learning_rate": 4.6891649466937725e-05, "loss": 0.983, "step": 200300 }, { "epoch": 3.1099179068576484, "grad_norm": 2.1355032920837402, "learning_rate": 4.689009761169478e-05, "loss": 0.9614, "step": 200400 }, { "epoch": 3.111469762100591, "grad_norm": 2.4536688327789307, "learning_rate": 4.688854575645184e-05, "loss": 0.9818, "step": 200500 }, { "epoch": 3.1130216173435343, "grad_norm": 2.547424554824829, "learning_rate": 4.68869939012089e-05, "loss": 0.9754, "step": 200600 }, { "epoch": 3.114573472586477, "grad_norm": 2.3826773166656494, "learning_rate": 4.6885442045965956e-05, "loss": 0.9791, "step": 200700 }, { "epoch": 3.11612532782942, "grad_norm": 3.161182403564453, "learning_rate": 4.688389019072301e-05, "loss": 0.9778, "step": 200800 }, { "epoch": 3.117677183072363, "grad_norm": 2.1158483028411865, "learning_rate": 4.688233833548007e-05, "loss": 0.9603, "step": 200900 }, { "epoch": 3.119229038315306, "grad_norm": 2.8840832710266113, "learning_rate": 4.688078648023713e-05, "loss": 0.9774, "step": 201000 }, { "epoch": 3.120780893558249, "grad_norm": 2.3180959224700928, "learning_rate": 4.6879234624994187e-05, "loss": 0.9802, "step": 201100 }, { "epoch": 3.122332748801192, "grad_norm": 3.199521541595459, "learning_rate": 4.6877682769751244e-05, "loss": 0.959, "step": 201200 }, { "epoch": 3.1238846040441346, "grad_norm": 2.323233127593994, "learning_rate": 4.6876130914508295e-05, "loss": 0.9819, "step": 201300 }, { "epoch": 3.125436459287078, "grad_norm": 2.3602066040039062, "learning_rate": 4.687457905926535e-05, "loss": 0.9729, "step": 201400 }, { "epoch": 3.1269883145300206, "grad_norm": 2.163839101791382, "learning_rate": 4.687302720402241e-05, "loss": 0.9737, "step": 201500 }, { "epoch": 3.128540169772964, "grad_norm": 2.4454245567321777, "learning_rate": 4.687147534877947e-05, "loss": 0.9787, "step": 201600 }, { "epoch": 3.1300920250159066, "grad_norm": 2.1432580947875977, "learning_rate": 4.6869923493536526e-05, "loss": 0.935, "step": 201700 }, { "epoch": 3.1316438802588493, "grad_norm": 2.367708921432495, "learning_rate": 4.6868371638293584e-05, "loss": 0.9786, "step": 201800 }, { "epoch": 3.1331957355017925, "grad_norm": 2.330509662628174, "learning_rate": 4.686681978305064e-05, "loss": 0.9783, "step": 201900 }, { "epoch": 3.1347475907447353, "grad_norm": 2.389829158782959, "learning_rate": 4.68652679278077e-05, "loss": 0.9676, "step": 202000 }, { "epoch": 3.136299445987678, "grad_norm": 2.293586492538452, "learning_rate": 4.686371607256476e-05, "loss": 0.9918, "step": 202100 }, { "epoch": 3.1378513012306213, "grad_norm": 2.0330810546875, "learning_rate": 4.686216421732181e-05, "loss": 0.9788, "step": 202200 }, { "epoch": 3.139403156473564, "grad_norm": 2.126309871673584, "learning_rate": 4.6860612362078866e-05, "loss": 0.9789, "step": 202300 }, { "epoch": 3.1409550117165073, "grad_norm": 2.5358524322509766, "learning_rate": 4.6859060506835924e-05, "loss": 0.9595, "step": 202400 }, { "epoch": 3.14250686695945, "grad_norm": 2.7127878665924072, "learning_rate": 4.685750865159298e-05, "loss": 0.9768, "step": 202500 }, { "epoch": 3.144058722202393, "grad_norm": 2.6751229763031006, "learning_rate": 4.685595679635004e-05, "loss": 0.9762, "step": 202600 }, { "epoch": 3.145610577445336, "grad_norm": 2.370462417602539, "learning_rate": 4.68544049411071e-05, "loss": 0.9586, "step": 202700 }, { "epoch": 3.1471624326882788, "grad_norm": 2.0656661987304688, "learning_rate": 4.685285308586415e-05, "loss": 0.9572, "step": 202800 }, { "epoch": 3.148714287931222, "grad_norm": 2.2008516788482666, "learning_rate": 4.6851301230621206e-05, "loss": 0.9783, "step": 202900 }, { "epoch": 3.1502661431741648, "grad_norm": 2.500897169113159, "learning_rate": 4.6849749375378264e-05, "loss": 0.9953, "step": 203000 }, { "epoch": 3.1518179984171075, "grad_norm": 2.289325475692749, "learning_rate": 4.684819752013532e-05, "loss": 0.9578, "step": 203100 }, { "epoch": 3.1533698536600507, "grad_norm": 2.4198849201202393, "learning_rate": 4.684664566489238e-05, "loss": 0.9816, "step": 203200 }, { "epoch": 3.1549217089029935, "grad_norm": 2.328320026397705, "learning_rate": 4.684509380964944e-05, "loss": 0.9844, "step": 203300 }, { "epoch": 3.1564735641459363, "grad_norm": 2.9561514854431152, "learning_rate": 4.6843541954406495e-05, "loss": 0.9884, "step": 203400 }, { "epoch": 3.1580254193888795, "grad_norm": 2.278493881225586, "learning_rate": 4.684199009916355e-05, "loss": 0.9846, "step": 203500 }, { "epoch": 3.1595772746318223, "grad_norm": 2.4520184993743896, "learning_rate": 4.684043824392061e-05, "loss": 0.9805, "step": 203600 }, { "epoch": 3.1611291298747655, "grad_norm": 2.7855238914489746, "learning_rate": 4.683888638867767e-05, "loss": 0.9819, "step": 203700 }, { "epoch": 3.1626809851177082, "grad_norm": 2.5873663425445557, "learning_rate": 4.6837334533434726e-05, "loss": 0.9702, "step": 203800 }, { "epoch": 3.164232840360651, "grad_norm": 2.4637317657470703, "learning_rate": 4.683578267819178e-05, "loss": 0.9811, "step": 203900 }, { "epoch": 3.165784695603594, "grad_norm": 5.4417266845703125, "learning_rate": 4.683423082294884e-05, "loss": 0.9686, "step": 204000 }, { "epoch": 3.167336550846537, "grad_norm": 2.1931545734405518, "learning_rate": 4.683267896770589e-05, "loss": 0.9832, "step": 204100 }, { "epoch": 3.16888840608948, "grad_norm": 2.144624710083008, "learning_rate": 4.683112711246295e-05, "loss": 0.9638, "step": 204200 }, { "epoch": 3.170440261332423, "grad_norm": 2.588209867477417, "learning_rate": 4.682957525722001e-05, "loss": 0.9816, "step": 204300 }, { "epoch": 3.1719921165753657, "grad_norm": 2.5427372455596924, "learning_rate": 4.6828023401977065e-05, "loss": 0.9388, "step": 204400 }, { "epoch": 3.173543971818309, "grad_norm": 2.5207388401031494, "learning_rate": 4.682647154673412e-05, "loss": 0.9945, "step": 204500 }, { "epoch": 3.1750958270612517, "grad_norm": 2.2907872200012207, "learning_rate": 4.682491969149118e-05, "loss": 0.9648, "step": 204600 }, { "epoch": 3.1766476823041945, "grad_norm": 2.076737880706787, "learning_rate": 4.682336783624824e-05, "loss": 0.9978, "step": 204700 }, { "epoch": 3.1781995375471377, "grad_norm": 3.2021450996398926, "learning_rate": 4.6821815981005296e-05, "loss": 0.9768, "step": 204800 }, { "epoch": 3.1797513927900805, "grad_norm": 2.2459185123443604, "learning_rate": 4.6820264125762354e-05, "loss": 0.9454, "step": 204900 }, { "epoch": 3.1813032480330237, "grad_norm": 2.3355512619018555, "learning_rate": 4.681871227051941e-05, "loss": 0.9579, "step": 205000 }, { "epoch": 3.1828551032759664, "grad_norm": 2.64522385597229, "learning_rate": 4.681716041527647e-05, "loss": 0.9614, "step": 205100 }, { "epoch": 3.184406958518909, "grad_norm": 2.252082109451294, "learning_rate": 4.681560856003353e-05, "loss": 0.9781, "step": 205200 }, { "epoch": 3.1859588137618524, "grad_norm": 2.238100051879883, "learning_rate": 4.6814056704790585e-05, "loss": 0.9874, "step": 205300 }, { "epoch": 3.187510669004795, "grad_norm": 2.1641948223114014, "learning_rate": 4.6812504849547636e-05, "loss": 0.9497, "step": 205400 }, { "epoch": 3.1890625242477384, "grad_norm": 2.4257216453552246, "learning_rate": 4.6810952994304694e-05, "loss": 0.9938, "step": 205500 }, { "epoch": 3.190614379490681, "grad_norm": 2.427554130554199, "learning_rate": 4.680940113906175e-05, "loss": 0.9591, "step": 205600 }, { "epoch": 3.192166234733624, "grad_norm": 2.1279029846191406, "learning_rate": 4.68078492838188e-05, "loss": 0.99, "step": 205700 }, { "epoch": 3.193718089976567, "grad_norm": 2.4062323570251465, "learning_rate": 4.680629742857586e-05, "loss": 0.9561, "step": 205800 }, { "epoch": 3.19526994521951, "grad_norm": 2.802037000656128, "learning_rate": 4.680474557333292e-05, "loss": 0.9609, "step": 205900 }, { "epoch": 3.1968218004624527, "grad_norm": 1.8816951513290405, "learning_rate": 4.6803193718089976e-05, "loss": 0.9844, "step": 206000 }, { "epoch": 3.198373655705396, "grad_norm": 1.9971848726272583, "learning_rate": 4.6801641862847034e-05, "loss": 0.9866, "step": 206100 }, { "epoch": 3.1999255109483387, "grad_norm": 2.268066883087158, "learning_rate": 4.680009000760409e-05, "loss": 0.9582, "step": 206200 }, { "epoch": 3.201477366191282, "grad_norm": 2.123152256011963, "learning_rate": 4.679853815236115e-05, "loss": 0.9858, "step": 206300 }, { "epoch": 3.2030292214342246, "grad_norm": 2.2871429920196533, "learning_rate": 4.679698629711821e-05, "loss": 0.9997, "step": 206400 }, { "epoch": 3.2045810766771674, "grad_norm": 1.983797550201416, "learning_rate": 4.6795434441875265e-05, "loss": 0.9755, "step": 206500 }, { "epoch": 3.2061329319201106, "grad_norm": 2.5424652099609375, "learning_rate": 4.679388258663232e-05, "loss": 0.9713, "step": 206600 }, { "epoch": 3.2076847871630534, "grad_norm": 2.4657442569732666, "learning_rate": 4.679233073138938e-05, "loss": 0.981, "step": 206700 }, { "epoch": 3.2092366424059966, "grad_norm": 2.1475634574890137, "learning_rate": 4.679077887614644e-05, "loss": 0.9756, "step": 206800 }, { "epoch": 3.2107884976489394, "grad_norm": 2.080209255218506, "learning_rate": 4.6789227020903496e-05, "loss": 0.9886, "step": 206900 }, { "epoch": 3.212340352891882, "grad_norm": 2.2685277462005615, "learning_rate": 4.6787675165660547e-05, "loss": 0.9587, "step": 207000 }, { "epoch": 3.2138922081348253, "grad_norm": 2.4574780464172363, "learning_rate": 4.6786123310417604e-05, "loss": 0.9475, "step": 207100 }, { "epoch": 3.215444063377768, "grad_norm": 2.5025315284729004, "learning_rate": 4.678457145517466e-05, "loss": 1.0016, "step": 207200 }, { "epoch": 3.216995918620711, "grad_norm": 2.094045877456665, "learning_rate": 4.678301959993172e-05, "loss": 0.9874, "step": 207300 }, { "epoch": 3.218547773863654, "grad_norm": 2.158668279647827, "learning_rate": 4.678146774468878e-05, "loss": 0.9753, "step": 207400 }, { "epoch": 3.220099629106597, "grad_norm": 2.3556935787200928, "learning_rate": 4.6779915889445835e-05, "loss": 0.9713, "step": 207500 }, { "epoch": 3.22165148434954, "grad_norm": 2.183370351791382, "learning_rate": 4.677836403420289e-05, "loss": 0.98, "step": 207600 }, { "epoch": 3.223203339592483, "grad_norm": 1.9801820516586304, "learning_rate": 4.677681217895995e-05, "loss": 0.9593, "step": 207700 }, { "epoch": 3.2247551948354256, "grad_norm": 2.2726573944091797, "learning_rate": 4.677526032371701e-05, "loss": 0.9695, "step": 207800 }, { "epoch": 3.226307050078369, "grad_norm": 2.657592535018921, "learning_rate": 4.6773708468474066e-05, "loss": 0.9713, "step": 207900 }, { "epoch": 3.2278589053213116, "grad_norm": 2.405496120452881, "learning_rate": 4.6772156613231124e-05, "loss": 0.96, "step": 208000 }, { "epoch": 3.229410760564255, "grad_norm": 2.2222909927368164, "learning_rate": 4.677060475798818e-05, "loss": 0.9667, "step": 208100 }, { "epoch": 3.2309626158071976, "grad_norm": 3.3383922576904297, "learning_rate": 4.676905290274524e-05, "loss": 0.9479, "step": 208200 }, { "epoch": 3.2325144710501403, "grad_norm": 2.001981019973755, "learning_rate": 4.676750104750229e-05, "loss": 0.9451, "step": 208300 }, { "epoch": 3.2340663262930835, "grad_norm": 2.6972899436950684, "learning_rate": 4.676594919225935e-05, "loss": 0.972, "step": 208400 }, { "epoch": 3.2356181815360263, "grad_norm": 2.0227434635162354, "learning_rate": 4.6764397337016406e-05, "loss": 0.952, "step": 208500 }, { "epoch": 3.237170036778969, "grad_norm": 2.1841228008270264, "learning_rate": 4.6762845481773464e-05, "loss": 0.9654, "step": 208600 }, { "epoch": 3.2387218920219123, "grad_norm": 2.3678929805755615, "learning_rate": 4.6761293626530515e-05, "loss": 0.9703, "step": 208700 }, { "epoch": 3.240273747264855, "grad_norm": 2.7907235622406006, "learning_rate": 4.675974177128757e-05, "loss": 0.9658, "step": 208800 }, { "epoch": 3.2418256025077983, "grad_norm": 3.3275513648986816, "learning_rate": 4.675818991604463e-05, "loss": 0.9632, "step": 208900 }, { "epoch": 3.243377457750741, "grad_norm": 2.9234211444854736, "learning_rate": 4.675663806080169e-05, "loss": 0.9847, "step": 209000 }, { "epoch": 3.244929312993684, "grad_norm": 2.57970929145813, "learning_rate": 4.6755086205558746e-05, "loss": 0.9912, "step": 209100 }, { "epoch": 3.246481168236627, "grad_norm": 2.748455762863159, "learning_rate": 4.6753534350315804e-05, "loss": 0.9853, "step": 209200 }, { "epoch": 3.24803302347957, "grad_norm": 2.228722333908081, "learning_rate": 4.675198249507286e-05, "loss": 0.9972, "step": 209300 }, { "epoch": 3.2495848787225126, "grad_norm": 2.6612584590911865, "learning_rate": 4.675043063982992e-05, "loss": 0.9741, "step": 209400 }, { "epoch": 3.2511367339654558, "grad_norm": 2.397484302520752, "learning_rate": 4.674887878458698e-05, "loss": 0.9681, "step": 209500 }, { "epoch": 3.2526885892083985, "grad_norm": 2.737473249435425, "learning_rate": 4.6747326929344035e-05, "loss": 0.9795, "step": 209600 }, { "epoch": 3.2542404444513418, "grad_norm": 2.6061840057373047, "learning_rate": 4.674577507410109e-05, "loss": 0.9466, "step": 209700 }, { "epoch": 3.2557922996942845, "grad_norm": 2.698423147201538, "learning_rate": 4.674422321885814e-05, "loss": 0.9699, "step": 209800 }, { "epoch": 3.2573441549372273, "grad_norm": 2.0765626430511475, "learning_rate": 4.67426713636152e-05, "loss": 0.9651, "step": 209900 }, { "epoch": 3.2588960101801705, "grad_norm": 2.275743246078491, "learning_rate": 4.674111950837226e-05, "loss": 0.9797, "step": 210000 }, { "epoch": 3.2604478654231133, "grad_norm": 2.1565654277801514, "learning_rate": 4.6739567653129317e-05, "loss": 0.9651, "step": 210100 }, { "epoch": 3.261999720666056, "grad_norm": 2.1380770206451416, "learning_rate": 4.6738015797886374e-05, "loss": 0.9739, "step": 210200 }, { "epoch": 3.2635515759089992, "grad_norm": 2.375796318054199, "learning_rate": 4.673646394264343e-05, "loss": 0.9488, "step": 210300 }, { "epoch": 3.265103431151942, "grad_norm": 2.5017898082733154, "learning_rate": 4.673491208740049e-05, "loss": 0.9655, "step": 210400 }, { "epoch": 3.2666552863948852, "grad_norm": 2.1213276386260986, "learning_rate": 4.673336023215755e-05, "loss": 0.9595, "step": 210500 }, { "epoch": 3.268207141637828, "grad_norm": 2.5592708587646484, "learning_rate": 4.6731808376914605e-05, "loss": 0.9671, "step": 210600 }, { "epoch": 3.269758996880771, "grad_norm": 2.6186158657073975, "learning_rate": 4.673025652167166e-05, "loss": 0.9859, "step": 210700 }, { "epoch": 3.271310852123714, "grad_norm": 2.6300909519195557, "learning_rate": 4.672870466642872e-05, "loss": 0.9615, "step": 210800 }, { "epoch": 3.2728627073666567, "grad_norm": 2.0544776916503906, "learning_rate": 4.672715281118578e-05, "loss": 0.9611, "step": 210900 }, { "epoch": 3.2744145626096, "grad_norm": 2.5136194229125977, "learning_rate": 4.6725600955942836e-05, "loss": 0.9764, "step": 211000 }, { "epoch": 3.2759664178525427, "grad_norm": 2.970989465713501, "learning_rate": 4.672404910069989e-05, "loss": 0.9983, "step": 211100 }, { "epoch": 3.2775182730954855, "grad_norm": 2.2655653953552246, "learning_rate": 4.6722497245456945e-05, "loss": 0.9515, "step": 211200 }, { "epoch": 3.2790701283384287, "grad_norm": 2.553241491317749, "learning_rate": 4.6720945390214e-05, "loss": 0.9566, "step": 211300 }, { "epoch": 3.2806219835813715, "grad_norm": 2.786196231842041, "learning_rate": 4.671939353497106e-05, "loss": 0.9899, "step": 211400 }, { "epoch": 3.2821738388243142, "grad_norm": 2.1239490509033203, "learning_rate": 4.671784167972812e-05, "loss": 0.9812, "step": 211500 }, { "epoch": 3.2837256940672575, "grad_norm": 2.53266978263855, "learning_rate": 4.6716289824485176e-05, "loss": 0.9841, "step": 211600 }, { "epoch": 3.2852775493102, "grad_norm": 2.175037145614624, "learning_rate": 4.6714737969242234e-05, "loss": 0.9799, "step": 211700 }, { "epoch": 3.2868294045531434, "grad_norm": 2.3546383380889893, "learning_rate": 4.671318611399929e-05, "loss": 0.9734, "step": 211800 }, { "epoch": 3.288381259796086, "grad_norm": 2.690140962600708, "learning_rate": 4.671163425875634e-05, "loss": 0.9685, "step": 211900 }, { "epoch": 3.289933115039029, "grad_norm": 2.356336832046509, "learning_rate": 4.67100824035134e-05, "loss": 0.9646, "step": 212000 }, { "epoch": 3.291484970281972, "grad_norm": 2.603374719619751, "learning_rate": 4.670853054827046e-05, "loss": 0.9633, "step": 212100 }, { "epoch": 3.293036825524915, "grad_norm": 2.540287971496582, "learning_rate": 4.6706978693027516e-05, "loss": 0.9746, "step": 212200 }, { "epoch": 3.294588680767858, "grad_norm": 2.011167049407959, "learning_rate": 4.6705426837784574e-05, "loss": 0.969, "step": 212300 }, { "epoch": 3.296140536010801, "grad_norm": 3.032888412475586, "learning_rate": 4.670387498254163e-05, "loss": 0.9619, "step": 212400 }, { "epoch": 3.2976923912537437, "grad_norm": 2.4740560054779053, "learning_rate": 4.670232312729869e-05, "loss": 0.9583, "step": 212500 }, { "epoch": 3.299244246496687, "grad_norm": 2.2122983932495117, "learning_rate": 4.670077127205574e-05, "loss": 0.9682, "step": 212600 }, { "epoch": 3.3007961017396297, "grad_norm": 2.1631224155426025, "learning_rate": 4.66992194168128e-05, "loss": 0.9769, "step": 212700 }, { "epoch": 3.3023479569825724, "grad_norm": 2.327928066253662, "learning_rate": 4.6697667561569855e-05, "loss": 0.9701, "step": 212800 }, { "epoch": 3.3038998122255157, "grad_norm": 2.699310541152954, "learning_rate": 4.669611570632691e-05, "loss": 0.9747, "step": 212900 }, { "epoch": 3.3054516674684584, "grad_norm": 2.733142614364624, "learning_rate": 4.669456385108397e-05, "loss": 0.9708, "step": 213000 }, { "epoch": 3.3070035227114016, "grad_norm": 2.0260112285614014, "learning_rate": 4.669301199584103e-05, "loss": 0.9829, "step": 213100 }, { "epoch": 3.3085553779543444, "grad_norm": 2.35030460357666, "learning_rate": 4.6691460140598087e-05, "loss": 0.9706, "step": 213200 }, { "epoch": 3.310107233197287, "grad_norm": 2.3920345306396484, "learning_rate": 4.6689908285355144e-05, "loss": 0.9668, "step": 213300 }, { "epoch": 3.3116590884402304, "grad_norm": 2.298232316970825, "learning_rate": 4.66883564301122e-05, "loss": 0.9952, "step": 213400 }, { "epoch": 3.313210943683173, "grad_norm": 2.754373073577881, "learning_rate": 4.668680457486926e-05, "loss": 0.9714, "step": 213500 }, { "epoch": 3.3147627989261164, "grad_norm": 2.724120616912842, "learning_rate": 4.668525271962632e-05, "loss": 0.9574, "step": 213600 }, { "epoch": 3.316314654169059, "grad_norm": 2.239607334136963, "learning_rate": 4.6683700864383375e-05, "loss": 0.9559, "step": 213700 }, { "epoch": 3.317866509412002, "grad_norm": 1.9346903562545776, "learning_rate": 4.668214900914043e-05, "loss": 0.9819, "step": 213800 }, { "epoch": 3.319418364654945, "grad_norm": 2.424647331237793, "learning_rate": 4.6680597153897484e-05, "loss": 0.9814, "step": 213900 }, { "epoch": 3.320970219897888, "grad_norm": 2.4415457248687744, "learning_rate": 4.667904529865454e-05, "loss": 0.9738, "step": 214000 }, { "epoch": 3.3225220751408306, "grad_norm": 2.2015230655670166, "learning_rate": 4.66774934434116e-05, "loss": 0.972, "step": 214100 }, { "epoch": 3.324073930383774, "grad_norm": 2.2242319583892822, "learning_rate": 4.667594158816866e-05, "loss": 0.9838, "step": 214200 }, { "epoch": 3.3256257856267166, "grad_norm": 2.2057042121887207, "learning_rate": 4.6674389732925715e-05, "loss": 0.9619, "step": 214300 }, { "epoch": 3.32717764086966, "grad_norm": 2.5950565338134766, "learning_rate": 4.667283787768277e-05, "loss": 0.977, "step": 214400 }, { "epoch": 3.3287294961126026, "grad_norm": 2.0156033039093018, "learning_rate": 4.667128602243983e-05, "loss": 0.9699, "step": 214500 }, { "epoch": 3.3302813513555454, "grad_norm": 2.2372074127197266, "learning_rate": 4.666973416719689e-05, "loss": 0.973, "step": 214600 }, { "epoch": 3.3318332065984886, "grad_norm": 2.6760330200195312, "learning_rate": 4.6668182311953946e-05, "loss": 0.9765, "step": 214700 }, { "epoch": 3.3333850618414314, "grad_norm": 2.0694596767425537, "learning_rate": 4.6666630456711004e-05, "loss": 0.9886, "step": 214800 }, { "epoch": 3.3349369170843746, "grad_norm": 3.0026535987854004, "learning_rate": 4.666507860146806e-05, "loss": 0.9726, "step": 214900 }, { "epoch": 3.3364887723273173, "grad_norm": 2.034625768661499, "learning_rate": 4.666352674622512e-05, "loss": 0.9659, "step": 215000 }, { "epoch": 3.33804062757026, "grad_norm": 3.0120036602020264, "learning_rate": 4.666197489098218e-05, "loss": 0.9684, "step": 215100 }, { "epoch": 3.3395924828132033, "grad_norm": 2.3764445781707764, "learning_rate": 4.666042303573923e-05, "loss": 0.9766, "step": 215200 }, { "epoch": 3.341144338056146, "grad_norm": 4.072542190551758, "learning_rate": 4.6658871180496286e-05, "loss": 0.9632, "step": 215300 }, { "epoch": 3.342696193299089, "grad_norm": 3.2545244693756104, "learning_rate": 4.6657319325253344e-05, "loss": 0.9699, "step": 215400 }, { "epoch": 3.344248048542032, "grad_norm": 2.3289096355438232, "learning_rate": 4.6655767470010394e-05, "loss": 0.9744, "step": 215500 }, { "epoch": 3.345799903784975, "grad_norm": 2.2775535583496094, "learning_rate": 4.665421561476745e-05, "loss": 0.9767, "step": 215600 }, { "epoch": 3.347351759027918, "grad_norm": 2.096693515777588, "learning_rate": 4.665266375952451e-05, "loss": 0.9534, "step": 215700 }, { "epoch": 3.348903614270861, "grad_norm": 2.1525797843933105, "learning_rate": 4.665111190428157e-05, "loss": 0.9706, "step": 215800 }, { "epoch": 3.3504554695138036, "grad_norm": 2.6015748977661133, "learning_rate": 4.6649560049038625e-05, "loss": 0.9869, "step": 215900 }, { "epoch": 3.352007324756747, "grad_norm": 1.8633016347885132, "learning_rate": 4.664800819379568e-05, "loss": 0.9628, "step": 216000 }, { "epoch": 3.3535591799996896, "grad_norm": 2.405515193939209, "learning_rate": 4.664645633855274e-05, "loss": 0.9722, "step": 216100 }, { "epoch": 3.3551110352426328, "grad_norm": 2.6398231983184814, "learning_rate": 4.66449044833098e-05, "loss": 0.968, "step": 216200 }, { "epoch": 3.3566628904855755, "grad_norm": 2.6593966484069824, "learning_rate": 4.6643352628066857e-05, "loss": 0.9665, "step": 216300 }, { "epoch": 3.3582147457285183, "grad_norm": 3.0856757164001465, "learning_rate": 4.6641800772823914e-05, "loss": 0.9578, "step": 216400 }, { "epoch": 3.3597666009714615, "grad_norm": 2.33518648147583, "learning_rate": 4.664024891758097e-05, "loss": 0.949, "step": 216500 }, { "epoch": 3.3613184562144043, "grad_norm": 2.27593994140625, "learning_rate": 4.663869706233803e-05, "loss": 0.9728, "step": 216600 }, { "epoch": 3.362870311457347, "grad_norm": 2.15895676612854, "learning_rate": 4.663714520709509e-05, "loss": 0.9517, "step": 216700 }, { "epoch": 3.3644221667002903, "grad_norm": 2.6732683181762695, "learning_rate": 4.663559335185214e-05, "loss": 0.9779, "step": 216800 }, { "epoch": 3.365974021943233, "grad_norm": 2.9457383155822754, "learning_rate": 4.6634041496609196e-05, "loss": 0.9719, "step": 216900 }, { "epoch": 3.3675258771861762, "grad_norm": 2.3644604682922363, "learning_rate": 4.6632489641366254e-05, "loss": 0.9804, "step": 217000 }, { "epoch": 3.369077732429119, "grad_norm": 2.0383403301239014, "learning_rate": 4.663093778612331e-05, "loss": 0.9736, "step": 217100 }, { "epoch": 3.3706295876720618, "grad_norm": 2.14007306098938, "learning_rate": 4.662938593088037e-05, "loss": 0.9806, "step": 217200 }, { "epoch": 3.372181442915005, "grad_norm": 2.3244049549102783, "learning_rate": 4.662783407563743e-05, "loss": 0.982, "step": 217300 }, { "epoch": 3.3737332981579478, "grad_norm": 2.2063379287719727, "learning_rate": 4.6626282220394485e-05, "loss": 0.9694, "step": 217400 }, { "epoch": 3.375285153400891, "grad_norm": 2.8258535861968994, "learning_rate": 4.662473036515154e-05, "loss": 0.9681, "step": 217500 }, { "epoch": 3.3768370086438337, "grad_norm": 2.417346239089966, "learning_rate": 4.66231785099086e-05, "loss": 0.9547, "step": 217600 }, { "epoch": 3.3783888638867765, "grad_norm": 2.416997194290161, "learning_rate": 4.662162665466566e-05, "loss": 0.9687, "step": 217700 }, { "epoch": 3.3799407191297197, "grad_norm": 2.4915099143981934, "learning_rate": 4.6620074799422716e-05, "loss": 0.9591, "step": 217800 }, { "epoch": 3.3814925743726625, "grad_norm": 2.617267370223999, "learning_rate": 4.6618522944179774e-05, "loss": 0.9921, "step": 217900 }, { "epoch": 3.3830444296156053, "grad_norm": 2.2194223403930664, "learning_rate": 4.661697108893683e-05, "loss": 0.9694, "step": 218000 }, { "epoch": 3.3845962848585485, "grad_norm": 2.2361536026000977, "learning_rate": 4.661541923369388e-05, "loss": 0.9818, "step": 218100 }, { "epoch": 3.3861481401014912, "grad_norm": 3.0397660732269287, "learning_rate": 4.661386737845094e-05, "loss": 0.9546, "step": 218200 }, { "epoch": 3.3876999953444344, "grad_norm": 2.7319767475128174, "learning_rate": 4.6612315523208e-05, "loss": 0.9573, "step": 218300 }, { "epoch": 3.389251850587377, "grad_norm": 2.651740550994873, "learning_rate": 4.6610763667965056e-05, "loss": 0.9521, "step": 218400 }, { "epoch": 3.39080370583032, "grad_norm": 2.3152172565460205, "learning_rate": 4.660921181272211e-05, "loss": 0.9695, "step": 218500 }, { "epoch": 3.392355561073263, "grad_norm": 2.2541134357452393, "learning_rate": 4.6607659957479164e-05, "loss": 0.9671, "step": 218600 }, { "epoch": 3.393907416316206, "grad_norm": 2.6370625495910645, "learning_rate": 4.660610810223622e-05, "loss": 0.9761, "step": 218700 }, { "epoch": 3.395459271559149, "grad_norm": 2.493525743484497, "learning_rate": 4.660455624699328e-05, "loss": 0.9748, "step": 218800 }, { "epoch": 3.397011126802092, "grad_norm": 3.3515610694885254, "learning_rate": 4.660300439175034e-05, "loss": 0.9699, "step": 218900 }, { "epoch": 3.3985629820450347, "grad_norm": 2.3086609840393066, "learning_rate": 4.6601452536507395e-05, "loss": 0.9702, "step": 219000 }, { "epoch": 3.400114837287978, "grad_norm": 2.349297523498535, "learning_rate": 4.659990068126445e-05, "loss": 0.967, "step": 219100 }, { "epoch": 3.4016666925309207, "grad_norm": 2.2962982654571533, "learning_rate": 4.659834882602151e-05, "loss": 0.975, "step": 219200 }, { "epoch": 3.4032185477738635, "grad_norm": 2.2249338626861572, "learning_rate": 4.659679697077857e-05, "loss": 0.9677, "step": 219300 }, { "epoch": 3.4047704030168067, "grad_norm": 2.438533067703247, "learning_rate": 4.6595245115535627e-05, "loss": 0.9655, "step": 219400 }, { "epoch": 3.4063222582597494, "grad_norm": 2.70404314994812, "learning_rate": 4.6593693260292684e-05, "loss": 0.9886, "step": 219500 }, { "epoch": 3.4078741135026926, "grad_norm": 2.750397205352783, "learning_rate": 4.6592141405049735e-05, "loss": 0.9859, "step": 219600 }, { "epoch": 3.4094259687456354, "grad_norm": 2.108666181564331, "learning_rate": 4.659058954980679e-05, "loss": 0.9656, "step": 219700 }, { "epoch": 3.410977823988578, "grad_norm": 2.6501379013061523, "learning_rate": 4.658903769456385e-05, "loss": 0.9794, "step": 219800 }, { "epoch": 3.4125296792315214, "grad_norm": 2.363713026046753, "learning_rate": 4.658748583932091e-05, "loss": 0.9629, "step": 219900 }, { "epoch": 3.414081534474464, "grad_norm": 2.508902072906494, "learning_rate": 4.6585933984077966e-05, "loss": 0.9663, "step": 220000 }, { "epoch": 3.4156333897174074, "grad_norm": 2.0923726558685303, "learning_rate": 4.6584382128835024e-05, "loss": 0.9556, "step": 220100 }, { "epoch": 3.41718524496035, "grad_norm": 2.2371480464935303, "learning_rate": 4.658283027359208e-05, "loss": 0.958, "step": 220200 }, { "epoch": 3.418737100203293, "grad_norm": 2.6551835536956787, "learning_rate": 4.658127841834914e-05, "loss": 0.976, "step": 220300 }, { "epoch": 3.420288955446236, "grad_norm": 2.598580837249756, "learning_rate": 4.65797265631062e-05, "loss": 0.9816, "step": 220400 }, { "epoch": 3.421840810689179, "grad_norm": 3.204810857772827, "learning_rate": 4.6578174707863255e-05, "loss": 0.9593, "step": 220500 }, { "epoch": 3.4233926659321217, "grad_norm": 2.2061984539031982, "learning_rate": 4.657662285262031e-05, "loss": 0.9791, "step": 220600 }, { "epoch": 3.424944521175065, "grad_norm": 1.9041773080825806, "learning_rate": 4.657507099737737e-05, "loss": 0.9507, "step": 220700 }, { "epoch": 3.4264963764180076, "grad_norm": 2.1848974227905273, "learning_rate": 4.657351914213443e-05, "loss": 0.9676, "step": 220800 }, { "epoch": 3.428048231660951, "grad_norm": 2.4480197429656982, "learning_rate": 4.657196728689148e-05, "loss": 0.9791, "step": 220900 }, { "epoch": 3.4296000869038936, "grad_norm": 2.092987060546875, "learning_rate": 4.657041543164854e-05, "loss": 0.9732, "step": 221000 }, { "epoch": 3.4311519421468364, "grad_norm": 2.710448741912842, "learning_rate": 4.6568863576405595e-05, "loss": 0.9637, "step": 221100 }, { "epoch": 3.4327037973897796, "grad_norm": 2.4221410751342773, "learning_rate": 4.656731172116265e-05, "loss": 0.9599, "step": 221200 }, { "epoch": 3.4342556526327224, "grad_norm": 2.3093464374542236, "learning_rate": 4.656575986591971e-05, "loss": 0.9706, "step": 221300 }, { "epoch": 3.4358075078756656, "grad_norm": 2.265651226043701, "learning_rate": 4.656420801067677e-05, "loss": 0.9661, "step": 221400 }, { "epoch": 3.4373593631186083, "grad_norm": 2.738287925720215, "learning_rate": 4.6562656155433826e-05, "loss": 0.9665, "step": 221500 }, { "epoch": 3.438911218361551, "grad_norm": 2.818229913711548, "learning_rate": 4.6561104300190884e-05, "loss": 0.9694, "step": 221600 }, { "epoch": 3.4404630736044943, "grad_norm": 2.6674697399139404, "learning_rate": 4.6559552444947934e-05, "loss": 0.9578, "step": 221700 }, { "epoch": 3.442014928847437, "grad_norm": 2.324506998062134, "learning_rate": 4.655800058970499e-05, "loss": 0.9498, "step": 221800 }, { "epoch": 3.44356678409038, "grad_norm": 2.2886624336242676, "learning_rate": 4.655644873446205e-05, "loss": 0.9829, "step": 221900 }, { "epoch": 3.445118639333323, "grad_norm": 2.1548595428466797, "learning_rate": 4.655489687921911e-05, "loss": 0.9485, "step": 222000 }, { "epoch": 3.446670494576266, "grad_norm": 2.428356647491455, "learning_rate": 4.6553345023976165e-05, "loss": 0.976, "step": 222100 }, { "epoch": 3.448222349819209, "grad_norm": 2.258183479309082, "learning_rate": 4.655179316873322e-05, "loss": 0.9751, "step": 222200 }, { "epoch": 3.449774205062152, "grad_norm": 2.703193187713623, "learning_rate": 4.655024131349028e-05, "loss": 0.9702, "step": 222300 }, { "epoch": 3.4513260603050946, "grad_norm": 2.3499321937561035, "learning_rate": 4.654868945824734e-05, "loss": 0.9599, "step": 222400 }, { "epoch": 3.452877915548038, "grad_norm": 2.5658650398254395, "learning_rate": 4.654713760300439e-05, "loss": 0.9733, "step": 222500 }, { "epoch": 3.4544297707909806, "grad_norm": 2.5249581336975098, "learning_rate": 4.654558574776145e-05, "loss": 0.9643, "step": 222600 }, { "epoch": 3.455981626033924, "grad_norm": 2.6649160385131836, "learning_rate": 4.6544033892518505e-05, "loss": 0.9764, "step": 222700 }, { "epoch": 3.4575334812768665, "grad_norm": 2.9648067951202393, "learning_rate": 4.654248203727556e-05, "loss": 0.96, "step": 222800 }, { "epoch": 3.4590853365198093, "grad_norm": 2.427208423614502, "learning_rate": 4.654093018203262e-05, "loss": 0.9651, "step": 222900 }, { "epoch": 3.4606371917627525, "grad_norm": 2.3616175651550293, "learning_rate": 4.653937832678968e-05, "loss": 0.9565, "step": 223000 }, { "epoch": 3.4621890470056953, "grad_norm": 2.5878775119781494, "learning_rate": 4.6537826471546736e-05, "loss": 0.9727, "step": 223100 }, { "epoch": 3.463740902248638, "grad_norm": 2.2611260414123535, "learning_rate": 4.6536274616303794e-05, "loss": 0.9652, "step": 223200 }, { "epoch": 3.4652927574915813, "grad_norm": 2.056553840637207, "learning_rate": 4.653472276106085e-05, "loss": 0.973, "step": 223300 }, { "epoch": 3.466844612734524, "grad_norm": 2.41298508644104, "learning_rate": 4.653317090581791e-05, "loss": 0.9697, "step": 223400 }, { "epoch": 3.468396467977467, "grad_norm": 2.454174280166626, "learning_rate": 4.653161905057497e-05, "loss": 0.9591, "step": 223500 }, { "epoch": 3.46994832322041, "grad_norm": 2.2234067916870117, "learning_rate": 4.6530067195332025e-05, "loss": 0.9514, "step": 223600 }, { "epoch": 3.471500178463353, "grad_norm": 2.5686097145080566, "learning_rate": 4.652851534008908e-05, "loss": 0.9786, "step": 223700 }, { "epoch": 3.473052033706296, "grad_norm": 2.535027503967285, "learning_rate": 4.6526963484846134e-05, "loss": 0.9711, "step": 223800 }, { "epoch": 3.4746038889492388, "grad_norm": 2.075920820236206, "learning_rate": 4.652541162960319e-05, "loss": 0.9718, "step": 223900 }, { "epoch": 3.476155744192182, "grad_norm": 2.7042036056518555, "learning_rate": 4.652385977436025e-05, "loss": 0.9474, "step": 224000 }, { "epoch": 3.4777075994351248, "grad_norm": 2.5208301544189453, "learning_rate": 4.652230791911731e-05, "loss": 0.9665, "step": 224100 }, { "epoch": 3.4792594546780675, "grad_norm": 2.4897689819335938, "learning_rate": 4.6520756063874365e-05, "loss": 0.9571, "step": 224200 }, { "epoch": 3.4808113099210107, "grad_norm": 2.0934641361236572, "learning_rate": 4.651920420863142e-05, "loss": 0.9679, "step": 224300 }, { "epoch": 3.4823631651639535, "grad_norm": 2.1980855464935303, "learning_rate": 4.651765235338848e-05, "loss": 0.9665, "step": 224400 }, { "epoch": 3.4839150204068963, "grad_norm": 2.182861089706421, "learning_rate": 4.651610049814554e-05, "loss": 0.9636, "step": 224500 }, { "epoch": 3.4854668756498395, "grad_norm": 2.4445130825042725, "learning_rate": 4.6514548642902596e-05, "loss": 0.979, "step": 224600 }, { "epoch": 3.4870187308927822, "grad_norm": 2.4823100566864014, "learning_rate": 4.6512996787659654e-05, "loss": 0.9739, "step": 224700 }, { "epoch": 3.488570586135725, "grad_norm": 2.072073221206665, "learning_rate": 4.651144493241671e-05, "loss": 0.9723, "step": 224800 }, { "epoch": 3.4901224413786682, "grad_norm": 2.087369918823242, "learning_rate": 4.650989307717377e-05, "loss": 0.9726, "step": 224900 }, { "epoch": 3.491674296621611, "grad_norm": 2.294132709503174, "learning_rate": 4.650834122193082e-05, "loss": 0.9643, "step": 225000 }, { "epoch": 3.493226151864554, "grad_norm": 2.431861162185669, "learning_rate": 4.650678936668788e-05, "loss": 0.9657, "step": 225100 }, { "epoch": 3.494778007107497, "grad_norm": 2.3083608150482178, "learning_rate": 4.6505237511444935e-05, "loss": 0.9745, "step": 225200 }, { "epoch": 3.49632986235044, "grad_norm": 2.4641411304473877, "learning_rate": 4.6503685656201986e-05, "loss": 0.975, "step": 225300 }, { "epoch": 3.497881717593383, "grad_norm": 2.1575961112976074, "learning_rate": 4.6502133800959044e-05, "loss": 0.9777, "step": 225400 }, { "epoch": 3.4994335728363257, "grad_norm": 2.3430116176605225, "learning_rate": 4.65005819457161e-05, "loss": 0.9825, "step": 225500 }, { "epoch": 3.500985428079269, "grad_norm": 1.8246943950653076, "learning_rate": 4.649903009047316e-05, "loss": 0.965, "step": 225600 }, { "epoch": 3.5025372833222117, "grad_norm": 2.2595934867858887, "learning_rate": 4.649747823523022e-05, "loss": 0.9492, "step": 225700 }, { "epoch": 3.5040891385651545, "grad_norm": 2.469350576400757, "learning_rate": 4.6495926379987275e-05, "loss": 0.9543, "step": 225800 }, { "epoch": 3.5056409938080977, "grad_norm": 2.34452748298645, "learning_rate": 4.649437452474433e-05, "loss": 0.9797, "step": 225900 }, { "epoch": 3.5071928490510405, "grad_norm": 2.309065818786621, "learning_rate": 4.649282266950139e-05, "loss": 0.9911, "step": 226000 }, { "epoch": 3.508744704293983, "grad_norm": 3.317248582839966, "learning_rate": 4.649127081425845e-05, "loss": 0.9602, "step": 226100 }, { "epoch": 3.5102965595369264, "grad_norm": 2.2479658126831055, "learning_rate": 4.6489718959015506e-05, "loss": 0.9788, "step": 226200 }, { "epoch": 3.511848414779869, "grad_norm": 2.234713554382324, "learning_rate": 4.6488167103772564e-05, "loss": 0.953, "step": 226300 }, { "epoch": 3.5134002700228124, "grad_norm": 2.21032977104187, "learning_rate": 4.648661524852962e-05, "loss": 0.9575, "step": 226400 }, { "epoch": 3.514952125265755, "grad_norm": 1.9349695444107056, "learning_rate": 4.648506339328668e-05, "loss": 0.9685, "step": 226500 }, { "epoch": 3.5165039805086984, "grad_norm": 2.3160839080810547, "learning_rate": 4.648351153804373e-05, "loss": 0.9699, "step": 226600 }, { "epoch": 3.518055835751641, "grad_norm": 2.459092855453491, "learning_rate": 4.648195968280079e-05, "loss": 0.9727, "step": 226700 }, { "epoch": 3.519607690994584, "grad_norm": 2.225822925567627, "learning_rate": 4.6480407827557846e-05, "loss": 0.9597, "step": 226800 }, { "epoch": 3.521159546237527, "grad_norm": 2.112962245941162, "learning_rate": 4.6478855972314904e-05, "loss": 0.9597, "step": 226900 }, { "epoch": 3.52271140148047, "grad_norm": 2.4616007804870605, "learning_rate": 4.647730411707196e-05, "loss": 0.9676, "step": 227000 }, { "epoch": 3.5242632567234127, "grad_norm": 2.589001178741455, "learning_rate": 4.647575226182902e-05, "loss": 0.969, "step": 227100 }, { "epoch": 3.525815111966356, "grad_norm": 2.3461856842041016, "learning_rate": 4.647420040658608e-05, "loss": 0.9783, "step": 227200 }, { "epoch": 3.5273669672092987, "grad_norm": 2.3959286212921143, "learning_rate": 4.6472648551343135e-05, "loss": 0.9964, "step": 227300 }, { "epoch": 3.5289188224522414, "grad_norm": 2.603144884109497, "learning_rate": 4.647109669610019e-05, "loss": 0.9685, "step": 227400 }, { "epoch": 3.5304706776951846, "grad_norm": 2.4207868576049805, "learning_rate": 4.646954484085725e-05, "loss": 0.958, "step": 227500 }, { "epoch": 3.5320225329381274, "grad_norm": 2.5940988063812256, "learning_rate": 4.646799298561431e-05, "loss": 0.9823, "step": 227600 }, { "epoch": 3.5335743881810706, "grad_norm": 2.270827531814575, "learning_rate": 4.6466441130371366e-05, "loss": 0.9614, "step": 227700 }, { "epoch": 3.5351262434240134, "grad_norm": 2.4076645374298096, "learning_rate": 4.6464889275128424e-05, "loss": 0.9674, "step": 227800 }, { "epoch": 3.5366780986669566, "grad_norm": 2.6615493297576904, "learning_rate": 4.6463337419885474e-05, "loss": 0.9841, "step": 227900 }, { "epoch": 3.5382299539098994, "grad_norm": 2.653352737426758, "learning_rate": 4.646178556464253e-05, "loss": 0.9739, "step": 228000 }, { "epoch": 3.539781809152842, "grad_norm": 2.304861545562744, "learning_rate": 4.646023370939959e-05, "loss": 0.9795, "step": 228100 }, { "epoch": 3.5413336643957853, "grad_norm": 3.8106706142425537, "learning_rate": 4.645868185415664e-05, "loss": 0.9794, "step": 228200 }, { "epoch": 3.542885519638728, "grad_norm": 2.2700843811035156, "learning_rate": 4.64571299989137e-05, "loss": 0.9668, "step": 228300 }, { "epoch": 3.544437374881671, "grad_norm": 2.277228593826294, "learning_rate": 4.6455578143670756e-05, "loss": 0.9709, "step": 228400 }, { "epoch": 3.545989230124614, "grad_norm": 3.932272434234619, "learning_rate": 4.6454026288427814e-05, "loss": 0.9839, "step": 228500 }, { "epoch": 3.547541085367557, "grad_norm": 2.663984537124634, "learning_rate": 4.645247443318487e-05, "loss": 0.9814, "step": 228600 }, { "epoch": 3.5490929406104996, "grad_norm": 2.4571282863616943, "learning_rate": 4.645092257794193e-05, "loss": 0.9674, "step": 228700 }, { "epoch": 3.550644795853443, "grad_norm": 2.5210084915161133, "learning_rate": 4.644937072269899e-05, "loss": 0.9533, "step": 228800 }, { "epoch": 3.5521966510963856, "grad_norm": 2.3973357677459717, "learning_rate": 4.6447818867456045e-05, "loss": 0.985, "step": 228900 }, { "epoch": 3.553748506339329, "grad_norm": 2.359811305999756, "learning_rate": 4.64462670122131e-05, "loss": 0.9478, "step": 229000 }, { "epoch": 3.5553003615822716, "grad_norm": 2.268571615219116, "learning_rate": 4.644471515697016e-05, "loss": 0.9891, "step": 229100 }, { "epoch": 3.556852216825215, "grad_norm": 2.1455466747283936, "learning_rate": 4.644316330172722e-05, "loss": 0.9588, "step": 229200 }, { "epoch": 3.5584040720681576, "grad_norm": 2.0674266815185547, "learning_rate": 4.6441611446484276e-05, "loss": 0.9701, "step": 229300 }, { "epoch": 3.5599559273111003, "grad_norm": 2.367199182510376, "learning_rate": 4.644005959124133e-05, "loss": 0.9826, "step": 229400 }, { "epoch": 3.5615077825540435, "grad_norm": 2.4536147117614746, "learning_rate": 4.6438507735998385e-05, "loss": 0.952, "step": 229500 }, { "epoch": 3.5630596377969863, "grad_norm": 2.092498302459717, "learning_rate": 4.643695588075544e-05, "loss": 0.9618, "step": 229600 }, { "epoch": 3.564611493039929, "grad_norm": 2.176478385925293, "learning_rate": 4.64354040255125e-05, "loss": 1.0033, "step": 229700 }, { "epoch": 3.5661633482828723, "grad_norm": 2.412071704864502, "learning_rate": 4.643385217026956e-05, "loss": 1.0072, "step": 229800 }, { "epoch": 3.567715203525815, "grad_norm": 2.14066219329834, "learning_rate": 4.6432300315026616e-05, "loss": 0.9649, "step": 229900 }, { "epoch": 3.569267058768758, "grad_norm": 2.4579968452453613, "learning_rate": 4.6430748459783674e-05, "loss": 0.9504, "step": 230000 }, { "epoch": 3.570818914011701, "grad_norm": 2.230984687805176, "learning_rate": 4.642919660454073e-05, "loss": 0.9688, "step": 230100 }, { "epoch": 3.572370769254644, "grad_norm": 2.277994155883789, "learning_rate": 4.642764474929779e-05, "loss": 0.9468, "step": 230200 }, { "epoch": 3.573922624497587, "grad_norm": 2.0097293853759766, "learning_rate": 4.642609289405485e-05, "loss": 0.946, "step": 230300 }, { "epoch": 3.57547447974053, "grad_norm": 5.333477973937988, "learning_rate": 4.6424541038811905e-05, "loss": 0.98, "step": 230400 }, { "epoch": 3.577026334983473, "grad_norm": 2.0949203968048096, "learning_rate": 4.642298918356896e-05, "loss": 0.9676, "step": 230500 }, { "epoch": 3.5785781902264158, "grad_norm": 2.7194032669067383, "learning_rate": 4.642143732832602e-05, "loss": 0.995, "step": 230600 }, { "epoch": 3.5801300454693585, "grad_norm": 2.9212942123413086, "learning_rate": 4.641988547308307e-05, "loss": 0.9643, "step": 230700 }, { "epoch": 3.5816819007123017, "grad_norm": 2.818150043487549, "learning_rate": 4.641833361784013e-05, "loss": 0.9547, "step": 230800 }, { "epoch": 3.5832337559552445, "grad_norm": 2.198272228240967, "learning_rate": 4.641678176259719e-05, "loss": 0.9578, "step": 230900 }, { "epoch": 3.5847856111981873, "grad_norm": 2.791888475418091, "learning_rate": 4.6415229907354244e-05, "loss": 0.9775, "step": 231000 }, { "epoch": 3.5863374664411305, "grad_norm": 1.9633780717849731, "learning_rate": 4.64136780521113e-05, "loss": 0.9799, "step": 231100 }, { "epoch": 3.5878893216840733, "grad_norm": 2.603294849395752, "learning_rate": 4.641212619686836e-05, "loss": 0.9782, "step": 231200 }, { "epoch": 3.589441176927016, "grad_norm": 2.3856558799743652, "learning_rate": 4.641057434162542e-05, "loss": 0.9695, "step": 231300 }, { "epoch": 3.5909930321699592, "grad_norm": 2.1898982524871826, "learning_rate": 4.6409022486382475e-05, "loss": 0.9784, "step": 231400 }, { "epoch": 3.592544887412902, "grad_norm": 2.933495283126831, "learning_rate": 4.6407470631139526e-05, "loss": 0.9665, "step": 231500 }, { "epoch": 3.5940967426558452, "grad_norm": 1.8776103258132935, "learning_rate": 4.6405918775896584e-05, "loss": 0.971, "step": 231600 }, { "epoch": 3.595648597898788, "grad_norm": 2.247300624847412, "learning_rate": 4.640436692065364e-05, "loss": 1.0046, "step": 231700 }, { "epoch": 3.597200453141731, "grad_norm": 2.686190605163574, "learning_rate": 4.64028150654107e-05, "loss": 0.9685, "step": 231800 }, { "epoch": 3.598752308384674, "grad_norm": 2.5068016052246094, "learning_rate": 4.640126321016776e-05, "loss": 0.9711, "step": 231900 }, { "epoch": 3.6003041636276167, "grad_norm": 1.885914921760559, "learning_rate": 4.6399711354924815e-05, "loss": 0.9703, "step": 232000 }, { "epoch": 3.60185601887056, "grad_norm": 2.3119332790374756, "learning_rate": 4.639815949968187e-05, "loss": 0.9869, "step": 232100 }, { "epoch": 3.6034078741135027, "grad_norm": 2.6081700325012207, "learning_rate": 4.639660764443893e-05, "loss": 0.9568, "step": 232200 }, { "epoch": 3.6049597293564455, "grad_norm": 2.5823633670806885, "learning_rate": 4.639505578919598e-05, "loss": 0.9629, "step": 232300 }, { "epoch": 3.6065115845993887, "grad_norm": 2.6489696502685547, "learning_rate": 4.639350393395304e-05, "loss": 0.9803, "step": 232400 }, { "epoch": 3.6080634398423315, "grad_norm": 2.0740299224853516, "learning_rate": 4.63919520787101e-05, "loss": 0.9843, "step": 232500 }, { "epoch": 3.6096152950852742, "grad_norm": 2.1879117488861084, "learning_rate": 4.6390400223467155e-05, "loss": 0.9822, "step": 232600 }, { "epoch": 3.6111671503282174, "grad_norm": 1.9237560033798218, "learning_rate": 4.638884836822421e-05, "loss": 0.9736, "step": 232700 }, { "epoch": 3.61271900557116, "grad_norm": 2.505173683166504, "learning_rate": 4.638729651298127e-05, "loss": 0.9369, "step": 232800 }, { "epoch": 3.6142708608141034, "grad_norm": 2.706794500350952, "learning_rate": 4.638574465773833e-05, "loss": 0.9886, "step": 232900 }, { "epoch": 3.615822716057046, "grad_norm": 2.296290636062622, "learning_rate": 4.6384192802495386e-05, "loss": 0.9654, "step": 233000 }, { "epoch": 3.6173745712999894, "grad_norm": 2.152711868286133, "learning_rate": 4.6382640947252444e-05, "loss": 0.985, "step": 233100 }, { "epoch": 3.618926426542932, "grad_norm": 2.5151572227478027, "learning_rate": 4.63810890920095e-05, "loss": 0.9659, "step": 233200 }, { "epoch": 3.620478281785875, "grad_norm": 2.6225638389587402, "learning_rate": 4.637953723676656e-05, "loss": 0.9443, "step": 233300 }, { "epoch": 3.622030137028818, "grad_norm": 2.413177728652954, "learning_rate": 4.637798538152362e-05, "loss": 0.9667, "step": 233400 }, { "epoch": 3.623581992271761, "grad_norm": 2.3094217777252197, "learning_rate": 4.6376433526280675e-05, "loss": 0.9438, "step": 233500 }, { "epoch": 3.6251338475147037, "grad_norm": 2.5241525173187256, "learning_rate": 4.6374881671037726e-05, "loss": 0.9527, "step": 233600 }, { "epoch": 3.626685702757647, "grad_norm": 2.5104904174804688, "learning_rate": 4.6373329815794783e-05, "loss": 0.9645, "step": 233700 }, { "epoch": 3.6282375580005897, "grad_norm": 2.6876797676086426, "learning_rate": 4.637177796055184e-05, "loss": 0.9753, "step": 233800 }, { "epoch": 3.6297894132435324, "grad_norm": 2.122689723968506, "learning_rate": 4.63702261053089e-05, "loss": 0.9684, "step": 233900 }, { "epoch": 3.6313412684864756, "grad_norm": 2.1321768760681152, "learning_rate": 4.636867425006596e-05, "loss": 0.9706, "step": 234000 }, { "epoch": 3.6328931237294184, "grad_norm": 2.6366469860076904, "learning_rate": 4.6367122394823014e-05, "loss": 0.9603, "step": 234100 }, { "epoch": 3.634444978972361, "grad_norm": 2.1111104488372803, "learning_rate": 4.636557053958007e-05, "loss": 0.9652, "step": 234200 }, { "epoch": 3.6359968342153044, "grad_norm": 3.3459489345550537, "learning_rate": 4.636401868433713e-05, "loss": 0.9686, "step": 234300 }, { "epoch": 3.6375486894582476, "grad_norm": 2.5588011741638184, "learning_rate": 4.636246682909419e-05, "loss": 0.9814, "step": 234400 }, { "epoch": 3.6391005447011904, "grad_norm": 1.9573246240615845, "learning_rate": 4.6360914973851245e-05, "loss": 0.9519, "step": 234500 }, { "epoch": 3.640652399944133, "grad_norm": 2.3324995040893555, "learning_rate": 4.63593631186083e-05, "loss": 0.9577, "step": 234600 }, { "epoch": 3.6422042551870764, "grad_norm": 2.219850778579712, "learning_rate": 4.6357811263365354e-05, "loss": 0.9661, "step": 234700 }, { "epoch": 3.643756110430019, "grad_norm": 2.2030375003814697, "learning_rate": 4.635625940812241e-05, "loss": 0.9801, "step": 234800 }, { "epoch": 3.645307965672962, "grad_norm": 2.6908771991729736, "learning_rate": 4.635470755287947e-05, "loss": 0.9545, "step": 234900 }, { "epoch": 3.646859820915905, "grad_norm": 2.457233428955078, "learning_rate": 4.635315569763653e-05, "loss": 0.9788, "step": 235000 }, { "epoch": 3.648411676158848, "grad_norm": 2.6660072803497314, "learning_rate": 4.635160384239358e-05, "loss": 0.9289, "step": 235100 }, { "epoch": 3.6499635314017906, "grad_norm": 2.0561630725860596, "learning_rate": 4.6350051987150636e-05, "loss": 0.9652, "step": 235200 }, { "epoch": 3.651515386644734, "grad_norm": 2.7103989124298096, "learning_rate": 4.6348500131907694e-05, "loss": 0.9717, "step": 235300 }, { "epoch": 3.6530672418876766, "grad_norm": 2.170891046524048, "learning_rate": 4.634694827666475e-05, "loss": 0.9677, "step": 235400 }, { "epoch": 3.6546190971306194, "grad_norm": 2.191462755203247, "learning_rate": 4.634539642142181e-05, "loss": 0.9854, "step": 235500 }, { "epoch": 3.6561709523735626, "grad_norm": 2.1450891494750977, "learning_rate": 4.634384456617887e-05, "loss": 0.96, "step": 235600 }, { "epoch": 3.6577228076165054, "grad_norm": 2.417560577392578, "learning_rate": 4.6342292710935925e-05, "loss": 0.9719, "step": 235700 }, { "epoch": 3.6592746628594486, "grad_norm": 2.3143184185028076, "learning_rate": 4.634074085569298e-05, "loss": 0.9625, "step": 235800 }, { "epoch": 3.6608265181023913, "grad_norm": 2.1526529788970947, "learning_rate": 4.633918900045004e-05, "loss": 0.9603, "step": 235900 }, { "epoch": 3.6623783733453346, "grad_norm": 2.6992242336273193, "learning_rate": 4.63376371452071e-05, "loss": 0.9666, "step": 236000 }, { "epoch": 3.6639302285882773, "grad_norm": 2.311490058898926, "learning_rate": 4.6336085289964156e-05, "loss": 0.9632, "step": 236100 }, { "epoch": 3.66548208383122, "grad_norm": 2.261174440383911, "learning_rate": 4.6334533434721214e-05, "loss": 0.9712, "step": 236200 }, { "epoch": 3.6670339390741633, "grad_norm": 8.810103416442871, "learning_rate": 4.633298157947827e-05, "loss": 0.968, "step": 236300 }, { "epoch": 3.668585794317106, "grad_norm": 2.995133638381958, "learning_rate": 4.633142972423532e-05, "loss": 0.9737, "step": 236400 }, { "epoch": 3.670137649560049, "grad_norm": 2.1710360050201416, "learning_rate": 4.632987786899238e-05, "loss": 0.9655, "step": 236500 }, { "epoch": 3.671689504802992, "grad_norm": 2.246187448501587, "learning_rate": 4.632832601374944e-05, "loss": 1.0006, "step": 236600 }, { "epoch": 3.673241360045935, "grad_norm": 2.7227137088775635, "learning_rate": 4.6326774158506496e-05, "loss": 0.9614, "step": 236700 }, { "epoch": 3.6747932152888776, "grad_norm": 2.3206613063812256, "learning_rate": 4.6325222303263553e-05, "loss": 0.9483, "step": 236800 }, { "epoch": 3.676345070531821, "grad_norm": 3.211451292037964, "learning_rate": 4.632367044802061e-05, "loss": 0.9441, "step": 236900 }, { "epoch": 3.6778969257747636, "grad_norm": 2.4601259231567383, "learning_rate": 4.632211859277767e-05, "loss": 0.983, "step": 237000 }, { "epoch": 3.679448781017707, "grad_norm": 2.4789884090423584, "learning_rate": 4.632056673753473e-05, "loss": 0.9541, "step": 237100 }, { "epoch": 3.6810006362606496, "grad_norm": 2.231525182723999, "learning_rate": 4.6319014882291784e-05, "loss": 0.9675, "step": 237200 }, { "epoch": 3.6825524915035928, "grad_norm": 2.2411417961120605, "learning_rate": 4.631746302704884e-05, "loss": 0.9544, "step": 237300 }, { "epoch": 3.6841043467465355, "grad_norm": 2.9061543941497803, "learning_rate": 4.63159111718059e-05, "loss": 0.9628, "step": 237400 }, { "epoch": 3.6856562019894783, "grad_norm": 1.8800681829452515, "learning_rate": 4.631435931656296e-05, "loss": 0.952, "step": 237500 }, { "epoch": 3.6872080572324215, "grad_norm": 2.5912296772003174, "learning_rate": 4.6312807461320015e-05, "loss": 0.959, "step": 237600 }, { "epoch": 3.6887599124753643, "grad_norm": 2.0268290042877197, "learning_rate": 4.6311255606077066e-05, "loss": 0.9581, "step": 237700 }, { "epoch": 3.690311767718307, "grad_norm": 3.72499680519104, "learning_rate": 4.6309703750834124e-05, "loss": 0.9592, "step": 237800 }, { "epoch": 3.6918636229612503, "grad_norm": 2.2440311908721924, "learning_rate": 4.630815189559118e-05, "loss": 0.9625, "step": 237900 }, { "epoch": 3.693415478204193, "grad_norm": 2.2005081176757812, "learning_rate": 4.630660004034823e-05, "loss": 0.9681, "step": 238000 }, { "epoch": 3.694967333447136, "grad_norm": 1.9732433557510376, "learning_rate": 4.630504818510529e-05, "loss": 0.9545, "step": 238100 }, { "epoch": 3.696519188690079, "grad_norm": 2.72831130027771, "learning_rate": 4.630349632986235e-05, "loss": 0.9485, "step": 238200 }, { "epoch": 3.6980710439330218, "grad_norm": 2.4407553672790527, "learning_rate": 4.6301944474619406e-05, "loss": 0.9796, "step": 238300 }, { "epoch": 3.699622899175965, "grad_norm": 1.8486636877059937, "learning_rate": 4.6300392619376464e-05, "loss": 0.9582, "step": 238400 }, { "epoch": 3.7011747544189078, "grad_norm": 2.3765370845794678, "learning_rate": 4.629884076413352e-05, "loss": 0.9448, "step": 238500 }, { "epoch": 3.702726609661851, "grad_norm": 12.382792472839355, "learning_rate": 4.629728890889058e-05, "loss": 0.9787, "step": 238600 }, { "epoch": 3.7042784649047937, "grad_norm": 2.3469669818878174, "learning_rate": 4.629573705364764e-05, "loss": 0.9665, "step": 238700 }, { "epoch": 3.7058303201477365, "grad_norm": 2.1814379692077637, "learning_rate": 4.6294185198404695e-05, "loss": 0.9615, "step": 238800 }, { "epoch": 3.7073821753906797, "grad_norm": 1.8087280988693237, "learning_rate": 4.629263334316175e-05, "loss": 0.9546, "step": 238900 }, { "epoch": 3.7089340306336225, "grad_norm": 1.8695015907287598, "learning_rate": 4.629108148791881e-05, "loss": 0.9668, "step": 239000 }, { "epoch": 3.7104858858765652, "grad_norm": 2.6201558113098145, "learning_rate": 4.628952963267587e-05, "loss": 0.9717, "step": 239100 }, { "epoch": 3.7120377411195085, "grad_norm": 2.3718831539154053, "learning_rate": 4.6287977777432926e-05, "loss": 0.9708, "step": 239200 }, { "epoch": 3.7135895963624512, "grad_norm": 1.9755810499191284, "learning_rate": 4.628642592218998e-05, "loss": 0.9469, "step": 239300 }, { "epoch": 3.715141451605394, "grad_norm": 3.064552068710327, "learning_rate": 4.6284874066947035e-05, "loss": 0.9679, "step": 239400 }, { "epoch": 3.716693306848337, "grad_norm": 2.2579355239868164, "learning_rate": 4.628332221170409e-05, "loss": 0.9671, "step": 239500 }, { "epoch": 3.71824516209128, "grad_norm": 2.1819000244140625, "learning_rate": 4.628177035646115e-05, "loss": 0.9583, "step": 239600 }, { "epoch": 3.719797017334223, "grad_norm": 2.7773094177246094, "learning_rate": 4.628021850121821e-05, "loss": 0.95, "step": 239700 }, { "epoch": 3.721348872577166, "grad_norm": 2.1579291820526123, "learning_rate": 4.6278666645975266e-05, "loss": 0.9543, "step": 239800 }, { "epoch": 3.722900727820109, "grad_norm": 2.2950921058654785, "learning_rate": 4.6277114790732323e-05, "loss": 0.9608, "step": 239900 }, { "epoch": 3.724452583063052, "grad_norm": 2.3382716178894043, "learning_rate": 4.627556293548938e-05, "loss": 0.9626, "step": 240000 }, { "epoch": 3.7260044383059947, "grad_norm": 4.533854007720947, "learning_rate": 4.627401108024644e-05, "loss": 0.9635, "step": 240100 }, { "epoch": 3.727556293548938, "grad_norm": 1.9929956197738647, "learning_rate": 4.62724592250035e-05, "loss": 0.9603, "step": 240200 }, { "epoch": 3.7291081487918807, "grad_norm": 2.22918963432312, "learning_rate": 4.6270907369760554e-05, "loss": 0.96, "step": 240300 }, { "epoch": 3.7306600040348235, "grad_norm": 1.8383510112762451, "learning_rate": 4.626935551451761e-05, "loss": 0.9556, "step": 240400 }, { "epoch": 3.7322118592777667, "grad_norm": 2.48028302192688, "learning_rate": 4.626780365927467e-05, "loss": 0.9636, "step": 240500 }, { "epoch": 3.7337637145207094, "grad_norm": 2.779193639755249, "learning_rate": 4.626625180403172e-05, "loss": 0.9881, "step": 240600 }, { "epoch": 3.735315569763652, "grad_norm": 2.5697414875030518, "learning_rate": 4.626469994878878e-05, "loss": 0.9616, "step": 240700 }, { "epoch": 3.7368674250065954, "grad_norm": 2.433716058731079, "learning_rate": 4.6263148093545836e-05, "loss": 0.9564, "step": 240800 }, { "epoch": 3.738419280249538, "grad_norm": 2.6342830657958984, "learning_rate": 4.6261596238302894e-05, "loss": 0.9547, "step": 240900 }, { "epoch": 3.7399711354924814, "grad_norm": 1.9216481447219849, "learning_rate": 4.626004438305995e-05, "loss": 0.9526, "step": 241000 }, { "epoch": 3.741522990735424, "grad_norm": 2.3023314476013184, "learning_rate": 4.625849252781701e-05, "loss": 0.9484, "step": 241100 }, { "epoch": 3.7430748459783674, "grad_norm": 2.2189130783081055, "learning_rate": 4.625694067257407e-05, "loss": 0.9721, "step": 241200 }, { "epoch": 3.74462670122131, "grad_norm": 2.6043686866760254, "learning_rate": 4.625538881733112e-05, "loss": 0.9834, "step": 241300 }, { "epoch": 3.746178556464253, "grad_norm": 2.9533169269561768, "learning_rate": 4.6253836962088176e-05, "loss": 0.9654, "step": 241400 }, { "epoch": 3.747730411707196, "grad_norm": 3.030055522918701, "learning_rate": 4.6252285106845234e-05, "loss": 0.9492, "step": 241500 }, { "epoch": 3.749282266950139, "grad_norm": 2.573866605758667, "learning_rate": 4.625073325160229e-05, "loss": 0.9772, "step": 241600 }, { "epoch": 3.7508341221930817, "grad_norm": 2.9697012901306152, "learning_rate": 4.624918139635935e-05, "loss": 0.9663, "step": 241700 }, { "epoch": 3.752385977436025, "grad_norm": 2.449617385864258, "learning_rate": 4.624762954111641e-05, "loss": 0.9773, "step": 241800 }, { "epoch": 3.7539378326789676, "grad_norm": 2.0333218574523926, "learning_rate": 4.6246077685873465e-05, "loss": 0.9797, "step": 241900 }, { "epoch": 3.7554896879219104, "grad_norm": 2.274601697921753, "learning_rate": 4.624452583063052e-05, "loss": 0.9699, "step": 242000 }, { "epoch": 3.7570415431648536, "grad_norm": 2.453890562057495, "learning_rate": 4.6242973975387574e-05, "loss": 0.9687, "step": 242100 }, { "epoch": 3.7585933984077964, "grad_norm": 2.2922515869140625, "learning_rate": 4.624142212014463e-05, "loss": 0.9648, "step": 242200 }, { "epoch": 3.7601452536507396, "grad_norm": 2.3109726905822754, "learning_rate": 4.623987026490169e-05, "loss": 0.9763, "step": 242300 }, { "epoch": 3.7616971088936824, "grad_norm": 2.696913242340088, "learning_rate": 4.623831840965875e-05, "loss": 0.9698, "step": 242400 }, { "epoch": 3.7632489641366256, "grad_norm": 2.689713954925537, "learning_rate": 4.6236766554415805e-05, "loss": 0.9743, "step": 242500 }, { "epoch": 3.7648008193795683, "grad_norm": 2.4307284355163574, "learning_rate": 4.623521469917286e-05, "loss": 0.9648, "step": 242600 }, { "epoch": 3.766352674622511, "grad_norm": 2.5166666507720947, "learning_rate": 4.623366284392992e-05, "loss": 0.9756, "step": 242700 }, { "epoch": 3.7679045298654543, "grad_norm": 2.379624843597412, "learning_rate": 4.623211098868698e-05, "loss": 0.9443, "step": 242800 }, { "epoch": 3.769456385108397, "grad_norm": 2.6000499725341797, "learning_rate": 4.6230559133444036e-05, "loss": 0.9508, "step": 242900 }, { "epoch": 3.77100824035134, "grad_norm": 2.6107475757598877, "learning_rate": 4.6229007278201093e-05, "loss": 0.9638, "step": 243000 }, { "epoch": 3.772560095594283, "grad_norm": 2.371779680252075, "learning_rate": 4.622745542295815e-05, "loss": 0.9669, "step": 243100 }, { "epoch": 3.774111950837226, "grad_norm": 2.841527223587036, "learning_rate": 4.622590356771521e-05, "loss": 0.9852, "step": 243200 }, { "epoch": 3.7756638060801686, "grad_norm": 2.1400139331817627, "learning_rate": 4.622435171247227e-05, "loss": 0.9589, "step": 243300 }, { "epoch": 3.777215661323112, "grad_norm": 1.9983570575714111, "learning_rate": 4.622279985722932e-05, "loss": 0.9607, "step": 243400 }, { "epoch": 3.7787675165660546, "grad_norm": 2.334944009780884, "learning_rate": 4.6221248001986375e-05, "loss": 0.9686, "step": 243500 }, { "epoch": 3.780319371808998, "grad_norm": 2.119037628173828, "learning_rate": 4.621969614674343e-05, "loss": 0.9629, "step": 243600 }, { "epoch": 3.7818712270519406, "grad_norm": 2.462751626968384, "learning_rate": 4.621814429150049e-05, "loss": 0.9435, "step": 243700 }, { "epoch": 3.7834230822948838, "grad_norm": 2.587165117263794, "learning_rate": 4.621659243625755e-05, "loss": 0.9577, "step": 243800 }, { "epoch": 3.7849749375378265, "grad_norm": 2.141871929168701, "learning_rate": 4.6215040581014606e-05, "loss": 0.9789, "step": 243900 }, { "epoch": 3.7865267927807693, "grad_norm": 2.4658398628234863, "learning_rate": 4.6213488725771664e-05, "loss": 0.9585, "step": 244000 }, { "epoch": 3.7880786480237125, "grad_norm": 2.833173990249634, "learning_rate": 4.621193687052872e-05, "loss": 0.9814, "step": 244100 }, { "epoch": 3.7896305032666553, "grad_norm": 2.0651750564575195, "learning_rate": 4.621038501528578e-05, "loss": 0.9557, "step": 244200 }, { "epoch": 3.791182358509598, "grad_norm": 2.0845115184783936, "learning_rate": 4.620883316004284e-05, "loss": 0.9598, "step": 244300 }, { "epoch": 3.7927342137525413, "grad_norm": 2.856360912322998, "learning_rate": 4.6207281304799895e-05, "loss": 0.9531, "step": 244400 }, { "epoch": 3.794286068995484, "grad_norm": 2.7098703384399414, "learning_rate": 4.6205729449556946e-05, "loss": 0.956, "step": 244500 }, { "epoch": 3.795837924238427, "grad_norm": 2.5361874103546143, "learning_rate": 4.6204177594314004e-05, "loss": 0.9434, "step": 244600 }, { "epoch": 3.79738977948137, "grad_norm": 2.0523622035980225, "learning_rate": 4.620262573907106e-05, "loss": 0.9658, "step": 244700 }, { "epoch": 3.798941634724313, "grad_norm": 2.9382576942443848, "learning_rate": 4.620107388382812e-05, "loss": 0.9353, "step": 244800 }, { "epoch": 3.800493489967256, "grad_norm": 2.162856340408325, "learning_rate": 4.619952202858517e-05, "loss": 0.976, "step": 244900 }, { "epoch": 3.8020453452101988, "grad_norm": 2.691394567489624, "learning_rate": 4.619797017334223e-05, "loss": 0.9498, "step": 245000 }, { "epoch": 3.803597200453142, "grad_norm": 2.3454697132110596, "learning_rate": 4.6196418318099286e-05, "loss": 0.9601, "step": 245100 }, { "epoch": 3.8051490556960847, "grad_norm": 2.557965040206909, "learning_rate": 4.6194866462856344e-05, "loss": 0.9546, "step": 245200 }, { "epoch": 3.8067009109390275, "grad_norm": 2.1374263763427734, "learning_rate": 4.61933146076134e-05, "loss": 0.9655, "step": 245300 }, { "epoch": 3.8082527661819707, "grad_norm": 2.2417478561401367, "learning_rate": 4.619176275237046e-05, "loss": 0.9647, "step": 245400 }, { "epoch": 3.8098046214249135, "grad_norm": 2.249420642852783, "learning_rate": 4.619021089712752e-05, "loss": 0.9722, "step": 245500 }, { "epoch": 3.8113564766678563, "grad_norm": 2.3086416721343994, "learning_rate": 4.6188659041884575e-05, "loss": 0.9607, "step": 245600 }, { "epoch": 3.8129083319107995, "grad_norm": 2.2293660640716553, "learning_rate": 4.618710718664163e-05, "loss": 0.9558, "step": 245700 }, { "epoch": 3.8144601871537422, "grad_norm": 2.477463960647583, "learning_rate": 4.618555533139869e-05, "loss": 0.9645, "step": 245800 }, { "epoch": 3.816012042396685, "grad_norm": 1.975568175315857, "learning_rate": 4.618400347615575e-05, "loss": 0.9528, "step": 245900 }, { "epoch": 3.8175638976396282, "grad_norm": 2.8654661178588867, "learning_rate": 4.6182451620912806e-05, "loss": 0.951, "step": 246000 }, { "epoch": 3.819115752882571, "grad_norm": 2.867647171020508, "learning_rate": 4.6180899765669863e-05, "loss": 0.9557, "step": 246100 }, { "epoch": 3.820667608125514, "grad_norm": 2.3744804859161377, "learning_rate": 4.6179347910426914e-05, "loss": 0.9587, "step": 246200 }, { "epoch": 3.822219463368457, "grad_norm": 2.1752498149871826, "learning_rate": 4.617779605518397e-05, "loss": 0.9625, "step": 246300 }, { "epoch": 3.8237713186114, "grad_norm": 2.3678810596466064, "learning_rate": 4.617624419994103e-05, "loss": 0.9638, "step": 246400 }, { "epoch": 3.825323173854343, "grad_norm": 2.6032025814056396, "learning_rate": 4.617469234469809e-05, "loss": 0.9666, "step": 246500 }, { "epoch": 3.8268750290972857, "grad_norm": 2.8459231853485107, "learning_rate": 4.6173140489455145e-05, "loss": 0.9576, "step": 246600 }, { "epoch": 3.828426884340229, "grad_norm": 2.521998882293701, "learning_rate": 4.61715886342122e-05, "loss": 0.9698, "step": 246700 }, { "epoch": 3.8299787395831717, "grad_norm": 2.459017276763916, "learning_rate": 4.617003677896926e-05, "loss": 0.9742, "step": 246800 }, { "epoch": 3.8315305948261145, "grad_norm": 2.2206356525421143, "learning_rate": 4.616848492372632e-05, "loss": 0.9638, "step": 246900 }, { "epoch": 3.8330824500690577, "grad_norm": 2.496018409729004, "learning_rate": 4.6166933068483376e-05, "loss": 0.9696, "step": 247000 }, { "epoch": 3.8346343053120004, "grad_norm": 2.395921230316162, "learning_rate": 4.6165381213240434e-05, "loss": 0.9707, "step": 247100 }, { "epoch": 3.836186160554943, "grad_norm": 2.3485350608825684, "learning_rate": 4.616382935799749e-05, "loss": 0.9857, "step": 247200 }, { "epoch": 3.8377380157978864, "grad_norm": 1.9484091997146606, "learning_rate": 4.616227750275455e-05, "loss": 0.949, "step": 247300 }, { "epoch": 3.839289871040829, "grad_norm": 2.4426043033599854, "learning_rate": 4.616072564751161e-05, "loss": 0.9608, "step": 247400 }, { "epoch": 3.8408417262837724, "grad_norm": 2.655001163482666, "learning_rate": 4.615917379226866e-05, "loss": 0.9709, "step": 247500 }, { "epoch": 3.842393581526715, "grad_norm": 2.8030242919921875, "learning_rate": 4.6157621937025716e-05, "loss": 0.9578, "step": 247600 }, { "epoch": 3.8439454367696584, "grad_norm": 2.1553852558135986, "learning_rate": 4.6156070081782774e-05, "loss": 0.9553, "step": 247700 }, { "epoch": 3.845497292012601, "grad_norm": 2.9878153800964355, "learning_rate": 4.6154518226539825e-05, "loss": 0.9687, "step": 247800 }, { "epoch": 3.847049147255544, "grad_norm": 2.4834325313568115, "learning_rate": 4.615296637129688e-05, "loss": 0.9564, "step": 247900 }, { "epoch": 3.848601002498487, "grad_norm": 2.83827543258667, "learning_rate": 4.615141451605394e-05, "loss": 0.9504, "step": 248000 }, { "epoch": 3.85015285774143, "grad_norm": 2.1821401119232178, "learning_rate": 4.6149862660811e-05, "loss": 0.9628, "step": 248100 }, { "epoch": 3.8517047129843727, "grad_norm": 2.3682591915130615, "learning_rate": 4.6148310805568056e-05, "loss": 0.973, "step": 248200 }, { "epoch": 3.853256568227316, "grad_norm": 2.0372495651245117, "learning_rate": 4.6146758950325114e-05, "loss": 0.9593, "step": 248300 }, { "epoch": 3.8548084234702586, "grad_norm": 2.4246280193328857, "learning_rate": 4.614520709508217e-05, "loss": 0.9624, "step": 248400 }, { "epoch": 3.8563602787132014, "grad_norm": 2.873910665512085, "learning_rate": 4.614365523983923e-05, "loss": 0.9596, "step": 248500 }, { "epoch": 3.8579121339561446, "grad_norm": 2.3028905391693115, "learning_rate": 4.614210338459629e-05, "loss": 0.9672, "step": 248600 }, { "epoch": 3.8594639891990874, "grad_norm": 2.438647747039795, "learning_rate": 4.6140551529353345e-05, "loss": 0.9757, "step": 248700 }, { "epoch": 3.8610158444420306, "grad_norm": 2.7761754989624023, "learning_rate": 4.61389996741104e-05, "loss": 0.9643, "step": 248800 }, { "epoch": 3.8625676996849734, "grad_norm": 2.2606499195098877, "learning_rate": 4.613744781886746e-05, "loss": 0.9443, "step": 248900 }, { "epoch": 3.8641195549279166, "grad_norm": 2.658202648162842, "learning_rate": 4.613589596362452e-05, "loss": 0.975, "step": 249000 }, { "epoch": 3.8656714101708594, "grad_norm": 3.095552921295166, "learning_rate": 4.613434410838157e-05, "loss": 0.9464, "step": 249100 }, { "epoch": 3.867223265413802, "grad_norm": 2.3369100093841553, "learning_rate": 4.613279225313863e-05, "loss": 0.9819, "step": 249200 }, { "epoch": 3.8687751206567453, "grad_norm": 2.461524248123169, "learning_rate": 4.6131240397895684e-05, "loss": 0.9648, "step": 249300 }, { "epoch": 3.870326975899688, "grad_norm": 2.4584550857543945, "learning_rate": 4.612968854265274e-05, "loss": 0.9598, "step": 249400 }, { "epoch": 3.871878831142631, "grad_norm": 2.834724187850952, "learning_rate": 4.61281366874098e-05, "loss": 0.9581, "step": 249500 }, { "epoch": 3.873430686385574, "grad_norm": 2.2932212352752686, "learning_rate": 4.612658483216686e-05, "loss": 0.9526, "step": 249600 }, { "epoch": 3.874982541628517, "grad_norm": 2.2426798343658447, "learning_rate": 4.6125032976923915e-05, "loss": 0.973, "step": 249700 }, { "epoch": 3.8765343968714596, "grad_norm": 2.0944390296936035, "learning_rate": 4.612348112168097e-05, "loss": 0.9651, "step": 249800 }, { "epoch": 3.878086252114403, "grad_norm": 2.7793080806732178, "learning_rate": 4.612192926643803e-05, "loss": 0.9721, "step": 249900 }, { "epoch": 3.8796381073573456, "grad_norm": 2.1383070945739746, "learning_rate": 4.612037741119509e-05, "loss": 0.9547, "step": 250000 }, { "epoch": 3.8811899626002884, "grad_norm": 2.622943878173828, "learning_rate": 4.6118825555952146e-05, "loss": 0.9452, "step": 250100 }, { "epoch": 3.8827418178432316, "grad_norm": 2.6514110565185547, "learning_rate": 4.6117273700709204e-05, "loss": 0.9563, "step": 250200 }, { "epoch": 3.884293673086175, "grad_norm": 2.5645194053649902, "learning_rate": 4.611572184546626e-05, "loss": 0.9639, "step": 250300 }, { "epoch": 3.8858455283291176, "grad_norm": 2.168426990509033, "learning_rate": 4.611416999022331e-05, "loss": 0.9509, "step": 250400 }, { "epoch": 3.8873973835720603, "grad_norm": 1.9053301811218262, "learning_rate": 4.611261813498037e-05, "loss": 0.9542, "step": 250500 }, { "epoch": 3.8889492388150035, "grad_norm": 2.289501667022705, "learning_rate": 4.611106627973743e-05, "loss": 0.9742, "step": 250600 }, { "epoch": 3.8905010940579463, "grad_norm": 2.996089220046997, "learning_rate": 4.6109514424494486e-05, "loss": 0.9586, "step": 250700 }, { "epoch": 3.892052949300889, "grad_norm": 2.613656997680664, "learning_rate": 4.6107962569251544e-05, "loss": 0.9536, "step": 250800 }, { "epoch": 3.8936048045438323, "grad_norm": 3.361571788787842, "learning_rate": 4.61064107140086e-05, "loss": 0.9556, "step": 250900 }, { "epoch": 3.895156659786775, "grad_norm": 2.724039077758789, "learning_rate": 4.610485885876565e-05, "loss": 0.9447, "step": 251000 }, { "epoch": 3.896708515029718, "grad_norm": 2.3089542388916016, "learning_rate": 4.610330700352271e-05, "loss": 0.9596, "step": 251100 }, { "epoch": 3.898260370272661, "grad_norm": 2.7730958461761475, "learning_rate": 4.610175514827977e-05, "loss": 0.966, "step": 251200 }, { "epoch": 3.899812225515604, "grad_norm": 2.243138551712036, "learning_rate": 4.6100203293036826e-05, "loss": 0.9687, "step": 251300 }, { "epoch": 3.9013640807585466, "grad_norm": 2.3914403915405273, "learning_rate": 4.6098651437793884e-05, "loss": 0.9478, "step": 251400 }, { "epoch": 3.90291593600149, "grad_norm": 2.164823293685913, "learning_rate": 4.609709958255094e-05, "loss": 0.9734, "step": 251500 }, { "epoch": 3.904467791244433, "grad_norm": 2.0271458625793457, "learning_rate": 4.6095547727308e-05, "loss": 0.9541, "step": 251600 }, { "epoch": 3.9060196464873758, "grad_norm": 2.2817163467407227, "learning_rate": 4.609399587206506e-05, "loss": 0.9549, "step": 251700 }, { "epoch": 3.9075715017303185, "grad_norm": 2.0654783248901367, "learning_rate": 4.6092444016822115e-05, "loss": 0.9643, "step": 251800 }, { "epoch": 3.9091233569732617, "grad_norm": 2.436009168624878, "learning_rate": 4.6090892161579166e-05, "loss": 0.9488, "step": 251900 }, { "epoch": 3.9106752122162045, "grad_norm": 2.4122955799102783, "learning_rate": 4.6089340306336223e-05, "loss": 0.9493, "step": 252000 }, { "epoch": 3.9122270674591473, "grad_norm": 2.441713571548462, "learning_rate": 4.608778845109328e-05, "loss": 0.9517, "step": 252100 }, { "epoch": 3.9137789227020905, "grad_norm": 2.418978691101074, "learning_rate": 4.608623659585034e-05, "loss": 0.9405, "step": 252200 }, { "epoch": 3.9153307779450333, "grad_norm": 2.7669477462768555, "learning_rate": 4.60846847406074e-05, "loss": 0.9294, "step": 252300 }, { "epoch": 3.916882633187976, "grad_norm": 2.1899101734161377, "learning_rate": 4.6083132885364454e-05, "loss": 0.9366, "step": 252400 }, { "epoch": 3.9184344884309192, "grad_norm": 2.6299188137054443, "learning_rate": 4.608158103012151e-05, "loss": 0.9471, "step": 252500 }, { "epoch": 3.919986343673862, "grad_norm": 2.4762206077575684, "learning_rate": 4.608002917487857e-05, "loss": 0.9638, "step": 252600 }, { "epoch": 3.9215381989168048, "grad_norm": 2.876412868499756, "learning_rate": 4.607847731963563e-05, "loss": 0.9595, "step": 252700 }, { "epoch": 3.923090054159748, "grad_norm": 2.5713350772857666, "learning_rate": 4.6076925464392685e-05, "loss": 0.9671, "step": 252800 }, { "epoch": 3.9246419094026908, "grad_norm": 2.315887928009033, "learning_rate": 4.607537360914974e-05, "loss": 0.9652, "step": 252900 }, { "epoch": 3.926193764645634, "grad_norm": 2.2930619716644287, "learning_rate": 4.60738217539068e-05, "loss": 0.9694, "step": 253000 }, { "epoch": 3.9277456198885767, "grad_norm": 2.2852895259857178, "learning_rate": 4.607226989866386e-05, "loss": 0.9609, "step": 253100 }, { "epoch": 3.92929747513152, "grad_norm": 2.3692495822906494, "learning_rate": 4.607071804342091e-05, "loss": 0.951, "step": 253200 }, { "epoch": 3.9308493303744627, "grad_norm": 2.560375213623047, "learning_rate": 4.606916618817797e-05, "loss": 0.9467, "step": 253300 }, { "epoch": 3.9324011856174055, "grad_norm": 2.2609755992889404, "learning_rate": 4.6067614332935025e-05, "loss": 0.9574, "step": 253400 }, { "epoch": 3.9339530408603487, "grad_norm": 2.4398396015167236, "learning_rate": 4.606606247769208e-05, "loss": 0.9687, "step": 253500 }, { "epoch": 3.9355048961032915, "grad_norm": 2.0260703563690186, "learning_rate": 4.606451062244914e-05, "loss": 0.9715, "step": 253600 }, { "epoch": 3.9370567513462342, "grad_norm": 2.7391695976257324, "learning_rate": 4.60629587672062e-05, "loss": 0.9427, "step": 253700 }, { "epoch": 3.9386086065891774, "grad_norm": 2.50553035736084, "learning_rate": 4.6061406911963256e-05, "loss": 0.9307, "step": 253800 }, { "epoch": 3.94016046183212, "grad_norm": 1.9925477504730225, "learning_rate": 4.6059855056720314e-05, "loss": 0.9331, "step": 253900 }, { "epoch": 3.941712317075063, "grad_norm": 2.0980064868927, "learning_rate": 4.605830320147737e-05, "loss": 0.9407, "step": 254000 }, { "epoch": 3.943264172318006, "grad_norm": 2.248063325881958, "learning_rate": 4.605675134623443e-05, "loss": 0.957, "step": 254100 }, { "epoch": 3.944816027560949, "grad_norm": 2.2793643474578857, "learning_rate": 4.605519949099149e-05, "loss": 0.9593, "step": 254200 }, { "epoch": 3.946367882803892, "grad_norm": 2.039937734603882, "learning_rate": 4.605364763574854e-05, "loss": 0.9393, "step": 254300 }, { "epoch": 3.947919738046835, "grad_norm": 2.3501534461975098, "learning_rate": 4.6052095780505596e-05, "loss": 0.9474, "step": 254400 }, { "epoch": 3.949471593289778, "grad_norm": 2.488781452178955, "learning_rate": 4.6050543925262654e-05, "loss": 0.9554, "step": 254500 }, { "epoch": 3.951023448532721, "grad_norm": 2.643991708755493, "learning_rate": 4.604899207001971e-05, "loss": 0.9492, "step": 254600 }, { "epoch": 3.9525753037756637, "grad_norm": 2.3973681926727295, "learning_rate": 4.604744021477677e-05, "loss": 0.9803, "step": 254700 }, { "epoch": 3.954127159018607, "grad_norm": 2.328188180923462, "learning_rate": 4.604588835953382e-05, "loss": 0.9509, "step": 254800 }, { "epoch": 3.9556790142615497, "grad_norm": 2.642181634902954, "learning_rate": 4.604433650429088e-05, "loss": 0.9702, "step": 254900 }, { "epoch": 3.9572308695044924, "grad_norm": 2.7715260982513428, "learning_rate": 4.6042784649047936e-05, "loss": 0.9502, "step": 255000 }, { "epoch": 3.9587827247474356, "grad_norm": 2.435835123062134, "learning_rate": 4.6041232793804993e-05, "loss": 0.9562, "step": 255100 }, { "epoch": 3.9603345799903784, "grad_norm": 2.294384241104126, "learning_rate": 4.603968093856205e-05, "loss": 0.9507, "step": 255200 }, { "epoch": 3.961886435233321, "grad_norm": 2.4213449954986572, "learning_rate": 4.603812908331911e-05, "loss": 0.9562, "step": 255300 }, { "epoch": 3.9634382904762644, "grad_norm": 2.1852378845214844, "learning_rate": 4.603657722807617e-05, "loss": 0.9873, "step": 255400 }, { "epoch": 3.964990145719207, "grad_norm": 2.3879640102386475, "learning_rate": 4.6035025372833224e-05, "loss": 0.9544, "step": 255500 }, { "epoch": 3.9665420009621504, "grad_norm": 2.3646371364593506, "learning_rate": 4.603347351759028e-05, "loss": 0.9431, "step": 255600 }, { "epoch": 3.968093856205093, "grad_norm": 2.607381582260132, "learning_rate": 4.603192166234734e-05, "loss": 0.9747, "step": 255700 }, { "epoch": 3.9696457114480364, "grad_norm": 2.078855276107788, "learning_rate": 4.60303698071044e-05, "loss": 0.9793, "step": 255800 }, { "epoch": 3.971197566690979, "grad_norm": 2.1455979347229004, "learning_rate": 4.6028817951861455e-05, "loss": 0.9454, "step": 255900 }, { "epoch": 3.972749421933922, "grad_norm": 2.7543039321899414, "learning_rate": 4.602726609661851e-05, "loss": 0.9668, "step": 256000 }, { "epoch": 3.974301277176865, "grad_norm": 2.338210344314575, "learning_rate": 4.6025714241375564e-05, "loss": 0.947, "step": 256100 }, { "epoch": 3.975853132419808, "grad_norm": 2.3192243576049805, "learning_rate": 4.602416238613262e-05, "loss": 0.9506, "step": 256200 }, { "epoch": 3.9774049876627506, "grad_norm": 1.9029994010925293, "learning_rate": 4.602261053088968e-05, "loss": 0.9254, "step": 256300 }, { "epoch": 3.978956842905694, "grad_norm": 1.8258492946624756, "learning_rate": 4.602105867564674e-05, "loss": 0.9624, "step": 256400 }, { "epoch": 3.9805086981486366, "grad_norm": 1.9874811172485352, "learning_rate": 4.6019506820403795e-05, "loss": 0.9384, "step": 256500 }, { "epoch": 3.9820605533915794, "grad_norm": 2.567720413208008, "learning_rate": 4.601795496516085e-05, "loss": 0.9462, "step": 256600 }, { "epoch": 3.9836124086345226, "grad_norm": 2.6921446323394775, "learning_rate": 4.601640310991791e-05, "loss": 0.9394, "step": 256700 }, { "epoch": 3.9851642638774654, "grad_norm": 2.931229829788208, "learning_rate": 4.601485125467497e-05, "loss": 0.9513, "step": 256800 }, { "epoch": 3.9867161191204086, "grad_norm": 2.3478806018829346, "learning_rate": 4.6013299399432026e-05, "loss": 0.9617, "step": 256900 }, { "epoch": 3.9882679743633513, "grad_norm": 2.3521456718444824, "learning_rate": 4.6011747544189084e-05, "loss": 0.9662, "step": 257000 }, { "epoch": 3.9898198296062946, "grad_norm": 2.0564334392547607, "learning_rate": 4.601019568894614e-05, "loss": 0.9422, "step": 257100 }, { "epoch": 3.9913716848492373, "grad_norm": 1.842071771621704, "learning_rate": 4.60086438337032e-05, "loss": 0.9647, "step": 257200 }, { "epoch": 3.99292354009218, "grad_norm": 2.308626890182495, "learning_rate": 4.600709197846026e-05, "loss": 0.9793, "step": 257300 }, { "epoch": 3.9944753953351233, "grad_norm": 1.9788181781768799, "learning_rate": 4.600554012321731e-05, "loss": 0.9708, "step": 257400 }, { "epoch": 3.996027250578066, "grad_norm": 1.7815159559249878, "learning_rate": 4.6003988267974366e-05, "loss": 0.9686, "step": 257500 }, { "epoch": 3.997579105821009, "grad_norm": 1.8134030103683472, "learning_rate": 4.600243641273142e-05, "loss": 0.9594, "step": 257600 }, { "epoch": 3.999130961063952, "grad_norm": 2.660446882247925, "learning_rate": 4.6000884557488475e-05, "loss": 0.9308, "step": 257700 }, { "epoch": 4.000682816306895, "grad_norm": 2.386772394180298, "learning_rate": 4.599933270224553e-05, "loss": 0.9578, "step": 257800 }, { "epoch": 4.002234671549838, "grad_norm": 2.205749988555908, "learning_rate": 4.599778084700259e-05, "loss": 0.9502, "step": 257900 }, { "epoch": 4.003786526792781, "grad_norm": 2.7558083534240723, "learning_rate": 4.599622899175965e-05, "loss": 0.9518, "step": 258000 }, { "epoch": 4.005338382035724, "grad_norm": 2.0697128772735596, "learning_rate": 4.5994677136516706e-05, "loss": 0.9411, "step": 258100 }, { "epoch": 4.006890237278666, "grad_norm": 2.179945707321167, "learning_rate": 4.5993125281273763e-05, "loss": 0.9486, "step": 258200 }, { "epoch": 4.0084420925216095, "grad_norm": 2.669586658477783, "learning_rate": 4.599157342603082e-05, "loss": 0.9486, "step": 258300 }, { "epoch": 4.009993947764553, "grad_norm": 2.447242259979248, "learning_rate": 4.599002157078788e-05, "loss": 0.9471, "step": 258400 }, { "epoch": 4.011545803007495, "grad_norm": 2.241095781326294, "learning_rate": 4.598846971554494e-05, "loss": 0.9632, "step": 258500 }, { "epoch": 4.013097658250438, "grad_norm": 2.0117292404174805, "learning_rate": 4.5986917860301994e-05, "loss": 0.9435, "step": 258600 }, { "epoch": 4.0146495134933815, "grad_norm": 2.07411789894104, "learning_rate": 4.598536600505905e-05, "loss": 0.9474, "step": 258700 }, { "epoch": 4.016201368736325, "grad_norm": 2.7166850566864014, "learning_rate": 4.598381414981611e-05, "loss": 0.9621, "step": 258800 }, { "epoch": 4.017753223979267, "grad_norm": 2.795341968536377, "learning_rate": 4.598226229457316e-05, "loss": 0.9627, "step": 258900 }, { "epoch": 4.01930507922221, "grad_norm": 2.3739640712738037, "learning_rate": 4.598071043933022e-05, "loss": 0.9657, "step": 259000 }, { "epoch": 4.0208569344651535, "grad_norm": 2.5754904747009277, "learning_rate": 4.5979158584087276e-05, "loss": 0.962, "step": 259100 }, { "epoch": 4.022408789708096, "grad_norm": 3.009676933288574, "learning_rate": 4.5977606728844334e-05, "loss": 0.964, "step": 259200 }, { "epoch": 4.023960644951039, "grad_norm": 2.558089017868042, "learning_rate": 4.597605487360139e-05, "loss": 0.9556, "step": 259300 }, { "epoch": 4.025512500193982, "grad_norm": 2.45127010345459, "learning_rate": 4.597450301835845e-05, "loss": 0.9751, "step": 259400 }, { "epoch": 4.0270643554369245, "grad_norm": 2.823124885559082, "learning_rate": 4.597295116311551e-05, "loss": 0.9783, "step": 259500 }, { "epoch": 4.028616210679868, "grad_norm": 2.1915643215179443, "learning_rate": 4.5971399307872565e-05, "loss": 0.9488, "step": 259600 }, { "epoch": 4.030168065922811, "grad_norm": 1.9865059852600098, "learning_rate": 4.596984745262962e-05, "loss": 0.9735, "step": 259700 }, { "epoch": 4.031719921165753, "grad_norm": 3.129610776901245, "learning_rate": 4.596829559738668e-05, "loss": 0.9546, "step": 259800 }, { "epoch": 4.0332717764086965, "grad_norm": 2.3890984058380127, "learning_rate": 4.596674374214374e-05, "loss": 0.9475, "step": 259900 }, { "epoch": 4.03482363165164, "grad_norm": 2.2658867835998535, "learning_rate": 4.5965191886900796e-05, "loss": 0.9626, "step": 260000 }, { "epoch": 4.036375486894583, "grad_norm": 2.3426475524902344, "learning_rate": 4.5963640031657854e-05, "loss": 0.9785, "step": 260100 }, { "epoch": 4.037927342137525, "grad_norm": 2.150066375732422, "learning_rate": 4.5962088176414905e-05, "loss": 0.9459, "step": 260200 }, { "epoch": 4.0394791973804685, "grad_norm": 2.0398645401000977, "learning_rate": 4.596053632117196e-05, "loss": 0.9403, "step": 260300 }, { "epoch": 4.041031052623412, "grad_norm": 2.297935724258423, "learning_rate": 4.595898446592902e-05, "loss": 0.9468, "step": 260400 }, { "epoch": 4.042582907866354, "grad_norm": 2.188969850540161, "learning_rate": 4.595743261068608e-05, "loss": 0.9451, "step": 260500 }, { "epoch": 4.044134763109297, "grad_norm": 2.520869731903076, "learning_rate": 4.5955880755443136e-05, "loss": 0.9341, "step": 260600 }, { "epoch": 4.04568661835224, "grad_norm": 2.6415045261383057, "learning_rate": 4.5954328900200194e-05, "loss": 0.947, "step": 260700 }, { "epoch": 4.047238473595183, "grad_norm": 2.024040699005127, "learning_rate": 4.5952777044957245e-05, "loss": 0.962, "step": 260800 }, { "epoch": 4.048790328838126, "grad_norm": 2.6245927810668945, "learning_rate": 4.59512251897143e-05, "loss": 0.9505, "step": 260900 }, { "epoch": 4.050342184081069, "grad_norm": 2.182262897491455, "learning_rate": 4.594967333447136e-05, "loss": 0.9525, "step": 261000 }, { "epoch": 4.0518940393240115, "grad_norm": 2.4527010917663574, "learning_rate": 4.594812147922842e-05, "loss": 0.9395, "step": 261100 }, { "epoch": 4.053445894566955, "grad_norm": 2.518841505050659, "learning_rate": 4.5946569623985476e-05, "loss": 0.9521, "step": 261200 }, { "epoch": 4.054997749809898, "grad_norm": 1.7733029127120972, "learning_rate": 4.5945017768742533e-05, "loss": 0.9583, "step": 261300 }, { "epoch": 4.05654960505284, "grad_norm": 2.9535508155822754, "learning_rate": 4.594346591349959e-05, "loss": 0.9544, "step": 261400 }, { "epoch": 4.0581014602957834, "grad_norm": 2.7402002811431885, "learning_rate": 4.594191405825665e-05, "loss": 0.942, "step": 261500 }, { "epoch": 4.059653315538727, "grad_norm": 2.6834235191345215, "learning_rate": 4.594036220301371e-05, "loss": 0.9338, "step": 261600 }, { "epoch": 4.06120517078167, "grad_norm": 2.370631694793701, "learning_rate": 4.593881034777076e-05, "loss": 0.9629, "step": 261700 }, { "epoch": 4.062757026024612, "grad_norm": 1.992591381072998, "learning_rate": 4.5937258492527815e-05, "loss": 0.9488, "step": 261800 }, { "epoch": 4.064308881267555, "grad_norm": 3.331778049468994, "learning_rate": 4.593570663728487e-05, "loss": 0.9801, "step": 261900 }, { "epoch": 4.065860736510499, "grad_norm": 2.3667099475860596, "learning_rate": 4.593415478204193e-05, "loss": 0.9593, "step": 262000 }, { "epoch": 4.067412591753441, "grad_norm": 2.232501983642578, "learning_rate": 4.593260292679899e-05, "loss": 0.9664, "step": 262100 }, { "epoch": 4.068964446996384, "grad_norm": 2.5146374702453613, "learning_rate": 4.5931051071556046e-05, "loss": 0.9346, "step": 262200 }, { "epoch": 4.070516302239327, "grad_norm": 2.4675638675689697, "learning_rate": 4.5929499216313104e-05, "loss": 0.94, "step": 262300 }, { "epoch": 4.07206815748227, "grad_norm": 2.5684351921081543, "learning_rate": 4.592794736107016e-05, "loss": 0.9649, "step": 262400 }, { "epoch": 4.073620012725213, "grad_norm": 2.9621617794036865, "learning_rate": 4.592639550582722e-05, "loss": 0.9468, "step": 262500 }, { "epoch": 4.075171867968156, "grad_norm": 2.221468687057495, "learning_rate": 4.592484365058428e-05, "loss": 0.9536, "step": 262600 }, { "epoch": 4.076723723211098, "grad_norm": 2.5112791061401367, "learning_rate": 4.5923291795341335e-05, "loss": 0.9349, "step": 262700 }, { "epoch": 4.078275578454042, "grad_norm": 2.1550936698913574, "learning_rate": 4.592173994009839e-05, "loss": 0.9462, "step": 262800 }, { "epoch": 4.079827433696985, "grad_norm": 2.6242737770080566, "learning_rate": 4.592018808485545e-05, "loss": 0.9543, "step": 262900 }, { "epoch": 4.081379288939928, "grad_norm": 1.9733939170837402, "learning_rate": 4.59186362296125e-05, "loss": 0.9551, "step": 263000 }, { "epoch": 4.08293114418287, "grad_norm": 2.2308318614959717, "learning_rate": 4.591708437436956e-05, "loss": 0.937, "step": 263100 }, { "epoch": 4.084482999425814, "grad_norm": 2.1778078079223633, "learning_rate": 4.591553251912662e-05, "loss": 0.9507, "step": 263200 }, { "epoch": 4.086034854668757, "grad_norm": 2.5045294761657715, "learning_rate": 4.5913980663883675e-05, "loss": 0.9378, "step": 263300 }, { "epoch": 4.087586709911699, "grad_norm": 2.4262797832489014, "learning_rate": 4.591242880864073e-05, "loss": 0.9571, "step": 263400 }, { "epoch": 4.089138565154642, "grad_norm": 2.3693854808807373, "learning_rate": 4.591087695339779e-05, "loss": 0.9438, "step": 263500 }, { "epoch": 4.090690420397586, "grad_norm": 3.4291396141052246, "learning_rate": 4.590932509815485e-05, "loss": 0.954, "step": 263600 }, { "epoch": 4.092242275640528, "grad_norm": 2.109415054321289, "learning_rate": 4.5907773242911906e-05, "loss": 0.9632, "step": 263700 }, { "epoch": 4.093794130883471, "grad_norm": 2.4319875240325928, "learning_rate": 4.5906221387668964e-05, "loss": 0.9652, "step": 263800 }, { "epoch": 4.095345986126414, "grad_norm": 3.537684917449951, "learning_rate": 4.590466953242602e-05, "loss": 0.9657, "step": 263900 }, { "epoch": 4.096897841369357, "grad_norm": 3.062530755996704, "learning_rate": 4.590311767718308e-05, "loss": 0.9651, "step": 264000 }, { "epoch": 4.0984496966123, "grad_norm": 2.4580297470092773, "learning_rate": 4.590156582194013e-05, "loss": 0.9616, "step": 264100 }, { "epoch": 4.100001551855243, "grad_norm": 2.834481954574585, "learning_rate": 4.590001396669719e-05, "loss": 0.954, "step": 264200 }, { "epoch": 4.101553407098186, "grad_norm": 2.2122907638549805, "learning_rate": 4.5898462111454246e-05, "loss": 0.9481, "step": 264300 }, { "epoch": 4.103105262341129, "grad_norm": 2.5180561542510986, "learning_rate": 4.5896910256211303e-05, "loss": 0.976, "step": 264400 }, { "epoch": 4.104657117584072, "grad_norm": 2.5658743381500244, "learning_rate": 4.589535840096836e-05, "loss": 0.9488, "step": 264500 }, { "epoch": 4.106208972827015, "grad_norm": 4.451369285583496, "learning_rate": 4.589380654572541e-05, "loss": 0.9586, "step": 264600 }, { "epoch": 4.107760828069957, "grad_norm": 2.2843918800354004, "learning_rate": 4.589225469048247e-05, "loss": 0.9548, "step": 264700 }, { "epoch": 4.109312683312901, "grad_norm": 2.009854555130005, "learning_rate": 4.589070283523953e-05, "loss": 0.9517, "step": 264800 }, { "epoch": 4.110864538555844, "grad_norm": 2.506761074066162, "learning_rate": 4.5889150979996585e-05, "loss": 0.9442, "step": 264900 }, { "epoch": 4.112416393798786, "grad_norm": 2.7982921600341797, "learning_rate": 4.588759912475364e-05, "loss": 0.9568, "step": 265000 }, { "epoch": 4.113968249041729, "grad_norm": 2.532555341720581, "learning_rate": 4.58860472695107e-05, "loss": 0.9672, "step": 265100 }, { "epoch": 4.1155201042846725, "grad_norm": 3.7459118366241455, "learning_rate": 4.588449541426776e-05, "loss": 0.9567, "step": 265200 }, { "epoch": 4.117071959527615, "grad_norm": 2.0876026153564453, "learning_rate": 4.5882943559024816e-05, "loss": 0.9487, "step": 265300 }, { "epoch": 4.118623814770558, "grad_norm": 2.2606899738311768, "learning_rate": 4.5881391703781874e-05, "loss": 0.9658, "step": 265400 }, { "epoch": 4.120175670013501, "grad_norm": 1.8613282442092896, "learning_rate": 4.587983984853893e-05, "loss": 0.9524, "step": 265500 }, { "epoch": 4.1217275252564445, "grad_norm": 2.2432398796081543, "learning_rate": 4.587828799329599e-05, "loss": 0.9628, "step": 265600 }, { "epoch": 4.123279380499387, "grad_norm": 1.9915740489959717, "learning_rate": 4.587673613805305e-05, "loss": 0.9362, "step": 265700 }, { "epoch": 4.12483123574233, "grad_norm": 2.4603967666625977, "learning_rate": 4.5875184282810105e-05, "loss": 0.9494, "step": 265800 }, { "epoch": 4.126383090985273, "grad_norm": 2.690214157104492, "learning_rate": 4.5873632427567156e-05, "loss": 0.9599, "step": 265900 }, { "epoch": 4.1279349462282156, "grad_norm": 2.603281021118164, "learning_rate": 4.5872080572324214e-05, "loss": 0.9484, "step": 266000 }, { "epoch": 4.129486801471159, "grad_norm": 2.3156611919403076, "learning_rate": 4.587052871708127e-05, "loss": 0.9476, "step": 266100 }, { "epoch": 4.131038656714102, "grad_norm": 2.479135513305664, "learning_rate": 4.586897686183833e-05, "loss": 0.949, "step": 266200 }, { "epoch": 4.132590511957044, "grad_norm": 2.0956408977508545, "learning_rate": 4.586742500659539e-05, "loss": 0.9516, "step": 266300 }, { "epoch": 4.1341423671999875, "grad_norm": 2.307621479034424, "learning_rate": 4.5865873151352445e-05, "loss": 0.9443, "step": 266400 }, { "epoch": 4.135694222442931, "grad_norm": 2.1014575958251953, "learning_rate": 4.58643212961095e-05, "loss": 0.9353, "step": 266500 }, { "epoch": 4.137246077685873, "grad_norm": 2.4674599170684814, "learning_rate": 4.586276944086656e-05, "loss": 0.9538, "step": 266600 }, { "epoch": 4.138797932928816, "grad_norm": 2.521721601486206, "learning_rate": 4.586121758562362e-05, "loss": 0.9412, "step": 266700 }, { "epoch": 4.1403497881717595, "grad_norm": 2.613654851913452, "learning_rate": 4.5859665730380676e-05, "loss": 0.9723, "step": 266800 }, { "epoch": 4.141901643414703, "grad_norm": 2.044856548309326, "learning_rate": 4.5858113875137734e-05, "loss": 0.958, "step": 266900 }, { "epoch": 4.143453498657645, "grad_norm": 2.357006072998047, "learning_rate": 4.585656201989479e-05, "loss": 0.9723, "step": 267000 }, { "epoch": 4.145005353900588, "grad_norm": 2.2944085597991943, "learning_rate": 4.585501016465185e-05, "loss": 0.9618, "step": 267100 }, { "epoch": 4.146557209143531, "grad_norm": 2.6169674396514893, "learning_rate": 4.58534583094089e-05, "loss": 0.9426, "step": 267200 }, { "epoch": 4.148109064386474, "grad_norm": 2.6744658946990967, "learning_rate": 4.585190645416596e-05, "loss": 0.9499, "step": 267300 }, { "epoch": 4.149660919629417, "grad_norm": 2.738558530807495, "learning_rate": 4.585035459892301e-05, "loss": 0.9499, "step": 267400 }, { "epoch": 4.15121277487236, "grad_norm": 2.4779956340789795, "learning_rate": 4.584880274368007e-05, "loss": 0.9562, "step": 267500 }, { "epoch": 4.1527646301153025, "grad_norm": 2.2776710987091064, "learning_rate": 4.5847250888437124e-05, "loss": 0.9246, "step": 267600 }, { "epoch": 4.154316485358246, "grad_norm": 2.9284017086029053, "learning_rate": 4.584569903319418e-05, "loss": 0.9732, "step": 267700 }, { "epoch": 4.155868340601189, "grad_norm": 2.4514870643615723, "learning_rate": 4.584414717795124e-05, "loss": 0.9404, "step": 267800 }, { "epoch": 4.157420195844131, "grad_norm": 2.0755503177642822, "learning_rate": 4.58425953227083e-05, "loss": 0.935, "step": 267900 }, { "epoch": 4.1589720510870745, "grad_norm": 2.126408815383911, "learning_rate": 4.5841043467465355e-05, "loss": 0.945, "step": 268000 }, { "epoch": 4.160523906330018, "grad_norm": 1.9894120693206787, "learning_rate": 4.583949161222241e-05, "loss": 0.9489, "step": 268100 }, { "epoch": 4.162075761572961, "grad_norm": 2.2396538257598877, "learning_rate": 4.583793975697947e-05, "loss": 0.9639, "step": 268200 }, { "epoch": 4.163627616815903, "grad_norm": 2.805377244949341, "learning_rate": 4.583638790173653e-05, "loss": 0.9568, "step": 268300 }, { "epoch": 4.165179472058846, "grad_norm": 2.459124803543091, "learning_rate": 4.5834836046493586e-05, "loss": 0.9448, "step": 268400 }, { "epoch": 4.16673132730179, "grad_norm": 2.348426580429077, "learning_rate": 4.5833284191250644e-05, "loss": 0.9423, "step": 268500 }, { "epoch": 4.168283182544732, "grad_norm": 7.3705220222473145, "learning_rate": 4.58317323360077e-05, "loss": 0.9448, "step": 268600 }, { "epoch": 4.169835037787675, "grad_norm": 1.9104797840118408, "learning_rate": 4.583018048076475e-05, "loss": 0.9582, "step": 268700 }, { "epoch": 4.171386893030618, "grad_norm": 2.659911632537842, "learning_rate": 4.582862862552181e-05, "loss": 0.9754, "step": 268800 }, { "epoch": 4.172938748273561, "grad_norm": 2.0570030212402344, "learning_rate": 4.582707677027887e-05, "loss": 0.9393, "step": 268900 }, { "epoch": 4.174490603516504, "grad_norm": 2.2651305198669434, "learning_rate": 4.5825524915035926e-05, "loss": 1.1632, "step": 269000 }, { "epoch": 4.176042458759447, "grad_norm": 2.624347686767578, "learning_rate": 4.5823973059792984e-05, "loss": 0.9678, "step": 269100 }, { "epoch": 4.1775943140023895, "grad_norm": 2.670616865158081, "learning_rate": 4.582242120455004e-05, "loss": 0.9374, "step": 269200 }, { "epoch": 4.179146169245333, "grad_norm": 2.4925973415374756, "learning_rate": 4.58208693493071e-05, "loss": 0.9574, "step": 269300 }, { "epoch": 4.180698024488276, "grad_norm": 2.2327897548675537, "learning_rate": 4.581931749406416e-05, "loss": 0.9445, "step": 269400 }, { "epoch": 4.182249879731219, "grad_norm": 2.8289425373077393, "learning_rate": 4.5817765638821215e-05, "loss": 0.9582, "step": 269500 }, { "epoch": 4.183801734974161, "grad_norm": 2.934558391571045, "learning_rate": 4.581621378357827e-05, "loss": 0.9438, "step": 269600 }, { "epoch": 4.185353590217105, "grad_norm": 2.3160240650177, "learning_rate": 4.581466192833533e-05, "loss": 0.9698, "step": 269700 }, { "epoch": 4.186905445460048, "grad_norm": 2.064438819885254, "learning_rate": 4.581311007309239e-05, "loss": 0.9292, "step": 269800 }, { "epoch": 4.18845730070299, "grad_norm": 4.578567028045654, "learning_rate": 4.5811558217849446e-05, "loss": 0.9423, "step": 269900 }, { "epoch": 4.190009155945933, "grad_norm": 3.4317381381988525, "learning_rate": 4.58100063626065e-05, "loss": 0.9577, "step": 270000 }, { "epoch": 4.191561011188877, "grad_norm": 2.530855178833008, "learning_rate": 4.5808454507363555e-05, "loss": 0.9535, "step": 270100 }, { "epoch": 4.193112866431819, "grad_norm": 2.526818037033081, "learning_rate": 4.580690265212061e-05, "loss": 0.9522, "step": 270200 }, { "epoch": 4.194664721674762, "grad_norm": 2.293144464492798, "learning_rate": 4.580535079687767e-05, "loss": 0.9571, "step": 270300 }, { "epoch": 4.196216576917705, "grad_norm": 2.391810178756714, "learning_rate": 4.580379894163473e-05, "loss": 0.9681, "step": 270400 }, { "epoch": 4.197768432160648, "grad_norm": 2.5771522521972656, "learning_rate": 4.5802247086391786e-05, "loss": 0.9534, "step": 270500 }, { "epoch": 4.199320287403591, "grad_norm": 2.530381202697754, "learning_rate": 4.580069523114884e-05, "loss": 0.9644, "step": 270600 }, { "epoch": 4.200872142646534, "grad_norm": 2.508147954940796, "learning_rate": 4.5799143375905894e-05, "loss": 0.9626, "step": 270700 }, { "epoch": 4.202423997889477, "grad_norm": 1.882773518562317, "learning_rate": 4.579759152066295e-05, "loss": 0.9655, "step": 270800 }, { "epoch": 4.20397585313242, "grad_norm": 2.1093831062316895, "learning_rate": 4.579603966542001e-05, "loss": 0.9388, "step": 270900 }, { "epoch": 4.205527708375363, "grad_norm": 2.379361152648926, "learning_rate": 4.579448781017707e-05, "loss": 0.9742, "step": 271000 }, { "epoch": 4.207079563618306, "grad_norm": 2.439324378967285, "learning_rate": 4.5792935954934125e-05, "loss": 0.9535, "step": 271100 }, { "epoch": 4.208631418861248, "grad_norm": 2.253000259399414, "learning_rate": 4.579138409969118e-05, "loss": 0.9481, "step": 271200 }, { "epoch": 4.210183274104192, "grad_norm": 2.31129789352417, "learning_rate": 4.578983224444824e-05, "loss": 0.9459, "step": 271300 }, { "epoch": 4.211735129347135, "grad_norm": 2.606315851211548, "learning_rate": 4.57882803892053e-05, "loss": 0.9503, "step": 271400 }, { "epoch": 4.213286984590077, "grad_norm": 2.398406505584717, "learning_rate": 4.5786728533962356e-05, "loss": 0.9558, "step": 271500 }, { "epoch": 4.21483883983302, "grad_norm": 2.3068346977233887, "learning_rate": 4.578517667871941e-05, "loss": 0.9527, "step": 271600 }, { "epoch": 4.2163906950759635, "grad_norm": 2.162580966949463, "learning_rate": 4.5783624823476465e-05, "loss": 0.9429, "step": 271700 }, { "epoch": 4.217942550318906, "grad_norm": 2.755873203277588, "learning_rate": 4.578207296823352e-05, "loss": 0.9558, "step": 271800 }, { "epoch": 4.219494405561849, "grad_norm": 2.4446914196014404, "learning_rate": 4.578052111299058e-05, "loss": 0.9701, "step": 271900 }, { "epoch": 4.221046260804792, "grad_norm": 2.6638333797454834, "learning_rate": 4.577896925774764e-05, "loss": 0.9738, "step": 272000 }, { "epoch": 4.2225981160477355, "grad_norm": 2.19158673286438, "learning_rate": 4.5777417402504696e-05, "loss": 0.961, "step": 272100 }, { "epoch": 4.224149971290678, "grad_norm": 2.5914907455444336, "learning_rate": 4.5775865547261754e-05, "loss": 0.9371, "step": 272200 }, { "epoch": 4.225701826533621, "grad_norm": 2.4796886444091797, "learning_rate": 4.577431369201881e-05, "loss": 0.9613, "step": 272300 }, { "epoch": 4.227253681776564, "grad_norm": 2.333012104034424, "learning_rate": 4.577276183677587e-05, "loss": 0.9721, "step": 272400 }, { "epoch": 4.228805537019507, "grad_norm": 1.7639623880386353, "learning_rate": 4.577120998153293e-05, "loss": 0.9708, "step": 272500 }, { "epoch": 4.23035739226245, "grad_norm": 2.280776023864746, "learning_rate": 4.5769658126289985e-05, "loss": 0.9523, "step": 272600 }, { "epoch": 4.231909247505393, "grad_norm": 2.4776875972747803, "learning_rate": 4.576810627104704e-05, "loss": 0.9525, "step": 272700 }, { "epoch": 4.233461102748335, "grad_norm": 2.3590919971466064, "learning_rate": 4.57665544158041e-05, "loss": 0.9659, "step": 272800 }, { "epoch": 4.2350129579912785, "grad_norm": 2.1749846935272217, "learning_rate": 4.576500256056115e-05, "loss": 0.968, "step": 272900 }, { "epoch": 4.236564813234222, "grad_norm": 2.3962395191192627, "learning_rate": 4.576345070531821e-05, "loss": 0.9536, "step": 273000 }, { "epoch": 4.238116668477164, "grad_norm": 2.3158106803894043, "learning_rate": 4.576189885007527e-05, "loss": 0.9646, "step": 273100 }, { "epoch": 4.239668523720107, "grad_norm": 2.5484776496887207, "learning_rate": 4.5760346994832325e-05, "loss": 0.9743, "step": 273200 }, { "epoch": 4.2412203789630505, "grad_norm": 2.684412717819214, "learning_rate": 4.575879513958938e-05, "loss": 0.9622, "step": 273300 }, { "epoch": 4.242772234205994, "grad_norm": 2.8247079849243164, "learning_rate": 4.575724328434644e-05, "loss": 0.9677, "step": 273400 }, { "epoch": 4.244324089448936, "grad_norm": 2.805833339691162, "learning_rate": 4.57556914291035e-05, "loss": 0.9617, "step": 273500 }, { "epoch": 4.245875944691879, "grad_norm": 2.4337339401245117, "learning_rate": 4.5754139573860556e-05, "loss": 0.9548, "step": 273600 }, { "epoch": 4.2474277999348224, "grad_norm": 2.889346122741699, "learning_rate": 4.5752587718617613e-05, "loss": 0.9509, "step": 273700 }, { "epoch": 4.248979655177765, "grad_norm": 2.3846325874328613, "learning_rate": 4.5751035863374664e-05, "loss": 0.9638, "step": 273800 }, { "epoch": 4.250531510420708, "grad_norm": 2.4157638549804688, "learning_rate": 4.574948400813172e-05, "loss": 0.9543, "step": 273900 }, { "epoch": 4.252083365663651, "grad_norm": 2.022716999053955, "learning_rate": 4.574793215288878e-05, "loss": 0.9448, "step": 274000 }, { "epoch": 4.2536352209065935, "grad_norm": 2.908731460571289, "learning_rate": 4.574638029764584e-05, "loss": 0.9594, "step": 274100 }, { "epoch": 4.255187076149537, "grad_norm": 2.226807117462158, "learning_rate": 4.5744828442402895e-05, "loss": 0.9756, "step": 274200 }, { "epoch": 4.25673893139248, "grad_norm": 2.576692581176758, "learning_rate": 4.574327658715995e-05, "loss": 0.9523, "step": 274300 }, { "epoch": 4.258290786635422, "grad_norm": 2.2781801223754883, "learning_rate": 4.5741724731917004e-05, "loss": 0.9495, "step": 274400 }, { "epoch": 4.2598426418783655, "grad_norm": 2.221142530441284, "learning_rate": 4.574017287667406e-05, "loss": 0.9571, "step": 274500 }, { "epoch": 4.261394497121309, "grad_norm": 2.289745807647705, "learning_rate": 4.573862102143112e-05, "loss": 0.9763, "step": 274600 }, { "epoch": 4.262946352364251, "grad_norm": 2.5037693977355957, "learning_rate": 4.573706916618818e-05, "loss": 0.9671, "step": 274700 }, { "epoch": 4.264498207607194, "grad_norm": 2.6745574474334717, "learning_rate": 4.5735517310945235e-05, "loss": 0.937, "step": 274800 }, { "epoch": 4.266050062850137, "grad_norm": 2.435272455215454, "learning_rate": 4.573396545570229e-05, "loss": 0.9824, "step": 274900 }, { "epoch": 4.267601918093081, "grad_norm": 2.356067657470703, "learning_rate": 4.573241360045935e-05, "loss": 0.9546, "step": 275000 }, { "epoch": 4.269153773336023, "grad_norm": 2.430814266204834, "learning_rate": 4.573086174521641e-05, "loss": 0.9831, "step": 275100 }, { "epoch": 4.270705628578966, "grad_norm": 2.264638662338257, "learning_rate": 4.5729309889973466e-05, "loss": 0.9575, "step": 275200 }, { "epoch": 4.272257483821909, "grad_norm": 1.7676955461502075, "learning_rate": 4.5727758034730524e-05, "loss": 0.9613, "step": 275300 }, { "epoch": 4.273809339064852, "grad_norm": 2.126235008239746, "learning_rate": 4.572620617948758e-05, "loss": 0.9539, "step": 275400 }, { "epoch": 4.275361194307795, "grad_norm": 2.411301612854004, "learning_rate": 4.572465432424464e-05, "loss": 0.9651, "step": 275500 }, { "epoch": 4.276913049550738, "grad_norm": 2.7028989791870117, "learning_rate": 4.57231024690017e-05, "loss": 0.968, "step": 275600 }, { "epoch": 4.2784649047936805, "grad_norm": 1.8828054666519165, "learning_rate": 4.572155061375875e-05, "loss": 0.9357, "step": 275700 }, { "epoch": 4.280016760036624, "grad_norm": 2.412313461303711, "learning_rate": 4.5719998758515806e-05, "loss": 0.9666, "step": 275800 }, { "epoch": 4.281568615279567, "grad_norm": 2.7358639240264893, "learning_rate": 4.5718446903272864e-05, "loss": 0.9387, "step": 275900 }, { "epoch": 4.28312047052251, "grad_norm": 1.9869111776351929, "learning_rate": 4.571689504802992e-05, "loss": 0.9571, "step": 276000 }, { "epoch": 4.284672325765452, "grad_norm": 2.7130167484283447, "learning_rate": 4.571534319278698e-05, "loss": 0.9527, "step": 276100 }, { "epoch": 4.286224181008396, "grad_norm": 2.7819483280181885, "learning_rate": 4.571379133754404e-05, "loss": 0.9592, "step": 276200 }, { "epoch": 4.287776036251339, "grad_norm": 2.369662284851074, "learning_rate": 4.5712239482301095e-05, "loss": 0.9724, "step": 276300 }, { "epoch": 4.289327891494281, "grad_norm": 2.6673128604888916, "learning_rate": 4.571068762705815e-05, "loss": 0.9476, "step": 276400 }, { "epoch": 4.290879746737224, "grad_norm": 1.7957216501235962, "learning_rate": 4.570913577181521e-05, "loss": 0.9698, "step": 276500 }, { "epoch": 4.292431601980168, "grad_norm": 2.786407947540283, "learning_rate": 4.570758391657227e-05, "loss": 0.9555, "step": 276600 }, { "epoch": 4.29398345722311, "grad_norm": 2.709216356277466, "learning_rate": 4.5706032061329326e-05, "loss": 0.961, "step": 276700 }, { "epoch": 4.295535312466053, "grad_norm": 2.6192033290863037, "learning_rate": 4.5704480206086383e-05, "loss": 0.9521, "step": 276800 }, { "epoch": 4.297087167708996, "grad_norm": 3.3531854152679443, "learning_rate": 4.570292835084344e-05, "loss": 0.9624, "step": 276900 }, { "epoch": 4.298639022951939, "grad_norm": 2.5440030097961426, "learning_rate": 4.570137649560049e-05, "loss": 0.9469, "step": 277000 }, { "epoch": 4.300190878194882, "grad_norm": 2.612758159637451, "learning_rate": 4.569982464035755e-05, "loss": 0.996, "step": 277100 }, { "epoch": 4.301742733437825, "grad_norm": 2.155972480773926, "learning_rate": 4.56982727851146e-05, "loss": 0.9674, "step": 277200 }, { "epoch": 4.303294588680767, "grad_norm": 2.8976945877075195, "learning_rate": 4.569672092987166e-05, "loss": 0.9657, "step": 277300 }, { "epoch": 4.304846443923711, "grad_norm": 2.501742124557495, "learning_rate": 4.5695169074628716e-05, "loss": 0.9778, "step": 277400 }, { "epoch": 4.306398299166654, "grad_norm": 2.19927716255188, "learning_rate": 4.5693617219385774e-05, "loss": 0.9698, "step": 277500 }, { "epoch": 4.307950154409597, "grad_norm": 2.218656063079834, "learning_rate": 4.569206536414283e-05, "loss": 0.9513, "step": 277600 }, { "epoch": 4.309502009652539, "grad_norm": 2.7306482791900635, "learning_rate": 4.569051350889989e-05, "loss": 0.9283, "step": 277700 }, { "epoch": 4.311053864895483, "grad_norm": 2.5488851070404053, "learning_rate": 4.568896165365695e-05, "loss": 0.9436, "step": 277800 }, { "epoch": 4.312605720138426, "grad_norm": 2.3247628211975098, "learning_rate": 4.5687409798414005e-05, "loss": 0.9419, "step": 277900 }, { "epoch": 4.314157575381368, "grad_norm": 2.8633854389190674, "learning_rate": 4.568585794317106e-05, "loss": 0.9298, "step": 278000 }, { "epoch": 4.315709430624311, "grad_norm": 2.88370943069458, "learning_rate": 4.568430608792812e-05, "loss": 0.9648, "step": 278100 }, { "epoch": 4.3172612858672545, "grad_norm": 2.5213725566864014, "learning_rate": 4.568275423268518e-05, "loss": 0.9672, "step": 278200 }, { "epoch": 4.318813141110197, "grad_norm": 2.6202690601348877, "learning_rate": 4.5681202377442236e-05, "loss": 0.9494, "step": 278300 }, { "epoch": 4.32036499635314, "grad_norm": 2.5022921562194824, "learning_rate": 4.5679650522199294e-05, "loss": 0.9603, "step": 278400 }, { "epoch": 4.321916851596083, "grad_norm": 2.170469045639038, "learning_rate": 4.5678098666956345e-05, "loss": 0.9339, "step": 278500 }, { "epoch": 4.3234687068390265, "grad_norm": 2.3554317951202393, "learning_rate": 4.56765468117134e-05, "loss": 0.9456, "step": 278600 }, { "epoch": 4.325020562081969, "grad_norm": 2.4357564449310303, "learning_rate": 4.567499495647046e-05, "loss": 0.9698, "step": 278700 }, { "epoch": 4.326572417324912, "grad_norm": 1.7803012132644653, "learning_rate": 4.567344310122752e-05, "loss": 0.959, "step": 278800 }, { "epoch": 4.328124272567855, "grad_norm": 2.2549099922180176, "learning_rate": 4.5671891245984576e-05, "loss": 0.9486, "step": 278900 }, { "epoch": 4.329676127810798, "grad_norm": 2.734562873840332, "learning_rate": 4.5670339390741634e-05, "loss": 0.9444, "step": 279000 }, { "epoch": 4.331227983053741, "grad_norm": 2.289163589477539, "learning_rate": 4.566878753549869e-05, "loss": 0.965, "step": 279100 }, { "epoch": 4.332779838296684, "grad_norm": 1.9174065589904785, "learning_rate": 4.566723568025575e-05, "loss": 0.9349, "step": 279200 }, { "epoch": 4.334331693539626, "grad_norm": 1.9888296127319336, "learning_rate": 4.566568382501281e-05, "loss": 0.9415, "step": 279300 }, { "epoch": 4.3358835487825695, "grad_norm": 2.123837471008301, "learning_rate": 4.5664131969769865e-05, "loss": 0.964, "step": 279400 }, { "epoch": 4.337435404025513, "grad_norm": 2.5012216567993164, "learning_rate": 4.566258011452692e-05, "loss": 0.9478, "step": 279500 }, { "epoch": 4.338987259268455, "grad_norm": 2.098836898803711, "learning_rate": 4.566102825928398e-05, "loss": 0.968, "step": 279600 }, { "epoch": 4.340539114511398, "grad_norm": 2.489276885986328, "learning_rate": 4.565947640404104e-05, "loss": 0.9527, "step": 279700 }, { "epoch": 4.3420909697543415, "grad_norm": 3.6984362602233887, "learning_rate": 4.565792454879809e-05, "loss": 0.9481, "step": 279800 }, { "epoch": 4.343642824997284, "grad_norm": 2.2199084758758545, "learning_rate": 4.565637269355515e-05, "loss": 0.9344, "step": 279900 }, { "epoch": 4.345194680240227, "grad_norm": 2.3492956161499023, "learning_rate": 4.5654820838312204e-05, "loss": 0.9589, "step": 280000 }, { "epoch": 4.34674653548317, "grad_norm": 2.2888975143432617, "learning_rate": 4.565326898306926e-05, "loss": 0.9485, "step": 280100 }, { "epoch": 4.3482983907261135, "grad_norm": 2.5660147666931152, "learning_rate": 4.565171712782632e-05, "loss": 0.9675, "step": 280200 }, { "epoch": 4.349850245969056, "grad_norm": 2.152362108230591, "learning_rate": 4.565016527258338e-05, "loss": 0.9375, "step": 280300 }, { "epoch": 4.351402101211999, "grad_norm": 2.405099868774414, "learning_rate": 4.564861341734043e-05, "loss": 0.9475, "step": 280400 }, { "epoch": 4.352953956454942, "grad_norm": 2.3458845615386963, "learning_rate": 4.5647061562097486e-05, "loss": 0.9532, "step": 280500 }, { "epoch": 4.3545058116978845, "grad_norm": 2.5165176391601562, "learning_rate": 4.5645509706854544e-05, "loss": 0.9527, "step": 280600 }, { "epoch": 4.356057666940828, "grad_norm": 2.687025308609009, "learning_rate": 4.56439578516116e-05, "loss": 0.9522, "step": 280700 }, { "epoch": 4.357609522183771, "grad_norm": 2.1509432792663574, "learning_rate": 4.564240599636866e-05, "loss": 0.9723, "step": 280800 }, { "epoch": 4.359161377426713, "grad_norm": 2.0782275199890137, "learning_rate": 4.564085414112572e-05, "loss": 0.96, "step": 280900 }, { "epoch": 4.3607132326696565, "grad_norm": 2.61387300491333, "learning_rate": 4.5639302285882775e-05, "loss": 0.9544, "step": 281000 }, { "epoch": 4.3622650879126, "grad_norm": 2.2484211921691895, "learning_rate": 4.563775043063983e-05, "loss": 0.955, "step": 281100 }, { "epoch": 4.363816943155543, "grad_norm": 2.8256936073303223, "learning_rate": 4.563619857539689e-05, "loss": 0.9531, "step": 281200 }, { "epoch": 4.365368798398485, "grad_norm": 2.4078118801116943, "learning_rate": 4.563464672015395e-05, "loss": 0.9625, "step": 281300 }, { "epoch": 4.3669206536414285, "grad_norm": 2.2373712062835693, "learning_rate": 4.5633094864911e-05, "loss": 0.9599, "step": 281400 }, { "epoch": 4.368472508884372, "grad_norm": 3.020334482192993, "learning_rate": 4.563154300966806e-05, "loss": 0.9629, "step": 281500 }, { "epoch": 4.370024364127314, "grad_norm": 2.8539576530456543, "learning_rate": 4.5629991154425115e-05, "loss": 0.9445, "step": 281600 }, { "epoch": 4.371576219370257, "grad_norm": 2.699276924133301, "learning_rate": 4.562843929918217e-05, "loss": 0.9478, "step": 281700 }, { "epoch": 4.3731280746132, "grad_norm": 2.430405616760254, "learning_rate": 4.562688744393923e-05, "loss": 0.9449, "step": 281800 }, { "epoch": 4.374679929856143, "grad_norm": 2.6623058319091797, "learning_rate": 4.562533558869629e-05, "loss": 0.9585, "step": 281900 }, { "epoch": 4.376231785099086, "grad_norm": 2.2928028106689453, "learning_rate": 4.5623783733453346e-05, "loss": 0.9313, "step": 282000 }, { "epoch": 4.377783640342029, "grad_norm": 2.285165309906006, "learning_rate": 4.5622231878210404e-05, "loss": 0.9512, "step": 282100 }, { "epoch": 4.3793354955849715, "grad_norm": 2.3999366760253906, "learning_rate": 4.562068002296746e-05, "loss": 0.9599, "step": 282200 }, { "epoch": 4.380887350827915, "grad_norm": 2.258582592010498, "learning_rate": 4.561912816772452e-05, "loss": 0.9414, "step": 282300 }, { "epoch": 4.382439206070858, "grad_norm": 2.4302546977996826, "learning_rate": 4.561757631248158e-05, "loss": 0.9555, "step": 282400 }, { "epoch": 4.3839910613138, "grad_norm": 2.2792060375213623, "learning_rate": 4.5616024457238635e-05, "loss": 0.9635, "step": 282500 }, { "epoch": 4.385542916556743, "grad_norm": 2.685861110687256, "learning_rate": 4.561447260199569e-05, "loss": 0.9618, "step": 282600 }, { "epoch": 4.387094771799687, "grad_norm": 2.133270740509033, "learning_rate": 4.5612920746752743e-05, "loss": 0.9662, "step": 282700 }, { "epoch": 4.38864662704263, "grad_norm": 2.4342262744903564, "learning_rate": 4.56113688915098e-05, "loss": 0.9497, "step": 282800 }, { "epoch": 4.390198482285572, "grad_norm": 2.6978423595428467, "learning_rate": 4.560981703626686e-05, "loss": 0.9619, "step": 282900 }, { "epoch": 4.391750337528515, "grad_norm": 2.350912570953369, "learning_rate": 4.560826518102392e-05, "loss": 0.9562, "step": 283000 }, { "epoch": 4.393302192771459, "grad_norm": 2.67197585105896, "learning_rate": 4.5606713325780974e-05, "loss": 0.9667, "step": 283100 }, { "epoch": 4.394854048014401, "grad_norm": 2.1743009090423584, "learning_rate": 4.560516147053803e-05, "loss": 0.941, "step": 283200 }, { "epoch": 4.396405903257344, "grad_norm": 2.261115789413452, "learning_rate": 4.560360961529509e-05, "loss": 0.9433, "step": 283300 }, { "epoch": 4.397957758500287, "grad_norm": 2.6397945880889893, "learning_rate": 4.560205776005215e-05, "loss": 0.9623, "step": 283400 }, { "epoch": 4.39950961374323, "grad_norm": 2.2276105880737305, "learning_rate": 4.5600505904809205e-05, "loss": 0.9815, "step": 283500 }, { "epoch": 4.401061468986173, "grad_norm": 2.596635103225708, "learning_rate": 4.5598954049566256e-05, "loss": 0.9837, "step": 283600 }, { "epoch": 4.402613324229116, "grad_norm": 2.133366107940674, "learning_rate": 4.5597402194323314e-05, "loss": 0.961, "step": 283700 }, { "epoch": 4.404165179472058, "grad_norm": 2.441610336303711, "learning_rate": 4.559585033908037e-05, "loss": 0.9761, "step": 283800 }, { "epoch": 4.405717034715002, "grad_norm": 2.054492950439453, "learning_rate": 4.559429848383743e-05, "loss": 0.9926, "step": 283900 }, { "epoch": 4.407268889957945, "grad_norm": 2.919877529144287, "learning_rate": 4.559274662859449e-05, "loss": 1.0104, "step": 284000 }, { "epoch": 4.408820745200888, "grad_norm": 2.5291271209716797, "learning_rate": 4.5591194773351545e-05, "loss": 0.9525, "step": 284100 }, { "epoch": 4.41037260044383, "grad_norm": 2.3134307861328125, "learning_rate": 4.5589642918108596e-05, "loss": 0.9739, "step": 284200 }, { "epoch": 4.411924455686774, "grad_norm": 2.035604476928711, "learning_rate": 4.5588091062865654e-05, "loss": 0.9858, "step": 284300 }, { "epoch": 4.413476310929717, "grad_norm": 2.245612859725952, "learning_rate": 4.558653920762271e-05, "loss": 0.9898, "step": 284400 }, { "epoch": 4.415028166172659, "grad_norm": 2.2305281162261963, "learning_rate": 4.558498735237977e-05, "loss": 0.9701, "step": 284500 }, { "epoch": 4.416580021415602, "grad_norm": 3.6836352348327637, "learning_rate": 4.558343549713683e-05, "loss": 0.9607, "step": 284600 }, { "epoch": 4.418131876658546, "grad_norm": 2.694596767425537, "learning_rate": 4.5581883641893885e-05, "loss": 0.9968, "step": 284700 }, { "epoch": 4.419683731901488, "grad_norm": 2.551436185836792, "learning_rate": 4.558033178665094e-05, "loss": 0.9794, "step": 284800 }, { "epoch": 4.421235587144431, "grad_norm": 2.4205543994903564, "learning_rate": 4.5578779931408e-05, "loss": 0.9865, "step": 284900 }, { "epoch": 4.422787442387374, "grad_norm": 3.1106441020965576, "learning_rate": 4.557722807616506e-05, "loss": 0.9577, "step": 285000 }, { "epoch": 4.424339297630317, "grad_norm": 1.907063364982605, "learning_rate": 4.5575676220922116e-05, "loss": 0.9589, "step": 285100 }, { "epoch": 4.42589115287326, "grad_norm": 2.657801389694214, "learning_rate": 4.5574124365679174e-05, "loss": 0.9899, "step": 285200 }, { "epoch": 4.427443008116203, "grad_norm": 2.603607654571533, "learning_rate": 4.557257251043623e-05, "loss": 0.9655, "step": 285300 }, { "epoch": 4.428994863359146, "grad_norm": 2.6950347423553467, "learning_rate": 4.557102065519329e-05, "loss": 0.9689, "step": 285400 }, { "epoch": 4.430546718602089, "grad_norm": 2.734130859375, "learning_rate": 4.556946879995034e-05, "loss": 0.9739, "step": 285500 }, { "epoch": 4.432098573845032, "grad_norm": 2.2186355590820312, "learning_rate": 4.55679169447074e-05, "loss": 0.9671, "step": 285600 }, { "epoch": 4.433650429087975, "grad_norm": 2.014082193374634, "learning_rate": 4.5566365089464456e-05, "loss": 0.9867, "step": 285700 }, { "epoch": 4.435202284330917, "grad_norm": 2.2343204021453857, "learning_rate": 4.5564813234221513e-05, "loss": 1.0036, "step": 285800 }, { "epoch": 4.4367541395738606, "grad_norm": 2.3113603591918945, "learning_rate": 4.556326137897857e-05, "loss": 0.9976, "step": 285900 }, { "epoch": 4.438305994816804, "grad_norm": 2.7611892223358154, "learning_rate": 4.556170952373563e-05, "loss": 0.9899, "step": 286000 }, { "epoch": 4.439857850059746, "grad_norm": 2.336049795150757, "learning_rate": 4.556015766849269e-05, "loss": 0.9466, "step": 286100 }, { "epoch": 4.441409705302689, "grad_norm": 2.6210458278656006, "learning_rate": 4.5558605813249744e-05, "loss": 1.0042, "step": 286200 }, { "epoch": 4.4429615605456325, "grad_norm": 2.65708065032959, "learning_rate": 4.55570539580068e-05, "loss": 1.1277, "step": 286300 }, { "epoch": 4.444513415788575, "grad_norm": 3.8349111080169678, "learning_rate": 4.555550210276386e-05, "loss": 1.1484, "step": 286400 }, { "epoch": 4.446065271031518, "grad_norm": 3.0189480781555176, "learning_rate": 4.555395024752092e-05, "loss": 1.1295, "step": 286500 }, { "epoch": 4.447617126274461, "grad_norm": 2.3178908824920654, "learning_rate": 4.5552398392277975e-05, "loss": 1.0596, "step": 286600 }, { "epoch": 4.4491689815174045, "grad_norm": 2.6704437732696533, "learning_rate": 4.555084653703503e-05, "loss": 1.0096, "step": 286700 }, { "epoch": 4.450720836760347, "grad_norm": 2.3096249103546143, "learning_rate": 4.5549294681792084e-05, "loss": 1.0619, "step": 286800 }, { "epoch": 4.45227269200329, "grad_norm": 2.5734503269195557, "learning_rate": 4.554774282654914e-05, "loss": 1.0077, "step": 286900 }, { "epoch": 4.453824547246233, "grad_norm": 2.437303304672241, "learning_rate": 4.55461909713062e-05, "loss": 0.9796, "step": 287000 }, { "epoch": 4.4553764024891755, "grad_norm": 2.611910581588745, "learning_rate": 4.554463911606325e-05, "loss": 0.9586, "step": 287100 }, { "epoch": 4.456928257732119, "grad_norm": 2.305626392364502, "learning_rate": 4.554308726082031e-05, "loss": 0.9718, "step": 287200 }, { "epoch": 4.458480112975062, "grad_norm": 2.1554229259490967, "learning_rate": 4.5541535405577366e-05, "loss": 0.9449, "step": 287300 }, { "epoch": 4.460031968218004, "grad_norm": 2.8325657844543457, "learning_rate": 4.5539983550334424e-05, "loss": 0.9667, "step": 287400 }, { "epoch": 4.4615838234609475, "grad_norm": 2.755063772201538, "learning_rate": 4.553843169509148e-05, "loss": 0.9734, "step": 287500 }, { "epoch": 4.463135678703891, "grad_norm": 2.6837873458862305, "learning_rate": 4.553687983984854e-05, "loss": 0.9614, "step": 287600 }, { "epoch": 4.464687533946833, "grad_norm": 2.83879017829895, "learning_rate": 4.55353279846056e-05, "loss": 0.9633, "step": 287700 }, { "epoch": 4.466239389189776, "grad_norm": 2.18794846534729, "learning_rate": 4.5533776129362655e-05, "loss": 0.9608, "step": 287800 }, { "epoch": 4.4677912444327195, "grad_norm": 13.268120765686035, "learning_rate": 4.553222427411971e-05, "loss": 0.9557, "step": 287900 }, { "epoch": 4.469343099675662, "grad_norm": 2.374342918395996, "learning_rate": 4.553067241887677e-05, "loss": 0.9642, "step": 288000 }, { "epoch": 4.470894954918605, "grad_norm": 2.8094639778137207, "learning_rate": 4.552912056363383e-05, "loss": 0.9948, "step": 288100 }, { "epoch": 4.472446810161548, "grad_norm": 2.5334606170654297, "learning_rate": 4.5527568708390886e-05, "loss": 0.9939, "step": 288200 }, { "epoch": 4.473998665404491, "grad_norm": 24.929378509521484, "learning_rate": 4.5526016853147944e-05, "loss": 0.9642, "step": 288300 }, { "epoch": 4.475550520647434, "grad_norm": 2.720463514328003, "learning_rate": 4.5524464997904995e-05, "loss": 0.9571, "step": 288400 }, { "epoch": 4.477102375890377, "grad_norm": 2.1569221019744873, "learning_rate": 4.552291314266205e-05, "loss": 0.9517, "step": 288500 }, { "epoch": 4.47865423113332, "grad_norm": 2.8312792778015137, "learning_rate": 4.552136128741911e-05, "loss": 0.9537, "step": 288600 }, { "epoch": 4.4802060863762625, "grad_norm": 2.6346993446350098, "learning_rate": 4.551980943217617e-05, "loss": 0.9472, "step": 288700 }, { "epoch": 4.481757941619206, "grad_norm": 2.1948652267456055, "learning_rate": 4.5518257576933226e-05, "loss": 0.9507, "step": 288800 }, { "epoch": 4.483309796862149, "grad_norm": 2.6359076499938965, "learning_rate": 4.5516705721690283e-05, "loss": 0.9803, "step": 288900 }, { "epoch": 4.484861652105091, "grad_norm": 2.1034648418426514, "learning_rate": 4.551515386644734e-05, "loss": 0.9628, "step": 289000 }, { "epoch": 4.4864135073480345, "grad_norm": 2.4847071170806885, "learning_rate": 4.55136020112044e-05, "loss": 0.9471, "step": 289100 }, { "epoch": 4.487965362590978, "grad_norm": 2.298570394515991, "learning_rate": 4.551205015596146e-05, "loss": 0.9461, "step": 289200 }, { "epoch": 4.489517217833921, "grad_norm": 2.590711832046509, "learning_rate": 4.5510498300718514e-05, "loss": 0.96, "step": 289300 }, { "epoch": 4.491069073076863, "grad_norm": 2.4945592880249023, "learning_rate": 4.550894644547557e-05, "loss": 0.956, "step": 289400 }, { "epoch": 4.492620928319806, "grad_norm": 1.8763295412063599, "learning_rate": 4.550739459023263e-05, "loss": 0.9522, "step": 289500 }, { "epoch": 4.49417278356275, "grad_norm": 2.8543031215667725, "learning_rate": 4.550584273498969e-05, "loss": 0.9546, "step": 289600 }, { "epoch": 4.495724638805692, "grad_norm": 2.281874418258667, "learning_rate": 4.550429087974674e-05, "loss": 0.9648, "step": 289700 }, { "epoch": 4.497276494048635, "grad_norm": 2.9090754985809326, "learning_rate": 4.5502739024503796e-05, "loss": 0.9744, "step": 289800 }, { "epoch": 4.498828349291578, "grad_norm": 2.0620641708374023, "learning_rate": 4.5501187169260854e-05, "loss": 0.9572, "step": 289900 }, { "epoch": 4.500380204534521, "grad_norm": 3.9979889392852783, "learning_rate": 4.549963531401791e-05, "loss": 0.9438, "step": 290000 }, { "epoch": 4.501932059777464, "grad_norm": 1.9883822202682495, "learning_rate": 4.549808345877496e-05, "loss": 0.9576, "step": 290100 }, { "epoch": 4.503483915020407, "grad_norm": 2.3920834064483643, "learning_rate": 4.549653160353202e-05, "loss": 0.9875, "step": 290200 }, { "epoch": 4.5050357702633494, "grad_norm": 2.703366994857788, "learning_rate": 4.549497974828908e-05, "loss": 0.946, "step": 290300 }, { "epoch": 4.506587625506293, "grad_norm": 2.415745496749878, "learning_rate": 4.5493427893046136e-05, "loss": 0.9463, "step": 290400 }, { "epoch": 4.508139480749236, "grad_norm": 2.2152099609375, "learning_rate": 4.5491876037803194e-05, "loss": 0.954, "step": 290500 }, { "epoch": 4.509691335992178, "grad_norm": 2.2804019451141357, "learning_rate": 4.549032418256025e-05, "loss": 0.9561, "step": 290600 }, { "epoch": 4.511243191235121, "grad_norm": 2.0123751163482666, "learning_rate": 4.548877232731731e-05, "loss": 0.9745, "step": 290700 }, { "epoch": 4.512795046478065, "grad_norm": 2.2292745113372803, "learning_rate": 4.548722047207437e-05, "loss": 0.9514, "step": 290800 }, { "epoch": 4.514346901721008, "grad_norm": 2.4912354946136475, "learning_rate": 4.5485668616831425e-05, "loss": 0.9609, "step": 290900 }, { "epoch": 4.51589875696395, "grad_norm": 2.124375820159912, "learning_rate": 4.548411676158848e-05, "loss": 0.9581, "step": 291000 }, { "epoch": 4.517450612206893, "grad_norm": 2.3391268253326416, "learning_rate": 4.548256490634554e-05, "loss": 0.9557, "step": 291100 }, { "epoch": 4.519002467449837, "grad_norm": 2.5891411304473877, "learning_rate": 4.548101305110259e-05, "loss": 0.9565, "step": 291200 }, { "epoch": 4.520554322692779, "grad_norm": 2.0856451988220215, "learning_rate": 4.547946119585965e-05, "loss": 0.97, "step": 291300 }, { "epoch": 4.522106177935722, "grad_norm": 2.811493158340454, "learning_rate": 4.547790934061671e-05, "loss": 0.95, "step": 291400 }, { "epoch": 4.523658033178665, "grad_norm": 2.005014419555664, "learning_rate": 4.5476357485373765e-05, "loss": 0.9558, "step": 291500 }, { "epoch": 4.525209888421608, "grad_norm": 2.7229719161987305, "learning_rate": 4.547480563013082e-05, "loss": 0.9701, "step": 291600 }, { "epoch": 4.526761743664551, "grad_norm": 2.5716304779052734, "learning_rate": 4.547325377488788e-05, "loss": 0.9476, "step": 291700 }, { "epoch": 4.528313598907494, "grad_norm": 2.1300039291381836, "learning_rate": 4.547170191964494e-05, "loss": 0.9799, "step": 291800 }, { "epoch": 4.529865454150437, "grad_norm": 2.5099546909332275, "learning_rate": 4.5470150064401996e-05, "loss": 0.9616, "step": 291900 }, { "epoch": 4.53141730939338, "grad_norm": 2.9376399517059326, "learning_rate": 4.5468598209159053e-05, "loss": 0.9377, "step": 292000 }, { "epoch": 4.532969164636323, "grad_norm": 2.428426742553711, "learning_rate": 4.546704635391611e-05, "loss": 0.9682, "step": 292100 }, { "epoch": 4.534521019879266, "grad_norm": 2.299015998840332, "learning_rate": 4.546549449867317e-05, "loss": 0.941, "step": 292200 }, { "epoch": 4.536072875122208, "grad_norm": 2.4338276386260986, "learning_rate": 4.546394264343023e-05, "loss": 0.9397, "step": 292300 }, { "epoch": 4.537624730365152, "grad_norm": 2.5478644371032715, "learning_rate": 4.5462390788187284e-05, "loss": 0.9628, "step": 292400 }, { "epoch": 4.539176585608095, "grad_norm": 2.269930124282837, "learning_rate": 4.5460838932944335e-05, "loss": 0.9599, "step": 292500 }, { "epoch": 4.540728440851037, "grad_norm": 2.3922383785247803, "learning_rate": 4.545928707770139e-05, "loss": 0.9239, "step": 292600 }, { "epoch": 4.54228029609398, "grad_norm": 2.2027087211608887, "learning_rate": 4.545773522245845e-05, "loss": 0.933, "step": 292700 }, { "epoch": 4.5438321513369235, "grad_norm": 2.6091578006744385, "learning_rate": 4.545618336721551e-05, "loss": 0.9424, "step": 292800 }, { "epoch": 4.545384006579866, "grad_norm": 2.6885271072387695, "learning_rate": 4.5454631511972566e-05, "loss": 0.9494, "step": 292900 }, { "epoch": 4.546935861822809, "grad_norm": 2.3625845909118652, "learning_rate": 4.5453079656729624e-05, "loss": 0.9585, "step": 293000 }, { "epoch": 4.548487717065752, "grad_norm": 2.395141124725342, "learning_rate": 4.545152780148668e-05, "loss": 0.9483, "step": 293100 }, { "epoch": 4.550039572308695, "grad_norm": 2.700472831726074, "learning_rate": 4.544997594624374e-05, "loss": 0.9736, "step": 293200 }, { "epoch": 4.551591427551638, "grad_norm": 2.13814377784729, "learning_rate": 4.54484240910008e-05, "loss": 0.9377, "step": 293300 }, { "epoch": 4.553143282794581, "grad_norm": 2.1979050636291504, "learning_rate": 4.544687223575785e-05, "loss": 0.9392, "step": 293400 }, { "epoch": 4.554695138037524, "grad_norm": 2.1348674297332764, "learning_rate": 4.5445320380514906e-05, "loss": 0.9537, "step": 293500 }, { "epoch": 4.556246993280467, "grad_norm": 2.378378391265869, "learning_rate": 4.5443768525271964e-05, "loss": 0.9687, "step": 293600 }, { "epoch": 4.55779884852341, "grad_norm": 2.3907549381256104, "learning_rate": 4.544221667002902e-05, "loss": 0.9683, "step": 293700 }, { "epoch": 4.559350703766353, "grad_norm": 2.3627641201019287, "learning_rate": 4.544066481478608e-05, "loss": 0.958, "step": 293800 }, { "epoch": 4.560902559009295, "grad_norm": 2.0132877826690674, "learning_rate": 4.543911295954314e-05, "loss": 0.9502, "step": 293900 }, { "epoch": 4.5624544142522385, "grad_norm": 2.7299630641937256, "learning_rate": 4.543756110430019e-05, "loss": 0.9561, "step": 294000 }, { "epoch": 4.564006269495182, "grad_norm": 2.109203338623047, "learning_rate": 4.5436009249057246e-05, "loss": 0.943, "step": 294100 }, { "epoch": 4.565558124738124, "grad_norm": 2.6123456954956055, "learning_rate": 4.5434457393814304e-05, "loss": 0.9319, "step": 294200 }, { "epoch": 4.567109979981067, "grad_norm": 2.1620535850524902, "learning_rate": 4.543290553857136e-05, "loss": 0.9438, "step": 294300 }, { "epoch": 4.5686618352240105, "grad_norm": 2.742380142211914, "learning_rate": 4.543135368332842e-05, "loss": 0.9419, "step": 294400 }, { "epoch": 4.570213690466954, "grad_norm": 2.119188070297241, "learning_rate": 4.542980182808548e-05, "loss": 0.9532, "step": 294500 }, { "epoch": 4.571765545709896, "grad_norm": 1.7152527570724487, "learning_rate": 4.5428249972842535e-05, "loss": 0.9639, "step": 294600 }, { "epoch": 4.573317400952839, "grad_norm": 2.174830675125122, "learning_rate": 4.542669811759959e-05, "loss": 0.9601, "step": 294700 }, { "epoch": 4.574869256195782, "grad_norm": 2.6380772590637207, "learning_rate": 4.542514626235665e-05, "loss": 0.9754, "step": 294800 }, { "epoch": 4.576421111438725, "grad_norm": 2.2421088218688965, "learning_rate": 4.542359440711371e-05, "loss": 0.9761, "step": 294900 }, { "epoch": 4.577972966681668, "grad_norm": 2.592803716659546, "learning_rate": 4.5422042551870766e-05, "loss": 0.9578, "step": 295000 }, { "epoch": 4.579524821924611, "grad_norm": 2.509744882583618, "learning_rate": 4.5420490696627823e-05, "loss": 0.9657, "step": 295100 }, { "epoch": 4.5810766771675535, "grad_norm": 2.471644878387451, "learning_rate": 4.541893884138488e-05, "loss": 0.9761, "step": 295200 }, { "epoch": 4.582628532410497, "grad_norm": 2.42669677734375, "learning_rate": 4.541738698614193e-05, "loss": 0.9762, "step": 295300 }, { "epoch": 4.58418038765344, "grad_norm": 2.770900249481201, "learning_rate": 4.541583513089899e-05, "loss": 0.95, "step": 295400 }, { "epoch": 4.585732242896382, "grad_norm": 2.1635518074035645, "learning_rate": 4.541428327565605e-05, "loss": 0.9668, "step": 295500 }, { "epoch": 4.5872840981393255, "grad_norm": 2.4254140853881836, "learning_rate": 4.5412731420413105e-05, "loss": 0.9703, "step": 295600 }, { "epoch": 4.588835953382269, "grad_norm": 2.157076835632324, "learning_rate": 4.541117956517016e-05, "loss": 0.9478, "step": 295700 }, { "epoch": 4.590387808625211, "grad_norm": 2.430096387863159, "learning_rate": 4.540962770992722e-05, "loss": 0.9668, "step": 295800 }, { "epoch": 4.591939663868154, "grad_norm": 3.5620813369750977, "learning_rate": 4.540807585468428e-05, "loss": 0.9579, "step": 295900 }, { "epoch": 4.593491519111097, "grad_norm": 2.5569629669189453, "learning_rate": 4.5406523999441336e-05, "loss": 0.931, "step": 296000 }, { "epoch": 4.59504337435404, "grad_norm": 2.6477017402648926, "learning_rate": 4.5404972144198394e-05, "loss": 0.9459, "step": 296100 }, { "epoch": 4.596595229596983, "grad_norm": 1.7794461250305176, "learning_rate": 4.540342028895545e-05, "loss": 0.9322, "step": 296200 }, { "epoch": 4.598147084839926, "grad_norm": 2.5716915130615234, "learning_rate": 4.540186843371251e-05, "loss": 0.9709, "step": 296300 }, { "epoch": 4.599698940082869, "grad_norm": 2.1774649620056152, "learning_rate": 4.540031657846957e-05, "loss": 0.9443, "step": 296400 }, { "epoch": 4.601250795325812, "grad_norm": 2.4179298877716064, "learning_rate": 4.5398764723226625e-05, "loss": 0.9434, "step": 296500 }, { "epoch": 4.602802650568755, "grad_norm": 2.4155092239379883, "learning_rate": 4.5397212867983676e-05, "loss": 0.9695, "step": 296600 }, { "epoch": 4.604354505811698, "grad_norm": 2.083080768585205, "learning_rate": 4.5395661012740734e-05, "loss": 0.9401, "step": 296700 }, { "epoch": 4.6059063610546405, "grad_norm": 2.6314797401428223, "learning_rate": 4.539410915749779e-05, "loss": 0.9537, "step": 296800 }, { "epoch": 4.607458216297584, "grad_norm": 2.621185302734375, "learning_rate": 4.539255730225484e-05, "loss": 0.9459, "step": 296900 }, { "epoch": 4.609010071540527, "grad_norm": 2.389692783355713, "learning_rate": 4.53910054470119e-05, "loss": 0.9775, "step": 297000 }, { "epoch": 4.61056192678347, "grad_norm": 2.113222122192383, "learning_rate": 4.538945359176896e-05, "loss": 0.9531, "step": 297100 }, { "epoch": 4.612113782026412, "grad_norm": 2.478236675262451, "learning_rate": 4.5387901736526016e-05, "loss": 0.9392, "step": 297200 }, { "epoch": 4.613665637269356, "grad_norm": 2.223532199859619, "learning_rate": 4.5386349881283074e-05, "loss": 0.9567, "step": 297300 }, { "epoch": 4.615217492512299, "grad_norm": 2.3101089000701904, "learning_rate": 4.538479802604013e-05, "loss": 0.9696, "step": 297400 }, { "epoch": 4.616769347755241, "grad_norm": 2.198864459991455, "learning_rate": 4.538324617079719e-05, "loss": 0.9473, "step": 297500 }, { "epoch": 4.618321202998184, "grad_norm": 2.7675015926361084, "learning_rate": 4.538169431555425e-05, "loss": 0.9755, "step": 297600 }, { "epoch": 4.619873058241128, "grad_norm": 2.713266372680664, "learning_rate": 4.5380142460311305e-05, "loss": 0.9504, "step": 297700 }, { "epoch": 4.62142491348407, "grad_norm": 2.9661388397216797, "learning_rate": 4.537859060506836e-05, "loss": 0.9616, "step": 297800 }, { "epoch": 4.622976768727013, "grad_norm": 1.937831163406372, "learning_rate": 4.537703874982542e-05, "loss": 0.9409, "step": 297900 }, { "epoch": 4.624528623969956, "grad_norm": 3.515504837036133, "learning_rate": 4.537548689458248e-05, "loss": 0.9658, "step": 298000 }, { "epoch": 4.626080479212899, "grad_norm": 1.8308377265930176, "learning_rate": 4.5373935039339536e-05, "loss": 0.9566, "step": 298100 }, { "epoch": 4.627632334455842, "grad_norm": 2.457736015319824, "learning_rate": 4.537238318409659e-05, "loss": 0.9616, "step": 298200 }, { "epoch": 4.629184189698785, "grad_norm": 2.1030025482177734, "learning_rate": 4.5370831328853644e-05, "loss": 0.9357, "step": 298300 }, { "epoch": 4.630736044941727, "grad_norm": 2.2614853382110596, "learning_rate": 4.53692794736107e-05, "loss": 0.9508, "step": 298400 }, { "epoch": 4.632287900184671, "grad_norm": 2.6679646968841553, "learning_rate": 4.536772761836776e-05, "loss": 0.9461, "step": 298500 }, { "epoch": 4.633839755427614, "grad_norm": 2.871737480163574, "learning_rate": 4.536617576312482e-05, "loss": 0.9467, "step": 298600 }, { "epoch": 4.635391610670556, "grad_norm": 2.198072671890259, "learning_rate": 4.5364623907881875e-05, "loss": 0.9549, "step": 298700 }, { "epoch": 4.636943465913499, "grad_norm": 2.535308599472046, "learning_rate": 4.536307205263893e-05, "loss": 0.9478, "step": 298800 }, { "epoch": 4.638495321156443, "grad_norm": 2.3939363956451416, "learning_rate": 4.536152019739599e-05, "loss": 0.9619, "step": 298900 }, { "epoch": 4.640047176399386, "grad_norm": 2.9425535202026367, "learning_rate": 4.535996834215305e-05, "loss": 0.9581, "step": 299000 }, { "epoch": 4.641599031642328, "grad_norm": 2.3513376712799072, "learning_rate": 4.5358416486910106e-05, "loss": 0.9617, "step": 299100 }, { "epoch": 4.643150886885271, "grad_norm": 1.9393523931503296, "learning_rate": 4.5356864631667164e-05, "loss": 0.9669, "step": 299200 }, { "epoch": 4.6447027421282145, "grad_norm": 2.8623921871185303, "learning_rate": 4.535531277642422e-05, "loss": 0.954, "step": 299300 }, { "epoch": 4.646254597371157, "grad_norm": 2.469696521759033, "learning_rate": 4.535376092118128e-05, "loss": 0.9348, "step": 299400 }, { "epoch": 4.6478064526141, "grad_norm": 2.4211649894714355, "learning_rate": 4.535220906593833e-05, "loss": 0.9712, "step": 299500 }, { "epoch": 4.649358307857043, "grad_norm": 2.3374104499816895, "learning_rate": 4.535065721069539e-05, "loss": 0.971, "step": 299600 }, { "epoch": 4.6509101630999865, "grad_norm": 2.495910167694092, "learning_rate": 4.5349105355452446e-05, "loss": 0.9572, "step": 299700 }, { "epoch": 4.652462018342929, "grad_norm": 2.1944022178649902, "learning_rate": 4.5347553500209504e-05, "loss": 0.95, "step": 299800 }, { "epoch": 4.654013873585872, "grad_norm": 2.166715383529663, "learning_rate": 4.5346001644966555e-05, "loss": 0.9449, "step": 299900 }, { "epoch": 4.655565728828815, "grad_norm": 2.3808345794677734, "learning_rate": 4.534444978972361e-05, "loss": 0.9577, "step": 300000 }, { "epoch": 4.657117584071758, "grad_norm": 2.5008952617645264, "learning_rate": 4.534289793448067e-05, "loss": 0.9616, "step": 300100 }, { "epoch": 4.658669439314701, "grad_norm": 2.5399112701416016, "learning_rate": 4.534134607923773e-05, "loss": 0.9389, "step": 300200 }, { "epoch": 4.660221294557644, "grad_norm": 2.4389514923095703, "learning_rate": 4.5339794223994786e-05, "loss": 0.9481, "step": 300300 }, { "epoch": 4.661773149800586, "grad_norm": 2.2760915756225586, "learning_rate": 4.5338242368751844e-05, "loss": 0.959, "step": 300400 }, { "epoch": 4.6633250050435295, "grad_norm": 2.1339385509490967, "learning_rate": 4.53366905135089e-05, "loss": 0.9486, "step": 300500 }, { "epoch": 4.664876860286473, "grad_norm": 2.5250792503356934, "learning_rate": 4.533513865826596e-05, "loss": 0.9526, "step": 300600 }, { "epoch": 4.666428715529415, "grad_norm": 3.4256186485290527, "learning_rate": 4.533358680302302e-05, "loss": 0.9266, "step": 300700 }, { "epoch": 4.667980570772358, "grad_norm": 2.0367817878723145, "learning_rate": 4.5332034947780075e-05, "loss": 0.9364, "step": 300800 }, { "epoch": 4.6695324260153015, "grad_norm": 2.492945671081543, "learning_rate": 4.533048309253713e-05, "loss": 0.9574, "step": 300900 }, { "epoch": 4.671084281258244, "grad_norm": 2.2522151470184326, "learning_rate": 4.532893123729418e-05, "loss": 0.9558, "step": 301000 }, { "epoch": 4.672636136501187, "grad_norm": 2.7394332885742188, "learning_rate": 4.532737938205124e-05, "loss": 0.9661, "step": 301100 }, { "epoch": 4.67418799174413, "grad_norm": 2.8005149364471436, "learning_rate": 4.53258275268083e-05, "loss": 0.9427, "step": 301200 }, { "epoch": 4.675739846987073, "grad_norm": 3.20951247215271, "learning_rate": 4.532427567156536e-05, "loss": 0.9385, "step": 301300 }, { "epoch": 4.677291702230016, "grad_norm": 2.302304267883301, "learning_rate": 4.5322723816322414e-05, "loss": 0.9513, "step": 301400 }, { "epoch": 4.678843557472959, "grad_norm": 2.3675107955932617, "learning_rate": 4.532117196107947e-05, "loss": 0.959, "step": 301500 }, { "epoch": 4.680395412715902, "grad_norm": 2.521509885787964, "learning_rate": 4.531962010583653e-05, "loss": 0.9323, "step": 301600 }, { "epoch": 4.6819472679588445, "grad_norm": 2.737092971801758, "learning_rate": 4.531806825059359e-05, "loss": 0.937, "step": 301700 }, { "epoch": 4.683499123201788, "grad_norm": 2.374357223510742, "learning_rate": 4.5316516395350645e-05, "loss": 0.9441, "step": 301800 }, { "epoch": 4.685050978444731, "grad_norm": 2.31540584564209, "learning_rate": 4.53149645401077e-05, "loss": 0.9329, "step": 301900 }, { "epoch": 4.686602833687673, "grad_norm": 2.491212844848633, "learning_rate": 4.531341268486476e-05, "loss": 0.9363, "step": 302000 }, { "epoch": 4.6881546889306165, "grad_norm": 2.564307928085327, "learning_rate": 4.531186082962182e-05, "loss": 0.9506, "step": 302100 }, { "epoch": 4.68970654417356, "grad_norm": 2.113341808319092, "learning_rate": 4.5310308974378876e-05, "loss": 0.9373, "step": 302200 }, { "epoch": 4.691258399416503, "grad_norm": 2.4588263034820557, "learning_rate": 4.530875711913593e-05, "loss": 0.9605, "step": 302300 }, { "epoch": 4.692810254659445, "grad_norm": 2.53434157371521, "learning_rate": 4.5307205263892985e-05, "loss": 0.9635, "step": 302400 }, { "epoch": 4.6943621099023884, "grad_norm": 2.5217244625091553, "learning_rate": 4.530565340865004e-05, "loss": 0.9314, "step": 302500 }, { "epoch": 4.695913965145332, "grad_norm": 2.2354137897491455, "learning_rate": 4.53041015534071e-05, "loss": 0.929, "step": 302600 }, { "epoch": 4.697465820388274, "grad_norm": 2.3532631397247314, "learning_rate": 4.530254969816416e-05, "loss": 0.942, "step": 302700 }, { "epoch": 4.699017675631217, "grad_norm": 1.9990977048873901, "learning_rate": 4.5300997842921216e-05, "loss": 0.9436, "step": 302800 }, { "epoch": 4.70056953087416, "grad_norm": 2.713402032852173, "learning_rate": 4.5299445987678274e-05, "loss": 0.9357, "step": 302900 }, { "epoch": 4.702121386117103, "grad_norm": 1.8433403968811035, "learning_rate": 4.529789413243533e-05, "loss": 0.9573, "step": 303000 }, { "epoch": 4.703673241360046, "grad_norm": 2.53104305267334, "learning_rate": 4.529634227719239e-05, "loss": 0.9547, "step": 303100 }, { "epoch": 4.705225096602989, "grad_norm": 2.123872995376587, "learning_rate": 4.529479042194944e-05, "loss": 0.9464, "step": 303200 }, { "epoch": 4.7067769518459315, "grad_norm": 2.5783441066741943, "learning_rate": 4.52932385667065e-05, "loss": 0.9464, "step": 303300 }, { "epoch": 4.708328807088875, "grad_norm": 2.440621852874756, "learning_rate": 4.5291686711463556e-05, "loss": 0.948, "step": 303400 }, { "epoch": 4.709880662331818, "grad_norm": 2.6394476890563965, "learning_rate": 4.5290134856220614e-05, "loss": 0.9603, "step": 303500 }, { "epoch": 4.71143251757476, "grad_norm": 2.669133424758911, "learning_rate": 4.528858300097767e-05, "loss": 0.964, "step": 303600 }, { "epoch": 4.712984372817703, "grad_norm": 2.4810025691986084, "learning_rate": 4.528703114573473e-05, "loss": 0.9492, "step": 303700 }, { "epoch": 4.714536228060647, "grad_norm": 2.4260315895080566, "learning_rate": 4.528547929049179e-05, "loss": 0.9219, "step": 303800 }, { "epoch": 4.716088083303589, "grad_norm": 3.294248580932617, "learning_rate": 4.528392743524884e-05, "loss": 0.9321, "step": 303900 }, { "epoch": 4.717639938546532, "grad_norm": 2.508601188659668, "learning_rate": 4.5282375580005896e-05, "loss": 0.9474, "step": 304000 }, { "epoch": 4.719191793789475, "grad_norm": 1.9785510301589966, "learning_rate": 4.528082372476295e-05, "loss": 0.9449, "step": 304100 }, { "epoch": 4.720743649032419, "grad_norm": 2.0720789432525635, "learning_rate": 4.527927186952001e-05, "loss": 0.9708, "step": 304200 }, { "epoch": 4.722295504275361, "grad_norm": 2.878784418106079, "learning_rate": 4.527772001427707e-05, "loss": 0.9724, "step": 304300 }, { "epoch": 4.723847359518304, "grad_norm": 1.8409661054611206, "learning_rate": 4.527616815903413e-05, "loss": 0.9421, "step": 304400 }, { "epoch": 4.725399214761247, "grad_norm": 2.318964719772339, "learning_rate": 4.5274616303791184e-05, "loss": 0.9488, "step": 304500 }, { "epoch": 4.72695107000419, "grad_norm": 1.9378513097763062, "learning_rate": 4.527306444854824e-05, "loss": 0.934, "step": 304600 }, { "epoch": 4.728502925247133, "grad_norm": 1.8985319137573242, "learning_rate": 4.52715125933053e-05, "loss": 0.9486, "step": 304700 }, { "epoch": 4.730054780490076, "grad_norm": 2.23698091506958, "learning_rate": 4.526996073806236e-05, "loss": 0.9632, "step": 304800 }, { "epoch": 4.731606635733019, "grad_norm": 2.271968364715576, "learning_rate": 4.5268408882819415e-05, "loss": 0.9579, "step": 304900 }, { "epoch": 4.733158490975962, "grad_norm": 2.294938325881958, "learning_rate": 4.526685702757647e-05, "loss": 0.9372, "step": 305000 }, { "epoch": 4.734710346218905, "grad_norm": 2.257526397705078, "learning_rate": 4.526530517233353e-05, "loss": 0.9533, "step": 305100 }, { "epoch": 4.736262201461848, "grad_norm": 2.28794527053833, "learning_rate": 4.526375331709058e-05, "loss": 0.939, "step": 305200 }, { "epoch": 4.73781405670479, "grad_norm": 2.109142541885376, "learning_rate": 4.526220146184764e-05, "loss": 0.949, "step": 305300 }, { "epoch": 4.739365911947734, "grad_norm": 2.1313722133636475, "learning_rate": 4.52606496066047e-05, "loss": 0.9233, "step": 305400 }, { "epoch": 4.740917767190677, "grad_norm": 2.1974198818206787, "learning_rate": 4.5259097751361755e-05, "loss": 0.955, "step": 305500 }, { "epoch": 4.742469622433619, "grad_norm": 2.9074625968933105, "learning_rate": 4.525754589611881e-05, "loss": 0.954, "step": 305600 }, { "epoch": 4.744021477676562, "grad_norm": 2.4265270233154297, "learning_rate": 4.525599404087587e-05, "loss": 0.9431, "step": 305700 }, { "epoch": 4.745573332919506, "grad_norm": 2.2297825813293457, "learning_rate": 4.525444218563293e-05, "loss": 0.9488, "step": 305800 }, { "epoch": 4.747125188162448, "grad_norm": 2.308871030807495, "learning_rate": 4.5252890330389986e-05, "loss": 0.936, "step": 305900 }, { "epoch": 4.748677043405391, "grad_norm": 2.086238145828247, "learning_rate": 4.5251338475147044e-05, "loss": 0.9572, "step": 306000 }, { "epoch": 4.750228898648334, "grad_norm": 2.222214698791504, "learning_rate": 4.52497866199041e-05, "loss": 0.9417, "step": 306100 }, { "epoch": 4.751780753891277, "grad_norm": 3.085169792175293, "learning_rate": 4.524823476466116e-05, "loss": 0.9381, "step": 306200 }, { "epoch": 4.75333260913422, "grad_norm": 2.2150168418884277, "learning_rate": 4.524668290941822e-05, "loss": 0.9303, "step": 306300 }, { "epoch": 4.754884464377163, "grad_norm": 2.4674439430236816, "learning_rate": 4.524513105417527e-05, "loss": 0.9429, "step": 306400 }, { "epoch": 4.756436319620105, "grad_norm": 2.1495025157928467, "learning_rate": 4.5243579198932326e-05, "loss": 0.946, "step": 306500 }, { "epoch": 4.757988174863049, "grad_norm": 2.4638724327087402, "learning_rate": 4.5242027343689384e-05, "loss": 0.9487, "step": 306600 }, { "epoch": 4.759540030105992, "grad_norm": 2.306675672531128, "learning_rate": 4.5240475488446435e-05, "loss": 0.9285, "step": 306700 }, { "epoch": 4.761091885348935, "grad_norm": 2.408397674560547, "learning_rate": 4.523892363320349e-05, "loss": 0.9332, "step": 306800 }, { "epoch": 4.762643740591877, "grad_norm": 2.746066093444824, "learning_rate": 4.523737177796055e-05, "loss": 0.9342, "step": 306900 }, { "epoch": 4.7641955958348206, "grad_norm": 3.228149652481079, "learning_rate": 4.523581992271761e-05, "loss": 0.9462, "step": 307000 }, { "epoch": 4.765747451077764, "grad_norm": 2.505145311355591, "learning_rate": 4.5234268067474666e-05, "loss": 0.959, "step": 307100 }, { "epoch": 4.767299306320706, "grad_norm": 27.44329833984375, "learning_rate": 4.523271621223172e-05, "loss": 0.9269, "step": 307200 }, { "epoch": 4.768851161563649, "grad_norm": 2.345979690551758, "learning_rate": 4.523116435698878e-05, "loss": 0.9273, "step": 307300 }, { "epoch": 4.7704030168065925, "grad_norm": 2.210143566131592, "learning_rate": 4.522961250174584e-05, "loss": 0.9383, "step": 307400 }, { "epoch": 4.771954872049535, "grad_norm": 2.242017984390259, "learning_rate": 4.52280606465029e-05, "loss": 0.9252, "step": 307500 }, { "epoch": 4.773506727292478, "grad_norm": 2.43339204788208, "learning_rate": 4.5226508791259954e-05, "loss": 0.9264, "step": 307600 }, { "epoch": 4.775058582535421, "grad_norm": 2.5085997581481934, "learning_rate": 4.522495693601701e-05, "loss": 0.9424, "step": 307700 }, { "epoch": 4.7766104377783645, "grad_norm": 2.275557518005371, "learning_rate": 4.522340508077407e-05, "loss": 0.9465, "step": 307800 }, { "epoch": 4.778162293021307, "grad_norm": 2.130398988723755, "learning_rate": 4.522185322553113e-05, "loss": 0.9339, "step": 307900 }, { "epoch": 4.77971414826425, "grad_norm": 3.118187427520752, "learning_rate": 4.522030137028818e-05, "loss": 0.9275, "step": 308000 }, { "epoch": 4.781266003507193, "grad_norm": 2.5063211917877197, "learning_rate": 4.5218749515045236e-05, "loss": 0.9488, "step": 308100 }, { "epoch": 4.7828178587501355, "grad_norm": 2.797496795654297, "learning_rate": 4.5217197659802294e-05, "loss": 0.9361, "step": 308200 }, { "epoch": 4.784369713993079, "grad_norm": 12.530488967895508, "learning_rate": 4.521564580455935e-05, "loss": 0.9426, "step": 308300 }, { "epoch": 4.785921569236022, "grad_norm": 2.470473289489746, "learning_rate": 4.521409394931641e-05, "loss": 0.9436, "step": 308400 }, { "epoch": 4.787473424478964, "grad_norm": 2.588658332824707, "learning_rate": 4.521254209407347e-05, "loss": 0.917, "step": 308500 }, { "epoch": 4.7890252797219075, "grad_norm": 2.357419013977051, "learning_rate": 4.5210990238830525e-05, "loss": 0.9378, "step": 308600 }, { "epoch": 4.790577134964851, "grad_norm": 2.5766124725341797, "learning_rate": 4.520943838358758e-05, "loss": 0.9365, "step": 308700 }, { "epoch": 4.792128990207793, "grad_norm": 2.4629642963409424, "learning_rate": 4.520788652834464e-05, "loss": 0.9439, "step": 308800 }, { "epoch": 4.793680845450736, "grad_norm": 3.1557953357696533, "learning_rate": 4.52063346731017e-05, "loss": 0.969, "step": 308900 }, { "epoch": 4.7952327006936795, "grad_norm": 2.3751039505004883, "learning_rate": 4.5204782817858756e-05, "loss": 0.9372, "step": 309000 }, { "epoch": 4.796784555936622, "grad_norm": 2.600109815597534, "learning_rate": 4.5203230962615814e-05, "loss": 0.9606, "step": 309100 }, { "epoch": 4.798336411179565, "grad_norm": 2.851198434829712, "learning_rate": 4.520167910737287e-05, "loss": 0.9288, "step": 309200 }, { "epoch": 4.799888266422508, "grad_norm": 2.0443432331085205, "learning_rate": 4.520012725212992e-05, "loss": 0.9291, "step": 309300 }, { "epoch": 4.801440121665451, "grad_norm": 1.9613323211669922, "learning_rate": 4.519857539688698e-05, "loss": 0.9558, "step": 309400 }, { "epoch": 4.802991976908394, "grad_norm": 2.1446452140808105, "learning_rate": 4.519702354164404e-05, "loss": 0.9417, "step": 309500 }, { "epoch": 4.804543832151337, "grad_norm": 2.7264506816864014, "learning_rate": 4.5195471686401096e-05, "loss": 0.9352, "step": 309600 }, { "epoch": 4.80609568739428, "grad_norm": 2.7593138217926025, "learning_rate": 4.519391983115815e-05, "loss": 0.9345, "step": 309700 }, { "epoch": 4.8076475426372225, "grad_norm": 2.4503424167633057, "learning_rate": 4.5192367975915205e-05, "loss": 0.9516, "step": 309800 }, { "epoch": 4.809199397880166, "grad_norm": 2.225533962249756, "learning_rate": 4.519081612067226e-05, "loss": 0.9392, "step": 309900 }, { "epoch": 4.810751253123109, "grad_norm": 2.488813638687134, "learning_rate": 4.518926426542932e-05, "loss": 0.9221, "step": 310000 }, { "epoch": 4.812303108366051, "grad_norm": 2.6080517768859863, "learning_rate": 4.518771241018638e-05, "loss": 0.9413, "step": 310100 }, { "epoch": 4.8138549636089945, "grad_norm": 2.2766313552856445, "learning_rate": 4.5186160554943436e-05, "loss": 0.9443, "step": 310200 }, { "epoch": 4.815406818851938, "grad_norm": 2.2937352657318115, "learning_rate": 4.518460869970049e-05, "loss": 0.9441, "step": 310300 }, { "epoch": 4.816958674094881, "grad_norm": 2.4078619480133057, "learning_rate": 4.518305684445755e-05, "loss": 0.925, "step": 310400 }, { "epoch": 4.818510529337823, "grad_norm": 2.3705949783325195, "learning_rate": 4.518150498921461e-05, "loss": 0.9424, "step": 310500 }, { "epoch": 4.820062384580766, "grad_norm": 3.0908713340759277, "learning_rate": 4.517995313397167e-05, "loss": 0.9557, "step": 310600 }, { "epoch": 4.82161423982371, "grad_norm": 2.1127827167510986, "learning_rate": 4.5178401278728724e-05, "loss": 0.926, "step": 310700 }, { "epoch": 4.823166095066652, "grad_norm": 3.0412704944610596, "learning_rate": 4.5176849423485775e-05, "loss": 0.9227, "step": 310800 }, { "epoch": 4.824717950309595, "grad_norm": 2.323084831237793, "learning_rate": 4.517529756824283e-05, "loss": 0.9401, "step": 310900 }, { "epoch": 4.826269805552538, "grad_norm": 2.29738187789917, "learning_rate": 4.517374571299989e-05, "loss": 0.9569, "step": 311000 }, { "epoch": 4.827821660795481, "grad_norm": 3.062674045562744, "learning_rate": 4.517219385775695e-05, "loss": 0.9281, "step": 311100 }, { "epoch": 4.829373516038424, "grad_norm": 2.0380406379699707, "learning_rate": 4.5170642002514006e-05, "loss": 0.9359, "step": 311200 }, { "epoch": 4.830925371281367, "grad_norm": 2.3932437896728516, "learning_rate": 4.5169090147271064e-05, "loss": 0.9297, "step": 311300 }, { "epoch": 4.832477226524309, "grad_norm": 2.5418858528137207, "learning_rate": 4.516753829202812e-05, "loss": 0.9571, "step": 311400 }, { "epoch": 4.834029081767253, "grad_norm": 2.003788948059082, "learning_rate": 4.516598643678518e-05, "loss": 0.9324, "step": 311500 }, { "epoch": 4.835580937010196, "grad_norm": 2.5183684825897217, "learning_rate": 4.516443458154224e-05, "loss": 0.9406, "step": 311600 }, { "epoch": 4.837132792253138, "grad_norm": 2.327085256576538, "learning_rate": 4.5162882726299295e-05, "loss": 0.9402, "step": 311700 }, { "epoch": 4.838684647496081, "grad_norm": 2.3215177059173584, "learning_rate": 4.516133087105635e-05, "loss": 0.9483, "step": 311800 }, { "epoch": 4.840236502739025, "grad_norm": 2.8017570972442627, "learning_rate": 4.515977901581341e-05, "loss": 0.9326, "step": 311900 }, { "epoch": 4.841788357981968, "grad_norm": 2.6003901958465576, "learning_rate": 4.515822716057047e-05, "loss": 0.9413, "step": 312000 }, { "epoch": 4.84334021322491, "grad_norm": 2.6965415477752686, "learning_rate": 4.515667530532752e-05, "loss": 0.9376, "step": 312100 }, { "epoch": 4.844892068467853, "grad_norm": 2.2447073459625244, "learning_rate": 4.515512345008458e-05, "loss": 0.9446, "step": 312200 }, { "epoch": 4.846443923710797, "grad_norm": 2.169839859008789, "learning_rate": 4.5153571594841635e-05, "loss": 0.9396, "step": 312300 }, { "epoch": 4.847995778953739, "grad_norm": 2.1079068183898926, "learning_rate": 4.515201973959869e-05, "loss": 0.9375, "step": 312400 }, { "epoch": 4.849547634196682, "grad_norm": 2.366875171661377, "learning_rate": 4.515046788435575e-05, "loss": 0.9365, "step": 312500 }, { "epoch": 4.851099489439625, "grad_norm": 3.9726710319519043, "learning_rate": 4.514891602911281e-05, "loss": 0.9287, "step": 312600 }, { "epoch": 4.852651344682568, "grad_norm": 2.3037776947021484, "learning_rate": 4.5147364173869866e-05, "loss": 0.9549, "step": 312700 }, { "epoch": 4.854203199925511, "grad_norm": 1.9534844160079956, "learning_rate": 4.5145812318626924e-05, "loss": 0.9522, "step": 312800 }, { "epoch": 4.855755055168454, "grad_norm": 2.3433802127838135, "learning_rate": 4.5144260463383975e-05, "loss": 0.9325, "step": 312900 }, { "epoch": 4.857306910411397, "grad_norm": 2.0605649948120117, "learning_rate": 4.514270860814103e-05, "loss": 0.963, "step": 313000 }, { "epoch": 4.85885876565434, "grad_norm": 2.480656862258911, "learning_rate": 4.514115675289809e-05, "loss": 0.9226, "step": 313100 }, { "epoch": 4.860410620897283, "grad_norm": 2.043947458267212, "learning_rate": 4.513960489765515e-05, "loss": 0.9389, "step": 313200 }, { "epoch": 4.861962476140226, "grad_norm": 2.31315016746521, "learning_rate": 4.5138053042412206e-05, "loss": 0.928, "step": 313300 }, { "epoch": 4.863514331383168, "grad_norm": 1.9552295207977295, "learning_rate": 4.513650118716926e-05, "loss": 0.9622, "step": 313400 }, { "epoch": 4.865066186626112, "grad_norm": 2.3834071159362793, "learning_rate": 4.513494933192632e-05, "loss": 0.9463, "step": 313500 }, { "epoch": 4.866618041869055, "grad_norm": 2.124959945678711, "learning_rate": 4.513339747668338e-05, "loss": 0.9713, "step": 313600 }, { "epoch": 4.868169897111997, "grad_norm": 3.45064115524292, "learning_rate": 4.513184562144043e-05, "loss": 0.9206, "step": 313700 }, { "epoch": 4.86972175235494, "grad_norm": 2.1431915760040283, "learning_rate": 4.513029376619749e-05, "loss": 0.9413, "step": 313800 }, { "epoch": 4.8712736075978835, "grad_norm": 2.631978988647461, "learning_rate": 4.5128741910954545e-05, "loss": 0.9381, "step": 313900 }, { "epoch": 4.872825462840826, "grad_norm": 2.7962300777435303, "learning_rate": 4.51271900557116e-05, "loss": 0.9546, "step": 314000 }, { "epoch": 4.874377318083769, "grad_norm": 2.35111927986145, "learning_rate": 4.512563820046866e-05, "loss": 0.938, "step": 314100 }, { "epoch": 4.875929173326712, "grad_norm": 2.3714141845703125, "learning_rate": 4.512408634522572e-05, "loss": 0.9435, "step": 314200 }, { "epoch": 4.877481028569655, "grad_norm": 1.9210163354873657, "learning_rate": 4.5122534489982776e-05, "loss": 0.9688, "step": 314300 }, { "epoch": 4.879032883812598, "grad_norm": 2.048096179962158, "learning_rate": 4.5120982634739834e-05, "loss": 0.9393, "step": 314400 }, { "epoch": 4.880584739055541, "grad_norm": 2.1443026065826416, "learning_rate": 4.511943077949689e-05, "loss": 0.9384, "step": 314500 }, { "epoch": 4.882136594298483, "grad_norm": 1.9021283388137817, "learning_rate": 4.511787892425395e-05, "loss": 0.9323, "step": 314600 }, { "epoch": 4.883688449541427, "grad_norm": 2.0323243141174316, "learning_rate": 4.511632706901101e-05, "loss": 0.9296, "step": 314700 }, { "epoch": 4.88524030478437, "grad_norm": 2.9464282989501953, "learning_rate": 4.5114775213768065e-05, "loss": 0.9402, "step": 314800 }, { "epoch": 4.886792160027313, "grad_norm": 14.345932006835938, "learning_rate": 4.511322335852512e-05, "loss": 0.9347, "step": 314900 }, { "epoch": 4.888344015270255, "grad_norm": 2.434706926345825, "learning_rate": 4.5111671503282174e-05, "loss": 0.9478, "step": 315000 }, { "epoch": 4.8898958705131985, "grad_norm": 2.3757030963897705, "learning_rate": 4.511011964803923e-05, "loss": 0.939, "step": 315100 }, { "epoch": 4.891447725756142, "grad_norm": 2.4376766681671143, "learning_rate": 4.510856779279629e-05, "loss": 0.941, "step": 315200 }, { "epoch": 4.892999580999084, "grad_norm": 2.4705677032470703, "learning_rate": 4.510701593755335e-05, "loss": 0.9508, "step": 315300 }, { "epoch": 4.894551436242027, "grad_norm": 2.3631067276000977, "learning_rate": 4.5105464082310405e-05, "loss": 0.9443, "step": 315400 }, { "epoch": 4.8961032914849705, "grad_norm": 3.021822690963745, "learning_rate": 4.510391222706746e-05, "loss": 0.9521, "step": 315500 }, { "epoch": 4.897655146727914, "grad_norm": 2.4929983615875244, "learning_rate": 4.510236037182452e-05, "loss": 0.946, "step": 315600 }, { "epoch": 4.899207001970856, "grad_norm": 1.9404605627059937, "learning_rate": 4.510080851658158e-05, "loss": 0.94, "step": 315700 }, { "epoch": 4.900758857213799, "grad_norm": 2.1979308128356934, "learning_rate": 4.5099256661338636e-05, "loss": 0.9487, "step": 315800 }, { "epoch": 4.902310712456742, "grad_norm": 2.137218475341797, "learning_rate": 4.5097704806095694e-05, "loss": 0.9337, "step": 315900 }, { "epoch": 4.903862567699685, "grad_norm": 1.8631007671356201, "learning_rate": 4.509615295085275e-05, "loss": 0.9486, "step": 316000 }, { "epoch": 4.905414422942628, "grad_norm": 2.47115421295166, "learning_rate": 4.509460109560981e-05, "loss": 0.9432, "step": 316100 }, { "epoch": 4.906966278185571, "grad_norm": 2.021848678588867, "learning_rate": 4.509304924036686e-05, "loss": 0.9452, "step": 316200 }, { "epoch": 4.9085181334285135, "grad_norm": 2.8513197898864746, "learning_rate": 4.509149738512392e-05, "loss": 0.9406, "step": 316300 }, { "epoch": 4.910069988671457, "grad_norm": 2.087790012359619, "learning_rate": 4.5089945529880976e-05, "loss": 0.9527, "step": 316400 }, { "epoch": 4.9116218439144, "grad_norm": 2.5270330905914307, "learning_rate": 4.5088393674638027e-05, "loss": 0.9516, "step": 316500 }, { "epoch": 4.913173699157342, "grad_norm": 2.2381742000579834, "learning_rate": 4.5086841819395084e-05, "loss": 0.9501, "step": 316600 }, { "epoch": 4.9147255544002855, "grad_norm": 2.1702990531921387, "learning_rate": 4.508528996415214e-05, "loss": 0.9459, "step": 316700 }, { "epoch": 4.916277409643229, "grad_norm": 2.5202646255493164, "learning_rate": 4.50837381089092e-05, "loss": 0.9593, "step": 316800 }, { "epoch": 4.917829264886171, "grad_norm": 1.9539436101913452, "learning_rate": 4.508218625366626e-05, "loss": 0.9378, "step": 316900 }, { "epoch": 4.919381120129114, "grad_norm": 2.3091795444488525, "learning_rate": 4.5080634398423315e-05, "loss": 0.9282, "step": 317000 }, { "epoch": 4.920932975372057, "grad_norm": 2.23740291595459, "learning_rate": 4.507908254318037e-05, "loss": 0.9349, "step": 317100 }, { "epoch": 4.922484830615, "grad_norm": 2.3331758975982666, "learning_rate": 4.507753068793743e-05, "loss": 0.9502, "step": 317200 }, { "epoch": 4.924036685857943, "grad_norm": 2.3883750438690186, "learning_rate": 4.507597883269449e-05, "loss": 0.9351, "step": 317300 }, { "epoch": 4.925588541100886, "grad_norm": 2.023210048675537, "learning_rate": 4.5074426977451546e-05, "loss": 0.9489, "step": 317400 }, { "epoch": 4.927140396343829, "grad_norm": 2.226330041885376, "learning_rate": 4.5072875122208604e-05, "loss": 0.9353, "step": 317500 }, { "epoch": 4.928692251586772, "grad_norm": 3.3345024585723877, "learning_rate": 4.507132326696566e-05, "loss": 0.961, "step": 317600 }, { "epoch": 4.930244106829715, "grad_norm": 2.3615920543670654, "learning_rate": 4.506977141172272e-05, "loss": 0.9486, "step": 317700 }, { "epoch": 4.931795962072658, "grad_norm": 2.391618013381958, "learning_rate": 4.506821955647977e-05, "loss": 0.9597, "step": 317800 }, { "epoch": 4.9333478173156005, "grad_norm": 2.254703998565674, "learning_rate": 4.506666770123683e-05, "loss": 0.949, "step": 317900 }, { "epoch": 4.934899672558544, "grad_norm": 2.2051382064819336, "learning_rate": 4.5065115845993886e-05, "loss": 0.9649, "step": 318000 }, { "epoch": 4.936451527801487, "grad_norm": 2.0978894233703613, "learning_rate": 4.5063563990750944e-05, "loss": 0.9496, "step": 318100 }, { "epoch": 4.93800338304443, "grad_norm": 2.8717241287231445, "learning_rate": 4.5062012135508e-05, "loss": 0.9488, "step": 318200 }, { "epoch": 4.939555238287372, "grad_norm": 2.585404872894287, "learning_rate": 4.506046028026506e-05, "loss": 0.943, "step": 318300 }, { "epoch": 4.941107093530316, "grad_norm": 2.507885456085205, "learning_rate": 4.505890842502212e-05, "loss": 0.9277, "step": 318400 }, { "epoch": 4.942658948773259, "grad_norm": 2.9575016498565674, "learning_rate": 4.5057356569779175e-05, "loss": 0.9404, "step": 318500 }, { "epoch": 4.944210804016201, "grad_norm": 2.8423197269439697, "learning_rate": 4.505580471453623e-05, "loss": 0.954, "step": 318600 }, { "epoch": 4.945762659259144, "grad_norm": 2.213393211364746, "learning_rate": 4.505425285929329e-05, "loss": 0.9425, "step": 318700 }, { "epoch": 4.947314514502088, "grad_norm": 2.4073281288146973, "learning_rate": 4.505270100405035e-05, "loss": 0.9476, "step": 318800 }, { "epoch": 4.94886636974503, "grad_norm": 2.2753820419311523, "learning_rate": 4.5051149148807406e-05, "loss": 0.9355, "step": 318900 }, { "epoch": 4.950418224987973, "grad_norm": 1.902570366859436, "learning_rate": 4.5049597293564464e-05, "loss": 0.9593, "step": 319000 }, { "epoch": 4.951970080230916, "grad_norm": 2.098022937774658, "learning_rate": 4.5048045438321515e-05, "loss": 0.9167, "step": 319100 }, { "epoch": 4.953521935473859, "grad_norm": 2.5977885723114014, "learning_rate": 4.504649358307857e-05, "loss": 0.9303, "step": 319200 }, { "epoch": 4.955073790716802, "grad_norm": 3.268897533416748, "learning_rate": 4.504494172783563e-05, "loss": 0.9319, "step": 319300 }, { "epoch": 4.956625645959745, "grad_norm": 2.9118943214416504, "learning_rate": 4.504338987259268e-05, "loss": 0.9416, "step": 319400 }, { "epoch": 4.958177501202687, "grad_norm": 2.113837718963623, "learning_rate": 4.504183801734974e-05, "loss": 0.9477, "step": 319500 }, { "epoch": 4.959729356445631, "grad_norm": 2.4569010734558105, "learning_rate": 4.5040286162106797e-05, "loss": 0.9444, "step": 319600 }, { "epoch": 4.961281211688574, "grad_norm": 2.220038414001465, "learning_rate": 4.5038734306863854e-05, "loss": 0.9203, "step": 319700 }, { "epoch": 4.962833066931516, "grad_norm": 4.9932355880737305, "learning_rate": 4.503718245162091e-05, "loss": 0.9584, "step": 319800 }, { "epoch": 4.964384922174459, "grad_norm": 2.542505979537964, "learning_rate": 4.503563059637797e-05, "loss": 0.9545, "step": 319900 }, { "epoch": 4.965936777417403, "grad_norm": 5.872199058532715, "learning_rate": 4.503407874113503e-05, "loss": 0.963, "step": 320000 }, { "epoch": 4.967488632660346, "grad_norm": 2.5846688747406006, "learning_rate": 4.5032526885892085e-05, "loss": 0.9426, "step": 320100 }, { "epoch": 4.969040487903288, "grad_norm": 2.4904370307922363, "learning_rate": 4.503097503064914e-05, "loss": 0.939, "step": 320200 }, { "epoch": 4.970592343146231, "grad_norm": 2.5601537227630615, "learning_rate": 4.50294231754062e-05, "loss": 0.9381, "step": 320300 }, { "epoch": 4.9721441983891745, "grad_norm": 2.1984386444091797, "learning_rate": 4.502787132016326e-05, "loss": 0.9631, "step": 320400 }, { "epoch": 4.973696053632117, "grad_norm": 2.6690120697021484, "learning_rate": 4.5026319464920316e-05, "loss": 0.9203, "step": 320500 }, { "epoch": 4.97524790887506, "grad_norm": 2.3230834007263184, "learning_rate": 4.5024767609677374e-05, "loss": 0.9482, "step": 320600 }, { "epoch": 4.976799764118003, "grad_norm": 2.5702579021453857, "learning_rate": 4.5023215754434425e-05, "loss": 0.9511, "step": 320700 }, { "epoch": 4.9783516193609465, "grad_norm": 5.736624240875244, "learning_rate": 4.502166389919148e-05, "loss": 0.9177, "step": 320800 }, { "epoch": 4.979903474603889, "grad_norm": 2.1526288986206055, "learning_rate": 4.502011204394854e-05, "loss": 0.9492, "step": 320900 }, { "epoch": 4.981455329846832, "grad_norm": 2.507173776626587, "learning_rate": 4.50185601887056e-05, "loss": 0.9445, "step": 321000 }, { "epoch": 4.983007185089775, "grad_norm": 2.477039098739624, "learning_rate": 4.5017008333462656e-05, "loss": 0.9494, "step": 321100 }, { "epoch": 4.984559040332718, "grad_norm": 2.5895519256591797, "learning_rate": 4.5015456478219714e-05, "loss": 0.9525, "step": 321200 }, { "epoch": 4.986110895575661, "grad_norm": 2.174562454223633, "learning_rate": 4.501390462297677e-05, "loss": 0.9447, "step": 321300 }, { "epoch": 4.987662750818604, "grad_norm": 2.219026803970337, "learning_rate": 4.501235276773383e-05, "loss": 0.9332, "step": 321400 }, { "epoch": 4.989214606061546, "grad_norm": 2.6146748065948486, "learning_rate": 4.501080091249089e-05, "loss": 0.9452, "step": 321500 }, { "epoch": 4.9907664613044895, "grad_norm": 2.1792008876800537, "learning_rate": 4.5009249057247945e-05, "loss": 0.9618, "step": 321600 }, { "epoch": 4.992318316547433, "grad_norm": 2.5979883670806885, "learning_rate": 4.5007697202005e-05, "loss": 0.9728, "step": 321700 }, { "epoch": 4.993870171790375, "grad_norm": 2.4659230709075928, "learning_rate": 4.500614534676206e-05, "loss": 0.9519, "step": 321800 }, { "epoch": 4.995422027033318, "grad_norm": 2.5065293312072754, "learning_rate": 4.500459349151912e-05, "loss": 0.9482, "step": 321900 }, { "epoch": 4.9969738822762615, "grad_norm": 2.614793062210083, "learning_rate": 4.500304163627617e-05, "loss": 0.9384, "step": 322000 }, { "epoch": 4.998525737519204, "grad_norm": 2.390781879425049, "learning_rate": 4.500148978103323e-05, "loss": 0.9569, "step": 322100 }, { "epoch": 5.000077592762147, "grad_norm": 2.347316026687622, "learning_rate": 4.4999937925790285e-05, "loss": 0.941, "step": 322200 }, { "epoch": 5.00162944800509, "grad_norm": 2.559129476547241, "learning_rate": 4.499838607054734e-05, "loss": 0.9351, "step": 322300 }, { "epoch": 5.0031813032480335, "grad_norm": 2.4681804180145264, "learning_rate": 4.49968342153044e-05, "loss": 0.9439, "step": 322400 }, { "epoch": 5.004733158490976, "grad_norm": 2.0576131343841553, "learning_rate": 4.499528236006146e-05, "loss": 0.9415, "step": 322500 }, { "epoch": 5.006285013733919, "grad_norm": 2.0235025882720947, "learning_rate": 4.4993730504818516e-05, "loss": 0.943, "step": 322600 }, { "epoch": 5.007836868976862, "grad_norm": 2.803955554962158, "learning_rate": 4.4992178649575567e-05, "loss": 0.9202, "step": 322700 }, { "epoch": 5.0093887242198045, "grad_norm": 2.0899658203125, "learning_rate": 4.4990626794332624e-05, "loss": 0.9487, "step": 322800 }, { "epoch": 5.010940579462748, "grad_norm": 2.5290629863739014, "learning_rate": 4.498907493908968e-05, "loss": 0.9274, "step": 322900 }, { "epoch": 5.012492434705691, "grad_norm": 2.4043097496032715, "learning_rate": 4.498752308384674e-05, "loss": 0.9354, "step": 323000 }, { "epoch": 5.014044289948633, "grad_norm": 2.8713812828063965, "learning_rate": 4.49859712286038e-05, "loss": 0.9249, "step": 323100 }, { "epoch": 5.0155961451915765, "grad_norm": 2.462629556655884, "learning_rate": 4.4984419373360855e-05, "loss": 0.9239, "step": 323200 }, { "epoch": 5.01714800043452, "grad_norm": 2.2615132331848145, "learning_rate": 4.498286751811791e-05, "loss": 0.9177, "step": 323300 }, { "epoch": 5.018699855677462, "grad_norm": 2.3706109523773193, "learning_rate": 4.498131566287497e-05, "loss": 0.9281, "step": 323400 }, { "epoch": 5.020251710920405, "grad_norm": 2.2520835399627686, "learning_rate": 4.497976380763202e-05, "loss": 0.9271, "step": 323500 }, { "epoch": 5.021803566163348, "grad_norm": 1.9909920692443848, "learning_rate": 4.497821195238908e-05, "loss": 0.9245, "step": 323600 }, { "epoch": 5.023355421406292, "grad_norm": 2.263463020324707, "learning_rate": 4.497666009714614e-05, "loss": 0.9413, "step": 323700 }, { "epoch": 5.024907276649234, "grad_norm": 2.642301559448242, "learning_rate": 4.4975108241903195e-05, "loss": 0.9462, "step": 323800 }, { "epoch": 5.026459131892177, "grad_norm": 2.108497142791748, "learning_rate": 4.497355638666025e-05, "loss": 0.9291, "step": 323900 }, { "epoch": 5.02801098713512, "grad_norm": 2.6619019508361816, "learning_rate": 4.497200453141731e-05, "loss": 0.9396, "step": 324000 }, { "epoch": 5.029562842378063, "grad_norm": 2.5671544075012207, "learning_rate": 4.497045267617437e-05, "loss": 0.9156, "step": 324100 }, { "epoch": 5.031114697621006, "grad_norm": 2.0733346939086914, "learning_rate": 4.4968900820931426e-05, "loss": 0.9208, "step": 324200 }, { "epoch": 5.032666552863949, "grad_norm": 2.0850565433502197, "learning_rate": 4.4967348965688484e-05, "loss": 0.9584, "step": 324300 }, { "epoch": 5.0342184081068915, "grad_norm": 2.4272682666778564, "learning_rate": 4.496579711044554e-05, "loss": 0.9225, "step": 324400 }, { "epoch": 5.035770263349835, "grad_norm": 2.279730796813965, "learning_rate": 4.49642452552026e-05, "loss": 0.9201, "step": 324500 }, { "epoch": 5.037322118592778, "grad_norm": 2.8253204822540283, "learning_rate": 4.496269339995966e-05, "loss": 0.9224, "step": 324600 }, { "epoch": 5.03887397383572, "grad_norm": 2.461792230606079, "learning_rate": 4.4961141544716715e-05, "loss": 0.9452, "step": 324700 }, { "epoch": 5.040425829078663, "grad_norm": 2.849407911300659, "learning_rate": 4.4959589689473766e-05, "loss": 0.9179, "step": 324800 }, { "epoch": 5.041977684321607, "grad_norm": 2.8984506130218506, "learning_rate": 4.4958037834230824e-05, "loss": 0.9516, "step": 324900 }, { "epoch": 5.04352953956455, "grad_norm": 2.6941490173339844, "learning_rate": 4.495648597898788e-05, "loss": 0.9487, "step": 325000 }, { "epoch": 5.045081394807492, "grad_norm": 2.036566734313965, "learning_rate": 4.495493412374494e-05, "loss": 0.9338, "step": 325100 }, { "epoch": 5.046633250050435, "grad_norm": 2.200101852416992, "learning_rate": 4.4953382268502e-05, "loss": 0.9551, "step": 325200 }, { "epoch": 5.048185105293379, "grad_norm": 2.4567151069641113, "learning_rate": 4.4951830413259055e-05, "loss": 0.9455, "step": 325300 }, { "epoch": 5.049736960536321, "grad_norm": 2.012946128845215, "learning_rate": 4.495027855801611e-05, "loss": 0.9336, "step": 325400 }, { "epoch": 5.051288815779264, "grad_norm": 2.0810999870300293, "learning_rate": 4.494872670277317e-05, "loss": 0.9409, "step": 325500 }, { "epoch": 5.052840671022207, "grad_norm": 2.3124871253967285, "learning_rate": 4.494717484753023e-05, "loss": 0.9252, "step": 325600 }, { "epoch": 5.05439252626515, "grad_norm": 2.8281376361846924, "learning_rate": 4.4945622992287286e-05, "loss": 0.9464, "step": 325700 }, { "epoch": 5.055944381508093, "grad_norm": 2.099107503890991, "learning_rate": 4.494407113704434e-05, "loss": 0.911, "step": 325800 }, { "epoch": 5.057496236751036, "grad_norm": 2.1740570068359375, "learning_rate": 4.49425192818014e-05, "loss": 0.9414, "step": 325900 }, { "epoch": 5.059048091993978, "grad_norm": 3.989823818206787, "learning_rate": 4.494096742655845e-05, "loss": 0.9431, "step": 326000 }, { "epoch": 5.060599947236922, "grad_norm": 2.4790358543395996, "learning_rate": 4.493941557131551e-05, "loss": 0.9462, "step": 326100 }, { "epoch": 5.062151802479865, "grad_norm": 2.29978346824646, "learning_rate": 4.493786371607257e-05, "loss": 0.9192, "step": 326200 }, { "epoch": 5.063703657722808, "grad_norm": 2.5995535850524902, "learning_rate": 4.493631186082962e-05, "loss": 0.9369, "step": 326300 }, { "epoch": 5.06525551296575, "grad_norm": 2.0604941844940186, "learning_rate": 4.4934760005586676e-05, "loss": 0.9317, "step": 326400 }, { "epoch": 5.066807368208694, "grad_norm": 2.4740841388702393, "learning_rate": 4.4933208150343734e-05, "loss": 0.9466, "step": 326500 }, { "epoch": 5.068359223451637, "grad_norm": 2.4050912857055664, "learning_rate": 4.493165629510079e-05, "loss": 0.9815, "step": 326600 }, { "epoch": 5.069911078694579, "grad_norm": 2.4057934284210205, "learning_rate": 4.493010443985785e-05, "loss": 0.9287, "step": 326700 }, { "epoch": 5.071462933937522, "grad_norm": 2.444791316986084, "learning_rate": 4.492855258461491e-05, "loss": 0.9252, "step": 326800 }, { "epoch": 5.0730147891804656, "grad_norm": 2.4279468059539795, "learning_rate": 4.4927000729371965e-05, "loss": 0.919, "step": 326900 }, { "epoch": 5.074566644423408, "grad_norm": 2.613544464111328, "learning_rate": 4.492544887412902e-05, "loss": 0.936, "step": 327000 }, { "epoch": 5.076118499666351, "grad_norm": 2.088479995727539, "learning_rate": 4.492389701888608e-05, "loss": 0.9626, "step": 327100 }, { "epoch": 5.077670354909294, "grad_norm": 2.518490791320801, "learning_rate": 4.492234516364314e-05, "loss": 0.9565, "step": 327200 }, { "epoch": 5.079222210152237, "grad_norm": 2.275557518005371, "learning_rate": 4.4920793308400196e-05, "loss": 0.9329, "step": 327300 }, { "epoch": 5.08077406539518, "grad_norm": 2.5919785499572754, "learning_rate": 4.4919241453157254e-05, "loss": 0.9334, "step": 327400 }, { "epoch": 5.082325920638123, "grad_norm": 2.203076124191284, "learning_rate": 4.491768959791431e-05, "loss": 0.9201, "step": 327500 }, { "epoch": 5.083877775881065, "grad_norm": 1.9619935750961304, "learning_rate": 4.491613774267136e-05, "loss": 0.9328, "step": 327600 }, { "epoch": 5.085429631124009, "grad_norm": 2.643890619277954, "learning_rate": 4.491458588742842e-05, "loss": 0.9534, "step": 327700 }, { "epoch": 5.086981486366952, "grad_norm": 2.0486152172088623, "learning_rate": 4.491303403218548e-05, "loss": 0.9532, "step": 327800 }, { "epoch": 5.088533341609895, "grad_norm": 2.401646614074707, "learning_rate": 4.4911482176942536e-05, "loss": 0.9527, "step": 327900 }, { "epoch": 5.090085196852837, "grad_norm": 2.308637857437134, "learning_rate": 4.4909930321699594e-05, "loss": 0.9446, "step": 328000 }, { "epoch": 5.0916370520957805, "grad_norm": 2.2360644340515137, "learning_rate": 4.490837846645665e-05, "loss": 0.9444, "step": 328100 }, { "epoch": 5.093188907338724, "grad_norm": 2.147420883178711, "learning_rate": 4.490682661121371e-05, "loss": 0.9717, "step": 328200 }, { "epoch": 5.094740762581666, "grad_norm": 2.7627639770507812, "learning_rate": 4.490527475597077e-05, "loss": 0.9558, "step": 328300 }, { "epoch": 5.096292617824609, "grad_norm": 2.7743945121765137, "learning_rate": 4.4903722900727825e-05, "loss": 0.9551, "step": 328400 }, { "epoch": 5.0978444730675525, "grad_norm": 2.0228092670440674, "learning_rate": 4.490217104548488e-05, "loss": 0.9381, "step": 328500 }, { "epoch": 5.099396328310495, "grad_norm": 2.442352056503296, "learning_rate": 4.490061919024194e-05, "loss": 0.9549, "step": 328600 }, { "epoch": 5.100948183553438, "grad_norm": 2.108042001724243, "learning_rate": 4.4899067334999e-05, "loss": 0.956, "step": 328700 }, { "epoch": 5.102500038796381, "grad_norm": 2.3686277866363525, "learning_rate": 4.4897515479756056e-05, "loss": 0.9613, "step": 328800 }, { "epoch": 5.104051894039324, "grad_norm": 2.1694276332855225, "learning_rate": 4.4895963624513107e-05, "loss": 0.9557, "step": 328900 }, { "epoch": 5.105603749282267, "grad_norm": 2.4864728450775146, "learning_rate": 4.4894411769270164e-05, "loss": 0.9566, "step": 329000 }, { "epoch": 5.10715560452521, "grad_norm": 2.901928424835205, "learning_rate": 4.489285991402722e-05, "loss": 0.937, "step": 329100 }, { "epoch": 5.108707459768153, "grad_norm": 2.5412092208862305, "learning_rate": 4.489130805878427e-05, "loss": 0.9394, "step": 329200 }, { "epoch": 5.1102593150110955, "grad_norm": 3.0296497344970703, "learning_rate": 4.488975620354133e-05, "loss": 0.9261, "step": 329300 }, { "epoch": 5.111811170254039, "grad_norm": 2.289612293243408, "learning_rate": 4.488820434829839e-05, "loss": 0.9411, "step": 329400 }, { "epoch": 5.113363025496982, "grad_norm": 2.537592887878418, "learning_rate": 4.4886652493055446e-05, "loss": 0.9471, "step": 329500 }, { "epoch": 5.114914880739924, "grad_norm": 2.8172383308410645, "learning_rate": 4.4885100637812504e-05, "loss": 0.923, "step": 329600 }, { "epoch": 5.1164667359828675, "grad_norm": 2.1736035346984863, "learning_rate": 4.488354878256956e-05, "loss": 0.9385, "step": 329700 }, { "epoch": 5.118018591225811, "grad_norm": 2.492478847503662, "learning_rate": 4.488199692732662e-05, "loss": 0.9274, "step": 329800 }, { "epoch": 5.119570446468753, "grad_norm": 1.9335956573486328, "learning_rate": 4.488044507208368e-05, "loss": 0.9316, "step": 329900 }, { "epoch": 5.121122301711696, "grad_norm": 2.4427108764648438, "learning_rate": 4.4878893216840735e-05, "loss": 0.9284, "step": 330000 }, { "epoch": 5.1226741569546395, "grad_norm": 2.444591999053955, "learning_rate": 4.487734136159779e-05, "loss": 0.9364, "step": 330100 }, { "epoch": 5.124226012197582, "grad_norm": 3.805903434753418, "learning_rate": 4.487578950635485e-05, "loss": 0.9404, "step": 330200 }, { "epoch": 5.125777867440525, "grad_norm": 2.8099279403686523, "learning_rate": 4.487423765111191e-05, "loss": 0.9381, "step": 330300 }, { "epoch": 5.127329722683468, "grad_norm": 2.311497211456299, "learning_rate": 4.4872685795868966e-05, "loss": 0.9221, "step": 330400 }, { "epoch": 5.128881577926411, "grad_norm": 2.324272394180298, "learning_rate": 4.487113394062602e-05, "loss": 0.9315, "step": 330500 }, { "epoch": 5.130433433169354, "grad_norm": 2.5286362171173096, "learning_rate": 4.4869582085383075e-05, "loss": 0.9608, "step": 330600 }, { "epoch": 5.131985288412297, "grad_norm": 2.3630173206329346, "learning_rate": 4.486803023014013e-05, "loss": 0.9668, "step": 330700 }, { "epoch": 5.13353714365524, "grad_norm": 2.6161348819732666, "learning_rate": 4.486647837489719e-05, "loss": 0.921, "step": 330800 }, { "epoch": 5.1350889988981825, "grad_norm": 2.346592903137207, "learning_rate": 4.486492651965425e-05, "loss": 0.9335, "step": 330900 }, { "epoch": 5.136640854141126, "grad_norm": 2.675856351852417, "learning_rate": 4.4863374664411306e-05, "loss": 0.9427, "step": 331000 }, { "epoch": 5.138192709384069, "grad_norm": 2.246399164199829, "learning_rate": 4.4861822809168364e-05, "loss": 0.9328, "step": 331100 }, { "epoch": 5.139744564627011, "grad_norm": 1.806089997291565, "learning_rate": 4.486027095392542e-05, "loss": 0.9253, "step": 331200 }, { "epoch": 5.1412964198699544, "grad_norm": 2.518101930618286, "learning_rate": 4.485871909868248e-05, "loss": 0.9349, "step": 331300 }, { "epoch": 5.142848275112898, "grad_norm": 2.6788482666015625, "learning_rate": 4.485716724343954e-05, "loss": 0.9317, "step": 331400 }, { "epoch": 5.14440013035584, "grad_norm": 1.9252171516418457, "learning_rate": 4.4855615388196595e-05, "loss": 0.944, "step": 331500 }, { "epoch": 5.145951985598783, "grad_norm": 2.2052817344665527, "learning_rate": 4.485406353295365e-05, "loss": 0.9591, "step": 331600 }, { "epoch": 5.147503840841726, "grad_norm": 2.535583972930908, "learning_rate": 4.485251167771071e-05, "loss": 0.9307, "step": 331700 }, { "epoch": 5.14905569608467, "grad_norm": 2.0313751697540283, "learning_rate": 4.485095982246776e-05, "loss": 0.9574, "step": 331800 }, { "epoch": 5.150607551327612, "grad_norm": 2.3180432319641113, "learning_rate": 4.484940796722482e-05, "loss": 0.9487, "step": 331900 }, { "epoch": 5.152159406570555, "grad_norm": 2.4816319942474365, "learning_rate": 4.4847856111981877e-05, "loss": 0.9296, "step": 332000 }, { "epoch": 5.153711261813498, "grad_norm": 2.6870551109313965, "learning_rate": 4.4846304256738934e-05, "loss": 0.9481, "step": 332100 }, { "epoch": 5.155263117056441, "grad_norm": 1.9675137996673584, "learning_rate": 4.484475240149599e-05, "loss": 0.9496, "step": 332200 }, { "epoch": 5.156814972299384, "grad_norm": 2.095510721206665, "learning_rate": 4.484320054625305e-05, "loss": 0.9232, "step": 332300 }, { "epoch": 5.158366827542327, "grad_norm": 2.2953479290008545, "learning_rate": 4.484164869101011e-05, "loss": 0.9203, "step": 332400 }, { "epoch": 5.159918682785269, "grad_norm": 2.3582136631011963, "learning_rate": 4.484009683576716e-05, "loss": 0.9397, "step": 332500 }, { "epoch": 5.161470538028213, "grad_norm": 2.532912254333496, "learning_rate": 4.4838544980524216e-05, "loss": 0.9301, "step": 332600 }, { "epoch": 5.163022393271156, "grad_norm": 2.7134761810302734, "learning_rate": 4.4836993125281274e-05, "loss": 0.929, "step": 332700 }, { "epoch": 5.164574248514098, "grad_norm": 2.222034215927124, "learning_rate": 4.483544127003833e-05, "loss": 0.9526, "step": 332800 }, { "epoch": 5.166126103757041, "grad_norm": 2.75068998336792, "learning_rate": 4.483388941479539e-05, "loss": 0.9076, "step": 332900 }, { "epoch": 5.167677958999985, "grad_norm": 2.7021796703338623, "learning_rate": 4.483233755955245e-05, "loss": 0.9449, "step": 333000 }, { "epoch": 5.169229814242928, "grad_norm": 3.063997745513916, "learning_rate": 4.4830785704309505e-05, "loss": 0.9501, "step": 333100 }, { "epoch": 5.17078166948587, "grad_norm": 2.347933769226074, "learning_rate": 4.482923384906656e-05, "loss": 0.9117, "step": 333200 }, { "epoch": 5.172333524728813, "grad_norm": 2.0761148929595947, "learning_rate": 4.4827681993823614e-05, "loss": 0.934, "step": 333300 }, { "epoch": 5.173885379971757, "grad_norm": 2.7138662338256836, "learning_rate": 4.482613013858067e-05, "loss": 0.9314, "step": 333400 }, { "epoch": 5.175437235214699, "grad_norm": 2.2544968128204346, "learning_rate": 4.482457828333773e-05, "loss": 0.9191, "step": 333500 }, { "epoch": 5.176989090457642, "grad_norm": 2.0109879970550537, "learning_rate": 4.482302642809479e-05, "loss": 0.9532, "step": 333600 }, { "epoch": 5.178540945700585, "grad_norm": 2.167799234390259, "learning_rate": 4.4821474572851845e-05, "loss": 0.9206, "step": 333700 }, { "epoch": 5.180092800943528, "grad_norm": 1.9144397974014282, "learning_rate": 4.48199227176089e-05, "loss": 0.9362, "step": 333800 }, { "epoch": 5.181644656186471, "grad_norm": 2.3663907051086426, "learning_rate": 4.481837086236596e-05, "loss": 0.9303, "step": 333900 }, { "epoch": 5.183196511429414, "grad_norm": 2.354807138442993, "learning_rate": 4.481681900712302e-05, "loss": 0.9523, "step": 334000 }, { "epoch": 5.184748366672356, "grad_norm": 2.2805490493774414, "learning_rate": 4.4815267151880076e-05, "loss": 0.9192, "step": 334100 }, { "epoch": 5.1863002219153, "grad_norm": 2.1835274696350098, "learning_rate": 4.4813715296637134e-05, "loss": 0.9204, "step": 334200 }, { "epoch": 5.187852077158243, "grad_norm": 2.9378137588500977, "learning_rate": 4.481216344139419e-05, "loss": 0.9324, "step": 334300 }, { "epoch": 5.189403932401186, "grad_norm": 2.423781633377075, "learning_rate": 4.481061158615125e-05, "loss": 0.9332, "step": 334400 }, { "epoch": 5.190955787644128, "grad_norm": 2.2698163986206055, "learning_rate": 4.480905973090831e-05, "loss": 0.9606, "step": 334500 }, { "epoch": 5.192507642887072, "grad_norm": 2.402588129043579, "learning_rate": 4.480750787566536e-05, "loss": 0.949, "step": 334600 }, { "epoch": 5.194059498130015, "grad_norm": 2.5423386096954346, "learning_rate": 4.4805956020422416e-05, "loss": 0.9698, "step": 334700 }, { "epoch": 5.195611353372957, "grad_norm": 2.199608087539673, "learning_rate": 4.480440416517947e-05, "loss": 0.9404, "step": 334800 }, { "epoch": 5.1971632086159, "grad_norm": 2.2218263149261475, "learning_rate": 4.480285230993653e-05, "loss": 0.9442, "step": 334900 }, { "epoch": 5.1987150638588435, "grad_norm": 2.550536870956421, "learning_rate": 4.480130045469359e-05, "loss": 0.9349, "step": 335000 }, { "epoch": 5.200266919101786, "grad_norm": 2.187838554382324, "learning_rate": 4.4799748599450647e-05, "loss": 0.9368, "step": 335100 }, { "epoch": 5.201818774344729, "grad_norm": 2.3110222816467285, "learning_rate": 4.4798196744207704e-05, "loss": 0.9424, "step": 335200 }, { "epoch": 5.203370629587672, "grad_norm": 2.3333053588867188, "learning_rate": 4.479664488896476e-05, "loss": 0.9387, "step": 335300 }, { "epoch": 5.204922484830615, "grad_norm": 2.5928475856781006, "learning_rate": 4.479509303372182e-05, "loss": 0.9472, "step": 335400 }, { "epoch": 5.206474340073558, "grad_norm": 2.4974663257598877, "learning_rate": 4.479354117847888e-05, "loss": 0.9355, "step": 335500 }, { "epoch": 5.208026195316501, "grad_norm": 2.3604626655578613, "learning_rate": 4.4791989323235935e-05, "loss": 0.9367, "step": 335600 }, { "epoch": 5.209578050559444, "grad_norm": 2.5092456340789795, "learning_rate": 4.4790437467992986e-05, "loss": 0.9355, "step": 335700 }, { "epoch": 5.2111299058023866, "grad_norm": 2.696324348449707, "learning_rate": 4.4788885612750044e-05, "loss": 0.9293, "step": 335800 }, { "epoch": 5.21268176104533, "grad_norm": 2.0863029956817627, "learning_rate": 4.47873337575071e-05, "loss": 0.9407, "step": 335900 }, { "epoch": 5.214233616288273, "grad_norm": 2.3235466480255127, "learning_rate": 4.478578190226416e-05, "loss": 0.9436, "step": 336000 }, { "epoch": 5.215785471531215, "grad_norm": 2.279214382171631, "learning_rate": 4.478423004702121e-05, "loss": 0.9427, "step": 336100 }, { "epoch": 5.2173373267741585, "grad_norm": 1.908363938331604, "learning_rate": 4.478267819177827e-05, "loss": 0.9362, "step": 336200 }, { "epoch": 5.218889182017102, "grad_norm": 2.4290566444396973, "learning_rate": 4.4781126336535326e-05, "loss": 0.9302, "step": 336300 }, { "epoch": 5.220441037260044, "grad_norm": 2.5620579719543457, "learning_rate": 4.4779574481292384e-05, "loss": 0.9562, "step": 336400 }, { "epoch": 5.221992892502987, "grad_norm": 2.550293445587158, "learning_rate": 4.477802262604944e-05, "loss": 0.9362, "step": 336500 }, { "epoch": 5.2235447477459305, "grad_norm": 2.330932140350342, "learning_rate": 4.47764707708065e-05, "loss": 0.9383, "step": 336600 }, { "epoch": 5.225096602988873, "grad_norm": 2.607112169265747, "learning_rate": 4.477491891556356e-05, "loss": 0.958, "step": 336700 }, { "epoch": 5.226648458231816, "grad_norm": 2.6160247325897217, "learning_rate": 4.4773367060320615e-05, "loss": 0.9271, "step": 336800 }, { "epoch": 5.228200313474759, "grad_norm": 3.3095061779022217, "learning_rate": 4.477181520507767e-05, "loss": 0.9365, "step": 336900 }, { "epoch": 5.229752168717702, "grad_norm": 2.3789634704589844, "learning_rate": 4.477026334983473e-05, "loss": 0.9315, "step": 337000 }, { "epoch": 5.231304023960645, "grad_norm": 2.188098192214966, "learning_rate": 4.476871149459179e-05, "loss": 0.9413, "step": 337100 }, { "epoch": 5.232855879203588, "grad_norm": 2.240095853805542, "learning_rate": 4.4767159639348846e-05, "loss": 0.9386, "step": 337200 }, { "epoch": 5.234407734446531, "grad_norm": 2.2836861610412598, "learning_rate": 4.4765607784105904e-05, "loss": 0.9392, "step": 337300 }, { "epoch": 5.2359595896894735, "grad_norm": 2.523648977279663, "learning_rate": 4.476405592886296e-05, "loss": 0.9367, "step": 337400 }, { "epoch": 5.237511444932417, "grad_norm": 2.7476730346679688, "learning_rate": 4.476250407362001e-05, "loss": 0.9302, "step": 337500 }, { "epoch": 5.23906330017536, "grad_norm": 2.155001401901245, "learning_rate": 4.476095221837707e-05, "loss": 0.9575, "step": 337600 }, { "epoch": 5.240615155418302, "grad_norm": 2.2363150119781494, "learning_rate": 4.475940036313413e-05, "loss": 0.9283, "step": 337700 }, { "epoch": 5.2421670106612455, "grad_norm": 2.6665494441986084, "learning_rate": 4.4757848507891186e-05, "loss": 0.9286, "step": 337800 }, { "epoch": 5.243718865904189, "grad_norm": 2.4878904819488525, "learning_rate": 4.475629665264824e-05, "loss": 0.928, "step": 337900 }, { "epoch": 5.245270721147131, "grad_norm": 2.3347055912017822, "learning_rate": 4.47547447974053e-05, "loss": 0.9357, "step": 338000 }, { "epoch": 5.246822576390074, "grad_norm": 2.212482452392578, "learning_rate": 4.475319294216236e-05, "loss": 0.9306, "step": 338100 }, { "epoch": 5.248374431633017, "grad_norm": 1.733121395111084, "learning_rate": 4.4751641086919417e-05, "loss": 0.8999, "step": 338200 }, { "epoch": 5.249926286875961, "grad_norm": 2.1066431999206543, "learning_rate": 4.4750089231676474e-05, "loss": 0.9315, "step": 338300 }, { "epoch": 5.251478142118903, "grad_norm": 2.5459046363830566, "learning_rate": 4.474853737643353e-05, "loss": 0.9283, "step": 338400 }, { "epoch": 5.253029997361846, "grad_norm": 2.1508164405822754, "learning_rate": 4.474698552119059e-05, "loss": 0.9277, "step": 338500 }, { "epoch": 5.254581852604789, "grad_norm": 3.071340799331665, "learning_rate": 4.474543366594765e-05, "loss": 0.915, "step": 338600 }, { "epoch": 5.256133707847732, "grad_norm": 2.0952510833740234, "learning_rate": 4.4743881810704705e-05, "loss": 0.9443, "step": 338700 }, { "epoch": 5.257685563090675, "grad_norm": 2.7172558307647705, "learning_rate": 4.4742329955461756e-05, "loss": 0.9313, "step": 338800 }, { "epoch": 5.259237418333618, "grad_norm": 2.8227391242980957, "learning_rate": 4.4740778100218814e-05, "loss": 0.9379, "step": 338900 }, { "epoch": 5.2607892735765605, "grad_norm": 2.26188588142395, "learning_rate": 4.4739226244975865e-05, "loss": 0.927, "step": 339000 }, { "epoch": 5.262341128819504, "grad_norm": 2.3466920852661133, "learning_rate": 4.473767438973292e-05, "loss": 0.9394, "step": 339100 }, { "epoch": 5.263892984062447, "grad_norm": 2.68229079246521, "learning_rate": 4.473612253448998e-05, "loss": 0.9453, "step": 339200 }, { "epoch": 5.265444839305389, "grad_norm": 2.2861335277557373, "learning_rate": 4.473457067924704e-05, "loss": 0.9416, "step": 339300 }, { "epoch": 5.266996694548332, "grad_norm": 2.6508710384368896, "learning_rate": 4.4733018824004096e-05, "loss": 0.9274, "step": 339400 }, { "epoch": 5.268548549791276, "grad_norm": 2.1282732486724854, "learning_rate": 4.4731466968761154e-05, "loss": 0.936, "step": 339500 }, { "epoch": 5.270100405034219, "grad_norm": 2.5777840614318848, "learning_rate": 4.472991511351821e-05, "loss": 0.9291, "step": 339600 }, { "epoch": 5.271652260277161, "grad_norm": 2.1604416370391846, "learning_rate": 4.472836325827527e-05, "loss": 0.9343, "step": 339700 }, { "epoch": 5.273204115520104, "grad_norm": 2.212364912033081, "learning_rate": 4.472681140303233e-05, "loss": 0.9331, "step": 339800 }, { "epoch": 5.274755970763048, "grad_norm": 2.3613016605377197, "learning_rate": 4.4725259547789385e-05, "loss": 0.9383, "step": 339900 }, { "epoch": 5.27630782600599, "grad_norm": 2.746622323989868, "learning_rate": 4.472370769254644e-05, "loss": 0.9333, "step": 340000 }, { "epoch": 5.277859681248933, "grad_norm": 2.369797706604004, "learning_rate": 4.47221558373035e-05, "loss": 0.9368, "step": 340100 }, { "epoch": 5.279411536491876, "grad_norm": 3.338216543197632, "learning_rate": 4.472060398206056e-05, "loss": 0.9161, "step": 340200 }, { "epoch": 5.280963391734819, "grad_norm": 2.740516424179077, "learning_rate": 4.471905212681761e-05, "loss": 0.9238, "step": 340300 }, { "epoch": 5.282515246977762, "grad_norm": 2.328472375869751, "learning_rate": 4.471750027157467e-05, "loss": 0.9312, "step": 340400 }, { "epoch": 5.284067102220705, "grad_norm": 2.584178924560547, "learning_rate": 4.4715948416331725e-05, "loss": 0.9479, "step": 340500 }, { "epoch": 5.285618957463647, "grad_norm": 2.314603805541992, "learning_rate": 4.471439656108878e-05, "loss": 0.9043, "step": 340600 }, { "epoch": 5.287170812706591, "grad_norm": 2.67551851272583, "learning_rate": 4.471284470584584e-05, "loss": 0.9563, "step": 340700 }, { "epoch": 5.288722667949534, "grad_norm": 2.564943552017212, "learning_rate": 4.47112928506029e-05, "loss": 0.9219, "step": 340800 }, { "epoch": 5.290274523192476, "grad_norm": 2.2841994762420654, "learning_rate": 4.4709740995359956e-05, "loss": 0.946, "step": 340900 }, { "epoch": 5.291826378435419, "grad_norm": 4.39994478225708, "learning_rate": 4.470818914011701e-05, "loss": 0.9453, "step": 341000 }, { "epoch": 5.293378233678363, "grad_norm": 2.4029433727264404, "learning_rate": 4.470663728487407e-05, "loss": 0.9225, "step": 341100 }, { "epoch": 5.294930088921306, "grad_norm": 2.1270804405212402, "learning_rate": 4.470508542963113e-05, "loss": 0.9145, "step": 341200 }, { "epoch": 5.296481944164248, "grad_norm": 2.3497273921966553, "learning_rate": 4.4703533574388187e-05, "loss": 0.9302, "step": 341300 }, { "epoch": 5.298033799407191, "grad_norm": 2.8467586040496826, "learning_rate": 4.4701981719145244e-05, "loss": 0.9424, "step": 341400 }, { "epoch": 5.2995856546501345, "grad_norm": 2.342426061630249, "learning_rate": 4.47004298639023e-05, "loss": 0.9501, "step": 341500 }, { "epoch": 5.301137509893077, "grad_norm": 2.2114741802215576, "learning_rate": 4.469887800865935e-05, "loss": 0.9157, "step": 341600 }, { "epoch": 5.30268936513602, "grad_norm": 2.3363637924194336, "learning_rate": 4.469732615341641e-05, "loss": 0.9395, "step": 341700 }, { "epoch": 5.304241220378963, "grad_norm": 2.2887377738952637, "learning_rate": 4.469577429817347e-05, "loss": 0.9304, "step": 341800 }, { "epoch": 5.305793075621906, "grad_norm": 2.5890958309173584, "learning_rate": 4.4694222442930526e-05, "loss": 0.9324, "step": 341900 }, { "epoch": 5.307344930864849, "grad_norm": 2.596276044845581, "learning_rate": 4.4692670587687584e-05, "loss": 0.9287, "step": 342000 }, { "epoch": 5.308896786107792, "grad_norm": 2.3310697078704834, "learning_rate": 4.469111873244464e-05, "loss": 0.9201, "step": 342100 }, { "epoch": 5.310448641350735, "grad_norm": 2.3093066215515137, "learning_rate": 4.468956687720169e-05, "loss": 0.9233, "step": 342200 }, { "epoch": 5.312000496593678, "grad_norm": 2.3616435527801514, "learning_rate": 4.468801502195875e-05, "loss": 0.93, "step": 342300 }, { "epoch": 5.313552351836621, "grad_norm": 2.3065264225006104, "learning_rate": 4.468646316671581e-05, "loss": 0.9415, "step": 342400 }, { "epoch": 5.315104207079564, "grad_norm": 2.2461142539978027, "learning_rate": 4.4684911311472866e-05, "loss": 0.9349, "step": 342500 }, { "epoch": 5.316656062322506, "grad_norm": 2.6543939113616943, "learning_rate": 4.4683359456229924e-05, "loss": 0.9339, "step": 342600 }, { "epoch": 5.3182079175654495, "grad_norm": 2.1307969093322754, "learning_rate": 4.468180760098698e-05, "loss": 0.9381, "step": 342700 }, { "epoch": 5.319759772808393, "grad_norm": 1.9315050840377808, "learning_rate": 4.468025574574404e-05, "loss": 0.9343, "step": 342800 }, { "epoch": 5.321311628051335, "grad_norm": 2.668227434158325, "learning_rate": 4.46787038905011e-05, "loss": 0.9263, "step": 342900 }, { "epoch": 5.322863483294278, "grad_norm": 2.312304735183716, "learning_rate": 4.4677152035258155e-05, "loss": 0.9329, "step": 343000 }, { "epoch": 5.3244153385372215, "grad_norm": 2.6697068214416504, "learning_rate": 4.4675600180015206e-05, "loss": 0.9299, "step": 343100 }, { "epoch": 5.325967193780164, "grad_norm": 2.340005397796631, "learning_rate": 4.4674048324772264e-05, "loss": 0.9216, "step": 343200 }, { "epoch": 5.327519049023107, "grad_norm": 12.126254081726074, "learning_rate": 4.467249646952932e-05, "loss": 0.931, "step": 343300 }, { "epoch": 5.32907090426605, "grad_norm": 2.5090491771698, "learning_rate": 4.467094461428638e-05, "loss": 0.9283, "step": 343400 }, { "epoch": 5.330622759508993, "grad_norm": 2.5498058795928955, "learning_rate": 4.466939275904344e-05, "loss": 0.9173, "step": 343500 }, { "epoch": 5.332174614751936, "grad_norm": 2.301290512084961, "learning_rate": 4.4667840903800495e-05, "loss": 0.9317, "step": 343600 }, { "epoch": 5.333726469994879, "grad_norm": 2.3305540084838867, "learning_rate": 4.466628904855755e-05, "loss": 0.9407, "step": 343700 }, { "epoch": 5.335278325237822, "grad_norm": 2.4497978687286377, "learning_rate": 4.466473719331461e-05, "loss": 0.924, "step": 343800 }, { "epoch": 5.3368301804807645, "grad_norm": 2.0917134284973145, "learning_rate": 4.466318533807167e-05, "loss": 0.9322, "step": 343900 }, { "epoch": 5.338382035723708, "grad_norm": 2.546950578689575, "learning_rate": 4.4661633482828726e-05, "loss": 0.9403, "step": 344000 }, { "epoch": 5.339933890966651, "grad_norm": 2.3260014057159424, "learning_rate": 4.466008162758578e-05, "loss": 0.9359, "step": 344100 }, { "epoch": 5.341485746209593, "grad_norm": 2.4337263107299805, "learning_rate": 4.465852977234284e-05, "loss": 0.9355, "step": 344200 }, { "epoch": 5.3430376014525365, "grad_norm": 2.2937326431274414, "learning_rate": 4.46569779170999e-05, "loss": 0.9331, "step": 344300 }, { "epoch": 5.34458945669548, "grad_norm": 2.7773754596710205, "learning_rate": 4.465542606185695e-05, "loss": 0.9429, "step": 344400 }, { "epoch": 5.346141311938422, "grad_norm": 2.3431665897369385, "learning_rate": 4.465387420661401e-05, "loss": 0.9333, "step": 344500 }, { "epoch": 5.347693167181365, "grad_norm": 2.15849232673645, "learning_rate": 4.4652322351371065e-05, "loss": 0.9313, "step": 344600 }, { "epoch": 5.349245022424308, "grad_norm": 2.234497308731079, "learning_rate": 4.465077049612812e-05, "loss": 0.9208, "step": 344700 }, { "epoch": 5.350796877667252, "grad_norm": 2.4546382427215576, "learning_rate": 4.464921864088518e-05, "loss": 0.9227, "step": 344800 }, { "epoch": 5.352348732910194, "grad_norm": 2.1840200424194336, "learning_rate": 4.464766678564224e-05, "loss": 0.9306, "step": 344900 }, { "epoch": 5.353900588153137, "grad_norm": 2.0288572311401367, "learning_rate": 4.4646114930399296e-05, "loss": 0.9293, "step": 345000 }, { "epoch": 5.35545244339608, "grad_norm": 2.3710718154907227, "learning_rate": 4.4644563075156354e-05, "loss": 0.9409, "step": 345100 }, { "epoch": 5.357004298639023, "grad_norm": 2.3587846755981445, "learning_rate": 4.464301121991341e-05, "loss": 0.9347, "step": 345200 }, { "epoch": 5.358556153881966, "grad_norm": 1.867029070854187, "learning_rate": 4.464145936467047e-05, "loss": 0.9215, "step": 345300 }, { "epoch": 5.360108009124909, "grad_norm": 2.711923360824585, "learning_rate": 4.463990750942753e-05, "loss": 0.9367, "step": 345400 }, { "epoch": 5.3616598643678515, "grad_norm": 2.3150227069854736, "learning_rate": 4.463835565418458e-05, "loss": 0.9191, "step": 345500 }, { "epoch": 5.363211719610795, "grad_norm": 2.101011037826538, "learning_rate": 4.4636803798941636e-05, "loss": 0.9262, "step": 345600 }, { "epoch": 5.364763574853738, "grad_norm": 2.2388579845428467, "learning_rate": 4.4635251943698694e-05, "loss": 0.9433, "step": 345700 }, { "epoch": 5.36631543009668, "grad_norm": 2.004814863204956, "learning_rate": 4.463370008845575e-05, "loss": 0.9252, "step": 345800 }, { "epoch": 5.367867285339623, "grad_norm": 2.2130181789398193, "learning_rate": 4.463214823321281e-05, "loss": 0.9325, "step": 345900 }, { "epoch": 5.369419140582567, "grad_norm": 2.6334128379821777, "learning_rate": 4.463059637796986e-05, "loss": 0.9271, "step": 346000 }, { "epoch": 5.370970995825509, "grad_norm": 2.674484968185425, "learning_rate": 4.462904452272692e-05, "loss": 0.9427, "step": 346100 }, { "epoch": 5.372522851068452, "grad_norm": 2.658144950866699, "learning_rate": 4.4627492667483976e-05, "loss": 0.9046, "step": 346200 }, { "epoch": 5.374074706311395, "grad_norm": 2.464202404022217, "learning_rate": 4.4625940812241034e-05, "loss": 0.9164, "step": 346300 }, { "epoch": 5.375626561554339, "grad_norm": 2.6364240646362305, "learning_rate": 4.462438895699809e-05, "loss": 0.9386, "step": 346400 }, { "epoch": 5.377178416797281, "grad_norm": 2.176462173461914, "learning_rate": 4.462283710175515e-05, "loss": 0.9407, "step": 346500 }, { "epoch": 5.378730272040224, "grad_norm": 2.6935198307037354, "learning_rate": 4.462128524651221e-05, "loss": 0.9273, "step": 346600 }, { "epoch": 5.380282127283167, "grad_norm": 2.7590205669403076, "learning_rate": 4.4619733391269265e-05, "loss": 0.9398, "step": 346700 }, { "epoch": 5.38183398252611, "grad_norm": 2.1829757690429688, "learning_rate": 4.461818153602632e-05, "loss": 0.9183, "step": 346800 }, { "epoch": 5.383385837769053, "grad_norm": 2.1459381580352783, "learning_rate": 4.461662968078338e-05, "loss": 0.9326, "step": 346900 }, { "epoch": 5.384937693011996, "grad_norm": 2.9559385776519775, "learning_rate": 4.461507782554044e-05, "loss": 0.9215, "step": 347000 }, { "epoch": 5.386489548254938, "grad_norm": 2.8108248710632324, "learning_rate": 4.4613525970297496e-05, "loss": 0.9376, "step": 347100 }, { "epoch": 5.388041403497882, "grad_norm": 2.0802290439605713, "learning_rate": 4.461197411505455e-05, "loss": 0.93, "step": 347200 }, { "epoch": 5.389593258740825, "grad_norm": 2.02363657951355, "learning_rate": 4.4610422259811604e-05, "loss": 0.9082, "step": 347300 }, { "epoch": 5.391145113983768, "grad_norm": 2.720186233520508, "learning_rate": 4.460887040456866e-05, "loss": 0.9178, "step": 347400 }, { "epoch": 5.39269696922671, "grad_norm": 1.7786576747894287, "learning_rate": 4.460731854932572e-05, "loss": 0.9412, "step": 347500 }, { "epoch": 5.394248824469654, "grad_norm": 2.0834245681762695, "learning_rate": 4.460576669408278e-05, "loss": 0.9387, "step": 347600 }, { "epoch": 5.395800679712597, "grad_norm": 2.232769727706909, "learning_rate": 4.4604214838839835e-05, "loss": 0.9299, "step": 347700 }, { "epoch": 5.397352534955539, "grad_norm": 2.4298579692840576, "learning_rate": 4.460266298359689e-05, "loss": 0.953, "step": 347800 }, { "epoch": 5.398904390198482, "grad_norm": 2.3888306617736816, "learning_rate": 4.460111112835395e-05, "loss": 0.9287, "step": 347900 }, { "epoch": 5.4004562454414256, "grad_norm": 2.0261340141296387, "learning_rate": 4.459955927311101e-05, "loss": 0.9129, "step": 348000 }, { "epoch": 5.402008100684368, "grad_norm": 2.171875, "learning_rate": 4.4598007417868066e-05, "loss": 0.9112, "step": 348100 }, { "epoch": 5.403559955927311, "grad_norm": 1.993625283241272, "learning_rate": 4.4596455562625124e-05, "loss": 0.9462, "step": 348200 }, { "epoch": 5.405111811170254, "grad_norm": 2.173541307449341, "learning_rate": 4.459490370738218e-05, "loss": 0.9223, "step": 348300 }, { "epoch": 5.406663666413197, "grad_norm": 1.9383457899093628, "learning_rate": 4.459335185213924e-05, "loss": 0.907, "step": 348400 }, { "epoch": 5.40821552165614, "grad_norm": 2.514192819595337, "learning_rate": 4.45917999968963e-05, "loss": 0.9177, "step": 348500 }, { "epoch": 5.409767376899083, "grad_norm": 2.122702121734619, "learning_rate": 4.459024814165335e-05, "loss": 0.9223, "step": 348600 }, { "epoch": 5.411319232142025, "grad_norm": 2.2001800537109375, "learning_rate": 4.4588696286410406e-05, "loss": 0.9334, "step": 348700 }, { "epoch": 5.412871087384969, "grad_norm": 4.808893203735352, "learning_rate": 4.458714443116746e-05, "loss": 0.9346, "step": 348800 }, { "epoch": 5.414422942627912, "grad_norm": 2.294724941253662, "learning_rate": 4.4585592575924515e-05, "loss": 0.944, "step": 348900 }, { "epoch": 5.415974797870855, "grad_norm": 3.5909221172332764, "learning_rate": 4.458404072068157e-05, "loss": 0.9294, "step": 349000 }, { "epoch": 5.417526653113797, "grad_norm": 2.7372453212738037, "learning_rate": 4.458248886543863e-05, "loss": 0.9382, "step": 349100 }, { "epoch": 5.4190785083567405, "grad_norm": 1.833951711654663, "learning_rate": 4.458093701019569e-05, "loss": 0.9388, "step": 349200 }, { "epoch": 5.420630363599684, "grad_norm": 2.167898416519165, "learning_rate": 4.4579385154952746e-05, "loss": 0.9575, "step": 349300 }, { "epoch": 5.422182218842626, "grad_norm": 2.0499587059020996, "learning_rate": 4.4577833299709804e-05, "loss": 0.9198, "step": 349400 }, { "epoch": 5.423734074085569, "grad_norm": 2.603097438812256, "learning_rate": 4.457628144446686e-05, "loss": 0.937, "step": 349500 }, { "epoch": 5.4252859293285125, "grad_norm": 2.099392890930176, "learning_rate": 4.457472958922392e-05, "loss": 0.9326, "step": 349600 }, { "epoch": 5.426837784571455, "grad_norm": 2.688746213912964, "learning_rate": 4.457317773398098e-05, "loss": 0.945, "step": 349700 }, { "epoch": 5.428389639814398, "grad_norm": 2.2131495475769043, "learning_rate": 4.4571625878738035e-05, "loss": 0.9274, "step": 349800 }, { "epoch": 5.429941495057341, "grad_norm": 1.9827975034713745, "learning_rate": 4.457007402349509e-05, "loss": 0.91, "step": 349900 }, { "epoch": 5.431493350300284, "grad_norm": 2.843352794647217, "learning_rate": 4.456852216825215e-05, "loss": 0.9318, "step": 350000 }, { "epoch": 5.433045205543227, "grad_norm": 2.1873738765716553, "learning_rate": 4.45669703130092e-05, "loss": 0.9308, "step": 350100 }, { "epoch": 5.43459706078617, "grad_norm": 2.9388318061828613, "learning_rate": 4.456541845776626e-05, "loss": 0.9238, "step": 350200 }, { "epoch": 5.436148916029113, "grad_norm": 2.509669780731201, "learning_rate": 4.4563866602523317e-05, "loss": 0.9292, "step": 350300 }, { "epoch": 5.4377007712720555, "grad_norm": 3.249394178390503, "learning_rate": 4.4562314747280374e-05, "loss": 0.9422, "step": 350400 }, { "epoch": 5.439252626514999, "grad_norm": 2.325761556625366, "learning_rate": 4.456076289203743e-05, "loss": 0.95, "step": 350500 }, { "epoch": 5.440804481757942, "grad_norm": 2.4972190856933594, "learning_rate": 4.455921103679449e-05, "loss": 0.931, "step": 350600 }, { "epoch": 5.442356337000884, "grad_norm": 2.187497615814209, "learning_rate": 4.455765918155155e-05, "loss": 0.9249, "step": 350700 }, { "epoch": 5.4439081922438275, "grad_norm": 1.9464036226272583, "learning_rate": 4.4556107326308605e-05, "loss": 0.9591, "step": 350800 }, { "epoch": 5.445460047486771, "grad_norm": 2.807499647140503, "learning_rate": 4.455455547106566e-05, "loss": 0.921, "step": 350900 }, { "epoch": 5.447011902729713, "grad_norm": 2.171391010284424, "learning_rate": 4.455300361582272e-05, "loss": 0.9482, "step": 351000 }, { "epoch": 5.448563757972656, "grad_norm": 2.72310733795166, "learning_rate": 4.455145176057978e-05, "loss": 0.9388, "step": 351100 }, { "epoch": 5.4501156132155995, "grad_norm": 2.2435169219970703, "learning_rate": 4.4549899905336836e-05, "loss": 0.92, "step": 351200 }, { "epoch": 5.451667468458542, "grad_norm": 2.4416680335998535, "learning_rate": 4.4548348050093894e-05, "loss": 0.9417, "step": 351300 }, { "epoch": 5.453219323701485, "grad_norm": 2.382068157196045, "learning_rate": 4.4546796194850945e-05, "loss": 0.9175, "step": 351400 }, { "epoch": 5.454771178944428, "grad_norm": 2.349994421005249, "learning_rate": 4.4545244339608e-05, "loss": 0.9387, "step": 351500 }, { "epoch": 5.456323034187371, "grad_norm": 2.097886085510254, "learning_rate": 4.454369248436506e-05, "loss": 0.9142, "step": 351600 }, { "epoch": 5.457874889430314, "grad_norm": 2.4716098308563232, "learning_rate": 4.454214062912212e-05, "loss": 0.9371, "step": 351700 }, { "epoch": 5.459426744673257, "grad_norm": 2.0536675453186035, "learning_rate": 4.4540588773879176e-05, "loss": 0.9306, "step": 351800 }, { "epoch": 5.4609785999162, "grad_norm": 2.389329433441162, "learning_rate": 4.4539036918636234e-05, "loss": 0.9266, "step": 351900 }, { "epoch": 5.4625304551591425, "grad_norm": 2.5463647842407227, "learning_rate": 4.4537485063393285e-05, "loss": 0.9373, "step": 352000 }, { "epoch": 5.464082310402086, "grad_norm": 1.9288430213928223, "learning_rate": 4.453593320815034e-05, "loss": 0.9318, "step": 352100 }, { "epoch": 5.465634165645029, "grad_norm": 2.3915483951568604, "learning_rate": 4.45343813529074e-05, "loss": 0.9324, "step": 352200 }, { "epoch": 5.467186020887971, "grad_norm": 2.6312999725341797, "learning_rate": 4.453282949766446e-05, "loss": 0.9388, "step": 352300 }, { "epoch": 5.468737876130914, "grad_norm": 2.2113802433013916, "learning_rate": 4.4531277642421516e-05, "loss": 0.9263, "step": 352400 }, { "epoch": 5.470289731373858, "grad_norm": 2.1822781562805176, "learning_rate": 4.4529725787178574e-05, "loss": 0.9398, "step": 352500 }, { "epoch": 5.4718415866168, "grad_norm": 2.1785638332366943, "learning_rate": 4.452817393193563e-05, "loss": 0.9327, "step": 352600 }, { "epoch": 5.473393441859743, "grad_norm": 2.268179178237915, "learning_rate": 4.452662207669269e-05, "loss": 0.9347, "step": 352700 }, { "epoch": 5.474945297102686, "grad_norm": 2.6135027408599854, "learning_rate": 4.452507022144975e-05, "loss": 0.9385, "step": 352800 }, { "epoch": 5.47649715234563, "grad_norm": 2.5027642250061035, "learning_rate": 4.45235183662068e-05, "loss": 0.9422, "step": 352900 }, { "epoch": 5.478049007588572, "grad_norm": 3.242431879043579, "learning_rate": 4.4521966510963856e-05, "loss": 0.9309, "step": 353000 }, { "epoch": 5.479600862831515, "grad_norm": 2.4293205738067627, "learning_rate": 4.452041465572091e-05, "loss": 0.9468, "step": 353100 }, { "epoch": 5.481152718074458, "grad_norm": 2.36332631111145, "learning_rate": 4.451886280047797e-05, "loss": 0.9426, "step": 353200 }, { "epoch": 5.482704573317401, "grad_norm": 2.2093558311462402, "learning_rate": 4.451731094523503e-05, "loss": 0.9138, "step": 353300 }, { "epoch": 5.484256428560344, "grad_norm": 2.382009267807007, "learning_rate": 4.4515759089992087e-05, "loss": 0.9184, "step": 353400 }, { "epoch": 5.485808283803287, "grad_norm": 2.002971649169922, "learning_rate": 4.4514207234749144e-05, "loss": 0.9218, "step": 353500 }, { "epoch": 5.487360139046229, "grad_norm": 2.5171408653259277, "learning_rate": 4.45126553795062e-05, "loss": 0.9297, "step": 353600 }, { "epoch": 5.488911994289173, "grad_norm": 2.9409053325653076, "learning_rate": 4.451110352426326e-05, "loss": 0.9283, "step": 353700 }, { "epoch": 5.490463849532116, "grad_norm": 2.552809476852417, "learning_rate": 4.450955166902032e-05, "loss": 0.9324, "step": 353800 }, { "epoch": 5.492015704775058, "grad_norm": 2.3946938514709473, "learning_rate": 4.4507999813777375e-05, "loss": 0.9157, "step": 353900 }, { "epoch": 5.493567560018001, "grad_norm": 2.4266064167022705, "learning_rate": 4.450644795853443e-05, "loss": 0.9315, "step": 354000 }, { "epoch": 5.495119415260945, "grad_norm": 1.7675186395645142, "learning_rate": 4.450489610329149e-05, "loss": 0.911, "step": 354100 }, { "epoch": 5.496671270503887, "grad_norm": 2.6313397884368896, "learning_rate": 4.450334424804855e-05, "loss": 0.9291, "step": 354200 }, { "epoch": 5.49822312574683, "grad_norm": 1.6767656803131104, "learning_rate": 4.45017923928056e-05, "loss": 0.9247, "step": 354300 }, { "epoch": 5.499774980989773, "grad_norm": 2.154066801071167, "learning_rate": 4.450024053756266e-05, "loss": 0.9295, "step": 354400 }, { "epoch": 5.501326836232717, "grad_norm": 2.0238780975341797, "learning_rate": 4.4498688682319715e-05, "loss": 0.9187, "step": 354500 }, { "epoch": 5.502878691475659, "grad_norm": 2.318746566772461, "learning_rate": 4.449713682707677e-05, "loss": 0.9516, "step": 354600 }, { "epoch": 5.504430546718602, "grad_norm": 2.353142738342285, "learning_rate": 4.449558497183383e-05, "loss": 0.9281, "step": 354700 }, { "epoch": 5.505982401961545, "grad_norm": 1.9801207780838013, "learning_rate": 4.449403311659089e-05, "loss": 0.939, "step": 354800 }, { "epoch": 5.507534257204488, "grad_norm": 2.5909109115600586, "learning_rate": 4.4492481261347946e-05, "loss": 0.9346, "step": 354900 }, { "epoch": 5.509086112447431, "grad_norm": 2.0088682174682617, "learning_rate": 4.4490929406105004e-05, "loss": 0.9403, "step": 355000 }, { "epoch": 5.510637967690374, "grad_norm": 2.8426034450531006, "learning_rate": 4.448937755086206e-05, "loss": 0.9457, "step": 355100 }, { "epoch": 5.512189822933317, "grad_norm": 2.298290252685547, "learning_rate": 4.448782569561912e-05, "loss": 0.9312, "step": 355200 }, { "epoch": 5.51374167817626, "grad_norm": 2.3351998329162598, "learning_rate": 4.448627384037617e-05, "loss": 0.9376, "step": 355300 }, { "epoch": 5.515293533419203, "grad_norm": 2.0165162086486816, "learning_rate": 4.448472198513323e-05, "loss": 0.9327, "step": 355400 }, { "epoch": 5.516845388662146, "grad_norm": 2.156956911087036, "learning_rate": 4.4483170129890286e-05, "loss": 0.9114, "step": 355500 }, { "epoch": 5.518397243905088, "grad_norm": 2.3515710830688477, "learning_rate": 4.4481618274647344e-05, "loss": 0.9144, "step": 355600 }, { "epoch": 5.519949099148032, "grad_norm": 2.3100318908691406, "learning_rate": 4.44800664194044e-05, "loss": 0.9422, "step": 355700 }, { "epoch": 5.521500954390975, "grad_norm": 2.3480308055877686, "learning_rate": 4.447851456416145e-05, "loss": 0.9235, "step": 355800 }, { "epoch": 5.523052809633917, "grad_norm": 2.1465277671813965, "learning_rate": 4.447696270891851e-05, "loss": 0.9094, "step": 355900 }, { "epoch": 5.52460466487686, "grad_norm": 2.5463056564331055, "learning_rate": 4.447541085367557e-05, "loss": 0.9226, "step": 356000 }, { "epoch": 5.5261565201198035, "grad_norm": 1.9189684391021729, "learning_rate": 4.4473858998432626e-05, "loss": 0.9437, "step": 356100 }, { "epoch": 5.527708375362746, "grad_norm": 2.3804092407226562, "learning_rate": 4.447230714318968e-05, "loss": 0.9476, "step": 356200 }, { "epoch": 5.529260230605689, "grad_norm": 2.9284842014312744, "learning_rate": 4.447075528794674e-05, "loss": 0.9394, "step": 356300 }, { "epoch": 5.530812085848632, "grad_norm": 2.4831862449645996, "learning_rate": 4.44692034327038e-05, "loss": 0.9334, "step": 356400 }, { "epoch": 5.532363941091575, "grad_norm": 2.4872889518737793, "learning_rate": 4.4467651577460857e-05, "loss": 0.9552, "step": 356500 }, { "epoch": 5.533915796334518, "grad_norm": 2.1460447311401367, "learning_rate": 4.4466099722217914e-05, "loss": 0.9268, "step": 356600 }, { "epoch": 5.535467651577461, "grad_norm": 1.9608980417251587, "learning_rate": 4.446454786697497e-05, "loss": 0.9382, "step": 356700 }, { "epoch": 5.537019506820403, "grad_norm": 2.122884511947632, "learning_rate": 4.446299601173203e-05, "loss": 0.9219, "step": 356800 }, { "epoch": 5.5385713620633465, "grad_norm": 2.451322555541992, "learning_rate": 4.446144415648909e-05, "loss": 0.9219, "step": 356900 }, { "epoch": 5.54012321730629, "grad_norm": 2.110605239868164, "learning_rate": 4.4459892301246145e-05, "loss": 0.9141, "step": 357000 }, { "epoch": 5.541675072549233, "grad_norm": 2.361027240753174, "learning_rate": 4.4458340446003196e-05, "loss": 0.9417, "step": 357100 }, { "epoch": 5.543226927792175, "grad_norm": 2.5065042972564697, "learning_rate": 4.4456788590760254e-05, "loss": 0.9413, "step": 357200 }, { "epoch": 5.5447787830351185, "grad_norm": 2.11735200881958, "learning_rate": 4.445523673551731e-05, "loss": 0.941, "step": 357300 }, { "epoch": 5.546330638278062, "grad_norm": 2.8767507076263428, "learning_rate": 4.445368488027437e-05, "loss": 0.9238, "step": 357400 }, { "epoch": 5.547882493521004, "grad_norm": 2.6510183811187744, "learning_rate": 4.445213302503143e-05, "loss": 0.9281, "step": 357500 }, { "epoch": 5.549434348763947, "grad_norm": 2.4517879486083984, "learning_rate": 4.4450581169788485e-05, "loss": 0.9305, "step": 357600 }, { "epoch": 5.5509862040068905, "grad_norm": 2.712381601333618, "learning_rate": 4.444902931454554e-05, "loss": 0.919, "step": 357700 }, { "epoch": 5.552538059249833, "grad_norm": 2.0226669311523438, "learning_rate": 4.44474774593026e-05, "loss": 0.9407, "step": 357800 }, { "epoch": 5.554089914492776, "grad_norm": 2.101388931274414, "learning_rate": 4.444592560405966e-05, "loss": 0.9212, "step": 357900 }, { "epoch": 5.555641769735719, "grad_norm": 2.5201425552368164, "learning_rate": 4.4444373748816716e-05, "loss": 0.9246, "step": 358000 }, { "epoch": 5.557193624978662, "grad_norm": 2.318666458129883, "learning_rate": 4.4442821893573774e-05, "loss": 0.9214, "step": 358100 }, { "epoch": 5.558745480221605, "grad_norm": 2.8607916831970215, "learning_rate": 4.444127003833083e-05, "loss": 0.9194, "step": 358200 }, { "epoch": 5.560297335464548, "grad_norm": 2.868417263031006, "learning_rate": 4.443971818308789e-05, "loss": 0.9267, "step": 358300 }, { "epoch": 5.561849190707491, "grad_norm": 2.5361499786376953, "learning_rate": 4.443816632784494e-05, "loss": 0.9377, "step": 358400 }, { "epoch": 5.5634010459504335, "grad_norm": 2.044569253921509, "learning_rate": 4.4436614472602e-05, "loss": 0.9395, "step": 358500 }, { "epoch": 5.564952901193377, "grad_norm": 2.5122478008270264, "learning_rate": 4.443506261735905e-05, "loss": 0.9374, "step": 358600 }, { "epoch": 5.56650475643632, "grad_norm": 2.3042547702789307, "learning_rate": 4.443351076211611e-05, "loss": 0.9326, "step": 358700 }, { "epoch": 5.568056611679262, "grad_norm": 2.422398567199707, "learning_rate": 4.4431958906873165e-05, "loss": 0.9173, "step": 358800 }, { "epoch": 5.5696084669222055, "grad_norm": 2.47454571723938, "learning_rate": 4.443040705163022e-05, "loss": 0.9343, "step": 358900 }, { "epoch": 5.571160322165149, "grad_norm": 2.5991873741149902, "learning_rate": 4.442885519638728e-05, "loss": 0.9279, "step": 359000 }, { "epoch": 5.572712177408091, "grad_norm": 2.4027822017669678, "learning_rate": 4.442730334114434e-05, "loss": 0.9284, "step": 359100 }, { "epoch": 5.574264032651034, "grad_norm": 2.725639581680298, "learning_rate": 4.4425751485901396e-05, "loss": 0.9488, "step": 359200 }, { "epoch": 5.575815887893977, "grad_norm": 2.5246379375457764, "learning_rate": 4.442419963065845e-05, "loss": 0.9428, "step": 359300 }, { "epoch": 5.57736774313692, "grad_norm": 2.0793519020080566, "learning_rate": 4.442264777541551e-05, "loss": 0.923, "step": 359400 }, { "epoch": 5.578919598379863, "grad_norm": 2.35316801071167, "learning_rate": 4.442109592017257e-05, "loss": 0.9176, "step": 359500 }, { "epoch": 5.580471453622806, "grad_norm": 2.125232458114624, "learning_rate": 4.4419544064929627e-05, "loss": 0.9283, "step": 359600 }, { "epoch": 5.582023308865749, "grad_norm": 2.0995426177978516, "learning_rate": 4.4417992209686684e-05, "loss": 0.9328, "step": 359700 }, { "epoch": 5.583575164108692, "grad_norm": 2.0638344287872314, "learning_rate": 4.441644035444374e-05, "loss": 0.9395, "step": 359800 }, { "epoch": 5.585127019351635, "grad_norm": 2.8453750610351562, "learning_rate": 4.441488849920079e-05, "loss": 0.9099, "step": 359900 }, { "epoch": 5.586678874594578, "grad_norm": 2.209789514541626, "learning_rate": 4.441333664395785e-05, "loss": 0.9166, "step": 360000 }, { "epoch": 5.5882307298375204, "grad_norm": 2.2494070529937744, "learning_rate": 4.441178478871491e-05, "loss": 0.9512, "step": 360100 }, { "epoch": 5.589782585080464, "grad_norm": 2.3726260662078857, "learning_rate": 4.4410232933471966e-05, "loss": 0.9415, "step": 360200 }, { "epoch": 5.591334440323407, "grad_norm": 2.46063494682312, "learning_rate": 4.4408681078229024e-05, "loss": 0.9609, "step": 360300 }, { "epoch": 5.592886295566349, "grad_norm": 2.543144702911377, "learning_rate": 4.440712922298608e-05, "loss": 0.9278, "step": 360400 }, { "epoch": 5.594438150809292, "grad_norm": 2.621018886566162, "learning_rate": 4.440557736774314e-05, "loss": 0.9441, "step": 360500 }, { "epoch": 5.595990006052236, "grad_norm": 2.6276121139526367, "learning_rate": 4.44040255125002e-05, "loss": 0.9255, "step": 360600 }, { "epoch": 5.597541861295179, "grad_norm": 1.9596357345581055, "learning_rate": 4.4402473657257255e-05, "loss": 0.9368, "step": 360700 }, { "epoch": 5.599093716538121, "grad_norm": 2.1579654216766357, "learning_rate": 4.440092180201431e-05, "loss": 0.9538, "step": 360800 }, { "epoch": 5.600645571781064, "grad_norm": 1.9138736724853516, "learning_rate": 4.439936994677137e-05, "loss": 0.934, "step": 360900 }, { "epoch": 5.602197427024008, "grad_norm": 2.398998975753784, "learning_rate": 4.439781809152843e-05, "loss": 0.9225, "step": 361000 }, { "epoch": 5.60374928226695, "grad_norm": 3.821086883544922, "learning_rate": 4.4396266236285486e-05, "loss": 0.9421, "step": 361100 }, { "epoch": 5.605301137509893, "grad_norm": 2.6693825721740723, "learning_rate": 4.439471438104254e-05, "loss": 0.9224, "step": 361200 }, { "epoch": 5.606852992752836, "grad_norm": 2.1605262756347656, "learning_rate": 4.4393162525799595e-05, "loss": 0.9361, "step": 361300 }, { "epoch": 5.608404847995779, "grad_norm": 2.167950391769409, "learning_rate": 4.439161067055665e-05, "loss": 0.9084, "step": 361400 }, { "epoch": 5.609956703238722, "grad_norm": 2.3905506134033203, "learning_rate": 4.439005881531371e-05, "loss": 0.9123, "step": 361500 }, { "epoch": 5.611508558481665, "grad_norm": 2.2409725189208984, "learning_rate": 4.438850696007077e-05, "loss": 0.9149, "step": 361600 }, { "epoch": 5.613060413724607, "grad_norm": 2.176377773284912, "learning_rate": 4.4386955104827826e-05, "loss": 0.9212, "step": 361700 }, { "epoch": 5.614612268967551, "grad_norm": 2.1110141277313232, "learning_rate": 4.438540324958488e-05, "loss": 0.9229, "step": 361800 }, { "epoch": 5.616164124210494, "grad_norm": 2.98650860786438, "learning_rate": 4.4383851394341935e-05, "loss": 0.9196, "step": 361900 }, { "epoch": 5.617715979453436, "grad_norm": 2.220989465713501, "learning_rate": 4.438229953909899e-05, "loss": 0.9429, "step": 362000 }, { "epoch": 5.619267834696379, "grad_norm": 2.519818067550659, "learning_rate": 4.438074768385605e-05, "loss": 0.9417, "step": 362100 }, { "epoch": 5.620819689939323, "grad_norm": 1.841310739517212, "learning_rate": 4.437919582861311e-05, "loss": 0.9207, "step": 362200 }, { "epoch": 5.622371545182266, "grad_norm": 2.209097385406494, "learning_rate": 4.4377643973370166e-05, "loss": 0.9538, "step": 362300 }, { "epoch": 5.623923400425208, "grad_norm": 2.336773157119751, "learning_rate": 4.437609211812722e-05, "loss": 0.944, "step": 362400 }, { "epoch": 5.625475255668151, "grad_norm": 2.0668141841888428, "learning_rate": 4.437454026288428e-05, "loss": 0.9456, "step": 362500 }, { "epoch": 5.6270271109110945, "grad_norm": 2.2492170333862305, "learning_rate": 4.437298840764134e-05, "loss": 0.9317, "step": 362600 }, { "epoch": 5.628578966154037, "grad_norm": 2.3108370304107666, "learning_rate": 4.4371436552398397e-05, "loss": 0.9334, "step": 362700 }, { "epoch": 5.63013082139698, "grad_norm": 1.925979495048523, "learning_rate": 4.436988469715545e-05, "loss": 0.9324, "step": 362800 }, { "epoch": 5.631682676639923, "grad_norm": 2.620699167251587, "learning_rate": 4.4368332841912505e-05, "loss": 0.9381, "step": 362900 }, { "epoch": 5.633234531882866, "grad_norm": 2.4794418811798096, "learning_rate": 4.436678098666956e-05, "loss": 0.9363, "step": 363000 }, { "epoch": 5.634786387125809, "grad_norm": 2.5736618041992188, "learning_rate": 4.436522913142662e-05, "loss": 0.9224, "step": 363100 }, { "epoch": 5.636338242368752, "grad_norm": 2.33585262298584, "learning_rate": 4.436367727618368e-05, "loss": 0.9378, "step": 363200 }, { "epoch": 5.637890097611695, "grad_norm": 2.4225995540618896, "learning_rate": 4.4362125420940736e-05, "loss": 0.9296, "step": 363300 }, { "epoch": 5.639441952854638, "grad_norm": 2.3408703804016113, "learning_rate": 4.4360573565697794e-05, "loss": 0.92, "step": 363400 }, { "epoch": 5.640993808097581, "grad_norm": 2.661978006362915, "learning_rate": 4.435902171045485e-05, "loss": 0.9299, "step": 363500 }, { "epoch": 5.642545663340524, "grad_norm": 2.2814548015594482, "learning_rate": 4.435746985521191e-05, "loss": 0.9184, "step": 363600 }, { "epoch": 5.644097518583466, "grad_norm": 2.7602550983428955, "learning_rate": 4.435591799996897e-05, "loss": 0.9539, "step": 363700 }, { "epoch": 5.6456493738264095, "grad_norm": 2.2844347953796387, "learning_rate": 4.4354366144726025e-05, "loss": 0.9441, "step": 363800 }, { "epoch": 5.647201229069353, "grad_norm": 2.5906193256378174, "learning_rate": 4.435281428948308e-05, "loss": 0.9246, "step": 363900 }, { "epoch": 5.648753084312295, "grad_norm": 2.213181734085083, "learning_rate": 4.435126243424014e-05, "loss": 0.9291, "step": 364000 }, { "epoch": 5.650304939555238, "grad_norm": 2.851949691772461, "learning_rate": 4.434971057899719e-05, "loss": 0.9308, "step": 364100 }, { "epoch": 5.6518567947981815, "grad_norm": 2.7878828048706055, "learning_rate": 4.434815872375425e-05, "loss": 0.9177, "step": 364200 }, { "epoch": 5.653408650041124, "grad_norm": 2.1245994567871094, "learning_rate": 4.434660686851131e-05, "loss": 0.9241, "step": 364300 }, { "epoch": 5.654960505284067, "grad_norm": 2.4670844078063965, "learning_rate": 4.4345055013268365e-05, "loss": 0.907, "step": 364400 }, { "epoch": 5.65651236052701, "grad_norm": 2.566483497619629, "learning_rate": 4.434350315802542e-05, "loss": 0.9387, "step": 364500 }, { "epoch": 5.6580642157699526, "grad_norm": 2.567556619644165, "learning_rate": 4.434195130278248e-05, "loss": 0.9355, "step": 364600 }, { "epoch": 5.659616071012896, "grad_norm": 2.334972858428955, "learning_rate": 4.434039944753954e-05, "loss": 0.9338, "step": 364700 }, { "epoch": 5.661167926255839, "grad_norm": 2.9742932319641113, "learning_rate": 4.4338847592296596e-05, "loss": 0.9312, "step": 364800 }, { "epoch": 5.662719781498781, "grad_norm": 2.577641010284424, "learning_rate": 4.4337295737053654e-05, "loss": 0.9097, "step": 364900 }, { "epoch": 5.6642716367417245, "grad_norm": 2.184039354324341, "learning_rate": 4.433574388181071e-05, "loss": 0.9365, "step": 365000 }, { "epoch": 5.665823491984668, "grad_norm": 2.145604372024536, "learning_rate": 4.433419202656776e-05, "loss": 0.914, "step": 365100 }, { "epoch": 5.667375347227611, "grad_norm": 2.1651041507720947, "learning_rate": 4.433264017132482e-05, "loss": 0.9273, "step": 365200 }, { "epoch": 5.668927202470553, "grad_norm": 2.040647268295288, "learning_rate": 4.433108831608188e-05, "loss": 0.9452, "step": 365300 }, { "epoch": 5.6704790577134965, "grad_norm": 2.631885290145874, "learning_rate": 4.4329536460838936e-05, "loss": 0.9184, "step": 365400 }, { "epoch": 5.67203091295644, "grad_norm": 2.713850498199463, "learning_rate": 4.432798460559599e-05, "loss": 0.9263, "step": 365500 }, { "epoch": 5.673582768199382, "grad_norm": 2.6144778728485107, "learning_rate": 4.4326432750353044e-05, "loss": 0.932, "step": 365600 }, { "epoch": 5.675134623442325, "grad_norm": 2.0203099250793457, "learning_rate": 4.43248808951101e-05, "loss": 0.9243, "step": 365700 }, { "epoch": 5.676686478685268, "grad_norm": 2.165541648864746, "learning_rate": 4.432332903986716e-05, "loss": 0.9259, "step": 365800 }, { "epoch": 5.678238333928212, "grad_norm": 2.225295066833496, "learning_rate": 4.432177718462422e-05, "loss": 0.9191, "step": 365900 }, { "epoch": 5.679790189171154, "grad_norm": 2.493354558944702, "learning_rate": 4.4320225329381275e-05, "loss": 0.9168, "step": 366000 }, { "epoch": 5.681342044414097, "grad_norm": 2.6571762561798096, "learning_rate": 4.431867347413833e-05, "loss": 0.9386, "step": 366100 }, { "epoch": 5.68289389965704, "grad_norm": 1.9205049276351929, "learning_rate": 4.431712161889539e-05, "loss": 0.921, "step": 366200 }, { "epoch": 5.684445754899983, "grad_norm": 2.295492649078369, "learning_rate": 4.431556976365245e-05, "loss": 0.9242, "step": 366300 }, { "epoch": 5.685997610142926, "grad_norm": 2.1689586639404297, "learning_rate": 4.4314017908409506e-05, "loss": 0.9247, "step": 366400 }, { "epoch": 5.687549465385869, "grad_norm": 1.7218530178070068, "learning_rate": 4.4312466053166564e-05, "loss": 0.9239, "step": 366500 }, { "epoch": 5.6891013206288115, "grad_norm": 2.585411787033081, "learning_rate": 4.431091419792362e-05, "loss": 0.9274, "step": 366600 }, { "epoch": 5.690653175871755, "grad_norm": 2.4222774505615234, "learning_rate": 4.430936234268068e-05, "loss": 0.9499, "step": 366700 }, { "epoch": 5.692205031114698, "grad_norm": 2.701373338699341, "learning_rate": 4.430781048743774e-05, "loss": 0.9132, "step": 366800 }, { "epoch": 5.69375688635764, "grad_norm": 2.0188100337982178, "learning_rate": 4.430625863219479e-05, "loss": 0.9293, "step": 366900 }, { "epoch": 5.695308741600583, "grad_norm": 2.0881810188293457, "learning_rate": 4.4304706776951846e-05, "loss": 0.95, "step": 367000 }, { "epoch": 5.696860596843527, "grad_norm": 2.5594685077667236, "learning_rate": 4.4303154921708904e-05, "loss": 0.9374, "step": 367100 }, { "epoch": 5.698412452086469, "grad_norm": 2.49722957611084, "learning_rate": 4.430160306646596e-05, "loss": 0.9316, "step": 367200 }, { "epoch": 5.699964307329412, "grad_norm": 1.9994752407073975, "learning_rate": 4.430005121122302e-05, "loss": 0.9139, "step": 367300 }, { "epoch": 5.701516162572355, "grad_norm": 2.392850160598755, "learning_rate": 4.429849935598008e-05, "loss": 0.9272, "step": 367400 }, { "epoch": 5.703068017815298, "grad_norm": 2.9529290199279785, "learning_rate": 4.4296947500737135e-05, "loss": 0.9073, "step": 367500 }, { "epoch": 5.704619873058241, "grad_norm": 1.9487853050231934, "learning_rate": 4.429539564549419e-05, "loss": 0.9362, "step": 367600 }, { "epoch": 5.706171728301184, "grad_norm": 2.586231231689453, "learning_rate": 4.429384379025125e-05, "loss": 0.9393, "step": 367700 }, { "epoch": 5.707723583544127, "grad_norm": 2.0504374504089355, "learning_rate": 4.429229193500831e-05, "loss": 0.9355, "step": 367800 }, { "epoch": 5.70927543878707, "grad_norm": 2.527252197265625, "learning_rate": 4.4290740079765366e-05, "loss": 0.9341, "step": 367900 }, { "epoch": 5.710827294030013, "grad_norm": 2.3414461612701416, "learning_rate": 4.4289188224522424e-05, "loss": 0.9341, "step": 368000 }, { "epoch": 5.712379149272956, "grad_norm": 2.214618444442749, "learning_rate": 4.428763636927948e-05, "loss": 0.9308, "step": 368100 }, { "epoch": 5.713931004515898, "grad_norm": 1.8325119018554688, "learning_rate": 4.428608451403653e-05, "loss": 0.9175, "step": 368200 }, { "epoch": 5.715482859758842, "grad_norm": 1.6481571197509766, "learning_rate": 4.428453265879359e-05, "loss": 0.8981, "step": 368300 }, { "epoch": 5.717034715001785, "grad_norm": 1.9996092319488525, "learning_rate": 4.428298080355064e-05, "loss": 0.9401, "step": 368400 }, { "epoch": 5.718586570244728, "grad_norm": 2.525768518447876, "learning_rate": 4.42814289483077e-05, "loss": 0.9108, "step": 368500 }, { "epoch": 5.72013842548767, "grad_norm": 2.0854620933532715, "learning_rate": 4.4279877093064757e-05, "loss": 0.9275, "step": 368600 }, { "epoch": 5.721690280730614, "grad_norm": 2.509031295776367, "learning_rate": 4.4278325237821814e-05, "loss": 0.9287, "step": 368700 }, { "epoch": 5.723242135973557, "grad_norm": 2.3116822242736816, "learning_rate": 4.427677338257887e-05, "loss": 0.9298, "step": 368800 }, { "epoch": 5.724793991216499, "grad_norm": 2.6815073490142822, "learning_rate": 4.427522152733593e-05, "loss": 0.9123, "step": 368900 }, { "epoch": 5.726345846459442, "grad_norm": 2.4279627799987793, "learning_rate": 4.427366967209299e-05, "loss": 0.939, "step": 369000 }, { "epoch": 5.7278977017023855, "grad_norm": 2.650533437728882, "learning_rate": 4.4272117816850045e-05, "loss": 0.9503, "step": 369100 }, { "epoch": 5.729449556945328, "grad_norm": 2.4605424404144287, "learning_rate": 4.42705659616071e-05, "loss": 0.9341, "step": 369200 }, { "epoch": 5.731001412188271, "grad_norm": 1.7422220706939697, "learning_rate": 4.426901410636416e-05, "loss": 0.9491, "step": 369300 }, { "epoch": 5.732553267431214, "grad_norm": 2.1599831581115723, "learning_rate": 4.426746225112122e-05, "loss": 0.9242, "step": 369400 }, { "epoch": 5.734105122674157, "grad_norm": 1.8551788330078125, "learning_rate": 4.4265910395878276e-05, "loss": 0.9188, "step": 369500 }, { "epoch": 5.7356569779171, "grad_norm": 2.519383430480957, "learning_rate": 4.4264358540635334e-05, "loss": 0.9454, "step": 369600 }, { "epoch": 5.737208833160043, "grad_norm": 2.3770852088928223, "learning_rate": 4.4262806685392385e-05, "loss": 0.9382, "step": 369700 }, { "epoch": 5.738760688402985, "grad_norm": 2.4177803993225098, "learning_rate": 4.426125483014944e-05, "loss": 0.9146, "step": 369800 }, { "epoch": 5.740312543645929, "grad_norm": 2.0180509090423584, "learning_rate": 4.42597029749065e-05, "loss": 0.9272, "step": 369900 }, { "epoch": 5.741864398888872, "grad_norm": 2.3660728931427, "learning_rate": 4.425815111966356e-05, "loss": 0.9226, "step": 370000 }, { "epoch": 5.743416254131814, "grad_norm": 2.1567156314849854, "learning_rate": 4.4256599264420616e-05, "loss": 0.9308, "step": 370100 }, { "epoch": 5.744968109374757, "grad_norm": 2.9937503337860107, "learning_rate": 4.4255047409177674e-05, "loss": 0.946, "step": 370200 }, { "epoch": 5.7465199646177005, "grad_norm": 2.1057121753692627, "learning_rate": 4.425349555393473e-05, "loss": 0.9421, "step": 370300 }, { "epoch": 5.748071819860644, "grad_norm": 2.336742639541626, "learning_rate": 4.425194369869179e-05, "loss": 0.919, "step": 370400 }, { "epoch": 5.749623675103586, "grad_norm": 2.495710611343384, "learning_rate": 4.425039184344885e-05, "loss": 0.9149, "step": 370500 }, { "epoch": 5.751175530346529, "grad_norm": 2.833146810531616, "learning_rate": 4.4248839988205905e-05, "loss": 0.9314, "step": 370600 }, { "epoch": 5.7527273855894725, "grad_norm": 2.076221227645874, "learning_rate": 4.424728813296296e-05, "loss": 0.9142, "step": 370700 }, { "epoch": 5.754279240832415, "grad_norm": 2.7535197734832764, "learning_rate": 4.424573627772002e-05, "loss": 0.9174, "step": 370800 }, { "epoch": 5.755831096075358, "grad_norm": 2.1797678470611572, "learning_rate": 4.424418442247708e-05, "loss": 0.9514, "step": 370900 }, { "epoch": 5.757382951318301, "grad_norm": 2.6341359615325928, "learning_rate": 4.4242632567234136e-05, "loss": 0.913, "step": 371000 }, { "epoch": 5.7589348065612445, "grad_norm": 6.7712836265563965, "learning_rate": 4.424108071199119e-05, "loss": 0.9219, "step": 371100 }, { "epoch": 5.760486661804187, "grad_norm": 2.3560597896575928, "learning_rate": 4.4239528856748245e-05, "loss": 0.9381, "step": 371200 }, { "epoch": 5.76203851704713, "grad_norm": 2.1516146659851074, "learning_rate": 4.42379770015053e-05, "loss": 0.9212, "step": 371300 }, { "epoch": 5.763590372290073, "grad_norm": 2.441126585006714, "learning_rate": 4.423642514626236e-05, "loss": 0.9199, "step": 371400 }, { "epoch": 5.7651422275330155, "grad_norm": 2.424161672592163, "learning_rate": 4.423487329101942e-05, "loss": 0.9316, "step": 371500 }, { "epoch": 5.766694082775959, "grad_norm": 2.73758602142334, "learning_rate": 4.423332143577647e-05, "loss": 0.9339, "step": 371600 }, { "epoch": 5.768245938018902, "grad_norm": 2.5455832481384277, "learning_rate": 4.4231769580533527e-05, "loss": 0.9359, "step": 371700 }, { "epoch": 5.769797793261844, "grad_norm": 2.764674425125122, "learning_rate": 4.4230217725290584e-05, "loss": 0.9255, "step": 371800 }, { "epoch": 5.7713496485047875, "grad_norm": 2.15977144241333, "learning_rate": 4.422866587004764e-05, "loss": 0.9329, "step": 371900 }, { "epoch": 5.772901503747731, "grad_norm": 2.513197422027588, "learning_rate": 4.42271140148047e-05, "loss": 0.9329, "step": 372000 }, { "epoch": 5.774453358990673, "grad_norm": 1.9431573152542114, "learning_rate": 4.422556215956176e-05, "loss": 0.9072, "step": 372100 }, { "epoch": 5.776005214233616, "grad_norm": 2.8399314880371094, "learning_rate": 4.4224010304318815e-05, "loss": 0.9291, "step": 372200 }, { "epoch": 5.7775570694765594, "grad_norm": 2.265237808227539, "learning_rate": 4.422245844907587e-05, "loss": 0.9343, "step": 372300 }, { "epoch": 5.779108924719502, "grad_norm": 2.2398853302001953, "learning_rate": 4.422090659383293e-05, "loss": 0.9254, "step": 372400 }, { "epoch": 5.780660779962445, "grad_norm": 2.2830371856689453, "learning_rate": 4.421935473858999e-05, "loss": 0.9396, "step": 372500 }, { "epoch": 5.782212635205388, "grad_norm": 2.4340362548828125, "learning_rate": 4.421780288334704e-05, "loss": 0.9381, "step": 372600 }, { "epoch": 5.7837644904483305, "grad_norm": 2.611630439758301, "learning_rate": 4.42162510281041e-05, "loss": 0.9497, "step": 372700 }, { "epoch": 5.785316345691274, "grad_norm": 2.786362648010254, "learning_rate": 4.4214699172861155e-05, "loss": 0.9357, "step": 372800 }, { "epoch": 5.786868200934217, "grad_norm": 2.4658925533294678, "learning_rate": 4.421314731761821e-05, "loss": 0.9149, "step": 372900 }, { "epoch": 5.78842005617716, "grad_norm": 2.1330978870391846, "learning_rate": 4.421159546237527e-05, "loss": 0.9114, "step": 373000 }, { "epoch": 5.7899719114201025, "grad_norm": 2.2332301139831543, "learning_rate": 4.421004360713233e-05, "loss": 0.9302, "step": 373100 }, { "epoch": 5.791523766663046, "grad_norm": 2.407566547393799, "learning_rate": 4.4208491751889386e-05, "loss": 0.9306, "step": 373200 }, { "epoch": 5.793075621905989, "grad_norm": 2.655856132507324, "learning_rate": 4.4206939896646444e-05, "loss": 0.918, "step": 373300 }, { "epoch": 5.794627477148931, "grad_norm": 2.168830633163452, "learning_rate": 4.42053880414035e-05, "loss": 0.9193, "step": 373400 }, { "epoch": 5.796179332391874, "grad_norm": 2.294774293899536, "learning_rate": 4.420383618616056e-05, "loss": 0.9375, "step": 373500 }, { "epoch": 5.797731187634818, "grad_norm": 2.916827917098999, "learning_rate": 4.420228433091762e-05, "loss": 0.9348, "step": 373600 }, { "epoch": 5.79928304287776, "grad_norm": 2.0201213359832764, "learning_rate": 4.4200732475674675e-05, "loss": 0.923, "step": 373700 }, { "epoch": 5.800834898120703, "grad_norm": 2.1302883625030518, "learning_rate": 4.419918062043173e-05, "loss": 0.9211, "step": 373800 }, { "epoch": 5.802386753363646, "grad_norm": 2.0613768100738525, "learning_rate": 4.4197628765188784e-05, "loss": 0.9174, "step": 373900 }, { "epoch": 5.80393860860659, "grad_norm": 2.2483513355255127, "learning_rate": 4.419607690994584e-05, "loss": 0.9218, "step": 374000 }, { "epoch": 5.805490463849532, "grad_norm": 2.3423266410827637, "learning_rate": 4.41945250547029e-05, "loss": 0.9194, "step": 374100 }, { "epoch": 5.807042319092475, "grad_norm": 2.172497272491455, "learning_rate": 4.419297319945996e-05, "loss": 0.9086, "step": 374200 }, { "epoch": 5.808594174335418, "grad_norm": 2.384155035018921, "learning_rate": 4.4191421344217015e-05, "loss": 0.9321, "step": 374300 }, { "epoch": 5.810146029578361, "grad_norm": 2.0315699577331543, "learning_rate": 4.418986948897407e-05, "loss": 0.9328, "step": 374400 }, { "epoch": 5.811697884821304, "grad_norm": 2.567885160446167, "learning_rate": 4.418831763373113e-05, "loss": 0.9133, "step": 374500 }, { "epoch": 5.813249740064247, "grad_norm": 2.137476921081543, "learning_rate": 4.418676577848819e-05, "loss": 0.9259, "step": 374600 }, { "epoch": 5.814801595307189, "grad_norm": 2.214953660964966, "learning_rate": 4.4185213923245246e-05, "loss": 0.9252, "step": 374700 }, { "epoch": 5.816353450550133, "grad_norm": 2.6156749725341797, "learning_rate": 4.4183662068002297e-05, "loss": 0.9159, "step": 374800 }, { "epoch": 5.817905305793076, "grad_norm": 1.9838955402374268, "learning_rate": 4.4182110212759354e-05, "loss": 0.9374, "step": 374900 }, { "epoch": 5.819457161036018, "grad_norm": 2.249929428100586, "learning_rate": 4.418055835751641e-05, "loss": 0.9341, "step": 375000 }, { "epoch": 5.821009016278961, "grad_norm": 2.1195247173309326, "learning_rate": 4.417900650227347e-05, "loss": 0.9156, "step": 375100 }, { "epoch": 5.822560871521905, "grad_norm": 2.6045241355895996, "learning_rate": 4.417745464703053e-05, "loss": 0.9208, "step": 375200 }, { "epoch": 5.824112726764847, "grad_norm": 2.0580878257751465, "learning_rate": 4.4175902791787585e-05, "loss": 0.9266, "step": 375300 }, { "epoch": 5.82566458200779, "grad_norm": 2.3777241706848145, "learning_rate": 4.4174350936544636e-05, "loss": 0.9269, "step": 375400 }, { "epoch": 5.827216437250733, "grad_norm": 1.8769292831420898, "learning_rate": 4.4172799081301694e-05, "loss": 0.9325, "step": 375500 }, { "epoch": 5.828768292493677, "grad_norm": 2.498622417449951, "learning_rate": 4.417124722605875e-05, "loss": 0.9426, "step": 375600 }, { "epoch": 5.830320147736619, "grad_norm": 7.495016098022461, "learning_rate": 4.416969537081581e-05, "loss": 0.9157, "step": 375700 }, { "epoch": 5.831872002979562, "grad_norm": 2.301790237426758, "learning_rate": 4.416814351557287e-05, "loss": 0.9541, "step": 375800 }, { "epoch": 5.833423858222505, "grad_norm": 2.124613046646118, "learning_rate": 4.4166591660329925e-05, "loss": 0.9078, "step": 375900 }, { "epoch": 5.834975713465448, "grad_norm": 2.53865647315979, "learning_rate": 4.416503980508698e-05, "loss": 0.9429, "step": 376000 }, { "epoch": 5.836527568708391, "grad_norm": 1.770917296409607, "learning_rate": 4.416348794984404e-05, "loss": 0.9261, "step": 376100 }, { "epoch": 5.838079423951334, "grad_norm": 2.4364147186279297, "learning_rate": 4.41619360946011e-05, "loss": 0.9457, "step": 376200 }, { "epoch": 5.839631279194276, "grad_norm": 2.1453535556793213, "learning_rate": 4.4160384239358156e-05, "loss": 0.9292, "step": 376300 }, { "epoch": 5.84118313443722, "grad_norm": 2.1768858432769775, "learning_rate": 4.4158832384115214e-05, "loss": 0.933, "step": 376400 }, { "epoch": 5.842734989680163, "grad_norm": 3.909966230392456, "learning_rate": 4.415728052887227e-05, "loss": 0.9266, "step": 376500 }, { "epoch": 5.844286844923106, "grad_norm": 2.0681557655334473, "learning_rate": 4.415572867362933e-05, "loss": 0.9141, "step": 376600 }, { "epoch": 5.845838700166048, "grad_norm": 1.7066795825958252, "learning_rate": 4.415417681838638e-05, "loss": 0.9312, "step": 376700 }, { "epoch": 5.8473905554089916, "grad_norm": 2.127495765686035, "learning_rate": 4.415262496314344e-05, "loss": 0.9311, "step": 376800 }, { "epoch": 5.848942410651935, "grad_norm": 2.2289412021636963, "learning_rate": 4.4151073107900496e-05, "loss": 0.9345, "step": 376900 }, { "epoch": 5.850494265894877, "grad_norm": 2.2448034286499023, "learning_rate": 4.4149521252657554e-05, "loss": 0.9366, "step": 377000 }, { "epoch": 5.85204612113782, "grad_norm": 2.4862945079803467, "learning_rate": 4.414796939741461e-05, "loss": 0.9239, "step": 377100 }, { "epoch": 5.8535979763807635, "grad_norm": 2.4093167781829834, "learning_rate": 4.414641754217167e-05, "loss": 0.9433, "step": 377200 }, { "epoch": 5.855149831623706, "grad_norm": 1.9444105625152588, "learning_rate": 4.414486568692873e-05, "loss": 0.9137, "step": 377300 }, { "epoch": 5.856701686866649, "grad_norm": 2.6660680770874023, "learning_rate": 4.4143313831685785e-05, "loss": 0.9213, "step": 377400 }, { "epoch": 5.858253542109592, "grad_norm": 2.2826108932495117, "learning_rate": 4.414176197644284e-05, "loss": 0.9183, "step": 377500 }, { "epoch": 5.859805397352535, "grad_norm": 2.373914957046509, "learning_rate": 4.41402101211999e-05, "loss": 0.9486, "step": 377600 }, { "epoch": 5.861357252595478, "grad_norm": 2.3810834884643555, "learning_rate": 4.413865826595696e-05, "loss": 0.9242, "step": 377700 }, { "epoch": 5.862909107838421, "grad_norm": 2.475078821182251, "learning_rate": 4.4137106410714016e-05, "loss": 0.9253, "step": 377800 }, { "epoch": 5.864460963081363, "grad_norm": 2.029362440109253, "learning_rate": 4.413555455547107e-05, "loss": 0.9289, "step": 377900 }, { "epoch": 5.8660128183243065, "grad_norm": 2.2424025535583496, "learning_rate": 4.4134002700228124e-05, "loss": 0.9008, "step": 378000 }, { "epoch": 5.86756467356725, "grad_norm": 2.5935440063476562, "learning_rate": 4.413245084498518e-05, "loss": 0.9292, "step": 378100 }, { "epoch": 5.869116528810193, "grad_norm": 2.8523647785186768, "learning_rate": 4.413089898974224e-05, "loss": 0.9277, "step": 378200 }, { "epoch": 5.870668384053135, "grad_norm": 2.1704444885253906, "learning_rate": 4.412934713449929e-05, "loss": 0.9244, "step": 378300 }, { "epoch": 5.8722202392960785, "grad_norm": 2.5880565643310547, "learning_rate": 4.412779527925635e-05, "loss": 0.9304, "step": 378400 }, { "epoch": 5.873772094539022, "grad_norm": 2.2958340644836426, "learning_rate": 4.4126243424013406e-05, "loss": 0.9285, "step": 378500 }, { "epoch": 5.875323949781964, "grad_norm": 2.005978584289551, "learning_rate": 4.4124691568770464e-05, "loss": 0.9322, "step": 378600 }, { "epoch": 5.876875805024907, "grad_norm": 1.9338600635528564, "learning_rate": 4.412313971352752e-05, "loss": 0.9169, "step": 378700 }, { "epoch": 5.8784276602678505, "grad_norm": 2.5991976261138916, "learning_rate": 4.412158785828458e-05, "loss": 0.9298, "step": 378800 }, { "epoch": 5.879979515510793, "grad_norm": 2.3250279426574707, "learning_rate": 4.412003600304164e-05, "loss": 0.9343, "step": 378900 }, { "epoch": 5.881531370753736, "grad_norm": 2.616421699523926, "learning_rate": 4.4118484147798695e-05, "loss": 0.9282, "step": 379000 }, { "epoch": 5.883083225996679, "grad_norm": 2.5987648963928223, "learning_rate": 4.411693229255575e-05, "loss": 0.9205, "step": 379100 }, { "epoch": 5.884635081239622, "grad_norm": 2.843447208404541, "learning_rate": 4.411538043731281e-05, "loss": 0.9235, "step": 379200 }, { "epoch": 5.886186936482565, "grad_norm": 2.927158832550049, "learning_rate": 4.411382858206987e-05, "loss": 0.9327, "step": 379300 }, { "epoch": 5.887738791725508, "grad_norm": 2.8394968509674072, "learning_rate": 4.4112276726826926e-05, "loss": 0.9237, "step": 379400 }, { "epoch": 5.889290646968451, "grad_norm": 4.552837371826172, "learning_rate": 4.4110724871583984e-05, "loss": 0.933, "step": 379500 }, { "epoch": 5.8908425022113935, "grad_norm": 2.3533196449279785, "learning_rate": 4.4109173016341035e-05, "loss": 0.9296, "step": 379600 }, { "epoch": 5.892394357454337, "grad_norm": 2.1388397216796875, "learning_rate": 4.410762116109809e-05, "loss": 0.9306, "step": 379700 }, { "epoch": 5.89394621269728, "grad_norm": 2.2157015800476074, "learning_rate": 4.410606930585515e-05, "loss": 0.9314, "step": 379800 }, { "epoch": 5.895498067940222, "grad_norm": 2.3486554622650146, "learning_rate": 4.410451745061221e-05, "loss": 0.9192, "step": 379900 }, { "epoch": 5.8970499231831655, "grad_norm": 2.130868911743164, "learning_rate": 4.4102965595369266e-05, "loss": 0.9289, "step": 380000 }, { "epoch": 5.898601778426109, "grad_norm": 2.8294870853424072, "learning_rate": 4.4101413740126324e-05, "loss": 0.9313, "step": 380100 }, { "epoch": 5.900153633669051, "grad_norm": 3.6762642860412598, "learning_rate": 4.409986188488338e-05, "loss": 0.914, "step": 380200 }, { "epoch": 5.901705488911994, "grad_norm": 6.868281841278076, "learning_rate": 4.409831002964044e-05, "loss": 0.9239, "step": 380300 }, { "epoch": 5.903257344154937, "grad_norm": 2.21103835105896, "learning_rate": 4.40967581743975e-05, "loss": 0.9169, "step": 380400 }, { "epoch": 5.90480919939788, "grad_norm": 2.1818583011627197, "learning_rate": 4.4095206319154555e-05, "loss": 0.917, "step": 380500 }, { "epoch": 5.906361054640823, "grad_norm": 2.079655647277832, "learning_rate": 4.409365446391161e-05, "loss": 0.9341, "step": 380600 }, { "epoch": 5.907912909883766, "grad_norm": 2.5411593914031982, "learning_rate": 4.409210260866867e-05, "loss": 0.9205, "step": 380700 }, { "epoch": 5.9094647651267085, "grad_norm": 2.7072412967681885, "learning_rate": 4.409055075342573e-05, "loss": 0.9307, "step": 380800 }, { "epoch": 5.911016620369652, "grad_norm": 2.437809944152832, "learning_rate": 4.408899889818278e-05, "loss": 0.9397, "step": 380900 }, { "epoch": 5.912568475612595, "grad_norm": 2.1585686206817627, "learning_rate": 4.4087447042939837e-05, "loss": 0.9492, "step": 381000 }, { "epoch": 5.914120330855538, "grad_norm": 2.2248077392578125, "learning_rate": 4.4085895187696894e-05, "loss": 0.9228, "step": 381100 }, { "epoch": 5.9156721860984804, "grad_norm": 1.803015112876892, "learning_rate": 4.408434333245395e-05, "loss": 0.9382, "step": 381200 }, { "epoch": 5.917224041341424, "grad_norm": 1.924534797668457, "learning_rate": 4.4082791477211e-05, "loss": 0.9233, "step": 381300 }, { "epoch": 5.918775896584367, "grad_norm": 2.030559778213501, "learning_rate": 4.408123962196806e-05, "loss": 0.9093, "step": 381400 }, { "epoch": 5.920327751827309, "grad_norm": 2.4440488815307617, "learning_rate": 4.407968776672512e-05, "loss": 0.8995, "step": 381500 }, { "epoch": 5.921879607070252, "grad_norm": 2.2303977012634277, "learning_rate": 4.4078135911482176e-05, "loss": 0.9295, "step": 381600 }, { "epoch": 5.923431462313196, "grad_norm": 2.5516586303710938, "learning_rate": 4.4076584056239234e-05, "loss": 0.9291, "step": 381700 }, { "epoch": 5.924983317556139, "grad_norm": 2.3242311477661133, "learning_rate": 4.407503220099629e-05, "loss": 0.9369, "step": 381800 }, { "epoch": 5.926535172799081, "grad_norm": 3.745464324951172, "learning_rate": 4.407348034575335e-05, "loss": 0.9143, "step": 381900 }, { "epoch": 5.928087028042024, "grad_norm": 2.110266923904419, "learning_rate": 4.407192849051041e-05, "loss": 0.9368, "step": 382000 }, { "epoch": 5.929638883284968, "grad_norm": 2.4010369777679443, "learning_rate": 4.4070376635267465e-05, "loss": 0.9103, "step": 382100 }, { "epoch": 5.93119073852791, "grad_norm": 1.7738538980484009, "learning_rate": 4.406882478002452e-05, "loss": 0.936, "step": 382200 }, { "epoch": 5.932742593770853, "grad_norm": 2.4484570026397705, "learning_rate": 4.406727292478158e-05, "loss": 0.9317, "step": 382300 }, { "epoch": 5.934294449013796, "grad_norm": 2.096397638320923, "learning_rate": 4.406572106953863e-05, "loss": 0.9644, "step": 382400 }, { "epoch": 5.935846304256739, "grad_norm": 2.3275158405303955, "learning_rate": 4.406416921429569e-05, "loss": 0.9141, "step": 382500 }, { "epoch": 5.937398159499682, "grad_norm": 1.9960864782333374, "learning_rate": 4.406261735905275e-05, "loss": 0.9406, "step": 382600 }, { "epoch": 5.938950014742625, "grad_norm": 2.4337902069091797, "learning_rate": 4.4061065503809805e-05, "loss": 0.9251, "step": 382700 }, { "epoch": 5.940501869985567, "grad_norm": 2.08793306350708, "learning_rate": 4.405951364856686e-05, "loss": 0.9071, "step": 382800 }, { "epoch": 5.942053725228511, "grad_norm": 1.6882188320159912, "learning_rate": 4.405796179332392e-05, "loss": 0.9336, "step": 382900 }, { "epoch": 5.943605580471454, "grad_norm": 2.405649185180664, "learning_rate": 4.405640993808098e-05, "loss": 0.9088, "step": 383000 }, { "epoch": 5.945157435714396, "grad_norm": 2.9336326122283936, "learning_rate": 4.4054858082838036e-05, "loss": 0.9326, "step": 383100 }, { "epoch": 5.946709290957339, "grad_norm": 2.2749955654144287, "learning_rate": 4.4053306227595094e-05, "loss": 0.9145, "step": 383200 }, { "epoch": 5.948261146200283, "grad_norm": 2.1550958156585693, "learning_rate": 4.405175437235215e-05, "loss": 0.9183, "step": 383300 }, { "epoch": 5.949813001443225, "grad_norm": 2.2279582023620605, "learning_rate": 4.405020251710921e-05, "loss": 0.9244, "step": 383400 }, { "epoch": 5.951364856686168, "grad_norm": 2.5670042037963867, "learning_rate": 4.404865066186627e-05, "loss": 0.9479, "step": 383500 }, { "epoch": 5.952916711929111, "grad_norm": 2.416271686553955, "learning_rate": 4.4047098806623325e-05, "loss": 0.9438, "step": 383600 }, { "epoch": 5.9544685671720545, "grad_norm": 2.11055064201355, "learning_rate": 4.4045546951380376e-05, "loss": 0.9195, "step": 383700 }, { "epoch": 5.956020422414997, "grad_norm": 2.3150413036346436, "learning_rate": 4.404399509613743e-05, "loss": 0.9266, "step": 383800 }, { "epoch": 5.95757227765794, "grad_norm": 2.032788038253784, "learning_rate": 4.404244324089449e-05, "loss": 0.9165, "step": 383900 }, { "epoch": 5.959124132900883, "grad_norm": 2.4799156188964844, "learning_rate": 4.404089138565155e-05, "loss": 0.9026, "step": 384000 }, { "epoch": 5.960675988143826, "grad_norm": 2.908860445022583, "learning_rate": 4.4039339530408607e-05, "loss": 0.9177, "step": 384100 }, { "epoch": 5.962227843386769, "grad_norm": 2.297247886657715, "learning_rate": 4.4037787675165664e-05, "loss": 0.9202, "step": 384200 }, { "epoch": 5.963779698629712, "grad_norm": 2.258314609527588, "learning_rate": 4.403623581992272e-05, "loss": 0.9441, "step": 384300 }, { "epoch": 5.965331553872655, "grad_norm": 2.338510751724243, "learning_rate": 4.403468396467978e-05, "loss": 0.9358, "step": 384400 }, { "epoch": 5.966883409115598, "grad_norm": 2.2287614345550537, "learning_rate": 4.403313210943684e-05, "loss": 0.9205, "step": 384500 }, { "epoch": 5.968435264358541, "grad_norm": 2.164900064468384, "learning_rate": 4.403158025419389e-05, "loss": 0.9318, "step": 384600 }, { "epoch": 5.969987119601484, "grad_norm": 2.165616750717163, "learning_rate": 4.4030028398950946e-05, "loss": 0.9372, "step": 384700 }, { "epoch": 5.971538974844426, "grad_norm": 1.9925497770309448, "learning_rate": 4.4028476543708004e-05, "loss": 0.938, "step": 384800 }, { "epoch": 5.9730908300873695, "grad_norm": 2.909029722213745, "learning_rate": 4.402692468846506e-05, "loss": 0.9237, "step": 384900 }, { "epoch": 5.974642685330313, "grad_norm": 2.506934404373169, "learning_rate": 4.402537283322212e-05, "loss": 0.9142, "step": 385000 }, { "epoch": 5.976194540573255, "grad_norm": 2.0706374645233154, "learning_rate": 4.402382097797918e-05, "loss": 0.919, "step": 385100 }, { "epoch": 5.977746395816198, "grad_norm": 2.9680657386779785, "learning_rate": 4.402226912273623e-05, "loss": 0.9416, "step": 385200 }, { "epoch": 5.9792982510591415, "grad_norm": 2.3082339763641357, "learning_rate": 4.4020717267493286e-05, "loss": 0.9333, "step": 385300 }, { "epoch": 5.980850106302084, "grad_norm": 2.3770253658294678, "learning_rate": 4.4019165412250344e-05, "loss": 0.9246, "step": 385400 }, { "epoch": 5.982401961545027, "grad_norm": 2.101931571960449, "learning_rate": 4.40176135570074e-05, "loss": 0.9268, "step": 385500 }, { "epoch": 5.98395381678797, "grad_norm": 2.23042368888855, "learning_rate": 4.401606170176446e-05, "loss": 0.9247, "step": 385600 }, { "epoch": 5.9855056720309125, "grad_norm": 2.1745498180389404, "learning_rate": 4.401450984652152e-05, "loss": 0.9215, "step": 385700 }, { "epoch": 5.987057527273856, "grad_norm": 3.549194097518921, "learning_rate": 4.4012957991278575e-05, "loss": 0.9312, "step": 385800 }, { "epoch": 5.988609382516799, "grad_norm": 2.235198497772217, "learning_rate": 4.401140613603563e-05, "loss": 0.936, "step": 385900 }, { "epoch": 5.990161237759741, "grad_norm": 2.21164870262146, "learning_rate": 4.400985428079269e-05, "loss": 0.9329, "step": 386000 }, { "epoch": 5.9917130930026845, "grad_norm": 2.069857358932495, "learning_rate": 4.400830242554975e-05, "loss": 0.9233, "step": 386100 }, { "epoch": 5.993264948245628, "grad_norm": 2.393022060394287, "learning_rate": 4.4006750570306806e-05, "loss": 0.9237, "step": 386200 }, { "epoch": 5.994816803488571, "grad_norm": 2.6218183040618896, "learning_rate": 4.4005198715063864e-05, "loss": 0.9283, "step": 386300 }, { "epoch": 5.996368658731513, "grad_norm": 2.090283155441284, "learning_rate": 4.400364685982092e-05, "loss": 0.9254, "step": 386400 }, { "epoch": 5.9979205139744565, "grad_norm": 2.50335955619812, "learning_rate": 4.400209500457797e-05, "loss": 0.9182, "step": 386500 }, { "epoch": 5.9994723692174, "grad_norm": 3.126054525375366, "learning_rate": 4.400054314933503e-05, "loss": 0.9277, "step": 386600 }, { "epoch": 6.001024224460342, "grad_norm": 2.0110902786254883, "learning_rate": 4.399899129409209e-05, "loss": 0.9077, "step": 386700 }, { "epoch": 6.002576079703285, "grad_norm": 2.3426144123077393, "learning_rate": 4.3997439438849146e-05, "loss": 0.9057, "step": 386800 }, { "epoch": 6.004127934946228, "grad_norm": 2.1818981170654297, "learning_rate": 4.39958875836062e-05, "loss": 0.9136, "step": 386900 }, { "epoch": 6.005679790189171, "grad_norm": 2.025068759918213, "learning_rate": 4.399433572836326e-05, "loss": 0.9133, "step": 387000 }, { "epoch": 6.007231645432114, "grad_norm": 2.062511444091797, "learning_rate": 4.399278387312032e-05, "loss": 0.908, "step": 387100 }, { "epoch": 6.008783500675057, "grad_norm": 2.829012393951416, "learning_rate": 4.3991232017877377e-05, "loss": 0.8917, "step": 387200 }, { "epoch": 6.010335355918, "grad_norm": 1.9712255001068115, "learning_rate": 4.3989680162634434e-05, "loss": 0.9031, "step": 387300 }, { "epoch": 6.011887211160943, "grad_norm": 2.128382444381714, "learning_rate": 4.398812830739149e-05, "loss": 0.8988, "step": 387400 }, { "epoch": 6.013439066403886, "grad_norm": 2.652858257293701, "learning_rate": 4.398657645214855e-05, "loss": 0.8852, "step": 387500 } ], "logging_steps": 100, "max_steps": 3221950, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4671004555676852e+19, "train_batch_size": 96, "trial_name": null, "trial_params": null }