{ "best_metric": 0.20218822360038757, "best_model_checkpoint": "autotrain-q1ygq-0lob5/checkpoint-209196", "epoch": 3.0, "eval_steps": 500, "global_step": 209196, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00035851545918660015, "grad_norm": 4.1363325119018555, "learning_rate": 3.58508604206501e-08, "loss": 0.4046, "step": 25 }, { "epoch": 0.0007170309183732003, "grad_norm": 10.085333824157715, "learning_rate": 7.17017208413002e-08, "loss": 0.2716, "step": 50 }, { "epoch": 0.0010755463775598004, "grad_norm": 3.604381561279297, "learning_rate": 1.0755258126195028e-07, "loss": 0.3115, "step": 75 }, { "epoch": 0.0014340618367464006, "grad_norm": 23.976533889770508, "learning_rate": 1.434034416826004e-07, "loss": 0.389, "step": 100 }, { "epoch": 0.0017925772959330006, "grad_norm": 10.465509414672852, "learning_rate": 1.7925430210325047e-07, "loss": 0.4266, "step": 125 }, { "epoch": 0.002151092755119601, "grad_norm": 15.951292037963867, "learning_rate": 2.1510516252390057e-07, "loss": 0.3045, "step": 150 }, { "epoch": 0.002509608214306201, "grad_norm": 17.603187561035156, "learning_rate": 2.509560229445507e-07, "loss": 0.3363, "step": 175 }, { "epoch": 0.0028681236734928012, "grad_norm": 20.09035873413086, "learning_rate": 2.868068833652008e-07, "loss": 0.4819, "step": 200 }, { "epoch": 0.003226639132679401, "grad_norm": 7.76746940612793, "learning_rate": 3.226577437858509e-07, "loss": 0.5153, "step": 225 }, { "epoch": 0.003585154591866001, "grad_norm": 24.937841415405273, "learning_rate": 3.5850860420650093e-07, "loss": 0.5158, "step": 250 }, { "epoch": 0.003943670051052601, "grad_norm": 25.6325740814209, "learning_rate": 3.9435946462715104e-07, "loss": 0.5066, "step": 275 }, { "epoch": 0.004302185510239202, "grad_norm": 1.6846942901611328, "learning_rate": 4.3021032504780114e-07, "loss": 0.3741, "step": 300 }, { "epoch": 0.004660700969425802, "grad_norm": 25.74128532409668, "learning_rate": 4.660611854684513e-07, "loss": 0.3981, "step": 325 }, { "epoch": 0.005019216428612402, "grad_norm": 2.0924599170684814, "learning_rate": 5.019120458891013e-07, "loss": 0.4581, "step": 350 }, { "epoch": 0.005377731887799002, "grad_norm": 19.57196617126465, "learning_rate": 5.377629063097515e-07, "loss": 0.3482, "step": 375 }, { "epoch": 0.0057362473469856025, "grad_norm": 13.97435474395752, "learning_rate": 5.736137667304016e-07, "loss": 0.356, "step": 400 }, { "epoch": 0.006094762806172202, "grad_norm": 22.52356719970703, "learning_rate": 6.094646271510517e-07, "loss": 0.3521, "step": 425 }, { "epoch": 0.006453278265358802, "grad_norm": 7.018030166625977, "learning_rate": 6.453154875717018e-07, "loss": 0.4295, "step": 450 }, { "epoch": 0.006811793724545402, "grad_norm": 20.5902099609375, "learning_rate": 6.811663479923519e-07, "loss": 0.4565, "step": 475 }, { "epoch": 0.007170309183732002, "grad_norm": 19.357349395751953, "learning_rate": 7.170172084130019e-07, "loss": 0.3213, "step": 500 }, { "epoch": 0.007528824642918603, "grad_norm": 4.130999565124512, "learning_rate": 7.52868068833652e-07, "loss": 0.4547, "step": 525 }, { "epoch": 0.007887340102105202, "grad_norm": 8.162277221679688, "learning_rate": 7.887189292543021e-07, "loss": 0.4735, "step": 550 }, { "epoch": 0.008245855561291803, "grad_norm": 7.060818672180176, "learning_rate": 8.245697896749522e-07, "loss": 0.3314, "step": 575 }, { "epoch": 0.008604371020478403, "grad_norm": 8.146357536315918, "learning_rate": 8.604206500956023e-07, "loss": 0.4322, "step": 600 }, { "epoch": 0.008962886479665004, "grad_norm": 3.4557390213012695, "learning_rate": 8.962715105162524e-07, "loss": 0.3776, "step": 625 }, { "epoch": 0.009321401938851604, "grad_norm": 1.6928133964538574, "learning_rate": 9.321223709369026e-07, "loss": 0.2927, "step": 650 }, { "epoch": 0.009679917398038203, "grad_norm": 7.360342979431152, "learning_rate": 9.679732313575526e-07, "loss": 0.3275, "step": 675 }, { "epoch": 0.010038432857224803, "grad_norm": 0.8680502772331238, "learning_rate": 1.0038240917782027e-06, "loss": 0.3591, "step": 700 }, { "epoch": 0.010396948316411404, "grad_norm": 25.63970184326172, "learning_rate": 1.0396749521988528e-06, "loss": 0.4378, "step": 725 }, { "epoch": 0.010755463775598004, "grad_norm": 11.426755905151367, "learning_rate": 1.075525812619503e-06, "loss": 0.2719, "step": 750 }, { "epoch": 0.011113979234784604, "grad_norm": 14.124526023864746, "learning_rate": 1.111376673040153e-06, "loss": 0.4438, "step": 775 }, { "epoch": 0.011472494693971205, "grad_norm": 14.181686401367188, "learning_rate": 1.1472275334608031e-06, "loss": 0.3853, "step": 800 }, { "epoch": 0.011831010153157804, "grad_norm": 12.553092956542969, "learning_rate": 1.183078393881453e-06, "loss": 0.427, "step": 825 }, { "epoch": 0.012189525612344404, "grad_norm": 1.4032467603683472, "learning_rate": 1.2189292543021033e-06, "loss": 0.2634, "step": 850 }, { "epoch": 0.012548041071531004, "grad_norm": 20.863117218017578, "learning_rate": 1.2547801147227534e-06, "loss": 0.3193, "step": 875 }, { "epoch": 0.012906556530717605, "grad_norm": 0.4464549720287323, "learning_rate": 1.2906309751434035e-06, "loss": 0.331, "step": 900 }, { "epoch": 0.013265071989904205, "grad_norm": 13.197609901428223, "learning_rate": 1.3264818355640536e-06, "loss": 0.4077, "step": 925 }, { "epoch": 0.013623587449090804, "grad_norm": 15.240842819213867, "learning_rate": 1.3623326959847037e-06, "loss": 0.4057, "step": 950 }, { "epoch": 0.013982102908277404, "grad_norm": 13.67392635345459, "learning_rate": 1.3981835564053538e-06, "loss": 0.2862, "step": 975 }, { "epoch": 0.014340618367464005, "grad_norm": 6.971039772033691, "learning_rate": 1.4340344168260037e-06, "loss": 0.2805, "step": 1000 }, { "epoch": 0.014699133826650605, "grad_norm": 20.04720115661621, "learning_rate": 1.469885277246654e-06, "loss": 0.3694, "step": 1025 }, { "epoch": 0.015057649285837206, "grad_norm": 2.3768529891967773, "learning_rate": 1.505736137667304e-06, "loss": 0.2742, "step": 1050 }, { "epoch": 0.015416164745023806, "grad_norm": 19.469141006469727, "learning_rate": 1.5415869980879543e-06, "loss": 0.4425, "step": 1075 }, { "epoch": 0.015774680204210405, "grad_norm": 15.055968284606934, "learning_rate": 1.5774378585086041e-06, "loss": 0.4141, "step": 1100 }, { "epoch": 0.016133195663397007, "grad_norm": 22.04874038696289, "learning_rate": 1.6132887189292545e-06, "loss": 0.3396, "step": 1125 }, { "epoch": 0.016491711122583606, "grad_norm": 22.121885299682617, "learning_rate": 1.6491395793499044e-06, "loss": 0.4963, "step": 1150 }, { "epoch": 0.016850226581770204, "grad_norm": 16.071096420288086, "learning_rate": 1.6849904397705545e-06, "loss": 0.2595, "step": 1175 }, { "epoch": 0.017208742040956806, "grad_norm": 12.828895568847656, "learning_rate": 1.7208413001912046e-06, "loss": 0.2925, "step": 1200 }, { "epoch": 0.017567257500143405, "grad_norm": 2.1447744369506836, "learning_rate": 1.7566921606118547e-06, "loss": 0.3566, "step": 1225 }, { "epoch": 0.017925772959330007, "grad_norm": 12.973687171936035, "learning_rate": 1.7925430210325048e-06, "loss": 0.3158, "step": 1250 }, { "epoch": 0.018284288418516606, "grad_norm": 20.333715438842773, "learning_rate": 1.8283938814531549e-06, "loss": 0.3438, "step": 1275 }, { "epoch": 0.018642803877703208, "grad_norm": 12.496926307678223, "learning_rate": 1.8642447418738052e-06, "loss": 0.2765, "step": 1300 }, { "epoch": 0.019001319336889807, "grad_norm": 17.143346786499023, "learning_rate": 1.9000956022944553e-06, "loss": 0.4234, "step": 1325 }, { "epoch": 0.019359834796076406, "grad_norm": 19.342208862304688, "learning_rate": 1.935946462715105e-06, "loss": 0.287, "step": 1350 }, { "epoch": 0.019718350255263008, "grad_norm": 14.375370025634766, "learning_rate": 1.9717973231357553e-06, "loss": 0.3835, "step": 1375 }, { "epoch": 0.020076865714449606, "grad_norm": 16.112825393676758, "learning_rate": 2.0076481835564054e-06, "loss": 0.19, "step": 1400 }, { "epoch": 0.02043538117363621, "grad_norm": 15.790172576904297, "learning_rate": 2.0434990439770555e-06, "loss": 0.3385, "step": 1425 }, { "epoch": 0.020793896632822807, "grad_norm": 8.58986759185791, "learning_rate": 2.0793499043977056e-06, "loss": 0.3305, "step": 1450 }, { "epoch": 0.021152412092009406, "grad_norm": 2.335312604904175, "learning_rate": 2.1152007648183557e-06, "loss": 0.3359, "step": 1475 }, { "epoch": 0.021510927551196008, "grad_norm": 13.258298873901367, "learning_rate": 2.151051625239006e-06, "loss": 0.3688, "step": 1500 }, { "epoch": 0.021869443010382607, "grad_norm": 12.521761894226074, "learning_rate": 2.186902485659656e-06, "loss": 0.2417, "step": 1525 }, { "epoch": 0.02222795846956921, "grad_norm": 22.646255493164062, "learning_rate": 2.222753346080306e-06, "loss": 0.3489, "step": 1550 }, { "epoch": 0.022586473928755808, "grad_norm": 2.2286324501037598, "learning_rate": 2.258604206500956e-06, "loss": 0.1935, "step": 1575 }, { "epoch": 0.02294498938794241, "grad_norm": 10.411453247070312, "learning_rate": 2.2944550669216062e-06, "loss": 0.4262, "step": 1600 }, { "epoch": 0.02330350484712901, "grad_norm": 6.25962495803833, "learning_rate": 2.3303059273422563e-06, "loss": 0.2372, "step": 1625 }, { "epoch": 0.023662020306315607, "grad_norm": 3.622568130493164, "learning_rate": 2.366156787762906e-06, "loss": 0.3306, "step": 1650 }, { "epoch": 0.02402053576550221, "grad_norm": 31.045677185058594, "learning_rate": 2.4020076481835565e-06, "loss": 0.5239, "step": 1675 }, { "epoch": 0.024379051224688808, "grad_norm": 10.583403587341309, "learning_rate": 2.4378585086042066e-06, "loss": 0.3412, "step": 1700 }, { "epoch": 0.02473756668387541, "grad_norm": 12.031203269958496, "learning_rate": 2.4737093690248567e-06, "loss": 0.3226, "step": 1725 }, { "epoch": 0.02509608214306201, "grad_norm": 18.027193069458008, "learning_rate": 2.509560229445507e-06, "loss": 0.3519, "step": 1750 }, { "epoch": 0.025454597602248608, "grad_norm": 15.12567138671875, "learning_rate": 2.5454110898661565e-06, "loss": 0.3586, "step": 1775 }, { "epoch": 0.02581311306143521, "grad_norm": 24.386444091796875, "learning_rate": 2.581261950286807e-06, "loss": 0.364, "step": 1800 }, { "epoch": 0.02617162852062181, "grad_norm": 12.813180923461914, "learning_rate": 2.617112810707457e-06, "loss": 0.238, "step": 1825 }, { "epoch": 0.02653014397980841, "grad_norm": 17.7932186126709, "learning_rate": 2.6529636711281073e-06, "loss": 0.2646, "step": 1850 }, { "epoch": 0.02688865943899501, "grad_norm": 17.99534797668457, "learning_rate": 2.688814531548757e-06, "loss": 0.2903, "step": 1875 }, { "epoch": 0.027247174898181608, "grad_norm": 6.524699687957764, "learning_rate": 2.7246653919694075e-06, "loss": 0.345, "step": 1900 }, { "epoch": 0.02760569035736821, "grad_norm": 18.041791915893555, "learning_rate": 2.7605162523900576e-06, "loss": 0.3337, "step": 1925 }, { "epoch": 0.02796420581655481, "grad_norm": 19.67485237121582, "learning_rate": 2.7963671128107077e-06, "loss": 0.2772, "step": 1950 }, { "epoch": 0.02832272127574141, "grad_norm": 7.539529323577881, "learning_rate": 2.8322179732313574e-06, "loss": 0.2269, "step": 1975 }, { "epoch": 0.02868123673492801, "grad_norm": 23.262821197509766, "learning_rate": 2.8680688336520075e-06, "loss": 0.3047, "step": 2000 }, { "epoch": 0.029039752194114612, "grad_norm": 1.9566833972930908, "learning_rate": 2.903919694072658e-06, "loss": 0.2351, "step": 2025 }, { "epoch": 0.02939826765330121, "grad_norm": 9.423388481140137, "learning_rate": 2.939770554493308e-06, "loss": 0.3828, "step": 2050 }, { "epoch": 0.02975678311248781, "grad_norm": 0.6729122996330261, "learning_rate": 2.9756214149139578e-06, "loss": 0.418, "step": 2075 }, { "epoch": 0.03011529857167441, "grad_norm": 8.726465225219727, "learning_rate": 3.011472275334608e-06, "loss": 0.3029, "step": 2100 }, { "epoch": 0.03047381403086101, "grad_norm": 12.06694221496582, "learning_rate": 3.0473231357552584e-06, "loss": 0.3615, "step": 2125 }, { "epoch": 0.030832329490047612, "grad_norm": 23.41604232788086, "learning_rate": 3.0831739961759085e-06, "loss": 0.4476, "step": 2150 }, { "epoch": 0.03119084494923421, "grad_norm": 5.982637882232666, "learning_rate": 3.1190248565965586e-06, "loss": 0.2823, "step": 2175 }, { "epoch": 0.03154936040842081, "grad_norm": 7.124455451965332, "learning_rate": 3.1548757170172083e-06, "loss": 0.3158, "step": 2200 }, { "epoch": 0.03190787586760741, "grad_norm": 13.922211647033691, "learning_rate": 3.1907265774378584e-06, "loss": 0.357, "step": 2225 }, { "epoch": 0.032266391326794014, "grad_norm": 9.497605323791504, "learning_rate": 3.226577437858509e-06, "loss": 0.3103, "step": 2250 }, { "epoch": 0.03262490678598061, "grad_norm": 8.579977035522461, "learning_rate": 3.262428298279159e-06, "loss": 0.2767, "step": 2275 }, { "epoch": 0.03298342224516721, "grad_norm": 1.3907852172851562, "learning_rate": 3.2982791586998087e-06, "loss": 0.4561, "step": 2300 }, { "epoch": 0.03334193770435381, "grad_norm": 4.490305423736572, "learning_rate": 3.334130019120459e-06, "loss": 0.2653, "step": 2325 }, { "epoch": 0.03370045316354041, "grad_norm": 17.28898811340332, "learning_rate": 3.369980879541109e-06, "loss": 0.3466, "step": 2350 }, { "epoch": 0.03405896862272701, "grad_norm": 7.030242919921875, "learning_rate": 3.4058317399617594e-06, "loss": 0.2546, "step": 2375 }, { "epoch": 0.03441748408191361, "grad_norm": 13.491316795349121, "learning_rate": 3.441682600382409e-06, "loss": 0.2877, "step": 2400 }, { "epoch": 0.034775999541100215, "grad_norm": 17.76638412475586, "learning_rate": 3.4775334608030592e-06, "loss": 0.2114, "step": 2425 }, { "epoch": 0.03513451500028681, "grad_norm": 9.335227012634277, "learning_rate": 3.5133843212237093e-06, "loss": 0.4016, "step": 2450 }, { "epoch": 0.03549303045947341, "grad_norm": 8.54090404510498, "learning_rate": 3.54923518164436e-06, "loss": 0.4042, "step": 2475 }, { "epoch": 0.035851545918660015, "grad_norm": 8.30825138092041, "learning_rate": 3.5850860420650095e-06, "loss": 0.3184, "step": 2500 }, { "epoch": 0.03621006137784661, "grad_norm": 11.412107467651367, "learning_rate": 3.6209369024856596e-06, "loss": 0.3902, "step": 2525 }, { "epoch": 0.03656857683703321, "grad_norm": 10.957119941711426, "learning_rate": 3.6567877629063097e-06, "loss": 0.3678, "step": 2550 }, { "epoch": 0.036927092296219814, "grad_norm": 16.272850036621094, "learning_rate": 3.69263862332696e-06, "loss": 0.3847, "step": 2575 }, { "epoch": 0.037285607755406416, "grad_norm": 18.5426025390625, "learning_rate": 3.7284894837476104e-06, "loss": 0.314, "step": 2600 }, { "epoch": 0.03764412321459301, "grad_norm": 6.177000045776367, "learning_rate": 3.76434034416826e-06, "loss": 0.3008, "step": 2625 }, { "epoch": 0.038002638673779614, "grad_norm": 7.659793853759766, "learning_rate": 3.8001912045889106e-06, "loss": 0.3425, "step": 2650 }, { "epoch": 0.038361154132966216, "grad_norm": 24.2507381439209, "learning_rate": 3.83604206500956e-06, "loss": 0.3714, "step": 2675 }, { "epoch": 0.03871966959215281, "grad_norm": 9.338908195495605, "learning_rate": 3.87189292543021e-06, "loss": 0.2335, "step": 2700 }, { "epoch": 0.03907818505133941, "grad_norm": 18.219829559326172, "learning_rate": 3.9077437858508605e-06, "loss": 0.2835, "step": 2725 }, { "epoch": 0.039436700510526015, "grad_norm": 14.597171783447266, "learning_rate": 3.9435946462715106e-06, "loss": 0.4452, "step": 2750 }, { "epoch": 0.03979521596971261, "grad_norm": 8.001080513000488, "learning_rate": 3.979445506692161e-06, "loss": 0.3821, "step": 2775 }, { "epoch": 0.04015373142889921, "grad_norm": 3.890228748321533, "learning_rate": 4.015296367112811e-06, "loss": 0.2264, "step": 2800 }, { "epoch": 0.040512246888085815, "grad_norm": 1.4646942615509033, "learning_rate": 4.051147227533461e-06, "loss": 0.3435, "step": 2825 }, { "epoch": 0.04087076234727242, "grad_norm": 15.442397117614746, "learning_rate": 4.086998087954111e-06, "loss": 0.2481, "step": 2850 }, { "epoch": 0.04122927780645901, "grad_norm": 3.934739112854004, "learning_rate": 4.122848948374761e-06, "loss": 0.3421, "step": 2875 }, { "epoch": 0.041587793265645615, "grad_norm": 5.985713958740234, "learning_rate": 4.158699808795411e-06, "loss": 0.3213, "step": 2900 }, { "epoch": 0.04194630872483222, "grad_norm": 9.620708465576172, "learning_rate": 4.194550669216061e-06, "loss": 0.301, "step": 2925 }, { "epoch": 0.04230482418401881, "grad_norm": 13.77983570098877, "learning_rate": 4.230401529636711e-06, "loss": 0.2816, "step": 2950 }, { "epoch": 0.042663339643205414, "grad_norm": 0.7665311694145203, "learning_rate": 4.2662523900573615e-06, "loss": 0.2745, "step": 2975 }, { "epoch": 0.043021855102392016, "grad_norm": 10.757230758666992, "learning_rate": 4.302103250478012e-06, "loss": 0.2966, "step": 3000 }, { "epoch": 0.04338037056157862, "grad_norm": 1.0165427923202515, "learning_rate": 4.337954110898662e-06, "loss": 0.304, "step": 3025 }, { "epoch": 0.043738886020765214, "grad_norm": 2.409059762954712, "learning_rate": 4.373804971319312e-06, "loss": 0.3661, "step": 3050 }, { "epoch": 0.044097401479951816, "grad_norm": 1.7158704996109009, "learning_rate": 4.409655831739962e-06, "loss": 0.2251, "step": 3075 }, { "epoch": 0.04445591693913842, "grad_norm": 32.43648147583008, "learning_rate": 4.445506692160612e-06, "loss": 0.4681, "step": 3100 }, { "epoch": 0.04481443239832501, "grad_norm": 13.185606002807617, "learning_rate": 4.481357552581262e-06, "loss": 0.2907, "step": 3125 }, { "epoch": 0.045172947857511615, "grad_norm": 3.5653316974639893, "learning_rate": 4.517208413001912e-06, "loss": 0.277, "step": 3150 }, { "epoch": 0.04553146331669822, "grad_norm": 22.6489315032959, "learning_rate": 4.553059273422562e-06, "loss": 0.3705, "step": 3175 }, { "epoch": 0.04588997877588482, "grad_norm": 1.2854938507080078, "learning_rate": 4.5889101338432124e-06, "loss": 0.3327, "step": 3200 }, { "epoch": 0.046248494235071415, "grad_norm": 4.474982738494873, "learning_rate": 4.6247609942638625e-06, "loss": 0.1829, "step": 3225 }, { "epoch": 0.04660700969425802, "grad_norm": 4.721594333648682, "learning_rate": 4.660611854684513e-06, "loss": 0.3039, "step": 3250 }, { "epoch": 0.04696552515344462, "grad_norm": 1.65247642993927, "learning_rate": 4.696462715105163e-06, "loss": 0.2573, "step": 3275 }, { "epoch": 0.047324040612631214, "grad_norm": 9.920370101928711, "learning_rate": 4.732313575525812e-06, "loss": 0.2918, "step": 3300 }, { "epoch": 0.047682556071817817, "grad_norm": 0.9573529362678528, "learning_rate": 4.768164435946463e-06, "loss": 0.2769, "step": 3325 }, { "epoch": 0.04804107153100442, "grad_norm": 5.646825790405273, "learning_rate": 4.804015296367113e-06, "loss": 0.2437, "step": 3350 }, { "epoch": 0.048399586990191014, "grad_norm": 1.0868719816207886, "learning_rate": 4.839866156787763e-06, "loss": 0.18, "step": 3375 }, { "epoch": 0.048758102449377616, "grad_norm": 9.650443077087402, "learning_rate": 4.875717017208413e-06, "loss": 0.289, "step": 3400 }, { "epoch": 0.04911661790856422, "grad_norm": 1.174780249595642, "learning_rate": 4.9115678776290625e-06, "loss": 0.3689, "step": 3425 }, { "epoch": 0.04947513336775082, "grad_norm": 17.737510681152344, "learning_rate": 4.9474187380497135e-06, "loss": 0.2678, "step": 3450 }, { "epoch": 0.049833648826937416, "grad_norm": 3.3412046432495117, "learning_rate": 4.983269598470364e-06, "loss": 0.2441, "step": 3475 }, { "epoch": 0.05019216428612402, "grad_norm": 17.6431827545166, "learning_rate": 5.019120458891014e-06, "loss": 0.2896, "step": 3500 }, { "epoch": 0.05055067974531062, "grad_norm": 7.3869428634643555, "learning_rate": 5.054971319311664e-06, "loss": 0.2894, "step": 3525 }, { "epoch": 0.050909195204497215, "grad_norm": 10.661377906799316, "learning_rate": 5.090822179732313e-06, "loss": 0.2625, "step": 3550 }, { "epoch": 0.05126771066368382, "grad_norm": 1.1681920289993286, "learning_rate": 5.126673040152964e-06, "loss": 0.2328, "step": 3575 }, { "epoch": 0.05162622612287042, "grad_norm": 12.257088661193848, "learning_rate": 5.162523900573614e-06, "loss": 0.325, "step": 3600 }, { "epoch": 0.05198474158205702, "grad_norm": 14.586543083190918, "learning_rate": 5.198374760994263e-06, "loss": 0.3388, "step": 3625 }, { "epoch": 0.05234325704124362, "grad_norm": 16.690563201904297, "learning_rate": 5.234225621414914e-06, "loss": 0.4227, "step": 3650 }, { "epoch": 0.05270177250043022, "grad_norm": 20.154735565185547, "learning_rate": 5.270076481835564e-06, "loss": 0.3246, "step": 3675 }, { "epoch": 0.05306028795961682, "grad_norm": 12.045920372009277, "learning_rate": 5.3059273422562145e-06, "loss": 0.2813, "step": 3700 }, { "epoch": 0.053418803418803416, "grad_norm": 25.365291595458984, "learning_rate": 5.341778202676865e-06, "loss": 0.3932, "step": 3725 }, { "epoch": 0.05377731887799002, "grad_norm": 1.040185809135437, "learning_rate": 5.377629063097514e-06, "loss": 0.3461, "step": 3750 }, { "epoch": 0.05413583433717662, "grad_norm": 22.527341842651367, "learning_rate": 5.413479923518165e-06, "loss": 0.2823, "step": 3775 }, { "epoch": 0.054494349796363216, "grad_norm": 2.0592339038848877, "learning_rate": 5.449330783938815e-06, "loss": 0.3183, "step": 3800 }, { "epoch": 0.05485286525554982, "grad_norm": 3.3155517578125, "learning_rate": 5.485181644359465e-06, "loss": 0.291, "step": 3825 }, { "epoch": 0.05521138071473642, "grad_norm": 20.930545806884766, "learning_rate": 5.521032504780115e-06, "loss": 0.2547, "step": 3850 }, { "epoch": 0.05556989617392302, "grad_norm": 5.713258743286133, "learning_rate": 5.556883365200764e-06, "loss": 0.1941, "step": 3875 }, { "epoch": 0.05592841163310962, "grad_norm": 0.3433953523635864, "learning_rate": 5.592734225621415e-06, "loss": 0.2981, "step": 3900 }, { "epoch": 0.05628692709229622, "grad_norm": 2.814664363861084, "learning_rate": 5.6285850860420654e-06, "loss": 0.2672, "step": 3925 }, { "epoch": 0.05664544255148282, "grad_norm": 6.726901054382324, "learning_rate": 5.664435946462715e-06, "loss": 0.3191, "step": 3950 }, { "epoch": 0.05700395801066942, "grad_norm": 5.199446678161621, "learning_rate": 5.700286806883366e-06, "loss": 0.2471, "step": 3975 }, { "epoch": 0.05736247346985602, "grad_norm": 2.618137836456299, "learning_rate": 5.736137667304015e-06, "loss": 0.2112, "step": 4000 }, { "epoch": 0.05772098892904262, "grad_norm": 28.59296989440918, "learning_rate": 5.771988527724666e-06, "loss": 0.3336, "step": 4025 }, { "epoch": 0.058079504388229224, "grad_norm": 5.383624076843262, "learning_rate": 5.807839388145316e-06, "loss": 0.254, "step": 4050 }, { "epoch": 0.05843801984741582, "grad_norm": 12.735970497131348, "learning_rate": 5.843690248565965e-06, "loss": 0.3335, "step": 4075 }, { "epoch": 0.05879653530660242, "grad_norm": 3.2661921977996826, "learning_rate": 5.879541108986616e-06, "loss": 0.1809, "step": 4100 }, { "epoch": 0.05915505076578902, "grad_norm": 14.57696533203125, "learning_rate": 5.9153919694072654e-06, "loss": 0.2663, "step": 4125 }, { "epoch": 0.05951356622497562, "grad_norm": 13.833056449890137, "learning_rate": 5.9512428298279155e-06, "loss": 0.4549, "step": 4150 }, { "epoch": 0.05987208168416222, "grad_norm": 11.998178482055664, "learning_rate": 5.9870936902485665e-06, "loss": 0.2773, "step": 4175 }, { "epoch": 0.06023059714334882, "grad_norm": 4.182008743286133, "learning_rate": 6.022944550669216e-06, "loss": 0.2806, "step": 4200 }, { "epoch": 0.06058911260253542, "grad_norm": 23.295757293701172, "learning_rate": 6.058795411089867e-06, "loss": 0.1793, "step": 4225 }, { "epoch": 0.06094762806172202, "grad_norm": 6.717942237854004, "learning_rate": 6.094646271510517e-06, "loss": 0.1769, "step": 4250 }, { "epoch": 0.06130614352090862, "grad_norm": 11.703352928161621, "learning_rate": 6.130497131931166e-06, "loss": 0.4138, "step": 4275 }, { "epoch": 0.061664658980095224, "grad_norm": 16.226572036743164, "learning_rate": 6.166347992351817e-06, "loss": 0.3151, "step": 4300 }, { "epoch": 0.06202317443928182, "grad_norm": 7.861140251159668, "learning_rate": 6.202198852772466e-06, "loss": 0.2603, "step": 4325 }, { "epoch": 0.06238168989846842, "grad_norm": 16.231801986694336, "learning_rate": 6.238049713193117e-06, "loss": 0.304, "step": 4350 }, { "epoch": 0.06274020535765502, "grad_norm": 3.058619976043701, "learning_rate": 6.273900573613767e-06, "loss": 0.2668, "step": 4375 }, { "epoch": 0.06309872081684162, "grad_norm": 2.7687156200408936, "learning_rate": 6.3097514340344166e-06, "loss": 0.2349, "step": 4400 }, { "epoch": 0.06345723627602823, "grad_norm": 20.107852935791016, "learning_rate": 6.3456022944550675e-06, "loss": 0.2321, "step": 4425 }, { "epoch": 0.06381575173521482, "grad_norm": 17.46559715270996, "learning_rate": 6.381453154875717e-06, "loss": 0.3871, "step": 4450 }, { "epoch": 0.06417426719440142, "grad_norm": 3.318965196609497, "learning_rate": 6.417304015296367e-06, "loss": 0.3491, "step": 4475 }, { "epoch": 0.06453278265358803, "grad_norm": 21.96360206604004, "learning_rate": 6.453154875717018e-06, "loss": 0.2528, "step": 4500 }, { "epoch": 0.06489129811277462, "grad_norm": 3.5024871826171875, "learning_rate": 6.489005736137667e-06, "loss": 0.3305, "step": 4525 }, { "epoch": 0.06524981357196122, "grad_norm": 11.938565254211426, "learning_rate": 6.524856596558318e-06, "loss": 0.2873, "step": 4550 }, { "epoch": 0.06560832903114783, "grad_norm": 6.379380226135254, "learning_rate": 6.560707456978967e-06, "loss": 0.2505, "step": 4575 }, { "epoch": 0.06596684449033442, "grad_norm": 3.4157721996307373, "learning_rate": 6.596558317399617e-06, "loss": 0.2716, "step": 4600 }, { "epoch": 0.06632535994952102, "grad_norm": 18.432348251342773, "learning_rate": 6.632409177820268e-06, "loss": 0.3347, "step": 4625 }, { "epoch": 0.06668387540870763, "grad_norm": 11.388775825500488, "learning_rate": 6.668260038240918e-06, "loss": 0.2896, "step": 4650 }, { "epoch": 0.06704239086789422, "grad_norm": 21.211660385131836, "learning_rate": 6.7041108986615686e-06, "loss": 0.22, "step": 4675 }, { "epoch": 0.06740090632708082, "grad_norm": 12.626204490661621, "learning_rate": 6.739961759082218e-06, "loss": 0.2363, "step": 4700 }, { "epoch": 0.06775942178626743, "grad_norm": 12.552257537841797, "learning_rate": 6.775812619502868e-06, "loss": 0.2916, "step": 4725 }, { "epoch": 0.06811793724545402, "grad_norm": 21.29806137084961, "learning_rate": 6.811663479923519e-06, "loss": 0.2547, "step": 4750 }, { "epoch": 0.06847645270464063, "grad_norm": 7.858422756195068, "learning_rate": 6.847514340344168e-06, "loss": 0.2604, "step": 4775 }, { "epoch": 0.06883496816382723, "grad_norm": 16.849042892456055, "learning_rate": 6.883365200764818e-06, "loss": 0.2818, "step": 4800 }, { "epoch": 0.06919348362301382, "grad_norm": 7.374769687652588, "learning_rate": 6.919216061185469e-06, "loss": 0.223, "step": 4825 }, { "epoch": 0.06955199908220043, "grad_norm": 10.125648498535156, "learning_rate": 6.9550669216061184e-06, "loss": 0.2766, "step": 4850 }, { "epoch": 0.06991051454138703, "grad_norm": 14.431543350219727, "learning_rate": 6.990917782026769e-06, "loss": 0.2573, "step": 4875 }, { "epoch": 0.07026903000057362, "grad_norm": 14.011460304260254, "learning_rate": 7.026768642447419e-06, "loss": 0.2417, "step": 4900 }, { "epoch": 0.07062754545976023, "grad_norm": 18.060426712036133, "learning_rate": 7.062619502868069e-06, "loss": 0.2777, "step": 4925 }, { "epoch": 0.07098606091894683, "grad_norm": 3.5972554683685303, "learning_rate": 7.09847036328872e-06, "loss": 0.3278, "step": 4950 }, { "epoch": 0.07134457637813342, "grad_norm": 1.3712530136108398, "learning_rate": 7.134321223709369e-06, "loss": 0.246, "step": 4975 }, { "epoch": 0.07170309183732003, "grad_norm": 12.337106704711914, "learning_rate": 7.170172084130019e-06, "loss": 0.294, "step": 5000 }, { "epoch": 0.07206160729650662, "grad_norm": 16.15752410888672, "learning_rate": 7.206022944550669e-06, "loss": 0.342, "step": 5025 }, { "epoch": 0.07242012275569322, "grad_norm": 4.118162631988525, "learning_rate": 7.241873804971319e-06, "loss": 0.2119, "step": 5050 }, { "epoch": 0.07277863821487983, "grad_norm": 8.763769149780273, "learning_rate": 7.27772466539197e-06, "loss": 0.3332, "step": 5075 }, { "epoch": 0.07313715367406642, "grad_norm": 7.672851085662842, "learning_rate": 7.3135755258126195e-06, "loss": 0.3505, "step": 5100 }, { "epoch": 0.07349566913325302, "grad_norm": 7.740843296051025, "learning_rate": 7.34942638623327e-06, "loss": 0.2385, "step": 5125 }, { "epoch": 0.07385418459243963, "grad_norm": 0.7666776180267334, "learning_rate": 7.38527724665392e-06, "loss": 0.2066, "step": 5150 }, { "epoch": 0.07421270005162622, "grad_norm": 20.697586059570312, "learning_rate": 7.42112810707457e-06, "loss": 0.3771, "step": 5175 }, { "epoch": 0.07457121551081283, "grad_norm": 9.066771507263184, "learning_rate": 7.456978967495221e-06, "loss": 0.2952, "step": 5200 }, { "epoch": 0.07492973096999943, "grad_norm": 10.988297462463379, "learning_rate": 7.49282982791587e-06, "loss": 0.3004, "step": 5225 }, { "epoch": 0.07528824642918602, "grad_norm": 17.95578384399414, "learning_rate": 7.52868068833652e-06, "loss": 0.3006, "step": 5250 }, { "epoch": 0.07564676188837263, "grad_norm": 16.319400787353516, "learning_rate": 7.56453154875717e-06, "loss": 0.3571, "step": 5275 }, { "epoch": 0.07600527734755923, "grad_norm": 6.837569713592529, "learning_rate": 7.600382409177821e-06, "loss": 0.2746, "step": 5300 }, { "epoch": 0.07636379280674582, "grad_norm": 6.141521453857422, "learning_rate": 7.636233269598471e-06, "loss": 0.174, "step": 5325 }, { "epoch": 0.07672230826593243, "grad_norm": 17.731536865234375, "learning_rate": 7.67208413001912e-06, "loss": 0.1826, "step": 5350 }, { "epoch": 0.07708082372511903, "grad_norm": 16.809494018554688, "learning_rate": 7.70793499043977e-06, "loss": 0.2707, "step": 5375 }, { "epoch": 0.07743933918430562, "grad_norm": 18.606369018554688, "learning_rate": 7.74378585086042e-06, "loss": 0.3599, "step": 5400 }, { "epoch": 0.07779785464349223, "grad_norm": 2.5781025886535645, "learning_rate": 7.77963671128107e-06, "loss": 0.3074, "step": 5425 }, { "epoch": 0.07815637010267883, "grad_norm": 9.782306671142578, "learning_rate": 7.815487571701721e-06, "loss": 0.1888, "step": 5450 }, { "epoch": 0.07851488556186542, "grad_norm": 13.138923645019531, "learning_rate": 7.851338432122372e-06, "loss": 0.3004, "step": 5475 }, { "epoch": 0.07887340102105203, "grad_norm": 1.9858505725860596, "learning_rate": 7.887189292543021e-06, "loss": 0.3461, "step": 5500 }, { "epoch": 0.07923191648023863, "grad_norm": 20.09796905517578, "learning_rate": 7.92304015296367e-06, "loss": 0.4847, "step": 5525 }, { "epoch": 0.07959043193942522, "grad_norm": 0.7435798645019531, "learning_rate": 7.958891013384321e-06, "loss": 0.3341, "step": 5550 }, { "epoch": 0.07994894739861183, "grad_norm": 6.4624481201171875, "learning_rate": 7.994741873804972e-06, "loss": 0.2929, "step": 5575 }, { "epoch": 0.08030746285779843, "grad_norm": 26.496055603027344, "learning_rate": 8.030592734225622e-06, "loss": 0.2711, "step": 5600 }, { "epoch": 0.08066597831698503, "grad_norm": 14.294866561889648, "learning_rate": 8.066443594646273e-06, "loss": 0.2291, "step": 5625 }, { "epoch": 0.08102449377617163, "grad_norm": 25.71739387512207, "learning_rate": 8.102294455066922e-06, "loss": 0.3104, "step": 5650 }, { "epoch": 0.08138300923535823, "grad_norm": 20.985458374023438, "learning_rate": 8.138145315487571e-06, "loss": 0.2243, "step": 5675 }, { "epoch": 0.08174152469454483, "grad_norm": 22.03936004638672, "learning_rate": 8.173996175908222e-06, "loss": 0.3628, "step": 5700 }, { "epoch": 0.08210004015373143, "grad_norm": 2.0014491081237793, "learning_rate": 8.209847036328873e-06, "loss": 0.2648, "step": 5725 }, { "epoch": 0.08245855561291802, "grad_norm": 10.871953964233398, "learning_rate": 8.245697896749522e-06, "loss": 0.2664, "step": 5750 }, { "epoch": 0.08281707107210463, "grad_norm": 12.780025482177734, "learning_rate": 8.281548757170171e-06, "loss": 0.3302, "step": 5775 }, { "epoch": 0.08317558653129123, "grad_norm": 6.846097469329834, "learning_rate": 8.317399617590822e-06, "loss": 0.2813, "step": 5800 }, { "epoch": 0.08353410199047782, "grad_norm": 12.63754653930664, "learning_rate": 8.353250478011473e-06, "loss": 0.3479, "step": 5825 }, { "epoch": 0.08389261744966443, "grad_norm": 12.429254531860352, "learning_rate": 8.389101338432123e-06, "loss": 0.2842, "step": 5850 }, { "epoch": 0.08425113290885103, "grad_norm": 0.4009374976158142, "learning_rate": 8.424952198852774e-06, "loss": 0.259, "step": 5875 }, { "epoch": 0.08460964836803762, "grad_norm": 23.34334373474121, "learning_rate": 8.460803059273423e-06, "loss": 0.382, "step": 5900 }, { "epoch": 0.08496816382722423, "grad_norm": 18.360658645629883, "learning_rate": 8.496653919694072e-06, "loss": 0.2183, "step": 5925 }, { "epoch": 0.08532667928641083, "grad_norm": 19.900911331176758, "learning_rate": 8.532504780114723e-06, "loss": 0.2883, "step": 5950 }, { "epoch": 0.08568519474559742, "grad_norm": 5.370075225830078, "learning_rate": 8.568355640535374e-06, "loss": 0.2414, "step": 5975 }, { "epoch": 0.08604371020478403, "grad_norm": 0.3239923417568207, "learning_rate": 8.604206500956023e-06, "loss": 0.2656, "step": 6000 }, { "epoch": 0.08640222566397063, "grad_norm": 2.176591634750366, "learning_rate": 8.640057361376672e-06, "loss": 0.2892, "step": 6025 }, { "epoch": 0.08676074112315724, "grad_norm": 0.956741213798523, "learning_rate": 8.675908221797323e-06, "loss": 0.3001, "step": 6050 }, { "epoch": 0.08711925658234383, "grad_norm": 19.336124420166016, "learning_rate": 8.711759082217973e-06, "loss": 0.3248, "step": 6075 }, { "epoch": 0.08747777204153043, "grad_norm": 5.649174690246582, "learning_rate": 8.747609942638624e-06, "loss": 0.2865, "step": 6100 }, { "epoch": 0.08783628750071704, "grad_norm": 10.958148002624512, "learning_rate": 8.783460803059275e-06, "loss": 0.2559, "step": 6125 }, { "epoch": 0.08819480295990363, "grad_norm": 7.359428882598877, "learning_rate": 8.819311663479924e-06, "loss": 0.3207, "step": 6150 }, { "epoch": 0.08855331841909023, "grad_norm": 30.375017166137695, "learning_rate": 8.855162523900573e-06, "loss": 0.3683, "step": 6175 }, { "epoch": 0.08891183387827684, "grad_norm": 12.677515983581543, "learning_rate": 8.891013384321224e-06, "loss": 0.3399, "step": 6200 }, { "epoch": 0.08927034933746343, "grad_norm": 13.935861587524414, "learning_rate": 8.926864244741875e-06, "loss": 0.1988, "step": 6225 }, { "epoch": 0.08962886479665003, "grad_norm": 5.437995433807373, "learning_rate": 8.962715105162524e-06, "loss": 0.1686, "step": 6250 }, { "epoch": 0.08998738025583664, "grad_norm": 12.647872924804688, "learning_rate": 8.998565965583174e-06, "loss": 0.3315, "step": 6275 }, { "epoch": 0.09034589571502323, "grad_norm": 12.21076774597168, "learning_rate": 9.034416826003824e-06, "loss": 0.3217, "step": 6300 }, { "epoch": 0.09070441117420983, "grad_norm": 18.39698600769043, "learning_rate": 9.070267686424474e-06, "loss": 0.2308, "step": 6325 }, { "epoch": 0.09106292663339643, "grad_norm": 2.6252005100250244, "learning_rate": 9.106118546845125e-06, "loss": 0.1716, "step": 6350 }, { "epoch": 0.09142144209258303, "grad_norm": 6.801554203033447, "learning_rate": 9.141969407265776e-06, "loss": 0.1579, "step": 6375 }, { "epoch": 0.09177995755176964, "grad_norm": 8.07211685180664, "learning_rate": 9.177820267686425e-06, "loss": 0.2274, "step": 6400 }, { "epoch": 0.09213847301095623, "grad_norm": 14.581258773803711, "learning_rate": 9.213671128107074e-06, "loss": 0.3091, "step": 6425 }, { "epoch": 0.09249698847014283, "grad_norm": 4.094153881072998, "learning_rate": 9.249521988527725e-06, "loss": 0.3107, "step": 6450 }, { "epoch": 0.09285550392932944, "grad_norm": 9.862046241760254, "learning_rate": 9.285372848948376e-06, "loss": 0.3867, "step": 6475 }, { "epoch": 0.09321401938851603, "grad_norm": 6.8866801261901855, "learning_rate": 9.321223709369025e-06, "loss": 0.2283, "step": 6500 }, { "epoch": 0.09357253484770263, "grad_norm": 4.640687465667725, "learning_rate": 9.357074569789675e-06, "loss": 0.3245, "step": 6525 }, { "epoch": 0.09393105030688924, "grad_norm": 15.582645416259766, "learning_rate": 9.392925430210326e-06, "loss": 0.2856, "step": 6550 }, { "epoch": 0.09428956576607583, "grad_norm": 0.6284382939338684, "learning_rate": 9.428776290630975e-06, "loss": 0.2862, "step": 6575 }, { "epoch": 0.09464808122526243, "grad_norm": 13.354533195495605, "learning_rate": 9.464627151051624e-06, "loss": 0.3408, "step": 6600 }, { "epoch": 0.09500659668444904, "grad_norm": 2.8739140033721924, "learning_rate": 9.500478011472277e-06, "loss": 0.1791, "step": 6625 }, { "epoch": 0.09536511214363563, "grad_norm": 18.193843841552734, "learning_rate": 9.536328871892926e-06, "loss": 0.3129, "step": 6650 }, { "epoch": 0.09572362760282223, "grad_norm": 13.472644805908203, "learning_rate": 9.572179732313575e-06, "loss": 0.2672, "step": 6675 }, { "epoch": 0.09608214306200884, "grad_norm": 14.050415992736816, "learning_rate": 9.608030592734226e-06, "loss": 0.2778, "step": 6700 }, { "epoch": 0.09644065852119543, "grad_norm": 19.868755340576172, "learning_rate": 9.643881453154875e-06, "loss": 0.3309, "step": 6725 }, { "epoch": 0.09679917398038203, "grad_norm": 4.7416181564331055, "learning_rate": 9.679732313575526e-06, "loss": 0.248, "step": 6750 }, { "epoch": 0.09715768943956864, "grad_norm": 6.3058247566223145, "learning_rate": 9.715583173996177e-06, "loss": 0.3065, "step": 6775 }, { "epoch": 0.09751620489875523, "grad_norm": 4.436661243438721, "learning_rate": 9.751434034416827e-06, "loss": 0.2586, "step": 6800 }, { "epoch": 0.09787472035794184, "grad_norm": 18.943267822265625, "learning_rate": 9.787284894837476e-06, "loss": 0.2539, "step": 6825 }, { "epoch": 0.09823323581712844, "grad_norm": 19.619022369384766, "learning_rate": 9.823135755258125e-06, "loss": 0.2958, "step": 6850 }, { "epoch": 0.09859175127631503, "grad_norm": 28.197166442871094, "learning_rate": 9.858986615678778e-06, "loss": 0.3608, "step": 6875 }, { "epoch": 0.09895026673550164, "grad_norm": 4.569759368896484, "learning_rate": 9.894837476099427e-06, "loss": 0.2551, "step": 6900 }, { "epoch": 0.09930878219468824, "grad_norm": 6.80328893661499, "learning_rate": 9.930688336520076e-06, "loss": 0.2145, "step": 6925 }, { "epoch": 0.09966729765387483, "grad_norm": 2.0649709701538086, "learning_rate": 9.966539196940727e-06, "loss": 0.2681, "step": 6950 }, { "epoch": 0.10002581311306144, "grad_norm": 3.1558127403259277, "learning_rate": 1.0002390057361376e-05, "loss": 0.2261, "step": 6975 }, { "epoch": 0.10038432857224804, "grad_norm": 4.891828536987305, "learning_rate": 1.0038240917782027e-05, "loss": 0.2891, "step": 7000 }, { "epoch": 0.10074284403143463, "grad_norm": 9.517853736877441, "learning_rate": 1.0074091778202678e-05, "loss": 0.3138, "step": 7025 }, { "epoch": 0.10110135949062124, "grad_norm": 0.8335961699485779, "learning_rate": 1.0109942638623328e-05, "loss": 0.4105, "step": 7050 }, { "epoch": 0.10145987494980784, "grad_norm": 14.314862251281738, "learning_rate": 1.0145793499043977e-05, "loss": 0.2954, "step": 7075 }, { "epoch": 0.10181839040899443, "grad_norm": 6.088142395019531, "learning_rate": 1.0181644359464626e-05, "loss": 0.261, "step": 7100 }, { "epoch": 0.10217690586818104, "grad_norm": 14.331783294677734, "learning_rate": 1.0217495219885277e-05, "loss": 0.1886, "step": 7125 }, { "epoch": 0.10253542132736763, "grad_norm": 18.086074829101562, "learning_rate": 1.0253346080305928e-05, "loss": 0.3499, "step": 7150 }, { "epoch": 0.10289393678655423, "grad_norm": 0.530082643032074, "learning_rate": 1.0289196940726577e-05, "loss": 0.3608, "step": 7175 }, { "epoch": 0.10325245224574084, "grad_norm": 20.140615463256836, "learning_rate": 1.0325047801147228e-05, "loss": 0.3947, "step": 7200 }, { "epoch": 0.10361096770492743, "grad_norm": 24.230958938598633, "learning_rate": 1.0360898661567877e-05, "loss": 0.3336, "step": 7225 }, { "epoch": 0.10396948316411404, "grad_norm": 23.000640869140625, "learning_rate": 1.0396749521988527e-05, "loss": 0.3959, "step": 7250 }, { "epoch": 0.10432799862330064, "grad_norm": 9.382373809814453, "learning_rate": 1.043260038240918e-05, "loss": 0.3745, "step": 7275 }, { "epoch": 0.10468651408248723, "grad_norm": 21.421438217163086, "learning_rate": 1.0468451242829829e-05, "loss": 0.3263, "step": 7300 }, { "epoch": 0.10504502954167384, "grad_norm": 2.981389284133911, "learning_rate": 1.0504302103250478e-05, "loss": 0.1698, "step": 7325 }, { "epoch": 0.10540354500086044, "grad_norm": 23.876689910888672, "learning_rate": 1.0540152963671129e-05, "loss": 0.2905, "step": 7350 }, { "epoch": 0.10576206046004703, "grad_norm": 6.159508228302002, "learning_rate": 1.0576003824091778e-05, "loss": 0.3776, "step": 7375 }, { "epoch": 0.10612057591923364, "grad_norm": 9.762521743774414, "learning_rate": 1.0611854684512429e-05, "loss": 0.2976, "step": 7400 }, { "epoch": 0.10647909137842024, "grad_norm": 2.5443050861358643, "learning_rate": 1.0647705544933078e-05, "loss": 0.2069, "step": 7425 }, { "epoch": 0.10683760683760683, "grad_norm": 15.833447456359863, "learning_rate": 1.068355640535373e-05, "loss": 0.3693, "step": 7450 }, { "epoch": 0.10719612229679344, "grad_norm": 0.40668830275535583, "learning_rate": 1.0719407265774378e-05, "loss": 0.3615, "step": 7475 }, { "epoch": 0.10755463775598004, "grad_norm": 7.914300441741943, "learning_rate": 1.0755258126195028e-05, "loss": 0.2646, "step": 7500 }, { "epoch": 0.10791315321516663, "grad_norm": 12.216639518737793, "learning_rate": 1.079110898661568e-05, "loss": 0.1904, "step": 7525 }, { "epoch": 0.10827166867435324, "grad_norm": 11.191414833068848, "learning_rate": 1.082695984703633e-05, "loss": 0.3599, "step": 7550 }, { "epoch": 0.10863018413353984, "grad_norm": 0.27859094738960266, "learning_rate": 1.0862810707456979e-05, "loss": 0.2418, "step": 7575 }, { "epoch": 0.10898869959272643, "grad_norm": 9.31594181060791, "learning_rate": 1.089866156787763e-05, "loss": 0.1978, "step": 7600 }, { "epoch": 0.10934721505191304, "grad_norm": 4.339696884155273, "learning_rate": 1.0934512428298279e-05, "loss": 0.3212, "step": 7625 }, { "epoch": 0.10970573051109964, "grad_norm": 0.6278403401374817, "learning_rate": 1.097036328871893e-05, "loss": 0.2604, "step": 7650 }, { "epoch": 0.11006424597028625, "grad_norm": 17.435758590698242, "learning_rate": 1.100621414913958e-05, "loss": 0.286, "step": 7675 }, { "epoch": 0.11042276142947284, "grad_norm": 5.286866188049316, "learning_rate": 1.104206500956023e-05, "loss": 0.259, "step": 7700 }, { "epoch": 0.11078127688865944, "grad_norm": 14.168050765991211, "learning_rate": 1.107791586998088e-05, "loss": 0.1775, "step": 7725 }, { "epoch": 0.11113979234784604, "grad_norm": 10.761833190917969, "learning_rate": 1.1113766730401529e-05, "loss": 0.2711, "step": 7750 }, { "epoch": 0.11149830780703264, "grad_norm": 5.123095989227295, "learning_rate": 1.114961759082218e-05, "loss": 0.2886, "step": 7775 }, { "epoch": 0.11185682326621924, "grad_norm": 23.989742279052734, "learning_rate": 1.118546845124283e-05, "loss": 0.32, "step": 7800 }, { "epoch": 0.11221533872540584, "grad_norm": 2.973930835723877, "learning_rate": 1.122131931166348e-05, "loss": 0.2455, "step": 7825 }, { "epoch": 0.11257385418459244, "grad_norm": 16.354774475097656, "learning_rate": 1.1257170172084131e-05, "loss": 0.2727, "step": 7850 }, { "epoch": 0.11293236964377903, "grad_norm": 3.698993682861328, "learning_rate": 1.129302103250478e-05, "loss": 0.3204, "step": 7875 }, { "epoch": 0.11329088510296564, "grad_norm": 23.629255294799805, "learning_rate": 1.132887189292543e-05, "loss": 0.4103, "step": 7900 }, { "epoch": 0.11364940056215224, "grad_norm": 10.648221015930176, "learning_rate": 1.1364722753346082e-05, "loss": 0.3257, "step": 7925 }, { "epoch": 0.11400791602133883, "grad_norm": 8.110867500305176, "learning_rate": 1.1400573613766731e-05, "loss": 0.2509, "step": 7950 }, { "epoch": 0.11436643148052544, "grad_norm": 13.736448287963867, "learning_rate": 1.143642447418738e-05, "loss": 0.2249, "step": 7975 }, { "epoch": 0.11472494693971204, "grad_norm": 21.43511390686035, "learning_rate": 1.147227533460803e-05, "loss": 0.3091, "step": 8000 }, { "epoch": 0.11508346239889863, "grad_norm": 23.673892974853516, "learning_rate": 1.150812619502868e-05, "loss": 0.286, "step": 8025 }, { "epoch": 0.11544197785808524, "grad_norm": 2.534350633621216, "learning_rate": 1.1543977055449332e-05, "loss": 0.2687, "step": 8050 }, { "epoch": 0.11580049331727184, "grad_norm": 22.74208641052246, "learning_rate": 1.1579827915869981e-05, "loss": 0.3292, "step": 8075 }, { "epoch": 0.11615900877645845, "grad_norm": 14.738645553588867, "learning_rate": 1.1615678776290632e-05, "loss": 0.2949, "step": 8100 }, { "epoch": 0.11651752423564504, "grad_norm": 6.0501532554626465, "learning_rate": 1.1651529636711281e-05, "loss": 0.3328, "step": 8125 }, { "epoch": 0.11687603969483164, "grad_norm": 2.680980682373047, "learning_rate": 1.168738049713193e-05, "loss": 0.2678, "step": 8150 }, { "epoch": 0.11723455515401825, "grad_norm": 9.113404273986816, "learning_rate": 1.1723231357552583e-05, "loss": 0.2394, "step": 8175 }, { "epoch": 0.11759307061320484, "grad_norm": 2.6976821422576904, "learning_rate": 1.1759082217973232e-05, "loss": 0.1915, "step": 8200 }, { "epoch": 0.11795158607239144, "grad_norm": 3.5648367404937744, "learning_rate": 1.1794933078393882e-05, "loss": 0.3521, "step": 8225 }, { "epoch": 0.11831010153157805, "grad_norm": 22.945236206054688, "learning_rate": 1.1830783938814531e-05, "loss": 0.3141, "step": 8250 }, { "epoch": 0.11866861699076464, "grad_norm": 12.75001049041748, "learning_rate": 1.1866634799235182e-05, "loss": 0.2195, "step": 8275 }, { "epoch": 0.11902713244995124, "grad_norm": 17.52507972717285, "learning_rate": 1.1902485659655831e-05, "loss": 0.485, "step": 8300 }, { "epoch": 0.11938564790913785, "grad_norm": 3.0177817344665527, "learning_rate": 1.1938336520076482e-05, "loss": 0.3052, "step": 8325 }, { "epoch": 0.11974416336832444, "grad_norm": 19.50627899169922, "learning_rate": 1.1974187380497133e-05, "loss": 0.3124, "step": 8350 }, { "epoch": 0.12010267882751104, "grad_norm": 2.0926496982574463, "learning_rate": 1.2010038240917782e-05, "loss": 0.2291, "step": 8375 }, { "epoch": 0.12046119428669765, "grad_norm": 11.68246078491211, "learning_rate": 1.2045889101338431e-05, "loss": 0.2886, "step": 8400 }, { "epoch": 0.12081970974588424, "grad_norm": 0.4324930012226105, "learning_rate": 1.2081739961759082e-05, "loss": 0.2871, "step": 8425 }, { "epoch": 0.12117822520507084, "grad_norm": 27.83258056640625, "learning_rate": 1.2117590822179733e-05, "loss": 0.3527, "step": 8450 }, { "epoch": 0.12153674066425744, "grad_norm": 26.076093673706055, "learning_rate": 1.2153441682600383e-05, "loss": 0.2377, "step": 8475 }, { "epoch": 0.12189525612344404, "grad_norm": 4.8778181076049805, "learning_rate": 1.2189292543021034e-05, "loss": 0.2021, "step": 8500 }, { "epoch": 0.12225377158263065, "grad_norm": 3.2413330078125, "learning_rate": 1.2225143403441683e-05, "loss": 0.257, "step": 8525 }, { "epoch": 0.12261228704181724, "grad_norm": 11.522294998168945, "learning_rate": 1.2260994263862332e-05, "loss": 0.3434, "step": 8550 }, { "epoch": 0.12297080250100384, "grad_norm": 1.05722975730896, "learning_rate": 1.2296845124282983e-05, "loss": 0.3254, "step": 8575 }, { "epoch": 0.12332931796019045, "grad_norm": 17.868511199951172, "learning_rate": 1.2332695984703634e-05, "loss": 0.3069, "step": 8600 }, { "epoch": 0.12368783341937704, "grad_norm": 2.5695393085479736, "learning_rate": 1.2368546845124283e-05, "loss": 0.3845, "step": 8625 }, { "epoch": 0.12404634887856364, "grad_norm": 11.351773262023926, "learning_rate": 1.2404397705544933e-05, "loss": 0.2187, "step": 8650 }, { "epoch": 0.12440486433775025, "grad_norm": 12.971888542175293, "learning_rate": 1.2440248565965583e-05, "loss": 0.2412, "step": 8675 }, { "epoch": 0.12476337979693684, "grad_norm": 24.066993713378906, "learning_rate": 1.2476099426386234e-05, "loss": 0.2904, "step": 8700 }, { "epoch": 0.12512189525612344, "grad_norm": 6.817066192626953, "learning_rate": 1.2511950286806884e-05, "loss": 0.2358, "step": 8725 }, { "epoch": 0.12548041071531005, "grad_norm": 19.68012046813965, "learning_rate": 1.2547801147227535e-05, "loss": 0.3129, "step": 8750 }, { "epoch": 0.12583892617449666, "grad_norm": 24.423818588256836, "learning_rate": 1.2583652007648184e-05, "loss": 0.342, "step": 8775 }, { "epoch": 0.12619744163368324, "grad_norm": 2.911623239517212, "learning_rate": 1.2619502868068833e-05, "loss": 0.2916, "step": 8800 }, { "epoch": 0.12655595709286985, "grad_norm": 2.345089912414551, "learning_rate": 1.2655353728489482e-05, "loss": 0.2797, "step": 8825 }, { "epoch": 0.12691447255205646, "grad_norm": 10.685256958007812, "learning_rate": 1.2691204588910135e-05, "loss": 0.2561, "step": 8850 }, { "epoch": 0.12727298801124304, "grad_norm": 16.625776290893555, "learning_rate": 1.2727055449330784e-05, "loss": 0.2762, "step": 8875 }, { "epoch": 0.12763150347042965, "grad_norm": 11.443361282348633, "learning_rate": 1.2762906309751434e-05, "loss": 0.2105, "step": 8900 }, { "epoch": 0.12799001892961626, "grad_norm": 1.698874831199646, "learning_rate": 1.2798757170172085e-05, "loss": 0.2114, "step": 8925 }, { "epoch": 0.12834853438880284, "grad_norm": 11.867179870605469, "learning_rate": 1.2834608030592734e-05, "loss": 0.2406, "step": 8950 }, { "epoch": 0.12870704984798945, "grad_norm": 11.702847480773926, "learning_rate": 1.2870458891013385e-05, "loss": 0.2502, "step": 8975 }, { "epoch": 0.12906556530717606, "grad_norm": 1.4539886713027954, "learning_rate": 1.2906309751434036e-05, "loss": 0.3227, "step": 9000 }, { "epoch": 0.12942408076636264, "grad_norm": 3.6188247203826904, "learning_rate": 1.2942160611854685e-05, "loss": 0.1966, "step": 9025 }, { "epoch": 0.12978259622554925, "grad_norm": 1.1882764101028442, "learning_rate": 1.2978011472275334e-05, "loss": 0.2884, "step": 9050 }, { "epoch": 0.13014111168473586, "grad_norm": 14.088600158691406, "learning_rate": 1.3013862332695985e-05, "loss": 0.283, "step": 9075 }, { "epoch": 0.13049962714392244, "grad_norm": 20.499393463134766, "learning_rate": 1.3049713193116636e-05, "loss": 0.2147, "step": 9100 }, { "epoch": 0.13085814260310905, "grad_norm": 0.8238634467124939, "learning_rate": 1.3085564053537285e-05, "loss": 0.2412, "step": 9125 }, { "epoch": 0.13121665806229565, "grad_norm": 15.354934692382812, "learning_rate": 1.3121414913957935e-05, "loss": 0.2504, "step": 9150 }, { "epoch": 0.13157517352148224, "grad_norm": 11.526517868041992, "learning_rate": 1.3157265774378586e-05, "loss": 0.2976, "step": 9175 }, { "epoch": 0.13193368898066885, "grad_norm": 4.268041610717773, "learning_rate": 1.3193116634799235e-05, "loss": 0.2395, "step": 9200 }, { "epoch": 0.13229220443985545, "grad_norm": 18.63066291809082, "learning_rate": 1.3228967495219886e-05, "loss": 0.2913, "step": 9225 }, { "epoch": 0.13265071989904204, "grad_norm": 16.038999557495117, "learning_rate": 1.3264818355640537e-05, "loss": 0.2564, "step": 9250 }, { "epoch": 0.13300923535822864, "grad_norm": 17.239137649536133, "learning_rate": 1.3300669216061186e-05, "loss": 0.3168, "step": 9275 }, { "epoch": 0.13336775081741525, "grad_norm": 22.60131072998047, "learning_rate": 1.3336520076481835e-05, "loss": 0.2207, "step": 9300 }, { "epoch": 0.13372626627660184, "grad_norm": 1.6197315454483032, "learning_rate": 1.3372370936902486e-05, "loss": 0.2563, "step": 9325 }, { "epoch": 0.13408478173578844, "grad_norm": 9.463700294494629, "learning_rate": 1.3408221797323137e-05, "loss": 0.2593, "step": 9350 }, { "epoch": 0.13444329719497505, "grad_norm": 8.641891479492188, "learning_rate": 1.3444072657743786e-05, "loss": 0.3556, "step": 9375 }, { "epoch": 0.13480181265416163, "grad_norm": 11.827491760253906, "learning_rate": 1.3479923518164436e-05, "loss": 0.2193, "step": 9400 }, { "epoch": 0.13516032811334824, "grad_norm": 15.027226448059082, "learning_rate": 1.3515774378585087e-05, "loss": 0.2799, "step": 9425 }, { "epoch": 0.13551884357253485, "grad_norm": 0.3342060446739197, "learning_rate": 1.3551625239005736e-05, "loss": 0.2218, "step": 9450 }, { "epoch": 0.13587735903172143, "grad_norm": 15.823084831237793, "learning_rate": 1.3587476099426385e-05, "loss": 0.233, "step": 9475 }, { "epoch": 0.13623587449090804, "grad_norm": 13.48747444152832, "learning_rate": 1.3623326959847038e-05, "loss": 0.2302, "step": 9500 }, { "epoch": 0.13659438995009465, "grad_norm": 7.0587286949157715, "learning_rate": 1.3659177820267687e-05, "loss": 0.2179, "step": 9525 }, { "epoch": 0.13695290540928126, "grad_norm": 0.05728193372488022, "learning_rate": 1.3695028680688336e-05, "loss": 0.3462, "step": 9550 }, { "epoch": 0.13731142086846784, "grad_norm": 17.166425704956055, "learning_rate": 1.3730879541108987e-05, "loss": 0.2051, "step": 9575 }, { "epoch": 0.13766993632765445, "grad_norm": 7.859862804412842, "learning_rate": 1.3766730401529636e-05, "loss": 0.2397, "step": 9600 }, { "epoch": 0.13802845178684106, "grad_norm": 16.48023223876953, "learning_rate": 1.3802581261950287e-05, "loss": 0.2138, "step": 9625 }, { "epoch": 0.13838696724602764, "grad_norm": 17.111236572265625, "learning_rate": 1.3838432122370938e-05, "loss": 0.1725, "step": 9650 }, { "epoch": 0.13874548270521425, "grad_norm": 24.01788902282715, "learning_rate": 1.3874282982791588e-05, "loss": 0.3062, "step": 9675 }, { "epoch": 0.13910399816440086, "grad_norm": 5.332062244415283, "learning_rate": 1.3910133843212237e-05, "loss": 0.4036, "step": 9700 }, { "epoch": 0.13946251362358744, "grad_norm": 30.110389709472656, "learning_rate": 1.3945984703632886e-05, "loss": 0.3018, "step": 9725 }, { "epoch": 0.13982102908277405, "grad_norm": 32.480125427246094, "learning_rate": 1.3981835564053539e-05, "loss": 0.4429, "step": 9750 }, { "epoch": 0.14017954454196066, "grad_norm": 15.69480037689209, "learning_rate": 1.4017686424474188e-05, "loss": 0.2215, "step": 9775 }, { "epoch": 0.14053806000114724, "grad_norm": 11.107805252075195, "learning_rate": 1.4053537284894837e-05, "loss": 0.1898, "step": 9800 }, { "epoch": 0.14089657546033385, "grad_norm": 6.358415126800537, "learning_rate": 1.4089388145315488e-05, "loss": 0.1921, "step": 9825 }, { "epoch": 0.14125509091952046, "grad_norm": 11.04603099822998, "learning_rate": 1.4125239005736138e-05, "loss": 0.314, "step": 9850 }, { "epoch": 0.14161360637870704, "grad_norm": 5.064452171325684, "learning_rate": 1.4161089866156788e-05, "loss": 0.2212, "step": 9875 }, { "epoch": 0.14197212183789365, "grad_norm": 18.752994537353516, "learning_rate": 1.419694072657744e-05, "loss": 0.197, "step": 9900 }, { "epoch": 0.14233063729708026, "grad_norm": 0.33898741006851196, "learning_rate": 1.4232791586998089e-05, "loss": 0.3416, "step": 9925 }, { "epoch": 0.14268915275626684, "grad_norm": 16.085783004760742, "learning_rate": 1.4268642447418738e-05, "loss": 0.4404, "step": 9950 }, { "epoch": 0.14304766821545345, "grad_norm": 0.2417752742767334, "learning_rate": 1.4304493307839387e-05, "loss": 0.232, "step": 9975 }, { "epoch": 0.14340618367464006, "grad_norm": 16.492090225219727, "learning_rate": 1.4340344168260038e-05, "loss": 0.2107, "step": 10000 }, { "epoch": 0.14376469913382664, "grad_norm": 16.11565399169922, "learning_rate": 1.4376195028680689e-05, "loss": 0.3269, "step": 10025 }, { "epoch": 0.14412321459301325, "grad_norm": 19.828208923339844, "learning_rate": 1.4412045889101338e-05, "loss": 0.401, "step": 10050 }, { "epoch": 0.14448173005219986, "grad_norm": 10.694297790527344, "learning_rate": 1.444789674952199e-05, "loss": 0.2733, "step": 10075 }, { "epoch": 0.14484024551138644, "grad_norm": 18.6577205657959, "learning_rate": 1.4483747609942639e-05, "loss": 0.2808, "step": 10100 }, { "epoch": 0.14519876097057305, "grad_norm": 16.905235290527344, "learning_rate": 1.4519598470363288e-05, "loss": 0.3102, "step": 10125 }, { "epoch": 0.14555727642975966, "grad_norm": 4.169461250305176, "learning_rate": 1.455544933078394e-05, "loss": 0.2057, "step": 10150 }, { "epoch": 0.14591579188894624, "grad_norm": 2.59370493888855, "learning_rate": 1.459130019120459e-05, "loss": 0.3061, "step": 10175 }, { "epoch": 0.14627430734813285, "grad_norm": 2.0360071659088135, "learning_rate": 1.4627151051625239e-05, "loss": 0.1416, "step": 10200 }, { "epoch": 0.14663282280731946, "grad_norm": 0.4490683376789093, "learning_rate": 1.466300191204589e-05, "loss": 0.252, "step": 10225 }, { "epoch": 0.14699133826650604, "grad_norm": 2.2378175258636475, "learning_rate": 1.469885277246654e-05, "loss": 0.3355, "step": 10250 }, { "epoch": 0.14734985372569265, "grad_norm": 8.132572174072266, "learning_rate": 1.473470363288719e-05, "loss": 0.2544, "step": 10275 }, { "epoch": 0.14770836918487926, "grad_norm": 13.80290699005127, "learning_rate": 1.477055449330784e-05, "loss": 0.2743, "step": 10300 }, { "epoch": 0.14806688464406587, "grad_norm": 6.832772254943848, "learning_rate": 1.480640535372849e-05, "loss": 0.2962, "step": 10325 }, { "epoch": 0.14842540010325245, "grad_norm": 16.857196807861328, "learning_rate": 1.484225621414914e-05, "loss": 0.2292, "step": 10350 }, { "epoch": 0.14878391556243906, "grad_norm": 21.293046951293945, "learning_rate": 1.4878107074569789e-05, "loss": 0.2574, "step": 10375 }, { "epoch": 0.14914243102162567, "grad_norm": 3.926741361618042, "learning_rate": 1.4913957934990441e-05, "loss": 0.2914, "step": 10400 }, { "epoch": 0.14950094648081225, "grad_norm": 8.852882385253906, "learning_rate": 1.494980879541109e-05, "loss": 0.2511, "step": 10425 }, { "epoch": 0.14985946193999886, "grad_norm": 16.455188751220703, "learning_rate": 1.498565965583174e-05, "loss": 0.3095, "step": 10450 }, { "epoch": 0.15021797739918546, "grad_norm": 17.636394500732422, "learning_rate": 1.5021510516252391e-05, "loss": 0.2505, "step": 10475 }, { "epoch": 0.15057649285837205, "grad_norm": 18.123008728027344, "learning_rate": 1.505736137667304e-05, "loss": 0.2652, "step": 10500 }, { "epoch": 0.15093500831755866, "grad_norm": 13.321243286132812, "learning_rate": 1.509321223709369e-05, "loss": 0.2403, "step": 10525 }, { "epoch": 0.15129352377674526, "grad_norm": 8.497184753417969, "learning_rate": 1.512906309751434e-05, "loss": 0.1091, "step": 10550 }, { "epoch": 0.15165203923593185, "grad_norm": 11.896331787109375, "learning_rate": 1.516491395793499e-05, "loss": 0.2522, "step": 10575 }, { "epoch": 0.15201055469511845, "grad_norm": 17.43647575378418, "learning_rate": 1.5200764818355642e-05, "loss": 0.291, "step": 10600 }, { "epoch": 0.15236907015430506, "grad_norm": 6.948025703430176, "learning_rate": 1.5236615678776292e-05, "loss": 0.2901, "step": 10625 }, { "epoch": 0.15272758561349165, "grad_norm": 0.7824473977088928, "learning_rate": 1.5272466539196943e-05, "loss": 0.2431, "step": 10650 }, { "epoch": 0.15308610107267825, "grad_norm": 7.71803617477417, "learning_rate": 1.5308317399617592e-05, "loss": 0.2723, "step": 10675 }, { "epoch": 0.15344461653186486, "grad_norm": 14.82254409790039, "learning_rate": 1.534416826003824e-05, "loss": 0.3449, "step": 10700 }, { "epoch": 0.15380313199105144, "grad_norm": 6.661323547363281, "learning_rate": 1.538001912045889e-05, "loss": 0.2166, "step": 10725 }, { "epoch": 0.15416164745023805, "grad_norm": 1.1034338474273682, "learning_rate": 1.541586998087954e-05, "loss": 0.2052, "step": 10750 }, { "epoch": 0.15452016290942466, "grad_norm": 21.459274291992188, "learning_rate": 1.5451720841300192e-05, "loss": 0.2591, "step": 10775 }, { "epoch": 0.15487867836861124, "grad_norm": 5.239480972290039, "learning_rate": 1.548757170172084e-05, "loss": 0.2462, "step": 10800 }, { "epoch": 0.15523719382779785, "grad_norm": 4.729661464691162, "learning_rate": 1.552342256214149e-05, "loss": 0.332, "step": 10825 }, { "epoch": 0.15559570928698446, "grad_norm": 4.409570217132568, "learning_rate": 1.555927342256214e-05, "loss": 0.2762, "step": 10850 }, { "epoch": 0.15595422474617104, "grad_norm": 24.196752548217773, "learning_rate": 1.5595124282982793e-05, "loss": 0.3677, "step": 10875 }, { "epoch": 0.15631274020535765, "grad_norm": 17.609739303588867, "learning_rate": 1.5630975143403442e-05, "loss": 0.33, "step": 10900 }, { "epoch": 0.15667125566454426, "grad_norm": 28.70581817626953, "learning_rate": 1.5666826003824095e-05, "loss": 0.3473, "step": 10925 }, { "epoch": 0.15702977112373084, "grad_norm": 17.02039909362793, "learning_rate": 1.5702676864244744e-05, "loss": 0.2093, "step": 10950 }, { "epoch": 0.15738828658291745, "grad_norm": 13.151288032531738, "learning_rate": 1.5738527724665393e-05, "loss": 0.2642, "step": 10975 }, { "epoch": 0.15774680204210406, "grad_norm": 1.4347820281982422, "learning_rate": 1.5774378585086042e-05, "loss": 0.3136, "step": 11000 }, { "epoch": 0.15810531750129064, "grad_norm": 0.7763677835464478, "learning_rate": 1.581022944550669e-05, "loss": 0.2203, "step": 11025 }, { "epoch": 0.15846383296047725, "grad_norm": 1.8233129978179932, "learning_rate": 1.584608030592734e-05, "loss": 0.3187, "step": 11050 }, { "epoch": 0.15882234841966386, "grad_norm": 6.655683517456055, "learning_rate": 1.588193116634799e-05, "loss": 0.1871, "step": 11075 }, { "epoch": 0.15918086387885044, "grad_norm": 3.946406602859497, "learning_rate": 1.5917782026768643e-05, "loss": 0.2984, "step": 11100 }, { "epoch": 0.15953937933803705, "grad_norm": 14.329797744750977, "learning_rate": 1.5953632887189295e-05, "loss": 0.2166, "step": 11125 }, { "epoch": 0.15989789479722366, "grad_norm": 6.622003078460693, "learning_rate": 1.5989483747609945e-05, "loss": 0.3207, "step": 11150 }, { "epoch": 0.16025641025641027, "grad_norm": 22.770343780517578, "learning_rate": 1.6025334608030594e-05, "loss": 0.1988, "step": 11175 }, { "epoch": 0.16061492571559685, "grad_norm": 19.166423797607422, "learning_rate": 1.6061185468451243e-05, "loss": 0.2135, "step": 11200 }, { "epoch": 0.16097344117478346, "grad_norm": 2.716235637664795, "learning_rate": 1.6097036328871892e-05, "loss": 0.2562, "step": 11225 }, { "epoch": 0.16133195663397007, "grad_norm": 12.122281074523926, "learning_rate": 1.6132887189292545e-05, "loss": 0.2948, "step": 11250 }, { "epoch": 0.16169047209315665, "grad_norm": 19.658681869506836, "learning_rate": 1.6168738049713194e-05, "loss": 0.2356, "step": 11275 }, { "epoch": 0.16204898755234326, "grad_norm": 2.4535510540008545, "learning_rate": 1.6204588910133844e-05, "loss": 0.2886, "step": 11300 }, { "epoch": 0.16240750301152987, "grad_norm": 7.263482093811035, "learning_rate": 1.6240439770554493e-05, "loss": 0.3553, "step": 11325 }, { "epoch": 0.16276601847071645, "grad_norm": 12.172643661499023, "learning_rate": 1.6276290630975142e-05, "loss": 0.3818, "step": 11350 }, { "epoch": 0.16312453392990306, "grad_norm": 22.326637268066406, "learning_rate": 1.631214149139579e-05, "loss": 0.2549, "step": 11375 }, { "epoch": 0.16348304938908967, "grad_norm": 13.146454811096191, "learning_rate": 1.6347992351816444e-05, "loss": 0.2905, "step": 11400 }, { "epoch": 0.16384156484827625, "grad_norm": 31.5742244720459, "learning_rate": 1.6383843212237097e-05, "loss": 0.3711, "step": 11425 }, { "epoch": 0.16420008030746286, "grad_norm": 6.215224742889404, "learning_rate": 1.6419694072657746e-05, "loss": 0.3598, "step": 11450 }, { "epoch": 0.16455859576664947, "grad_norm": 15.081979751586914, "learning_rate": 1.6455544933078395e-05, "loss": 0.2719, "step": 11475 }, { "epoch": 0.16491711122583605, "grad_norm": 10.286989212036133, "learning_rate": 1.6491395793499044e-05, "loss": 0.3031, "step": 11500 }, { "epoch": 0.16527562668502266, "grad_norm": 12.01315689086914, "learning_rate": 1.6527246653919694e-05, "loss": 0.313, "step": 11525 }, { "epoch": 0.16563414214420927, "grad_norm": 36.180931091308594, "learning_rate": 1.6563097514340343e-05, "loss": 0.2152, "step": 11550 }, { "epoch": 0.16599265760339585, "grad_norm": 15.821786880493164, "learning_rate": 1.6598948374760992e-05, "loss": 0.2317, "step": 11575 }, { "epoch": 0.16635117306258246, "grad_norm": 9.373276710510254, "learning_rate": 1.6634799235181645e-05, "loss": 0.2677, "step": 11600 }, { "epoch": 0.16670968852176907, "grad_norm": 10.919914245605469, "learning_rate": 1.6670650095602294e-05, "loss": 0.2674, "step": 11625 }, { "epoch": 0.16706820398095565, "grad_norm": 15.802181243896484, "learning_rate": 1.6706500956022947e-05, "loss": 0.2674, "step": 11650 }, { "epoch": 0.16742671944014226, "grad_norm": 8.144623756408691, "learning_rate": 1.6742351816443596e-05, "loss": 0.2847, "step": 11675 }, { "epoch": 0.16778523489932887, "grad_norm": 1.2909811735153198, "learning_rate": 1.6778202676864245e-05, "loss": 0.2014, "step": 11700 }, { "epoch": 0.16814375035851545, "grad_norm": 2.001636028289795, "learning_rate": 1.6814053537284894e-05, "loss": 0.2338, "step": 11725 }, { "epoch": 0.16850226581770206, "grad_norm": 4.610132694244385, "learning_rate": 1.6849904397705547e-05, "loss": 0.3347, "step": 11750 }, { "epoch": 0.16886078127688867, "grad_norm": 2.7858340740203857, "learning_rate": 1.6885755258126196e-05, "loss": 0.3078, "step": 11775 }, { "epoch": 0.16921929673607525, "grad_norm": 4.125547885894775, "learning_rate": 1.6921606118546846e-05, "loss": 0.2769, "step": 11800 }, { "epoch": 0.16957781219526186, "grad_norm": 3.4889895915985107, "learning_rate": 1.6957456978967495e-05, "loss": 0.2729, "step": 11825 }, { "epoch": 0.16993632765444847, "grad_norm": 3.137624502182007, "learning_rate": 1.6993307839388144e-05, "loss": 0.1877, "step": 11850 }, { "epoch": 0.17029484311363505, "grad_norm": 6.064413070678711, "learning_rate": 1.7029158699808793e-05, "loss": 0.2751, "step": 11875 }, { "epoch": 0.17065335857282166, "grad_norm": 7.762181282043457, "learning_rate": 1.7065009560229446e-05, "loss": 0.2127, "step": 11900 }, { "epoch": 0.17101187403200827, "grad_norm": 1.1488157510757446, "learning_rate": 1.71008604206501e-05, "loss": 0.2649, "step": 11925 }, { "epoch": 0.17137038949119485, "grad_norm": 2.052969217300415, "learning_rate": 1.7136711281070748e-05, "loss": 0.2012, "step": 11950 }, { "epoch": 0.17172890495038146, "grad_norm": 10.710022926330566, "learning_rate": 1.7172562141491397e-05, "loss": 0.2457, "step": 11975 }, { "epoch": 0.17208742040956806, "grad_norm": 5.2444987297058105, "learning_rate": 1.7208413001912046e-05, "loss": 0.1981, "step": 12000 }, { "epoch": 0.17244593586875467, "grad_norm": 4.592362880706787, "learning_rate": 1.7244263862332696e-05, "loss": 0.2818, "step": 12025 }, { "epoch": 0.17280445132794126, "grad_norm": 17.535812377929688, "learning_rate": 1.7280114722753345e-05, "loss": 0.2405, "step": 12050 }, { "epoch": 0.17316296678712786, "grad_norm": 4.358172416687012, "learning_rate": 1.7315965583173998e-05, "loss": 0.2138, "step": 12075 }, { "epoch": 0.17352148224631447, "grad_norm": 2.108764410018921, "learning_rate": 1.7351816443594647e-05, "loss": 0.2147, "step": 12100 }, { "epoch": 0.17387999770550105, "grad_norm": 0.05134429782629013, "learning_rate": 1.7387667304015296e-05, "loss": 0.2169, "step": 12125 }, { "epoch": 0.17423851316468766, "grad_norm": 10.722013473510742, "learning_rate": 1.7423518164435945e-05, "loss": 0.3008, "step": 12150 }, { "epoch": 0.17459702862387427, "grad_norm": 9.002740859985352, "learning_rate": 1.7459369024856598e-05, "loss": 0.2203, "step": 12175 }, { "epoch": 0.17495554408306085, "grad_norm": 21.102359771728516, "learning_rate": 1.7495219885277247e-05, "loss": 0.3341, "step": 12200 }, { "epoch": 0.17531405954224746, "grad_norm": 4.814400672912598, "learning_rate": 1.7531070745697897e-05, "loss": 0.3199, "step": 12225 }, { "epoch": 0.17567257500143407, "grad_norm": 10.349713325500488, "learning_rate": 1.756692160611855e-05, "loss": 0.2708, "step": 12250 }, { "epoch": 0.17603109046062065, "grad_norm": 15.6974458694458, "learning_rate": 1.76027724665392e-05, "loss": 0.2548, "step": 12275 }, { "epoch": 0.17638960591980726, "grad_norm": 21.00507164001465, "learning_rate": 1.7638623326959848e-05, "loss": 0.21, "step": 12300 }, { "epoch": 0.17674812137899387, "grad_norm": 12.954899787902832, "learning_rate": 1.7674474187380497e-05, "loss": 0.2289, "step": 12325 }, { "epoch": 0.17710663683818045, "grad_norm": 18.55520248413086, "learning_rate": 1.7710325047801146e-05, "loss": 0.2766, "step": 12350 }, { "epoch": 0.17746515229736706, "grad_norm": 20.456411361694336, "learning_rate": 1.7746175908221795e-05, "loss": 0.2093, "step": 12375 }, { "epoch": 0.17782366775655367, "grad_norm": 12.54308032989502, "learning_rate": 1.7782026768642448e-05, "loss": 0.3476, "step": 12400 }, { "epoch": 0.17818218321574025, "grad_norm": 19.658023834228516, "learning_rate": 1.78178776290631e-05, "loss": 0.185, "step": 12425 }, { "epoch": 0.17854069867492686, "grad_norm": 4.567080974578857, "learning_rate": 1.785372848948375e-05, "loss": 0.2308, "step": 12450 }, { "epoch": 0.17889921413411347, "grad_norm": 6.8268046379089355, "learning_rate": 1.78895793499044e-05, "loss": 0.236, "step": 12475 }, { "epoch": 0.17925772959330005, "grad_norm": 11.02785587310791, "learning_rate": 1.792543021032505e-05, "loss": 0.2508, "step": 12500 }, { "epoch": 0.17961624505248666, "grad_norm": 5.4654083251953125, "learning_rate": 1.7961281070745698e-05, "loss": 0.1667, "step": 12525 }, { "epoch": 0.17997476051167327, "grad_norm": 10.121848106384277, "learning_rate": 1.7997131931166347e-05, "loss": 0.205, "step": 12550 }, { "epoch": 0.18033327597085985, "grad_norm": 14.981104850769043, "learning_rate": 1.8032982791587e-05, "loss": 0.1506, "step": 12575 }, { "epoch": 0.18069179143004646, "grad_norm": 23.49588966369629, "learning_rate": 1.806883365200765e-05, "loss": 0.3821, "step": 12600 }, { "epoch": 0.18105030688923307, "grad_norm": 2.7092537879943848, "learning_rate": 1.8104684512428298e-05, "loss": 0.1778, "step": 12625 }, { "epoch": 0.18140882234841965, "grad_norm": 10.781951904296875, "learning_rate": 1.8140535372848947e-05, "loss": 0.2324, "step": 12650 }, { "epoch": 0.18176733780760626, "grad_norm": 3.831057071685791, "learning_rate": 1.8176386233269597e-05, "loss": 0.4084, "step": 12675 }, { "epoch": 0.18212585326679287, "grad_norm": 22.82911491394043, "learning_rate": 1.821223709369025e-05, "loss": 0.3243, "step": 12700 }, { "epoch": 0.18248436872597945, "grad_norm": 3.261145830154419, "learning_rate": 1.82480879541109e-05, "loss": 0.2732, "step": 12725 }, { "epoch": 0.18284288418516606, "grad_norm": 23.101394653320312, "learning_rate": 1.828393881453155e-05, "loss": 0.3696, "step": 12750 }, { "epoch": 0.18320139964435267, "grad_norm": 3.5918619632720947, "learning_rate": 1.83197896749522e-05, "loss": 0.2417, "step": 12775 }, { "epoch": 0.18355991510353928, "grad_norm": 4.218470573425293, "learning_rate": 1.835564053537285e-05, "loss": 0.2914, "step": 12800 }, { "epoch": 0.18391843056272586, "grad_norm": 22.810903549194336, "learning_rate": 1.83914913957935e-05, "loss": 0.2165, "step": 12825 }, { "epoch": 0.18427694602191247, "grad_norm": 6.662921905517578, "learning_rate": 1.8427342256214148e-05, "loss": 0.271, "step": 12850 }, { "epoch": 0.18463546148109908, "grad_norm": 4.016091346740723, "learning_rate": 1.8463193116634798e-05, "loss": 0.1634, "step": 12875 }, { "epoch": 0.18499397694028566, "grad_norm": 7.332846164703369, "learning_rate": 1.849904397705545e-05, "loss": 0.272, "step": 12900 }, { "epoch": 0.18535249239947227, "grad_norm": 8.615911483764648, "learning_rate": 1.85348948374761e-05, "loss": 0.3422, "step": 12925 }, { "epoch": 0.18571100785865888, "grad_norm": 2.4440839290618896, "learning_rate": 1.8570745697896752e-05, "loss": 0.294, "step": 12950 }, { "epoch": 0.18606952331784546, "grad_norm": 2.535374641418457, "learning_rate": 1.86065965583174e-05, "loss": 0.2093, "step": 12975 }, { "epoch": 0.18642803877703207, "grad_norm": 22.8557071685791, "learning_rate": 1.864244741873805e-05, "loss": 0.2384, "step": 13000 }, { "epoch": 0.18678655423621868, "grad_norm": 1.1257294416427612, "learning_rate": 1.86782982791587e-05, "loss": 0.2427, "step": 13025 }, { "epoch": 0.18714506969540526, "grad_norm": 0.842964768409729, "learning_rate": 1.871414913957935e-05, "loss": 0.2965, "step": 13050 }, { "epoch": 0.18750358515459187, "grad_norm": 23.37690544128418, "learning_rate": 1.8750000000000002e-05, "loss": 0.1498, "step": 13075 }, { "epoch": 0.18786210061377848, "grad_norm": 25.53985595703125, "learning_rate": 1.878585086042065e-05, "loss": 0.3083, "step": 13100 }, { "epoch": 0.18822061607296506, "grad_norm": 4.156383991241455, "learning_rate": 1.88217017208413e-05, "loss": 0.3169, "step": 13125 }, { "epoch": 0.18857913153215167, "grad_norm": 21.888689041137695, "learning_rate": 1.885755258126195e-05, "loss": 0.2523, "step": 13150 }, { "epoch": 0.18893764699133828, "grad_norm": 7.158427715301514, "learning_rate": 1.88934034416826e-05, "loss": 0.2752, "step": 13175 }, { "epoch": 0.18929616245052486, "grad_norm": 20.771564483642578, "learning_rate": 1.8929254302103248e-05, "loss": 0.2821, "step": 13200 }, { "epoch": 0.18965467790971147, "grad_norm": 14.147101402282715, "learning_rate": 1.8965105162523904e-05, "loss": 0.3685, "step": 13225 }, { "epoch": 0.19001319336889808, "grad_norm": 7.42561674118042, "learning_rate": 1.9000956022944553e-05, "loss": 0.2781, "step": 13250 }, { "epoch": 0.19037170882808466, "grad_norm": 3.134117364883423, "learning_rate": 1.9036806883365203e-05, "loss": 0.3366, "step": 13275 }, { "epoch": 0.19073022428727127, "grad_norm": 11.31389331817627, "learning_rate": 1.9072657743785852e-05, "loss": 0.3025, "step": 13300 }, { "epoch": 0.19108873974645788, "grad_norm": 14.076050758361816, "learning_rate": 1.91085086042065e-05, "loss": 0.2857, "step": 13325 }, { "epoch": 0.19144725520564446, "grad_norm": 1.6417051553726196, "learning_rate": 1.914435946462715e-05, "loss": 0.3148, "step": 13350 }, { "epoch": 0.19180577066483107, "grad_norm": 15.269339561462402, "learning_rate": 1.91802103250478e-05, "loss": 0.1766, "step": 13375 }, { "epoch": 0.19216428612401767, "grad_norm": 21.17713165283203, "learning_rate": 1.9216061185468452e-05, "loss": 0.3483, "step": 13400 }, { "epoch": 0.19252280158320426, "grad_norm": 26.301918029785156, "learning_rate": 1.92519120458891e-05, "loss": 0.2198, "step": 13425 }, { "epoch": 0.19288131704239087, "grad_norm": 11.252765655517578, "learning_rate": 1.928776290630975e-05, "loss": 0.199, "step": 13450 }, { "epoch": 0.19323983250157747, "grad_norm": 19.194889068603516, "learning_rate": 1.9323613766730403e-05, "loss": 0.2357, "step": 13475 }, { "epoch": 0.19359834796076406, "grad_norm": 13.328980445861816, "learning_rate": 1.9359464627151053e-05, "loss": 0.295, "step": 13500 }, { "epoch": 0.19395686341995066, "grad_norm": 8.885634422302246, "learning_rate": 1.9395315487571702e-05, "loss": 0.2177, "step": 13525 }, { "epoch": 0.19431537887913727, "grad_norm": 2.6610608100891113, "learning_rate": 1.9431166347992355e-05, "loss": 0.2399, "step": 13550 }, { "epoch": 0.19467389433832386, "grad_norm": 7.541551113128662, "learning_rate": 1.9467017208413004e-05, "loss": 0.2789, "step": 13575 }, { "epoch": 0.19503240979751046, "grad_norm": 1.9849804639816284, "learning_rate": 1.9502868068833653e-05, "loss": 0.3151, "step": 13600 }, { "epoch": 0.19539092525669707, "grad_norm": 24.6917667388916, "learning_rate": 1.9538718929254302e-05, "loss": 0.2598, "step": 13625 }, { "epoch": 0.19574944071588368, "grad_norm": 10.449054718017578, "learning_rate": 1.957456978967495e-05, "loss": 0.1981, "step": 13650 }, { "epoch": 0.19610795617507026, "grad_norm": 2.976572036743164, "learning_rate": 1.96104206500956e-05, "loss": 0.3335, "step": 13675 }, { "epoch": 0.19646647163425687, "grad_norm": 2.653907060623169, "learning_rate": 1.964627151051625e-05, "loss": 0.245, "step": 13700 }, { "epoch": 0.19682498709344348, "grad_norm": 0.08924546092748642, "learning_rate": 1.9682122370936903e-05, "loss": 0.2672, "step": 13725 }, { "epoch": 0.19718350255263006, "grad_norm": 3.3527920246124268, "learning_rate": 1.9717973231357555e-05, "loss": 0.2421, "step": 13750 }, { "epoch": 0.19754201801181667, "grad_norm": 20.080825805664062, "learning_rate": 1.9753824091778205e-05, "loss": 0.215, "step": 13775 }, { "epoch": 0.19790053347100328, "grad_norm": 9.243728637695312, "learning_rate": 1.9789674952198854e-05, "loss": 0.2097, "step": 13800 }, { "epoch": 0.19825904893018986, "grad_norm": 6.638285160064697, "learning_rate": 1.9825525812619503e-05, "loss": 0.3406, "step": 13825 }, { "epoch": 0.19861756438937647, "grad_norm": 1.4172821044921875, "learning_rate": 1.9861376673040152e-05, "loss": 0.3534, "step": 13850 }, { "epoch": 0.19897607984856308, "grad_norm": 12.762964248657227, "learning_rate": 1.98972275334608e-05, "loss": 0.2269, "step": 13875 }, { "epoch": 0.19933459530774966, "grad_norm": 20.76905059814453, "learning_rate": 1.9933078393881454e-05, "loss": 0.2518, "step": 13900 }, { "epoch": 0.19969311076693627, "grad_norm": 11.809926986694336, "learning_rate": 1.9968929254302104e-05, "loss": 0.3254, "step": 13925 }, { "epoch": 0.20005162622612288, "grad_norm": 1.7210204601287842, "learning_rate": 2.0004780114722753e-05, "loss": 0.3298, "step": 13950 }, { "epoch": 0.20041014168530946, "grad_norm": 6.529745101928711, "learning_rate": 2.0040630975143402e-05, "loss": 0.4342, "step": 13975 }, { "epoch": 0.20076865714449607, "grad_norm": 1.8496177196502686, "learning_rate": 2.0076481835564055e-05, "loss": 0.2103, "step": 14000 }, { "epoch": 0.20112717260368268, "grad_norm": 19.744823455810547, "learning_rate": 2.0112332695984704e-05, "loss": 0.3411, "step": 14025 }, { "epoch": 0.20148568806286926, "grad_norm": 10.102351188659668, "learning_rate": 2.0148183556405357e-05, "loss": 0.2015, "step": 14050 }, { "epoch": 0.20184420352205587, "grad_norm": 8.985700607299805, "learning_rate": 2.0184034416826006e-05, "loss": 0.3097, "step": 14075 }, { "epoch": 0.20220271898124248, "grad_norm": 3.8325142860412598, "learning_rate": 2.0219885277246655e-05, "loss": 0.2917, "step": 14100 }, { "epoch": 0.20256123444042906, "grad_norm": 7.215860366821289, "learning_rate": 2.0255736137667304e-05, "loss": 0.2494, "step": 14125 }, { "epoch": 0.20291974989961567, "grad_norm": 3.579479932785034, "learning_rate": 2.0291586998087954e-05, "loss": 0.2903, "step": 14150 }, { "epoch": 0.20327826535880228, "grad_norm": 4.108723163604736, "learning_rate": 2.0327437858508603e-05, "loss": 0.2272, "step": 14175 }, { "epoch": 0.20363678081798886, "grad_norm": 1.678317666053772, "learning_rate": 2.0363288718929252e-05, "loss": 0.1666, "step": 14200 }, { "epoch": 0.20399529627717547, "grad_norm": 16.54724884033203, "learning_rate": 2.0399139579349905e-05, "loss": 0.2693, "step": 14225 }, { "epoch": 0.20435381173636208, "grad_norm": 9.212376594543457, "learning_rate": 2.0434990439770554e-05, "loss": 0.2665, "step": 14250 }, { "epoch": 0.20471232719554866, "grad_norm": 1.9821583032608032, "learning_rate": 2.0470841300191207e-05, "loss": 0.3334, "step": 14275 }, { "epoch": 0.20507084265473527, "grad_norm": 14.50825023651123, "learning_rate": 2.0506692160611856e-05, "loss": 0.133, "step": 14300 }, { "epoch": 0.20542935811392188, "grad_norm": 19.948196411132812, "learning_rate": 2.0542543021032505e-05, "loss": 0.2794, "step": 14325 }, { "epoch": 0.20578787357310846, "grad_norm": 16.186843872070312, "learning_rate": 2.0578393881453154e-05, "loss": 0.2756, "step": 14350 }, { "epoch": 0.20614638903229507, "grad_norm": 4.990640163421631, "learning_rate": 2.0614244741873807e-05, "loss": 0.3081, "step": 14375 }, { "epoch": 0.20650490449148168, "grad_norm": 11.470888137817383, "learning_rate": 2.0650095602294456e-05, "loss": 0.1804, "step": 14400 }, { "epoch": 0.20686341995066826, "grad_norm": 16.3361759185791, "learning_rate": 2.0685946462715106e-05, "loss": 0.2253, "step": 14425 }, { "epoch": 0.20722193540985487, "grad_norm": 6.4091997146606445, "learning_rate": 2.0721797323135755e-05, "loss": 0.2104, "step": 14450 }, { "epoch": 0.20758045086904148, "grad_norm": 1.1235712766647339, "learning_rate": 2.0757648183556404e-05, "loss": 0.2659, "step": 14475 }, { "epoch": 0.2079389663282281, "grad_norm": 2.2587084770202637, "learning_rate": 2.0793499043977053e-05, "loss": 0.2466, "step": 14500 }, { "epoch": 0.20829748178741467, "grad_norm": 13.797155380249023, "learning_rate": 2.0829349904397706e-05, "loss": 0.1674, "step": 14525 }, { "epoch": 0.20865599724660128, "grad_norm": 19.31214141845703, "learning_rate": 2.086520076481836e-05, "loss": 0.1776, "step": 14550 }, { "epoch": 0.20901451270578789, "grad_norm": 1.1947451829910278, "learning_rate": 2.0901051625239008e-05, "loss": 0.4083, "step": 14575 }, { "epoch": 0.20937302816497447, "grad_norm": 9.942072868347168, "learning_rate": 2.0936902485659657e-05, "loss": 0.2255, "step": 14600 }, { "epoch": 0.20973154362416108, "grad_norm": 15.453841209411621, "learning_rate": 2.0972753346080306e-05, "loss": 0.2053, "step": 14625 }, { "epoch": 0.21009005908334769, "grad_norm": 18.091814041137695, "learning_rate": 2.1008604206500956e-05, "loss": 0.2754, "step": 14650 }, { "epoch": 0.21044857454253427, "grad_norm": 2.479109287261963, "learning_rate": 2.1044455066921605e-05, "loss": 0.2284, "step": 14675 }, { "epoch": 0.21080709000172088, "grad_norm": 13.678213119506836, "learning_rate": 2.1080305927342258e-05, "loss": 0.3641, "step": 14700 }, { "epoch": 0.21116560546090749, "grad_norm": 18.002010345458984, "learning_rate": 2.1116156787762907e-05, "loss": 0.3571, "step": 14725 }, { "epoch": 0.21152412092009407, "grad_norm": 10.710123062133789, "learning_rate": 2.1152007648183556e-05, "loss": 0.3336, "step": 14750 }, { "epoch": 0.21188263637928068, "grad_norm": 7.275698184967041, "learning_rate": 2.118785850860421e-05, "loss": 0.3315, "step": 14775 }, { "epoch": 0.21224115183846728, "grad_norm": 26.995786666870117, "learning_rate": 2.1223709369024858e-05, "loss": 0.3012, "step": 14800 }, { "epoch": 0.21259966729765387, "grad_norm": 4.377394199371338, "learning_rate": 2.1259560229445507e-05, "loss": 0.2206, "step": 14825 }, { "epoch": 0.21295818275684048, "grad_norm": 13.78493595123291, "learning_rate": 2.1295411089866157e-05, "loss": 0.323, "step": 14850 }, { "epoch": 0.21331669821602708, "grad_norm": 22.69456672668457, "learning_rate": 2.133126195028681e-05, "loss": 0.2949, "step": 14875 }, { "epoch": 0.21367521367521367, "grad_norm": 14.544554710388184, "learning_rate": 2.136711281070746e-05, "loss": 0.1943, "step": 14900 }, { "epoch": 0.21403372913440027, "grad_norm": 11.376479148864746, "learning_rate": 2.1402963671128108e-05, "loss": 0.2425, "step": 14925 }, { "epoch": 0.21439224459358688, "grad_norm": 0.36570924520492554, "learning_rate": 2.1438814531548757e-05, "loss": 0.3171, "step": 14950 }, { "epoch": 0.21475076005277347, "grad_norm": 14.525489807128906, "learning_rate": 2.1474665391969406e-05, "loss": 0.2526, "step": 14975 }, { "epoch": 0.21510927551196007, "grad_norm": 3.2429401874542236, "learning_rate": 2.1510516252390056e-05, "loss": 0.2502, "step": 15000 }, { "epoch": 0.21546779097114668, "grad_norm": 18.905122756958008, "learning_rate": 2.1546367112810708e-05, "loss": 0.3095, "step": 15025 }, { "epoch": 0.21582630643033326, "grad_norm": 1.2669283151626587, "learning_rate": 2.158221797323136e-05, "loss": 0.2573, "step": 15050 }, { "epoch": 0.21618482188951987, "grad_norm": 1.3216358423233032, "learning_rate": 2.161806883365201e-05, "loss": 0.1949, "step": 15075 }, { "epoch": 0.21654333734870648, "grad_norm": 9.190300941467285, "learning_rate": 2.165391969407266e-05, "loss": 0.1165, "step": 15100 }, { "epoch": 0.21690185280789306, "grad_norm": 4.043756008148193, "learning_rate": 2.168977055449331e-05, "loss": 0.196, "step": 15125 }, { "epoch": 0.21726036826707967, "grad_norm": 20.155052185058594, "learning_rate": 2.1725621414913958e-05, "loss": 0.1684, "step": 15150 }, { "epoch": 0.21761888372626628, "grad_norm": 26.4940128326416, "learning_rate": 2.1761472275334607e-05, "loss": 0.285, "step": 15175 }, { "epoch": 0.21797739918545286, "grad_norm": 23.22510528564453, "learning_rate": 2.179732313575526e-05, "loss": 0.3368, "step": 15200 }, { "epoch": 0.21833591464463947, "grad_norm": 1.5917283296585083, "learning_rate": 2.183317399617591e-05, "loss": 0.3623, "step": 15225 }, { "epoch": 0.21869443010382608, "grad_norm": 3.928621292114258, "learning_rate": 2.1869024856596558e-05, "loss": 0.3363, "step": 15250 }, { "epoch": 0.2190529455630127, "grad_norm": 9.006464958190918, "learning_rate": 2.1904875717017207e-05, "loss": 0.2394, "step": 15275 }, { "epoch": 0.21941146102219927, "grad_norm": 0.8572168946266174, "learning_rate": 2.194072657743786e-05, "loss": 0.2949, "step": 15300 }, { "epoch": 0.21976997648138588, "grad_norm": 5.559074401855469, "learning_rate": 2.197657743785851e-05, "loss": 0.2266, "step": 15325 }, { "epoch": 0.2201284919405725, "grad_norm": 1.4520103931427002, "learning_rate": 2.201242829827916e-05, "loss": 0.2594, "step": 15350 }, { "epoch": 0.22048700739975907, "grad_norm": 7.689632892608643, "learning_rate": 2.204827915869981e-05, "loss": 0.315, "step": 15375 }, { "epoch": 0.22084552285894568, "grad_norm": 12.650141716003418, "learning_rate": 2.208413001912046e-05, "loss": 0.319, "step": 15400 }, { "epoch": 0.2212040383181323, "grad_norm": 9.560564041137695, "learning_rate": 2.211998087954111e-05, "loss": 0.298, "step": 15425 }, { "epoch": 0.22156255377731887, "grad_norm": 8.091878890991211, "learning_rate": 2.215583173996176e-05, "loss": 0.3205, "step": 15450 }, { "epoch": 0.22192106923650548, "grad_norm": 16.76528549194336, "learning_rate": 2.219168260038241e-05, "loss": 0.2874, "step": 15475 }, { "epoch": 0.2222795846956921, "grad_norm": 6.73468017578125, "learning_rate": 2.2227533460803058e-05, "loss": 0.2823, "step": 15500 }, { "epoch": 0.22263810015487867, "grad_norm": 16.171096801757812, "learning_rate": 2.226338432122371e-05, "loss": 0.2412, "step": 15525 }, { "epoch": 0.22299661561406528, "grad_norm": 18.83925437927246, "learning_rate": 2.229923518164436e-05, "loss": 0.2207, "step": 15550 }, { "epoch": 0.2233551310732519, "grad_norm": 3.405231237411499, "learning_rate": 2.2335086042065012e-05, "loss": 0.4157, "step": 15575 }, { "epoch": 0.22371364653243847, "grad_norm": 11.37020492553711, "learning_rate": 2.237093690248566e-05, "loss": 0.284, "step": 15600 }, { "epoch": 0.22407216199162508, "grad_norm": 0.675417423248291, "learning_rate": 2.240678776290631e-05, "loss": 0.2905, "step": 15625 }, { "epoch": 0.2244306774508117, "grad_norm": 19.647525787353516, "learning_rate": 2.244263862332696e-05, "loss": 0.3644, "step": 15650 }, { "epoch": 0.22478919290999827, "grad_norm": 15.711414337158203, "learning_rate": 2.247848948374761e-05, "loss": 0.2543, "step": 15675 }, { "epoch": 0.22514770836918488, "grad_norm": 2.2756705284118652, "learning_rate": 2.2514340344168262e-05, "loss": 0.294, "step": 15700 }, { "epoch": 0.2255062238283715, "grad_norm": 5.076662063598633, "learning_rate": 2.255019120458891e-05, "loss": 0.323, "step": 15725 }, { "epoch": 0.22586473928755807, "grad_norm": 2.941643476486206, "learning_rate": 2.258604206500956e-05, "loss": 0.2166, "step": 15750 }, { "epoch": 0.22622325474674468, "grad_norm": 3.817310094833374, "learning_rate": 2.262189292543021e-05, "loss": 0.3122, "step": 15775 }, { "epoch": 0.2265817702059313, "grad_norm": 14.421196937561035, "learning_rate": 2.265774378585086e-05, "loss": 0.2374, "step": 15800 }, { "epoch": 0.22694028566511787, "grad_norm": 8.808897972106934, "learning_rate": 2.269359464627151e-05, "loss": 0.3254, "step": 15825 }, { "epoch": 0.22729880112430448, "grad_norm": 0.9000584483146667, "learning_rate": 2.2729445506692164e-05, "loss": 0.2446, "step": 15850 }, { "epoch": 0.2276573165834911, "grad_norm": 11.781364440917969, "learning_rate": 2.2765296367112813e-05, "loss": 0.2361, "step": 15875 }, { "epoch": 0.22801583204267767, "grad_norm": 10.303633689880371, "learning_rate": 2.2801147227533463e-05, "loss": 0.2406, "step": 15900 }, { "epoch": 0.22837434750186428, "grad_norm": 9.053655624389648, "learning_rate": 2.2836998087954112e-05, "loss": 0.272, "step": 15925 }, { "epoch": 0.2287328629610509, "grad_norm": 3.8947741985321045, "learning_rate": 2.287284894837476e-05, "loss": 0.1865, "step": 15950 }, { "epoch": 0.22909137842023747, "grad_norm": 19.232593536376953, "learning_rate": 2.290869980879541e-05, "loss": 0.2863, "step": 15975 }, { "epoch": 0.22944989387942408, "grad_norm": 9.128899574279785, "learning_rate": 2.294455066921606e-05, "loss": 0.2827, "step": 16000 }, { "epoch": 0.2298084093386107, "grad_norm": 3.0357506275177, "learning_rate": 2.2980401529636712e-05, "loss": 0.2328, "step": 16025 }, { "epoch": 0.23016692479779727, "grad_norm": 0.2998218238353729, "learning_rate": 2.301625239005736e-05, "loss": 0.2733, "step": 16050 }, { "epoch": 0.23052544025698388, "grad_norm": 3.8224101066589355, "learning_rate": 2.305210325047801e-05, "loss": 0.3248, "step": 16075 }, { "epoch": 0.23088395571617049, "grad_norm": 4.80781888961792, "learning_rate": 2.3087954110898663e-05, "loss": 0.2828, "step": 16100 }, { "epoch": 0.2312424711753571, "grad_norm": 0.10697107017040253, "learning_rate": 2.3123804971319313e-05, "loss": 0.1823, "step": 16125 }, { "epoch": 0.23160098663454368, "grad_norm": 2.8913462162017822, "learning_rate": 2.3159655831739962e-05, "loss": 0.216, "step": 16150 }, { "epoch": 0.23195950209373029, "grad_norm": 19.3918399810791, "learning_rate": 2.3195506692160615e-05, "loss": 0.1757, "step": 16175 }, { "epoch": 0.2323180175529169, "grad_norm": 3.15661358833313, "learning_rate": 2.3231357552581264e-05, "loss": 0.3114, "step": 16200 }, { "epoch": 0.23267653301210348, "grad_norm": 0.14787787199020386, "learning_rate": 2.3267208413001913e-05, "loss": 0.227, "step": 16225 }, { "epoch": 0.23303504847129008, "grad_norm": 10.870231628417969, "learning_rate": 2.3303059273422562e-05, "loss": 0.2276, "step": 16250 }, { "epoch": 0.2333935639304767, "grad_norm": 14.414502143859863, "learning_rate": 2.333891013384321e-05, "loss": 0.2158, "step": 16275 }, { "epoch": 0.23375207938966328, "grad_norm": 8.305215835571289, "learning_rate": 2.337476099426386e-05, "loss": 0.2569, "step": 16300 }, { "epoch": 0.23411059484884988, "grad_norm": 0.6896699070930481, "learning_rate": 2.341061185468451e-05, "loss": 0.3239, "step": 16325 }, { "epoch": 0.2344691103080365, "grad_norm": 15.997671127319336, "learning_rate": 2.3446462715105166e-05, "loss": 0.3594, "step": 16350 }, { "epoch": 0.23482762576722307, "grad_norm": 4.046178817749023, "learning_rate": 2.3482313575525815e-05, "loss": 0.1626, "step": 16375 }, { "epoch": 0.23518614122640968, "grad_norm": 12.619485855102539, "learning_rate": 2.3518164435946465e-05, "loss": 0.2567, "step": 16400 }, { "epoch": 0.2355446566855963, "grad_norm": 26.477718353271484, "learning_rate": 2.3554015296367114e-05, "loss": 0.1319, "step": 16425 }, { "epoch": 0.23590317214478287, "grad_norm": 3.3853092193603516, "learning_rate": 2.3589866156787763e-05, "loss": 0.2969, "step": 16450 }, { "epoch": 0.23626168760396948, "grad_norm": 10.863924980163574, "learning_rate": 2.3625717017208412e-05, "loss": 0.2621, "step": 16475 }, { "epoch": 0.2366202030631561, "grad_norm": 2.614105701446533, "learning_rate": 2.3661567877629062e-05, "loss": 0.221, "step": 16500 }, { "epoch": 0.23697871852234267, "grad_norm": 10.301831245422363, "learning_rate": 2.3697418738049714e-05, "loss": 0.2774, "step": 16525 }, { "epoch": 0.23733723398152928, "grad_norm": 11.139878273010254, "learning_rate": 2.3733269598470364e-05, "loss": 0.3322, "step": 16550 }, { "epoch": 0.2376957494407159, "grad_norm": 6.709636211395264, "learning_rate": 2.3769120458891013e-05, "loss": 0.3085, "step": 16575 }, { "epoch": 0.23805426489990247, "grad_norm": 13.580421447753906, "learning_rate": 2.3804971319311662e-05, "loss": 0.239, "step": 16600 }, { "epoch": 0.23841278035908908, "grad_norm": 3.4103450775146484, "learning_rate": 2.3840822179732315e-05, "loss": 0.4293, "step": 16625 }, { "epoch": 0.2387712958182757, "grad_norm": 7.838134765625, "learning_rate": 2.3876673040152964e-05, "loss": 0.2878, "step": 16650 }, { "epoch": 0.23912981127746227, "grad_norm": 3.099550724029541, "learning_rate": 2.3912523900573617e-05, "loss": 0.1895, "step": 16675 }, { "epoch": 0.23948832673664888, "grad_norm": 18.508411407470703, "learning_rate": 2.3948374760994266e-05, "loss": 0.342, "step": 16700 }, { "epoch": 0.2398468421958355, "grad_norm": 10.390716552734375, "learning_rate": 2.3984225621414915e-05, "loss": 0.2723, "step": 16725 }, { "epoch": 0.24020535765502207, "grad_norm": 14.30135726928711, "learning_rate": 2.4020076481835564e-05, "loss": 0.222, "step": 16750 }, { "epoch": 0.24056387311420868, "grad_norm": 13.553800582885742, "learning_rate": 2.4055927342256214e-05, "loss": 0.2299, "step": 16775 }, { "epoch": 0.2409223885733953, "grad_norm": 4.5347442626953125, "learning_rate": 2.4091778202676863e-05, "loss": 0.1957, "step": 16800 }, { "epoch": 0.24128090403258187, "grad_norm": 6.830546855926514, "learning_rate": 2.4127629063097512e-05, "loss": 0.2985, "step": 16825 }, { "epoch": 0.24163941949176848, "grad_norm": 17.160104751586914, "learning_rate": 2.4163479923518165e-05, "loss": 0.2889, "step": 16850 }, { "epoch": 0.2419979349509551, "grad_norm": 7.735109806060791, "learning_rate": 2.4199330783938818e-05, "loss": 0.1413, "step": 16875 }, { "epoch": 0.24235645041014167, "grad_norm": 10.379412651062012, "learning_rate": 2.4235181644359467e-05, "loss": 0.2274, "step": 16900 }, { "epoch": 0.24271496586932828, "grad_norm": 0.633603036403656, "learning_rate": 2.4271032504780116e-05, "loss": 0.2461, "step": 16925 }, { "epoch": 0.2430734813285149, "grad_norm": 1.91392982006073, "learning_rate": 2.4306883365200765e-05, "loss": 0.2828, "step": 16950 }, { "epoch": 0.2434319967877015, "grad_norm": 3.6278910636901855, "learning_rate": 2.4342734225621415e-05, "loss": 0.2393, "step": 16975 }, { "epoch": 0.24379051224688808, "grad_norm": 14.22424602508545, "learning_rate": 2.4378585086042067e-05, "loss": 0.2934, "step": 17000 }, { "epoch": 0.2441490277060747, "grad_norm": 2.359711170196533, "learning_rate": 2.4414435946462716e-05, "loss": 0.341, "step": 17025 }, { "epoch": 0.2445075431652613, "grad_norm": 4.258749008178711, "learning_rate": 2.4450286806883366e-05, "loss": 0.238, "step": 17050 }, { "epoch": 0.24486605862444788, "grad_norm": 3.4504761695861816, "learning_rate": 2.4486137667304015e-05, "loss": 0.1928, "step": 17075 }, { "epoch": 0.2452245740836345, "grad_norm": 18.347862243652344, "learning_rate": 2.4521988527724664e-05, "loss": 0.3533, "step": 17100 }, { "epoch": 0.2455830895428211, "grad_norm": 8.651162147521973, "learning_rate": 2.4557839388145313e-05, "loss": 0.3115, "step": 17125 }, { "epoch": 0.24594160500200768, "grad_norm": 7.922807216644287, "learning_rate": 2.4593690248565966e-05, "loss": 0.2658, "step": 17150 }, { "epoch": 0.2463001204611943, "grad_norm": 4.5822601318359375, "learning_rate": 2.462954110898662e-05, "loss": 0.1975, "step": 17175 }, { "epoch": 0.2466586359203809, "grad_norm": 13.854204177856445, "learning_rate": 2.4665391969407268e-05, "loss": 0.2287, "step": 17200 }, { "epoch": 0.24701715137956748, "grad_norm": 17.697521209716797, "learning_rate": 2.4701242829827917e-05, "loss": 0.2734, "step": 17225 }, { "epoch": 0.2473756668387541, "grad_norm": 12.379929542541504, "learning_rate": 2.4737093690248567e-05, "loss": 0.3739, "step": 17250 }, { "epoch": 0.2477341822979407, "grad_norm": 2.118001937866211, "learning_rate": 2.4772944550669216e-05, "loss": 0.2639, "step": 17275 }, { "epoch": 0.24809269775712728, "grad_norm": 17.724885940551758, "learning_rate": 2.4808795411089865e-05, "loss": 0.2654, "step": 17300 }, { "epoch": 0.2484512132163139, "grad_norm": 7.6847825050354, "learning_rate": 2.4844646271510518e-05, "loss": 0.2502, "step": 17325 }, { "epoch": 0.2488097286755005, "grad_norm": 20.082653045654297, "learning_rate": 2.4880497131931167e-05, "loss": 0.2716, "step": 17350 }, { "epoch": 0.24916824413468708, "grad_norm": 12.658340454101562, "learning_rate": 2.4916347992351816e-05, "loss": 0.2423, "step": 17375 }, { "epoch": 0.2495267595938737, "grad_norm": 16.94017219543457, "learning_rate": 2.495219885277247e-05, "loss": 0.259, "step": 17400 }, { "epoch": 0.2498852750530603, "grad_norm": 5.354132175445557, "learning_rate": 2.4988049713193118e-05, "loss": 0.3749, "step": 17425 }, { "epoch": 0.2502437905122469, "grad_norm": 2.8667049407958984, "learning_rate": 2.5023900573613767e-05, "loss": 0.1646, "step": 17450 }, { "epoch": 0.2506023059714335, "grad_norm": 1.526833176612854, "learning_rate": 2.5059751434034417e-05, "loss": 0.2228, "step": 17475 }, { "epoch": 0.2509608214306201, "grad_norm": 0.7573360204696655, "learning_rate": 2.509560229445507e-05, "loss": 0.2316, "step": 17500 }, { "epoch": 0.2513193368898067, "grad_norm": 3.8987553119659424, "learning_rate": 2.513145315487572e-05, "loss": 0.2927, "step": 17525 }, { "epoch": 0.2516778523489933, "grad_norm": 14.8519868850708, "learning_rate": 2.5167304015296368e-05, "loss": 0.3393, "step": 17550 }, { "epoch": 0.2520363678081799, "grad_norm": 17.681690216064453, "learning_rate": 2.5203154875717017e-05, "loss": 0.4308, "step": 17575 }, { "epoch": 0.2523948832673665, "grad_norm": 1.2293968200683594, "learning_rate": 2.5239005736137666e-05, "loss": 0.3874, "step": 17600 }, { "epoch": 0.2527533987265531, "grad_norm": 12.973206520080566, "learning_rate": 2.5274856596558316e-05, "loss": 0.3076, "step": 17625 }, { "epoch": 0.2531119141857397, "grad_norm": 26.821746826171875, "learning_rate": 2.5310707456978965e-05, "loss": 0.3474, "step": 17650 }, { "epoch": 0.2534704296449263, "grad_norm": 1.3999649286270142, "learning_rate": 2.534655831739962e-05, "loss": 0.1248, "step": 17675 }, { "epoch": 0.2538289451041129, "grad_norm": 5.696373462677002, "learning_rate": 2.538240917782027e-05, "loss": 0.3119, "step": 17700 }, { "epoch": 0.2541874605632995, "grad_norm": 15.889311790466309, "learning_rate": 2.541826003824092e-05, "loss": 0.1985, "step": 17725 }, { "epoch": 0.2545459760224861, "grad_norm": 3.9156768321990967, "learning_rate": 2.545411089866157e-05, "loss": 0.2287, "step": 17750 }, { "epoch": 0.2549044914816727, "grad_norm": 2.730602741241455, "learning_rate": 2.5489961759082218e-05, "loss": 0.2537, "step": 17775 }, { "epoch": 0.2552630069408593, "grad_norm": 14.639958381652832, "learning_rate": 2.5525812619502867e-05, "loss": 0.2969, "step": 17800 }, { "epoch": 0.2556215224000459, "grad_norm": 5.639432907104492, "learning_rate": 2.556166347992352e-05, "loss": 0.3004, "step": 17825 }, { "epoch": 0.2559800378592325, "grad_norm": 3.00826358795166, "learning_rate": 2.559751434034417e-05, "loss": 0.1951, "step": 17850 }, { "epoch": 0.2563385533184191, "grad_norm": 0.5728304386138916, "learning_rate": 2.5633365200764818e-05, "loss": 0.1324, "step": 17875 }, { "epoch": 0.2566970687776057, "grad_norm": 6.5108723640441895, "learning_rate": 2.5669216061185468e-05, "loss": 0.3032, "step": 17900 }, { "epoch": 0.2570555842367923, "grad_norm": 16.0515079498291, "learning_rate": 2.570506692160612e-05, "loss": 0.284, "step": 17925 }, { "epoch": 0.2574140996959789, "grad_norm": 1.0791465044021606, "learning_rate": 2.574091778202677e-05, "loss": 0.1957, "step": 17950 }, { "epoch": 0.2577726151551655, "grad_norm": 2.4818198680877686, "learning_rate": 2.577676864244742e-05, "loss": 0.1595, "step": 17975 }, { "epoch": 0.2581311306143521, "grad_norm": 4.034417152404785, "learning_rate": 2.581261950286807e-05, "loss": 0.3024, "step": 18000 }, { "epoch": 0.2584896460735387, "grad_norm": 12.435139656066895, "learning_rate": 2.584847036328872e-05, "loss": 0.2543, "step": 18025 }, { "epoch": 0.2588481615327253, "grad_norm": 2.8467583656311035, "learning_rate": 2.588432122370937e-05, "loss": 0.1593, "step": 18050 }, { "epoch": 0.2592066769919119, "grad_norm": 12.575146675109863, "learning_rate": 2.592017208413002e-05, "loss": 0.2581, "step": 18075 }, { "epoch": 0.2595651924510985, "grad_norm": 2.5742852687835693, "learning_rate": 2.595602294455067e-05, "loss": 0.2775, "step": 18100 }, { "epoch": 0.2599237079102851, "grad_norm": 3.493165969848633, "learning_rate": 2.5991873804971318e-05, "loss": 0.2004, "step": 18125 }, { "epoch": 0.2602822233694717, "grad_norm": 12.708489418029785, "learning_rate": 2.602772466539197e-05, "loss": 0.3006, "step": 18150 }, { "epoch": 0.2606407388286583, "grad_norm": 7.698634624481201, "learning_rate": 2.6063575525812623e-05, "loss": 0.2173, "step": 18175 }, { "epoch": 0.2609992542878449, "grad_norm": 0.161372110247612, "learning_rate": 2.6099426386233272e-05, "loss": 0.1375, "step": 18200 }, { "epoch": 0.2613577697470315, "grad_norm": 15.057682037353516, "learning_rate": 2.613527724665392e-05, "loss": 0.253, "step": 18225 }, { "epoch": 0.2617162852062181, "grad_norm": 10.648690223693848, "learning_rate": 2.617112810707457e-05, "loss": 0.2848, "step": 18250 }, { "epoch": 0.2620748006654047, "grad_norm": 12.983576774597168, "learning_rate": 2.620697896749522e-05, "loss": 0.2653, "step": 18275 }, { "epoch": 0.2624333161245913, "grad_norm": 7.108206272125244, "learning_rate": 2.624282982791587e-05, "loss": 0.2791, "step": 18300 }, { "epoch": 0.2627918315837779, "grad_norm": 9.843406677246094, "learning_rate": 2.6278680688336522e-05, "loss": 0.2243, "step": 18325 }, { "epoch": 0.26315034704296447, "grad_norm": 5.499070644378662, "learning_rate": 2.631453154875717e-05, "loss": 0.2654, "step": 18350 }, { "epoch": 0.2635088625021511, "grad_norm": 13.978869438171387, "learning_rate": 2.635038240917782e-05, "loss": 0.2774, "step": 18375 }, { "epoch": 0.2638673779613377, "grad_norm": 4.063225269317627, "learning_rate": 2.638623326959847e-05, "loss": 0.4072, "step": 18400 }, { "epoch": 0.26422589342052427, "grad_norm": 14.764230728149414, "learning_rate": 2.642208413001912e-05, "loss": 0.2288, "step": 18425 }, { "epoch": 0.2645844088797109, "grad_norm": 2.169006586074829, "learning_rate": 2.645793499043977e-05, "loss": 0.1988, "step": 18450 }, { "epoch": 0.2649429243388975, "grad_norm": 9.80324649810791, "learning_rate": 2.6493785850860424e-05, "loss": 0.3247, "step": 18475 }, { "epoch": 0.26530143979808407, "grad_norm": 8.086175918579102, "learning_rate": 2.6529636711281073e-05, "loss": 0.2694, "step": 18500 }, { "epoch": 0.2656599552572707, "grad_norm": 9.243480682373047, "learning_rate": 2.6565487571701723e-05, "loss": 0.2721, "step": 18525 }, { "epoch": 0.2660184707164573, "grad_norm": 14.456110954284668, "learning_rate": 2.6601338432122372e-05, "loss": 0.2572, "step": 18550 }, { "epoch": 0.26637698617564387, "grad_norm": 2.155789852142334, "learning_rate": 2.663718929254302e-05, "loss": 0.249, "step": 18575 }, { "epoch": 0.2667355016348305, "grad_norm": 0.8742579221725464, "learning_rate": 2.667304015296367e-05, "loss": 0.2773, "step": 18600 }, { "epoch": 0.2670940170940171, "grad_norm": 6.328431606292725, "learning_rate": 2.670889101338432e-05, "loss": 0.3252, "step": 18625 }, { "epoch": 0.26745253255320367, "grad_norm": 0.475935161113739, "learning_rate": 2.6744741873804972e-05, "loss": 0.2089, "step": 18650 }, { "epoch": 0.2678110480123903, "grad_norm": 12.685311317443848, "learning_rate": 2.678059273422562e-05, "loss": 0.2767, "step": 18675 }, { "epoch": 0.2681695634715769, "grad_norm": 21.617998123168945, "learning_rate": 2.6816443594646274e-05, "loss": 0.3757, "step": 18700 }, { "epoch": 0.26852807893076347, "grad_norm": 2.527991533279419, "learning_rate": 2.6852294455066924e-05, "loss": 0.3634, "step": 18725 }, { "epoch": 0.2688865943899501, "grad_norm": 4.676393508911133, "learning_rate": 2.6888145315487573e-05, "loss": 0.1608, "step": 18750 }, { "epoch": 0.2692451098491367, "grad_norm": 1.756076455116272, "learning_rate": 2.6923996175908222e-05, "loss": 0.2597, "step": 18775 }, { "epoch": 0.26960362530832327, "grad_norm": 1.7674870491027832, "learning_rate": 2.695984703632887e-05, "loss": 0.2097, "step": 18800 }, { "epoch": 0.2699621407675099, "grad_norm": 9.571147918701172, "learning_rate": 2.6995697896749524e-05, "loss": 0.291, "step": 18825 }, { "epoch": 0.2703206562266965, "grad_norm": 14.519121170043945, "learning_rate": 2.7031548757170173e-05, "loss": 0.3633, "step": 18850 }, { "epoch": 0.27067917168588307, "grad_norm": 14.027094841003418, "learning_rate": 2.7067399617590822e-05, "loss": 0.2209, "step": 18875 }, { "epoch": 0.2710376871450697, "grad_norm": 8.778681755065918, "learning_rate": 2.7103250478011472e-05, "loss": 0.2805, "step": 18900 }, { "epoch": 0.2713962026042563, "grad_norm": 0.643807590007782, "learning_rate": 2.713910133843212e-05, "loss": 0.3359, "step": 18925 }, { "epoch": 0.27175471806344287, "grad_norm": 22.90638542175293, "learning_rate": 2.717495219885277e-05, "loss": 0.1801, "step": 18950 }, { "epoch": 0.2721132335226295, "grad_norm": 5.855782985687256, "learning_rate": 2.7210803059273426e-05, "loss": 0.3271, "step": 18975 }, { "epoch": 0.2724717489818161, "grad_norm": 20.383407592773438, "learning_rate": 2.7246653919694075e-05, "loss": 0.2953, "step": 19000 }, { "epoch": 0.2728302644410027, "grad_norm": 1.3912880420684814, "learning_rate": 2.7282504780114725e-05, "loss": 0.2011, "step": 19025 }, { "epoch": 0.2731887799001893, "grad_norm": 18.00847053527832, "learning_rate": 2.7318355640535374e-05, "loss": 0.2795, "step": 19050 }, { "epoch": 0.2735472953593759, "grad_norm": 16.871410369873047, "learning_rate": 2.7354206500956023e-05, "loss": 0.1848, "step": 19075 }, { "epoch": 0.2739058108185625, "grad_norm": 2.020170211791992, "learning_rate": 2.7390057361376673e-05, "loss": 0.1997, "step": 19100 }, { "epoch": 0.2742643262777491, "grad_norm": 16.399391174316406, "learning_rate": 2.7425908221797322e-05, "loss": 0.184, "step": 19125 }, { "epoch": 0.2746228417369357, "grad_norm": 11.213601112365723, "learning_rate": 2.7461759082217974e-05, "loss": 0.2893, "step": 19150 }, { "epoch": 0.2749813571961223, "grad_norm": 13.404705047607422, "learning_rate": 2.7497609942638624e-05, "loss": 0.189, "step": 19175 }, { "epoch": 0.2753398726553089, "grad_norm": 8.009819984436035, "learning_rate": 2.7533460803059273e-05, "loss": 0.2881, "step": 19200 }, { "epoch": 0.2756983881144955, "grad_norm": 12.845431327819824, "learning_rate": 2.7569311663479926e-05, "loss": 0.3205, "step": 19225 }, { "epoch": 0.2760569035736821, "grad_norm": 3.441805601119995, "learning_rate": 2.7605162523900575e-05, "loss": 0.1505, "step": 19250 }, { "epoch": 0.2764154190328687, "grad_norm": 15.883031845092773, "learning_rate": 2.7641013384321224e-05, "loss": 0.1619, "step": 19275 }, { "epoch": 0.2767739344920553, "grad_norm": 7.494959354400635, "learning_rate": 2.7676864244741877e-05, "loss": 0.2006, "step": 19300 }, { "epoch": 0.2771324499512419, "grad_norm": 2.4284117221832275, "learning_rate": 2.7712715105162526e-05, "loss": 0.3025, "step": 19325 }, { "epoch": 0.2774909654104285, "grad_norm": 5.224027156829834, "learning_rate": 2.7748565965583175e-05, "loss": 0.2846, "step": 19350 }, { "epoch": 0.2778494808696151, "grad_norm": 5.017745018005371, "learning_rate": 2.7784416826003825e-05, "loss": 0.233, "step": 19375 }, { "epoch": 0.2782079963288017, "grad_norm": 7.7991251945495605, "learning_rate": 2.7820267686424474e-05, "loss": 0.2217, "step": 19400 }, { "epoch": 0.2785665117879883, "grad_norm": 18.017242431640625, "learning_rate": 2.7856118546845123e-05, "loss": 0.274, "step": 19425 }, { "epoch": 0.2789250272471749, "grad_norm": 16.060243606567383, "learning_rate": 2.7891969407265772e-05, "loss": 0.1889, "step": 19450 }, { "epoch": 0.2792835427063615, "grad_norm": 1.6310229301452637, "learning_rate": 2.7927820267686425e-05, "loss": 0.3097, "step": 19475 }, { "epoch": 0.2796420581655481, "grad_norm": 0.7646090388298035, "learning_rate": 2.7963671128107078e-05, "loss": 0.2929, "step": 19500 }, { "epoch": 0.2800005736247347, "grad_norm": 20.845195770263672, "learning_rate": 2.7999521988527727e-05, "loss": 0.2477, "step": 19525 }, { "epoch": 0.2803590890839213, "grad_norm": 9.782679557800293, "learning_rate": 2.8035372848948376e-05, "loss": 0.2918, "step": 19550 }, { "epoch": 0.2807176045431079, "grad_norm": 6.794526100158691, "learning_rate": 2.8071223709369025e-05, "loss": 0.2344, "step": 19575 }, { "epoch": 0.2810761200022945, "grad_norm": 1.892106294631958, "learning_rate": 2.8107074569789675e-05, "loss": 0.2188, "step": 19600 }, { "epoch": 0.2814346354614811, "grad_norm": 7.6464948654174805, "learning_rate": 2.8142925430210327e-05, "loss": 0.3667, "step": 19625 }, { "epoch": 0.2817931509206677, "grad_norm": 6.198355197906494, "learning_rate": 2.8178776290630977e-05, "loss": 0.2304, "step": 19650 }, { "epoch": 0.2821516663798543, "grad_norm": 10.79351806640625, "learning_rate": 2.8214627151051626e-05, "loss": 0.3182, "step": 19675 }, { "epoch": 0.2825101818390409, "grad_norm": 10.320473670959473, "learning_rate": 2.8250478011472275e-05, "loss": 0.1979, "step": 19700 }, { "epoch": 0.2828686972982275, "grad_norm": 1.4933017492294312, "learning_rate": 2.8286328871892924e-05, "loss": 0.3099, "step": 19725 }, { "epoch": 0.2832272127574141, "grad_norm": 4.444819927215576, "learning_rate": 2.8322179732313577e-05, "loss": 0.1474, "step": 19750 }, { "epoch": 0.2835857282166007, "grad_norm": 4.023462295532227, "learning_rate": 2.8358030592734226e-05, "loss": 0.1899, "step": 19775 }, { "epoch": 0.2839442436757873, "grad_norm": 3.9906067848205566, "learning_rate": 2.839388145315488e-05, "loss": 0.243, "step": 19800 }, { "epoch": 0.2843027591349739, "grad_norm": 15.614852905273438, "learning_rate": 2.8429732313575528e-05, "loss": 0.2512, "step": 19825 }, { "epoch": 0.2846612745941605, "grad_norm": 13.608783721923828, "learning_rate": 2.8465583173996177e-05, "loss": 0.2304, "step": 19850 }, { "epoch": 0.2850197900533471, "grad_norm": 2.8169682025909424, "learning_rate": 2.8501434034416827e-05, "loss": 0.2802, "step": 19875 }, { "epoch": 0.2853783055125337, "grad_norm": 15.196334838867188, "learning_rate": 2.8537284894837476e-05, "loss": 0.3565, "step": 19900 }, { "epoch": 0.2857368209717203, "grad_norm": 5.287376880645752, "learning_rate": 2.8573135755258125e-05, "loss": 0.2239, "step": 19925 }, { "epoch": 0.2860953364309069, "grad_norm": 14.698201179504395, "learning_rate": 2.8608986615678774e-05, "loss": 0.2071, "step": 19950 }, { "epoch": 0.2864538518900935, "grad_norm": 4.278194904327393, "learning_rate": 2.8644837476099427e-05, "loss": 0.2059, "step": 19975 }, { "epoch": 0.2868123673492801, "grad_norm": 14.188668251037598, "learning_rate": 2.8680688336520076e-05, "loss": 0.3352, "step": 20000 }, { "epoch": 0.2871708828084667, "grad_norm": 12.816661834716797, "learning_rate": 2.871653919694073e-05, "loss": 0.2666, "step": 20025 }, { "epoch": 0.2875293982676533, "grad_norm": 4.425185680389404, "learning_rate": 2.8752390057361378e-05, "loss": 0.3116, "step": 20050 }, { "epoch": 0.2878879137268399, "grad_norm": 3.7756059169769287, "learning_rate": 2.8788240917782027e-05, "loss": 0.2727, "step": 20075 }, { "epoch": 0.2882464291860265, "grad_norm": 13.720636367797852, "learning_rate": 2.8824091778202677e-05, "loss": 0.2277, "step": 20100 }, { "epoch": 0.2886049446452131, "grad_norm": 11.119729995727539, "learning_rate": 2.885994263862333e-05, "loss": 0.2251, "step": 20125 }, { "epoch": 0.2889634601043997, "grad_norm": 2.016619920730591, "learning_rate": 2.889579349904398e-05, "loss": 0.2668, "step": 20150 }, { "epoch": 0.2893219755635863, "grad_norm": 0.673554539680481, "learning_rate": 2.8931644359464628e-05, "loss": 0.2439, "step": 20175 }, { "epoch": 0.2896804910227729, "grad_norm": 2.712735652923584, "learning_rate": 2.8967495219885277e-05, "loss": 0.1946, "step": 20200 }, { "epoch": 0.2900390064819595, "grad_norm": 4.789625644683838, "learning_rate": 2.9003346080305926e-05, "loss": 0.2458, "step": 20225 }, { "epoch": 0.2903975219411461, "grad_norm": 7.319321632385254, "learning_rate": 2.9039196940726576e-05, "loss": 0.1701, "step": 20250 }, { "epoch": 0.2907560374003327, "grad_norm": 3.7356371879577637, "learning_rate": 2.9075047801147228e-05, "loss": 0.387, "step": 20275 }, { "epoch": 0.2911145528595193, "grad_norm": 3.2291035652160645, "learning_rate": 2.911089866156788e-05, "loss": 0.2331, "step": 20300 }, { "epoch": 0.2914730683187059, "grad_norm": 15.10956859588623, "learning_rate": 2.914674952198853e-05, "loss": 0.2713, "step": 20325 }, { "epoch": 0.2918315837778925, "grad_norm": 0.9835026860237122, "learning_rate": 2.918260038240918e-05, "loss": 0.2428, "step": 20350 }, { "epoch": 0.2921900992370791, "grad_norm": 16.90852928161621, "learning_rate": 2.921845124282983e-05, "loss": 0.3164, "step": 20375 }, { "epoch": 0.2925486146962657, "grad_norm": 8.946943283081055, "learning_rate": 2.9254302103250478e-05, "loss": 0.1747, "step": 20400 }, { "epoch": 0.2929071301554523, "grad_norm": 6.562338352203369, "learning_rate": 2.9290152963671127e-05, "loss": 0.3795, "step": 20425 }, { "epoch": 0.2932656456146389, "grad_norm": 7.776217460632324, "learning_rate": 2.932600382409178e-05, "loss": 0.3195, "step": 20450 }, { "epoch": 0.2936241610738255, "grad_norm": 15.733892440795898, "learning_rate": 2.936185468451243e-05, "loss": 0.2378, "step": 20475 }, { "epoch": 0.2939826765330121, "grad_norm": 2.551854372024536, "learning_rate": 2.939770554493308e-05, "loss": 0.2876, "step": 20500 }, { "epoch": 0.2943411919921987, "grad_norm": 5.355380058288574, "learning_rate": 2.9433556405353728e-05, "loss": 0.3103, "step": 20525 }, { "epoch": 0.2946997074513853, "grad_norm": 1.7332359552383423, "learning_rate": 2.946940726577438e-05, "loss": 0.2284, "step": 20550 }, { "epoch": 0.2950582229105719, "grad_norm": 19.07693099975586, "learning_rate": 2.950525812619503e-05, "loss": 0.2723, "step": 20575 }, { "epoch": 0.2954167383697585, "grad_norm": 8.401837348937988, "learning_rate": 2.954110898661568e-05, "loss": 0.2245, "step": 20600 }, { "epoch": 0.2957752538289451, "grad_norm": 8.8356294631958, "learning_rate": 2.957695984703633e-05, "loss": 0.2337, "step": 20625 }, { "epoch": 0.29613376928813173, "grad_norm": 1.9513119459152222, "learning_rate": 2.961281070745698e-05, "loss": 0.2163, "step": 20650 }, { "epoch": 0.2964922847473183, "grad_norm": 0.3883037567138672, "learning_rate": 2.964866156787763e-05, "loss": 0.2786, "step": 20675 }, { "epoch": 0.2968508002065049, "grad_norm": 3.465054512023926, "learning_rate": 2.968451242829828e-05, "loss": 0.2206, "step": 20700 }, { "epoch": 0.29720931566569153, "grad_norm": 7.261002540588379, "learning_rate": 2.972036328871893e-05, "loss": 0.3165, "step": 20725 }, { "epoch": 0.2975678311248781, "grad_norm": 4.08643102645874, "learning_rate": 2.9756214149139578e-05, "loss": 0.3218, "step": 20750 }, { "epoch": 0.2979263465840647, "grad_norm": 14.658404350280762, "learning_rate": 2.979206500956023e-05, "loss": 0.2487, "step": 20775 }, { "epoch": 0.29828486204325133, "grad_norm": 24.711029052734375, "learning_rate": 2.9827915869980883e-05, "loss": 0.2195, "step": 20800 }, { "epoch": 0.2986433775024379, "grad_norm": 1.2454246282577515, "learning_rate": 2.9863766730401532e-05, "loss": 0.1814, "step": 20825 }, { "epoch": 0.2990018929616245, "grad_norm": 0.683849573135376, "learning_rate": 2.989961759082218e-05, "loss": 0.23, "step": 20850 }, { "epoch": 0.29936040842081113, "grad_norm": 0.3523104786872864, "learning_rate": 2.993546845124283e-05, "loss": 0.2571, "step": 20875 }, { "epoch": 0.2997189238799977, "grad_norm": 1.7496182918548584, "learning_rate": 2.997131931166348e-05, "loss": 0.2167, "step": 20900 }, { "epoch": 0.3000774393391843, "grad_norm": 5.460028648376465, "learning_rate": 2.9999203297286962e-05, "loss": 0.1738, "step": 20925 }, { "epoch": 0.30043595479837093, "grad_norm": 17.124204635620117, "learning_rate": 2.999521978372177e-05, "loss": 0.1225, "step": 20950 }, { "epoch": 0.3007944702575575, "grad_norm": 1.7300434112548828, "learning_rate": 2.9991236270156578e-05, "loss": 0.183, "step": 20975 }, { "epoch": 0.3011529857167441, "grad_norm": 6.557162284851074, "learning_rate": 2.998725275659139e-05, "loss": 0.1801, "step": 21000 }, { "epoch": 0.30151150117593073, "grad_norm": 16.05350685119629, "learning_rate": 2.9983269243026197e-05, "loss": 0.2734, "step": 21025 }, { "epoch": 0.3018700166351173, "grad_norm": 11.568105697631836, "learning_rate": 2.9979285729461005e-05, "loss": 0.2306, "step": 21050 }, { "epoch": 0.3022285320943039, "grad_norm": 16.08601951599121, "learning_rate": 2.9975302215895813e-05, "loss": 0.1919, "step": 21075 }, { "epoch": 0.30258704755349053, "grad_norm": 0.44933411478996277, "learning_rate": 2.997131870233062e-05, "loss": 0.1882, "step": 21100 }, { "epoch": 0.3029455630126771, "grad_norm": 0.7617999315261841, "learning_rate": 2.9967335188765432e-05, "loss": 0.2511, "step": 21125 }, { "epoch": 0.3033040784718637, "grad_norm": 11.954914093017578, "learning_rate": 2.996335167520024e-05, "loss": 0.2971, "step": 21150 }, { "epoch": 0.30366259393105033, "grad_norm": 4.230805397033691, "learning_rate": 2.9959368161635048e-05, "loss": 0.2033, "step": 21175 }, { "epoch": 0.3040211093902369, "grad_norm": 4.654829978942871, "learning_rate": 2.9955384648069855e-05, "loss": 0.2574, "step": 21200 }, { "epoch": 0.3043796248494235, "grad_norm": 5.43945837020874, "learning_rate": 2.9951401134504667e-05, "loss": 0.2105, "step": 21225 }, { "epoch": 0.30473814030861013, "grad_norm": 16.07280158996582, "learning_rate": 2.9947417620939474e-05, "loss": 0.2486, "step": 21250 }, { "epoch": 0.3050966557677967, "grad_norm": 17.358999252319336, "learning_rate": 2.9943434107374282e-05, "loss": 0.3726, "step": 21275 }, { "epoch": 0.3054551712269833, "grad_norm": 9.108317375183105, "learning_rate": 2.993945059380909e-05, "loss": 0.2555, "step": 21300 }, { "epoch": 0.3058136866861699, "grad_norm": 6.765558242797852, "learning_rate": 2.9935467080243898e-05, "loss": 0.2747, "step": 21325 }, { "epoch": 0.3061722021453565, "grad_norm": 15.295775413513184, "learning_rate": 2.993148356667871e-05, "loss": 0.3068, "step": 21350 }, { "epoch": 0.3065307176045431, "grad_norm": 3.8406872749328613, "learning_rate": 2.9927500053113517e-05, "loss": 0.2129, "step": 21375 }, { "epoch": 0.3068892330637297, "grad_norm": 5.712185382843018, "learning_rate": 2.9923516539548325e-05, "loss": 0.3077, "step": 21400 }, { "epoch": 0.3072477485229163, "grad_norm": 16.23902702331543, "learning_rate": 2.9919533025983133e-05, "loss": 0.2669, "step": 21425 }, { "epoch": 0.3076062639821029, "grad_norm": 8.450338363647461, "learning_rate": 2.991554951241794e-05, "loss": 0.3334, "step": 21450 }, { "epoch": 0.3079647794412895, "grad_norm": 7.182157039642334, "learning_rate": 2.991156599885275e-05, "loss": 0.2649, "step": 21475 }, { "epoch": 0.3083232949004761, "grad_norm": 19.773523330688477, "learning_rate": 2.9907582485287556e-05, "loss": 0.3026, "step": 21500 }, { "epoch": 0.3086818103596627, "grad_norm": 7.724667549133301, "learning_rate": 2.9903598971722364e-05, "loss": 0.1932, "step": 21525 }, { "epoch": 0.3090403258188493, "grad_norm": 18.090898513793945, "learning_rate": 2.9899615458157172e-05, "loss": 0.4045, "step": 21550 }, { "epoch": 0.3093988412780359, "grad_norm": 12.394676208496094, "learning_rate": 2.989563194459198e-05, "loss": 0.3079, "step": 21575 }, { "epoch": 0.3097573567372225, "grad_norm": 2.157986879348755, "learning_rate": 2.989164843102679e-05, "loss": 0.1849, "step": 21600 }, { "epoch": 0.3101158721964091, "grad_norm": 17.672285079956055, "learning_rate": 2.98876649174616e-05, "loss": 0.3043, "step": 21625 }, { "epoch": 0.3104743876555957, "grad_norm": 4.485792636871338, "learning_rate": 2.9883681403896407e-05, "loss": 0.1846, "step": 21650 }, { "epoch": 0.3108329031147823, "grad_norm": 22.912601470947266, "learning_rate": 2.9879697890331215e-05, "loss": 0.3556, "step": 21675 }, { "epoch": 0.3111914185739689, "grad_norm": 13.455900192260742, "learning_rate": 2.9875714376766022e-05, "loss": 0.3088, "step": 21700 }, { "epoch": 0.3115499340331555, "grad_norm": 0.9097751379013062, "learning_rate": 2.9871730863200834e-05, "loss": 0.282, "step": 21725 }, { "epoch": 0.3119084494923421, "grad_norm": 21.211320877075195, "learning_rate": 2.986774734963564e-05, "loss": 0.2547, "step": 21750 }, { "epoch": 0.3122669649515287, "grad_norm": 18.547794342041016, "learning_rate": 2.986376383607045e-05, "loss": 0.3529, "step": 21775 }, { "epoch": 0.3126254804107153, "grad_norm": 19.381603240966797, "learning_rate": 2.9859780322505257e-05, "loss": 0.1531, "step": 21800 }, { "epoch": 0.3129839958699019, "grad_norm": 5.983455181121826, "learning_rate": 2.985579680894007e-05, "loss": 0.2174, "step": 21825 }, { "epoch": 0.3133425113290885, "grad_norm": 11.451029777526855, "learning_rate": 2.9851813295374876e-05, "loss": 0.181, "step": 21850 }, { "epoch": 0.3137010267882751, "grad_norm": 4.683139324188232, "learning_rate": 2.9847829781809684e-05, "loss": 0.2927, "step": 21875 }, { "epoch": 0.3140595422474617, "grad_norm": 16.32216453552246, "learning_rate": 2.9843846268244492e-05, "loss": 0.1159, "step": 21900 }, { "epoch": 0.3144180577066483, "grad_norm": 1.3987348079681396, "learning_rate": 2.98398627546793e-05, "loss": 0.2526, "step": 21925 }, { "epoch": 0.3147765731658349, "grad_norm": 15.44442081451416, "learning_rate": 2.983587924111411e-05, "loss": 0.2205, "step": 21950 }, { "epoch": 0.3151350886250215, "grad_norm": 17.571826934814453, "learning_rate": 2.983189572754892e-05, "loss": 0.2439, "step": 21975 }, { "epoch": 0.3154936040842081, "grad_norm": 16.18746566772461, "learning_rate": 2.9827912213983727e-05, "loss": 0.1636, "step": 22000 }, { "epoch": 0.3158521195433947, "grad_norm": 18.800987243652344, "learning_rate": 2.9823928700418535e-05, "loss": 0.3264, "step": 22025 }, { "epoch": 0.3162106350025813, "grad_norm": 11.791354179382324, "learning_rate": 2.9819945186853342e-05, "loss": 0.2622, "step": 22050 }, { "epoch": 0.3165691504617679, "grad_norm": 6.063324928283691, "learning_rate": 2.9815961673288154e-05, "loss": 0.3179, "step": 22075 }, { "epoch": 0.3169276659209545, "grad_norm": 2.7555506229400635, "learning_rate": 2.981197815972296e-05, "loss": 0.1842, "step": 22100 }, { "epoch": 0.3172861813801411, "grad_norm": 1.266060471534729, "learning_rate": 2.980799464615777e-05, "loss": 0.3729, "step": 22125 }, { "epoch": 0.3176446968393277, "grad_norm": 21.918733596801758, "learning_rate": 2.9804011132592577e-05, "loss": 0.1424, "step": 22150 }, { "epoch": 0.3180032122985143, "grad_norm": 19.63381576538086, "learning_rate": 2.9800027619027385e-05, "loss": 0.1976, "step": 22175 }, { "epoch": 0.3183617277577009, "grad_norm": 6.9891839027404785, "learning_rate": 2.9796044105462196e-05, "loss": 0.2625, "step": 22200 }, { "epoch": 0.3187202432168875, "grad_norm": 2.674755096435547, "learning_rate": 2.9792060591897004e-05, "loss": 0.1959, "step": 22225 }, { "epoch": 0.3190787586760741, "grad_norm": 2.6462981700897217, "learning_rate": 2.9788077078331812e-05, "loss": 0.3099, "step": 22250 }, { "epoch": 0.3194372741352607, "grad_norm": 2.3995537757873535, "learning_rate": 2.978409356476662e-05, "loss": 0.1911, "step": 22275 }, { "epoch": 0.3197957895944473, "grad_norm": 0.7237873077392578, "learning_rate": 2.9780110051201428e-05, "loss": 0.2738, "step": 22300 }, { "epoch": 0.3201543050536339, "grad_norm": 9.139585494995117, "learning_rate": 2.977612653763624e-05, "loss": 0.3306, "step": 22325 }, { "epoch": 0.32051282051282054, "grad_norm": 15.391871452331543, "learning_rate": 2.9772143024071047e-05, "loss": 0.2663, "step": 22350 }, { "epoch": 0.3208713359720071, "grad_norm": 10.907376289367676, "learning_rate": 2.9768159510505855e-05, "loss": 0.2603, "step": 22375 }, { "epoch": 0.3212298514311937, "grad_norm": 10.746197700500488, "learning_rate": 2.9764175996940662e-05, "loss": 0.2248, "step": 22400 }, { "epoch": 0.32158836689038034, "grad_norm": 7.457546234130859, "learning_rate": 2.9760192483375474e-05, "loss": 0.1879, "step": 22425 }, { "epoch": 0.3219468823495669, "grad_norm": 2.4504826068878174, "learning_rate": 2.975620896981028e-05, "loss": 0.2845, "step": 22450 }, { "epoch": 0.3223053978087535, "grad_norm": 22.74581527709961, "learning_rate": 2.975222545624509e-05, "loss": 0.3395, "step": 22475 }, { "epoch": 0.32266391326794014, "grad_norm": 13.047669410705566, "learning_rate": 2.9748241942679897e-05, "loss": 0.2041, "step": 22500 }, { "epoch": 0.3230224287271267, "grad_norm": 11.981760025024414, "learning_rate": 2.9744258429114705e-05, "loss": 0.2092, "step": 22525 }, { "epoch": 0.3233809441863133, "grad_norm": 10.428959846496582, "learning_rate": 2.9740274915549516e-05, "loss": 0.2462, "step": 22550 }, { "epoch": 0.32373945964549994, "grad_norm": 15.568696022033691, "learning_rate": 2.973629140198432e-05, "loss": 0.1949, "step": 22575 }, { "epoch": 0.3240979751046865, "grad_norm": 2.385331630706787, "learning_rate": 2.973230788841913e-05, "loss": 0.2982, "step": 22600 }, { "epoch": 0.3244564905638731, "grad_norm": 2.661036729812622, "learning_rate": 2.9728324374853936e-05, "loss": 0.1788, "step": 22625 }, { "epoch": 0.32481500602305974, "grad_norm": 0.2529655396938324, "learning_rate": 2.9724340861288744e-05, "loss": 0.4055, "step": 22650 }, { "epoch": 0.3251735214822463, "grad_norm": 7.348079681396484, "learning_rate": 2.9720357347723555e-05, "loss": 0.1143, "step": 22675 }, { "epoch": 0.3255320369414329, "grad_norm": 18.337507247924805, "learning_rate": 2.9716373834158363e-05, "loss": 0.2738, "step": 22700 }, { "epoch": 0.32589055240061954, "grad_norm": 12.693746566772461, "learning_rate": 2.971239032059317e-05, "loss": 0.214, "step": 22725 }, { "epoch": 0.3262490678598061, "grad_norm": 4.435439109802246, "learning_rate": 2.970840680702798e-05, "loss": 0.3421, "step": 22750 }, { "epoch": 0.3266075833189927, "grad_norm": 5.04595422744751, "learning_rate": 2.9704423293462787e-05, "loss": 0.3857, "step": 22775 }, { "epoch": 0.32696609877817934, "grad_norm": 5.977475643157959, "learning_rate": 2.9700439779897598e-05, "loss": 0.2871, "step": 22800 }, { "epoch": 0.3273246142373659, "grad_norm": 15.651930809020996, "learning_rate": 2.9696456266332406e-05, "loss": 0.2574, "step": 22825 }, { "epoch": 0.3276831296965525, "grad_norm": 22.9890193939209, "learning_rate": 2.9692472752767214e-05, "loss": 0.2348, "step": 22850 }, { "epoch": 0.32804164515573914, "grad_norm": 0.6702564358711243, "learning_rate": 2.968848923920202e-05, "loss": 0.2651, "step": 22875 }, { "epoch": 0.3284001606149257, "grad_norm": 4.324332237243652, "learning_rate": 2.968450572563683e-05, "loss": 0.2348, "step": 22900 }, { "epoch": 0.3287586760741123, "grad_norm": 13.84181022644043, "learning_rate": 2.968052221207164e-05, "loss": 0.2374, "step": 22925 }, { "epoch": 0.32911719153329894, "grad_norm": 16.29159164428711, "learning_rate": 2.967653869850645e-05, "loss": 0.3084, "step": 22950 }, { "epoch": 0.3294757069924855, "grad_norm": 5.795655250549316, "learning_rate": 2.9672555184941256e-05, "loss": 0.2003, "step": 22975 }, { "epoch": 0.3298342224516721, "grad_norm": 7.609113693237305, "learning_rate": 2.9668571671376064e-05, "loss": 0.1673, "step": 23000 }, { "epoch": 0.33019273791085874, "grad_norm": 0.4638534486293793, "learning_rate": 2.9664588157810875e-05, "loss": 0.2873, "step": 23025 }, { "epoch": 0.3305512533700453, "grad_norm": 3.0715019702911377, "learning_rate": 2.9660604644245683e-05, "loss": 0.1674, "step": 23050 }, { "epoch": 0.3309097688292319, "grad_norm": 1.632935643196106, "learning_rate": 2.965662113068049e-05, "loss": 0.3094, "step": 23075 }, { "epoch": 0.33126828428841854, "grad_norm": 17.838176727294922, "learning_rate": 2.96526376171153e-05, "loss": 0.3885, "step": 23100 }, { "epoch": 0.3316267997476051, "grad_norm": 12.30765438079834, "learning_rate": 2.9648654103550107e-05, "loss": 0.164, "step": 23125 }, { "epoch": 0.3319853152067917, "grad_norm": 19.633586883544922, "learning_rate": 2.9644670589984918e-05, "loss": 0.2327, "step": 23150 }, { "epoch": 0.33234383066597833, "grad_norm": 17.15802574157715, "learning_rate": 2.9640687076419726e-05, "loss": 0.2715, "step": 23175 }, { "epoch": 0.3327023461251649, "grad_norm": 21.008607864379883, "learning_rate": 2.9636703562854534e-05, "loss": 0.2796, "step": 23200 }, { "epoch": 0.3330608615843515, "grad_norm": 21.285991668701172, "learning_rate": 2.963272004928934e-05, "loss": 0.3203, "step": 23225 }, { "epoch": 0.33341937704353813, "grad_norm": 3.5171446800231934, "learning_rate": 2.962873653572415e-05, "loss": 0.2951, "step": 23250 }, { "epoch": 0.3337778925027247, "grad_norm": 2.312645673751831, "learning_rate": 2.962475302215896e-05, "loss": 0.3371, "step": 23275 }, { "epoch": 0.3341364079619113, "grad_norm": 8.313732147216797, "learning_rate": 2.962076950859377e-05, "loss": 0.2558, "step": 23300 }, { "epoch": 0.33449492342109793, "grad_norm": 1.8229466676712036, "learning_rate": 2.9616785995028576e-05, "loss": 0.2443, "step": 23325 }, { "epoch": 0.3348534388802845, "grad_norm": 14.77869987487793, "learning_rate": 2.9612802481463384e-05, "loss": 0.2115, "step": 23350 }, { "epoch": 0.3352119543394711, "grad_norm": 8.053282737731934, "learning_rate": 2.9608818967898192e-05, "loss": 0.1755, "step": 23375 }, { "epoch": 0.33557046979865773, "grad_norm": 3.295053243637085, "learning_rate": 2.9604835454333003e-05, "loss": 0.1515, "step": 23400 }, { "epoch": 0.3359289852578443, "grad_norm": 4.045801162719727, "learning_rate": 2.960085194076781e-05, "loss": 0.2871, "step": 23425 }, { "epoch": 0.3362875007170309, "grad_norm": 15.098994255065918, "learning_rate": 2.959686842720262e-05, "loss": 0.3052, "step": 23450 }, { "epoch": 0.33664601617621753, "grad_norm": 2.8335981369018555, "learning_rate": 2.9592884913637427e-05, "loss": 0.1402, "step": 23475 }, { "epoch": 0.3370045316354041, "grad_norm": 17.974220275878906, "learning_rate": 2.9588901400072235e-05, "loss": 0.3331, "step": 23500 }, { "epoch": 0.3373630470945907, "grad_norm": 16.3046932220459, "learning_rate": 2.9584917886507046e-05, "loss": 0.3404, "step": 23525 }, { "epoch": 0.33772156255377733, "grad_norm": 0.4000454843044281, "learning_rate": 2.9580934372941854e-05, "loss": 0.2352, "step": 23550 }, { "epoch": 0.3380800780129639, "grad_norm": 15.960461616516113, "learning_rate": 2.957695085937666e-05, "loss": 0.2743, "step": 23575 }, { "epoch": 0.3384385934721505, "grad_norm": 18.86566925048828, "learning_rate": 2.957296734581147e-05, "loss": 0.2861, "step": 23600 }, { "epoch": 0.33879710893133713, "grad_norm": 3.5086886882781982, "learning_rate": 2.956898383224628e-05, "loss": 0.2344, "step": 23625 }, { "epoch": 0.3391556243905237, "grad_norm": 9.73814868927002, "learning_rate": 2.956500031868109e-05, "loss": 0.2285, "step": 23650 }, { "epoch": 0.3395141398497103, "grad_norm": 1.6655077934265137, "learning_rate": 2.9561016805115893e-05, "loss": 0.1757, "step": 23675 }, { "epoch": 0.33987265530889693, "grad_norm": 9.170037269592285, "learning_rate": 2.95570332915507e-05, "loss": 0.2568, "step": 23700 }, { "epoch": 0.3402311707680835, "grad_norm": 12.812776565551758, "learning_rate": 2.955304977798551e-05, "loss": 0.198, "step": 23725 }, { "epoch": 0.3405896862272701, "grad_norm": 12.708268165588379, "learning_rate": 2.954906626442032e-05, "loss": 0.2954, "step": 23750 }, { "epoch": 0.34094820168645673, "grad_norm": 14.962780952453613, "learning_rate": 2.9545082750855128e-05, "loss": 0.298, "step": 23775 }, { "epoch": 0.3413067171456433, "grad_norm": 0.35515111684799194, "learning_rate": 2.9541099237289936e-05, "loss": 0.2615, "step": 23800 }, { "epoch": 0.3416652326048299, "grad_norm": 11.468474388122559, "learning_rate": 2.9537115723724743e-05, "loss": 0.2379, "step": 23825 }, { "epoch": 0.34202374806401653, "grad_norm": 10.22083854675293, "learning_rate": 2.953313221015955e-05, "loss": 0.2371, "step": 23850 }, { "epoch": 0.3423822635232031, "grad_norm": 15.816717147827148, "learning_rate": 2.9529148696594363e-05, "loss": 0.2438, "step": 23875 }, { "epoch": 0.3427407789823897, "grad_norm": 1.2691898345947266, "learning_rate": 2.952516518302917e-05, "loss": 0.1635, "step": 23900 }, { "epoch": 0.34309929444157633, "grad_norm": 6.15402889251709, "learning_rate": 2.9521181669463978e-05, "loss": 0.2602, "step": 23925 }, { "epoch": 0.3434578099007629, "grad_norm": 19.7387752532959, "learning_rate": 2.9517198155898786e-05, "loss": 0.3317, "step": 23950 }, { "epoch": 0.34381632535994955, "grad_norm": 5.406175136566162, "learning_rate": 2.9513214642333594e-05, "loss": 0.2682, "step": 23975 }, { "epoch": 0.34417484081913613, "grad_norm": 0.5352908968925476, "learning_rate": 2.9509231128768405e-05, "loss": 0.2214, "step": 24000 }, { "epoch": 0.3445333562783227, "grad_norm": 17.25306510925293, "learning_rate": 2.9505247615203213e-05, "loss": 0.2172, "step": 24025 }, { "epoch": 0.34489187173750935, "grad_norm": 4.349745273590088, "learning_rate": 2.950126410163802e-05, "loss": 0.2255, "step": 24050 }, { "epoch": 0.34525038719669593, "grad_norm": 9.73361587524414, "learning_rate": 2.949728058807283e-05, "loss": 0.291, "step": 24075 }, { "epoch": 0.3456089026558825, "grad_norm": 14.897175788879395, "learning_rate": 2.9493297074507637e-05, "loss": 0.2082, "step": 24100 }, { "epoch": 0.34596741811506915, "grad_norm": 1.3057208061218262, "learning_rate": 2.9489313560942448e-05, "loss": 0.2559, "step": 24125 }, { "epoch": 0.34632593357425573, "grad_norm": 14.56025218963623, "learning_rate": 2.9485330047377256e-05, "loss": 0.2699, "step": 24150 }, { "epoch": 0.3466844490334423, "grad_norm": 15.96402359008789, "learning_rate": 2.9481346533812063e-05, "loss": 0.3436, "step": 24175 }, { "epoch": 0.34704296449262895, "grad_norm": 4.329315185546875, "learning_rate": 2.947736302024687e-05, "loss": 0.1547, "step": 24200 }, { "epoch": 0.34740147995181553, "grad_norm": 12.512358665466309, "learning_rate": 2.9473379506681683e-05, "loss": 0.1385, "step": 24225 }, { "epoch": 0.3477599954110021, "grad_norm": 16.150035858154297, "learning_rate": 2.946939599311649e-05, "loss": 0.2727, "step": 24250 }, { "epoch": 0.34811851087018875, "grad_norm": 20.095308303833008, "learning_rate": 2.9465412479551298e-05, "loss": 0.285, "step": 24275 }, { "epoch": 0.34847702632937533, "grad_norm": 15.899024963378906, "learning_rate": 2.9461428965986106e-05, "loss": 0.1922, "step": 24300 }, { "epoch": 0.3488355417885619, "grad_norm": 9.4336519241333, "learning_rate": 2.9457445452420914e-05, "loss": 0.2524, "step": 24325 }, { "epoch": 0.34919405724774855, "grad_norm": 15.081279754638672, "learning_rate": 2.9453461938855725e-05, "loss": 0.2177, "step": 24350 }, { "epoch": 0.3495525727069351, "grad_norm": 1.1824359893798828, "learning_rate": 2.9449478425290533e-05, "loss": 0.2232, "step": 24375 }, { "epoch": 0.3499110881661217, "grad_norm": 6.974002361297607, "learning_rate": 2.944549491172534e-05, "loss": 0.3173, "step": 24400 }, { "epoch": 0.35026960362530835, "grad_norm": 4.926356792449951, "learning_rate": 2.944151139816015e-05, "loss": 0.2763, "step": 24425 }, { "epoch": 0.3506281190844949, "grad_norm": 11.26715087890625, "learning_rate": 2.9437527884594957e-05, "loss": 0.3672, "step": 24450 }, { "epoch": 0.3509866345436815, "grad_norm": 8.434706687927246, "learning_rate": 2.9433544371029768e-05, "loss": 0.3052, "step": 24475 }, { "epoch": 0.35134515000286815, "grad_norm": 13.529003143310547, "learning_rate": 2.9429560857464576e-05, "loss": 0.2488, "step": 24500 }, { "epoch": 0.3517036654620547, "grad_norm": 14.676383018493652, "learning_rate": 2.9425577343899383e-05, "loss": 0.4079, "step": 24525 }, { "epoch": 0.3520621809212413, "grad_norm": 19.213090896606445, "learning_rate": 2.942159383033419e-05, "loss": 0.2206, "step": 24550 }, { "epoch": 0.35242069638042794, "grad_norm": 20.267494201660156, "learning_rate": 2.9417610316769e-05, "loss": 0.2756, "step": 24575 }, { "epoch": 0.3527792118396145, "grad_norm": 1.4770338535308838, "learning_rate": 2.941362680320381e-05, "loss": 0.2352, "step": 24600 }, { "epoch": 0.3531377272988011, "grad_norm": 12.496036529541016, "learning_rate": 2.9409643289638618e-05, "loss": 0.2141, "step": 24625 }, { "epoch": 0.35349624275798774, "grad_norm": 8.336191177368164, "learning_rate": 2.9405659776073426e-05, "loss": 0.2963, "step": 24650 }, { "epoch": 0.3538547582171743, "grad_norm": 4.525081634521484, "learning_rate": 2.9401676262508234e-05, "loss": 0.3807, "step": 24675 }, { "epoch": 0.3542132736763609, "grad_norm": 8.926490783691406, "learning_rate": 2.9397692748943042e-05, "loss": 0.2044, "step": 24700 }, { "epoch": 0.35457178913554754, "grad_norm": 3.6876420974731445, "learning_rate": 2.9393709235377853e-05, "loss": 0.3007, "step": 24725 }, { "epoch": 0.3549303045947341, "grad_norm": 0.823472261428833, "learning_rate": 2.938972572181266e-05, "loss": 0.1947, "step": 24750 }, { "epoch": 0.3552888200539207, "grad_norm": 26.125568389892578, "learning_rate": 2.9385742208247465e-05, "loss": 0.3006, "step": 24775 }, { "epoch": 0.35564733551310734, "grad_norm": 9.722007751464844, "learning_rate": 2.9381758694682273e-05, "loss": 0.3499, "step": 24800 }, { "epoch": 0.3560058509722939, "grad_norm": 14.56052017211914, "learning_rate": 2.9377775181117084e-05, "loss": 0.3327, "step": 24825 }, { "epoch": 0.3563643664314805, "grad_norm": 1.2003718614578247, "learning_rate": 2.9373791667551892e-05, "loss": 0.1926, "step": 24850 }, { "epoch": 0.35672288189066714, "grad_norm": 1.9628406763076782, "learning_rate": 2.93698081539867e-05, "loss": 0.2378, "step": 24875 }, { "epoch": 0.3570813973498537, "grad_norm": 20.815113067626953, "learning_rate": 2.9365824640421508e-05, "loss": 0.2618, "step": 24900 }, { "epoch": 0.3574399128090403, "grad_norm": 3.099806547164917, "learning_rate": 2.9361841126856316e-05, "loss": 0.2902, "step": 24925 }, { "epoch": 0.35779842826822694, "grad_norm": 0.704993724822998, "learning_rate": 2.9357857613291127e-05, "loss": 0.2205, "step": 24950 }, { "epoch": 0.3581569437274135, "grad_norm": 13.0752534866333, "learning_rate": 2.9353874099725935e-05, "loss": 0.3611, "step": 24975 }, { "epoch": 0.3585154591866001, "grad_norm": 1.360100269317627, "learning_rate": 2.9349890586160743e-05, "loss": 0.2944, "step": 25000 }, { "epoch": 0.35887397464578674, "grad_norm": 20.393651962280273, "learning_rate": 2.934590707259555e-05, "loss": 0.1723, "step": 25025 }, { "epoch": 0.3592324901049733, "grad_norm": 2.4447968006134033, "learning_rate": 2.934192355903036e-05, "loss": 0.3565, "step": 25050 }, { "epoch": 0.3595910055641599, "grad_norm": 4.078526496887207, "learning_rate": 2.933794004546517e-05, "loss": 0.1343, "step": 25075 }, { "epoch": 0.35994952102334654, "grad_norm": 14.657326698303223, "learning_rate": 2.9333956531899977e-05, "loss": 0.2008, "step": 25100 }, { "epoch": 0.3603080364825331, "grad_norm": 13.81492805480957, "learning_rate": 2.9329973018334785e-05, "loss": 0.1341, "step": 25125 }, { "epoch": 0.3606665519417197, "grad_norm": 5.382019996643066, "learning_rate": 2.9325989504769593e-05, "loss": 0.1789, "step": 25150 }, { "epoch": 0.36102506740090634, "grad_norm": 4.824952125549316, "learning_rate": 2.93220059912044e-05, "loss": 0.188, "step": 25175 }, { "epoch": 0.3613835828600929, "grad_norm": 6.997865200042725, "learning_rate": 2.9318022477639212e-05, "loss": 0.2962, "step": 25200 }, { "epoch": 0.3617420983192795, "grad_norm": 5.0316057205200195, "learning_rate": 2.931403896407402e-05, "loss": 0.2144, "step": 25225 }, { "epoch": 0.36210061377846614, "grad_norm": 13.36693000793457, "learning_rate": 2.9310055450508828e-05, "loss": 0.3284, "step": 25250 }, { "epoch": 0.3624591292376527, "grad_norm": 2.2875287532806396, "learning_rate": 2.9306071936943636e-05, "loss": 0.172, "step": 25275 }, { "epoch": 0.3628176446968393, "grad_norm": 3.494028329849243, "learning_rate": 2.9302088423378444e-05, "loss": 0.3009, "step": 25300 }, { "epoch": 0.36317616015602594, "grad_norm": 4.616530418395996, "learning_rate": 2.9298104909813255e-05, "loss": 0.2603, "step": 25325 }, { "epoch": 0.3635346756152125, "grad_norm": 22.222633361816406, "learning_rate": 2.9294121396248063e-05, "loss": 0.1862, "step": 25350 }, { "epoch": 0.3638931910743991, "grad_norm": 1.3013118505477905, "learning_rate": 2.929013788268287e-05, "loss": 0.26, "step": 25375 }, { "epoch": 0.36425170653358574, "grad_norm": 10.01912784576416, "learning_rate": 2.928615436911768e-05, "loss": 0.1741, "step": 25400 }, { "epoch": 0.3646102219927723, "grad_norm": 28.46045684814453, "learning_rate": 2.9282170855552486e-05, "loss": 0.2661, "step": 25425 }, { "epoch": 0.3649687374519589, "grad_norm": 13.751355171203613, "learning_rate": 2.9278187341987297e-05, "loss": 0.3587, "step": 25450 }, { "epoch": 0.36532725291114554, "grad_norm": 4.509911060333252, "learning_rate": 2.9274203828422105e-05, "loss": 0.2482, "step": 25475 }, { "epoch": 0.3656857683703321, "grad_norm": 5.24999475479126, "learning_rate": 2.9270220314856913e-05, "loss": 0.2828, "step": 25500 }, { "epoch": 0.3660442838295187, "grad_norm": 17.716705322265625, "learning_rate": 2.926623680129172e-05, "loss": 0.2675, "step": 25525 }, { "epoch": 0.36640279928870534, "grad_norm": 5.276062965393066, "learning_rate": 2.9262253287726532e-05, "loss": 0.2036, "step": 25550 }, { "epoch": 0.3667613147478919, "grad_norm": 15.45035171508789, "learning_rate": 2.925826977416134e-05, "loss": 0.2403, "step": 25575 }, { "epoch": 0.36711983020707856, "grad_norm": 6.196724891662598, "learning_rate": 2.9254286260596148e-05, "loss": 0.2169, "step": 25600 }, { "epoch": 0.36747834566626514, "grad_norm": 9.853055000305176, "learning_rate": 2.9250302747030956e-05, "loss": 0.3112, "step": 25625 }, { "epoch": 0.3678368611254517, "grad_norm": 10.628134727478027, "learning_rate": 2.9246319233465764e-05, "loss": 0.3524, "step": 25650 }, { "epoch": 0.36819537658463836, "grad_norm": 18.789594650268555, "learning_rate": 2.9242335719900575e-05, "loss": 0.327, "step": 25675 }, { "epoch": 0.36855389204382494, "grad_norm": 6.4429931640625, "learning_rate": 2.9238352206335383e-05, "loss": 0.1852, "step": 25700 }, { "epoch": 0.3689124075030115, "grad_norm": 9.476052284240723, "learning_rate": 2.923436869277019e-05, "loss": 0.2564, "step": 25725 }, { "epoch": 0.36927092296219816, "grad_norm": 6.305465221405029, "learning_rate": 2.9230385179205e-05, "loss": 0.2515, "step": 25750 }, { "epoch": 0.36962943842138474, "grad_norm": 13.540791511535645, "learning_rate": 2.9226401665639806e-05, "loss": 0.3022, "step": 25775 }, { "epoch": 0.3699879538805713, "grad_norm": 3.761113405227661, "learning_rate": 2.9222418152074617e-05, "loss": 0.1598, "step": 25800 }, { "epoch": 0.37034646933975796, "grad_norm": 0.034134093672037125, "learning_rate": 2.9218434638509425e-05, "loss": 0.2425, "step": 25825 }, { "epoch": 0.37070498479894454, "grad_norm": 17.13890266418457, "learning_rate": 2.9214451124944233e-05, "loss": 0.2714, "step": 25850 }, { "epoch": 0.3710635002581311, "grad_norm": 1.4526835680007935, "learning_rate": 2.9210467611379038e-05, "loss": 0.2806, "step": 25875 }, { "epoch": 0.37142201571731776, "grad_norm": 16.937274932861328, "learning_rate": 2.9206484097813845e-05, "loss": 0.2028, "step": 25900 }, { "epoch": 0.37178053117650434, "grad_norm": 5.533425331115723, "learning_rate": 2.9202500584248657e-05, "loss": 0.1584, "step": 25925 }, { "epoch": 0.3721390466356909, "grad_norm": 0.8485677242279053, "learning_rate": 2.9198517070683464e-05, "loss": 0.166, "step": 25950 }, { "epoch": 0.37249756209487755, "grad_norm": 13.621667861938477, "learning_rate": 2.9194533557118272e-05, "loss": 0.1872, "step": 25975 }, { "epoch": 0.37285607755406414, "grad_norm": 4.255244731903076, "learning_rate": 2.919055004355308e-05, "loss": 0.3576, "step": 26000 }, { "epoch": 0.3732145930132507, "grad_norm": 0.47748705744743347, "learning_rate": 2.9186566529987888e-05, "loss": 0.2466, "step": 26025 }, { "epoch": 0.37357310847243735, "grad_norm": 9.303563117980957, "learning_rate": 2.91825830164227e-05, "loss": 0.3297, "step": 26050 }, { "epoch": 0.37393162393162394, "grad_norm": 0.7996941804885864, "learning_rate": 2.9178599502857507e-05, "loss": 0.2334, "step": 26075 }, { "epoch": 0.3742901393908105, "grad_norm": 2.422518253326416, "learning_rate": 2.9174615989292315e-05, "loss": 0.3971, "step": 26100 }, { "epoch": 0.37464865484999715, "grad_norm": 7.857664585113525, "learning_rate": 2.9170632475727123e-05, "loss": 0.2457, "step": 26125 }, { "epoch": 0.37500717030918373, "grad_norm": 13.64965534210205, "learning_rate": 2.9166648962161934e-05, "loss": 0.2098, "step": 26150 }, { "epoch": 0.3753656857683703, "grad_norm": 21.057952880859375, "learning_rate": 2.9162665448596742e-05, "loss": 0.3257, "step": 26175 }, { "epoch": 0.37572420122755695, "grad_norm": 8.698038101196289, "learning_rate": 2.915868193503155e-05, "loss": 0.2321, "step": 26200 }, { "epoch": 0.37608271668674353, "grad_norm": 21.67657470703125, "learning_rate": 2.9154698421466358e-05, "loss": 0.3152, "step": 26225 }, { "epoch": 0.3764412321459301, "grad_norm": 8.039473533630371, "learning_rate": 2.9150714907901165e-05, "loss": 0.1426, "step": 26250 }, { "epoch": 0.37679974760511675, "grad_norm": 12.567922592163086, "learning_rate": 2.9146731394335977e-05, "loss": 0.2704, "step": 26275 }, { "epoch": 0.37715826306430333, "grad_norm": 14.351625442504883, "learning_rate": 2.9142747880770784e-05, "loss": 0.2096, "step": 26300 }, { "epoch": 0.3775167785234899, "grad_norm": 21.709095001220703, "learning_rate": 2.9138764367205592e-05, "loss": 0.231, "step": 26325 }, { "epoch": 0.37787529398267655, "grad_norm": 17.607606887817383, "learning_rate": 2.91347808536404e-05, "loss": 0.369, "step": 26350 }, { "epoch": 0.37823380944186313, "grad_norm": 4.8116350173950195, "learning_rate": 2.9130797340075208e-05, "loss": 0.221, "step": 26375 }, { "epoch": 0.3785923249010497, "grad_norm": 6.236042499542236, "learning_rate": 2.912681382651002e-05, "loss": 0.266, "step": 26400 }, { "epoch": 0.37895084036023635, "grad_norm": 12.425125122070312, "learning_rate": 2.9122830312944827e-05, "loss": 0.2243, "step": 26425 }, { "epoch": 0.37930935581942293, "grad_norm": 7.812643527984619, "learning_rate": 2.9118846799379635e-05, "loss": 0.3353, "step": 26450 }, { "epoch": 0.3796678712786095, "grad_norm": 22.79737091064453, "learning_rate": 2.9114863285814443e-05, "loss": 0.2361, "step": 26475 }, { "epoch": 0.38002638673779615, "grad_norm": 15.110734939575195, "learning_rate": 2.911087977224925e-05, "loss": 0.3176, "step": 26500 }, { "epoch": 0.38038490219698273, "grad_norm": 17.370038986206055, "learning_rate": 2.9106896258684062e-05, "loss": 0.2572, "step": 26525 }, { "epoch": 0.3807434176561693, "grad_norm": 3.9080493450164795, "learning_rate": 2.910291274511887e-05, "loss": 0.3728, "step": 26550 }, { "epoch": 0.38110193311535595, "grad_norm": 9.271160125732422, "learning_rate": 2.9098929231553678e-05, "loss": 0.2689, "step": 26575 }, { "epoch": 0.38146044857454253, "grad_norm": 8.12177562713623, "learning_rate": 2.9094945717988485e-05, "loss": 0.2655, "step": 26600 }, { "epoch": 0.3818189640337291, "grad_norm": 16.385541915893555, "learning_rate": 2.9090962204423293e-05, "loss": 0.2847, "step": 26625 }, { "epoch": 0.38217747949291575, "grad_norm": 18.09789276123047, "learning_rate": 2.9086978690858104e-05, "loss": 0.2878, "step": 26650 }, { "epoch": 0.38253599495210233, "grad_norm": 6.839613437652588, "learning_rate": 2.9082995177292912e-05, "loss": 0.3239, "step": 26675 }, { "epoch": 0.3828945104112889, "grad_norm": 7.12529993057251, "learning_rate": 2.907901166372772e-05, "loss": 0.2173, "step": 26700 }, { "epoch": 0.38325302587047555, "grad_norm": 14.00050163269043, "learning_rate": 2.9075028150162528e-05, "loss": 0.2664, "step": 26725 }, { "epoch": 0.38361154132966213, "grad_norm": 16.978260040283203, "learning_rate": 2.907104463659734e-05, "loss": 0.1893, "step": 26750 }, { "epoch": 0.3839700567888487, "grad_norm": 0.9133902788162231, "learning_rate": 2.9067061123032147e-05, "loss": 0.2379, "step": 26775 }, { "epoch": 0.38432857224803535, "grad_norm": 10.204113006591797, "learning_rate": 2.9063077609466955e-05, "loss": 0.3074, "step": 26800 }, { "epoch": 0.38468708770722193, "grad_norm": 3.0474750995635986, "learning_rate": 2.9059094095901763e-05, "loss": 0.2687, "step": 26825 }, { "epoch": 0.3850456031664085, "grad_norm": 11.389253616333008, "learning_rate": 2.905511058233657e-05, "loss": 0.2559, "step": 26850 }, { "epoch": 0.38540411862559515, "grad_norm": 2.369614839553833, "learning_rate": 2.9051127068771382e-05, "loss": 0.2755, "step": 26875 }, { "epoch": 0.38576263408478173, "grad_norm": 8.071020126342773, "learning_rate": 2.904714355520619e-05, "loss": 0.3933, "step": 26900 }, { "epoch": 0.3861211495439683, "grad_norm": 0.8647685647010803, "learning_rate": 2.9043160041640998e-05, "loss": 0.2127, "step": 26925 }, { "epoch": 0.38647966500315495, "grad_norm": 14.844685554504395, "learning_rate": 2.9039176528075805e-05, "loss": 0.1793, "step": 26950 }, { "epoch": 0.38683818046234153, "grad_norm": 7.893363952636719, "learning_rate": 2.903519301451061e-05, "loss": 0.1721, "step": 26975 }, { "epoch": 0.3871966959215281, "grad_norm": 19.955148696899414, "learning_rate": 2.903120950094542e-05, "loss": 0.2819, "step": 27000 }, { "epoch": 0.38755521138071475, "grad_norm": 24.405569076538086, "learning_rate": 2.902722598738023e-05, "loss": 0.2719, "step": 27025 }, { "epoch": 0.38791372683990133, "grad_norm": 9.85604476928711, "learning_rate": 2.9023242473815037e-05, "loss": 0.279, "step": 27050 }, { "epoch": 0.3882722422990879, "grad_norm": 8.581001281738281, "learning_rate": 2.9019258960249845e-05, "loss": 0.2445, "step": 27075 }, { "epoch": 0.38863075775827455, "grad_norm": 11.667106628417969, "learning_rate": 2.9015275446684652e-05, "loss": 0.3435, "step": 27100 }, { "epoch": 0.38898927321746113, "grad_norm": 18.20888900756836, "learning_rate": 2.9011291933119464e-05, "loss": 0.2596, "step": 27125 }, { "epoch": 0.3893477886766477, "grad_norm": 17.63239097595215, "learning_rate": 2.900730841955427e-05, "loss": 0.4458, "step": 27150 }, { "epoch": 0.38970630413583435, "grad_norm": 2.247157096862793, "learning_rate": 2.900332490598908e-05, "loss": 0.1807, "step": 27175 }, { "epoch": 0.39006481959502093, "grad_norm": 11.304150581359863, "learning_rate": 2.8999341392423887e-05, "loss": 0.3289, "step": 27200 }, { "epoch": 0.3904233350542075, "grad_norm": 14.88448715209961, "learning_rate": 2.8995357878858695e-05, "loss": 0.297, "step": 27225 }, { "epoch": 0.39078185051339415, "grad_norm": 12.576225280761719, "learning_rate": 2.8991374365293506e-05, "loss": 0.2552, "step": 27250 }, { "epoch": 0.39114036597258073, "grad_norm": 0.6576104164123535, "learning_rate": 2.8987390851728314e-05, "loss": 0.2433, "step": 27275 }, { "epoch": 0.39149888143176736, "grad_norm": 3.9748106002807617, "learning_rate": 2.8983407338163122e-05, "loss": 0.128, "step": 27300 }, { "epoch": 0.39185739689095395, "grad_norm": 21.583133697509766, "learning_rate": 2.897942382459793e-05, "loss": 0.2227, "step": 27325 }, { "epoch": 0.3922159123501405, "grad_norm": 4.634997367858887, "learning_rate": 2.897544031103274e-05, "loss": 0.3687, "step": 27350 }, { "epoch": 0.39257442780932716, "grad_norm": 11.738582611083984, "learning_rate": 2.897145679746755e-05, "loss": 0.2693, "step": 27375 }, { "epoch": 0.39293294326851375, "grad_norm": 12.935501098632812, "learning_rate": 2.8967473283902357e-05, "loss": 0.2568, "step": 27400 }, { "epoch": 0.3932914587277003, "grad_norm": 2.860593795776367, "learning_rate": 2.8963489770337165e-05, "loss": 0.1593, "step": 27425 }, { "epoch": 0.39364997418688696, "grad_norm": 23.29871368408203, "learning_rate": 2.8959506256771972e-05, "loss": 0.302, "step": 27450 }, { "epoch": 0.39400848964607355, "grad_norm": 19.57683753967285, "learning_rate": 2.8955522743206784e-05, "loss": 0.326, "step": 27475 }, { "epoch": 0.3943670051052601, "grad_norm": 3.609184980392456, "learning_rate": 2.895153922964159e-05, "loss": 0.159, "step": 27500 }, { "epoch": 0.39472552056444676, "grad_norm": 1.6102290153503418, "learning_rate": 2.89475557160764e-05, "loss": 0.2106, "step": 27525 }, { "epoch": 0.39508403602363334, "grad_norm": 2.3063852787017822, "learning_rate": 2.8943572202511207e-05, "loss": 0.1746, "step": 27550 }, { "epoch": 0.3954425514828199, "grad_norm": 1.5618293285369873, "learning_rate": 2.8939588688946015e-05, "loss": 0.3526, "step": 27575 }, { "epoch": 0.39580106694200656, "grad_norm": 9.563315391540527, "learning_rate": 2.8935605175380826e-05, "loss": 0.2224, "step": 27600 }, { "epoch": 0.39615958240119314, "grad_norm": 9.126052856445312, "learning_rate": 2.8931621661815634e-05, "loss": 0.139, "step": 27625 }, { "epoch": 0.3965180978603797, "grad_norm": 1.400840401649475, "learning_rate": 2.8927638148250442e-05, "loss": 0.1838, "step": 27650 }, { "epoch": 0.39687661331956636, "grad_norm": 2.199944019317627, "learning_rate": 2.892365463468525e-05, "loss": 0.2218, "step": 27675 }, { "epoch": 0.39723512877875294, "grad_norm": 0.5465529561042786, "learning_rate": 2.8919671121120058e-05, "loss": 0.2067, "step": 27700 }, { "epoch": 0.3975936442379395, "grad_norm": 3.3968050479888916, "learning_rate": 2.891568760755487e-05, "loss": 0.1337, "step": 27725 }, { "epoch": 0.39795215969712616, "grad_norm": 2.414095401763916, "learning_rate": 2.8911704093989677e-05, "loss": 0.1892, "step": 27750 }, { "epoch": 0.39831067515631274, "grad_norm": 14.217358589172363, "learning_rate": 2.8907720580424485e-05, "loss": 0.1933, "step": 27775 }, { "epoch": 0.3986691906154993, "grad_norm": 7.235437870025635, "learning_rate": 2.8903737066859292e-05, "loss": 0.1286, "step": 27800 }, { "epoch": 0.39902770607468596, "grad_norm": 15.105610847473145, "learning_rate": 2.88997535532941e-05, "loss": 0.2148, "step": 27825 }, { "epoch": 0.39938622153387254, "grad_norm": 8.620416641235352, "learning_rate": 2.889577003972891e-05, "loss": 0.2486, "step": 27850 }, { "epoch": 0.3997447369930591, "grad_norm": 9.161365509033203, "learning_rate": 2.889178652616372e-05, "loss": 0.2776, "step": 27875 }, { "epoch": 0.40010325245224576, "grad_norm": 15.594342231750488, "learning_rate": 2.8887803012598527e-05, "loss": 0.2543, "step": 27900 }, { "epoch": 0.40046176791143234, "grad_norm": 7.955554962158203, "learning_rate": 2.8883819499033335e-05, "loss": 0.1693, "step": 27925 }, { "epoch": 0.4008202833706189, "grad_norm": 0.9201169013977051, "learning_rate": 2.8879835985468146e-05, "loss": 0.2212, "step": 27950 }, { "epoch": 0.40117879882980556, "grad_norm": 2.6806602478027344, "learning_rate": 2.8875852471902954e-05, "loss": 0.1839, "step": 27975 }, { "epoch": 0.40153731428899214, "grad_norm": 19.412290573120117, "learning_rate": 2.8871868958337762e-05, "loss": 0.3278, "step": 28000 }, { "epoch": 0.4018958297481787, "grad_norm": 6.854404926300049, "learning_rate": 2.886788544477257e-05, "loss": 0.2188, "step": 28025 }, { "epoch": 0.40225434520736536, "grad_norm": 3.4234273433685303, "learning_rate": 2.8863901931207378e-05, "loss": 0.1354, "step": 28050 }, { "epoch": 0.40261286066655194, "grad_norm": 16.39963150024414, "learning_rate": 2.8859918417642185e-05, "loss": 0.3069, "step": 28075 }, { "epoch": 0.4029713761257385, "grad_norm": 0.4709762632846832, "learning_rate": 2.8855934904076993e-05, "loss": 0.2187, "step": 28100 }, { "epoch": 0.40332989158492516, "grad_norm": 3.5786378383636475, "learning_rate": 2.88519513905118e-05, "loss": 0.2774, "step": 28125 }, { "epoch": 0.40368840704411174, "grad_norm": 1.4876320362091064, "learning_rate": 2.884796787694661e-05, "loss": 0.2277, "step": 28150 }, { "epoch": 0.4040469225032983, "grad_norm": 8.533146858215332, "learning_rate": 2.8843984363381417e-05, "loss": 0.2815, "step": 28175 }, { "epoch": 0.40440543796248496, "grad_norm": 10.874841690063477, "learning_rate": 2.8840000849816228e-05, "loss": 0.1705, "step": 28200 }, { "epoch": 0.40476395342167154, "grad_norm": 2.6729860305786133, "learning_rate": 2.8836017336251036e-05, "loss": 0.1928, "step": 28225 }, { "epoch": 0.4051224688808581, "grad_norm": 15.049805641174316, "learning_rate": 2.8832033822685844e-05, "loss": 0.2748, "step": 28250 }, { "epoch": 0.40548098434004476, "grad_norm": 2.920022964477539, "learning_rate": 2.882805030912065e-05, "loss": 0.2134, "step": 28275 }, { "epoch": 0.40583949979923134, "grad_norm": 16.001220703125, "learning_rate": 2.882406679555546e-05, "loss": 0.2929, "step": 28300 }, { "epoch": 0.4061980152584179, "grad_norm": 12.38768482208252, "learning_rate": 2.882008328199027e-05, "loss": 0.2389, "step": 28325 }, { "epoch": 0.40655653071760456, "grad_norm": 5.900087833404541, "learning_rate": 2.881609976842508e-05, "loss": 0.3203, "step": 28350 }, { "epoch": 0.40691504617679114, "grad_norm": 0.9473737478256226, "learning_rate": 2.8812116254859886e-05, "loss": 0.1839, "step": 28375 }, { "epoch": 0.4072735616359777, "grad_norm": 19.795324325561523, "learning_rate": 2.8808132741294694e-05, "loss": 0.2533, "step": 28400 }, { "epoch": 0.40763207709516436, "grad_norm": 4.386395454406738, "learning_rate": 2.8804149227729502e-05, "loss": 0.1909, "step": 28425 }, { "epoch": 0.40799059255435094, "grad_norm": 16.811159133911133, "learning_rate": 2.8800165714164313e-05, "loss": 0.3074, "step": 28450 }, { "epoch": 0.4083491080135375, "grad_norm": 3.5332937240600586, "learning_rate": 2.879618220059912e-05, "loss": 0.2269, "step": 28475 }, { "epoch": 0.40870762347272416, "grad_norm": 6.009772777557373, "learning_rate": 2.879219868703393e-05, "loss": 0.2301, "step": 28500 }, { "epoch": 0.40906613893191074, "grad_norm": 20.6021728515625, "learning_rate": 2.8788215173468737e-05, "loss": 0.3249, "step": 28525 }, { "epoch": 0.4094246543910973, "grad_norm": 0.20917150378227234, "learning_rate": 2.8784231659903548e-05, "loss": 0.2337, "step": 28550 }, { "epoch": 0.40978316985028396, "grad_norm": 16.877023696899414, "learning_rate": 2.8780248146338356e-05, "loss": 0.2633, "step": 28575 }, { "epoch": 0.41014168530947054, "grad_norm": 16.384862899780273, "learning_rate": 2.8776264632773164e-05, "loss": 0.2418, "step": 28600 }, { "epoch": 0.4105002007686571, "grad_norm": 20.821962356567383, "learning_rate": 2.877228111920797e-05, "loss": 0.2799, "step": 28625 }, { "epoch": 0.41085871622784376, "grad_norm": 8.200485229492188, "learning_rate": 2.876829760564278e-05, "loss": 0.256, "step": 28650 }, { "epoch": 0.41121723168703034, "grad_norm": 11.39193058013916, "learning_rate": 2.876431409207759e-05, "loss": 0.2785, "step": 28675 }, { "epoch": 0.4115757471462169, "grad_norm": 8.583800315856934, "learning_rate": 2.87603305785124e-05, "loss": 0.2155, "step": 28700 }, { "epoch": 0.41193426260540356, "grad_norm": 7.933117866516113, "learning_rate": 2.8756347064947206e-05, "loss": 0.2356, "step": 28725 }, { "epoch": 0.41229277806459014, "grad_norm": 0.3456187844276428, "learning_rate": 2.8752363551382014e-05, "loss": 0.3933, "step": 28750 }, { "epoch": 0.4126512935237767, "grad_norm": 7.972713470458984, "learning_rate": 2.8748380037816822e-05, "loss": 0.2993, "step": 28775 }, { "epoch": 0.41300980898296336, "grad_norm": 2.8689355850219727, "learning_rate": 2.8744396524251633e-05, "loss": 0.3354, "step": 28800 }, { "epoch": 0.41336832444214994, "grad_norm": 7.0146803855896, "learning_rate": 2.874041301068644e-05, "loss": 0.1996, "step": 28825 }, { "epoch": 0.4137268399013365, "grad_norm": 1.5629692077636719, "learning_rate": 2.873642949712125e-05, "loss": 0.1626, "step": 28850 }, { "epoch": 0.41408535536052316, "grad_norm": 0.4112838804721832, "learning_rate": 2.8732445983556057e-05, "loss": 0.2482, "step": 28875 }, { "epoch": 0.41444387081970974, "grad_norm": 13.866215705871582, "learning_rate": 2.8728462469990865e-05, "loss": 0.3132, "step": 28900 }, { "epoch": 0.4148023862788964, "grad_norm": 9.300886154174805, "learning_rate": 2.8724478956425676e-05, "loss": 0.2975, "step": 28925 }, { "epoch": 0.41516090173808295, "grad_norm": 1.6838083267211914, "learning_rate": 2.8720495442860484e-05, "loss": 0.1391, "step": 28950 }, { "epoch": 0.41551941719726954, "grad_norm": 6.720662593841553, "learning_rate": 2.871651192929529e-05, "loss": 0.2485, "step": 28975 }, { "epoch": 0.4158779326564562, "grad_norm": 6.513310432434082, "learning_rate": 2.87125284157301e-05, "loss": 0.2198, "step": 29000 }, { "epoch": 0.41623644811564275, "grad_norm": 11.899341583251953, "learning_rate": 2.8708544902164907e-05, "loss": 0.2693, "step": 29025 }, { "epoch": 0.41659496357482934, "grad_norm": 1.236950159072876, "learning_rate": 2.870456138859972e-05, "loss": 0.1656, "step": 29050 }, { "epoch": 0.416953479034016, "grad_norm": 9.464078903198242, "learning_rate": 2.8700577875034526e-05, "loss": 0.2471, "step": 29075 }, { "epoch": 0.41731199449320255, "grad_norm": 12.195323944091797, "learning_rate": 2.8696594361469334e-05, "loss": 0.2584, "step": 29100 }, { "epoch": 0.41767050995238914, "grad_norm": 7.602330207824707, "learning_rate": 2.8692610847904142e-05, "loss": 0.1781, "step": 29125 }, { "epoch": 0.41802902541157577, "grad_norm": 12.148274421691895, "learning_rate": 2.868862733433895e-05, "loss": 0.2381, "step": 29150 }, { "epoch": 0.41838754087076235, "grad_norm": 2.9743096828460693, "learning_rate": 2.8684643820773758e-05, "loss": 0.1175, "step": 29175 }, { "epoch": 0.41874605632994893, "grad_norm": 0.5673454999923706, "learning_rate": 2.8680660307208566e-05, "loss": 0.2593, "step": 29200 }, { "epoch": 0.41910457178913557, "grad_norm": 11.99504280090332, "learning_rate": 2.8676676793643373e-05, "loss": 0.2668, "step": 29225 }, { "epoch": 0.41946308724832215, "grad_norm": 8.837156295776367, "learning_rate": 2.867269328007818e-05, "loss": 0.1946, "step": 29250 }, { "epoch": 0.41982160270750873, "grad_norm": 11.849785804748535, "learning_rate": 2.8668709766512992e-05, "loss": 0.2617, "step": 29275 }, { "epoch": 0.42018011816669537, "grad_norm": 0.3081607222557068, "learning_rate": 2.86647262529478e-05, "loss": 0.2378, "step": 29300 }, { "epoch": 0.42053863362588195, "grad_norm": 0.6205219030380249, "learning_rate": 2.8660742739382608e-05, "loss": 0.1549, "step": 29325 }, { "epoch": 0.42089714908506853, "grad_norm": 2.395418405532837, "learning_rate": 2.8656759225817416e-05, "loss": 0.1823, "step": 29350 }, { "epoch": 0.42125566454425517, "grad_norm": 1.3256088495254517, "learning_rate": 2.8652775712252224e-05, "loss": 0.1413, "step": 29375 }, { "epoch": 0.42161418000344175, "grad_norm": 0.36766859889030457, "learning_rate": 2.8648792198687035e-05, "loss": 0.2211, "step": 29400 }, { "epoch": 0.42197269546262833, "grad_norm": 9.331770896911621, "learning_rate": 2.8644808685121843e-05, "loss": 0.3111, "step": 29425 }, { "epoch": 0.42233121092181497, "grad_norm": 0.27877163887023926, "learning_rate": 2.864082517155665e-05, "loss": 0.1102, "step": 29450 }, { "epoch": 0.42268972638100155, "grad_norm": 6.090244770050049, "learning_rate": 2.863684165799146e-05, "loss": 0.2743, "step": 29475 }, { "epoch": 0.42304824184018813, "grad_norm": 0.5109679698944092, "learning_rate": 2.8632858144426266e-05, "loss": 0.1331, "step": 29500 }, { "epoch": 0.42340675729937477, "grad_norm": 9.45372200012207, "learning_rate": 2.8628874630861078e-05, "loss": 0.2102, "step": 29525 }, { "epoch": 0.42376527275856135, "grad_norm": 14.922021865844727, "learning_rate": 2.8624891117295886e-05, "loss": 0.2676, "step": 29550 }, { "epoch": 0.42412378821774793, "grad_norm": 4.502877712249756, "learning_rate": 2.8620907603730693e-05, "loss": 0.2021, "step": 29575 }, { "epoch": 0.42448230367693457, "grad_norm": 0.8818169832229614, "learning_rate": 2.86169240901655e-05, "loss": 0.2806, "step": 29600 }, { "epoch": 0.42484081913612115, "grad_norm": 4.170677661895752, "learning_rate": 2.861294057660031e-05, "loss": 0.2429, "step": 29625 }, { "epoch": 0.42519933459530773, "grad_norm": 10.183497428894043, "learning_rate": 2.860895706303512e-05, "loss": 0.2737, "step": 29650 }, { "epoch": 0.42555785005449437, "grad_norm": 0.08610080182552338, "learning_rate": 2.8604973549469928e-05, "loss": 0.2086, "step": 29675 }, { "epoch": 0.42591636551368095, "grad_norm": 11.6821870803833, "learning_rate": 2.8600990035904736e-05, "loss": 0.2147, "step": 29700 }, { "epoch": 0.42627488097286753, "grad_norm": 1.6253951787948608, "learning_rate": 2.8597006522339544e-05, "loss": 0.172, "step": 29725 }, { "epoch": 0.42663339643205417, "grad_norm": 19.13266372680664, "learning_rate": 2.8593023008774355e-05, "loss": 0.2494, "step": 29750 }, { "epoch": 0.42699191189124075, "grad_norm": 1.89067804813385, "learning_rate": 2.8589039495209163e-05, "loss": 0.2602, "step": 29775 }, { "epoch": 0.42735042735042733, "grad_norm": 9.523008346557617, "learning_rate": 2.858505598164397e-05, "loss": 0.2212, "step": 29800 }, { "epoch": 0.42770894280961397, "grad_norm": 6.616396903991699, "learning_rate": 2.858107246807878e-05, "loss": 0.3898, "step": 29825 }, { "epoch": 0.42806745826880055, "grad_norm": 15.711434364318848, "learning_rate": 2.8577088954513586e-05, "loss": 0.2407, "step": 29850 }, { "epoch": 0.42842597372798713, "grad_norm": 18.768199920654297, "learning_rate": 2.8573105440948398e-05, "loss": 0.2242, "step": 29875 }, { "epoch": 0.42878448918717377, "grad_norm": 6.864587306976318, "learning_rate": 2.8569121927383206e-05, "loss": 0.2371, "step": 29900 }, { "epoch": 0.42914300464636035, "grad_norm": 16.25983238220215, "learning_rate": 2.8565138413818013e-05, "loss": 0.3304, "step": 29925 }, { "epoch": 0.42950152010554693, "grad_norm": 17.349441528320312, "learning_rate": 2.856115490025282e-05, "loss": 0.2629, "step": 29950 }, { "epoch": 0.42986003556473357, "grad_norm": 2.080552816390991, "learning_rate": 2.855717138668763e-05, "loss": 0.3658, "step": 29975 }, { "epoch": 0.43021855102392015, "grad_norm": 14.79726505279541, "learning_rate": 2.855318787312244e-05, "loss": 0.2824, "step": 30000 }, { "epoch": 0.43057706648310673, "grad_norm": 14.535283088684082, "learning_rate": 2.8549204359557248e-05, "loss": 0.2666, "step": 30025 }, { "epoch": 0.43093558194229337, "grad_norm": 8.51321792602539, "learning_rate": 2.8545220845992056e-05, "loss": 0.2784, "step": 30050 }, { "epoch": 0.43129409740147995, "grad_norm": 12.003729820251465, "learning_rate": 2.8541237332426864e-05, "loss": 0.2145, "step": 30075 }, { "epoch": 0.43165261286066653, "grad_norm": 5.454165458679199, "learning_rate": 2.8537253818861672e-05, "loss": 0.1553, "step": 30100 }, { "epoch": 0.43201112831985317, "grad_norm": 9.234086036682129, "learning_rate": 2.8533270305296483e-05, "loss": 0.3488, "step": 30125 }, { "epoch": 0.43236964377903975, "grad_norm": 5.0848469734191895, "learning_rate": 2.852928679173129e-05, "loss": 0.2507, "step": 30150 }, { "epoch": 0.43272815923822633, "grad_norm": 10.652213096618652, "learning_rate": 2.85253032781661e-05, "loss": 0.2143, "step": 30175 }, { "epoch": 0.43308667469741297, "grad_norm": 13.85190486907959, "learning_rate": 2.8521319764600906e-05, "loss": 0.3133, "step": 30200 }, { "epoch": 0.43344519015659955, "grad_norm": 23.466259002685547, "learning_rate": 2.8517336251035714e-05, "loss": 0.2844, "step": 30225 }, { "epoch": 0.43380370561578613, "grad_norm": 8.810803413391113, "learning_rate": 2.8513352737470526e-05, "loss": 0.2695, "step": 30250 }, { "epoch": 0.43416222107497277, "grad_norm": 8.26700210571289, "learning_rate": 2.850936922390533e-05, "loss": 0.2107, "step": 30275 }, { "epoch": 0.43452073653415935, "grad_norm": 12.004823684692383, "learning_rate": 2.8505385710340138e-05, "loss": 0.213, "step": 30300 }, { "epoch": 0.4348792519933459, "grad_norm": 2.6561331748962402, "learning_rate": 2.8501402196774946e-05, "loss": 0.1977, "step": 30325 }, { "epoch": 0.43523776745253256, "grad_norm": 6.45474910736084, "learning_rate": 2.8497418683209754e-05, "loss": 0.2244, "step": 30350 }, { "epoch": 0.43559628291171915, "grad_norm": 2.3375837802886963, "learning_rate": 2.8493435169644565e-05, "loss": 0.1356, "step": 30375 }, { "epoch": 0.4359547983709057, "grad_norm": 19.93529510498047, "learning_rate": 2.8489451656079373e-05, "loss": 0.2792, "step": 30400 }, { "epoch": 0.43631331383009236, "grad_norm": 1.2489960193634033, "learning_rate": 2.848546814251418e-05, "loss": 0.1829, "step": 30425 }, { "epoch": 0.43667182928927895, "grad_norm": 5.7339768409729, "learning_rate": 2.8481484628948988e-05, "loss": 0.2902, "step": 30450 }, { "epoch": 0.4370303447484655, "grad_norm": 0.319886177778244, "learning_rate": 2.84775011153838e-05, "loss": 0.2191, "step": 30475 }, { "epoch": 0.43738886020765216, "grad_norm": 2.091063976287842, "learning_rate": 2.8473517601818607e-05, "loss": 0.1946, "step": 30500 }, { "epoch": 0.43774737566683875, "grad_norm": 19.305103302001953, "learning_rate": 2.8469534088253415e-05, "loss": 0.3374, "step": 30525 }, { "epoch": 0.4381058911260254, "grad_norm": 10.005836486816406, "learning_rate": 2.8465550574688223e-05, "loss": 0.244, "step": 30550 }, { "epoch": 0.43846440658521196, "grad_norm": 0.7532293796539307, "learning_rate": 2.846156706112303e-05, "loss": 0.2078, "step": 30575 }, { "epoch": 0.43882292204439854, "grad_norm": 0.18420755863189697, "learning_rate": 2.8457583547557842e-05, "loss": 0.1727, "step": 30600 }, { "epoch": 0.4391814375035852, "grad_norm": 0.12768207490444183, "learning_rate": 2.845360003399265e-05, "loss": 0.161, "step": 30625 }, { "epoch": 0.43953995296277176, "grad_norm": 22.648508071899414, "learning_rate": 2.8449616520427458e-05, "loss": 0.2503, "step": 30650 }, { "epoch": 0.43989846842195834, "grad_norm": 0.9977130889892578, "learning_rate": 2.8445633006862266e-05, "loss": 0.1821, "step": 30675 }, { "epoch": 0.440256983881145, "grad_norm": 2.478569507598877, "learning_rate": 2.8441649493297074e-05, "loss": 0.2056, "step": 30700 }, { "epoch": 0.44061549934033156, "grad_norm": 1.5396161079406738, "learning_rate": 2.8437665979731885e-05, "loss": 0.3083, "step": 30725 }, { "epoch": 0.44097401479951814, "grad_norm": 18.693134307861328, "learning_rate": 2.8433682466166693e-05, "loss": 0.2673, "step": 30750 }, { "epoch": 0.4413325302587048, "grad_norm": 15.584599494934082, "learning_rate": 2.84296989526015e-05, "loss": 0.2554, "step": 30775 }, { "epoch": 0.44169104571789136, "grad_norm": 19.22650146484375, "learning_rate": 2.8425715439036308e-05, "loss": 0.2523, "step": 30800 }, { "epoch": 0.44204956117707794, "grad_norm": 6.515274524688721, "learning_rate": 2.8421731925471116e-05, "loss": 0.2239, "step": 30825 }, { "epoch": 0.4424080766362646, "grad_norm": 7.972828388214111, "learning_rate": 2.8417748411905927e-05, "loss": 0.2957, "step": 30850 }, { "epoch": 0.44276659209545116, "grad_norm": 12.419438362121582, "learning_rate": 2.8413764898340735e-05, "loss": 0.2974, "step": 30875 }, { "epoch": 0.44312510755463774, "grad_norm": 18.22265625, "learning_rate": 2.8409781384775543e-05, "loss": 0.3278, "step": 30900 }, { "epoch": 0.4434836230138244, "grad_norm": 14.907329559326172, "learning_rate": 2.840579787121035e-05, "loss": 0.2792, "step": 30925 }, { "epoch": 0.44384213847301096, "grad_norm": 8.536490440368652, "learning_rate": 2.840181435764516e-05, "loss": 0.1982, "step": 30950 }, { "epoch": 0.44420065393219754, "grad_norm": 11.412633895874023, "learning_rate": 2.839783084407997e-05, "loss": 0.1932, "step": 30975 }, { "epoch": 0.4445591693913842, "grad_norm": 9.499017715454102, "learning_rate": 2.8393847330514778e-05, "loss": 0.2796, "step": 31000 }, { "epoch": 0.44491768485057076, "grad_norm": 13.09565258026123, "learning_rate": 2.8389863816949586e-05, "loss": 0.2903, "step": 31025 }, { "epoch": 0.44527620030975734, "grad_norm": 22.779315948486328, "learning_rate": 2.8385880303384394e-05, "loss": 0.1498, "step": 31050 }, { "epoch": 0.445634715768944, "grad_norm": 2.594590187072754, "learning_rate": 2.8381896789819205e-05, "loss": 0.2625, "step": 31075 }, { "epoch": 0.44599323122813056, "grad_norm": 13.26119613647461, "learning_rate": 2.8377913276254013e-05, "loss": 0.2047, "step": 31100 }, { "epoch": 0.44635174668731714, "grad_norm": 0.9308417439460754, "learning_rate": 2.837392976268882e-05, "loss": 0.2364, "step": 31125 }, { "epoch": 0.4467102621465038, "grad_norm": 2.2670021057128906, "learning_rate": 2.8369946249123628e-05, "loss": 0.1541, "step": 31150 }, { "epoch": 0.44706877760569036, "grad_norm": 17.41800880432129, "learning_rate": 2.8365962735558436e-05, "loss": 0.1973, "step": 31175 }, { "epoch": 0.44742729306487694, "grad_norm": 13.944840431213379, "learning_rate": 2.8361979221993247e-05, "loss": 0.3104, "step": 31200 }, { "epoch": 0.4477858085240636, "grad_norm": 8.42185115814209, "learning_rate": 2.8357995708428055e-05, "loss": 0.1734, "step": 31225 }, { "epoch": 0.44814432398325016, "grad_norm": 0.7030354738235474, "learning_rate": 2.8354012194862863e-05, "loss": 0.2541, "step": 31250 }, { "epoch": 0.44850283944243674, "grad_norm": 2.6064579486846924, "learning_rate": 2.835002868129767e-05, "loss": 0.2148, "step": 31275 }, { "epoch": 0.4488613549016234, "grad_norm": 14.909832954406738, "learning_rate": 2.834604516773248e-05, "loss": 0.1905, "step": 31300 }, { "epoch": 0.44921987036080996, "grad_norm": 13.130645751953125, "learning_rate": 2.834206165416729e-05, "loss": 0.3012, "step": 31325 }, { "epoch": 0.44957838581999654, "grad_norm": 2.5359914302825928, "learning_rate": 2.8338078140602098e-05, "loss": 0.2521, "step": 31350 }, { "epoch": 0.4499369012791832, "grad_norm": 0.30913904309272766, "learning_rate": 2.8334094627036902e-05, "loss": 0.1696, "step": 31375 }, { "epoch": 0.45029541673836976, "grad_norm": 0.7992724180221558, "learning_rate": 2.833011111347171e-05, "loss": 0.2009, "step": 31400 }, { "epoch": 0.45065393219755634, "grad_norm": 20.83186912536621, "learning_rate": 2.8326127599906518e-05, "loss": 0.2514, "step": 31425 }, { "epoch": 0.451012447656743, "grad_norm": 16.8234806060791, "learning_rate": 2.832214408634133e-05, "loss": 0.2802, "step": 31450 }, { "epoch": 0.45137096311592956, "grad_norm": 3.1681952476501465, "learning_rate": 2.8318160572776137e-05, "loss": 0.1942, "step": 31475 }, { "epoch": 0.45172947857511614, "grad_norm": 14.634171485900879, "learning_rate": 2.8314177059210945e-05, "loss": 0.1878, "step": 31500 }, { "epoch": 0.4520879940343028, "grad_norm": 11.430805206298828, "learning_rate": 2.8310193545645753e-05, "loss": 0.1979, "step": 31525 }, { "epoch": 0.45244650949348936, "grad_norm": 14.555899620056152, "learning_rate": 2.830621003208056e-05, "loss": 0.2207, "step": 31550 }, { "epoch": 0.45280502495267594, "grad_norm": 3.115086555480957, "learning_rate": 2.8302226518515372e-05, "loss": 0.2057, "step": 31575 }, { "epoch": 0.4531635404118626, "grad_norm": 3.0609610080718994, "learning_rate": 2.829824300495018e-05, "loss": 0.1975, "step": 31600 }, { "epoch": 0.45352205587104916, "grad_norm": 9.237727165222168, "learning_rate": 2.8294259491384987e-05, "loss": 0.1778, "step": 31625 }, { "epoch": 0.45388057133023574, "grad_norm": 12.237277030944824, "learning_rate": 2.8290275977819795e-05, "loss": 0.1892, "step": 31650 }, { "epoch": 0.4542390867894224, "grad_norm": 11.923284530639648, "learning_rate": 2.8286292464254607e-05, "loss": 0.1881, "step": 31675 }, { "epoch": 0.45459760224860896, "grad_norm": 5.266074180603027, "learning_rate": 2.8282308950689414e-05, "loss": 0.2381, "step": 31700 }, { "epoch": 0.45495611770779554, "grad_norm": 4.667016506195068, "learning_rate": 2.8278325437124222e-05, "loss": 0.2442, "step": 31725 }, { "epoch": 0.4553146331669822, "grad_norm": 0.44577738642692566, "learning_rate": 2.827434192355903e-05, "loss": 0.286, "step": 31750 }, { "epoch": 0.45567314862616876, "grad_norm": 4.170209884643555, "learning_rate": 2.8270358409993838e-05, "loss": 0.2494, "step": 31775 }, { "epoch": 0.45603166408535534, "grad_norm": 4.972789764404297, "learning_rate": 2.826637489642865e-05, "loss": 0.2282, "step": 31800 }, { "epoch": 0.456390179544542, "grad_norm": 16.051855087280273, "learning_rate": 2.8262391382863457e-05, "loss": 0.19, "step": 31825 }, { "epoch": 0.45674869500372856, "grad_norm": 0.04623661935329437, "learning_rate": 2.8258407869298265e-05, "loss": 0.2164, "step": 31850 }, { "epoch": 0.45710721046291514, "grad_norm": 3.0927469730377197, "learning_rate": 2.8254424355733073e-05, "loss": 0.2363, "step": 31875 }, { "epoch": 0.4574657259221018, "grad_norm": 8.974098205566406, "learning_rate": 2.825044084216788e-05, "loss": 0.2219, "step": 31900 }, { "epoch": 0.45782424138128835, "grad_norm": 7.98169469833374, "learning_rate": 2.8246457328602692e-05, "loss": 0.1892, "step": 31925 }, { "epoch": 0.45818275684047494, "grad_norm": 2.7802975177764893, "learning_rate": 2.82424738150375e-05, "loss": 0.2572, "step": 31950 }, { "epoch": 0.4585412722996616, "grad_norm": 1.5719050168991089, "learning_rate": 2.8238490301472307e-05, "loss": 0.2692, "step": 31975 }, { "epoch": 0.45889978775884815, "grad_norm": 2.9155163764953613, "learning_rate": 2.8234506787907115e-05, "loss": 0.2159, "step": 32000 }, { "epoch": 0.45925830321803474, "grad_norm": 13.095877647399902, "learning_rate": 2.8230523274341923e-05, "loss": 0.231, "step": 32025 }, { "epoch": 0.4596168186772214, "grad_norm": 11.084964752197266, "learning_rate": 2.8226539760776734e-05, "loss": 0.2281, "step": 32050 }, { "epoch": 0.45997533413640795, "grad_norm": 9.673748970031738, "learning_rate": 2.8222556247211542e-05, "loss": 0.2753, "step": 32075 }, { "epoch": 0.46033384959559454, "grad_norm": 20.309154510498047, "learning_rate": 2.821857273364635e-05, "loss": 0.2898, "step": 32100 }, { "epoch": 0.46069236505478117, "grad_norm": 0.5962597131729126, "learning_rate": 2.8214589220081158e-05, "loss": 0.2525, "step": 32125 }, { "epoch": 0.46105088051396775, "grad_norm": 6.537120819091797, "learning_rate": 2.8210605706515966e-05, "loss": 0.1793, "step": 32150 }, { "epoch": 0.46140939597315433, "grad_norm": 1.243766188621521, "learning_rate": 2.8206622192950777e-05, "loss": 0.1932, "step": 32175 }, { "epoch": 0.46176791143234097, "grad_norm": 0.7927608489990234, "learning_rate": 2.8202638679385585e-05, "loss": 0.2679, "step": 32200 }, { "epoch": 0.46212642689152755, "grad_norm": 0.5958202481269836, "learning_rate": 2.8198655165820393e-05, "loss": 0.2843, "step": 32225 }, { "epoch": 0.4624849423507142, "grad_norm": 15.547015190124512, "learning_rate": 2.81946716522552e-05, "loss": 0.3593, "step": 32250 }, { "epoch": 0.46284345780990077, "grad_norm": 8.069841384887695, "learning_rate": 2.8190688138690012e-05, "loss": 0.2829, "step": 32275 }, { "epoch": 0.46320197326908735, "grad_norm": 17.92749786376953, "learning_rate": 2.818670462512482e-05, "loss": 0.26, "step": 32300 }, { "epoch": 0.463560488728274, "grad_norm": 5.108863830566406, "learning_rate": 2.8182721111559627e-05, "loss": 0.277, "step": 32325 }, { "epoch": 0.46391900418746057, "grad_norm": 0.5705750584602356, "learning_rate": 2.8178737597994435e-05, "loss": 0.3402, "step": 32350 }, { "epoch": 0.46427751964664715, "grad_norm": 21.114389419555664, "learning_rate": 2.8174754084429243e-05, "loss": 0.3338, "step": 32375 }, { "epoch": 0.4646360351058338, "grad_norm": 5.244955062866211, "learning_rate": 2.8170770570864054e-05, "loss": 0.1739, "step": 32400 }, { "epoch": 0.46499455056502037, "grad_norm": 16.474199295043945, "learning_rate": 2.8166787057298862e-05, "loss": 0.2455, "step": 32425 }, { "epoch": 0.46535306602420695, "grad_norm": 11.096033096313477, "learning_rate": 2.816280354373367e-05, "loss": 0.2511, "step": 32450 }, { "epoch": 0.4657115814833936, "grad_norm": 1.581092119216919, "learning_rate": 2.8158820030168475e-05, "loss": 0.2519, "step": 32475 }, { "epoch": 0.46607009694258017, "grad_norm": 7.070646286010742, "learning_rate": 2.8154836516603282e-05, "loss": 0.1812, "step": 32500 }, { "epoch": 0.46642861240176675, "grad_norm": 0.790593683719635, "learning_rate": 2.8150853003038094e-05, "loss": 0.2351, "step": 32525 }, { "epoch": 0.4667871278609534, "grad_norm": 23.751352310180664, "learning_rate": 2.81468694894729e-05, "loss": 0.3099, "step": 32550 }, { "epoch": 0.46714564332013997, "grad_norm": 2.6913235187530518, "learning_rate": 2.814288597590771e-05, "loss": 0.2382, "step": 32575 }, { "epoch": 0.46750415877932655, "grad_norm": 2.7289280891418457, "learning_rate": 2.8138902462342517e-05, "loss": 0.2317, "step": 32600 }, { "epoch": 0.4678626742385132, "grad_norm": 3.2125329971313477, "learning_rate": 2.8134918948777325e-05, "loss": 0.1503, "step": 32625 }, { "epoch": 0.46822118969769977, "grad_norm": 11.985738754272461, "learning_rate": 2.8130935435212136e-05, "loss": 0.2651, "step": 32650 }, { "epoch": 0.46857970515688635, "grad_norm": 10.875306129455566, "learning_rate": 2.8126951921646944e-05, "loss": 0.3047, "step": 32675 }, { "epoch": 0.468938220616073, "grad_norm": 1.1827565431594849, "learning_rate": 2.8122968408081752e-05, "loss": 0.1705, "step": 32700 }, { "epoch": 0.46929673607525957, "grad_norm": 7.1098856925964355, "learning_rate": 2.811898489451656e-05, "loss": 0.2013, "step": 32725 }, { "epoch": 0.46965525153444615, "grad_norm": 8.226515769958496, "learning_rate": 2.8115001380951368e-05, "loss": 0.1657, "step": 32750 }, { "epoch": 0.4700137669936328, "grad_norm": 2.5474910736083984, "learning_rate": 2.811101786738618e-05, "loss": 0.1965, "step": 32775 }, { "epoch": 0.47037228245281937, "grad_norm": 18.06205177307129, "learning_rate": 2.8107034353820987e-05, "loss": 0.222, "step": 32800 }, { "epoch": 0.47073079791200595, "grad_norm": 21.308685302734375, "learning_rate": 2.8103050840255795e-05, "loss": 0.2897, "step": 32825 }, { "epoch": 0.4710893133711926, "grad_norm": 1.9117677211761475, "learning_rate": 2.8099067326690602e-05, "loss": 0.2414, "step": 32850 }, { "epoch": 0.47144782883037917, "grad_norm": 9.641486167907715, "learning_rate": 2.8095083813125414e-05, "loss": 0.3328, "step": 32875 }, { "epoch": 0.47180634428956575, "grad_norm": 16.303834915161133, "learning_rate": 2.809110029956022e-05, "loss": 0.2793, "step": 32900 }, { "epoch": 0.4721648597487524, "grad_norm": 1.0755335092544556, "learning_rate": 2.808711678599503e-05, "loss": 0.1759, "step": 32925 }, { "epoch": 0.47252337520793897, "grad_norm": 0.7177824378013611, "learning_rate": 2.8083133272429837e-05, "loss": 0.2974, "step": 32950 }, { "epoch": 0.47288189066712555, "grad_norm": 7.406129837036133, "learning_rate": 2.8079149758864645e-05, "loss": 0.2328, "step": 32975 }, { "epoch": 0.4732404061263122, "grad_norm": 19.843074798583984, "learning_rate": 2.8075166245299456e-05, "loss": 0.2337, "step": 33000 }, { "epoch": 0.47359892158549877, "grad_norm": 3.5449862480163574, "learning_rate": 2.8071182731734264e-05, "loss": 0.2026, "step": 33025 }, { "epoch": 0.47395743704468535, "grad_norm": 1.9308910369873047, "learning_rate": 2.8067199218169072e-05, "loss": 0.3076, "step": 33050 }, { "epoch": 0.474315952503872, "grad_norm": 11.784811019897461, "learning_rate": 2.806321570460388e-05, "loss": 0.1159, "step": 33075 }, { "epoch": 0.47467446796305857, "grad_norm": 5.729335784912109, "learning_rate": 2.8059232191038688e-05, "loss": 0.1968, "step": 33100 }, { "epoch": 0.47503298342224515, "grad_norm": 17.680511474609375, "learning_rate": 2.80552486774735e-05, "loss": 0.2132, "step": 33125 }, { "epoch": 0.4753914988814318, "grad_norm": 17.32459831237793, "learning_rate": 2.8051265163908307e-05, "loss": 0.1956, "step": 33150 }, { "epoch": 0.47575001434061837, "grad_norm": 9.75263786315918, "learning_rate": 2.8047281650343115e-05, "loss": 0.2037, "step": 33175 }, { "epoch": 0.47610852979980495, "grad_norm": 20.76472282409668, "learning_rate": 2.8043298136777922e-05, "loss": 0.2576, "step": 33200 }, { "epoch": 0.4764670452589916, "grad_norm": 23.14208221435547, "learning_rate": 2.803931462321273e-05, "loss": 0.2798, "step": 33225 }, { "epoch": 0.47682556071817817, "grad_norm": 0.1471686065196991, "learning_rate": 2.803533110964754e-05, "loss": 0.2366, "step": 33250 }, { "epoch": 0.47718407617736475, "grad_norm": 6.384776592254639, "learning_rate": 2.803134759608235e-05, "loss": 0.2217, "step": 33275 }, { "epoch": 0.4775425916365514, "grad_norm": 1.3578773736953735, "learning_rate": 2.8027364082517157e-05, "loss": 0.2671, "step": 33300 }, { "epoch": 0.47790110709573796, "grad_norm": 0.45463526248931885, "learning_rate": 2.8023380568951965e-05, "loss": 0.2547, "step": 33325 }, { "epoch": 0.47825962255492455, "grad_norm": 8.831303596496582, "learning_rate": 2.8019397055386773e-05, "loss": 0.3721, "step": 33350 }, { "epoch": 0.4786181380141112, "grad_norm": 1.9235517978668213, "learning_rate": 2.8015413541821584e-05, "loss": 0.143, "step": 33375 }, { "epoch": 0.47897665347329776, "grad_norm": 20.978734970092773, "learning_rate": 2.8011430028256392e-05, "loss": 0.1996, "step": 33400 }, { "epoch": 0.47933516893248435, "grad_norm": 8.120469093322754, "learning_rate": 2.80074465146912e-05, "loss": 0.2728, "step": 33425 }, { "epoch": 0.479693684391671, "grad_norm": 14.270025253295898, "learning_rate": 2.8003463001126008e-05, "loss": 0.251, "step": 33450 }, { "epoch": 0.48005219985085756, "grad_norm": 7.676972389221191, "learning_rate": 2.799947948756082e-05, "loss": 0.3019, "step": 33475 }, { "epoch": 0.48041071531004415, "grad_norm": 14.71060848236084, "learning_rate": 2.7995495973995627e-05, "loss": 0.2357, "step": 33500 }, { "epoch": 0.4807692307692308, "grad_norm": 1.874373197555542, "learning_rate": 2.7991512460430434e-05, "loss": 0.2305, "step": 33525 }, { "epoch": 0.48112774622841736, "grad_norm": 0.20414477586746216, "learning_rate": 2.7987528946865242e-05, "loss": 0.2541, "step": 33550 }, { "epoch": 0.48148626168760394, "grad_norm": 9.466683387756348, "learning_rate": 2.7983545433300047e-05, "loss": 0.3459, "step": 33575 }, { "epoch": 0.4818447771467906, "grad_norm": 12.013806343078613, "learning_rate": 2.7979561919734858e-05, "loss": 0.2126, "step": 33600 }, { "epoch": 0.48220329260597716, "grad_norm": 12.394521713256836, "learning_rate": 2.7975578406169666e-05, "loss": 0.2255, "step": 33625 }, { "epoch": 0.48256180806516374, "grad_norm": 6.102743148803711, "learning_rate": 2.7971594892604474e-05, "loss": 0.2357, "step": 33650 }, { "epoch": 0.4829203235243504, "grad_norm": 0.4947282671928406, "learning_rate": 2.796761137903928e-05, "loss": 0.2847, "step": 33675 }, { "epoch": 0.48327883898353696, "grad_norm": 10.919458389282227, "learning_rate": 2.796362786547409e-05, "loss": 0.2183, "step": 33700 }, { "epoch": 0.48363735444272354, "grad_norm": 4.149260997772217, "learning_rate": 2.79596443519089e-05, "loss": 0.2527, "step": 33725 }, { "epoch": 0.4839958699019102, "grad_norm": 2.8228206634521484, "learning_rate": 2.795566083834371e-05, "loss": 0.1148, "step": 33750 }, { "epoch": 0.48435438536109676, "grad_norm": 1.4640487432479858, "learning_rate": 2.7951677324778516e-05, "loss": 0.2562, "step": 33775 }, { "epoch": 0.48471290082028334, "grad_norm": 1.3206641674041748, "learning_rate": 2.7947693811213324e-05, "loss": 0.2809, "step": 33800 }, { "epoch": 0.48507141627947, "grad_norm": 7.776708602905273, "learning_rate": 2.7943710297648132e-05, "loss": 0.1263, "step": 33825 }, { "epoch": 0.48542993173865656, "grad_norm": 3.335771322250366, "learning_rate": 2.7939726784082943e-05, "loss": 0.2981, "step": 33850 }, { "epoch": 0.4857884471978432, "grad_norm": 5.987175464630127, "learning_rate": 2.793574327051775e-05, "loss": 0.2083, "step": 33875 }, { "epoch": 0.4861469626570298, "grad_norm": 2.937636137008667, "learning_rate": 2.793175975695256e-05, "loss": 0.1726, "step": 33900 }, { "epoch": 0.48650547811621636, "grad_norm": 13.34713363647461, "learning_rate": 2.7927776243387367e-05, "loss": 0.3092, "step": 33925 }, { "epoch": 0.486863993575403, "grad_norm": 8.459158897399902, "learning_rate": 2.7923792729822175e-05, "loss": 0.2446, "step": 33950 }, { "epoch": 0.4872225090345896, "grad_norm": 2.2844696044921875, "learning_rate": 2.7919809216256986e-05, "loss": 0.1465, "step": 33975 }, { "epoch": 0.48758102449377616, "grad_norm": 0.8025962710380554, "learning_rate": 2.7915825702691794e-05, "loss": 0.2196, "step": 34000 }, { "epoch": 0.4879395399529628, "grad_norm": 6.711642265319824, "learning_rate": 2.79118421891266e-05, "loss": 0.2602, "step": 34025 }, { "epoch": 0.4882980554121494, "grad_norm": 4.1984543800354, "learning_rate": 2.790785867556141e-05, "loss": 0.2422, "step": 34050 }, { "epoch": 0.48865657087133596, "grad_norm": 5.658032417297363, "learning_rate": 2.790387516199622e-05, "loss": 0.1939, "step": 34075 }, { "epoch": 0.4890150863305226, "grad_norm": 13.76767349243164, "learning_rate": 2.789989164843103e-05, "loss": 0.1959, "step": 34100 }, { "epoch": 0.4893736017897092, "grad_norm": 10.038883209228516, "learning_rate": 2.7895908134865836e-05, "loss": 0.3089, "step": 34125 }, { "epoch": 0.48973211724889576, "grad_norm": 2.460297107696533, "learning_rate": 2.7891924621300644e-05, "loss": 0.2103, "step": 34150 }, { "epoch": 0.4900906327080824, "grad_norm": 2.848870277404785, "learning_rate": 2.7887941107735452e-05, "loss": 0.1627, "step": 34175 }, { "epoch": 0.490449148167269, "grad_norm": 0.7190040946006775, "learning_rate": 2.7883957594170263e-05, "loss": 0.1355, "step": 34200 }, { "epoch": 0.49080766362645556, "grad_norm": 15.279474258422852, "learning_rate": 2.787997408060507e-05, "loss": 0.3183, "step": 34225 }, { "epoch": 0.4911661790856422, "grad_norm": 4.91668701171875, "learning_rate": 2.787599056703988e-05, "loss": 0.1682, "step": 34250 }, { "epoch": 0.4915246945448288, "grad_norm": 0.6645397543907166, "learning_rate": 2.7872007053474687e-05, "loss": 0.3339, "step": 34275 }, { "epoch": 0.49188321000401536, "grad_norm": 10.338302612304688, "learning_rate": 2.7868023539909495e-05, "loss": 0.281, "step": 34300 }, { "epoch": 0.492241725463202, "grad_norm": 1.5119012594223022, "learning_rate": 2.7864040026344306e-05, "loss": 0.1912, "step": 34325 }, { "epoch": 0.4926002409223886, "grad_norm": 15.585188865661621, "learning_rate": 2.7860056512779114e-05, "loss": 0.2265, "step": 34350 }, { "epoch": 0.49295875638157516, "grad_norm": 4.571041107177734, "learning_rate": 2.785607299921392e-05, "loss": 0.3308, "step": 34375 }, { "epoch": 0.4933172718407618, "grad_norm": 9.011298179626465, "learning_rate": 2.785208948564873e-05, "loss": 0.2829, "step": 34400 }, { "epoch": 0.4936757872999484, "grad_norm": 14.146049499511719, "learning_rate": 2.7848105972083537e-05, "loss": 0.267, "step": 34425 }, { "epoch": 0.49403430275913496, "grad_norm": 0.7606714963912964, "learning_rate": 2.784412245851835e-05, "loss": 0.2082, "step": 34450 }, { "epoch": 0.4943928182183216, "grad_norm": 6.570642471313477, "learning_rate": 2.7840138944953156e-05, "loss": 0.1534, "step": 34475 }, { "epoch": 0.4947513336775082, "grad_norm": 2.138979196548462, "learning_rate": 2.7836155431387964e-05, "loss": 0.2303, "step": 34500 }, { "epoch": 0.49510984913669476, "grad_norm": 8.250100135803223, "learning_rate": 2.7832171917822772e-05, "loss": 0.2858, "step": 34525 }, { "epoch": 0.4954683645958814, "grad_norm": 17.637052536010742, "learning_rate": 2.782818840425758e-05, "loss": 0.2483, "step": 34550 }, { "epoch": 0.495826880055068, "grad_norm": 0.08030881732702255, "learning_rate": 2.782420489069239e-05, "loss": 0.2824, "step": 34575 }, { "epoch": 0.49618539551425456, "grad_norm": 0.341288298368454, "learning_rate": 2.78202213771272e-05, "loss": 0.2925, "step": 34600 }, { "epoch": 0.4965439109734412, "grad_norm": 1.7808901071548462, "learning_rate": 2.7816237863562007e-05, "loss": 0.2298, "step": 34625 }, { "epoch": 0.4969024264326278, "grad_norm": 0.6156447529792786, "learning_rate": 2.7812254349996815e-05, "loss": 0.2501, "step": 34650 }, { "epoch": 0.49726094189181436, "grad_norm": 0.5584651827812195, "learning_rate": 2.780827083643162e-05, "loss": 0.1344, "step": 34675 }, { "epoch": 0.497619457351001, "grad_norm": 10.046646118164062, "learning_rate": 2.780428732286643e-05, "loss": 0.2065, "step": 34700 }, { "epoch": 0.4979779728101876, "grad_norm": 0.3001100718975067, "learning_rate": 2.7800303809301238e-05, "loss": 0.2025, "step": 34725 }, { "epoch": 0.49833648826937416, "grad_norm": 13.529801368713379, "learning_rate": 2.7796320295736046e-05, "loss": 0.327, "step": 34750 }, { "epoch": 0.4986950037285608, "grad_norm": 15.23560905456543, "learning_rate": 2.7792336782170854e-05, "loss": 0.1639, "step": 34775 }, { "epoch": 0.4990535191877474, "grad_norm": 3.614776372909546, "learning_rate": 2.7788353268605665e-05, "loss": 0.3468, "step": 34800 }, { "epoch": 0.49941203464693396, "grad_norm": 13.723474502563477, "learning_rate": 2.7784369755040473e-05, "loss": 0.1988, "step": 34825 }, { "epoch": 0.4997705501061206, "grad_norm": 3.8036789894104004, "learning_rate": 2.778038624147528e-05, "loss": 0.2282, "step": 34850 }, { "epoch": 0.5001290655653072, "grad_norm": 5.1397480964660645, "learning_rate": 2.777640272791009e-05, "loss": 0.3274, "step": 34875 }, { "epoch": 0.5004875810244938, "grad_norm": 2.662478446960449, "learning_rate": 2.7772419214344896e-05, "loss": 0.2214, "step": 34900 }, { "epoch": 0.5008460964836804, "grad_norm": 16.00400161743164, "learning_rate": 2.7768435700779708e-05, "loss": 0.279, "step": 34925 }, { "epoch": 0.501204611942867, "grad_norm": 3.3635270595550537, "learning_rate": 2.7764452187214516e-05, "loss": 0.3226, "step": 34950 }, { "epoch": 0.5015631274020536, "grad_norm": 18.379091262817383, "learning_rate": 2.7760468673649323e-05, "loss": 0.3616, "step": 34975 }, { "epoch": 0.5019216428612402, "grad_norm": 16.630626678466797, "learning_rate": 2.775648516008413e-05, "loss": 0.1857, "step": 35000 }, { "epoch": 0.5022801583204268, "grad_norm": 1.1194167137145996, "learning_rate": 2.775250164651894e-05, "loss": 0.2472, "step": 35025 }, { "epoch": 0.5026386737796134, "grad_norm": 1.0259050130844116, "learning_rate": 2.774851813295375e-05, "loss": 0.2598, "step": 35050 }, { "epoch": 0.5029971892388, "grad_norm": 0.6390739679336548, "learning_rate": 2.7744534619388558e-05, "loss": 0.2562, "step": 35075 }, { "epoch": 0.5033557046979866, "grad_norm": 17.523521423339844, "learning_rate": 2.7740551105823366e-05, "loss": 0.3204, "step": 35100 }, { "epoch": 0.5037142201571732, "grad_norm": 15.727463722229004, "learning_rate": 2.7736567592258174e-05, "loss": 0.2179, "step": 35125 }, { "epoch": 0.5040727356163598, "grad_norm": 1.643644094467163, "learning_rate": 2.773258407869298e-05, "loss": 0.2068, "step": 35150 }, { "epoch": 0.5044312510755464, "grad_norm": 25.896984100341797, "learning_rate": 2.7728600565127793e-05, "loss": 0.3016, "step": 35175 }, { "epoch": 0.504789766534733, "grad_norm": 3.2422034740448, "learning_rate": 2.77246170515626e-05, "loss": 0.2323, "step": 35200 }, { "epoch": 0.5051482819939196, "grad_norm": 0.8023493885993958, "learning_rate": 2.772063353799741e-05, "loss": 0.315, "step": 35225 }, { "epoch": 0.5055067974531062, "grad_norm": 10.09537410736084, "learning_rate": 2.7716650024432216e-05, "loss": 0.185, "step": 35250 }, { "epoch": 0.5058653129122928, "grad_norm": 20.15782356262207, "learning_rate": 2.7712666510867024e-05, "loss": 0.3338, "step": 35275 }, { "epoch": 0.5062238283714794, "grad_norm": 3.1614527702331543, "learning_rate": 2.7708682997301836e-05, "loss": 0.2468, "step": 35300 }, { "epoch": 0.506582343830666, "grad_norm": 12.637556076049805, "learning_rate": 2.7704699483736643e-05, "loss": 0.3539, "step": 35325 }, { "epoch": 0.5069408592898526, "grad_norm": 8.19939136505127, "learning_rate": 2.770071597017145e-05, "loss": 0.2023, "step": 35350 }, { "epoch": 0.5072993747490392, "grad_norm": 15.55248737335205, "learning_rate": 2.769673245660626e-05, "loss": 0.2398, "step": 35375 }, { "epoch": 0.5076578902082258, "grad_norm": 12.76515007019043, "learning_rate": 2.769274894304107e-05, "loss": 0.1749, "step": 35400 }, { "epoch": 0.5080164056674124, "grad_norm": 7.83133602142334, "learning_rate": 2.7688765429475878e-05, "loss": 0.4012, "step": 35425 }, { "epoch": 0.508374921126599, "grad_norm": 1.5073964595794678, "learning_rate": 2.7684781915910686e-05, "loss": 0.1652, "step": 35450 }, { "epoch": 0.5087334365857856, "grad_norm": 10.083446502685547, "learning_rate": 2.7680798402345494e-05, "loss": 0.234, "step": 35475 }, { "epoch": 0.5090919520449722, "grad_norm": 13.385431289672852, "learning_rate": 2.76768148887803e-05, "loss": 0.1646, "step": 35500 }, { "epoch": 0.5094504675041588, "grad_norm": 14.098177909851074, "learning_rate": 2.7672831375215113e-05, "loss": 0.2471, "step": 35525 }, { "epoch": 0.5098089829633454, "grad_norm": 8.186561584472656, "learning_rate": 2.766884786164992e-05, "loss": 0.2863, "step": 35550 }, { "epoch": 0.510167498422532, "grad_norm": 2.7958168983459473, "learning_rate": 2.766486434808473e-05, "loss": 0.1645, "step": 35575 }, { "epoch": 0.5105260138817186, "grad_norm": 4.040065288543701, "learning_rate": 2.7660880834519536e-05, "loss": 0.2034, "step": 35600 }, { "epoch": 0.5108845293409052, "grad_norm": 15.058971405029297, "learning_rate": 2.7656897320954344e-05, "loss": 0.2268, "step": 35625 }, { "epoch": 0.5112430448000918, "grad_norm": 0.9527122378349304, "learning_rate": 2.7652913807389155e-05, "loss": 0.2207, "step": 35650 }, { "epoch": 0.5116015602592784, "grad_norm": 1.382511019706726, "learning_rate": 2.7648930293823963e-05, "loss": 0.2836, "step": 35675 }, { "epoch": 0.511960075718465, "grad_norm": 19.233348846435547, "learning_rate": 2.764494678025877e-05, "loss": 0.1659, "step": 35700 }, { "epoch": 0.5123185911776515, "grad_norm": 2.0223002433776855, "learning_rate": 2.764096326669358e-05, "loss": 0.2792, "step": 35725 }, { "epoch": 0.5126771066368382, "grad_norm": 14.277868270874023, "learning_rate": 2.7636979753128387e-05, "loss": 0.3329, "step": 35750 }, { "epoch": 0.5130356220960248, "grad_norm": 1.7121427059173584, "learning_rate": 2.7632996239563195e-05, "loss": 0.1547, "step": 35775 }, { "epoch": 0.5133941375552113, "grad_norm": 5.457465648651123, "learning_rate": 2.7629012725998003e-05, "loss": 0.3134, "step": 35800 }, { "epoch": 0.513752653014398, "grad_norm": 5.788388252258301, "learning_rate": 2.762502921243281e-05, "loss": 0.1232, "step": 35825 }, { "epoch": 0.5141111684735846, "grad_norm": 4.5207743644714355, "learning_rate": 2.7621045698867618e-05, "loss": 0.1443, "step": 35850 }, { "epoch": 0.5144696839327711, "grad_norm": 12.488065719604492, "learning_rate": 2.7617062185302426e-05, "loss": 0.2672, "step": 35875 }, { "epoch": 0.5148281993919578, "grad_norm": 1.4787455797195435, "learning_rate": 2.7613078671737237e-05, "loss": 0.2246, "step": 35900 }, { "epoch": 0.5151867148511444, "grad_norm": 26.564054489135742, "learning_rate": 2.7609095158172045e-05, "loss": 0.2694, "step": 35925 }, { "epoch": 0.515545230310331, "grad_norm": 20.564023971557617, "learning_rate": 2.7605111644606853e-05, "loss": 0.204, "step": 35950 }, { "epoch": 0.5159037457695176, "grad_norm": 1.057470440864563, "learning_rate": 2.760112813104166e-05, "loss": 0.2626, "step": 35975 }, { "epoch": 0.5162622612287042, "grad_norm": 1.8665368556976318, "learning_rate": 2.7597144617476472e-05, "loss": 0.2247, "step": 36000 }, { "epoch": 0.5166207766878907, "grad_norm": 3.931948661804199, "learning_rate": 2.759316110391128e-05, "loss": 0.1619, "step": 36025 }, { "epoch": 0.5169792921470774, "grad_norm": 0.29979410767555237, "learning_rate": 2.7589177590346088e-05, "loss": 0.2514, "step": 36050 }, { "epoch": 0.517337807606264, "grad_norm": 4.0568156242370605, "learning_rate": 2.7585194076780896e-05, "loss": 0.1947, "step": 36075 }, { "epoch": 0.5176963230654505, "grad_norm": 7.377418041229248, "learning_rate": 2.7581210563215703e-05, "loss": 0.2701, "step": 36100 }, { "epoch": 0.5180548385246372, "grad_norm": 0.23850566148757935, "learning_rate": 2.7577227049650515e-05, "loss": 0.2825, "step": 36125 }, { "epoch": 0.5184133539838238, "grad_norm": 3.5266919136047363, "learning_rate": 2.7573243536085323e-05, "loss": 0.2724, "step": 36150 }, { "epoch": 0.5187718694430103, "grad_norm": 18.454875946044922, "learning_rate": 2.756926002252013e-05, "loss": 0.2415, "step": 36175 }, { "epoch": 0.519130384902197, "grad_norm": 4.707977771759033, "learning_rate": 2.7565276508954938e-05, "loss": 0.1895, "step": 36200 }, { "epoch": 0.5194889003613836, "grad_norm": 8.522283554077148, "learning_rate": 2.7561292995389746e-05, "loss": 0.3241, "step": 36225 }, { "epoch": 0.5198474158205701, "grad_norm": 11.99328899383545, "learning_rate": 2.7557309481824557e-05, "loss": 0.1937, "step": 36250 }, { "epoch": 0.5202059312797568, "grad_norm": 11.806272506713867, "learning_rate": 2.7553325968259365e-05, "loss": 0.2414, "step": 36275 }, { "epoch": 0.5205644467389434, "grad_norm": 10.39821720123291, "learning_rate": 2.7549342454694173e-05, "loss": 0.1997, "step": 36300 }, { "epoch": 0.52092296219813, "grad_norm": 3.264526844024658, "learning_rate": 2.754535894112898e-05, "loss": 0.2846, "step": 36325 }, { "epoch": 0.5212814776573166, "grad_norm": 0.7907812595367432, "learning_rate": 2.754137542756379e-05, "loss": 0.1799, "step": 36350 }, { "epoch": 0.5216399931165032, "grad_norm": 6.159631252288818, "learning_rate": 2.75373919139986e-05, "loss": 0.2657, "step": 36375 }, { "epoch": 0.5219985085756897, "grad_norm": 18.933534622192383, "learning_rate": 2.7533408400433408e-05, "loss": 0.1869, "step": 36400 }, { "epoch": 0.5223570240348764, "grad_norm": 9.141243934631348, "learning_rate": 2.7529424886868216e-05, "loss": 0.3183, "step": 36425 }, { "epoch": 0.522715539494063, "grad_norm": 5.131206512451172, "learning_rate": 2.7525441373303023e-05, "loss": 0.2793, "step": 36450 }, { "epoch": 0.5230740549532495, "grad_norm": 9.606364250183105, "learning_rate": 2.752145785973783e-05, "loss": 0.2668, "step": 36475 }, { "epoch": 0.5234325704124362, "grad_norm": 2.8381831645965576, "learning_rate": 2.7517474346172643e-05, "loss": 0.3261, "step": 36500 }, { "epoch": 0.5237910858716228, "grad_norm": 0.34590116143226624, "learning_rate": 2.751349083260745e-05, "loss": 0.2264, "step": 36525 }, { "epoch": 0.5241496013308093, "grad_norm": 0.9246371388435364, "learning_rate": 2.7509507319042258e-05, "loss": 0.2209, "step": 36550 }, { "epoch": 0.524508116789996, "grad_norm": 3.798309803009033, "learning_rate": 2.7505523805477066e-05, "loss": 0.2459, "step": 36575 }, { "epoch": 0.5248666322491826, "grad_norm": 8.86894416809082, "learning_rate": 2.7501540291911877e-05, "loss": 0.1875, "step": 36600 }, { "epoch": 0.5252251477083691, "grad_norm": 10.978509902954102, "learning_rate": 2.7497556778346685e-05, "loss": 0.2049, "step": 36625 }, { "epoch": 0.5255836631675558, "grad_norm": 7.059563636779785, "learning_rate": 2.7493573264781493e-05, "loss": 0.1967, "step": 36650 }, { "epoch": 0.5259421786267424, "grad_norm": 13.936517715454102, "learning_rate": 2.74895897512163e-05, "loss": 0.2931, "step": 36675 }, { "epoch": 0.5263006940859289, "grad_norm": 2.0142722129821777, "learning_rate": 2.748560623765111e-05, "loss": 0.4147, "step": 36700 }, { "epoch": 0.5266592095451156, "grad_norm": 2.803708076477051, "learning_rate": 2.748162272408592e-05, "loss": 0.1773, "step": 36725 }, { "epoch": 0.5270177250043022, "grad_norm": 0.5066791772842407, "learning_rate": 2.7477639210520728e-05, "loss": 0.1743, "step": 36750 }, { "epoch": 0.5273762404634887, "grad_norm": 1.863049864768982, "learning_rate": 2.7473655696955536e-05, "loss": 0.1745, "step": 36775 }, { "epoch": 0.5277347559226754, "grad_norm": 0.8621711134910583, "learning_rate": 2.7469672183390343e-05, "loss": 0.251, "step": 36800 }, { "epoch": 0.528093271381862, "grad_norm": 11.432084083557129, "learning_rate": 2.746568866982515e-05, "loss": 0.2062, "step": 36825 }, { "epoch": 0.5284517868410485, "grad_norm": 1.5091745853424072, "learning_rate": 2.746170515625996e-05, "loss": 0.2247, "step": 36850 }, { "epoch": 0.5288103023002352, "grad_norm": 16.019432067871094, "learning_rate": 2.7457721642694767e-05, "loss": 0.2666, "step": 36875 }, { "epoch": 0.5291688177594218, "grad_norm": 14.189940452575684, "learning_rate": 2.7453738129129575e-05, "loss": 0.2197, "step": 36900 }, { "epoch": 0.5295273332186083, "grad_norm": 19.721012115478516, "learning_rate": 2.7449754615564383e-05, "loss": 0.2535, "step": 36925 }, { "epoch": 0.529885848677795, "grad_norm": 12.040104866027832, "learning_rate": 2.744577110199919e-05, "loss": 0.2921, "step": 36950 }, { "epoch": 0.5302443641369816, "grad_norm": 10.050649642944336, "learning_rate": 2.7441787588434002e-05, "loss": 0.2651, "step": 36975 }, { "epoch": 0.5306028795961681, "grad_norm": 20.47180938720703, "learning_rate": 2.743780407486881e-05, "loss": 0.2994, "step": 37000 }, { "epoch": 0.5309613950553548, "grad_norm": 4.892871856689453, "learning_rate": 2.7433820561303617e-05, "loss": 0.2878, "step": 37025 }, { "epoch": 0.5313199105145414, "grad_norm": 5.08312463760376, "learning_rate": 2.7429837047738425e-05, "loss": 0.19, "step": 37050 }, { "epoch": 0.5316784259737279, "grad_norm": 13.955199241638184, "learning_rate": 2.7425853534173233e-05, "loss": 0.238, "step": 37075 }, { "epoch": 0.5320369414329146, "grad_norm": 15.63149356842041, "learning_rate": 2.7421870020608044e-05, "loss": 0.407, "step": 37100 }, { "epoch": 0.5323954568921012, "grad_norm": 3.493037462234497, "learning_rate": 2.7417886507042852e-05, "loss": 0.2931, "step": 37125 }, { "epoch": 0.5327539723512877, "grad_norm": 4.866361141204834, "learning_rate": 2.741390299347766e-05, "loss": 0.2007, "step": 37150 }, { "epoch": 0.5331124878104744, "grad_norm": 11.814473152160645, "learning_rate": 2.7409919479912468e-05, "loss": 0.2853, "step": 37175 }, { "epoch": 0.533471003269661, "grad_norm": 20.528804779052734, "learning_rate": 2.740593596634728e-05, "loss": 0.3803, "step": 37200 }, { "epoch": 0.5338295187288475, "grad_norm": 1.6462552547454834, "learning_rate": 2.7401952452782087e-05, "loss": 0.1927, "step": 37225 }, { "epoch": 0.5341880341880342, "grad_norm": 21.627622604370117, "learning_rate": 2.7397968939216895e-05, "loss": 0.3141, "step": 37250 }, { "epoch": 0.5345465496472208, "grad_norm": 2.7400100231170654, "learning_rate": 2.7393985425651703e-05, "loss": 0.2957, "step": 37275 }, { "epoch": 0.5349050651064073, "grad_norm": 0.3966071307659149, "learning_rate": 2.739000191208651e-05, "loss": 0.1904, "step": 37300 }, { "epoch": 0.535263580565594, "grad_norm": 9.561092376708984, "learning_rate": 2.7386018398521322e-05, "loss": 0.229, "step": 37325 }, { "epoch": 0.5356220960247806, "grad_norm": 10.721622467041016, "learning_rate": 2.738203488495613e-05, "loss": 0.3962, "step": 37350 }, { "epoch": 0.5359806114839671, "grad_norm": 0.1404716819524765, "learning_rate": 2.7378051371390937e-05, "loss": 0.1439, "step": 37375 }, { "epoch": 0.5363391269431538, "grad_norm": 2.852938652038574, "learning_rate": 2.7374067857825745e-05, "loss": 0.1843, "step": 37400 }, { "epoch": 0.5366976424023404, "grad_norm": 5.369728088378906, "learning_rate": 2.7370084344260553e-05, "loss": 0.1671, "step": 37425 }, { "epoch": 0.5370561578615269, "grad_norm": 0.1455700397491455, "learning_rate": 2.7366100830695364e-05, "loss": 0.2241, "step": 37450 }, { "epoch": 0.5374146733207136, "grad_norm": 5.579031467437744, "learning_rate": 2.7362117317130172e-05, "loss": 0.1644, "step": 37475 }, { "epoch": 0.5377731887799002, "grad_norm": 0.796127200126648, "learning_rate": 2.735813380356498e-05, "loss": 0.1955, "step": 37500 }, { "epoch": 0.5381317042390867, "grad_norm": 9.284302711486816, "learning_rate": 2.7354150289999788e-05, "loss": 0.1843, "step": 37525 }, { "epoch": 0.5384902196982734, "grad_norm": 12.303901672363281, "learning_rate": 2.7350166776434596e-05, "loss": 0.3035, "step": 37550 }, { "epoch": 0.53884873515746, "grad_norm": 7.945218563079834, "learning_rate": 2.7346183262869407e-05, "loss": 0.2878, "step": 37575 }, { "epoch": 0.5392072506166465, "grad_norm": 6.415719985961914, "learning_rate": 2.7342199749304215e-05, "loss": 0.185, "step": 37600 }, { "epoch": 0.5395657660758332, "grad_norm": 0.7418746948242188, "learning_rate": 2.7338216235739023e-05, "loss": 0.1876, "step": 37625 }, { "epoch": 0.5399242815350198, "grad_norm": 0.041772570461034775, "learning_rate": 2.733423272217383e-05, "loss": 0.2121, "step": 37650 }, { "epoch": 0.5402827969942063, "grad_norm": 13.055379867553711, "learning_rate": 2.733024920860864e-05, "loss": 0.3171, "step": 37675 }, { "epoch": 0.540641312453393, "grad_norm": 0.37502947449684143, "learning_rate": 2.732626569504345e-05, "loss": 0.0984, "step": 37700 }, { "epoch": 0.5409998279125796, "grad_norm": 10.84543514251709, "learning_rate": 2.7322282181478257e-05, "loss": 0.2143, "step": 37725 }, { "epoch": 0.5413583433717661, "grad_norm": 11.705179214477539, "learning_rate": 2.7318298667913065e-05, "loss": 0.2194, "step": 37750 }, { "epoch": 0.5417168588309528, "grad_norm": 30.656177520751953, "learning_rate": 2.7314315154347873e-05, "loss": 0.1811, "step": 37775 }, { "epoch": 0.5420753742901394, "grad_norm": 7.601302146911621, "learning_rate": 2.7310331640782684e-05, "loss": 0.0963, "step": 37800 }, { "epoch": 0.5424338897493259, "grad_norm": 14.387463569641113, "learning_rate": 2.7306348127217492e-05, "loss": 0.237, "step": 37825 }, { "epoch": 0.5427924052085126, "grad_norm": 14.89864444732666, "learning_rate": 2.73023646136523e-05, "loss": 0.2631, "step": 37850 }, { "epoch": 0.5431509206676992, "grad_norm": 1.2303119897842407, "learning_rate": 2.7298381100087108e-05, "loss": 0.1597, "step": 37875 }, { "epoch": 0.5435094361268857, "grad_norm": 15.770167350769043, "learning_rate": 2.7294397586521916e-05, "loss": 0.217, "step": 37900 }, { "epoch": 0.5438679515860724, "grad_norm": 22.222089767456055, "learning_rate": 2.7290414072956727e-05, "loss": 0.2105, "step": 37925 }, { "epoch": 0.544226467045259, "grad_norm": 1.702053427696228, "learning_rate": 2.728643055939153e-05, "loss": 0.1688, "step": 37950 }, { "epoch": 0.5445849825044456, "grad_norm": 5.285175800323486, "learning_rate": 2.728244704582634e-05, "loss": 0.1971, "step": 37975 }, { "epoch": 0.5449434979636322, "grad_norm": 4.947132587432861, "learning_rate": 2.7278463532261147e-05, "loss": 0.1606, "step": 38000 }, { "epoch": 0.5453020134228188, "grad_norm": 14.692900657653809, "learning_rate": 2.7274480018695955e-05, "loss": 0.2008, "step": 38025 }, { "epoch": 0.5456605288820054, "grad_norm": 8.7791748046875, "learning_rate": 2.7270496505130766e-05, "loss": 0.2913, "step": 38050 }, { "epoch": 0.546019044341192, "grad_norm": 12.843642234802246, "learning_rate": 2.7266512991565574e-05, "loss": 0.2485, "step": 38075 }, { "epoch": 0.5463775598003786, "grad_norm": 8.609908103942871, "learning_rate": 2.7262529478000382e-05, "loss": 0.2177, "step": 38100 }, { "epoch": 0.5467360752595652, "grad_norm": 19.18729019165039, "learning_rate": 2.725854596443519e-05, "loss": 0.1564, "step": 38125 }, { "epoch": 0.5470945907187518, "grad_norm": 14.78596305847168, "learning_rate": 2.7254562450869998e-05, "loss": 0.2732, "step": 38150 }, { "epoch": 0.5474531061779384, "grad_norm": 0.2471461296081543, "learning_rate": 2.725057893730481e-05, "loss": 0.2172, "step": 38175 }, { "epoch": 0.547811621637125, "grad_norm": 0.8321511149406433, "learning_rate": 2.7246595423739617e-05, "loss": 0.1978, "step": 38200 }, { "epoch": 0.5481701370963116, "grad_norm": 12.287365913391113, "learning_rate": 2.7242611910174424e-05, "loss": 0.2861, "step": 38225 }, { "epoch": 0.5485286525554982, "grad_norm": 6.737407684326172, "learning_rate": 2.7238628396609232e-05, "loss": 0.1294, "step": 38250 }, { "epoch": 0.5488871680146848, "grad_norm": 13.045625686645508, "learning_rate": 2.723464488304404e-05, "loss": 0.1954, "step": 38275 }, { "epoch": 0.5492456834738714, "grad_norm": 6.699998378753662, "learning_rate": 2.723066136947885e-05, "loss": 0.2524, "step": 38300 }, { "epoch": 0.549604198933058, "grad_norm": 9.072937965393066, "learning_rate": 2.722667785591366e-05, "loss": 0.1179, "step": 38325 }, { "epoch": 0.5499627143922446, "grad_norm": 11.922216415405273, "learning_rate": 2.7222694342348467e-05, "loss": 0.2212, "step": 38350 }, { "epoch": 0.5503212298514312, "grad_norm": 12.259234428405762, "learning_rate": 2.7218710828783275e-05, "loss": 0.1853, "step": 38375 }, { "epoch": 0.5506797453106178, "grad_norm": 4.862159729003906, "learning_rate": 2.7214727315218086e-05, "loss": 0.1412, "step": 38400 }, { "epoch": 0.5510382607698044, "grad_norm": 15.529319763183594, "learning_rate": 2.7210743801652894e-05, "loss": 0.2567, "step": 38425 }, { "epoch": 0.551396776228991, "grad_norm": 7.146137237548828, "learning_rate": 2.7206760288087702e-05, "loss": 0.2255, "step": 38450 }, { "epoch": 0.5517552916881776, "grad_norm": 0.37043872475624084, "learning_rate": 2.720277677452251e-05, "loss": 0.194, "step": 38475 }, { "epoch": 0.5521138071473642, "grad_norm": 6.226984024047852, "learning_rate": 2.7198793260957318e-05, "loss": 0.1759, "step": 38500 }, { "epoch": 0.5524723226065508, "grad_norm": 16.599164962768555, "learning_rate": 2.719480974739213e-05, "loss": 0.1966, "step": 38525 }, { "epoch": 0.5528308380657374, "grad_norm": 1.5448722839355469, "learning_rate": 2.7190826233826937e-05, "loss": 0.1984, "step": 38550 }, { "epoch": 0.553189353524924, "grad_norm": 3.2365365028381348, "learning_rate": 2.7186842720261744e-05, "loss": 0.2213, "step": 38575 }, { "epoch": 0.5535478689841106, "grad_norm": 20.75689697265625, "learning_rate": 2.7182859206696552e-05, "loss": 0.2386, "step": 38600 }, { "epoch": 0.5539063844432972, "grad_norm": 4.038875102996826, "learning_rate": 2.717887569313136e-05, "loss": 0.1906, "step": 38625 }, { "epoch": 0.5542648999024838, "grad_norm": 4.491791725158691, "learning_rate": 2.717489217956617e-05, "loss": 0.2538, "step": 38650 }, { "epoch": 0.5546234153616704, "grad_norm": 0.6023632884025574, "learning_rate": 2.717090866600098e-05, "loss": 0.2048, "step": 38675 }, { "epoch": 0.554981930820857, "grad_norm": 7.438556671142578, "learning_rate": 2.7166925152435787e-05, "loss": 0.2286, "step": 38700 }, { "epoch": 0.5553404462800436, "grad_norm": 7.753760814666748, "learning_rate": 2.7162941638870595e-05, "loss": 0.2484, "step": 38725 }, { "epoch": 0.5556989617392302, "grad_norm": 18.092653274536133, "learning_rate": 2.7158958125305403e-05, "loss": 0.2105, "step": 38750 }, { "epoch": 0.5560574771984168, "grad_norm": 11.810647010803223, "learning_rate": 2.7154974611740214e-05, "loss": 0.1885, "step": 38775 }, { "epoch": 0.5564159926576034, "grad_norm": 22.321054458618164, "learning_rate": 2.7150991098175022e-05, "loss": 0.2097, "step": 38800 }, { "epoch": 0.55677450811679, "grad_norm": 2.0237135887145996, "learning_rate": 2.714700758460983e-05, "loss": 0.3116, "step": 38825 }, { "epoch": 0.5571330235759766, "grad_norm": 12.399508476257324, "learning_rate": 2.7143024071044638e-05, "loss": 0.194, "step": 38850 }, { "epoch": 0.5574915390351632, "grad_norm": 6.981904029846191, "learning_rate": 2.7139040557479445e-05, "loss": 0.1011, "step": 38875 }, { "epoch": 0.5578500544943498, "grad_norm": 0.16215361654758453, "learning_rate": 2.7135057043914257e-05, "loss": 0.283, "step": 38900 }, { "epoch": 0.5582085699535364, "grad_norm": 12.985486030578613, "learning_rate": 2.7131073530349064e-05, "loss": 0.1838, "step": 38925 }, { "epoch": 0.558567085412723, "grad_norm": 20.621118545532227, "learning_rate": 2.7127090016783872e-05, "loss": 0.2876, "step": 38950 }, { "epoch": 0.5589256008719096, "grad_norm": 5.957789897918701, "learning_rate": 2.712310650321868e-05, "loss": 0.1539, "step": 38975 }, { "epoch": 0.5592841163310962, "grad_norm": 5.0453572273254395, "learning_rate": 2.7119122989653488e-05, "loss": 0.2355, "step": 39000 }, { "epoch": 0.5596426317902828, "grad_norm": 9.237494468688965, "learning_rate": 2.71151394760883e-05, "loss": 0.24, "step": 39025 }, { "epoch": 0.5600011472494694, "grad_norm": 7.05992317199707, "learning_rate": 2.7111155962523104e-05, "loss": 0.1762, "step": 39050 }, { "epoch": 0.560359662708656, "grad_norm": 4.268676280975342, "learning_rate": 2.710717244895791e-05, "loss": 0.2216, "step": 39075 }, { "epoch": 0.5607181781678426, "grad_norm": 16.140607833862305, "learning_rate": 2.710318893539272e-05, "loss": 0.2049, "step": 39100 }, { "epoch": 0.5610766936270292, "grad_norm": 0.05154997482895851, "learning_rate": 2.709920542182753e-05, "loss": 0.2423, "step": 39125 }, { "epoch": 0.5614352090862158, "grad_norm": 1.711741328239441, "learning_rate": 2.709522190826234e-05, "loss": 0.1813, "step": 39150 }, { "epoch": 0.5617937245454024, "grad_norm": 0.45725691318511963, "learning_rate": 2.7091238394697146e-05, "loss": 0.1431, "step": 39175 }, { "epoch": 0.562152240004589, "grad_norm": 18.095020294189453, "learning_rate": 2.7087254881131954e-05, "loss": 0.2025, "step": 39200 }, { "epoch": 0.5625107554637756, "grad_norm": 10.873886108398438, "learning_rate": 2.7083271367566762e-05, "loss": 0.2872, "step": 39225 }, { "epoch": 0.5628692709229622, "grad_norm": 1.156237006187439, "learning_rate": 2.7079287854001573e-05, "loss": 0.2171, "step": 39250 }, { "epoch": 0.5632277863821488, "grad_norm": 6.358883380889893, "learning_rate": 2.707530434043638e-05, "loss": 0.1824, "step": 39275 }, { "epoch": 0.5635863018413354, "grad_norm": 6.956238746643066, "learning_rate": 2.707132082687119e-05, "loss": 0.2533, "step": 39300 }, { "epoch": 0.563944817300522, "grad_norm": 2.258441209793091, "learning_rate": 2.7067337313305997e-05, "loss": 0.2084, "step": 39325 }, { "epoch": 0.5643033327597086, "grad_norm": 14.09233283996582, "learning_rate": 2.7063353799740805e-05, "loss": 0.1097, "step": 39350 }, { "epoch": 0.5646618482188952, "grad_norm": 16.077667236328125, "learning_rate": 2.7059370286175616e-05, "loss": 0.1291, "step": 39375 }, { "epoch": 0.5650203636780818, "grad_norm": 2.025662422180176, "learning_rate": 2.7055386772610424e-05, "loss": 0.2358, "step": 39400 }, { "epoch": 0.5653788791372684, "grad_norm": 16.836368560791016, "learning_rate": 2.705140325904523e-05, "loss": 0.3161, "step": 39425 }, { "epoch": 0.565737394596455, "grad_norm": 0.6035826802253723, "learning_rate": 2.704741974548004e-05, "loss": 0.138, "step": 39450 }, { "epoch": 0.5660959100556416, "grad_norm": 20.41454315185547, "learning_rate": 2.7043436231914847e-05, "loss": 0.214, "step": 39475 }, { "epoch": 0.5664544255148282, "grad_norm": 12.881443977355957, "learning_rate": 2.703945271834966e-05, "loss": 0.2042, "step": 39500 }, { "epoch": 0.5668129409740148, "grad_norm": 12.31697940826416, "learning_rate": 2.7035469204784466e-05, "loss": 0.3043, "step": 39525 }, { "epoch": 0.5671714564332014, "grad_norm": 7.221100330352783, "learning_rate": 2.7031485691219274e-05, "loss": 0.2052, "step": 39550 }, { "epoch": 0.567529971892388, "grad_norm": 13.85973072052002, "learning_rate": 2.7027502177654082e-05, "loss": 0.2087, "step": 39575 }, { "epoch": 0.5678884873515746, "grad_norm": 8.422651290893555, "learning_rate": 2.702351866408889e-05, "loss": 0.2246, "step": 39600 }, { "epoch": 0.5682470028107612, "grad_norm": 0.28828299045562744, "learning_rate": 2.70195351505237e-05, "loss": 0.1688, "step": 39625 }, { "epoch": 0.5686055182699478, "grad_norm": 4.116021156311035, "learning_rate": 2.701555163695851e-05, "loss": 0.1956, "step": 39650 }, { "epoch": 0.5689640337291344, "grad_norm": 17.01022720336914, "learning_rate": 2.7011568123393317e-05, "loss": 0.1479, "step": 39675 }, { "epoch": 0.569322549188321, "grad_norm": 0.3061140477657318, "learning_rate": 2.7007584609828125e-05, "loss": 0.1894, "step": 39700 }, { "epoch": 0.5696810646475076, "grad_norm": 11.714567184448242, "learning_rate": 2.7003601096262936e-05, "loss": 0.197, "step": 39725 }, { "epoch": 0.5700395801066942, "grad_norm": 11.871502876281738, "learning_rate": 2.6999617582697744e-05, "loss": 0.2858, "step": 39750 }, { "epoch": 0.5703980955658808, "grad_norm": 24.194719314575195, "learning_rate": 2.699563406913255e-05, "loss": 0.2881, "step": 39775 }, { "epoch": 0.5707566110250674, "grad_norm": 2.652844190597534, "learning_rate": 2.699165055556736e-05, "loss": 0.1938, "step": 39800 }, { "epoch": 0.571115126484254, "grad_norm": 0.4611770808696747, "learning_rate": 2.6987667042002167e-05, "loss": 0.1609, "step": 39825 }, { "epoch": 0.5714736419434406, "grad_norm": 0.25732508301734924, "learning_rate": 2.698368352843698e-05, "loss": 0.2086, "step": 39850 }, { "epoch": 0.5718321574026272, "grad_norm": 9.84803581237793, "learning_rate": 2.6979700014871786e-05, "loss": 0.3176, "step": 39875 }, { "epoch": 0.5721906728618138, "grad_norm": 4.956047534942627, "learning_rate": 2.6975716501306594e-05, "loss": 0.2918, "step": 39900 }, { "epoch": 0.5725491883210004, "grad_norm": 3.7648515701293945, "learning_rate": 2.6971732987741402e-05, "loss": 0.2755, "step": 39925 }, { "epoch": 0.572907703780187, "grad_norm": 12.860793113708496, "learning_rate": 2.696774947417621e-05, "loss": 0.2138, "step": 39950 }, { "epoch": 0.5732662192393736, "grad_norm": 2.2074692249298096, "learning_rate": 2.696376596061102e-05, "loss": 0.2421, "step": 39975 }, { "epoch": 0.5736247346985602, "grad_norm": 21.815675735473633, "learning_rate": 2.695978244704583e-05, "loss": 0.2773, "step": 40000 }, { "epoch": 0.5739832501577468, "grad_norm": 6.835700511932373, "learning_rate": 2.6955798933480637e-05, "loss": 0.0943, "step": 40025 }, { "epoch": 0.5743417656169334, "grad_norm": 6.022839069366455, "learning_rate": 2.6951815419915445e-05, "loss": 0.3465, "step": 40050 }, { "epoch": 0.57470028107612, "grad_norm": 0.5286858677864075, "learning_rate": 2.6947831906350252e-05, "loss": 0.1924, "step": 40075 }, { "epoch": 0.5750587965353066, "grad_norm": 18.28533935546875, "learning_rate": 2.6943848392785064e-05, "loss": 0.2965, "step": 40100 }, { "epoch": 0.5754173119944932, "grad_norm": 5.388155460357666, "learning_rate": 2.693986487921987e-05, "loss": 0.1915, "step": 40125 }, { "epoch": 0.5757758274536798, "grad_norm": 1.5732041597366333, "learning_rate": 2.6935881365654676e-05, "loss": 0.1566, "step": 40150 }, { "epoch": 0.5761343429128664, "grad_norm": 5.392938137054443, "learning_rate": 2.6931897852089484e-05, "loss": 0.2133, "step": 40175 }, { "epoch": 0.576492858372053, "grad_norm": 1.5756895542144775, "learning_rate": 2.692791433852429e-05, "loss": 0.2184, "step": 40200 }, { "epoch": 0.5768513738312396, "grad_norm": 3.57470440864563, "learning_rate": 2.6923930824959103e-05, "loss": 0.2035, "step": 40225 }, { "epoch": 0.5772098892904262, "grad_norm": 0.2834940254688263, "learning_rate": 2.691994731139391e-05, "loss": 0.1738, "step": 40250 }, { "epoch": 0.5775684047496128, "grad_norm": 12.26462173461914, "learning_rate": 2.691596379782872e-05, "loss": 0.2511, "step": 40275 }, { "epoch": 0.5779269202087994, "grad_norm": 9.71161937713623, "learning_rate": 2.6911980284263526e-05, "loss": 0.1414, "step": 40300 }, { "epoch": 0.578285435667986, "grad_norm": 22.56549072265625, "learning_rate": 2.6907996770698338e-05, "loss": 0.2016, "step": 40325 }, { "epoch": 0.5786439511271726, "grad_norm": 0.7319689989089966, "learning_rate": 2.6904013257133145e-05, "loss": 0.2459, "step": 40350 }, { "epoch": 0.5790024665863592, "grad_norm": 1.9882124662399292, "learning_rate": 2.6900029743567953e-05, "loss": 0.2155, "step": 40375 }, { "epoch": 0.5793609820455458, "grad_norm": 8.59772777557373, "learning_rate": 2.689604623000276e-05, "loss": 0.1667, "step": 40400 }, { "epoch": 0.5797194975047324, "grad_norm": 2.857844829559326, "learning_rate": 2.689206271643757e-05, "loss": 0.1982, "step": 40425 }, { "epoch": 0.580078012963919, "grad_norm": 10.078802108764648, "learning_rate": 2.688807920287238e-05, "loss": 0.2148, "step": 40450 }, { "epoch": 0.5804365284231056, "grad_norm": 15.722647666931152, "learning_rate": 2.6884095689307188e-05, "loss": 0.2707, "step": 40475 }, { "epoch": 0.5807950438822922, "grad_norm": 0.6405608654022217, "learning_rate": 2.6880112175741996e-05, "loss": 0.1939, "step": 40500 }, { "epoch": 0.5811535593414788, "grad_norm": 3.7206804752349854, "learning_rate": 2.6876128662176804e-05, "loss": 0.2162, "step": 40525 }, { "epoch": 0.5815120748006654, "grad_norm": 3.5218122005462646, "learning_rate": 2.687214514861161e-05, "loss": 0.1738, "step": 40550 }, { "epoch": 0.581870590259852, "grad_norm": 7.477013111114502, "learning_rate": 2.6868161635046423e-05, "loss": 0.2804, "step": 40575 }, { "epoch": 0.5822291057190386, "grad_norm": 1.8999556303024292, "learning_rate": 2.686417812148123e-05, "loss": 0.3087, "step": 40600 }, { "epoch": 0.5825876211782252, "grad_norm": 3.3203682899475098, "learning_rate": 2.686019460791604e-05, "loss": 0.1154, "step": 40625 }, { "epoch": 0.5829461366374118, "grad_norm": 2.15413761138916, "learning_rate": 2.6856211094350846e-05, "loss": 0.1813, "step": 40650 }, { "epoch": 0.5833046520965984, "grad_norm": 7.514678955078125, "learning_rate": 2.6852227580785654e-05, "loss": 0.1727, "step": 40675 }, { "epoch": 0.583663167555785, "grad_norm": 17.626419067382812, "learning_rate": 2.6848244067220465e-05, "loss": 0.327, "step": 40700 }, { "epoch": 0.5840216830149716, "grad_norm": 0.20594611763954163, "learning_rate": 2.6844260553655273e-05, "loss": 0.1908, "step": 40725 }, { "epoch": 0.5843801984741582, "grad_norm": 13.965194702148438, "learning_rate": 2.684027704009008e-05, "loss": 0.2162, "step": 40750 }, { "epoch": 0.5847387139333448, "grad_norm": 0.3684762716293335, "learning_rate": 2.683629352652489e-05, "loss": 0.1734, "step": 40775 }, { "epoch": 0.5850972293925314, "grad_norm": 12.465391159057617, "learning_rate": 2.6832310012959697e-05, "loss": 0.2142, "step": 40800 }, { "epoch": 0.585455744851718, "grad_norm": 12.848894119262695, "learning_rate": 2.6828326499394508e-05, "loss": 0.1726, "step": 40825 }, { "epoch": 0.5858142603109046, "grad_norm": 9.976659774780273, "learning_rate": 2.6824342985829316e-05, "loss": 0.173, "step": 40850 }, { "epoch": 0.5861727757700912, "grad_norm": 0.8135828375816345, "learning_rate": 2.6820359472264124e-05, "loss": 0.2009, "step": 40875 }, { "epoch": 0.5865312912292778, "grad_norm": 17.431394577026367, "learning_rate": 2.681637595869893e-05, "loss": 0.1744, "step": 40900 }, { "epoch": 0.5868898066884644, "grad_norm": 1.4696071147918701, "learning_rate": 2.6812392445133743e-05, "loss": 0.2082, "step": 40925 }, { "epoch": 0.587248322147651, "grad_norm": 10.092906951904297, "learning_rate": 2.680840893156855e-05, "loss": 0.2569, "step": 40950 }, { "epoch": 0.5876068376068376, "grad_norm": 2.1732664108276367, "learning_rate": 2.680442541800336e-05, "loss": 0.1785, "step": 40975 }, { "epoch": 0.5879653530660242, "grad_norm": 0.17135250568389893, "learning_rate": 2.6800441904438166e-05, "loss": 0.2429, "step": 41000 }, { "epoch": 0.5883238685252108, "grad_norm": 2.872328519821167, "learning_rate": 2.6796458390872974e-05, "loss": 0.1888, "step": 41025 }, { "epoch": 0.5886823839843974, "grad_norm": 16.664888381958008, "learning_rate": 2.6792474877307785e-05, "loss": 0.1433, "step": 41050 }, { "epoch": 0.589040899443584, "grad_norm": 11.795760154724121, "learning_rate": 2.6788491363742593e-05, "loss": 0.2513, "step": 41075 }, { "epoch": 0.5893994149027706, "grad_norm": 22.729598999023438, "learning_rate": 2.67845078501774e-05, "loss": 0.366, "step": 41100 }, { "epoch": 0.5897579303619572, "grad_norm": 10.2198486328125, "learning_rate": 2.678052433661221e-05, "loss": 0.2353, "step": 41125 }, { "epoch": 0.5901164458211438, "grad_norm": 8.953221321105957, "learning_rate": 2.6776540823047017e-05, "loss": 0.2655, "step": 41150 }, { "epoch": 0.5904749612803304, "grad_norm": 3.975659132003784, "learning_rate": 2.6772557309481828e-05, "loss": 0.0902, "step": 41175 }, { "epoch": 0.590833476739517, "grad_norm": 14.10741138458252, "learning_rate": 2.6768573795916636e-05, "loss": 0.3224, "step": 41200 }, { "epoch": 0.5911919921987036, "grad_norm": 6.103735446929932, "learning_rate": 2.6764590282351444e-05, "loss": 0.2814, "step": 41225 }, { "epoch": 0.5915505076578902, "grad_norm": 14.385578155517578, "learning_rate": 2.6760606768786248e-05, "loss": 0.2617, "step": 41250 }, { "epoch": 0.5919090231170768, "grad_norm": 9.115473747253418, "learning_rate": 2.6756623255221056e-05, "loss": 0.2133, "step": 41275 }, { "epoch": 0.5922675385762635, "grad_norm": 7.834077835083008, "learning_rate": 2.6752639741655867e-05, "loss": 0.1838, "step": 41300 }, { "epoch": 0.59262605403545, "grad_norm": 2.6815898418426514, "learning_rate": 2.6748656228090675e-05, "loss": 0.2219, "step": 41325 }, { "epoch": 0.5929845694946366, "grad_norm": 2.530947685241699, "learning_rate": 2.6744672714525483e-05, "loss": 0.2258, "step": 41350 }, { "epoch": 0.5933430849538233, "grad_norm": 6.325314998626709, "learning_rate": 2.674068920096029e-05, "loss": 0.2482, "step": 41375 }, { "epoch": 0.5937016004130098, "grad_norm": 9.884748458862305, "learning_rate": 2.67367056873951e-05, "loss": 0.1778, "step": 41400 }, { "epoch": 0.5940601158721964, "grad_norm": 1.8964425325393677, "learning_rate": 2.673272217382991e-05, "loss": 0.2457, "step": 41425 }, { "epoch": 0.5944186313313831, "grad_norm": 4.3538665771484375, "learning_rate": 2.6728738660264718e-05, "loss": 0.2331, "step": 41450 }, { "epoch": 0.5947771467905696, "grad_norm": 29.974956512451172, "learning_rate": 2.6724755146699526e-05, "loss": 0.2487, "step": 41475 }, { "epoch": 0.5951356622497562, "grad_norm": 3.6570675373077393, "learning_rate": 2.6720771633134333e-05, "loss": 0.2635, "step": 41500 }, { "epoch": 0.5954941777089429, "grad_norm": 6.973223686218262, "learning_rate": 2.6716788119569145e-05, "loss": 0.1763, "step": 41525 }, { "epoch": 0.5958526931681294, "grad_norm": 1.4549694061279297, "learning_rate": 2.6712804606003953e-05, "loss": 0.212, "step": 41550 }, { "epoch": 0.596211208627316, "grad_norm": 0.052036043256521225, "learning_rate": 2.670882109243876e-05, "loss": 0.1902, "step": 41575 }, { "epoch": 0.5965697240865027, "grad_norm": 0.5271192193031311, "learning_rate": 2.6704837578873568e-05, "loss": 0.3345, "step": 41600 }, { "epoch": 0.5969282395456892, "grad_norm": 11.304685592651367, "learning_rate": 2.6700854065308376e-05, "loss": 0.2388, "step": 41625 }, { "epoch": 0.5972867550048758, "grad_norm": 19.2200870513916, "learning_rate": 2.6696870551743187e-05, "loss": 0.2734, "step": 41650 }, { "epoch": 0.5976452704640625, "grad_norm": 26.26524543762207, "learning_rate": 2.6692887038177995e-05, "loss": 0.3429, "step": 41675 }, { "epoch": 0.598003785923249, "grad_norm": 11.419465065002441, "learning_rate": 2.6688903524612803e-05, "loss": 0.2386, "step": 41700 }, { "epoch": 0.5983623013824356, "grad_norm": 6.682565689086914, "learning_rate": 2.668492001104761e-05, "loss": 0.2343, "step": 41725 }, { "epoch": 0.5987208168416223, "grad_norm": 5.415627479553223, "learning_rate": 2.668093649748242e-05, "loss": 0.1883, "step": 41750 }, { "epoch": 0.5990793323008088, "grad_norm": 8.80038833618164, "learning_rate": 2.667695298391723e-05, "loss": 0.15, "step": 41775 }, { "epoch": 0.5994378477599954, "grad_norm": 0.29569244384765625, "learning_rate": 2.6672969470352038e-05, "loss": 0.2146, "step": 41800 }, { "epoch": 0.5997963632191821, "grad_norm": 10.318097114562988, "learning_rate": 2.6668985956786846e-05, "loss": 0.364, "step": 41825 }, { "epoch": 0.6001548786783686, "grad_norm": 14.572966575622559, "learning_rate": 2.6665002443221653e-05, "loss": 0.2092, "step": 41850 }, { "epoch": 0.6005133941375552, "grad_norm": 5.7369465827941895, "learning_rate": 2.666101892965646e-05, "loss": 0.2938, "step": 41875 }, { "epoch": 0.6008719095967419, "grad_norm": 0.8328118324279785, "learning_rate": 2.6657035416091273e-05, "loss": 0.3057, "step": 41900 }, { "epoch": 0.6012304250559284, "grad_norm": 9.231532096862793, "learning_rate": 2.665305190252608e-05, "loss": 0.2269, "step": 41925 }, { "epoch": 0.601588940515115, "grad_norm": 0.7976161241531372, "learning_rate": 2.6649068388960888e-05, "loss": 0.1251, "step": 41950 }, { "epoch": 0.6019474559743017, "grad_norm": 2.292400360107422, "learning_rate": 2.6645084875395696e-05, "loss": 0.1373, "step": 41975 }, { "epoch": 0.6023059714334882, "grad_norm": 9.556707382202148, "learning_rate": 2.6641101361830504e-05, "loss": 0.1854, "step": 42000 }, { "epoch": 0.6026644868926748, "grad_norm": 6.442174434661865, "learning_rate": 2.6637117848265315e-05, "loss": 0.195, "step": 42025 }, { "epoch": 0.6030230023518615, "grad_norm": 17.175771713256836, "learning_rate": 2.6633134334700123e-05, "loss": 0.1777, "step": 42050 }, { "epoch": 0.603381517811048, "grad_norm": 0.34638363122940063, "learning_rate": 2.662915082113493e-05, "loss": 0.265, "step": 42075 }, { "epoch": 0.6037400332702346, "grad_norm": 7.045352935791016, "learning_rate": 2.662516730756974e-05, "loss": 0.1949, "step": 42100 }, { "epoch": 0.6040985487294213, "grad_norm": 18.80528450012207, "learning_rate": 2.662118379400455e-05, "loss": 0.3068, "step": 42125 }, { "epoch": 0.6044570641886078, "grad_norm": 0.49329066276550293, "learning_rate": 2.6617200280439358e-05, "loss": 0.2513, "step": 42150 }, { "epoch": 0.6048155796477944, "grad_norm": 3.811094045639038, "learning_rate": 2.6613216766874166e-05, "loss": 0.2285, "step": 42175 }, { "epoch": 0.6051740951069811, "grad_norm": 0.3490944802761078, "learning_rate": 2.6609233253308973e-05, "loss": 0.1508, "step": 42200 }, { "epoch": 0.6055326105661676, "grad_norm": 0.6187502145767212, "learning_rate": 2.660524973974378e-05, "loss": 0.3502, "step": 42225 }, { "epoch": 0.6058911260253542, "grad_norm": 10.091165542602539, "learning_rate": 2.6601266226178592e-05, "loss": 0.1875, "step": 42250 }, { "epoch": 0.6062496414845409, "grad_norm": 1.8145421743392944, "learning_rate": 2.65972827126134e-05, "loss": 0.171, "step": 42275 }, { "epoch": 0.6066081569437274, "grad_norm": 17.346914291381836, "learning_rate": 2.6593299199048208e-05, "loss": 0.2326, "step": 42300 }, { "epoch": 0.606966672402914, "grad_norm": 14.68632698059082, "learning_rate": 2.6589315685483016e-05, "loss": 0.2193, "step": 42325 }, { "epoch": 0.6073251878621007, "grad_norm": 0.38050803542137146, "learning_rate": 2.658533217191782e-05, "loss": 0.1694, "step": 42350 }, { "epoch": 0.6076837033212872, "grad_norm": 8.288836479187012, "learning_rate": 2.6581348658352632e-05, "loss": 0.2716, "step": 42375 }, { "epoch": 0.6080422187804738, "grad_norm": 16.118194580078125, "learning_rate": 2.657736514478744e-05, "loss": 0.2282, "step": 42400 }, { "epoch": 0.6084007342396605, "grad_norm": 4.699530601501465, "learning_rate": 2.6573381631222247e-05, "loss": 0.1952, "step": 42425 }, { "epoch": 0.608759249698847, "grad_norm": 10.669146537780762, "learning_rate": 2.6569398117657055e-05, "loss": 0.3033, "step": 42450 }, { "epoch": 0.6091177651580336, "grad_norm": 4.838540554046631, "learning_rate": 2.6565414604091863e-05, "loss": 0.2441, "step": 42475 }, { "epoch": 0.6094762806172203, "grad_norm": 14.675129890441895, "learning_rate": 2.6561431090526674e-05, "loss": 0.2661, "step": 42500 }, { "epoch": 0.6098347960764068, "grad_norm": 9.28681755065918, "learning_rate": 2.6557447576961482e-05, "loss": 0.2661, "step": 42525 }, { "epoch": 0.6101933115355934, "grad_norm": 4.401608943939209, "learning_rate": 2.655346406339629e-05, "loss": 0.3502, "step": 42550 }, { "epoch": 0.6105518269947801, "grad_norm": 6.093710422515869, "learning_rate": 2.6549480549831098e-05, "loss": 0.1701, "step": 42575 }, { "epoch": 0.6109103424539666, "grad_norm": 1.401940107345581, "learning_rate": 2.6545497036265906e-05, "loss": 0.119, "step": 42600 }, { "epoch": 0.6112688579131532, "grad_norm": 15.447752952575684, "learning_rate": 2.6541513522700717e-05, "loss": 0.1242, "step": 42625 }, { "epoch": 0.6116273733723399, "grad_norm": 17.667688369750977, "learning_rate": 2.6537530009135525e-05, "loss": 0.1913, "step": 42650 }, { "epoch": 0.6119858888315264, "grad_norm": 6.951498985290527, "learning_rate": 2.6533546495570333e-05, "loss": 0.1483, "step": 42675 }, { "epoch": 0.612344404290713, "grad_norm": 0.11095473915338516, "learning_rate": 2.652956298200514e-05, "loss": 0.2005, "step": 42700 }, { "epoch": 0.6127029197498997, "grad_norm": 0.6357178092002869, "learning_rate": 2.6525579468439952e-05, "loss": 0.2902, "step": 42725 }, { "epoch": 0.6130614352090862, "grad_norm": 14.67663860321045, "learning_rate": 2.652159595487476e-05, "loss": 0.2067, "step": 42750 }, { "epoch": 0.6134199506682728, "grad_norm": 10.131973266601562, "learning_rate": 2.6517612441309567e-05, "loss": 0.1954, "step": 42775 }, { "epoch": 0.6137784661274595, "grad_norm": 0.5278668999671936, "learning_rate": 2.6513628927744375e-05, "loss": 0.2446, "step": 42800 }, { "epoch": 0.614136981586646, "grad_norm": 12.235491752624512, "learning_rate": 2.6509645414179183e-05, "loss": 0.1556, "step": 42825 }, { "epoch": 0.6144954970458326, "grad_norm": 1.5865452289581299, "learning_rate": 2.6505661900613994e-05, "loss": 0.168, "step": 42850 }, { "epoch": 0.6148540125050193, "grad_norm": 7.42185640335083, "learning_rate": 2.6501678387048802e-05, "loss": 0.236, "step": 42875 }, { "epoch": 0.6152125279642058, "grad_norm": 9.540203094482422, "learning_rate": 2.649769487348361e-05, "loss": 0.1954, "step": 42900 }, { "epoch": 0.6155710434233924, "grad_norm": 1.2987329959869385, "learning_rate": 2.6493711359918418e-05, "loss": 0.17, "step": 42925 }, { "epoch": 0.615929558882579, "grad_norm": 21.081249237060547, "learning_rate": 2.6489727846353226e-05, "loss": 0.21, "step": 42950 }, { "epoch": 0.6162880743417656, "grad_norm": 9.802013397216797, "learning_rate": 2.6485744332788037e-05, "loss": 0.176, "step": 42975 }, { "epoch": 0.6166465898009522, "grad_norm": 18.839563369750977, "learning_rate": 2.6481760819222845e-05, "loss": 0.2582, "step": 43000 }, { "epoch": 0.6170051052601389, "grad_norm": 5.717194080352783, "learning_rate": 2.6477777305657653e-05, "loss": 0.1606, "step": 43025 }, { "epoch": 0.6173636207193254, "grad_norm": 12.58409595489502, "learning_rate": 2.647379379209246e-05, "loss": 0.292, "step": 43050 }, { "epoch": 0.617722136178512, "grad_norm": 11.222321510314941, "learning_rate": 2.646981027852727e-05, "loss": 0.2143, "step": 43075 }, { "epoch": 0.6180806516376987, "grad_norm": 5.9584808349609375, "learning_rate": 2.646582676496208e-05, "loss": 0.1492, "step": 43100 }, { "epoch": 0.6184391670968852, "grad_norm": 1.2466751337051392, "learning_rate": 2.6461843251396887e-05, "loss": 0.2558, "step": 43125 }, { "epoch": 0.6187976825560718, "grad_norm": 12.012810707092285, "learning_rate": 2.6457859737831695e-05, "loss": 0.203, "step": 43150 }, { "epoch": 0.6191561980152585, "grad_norm": 3.8941361904144287, "learning_rate": 2.6453876224266503e-05, "loss": 0.1527, "step": 43175 }, { "epoch": 0.619514713474445, "grad_norm": 17.30421257019043, "learning_rate": 2.644989271070131e-05, "loss": 0.1731, "step": 43200 }, { "epoch": 0.6198732289336316, "grad_norm": 4.046815395355225, "learning_rate": 2.6445909197136122e-05, "loss": 0.2947, "step": 43225 }, { "epoch": 0.6202317443928183, "grad_norm": 18.142784118652344, "learning_rate": 2.644192568357093e-05, "loss": 0.2488, "step": 43250 }, { "epoch": 0.6205902598520048, "grad_norm": 4.150970458984375, "learning_rate": 2.6437942170005738e-05, "loss": 0.227, "step": 43275 }, { "epoch": 0.6209487753111914, "grad_norm": 1.17970609664917, "learning_rate": 2.6433958656440546e-05, "loss": 0.2302, "step": 43300 }, { "epoch": 0.621307290770378, "grad_norm": 15.643561363220215, "learning_rate": 2.6429975142875354e-05, "loss": 0.2611, "step": 43325 }, { "epoch": 0.6216658062295646, "grad_norm": 2.146369457244873, "learning_rate": 2.6425991629310165e-05, "loss": 0.2258, "step": 43350 }, { "epoch": 0.6220243216887512, "grad_norm": 17.54804039001465, "learning_rate": 2.6422008115744973e-05, "loss": 0.2678, "step": 43375 }, { "epoch": 0.6223828371479379, "grad_norm": 4.758824348449707, "learning_rate": 2.641802460217978e-05, "loss": 0.2369, "step": 43400 }, { "epoch": 0.6227413526071244, "grad_norm": 11.910316467285156, "learning_rate": 2.6414041088614588e-05, "loss": 0.2382, "step": 43425 }, { "epoch": 0.623099868066311, "grad_norm": 13.246682167053223, "learning_rate": 2.6410057575049396e-05, "loss": 0.2578, "step": 43450 }, { "epoch": 0.6234583835254977, "grad_norm": 9.615623474121094, "learning_rate": 2.6406074061484204e-05, "loss": 0.244, "step": 43475 }, { "epoch": 0.6238168989846842, "grad_norm": 6.470996856689453, "learning_rate": 2.6402090547919012e-05, "loss": 0.2625, "step": 43500 }, { "epoch": 0.6241754144438708, "grad_norm": 12.777199745178223, "learning_rate": 2.639810703435382e-05, "loss": 0.2791, "step": 43525 }, { "epoch": 0.6245339299030574, "grad_norm": 0.3908720314502716, "learning_rate": 2.6394123520788628e-05, "loss": 0.132, "step": 43550 }, { "epoch": 0.624892445362244, "grad_norm": 0.945480227470398, "learning_rate": 2.639014000722344e-05, "loss": 0.1824, "step": 43575 }, { "epoch": 0.6252509608214306, "grad_norm": 0.32029038667678833, "learning_rate": 2.6386156493658247e-05, "loss": 0.1438, "step": 43600 }, { "epoch": 0.6256094762806172, "grad_norm": 0.726582407951355, "learning_rate": 2.6382172980093054e-05, "loss": 0.1695, "step": 43625 }, { "epoch": 0.6259679917398038, "grad_norm": 2.398350715637207, "learning_rate": 2.6378189466527862e-05, "loss": 0.198, "step": 43650 }, { "epoch": 0.6263265071989904, "grad_norm": 1.4898185729980469, "learning_rate": 2.637420595296267e-05, "loss": 0.114, "step": 43675 }, { "epoch": 0.626685022658177, "grad_norm": 15.041858673095703, "learning_rate": 2.637022243939748e-05, "loss": 0.2435, "step": 43700 }, { "epoch": 0.6270435381173636, "grad_norm": 17.282629013061523, "learning_rate": 2.636623892583229e-05, "loss": 0.266, "step": 43725 }, { "epoch": 0.6274020535765502, "grad_norm": 14.312301635742188, "learning_rate": 2.6362255412267097e-05, "loss": 0.2193, "step": 43750 }, { "epoch": 0.6277605690357368, "grad_norm": 2.7205252647399902, "learning_rate": 2.6358271898701905e-05, "loss": 0.1345, "step": 43775 }, { "epoch": 0.6281190844949234, "grad_norm": 1.0688506364822388, "learning_rate": 2.6354288385136713e-05, "loss": 0.1788, "step": 43800 }, { "epoch": 0.62847759995411, "grad_norm": 21.015474319458008, "learning_rate": 2.6350304871571524e-05, "loss": 0.291, "step": 43825 }, { "epoch": 0.6288361154132966, "grad_norm": 12.469378471374512, "learning_rate": 2.6346321358006332e-05, "loss": 0.1962, "step": 43850 }, { "epoch": 0.6291946308724832, "grad_norm": 5.946814060211182, "learning_rate": 2.634233784444114e-05, "loss": 0.1619, "step": 43875 }, { "epoch": 0.6295531463316698, "grad_norm": 1.3972206115722656, "learning_rate": 2.6338354330875948e-05, "loss": 0.2273, "step": 43900 }, { "epoch": 0.6299116617908564, "grad_norm": 5.446441173553467, "learning_rate": 2.6334370817310755e-05, "loss": 0.1747, "step": 43925 }, { "epoch": 0.630270177250043, "grad_norm": 6.366206169128418, "learning_rate": 2.6330387303745567e-05, "loss": 0.148, "step": 43950 }, { "epoch": 0.6306286927092296, "grad_norm": 11.266039848327637, "learning_rate": 2.6326403790180374e-05, "loss": 0.1332, "step": 43975 }, { "epoch": 0.6309872081684162, "grad_norm": 9.650976181030273, "learning_rate": 2.6322420276615182e-05, "loss": 0.1505, "step": 44000 }, { "epoch": 0.6313457236276028, "grad_norm": 4.5129194259643555, "learning_rate": 2.631843676304999e-05, "loss": 0.1758, "step": 44025 }, { "epoch": 0.6317042390867894, "grad_norm": 0.5983185768127441, "learning_rate": 2.63144532494848e-05, "loss": 0.2313, "step": 44050 }, { "epoch": 0.632062754545976, "grad_norm": 3.8941941261291504, "learning_rate": 2.631046973591961e-05, "loss": 0.2913, "step": 44075 }, { "epoch": 0.6324212700051626, "grad_norm": 8.191631317138672, "learning_rate": 2.6306486222354417e-05, "loss": 0.2615, "step": 44100 }, { "epoch": 0.6327797854643492, "grad_norm": 1.752906322479248, "learning_rate": 2.6302502708789225e-05, "loss": 0.228, "step": 44125 }, { "epoch": 0.6331383009235358, "grad_norm": 8.542378425598145, "learning_rate": 2.6298519195224033e-05, "loss": 0.193, "step": 44150 }, { "epoch": 0.6334968163827224, "grad_norm": 4.099010467529297, "learning_rate": 2.6294535681658844e-05, "loss": 0.2742, "step": 44175 }, { "epoch": 0.633855331841909, "grad_norm": 3.131373882293701, "learning_rate": 2.6290552168093652e-05, "loss": 0.2343, "step": 44200 }, { "epoch": 0.6342138473010956, "grad_norm": 12.274124145507812, "learning_rate": 2.628656865452846e-05, "loss": 0.2026, "step": 44225 }, { "epoch": 0.6345723627602822, "grad_norm": 0.7411843538284302, "learning_rate": 2.6282585140963267e-05, "loss": 0.2092, "step": 44250 }, { "epoch": 0.6349308782194688, "grad_norm": 12.255169868469238, "learning_rate": 2.6278601627398075e-05, "loss": 0.3069, "step": 44275 }, { "epoch": 0.6352893936786554, "grad_norm": 15.056601524353027, "learning_rate": 2.6274618113832887e-05, "loss": 0.1937, "step": 44300 }, { "epoch": 0.635647909137842, "grad_norm": 6.731452941894531, "learning_rate": 2.6270634600267694e-05, "loss": 0.2658, "step": 44325 }, { "epoch": 0.6360064245970286, "grad_norm": 2.241884708404541, "learning_rate": 2.6266651086702502e-05, "loss": 0.1839, "step": 44350 }, { "epoch": 0.6363649400562152, "grad_norm": 1.9744222164154053, "learning_rate": 2.626266757313731e-05, "loss": 0.171, "step": 44375 }, { "epoch": 0.6367234555154018, "grad_norm": 11.735159873962402, "learning_rate": 2.6258684059572118e-05, "loss": 0.2364, "step": 44400 }, { "epoch": 0.6370819709745884, "grad_norm": 14.597502708435059, "learning_rate": 2.625470054600693e-05, "loss": 0.26, "step": 44425 }, { "epoch": 0.637440486433775, "grad_norm": 8.121912956237793, "learning_rate": 2.6250717032441737e-05, "loss": 0.2608, "step": 44450 }, { "epoch": 0.6377990018929616, "grad_norm": 7.425535202026367, "learning_rate": 2.6246733518876545e-05, "loss": 0.2252, "step": 44475 }, { "epoch": 0.6381575173521482, "grad_norm": 7.685338497161865, "learning_rate": 2.6242750005311353e-05, "loss": 0.2852, "step": 44500 }, { "epoch": 0.6385160328113348, "grad_norm": 6.454859256744385, "learning_rate": 2.623876649174616e-05, "loss": 0.1593, "step": 44525 }, { "epoch": 0.6388745482705214, "grad_norm": 8.659252166748047, "learning_rate": 2.623478297818097e-05, "loss": 0.171, "step": 44550 }, { "epoch": 0.639233063729708, "grad_norm": 14.456416130065918, "learning_rate": 2.6230799464615776e-05, "loss": 0.2912, "step": 44575 }, { "epoch": 0.6395915791888946, "grad_norm": 2.5842525959014893, "learning_rate": 2.6226815951050584e-05, "loss": 0.2077, "step": 44600 }, { "epoch": 0.6399500946480813, "grad_norm": 22.119461059570312, "learning_rate": 2.6222832437485392e-05, "loss": 0.2537, "step": 44625 }, { "epoch": 0.6403086101072678, "grad_norm": 0.5492269396781921, "learning_rate": 2.6218848923920203e-05, "loss": 0.1654, "step": 44650 }, { "epoch": 0.6406671255664544, "grad_norm": 15.143065452575684, "learning_rate": 2.621486541035501e-05, "loss": 0.2972, "step": 44675 }, { "epoch": 0.6410256410256411, "grad_norm": 9.423266410827637, "learning_rate": 2.621088189678982e-05, "loss": 0.1903, "step": 44700 }, { "epoch": 0.6413841564848276, "grad_norm": 12.218191146850586, "learning_rate": 2.6206898383224627e-05, "loss": 0.1235, "step": 44725 }, { "epoch": 0.6417426719440142, "grad_norm": 5.196013927459717, "learning_rate": 2.6202914869659435e-05, "loss": 0.2919, "step": 44750 }, { "epoch": 0.6421011874032009, "grad_norm": 2.969106435775757, "learning_rate": 2.6198931356094246e-05, "loss": 0.3004, "step": 44775 }, { "epoch": 0.6424597028623874, "grad_norm": 6.4928131103515625, "learning_rate": 2.6194947842529054e-05, "loss": 0.2158, "step": 44800 }, { "epoch": 0.642818218321574, "grad_norm": 13.392071723937988, "learning_rate": 2.619096432896386e-05, "loss": 0.2067, "step": 44825 }, { "epoch": 0.6431767337807607, "grad_norm": 20.793960571289062, "learning_rate": 2.618698081539867e-05, "loss": 0.1924, "step": 44850 }, { "epoch": 0.6435352492399472, "grad_norm": 5.75787878036499, "learning_rate": 2.6182997301833477e-05, "loss": 0.2135, "step": 44875 }, { "epoch": 0.6438937646991338, "grad_norm": 6.262861728668213, "learning_rate": 2.617901378826829e-05, "loss": 0.1645, "step": 44900 }, { "epoch": 0.6442522801583205, "grad_norm": 2.690091609954834, "learning_rate": 2.6175030274703096e-05, "loss": 0.2237, "step": 44925 }, { "epoch": 0.644610795617507, "grad_norm": 3.219740152359009, "learning_rate": 2.6171046761137904e-05, "loss": 0.24, "step": 44950 }, { "epoch": 0.6449693110766936, "grad_norm": 13.024764060974121, "learning_rate": 2.6167063247572712e-05, "loss": 0.1706, "step": 44975 }, { "epoch": 0.6453278265358803, "grad_norm": 10.197701454162598, "learning_rate": 2.616307973400752e-05, "loss": 0.1732, "step": 45000 }, { "epoch": 0.6456863419950668, "grad_norm": 18.32566261291504, "learning_rate": 2.615909622044233e-05, "loss": 0.2787, "step": 45025 }, { "epoch": 0.6460448574542534, "grad_norm": 1.1406365633010864, "learning_rate": 2.615511270687714e-05, "loss": 0.1652, "step": 45050 }, { "epoch": 0.6464033729134401, "grad_norm": 14.959321975708008, "learning_rate": 2.6151129193311947e-05, "loss": 0.291, "step": 45075 }, { "epoch": 0.6467618883726266, "grad_norm": 10.462681770324707, "learning_rate": 2.6147145679746755e-05, "loss": 0.1936, "step": 45100 }, { "epoch": 0.6471204038318132, "grad_norm": 2.350177526473999, "learning_rate": 2.6143162166181562e-05, "loss": 0.2575, "step": 45125 }, { "epoch": 0.6474789192909999, "grad_norm": 0.6751030683517456, "learning_rate": 2.6139178652616374e-05, "loss": 0.2179, "step": 45150 }, { "epoch": 0.6478374347501864, "grad_norm": 6.013749599456787, "learning_rate": 2.613519513905118e-05, "loss": 0.194, "step": 45175 }, { "epoch": 0.648195950209373, "grad_norm": 15.134842872619629, "learning_rate": 2.613121162548599e-05, "loss": 0.212, "step": 45200 }, { "epoch": 0.6485544656685597, "grad_norm": 10.739830017089844, "learning_rate": 2.6127228111920797e-05, "loss": 0.201, "step": 45225 }, { "epoch": 0.6489129811277462, "grad_norm": 10.926408767700195, "learning_rate": 2.612324459835561e-05, "loss": 0.1338, "step": 45250 }, { "epoch": 0.6492714965869328, "grad_norm": 0.7741734385490417, "learning_rate": 2.6119261084790416e-05, "loss": 0.2426, "step": 45275 }, { "epoch": 0.6496300120461195, "grad_norm": 0.8259892463684082, "learning_rate": 2.6115277571225224e-05, "loss": 0.2928, "step": 45300 }, { "epoch": 0.649988527505306, "grad_norm": 9.986886978149414, "learning_rate": 2.6111294057660032e-05, "loss": 0.3017, "step": 45325 }, { "epoch": 0.6503470429644926, "grad_norm": 2.7987470626831055, "learning_rate": 2.610731054409484e-05, "loss": 0.1992, "step": 45350 }, { "epoch": 0.6507055584236793, "grad_norm": 1.3954225778579712, "learning_rate": 2.610332703052965e-05, "loss": 0.1853, "step": 45375 }, { "epoch": 0.6510640738828658, "grad_norm": 14.065890312194824, "learning_rate": 2.609934351696446e-05, "loss": 0.2116, "step": 45400 }, { "epoch": 0.6514225893420524, "grad_norm": 2.517286777496338, "learning_rate": 2.6095360003399267e-05, "loss": 0.2323, "step": 45425 }, { "epoch": 0.6517811048012391, "grad_norm": 30.04060935974121, "learning_rate": 2.6091376489834075e-05, "loss": 0.2178, "step": 45450 }, { "epoch": 0.6521396202604256, "grad_norm": 7.195874214172363, "learning_rate": 2.6087392976268882e-05, "loss": 0.211, "step": 45475 }, { "epoch": 0.6524981357196122, "grad_norm": 5.194607734680176, "learning_rate": 2.6083409462703694e-05, "loss": 0.2916, "step": 45500 }, { "epoch": 0.6528566511787989, "grad_norm": 2.56602144241333, "learning_rate": 2.60794259491385e-05, "loss": 0.1853, "step": 45525 }, { "epoch": 0.6532151666379854, "grad_norm": 10.401512145996094, "learning_rate": 2.607544243557331e-05, "loss": 0.2244, "step": 45550 }, { "epoch": 0.653573682097172, "grad_norm": 0.7188613414764404, "learning_rate": 2.6071458922008117e-05, "loss": 0.1242, "step": 45575 }, { "epoch": 0.6539321975563587, "grad_norm": 17.194576263427734, "learning_rate": 2.6067475408442925e-05, "loss": 0.1676, "step": 45600 }, { "epoch": 0.6542907130155452, "grad_norm": 0.9680184125900269, "learning_rate": 2.6063491894877736e-05, "loss": 0.1443, "step": 45625 }, { "epoch": 0.6546492284747318, "grad_norm": 6.242096900939941, "learning_rate": 2.605950838131254e-05, "loss": 0.1748, "step": 45650 }, { "epoch": 0.6550077439339185, "grad_norm": 9.856827735900879, "learning_rate": 2.605552486774735e-05, "loss": 0.2199, "step": 45675 }, { "epoch": 0.655366259393105, "grad_norm": 4.689044952392578, "learning_rate": 2.6051541354182156e-05, "loss": 0.2178, "step": 45700 }, { "epoch": 0.6557247748522916, "grad_norm": 5.0724382400512695, "learning_rate": 2.6047557840616964e-05, "loss": 0.2602, "step": 45725 }, { "epoch": 0.6560832903114783, "grad_norm": 0.043479204177856445, "learning_rate": 2.6043574327051775e-05, "loss": 0.2859, "step": 45750 }, { "epoch": 0.6564418057706648, "grad_norm": 21.257261276245117, "learning_rate": 2.6039590813486583e-05, "loss": 0.1969, "step": 45775 }, { "epoch": 0.6568003212298514, "grad_norm": 12.480144500732422, "learning_rate": 2.603560729992139e-05, "loss": 0.2412, "step": 45800 }, { "epoch": 0.6571588366890381, "grad_norm": 4.393990516662598, "learning_rate": 2.60316237863562e-05, "loss": 0.1784, "step": 45825 }, { "epoch": 0.6575173521482246, "grad_norm": 6.866861820220947, "learning_rate": 2.602764027279101e-05, "loss": 0.2, "step": 45850 }, { "epoch": 0.6578758676074112, "grad_norm": 1.036781907081604, "learning_rate": 2.6023656759225818e-05, "loss": 0.1825, "step": 45875 }, { "epoch": 0.6582343830665979, "grad_norm": 0.6547845602035522, "learning_rate": 2.6019673245660626e-05, "loss": 0.2494, "step": 45900 }, { "epoch": 0.6585928985257844, "grad_norm": 8.792608261108398, "learning_rate": 2.6015689732095434e-05, "loss": 0.2408, "step": 45925 }, { "epoch": 0.658951413984971, "grad_norm": 16.25312042236328, "learning_rate": 2.601170621853024e-05, "loss": 0.1517, "step": 45950 }, { "epoch": 0.6593099294441577, "grad_norm": 13.18540096282959, "learning_rate": 2.6007722704965053e-05, "loss": 0.1617, "step": 45975 }, { "epoch": 0.6596684449033442, "grad_norm": 4.82725715637207, "learning_rate": 2.600373919139986e-05, "loss": 0.1708, "step": 46000 }, { "epoch": 0.6600269603625308, "grad_norm": 11.181498527526855, "learning_rate": 2.599975567783467e-05, "loss": 0.2931, "step": 46025 }, { "epoch": 0.6603854758217175, "grad_norm": 7.627374649047852, "learning_rate": 2.5995772164269476e-05, "loss": 0.2089, "step": 46050 }, { "epoch": 0.660743991280904, "grad_norm": 2.518982410430908, "learning_rate": 2.5991788650704284e-05, "loss": 0.2721, "step": 46075 }, { "epoch": 0.6611025067400906, "grad_norm": 0.3898479640483856, "learning_rate": 2.5987805137139095e-05, "loss": 0.2999, "step": 46100 }, { "epoch": 0.6614610221992773, "grad_norm": 0.16298004984855652, "learning_rate": 2.5983821623573903e-05, "loss": 0.1598, "step": 46125 }, { "epoch": 0.6618195376584638, "grad_norm": 0.5863973498344421, "learning_rate": 2.597983811000871e-05, "loss": 0.2293, "step": 46150 }, { "epoch": 0.6621780531176504, "grad_norm": 0.022248638793826103, "learning_rate": 2.597585459644352e-05, "loss": 0.162, "step": 46175 }, { "epoch": 0.6625365685768371, "grad_norm": 8.088638305664062, "learning_rate": 2.5971871082878327e-05, "loss": 0.1751, "step": 46200 }, { "epoch": 0.6628950840360236, "grad_norm": 19.793582916259766, "learning_rate": 2.5967887569313138e-05, "loss": 0.2332, "step": 46225 }, { "epoch": 0.6632535994952102, "grad_norm": 9.156176567077637, "learning_rate": 2.5963904055747946e-05, "loss": 0.1348, "step": 46250 }, { "epoch": 0.6636121149543969, "grad_norm": 20.64098358154297, "learning_rate": 2.5959920542182754e-05, "loss": 0.1901, "step": 46275 }, { "epoch": 0.6639706304135834, "grad_norm": 0.7540135383605957, "learning_rate": 2.595593702861756e-05, "loss": 0.3351, "step": 46300 }, { "epoch": 0.66432914587277, "grad_norm": 5.641460418701172, "learning_rate": 2.595195351505237e-05, "loss": 0.1794, "step": 46325 }, { "epoch": 0.6646876613319567, "grad_norm": 2.539881467819214, "learning_rate": 2.594797000148718e-05, "loss": 0.2149, "step": 46350 }, { "epoch": 0.6650461767911432, "grad_norm": 13.00069522857666, "learning_rate": 2.594398648792199e-05, "loss": 0.244, "step": 46375 }, { "epoch": 0.6654046922503298, "grad_norm": 22.223522186279297, "learning_rate": 2.5940002974356796e-05, "loss": 0.1849, "step": 46400 }, { "epoch": 0.6657632077095165, "grad_norm": 1.4301670789718628, "learning_rate": 2.5936019460791604e-05, "loss": 0.2081, "step": 46425 }, { "epoch": 0.666121723168703, "grad_norm": 2.011647939682007, "learning_rate": 2.5932035947226415e-05, "loss": 0.1707, "step": 46450 }, { "epoch": 0.6664802386278896, "grad_norm": 7.7340006828308105, "learning_rate": 2.5928052433661223e-05, "loss": 0.2034, "step": 46475 }, { "epoch": 0.6668387540870763, "grad_norm": 1.8950905799865723, "learning_rate": 2.592406892009603e-05, "loss": 0.2275, "step": 46500 }, { "epoch": 0.6671972695462628, "grad_norm": 4.989152431488037, "learning_rate": 2.592008540653084e-05, "loss": 0.1681, "step": 46525 }, { "epoch": 0.6675557850054494, "grad_norm": 6.794288635253906, "learning_rate": 2.5916101892965647e-05, "loss": 0.2509, "step": 46550 }, { "epoch": 0.6679143004646361, "grad_norm": 0.2754022181034088, "learning_rate": 2.5912118379400458e-05, "loss": 0.2876, "step": 46575 }, { "epoch": 0.6682728159238226, "grad_norm": 5.704349517822266, "learning_rate": 2.5908134865835266e-05, "loss": 0.1849, "step": 46600 }, { "epoch": 0.6686313313830092, "grad_norm": 9.462201118469238, "learning_rate": 2.5904151352270074e-05, "loss": 0.1624, "step": 46625 }, { "epoch": 0.6689898468421959, "grad_norm": 1.1938451528549194, "learning_rate": 2.590016783870488e-05, "loss": 0.1524, "step": 46650 }, { "epoch": 0.6693483623013824, "grad_norm": 0.770871102809906, "learning_rate": 2.589618432513969e-05, "loss": 0.1559, "step": 46675 }, { "epoch": 0.669706877760569, "grad_norm": 0.28993096947669983, "learning_rate": 2.58922008115745e-05, "loss": 0.1496, "step": 46700 }, { "epoch": 0.6700653932197557, "grad_norm": 9.67967414855957, "learning_rate": 2.588821729800931e-05, "loss": 0.2845, "step": 46725 }, { "epoch": 0.6704239086789422, "grad_norm": 18.33902931213379, "learning_rate": 2.5884233784444113e-05, "loss": 0.2255, "step": 46750 }, { "epoch": 0.6707824241381288, "grad_norm": 6.15297269821167, "learning_rate": 2.588025027087892e-05, "loss": 0.1528, "step": 46775 }, { "epoch": 0.6711409395973155, "grad_norm": 4.708961486816406, "learning_rate": 2.587626675731373e-05, "loss": 0.1327, "step": 46800 }, { "epoch": 0.671499455056502, "grad_norm": 11.652478218078613, "learning_rate": 2.587228324374854e-05, "loss": 0.221, "step": 46825 }, { "epoch": 0.6718579705156886, "grad_norm": 5.711638450622559, "learning_rate": 2.5868299730183348e-05, "loss": 0.336, "step": 46850 }, { "epoch": 0.6722164859748753, "grad_norm": 0.9070949554443359, "learning_rate": 2.5864316216618156e-05, "loss": 0.2474, "step": 46875 }, { "epoch": 0.6725750014340618, "grad_norm": 32.49085998535156, "learning_rate": 2.5860332703052963e-05, "loss": 0.2028, "step": 46900 }, { "epoch": 0.6729335168932484, "grad_norm": 0.7637625336647034, "learning_rate": 2.585634918948777e-05, "loss": 0.1853, "step": 46925 }, { "epoch": 0.6732920323524351, "grad_norm": 13.205764770507812, "learning_rate": 2.5852365675922582e-05, "loss": 0.2545, "step": 46950 }, { "epoch": 0.6736505478116216, "grad_norm": 12.2720365524292, "learning_rate": 2.584838216235739e-05, "loss": 0.1362, "step": 46975 }, { "epoch": 0.6740090632708082, "grad_norm": 2.2597620487213135, "learning_rate": 2.5844398648792198e-05, "loss": 0.136, "step": 47000 }, { "epoch": 0.6743675787299949, "grad_norm": 9.305859565734863, "learning_rate": 2.5840415135227006e-05, "loss": 0.3192, "step": 47025 }, { "epoch": 0.6747260941891814, "grad_norm": 0.2837841212749481, "learning_rate": 2.5836431621661817e-05, "loss": 0.1868, "step": 47050 }, { "epoch": 0.675084609648368, "grad_norm": 9.075830459594727, "learning_rate": 2.5832448108096625e-05, "loss": 0.2146, "step": 47075 }, { "epoch": 0.6754431251075547, "grad_norm": 0.24335120618343353, "learning_rate": 2.5828464594531433e-05, "loss": 0.2109, "step": 47100 }, { "epoch": 0.6758016405667412, "grad_norm": 0.4656877815723419, "learning_rate": 2.582448108096624e-05, "loss": 0.2237, "step": 47125 }, { "epoch": 0.6761601560259278, "grad_norm": 13.2868013381958, "learning_rate": 2.582049756740105e-05, "loss": 0.154, "step": 47150 }, { "epoch": 0.6765186714851145, "grad_norm": 21.525081634521484, "learning_rate": 2.581651405383586e-05, "loss": 0.2728, "step": 47175 }, { "epoch": 0.676877186944301, "grad_norm": 0.4387757480144501, "learning_rate": 2.5812530540270668e-05, "loss": 0.1945, "step": 47200 }, { "epoch": 0.6772357024034876, "grad_norm": 8.687211036682129, "learning_rate": 2.5808547026705476e-05, "loss": 0.224, "step": 47225 }, { "epoch": 0.6775942178626743, "grad_norm": 12.153742790222168, "learning_rate": 2.5804563513140283e-05, "loss": 0.1872, "step": 47250 }, { "epoch": 0.6779527333218608, "grad_norm": 5.658341407775879, "learning_rate": 2.580057999957509e-05, "loss": 0.1308, "step": 47275 }, { "epoch": 0.6783112487810474, "grad_norm": 10.07785701751709, "learning_rate": 2.5796596486009902e-05, "loss": 0.2066, "step": 47300 }, { "epoch": 0.6786697642402341, "grad_norm": 1.080583095550537, "learning_rate": 2.579261297244471e-05, "loss": 0.2617, "step": 47325 }, { "epoch": 0.6790282796994206, "grad_norm": 3.930149555206299, "learning_rate": 2.5788629458879518e-05, "loss": 0.2409, "step": 47350 }, { "epoch": 0.6793867951586072, "grad_norm": 0.7903839945793152, "learning_rate": 2.5784645945314326e-05, "loss": 0.1467, "step": 47375 }, { "epoch": 0.6797453106177939, "grad_norm": 4.271041393280029, "learning_rate": 2.5780662431749134e-05, "loss": 0.1128, "step": 47400 }, { "epoch": 0.6801038260769804, "grad_norm": 0.4626365602016449, "learning_rate": 2.5776678918183945e-05, "loss": 0.2582, "step": 47425 }, { "epoch": 0.680462341536167, "grad_norm": 12.26414680480957, "learning_rate": 2.5772695404618753e-05, "loss": 0.1669, "step": 47450 }, { "epoch": 0.6808208569953537, "grad_norm": 11.69873332977295, "learning_rate": 2.576871189105356e-05, "loss": 0.1805, "step": 47475 }, { "epoch": 0.6811793724545402, "grad_norm": 6.720857620239258, "learning_rate": 2.576472837748837e-05, "loss": 0.1903, "step": 47500 }, { "epoch": 0.6815378879137268, "grad_norm": 6.980034351348877, "learning_rate": 2.5760744863923176e-05, "loss": 0.2679, "step": 47525 }, { "epoch": 0.6818964033729135, "grad_norm": 0.14807264506816864, "learning_rate": 2.5756761350357988e-05, "loss": 0.1743, "step": 47550 }, { "epoch": 0.6822549188321, "grad_norm": 8.733154296875, "learning_rate": 2.5752777836792796e-05, "loss": 0.1668, "step": 47575 }, { "epoch": 0.6826134342912866, "grad_norm": 13.774730682373047, "learning_rate": 2.5748794323227603e-05, "loss": 0.2044, "step": 47600 }, { "epoch": 0.6829719497504733, "grad_norm": 14.60163688659668, "learning_rate": 2.574481080966241e-05, "loss": 0.1976, "step": 47625 }, { "epoch": 0.6833304652096598, "grad_norm": 4.03992223739624, "learning_rate": 2.574082729609722e-05, "loss": 0.1972, "step": 47650 }, { "epoch": 0.6836889806688464, "grad_norm": 1.09526526927948, "learning_rate": 2.573684378253203e-05, "loss": 0.2166, "step": 47675 }, { "epoch": 0.6840474961280331, "grad_norm": 7.010862827301025, "learning_rate": 2.5732860268966838e-05, "loss": 0.2316, "step": 47700 }, { "epoch": 0.6844060115872196, "grad_norm": 0.31904053688049316, "learning_rate": 2.5728876755401646e-05, "loss": 0.2194, "step": 47725 }, { "epoch": 0.6847645270464062, "grad_norm": 9.083686828613281, "learning_rate": 2.5724893241836454e-05, "loss": 0.3272, "step": 47750 }, { "epoch": 0.6851230425055929, "grad_norm": 2.387876033782959, "learning_rate": 2.5720909728271265e-05, "loss": 0.1503, "step": 47775 }, { "epoch": 0.6854815579647794, "grad_norm": 3.815077781677246, "learning_rate": 2.5716926214706073e-05, "loss": 0.0962, "step": 47800 }, { "epoch": 0.685840073423966, "grad_norm": 2.83951997756958, "learning_rate": 2.571294270114088e-05, "loss": 0.1806, "step": 47825 }, { "epoch": 0.6861985888831527, "grad_norm": 4.126065731048584, "learning_rate": 2.5708959187575685e-05, "loss": 0.1047, "step": 47850 }, { "epoch": 0.6865571043423393, "grad_norm": 7.615231037139893, "learning_rate": 2.5704975674010493e-05, "loss": 0.1561, "step": 47875 }, { "epoch": 0.6869156198015258, "grad_norm": 12.71405029296875, "learning_rate": 2.5700992160445304e-05, "loss": 0.2239, "step": 47900 }, { "epoch": 0.6872741352607125, "grad_norm": 13.619203567504883, "learning_rate": 2.5697008646880112e-05, "loss": 0.1694, "step": 47925 }, { "epoch": 0.6876326507198991, "grad_norm": 8.06566333770752, "learning_rate": 2.569302513331492e-05, "loss": 0.2326, "step": 47950 }, { "epoch": 0.6879911661790856, "grad_norm": 0.9590276479721069, "learning_rate": 2.5689041619749728e-05, "loss": 0.1806, "step": 47975 }, { "epoch": 0.6883496816382723, "grad_norm": 5.3854780197143555, "learning_rate": 2.5685058106184536e-05, "loss": 0.2452, "step": 48000 }, { "epoch": 0.6887081970974589, "grad_norm": 18.00775718688965, "learning_rate": 2.5681074592619347e-05, "loss": 0.2546, "step": 48025 }, { "epoch": 0.6890667125566454, "grad_norm": 2.0834193229675293, "learning_rate": 2.5677091079054155e-05, "loss": 0.2248, "step": 48050 }, { "epoch": 0.6894252280158321, "grad_norm": 14.408710479736328, "learning_rate": 2.5673107565488963e-05, "loss": 0.1806, "step": 48075 }, { "epoch": 0.6897837434750187, "grad_norm": 10.678139686584473, "learning_rate": 2.566912405192377e-05, "loss": 0.2534, "step": 48100 }, { "epoch": 0.6901422589342052, "grad_norm": 16.767229080200195, "learning_rate": 2.5665140538358578e-05, "loss": 0.1975, "step": 48125 }, { "epoch": 0.6905007743933919, "grad_norm": 4.811177730560303, "learning_rate": 2.566115702479339e-05, "loss": 0.2227, "step": 48150 }, { "epoch": 0.6908592898525785, "grad_norm": 6.717509746551514, "learning_rate": 2.5657173511228197e-05, "loss": 0.2194, "step": 48175 }, { "epoch": 0.691217805311765, "grad_norm": 16.557897567749023, "learning_rate": 2.5653189997663005e-05, "loss": 0.2451, "step": 48200 }, { "epoch": 0.6915763207709517, "grad_norm": 6.381607532501221, "learning_rate": 2.5649206484097813e-05, "loss": 0.2046, "step": 48225 }, { "epoch": 0.6919348362301383, "grad_norm": 17.191328048706055, "learning_rate": 2.564522297053262e-05, "loss": 0.2016, "step": 48250 }, { "epoch": 0.6922933516893248, "grad_norm": 15.16845703125, "learning_rate": 2.5641239456967432e-05, "loss": 0.2491, "step": 48275 }, { "epoch": 0.6926518671485115, "grad_norm": 8.18386459350586, "learning_rate": 2.563725594340224e-05, "loss": 0.2562, "step": 48300 }, { "epoch": 0.6930103826076981, "grad_norm": 9.809605598449707, "learning_rate": 2.5633272429837048e-05, "loss": 0.2339, "step": 48325 }, { "epoch": 0.6933688980668846, "grad_norm": 20.23722267150879, "learning_rate": 2.5629288916271856e-05, "loss": 0.2619, "step": 48350 }, { "epoch": 0.6937274135260713, "grad_norm": 13.737516403198242, "learning_rate": 2.5625305402706667e-05, "loss": 0.2202, "step": 48375 }, { "epoch": 0.6940859289852579, "grad_norm": 11.046343803405762, "learning_rate": 2.5621321889141475e-05, "loss": 0.2184, "step": 48400 }, { "epoch": 0.6944444444444444, "grad_norm": 5.841567039489746, "learning_rate": 2.5617338375576283e-05, "loss": 0.1759, "step": 48425 }, { "epoch": 0.6948029599036311, "grad_norm": 1.9102824926376343, "learning_rate": 2.561335486201109e-05, "loss": 0.1623, "step": 48450 }, { "epoch": 0.6951614753628177, "grad_norm": 13.114572525024414, "learning_rate": 2.5609371348445898e-05, "loss": 0.1362, "step": 48475 }, { "epoch": 0.6955199908220042, "grad_norm": 6.338075637817383, "learning_rate": 2.560538783488071e-05, "loss": 0.3121, "step": 48500 }, { "epoch": 0.6958785062811909, "grad_norm": 1.0590856075286865, "learning_rate": 2.5601404321315517e-05, "loss": 0.2313, "step": 48525 }, { "epoch": 0.6962370217403775, "grad_norm": 2.3297998905181885, "learning_rate": 2.5597420807750325e-05, "loss": 0.2545, "step": 48550 }, { "epoch": 0.696595537199564, "grad_norm": 19.888967514038086, "learning_rate": 2.5593437294185133e-05, "loss": 0.2591, "step": 48575 }, { "epoch": 0.6969540526587507, "grad_norm": 6.70273494720459, "learning_rate": 2.558945378061994e-05, "loss": 0.176, "step": 48600 }, { "epoch": 0.6973125681179373, "grad_norm": 22.711244583129883, "learning_rate": 2.5585470267054752e-05, "loss": 0.2709, "step": 48625 }, { "epoch": 0.6976710835771238, "grad_norm": 3.8228015899658203, "learning_rate": 2.558148675348956e-05, "loss": 0.182, "step": 48650 }, { "epoch": 0.6980295990363105, "grad_norm": 6.995223522186279, "learning_rate": 2.5577503239924368e-05, "loss": 0.2686, "step": 48675 }, { "epoch": 0.6983881144954971, "grad_norm": 9.635945320129395, "learning_rate": 2.5573519726359176e-05, "loss": 0.1639, "step": 48700 }, { "epoch": 0.6987466299546836, "grad_norm": 13.649540901184082, "learning_rate": 2.5569536212793983e-05, "loss": 0.2807, "step": 48725 }, { "epoch": 0.6991051454138703, "grad_norm": 3.1263246536254883, "learning_rate": 2.5565552699228795e-05, "loss": 0.1633, "step": 48750 }, { "epoch": 0.6994636608730569, "grad_norm": 0.9838136434555054, "learning_rate": 2.5561569185663603e-05, "loss": 0.2295, "step": 48775 }, { "epoch": 0.6998221763322434, "grad_norm": 9.139395713806152, "learning_rate": 2.555758567209841e-05, "loss": 0.1735, "step": 48800 }, { "epoch": 0.70018069179143, "grad_norm": 17.47401237487793, "learning_rate": 2.5553602158533218e-05, "loss": 0.1543, "step": 48825 }, { "epoch": 0.7005392072506167, "grad_norm": 16.929012298583984, "learning_rate": 2.5549618644968026e-05, "loss": 0.1757, "step": 48850 }, { "epoch": 0.7008977227098032, "grad_norm": 5.164494514465332, "learning_rate": 2.5545635131402837e-05, "loss": 0.3211, "step": 48875 }, { "epoch": 0.7012562381689899, "grad_norm": 14.255220413208008, "learning_rate": 2.5541651617837645e-05, "loss": 0.2276, "step": 48900 }, { "epoch": 0.7016147536281765, "grad_norm": 8.554600715637207, "learning_rate": 2.5537668104272453e-05, "loss": 0.1501, "step": 48925 }, { "epoch": 0.701973269087363, "grad_norm": 0.2092844545841217, "learning_rate": 2.5533684590707257e-05, "loss": 0.2756, "step": 48950 }, { "epoch": 0.7023317845465497, "grad_norm": 0.7037070989608765, "learning_rate": 2.552970107714207e-05, "loss": 0.1086, "step": 48975 }, { "epoch": 0.7026903000057363, "grad_norm": 0.5983951687812805, "learning_rate": 2.5525717563576877e-05, "loss": 0.1857, "step": 49000 }, { "epoch": 0.7030488154649228, "grad_norm": 8.813067436218262, "learning_rate": 2.5521734050011684e-05, "loss": 0.2234, "step": 49025 }, { "epoch": 0.7034073309241095, "grad_norm": 4.218845367431641, "learning_rate": 2.5517750536446492e-05, "loss": 0.2621, "step": 49050 }, { "epoch": 0.7037658463832961, "grad_norm": 2.2478420734405518, "learning_rate": 2.55137670228813e-05, "loss": 0.1981, "step": 49075 }, { "epoch": 0.7041243618424826, "grad_norm": 2.0957424640655518, "learning_rate": 2.550978350931611e-05, "loss": 0.2507, "step": 49100 }, { "epoch": 0.7044828773016693, "grad_norm": 3.1931381225585938, "learning_rate": 2.550579999575092e-05, "loss": 0.139, "step": 49125 }, { "epoch": 0.7048413927608559, "grad_norm": 16.279708862304688, "learning_rate": 2.5501816482185727e-05, "loss": 0.1467, "step": 49150 }, { "epoch": 0.7051999082200424, "grad_norm": 6.240708351135254, "learning_rate": 2.5497832968620535e-05, "loss": 0.186, "step": 49175 }, { "epoch": 0.705558423679229, "grad_norm": 3.056933879852295, "learning_rate": 2.5493849455055343e-05, "loss": 0.2007, "step": 49200 }, { "epoch": 0.7059169391384157, "grad_norm": 10.540163040161133, "learning_rate": 2.5489865941490154e-05, "loss": 0.2625, "step": 49225 }, { "epoch": 0.7062754545976022, "grad_norm": 0.9299947023391724, "learning_rate": 2.5485882427924962e-05, "loss": 0.2192, "step": 49250 }, { "epoch": 0.7066339700567889, "grad_norm": 8.712246894836426, "learning_rate": 2.548189891435977e-05, "loss": 0.2079, "step": 49275 }, { "epoch": 0.7069924855159755, "grad_norm": 16.147829055786133, "learning_rate": 2.5477915400794577e-05, "loss": 0.242, "step": 49300 }, { "epoch": 0.707351000975162, "grad_norm": 0.7288482785224915, "learning_rate": 2.5473931887229385e-05, "loss": 0.1535, "step": 49325 }, { "epoch": 0.7077095164343487, "grad_norm": 15.57101821899414, "learning_rate": 2.5469948373664197e-05, "loss": 0.2612, "step": 49350 }, { "epoch": 0.7080680318935353, "grad_norm": 4.914612293243408, "learning_rate": 2.5465964860099004e-05, "loss": 0.1927, "step": 49375 }, { "epoch": 0.7084265473527218, "grad_norm": 3.258941411972046, "learning_rate": 2.5461981346533812e-05, "loss": 0.1503, "step": 49400 }, { "epoch": 0.7087850628119085, "grad_norm": 1.606888771057129, "learning_rate": 2.545799783296862e-05, "loss": 0.1134, "step": 49425 }, { "epoch": 0.7091435782710951, "grad_norm": 11.503899574279785, "learning_rate": 2.5454014319403428e-05, "loss": 0.1505, "step": 49450 }, { "epoch": 0.7095020937302816, "grad_norm": 15.2843599319458, "learning_rate": 2.545003080583824e-05, "loss": 0.2253, "step": 49475 }, { "epoch": 0.7098606091894683, "grad_norm": 0.9246481657028198, "learning_rate": 2.5446047292273047e-05, "loss": 0.229, "step": 49500 }, { "epoch": 0.7102191246486549, "grad_norm": 7.89980936050415, "learning_rate": 2.5442063778707855e-05, "loss": 0.1918, "step": 49525 }, { "epoch": 0.7105776401078414, "grad_norm": 3.3449900150299072, "learning_rate": 2.5438080265142663e-05, "loss": 0.1451, "step": 49550 }, { "epoch": 0.710936155567028, "grad_norm": 8.845477104187012, "learning_rate": 2.5434096751577474e-05, "loss": 0.1213, "step": 49575 }, { "epoch": 0.7112946710262147, "grad_norm": 10.24954605102539, "learning_rate": 2.5430113238012282e-05, "loss": 0.2371, "step": 49600 }, { "epoch": 0.7116531864854012, "grad_norm": 9.736761093139648, "learning_rate": 2.542612972444709e-05, "loss": 0.2443, "step": 49625 }, { "epoch": 0.7120117019445878, "grad_norm": 0.12790893018245697, "learning_rate": 2.5422146210881897e-05, "loss": 0.1195, "step": 49650 }, { "epoch": 0.7123702174037745, "grad_norm": 3.1024563312530518, "learning_rate": 2.5418162697316705e-05, "loss": 0.1596, "step": 49675 }, { "epoch": 0.712728732862961, "grad_norm": 0.6731327176094055, "learning_rate": 2.5414179183751517e-05, "loss": 0.2472, "step": 49700 }, { "epoch": 0.7130872483221476, "grad_norm": 5.87436056137085, "learning_rate": 2.5410195670186324e-05, "loss": 0.2558, "step": 49725 }, { "epoch": 0.7134457637813343, "grad_norm": 11.592942237854004, "learning_rate": 2.5406212156621132e-05, "loss": 0.1185, "step": 49750 }, { "epoch": 0.7138042792405208, "grad_norm": 12.492218017578125, "learning_rate": 2.540222864305594e-05, "loss": 0.1668, "step": 49775 }, { "epoch": 0.7141627946997074, "grad_norm": 6.724203109741211, "learning_rate": 2.5398245129490748e-05, "loss": 0.1856, "step": 49800 }, { "epoch": 0.7145213101588941, "grad_norm": 11.544733047485352, "learning_rate": 2.539426161592556e-05, "loss": 0.3572, "step": 49825 }, { "epoch": 0.7148798256180806, "grad_norm": 14.367207527160645, "learning_rate": 2.5390278102360367e-05, "loss": 0.2992, "step": 49850 }, { "epoch": 0.7152383410772672, "grad_norm": 0.3356517553329468, "learning_rate": 2.5386294588795175e-05, "loss": 0.2415, "step": 49875 }, { "epoch": 0.7155968565364539, "grad_norm": 13.70170783996582, "learning_rate": 2.5382311075229983e-05, "loss": 0.2006, "step": 49900 }, { "epoch": 0.7159553719956404, "grad_norm": 9.459465026855469, "learning_rate": 2.537832756166479e-05, "loss": 0.2059, "step": 49925 }, { "epoch": 0.716313887454827, "grad_norm": 1.4820029735565186, "learning_rate": 2.5374344048099602e-05, "loss": 0.2672, "step": 49950 }, { "epoch": 0.7166724029140137, "grad_norm": 7.119725704193115, "learning_rate": 2.537036053453441e-05, "loss": 0.1811, "step": 49975 }, { "epoch": 0.7170309183732002, "grad_norm": 16.731685638427734, "learning_rate": 2.5366377020969217e-05, "loss": 0.2775, "step": 50000 }, { "epoch": 0.7173894338323868, "grad_norm": 7.746875286102295, "learning_rate": 2.5362393507404025e-05, "loss": 0.1646, "step": 50025 }, { "epoch": 0.7177479492915735, "grad_norm": 6.154821395874023, "learning_rate": 2.535840999383883e-05, "loss": 0.2881, "step": 50050 }, { "epoch": 0.71810646475076, "grad_norm": 7.0594892501831055, "learning_rate": 2.535442648027364e-05, "loss": 0.2611, "step": 50075 }, { "epoch": 0.7184649802099466, "grad_norm": 1.3026933670043945, "learning_rate": 2.535044296670845e-05, "loss": 0.2007, "step": 50100 }, { "epoch": 0.7188234956691333, "grad_norm": 4.820169448852539, "learning_rate": 2.5346459453143257e-05, "loss": 0.2498, "step": 50125 }, { "epoch": 0.7191820111283198, "grad_norm": 2.7407212257385254, "learning_rate": 2.5342475939578065e-05, "loss": 0.2428, "step": 50150 }, { "epoch": 0.7195405265875064, "grad_norm": 1.152086615562439, "learning_rate": 2.5338492426012876e-05, "loss": 0.1828, "step": 50175 }, { "epoch": 0.7198990420466931, "grad_norm": 6.955665111541748, "learning_rate": 2.5334508912447684e-05, "loss": 0.2592, "step": 50200 }, { "epoch": 0.7202575575058796, "grad_norm": 11.449328422546387, "learning_rate": 2.533052539888249e-05, "loss": 0.1894, "step": 50225 }, { "epoch": 0.7206160729650662, "grad_norm": 1.3846416473388672, "learning_rate": 2.53265418853173e-05, "loss": 0.1805, "step": 50250 }, { "epoch": 0.7209745884242529, "grad_norm": 19.705978393554688, "learning_rate": 2.5322558371752107e-05, "loss": 0.2164, "step": 50275 }, { "epoch": 0.7213331038834394, "grad_norm": 1.362453818321228, "learning_rate": 2.531857485818692e-05, "loss": 0.2448, "step": 50300 }, { "epoch": 0.721691619342626, "grad_norm": 13.886855125427246, "learning_rate": 2.5314591344621726e-05, "loss": 0.1272, "step": 50325 }, { "epoch": 0.7220501348018127, "grad_norm": 7.837186813354492, "learning_rate": 2.5310607831056534e-05, "loss": 0.2357, "step": 50350 }, { "epoch": 0.7224086502609992, "grad_norm": 2.939556360244751, "learning_rate": 2.5306624317491342e-05, "loss": 0.2459, "step": 50375 }, { "epoch": 0.7227671657201858, "grad_norm": 9.505022048950195, "learning_rate": 2.530264080392615e-05, "loss": 0.2039, "step": 50400 }, { "epoch": 0.7231256811793725, "grad_norm": 0.9654161930084229, "learning_rate": 2.529865729036096e-05, "loss": 0.2095, "step": 50425 }, { "epoch": 0.723484196638559, "grad_norm": 6.600845813751221, "learning_rate": 2.529467377679577e-05, "loss": 0.164, "step": 50450 }, { "epoch": 0.7238427120977456, "grad_norm": 12.237832069396973, "learning_rate": 2.5290690263230577e-05, "loss": 0.2186, "step": 50475 }, { "epoch": 0.7242012275569323, "grad_norm": 0.43458473682403564, "learning_rate": 2.5286706749665385e-05, "loss": 0.1802, "step": 50500 }, { "epoch": 0.7245597430161188, "grad_norm": 14.870211601257324, "learning_rate": 2.5282723236100192e-05, "loss": 0.1601, "step": 50525 }, { "epoch": 0.7249182584753054, "grad_norm": 5.181654453277588, "learning_rate": 2.5278739722535004e-05, "loss": 0.2308, "step": 50550 }, { "epoch": 0.7252767739344921, "grad_norm": 0.2100396454334259, "learning_rate": 2.527475620896981e-05, "loss": 0.2556, "step": 50575 }, { "epoch": 0.7256352893936786, "grad_norm": 12.33954906463623, "learning_rate": 2.527077269540462e-05, "loss": 0.2346, "step": 50600 }, { "epoch": 0.7259938048528652, "grad_norm": 6.111337661743164, "learning_rate": 2.5266789181839427e-05, "loss": 0.27, "step": 50625 }, { "epoch": 0.7263523203120519, "grad_norm": 8.435004234313965, "learning_rate": 2.5262805668274235e-05, "loss": 0.1743, "step": 50650 }, { "epoch": 0.7267108357712384, "grad_norm": 18.890365600585938, "learning_rate": 2.5258822154709046e-05, "loss": 0.2121, "step": 50675 }, { "epoch": 0.727069351230425, "grad_norm": 20.226919174194336, "learning_rate": 2.5254838641143854e-05, "loss": 0.2059, "step": 50700 }, { "epoch": 0.7274278666896117, "grad_norm": 2.4947245121002197, "learning_rate": 2.5250855127578662e-05, "loss": 0.2362, "step": 50725 }, { "epoch": 0.7277863821487982, "grad_norm": 13.257113456726074, "learning_rate": 2.524687161401347e-05, "loss": 0.2847, "step": 50750 }, { "epoch": 0.7281448976079848, "grad_norm": 5.493901252746582, "learning_rate": 2.524288810044828e-05, "loss": 0.2746, "step": 50775 }, { "epoch": 0.7285034130671715, "grad_norm": 2.663189172744751, "learning_rate": 2.523890458688309e-05, "loss": 0.1163, "step": 50800 }, { "epoch": 0.728861928526358, "grad_norm": 6.742499351501465, "learning_rate": 2.5234921073317897e-05, "loss": 0.347, "step": 50825 }, { "epoch": 0.7292204439855446, "grad_norm": 10.551657676696777, "learning_rate": 2.5230937559752704e-05, "loss": 0.275, "step": 50850 }, { "epoch": 0.7295789594447313, "grad_norm": 14.535724639892578, "learning_rate": 2.5226954046187512e-05, "loss": 0.3556, "step": 50875 }, { "epoch": 0.7299374749039178, "grad_norm": 6.292080402374268, "learning_rate": 2.5222970532622324e-05, "loss": 0.1746, "step": 50900 }, { "epoch": 0.7302959903631044, "grad_norm": 1.711198329925537, "learning_rate": 2.521898701905713e-05, "loss": 0.1514, "step": 50925 }, { "epoch": 0.7306545058222911, "grad_norm": 1.457282543182373, "learning_rate": 2.521500350549194e-05, "loss": 0.2643, "step": 50950 }, { "epoch": 0.7310130212814776, "grad_norm": 21.3856201171875, "learning_rate": 2.5211019991926747e-05, "loss": 0.2453, "step": 50975 }, { "epoch": 0.7313715367406642, "grad_norm": 11.05455207824707, "learning_rate": 2.5207036478361555e-05, "loss": 0.135, "step": 51000 }, { "epoch": 0.7317300521998509, "grad_norm": 6.445497035980225, "learning_rate": 2.5203052964796366e-05, "loss": 0.2338, "step": 51025 }, { "epoch": 0.7320885676590374, "grad_norm": 4.915597438812256, "learning_rate": 2.5199069451231174e-05, "loss": 0.252, "step": 51050 }, { "epoch": 0.732447083118224, "grad_norm": 7.5260419845581055, "learning_rate": 2.5195085937665982e-05, "loss": 0.2018, "step": 51075 }, { "epoch": 0.7328055985774107, "grad_norm": 0.08552476763725281, "learning_rate": 2.519110242410079e-05, "loss": 0.1461, "step": 51100 }, { "epoch": 0.7331641140365972, "grad_norm": 0.7378263473510742, "learning_rate": 2.5187118910535598e-05, "loss": 0.0728, "step": 51125 }, { "epoch": 0.7335226294957838, "grad_norm": 9.249420166015625, "learning_rate": 2.5183135396970405e-05, "loss": 0.1978, "step": 51150 }, { "epoch": 0.7338811449549705, "grad_norm": 21.197046279907227, "learning_rate": 2.5179151883405213e-05, "loss": 0.2, "step": 51175 }, { "epoch": 0.7342396604141571, "grad_norm": 0.5963737368583679, "learning_rate": 2.517516836984002e-05, "loss": 0.1613, "step": 51200 }, { "epoch": 0.7345981758733436, "grad_norm": 16.758880615234375, "learning_rate": 2.517118485627483e-05, "loss": 0.1463, "step": 51225 }, { "epoch": 0.7349566913325303, "grad_norm": 1.6427079439163208, "learning_rate": 2.5167201342709637e-05, "loss": 0.1925, "step": 51250 }, { "epoch": 0.7353152067917169, "grad_norm": 5.30280876159668, "learning_rate": 2.5163217829144448e-05, "loss": 0.2161, "step": 51275 }, { "epoch": 0.7356737222509034, "grad_norm": 20.516305923461914, "learning_rate": 2.5159234315579256e-05, "loss": 0.2658, "step": 51300 }, { "epoch": 0.7360322377100901, "grad_norm": 2.5681324005126953, "learning_rate": 2.5155250802014064e-05, "loss": 0.1937, "step": 51325 }, { "epoch": 0.7363907531692767, "grad_norm": 3.99983549118042, "learning_rate": 2.515126728844887e-05, "loss": 0.1551, "step": 51350 }, { "epoch": 0.7367492686284632, "grad_norm": 11.31380558013916, "learning_rate": 2.5147283774883683e-05, "loss": 0.3306, "step": 51375 }, { "epoch": 0.7371077840876499, "grad_norm": 14.364718437194824, "learning_rate": 2.514330026131849e-05, "loss": 0.2435, "step": 51400 }, { "epoch": 0.7374662995468365, "grad_norm": 2.219434976577759, "learning_rate": 2.51393167477533e-05, "loss": 0.1908, "step": 51425 }, { "epoch": 0.737824815006023, "grad_norm": 14.235979080200195, "learning_rate": 2.5135333234188106e-05, "loss": 0.2313, "step": 51450 }, { "epoch": 0.7381833304652097, "grad_norm": 2.458843469619751, "learning_rate": 2.5131349720622914e-05, "loss": 0.2358, "step": 51475 }, { "epoch": 0.7385418459243963, "grad_norm": 1.0393255949020386, "learning_rate": 2.5127366207057725e-05, "loss": 0.2335, "step": 51500 }, { "epoch": 0.7389003613835828, "grad_norm": 14.5276517868042, "learning_rate": 2.5123382693492533e-05, "loss": 0.2427, "step": 51525 }, { "epoch": 0.7392588768427695, "grad_norm": 8.615577697753906, "learning_rate": 2.511939917992734e-05, "loss": 0.3657, "step": 51550 }, { "epoch": 0.7396173923019561, "grad_norm": 3.619020700454712, "learning_rate": 2.511541566636215e-05, "loss": 0.1338, "step": 51575 }, { "epoch": 0.7399759077611426, "grad_norm": 8.503814697265625, "learning_rate": 2.5111432152796957e-05, "loss": 0.1609, "step": 51600 }, { "epoch": 0.7403344232203293, "grad_norm": 2.9111757278442383, "learning_rate": 2.5107448639231768e-05, "loss": 0.1746, "step": 51625 }, { "epoch": 0.7406929386795159, "grad_norm": 9.35600471496582, "learning_rate": 2.5103465125666576e-05, "loss": 0.2041, "step": 51650 }, { "epoch": 0.7410514541387024, "grad_norm": 0.6955277919769287, "learning_rate": 2.5099481612101384e-05, "loss": 0.1291, "step": 51675 }, { "epoch": 0.7414099695978891, "grad_norm": 10.412870407104492, "learning_rate": 2.509549809853619e-05, "loss": 0.3525, "step": 51700 }, { "epoch": 0.7417684850570757, "grad_norm": 3.3583824634552, "learning_rate": 2.5091514584971e-05, "loss": 0.3255, "step": 51725 }, { "epoch": 0.7421270005162622, "grad_norm": 6.914744853973389, "learning_rate": 2.508753107140581e-05, "loss": 0.1806, "step": 51750 }, { "epoch": 0.7424855159754489, "grad_norm": 0.12309455871582031, "learning_rate": 2.508354755784062e-05, "loss": 0.1568, "step": 51775 }, { "epoch": 0.7428440314346355, "grad_norm": 2.436800718307495, "learning_rate": 2.5079564044275426e-05, "loss": 0.1939, "step": 51800 }, { "epoch": 0.743202546893822, "grad_norm": 1.0710885524749756, "learning_rate": 2.5075580530710234e-05, "loss": 0.1934, "step": 51825 }, { "epoch": 0.7435610623530087, "grad_norm": 14.00747013092041, "learning_rate": 2.5071597017145042e-05, "loss": 0.136, "step": 51850 }, { "epoch": 0.7439195778121953, "grad_norm": 1.892797827720642, "learning_rate": 2.5067613503579853e-05, "loss": 0.2287, "step": 51875 }, { "epoch": 0.7442780932713818, "grad_norm": 19.142641067504883, "learning_rate": 2.506362999001466e-05, "loss": 0.3185, "step": 51900 }, { "epoch": 0.7446366087305685, "grad_norm": 2.9137306213378906, "learning_rate": 2.505964647644947e-05, "loss": 0.226, "step": 51925 }, { "epoch": 0.7449951241897551, "grad_norm": 7.534127712249756, "learning_rate": 2.5055662962884277e-05, "loss": 0.2872, "step": 51950 }, { "epoch": 0.7453536396489416, "grad_norm": 2.62214994430542, "learning_rate": 2.5051679449319088e-05, "loss": 0.1148, "step": 51975 }, { "epoch": 0.7457121551081283, "grad_norm": 11.454050064086914, "learning_rate": 2.5047695935753896e-05, "loss": 0.2234, "step": 52000 }, { "epoch": 0.7460706705673149, "grad_norm": 10.931231498718262, "learning_rate": 2.5043712422188704e-05, "loss": 0.1149, "step": 52025 }, { "epoch": 0.7464291860265014, "grad_norm": 0.8193973898887634, "learning_rate": 2.503972890862351e-05, "loss": 0.2089, "step": 52050 }, { "epoch": 0.7467877014856881, "grad_norm": 6.273176670074463, "learning_rate": 2.503574539505832e-05, "loss": 0.1526, "step": 52075 }, { "epoch": 0.7471462169448747, "grad_norm": 4.701350688934326, "learning_rate": 2.503176188149313e-05, "loss": 0.1683, "step": 52100 }, { "epoch": 0.7475047324040612, "grad_norm": 3.211033821105957, "learning_rate": 2.502777836792794e-05, "loss": 0.198, "step": 52125 }, { "epoch": 0.7478632478632479, "grad_norm": 14.844167709350586, "learning_rate": 2.5023794854362746e-05, "loss": 0.2299, "step": 52150 }, { "epoch": 0.7482217633224345, "grad_norm": 2.518960952758789, "learning_rate": 2.5019811340797554e-05, "loss": 0.2201, "step": 52175 }, { "epoch": 0.748580278781621, "grad_norm": 10.093320846557617, "learning_rate": 2.5015827827232362e-05, "loss": 0.2877, "step": 52200 }, { "epoch": 0.7489387942408077, "grad_norm": 0.37909093499183655, "learning_rate": 2.501184431366717e-05, "loss": 0.1446, "step": 52225 }, { "epoch": 0.7492973096999943, "grad_norm": 0.05344092473387718, "learning_rate": 2.5007860800101978e-05, "loss": 0.2626, "step": 52250 }, { "epoch": 0.7496558251591808, "grad_norm": 17.78056526184082, "learning_rate": 2.5003877286536786e-05, "loss": 0.2615, "step": 52275 }, { "epoch": 0.7500143406183675, "grad_norm": 18.557411193847656, "learning_rate": 2.4999893772971593e-05, "loss": 0.3131, "step": 52300 }, { "epoch": 0.7503728560775541, "grad_norm": 4.1507954597473145, "learning_rate": 2.49959102594064e-05, "loss": 0.1451, "step": 52325 }, { "epoch": 0.7507313715367406, "grad_norm": 0.05050576478242874, "learning_rate": 2.4991926745841212e-05, "loss": 0.2659, "step": 52350 }, { "epoch": 0.7510898869959273, "grad_norm": 5.469889163970947, "learning_rate": 2.498794323227602e-05, "loss": 0.1886, "step": 52375 }, { "epoch": 0.7514484024551139, "grad_norm": 5.750992298126221, "learning_rate": 2.4983959718710828e-05, "loss": 0.1374, "step": 52400 }, { "epoch": 0.7518069179143004, "grad_norm": 13.538805961608887, "learning_rate": 2.4979976205145636e-05, "loss": 0.2516, "step": 52425 }, { "epoch": 0.7521654333734871, "grad_norm": 3.49733567237854, "learning_rate": 2.4975992691580444e-05, "loss": 0.1236, "step": 52450 }, { "epoch": 0.7525239488326737, "grad_norm": 11.666732788085938, "learning_rate": 2.4972009178015255e-05, "loss": 0.2631, "step": 52475 }, { "epoch": 0.7528824642918602, "grad_norm": 2.9963459968566895, "learning_rate": 2.4968025664450063e-05, "loss": 0.2716, "step": 52500 }, { "epoch": 0.7532409797510469, "grad_norm": 4.284170627593994, "learning_rate": 2.496404215088487e-05, "loss": 0.2141, "step": 52525 }, { "epoch": 0.7535994952102335, "grad_norm": 8.028654098510742, "learning_rate": 2.496005863731968e-05, "loss": 0.2608, "step": 52550 }, { "epoch": 0.75395801066942, "grad_norm": 12.70154857635498, "learning_rate": 2.4956075123754486e-05, "loss": 0.1807, "step": 52575 }, { "epoch": 0.7543165261286067, "grad_norm": 2.395838499069214, "learning_rate": 2.4952091610189298e-05, "loss": 0.2377, "step": 52600 }, { "epoch": 0.7546750415877933, "grad_norm": 6.887660980224609, "learning_rate": 2.4948108096624106e-05, "loss": 0.1827, "step": 52625 }, { "epoch": 0.7550335570469798, "grad_norm": 0.5965154767036438, "learning_rate": 2.4944124583058913e-05, "loss": 0.1326, "step": 52650 }, { "epoch": 0.7553920725061665, "grad_norm": 0.16648058593273163, "learning_rate": 2.494014106949372e-05, "loss": 0.2827, "step": 52675 }, { "epoch": 0.7557505879653531, "grad_norm": 9.64054012298584, "learning_rate": 2.4936157555928532e-05, "loss": 0.1867, "step": 52700 }, { "epoch": 0.7561091034245396, "grad_norm": 0.6860041618347168, "learning_rate": 2.493217404236334e-05, "loss": 0.1665, "step": 52725 }, { "epoch": 0.7564676188837263, "grad_norm": 3.2539355754852295, "learning_rate": 2.4928190528798148e-05, "loss": 0.1413, "step": 52750 }, { "epoch": 0.7568261343429129, "grad_norm": 1.5899807214736938, "learning_rate": 2.4924207015232956e-05, "loss": 0.2324, "step": 52775 }, { "epoch": 0.7571846498020994, "grad_norm": 2.396925687789917, "learning_rate": 2.4920223501667764e-05, "loss": 0.2456, "step": 52800 }, { "epoch": 0.7575431652612861, "grad_norm": 10.850423812866211, "learning_rate": 2.4916239988102575e-05, "loss": 0.2631, "step": 52825 }, { "epoch": 0.7579016807204727, "grad_norm": 15.237521171569824, "learning_rate": 2.4912256474537383e-05, "loss": 0.2683, "step": 52850 }, { "epoch": 0.7582601961796592, "grad_norm": 16.1119327545166, "learning_rate": 2.490827296097219e-05, "loss": 0.2144, "step": 52875 }, { "epoch": 0.7586187116388459, "grad_norm": 6.466443061828613, "learning_rate": 2.4904289447407e-05, "loss": 0.2446, "step": 52900 }, { "epoch": 0.7589772270980325, "grad_norm": 11.207780838012695, "learning_rate": 2.4900305933841806e-05, "loss": 0.3181, "step": 52925 }, { "epoch": 0.759335742557219, "grad_norm": 0.2926609218120575, "learning_rate": 2.4896322420276618e-05, "loss": 0.1073, "step": 52950 }, { "epoch": 0.7596942580164057, "grad_norm": 13.7081880569458, "learning_rate": 2.4892338906711425e-05, "loss": 0.2041, "step": 52975 }, { "epoch": 0.7600527734755923, "grad_norm": 22.871788024902344, "learning_rate": 2.4888355393146233e-05, "loss": 0.2071, "step": 53000 }, { "epoch": 0.7604112889347788, "grad_norm": 17.747793197631836, "learning_rate": 2.488437187958104e-05, "loss": 0.1604, "step": 53025 }, { "epoch": 0.7607698043939655, "grad_norm": 12.251692771911621, "learning_rate": 2.488038836601585e-05, "loss": 0.1554, "step": 53050 }, { "epoch": 0.7611283198531521, "grad_norm": 17.914535522460938, "learning_rate": 2.487640485245066e-05, "loss": 0.2363, "step": 53075 }, { "epoch": 0.7614868353123386, "grad_norm": 0.10899394750595093, "learning_rate": 2.4872421338885468e-05, "loss": 0.2455, "step": 53100 }, { "epoch": 0.7618453507715253, "grad_norm": 4.818118095397949, "learning_rate": 2.4868437825320276e-05, "loss": 0.1749, "step": 53125 }, { "epoch": 0.7622038662307119, "grad_norm": 12.013470649719238, "learning_rate": 2.4864454311755084e-05, "loss": 0.233, "step": 53150 }, { "epoch": 0.7625623816898984, "grad_norm": 1.1978198289871216, "learning_rate": 2.486047079818989e-05, "loss": 0.1518, "step": 53175 }, { "epoch": 0.7629208971490851, "grad_norm": 5.6515913009643555, "learning_rate": 2.4856487284624703e-05, "loss": 0.1989, "step": 53200 }, { "epoch": 0.7632794126082717, "grad_norm": 9.16110897064209, "learning_rate": 2.485250377105951e-05, "loss": 0.1292, "step": 53225 }, { "epoch": 0.7636379280674582, "grad_norm": 12.218998908996582, "learning_rate": 2.484852025749432e-05, "loss": 0.1556, "step": 53250 }, { "epoch": 0.7639964435266449, "grad_norm": 10.687811851501465, "learning_rate": 2.4844536743929126e-05, "loss": 0.2422, "step": 53275 }, { "epoch": 0.7643549589858315, "grad_norm": 4.475570201873779, "learning_rate": 2.4840553230363938e-05, "loss": 0.2205, "step": 53300 }, { "epoch": 0.764713474445018, "grad_norm": 12.928417205810547, "learning_rate": 2.4836569716798742e-05, "loss": 0.1412, "step": 53325 }, { "epoch": 0.7650719899042047, "grad_norm": 7.806126594543457, "learning_rate": 2.483258620323355e-05, "loss": 0.2113, "step": 53350 }, { "epoch": 0.7654305053633913, "grad_norm": 13.216428756713867, "learning_rate": 2.4828602689668358e-05, "loss": 0.1411, "step": 53375 }, { "epoch": 0.7657890208225778, "grad_norm": 4.43410587310791, "learning_rate": 2.4824619176103166e-05, "loss": 0.2495, "step": 53400 }, { "epoch": 0.7661475362817645, "grad_norm": 20.020328521728516, "learning_rate": 2.4820635662537977e-05, "loss": 0.301, "step": 53425 }, { "epoch": 0.7665060517409511, "grad_norm": 1.4786295890808105, "learning_rate": 2.4816652148972785e-05, "loss": 0.1808, "step": 53450 }, { "epoch": 0.7668645672001376, "grad_norm": 2.543961524963379, "learning_rate": 2.4812668635407593e-05, "loss": 0.1855, "step": 53475 }, { "epoch": 0.7672230826593243, "grad_norm": 9.255824089050293, "learning_rate": 2.48086851218424e-05, "loss": 0.3175, "step": 53500 }, { "epoch": 0.7675815981185109, "grad_norm": 17.788209915161133, "learning_rate": 2.4804701608277208e-05, "loss": 0.1633, "step": 53525 }, { "epoch": 0.7679401135776974, "grad_norm": 0.18035510182380676, "learning_rate": 2.480071809471202e-05, "loss": 0.1863, "step": 53550 }, { "epoch": 0.7682986290368841, "grad_norm": 2.211282730102539, "learning_rate": 2.4796734581146827e-05, "loss": 0.1574, "step": 53575 }, { "epoch": 0.7686571444960707, "grad_norm": 5.269371032714844, "learning_rate": 2.4792751067581635e-05, "loss": 0.1387, "step": 53600 }, { "epoch": 0.7690156599552572, "grad_norm": 14.584433555603027, "learning_rate": 2.4788767554016443e-05, "loss": 0.2098, "step": 53625 }, { "epoch": 0.7693741754144439, "grad_norm": 5.606329441070557, "learning_rate": 2.478478404045125e-05, "loss": 0.2483, "step": 53650 }, { "epoch": 0.7697326908736305, "grad_norm": 7.44419002532959, "learning_rate": 2.4780800526886062e-05, "loss": 0.2387, "step": 53675 }, { "epoch": 0.770091206332817, "grad_norm": 5.980225563049316, "learning_rate": 2.477681701332087e-05, "loss": 0.1383, "step": 53700 }, { "epoch": 0.7704497217920037, "grad_norm": 10.472184181213379, "learning_rate": 2.4772833499755678e-05, "loss": 0.2208, "step": 53725 }, { "epoch": 0.7708082372511903, "grad_norm": 6.945937633514404, "learning_rate": 2.4768849986190486e-05, "loss": 0.1685, "step": 53750 }, { "epoch": 0.7711667527103768, "grad_norm": 29.39360237121582, "learning_rate": 2.4764866472625293e-05, "loss": 0.0994, "step": 53775 }, { "epoch": 0.7715252681695635, "grad_norm": 9.91408634185791, "learning_rate": 2.4760882959060105e-05, "loss": 0.2073, "step": 53800 }, { "epoch": 0.7718837836287501, "grad_norm": 2.0912094116210938, "learning_rate": 2.4756899445494913e-05, "loss": 0.2202, "step": 53825 }, { "epoch": 0.7722422990879366, "grad_norm": 15.258938789367676, "learning_rate": 2.475291593192972e-05, "loss": 0.2252, "step": 53850 }, { "epoch": 0.7726008145471233, "grad_norm": 2.264417886734009, "learning_rate": 2.4748932418364528e-05, "loss": 0.2302, "step": 53875 }, { "epoch": 0.7729593300063099, "grad_norm": 14.59819221496582, "learning_rate": 2.474494890479934e-05, "loss": 0.2309, "step": 53900 }, { "epoch": 0.7733178454654964, "grad_norm": 12.097167015075684, "learning_rate": 2.4740965391234147e-05, "loss": 0.2524, "step": 53925 }, { "epoch": 0.7736763609246831, "grad_norm": 1.4436367750167847, "learning_rate": 2.4736981877668955e-05, "loss": 0.2003, "step": 53950 }, { "epoch": 0.7740348763838697, "grad_norm": 16.989261627197266, "learning_rate": 2.4732998364103763e-05, "loss": 0.1747, "step": 53975 }, { "epoch": 0.7743933918430562, "grad_norm": 15.786237716674805, "learning_rate": 2.472901485053857e-05, "loss": 0.1908, "step": 54000 }, { "epoch": 0.7747519073022429, "grad_norm": 9.429876327514648, "learning_rate": 2.4725031336973382e-05, "loss": 0.217, "step": 54025 }, { "epoch": 0.7751104227614295, "grad_norm": 10.480880737304688, "learning_rate": 2.472104782340819e-05, "loss": 0.1351, "step": 54050 }, { "epoch": 0.775468938220616, "grad_norm": 0.3167595863342285, "learning_rate": 2.4717064309842998e-05, "loss": 0.162, "step": 54075 }, { "epoch": 0.7758274536798027, "grad_norm": 6.765605926513672, "learning_rate": 2.4713080796277806e-05, "loss": 0.2004, "step": 54100 }, { "epoch": 0.7761859691389893, "grad_norm": 9.011117935180664, "learning_rate": 2.4709097282712613e-05, "loss": 0.0944, "step": 54125 }, { "epoch": 0.7765444845981758, "grad_norm": 13.10275650024414, "learning_rate": 2.4705113769147425e-05, "loss": 0.1888, "step": 54150 }, { "epoch": 0.7769030000573625, "grad_norm": 1.2910375595092773, "learning_rate": 2.4701130255582233e-05, "loss": 0.2685, "step": 54175 }, { "epoch": 0.7772615155165491, "grad_norm": 5.378042221069336, "learning_rate": 2.469714674201704e-05, "loss": 0.2061, "step": 54200 }, { "epoch": 0.7776200309757356, "grad_norm": 21.607128143310547, "learning_rate": 2.4693163228451848e-05, "loss": 0.2772, "step": 54225 }, { "epoch": 0.7779785464349223, "grad_norm": 5.579397678375244, "learning_rate": 2.4689179714886656e-05, "loss": 0.1435, "step": 54250 }, { "epoch": 0.7783370618941089, "grad_norm": 0.9097792506217957, "learning_rate": 2.4685196201321467e-05, "loss": 0.1698, "step": 54275 }, { "epoch": 0.7786955773532954, "grad_norm": 6.895192623138428, "learning_rate": 2.4681212687756275e-05, "loss": 0.3256, "step": 54300 }, { "epoch": 0.7790540928124821, "grad_norm": 0.13767901062965393, "learning_rate": 2.4677229174191083e-05, "loss": 0.1749, "step": 54325 }, { "epoch": 0.7794126082716687, "grad_norm": 1.9810256958007812, "learning_rate": 2.467324566062589e-05, "loss": 0.1849, "step": 54350 }, { "epoch": 0.7797711237308552, "grad_norm": 10.804314613342285, "learning_rate": 2.46692621470607e-05, "loss": 0.2126, "step": 54375 }, { "epoch": 0.7801296391900419, "grad_norm": 20.926700592041016, "learning_rate": 2.466527863349551e-05, "loss": 0.1827, "step": 54400 }, { "epoch": 0.7804881546492285, "grad_norm": 36.823421478271484, "learning_rate": 2.4661295119930314e-05, "loss": 0.2611, "step": 54425 }, { "epoch": 0.780846670108415, "grad_norm": 2.2614006996154785, "learning_rate": 2.4657311606365122e-05, "loss": 0.1604, "step": 54450 }, { "epoch": 0.7812051855676017, "grad_norm": 11.490524291992188, "learning_rate": 2.465332809279993e-05, "loss": 0.2163, "step": 54475 }, { "epoch": 0.7815637010267883, "grad_norm": 6.372769832611084, "learning_rate": 2.464934457923474e-05, "loss": 0.2013, "step": 54500 }, { "epoch": 0.7819222164859749, "grad_norm": 3.912896156311035, "learning_rate": 2.464536106566955e-05, "loss": 0.1292, "step": 54525 }, { "epoch": 0.7822807319451615, "grad_norm": 4.554114818572998, "learning_rate": 2.4641377552104357e-05, "loss": 0.2468, "step": 54550 }, { "epoch": 0.7826392474043481, "grad_norm": 1.245154857635498, "learning_rate": 2.4637394038539165e-05, "loss": 0.3343, "step": 54575 }, { "epoch": 0.7829977628635347, "grad_norm": 7.774140357971191, "learning_rate": 2.4633410524973973e-05, "loss": 0.1596, "step": 54600 }, { "epoch": 0.7833562783227213, "grad_norm": 0.5677321553230286, "learning_rate": 2.4629427011408784e-05, "loss": 0.1765, "step": 54625 }, { "epoch": 0.7837147937819079, "grad_norm": 10.350456237792969, "learning_rate": 2.4625443497843592e-05, "loss": 0.2724, "step": 54650 }, { "epoch": 0.7840733092410945, "grad_norm": 6.124373912811279, "learning_rate": 2.46214599842784e-05, "loss": 0.2559, "step": 54675 }, { "epoch": 0.784431824700281, "grad_norm": 0.35369980335235596, "learning_rate": 2.4617476470713207e-05, "loss": 0.2453, "step": 54700 }, { "epoch": 0.7847903401594677, "grad_norm": 2.840327024459839, "learning_rate": 2.4613492957148015e-05, "loss": 0.2745, "step": 54725 }, { "epoch": 0.7851488556186543, "grad_norm": 13.019189834594727, "learning_rate": 2.4609509443582827e-05, "loss": 0.1907, "step": 54750 }, { "epoch": 0.7855073710778409, "grad_norm": 0.15570443868637085, "learning_rate": 2.4605525930017634e-05, "loss": 0.164, "step": 54775 }, { "epoch": 0.7858658865370275, "grad_norm": 2.0913000106811523, "learning_rate": 2.4601542416452442e-05, "loss": 0.2155, "step": 54800 }, { "epoch": 0.7862244019962141, "grad_norm": 21.82289695739746, "learning_rate": 2.459755890288725e-05, "loss": 0.3047, "step": 54825 }, { "epoch": 0.7865829174554007, "grad_norm": 2.8926689624786377, "learning_rate": 2.4593575389322058e-05, "loss": 0.2586, "step": 54850 }, { "epoch": 0.7869414329145873, "grad_norm": 1.0927660465240479, "learning_rate": 2.458959187575687e-05, "loss": 0.151, "step": 54875 }, { "epoch": 0.7872999483737739, "grad_norm": 10.479037284851074, "learning_rate": 2.4585608362191677e-05, "loss": 0.1922, "step": 54900 }, { "epoch": 0.7876584638329605, "grad_norm": 1.8808561563491821, "learning_rate": 2.4581624848626485e-05, "loss": 0.1087, "step": 54925 }, { "epoch": 0.7880169792921471, "grad_norm": 3.2186944484710693, "learning_rate": 2.4577641335061293e-05, "loss": 0.2483, "step": 54950 }, { "epoch": 0.7883754947513337, "grad_norm": 12.537614822387695, "learning_rate": 2.45736578214961e-05, "loss": 0.2758, "step": 54975 }, { "epoch": 0.7887340102105203, "grad_norm": 8.651204109191895, "learning_rate": 2.4569674307930912e-05, "loss": 0.1447, "step": 55000 }, { "epoch": 0.7890925256697069, "grad_norm": 7.527142524719238, "learning_rate": 2.456569079436572e-05, "loss": 0.1393, "step": 55025 }, { "epoch": 0.7894510411288935, "grad_norm": 2.633613348007202, "learning_rate": 2.4561707280800527e-05, "loss": 0.1832, "step": 55050 }, { "epoch": 0.78980955658808, "grad_norm": 7.58916711807251, "learning_rate": 2.4557723767235335e-05, "loss": 0.2063, "step": 55075 }, { "epoch": 0.7901680720472667, "grad_norm": 3.9782097339630127, "learning_rate": 2.4553740253670146e-05, "loss": 0.1362, "step": 55100 }, { "epoch": 0.7905265875064533, "grad_norm": 3.0023193359375, "learning_rate": 2.4549756740104954e-05, "loss": 0.1438, "step": 55125 }, { "epoch": 0.7908851029656399, "grad_norm": 11.779471397399902, "learning_rate": 2.4545773226539762e-05, "loss": 0.2851, "step": 55150 }, { "epoch": 0.7912436184248265, "grad_norm": 5.574542999267578, "learning_rate": 2.454178971297457e-05, "loss": 0.1825, "step": 55175 }, { "epoch": 0.7916021338840131, "grad_norm": 10.314001083374023, "learning_rate": 2.4537806199409378e-05, "loss": 0.3198, "step": 55200 }, { "epoch": 0.7919606493431997, "grad_norm": 1.8519508838653564, "learning_rate": 2.453382268584419e-05, "loss": 0.2464, "step": 55225 }, { "epoch": 0.7923191648023863, "grad_norm": 3.8380420207977295, "learning_rate": 2.4529839172278997e-05, "loss": 0.2232, "step": 55250 }, { "epoch": 0.7926776802615729, "grad_norm": 7.189006805419922, "learning_rate": 2.4525855658713805e-05, "loss": 0.1297, "step": 55275 }, { "epoch": 0.7930361957207595, "grad_norm": 15.07776927947998, "learning_rate": 2.4521872145148613e-05, "loss": 0.1837, "step": 55300 }, { "epoch": 0.7933947111799461, "grad_norm": 16.808156967163086, "learning_rate": 2.451788863158342e-05, "loss": 0.1325, "step": 55325 }, { "epoch": 0.7937532266391327, "grad_norm": 1.0282902717590332, "learning_rate": 2.4513905118018232e-05, "loss": 0.135, "step": 55350 }, { "epoch": 0.7941117420983193, "grad_norm": 10.23409366607666, "learning_rate": 2.450992160445304e-05, "loss": 0.1697, "step": 55375 }, { "epoch": 0.7944702575575059, "grad_norm": 0.4860974848270416, "learning_rate": 2.4505938090887847e-05, "loss": 0.29, "step": 55400 }, { "epoch": 0.7948287730166925, "grad_norm": 2.08728289604187, "learning_rate": 2.4501954577322655e-05, "loss": 0.1661, "step": 55425 }, { "epoch": 0.795187288475879, "grad_norm": 15.503584861755371, "learning_rate": 2.4497971063757463e-05, "loss": 0.2672, "step": 55450 }, { "epoch": 0.7955458039350657, "grad_norm": 3.015883445739746, "learning_rate": 2.4493987550192274e-05, "loss": 0.1856, "step": 55475 }, { "epoch": 0.7959043193942523, "grad_norm": 13.93816089630127, "learning_rate": 2.4490004036627082e-05, "loss": 0.2825, "step": 55500 }, { "epoch": 0.7962628348534389, "grad_norm": 0.43610504269599915, "learning_rate": 2.4486020523061887e-05, "loss": 0.1806, "step": 55525 }, { "epoch": 0.7966213503126255, "grad_norm": 0.383515864610672, "learning_rate": 2.4482037009496694e-05, "loss": 0.218, "step": 55550 }, { "epoch": 0.7969798657718121, "grad_norm": 2.8221099376678467, "learning_rate": 2.4478053495931502e-05, "loss": 0.2325, "step": 55575 }, { "epoch": 0.7973383812309986, "grad_norm": 6.66822624206543, "learning_rate": 2.4474069982366314e-05, "loss": 0.1755, "step": 55600 }, { "epoch": 0.7976968966901853, "grad_norm": 1.7076457738876343, "learning_rate": 2.447008646880112e-05, "loss": 0.2132, "step": 55625 }, { "epoch": 0.7980554121493719, "grad_norm": 9.893799781799316, "learning_rate": 2.446610295523593e-05, "loss": 0.1977, "step": 55650 }, { "epoch": 0.7984139276085584, "grad_norm": 10.564728736877441, "learning_rate": 2.4462119441670737e-05, "loss": 0.2353, "step": 55675 }, { "epoch": 0.7987724430677451, "grad_norm": 9.900053977966309, "learning_rate": 2.445813592810555e-05, "loss": 0.1814, "step": 55700 }, { "epoch": 0.7991309585269317, "grad_norm": 0.4255791902542114, "learning_rate": 2.4454152414540356e-05, "loss": 0.2478, "step": 55725 }, { "epoch": 0.7994894739861182, "grad_norm": 20.763614654541016, "learning_rate": 2.4450168900975164e-05, "loss": 0.201, "step": 55750 }, { "epoch": 0.7998479894453049, "grad_norm": 9.940995216369629, "learning_rate": 2.4446185387409972e-05, "loss": 0.1871, "step": 55775 }, { "epoch": 0.8002065049044915, "grad_norm": 0.28916725516319275, "learning_rate": 2.444220187384478e-05, "loss": 0.1007, "step": 55800 }, { "epoch": 0.800565020363678, "grad_norm": 7.0589470863342285, "learning_rate": 2.443821836027959e-05, "loss": 0.1764, "step": 55825 }, { "epoch": 0.8009235358228647, "grad_norm": 4.017249584197998, "learning_rate": 2.44342348467144e-05, "loss": 0.1314, "step": 55850 }, { "epoch": 0.8012820512820513, "grad_norm": 14.295341491699219, "learning_rate": 2.4430251333149207e-05, "loss": 0.2116, "step": 55875 }, { "epoch": 0.8016405667412378, "grad_norm": 5.5032734870910645, "learning_rate": 2.4426267819584014e-05, "loss": 0.1608, "step": 55900 }, { "epoch": 0.8019990822004245, "grad_norm": 9.420777320861816, "learning_rate": 2.4422284306018822e-05, "loss": 0.2582, "step": 55925 }, { "epoch": 0.8023575976596111, "grad_norm": 2.8163034915924072, "learning_rate": 2.4418300792453634e-05, "loss": 0.206, "step": 55950 }, { "epoch": 0.8027161131187976, "grad_norm": 4.4644951820373535, "learning_rate": 2.441431727888844e-05, "loss": 0.1216, "step": 55975 }, { "epoch": 0.8030746285779843, "grad_norm": 9.801971435546875, "learning_rate": 2.441033376532325e-05, "loss": 0.1682, "step": 56000 }, { "epoch": 0.8034331440371709, "grad_norm": 1.2801294326782227, "learning_rate": 2.4406350251758057e-05, "loss": 0.1916, "step": 56025 }, { "epoch": 0.8037916594963574, "grad_norm": 15.056510925292969, "learning_rate": 2.4402366738192865e-05, "loss": 0.2308, "step": 56050 }, { "epoch": 0.8041501749555441, "grad_norm": 7.130540370941162, "learning_rate": 2.4398383224627676e-05, "loss": 0.2143, "step": 56075 }, { "epoch": 0.8045086904147307, "grad_norm": 18.66415786743164, "learning_rate": 2.4394399711062484e-05, "loss": 0.1078, "step": 56100 }, { "epoch": 0.8048672058739172, "grad_norm": 0.5887745022773743, "learning_rate": 2.4390416197497292e-05, "loss": 0.1924, "step": 56125 }, { "epoch": 0.8052257213331039, "grad_norm": 6.46488094329834, "learning_rate": 2.43864326839321e-05, "loss": 0.2074, "step": 56150 }, { "epoch": 0.8055842367922905, "grad_norm": 4.5137224197387695, "learning_rate": 2.4382449170366908e-05, "loss": 0.1782, "step": 56175 }, { "epoch": 0.805942752251477, "grad_norm": 0.33010581135749817, "learning_rate": 2.437846565680172e-05, "loss": 0.2473, "step": 56200 }, { "epoch": 0.8063012677106637, "grad_norm": 15.091448783874512, "learning_rate": 2.4374482143236527e-05, "loss": 0.2023, "step": 56225 }, { "epoch": 0.8066597831698503, "grad_norm": 0.5115715861320496, "learning_rate": 2.4370498629671334e-05, "loss": 0.2009, "step": 56250 }, { "epoch": 0.8070182986290368, "grad_norm": 4.550087928771973, "learning_rate": 2.4366515116106142e-05, "loss": 0.2185, "step": 56275 }, { "epoch": 0.8073768140882235, "grad_norm": 0.32557040452957153, "learning_rate": 2.4362531602540954e-05, "loss": 0.2342, "step": 56300 }, { "epoch": 0.8077353295474101, "grad_norm": 0.09818831086158752, "learning_rate": 2.435854808897576e-05, "loss": 0.1619, "step": 56325 }, { "epoch": 0.8080938450065966, "grad_norm": 9.549029350280762, "learning_rate": 2.435456457541057e-05, "loss": 0.2748, "step": 56350 }, { "epoch": 0.8084523604657833, "grad_norm": 0.5269588232040405, "learning_rate": 2.4350581061845377e-05, "loss": 0.1646, "step": 56375 }, { "epoch": 0.8088108759249699, "grad_norm": 1.3084726333618164, "learning_rate": 2.4346597548280185e-05, "loss": 0.2458, "step": 56400 }, { "epoch": 0.8091693913841564, "grad_norm": 19.984302520751953, "learning_rate": 2.4342614034714996e-05, "loss": 0.1816, "step": 56425 }, { "epoch": 0.8095279068433431, "grad_norm": 0.3858236074447632, "learning_rate": 2.4338630521149804e-05, "loss": 0.2388, "step": 56450 }, { "epoch": 0.8098864223025297, "grad_norm": 4.745510578155518, "learning_rate": 2.4334647007584612e-05, "loss": 0.133, "step": 56475 }, { "epoch": 0.8102449377617162, "grad_norm": 2.6071746349334717, "learning_rate": 2.433066349401942e-05, "loss": 0.1052, "step": 56500 }, { "epoch": 0.8106034532209029, "grad_norm": 0.4620191752910614, "learning_rate": 2.4326679980454228e-05, "loss": 0.1891, "step": 56525 }, { "epoch": 0.8109619686800895, "grad_norm": 3.2616543769836426, "learning_rate": 2.432269646688904e-05, "loss": 0.17, "step": 56550 }, { "epoch": 0.811320484139276, "grad_norm": 1.8640353679656982, "learning_rate": 2.4318712953323847e-05, "loss": 0.209, "step": 56575 }, { "epoch": 0.8116789995984627, "grad_norm": 1.0438233613967896, "learning_rate": 2.4314729439758654e-05, "loss": 0.1941, "step": 56600 }, { "epoch": 0.8120375150576493, "grad_norm": 0.4221806526184082, "learning_rate": 2.431074592619346e-05, "loss": 0.1639, "step": 56625 }, { "epoch": 0.8123960305168358, "grad_norm": 6.290027618408203, "learning_rate": 2.4306762412628267e-05, "loss": 0.3588, "step": 56650 }, { "epoch": 0.8127545459760225, "grad_norm": 11.349748611450195, "learning_rate": 2.4302778899063078e-05, "loss": 0.1513, "step": 56675 }, { "epoch": 0.8131130614352091, "grad_norm": 0.805528998374939, "learning_rate": 2.4298795385497886e-05, "loss": 0.1298, "step": 56700 }, { "epoch": 0.8134715768943956, "grad_norm": 21.276185989379883, "learning_rate": 2.4294811871932694e-05, "loss": 0.2923, "step": 56725 }, { "epoch": 0.8138300923535823, "grad_norm": 2.873417377471924, "learning_rate": 2.42908283583675e-05, "loss": 0.1756, "step": 56750 }, { "epoch": 0.8141886078127689, "grad_norm": 5.633941173553467, "learning_rate": 2.428684484480231e-05, "loss": 0.2071, "step": 56775 }, { "epoch": 0.8145471232719554, "grad_norm": 0.09460192918777466, "learning_rate": 2.428286133123712e-05, "loss": 0.1944, "step": 56800 }, { "epoch": 0.8149056387311421, "grad_norm": 0.6687222123146057, "learning_rate": 2.427887781767193e-05, "loss": 0.2119, "step": 56825 }, { "epoch": 0.8152641541903287, "grad_norm": 20.270950317382812, "learning_rate": 2.4274894304106736e-05, "loss": 0.256, "step": 56850 }, { "epoch": 0.8156226696495152, "grad_norm": 12.616190910339355, "learning_rate": 2.4270910790541544e-05, "loss": 0.1923, "step": 56875 }, { "epoch": 0.8159811851087019, "grad_norm": 4.351966381072998, "learning_rate": 2.4266927276976352e-05, "loss": 0.1291, "step": 56900 }, { "epoch": 0.8163397005678885, "grad_norm": 6.241795539855957, "learning_rate": 2.4262943763411163e-05, "loss": 0.2074, "step": 56925 }, { "epoch": 0.816698216027075, "grad_norm": 2.7020468711853027, "learning_rate": 2.425896024984597e-05, "loss": 0.1707, "step": 56950 }, { "epoch": 0.8170567314862617, "grad_norm": 14.533209800720215, "learning_rate": 2.425497673628078e-05, "loss": 0.2175, "step": 56975 }, { "epoch": 0.8174152469454483, "grad_norm": 7.421328544616699, "learning_rate": 2.4250993222715587e-05, "loss": 0.2517, "step": 57000 }, { "epoch": 0.8177737624046348, "grad_norm": 12.175294876098633, "learning_rate": 2.4247009709150398e-05, "loss": 0.1231, "step": 57025 }, { "epoch": 0.8181322778638215, "grad_norm": 5.9229631423950195, "learning_rate": 2.4243026195585206e-05, "loss": 0.2153, "step": 57050 }, { "epoch": 0.8184907933230081, "grad_norm": 11.346471786499023, "learning_rate": 2.4239042682020014e-05, "loss": 0.2429, "step": 57075 }, { "epoch": 0.8188493087821946, "grad_norm": 13.977330207824707, "learning_rate": 2.423505916845482e-05, "loss": 0.2172, "step": 57100 }, { "epoch": 0.8192078242413813, "grad_norm": 0.7797583341598511, "learning_rate": 2.423107565488963e-05, "loss": 0.2203, "step": 57125 }, { "epoch": 0.8195663397005679, "grad_norm": 13.276204109191895, "learning_rate": 2.422709214132444e-05, "loss": 0.1844, "step": 57150 }, { "epoch": 0.8199248551597544, "grad_norm": 14.592622756958008, "learning_rate": 2.422310862775925e-05, "loss": 0.2652, "step": 57175 }, { "epoch": 0.8202833706189411, "grad_norm": 5.662628650665283, "learning_rate": 2.4219125114194056e-05, "loss": 0.156, "step": 57200 }, { "epoch": 0.8206418860781277, "grad_norm": 7.368398189544678, "learning_rate": 2.4215141600628864e-05, "loss": 0.1651, "step": 57225 }, { "epoch": 0.8210004015373142, "grad_norm": 0.1185787171125412, "learning_rate": 2.4211158087063672e-05, "loss": 0.2656, "step": 57250 }, { "epoch": 0.8213589169965009, "grad_norm": 5.373048305511475, "learning_rate": 2.4207174573498483e-05, "loss": 0.1647, "step": 57275 }, { "epoch": 0.8217174324556875, "grad_norm": 2.7326369285583496, "learning_rate": 2.420319105993329e-05, "loss": 0.2146, "step": 57300 }, { "epoch": 0.822075947914874, "grad_norm": 12.20999813079834, "learning_rate": 2.41992075463681e-05, "loss": 0.2081, "step": 57325 }, { "epoch": 0.8224344633740607, "grad_norm": 0.5299258232116699, "learning_rate": 2.4195224032802907e-05, "loss": 0.1211, "step": 57350 }, { "epoch": 0.8227929788332473, "grad_norm": 17.424360275268555, "learning_rate": 2.4191240519237715e-05, "loss": 0.27, "step": 57375 }, { "epoch": 0.8231514942924338, "grad_norm": 14.5670747756958, "learning_rate": 2.4187257005672526e-05, "loss": 0.1496, "step": 57400 }, { "epoch": 0.8235100097516205, "grad_norm": 0.9897518157958984, "learning_rate": 2.4183273492107334e-05, "loss": 0.1858, "step": 57425 }, { "epoch": 0.8238685252108071, "grad_norm": 0.2180412858724594, "learning_rate": 2.417928997854214e-05, "loss": 0.1719, "step": 57450 }, { "epoch": 0.8242270406699936, "grad_norm": 1.0118600130081177, "learning_rate": 2.417530646497695e-05, "loss": 0.2148, "step": 57475 }, { "epoch": 0.8245855561291803, "grad_norm": 1.2709331512451172, "learning_rate": 2.4171322951411757e-05, "loss": 0.1725, "step": 57500 }, { "epoch": 0.8249440715883669, "grad_norm": 17.850614547729492, "learning_rate": 2.416733943784657e-05, "loss": 0.1975, "step": 57525 }, { "epoch": 0.8253025870475534, "grad_norm": 14.517633438110352, "learning_rate": 2.4163355924281376e-05, "loss": 0.1253, "step": 57550 }, { "epoch": 0.8256611025067401, "grad_norm": 17.65534019470215, "learning_rate": 2.4159372410716184e-05, "loss": 0.1933, "step": 57575 }, { "epoch": 0.8260196179659267, "grad_norm": 0.938289225101471, "learning_rate": 2.4155388897150992e-05, "loss": 0.1091, "step": 57600 }, { "epoch": 0.8263781334251132, "grad_norm": 10.079828262329102, "learning_rate": 2.4151405383585803e-05, "loss": 0.2811, "step": 57625 }, { "epoch": 0.8267366488842999, "grad_norm": 14.482244491577148, "learning_rate": 2.414742187002061e-05, "loss": 0.1339, "step": 57650 }, { "epoch": 0.8270951643434865, "grad_norm": 14.370861053466797, "learning_rate": 2.414343835645542e-05, "loss": 0.2512, "step": 57675 }, { "epoch": 0.827453679802673, "grad_norm": 8.155793190002441, "learning_rate": 2.4139454842890227e-05, "loss": 0.2207, "step": 57700 }, { "epoch": 0.8278121952618597, "grad_norm": 4.345125675201416, "learning_rate": 2.4135471329325035e-05, "loss": 0.1914, "step": 57725 }, { "epoch": 0.8281707107210463, "grad_norm": 9.738463401794434, "learning_rate": 2.4131487815759842e-05, "loss": 0.2111, "step": 57750 }, { "epoch": 0.8285292261802328, "grad_norm": 7.3105244636535645, "learning_rate": 2.412750430219465e-05, "loss": 0.2832, "step": 57775 }, { "epoch": 0.8288877416394195, "grad_norm": 0.08984778076410294, "learning_rate": 2.4123520788629458e-05, "loss": 0.2937, "step": 57800 }, { "epoch": 0.8292462570986061, "grad_norm": 3.9998626708984375, "learning_rate": 2.4119537275064266e-05, "loss": 0.1824, "step": 57825 }, { "epoch": 0.8296047725577927, "grad_norm": 9.414665222167969, "learning_rate": 2.4115553761499074e-05, "loss": 0.0953, "step": 57850 }, { "epoch": 0.8299632880169793, "grad_norm": 11.378517150878906, "learning_rate": 2.4111570247933885e-05, "loss": 0.2367, "step": 57875 }, { "epoch": 0.8303218034761659, "grad_norm": 1.3687020540237427, "learning_rate": 2.4107586734368693e-05, "loss": 0.1776, "step": 57900 }, { "epoch": 0.8306803189353525, "grad_norm": 19.36351776123047, "learning_rate": 2.41036032208035e-05, "loss": 0.2467, "step": 57925 }, { "epoch": 0.8310388343945391, "grad_norm": 0.19887675344944, "learning_rate": 2.409961970723831e-05, "loss": 0.2486, "step": 57950 }, { "epoch": 0.8313973498537257, "grad_norm": 18.1120548248291, "learning_rate": 2.4095636193673116e-05, "loss": 0.1387, "step": 57975 }, { "epoch": 0.8317558653129123, "grad_norm": 5.747171878814697, "learning_rate": 2.4091652680107928e-05, "loss": 0.2167, "step": 58000 }, { "epoch": 0.8321143807720989, "grad_norm": 3.6051838397979736, "learning_rate": 2.4087669166542735e-05, "loss": 0.15, "step": 58025 }, { "epoch": 0.8324728962312855, "grad_norm": 15.21248722076416, "learning_rate": 2.4083685652977543e-05, "loss": 0.2456, "step": 58050 }, { "epoch": 0.8328314116904721, "grad_norm": 13.857061386108398, "learning_rate": 2.407970213941235e-05, "loss": 0.1346, "step": 58075 }, { "epoch": 0.8331899271496587, "grad_norm": 6.444247245788574, "learning_rate": 2.407571862584716e-05, "loss": 0.2417, "step": 58100 }, { "epoch": 0.8335484426088453, "grad_norm": 0.2079296112060547, "learning_rate": 2.407173511228197e-05, "loss": 0.2677, "step": 58125 }, { "epoch": 0.833906958068032, "grad_norm": 2.6084511280059814, "learning_rate": 2.4067751598716778e-05, "loss": 0.1034, "step": 58150 }, { "epoch": 0.8342654735272185, "grad_norm": 3.9659807682037354, "learning_rate": 2.4063768085151586e-05, "loss": 0.1847, "step": 58175 }, { "epoch": 0.8346239889864051, "grad_norm": 0.5376915335655212, "learning_rate": 2.4059784571586394e-05, "loss": 0.2, "step": 58200 }, { "epoch": 0.8349825044455917, "grad_norm": 10.246989250183105, "learning_rate": 2.4055801058021205e-05, "loss": 0.1739, "step": 58225 }, { "epoch": 0.8353410199047783, "grad_norm": 2.7147395610809326, "learning_rate": 2.4051817544456013e-05, "loss": 0.2124, "step": 58250 }, { "epoch": 0.8356995353639649, "grad_norm": 5.694031715393066, "learning_rate": 2.404783403089082e-05, "loss": 0.1672, "step": 58275 }, { "epoch": 0.8360580508231515, "grad_norm": 1.0719079971313477, "learning_rate": 2.404385051732563e-05, "loss": 0.1522, "step": 58300 }, { "epoch": 0.8364165662823381, "grad_norm": 12.547607421875, "learning_rate": 2.4039867003760436e-05, "loss": 0.2966, "step": 58325 }, { "epoch": 0.8367750817415247, "grad_norm": 2.120198965072632, "learning_rate": 2.4035883490195248e-05, "loss": 0.1219, "step": 58350 }, { "epoch": 0.8371335972007113, "grad_norm": 8.511225700378418, "learning_rate": 2.4031899976630055e-05, "loss": 0.1882, "step": 58375 }, { "epoch": 0.8374921126598979, "grad_norm": 0.8366949558258057, "learning_rate": 2.4027916463064863e-05, "loss": 0.1574, "step": 58400 }, { "epoch": 0.8378506281190845, "grad_norm": 10.706034660339355, "learning_rate": 2.402393294949967e-05, "loss": 0.1593, "step": 58425 }, { "epoch": 0.8382091435782711, "grad_norm": 10.270462036132812, "learning_rate": 2.401994943593448e-05, "loss": 0.206, "step": 58450 }, { "epoch": 0.8385676590374577, "grad_norm": 8.607298851013184, "learning_rate": 2.401596592236929e-05, "loss": 0.2116, "step": 58475 }, { "epoch": 0.8389261744966443, "grad_norm": 0.5541796684265137, "learning_rate": 2.4011982408804098e-05, "loss": 0.1301, "step": 58500 }, { "epoch": 0.8392846899558309, "grad_norm": 11.146523475646973, "learning_rate": 2.4007998895238906e-05, "loss": 0.149, "step": 58525 }, { "epoch": 0.8396432054150175, "grad_norm": 12.448284149169922, "learning_rate": 2.4004015381673714e-05, "loss": 0.2024, "step": 58550 }, { "epoch": 0.8400017208742041, "grad_norm": 17.42201805114746, "learning_rate": 2.400003186810852e-05, "loss": 0.1596, "step": 58575 }, { "epoch": 0.8403602363333907, "grad_norm": 0.08195345848798752, "learning_rate": 2.3996048354543333e-05, "loss": 0.2233, "step": 58600 }, { "epoch": 0.8407187517925773, "grad_norm": 5.782777786254883, "learning_rate": 2.399206484097814e-05, "loss": 0.2526, "step": 58625 }, { "epoch": 0.8410772672517639, "grad_norm": 8.831490516662598, "learning_rate": 2.398808132741295e-05, "loss": 0.1398, "step": 58650 }, { "epoch": 0.8414357827109505, "grad_norm": 8.087462425231934, "learning_rate": 2.3984097813847756e-05, "loss": 0.2121, "step": 58675 }, { "epoch": 0.8417942981701371, "grad_norm": 0.27661192417144775, "learning_rate": 2.3980114300282564e-05, "loss": 0.1932, "step": 58700 }, { "epoch": 0.8421528136293237, "grad_norm": 1.5825954675674438, "learning_rate": 2.3976130786717375e-05, "loss": 0.2497, "step": 58725 }, { "epoch": 0.8425113290885103, "grad_norm": 11.78575611114502, "learning_rate": 2.3972147273152183e-05, "loss": 0.2461, "step": 58750 }, { "epoch": 0.8428698445476969, "grad_norm": 18.785871505737305, "learning_rate": 2.396816375958699e-05, "loss": 0.175, "step": 58775 }, { "epoch": 0.8432283600068835, "grad_norm": 7.5558857917785645, "learning_rate": 2.39641802460218e-05, "loss": 0.2236, "step": 58800 }, { "epoch": 0.8435868754660701, "grad_norm": 6.613083362579346, "learning_rate": 2.3960196732456607e-05, "loss": 0.2038, "step": 58825 }, { "epoch": 0.8439453909252567, "grad_norm": 22.55792808532715, "learning_rate": 2.3956213218891415e-05, "loss": 0.2992, "step": 58850 }, { "epoch": 0.8443039063844433, "grad_norm": 12.83201789855957, "learning_rate": 2.3952229705326223e-05, "loss": 0.1725, "step": 58875 }, { "epoch": 0.8446624218436299, "grad_norm": 0.3198804557323456, "learning_rate": 2.394824619176103e-05, "loss": 0.1317, "step": 58900 }, { "epoch": 0.8450209373028165, "grad_norm": 7.818063735961914, "learning_rate": 2.3944262678195838e-05, "loss": 0.1747, "step": 58925 }, { "epoch": 0.8453794527620031, "grad_norm": 1.2796581983566284, "learning_rate": 2.394027916463065e-05, "loss": 0.1902, "step": 58950 }, { "epoch": 0.8457379682211897, "grad_norm": 10.831594467163086, "learning_rate": 2.3936295651065457e-05, "loss": 0.157, "step": 58975 }, { "epoch": 0.8460964836803763, "grad_norm": 15.471402168273926, "learning_rate": 2.3932312137500265e-05, "loss": 0.1753, "step": 59000 }, { "epoch": 0.8464549991395629, "grad_norm": 0.9942063689231873, "learning_rate": 2.3928328623935073e-05, "loss": 0.2192, "step": 59025 }, { "epoch": 0.8468135145987495, "grad_norm": 9.828997611999512, "learning_rate": 2.392434511036988e-05, "loss": 0.2098, "step": 59050 }, { "epoch": 0.8471720300579361, "grad_norm": 8.934715270996094, "learning_rate": 2.3920361596804692e-05, "loss": 0.174, "step": 59075 }, { "epoch": 0.8475305455171227, "grad_norm": 0.9792462587356567, "learning_rate": 2.39163780832395e-05, "loss": 0.2245, "step": 59100 }, { "epoch": 0.8478890609763093, "grad_norm": 0.37696439027786255, "learning_rate": 2.3912394569674308e-05, "loss": 0.1306, "step": 59125 }, { "epoch": 0.8482475764354959, "grad_norm": 0.780803382396698, "learning_rate": 2.3908411056109116e-05, "loss": 0.1742, "step": 59150 }, { "epoch": 0.8486060918946825, "grad_norm": 10.444048881530762, "learning_rate": 2.3904427542543923e-05, "loss": 0.323, "step": 59175 }, { "epoch": 0.8489646073538691, "grad_norm": 0.5917268991470337, "learning_rate": 2.3900444028978735e-05, "loss": 0.1293, "step": 59200 }, { "epoch": 0.8493231228130557, "grad_norm": 5.830556869506836, "learning_rate": 2.3896460515413543e-05, "loss": 0.2187, "step": 59225 }, { "epoch": 0.8496816382722423, "grad_norm": 9.514433860778809, "learning_rate": 2.389247700184835e-05, "loss": 0.1469, "step": 59250 }, { "epoch": 0.8500401537314289, "grad_norm": 5.5915937423706055, "learning_rate": 2.3888493488283158e-05, "loss": 0.1906, "step": 59275 }, { "epoch": 0.8503986691906155, "grad_norm": 22.78993797302246, "learning_rate": 2.3884509974717966e-05, "loss": 0.1865, "step": 59300 }, { "epoch": 0.8507571846498021, "grad_norm": 10.383119583129883, "learning_rate": 2.3880526461152777e-05, "loss": 0.1596, "step": 59325 }, { "epoch": 0.8511157001089887, "grad_norm": 3.110809803009033, "learning_rate": 2.3876542947587585e-05, "loss": 0.1207, "step": 59350 }, { "epoch": 0.8514742155681753, "grad_norm": 2.3352489471435547, "learning_rate": 2.3872559434022393e-05, "loss": 0.2296, "step": 59375 }, { "epoch": 0.8518327310273619, "grad_norm": 1.2660415172576904, "learning_rate": 2.38685759204572e-05, "loss": 0.1903, "step": 59400 }, { "epoch": 0.8521912464865485, "grad_norm": 19.59035873413086, "learning_rate": 2.3864592406892012e-05, "loss": 0.1761, "step": 59425 }, { "epoch": 0.8525497619457351, "grad_norm": 9.645317077636719, "learning_rate": 2.386060889332682e-05, "loss": 0.1782, "step": 59450 }, { "epoch": 0.8529082774049217, "grad_norm": 15.523565292358398, "learning_rate": 2.3856625379761628e-05, "loss": 0.1718, "step": 59475 }, { "epoch": 0.8532667928641083, "grad_norm": 0.5608633756637573, "learning_rate": 2.3852641866196436e-05, "loss": 0.2429, "step": 59500 }, { "epoch": 0.8536253083232949, "grad_norm": 20.922870635986328, "learning_rate": 2.3848658352631243e-05, "loss": 0.2869, "step": 59525 }, { "epoch": 0.8539838237824815, "grad_norm": 0.8722829818725586, "learning_rate": 2.3844674839066055e-05, "loss": 0.217, "step": 59550 }, { "epoch": 0.8543423392416681, "grad_norm": 15.383103370666504, "learning_rate": 2.3840691325500862e-05, "loss": 0.1993, "step": 59575 }, { "epoch": 0.8547008547008547, "grad_norm": 10.48262882232666, "learning_rate": 2.383670781193567e-05, "loss": 0.1955, "step": 59600 }, { "epoch": 0.8550593701600413, "grad_norm": 0.6813935041427612, "learning_rate": 2.3832724298370478e-05, "loss": 0.1556, "step": 59625 }, { "epoch": 0.8554178856192279, "grad_norm": 14.061888694763184, "learning_rate": 2.3828740784805286e-05, "loss": 0.2934, "step": 59650 }, { "epoch": 0.8557764010784145, "grad_norm": 3.154385566711426, "learning_rate": 2.3824757271240097e-05, "loss": 0.1537, "step": 59675 }, { "epoch": 0.8561349165376011, "grad_norm": 6.317718505859375, "learning_rate": 2.3820773757674905e-05, "loss": 0.3388, "step": 59700 }, { "epoch": 0.8564934319967877, "grad_norm": 0.14967814087867737, "learning_rate": 2.3816790244109713e-05, "loss": 0.1995, "step": 59725 }, { "epoch": 0.8568519474559743, "grad_norm": 18.93630027770996, "learning_rate": 2.381280673054452e-05, "loss": 0.3613, "step": 59750 }, { "epoch": 0.8572104629151609, "grad_norm": 0.10684142261743546, "learning_rate": 2.380882321697933e-05, "loss": 0.2205, "step": 59775 }, { "epoch": 0.8575689783743475, "grad_norm": 12.922229766845703, "learning_rate": 2.380483970341414e-05, "loss": 0.1496, "step": 59800 }, { "epoch": 0.8579274938335341, "grad_norm": 0.9659484028816223, "learning_rate": 2.3800856189848948e-05, "loss": 0.2383, "step": 59825 }, { "epoch": 0.8582860092927207, "grad_norm": 4.343211650848389, "learning_rate": 2.3796872676283756e-05, "loss": 0.2162, "step": 59850 }, { "epoch": 0.8586445247519073, "grad_norm": 1.8569138050079346, "learning_rate": 2.3792889162718563e-05, "loss": 0.2357, "step": 59875 }, { "epoch": 0.8590030402110939, "grad_norm": 3.183196544647217, "learning_rate": 2.378890564915337e-05, "loss": 0.1992, "step": 59900 }, { "epoch": 0.8593615556702805, "grad_norm": 19.53995132446289, "learning_rate": 2.378492213558818e-05, "loss": 0.2017, "step": 59925 }, { "epoch": 0.8597200711294671, "grad_norm": 6.9454345703125, "learning_rate": 2.3780938622022987e-05, "loss": 0.1935, "step": 59950 }, { "epoch": 0.8600785865886537, "grad_norm": 5.579140663146973, "learning_rate": 2.3776955108457795e-05, "loss": 0.1267, "step": 59975 }, { "epoch": 0.8604371020478403, "grad_norm": 11.569389343261719, "learning_rate": 2.3772971594892603e-05, "loss": 0.2271, "step": 60000 }, { "epoch": 0.8607956175070269, "grad_norm": 0.6670913100242615, "learning_rate": 2.3768988081327414e-05, "loss": 0.2873, "step": 60025 }, { "epoch": 0.8611541329662135, "grad_norm": 1.98805570602417, "learning_rate": 2.3765004567762222e-05, "loss": 0.2239, "step": 60050 }, { "epoch": 0.8615126484254001, "grad_norm": 11.422531127929688, "learning_rate": 2.376102105419703e-05, "loss": 0.2453, "step": 60075 }, { "epoch": 0.8618711638845867, "grad_norm": 1.966076135635376, "learning_rate": 2.3757037540631837e-05, "loss": 0.2085, "step": 60100 }, { "epoch": 0.8622296793437733, "grad_norm": 2.8513643741607666, "learning_rate": 2.3753054027066645e-05, "loss": 0.1519, "step": 60125 }, { "epoch": 0.8625881948029599, "grad_norm": 20.002885818481445, "learning_rate": 2.3749070513501456e-05, "loss": 0.1643, "step": 60150 }, { "epoch": 0.8629467102621465, "grad_norm": 10.256516456604004, "learning_rate": 2.3745086999936264e-05, "loss": 0.1615, "step": 60175 }, { "epoch": 0.8633052257213331, "grad_norm": 1.1313376426696777, "learning_rate": 2.3741103486371072e-05, "loss": 0.2576, "step": 60200 }, { "epoch": 0.8636637411805197, "grad_norm": 0.6118314862251282, "learning_rate": 2.373711997280588e-05, "loss": 0.2331, "step": 60225 }, { "epoch": 0.8640222566397063, "grad_norm": 2.367525815963745, "learning_rate": 2.3733136459240688e-05, "loss": 0.2121, "step": 60250 }, { "epoch": 0.8643807720988929, "grad_norm": 2.0850601196289062, "learning_rate": 2.37291529456755e-05, "loss": 0.1956, "step": 60275 }, { "epoch": 0.8647392875580795, "grad_norm": 11.455164909362793, "learning_rate": 2.3725169432110307e-05, "loss": 0.2587, "step": 60300 }, { "epoch": 0.8650978030172661, "grad_norm": 11.028013229370117, "learning_rate": 2.3721185918545115e-05, "loss": 0.2783, "step": 60325 }, { "epoch": 0.8654563184764527, "grad_norm": 10.456210136413574, "learning_rate": 2.3717202404979923e-05, "loss": 0.1717, "step": 60350 }, { "epoch": 0.8658148339356393, "grad_norm": 8.784029006958008, "learning_rate": 2.371321889141473e-05, "loss": 0.2111, "step": 60375 }, { "epoch": 0.8661733493948259, "grad_norm": 10.003386497497559, "learning_rate": 2.370923537784954e-05, "loss": 0.2435, "step": 60400 }, { "epoch": 0.8665318648540125, "grad_norm": 0.618430495262146, "learning_rate": 2.370525186428435e-05, "loss": 0.1948, "step": 60425 }, { "epoch": 0.8668903803131991, "grad_norm": 18.510007858276367, "learning_rate": 2.3701268350719157e-05, "loss": 0.2888, "step": 60450 }, { "epoch": 0.8672488957723857, "grad_norm": 6.596806526184082, "learning_rate": 2.3697284837153965e-05, "loss": 0.1461, "step": 60475 }, { "epoch": 0.8676074112315723, "grad_norm": 5.188225746154785, "learning_rate": 2.3693301323588773e-05, "loss": 0.1482, "step": 60500 }, { "epoch": 0.8679659266907589, "grad_norm": 11.810300827026367, "learning_rate": 2.3689317810023584e-05, "loss": 0.1686, "step": 60525 }, { "epoch": 0.8683244421499455, "grad_norm": 12.580669403076172, "learning_rate": 2.3685334296458392e-05, "loss": 0.1927, "step": 60550 }, { "epoch": 0.8686829576091321, "grad_norm": 0.27443358302116394, "learning_rate": 2.36813507828932e-05, "loss": 0.1706, "step": 60575 }, { "epoch": 0.8690414730683187, "grad_norm": 13.116230964660645, "learning_rate": 2.3677367269328008e-05, "loss": 0.2412, "step": 60600 }, { "epoch": 0.8693999885275053, "grad_norm": 1.0036900043487549, "learning_rate": 2.367338375576282e-05, "loss": 0.1519, "step": 60625 }, { "epoch": 0.8697585039866919, "grad_norm": 24.361709594726562, "learning_rate": 2.3669400242197627e-05, "loss": 0.2006, "step": 60650 }, { "epoch": 0.8701170194458785, "grad_norm": 16.127382278442383, "learning_rate": 2.3665416728632435e-05, "loss": 0.1738, "step": 60675 }, { "epoch": 0.8704755349050651, "grad_norm": 10.838949203491211, "learning_rate": 2.3661433215067243e-05, "loss": 0.112, "step": 60700 }, { "epoch": 0.8708340503642517, "grad_norm": 5.183264255523682, "learning_rate": 2.365744970150205e-05, "loss": 0.1837, "step": 60725 }, { "epoch": 0.8711925658234383, "grad_norm": 5.119192600250244, "learning_rate": 2.365346618793686e-05, "loss": 0.1674, "step": 60750 }, { "epoch": 0.8715510812826249, "grad_norm": 0.6993245482444763, "learning_rate": 2.364948267437167e-05, "loss": 0.2847, "step": 60775 }, { "epoch": 0.8719095967418115, "grad_norm": 2.6987109184265137, "learning_rate": 2.3645499160806477e-05, "loss": 0.218, "step": 60800 }, { "epoch": 0.8722681122009981, "grad_norm": 0.9938321113586426, "learning_rate": 2.3641515647241285e-05, "loss": 0.1244, "step": 60825 }, { "epoch": 0.8726266276601847, "grad_norm": 17.782363891601562, "learning_rate": 2.3637532133676093e-05, "loss": 0.1265, "step": 60850 }, { "epoch": 0.8729851431193713, "grad_norm": 1.3180371522903442, "learning_rate": 2.3633548620110904e-05, "loss": 0.1963, "step": 60875 }, { "epoch": 0.8733436585785579, "grad_norm": 8.314395904541016, "learning_rate": 2.3629565106545712e-05, "loss": 0.2502, "step": 60900 }, { "epoch": 0.8737021740377445, "grad_norm": 5.832565784454346, "learning_rate": 2.362558159298052e-05, "loss": 0.2718, "step": 60925 }, { "epoch": 0.874060689496931, "grad_norm": 8.410141944885254, "learning_rate": 2.3621598079415328e-05, "loss": 0.2315, "step": 60950 }, { "epoch": 0.8744192049561177, "grad_norm": 1.3126161098480225, "learning_rate": 2.3617614565850136e-05, "loss": 0.1989, "step": 60975 }, { "epoch": 0.8747777204153043, "grad_norm": 9.906417846679688, "learning_rate": 2.3613631052284947e-05, "loss": 0.2096, "step": 61000 }, { "epoch": 0.8751362358744909, "grad_norm": 2.3896141052246094, "learning_rate": 2.360964753871975e-05, "loss": 0.2048, "step": 61025 }, { "epoch": 0.8754947513336775, "grad_norm": 16.5257511138916, "learning_rate": 2.360566402515456e-05, "loss": 0.1699, "step": 61050 }, { "epoch": 0.8758532667928641, "grad_norm": 5.777262210845947, "learning_rate": 2.3601680511589367e-05, "loss": 0.133, "step": 61075 }, { "epoch": 0.8762117822520508, "grad_norm": 0.7688774466514587, "learning_rate": 2.3597696998024175e-05, "loss": 0.1662, "step": 61100 }, { "epoch": 0.8765702977112373, "grad_norm": 7.678110122680664, "learning_rate": 2.3593713484458986e-05, "loss": 0.1842, "step": 61125 }, { "epoch": 0.8769288131704239, "grad_norm": 3.218888521194458, "learning_rate": 2.3589729970893794e-05, "loss": 0.1405, "step": 61150 }, { "epoch": 0.8772873286296106, "grad_norm": 0.9786465167999268, "learning_rate": 2.3585746457328602e-05, "loss": 0.1247, "step": 61175 }, { "epoch": 0.8776458440887971, "grad_norm": 4.717813014984131, "learning_rate": 2.358176294376341e-05, "loss": 0.1621, "step": 61200 }, { "epoch": 0.8780043595479837, "grad_norm": 0.6842845678329468, "learning_rate": 2.3577779430198218e-05, "loss": 0.151, "step": 61225 }, { "epoch": 0.8783628750071704, "grad_norm": 0.3121117353439331, "learning_rate": 2.357379591663303e-05, "loss": 0.1888, "step": 61250 }, { "epoch": 0.8787213904663569, "grad_norm": 1.5565186738967896, "learning_rate": 2.3569812403067837e-05, "loss": 0.1466, "step": 61275 }, { "epoch": 0.8790799059255435, "grad_norm": 23.677705764770508, "learning_rate": 2.3565828889502644e-05, "loss": 0.2578, "step": 61300 }, { "epoch": 0.8794384213847302, "grad_norm": 11.875723838806152, "learning_rate": 2.3561845375937452e-05, "loss": 0.1984, "step": 61325 }, { "epoch": 0.8797969368439167, "grad_norm": 0.899189293384552, "learning_rate": 2.3557861862372264e-05, "loss": 0.0737, "step": 61350 }, { "epoch": 0.8801554523031033, "grad_norm": 3.3866360187530518, "learning_rate": 2.355387834880707e-05, "loss": 0.1984, "step": 61375 }, { "epoch": 0.88051396776229, "grad_norm": 4.401764392852783, "learning_rate": 2.354989483524188e-05, "loss": 0.1762, "step": 61400 }, { "epoch": 0.8808724832214765, "grad_norm": 5.161829948425293, "learning_rate": 2.3545911321676687e-05, "loss": 0.1342, "step": 61425 }, { "epoch": 0.8812309986806631, "grad_norm": 2.806654453277588, "learning_rate": 2.3541927808111495e-05, "loss": 0.1703, "step": 61450 }, { "epoch": 0.8815895141398498, "grad_norm": 0.7390363216400146, "learning_rate": 2.3537944294546306e-05, "loss": 0.1795, "step": 61475 }, { "epoch": 0.8819480295990363, "grad_norm": 11.429630279541016, "learning_rate": 2.3533960780981114e-05, "loss": 0.3342, "step": 61500 }, { "epoch": 0.8823065450582229, "grad_norm": 9.224228858947754, "learning_rate": 2.3529977267415922e-05, "loss": 0.2679, "step": 61525 }, { "epoch": 0.8826650605174096, "grad_norm": 2.741637706756592, "learning_rate": 2.352599375385073e-05, "loss": 0.2607, "step": 61550 }, { "epoch": 0.8830235759765961, "grad_norm": 1.2581512928009033, "learning_rate": 2.3522010240285537e-05, "loss": 0.1834, "step": 61575 }, { "epoch": 0.8833820914357827, "grad_norm": 6.797613143920898, "learning_rate": 2.351802672672035e-05, "loss": 0.2214, "step": 61600 }, { "epoch": 0.8837406068949694, "grad_norm": 15.459728240966797, "learning_rate": 2.3514043213155157e-05, "loss": 0.1136, "step": 61625 }, { "epoch": 0.8840991223541559, "grad_norm": 4.540705680847168, "learning_rate": 2.3510059699589964e-05, "loss": 0.1481, "step": 61650 }, { "epoch": 0.8844576378133425, "grad_norm": 8.692743301391602, "learning_rate": 2.3506076186024772e-05, "loss": 0.226, "step": 61675 }, { "epoch": 0.8848161532725292, "grad_norm": 9.179621696472168, "learning_rate": 2.350209267245958e-05, "loss": 0.2064, "step": 61700 }, { "epoch": 0.8851746687317157, "grad_norm": 7.229057788848877, "learning_rate": 2.349810915889439e-05, "loss": 0.1727, "step": 61725 }, { "epoch": 0.8855331841909023, "grad_norm": 2.7303833961486816, "learning_rate": 2.34941256453292e-05, "loss": 0.1742, "step": 61750 }, { "epoch": 0.885891699650089, "grad_norm": 14.146946907043457, "learning_rate": 2.3490142131764007e-05, "loss": 0.306, "step": 61775 }, { "epoch": 0.8862502151092755, "grad_norm": 2.1217188835144043, "learning_rate": 2.3486158618198815e-05, "loss": 0.2229, "step": 61800 }, { "epoch": 0.8866087305684621, "grad_norm": 18.932165145874023, "learning_rate": 2.3482175104633623e-05, "loss": 0.1422, "step": 61825 }, { "epoch": 0.8869672460276488, "grad_norm": 1.5985347032546997, "learning_rate": 2.3478191591068434e-05, "loss": 0.2528, "step": 61850 }, { "epoch": 0.8873257614868353, "grad_norm": 16.58967399597168, "learning_rate": 2.3474208077503242e-05, "loss": 0.2754, "step": 61875 }, { "epoch": 0.8876842769460219, "grad_norm": 0.04639097675681114, "learning_rate": 2.347022456393805e-05, "loss": 0.1484, "step": 61900 }, { "epoch": 0.8880427924052086, "grad_norm": 15.658936500549316, "learning_rate": 2.3466241050372857e-05, "loss": 0.2203, "step": 61925 }, { "epoch": 0.8884013078643951, "grad_norm": 2.5189175605773926, "learning_rate": 2.346225753680767e-05, "loss": 0.2107, "step": 61950 }, { "epoch": 0.8887598233235817, "grad_norm": 1.1398217678070068, "learning_rate": 2.3458274023242477e-05, "loss": 0.1301, "step": 61975 }, { "epoch": 0.8891183387827684, "grad_norm": 7.304404258728027, "learning_rate": 2.3454290509677284e-05, "loss": 0.1564, "step": 62000 }, { "epoch": 0.8894768542419549, "grad_norm": 1.512500286102295, "learning_rate": 2.3450306996112092e-05, "loss": 0.2118, "step": 62025 }, { "epoch": 0.8898353697011415, "grad_norm": 11.968249320983887, "learning_rate": 2.34463234825469e-05, "loss": 0.1848, "step": 62050 }, { "epoch": 0.8901938851603282, "grad_norm": 0.180817648768425, "learning_rate": 2.344233996898171e-05, "loss": 0.1369, "step": 62075 }, { "epoch": 0.8905524006195147, "grad_norm": 1.4336934089660645, "learning_rate": 2.343835645541652e-05, "loss": 0.208, "step": 62100 }, { "epoch": 0.8909109160787013, "grad_norm": 18.11941909790039, "learning_rate": 2.3434372941851324e-05, "loss": 0.1257, "step": 62125 }, { "epoch": 0.891269431537888, "grad_norm": 1.6315370798110962, "learning_rate": 2.343038942828613e-05, "loss": 0.2341, "step": 62150 }, { "epoch": 0.8916279469970745, "grad_norm": 3.678825616836548, "learning_rate": 2.342640591472094e-05, "loss": 0.1926, "step": 62175 }, { "epoch": 0.8919864624562611, "grad_norm": 2.2995963096618652, "learning_rate": 2.342242240115575e-05, "loss": 0.1974, "step": 62200 }, { "epoch": 0.8923449779154478, "grad_norm": 4.971856117248535, "learning_rate": 2.341843888759056e-05, "loss": 0.1566, "step": 62225 }, { "epoch": 0.8927034933746343, "grad_norm": 5.440528869628906, "learning_rate": 2.3414455374025366e-05, "loss": 0.187, "step": 62250 }, { "epoch": 0.8930620088338209, "grad_norm": 0.7282354831695557, "learning_rate": 2.3410471860460174e-05, "loss": 0.1873, "step": 62275 }, { "epoch": 0.8934205242930076, "grad_norm": 7.060957431793213, "learning_rate": 2.3406488346894982e-05, "loss": 0.1474, "step": 62300 }, { "epoch": 0.8937790397521941, "grad_norm": 10.702204704284668, "learning_rate": 2.3402504833329793e-05, "loss": 0.2193, "step": 62325 }, { "epoch": 0.8941375552113807, "grad_norm": 8.347649574279785, "learning_rate": 2.33985213197646e-05, "loss": 0.1104, "step": 62350 }, { "epoch": 0.8944960706705674, "grad_norm": 13.779099464416504, "learning_rate": 2.339453780619941e-05, "loss": 0.2504, "step": 62375 }, { "epoch": 0.8948545861297539, "grad_norm": 0.2952440083026886, "learning_rate": 2.3390554292634217e-05, "loss": 0.1625, "step": 62400 }, { "epoch": 0.8952131015889405, "grad_norm": 2.366528034210205, "learning_rate": 2.3386570779069025e-05, "loss": 0.1756, "step": 62425 }, { "epoch": 0.8955716170481272, "grad_norm": 1.3567583560943604, "learning_rate": 2.3382587265503836e-05, "loss": 0.2082, "step": 62450 }, { "epoch": 0.8959301325073137, "grad_norm": 4.462821960449219, "learning_rate": 2.3378603751938644e-05, "loss": 0.1635, "step": 62475 }, { "epoch": 0.8962886479665003, "grad_norm": 9.873109817504883, "learning_rate": 2.337462023837345e-05, "loss": 0.1842, "step": 62500 }, { "epoch": 0.896647163425687, "grad_norm": 9.167912483215332, "learning_rate": 2.337063672480826e-05, "loss": 0.1646, "step": 62525 }, { "epoch": 0.8970056788848735, "grad_norm": 0.249561607837677, "learning_rate": 2.336665321124307e-05, "loss": 0.1365, "step": 62550 }, { "epoch": 0.8973641943440601, "grad_norm": 0.46094009280204773, "learning_rate": 2.336266969767788e-05, "loss": 0.1087, "step": 62575 }, { "epoch": 0.8977227098032468, "grad_norm": 17.41419792175293, "learning_rate": 2.3358686184112686e-05, "loss": 0.2007, "step": 62600 }, { "epoch": 0.8980812252624333, "grad_norm": 1.1872042417526245, "learning_rate": 2.3354702670547494e-05, "loss": 0.2408, "step": 62625 }, { "epoch": 0.8984397407216199, "grad_norm": 16.772497177124023, "learning_rate": 2.3350719156982302e-05, "loss": 0.1795, "step": 62650 }, { "epoch": 0.8987982561808066, "grad_norm": 0.07137352973222733, "learning_rate": 2.3346735643417113e-05, "loss": 0.295, "step": 62675 }, { "epoch": 0.8991567716399931, "grad_norm": 1.130812406539917, "learning_rate": 2.334275212985192e-05, "loss": 0.1608, "step": 62700 }, { "epoch": 0.8995152870991797, "grad_norm": 4.294372081756592, "learning_rate": 2.333876861628673e-05, "loss": 0.2209, "step": 62725 }, { "epoch": 0.8998738025583664, "grad_norm": 21.864503860473633, "learning_rate": 2.3334785102721537e-05, "loss": 0.1979, "step": 62750 }, { "epoch": 0.9002323180175529, "grad_norm": 4.854998588562012, "learning_rate": 2.3330801589156345e-05, "loss": 0.2131, "step": 62775 }, { "epoch": 0.9005908334767395, "grad_norm": 1.7635339498519897, "learning_rate": 2.3326818075591156e-05, "loss": 0.075, "step": 62800 }, { "epoch": 0.9009493489359262, "grad_norm": 5.4419965744018555, "learning_rate": 2.3322834562025964e-05, "loss": 0.1611, "step": 62825 }, { "epoch": 0.9013078643951127, "grad_norm": 0.06698744744062424, "learning_rate": 2.331885104846077e-05, "loss": 0.1557, "step": 62850 }, { "epoch": 0.9016663798542993, "grad_norm": 4.69265604019165, "learning_rate": 2.331486753489558e-05, "loss": 0.1149, "step": 62875 }, { "epoch": 0.902024895313486, "grad_norm": 14.469810485839844, "learning_rate": 2.3310884021330387e-05, "loss": 0.2316, "step": 62900 }, { "epoch": 0.9023834107726725, "grad_norm": 12.968690872192383, "learning_rate": 2.33069005077652e-05, "loss": 0.2174, "step": 62925 }, { "epoch": 0.9027419262318591, "grad_norm": 0.7912417054176331, "learning_rate": 2.3302916994200006e-05, "loss": 0.144, "step": 62950 }, { "epoch": 0.9031004416910458, "grad_norm": 0.21061812341213226, "learning_rate": 2.3298933480634814e-05, "loss": 0.1796, "step": 62975 }, { "epoch": 0.9034589571502323, "grad_norm": 8.917574882507324, "learning_rate": 2.3294949967069622e-05, "loss": 0.1808, "step": 63000 }, { "epoch": 0.9038174726094189, "grad_norm": 4.635324954986572, "learning_rate": 2.329096645350443e-05, "loss": 0.2071, "step": 63025 }, { "epoch": 0.9041759880686056, "grad_norm": 4.201223373413086, "learning_rate": 2.328698293993924e-05, "loss": 0.2381, "step": 63050 }, { "epoch": 0.9045345035277921, "grad_norm": 2.2995715141296387, "learning_rate": 2.328299942637405e-05, "loss": 0.1866, "step": 63075 }, { "epoch": 0.9048930189869787, "grad_norm": 1.3130466938018799, "learning_rate": 2.3279015912808857e-05, "loss": 0.1531, "step": 63100 }, { "epoch": 0.9052515344461654, "grad_norm": 0.44784247875213623, "learning_rate": 2.3275032399243665e-05, "loss": 0.2293, "step": 63125 }, { "epoch": 0.9056100499053519, "grad_norm": 1.7814626693725586, "learning_rate": 2.3271048885678476e-05, "loss": 0.1806, "step": 63150 }, { "epoch": 0.9059685653645385, "grad_norm": 0.6916454434394836, "learning_rate": 2.3267065372113284e-05, "loss": 0.1844, "step": 63175 }, { "epoch": 0.9063270808237252, "grad_norm": 5.07111930847168, "learning_rate": 2.326308185854809e-05, "loss": 0.1609, "step": 63200 }, { "epoch": 0.9066855962829117, "grad_norm": 4.74080228805542, "learning_rate": 2.3259098344982896e-05, "loss": 0.1774, "step": 63225 }, { "epoch": 0.9070441117420983, "grad_norm": 0.37361326813697815, "learning_rate": 2.3255114831417704e-05, "loss": 0.2097, "step": 63250 }, { "epoch": 0.907402627201285, "grad_norm": 0.2872397303581238, "learning_rate": 2.3251131317852515e-05, "loss": 0.1487, "step": 63275 }, { "epoch": 0.9077611426604715, "grad_norm": 29.783607482910156, "learning_rate": 2.3247147804287323e-05, "loss": 0.2485, "step": 63300 }, { "epoch": 0.9081196581196581, "grad_norm": 6.823953151702881, "learning_rate": 2.324316429072213e-05, "loss": 0.1504, "step": 63325 }, { "epoch": 0.9084781735788447, "grad_norm": 4.200455188751221, "learning_rate": 2.323918077715694e-05, "loss": 0.2913, "step": 63350 }, { "epoch": 0.9088366890380313, "grad_norm": 14.386664390563965, "learning_rate": 2.3235197263591746e-05, "loss": 0.2022, "step": 63375 }, { "epoch": 0.9091952044972179, "grad_norm": 0.09254572540521622, "learning_rate": 2.3231213750026558e-05, "loss": 0.1045, "step": 63400 }, { "epoch": 0.9095537199564045, "grad_norm": 0.8683726191520691, "learning_rate": 2.3227230236461365e-05, "loss": 0.1524, "step": 63425 }, { "epoch": 0.9099122354155911, "grad_norm": 13.992973327636719, "learning_rate": 2.3223246722896173e-05, "loss": 0.1881, "step": 63450 }, { "epoch": 0.9102707508747777, "grad_norm": 1.4071301221847534, "learning_rate": 2.321926320933098e-05, "loss": 0.1934, "step": 63475 }, { "epoch": 0.9106292663339643, "grad_norm": 11.683161735534668, "learning_rate": 2.321527969576579e-05, "loss": 0.3006, "step": 63500 }, { "epoch": 0.9109877817931509, "grad_norm": 12.635209083557129, "learning_rate": 2.32112961822006e-05, "loss": 0.1746, "step": 63525 }, { "epoch": 0.9113462972523375, "grad_norm": 12.827460289001465, "learning_rate": 2.3207312668635408e-05, "loss": 0.249, "step": 63550 }, { "epoch": 0.9117048127115241, "grad_norm": 4.7655348777771, "learning_rate": 2.3203329155070216e-05, "loss": 0.1877, "step": 63575 }, { "epoch": 0.9120633281707107, "grad_norm": 11.814339637756348, "learning_rate": 2.3199345641505024e-05, "loss": 0.2272, "step": 63600 }, { "epoch": 0.9124218436298973, "grad_norm": 2.7640275955200195, "learning_rate": 2.319536212793983e-05, "loss": 0.1134, "step": 63625 }, { "epoch": 0.912780359089084, "grad_norm": 1.7891895771026611, "learning_rate": 2.3191378614374643e-05, "loss": 0.2607, "step": 63650 }, { "epoch": 0.9131388745482705, "grad_norm": 1.079232096672058, "learning_rate": 2.318739510080945e-05, "loss": 0.2195, "step": 63675 }, { "epoch": 0.9134973900074571, "grad_norm": 6.326147556304932, "learning_rate": 2.318341158724426e-05, "loss": 0.1534, "step": 63700 }, { "epoch": 0.9138559054666437, "grad_norm": 0.544863760471344, "learning_rate": 2.3179428073679066e-05, "loss": 0.2687, "step": 63725 }, { "epoch": 0.9142144209258303, "grad_norm": 1.7988277673721313, "learning_rate": 2.3175444560113878e-05, "loss": 0.2351, "step": 63750 }, { "epoch": 0.9145729363850169, "grad_norm": 4.751895904541016, "learning_rate": 2.3171461046548685e-05, "loss": 0.1361, "step": 63775 }, { "epoch": 0.9149314518442035, "grad_norm": 0.08070747554302216, "learning_rate": 2.3167477532983493e-05, "loss": 0.2655, "step": 63800 }, { "epoch": 0.9152899673033901, "grad_norm": 14.775219917297363, "learning_rate": 2.31634940194183e-05, "loss": 0.2379, "step": 63825 }, { "epoch": 0.9156484827625767, "grad_norm": 6.609038829803467, "learning_rate": 2.315951050585311e-05, "loss": 0.169, "step": 63850 }, { "epoch": 0.9160069982217633, "grad_norm": 1.6632033586502075, "learning_rate": 2.315552699228792e-05, "loss": 0.162, "step": 63875 }, { "epoch": 0.9163655136809499, "grad_norm": 16.289939880371094, "learning_rate": 2.3151543478722728e-05, "loss": 0.1689, "step": 63900 }, { "epoch": 0.9167240291401365, "grad_norm": 21.737836837768555, "learning_rate": 2.3147559965157536e-05, "loss": 0.252, "step": 63925 }, { "epoch": 0.9170825445993231, "grad_norm": 2.0827107429504395, "learning_rate": 2.3143576451592344e-05, "loss": 0.1388, "step": 63950 }, { "epoch": 0.9174410600585097, "grad_norm": 2.739074945449829, "learning_rate": 2.313959293802715e-05, "loss": 0.1491, "step": 63975 }, { "epoch": 0.9177995755176963, "grad_norm": 5.79194974899292, "learning_rate": 2.3135609424461963e-05, "loss": 0.2164, "step": 64000 }, { "epoch": 0.918158090976883, "grad_norm": 18.76669692993164, "learning_rate": 2.313162591089677e-05, "loss": 0.1739, "step": 64025 }, { "epoch": 0.9185166064360695, "grad_norm": 4.687665939331055, "learning_rate": 2.312764239733158e-05, "loss": 0.2917, "step": 64050 }, { "epoch": 0.9188751218952561, "grad_norm": 1.2056822776794434, "learning_rate": 2.3123658883766386e-05, "loss": 0.1976, "step": 64075 }, { "epoch": 0.9192336373544427, "grad_norm": 8.801251411437988, "learning_rate": 2.3119675370201194e-05, "loss": 0.1988, "step": 64100 }, { "epoch": 0.9195921528136293, "grad_norm": 4.991309642791748, "learning_rate": 2.3115691856636005e-05, "loss": 0.215, "step": 64125 }, { "epoch": 0.9199506682728159, "grad_norm": 1.3403674364089966, "learning_rate": 2.3111708343070813e-05, "loss": 0.21, "step": 64150 }, { "epoch": 0.9203091837320025, "grad_norm": 0.4570901095867157, "learning_rate": 2.310772482950562e-05, "loss": 0.1503, "step": 64175 }, { "epoch": 0.9206676991911891, "grad_norm": 0.37552666664123535, "learning_rate": 2.310374131594043e-05, "loss": 0.2167, "step": 64200 }, { "epoch": 0.9210262146503757, "grad_norm": 1.2834806442260742, "learning_rate": 2.3099757802375237e-05, "loss": 0.1118, "step": 64225 }, { "epoch": 0.9213847301095623, "grad_norm": 14.379252433776855, "learning_rate": 2.3095774288810048e-05, "loss": 0.2654, "step": 64250 }, { "epoch": 0.9217432455687489, "grad_norm": 0.3113831579685211, "learning_rate": 2.3091790775244856e-05, "loss": 0.1326, "step": 64275 }, { "epoch": 0.9221017610279355, "grad_norm": 2.4353814125061035, "learning_rate": 2.3087807261679664e-05, "loss": 0.2018, "step": 64300 }, { "epoch": 0.9224602764871221, "grad_norm": 6.26729679107666, "learning_rate": 2.3083823748114468e-05, "loss": 0.1227, "step": 64325 }, { "epoch": 0.9228187919463087, "grad_norm": 22.993057250976562, "learning_rate": 2.307984023454928e-05, "loss": 0.1991, "step": 64350 }, { "epoch": 0.9231773074054953, "grad_norm": 1.8629094362258911, "learning_rate": 2.3075856720984087e-05, "loss": 0.308, "step": 64375 }, { "epoch": 0.9235358228646819, "grad_norm": 3.602379322052002, "learning_rate": 2.3071873207418895e-05, "loss": 0.092, "step": 64400 }, { "epoch": 0.9238943383238686, "grad_norm": 3.6827921867370605, "learning_rate": 2.3067889693853703e-05, "loss": 0.1479, "step": 64425 }, { "epoch": 0.9242528537830551, "grad_norm": 16.656352996826172, "learning_rate": 2.306390618028851e-05, "loss": 0.1552, "step": 64450 }, { "epoch": 0.9246113692422417, "grad_norm": 17.535381317138672, "learning_rate": 2.3059922666723322e-05, "loss": 0.2826, "step": 64475 }, { "epoch": 0.9249698847014284, "grad_norm": 3.774747610092163, "learning_rate": 2.305593915315813e-05, "loss": 0.1485, "step": 64500 }, { "epoch": 0.9253284001606149, "grad_norm": 1.6812539100646973, "learning_rate": 2.3051955639592938e-05, "loss": 0.191, "step": 64525 }, { "epoch": 0.9256869156198015, "grad_norm": 14.002607345581055, "learning_rate": 2.3047972126027746e-05, "loss": 0.3405, "step": 64550 }, { "epoch": 0.9260454310789882, "grad_norm": 1.7037293910980225, "learning_rate": 2.3043988612462553e-05, "loss": 0.1944, "step": 64575 }, { "epoch": 0.9264039465381747, "grad_norm": 3.7873644828796387, "learning_rate": 2.3040005098897365e-05, "loss": 0.157, "step": 64600 }, { "epoch": 0.9267624619973613, "grad_norm": 16.747156143188477, "learning_rate": 2.3036021585332172e-05, "loss": 0.1676, "step": 64625 }, { "epoch": 0.927120977456548, "grad_norm": 17.010276794433594, "learning_rate": 2.303203807176698e-05, "loss": 0.2602, "step": 64650 }, { "epoch": 0.9274794929157345, "grad_norm": 0.23802699148654938, "learning_rate": 2.3028054558201788e-05, "loss": 0.1472, "step": 64675 }, { "epoch": 0.9278380083749211, "grad_norm": 11.326446533203125, "learning_rate": 2.3024071044636596e-05, "loss": 0.2212, "step": 64700 }, { "epoch": 0.9281965238341078, "grad_norm": 12.220940589904785, "learning_rate": 2.3020087531071407e-05, "loss": 0.1714, "step": 64725 }, { "epoch": 0.9285550392932943, "grad_norm": 7.955123424530029, "learning_rate": 2.3016104017506215e-05, "loss": 0.1052, "step": 64750 }, { "epoch": 0.9289135547524809, "grad_norm": 2.443619728088379, "learning_rate": 2.3012120503941023e-05, "loss": 0.1749, "step": 64775 }, { "epoch": 0.9292720702116676, "grad_norm": 17.698070526123047, "learning_rate": 2.300813699037583e-05, "loss": 0.1408, "step": 64800 }, { "epoch": 0.9296305856708541, "grad_norm": 1.949198842048645, "learning_rate": 2.300415347681064e-05, "loss": 0.1363, "step": 64825 }, { "epoch": 0.9299891011300407, "grad_norm": 14.692307472229004, "learning_rate": 2.300016996324545e-05, "loss": 0.2062, "step": 64850 }, { "epoch": 0.9303476165892274, "grad_norm": 1.9562950134277344, "learning_rate": 2.2996186449680258e-05, "loss": 0.1195, "step": 64875 }, { "epoch": 0.9307061320484139, "grad_norm": 1.054731845855713, "learning_rate": 2.2992202936115066e-05, "loss": 0.1741, "step": 64900 }, { "epoch": 0.9310646475076005, "grad_norm": 6.966536045074463, "learning_rate": 2.2988219422549873e-05, "loss": 0.2519, "step": 64925 }, { "epoch": 0.9314231629667872, "grad_norm": 0.7152689695358276, "learning_rate": 2.2984235908984685e-05, "loss": 0.1585, "step": 64950 }, { "epoch": 0.9317816784259737, "grad_norm": 0.4611266851425171, "learning_rate": 2.2980252395419492e-05, "loss": 0.2204, "step": 64975 }, { "epoch": 0.9321401938851603, "grad_norm": 1.1236293315887451, "learning_rate": 2.29762688818543e-05, "loss": 0.112, "step": 65000 }, { "epoch": 0.932498709344347, "grad_norm": 10.04992961883545, "learning_rate": 2.2972285368289108e-05, "loss": 0.186, "step": 65025 }, { "epoch": 0.9328572248035335, "grad_norm": 2.1666486263275146, "learning_rate": 2.2968301854723916e-05, "loss": 0.197, "step": 65050 }, { "epoch": 0.9332157402627201, "grad_norm": 26.781044006347656, "learning_rate": 2.2964318341158727e-05, "loss": 0.1914, "step": 65075 }, { "epoch": 0.9335742557219068, "grad_norm": 12.325298309326172, "learning_rate": 2.2960334827593535e-05, "loss": 0.1902, "step": 65100 }, { "epoch": 0.9339327711810933, "grad_norm": 9.491113662719727, "learning_rate": 2.2956351314028343e-05, "loss": 0.1412, "step": 65125 }, { "epoch": 0.9342912866402799, "grad_norm": 7.559164047241211, "learning_rate": 2.295236780046315e-05, "loss": 0.2032, "step": 65150 }, { "epoch": 0.9346498020994666, "grad_norm": 15.512998580932617, "learning_rate": 2.294838428689796e-05, "loss": 0.2752, "step": 65175 }, { "epoch": 0.9350083175586531, "grad_norm": 2.911090850830078, "learning_rate": 2.294440077333277e-05, "loss": 0.1567, "step": 65200 }, { "epoch": 0.9353668330178397, "grad_norm": 0.1939922273159027, "learning_rate": 2.2940417259767578e-05, "loss": 0.1579, "step": 65225 }, { "epoch": 0.9357253484770264, "grad_norm": 0.5134579539299011, "learning_rate": 2.2936433746202386e-05, "loss": 0.1809, "step": 65250 }, { "epoch": 0.9360838639362129, "grad_norm": 2.8557822704315186, "learning_rate": 2.2932450232637193e-05, "loss": 0.0849, "step": 65275 }, { "epoch": 0.9364423793953995, "grad_norm": 0.6393778920173645, "learning_rate": 2.2928466719072e-05, "loss": 0.2446, "step": 65300 }, { "epoch": 0.9368008948545862, "grad_norm": 1.999107837677002, "learning_rate": 2.2924483205506812e-05, "loss": 0.1847, "step": 65325 }, { "epoch": 0.9371594103137727, "grad_norm": 13.306360244750977, "learning_rate": 2.292049969194162e-05, "loss": 0.2452, "step": 65350 }, { "epoch": 0.9375179257729593, "grad_norm": 3.7062621116638184, "learning_rate": 2.2916516178376428e-05, "loss": 0.1991, "step": 65375 }, { "epoch": 0.937876441232146, "grad_norm": 9.357926368713379, "learning_rate": 2.2912532664811236e-05, "loss": 0.1496, "step": 65400 }, { "epoch": 0.9382349566913325, "grad_norm": 1.2142994403839111, "learning_rate": 2.290854915124604e-05, "loss": 0.2007, "step": 65425 }, { "epoch": 0.9385934721505191, "grad_norm": 2.6168582439422607, "learning_rate": 2.290456563768085e-05, "loss": 0.1865, "step": 65450 }, { "epoch": 0.9389519876097058, "grad_norm": 0.5538811683654785, "learning_rate": 2.290058212411566e-05, "loss": 0.1644, "step": 65475 }, { "epoch": 0.9393105030688923, "grad_norm": 13.080521583557129, "learning_rate": 2.2896598610550467e-05, "loss": 0.2319, "step": 65500 }, { "epoch": 0.9396690185280789, "grad_norm": 9.070815086364746, "learning_rate": 2.2892615096985275e-05, "loss": 0.1188, "step": 65525 }, { "epoch": 0.9400275339872656, "grad_norm": 20.593976974487305, "learning_rate": 2.2888631583420083e-05, "loss": 0.2609, "step": 65550 }, { "epoch": 0.9403860494464521, "grad_norm": 2.9177474975585938, "learning_rate": 2.2884648069854894e-05, "loss": 0.1597, "step": 65575 }, { "epoch": 0.9407445649056387, "grad_norm": 1.7217702865600586, "learning_rate": 2.2880664556289702e-05, "loss": 0.202, "step": 65600 }, { "epoch": 0.9411030803648254, "grad_norm": 1.1888847351074219, "learning_rate": 2.287668104272451e-05, "loss": 0.2041, "step": 65625 }, { "epoch": 0.9414615958240119, "grad_norm": 14.774511337280273, "learning_rate": 2.2872697529159318e-05, "loss": 0.1235, "step": 65650 }, { "epoch": 0.9418201112831985, "grad_norm": 2.0657918453216553, "learning_rate": 2.286871401559413e-05, "loss": 0.2985, "step": 65675 }, { "epoch": 0.9421786267423852, "grad_norm": 5.765331268310547, "learning_rate": 2.2864730502028937e-05, "loss": 0.1227, "step": 65700 }, { "epoch": 0.9425371422015717, "grad_norm": 1.1917980909347534, "learning_rate": 2.2860746988463745e-05, "loss": 0.2427, "step": 65725 }, { "epoch": 0.9428956576607583, "grad_norm": 0.6267575621604919, "learning_rate": 2.2856763474898553e-05, "loss": 0.2371, "step": 65750 }, { "epoch": 0.943254173119945, "grad_norm": 0.06576641649007797, "learning_rate": 2.285277996133336e-05, "loss": 0.2115, "step": 65775 }, { "epoch": 0.9436126885791315, "grad_norm": 1.718745231628418, "learning_rate": 2.284879644776817e-05, "loss": 0.2318, "step": 65800 }, { "epoch": 0.9439712040383181, "grad_norm": 3.541152000427246, "learning_rate": 2.284481293420298e-05, "loss": 0.2204, "step": 65825 }, { "epoch": 0.9443297194975048, "grad_norm": 1.1628252267837524, "learning_rate": 2.2840829420637787e-05, "loss": 0.2557, "step": 65850 }, { "epoch": 0.9446882349566913, "grad_norm": 4.026730537414551, "learning_rate": 2.2836845907072595e-05, "loss": 0.1164, "step": 65875 }, { "epoch": 0.9450467504158779, "grad_norm": 13.110967636108398, "learning_rate": 2.2832862393507403e-05, "loss": 0.2135, "step": 65900 }, { "epoch": 0.9454052658750646, "grad_norm": 10.912766456604004, "learning_rate": 2.2828878879942214e-05, "loss": 0.1741, "step": 65925 }, { "epoch": 0.9457637813342511, "grad_norm": 0.5926514267921448, "learning_rate": 2.2824895366377022e-05, "loss": 0.207, "step": 65950 }, { "epoch": 0.9461222967934377, "grad_norm": 3.218238353729248, "learning_rate": 2.282091185281183e-05, "loss": 0.2092, "step": 65975 }, { "epoch": 0.9464808122526244, "grad_norm": 11.918741226196289, "learning_rate": 2.2816928339246638e-05, "loss": 0.1421, "step": 66000 }, { "epoch": 0.9468393277118109, "grad_norm": 16.938447952270508, "learning_rate": 2.2812944825681446e-05, "loss": 0.2072, "step": 66025 }, { "epoch": 0.9471978431709975, "grad_norm": 1.2325341701507568, "learning_rate": 2.2808961312116257e-05, "loss": 0.1697, "step": 66050 }, { "epoch": 0.9475563586301842, "grad_norm": 0.9332544803619385, "learning_rate": 2.2804977798551065e-05, "loss": 0.186, "step": 66075 }, { "epoch": 0.9479148740893707, "grad_norm": 0.891204833984375, "learning_rate": 2.2800994284985873e-05, "loss": 0.1376, "step": 66100 }, { "epoch": 0.9482733895485573, "grad_norm": 15.223237991333008, "learning_rate": 2.279701077142068e-05, "loss": 0.1568, "step": 66125 }, { "epoch": 0.948631905007744, "grad_norm": 0.5150556564331055, "learning_rate": 2.2793027257855488e-05, "loss": 0.1325, "step": 66150 }, { "epoch": 0.9489904204669305, "grad_norm": 1.9847294092178345, "learning_rate": 2.27890437442903e-05, "loss": 0.1599, "step": 66175 }, { "epoch": 0.9493489359261171, "grad_norm": 0.5640689730644226, "learning_rate": 2.2785060230725107e-05, "loss": 0.1825, "step": 66200 }, { "epoch": 0.9497074513853038, "grad_norm": 3.3505282402038574, "learning_rate": 2.2781076717159915e-05, "loss": 0.211, "step": 66225 }, { "epoch": 0.9500659668444903, "grad_norm": 2.793149471282959, "learning_rate": 2.2777093203594723e-05, "loss": 0.1447, "step": 66250 }, { "epoch": 0.9504244823036769, "grad_norm": 10.775781631469727, "learning_rate": 2.2773109690029534e-05, "loss": 0.2221, "step": 66275 }, { "epoch": 0.9507829977628636, "grad_norm": 9.094637870788574, "learning_rate": 2.2769126176464342e-05, "loss": 0.1762, "step": 66300 }, { "epoch": 0.9511415132220501, "grad_norm": 8.72350025177002, "learning_rate": 2.276514266289915e-05, "loss": 0.1222, "step": 66325 }, { "epoch": 0.9515000286812367, "grad_norm": 1.6533637046813965, "learning_rate": 2.2761159149333958e-05, "loss": 0.18, "step": 66350 }, { "epoch": 0.9518585441404234, "grad_norm": 14.003210067749023, "learning_rate": 2.2757175635768766e-05, "loss": 0.2301, "step": 66375 }, { "epoch": 0.9522170595996099, "grad_norm": 7.459280967712402, "learning_rate": 2.2753192122203577e-05, "loss": 0.2274, "step": 66400 }, { "epoch": 0.9525755750587965, "grad_norm": 17.71633529663086, "learning_rate": 2.2749208608638385e-05, "loss": 0.1999, "step": 66425 }, { "epoch": 0.9529340905179832, "grad_norm": 0.9120614528656006, "learning_rate": 2.2745225095073193e-05, "loss": 0.1134, "step": 66450 }, { "epoch": 0.9532926059771697, "grad_norm": 5.5312700271606445, "learning_rate": 2.2741241581508e-05, "loss": 0.1966, "step": 66475 }, { "epoch": 0.9536511214363563, "grad_norm": 6.826251983642578, "learning_rate": 2.2737258067942808e-05, "loss": 0.1534, "step": 66500 }, { "epoch": 0.954009636895543, "grad_norm": 10.437417030334473, "learning_rate": 2.2733274554377616e-05, "loss": 0.191, "step": 66525 }, { "epoch": 0.9543681523547295, "grad_norm": 14.376974105834961, "learning_rate": 2.2729291040812424e-05, "loss": 0.1451, "step": 66550 }, { "epoch": 0.9547266678139161, "grad_norm": 8.662193298339844, "learning_rate": 2.2725307527247232e-05, "loss": 0.1518, "step": 66575 }, { "epoch": 0.9550851832731028, "grad_norm": 25.478126525878906, "learning_rate": 2.272132401368204e-05, "loss": 0.2178, "step": 66600 }, { "epoch": 0.9554436987322893, "grad_norm": 1.0222910642623901, "learning_rate": 2.2717340500116847e-05, "loss": 0.1682, "step": 66625 }, { "epoch": 0.9558022141914759, "grad_norm": 27.114831924438477, "learning_rate": 2.271335698655166e-05, "loss": 0.1669, "step": 66650 }, { "epoch": 0.9561607296506626, "grad_norm": 22.797569274902344, "learning_rate": 2.2709373472986467e-05, "loss": 0.209, "step": 66675 }, { "epoch": 0.9565192451098491, "grad_norm": 0.39789924025535583, "learning_rate": 2.2705389959421274e-05, "loss": 0.1407, "step": 66700 }, { "epoch": 0.9568777605690357, "grad_norm": 1.0889744758605957, "learning_rate": 2.2701406445856082e-05, "loss": 0.1783, "step": 66725 }, { "epoch": 0.9572362760282224, "grad_norm": 2.373359203338623, "learning_rate": 2.269742293229089e-05, "loss": 0.1888, "step": 66750 }, { "epoch": 0.9575947914874089, "grad_norm": 13.808198928833008, "learning_rate": 2.26934394187257e-05, "loss": 0.3179, "step": 66775 }, { "epoch": 0.9579533069465955, "grad_norm": 11.978759765625, "learning_rate": 2.268945590516051e-05, "loss": 0.1665, "step": 66800 }, { "epoch": 0.9583118224057822, "grad_norm": 4.216922760009766, "learning_rate": 2.2685472391595317e-05, "loss": 0.2351, "step": 66825 }, { "epoch": 0.9586703378649687, "grad_norm": 1.1108307838439941, "learning_rate": 2.2681488878030125e-05, "loss": 0.1314, "step": 66850 }, { "epoch": 0.9590288533241553, "grad_norm": 3.6555421352386475, "learning_rate": 2.2677505364464936e-05, "loss": 0.1082, "step": 66875 }, { "epoch": 0.959387368783342, "grad_norm": 0.23652413487434387, "learning_rate": 2.2673521850899744e-05, "loss": 0.1952, "step": 66900 }, { "epoch": 0.9597458842425285, "grad_norm": 8.428768157958984, "learning_rate": 2.2669538337334552e-05, "loss": 0.2195, "step": 66925 }, { "epoch": 0.9601043997017151, "grad_norm": 0.8114760518074036, "learning_rate": 2.266555482376936e-05, "loss": 0.2178, "step": 66950 }, { "epoch": 0.9604629151609018, "grad_norm": 10.916959762573242, "learning_rate": 2.2661571310204167e-05, "loss": 0.1239, "step": 66975 }, { "epoch": 0.9608214306200883, "grad_norm": 6.770764350891113, "learning_rate": 2.265758779663898e-05, "loss": 0.1559, "step": 67000 }, { "epoch": 0.9611799460792749, "grad_norm": 2.039072036743164, "learning_rate": 2.2653604283073787e-05, "loss": 0.2105, "step": 67025 }, { "epoch": 0.9615384615384616, "grad_norm": 3.2014575004577637, "learning_rate": 2.2649620769508594e-05, "loss": 0.232, "step": 67050 }, { "epoch": 0.9618969769976481, "grad_norm": 0.7589302659034729, "learning_rate": 2.2645637255943402e-05, "loss": 0.1783, "step": 67075 }, { "epoch": 0.9622554924568347, "grad_norm": 0.038318198174238205, "learning_rate": 2.264165374237821e-05, "loss": 0.1751, "step": 67100 }, { "epoch": 0.9626140079160214, "grad_norm": 1.6400319337844849, "learning_rate": 2.263767022881302e-05, "loss": 0.2468, "step": 67125 }, { "epoch": 0.9629725233752079, "grad_norm": 8.56553840637207, "learning_rate": 2.263368671524783e-05, "loss": 0.2739, "step": 67150 }, { "epoch": 0.9633310388343945, "grad_norm": 11.133108139038086, "learning_rate": 2.2629703201682637e-05, "loss": 0.2435, "step": 67175 }, { "epoch": 0.9636895542935812, "grad_norm": 7.340960502624512, "learning_rate": 2.2625719688117445e-05, "loss": 0.2862, "step": 67200 }, { "epoch": 0.9640480697527677, "grad_norm": 0.2213849276304245, "learning_rate": 2.2621736174552253e-05, "loss": 0.0987, "step": 67225 }, { "epoch": 0.9644065852119543, "grad_norm": 5.316972732543945, "learning_rate": 2.2617752660987064e-05, "loss": 0.1631, "step": 67250 }, { "epoch": 0.964765100671141, "grad_norm": 3.8445467948913574, "learning_rate": 2.2613769147421872e-05, "loss": 0.2155, "step": 67275 }, { "epoch": 0.9651236161303275, "grad_norm": 5.600621223449707, "learning_rate": 2.260978563385668e-05, "loss": 0.1718, "step": 67300 }, { "epoch": 0.9654821315895141, "grad_norm": 11.322966575622559, "learning_rate": 2.2605802120291487e-05, "loss": 0.1995, "step": 67325 }, { "epoch": 0.9658406470487008, "grad_norm": 0.03080270066857338, "learning_rate": 2.2601818606726295e-05, "loss": 0.1781, "step": 67350 }, { "epoch": 0.9661991625078873, "grad_norm": 18.466705322265625, "learning_rate": 2.2597835093161107e-05, "loss": 0.1668, "step": 67375 }, { "epoch": 0.9665576779670739, "grad_norm": 6.104811668395996, "learning_rate": 2.2593851579595914e-05, "loss": 0.1719, "step": 67400 }, { "epoch": 0.9669161934262606, "grad_norm": 4.12502384185791, "learning_rate": 2.2589868066030722e-05, "loss": 0.3141, "step": 67425 }, { "epoch": 0.9672747088854471, "grad_norm": 15.263739585876465, "learning_rate": 2.258588455246553e-05, "loss": 0.1803, "step": 67450 }, { "epoch": 0.9676332243446337, "grad_norm": 1.5372730493545532, "learning_rate": 2.258190103890034e-05, "loss": 0.1926, "step": 67475 }, { "epoch": 0.9679917398038204, "grad_norm": 0.5525491833686829, "learning_rate": 2.257791752533515e-05, "loss": 0.0819, "step": 67500 }, { "epoch": 0.9683502552630069, "grad_norm": 1.3442797660827637, "learning_rate": 2.2573934011769957e-05, "loss": 0.1657, "step": 67525 }, { "epoch": 0.9687087707221935, "grad_norm": 5.176055431365967, "learning_rate": 2.2569950498204765e-05, "loss": 0.2863, "step": 67550 }, { "epoch": 0.9690672861813802, "grad_norm": 8.192761421203613, "learning_rate": 2.2565966984639573e-05, "loss": 0.1855, "step": 67575 }, { "epoch": 0.9694258016405667, "grad_norm": 4.963038444519043, "learning_rate": 2.256198347107438e-05, "loss": 0.218, "step": 67600 }, { "epoch": 0.9697843170997533, "grad_norm": 0.5553001761436462, "learning_rate": 2.255799995750919e-05, "loss": 0.0933, "step": 67625 }, { "epoch": 0.97014283255894, "grad_norm": 4.201992988586426, "learning_rate": 2.2554016443943996e-05, "loss": 0.1901, "step": 67650 }, { "epoch": 0.9705013480181265, "grad_norm": 0.15214282274246216, "learning_rate": 2.2550032930378804e-05, "loss": 0.1524, "step": 67675 }, { "epoch": 0.9708598634773131, "grad_norm": 0.9695124626159668, "learning_rate": 2.2546049416813612e-05, "loss": 0.0711, "step": 67700 }, { "epoch": 0.9712183789364998, "grad_norm": 0.49380651116371155, "learning_rate": 2.2542065903248423e-05, "loss": 0.3054, "step": 67725 }, { "epoch": 0.9715768943956864, "grad_norm": 0.6482510566711426, "learning_rate": 2.253808238968323e-05, "loss": 0.1806, "step": 67750 }, { "epoch": 0.9719354098548729, "grad_norm": 5.767922878265381, "learning_rate": 2.253409887611804e-05, "loss": 0.133, "step": 67775 }, { "epoch": 0.9722939253140596, "grad_norm": 0.09698329120874405, "learning_rate": 2.2530115362552847e-05, "loss": 0.1096, "step": 67800 }, { "epoch": 0.9726524407732462, "grad_norm": 4.384365558624268, "learning_rate": 2.2526131848987655e-05, "loss": 0.2639, "step": 67825 }, { "epoch": 0.9730109562324327, "grad_norm": 3.7727537155151367, "learning_rate": 2.2522148335422466e-05, "loss": 0.2053, "step": 67850 }, { "epoch": 0.9733694716916194, "grad_norm": 11.526171684265137, "learning_rate": 2.2518164821857274e-05, "loss": 0.1154, "step": 67875 }, { "epoch": 0.973727987150806, "grad_norm": 13.178618431091309, "learning_rate": 2.251418130829208e-05, "loss": 0.2094, "step": 67900 }, { "epoch": 0.9740865026099925, "grad_norm": 3.206611394882202, "learning_rate": 2.251019779472689e-05, "loss": 0.1232, "step": 67925 }, { "epoch": 0.9744450180691792, "grad_norm": 1.749419093132019, "learning_rate": 2.2506214281161697e-05, "loss": 0.1543, "step": 67950 }, { "epoch": 0.9748035335283658, "grad_norm": 5.502901077270508, "learning_rate": 2.250223076759651e-05, "loss": 0.2354, "step": 67975 }, { "epoch": 0.9751620489875523, "grad_norm": 3.9734914302825928, "learning_rate": 2.2498247254031316e-05, "loss": 0.1524, "step": 68000 }, { "epoch": 0.975520564446739, "grad_norm": 1.5722153186798096, "learning_rate": 2.2494263740466124e-05, "loss": 0.209, "step": 68025 }, { "epoch": 0.9758790799059256, "grad_norm": 4.0601630210876465, "learning_rate": 2.2490280226900932e-05, "loss": 0.1653, "step": 68050 }, { "epoch": 0.9762375953651121, "grad_norm": 11.976208686828613, "learning_rate": 2.2486296713335743e-05, "loss": 0.1825, "step": 68075 }, { "epoch": 0.9765961108242988, "grad_norm": 12.426338195800781, "learning_rate": 2.248231319977055e-05, "loss": 0.1444, "step": 68100 }, { "epoch": 0.9769546262834854, "grad_norm": 16.135440826416016, "learning_rate": 2.247832968620536e-05, "loss": 0.2547, "step": 68125 }, { "epoch": 0.9773131417426719, "grad_norm": 13.066540718078613, "learning_rate": 2.2474346172640167e-05, "loss": 0.1925, "step": 68150 }, { "epoch": 0.9776716572018586, "grad_norm": 4.970682621002197, "learning_rate": 2.2470362659074974e-05, "loss": 0.1591, "step": 68175 }, { "epoch": 0.9780301726610452, "grad_norm": 11.26240062713623, "learning_rate": 2.2466379145509786e-05, "loss": 0.2437, "step": 68200 }, { "epoch": 0.9783886881202317, "grad_norm": 0.6676299571990967, "learning_rate": 2.2462395631944594e-05, "loss": 0.1409, "step": 68225 }, { "epoch": 0.9787472035794184, "grad_norm": 20.749530792236328, "learning_rate": 2.24584121183794e-05, "loss": 0.1177, "step": 68250 }, { "epoch": 0.979105719038605, "grad_norm": 12.962589263916016, "learning_rate": 2.245442860481421e-05, "loss": 0.1896, "step": 68275 }, { "epoch": 0.9794642344977915, "grad_norm": 4.406562805175781, "learning_rate": 2.2450445091249017e-05, "loss": 0.2222, "step": 68300 }, { "epoch": 0.9798227499569782, "grad_norm": 9.097156524658203, "learning_rate": 2.244646157768383e-05, "loss": 0.205, "step": 68325 }, { "epoch": 0.9801812654161648, "grad_norm": 20.85038948059082, "learning_rate": 2.2442478064118636e-05, "loss": 0.2326, "step": 68350 }, { "epoch": 0.9805397808753513, "grad_norm": 0.5271291732788086, "learning_rate": 2.2438494550553444e-05, "loss": 0.2042, "step": 68375 }, { "epoch": 0.980898296334538, "grad_norm": 10.022358894348145, "learning_rate": 2.2434511036988252e-05, "loss": 0.2444, "step": 68400 }, { "epoch": 0.9812568117937246, "grad_norm": 0.3191325068473816, "learning_rate": 2.243052752342306e-05, "loss": 0.2145, "step": 68425 }, { "epoch": 0.9816153272529111, "grad_norm": 0.2609356939792633, "learning_rate": 2.242654400985787e-05, "loss": 0.2203, "step": 68450 }, { "epoch": 0.9819738427120978, "grad_norm": 6.299399375915527, "learning_rate": 2.242256049629268e-05, "loss": 0.1662, "step": 68475 }, { "epoch": 0.9823323581712844, "grad_norm": 19.046070098876953, "learning_rate": 2.2418576982727487e-05, "loss": 0.304, "step": 68500 }, { "epoch": 0.9826908736304709, "grad_norm": 14.723737716674805, "learning_rate": 2.2414593469162294e-05, "loss": 0.1093, "step": 68525 }, { "epoch": 0.9830493890896576, "grad_norm": 21.081218719482422, "learning_rate": 2.2410609955597102e-05, "loss": 0.2399, "step": 68550 }, { "epoch": 0.9834079045488442, "grad_norm": 2.88196063041687, "learning_rate": 2.2406626442031914e-05, "loss": 0.1661, "step": 68575 }, { "epoch": 0.9837664200080307, "grad_norm": 13.793587684631348, "learning_rate": 2.240264292846672e-05, "loss": 0.2188, "step": 68600 }, { "epoch": 0.9841249354672174, "grad_norm": 9.00488567352295, "learning_rate": 2.239865941490153e-05, "loss": 0.205, "step": 68625 }, { "epoch": 0.984483450926404, "grad_norm": 3.0967581272125244, "learning_rate": 2.2394675901336337e-05, "loss": 0.2003, "step": 68650 }, { "epoch": 0.9848419663855905, "grad_norm": 4.0901689529418945, "learning_rate": 2.239069238777115e-05, "loss": 0.1907, "step": 68675 }, { "epoch": 0.9852004818447772, "grad_norm": 4.547916412353516, "learning_rate": 2.2386708874205953e-05, "loss": 0.1929, "step": 68700 }, { "epoch": 0.9855589973039638, "grad_norm": 0.504755973815918, "learning_rate": 2.238272536064076e-05, "loss": 0.2488, "step": 68725 }, { "epoch": 0.9859175127631503, "grad_norm": 16.984989166259766, "learning_rate": 2.237874184707557e-05, "loss": 0.1711, "step": 68750 }, { "epoch": 0.986276028222337, "grad_norm": 4.811713218688965, "learning_rate": 2.2374758333510376e-05, "loss": 0.3068, "step": 68775 }, { "epoch": 0.9866345436815236, "grad_norm": 8.023663520812988, "learning_rate": 2.2370774819945188e-05, "loss": 0.111, "step": 68800 }, { "epoch": 0.9869930591407101, "grad_norm": 5.537216663360596, "learning_rate": 2.2366791306379995e-05, "loss": 0.2111, "step": 68825 }, { "epoch": 0.9873515745998968, "grad_norm": 7.636597633361816, "learning_rate": 2.2362807792814803e-05, "loss": 0.1968, "step": 68850 }, { "epoch": 0.9877100900590834, "grad_norm": 10.758795738220215, "learning_rate": 2.235882427924961e-05, "loss": 0.1783, "step": 68875 }, { "epoch": 0.9880686055182699, "grad_norm": 10.0366792678833, "learning_rate": 2.235484076568442e-05, "loss": 0.1883, "step": 68900 }, { "epoch": 0.9884271209774566, "grad_norm": 0.22808247804641724, "learning_rate": 2.235085725211923e-05, "loss": 0.1744, "step": 68925 }, { "epoch": 0.9887856364366432, "grad_norm": 0.8996685147285461, "learning_rate": 2.2346873738554038e-05, "loss": 0.1628, "step": 68950 }, { "epoch": 0.9891441518958297, "grad_norm": 15.42473316192627, "learning_rate": 2.2342890224988846e-05, "loss": 0.2877, "step": 68975 }, { "epoch": 0.9895026673550164, "grad_norm": 23.439807891845703, "learning_rate": 2.2338906711423654e-05, "loss": 0.2, "step": 69000 }, { "epoch": 0.989861182814203, "grad_norm": 0.044397007673978806, "learning_rate": 2.233492319785846e-05, "loss": 0.0834, "step": 69025 }, { "epoch": 0.9902196982733895, "grad_norm": 7.7410054206848145, "learning_rate": 2.2330939684293273e-05, "loss": 0.2156, "step": 69050 }, { "epoch": 0.9905782137325762, "grad_norm": 11.326866149902344, "learning_rate": 2.232695617072808e-05, "loss": 0.2622, "step": 69075 }, { "epoch": 0.9909367291917628, "grad_norm": 15.605151176452637, "learning_rate": 2.232297265716289e-05, "loss": 0.21, "step": 69100 }, { "epoch": 0.9912952446509493, "grad_norm": 4.188972473144531, "learning_rate": 2.2318989143597696e-05, "loss": 0.2237, "step": 69125 }, { "epoch": 0.991653760110136, "grad_norm": 8.240391731262207, "learning_rate": 2.2315005630032504e-05, "loss": 0.146, "step": 69150 }, { "epoch": 0.9920122755693226, "grad_norm": 23.281675338745117, "learning_rate": 2.2311022116467315e-05, "loss": 0.1913, "step": 69175 }, { "epoch": 0.9923707910285091, "grad_norm": 0.2784132659435272, "learning_rate": 2.2307038602902123e-05, "loss": 0.1656, "step": 69200 }, { "epoch": 0.9927293064876958, "grad_norm": 13.35449504852295, "learning_rate": 2.230305508933693e-05, "loss": 0.1783, "step": 69225 }, { "epoch": 0.9930878219468824, "grad_norm": 4.509952545166016, "learning_rate": 2.229907157577174e-05, "loss": 0.3138, "step": 69250 }, { "epoch": 0.9934463374060689, "grad_norm": 3.504058599472046, "learning_rate": 2.229508806220655e-05, "loss": 0.1518, "step": 69275 }, { "epoch": 0.9938048528652556, "grad_norm": 2.0118911266326904, "learning_rate": 2.2291104548641358e-05, "loss": 0.1242, "step": 69300 }, { "epoch": 0.9941633683244422, "grad_norm": 0.3173293471336365, "learning_rate": 2.2287121035076166e-05, "loss": 0.1891, "step": 69325 }, { "epoch": 0.9945218837836287, "grad_norm": 0.9948386549949646, "learning_rate": 2.2283137521510974e-05, "loss": 0.2201, "step": 69350 }, { "epoch": 0.9948803992428153, "grad_norm": 2.1898133754730225, "learning_rate": 2.227915400794578e-05, "loss": 0.1732, "step": 69375 }, { "epoch": 0.995238914702002, "grad_norm": 3.4672930240631104, "learning_rate": 2.2275170494380593e-05, "loss": 0.1377, "step": 69400 }, { "epoch": 0.9955974301611885, "grad_norm": 0.5938862562179565, "learning_rate": 2.22711869808154e-05, "loss": 0.0953, "step": 69425 }, { "epoch": 0.9959559456203751, "grad_norm": 5.890440940856934, "learning_rate": 2.226720346725021e-05, "loss": 0.2948, "step": 69450 }, { "epoch": 0.9963144610795618, "grad_norm": 0.8590313792228699, "learning_rate": 2.2263219953685016e-05, "loss": 0.2045, "step": 69475 }, { "epoch": 0.9966729765387483, "grad_norm": 1.1784038543701172, "learning_rate": 2.2259236440119824e-05, "loss": 0.1579, "step": 69500 }, { "epoch": 0.997031491997935, "grad_norm": 16.792896270751953, "learning_rate": 2.2255252926554635e-05, "loss": 0.2126, "step": 69525 }, { "epoch": 0.9973900074571216, "grad_norm": 2.3331854343414307, "learning_rate": 2.2251269412989443e-05, "loss": 0.1296, "step": 69550 }, { "epoch": 0.9977485229163081, "grad_norm": 8.11996078491211, "learning_rate": 2.224728589942425e-05, "loss": 0.3035, "step": 69575 }, { "epoch": 0.9981070383754947, "grad_norm": 15.452249526977539, "learning_rate": 2.224330238585906e-05, "loss": 0.221, "step": 69600 }, { "epoch": 0.9984655538346814, "grad_norm": 19.241111755371094, "learning_rate": 2.2239318872293867e-05, "loss": 0.1706, "step": 69625 }, { "epoch": 0.9988240692938679, "grad_norm": 0.27205920219421387, "learning_rate": 2.2235335358728678e-05, "loss": 0.1562, "step": 69650 }, { "epoch": 0.9991825847530545, "grad_norm": 0.9563487768173218, "learning_rate": 2.2231351845163486e-05, "loss": 0.1354, "step": 69675 }, { "epoch": 0.9995411002122412, "grad_norm": 0.37491726875305176, "learning_rate": 2.2227368331598294e-05, "loss": 0.2281, "step": 69700 }, { "epoch": 0.9998996156714277, "grad_norm": 23.303600311279297, "learning_rate": 2.22233848180331e-05, "loss": 0.2172, "step": 69725 }, { "epoch": 1.0, "eval_cosine_accuracy": 0.9504860267314702, "eval_loss": 0.22793857753276825, "eval_runtime": 238.1014, "eval_samples_per_second": 27.652, "eval_steps_per_second": 1.73, "step": 69732 }, { "epoch": 1.0002581311306145, "grad_norm": 0.5439954400062561, "learning_rate": 2.221940130446791e-05, "loss": 0.1416, "step": 69750 }, { "epoch": 1.000616646589801, "grad_norm": 0.824950098991394, "learning_rate": 2.221541779090272e-05, "loss": 0.2021, "step": 69775 }, { "epoch": 1.0009751620489875, "grad_norm": 16.47440528869629, "learning_rate": 2.2211434277337525e-05, "loss": 0.2516, "step": 69800 }, { "epoch": 1.0013336775081743, "grad_norm": 16.597543716430664, "learning_rate": 2.2207450763772333e-05, "loss": 0.0898, "step": 69825 }, { "epoch": 1.0016921929673608, "grad_norm": 0.5622639060020447, "learning_rate": 2.220346725020714e-05, "loss": 0.1165, "step": 69850 }, { "epoch": 1.0020507084265473, "grad_norm": 0.9902083873748779, "learning_rate": 2.219948373664195e-05, "loss": 0.0685, "step": 69875 }, { "epoch": 1.002409223885734, "grad_norm": 1.326607346534729, "learning_rate": 2.219550022307676e-05, "loss": 0.1027, "step": 69900 }, { "epoch": 1.0027677393449206, "grad_norm": 13.088573455810547, "learning_rate": 2.2191516709511568e-05, "loss": 0.0897, "step": 69925 }, { "epoch": 1.003126254804107, "grad_norm": 14.13629150390625, "learning_rate": 2.2187533195946376e-05, "loss": 0.1206, "step": 69950 }, { "epoch": 1.0034847702632939, "grad_norm": 15.489845275878906, "learning_rate": 2.2183549682381183e-05, "loss": 0.2237, "step": 69975 }, { "epoch": 1.0038432857224804, "grad_norm": 16.880908966064453, "learning_rate": 2.2179566168815995e-05, "loss": 0.1257, "step": 70000 }, { "epoch": 1.004201801181667, "grad_norm": 8.138826370239258, "learning_rate": 2.2175582655250802e-05, "loss": 0.1742, "step": 70025 }, { "epoch": 1.0045603166408537, "grad_norm": 1.054599404335022, "learning_rate": 2.217159914168561e-05, "loss": 0.0943, "step": 70050 }, { "epoch": 1.0049188321000402, "grad_norm": 1.651628017425537, "learning_rate": 2.2167615628120418e-05, "loss": 0.184, "step": 70075 }, { "epoch": 1.0052773475592267, "grad_norm": 2.4853122234344482, "learning_rate": 2.2163632114555226e-05, "loss": 0.0937, "step": 70100 }, { "epoch": 1.0056358630184135, "grad_norm": 8.4967679977417, "learning_rate": 2.2159648600990037e-05, "loss": 0.122, "step": 70125 }, { "epoch": 1.0059943784776, "grad_norm": 13.244261741638184, "learning_rate": 2.2155665087424845e-05, "loss": 0.2086, "step": 70150 }, { "epoch": 1.0063528939367865, "grad_norm": 5.970308303833008, "learning_rate": 2.2151681573859653e-05, "loss": 0.2189, "step": 70175 }, { "epoch": 1.0067114093959733, "grad_norm": 0.5512340068817139, "learning_rate": 2.214769806029446e-05, "loss": 0.1673, "step": 70200 }, { "epoch": 1.0070699248551598, "grad_norm": 0.02206023968756199, "learning_rate": 2.214371454672927e-05, "loss": 0.0834, "step": 70225 }, { "epoch": 1.0074284403143463, "grad_norm": 1.5152186155319214, "learning_rate": 2.213973103316408e-05, "loss": 0.1083, "step": 70250 }, { "epoch": 1.007786955773533, "grad_norm": 2.6662800312042236, "learning_rate": 2.2135747519598888e-05, "loss": 0.1287, "step": 70275 }, { "epoch": 1.0081454712327196, "grad_norm": 0.6945073008537292, "learning_rate": 2.2131764006033695e-05, "loss": 0.2242, "step": 70300 }, { "epoch": 1.008503986691906, "grad_norm": 14.1707763671875, "learning_rate": 2.2127780492468503e-05, "loss": 0.1319, "step": 70325 }, { "epoch": 1.0088625021510929, "grad_norm": 0.6004346609115601, "learning_rate": 2.212379697890331e-05, "loss": 0.1349, "step": 70350 }, { "epoch": 1.0092210176102794, "grad_norm": 0.07089722901582718, "learning_rate": 2.2119813465338122e-05, "loss": 0.1463, "step": 70375 }, { "epoch": 1.009579533069466, "grad_norm": 2.678842306137085, "learning_rate": 2.211582995177293e-05, "loss": 0.0933, "step": 70400 }, { "epoch": 1.0099380485286527, "grad_norm": 14.754121780395508, "learning_rate": 2.2111846438207738e-05, "loss": 0.145, "step": 70425 }, { "epoch": 1.0102965639878392, "grad_norm": 5.2585225105285645, "learning_rate": 2.2107862924642546e-05, "loss": 0.1395, "step": 70450 }, { "epoch": 1.0106550794470257, "grad_norm": 0.6327592134475708, "learning_rate": 2.2103879411077354e-05, "loss": 0.1737, "step": 70475 }, { "epoch": 1.0110135949062125, "grad_norm": 1.969727635383606, "learning_rate": 2.2099895897512165e-05, "loss": 0.186, "step": 70500 }, { "epoch": 1.011372110365399, "grad_norm": 12.550702095031738, "learning_rate": 2.2095912383946973e-05, "loss": 0.1917, "step": 70525 }, { "epoch": 1.0117306258245855, "grad_norm": 2.3393123149871826, "learning_rate": 2.209192887038178e-05, "loss": 0.1426, "step": 70550 }, { "epoch": 1.0120891412837723, "grad_norm": 11.93814468383789, "learning_rate": 2.208794535681659e-05, "loss": 0.1707, "step": 70575 }, { "epoch": 1.0124476567429588, "grad_norm": 13.999817848205566, "learning_rate": 2.20839618432514e-05, "loss": 0.2135, "step": 70600 }, { "epoch": 1.0128061722021453, "grad_norm": 0.3083137273788452, "learning_rate": 2.2079978329686208e-05, "loss": 0.169, "step": 70625 }, { "epoch": 1.013164687661332, "grad_norm": 17.771663665771484, "learning_rate": 2.2075994816121015e-05, "loss": 0.2046, "step": 70650 }, { "epoch": 1.0135232031205186, "grad_norm": 0.8010663986206055, "learning_rate": 2.2072011302555823e-05, "loss": 0.1962, "step": 70675 }, { "epoch": 1.013881718579705, "grad_norm": 2.214268207550049, "learning_rate": 2.206802778899063e-05, "loss": 0.117, "step": 70700 }, { "epoch": 1.0142402340388919, "grad_norm": 18.850465774536133, "learning_rate": 2.2064044275425442e-05, "loss": 0.2226, "step": 70725 }, { "epoch": 1.0145987494980784, "grad_norm": 0.20765937864780426, "learning_rate": 2.206006076186025e-05, "loss": 0.0891, "step": 70750 }, { "epoch": 1.014957264957265, "grad_norm": 4.831532001495361, "learning_rate": 2.2056077248295058e-05, "loss": 0.1076, "step": 70775 }, { "epoch": 1.0153157804164517, "grad_norm": 16.361116409301758, "learning_rate": 2.2052093734729866e-05, "loss": 0.1543, "step": 70800 }, { "epoch": 1.0156742958756382, "grad_norm": 18.79647445678711, "learning_rate": 2.2048110221164674e-05, "loss": 0.158, "step": 70825 }, { "epoch": 1.0160328113348247, "grad_norm": 1.5802838802337646, "learning_rate": 2.2044126707599485e-05, "loss": 0.1249, "step": 70850 }, { "epoch": 1.0163913267940115, "grad_norm": 11.758337020874023, "learning_rate": 2.2040143194034293e-05, "loss": 0.116, "step": 70875 }, { "epoch": 1.016749842253198, "grad_norm": 0.17900598049163818, "learning_rate": 2.20361596804691e-05, "loss": 0.0729, "step": 70900 }, { "epoch": 1.0171083577123845, "grad_norm": 12.237061500549316, "learning_rate": 2.2032176166903905e-05, "loss": 0.142, "step": 70925 }, { "epoch": 1.0174668731715713, "grad_norm": 4.644871234893799, "learning_rate": 2.2028192653338713e-05, "loss": 0.1403, "step": 70950 }, { "epoch": 1.0178253886307578, "grad_norm": 8.031601905822754, "learning_rate": 2.2024209139773524e-05, "loss": 0.1623, "step": 70975 }, { "epoch": 1.0181839040899443, "grad_norm": 8.567733764648438, "learning_rate": 2.2020225626208332e-05, "loss": 0.1462, "step": 71000 }, { "epoch": 1.018542419549131, "grad_norm": 0.9428072571754456, "learning_rate": 2.201624211264314e-05, "loss": 0.133, "step": 71025 }, { "epoch": 1.0189009350083176, "grad_norm": 0.38249877095222473, "learning_rate": 2.2012258599077948e-05, "loss": 0.1245, "step": 71050 }, { "epoch": 1.019259450467504, "grad_norm": 0.8997902274131775, "learning_rate": 2.2008275085512756e-05, "loss": 0.1523, "step": 71075 }, { "epoch": 1.0196179659266909, "grad_norm": 2.4188928604125977, "learning_rate": 2.2004291571947567e-05, "loss": 0.1339, "step": 71100 }, { "epoch": 1.0199764813858774, "grad_norm": 9.105585098266602, "learning_rate": 2.2000308058382375e-05, "loss": 0.1347, "step": 71125 }, { "epoch": 1.020334996845064, "grad_norm": 0.13441768288612366, "learning_rate": 2.1996324544817183e-05, "loss": 0.1442, "step": 71150 }, { "epoch": 1.0206935123042506, "grad_norm": 8.943675994873047, "learning_rate": 2.199234103125199e-05, "loss": 0.2218, "step": 71175 }, { "epoch": 1.0210520277634372, "grad_norm": 1.6629793643951416, "learning_rate": 2.19883575176868e-05, "loss": 0.1976, "step": 71200 }, { "epoch": 1.0214105432226237, "grad_norm": 14.408101081848145, "learning_rate": 2.198437400412161e-05, "loss": 0.1368, "step": 71225 }, { "epoch": 1.0217690586818104, "grad_norm": 9.167520523071289, "learning_rate": 2.1980390490556417e-05, "loss": 0.2248, "step": 71250 }, { "epoch": 1.022127574140997, "grad_norm": 15.132585525512695, "learning_rate": 2.1976406976991225e-05, "loss": 0.1987, "step": 71275 }, { "epoch": 1.0224860896001835, "grad_norm": 2.2194724082946777, "learning_rate": 2.1972423463426033e-05, "loss": 0.08, "step": 71300 }, { "epoch": 1.0228446050593702, "grad_norm": 0.3458114266395569, "learning_rate": 2.1968439949860844e-05, "loss": 0.1401, "step": 71325 }, { "epoch": 1.0232031205185568, "grad_norm": 0.2674455940723419, "learning_rate": 2.1964456436295652e-05, "loss": 0.3203, "step": 71350 }, { "epoch": 1.0235616359777433, "grad_norm": 17.217185974121094, "learning_rate": 2.196047292273046e-05, "loss": 0.2007, "step": 71375 }, { "epoch": 1.02392015143693, "grad_norm": 0.33854424953460693, "learning_rate": 2.1956489409165268e-05, "loss": 0.1347, "step": 71400 }, { "epoch": 1.0242786668961166, "grad_norm": 13.677847862243652, "learning_rate": 2.1952505895600076e-05, "loss": 0.2121, "step": 71425 }, { "epoch": 1.024637182355303, "grad_norm": 21.93752670288086, "learning_rate": 2.1948522382034887e-05, "loss": 0.131, "step": 71450 }, { "epoch": 1.0249956978144898, "grad_norm": 0.3502258062362671, "learning_rate": 2.1944538868469695e-05, "loss": 0.1319, "step": 71475 }, { "epoch": 1.0253542132736764, "grad_norm": 0.47493991255760193, "learning_rate": 2.1940555354904503e-05, "loss": 0.1682, "step": 71500 }, { "epoch": 1.025712728732863, "grad_norm": 6.660799980163574, "learning_rate": 2.193657184133931e-05, "loss": 0.153, "step": 71525 }, { "epoch": 1.0260712441920496, "grad_norm": 3.9369184970855713, "learning_rate": 2.1932588327774118e-05, "loss": 0.1299, "step": 71550 }, { "epoch": 1.0264297596512362, "grad_norm": 13.363688468933105, "learning_rate": 2.192860481420893e-05, "loss": 0.1524, "step": 71575 }, { "epoch": 1.0267882751104227, "grad_norm": 15.71542739868164, "learning_rate": 2.1924621300643737e-05, "loss": 0.1182, "step": 71600 }, { "epoch": 1.0271467905696094, "grad_norm": 0.4576958119869232, "learning_rate": 2.1920637787078545e-05, "loss": 0.0632, "step": 71625 }, { "epoch": 1.027505306028796, "grad_norm": 2.487408399581909, "learning_rate": 2.1916654273513353e-05, "loss": 0.07, "step": 71650 }, { "epoch": 1.0278638214879825, "grad_norm": 11.209147453308105, "learning_rate": 2.191267075994816e-05, "loss": 0.1331, "step": 71675 }, { "epoch": 1.0282223369471692, "grad_norm": 23.71634864807129, "learning_rate": 2.1908687246382972e-05, "loss": 0.1504, "step": 71700 }, { "epoch": 1.0285808524063558, "grad_norm": 0.5291299223899841, "learning_rate": 2.190470373281778e-05, "loss": 0.1465, "step": 71725 }, { "epoch": 1.0289393678655423, "grad_norm": 0.22357526421546936, "learning_rate": 2.1900720219252588e-05, "loss": 0.1806, "step": 71750 }, { "epoch": 1.029297883324729, "grad_norm": 0.829918622970581, "learning_rate": 2.1896736705687396e-05, "loss": 0.1344, "step": 71775 }, { "epoch": 1.0296563987839156, "grad_norm": 5.661595344543457, "learning_rate": 2.1892753192122207e-05, "loss": 0.112, "step": 71800 }, { "epoch": 1.030014914243102, "grad_norm": 2.7119059562683105, "learning_rate": 2.1888769678557015e-05, "loss": 0.1992, "step": 71825 }, { "epoch": 1.0303734297022888, "grad_norm": 6.197131633758545, "learning_rate": 2.1884786164991823e-05, "loss": 0.1028, "step": 71850 }, { "epoch": 1.0307319451614754, "grad_norm": 27.760936737060547, "learning_rate": 2.188080265142663e-05, "loss": 0.2523, "step": 71875 }, { "epoch": 1.031090460620662, "grad_norm": 13.48608112335205, "learning_rate": 2.1876819137861438e-05, "loss": 0.2076, "step": 71900 }, { "epoch": 1.0314489760798486, "grad_norm": 24.06319808959961, "learning_rate": 2.187283562429625e-05, "loss": 0.1082, "step": 71925 }, { "epoch": 1.0318074915390352, "grad_norm": 10.022605895996094, "learning_rate": 2.1868852110731057e-05, "loss": 0.1856, "step": 71950 }, { "epoch": 1.0321660069982217, "grad_norm": 0.893922746181488, "learning_rate": 2.1864868597165865e-05, "loss": 0.163, "step": 71975 }, { "epoch": 1.0325245224574084, "grad_norm": 9.860956192016602, "learning_rate": 2.1860885083600673e-05, "loss": 0.1306, "step": 72000 }, { "epoch": 1.032883037916595, "grad_norm": 14.144259452819824, "learning_rate": 2.1856901570035477e-05, "loss": 0.1795, "step": 72025 }, { "epoch": 1.0332415533757815, "grad_norm": 20.213775634765625, "learning_rate": 2.185291805647029e-05, "loss": 0.1758, "step": 72050 }, { "epoch": 1.0336000688349682, "grad_norm": 16.470670700073242, "learning_rate": 2.1848934542905097e-05, "loss": 0.1093, "step": 72075 }, { "epoch": 1.0339585842941548, "grad_norm": 4.243277072906494, "learning_rate": 2.1844951029339904e-05, "loss": 0.1353, "step": 72100 }, { "epoch": 1.0343170997533413, "grad_norm": 0.9842538237571716, "learning_rate": 2.1840967515774712e-05, "loss": 0.0801, "step": 72125 }, { "epoch": 1.034675615212528, "grad_norm": 3.607771158218384, "learning_rate": 2.183698400220952e-05, "loss": 0.1321, "step": 72150 }, { "epoch": 1.0350341306717146, "grad_norm": 2.431185245513916, "learning_rate": 2.183300048864433e-05, "loss": 0.14, "step": 72175 }, { "epoch": 1.035392646130901, "grad_norm": 1.1461457014083862, "learning_rate": 2.182901697507914e-05, "loss": 0.1223, "step": 72200 }, { "epoch": 1.0357511615900878, "grad_norm": 0.77040034532547, "learning_rate": 2.1825033461513947e-05, "loss": 0.201, "step": 72225 }, { "epoch": 1.0361096770492744, "grad_norm": 22.6701717376709, "learning_rate": 2.1821049947948755e-05, "loss": 0.1217, "step": 72250 }, { "epoch": 1.036468192508461, "grad_norm": 1.7139915227890015, "learning_rate": 2.1817066434383563e-05, "loss": 0.173, "step": 72275 }, { "epoch": 1.0368267079676476, "grad_norm": 8.835393905639648, "learning_rate": 2.1813082920818374e-05, "loss": 0.1672, "step": 72300 }, { "epoch": 1.0371852234268342, "grad_norm": 1.9256610870361328, "learning_rate": 2.1809099407253182e-05, "loss": 0.109, "step": 72325 }, { "epoch": 1.0375437388860207, "grad_norm": 4.200699806213379, "learning_rate": 2.180511589368799e-05, "loss": 0.0505, "step": 72350 }, { "epoch": 1.0379022543452074, "grad_norm": 8.110048294067383, "learning_rate": 2.1801132380122797e-05, "loss": 0.1301, "step": 72375 }, { "epoch": 1.038260769804394, "grad_norm": 0.088014155626297, "learning_rate": 2.179714886655761e-05, "loss": 0.1151, "step": 72400 }, { "epoch": 1.0386192852635805, "grad_norm": 3.720142126083374, "learning_rate": 2.1793165352992416e-05, "loss": 0.1197, "step": 72425 }, { "epoch": 1.0389778007227672, "grad_norm": 1.591720461845398, "learning_rate": 2.1789181839427224e-05, "loss": 0.2038, "step": 72450 }, { "epoch": 1.0393363161819538, "grad_norm": 7.205256462097168, "learning_rate": 2.1785198325862032e-05, "loss": 0.1649, "step": 72475 }, { "epoch": 1.0396948316411403, "grad_norm": 4.673331260681152, "learning_rate": 2.178121481229684e-05, "loss": 0.1255, "step": 72500 }, { "epoch": 1.040053347100327, "grad_norm": 1.7059975862503052, "learning_rate": 2.177723129873165e-05, "loss": 0.1074, "step": 72525 }, { "epoch": 1.0404118625595136, "grad_norm": 0.544947624206543, "learning_rate": 2.177324778516646e-05, "loss": 0.1841, "step": 72550 }, { "epoch": 1.0407703780187, "grad_norm": 4.646634578704834, "learning_rate": 2.1769264271601267e-05, "loss": 0.2024, "step": 72575 }, { "epoch": 1.0411288934778868, "grad_norm": 8.558755874633789, "learning_rate": 2.1765280758036075e-05, "loss": 0.1161, "step": 72600 }, { "epoch": 1.0414874089370734, "grad_norm": 0.796449601650238, "learning_rate": 2.1761297244470883e-05, "loss": 0.1854, "step": 72625 }, { "epoch": 1.04184592439626, "grad_norm": 10.080775260925293, "learning_rate": 2.1757313730905694e-05, "loss": 0.1455, "step": 72650 }, { "epoch": 1.0422044398554466, "grad_norm": 4.953771591186523, "learning_rate": 2.1753330217340502e-05, "loss": 0.0681, "step": 72675 }, { "epoch": 1.0425629553146332, "grad_norm": 1.9989047050476074, "learning_rate": 2.174934670377531e-05, "loss": 0.1113, "step": 72700 }, { "epoch": 1.0429214707738197, "grad_norm": 4.478720664978027, "learning_rate": 2.1745363190210117e-05, "loss": 0.1405, "step": 72725 }, { "epoch": 1.0432799862330064, "grad_norm": 2.5461008548736572, "learning_rate": 2.1741379676644925e-05, "loss": 0.1617, "step": 72750 }, { "epoch": 1.043638501692193, "grad_norm": 24.902585983276367, "learning_rate": 2.1737396163079736e-05, "loss": 0.1532, "step": 72775 }, { "epoch": 1.0439970171513795, "grad_norm": 3.3747670650482178, "learning_rate": 2.1733412649514544e-05, "loss": 0.0969, "step": 72800 }, { "epoch": 1.0443555326105662, "grad_norm": 2.740410804748535, "learning_rate": 2.1729429135949352e-05, "loss": 0.1799, "step": 72825 }, { "epoch": 1.0447140480697528, "grad_norm": 0.13681456446647644, "learning_rate": 2.172544562238416e-05, "loss": 0.0981, "step": 72850 }, { "epoch": 1.0450725635289393, "grad_norm": 0.28437089920043945, "learning_rate": 2.1721462108818968e-05, "loss": 0.1326, "step": 72875 }, { "epoch": 1.045431078988126, "grad_norm": 0.43829038739204407, "learning_rate": 2.171747859525378e-05, "loss": 0.169, "step": 72900 }, { "epoch": 1.0457895944473126, "grad_norm": 0.2883650064468384, "learning_rate": 2.1713495081688587e-05, "loss": 0.2468, "step": 72925 }, { "epoch": 1.046148109906499, "grad_norm": 8.538219451904297, "learning_rate": 2.1709511568123395e-05, "loss": 0.1205, "step": 72950 }, { "epoch": 1.0465066253656858, "grad_norm": 8.115805625915527, "learning_rate": 2.1705528054558203e-05, "loss": 0.1526, "step": 72975 }, { "epoch": 1.0468651408248724, "grad_norm": 2.4602932929992676, "learning_rate": 2.1701544540993014e-05, "loss": 0.1311, "step": 73000 }, { "epoch": 1.047223656284059, "grad_norm": 23.459238052368164, "learning_rate": 2.1697561027427822e-05, "loss": 0.1813, "step": 73025 }, { "epoch": 1.0475821717432456, "grad_norm": 0.2930394411087036, "learning_rate": 2.169357751386263e-05, "loss": 0.1733, "step": 73050 }, { "epoch": 1.0479406872024322, "grad_norm": 3.1329870223999023, "learning_rate": 2.1689594000297437e-05, "loss": 0.1255, "step": 73075 }, { "epoch": 1.0482992026616187, "grad_norm": 3.9612462520599365, "learning_rate": 2.1685610486732245e-05, "loss": 0.2009, "step": 73100 }, { "epoch": 1.0486577181208054, "grad_norm": 12.027514457702637, "learning_rate": 2.1681626973167053e-05, "loss": 0.1914, "step": 73125 }, { "epoch": 1.049016233579992, "grad_norm": 2.440185546875, "learning_rate": 2.167764345960186e-05, "loss": 0.1282, "step": 73150 }, { "epoch": 1.0493747490391785, "grad_norm": 0.1551116406917572, "learning_rate": 2.167365994603667e-05, "loss": 0.1931, "step": 73175 }, { "epoch": 1.0497332644983652, "grad_norm": 3.5945487022399902, "learning_rate": 2.1669676432471477e-05, "loss": 0.125, "step": 73200 }, { "epoch": 1.0500917799575518, "grad_norm": 15.197063446044922, "learning_rate": 2.1665692918906284e-05, "loss": 0.1519, "step": 73225 }, { "epoch": 1.0504502954167383, "grad_norm": 0.1356254369020462, "learning_rate": 2.1661709405341096e-05, "loss": 0.0922, "step": 73250 }, { "epoch": 1.050808810875925, "grad_norm": 3.551541566848755, "learning_rate": 2.1657725891775904e-05, "loss": 0.1236, "step": 73275 }, { "epoch": 1.0511673263351116, "grad_norm": 6.680334091186523, "learning_rate": 2.165374237821071e-05, "loss": 0.144, "step": 73300 }, { "epoch": 1.051525841794298, "grad_norm": 2.9441003799438477, "learning_rate": 2.164975886464552e-05, "loss": 0.0954, "step": 73325 }, { "epoch": 1.0518843572534848, "grad_norm": 1.5956121683120728, "learning_rate": 2.1645775351080327e-05, "loss": 0.1446, "step": 73350 }, { "epoch": 1.0522428727126714, "grad_norm": 3.842463493347168, "learning_rate": 2.164179183751514e-05, "loss": 0.1191, "step": 73375 }, { "epoch": 1.0526013881718579, "grad_norm": 3.3749682903289795, "learning_rate": 2.1637808323949946e-05, "loss": 0.0991, "step": 73400 }, { "epoch": 1.0529599036310446, "grad_norm": 19.01183319091797, "learning_rate": 2.1633824810384754e-05, "loss": 0.1542, "step": 73425 }, { "epoch": 1.0533184190902312, "grad_norm": 22.38410758972168, "learning_rate": 2.1629841296819562e-05, "loss": 0.1706, "step": 73450 }, { "epoch": 1.0536769345494177, "grad_norm": 1.0347670316696167, "learning_rate": 2.162585778325437e-05, "loss": 0.1811, "step": 73475 }, { "epoch": 1.0540354500086044, "grad_norm": 3.29587984085083, "learning_rate": 2.162187426968918e-05, "loss": 0.1268, "step": 73500 }, { "epoch": 1.054393965467791, "grad_norm": 8.591434478759766, "learning_rate": 2.161789075612399e-05, "loss": 0.1841, "step": 73525 }, { "epoch": 1.0547524809269775, "grad_norm": 7.84783411026001, "learning_rate": 2.1613907242558797e-05, "loss": 0.142, "step": 73550 }, { "epoch": 1.0551109963861642, "grad_norm": 1.460655689239502, "learning_rate": 2.1609923728993604e-05, "loss": 0.1329, "step": 73575 }, { "epoch": 1.0554695118453508, "grad_norm": 10.131775856018066, "learning_rate": 2.1605940215428416e-05, "loss": 0.115, "step": 73600 }, { "epoch": 1.0558280273045373, "grad_norm": 14.576871871948242, "learning_rate": 2.1601956701863224e-05, "loss": 0.2795, "step": 73625 }, { "epoch": 1.056186542763724, "grad_norm": 6.581235885620117, "learning_rate": 2.159797318829803e-05, "loss": 0.137, "step": 73650 }, { "epoch": 1.0565450582229106, "grad_norm": 19.02414894104004, "learning_rate": 2.159398967473284e-05, "loss": 0.2059, "step": 73675 }, { "epoch": 1.056903573682097, "grad_norm": 2.508044958114624, "learning_rate": 2.1590006161167647e-05, "loss": 0.1317, "step": 73700 }, { "epoch": 1.0572620891412838, "grad_norm": 1.822455883026123, "learning_rate": 2.1586022647602458e-05, "loss": 0.1266, "step": 73725 }, { "epoch": 1.0576206046004704, "grad_norm": 0.8435398936271667, "learning_rate": 2.1582039134037266e-05, "loss": 0.0911, "step": 73750 }, { "epoch": 1.0579791200596569, "grad_norm": 3.2141666412353516, "learning_rate": 2.1578055620472074e-05, "loss": 0.1616, "step": 73775 }, { "epoch": 1.0583376355188436, "grad_norm": 4.45272970199585, "learning_rate": 2.1574072106906882e-05, "loss": 0.183, "step": 73800 }, { "epoch": 1.0586961509780302, "grad_norm": 2.5230648517608643, "learning_rate": 2.157008859334169e-05, "loss": 0.2139, "step": 73825 }, { "epoch": 1.0590546664372167, "grad_norm": 5.077320098876953, "learning_rate": 2.15661050797765e-05, "loss": 0.136, "step": 73850 }, { "epoch": 1.0594131818964034, "grad_norm": 2.171055793762207, "learning_rate": 2.156212156621131e-05, "loss": 0.1495, "step": 73875 }, { "epoch": 1.05977169735559, "grad_norm": 0.8190237283706665, "learning_rate": 2.1558138052646117e-05, "loss": 0.1223, "step": 73900 }, { "epoch": 1.0601302128147765, "grad_norm": 0.9676940441131592, "learning_rate": 2.1554154539080924e-05, "loss": 0.1398, "step": 73925 }, { "epoch": 1.0604887282739632, "grad_norm": 0.14995378255844116, "learning_rate": 2.1550171025515732e-05, "loss": 0.2262, "step": 73950 }, { "epoch": 1.0608472437331498, "grad_norm": 0.4536886513233185, "learning_rate": 2.1546187511950544e-05, "loss": 0.1684, "step": 73975 }, { "epoch": 1.0612057591923363, "grad_norm": 10.137675285339355, "learning_rate": 2.154220399838535e-05, "loss": 0.2453, "step": 74000 }, { "epoch": 1.061564274651523, "grad_norm": 9.865352630615234, "learning_rate": 2.153822048482016e-05, "loss": 0.2257, "step": 74025 }, { "epoch": 1.0619227901107096, "grad_norm": 14.665505409240723, "learning_rate": 2.1534236971254967e-05, "loss": 0.1458, "step": 74050 }, { "epoch": 1.062281305569896, "grad_norm": 16.966413497924805, "learning_rate": 2.1530253457689775e-05, "loss": 0.1483, "step": 74075 }, { "epoch": 1.0626398210290828, "grad_norm": 13.963542938232422, "learning_rate": 2.1526269944124586e-05, "loss": 0.1925, "step": 74100 }, { "epoch": 1.0629983364882694, "grad_norm": 9.972513198852539, "learning_rate": 2.1522286430559394e-05, "loss": 0.1377, "step": 74125 }, { "epoch": 1.0633568519474559, "grad_norm": 22.674530029296875, "learning_rate": 2.1518302916994202e-05, "loss": 0.1478, "step": 74150 }, { "epoch": 1.0637153674066426, "grad_norm": 3.4535701274871826, "learning_rate": 2.151431940342901e-05, "loss": 0.158, "step": 74175 }, { "epoch": 1.0640738828658292, "grad_norm": 0.3173743188381195, "learning_rate": 2.1510335889863818e-05, "loss": 0.1697, "step": 74200 }, { "epoch": 1.0644323983250157, "grad_norm": 1.996336579322815, "learning_rate": 2.1506352376298625e-05, "loss": 0.1295, "step": 74225 }, { "epoch": 1.0647909137842024, "grad_norm": 5.700761795043945, "learning_rate": 2.1502368862733433e-05, "loss": 0.1458, "step": 74250 }, { "epoch": 1.065149429243389, "grad_norm": 0.8414364457130432, "learning_rate": 2.149838534916824e-05, "loss": 0.2093, "step": 74275 }, { "epoch": 1.0655079447025755, "grad_norm": 11.742996215820312, "learning_rate": 2.149440183560305e-05, "loss": 0.123, "step": 74300 }, { "epoch": 1.0658664601617622, "grad_norm": 10.260698318481445, "learning_rate": 2.149041832203786e-05, "loss": 0.1965, "step": 74325 }, { "epoch": 1.0662249756209488, "grad_norm": 4.16531229019165, "learning_rate": 2.1486434808472668e-05, "loss": 0.1093, "step": 74350 }, { "epoch": 1.0665834910801353, "grad_norm": 2.8642234802246094, "learning_rate": 2.1482451294907476e-05, "loss": 0.1731, "step": 74375 }, { "epoch": 1.066942006539322, "grad_norm": 7.827768802642822, "learning_rate": 2.1478467781342284e-05, "loss": 0.1858, "step": 74400 }, { "epoch": 1.0673005219985086, "grad_norm": 8.302613258361816, "learning_rate": 2.147448426777709e-05, "loss": 0.1148, "step": 74425 }, { "epoch": 1.067659037457695, "grad_norm": 18.014883041381836, "learning_rate": 2.1470500754211903e-05, "loss": 0.1434, "step": 74450 }, { "epoch": 1.0680175529168818, "grad_norm": 20.491130828857422, "learning_rate": 2.146651724064671e-05, "loss": 0.0759, "step": 74475 }, { "epoch": 1.0683760683760684, "grad_norm": 10.412945747375488, "learning_rate": 2.146253372708152e-05, "loss": 0.1818, "step": 74500 }, { "epoch": 1.0687345838352549, "grad_norm": 4.467052459716797, "learning_rate": 2.1458550213516326e-05, "loss": 0.1102, "step": 74525 }, { "epoch": 1.0690930992944416, "grad_norm": 15.5189847946167, "learning_rate": 2.1454566699951134e-05, "loss": 0.2528, "step": 74550 }, { "epoch": 1.0694516147536282, "grad_norm": 1.6723750829696655, "learning_rate": 2.1450583186385945e-05, "loss": 0.1166, "step": 74575 }, { "epoch": 1.0698101302128147, "grad_norm": 0.6416935920715332, "learning_rate": 2.1446599672820753e-05, "loss": 0.1694, "step": 74600 }, { "epoch": 1.0701686456720014, "grad_norm": 0.5500681400299072, "learning_rate": 2.144261615925556e-05, "loss": 0.1425, "step": 74625 }, { "epoch": 1.070527161131188, "grad_norm": 1.0582433938980103, "learning_rate": 2.143863264569037e-05, "loss": 0.1286, "step": 74650 }, { "epoch": 1.0708856765903745, "grad_norm": 7.302096843719482, "learning_rate": 2.1434649132125177e-05, "loss": 0.127, "step": 74675 }, { "epoch": 1.0712441920495612, "grad_norm": 25.150394439697266, "learning_rate": 2.1430665618559988e-05, "loss": 0.221, "step": 74700 }, { "epoch": 1.0716027075087478, "grad_norm": 1.4272770881652832, "learning_rate": 2.1426682104994796e-05, "loss": 0.2151, "step": 74725 }, { "epoch": 1.0719612229679343, "grad_norm": 5.20337438583374, "learning_rate": 2.1422698591429604e-05, "loss": 0.1185, "step": 74750 }, { "epoch": 1.072319738427121, "grad_norm": 3.42964506149292, "learning_rate": 2.141871507786441e-05, "loss": 0.1583, "step": 74775 }, { "epoch": 1.0726782538863076, "grad_norm": 0.7872825264930725, "learning_rate": 2.141473156429922e-05, "loss": 0.0888, "step": 74800 }, { "epoch": 1.073036769345494, "grad_norm": 5.001439571380615, "learning_rate": 2.141074805073403e-05, "loss": 0.1463, "step": 74825 }, { "epoch": 1.0733952848046808, "grad_norm": 9.698424339294434, "learning_rate": 2.140676453716884e-05, "loss": 0.1173, "step": 74850 }, { "epoch": 1.0737538002638674, "grad_norm": 1.9604328870773315, "learning_rate": 2.1402781023603646e-05, "loss": 0.1149, "step": 74875 }, { "epoch": 1.0741123157230539, "grad_norm": 11.914599418640137, "learning_rate": 2.1398797510038454e-05, "loss": 0.1382, "step": 74900 }, { "epoch": 1.0744708311822406, "grad_norm": 0.9643842577934265, "learning_rate": 2.1394813996473265e-05, "loss": 0.268, "step": 74925 }, { "epoch": 1.0748293466414272, "grad_norm": 0.8292235136032104, "learning_rate": 2.1390830482908073e-05, "loss": 0.1543, "step": 74950 }, { "epoch": 1.0751878621006137, "grad_norm": 12.92324447631836, "learning_rate": 2.138684696934288e-05, "loss": 0.1484, "step": 74975 }, { "epoch": 1.0755463775598004, "grad_norm": 13.511861801147461, "learning_rate": 2.138286345577769e-05, "loss": 0.1906, "step": 75000 }, { "epoch": 1.075904893018987, "grad_norm": 12.674439430236816, "learning_rate": 2.1378879942212497e-05, "loss": 0.1296, "step": 75025 }, { "epoch": 1.0762634084781735, "grad_norm": 0.8037183880805969, "learning_rate": 2.1374896428647308e-05, "loss": 0.1001, "step": 75050 }, { "epoch": 1.0766219239373602, "grad_norm": 0.9350007176399231, "learning_rate": 2.1370912915082116e-05, "loss": 0.0836, "step": 75075 }, { "epoch": 1.0769804393965468, "grad_norm": 0.175935298204422, "learning_rate": 2.1366929401516924e-05, "loss": 0.1894, "step": 75100 }, { "epoch": 1.0773389548557333, "grad_norm": 1.2355228662490845, "learning_rate": 2.136294588795173e-05, "loss": 0.1205, "step": 75125 }, { "epoch": 1.07769747031492, "grad_norm": 10.245357513427734, "learning_rate": 2.135896237438654e-05, "loss": 0.1783, "step": 75150 }, { "epoch": 1.0780559857741066, "grad_norm": 5.230781078338623, "learning_rate": 2.135497886082135e-05, "loss": 0.1834, "step": 75175 }, { "epoch": 1.078414501233293, "grad_norm": 11.548373222351074, "learning_rate": 2.135099534725616e-05, "loss": 0.1726, "step": 75200 }, { "epoch": 1.0787730166924798, "grad_norm": 21.514171600341797, "learning_rate": 2.1347011833690966e-05, "loss": 0.119, "step": 75225 }, { "epoch": 1.0791315321516664, "grad_norm": 10.012129783630371, "learning_rate": 2.1343028320125774e-05, "loss": 0.1299, "step": 75250 }, { "epoch": 1.0794900476108529, "grad_norm": 1.2423027753829956, "learning_rate": 2.1339044806560582e-05, "loss": 0.1594, "step": 75275 }, { "epoch": 1.0798485630700396, "grad_norm": 3.4014484882354736, "learning_rate": 2.133506129299539e-05, "loss": 0.115, "step": 75300 }, { "epoch": 1.0802070785292262, "grad_norm": 21.519819259643555, "learning_rate": 2.1331077779430198e-05, "loss": 0.1768, "step": 75325 }, { "epoch": 1.0805655939884127, "grad_norm": 2.868095874786377, "learning_rate": 2.1327094265865005e-05, "loss": 0.3267, "step": 75350 }, { "epoch": 1.0809241094475994, "grad_norm": 0.5745278596878052, "learning_rate": 2.1323110752299813e-05, "loss": 0.1506, "step": 75375 }, { "epoch": 1.081282624906786, "grad_norm": 22.30284309387207, "learning_rate": 2.131912723873462e-05, "loss": 0.2049, "step": 75400 }, { "epoch": 1.0816411403659725, "grad_norm": 3.433523416519165, "learning_rate": 2.1315143725169432e-05, "loss": 0.2055, "step": 75425 }, { "epoch": 1.0819996558251592, "grad_norm": 0.4930567443370819, "learning_rate": 2.131116021160424e-05, "loss": 0.1256, "step": 75450 }, { "epoch": 1.0823581712843457, "grad_norm": 3.148024797439575, "learning_rate": 2.1307176698039048e-05, "loss": 0.1631, "step": 75475 }, { "epoch": 1.0827166867435323, "grad_norm": 20.53632354736328, "learning_rate": 2.1303193184473856e-05, "loss": 0.1632, "step": 75500 }, { "epoch": 1.083075202202719, "grad_norm": 23.041898727416992, "learning_rate": 2.1299209670908667e-05, "loss": 0.1781, "step": 75525 }, { "epoch": 1.0834337176619055, "grad_norm": 0.16257579624652863, "learning_rate": 2.1295226157343475e-05, "loss": 0.1551, "step": 75550 }, { "epoch": 1.083792233121092, "grad_norm": 2.964611053466797, "learning_rate": 2.1291242643778283e-05, "loss": 0.1422, "step": 75575 }, { "epoch": 1.0841507485802788, "grad_norm": 12.171220779418945, "learning_rate": 2.128725913021309e-05, "loss": 0.2017, "step": 75600 }, { "epoch": 1.0845092640394653, "grad_norm": 15.778399467468262, "learning_rate": 2.12832756166479e-05, "loss": 0.1751, "step": 75625 }, { "epoch": 1.0848677794986519, "grad_norm": 2.5927984714508057, "learning_rate": 2.127929210308271e-05, "loss": 0.1827, "step": 75650 }, { "epoch": 1.0852262949578386, "grad_norm": 1.6832128763198853, "learning_rate": 2.1275308589517518e-05, "loss": 0.1271, "step": 75675 }, { "epoch": 1.0855848104170251, "grad_norm": 0.667809784412384, "learning_rate": 2.1271325075952325e-05, "loss": 0.0981, "step": 75700 }, { "epoch": 1.0859433258762117, "grad_norm": 0.7409619092941284, "learning_rate": 2.1267341562387133e-05, "loss": 0.1077, "step": 75725 }, { "epoch": 1.0863018413353984, "grad_norm": 3.4977197647094727, "learning_rate": 2.126335804882194e-05, "loss": 0.0952, "step": 75750 }, { "epoch": 1.086660356794585, "grad_norm": 13.643017768859863, "learning_rate": 2.1259374535256752e-05, "loss": 0.1971, "step": 75775 }, { "epoch": 1.0870188722537715, "grad_norm": 1.591834545135498, "learning_rate": 2.125539102169156e-05, "loss": 0.1465, "step": 75800 }, { "epoch": 1.0873773877129582, "grad_norm": 7.752531051635742, "learning_rate": 2.1251407508126368e-05, "loss": 0.1515, "step": 75825 }, { "epoch": 1.0877359031721447, "grad_norm": 7.520070552825928, "learning_rate": 2.1247423994561176e-05, "loss": 0.2719, "step": 75850 }, { "epoch": 1.0880944186313313, "grad_norm": 3.9792985916137695, "learning_rate": 2.1243440480995984e-05, "loss": 0.1101, "step": 75875 }, { "epoch": 1.088452934090518, "grad_norm": 5.056556224822998, "learning_rate": 2.1239456967430795e-05, "loss": 0.1639, "step": 75900 }, { "epoch": 1.0888114495497045, "grad_norm": 2.8465046882629395, "learning_rate": 2.1235473453865603e-05, "loss": 0.1819, "step": 75925 }, { "epoch": 1.089169965008891, "grad_norm": 15.220673561096191, "learning_rate": 2.123148994030041e-05, "loss": 0.1195, "step": 75950 }, { "epoch": 1.0895284804680778, "grad_norm": 2.2191007137298584, "learning_rate": 2.122750642673522e-05, "loss": 0.1654, "step": 75975 }, { "epoch": 1.0898869959272643, "grad_norm": 0.9882014989852905, "learning_rate": 2.1223522913170026e-05, "loss": 0.1191, "step": 76000 }, { "epoch": 1.0902455113864509, "grad_norm": 4.096860885620117, "learning_rate": 2.1219539399604838e-05, "loss": 0.1124, "step": 76025 }, { "epoch": 1.0906040268456376, "grad_norm": 8.708625793457031, "learning_rate": 2.1215555886039645e-05, "loss": 0.1574, "step": 76050 }, { "epoch": 1.0909625423048241, "grad_norm": 8.429116249084473, "learning_rate": 2.1211572372474453e-05, "loss": 0.1191, "step": 76075 }, { "epoch": 1.0913210577640107, "grad_norm": 6.356689929962158, "learning_rate": 2.120758885890926e-05, "loss": 0.1886, "step": 76100 }, { "epoch": 1.0916795732231974, "grad_norm": 15.289088249206543, "learning_rate": 2.1203605345344072e-05, "loss": 0.1457, "step": 76125 }, { "epoch": 1.092038088682384, "grad_norm": 17.941638946533203, "learning_rate": 2.119962183177888e-05, "loss": 0.1586, "step": 76150 }, { "epoch": 1.0923966041415705, "grad_norm": 16.35716438293457, "learning_rate": 2.1195638318213688e-05, "loss": 0.1642, "step": 76175 }, { "epoch": 1.0927551196007572, "grad_norm": 17.698699951171875, "learning_rate": 2.1191654804648496e-05, "loss": 0.1338, "step": 76200 }, { "epoch": 1.0931136350599437, "grad_norm": 21.158151626586914, "learning_rate": 2.1187671291083304e-05, "loss": 0.21, "step": 76225 }, { "epoch": 1.0934721505191303, "grad_norm": 6.143709659576416, "learning_rate": 2.1183687777518115e-05, "loss": 0.2365, "step": 76250 }, { "epoch": 1.093830665978317, "grad_norm": 3.7836244106292725, "learning_rate": 2.1179704263952923e-05, "loss": 0.1413, "step": 76275 }, { "epoch": 1.0941891814375035, "grad_norm": 3.2990243434906006, "learning_rate": 2.117572075038773e-05, "loss": 0.1233, "step": 76300 }, { "epoch": 1.09454769689669, "grad_norm": 1.4591338634490967, "learning_rate": 2.117173723682254e-05, "loss": 0.1564, "step": 76325 }, { "epoch": 1.0949062123558768, "grad_norm": 12.104814529418945, "learning_rate": 2.1167753723257346e-05, "loss": 0.1332, "step": 76350 }, { "epoch": 1.0952647278150633, "grad_norm": 15.015909194946289, "learning_rate": 2.1163770209692158e-05, "loss": 0.1283, "step": 76375 }, { "epoch": 1.0956232432742499, "grad_norm": 9.891599655151367, "learning_rate": 2.1159786696126962e-05, "loss": 0.1025, "step": 76400 }, { "epoch": 1.0959817587334366, "grad_norm": 14.54979133605957, "learning_rate": 2.115580318256177e-05, "loss": 0.102, "step": 76425 }, { "epoch": 1.0963402741926231, "grad_norm": 0.4912387728691101, "learning_rate": 2.1151819668996578e-05, "loss": 0.1406, "step": 76450 }, { "epoch": 1.0966987896518097, "grad_norm": 0.63905930519104, "learning_rate": 2.1147836155431386e-05, "loss": 0.0889, "step": 76475 }, { "epoch": 1.0970573051109964, "grad_norm": 6.416964054107666, "learning_rate": 2.1143852641866197e-05, "loss": 0.1351, "step": 76500 }, { "epoch": 1.097415820570183, "grad_norm": 9.968045234680176, "learning_rate": 2.1139869128301005e-05, "loss": 0.1838, "step": 76525 }, { "epoch": 1.0977743360293695, "grad_norm": 22.751209259033203, "learning_rate": 2.1135885614735813e-05, "loss": 0.1796, "step": 76550 }, { "epoch": 1.0981328514885562, "grad_norm": 9.568964004516602, "learning_rate": 2.113190210117062e-05, "loss": 0.1316, "step": 76575 }, { "epoch": 1.0984913669477427, "grad_norm": 13.952173233032227, "learning_rate": 2.1127918587605428e-05, "loss": 0.1447, "step": 76600 }, { "epoch": 1.0988498824069295, "grad_norm": 0.3559722602367401, "learning_rate": 2.112393507404024e-05, "loss": 0.1368, "step": 76625 }, { "epoch": 1.099208397866116, "grad_norm": 15.229116439819336, "learning_rate": 2.1119951560475047e-05, "loss": 0.1081, "step": 76650 }, { "epoch": 1.0995669133253025, "grad_norm": 6.985005855560303, "learning_rate": 2.1115968046909855e-05, "loss": 0.1746, "step": 76675 }, { "epoch": 1.0999254287844893, "grad_norm": 1.507651925086975, "learning_rate": 2.1111984533344663e-05, "loss": 0.1542, "step": 76700 }, { "epoch": 1.1002839442436758, "grad_norm": 0.36558404564857483, "learning_rate": 2.1108001019779474e-05, "loss": 0.0971, "step": 76725 }, { "epoch": 1.1006424597028623, "grad_norm": 3.764385223388672, "learning_rate": 2.1104017506214282e-05, "loss": 0.1008, "step": 76750 }, { "epoch": 1.101000975162049, "grad_norm": 5.714193344116211, "learning_rate": 2.110003399264909e-05, "loss": 0.1456, "step": 76775 }, { "epoch": 1.1013594906212356, "grad_norm": 1.9392465353012085, "learning_rate": 2.1096050479083898e-05, "loss": 0.1797, "step": 76800 }, { "epoch": 1.1017180060804221, "grad_norm": 0.2596932649612427, "learning_rate": 2.1092066965518706e-05, "loss": 0.1538, "step": 76825 }, { "epoch": 1.1020765215396089, "grad_norm": 13.970687866210938, "learning_rate": 2.1088083451953517e-05, "loss": 0.1325, "step": 76850 }, { "epoch": 1.1024350369987954, "grad_norm": 0.0717901811003685, "learning_rate": 2.1084099938388325e-05, "loss": 0.1198, "step": 76875 }, { "epoch": 1.102793552457982, "grad_norm": 2.37461519241333, "learning_rate": 2.1080116424823132e-05, "loss": 0.1316, "step": 76900 }, { "epoch": 1.1031520679171687, "grad_norm": 13.238046646118164, "learning_rate": 2.107613291125794e-05, "loss": 0.153, "step": 76925 }, { "epoch": 1.1035105833763552, "grad_norm": 1.5240917205810547, "learning_rate": 2.1072149397692748e-05, "loss": 0.1827, "step": 76950 }, { "epoch": 1.1038690988355417, "grad_norm": 7.542766094207764, "learning_rate": 2.106816588412756e-05, "loss": 0.073, "step": 76975 }, { "epoch": 1.1042276142947285, "grad_norm": 9.79904842376709, "learning_rate": 2.1064182370562367e-05, "loss": 0.087, "step": 77000 }, { "epoch": 1.104586129753915, "grad_norm": 0.014737817458808422, "learning_rate": 2.1060198856997175e-05, "loss": 0.1112, "step": 77025 }, { "epoch": 1.1049446452131015, "grad_norm": 2.246257781982422, "learning_rate": 2.1056215343431983e-05, "loss": 0.2096, "step": 77050 }, { "epoch": 1.1053031606722883, "grad_norm": 5.377308368682861, "learning_rate": 2.105223182986679e-05, "loss": 0.1397, "step": 77075 }, { "epoch": 1.1056616761314748, "grad_norm": 1.9875342845916748, "learning_rate": 2.1048248316301602e-05, "loss": 0.1483, "step": 77100 }, { "epoch": 1.1060201915906613, "grad_norm": 0.42216870188713074, "learning_rate": 2.104426480273641e-05, "loss": 0.1601, "step": 77125 }, { "epoch": 1.106378707049848, "grad_norm": 12.137541770935059, "learning_rate": 2.1040281289171218e-05, "loss": 0.1135, "step": 77150 }, { "epoch": 1.1067372225090346, "grad_norm": 1.0653133392333984, "learning_rate": 2.1036297775606026e-05, "loss": 0.1332, "step": 77175 }, { "epoch": 1.1070957379682211, "grad_norm": 5.814371585845947, "learning_rate": 2.1032314262040833e-05, "loss": 0.1295, "step": 77200 }, { "epoch": 1.1074542534274079, "grad_norm": 3.8178019523620605, "learning_rate": 2.1028330748475645e-05, "loss": 0.075, "step": 77225 }, { "epoch": 1.1078127688865944, "grad_norm": 16.243518829345703, "learning_rate": 2.1024347234910452e-05, "loss": 0.092, "step": 77250 }, { "epoch": 1.108171284345781, "grad_norm": 0.20690836012363434, "learning_rate": 2.102036372134526e-05, "loss": 0.2429, "step": 77275 }, { "epoch": 1.1085297998049677, "grad_norm": 18.54674530029297, "learning_rate": 2.1016380207780068e-05, "loss": 0.132, "step": 77300 }, { "epoch": 1.1088883152641542, "grad_norm": 0.2995668351650238, "learning_rate": 2.101239669421488e-05, "loss": 0.1455, "step": 77325 }, { "epoch": 1.1092468307233407, "grad_norm": 2.0381836891174316, "learning_rate": 2.1008413180649687e-05, "loss": 0.1002, "step": 77350 }, { "epoch": 1.1096053461825275, "grad_norm": 1.041619062423706, "learning_rate": 2.1004429667084495e-05, "loss": 0.1652, "step": 77375 }, { "epoch": 1.109963861641714, "grad_norm": 28.8507137298584, "learning_rate": 2.1000446153519303e-05, "loss": 0.2208, "step": 77400 }, { "epoch": 1.1103223771009005, "grad_norm": 21.39751434326172, "learning_rate": 2.099646263995411e-05, "loss": 0.1785, "step": 77425 }, { "epoch": 1.1106808925600873, "grad_norm": 20.916622161865234, "learning_rate": 2.0992479126388922e-05, "loss": 0.1296, "step": 77450 }, { "epoch": 1.1110394080192738, "grad_norm": 4.0327067375183105, "learning_rate": 2.098849561282373e-05, "loss": 0.2021, "step": 77475 }, { "epoch": 1.1113979234784603, "grad_norm": 2.8376455307006836, "learning_rate": 2.0984512099258534e-05, "loss": 0.1003, "step": 77500 }, { "epoch": 1.111756438937647, "grad_norm": 2.845545530319214, "learning_rate": 2.0980528585693342e-05, "loss": 0.1474, "step": 77525 }, { "epoch": 1.1121149543968336, "grad_norm": 3.9217069149017334, "learning_rate": 2.097654507212815e-05, "loss": 0.1317, "step": 77550 }, { "epoch": 1.1124734698560201, "grad_norm": 0.7918757796287537, "learning_rate": 2.097256155856296e-05, "loss": 0.1269, "step": 77575 }, { "epoch": 1.1128319853152069, "grad_norm": 5.556789875030518, "learning_rate": 2.096857804499777e-05, "loss": 0.1337, "step": 77600 }, { "epoch": 1.1131905007743934, "grad_norm": 1.4433180093765259, "learning_rate": 2.0964594531432577e-05, "loss": 0.1631, "step": 77625 }, { "epoch": 1.11354901623358, "grad_norm": 10.270607948303223, "learning_rate": 2.0960611017867385e-05, "loss": 0.1065, "step": 77650 }, { "epoch": 1.1139075316927667, "grad_norm": 5.795435428619385, "learning_rate": 2.0956627504302193e-05, "loss": 0.0973, "step": 77675 }, { "epoch": 1.1142660471519532, "grad_norm": 10.775822639465332, "learning_rate": 2.0952643990737004e-05, "loss": 0.214, "step": 77700 }, { "epoch": 1.1146245626111397, "grad_norm": 0.06467749178409576, "learning_rate": 2.094866047717181e-05, "loss": 0.1296, "step": 77725 }, { "epoch": 1.1149830780703265, "grad_norm": 0.5268807411193848, "learning_rate": 2.094467696360662e-05, "loss": 0.1489, "step": 77750 }, { "epoch": 1.115341593529513, "grad_norm": 12.95591926574707, "learning_rate": 2.0940693450041427e-05, "loss": 0.1338, "step": 77775 }, { "epoch": 1.1157001089886995, "grad_norm": 7.465995788574219, "learning_rate": 2.0936709936476235e-05, "loss": 0.1598, "step": 77800 }, { "epoch": 1.1160586244478863, "grad_norm": 6.058501720428467, "learning_rate": 2.0932726422911046e-05, "loss": 0.0997, "step": 77825 }, { "epoch": 1.1164171399070728, "grad_norm": 0.45406949520111084, "learning_rate": 2.0928742909345854e-05, "loss": 0.0772, "step": 77850 }, { "epoch": 1.1167756553662593, "grad_norm": 0.7527957558631897, "learning_rate": 2.0924759395780662e-05, "loss": 0.0892, "step": 77875 }, { "epoch": 1.117134170825446, "grad_norm": 8.04058837890625, "learning_rate": 2.092077588221547e-05, "loss": 0.2067, "step": 77900 }, { "epoch": 1.1174926862846326, "grad_norm": 6.764615058898926, "learning_rate": 2.091679236865028e-05, "loss": 0.1085, "step": 77925 }, { "epoch": 1.1178512017438191, "grad_norm": 0.12287113070487976, "learning_rate": 2.091280885508509e-05, "loss": 0.1425, "step": 77950 }, { "epoch": 1.1182097172030059, "grad_norm": 23.172630310058594, "learning_rate": 2.0908825341519897e-05, "loss": 0.2085, "step": 77975 }, { "epoch": 1.1185682326621924, "grad_norm": 6.718591213226318, "learning_rate": 2.0904841827954705e-05, "loss": 0.1349, "step": 78000 }, { "epoch": 1.118926748121379, "grad_norm": 0.44099605083465576, "learning_rate": 2.0900858314389513e-05, "loss": 0.128, "step": 78025 }, { "epoch": 1.1192852635805657, "grad_norm": 1.5417855978012085, "learning_rate": 2.0896874800824324e-05, "loss": 0.2062, "step": 78050 }, { "epoch": 1.1196437790397522, "grad_norm": 8.65954875946045, "learning_rate": 2.089289128725913e-05, "loss": 0.168, "step": 78075 }, { "epoch": 1.1200022944989387, "grad_norm": 7.549484729766846, "learning_rate": 2.088890777369394e-05, "loss": 0.1068, "step": 78100 }, { "epoch": 1.1203608099581255, "grad_norm": 3.456169605255127, "learning_rate": 2.0884924260128747e-05, "loss": 0.1356, "step": 78125 }, { "epoch": 1.120719325417312, "grad_norm": 0.9939962029457092, "learning_rate": 2.0880940746563555e-05, "loss": 0.113, "step": 78150 }, { "epoch": 1.1210778408764985, "grad_norm": 6.985161781311035, "learning_rate": 2.0876957232998366e-05, "loss": 0.1485, "step": 78175 }, { "epoch": 1.1214363563356853, "grad_norm": 3.831631660461426, "learning_rate": 2.0872973719433174e-05, "loss": 0.2106, "step": 78200 }, { "epoch": 1.1217948717948718, "grad_norm": 2.204359531402588, "learning_rate": 2.0868990205867982e-05, "loss": 0.1668, "step": 78225 }, { "epoch": 1.1221533872540583, "grad_norm": 1.780578374862671, "learning_rate": 2.086500669230279e-05, "loss": 0.1184, "step": 78250 }, { "epoch": 1.122511902713245, "grad_norm": 5.9778337478637695, "learning_rate": 2.0861023178737598e-05, "loss": 0.1297, "step": 78275 }, { "epoch": 1.1228704181724316, "grad_norm": 3.71337890625, "learning_rate": 2.085703966517241e-05, "loss": 0.145, "step": 78300 }, { "epoch": 1.1232289336316181, "grad_norm": 4.124797821044922, "learning_rate": 2.0853056151607217e-05, "loss": 0.2211, "step": 78325 }, { "epoch": 1.1235874490908049, "grad_norm": 1.419707179069519, "learning_rate": 2.0849072638042025e-05, "loss": 0.167, "step": 78350 }, { "epoch": 1.1239459645499914, "grad_norm": 0.6425291895866394, "learning_rate": 2.0845089124476833e-05, "loss": 0.1852, "step": 78375 }, { "epoch": 1.124304480009178, "grad_norm": 0.8563973307609558, "learning_rate": 2.084110561091164e-05, "loss": 0.1629, "step": 78400 }, { "epoch": 1.1246629954683647, "grad_norm": 14.213590621948242, "learning_rate": 2.083712209734645e-05, "loss": 0.1297, "step": 78425 }, { "epoch": 1.1250215109275512, "grad_norm": 16.86236000061035, "learning_rate": 2.083313858378126e-05, "loss": 0.1319, "step": 78450 }, { "epoch": 1.1253800263867377, "grad_norm": 10.437902450561523, "learning_rate": 2.0829155070216067e-05, "loss": 0.18, "step": 78475 }, { "epoch": 1.1257385418459245, "grad_norm": 0.48243147134780884, "learning_rate": 2.0825171556650875e-05, "loss": 0.158, "step": 78500 }, { "epoch": 1.126097057305111, "grad_norm": 18.2255916595459, "learning_rate": 2.0821188043085686e-05, "loss": 0.1129, "step": 78525 }, { "epoch": 1.1264555727642975, "grad_norm": 1.0469391345977783, "learning_rate": 2.0817204529520494e-05, "loss": 0.0867, "step": 78550 }, { "epoch": 1.1268140882234843, "grad_norm": 2.2917284965515137, "learning_rate": 2.0813221015955302e-05, "loss": 0.0896, "step": 78575 }, { "epoch": 1.1271726036826708, "grad_norm": 10.874360084533691, "learning_rate": 2.0809237502390107e-05, "loss": 0.2375, "step": 78600 }, { "epoch": 1.1275311191418573, "grad_norm": 17.324621200561523, "learning_rate": 2.0805253988824914e-05, "loss": 0.1296, "step": 78625 }, { "epoch": 1.127889634601044, "grad_norm": 1.6478954553604126, "learning_rate": 2.0801270475259726e-05, "loss": 0.201, "step": 78650 }, { "epoch": 1.1282481500602306, "grad_norm": 4.488115310668945, "learning_rate": 2.0797286961694534e-05, "loss": 0.2208, "step": 78675 }, { "epoch": 1.1286066655194171, "grad_norm": 12.110437393188477, "learning_rate": 2.079330344812934e-05, "loss": 0.1181, "step": 78700 }, { "epoch": 1.1289651809786039, "grad_norm": 0.19546887278556824, "learning_rate": 2.078931993456415e-05, "loss": 0.15, "step": 78725 }, { "epoch": 1.1293236964377904, "grad_norm": 0.03647156432271004, "learning_rate": 2.0785336420998957e-05, "loss": 0.1252, "step": 78750 }, { "epoch": 1.129682211896977, "grad_norm": 4.403375148773193, "learning_rate": 2.0781352907433768e-05, "loss": 0.1798, "step": 78775 }, { "epoch": 1.1300407273561637, "grad_norm": 11.320133209228516, "learning_rate": 2.0777369393868576e-05, "loss": 0.1856, "step": 78800 }, { "epoch": 1.1303992428153502, "grad_norm": 2.5928244590759277, "learning_rate": 2.0773385880303384e-05, "loss": 0.0814, "step": 78825 }, { "epoch": 1.1307577582745367, "grad_norm": 11.211289405822754, "learning_rate": 2.0769402366738192e-05, "loss": 0.077, "step": 78850 }, { "epoch": 1.1311162737337235, "grad_norm": 1.6790157556533813, "learning_rate": 2.0765418853173e-05, "loss": 0.1659, "step": 78875 }, { "epoch": 1.13147478919291, "grad_norm": 12.518549919128418, "learning_rate": 2.076143533960781e-05, "loss": 0.1563, "step": 78900 }, { "epoch": 1.1318333046520965, "grad_norm": 7.087803840637207, "learning_rate": 2.075745182604262e-05, "loss": 0.115, "step": 78925 }, { "epoch": 1.1321918201112833, "grad_norm": 16.008113861083984, "learning_rate": 2.0753468312477427e-05, "loss": 0.1845, "step": 78950 }, { "epoch": 1.1325503355704698, "grad_norm": 11.798137664794922, "learning_rate": 2.0749484798912234e-05, "loss": 0.1862, "step": 78975 }, { "epoch": 1.1329088510296563, "grad_norm": 2.5760130882263184, "learning_rate": 2.0745501285347042e-05, "loss": 0.2479, "step": 79000 }, { "epoch": 1.133267366488843, "grad_norm": 3.1509411334991455, "learning_rate": 2.0741517771781853e-05, "loss": 0.172, "step": 79025 }, { "epoch": 1.1336258819480296, "grad_norm": 5.0826520919799805, "learning_rate": 2.073753425821666e-05, "loss": 0.1376, "step": 79050 }, { "epoch": 1.1339843974072161, "grad_norm": 12.577377319335938, "learning_rate": 2.073355074465147e-05, "loss": 0.1397, "step": 79075 }, { "epoch": 1.1343429128664029, "grad_norm": 0.3683280348777771, "learning_rate": 2.0729567231086277e-05, "loss": 0.0983, "step": 79100 }, { "epoch": 1.1347014283255894, "grad_norm": 5.539823055267334, "learning_rate": 2.0725583717521085e-05, "loss": 0.1828, "step": 79125 }, { "epoch": 1.135059943784776, "grad_norm": 4.946642875671387, "learning_rate": 2.0721600203955896e-05, "loss": 0.1918, "step": 79150 }, { "epoch": 1.1354184592439627, "grad_norm": 7.98561429977417, "learning_rate": 2.0717616690390704e-05, "loss": 0.152, "step": 79175 }, { "epoch": 1.1357769747031492, "grad_norm": 15.52524471282959, "learning_rate": 2.0713633176825512e-05, "loss": 0.2119, "step": 79200 }, { "epoch": 1.1361354901623357, "grad_norm": 20.405797958374023, "learning_rate": 2.070964966326032e-05, "loss": 0.1079, "step": 79225 }, { "epoch": 1.1364940056215225, "grad_norm": 3.963151216506958, "learning_rate": 2.070566614969513e-05, "loss": 0.2222, "step": 79250 }, { "epoch": 1.136852521080709, "grad_norm": 17.176294326782227, "learning_rate": 2.070168263612994e-05, "loss": 0.2108, "step": 79275 }, { "epoch": 1.1372110365398955, "grad_norm": 3.476439952850342, "learning_rate": 2.0697699122564747e-05, "loss": 0.199, "step": 79300 }, { "epoch": 1.1375695519990823, "grad_norm": 2.0070865154266357, "learning_rate": 2.0693715608999554e-05, "loss": 0.1509, "step": 79325 }, { "epoch": 1.1379280674582688, "grad_norm": 13.253188133239746, "learning_rate": 2.0689732095434362e-05, "loss": 0.1628, "step": 79350 }, { "epoch": 1.1382865829174553, "grad_norm": 1.4853109121322632, "learning_rate": 2.0685748581869173e-05, "loss": 0.2309, "step": 79375 }, { "epoch": 1.138645098376642, "grad_norm": 6.34264612197876, "learning_rate": 2.068176506830398e-05, "loss": 0.1002, "step": 79400 }, { "epoch": 1.1390036138358286, "grad_norm": 8.306294441223145, "learning_rate": 2.067778155473879e-05, "loss": 0.1207, "step": 79425 }, { "epoch": 1.1393621292950151, "grad_norm": 1.1837748289108276, "learning_rate": 2.0673798041173597e-05, "loss": 0.1592, "step": 79450 }, { "epoch": 1.1397206447542019, "grad_norm": 9.343287467956543, "learning_rate": 2.0669814527608405e-05, "loss": 0.1256, "step": 79475 }, { "epoch": 1.1400791602133884, "grad_norm": 0.5831305384635925, "learning_rate": 2.0665831014043216e-05, "loss": 0.1434, "step": 79500 }, { "epoch": 1.140437675672575, "grad_norm": 15.74967098236084, "learning_rate": 2.0661847500478024e-05, "loss": 0.201, "step": 79525 }, { "epoch": 1.1407961911317617, "grad_norm": 0.901038646697998, "learning_rate": 2.0657863986912832e-05, "loss": 0.0757, "step": 79550 }, { "epoch": 1.1411547065909482, "grad_norm": 8.84376335144043, "learning_rate": 2.065388047334764e-05, "loss": 0.1767, "step": 79575 }, { "epoch": 1.1415132220501347, "grad_norm": 4.999418258666992, "learning_rate": 2.0649896959782447e-05, "loss": 0.09, "step": 79600 }, { "epoch": 1.1418717375093215, "grad_norm": 10.921326637268066, "learning_rate": 2.064591344621726e-05, "loss": 0.1838, "step": 79625 }, { "epoch": 1.142230252968508, "grad_norm": 1.0569679737091064, "learning_rate": 2.0641929932652067e-05, "loss": 0.1603, "step": 79650 }, { "epoch": 1.1425887684276945, "grad_norm": 0.05560509115457535, "learning_rate": 2.0637946419086874e-05, "loss": 0.0926, "step": 79675 }, { "epoch": 1.1429472838868813, "grad_norm": 5.006752967834473, "learning_rate": 2.063396290552168e-05, "loss": 0.1401, "step": 79700 }, { "epoch": 1.1433057993460678, "grad_norm": 3.246061325073242, "learning_rate": 2.0629979391956487e-05, "loss": 0.1187, "step": 79725 }, { "epoch": 1.1436643148052543, "grad_norm": 1.7787582874298096, "learning_rate": 2.0625995878391298e-05, "loss": 0.1392, "step": 79750 }, { "epoch": 1.144022830264441, "grad_norm": 12.140669822692871, "learning_rate": 2.0622012364826106e-05, "loss": 0.1754, "step": 79775 }, { "epoch": 1.1443813457236276, "grad_norm": 7.984576225280762, "learning_rate": 2.0618028851260914e-05, "loss": 0.1029, "step": 79800 }, { "epoch": 1.1447398611828141, "grad_norm": 0.5331727266311646, "learning_rate": 2.061404533769572e-05, "loss": 0.1908, "step": 79825 }, { "epoch": 1.1450983766420009, "grad_norm": 16.400033950805664, "learning_rate": 2.0610061824130533e-05, "loss": 0.138, "step": 79850 }, { "epoch": 1.1454568921011874, "grad_norm": 0.4022552967071533, "learning_rate": 2.060607831056534e-05, "loss": 0.1055, "step": 79875 }, { "epoch": 1.145815407560374, "grad_norm": 0.0315166711807251, "learning_rate": 2.060209479700015e-05, "loss": 0.1261, "step": 79900 }, { "epoch": 1.1461739230195607, "grad_norm": 3.097867250442505, "learning_rate": 2.0598111283434956e-05, "loss": 0.118, "step": 79925 }, { "epoch": 1.1465324384787472, "grad_norm": 2.541560649871826, "learning_rate": 2.0594127769869764e-05, "loss": 0.1517, "step": 79950 }, { "epoch": 1.1468909539379337, "grad_norm": 0.5378922820091248, "learning_rate": 2.0590144256304575e-05, "loss": 0.1953, "step": 79975 }, { "epoch": 1.1472494693971205, "grad_norm": 0.10792617499828339, "learning_rate": 2.0586160742739383e-05, "loss": 0.0772, "step": 80000 }, { "epoch": 1.147607984856307, "grad_norm": 0.6485068798065186, "learning_rate": 2.058217722917419e-05, "loss": 0.1011, "step": 80025 }, { "epoch": 1.1479665003154935, "grad_norm": 15.77476692199707, "learning_rate": 2.0578193715609e-05, "loss": 0.1694, "step": 80050 }, { "epoch": 1.1483250157746803, "grad_norm": 9.14852237701416, "learning_rate": 2.0574210202043807e-05, "loss": 0.0972, "step": 80075 }, { "epoch": 1.1486835312338668, "grad_norm": 0.3269750475883484, "learning_rate": 2.0570226688478618e-05, "loss": 0.0931, "step": 80100 }, { "epoch": 1.1490420466930533, "grad_norm": 15.062600135803223, "learning_rate": 2.0566243174913426e-05, "loss": 0.1407, "step": 80125 }, { "epoch": 1.14940056215224, "grad_norm": 6.184482574462891, "learning_rate": 2.0562259661348234e-05, "loss": 0.078, "step": 80150 }, { "epoch": 1.1497590776114266, "grad_norm": 0.0814860388636589, "learning_rate": 2.055827614778304e-05, "loss": 0.0971, "step": 80175 }, { "epoch": 1.1501175930706131, "grad_norm": 0.23856352269649506, "learning_rate": 2.055429263421785e-05, "loss": 0.177, "step": 80200 }, { "epoch": 1.1504761085297999, "grad_norm": 21.349327087402344, "learning_rate": 2.055030912065266e-05, "loss": 0.1904, "step": 80225 }, { "epoch": 1.1508346239889864, "grad_norm": 10.171082496643066, "learning_rate": 2.054632560708747e-05, "loss": 0.1076, "step": 80250 }, { "epoch": 1.151193139448173, "grad_norm": 9.286396026611328, "learning_rate": 2.0542342093522276e-05, "loss": 0.0876, "step": 80275 }, { "epoch": 1.1515516549073597, "grad_norm": 16.468120574951172, "learning_rate": 2.0538358579957084e-05, "loss": 0.261, "step": 80300 }, { "epoch": 1.1519101703665462, "grad_norm": 12.017560958862305, "learning_rate": 2.0534375066391892e-05, "loss": 0.1585, "step": 80325 }, { "epoch": 1.1522686858257327, "grad_norm": 7.927954196929932, "learning_rate": 2.0530391552826703e-05, "loss": 0.1428, "step": 80350 }, { "epoch": 1.1526272012849195, "grad_norm": 15.99493408203125, "learning_rate": 2.052640803926151e-05, "loss": 0.1853, "step": 80375 }, { "epoch": 1.152985716744106, "grad_norm": 6.419301986694336, "learning_rate": 2.052242452569632e-05, "loss": 0.1292, "step": 80400 }, { "epoch": 1.1533442322032925, "grad_norm": 17.12464141845703, "learning_rate": 2.0518441012131127e-05, "loss": 0.2024, "step": 80425 }, { "epoch": 1.1537027476624793, "grad_norm": 1.5614557266235352, "learning_rate": 2.0514457498565938e-05, "loss": 0.2439, "step": 80450 }, { "epoch": 1.1540612631216658, "grad_norm": 7.238077163696289, "learning_rate": 2.0510473985000746e-05, "loss": 0.2328, "step": 80475 }, { "epoch": 1.1544197785808523, "grad_norm": 2.467911720275879, "learning_rate": 2.0506490471435554e-05, "loss": 0.1065, "step": 80500 }, { "epoch": 1.154778294040039, "grad_norm": 2.4370036125183105, "learning_rate": 2.050250695787036e-05, "loss": 0.1396, "step": 80525 }, { "epoch": 1.1551368094992256, "grad_norm": 7.015035629272461, "learning_rate": 2.049852344430517e-05, "loss": 0.1557, "step": 80550 }, { "epoch": 1.1554953249584121, "grad_norm": 0.6008578538894653, "learning_rate": 2.049453993073998e-05, "loss": 0.0815, "step": 80575 }, { "epoch": 1.1558538404175989, "grad_norm": 18.177669525146484, "learning_rate": 2.049055641717479e-05, "loss": 0.1099, "step": 80600 }, { "epoch": 1.1562123558767854, "grad_norm": 20.41349220275879, "learning_rate": 2.0486572903609596e-05, "loss": 0.2046, "step": 80625 }, { "epoch": 1.156570871335972, "grad_norm": 8.5348539352417, "learning_rate": 2.0482589390044404e-05, "loss": 0.1079, "step": 80650 }, { "epoch": 1.1569293867951587, "grad_norm": 0.42630913853645325, "learning_rate": 2.0478605876479212e-05, "loss": 0.1412, "step": 80675 }, { "epoch": 1.1572879022543452, "grad_norm": 16.17692756652832, "learning_rate": 2.0474622362914023e-05, "loss": 0.1959, "step": 80700 }, { "epoch": 1.1576464177135317, "grad_norm": 0.17651136219501495, "learning_rate": 2.047063884934883e-05, "loss": 0.1304, "step": 80725 }, { "epoch": 1.1580049331727185, "grad_norm": 24.96202278137207, "learning_rate": 2.046665533578364e-05, "loss": 0.2662, "step": 80750 }, { "epoch": 1.158363448631905, "grad_norm": 0.8118601441383362, "learning_rate": 2.0462671822218447e-05, "loss": 0.1451, "step": 80775 }, { "epoch": 1.1587219640910915, "grad_norm": 0.5507341623306274, "learning_rate": 2.045868830865325e-05, "loss": 0.1218, "step": 80800 }, { "epoch": 1.1590804795502783, "grad_norm": 9.13808822631836, "learning_rate": 2.0454704795088062e-05, "loss": 0.138, "step": 80825 }, { "epoch": 1.1594389950094648, "grad_norm": 7.17974853515625, "learning_rate": 2.045072128152287e-05, "loss": 0.2181, "step": 80850 }, { "epoch": 1.1597975104686513, "grad_norm": 1.360286831855774, "learning_rate": 2.0446737767957678e-05, "loss": 0.1091, "step": 80875 }, { "epoch": 1.160156025927838, "grad_norm": 4.59215784072876, "learning_rate": 2.0442754254392486e-05, "loss": 0.1526, "step": 80900 }, { "epoch": 1.1605145413870246, "grad_norm": 6.417291164398193, "learning_rate": 2.0438770740827294e-05, "loss": 0.1303, "step": 80925 }, { "epoch": 1.1608730568462111, "grad_norm": 19.33081817626953, "learning_rate": 2.0434787227262105e-05, "loss": 0.1917, "step": 80950 }, { "epoch": 1.1612315723053979, "grad_norm": 6.869673252105713, "learning_rate": 2.0430803713696913e-05, "loss": 0.1523, "step": 80975 }, { "epoch": 1.1615900877645844, "grad_norm": 17.451946258544922, "learning_rate": 2.042682020013172e-05, "loss": 0.175, "step": 81000 }, { "epoch": 1.161948603223771, "grad_norm": 5.4559431076049805, "learning_rate": 2.042283668656653e-05, "loss": 0.1242, "step": 81025 }, { "epoch": 1.1623071186829577, "grad_norm": 0.39427173137664795, "learning_rate": 2.041885317300134e-05, "loss": 0.177, "step": 81050 }, { "epoch": 1.1626656341421442, "grad_norm": 16.6716365814209, "learning_rate": 2.0414869659436148e-05, "loss": 0.1515, "step": 81075 }, { "epoch": 1.1630241496013307, "grad_norm": 2.569279432296753, "learning_rate": 2.0410886145870955e-05, "loss": 0.1543, "step": 81100 }, { "epoch": 1.1633826650605175, "grad_norm": 6.1312947273254395, "learning_rate": 2.0406902632305763e-05, "loss": 0.1285, "step": 81125 }, { "epoch": 1.163741180519704, "grad_norm": 2.665621042251587, "learning_rate": 2.040291911874057e-05, "loss": 0.1134, "step": 81150 }, { "epoch": 1.1640996959788905, "grad_norm": 1.944365382194519, "learning_rate": 2.0398935605175382e-05, "loss": 0.2947, "step": 81175 }, { "epoch": 1.1644582114380773, "grad_norm": 5.559756278991699, "learning_rate": 2.039495209161019e-05, "loss": 0.1435, "step": 81200 }, { "epoch": 1.1648167268972638, "grad_norm": 0.7320159673690796, "learning_rate": 2.0390968578044998e-05, "loss": 0.1855, "step": 81225 }, { "epoch": 1.1651752423564503, "grad_norm": 4.763611793518066, "learning_rate": 2.0386985064479806e-05, "loss": 0.1111, "step": 81250 }, { "epoch": 1.165533757815637, "grad_norm": 1.8069405555725098, "learning_rate": 2.0383001550914614e-05, "loss": 0.1567, "step": 81275 }, { "epoch": 1.1658922732748236, "grad_norm": 4.220132350921631, "learning_rate": 2.0379018037349425e-05, "loss": 0.1349, "step": 81300 }, { "epoch": 1.1662507887340103, "grad_norm": 3.1351540088653564, "learning_rate": 2.0375034523784233e-05, "loss": 0.2883, "step": 81325 }, { "epoch": 1.1666093041931969, "grad_norm": 22.484539031982422, "learning_rate": 2.037105101021904e-05, "loss": 0.1345, "step": 81350 }, { "epoch": 1.1669678196523834, "grad_norm": 3.053530693054199, "learning_rate": 2.036706749665385e-05, "loss": 0.1658, "step": 81375 }, { "epoch": 1.1673263351115701, "grad_norm": 16.32683563232422, "learning_rate": 2.0363083983088656e-05, "loss": 0.2019, "step": 81400 }, { "epoch": 1.1676848505707567, "grad_norm": 10.41084098815918, "learning_rate": 2.0359100469523468e-05, "loss": 0.1203, "step": 81425 }, { "epoch": 1.1680433660299432, "grad_norm": 4.274566650390625, "learning_rate": 2.0355116955958275e-05, "loss": 0.0887, "step": 81450 }, { "epoch": 1.16840188148913, "grad_norm": 13.844040870666504, "learning_rate": 2.0351133442393083e-05, "loss": 0.2403, "step": 81475 }, { "epoch": 1.1687603969483165, "grad_norm": 8.644352912902832, "learning_rate": 2.034714992882789e-05, "loss": 0.2364, "step": 81500 }, { "epoch": 1.169118912407503, "grad_norm": 3.196749448776245, "learning_rate": 2.03431664152627e-05, "loss": 0.1352, "step": 81525 }, { "epoch": 1.1694774278666897, "grad_norm": 5.304472923278809, "learning_rate": 2.033918290169751e-05, "loss": 0.0971, "step": 81550 }, { "epoch": 1.1698359433258763, "grad_norm": 4.1170759201049805, "learning_rate": 2.0335199388132318e-05, "loss": 0.1718, "step": 81575 }, { "epoch": 1.1701944587850628, "grad_norm": 0.2570006251335144, "learning_rate": 2.0331215874567126e-05, "loss": 0.1489, "step": 81600 }, { "epoch": 1.1705529742442495, "grad_norm": 1.4089144468307495, "learning_rate": 2.0327232361001934e-05, "loss": 0.1575, "step": 81625 }, { "epoch": 1.170911489703436, "grad_norm": 15.671385765075684, "learning_rate": 2.0323248847436745e-05, "loss": 0.1446, "step": 81650 }, { "epoch": 1.1712700051626226, "grad_norm": 11.789848327636719, "learning_rate": 2.0319265333871553e-05, "loss": 0.1901, "step": 81675 }, { "epoch": 1.1716285206218093, "grad_norm": 2.49135160446167, "learning_rate": 2.031528182030636e-05, "loss": 0.1158, "step": 81700 }, { "epoch": 1.1719870360809959, "grad_norm": 14.06103515625, "learning_rate": 2.031129830674117e-05, "loss": 0.1749, "step": 81725 }, { "epoch": 1.1723455515401824, "grad_norm": 2.5170676708221436, "learning_rate": 2.0307314793175976e-05, "loss": 0.2231, "step": 81750 }, { "epoch": 1.1727040669993691, "grad_norm": 1.5817174911499023, "learning_rate": 2.0303331279610788e-05, "loss": 0.1903, "step": 81775 }, { "epoch": 1.1730625824585557, "grad_norm": 0.5134584903717041, "learning_rate": 2.0299347766045595e-05, "loss": 0.1138, "step": 81800 }, { "epoch": 1.1734210979177422, "grad_norm": 5.465732574462891, "learning_rate": 2.0295364252480403e-05, "loss": 0.1204, "step": 81825 }, { "epoch": 1.173779613376929, "grad_norm": 5.418457508087158, "learning_rate": 2.029138073891521e-05, "loss": 0.0981, "step": 81850 }, { "epoch": 1.1741381288361155, "grad_norm": 3.899329900741577, "learning_rate": 2.028739722535002e-05, "loss": 0.0552, "step": 81875 }, { "epoch": 1.174496644295302, "grad_norm": 0.5714189410209656, "learning_rate": 2.0283413711784827e-05, "loss": 0.1297, "step": 81900 }, { "epoch": 1.1748551597544887, "grad_norm": 11.731046676635742, "learning_rate": 2.0279430198219635e-05, "loss": 0.1509, "step": 81925 }, { "epoch": 1.1752136752136753, "grad_norm": 7.722223281860352, "learning_rate": 2.0275446684654442e-05, "loss": 0.2382, "step": 81950 }, { "epoch": 1.1755721906728618, "grad_norm": 2.231215238571167, "learning_rate": 2.027146317108925e-05, "loss": 0.0993, "step": 81975 }, { "epoch": 1.1759307061320485, "grad_norm": 7.24084997177124, "learning_rate": 2.0267479657524058e-05, "loss": 0.1286, "step": 82000 }, { "epoch": 1.176289221591235, "grad_norm": 16.46404266357422, "learning_rate": 2.026349614395887e-05, "loss": 0.1389, "step": 82025 }, { "epoch": 1.1766477370504216, "grad_norm": 0.44098514318466187, "learning_rate": 2.0259512630393677e-05, "loss": 0.1192, "step": 82050 }, { "epoch": 1.1770062525096083, "grad_norm": 2.183759927749634, "learning_rate": 2.0255529116828485e-05, "loss": 0.0834, "step": 82075 }, { "epoch": 1.1773647679687949, "grad_norm": 1.1168997287750244, "learning_rate": 2.0251545603263293e-05, "loss": 0.1125, "step": 82100 }, { "epoch": 1.1777232834279814, "grad_norm": 1.5341860055923462, "learning_rate": 2.02475620896981e-05, "loss": 0.1313, "step": 82125 }, { "epoch": 1.1780817988871681, "grad_norm": 0.19991178810596466, "learning_rate": 2.0243578576132912e-05, "loss": 0.1866, "step": 82150 }, { "epoch": 1.1784403143463547, "grad_norm": 0.6398065090179443, "learning_rate": 2.023959506256772e-05, "loss": 0.1098, "step": 82175 }, { "epoch": 1.1787988298055412, "grad_norm": 0.24305933713912964, "learning_rate": 2.0235611549002528e-05, "loss": 0.1205, "step": 82200 }, { "epoch": 1.179157345264728, "grad_norm": 0.6114645600318909, "learning_rate": 2.0231628035437336e-05, "loss": 0.0955, "step": 82225 }, { "epoch": 1.1795158607239145, "grad_norm": 13.192978858947754, "learning_rate": 2.0227644521872147e-05, "loss": 0.0572, "step": 82250 }, { "epoch": 1.179874376183101, "grad_norm": 10.119816780090332, "learning_rate": 2.0223661008306955e-05, "loss": 0.119, "step": 82275 }, { "epoch": 1.1802328916422877, "grad_norm": 12.164138793945312, "learning_rate": 2.0219677494741762e-05, "loss": 0.1268, "step": 82300 }, { "epoch": 1.1805914071014743, "grad_norm": 3.870020627975464, "learning_rate": 2.021569398117657e-05, "loss": 0.1224, "step": 82325 }, { "epoch": 1.1809499225606608, "grad_norm": 1.2118505239486694, "learning_rate": 2.0211710467611378e-05, "loss": 0.1943, "step": 82350 }, { "epoch": 1.1813084380198475, "grad_norm": 0.6110102534294128, "learning_rate": 2.020772695404619e-05, "loss": 0.1789, "step": 82375 }, { "epoch": 1.181666953479034, "grad_norm": 12.708374977111816, "learning_rate": 2.0203743440480997e-05, "loss": 0.0963, "step": 82400 }, { "epoch": 1.1820254689382206, "grad_norm": 20.61904525756836, "learning_rate": 2.0199759926915805e-05, "loss": 0.1422, "step": 82425 }, { "epoch": 1.1823839843974073, "grad_norm": 2.1526949405670166, "learning_rate": 2.0195776413350613e-05, "loss": 0.1055, "step": 82450 }, { "epoch": 1.1827424998565939, "grad_norm": 9.503007888793945, "learning_rate": 2.019179289978542e-05, "loss": 0.0852, "step": 82475 }, { "epoch": 1.1831010153157804, "grad_norm": 1.3960720300674438, "learning_rate": 2.0187809386220232e-05, "loss": 0.1864, "step": 82500 }, { "epoch": 1.1834595307749671, "grad_norm": 2.652193069458008, "learning_rate": 2.018382587265504e-05, "loss": 0.1816, "step": 82525 }, { "epoch": 1.1838180462341537, "grad_norm": 25.085681915283203, "learning_rate": 2.0179842359089848e-05, "loss": 0.1567, "step": 82550 }, { "epoch": 1.1841765616933402, "grad_norm": 1.9097594022750854, "learning_rate": 2.0175858845524656e-05, "loss": 0.1261, "step": 82575 }, { "epoch": 1.184535077152527, "grad_norm": 14.364704132080078, "learning_rate": 2.0171875331959463e-05, "loss": 0.1942, "step": 82600 }, { "epoch": 1.1848935926117135, "grad_norm": 0.9350797533988953, "learning_rate": 2.0167891818394275e-05, "loss": 0.2043, "step": 82625 }, { "epoch": 1.1852521080709, "grad_norm": 1.3304722309112549, "learning_rate": 2.0163908304829082e-05, "loss": 0.2152, "step": 82650 }, { "epoch": 1.1856106235300867, "grad_norm": 0.5296156406402588, "learning_rate": 2.015992479126389e-05, "loss": 0.1141, "step": 82675 }, { "epoch": 1.1859691389892733, "grad_norm": 0.5241543650627136, "learning_rate": 2.0155941277698698e-05, "loss": 0.1472, "step": 82700 }, { "epoch": 1.1863276544484598, "grad_norm": 8.068705558776855, "learning_rate": 2.0151957764133506e-05, "loss": 0.1018, "step": 82725 }, { "epoch": 1.1866861699076465, "grad_norm": 17.53134536743164, "learning_rate": 2.0147974250568317e-05, "loss": 0.1338, "step": 82750 }, { "epoch": 1.187044685366833, "grad_norm": 20.974750518798828, "learning_rate": 2.0143990737003125e-05, "loss": 0.2069, "step": 82775 }, { "epoch": 1.1874032008260196, "grad_norm": 1.4272422790527344, "learning_rate": 2.0140007223437933e-05, "loss": 0.1219, "step": 82800 }, { "epoch": 1.1877617162852063, "grad_norm": 6.911182880401611, "learning_rate": 2.013602370987274e-05, "loss": 0.1075, "step": 82825 }, { "epoch": 1.1881202317443929, "grad_norm": 5.263704776763916, "learning_rate": 2.0132040196307552e-05, "loss": 0.2162, "step": 82850 }, { "epoch": 1.1884787472035794, "grad_norm": 14.89355754852295, "learning_rate": 2.012805668274236e-05, "loss": 0.0778, "step": 82875 }, { "epoch": 1.1888372626627661, "grad_norm": 3.833099842071533, "learning_rate": 2.0124073169177168e-05, "loss": 0.1212, "step": 82900 }, { "epoch": 1.1891957781219527, "grad_norm": 2.3652398586273193, "learning_rate": 2.0120089655611976e-05, "loss": 0.1317, "step": 82925 }, { "epoch": 1.1895542935811392, "grad_norm": 15.179102897644043, "learning_rate": 2.0116106142046783e-05, "loss": 0.1605, "step": 82950 }, { "epoch": 1.189912809040326, "grad_norm": 4.0107316970825195, "learning_rate": 2.011212262848159e-05, "loss": 0.1058, "step": 82975 }, { "epoch": 1.1902713244995125, "grad_norm": 1.1529600620269775, "learning_rate": 2.01081391149164e-05, "loss": 0.1465, "step": 83000 }, { "epoch": 1.190629839958699, "grad_norm": 11.704115867614746, "learning_rate": 2.0104155601351207e-05, "loss": 0.098, "step": 83025 }, { "epoch": 1.1909883554178857, "grad_norm": 7.813187599182129, "learning_rate": 2.0100172087786015e-05, "loss": 0.2032, "step": 83050 }, { "epoch": 1.1913468708770723, "grad_norm": 0.2448362410068512, "learning_rate": 2.0096188574220823e-05, "loss": 0.1044, "step": 83075 }, { "epoch": 1.1917053863362588, "grad_norm": 0.08353569358587265, "learning_rate": 2.0092205060655634e-05, "loss": 0.1301, "step": 83100 }, { "epoch": 1.1920639017954455, "grad_norm": 0.22497877478599548, "learning_rate": 2.008822154709044e-05, "loss": 0.1023, "step": 83125 }, { "epoch": 1.192422417254632, "grad_norm": 7.271242618560791, "learning_rate": 2.008423803352525e-05, "loss": 0.092, "step": 83150 }, { "epoch": 1.1927809327138186, "grad_norm": 17.870689392089844, "learning_rate": 2.0080254519960057e-05, "loss": 0.1114, "step": 83175 }, { "epoch": 1.1931394481730053, "grad_norm": 3.887390375137329, "learning_rate": 2.0076271006394865e-05, "loss": 0.0881, "step": 83200 }, { "epoch": 1.1934979636321918, "grad_norm": 7.32380485534668, "learning_rate": 2.0072287492829676e-05, "loss": 0.1716, "step": 83225 }, { "epoch": 1.1938564790913784, "grad_norm": 0.39013010263442993, "learning_rate": 2.0068303979264484e-05, "loss": 0.1727, "step": 83250 }, { "epoch": 1.1942149945505651, "grad_norm": 1.1051586866378784, "learning_rate": 2.0064320465699292e-05, "loss": 0.1947, "step": 83275 }, { "epoch": 1.1945735100097516, "grad_norm": 5.814789772033691, "learning_rate": 2.00603369521341e-05, "loss": 0.1098, "step": 83300 }, { "epoch": 1.1949320254689382, "grad_norm": 11.438288688659668, "learning_rate": 2.0056353438568908e-05, "loss": 0.187, "step": 83325 }, { "epoch": 1.195290540928125, "grad_norm": 3.6783695220947266, "learning_rate": 2.005236992500372e-05, "loss": 0.1671, "step": 83350 }, { "epoch": 1.1956490563873114, "grad_norm": 1.03128182888031, "learning_rate": 2.0048386411438527e-05, "loss": 0.1565, "step": 83375 }, { "epoch": 1.196007571846498, "grad_norm": 15.527702331542969, "learning_rate": 2.0044402897873335e-05, "loss": 0.1312, "step": 83400 }, { "epoch": 1.1963660873056847, "grad_norm": 0.13567541539669037, "learning_rate": 2.0040419384308143e-05, "loss": 0.0915, "step": 83425 }, { "epoch": 1.1967246027648712, "grad_norm": 0.08529099822044373, "learning_rate": 2.003643587074295e-05, "loss": 0.1459, "step": 83450 }, { "epoch": 1.1970831182240578, "grad_norm": 10.387395858764648, "learning_rate": 2.003245235717776e-05, "loss": 0.1052, "step": 83475 }, { "epoch": 1.1974416336832445, "grad_norm": 5.762390613555908, "learning_rate": 2.002846884361257e-05, "loss": 0.145, "step": 83500 }, { "epoch": 1.197800149142431, "grad_norm": 2.0393974781036377, "learning_rate": 2.0024485330047377e-05, "loss": 0.1832, "step": 83525 }, { "epoch": 1.1981586646016176, "grad_norm": 1.8632162809371948, "learning_rate": 2.0020501816482185e-05, "loss": 0.1373, "step": 83550 }, { "epoch": 1.1985171800608043, "grad_norm": 4.335537910461426, "learning_rate": 2.0016518302916996e-05, "loss": 0.0723, "step": 83575 }, { "epoch": 1.1988756955199908, "grad_norm": 0.06790325790643692, "learning_rate": 2.0012534789351804e-05, "loss": 0.1784, "step": 83600 }, { "epoch": 1.1992342109791774, "grad_norm": 14.930862426757812, "learning_rate": 2.0008551275786612e-05, "loss": 0.1215, "step": 83625 }, { "epoch": 1.1995927264383641, "grad_norm": 0.6847938895225525, "learning_rate": 2.000456776222142e-05, "loss": 0.2126, "step": 83650 }, { "epoch": 1.1999512418975506, "grad_norm": 10.814953804016113, "learning_rate": 2.0000584248656228e-05, "loss": 0.1714, "step": 83675 }, { "epoch": 1.2003097573567372, "grad_norm": 8.011683464050293, "learning_rate": 1.999660073509104e-05, "loss": 0.0625, "step": 83700 }, { "epoch": 1.200668272815924, "grad_norm": 12.057685852050781, "learning_rate": 1.9992617221525847e-05, "loss": 0.2046, "step": 83725 }, { "epoch": 1.2010267882751104, "grad_norm": 2.624502420425415, "learning_rate": 1.9988633707960655e-05, "loss": 0.0955, "step": 83750 }, { "epoch": 1.201385303734297, "grad_norm": 0.7239574193954468, "learning_rate": 1.9984650194395463e-05, "loss": 0.1954, "step": 83775 }, { "epoch": 1.2017438191934837, "grad_norm": 6.300218105316162, "learning_rate": 1.998066668083027e-05, "loss": 0.1923, "step": 83800 }, { "epoch": 1.2021023346526702, "grad_norm": 8.244440078735352, "learning_rate": 1.997668316726508e-05, "loss": 0.2308, "step": 83825 }, { "epoch": 1.2024608501118568, "grad_norm": 1.5436739921569824, "learning_rate": 1.997269965369989e-05, "loss": 0.1099, "step": 83850 }, { "epoch": 1.2028193655710435, "grad_norm": 2.769827365875244, "learning_rate": 1.9968716140134697e-05, "loss": 0.2161, "step": 83875 }, { "epoch": 1.20317788103023, "grad_norm": 1.3338068723678589, "learning_rate": 1.9964732626569505e-05, "loss": 0.0751, "step": 83900 }, { "epoch": 1.2035363964894166, "grad_norm": 1.2312389612197876, "learning_rate": 1.9960749113004313e-05, "loss": 0.075, "step": 83925 }, { "epoch": 1.2038949119486033, "grad_norm": 0.07822975516319275, "learning_rate": 1.9956765599439124e-05, "loss": 0.0808, "step": 83950 }, { "epoch": 1.2042534274077898, "grad_norm": 0.8123732805252075, "learning_rate": 1.9952782085873932e-05, "loss": 0.1694, "step": 83975 }, { "epoch": 1.2046119428669764, "grad_norm": 10.218799591064453, "learning_rate": 1.994879857230874e-05, "loss": 0.0815, "step": 84000 }, { "epoch": 1.2049704583261631, "grad_norm": 1.4815963506698608, "learning_rate": 1.9944815058743548e-05, "loss": 0.1177, "step": 84025 }, { "epoch": 1.2053289737853496, "grad_norm": 15.01300048828125, "learning_rate": 1.9940831545178356e-05, "loss": 0.1407, "step": 84050 }, { "epoch": 1.2056874892445362, "grad_norm": 1.008509874343872, "learning_rate": 1.9936848031613167e-05, "loss": 0.181, "step": 84075 }, { "epoch": 1.206046004703723, "grad_norm": 0.23628756403923035, "learning_rate": 1.993286451804797e-05, "loss": 0.1525, "step": 84100 }, { "epoch": 1.2064045201629094, "grad_norm": 2.207688808441162, "learning_rate": 1.992888100448278e-05, "loss": 0.0659, "step": 84125 }, { "epoch": 1.206763035622096, "grad_norm": 0.6153626441955566, "learning_rate": 1.9924897490917587e-05, "loss": 0.2254, "step": 84150 }, { "epoch": 1.2071215510812827, "grad_norm": 12.518716812133789, "learning_rate": 1.9920913977352398e-05, "loss": 0.1374, "step": 84175 }, { "epoch": 1.2074800665404692, "grad_norm": 1.5360193252563477, "learning_rate": 1.9916930463787206e-05, "loss": 0.1407, "step": 84200 }, { "epoch": 1.2078385819996558, "grad_norm": 15.796856880187988, "learning_rate": 1.9912946950222014e-05, "loss": 0.1474, "step": 84225 }, { "epoch": 1.2081970974588425, "grad_norm": 23.75484275817871, "learning_rate": 1.9908963436656822e-05, "loss": 0.1182, "step": 84250 }, { "epoch": 1.208555612918029, "grad_norm": 1.2447261810302734, "learning_rate": 1.990497992309163e-05, "loss": 0.1779, "step": 84275 }, { "epoch": 1.2089141283772156, "grad_norm": 9.259206771850586, "learning_rate": 1.990099640952644e-05, "loss": 0.1295, "step": 84300 }, { "epoch": 1.2092726438364023, "grad_norm": 9.493935585021973, "learning_rate": 1.989701289596125e-05, "loss": 0.1664, "step": 84325 }, { "epoch": 1.2096311592955888, "grad_norm": 3.9043054580688477, "learning_rate": 1.9893029382396057e-05, "loss": 0.1606, "step": 84350 }, { "epoch": 1.2099896747547754, "grad_norm": 6.58043909072876, "learning_rate": 1.9889045868830864e-05, "loss": 0.1501, "step": 84375 }, { "epoch": 1.2103481902139621, "grad_norm": 8.009166717529297, "learning_rate": 1.9885062355265672e-05, "loss": 0.2178, "step": 84400 }, { "epoch": 1.2107067056731486, "grad_norm": 11.856939315795898, "learning_rate": 1.9881078841700483e-05, "loss": 0.0875, "step": 84425 }, { "epoch": 1.2110652211323352, "grad_norm": 0.5198144316673279, "learning_rate": 1.987709532813529e-05, "loss": 0.1837, "step": 84450 }, { "epoch": 1.211423736591522, "grad_norm": 13.329258918762207, "learning_rate": 1.98731118145701e-05, "loss": 0.159, "step": 84475 }, { "epoch": 1.2117822520507084, "grad_norm": 1.1745481491088867, "learning_rate": 1.9869128301004907e-05, "loss": 0.1613, "step": 84500 }, { "epoch": 1.212140767509895, "grad_norm": 13.520059585571289, "learning_rate": 1.9865144787439715e-05, "loss": 0.1946, "step": 84525 }, { "epoch": 1.2124992829690817, "grad_norm": 12.531767845153809, "learning_rate": 1.9861161273874526e-05, "loss": 0.1364, "step": 84550 }, { "epoch": 1.2128577984282682, "grad_norm": 9.54564380645752, "learning_rate": 1.9857177760309334e-05, "loss": 0.1267, "step": 84575 }, { "epoch": 1.2132163138874548, "grad_norm": 2.906958818435669, "learning_rate": 1.9853194246744142e-05, "loss": 0.1138, "step": 84600 }, { "epoch": 1.2135748293466415, "grad_norm": 4.977423667907715, "learning_rate": 1.984921073317895e-05, "loss": 0.1047, "step": 84625 }, { "epoch": 1.213933344805828, "grad_norm": 4.704277038574219, "learning_rate": 1.9845227219613757e-05, "loss": 0.2093, "step": 84650 }, { "epoch": 1.2142918602650146, "grad_norm": 0.5113750100135803, "learning_rate": 1.984124370604857e-05, "loss": 0.1619, "step": 84675 }, { "epoch": 1.2146503757242013, "grad_norm": 4.446418762207031, "learning_rate": 1.9837260192483377e-05, "loss": 0.1029, "step": 84700 }, { "epoch": 1.2150088911833878, "grad_norm": 5.580574035644531, "learning_rate": 1.9833276678918184e-05, "loss": 0.0905, "step": 84725 }, { "epoch": 1.2153674066425744, "grad_norm": 1.8353267908096313, "learning_rate": 1.9829293165352992e-05, "loss": 0.0898, "step": 84750 }, { "epoch": 1.2157259221017611, "grad_norm": 15.558917045593262, "learning_rate": 1.9825309651787803e-05, "loss": 0.1499, "step": 84775 }, { "epoch": 1.2160844375609476, "grad_norm": 3.4362289905548096, "learning_rate": 1.982132613822261e-05, "loss": 0.1021, "step": 84800 }, { "epoch": 1.2164429530201342, "grad_norm": 6.756643772125244, "learning_rate": 1.981734262465742e-05, "loss": 0.1754, "step": 84825 }, { "epoch": 1.216801468479321, "grad_norm": 0.5707326531410217, "learning_rate": 1.9813359111092227e-05, "loss": 0.1579, "step": 84850 }, { "epoch": 1.2171599839385074, "grad_norm": 8.04273509979248, "learning_rate": 1.9809375597527035e-05, "loss": 0.1102, "step": 84875 }, { "epoch": 1.217518499397694, "grad_norm": 1.1925711631774902, "learning_rate": 1.9805392083961846e-05, "loss": 0.0972, "step": 84900 }, { "epoch": 1.2178770148568807, "grad_norm": 7.969354152679443, "learning_rate": 1.9801408570396654e-05, "loss": 0.1919, "step": 84925 }, { "epoch": 1.2182355303160672, "grad_norm": 18.037569046020508, "learning_rate": 1.9797425056831462e-05, "loss": 0.1206, "step": 84950 }, { "epoch": 1.2185940457752538, "grad_norm": 1.8084131479263306, "learning_rate": 1.979344154326627e-05, "loss": 0.2129, "step": 84975 }, { "epoch": 1.2189525612344405, "grad_norm": 2.846735715866089, "learning_rate": 1.9789458029701077e-05, "loss": 0.103, "step": 85000 }, { "epoch": 1.219311076693627, "grad_norm": 10.99833869934082, "learning_rate": 1.978547451613589e-05, "loss": 0.1733, "step": 85025 }, { "epoch": 1.2196695921528136, "grad_norm": 2.785933494567871, "learning_rate": 1.9781491002570697e-05, "loss": 0.2243, "step": 85050 }, { "epoch": 1.2200281076120003, "grad_norm": 11.351670265197754, "learning_rate": 1.9777507489005504e-05, "loss": 0.1349, "step": 85075 }, { "epoch": 1.2203866230711868, "grad_norm": 11.02046012878418, "learning_rate": 1.9773523975440312e-05, "loss": 0.1582, "step": 85100 }, { "epoch": 1.2207451385303734, "grad_norm": 11.101923942565918, "learning_rate": 1.976954046187512e-05, "loss": 0.1094, "step": 85125 }, { "epoch": 1.2211036539895601, "grad_norm": 18.15245246887207, "learning_rate": 1.976555694830993e-05, "loss": 0.203, "step": 85150 }, { "epoch": 1.2214621694487466, "grad_norm": 13.24808406829834, "learning_rate": 1.976157343474474e-05, "loss": 0.131, "step": 85175 }, { "epoch": 1.2218206849079332, "grad_norm": 10.453234672546387, "learning_rate": 1.9757589921179544e-05, "loss": 0.1209, "step": 85200 }, { "epoch": 1.22217920036712, "grad_norm": 1.558677315711975, "learning_rate": 1.975360640761435e-05, "loss": 0.1966, "step": 85225 }, { "epoch": 1.2225377158263064, "grad_norm": 12.662324905395508, "learning_rate": 1.974962289404916e-05, "loss": 0.1572, "step": 85250 }, { "epoch": 1.222896231285493, "grad_norm": 0.2729366421699524, "learning_rate": 1.974563938048397e-05, "loss": 0.0794, "step": 85275 }, { "epoch": 1.2232547467446797, "grad_norm": 23.591541290283203, "learning_rate": 1.974165586691878e-05, "loss": 0.2104, "step": 85300 }, { "epoch": 1.2236132622038662, "grad_norm": 5.441860675811768, "learning_rate": 1.9737672353353586e-05, "loss": 0.1424, "step": 85325 }, { "epoch": 1.2239717776630528, "grad_norm": 7.584991931915283, "learning_rate": 1.9733688839788394e-05, "loss": 0.0976, "step": 85350 }, { "epoch": 1.2243302931222395, "grad_norm": 6.136943340301514, "learning_rate": 1.9729705326223205e-05, "loss": 0.1284, "step": 85375 }, { "epoch": 1.224688808581426, "grad_norm": 0.40613892674446106, "learning_rate": 1.9725721812658013e-05, "loss": 0.1661, "step": 85400 }, { "epoch": 1.2250473240406126, "grad_norm": 7.554370880126953, "learning_rate": 1.972173829909282e-05, "loss": 0.1589, "step": 85425 }, { "epoch": 1.2254058394997993, "grad_norm": 5.036656856536865, "learning_rate": 1.971775478552763e-05, "loss": 0.0735, "step": 85450 }, { "epoch": 1.2257643549589858, "grad_norm": 2.351287841796875, "learning_rate": 1.9713771271962437e-05, "loss": 0.1206, "step": 85475 }, { "epoch": 1.2261228704181724, "grad_norm": 2.7536299228668213, "learning_rate": 1.9709787758397248e-05, "loss": 0.1359, "step": 85500 }, { "epoch": 1.226481385877359, "grad_norm": 0.3040899336338043, "learning_rate": 1.9705804244832056e-05, "loss": 0.2309, "step": 85525 }, { "epoch": 1.2268399013365456, "grad_norm": 13.28127384185791, "learning_rate": 1.9701820731266864e-05, "loss": 0.1533, "step": 85550 }, { "epoch": 1.2271984167957322, "grad_norm": 2.5735371112823486, "learning_rate": 1.969783721770167e-05, "loss": 0.0853, "step": 85575 }, { "epoch": 1.227556932254919, "grad_norm": 0.6607207655906677, "learning_rate": 1.969385370413648e-05, "loss": 0.1886, "step": 85600 }, { "epoch": 1.2279154477141054, "grad_norm": 1.4019514322280884, "learning_rate": 1.968987019057129e-05, "loss": 0.1228, "step": 85625 }, { "epoch": 1.228273963173292, "grad_norm": 0.6589192748069763, "learning_rate": 1.96858866770061e-05, "loss": 0.121, "step": 85650 }, { "epoch": 1.2286324786324787, "grad_norm": 0.8452647924423218, "learning_rate": 1.9681903163440906e-05, "loss": 0.1191, "step": 85675 }, { "epoch": 1.2289909940916652, "grad_norm": 5.3826470375061035, "learning_rate": 1.9677919649875714e-05, "loss": 0.186, "step": 85700 }, { "epoch": 1.2293495095508518, "grad_norm": 12.912796020507812, "learning_rate": 1.9673936136310522e-05, "loss": 0.1752, "step": 85725 }, { "epoch": 1.2297080250100385, "grad_norm": 0.5499128699302673, "learning_rate": 1.9669952622745333e-05, "loss": 0.1962, "step": 85750 }, { "epoch": 1.230066540469225, "grad_norm": 4.891895771026611, "learning_rate": 1.966596910918014e-05, "loss": 0.1514, "step": 85775 }, { "epoch": 1.2304250559284116, "grad_norm": 19.901809692382812, "learning_rate": 1.966198559561495e-05, "loss": 0.1916, "step": 85800 }, { "epoch": 1.2307835713875983, "grad_norm": 2.408626079559326, "learning_rate": 1.9658002082049757e-05, "loss": 0.1638, "step": 85825 }, { "epoch": 1.2311420868467848, "grad_norm": 1.3621007204055786, "learning_rate": 1.9654018568484564e-05, "loss": 0.139, "step": 85850 }, { "epoch": 1.2315006023059714, "grad_norm": 11.67822265625, "learning_rate": 1.9650035054919376e-05, "loss": 0.1616, "step": 85875 }, { "epoch": 1.231859117765158, "grad_norm": 21.09786605834961, "learning_rate": 1.9646051541354184e-05, "loss": 0.154, "step": 85900 }, { "epoch": 1.2322176332243446, "grad_norm": 8.946200370788574, "learning_rate": 1.964206802778899e-05, "loss": 0.1235, "step": 85925 }, { "epoch": 1.2325761486835312, "grad_norm": 0.9359543323516846, "learning_rate": 1.96380845142238e-05, "loss": 0.1028, "step": 85950 }, { "epoch": 1.232934664142718, "grad_norm": 17.86543846130371, "learning_rate": 1.963410100065861e-05, "loss": 0.1368, "step": 85975 }, { "epoch": 1.2332931796019044, "grad_norm": 4.408057689666748, "learning_rate": 1.963011748709342e-05, "loss": 0.1605, "step": 86000 }, { "epoch": 1.233651695061091, "grad_norm": 16.04673194885254, "learning_rate": 1.9626133973528226e-05, "loss": 0.1604, "step": 86025 }, { "epoch": 1.2340102105202777, "grad_norm": 0.36896875500679016, "learning_rate": 1.9622150459963034e-05, "loss": 0.1314, "step": 86050 }, { "epoch": 1.2343687259794642, "grad_norm": 8.236780166625977, "learning_rate": 1.9618166946397842e-05, "loss": 0.1265, "step": 86075 }, { "epoch": 1.2347272414386508, "grad_norm": 1.083343744277954, "learning_rate": 1.9614183432832653e-05, "loss": 0.0587, "step": 86100 }, { "epoch": 1.2350857568978375, "grad_norm": 0.3922148644924164, "learning_rate": 1.961019991926746e-05, "loss": 0.1394, "step": 86125 }, { "epoch": 1.235444272357024, "grad_norm": 0.4424006938934326, "learning_rate": 1.960621640570227e-05, "loss": 0.0779, "step": 86150 }, { "epoch": 1.2358027878162106, "grad_norm": 18.084815979003906, "learning_rate": 1.9602232892137077e-05, "loss": 0.2143, "step": 86175 }, { "epoch": 1.2361613032753973, "grad_norm": 3.187960624694824, "learning_rate": 1.9598249378571884e-05, "loss": 0.1232, "step": 86200 }, { "epoch": 1.2365198187345838, "grad_norm": 4.727453231811523, "learning_rate": 1.9594265865006696e-05, "loss": 0.131, "step": 86225 }, { "epoch": 1.2368783341937704, "grad_norm": 9.51749038696289, "learning_rate": 1.9590282351441504e-05, "loss": 0.101, "step": 86250 }, { "epoch": 1.237236849652957, "grad_norm": 2.64363956451416, "learning_rate": 1.958629883787631e-05, "loss": 0.0997, "step": 86275 }, { "epoch": 1.2375953651121436, "grad_norm": 0.29623767733573914, "learning_rate": 1.9582315324311116e-05, "loss": 0.1581, "step": 86300 }, { "epoch": 1.2379538805713302, "grad_norm": 0.6727738976478577, "learning_rate": 1.9578331810745924e-05, "loss": 0.1354, "step": 86325 }, { "epoch": 1.238312396030517, "grad_norm": 3.2072150707244873, "learning_rate": 1.9574348297180735e-05, "loss": 0.1283, "step": 86350 }, { "epoch": 1.2386709114897034, "grad_norm": 5.8187479972839355, "learning_rate": 1.9570364783615543e-05, "loss": 0.2377, "step": 86375 }, { "epoch": 1.23902942694889, "grad_norm": 11.372291564941406, "learning_rate": 1.956638127005035e-05, "loss": 0.1408, "step": 86400 }, { "epoch": 1.2393879424080767, "grad_norm": 2.792980432510376, "learning_rate": 1.956239775648516e-05, "loss": 0.1, "step": 86425 }, { "epoch": 1.2397464578672632, "grad_norm": 0.17730024456977844, "learning_rate": 1.9558414242919966e-05, "loss": 0.0957, "step": 86450 }, { "epoch": 1.2401049733264498, "grad_norm": 0.2468557059764862, "learning_rate": 1.9554430729354778e-05, "loss": 0.1075, "step": 86475 }, { "epoch": 1.2404634887856365, "grad_norm": 2.97021746635437, "learning_rate": 1.9550447215789585e-05, "loss": 0.2168, "step": 86500 }, { "epoch": 1.240822004244823, "grad_norm": 1.9961127042770386, "learning_rate": 1.9546463702224393e-05, "loss": 0.1593, "step": 86525 }, { "epoch": 1.2411805197040096, "grad_norm": 13.426895141601562, "learning_rate": 1.95424801886592e-05, "loss": 0.1562, "step": 86550 }, { "epoch": 1.2415390351631963, "grad_norm": 0.19812512397766113, "learning_rate": 1.9538496675094012e-05, "loss": 0.073, "step": 86575 }, { "epoch": 1.2418975506223828, "grad_norm": 18.211069107055664, "learning_rate": 1.953451316152882e-05, "loss": 0.1685, "step": 86600 }, { "epoch": 1.2422560660815694, "grad_norm": 9.742734909057617, "learning_rate": 1.9530529647963628e-05, "loss": 0.1853, "step": 86625 }, { "epoch": 1.242614581540756, "grad_norm": 1.5478978157043457, "learning_rate": 1.9526546134398436e-05, "loss": 0.0722, "step": 86650 }, { "epoch": 1.2429730969999426, "grad_norm": 12.394100189208984, "learning_rate": 1.9522562620833244e-05, "loss": 0.2361, "step": 86675 }, { "epoch": 1.2433316124591292, "grad_norm": 2.6723222732543945, "learning_rate": 1.9518579107268055e-05, "loss": 0.237, "step": 86700 }, { "epoch": 1.243690127918316, "grad_norm": 3.9860689640045166, "learning_rate": 1.9514595593702863e-05, "loss": 0.108, "step": 86725 }, { "epoch": 1.2440486433775024, "grad_norm": 3.2846057415008545, "learning_rate": 1.951061208013767e-05, "loss": 0.0885, "step": 86750 }, { "epoch": 1.244407158836689, "grad_norm": 6.202033519744873, "learning_rate": 1.950662856657248e-05, "loss": 0.139, "step": 86775 }, { "epoch": 1.2447656742958757, "grad_norm": 1.663127064704895, "learning_rate": 1.9502645053007286e-05, "loss": 0.088, "step": 86800 }, { "epoch": 1.2451241897550622, "grad_norm": 2.188663959503174, "learning_rate": 1.9498661539442098e-05, "loss": 0.1329, "step": 86825 }, { "epoch": 1.2454827052142488, "grad_norm": 14.935900688171387, "learning_rate": 1.9494678025876905e-05, "loss": 0.0825, "step": 86850 }, { "epoch": 1.2458412206734355, "grad_norm": 8.857732772827148, "learning_rate": 1.9490694512311713e-05, "loss": 0.1569, "step": 86875 }, { "epoch": 1.246199736132622, "grad_norm": 11.219146728515625, "learning_rate": 1.948671099874652e-05, "loss": 0.1921, "step": 86900 }, { "epoch": 1.2465582515918086, "grad_norm": 7.549914360046387, "learning_rate": 1.948272748518133e-05, "loss": 0.1557, "step": 86925 }, { "epoch": 1.2469167670509953, "grad_norm": 5.534095287322998, "learning_rate": 1.947874397161614e-05, "loss": 0.101, "step": 86950 }, { "epoch": 1.2472752825101818, "grad_norm": 0.3814396262168884, "learning_rate": 1.9474760458050948e-05, "loss": 0.0958, "step": 86975 }, { "epoch": 1.2476337979693684, "grad_norm": 7.948850631713867, "learning_rate": 1.9470776944485756e-05, "loss": 0.1821, "step": 87000 }, { "epoch": 1.247992313428555, "grad_norm": 7.525486469268799, "learning_rate": 1.9466793430920564e-05, "loss": 0.1214, "step": 87025 }, { "epoch": 1.2483508288877416, "grad_norm": 0.17919766902923584, "learning_rate": 1.946280991735537e-05, "loss": 0.1369, "step": 87050 }, { "epoch": 1.2487093443469282, "grad_norm": 7.809053421020508, "learning_rate": 1.9458826403790183e-05, "loss": 0.1277, "step": 87075 }, { "epoch": 1.249067859806115, "grad_norm": 0.4597426652908325, "learning_rate": 1.945484289022499e-05, "loss": 0.1503, "step": 87100 }, { "epoch": 1.2494263752653014, "grad_norm": 7.201597690582275, "learning_rate": 1.94508593766598e-05, "loss": 0.1029, "step": 87125 }, { "epoch": 1.249784890724488, "grad_norm": 0.5998862981796265, "learning_rate": 1.9446875863094606e-05, "loss": 0.1202, "step": 87150 }, { "epoch": 1.2501434061836747, "grad_norm": 8.116232872009277, "learning_rate": 1.9442892349529418e-05, "loss": 0.1672, "step": 87175 }, { "epoch": 1.2505019216428612, "grad_norm": 4.046141147613525, "learning_rate": 1.9438908835964225e-05, "loss": 0.1651, "step": 87200 }, { "epoch": 1.2508604371020478, "grad_norm": 11.215320587158203, "learning_rate": 1.9434925322399033e-05, "loss": 0.1214, "step": 87225 }, { "epoch": 1.2512189525612345, "grad_norm": 6.516470432281494, "learning_rate": 1.943094180883384e-05, "loss": 0.0862, "step": 87250 }, { "epoch": 1.251577468020421, "grad_norm": 1.0877827405929565, "learning_rate": 1.942695829526865e-05, "loss": 0.1286, "step": 87275 }, { "epoch": 1.2519359834796076, "grad_norm": 3.3253066539764404, "learning_rate": 1.942297478170346e-05, "loss": 0.1237, "step": 87300 }, { "epoch": 1.2522944989387943, "grad_norm": 0.7894201278686523, "learning_rate": 1.9418991268138268e-05, "loss": 0.1805, "step": 87325 }, { "epoch": 1.2526530143979808, "grad_norm": 6.732401371002197, "learning_rate": 1.9415007754573076e-05, "loss": 0.1088, "step": 87350 }, { "epoch": 1.2530115298571673, "grad_norm": 22.296037673950195, "learning_rate": 1.9411024241007884e-05, "loss": 0.1405, "step": 87375 }, { "epoch": 1.253370045316354, "grad_norm": 3.2061526775360107, "learning_rate": 1.9407040727442688e-05, "loss": 0.2279, "step": 87400 }, { "epoch": 1.2537285607755406, "grad_norm": 0.3113708198070526, "learning_rate": 1.94030572138775e-05, "loss": 0.1241, "step": 87425 }, { "epoch": 1.2540870762347271, "grad_norm": 0.12223266810178757, "learning_rate": 1.9399073700312307e-05, "loss": 0.1285, "step": 87450 }, { "epoch": 1.254445591693914, "grad_norm": 23.381649017333984, "learning_rate": 1.9395090186747115e-05, "loss": 0.1459, "step": 87475 }, { "epoch": 1.2548041071531004, "grad_norm": 16.038209915161133, "learning_rate": 1.9391106673181923e-05, "loss": 0.2209, "step": 87500 }, { "epoch": 1.255162622612287, "grad_norm": 0.5072634220123291, "learning_rate": 1.938712315961673e-05, "loss": 0.1722, "step": 87525 }, { "epoch": 1.2555211380714737, "grad_norm": 0.4829537272453308, "learning_rate": 1.9383139646051542e-05, "loss": 0.1226, "step": 87550 }, { "epoch": 1.2558796535306602, "grad_norm": 16.162446975708008, "learning_rate": 1.937915613248635e-05, "loss": 0.1678, "step": 87575 }, { "epoch": 1.2562381689898467, "grad_norm": 5.752259731292725, "learning_rate": 1.9375172618921158e-05, "loss": 0.0646, "step": 87600 }, { "epoch": 1.2565966844490335, "grad_norm": 1.0869109630584717, "learning_rate": 1.9371189105355965e-05, "loss": 0.1184, "step": 87625 }, { "epoch": 1.25695519990822, "grad_norm": 12.80834674835205, "learning_rate": 1.9367205591790773e-05, "loss": 0.1842, "step": 87650 }, { "epoch": 1.2573137153674065, "grad_norm": 1.7570202350616455, "learning_rate": 1.9363222078225585e-05, "loss": 0.0985, "step": 87675 }, { "epoch": 1.2576722308265933, "grad_norm": 0.26201942563056946, "learning_rate": 1.9359238564660392e-05, "loss": 0.1558, "step": 87700 }, { "epoch": 1.2580307462857798, "grad_norm": 1.1440134048461914, "learning_rate": 1.93552550510952e-05, "loss": 0.1255, "step": 87725 }, { "epoch": 1.2583892617449663, "grad_norm": 2.590226888656616, "learning_rate": 1.9351271537530008e-05, "loss": 0.0877, "step": 87750 }, { "epoch": 1.258747777204153, "grad_norm": 0.20306211709976196, "learning_rate": 1.9347288023964816e-05, "loss": 0.2902, "step": 87775 }, { "epoch": 1.2591062926633396, "grad_norm": 8.85634994506836, "learning_rate": 1.9343304510399627e-05, "loss": 0.1188, "step": 87800 }, { "epoch": 1.2594648081225261, "grad_norm": 10.764398574829102, "learning_rate": 1.9339320996834435e-05, "loss": 0.1323, "step": 87825 }, { "epoch": 1.259823323581713, "grad_norm": 3.8249754905700684, "learning_rate": 1.9335337483269243e-05, "loss": 0.1301, "step": 87850 }, { "epoch": 1.2601818390408994, "grad_norm": 11.898635864257812, "learning_rate": 1.933135396970405e-05, "loss": 0.17, "step": 87875 }, { "epoch": 1.260540354500086, "grad_norm": 18.087547302246094, "learning_rate": 1.9327370456138862e-05, "loss": 0.1642, "step": 87900 }, { "epoch": 1.2608988699592727, "grad_norm": 5.290402889251709, "learning_rate": 1.932338694257367e-05, "loss": 0.1335, "step": 87925 }, { "epoch": 1.2612573854184592, "grad_norm": 0.1732071489095688, "learning_rate": 1.9319403429008478e-05, "loss": 0.0729, "step": 87950 }, { "epoch": 1.2616159008776457, "grad_norm": 5.984612941741943, "learning_rate": 1.9315419915443285e-05, "loss": 0.1188, "step": 87975 }, { "epoch": 1.2619744163368325, "grad_norm": 11.558344841003418, "learning_rate": 1.9311436401878093e-05, "loss": 0.1232, "step": 88000 }, { "epoch": 1.262332931796019, "grad_norm": 0.3397071659564972, "learning_rate": 1.9307452888312905e-05, "loss": 0.1893, "step": 88025 }, { "epoch": 1.2626914472552055, "grad_norm": 0.40948057174682617, "learning_rate": 1.9303469374747712e-05, "loss": 0.1706, "step": 88050 }, { "epoch": 1.2630499627143923, "grad_norm": 2.987064838409424, "learning_rate": 1.929948586118252e-05, "loss": 0.1555, "step": 88075 }, { "epoch": 1.2634084781735788, "grad_norm": 5.519211769104004, "learning_rate": 1.9295502347617328e-05, "loss": 0.1322, "step": 88100 }, { "epoch": 1.2637669936327653, "grad_norm": 12.10764217376709, "learning_rate": 1.9291518834052136e-05, "loss": 0.107, "step": 88125 }, { "epoch": 1.264125509091952, "grad_norm": 1.25774085521698, "learning_rate": 1.9287535320486947e-05, "loss": 0.1585, "step": 88150 }, { "epoch": 1.2644840245511386, "grad_norm": 9.413365364074707, "learning_rate": 1.9283551806921755e-05, "loss": 0.1025, "step": 88175 }, { "epoch": 1.2648425400103251, "grad_norm": 5.502744197845459, "learning_rate": 1.9279568293356563e-05, "loss": 0.1956, "step": 88200 }, { "epoch": 1.265201055469512, "grad_norm": 1.321262240409851, "learning_rate": 1.927558477979137e-05, "loss": 0.1057, "step": 88225 }, { "epoch": 1.2655595709286984, "grad_norm": 0.84572434425354, "learning_rate": 1.927160126622618e-05, "loss": 0.0889, "step": 88250 }, { "epoch": 1.265918086387885, "grad_norm": 6.310572147369385, "learning_rate": 1.926761775266099e-05, "loss": 0.2875, "step": 88275 }, { "epoch": 1.2662766018470717, "grad_norm": 2.8491930961608887, "learning_rate": 1.9263634239095798e-05, "loss": 0.1435, "step": 88300 }, { "epoch": 1.2666351173062582, "grad_norm": 13.305085182189941, "learning_rate": 1.9259650725530605e-05, "loss": 0.1361, "step": 88325 }, { "epoch": 1.2669936327654447, "grad_norm": 4.033822536468506, "learning_rate": 1.9255667211965413e-05, "loss": 0.1723, "step": 88350 }, { "epoch": 1.2673521482246315, "grad_norm": 17.40733528137207, "learning_rate": 1.925168369840022e-05, "loss": 0.1473, "step": 88375 }, { "epoch": 1.267710663683818, "grad_norm": 0.2959986925125122, "learning_rate": 1.9247700184835032e-05, "loss": 0.1629, "step": 88400 }, { "epoch": 1.2680691791430045, "grad_norm": 10.000846862792969, "learning_rate": 1.924371667126984e-05, "loss": 0.131, "step": 88425 }, { "epoch": 1.2684276946021913, "grad_norm": 9.40120792388916, "learning_rate": 1.9239733157704648e-05, "loss": 0.0693, "step": 88450 }, { "epoch": 1.2687862100613778, "grad_norm": 13.34094524383545, "learning_rate": 1.9235749644139456e-05, "loss": 0.1735, "step": 88475 }, { "epoch": 1.2691447255205643, "grad_norm": 3.3353238105773926, "learning_rate": 1.9231766130574264e-05, "loss": 0.1006, "step": 88500 }, { "epoch": 1.269503240979751, "grad_norm": 6.618330001831055, "learning_rate": 1.922778261700907e-05, "loss": 0.1374, "step": 88525 }, { "epoch": 1.2698617564389376, "grad_norm": 1.5134788751602173, "learning_rate": 1.922379910344388e-05, "loss": 0.1621, "step": 88550 }, { "epoch": 1.2702202718981241, "grad_norm": 0.34944620728492737, "learning_rate": 1.9219815589878687e-05, "loss": 0.1931, "step": 88575 }, { "epoch": 1.270578787357311, "grad_norm": 17.319622039794922, "learning_rate": 1.9215832076313495e-05, "loss": 0.1404, "step": 88600 }, { "epoch": 1.2709373028164974, "grad_norm": 14.757564544677734, "learning_rate": 1.9211848562748306e-05, "loss": 0.133, "step": 88625 }, { "epoch": 1.271295818275684, "grad_norm": 3.6776645183563232, "learning_rate": 1.9207865049183114e-05, "loss": 0.1642, "step": 88650 }, { "epoch": 1.2716543337348707, "grad_norm": 0.18435066938400269, "learning_rate": 1.9203881535617922e-05, "loss": 0.1158, "step": 88675 }, { "epoch": 1.2720128491940572, "grad_norm": 3.5077478885650635, "learning_rate": 1.919989802205273e-05, "loss": 0.1082, "step": 88700 }, { "epoch": 1.2723713646532437, "grad_norm": 16.63112449645996, "learning_rate": 1.9195914508487538e-05, "loss": 0.1879, "step": 88725 }, { "epoch": 1.2727298801124305, "grad_norm": 5.052867889404297, "learning_rate": 1.919193099492235e-05, "loss": 0.1217, "step": 88750 }, { "epoch": 1.273088395571617, "grad_norm": 0.22490479052066803, "learning_rate": 1.9187947481357157e-05, "loss": 0.1271, "step": 88775 }, { "epoch": 1.2734469110308035, "grad_norm": 9.487274169921875, "learning_rate": 1.9183963967791965e-05, "loss": 0.1302, "step": 88800 }, { "epoch": 1.2738054264899903, "grad_norm": 25.172439575195312, "learning_rate": 1.9179980454226773e-05, "loss": 0.1145, "step": 88825 }, { "epoch": 1.2741639419491768, "grad_norm": 1.9555836915969849, "learning_rate": 1.917599694066158e-05, "loss": 0.1304, "step": 88850 }, { "epoch": 1.2745224574083633, "grad_norm": 11.961627006530762, "learning_rate": 1.917201342709639e-05, "loss": 0.1689, "step": 88875 }, { "epoch": 1.27488097286755, "grad_norm": 2.0868568420410156, "learning_rate": 1.91680299135312e-05, "loss": 0.0937, "step": 88900 }, { "epoch": 1.2752394883267366, "grad_norm": 3.9488167762756348, "learning_rate": 1.9164046399966007e-05, "loss": 0.1299, "step": 88925 }, { "epoch": 1.2755980037859231, "grad_norm": 2.0048773288726807, "learning_rate": 1.9160062886400815e-05, "loss": 0.1759, "step": 88950 }, { "epoch": 1.2759565192451099, "grad_norm": 16.95816993713379, "learning_rate": 1.9156079372835623e-05, "loss": 0.1048, "step": 88975 }, { "epoch": 1.2763150347042964, "grad_norm": 9.52876091003418, "learning_rate": 1.9152095859270434e-05, "loss": 0.0838, "step": 89000 }, { "epoch": 1.276673550163483, "grad_norm": 11.772560119628906, "learning_rate": 1.9148112345705242e-05, "loss": 0.1386, "step": 89025 }, { "epoch": 1.2770320656226697, "grad_norm": 11.381997108459473, "learning_rate": 1.914412883214005e-05, "loss": 0.1779, "step": 89050 }, { "epoch": 1.2773905810818562, "grad_norm": 1.7793774604797363, "learning_rate": 1.9140145318574858e-05, "loss": 0.1007, "step": 89075 }, { "epoch": 1.2777490965410427, "grad_norm": 0.313202440738678, "learning_rate": 1.913616180500967e-05, "loss": 0.1231, "step": 89100 }, { "epoch": 1.2781076120002295, "grad_norm": 0.8829665780067444, "learning_rate": 1.9132178291444477e-05, "loss": 0.1498, "step": 89125 }, { "epoch": 1.278466127459416, "grad_norm": 12.944592475891113, "learning_rate": 1.9128194777879285e-05, "loss": 0.1881, "step": 89150 }, { "epoch": 1.2788246429186025, "grad_norm": 4.8661298751831055, "learning_rate": 1.9124211264314093e-05, "loss": 0.1509, "step": 89175 }, { "epoch": 1.2791831583777893, "grad_norm": 1.4922595024108887, "learning_rate": 1.91202277507489e-05, "loss": 0.1558, "step": 89200 }, { "epoch": 1.2795416738369758, "grad_norm": 0.8286322951316833, "learning_rate": 1.911624423718371e-05, "loss": 0.1735, "step": 89225 }, { "epoch": 1.2799001892961623, "grad_norm": 26.050514221191406, "learning_rate": 1.911226072361852e-05, "loss": 0.1546, "step": 89250 }, { "epoch": 1.280258704755349, "grad_norm": 6.769221782684326, "learning_rate": 1.9108277210053327e-05, "loss": 0.1515, "step": 89275 }, { "epoch": 1.2806172202145356, "grad_norm": 1.9496262073516846, "learning_rate": 1.9104293696488135e-05, "loss": 0.194, "step": 89300 }, { "epoch": 1.2809757356737221, "grad_norm": 1.9228930473327637, "learning_rate": 1.9100310182922943e-05, "loss": 0.1054, "step": 89325 }, { "epoch": 1.2813342511329089, "grad_norm": 5.1145501136779785, "learning_rate": 1.9096326669357754e-05, "loss": 0.1337, "step": 89350 }, { "epoch": 1.2816927665920954, "grad_norm": 1.694364309310913, "learning_rate": 1.9092343155792562e-05, "loss": 0.0748, "step": 89375 }, { "epoch": 1.282051282051282, "grad_norm": 0.3651246428489685, "learning_rate": 1.908835964222737e-05, "loss": 0.1886, "step": 89400 }, { "epoch": 1.2824097975104687, "grad_norm": 1.7411822080612183, "learning_rate": 1.9084376128662178e-05, "loss": 0.1816, "step": 89425 }, { "epoch": 1.2827683129696552, "grad_norm": 12.174686431884766, "learning_rate": 1.9080392615096986e-05, "loss": 0.1205, "step": 89450 }, { "epoch": 1.2831268284288417, "grad_norm": 3.760211229324341, "learning_rate": 1.9076409101531797e-05, "loss": 0.1999, "step": 89475 }, { "epoch": 1.2834853438880285, "grad_norm": 3.2471582889556885, "learning_rate": 1.9072425587966605e-05, "loss": 0.1432, "step": 89500 }, { "epoch": 1.283843859347215, "grad_norm": 5.306251525878906, "learning_rate": 1.9068442074401413e-05, "loss": 0.1978, "step": 89525 }, { "epoch": 1.2842023748064015, "grad_norm": 0.5769215226173401, "learning_rate": 1.906445856083622e-05, "loss": 0.1144, "step": 89550 }, { "epoch": 1.2845608902655883, "grad_norm": 10.122278213500977, "learning_rate": 1.9060475047271028e-05, "loss": 0.1324, "step": 89575 }, { "epoch": 1.2849194057247748, "grad_norm": 1.736657977104187, "learning_rate": 1.9056491533705836e-05, "loss": 0.0894, "step": 89600 }, { "epoch": 1.2852779211839613, "grad_norm": 1.0126692056655884, "learning_rate": 1.9052508020140644e-05, "loss": 0.1042, "step": 89625 }, { "epoch": 1.285636436643148, "grad_norm": 8.4027681350708, "learning_rate": 1.9048524506575452e-05, "loss": 0.1255, "step": 89650 }, { "epoch": 1.2859949521023346, "grad_norm": 13.22317123413086, "learning_rate": 1.904454099301026e-05, "loss": 0.1343, "step": 89675 }, { "epoch": 1.2863534675615211, "grad_norm": 0.19316600263118744, "learning_rate": 1.904055747944507e-05, "loss": 0.0537, "step": 89700 }, { "epoch": 1.2867119830207079, "grad_norm": 17.487146377563477, "learning_rate": 1.903657396587988e-05, "loss": 0.1001, "step": 89725 }, { "epoch": 1.2870704984798944, "grad_norm": 1.692923665046692, "learning_rate": 1.9032590452314686e-05, "loss": 0.1331, "step": 89750 }, { "epoch": 1.287429013939081, "grad_norm": 5.3571906089782715, "learning_rate": 1.9028606938749494e-05, "loss": 0.1045, "step": 89775 }, { "epoch": 1.2877875293982677, "grad_norm": 11.88892936706543, "learning_rate": 1.9024623425184302e-05, "loss": 0.1229, "step": 89800 }, { "epoch": 1.2881460448574542, "grad_norm": 6.67463493347168, "learning_rate": 1.9020639911619113e-05, "loss": 0.083, "step": 89825 }, { "epoch": 1.2885045603166407, "grad_norm": 13.524585723876953, "learning_rate": 1.901665639805392e-05, "loss": 0.1886, "step": 89850 }, { "epoch": 1.2888630757758275, "grad_norm": 1.1586428880691528, "learning_rate": 1.901267288448873e-05, "loss": 0.1009, "step": 89875 }, { "epoch": 1.289221591235014, "grad_norm": 1.8288354873657227, "learning_rate": 1.9008689370923537e-05, "loss": 0.1162, "step": 89900 }, { "epoch": 1.2895801066942005, "grad_norm": 2.9314961433410645, "learning_rate": 1.9004705857358345e-05, "loss": 0.0934, "step": 89925 }, { "epoch": 1.2899386221533873, "grad_norm": 3.7469961643218994, "learning_rate": 1.9000722343793156e-05, "loss": 0.1482, "step": 89950 }, { "epoch": 1.2902971376125738, "grad_norm": 1.7358901500701904, "learning_rate": 1.8996738830227964e-05, "loss": 0.2038, "step": 89975 }, { "epoch": 1.2906556530717603, "grad_norm": 1.2806271314620972, "learning_rate": 1.8992755316662772e-05, "loss": 0.1735, "step": 90000 }, { "epoch": 1.291014168530947, "grad_norm": 10.256885528564453, "learning_rate": 1.898877180309758e-05, "loss": 0.1464, "step": 90025 }, { "epoch": 1.2913726839901336, "grad_norm": 0.1491103619337082, "learning_rate": 1.8984788289532387e-05, "loss": 0.1465, "step": 90050 }, { "epoch": 1.2917311994493201, "grad_norm": 1.6864222288131714, "learning_rate": 1.89808047759672e-05, "loss": 0.1762, "step": 90075 }, { "epoch": 1.2920897149085069, "grad_norm": 4.680619239807129, "learning_rate": 1.8976821262402006e-05, "loss": 0.1354, "step": 90100 }, { "epoch": 1.2924482303676934, "grad_norm": 0.20557022094726562, "learning_rate": 1.8972837748836814e-05, "loss": 0.1348, "step": 90125 }, { "epoch": 1.29280674582688, "grad_norm": 0.544093906879425, "learning_rate": 1.8968854235271622e-05, "loss": 0.0919, "step": 90150 }, { "epoch": 1.2931652612860667, "grad_norm": 3.4182634353637695, "learning_rate": 1.896487072170643e-05, "loss": 0.1647, "step": 90175 }, { "epoch": 1.2935237767452532, "grad_norm": 0.7058152556419373, "learning_rate": 1.896088720814124e-05, "loss": 0.1165, "step": 90200 }, { "epoch": 1.2938822922044397, "grad_norm": 1.5486979484558105, "learning_rate": 1.895690369457605e-05, "loss": 0.2287, "step": 90225 }, { "epoch": 1.2942408076636265, "grad_norm": 1.2811819314956665, "learning_rate": 1.8952920181010857e-05, "loss": 0.1233, "step": 90250 }, { "epoch": 1.294599323122813, "grad_norm": 11.731249809265137, "learning_rate": 1.8948936667445665e-05, "loss": 0.157, "step": 90275 }, { "epoch": 1.2949578385819995, "grad_norm": 0.6819745898246765, "learning_rate": 1.8944953153880476e-05, "loss": 0.0903, "step": 90300 }, { "epoch": 1.2953163540411863, "grad_norm": 0.93301922082901, "learning_rate": 1.8940969640315284e-05, "loss": 0.1009, "step": 90325 }, { "epoch": 1.2956748695003728, "grad_norm": 0.7130472660064697, "learning_rate": 1.8936986126750092e-05, "loss": 0.096, "step": 90350 }, { "epoch": 1.2960333849595593, "grad_norm": 14.355293273925781, "learning_rate": 1.89330026131849e-05, "loss": 0.1368, "step": 90375 }, { "epoch": 1.296391900418746, "grad_norm": 6.2527079582214355, "learning_rate": 1.8929019099619707e-05, "loss": 0.1306, "step": 90400 }, { "epoch": 1.2967504158779326, "grad_norm": 22.5249080657959, "learning_rate": 1.892503558605452e-05, "loss": 0.1443, "step": 90425 }, { "epoch": 1.2971089313371191, "grad_norm": 18.349924087524414, "learning_rate": 1.8921052072489326e-05, "loss": 0.201, "step": 90450 }, { "epoch": 1.2974674467963059, "grad_norm": 0.09689147770404816, "learning_rate": 1.8917068558924134e-05, "loss": 0.1133, "step": 90475 }, { "epoch": 1.2978259622554924, "grad_norm": 3.9869048595428467, "learning_rate": 1.8913085045358942e-05, "loss": 0.1971, "step": 90500 }, { "epoch": 1.298184477714679, "grad_norm": 9.751250267028809, "learning_rate": 1.890910153179375e-05, "loss": 0.2646, "step": 90525 }, { "epoch": 1.2985429931738657, "grad_norm": 0.4514658749103546, "learning_rate": 1.890511801822856e-05, "loss": 0.1374, "step": 90550 }, { "epoch": 1.2989015086330522, "grad_norm": 2.4231081008911133, "learning_rate": 1.890113450466337e-05, "loss": 0.1396, "step": 90575 }, { "epoch": 1.2992600240922387, "grad_norm": 17.21450424194336, "learning_rate": 1.8897150991098177e-05, "loss": 0.2312, "step": 90600 }, { "epoch": 1.2996185395514255, "grad_norm": 0.7430815696716309, "learning_rate": 1.8893167477532985e-05, "loss": 0.1588, "step": 90625 }, { "epoch": 1.299977055010612, "grad_norm": 0.16117286682128906, "learning_rate": 1.8889183963967793e-05, "loss": 0.1, "step": 90650 }, { "epoch": 1.3003355704697985, "grad_norm": 8.02660083770752, "learning_rate": 1.88852004504026e-05, "loss": 0.0908, "step": 90675 }, { "epoch": 1.3006940859289853, "grad_norm": 14.164112091064453, "learning_rate": 1.8881216936837408e-05, "loss": 0.1662, "step": 90700 }, { "epoch": 1.3010526013881718, "grad_norm": 14.997140884399414, "learning_rate": 1.8877233423272216e-05, "loss": 0.2149, "step": 90725 }, { "epoch": 1.3014111168473583, "grad_norm": 7.7594380378723145, "learning_rate": 1.8873249909707024e-05, "loss": 0.1007, "step": 90750 }, { "epoch": 1.301769632306545, "grad_norm": 1.5690251588821411, "learning_rate": 1.8869266396141832e-05, "loss": 0.0519, "step": 90775 }, { "epoch": 1.3021281477657316, "grad_norm": 0.994732677936554, "learning_rate": 1.8865282882576643e-05, "loss": 0.1115, "step": 90800 }, { "epoch": 1.3024866632249181, "grad_norm": 2.0241613388061523, "learning_rate": 1.886129936901145e-05, "loss": 0.1718, "step": 90825 }, { "epoch": 1.3028451786841049, "grad_norm": 2.629594564437866, "learning_rate": 1.885731585544626e-05, "loss": 0.1687, "step": 90850 }, { "epoch": 1.3032036941432914, "grad_norm": 12.764100074768066, "learning_rate": 1.8853332341881067e-05, "loss": 0.1549, "step": 90875 }, { "epoch": 1.303562209602478, "grad_norm": 12.377694129943848, "learning_rate": 1.8849348828315878e-05, "loss": 0.1631, "step": 90900 }, { "epoch": 1.3039207250616647, "grad_norm": 0.09879350662231445, "learning_rate": 1.8845365314750686e-05, "loss": 0.1492, "step": 90925 }, { "epoch": 1.3042792405208512, "grad_norm": 1.0883853435516357, "learning_rate": 1.8841381801185494e-05, "loss": 0.0894, "step": 90950 }, { "epoch": 1.3046377559800377, "grad_norm": 2.087754011154175, "learning_rate": 1.88373982876203e-05, "loss": 0.1, "step": 90975 }, { "epoch": 1.3049962714392245, "grad_norm": 0.7245159149169922, "learning_rate": 1.883341477405511e-05, "loss": 0.2188, "step": 91000 }, { "epoch": 1.305354786898411, "grad_norm": 0.13497547805309296, "learning_rate": 1.882943126048992e-05, "loss": 0.0672, "step": 91025 }, { "epoch": 1.3057133023575975, "grad_norm": 1.0256311893463135, "learning_rate": 1.8825447746924728e-05, "loss": 0.1275, "step": 91050 }, { "epoch": 1.3060718178167843, "grad_norm": 0.20446525514125824, "learning_rate": 1.8821464233359536e-05, "loss": 0.1467, "step": 91075 }, { "epoch": 1.3064303332759708, "grad_norm": 5.996006965637207, "learning_rate": 1.8817480719794344e-05, "loss": 0.2169, "step": 91100 }, { "epoch": 1.3067888487351573, "grad_norm": 5.868703365325928, "learning_rate": 1.8813497206229152e-05, "loss": 0.1201, "step": 91125 }, { "epoch": 1.307147364194344, "grad_norm": 19.359203338623047, "learning_rate": 1.8809513692663963e-05, "loss": 0.1512, "step": 91150 }, { "epoch": 1.3075058796535306, "grad_norm": 9.44827651977539, "learning_rate": 1.880553017909877e-05, "loss": 0.1485, "step": 91175 }, { "epoch": 1.3078643951127171, "grad_norm": 17.839048385620117, "learning_rate": 1.880154666553358e-05, "loss": 0.1558, "step": 91200 }, { "epoch": 1.3082229105719039, "grad_norm": 9.177870750427246, "learning_rate": 1.8797563151968387e-05, "loss": 0.1657, "step": 91225 }, { "epoch": 1.3085814260310904, "grad_norm": 0.3534977436065674, "learning_rate": 1.8793579638403194e-05, "loss": 0.1137, "step": 91250 }, { "epoch": 1.308939941490277, "grad_norm": 3.1227588653564453, "learning_rate": 1.8789596124838006e-05, "loss": 0.1571, "step": 91275 }, { "epoch": 1.3092984569494637, "grad_norm": 0.48205211758613586, "learning_rate": 1.8785612611272814e-05, "loss": 0.1186, "step": 91300 }, { "epoch": 1.3096569724086502, "grad_norm": 7.088231086730957, "learning_rate": 1.878162909770762e-05, "loss": 0.1293, "step": 91325 }, { "epoch": 1.3100154878678367, "grad_norm": 5.705817699432373, "learning_rate": 1.877764558414243e-05, "loss": 0.1014, "step": 91350 }, { "epoch": 1.3103740033270235, "grad_norm": 2.4660825729370117, "learning_rate": 1.8773662070577237e-05, "loss": 0.0305, "step": 91375 }, { "epoch": 1.31073251878621, "grad_norm": 2.2795486450195312, "learning_rate": 1.8769678557012048e-05, "loss": 0.1224, "step": 91400 }, { "epoch": 1.3110910342453965, "grad_norm": 0.37658411264419556, "learning_rate": 1.8765695043446856e-05, "loss": 0.1223, "step": 91425 }, { "epoch": 1.3114495497045833, "grad_norm": 0.6189699769020081, "learning_rate": 1.8761711529881664e-05, "loss": 0.1587, "step": 91450 }, { "epoch": 1.3118080651637698, "grad_norm": 2.732168674468994, "learning_rate": 1.8757728016316472e-05, "loss": 0.157, "step": 91475 }, { "epoch": 1.3121665806229563, "grad_norm": 8.471474647521973, "learning_rate": 1.8753744502751283e-05, "loss": 0.1237, "step": 91500 }, { "epoch": 1.312525096082143, "grad_norm": 1.5009468793869019, "learning_rate": 1.874976098918609e-05, "loss": 0.1362, "step": 91525 }, { "epoch": 1.3128836115413296, "grad_norm": 11.817500114440918, "learning_rate": 1.87457774756209e-05, "loss": 0.1539, "step": 91550 }, { "epoch": 1.3132421270005161, "grad_norm": 18.464231491088867, "learning_rate": 1.8741793962055707e-05, "loss": 0.2125, "step": 91575 }, { "epoch": 1.3136006424597029, "grad_norm": 5.910933017730713, "learning_rate": 1.8737810448490514e-05, "loss": 0.2128, "step": 91600 }, { "epoch": 1.3139591579188894, "grad_norm": 3.327465057373047, "learning_rate": 1.8733826934925326e-05, "loss": 0.1574, "step": 91625 }, { "epoch": 1.3143176733780761, "grad_norm": 2.880999803543091, "learning_rate": 1.8729843421360134e-05, "loss": 0.1379, "step": 91650 }, { "epoch": 1.3146761888372627, "grad_norm": 13.0866117477417, "learning_rate": 1.872585990779494e-05, "loss": 0.1694, "step": 91675 }, { "epoch": 1.3150347042964492, "grad_norm": 11.997928619384766, "learning_rate": 1.872187639422975e-05, "loss": 0.1744, "step": 91700 }, { "epoch": 1.315393219755636, "grad_norm": 3.1082212924957275, "learning_rate": 1.8717892880664557e-05, "loss": 0.1479, "step": 91725 }, { "epoch": 1.3157517352148225, "grad_norm": 1.0102267265319824, "learning_rate": 1.8713909367099368e-05, "loss": 0.0992, "step": 91750 }, { "epoch": 1.316110250674009, "grad_norm": 5.0813093185424805, "learning_rate": 1.8709925853534173e-05, "loss": 0.1583, "step": 91775 }, { "epoch": 1.3164687661331957, "grad_norm": 1.3834489583969116, "learning_rate": 1.870594233996898e-05, "loss": 0.1824, "step": 91800 }, { "epoch": 1.3168272815923823, "grad_norm": 0.32752084732055664, "learning_rate": 1.870195882640379e-05, "loss": 0.0496, "step": 91825 }, { "epoch": 1.3171857970515688, "grad_norm": 0.9088341593742371, "learning_rate": 1.8697975312838596e-05, "loss": 0.1259, "step": 91850 }, { "epoch": 1.3175443125107555, "grad_norm": 17.536012649536133, "learning_rate": 1.8693991799273407e-05, "loss": 0.1339, "step": 91875 }, { "epoch": 1.317902827969942, "grad_norm": 7.749189853668213, "learning_rate": 1.8690008285708215e-05, "loss": 0.1465, "step": 91900 }, { "epoch": 1.3182613434291286, "grad_norm": 1.112113118171692, "learning_rate": 1.8686024772143023e-05, "loss": 0.0511, "step": 91925 }, { "epoch": 1.3186198588883153, "grad_norm": 0.30535972118377686, "learning_rate": 1.868204125857783e-05, "loss": 0.1891, "step": 91950 }, { "epoch": 1.3189783743475019, "grad_norm": 6.952750205993652, "learning_rate": 1.867805774501264e-05, "loss": 0.1331, "step": 91975 }, { "epoch": 1.3193368898066884, "grad_norm": 5.403261661529541, "learning_rate": 1.867407423144745e-05, "loss": 0.1301, "step": 92000 }, { "epoch": 1.3196954052658751, "grad_norm": 1.0204894542694092, "learning_rate": 1.8670090717882258e-05, "loss": 0.0908, "step": 92025 }, { "epoch": 1.3200539207250617, "grad_norm": 2.0007312297821045, "learning_rate": 1.8666107204317066e-05, "loss": 0.0889, "step": 92050 }, { "epoch": 1.3204124361842482, "grad_norm": 0.8581940531730652, "learning_rate": 1.8662123690751874e-05, "loss": 0.1831, "step": 92075 }, { "epoch": 1.320770951643435, "grad_norm": 13.415704727172852, "learning_rate": 1.865814017718668e-05, "loss": 0.2238, "step": 92100 }, { "epoch": 1.3211294671026215, "grad_norm": 22.53983497619629, "learning_rate": 1.8654156663621493e-05, "loss": 0.1564, "step": 92125 }, { "epoch": 1.321487982561808, "grad_norm": 24.414148330688477, "learning_rate": 1.86501731500563e-05, "loss": 0.1255, "step": 92150 }, { "epoch": 1.3218464980209947, "grad_norm": 13.012552261352539, "learning_rate": 1.864618963649111e-05, "loss": 0.2088, "step": 92175 }, { "epoch": 1.3222050134801813, "grad_norm": 0.8171483874320984, "learning_rate": 1.8642206122925916e-05, "loss": 0.1564, "step": 92200 }, { "epoch": 1.3225635289393678, "grad_norm": 5.081037998199463, "learning_rate": 1.8638222609360727e-05, "loss": 0.2044, "step": 92225 }, { "epoch": 1.3229220443985545, "grad_norm": 0.08777985721826553, "learning_rate": 1.8634239095795535e-05, "loss": 0.115, "step": 92250 }, { "epoch": 1.323280559857741, "grad_norm": 3.8044886589050293, "learning_rate": 1.8630255582230343e-05, "loss": 0.1163, "step": 92275 }, { "epoch": 1.3236390753169276, "grad_norm": 18.92210578918457, "learning_rate": 1.862627206866515e-05, "loss": 0.134, "step": 92300 }, { "epoch": 1.3239975907761143, "grad_norm": 2.2384402751922607, "learning_rate": 1.862228855509996e-05, "loss": 0.205, "step": 92325 }, { "epoch": 1.3243561062353009, "grad_norm": 9.882781982421875, "learning_rate": 1.861830504153477e-05, "loss": 0.202, "step": 92350 }, { "epoch": 1.3247146216944874, "grad_norm": 23.03582763671875, "learning_rate": 1.8614321527969578e-05, "loss": 0.1025, "step": 92375 }, { "epoch": 1.3250731371536741, "grad_norm": 3.79671049118042, "learning_rate": 1.8610338014404386e-05, "loss": 0.0575, "step": 92400 }, { "epoch": 1.3254316526128607, "grad_norm": 4.500112533569336, "learning_rate": 1.8606354500839194e-05, "loss": 0.1605, "step": 92425 }, { "epoch": 1.3257901680720472, "grad_norm": 20.02817153930664, "learning_rate": 1.8602370987274e-05, "loss": 0.1986, "step": 92450 }, { "epoch": 1.326148683531234, "grad_norm": 7.165395736694336, "learning_rate": 1.8598387473708813e-05, "loss": 0.1354, "step": 92475 }, { "epoch": 1.3265071989904205, "grad_norm": 16.66243553161621, "learning_rate": 1.859440396014362e-05, "loss": 0.1203, "step": 92500 }, { "epoch": 1.326865714449607, "grad_norm": 0.24623408913612366, "learning_rate": 1.859042044657843e-05, "loss": 0.1828, "step": 92525 }, { "epoch": 1.3272242299087937, "grad_norm": 6.104673862457275, "learning_rate": 1.8586436933013236e-05, "loss": 0.1807, "step": 92550 }, { "epoch": 1.3275827453679803, "grad_norm": 2.615917921066284, "learning_rate": 1.8582453419448044e-05, "loss": 0.1119, "step": 92575 }, { "epoch": 1.3279412608271668, "grad_norm": 6.292456150054932, "learning_rate": 1.8578469905882855e-05, "loss": 0.1194, "step": 92600 }, { "epoch": 1.3282997762863535, "grad_norm": 3.910547971725464, "learning_rate": 1.8574486392317663e-05, "loss": 0.2688, "step": 92625 }, { "epoch": 1.32865829174554, "grad_norm": 0.06475561112165451, "learning_rate": 1.857050287875247e-05, "loss": 0.139, "step": 92650 }, { "epoch": 1.3290168072047266, "grad_norm": 0.2266012579202652, "learning_rate": 1.856651936518728e-05, "loss": 0.1469, "step": 92675 }, { "epoch": 1.3293753226639133, "grad_norm": 16.325576782226562, "learning_rate": 1.8562535851622087e-05, "loss": 0.1145, "step": 92700 }, { "epoch": 1.3297338381230999, "grad_norm": 2.0604405403137207, "learning_rate": 1.8558552338056898e-05, "loss": 0.1087, "step": 92725 }, { "epoch": 1.3300923535822864, "grad_norm": 20.772153854370117, "learning_rate": 1.8554568824491706e-05, "loss": 0.1559, "step": 92750 }, { "epoch": 1.3304508690414731, "grad_norm": 10.557680130004883, "learning_rate": 1.8550585310926514e-05, "loss": 0.0926, "step": 92775 }, { "epoch": 1.3308093845006597, "grad_norm": 2.3884994983673096, "learning_rate": 1.854660179736132e-05, "loss": 0.1729, "step": 92800 }, { "epoch": 1.3311678999598462, "grad_norm": 0.6822266578674316, "learning_rate": 1.8542618283796133e-05, "loss": 0.1337, "step": 92825 }, { "epoch": 1.331526415419033, "grad_norm": 0.35204797983169556, "learning_rate": 1.853863477023094e-05, "loss": 0.1437, "step": 92850 }, { "epoch": 1.3318849308782195, "grad_norm": 0.5553974509239197, "learning_rate": 1.8534651256665745e-05, "loss": 0.1257, "step": 92875 }, { "epoch": 1.332243446337406, "grad_norm": 13.485546112060547, "learning_rate": 1.8530667743100553e-05, "loss": 0.0771, "step": 92900 }, { "epoch": 1.3326019617965927, "grad_norm": 18.138851165771484, "learning_rate": 1.852668422953536e-05, "loss": 0.1892, "step": 92925 }, { "epoch": 1.3329604772557793, "grad_norm": 5.914841175079346, "learning_rate": 1.8522700715970172e-05, "loss": 0.186, "step": 92950 }, { "epoch": 1.3333189927149658, "grad_norm": 2.3938982486724854, "learning_rate": 1.851871720240498e-05, "loss": 0.1409, "step": 92975 }, { "epoch": 1.3336775081741525, "grad_norm": 11.559562683105469, "learning_rate": 1.8514733688839788e-05, "loss": 0.1349, "step": 93000 }, { "epoch": 1.334036023633339, "grad_norm": 2.113804817199707, "learning_rate": 1.8510750175274595e-05, "loss": 0.1361, "step": 93025 }, { "epoch": 1.3343945390925256, "grad_norm": 0.07350136339664459, "learning_rate": 1.8506766661709403e-05, "loss": 0.1189, "step": 93050 }, { "epoch": 1.3347530545517123, "grad_norm": 1.176590085029602, "learning_rate": 1.8502783148144215e-05, "loss": 0.1606, "step": 93075 }, { "epoch": 1.3351115700108989, "grad_norm": 0.15250100195407867, "learning_rate": 1.8498799634579022e-05, "loss": 0.1244, "step": 93100 }, { "epoch": 1.3354700854700854, "grad_norm": 6.1008687019348145, "learning_rate": 1.849481612101383e-05, "loss": 0.1075, "step": 93125 }, { "epoch": 1.3358286009292721, "grad_norm": 5.841319561004639, "learning_rate": 1.8490832607448638e-05, "loss": 0.1673, "step": 93150 }, { "epoch": 1.3361871163884587, "grad_norm": 17.812477111816406, "learning_rate": 1.8486849093883446e-05, "loss": 0.2674, "step": 93175 }, { "epoch": 1.3365456318476452, "grad_norm": 9.545165061950684, "learning_rate": 1.8482865580318257e-05, "loss": 0.0617, "step": 93200 }, { "epoch": 1.336904147306832, "grad_norm": 11.761322975158691, "learning_rate": 1.8478882066753065e-05, "loss": 0.1105, "step": 93225 }, { "epoch": 1.3372626627660185, "grad_norm": 1.0920865535736084, "learning_rate": 1.8474898553187873e-05, "loss": 0.1173, "step": 93250 }, { "epoch": 1.3376211782252052, "grad_norm": 30.906705856323242, "learning_rate": 1.847091503962268e-05, "loss": 0.1867, "step": 93275 }, { "epoch": 1.3379796936843917, "grad_norm": 4.105011940002441, "learning_rate": 1.846693152605749e-05, "loss": 0.0618, "step": 93300 }, { "epoch": 1.3383382091435783, "grad_norm": 0.8862562775611877, "learning_rate": 1.84629480124923e-05, "loss": 0.1387, "step": 93325 }, { "epoch": 1.338696724602765, "grad_norm": 0.4827101230621338, "learning_rate": 1.8458964498927108e-05, "loss": 0.1713, "step": 93350 }, { "epoch": 1.3390552400619515, "grad_norm": 0.4436943829059601, "learning_rate": 1.8454980985361915e-05, "loss": 0.1491, "step": 93375 }, { "epoch": 1.339413755521138, "grad_norm": 4.770129203796387, "learning_rate": 1.8450997471796723e-05, "loss": 0.0987, "step": 93400 }, { "epoch": 1.3397722709803248, "grad_norm": 2.3564178943634033, "learning_rate": 1.8447013958231535e-05, "loss": 0.1411, "step": 93425 }, { "epoch": 1.3401307864395113, "grad_norm": 0.6919806599617004, "learning_rate": 1.8443030444666342e-05, "loss": 0.0856, "step": 93450 }, { "epoch": 1.3404893018986979, "grad_norm": 0.4601726531982422, "learning_rate": 1.843904693110115e-05, "loss": 0.1576, "step": 93475 }, { "epoch": 1.3408478173578846, "grad_norm": 2.3852169513702393, "learning_rate": 1.8435063417535958e-05, "loss": 0.109, "step": 93500 }, { "epoch": 1.3412063328170711, "grad_norm": 13.260693550109863, "learning_rate": 1.8431079903970766e-05, "loss": 0.138, "step": 93525 }, { "epoch": 1.3415648482762577, "grad_norm": 2.3545525074005127, "learning_rate": 1.8427096390405577e-05, "loss": 0.1243, "step": 93550 }, { "epoch": 1.3419233637354444, "grad_norm": 3.062471389770508, "learning_rate": 1.8423112876840385e-05, "loss": 0.1825, "step": 93575 }, { "epoch": 1.342281879194631, "grad_norm": 0.7010043859481812, "learning_rate": 1.8419129363275193e-05, "loss": 0.0822, "step": 93600 }, { "epoch": 1.3426403946538175, "grad_norm": 0.6735103726387024, "learning_rate": 1.841514584971e-05, "loss": 0.1783, "step": 93625 }, { "epoch": 1.3429989101130042, "grad_norm": 0.47724324464797974, "learning_rate": 1.841116233614481e-05, "loss": 0.2199, "step": 93650 }, { "epoch": 1.3433574255721907, "grad_norm": 3.8574886322021484, "learning_rate": 1.840717882257962e-05, "loss": 0.1755, "step": 93675 }, { "epoch": 1.3437159410313773, "grad_norm": 3.009209394454956, "learning_rate": 1.8403195309014428e-05, "loss": 0.1064, "step": 93700 }, { "epoch": 1.344074456490564, "grad_norm": 9.450227737426758, "learning_rate": 1.8399211795449235e-05, "loss": 0.0826, "step": 93725 }, { "epoch": 1.3444329719497505, "grad_norm": 0.20291782915592194, "learning_rate": 1.8395228281884043e-05, "loss": 0.1646, "step": 93750 }, { "epoch": 1.344791487408937, "grad_norm": 14.615718841552734, "learning_rate": 1.839124476831885e-05, "loss": 0.0907, "step": 93775 }, { "epoch": 1.3451500028681238, "grad_norm": 5.69982385635376, "learning_rate": 1.8387261254753662e-05, "loss": 0.1266, "step": 93800 }, { "epoch": 1.3455085183273103, "grad_norm": 12.92162036895752, "learning_rate": 1.838327774118847e-05, "loss": 0.1308, "step": 93825 }, { "epoch": 1.3458670337864969, "grad_norm": 14.467633247375488, "learning_rate": 1.8379294227623278e-05, "loss": 0.1488, "step": 93850 }, { "epoch": 1.3462255492456836, "grad_norm": 0.9277559518814087, "learning_rate": 1.8375310714058086e-05, "loss": 0.1657, "step": 93875 }, { "epoch": 1.3465840647048701, "grad_norm": 15.38683795928955, "learning_rate": 1.8371327200492894e-05, "loss": 0.2105, "step": 93900 }, { "epoch": 1.3469425801640567, "grad_norm": 4.281856060028076, "learning_rate": 1.8367343686927705e-05, "loss": 0.1462, "step": 93925 }, { "epoch": 1.3473010956232434, "grad_norm": 0.38176047801971436, "learning_rate": 1.8363360173362513e-05, "loss": 0.1064, "step": 93950 }, { "epoch": 1.34765961108243, "grad_norm": 4.783679008483887, "learning_rate": 1.8359376659797317e-05, "loss": 0.1045, "step": 93975 }, { "epoch": 1.3480181265416165, "grad_norm": 0.7867108583450317, "learning_rate": 1.8355393146232125e-05, "loss": 0.1094, "step": 94000 }, { "epoch": 1.3483766420008032, "grad_norm": 0.4253251254558563, "learning_rate": 1.8351409632666936e-05, "loss": 0.1308, "step": 94025 }, { "epoch": 1.3487351574599897, "grad_norm": 4.770827293395996, "learning_rate": 1.8347426119101744e-05, "loss": 0.1495, "step": 94050 }, { "epoch": 1.3490936729191763, "grad_norm": 2.2533302307128906, "learning_rate": 1.8343442605536552e-05, "loss": 0.1212, "step": 94075 }, { "epoch": 1.349452188378363, "grad_norm": 19.041114807128906, "learning_rate": 1.833945909197136e-05, "loss": 0.1782, "step": 94100 }, { "epoch": 1.3498107038375495, "grad_norm": 1.0908737182617188, "learning_rate": 1.8335475578406168e-05, "loss": 0.2311, "step": 94125 }, { "epoch": 1.350169219296736, "grad_norm": 0.1328631490468979, "learning_rate": 1.833149206484098e-05, "loss": 0.0911, "step": 94150 }, { "epoch": 1.3505277347559228, "grad_norm": 5.992888450622559, "learning_rate": 1.8327508551275787e-05, "loss": 0.1054, "step": 94175 }, { "epoch": 1.3508862502151093, "grad_norm": 1.011179804801941, "learning_rate": 1.8323525037710595e-05, "loss": 0.149, "step": 94200 }, { "epoch": 1.3512447656742959, "grad_norm": 2.9813528060913086, "learning_rate": 1.8319541524145402e-05, "loss": 0.1387, "step": 94225 }, { "epoch": 1.3516032811334826, "grad_norm": 1.3596179485321045, "learning_rate": 1.831555801058021e-05, "loss": 0.2175, "step": 94250 }, { "epoch": 1.3519617965926691, "grad_norm": 14.206650733947754, "learning_rate": 1.831157449701502e-05, "loss": 0.1796, "step": 94275 }, { "epoch": 1.3523203120518557, "grad_norm": 1.4804487228393555, "learning_rate": 1.830759098344983e-05, "loss": 0.1465, "step": 94300 }, { "epoch": 1.3526788275110424, "grad_norm": 13.94365406036377, "learning_rate": 1.8303607469884637e-05, "loss": 0.1689, "step": 94325 }, { "epoch": 1.353037342970229, "grad_norm": 0.3554743528366089, "learning_rate": 1.8299623956319445e-05, "loss": 0.1998, "step": 94350 }, { "epoch": 1.3533958584294155, "grad_norm": 9.8719482421875, "learning_rate": 1.8295640442754253e-05, "loss": 0.2554, "step": 94375 }, { "epoch": 1.3537543738886022, "grad_norm": 0.6126500964164734, "learning_rate": 1.8291656929189064e-05, "loss": 0.0813, "step": 94400 }, { "epoch": 1.3541128893477887, "grad_norm": 12.084569931030273, "learning_rate": 1.8287673415623872e-05, "loss": 0.1523, "step": 94425 }, { "epoch": 1.3544714048069753, "grad_norm": 0.7571948766708374, "learning_rate": 1.828368990205868e-05, "loss": 0.1191, "step": 94450 }, { "epoch": 1.354829920266162, "grad_norm": 18.671571731567383, "learning_rate": 1.8279706388493488e-05, "loss": 0.1337, "step": 94475 }, { "epoch": 1.3551884357253485, "grad_norm": 16.907665252685547, "learning_rate": 1.8275722874928296e-05, "loss": 0.1447, "step": 94500 }, { "epoch": 1.355546951184535, "grad_norm": 5.415493011474609, "learning_rate": 1.8271739361363107e-05, "loss": 0.1161, "step": 94525 }, { "epoch": 1.3559054666437218, "grad_norm": 0.02720191329717636, "learning_rate": 1.8267755847797915e-05, "loss": 0.1573, "step": 94550 }, { "epoch": 1.3562639821029083, "grad_norm": 1.9127647876739502, "learning_rate": 1.8263772334232722e-05, "loss": 0.1835, "step": 94575 }, { "epoch": 1.3566224975620949, "grad_norm": 5.358430862426758, "learning_rate": 1.825978882066753e-05, "loss": 0.078, "step": 94600 }, { "epoch": 1.3569810130212816, "grad_norm": 3.6211583614349365, "learning_rate": 1.825580530710234e-05, "loss": 0.1493, "step": 94625 }, { "epoch": 1.3573395284804681, "grad_norm": 11.709691047668457, "learning_rate": 1.825182179353715e-05, "loss": 0.1094, "step": 94650 }, { "epoch": 1.3576980439396547, "grad_norm": 0.028753921389579773, "learning_rate": 1.8247838279971957e-05, "loss": 0.1384, "step": 94675 }, { "epoch": 1.3580565593988414, "grad_norm": 5.634369373321533, "learning_rate": 1.8243854766406765e-05, "loss": 0.1683, "step": 94700 }, { "epoch": 1.358415074858028, "grad_norm": 1.1627272367477417, "learning_rate": 1.8239871252841573e-05, "loss": 0.183, "step": 94725 }, { "epoch": 1.3587735903172145, "grad_norm": 5.474667549133301, "learning_rate": 1.8235887739276384e-05, "loss": 0.1065, "step": 94750 }, { "epoch": 1.3591321057764012, "grad_norm": 0.7361723184585571, "learning_rate": 1.8231904225711192e-05, "loss": 0.1147, "step": 94775 }, { "epoch": 1.3594906212355877, "grad_norm": 0.6724950075149536, "learning_rate": 1.8227920712146e-05, "loss": 0.0955, "step": 94800 }, { "epoch": 1.3598491366947743, "grad_norm": 8.854875564575195, "learning_rate": 1.8223937198580808e-05, "loss": 0.1412, "step": 94825 }, { "epoch": 1.360207652153961, "grad_norm": 0.3517484664916992, "learning_rate": 1.8219953685015616e-05, "loss": 0.1633, "step": 94850 }, { "epoch": 1.3605661676131475, "grad_norm": 1.3840314149856567, "learning_rate": 1.8215970171450427e-05, "loss": 0.1787, "step": 94875 }, { "epoch": 1.360924683072334, "grad_norm": 3.465219020843506, "learning_rate": 1.8211986657885235e-05, "loss": 0.136, "step": 94900 }, { "epoch": 1.3612831985315208, "grad_norm": 2.217495918273926, "learning_rate": 1.8208003144320042e-05, "loss": 0.123, "step": 94925 }, { "epoch": 1.3616417139907073, "grad_norm": 6.937696933746338, "learning_rate": 1.820401963075485e-05, "loss": 0.2045, "step": 94950 }, { "epoch": 1.3620002294498939, "grad_norm": 17.570621490478516, "learning_rate": 1.8200036117189658e-05, "loss": 0.1754, "step": 94975 }, { "epoch": 1.3623587449090806, "grad_norm": 2.1597602367401123, "learning_rate": 1.819605260362447e-05, "loss": 0.1493, "step": 95000 }, { "epoch": 1.3627172603682671, "grad_norm": 10.360025405883789, "learning_rate": 1.8192069090059277e-05, "loss": 0.1671, "step": 95025 }, { "epoch": 1.3630757758274537, "grad_norm": 0.7260206341743469, "learning_rate": 1.8188085576494085e-05, "loss": 0.0883, "step": 95050 }, { "epoch": 1.3634342912866404, "grad_norm": 12.603754043579102, "learning_rate": 1.818410206292889e-05, "loss": 0.195, "step": 95075 }, { "epoch": 1.363792806745827, "grad_norm": 0.29941022396087646, "learning_rate": 1.8180118549363697e-05, "loss": 0.1263, "step": 95100 }, { "epoch": 1.3641513222050135, "grad_norm": 4.8222246170043945, "learning_rate": 1.817613503579851e-05, "loss": 0.1542, "step": 95125 }, { "epoch": 1.3645098376642002, "grad_norm": 0.06545911729335785, "learning_rate": 1.8172151522233316e-05, "loss": 0.1911, "step": 95150 }, { "epoch": 1.3648683531233867, "grad_norm": 14.939775466918945, "learning_rate": 1.8168168008668124e-05, "loss": 0.1731, "step": 95175 }, { "epoch": 1.3652268685825732, "grad_norm": 11.035154342651367, "learning_rate": 1.8164184495102932e-05, "loss": 0.1494, "step": 95200 }, { "epoch": 1.36558538404176, "grad_norm": 1.250549554824829, "learning_rate": 1.8160200981537743e-05, "loss": 0.0853, "step": 95225 }, { "epoch": 1.3659438995009465, "grad_norm": 11.201435089111328, "learning_rate": 1.815621746797255e-05, "loss": 0.1295, "step": 95250 }, { "epoch": 1.366302414960133, "grad_norm": 2.192678928375244, "learning_rate": 1.815223395440736e-05, "loss": 0.0949, "step": 95275 }, { "epoch": 1.3666609304193198, "grad_norm": 0.5745987296104431, "learning_rate": 1.8148250440842167e-05, "loss": 0.1096, "step": 95300 }, { "epoch": 1.3670194458785063, "grad_norm": 4.112375259399414, "learning_rate": 1.8144266927276975e-05, "loss": 0.1806, "step": 95325 }, { "epoch": 1.3673779613376928, "grad_norm": 15.719703674316406, "learning_rate": 1.8140283413711786e-05, "loss": 0.1331, "step": 95350 }, { "epoch": 1.3677364767968796, "grad_norm": 0.22638718783855438, "learning_rate": 1.8136299900146594e-05, "loss": 0.0996, "step": 95375 }, { "epoch": 1.3680949922560661, "grad_norm": 0.060156818479299545, "learning_rate": 1.81323163865814e-05, "loss": 0.124, "step": 95400 }, { "epoch": 1.3684535077152526, "grad_norm": 16.85157585144043, "learning_rate": 1.812833287301621e-05, "loss": 0.1771, "step": 95425 }, { "epoch": 1.3688120231744394, "grad_norm": 0.4155464768409729, "learning_rate": 1.8124349359451017e-05, "loss": 0.1085, "step": 95450 }, { "epoch": 1.369170538633626, "grad_norm": 8.55222225189209, "learning_rate": 1.812036584588583e-05, "loss": 0.0948, "step": 95475 }, { "epoch": 1.3695290540928124, "grad_norm": 0.23788613080978394, "learning_rate": 1.8116382332320636e-05, "loss": 0.1146, "step": 95500 }, { "epoch": 1.3698875695519992, "grad_norm": 8.688342094421387, "learning_rate": 1.8112398818755444e-05, "loss": 0.1728, "step": 95525 }, { "epoch": 1.3702460850111857, "grad_norm": 0.3408235013484955, "learning_rate": 1.8108415305190252e-05, "loss": 0.2625, "step": 95550 }, { "epoch": 1.3706046004703722, "grad_norm": 7.018551826477051, "learning_rate": 1.810443179162506e-05, "loss": 0.1586, "step": 95575 }, { "epoch": 1.370963115929559, "grad_norm": 2.975480794906616, "learning_rate": 1.810044827805987e-05, "loss": 0.1464, "step": 95600 }, { "epoch": 1.3713216313887455, "grad_norm": 18.58814811706543, "learning_rate": 1.809646476449468e-05, "loss": 0.1088, "step": 95625 }, { "epoch": 1.371680146847932, "grad_norm": 7.477758407592773, "learning_rate": 1.8092481250929487e-05, "loss": 0.1637, "step": 95650 }, { "epoch": 1.3720386623071188, "grad_norm": 0.6818468570709229, "learning_rate": 1.8088497737364295e-05, "loss": 0.1772, "step": 95675 }, { "epoch": 1.3723971777663053, "grad_norm": 8.772534370422363, "learning_rate": 1.8084514223799103e-05, "loss": 0.0952, "step": 95700 }, { "epoch": 1.3727556932254918, "grad_norm": 13.908625602722168, "learning_rate": 1.8080530710233914e-05, "loss": 0.1313, "step": 95725 }, { "epoch": 1.3731142086846786, "grad_norm": 0.3614901304244995, "learning_rate": 1.807654719666872e-05, "loss": 0.1247, "step": 95750 }, { "epoch": 1.3734727241438651, "grad_norm": 1.3120076656341553, "learning_rate": 1.807256368310353e-05, "loss": 0.1709, "step": 95775 }, { "epoch": 1.3738312396030516, "grad_norm": 8.517166137695312, "learning_rate": 1.8068580169538337e-05, "loss": 0.1662, "step": 95800 }, { "epoch": 1.3741897550622384, "grad_norm": 0.3519255220890045, "learning_rate": 1.806459665597315e-05, "loss": 0.1486, "step": 95825 }, { "epoch": 1.374548270521425, "grad_norm": 1.805113673210144, "learning_rate": 1.8060613142407956e-05, "loss": 0.0938, "step": 95850 }, { "epoch": 1.3749067859806114, "grad_norm": 9.525044441223145, "learning_rate": 1.8056629628842764e-05, "loss": 0.2239, "step": 95875 }, { "epoch": 1.3752653014397982, "grad_norm": 0.437641441822052, "learning_rate": 1.8052646115277572e-05, "loss": 0.1032, "step": 95900 }, { "epoch": 1.3756238168989847, "grad_norm": 15.60258960723877, "learning_rate": 1.804866260171238e-05, "loss": 0.1583, "step": 95925 }, { "epoch": 1.3759823323581712, "grad_norm": 16.82314109802246, "learning_rate": 1.804467908814719e-05, "loss": 0.1281, "step": 95950 }, { "epoch": 1.376340847817358, "grad_norm": 12.246973037719727, "learning_rate": 1.8040695574582e-05, "loss": 0.2238, "step": 95975 }, { "epoch": 1.3766993632765445, "grad_norm": 4.3083953857421875, "learning_rate": 1.8036712061016807e-05, "loss": 0.0717, "step": 96000 }, { "epoch": 1.377057878735731, "grad_norm": 0.1396850198507309, "learning_rate": 1.8032728547451615e-05, "loss": 0.1134, "step": 96025 }, { "epoch": 1.3774163941949178, "grad_norm": 15.375473976135254, "learning_rate": 1.8028745033886423e-05, "loss": 0.1754, "step": 96050 }, { "epoch": 1.3777749096541043, "grad_norm": 0.2836090326309204, "learning_rate": 1.8024761520321234e-05, "loss": 0.0567, "step": 96075 }, { "epoch": 1.3781334251132908, "grad_norm": 29.867692947387695, "learning_rate": 1.802077800675604e-05, "loss": 0.1551, "step": 96100 }, { "epoch": 1.3784919405724776, "grad_norm": 0.5734465718269348, "learning_rate": 1.801679449319085e-05, "loss": 0.0952, "step": 96125 }, { "epoch": 1.3788504560316641, "grad_norm": 1.0513668060302734, "learning_rate": 1.8012810979625657e-05, "loss": 0.1175, "step": 96150 }, { "epoch": 1.3792089714908506, "grad_norm": 4.1107964515686035, "learning_rate": 1.8008827466060462e-05, "loss": 0.0961, "step": 96175 }, { "epoch": 1.3795674869500374, "grad_norm": 12.74854850769043, "learning_rate": 1.8004843952495273e-05, "loss": 0.0898, "step": 96200 }, { "epoch": 1.379926002409224, "grad_norm": 0.2124772071838379, "learning_rate": 1.800086043893008e-05, "loss": 0.1672, "step": 96225 }, { "epoch": 1.3802845178684104, "grad_norm": 1.2649093866348267, "learning_rate": 1.799687692536489e-05, "loss": 0.1763, "step": 96250 }, { "epoch": 1.3806430333275972, "grad_norm": 0.48767346143722534, "learning_rate": 1.7992893411799697e-05, "loss": 0.1185, "step": 96275 }, { "epoch": 1.3810015487867837, "grad_norm": 16.36854362487793, "learning_rate": 1.7988909898234504e-05, "loss": 0.1904, "step": 96300 }, { "epoch": 1.3813600642459702, "grad_norm": 5.230780124664307, "learning_rate": 1.7984926384669316e-05, "loss": 0.1422, "step": 96325 }, { "epoch": 1.381718579705157, "grad_norm": 0.08837287873029709, "learning_rate": 1.7980942871104123e-05, "loss": 0.1668, "step": 96350 }, { "epoch": 1.3820770951643435, "grad_norm": 2.058655023574829, "learning_rate": 1.797695935753893e-05, "loss": 0.0987, "step": 96375 }, { "epoch": 1.38243561062353, "grad_norm": 1.8195172548294067, "learning_rate": 1.797297584397374e-05, "loss": 0.102, "step": 96400 }, { "epoch": 1.3827941260827168, "grad_norm": 25.240835189819336, "learning_rate": 1.7968992330408547e-05, "loss": 0.3259, "step": 96425 }, { "epoch": 1.3831526415419033, "grad_norm": 10.103433609008789, "learning_rate": 1.7965008816843358e-05, "loss": 0.1461, "step": 96450 }, { "epoch": 1.3835111570010898, "grad_norm": 11.55942153930664, "learning_rate": 1.7961025303278166e-05, "loss": 0.1967, "step": 96475 }, { "epoch": 1.3838696724602766, "grad_norm": 20.928810119628906, "learning_rate": 1.7957041789712974e-05, "loss": 0.2198, "step": 96500 }, { "epoch": 1.3842281879194631, "grad_norm": 0.9280039072036743, "learning_rate": 1.7953058276147782e-05, "loss": 0.1485, "step": 96525 }, { "epoch": 1.3845867033786496, "grad_norm": 12.148383140563965, "learning_rate": 1.7949074762582593e-05, "loss": 0.1286, "step": 96550 }, { "epoch": 1.3849452188378364, "grad_norm": 10.852837562561035, "learning_rate": 1.79450912490174e-05, "loss": 0.14, "step": 96575 }, { "epoch": 1.385303734297023, "grad_norm": 9.581513404846191, "learning_rate": 1.794110773545221e-05, "loss": 0.1633, "step": 96600 }, { "epoch": 1.3856622497562094, "grad_norm": 0.01502220332622528, "learning_rate": 1.7937124221887017e-05, "loss": 0.2206, "step": 96625 }, { "epoch": 1.3860207652153962, "grad_norm": 5.977667808532715, "learning_rate": 1.7933140708321824e-05, "loss": 0.1682, "step": 96650 }, { "epoch": 1.3863792806745827, "grad_norm": 4.692214488983154, "learning_rate": 1.7929157194756636e-05, "loss": 0.1764, "step": 96675 }, { "epoch": 1.3867377961337692, "grad_norm": 34.27368927001953, "learning_rate": 1.7925173681191443e-05, "loss": 0.2057, "step": 96700 }, { "epoch": 1.387096311592956, "grad_norm": 3.36759090423584, "learning_rate": 1.792119016762625e-05, "loss": 0.2041, "step": 96725 }, { "epoch": 1.3874548270521425, "grad_norm": 8.181817054748535, "learning_rate": 1.791720665406106e-05, "loss": 0.106, "step": 96750 }, { "epoch": 1.387813342511329, "grad_norm": 0.830839216709137, "learning_rate": 1.7913223140495867e-05, "loss": 0.1939, "step": 96775 }, { "epoch": 1.3881718579705158, "grad_norm": 0.146390900015831, "learning_rate": 1.7909239626930678e-05, "loss": 0.0982, "step": 96800 }, { "epoch": 1.3885303734297023, "grad_norm": 0.24128611385822296, "learning_rate": 1.7905256113365486e-05, "loss": 0.1271, "step": 96825 }, { "epoch": 1.3888888888888888, "grad_norm": 1.1777840852737427, "learning_rate": 1.7901272599800294e-05, "loss": 0.1244, "step": 96850 }, { "epoch": 1.3892474043480756, "grad_norm": 21.934091567993164, "learning_rate": 1.7897289086235102e-05, "loss": 0.1772, "step": 96875 }, { "epoch": 1.3896059198072621, "grad_norm": 0.16477249562740326, "learning_rate": 1.789330557266991e-05, "loss": 0.1302, "step": 96900 }, { "epoch": 1.3899644352664486, "grad_norm": 1.0384751558303833, "learning_rate": 1.788932205910472e-05, "loss": 0.1108, "step": 96925 }, { "epoch": 1.3903229507256354, "grad_norm": 3.950134754180908, "learning_rate": 1.788533854553953e-05, "loss": 0.0714, "step": 96950 }, { "epoch": 1.390681466184822, "grad_norm": 0.1922658234834671, "learning_rate": 1.7881355031974337e-05, "loss": 0.1437, "step": 96975 }, { "epoch": 1.3910399816440084, "grad_norm": 3.1681180000305176, "learning_rate": 1.7877371518409144e-05, "loss": 0.1407, "step": 97000 }, { "epoch": 1.3913984971031952, "grad_norm": 4.285105228424072, "learning_rate": 1.7873388004843952e-05, "loss": 0.3403, "step": 97025 }, { "epoch": 1.3917570125623817, "grad_norm": 4.261404514312744, "learning_rate": 1.7869404491278763e-05, "loss": 0.1013, "step": 97050 }, { "epoch": 1.3921155280215682, "grad_norm": 9.971553802490234, "learning_rate": 1.786542097771357e-05, "loss": 0.1638, "step": 97075 }, { "epoch": 1.392474043480755, "grad_norm": 0.9571086764335632, "learning_rate": 1.786143746414838e-05, "loss": 0.1201, "step": 97100 }, { "epoch": 1.3928325589399415, "grad_norm": 1.4212905168533325, "learning_rate": 1.7857453950583187e-05, "loss": 0.107, "step": 97125 }, { "epoch": 1.393191074399128, "grad_norm": 0.2727715075016022, "learning_rate": 1.7853470437017998e-05, "loss": 0.198, "step": 97150 }, { "epoch": 1.3935495898583148, "grad_norm": 1.400884985923767, "learning_rate": 1.7849486923452806e-05, "loss": 0.0339, "step": 97175 }, { "epoch": 1.3939081053175013, "grad_norm": 8.039134979248047, "learning_rate": 1.7845503409887614e-05, "loss": 0.1122, "step": 97200 }, { "epoch": 1.3942666207766878, "grad_norm": 4.60422420501709, "learning_rate": 1.7841519896322422e-05, "loss": 0.1905, "step": 97225 }, { "epoch": 1.3946251362358746, "grad_norm": 0.5207469463348389, "learning_rate": 1.783753638275723e-05, "loss": 0.1526, "step": 97250 }, { "epoch": 1.394983651695061, "grad_norm": 5.8800482749938965, "learning_rate": 1.7833552869192037e-05, "loss": 0.1026, "step": 97275 }, { "epoch": 1.3953421671542476, "grad_norm": 2.4028687477111816, "learning_rate": 1.7829569355626845e-05, "loss": 0.0559, "step": 97300 }, { "epoch": 1.3957006826134344, "grad_norm": 0.7233050465583801, "learning_rate": 1.7825585842061653e-05, "loss": 0.1308, "step": 97325 }, { "epoch": 1.396059198072621, "grad_norm": 0.7401587963104248, "learning_rate": 1.782160232849646e-05, "loss": 0.0794, "step": 97350 }, { "epoch": 1.3964177135318074, "grad_norm": 15.789946556091309, "learning_rate": 1.781761881493127e-05, "loss": 0.1382, "step": 97375 }, { "epoch": 1.3967762289909942, "grad_norm": 6.65364408493042, "learning_rate": 1.781363530136608e-05, "loss": 0.1732, "step": 97400 }, { "epoch": 1.3971347444501807, "grad_norm": 0.593652606010437, "learning_rate": 1.7809651787800888e-05, "loss": 0.1442, "step": 97425 }, { "epoch": 1.3974932599093672, "grad_norm": 4.5720415115356445, "learning_rate": 1.7805668274235696e-05, "loss": 0.0691, "step": 97450 }, { "epoch": 1.397851775368554, "grad_norm": 11.817107200622559, "learning_rate": 1.7801684760670504e-05, "loss": 0.0999, "step": 97475 }, { "epoch": 1.3982102908277405, "grad_norm": 0.10945223271846771, "learning_rate": 1.779770124710531e-05, "loss": 0.1646, "step": 97500 }, { "epoch": 1.398568806286927, "grad_norm": 10.404165267944336, "learning_rate": 1.7793717733540123e-05, "loss": 0.2574, "step": 97525 }, { "epoch": 1.3989273217461138, "grad_norm": 16.057661056518555, "learning_rate": 1.778973421997493e-05, "loss": 0.1517, "step": 97550 }, { "epoch": 1.3992858372053003, "grad_norm": 0.448637992143631, "learning_rate": 1.778575070640974e-05, "loss": 0.1921, "step": 97575 }, { "epoch": 1.3996443526644868, "grad_norm": 0.8344857096672058, "learning_rate": 1.7781767192844546e-05, "loss": 0.1829, "step": 97600 }, { "epoch": 1.4000028681236736, "grad_norm": 0.37612366676330566, "learning_rate": 1.7777783679279354e-05, "loss": 0.1169, "step": 97625 }, { "epoch": 1.40036138358286, "grad_norm": 4.48245906829834, "learning_rate": 1.7773800165714165e-05, "loss": 0.0857, "step": 97650 }, { "epoch": 1.4007198990420466, "grad_norm": 6.64392614364624, "learning_rate": 1.7769816652148973e-05, "loss": 0.0688, "step": 97675 }, { "epoch": 1.4010784145012334, "grad_norm": 1.430060625076294, "learning_rate": 1.776583313858378e-05, "loss": 0.2307, "step": 97700 }, { "epoch": 1.40143692996042, "grad_norm": 5.893048286437988, "learning_rate": 1.776184962501859e-05, "loss": 0.0906, "step": 97725 }, { "epoch": 1.4017954454196064, "grad_norm": 31.322118759155273, "learning_rate": 1.77578661114534e-05, "loss": 0.0982, "step": 97750 }, { "epoch": 1.4021539608787932, "grad_norm": 0.7028698325157166, "learning_rate": 1.7753882597888208e-05, "loss": 0.159, "step": 97775 }, { "epoch": 1.4025124763379797, "grad_norm": 0.030409904196858406, "learning_rate": 1.7749899084323016e-05, "loss": 0.2183, "step": 97800 }, { "epoch": 1.4028709917971662, "grad_norm": 1.5260009765625, "learning_rate": 1.7745915570757824e-05, "loss": 0.1553, "step": 97825 }, { "epoch": 1.403229507256353, "grad_norm": 8.300485610961914, "learning_rate": 1.774193205719263e-05, "loss": 0.1878, "step": 97850 }, { "epoch": 1.4035880227155395, "grad_norm": 1.9601116180419922, "learning_rate": 1.7737948543627443e-05, "loss": 0.2057, "step": 97875 }, { "epoch": 1.403946538174726, "grad_norm": 12.024136543273926, "learning_rate": 1.773396503006225e-05, "loss": 0.1186, "step": 97900 }, { "epoch": 1.4043050536339128, "grad_norm": 3.0150625705718994, "learning_rate": 1.772998151649706e-05, "loss": 0.215, "step": 97925 }, { "epoch": 1.4046635690930993, "grad_norm": 2.677806854248047, "learning_rate": 1.7725998002931866e-05, "loss": 0.1187, "step": 97950 }, { "epoch": 1.4050220845522858, "grad_norm": 15.400957107543945, "learning_rate": 1.7722014489366674e-05, "loss": 0.1197, "step": 97975 }, { "epoch": 1.4053806000114726, "grad_norm": 0.20437347888946533, "learning_rate": 1.7718030975801485e-05, "loss": 0.1933, "step": 98000 }, { "epoch": 1.405739115470659, "grad_norm": 10.285850524902344, "learning_rate": 1.7714047462236293e-05, "loss": 0.1372, "step": 98025 }, { "epoch": 1.4060976309298456, "grad_norm": 6.20427131652832, "learning_rate": 1.77100639486711e-05, "loss": 0.0896, "step": 98050 }, { "epoch": 1.4064561463890324, "grad_norm": 0.24480728805065155, "learning_rate": 1.770608043510591e-05, "loss": 0.0916, "step": 98075 }, { "epoch": 1.406814661848219, "grad_norm": 0.16115137934684753, "learning_rate": 1.7702096921540717e-05, "loss": 0.1315, "step": 98100 }, { "epoch": 1.4071731773074054, "grad_norm": 7.036706447601318, "learning_rate": 1.7698113407975528e-05, "loss": 0.0775, "step": 98125 }, { "epoch": 1.4075316927665922, "grad_norm": 6.906226634979248, "learning_rate": 1.7694129894410336e-05, "loss": 0.1408, "step": 98150 }, { "epoch": 1.4078902082257787, "grad_norm": 1.9333347082138062, "learning_rate": 1.7690146380845144e-05, "loss": 0.1009, "step": 98175 }, { "epoch": 1.4082487236849652, "grad_norm": 19.23069953918457, "learning_rate": 1.768616286727995e-05, "loss": 0.1407, "step": 98200 }, { "epoch": 1.408607239144152, "grad_norm": 18.305673599243164, "learning_rate": 1.768217935371476e-05, "loss": 0.1272, "step": 98225 }, { "epoch": 1.4089657546033385, "grad_norm": 0.4801602363586426, "learning_rate": 1.767819584014957e-05, "loss": 0.1598, "step": 98250 }, { "epoch": 1.409324270062525, "grad_norm": 1.5923717021942139, "learning_rate": 1.767421232658438e-05, "loss": 0.1139, "step": 98275 }, { "epoch": 1.4096827855217118, "grad_norm": 0.3467610776424408, "learning_rate": 1.7670228813019186e-05, "loss": 0.2026, "step": 98300 }, { "epoch": 1.4100413009808983, "grad_norm": 3.8667781352996826, "learning_rate": 1.7666245299453994e-05, "loss": 0.1281, "step": 98325 }, { "epoch": 1.4103998164400848, "grad_norm": 0.07240750640630722, "learning_rate": 1.7662261785888805e-05, "loss": 0.0725, "step": 98350 }, { "epoch": 1.4107583318992716, "grad_norm": 8.879776954650879, "learning_rate": 1.765827827232361e-05, "loss": 0.2156, "step": 98375 }, { "epoch": 1.411116847358458, "grad_norm": 3.714092969894409, "learning_rate": 1.7654294758758418e-05, "loss": 0.1801, "step": 98400 }, { "epoch": 1.4114753628176446, "grad_norm": 0.3640684485435486, "learning_rate": 1.7650311245193225e-05, "loss": 0.1543, "step": 98425 }, { "epoch": 1.4118338782768314, "grad_norm": 0.5705454349517822, "learning_rate": 1.7646327731628033e-05, "loss": 0.1154, "step": 98450 }, { "epoch": 1.412192393736018, "grad_norm": 12.457152366638184, "learning_rate": 1.7642344218062844e-05, "loss": 0.219, "step": 98475 }, { "epoch": 1.4125509091952044, "grad_norm": 1.1408016681671143, "learning_rate": 1.7638360704497652e-05, "loss": 0.1466, "step": 98500 }, { "epoch": 1.4129094246543912, "grad_norm": 0.6973113417625427, "learning_rate": 1.763437719093246e-05, "loss": 0.1105, "step": 98525 }, { "epoch": 1.4132679401135777, "grad_norm": 5.993745803833008, "learning_rate": 1.7630393677367268e-05, "loss": 0.0518, "step": 98550 }, { "epoch": 1.4136264555727642, "grad_norm": 20.800722122192383, "learning_rate": 1.7626410163802076e-05, "loss": 0.0856, "step": 98575 }, { "epoch": 1.413984971031951, "grad_norm": 17.204360961914062, "learning_rate": 1.7622426650236887e-05, "loss": 0.1183, "step": 98600 }, { "epoch": 1.4143434864911375, "grad_norm": 1.849656105041504, "learning_rate": 1.7618443136671695e-05, "loss": 0.1471, "step": 98625 }, { "epoch": 1.414702001950324, "grad_norm": 21.14556884765625, "learning_rate": 1.7614459623106503e-05, "loss": 0.1453, "step": 98650 }, { "epoch": 1.4150605174095108, "grad_norm": 2.0224947929382324, "learning_rate": 1.761047610954131e-05, "loss": 0.1294, "step": 98675 }, { "epoch": 1.4154190328686973, "grad_norm": 0.4413714110851288, "learning_rate": 1.760649259597612e-05, "loss": 0.14, "step": 98700 }, { "epoch": 1.4157775483278838, "grad_norm": 12.456145286560059, "learning_rate": 1.760250908241093e-05, "loss": 0.1437, "step": 98725 }, { "epoch": 1.4161360637870706, "grad_norm": 5.512345314025879, "learning_rate": 1.7598525568845738e-05, "loss": 0.147, "step": 98750 }, { "epoch": 1.416494579246257, "grad_norm": 0.3534921407699585, "learning_rate": 1.7594542055280545e-05, "loss": 0.1273, "step": 98775 }, { "epoch": 1.4168530947054436, "grad_norm": 0.12701472640037537, "learning_rate": 1.7590558541715353e-05, "loss": 0.1306, "step": 98800 }, { "epoch": 1.4172116101646304, "grad_norm": 2.6868197917938232, "learning_rate": 1.758657502815016e-05, "loss": 0.1154, "step": 98825 }, { "epoch": 1.417570125623817, "grad_norm": 8.694250106811523, "learning_rate": 1.7582591514584972e-05, "loss": 0.1145, "step": 98850 }, { "epoch": 1.4179286410830034, "grad_norm": 2.896136522293091, "learning_rate": 1.757860800101978e-05, "loss": 0.1273, "step": 98875 }, { "epoch": 1.4182871565421902, "grad_norm": 1.142504096031189, "learning_rate": 1.7574624487454588e-05, "loss": 0.1445, "step": 98900 }, { "epoch": 1.4186456720013767, "grad_norm": 14.039558410644531, "learning_rate": 1.7570640973889396e-05, "loss": 0.1454, "step": 98925 }, { "epoch": 1.4190041874605632, "grad_norm": 0.620353639125824, "learning_rate": 1.7566657460324207e-05, "loss": 0.119, "step": 98950 }, { "epoch": 1.41936270291975, "grad_norm": 22.09821128845215, "learning_rate": 1.7562673946759015e-05, "loss": 0.1715, "step": 98975 }, { "epoch": 1.4197212183789365, "grad_norm": 1.1305981874465942, "learning_rate": 1.7558690433193823e-05, "loss": 0.1303, "step": 99000 }, { "epoch": 1.420079733838123, "grad_norm": 0.20681455731391907, "learning_rate": 1.755470691962863e-05, "loss": 0.0941, "step": 99025 }, { "epoch": 1.4204382492973098, "grad_norm": 7.864628791809082, "learning_rate": 1.755072340606344e-05, "loss": 0.1272, "step": 99050 }, { "epoch": 1.4207967647564963, "grad_norm": 0.15577806532382965, "learning_rate": 1.754673989249825e-05, "loss": 0.1205, "step": 99075 }, { "epoch": 1.4211552802156828, "grad_norm": 13.088215827941895, "learning_rate": 1.7542756378933058e-05, "loss": 0.1284, "step": 99100 }, { "epoch": 1.4215137956748696, "grad_norm": 0.30758509039878845, "learning_rate": 1.7538772865367865e-05, "loss": 0.1357, "step": 99125 }, { "epoch": 1.421872311134056, "grad_norm": 4.011868000030518, "learning_rate": 1.7534789351802673e-05, "loss": 0.1027, "step": 99150 }, { "epoch": 1.4222308265932426, "grad_norm": 3.511190414428711, "learning_rate": 1.753080583823748e-05, "loss": 0.0895, "step": 99175 }, { "epoch": 1.4225893420524294, "grad_norm": 0.05877676233649254, "learning_rate": 1.7526822324672292e-05, "loss": 0.1181, "step": 99200 }, { "epoch": 1.422947857511616, "grad_norm": 18.211040496826172, "learning_rate": 1.75228388111071e-05, "loss": 0.1492, "step": 99225 }, { "epoch": 1.4233063729708024, "grad_norm": 0.7056440114974976, "learning_rate": 1.7518855297541908e-05, "loss": 0.1548, "step": 99250 }, { "epoch": 1.4236648884299892, "grad_norm": 1.4854309558868408, "learning_rate": 1.7514871783976716e-05, "loss": 0.1228, "step": 99275 }, { "epoch": 1.4240234038891757, "grad_norm": 0.36173394322395325, "learning_rate": 1.7510888270411524e-05, "loss": 0.1428, "step": 99300 }, { "epoch": 1.4243819193483622, "grad_norm": 0.21158194541931152, "learning_rate": 1.7506904756846335e-05, "loss": 0.1418, "step": 99325 }, { "epoch": 1.424740434807549, "grad_norm": 4.103992938995361, "learning_rate": 1.7502921243281143e-05, "loss": 0.1191, "step": 99350 }, { "epoch": 1.4250989502667355, "grad_norm": 4.29340124130249, "learning_rate": 1.749893772971595e-05, "loss": 0.1637, "step": 99375 }, { "epoch": 1.425457465725922, "grad_norm": 17.63391876220703, "learning_rate": 1.749495421615076e-05, "loss": 0.1339, "step": 99400 }, { "epoch": 1.4258159811851088, "grad_norm": 5.220385551452637, "learning_rate": 1.7490970702585566e-05, "loss": 0.0945, "step": 99425 }, { "epoch": 1.4261744966442953, "grad_norm": 4.3085713386535645, "learning_rate": 1.7486987189020378e-05, "loss": 0.1309, "step": 99450 }, { "epoch": 1.4265330121034818, "grad_norm": 0.08337928354740143, "learning_rate": 1.7483003675455182e-05, "loss": 0.1796, "step": 99475 }, { "epoch": 1.4268915275626686, "grad_norm": 8.219565391540527, "learning_rate": 1.747902016188999e-05, "loss": 0.124, "step": 99500 }, { "epoch": 1.427250043021855, "grad_norm": 12.554691314697266, "learning_rate": 1.7475036648324798e-05, "loss": 0.1414, "step": 99525 }, { "epoch": 1.4276085584810416, "grad_norm": 10.605162620544434, "learning_rate": 1.747105313475961e-05, "loss": 0.2117, "step": 99550 }, { "epoch": 1.4279670739402284, "grad_norm": 0.8879551887512207, "learning_rate": 1.7467069621194417e-05, "loss": 0.1325, "step": 99575 }, { "epoch": 1.428325589399415, "grad_norm": 0.6942654848098755, "learning_rate": 1.7463086107629225e-05, "loss": 0.1545, "step": 99600 }, { "epoch": 1.4286841048586014, "grad_norm": 1.1824434995651245, "learning_rate": 1.7459102594064032e-05, "loss": 0.1305, "step": 99625 }, { "epoch": 1.4290426203177882, "grad_norm": 8.308156967163086, "learning_rate": 1.745511908049884e-05, "loss": 0.0772, "step": 99650 }, { "epoch": 1.4294011357769747, "grad_norm": 12.928093910217285, "learning_rate": 1.745113556693365e-05, "loss": 0.145, "step": 99675 }, { "epoch": 1.4297596512361612, "grad_norm": 4.378645420074463, "learning_rate": 1.744715205336846e-05, "loss": 0.0745, "step": 99700 }, { "epoch": 1.430118166695348, "grad_norm": 22.77495002746582, "learning_rate": 1.7443168539803267e-05, "loss": 0.1786, "step": 99725 }, { "epoch": 1.4304766821545345, "grad_norm": 0.2385689616203308, "learning_rate": 1.7439185026238075e-05, "loss": 0.1121, "step": 99750 }, { "epoch": 1.430835197613721, "grad_norm": 1.6279488801956177, "learning_rate": 1.7435201512672883e-05, "loss": 0.127, "step": 99775 }, { "epoch": 1.4311937130729078, "grad_norm": 0.48709097504615784, "learning_rate": 1.7431217999107694e-05, "loss": 0.1105, "step": 99800 }, { "epoch": 1.4315522285320943, "grad_norm": 1.1969281435012817, "learning_rate": 1.7427234485542502e-05, "loss": 0.1626, "step": 99825 }, { "epoch": 1.4319107439912808, "grad_norm": 0.9384445548057556, "learning_rate": 1.742325097197731e-05, "loss": 0.0844, "step": 99850 }, { "epoch": 1.4322692594504676, "grad_norm": 2.9139018058776855, "learning_rate": 1.7419267458412118e-05, "loss": 0.1561, "step": 99875 }, { "epoch": 1.432627774909654, "grad_norm": 0.42732271552085876, "learning_rate": 1.7415283944846926e-05, "loss": 0.1226, "step": 99900 }, { "epoch": 1.4329862903688406, "grad_norm": 0.10873616486787796, "learning_rate": 1.7411300431281737e-05, "loss": 0.1039, "step": 99925 }, { "epoch": 1.4333448058280274, "grad_norm": 0.6240264773368835, "learning_rate": 1.7407316917716545e-05, "loss": 0.1039, "step": 99950 }, { "epoch": 1.433703321287214, "grad_norm": 16.25092124938965, "learning_rate": 1.7403333404151352e-05, "loss": 0.2333, "step": 99975 }, { "epoch": 1.4340618367464004, "grad_norm": 0.7640058398246765, "learning_rate": 1.739934989058616e-05, "loss": 0.1511, "step": 100000 }, { "epoch": 1.4344203522055872, "grad_norm": 3.9590697288513184, "learning_rate": 1.7395366377020968e-05, "loss": 0.1295, "step": 100025 }, { "epoch": 1.4347788676647737, "grad_norm": 0.24715256690979004, "learning_rate": 1.739138286345578e-05, "loss": 0.1556, "step": 100050 }, { "epoch": 1.4351373831239602, "grad_norm": 5.6614766120910645, "learning_rate": 1.7387399349890587e-05, "loss": 0.131, "step": 100075 }, { "epoch": 1.435495898583147, "grad_norm": 1.6775022745132446, "learning_rate": 1.7383415836325395e-05, "loss": 0.12, "step": 100100 }, { "epoch": 1.4358544140423335, "grad_norm": 0.06130113825201988, "learning_rate": 1.7379432322760203e-05, "loss": 0.0867, "step": 100125 }, { "epoch": 1.43621292950152, "grad_norm": 3.915959596633911, "learning_rate": 1.7375448809195014e-05, "loss": 0.1532, "step": 100150 }, { "epoch": 1.4365714449607068, "grad_norm": 15.098173141479492, "learning_rate": 1.7371465295629822e-05, "loss": 0.1285, "step": 100175 }, { "epoch": 1.4369299604198933, "grad_norm": 5.780630588531494, "learning_rate": 1.736748178206463e-05, "loss": 0.2491, "step": 100200 }, { "epoch": 1.4372884758790798, "grad_norm": 2.104778528213501, "learning_rate": 1.7363498268499438e-05, "loss": 0.1102, "step": 100225 }, { "epoch": 1.4376469913382666, "grad_norm": 1.371198058128357, "learning_rate": 1.7359514754934246e-05, "loss": 0.1189, "step": 100250 }, { "epoch": 1.438005506797453, "grad_norm": 15.001352310180664, "learning_rate": 1.7355531241369057e-05, "loss": 0.1099, "step": 100275 }, { "epoch": 1.4383640222566396, "grad_norm": 24.724153518676758, "learning_rate": 1.7351547727803865e-05, "loss": 0.1248, "step": 100300 }, { "epoch": 1.4387225377158264, "grad_norm": 1.6072313785552979, "learning_rate": 1.7347564214238672e-05, "loss": 0.1198, "step": 100325 }, { "epoch": 1.439081053175013, "grad_norm": 11.067790031433105, "learning_rate": 1.734358070067348e-05, "loss": 0.1857, "step": 100350 }, { "epoch": 1.4394395686341994, "grad_norm": 5.764673709869385, "learning_rate": 1.7339597187108288e-05, "loss": 0.1192, "step": 100375 }, { "epoch": 1.4397980840933862, "grad_norm": 11.562943458557129, "learning_rate": 1.73356136735431e-05, "loss": 0.1936, "step": 100400 }, { "epoch": 1.4401565995525727, "grad_norm": 4.55534029006958, "learning_rate": 1.7331630159977907e-05, "loss": 0.1375, "step": 100425 }, { "epoch": 1.4405151150117592, "grad_norm": 6.8071980476379395, "learning_rate": 1.7327646646412715e-05, "loss": 0.0799, "step": 100450 }, { "epoch": 1.440873630470946, "grad_norm": 0.4990479648113251, "learning_rate": 1.7323663132847523e-05, "loss": 0.1481, "step": 100475 }, { "epoch": 1.4412321459301325, "grad_norm": 1.7933461666107178, "learning_rate": 1.731967961928233e-05, "loss": 0.0964, "step": 100500 }, { "epoch": 1.441590661389319, "grad_norm": 3.0383787155151367, "learning_rate": 1.7315696105717142e-05, "loss": 0.1754, "step": 100525 }, { "epoch": 1.4419491768485058, "grad_norm": 0.44577276706695557, "learning_rate": 1.731171259215195e-05, "loss": 0.2049, "step": 100550 }, { "epoch": 1.4423076923076923, "grad_norm": 0.6673192381858826, "learning_rate": 1.7307729078586754e-05, "loss": 0.1461, "step": 100575 }, { "epoch": 1.4426662077668788, "grad_norm": 1.5579510927200317, "learning_rate": 1.7303745565021562e-05, "loss": 0.1346, "step": 100600 }, { "epoch": 1.4430247232260656, "grad_norm": 0.05231652781367302, "learning_rate": 1.729976205145637e-05, "loss": 0.1104, "step": 100625 }, { "epoch": 1.443383238685252, "grad_norm": 1.3849636316299438, "learning_rate": 1.729577853789118e-05, "loss": 0.143, "step": 100650 }, { "epoch": 1.4437417541444386, "grad_norm": 0.06602934747934341, "learning_rate": 1.729179502432599e-05, "loss": 0.1875, "step": 100675 }, { "epoch": 1.4441002696036254, "grad_norm": 1.0150123834609985, "learning_rate": 1.7287811510760797e-05, "loss": 0.2456, "step": 100700 }, { "epoch": 1.444458785062812, "grad_norm": 0.11881639808416367, "learning_rate": 1.7283827997195605e-05, "loss": 0.0566, "step": 100725 }, { "epoch": 1.4448173005219984, "grad_norm": 0.12688814103603363, "learning_rate": 1.7279844483630416e-05, "loss": 0.0791, "step": 100750 }, { "epoch": 1.4451758159811852, "grad_norm": 8.412961959838867, "learning_rate": 1.7275860970065224e-05, "loss": 0.1875, "step": 100775 }, { "epoch": 1.4455343314403717, "grad_norm": 1.1036314964294434, "learning_rate": 1.727187745650003e-05, "loss": 0.1131, "step": 100800 }, { "epoch": 1.4458928468995582, "grad_norm": 16.947895050048828, "learning_rate": 1.726789394293484e-05, "loss": 0.092, "step": 100825 }, { "epoch": 1.446251362358745, "grad_norm": 0.9933703541755676, "learning_rate": 1.7263910429369647e-05, "loss": 0.1787, "step": 100850 }, { "epoch": 1.4466098778179315, "grad_norm": 15.386799812316895, "learning_rate": 1.725992691580446e-05, "loss": 0.1644, "step": 100875 }, { "epoch": 1.446968393277118, "grad_norm": 6.452852725982666, "learning_rate": 1.7255943402239266e-05, "loss": 0.1561, "step": 100900 }, { "epoch": 1.4473269087363048, "grad_norm": 1.0642117261886597, "learning_rate": 1.7251959888674074e-05, "loss": 0.162, "step": 100925 }, { "epoch": 1.4476854241954913, "grad_norm": 0.9008466601371765, "learning_rate": 1.7247976375108882e-05, "loss": 0.0872, "step": 100950 }, { "epoch": 1.4480439396546778, "grad_norm": 12.312065124511719, "learning_rate": 1.724399286154369e-05, "loss": 0.1848, "step": 100975 }, { "epoch": 1.4484024551138646, "grad_norm": 1.5553295612335205, "learning_rate": 1.72400093479785e-05, "loss": 0.1311, "step": 101000 }, { "epoch": 1.448760970573051, "grad_norm": 15.997153282165527, "learning_rate": 1.723602583441331e-05, "loss": 0.1097, "step": 101025 }, { "epoch": 1.4491194860322376, "grad_norm": 9.182489395141602, "learning_rate": 1.7232042320848117e-05, "loss": 0.1248, "step": 101050 }, { "epoch": 1.4494780014914244, "grad_norm": 1.9110342264175415, "learning_rate": 1.7228058807282925e-05, "loss": 0.1784, "step": 101075 }, { "epoch": 1.4498365169506109, "grad_norm": 7.801819324493408, "learning_rate": 1.7224075293717733e-05, "loss": 0.0892, "step": 101100 }, { "epoch": 1.4501950324097974, "grad_norm": 1.9646929502487183, "learning_rate": 1.7220091780152544e-05, "loss": 0.1446, "step": 101125 }, { "epoch": 1.4505535478689842, "grad_norm": 3.531019687652588, "learning_rate": 1.721610826658735e-05, "loss": 0.141, "step": 101150 }, { "epoch": 1.4509120633281707, "grad_norm": 11.955479621887207, "learning_rate": 1.721212475302216e-05, "loss": 0.1702, "step": 101175 }, { "epoch": 1.4512705787873572, "grad_norm": 3.8361988067626953, "learning_rate": 1.7208141239456967e-05, "loss": 0.1598, "step": 101200 }, { "epoch": 1.451629094246544, "grad_norm": 6.502337455749512, "learning_rate": 1.7204157725891775e-05, "loss": 0.1753, "step": 101225 }, { "epoch": 1.4519876097057305, "grad_norm": 10.278690338134766, "learning_rate": 1.7200174212326586e-05, "loss": 0.1379, "step": 101250 }, { "epoch": 1.452346125164917, "grad_norm": 19.104949951171875, "learning_rate": 1.7196190698761394e-05, "loss": 0.1352, "step": 101275 }, { "epoch": 1.4527046406241038, "grad_norm": 1.862924575805664, "learning_rate": 1.7192207185196202e-05, "loss": 0.202, "step": 101300 }, { "epoch": 1.4530631560832903, "grad_norm": 0.2450697422027588, "learning_rate": 1.718822367163101e-05, "loss": 0.0729, "step": 101325 }, { "epoch": 1.4534216715424768, "grad_norm": 1.2381054162979126, "learning_rate": 1.7184240158065818e-05, "loss": 0.1652, "step": 101350 }, { "epoch": 1.4537801870016636, "grad_norm": 1.0049619674682617, "learning_rate": 1.718025664450063e-05, "loss": 0.1825, "step": 101375 }, { "epoch": 1.45413870246085, "grad_norm": 2.5470547676086426, "learning_rate": 1.7176273130935437e-05, "loss": 0.1034, "step": 101400 }, { "epoch": 1.4544972179200366, "grad_norm": 2.302694082260132, "learning_rate": 1.7172289617370245e-05, "loss": 0.1096, "step": 101425 }, { "epoch": 1.4548557333792234, "grad_norm": 6.748882293701172, "learning_rate": 1.7168306103805053e-05, "loss": 0.0907, "step": 101450 }, { "epoch": 1.4552142488384099, "grad_norm": 1.3086793422698975, "learning_rate": 1.7164322590239864e-05, "loss": 0.1435, "step": 101475 }, { "epoch": 1.4555727642975964, "grad_norm": 8.856730461120605, "learning_rate": 1.716033907667467e-05, "loss": 0.1026, "step": 101500 }, { "epoch": 1.4559312797567832, "grad_norm": 3.880776882171631, "learning_rate": 1.715635556310948e-05, "loss": 0.141, "step": 101525 }, { "epoch": 1.4562897952159697, "grad_norm": 12.666852951049805, "learning_rate": 1.7152372049544287e-05, "loss": 0.0936, "step": 101550 }, { "epoch": 1.4566483106751562, "grad_norm": 1.2406543493270874, "learning_rate": 1.7148388535979095e-05, "loss": 0.2556, "step": 101575 }, { "epoch": 1.457006826134343, "grad_norm": 3.014096736907959, "learning_rate": 1.7144405022413906e-05, "loss": 0.0774, "step": 101600 }, { "epoch": 1.4573653415935295, "grad_norm": 8.491689682006836, "learning_rate": 1.7140421508848714e-05, "loss": 0.2263, "step": 101625 }, { "epoch": 1.457723857052716, "grad_norm": 0.7074673175811768, "learning_rate": 1.7136437995283522e-05, "loss": 0.1284, "step": 101650 }, { "epoch": 1.4580823725119028, "grad_norm": 0.2783837616443634, "learning_rate": 1.7132454481718327e-05, "loss": 0.1846, "step": 101675 }, { "epoch": 1.4584408879710893, "grad_norm": 13.88641357421875, "learning_rate": 1.7128470968153134e-05, "loss": 0.1477, "step": 101700 }, { "epoch": 1.4587994034302758, "grad_norm": 9.735495567321777, "learning_rate": 1.7124487454587946e-05, "loss": 0.1756, "step": 101725 }, { "epoch": 1.4591579188894626, "grad_norm": 10.628706932067871, "learning_rate": 1.7120503941022753e-05, "loss": 0.1177, "step": 101750 }, { "epoch": 1.459516434348649, "grad_norm": 13.581022262573242, "learning_rate": 1.711652042745756e-05, "loss": 0.1488, "step": 101775 }, { "epoch": 1.4598749498078356, "grad_norm": 17.509138107299805, "learning_rate": 1.711253691389237e-05, "loss": 0.1673, "step": 101800 }, { "epoch": 1.4602334652670224, "grad_norm": 11.964263916015625, "learning_rate": 1.7108553400327177e-05, "loss": 0.1014, "step": 101825 }, { "epoch": 1.4605919807262089, "grad_norm": 1.3530901670455933, "learning_rate": 1.7104569886761988e-05, "loss": 0.179, "step": 101850 }, { "epoch": 1.4609504961853954, "grad_norm": 0.6862311363220215, "learning_rate": 1.7100586373196796e-05, "loss": 0.1611, "step": 101875 }, { "epoch": 1.4613090116445822, "grad_norm": 3.3574838638305664, "learning_rate": 1.7096602859631604e-05, "loss": 0.1357, "step": 101900 }, { "epoch": 1.4616675271037687, "grad_norm": 6.220373630523682, "learning_rate": 1.7092619346066412e-05, "loss": 0.2062, "step": 101925 }, { "epoch": 1.4620260425629552, "grad_norm": 6.09192419052124, "learning_rate": 1.708863583250122e-05, "loss": 0.2502, "step": 101950 }, { "epoch": 1.462384558022142, "grad_norm": 11.390238761901855, "learning_rate": 1.708465231893603e-05, "loss": 0.1416, "step": 101975 }, { "epoch": 1.4627430734813285, "grad_norm": 3.101642370223999, "learning_rate": 1.708066880537084e-05, "loss": 0.1043, "step": 102000 }, { "epoch": 1.463101588940515, "grad_norm": 17.867521286010742, "learning_rate": 1.7076685291805647e-05, "loss": 0.1553, "step": 102025 }, { "epoch": 1.4634601043997018, "grad_norm": 0.8134332299232483, "learning_rate": 1.7072701778240454e-05, "loss": 0.2172, "step": 102050 }, { "epoch": 1.4638186198588883, "grad_norm": 0.552063524723053, "learning_rate": 1.7068718264675266e-05, "loss": 0.0754, "step": 102075 }, { "epoch": 1.4641771353180748, "grad_norm": 6.897924900054932, "learning_rate": 1.7064734751110073e-05, "loss": 0.1594, "step": 102100 }, { "epoch": 1.4645356507772616, "grad_norm": 1.228391170501709, "learning_rate": 1.706075123754488e-05, "loss": 0.1108, "step": 102125 }, { "epoch": 1.464894166236448, "grad_norm": 1.1873393058776855, "learning_rate": 1.705676772397969e-05, "loss": 0.1686, "step": 102150 }, { "epoch": 1.4652526816956346, "grad_norm": 0.3889513909816742, "learning_rate": 1.7052784210414497e-05, "loss": 0.1982, "step": 102175 }, { "epoch": 1.4656111971548214, "grad_norm": 1.405163288116455, "learning_rate": 1.7048800696849308e-05, "loss": 0.1295, "step": 102200 }, { "epoch": 1.4659697126140079, "grad_norm": 0.5591996312141418, "learning_rate": 1.7044817183284116e-05, "loss": 0.235, "step": 102225 }, { "epoch": 1.4663282280731944, "grad_norm": 0.5024827718734741, "learning_rate": 1.7040833669718924e-05, "loss": 0.0665, "step": 102250 }, { "epoch": 1.4666867435323812, "grad_norm": 3.659933090209961, "learning_rate": 1.7036850156153732e-05, "loss": 0.0872, "step": 102275 }, { "epoch": 1.4670452589915677, "grad_norm": 4.556118965148926, "learning_rate": 1.703286664258854e-05, "loss": 0.1892, "step": 102300 }, { "epoch": 1.4674037744507542, "grad_norm": 0.10897117108106613, "learning_rate": 1.702888312902335e-05, "loss": 0.093, "step": 102325 }, { "epoch": 1.467762289909941, "grad_norm": 4.482096195220947, "learning_rate": 1.702489961545816e-05, "loss": 0.0846, "step": 102350 }, { "epoch": 1.4681208053691275, "grad_norm": 17.729995727539062, "learning_rate": 1.7020916101892967e-05, "loss": 0.1639, "step": 102375 }, { "epoch": 1.468479320828314, "grad_norm": 10.309479713439941, "learning_rate": 1.7016932588327774e-05, "loss": 0.0885, "step": 102400 }, { "epoch": 1.4688378362875008, "grad_norm": 15.881339073181152, "learning_rate": 1.7012949074762582e-05, "loss": 0.1091, "step": 102425 }, { "epoch": 1.4691963517466873, "grad_norm": 0.33096930384635925, "learning_rate": 1.7008965561197393e-05, "loss": 0.1264, "step": 102450 }, { "epoch": 1.4695548672058738, "grad_norm": 11.325957298278809, "learning_rate": 1.70049820476322e-05, "loss": 0.2013, "step": 102475 }, { "epoch": 1.4699133826650606, "grad_norm": 5.283063888549805, "learning_rate": 1.700099853406701e-05, "loss": 0.1825, "step": 102500 }, { "epoch": 1.470271898124247, "grad_norm": 16.71765899658203, "learning_rate": 1.6997015020501817e-05, "loss": 0.1213, "step": 102525 }, { "epoch": 1.4706304135834336, "grad_norm": 5.170689582824707, "learning_rate": 1.6993031506936625e-05, "loss": 0.0746, "step": 102550 }, { "epoch": 1.4709889290426204, "grad_norm": 6.013514041900635, "learning_rate": 1.6989047993371436e-05, "loss": 0.122, "step": 102575 }, { "epoch": 1.4713474445018069, "grad_norm": 2.1973586082458496, "learning_rate": 1.6985064479806244e-05, "loss": 0.2056, "step": 102600 }, { "epoch": 1.4717059599609934, "grad_norm": 0.22125761210918427, "learning_rate": 1.6981080966241052e-05, "loss": 0.1396, "step": 102625 }, { "epoch": 1.4720644754201802, "grad_norm": 0.054420873522758484, "learning_rate": 1.697709745267586e-05, "loss": 0.1042, "step": 102650 }, { "epoch": 1.4724229908793667, "grad_norm": 2.1377108097076416, "learning_rate": 1.697311393911067e-05, "loss": 0.081, "step": 102675 }, { "epoch": 1.4727815063385532, "grad_norm": 8.993789672851562, "learning_rate": 1.696913042554548e-05, "loss": 0.1356, "step": 102700 }, { "epoch": 1.47314002179774, "grad_norm": 1.9586684703826904, "learning_rate": 1.6965146911980286e-05, "loss": 0.0812, "step": 102725 }, { "epoch": 1.4734985372569265, "grad_norm": 5.127080917358398, "learning_rate": 1.6961163398415094e-05, "loss": 0.1011, "step": 102750 }, { "epoch": 1.473857052716113, "grad_norm": 0.0703996941447258, "learning_rate": 1.69571798848499e-05, "loss": 0.0884, "step": 102775 }, { "epoch": 1.4742155681752998, "grad_norm": 2.856010675430298, "learning_rate": 1.695319637128471e-05, "loss": 0.1678, "step": 102800 }, { "epoch": 1.4745740836344863, "grad_norm": 9.532306671142578, "learning_rate": 1.6949212857719518e-05, "loss": 0.1293, "step": 102825 }, { "epoch": 1.4749325990936728, "grad_norm": 2.017704725265503, "learning_rate": 1.6945229344154326e-05, "loss": 0.1139, "step": 102850 }, { "epoch": 1.4752911145528596, "grad_norm": 0.43912577629089355, "learning_rate": 1.6941245830589134e-05, "loss": 0.1873, "step": 102875 }, { "epoch": 1.475649630012046, "grad_norm": 1.4036142826080322, "learning_rate": 1.693726231702394e-05, "loss": 0.1423, "step": 102900 }, { "epoch": 1.4760081454712326, "grad_norm": 16.89090347290039, "learning_rate": 1.6933278803458753e-05, "loss": 0.1169, "step": 102925 }, { "epoch": 1.4763666609304193, "grad_norm": 1.5751965045928955, "learning_rate": 1.692929528989356e-05, "loss": 0.0764, "step": 102950 }, { "epoch": 1.4767251763896059, "grad_norm": 0.5289820432662964, "learning_rate": 1.692531177632837e-05, "loss": 0.1539, "step": 102975 }, { "epoch": 1.4770836918487924, "grad_norm": 0.013346397317945957, "learning_rate": 1.6921328262763176e-05, "loss": 0.1806, "step": 103000 }, { "epoch": 1.4774422073079791, "grad_norm": 2.1880955696105957, "learning_rate": 1.6917344749197984e-05, "loss": 0.1292, "step": 103025 }, { "epoch": 1.4778007227671657, "grad_norm": 9.137344360351562, "learning_rate": 1.6913361235632795e-05, "loss": 0.1663, "step": 103050 }, { "epoch": 1.4781592382263522, "grad_norm": 0.6316223740577698, "learning_rate": 1.6909377722067603e-05, "loss": 0.1054, "step": 103075 }, { "epoch": 1.478517753685539, "grad_norm": 2.814465045928955, "learning_rate": 1.690539420850241e-05, "loss": 0.1177, "step": 103100 }, { "epoch": 1.4788762691447255, "grad_norm": 0.6072667837142944, "learning_rate": 1.690141069493722e-05, "loss": 0.0476, "step": 103125 }, { "epoch": 1.479234784603912, "grad_norm": 10.228572845458984, "learning_rate": 1.6897427181372027e-05, "loss": 0.1666, "step": 103150 }, { "epoch": 1.4795933000630987, "grad_norm": 15.29851245880127, "learning_rate": 1.6893443667806838e-05, "loss": 0.1182, "step": 103175 }, { "epoch": 1.4799518155222853, "grad_norm": 0.982728123664856, "learning_rate": 1.6889460154241646e-05, "loss": 0.1118, "step": 103200 }, { "epoch": 1.4803103309814718, "grad_norm": 15.90556526184082, "learning_rate": 1.6885476640676454e-05, "loss": 0.1893, "step": 103225 }, { "epoch": 1.4806688464406585, "grad_norm": 12.835931777954102, "learning_rate": 1.688149312711126e-05, "loss": 0.144, "step": 103250 }, { "epoch": 1.481027361899845, "grad_norm": 2.899043083190918, "learning_rate": 1.6877509613546073e-05, "loss": 0.1102, "step": 103275 }, { "epoch": 1.4813858773590316, "grad_norm": 0.21937085688114166, "learning_rate": 1.687352609998088e-05, "loss": 0.1307, "step": 103300 }, { "epoch": 1.4817443928182183, "grad_norm": 2.2328293323516846, "learning_rate": 1.686954258641569e-05, "loss": 0.0741, "step": 103325 }, { "epoch": 1.4821029082774049, "grad_norm": 0.8787546753883362, "learning_rate": 1.6865559072850496e-05, "loss": 0.1175, "step": 103350 }, { "epoch": 1.4824614237365914, "grad_norm": 2.6155009269714355, "learning_rate": 1.6861575559285304e-05, "loss": 0.0587, "step": 103375 }, { "epoch": 1.4828199391957781, "grad_norm": 22.485809326171875, "learning_rate": 1.6857592045720115e-05, "loss": 0.1199, "step": 103400 }, { "epoch": 1.4831784546549647, "grad_norm": 2.822486162185669, "learning_rate": 1.6853608532154923e-05, "loss": 0.1191, "step": 103425 }, { "epoch": 1.4835369701141512, "grad_norm": 6.198246002197266, "learning_rate": 1.684962501858973e-05, "loss": 0.0874, "step": 103450 }, { "epoch": 1.483895485573338, "grad_norm": 20.44668197631836, "learning_rate": 1.684564150502454e-05, "loss": 0.1412, "step": 103475 }, { "epoch": 1.4842540010325245, "grad_norm": 3.0741798877716064, "learning_rate": 1.6841657991459347e-05, "loss": 0.0784, "step": 103500 }, { "epoch": 1.484612516491711, "grad_norm": 2.7809581756591797, "learning_rate": 1.6837674477894158e-05, "loss": 0.2691, "step": 103525 }, { "epoch": 1.4849710319508977, "grad_norm": 3.147624969482422, "learning_rate": 1.6833690964328966e-05, "loss": 0.1278, "step": 103550 }, { "epoch": 1.4853295474100843, "grad_norm": 1.7379767894744873, "learning_rate": 1.6829707450763774e-05, "loss": 0.1385, "step": 103575 }, { "epoch": 1.4856880628692708, "grad_norm": 3.5464205741882324, "learning_rate": 1.682572393719858e-05, "loss": 0.1386, "step": 103600 }, { "epoch": 1.4860465783284575, "grad_norm": 1.3103996515274048, "learning_rate": 1.682174042363339e-05, "loss": 0.0924, "step": 103625 }, { "epoch": 1.486405093787644, "grad_norm": 1.8666188716888428, "learning_rate": 1.68177569100682e-05, "loss": 0.2266, "step": 103650 }, { "epoch": 1.4867636092468306, "grad_norm": 0.269774466753006, "learning_rate": 1.681377339650301e-05, "loss": 0.1316, "step": 103675 }, { "epoch": 1.4871221247060173, "grad_norm": 1.4790385961532593, "learning_rate": 1.6809789882937816e-05, "loss": 0.0858, "step": 103700 }, { "epoch": 1.4874806401652039, "grad_norm": 2.2494959831237793, "learning_rate": 1.6805806369372624e-05, "loss": 0.1179, "step": 103725 }, { "epoch": 1.4878391556243904, "grad_norm": 3.768021821975708, "learning_rate": 1.6801822855807432e-05, "loss": 0.1374, "step": 103750 }, { "epoch": 1.4881976710835771, "grad_norm": 19.06175422668457, "learning_rate": 1.6797839342242243e-05, "loss": 0.1525, "step": 103775 }, { "epoch": 1.4885561865427637, "grad_norm": 4.485994338989258, "learning_rate": 1.679385582867705e-05, "loss": 0.0732, "step": 103800 }, { "epoch": 1.4889147020019502, "grad_norm": 0.4235735535621643, "learning_rate": 1.678987231511186e-05, "loss": 0.1067, "step": 103825 }, { "epoch": 1.489273217461137, "grad_norm": 1.887853980064392, "learning_rate": 1.6785888801546667e-05, "loss": 0.1077, "step": 103850 }, { "epoch": 1.4896317329203235, "grad_norm": 0.21031306684017181, "learning_rate": 1.6781905287981474e-05, "loss": 0.1392, "step": 103875 }, { "epoch": 1.48999024837951, "grad_norm": 1.5551890134811401, "learning_rate": 1.6777921774416282e-05, "loss": 0.1253, "step": 103900 }, { "epoch": 1.4903487638386967, "grad_norm": 11.837270736694336, "learning_rate": 1.677393826085109e-05, "loss": 0.106, "step": 103925 }, { "epoch": 1.4907072792978833, "grad_norm": 0.750561535358429, "learning_rate": 1.6769954747285898e-05, "loss": 0.0892, "step": 103950 }, { "epoch": 1.4910657947570698, "grad_norm": 14.524494171142578, "learning_rate": 1.6765971233720706e-05, "loss": 0.1766, "step": 103975 }, { "epoch": 1.4914243102162565, "grad_norm": 1.4683682918548584, "learning_rate": 1.6761987720155517e-05, "loss": 0.1079, "step": 104000 }, { "epoch": 1.491782825675443, "grad_norm": 4.788580894470215, "learning_rate": 1.6758004206590325e-05, "loss": 0.1279, "step": 104025 }, { "epoch": 1.4921413411346296, "grad_norm": 18.03231430053711, "learning_rate": 1.6754020693025133e-05, "loss": 0.1081, "step": 104050 }, { "epoch": 1.4924998565938163, "grad_norm": 1.7889765501022339, "learning_rate": 1.675003717945994e-05, "loss": 0.2005, "step": 104075 }, { "epoch": 1.4928583720530029, "grad_norm": 5.239400386810303, "learning_rate": 1.674605366589475e-05, "loss": 0.107, "step": 104100 }, { "epoch": 1.4932168875121894, "grad_norm": 0.2800632119178772, "learning_rate": 1.674207015232956e-05, "loss": 0.1639, "step": 104125 }, { "epoch": 1.4935754029713761, "grad_norm": 0.7359676957130432, "learning_rate": 1.6738086638764368e-05, "loss": 0.2005, "step": 104150 }, { "epoch": 1.4939339184305627, "grad_norm": 2.844599485397339, "learning_rate": 1.6734103125199175e-05, "loss": 0.1533, "step": 104175 }, { "epoch": 1.4942924338897492, "grad_norm": 16.109634399414062, "learning_rate": 1.6730119611633983e-05, "loss": 0.1998, "step": 104200 }, { "epoch": 1.494650949348936, "grad_norm": 14.148138999938965, "learning_rate": 1.672613609806879e-05, "loss": 0.1069, "step": 104225 }, { "epoch": 1.4950094648081225, "grad_norm": 7.333103179931641, "learning_rate": 1.6722152584503602e-05, "loss": 0.0909, "step": 104250 }, { "epoch": 1.495367980267309, "grad_norm": 1.65121591091156, "learning_rate": 1.671816907093841e-05, "loss": 0.1343, "step": 104275 }, { "epoch": 1.4957264957264957, "grad_norm": 2.5511648654937744, "learning_rate": 1.6714185557373218e-05, "loss": 0.0852, "step": 104300 }, { "epoch": 1.4960850111856823, "grad_norm": 0.7114431858062744, "learning_rate": 1.6710202043808026e-05, "loss": 0.1768, "step": 104325 }, { "epoch": 1.4964435266448688, "grad_norm": 0.35385242104530334, "learning_rate": 1.6706218530242834e-05, "loss": 0.1413, "step": 104350 }, { "epoch": 1.4968020421040555, "grad_norm": 16.6693172454834, "learning_rate": 1.6702235016677645e-05, "loss": 0.115, "step": 104375 }, { "epoch": 1.497160557563242, "grad_norm": 1.709755301475525, "learning_rate": 1.6698251503112453e-05, "loss": 0.1112, "step": 104400 }, { "epoch": 1.4975190730224286, "grad_norm": 4.628873825073242, "learning_rate": 1.669426798954726e-05, "loss": 0.1158, "step": 104425 }, { "epoch": 1.4978775884816153, "grad_norm": 0.3049486577510834, "learning_rate": 1.669028447598207e-05, "loss": 0.1218, "step": 104450 }, { "epoch": 1.4982361039408019, "grad_norm": 0.8199405670166016, "learning_rate": 1.668630096241688e-05, "loss": 0.143, "step": 104475 }, { "epoch": 1.4985946193999884, "grad_norm": 0.8096835017204285, "learning_rate": 1.6682317448851688e-05, "loss": 0.1087, "step": 104500 }, { "epoch": 1.4989531348591751, "grad_norm": 12.107192993164062, "learning_rate": 1.6678333935286495e-05, "loss": 0.1633, "step": 104525 }, { "epoch": 1.4993116503183617, "grad_norm": 0.2705868184566498, "learning_rate": 1.6674350421721303e-05, "loss": 0.1024, "step": 104550 }, { "epoch": 1.4996701657775482, "grad_norm": 3.083332061767578, "learning_rate": 1.667036690815611e-05, "loss": 0.1522, "step": 104575 }, { "epoch": 1.500028681236735, "grad_norm": 1.621295690536499, "learning_rate": 1.6666383394590922e-05, "loss": 0.1147, "step": 104600 }, { "epoch": 1.5003871966959217, "grad_norm": 0.10883279144763947, "learning_rate": 1.666239988102573e-05, "loss": 0.1116, "step": 104625 }, { "epoch": 1.500745712155108, "grad_norm": 0.08837492763996124, "learning_rate": 1.6658416367460538e-05, "loss": 0.0905, "step": 104650 }, { "epoch": 1.5011042276142947, "grad_norm": 0.8501602411270142, "learning_rate": 1.6654432853895346e-05, "loss": 0.1742, "step": 104675 }, { "epoch": 1.5014627430734815, "grad_norm": 1.2843315601348877, "learning_rate": 1.6650449340330154e-05, "loss": 0.1179, "step": 104700 }, { "epoch": 1.5018212585326678, "grad_norm": 0.9573855996131897, "learning_rate": 1.6646465826764965e-05, "loss": 0.1292, "step": 104725 }, { "epoch": 1.5021797739918545, "grad_norm": 0.0426022969186306, "learning_rate": 1.6642482313199773e-05, "loss": 0.166, "step": 104750 }, { "epoch": 1.5025382894510413, "grad_norm": 6.383395195007324, "learning_rate": 1.663849879963458e-05, "loss": 0.1144, "step": 104775 }, { "epoch": 1.5028968049102276, "grad_norm": 18.13240623474121, "learning_rate": 1.663451528606939e-05, "loss": 0.1706, "step": 104800 }, { "epoch": 1.5032553203694143, "grad_norm": 11.004864692687988, "learning_rate": 1.6630531772504196e-05, "loss": 0.1311, "step": 104825 }, { "epoch": 1.503613835828601, "grad_norm": 14.976715087890625, "learning_rate": 1.6626548258939007e-05, "loss": 0.1315, "step": 104850 }, { "epoch": 1.5039723512877874, "grad_norm": 0.2556643784046173, "learning_rate": 1.6622564745373815e-05, "loss": 0.1458, "step": 104875 }, { "epoch": 1.5043308667469741, "grad_norm": 9.34535026550293, "learning_rate": 1.6618581231808623e-05, "loss": 0.1507, "step": 104900 }, { "epoch": 1.5046893822061609, "grad_norm": 3.255856990814209, "learning_rate": 1.661459771824343e-05, "loss": 0.1703, "step": 104925 }, { "epoch": 1.5050478976653472, "grad_norm": 0.635857880115509, "learning_rate": 1.661061420467824e-05, "loss": 0.1244, "step": 104950 }, { "epoch": 1.505406413124534, "grad_norm": 0.054409317672252655, "learning_rate": 1.6606630691113047e-05, "loss": 0.105, "step": 104975 }, { "epoch": 1.5057649285837207, "grad_norm": 0.33217060565948486, "learning_rate": 1.6602647177547855e-05, "loss": 0.083, "step": 105000 }, { "epoch": 1.506123444042907, "grad_norm": 14.592422485351562, "learning_rate": 1.6598663663982662e-05, "loss": 0.1199, "step": 105025 }, { "epoch": 1.5064819595020937, "grad_norm": 4.112560272216797, "learning_rate": 1.659468015041747e-05, "loss": 0.1174, "step": 105050 }, { "epoch": 1.5068404749612805, "grad_norm": 1.7925931215286255, "learning_rate": 1.659069663685228e-05, "loss": 0.0954, "step": 105075 }, { "epoch": 1.5071989904204668, "grad_norm": 5.235974311828613, "learning_rate": 1.658671312328709e-05, "loss": 0.1884, "step": 105100 }, { "epoch": 1.5075575058796535, "grad_norm": 0.6449534893035889, "learning_rate": 1.6582729609721897e-05, "loss": 0.085, "step": 105125 }, { "epoch": 1.5079160213388403, "grad_norm": 12.956709861755371, "learning_rate": 1.6578746096156705e-05, "loss": 0.1425, "step": 105150 }, { "epoch": 1.5082745367980266, "grad_norm": 1.0045137405395508, "learning_rate": 1.6574762582591513e-05, "loss": 0.0928, "step": 105175 }, { "epoch": 1.5086330522572133, "grad_norm": 0.6231756806373596, "learning_rate": 1.6570779069026324e-05, "loss": 0.1036, "step": 105200 }, { "epoch": 1.5089915677164, "grad_norm": 2.4291810989379883, "learning_rate": 1.6566795555461132e-05, "loss": 0.1141, "step": 105225 }, { "epoch": 1.5093500831755864, "grad_norm": 1.966193437576294, "learning_rate": 1.656281204189594e-05, "loss": 0.1529, "step": 105250 }, { "epoch": 1.5097085986347731, "grad_norm": 0.4537714123725891, "learning_rate": 1.6558828528330748e-05, "loss": 0.255, "step": 105275 }, { "epoch": 1.5100671140939599, "grad_norm": 11.373956680297852, "learning_rate": 1.6554845014765555e-05, "loss": 0.1183, "step": 105300 }, { "epoch": 1.5104256295531462, "grad_norm": 10.055879592895508, "learning_rate": 1.6550861501200367e-05, "loss": 0.1358, "step": 105325 }, { "epoch": 1.510784145012333, "grad_norm": 0.6939252614974976, "learning_rate": 1.6546877987635175e-05, "loss": 0.0974, "step": 105350 }, { "epoch": 1.5111426604715197, "grad_norm": 8.179828643798828, "learning_rate": 1.6542894474069982e-05, "loss": 0.1169, "step": 105375 }, { "epoch": 1.511501175930706, "grad_norm": 2.7498066425323486, "learning_rate": 1.653891096050479e-05, "loss": 0.1067, "step": 105400 }, { "epoch": 1.5118596913898927, "grad_norm": 5.014984607696533, "learning_rate": 1.6534927446939598e-05, "loss": 0.1764, "step": 105425 }, { "epoch": 1.5122182068490795, "grad_norm": 3.285569906234741, "learning_rate": 1.653094393337441e-05, "loss": 0.1785, "step": 105450 }, { "epoch": 1.5125767223082658, "grad_norm": 12.611419677734375, "learning_rate": 1.6526960419809217e-05, "loss": 0.221, "step": 105475 }, { "epoch": 1.5129352377674525, "grad_norm": 6.633113384246826, "learning_rate": 1.6522976906244025e-05, "loss": 0.185, "step": 105500 }, { "epoch": 1.5132937532266393, "grad_norm": 2.1764304637908936, "learning_rate": 1.6518993392678833e-05, "loss": 0.0939, "step": 105525 }, { "epoch": 1.5136522686858256, "grad_norm": 18.730863571166992, "learning_rate": 1.651500987911364e-05, "loss": 0.0984, "step": 105550 }, { "epoch": 1.5140107841450123, "grad_norm": 15.29007625579834, "learning_rate": 1.6511026365548452e-05, "loss": 0.1262, "step": 105575 }, { "epoch": 1.514369299604199, "grad_norm": 2.359870195388794, "learning_rate": 1.650704285198326e-05, "loss": 0.1032, "step": 105600 }, { "epoch": 1.5147278150633854, "grad_norm": 29.21080207824707, "learning_rate": 1.6503059338418068e-05, "loss": 0.1479, "step": 105625 }, { "epoch": 1.5150863305225721, "grad_norm": 4.346257209777832, "learning_rate": 1.6499075824852875e-05, "loss": 0.1272, "step": 105650 }, { "epoch": 1.5154448459817589, "grad_norm": 0.47135597467422485, "learning_rate": 1.6495092311287683e-05, "loss": 0.0789, "step": 105675 }, { "epoch": 1.5158033614409452, "grad_norm": 1.983655571937561, "learning_rate": 1.6491108797722495e-05, "loss": 0.0572, "step": 105700 }, { "epoch": 1.516161876900132, "grad_norm": 4.992228031158447, "learning_rate": 1.6487125284157302e-05, "loss": 0.0925, "step": 105725 }, { "epoch": 1.5165203923593187, "grad_norm": 12.595222473144531, "learning_rate": 1.648314177059211e-05, "loss": 0.1615, "step": 105750 }, { "epoch": 1.516878907818505, "grad_norm": 2.763512372970581, "learning_rate": 1.6479158257026918e-05, "loss": 0.1015, "step": 105775 }, { "epoch": 1.5172374232776917, "grad_norm": 19.065229415893555, "learning_rate": 1.647517474346173e-05, "loss": 0.1594, "step": 105800 }, { "epoch": 1.5175959387368785, "grad_norm": 9.271138191223145, "learning_rate": 1.6471191229896537e-05, "loss": 0.1219, "step": 105825 }, { "epoch": 1.5179544541960648, "grad_norm": 3.6490540504455566, "learning_rate": 1.6467207716331345e-05, "loss": 0.0689, "step": 105850 }, { "epoch": 1.5183129696552515, "grad_norm": 10.975476264953613, "learning_rate": 1.6463224202766153e-05, "loss": 0.1162, "step": 105875 }, { "epoch": 1.5186714851144383, "grad_norm": 11.012126922607422, "learning_rate": 1.645924068920096e-05, "loss": 0.2538, "step": 105900 }, { "epoch": 1.5190300005736246, "grad_norm": 13.762681007385254, "learning_rate": 1.6455257175635772e-05, "loss": 0.1301, "step": 105925 }, { "epoch": 1.5193885160328113, "grad_norm": 3.697226047515869, "learning_rate": 1.645127366207058e-05, "loss": 0.1097, "step": 105950 }, { "epoch": 1.519747031491998, "grad_norm": 12.9124116897583, "learning_rate": 1.6447290148505388e-05, "loss": 0.1751, "step": 105975 }, { "epoch": 1.5201055469511844, "grad_norm": 1.1462138891220093, "learning_rate": 1.6443306634940195e-05, "loss": 0.1995, "step": 106000 }, { "epoch": 1.5204640624103711, "grad_norm": 3.065656900405884, "learning_rate": 1.6439323121375003e-05, "loss": 0.1153, "step": 106025 }, { "epoch": 1.5208225778695579, "grad_norm": 17.042089462280273, "learning_rate": 1.643533960780981e-05, "loss": 0.1322, "step": 106050 }, { "epoch": 1.5211810933287442, "grad_norm": 2.1243486404418945, "learning_rate": 1.643135609424462e-05, "loss": 0.1431, "step": 106075 }, { "epoch": 1.521539608787931, "grad_norm": 9.359235763549805, "learning_rate": 1.6427372580679427e-05, "loss": 0.1402, "step": 106100 }, { "epoch": 1.5218981242471177, "grad_norm": 2.2479007244110107, "learning_rate": 1.6423389067114235e-05, "loss": 0.1184, "step": 106125 }, { "epoch": 1.522256639706304, "grad_norm": 13.702770233154297, "learning_rate": 1.6419405553549043e-05, "loss": 0.1249, "step": 106150 }, { "epoch": 1.5226151551654907, "grad_norm": 4.654520511627197, "learning_rate": 1.6415422039983854e-05, "loss": 0.1285, "step": 106175 }, { "epoch": 1.5229736706246775, "grad_norm": 4.196623802185059, "learning_rate": 1.641143852641866e-05, "loss": 0.1053, "step": 106200 }, { "epoch": 1.5233321860838638, "grad_norm": 2.7976796627044678, "learning_rate": 1.640745501285347e-05, "loss": 0.106, "step": 106225 }, { "epoch": 1.5236907015430505, "grad_norm": 9.32800006866455, "learning_rate": 1.6403471499288277e-05, "loss": 0.1125, "step": 106250 }, { "epoch": 1.5240492170022373, "grad_norm": 9.800128936767578, "learning_rate": 1.6399487985723085e-05, "loss": 0.1334, "step": 106275 }, { "epoch": 1.5244077324614236, "grad_norm": 11.634517669677734, "learning_rate": 1.6395504472157896e-05, "loss": 0.1185, "step": 106300 }, { "epoch": 1.5247662479206103, "grad_norm": 0.062197063118219376, "learning_rate": 1.6391520958592704e-05, "loss": 0.1908, "step": 106325 }, { "epoch": 1.525124763379797, "grad_norm": 1.446353793144226, "learning_rate": 1.6387537445027512e-05, "loss": 0.1458, "step": 106350 }, { "epoch": 1.5254832788389834, "grad_norm": 0.12911443412303925, "learning_rate": 1.638355393146232e-05, "loss": 0.2084, "step": 106375 }, { "epoch": 1.5258417942981701, "grad_norm": 1.430418610572815, "learning_rate": 1.637957041789713e-05, "loss": 0.2248, "step": 106400 }, { "epoch": 1.5262003097573569, "grad_norm": 14.497458457946777, "learning_rate": 1.637558690433194e-05, "loss": 0.1692, "step": 106425 }, { "epoch": 1.5265588252165432, "grad_norm": 6.929307460784912, "learning_rate": 1.6371603390766747e-05, "loss": 0.089, "step": 106450 }, { "epoch": 1.52691734067573, "grad_norm": 0.07259757071733475, "learning_rate": 1.6367619877201555e-05, "loss": 0.171, "step": 106475 }, { "epoch": 1.5272758561349167, "grad_norm": 14.504081726074219, "learning_rate": 1.6363636363636363e-05, "loss": 0.1998, "step": 106500 }, { "epoch": 1.527634371594103, "grad_norm": 6.012524604797363, "learning_rate": 1.6359652850071174e-05, "loss": 0.1221, "step": 106525 }, { "epoch": 1.5279928870532897, "grad_norm": 0.2468152642250061, "learning_rate": 1.635566933650598e-05, "loss": 0.0866, "step": 106550 }, { "epoch": 1.5283514025124765, "grad_norm": 5.271758556365967, "learning_rate": 1.635168582294079e-05, "loss": 0.1373, "step": 106575 }, { "epoch": 1.5287099179716628, "grad_norm": 0.29029861092567444, "learning_rate": 1.6347702309375597e-05, "loss": 0.1074, "step": 106600 }, { "epoch": 1.5290684334308495, "grad_norm": 7.338322162628174, "learning_rate": 1.6343718795810405e-05, "loss": 0.1184, "step": 106625 }, { "epoch": 1.5294269488900363, "grad_norm": 4.357701301574707, "learning_rate": 1.6339735282245216e-05, "loss": 0.1529, "step": 106650 }, { "epoch": 1.5297854643492226, "grad_norm": 1.3796708583831787, "learning_rate": 1.6335751768680024e-05, "loss": 0.1899, "step": 106675 }, { "epoch": 1.5301439798084093, "grad_norm": 8.2561616897583, "learning_rate": 1.6331768255114832e-05, "loss": 0.1953, "step": 106700 }, { "epoch": 1.530502495267596, "grad_norm": 12.83799934387207, "learning_rate": 1.632778474154964e-05, "loss": 0.1804, "step": 106725 }, { "epoch": 1.5308610107267824, "grad_norm": 22.95792007446289, "learning_rate": 1.6323801227984448e-05, "loss": 0.1797, "step": 106750 }, { "epoch": 1.5312195261859691, "grad_norm": 3.594883441925049, "learning_rate": 1.631981771441926e-05, "loss": 0.1399, "step": 106775 }, { "epoch": 1.5315780416451559, "grad_norm": 6.340157508850098, "learning_rate": 1.6315834200854067e-05, "loss": 0.0761, "step": 106800 }, { "epoch": 1.5319365571043422, "grad_norm": 1.3461639881134033, "learning_rate": 1.6311850687288875e-05, "loss": 0.0904, "step": 106825 }, { "epoch": 1.532295072563529, "grad_norm": 2.778398275375366, "learning_rate": 1.6307867173723682e-05, "loss": 0.0692, "step": 106850 }, { "epoch": 1.5326535880227157, "grad_norm": 17.045001983642578, "learning_rate": 1.630388366015849e-05, "loss": 0.1269, "step": 106875 }, { "epoch": 1.533012103481902, "grad_norm": 19.393295288085938, "learning_rate": 1.62999001465933e-05, "loss": 0.1588, "step": 106900 }, { "epoch": 1.5333706189410887, "grad_norm": 3.0547776222229004, "learning_rate": 1.629591663302811e-05, "loss": 0.1069, "step": 106925 }, { "epoch": 1.5337291344002755, "grad_norm": 0.5031606554985046, "learning_rate": 1.6291933119462917e-05, "loss": 0.165, "step": 106950 }, { "epoch": 1.5340876498594618, "grad_norm": 0.8733700513839722, "learning_rate": 1.6287949605897725e-05, "loss": 0.1296, "step": 106975 }, { "epoch": 1.5344461653186485, "grad_norm": 5.290008544921875, "learning_rate": 1.6283966092332536e-05, "loss": 0.0972, "step": 107000 }, { "epoch": 1.5348046807778353, "grad_norm": 21.94239044189453, "learning_rate": 1.6279982578767344e-05, "loss": 0.1881, "step": 107025 }, { "epoch": 1.5351631962370216, "grad_norm": 2.502101421356201, "learning_rate": 1.6275999065202152e-05, "loss": 0.143, "step": 107050 }, { "epoch": 1.5355217116962083, "grad_norm": 1.7356908321380615, "learning_rate": 1.627201555163696e-05, "loss": 0.0967, "step": 107075 }, { "epoch": 1.535880227155395, "grad_norm": 26.899349212646484, "learning_rate": 1.6268032038071768e-05, "loss": 0.1974, "step": 107100 }, { "epoch": 1.5362387426145814, "grad_norm": 3.284682512283325, "learning_rate": 1.626404852450658e-05, "loss": 0.2128, "step": 107125 }, { "epoch": 1.5365972580737681, "grad_norm": 2.14402437210083, "learning_rate": 1.6260065010941383e-05, "loss": 0.1425, "step": 107150 }, { "epoch": 1.5369557735329549, "grad_norm": 10.273870468139648, "learning_rate": 1.625608149737619e-05, "loss": 0.132, "step": 107175 }, { "epoch": 1.5373142889921412, "grad_norm": 1.5195486545562744, "learning_rate": 1.6252097983811e-05, "loss": 0.0909, "step": 107200 }, { "epoch": 1.537672804451328, "grad_norm": 0.4386079013347626, "learning_rate": 1.6248114470245807e-05, "loss": 0.1324, "step": 107225 }, { "epoch": 1.5380313199105147, "grad_norm": 12.417966842651367, "learning_rate": 1.6244130956680618e-05, "loss": 0.1296, "step": 107250 }, { "epoch": 1.538389835369701, "grad_norm": 0.9696084260940552, "learning_rate": 1.6240147443115426e-05, "loss": 0.1477, "step": 107275 }, { "epoch": 1.5387483508288877, "grad_norm": 1.1804994344711304, "learning_rate": 1.6236163929550234e-05, "loss": 0.1159, "step": 107300 }, { "epoch": 1.5391068662880745, "grad_norm": 3.812619924545288, "learning_rate": 1.6232180415985042e-05, "loss": 0.0831, "step": 107325 }, { "epoch": 1.539465381747261, "grad_norm": 4.291228294372559, "learning_rate": 1.622819690241985e-05, "loss": 0.0892, "step": 107350 }, { "epoch": 1.5398238972064475, "grad_norm": 0.39468514919281006, "learning_rate": 1.622421338885466e-05, "loss": 0.1552, "step": 107375 }, { "epoch": 1.5401824126656343, "grad_norm": 8.257037162780762, "learning_rate": 1.622022987528947e-05, "loss": 0.1423, "step": 107400 }, { "epoch": 1.5405409281248208, "grad_norm": 2.724006175994873, "learning_rate": 1.6216246361724276e-05, "loss": 0.064, "step": 107425 }, { "epoch": 1.5408994435840073, "grad_norm": 1.147286295890808, "learning_rate": 1.6212262848159084e-05, "loss": 0.1058, "step": 107450 }, { "epoch": 1.541257959043194, "grad_norm": 6.2101030349731445, "learning_rate": 1.6208279334593892e-05, "loss": 0.1242, "step": 107475 }, { "epoch": 1.5416164745023806, "grad_norm": 4.276113510131836, "learning_rate": 1.6204295821028703e-05, "loss": 0.1157, "step": 107500 }, { "epoch": 1.5419749899615671, "grad_norm": 0.9203636646270752, "learning_rate": 1.620031230746351e-05, "loss": 0.0852, "step": 107525 }, { "epoch": 1.5423335054207539, "grad_norm": 0.4080369174480438, "learning_rate": 1.619632879389832e-05, "loss": 0.1427, "step": 107550 }, { "epoch": 1.5426920208799404, "grad_norm": 0.39750397205352783, "learning_rate": 1.6192345280333127e-05, "loss": 0.115, "step": 107575 }, { "epoch": 1.543050536339127, "grad_norm": 0.44245103001594543, "learning_rate": 1.6188361766767938e-05, "loss": 0.0772, "step": 107600 }, { "epoch": 1.5434090517983137, "grad_norm": 6.746984004974365, "learning_rate": 1.6184378253202746e-05, "loss": 0.1868, "step": 107625 }, { "epoch": 1.5437675672575002, "grad_norm": 2.572519540786743, "learning_rate": 1.6180394739637554e-05, "loss": 0.1485, "step": 107650 }, { "epoch": 1.5441260827166867, "grad_norm": 3.4465346336364746, "learning_rate": 1.6176411226072362e-05, "loss": 0.1122, "step": 107675 }, { "epoch": 1.5444845981758735, "grad_norm": 1.2741307020187378, "learning_rate": 1.617242771250717e-05, "loss": 0.1679, "step": 107700 }, { "epoch": 1.54484311363506, "grad_norm": 17.811254501342773, "learning_rate": 1.616844419894198e-05, "loss": 0.1387, "step": 107725 }, { "epoch": 1.5452016290942465, "grad_norm": 1.6953932046890259, "learning_rate": 1.616446068537679e-05, "loss": 0.1705, "step": 107750 }, { "epoch": 1.5455601445534333, "grad_norm": 1.1120035648345947, "learning_rate": 1.6160477171811596e-05, "loss": 0.1024, "step": 107775 }, { "epoch": 1.5459186600126198, "grad_norm": 3.078814744949341, "learning_rate": 1.6156493658246404e-05, "loss": 0.2082, "step": 107800 }, { "epoch": 1.5462771754718063, "grad_norm": 0.775729775428772, "learning_rate": 1.6152510144681212e-05, "loss": 0.1497, "step": 107825 }, { "epoch": 1.546635690930993, "grad_norm": 0.8883739113807678, "learning_rate": 1.6148526631116023e-05, "loss": 0.0779, "step": 107850 }, { "epoch": 1.5469942063901796, "grad_norm": 3.8615472316741943, "learning_rate": 1.614454311755083e-05, "loss": 0.1713, "step": 107875 }, { "epoch": 1.5473527218493661, "grad_norm": 13.45565128326416, "learning_rate": 1.614055960398564e-05, "loss": 0.1042, "step": 107900 }, { "epoch": 1.5477112373085529, "grad_norm": 25.119766235351562, "learning_rate": 1.6136576090420447e-05, "loss": 0.1144, "step": 107925 }, { "epoch": 1.5480697527677394, "grad_norm": 0.0702333077788353, "learning_rate": 1.6132592576855255e-05, "loss": 0.1377, "step": 107950 }, { "epoch": 1.548428268226926, "grad_norm": 0.229001984000206, "learning_rate": 1.6128609063290066e-05, "loss": 0.1501, "step": 107975 }, { "epoch": 1.5487867836861127, "grad_norm": 4.119572162628174, "learning_rate": 1.6124625549724874e-05, "loss": 0.2272, "step": 108000 }, { "epoch": 1.5491452991452992, "grad_norm": 2.570965051651001, "learning_rate": 1.612064203615968e-05, "loss": 0.1107, "step": 108025 }, { "epoch": 1.5495038146044857, "grad_norm": 22.811033248901367, "learning_rate": 1.611665852259449e-05, "loss": 0.2185, "step": 108050 }, { "epoch": 1.5498623300636725, "grad_norm": 6.090419292449951, "learning_rate": 1.6112675009029297e-05, "loss": 0.1772, "step": 108075 }, { "epoch": 1.550220845522859, "grad_norm": 0.3966958522796631, "learning_rate": 1.610869149546411e-05, "loss": 0.0806, "step": 108100 }, { "epoch": 1.5505793609820455, "grad_norm": 0.48335739970207214, "learning_rate": 1.6104707981898916e-05, "loss": 0.1624, "step": 108125 }, { "epoch": 1.5509378764412323, "grad_norm": 3.772495746612549, "learning_rate": 1.6100724468333724e-05, "loss": 0.1632, "step": 108150 }, { "epoch": 1.5512963919004188, "grad_norm": 7.01034688949585, "learning_rate": 1.6096740954768532e-05, "loss": 0.0655, "step": 108175 }, { "epoch": 1.5516549073596053, "grad_norm": 5.064388275146484, "learning_rate": 1.6092757441203343e-05, "loss": 0.1814, "step": 108200 }, { "epoch": 1.552013422818792, "grad_norm": 0.21941854059696198, "learning_rate": 1.608877392763815e-05, "loss": 0.1129, "step": 108225 }, { "epoch": 1.5523719382779786, "grad_norm": 0.8513625264167786, "learning_rate": 1.6084790414072956e-05, "loss": 0.1705, "step": 108250 }, { "epoch": 1.5527304537371651, "grad_norm": 1.1948915719985962, "learning_rate": 1.6080806900507764e-05, "loss": 0.1396, "step": 108275 }, { "epoch": 1.5530889691963519, "grad_norm": 18.94098472595215, "learning_rate": 1.607682338694257e-05, "loss": 0.1928, "step": 108300 }, { "epoch": 1.5534474846555384, "grad_norm": 8.259722709655762, "learning_rate": 1.6072839873377383e-05, "loss": 0.1227, "step": 108325 }, { "epoch": 1.553806000114725, "grad_norm": 1.2936654090881348, "learning_rate": 1.606885635981219e-05, "loss": 0.1746, "step": 108350 }, { "epoch": 1.5541645155739117, "grad_norm": 3.4896774291992188, "learning_rate": 1.6064872846246998e-05, "loss": 0.1512, "step": 108375 }, { "epoch": 1.5545230310330982, "grad_norm": 0.1645621657371521, "learning_rate": 1.6060889332681806e-05, "loss": 0.077, "step": 108400 }, { "epoch": 1.5548815464922847, "grad_norm": 2.0376346111297607, "learning_rate": 1.6056905819116614e-05, "loss": 0.0662, "step": 108425 }, { "epoch": 1.5552400619514715, "grad_norm": 1.0306349992752075, "learning_rate": 1.6052922305551425e-05, "loss": 0.0542, "step": 108450 }, { "epoch": 1.555598577410658, "grad_norm": 13.372455596923828, "learning_rate": 1.6048938791986233e-05, "loss": 0.1297, "step": 108475 }, { "epoch": 1.5559570928698445, "grad_norm": 2.477321147918701, "learning_rate": 1.604495527842104e-05, "loss": 0.1465, "step": 108500 }, { "epoch": 1.5563156083290313, "grad_norm": 1.5914510488510132, "learning_rate": 1.604097176485585e-05, "loss": 0.1332, "step": 108525 }, { "epoch": 1.5566741237882178, "grad_norm": 10.006301879882812, "learning_rate": 1.6036988251290657e-05, "loss": 0.1306, "step": 108550 }, { "epoch": 1.5570326392474043, "grad_norm": 7.273108959197998, "learning_rate": 1.6033004737725468e-05, "loss": 0.0975, "step": 108575 }, { "epoch": 1.557391154706591, "grad_norm": 18.810632705688477, "learning_rate": 1.6029021224160276e-05, "loss": 0.1053, "step": 108600 }, { "epoch": 1.5577496701657776, "grad_norm": 1.5558066368103027, "learning_rate": 1.6025037710595084e-05, "loss": 0.1072, "step": 108625 }, { "epoch": 1.5581081856249641, "grad_norm": 4.159802436828613, "learning_rate": 1.602105419702989e-05, "loss": 0.1119, "step": 108650 }, { "epoch": 1.5584667010841509, "grad_norm": 0.13541103899478912, "learning_rate": 1.60170706834647e-05, "loss": 0.1406, "step": 108675 }, { "epoch": 1.5588252165433374, "grad_norm": 4.082190036773682, "learning_rate": 1.601308716989951e-05, "loss": 0.1209, "step": 108700 }, { "epoch": 1.559183732002524, "grad_norm": 11.430256843566895, "learning_rate": 1.6009103656334318e-05, "loss": 0.1359, "step": 108725 }, { "epoch": 1.5595422474617107, "grad_norm": 1.7085527181625366, "learning_rate": 1.6005120142769126e-05, "loss": 0.1253, "step": 108750 }, { "epoch": 1.5599007629208972, "grad_norm": 0.03154303878545761, "learning_rate": 1.6001136629203934e-05, "loss": 0.0845, "step": 108775 }, { "epoch": 1.5602592783800837, "grad_norm": 3.248420000076294, "learning_rate": 1.5997153115638745e-05, "loss": 0.1361, "step": 108800 }, { "epoch": 1.5606177938392705, "grad_norm": 8.792187690734863, "learning_rate": 1.5993169602073553e-05, "loss": 0.1236, "step": 108825 }, { "epoch": 1.560976309298457, "grad_norm": 10.390057563781738, "learning_rate": 1.598918608850836e-05, "loss": 0.1843, "step": 108850 }, { "epoch": 1.5613348247576435, "grad_norm": 4.813485622406006, "learning_rate": 1.598520257494317e-05, "loss": 0.119, "step": 108875 }, { "epoch": 1.5616933402168303, "grad_norm": 0.3855483829975128, "learning_rate": 1.5981219061377977e-05, "loss": 0.1694, "step": 108900 }, { "epoch": 1.5620518556760168, "grad_norm": 10.619688987731934, "learning_rate": 1.5977235547812788e-05, "loss": 0.1794, "step": 108925 }, { "epoch": 1.5624103711352033, "grad_norm": 15.722604751586914, "learning_rate": 1.5973252034247596e-05, "loss": 0.118, "step": 108950 }, { "epoch": 1.56276888659439, "grad_norm": 19.2074031829834, "learning_rate": 1.5969268520682404e-05, "loss": 0.1763, "step": 108975 }, { "epoch": 1.5631274020535766, "grad_norm": 4.230288028717041, "learning_rate": 1.596528500711721e-05, "loss": 0.1056, "step": 109000 }, { "epoch": 1.5634859175127631, "grad_norm": 4.190769672393799, "learning_rate": 1.596130149355202e-05, "loss": 0.1037, "step": 109025 }, { "epoch": 1.5638444329719499, "grad_norm": 0.41619017720222473, "learning_rate": 1.595731797998683e-05, "loss": 0.1448, "step": 109050 }, { "epoch": 1.5642029484311364, "grad_norm": 15.414167404174805, "learning_rate": 1.5953334466421638e-05, "loss": 0.1277, "step": 109075 }, { "epoch": 1.564561463890323, "grad_norm": 9.452557563781738, "learning_rate": 1.5949350952856446e-05, "loss": 0.1271, "step": 109100 }, { "epoch": 1.5649199793495097, "grad_norm": 1.3272111415863037, "learning_rate": 1.5945367439291254e-05, "loss": 0.1151, "step": 109125 }, { "epoch": 1.5652784948086962, "grad_norm": 6.31231164932251, "learning_rate": 1.5941383925726062e-05, "loss": 0.1676, "step": 109150 }, { "epoch": 1.5656370102678827, "grad_norm": 0.4028574526309967, "learning_rate": 1.5937400412160873e-05, "loss": 0.1763, "step": 109175 }, { "epoch": 1.5659955257270695, "grad_norm": 2.637566328048706, "learning_rate": 1.593341689859568e-05, "loss": 0.107, "step": 109200 }, { "epoch": 1.566354041186256, "grad_norm": 12.053897857666016, "learning_rate": 1.592943338503049e-05, "loss": 0.1369, "step": 109225 }, { "epoch": 1.5667125566454425, "grad_norm": 6.918288230895996, "learning_rate": 1.5925449871465297e-05, "loss": 0.1025, "step": 109250 }, { "epoch": 1.5670710721046293, "grad_norm": 0.23910917341709137, "learning_rate": 1.5921466357900104e-05, "loss": 0.1504, "step": 109275 }, { "epoch": 1.5674295875638158, "grad_norm": 1.2216789722442627, "learning_rate": 1.5917482844334916e-05, "loss": 0.114, "step": 109300 }, { "epoch": 1.5677881030230023, "grad_norm": 2.972081422805786, "learning_rate": 1.5913499330769723e-05, "loss": 0.0948, "step": 109325 }, { "epoch": 1.568146618482189, "grad_norm": 6.803816318511963, "learning_rate": 1.5909515817204528e-05, "loss": 0.1407, "step": 109350 }, { "epoch": 1.5685051339413756, "grad_norm": 1.1384490728378296, "learning_rate": 1.5905532303639336e-05, "loss": 0.1255, "step": 109375 }, { "epoch": 1.568863649400562, "grad_norm": 8.453943252563477, "learning_rate": 1.5901548790074147e-05, "loss": 0.1131, "step": 109400 }, { "epoch": 1.5692221648597489, "grad_norm": 11.81513500213623, "learning_rate": 1.5897565276508955e-05, "loss": 0.1631, "step": 109425 }, { "epoch": 1.5695806803189354, "grad_norm": 23.14830780029297, "learning_rate": 1.5893581762943763e-05, "loss": 0.1031, "step": 109450 }, { "epoch": 1.569939195778122, "grad_norm": 2.1228673458099365, "learning_rate": 1.588959824937857e-05, "loss": 0.1347, "step": 109475 }, { "epoch": 1.5702977112373087, "grad_norm": 2.0391669273376465, "learning_rate": 1.588561473581338e-05, "loss": 0.1361, "step": 109500 }, { "epoch": 1.5706562266964952, "grad_norm": 15.95274829864502, "learning_rate": 1.588163122224819e-05, "loss": 0.1733, "step": 109525 }, { "epoch": 1.5710147421556817, "grad_norm": 2.0213677883148193, "learning_rate": 1.5877647708682997e-05, "loss": 0.0817, "step": 109550 }, { "epoch": 1.5713732576148685, "grad_norm": 0.6326050758361816, "learning_rate": 1.5873664195117805e-05, "loss": 0.1102, "step": 109575 }, { "epoch": 1.571731773074055, "grad_norm": 0.1941893845796585, "learning_rate": 1.5869680681552613e-05, "loss": 0.1113, "step": 109600 }, { "epoch": 1.5720902885332415, "grad_norm": 4.457529067993164, "learning_rate": 1.586569716798742e-05, "loss": 0.1012, "step": 109625 }, { "epoch": 1.5724488039924283, "grad_norm": 0.4124469459056854, "learning_rate": 1.5861713654422232e-05, "loss": 0.1982, "step": 109650 }, { "epoch": 1.5728073194516148, "grad_norm": 0.20402249693870544, "learning_rate": 1.585773014085704e-05, "loss": 0.1029, "step": 109675 }, { "epoch": 1.5731658349108013, "grad_norm": 2.613887310028076, "learning_rate": 1.5853746627291848e-05, "loss": 0.1261, "step": 109700 }, { "epoch": 1.573524350369988, "grad_norm": 0.14867010712623596, "learning_rate": 1.5849763113726656e-05, "loss": 0.0477, "step": 109725 }, { "epoch": 1.5738828658291746, "grad_norm": 10.71470832824707, "learning_rate": 1.5845779600161464e-05, "loss": 0.1237, "step": 109750 }, { "epoch": 1.574241381288361, "grad_norm": 15.062411308288574, "learning_rate": 1.5841796086596275e-05, "loss": 0.1882, "step": 109775 }, { "epoch": 1.5745998967475479, "grad_norm": 0.393209844827652, "learning_rate": 1.5837812573031083e-05, "loss": 0.0804, "step": 109800 }, { "epoch": 1.5749584122067344, "grad_norm": 0.3435704708099365, "learning_rate": 1.583382905946589e-05, "loss": 0.0633, "step": 109825 }, { "epoch": 1.575316927665921, "grad_norm": 2.9095771312713623, "learning_rate": 1.58298455459007e-05, "loss": 0.0882, "step": 109850 }, { "epoch": 1.5756754431251077, "grad_norm": 12.120250701904297, "learning_rate": 1.5825862032335506e-05, "loss": 0.1246, "step": 109875 }, { "epoch": 1.5760339585842942, "grad_norm": 11.771854400634766, "learning_rate": 1.5821878518770317e-05, "loss": 0.0988, "step": 109900 }, { "epoch": 1.5763924740434807, "grad_norm": 1.779136300086975, "learning_rate": 1.5817895005205125e-05, "loss": 0.0724, "step": 109925 }, { "epoch": 1.5767509895026675, "grad_norm": 0.2828727662563324, "learning_rate": 1.5813911491639933e-05, "loss": 0.0983, "step": 109950 }, { "epoch": 1.577109504961854, "grad_norm": 7.221841335296631, "learning_rate": 1.580992797807474e-05, "loss": 0.1145, "step": 109975 }, { "epoch": 1.5774680204210405, "grad_norm": 14.66770076751709, "learning_rate": 1.580594446450955e-05, "loss": 0.1409, "step": 110000 }, { "epoch": 1.5778265358802273, "grad_norm": 1.0853767395019531, "learning_rate": 1.580196095094436e-05, "loss": 0.1521, "step": 110025 }, { "epoch": 1.5781850513394138, "grad_norm": 17.464588165283203, "learning_rate": 1.5797977437379168e-05, "loss": 0.1415, "step": 110050 }, { "epoch": 1.5785435667986003, "grad_norm": 4.351961612701416, "learning_rate": 1.5793993923813976e-05, "loss": 0.1626, "step": 110075 }, { "epoch": 1.578902082257787, "grad_norm": 4.580378532409668, "learning_rate": 1.5790010410248784e-05, "loss": 0.0907, "step": 110100 }, { "epoch": 1.5792605977169736, "grad_norm": 2.9517529010772705, "learning_rate": 1.5786026896683595e-05, "loss": 0.1055, "step": 110125 }, { "epoch": 1.57961911317616, "grad_norm": 14.013166427612305, "learning_rate": 1.5782043383118403e-05, "loss": 0.135, "step": 110150 }, { "epoch": 1.5799776286353469, "grad_norm": 1.046134352684021, "learning_rate": 1.577805986955321e-05, "loss": 0.1277, "step": 110175 }, { "epoch": 1.5803361440945334, "grad_norm": 1.1009801626205444, "learning_rate": 1.577407635598802e-05, "loss": 0.1408, "step": 110200 }, { "epoch": 1.58069465955372, "grad_norm": 0.4955666661262512, "learning_rate": 1.5770092842422826e-05, "loss": 0.0833, "step": 110225 }, { "epoch": 1.5810531750129067, "grad_norm": 0.9613654017448425, "learning_rate": 1.5766109328857637e-05, "loss": 0.1636, "step": 110250 }, { "epoch": 1.5814116904720932, "grad_norm": 0.29086530208587646, "learning_rate": 1.5762125815292445e-05, "loss": 0.1547, "step": 110275 }, { "epoch": 1.5817702059312797, "grad_norm": 19.909225463867188, "learning_rate": 1.5758142301727253e-05, "loss": 0.0891, "step": 110300 }, { "epoch": 1.5821287213904665, "grad_norm": 6.13496732711792, "learning_rate": 1.575415878816206e-05, "loss": 0.1182, "step": 110325 }, { "epoch": 1.582487236849653, "grad_norm": 9.61989688873291, "learning_rate": 1.575017527459687e-05, "loss": 0.2062, "step": 110350 }, { "epoch": 1.5828457523088395, "grad_norm": 13.550698280334473, "learning_rate": 1.574619176103168e-05, "loss": 0.0974, "step": 110375 }, { "epoch": 1.5832042677680263, "grad_norm": 0.805781900882721, "learning_rate": 1.5742208247466488e-05, "loss": 0.2203, "step": 110400 }, { "epoch": 1.5835627832272128, "grad_norm": 9.057718276977539, "learning_rate": 1.5738224733901296e-05, "loss": 0.2011, "step": 110425 }, { "epoch": 1.5839212986863993, "grad_norm": 0.25579530000686646, "learning_rate": 1.57342412203361e-05, "loss": 0.1256, "step": 110450 }, { "epoch": 1.584279814145586, "grad_norm": 0.23669451475143433, "learning_rate": 1.5730257706770908e-05, "loss": 0.0291, "step": 110475 }, { "epoch": 1.5846383296047726, "grad_norm": 25.654193878173828, "learning_rate": 1.572627419320572e-05, "loss": 0.1908, "step": 110500 }, { "epoch": 1.584996845063959, "grad_norm": 3.048365831375122, "learning_rate": 1.5722290679640527e-05, "loss": 0.1646, "step": 110525 }, { "epoch": 1.5853553605231459, "grad_norm": 1.2362544536590576, "learning_rate": 1.5718307166075335e-05, "loss": 0.0812, "step": 110550 }, { "epoch": 1.5857138759823324, "grad_norm": 19.226409912109375, "learning_rate": 1.5714323652510143e-05, "loss": 0.1637, "step": 110575 }, { "epoch": 1.586072391441519, "grad_norm": 13.482420921325684, "learning_rate": 1.571034013894495e-05, "loss": 0.1874, "step": 110600 }, { "epoch": 1.5864309069007057, "grad_norm": 9.714757919311523, "learning_rate": 1.5706356625379762e-05, "loss": 0.1631, "step": 110625 }, { "epoch": 1.5867894223598922, "grad_norm": 3.507180690765381, "learning_rate": 1.570237311181457e-05, "loss": 0.1147, "step": 110650 }, { "epoch": 1.5871479378190787, "grad_norm": 12.377923965454102, "learning_rate": 1.5698389598249378e-05, "loss": 0.0939, "step": 110675 }, { "epoch": 1.5875064532782655, "grad_norm": 0.864164412021637, "learning_rate": 1.5694406084684185e-05, "loss": 0.1559, "step": 110700 }, { "epoch": 1.587864968737452, "grad_norm": 3.8399596214294434, "learning_rate": 1.5690422571118997e-05, "loss": 0.058, "step": 110725 }, { "epoch": 1.5882234841966385, "grad_norm": 1.9394279718399048, "learning_rate": 1.5686439057553805e-05, "loss": 0.1316, "step": 110750 }, { "epoch": 1.5885819996558252, "grad_norm": 8.541963577270508, "learning_rate": 1.5682455543988612e-05, "loss": 0.2121, "step": 110775 }, { "epoch": 1.5889405151150118, "grad_norm": 2.5412757396698, "learning_rate": 1.567847203042342e-05, "loss": 0.1035, "step": 110800 }, { "epoch": 1.5892990305741983, "grad_norm": 2.97137188911438, "learning_rate": 1.5674488516858228e-05, "loss": 0.0831, "step": 110825 }, { "epoch": 1.589657546033385, "grad_norm": 5.269392967224121, "learning_rate": 1.567050500329304e-05, "loss": 0.2124, "step": 110850 }, { "epoch": 1.5900160614925716, "grad_norm": 0.9445958137512207, "learning_rate": 1.5666521489727847e-05, "loss": 0.068, "step": 110875 }, { "epoch": 1.590374576951758, "grad_norm": 12.568552017211914, "learning_rate": 1.5662537976162655e-05, "loss": 0.0861, "step": 110900 }, { "epoch": 1.5907330924109448, "grad_norm": 0.0881243571639061, "learning_rate": 1.5658554462597463e-05, "loss": 0.071, "step": 110925 }, { "epoch": 1.5910916078701314, "grad_norm": 0.28377765417099, "learning_rate": 1.565457094903227e-05, "loss": 0.0699, "step": 110950 }, { "epoch": 1.591450123329318, "grad_norm": 3.82171630859375, "learning_rate": 1.5650587435467082e-05, "loss": 0.102, "step": 110975 }, { "epoch": 1.5918086387885046, "grad_norm": 18.61661148071289, "learning_rate": 1.564660392190189e-05, "loss": 0.1834, "step": 111000 }, { "epoch": 1.5921671542476912, "grad_norm": 0.38322746753692627, "learning_rate": 1.5642620408336698e-05, "loss": 0.0831, "step": 111025 }, { "epoch": 1.5925256697068777, "grad_norm": 17.406780242919922, "learning_rate": 1.5638636894771505e-05, "loss": 0.1363, "step": 111050 }, { "epoch": 1.5928841851660644, "grad_norm": 0.5807610154151917, "learning_rate": 1.5634653381206313e-05, "loss": 0.1135, "step": 111075 }, { "epoch": 1.593242700625251, "grad_norm": 9.470600128173828, "learning_rate": 1.5630669867641125e-05, "loss": 0.0903, "step": 111100 }, { "epoch": 1.5936012160844375, "grad_norm": 22.923940658569336, "learning_rate": 1.5626686354075932e-05, "loss": 0.1242, "step": 111125 }, { "epoch": 1.5939597315436242, "grad_norm": 0.7129669785499573, "learning_rate": 1.562270284051074e-05, "loss": 0.2053, "step": 111150 }, { "epoch": 1.5943182470028108, "grad_norm": 6.137176036834717, "learning_rate": 1.5618719326945548e-05, "loss": 0.1022, "step": 111175 }, { "epoch": 1.5946767624619973, "grad_norm": 4.760951995849609, "learning_rate": 1.5614735813380356e-05, "loss": 0.1428, "step": 111200 }, { "epoch": 1.595035277921184, "grad_norm": 13.642029762268066, "learning_rate": 1.5610752299815167e-05, "loss": 0.1022, "step": 111225 }, { "epoch": 1.5953937933803706, "grad_norm": 3.182339668273926, "learning_rate": 1.5606768786249975e-05, "loss": 0.2611, "step": 111250 }, { "epoch": 1.595752308839557, "grad_norm": 0.0845305398106575, "learning_rate": 1.5602785272684783e-05, "loss": 0.1505, "step": 111275 }, { "epoch": 1.5961108242987438, "grad_norm": 1.5389952659606934, "learning_rate": 1.559880175911959e-05, "loss": 0.0846, "step": 111300 }, { "epoch": 1.5964693397579304, "grad_norm": 6.4708685874938965, "learning_rate": 1.5594818245554402e-05, "loss": 0.1741, "step": 111325 }, { "epoch": 1.596827855217117, "grad_norm": 5.337104797363281, "learning_rate": 1.559083473198921e-05, "loss": 0.0818, "step": 111350 }, { "epoch": 1.5971863706763036, "grad_norm": 1.3445794582366943, "learning_rate": 1.5586851218424018e-05, "loss": 0.1378, "step": 111375 }, { "epoch": 1.5975448861354902, "grad_norm": 2.137669086456299, "learning_rate": 1.5582867704858825e-05, "loss": 0.0964, "step": 111400 }, { "epoch": 1.5979034015946767, "grad_norm": 9.19495677947998, "learning_rate": 1.5578884191293633e-05, "loss": 0.1727, "step": 111425 }, { "epoch": 1.5982619170538634, "grad_norm": 4.860259056091309, "learning_rate": 1.5574900677728444e-05, "loss": 0.1644, "step": 111450 }, { "epoch": 1.59862043251305, "grad_norm": 0.9365372061729431, "learning_rate": 1.5570917164163252e-05, "loss": 0.0743, "step": 111475 }, { "epoch": 1.5989789479722365, "grad_norm": 1.0434843301773071, "learning_rate": 1.556693365059806e-05, "loss": 0.141, "step": 111500 }, { "epoch": 1.5993374634314232, "grad_norm": 0.7385405898094177, "learning_rate": 1.5562950137032868e-05, "loss": 0.1671, "step": 111525 }, { "epoch": 1.5996959788906098, "grad_norm": 0.22998009622097015, "learning_rate": 1.5558966623467676e-05, "loss": 0.0633, "step": 111550 }, { "epoch": 1.6000544943497963, "grad_norm": 4.8020172119140625, "learning_rate": 1.5554983109902484e-05, "loss": 0.157, "step": 111575 }, { "epoch": 1.600413009808983, "grad_norm": 2.845041275024414, "learning_rate": 1.555099959633729e-05, "loss": 0.0641, "step": 111600 }, { "epoch": 1.6007715252681696, "grad_norm": 0.15155786275863647, "learning_rate": 1.55470160827721e-05, "loss": 0.1478, "step": 111625 }, { "epoch": 1.601130040727356, "grad_norm": 3.545516014099121, "learning_rate": 1.5543032569206907e-05, "loss": 0.0637, "step": 111650 }, { "epoch": 1.6014885561865428, "grad_norm": 15.987210273742676, "learning_rate": 1.5539049055641715e-05, "loss": 0.0976, "step": 111675 }, { "epoch": 1.6018470716457294, "grad_norm": 15.09727954864502, "learning_rate": 1.5535065542076526e-05, "loss": 0.1391, "step": 111700 }, { "epoch": 1.602205587104916, "grad_norm": 13.655477523803711, "learning_rate": 1.5531082028511334e-05, "loss": 0.1568, "step": 111725 }, { "epoch": 1.6025641025641026, "grad_norm": 0.7013760805130005, "learning_rate": 1.5527098514946142e-05, "loss": 0.1643, "step": 111750 }, { "epoch": 1.6029226180232892, "grad_norm": 5.5872697830200195, "learning_rate": 1.552311500138095e-05, "loss": 0.1314, "step": 111775 }, { "epoch": 1.6032811334824757, "grad_norm": 1.5167573690414429, "learning_rate": 1.5519131487815758e-05, "loss": 0.1126, "step": 111800 }, { "epoch": 1.6036396489416624, "grad_norm": 7.3395609855651855, "learning_rate": 1.551514797425057e-05, "loss": 0.1303, "step": 111825 }, { "epoch": 1.603998164400849, "grad_norm": 3.1273200511932373, "learning_rate": 1.5511164460685377e-05, "loss": 0.1363, "step": 111850 }, { "epoch": 1.6043566798600355, "grad_norm": 0.7048693895339966, "learning_rate": 1.5507180947120185e-05, "loss": 0.1139, "step": 111875 }, { "epoch": 1.6047151953192222, "grad_norm": 0.9097107648849487, "learning_rate": 1.5503197433554992e-05, "loss": 0.1336, "step": 111900 }, { "epoch": 1.6050737107784088, "grad_norm": 17.16159439086914, "learning_rate": 1.5499213919989804e-05, "loss": 0.1651, "step": 111925 }, { "epoch": 1.6054322262375953, "grad_norm": 0.848983645439148, "learning_rate": 1.549523040642461e-05, "loss": 0.1194, "step": 111950 }, { "epoch": 1.605790741696782, "grad_norm": 1.434770941734314, "learning_rate": 1.549124689285942e-05, "loss": 0.1262, "step": 111975 }, { "epoch": 1.6061492571559686, "grad_norm": 3.7201075553894043, "learning_rate": 1.5487263379294227e-05, "loss": 0.1896, "step": 112000 }, { "epoch": 1.606507772615155, "grad_norm": 2.1184139251708984, "learning_rate": 1.5483279865729035e-05, "loss": 0.0952, "step": 112025 }, { "epoch": 1.6068662880743418, "grad_norm": 0.3768002390861511, "learning_rate": 1.5479296352163846e-05, "loss": 0.1079, "step": 112050 }, { "epoch": 1.6072248035335284, "grad_norm": 28.24454116821289, "learning_rate": 1.5475312838598654e-05, "loss": 0.1049, "step": 112075 }, { "epoch": 1.607583318992715, "grad_norm": 8.396060943603516, "learning_rate": 1.5471329325033462e-05, "loss": 0.1268, "step": 112100 }, { "epoch": 1.6079418344519016, "grad_norm": 11.003251075744629, "learning_rate": 1.546734581146827e-05, "loss": 0.1136, "step": 112125 }, { "epoch": 1.6083003499110882, "grad_norm": 1.1842052936553955, "learning_rate": 1.5463362297903078e-05, "loss": 0.0901, "step": 112150 }, { "epoch": 1.6086588653702747, "grad_norm": 3.5207576751708984, "learning_rate": 1.545937878433789e-05, "loss": 0.113, "step": 112175 }, { "epoch": 1.6090173808294614, "grad_norm": 1.2231321334838867, "learning_rate": 1.5455395270772697e-05, "loss": 0.0682, "step": 112200 }, { "epoch": 1.609375896288648, "grad_norm": 1.2826125621795654, "learning_rate": 1.5451411757207505e-05, "loss": 0.1486, "step": 112225 }, { "epoch": 1.6097344117478345, "grad_norm": 0.14970771968364716, "learning_rate": 1.5447428243642312e-05, "loss": 0.0576, "step": 112250 }, { "epoch": 1.6100929272070212, "grad_norm": 8.436515808105469, "learning_rate": 1.544344473007712e-05, "loss": 0.1554, "step": 112275 }, { "epoch": 1.6104514426662078, "grad_norm": 0.1328907161951065, "learning_rate": 1.543946121651193e-05, "loss": 0.1408, "step": 112300 }, { "epoch": 1.6108099581253943, "grad_norm": 0.0817224532365799, "learning_rate": 1.543547770294674e-05, "loss": 0.2044, "step": 112325 }, { "epoch": 1.611168473584581, "grad_norm": 17.77408218383789, "learning_rate": 1.5431494189381547e-05, "loss": 0.1814, "step": 112350 }, { "epoch": 1.6115269890437676, "grad_norm": 2.059018850326538, "learning_rate": 1.5427510675816355e-05, "loss": 0.2507, "step": 112375 }, { "epoch": 1.611885504502954, "grad_norm": 0.8910621404647827, "learning_rate": 1.5423527162251163e-05, "loss": 0.1306, "step": 112400 }, { "epoch": 1.6122440199621408, "grad_norm": 24.00347137451172, "learning_rate": 1.5419543648685974e-05, "loss": 0.1375, "step": 112425 }, { "epoch": 1.6126025354213274, "grad_norm": 16.46379280090332, "learning_rate": 1.5415560135120782e-05, "loss": 0.1712, "step": 112450 }, { "epoch": 1.612961050880514, "grad_norm": 18.478515625, "learning_rate": 1.541157662155559e-05, "loss": 0.1609, "step": 112475 }, { "epoch": 1.6133195663397006, "grad_norm": 11.306337356567383, "learning_rate": 1.5407593107990398e-05, "loss": 0.0828, "step": 112500 }, { "epoch": 1.6136780817988872, "grad_norm": 5.800646781921387, "learning_rate": 1.540360959442521e-05, "loss": 0.1543, "step": 112525 }, { "epoch": 1.6140365972580737, "grad_norm": 15.809135437011719, "learning_rate": 1.5399626080860017e-05, "loss": 0.2035, "step": 112550 }, { "epoch": 1.6143951127172604, "grad_norm": 12.907777786254883, "learning_rate": 1.5395642567294825e-05, "loss": 0.1039, "step": 112575 }, { "epoch": 1.614753628176447, "grad_norm": 2.8142759799957275, "learning_rate": 1.5391659053729632e-05, "loss": 0.1903, "step": 112600 }, { "epoch": 1.6151121436356335, "grad_norm": 1.1494107246398926, "learning_rate": 1.538767554016444e-05, "loss": 0.124, "step": 112625 }, { "epoch": 1.6154706590948202, "grad_norm": 1.266666293144226, "learning_rate": 1.5383692026599248e-05, "loss": 0.054, "step": 112650 }, { "epoch": 1.6158291745540068, "grad_norm": 11.302698135375977, "learning_rate": 1.5379708513034056e-05, "loss": 0.1588, "step": 112675 }, { "epoch": 1.6161876900131933, "grad_norm": 2.8204596042633057, "learning_rate": 1.5375724999468864e-05, "loss": 0.0858, "step": 112700 }, { "epoch": 1.61654620547238, "grad_norm": 0.6203196048736572, "learning_rate": 1.537174148590367e-05, "loss": 0.1361, "step": 112725 }, { "epoch": 1.6169047209315666, "grad_norm": 1.4002128839492798, "learning_rate": 1.536775797233848e-05, "loss": 0.0942, "step": 112750 }, { "epoch": 1.617263236390753, "grad_norm": 0.6749517321586609, "learning_rate": 1.536377445877329e-05, "loss": 0.197, "step": 112775 }, { "epoch": 1.6176217518499398, "grad_norm": 7.037564277648926, "learning_rate": 1.53597909452081e-05, "loss": 0.1556, "step": 112800 }, { "epoch": 1.6179802673091264, "grad_norm": 1.9370392560958862, "learning_rate": 1.5355807431642906e-05, "loss": 0.1011, "step": 112825 }, { "epoch": 1.618338782768313, "grad_norm": 0.4509413540363312, "learning_rate": 1.5351823918077714e-05, "loss": 0.1275, "step": 112850 }, { "epoch": 1.6186972982274996, "grad_norm": 5.692628860473633, "learning_rate": 1.5347840404512522e-05, "loss": 0.1582, "step": 112875 }, { "epoch": 1.6190558136866862, "grad_norm": 8.943541526794434, "learning_rate": 1.5343856890947333e-05, "loss": 0.1446, "step": 112900 }, { "epoch": 1.6194143291458727, "grad_norm": 6.961772918701172, "learning_rate": 1.533987337738214e-05, "loss": 0.1535, "step": 112925 }, { "epoch": 1.6197728446050594, "grad_norm": 2.6695363521575928, "learning_rate": 1.533588986381695e-05, "loss": 0.1151, "step": 112950 }, { "epoch": 1.620131360064246, "grad_norm": 23.33633804321289, "learning_rate": 1.5331906350251757e-05, "loss": 0.152, "step": 112975 }, { "epoch": 1.6204898755234325, "grad_norm": 3.421076536178589, "learning_rate": 1.5327922836686565e-05, "loss": 0.1462, "step": 113000 }, { "epoch": 1.6208483909826192, "grad_norm": 0.24243944883346558, "learning_rate": 1.5323939323121376e-05, "loss": 0.0836, "step": 113025 }, { "epoch": 1.6212069064418058, "grad_norm": 13.589010238647461, "learning_rate": 1.5319955809556184e-05, "loss": 0.1967, "step": 113050 }, { "epoch": 1.6215654219009923, "grad_norm": 9.86119270324707, "learning_rate": 1.531597229599099e-05, "loss": 0.1437, "step": 113075 }, { "epoch": 1.621923937360179, "grad_norm": 0.18476465344429016, "learning_rate": 1.53119887824258e-05, "loss": 0.0766, "step": 113100 }, { "epoch": 1.6222824528193656, "grad_norm": 0.13079750537872314, "learning_rate": 1.530800526886061e-05, "loss": 0.0946, "step": 113125 }, { "epoch": 1.622640968278552, "grad_norm": 1.1925601959228516, "learning_rate": 1.530402175529542e-05, "loss": 0.1285, "step": 113150 }, { "epoch": 1.6229994837377388, "grad_norm": 3.750605344772339, "learning_rate": 1.5300038241730226e-05, "loss": 0.1734, "step": 113175 }, { "epoch": 1.6233579991969254, "grad_norm": 27.10495376586914, "learning_rate": 1.5296054728165034e-05, "loss": 0.143, "step": 113200 }, { "epoch": 1.6237165146561119, "grad_norm": 0.854514479637146, "learning_rate": 1.5292071214599842e-05, "loss": 0.1265, "step": 113225 }, { "epoch": 1.6240750301152986, "grad_norm": 3.86334228515625, "learning_rate": 1.5288087701034653e-05, "loss": 0.1327, "step": 113250 }, { "epoch": 1.6244335455744852, "grad_norm": 7.381829738616943, "learning_rate": 1.528410418746946e-05, "loss": 0.1651, "step": 113275 }, { "epoch": 1.6247920610336717, "grad_norm": 25.146244049072266, "learning_rate": 1.528012067390427e-05, "loss": 0.1907, "step": 113300 }, { "epoch": 1.6251505764928584, "grad_norm": 13.734879493713379, "learning_rate": 1.5276137160339077e-05, "loss": 0.1307, "step": 113325 }, { "epoch": 1.625509091952045, "grad_norm": 0.8996734619140625, "learning_rate": 1.5272153646773885e-05, "loss": 0.0958, "step": 113350 }, { "epoch": 1.6258676074112315, "grad_norm": 4.953733444213867, "learning_rate": 1.5268170133208696e-05, "loss": 0.1335, "step": 113375 }, { "epoch": 1.6262261228704182, "grad_norm": 8.818832397460938, "learning_rate": 1.5264186619643504e-05, "loss": 0.1343, "step": 113400 }, { "epoch": 1.6265846383296048, "grad_norm": 1.826992392539978, "learning_rate": 1.526020310607831e-05, "loss": 0.1463, "step": 113425 }, { "epoch": 1.6269431537887913, "grad_norm": 11.589485168457031, "learning_rate": 1.525621959251312e-05, "loss": 0.1582, "step": 113450 }, { "epoch": 1.627301669247978, "grad_norm": 1.5187784433364868, "learning_rate": 1.5252236078947926e-05, "loss": 0.128, "step": 113475 }, { "epoch": 1.6276601847071646, "grad_norm": 27.120820999145508, "learning_rate": 1.5248252565382737e-05, "loss": 0.2017, "step": 113500 }, { "epoch": 1.628018700166351, "grad_norm": 0.10242827981710434, "learning_rate": 1.5244269051817545e-05, "loss": 0.1009, "step": 113525 }, { "epoch": 1.6283772156255378, "grad_norm": 13.412840843200684, "learning_rate": 1.5240285538252353e-05, "loss": 0.128, "step": 113550 }, { "epoch": 1.6287357310847244, "grad_norm": 10.943232536315918, "learning_rate": 1.523630202468716e-05, "loss": 0.0973, "step": 113575 }, { "epoch": 1.6290942465439109, "grad_norm": 1.8355530500411987, "learning_rate": 1.5232318511121968e-05, "loss": 0.0837, "step": 113600 }, { "epoch": 1.6294527620030976, "grad_norm": 0.7881605625152588, "learning_rate": 1.522833499755678e-05, "loss": 0.1395, "step": 113625 }, { "epoch": 1.6298112774622842, "grad_norm": 4.949732303619385, "learning_rate": 1.5224351483991587e-05, "loss": 0.1147, "step": 113650 }, { "epoch": 1.6301697929214707, "grad_norm": 7.874276638031006, "learning_rate": 1.5220367970426395e-05, "loss": 0.1016, "step": 113675 }, { "epoch": 1.6305283083806574, "grad_norm": 0.017793362960219383, "learning_rate": 1.5216384456861203e-05, "loss": 0.1214, "step": 113700 }, { "epoch": 1.630886823839844, "grad_norm": 5.516775608062744, "learning_rate": 1.5212400943296014e-05, "loss": 0.1664, "step": 113725 }, { "epoch": 1.6312453392990305, "grad_norm": 5.077443599700928, "learning_rate": 1.5208417429730822e-05, "loss": 0.1046, "step": 113750 }, { "epoch": 1.6316038547582172, "grad_norm": 1.1476367712020874, "learning_rate": 1.520443391616563e-05, "loss": 0.1257, "step": 113775 }, { "epoch": 1.6319623702174038, "grad_norm": 3.79379940032959, "learning_rate": 1.5200450402600438e-05, "loss": 0.083, "step": 113800 }, { "epoch": 1.6323208856765903, "grad_norm": 1.4883191585540771, "learning_rate": 1.5196466889035246e-05, "loss": 0.1332, "step": 113825 }, { "epoch": 1.632679401135777, "grad_norm": 10.787284851074219, "learning_rate": 1.5192483375470057e-05, "loss": 0.1071, "step": 113850 }, { "epoch": 1.6330379165949636, "grad_norm": 3.486327886581421, "learning_rate": 1.5188499861904865e-05, "loss": 0.0895, "step": 113875 }, { "epoch": 1.63339643205415, "grad_norm": 12.569319725036621, "learning_rate": 1.5184516348339673e-05, "loss": 0.2176, "step": 113900 }, { "epoch": 1.6337549475133368, "grad_norm": 10.959856033325195, "learning_rate": 1.518053283477448e-05, "loss": 0.1871, "step": 113925 }, { "epoch": 1.6341134629725234, "grad_norm": 2.2637999057769775, "learning_rate": 1.5176549321209288e-05, "loss": 0.1445, "step": 113950 }, { "epoch": 1.6344719784317099, "grad_norm": 0.454983651638031, "learning_rate": 1.51725658076441e-05, "loss": 0.1488, "step": 113975 }, { "epoch": 1.6348304938908966, "grad_norm": 20.339344024658203, "learning_rate": 1.5168582294078907e-05, "loss": 0.1803, "step": 114000 }, { "epoch": 1.6351890093500832, "grad_norm": 10.919414520263672, "learning_rate": 1.5164598780513713e-05, "loss": 0.0824, "step": 114025 }, { "epoch": 1.6355475248092697, "grad_norm": 2.0678627490997314, "learning_rate": 1.5160615266948521e-05, "loss": 0.1375, "step": 114050 }, { "epoch": 1.6359060402684564, "grad_norm": 1.1369608640670776, "learning_rate": 1.515663175338333e-05, "loss": 0.2479, "step": 114075 }, { "epoch": 1.636264555727643, "grad_norm": 15.102277755737305, "learning_rate": 1.515264823981814e-05, "loss": 0.1199, "step": 114100 }, { "epoch": 1.6366230711868295, "grad_norm": 10.988997459411621, "learning_rate": 1.5148664726252948e-05, "loss": 0.0801, "step": 114125 }, { "epoch": 1.6369815866460162, "grad_norm": 10.378682136535645, "learning_rate": 1.5144681212687756e-05, "loss": 0.1693, "step": 114150 }, { "epoch": 1.6373401021052028, "grad_norm": 17.611042022705078, "learning_rate": 1.5140697699122564e-05, "loss": 0.1191, "step": 114175 }, { "epoch": 1.6376986175643893, "grad_norm": 9.947006225585938, "learning_rate": 1.5136714185557372e-05, "loss": 0.1622, "step": 114200 }, { "epoch": 1.638057133023576, "grad_norm": 0.6024399399757385, "learning_rate": 1.5132730671992183e-05, "loss": 0.0803, "step": 114225 }, { "epoch": 1.6384156484827626, "grad_norm": 2.85636568069458, "learning_rate": 1.5128747158426991e-05, "loss": 0.1604, "step": 114250 }, { "epoch": 1.638774163941949, "grad_norm": 0.13448001444339752, "learning_rate": 1.5124763644861799e-05, "loss": 0.0855, "step": 114275 }, { "epoch": 1.6391326794011358, "grad_norm": 0.48157253861427307, "learning_rate": 1.5120780131296607e-05, "loss": 0.1183, "step": 114300 }, { "epoch": 1.6394911948603224, "grad_norm": 4.47376823425293, "learning_rate": 1.5116796617731414e-05, "loss": 0.1731, "step": 114325 }, { "epoch": 1.6398497103195089, "grad_norm": 0.7102075815200806, "learning_rate": 1.5112813104166226e-05, "loss": 0.1172, "step": 114350 }, { "epoch": 1.6402082257786956, "grad_norm": 4.800610065460205, "learning_rate": 1.5108829590601033e-05, "loss": 0.1233, "step": 114375 }, { "epoch": 1.6405667412378822, "grad_norm": 0.3049097955226898, "learning_rate": 1.5104846077035841e-05, "loss": 0.0956, "step": 114400 }, { "epoch": 1.6409252566970687, "grad_norm": 0.1465855985879898, "learning_rate": 1.5100862563470649e-05, "loss": 0.0666, "step": 114425 }, { "epoch": 1.6412837721562554, "grad_norm": 8.568418502807617, "learning_rate": 1.509687904990546e-05, "loss": 0.1633, "step": 114450 }, { "epoch": 1.641642287615442, "grad_norm": 1.7930480241775513, "learning_rate": 1.5092895536340268e-05, "loss": 0.1595, "step": 114475 }, { "epoch": 1.6420008030746285, "grad_norm": 0.37006351351737976, "learning_rate": 1.5088912022775076e-05, "loss": 0.1568, "step": 114500 }, { "epoch": 1.6423593185338152, "grad_norm": 1.8507894277572632, "learning_rate": 1.5084928509209884e-05, "loss": 0.1526, "step": 114525 }, { "epoch": 1.6427178339930018, "grad_norm": 10.839665412902832, "learning_rate": 1.5080944995644692e-05, "loss": 0.134, "step": 114550 }, { "epoch": 1.6430763494521883, "grad_norm": 0.213973268866539, "learning_rate": 1.5076961482079501e-05, "loss": 0.1192, "step": 114575 }, { "epoch": 1.643434864911375, "grad_norm": 13.712465286254883, "learning_rate": 1.5072977968514309e-05, "loss": 0.1074, "step": 114600 }, { "epoch": 1.6437933803705616, "grad_norm": 4.877667427062988, "learning_rate": 1.5068994454949117e-05, "loss": 0.0457, "step": 114625 }, { "epoch": 1.644151895829748, "grad_norm": 12.743515014648438, "learning_rate": 1.5065010941383925e-05, "loss": 0.1797, "step": 114650 }, { "epoch": 1.6445104112889348, "grad_norm": 5.931875228881836, "learning_rate": 1.5061027427818733e-05, "loss": 0.1766, "step": 114675 }, { "epoch": 1.6448689267481214, "grad_norm": 4.3426923751831055, "learning_rate": 1.5057043914253544e-05, "loss": 0.1063, "step": 114700 }, { "epoch": 1.6452274422073079, "grad_norm": 14.479171752929688, "learning_rate": 1.5053060400688352e-05, "loss": 0.1637, "step": 114725 }, { "epoch": 1.6455859576664946, "grad_norm": 1.4127835035324097, "learning_rate": 1.504907688712316e-05, "loss": 0.1658, "step": 114750 }, { "epoch": 1.6459444731256812, "grad_norm": 0.8050453066825867, "learning_rate": 1.5045093373557967e-05, "loss": 0.0403, "step": 114775 }, { "epoch": 1.6463029885848677, "grad_norm": 7.100243091583252, "learning_rate": 1.5041109859992775e-05, "loss": 0.1055, "step": 114800 }, { "epoch": 1.6466615040440544, "grad_norm": 6.868293762207031, "learning_rate": 1.5037126346427587e-05, "loss": 0.0815, "step": 114825 }, { "epoch": 1.647020019503241, "grad_norm": 0.45267757773399353, "learning_rate": 1.5033142832862394e-05, "loss": 0.1292, "step": 114850 }, { "epoch": 1.6473785349624275, "grad_norm": 17.092809677124023, "learning_rate": 1.5029159319297202e-05, "loss": 0.1645, "step": 114875 }, { "epoch": 1.6477370504216142, "grad_norm": 0.174832284450531, "learning_rate": 1.502517580573201e-05, "loss": 0.1044, "step": 114900 }, { "epoch": 1.6480955658808008, "grad_norm": 10.155691146850586, "learning_rate": 1.5021192292166818e-05, "loss": 0.1794, "step": 114925 }, { "epoch": 1.6484540813399873, "grad_norm": 12.388981819152832, "learning_rate": 1.5017208778601629e-05, "loss": 0.1614, "step": 114950 }, { "epoch": 1.648812596799174, "grad_norm": 4.86134147644043, "learning_rate": 1.5013225265036437e-05, "loss": 0.1245, "step": 114975 }, { "epoch": 1.6491711122583605, "grad_norm": 6.791014194488525, "learning_rate": 1.5009241751471245e-05, "loss": 0.0879, "step": 115000 }, { "epoch": 1.649529627717547, "grad_norm": 1.4077117443084717, "learning_rate": 1.5005258237906053e-05, "loss": 0.1407, "step": 115025 }, { "epoch": 1.6498881431767338, "grad_norm": 6.416537761688232, "learning_rate": 1.5001274724340864e-05, "loss": 0.1151, "step": 115050 }, { "epoch": 1.6502466586359203, "grad_norm": 2.760798931121826, "learning_rate": 1.499729121077567e-05, "loss": 0.1088, "step": 115075 }, { "epoch": 1.6506051740951069, "grad_norm": 12.135295867919922, "learning_rate": 1.499330769721048e-05, "loss": 0.1328, "step": 115100 }, { "epoch": 1.6509636895542936, "grad_norm": 11.043299674987793, "learning_rate": 1.4989324183645286e-05, "loss": 0.1462, "step": 115125 }, { "epoch": 1.6513222050134801, "grad_norm": 0.7465837001800537, "learning_rate": 1.4985340670080095e-05, "loss": 0.1355, "step": 115150 }, { "epoch": 1.6516807204726667, "grad_norm": 0.8363621830940247, "learning_rate": 1.4981357156514903e-05, "loss": 0.1219, "step": 115175 }, { "epoch": 1.6520392359318534, "grad_norm": 0.9119580984115601, "learning_rate": 1.4977373642949713e-05, "loss": 0.0582, "step": 115200 }, { "epoch": 1.65239775139104, "grad_norm": 12.839937210083008, "learning_rate": 1.497339012938452e-05, "loss": 0.1764, "step": 115225 }, { "epoch": 1.6527562668502265, "grad_norm": 1.3749116659164429, "learning_rate": 1.4969406615819328e-05, "loss": 0.1107, "step": 115250 }, { "epoch": 1.6531147823094132, "grad_norm": 0.3374287784099579, "learning_rate": 1.4965423102254138e-05, "loss": 0.1255, "step": 115275 }, { "epoch": 1.6534732977685997, "grad_norm": 14.272370338439941, "learning_rate": 1.4961439588688946e-05, "loss": 0.1767, "step": 115300 }, { "epoch": 1.6538318132277863, "grad_norm": 22.143207550048828, "learning_rate": 1.4957456075123755e-05, "loss": 0.1724, "step": 115325 }, { "epoch": 1.654190328686973, "grad_norm": 2.4174373149871826, "learning_rate": 1.4953472561558563e-05, "loss": 0.2267, "step": 115350 }, { "epoch": 1.6545488441461595, "grad_norm": 0.1640394926071167, "learning_rate": 1.4949489047993371e-05, "loss": 0.1538, "step": 115375 }, { "epoch": 1.654907359605346, "grad_norm": 6.158883571624756, "learning_rate": 1.494550553442818e-05, "loss": 0.109, "step": 115400 }, { "epoch": 1.6552658750645328, "grad_norm": 2.4546520709991455, "learning_rate": 1.4941522020862988e-05, "loss": 0.1749, "step": 115425 }, { "epoch": 1.6556243905237193, "grad_norm": 3.1857967376708984, "learning_rate": 1.4937538507297798e-05, "loss": 0.077, "step": 115450 }, { "epoch": 1.6559829059829059, "grad_norm": 8.863759994506836, "learning_rate": 1.4933554993732606e-05, "loss": 0.0621, "step": 115475 }, { "epoch": 1.6563414214420926, "grad_norm": 15.036480903625488, "learning_rate": 1.4929571480167415e-05, "loss": 0.1091, "step": 115500 }, { "epoch": 1.6566999369012791, "grad_norm": 1.0589373111724854, "learning_rate": 1.4925587966602223e-05, "loss": 0.0964, "step": 115525 }, { "epoch": 1.6570584523604657, "grad_norm": 7.543754577636719, "learning_rate": 1.4921604453037031e-05, "loss": 0.1696, "step": 115550 }, { "epoch": 1.6574169678196524, "grad_norm": 25.697856903076172, "learning_rate": 1.491762093947184e-05, "loss": 0.1118, "step": 115575 }, { "epoch": 1.657775483278839, "grad_norm": 4.285641670227051, "learning_rate": 1.4913637425906648e-05, "loss": 0.1713, "step": 115600 }, { "epoch": 1.6581339987380255, "grad_norm": 5.368215560913086, "learning_rate": 1.4909653912341458e-05, "loss": 0.1644, "step": 115625 }, { "epoch": 1.6584925141972122, "grad_norm": 2.4966495037078857, "learning_rate": 1.4905670398776266e-05, "loss": 0.117, "step": 115650 }, { "epoch": 1.6588510296563987, "grad_norm": 4.0409111976623535, "learning_rate": 1.4901686885211072e-05, "loss": 0.157, "step": 115675 }, { "epoch": 1.6592095451155853, "grad_norm": 3.3796498775482178, "learning_rate": 1.4897703371645881e-05, "loss": 0.1696, "step": 115700 }, { "epoch": 1.659568060574772, "grad_norm": 12.176471710205078, "learning_rate": 1.489371985808069e-05, "loss": 0.1432, "step": 115725 }, { "epoch": 1.6599265760339585, "grad_norm": 5.072174549102783, "learning_rate": 1.4889736344515499e-05, "loss": 0.1136, "step": 115750 }, { "epoch": 1.660285091493145, "grad_norm": 9.905001640319824, "learning_rate": 1.4885752830950307e-05, "loss": 0.1669, "step": 115775 }, { "epoch": 1.6606436069523318, "grad_norm": 0.6847703456878662, "learning_rate": 1.4881769317385116e-05, "loss": 0.1372, "step": 115800 }, { "epoch": 1.6610021224115183, "grad_norm": 0.9002034664154053, "learning_rate": 1.4877785803819924e-05, "loss": 0.1032, "step": 115825 }, { "epoch": 1.6613606378707049, "grad_norm": 1.2756986618041992, "learning_rate": 1.4873802290254732e-05, "loss": 0.1541, "step": 115850 }, { "epoch": 1.6617191533298916, "grad_norm": 10.437382698059082, "learning_rate": 1.4869818776689541e-05, "loss": 0.0811, "step": 115875 }, { "epoch": 1.6620776687890781, "grad_norm": 1.4748882055282593, "learning_rate": 1.486583526312435e-05, "loss": 0.1294, "step": 115900 }, { "epoch": 1.6624361842482647, "grad_norm": 6.729259014129639, "learning_rate": 1.4861851749559159e-05, "loss": 0.1084, "step": 115925 }, { "epoch": 1.6627946997074514, "grad_norm": 0.4244295656681061, "learning_rate": 1.4857868235993967e-05, "loss": 0.1496, "step": 115950 }, { "epoch": 1.663153215166638, "grad_norm": 18.804508209228516, "learning_rate": 1.4853884722428774e-05, "loss": 0.0913, "step": 115975 }, { "epoch": 1.6635117306258245, "grad_norm": 2.9023544788360596, "learning_rate": 1.4849901208863584e-05, "loss": 0.1075, "step": 116000 }, { "epoch": 1.6638702460850112, "grad_norm": 1.2840460538864136, "learning_rate": 1.4845917695298392e-05, "loss": 0.0875, "step": 116025 }, { "epoch": 1.6642287615441977, "grad_norm": 1.4373592138290405, "learning_rate": 1.4841934181733201e-05, "loss": 0.0944, "step": 116050 }, { "epoch": 1.6645872770033843, "grad_norm": 2.5830042362213135, "learning_rate": 1.483795066816801e-05, "loss": 0.1075, "step": 116075 }, { "epoch": 1.664945792462571, "grad_norm": 21.428186416625977, "learning_rate": 1.4833967154602819e-05, "loss": 0.0918, "step": 116100 }, { "epoch": 1.6653043079217575, "grad_norm": 8.126228332519531, "learning_rate": 1.4829983641037627e-05, "loss": 0.137, "step": 116125 }, { "epoch": 1.665662823380944, "grad_norm": 1.4515607357025146, "learning_rate": 1.4826000127472434e-05, "loss": 0.071, "step": 116150 }, { "epoch": 1.6660213388401308, "grad_norm": 1.0414798259735107, "learning_rate": 1.4822016613907244e-05, "loss": 0.131, "step": 116175 }, { "epoch": 1.6663798542993173, "grad_norm": 0.08900777250528336, "learning_rate": 1.4818033100342052e-05, "loss": 0.1304, "step": 116200 }, { "epoch": 1.6667383697585039, "grad_norm": 0.2666081190109253, "learning_rate": 1.481404958677686e-05, "loss": 0.2078, "step": 116225 }, { "epoch": 1.6670968852176906, "grad_norm": 2.6669366359710693, "learning_rate": 1.4810066073211668e-05, "loss": 0.1197, "step": 116250 }, { "epoch": 1.6674554006768771, "grad_norm": 8.548626899719238, "learning_rate": 1.4806082559646475e-05, "loss": 0.0751, "step": 116275 }, { "epoch": 1.6678139161360637, "grad_norm": 5.115691184997559, "learning_rate": 1.4802099046081285e-05, "loss": 0.1419, "step": 116300 }, { "epoch": 1.6681724315952504, "grad_norm": 8.309897422790527, "learning_rate": 1.4798115532516093e-05, "loss": 0.1019, "step": 116325 }, { "epoch": 1.668530947054437, "grad_norm": 0.6824802160263062, "learning_rate": 1.4794132018950902e-05, "loss": 0.1261, "step": 116350 }, { "epoch": 1.6688894625136235, "grad_norm": 3.959686040878296, "learning_rate": 1.479014850538571e-05, "loss": 0.0953, "step": 116375 }, { "epoch": 1.6692479779728102, "grad_norm": 13.489941596984863, "learning_rate": 1.478616499182052e-05, "loss": 0.1463, "step": 116400 }, { "epoch": 1.6696064934319967, "grad_norm": 1.1969045400619507, "learning_rate": 1.4782181478255328e-05, "loss": 0.1363, "step": 116425 }, { "epoch": 1.6699650088911833, "grad_norm": 10.884803771972656, "learning_rate": 1.4778197964690135e-05, "loss": 0.1786, "step": 116450 }, { "epoch": 1.67032352435037, "grad_norm": 16.87468910217285, "learning_rate": 1.4774214451124945e-05, "loss": 0.1254, "step": 116475 }, { "epoch": 1.6706820398095565, "grad_norm": 8.77364444732666, "learning_rate": 1.4770230937559753e-05, "loss": 0.1481, "step": 116500 }, { "epoch": 1.671040555268743, "grad_norm": 14.330191612243652, "learning_rate": 1.4766247423994562e-05, "loss": 0.2237, "step": 116525 }, { "epoch": 1.6713990707279298, "grad_norm": 6.409341812133789, "learning_rate": 1.476226391042937e-05, "loss": 0.114, "step": 116550 }, { "epoch": 1.6717575861871163, "grad_norm": 4.169719219207764, "learning_rate": 1.4758280396864178e-05, "loss": 0.1318, "step": 116575 }, { "epoch": 1.6721161016463029, "grad_norm": 8.769893646240234, "learning_rate": 1.4754296883298988e-05, "loss": 0.1357, "step": 116600 }, { "epoch": 1.6724746171054896, "grad_norm": 1.0750824213027954, "learning_rate": 1.4750313369733795e-05, "loss": 0.1297, "step": 116625 }, { "epoch": 1.6728331325646761, "grad_norm": 0.5739865303039551, "learning_rate": 1.4746329856168605e-05, "loss": 0.0988, "step": 116650 }, { "epoch": 1.6731916480238627, "grad_norm": 2.6214613914489746, "learning_rate": 1.4742346342603413e-05, "loss": 0.1623, "step": 116675 }, { "epoch": 1.6735501634830494, "grad_norm": 2.5244908332824707, "learning_rate": 1.4738362829038222e-05, "loss": 0.1525, "step": 116700 }, { "epoch": 1.673908678942236, "grad_norm": 0.39305195212364197, "learning_rate": 1.473437931547303e-05, "loss": 0.1255, "step": 116725 }, { "epoch": 1.6742671944014225, "grad_norm": 0.8127602338790894, "learning_rate": 1.4730395801907838e-05, "loss": 0.0978, "step": 116750 }, { "epoch": 1.6746257098606092, "grad_norm": 3.6989450454711914, "learning_rate": 1.4726412288342646e-05, "loss": 0.1195, "step": 116775 }, { "epoch": 1.6749842253197957, "grad_norm": 0.22529074549674988, "learning_rate": 1.4722428774777454e-05, "loss": 0.2323, "step": 116800 }, { "epoch": 1.6753427407789823, "grad_norm": 2.0920052528381348, "learning_rate": 1.4718445261212263e-05, "loss": 0.2178, "step": 116825 }, { "epoch": 1.675701256238169, "grad_norm": 6.038250923156738, "learning_rate": 1.4714461747647071e-05, "loss": 0.1961, "step": 116850 }, { "epoch": 1.6760597716973555, "grad_norm": 17.97923469543457, "learning_rate": 1.4710478234081879e-05, "loss": 0.1063, "step": 116875 }, { "epoch": 1.676418287156542, "grad_norm": 6.554179668426514, "learning_rate": 1.4706494720516688e-05, "loss": 0.1403, "step": 116900 }, { "epoch": 1.6767768026157288, "grad_norm": 0.6860151290893555, "learning_rate": 1.4702511206951496e-05, "loss": 0.199, "step": 116925 }, { "epoch": 1.6771353180749153, "grad_norm": 2.9356346130371094, "learning_rate": 1.4698527693386306e-05, "loss": 0.0817, "step": 116950 }, { "epoch": 1.6774938335341019, "grad_norm": 2.716614007949829, "learning_rate": 1.4694544179821114e-05, "loss": 0.1601, "step": 116975 }, { "epoch": 1.6778523489932886, "grad_norm": 4.017399787902832, "learning_rate": 1.4690560666255923e-05, "loss": 0.1235, "step": 117000 }, { "epoch": 1.6782108644524751, "grad_norm": 3.4367005825042725, "learning_rate": 1.4686577152690731e-05, "loss": 0.1304, "step": 117025 }, { "epoch": 1.6785693799116617, "grad_norm": 1.8103505373001099, "learning_rate": 1.4682593639125539e-05, "loss": 0.1093, "step": 117050 }, { "epoch": 1.6789278953708484, "grad_norm": 0.30828702449798584, "learning_rate": 1.4678610125560348e-05, "loss": 0.0387, "step": 117075 }, { "epoch": 1.679286410830035, "grad_norm": 2.0524256229400635, "learning_rate": 1.4674626611995156e-05, "loss": 0.1791, "step": 117100 }, { "epoch": 1.6796449262892215, "grad_norm": 0.6096739768981934, "learning_rate": 1.4670643098429966e-05, "loss": 0.1747, "step": 117125 }, { "epoch": 1.6800034417484082, "grad_norm": 1.7092541456222534, "learning_rate": 1.4666659584864774e-05, "loss": 0.1354, "step": 117150 }, { "epoch": 1.6803619572075947, "grad_norm": 9.867462158203125, "learning_rate": 1.4662676071299582e-05, "loss": 0.1121, "step": 117175 }, { "epoch": 1.6807204726667813, "grad_norm": 18.242015838623047, "learning_rate": 1.4658692557734391e-05, "loss": 0.1278, "step": 117200 }, { "epoch": 1.681078988125968, "grad_norm": 0.699058473110199, "learning_rate": 1.4654709044169199e-05, "loss": 0.0885, "step": 117225 }, { "epoch": 1.6814375035851548, "grad_norm": 1.1560384035110474, "learning_rate": 1.4650725530604008e-05, "loss": 0.1445, "step": 117250 }, { "epoch": 1.681796019044341, "grad_norm": 16.906953811645508, "learning_rate": 1.4646742017038816e-05, "loss": 0.1607, "step": 117275 }, { "epoch": 1.6821545345035278, "grad_norm": 4.736642360687256, "learning_rate": 1.4642758503473626e-05, "loss": 0.1825, "step": 117300 }, { "epoch": 1.6825130499627146, "grad_norm": 8.342652320861816, "learning_rate": 1.4638774989908432e-05, "loss": 0.1349, "step": 117325 }, { "epoch": 1.6828715654219009, "grad_norm": 6.415328502655029, "learning_rate": 1.463479147634324e-05, "loss": 0.0832, "step": 117350 }, { "epoch": 1.6832300808810876, "grad_norm": 3.008418560028076, "learning_rate": 1.463080796277805e-05, "loss": 0.0938, "step": 117375 }, { "epoch": 1.6835885963402744, "grad_norm": 10.715727806091309, "learning_rate": 1.4626824449212857e-05, "loss": 0.0867, "step": 117400 }, { "epoch": 1.6839471117994607, "grad_norm": 0.06981667131185532, "learning_rate": 1.4622840935647667e-05, "loss": 0.1359, "step": 117425 }, { "epoch": 1.6843056272586474, "grad_norm": 6.385518550872803, "learning_rate": 1.4618857422082475e-05, "loss": 0.1224, "step": 117450 }, { "epoch": 1.6846641427178342, "grad_norm": 0.5480861663818359, "learning_rate": 1.4614873908517282e-05, "loss": 0.1404, "step": 117475 }, { "epoch": 1.6850226581770205, "grad_norm": 0.9021674394607544, "learning_rate": 1.4610890394952092e-05, "loss": 0.1228, "step": 117500 }, { "epoch": 1.6853811736362072, "grad_norm": 0.33229491114616394, "learning_rate": 1.46069068813869e-05, "loss": 0.1223, "step": 117525 }, { "epoch": 1.685739689095394, "grad_norm": 5.537556171417236, "learning_rate": 1.460292336782171e-05, "loss": 0.1225, "step": 117550 }, { "epoch": 1.6860982045545803, "grad_norm": 23.55714988708496, "learning_rate": 1.4598939854256517e-05, "loss": 0.0902, "step": 117575 }, { "epoch": 1.686456720013767, "grad_norm": 6.330302715301514, "learning_rate": 1.4594956340691327e-05, "loss": 0.1609, "step": 117600 }, { "epoch": 1.6868152354729538, "grad_norm": 22.055438995361328, "learning_rate": 1.4590972827126135e-05, "loss": 0.1161, "step": 117625 }, { "epoch": 1.68717375093214, "grad_norm": 22.312673568725586, "learning_rate": 1.4586989313560942e-05, "loss": 0.0747, "step": 117650 }, { "epoch": 1.6875322663913268, "grad_norm": 1.770993709564209, "learning_rate": 1.4583005799995752e-05, "loss": 0.0986, "step": 117675 }, { "epoch": 1.6878907818505136, "grad_norm": 3.740952491760254, "learning_rate": 1.457902228643056e-05, "loss": 0.0937, "step": 117700 }, { "epoch": 1.6882492973096999, "grad_norm": 3.5758183002471924, "learning_rate": 1.457503877286537e-05, "loss": 0.1554, "step": 117725 }, { "epoch": 1.6886078127688866, "grad_norm": 18.830297470092773, "learning_rate": 1.4571055259300177e-05, "loss": 0.1294, "step": 117750 }, { "epoch": 1.6889663282280734, "grad_norm": 3.312110424041748, "learning_rate": 1.4567071745734985e-05, "loss": 0.0855, "step": 117775 }, { "epoch": 1.6893248436872597, "grad_norm": 1.6020379066467285, "learning_rate": 1.4563088232169795e-05, "loss": 0.1346, "step": 117800 }, { "epoch": 1.6896833591464464, "grad_norm": 16.325010299682617, "learning_rate": 1.4559104718604602e-05, "loss": 0.1716, "step": 117825 }, { "epoch": 1.6900418746056332, "grad_norm": 14.688810348510742, "learning_rate": 1.4555121205039412e-05, "loss": 0.2036, "step": 117850 }, { "epoch": 1.6904003900648195, "grad_norm": 0.8896178007125854, "learning_rate": 1.4551137691474218e-05, "loss": 0.1354, "step": 117875 }, { "epoch": 1.6907589055240062, "grad_norm": 11.980156898498535, "learning_rate": 1.4547154177909028e-05, "loss": 0.1228, "step": 117900 }, { "epoch": 1.691117420983193, "grad_norm": 18.879501342773438, "learning_rate": 1.4543170664343835e-05, "loss": 0.1346, "step": 117925 }, { "epoch": 1.6914759364423793, "grad_norm": 1.3947399854660034, "learning_rate": 1.4539187150778643e-05, "loss": 0.1017, "step": 117950 }, { "epoch": 1.691834451901566, "grad_norm": 16.054214477539062, "learning_rate": 1.4535203637213453e-05, "loss": 0.1371, "step": 117975 }, { "epoch": 1.6921929673607528, "grad_norm": 1.1692897081375122, "learning_rate": 1.453122012364826e-05, "loss": 0.1301, "step": 118000 }, { "epoch": 1.692551482819939, "grad_norm": 4.185601234436035, "learning_rate": 1.452723661008307e-05, "loss": 0.1419, "step": 118025 }, { "epoch": 1.6929099982791258, "grad_norm": 10.40366268157959, "learning_rate": 1.4523253096517878e-05, "loss": 0.1031, "step": 118050 }, { "epoch": 1.6932685137383126, "grad_norm": 0.19058401882648468, "learning_rate": 1.4519269582952686e-05, "loss": 0.0735, "step": 118075 }, { "epoch": 1.6936270291974989, "grad_norm": 0.1712014526128769, "learning_rate": 1.4515286069387495e-05, "loss": 0.0952, "step": 118100 }, { "epoch": 1.6939855446566856, "grad_norm": 18.48647689819336, "learning_rate": 1.4511302555822303e-05, "loss": 0.2075, "step": 118125 }, { "epoch": 1.6943440601158724, "grad_norm": 9.111275672912598, "learning_rate": 1.4507319042257113e-05, "loss": 0.0898, "step": 118150 }, { "epoch": 1.6947025755750587, "grad_norm": 9.530776977539062, "learning_rate": 1.450333552869192e-05, "loss": 0.1113, "step": 118175 }, { "epoch": 1.6950610910342454, "grad_norm": 22.613853454589844, "learning_rate": 1.449935201512673e-05, "loss": 0.1941, "step": 118200 }, { "epoch": 1.6954196064934322, "grad_norm": 0.19273221492767334, "learning_rate": 1.4495368501561538e-05, "loss": 0.1371, "step": 118225 }, { "epoch": 1.6957781219526185, "grad_norm": 2.0340676307678223, "learning_rate": 1.4491384987996346e-05, "loss": 0.1585, "step": 118250 }, { "epoch": 1.6961366374118052, "grad_norm": 0.741582453250885, "learning_rate": 1.4487401474431155e-05, "loss": 0.1565, "step": 118275 }, { "epoch": 1.696495152870992, "grad_norm": 9.446676254272461, "learning_rate": 1.4483417960865963e-05, "loss": 0.0679, "step": 118300 }, { "epoch": 1.6968536683301783, "grad_norm": 12.615857124328613, "learning_rate": 1.4479434447300773e-05, "loss": 0.1218, "step": 118325 }, { "epoch": 1.697212183789365, "grad_norm": 5.3371100425720215, "learning_rate": 1.447545093373558e-05, "loss": 0.1314, "step": 118350 }, { "epoch": 1.6975706992485518, "grad_norm": 5.971244812011719, "learning_rate": 1.4471467420170389e-05, "loss": 0.1611, "step": 118375 }, { "epoch": 1.697929214707738, "grad_norm": 6.118712902069092, "learning_rate": 1.4467483906605198e-05, "loss": 0.13, "step": 118400 }, { "epoch": 1.6982877301669248, "grad_norm": 0.17515872418880463, "learning_rate": 1.4463500393040004e-05, "loss": 0.1523, "step": 118425 }, { "epoch": 1.6986462456261116, "grad_norm": 4.529242992401123, "learning_rate": 1.4459516879474814e-05, "loss": 0.1583, "step": 118450 }, { "epoch": 1.6990047610852979, "grad_norm": 8.004142761230469, "learning_rate": 1.4455533365909622e-05, "loss": 0.0863, "step": 118475 }, { "epoch": 1.6993632765444846, "grad_norm": 12.865274429321289, "learning_rate": 1.4451549852344431e-05, "loss": 0.1711, "step": 118500 }, { "epoch": 1.6997217920036714, "grad_norm": 6.798962116241455, "learning_rate": 1.4447566338779239e-05, "loss": 0.087, "step": 118525 }, { "epoch": 1.7000803074628577, "grad_norm": 3.488949775695801, "learning_rate": 1.4443582825214047e-05, "loss": 0.1052, "step": 118550 }, { "epoch": 1.7004388229220444, "grad_norm": 6.171590328216553, "learning_rate": 1.4439599311648856e-05, "loss": 0.0939, "step": 118575 }, { "epoch": 1.7007973383812311, "grad_norm": 5.323143005371094, "learning_rate": 1.4435615798083664e-05, "loss": 0.0647, "step": 118600 }, { "epoch": 1.7011558538404175, "grad_norm": 3.967649221420288, "learning_rate": 1.4431632284518474e-05, "loss": 0.0857, "step": 118625 }, { "epoch": 1.7015143692996042, "grad_norm": 4.664878845214844, "learning_rate": 1.4427648770953282e-05, "loss": 0.1607, "step": 118650 }, { "epoch": 1.701872884758791, "grad_norm": 10.265624046325684, "learning_rate": 1.442366525738809e-05, "loss": 0.12, "step": 118675 }, { "epoch": 1.7022314002179773, "grad_norm": 9.792373657226562, "learning_rate": 1.4419681743822899e-05, "loss": 0.1418, "step": 118700 }, { "epoch": 1.702589915677164, "grad_norm": 4.044111728668213, "learning_rate": 1.4415698230257707e-05, "loss": 0.2822, "step": 118725 }, { "epoch": 1.7029484311363507, "grad_norm": 0.593799889087677, "learning_rate": 1.4411714716692516e-05, "loss": 0.1329, "step": 118750 }, { "epoch": 1.703306946595537, "grad_norm": 10.790225982666016, "learning_rate": 1.4407731203127324e-05, "loss": 0.1114, "step": 118775 }, { "epoch": 1.7036654620547238, "grad_norm": 1.8066706657409668, "learning_rate": 1.4403747689562132e-05, "loss": 0.1553, "step": 118800 }, { "epoch": 1.7040239775139105, "grad_norm": 4.586004257202148, "learning_rate": 1.4399764175996942e-05, "loss": 0.1191, "step": 118825 }, { "epoch": 1.7043824929730969, "grad_norm": 2.91341233253479, "learning_rate": 1.439578066243175e-05, "loss": 0.0732, "step": 118850 }, { "epoch": 1.7047410084322836, "grad_norm": 0.9671263098716736, "learning_rate": 1.4391797148866559e-05, "loss": 0.1009, "step": 118875 }, { "epoch": 1.7050995238914703, "grad_norm": 12.278162002563477, "learning_rate": 1.4387813635301367e-05, "loss": 0.125, "step": 118900 }, { "epoch": 1.7054580393506567, "grad_norm": 2.0845561027526855, "learning_rate": 1.4383830121736176e-05, "loss": 0.2171, "step": 118925 }, { "epoch": 1.7058165548098434, "grad_norm": 3.1578328609466553, "learning_rate": 1.4379846608170984e-05, "loss": 0.1024, "step": 118950 }, { "epoch": 1.7061750702690301, "grad_norm": 2.2373886108398438, "learning_rate": 1.437586309460579e-05, "loss": 0.1382, "step": 118975 }, { "epoch": 1.7065335857282165, "grad_norm": 5.151762962341309, "learning_rate": 1.43718795810406e-05, "loss": 0.1323, "step": 119000 }, { "epoch": 1.7068921011874032, "grad_norm": 0.18824245035648346, "learning_rate": 1.4367896067475408e-05, "loss": 0.0556, "step": 119025 }, { "epoch": 1.70725061664659, "grad_norm": 0.6853033304214478, "learning_rate": 1.4363912553910217e-05, "loss": 0.1222, "step": 119050 }, { "epoch": 1.7076091321057763, "grad_norm": 0.061731766909360886, "learning_rate": 1.4359929040345025e-05, "loss": 0.099, "step": 119075 }, { "epoch": 1.707967647564963, "grad_norm": 15.397771835327148, "learning_rate": 1.4355945526779833e-05, "loss": 0.2151, "step": 119100 }, { "epoch": 1.7083261630241497, "grad_norm": 5.291375160217285, "learning_rate": 1.4351962013214643e-05, "loss": 0.1844, "step": 119125 }, { "epoch": 1.708684678483336, "grad_norm": 2.042527437210083, "learning_rate": 1.434797849964945e-05, "loss": 0.079, "step": 119150 }, { "epoch": 1.7090431939425228, "grad_norm": 7.308110237121582, "learning_rate": 1.434399498608426e-05, "loss": 0.1677, "step": 119175 }, { "epoch": 1.7094017094017095, "grad_norm": 2.687350273132324, "learning_rate": 1.4340011472519068e-05, "loss": 0.0685, "step": 119200 }, { "epoch": 1.7097602248608958, "grad_norm": 1.6884294748306274, "learning_rate": 1.4336027958953877e-05, "loss": 0.0703, "step": 119225 }, { "epoch": 1.7101187403200826, "grad_norm": 24.26011085510254, "learning_rate": 1.4332044445388685e-05, "loss": 0.1291, "step": 119250 }, { "epoch": 1.7104772557792693, "grad_norm": 1.4042673110961914, "learning_rate": 1.4328060931823493e-05, "loss": 0.2199, "step": 119275 }, { "epoch": 1.7108357712384556, "grad_norm": 8.238384246826172, "learning_rate": 1.4324077418258303e-05, "loss": 0.1303, "step": 119300 }, { "epoch": 1.7111942866976424, "grad_norm": 8.749116897583008, "learning_rate": 1.432009390469311e-05, "loss": 0.1433, "step": 119325 }, { "epoch": 1.7115528021568291, "grad_norm": 12.753235816955566, "learning_rate": 1.431611039112792e-05, "loss": 0.0755, "step": 119350 }, { "epoch": 1.7119113176160154, "grad_norm": 16.341066360473633, "learning_rate": 1.4312126877562728e-05, "loss": 0.084, "step": 119375 }, { "epoch": 1.7122698330752022, "grad_norm": 7.370471477508545, "learning_rate": 1.4308143363997536e-05, "loss": 0.1493, "step": 119400 }, { "epoch": 1.712628348534389, "grad_norm": 1.6945961713790894, "learning_rate": 1.4304159850432345e-05, "loss": 0.1601, "step": 119425 }, { "epoch": 1.7129868639935752, "grad_norm": 1.6011806726455688, "learning_rate": 1.4300176336867153e-05, "loss": 0.1371, "step": 119450 }, { "epoch": 1.713345379452762, "grad_norm": 14.897636413574219, "learning_rate": 1.4296192823301963e-05, "loss": 0.1697, "step": 119475 }, { "epoch": 1.7137038949119487, "grad_norm": 0.21145913004875183, "learning_rate": 1.429220930973677e-05, "loss": 0.118, "step": 119500 }, { "epoch": 1.714062410371135, "grad_norm": 0.1488906592130661, "learning_rate": 1.4288225796171578e-05, "loss": 0.1131, "step": 119525 }, { "epoch": 1.7144209258303218, "grad_norm": 17.45012855529785, "learning_rate": 1.4284242282606386e-05, "loss": 0.1425, "step": 119550 }, { "epoch": 1.7147794412895085, "grad_norm": 25.968847274780273, "learning_rate": 1.4280258769041194e-05, "loss": 0.1564, "step": 119575 }, { "epoch": 1.7151379567486948, "grad_norm": 9.898372650146484, "learning_rate": 1.4276275255476003e-05, "loss": 0.1346, "step": 119600 }, { "epoch": 1.7154964722078816, "grad_norm": 9.330191612243652, "learning_rate": 1.4272291741910811e-05, "loss": 0.1365, "step": 119625 }, { "epoch": 1.7158549876670683, "grad_norm": 0.9916436672210693, "learning_rate": 1.426830822834562e-05, "loss": 0.1621, "step": 119650 }, { "epoch": 1.7162135031262546, "grad_norm": 7.867892265319824, "learning_rate": 1.4264324714780429e-05, "loss": 0.131, "step": 119675 }, { "epoch": 1.7165720185854414, "grad_norm": 4.989745616912842, "learning_rate": 1.4260341201215237e-05, "loss": 0.0845, "step": 119700 }, { "epoch": 1.7169305340446281, "grad_norm": 0.5451178550720215, "learning_rate": 1.4256357687650046e-05, "loss": 0.1317, "step": 119725 }, { "epoch": 1.7172890495038144, "grad_norm": 1.1594960689544678, "learning_rate": 1.4252374174084854e-05, "loss": 0.09, "step": 119750 }, { "epoch": 1.7176475649630012, "grad_norm": 0.6476234197616577, "learning_rate": 1.4248390660519663e-05, "loss": 0.1205, "step": 119775 }, { "epoch": 1.718006080422188, "grad_norm": 5.02520227432251, "learning_rate": 1.4244407146954471e-05, "loss": 0.1764, "step": 119800 }, { "epoch": 1.7183645958813742, "grad_norm": 1.7678864002227783, "learning_rate": 1.424042363338928e-05, "loss": 0.0622, "step": 119825 }, { "epoch": 1.718723111340561, "grad_norm": 18.228384017944336, "learning_rate": 1.4236440119824089e-05, "loss": 0.1233, "step": 119850 }, { "epoch": 1.7190816267997477, "grad_norm": 0.6308192610740662, "learning_rate": 1.4232456606258896e-05, "loss": 0.137, "step": 119875 }, { "epoch": 1.719440142258934, "grad_norm": 0.08732827007770538, "learning_rate": 1.4228473092693706e-05, "loss": 0.0903, "step": 119900 }, { "epoch": 1.7197986577181208, "grad_norm": 4.813003063201904, "learning_rate": 1.4224489579128514e-05, "loss": 0.1428, "step": 119925 }, { "epoch": 1.7201571731773075, "grad_norm": 7.123504638671875, "learning_rate": 1.4220506065563323e-05, "loss": 0.0937, "step": 119950 }, { "epoch": 1.7205156886364938, "grad_norm": 3.8003525733947754, "learning_rate": 1.4216522551998131e-05, "loss": 0.1806, "step": 119975 }, { "epoch": 1.7208742040956806, "grad_norm": 5.255599021911621, "learning_rate": 1.4212539038432939e-05, "loss": 0.1561, "step": 120000 }, { "epoch": 1.7212327195548673, "grad_norm": 0.07798822224140167, "learning_rate": 1.4208555524867749e-05, "loss": 0.1028, "step": 120025 }, { "epoch": 1.7215912350140536, "grad_norm": 0.49749955534935, "learning_rate": 1.4204572011302556e-05, "loss": 0.2619, "step": 120050 }, { "epoch": 1.7219497504732404, "grad_norm": 3.499528646469116, "learning_rate": 1.4200588497737364e-05, "loss": 0.1437, "step": 120075 }, { "epoch": 1.7223082659324271, "grad_norm": 6.149311542510986, "learning_rate": 1.4196604984172172e-05, "loss": 0.1785, "step": 120100 }, { "epoch": 1.7226667813916134, "grad_norm": 2.623746871948242, "learning_rate": 1.4192621470606982e-05, "loss": 0.1655, "step": 120125 }, { "epoch": 1.7230252968508002, "grad_norm": 10.387149810791016, "learning_rate": 1.418863795704179e-05, "loss": 0.0929, "step": 120150 }, { "epoch": 1.723383812309987, "grad_norm": 7.221709728240967, "learning_rate": 1.4184654443476597e-05, "loss": 0.1369, "step": 120175 }, { "epoch": 1.7237423277691732, "grad_norm": 9.452812194824219, "learning_rate": 1.4180670929911407e-05, "loss": 0.1338, "step": 120200 }, { "epoch": 1.72410084322836, "grad_norm": 1.8278354406356812, "learning_rate": 1.4176687416346215e-05, "loss": 0.159, "step": 120225 }, { "epoch": 1.7244593586875467, "grad_norm": 0.9183980822563171, "learning_rate": 1.4172703902781024e-05, "loss": 0.151, "step": 120250 }, { "epoch": 1.724817874146733, "grad_norm": 2.2628915309906006, "learning_rate": 1.4168720389215832e-05, "loss": 0.1072, "step": 120275 }, { "epoch": 1.7251763896059198, "grad_norm": 9.325264930725098, "learning_rate": 1.416473687565064e-05, "loss": 0.1286, "step": 120300 }, { "epoch": 1.7255349050651065, "grad_norm": 2.4739830493927, "learning_rate": 1.416075336208545e-05, "loss": 0.0949, "step": 120325 }, { "epoch": 1.7258934205242928, "grad_norm": 4.8270263671875, "learning_rate": 1.4156769848520257e-05, "loss": 0.1111, "step": 120350 }, { "epoch": 1.7262519359834796, "grad_norm": 14.8016357421875, "learning_rate": 1.4152786334955067e-05, "loss": 0.163, "step": 120375 }, { "epoch": 1.7266104514426663, "grad_norm": 9.526447296142578, "learning_rate": 1.4148802821389875e-05, "loss": 0.0901, "step": 120400 }, { "epoch": 1.7269689669018526, "grad_norm": 0.1475493311882019, "learning_rate": 1.4144819307824684e-05, "loss": 0.1563, "step": 120425 }, { "epoch": 1.7273274823610394, "grad_norm": 14.284174919128418, "learning_rate": 1.4140835794259492e-05, "loss": 0.165, "step": 120450 }, { "epoch": 1.7276859978202261, "grad_norm": 2.140207290649414, "learning_rate": 1.41368522806943e-05, "loss": 0.0911, "step": 120475 }, { "epoch": 1.7280445132794124, "grad_norm": 5.239156723022461, "learning_rate": 1.413286876712911e-05, "loss": 0.1333, "step": 120500 }, { "epoch": 1.7284030287385992, "grad_norm": 0.2771107256412506, "learning_rate": 1.4128885253563917e-05, "loss": 0.1022, "step": 120525 }, { "epoch": 1.728761544197786, "grad_norm": 6.0129523277282715, "learning_rate": 1.4124901739998727e-05, "loss": 0.1181, "step": 120550 }, { "epoch": 1.7291200596569725, "grad_norm": 4.100048065185547, "learning_rate": 1.4120918226433535e-05, "loss": 0.1006, "step": 120575 }, { "epoch": 1.729478575116159, "grad_norm": 8.62777042388916, "learning_rate": 1.4116934712868343e-05, "loss": 0.1232, "step": 120600 }, { "epoch": 1.7298370905753457, "grad_norm": 10.863117218017578, "learning_rate": 1.411295119930315e-05, "loss": 0.1322, "step": 120625 }, { "epoch": 1.7301956060345323, "grad_norm": 0.3627447187900543, "learning_rate": 1.4108967685737958e-05, "loss": 0.1385, "step": 120650 }, { "epoch": 1.7305541214937188, "grad_norm": 1.634203553199768, "learning_rate": 1.4104984172172768e-05, "loss": 0.1163, "step": 120675 }, { "epoch": 1.7309126369529055, "grad_norm": 1.0254652500152588, "learning_rate": 1.4101000658607576e-05, "loss": 0.1102, "step": 120700 }, { "epoch": 1.731271152412092, "grad_norm": 4.343289852142334, "learning_rate": 1.4097017145042385e-05, "loss": 0.1468, "step": 120725 }, { "epoch": 1.7316296678712786, "grad_norm": 5.627541542053223, "learning_rate": 1.4093033631477193e-05, "loss": 0.0927, "step": 120750 }, { "epoch": 1.7319881833304653, "grad_norm": 10.31214714050293, "learning_rate": 1.4089050117912001e-05, "loss": 0.1554, "step": 120775 }, { "epoch": 1.7323466987896519, "grad_norm": 0.6765010356903076, "learning_rate": 1.408506660434681e-05, "loss": 0.1183, "step": 120800 }, { "epoch": 1.7327052142488384, "grad_norm": 0.012983949854969978, "learning_rate": 1.4081083090781618e-05, "loss": 0.0945, "step": 120825 }, { "epoch": 1.7330637297080251, "grad_norm": 8.213781356811523, "learning_rate": 1.4077099577216428e-05, "loss": 0.0888, "step": 120850 }, { "epoch": 1.7334222451672117, "grad_norm": 0.05928798392415047, "learning_rate": 1.4073116063651236e-05, "loss": 0.1148, "step": 120875 }, { "epoch": 1.7337807606263982, "grad_norm": 1.0013184547424316, "learning_rate": 1.4069132550086044e-05, "loss": 0.0893, "step": 120900 }, { "epoch": 1.734139276085585, "grad_norm": 10.870857238769531, "learning_rate": 1.4065149036520853e-05, "loss": 0.1339, "step": 120925 }, { "epoch": 1.7344977915447715, "grad_norm": 2.4318976402282715, "learning_rate": 1.4061165522955661e-05, "loss": 0.2183, "step": 120950 }, { "epoch": 1.734856307003958, "grad_norm": 0.3520348370075226, "learning_rate": 1.405718200939047e-05, "loss": 0.1434, "step": 120975 }, { "epoch": 1.7352148224631447, "grad_norm": 0.10196730494499207, "learning_rate": 1.4053198495825278e-05, "loss": 0.1065, "step": 121000 }, { "epoch": 1.7355733379223313, "grad_norm": 0.49778616428375244, "learning_rate": 1.4049214982260088e-05, "loss": 0.1248, "step": 121025 }, { "epoch": 1.7359318533815178, "grad_norm": 12.220088958740234, "learning_rate": 1.4045231468694896e-05, "loss": 0.1503, "step": 121050 }, { "epoch": 1.7362903688407045, "grad_norm": 0.9356821775436401, "learning_rate": 1.4041247955129704e-05, "loss": 0.0734, "step": 121075 }, { "epoch": 1.736648884299891, "grad_norm": 2.222078561782837, "learning_rate": 1.4037264441564513e-05, "loss": 0.3049, "step": 121100 }, { "epoch": 1.7370073997590776, "grad_norm": 3.015052318572998, "learning_rate": 1.4033280927999321e-05, "loss": 0.2027, "step": 121125 }, { "epoch": 1.7373659152182643, "grad_norm": 15.967743873596191, "learning_rate": 1.402929741443413e-05, "loss": 0.0825, "step": 121150 }, { "epoch": 1.7377244306774509, "grad_norm": 1.4741636514663696, "learning_rate": 1.4025313900868937e-05, "loss": 0.0869, "step": 121175 }, { "epoch": 1.7380829461366374, "grad_norm": 12.002994537353516, "learning_rate": 1.4021330387303744e-05, "loss": 0.1027, "step": 121200 }, { "epoch": 1.7384414615958241, "grad_norm": 2.675461769104004, "learning_rate": 1.4017346873738554e-05, "loss": 0.109, "step": 121225 }, { "epoch": 1.7387999770550107, "grad_norm": 10.514867782592773, "learning_rate": 1.4013363360173362e-05, "loss": 0.1689, "step": 121250 }, { "epoch": 1.7391584925141972, "grad_norm": 0.38143447041511536, "learning_rate": 1.4009379846608171e-05, "loss": 0.1066, "step": 121275 }, { "epoch": 1.739517007973384, "grad_norm": 5.700894355773926, "learning_rate": 1.400539633304298e-05, "loss": 0.1442, "step": 121300 }, { "epoch": 1.7398755234325705, "grad_norm": 7.592781066894531, "learning_rate": 1.4001412819477789e-05, "loss": 0.1028, "step": 121325 }, { "epoch": 1.740234038891757, "grad_norm": 0.6129941940307617, "learning_rate": 1.3997429305912597e-05, "loss": 0.1357, "step": 121350 }, { "epoch": 1.7405925543509437, "grad_norm": 2.5681028366088867, "learning_rate": 1.3993445792347404e-05, "loss": 0.1673, "step": 121375 }, { "epoch": 1.7409510698101303, "grad_norm": 13.45497989654541, "learning_rate": 1.3989462278782214e-05, "loss": 0.2305, "step": 121400 }, { "epoch": 1.7413095852693168, "grad_norm": 1.9141618013381958, "learning_rate": 1.3985478765217022e-05, "loss": 0.155, "step": 121425 }, { "epoch": 1.7416681007285035, "grad_norm": 6.407559394836426, "learning_rate": 1.3981495251651831e-05, "loss": 0.1763, "step": 121450 }, { "epoch": 1.74202661618769, "grad_norm": 2.079249382019043, "learning_rate": 1.397751173808664e-05, "loss": 0.163, "step": 121475 }, { "epoch": 1.7423851316468766, "grad_norm": 0.0988268256187439, "learning_rate": 1.3973528224521447e-05, "loss": 0.0731, "step": 121500 }, { "epoch": 1.7427436471060633, "grad_norm": 1.042724847793579, "learning_rate": 1.3969544710956257e-05, "loss": 0.1666, "step": 121525 }, { "epoch": 1.7431021625652499, "grad_norm": 18.90333366394043, "learning_rate": 1.3965561197391064e-05, "loss": 0.1565, "step": 121550 }, { "epoch": 1.7434606780244364, "grad_norm": 9.720352172851562, "learning_rate": 1.3961577683825874e-05, "loss": 0.1363, "step": 121575 }, { "epoch": 1.7438191934836231, "grad_norm": 1.4426977634429932, "learning_rate": 1.3957594170260682e-05, "loss": 0.0735, "step": 121600 }, { "epoch": 1.7441777089428097, "grad_norm": 0.5865465998649597, "learning_rate": 1.3953610656695491e-05, "loss": 0.1479, "step": 121625 }, { "epoch": 1.7445362244019962, "grad_norm": 0.27790847420692444, "learning_rate": 1.39496271431303e-05, "loss": 0.129, "step": 121650 }, { "epoch": 1.744894739861183, "grad_norm": 0.39177775382995605, "learning_rate": 1.3945643629565107e-05, "loss": 0.0644, "step": 121675 }, { "epoch": 1.7452532553203695, "grad_norm": 6.817676067352295, "learning_rate": 1.3941660115999917e-05, "loss": 0.0856, "step": 121700 }, { "epoch": 1.745611770779556, "grad_norm": 2.0666916370391846, "learning_rate": 1.3937676602434723e-05, "loss": 0.1966, "step": 121725 }, { "epoch": 1.7459702862387427, "grad_norm": 3.7112205028533936, "learning_rate": 1.3933693088869532e-05, "loss": 0.1251, "step": 121750 }, { "epoch": 1.7463288016979293, "grad_norm": 0.8343656063079834, "learning_rate": 1.392970957530434e-05, "loss": 0.1208, "step": 121775 }, { "epoch": 1.7466873171571158, "grad_norm": 4.721967697143555, "learning_rate": 1.3925726061739148e-05, "loss": 0.1999, "step": 121800 }, { "epoch": 1.7470458326163025, "grad_norm": 1.5447909832000732, "learning_rate": 1.3921742548173958e-05, "loss": 0.1333, "step": 121825 }, { "epoch": 1.747404348075489, "grad_norm": 0.2553898096084595, "learning_rate": 1.3917759034608765e-05, "loss": 0.1349, "step": 121850 }, { "epoch": 1.7477628635346756, "grad_norm": 0.5885089635848999, "learning_rate": 1.3913775521043575e-05, "loss": 0.087, "step": 121875 }, { "epoch": 1.7481213789938623, "grad_norm": 12.46556568145752, "learning_rate": 1.3909792007478383e-05, "loss": 0.1171, "step": 121900 }, { "epoch": 1.7484798944530489, "grad_norm": 6.7160162925720215, "learning_rate": 1.3905808493913192e-05, "loss": 0.1551, "step": 121925 }, { "epoch": 1.7488384099122354, "grad_norm": 3.4894726276397705, "learning_rate": 1.3901824980348e-05, "loss": 0.113, "step": 121950 }, { "epoch": 1.7491969253714221, "grad_norm": 9.361820220947266, "learning_rate": 1.3897841466782808e-05, "loss": 0.1601, "step": 121975 }, { "epoch": 1.7495554408306087, "grad_norm": 3.350571632385254, "learning_rate": 1.3893857953217617e-05, "loss": 0.0973, "step": 122000 }, { "epoch": 1.7499139562897952, "grad_norm": 0.6006773710250854, "learning_rate": 1.3889874439652425e-05, "loss": 0.1384, "step": 122025 }, { "epoch": 1.750272471748982, "grad_norm": 7.332768440246582, "learning_rate": 1.3885890926087235e-05, "loss": 0.1214, "step": 122050 }, { "epoch": 1.7506309872081685, "grad_norm": 14.81093692779541, "learning_rate": 1.3881907412522043e-05, "loss": 0.2269, "step": 122075 }, { "epoch": 1.750989502667355, "grad_norm": 1.7317826747894287, "learning_rate": 1.387792389895685e-05, "loss": 0.1222, "step": 122100 }, { "epoch": 1.7513480181265417, "grad_norm": 2.28419828414917, "learning_rate": 1.387394038539166e-05, "loss": 0.0926, "step": 122125 }, { "epoch": 1.7517065335857283, "grad_norm": 5.70322847366333, "learning_rate": 1.3869956871826468e-05, "loss": 0.1785, "step": 122150 }, { "epoch": 1.7520650490449148, "grad_norm": 7.2692084312438965, "learning_rate": 1.3865973358261277e-05, "loss": 0.1573, "step": 122175 }, { "epoch": 1.7524235645041015, "grad_norm": 2.092825412750244, "learning_rate": 1.3861989844696085e-05, "loss": 0.1176, "step": 122200 }, { "epoch": 1.752782079963288, "grad_norm": 0.6296495795249939, "learning_rate": 1.3858006331130895e-05, "loss": 0.1775, "step": 122225 }, { "epoch": 1.7531405954224746, "grad_norm": 10.802103996276855, "learning_rate": 1.3854022817565703e-05, "loss": 0.0832, "step": 122250 }, { "epoch": 1.7534991108816613, "grad_norm": 2.009779691696167, "learning_rate": 1.3850039304000509e-05, "loss": 0.1271, "step": 122275 }, { "epoch": 1.7538576263408479, "grad_norm": 13.808206558227539, "learning_rate": 1.3846055790435318e-05, "loss": 0.162, "step": 122300 }, { "epoch": 1.7542161418000344, "grad_norm": 1.6211432218551636, "learning_rate": 1.3842072276870126e-05, "loss": 0.1606, "step": 122325 }, { "epoch": 1.7545746572592211, "grad_norm": 0.9016832113265991, "learning_rate": 1.3838088763304936e-05, "loss": 0.1603, "step": 122350 }, { "epoch": 1.7549331727184077, "grad_norm": 0.018813075497746468, "learning_rate": 1.3834105249739744e-05, "loss": 0.1257, "step": 122375 }, { "epoch": 1.7552916881775942, "grad_norm": 12.24410343170166, "learning_rate": 1.3830121736174551e-05, "loss": 0.1424, "step": 122400 }, { "epoch": 1.755650203636781, "grad_norm": 3.828657865524292, "learning_rate": 1.3826138222609361e-05, "loss": 0.181, "step": 122425 }, { "epoch": 1.7560087190959675, "grad_norm": 4.451027870178223, "learning_rate": 1.3822154709044169e-05, "loss": 0.103, "step": 122450 }, { "epoch": 1.756367234555154, "grad_norm": 4.308757781982422, "learning_rate": 1.3818171195478978e-05, "loss": 0.133, "step": 122475 }, { "epoch": 1.7567257500143407, "grad_norm": 7.662469387054443, "learning_rate": 1.3814187681913786e-05, "loss": 0.2211, "step": 122500 }, { "epoch": 1.7570842654735273, "grad_norm": 11.64238166809082, "learning_rate": 1.3810204168348596e-05, "loss": 0.1383, "step": 122525 }, { "epoch": 1.7574427809327138, "grad_norm": 0.17840655148029327, "learning_rate": 1.3806220654783404e-05, "loss": 0.0721, "step": 122550 }, { "epoch": 1.7578012963919005, "grad_norm": 4.366082191467285, "learning_rate": 1.3802237141218211e-05, "loss": 0.1827, "step": 122575 }, { "epoch": 1.758159811851087, "grad_norm": 3.1923928260803223, "learning_rate": 1.3798253627653021e-05, "loss": 0.1356, "step": 122600 }, { "epoch": 1.7585183273102736, "grad_norm": 2.507490634918213, "learning_rate": 1.3794270114087829e-05, "loss": 0.0789, "step": 122625 }, { "epoch": 1.7588768427694603, "grad_norm": 16.63212013244629, "learning_rate": 1.3790286600522638e-05, "loss": 0.2095, "step": 122650 }, { "epoch": 1.7592353582286469, "grad_norm": 0.868859589099884, "learning_rate": 1.3786303086957446e-05, "loss": 0.1818, "step": 122675 }, { "epoch": 1.7595938736878334, "grad_norm": 9.063098907470703, "learning_rate": 1.3782319573392254e-05, "loss": 0.093, "step": 122700 }, { "epoch": 1.7599523891470201, "grad_norm": 10.543604850769043, "learning_rate": 1.3778336059827064e-05, "loss": 0.1545, "step": 122725 }, { "epoch": 1.7603109046062067, "grad_norm": 0.021205252036452293, "learning_rate": 1.3774352546261871e-05, "loss": 0.1197, "step": 122750 }, { "epoch": 1.7606694200653932, "grad_norm": 1.14142906665802, "learning_rate": 1.3770369032696681e-05, "loss": 0.1528, "step": 122775 }, { "epoch": 1.76102793552458, "grad_norm": 10.090231895446777, "learning_rate": 1.3766385519131489e-05, "loss": 0.1388, "step": 122800 }, { "epoch": 1.7613864509837664, "grad_norm": 3.2104272842407227, "learning_rate": 1.3762402005566297e-05, "loss": 0.0912, "step": 122825 }, { "epoch": 1.761744966442953, "grad_norm": 5.445137977600098, "learning_rate": 1.3758418492001105e-05, "loss": 0.1869, "step": 122850 }, { "epoch": 1.7621034819021397, "grad_norm": 0.0667322650551796, "learning_rate": 1.3754434978435912e-05, "loss": 0.095, "step": 122875 }, { "epoch": 1.7624619973613262, "grad_norm": 0.9603590965270996, "learning_rate": 1.3750451464870722e-05, "loss": 0.1144, "step": 122900 }, { "epoch": 1.7628205128205128, "grad_norm": 0.733441174030304, "learning_rate": 1.374646795130553e-05, "loss": 0.1034, "step": 122925 }, { "epoch": 1.7631790282796995, "grad_norm": 0.27612248063087463, "learning_rate": 1.374248443774034e-05, "loss": 0.2103, "step": 122950 }, { "epoch": 1.763537543738886, "grad_norm": 2.8174378871917725, "learning_rate": 1.3738500924175147e-05, "loss": 0.1262, "step": 122975 }, { "epoch": 1.7638960591980726, "grad_norm": 9.319546699523926, "learning_rate": 1.3734517410609955e-05, "loss": 0.1029, "step": 123000 }, { "epoch": 1.7642545746572593, "grad_norm": 6.012783527374268, "learning_rate": 1.3730533897044765e-05, "loss": 0.11, "step": 123025 }, { "epoch": 1.7646130901164458, "grad_norm": 0.4339587390422821, "learning_rate": 1.3726550383479572e-05, "loss": 0.1469, "step": 123050 }, { "epoch": 1.7649716055756324, "grad_norm": 7.489436149597168, "learning_rate": 1.3722566869914382e-05, "loss": 0.073, "step": 123075 }, { "epoch": 1.7653301210348191, "grad_norm": 0.535664975643158, "learning_rate": 1.371858335634919e-05, "loss": 0.22, "step": 123100 }, { "epoch": 1.7656886364940056, "grad_norm": 9.233174324035645, "learning_rate": 1.3714599842783998e-05, "loss": 0.1077, "step": 123125 }, { "epoch": 1.7660471519531922, "grad_norm": 4.81092643737793, "learning_rate": 1.3710616329218807e-05, "loss": 0.0678, "step": 123150 }, { "epoch": 1.766405667412379, "grad_norm": 4.8780035972595215, "learning_rate": 1.3706632815653615e-05, "loss": 0.1619, "step": 123175 }, { "epoch": 1.7667641828715654, "grad_norm": 6.27357292175293, "learning_rate": 1.3702649302088425e-05, "loss": 0.1355, "step": 123200 }, { "epoch": 1.767122698330752, "grad_norm": 3.0810062885284424, "learning_rate": 1.3698665788523232e-05, "loss": 0.2274, "step": 123225 }, { "epoch": 1.7674812137899387, "grad_norm": 2.8556463718414307, "learning_rate": 1.3694682274958042e-05, "loss": 0.1127, "step": 123250 }, { "epoch": 1.7678397292491252, "grad_norm": 3.6677567958831787, "learning_rate": 1.369069876139285e-05, "loss": 0.097, "step": 123275 }, { "epoch": 1.7681982447083118, "grad_norm": 11.944038391113281, "learning_rate": 1.3686715247827658e-05, "loss": 0.1215, "step": 123300 }, { "epoch": 1.7685567601674985, "grad_norm": 15.359697341918945, "learning_rate": 1.3682731734262467e-05, "loss": 0.1274, "step": 123325 }, { "epoch": 1.768915275626685, "grad_norm": 7.724093914031982, "learning_rate": 1.3678748220697275e-05, "loss": 0.2154, "step": 123350 }, { "epoch": 1.7692737910858716, "grad_norm": 4.021061897277832, "learning_rate": 1.3674764707132083e-05, "loss": 0.1118, "step": 123375 }, { "epoch": 1.7696323065450583, "grad_norm": 19.69768524169922, "learning_rate": 1.367078119356689e-05, "loss": 0.1943, "step": 123400 }, { "epoch": 1.7699908220042448, "grad_norm": 4.703602313995361, "learning_rate": 1.3666797680001699e-05, "loss": 0.1781, "step": 123425 }, { "epoch": 1.7703493374634314, "grad_norm": 7.692932605743408, "learning_rate": 1.3662814166436508e-05, "loss": 0.1336, "step": 123450 }, { "epoch": 1.7707078529226181, "grad_norm": 4.8277587890625, "learning_rate": 1.3658830652871316e-05, "loss": 0.0883, "step": 123475 }, { "epoch": 1.7710663683818046, "grad_norm": 3.9356892108917236, "learning_rate": 1.3654847139306125e-05, "loss": 0.1722, "step": 123500 }, { "epoch": 1.7714248838409912, "grad_norm": 0.8080458045005798, "learning_rate": 1.3650863625740933e-05, "loss": 0.0682, "step": 123525 }, { "epoch": 1.771783399300178, "grad_norm": 1.5514551401138306, "learning_rate": 1.3646880112175743e-05, "loss": 0.1298, "step": 123550 }, { "epoch": 1.7721419147593644, "grad_norm": 2.5186080932617188, "learning_rate": 1.364289659861055e-05, "loss": 0.076, "step": 123575 }, { "epoch": 1.772500430218551, "grad_norm": 13.783672332763672, "learning_rate": 1.3638913085045359e-05, "loss": 0.0974, "step": 123600 }, { "epoch": 1.7728589456777377, "grad_norm": 1.8256595134735107, "learning_rate": 1.3634929571480168e-05, "loss": 0.1202, "step": 123625 }, { "epoch": 1.7732174611369242, "grad_norm": 1.8103218078613281, "learning_rate": 1.3630946057914976e-05, "loss": 0.14, "step": 123650 }, { "epoch": 1.7735759765961108, "grad_norm": 0.08546725660562515, "learning_rate": 1.3626962544349785e-05, "loss": 0.1153, "step": 123675 }, { "epoch": 1.7739344920552975, "grad_norm": 0.7922612428665161, "learning_rate": 1.3622979030784593e-05, "loss": 0.063, "step": 123700 }, { "epoch": 1.774293007514484, "grad_norm": 0.1970289945602417, "learning_rate": 1.3618995517219401e-05, "loss": 0.1652, "step": 123725 }, { "epoch": 1.7746515229736706, "grad_norm": 0.7130560874938965, "learning_rate": 1.361501200365421e-05, "loss": 0.0899, "step": 123750 }, { "epoch": 1.7750100384328573, "grad_norm": 0.9481215476989746, "learning_rate": 1.3611028490089019e-05, "loss": 0.1492, "step": 123775 }, { "epoch": 1.7753685538920438, "grad_norm": 16.282991409301758, "learning_rate": 1.3607044976523828e-05, "loss": 0.2256, "step": 123800 }, { "epoch": 1.7757270693512304, "grad_norm": 16.01458740234375, "learning_rate": 1.3603061462958636e-05, "loss": 0.2069, "step": 123825 }, { "epoch": 1.7760855848104171, "grad_norm": 0.3684370517730713, "learning_rate": 1.3599077949393445e-05, "loss": 0.0791, "step": 123850 }, { "epoch": 1.7764441002696036, "grad_norm": 1.3529013395309448, "learning_rate": 1.3595094435828253e-05, "loss": 0.1354, "step": 123875 }, { "epoch": 1.7768026157287902, "grad_norm": 12.221447944641113, "learning_rate": 1.3591110922263061e-05, "loss": 0.0854, "step": 123900 }, { "epoch": 1.777161131187977, "grad_norm": 4.894373893737793, "learning_rate": 1.3587127408697869e-05, "loss": 0.1147, "step": 123925 }, { "epoch": 1.7775196466471634, "grad_norm": 0.3249936103820801, "learning_rate": 1.3583143895132677e-05, "loss": 0.0853, "step": 123950 }, { "epoch": 1.77787816210635, "grad_norm": 0.9122349619865417, "learning_rate": 1.3579160381567486e-05, "loss": 0.1031, "step": 123975 }, { "epoch": 1.7782366775655367, "grad_norm": 0.890447735786438, "learning_rate": 1.3575176868002294e-05, "loss": 0.0662, "step": 124000 }, { "epoch": 1.7785951930247232, "grad_norm": 1.034277319908142, "learning_rate": 1.3571193354437102e-05, "loss": 0.1042, "step": 124025 }, { "epoch": 1.7789537084839098, "grad_norm": 1.7554419040679932, "learning_rate": 1.3567209840871912e-05, "loss": 0.0867, "step": 124050 }, { "epoch": 1.7793122239430965, "grad_norm": 6.6337809562683105, "learning_rate": 1.356322632730672e-05, "loss": 0.2688, "step": 124075 }, { "epoch": 1.779670739402283, "grad_norm": 0.21396438777446747, "learning_rate": 1.3559242813741529e-05, "loss": 0.1218, "step": 124100 }, { "epoch": 1.7800292548614696, "grad_norm": 6.195936679840088, "learning_rate": 1.3555259300176337e-05, "loss": 0.06, "step": 124125 }, { "epoch": 1.7803877703206563, "grad_norm": 1.660398006439209, "learning_rate": 1.3551275786611146e-05, "loss": 0.181, "step": 124150 }, { "epoch": 1.7807462857798428, "grad_norm": 1.9771385192871094, "learning_rate": 1.3547292273045954e-05, "loss": 0.1402, "step": 124175 }, { "epoch": 1.7811048012390294, "grad_norm": 20.67163848876953, "learning_rate": 1.3543308759480762e-05, "loss": 0.0935, "step": 124200 }, { "epoch": 1.7814633166982161, "grad_norm": 0.23796698451042175, "learning_rate": 1.3539325245915572e-05, "loss": 0.0836, "step": 124225 }, { "epoch": 1.7818218321574026, "grad_norm": 0.11065880209207535, "learning_rate": 1.353534173235038e-05, "loss": 0.258, "step": 124250 }, { "epoch": 1.7821803476165892, "grad_norm": 8.12260627746582, "learning_rate": 1.3531358218785189e-05, "loss": 0.1907, "step": 124275 }, { "epoch": 1.782538863075776, "grad_norm": 8.571059226989746, "learning_rate": 1.3527374705219997e-05, "loss": 0.0768, "step": 124300 }, { "epoch": 1.7828973785349624, "grad_norm": 13.862401962280273, "learning_rate": 1.3523391191654805e-05, "loss": 0.2086, "step": 124325 }, { "epoch": 1.783255893994149, "grad_norm": 0.8569445013999939, "learning_rate": 1.3519407678089614e-05, "loss": 0.1233, "step": 124350 }, { "epoch": 1.7836144094533357, "grad_norm": 1.7898008823394775, "learning_rate": 1.3515424164524422e-05, "loss": 0.1805, "step": 124375 }, { "epoch": 1.7839729249125222, "grad_norm": 18.97199821472168, "learning_rate": 1.3511440650959232e-05, "loss": 0.2238, "step": 124400 }, { "epoch": 1.7843314403717088, "grad_norm": 18.04740333557129, "learning_rate": 1.350745713739404e-05, "loss": 0.1837, "step": 124425 }, { "epoch": 1.7846899558308955, "grad_norm": 0.20073416829109192, "learning_rate": 1.3503473623828847e-05, "loss": 0.0511, "step": 124450 }, { "epoch": 1.785048471290082, "grad_norm": 1.0293183326721191, "learning_rate": 1.3499490110263655e-05, "loss": 0.1265, "step": 124475 }, { "epoch": 1.7854069867492686, "grad_norm": 0.1071246862411499, "learning_rate": 1.3495506596698463e-05, "loss": 0.169, "step": 124500 }, { "epoch": 1.7857655022084553, "grad_norm": 1.7052161693572998, "learning_rate": 1.3491523083133272e-05, "loss": 0.0823, "step": 124525 }, { "epoch": 1.7861240176676418, "grad_norm": 13.036317825317383, "learning_rate": 1.348753956956808e-05, "loss": 0.2081, "step": 124550 }, { "epoch": 1.7864825331268284, "grad_norm": 0.9017021656036377, "learning_rate": 1.348355605600289e-05, "loss": 0.1074, "step": 124575 }, { "epoch": 1.7868410485860151, "grad_norm": 6.537775039672852, "learning_rate": 1.3479572542437698e-05, "loss": 0.0926, "step": 124600 }, { "epoch": 1.7871995640452016, "grad_norm": 1.5341160297393799, "learning_rate": 1.3475589028872506e-05, "loss": 0.1235, "step": 124625 }, { "epoch": 1.7875580795043882, "grad_norm": 0.6230726838111877, "learning_rate": 1.3471605515307315e-05, "loss": 0.2033, "step": 124650 }, { "epoch": 1.787916594963575, "grad_norm": 10.424033164978027, "learning_rate": 1.3467622001742123e-05, "loss": 0.1933, "step": 124675 }, { "epoch": 1.7882751104227614, "grad_norm": 1.0340670347213745, "learning_rate": 1.3463638488176932e-05, "loss": 0.132, "step": 124700 }, { "epoch": 1.788633625881948, "grad_norm": 3.1397552490234375, "learning_rate": 1.345965497461174e-05, "loss": 0.074, "step": 124725 }, { "epoch": 1.7889921413411347, "grad_norm": 16.413612365722656, "learning_rate": 1.345567146104655e-05, "loss": 0.1072, "step": 124750 }, { "epoch": 1.7893506568003212, "grad_norm": 1.6665478944778442, "learning_rate": 1.3451687947481358e-05, "loss": 0.0818, "step": 124775 }, { "epoch": 1.7897091722595078, "grad_norm": 18.048328399658203, "learning_rate": 1.3447704433916166e-05, "loss": 0.137, "step": 124800 }, { "epoch": 1.7900676877186945, "grad_norm": 2.119239568710327, "learning_rate": 1.3443720920350975e-05, "loss": 0.1471, "step": 124825 }, { "epoch": 1.790426203177881, "grad_norm": 0.7396166324615479, "learning_rate": 1.3439737406785783e-05, "loss": 0.128, "step": 124850 }, { "epoch": 1.7907847186370676, "grad_norm": 1.4241050481796265, "learning_rate": 1.3435753893220592e-05, "loss": 0.0736, "step": 124875 }, { "epoch": 1.7911432340962543, "grad_norm": 10.991413116455078, "learning_rate": 1.34317703796554e-05, "loss": 0.114, "step": 124900 }, { "epoch": 1.7915017495554408, "grad_norm": 5.7105231285095215, "learning_rate": 1.3427786866090208e-05, "loss": 0.1215, "step": 124925 }, { "epoch": 1.7918602650146274, "grad_norm": 10.217679023742676, "learning_rate": 1.3423803352525018e-05, "loss": 0.1251, "step": 124950 }, { "epoch": 1.792218780473814, "grad_norm": 0.18925032019615173, "learning_rate": 1.3419819838959826e-05, "loss": 0.1102, "step": 124975 }, { "epoch": 1.7925772959330006, "grad_norm": 10.203205108642578, "learning_rate": 1.3415836325394633e-05, "loss": 0.131, "step": 125000 }, { "epoch": 1.7929358113921872, "grad_norm": 1.0978126525878906, "learning_rate": 1.3411852811829441e-05, "loss": 0.137, "step": 125025 }, { "epoch": 1.793294326851374, "grad_norm": 0.12662667036056519, "learning_rate": 1.340786929826425e-05, "loss": 0.0903, "step": 125050 }, { "epoch": 1.7936528423105604, "grad_norm": 0.5743400454521179, "learning_rate": 1.3403885784699059e-05, "loss": 0.0944, "step": 125075 }, { "epoch": 1.794011357769747, "grad_norm": 0.9712875485420227, "learning_rate": 1.3399902271133866e-05, "loss": 0.1429, "step": 125100 }, { "epoch": 1.7943698732289337, "grad_norm": 12.675986289978027, "learning_rate": 1.3395918757568676e-05, "loss": 0.1339, "step": 125125 }, { "epoch": 1.7947283886881202, "grad_norm": 2.8036227226257324, "learning_rate": 1.3391935244003484e-05, "loss": 0.1364, "step": 125150 }, { "epoch": 1.7950869041473068, "grad_norm": 1.1380717754364014, "learning_rate": 1.3387951730438293e-05, "loss": 0.1243, "step": 125175 }, { "epoch": 1.7954454196064935, "grad_norm": 0.3605251908302307, "learning_rate": 1.3383968216873101e-05, "loss": 0.1202, "step": 125200 }, { "epoch": 1.79580393506568, "grad_norm": 4.870563983917236, "learning_rate": 1.3379984703307909e-05, "loss": 0.1209, "step": 125225 }, { "epoch": 1.7961624505248666, "grad_norm": 7.321747303009033, "learning_rate": 1.3376001189742719e-05, "loss": 0.1345, "step": 125250 }, { "epoch": 1.7965209659840533, "grad_norm": 0.5484129786491394, "learning_rate": 1.3372017676177526e-05, "loss": 0.1086, "step": 125275 }, { "epoch": 1.7968794814432398, "grad_norm": 0.3737850487232208, "learning_rate": 1.3368034162612336e-05, "loss": 0.1024, "step": 125300 }, { "epoch": 1.7972379969024264, "grad_norm": 4.404728889465332, "learning_rate": 1.3364050649047144e-05, "loss": 0.1032, "step": 125325 }, { "epoch": 1.797596512361613, "grad_norm": 5.9225077629089355, "learning_rate": 1.3360067135481953e-05, "loss": 0.138, "step": 125350 }, { "epoch": 1.7979550278207996, "grad_norm": 0.9579043388366699, "learning_rate": 1.3356083621916761e-05, "loss": 0.076, "step": 125375 }, { "epoch": 1.7983135432799862, "grad_norm": 3.705211639404297, "learning_rate": 1.3352100108351569e-05, "loss": 0.0913, "step": 125400 }, { "epoch": 1.798672058739173, "grad_norm": 8.747072219848633, "learning_rate": 1.3348116594786379e-05, "loss": 0.1562, "step": 125425 }, { "epoch": 1.7990305741983594, "grad_norm": 1.4731723070144653, "learning_rate": 1.3344133081221186e-05, "loss": 0.1054, "step": 125450 }, { "epoch": 1.799389089657546, "grad_norm": 1.3337739706039429, "learning_rate": 1.3340149567655996e-05, "loss": 0.1448, "step": 125475 }, { "epoch": 1.7997476051167327, "grad_norm": 4.543097019195557, "learning_rate": 1.3336166054090804e-05, "loss": 0.0896, "step": 125500 }, { "epoch": 1.8001061205759192, "grad_norm": 3.3141427040100098, "learning_rate": 1.3332182540525612e-05, "loss": 0.1253, "step": 125525 }, { "epoch": 1.8004646360351058, "grad_norm": 1.061430811882019, "learning_rate": 1.332819902696042e-05, "loss": 0.0715, "step": 125550 }, { "epoch": 1.8008231514942925, "grad_norm": 6.5181379318237305, "learning_rate": 1.3324215513395227e-05, "loss": 0.115, "step": 125575 }, { "epoch": 1.801181666953479, "grad_norm": 2.3205456733703613, "learning_rate": 1.3320231999830037e-05, "loss": 0.0897, "step": 125600 }, { "epoch": 1.8015401824126656, "grad_norm": 0.25393199920654297, "learning_rate": 1.3316248486264845e-05, "loss": 0.1512, "step": 125625 }, { "epoch": 1.8018986978718523, "grad_norm": 0.5097155570983887, "learning_rate": 1.3312264972699654e-05, "loss": 0.1047, "step": 125650 }, { "epoch": 1.8022572133310388, "grad_norm": 15.429966926574707, "learning_rate": 1.3308281459134462e-05, "loss": 0.1753, "step": 125675 }, { "epoch": 1.8026157287902254, "grad_norm": 13.677457809448242, "learning_rate": 1.330429794556927e-05, "loss": 0.1934, "step": 125700 }, { "epoch": 1.802974244249412, "grad_norm": 2.8582394123077393, "learning_rate": 1.330031443200408e-05, "loss": 0.1839, "step": 125725 }, { "epoch": 1.8033327597085986, "grad_norm": 3.7202303409576416, "learning_rate": 1.3296330918438887e-05, "loss": 0.1355, "step": 125750 }, { "epoch": 1.8036912751677852, "grad_norm": 8.502874374389648, "learning_rate": 1.3292347404873697e-05, "loss": 0.1311, "step": 125775 }, { "epoch": 1.804049790626972, "grad_norm": 0.15854193270206451, "learning_rate": 1.3288363891308505e-05, "loss": 0.1048, "step": 125800 }, { "epoch": 1.8044083060861584, "grad_norm": 1.97597074508667, "learning_rate": 1.3284380377743313e-05, "loss": 0.1084, "step": 125825 }, { "epoch": 1.804766821545345, "grad_norm": 0.40250757336616516, "learning_rate": 1.3280396864178122e-05, "loss": 0.2162, "step": 125850 }, { "epoch": 1.8051253370045317, "grad_norm": 0.13947440683841705, "learning_rate": 1.327641335061293e-05, "loss": 0.1523, "step": 125875 }, { "epoch": 1.8054838524637182, "grad_norm": 1.2982784509658813, "learning_rate": 1.327242983704774e-05, "loss": 0.1206, "step": 125900 }, { "epoch": 1.8058423679229048, "grad_norm": 0.6249599456787109, "learning_rate": 1.3268446323482547e-05, "loss": 0.1021, "step": 125925 }, { "epoch": 1.8062008833820915, "grad_norm": 10.465824127197266, "learning_rate": 1.3264462809917357e-05, "loss": 0.1201, "step": 125950 }, { "epoch": 1.806559398841278, "grad_norm": 14.421382904052734, "learning_rate": 1.3260479296352165e-05, "loss": 0.1732, "step": 125975 }, { "epoch": 1.8069179143004646, "grad_norm": 18.150310516357422, "learning_rate": 1.3256495782786973e-05, "loss": 0.1731, "step": 126000 }, { "epoch": 1.8072764297596513, "grad_norm": 3.168715000152588, "learning_rate": 1.3252512269221782e-05, "loss": 0.1681, "step": 126025 }, { "epoch": 1.8076349452188378, "grad_norm": 0.33193910121917725, "learning_rate": 1.324852875565659e-05, "loss": 0.0907, "step": 126050 }, { "epoch": 1.8079934606780244, "grad_norm": 0.3616201877593994, "learning_rate": 1.32445452420914e-05, "loss": 0.0982, "step": 126075 }, { "epoch": 1.808351976137211, "grad_norm": 1.5765258073806763, "learning_rate": 1.3240561728526206e-05, "loss": 0.1343, "step": 126100 }, { "epoch": 1.8087104915963976, "grad_norm": 0.25179702043533325, "learning_rate": 1.3236578214961014e-05, "loss": 0.0592, "step": 126125 }, { "epoch": 1.8090690070555842, "grad_norm": 11.668144226074219, "learning_rate": 1.3232594701395823e-05, "loss": 0.1039, "step": 126150 }, { "epoch": 1.809427522514771, "grad_norm": 4.182100772857666, "learning_rate": 1.3228611187830631e-05, "loss": 0.1227, "step": 126175 }, { "epoch": 1.8097860379739574, "grad_norm": 13.951647758483887, "learning_rate": 1.322462767426544e-05, "loss": 0.0998, "step": 126200 }, { "epoch": 1.810144553433144, "grad_norm": 0.33486780524253845, "learning_rate": 1.3220644160700248e-05, "loss": 0.0851, "step": 126225 }, { "epoch": 1.8105030688923307, "grad_norm": 0.292946994304657, "learning_rate": 1.3216660647135058e-05, "loss": 0.104, "step": 126250 }, { "epoch": 1.8108615843515172, "grad_norm": 1.2337979078292847, "learning_rate": 1.3212677133569866e-05, "loss": 0.1702, "step": 126275 }, { "epoch": 1.8112200998107038, "grad_norm": 0.6541158556938171, "learning_rate": 1.3208693620004673e-05, "loss": 0.157, "step": 126300 }, { "epoch": 1.8115786152698905, "grad_norm": 2.6006720066070557, "learning_rate": 1.3204710106439483e-05, "loss": 0.1697, "step": 126325 }, { "epoch": 1.811937130729077, "grad_norm": 11.386242866516113, "learning_rate": 1.3200726592874291e-05, "loss": 0.2188, "step": 126350 }, { "epoch": 1.8122956461882636, "grad_norm": 14.187195777893066, "learning_rate": 1.31967430793091e-05, "loss": 0.0749, "step": 126375 }, { "epoch": 1.8126541616474503, "grad_norm": 2.4232065677642822, "learning_rate": 1.3192759565743908e-05, "loss": 0.1441, "step": 126400 }, { "epoch": 1.8130126771066368, "grad_norm": 0.11356448382139206, "learning_rate": 1.3188776052178716e-05, "loss": 0.2168, "step": 126425 }, { "epoch": 1.8133711925658234, "grad_norm": 0.6749463677406311, "learning_rate": 1.3184792538613526e-05, "loss": 0.0667, "step": 126450 }, { "epoch": 1.81372970802501, "grad_norm": 1.7438596487045288, "learning_rate": 1.3180809025048333e-05, "loss": 0.1975, "step": 126475 }, { "epoch": 1.8140882234841966, "grad_norm": 0.24376042187213898, "learning_rate": 1.3176825511483143e-05, "loss": 0.1942, "step": 126500 }, { "epoch": 1.8144467389433832, "grad_norm": 0.45966672897338867, "learning_rate": 1.3172841997917951e-05, "loss": 0.1457, "step": 126525 }, { "epoch": 1.81480525440257, "grad_norm": 3.6734232902526855, "learning_rate": 1.316885848435276e-05, "loss": 0.1343, "step": 126550 }, { "epoch": 1.8151637698617564, "grad_norm": 9.56579875946045, "learning_rate": 1.3164874970787568e-05, "loss": 0.1068, "step": 126575 }, { "epoch": 1.815522285320943, "grad_norm": 1.1061989068984985, "learning_rate": 1.3160891457222376e-05, "loss": 0.1793, "step": 126600 }, { "epoch": 1.8158808007801297, "grad_norm": 9.717350959777832, "learning_rate": 1.3156907943657186e-05, "loss": 0.0848, "step": 126625 }, { "epoch": 1.8162393162393162, "grad_norm": 0.1468745619058609, "learning_rate": 1.3152924430091992e-05, "loss": 0.0879, "step": 126650 }, { "epoch": 1.8165978316985028, "grad_norm": 0.48983994126319885, "learning_rate": 1.3148940916526801e-05, "loss": 0.1085, "step": 126675 }, { "epoch": 1.8169563471576895, "grad_norm": 12.991584777832031, "learning_rate": 1.314495740296161e-05, "loss": 0.1381, "step": 126700 }, { "epoch": 1.817314862616876, "grad_norm": 1.5837372541427612, "learning_rate": 1.3140973889396417e-05, "loss": 0.2079, "step": 126725 }, { "epoch": 1.8176733780760626, "grad_norm": 1.0531651973724365, "learning_rate": 1.3136990375831227e-05, "loss": 0.1874, "step": 126750 }, { "epoch": 1.8180318935352493, "grad_norm": 2.34472393989563, "learning_rate": 1.3133006862266034e-05, "loss": 0.0834, "step": 126775 }, { "epoch": 1.8183904089944358, "grad_norm": 1.7699116468429565, "learning_rate": 1.3129023348700844e-05, "loss": 0.1882, "step": 126800 }, { "epoch": 1.8187489244536224, "grad_norm": 2.019683361053467, "learning_rate": 1.3125039835135652e-05, "loss": 0.1119, "step": 126825 }, { "epoch": 1.819107439912809, "grad_norm": 3.9139151573181152, "learning_rate": 1.3121056321570461e-05, "loss": 0.1356, "step": 126850 }, { "epoch": 1.8194659553719956, "grad_norm": 13.603676795959473, "learning_rate": 1.311707280800527e-05, "loss": 0.2198, "step": 126875 }, { "epoch": 1.8198244708311822, "grad_norm": 21.65889549255371, "learning_rate": 1.3113089294440077e-05, "loss": 0.1477, "step": 126900 }, { "epoch": 1.820182986290369, "grad_norm": 9.704753875732422, "learning_rate": 1.3109105780874887e-05, "loss": 0.1218, "step": 126925 }, { "epoch": 1.8205415017495554, "grad_norm": 1.7135446071624756, "learning_rate": 1.3105122267309694e-05, "loss": 0.0978, "step": 126950 }, { "epoch": 1.820900017208742, "grad_norm": 0.11804691702127457, "learning_rate": 1.3101138753744504e-05, "loss": 0.2104, "step": 126975 }, { "epoch": 1.8212585326679287, "grad_norm": 0.10410609841346741, "learning_rate": 1.3097155240179312e-05, "loss": 0.1966, "step": 127000 }, { "epoch": 1.8216170481271152, "grad_norm": 19.503128051757812, "learning_rate": 1.309317172661412e-05, "loss": 0.1301, "step": 127025 }, { "epoch": 1.8219755635863017, "grad_norm": 4.761789321899414, "learning_rate": 1.308918821304893e-05, "loss": 0.1246, "step": 127050 }, { "epoch": 1.8223340790454885, "grad_norm": 0.4719056487083435, "learning_rate": 1.3085204699483737e-05, "loss": 0.2113, "step": 127075 }, { "epoch": 1.822692594504675, "grad_norm": 3.312464714050293, "learning_rate": 1.3081221185918547e-05, "loss": 0.1104, "step": 127100 }, { "epoch": 1.8230511099638615, "grad_norm": 12.272801399230957, "learning_rate": 1.3077237672353354e-05, "loss": 0.1646, "step": 127125 }, { "epoch": 1.8234096254230483, "grad_norm": 9.831446647644043, "learning_rate": 1.3073254158788164e-05, "loss": 0.0707, "step": 127150 }, { "epoch": 1.8237681408822348, "grad_norm": 11.922207832336426, "learning_rate": 1.3069270645222972e-05, "loss": 0.0955, "step": 127175 }, { "epoch": 1.8241266563414213, "grad_norm": 1.173812985420227, "learning_rate": 1.3065287131657778e-05, "loss": 0.1946, "step": 127200 }, { "epoch": 1.824485171800608, "grad_norm": 8.439505577087402, "learning_rate": 1.3061303618092587e-05, "loss": 0.1475, "step": 127225 }, { "epoch": 1.8248436872597946, "grad_norm": 16.337324142456055, "learning_rate": 1.3057320104527395e-05, "loss": 0.1873, "step": 127250 }, { "epoch": 1.8252022027189811, "grad_norm": 0.43555548787117004, "learning_rate": 1.3053336590962205e-05, "loss": 0.1617, "step": 127275 }, { "epoch": 1.825560718178168, "grad_norm": 0.19149766862392426, "learning_rate": 1.3049353077397013e-05, "loss": 0.1921, "step": 127300 }, { "epoch": 1.8259192336373544, "grad_norm": 0.3396454453468323, "learning_rate": 1.304536956383182e-05, "loss": 0.1385, "step": 127325 }, { "epoch": 1.826277749096541, "grad_norm": 0.19175931811332703, "learning_rate": 1.304138605026663e-05, "loss": 0.1075, "step": 127350 }, { "epoch": 1.8266362645557277, "grad_norm": 11.364167213439941, "learning_rate": 1.3037402536701438e-05, "loss": 0.0822, "step": 127375 }, { "epoch": 1.8269947800149142, "grad_norm": 12.908061981201172, "learning_rate": 1.3033419023136247e-05, "loss": 0.1201, "step": 127400 }, { "epoch": 1.8273532954741007, "grad_norm": 1.3940352201461792, "learning_rate": 1.3029435509571055e-05, "loss": 0.1571, "step": 127425 }, { "epoch": 1.8277118109332875, "grad_norm": 1.384729027748108, "learning_rate": 1.3025451996005865e-05, "loss": 0.068, "step": 127450 }, { "epoch": 1.828070326392474, "grad_norm": 10.999277114868164, "learning_rate": 1.3021468482440673e-05, "loss": 0.2309, "step": 127475 }, { "epoch": 1.8284288418516605, "grad_norm": 6.75691032409668, "learning_rate": 1.301748496887548e-05, "loss": 0.0693, "step": 127500 }, { "epoch": 1.8287873573108473, "grad_norm": 0.44893914461135864, "learning_rate": 1.301350145531029e-05, "loss": 0.1696, "step": 127525 }, { "epoch": 1.8291458727700338, "grad_norm": 0.15566745400428772, "learning_rate": 1.3009517941745098e-05, "loss": 0.0757, "step": 127550 }, { "epoch": 1.8295043882292203, "grad_norm": 8.039094924926758, "learning_rate": 1.3005534428179907e-05, "loss": 0.1528, "step": 127575 }, { "epoch": 1.829862903688407, "grad_norm": 0.1782519370317459, "learning_rate": 1.3001550914614715e-05, "loss": 0.0744, "step": 127600 }, { "epoch": 1.8302214191475936, "grad_norm": 0.9480229020118713, "learning_rate": 1.2997567401049523e-05, "loss": 0.152, "step": 127625 }, { "epoch": 1.8305799346067801, "grad_norm": 0.6820863485336304, "learning_rate": 1.2993583887484333e-05, "loss": 0.1225, "step": 127650 }, { "epoch": 1.830938450065967, "grad_norm": 3.7075021266937256, "learning_rate": 1.298960037391914e-05, "loss": 0.1497, "step": 127675 }, { "epoch": 1.8312969655251534, "grad_norm": 20.606491088867188, "learning_rate": 1.298561686035395e-05, "loss": 0.1525, "step": 127700 }, { "epoch": 1.83165548098434, "grad_norm": 15.028970718383789, "learning_rate": 1.2981633346788758e-05, "loss": 0.1305, "step": 127725 }, { "epoch": 1.8320139964435267, "grad_norm": 3.051539421081543, "learning_rate": 1.2977649833223564e-05, "loss": 0.0785, "step": 127750 }, { "epoch": 1.8323725119027132, "grad_norm": 1.9332661628723145, "learning_rate": 1.2973666319658374e-05, "loss": 0.1355, "step": 127775 }, { "epoch": 1.8327310273618997, "grad_norm": 1.2470142841339111, "learning_rate": 1.2969682806093181e-05, "loss": 0.2099, "step": 127800 }, { "epoch": 1.8330895428210865, "grad_norm": 0.11530732363462448, "learning_rate": 1.2965699292527991e-05, "loss": 0.0394, "step": 127825 }, { "epoch": 1.833448058280273, "grad_norm": 2.683582305908203, "learning_rate": 1.2961715778962799e-05, "loss": 0.117, "step": 127850 }, { "epoch": 1.8338065737394595, "grad_norm": 0.2705508768558502, "learning_rate": 1.2957732265397608e-05, "loss": 0.1194, "step": 127875 }, { "epoch": 1.8341650891986463, "grad_norm": 3.5687084197998047, "learning_rate": 1.2953748751832416e-05, "loss": 0.1709, "step": 127900 }, { "epoch": 1.8345236046578328, "grad_norm": 10.41955852508545, "learning_rate": 1.2949765238267224e-05, "loss": 0.1798, "step": 127925 }, { "epoch": 1.8348821201170193, "grad_norm": 3.914557456970215, "learning_rate": 1.2945781724702034e-05, "loss": 0.1251, "step": 127950 }, { "epoch": 1.835240635576206, "grad_norm": 7.947587490081787, "learning_rate": 1.2941798211136841e-05, "loss": 0.0947, "step": 127975 }, { "epoch": 1.8355991510353926, "grad_norm": 0.04108622670173645, "learning_rate": 1.2937814697571651e-05, "loss": 0.1044, "step": 128000 }, { "epoch": 1.8359576664945791, "grad_norm": 9.32410717010498, "learning_rate": 1.2933831184006459e-05, "loss": 0.1347, "step": 128025 }, { "epoch": 1.836316181953766, "grad_norm": 15.996403694152832, "learning_rate": 1.2929847670441267e-05, "loss": 0.1331, "step": 128050 }, { "epoch": 1.8366746974129524, "grad_norm": 0.6259276866912842, "learning_rate": 1.2925864156876076e-05, "loss": 0.1184, "step": 128075 }, { "epoch": 1.837033212872139, "grad_norm": 10.796547889709473, "learning_rate": 1.2921880643310884e-05, "loss": 0.2268, "step": 128100 }, { "epoch": 1.8373917283313257, "grad_norm": 0.43405571579933167, "learning_rate": 1.2917897129745694e-05, "loss": 0.1263, "step": 128125 }, { "epoch": 1.8377502437905122, "grad_norm": 0.7469269633293152, "learning_rate": 1.2913913616180501e-05, "loss": 0.0853, "step": 128150 }, { "epoch": 1.8381087592496987, "grad_norm": 6.680711269378662, "learning_rate": 1.2909930102615311e-05, "loss": 0.0696, "step": 128175 }, { "epoch": 1.8384672747088855, "grad_norm": 1.2504202127456665, "learning_rate": 1.2905946589050119e-05, "loss": 0.1146, "step": 128200 }, { "epoch": 1.838825790168072, "grad_norm": 0.7339927554130554, "learning_rate": 1.2901963075484927e-05, "loss": 0.1642, "step": 128225 }, { "epoch": 1.8391843056272585, "grad_norm": 0.10807494074106216, "learning_rate": 1.2897979561919736e-05, "loss": 0.178, "step": 128250 }, { "epoch": 1.8395428210864453, "grad_norm": 22.76593780517578, "learning_rate": 1.2893996048354544e-05, "loss": 0.1492, "step": 128275 }, { "epoch": 1.8399013365456318, "grad_norm": 0.6733422875404358, "learning_rate": 1.2890012534789352e-05, "loss": 0.1484, "step": 128300 }, { "epoch": 1.8402598520048183, "grad_norm": 0.3535078465938568, "learning_rate": 1.288602902122416e-05, "loss": 0.1208, "step": 128325 }, { "epoch": 1.840618367464005, "grad_norm": 0.7988664507865906, "learning_rate": 1.2882045507658968e-05, "loss": 0.0938, "step": 128350 }, { "epoch": 1.8409768829231916, "grad_norm": 4.913212299346924, "learning_rate": 1.2878061994093777e-05, "loss": 0.0902, "step": 128375 }, { "epoch": 1.8413353983823781, "grad_norm": 0.3926476538181305, "learning_rate": 1.2874078480528585e-05, "loss": 0.0952, "step": 128400 }, { "epoch": 1.841693913841565, "grad_norm": 4.598597049713135, "learning_rate": 1.2870094966963395e-05, "loss": 0.1319, "step": 128425 }, { "epoch": 1.8420524293007514, "grad_norm": 0.6463335752487183, "learning_rate": 1.2866111453398202e-05, "loss": 0.0957, "step": 128450 }, { "epoch": 1.842410944759938, "grad_norm": 9.316095352172852, "learning_rate": 1.2862127939833012e-05, "loss": 0.1258, "step": 128475 }, { "epoch": 1.8427694602191247, "grad_norm": 3.2879838943481445, "learning_rate": 1.285814442626782e-05, "loss": 0.0802, "step": 128500 }, { "epoch": 1.8431279756783112, "grad_norm": 0.06529819220304489, "learning_rate": 1.2854160912702628e-05, "loss": 0.0838, "step": 128525 }, { "epoch": 1.8434864911374977, "grad_norm": 0.5655903816223145, "learning_rate": 1.2850177399137437e-05, "loss": 0.1605, "step": 128550 }, { "epoch": 1.8438450065966845, "grad_norm": 0.34064942598342896, "learning_rate": 1.2846193885572245e-05, "loss": 0.1526, "step": 128575 }, { "epoch": 1.844203522055871, "grad_norm": 14.826241493225098, "learning_rate": 1.2842210372007054e-05, "loss": 0.0982, "step": 128600 }, { "epoch": 1.8445620375150575, "grad_norm": 1.550864815711975, "learning_rate": 1.2838226858441862e-05, "loss": 0.1297, "step": 128625 }, { "epoch": 1.8449205529742443, "grad_norm": 1.5317037105560303, "learning_rate": 1.283424334487667e-05, "loss": 0.2397, "step": 128650 }, { "epoch": 1.8452790684334308, "grad_norm": 0.821570634841919, "learning_rate": 1.283025983131148e-05, "loss": 0.1324, "step": 128675 }, { "epoch": 1.8456375838926173, "grad_norm": 13.739409446716309, "learning_rate": 1.2826276317746288e-05, "loss": 0.0905, "step": 128700 }, { "epoch": 1.845996099351804, "grad_norm": 0.2263682335615158, "learning_rate": 1.2822292804181097e-05, "loss": 0.1404, "step": 128725 }, { "epoch": 1.8463546148109906, "grad_norm": 0.10528329759836197, "learning_rate": 1.2818309290615905e-05, "loss": 0.0928, "step": 128750 }, { "epoch": 1.8467131302701771, "grad_norm": 0.7170953750610352, "learning_rate": 1.2814325777050714e-05, "loss": 0.1183, "step": 128775 }, { "epoch": 1.8470716457293639, "grad_norm": 18.83077049255371, "learning_rate": 1.2810342263485522e-05, "loss": 0.1036, "step": 128800 }, { "epoch": 1.8474301611885504, "grad_norm": 0.35105159878730774, "learning_rate": 1.280635874992033e-05, "loss": 0.1074, "step": 128825 }, { "epoch": 1.847788676647737, "grad_norm": 19.99750518798828, "learning_rate": 1.2802375236355138e-05, "loss": 0.0778, "step": 128850 }, { "epoch": 1.8481471921069237, "grad_norm": 0.7156492471694946, "learning_rate": 1.2798391722789946e-05, "loss": 0.218, "step": 128875 }, { "epoch": 1.8485057075661102, "grad_norm": 18.21687126159668, "learning_rate": 1.2794408209224755e-05, "loss": 0.1813, "step": 128900 }, { "epoch": 1.8488642230252967, "grad_norm": 15.393304824829102, "learning_rate": 1.2790424695659563e-05, "loss": 0.1933, "step": 128925 }, { "epoch": 1.8492227384844835, "grad_norm": 0.21493208408355713, "learning_rate": 1.2786441182094371e-05, "loss": 0.1068, "step": 128950 }, { "epoch": 1.84958125394367, "grad_norm": 0.21986599266529083, "learning_rate": 1.278245766852918e-05, "loss": 0.0649, "step": 128975 }, { "epoch": 1.8499397694028565, "grad_norm": 0.1927943080663681, "learning_rate": 1.2778474154963988e-05, "loss": 0.1409, "step": 129000 }, { "epoch": 1.8502982848620433, "grad_norm": 14.839851379394531, "learning_rate": 1.2774490641398798e-05, "loss": 0.2444, "step": 129025 }, { "epoch": 1.8506568003212298, "grad_norm": 0.5599791407585144, "learning_rate": 1.2770507127833606e-05, "loss": 0.0833, "step": 129050 }, { "epoch": 1.8510153157804163, "grad_norm": 6.824876308441162, "learning_rate": 1.2766523614268415e-05, "loss": 0.0775, "step": 129075 }, { "epoch": 1.851373831239603, "grad_norm": 13.163507461547852, "learning_rate": 1.2762540100703223e-05, "loss": 0.1157, "step": 129100 }, { "epoch": 1.8517323466987896, "grad_norm": 12.027472496032715, "learning_rate": 1.2758556587138031e-05, "loss": 0.1481, "step": 129125 }, { "epoch": 1.8520908621579761, "grad_norm": 15.366165161132812, "learning_rate": 1.275457307357284e-05, "loss": 0.0925, "step": 129150 }, { "epoch": 1.8524493776171629, "grad_norm": 8.486766815185547, "learning_rate": 1.2750589560007648e-05, "loss": 0.1052, "step": 129175 }, { "epoch": 1.8528078930763494, "grad_norm": 2.9409124851226807, "learning_rate": 1.2746606046442458e-05, "loss": 0.156, "step": 129200 }, { "epoch": 1.853166408535536, "grad_norm": 18.22443962097168, "learning_rate": 1.2742622532877266e-05, "loss": 0.1138, "step": 129225 }, { "epoch": 1.8535249239947227, "grad_norm": 5.84903621673584, "learning_rate": 1.2738639019312074e-05, "loss": 0.0967, "step": 129250 }, { "epoch": 1.8538834394539092, "grad_norm": 0.31482139229774475, "learning_rate": 1.2734655505746883e-05, "loss": 0.1288, "step": 129275 }, { "epoch": 1.8542419549130957, "grad_norm": 26.808176040649414, "learning_rate": 1.2730671992181691e-05, "loss": 0.1353, "step": 129300 }, { "epoch": 1.8546004703722825, "grad_norm": 9.566642761230469, "learning_rate": 1.27266884786165e-05, "loss": 0.1356, "step": 129325 }, { "epoch": 1.854958985831469, "grad_norm": 0.1889038383960724, "learning_rate": 1.2722704965051308e-05, "loss": 0.1083, "step": 129350 }, { "epoch": 1.8553175012906555, "grad_norm": 20.43624496459961, "learning_rate": 1.2718721451486118e-05, "loss": 0.1585, "step": 129375 }, { "epoch": 1.8556760167498423, "grad_norm": 0.38159337639808655, "learning_rate": 1.2714737937920924e-05, "loss": 0.0784, "step": 129400 }, { "epoch": 1.8560345322090288, "grad_norm": 12.602557182312012, "learning_rate": 1.2710754424355732e-05, "loss": 0.1057, "step": 129425 }, { "epoch": 1.8563930476682153, "grad_norm": 14.295024871826172, "learning_rate": 1.2706770910790542e-05, "loss": 0.1346, "step": 129450 }, { "epoch": 1.856751563127402, "grad_norm": 1.5430957078933716, "learning_rate": 1.270278739722535e-05, "loss": 0.056, "step": 129475 }, { "epoch": 1.8571100785865886, "grad_norm": 0.1890547126531601, "learning_rate": 1.2698803883660159e-05, "loss": 0.2054, "step": 129500 }, { "epoch": 1.8574685940457751, "grad_norm": 14.125385284423828, "learning_rate": 1.2694820370094967e-05, "loss": 0.1475, "step": 129525 }, { "epoch": 1.8578271095049619, "grad_norm": 7.704086780548096, "learning_rate": 1.2690836856529775e-05, "loss": 0.1792, "step": 129550 }, { "epoch": 1.8581856249641484, "grad_norm": 9.427268981933594, "learning_rate": 1.2686853342964584e-05, "loss": 0.0933, "step": 129575 }, { "epoch": 1.858544140423335, "grad_norm": 22.117992401123047, "learning_rate": 1.2682869829399392e-05, "loss": 0.1386, "step": 129600 }, { "epoch": 1.8589026558825217, "grad_norm": 16.13442039489746, "learning_rate": 1.2678886315834202e-05, "loss": 0.0796, "step": 129625 }, { "epoch": 1.8592611713417082, "grad_norm": 0.35487887263298035, "learning_rate": 1.267490280226901e-05, "loss": 0.163, "step": 129650 }, { "epoch": 1.8596196868008947, "grad_norm": 16.401182174682617, "learning_rate": 1.2670919288703819e-05, "loss": 0.1366, "step": 129675 }, { "epoch": 1.8599782022600815, "grad_norm": 0.12332814186811447, "learning_rate": 1.2666935775138627e-05, "loss": 0.0939, "step": 129700 }, { "epoch": 1.860336717719268, "grad_norm": 1.2534940242767334, "learning_rate": 1.2662952261573435e-05, "loss": 0.1576, "step": 129725 }, { "epoch": 1.8606952331784545, "grad_norm": 0.44598186016082764, "learning_rate": 1.2658968748008244e-05, "loss": 0.1156, "step": 129750 }, { "epoch": 1.8610537486376413, "grad_norm": 6.68617582321167, "learning_rate": 1.2654985234443052e-05, "loss": 0.0512, "step": 129775 }, { "epoch": 1.8614122640968278, "grad_norm": 1.2685729265213013, "learning_rate": 1.2651001720877862e-05, "loss": 0.0785, "step": 129800 }, { "epoch": 1.8617707795560143, "grad_norm": 13.823275566101074, "learning_rate": 1.264701820731267e-05, "loss": 0.1622, "step": 129825 }, { "epoch": 1.862129295015201, "grad_norm": 6.409114360809326, "learning_rate": 1.2643034693747477e-05, "loss": 0.0665, "step": 129850 }, { "epoch": 1.8624878104743876, "grad_norm": 14.074410438537598, "learning_rate": 1.2639051180182287e-05, "loss": 0.1379, "step": 129875 }, { "epoch": 1.8628463259335741, "grad_norm": 1.4874494075775146, "learning_rate": 1.2635067666617095e-05, "loss": 0.1875, "step": 129900 }, { "epoch": 1.8632048413927609, "grad_norm": 17.20707130432129, "learning_rate": 1.2631084153051904e-05, "loss": 0.1461, "step": 129925 }, { "epoch": 1.8635633568519474, "grad_norm": 9.69825267791748, "learning_rate": 1.262710063948671e-05, "loss": 0.2337, "step": 129950 }, { "epoch": 1.863921872311134, "grad_norm": 7.405020236968994, "learning_rate": 1.262311712592152e-05, "loss": 0.1212, "step": 129975 }, { "epoch": 1.8642803877703207, "grad_norm": 10.837759971618652, "learning_rate": 1.2619133612356328e-05, "loss": 0.1042, "step": 130000 }, { "epoch": 1.8646389032295072, "grad_norm": 1.601396083831787, "learning_rate": 1.2615150098791136e-05, "loss": 0.1457, "step": 130025 }, { "epoch": 1.8649974186886937, "grad_norm": 4.281298637390137, "learning_rate": 1.2611166585225945e-05, "loss": 0.1005, "step": 130050 }, { "epoch": 1.8653559341478805, "grad_norm": 21.349023818969727, "learning_rate": 1.2607183071660753e-05, "loss": 0.1121, "step": 130075 }, { "epoch": 1.865714449607067, "grad_norm": 2.9902584552764893, "learning_rate": 1.2603199558095562e-05, "loss": 0.1389, "step": 130100 }, { "epoch": 1.8660729650662535, "grad_norm": 16.018423080444336, "learning_rate": 1.259921604453037e-05, "loss": 0.0619, "step": 130125 }, { "epoch": 1.8664314805254403, "grad_norm": 0.10199540853500366, "learning_rate": 1.2595232530965178e-05, "loss": 0.1047, "step": 130150 }, { "epoch": 1.8667899959846268, "grad_norm": 0.032177478075027466, "learning_rate": 1.2591249017399988e-05, "loss": 0.1305, "step": 130175 }, { "epoch": 1.8671485114438133, "grad_norm": 0.4500160217285156, "learning_rate": 1.2587265503834796e-05, "loss": 0.1942, "step": 130200 }, { "epoch": 1.867507026903, "grad_norm": 15.128961563110352, "learning_rate": 1.2583281990269605e-05, "loss": 0.144, "step": 130225 }, { "epoch": 1.8678655423621866, "grad_norm": 0.02301265113055706, "learning_rate": 1.2579298476704413e-05, "loss": 0.0699, "step": 130250 }, { "epoch": 1.8682240578213731, "grad_norm": 0.7221315503120422, "learning_rate": 1.2575314963139222e-05, "loss": 0.1104, "step": 130275 }, { "epoch": 1.8685825732805599, "grad_norm": 0.45840078592300415, "learning_rate": 1.257133144957403e-05, "loss": 0.1311, "step": 130300 }, { "epoch": 1.8689410887397464, "grad_norm": 19.957868576049805, "learning_rate": 1.2567347936008838e-05, "loss": 0.1518, "step": 130325 }, { "epoch": 1.869299604198933, "grad_norm": 2.3502695560455322, "learning_rate": 1.2563364422443648e-05, "loss": 0.1604, "step": 130350 }, { "epoch": 1.8696581196581197, "grad_norm": 1.425214409828186, "learning_rate": 1.2559380908878456e-05, "loss": 0.08, "step": 130375 }, { "epoch": 1.8700166351173062, "grad_norm": 0.7561257481575012, "learning_rate": 1.2555397395313265e-05, "loss": 0.0936, "step": 130400 }, { "epoch": 1.8703751505764927, "grad_norm": 0.05845700576901436, "learning_rate": 1.2551413881748073e-05, "loss": 0.1662, "step": 130425 }, { "epoch": 1.8707336660356795, "grad_norm": 13.945833206176758, "learning_rate": 1.254743036818288e-05, "loss": 0.1205, "step": 130450 }, { "epoch": 1.8710921814948662, "grad_norm": 0.17737440764904022, "learning_rate": 1.254344685461769e-05, "loss": 0.1081, "step": 130475 }, { "epoch": 1.8714506969540525, "grad_norm": 3.672274589538574, "learning_rate": 1.2539463341052496e-05, "loss": 0.1005, "step": 130500 }, { "epoch": 1.8718092124132393, "grad_norm": 0.3900725543498993, "learning_rate": 1.2535479827487306e-05, "loss": 0.1336, "step": 130525 }, { "epoch": 1.872167727872426, "grad_norm": 0.9384759664535522, "learning_rate": 1.2531496313922114e-05, "loss": 0.1594, "step": 130550 }, { "epoch": 1.8725262433316123, "grad_norm": 2.3301901817321777, "learning_rate": 1.2527512800356923e-05, "loss": 0.1434, "step": 130575 }, { "epoch": 1.872884758790799, "grad_norm": 1.5356054306030273, "learning_rate": 1.2523529286791731e-05, "loss": 0.1425, "step": 130600 }, { "epoch": 1.8732432742499858, "grad_norm": 0.44582948088645935, "learning_rate": 1.2519545773226539e-05, "loss": 0.0974, "step": 130625 }, { "epoch": 1.8736017897091721, "grad_norm": 0.7906078100204468, "learning_rate": 1.2515562259661349e-05, "loss": 0.0856, "step": 130650 }, { "epoch": 1.8739603051683589, "grad_norm": 7.034746170043945, "learning_rate": 1.2511578746096156e-05, "loss": 0.1934, "step": 130675 }, { "epoch": 1.8743188206275456, "grad_norm": 0.36678245663642883, "learning_rate": 1.2507595232530966e-05, "loss": 0.119, "step": 130700 }, { "epoch": 1.874677336086732, "grad_norm": 0.8099156618118286, "learning_rate": 1.2503611718965774e-05, "loss": 0.1218, "step": 130725 }, { "epoch": 1.8750358515459187, "grad_norm": 1.040543556213379, "learning_rate": 1.2499628205400582e-05, "loss": 0.1549, "step": 130750 }, { "epoch": 1.8753943670051054, "grad_norm": 0.7581588625907898, "learning_rate": 1.2495644691835391e-05, "loss": 0.128, "step": 130775 }, { "epoch": 1.8757528824642917, "grad_norm": 5.933798313140869, "learning_rate": 1.2491661178270199e-05, "loss": 0.1412, "step": 130800 }, { "epoch": 1.8761113979234785, "grad_norm": 5.541106700897217, "learning_rate": 1.2487677664705009e-05, "loss": 0.0957, "step": 130825 }, { "epoch": 1.8764699133826652, "grad_norm": 0.038984622806310654, "learning_rate": 1.2483694151139816e-05, "loss": 0.0601, "step": 130850 }, { "epoch": 1.8768284288418515, "grad_norm": 7.502253532409668, "learning_rate": 1.2479710637574626e-05, "loss": 0.1395, "step": 130875 }, { "epoch": 1.8771869443010383, "grad_norm": 11.75502872467041, "learning_rate": 1.2475727124009434e-05, "loss": 0.0936, "step": 130900 }, { "epoch": 1.877545459760225, "grad_norm": 16.179304122924805, "learning_rate": 1.2471743610444242e-05, "loss": 0.1583, "step": 130925 }, { "epoch": 1.8779039752194113, "grad_norm": 6.620942115783691, "learning_rate": 1.2467760096879051e-05, "loss": 0.175, "step": 130950 }, { "epoch": 1.878262490678598, "grad_norm": 1.6160067319869995, "learning_rate": 1.2463776583313859e-05, "loss": 0.0848, "step": 130975 }, { "epoch": 1.8786210061377848, "grad_norm": 23.308982849121094, "learning_rate": 1.2459793069748669e-05, "loss": 0.1484, "step": 131000 }, { "epoch": 1.8789795215969711, "grad_norm": 21.33611488342285, "learning_rate": 1.2455809556183476e-05, "loss": 0.1225, "step": 131025 }, { "epoch": 1.8793380370561579, "grad_norm": 0.10005000233650208, "learning_rate": 1.2451826042618283e-05, "loss": 0.1488, "step": 131050 }, { "epoch": 1.8796965525153446, "grad_norm": 2.46966290473938, "learning_rate": 1.2447842529053092e-05, "loss": 0.1435, "step": 131075 }, { "epoch": 1.880055067974531, "grad_norm": 7.917078495025635, "learning_rate": 1.24438590154879e-05, "loss": 0.1718, "step": 131100 }, { "epoch": 1.8804135834337177, "grad_norm": 1.5085982084274292, "learning_rate": 1.243987550192271e-05, "loss": 0.1178, "step": 131125 }, { "epoch": 1.8807720988929044, "grad_norm": 2.7443301677703857, "learning_rate": 1.2435891988357517e-05, "loss": 0.0865, "step": 131150 }, { "epoch": 1.8811306143520907, "grad_norm": 1.2168686389923096, "learning_rate": 1.2431908474792327e-05, "loss": 0.0927, "step": 131175 }, { "epoch": 1.8814891298112775, "grad_norm": 1.681654691696167, "learning_rate": 1.2427924961227135e-05, "loss": 0.1037, "step": 131200 }, { "epoch": 1.8818476452704642, "grad_norm": 10.409687042236328, "learning_rate": 1.2423941447661943e-05, "loss": 0.1336, "step": 131225 }, { "epoch": 1.8822061607296505, "grad_norm": 0.9407346248626709, "learning_rate": 1.2419957934096752e-05, "loss": 0.0447, "step": 131250 }, { "epoch": 1.8825646761888373, "grad_norm": 0.9193363189697266, "learning_rate": 1.241597442053156e-05, "loss": 0.1734, "step": 131275 }, { "epoch": 1.882923191648024, "grad_norm": 9.778740882873535, "learning_rate": 1.241199090696637e-05, "loss": 0.0722, "step": 131300 }, { "epoch": 1.8832817071072103, "grad_norm": 14.889090538024902, "learning_rate": 1.2408007393401177e-05, "loss": 0.1188, "step": 131325 }, { "epoch": 1.883640222566397, "grad_norm": 0.6500499248504639, "learning_rate": 1.2404023879835985e-05, "loss": 0.0868, "step": 131350 }, { "epoch": 1.8839987380255838, "grad_norm": 0.5061282515525818, "learning_rate": 1.2400040366270795e-05, "loss": 0.1338, "step": 131375 }, { "epoch": 1.8843572534847701, "grad_norm": 4.572085380554199, "learning_rate": 1.2396056852705603e-05, "loss": 0.1008, "step": 131400 }, { "epoch": 1.8847157689439569, "grad_norm": 16.456768035888672, "learning_rate": 1.2392073339140412e-05, "loss": 0.1694, "step": 131425 }, { "epoch": 1.8850742844031436, "grad_norm": 3.8373053073883057, "learning_rate": 1.238808982557522e-05, "loss": 0.0974, "step": 131450 }, { "epoch": 1.88543279986233, "grad_norm": 3.8423352241516113, "learning_rate": 1.238410631201003e-05, "loss": 0.1296, "step": 131475 }, { "epoch": 1.8857913153215167, "grad_norm": 0.4882431924343109, "learning_rate": 1.2380122798444837e-05, "loss": 0.163, "step": 131500 }, { "epoch": 1.8861498307807034, "grad_norm": 2.752919912338257, "learning_rate": 1.2376139284879645e-05, "loss": 0.1125, "step": 131525 }, { "epoch": 1.8865083462398897, "grad_norm": 1.2960816621780396, "learning_rate": 1.2372155771314455e-05, "loss": 0.176, "step": 131550 }, { "epoch": 1.8868668616990765, "grad_norm": 2.9120030403137207, "learning_rate": 1.2368172257749263e-05, "loss": 0.0689, "step": 131575 }, { "epoch": 1.8872253771582632, "grad_norm": 3.3414764404296875, "learning_rate": 1.236418874418407e-05, "loss": 0.1101, "step": 131600 }, { "epoch": 1.8875838926174495, "grad_norm": 10.471107482910156, "learning_rate": 1.2360205230618878e-05, "loss": 0.0952, "step": 131625 }, { "epoch": 1.8879424080766363, "grad_norm": 3.6607203483581543, "learning_rate": 1.2356221717053686e-05, "loss": 0.1204, "step": 131650 }, { "epoch": 1.888300923535823, "grad_norm": 2.371121644973755, "learning_rate": 1.2352238203488496e-05, "loss": 0.0836, "step": 131675 }, { "epoch": 1.8886594389950093, "grad_norm": 3.6532976627349854, "learning_rate": 1.2348254689923303e-05, "loss": 0.1151, "step": 131700 }, { "epoch": 1.889017954454196, "grad_norm": 0.9425303936004639, "learning_rate": 1.2344271176358113e-05, "loss": 0.1243, "step": 131725 }, { "epoch": 1.8893764699133828, "grad_norm": 12.054696083068848, "learning_rate": 1.234028766279292e-05, "loss": 0.123, "step": 131750 }, { "epoch": 1.8897349853725691, "grad_norm": 12.136513710021973, "learning_rate": 1.233630414922773e-05, "loss": 0.1359, "step": 131775 }, { "epoch": 1.8900935008317559, "grad_norm": 23.68752098083496, "learning_rate": 1.2332320635662538e-05, "loss": 0.1375, "step": 131800 }, { "epoch": 1.8904520162909426, "grad_norm": 3.1683835983276367, "learning_rate": 1.2328337122097346e-05, "loss": 0.1385, "step": 131825 }, { "epoch": 1.890810531750129, "grad_norm": 11.315717697143555, "learning_rate": 1.2324353608532156e-05, "loss": 0.1863, "step": 131850 }, { "epoch": 1.8911690472093157, "grad_norm": 14.522261619567871, "learning_rate": 1.2320370094966963e-05, "loss": 0.2079, "step": 131875 }, { "epoch": 1.8915275626685024, "grad_norm": 9.016822814941406, "learning_rate": 1.2316386581401773e-05, "loss": 0.0995, "step": 131900 }, { "epoch": 1.8918860781276887, "grad_norm": 0.11885850131511688, "learning_rate": 1.231240306783658e-05, "loss": 0.0938, "step": 131925 }, { "epoch": 1.8922445935868755, "grad_norm": 15.657236099243164, "learning_rate": 1.2308419554271389e-05, "loss": 0.111, "step": 131950 }, { "epoch": 1.8926031090460622, "grad_norm": 0.5633066892623901, "learning_rate": 1.2304436040706198e-05, "loss": 0.1119, "step": 131975 }, { "epoch": 1.8929616245052485, "grad_norm": 2.702040910720825, "learning_rate": 1.2300452527141006e-05, "loss": 0.0797, "step": 132000 }, { "epoch": 1.8933201399644353, "grad_norm": 1.610526204109192, "learning_rate": 1.2296469013575816e-05, "loss": 0.123, "step": 132025 }, { "epoch": 1.893678655423622, "grad_norm": 1.0347578525543213, "learning_rate": 1.2292485500010623e-05, "loss": 0.0902, "step": 132050 }, { "epoch": 1.8940371708828083, "grad_norm": 0.593650221824646, "learning_rate": 1.2288501986445431e-05, "loss": 0.2388, "step": 132075 }, { "epoch": 1.894395686341995, "grad_norm": 0.24521037936210632, "learning_rate": 1.228451847288024e-05, "loss": 0.1097, "step": 132100 }, { "epoch": 1.8947542018011818, "grad_norm": 9.270625114440918, "learning_rate": 1.2280534959315049e-05, "loss": 0.1492, "step": 132125 }, { "epoch": 1.8951127172603681, "grad_norm": 0.15498222410678864, "learning_rate": 1.2276551445749857e-05, "loss": 0.1186, "step": 132150 }, { "epoch": 1.8954712327195549, "grad_norm": 0.06646449863910675, "learning_rate": 1.2272567932184664e-05, "loss": 0.1387, "step": 132175 }, { "epoch": 1.8958297481787416, "grad_norm": 1.474538803100586, "learning_rate": 1.2268584418619474e-05, "loss": 0.0684, "step": 132200 }, { "epoch": 1.896188263637928, "grad_norm": 29.130590438842773, "learning_rate": 1.2264600905054282e-05, "loss": 0.1358, "step": 132225 }, { "epoch": 1.8965467790971147, "grad_norm": 0.18060711026191711, "learning_rate": 1.226061739148909e-05, "loss": 0.1044, "step": 132250 }, { "epoch": 1.8969052945563014, "grad_norm": 0.061930470168590546, "learning_rate": 1.2256633877923899e-05, "loss": 0.1126, "step": 132275 }, { "epoch": 1.8972638100154877, "grad_norm": 13.131975173950195, "learning_rate": 1.2252650364358707e-05, "loss": 0.1255, "step": 132300 }, { "epoch": 1.8976223254746745, "grad_norm": 6.082961559295654, "learning_rate": 1.2248666850793517e-05, "loss": 0.1853, "step": 132325 }, { "epoch": 1.8979808409338612, "grad_norm": 13.263872146606445, "learning_rate": 1.2244683337228324e-05, "loss": 0.1095, "step": 132350 }, { "epoch": 1.8983393563930475, "grad_norm": 8.253275871276855, "learning_rate": 1.2240699823663132e-05, "loss": 0.078, "step": 132375 }, { "epoch": 1.8986978718522343, "grad_norm": 0.13292554020881653, "learning_rate": 1.2236716310097942e-05, "loss": 0.0645, "step": 132400 }, { "epoch": 1.899056387311421, "grad_norm": 5.947458267211914, "learning_rate": 1.223273279653275e-05, "loss": 0.1191, "step": 132425 }, { "epoch": 1.8994149027706073, "grad_norm": 3.991438865661621, "learning_rate": 1.2228749282967559e-05, "loss": 0.2162, "step": 132450 }, { "epoch": 1.899773418229794, "grad_norm": 3.4383320808410645, "learning_rate": 1.2224765769402367e-05, "loss": 0.1048, "step": 132475 }, { "epoch": 1.9001319336889808, "grad_norm": 15.284976959228516, "learning_rate": 1.2220782255837177e-05, "loss": 0.1957, "step": 132500 }, { "epoch": 1.9004904491481671, "grad_norm": 0.8237897157669067, "learning_rate": 1.2216798742271984e-05, "loss": 0.1658, "step": 132525 }, { "epoch": 1.9008489646073539, "grad_norm": 0.618876039981842, "learning_rate": 1.2212815228706792e-05, "loss": 0.1849, "step": 132550 }, { "epoch": 1.9012074800665406, "grad_norm": 7.985204696655273, "learning_rate": 1.2208831715141602e-05, "loss": 0.0979, "step": 132575 }, { "epoch": 1.901565995525727, "grad_norm": 0.20066861808300018, "learning_rate": 1.220484820157641e-05, "loss": 0.2197, "step": 132600 }, { "epoch": 1.9019245109849137, "grad_norm": 21.11566734313965, "learning_rate": 1.2200864688011219e-05, "loss": 0.1665, "step": 132625 }, { "epoch": 1.9022830264441004, "grad_norm": 0.6497341394424438, "learning_rate": 1.2196881174446027e-05, "loss": 0.0765, "step": 132650 }, { "epoch": 1.9026415419032867, "grad_norm": 0.46837350726127625, "learning_rate": 1.2192897660880835e-05, "loss": 0.0985, "step": 132675 }, { "epoch": 1.9030000573624735, "grad_norm": 2.018641948699951, "learning_rate": 1.2188914147315643e-05, "loss": 0.0787, "step": 132700 }, { "epoch": 1.9033585728216602, "grad_norm": 0.45311668515205383, "learning_rate": 1.218493063375045e-05, "loss": 0.1129, "step": 132725 }, { "epoch": 1.9037170882808465, "grad_norm": 2.5588791370391846, "learning_rate": 1.218094712018526e-05, "loss": 0.1119, "step": 132750 }, { "epoch": 1.9040756037400333, "grad_norm": 0.8743604421615601, "learning_rate": 1.2176963606620068e-05, "loss": 0.0682, "step": 132775 }, { "epoch": 1.90443411919922, "grad_norm": 0.11899442970752716, "learning_rate": 1.2172980093054877e-05, "loss": 0.1125, "step": 132800 }, { "epoch": 1.9047926346584063, "grad_norm": 11.134337425231934, "learning_rate": 1.2168996579489685e-05, "loss": 0.074, "step": 132825 }, { "epoch": 1.905151150117593, "grad_norm": 5.678440570831299, "learning_rate": 1.2165013065924493e-05, "loss": 0.1398, "step": 132850 }, { "epoch": 1.9055096655767798, "grad_norm": 0.5176705121994019, "learning_rate": 1.2161029552359303e-05, "loss": 0.0668, "step": 132875 }, { "epoch": 1.9058681810359661, "grad_norm": 0.12892411649227142, "learning_rate": 1.215704603879411e-05, "loss": 0.1215, "step": 132900 }, { "epoch": 1.9062266964951529, "grad_norm": 0.8128584623336792, "learning_rate": 1.215306252522892e-05, "loss": 0.075, "step": 132925 }, { "epoch": 1.9065852119543396, "grad_norm": 0.38265299797058105, "learning_rate": 1.2149079011663728e-05, "loss": 0.1074, "step": 132950 }, { "epoch": 1.906943727413526, "grad_norm": 2.5359294414520264, "learning_rate": 1.2145095498098536e-05, "loss": 0.1462, "step": 132975 }, { "epoch": 1.9073022428727127, "grad_norm": 18.975614547729492, "learning_rate": 1.2141111984533345e-05, "loss": 0.1914, "step": 133000 }, { "epoch": 1.9076607583318994, "grad_norm": 1.1775460243225098, "learning_rate": 1.2137128470968153e-05, "loss": 0.1126, "step": 133025 }, { "epoch": 1.9080192737910857, "grad_norm": 16.180021286010742, "learning_rate": 1.2133144957402963e-05, "loss": 0.248, "step": 133050 }, { "epoch": 1.9083777892502725, "grad_norm": 0.9803509712219238, "learning_rate": 1.212916144383777e-05, "loss": 0.1891, "step": 133075 }, { "epoch": 1.9087363047094592, "grad_norm": 4.872742652893066, "learning_rate": 1.212517793027258e-05, "loss": 0.1013, "step": 133100 }, { "epoch": 1.9090948201686455, "grad_norm": 15.942778587341309, "learning_rate": 1.2121194416707388e-05, "loss": 0.1001, "step": 133125 }, { "epoch": 1.9094533356278323, "grad_norm": 7.727712154388428, "learning_rate": 1.2117210903142196e-05, "loss": 0.0692, "step": 133150 }, { "epoch": 1.909811851087019, "grad_norm": 2.4152753353118896, "learning_rate": 1.2113227389577005e-05, "loss": 0.1117, "step": 133175 }, { "epoch": 1.9101703665462053, "grad_norm": 4.966827869415283, "learning_rate": 1.2109243876011813e-05, "loss": 0.1353, "step": 133200 }, { "epoch": 1.910528882005392, "grad_norm": 6.927927494049072, "learning_rate": 1.2105260362446623e-05, "loss": 0.1478, "step": 133225 }, { "epoch": 1.9108873974645788, "grad_norm": 0.0969591736793518, "learning_rate": 1.2101276848881429e-05, "loss": 0.1424, "step": 133250 }, { "epoch": 1.9112459129237651, "grad_norm": 5.615168571472168, "learning_rate": 1.2097293335316237e-05, "loss": 0.1383, "step": 133275 }, { "epoch": 1.9116044283829519, "grad_norm": 2.1642165184020996, "learning_rate": 1.2093309821751046e-05, "loss": 0.1399, "step": 133300 }, { "epoch": 1.9119629438421386, "grad_norm": 9.500798225402832, "learning_rate": 1.2089326308185854e-05, "loss": 0.1419, "step": 133325 }, { "epoch": 1.912321459301325, "grad_norm": 10.50909423828125, "learning_rate": 1.2085342794620664e-05, "loss": 0.0569, "step": 133350 }, { "epoch": 1.9126799747605117, "grad_norm": 1.491382360458374, "learning_rate": 1.2081359281055471e-05, "loss": 0.1623, "step": 133375 }, { "epoch": 1.9130384902196984, "grad_norm": 5.103508949279785, "learning_rate": 1.2077375767490281e-05, "loss": 0.1351, "step": 133400 }, { "epoch": 1.9133970056788847, "grad_norm": 15.17320442199707, "learning_rate": 1.2073392253925089e-05, "loss": 0.1496, "step": 133425 }, { "epoch": 1.9137555211380715, "grad_norm": 0.9976111054420471, "learning_rate": 1.2069408740359897e-05, "loss": 0.1717, "step": 133450 }, { "epoch": 1.9141140365972582, "grad_norm": 5.995475769042969, "learning_rate": 1.2065425226794706e-05, "loss": 0.0995, "step": 133475 }, { "epoch": 1.9144725520564445, "grad_norm": 2.8500375747680664, "learning_rate": 1.2061441713229514e-05, "loss": 0.0866, "step": 133500 }, { "epoch": 1.9148310675156313, "grad_norm": 14.806591033935547, "learning_rate": 1.2057458199664324e-05, "loss": 0.164, "step": 133525 }, { "epoch": 1.915189582974818, "grad_norm": 2.2823710441589355, "learning_rate": 1.2053474686099131e-05, "loss": 0.1087, "step": 133550 }, { "epoch": 1.9155480984340043, "grad_norm": 3.2315478324890137, "learning_rate": 1.204949117253394e-05, "loss": 0.1782, "step": 133575 }, { "epoch": 1.915906613893191, "grad_norm": 15.08370590209961, "learning_rate": 1.2045507658968749e-05, "loss": 0.1353, "step": 133600 }, { "epoch": 1.9162651293523778, "grad_norm": 9.982684135437012, "learning_rate": 1.2041524145403557e-05, "loss": 0.071, "step": 133625 }, { "epoch": 1.916623644811564, "grad_norm": 1.2482765913009644, "learning_rate": 1.2037540631838366e-05, "loss": 0.1424, "step": 133650 }, { "epoch": 1.9169821602707509, "grad_norm": 4.152139663696289, "learning_rate": 1.2033557118273174e-05, "loss": 0.1789, "step": 133675 }, { "epoch": 1.9173406757299376, "grad_norm": 2.800978899002075, "learning_rate": 1.2029573604707984e-05, "loss": 0.1009, "step": 133700 }, { "epoch": 1.9176991911891241, "grad_norm": 10.484561920166016, "learning_rate": 1.2025590091142791e-05, "loss": 0.1209, "step": 133725 }, { "epoch": 1.9180577066483107, "grad_norm": 14.566463470458984, "learning_rate": 1.20216065775776e-05, "loss": 0.2085, "step": 133750 }, { "epoch": 1.9184162221074974, "grad_norm": 4.933841228485107, "learning_rate": 1.2017623064012409e-05, "loss": 0.0883, "step": 133775 }, { "epoch": 1.918774737566684, "grad_norm": 14.184020042419434, "learning_rate": 1.2013639550447215e-05, "loss": 0.1349, "step": 133800 }, { "epoch": 1.9191332530258705, "grad_norm": 9.85683822631836, "learning_rate": 1.2009656036882024e-05, "loss": 0.212, "step": 133825 }, { "epoch": 1.9194917684850572, "grad_norm": 0.6600126028060913, "learning_rate": 1.2005672523316832e-05, "loss": 0.0833, "step": 133850 }, { "epoch": 1.9198502839442437, "grad_norm": 8.273826599121094, "learning_rate": 1.200168900975164e-05, "loss": 0.0932, "step": 133875 }, { "epoch": 1.9202087994034303, "grad_norm": 3.713754653930664, "learning_rate": 1.199770549618645e-05, "loss": 0.1837, "step": 133900 }, { "epoch": 1.920567314862617, "grad_norm": 7.834407806396484, "learning_rate": 1.1993721982621258e-05, "loss": 0.1175, "step": 133925 }, { "epoch": 1.9209258303218035, "grad_norm": 11.018919944763184, "learning_rate": 1.1989738469056067e-05, "loss": 0.1074, "step": 133950 }, { "epoch": 1.92128434578099, "grad_norm": 0.06111612915992737, "learning_rate": 1.1985754955490875e-05, "loss": 0.1057, "step": 133975 }, { "epoch": 1.9216428612401768, "grad_norm": 28.004858016967773, "learning_rate": 1.1981771441925684e-05, "loss": 0.2096, "step": 134000 }, { "epoch": 1.9220013766993633, "grad_norm": 13.777868270874023, "learning_rate": 1.1977787928360492e-05, "loss": 0.0689, "step": 134025 }, { "epoch": 1.9223598921585499, "grad_norm": 6.989131450653076, "learning_rate": 1.19738044147953e-05, "loss": 0.0604, "step": 134050 }, { "epoch": 1.9227184076177366, "grad_norm": 0.5801117420196533, "learning_rate": 1.196982090123011e-05, "loss": 0.1064, "step": 134075 }, { "epoch": 1.9230769230769231, "grad_norm": 0.1220712810754776, "learning_rate": 1.1965837387664918e-05, "loss": 0.0687, "step": 134100 }, { "epoch": 1.9234354385361097, "grad_norm": 0.03884151577949524, "learning_rate": 1.1961853874099727e-05, "loss": 0.1343, "step": 134125 }, { "epoch": 1.9237939539952964, "grad_norm": 1.2447680234909058, "learning_rate": 1.1957870360534535e-05, "loss": 0.0578, "step": 134150 }, { "epoch": 1.924152469454483, "grad_norm": 3.4208903312683105, "learning_rate": 1.1953886846969343e-05, "loss": 0.1305, "step": 134175 }, { "epoch": 1.9245109849136695, "grad_norm": 15.27458667755127, "learning_rate": 1.1949903333404152e-05, "loss": 0.1429, "step": 134200 }, { "epoch": 1.9248695003728562, "grad_norm": 4.353878021240234, "learning_rate": 1.194591981983896e-05, "loss": 0.1873, "step": 134225 }, { "epoch": 1.9252280158320427, "grad_norm": 6.115928649902344, "learning_rate": 1.194193630627377e-05, "loss": 0.1568, "step": 134250 }, { "epoch": 1.9255865312912293, "grad_norm": 0.9300339818000793, "learning_rate": 1.1937952792708578e-05, "loss": 0.1435, "step": 134275 }, { "epoch": 1.925945046750416, "grad_norm": 0.8016968369483948, "learning_rate": 1.1933969279143387e-05, "loss": 0.0894, "step": 134300 }, { "epoch": 1.9263035622096025, "grad_norm": 1.5265982151031494, "learning_rate": 1.1929985765578195e-05, "loss": 0.1739, "step": 134325 }, { "epoch": 1.926662077668789, "grad_norm": 0.14139825105667114, "learning_rate": 1.1926002252013001e-05, "loss": 0.0761, "step": 134350 }, { "epoch": 1.9270205931279758, "grad_norm": 14.172037124633789, "learning_rate": 1.192201873844781e-05, "loss": 0.094, "step": 134375 }, { "epoch": 1.9273791085871623, "grad_norm": 1.283035159111023, "learning_rate": 1.1918035224882618e-05, "loss": 0.2002, "step": 134400 }, { "epoch": 1.9277376240463489, "grad_norm": 5.9066243171691895, "learning_rate": 1.1914051711317428e-05, "loss": 0.149, "step": 134425 }, { "epoch": 1.9280961395055356, "grad_norm": 2.5294528007507324, "learning_rate": 1.1910068197752236e-05, "loss": 0.1072, "step": 134450 }, { "epoch": 1.9284546549647221, "grad_norm": 12.804170608520508, "learning_rate": 1.1906084684187044e-05, "loss": 0.0881, "step": 134475 }, { "epoch": 1.9288131704239087, "grad_norm": 7.071032524108887, "learning_rate": 1.1902101170621853e-05, "loss": 0.0462, "step": 134500 }, { "epoch": 1.9291716858830954, "grad_norm": 12.815510749816895, "learning_rate": 1.1898117657056661e-05, "loss": 0.1432, "step": 134525 }, { "epoch": 1.929530201342282, "grad_norm": 10.398592948913574, "learning_rate": 1.189413414349147e-05, "loss": 0.0951, "step": 134550 }, { "epoch": 1.9298887168014685, "grad_norm": 1.1392946243286133, "learning_rate": 1.1890150629926278e-05, "loss": 0.1858, "step": 134575 }, { "epoch": 1.9302472322606552, "grad_norm": 0.14058071374893188, "learning_rate": 1.1886167116361088e-05, "loss": 0.1249, "step": 134600 }, { "epoch": 1.9306057477198417, "grad_norm": 12.130672454833984, "learning_rate": 1.1882183602795896e-05, "loss": 0.1875, "step": 134625 }, { "epoch": 1.9309642631790283, "grad_norm": 1.4245132207870483, "learning_rate": 1.1878200089230704e-05, "loss": 0.1201, "step": 134650 }, { "epoch": 1.931322778638215, "grad_norm": 0.2527826726436615, "learning_rate": 1.1874216575665513e-05, "loss": 0.2157, "step": 134675 }, { "epoch": 1.9316812940974015, "grad_norm": 1.9668858051300049, "learning_rate": 1.1870233062100321e-05, "loss": 0.1891, "step": 134700 }, { "epoch": 1.932039809556588, "grad_norm": 0.339848130941391, "learning_rate": 1.186624954853513e-05, "loss": 0.0859, "step": 134725 }, { "epoch": 1.9323983250157748, "grad_norm": 0.049764327704906464, "learning_rate": 1.1862266034969938e-05, "loss": 0.1658, "step": 134750 }, { "epoch": 1.9327568404749613, "grad_norm": 11.454934120178223, "learning_rate": 1.1858282521404746e-05, "loss": 0.1312, "step": 134775 }, { "epoch": 1.9331153559341478, "grad_norm": 6.83821964263916, "learning_rate": 1.1854299007839556e-05, "loss": 0.1088, "step": 134800 }, { "epoch": 1.9334738713933346, "grad_norm": 1.0467324256896973, "learning_rate": 1.1850315494274364e-05, "loss": 0.2336, "step": 134825 }, { "epoch": 1.9338323868525211, "grad_norm": 20.88812828063965, "learning_rate": 1.1846331980709173e-05, "loss": 0.1252, "step": 134850 }, { "epoch": 1.9341909023117076, "grad_norm": 1.634587049484253, "learning_rate": 1.1842348467143981e-05, "loss": 0.1316, "step": 134875 }, { "epoch": 1.9345494177708944, "grad_norm": 1.2997314929962158, "learning_rate": 1.1838364953578789e-05, "loss": 0.1369, "step": 134900 }, { "epoch": 1.934907933230081, "grad_norm": 4.67997932434082, "learning_rate": 1.1834381440013597e-05, "loss": 0.1369, "step": 134925 }, { "epoch": 1.9352664486892674, "grad_norm": 6.112560272216797, "learning_rate": 1.1830397926448405e-05, "loss": 0.1043, "step": 134950 }, { "epoch": 1.9356249641484542, "grad_norm": 7.378050327301025, "learning_rate": 1.1826414412883214e-05, "loss": 0.0426, "step": 134975 }, { "epoch": 1.9359834796076407, "grad_norm": 0.11077173054218292, "learning_rate": 1.1822430899318022e-05, "loss": 0.0845, "step": 135000 }, { "epoch": 1.9363419950668272, "grad_norm": 1.228798270225525, "learning_rate": 1.1818447385752831e-05, "loss": 0.0655, "step": 135025 }, { "epoch": 1.936700510526014, "grad_norm": 4.212104320526123, "learning_rate": 1.181446387218764e-05, "loss": 0.1288, "step": 135050 }, { "epoch": 1.9370590259852005, "grad_norm": 9.745885848999023, "learning_rate": 1.1810480358622447e-05, "loss": 0.0803, "step": 135075 }, { "epoch": 1.937417541444387, "grad_norm": 4.723878383636475, "learning_rate": 1.1806496845057257e-05, "loss": 0.2077, "step": 135100 }, { "epoch": 1.9377760569035738, "grad_norm": 0.032106541097164154, "learning_rate": 1.1802513331492065e-05, "loss": 0.0511, "step": 135125 }, { "epoch": 1.9381345723627603, "grad_norm": 0.9752845168113708, "learning_rate": 1.1798529817926874e-05, "loss": 0.048, "step": 135150 }, { "epoch": 1.9384930878219468, "grad_norm": 0.22273707389831543, "learning_rate": 1.1794546304361682e-05, "loss": 0.105, "step": 135175 }, { "epoch": 1.9388516032811336, "grad_norm": 3.0892231464385986, "learning_rate": 1.1790562790796491e-05, "loss": 0.0801, "step": 135200 }, { "epoch": 1.9392101187403201, "grad_norm": 0.2116813361644745, "learning_rate": 1.17865792772313e-05, "loss": 0.0644, "step": 135225 }, { "epoch": 1.9395686341995066, "grad_norm": 9.734357833862305, "learning_rate": 1.1782595763666107e-05, "loss": 0.11, "step": 135250 }, { "epoch": 1.9399271496586934, "grad_norm": 1.117114543914795, "learning_rate": 1.1778612250100917e-05, "loss": 0.082, "step": 135275 }, { "epoch": 1.94028566511788, "grad_norm": 14.390318870544434, "learning_rate": 1.1774628736535725e-05, "loss": 0.0675, "step": 135300 }, { "epoch": 1.9406441805770664, "grad_norm": 1.0198853015899658, "learning_rate": 1.1770645222970534e-05, "loss": 0.1169, "step": 135325 }, { "epoch": 1.9410026960362532, "grad_norm": 0.46934038400650024, "learning_rate": 1.1766661709405342e-05, "loss": 0.1214, "step": 135350 }, { "epoch": 1.9413612114954397, "grad_norm": 18.33720588684082, "learning_rate": 1.176267819584015e-05, "loss": 0.1344, "step": 135375 }, { "epoch": 1.9417197269546262, "grad_norm": 17.365909576416016, "learning_rate": 1.175869468227496e-05, "loss": 0.1369, "step": 135400 }, { "epoch": 1.942078242413813, "grad_norm": 1.6853891611099243, "learning_rate": 1.1754711168709767e-05, "loss": 0.1707, "step": 135425 }, { "epoch": 1.9424367578729995, "grad_norm": 0.17194634675979614, "learning_rate": 1.1750727655144575e-05, "loss": 0.0698, "step": 135450 }, { "epoch": 1.942795273332186, "grad_norm": 0.6163688898086548, "learning_rate": 1.1746744141579383e-05, "loss": 0.0854, "step": 135475 }, { "epoch": 1.9431537887913728, "grad_norm": 7.913673400878906, "learning_rate": 1.1742760628014192e-05, "loss": 0.1039, "step": 135500 }, { "epoch": 1.9435123042505593, "grad_norm": 0.5265359878540039, "learning_rate": 1.1738777114449e-05, "loss": 0.0632, "step": 135525 }, { "epoch": 1.9438708197097458, "grad_norm": 0.18035556375980377, "learning_rate": 1.1734793600883808e-05, "loss": 0.0416, "step": 135550 }, { "epoch": 1.9442293351689326, "grad_norm": 2.1990654468536377, "learning_rate": 1.1730810087318618e-05, "loss": 0.239, "step": 135575 }, { "epoch": 1.9445878506281191, "grad_norm": 9.22320556640625, "learning_rate": 1.1726826573753425e-05, "loss": 0.1906, "step": 135600 }, { "epoch": 1.9449463660873056, "grad_norm": 12.593631744384766, "learning_rate": 1.1722843060188235e-05, "loss": 0.14, "step": 135625 }, { "epoch": 1.9453048815464924, "grad_norm": 0.9017486572265625, "learning_rate": 1.1718859546623043e-05, "loss": 0.1257, "step": 135650 }, { "epoch": 1.945663397005679, "grad_norm": 4.494324684143066, "learning_rate": 1.171487603305785e-05, "loss": 0.0902, "step": 135675 }, { "epoch": 1.9460219124648654, "grad_norm": 3.688570022583008, "learning_rate": 1.171089251949266e-05, "loss": 0.0536, "step": 135700 }, { "epoch": 1.9463804279240522, "grad_norm": 15.84997844696045, "learning_rate": 1.1706909005927468e-05, "loss": 0.1243, "step": 135725 }, { "epoch": 1.9467389433832387, "grad_norm": 0.26086583733558655, "learning_rate": 1.1702925492362278e-05, "loss": 0.0864, "step": 135750 }, { "epoch": 1.9470974588424252, "grad_norm": 5.7461090087890625, "learning_rate": 1.1698941978797085e-05, "loss": 0.1621, "step": 135775 }, { "epoch": 1.947455974301612, "grad_norm": 14.5596342086792, "learning_rate": 1.1694958465231895e-05, "loss": 0.1088, "step": 135800 }, { "epoch": 1.9478144897607985, "grad_norm": 0.3229467272758484, "learning_rate": 1.1690974951666703e-05, "loss": 0.0992, "step": 135825 }, { "epoch": 1.948173005219985, "grad_norm": 9.724872589111328, "learning_rate": 1.168699143810151e-05, "loss": 0.1215, "step": 135850 }, { "epoch": 1.9485315206791718, "grad_norm": 6.254179000854492, "learning_rate": 1.168300792453632e-05, "loss": 0.0609, "step": 135875 }, { "epoch": 1.9488900361383583, "grad_norm": 2.925645589828491, "learning_rate": 1.1679024410971128e-05, "loss": 0.1007, "step": 135900 }, { "epoch": 1.9492485515975448, "grad_norm": 5.730644702911377, "learning_rate": 1.1675040897405938e-05, "loss": 0.0976, "step": 135925 }, { "epoch": 1.9496070670567316, "grad_norm": 6.544134616851807, "learning_rate": 1.1671057383840745e-05, "loss": 0.0736, "step": 135950 }, { "epoch": 1.9499655825159181, "grad_norm": 0.1288241744041443, "learning_rate": 1.1667073870275553e-05, "loss": 0.1223, "step": 135975 }, { "epoch": 1.9503240979751046, "grad_norm": 8.975451469421387, "learning_rate": 1.1663090356710361e-05, "loss": 0.0674, "step": 136000 }, { "epoch": 1.9506826134342914, "grad_norm": 0.05565647408366203, "learning_rate": 1.1659106843145169e-05, "loss": 0.1129, "step": 136025 }, { "epoch": 1.951041128893478, "grad_norm": 1.2590889930725098, "learning_rate": 1.1655123329579979e-05, "loss": 0.1634, "step": 136050 }, { "epoch": 1.9513996443526644, "grad_norm": 9.665996551513672, "learning_rate": 1.1651139816014786e-05, "loss": 0.054, "step": 136075 }, { "epoch": 1.9517581598118512, "grad_norm": 3.313084602355957, "learning_rate": 1.1647156302449596e-05, "loss": 0.1636, "step": 136100 }, { "epoch": 1.9521166752710377, "grad_norm": 20.005088806152344, "learning_rate": 1.1643172788884404e-05, "loss": 0.2159, "step": 136125 }, { "epoch": 1.9524751907302242, "grad_norm": 13.244016647338867, "learning_rate": 1.1639189275319212e-05, "loss": 0.1004, "step": 136150 }, { "epoch": 1.952833706189411, "grad_norm": 5.336380481719971, "learning_rate": 1.1635205761754021e-05, "loss": 0.1977, "step": 136175 }, { "epoch": 1.9531922216485975, "grad_norm": 14.206463813781738, "learning_rate": 1.1631222248188829e-05, "loss": 0.0957, "step": 136200 }, { "epoch": 1.953550737107784, "grad_norm": 2.3122076988220215, "learning_rate": 1.1627238734623639e-05, "loss": 0.1893, "step": 136225 }, { "epoch": 1.9539092525669708, "grad_norm": 0.5608050227165222, "learning_rate": 1.1623255221058446e-05, "loss": 0.1216, "step": 136250 }, { "epoch": 1.9542677680261573, "grad_norm": 7.359834671020508, "learning_rate": 1.1619271707493254e-05, "loss": 0.1595, "step": 136275 }, { "epoch": 1.9546262834853438, "grad_norm": 1.2635784149169922, "learning_rate": 1.1615288193928064e-05, "loss": 0.1057, "step": 136300 }, { "epoch": 1.9549847989445306, "grad_norm": 3.194915771484375, "learning_rate": 1.1611304680362872e-05, "loss": 0.1014, "step": 136325 }, { "epoch": 1.9553433144037171, "grad_norm": 6.600093841552734, "learning_rate": 1.1607321166797681e-05, "loss": 0.1033, "step": 136350 }, { "epoch": 1.9557018298629036, "grad_norm": 11.583785057067871, "learning_rate": 1.1603337653232489e-05, "loss": 0.1704, "step": 136375 }, { "epoch": 1.9560603453220904, "grad_norm": 4.6247029304504395, "learning_rate": 1.1599354139667297e-05, "loss": 0.1771, "step": 136400 }, { "epoch": 1.956418860781277, "grad_norm": 3.3398728370666504, "learning_rate": 1.1595370626102106e-05, "loss": 0.1535, "step": 136425 }, { "epoch": 1.9567773762404634, "grad_norm": 3.896908760070801, "learning_rate": 1.1591387112536914e-05, "loss": 0.0969, "step": 136450 }, { "epoch": 1.9571358916996502, "grad_norm": 2.354259967803955, "learning_rate": 1.1587403598971724e-05, "loss": 0.0985, "step": 136475 }, { "epoch": 1.9574944071588367, "grad_norm": 0.38598859310150146, "learning_rate": 1.1583420085406532e-05, "loss": 0.0913, "step": 136500 }, { "epoch": 1.9578529226180232, "grad_norm": 1.6943076848983765, "learning_rate": 1.1579436571841341e-05, "loss": 0.1136, "step": 136525 }, { "epoch": 1.95821143807721, "grad_norm": 4.445244312286377, "learning_rate": 1.1575453058276147e-05, "loss": 0.1682, "step": 136550 }, { "epoch": 1.9585699535363965, "grad_norm": 1.3943649530410767, "learning_rate": 1.1571469544710955e-05, "loss": 0.1234, "step": 136575 }, { "epoch": 1.958928468995583, "grad_norm": 0.07032905519008636, "learning_rate": 1.1567486031145765e-05, "loss": 0.085, "step": 136600 }, { "epoch": 1.9592869844547698, "grad_norm": 1.392722487449646, "learning_rate": 1.1563502517580573e-05, "loss": 0.0915, "step": 136625 }, { "epoch": 1.9596454999139563, "grad_norm": 1.845420479774475, "learning_rate": 1.1559519004015382e-05, "loss": 0.1842, "step": 136650 }, { "epoch": 1.9600040153731428, "grad_norm": 9.200510025024414, "learning_rate": 1.155553549045019e-05, "loss": 0.07, "step": 136675 }, { "epoch": 1.9603625308323296, "grad_norm": 3.661777973175049, "learning_rate": 1.1551551976884998e-05, "loss": 0.0975, "step": 136700 }, { "epoch": 1.9607210462915161, "grad_norm": 19.0012149810791, "learning_rate": 1.1547568463319807e-05, "loss": 0.1588, "step": 136725 }, { "epoch": 1.9610795617507026, "grad_norm": 3.8448173999786377, "learning_rate": 1.1543584949754615e-05, "loss": 0.0684, "step": 136750 }, { "epoch": 1.9614380772098894, "grad_norm": 7.4955878257751465, "learning_rate": 1.1539601436189425e-05, "loss": 0.2043, "step": 136775 }, { "epoch": 1.961796592669076, "grad_norm": 0.9750637412071228, "learning_rate": 1.1535617922624233e-05, "loss": 0.1008, "step": 136800 }, { "epoch": 1.9621551081282624, "grad_norm": 1.7857835292816162, "learning_rate": 1.1531634409059042e-05, "loss": 0.081, "step": 136825 }, { "epoch": 1.9625136235874492, "grad_norm": 1.2437677383422852, "learning_rate": 1.152765089549385e-05, "loss": 0.0781, "step": 136850 }, { "epoch": 1.9628721390466357, "grad_norm": 0.03123277612030506, "learning_rate": 1.1523667381928658e-05, "loss": 0.1518, "step": 136875 }, { "epoch": 1.9632306545058222, "grad_norm": 2.7281384468078613, "learning_rate": 1.1519683868363467e-05, "loss": 0.1359, "step": 136900 }, { "epoch": 1.963589169965009, "grad_norm": 9.831912994384766, "learning_rate": 1.1515700354798275e-05, "loss": 0.2017, "step": 136925 }, { "epoch": 1.9639476854241955, "grad_norm": 2.0979912281036377, "learning_rate": 1.1511716841233085e-05, "loss": 0.077, "step": 136950 }, { "epoch": 1.964306200883382, "grad_norm": 3.960146427154541, "learning_rate": 1.1507733327667893e-05, "loss": 0.1106, "step": 136975 }, { "epoch": 1.9646647163425688, "grad_norm": 1.4788695573806763, "learning_rate": 1.15037498141027e-05, "loss": 0.1274, "step": 137000 }, { "epoch": 1.9650232318017553, "grad_norm": 6.104147911071777, "learning_rate": 1.149976630053751e-05, "loss": 0.1105, "step": 137025 }, { "epoch": 1.9653817472609418, "grad_norm": 5.186028003692627, "learning_rate": 1.1495782786972318e-05, "loss": 0.1283, "step": 137050 }, { "epoch": 1.9657402627201286, "grad_norm": 0.12470059841871262, "learning_rate": 1.1491799273407127e-05, "loss": 0.109, "step": 137075 }, { "epoch": 1.966098778179315, "grad_norm": 19.138545989990234, "learning_rate": 1.1487815759841933e-05, "loss": 0.0838, "step": 137100 }, { "epoch": 1.9664572936385016, "grad_norm": 11.774962425231934, "learning_rate": 1.1483832246276743e-05, "loss": 0.1102, "step": 137125 }, { "epoch": 1.9668158090976884, "grad_norm": 1.3015326261520386, "learning_rate": 1.147984873271155e-05, "loss": 0.1678, "step": 137150 }, { "epoch": 1.967174324556875, "grad_norm": 0.7966116070747375, "learning_rate": 1.1475865219146359e-05, "loss": 0.1106, "step": 137175 }, { "epoch": 1.9675328400160614, "grad_norm": 11.506630897521973, "learning_rate": 1.1471881705581168e-05, "loss": 0.0979, "step": 137200 }, { "epoch": 1.9678913554752482, "grad_norm": 3.8871328830718994, "learning_rate": 1.1467898192015976e-05, "loss": 0.1211, "step": 137225 }, { "epoch": 1.9682498709344347, "grad_norm": 4.095094203948975, "learning_rate": 1.1463914678450786e-05, "loss": 0.1413, "step": 137250 }, { "epoch": 1.9686083863936212, "grad_norm": 1.9715174436569214, "learning_rate": 1.1459931164885593e-05, "loss": 0.1217, "step": 137275 }, { "epoch": 1.968966901852808, "grad_norm": 0.5345528721809387, "learning_rate": 1.1455947651320401e-05, "loss": 0.0775, "step": 137300 }, { "epoch": 1.9693254173119945, "grad_norm": 8.932374000549316, "learning_rate": 1.145196413775521e-05, "loss": 0.0885, "step": 137325 }, { "epoch": 1.969683932771181, "grad_norm": 0.4938107430934906, "learning_rate": 1.1447980624190019e-05, "loss": 0.1269, "step": 137350 }, { "epoch": 1.9700424482303678, "grad_norm": 3.137683868408203, "learning_rate": 1.1443997110624828e-05, "loss": 0.0491, "step": 137375 }, { "epoch": 1.9704009636895543, "grad_norm": 7.628324031829834, "learning_rate": 1.1440013597059636e-05, "loss": 0.0796, "step": 137400 }, { "epoch": 1.9707594791487408, "grad_norm": 3.790001630783081, "learning_rate": 1.1436030083494446e-05, "loss": 0.117, "step": 137425 }, { "epoch": 1.9711179946079276, "grad_norm": 3.8347651958465576, "learning_rate": 1.1432046569929253e-05, "loss": 0.1912, "step": 137450 }, { "epoch": 1.971476510067114, "grad_norm": 0.628337562084198, "learning_rate": 1.1428063056364061e-05, "loss": 0.1612, "step": 137475 }, { "epoch": 1.9718350255263006, "grad_norm": 9.290430068969727, "learning_rate": 1.142407954279887e-05, "loss": 0.0775, "step": 137500 }, { "epoch": 1.9721935409854874, "grad_norm": 1.6028764247894287, "learning_rate": 1.1420096029233679e-05, "loss": 0.101, "step": 137525 }, { "epoch": 1.972552056444674, "grad_norm": 0.28795644640922546, "learning_rate": 1.1416112515668488e-05, "loss": 0.1444, "step": 137550 }, { "epoch": 1.9729105719038604, "grad_norm": 0.4619351625442505, "learning_rate": 1.1412129002103296e-05, "loss": 0.1729, "step": 137575 }, { "epoch": 1.9732690873630472, "grad_norm": 1.338689923286438, "learning_rate": 1.1408145488538104e-05, "loss": 0.0666, "step": 137600 }, { "epoch": 1.9736276028222337, "grad_norm": 7.560879230499268, "learning_rate": 1.1404161974972913e-05, "loss": 0.0932, "step": 137625 }, { "epoch": 1.9739861182814202, "grad_norm": 3.39801025390625, "learning_rate": 1.140017846140772e-05, "loss": 0.1418, "step": 137650 }, { "epoch": 1.974344633740607, "grad_norm": 1.8904558420181274, "learning_rate": 1.1396194947842529e-05, "loss": 0.0785, "step": 137675 }, { "epoch": 1.9747031491997935, "grad_norm": 19.66830062866211, "learning_rate": 1.1392211434277337e-05, "loss": 0.1003, "step": 137700 }, { "epoch": 1.97506166465898, "grad_norm": 16.157806396484375, "learning_rate": 1.1388227920712146e-05, "loss": 0.0965, "step": 137725 }, { "epoch": 1.9754201801181668, "grad_norm": 0.9876415133476257, "learning_rate": 1.1384244407146954e-05, "loss": 0.1318, "step": 137750 }, { "epoch": 1.9757786955773533, "grad_norm": 4.168567180633545, "learning_rate": 1.1380260893581762e-05, "loss": 0.0912, "step": 137775 }, { "epoch": 1.9761372110365398, "grad_norm": 8.480319023132324, "learning_rate": 1.1376277380016572e-05, "loss": 0.1858, "step": 137800 }, { "epoch": 1.9764957264957266, "grad_norm": 7.600017070770264, "learning_rate": 1.137229386645138e-05, "loss": 0.1051, "step": 137825 }, { "epoch": 1.976854241954913, "grad_norm": 3.024646043777466, "learning_rate": 1.1368310352886189e-05, "loss": 0.0785, "step": 137850 }, { "epoch": 1.9772127574140996, "grad_norm": 1.258904218673706, "learning_rate": 1.1364326839320997e-05, "loss": 0.0578, "step": 137875 }, { "epoch": 1.9775712728732864, "grad_norm": 3.5187830924987793, "learning_rate": 1.1360343325755805e-05, "loss": 0.1145, "step": 137900 }, { "epoch": 1.977929788332473, "grad_norm": 10.70605182647705, "learning_rate": 1.1356359812190614e-05, "loss": 0.1346, "step": 137925 }, { "epoch": 1.9782883037916594, "grad_norm": 3.259437084197998, "learning_rate": 1.1352376298625422e-05, "loss": 0.1148, "step": 137950 }, { "epoch": 1.9786468192508462, "grad_norm": 4.8428544998168945, "learning_rate": 1.1348392785060232e-05, "loss": 0.1088, "step": 137975 }, { "epoch": 1.9790053347100327, "grad_norm": 8.188169479370117, "learning_rate": 1.134440927149504e-05, "loss": 0.1342, "step": 138000 }, { "epoch": 1.9793638501692192, "grad_norm": 1.790804386138916, "learning_rate": 1.1340425757929849e-05, "loss": 0.1234, "step": 138025 }, { "epoch": 1.979722365628406, "grad_norm": 2.5916552543640137, "learning_rate": 1.1336442244364657e-05, "loss": 0.1475, "step": 138050 }, { "epoch": 1.9800808810875925, "grad_norm": 0.35359466075897217, "learning_rate": 1.1332458730799465e-05, "loss": 0.1376, "step": 138075 }, { "epoch": 1.980439396546779, "grad_norm": 4.9834394454956055, "learning_rate": 1.1328475217234274e-05, "loss": 0.1168, "step": 138100 }, { "epoch": 1.9807979120059658, "grad_norm": 0.2955259680747986, "learning_rate": 1.1324491703669082e-05, "loss": 0.1025, "step": 138125 }, { "epoch": 1.9811564274651523, "grad_norm": 1.5222604274749756, "learning_rate": 1.1320508190103892e-05, "loss": 0.1559, "step": 138150 }, { "epoch": 1.9815149429243388, "grad_norm": 19.36206817626953, "learning_rate": 1.13165246765387e-05, "loss": 0.1765, "step": 138175 }, { "epoch": 1.9818734583835256, "grad_norm": 17.28472900390625, "learning_rate": 1.1312541162973506e-05, "loss": 0.1466, "step": 138200 }, { "epoch": 1.982231973842712, "grad_norm": 6.318155765533447, "learning_rate": 1.1308557649408315e-05, "loss": 0.1411, "step": 138225 }, { "epoch": 1.9825904893018986, "grad_norm": 0.28583526611328125, "learning_rate": 1.1304574135843123e-05, "loss": 0.123, "step": 138250 }, { "epoch": 1.9829490047610854, "grad_norm": 0.6902658343315125, "learning_rate": 1.1300590622277933e-05, "loss": 0.1615, "step": 138275 }, { "epoch": 1.983307520220272, "grad_norm": 15.267078399658203, "learning_rate": 1.129660710871274e-05, "loss": 0.1936, "step": 138300 }, { "epoch": 1.9836660356794584, "grad_norm": 2.5662505626678467, "learning_rate": 1.129262359514755e-05, "loss": 0.0932, "step": 138325 }, { "epoch": 1.9840245511386452, "grad_norm": 0.9953246712684631, "learning_rate": 1.1288640081582358e-05, "loss": 0.1146, "step": 138350 }, { "epoch": 1.9843830665978317, "grad_norm": 9.590473175048828, "learning_rate": 1.1284656568017166e-05, "loss": 0.1565, "step": 138375 }, { "epoch": 1.9847415820570182, "grad_norm": 1.2241379022598267, "learning_rate": 1.1280673054451975e-05, "loss": 0.1561, "step": 138400 }, { "epoch": 1.985100097516205, "grad_norm": 1.1714197397232056, "learning_rate": 1.1276689540886783e-05, "loss": 0.0934, "step": 138425 }, { "epoch": 1.9854586129753915, "grad_norm": 12.872227668762207, "learning_rate": 1.1272706027321593e-05, "loss": 0.0779, "step": 138450 }, { "epoch": 1.985817128434578, "grad_norm": 13.798563003540039, "learning_rate": 1.12687225137564e-05, "loss": 0.0795, "step": 138475 }, { "epoch": 1.9861756438937648, "grad_norm": 5.930870056152344, "learning_rate": 1.1264739000191208e-05, "loss": 0.1295, "step": 138500 }, { "epoch": 1.9865341593529513, "grad_norm": 20.700883865356445, "learning_rate": 1.1260755486626018e-05, "loss": 0.1221, "step": 138525 }, { "epoch": 1.9868926748121378, "grad_norm": 0.2978003919124603, "learning_rate": 1.1256771973060826e-05, "loss": 0.0956, "step": 138550 }, { "epoch": 1.9872511902713246, "grad_norm": 15.72806453704834, "learning_rate": 1.1252788459495635e-05, "loss": 0.0993, "step": 138575 }, { "epoch": 1.987609705730511, "grad_norm": 0.4820093512535095, "learning_rate": 1.1248804945930443e-05, "loss": 0.0628, "step": 138600 }, { "epoch": 1.9879682211896976, "grad_norm": 0.1601867824792862, "learning_rate": 1.1244821432365253e-05, "loss": 0.1232, "step": 138625 }, { "epoch": 1.9883267366488844, "grad_norm": 0.27613991498947144, "learning_rate": 1.124083791880006e-05, "loss": 0.1461, "step": 138650 }, { "epoch": 1.988685252108071, "grad_norm": 0.3494299054145813, "learning_rate": 1.1236854405234868e-05, "loss": 0.0967, "step": 138675 }, { "epoch": 1.9890437675672574, "grad_norm": 3.54849910736084, "learning_rate": 1.1232870891669678e-05, "loss": 0.1024, "step": 138700 }, { "epoch": 1.9894022830264442, "grad_norm": 1.9141390323638916, "learning_rate": 1.1228887378104486e-05, "loss": 0.0692, "step": 138725 }, { "epoch": 1.9897607984856307, "grad_norm": 10.346924781799316, "learning_rate": 1.1224903864539294e-05, "loss": 0.1436, "step": 138750 }, { "epoch": 1.9901193139448172, "grad_norm": 0.6556389331817627, "learning_rate": 1.1220920350974101e-05, "loss": 0.0678, "step": 138775 }, { "epoch": 1.990477829404004, "grad_norm": 9.998923301696777, "learning_rate": 1.121693683740891e-05, "loss": 0.1387, "step": 138800 }, { "epoch": 1.9908363448631905, "grad_norm": 6.879919052124023, "learning_rate": 1.1212953323843719e-05, "loss": 0.1031, "step": 138825 }, { "epoch": 1.991194860322377, "grad_norm": 7.872308731079102, "learning_rate": 1.1208969810278527e-05, "loss": 0.1779, "step": 138850 }, { "epoch": 1.9915533757815638, "grad_norm": 6.397178649902344, "learning_rate": 1.1204986296713336e-05, "loss": 0.1166, "step": 138875 }, { "epoch": 1.9919118912407503, "grad_norm": 4.938756942749023, "learning_rate": 1.1201002783148144e-05, "loss": 0.1513, "step": 138900 }, { "epoch": 1.9922704066999368, "grad_norm": 14.995183944702148, "learning_rate": 1.1197019269582954e-05, "loss": 0.1507, "step": 138925 }, { "epoch": 1.9926289221591236, "grad_norm": 9.662007331848145, "learning_rate": 1.1193035756017761e-05, "loss": 0.0618, "step": 138950 }, { "epoch": 1.99298743761831, "grad_norm": 5.616464138031006, "learning_rate": 1.118905224245257e-05, "loss": 0.1615, "step": 138975 }, { "epoch": 1.9933459530774966, "grad_norm": 0.3559759259223938, "learning_rate": 1.1185068728887379e-05, "loss": 0.0948, "step": 139000 }, { "epoch": 1.9937044685366834, "grad_norm": 0.9859103560447693, "learning_rate": 1.1181085215322187e-05, "loss": 0.106, "step": 139025 }, { "epoch": 1.99406298399587, "grad_norm": 0.9904047846794128, "learning_rate": 1.1177101701756996e-05, "loss": 0.1345, "step": 139050 }, { "epoch": 1.9944214994550564, "grad_norm": 0.6341187953948975, "learning_rate": 1.1173118188191804e-05, "loss": 0.1423, "step": 139075 }, { "epoch": 1.9947800149142432, "grad_norm": 0.3884458541870117, "learning_rate": 1.1169134674626612e-05, "loss": 0.1096, "step": 139100 }, { "epoch": 1.9951385303734297, "grad_norm": 2.4915313720703125, "learning_rate": 1.1165151161061421e-05, "loss": 0.1141, "step": 139125 }, { "epoch": 1.9954970458326162, "grad_norm": 3.6571218967437744, "learning_rate": 1.116116764749623e-05, "loss": 0.0901, "step": 139150 }, { "epoch": 1.995855561291803, "grad_norm": 12.776862144470215, "learning_rate": 1.1157184133931039e-05, "loss": 0.1627, "step": 139175 }, { "epoch": 1.9962140767509895, "grad_norm": 2.593538761138916, "learning_rate": 1.1153200620365847e-05, "loss": 0.162, "step": 139200 }, { "epoch": 1.996572592210176, "grad_norm": 0.9422709941864014, "learning_rate": 1.1149217106800656e-05, "loss": 0.1736, "step": 139225 }, { "epoch": 1.9969311076693628, "grad_norm": 0.9894459247589111, "learning_rate": 1.1145233593235464e-05, "loss": 0.1062, "step": 139250 }, { "epoch": 1.9972896231285493, "grad_norm": 1.1603844165802002, "learning_rate": 1.1141250079670272e-05, "loss": 0.128, "step": 139275 }, { "epoch": 1.9976481385877358, "grad_norm": 17.455547332763672, "learning_rate": 1.113726656610508e-05, "loss": 0.151, "step": 139300 }, { "epoch": 1.9980066540469226, "grad_norm": 14.648189544677734, "learning_rate": 1.1133283052539887e-05, "loss": 0.1232, "step": 139325 }, { "epoch": 1.998365169506109, "grad_norm": 2.5697102546691895, "learning_rate": 1.1129299538974697e-05, "loss": 0.1518, "step": 139350 }, { "epoch": 1.9987236849652956, "grad_norm": 0.144203782081604, "learning_rate": 1.1125316025409505e-05, "loss": 0.12, "step": 139375 }, { "epoch": 1.9990822004244824, "grad_norm": 17.63067626953125, "learning_rate": 1.1121332511844313e-05, "loss": 0.0904, "step": 139400 }, { "epoch": 1.999440715883669, "grad_norm": 1.4722813367843628, "learning_rate": 1.1117348998279122e-05, "loss": 0.0871, "step": 139425 }, { "epoch": 1.9997992313428554, "grad_norm": 0.8995280861854553, "learning_rate": 1.111336548471393e-05, "loss": 0.0429, "step": 139450 }, { "epoch": 2.0, "eval_cosine_accuracy": 0.9559538274605103, "eval_loss": 0.21542182564735413, "eval_runtime": 196.296, "eval_samples_per_second": 33.541, "eval_steps_per_second": 2.099, "step": 139464 }, { "epoch": 2.000157746802042, "grad_norm": 2.8002636432647705, "learning_rate": 1.110938197114874e-05, "loss": 0.0915, "step": 139475 }, { "epoch": 2.000516262261229, "grad_norm": 3.8929085731506348, "learning_rate": 1.1105398457583547e-05, "loss": 0.0835, "step": 139500 }, { "epoch": 2.000874777720415, "grad_norm": 25.709495544433594, "learning_rate": 1.1101414944018357e-05, "loss": 0.0666, "step": 139525 }, { "epoch": 2.001233293179602, "grad_norm": 0.02276797965168953, "learning_rate": 1.1097431430453165e-05, "loss": 0.1189, "step": 139550 }, { "epoch": 2.0015918086387887, "grad_norm": 1.1913044452667236, "learning_rate": 1.1093447916887973e-05, "loss": 0.1097, "step": 139575 }, { "epoch": 2.001950324097975, "grad_norm": 0.6335310935974121, "learning_rate": 1.1089464403322782e-05, "loss": 0.1174, "step": 139600 }, { "epoch": 2.0023088395571618, "grad_norm": 0.05425674095749855, "learning_rate": 1.108548088975759e-05, "loss": 0.0771, "step": 139625 }, { "epoch": 2.0026673550163485, "grad_norm": 3.336557388305664, "learning_rate": 1.10814973761924e-05, "loss": 0.1186, "step": 139650 }, { "epoch": 2.003025870475535, "grad_norm": 0.3587735593318939, "learning_rate": 1.1077513862627207e-05, "loss": 0.2004, "step": 139675 }, { "epoch": 2.0033843859347216, "grad_norm": 0.6804975271224976, "learning_rate": 1.1073530349062015e-05, "loss": 0.0493, "step": 139700 }, { "epoch": 2.0037429013939083, "grad_norm": 1.0024101734161377, "learning_rate": 1.1069546835496825e-05, "loss": 0.0808, "step": 139725 }, { "epoch": 2.0041014168530946, "grad_norm": 0.18602479994297028, "learning_rate": 1.1065563321931633e-05, "loss": 0.1124, "step": 139750 }, { "epoch": 2.0044599323122814, "grad_norm": 10.627523422241211, "learning_rate": 1.1061579808366442e-05, "loss": 0.1282, "step": 139775 }, { "epoch": 2.004818447771468, "grad_norm": 0.162591814994812, "learning_rate": 1.105759629480125e-05, "loss": 0.0803, "step": 139800 }, { "epoch": 2.0051769632306544, "grad_norm": 3.5721852779388428, "learning_rate": 1.1053612781236058e-05, "loss": 0.0334, "step": 139825 }, { "epoch": 2.005535478689841, "grad_norm": 1.5860037803649902, "learning_rate": 1.1049629267670866e-05, "loss": 0.0748, "step": 139850 }, { "epoch": 2.005893994149028, "grad_norm": 6.634640216827393, "learning_rate": 1.1045645754105674e-05, "loss": 0.0362, "step": 139875 }, { "epoch": 2.006252509608214, "grad_norm": 12.101033210754395, "learning_rate": 1.1041662240540483e-05, "loss": 0.0955, "step": 139900 }, { "epoch": 2.006611025067401, "grad_norm": 5.623154163360596, "learning_rate": 1.1037678726975291e-05, "loss": 0.1229, "step": 139925 }, { "epoch": 2.0069695405265877, "grad_norm": 1.6541951894760132, "learning_rate": 1.10336952134101e-05, "loss": 0.0637, "step": 139950 }, { "epoch": 2.007328055985774, "grad_norm": 11.673616409301758, "learning_rate": 1.1029711699844908e-05, "loss": 0.1385, "step": 139975 }, { "epoch": 2.0076865714449608, "grad_norm": 0.9126507043838501, "learning_rate": 1.1025728186279716e-05, "loss": 0.0394, "step": 140000 }, { "epoch": 2.0080450869041475, "grad_norm": 1.9946032762527466, "learning_rate": 1.1021744672714526e-05, "loss": 0.0545, "step": 140025 }, { "epoch": 2.008403602363334, "grad_norm": 0.44317808747291565, "learning_rate": 1.1017761159149334e-05, "loss": 0.0366, "step": 140050 }, { "epoch": 2.0087621178225206, "grad_norm": 0.16288071870803833, "learning_rate": 1.1013777645584143e-05, "loss": 0.0854, "step": 140075 }, { "epoch": 2.0091206332817073, "grad_norm": 1.0590510368347168, "learning_rate": 1.1009794132018951e-05, "loss": 0.0591, "step": 140100 }, { "epoch": 2.0094791487408936, "grad_norm": 10.13278865814209, "learning_rate": 1.100581061845376e-05, "loss": 0.0864, "step": 140125 }, { "epoch": 2.0098376642000804, "grad_norm": 0.5057254433631897, "learning_rate": 1.1001827104888568e-05, "loss": 0.1047, "step": 140150 }, { "epoch": 2.010196179659267, "grad_norm": 0.04100755229592323, "learning_rate": 1.0997843591323376e-05, "loss": 0.0649, "step": 140175 }, { "epoch": 2.0105546951184534, "grad_norm": 2.2890982627868652, "learning_rate": 1.0993860077758186e-05, "loss": 0.0733, "step": 140200 }, { "epoch": 2.01091321057764, "grad_norm": 1.8120449781417847, "learning_rate": 1.0989876564192994e-05, "loss": 0.0726, "step": 140225 }, { "epoch": 2.011271726036827, "grad_norm": 0.05500384420156479, "learning_rate": 1.0985893050627803e-05, "loss": 0.1284, "step": 140250 }, { "epoch": 2.011630241496013, "grad_norm": 2.627422332763672, "learning_rate": 1.0981909537062611e-05, "loss": 0.0793, "step": 140275 }, { "epoch": 2.0119887569552, "grad_norm": 0.8103851675987244, "learning_rate": 1.0977926023497419e-05, "loss": 0.0741, "step": 140300 }, { "epoch": 2.0123472724143867, "grad_norm": 0.07674331218004227, "learning_rate": 1.0973942509932228e-05, "loss": 0.0498, "step": 140325 }, { "epoch": 2.012705787873573, "grad_norm": 2.4054806232452393, "learning_rate": 1.0969958996367036e-05, "loss": 0.1189, "step": 140350 }, { "epoch": 2.0130643033327598, "grad_norm": 1.8133282661437988, "learning_rate": 1.0965975482801844e-05, "loss": 0.0878, "step": 140375 }, { "epoch": 2.0134228187919465, "grad_norm": 9.185224533081055, "learning_rate": 1.0961991969236652e-05, "loss": 0.0649, "step": 140400 }, { "epoch": 2.013781334251133, "grad_norm": 8.821386337280273, "learning_rate": 1.0958008455671461e-05, "loss": 0.1267, "step": 140425 }, { "epoch": 2.0141398497103196, "grad_norm": 3.5357518196105957, "learning_rate": 1.095402494210627e-05, "loss": 0.0575, "step": 140450 }, { "epoch": 2.0144983651695063, "grad_norm": 0.6717970371246338, "learning_rate": 1.0950041428541077e-05, "loss": 0.0752, "step": 140475 }, { "epoch": 2.0148568806286926, "grad_norm": 7.804410457611084, "learning_rate": 1.0946057914975887e-05, "loss": 0.1109, "step": 140500 }, { "epoch": 2.0152153960878794, "grad_norm": 0.3547665774822235, "learning_rate": 1.0942074401410695e-05, "loss": 0.1532, "step": 140525 }, { "epoch": 2.015573911547066, "grad_norm": 1.634012222290039, "learning_rate": 1.0938090887845504e-05, "loss": 0.0805, "step": 140550 }, { "epoch": 2.0159324270062524, "grad_norm": 0.647596001625061, "learning_rate": 1.0934107374280312e-05, "loss": 0.137, "step": 140575 }, { "epoch": 2.016290942465439, "grad_norm": 0.0575438030064106, "learning_rate": 1.093012386071512e-05, "loss": 0.1424, "step": 140600 }, { "epoch": 2.016649457924626, "grad_norm": 0.1399017572402954, "learning_rate": 1.092614034714993e-05, "loss": 0.0798, "step": 140625 }, { "epoch": 2.017007973383812, "grad_norm": 11.830355644226074, "learning_rate": 1.0922156833584737e-05, "loss": 0.0804, "step": 140650 }, { "epoch": 2.017366488842999, "grad_norm": 1.394835352897644, "learning_rate": 1.0918173320019547e-05, "loss": 0.0633, "step": 140675 }, { "epoch": 2.0177250043021857, "grad_norm": 5.829293251037598, "learning_rate": 1.0914189806454355e-05, "loss": 0.1167, "step": 140700 }, { "epoch": 2.018083519761372, "grad_norm": 0.4362693428993225, "learning_rate": 1.0910206292889164e-05, "loss": 0.058, "step": 140725 }, { "epoch": 2.0184420352205588, "grad_norm": 0.7628694772720337, "learning_rate": 1.0906222779323972e-05, "loss": 0.0688, "step": 140750 }, { "epoch": 2.0188005506797455, "grad_norm": 0.27646535634994507, "learning_rate": 1.090223926575878e-05, "loss": 0.1125, "step": 140775 }, { "epoch": 2.019159066138932, "grad_norm": 8.835901260375977, "learning_rate": 1.089825575219359e-05, "loss": 0.0874, "step": 140800 }, { "epoch": 2.0195175815981186, "grad_norm": 17.66066551208496, "learning_rate": 1.0894272238628397e-05, "loss": 0.1229, "step": 140825 }, { "epoch": 2.0198760970573053, "grad_norm": 0.31540149450302124, "learning_rate": 1.0890288725063207e-05, "loss": 0.0743, "step": 140850 }, { "epoch": 2.0202346125164916, "grad_norm": 6.006246089935303, "learning_rate": 1.0886305211498015e-05, "loss": 0.088, "step": 140875 }, { "epoch": 2.0205931279756784, "grad_norm": 9.653518676757812, "learning_rate": 1.0882321697932822e-05, "loss": 0.1317, "step": 140900 }, { "epoch": 2.020951643434865, "grad_norm": 0.05089440196752548, "learning_rate": 1.0878338184367632e-05, "loss": 0.1007, "step": 140925 }, { "epoch": 2.0213101588940514, "grad_norm": 1.0707098245620728, "learning_rate": 1.0874354670802438e-05, "loss": 0.0908, "step": 140950 }, { "epoch": 2.021668674353238, "grad_norm": 18.28034019470215, "learning_rate": 1.0870371157237248e-05, "loss": 0.1282, "step": 140975 }, { "epoch": 2.022027189812425, "grad_norm": 1.4629207849502563, "learning_rate": 1.0866387643672055e-05, "loss": 0.1418, "step": 141000 }, { "epoch": 2.022385705271611, "grad_norm": 8.322961807250977, "learning_rate": 1.0862404130106863e-05, "loss": 0.0722, "step": 141025 }, { "epoch": 2.022744220730798, "grad_norm": 0.08360534906387329, "learning_rate": 1.0858420616541673e-05, "loss": 0.1207, "step": 141050 }, { "epoch": 2.0231027361899847, "grad_norm": 2.3016254901885986, "learning_rate": 1.085443710297648e-05, "loss": 0.0503, "step": 141075 }, { "epoch": 2.023461251649171, "grad_norm": 0.42678260803222656, "learning_rate": 1.085045358941129e-05, "loss": 0.0284, "step": 141100 }, { "epoch": 2.0238197671083578, "grad_norm": 0.9419455528259277, "learning_rate": 1.0846470075846098e-05, "loss": 0.1024, "step": 141125 }, { "epoch": 2.0241782825675445, "grad_norm": 0.6115884780883789, "learning_rate": 1.0842486562280908e-05, "loss": 0.1241, "step": 141150 }, { "epoch": 2.024536798026731, "grad_norm": 3.523289680480957, "learning_rate": 1.0838503048715715e-05, "loss": 0.0408, "step": 141175 }, { "epoch": 2.0248953134859176, "grad_norm": 9.75903606414795, "learning_rate": 1.0834519535150523e-05, "loss": 0.0853, "step": 141200 }, { "epoch": 2.0252538289451043, "grad_norm": 1.193331003189087, "learning_rate": 1.0830536021585333e-05, "loss": 0.1047, "step": 141225 }, { "epoch": 2.0256123444042906, "grad_norm": 7.807082653045654, "learning_rate": 1.082655250802014e-05, "loss": 0.1457, "step": 141250 }, { "epoch": 2.0259708598634774, "grad_norm": 0.3469778001308441, "learning_rate": 1.082256899445495e-05, "loss": 0.1463, "step": 141275 }, { "epoch": 2.026329375322664, "grad_norm": 6.092298984527588, "learning_rate": 1.0818585480889758e-05, "loss": 0.0873, "step": 141300 }, { "epoch": 2.0266878907818504, "grad_norm": 0.4549410939216614, "learning_rate": 1.0814601967324566e-05, "loss": 0.0869, "step": 141325 }, { "epoch": 2.027046406241037, "grad_norm": 11.414936065673828, "learning_rate": 1.0810618453759375e-05, "loss": 0.093, "step": 141350 }, { "epoch": 2.027404921700224, "grad_norm": 9.475211143493652, "learning_rate": 1.0806634940194183e-05, "loss": 0.1137, "step": 141375 }, { "epoch": 2.02776343715941, "grad_norm": 0.6529891490936279, "learning_rate": 1.0802651426628993e-05, "loss": 0.1085, "step": 141400 }, { "epoch": 2.028121952618597, "grad_norm": 0.4404282867908478, "learning_rate": 1.07986679130638e-05, "loss": 0.1201, "step": 141425 }, { "epoch": 2.0284804680777837, "grad_norm": 14.207696914672852, "learning_rate": 1.079468439949861e-05, "loss": 0.1499, "step": 141450 }, { "epoch": 2.02883898353697, "grad_norm": 6.71413516998291, "learning_rate": 1.0790700885933418e-05, "loss": 0.0872, "step": 141475 }, { "epoch": 2.0291974989961568, "grad_norm": 1.623770833015442, "learning_rate": 1.0786717372368224e-05, "loss": 0.1289, "step": 141500 }, { "epoch": 2.0295560144553435, "grad_norm": 2.250915050506592, "learning_rate": 1.0782733858803034e-05, "loss": 0.0711, "step": 141525 }, { "epoch": 2.02991452991453, "grad_norm": 7.129426002502441, "learning_rate": 1.0778750345237842e-05, "loss": 0.1003, "step": 141550 }, { "epoch": 2.0302730453737166, "grad_norm": 4.194611549377441, "learning_rate": 1.0774766831672651e-05, "loss": 0.1668, "step": 141575 }, { "epoch": 2.0306315608329033, "grad_norm": 1.821471095085144, "learning_rate": 1.0770783318107459e-05, "loss": 0.1673, "step": 141600 }, { "epoch": 2.0309900762920896, "grad_norm": 16.07676887512207, "learning_rate": 1.0766799804542267e-05, "loss": 0.0836, "step": 141625 }, { "epoch": 2.0313485917512764, "grad_norm": 0.6478937268257141, "learning_rate": 1.0762816290977076e-05, "loss": 0.157, "step": 141650 }, { "epoch": 2.031707107210463, "grad_norm": 0.27013099193573, "learning_rate": 1.0758832777411884e-05, "loss": 0.1562, "step": 141675 }, { "epoch": 2.0320656226696494, "grad_norm": 0.3563746213912964, "learning_rate": 1.0754849263846694e-05, "loss": 0.059, "step": 141700 }, { "epoch": 2.032424138128836, "grad_norm": 0.9662806391716003, "learning_rate": 1.0750865750281502e-05, "loss": 0.1278, "step": 141725 }, { "epoch": 2.032782653588023, "grad_norm": 19.52045440673828, "learning_rate": 1.0746882236716311e-05, "loss": 0.0959, "step": 141750 }, { "epoch": 2.033141169047209, "grad_norm": 10.51535701751709, "learning_rate": 1.0742898723151119e-05, "loss": 0.0731, "step": 141775 }, { "epoch": 2.033499684506396, "grad_norm": 10.57571792602539, "learning_rate": 1.0738915209585927e-05, "loss": 0.0513, "step": 141800 }, { "epoch": 2.0338581999655827, "grad_norm": 3.2009267807006836, "learning_rate": 1.0734931696020736e-05, "loss": 0.1092, "step": 141825 }, { "epoch": 2.034216715424769, "grad_norm": 2.071615219116211, "learning_rate": 1.0730948182455544e-05, "loss": 0.1289, "step": 141850 }, { "epoch": 2.0345752308839558, "grad_norm": 2.8842391967773438, "learning_rate": 1.0726964668890354e-05, "loss": 0.078, "step": 141875 }, { "epoch": 2.0349337463431425, "grad_norm": 3.7829067707061768, "learning_rate": 1.0722981155325162e-05, "loss": 0.036, "step": 141900 }, { "epoch": 2.035292261802329, "grad_norm": 13.422252655029297, "learning_rate": 1.071899764175997e-05, "loss": 0.0626, "step": 141925 }, { "epoch": 2.0356507772615156, "grad_norm": 0.7460333704948425, "learning_rate": 1.0715014128194779e-05, "loss": 0.0712, "step": 141950 }, { "epoch": 2.0360092927207023, "grad_norm": 13.688366889953613, "learning_rate": 1.0711030614629587e-05, "loss": 0.0667, "step": 141975 }, { "epoch": 2.0363678081798886, "grad_norm": 5.209323883056641, "learning_rate": 1.0707047101064396e-05, "loss": 0.1393, "step": 142000 }, { "epoch": 2.0367263236390754, "grad_norm": 10.792927742004395, "learning_rate": 1.0703063587499204e-05, "loss": 0.0941, "step": 142025 }, { "epoch": 2.037084839098262, "grad_norm": 5.0018205642700195, "learning_rate": 1.0699080073934012e-05, "loss": 0.0867, "step": 142050 }, { "epoch": 2.0374433545574484, "grad_norm": 1.108458399772644, "learning_rate": 1.069509656036882e-05, "loss": 0.1135, "step": 142075 }, { "epoch": 2.037801870016635, "grad_norm": 3.8582935333251953, "learning_rate": 1.0691113046803628e-05, "loss": 0.0403, "step": 142100 }, { "epoch": 2.038160385475822, "grad_norm": 5.572331428527832, "learning_rate": 1.0687129533238437e-05, "loss": 0.1443, "step": 142125 }, { "epoch": 2.038518900935008, "grad_norm": 0.5841681361198425, "learning_rate": 1.0683146019673245e-05, "loss": 0.1213, "step": 142150 }, { "epoch": 2.038877416394195, "grad_norm": 13.466169357299805, "learning_rate": 1.0679162506108055e-05, "loss": 0.0943, "step": 142175 }, { "epoch": 2.0392359318533817, "grad_norm": 19.47496795654297, "learning_rate": 1.0675178992542862e-05, "loss": 0.1716, "step": 142200 }, { "epoch": 2.039594447312568, "grad_norm": 0.28756317496299744, "learning_rate": 1.067119547897767e-05, "loss": 0.0898, "step": 142225 }, { "epoch": 2.0399529627717548, "grad_norm": 2.9065353870391846, "learning_rate": 1.066721196541248e-05, "loss": 0.1308, "step": 142250 }, { "epoch": 2.0403114782309415, "grad_norm": 0.7937688827514648, "learning_rate": 1.0663228451847288e-05, "loss": 0.1258, "step": 142275 }, { "epoch": 2.040669993690128, "grad_norm": 1.2150558233261108, "learning_rate": 1.0659244938282097e-05, "loss": 0.059, "step": 142300 }, { "epoch": 2.0410285091493146, "grad_norm": 7.503447532653809, "learning_rate": 1.0655261424716905e-05, "loss": 0.0779, "step": 142325 }, { "epoch": 2.0413870246085013, "grad_norm": 2.949068307876587, "learning_rate": 1.0651277911151715e-05, "loss": 0.1096, "step": 142350 }, { "epoch": 2.0417455400676876, "grad_norm": 0.9901480674743652, "learning_rate": 1.0647294397586522e-05, "loss": 0.0983, "step": 142375 }, { "epoch": 2.0421040555268744, "grad_norm": 0.35483676195144653, "learning_rate": 1.064331088402133e-05, "loss": 0.0375, "step": 142400 }, { "epoch": 2.042462570986061, "grad_norm": 2.189495325088501, "learning_rate": 1.063932737045614e-05, "loss": 0.063, "step": 142425 }, { "epoch": 2.0428210864452474, "grad_norm": 1.9861558675765991, "learning_rate": 1.0635343856890948e-05, "loss": 0.044, "step": 142450 }, { "epoch": 2.043179601904434, "grad_norm": 0.14287209510803223, "learning_rate": 1.0631360343325757e-05, "loss": 0.0917, "step": 142475 }, { "epoch": 2.043538117363621, "grad_norm": 1.6290335655212402, "learning_rate": 1.0627376829760565e-05, "loss": 0.0753, "step": 142500 }, { "epoch": 2.043896632822807, "grad_norm": 0.31244850158691406, "learning_rate": 1.0623393316195373e-05, "loss": 0.0488, "step": 142525 }, { "epoch": 2.044255148281994, "grad_norm": 2.06784987449646, "learning_rate": 1.0619409802630182e-05, "loss": 0.1182, "step": 142550 }, { "epoch": 2.0446136637411807, "grad_norm": 1.4204310178756714, "learning_rate": 1.061542628906499e-05, "loss": 0.1659, "step": 142575 }, { "epoch": 2.044972179200367, "grad_norm": 0.20775988698005676, "learning_rate": 1.0611442775499798e-05, "loss": 0.0582, "step": 142600 }, { "epoch": 2.0453306946595537, "grad_norm": 3.187431573867798, "learning_rate": 1.0607459261934606e-05, "loss": 0.0746, "step": 142625 }, { "epoch": 2.0456892101187405, "grad_norm": 0.40861764550209045, "learning_rate": 1.0603475748369416e-05, "loss": 0.1005, "step": 142650 }, { "epoch": 2.046047725577927, "grad_norm": 5.390635013580322, "learning_rate": 1.0599492234804223e-05, "loss": 0.1278, "step": 142675 }, { "epoch": 2.0464062410371135, "grad_norm": 0.5120677947998047, "learning_rate": 1.0595508721239031e-05, "loss": 0.0927, "step": 142700 }, { "epoch": 2.0467647564963003, "grad_norm": 0.2916962206363678, "learning_rate": 1.059152520767384e-05, "loss": 0.0915, "step": 142725 }, { "epoch": 2.0471232719554866, "grad_norm": 1.8515511751174927, "learning_rate": 1.0587541694108649e-05, "loss": 0.1234, "step": 142750 }, { "epoch": 2.0474817874146733, "grad_norm": 3.807802200317383, "learning_rate": 1.0583558180543458e-05, "loss": 0.0463, "step": 142775 }, { "epoch": 2.04784030287386, "grad_norm": 10.064775466918945, "learning_rate": 1.0579574666978266e-05, "loss": 0.0923, "step": 142800 }, { "epoch": 2.0481988183330464, "grad_norm": 0.1550188809633255, "learning_rate": 1.0575591153413074e-05, "loss": 0.0581, "step": 142825 }, { "epoch": 2.048557333792233, "grad_norm": 0.2544427216053009, "learning_rate": 1.0571607639847883e-05, "loss": 0.0951, "step": 142850 }, { "epoch": 2.04891584925142, "grad_norm": 5.031650066375732, "learning_rate": 1.0567624126282691e-05, "loss": 0.0697, "step": 142875 }, { "epoch": 2.049274364710606, "grad_norm": 0.3454389274120331, "learning_rate": 1.05636406127175e-05, "loss": 0.0774, "step": 142900 }, { "epoch": 2.049632880169793, "grad_norm": 0.8813914656639099, "learning_rate": 1.0559657099152309e-05, "loss": 0.0739, "step": 142925 }, { "epoch": 2.0499913956289797, "grad_norm": 2.6998989582061768, "learning_rate": 1.0555673585587118e-05, "loss": 0.0818, "step": 142950 }, { "epoch": 2.050349911088166, "grad_norm": 10.15136432647705, "learning_rate": 1.0551690072021926e-05, "loss": 0.0918, "step": 142975 }, { "epoch": 2.0507084265473527, "grad_norm": 0.10605043172836304, "learning_rate": 1.0547706558456734e-05, "loss": 0.0558, "step": 143000 }, { "epoch": 2.0510669420065395, "grad_norm": 0.11740496754646301, "learning_rate": 1.0543723044891543e-05, "loss": 0.0781, "step": 143025 }, { "epoch": 2.051425457465726, "grad_norm": 0.8256964087486267, "learning_rate": 1.0539739531326351e-05, "loss": 0.062, "step": 143050 }, { "epoch": 2.0517839729249125, "grad_norm": 0.1428331434726715, "learning_rate": 1.053575601776116e-05, "loss": 0.0826, "step": 143075 }, { "epoch": 2.0521424883840993, "grad_norm": 7.747033596038818, "learning_rate": 1.0531772504195969e-05, "loss": 0.0593, "step": 143100 }, { "epoch": 2.0525010038432856, "grad_norm": 9.941810607910156, "learning_rate": 1.0527788990630776e-05, "loss": 0.0985, "step": 143125 }, { "epoch": 2.0528595193024723, "grad_norm": 6.755593299865723, "learning_rate": 1.0523805477065584e-05, "loss": 0.1122, "step": 143150 }, { "epoch": 2.053218034761659, "grad_norm": 0.4683079123497009, "learning_rate": 1.0519821963500392e-05, "loss": 0.0673, "step": 143175 }, { "epoch": 2.0535765502208454, "grad_norm": 0.3872593641281128, "learning_rate": 1.0515838449935202e-05, "loss": 0.0784, "step": 143200 }, { "epoch": 2.053935065680032, "grad_norm": 3.1402547359466553, "learning_rate": 1.051185493637001e-05, "loss": 0.1072, "step": 143225 }, { "epoch": 2.054293581139219, "grad_norm": 10.82656478881836, "learning_rate": 1.0507871422804819e-05, "loss": 0.2091, "step": 143250 }, { "epoch": 2.054652096598405, "grad_norm": 3.258908271789551, "learning_rate": 1.0503887909239627e-05, "loss": 0.0879, "step": 143275 }, { "epoch": 2.055010612057592, "grad_norm": 5.514636993408203, "learning_rate": 1.0499904395674435e-05, "loss": 0.1517, "step": 143300 }, { "epoch": 2.0553691275167787, "grad_norm": 0.1969829499721527, "learning_rate": 1.0495920882109244e-05, "loss": 0.0853, "step": 143325 }, { "epoch": 2.055727642975965, "grad_norm": 21.340167999267578, "learning_rate": 1.0491937368544052e-05, "loss": 0.1076, "step": 143350 }, { "epoch": 2.0560861584351517, "grad_norm": 6.115670204162598, "learning_rate": 1.0487953854978862e-05, "loss": 0.0838, "step": 143375 }, { "epoch": 2.0564446738943385, "grad_norm": 9.017480850219727, "learning_rate": 1.048397034141367e-05, "loss": 0.1486, "step": 143400 }, { "epoch": 2.056803189353525, "grad_norm": 6.154101848602295, "learning_rate": 1.0479986827848477e-05, "loss": 0.107, "step": 143425 }, { "epoch": 2.0571617048127115, "grad_norm": 0.33961421251296997, "learning_rate": 1.0476003314283287e-05, "loss": 0.1238, "step": 143450 }, { "epoch": 2.0575202202718983, "grad_norm": 0.45306000113487244, "learning_rate": 1.0472019800718095e-05, "loss": 0.1051, "step": 143475 }, { "epoch": 2.0578787357310846, "grad_norm": 8.487176895141602, "learning_rate": 1.0468036287152904e-05, "loss": 0.177, "step": 143500 }, { "epoch": 2.0582372511902713, "grad_norm": 9.194345474243164, "learning_rate": 1.0464052773587712e-05, "loss": 0.1001, "step": 143525 }, { "epoch": 2.058595766649458, "grad_norm": 2.179034948348999, "learning_rate": 1.0460069260022522e-05, "loss": 0.1124, "step": 143550 }, { "epoch": 2.0589542821086444, "grad_norm": 2.5225741863250732, "learning_rate": 1.045608574645733e-05, "loss": 0.0621, "step": 143575 }, { "epoch": 2.059312797567831, "grad_norm": 0.2838638722896576, "learning_rate": 1.0452102232892137e-05, "loss": 0.0607, "step": 143600 }, { "epoch": 2.059671313027018, "grad_norm": 0.08977445960044861, "learning_rate": 1.0448118719326947e-05, "loss": 0.0562, "step": 143625 }, { "epoch": 2.060029828486204, "grad_norm": 0.09939263015985489, "learning_rate": 1.0444135205761755e-05, "loss": 0.0658, "step": 143650 }, { "epoch": 2.060388343945391, "grad_norm": 0.031275324523448944, "learning_rate": 1.0440151692196563e-05, "loss": 0.0651, "step": 143675 }, { "epoch": 2.0607468594045777, "grad_norm": 10.261219024658203, "learning_rate": 1.043616817863137e-05, "loss": 0.1026, "step": 143700 }, { "epoch": 2.061105374863764, "grad_norm": 8.09448528289795, "learning_rate": 1.0432184665066178e-05, "loss": 0.1147, "step": 143725 }, { "epoch": 2.0614638903229507, "grad_norm": 10.639835357666016, "learning_rate": 1.0428201151500988e-05, "loss": 0.0724, "step": 143750 }, { "epoch": 2.0618224057821375, "grad_norm": 3.1582119464874268, "learning_rate": 1.0424217637935796e-05, "loss": 0.1131, "step": 143775 }, { "epoch": 2.062180921241324, "grad_norm": 6.559942245483398, "learning_rate": 1.0420234124370605e-05, "loss": 0.115, "step": 143800 }, { "epoch": 2.0625394367005105, "grad_norm": 8.851094245910645, "learning_rate": 1.0416250610805413e-05, "loss": 0.1208, "step": 143825 }, { "epoch": 2.0628979521596973, "grad_norm": 2.043604850769043, "learning_rate": 1.0412267097240223e-05, "loss": 0.1198, "step": 143850 }, { "epoch": 2.0632564676188836, "grad_norm": 17.079120635986328, "learning_rate": 1.040828358367503e-05, "loss": 0.1143, "step": 143875 }, { "epoch": 2.0636149830780703, "grad_norm": 4.237784385681152, "learning_rate": 1.0404300070109838e-05, "loss": 0.1001, "step": 143900 }, { "epoch": 2.063973498537257, "grad_norm": 4.43256950378418, "learning_rate": 1.0400316556544648e-05, "loss": 0.0799, "step": 143925 }, { "epoch": 2.0643320139964434, "grad_norm": 3.2023229598999023, "learning_rate": 1.0396333042979456e-05, "loss": 0.0689, "step": 143950 }, { "epoch": 2.06469052945563, "grad_norm": 2.9294815063476562, "learning_rate": 1.0392349529414265e-05, "loss": 0.041, "step": 143975 }, { "epoch": 2.065049044914817, "grad_norm": 3.9436936378479004, "learning_rate": 1.0388366015849073e-05, "loss": 0.1547, "step": 144000 }, { "epoch": 2.065407560374003, "grad_norm": 6.114943504333496, "learning_rate": 1.0384382502283881e-05, "loss": 0.0744, "step": 144025 }, { "epoch": 2.06576607583319, "grad_norm": 0.14416933059692383, "learning_rate": 1.038039898871869e-05, "loss": 0.0961, "step": 144050 }, { "epoch": 2.0661245912923767, "grad_norm": 13.955889701843262, "learning_rate": 1.0376415475153498e-05, "loss": 0.1426, "step": 144075 }, { "epoch": 2.066483106751563, "grad_norm": 17.8708438873291, "learning_rate": 1.0372431961588308e-05, "loss": 0.0993, "step": 144100 }, { "epoch": 2.0668416222107497, "grad_norm": 1.4164376258850098, "learning_rate": 1.0368448448023116e-05, "loss": 0.1125, "step": 144125 }, { "epoch": 2.0672001376699365, "grad_norm": 0.2622537910938263, "learning_rate": 1.0364464934457925e-05, "loss": 0.0966, "step": 144150 }, { "epoch": 2.067558653129123, "grad_norm": 2.055664539337158, "learning_rate": 1.0360481420892733e-05, "loss": 0.0804, "step": 144175 }, { "epoch": 2.0679171685883095, "grad_norm": 0.7259885668754578, "learning_rate": 1.0356497907327541e-05, "loss": 0.1215, "step": 144200 }, { "epoch": 2.0682756840474963, "grad_norm": 0.1445949822664261, "learning_rate": 1.0352514393762349e-05, "loss": 0.0672, "step": 144225 }, { "epoch": 2.0686341995066826, "grad_norm": 1.251202940940857, "learning_rate": 1.0348530880197157e-05, "loss": 0.0636, "step": 144250 }, { "epoch": 2.0689927149658693, "grad_norm": 13.353273391723633, "learning_rate": 1.0344547366631966e-05, "loss": 0.0622, "step": 144275 }, { "epoch": 2.069351230425056, "grad_norm": 10.929105758666992, "learning_rate": 1.0340563853066774e-05, "loss": 0.1296, "step": 144300 }, { "epoch": 2.0697097458842424, "grad_norm": 1.1218242645263672, "learning_rate": 1.0336580339501582e-05, "loss": 0.1149, "step": 144325 }, { "epoch": 2.070068261343429, "grad_norm": 14.882431030273438, "learning_rate": 1.0332596825936391e-05, "loss": 0.0815, "step": 144350 }, { "epoch": 2.070426776802616, "grad_norm": 12.803553581237793, "learning_rate": 1.03286133123712e-05, "loss": 0.0996, "step": 144375 }, { "epoch": 2.070785292261802, "grad_norm": 18.588031768798828, "learning_rate": 1.0324629798806009e-05, "loss": 0.0829, "step": 144400 }, { "epoch": 2.071143807720989, "grad_norm": 6.427468776702881, "learning_rate": 1.0320646285240817e-05, "loss": 0.1025, "step": 144425 }, { "epoch": 2.0715023231801757, "grad_norm": 3.0167548656463623, "learning_rate": 1.0316662771675626e-05, "loss": 0.0494, "step": 144450 }, { "epoch": 2.071860838639362, "grad_norm": 0.6174665689468384, "learning_rate": 1.0312679258110434e-05, "loss": 0.0713, "step": 144475 }, { "epoch": 2.0722193540985487, "grad_norm": 15.361284255981445, "learning_rate": 1.0308695744545242e-05, "loss": 0.0991, "step": 144500 }, { "epoch": 2.0725778695577355, "grad_norm": 1.5434390306472778, "learning_rate": 1.0304712230980051e-05, "loss": 0.038, "step": 144525 }, { "epoch": 2.072936385016922, "grad_norm": 3.249086618423462, "learning_rate": 1.030072871741486e-05, "loss": 0.0615, "step": 144550 }, { "epoch": 2.0732949004761085, "grad_norm": 19.96686553955078, "learning_rate": 1.0296745203849669e-05, "loss": 0.0878, "step": 144575 }, { "epoch": 2.0736534159352953, "grad_norm": 16.613107681274414, "learning_rate": 1.0292761690284477e-05, "loss": 0.0877, "step": 144600 }, { "epoch": 2.0740119313944816, "grad_norm": 1.6432281732559204, "learning_rate": 1.0288778176719284e-05, "loss": 0.0691, "step": 144625 }, { "epoch": 2.0743704468536683, "grad_norm": 3.3758761882781982, "learning_rate": 1.0284794663154094e-05, "loss": 0.0791, "step": 144650 }, { "epoch": 2.074728962312855, "grad_norm": 11.090421676635742, "learning_rate": 1.0280811149588902e-05, "loss": 0.091, "step": 144675 }, { "epoch": 2.0750874777720414, "grad_norm": 5.776586532592773, "learning_rate": 1.0276827636023711e-05, "loss": 0.1041, "step": 144700 }, { "epoch": 2.075445993231228, "grad_norm": 3.2744650840759277, "learning_rate": 1.0272844122458519e-05, "loss": 0.0948, "step": 144725 }, { "epoch": 2.075804508690415, "grad_norm": 14.260661125183105, "learning_rate": 1.0268860608893329e-05, "loss": 0.0763, "step": 144750 }, { "epoch": 2.076163024149601, "grad_norm": 0.3578687012195587, "learning_rate": 1.0264877095328135e-05, "loss": 0.1176, "step": 144775 }, { "epoch": 2.076521539608788, "grad_norm": 16.22494125366211, "learning_rate": 1.0260893581762943e-05, "loss": 0.0931, "step": 144800 }, { "epoch": 2.0768800550679747, "grad_norm": 12.189140319824219, "learning_rate": 1.0256910068197752e-05, "loss": 0.0817, "step": 144825 }, { "epoch": 2.077238570527161, "grad_norm": 13.0716552734375, "learning_rate": 1.025292655463256e-05, "loss": 0.0891, "step": 144850 }, { "epoch": 2.0775970859863477, "grad_norm": 5.162441730499268, "learning_rate": 1.024894304106737e-05, "loss": 0.0919, "step": 144875 }, { "epoch": 2.0779556014455345, "grad_norm": 0.2031865119934082, "learning_rate": 1.0244959527502177e-05, "loss": 0.092, "step": 144900 }, { "epoch": 2.078314116904721, "grad_norm": 6.568020343780518, "learning_rate": 1.0240976013936985e-05, "loss": 0.0787, "step": 144925 }, { "epoch": 2.0786726323639075, "grad_norm": 9.735990524291992, "learning_rate": 1.0236992500371795e-05, "loss": 0.1155, "step": 144950 }, { "epoch": 2.0790311478230943, "grad_norm": 0.3396008014678955, "learning_rate": 1.0233008986806603e-05, "loss": 0.1183, "step": 144975 }, { "epoch": 2.0793896632822806, "grad_norm": 0.022603889927268028, "learning_rate": 1.0229025473241412e-05, "loss": 0.1236, "step": 145000 }, { "epoch": 2.0797481787414673, "grad_norm": 9.477232933044434, "learning_rate": 1.022504195967622e-05, "loss": 0.0623, "step": 145025 }, { "epoch": 2.080106694200654, "grad_norm": 1.4339607954025269, "learning_rate": 1.022105844611103e-05, "loss": 0.1103, "step": 145050 }, { "epoch": 2.0804652096598404, "grad_norm": 3.926434278488159, "learning_rate": 1.0217074932545837e-05, "loss": 0.0948, "step": 145075 }, { "epoch": 2.080823725119027, "grad_norm": 0.6226731538772583, "learning_rate": 1.0213091418980645e-05, "loss": 0.0909, "step": 145100 }, { "epoch": 2.081182240578214, "grad_norm": 8.344039916992188, "learning_rate": 1.0209107905415455e-05, "loss": 0.1154, "step": 145125 }, { "epoch": 2.0815407560374, "grad_norm": 3.709195613861084, "learning_rate": 1.0205124391850263e-05, "loss": 0.0611, "step": 145150 }, { "epoch": 2.081899271496587, "grad_norm": 0.6152632236480713, "learning_rate": 1.0201140878285072e-05, "loss": 0.0867, "step": 145175 }, { "epoch": 2.0822577869557737, "grad_norm": 3.896010637283325, "learning_rate": 1.019715736471988e-05, "loss": 0.0603, "step": 145200 }, { "epoch": 2.08261630241496, "grad_norm": 0.8869026899337769, "learning_rate": 1.0193173851154688e-05, "loss": 0.1211, "step": 145225 }, { "epoch": 2.0829748178741467, "grad_norm": 2.460380792617798, "learning_rate": 1.0189190337589497e-05, "loss": 0.0828, "step": 145250 }, { "epoch": 2.0833333333333335, "grad_norm": 0.28334933519363403, "learning_rate": 1.0185206824024305e-05, "loss": 0.0963, "step": 145275 }, { "epoch": 2.08369184879252, "grad_norm": 2.424426794052124, "learning_rate": 1.0181223310459115e-05, "loss": 0.0574, "step": 145300 }, { "epoch": 2.0840503642517065, "grad_norm": 0.8224698901176453, "learning_rate": 1.0177239796893921e-05, "loss": 0.0388, "step": 145325 }, { "epoch": 2.0844088797108933, "grad_norm": 0.8477196097373962, "learning_rate": 1.0173256283328729e-05, "loss": 0.0775, "step": 145350 }, { "epoch": 2.0847673951700796, "grad_norm": 3.721186876296997, "learning_rate": 1.0169272769763538e-05, "loss": 0.0559, "step": 145375 }, { "epoch": 2.0851259106292663, "grad_norm": 1.398817539215088, "learning_rate": 1.0165289256198346e-05, "loss": 0.1371, "step": 145400 }, { "epoch": 2.085484426088453, "grad_norm": 0.002850792370736599, "learning_rate": 1.0161305742633156e-05, "loss": 0.0439, "step": 145425 }, { "epoch": 2.0858429415476394, "grad_norm": 16.783117294311523, "learning_rate": 1.0157322229067964e-05, "loss": 0.0624, "step": 145450 }, { "epoch": 2.086201457006826, "grad_norm": 6.622122287750244, "learning_rate": 1.0153338715502773e-05, "loss": 0.1317, "step": 145475 }, { "epoch": 2.086559972466013, "grad_norm": 0.7773304581642151, "learning_rate": 1.0149355201937581e-05, "loss": 0.0864, "step": 145500 }, { "epoch": 2.086918487925199, "grad_norm": 0.9281050562858582, "learning_rate": 1.0145371688372389e-05, "loss": 0.067, "step": 145525 }, { "epoch": 2.087277003384386, "grad_norm": 11.784096717834473, "learning_rate": 1.0141388174807198e-05, "loss": 0.0903, "step": 145550 }, { "epoch": 2.0876355188435727, "grad_norm": 8.701948165893555, "learning_rate": 1.0137404661242006e-05, "loss": 0.0459, "step": 145575 }, { "epoch": 2.087994034302759, "grad_norm": 1.0294321775436401, "learning_rate": 1.0133421147676816e-05, "loss": 0.166, "step": 145600 }, { "epoch": 2.0883525497619457, "grad_norm": 12.981643676757812, "learning_rate": 1.0129437634111624e-05, "loss": 0.054, "step": 145625 }, { "epoch": 2.0887110652211325, "grad_norm": 2.0800986289978027, "learning_rate": 1.0125454120546431e-05, "loss": 0.1473, "step": 145650 }, { "epoch": 2.089069580680319, "grad_norm": 21.300941467285156, "learning_rate": 1.0121470606981241e-05, "loss": 0.0801, "step": 145675 }, { "epoch": 2.0894280961395055, "grad_norm": 6.27219820022583, "learning_rate": 1.0117487093416049e-05, "loss": 0.0757, "step": 145700 }, { "epoch": 2.0897866115986923, "grad_norm": 16.469989776611328, "learning_rate": 1.0113503579850858e-05, "loss": 0.129, "step": 145725 }, { "epoch": 2.0901451270578786, "grad_norm": 3.077472448348999, "learning_rate": 1.0109520066285666e-05, "loss": 0.0487, "step": 145750 }, { "epoch": 2.0905036425170653, "grad_norm": 1.1730568408966064, "learning_rate": 1.0105536552720476e-05, "loss": 0.0394, "step": 145775 }, { "epoch": 2.090862157976252, "grad_norm": 0.3744838535785675, "learning_rate": 1.0101553039155284e-05, "loss": 0.1129, "step": 145800 }, { "epoch": 2.0912206734354384, "grad_norm": 0.3873444199562073, "learning_rate": 1.0097569525590091e-05, "loss": 0.061, "step": 145825 }, { "epoch": 2.091579188894625, "grad_norm": 0.16509215533733368, "learning_rate": 1.0093586012024901e-05, "loss": 0.0761, "step": 145850 }, { "epoch": 2.091937704353812, "grad_norm": 1.87419855594635, "learning_rate": 1.0089602498459707e-05, "loss": 0.1413, "step": 145875 }, { "epoch": 2.092296219812998, "grad_norm": 9.164702415466309, "learning_rate": 1.0085618984894517e-05, "loss": 0.1018, "step": 145900 }, { "epoch": 2.092654735272185, "grad_norm": 3.4676995277404785, "learning_rate": 1.0081635471329324e-05, "loss": 0.0765, "step": 145925 }, { "epoch": 2.0930132507313717, "grad_norm": 5.6396331787109375, "learning_rate": 1.0077651957764132e-05, "loss": 0.1057, "step": 145950 }, { "epoch": 2.093371766190558, "grad_norm": 3.600856065750122, "learning_rate": 1.0073668444198942e-05, "loss": 0.0923, "step": 145975 }, { "epoch": 2.0937302816497447, "grad_norm": 0.3924725651741028, "learning_rate": 1.006968493063375e-05, "loss": 0.0852, "step": 146000 }, { "epoch": 2.0940887971089315, "grad_norm": 2.6240673065185547, "learning_rate": 1.006570141706856e-05, "loss": 0.0711, "step": 146025 }, { "epoch": 2.094447312568118, "grad_norm": 2.4133236408233643, "learning_rate": 1.0061717903503367e-05, "loss": 0.1295, "step": 146050 }, { "epoch": 2.0948058280273045, "grad_norm": 0.09079441428184509, "learning_rate": 1.0057734389938177e-05, "loss": 0.0995, "step": 146075 }, { "epoch": 2.0951643434864913, "grad_norm": 3.1313421726226807, "learning_rate": 1.0053750876372984e-05, "loss": 0.0731, "step": 146100 }, { "epoch": 2.0955228589456776, "grad_norm": 0.7682211995124817, "learning_rate": 1.0049767362807792e-05, "loss": 0.0495, "step": 146125 }, { "epoch": 2.0958813744048643, "grad_norm": 0.11389220505952835, "learning_rate": 1.0045783849242602e-05, "loss": 0.0631, "step": 146150 }, { "epoch": 2.096239889864051, "grad_norm": 3.3877928256988525, "learning_rate": 1.004180033567741e-05, "loss": 0.0677, "step": 146175 }, { "epoch": 2.0965984053232374, "grad_norm": 0.4155751168727875, "learning_rate": 1.003781682211222e-05, "loss": 0.0821, "step": 146200 }, { "epoch": 2.096956920782424, "grad_norm": 0.5367392897605896, "learning_rate": 1.0033833308547027e-05, "loss": 0.1299, "step": 146225 }, { "epoch": 2.097315436241611, "grad_norm": 7.797656536102295, "learning_rate": 1.0029849794981835e-05, "loss": 0.0998, "step": 146250 }, { "epoch": 2.097673951700797, "grad_norm": 0.31957316398620605, "learning_rate": 1.0025866281416644e-05, "loss": 0.0897, "step": 146275 }, { "epoch": 2.098032467159984, "grad_norm": 5.580171585083008, "learning_rate": 1.0021882767851452e-05, "loss": 0.111, "step": 146300 }, { "epoch": 2.0983909826191707, "grad_norm": 16.998022079467773, "learning_rate": 1.0017899254286262e-05, "loss": 0.1304, "step": 146325 }, { "epoch": 2.098749498078357, "grad_norm": 0.7834790945053101, "learning_rate": 1.001391574072107e-05, "loss": 0.0388, "step": 146350 }, { "epoch": 2.0991080135375437, "grad_norm": 1.5906267166137695, "learning_rate": 1.000993222715588e-05, "loss": 0.0994, "step": 146375 }, { "epoch": 2.0994665289967305, "grad_norm": 3.6372528076171875, "learning_rate": 1.0005948713590687e-05, "loss": 0.1747, "step": 146400 }, { "epoch": 2.099825044455917, "grad_norm": 0.1089569479227066, "learning_rate": 1.0001965200025493e-05, "loss": 0.0932, "step": 146425 }, { "epoch": 2.1001835599151035, "grad_norm": 1.0790473222732544, "learning_rate": 9.997981686460303e-06, "loss": 0.1186, "step": 146450 }, { "epoch": 2.1005420753742903, "grad_norm": 0.6043875217437744, "learning_rate": 9.99399817289511e-06, "loss": 0.0806, "step": 146475 }, { "epoch": 2.1009005908334766, "grad_norm": 0.946881890296936, "learning_rate": 9.99001465932992e-06, "loss": 0.0447, "step": 146500 }, { "epoch": 2.1012591062926633, "grad_norm": 0.6248337030410767, "learning_rate": 9.986031145764728e-06, "loss": 0.0693, "step": 146525 }, { "epoch": 2.10161762175185, "grad_norm": 1.6494841575622559, "learning_rate": 9.982047632199536e-06, "loss": 0.1097, "step": 146550 }, { "epoch": 2.1019761372110364, "grad_norm": 20.028438568115234, "learning_rate": 9.978064118634345e-06, "loss": 0.1062, "step": 146575 }, { "epoch": 2.102334652670223, "grad_norm": 10.651036262512207, "learning_rate": 9.974080605069153e-06, "loss": 0.0845, "step": 146600 }, { "epoch": 2.10269316812941, "grad_norm": 0.26064029335975647, "learning_rate": 9.970097091503963e-06, "loss": 0.1323, "step": 146625 }, { "epoch": 2.103051683588596, "grad_norm": 5.517566680908203, "learning_rate": 9.96611357793877e-06, "loss": 0.0676, "step": 146650 }, { "epoch": 2.103410199047783, "grad_norm": 1.514148235321045, "learning_rate": 9.96213006437358e-06, "loss": 0.19, "step": 146675 }, { "epoch": 2.1037687145069697, "grad_norm": 1.2161197662353516, "learning_rate": 9.958146550808388e-06, "loss": 0.0909, "step": 146700 }, { "epoch": 2.104127229966156, "grad_norm": 5.512853145599365, "learning_rate": 9.954163037243196e-06, "loss": 0.1004, "step": 146725 }, { "epoch": 2.1044857454253427, "grad_norm": 13.239532470703125, "learning_rate": 9.950179523678005e-06, "loss": 0.0744, "step": 146750 }, { "epoch": 2.1048442608845295, "grad_norm": 9.280503273010254, "learning_rate": 9.946196010112813e-06, "loss": 0.1099, "step": 146775 }, { "epoch": 2.1052027763437158, "grad_norm": 0.15314649045467377, "learning_rate": 9.942212496547623e-06, "loss": 0.0897, "step": 146800 }, { "epoch": 2.1055612918029025, "grad_norm": 4.649750232696533, "learning_rate": 9.93822898298243e-06, "loss": 0.1073, "step": 146825 }, { "epoch": 2.1059198072620893, "grad_norm": 2.838083267211914, "learning_rate": 9.934245469417238e-06, "loss": 0.1213, "step": 146850 }, { "epoch": 2.1062783227212756, "grad_norm": 0.19354502856731415, "learning_rate": 9.930261955852048e-06, "loss": 0.0797, "step": 146875 }, { "epoch": 2.1066368381804623, "grad_norm": 0.7722437977790833, "learning_rate": 9.926278442286856e-06, "loss": 0.0895, "step": 146900 }, { "epoch": 2.106995353639649, "grad_norm": 0.34537366032600403, "learning_rate": 9.922294928721665e-06, "loss": 0.0928, "step": 146925 }, { "epoch": 2.1073538690988354, "grad_norm": 0.047632575035095215, "learning_rate": 9.918311415156473e-06, "loss": 0.0548, "step": 146950 }, { "epoch": 2.107712384558022, "grad_norm": 0.2790226340293884, "learning_rate": 9.914327901591281e-06, "loss": 0.0693, "step": 146975 }, { "epoch": 2.108070900017209, "grad_norm": 0.18530265986919403, "learning_rate": 9.910344388026089e-06, "loss": 0.1028, "step": 147000 }, { "epoch": 2.108429415476395, "grad_norm": 18.060945510864258, "learning_rate": 9.906360874460897e-06, "loss": 0.0756, "step": 147025 }, { "epoch": 2.108787930935582, "grad_norm": 1.3843821287155151, "learning_rate": 9.902377360895706e-06, "loss": 0.0908, "step": 147050 }, { "epoch": 2.1091464463947687, "grad_norm": 13.50596809387207, "learning_rate": 9.898393847330514e-06, "loss": 0.0515, "step": 147075 }, { "epoch": 2.109504961853955, "grad_norm": 3.443801164627075, "learning_rate": 9.894410333765324e-06, "loss": 0.1306, "step": 147100 }, { "epoch": 2.1098634773131417, "grad_norm": 0.14935313165187836, "learning_rate": 9.890426820200132e-06, "loss": 0.104, "step": 147125 }, { "epoch": 2.1102219927723285, "grad_norm": 11.469193458557129, "learning_rate": 9.88644330663494e-06, "loss": 0.098, "step": 147150 }, { "epoch": 2.1105805082315148, "grad_norm": 3.326380491256714, "learning_rate": 9.882459793069749e-06, "loss": 0.1642, "step": 147175 }, { "epoch": 2.1109390236907015, "grad_norm": 10.18455696105957, "learning_rate": 9.878476279504557e-06, "loss": 0.0618, "step": 147200 }, { "epoch": 2.1112975391498883, "grad_norm": 1.0196763277053833, "learning_rate": 9.874492765939366e-06, "loss": 0.1073, "step": 147225 }, { "epoch": 2.1116560546090746, "grad_norm": 0.47438108921051025, "learning_rate": 9.870509252374174e-06, "loss": 0.0856, "step": 147250 }, { "epoch": 2.1120145700682613, "grad_norm": 0.3265960216522217, "learning_rate": 9.866525738808984e-06, "loss": 0.0434, "step": 147275 }, { "epoch": 2.112373085527448, "grad_norm": 0.5081396698951721, "learning_rate": 9.862542225243792e-06, "loss": 0.1051, "step": 147300 }, { "epoch": 2.1127316009866344, "grad_norm": 4.1436991691589355, "learning_rate": 9.8585587116786e-06, "loss": 0.115, "step": 147325 }, { "epoch": 2.113090116445821, "grad_norm": 0.4657130837440491, "learning_rate": 9.854575198113409e-06, "loss": 0.1517, "step": 147350 }, { "epoch": 2.113448631905008, "grad_norm": 0.4325356185436249, "learning_rate": 9.850591684548217e-06, "loss": 0.036, "step": 147375 }, { "epoch": 2.113807147364194, "grad_norm": 9.500706672668457, "learning_rate": 9.846608170983026e-06, "loss": 0.0668, "step": 147400 }, { "epoch": 2.114165662823381, "grad_norm": 9.996475219726562, "learning_rate": 9.842624657417834e-06, "loss": 0.0944, "step": 147425 }, { "epoch": 2.1145241782825677, "grad_norm": 0.19945237040519714, "learning_rate": 9.838641143852642e-06, "loss": 0.0607, "step": 147450 }, { "epoch": 2.114882693741754, "grad_norm": 0.3793677091598511, "learning_rate": 9.834657630287452e-06, "loss": 0.0447, "step": 147475 }, { "epoch": 2.1152412092009407, "grad_norm": 10.432961463928223, "learning_rate": 9.83067411672226e-06, "loss": 0.099, "step": 147500 }, { "epoch": 2.1155997246601275, "grad_norm": 0.0716063529253006, "learning_rate": 9.826690603157067e-06, "loss": 0.0616, "step": 147525 }, { "epoch": 2.1159582401193138, "grad_norm": 8.28529167175293, "learning_rate": 9.822707089591875e-06, "loss": 0.0497, "step": 147550 }, { "epoch": 2.1163167555785005, "grad_norm": 1.940350890159607, "learning_rate": 9.818723576026685e-06, "loss": 0.0441, "step": 147575 }, { "epoch": 2.1166752710376873, "grad_norm": 0.6822715401649475, "learning_rate": 9.814740062461492e-06, "loss": 0.0963, "step": 147600 }, { "epoch": 2.1170337864968736, "grad_norm": 2.1362359523773193, "learning_rate": 9.8107565488963e-06, "loss": 0.1358, "step": 147625 }, { "epoch": 2.1173923019560603, "grad_norm": 0.145582914352417, "learning_rate": 9.80677303533111e-06, "loss": 0.057, "step": 147650 }, { "epoch": 2.117750817415247, "grad_norm": 15.657033920288086, "learning_rate": 9.802789521765918e-06, "loss": 0.0903, "step": 147675 }, { "epoch": 2.1181093328744334, "grad_norm": 2.409609794616699, "learning_rate": 9.798806008200727e-06, "loss": 0.074, "step": 147700 }, { "epoch": 2.11846784833362, "grad_norm": 0.1387537270784378, "learning_rate": 9.794822494635535e-06, "loss": 0.0832, "step": 147725 }, { "epoch": 2.118826363792807, "grad_norm": 4.174490928649902, "learning_rate": 9.790838981070343e-06, "loss": 0.0788, "step": 147750 }, { "epoch": 2.119184879251993, "grad_norm": 3.6559484004974365, "learning_rate": 9.786855467505152e-06, "loss": 0.0516, "step": 147775 }, { "epoch": 2.11954339471118, "grad_norm": 1.2114558219909668, "learning_rate": 9.78287195393996e-06, "loss": 0.1022, "step": 147800 }, { "epoch": 2.1199019101703667, "grad_norm": 13.441651344299316, "learning_rate": 9.77888844037477e-06, "loss": 0.0571, "step": 147825 }, { "epoch": 2.120260425629553, "grad_norm": 5.505556106567383, "learning_rate": 9.774904926809578e-06, "loss": 0.0426, "step": 147850 }, { "epoch": 2.1206189410887397, "grad_norm": 4.067105770111084, "learning_rate": 9.770921413244387e-06, "loss": 0.1901, "step": 147875 }, { "epoch": 2.1209774565479265, "grad_norm": 0.20460566878318787, "learning_rate": 9.766937899679195e-06, "loss": 0.0482, "step": 147900 }, { "epoch": 2.1213359720071128, "grad_norm": 0.22752967476844788, "learning_rate": 9.762954386114003e-06, "loss": 0.0484, "step": 147925 }, { "epoch": 2.1216944874662995, "grad_norm": 17.732440948486328, "learning_rate": 9.758970872548812e-06, "loss": 0.1259, "step": 147950 }, { "epoch": 2.1220530029254863, "grad_norm": 8.397388458251953, "learning_rate": 9.75498735898362e-06, "loss": 0.0931, "step": 147975 }, { "epoch": 2.1224115183846726, "grad_norm": 14.366458892822266, "learning_rate": 9.75100384541843e-06, "loss": 0.088, "step": 148000 }, { "epoch": 2.1227700338438593, "grad_norm": 9.392037391662598, "learning_rate": 9.747020331853238e-06, "loss": 0.0669, "step": 148025 }, { "epoch": 2.123128549303046, "grad_norm": 2.4120469093322754, "learning_rate": 9.743036818288045e-06, "loss": 0.0456, "step": 148050 }, { "epoch": 2.1234870647622324, "grad_norm": 7.652953624725342, "learning_rate": 9.739053304722853e-06, "loss": 0.0932, "step": 148075 }, { "epoch": 2.123845580221419, "grad_norm": 0.6941074132919312, "learning_rate": 9.735069791157661e-06, "loss": 0.1093, "step": 148100 }, { "epoch": 2.124204095680606, "grad_norm": 14.09292221069336, "learning_rate": 9.73108627759247e-06, "loss": 0.108, "step": 148125 }, { "epoch": 2.124562611139792, "grad_norm": 21.874732971191406, "learning_rate": 9.727102764027279e-06, "loss": 0.078, "step": 148150 }, { "epoch": 2.124921126598979, "grad_norm": 18.03020668029785, "learning_rate": 9.723119250462088e-06, "loss": 0.1076, "step": 148175 }, { "epoch": 2.1252796420581657, "grad_norm": 13.452472686767578, "learning_rate": 9.719135736896896e-06, "loss": 0.1117, "step": 148200 }, { "epoch": 2.125638157517352, "grad_norm": 2.4740726947784424, "learning_rate": 9.715152223331704e-06, "loss": 0.1125, "step": 148225 }, { "epoch": 2.1259966729765387, "grad_norm": 0.17268116772174835, "learning_rate": 9.711168709766513e-06, "loss": 0.0376, "step": 148250 }, { "epoch": 2.1263551884357255, "grad_norm": 5.546867370605469, "learning_rate": 9.707185196201321e-06, "loss": 0.099, "step": 148275 }, { "epoch": 2.1267137038949118, "grad_norm": 5.623615264892578, "learning_rate": 9.70320168263613e-06, "loss": 0.0651, "step": 148300 }, { "epoch": 2.1270722193540985, "grad_norm": 2.0501084327697754, "learning_rate": 9.699218169070939e-06, "loss": 0.0904, "step": 148325 }, { "epoch": 2.1274307348132853, "grad_norm": 3.687995433807373, "learning_rate": 9.695234655505746e-06, "loss": 0.0889, "step": 148350 }, { "epoch": 2.1277892502724716, "grad_norm": 9.052858352661133, "learning_rate": 9.691251141940556e-06, "loss": 0.1121, "step": 148375 }, { "epoch": 2.1281477657316583, "grad_norm": 2.8963654041290283, "learning_rate": 9.687267628375364e-06, "loss": 0.1415, "step": 148400 }, { "epoch": 2.128506281190845, "grad_norm": 6.106618404388428, "learning_rate": 9.683284114810173e-06, "loss": 0.0756, "step": 148425 }, { "epoch": 2.1288647966500314, "grad_norm": 1.38677978515625, "learning_rate": 9.679300601244981e-06, "loss": 0.1048, "step": 148450 }, { "epoch": 2.129223312109218, "grad_norm": 8.77365779876709, "learning_rate": 9.67531708767979e-06, "loss": 0.1342, "step": 148475 }, { "epoch": 2.129581827568405, "grad_norm": 6.128775596618652, "learning_rate": 9.671333574114599e-06, "loss": 0.0498, "step": 148500 }, { "epoch": 2.129940343027591, "grad_norm": 4.939640045166016, "learning_rate": 9.667350060549406e-06, "loss": 0.0936, "step": 148525 }, { "epoch": 2.130298858486778, "grad_norm": 2.9000916481018066, "learning_rate": 9.663366546984216e-06, "loss": 0.0819, "step": 148550 }, { "epoch": 2.1306573739459647, "grad_norm": 7.579886436462402, "learning_rate": 9.659383033419024e-06, "loss": 0.1436, "step": 148575 }, { "epoch": 2.131015889405151, "grad_norm": 3.2019855976104736, "learning_rate": 9.655399519853833e-06, "loss": 0.1507, "step": 148600 }, { "epoch": 2.1313744048643377, "grad_norm": 12.653926849365234, "learning_rate": 9.65141600628864e-06, "loss": 0.1336, "step": 148625 }, { "epoch": 2.1317329203235245, "grad_norm": 0.3137771785259247, "learning_rate": 9.647432492723447e-06, "loss": 0.1493, "step": 148650 }, { "epoch": 2.1320914357827108, "grad_norm": 1.7803192138671875, "learning_rate": 9.643448979158257e-06, "loss": 0.1479, "step": 148675 }, { "epoch": 2.1324499512418975, "grad_norm": 3.492588520050049, "learning_rate": 9.639465465593065e-06, "loss": 0.1722, "step": 148700 }, { "epoch": 2.1328084667010843, "grad_norm": 7.301219940185547, "learning_rate": 9.635481952027874e-06, "loss": 0.0454, "step": 148725 }, { "epoch": 2.1331669821602706, "grad_norm": 0.6354212760925293, "learning_rate": 9.631498438462682e-06, "loss": 0.1008, "step": 148750 }, { "epoch": 2.1335254976194573, "grad_norm": 0.5094444751739502, "learning_rate": 9.627514924897492e-06, "loss": 0.1117, "step": 148775 }, { "epoch": 2.133884013078644, "grad_norm": 1.9407013654708862, "learning_rate": 9.6235314113323e-06, "loss": 0.0637, "step": 148800 }, { "epoch": 2.1342425285378304, "grad_norm": 5.3059234619140625, "learning_rate": 9.619547897767107e-06, "loss": 0.1146, "step": 148825 }, { "epoch": 2.134601043997017, "grad_norm": 10.115752220153809, "learning_rate": 9.615564384201917e-06, "loss": 0.1138, "step": 148850 }, { "epoch": 2.134959559456204, "grad_norm": 14.375837326049805, "learning_rate": 9.611580870636725e-06, "loss": 0.0746, "step": 148875 }, { "epoch": 2.13531807491539, "grad_norm": 1.854915738105774, "learning_rate": 9.607597357071534e-06, "loss": 0.0726, "step": 148900 }, { "epoch": 2.135676590374577, "grad_norm": 8.156253814697266, "learning_rate": 9.603613843506342e-06, "loss": 0.0654, "step": 148925 }, { "epoch": 2.1360351058337637, "grad_norm": 1.4672355651855469, "learning_rate": 9.59963032994115e-06, "loss": 0.0853, "step": 148950 }, { "epoch": 2.13639362129295, "grad_norm": 0.2607489228248596, "learning_rate": 9.59564681637596e-06, "loss": 0.1032, "step": 148975 }, { "epoch": 2.1367521367521367, "grad_norm": 0.7022542953491211, "learning_rate": 9.591663302810767e-06, "loss": 0.042, "step": 149000 }, { "epoch": 2.1371106522113235, "grad_norm": 0.26182499527931213, "learning_rate": 9.587679789245577e-06, "loss": 0.1088, "step": 149025 }, { "epoch": 2.1374691676705098, "grad_norm": 6.44327449798584, "learning_rate": 9.583696275680385e-06, "loss": 0.0998, "step": 149050 }, { "epoch": 2.1378276831296965, "grad_norm": 7.344099521636963, "learning_rate": 9.579712762115194e-06, "loss": 0.1216, "step": 149075 }, { "epoch": 2.1381861985888833, "grad_norm": 0.6171255111694336, "learning_rate": 9.575729248550002e-06, "loss": 0.1019, "step": 149100 }, { "epoch": 2.1385447140480696, "grad_norm": 8.261161804199219, "learning_rate": 9.57174573498481e-06, "loss": 0.0911, "step": 149125 }, { "epoch": 2.1389032295072563, "grad_norm": 14.947805404663086, "learning_rate": 9.56776222141962e-06, "loss": 0.1583, "step": 149150 }, { "epoch": 2.139261744966443, "grad_norm": 1.2917636632919312, "learning_rate": 9.563778707854426e-06, "loss": 0.1107, "step": 149175 }, { "epoch": 2.1396202604256294, "grad_norm": 10.642644882202148, "learning_rate": 9.559795194289235e-06, "loss": 0.0624, "step": 149200 }, { "epoch": 2.139978775884816, "grad_norm": 0.0778149962425232, "learning_rate": 9.555811680724043e-06, "loss": 0.0484, "step": 149225 }, { "epoch": 2.140337291344003, "grad_norm": 1.8744808435440063, "learning_rate": 9.55182816715885e-06, "loss": 0.1325, "step": 149250 }, { "epoch": 2.140695806803189, "grad_norm": 1.0626100301742554, "learning_rate": 9.54784465359366e-06, "loss": 0.1324, "step": 149275 }, { "epoch": 2.141054322262376, "grad_norm": 14.640973091125488, "learning_rate": 9.543861140028468e-06, "loss": 0.128, "step": 149300 }, { "epoch": 2.1414128377215627, "grad_norm": 0.14013420045375824, "learning_rate": 9.539877626463278e-06, "loss": 0.0822, "step": 149325 }, { "epoch": 2.141771353180749, "grad_norm": 7.479713439941406, "learning_rate": 9.535894112898086e-06, "loss": 0.1043, "step": 149350 }, { "epoch": 2.1421298686399357, "grad_norm": 0.040700461715459824, "learning_rate": 9.531910599332895e-06, "loss": 0.1539, "step": 149375 }, { "epoch": 2.1424883840991225, "grad_norm": 2.2907872200012207, "learning_rate": 9.527927085767703e-06, "loss": 0.1955, "step": 149400 }, { "epoch": 2.1428468995583088, "grad_norm": 4.2869110107421875, "learning_rate": 9.52394357220251e-06, "loss": 0.1277, "step": 149425 }, { "epoch": 2.1432054150174955, "grad_norm": 1.3912919759750366, "learning_rate": 9.51996005863732e-06, "loss": 0.0295, "step": 149450 }, { "epoch": 2.1435639304766823, "grad_norm": 3.577230930328369, "learning_rate": 9.515976545072128e-06, "loss": 0.149, "step": 149475 }, { "epoch": 2.1439224459358686, "grad_norm": 15.332244873046875, "learning_rate": 9.511993031506938e-06, "loss": 0.1093, "step": 149500 }, { "epoch": 2.1442809613950553, "grad_norm": 0.7690092921257019, "learning_rate": 9.508009517941746e-06, "loss": 0.0965, "step": 149525 }, { "epoch": 2.144639476854242, "grad_norm": 5.703613758087158, "learning_rate": 9.504026004376553e-06, "loss": 0.0801, "step": 149550 }, { "epoch": 2.1449979923134284, "grad_norm": 0.22621369361877441, "learning_rate": 9.500042490811363e-06, "loss": 0.1079, "step": 149575 }, { "epoch": 2.145356507772615, "grad_norm": 9.499987602233887, "learning_rate": 9.49605897724617e-06, "loss": 0.1044, "step": 149600 }, { "epoch": 2.145715023231802, "grad_norm": 0.8674216270446777, "learning_rate": 9.49207546368098e-06, "loss": 0.0416, "step": 149625 }, { "epoch": 2.146073538690988, "grad_norm": 0.3749718964099884, "learning_rate": 9.488091950115788e-06, "loss": 0.0861, "step": 149650 }, { "epoch": 2.146432054150175, "grad_norm": 11.155364990234375, "learning_rate": 9.484108436550598e-06, "loss": 0.0779, "step": 149675 }, { "epoch": 2.1467905696093617, "grad_norm": 0.4941350519657135, "learning_rate": 9.480124922985406e-06, "loss": 0.0705, "step": 149700 }, { "epoch": 2.147149085068548, "grad_norm": 6.062887668609619, "learning_rate": 9.476141409420212e-06, "loss": 0.1147, "step": 149725 }, { "epoch": 2.1475076005277347, "grad_norm": 5.352019786834717, "learning_rate": 9.472157895855021e-06, "loss": 0.1218, "step": 149750 }, { "epoch": 2.1478661159869215, "grad_norm": 1.4553136825561523, "learning_rate": 9.468174382289829e-06, "loss": 0.064, "step": 149775 }, { "epoch": 2.1482246314461078, "grad_norm": 3.590299606323242, "learning_rate": 9.464190868724639e-06, "loss": 0.0751, "step": 149800 }, { "epoch": 2.1485831469052945, "grad_norm": 8.96078109741211, "learning_rate": 9.460207355159447e-06, "loss": 0.0689, "step": 149825 }, { "epoch": 2.1489416623644813, "grad_norm": 0.04528455808758736, "learning_rate": 9.456223841594254e-06, "loss": 0.0993, "step": 149850 }, { "epoch": 2.1493001778236676, "grad_norm": 5.4137420654296875, "learning_rate": 9.452240328029064e-06, "loss": 0.0803, "step": 149875 }, { "epoch": 2.1496586932828543, "grad_norm": 15.94823169708252, "learning_rate": 9.448256814463872e-06, "loss": 0.1172, "step": 149900 }, { "epoch": 2.150017208742041, "grad_norm": 0.4160628318786621, "learning_rate": 9.444273300898681e-06, "loss": 0.1012, "step": 149925 }, { "epoch": 2.1503757242012274, "grad_norm": 0.790252685546875, "learning_rate": 9.440289787333489e-06, "loss": 0.1282, "step": 149950 }, { "epoch": 2.150734239660414, "grad_norm": 0.10442666709423065, "learning_rate": 9.436306273768297e-06, "loss": 0.1234, "step": 149975 }, { "epoch": 2.151092755119601, "grad_norm": 1.3029764890670776, "learning_rate": 9.432322760203107e-06, "loss": 0.1602, "step": 150000 }, { "epoch": 2.151451270578787, "grad_norm": 1.1134490966796875, "learning_rate": 9.428339246637914e-06, "loss": 0.1162, "step": 150025 }, { "epoch": 2.151809786037974, "grad_norm": 2.119856834411621, "learning_rate": 9.424355733072724e-06, "loss": 0.1052, "step": 150050 }, { "epoch": 2.1521683014971607, "grad_norm": 8.796359062194824, "learning_rate": 9.420372219507532e-06, "loss": 0.0462, "step": 150075 }, { "epoch": 2.152526816956347, "grad_norm": 11.296734809875488, "learning_rate": 9.416388705942341e-06, "loss": 0.121, "step": 150100 }, { "epoch": 2.1528853324155337, "grad_norm": 0.3598598837852478, "learning_rate": 9.412405192377149e-06, "loss": 0.1475, "step": 150125 }, { "epoch": 2.1532438478747205, "grad_norm": 2.0792489051818848, "learning_rate": 9.408421678811957e-06, "loss": 0.1882, "step": 150150 }, { "epoch": 2.1536023633339068, "grad_norm": 11.540274620056152, "learning_rate": 9.404438165246766e-06, "loss": 0.089, "step": 150175 }, { "epoch": 2.1539608787930935, "grad_norm": 0.2797040045261383, "learning_rate": 9.400454651681574e-06, "loss": 0.1175, "step": 150200 }, { "epoch": 2.1543193942522803, "grad_norm": 15.032903671264648, "learning_rate": 9.396471138116384e-06, "loss": 0.0801, "step": 150225 }, { "epoch": 2.1546779097114666, "grad_norm": 15.751940727233887, "learning_rate": 9.392487624551192e-06, "loss": 0.129, "step": 150250 }, { "epoch": 2.1550364251706533, "grad_norm": 7.3493123054504395, "learning_rate": 9.388504110985998e-06, "loss": 0.1343, "step": 150275 }, { "epoch": 2.15539494062984, "grad_norm": 2.2980010509490967, "learning_rate": 9.384520597420807e-06, "loss": 0.1287, "step": 150300 }, { "epoch": 2.1557534560890264, "grad_norm": 0.015430535189807415, "learning_rate": 9.380537083855615e-06, "loss": 0.1195, "step": 150325 }, { "epoch": 2.156111971548213, "grad_norm": 0.5942469239234924, "learning_rate": 9.376553570290425e-06, "loss": 0.0975, "step": 150350 }, { "epoch": 2.1564704870074, "grad_norm": 1.1553916931152344, "learning_rate": 9.372570056725233e-06, "loss": 0.0968, "step": 150375 }, { "epoch": 2.156829002466586, "grad_norm": 2.266514539718628, "learning_rate": 9.368586543160042e-06, "loss": 0.1307, "step": 150400 }, { "epoch": 2.157187517925773, "grad_norm": 0.29588866233825684, "learning_rate": 9.36460302959485e-06, "loss": 0.02, "step": 150425 }, { "epoch": 2.1575460333849596, "grad_norm": 2.6094183921813965, "learning_rate": 9.360619516029658e-06, "loss": 0.2002, "step": 150450 }, { "epoch": 2.157904548844146, "grad_norm": 3.62018084526062, "learning_rate": 9.356636002464467e-06, "loss": 0.0924, "step": 150475 }, { "epoch": 2.1582630643033327, "grad_norm": 14.605579376220703, "learning_rate": 9.352652488899275e-06, "loss": 0.0784, "step": 150500 }, { "epoch": 2.1586215797625194, "grad_norm": 2.389315128326416, "learning_rate": 9.348668975334085e-06, "loss": 0.0604, "step": 150525 }, { "epoch": 2.1589800952217058, "grad_norm": 22.56108283996582, "learning_rate": 9.344685461768893e-06, "loss": 0.1388, "step": 150550 }, { "epoch": 2.1593386106808925, "grad_norm": 19.989408493041992, "learning_rate": 9.3407019482037e-06, "loss": 0.0977, "step": 150575 }, { "epoch": 2.1596971261400792, "grad_norm": 7.520564079284668, "learning_rate": 9.33671843463851e-06, "loss": 0.1401, "step": 150600 }, { "epoch": 2.1600556415992656, "grad_norm": 1.2315692901611328, "learning_rate": 9.332734921073318e-06, "loss": 0.0996, "step": 150625 }, { "epoch": 2.1604141570584523, "grad_norm": 1.4023581743240356, "learning_rate": 9.328751407508127e-06, "loss": 0.0652, "step": 150650 }, { "epoch": 2.160772672517639, "grad_norm": 0.10846023261547089, "learning_rate": 9.324767893942935e-06, "loss": 0.1305, "step": 150675 }, { "epoch": 2.1611311879768254, "grad_norm": 0.17072728276252747, "learning_rate": 9.320784380377745e-06, "loss": 0.0909, "step": 150700 }, { "epoch": 2.161489703436012, "grad_norm": 11.663473129272461, "learning_rate": 9.316800866812553e-06, "loss": 0.1128, "step": 150725 }, { "epoch": 2.161848218895199, "grad_norm": 20.895063400268555, "learning_rate": 9.31281735324736e-06, "loss": 0.0856, "step": 150750 }, { "epoch": 2.1622067343543856, "grad_norm": 7.071538925170898, "learning_rate": 9.30883383968217e-06, "loss": 0.1405, "step": 150775 }, { "epoch": 2.162565249813572, "grad_norm": 0.11559812724590302, "learning_rate": 9.304850326116978e-06, "loss": 0.0693, "step": 150800 }, { "epoch": 2.1629237652727586, "grad_norm": 9.981346130371094, "learning_rate": 9.300866812551786e-06, "loss": 0.0757, "step": 150825 }, { "epoch": 2.163282280731945, "grad_norm": 0.8274640440940857, "learning_rate": 9.296883298986594e-06, "loss": 0.0626, "step": 150850 }, { "epoch": 2.1636407961911317, "grad_norm": 3.137394905090332, "learning_rate": 9.292899785421401e-06, "loss": 0.1367, "step": 150875 }, { "epoch": 2.1639993116503184, "grad_norm": 15.727458953857422, "learning_rate": 9.288916271856211e-06, "loss": 0.1205, "step": 150900 }, { "epoch": 2.164357827109505, "grad_norm": 4.944762706756592, "learning_rate": 9.284932758291019e-06, "loss": 0.0871, "step": 150925 }, { "epoch": 2.1647163425686915, "grad_norm": 0.3822215795516968, "learning_rate": 9.280949244725828e-06, "loss": 0.0813, "step": 150950 }, { "epoch": 2.1650748580278782, "grad_norm": 12.545516967773438, "learning_rate": 9.276965731160636e-06, "loss": 0.0843, "step": 150975 }, { "epoch": 2.1654333734870646, "grad_norm": 12.338005065917969, "learning_rate": 9.272982217595446e-06, "loss": 0.1641, "step": 151000 }, { "epoch": 2.1657918889462513, "grad_norm": 0.8759415149688721, "learning_rate": 9.268998704030254e-06, "loss": 0.0833, "step": 151025 }, { "epoch": 2.166150404405438, "grad_norm": 4.1803131103515625, "learning_rate": 9.265015190465061e-06, "loss": 0.0326, "step": 151050 }, { "epoch": 2.166508919864625, "grad_norm": 12.280255317687988, "learning_rate": 9.261031676899871e-06, "loss": 0.1258, "step": 151075 }, { "epoch": 2.166867435323811, "grad_norm": 0.2864890396595001, "learning_rate": 9.257048163334679e-06, "loss": 0.0808, "step": 151100 }, { "epoch": 2.167225950782998, "grad_norm": 15.629182815551758, "learning_rate": 9.253064649769488e-06, "loss": 0.1286, "step": 151125 }, { "epoch": 2.167584466242184, "grad_norm": 0.15418921411037445, "learning_rate": 9.249081136204296e-06, "loss": 0.1013, "step": 151150 }, { "epoch": 2.167942981701371, "grad_norm": 15.985154151916504, "learning_rate": 9.245097622639104e-06, "loss": 0.1267, "step": 151175 }, { "epoch": 2.1683014971605576, "grad_norm": 1.865478754043579, "learning_rate": 9.241114109073914e-06, "loss": 0.0658, "step": 151200 }, { "epoch": 2.1686600126197444, "grad_norm": 7.436251163482666, "learning_rate": 9.237130595508721e-06, "loss": 0.0756, "step": 151225 }, { "epoch": 2.1690185280789307, "grad_norm": 2.138568878173828, "learning_rate": 9.233147081943531e-06, "loss": 0.0815, "step": 151250 }, { "epoch": 2.1693770435381174, "grad_norm": 0.290924072265625, "learning_rate": 9.229163568378339e-06, "loss": 0.083, "step": 151275 }, { "epoch": 2.1697355589973037, "grad_norm": 19.76808738708496, "learning_rate": 9.225180054813148e-06, "loss": 0.0912, "step": 151300 }, { "epoch": 2.1700940744564905, "grad_norm": 0.6715295910835266, "learning_rate": 9.221196541247956e-06, "loss": 0.087, "step": 151325 }, { "epoch": 2.1704525899156772, "grad_norm": 2.6541686058044434, "learning_rate": 9.217213027682764e-06, "loss": 0.0604, "step": 151350 }, { "epoch": 2.170811105374864, "grad_norm": 2.8975062370300293, "learning_rate": 9.213229514117572e-06, "loss": 0.0594, "step": 151375 }, { "epoch": 2.1711696208340503, "grad_norm": 6.7462897300720215, "learning_rate": 9.20924600055238e-06, "loss": 0.0701, "step": 151400 }, { "epoch": 2.171528136293237, "grad_norm": 2.892054557800293, "learning_rate": 9.20526248698719e-06, "loss": 0.1, "step": 151425 }, { "epoch": 2.1718866517524233, "grad_norm": 0.7946051359176636, "learning_rate": 9.201278973421997e-06, "loss": 0.0384, "step": 151450 }, { "epoch": 2.17224516721161, "grad_norm": 5.540461540222168, "learning_rate": 9.197295459856805e-06, "loss": 0.095, "step": 151475 }, { "epoch": 2.172603682670797, "grad_norm": 4.345017910003662, "learning_rate": 9.193311946291614e-06, "loss": 0.0622, "step": 151500 }, { "epoch": 2.1729621981299836, "grad_norm": 8.242680549621582, "learning_rate": 9.189328432726422e-06, "loss": 0.1091, "step": 151525 }, { "epoch": 2.17332071358917, "grad_norm": 1.9845566749572754, "learning_rate": 9.185344919161232e-06, "loss": 0.1022, "step": 151550 }, { "epoch": 2.1736792290483566, "grad_norm": 0.9134750366210938, "learning_rate": 9.18136140559604e-06, "loss": 0.1071, "step": 151575 }, { "epoch": 2.174037744507543, "grad_norm": 3.4212489128112793, "learning_rate": 9.17737789203085e-06, "loss": 0.059, "step": 151600 }, { "epoch": 2.1743962599667297, "grad_norm": 2.9734909534454346, "learning_rate": 9.173394378465657e-06, "loss": 0.0686, "step": 151625 }, { "epoch": 2.1747547754259164, "grad_norm": 2.828265428543091, "learning_rate": 9.169410864900465e-06, "loss": 0.0651, "step": 151650 }, { "epoch": 2.175113290885103, "grad_norm": 21.868534088134766, "learning_rate": 9.165427351335274e-06, "loss": 0.1065, "step": 151675 }, { "epoch": 2.1754718063442895, "grad_norm": 0.2716211974620819, "learning_rate": 9.161443837770082e-06, "loss": 0.0496, "step": 151700 }, { "epoch": 2.1758303218034762, "grad_norm": 2.403010129928589, "learning_rate": 9.157460324204892e-06, "loss": 0.0607, "step": 151725 }, { "epoch": 2.1761888372626625, "grad_norm": 0.03029789589345455, "learning_rate": 9.1534768106397e-06, "loss": 0.1522, "step": 151750 }, { "epoch": 2.1765473527218493, "grad_norm": 10.505549430847168, "learning_rate": 9.149493297074508e-06, "loss": 0.1494, "step": 151775 }, { "epoch": 2.176905868181036, "grad_norm": 3.314692974090576, "learning_rate": 9.145509783509317e-06, "loss": 0.0688, "step": 151800 }, { "epoch": 2.177264383640223, "grad_norm": 0.09270961582660675, "learning_rate": 9.141526269944125e-06, "loss": 0.0822, "step": 151825 }, { "epoch": 2.177622899099409, "grad_norm": 9.787714004516602, "learning_rate": 9.137542756378934e-06, "loss": 0.0553, "step": 151850 }, { "epoch": 2.177981414558596, "grad_norm": 0.37350013852119446, "learning_rate": 9.133559242813742e-06, "loss": 0.1219, "step": 151875 }, { "epoch": 2.178339930017782, "grad_norm": 0.5357393622398376, "learning_rate": 9.129575729248552e-06, "loss": 0.1014, "step": 151900 }, { "epoch": 2.178698445476969, "grad_norm": 14.554646492004395, "learning_rate": 9.125592215683358e-06, "loss": 0.0583, "step": 151925 }, { "epoch": 2.1790569609361556, "grad_norm": 5.130795955657959, "learning_rate": 9.121608702118166e-06, "loss": 0.1218, "step": 151950 }, { "epoch": 2.1794154763953424, "grad_norm": 8.072774887084961, "learning_rate": 9.117625188552975e-06, "loss": 0.0896, "step": 151975 }, { "epoch": 2.1797739918545287, "grad_norm": 0.07047275453805923, "learning_rate": 9.113641674987783e-06, "loss": 0.1113, "step": 152000 }, { "epoch": 2.1801325073137154, "grad_norm": 2.1958229541778564, "learning_rate": 9.109658161422593e-06, "loss": 0.0272, "step": 152025 }, { "epoch": 2.1804910227729017, "grad_norm": 11.204858779907227, "learning_rate": 9.1056746478574e-06, "loss": 0.0701, "step": 152050 }, { "epoch": 2.1808495382320885, "grad_norm": 0.8011798858642578, "learning_rate": 9.101691134292208e-06, "loss": 0.0576, "step": 152075 }, { "epoch": 2.1812080536912752, "grad_norm": 1.8580259084701538, "learning_rate": 9.097707620727018e-06, "loss": 0.0616, "step": 152100 }, { "epoch": 2.181566569150462, "grad_norm": 1.0846468210220337, "learning_rate": 9.093724107161826e-06, "loss": 0.0955, "step": 152125 }, { "epoch": 2.1819250846096483, "grad_norm": 13.46346378326416, "learning_rate": 9.089740593596635e-06, "loss": 0.168, "step": 152150 }, { "epoch": 2.182283600068835, "grad_norm": 0.5443722009658813, "learning_rate": 9.085757080031443e-06, "loss": 0.0972, "step": 152175 }, { "epoch": 2.1826421155280213, "grad_norm": 0.5613654255867004, "learning_rate": 9.081773566466253e-06, "loss": 0.0849, "step": 152200 }, { "epoch": 2.183000630987208, "grad_norm": 5.215660572052002, "learning_rate": 9.07779005290106e-06, "loss": 0.0612, "step": 152225 }, { "epoch": 2.183359146446395, "grad_norm": 2.5866007804870605, "learning_rate": 9.073806539335868e-06, "loss": 0.0779, "step": 152250 }, { "epoch": 2.1837176619055816, "grad_norm": 6.545581817626953, "learning_rate": 9.069823025770678e-06, "loss": 0.0586, "step": 152275 }, { "epoch": 2.184076177364768, "grad_norm": 3.3335158824920654, "learning_rate": 9.065839512205486e-06, "loss": 0.1485, "step": 152300 }, { "epoch": 2.1844346928239546, "grad_norm": 0.18982188403606415, "learning_rate": 9.061855998640295e-06, "loss": 0.046, "step": 152325 }, { "epoch": 2.184793208283141, "grad_norm": 11.015839576721191, "learning_rate": 9.057872485075103e-06, "loss": 0.1157, "step": 152350 }, { "epoch": 2.1851517237423277, "grad_norm": 0.29694104194641113, "learning_rate": 9.053888971509911e-06, "loss": 0.1566, "step": 152375 }, { "epoch": 2.1855102392015144, "grad_norm": 1.1338814496994019, "learning_rate": 9.04990545794472e-06, "loss": 0.0985, "step": 152400 }, { "epoch": 2.185868754660701, "grad_norm": 9.836166381835938, "learning_rate": 9.045921944379528e-06, "loss": 0.0853, "step": 152425 }, { "epoch": 2.1862272701198875, "grad_norm": 7.992711544036865, "learning_rate": 9.041938430814338e-06, "loss": 0.11, "step": 152450 }, { "epoch": 2.1865857855790742, "grad_norm": 16.992067337036133, "learning_rate": 9.037954917249144e-06, "loss": 0.1754, "step": 152475 }, { "epoch": 2.1869443010382605, "grad_norm": 7.084727764129639, "learning_rate": 9.033971403683954e-06, "loss": 0.0612, "step": 152500 }, { "epoch": 2.1873028164974473, "grad_norm": 12.98836898803711, "learning_rate": 9.029987890118761e-06, "loss": 0.187, "step": 152525 }, { "epoch": 2.187661331956634, "grad_norm": 1.7958953380584717, "learning_rate": 9.02600437655357e-06, "loss": 0.0871, "step": 152550 }, { "epoch": 2.188019847415821, "grad_norm": 0.37494775652885437, "learning_rate": 9.022020862988379e-06, "loss": 0.1135, "step": 152575 }, { "epoch": 2.188378362875007, "grad_norm": 1.053135633468628, "learning_rate": 9.018037349423187e-06, "loss": 0.1236, "step": 152600 }, { "epoch": 2.188736878334194, "grad_norm": 0.054149072617292404, "learning_rate": 9.014053835857996e-06, "loss": 0.0643, "step": 152625 }, { "epoch": 2.18909539379338, "grad_norm": 0.8476911783218384, "learning_rate": 9.010070322292804e-06, "loss": 0.104, "step": 152650 }, { "epoch": 2.189453909252567, "grad_norm": 12.057558059692383, "learning_rate": 9.006086808727612e-06, "loss": 0.1355, "step": 152675 }, { "epoch": 2.1898124247117536, "grad_norm": 0.5513665080070496, "learning_rate": 9.002103295162421e-06, "loss": 0.1102, "step": 152700 }, { "epoch": 2.1901709401709404, "grad_norm": 13.550256729125977, "learning_rate": 8.99811978159723e-06, "loss": 0.0902, "step": 152725 }, { "epoch": 2.1905294556301267, "grad_norm": 11.019536018371582, "learning_rate": 8.994136268032039e-06, "loss": 0.0721, "step": 152750 }, { "epoch": 2.1908879710893134, "grad_norm": 15.66824722290039, "learning_rate": 8.990152754466847e-06, "loss": 0.0723, "step": 152775 }, { "epoch": 2.1912464865484997, "grad_norm": 2.610976219177246, "learning_rate": 8.986169240901656e-06, "loss": 0.1076, "step": 152800 }, { "epoch": 2.1916050020076865, "grad_norm": 1.0501645803451538, "learning_rate": 8.982185727336464e-06, "loss": 0.109, "step": 152825 }, { "epoch": 2.1919635174668732, "grad_norm": 0.2533095180988312, "learning_rate": 8.978202213771272e-06, "loss": 0.0713, "step": 152850 }, { "epoch": 2.19232203292606, "grad_norm": 0.8502959609031677, "learning_rate": 8.974218700206081e-06, "loss": 0.0965, "step": 152875 }, { "epoch": 2.1926805483852463, "grad_norm": 9.917193412780762, "learning_rate": 8.97023518664089e-06, "loss": 0.2032, "step": 152900 }, { "epoch": 2.193039063844433, "grad_norm": 0.6691857576370239, "learning_rate": 8.966251673075699e-06, "loss": 0.1717, "step": 152925 }, { "epoch": 2.1933975793036193, "grad_norm": 17.353254318237305, "learning_rate": 8.962268159510507e-06, "loss": 0.0377, "step": 152950 }, { "epoch": 2.193756094762806, "grad_norm": 1.683001160621643, "learning_rate": 8.958284645945315e-06, "loss": 0.0549, "step": 152975 }, { "epoch": 2.194114610221993, "grad_norm": 17.863677978515625, "learning_rate": 8.954301132380124e-06, "loss": 0.0844, "step": 153000 }, { "epoch": 2.1944731256811796, "grad_norm": 1.6028317213058472, "learning_rate": 8.95031761881493e-06, "loss": 0.1538, "step": 153025 }, { "epoch": 2.194831641140366, "grad_norm": 11.977834701538086, "learning_rate": 8.94633410524974e-06, "loss": 0.0975, "step": 153050 }, { "epoch": 2.1951901565995526, "grad_norm": 0.9646387100219727, "learning_rate": 8.942350591684548e-06, "loss": 0.0894, "step": 153075 }, { "epoch": 2.195548672058739, "grad_norm": 1.9093514680862427, "learning_rate": 8.938367078119357e-06, "loss": 0.0763, "step": 153100 }, { "epoch": 2.1959071875179257, "grad_norm": 1.7526353597640991, "learning_rate": 8.934383564554165e-06, "loss": 0.0664, "step": 153125 }, { "epoch": 2.1962657029771124, "grad_norm": 14.548548698425293, "learning_rate": 8.930400050988973e-06, "loss": 0.0739, "step": 153150 }, { "epoch": 2.196624218436299, "grad_norm": 1.3079602718353271, "learning_rate": 8.926416537423782e-06, "loss": 0.0866, "step": 153175 }, { "epoch": 2.1969827338954855, "grad_norm": 3.089570999145508, "learning_rate": 8.92243302385859e-06, "loss": 0.0595, "step": 153200 }, { "epoch": 2.1973412493546722, "grad_norm": 7.7914276123046875, "learning_rate": 8.9184495102934e-06, "loss": 0.1075, "step": 153225 }, { "epoch": 2.197699764813859, "grad_norm": 0.8495354652404785, "learning_rate": 8.914465996728208e-06, "loss": 0.0463, "step": 153250 }, { "epoch": 2.1980582802730453, "grad_norm": 2.151815891265869, "learning_rate": 8.910482483163015e-06, "loss": 0.0841, "step": 153275 }, { "epoch": 2.198416795732232, "grad_norm": 8.212874412536621, "learning_rate": 8.906498969597825e-06, "loss": 0.04, "step": 153300 }, { "epoch": 2.198775311191419, "grad_norm": 8.720788955688477, "learning_rate": 8.902515456032633e-06, "loss": 0.1949, "step": 153325 }, { "epoch": 2.199133826650605, "grad_norm": 4.01032829284668, "learning_rate": 8.898531942467442e-06, "loss": 0.0872, "step": 153350 }, { "epoch": 2.199492342109792, "grad_norm": 3.359734296798706, "learning_rate": 8.89454842890225e-06, "loss": 0.0712, "step": 153375 }, { "epoch": 2.1998508575689786, "grad_norm": 6.373823642730713, "learning_rate": 8.89056491533706e-06, "loss": 0.0394, "step": 153400 }, { "epoch": 2.200209373028165, "grad_norm": 5.328566551208496, "learning_rate": 8.886581401771868e-06, "loss": 0.1101, "step": 153425 }, { "epoch": 2.2005678884873516, "grad_norm": 2.592747926712036, "learning_rate": 8.882597888206675e-06, "loss": 0.0751, "step": 153450 }, { "epoch": 2.2009264039465384, "grad_norm": 0.2610323429107666, "learning_rate": 8.878614374641485e-06, "loss": 0.0499, "step": 153475 }, { "epoch": 2.2012849194057247, "grad_norm": 3.3908426761627197, "learning_rate": 8.874630861076293e-06, "loss": 0.0515, "step": 153500 }, { "epoch": 2.2016434348649114, "grad_norm": 0.6069886088371277, "learning_rate": 8.870647347511102e-06, "loss": 0.0937, "step": 153525 }, { "epoch": 2.202001950324098, "grad_norm": 13.348692893981934, "learning_rate": 8.86666383394591e-06, "loss": 0.0683, "step": 153550 }, { "epoch": 2.2023604657832845, "grad_norm": 1.7498010396957397, "learning_rate": 8.862680320380716e-06, "loss": 0.0929, "step": 153575 }, { "epoch": 2.2027189812424712, "grad_norm": 2.072547674179077, "learning_rate": 8.858696806815526e-06, "loss": 0.0963, "step": 153600 }, { "epoch": 2.203077496701658, "grad_norm": 0.36005812883377075, "learning_rate": 8.854713293250334e-06, "loss": 0.0449, "step": 153625 }, { "epoch": 2.2034360121608443, "grad_norm": 0.47941291332244873, "learning_rate": 8.850729779685143e-06, "loss": 0.0823, "step": 153650 }, { "epoch": 2.203794527620031, "grad_norm": 0.3894939720630646, "learning_rate": 8.846746266119951e-06, "loss": 0.0644, "step": 153675 }, { "epoch": 2.2041530430792178, "grad_norm": 0.09629198163747787, "learning_rate": 8.84276275255476e-06, "loss": 0.1012, "step": 153700 }, { "epoch": 2.204511558538404, "grad_norm": 3.6264841556549072, "learning_rate": 8.838779238989569e-06, "loss": 0.0494, "step": 153725 }, { "epoch": 2.204870073997591, "grad_norm": 16.43877410888672, "learning_rate": 8.834795725424376e-06, "loss": 0.0855, "step": 153750 }, { "epoch": 2.2052285894567776, "grad_norm": 3.109663486480713, "learning_rate": 8.830812211859186e-06, "loss": 0.144, "step": 153775 }, { "epoch": 2.205587104915964, "grad_norm": 1.9031529426574707, "learning_rate": 8.826828698293994e-06, "loss": 0.0846, "step": 153800 }, { "epoch": 2.2059456203751506, "grad_norm": 2.187448740005493, "learning_rate": 8.822845184728803e-06, "loss": 0.0459, "step": 153825 }, { "epoch": 2.2063041358343374, "grad_norm": 18.525041580200195, "learning_rate": 8.818861671163611e-06, "loss": 0.0921, "step": 153850 }, { "epoch": 2.2066626512935237, "grad_norm": 0.3607323467731476, "learning_rate": 8.814878157598419e-06, "loss": 0.0898, "step": 153875 }, { "epoch": 2.2070211667527104, "grad_norm": 0.5191817879676819, "learning_rate": 8.810894644033229e-06, "loss": 0.0834, "step": 153900 }, { "epoch": 2.207379682211897, "grad_norm": 2.624295473098755, "learning_rate": 8.806911130468036e-06, "loss": 0.0843, "step": 153925 }, { "epoch": 2.2077381976710835, "grad_norm": 0.17697903513908386, "learning_rate": 8.802927616902846e-06, "loss": 0.1186, "step": 153950 }, { "epoch": 2.2080967131302702, "grad_norm": 1.7385756969451904, "learning_rate": 8.798944103337654e-06, "loss": 0.0587, "step": 153975 }, { "epoch": 2.208455228589457, "grad_norm": 12.394301414489746, "learning_rate": 8.794960589772463e-06, "loss": 0.0501, "step": 154000 }, { "epoch": 2.2088137440486433, "grad_norm": 0.046829190105199814, "learning_rate": 8.790977076207271e-06, "loss": 0.0847, "step": 154025 }, { "epoch": 2.20917225950783, "grad_norm": 0.1221519261598587, "learning_rate": 8.786993562642079e-06, "loss": 0.0844, "step": 154050 }, { "epoch": 2.2095307749670168, "grad_norm": 0.2878974974155426, "learning_rate": 8.783010049076889e-06, "loss": 0.0912, "step": 154075 }, { "epoch": 2.209889290426203, "grad_norm": 0.5784372687339783, "learning_rate": 8.779026535511696e-06, "loss": 0.0873, "step": 154100 }, { "epoch": 2.21024780588539, "grad_norm": 0.9319890737533569, "learning_rate": 8.775043021946504e-06, "loss": 0.0496, "step": 154125 }, { "epoch": 2.2106063213445766, "grad_norm": 0.6572452783584595, "learning_rate": 8.771059508381312e-06, "loss": 0.1041, "step": 154150 }, { "epoch": 2.210964836803763, "grad_norm": 0.5566487312316895, "learning_rate": 8.76707599481612e-06, "loss": 0.1317, "step": 154175 }, { "epoch": 2.2113233522629496, "grad_norm": 12.763232231140137, "learning_rate": 8.76309248125093e-06, "loss": 0.127, "step": 154200 }, { "epoch": 2.2116818677221364, "grad_norm": 2.4306371212005615, "learning_rate": 8.759108967685737e-06, "loss": 0.1152, "step": 154225 }, { "epoch": 2.2120403831813227, "grad_norm": 3.5850419998168945, "learning_rate": 8.755125454120547e-06, "loss": 0.0311, "step": 154250 }, { "epoch": 2.2123988986405094, "grad_norm": 7.173678398132324, "learning_rate": 8.751141940555355e-06, "loss": 0.1038, "step": 154275 }, { "epoch": 2.212757414099696, "grad_norm": 0.32048797607421875, "learning_rate": 8.747158426990163e-06, "loss": 0.0804, "step": 154300 }, { "epoch": 2.2131159295588825, "grad_norm": 2.1912996768951416, "learning_rate": 8.743174913424972e-06, "loss": 0.0478, "step": 154325 }, { "epoch": 2.2134744450180692, "grad_norm": 0.2639313340187073, "learning_rate": 8.73919139985978e-06, "loss": 0.1222, "step": 154350 }, { "epoch": 2.213832960477256, "grad_norm": 0.32443055510520935, "learning_rate": 8.73520788629459e-06, "loss": 0.0437, "step": 154375 }, { "epoch": 2.2141914759364423, "grad_norm": 1.7326329946517944, "learning_rate": 8.731224372729397e-06, "loss": 0.1034, "step": 154400 }, { "epoch": 2.214549991395629, "grad_norm": 2.555551528930664, "learning_rate": 8.727240859164207e-06, "loss": 0.0753, "step": 154425 }, { "epoch": 2.2149085068548158, "grad_norm": 13.70197582244873, "learning_rate": 8.723257345599015e-06, "loss": 0.1466, "step": 154450 }, { "epoch": 2.215267022314002, "grad_norm": 0.8352129459381104, "learning_rate": 8.719273832033822e-06, "loss": 0.0573, "step": 154475 }, { "epoch": 2.215625537773189, "grad_norm": 0.8418903946876526, "learning_rate": 8.715290318468632e-06, "loss": 0.1092, "step": 154500 }, { "epoch": 2.2159840532323756, "grad_norm": 0.3346066474914551, "learning_rate": 8.71130680490344e-06, "loss": 0.0582, "step": 154525 }, { "epoch": 2.216342568691562, "grad_norm": 0.9188634157180786, "learning_rate": 8.70732329133825e-06, "loss": 0.115, "step": 154550 }, { "epoch": 2.2167010841507486, "grad_norm": 4.746646881103516, "learning_rate": 8.703339777773057e-06, "loss": 0.1106, "step": 154575 }, { "epoch": 2.2170595996099354, "grad_norm": 2.7687649726867676, "learning_rate": 8.699356264207865e-06, "loss": 0.1054, "step": 154600 }, { "epoch": 2.2174181150691217, "grad_norm": 0.11922089010477066, "learning_rate": 8.695372750642675e-06, "loss": 0.0926, "step": 154625 }, { "epoch": 2.2177766305283084, "grad_norm": 17.006053924560547, "learning_rate": 8.691389237077482e-06, "loss": 0.0533, "step": 154650 }, { "epoch": 2.218135145987495, "grad_norm": 2.725027561187744, "learning_rate": 8.68740572351229e-06, "loss": 0.1339, "step": 154675 }, { "epoch": 2.2184936614466815, "grad_norm": 3.3966705799102783, "learning_rate": 8.683422209947098e-06, "loss": 0.0525, "step": 154700 }, { "epoch": 2.2188521769058682, "grad_norm": 5.142869472503662, "learning_rate": 8.679438696381908e-06, "loss": 0.0958, "step": 154725 }, { "epoch": 2.219210692365055, "grad_norm": 17.632049560546875, "learning_rate": 8.675455182816716e-06, "loss": 0.1472, "step": 154750 }, { "epoch": 2.2195692078242413, "grad_norm": 7.72509241104126, "learning_rate": 8.671471669251523e-06, "loss": 0.0863, "step": 154775 }, { "epoch": 2.219927723283428, "grad_norm": 5.590676307678223, "learning_rate": 8.667488155686333e-06, "loss": 0.1468, "step": 154800 }, { "epoch": 2.2202862387426148, "grad_norm": 2.947134017944336, "learning_rate": 8.66350464212114e-06, "loss": 0.0567, "step": 154825 }, { "epoch": 2.220644754201801, "grad_norm": 2.1945555210113525, "learning_rate": 8.65952112855595e-06, "loss": 0.0329, "step": 154850 }, { "epoch": 2.221003269660988, "grad_norm": 4.652674674987793, "learning_rate": 8.655537614990758e-06, "loss": 0.0918, "step": 154875 }, { "epoch": 2.2213617851201746, "grad_norm": 17.950849533081055, "learning_rate": 8.651554101425566e-06, "loss": 0.0756, "step": 154900 }, { "epoch": 2.221720300579361, "grad_norm": 3.3227436542510986, "learning_rate": 8.647570587860376e-06, "loss": 0.1041, "step": 154925 }, { "epoch": 2.2220788160385476, "grad_norm": 0.14611683785915375, "learning_rate": 8.643587074295183e-06, "loss": 0.0703, "step": 154950 }, { "epoch": 2.2224373314977344, "grad_norm": 1.5873721837997437, "learning_rate": 8.639603560729993e-06, "loss": 0.0712, "step": 154975 }, { "epoch": 2.2227958469569207, "grad_norm": 1.2701375484466553, "learning_rate": 8.6356200471648e-06, "loss": 0.0572, "step": 155000 }, { "epoch": 2.2231543624161074, "grad_norm": 2.425800323486328, "learning_rate": 8.63163653359961e-06, "loss": 0.0724, "step": 155025 }, { "epoch": 2.223512877875294, "grad_norm": 0.8708847165107727, "learning_rate": 8.627653020034418e-06, "loss": 0.1095, "step": 155050 }, { "epoch": 2.2238713933344805, "grad_norm": 1.3056938648223877, "learning_rate": 8.623669506469226e-06, "loss": 0.1101, "step": 155075 }, { "epoch": 2.224229908793667, "grad_norm": 7.869133472442627, "learning_rate": 8.619685992904036e-06, "loss": 0.157, "step": 155100 }, { "epoch": 2.224588424252854, "grad_norm": 11.97579574584961, "learning_rate": 8.615702479338843e-06, "loss": 0.0365, "step": 155125 }, { "epoch": 2.2249469397120403, "grad_norm": 14.438315391540527, "learning_rate": 8.611718965773653e-06, "loss": 0.095, "step": 155150 }, { "epoch": 2.225305455171227, "grad_norm": 5.699769496917725, "learning_rate": 8.60773545220846e-06, "loss": 0.0854, "step": 155175 }, { "epoch": 2.2256639706304138, "grad_norm": 11.653741836547852, "learning_rate": 8.603751938643269e-06, "loss": 0.1039, "step": 155200 }, { "epoch": 2.2260224860896, "grad_norm": 0.6042342185974121, "learning_rate": 8.599768425078076e-06, "loss": 0.1718, "step": 155225 }, { "epoch": 2.226381001548787, "grad_norm": 6.668888092041016, "learning_rate": 8.595784911512884e-06, "loss": 0.0703, "step": 155250 }, { "epoch": 2.2267395170079736, "grad_norm": 2.7826318740844727, "learning_rate": 8.591801397947694e-06, "loss": 0.0863, "step": 155275 }, { "epoch": 2.22709803246716, "grad_norm": 1.6983146667480469, "learning_rate": 8.587817884382502e-06, "loss": 0.0638, "step": 155300 }, { "epoch": 2.2274565479263466, "grad_norm": 0.5179972052574158, "learning_rate": 8.583834370817311e-06, "loss": 0.0722, "step": 155325 }, { "epoch": 2.2278150633855334, "grad_norm": 28.695430755615234, "learning_rate": 8.579850857252119e-06, "loss": 0.075, "step": 155350 }, { "epoch": 2.2281735788447197, "grad_norm": 0.25836437940597534, "learning_rate": 8.575867343686927e-06, "loss": 0.0517, "step": 155375 }, { "epoch": 2.2285320943039064, "grad_norm": 10.301990509033203, "learning_rate": 8.571883830121736e-06, "loss": 0.0736, "step": 155400 }, { "epoch": 2.228890609763093, "grad_norm": 3.622614860534668, "learning_rate": 8.567900316556544e-06, "loss": 0.1402, "step": 155425 }, { "epoch": 2.2292491252222795, "grad_norm": 0.018690720200538635, "learning_rate": 8.563916802991354e-06, "loss": 0.0363, "step": 155450 }, { "epoch": 2.229607640681466, "grad_norm": 6.808429718017578, "learning_rate": 8.559933289426162e-06, "loss": 0.073, "step": 155475 }, { "epoch": 2.229966156140653, "grad_norm": 0.8675071001052856, "learning_rate": 8.55594977586097e-06, "loss": 0.1707, "step": 155500 }, { "epoch": 2.2303246715998393, "grad_norm": 1.3084269762039185, "learning_rate": 8.551966262295779e-06, "loss": 0.0701, "step": 155525 }, { "epoch": 2.230683187059026, "grad_norm": 1.5000230073928833, "learning_rate": 8.547982748730587e-06, "loss": 0.0826, "step": 155550 }, { "epoch": 2.2310417025182128, "grad_norm": 2.4006407260894775, "learning_rate": 8.543999235165396e-06, "loss": 0.0973, "step": 155575 }, { "epoch": 2.231400217977399, "grad_norm": 1.0935691595077515, "learning_rate": 8.540015721600204e-06, "loss": 0.0802, "step": 155600 }, { "epoch": 2.231758733436586, "grad_norm": 0.2569175660610199, "learning_rate": 8.536032208035014e-06, "loss": 0.1424, "step": 155625 }, { "epoch": 2.2321172488957726, "grad_norm": 1.9856864213943481, "learning_rate": 8.532048694469822e-06, "loss": 0.0869, "step": 155650 }, { "epoch": 2.232475764354959, "grad_norm": 5.648294448852539, "learning_rate": 8.52806518090463e-06, "loss": 0.13, "step": 155675 }, { "epoch": 2.2328342798141456, "grad_norm": 0.4044096767902374, "learning_rate": 8.524081667339439e-06, "loss": 0.0504, "step": 155700 }, { "epoch": 2.2331927952733324, "grad_norm": 7.660882949829102, "learning_rate": 8.520098153774247e-06, "loss": 0.1182, "step": 155725 }, { "epoch": 2.2335513107325187, "grad_norm": 3.0349676609039307, "learning_rate": 8.516114640209056e-06, "loss": 0.0671, "step": 155750 }, { "epoch": 2.2339098261917054, "grad_norm": 0.8068325519561768, "learning_rate": 8.512131126643863e-06, "loss": 0.1035, "step": 155775 }, { "epoch": 2.234268341650892, "grad_norm": 10.102729797363281, "learning_rate": 8.50814761307867e-06, "loss": 0.0679, "step": 155800 }, { "epoch": 2.2346268571100785, "grad_norm": 15.384527206420898, "learning_rate": 8.50416409951348e-06, "loss": 0.109, "step": 155825 }, { "epoch": 2.234985372569265, "grad_norm": 3.3175957202911377, "learning_rate": 8.500180585948288e-06, "loss": 0.1489, "step": 155850 }, { "epoch": 2.235343888028452, "grad_norm": 2.894136905670166, "learning_rate": 8.496197072383097e-06, "loss": 0.1366, "step": 155875 }, { "epoch": 2.2357024034876383, "grad_norm": 0.5096798539161682, "learning_rate": 8.492213558817905e-06, "loss": 0.0707, "step": 155900 }, { "epoch": 2.236060918946825, "grad_norm": 1.0018388032913208, "learning_rate": 8.488230045252715e-06, "loss": 0.0524, "step": 155925 }, { "epoch": 2.2364194344060118, "grad_norm": 7.299844741821289, "learning_rate": 8.484246531687523e-06, "loss": 0.1048, "step": 155950 }, { "epoch": 2.236777949865198, "grad_norm": 1.8598577976226807, "learning_rate": 8.48026301812233e-06, "loss": 0.0849, "step": 155975 }, { "epoch": 2.237136465324385, "grad_norm": 7.405637264251709, "learning_rate": 8.47627950455714e-06, "loss": 0.1094, "step": 156000 }, { "epoch": 2.2374949807835716, "grad_norm": 0.5499764680862427, "learning_rate": 8.472295990991948e-06, "loss": 0.0908, "step": 156025 }, { "epoch": 2.237853496242758, "grad_norm": 23.237201690673828, "learning_rate": 8.468312477426757e-06, "loss": 0.1636, "step": 156050 }, { "epoch": 2.2382120117019446, "grad_norm": 0.8052083253860474, "learning_rate": 8.464328963861565e-06, "loss": 0.09, "step": 156075 }, { "epoch": 2.2385705271611314, "grad_norm": 13.682013511657715, "learning_rate": 8.460345450296373e-06, "loss": 0.0942, "step": 156100 }, { "epoch": 2.2389290426203177, "grad_norm": 0.8964294195175171, "learning_rate": 8.456361936731183e-06, "loss": 0.0602, "step": 156125 }, { "epoch": 2.2392875580795044, "grad_norm": 2.304245948791504, "learning_rate": 8.45237842316599e-06, "loss": 0.0564, "step": 156150 }, { "epoch": 2.239646073538691, "grad_norm": 8.674793243408203, "learning_rate": 8.4483949096008e-06, "loss": 0.0578, "step": 156175 }, { "epoch": 2.2400045889978775, "grad_norm": 2.677730083465576, "learning_rate": 8.444411396035608e-06, "loss": 0.1037, "step": 156200 }, { "epoch": 2.240363104457064, "grad_norm": 1.125256896018982, "learning_rate": 8.440427882470417e-06, "loss": 0.1467, "step": 156225 }, { "epoch": 2.240721619916251, "grad_norm": 4.2120442390441895, "learning_rate": 8.436444368905225e-06, "loss": 0.1695, "step": 156250 }, { "epoch": 2.2410801353754373, "grad_norm": 2.235917568206787, "learning_rate": 8.432460855340033e-06, "loss": 0.0671, "step": 156275 }, { "epoch": 2.241438650834624, "grad_norm": 2.1193649768829346, "learning_rate": 8.428477341774843e-06, "loss": 0.0696, "step": 156300 }, { "epoch": 2.2417971662938108, "grad_norm": 1.0715816020965576, "learning_rate": 8.424493828209649e-06, "loss": 0.0826, "step": 156325 }, { "epoch": 2.242155681752997, "grad_norm": 2.809828758239746, "learning_rate": 8.420510314644458e-06, "loss": 0.0701, "step": 156350 }, { "epoch": 2.242514197212184, "grad_norm": 8.825328826904297, "learning_rate": 8.416526801079266e-06, "loss": 0.0483, "step": 156375 }, { "epoch": 2.2428727126713706, "grad_norm": 0.5583833456039429, "learning_rate": 8.412543287514074e-06, "loss": 0.0989, "step": 156400 }, { "epoch": 2.243231228130557, "grad_norm": 2.3012213706970215, "learning_rate": 8.408559773948884e-06, "loss": 0.0761, "step": 156425 }, { "epoch": 2.2435897435897436, "grad_norm": 1.385063648223877, "learning_rate": 8.404576260383691e-06, "loss": 0.0359, "step": 156450 }, { "epoch": 2.2439482590489304, "grad_norm": 0.09371892362833023, "learning_rate": 8.400592746818501e-06, "loss": 0.0952, "step": 156475 }, { "epoch": 2.2443067745081167, "grad_norm": 3.2833757400512695, "learning_rate": 8.396609233253309e-06, "loss": 0.1826, "step": 156500 }, { "epoch": 2.2446652899673034, "grad_norm": 0.7741411924362183, "learning_rate": 8.392625719688118e-06, "loss": 0.054, "step": 156525 }, { "epoch": 2.24502380542649, "grad_norm": 1.2651708126068115, "learning_rate": 8.388642206122926e-06, "loss": 0.1245, "step": 156550 }, { "epoch": 2.2453823208856765, "grad_norm": 11.753560066223145, "learning_rate": 8.384658692557734e-06, "loss": 0.1815, "step": 156575 }, { "epoch": 2.245740836344863, "grad_norm": 2.453026056289673, "learning_rate": 8.380675178992543e-06, "loss": 0.0793, "step": 156600 }, { "epoch": 2.24609935180405, "grad_norm": 9.621888160705566, "learning_rate": 8.376691665427351e-06, "loss": 0.086, "step": 156625 }, { "epoch": 2.2464578672632363, "grad_norm": 0.023152165114879608, "learning_rate": 8.372708151862161e-06, "loss": 0.1307, "step": 156650 }, { "epoch": 2.246816382722423, "grad_norm": 0.020331207662820816, "learning_rate": 8.368724638296969e-06, "loss": 0.0917, "step": 156675 }, { "epoch": 2.2471748981816098, "grad_norm": 0.2207329422235489, "learning_rate": 8.364741124731777e-06, "loss": 0.0659, "step": 156700 }, { "epoch": 2.247533413640796, "grad_norm": 0.9486859440803528, "learning_rate": 8.360757611166586e-06, "loss": 0.1121, "step": 156725 }, { "epoch": 2.247891929099983, "grad_norm": 3.9675018787384033, "learning_rate": 8.356774097601394e-06, "loss": 0.072, "step": 156750 }, { "epoch": 2.2482504445591696, "grad_norm": 0.9849076867103577, "learning_rate": 8.352790584036203e-06, "loss": 0.1123, "step": 156775 }, { "epoch": 2.248608960018356, "grad_norm": 1.8861134052276611, "learning_rate": 8.348807070471011e-06, "loss": 0.0379, "step": 156800 }, { "epoch": 2.2489674754775426, "grad_norm": 2.0883922576904297, "learning_rate": 8.344823556905821e-06, "loss": 0.0581, "step": 156825 }, { "epoch": 2.2493259909367294, "grad_norm": 0.16171720623970032, "learning_rate": 8.340840043340629e-06, "loss": 0.0957, "step": 156850 }, { "epoch": 2.2496845063959157, "grad_norm": 14.555262565612793, "learning_rate": 8.336856529775435e-06, "loss": 0.1551, "step": 156875 }, { "epoch": 2.2500430218551024, "grad_norm": 0.07132162153720856, "learning_rate": 8.332873016210244e-06, "loss": 0.0571, "step": 156900 }, { "epoch": 2.250401537314289, "grad_norm": 0.3447563648223877, "learning_rate": 8.328889502645052e-06, "loss": 0.0866, "step": 156925 }, { "epoch": 2.2507600527734755, "grad_norm": 0.2722165882587433, "learning_rate": 8.324905989079862e-06, "loss": 0.0903, "step": 156950 }, { "epoch": 2.251118568232662, "grad_norm": 3.2098937034606934, "learning_rate": 8.32092247551467e-06, "loss": 0.0696, "step": 156975 }, { "epoch": 2.251477083691849, "grad_norm": 5.977622985839844, "learning_rate": 8.316938961949477e-06, "loss": 0.086, "step": 157000 }, { "epoch": 2.2518355991510353, "grad_norm": 0.2549048066139221, "learning_rate": 8.312955448384287e-06, "loss": 0.101, "step": 157025 }, { "epoch": 2.252194114610222, "grad_norm": 1.9519654512405396, "learning_rate": 8.308971934819095e-06, "loss": 0.0711, "step": 157050 }, { "epoch": 2.2525526300694088, "grad_norm": 1.0070109367370605, "learning_rate": 8.304988421253904e-06, "loss": 0.047, "step": 157075 }, { "epoch": 2.252911145528595, "grad_norm": 8.870450019836426, "learning_rate": 8.301004907688712e-06, "loss": 0.1311, "step": 157100 }, { "epoch": 2.253269660987782, "grad_norm": 11.765524864196777, "learning_rate": 8.297021394123522e-06, "loss": 0.0863, "step": 157125 }, { "epoch": 2.2536281764469686, "grad_norm": 0.36233755946159363, "learning_rate": 8.29303788055833e-06, "loss": 0.1346, "step": 157150 }, { "epoch": 2.253986691906155, "grad_norm": 0.4493035674095154, "learning_rate": 8.289054366993137e-06, "loss": 0.0859, "step": 157175 }, { "epoch": 2.2543452073653416, "grad_norm": 2.631359577178955, "learning_rate": 8.285070853427947e-06, "loss": 0.0836, "step": 157200 }, { "epoch": 2.2547037228245284, "grad_norm": 0.18366242945194244, "learning_rate": 8.281087339862755e-06, "loss": 0.0461, "step": 157225 }, { "epoch": 2.2550622382837147, "grad_norm": 4.116858005523682, "learning_rate": 8.277103826297564e-06, "loss": 0.0696, "step": 157250 }, { "epoch": 2.2554207537429014, "grad_norm": 1.6832953691482544, "learning_rate": 8.273120312732372e-06, "loss": 0.0744, "step": 157275 }, { "epoch": 2.255779269202088, "grad_norm": 0.7373905181884766, "learning_rate": 8.26913679916718e-06, "loss": 0.0724, "step": 157300 }, { "epoch": 2.2561377846612745, "grad_norm": 6.035656929016113, "learning_rate": 8.26515328560199e-06, "loss": 0.1391, "step": 157325 }, { "epoch": 2.256496300120461, "grad_norm": 4.1640706062316895, "learning_rate": 8.261169772036797e-06, "loss": 0.0497, "step": 157350 }, { "epoch": 2.256854815579648, "grad_norm": 3.297037363052368, "learning_rate": 8.257186258471607e-06, "loss": 0.0922, "step": 157375 }, { "epoch": 2.2572133310388343, "grad_norm": 11.392675399780273, "learning_rate": 8.253202744906415e-06, "loss": 0.0963, "step": 157400 }, { "epoch": 2.257571846498021, "grad_norm": 3.4876558780670166, "learning_rate": 8.249219231341223e-06, "loss": 0.1575, "step": 157425 }, { "epoch": 2.2579303619572078, "grad_norm": 0.18749675154685974, "learning_rate": 8.24523571777603e-06, "loss": 0.0913, "step": 157450 }, { "epoch": 2.258288877416394, "grad_norm": 9.015706062316895, "learning_rate": 8.241252204210838e-06, "loss": 0.0857, "step": 157475 }, { "epoch": 2.258647392875581, "grad_norm": 9.2144136428833, "learning_rate": 8.237268690645648e-06, "loss": 0.0646, "step": 157500 }, { "epoch": 2.2590059083347676, "grad_norm": 0.028400402516126633, "learning_rate": 8.233285177080456e-06, "loss": 0.0849, "step": 157525 }, { "epoch": 2.259364423793954, "grad_norm": 0.08422344923019409, "learning_rate": 8.229301663515265e-06, "loss": 0.0818, "step": 157550 }, { "epoch": 2.2597229392531406, "grad_norm": 26.226438522338867, "learning_rate": 8.225318149950073e-06, "loss": 0.1489, "step": 157575 }, { "epoch": 2.2600814547123274, "grad_norm": 1.0591543912887573, "learning_rate": 8.221334636384881e-06, "loss": 0.148, "step": 157600 }, { "epoch": 2.2604399701715137, "grad_norm": 1.0485457181930542, "learning_rate": 8.21735112281969e-06, "loss": 0.1737, "step": 157625 }, { "epoch": 2.2607984856307004, "grad_norm": 1.566718578338623, "learning_rate": 8.213367609254498e-06, "loss": 0.0751, "step": 157650 }, { "epoch": 2.261157001089887, "grad_norm": 0.16904877126216888, "learning_rate": 8.209384095689308e-06, "loss": 0.0979, "step": 157675 }, { "epoch": 2.2615155165490735, "grad_norm": 8.577412605285645, "learning_rate": 8.205400582124116e-06, "loss": 0.0664, "step": 157700 }, { "epoch": 2.26187403200826, "grad_norm": 0.760193943977356, "learning_rate": 8.201417068558925e-06, "loss": 0.0907, "step": 157725 }, { "epoch": 2.262232547467447, "grad_norm": 6.349118709564209, "learning_rate": 8.197433554993733e-06, "loss": 0.0546, "step": 157750 }, { "epoch": 2.2625910629266333, "grad_norm": 0.5743440985679626, "learning_rate": 8.193450041428541e-06, "loss": 0.0716, "step": 157775 }, { "epoch": 2.26294957838582, "grad_norm": 12.551889419555664, "learning_rate": 8.18946652786335e-06, "loss": 0.0643, "step": 157800 }, { "epoch": 2.2633080938450068, "grad_norm": 0.48988404870033264, "learning_rate": 8.185483014298158e-06, "loss": 0.0455, "step": 157825 }, { "epoch": 2.263666609304193, "grad_norm": 7.777688026428223, "learning_rate": 8.181499500732968e-06, "loss": 0.0806, "step": 157850 }, { "epoch": 2.26402512476338, "grad_norm": 0.25640869140625, "learning_rate": 8.177515987167776e-06, "loss": 0.0702, "step": 157875 }, { "epoch": 2.2643836402225666, "grad_norm": 9.108874320983887, "learning_rate": 8.173532473602584e-06, "loss": 0.0813, "step": 157900 }, { "epoch": 2.264742155681753, "grad_norm": 1.2326754331588745, "learning_rate": 8.169548960037393e-06, "loss": 0.0749, "step": 157925 }, { "epoch": 2.2651006711409396, "grad_norm": 2.405704975128174, "learning_rate": 8.165565446472201e-06, "loss": 0.051, "step": 157950 }, { "epoch": 2.2654591866001264, "grad_norm": 0.28724610805511475, "learning_rate": 8.161581932907009e-06, "loss": 0.1095, "step": 157975 }, { "epoch": 2.2658177020593127, "grad_norm": 0.3221210539340973, "learning_rate": 8.157598419341817e-06, "loss": 0.0719, "step": 158000 }, { "epoch": 2.2661762175184994, "grad_norm": 6.686426162719727, "learning_rate": 8.153614905776626e-06, "loss": 0.1277, "step": 158025 }, { "epoch": 2.266534732977686, "grad_norm": 3.6196839809417725, "learning_rate": 8.149631392211434e-06, "loss": 0.068, "step": 158050 }, { "epoch": 2.2668932484368725, "grad_norm": 4.880591869354248, "learning_rate": 8.145647878646242e-06, "loss": 0.1103, "step": 158075 }, { "epoch": 2.267251763896059, "grad_norm": 0.3514000177383423, "learning_rate": 8.141664365081051e-06, "loss": 0.0914, "step": 158100 }, { "epoch": 2.267610279355246, "grad_norm": 0.42033225297927856, "learning_rate": 8.13768085151586e-06, "loss": 0.1095, "step": 158125 }, { "epoch": 2.2679687948144323, "grad_norm": 10.634469985961914, "learning_rate": 8.133697337950669e-06, "loss": 0.0992, "step": 158150 }, { "epoch": 2.268327310273619, "grad_norm": 12.677026748657227, "learning_rate": 8.129713824385477e-06, "loss": 0.0774, "step": 158175 }, { "epoch": 2.2686858257328057, "grad_norm": 5.128621578216553, "learning_rate": 8.125730310820285e-06, "loss": 0.0615, "step": 158200 }, { "epoch": 2.269044341191992, "grad_norm": 0.1059710681438446, "learning_rate": 8.121746797255094e-06, "loss": 0.0437, "step": 158225 }, { "epoch": 2.269402856651179, "grad_norm": 0.3558415472507477, "learning_rate": 8.117763283689902e-06, "loss": 0.0616, "step": 158250 }, { "epoch": 2.2697613721103655, "grad_norm": 16.324495315551758, "learning_rate": 8.113779770124711e-06, "loss": 0.1137, "step": 158275 }, { "epoch": 2.270119887569552, "grad_norm": 10.871369361877441, "learning_rate": 8.10979625655952e-06, "loss": 0.1712, "step": 158300 }, { "epoch": 2.2704784030287386, "grad_norm": 15.716692924499512, "learning_rate": 8.105812742994329e-06, "loss": 0.1019, "step": 158325 }, { "epoch": 2.2708369184879253, "grad_norm": 5.139155387878418, "learning_rate": 8.101829229429137e-06, "loss": 0.1135, "step": 158350 }, { "epoch": 2.2711954339471117, "grad_norm": 0.6983219385147095, "learning_rate": 8.097845715863945e-06, "loss": 0.0991, "step": 158375 }, { "epoch": 2.2715539494062984, "grad_norm": 2.6164369583129883, "learning_rate": 8.093862202298754e-06, "loss": 0.0877, "step": 158400 }, { "epoch": 2.271912464865485, "grad_norm": 2.614797830581665, "learning_rate": 8.089878688733562e-06, "loss": 0.0863, "step": 158425 }, { "epoch": 2.2722709803246715, "grad_norm": 13.458808898925781, "learning_rate": 8.085895175168371e-06, "loss": 0.1434, "step": 158450 }, { "epoch": 2.272629495783858, "grad_norm": 5.829095363616943, "learning_rate": 8.08191166160318e-06, "loss": 0.1044, "step": 158475 }, { "epoch": 2.272988011243045, "grad_norm": 6.154480457305908, "learning_rate": 8.077928148037987e-06, "loss": 0.1113, "step": 158500 }, { "epoch": 2.2733465267022313, "grad_norm": 3.8520326614379883, "learning_rate": 8.073944634472795e-06, "loss": 0.0693, "step": 158525 }, { "epoch": 2.273705042161418, "grad_norm": 4.632570743560791, "learning_rate": 8.069961120907603e-06, "loss": 0.0731, "step": 158550 }, { "epoch": 2.2740635576206047, "grad_norm": 0.35956457257270813, "learning_rate": 8.065977607342412e-06, "loss": 0.1421, "step": 158575 }, { "epoch": 2.274422073079791, "grad_norm": 0.014108879491686821, "learning_rate": 8.06199409377722e-06, "loss": 0.0422, "step": 158600 }, { "epoch": 2.274780588538978, "grad_norm": 0.060446303337812424, "learning_rate": 8.058010580212028e-06, "loss": 0.1155, "step": 158625 }, { "epoch": 2.2751391039981645, "grad_norm": 1.6093162298202515, "learning_rate": 8.054027066646838e-06, "loss": 0.0374, "step": 158650 }, { "epoch": 2.275497619457351, "grad_norm": 0.4712187647819519, "learning_rate": 8.050043553081645e-06, "loss": 0.0696, "step": 158675 }, { "epoch": 2.2758561349165376, "grad_norm": 0.46313613653182983, "learning_rate": 8.046060039516455e-06, "loss": 0.1615, "step": 158700 }, { "epoch": 2.2762146503757243, "grad_norm": 1.1508748531341553, "learning_rate": 8.042076525951263e-06, "loss": 0.1131, "step": 158725 }, { "epoch": 2.2765731658349107, "grad_norm": 18.520105361938477, "learning_rate": 8.038093012386072e-06, "loss": 0.0781, "step": 158750 }, { "epoch": 2.2769316812940974, "grad_norm": 2.5194785594940186, "learning_rate": 8.03410949882088e-06, "loss": 0.041, "step": 158775 }, { "epoch": 2.277290196753284, "grad_norm": 3.798384428024292, "learning_rate": 8.030125985255688e-06, "loss": 0.1395, "step": 158800 }, { "epoch": 2.2776487122124704, "grad_norm": 11.397229194641113, "learning_rate": 8.026142471690498e-06, "loss": 0.1476, "step": 158825 }, { "epoch": 2.278007227671657, "grad_norm": 1.7467044591903687, "learning_rate": 8.022158958125305e-06, "loss": 0.0983, "step": 158850 }, { "epoch": 2.278365743130844, "grad_norm": 0.06276476383209229, "learning_rate": 8.018175444560115e-06, "loss": 0.0905, "step": 158875 }, { "epoch": 2.2787242585900302, "grad_norm": 0.11769267171621323, "learning_rate": 8.014191930994923e-06, "loss": 0.0448, "step": 158900 }, { "epoch": 2.279082774049217, "grad_norm": 1.887939453125, "learning_rate": 8.01020841742973e-06, "loss": 0.1115, "step": 158925 }, { "epoch": 2.2794412895084037, "grad_norm": 0.2855796813964844, "learning_rate": 8.00622490386454e-06, "loss": 0.0369, "step": 158950 }, { "epoch": 2.27979980496759, "grad_norm": 2.981492757797241, "learning_rate": 8.002241390299348e-06, "loss": 0.1297, "step": 158975 }, { "epoch": 2.280158320426777, "grad_norm": 9.009696006774902, "learning_rate": 7.998257876734158e-06, "loss": 0.0874, "step": 159000 }, { "epoch": 2.2805168358859635, "grad_norm": 1.5753121376037598, "learning_rate": 7.994274363168965e-06, "loss": 0.0636, "step": 159025 }, { "epoch": 2.28087535134515, "grad_norm": 1.0517429113388062, "learning_rate": 7.990290849603773e-06, "loss": 0.0589, "step": 159050 }, { "epoch": 2.2812338668043366, "grad_norm": 4.359711170196533, "learning_rate": 7.986307336038581e-06, "loss": 0.0772, "step": 159075 }, { "epoch": 2.2815923822635233, "grad_norm": 5.257071495056152, "learning_rate": 7.982323822473389e-06, "loss": 0.1263, "step": 159100 }, { "epoch": 2.2819508977227096, "grad_norm": 4.3694539070129395, "learning_rate": 7.978340308908198e-06, "loss": 0.0555, "step": 159125 }, { "epoch": 2.2823094131818964, "grad_norm": 1.420160174369812, "learning_rate": 7.974356795343006e-06, "loss": 0.0661, "step": 159150 }, { "epoch": 2.282667928641083, "grad_norm": 1.7441368103027344, "learning_rate": 7.970373281777816e-06, "loss": 0.1683, "step": 159175 }, { "epoch": 2.2830264441002694, "grad_norm": 0.2762867510318756, "learning_rate": 7.966389768212624e-06, "loss": 0.0517, "step": 159200 }, { "epoch": 2.283384959559456, "grad_norm": 11.764338493347168, "learning_rate": 7.962406254647432e-06, "loss": 0.1235, "step": 159225 }, { "epoch": 2.283743475018643, "grad_norm": 10.605846405029297, "learning_rate": 7.958422741082241e-06, "loss": 0.0618, "step": 159250 }, { "epoch": 2.2841019904778292, "grad_norm": 4.81515645980835, "learning_rate": 7.954439227517049e-06, "loss": 0.0563, "step": 159275 }, { "epoch": 2.284460505937016, "grad_norm": 0.6721713542938232, "learning_rate": 7.950455713951858e-06, "loss": 0.1236, "step": 159300 }, { "epoch": 2.2848190213962027, "grad_norm": 11.07674789428711, "learning_rate": 7.946472200386666e-06, "loss": 0.0894, "step": 159325 }, { "epoch": 2.285177536855389, "grad_norm": 11.467710494995117, "learning_rate": 7.942488686821476e-06, "loss": 0.1224, "step": 159350 }, { "epoch": 2.285536052314576, "grad_norm": 1.1824603080749512, "learning_rate": 7.938505173256284e-06, "loss": 0.1017, "step": 159375 }, { "epoch": 2.2858945677737625, "grad_norm": 10.179330825805664, "learning_rate": 7.934521659691092e-06, "loss": 0.1072, "step": 159400 }, { "epoch": 2.286253083232949, "grad_norm": 3.368623733520508, "learning_rate": 7.930538146125901e-06, "loss": 0.099, "step": 159425 }, { "epoch": 2.2866115986921356, "grad_norm": 0.7798633575439453, "learning_rate": 7.926554632560709e-06, "loss": 0.0993, "step": 159450 }, { "epoch": 2.2869701141513223, "grad_norm": 6.803792953491211, "learning_rate": 7.922571118995518e-06, "loss": 0.0737, "step": 159475 }, { "epoch": 2.2873286296105086, "grad_norm": 0.5127304196357727, "learning_rate": 7.918587605430326e-06, "loss": 0.0635, "step": 159500 }, { "epoch": 2.2876871450696954, "grad_norm": 1.6858198642730713, "learning_rate": 7.914604091865134e-06, "loss": 0.0821, "step": 159525 }, { "epoch": 2.288045660528882, "grad_norm": 1.3126041889190674, "learning_rate": 7.910620578299944e-06, "loss": 0.0798, "step": 159550 }, { "epoch": 2.2884041759880684, "grad_norm": 0.24083930253982544, "learning_rate": 7.906637064734752e-06, "loss": 0.0574, "step": 159575 }, { "epoch": 2.288762691447255, "grad_norm": 2.4888007640838623, "learning_rate": 7.90265355116956e-06, "loss": 0.0279, "step": 159600 }, { "epoch": 2.289121206906442, "grad_norm": 1.1920377016067505, "learning_rate": 7.898670037604367e-06, "loss": 0.1296, "step": 159625 }, { "epoch": 2.2894797223656282, "grad_norm": 0.9014298915863037, "learning_rate": 7.894686524039177e-06, "loss": 0.0556, "step": 159650 }, { "epoch": 2.289838237824815, "grad_norm": 0.6366447806358337, "learning_rate": 7.890703010473985e-06, "loss": 0.0837, "step": 159675 }, { "epoch": 2.2901967532840017, "grad_norm": 1.9424339532852173, "learning_rate": 7.886719496908792e-06, "loss": 0.128, "step": 159700 }, { "epoch": 2.290555268743188, "grad_norm": 0.9471772313117981, "learning_rate": 7.882735983343602e-06, "loss": 0.0533, "step": 159725 }, { "epoch": 2.290913784202375, "grad_norm": 1.137516975402832, "learning_rate": 7.87875246977841e-06, "loss": 0.0702, "step": 159750 }, { "epoch": 2.2912722996615615, "grad_norm": 1.043655276298523, "learning_rate": 7.87476895621322e-06, "loss": 0.0621, "step": 159775 }, { "epoch": 2.291630815120748, "grad_norm": 13.365633964538574, "learning_rate": 7.870785442648027e-06, "loss": 0.0949, "step": 159800 }, { "epoch": 2.2919893305799346, "grad_norm": 8.293864250183105, "learning_rate": 7.866801929082835e-06, "loss": 0.1224, "step": 159825 }, { "epoch": 2.2923478460391213, "grad_norm": 16.045621871948242, "learning_rate": 7.862818415517645e-06, "loss": 0.1167, "step": 159850 }, { "epoch": 2.2927063614983076, "grad_norm": 0.12620848417282104, "learning_rate": 7.858834901952452e-06, "loss": 0.0579, "step": 159875 }, { "epoch": 2.2930648769574944, "grad_norm": 1.635361909866333, "learning_rate": 7.854851388387262e-06, "loss": 0.0873, "step": 159900 }, { "epoch": 2.293423392416681, "grad_norm": 1.915966272354126, "learning_rate": 7.85086787482207e-06, "loss": 0.0504, "step": 159925 }, { "epoch": 2.2937819078758674, "grad_norm": 0.05103413388133049, "learning_rate": 7.84688436125688e-06, "loss": 0.0716, "step": 159950 }, { "epoch": 2.294140423335054, "grad_norm": 1.294585108757019, "learning_rate": 7.842900847691687e-06, "loss": 0.0651, "step": 159975 }, { "epoch": 2.294498938794241, "grad_norm": 9.951254844665527, "learning_rate": 7.838917334126495e-06, "loss": 0.0562, "step": 160000 }, { "epoch": 2.2948574542534272, "grad_norm": 20.118818283081055, "learning_rate": 7.834933820561305e-06, "loss": 0.0788, "step": 160025 }, { "epoch": 2.295215969712614, "grad_norm": 1.0264402627944946, "learning_rate": 7.830950306996112e-06, "loss": 0.0454, "step": 160050 }, { "epoch": 2.2955744851718007, "grad_norm": 6.776556015014648, "learning_rate": 7.826966793430922e-06, "loss": 0.103, "step": 160075 }, { "epoch": 2.295933000630987, "grad_norm": 0.26780426502227783, "learning_rate": 7.82298327986573e-06, "loss": 0.0947, "step": 160100 }, { "epoch": 2.296291516090174, "grad_norm": 1.6918739080429077, "learning_rate": 7.818999766300538e-06, "loss": 0.133, "step": 160125 }, { "epoch": 2.2966500315493605, "grad_norm": 0.09443540871143341, "learning_rate": 7.815016252735346e-06, "loss": 0.0761, "step": 160150 }, { "epoch": 2.297008547008547, "grad_norm": 0.7418338656425476, "learning_rate": 7.811032739170153e-06, "loss": 0.0442, "step": 160175 }, { "epoch": 2.2973670624677336, "grad_norm": 0.20455971360206604, "learning_rate": 7.807049225604963e-06, "loss": 0.1034, "step": 160200 }, { "epoch": 2.2977255779269203, "grad_norm": 12.630624771118164, "learning_rate": 7.80306571203977e-06, "loss": 0.1072, "step": 160225 }, { "epoch": 2.2980840933861066, "grad_norm": 0.5278530716896057, "learning_rate": 7.79908219847458e-06, "loss": 0.0523, "step": 160250 }, { "epoch": 2.2984426088452934, "grad_norm": 9.69958782196045, "learning_rate": 7.795098684909388e-06, "loss": 0.2268, "step": 160275 }, { "epoch": 2.29880112430448, "grad_norm": 0.1520671844482422, "learning_rate": 7.791115171344196e-06, "loss": 0.0777, "step": 160300 }, { "epoch": 2.2991596397636664, "grad_norm": 4.166306018829346, "learning_rate": 7.787131657779006e-06, "loss": 0.0596, "step": 160325 }, { "epoch": 2.299518155222853, "grad_norm": 1.14802885055542, "learning_rate": 7.783148144213813e-06, "loss": 0.0624, "step": 160350 }, { "epoch": 2.29987667068204, "grad_norm": 9.049272537231445, "learning_rate": 7.779164630648623e-06, "loss": 0.0844, "step": 160375 }, { "epoch": 2.3002351861412262, "grad_norm": 0.34499484300613403, "learning_rate": 7.77518111708343e-06, "loss": 0.1272, "step": 160400 }, { "epoch": 2.300593701600413, "grad_norm": 0.25171029567718506, "learning_rate": 7.771197603518239e-06, "loss": 0.0775, "step": 160425 }, { "epoch": 2.3009522170595997, "grad_norm": 0.5437056422233582, "learning_rate": 7.767214089953048e-06, "loss": 0.0834, "step": 160450 }, { "epoch": 2.301310732518786, "grad_norm": 5.907482624053955, "learning_rate": 7.763230576387856e-06, "loss": 0.0843, "step": 160475 }, { "epoch": 2.301669247977973, "grad_norm": 0.336079478263855, "learning_rate": 7.759247062822666e-06, "loss": 0.0759, "step": 160500 }, { "epoch": 2.3020277634371595, "grad_norm": 1.5111596584320068, "learning_rate": 7.755263549257473e-06, "loss": 0.0417, "step": 160525 }, { "epoch": 2.302386278896346, "grad_norm": 7.452073097229004, "learning_rate": 7.751280035692283e-06, "loss": 0.1186, "step": 160550 }, { "epoch": 2.3027447943555326, "grad_norm": 0.28201887011528015, "learning_rate": 7.74729652212709e-06, "loss": 0.0838, "step": 160575 }, { "epoch": 2.3031033098147193, "grad_norm": 0.2088652104139328, "learning_rate": 7.743313008561899e-06, "loss": 0.1282, "step": 160600 }, { "epoch": 2.3034618252739056, "grad_norm": 1.27830171585083, "learning_rate": 7.739329494996708e-06, "loss": 0.0939, "step": 160625 }, { "epoch": 2.3038203407330924, "grad_norm": 2.894735097885132, "learning_rate": 7.735345981431516e-06, "loss": 0.1099, "step": 160650 }, { "epoch": 2.304178856192279, "grad_norm": 0.705580472946167, "learning_rate": 7.731362467866326e-06, "loss": 0.1329, "step": 160675 }, { "epoch": 2.3045373716514654, "grad_norm": 5.135140419006348, "learning_rate": 7.727378954301132e-06, "loss": 0.0939, "step": 160700 }, { "epoch": 2.304895887110652, "grad_norm": 0.1761498600244522, "learning_rate": 7.72339544073594e-06, "loss": 0.0915, "step": 160725 }, { "epoch": 2.305254402569839, "grad_norm": 0.2387409210205078, "learning_rate": 7.719411927170749e-06, "loss": 0.1028, "step": 160750 }, { "epoch": 2.3056129180290252, "grad_norm": 22.548429489135742, "learning_rate": 7.715428413605557e-06, "loss": 0.0692, "step": 160775 }, { "epoch": 2.305971433488212, "grad_norm": 2.2116355895996094, "learning_rate": 7.711444900040366e-06, "loss": 0.0626, "step": 160800 }, { "epoch": 2.3063299489473987, "grad_norm": 7.194084167480469, "learning_rate": 7.707461386475174e-06, "loss": 0.0778, "step": 160825 }, { "epoch": 2.306688464406585, "grad_norm": 5.106793403625488, "learning_rate": 7.703477872909984e-06, "loss": 0.0357, "step": 160850 }, { "epoch": 2.307046979865772, "grad_norm": 18.79507827758789, "learning_rate": 7.699494359344792e-06, "loss": 0.1339, "step": 160875 }, { "epoch": 2.3074054953249585, "grad_norm": 2.7974050045013428, "learning_rate": 7.6955108457796e-06, "loss": 0.0737, "step": 160900 }, { "epoch": 2.307764010784145, "grad_norm": 10.383398056030273, "learning_rate": 7.691527332214409e-06, "loss": 0.0757, "step": 160925 }, { "epoch": 2.3081225262433316, "grad_norm": 20.252092361450195, "learning_rate": 7.687543818649217e-06, "loss": 0.1224, "step": 160950 }, { "epoch": 2.3084810417025183, "grad_norm": 0.8043810725212097, "learning_rate": 7.683560305084026e-06, "loss": 0.0484, "step": 160975 }, { "epoch": 2.3088395571617046, "grad_norm": 0.8910489678382874, "learning_rate": 7.679576791518834e-06, "loss": 0.0494, "step": 161000 }, { "epoch": 2.3091980726208914, "grad_norm": 0.5324689745903015, "learning_rate": 7.675593277953642e-06, "loss": 0.0841, "step": 161025 }, { "epoch": 2.309556588080078, "grad_norm": 0.08194714784622192, "learning_rate": 7.671609764388452e-06, "loss": 0.1187, "step": 161050 }, { "epoch": 2.3099151035392644, "grad_norm": 2.0679969787597656, "learning_rate": 7.66762625082326e-06, "loss": 0.0772, "step": 161075 }, { "epoch": 2.310273618998451, "grad_norm": 0.31085294485092163, "learning_rate": 7.663642737258069e-06, "loss": 0.117, "step": 161100 }, { "epoch": 2.310632134457638, "grad_norm": 5.712726593017578, "learning_rate": 7.659659223692877e-06, "loss": 0.1024, "step": 161125 }, { "epoch": 2.3109906499168242, "grad_norm": 0.12843307852745056, "learning_rate": 7.655675710127686e-06, "loss": 0.1212, "step": 161150 }, { "epoch": 2.311349165376011, "grad_norm": 2.950866222381592, "learning_rate": 7.651692196562494e-06, "loss": 0.0878, "step": 161175 }, { "epoch": 2.3117076808351977, "grad_norm": 6.789420127868652, "learning_rate": 7.647708682997302e-06, "loss": 0.052, "step": 161200 }, { "epoch": 2.312066196294384, "grad_norm": 0.5804111957550049, "learning_rate": 7.643725169432112e-06, "loss": 0.047, "step": 161225 }, { "epoch": 2.312424711753571, "grad_norm": 21.341529846191406, "learning_rate": 7.63974165586692e-06, "loss": 0.1272, "step": 161250 }, { "epoch": 2.3127832272127575, "grad_norm": 3.892828941345215, "learning_rate": 7.635758142301727e-06, "loss": 0.0346, "step": 161275 }, { "epoch": 2.313141742671944, "grad_norm": 2.4034197330474854, "learning_rate": 7.631774628736535e-06, "loss": 0.054, "step": 161300 }, { "epoch": 2.3135002581311306, "grad_norm": 0.17791873216629028, "learning_rate": 7.627791115171344e-06, "loss": 0.1011, "step": 161325 }, { "epoch": 2.3138587735903173, "grad_norm": 2.533936023712158, "learning_rate": 7.623807601606153e-06, "loss": 0.0542, "step": 161350 }, { "epoch": 2.3142172890495036, "grad_norm": 1.419181227684021, "learning_rate": 7.619824088040961e-06, "loss": 0.0218, "step": 161375 }, { "epoch": 2.3145758045086904, "grad_norm": 5.918854236602783, "learning_rate": 7.61584057447577e-06, "loss": 0.0701, "step": 161400 }, { "epoch": 2.314934319967877, "grad_norm": 1.8342528343200684, "learning_rate": 7.611857060910578e-06, "loss": 0.1238, "step": 161425 }, { "epoch": 2.3152928354270634, "grad_norm": 18.970836639404297, "learning_rate": 7.607873547345387e-06, "loss": 0.0813, "step": 161450 }, { "epoch": 2.31565135088625, "grad_norm": 0.6262649893760681, "learning_rate": 7.603890033780195e-06, "loss": 0.1002, "step": 161475 }, { "epoch": 2.316009866345437, "grad_norm": 1.197967290878296, "learning_rate": 7.599906520215003e-06, "loss": 0.067, "step": 161500 }, { "epoch": 2.3163683818046232, "grad_norm": 0.7084975838661194, "learning_rate": 7.5959230066498126e-06, "loss": 0.0949, "step": 161525 }, { "epoch": 2.31672689726381, "grad_norm": 7.262475490570068, "learning_rate": 7.59193949308462e-06, "loss": 0.112, "step": 161550 }, { "epoch": 2.3170854127229967, "grad_norm": 8.324201583862305, "learning_rate": 7.58795597951943e-06, "loss": 0.1358, "step": 161575 }, { "epoch": 2.317443928182183, "grad_norm": 2.0388901233673096, "learning_rate": 7.583972465954238e-06, "loss": 0.0472, "step": 161600 }, { "epoch": 2.31780244364137, "grad_norm": 0.9980286955833435, "learning_rate": 7.579988952389046e-06, "loss": 0.0833, "step": 161625 }, { "epoch": 2.3181609591005565, "grad_norm": 14.912498474121094, "learning_rate": 7.576005438823855e-06, "loss": 0.1225, "step": 161650 }, { "epoch": 2.318519474559743, "grad_norm": 0.08011890202760696, "learning_rate": 7.572021925258662e-06, "loss": 0.0272, "step": 161675 }, { "epoch": 2.3188779900189296, "grad_norm": 4.85637903213501, "learning_rate": 7.568038411693472e-06, "loss": 0.086, "step": 161700 }, { "epoch": 2.3192365054781163, "grad_norm": 17.245386123657227, "learning_rate": 7.5640548981282795e-06, "loss": 0.0578, "step": 161725 }, { "epoch": 2.3195950209373026, "grad_norm": 0.5244060754776001, "learning_rate": 7.560071384563089e-06, "loss": 0.105, "step": 161750 }, { "epoch": 2.3199535363964894, "grad_norm": 0.7400553822517395, "learning_rate": 7.556087870997897e-06, "loss": 0.0864, "step": 161775 }, { "epoch": 2.320312051855676, "grad_norm": 13.496898651123047, "learning_rate": 7.552104357432705e-06, "loss": 0.1504, "step": 161800 }, { "epoch": 2.3206705673148624, "grad_norm": 1.6604810953140259, "learning_rate": 7.548120843867514e-06, "loss": 0.0934, "step": 161825 }, { "epoch": 2.321029082774049, "grad_norm": 3.907785415649414, "learning_rate": 7.544137330302322e-06, "loss": 0.0428, "step": 161850 }, { "epoch": 2.321387598233236, "grad_norm": 0.5595129132270813, "learning_rate": 7.540153816737132e-06, "loss": 0.1113, "step": 161875 }, { "epoch": 2.3217461136924222, "grad_norm": 0.12819838523864746, "learning_rate": 7.5361703031719395e-06, "loss": 0.0578, "step": 161900 }, { "epoch": 2.322104629151609, "grad_norm": 10.80219841003418, "learning_rate": 7.532186789606747e-06, "loss": 0.1006, "step": 161925 }, { "epoch": 2.3224631446107957, "grad_norm": 6.993413925170898, "learning_rate": 7.528203276041556e-06, "loss": 0.0884, "step": 161950 }, { "epoch": 2.322821660069982, "grad_norm": 2.7941792011260986, "learning_rate": 7.524219762476364e-06, "loss": 0.0681, "step": 161975 }, { "epoch": 2.323180175529169, "grad_norm": 6.821771621704102, "learning_rate": 7.5202362489111735e-06, "loss": 0.0884, "step": 162000 }, { "epoch": 2.3235386909883555, "grad_norm": 1.0747929811477661, "learning_rate": 7.516252735345981e-06, "loss": 0.0622, "step": 162025 }, { "epoch": 2.323897206447542, "grad_norm": 1.0582894086837769, "learning_rate": 7.512269221780791e-06, "loss": 0.0419, "step": 162050 }, { "epoch": 2.3242557219067286, "grad_norm": 0.067754827439785, "learning_rate": 7.508285708215599e-06, "loss": 0.1052, "step": 162075 }, { "epoch": 2.3246142373659153, "grad_norm": 0.07870374619960785, "learning_rate": 7.5043021946504065e-06, "loss": 0.0662, "step": 162100 }, { "epoch": 2.3249727528251016, "grad_norm": 5.665012836456299, "learning_rate": 7.500318681085216e-06, "loss": 0.0715, "step": 162125 }, { "epoch": 2.3253312682842884, "grad_norm": 10.117505073547363, "learning_rate": 7.496335167520024e-06, "loss": 0.0709, "step": 162150 }, { "epoch": 2.325689783743475, "grad_norm": 23.800649642944336, "learning_rate": 7.492351653954833e-06, "loss": 0.1276, "step": 162175 }, { "epoch": 2.3260482992026614, "grad_norm": 7.9326372146606445, "learning_rate": 7.488368140389641e-06, "loss": 0.0601, "step": 162200 }, { "epoch": 2.326406814661848, "grad_norm": 3.6776533126831055, "learning_rate": 7.484384626824449e-06, "loss": 0.0983, "step": 162225 }, { "epoch": 2.326765330121035, "grad_norm": 5.6540608406066895, "learning_rate": 7.480401113259258e-06, "loss": 0.0659, "step": 162250 }, { "epoch": 2.3271238455802212, "grad_norm": 11.112275123596191, "learning_rate": 7.4764175996940665e-06, "loss": 0.1128, "step": 162275 }, { "epoch": 2.327482361039408, "grad_norm": 13.562837600708008, "learning_rate": 7.472434086128874e-06, "loss": 0.1201, "step": 162300 }, { "epoch": 2.3278408764985947, "grad_norm": 2.7531590461730957, "learning_rate": 7.468450572563683e-06, "loss": 0.0733, "step": 162325 }, { "epoch": 2.328199391957781, "grad_norm": 0.12168723344802856, "learning_rate": 7.464467058998492e-06, "loss": 0.047, "step": 162350 }, { "epoch": 2.3285579074169678, "grad_norm": 16.505828857421875, "learning_rate": 7.4604835454333004e-06, "loss": 0.0874, "step": 162375 }, { "epoch": 2.3289164228761545, "grad_norm": 3.116530179977417, "learning_rate": 7.456500031868109e-06, "loss": 0.0357, "step": 162400 }, { "epoch": 2.329274938335341, "grad_norm": 0.09544973820447922, "learning_rate": 7.452516518302918e-06, "loss": 0.1157, "step": 162425 }, { "epoch": 2.3296334537945276, "grad_norm": 9.110551834106445, "learning_rate": 7.448533004737726e-06, "loss": 0.1231, "step": 162450 }, { "epoch": 2.3299919692537143, "grad_norm": 15.247376441955566, "learning_rate": 7.444549491172534e-06, "loss": 0.1415, "step": 162475 }, { "epoch": 2.3303504847129006, "grad_norm": 4.700174808502197, "learning_rate": 7.440565977607342e-06, "loss": 0.0696, "step": 162500 }, { "epoch": 2.3307090001720874, "grad_norm": 6.108524799346924, "learning_rate": 7.436582464042151e-06, "loss": 0.0662, "step": 162525 }, { "epoch": 2.331067515631274, "grad_norm": 0.07661247998476028, "learning_rate": 7.43259895047696e-06, "loss": 0.08, "step": 162550 }, { "epoch": 2.3314260310904604, "grad_norm": 11.471022605895996, "learning_rate": 7.428615436911768e-06, "loss": 0.0638, "step": 162575 }, { "epoch": 2.331784546549647, "grad_norm": 0.17231084406375885, "learning_rate": 7.424631923346576e-06, "loss": 0.0751, "step": 162600 }, { "epoch": 2.332143062008834, "grad_norm": 2.230090856552124, "learning_rate": 7.420648409781385e-06, "loss": 0.0414, "step": 162625 }, { "epoch": 2.3325015774680207, "grad_norm": 0.06516376882791519, "learning_rate": 7.4166648962161935e-06, "loss": 0.0626, "step": 162650 }, { "epoch": 2.332860092927207, "grad_norm": 5.934372425079346, "learning_rate": 7.412681382651002e-06, "loss": 0.0333, "step": 162675 }, { "epoch": 2.3332186083863937, "grad_norm": 0.17009784281253815, "learning_rate": 7.408697869085811e-06, "loss": 0.1212, "step": 162700 }, { "epoch": 2.33357712384558, "grad_norm": 22.77688980102539, "learning_rate": 7.40471435552062e-06, "loss": 0.0972, "step": 162725 }, { "epoch": 2.3339356393047668, "grad_norm": 0.07122737169265747, "learning_rate": 7.4007308419554274e-06, "loss": 0.0877, "step": 162750 }, { "epoch": 2.3342941547639535, "grad_norm": 2.5960640907287598, "learning_rate": 7.396747328390235e-06, "loss": 0.1127, "step": 162775 }, { "epoch": 2.3346526702231403, "grad_norm": 7.485243320465088, "learning_rate": 7.392763814825044e-06, "loss": 0.0888, "step": 162800 }, { "epoch": 2.3350111856823266, "grad_norm": 4.5382232666015625, "learning_rate": 7.388780301259853e-06, "loss": 0.1081, "step": 162825 }, { "epoch": 2.3353697011415133, "grad_norm": 1.0727598667144775, "learning_rate": 7.384796787694661e-06, "loss": 0.075, "step": 162850 }, { "epoch": 2.3357282166006996, "grad_norm": 0.39522358775138855, "learning_rate": 7.38081327412947e-06, "loss": 0.0925, "step": 162875 }, { "epoch": 2.3360867320598864, "grad_norm": 1.8878803253173828, "learning_rate": 7.376829760564278e-06, "loss": 0.072, "step": 162900 }, { "epoch": 2.336445247519073, "grad_norm": 0.39035966992378235, "learning_rate": 7.372846246999087e-06, "loss": 0.1223, "step": 162925 }, { "epoch": 2.33680376297826, "grad_norm": 12.369539260864258, "learning_rate": 7.368862733433895e-06, "loss": 0.0888, "step": 162950 }, { "epoch": 2.337162278437446, "grad_norm": 11.67148494720459, "learning_rate": 7.364879219868704e-06, "loss": 0.1428, "step": 162975 }, { "epoch": 2.337520793896633, "grad_norm": 0.2779102623462677, "learning_rate": 7.360895706303513e-06, "loss": 0.0662, "step": 163000 }, { "epoch": 2.3378793093558192, "grad_norm": 1.2156939506530762, "learning_rate": 7.3569121927383205e-06, "loss": 0.0509, "step": 163025 }, { "epoch": 2.338237824815006, "grad_norm": 0.7318113446235657, "learning_rate": 7.352928679173128e-06, "loss": 0.103, "step": 163050 }, { "epoch": 2.3385963402741927, "grad_norm": 5.755573749542236, "learning_rate": 7.348945165607937e-06, "loss": 0.0604, "step": 163075 }, { "epoch": 2.3389548557333795, "grad_norm": 14.361842155456543, "learning_rate": 7.344961652042746e-06, "loss": 0.1836, "step": 163100 }, { "epoch": 2.3393133711925658, "grad_norm": 8.666499137878418, "learning_rate": 7.340978138477554e-06, "loss": 0.1727, "step": 163125 }, { "epoch": 2.3396718866517525, "grad_norm": 2.0761256217956543, "learning_rate": 7.336994624912363e-06, "loss": 0.0969, "step": 163150 }, { "epoch": 2.340030402110939, "grad_norm": 1.906221628189087, "learning_rate": 7.333011111347171e-06, "loss": 0.1106, "step": 163175 }, { "epoch": 2.3403889175701256, "grad_norm": 23.701824188232422, "learning_rate": 7.32902759778198e-06, "loss": 0.204, "step": 163200 }, { "epoch": 2.3407474330293123, "grad_norm": 0.05745486915111542, "learning_rate": 7.325044084216788e-06, "loss": 0.1494, "step": 163225 }, { "epoch": 2.341105948488499, "grad_norm": 0.0704743042588234, "learning_rate": 7.321060570651597e-06, "loss": 0.0899, "step": 163250 }, { "epoch": 2.3414644639476854, "grad_norm": 4.104916572570801, "learning_rate": 7.317077057086406e-06, "loss": 0.0653, "step": 163275 }, { "epoch": 2.341822979406872, "grad_norm": 0.055449966341257095, "learning_rate": 7.313093543521214e-06, "loss": 0.1071, "step": 163300 }, { "epoch": 2.3421814948660584, "grad_norm": 7.984672546386719, "learning_rate": 7.309110029956021e-06, "loss": 0.1052, "step": 163325 }, { "epoch": 2.342540010325245, "grad_norm": 0.19265998899936676, "learning_rate": 7.30512651639083e-06, "loss": 0.0454, "step": 163350 }, { "epoch": 2.342898525784432, "grad_norm": 6.259541034698486, "learning_rate": 7.301143002825639e-06, "loss": 0.1327, "step": 163375 }, { "epoch": 2.3432570412436187, "grad_norm": 1.161564588546753, "learning_rate": 7.2971594892604475e-06, "loss": 0.081, "step": 163400 }, { "epoch": 2.343615556702805, "grad_norm": 0.2336627095937729, "learning_rate": 7.293175975695256e-06, "loss": 0.0605, "step": 163425 }, { "epoch": 2.3439740721619917, "grad_norm": 1.3637337684631348, "learning_rate": 7.289192462130065e-06, "loss": 0.0505, "step": 163450 }, { "epoch": 2.344332587621178, "grad_norm": 0.8654400110244751, "learning_rate": 7.285208948564873e-06, "loss": 0.0825, "step": 163475 }, { "epoch": 2.3446911030803648, "grad_norm": 16.238117218017578, "learning_rate": 7.281225434999681e-06, "loss": 0.1607, "step": 163500 }, { "epoch": 2.3450496185395515, "grad_norm": 8.23314380645752, "learning_rate": 7.27724192143449e-06, "loss": 0.0933, "step": 163525 }, { "epoch": 2.3454081339987383, "grad_norm": 8.754154205322266, "learning_rate": 7.273258407869299e-06, "loss": 0.1257, "step": 163550 }, { "epoch": 2.3457666494579246, "grad_norm": 0.1738220602273941, "learning_rate": 7.2692748943041075e-06, "loss": 0.0516, "step": 163575 }, { "epoch": 2.3461251649171113, "grad_norm": 0.16652309894561768, "learning_rate": 7.265291380738915e-06, "loss": 0.096, "step": 163600 }, { "epoch": 2.3464836803762976, "grad_norm": 2.1643316745758057, "learning_rate": 7.261307867173723e-06, "loss": 0.1021, "step": 163625 }, { "epoch": 2.3468421958354844, "grad_norm": 15.253584861755371, "learning_rate": 7.257324353608532e-06, "loss": 0.0648, "step": 163650 }, { "epoch": 2.347200711294671, "grad_norm": 0.5927780270576477, "learning_rate": 7.2533408400433406e-06, "loss": 0.1142, "step": 163675 }, { "epoch": 2.347559226753858, "grad_norm": 5.249833106994629, "learning_rate": 7.249357326478149e-06, "loss": 0.0553, "step": 163700 }, { "epoch": 2.347917742213044, "grad_norm": 8.69854736328125, "learning_rate": 7.245373812912958e-06, "loss": 0.1217, "step": 163725 }, { "epoch": 2.348276257672231, "grad_norm": 1.4761245250701904, "learning_rate": 7.241390299347767e-06, "loss": 0.1269, "step": 163750 }, { "epoch": 2.348634773131417, "grad_norm": 0.9233182072639465, "learning_rate": 7.2374067857825745e-06, "loss": 0.1836, "step": 163775 }, { "epoch": 2.348993288590604, "grad_norm": 15.162246704101562, "learning_rate": 7.233423272217383e-06, "loss": 0.1056, "step": 163800 }, { "epoch": 2.3493518040497907, "grad_norm": 1.787208080291748, "learning_rate": 7.229439758652192e-06, "loss": 0.1068, "step": 163825 }, { "epoch": 2.3497103195089775, "grad_norm": 0.057767391204833984, "learning_rate": 7.2254562450870005e-06, "loss": 0.1871, "step": 163850 }, { "epoch": 2.3500688349681638, "grad_norm": 0.1193016991019249, "learning_rate": 7.221472731521808e-06, "loss": 0.0462, "step": 163875 }, { "epoch": 2.3504273504273505, "grad_norm": 14.14266300201416, "learning_rate": 7.217489217956617e-06, "loss": 0.0573, "step": 163900 }, { "epoch": 2.350785865886537, "grad_norm": 2.649420976638794, "learning_rate": 7.213505704391425e-06, "loss": 0.0488, "step": 163925 }, { "epoch": 2.3511443813457236, "grad_norm": 9.425215721130371, "learning_rate": 7.209522190826234e-06, "loss": 0.0989, "step": 163950 }, { "epoch": 2.3515028968049103, "grad_norm": 2.0173144340515137, "learning_rate": 7.205538677261042e-06, "loss": 0.0987, "step": 163975 }, { "epoch": 2.351861412264097, "grad_norm": 7.426908016204834, "learning_rate": 7.201555163695851e-06, "loss": 0.0802, "step": 164000 }, { "epoch": 2.3522199277232834, "grad_norm": 1.0811495780944824, "learning_rate": 7.19757165013066e-06, "loss": 0.0973, "step": 164025 }, { "epoch": 2.35257844318247, "grad_norm": 18.0775089263916, "learning_rate": 7.193588136565468e-06, "loss": 0.1161, "step": 164050 }, { "epoch": 2.3529369586416564, "grad_norm": 0.5030035376548767, "learning_rate": 7.189604623000276e-06, "loss": 0.1147, "step": 164075 }, { "epoch": 2.353295474100843, "grad_norm": 14.822407722473145, "learning_rate": 7.185621109435085e-06, "loss": 0.0793, "step": 164100 }, { "epoch": 2.35365398956003, "grad_norm": 2.839564561843872, "learning_rate": 7.181637595869894e-06, "loss": 0.0567, "step": 164125 }, { "epoch": 2.3540125050192167, "grad_norm": 2.6294972896575928, "learning_rate": 7.1776540823047015e-06, "loss": 0.069, "step": 164150 }, { "epoch": 2.354371020478403, "grad_norm": 21.81634521484375, "learning_rate": 7.17367056873951e-06, "loss": 0.1112, "step": 164175 }, { "epoch": 2.3547295359375897, "grad_norm": 1.5795917510986328, "learning_rate": 7.169687055174319e-06, "loss": 0.0587, "step": 164200 }, { "epoch": 2.355088051396776, "grad_norm": 11.880395889282227, "learning_rate": 7.165703541609127e-06, "loss": 0.0898, "step": 164225 }, { "epoch": 2.3554465668559628, "grad_norm": 0.9157381057739258, "learning_rate": 7.161720028043935e-06, "loss": 0.0411, "step": 164250 }, { "epoch": 2.3558050823151495, "grad_norm": 4.722600936889648, "learning_rate": 7.157736514478744e-06, "loss": 0.0914, "step": 164275 }, { "epoch": 2.3561635977743363, "grad_norm": 21.049942016601562, "learning_rate": 7.153753000913553e-06, "loss": 0.0811, "step": 164300 }, { "epoch": 2.3565221132335226, "grad_norm": 0.12850110232830048, "learning_rate": 7.1497694873483615e-06, "loss": 0.0884, "step": 164325 }, { "epoch": 2.3568806286927093, "grad_norm": 3.5528781414031982, "learning_rate": 7.14578597378317e-06, "loss": 0.101, "step": 164350 }, { "epoch": 2.3572391441518956, "grad_norm": 1.0721145868301392, "learning_rate": 7.141802460217978e-06, "loss": 0.1003, "step": 164375 }, { "epoch": 2.3575976596110824, "grad_norm": 0.18487825989723206, "learning_rate": 7.137818946652787e-06, "loss": 0.0344, "step": 164400 }, { "epoch": 2.357956175070269, "grad_norm": 0.7093356251716614, "learning_rate": 7.1338354330875945e-06, "loss": 0.0953, "step": 164425 }, { "epoch": 2.358314690529456, "grad_norm": 13.27027416229248, "learning_rate": 7.129851919522403e-06, "loss": 0.1184, "step": 164450 }, { "epoch": 2.358673205988642, "grad_norm": 9.908243179321289, "learning_rate": 7.125868405957212e-06, "loss": 0.1211, "step": 164475 }, { "epoch": 2.359031721447829, "grad_norm": 3.78974986076355, "learning_rate": 7.121884892392021e-06, "loss": 0.0829, "step": 164500 }, { "epoch": 2.359390236907015, "grad_norm": 0.7563602924346924, "learning_rate": 7.1179013788268284e-06, "loss": 0.0405, "step": 164525 }, { "epoch": 2.359748752366202, "grad_norm": 2.863903522491455, "learning_rate": 7.113917865261637e-06, "loss": 0.0403, "step": 164550 }, { "epoch": 2.3601072678253887, "grad_norm": 1.1978397369384766, "learning_rate": 7.109934351696446e-06, "loss": 0.0969, "step": 164575 }, { "epoch": 2.3604657832845755, "grad_norm": 1.2360037565231323, "learning_rate": 7.1059508381312545e-06, "loss": 0.0869, "step": 164600 }, { "epoch": 2.3608242987437618, "grad_norm": 2.5697848796844482, "learning_rate": 7.101967324566063e-06, "loss": 0.0931, "step": 164625 }, { "epoch": 2.3611828142029485, "grad_norm": 0.5107747316360474, "learning_rate": 7.097983811000872e-06, "loss": 0.06, "step": 164650 }, { "epoch": 2.361541329662135, "grad_norm": 0.06489526480436325, "learning_rate": 7.09400029743568e-06, "loss": 0.0788, "step": 164675 }, { "epoch": 2.3618998451213216, "grad_norm": 6.608963966369629, "learning_rate": 7.090016783870488e-06, "loss": 0.0861, "step": 164700 }, { "epoch": 2.3622583605805083, "grad_norm": 4.502679347991943, "learning_rate": 7.086033270305296e-06, "loss": 0.098, "step": 164725 }, { "epoch": 2.362616876039695, "grad_norm": 0.5552904605865479, "learning_rate": 7.082049756740105e-06, "loss": 0.0533, "step": 164750 }, { "epoch": 2.3629753914988814, "grad_norm": 11.304854393005371, "learning_rate": 7.078066243174914e-06, "loss": 0.1536, "step": 164775 }, { "epoch": 2.363333906958068, "grad_norm": 7.90816068649292, "learning_rate": 7.074082729609722e-06, "loss": 0.077, "step": 164800 }, { "epoch": 2.3636924224172544, "grad_norm": 0.1576608121395111, "learning_rate": 7.07009921604453e-06, "loss": 0.0582, "step": 164825 }, { "epoch": 2.364050937876441, "grad_norm": 2.6050798892974854, "learning_rate": 7.066115702479339e-06, "loss": 0.0473, "step": 164850 }, { "epoch": 2.364409453335628, "grad_norm": 0.5188595652580261, "learning_rate": 7.062132188914148e-06, "loss": 0.0868, "step": 164875 }, { "epoch": 2.3647679687948147, "grad_norm": 0.05258166044950485, "learning_rate": 7.058148675348956e-06, "loss": 0.0745, "step": 164900 }, { "epoch": 2.365126484254001, "grad_norm": 0.7083768248558044, "learning_rate": 7.054165161783765e-06, "loss": 0.113, "step": 164925 }, { "epoch": 2.3654849997131877, "grad_norm": 0.2966974675655365, "learning_rate": 7.050181648218574e-06, "loss": 0.1116, "step": 164950 }, { "epoch": 2.365843515172374, "grad_norm": 9.0908784866333, "learning_rate": 7.046198134653381e-06, "loss": 0.0536, "step": 164975 }, { "epoch": 2.3662020306315608, "grad_norm": 0.9389157295227051, "learning_rate": 7.042214621088189e-06, "loss": 0.0672, "step": 165000 }, { "epoch": 2.3665605460907475, "grad_norm": 9.173766136169434, "learning_rate": 7.038231107522998e-06, "loss": 0.1248, "step": 165025 }, { "epoch": 2.3669190615499343, "grad_norm": 0.07834875583648682, "learning_rate": 7.034247593957807e-06, "loss": 0.1082, "step": 165050 }, { "epoch": 2.3672775770091206, "grad_norm": 6.6620659828186035, "learning_rate": 7.0302640803926154e-06, "loss": 0.1535, "step": 165075 }, { "epoch": 2.3676360924683073, "grad_norm": 0.7564018964767456, "learning_rate": 7.026280566827424e-06, "loss": 0.1281, "step": 165100 }, { "epoch": 2.3679946079274936, "grad_norm": 9.157347679138184, "learning_rate": 7.022297053262232e-06, "loss": 0.0638, "step": 165125 }, { "epoch": 2.3683531233866804, "grad_norm": 0.1240403801202774, "learning_rate": 7.018313539697041e-06, "loss": 0.1164, "step": 165150 }, { "epoch": 2.368711638845867, "grad_norm": 2.2101335525512695, "learning_rate": 7.014330026131849e-06, "loss": 0.0812, "step": 165175 }, { "epoch": 2.369070154305054, "grad_norm": 3.791098117828369, "learning_rate": 7.010346512566658e-06, "loss": 0.0703, "step": 165200 }, { "epoch": 2.36942866976424, "grad_norm": 3.99678373336792, "learning_rate": 7.006362999001467e-06, "loss": 0.0944, "step": 165225 }, { "epoch": 2.369787185223427, "grad_norm": 5.289506435394287, "learning_rate": 7.0023794854362746e-06, "loss": 0.1686, "step": 165250 }, { "epoch": 2.370145700682613, "grad_norm": 1.412682294845581, "learning_rate": 6.998395971871082e-06, "loss": 0.1293, "step": 165275 }, { "epoch": 2.3705042161418, "grad_norm": 8.039234161376953, "learning_rate": 6.994412458305891e-06, "loss": 0.083, "step": 165300 }, { "epoch": 2.3708627316009867, "grad_norm": 0.10718075186014175, "learning_rate": 6.9904289447407e-06, "loss": 0.0833, "step": 165325 }, { "epoch": 2.3712212470601735, "grad_norm": 0.31189149618148804, "learning_rate": 6.9864454311755085e-06, "loss": 0.0628, "step": 165350 }, { "epoch": 2.3715797625193598, "grad_norm": 1.9496207237243652, "learning_rate": 6.982461917610317e-06, "loss": 0.0559, "step": 165375 }, { "epoch": 2.3719382779785465, "grad_norm": 5.806958198547363, "learning_rate": 6.978478404045126e-06, "loss": 0.0825, "step": 165400 }, { "epoch": 2.372296793437733, "grad_norm": 0.8280638456344604, "learning_rate": 6.974494890479934e-06, "loss": 0.0772, "step": 165425 }, { "epoch": 2.3726553088969196, "grad_norm": 6.207957744598389, "learning_rate": 6.970511376914742e-06, "loss": 0.074, "step": 165450 }, { "epoch": 2.3730138243561063, "grad_norm": 1.9301265478134155, "learning_rate": 6.966527863349551e-06, "loss": 0.1428, "step": 165475 }, { "epoch": 2.373372339815293, "grad_norm": 4.138332843780518, "learning_rate": 6.96254434978436e-06, "loss": 0.0865, "step": 165500 }, { "epoch": 2.3737308552744794, "grad_norm": 10.7089262008667, "learning_rate": 6.958560836219168e-06, "loss": 0.0678, "step": 165525 }, { "epoch": 2.374089370733666, "grad_norm": 0.42413052916526794, "learning_rate": 6.954577322653976e-06, "loss": 0.0976, "step": 165550 }, { "epoch": 2.3744478861928524, "grad_norm": 12.382938385009766, "learning_rate": 6.950593809088784e-06, "loss": 0.0631, "step": 165575 }, { "epoch": 2.374806401652039, "grad_norm": 0.2983623445034027, "learning_rate": 6.946610295523593e-06, "loss": 0.0753, "step": 165600 }, { "epoch": 2.375164917111226, "grad_norm": 2.1149041652679443, "learning_rate": 6.9426267819584016e-06, "loss": 0.1396, "step": 165625 }, { "epoch": 2.3755234325704127, "grad_norm": 18.160682678222656, "learning_rate": 6.93864326839321e-06, "loss": 0.1381, "step": 165650 }, { "epoch": 2.375881948029599, "grad_norm": 5.055498123168945, "learning_rate": 6.934659754828019e-06, "loss": 0.0482, "step": 165675 }, { "epoch": 2.3762404634887857, "grad_norm": 9.413471221923828, "learning_rate": 6.930676241262828e-06, "loss": 0.0957, "step": 165700 }, { "epoch": 2.376598978947972, "grad_norm": 0.7977373600006104, "learning_rate": 6.9266927276976355e-06, "loss": 0.1196, "step": 165725 }, { "epoch": 2.3769574944071588, "grad_norm": 4.762379169464111, "learning_rate": 6.922709214132444e-06, "loss": 0.0671, "step": 165750 }, { "epoch": 2.3773160098663455, "grad_norm": 7.904779434204102, "learning_rate": 6.918725700567253e-06, "loss": 0.162, "step": 165775 }, { "epoch": 2.3776745253255323, "grad_norm": 14.309896469116211, "learning_rate": 6.914742187002061e-06, "loss": 0.0941, "step": 165800 }, { "epoch": 2.3780330407847186, "grad_norm": 12.779691696166992, "learning_rate": 6.910758673436869e-06, "loss": 0.1249, "step": 165825 }, { "epoch": 2.3783915562439053, "grad_norm": 1.2702128887176514, "learning_rate": 6.906775159871678e-06, "loss": 0.066, "step": 165850 }, { "epoch": 2.3787500717030916, "grad_norm": 3.2801852226257324, "learning_rate": 6.902791646306486e-06, "loss": 0.0709, "step": 165875 }, { "epoch": 2.3791085871622784, "grad_norm": 9.663558959960938, "learning_rate": 6.898808132741295e-06, "loss": 0.1186, "step": 165900 }, { "epoch": 2.379467102621465, "grad_norm": 0.8297333717346191, "learning_rate": 6.894824619176103e-06, "loss": 0.0377, "step": 165925 }, { "epoch": 2.379825618080652, "grad_norm": 0.9276072978973389, "learning_rate": 6.890841105610912e-06, "loss": 0.0999, "step": 165950 }, { "epoch": 2.380184133539838, "grad_norm": 5.653440475463867, "learning_rate": 6.886857592045721e-06, "loss": 0.0695, "step": 165975 }, { "epoch": 2.380542648999025, "grad_norm": 7.751429557800293, "learning_rate": 6.882874078480529e-06, "loss": 0.0701, "step": 166000 }, { "epoch": 2.380901164458211, "grad_norm": 2.448800802230835, "learning_rate": 6.878890564915337e-06, "loss": 0.09, "step": 166025 }, { "epoch": 2.381259679917398, "grad_norm": 1.6243726015090942, "learning_rate": 6.874907051350146e-06, "loss": 0.2018, "step": 166050 }, { "epoch": 2.3816181953765847, "grad_norm": 1.2664272785186768, "learning_rate": 6.870923537784954e-06, "loss": 0.1034, "step": 166075 }, { "epoch": 2.3819767108357714, "grad_norm": 0.15734964609146118, "learning_rate": 6.8669400242197625e-06, "loss": 0.0628, "step": 166100 }, { "epoch": 2.3823352262949578, "grad_norm": 3.3937723636627197, "learning_rate": 6.862956510654571e-06, "loss": 0.1588, "step": 166125 }, { "epoch": 2.3826937417541445, "grad_norm": 0.3300657272338867, "learning_rate": 6.85897299708938e-06, "loss": 0.0711, "step": 166150 }, { "epoch": 2.383052257213331, "grad_norm": 0.5403594374656677, "learning_rate": 6.854989483524188e-06, "loss": 0.0887, "step": 166175 }, { "epoch": 2.3834107726725176, "grad_norm": 0.3547617495059967, "learning_rate": 6.851005969958996e-06, "loss": 0.1189, "step": 166200 }, { "epoch": 2.3837692881317043, "grad_norm": 16.851150512695312, "learning_rate": 6.847022456393805e-06, "loss": 0.0585, "step": 166225 }, { "epoch": 2.384127803590891, "grad_norm": 0.9602195024490356, "learning_rate": 6.843038942828614e-06, "loss": 0.0783, "step": 166250 }, { "epoch": 2.3844863190500774, "grad_norm": 3.287614345550537, "learning_rate": 6.8390554292634225e-06, "loss": 0.1154, "step": 166275 }, { "epoch": 2.384844834509264, "grad_norm": 3.6124777793884277, "learning_rate": 6.835071915698231e-06, "loss": 0.0733, "step": 166300 }, { "epoch": 2.3852033499684504, "grad_norm": 9.653210639953613, "learning_rate": 6.831088402133039e-06, "loss": 0.0845, "step": 166325 }, { "epoch": 2.385561865427637, "grad_norm": 0.6462048888206482, "learning_rate": 6.827104888567847e-06, "loss": 0.0685, "step": 166350 }, { "epoch": 2.385920380886824, "grad_norm": 12.580662727355957, "learning_rate": 6.8231213750026555e-06, "loss": 0.0793, "step": 166375 }, { "epoch": 2.3862788963460106, "grad_norm": 0.7261788845062256, "learning_rate": 6.819137861437464e-06, "loss": 0.1206, "step": 166400 }, { "epoch": 2.386637411805197, "grad_norm": 10.544707298278809, "learning_rate": 6.815154347872273e-06, "loss": 0.092, "step": 166425 }, { "epoch": 2.3869959272643837, "grad_norm": 7.477362155914307, "learning_rate": 6.811170834307082e-06, "loss": 0.1185, "step": 166450 }, { "epoch": 2.38735444272357, "grad_norm": 16.05775260925293, "learning_rate": 6.8071873207418895e-06, "loss": 0.0999, "step": 166475 }, { "epoch": 2.3877129581827568, "grad_norm": 0.08538142591714859, "learning_rate": 6.803203807176698e-06, "loss": 0.0622, "step": 166500 }, { "epoch": 2.3880714736419435, "grad_norm": 0.029228240251541138, "learning_rate": 6.799220293611507e-06, "loss": 0.152, "step": 166525 }, { "epoch": 2.3884299891011302, "grad_norm": 17.873090744018555, "learning_rate": 6.7952367800463155e-06, "loss": 0.1159, "step": 166550 }, { "epoch": 2.3887885045603166, "grad_norm": 2.83193302154541, "learning_rate": 6.791253266481124e-06, "loss": 0.0737, "step": 166575 }, { "epoch": 2.3891470200195033, "grad_norm": 5.682803153991699, "learning_rate": 6.787269752915933e-06, "loss": 0.0673, "step": 166600 }, { "epoch": 2.3895055354786896, "grad_norm": 0.6095836758613586, "learning_rate": 6.78328623935074e-06, "loss": 0.1548, "step": 166625 }, { "epoch": 2.3898640509378763, "grad_norm": 0.22325550019741058, "learning_rate": 6.779302725785549e-06, "loss": 0.0554, "step": 166650 }, { "epoch": 2.390222566397063, "grad_norm": 0.01958647184073925, "learning_rate": 6.775319212220357e-06, "loss": 0.0505, "step": 166675 }, { "epoch": 2.39058108185625, "grad_norm": 2.1752028465270996, "learning_rate": 6.771335698655166e-06, "loss": 0.0367, "step": 166700 }, { "epoch": 2.390939597315436, "grad_norm": 7.800622463226318, "learning_rate": 6.767352185089975e-06, "loss": 0.1276, "step": 166725 }, { "epoch": 2.391298112774623, "grad_norm": 9.337169647216797, "learning_rate": 6.763368671524783e-06, "loss": 0.0866, "step": 166750 }, { "epoch": 2.391656628233809, "grad_norm": 0.44002625346183777, "learning_rate": 6.759385157959591e-06, "loss": 0.068, "step": 166775 }, { "epoch": 2.392015143692996, "grad_norm": 0.434235155582428, "learning_rate": 6.7554016443944e-06, "loss": 0.1546, "step": 166800 }, { "epoch": 2.3923736591521827, "grad_norm": 21.26596450805664, "learning_rate": 6.751418130829209e-06, "loss": 0.1045, "step": 166825 }, { "epoch": 2.3927321746113694, "grad_norm": 0.34405186772346497, "learning_rate": 6.747434617264017e-06, "loss": 0.1346, "step": 166850 }, { "epoch": 2.3930906900705557, "grad_norm": 0.3626415729522705, "learning_rate": 6.743451103698826e-06, "loss": 0.1238, "step": 166875 }, { "epoch": 2.3934492055297425, "grad_norm": 12.740095138549805, "learning_rate": 6.739467590133634e-06, "loss": 0.0648, "step": 166900 }, { "epoch": 2.393807720988929, "grad_norm": 0.31313198804855347, "learning_rate": 6.735484076568442e-06, "loss": 0.1089, "step": 166925 }, { "epoch": 2.3941662364481155, "grad_norm": 0.1316666156053543, "learning_rate": 6.73150056300325e-06, "loss": 0.0443, "step": 166950 }, { "epoch": 2.3945247519073023, "grad_norm": 0.6886548399925232, "learning_rate": 6.727517049438059e-06, "loss": 0.0423, "step": 166975 }, { "epoch": 2.394883267366489, "grad_norm": 0.7644559741020203, "learning_rate": 6.723533535872868e-06, "loss": 0.1407, "step": 167000 }, { "epoch": 2.3952417828256753, "grad_norm": 1.1261636018753052, "learning_rate": 6.7195500223076764e-06, "loss": 0.0446, "step": 167025 }, { "epoch": 2.395600298284862, "grad_norm": 9.433162689208984, "learning_rate": 6.715566508742485e-06, "loss": 0.0465, "step": 167050 }, { "epoch": 2.3959588137440484, "grad_norm": 1.3077796697616577, "learning_rate": 6.711582995177293e-06, "loss": 0.0644, "step": 167075 }, { "epoch": 2.396317329203235, "grad_norm": 0.20537571609020233, "learning_rate": 6.707599481612102e-06, "loss": 0.0995, "step": 167100 }, { "epoch": 2.396675844662422, "grad_norm": 7.7234578132629395, "learning_rate": 6.70361596804691e-06, "loss": 0.074, "step": 167125 }, { "epoch": 2.3970343601216086, "grad_norm": 8.02213191986084, "learning_rate": 6.699632454481719e-06, "loss": 0.0433, "step": 167150 }, { "epoch": 2.397392875580795, "grad_norm": 3.35626482963562, "learning_rate": 6.695648940916527e-06, "loss": 0.138, "step": 167175 }, { "epoch": 2.3977513910399817, "grad_norm": 2.56575345993042, "learning_rate": 6.691665427351336e-06, "loss": 0.0629, "step": 167200 }, { "epoch": 2.398109906499168, "grad_norm": 2.5752151012420654, "learning_rate": 6.6876819137861434e-06, "loss": 0.1281, "step": 167225 }, { "epoch": 2.3984684219583547, "grad_norm": 0.03930624574422836, "learning_rate": 6.683698400220952e-06, "loss": 0.0555, "step": 167250 }, { "epoch": 2.3988269374175415, "grad_norm": 10.015658378601074, "learning_rate": 6.679714886655761e-06, "loss": 0.1411, "step": 167275 }, { "epoch": 2.3991854528767282, "grad_norm": 13.573426246643066, "learning_rate": 6.6757313730905695e-06, "loss": 0.1062, "step": 167300 }, { "epoch": 2.3995439683359145, "grad_norm": 6.024008274078369, "learning_rate": 6.671747859525378e-06, "loss": 0.1167, "step": 167325 }, { "epoch": 2.3999024837951013, "grad_norm": 2.9810256958007812, "learning_rate": 6.667764345960187e-06, "loss": 0.1102, "step": 167350 }, { "epoch": 2.4002609992542876, "grad_norm": 12.423349380493164, "learning_rate": 6.663780832394995e-06, "loss": 0.0813, "step": 167375 }, { "epoch": 2.4006195147134743, "grad_norm": 1.0775420665740967, "learning_rate": 6.659797318829803e-06, "loss": 0.0396, "step": 167400 }, { "epoch": 2.400978030172661, "grad_norm": 2.8882625102996826, "learning_rate": 6.655813805264612e-06, "loss": 0.1133, "step": 167425 }, { "epoch": 2.401336545631848, "grad_norm": 0.778374433517456, "learning_rate": 6.65183029169942e-06, "loss": 0.0527, "step": 167450 }, { "epoch": 2.401695061091034, "grad_norm": 25.89003562927246, "learning_rate": 6.647846778134229e-06, "loss": 0.1253, "step": 167475 }, { "epoch": 2.402053576550221, "grad_norm": 14.267892837524414, "learning_rate": 6.6438632645690365e-06, "loss": 0.1061, "step": 167500 }, { "epoch": 2.402412092009407, "grad_norm": 6.153668403625488, "learning_rate": 6.639879751003845e-06, "loss": 0.2412, "step": 167525 }, { "epoch": 2.402770607468594, "grad_norm": 5.813910007476807, "learning_rate": 6.635896237438654e-06, "loss": 0.083, "step": 167550 }, { "epoch": 2.4031291229277807, "grad_norm": 0.1640079766511917, "learning_rate": 6.6319127238734626e-06, "loss": 0.0957, "step": 167575 }, { "epoch": 2.4034876383869674, "grad_norm": 0.46061939001083374, "learning_rate": 6.627929210308271e-06, "loss": 0.036, "step": 167600 }, { "epoch": 2.4038461538461537, "grad_norm": 0.6467103958129883, "learning_rate": 6.62394569674308e-06, "loss": 0.0968, "step": 167625 }, { "epoch": 2.4042046693053405, "grad_norm": 7.37000846862793, "learning_rate": 6.619962183177888e-06, "loss": 0.1301, "step": 167650 }, { "epoch": 2.4045631847645272, "grad_norm": 6.827070236206055, "learning_rate": 6.6159786696126965e-06, "loss": 0.1044, "step": 167675 }, { "epoch": 2.4049217002237135, "grad_norm": 0.4385766386985779, "learning_rate": 6.611995156047505e-06, "loss": 0.0833, "step": 167700 }, { "epoch": 2.4052802156829003, "grad_norm": 0.9371958374977112, "learning_rate": 6.608011642482313e-06, "loss": 0.156, "step": 167725 }, { "epoch": 2.405638731142087, "grad_norm": 2.0487558841705322, "learning_rate": 6.604028128917122e-06, "loss": 0.0737, "step": 167750 }, { "epoch": 2.4059972466012733, "grad_norm": 3.6951770782470703, "learning_rate": 6.60004461535193e-06, "loss": 0.0851, "step": 167775 }, { "epoch": 2.40635576206046, "grad_norm": 0.5852952003479004, "learning_rate": 6.596061101786738e-06, "loss": 0.0602, "step": 167800 }, { "epoch": 2.406714277519647, "grad_norm": 2.8663384914398193, "learning_rate": 6.592077588221547e-06, "loss": 0.0534, "step": 167825 }, { "epoch": 2.407072792978833, "grad_norm": 0.19661307334899902, "learning_rate": 6.588094074656356e-06, "loss": 0.0292, "step": 167850 }, { "epoch": 2.40743130843802, "grad_norm": 0.8125025033950806, "learning_rate": 6.584110561091164e-06, "loss": 0.0471, "step": 167875 }, { "epoch": 2.4077898238972066, "grad_norm": 1.6516036987304688, "learning_rate": 6.580127047525973e-06, "loss": 0.1043, "step": 167900 }, { "epoch": 2.408148339356393, "grad_norm": 0.3808016777038574, "learning_rate": 6.576143533960782e-06, "loss": 0.1257, "step": 167925 }, { "epoch": 2.4085068548155797, "grad_norm": 0.7771733403205872, "learning_rate": 6.5721600203955896e-06, "loss": 0.0651, "step": 167950 }, { "epoch": 2.4088653702747664, "grad_norm": 2.122230291366577, "learning_rate": 6.568176506830398e-06, "loss": 0.0827, "step": 167975 }, { "epoch": 2.4092238857339527, "grad_norm": 1.5673637390136719, "learning_rate": 6.564192993265206e-06, "loss": 0.1068, "step": 168000 }, { "epoch": 2.4095824011931395, "grad_norm": 18.947216033935547, "learning_rate": 6.560209479700015e-06, "loss": 0.0873, "step": 168025 }, { "epoch": 2.4099409166523262, "grad_norm": 0.020012017339468002, "learning_rate": 6.5562259661348235e-06, "loss": 0.0575, "step": 168050 }, { "epoch": 2.4102994321115125, "grad_norm": 12.449634552001953, "learning_rate": 6.552242452569632e-06, "loss": 0.0969, "step": 168075 }, { "epoch": 2.4106579475706993, "grad_norm": 21.168699264526367, "learning_rate": 6.54825893900444e-06, "loss": 0.1577, "step": 168100 }, { "epoch": 2.411016463029886, "grad_norm": 5.882604598999023, "learning_rate": 6.544275425439249e-06, "loss": 0.0686, "step": 168125 }, { "epoch": 2.4113749784890723, "grad_norm": 4.209927082061768, "learning_rate": 6.540291911874057e-06, "loss": 0.0935, "step": 168150 }, { "epoch": 2.411733493948259, "grad_norm": 3.4262654781341553, "learning_rate": 6.536308398308866e-06, "loss": 0.1122, "step": 168175 }, { "epoch": 2.412092009407446, "grad_norm": 0.5225535035133362, "learning_rate": 6.532324884743675e-06, "loss": 0.082, "step": 168200 }, { "epoch": 2.412450524866632, "grad_norm": 2.0147364139556885, "learning_rate": 6.5283413711784835e-06, "loss": 0.0672, "step": 168225 }, { "epoch": 2.412809040325819, "grad_norm": 15.454185485839844, "learning_rate": 6.524357857613291e-06, "loss": 0.099, "step": 168250 }, { "epoch": 2.4131675557850056, "grad_norm": 11.845735549926758, "learning_rate": 6.520374344048099e-06, "loss": 0.1014, "step": 168275 }, { "epoch": 2.413526071244192, "grad_norm": 4.277926445007324, "learning_rate": 6.516390830482908e-06, "loss": 0.0634, "step": 168300 }, { "epoch": 2.4138845867033787, "grad_norm": 0.9572390913963318, "learning_rate": 6.5124073169177165e-06, "loss": 0.104, "step": 168325 }, { "epoch": 2.4142431021625654, "grad_norm": 16.859699249267578, "learning_rate": 6.508423803352525e-06, "loss": 0.0743, "step": 168350 }, { "epoch": 2.4146016176217517, "grad_norm": 1.4116685390472412, "learning_rate": 6.504440289787334e-06, "loss": 0.0861, "step": 168375 }, { "epoch": 2.4149601330809385, "grad_norm": 18.456838607788086, "learning_rate": 6.500456776222142e-06, "loss": 0.0632, "step": 168400 }, { "epoch": 2.4153186485401252, "grad_norm": 0.13662664592266083, "learning_rate": 6.4964732626569505e-06, "loss": 0.044, "step": 168425 }, { "epoch": 2.4156771639993115, "grad_norm": 3.5454087257385254, "learning_rate": 6.492489749091759e-06, "loss": 0.0615, "step": 168450 }, { "epoch": 2.4160356794584983, "grad_norm": 1.6687499284744263, "learning_rate": 6.488506235526568e-06, "loss": 0.1086, "step": 168475 }, { "epoch": 2.416394194917685, "grad_norm": 6.248967170715332, "learning_rate": 6.4845227219613765e-06, "loss": 0.0801, "step": 168500 }, { "epoch": 2.4167527103768713, "grad_norm": 0.8166309595108032, "learning_rate": 6.480539208396184e-06, "loss": 0.0981, "step": 168525 }, { "epoch": 2.417111225836058, "grad_norm": 3.0294528007507324, "learning_rate": 6.476555694830992e-06, "loss": 0.133, "step": 168550 }, { "epoch": 2.417469741295245, "grad_norm": 4.827512264251709, "learning_rate": 6.472572181265801e-06, "loss": 0.0725, "step": 168575 }, { "epoch": 2.417828256754431, "grad_norm": 5.5028510093688965, "learning_rate": 6.46858866770061e-06, "loss": 0.1125, "step": 168600 }, { "epoch": 2.418186772213618, "grad_norm": 1.2961410284042358, "learning_rate": 6.464605154135418e-06, "loss": 0.0887, "step": 168625 }, { "epoch": 2.4185452876728046, "grad_norm": 0.15743492543697357, "learning_rate": 6.460621640570227e-06, "loss": 0.149, "step": 168650 }, { "epoch": 2.418903803131991, "grad_norm": 0.653508186340332, "learning_rate": 6.456638127005036e-06, "loss": 0.0883, "step": 168675 }, { "epoch": 2.4192623185911777, "grad_norm": 1.355454683303833, "learning_rate": 6.4526546134398435e-06, "loss": 0.062, "step": 168700 }, { "epoch": 2.4196208340503644, "grad_norm": 3.699422836303711, "learning_rate": 6.448671099874652e-06, "loss": 0.0709, "step": 168725 }, { "epoch": 2.4199793495095507, "grad_norm": 5.147576332092285, "learning_rate": 6.444687586309461e-06, "loss": 0.0665, "step": 168750 }, { "epoch": 2.4203378649687375, "grad_norm": 2.1673402786254883, "learning_rate": 6.44070407274427e-06, "loss": 0.113, "step": 168775 }, { "epoch": 2.4206963804279242, "grad_norm": 4.265462875366211, "learning_rate": 6.4367205591790774e-06, "loss": 0.0649, "step": 168800 }, { "epoch": 2.4210548958871105, "grad_norm": 1.3943711519241333, "learning_rate": 6.432737045613886e-06, "loss": 0.0801, "step": 168825 }, { "epoch": 2.4214134113462973, "grad_norm": 1.1129225492477417, "learning_rate": 6.428753532048694e-06, "loss": 0.1275, "step": 168850 }, { "epoch": 2.421771926805484, "grad_norm": 0.275556355714798, "learning_rate": 6.424770018483503e-06, "loss": 0.0365, "step": 168875 }, { "epoch": 2.4221304422646703, "grad_norm": 0.416557252407074, "learning_rate": 6.420786504918311e-06, "loss": 0.0851, "step": 168900 }, { "epoch": 2.422488957723857, "grad_norm": 0.01860082522034645, "learning_rate": 6.41680299135312e-06, "loss": 0.062, "step": 168925 }, { "epoch": 2.422847473183044, "grad_norm": 6.39784574508667, "learning_rate": 6.412819477787929e-06, "loss": 0.0512, "step": 168950 }, { "epoch": 2.42320598864223, "grad_norm": 5.1048359870910645, "learning_rate": 6.4088359642227374e-06, "loss": 0.0433, "step": 168975 }, { "epoch": 2.423564504101417, "grad_norm": 7.357001781463623, "learning_rate": 6.404852450657545e-06, "loss": 0.1407, "step": 169000 }, { "epoch": 2.4239230195606036, "grad_norm": 2.3475654125213623, "learning_rate": 6.400868937092354e-06, "loss": 0.1076, "step": 169025 }, { "epoch": 2.42428153501979, "grad_norm": 1.119372844696045, "learning_rate": 6.396885423527163e-06, "loss": 0.1139, "step": 169050 }, { "epoch": 2.4246400504789767, "grad_norm": 4.537450313568115, "learning_rate": 6.3929019099619705e-06, "loss": 0.0727, "step": 169075 }, { "epoch": 2.4249985659381634, "grad_norm": 0.40010103583335876, "learning_rate": 6.388918396396779e-06, "loss": 0.0631, "step": 169100 }, { "epoch": 2.4253570813973497, "grad_norm": 1.6204311847686768, "learning_rate": 6.384934882831588e-06, "loss": 0.1059, "step": 169125 }, { "epoch": 2.4257155968565365, "grad_norm": 1.5343528985977173, "learning_rate": 6.380951369266396e-06, "loss": 0.062, "step": 169150 }, { "epoch": 2.4260741123157232, "grad_norm": 8.207232475280762, "learning_rate": 6.3769678557012044e-06, "loss": 0.1283, "step": 169175 }, { "epoch": 2.4264326277749095, "grad_norm": 1.9979506731033325, "learning_rate": 6.372984342136013e-06, "loss": 0.0513, "step": 169200 }, { "epoch": 2.4267911432340963, "grad_norm": 0.1186886876821518, "learning_rate": 6.369000828570822e-06, "loss": 0.0803, "step": 169225 }, { "epoch": 2.427149658693283, "grad_norm": 3.76287841796875, "learning_rate": 6.3650173150056305e-06, "loss": 0.1113, "step": 169250 }, { "epoch": 2.4275081741524693, "grad_norm": 1.305887222290039, "learning_rate": 6.361033801440439e-06, "loss": 0.1268, "step": 169275 }, { "epoch": 2.427866689611656, "grad_norm": 0.3608660101890564, "learning_rate": 6.357050287875247e-06, "loss": 0.0883, "step": 169300 }, { "epoch": 2.428225205070843, "grad_norm": 1.624766230583191, "learning_rate": 6.353066774310056e-06, "loss": 0.1276, "step": 169325 }, { "epoch": 2.428583720530029, "grad_norm": 0.1669517606496811, "learning_rate": 6.349083260744864e-06, "loss": 0.0772, "step": 169350 }, { "epoch": 2.428942235989216, "grad_norm": 0.07345419377088547, "learning_rate": 6.345099747179672e-06, "loss": 0.0974, "step": 169375 }, { "epoch": 2.4293007514484026, "grad_norm": 0.6475887894630432, "learning_rate": 6.341116233614481e-06, "loss": 0.1101, "step": 169400 }, { "epoch": 2.429659266907589, "grad_norm": 0.37386927008628845, "learning_rate": 6.33713272004929e-06, "loss": 0.0544, "step": 169425 }, { "epoch": 2.4300177823667757, "grad_norm": 0.8830362558364868, "learning_rate": 6.3331492064840975e-06, "loss": 0.103, "step": 169450 }, { "epoch": 2.4303762978259624, "grad_norm": 4.203429222106934, "learning_rate": 6.329165692918906e-06, "loss": 0.1292, "step": 169475 }, { "epoch": 2.4307348132851487, "grad_norm": 1.9484206438064575, "learning_rate": 6.325182179353715e-06, "loss": 0.0435, "step": 169500 }, { "epoch": 2.4310933287443355, "grad_norm": 1.567400574684143, "learning_rate": 6.3211986657885236e-06, "loss": 0.1109, "step": 169525 }, { "epoch": 2.4314518442035222, "grad_norm": 1.027736783027649, "learning_rate": 6.317215152223332e-06, "loss": 0.0937, "step": 169550 }, { "epoch": 2.4318103596627085, "grad_norm": 3.571669816970825, "learning_rate": 6.313231638658141e-06, "loss": 0.0977, "step": 169575 }, { "epoch": 2.4321688751218953, "grad_norm": 2.750824451446533, "learning_rate": 6.309248125092949e-06, "loss": 0.0748, "step": 169600 }, { "epoch": 2.432527390581082, "grad_norm": 1.7211494445800781, "learning_rate": 6.3052646115277575e-06, "loss": 0.1089, "step": 169625 }, { "epoch": 2.4328859060402683, "grad_norm": 3.369560718536377, "learning_rate": 6.301281097962565e-06, "loss": 0.0915, "step": 169650 }, { "epoch": 2.433244421499455, "grad_norm": 3.9336979389190674, "learning_rate": 6.297297584397374e-06, "loss": 0.1176, "step": 169675 }, { "epoch": 2.433602936958642, "grad_norm": 0.20127691328525543, "learning_rate": 6.293314070832183e-06, "loss": 0.0492, "step": 169700 }, { "epoch": 2.433961452417828, "grad_norm": 7.021227836608887, "learning_rate": 6.289330557266991e-06, "loss": 0.068, "step": 169725 }, { "epoch": 2.434319967877015, "grad_norm": 8.352824211120605, "learning_rate": 6.285347043701799e-06, "loss": 0.1099, "step": 169750 }, { "epoch": 2.4346784833362016, "grad_norm": 0.11114850640296936, "learning_rate": 6.281363530136608e-06, "loss": 0.0669, "step": 169775 }, { "epoch": 2.435036998795388, "grad_norm": 17.789213180541992, "learning_rate": 6.277380016571417e-06, "loss": 0.1012, "step": 169800 }, { "epoch": 2.4353955142545747, "grad_norm": 15.50032901763916, "learning_rate": 6.273396503006225e-06, "loss": 0.1443, "step": 169825 }, { "epoch": 2.4357540297137614, "grad_norm": 1.247633457183838, "learning_rate": 6.269412989441034e-06, "loss": 0.1236, "step": 169850 }, { "epoch": 2.4361125451729477, "grad_norm": 4.262338161468506, "learning_rate": 6.265429475875843e-06, "loss": 0.0985, "step": 169875 }, { "epoch": 2.4364710606321345, "grad_norm": 7.619831562042236, "learning_rate": 6.2614459623106506e-06, "loss": 0.1326, "step": 169900 }, { "epoch": 2.4368295760913212, "grad_norm": 0.42373019456863403, "learning_rate": 6.257462448745458e-06, "loss": 0.081, "step": 169925 }, { "epoch": 2.4371880915505075, "grad_norm": 1.1667852401733398, "learning_rate": 6.253478935180267e-06, "loss": 0.0802, "step": 169950 }, { "epoch": 2.4375466070096943, "grad_norm": 2.033609390258789, "learning_rate": 6.249495421615076e-06, "loss": 0.0844, "step": 169975 }, { "epoch": 2.437905122468881, "grad_norm": 24.99284553527832, "learning_rate": 6.2455119080498845e-06, "loss": 0.1209, "step": 170000 }, { "epoch": 2.4382636379280673, "grad_norm": 0.07607445865869522, "learning_rate": 6.241528394484693e-06, "loss": 0.0729, "step": 170025 }, { "epoch": 2.438622153387254, "grad_norm": 0.061439502984285355, "learning_rate": 6.237544880919501e-06, "loss": 0.0554, "step": 170050 }, { "epoch": 2.438980668846441, "grad_norm": 8.326957702636719, "learning_rate": 6.23356136735431e-06, "loss": 0.0841, "step": 170075 }, { "epoch": 2.439339184305627, "grad_norm": 5.9397687911987305, "learning_rate": 6.229577853789118e-06, "loss": 0.0877, "step": 170100 }, { "epoch": 2.439697699764814, "grad_norm": 0.7874411940574646, "learning_rate": 6.225594340223927e-06, "loss": 0.1488, "step": 170125 }, { "epoch": 2.4400562152240006, "grad_norm": 0.6540043354034424, "learning_rate": 6.221610826658736e-06, "loss": 0.0541, "step": 170150 }, { "epoch": 2.440414730683187, "grad_norm": 18.07306480407715, "learning_rate": 6.217627313093544e-06, "loss": 0.0879, "step": 170175 }, { "epoch": 2.4407732461423737, "grad_norm": 0.06940533220767975, "learning_rate": 6.2136437995283515e-06, "loss": 0.0778, "step": 170200 }, { "epoch": 2.4411317616015604, "grad_norm": 7.278729438781738, "learning_rate": 6.20966028596316e-06, "loss": 0.1089, "step": 170225 }, { "epoch": 2.4414902770607467, "grad_norm": 2.513650894165039, "learning_rate": 6.205676772397969e-06, "loss": 0.0688, "step": 170250 }, { "epoch": 2.4418487925199335, "grad_norm": 9.794474601745605, "learning_rate": 6.2016932588327775e-06, "loss": 0.1078, "step": 170275 }, { "epoch": 2.4422073079791202, "grad_norm": 5.18770170211792, "learning_rate": 6.197709745267586e-06, "loss": 0.081, "step": 170300 }, { "epoch": 2.4425658234383065, "grad_norm": 3.746039628982544, "learning_rate": 6.193726231702395e-06, "loss": 0.0681, "step": 170325 }, { "epoch": 2.4429243388974933, "grad_norm": 0.09732099622488022, "learning_rate": 6.189742718137203e-06, "loss": 0.0795, "step": 170350 }, { "epoch": 2.44328285435668, "grad_norm": 1.333479881286621, "learning_rate": 6.1857592045720115e-06, "loss": 0.098, "step": 170375 }, { "epoch": 2.4436413698158663, "grad_norm": 0.24489746987819672, "learning_rate": 6.18177569100682e-06, "loss": 0.0731, "step": 170400 }, { "epoch": 2.443999885275053, "grad_norm": 0.1337307244539261, "learning_rate": 6.177792177441629e-06, "loss": 0.0704, "step": 170425 }, { "epoch": 2.44435840073424, "grad_norm": 12.56424617767334, "learning_rate": 6.173808663876437e-06, "loss": 0.0674, "step": 170450 }, { "epoch": 2.444716916193426, "grad_norm": 0.19770827889442444, "learning_rate": 6.169825150311245e-06, "loss": 0.06, "step": 170475 }, { "epoch": 2.445075431652613, "grad_norm": 4.014969348907471, "learning_rate": 6.165841636746053e-06, "loss": 0.0352, "step": 170500 }, { "epoch": 2.4454339471117996, "grad_norm": 0.6418822407722473, "learning_rate": 6.161858123180862e-06, "loss": 0.088, "step": 170525 }, { "epoch": 2.445792462570986, "grad_norm": 1.3304882049560547, "learning_rate": 6.157874609615671e-06, "loss": 0.1334, "step": 170550 }, { "epoch": 2.4461509780301727, "grad_norm": 3.2617876529693604, "learning_rate": 6.153891096050479e-06, "loss": 0.0521, "step": 170575 }, { "epoch": 2.4465094934893594, "grad_norm": 2.0534045696258545, "learning_rate": 6.149907582485288e-06, "loss": 0.1698, "step": 170600 }, { "epoch": 2.4468680089485457, "grad_norm": 2.2176716327667236, "learning_rate": 6.145924068920097e-06, "loss": 0.0959, "step": 170625 }, { "epoch": 2.4472265244077325, "grad_norm": 0.676806628704071, "learning_rate": 6.1419405553549045e-06, "loss": 0.0611, "step": 170650 }, { "epoch": 2.447585039866919, "grad_norm": 5.68393087387085, "learning_rate": 6.137957041789713e-06, "loss": 0.085, "step": 170675 }, { "epoch": 2.4479435553261055, "grad_norm": 3.4752538204193115, "learning_rate": 6.133973528224522e-06, "loss": 0.08, "step": 170700 }, { "epoch": 2.4483020707852923, "grad_norm": 7.984546661376953, "learning_rate": 6.12999001465933e-06, "loss": 0.0759, "step": 170725 }, { "epoch": 2.448660586244479, "grad_norm": 0.08410092443227768, "learning_rate": 6.1260065010941385e-06, "loss": 0.0526, "step": 170750 }, { "epoch": 2.4490191017036653, "grad_norm": 1.8322561979293823, "learning_rate": 6.122022987528947e-06, "loss": 0.0709, "step": 170775 }, { "epoch": 2.449377617162852, "grad_norm": 1.5695364475250244, "learning_rate": 6.118039473963755e-06, "loss": 0.0463, "step": 170800 }, { "epoch": 2.449736132622039, "grad_norm": 0.9878901243209839, "learning_rate": 6.114055960398564e-06, "loss": 0.0643, "step": 170825 }, { "epoch": 2.450094648081225, "grad_norm": 0.33203956484794617, "learning_rate": 6.110072446833372e-06, "loss": 0.0495, "step": 170850 }, { "epoch": 2.450453163540412, "grad_norm": 1.3920730352401733, "learning_rate": 6.106088933268181e-06, "loss": 0.0525, "step": 170875 }, { "epoch": 2.4508116789995986, "grad_norm": 4.001405239105225, "learning_rate": 6.10210541970299e-06, "loss": 0.1448, "step": 170900 }, { "epoch": 2.451170194458785, "grad_norm": 0.834725558757782, "learning_rate": 6.0981219061377984e-06, "loss": 0.0826, "step": 170925 }, { "epoch": 2.4515287099179717, "grad_norm": 17.959444046020508, "learning_rate": 6.094138392572606e-06, "loss": 0.0918, "step": 170950 }, { "epoch": 2.4518872253771584, "grad_norm": 8.230527877807617, "learning_rate": 6.090154879007415e-06, "loss": 0.1998, "step": 170975 }, { "epoch": 2.4522457408363447, "grad_norm": 1.939614176750183, "learning_rate": 6.086171365442223e-06, "loss": 0.0919, "step": 171000 }, { "epoch": 2.4526042562955315, "grad_norm": 0.40233752131462097, "learning_rate": 6.0821878518770315e-06, "loss": 0.0812, "step": 171025 }, { "epoch": 2.452962771754718, "grad_norm": 0.19670246541500092, "learning_rate": 6.07820433831184e-06, "loss": 0.0914, "step": 171050 }, { "epoch": 2.4533212872139045, "grad_norm": 0.4364672303199768, "learning_rate": 6.074220824746649e-06, "loss": 0.0586, "step": 171075 }, { "epoch": 2.4536798026730913, "grad_norm": 9.635234832763672, "learning_rate": 6.070237311181457e-06, "loss": 0.0699, "step": 171100 }, { "epoch": 2.454038318132278, "grad_norm": 0.09199002385139465, "learning_rate": 6.0662537976162654e-06, "loss": 0.1201, "step": 171125 }, { "epoch": 2.4543968335914643, "grad_norm": 0.7996883392333984, "learning_rate": 6.062270284051074e-06, "loss": 0.0998, "step": 171150 }, { "epoch": 2.454755349050651, "grad_norm": 0.39455610513687134, "learning_rate": 6.058286770485883e-06, "loss": 0.0533, "step": 171175 }, { "epoch": 2.455113864509838, "grad_norm": 0.16007569432258606, "learning_rate": 6.0543032569206915e-06, "loss": 0.047, "step": 171200 }, { "epoch": 2.455472379969024, "grad_norm": 9.927469253540039, "learning_rate": 6.0503197433555e-06, "loss": 0.1015, "step": 171225 }, { "epoch": 2.455830895428211, "grad_norm": 11.964567184448242, "learning_rate": 6.046336229790308e-06, "loss": 0.092, "step": 171250 }, { "epoch": 2.4561894108873976, "grad_norm": 1.6036708354949951, "learning_rate": 6.042352716225116e-06, "loss": 0.104, "step": 171275 }, { "epoch": 2.456547926346584, "grad_norm": 0.9305931329727173, "learning_rate": 6.038369202659925e-06, "loss": 0.0612, "step": 171300 }, { "epoch": 2.4569064418057707, "grad_norm": 4.391105651855469, "learning_rate": 6.034385689094733e-06, "loss": 0.1265, "step": 171325 }, { "epoch": 2.4572649572649574, "grad_norm": 0.05877850577235222, "learning_rate": 6.030402175529542e-06, "loss": 0.0486, "step": 171350 }, { "epoch": 2.4576234727241437, "grad_norm": 1.532380223274231, "learning_rate": 6.026418661964351e-06, "loss": 0.0699, "step": 171375 }, { "epoch": 2.4579819881833305, "grad_norm": 9.033149719238281, "learning_rate": 6.0224351483991585e-06, "loss": 0.0505, "step": 171400 }, { "epoch": 2.458340503642517, "grad_norm": 6.542507648468018, "learning_rate": 6.018451634833967e-06, "loss": 0.0802, "step": 171425 }, { "epoch": 2.4586990191017035, "grad_norm": 8.380196571350098, "learning_rate": 6.014468121268776e-06, "loss": 0.109, "step": 171450 }, { "epoch": 2.4590575345608903, "grad_norm": 0.7019802927970886, "learning_rate": 6.010484607703585e-06, "loss": 0.0787, "step": 171475 }, { "epoch": 2.459416050020077, "grad_norm": 0.9342837333679199, "learning_rate": 6.006501094138393e-06, "loss": 0.1029, "step": 171500 }, { "epoch": 2.4597745654792633, "grad_norm": 3.048525094985962, "learning_rate": 6.002517580573202e-06, "loss": 0.0737, "step": 171525 }, { "epoch": 2.46013308093845, "grad_norm": 7.563465595245361, "learning_rate": 5.998534067008009e-06, "loss": 0.1014, "step": 171550 }, { "epoch": 2.460491596397637, "grad_norm": 0.23915018141269684, "learning_rate": 5.994550553442818e-06, "loss": 0.0995, "step": 171575 }, { "epoch": 2.460850111856823, "grad_norm": 15.596272468566895, "learning_rate": 5.990567039877626e-06, "loss": 0.0889, "step": 171600 }, { "epoch": 2.46120862731601, "grad_norm": 9.823634147644043, "learning_rate": 5.986583526312435e-06, "loss": 0.1228, "step": 171625 }, { "epoch": 2.4615671427751966, "grad_norm": 12.908123016357422, "learning_rate": 5.982600012747244e-06, "loss": 0.1019, "step": 171650 }, { "epoch": 2.461925658234383, "grad_norm": 0.3256905674934387, "learning_rate": 5.978616499182052e-06, "loss": 0.1604, "step": 171675 }, { "epoch": 2.4622841736935697, "grad_norm": 0.03561647608876228, "learning_rate": 5.97463298561686e-06, "loss": 0.1261, "step": 171700 }, { "epoch": 2.4626426891527564, "grad_norm": 5.8253092765808105, "learning_rate": 5.970649472051669e-06, "loss": 0.0739, "step": 171725 }, { "epoch": 2.4630012046119427, "grad_norm": 3.2909841537475586, "learning_rate": 5.966665958486478e-06, "loss": 0.1403, "step": 171750 }, { "epoch": 2.4633597200711295, "grad_norm": 0.10162002593278885, "learning_rate": 5.962682444921286e-06, "loss": 0.1253, "step": 171775 }, { "epoch": 2.463718235530316, "grad_norm": 4.01262092590332, "learning_rate": 5.958698931356095e-06, "loss": 0.0834, "step": 171800 }, { "epoch": 2.4640767509895025, "grad_norm": 6.992764472961426, "learning_rate": 5.954715417790903e-06, "loss": 0.0825, "step": 171825 }, { "epoch": 2.4644352664486893, "grad_norm": 0.043681755661964417, "learning_rate": 5.950731904225711e-06, "loss": 0.0778, "step": 171850 }, { "epoch": 2.464793781907876, "grad_norm": 0.5951498746871948, "learning_rate": 5.946748390660519e-06, "loss": 0.0666, "step": 171875 }, { "epoch": 2.4651522973670623, "grad_norm": 3.65701961517334, "learning_rate": 5.942764877095328e-06, "loss": 0.1046, "step": 171900 }, { "epoch": 2.465510812826249, "grad_norm": 1.0031956434249878, "learning_rate": 5.938781363530137e-06, "loss": 0.0702, "step": 171925 }, { "epoch": 2.465869328285436, "grad_norm": 1.1028331518173218, "learning_rate": 5.9347978499649455e-06, "loss": 0.0831, "step": 171950 }, { "epoch": 2.466227843744622, "grad_norm": 0.08812850713729858, "learning_rate": 5.930814336399753e-06, "loss": 0.105, "step": 171975 }, { "epoch": 2.466586359203809, "grad_norm": 13.848312377929688, "learning_rate": 5.926830822834562e-06, "loss": 0.1204, "step": 172000 }, { "epoch": 2.4669448746629956, "grad_norm": 2.2844150066375732, "learning_rate": 5.922847309269371e-06, "loss": 0.1397, "step": 172025 }, { "epoch": 2.467303390122182, "grad_norm": 0.28942808508872986, "learning_rate": 5.918863795704179e-06, "loss": 0.1161, "step": 172050 }, { "epoch": 2.4676619055813687, "grad_norm": 0.3050612807273865, "learning_rate": 5.914880282138988e-06, "loss": 0.0608, "step": 172075 }, { "epoch": 2.4680204210405554, "grad_norm": 1.2376329898834229, "learning_rate": 5.910896768573796e-06, "loss": 0.1038, "step": 172100 }, { "epoch": 2.4683789364997417, "grad_norm": 4.472768306732178, "learning_rate": 5.906913255008604e-06, "loss": 0.067, "step": 172125 }, { "epoch": 2.4687374519589285, "grad_norm": 17.998336791992188, "learning_rate": 5.9029297414434125e-06, "loss": 0.1033, "step": 172150 }, { "epoch": 2.469095967418115, "grad_norm": 4.030461311340332, "learning_rate": 5.898946227878221e-06, "loss": 0.1215, "step": 172175 }, { "epoch": 2.4694544828773015, "grad_norm": 17.028791427612305, "learning_rate": 5.89496271431303e-06, "loss": 0.0705, "step": 172200 }, { "epoch": 2.4698129983364883, "grad_norm": 0.11493870615959167, "learning_rate": 5.8909792007478386e-06, "loss": 0.0795, "step": 172225 }, { "epoch": 2.470171513795675, "grad_norm": 0.8581806421279907, "learning_rate": 5.886995687182647e-06, "loss": 0.0504, "step": 172250 }, { "epoch": 2.4705300292548613, "grad_norm": 6.8606743812561035, "learning_rate": 5.883012173617455e-06, "loss": 0.1064, "step": 172275 }, { "epoch": 2.470888544714048, "grad_norm": 1.13850998878479, "learning_rate": 5.879028660052264e-06, "loss": 0.3041, "step": 172300 }, { "epoch": 2.471247060173235, "grad_norm": 1.3899081945419312, "learning_rate": 5.8750451464870725e-06, "loss": 0.0476, "step": 172325 }, { "epoch": 2.471605575632421, "grad_norm": 2.7986278533935547, "learning_rate": 5.871061632921881e-06, "loss": 0.0991, "step": 172350 }, { "epoch": 2.471964091091608, "grad_norm": 0.13956555724143982, "learning_rate": 5.867078119356689e-06, "loss": 0.0648, "step": 172375 }, { "epoch": 2.4723226065507946, "grad_norm": 0.46275198459625244, "learning_rate": 5.863094605791498e-06, "loss": 0.1068, "step": 172400 }, { "epoch": 2.472681122009981, "grad_norm": 0.09714408218860626, "learning_rate": 5.8591110922263055e-06, "loss": 0.0956, "step": 172425 }, { "epoch": 2.4730396374691677, "grad_norm": 18.15220069885254, "learning_rate": 5.855127578661114e-06, "loss": 0.0695, "step": 172450 }, { "epoch": 2.4733981529283544, "grad_norm": 11.494933128356934, "learning_rate": 5.851144065095923e-06, "loss": 0.0382, "step": 172475 }, { "epoch": 2.4737566683875407, "grad_norm": 0.8369797468185425, "learning_rate": 5.847160551530732e-06, "loss": 0.0826, "step": 172500 }, { "epoch": 2.4741151838467275, "grad_norm": 1.7211329936981201, "learning_rate": 5.84317703796554e-06, "loss": 0.0347, "step": 172525 }, { "epoch": 2.474473699305914, "grad_norm": 11.065835952758789, "learning_rate": 5.839193524400349e-06, "loss": 0.0983, "step": 172550 }, { "epoch": 2.4748322147651005, "grad_norm": 5.062707424163818, "learning_rate": 5.835210010835157e-06, "loss": 0.0519, "step": 172575 }, { "epoch": 2.4751907302242873, "grad_norm": 1.717736005783081, "learning_rate": 5.8312264972699655e-06, "loss": 0.1083, "step": 172600 }, { "epoch": 2.475549245683474, "grad_norm": 23.052207946777344, "learning_rate": 5.827242983704774e-06, "loss": 0.1206, "step": 172625 }, { "epoch": 2.4759077611426603, "grad_norm": 4.356502056121826, "learning_rate": 5.823259470139582e-06, "loss": 0.1205, "step": 172650 }, { "epoch": 2.476266276601847, "grad_norm": 4.406678199768066, "learning_rate": 5.819275956574391e-06, "loss": 0.0982, "step": 172675 }, { "epoch": 2.476624792061034, "grad_norm": 18.156667709350586, "learning_rate": 5.8152924430091995e-06, "loss": 0.1108, "step": 172700 }, { "epoch": 2.47698330752022, "grad_norm": 0.8201711177825928, "learning_rate": 5.811308929444007e-06, "loss": 0.1096, "step": 172725 }, { "epoch": 2.477341822979407, "grad_norm": 0.8881312012672424, "learning_rate": 5.807325415878816e-06, "loss": 0.0727, "step": 172750 }, { "epoch": 2.4777003384385936, "grad_norm": 0.47583964467048645, "learning_rate": 5.803341902313625e-06, "loss": 0.0806, "step": 172775 }, { "epoch": 2.47805885389778, "grad_norm": 1.1239980459213257, "learning_rate": 5.799358388748433e-06, "loss": 0.0652, "step": 172800 }, { "epoch": 2.4784173693569667, "grad_norm": 19.55539321899414, "learning_rate": 5.795374875183242e-06, "loss": 0.081, "step": 172825 }, { "epoch": 2.4787758848161534, "grad_norm": 19.944852828979492, "learning_rate": 5.791391361618051e-06, "loss": 0.0805, "step": 172850 }, { "epoch": 2.4791344002753397, "grad_norm": 15.761075019836426, "learning_rate": 5.787407848052859e-06, "loss": 0.0904, "step": 172875 }, { "epoch": 2.4794929157345265, "grad_norm": 8.248722076416016, "learning_rate": 5.783424334487667e-06, "loss": 0.0844, "step": 172900 }, { "epoch": 2.479851431193713, "grad_norm": 15.374831199645996, "learning_rate": 5.779440820922475e-06, "loss": 0.098, "step": 172925 }, { "epoch": 2.4802099466528995, "grad_norm": 0.05506917089223862, "learning_rate": 5.775457307357284e-06, "loss": 0.0832, "step": 172950 }, { "epoch": 2.4805684621120863, "grad_norm": 4.9746270179748535, "learning_rate": 5.7714737937920925e-06, "loss": 0.0627, "step": 172975 }, { "epoch": 2.480926977571273, "grad_norm": 12.998058319091797, "learning_rate": 5.767490280226901e-06, "loss": 0.0844, "step": 173000 }, { "epoch": 2.4812854930304593, "grad_norm": 1.3969143629074097, "learning_rate": 5.763506766661709e-06, "loss": 0.0752, "step": 173025 }, { "epoch": 2.481644008489646, "grad_norm": 12.940350532531738, "learning_rate": 5.759523253096518e-06, "loss": 0.0826, "step": 173050 }, { "epoch": 2.482002523948833, "grad_norm": 11.118903160095215, "learning_rate": 5.7555397395313264e-06, "loss": 0.0948, "step": 173075 }, { "epoch": 2.482361039408019, "grad_norm": 14.234774589538574, "learning_rate": 5.751556225966135e-06, "loss": 0.1225, "step": 173100 }, { "epoch": 2.482719554867206, "grad_norm": 1.0507643222808838, "learning_rate": 5.747572712400944e-06, "loss": 0.0555, "step": 173125 }, { "epoch": 2.4830780703263926, "grad_norm": 0.4651508629322052, "learning_rate": 5.7435891988357525e-06, "loss": 0.0475, "step": 173150 }, { "epoch": 2.483436585785579, "grad_norm": 0.6152603626251221, "learning_rate": 5.73960568527056e-06, "loss": 0.1457, "step": 173175 }, { "epoch": 2.4837951012447657, "grad_norm": 2.507350206375122, "learning_rate": 5.735622171705368e-06, "loss": 0.0516, "step": 173200 }, { "epoch": 2.4841536167039524, "grad_norm": 0.4211629033088684, "learning_rate": 5.731638658140177e-06, "loss": 0.085, "step": 173225 }, { "epoch": 2.4845121321631387, "grad_norm": 14.445738792419434, "learning_rate": 5.727655144574986e-06, "loss": 0.0865, "step": 173250 }, { "epoch": 2.4848706476223255, "grad_norm": 12.022144317626953, "learning_rate": 5.723671631009794e-06, "loss": 0.0453, "step": 173275 }, { "epoch": 2.485229163081512, "grad_norm": 16.74033546447754, "learning_rate": 5.719688117444603e-06, "loss": 0.0794, "step": 173300 }, { "epoch": 2.4855876785406985, "grad_norm": 0.24088965356349945, "learning_rate": 5.715704603879411e-06, "loss": 0.0839, "step": 173325 }, { "epoch": 2.4859461939998853, "grad_norm": 5.699676990509033, "learning_rate": 5.7117210903142195e-06, "loss": 0.1328, "step": 173350 }, { "epoch": 2.486304709459072, "grad_norm": 13.513569831848145, "learning_rate": 5.707737576749028e-06, "loss": 0.1059, "step": 173375 }, { "epoch": 2.4866632249182583, "grad_norm": 0.16492336988449097, "learning_rate": 5.703754063183837e-06, "loss": 0.0853, "step": 173400 }, { "epoch": 2.487021740377445, "grad_norm": 9.081806182861328, "learning_rate": 5.699770549618646e-06, "loss": 0.0954, "step": 173425 }, { "epoch": 2.487380255836632, "grad_norm": 1.8080925941467285, "learning_rate": 5.695787036053454e-06, "loss": 0.0521, "step": 173450 }, { "epoch": 2.487738771295818, "grad_norm": 16.259496688842773, "learning_rate": 5.691803522488261e-06, "loss": 0.0708, "step": 173475 }, { "epoch": 2.488097286755005, "grad_norm": 0.6863794326782227, "learning_rate": 5.68782000892307e-06, "loss": 0.1199, "step": 173500 }, { "epoch": 2.4884558022141916, "grad_norm": 2.670236110687256, "learning_rate": 5.683836495357879e-06, "loss": 0.084, "step": 173525 }, { "epoch": 2.488814317673378, "grad_norm": 0.7264401316642761, "learning_rate": 5.679852981792687e-06, "loss": 0.1111, "step": 173550 }, { "epoch": 2.4891728331325647, "grad_norm": 0.046132929623126984, "learning_rate": 5.675869468227496e-06, "loss": 0.0643, "step": 173575 }, { "epoch": 2.4895313485917514, "grad_norm": 7.30854606628418, "learning_rate": 5.671885954662305e-06, "loss": 0.0813, "step": 173600 }, { "epoch": 2.4898898640509377, "grad_norm": 1.0481752157211304, "learning_rate": 5.667902441097113e-06, "loss": 0.1479, "step": 173625 }, { "epoch": 2.4902483795101245, "grad_norm": 5.045969009399414, "learning_rate": 5.663918927531921e-06, "loss": 0.096, "step": 173650 }, { "epoch": 2.490606894969311, "grad_norm": 0.5876823663711548, "learning_rate": 5.65993541396673e-06, "loss": 0.0601, "step": 173675 }, { "epoch": 2.4909654104284975, "grad_norm": 7.747250080108643, "learning_rate": 5.655951900401539e-06, "loss": 0.0671, "step": 173700 }, { "epoch": 2.4913239258876843, "grad_norm": 0.41747361421585083, "learning_rate": 5.651968386836347e-06, "loss": 0.0468, "step": 173725 }, { "epoch": 2.491682441346871, "grad_norm": 0.5980417132377625, "learning_rate": 5.647984873271155e-06, "loss": 0.1556, "step": 173750 }, { "epoch": 2.4920409568060573, "grad_norm": 2.045254945755005, "learning_rate": 5.644001359705963e-06, "loss": 0.0767, "step": 173775 }, { "epoch": 2.492399472265244, "grad_norm": 2.388930082321167, "learning_rate": 5.640017846140772e-06, "loss": 0.0508, "step": 173800 }, { "epoch": 2.492757987724431, "grad_norm": 2.760558605194092, "learning_rate": 5.63603433257558e-06, "loss": 0.0469, "step": 173825 }, { "epoch": 2.493116503183617, "grad_norm": 17.89365005493164, "learning_rate": 5.632050819010389e-06, "loss": 0.0726, "step": 173850 }, { "epoch": 2.493475018642804, "grad_norm": 6.256297588348389, "learning_rate": 5.628067305445198e-06, "loss": 0.0695, "step": 173875 }, { "epoch": 2.4938335341019906, "grad_norm": 4.561445713043213, "learning_rate": 5.6240837918800065e-06, "loss": 0.0751, "step": 173900 }, { "epoch": 2.494192049561177, "grad_norm": 18.167049407958984, "learning_rate": 5.620100278314814e-06, "loss": 0.0697, "step": 173925 }, { "epoch": 2.4945505650203637, "grad_norm": 16.664953231811523, "learning_rate": 5.616116764749623e-06, "loss": 0.0827, "step": 173950 }, { "epoch": 2.4949090804795504, "grad_norm": 6.63193941116333, "learning_rate": 5.612133251184432e-06, "loss": 0.1716, "step": 173975 }, { "epoch": 2.4952675959387367, "grad_norm": 0.26837748289108276, "learning_rate": 5.60814973761924e-06, "loss": 0.09, "step": 174000 }, { "epoch": 2.4956261113979235, "grad_norm": 2.2566938400268555, "learning_rate": 5.604166224054048e-06, "loss": 0.0777, "step": 174025 }, { "epoch": 2.49598462685711, "grad_norm": 2.156062364578247, "learning_rate": 5.600182710488857e-06, "loss": 0.0866, "step": 174050 }, { "epoch": 2.4963431423162965, "grad_norm": 0.6712581515312195, "learning_rate": 5.596199196923665e-06, "loss": 0.1198, "step": 174075 }, { "epoch": 2.4967016577754833, "grad_norm": 0.04583769664168358, "learning_rate": 5.5922156833584735e-06, "loss": 0.1748, "step": 174100 }, { "epoch": 2.49706017323467, "grad_norm": 18.622827529907227, "learning_rate": 5.588232169793282e-06, "loss": 0.1063, "step": 174125 }, { "epoch": 2.4974186886938563, "grad_norm": 0.10718676447868347, "learning_rate": 5.584248656228091e-06, "loss": 0.0809, "step": 174150 }, { "epoch": 2.497777204153043, "grad_norm": 13.933679580688477, "learning_rate": 5.5802651426628996e-06, "loss": 0.0747, "step": 174175 }, { "epoch": 2.49813571961223, "grad_norm": 3.0492422580718994, "learning_rate": 5.576281629097708e-06, "loss": 0.0563, "step": 174200 }, { "epoch": 2.498494235071416, "grad_norm": 19.837217330932617, "learning_rate": 5.572298115532516e-06, "loss": 0.0678, "step": 174225 }, { "epoch": 2.498852750530603, "grad_norm": 3.7650909423828125, "learning_rate": 5.568314601967325e-06, "loss": 0.1276, "step": 174250 }, { "epoch": 2.4992112659897896, "grad_norm": 0.35921812057495117, "learning_rate": 5.5643310884021335e-06, "loss": 0.0748, "step": 174275 }, { "epoch": 2.499569781448976, "grad_norm": 18.66141128540039, "learning_rate": 5.560347574836941e-06, "loss": 0.1132, "step": 174300 }, { "epoch": 2.4999282969081627, "grad_norm": 1.0000810623168945, "learning_rate": 5.55636406127175e-06, "loss": 0.096, "step": 174325 }, { "epoch": 2.5002868123673494, "grad_norm": 0.766838550567627, "learning_rate": 5.552380547706559e-06, "loss": 0.059, "step": 174350 }, { "epoch": 2.500645327826536, "grad_norm": 1.8535336256027222, "learning_rate": 5.5483970341413666e-06, "loss": 0.0944, "step": 174375 }, { "epoch": 2.5010038432857224, "grad_norm": 0.06753343343734741, "learning_rate": 5.544413520576175e-06, "loss": 0.1522, "step": 174400 }, { "epoch": 2.501362358744909, "grad_norm": 1.361823320388794, "learning_rate": 5.540430007010984e-06, "loss": 0.1035, "step": 174425 }, { "epoch": 2.5017208742040955, "grad_norm": 1.4260872602462769, "learning_rate": 5.536446493445793e-06, "loss": 0.1097, "step": 174450 }, { "epoch": 2.5020793896632822, "grad_norm": 2.256281852722168, "learning_rate": 5.532462979880601e-06, "loss": 0.06, "step": 174475 }, { "epoch": 2.502437905122469, "grad_norm": 3.3002164363861084, "learning_rate": 5.52847946631541e-06, "loss": 0.1655, "step": 174500 }, { "epoch": 2.5027964205816557, "grad_norm": 1.6457037925720215, "learning_rate": 5.524495952750218e-06, "loss": 0.123, "step": 174525 }, { "epoch": 2.503154936040842, "grad_norm": 1.1107935905456543, "learning_rate": 5.5205124391850266e-06, "loss": 0.1141, "step": 174550 }, { "epoch": 2.503513451500029, "grad_norm": 0.057620320469141006, "learning_rate": 5.516528925619834e-06, "loss": 0.1333, "step": 174575 }, { "epoch": 2.503871966959215, "grad_norm": 0.48684051632881165, "learning_rate": 5.512545412054643e-06, "loss": 0.1155, "step": 174600 }, { "epoch": 2.504230482418402, "grad_norm": 12.984940528869629, "learning_rate": 5.508561898489452e-06, "loss": 0.1028, "step": 174625 }, { "epoch": 2.5045889978775886, "grad_norm": 0.18127231299877167, "learning_rate": 5.5045783849242605e-06, "loss": 0.0695, "step": 174650 }, { "epoch": 2.5049475133367753, "grad_norm": 1.314787745475769, "learning_rate": 5.500594871359068e-06, "loss": 0.0698, "step": 174675 }, { "epoch": 2.5053060287959616, "grad_norm": 1.6230723857879639, "learning_rate": 5.496611357793877e-06, "loss": 0.0679, "step": 174700 }, { "epoch": 2.5056645442551484, "grad_norm": 0.2074616402387619, "learning_rate": 5.492627844228686e-06, "loss": 0.176, "step": 174725 }, { "epoch": 2.5060230597143347, "grad_norm": 23.59209632873535, "learning_rate": 5.488644330663494e-06, "loss": 0.1615, "step": 174750 }, { "epoch": 2.5063815751735214, "grad_norm": 11.159384727478027, "learning_rate": 5.484660817098303e-06, "loss": 0.0848, "step": 174775 }, { "epoch": 2.506740090632708, "grad_norm": 1.0821428298950195, "learning_rate": 5.480677303533112e-06, "loss": 0.1246, "step": 174800 }, { "epoch": 2.507098606091895, "grad_norm": 7.009448528289795, "learning_rate": 5.47669378996792e-06, "loss": 0.0675, "step": 174825 }, { "epoch": 2.5074571215510812, "grad_norm": 1.9025847911834717, "learning_rate": 5.4727102764027275e-06, "loss": 0.1085, "step": 174850 }, { "epoch": 2.507815637010268, "grad_norm": 3.4133212566375732, "learning_rate": 5.468726762837536e-06, "loss": 0.0838, "step": 174875 }, { "epoch": 2.5081741524694543, "grad_norm": 1.4218744039535522, "learning_rate": 5.464743249272345e-06, "loss": 0.193, "step": 174900 }, { "epoch": 2.508532667928641, "grad_norm": 0.06066180765628815, "learning_rate": 5.4607597357071535e-06, "loss": 0.0336, "step": 174925 }, { "epoch": 2.508891183387828, "grad_norm": 0.6945451498031616, "learning_rate": 5.456776222141962e-06, "loss": 0.0918, "step": 174950 }, { "epoch": 2.5092496988470145, "grad_norm": 2.1263699531555176, "learning_rate": 5.45279270857677e-06, "loss": 0.0758, "step": 174975 }, { "epoch": 2.509608214306201, "grad_norm": 0.5960349440574646, "learning_rate": 5.448809195011579e-06, "loss": 0.0673, "step": 175000 }, { "epoch": 2.5099667297653876, "grad_norm": 11.940839767456055, "learning_rate": 5.4448256814463875e-06, "loss": 0.1185, "step": 175025 }, { "epoch": 2.510325245224574, "grad_norm": 9.496074676513672, "learning_rate": 5.440842167881196e-06, "loss": 0.1966, "step": 175050 }, { "epoch": 2.5106837606837606, "grad_norm": 11.426706314086914, "learning_rate": 5.436858654316005e-06, "loss": 0.1085, "step": 175075 }, { "epoch": 2.5110422761429474, "grad_norm": 1.0632522106170654, "learning_rate": 5.4328751407508135e-06, "loss": 0.1416, "step": 175100 }, { "epoch": 2.511400791602134, "grad_norm": 8.727001190185547, "learning_rate": 5.4288916271856205e-06, "loss": 0.0975, "step": 175125 }, { "epoch": 2.5117593070613204, "grad_norm": 2.3731343746185303, "learning_rate": 5.424908113620429e-06, "loss": 0.0748, "step": 175150 }, { "epoch": 2.512117822520507, "grad_norm": 0.37094587087631226, "learning_rate": 5.420924600055238e-06, "loss": 0.1436, "step": 175175 }, { "epoch": 2.5124763379796935, "grad_norm": 12.002881050109863, "learning_rate": 5.416941086490047e-06, "loss": 0.0489, "step": 175200 }, { "epoch": 2.5128348534388802, "grad_norm": 1.4069160223007202, "learning_rate": 5.412957572924855e-06, "loss": 0.0549, "step": 175225 }, { "epoch": 2.513193368898067, "grad_norm": 8.331072807312012, "learning_rate": 5.408974059359664e-06, "loss": 0.1209, "step": 175250 }, { "epoch": 2.5135518843572537, "grad_norm": 0.518610954284668, "learning_rate": 5.404990545794472e-06, "loss": 0.0902, "step": 175275 }, { "epoch": 2.51391039981644, "grad_norm": 0.7473925352096558, "learning_rate": 5.4010070322292805e-06, "loss": 0.0619, "step": 175300 }, { "epoch": 2.514268915275627, "grad_norm": 2.0426042079925537, "learning_rate": 5.397023518664089e-06, "loss": 0.0452, "step": 175325 }, { "epoch": 2.514627430734813, "grad_norm": 5.714893341064453, "learning_rate": 5.393040005098898e-06, "loss": 0.0927, "step": 175350 }, { "epoch": 2.514985946194, "grad_norm": 0.11025633662939072, "learning_rate": 5.389056491533707e-06, "loss": 0.0635, "step": 175375 }, { "epoch": 2.5153444616531866, "grad_norm": 3.3060476779937744, "learning_rate": 5.3850729779685144e-06, "loss": 0.051, "step": 175400 }, { "epoch": 2.5157029771123733, "grad_norm": 9.35588550567627, "learning_rate": 5.381089464403322e-06, "loss": 0.0664, "step": 175425 }, { "epoch": 2.5160614925715596, "grad_norm": 8.006032943725586, "learning_rate": 5.377105950838131e-06, "loss": 0.0991, "step": 175450 }, { "epoch": 2.5164200080307464, "grad_norm": 1.1079837083816528, "learning_rate": 5.37312243727294e-06, "loss": 0.0503, "step": 175475 }, { "epoch": 2.5167785234899327, "grad_norm": 0.13725700974464417, "learning_rate": 5.369138923707748e-06, "loss": 0.065, "step": 175500 }, { "epoch": 2.5171370389491194, "grad_norm": 2.2408554553985596, "learning_rate": 5.365155410142557e-06, "loss": 0.0805, "step": 175525 }, { "epoch": 2.517495554408306, "grad_norm": 8.794536590576172, "learning_rate": 5.361171896577366e-06, "loss": 0.0633, "step": 175550 }, { "epoch": 2.517854069867493, "grad_norm": 0.8084396123886108, "learning_rate": 5.357188383012174e-06, "loss": 0.1697, "step": 175575 }, { "epoch": 2.5182125853266792, "grad_norm": 0.8235390186309814, "learning_rate": 5.353204869446982e-06, "loss": 0.0285, "step": 175600 }, { "epoch": 2.518571100785866, "grad_norm": 16.291366577148438, "learning_rate": 5.349221355881791e-06, "loss": 0.059, "step": 175625 }, { "epoch": 2.5189296162450523, "grad_norm": 9.580750465393066, "learning_rate": 5.3452378423166e-06, "loss": 0.0902, "step": 175650 }, { "epoch": 2.519288131704239, "grad_norm": 2.0438859462738037, "learning_rate": 5.3412543287514075e-06, "loss": 0.0276, "step": 175675 }, { "epoch": 2.519646647163426, "grad_norm": 8.169560432434082, "learning_rate": 5.337270815186216e-06, "loss": 0.0623, "step": 175700 }, { "epoch": 2.5200051626226125, "grad_norm": 8.058489799499512, "learning_rate": 5.333287301621024e-06, "loss": 0.1264, "step": 175725 }, { "epoch": 2.520363678081799, "grad_norm": 0.2213772088289261, "learning_rate": 5.329303788055833e-06, "loss": 0.0749, "step": 175750 }, { "epoch": 2.5207221935409856, "grad_norm": 4.227438926696777, "learning_rate": 5.3253202744906414e-06, "loss": 0.1694, "step": 175775 }, { "epoch": 2.521080709000172, "grad_norm": 0.024979518726468086, "learning_rate": 5.32133676092545e-06, "loss": 0.0972, "step": 175800 }, { "epoch": 2.5214392244593586, "grad_norm": 1.4662278890609741, "learning_rate": 5.317353247360259e-06, "loss": 0.0441, "step": 175825 }, { "epoch": 2.5217977399185454, "grad_norm": 12.53328800201416, "learning_rate": 5.3133697337950675e-06, "loss": 0.1263, "step": 175850 }, { "epoch": 2.522156255377732, "grad_norm": 6.850978851318359, "learning_rate": 5.309386220229875e-06, "loss": 0.11, "step": 175875 }, { "epoch": 2.5225147708369184, "grad_norm": 18.596603393554688, "learning_rate": 5.305402706664684e-06, "loss": 0.0959, "step": 175900 }, { "epoch": 2.522873286296105, "grad_norm": 0.2189481258392334, "learning_rate": 5.301419193099493e-06, "loss": 0.1553, "step": 175925 }, { "epoch": 2.5232318017552915, "grad_norm": 1.9324496984481812, "learning_rate": 5.297435679534301e-06, "loss": 0.0661, "step": 175950 }, { "epoch": 2.5235903172144782, "grad_norm": 0.1179267019033432, "learning_rate": 5.293452165969109e-06, "loss": 0.1237, "step": 175975 }, { "epoch": 2.523948832673665, "grad_norm": 3.562329053878784, "learning_rate": 5.289468652403918e-06, "loss": 0.0504, "step": 176000 }, { "epoch": 2.5243073481328517, "grad_norm": 0.49964380264282227, "learning_rate": 5.285485138838726e-06, "loss": 0.0493, "step": 176025 }, { "epoch": 2.524665863592038, "grad_norm": 10.302499771118164, "learning_rate": 5.2815016252735345e-06, "loss": 0.0741, "step": 176050 }, { "epoch": 2.525024379051225, "grad_norm": 3.4666926860809326, "learning_rate": 5.277518111708343e-06, "loss": 0.0446, "step": 176075 }, { "epoch": 2.525382894510411, "grad_norm": 1.7805771827697754, "learning_rate": 5.273534598143152e-06, "loss": 0.1739, "step": 176100 }, { "epoch": 2.525741409969598, "grad_norm": 0.3656749725341797, "learning_rate": 5.2695510845779606e-06, "loss": 0.0849, "step": 176125 }, { "epoch": 2.5260999254287846, "grad_norm": 21.315235137939453, "learning_rate": 5.265567571012769e-06, "loss": 0.1252, "step": 176150 }, { "epoch": 2.5264584408879713, "grad_norm": 0.905637264251709, "learning_rate": 5.261584057447577e-06, "loss": 0.0639, "step": 176175 }, { "epoch": 2.5268169563471576, "grad_norm": 10.613520622253418, "learning_rate": 5.257600543882386e-06, "loss": 0.0454, "step": 176200 }, { "epoch": 2.5271754718063444, "grad_norm": 0.20229969918727875, "learning_rate": 5.253617030317194e-06, "loss": 0.1369, "step": 176225 }, { "epoch": 2.5275339872655307, "grad_norm": 0.7948322892189026, "learning_rate": 5.249633516752002e-06, "loss": 0.0984, "step": 176250 }, { "epoch": 2.5278925027247174, "grad_norm": 0.013647498562932014, "learning_rate": 5.245650003186811e-06, "loss": 0.0471, "step": 176275 }, { "epoch": 2.528251018183904, "grad_norm": 4.270410537719727, "learning_rate": 5.24166648962162e-06, "loss": 0.0729, "step": 176300 }, { "epoch": 2.528609533643091, "grad_norm": 0.4190516471862793, "learning_rate": 5.2376829760564276e-06, "loss": 0.0917, "step": 176325 }, { "epoch": 2.5289680491022772, "grad_norm": 2.0442044734954834, "learning_rate": 5.233699462491236e-06, "loss": 0.0434, "step": 176350 }, { "epoch": 2.529326564561464, "grad_norm": 0.2840346395969391, "learning_rate": 5.229715948926045e-06, "loss": 0.0961, "step": 176375 }, { "epoch": 2.5296850800206503, "grad_norm": 8.365043640136719, "learning_rate": 5.225732435360854e-06, "loss": 0.0524, "step": 176400 }, { "epoch": 2.530043595479837, "grad_norm": 5.179593086242676, "learning_rate": 5.221748921795662e-06, "loss": 0.1481, "step": 176425 }, { "epoch": 2.530402110939024, "grad_norm": 1.5748025178909302, "learning_rate": 5.21776540823047e-06, "loss": 0.0652, "step": 176450 }, { "epoch": 2.5307606263982105, "grad_norm": 0.43726280331611633, "learning_rate": 5.213781894665279e-06, "loss": 0.0831, "step": 176475 }, { "epoch": 2.531119141857397, "grad_norm": 2.036684989929199, "learning_rate": 5.209798381100087e-06, "loss": 0.1122, "step": 176500 }, { "epoch": 2.5314776573165836, "grad_norm": 0.15879233181476593, "learning_rate": 5.205814867534895e-06, "loss": 0.0761, "step": 176525 }, { "epoch": 2.53183617277577, "grad_norm": 1.035110354423523, "learning_rate": 5.201831353969704e-06, "loss": 0.052, "step": 176550 }, { "epoch": 2.5321946882349566, "grad_norm": 0.2924593687057495, "learning_rate": 5.197847840404513e-06, "loss": 0.0717, "step": 176575 }, { "epoch": 2.5325532036941434, "grad_norm": 0.18025416135787964, "learning_rate": 5.193864326839321e-06, "loss": 0.0871, "step": 176600 }, { "epoch": 2.53291171915333, "grad_norm": 2.385791063308716, "learning_rate": 5.189880813274129e-06, "loss": 0.0943, "step": 176625 }, { "epoch": 2.5332702346125164, "grad_norm": 3.1062357425689697, "learning_rate": 5.185897299708938e-06, "loss": 0.0847, "step": 176650 }, { "epoch": 2.533628750071703, "grad_norm": 4.5450639724731445, "learning_rate": 5.181913786143747e-06, "loss": 0.0704, "step": 176675 }, { "epoch": 2.5339872655308895, "grad_norm": 0.29347777366638184, "learning_rate": 5.177930272578555e-06, "loss": 0.1076, "step": 176700 }, { "epoch": 2.5343457809900762, "grad_norm": 3.076463222503662, "learning_rate": 5.173946759013364e-06, "loss": 0.0395, "step": 176725 }, { "epoch": 2.534704296449263, "grad_norm": 0.08161405473947525, "learning_rate": 5.169963245448172e-06, "loss": 0.1131, "step": 176750 }, { "epoch": 2.5350628119084497, "grad_norm": 0.06614721566438675, "learning_rate": 5.16597973188298e-06, "loss": 0.1921, "step": 176775 }, { "epoch": 2.535421327367636, "grad_norm": 2.410931348800659, "learning_rate": 5.1619962183177885e-06, "loss": 0.1225, "step": 176800 }, { "epoch": 2.535779842826823, "grad_norm": 0.10317426174879074, "learning_rate": 5.158012704752597e-06, "loss": 0.05, "step": 176825 }, { "epoch": 2.536138358286009, "grad_norm": 9.084389686584473, "learning_rate": 5.154029191187406e-06, "loss": 0.0234, "step": 176850 }, { "epoch": 2.536496873745196, "grad_norm": 10.766678810119629, "learning_rate": 5.1500456776222145e-06, "loss": 0.0915, "step": 176875 }, { "epoch": 2.5368553892043826, "grad_norm": 1.1039515733718872, "learning_rate": 5.146062164057022e-06, "loss": 0.0476, "step": 176900 }, { "epoch": 2.5372139046635693, "grad_norm": 0.1431906372308731, "learning_rate": 5.142078650491831e-06, "loss": 0.0643, "step": 176925 }, { "epoch": 2.5375724201227556, "grad_norm": 0.6951930522918701, "learning_rate": 5.13809513692664e-06, "loss": 0.1347, "step": 176950 }, { "epoch": 2.5379309355819424, "grad_norm": 4.546407699584961, "learning_rate": 5.1341116233614485e-06, "loss": 0.0969, "step": 176975 }, { "epoch": 2.5382894510411287, "grad_norm": 6.97310209274292, "learning_rate": 5.130128109796257e-06, "loss": 0.1213, "step": 177000 }, { "epoch": 2.5386479665003154, "grad_norm": 0.448477566242218, "learning_rate": 5.126144596231066e-06, "loss": 0.1516, "step": 177025 }, { "epoch": 2.539006481959502, "grad_norm": 0.19988521933555603, "learning_rate": 5.122161082665873e-06, "loss": 0.1412, "step": 177050 }, { "epoch": 2.539364997418689, "grad_norm": 0.0981219932436943, "learning_rate": 5.1181775691006815e-06, "loss": 0.059, "step": 177075 }, { "epoch": 2.5397235128778752, "grad_norm": 16.210573196411133, "learning_rate": 5.11419405553549e-06, "loss": 0.1237, "step": 177100 }, { "epoch": 2.540082028337062, "grad_norm": 2.5742897987365723, "learning_rate": 5.110210541970299e-06, "loss": 0.0992, "step": 177125 }, { "epoch": 2.5404405437962483, "grad_norm": 1.113258719444275, "learning_rate": 5.106227028405108e-06, "loss": 0.0563, "step": 177150 }, { "epoch": 2.540799059255435, "grad_norm": 1.3298474550247192, "learning_rate": 5.102243514839916e-06, "loss": 0.0669, "step": 177175 }, { "epoch": 2.541157574714622, "grad_norm": 7.997306823730469, "learning_rate": 5.098260001274724e-06, "loss": 0.059, "step": 177200 }, { "epoch": 2.5415160901738085, "grad_norm": 3.121074914932251, "learning_rate": 5.094276487709533e-06, "loss": 0.0774, "step": 177225 }, { "epoch": 2.541874605632995, "grad_norm": 30.03030014038086, "learning_rate": 5.0902929741443415e-06, "loss": 0.1394, "step": 177250 }, { "epoch": 2.5422331210921816, "grad_norm": 0.12097904831171036, "learning_rate": 5.08630946057915e-06, "loss": 0.0805, "step": 177275 }, { "epoch": 2.542591636551368, "grad_norm": 11.298829078674316, "learning_rate": 5.082325947013959e-06, "loss": 0.0759, "step": 177300 }, { "epoch": 2.5429501520105546, "grad_norm": 19.292617797851562, "learning_rate": 5.078342433448767e-06, "loss": 0.1016, "step": 177325 }, { "epoch": 2.5433086674697414, "grad_norm": 0.6684668064117432, "learning_rate": 5.074358919883575e-06, "loss": 0.0795, "step": 177350 }, { "epoch": 2.543667182928928, "grad_norm": 8.181611061096191, "learning_rate": 5.070375406318383e-06, "loss": 0.0851, "step": 177375 }, { "epoch": 2.5440256983881144, "grad_norm": 0.4825032651424408, "learning_rate": 5.066391892753192e-06, "loss": 0.1097, "step": 177400 }, { "epoch": 2.544384213847301, "grad_norm": 1.9405016899108887, "learning_rate": 5.062408379188001e-06, "loss": 0.072, "step": 177425 }, { "epoch": 2.5447427293064875, "grad_norm": 0.2080073058605194, "learning_rate": 5.058424865622809e-06, "loss": 0.1164, "step": 177450 }, { "epoch": 2.5451012447656742, "grad_norm": 2.4464755058288574, "learning_rate": 5.054441352057618e-06, "loss": 0.0889, "step": 177475 }, { "epoch": 2.545459760224861, "grad_norm": 0.2857300937175751, "learning_rate": 5.050457838492426e-06, "loss": 0.0929, "step": 177500 }, { "epoch": 2.5458182756840477, "grad_norm": 0.07952488213777542, "learning_rate": 5.046474324927235e-06, "loss": 0.1267, "step": 177525 }, { "epoch": 2.546176791143234, "grad_norm": 2.5604097843170166, "learning_rate": 5.042490811362043e-06, "loss": 0.1284, "step": 177550 }, { "epoch": 2.546535306602421, "grad_norm": 1.460209846496582, "learning_rate": 5.038507297796852e-06, "loss": 0.0706, "step": 177575 }, { "epoch": 2.546893822061607, "grad_norm": 1.4680932760238647, "learning_rate": 5.03452378423166e-06, "loss": 0.1448, "step": 177600 }, { "epoch": 2.547252337520794, "grad_norm": 0.7725803852081299, "learning_rate": 5.0305402706664685e-06, "loss": 0.0396, "step": 177625 }, { "epoch": 2.5476108529799806, "grad_norm": 17.270906448364258, "learning_rate": 5.026556757101276e-06, "loss": 0.061, "step": 177650 }, { "epoch": 2.5479693684391673, "grad_norm": 0.39131397008895874, "learning_rate": 5.022573243536085e-06, "loss": 0.0563, "step": 177675 }, { "epoch": 2.5483278838983536, "grad_norm": 5.62014627456665, "learning_rate": 5.018589729970894e-06, "loss": 0.1084, "step": 177700 }, { "epoch": 2.5486863993575404, "grad_norm": 0.1864752620458603, "learning_rate": 5.0146062164057024e-06, "loss": 0.0695, "step": 177725 }, { "epoch": 2.5490449148167267, "grad_norm": 9.786015510559082, "learning_rate": 5.010622702840511e-06, "loss": 0.0601, "step": 177750 }, { "epoch": 2.5494034302759134, "grad_norm": 0.42768487334251404, "learning_rate": 5.00663918927532e-06, "loss": 0.0795, "step": 177775 }, { "epoch": 2.5497619457351, "grad_norm": 1.195088267326355, "learning_rate": 5.002655675710128e-06, "loss": 0.1131, "step": 177800 }, { "epoch": 2.550120461194287, "grad_norm": 0.8328823447227478, "learning_rate": 4.998672162144936e-06, "loss": 0.0915, "step": 177825 }, { "epoch": 2.5504789766534732, "grad_norm": 2.4814319610595703, "learning_rate": 4.994688648579745e-06, "loss": 0.0558, "step": 177850 }, { "epoch": 2.55083749211266, "grad_norm": 10.64865493774414, "learning_rate": 4.990705135014553e-06, "loss": 0.1038, "step": 177875 }, { "epoch": 2.5511960075718463, "grad_norm": 0.15930293500423431, "learning_rate": 4.986721621449362e-06, "loss": 0.0585, "step": 177900 }, { "epoch": 2.551554523031033, "grad_norm": 3.2713868618011475, "learning_rate": 4.98273810788417e-06, "loss": 0.0907, "step": 177925 }, { "epoch": 2.5519130384902198, "grad_norm": 2.946589946746826, "learning_rate": 4.978754594318978e-06, "loss": 0.0833, "step": 177950 }, { "epoch": 2.5522715539494065, "grad_norm": 3.0168631076812744, "learning_rate": 4.974771080753787e-06, "loss": 0.0385, "step": 177975 }, { "epoch": 2.552630069408593, "grad_norm": 0.2452031373977661, "learning_rate": 4.9707875671885955e-06, "loss": 0.1601, "step": 178000 }, { "epoch": 2.5529885848677796, "grad_norm": 3.627562999725342, "learning_rate": 4.966804053623404e-06, "loss": 0.0594, "step": 178025 }, { "epoch": 2.553347100326966, "grad_norm": 2.8407816886901855, "learning_rate": 4.962820540058213e-06, "loss": 0.1106, "step": 178050 }, { "epoch": 2.5537056157861526, "grad_norm": 16.36674690246582, "learning_rate": 4.958837026493022e-06, "loss": 0.1503, "step": 178075 }, { "epoch": 2.5540641312453394, "grad_norm": 2.075634002685547, "learning_rate": 4.9548535129278294e-06, "loss": 0.0673, "step": 178100 }, { "epoch": 2.554422646704526, "grad_norm": 0.4607323110103607, "learning_rate": 4.950869999362638e-06, "loss": 0.102, "step": 178125 }, { "epoch": 2.5547811621637124, "grad_norm": 3.130474090576172, "learning_rate": 4.946886485797446e-06, "loss": 0.0393, "step": 178150 }, { "epoch": 2.555139677622899, "grad_norm": 5.467368125915527, "learning_rate": 4.942902972232255e-06, "loss": 0.0307, "step": 178175 }, { "epoch": 2.5554981930820855, "grad_norm": 1.1159300804138184, "learning_rate": 4.938919458667063e-06, "loss": 0.0757, "step": 178200 }, { "epoch": 2.5558567085412722, "grad_norm": 0.7605820894241333, "learning_rate": 4.934935945101872e-06, "loss": 0.0776, "step": 178225 }, { "epoch": 2.556215224000459, "grad_norm": 6.3964009284973145, "learning_rate": 4.93095243153668e-06, "loss": 0.1025, "step": 178250 }, { "epoch": 2.5565737394596457, "grad_norm": 0.16663338243961334, "learning_rate": 4.9269689179714886e-06, "loss": 0.1038, "step": 178275 }, { "epoch": 2.556932254918832, "grad_norm": 3.391505241394043, "learning_rate": 4.922985404406297e-06, "loss": 0.1259, "step": 178300 }, { "epoch": 2.5572907703780188, "grad_norm": 3.3412320613861084, "learning_rate": 4.919001890841106e-06, "loss": 0.0695, "step": 178325 }, { "epoch": 2.557649285837205, "grad_norm": 0.12944482266902924, "learning_rate": 4.915018377275915e-06, "loss": 0.0742, "step": 178350 }, { "epoch": 2.558007801296392, "grad_norm": 3.081124782562256, "learning_rate": 4.911034863710723e-06, "loss": 0.0902, "step": 178375 }, { "epoch": 2.5583663167555786, "grad_norm": 8.263640403747559, "learning_rate": 4.907051350145531e-06, "loss": 0.1173, "step": 178400 }, { "epoch": 2.5587248322147653, "grad_norm": 5.029740810394287, "learning_rate": 4.903067836580339e-06, "loss": 0.0525, "step": 178425 }, { "epoch": 2.5590833476739516, "grad_norm": 1.3097469806671143, "learning_rate": 4.899084323015148e-06, "loss": 0.0469, "step": 178450 }, { "epoch": 2.5594418631331384, "grad_norm": 7.476580619812012, "learning_rate": 4.895100809449956e-06, "loss": 0.0519, "step": 178475 }, { "epoch": 2.5598003785923247, "grad_norm": 0.2122541069984436, "learning_rate": 4.891117295884765e-06, "loss": 0.0538, "step": 178500 }, { "epoch": 2.5601588940515114, "grad_norm": 1.5762890577316284, "learning_rate": 4.887133782319574e-06, "loss": 0.0792, "step": 178525 }, { "epoch": 2.560517409510698, "grad_norm": 6.830185413360596, "learning_rate": 4.883150268754382e-06, "loss": 0.0473, "step": 178550 }, { "epoch": 2.560875924969885, "grad_norm": 18.665231704711914, "learning_rate": 4.87916675518919e-06, "loss": 0.0879, "step": 178575 }, { "epoch": 2.5612344404290712, "grad_norm": 1.4837934970855713, "learning_rate": 4.875183241623999e-06, "loss": 0.1836, "step": 178600 }, { "epoch": 2.561592955888258, "grad_norm": 1.0027785301208496, "learning_rate": 4.871199728058808e-06, "loss": 0.1017, "step": 178625 }, { "epoch": 2.5619514713474443, "grad_norm": 0.360519140958786, "learning_rate": 4.867216214493616e-06, "loss": 0.0618, "step": 178650 }, { "epoch": 2.562309986806631, "grad_norm": 0.04157320037484169, "learning_rate": 4.863232700928425e-06, "loss": 0.0742, "step": 178675 }, { "epoch": 2.5626685022658178, "grad_norm": 1.2680809497833252, "learning_rate": 4.859249187363232e-06, "loss": 0.1246, "step": 178700 }, { "epoch": 2.5630270177250045, "grad_norm": 0.16128486394882202, "learning_rate": 4.855265673798041e-06, "loss": 0.0375, "step": 178725 }, { "epoch": 2.563385533184191, "grad_norm": 0.9523682594299316, "learning_rate": 4.8512821602328495e-06, "loss": 0.0642, "step": 178750 }, { "epoch": 2.5637440486433776, "grad_norm": 3.925168752670288, "learning_rate": 4.847298646667658e-06, "loss": 0.0761, "step": 178775 }, { "epoch": 2.564102564102564, "grad_norm": 9.473267555236816, "learning_rate": 4.843315133102467e-06, "loss": 0.0538, "step": 178800 }, { "epoch": 2.5644610795617506, "grad_norm": 13.451887130737305, "learning_rate": 4.8393316195372756e-06, "loss": 0.0723, "step": 178825 }, { "epoch": 2.5648195950209374, "grad_norm": 1.436693787574768, "learning_rate": 4.835348105972083e-06, "loss": 0.0499, "step": 178850 }, { "epoch": 2.565178110480124, "grad_norm": 3.1107430458068848, "learning_rate": 4.831364592406892e-06, "loss": 0.037, "step": 178875 }, { "epoch": 2.5655366259393104, "grad_norm": 2.60921573638916, "learning_rate": 4.827381078841701e-06, "loss": 0.0752, "step": 178900 }, { "epoch": 2.565895141398497, "grad_norm": 2.708306074142456, "learning_rate": 4.8233975652765095e-06, "loss": 0.0576, "step": 178925 }, { "epoch": 2.5662536568576835, "grad_norm": 8.190834999084473, "learning_rate": 4.819414051711318e-06, "loss": 0.0876, "step": 178950 }, { "epoch": 2.56661217231687, "grad_norm": 7.527346611022949, "learning_rate": 4.815430538146126e-06, "loss": 0.0975, "step": 178975 }, { "epoch": 2.566970687776057, "grad_norm": 11.837692260742188, "learning_rate": 4.811447024580934e-06, "loss": 0.1031, "step": 179000 }, { "epoch": 2.5673292032352437, "grad_norm": 14.63018798828125, "learning_rate": 4.8074635110157425e-06, "loss": 0.1566, "step": 179025 }, { "epoch": 2.56768771869443, "grad_norm": 0.5335046052932739, "learning_rate": 4.803479997450551e-06, "loss": 0.0811, "step": 179050 }, { "epoch": 2.5680462341536168, "grad_norm": 9.26952075958252, "learning_rate": 4.79949648388536e-06, "loss": 0.0774, "step": 179075 }, { "epoch": 2.568404749612803, "grad_norm": 1.2410697937011719, "learning_rate": 4.795512970320169e-06, "loss": 0.1369, "step": 179100 }, { "epoch": 2.56876326507199, "grad_norm": 16.807538986206055, "learning_rate": 4.791529456754977e-06, "loss": 0.0957, "step": 179125 }, { "epoch": 2.5691217805311766, "grad_norm": 14.20601749420166, "learning_rate": 4.787545943189785e-06, "loss": 0.078, "step": 179150 }, { "epoch": 2.5694802959903633, "grad_norm": 0.028751032426953316, "learning_rate": 4.783562429624594e-06, "loss": 0.1118, "step": 179175 }, { "epoch": 2.5698388114495496, "grad_norm": 3.279182195663452, "learning_rate": 4.7795789160594025e-06, "loss": 0.0687, "step": 179200 }, { "epoch": 2.5701973269087364, "grad_norm": 0.1481177657842636, "learning_rate": 4.775595402494211e-06, "loss": 0.0782, "step": 179225 }, { "epoch": 2.5705558423679227, "grad_norm": 2.0862374305725098, "learning_rate": 4.771611888929019e-06, "loss": 0.1515, "step": 179250 }, { "epoch": 2.5709143578271094, "grad_norm": 19.942609786987305, "learning_rate": 4.767628375363828e-06, "loss": 0.0907, "step": 179275 }, { "epoch": 2.571272873286296, "grad_norm": 21.956382751464844, "learning_rate": 4.763644861798636e-06, "loss": 0.0895, "step": 179300 }, { "epoch": 2.571631388745483, "grad_norm": 0.052299246191978455, "learning_rate": 4.759661348233444e-06, "loss": 0.1041, "step": 179325 }, { "epoch": 2.571989904204669, "grad_norm": 0.09775392711162567, "learning_rate": 4.755677834668253e-06, "loss": 0.0373, "step": 179350 }, { "epoch": 2.572348419663856, "grad_norm": 0.2816598415374756, "learning_rate": 4.751694321103062e-06, "loss": 0.1154, "step": 179375 }, { "epoch": 2.5727069351230423, "grad_norm": 1.8624305725097656, "learning_rate": 4.74771080753787e-06, "loss": 0.1634, "step": 179400 }, { "epoch": 2.573065450582229, "grad_norm": 1.8513535261154175, "learning_rate": 4.743727293972679e-06, "loss": 0.0705, "step": 179425 }, { "epoch": 2.5734239660414158, "grad_norm": 1.7985737323760986, "learning_rate": 4.739743780407487e-06, "loss": 0.0395, "step": 179450 }, { "epoch": 2.5737824815006025, "grad_norm": 8.467596054077148, "learning_rate": 4.735760266842296e-06, "loss": 0.0937, "step": 179475 }, { "epoch": 2.574140996959789, "grad_norm": 1.322012186050415, "learning_rate": 4.731776753277104e-06, "loss": 0.0654, "step": 179500 }, { "epoch": 2.5744995124189756, "grad_norm": 0.8691907525062561, "learning_rate": 4.727793239711912e-06, "loss": 0.0396, "step": 179525 }, { "epoch": 2.574858027878162, "grad_norm": 5.25462532043457, "learning_rate": 4.723809726146721e-06, "loss": 0.0573, "step": 179550 }, { "epoch": 2.5752165433373486, "grad_norm": 1.520013689994812, "learning_rate": 4.7198262125815295e-06, "loss": 0.0829, "step": 179575 }, { "epoch": 2.5755750587965354, "grad_norm": 0.4627743363380432, "learning_rate": 4.715842699016337e-06, "loss": 0.0812, "step": 179600 }, { "epoch": 2.575933574255722, "grad_norm": 0.06211034208536148, "learning_rate": 4.711859185451146e-06, "loss": 0.0315, "step": 179625 }, { "epoch": 2.5762920897149084, "grad_norm": 2.0619747638702393, "learning_rate": 4.707875671885955e-06, "loss": 0.0751, "step": 179650 }, { "epoch": 2.576650605174095, "grad_norm": 0.33978936076164246, "learning_rate": 4.7038921583207634e-06, "loss": 0.0719, "step": 179675 }, { "epoch": 2.5770091206332815, "grad_norm": 0.5625811219215393, "learning_rate": 4.699908644755572e-06, "loss": 0.0517, "step": 179700 }, { "epoch": 2.577367636092468, "grad_norm": 10.087987899780273, "learning_rate": 4.695925131190381e-06, "loss": 0.1018, "step": 179725 }, { "epoch": 2.577726151551655, "grad_norm": 7.659821033477783, "learning_rate": 4.691941617625189e-06, "loss": 0.1279, "step": 179750 }, { "epoch": 2.5780846670108417, "grad_norm": 3.6843109130859375, "learning_rate": 4.687958104059997e-06, "loss": 0.0766, "step": 179775 }, { "epoch": 2.578443182470028, "grad_norm": 16.65298080444336, "learning_rate": 4.683974590494805e-06, "loss": 0.1746, "step": 179800 }, { "epoch": 2.5788016979292148, "grad_norm": 0.19053727388381958, "learning_rate": 4.679991076929614e-06, "loss": 0.0932, "step": 179825 }, { "epoch": 2.579160213388401, "grad_norm": 1.702188491821289, "learning_rate": 4.676007563364423e-06, "loss": 0.1048, "step": 179850 }, { "epoch": 2.579518728847588, "grad_norm": 13.44884204864502, "learning_rate": 4.672024049799231e-06, "loss": 0.0716, "step": 179875 }, { "epoch": 2.5798772443067746, "grad_norm": 1.3918030261993408, "learning_rate": 4.668040536234039e-06, "loss": 0.0392, "step": 179900 }, { "epoch": 2.5802357597659613, "grad_norm": 17.19361686706543, "learning_rate": 4.664057022668848e-06, "loss": 0.0488, "step": 179925 }, { "epoch": 2.5805942752251476, "grad_norm": 15.435711860656738, "learning_rate": 4.6600735091036565e-06, "loss": 0.106, "step": 179950 }, { "epoch": 2.5809527906843344, "grad_norm": 1.8360364437103271, "learning_rate": 4.656089995538465e-06, "loss": 0.0474, "step": 179975 }, { "epoch": 2.5813113061435207, "grad_norm": 13.37454891204834, "learning_rate": 4.652106481973274e-06, "loss": 0.0569, "step": 180000 }, { "epoch": 2.5816698216027074, "grad_norm": 0.3841743767261505, "learning_rate": 4.648122968408083e-06, "loss": 0.0818, "step": 180025 }, { "epoch": 2.582028337061894, "grad_norm": 1.195459008216858, "learning_rate": 4.6441394548428904e-06, "loss": 0.0628, "step": 180050 }, { "epoch": 2.582386852521081, "grad_norm": 0.9517312049865723, "learning_rate": 4.640155941277698e-06, "loss": 0.1057, "step": 180075 }, { "epoch": 2.582745367980267, "grad_norm": 0.6177424788475037, "learning_rate": 4.636172427712507e-06, "loss": 0.0845, "step": 180100 }, { "epoch": 2.583103883439454, "grad_norm": 1.59861159324646, "learning_rate": 4.632188914147316e-06, "loss": 0.1019, "step": 180125 }, { "epoch": 2.5834623988986403, "grad_norm": 0.17822323739528656, "learning_rate": 4.628205400582124e-06, "loss": 0.0833, "step": 180150 }, { "epoch": 2.583820914357827, "grad_norm": 8.066176414489746, "learning_rate": 4.624221887016933e-06, "loss": 0.0508, "step": 180175 }, { "epoch": 2.5841794298170138, "grad_norm": 14.913654327392578, "learning_rate": 4.620238373451741e-06, "loss": 0.097, "step": 180200 }, { "epoch": 2.5845379452762005, "grad_norm": 23.830095291137695, "learning_rate": 4.61625485988655e-06, "loss": 0.1049, "step": 180225 }, { "epoch": 2.584896460735387, "grad_norm": 0.6100402474403381, "learning_rate": 4.612271346321358e-06, "loss": 0.1226, "step": 180250 }, { "epoch": 2.5852549761945736, "grad_norm": 5.043722152709961, "learning_rate": 4.608287832756167e-06, "loss": 0.0887, "step": 180275 }, { "epoch": 2.58561349165376, "grad_norm": 19.06695556640625, "learning_rate": 4.604304319190976e-06, "loss": 0.1371, "step": 180300 }, { "epoch": 2.5859720071129466, "grad_norm": 14.077486991882324, "learning_rate": 4.600320805625784e-06, "loss": 0.098, "step": 180325 }, { "epoch": 2.5863305225721334, "grad_norm": 7.002396106719971, "learning_rate": 4.596337292060591e-06, "loss": 0.0946, "step": 180350 }, { "epoch": 2.58668903803132, "grad_norm": 28.440427780151367, "learning_rate": 4.5923537784954e-06, "loss": 0.1191, "step": 180375 }, { "epoch": 2.5870475534905064, "grad_norm": 9.474925994873047, "learning_rate": 4.588370264930209e-06, "loss": 0.0588, "step": 180400 }, { "epoch": 2.587406068949693, "grad_norm": 10.747775077819824, "learning_rate": 4.584386751365017e-06, "loss": 0.0745, "step": 180425 }, { "epoch": 2.5877645844088795, "grad_norm": 2.6731441020965576, "learning_rate": 4.580403237799826e-06, "loss": 0.1232, "step": 180450 }, { "epoch": 2.588123099868066, "grad_norm": 0.05344473943114281, "learning_rate": 4.576419724234635e-06, "loss": 0.1019, "step": 180475 }, { "epoch": 2.588481615327253, "grad_norm": 0.12538114190101624, "learning_rate": 4.572436210669443e-06, "loss": 0.0523, "step": 180500 }, { "epoch": 2.5888401307864397, "grad_norm": 1.9025968313217163, "learning_rate": 4.568452697104251e-06, "loss": 0.0626, "step": 180525 }, { "epoch": 2.589198646245626, "grad_norm": 20.55893325805664, "learning_rate": 4.56446918353906e-06, "loss": 0.0864, "step": 180550 }, { "epoch": 2.5895571617048128, "grad_norm": 0.5036987066268921, "learning_rate": 4.560485669973869e-06, "loss": 0.0708, "step": 180575 }, { "epoch": 2.589915677163999, "grad_norm": 1.6717692613601685, "learning_rate": 4.556502156408677e-06, "loss": 0.0789, "step": 180600 }, { "epoch": 2.590274192623186, "grad_norm": 0.23535633087158203, "learning_rate": 4.552518642843485e-06, "loss": 0.0937, "step": 180625 }, { "epoch": 2.5906327080823726, "grad_norm": 1.8769922256469727, "learning_rate": 4.548535129278293e-06, "loss": 0.061, "step": 180650 }, { "epoch": 2.5909912235415593, "grad_norm": 1.8226499557495117, "learning_rate": 4.544551615713102e-06, "loss": 0.06, "step": 180675 }, { "epoch": 2.5913497390007456, "grad_norm": 10.946120262145996, "learning_rate": 4.5405681021479105e-06, "loss": 0.0704, "step": 180700 }, { "epoch": 2.5917082544599324, "grad_norm": 7.246850967407227, "learning_rate": 4.536584588582719e-06, "loss": 0.039, "step": 180725 }, { "epoch": 2.5920667699191187, "grad_norm": 0.20485246181488037, "learning_rate": 4.532601075017528e-06, "loss": 0.0544, "step": 180750 }, { "epoch": 2.5924252853783054, "grad_norm": 3.3085711002349854, "learning_rate": 4.5286175614523366e-06, "loss": 0.0528, "step": 180775 }, { "epoch": 2.592783800837492, "grad_norm": 0.6227642297744751, "learning_rate": 4.524634047887144e-06, "loss": 0.0906, "step": 180800 }, { "epoch": 2.593142316296679, "grad_norm": 3.818138837814331, "learning_rate": 4.520650534321953e-06, "loss": 0.0909, "step": 180825 }, { "epoch": 2.593500831755865, "grad_norm": 8.852911949157715, "learning_rate": 4.516667020756762e-06, "loss": 0.0853, "step": 180850 }, { "epoch": 2.593859347215052, "grad_norm": 11.753944396972656, "learning_rate": 4.5126835071915705e-06, "loss": 0.0974, "step": 180875 }, { "epoch": 2.5942178626742383, "grad_norm": 0.03611841797828674, "learning_rate": 4.508699993626378e-06, "loss": 0.0702, "step": 180900 }, { "epoch": 2.594576378133425, "grad_norm": 0.9716468453407288, "learning_rate": 4.504716480061186e-06, "loss": 0.0282, "step": 180925 }, { "epoch": 2.5949348935926118, "grad_norm": 0.20007549226284027, "learning_rate": 4.500732966495995e-06, "loss": 0.0942, "step": 180950 }, { "epoch": 2.5952934090517985, "grad_norm": 10.632536888122559, "learning_rate": 4.4967494529308036e-06, "loss": 0.0975, "step": 180975 }, { "epoch": 2.595651924510985, "grad_norm": 0.1540548950433731, "learning_rate": 4.492765939365612e-06, "loss": 0.0489, "step": 181000 }, { "epoch": 2.5960104399701716, "grad_norm": 1.720991849899292, "learning_rate": 4.488782425800421e-06, "loss": 0.1105, "step": 181025 }, { "epoch": 2.596368955429358, "grad_norm": 0.10746143758296967, "learning_rate": 4.48479891223523e-06, "loss": 0.0478, "step": 181050 }, { "epoch": 2.5967274708885446, "grad_norm": 1.5376232862472534, "learning_rate": 4.4808153986700375e-06, "loss": 0.0504, "step": 181075 }, { "epoch": 2.5970859863477314, "grad_norm": 0.017450232058763504, "learning_rate": 4.476831885104846e-06, "loss": 0.1535, "step": 181100 }, { "epoch": 2.597444501806918, "grad_norm": 0.3342825770378113, "learning_rate": 4.472848371539655e-06, "loss": 0.1073, "step": 181125 }, { "epoch": 2.5978030172661044, "grad_norm": 0.14773263037204742, "learning_rate": 4.4688648579744635e-06, "loss": 0.0656, "step": 181150 }, { "epoch": 2.598161532725291, "grad_norm": 1.0801337957382202, "learning_rate": 4.464881344409271e-06, "loss": 0.0282, "step": 181175 }, { "epoch": 2.5985200481844775, "grad_norm": 17.079923629760742, "learning_rate": 4.46089783084408e-06, "loss": 0.0927, "step": 181200 }, { "epoch": 2.598878563643664, "grad_norm": 11.703941345214844, "learning_rate": 4.456914317278888e-06, "loss": 0.0805, "step": 181225 }, { "epoch": 2.599237079102851, "grad_norm": 0.6629113554954529, "learning_rate": 4.452930803713697e-06, "loss": 0.0809, "step": 181250 }, { "epoch": 2.5995955945620377, "grad_norm": 0.29047274589538574, "learning_rate": 4.448947290148505e-06, "loss": 0.0932, "step": 181275 }, { "epoch": 2.599954110021224, "grad_norm": 5.7010040283203125, "learning_rate": 4.444963776583314e-06, "loss": 0.0706, "step": 181300 }, { "epoch": 2.6003126254804108, "grad_norm": 9.645780563354492, "learning_rate": 4.440980263018123e-06, "loss": 0.0793, "step": 181325 }, { "epoch": 2.600671140939597, "grad_norm": 0.2625202536582947, "learning_rate": 4.436996749452931e-06, "loss": 0.0739, "step": 181350 }, { "epoch": 2.601029656398784, "grad_norm": 7.94052267074585, "learning_rate": 4.433013235887739e-06, "loss": 0.0694, "step": 181375 }, { "epoch": 2.6013881718579706, "grad_norm": 0.7759341597557068, "learning_rate": 4.429029722322548e-06, "loss": 0.0629, "step": 181400 }, { "epoch": 2.6017466873171573, "grad_norm": 8.2472505569458, "learning_rate": 4.425046208757357e-06, "loss": 0.1181, "step": 181425 }, { "epoch": 2.6021052027763436, "grad_norm": 4.460129261016846, "learning_rate": 4.4210626951921645e-06, "loss": 0.1402, "step": 181450 }, { "epoch": 2.6024637182355304, "grad_norm": 1.7534868717193604, "learning_rate": 4.417079181626973e-06, "loss": 0.0537, "step": 181475 }, { "epoch": 2.6028222336947167, "grad_norm": 0.18219466507434845, "learning_rate": 4.413095668061782e-06, "loss": 0.0542, "step": 181500 }, { "epoch": 2.6031807491539034, "grad_norm": 1.4935541152954102, "learning_rate": 4.40911215449659e-06, "loss": 0.1012, "step": 181525 }, { "epoch": 2.60353926461309, "grad_norm": 0.07235497236251831, "learning_rate": 4.405128640931398e-06, "loss": 0.0605, "step": 181550 }, { "epoch": 2.603897780072277, "grad_norm": 2.75285005569458, "learning_rate": 4.401145127366207e-06, "loss": 0.0533, "step": 181575 }, { "epoch": 2.604256295531463, "grad_norm": 0.28273457288742065, "learning_rate": 4.397161613801016e-06, "loss": 0.067, "step": 181600 }, { "epoch": 2.60461481099065, "grad_norm": 1.0005218982696533, "learning_rate": 4.3931781002358245e-06, "loss": 0.0646, "step": 181625 }, { "epoch": 2.6049733264498363, "grad_norm": 1.819698691368103, "learning_rate": 4.389194586670633e-06, "loss": 0.1975, "step": 181650 }, { "epoch": 2.605331841909023, "grad_norm": 9.478581428527832, "learning_rate": 4.385211073105441e-06, "loss": 0.0454, "step": 181675 }, { "epoch": 2.6056903573682098, "grad_norm": 5.194200038909912, "learning_rate": 4.38122755954025e-06, "loss": 0.0755, "step": 181700 }, { "epoch": 2.6060488728273965, "grad_norm": 0.9555352330207825, "learning_rate": 4.3772440459750575e-06, "loss": 0.1324, "step": 181725 }, { "epoch": 2.606407388286583, "grad_norm": 1.1491016149520874, "learning_rate": 4.373260532409866e-06, "loss": 0.0795, "step": 181750 }, { "epoch": 2.6067659037457696, "grad_norm": 14.545620918273926, "learning_rate": 4.369277018844675e-06, "loss": 0.0572, "step": 181775 }, { "epoch": 2.607124419204956, "grad_norm": 3.5871846675872803, "learning_rate": 4.365293505279484e-06, "loss": 0.0453, "step": 181800 }, { "epoch": 2.6074829346641426, "grad_norm": 2.8308136463165283, "learning_rate": 4.3613099917142914e-06, "loss": 0.1201, "step": 181825 }, { "epoch": 2.6078414501233294, "grad_norm": 7.301955699920654, "learning_rate": 4.3573264781491e-06, "loss": 0.0722, "step": 181850 }, { "epoch": 2.608199965582516, "grad_norm": 0.17676478624343872, "learning_rate": 4.353342964583909e-06, "loss": 0.0535, "step": 181875 }, { "epoch": 2.6085584810417024, "grad_norm": 0.3637470602989197, "learning_rate": 4.3493594510187175e-06, "loss": 0.037, "step": 181900 }, { "epoch": 2.608916996500889, "grad_norm": 3.8466663360595703, "learning_rate": 4.345375937453526e-06, "loss": 0.0275, "step": 181925 }, { "epoch": 2.6092755119600755, "grad_norm": 1.0605714321136475, "learning_rate": 4.341392423888335e-06, "loss": 0.1024, "step": 181950 }, { "epoch": 2.609634027419262, "grad_norm": 1.531893253326416, "learning_rate": 4.337408910323143e-06, "loss": 0.0681, "step": 181975 }, { "epoch": 2.609992542878449, "grad_norm": 1.5861871242523193, "learning_rate": 4.333425396757951e-06, "loss": 0.083, "step": 182000 }, { "epoch": 2.6103510583376357, "grad_norm": 1.1029013395309448, "learning_rate": 4.329441883192759e-06, "loss": 0.0995, "step": 182025 }, { "epoch": 2.610709573796822, "grad_norm": 11.35914134979248, "learning_rate": 4.325458369627568e-06, "loss": 0.0854, "step": 182050 }, { "epoch": 2.6110680892560088, "grad_norm": 3.862478494644165, "learning_rate": 4.321474856062377e-06, "loss": 0.0378, "step": 182075 }, { "epoch": 2.611426604715195, "grad_norm": 18.633237838745117, "learning_rate": 4.317491342497185e-06, "loss": 0.0586, "step": 182100 }, { "epoch": 2.611785120174382, "grad_norm": 8.023999214172363, "learning_rate": 4.313507828931993e-06, "loss": 0.0581, "step": 182125 }, { "epoch": 2.6121436356335686, "grad_norm": 8.652349472045898, "learning_rate": 4.309524315366802e-06, "loss": 0.1027, "step": 182150 }, { "epoch": 2.6125021510927553, "grad_norm": 0.10185836255550385, "learning_rate": 4.305540801801611e-06, "loss": 0.1073, "step": 182175 }, { "epoch": 2.6128606665519416, "grad_norm": 8.772363662719727, "learning_rate": 4.301557288236419e-06, "loss": 0.0705, "step": 182200 }, { "epoch": 2.6132191820111283, "grad_norm": 0.5390215516090393, "learning_rate": 4.297573774671228e-06, "loss": 0.0506, "step": 182225 }, { "epoch": 2.6135776974703147, "grad_norm": 10.695089340209961, "learning_rate": 4.293590261106037e-06, "loss": 0.1132, "step": 182250 }, { "epoch": 2.6139362129295014, "grad_norm": 2.97585129737854, "learning_rate": 4.289606747540844e-06, "loss": 0.0738, "step": 182275 }, { "epoch": 2.614294728388688, "grad_norm": 11.536471366882324, "learning_rate": 4.285623233975652e-06, "loss": 0.0723, "step": 182300 }, { "epoch": 2.614653243847875, "grad_norm": 0.10700763761997223, "learning_rate": 4.281639720410461e-06, "loss": 0.0923, "step": 182325 }, { "epoch": 2.615011759307061, "grad_norm": 16.897125244140625, "learning_rate": 4.27765620684527e-06, "loss": 0.0819, "step": 182350 }, { "epoch": 2.615370274766248, "grad_norm": 5.417923927307129, "learning_rate": 4.2736726932800784e-06, "loss": 0.0558, "step": 182375 }, { "epoch": 2.6157287902254343, "grad_norm": 0.25492188334465027, "learning_rate": 4.269689179714887e-06, "loss": 0.0781, "step": 182400 }, { "epoch": 2.616087305684621, "grad_norm": 6.296265125274658, "learning_rate": 4.265705666149695e-06, "loss": 0.0348, "step": 182425 }, { "epoch": 2.6164458211438077, "grad_norm": 3.6712775230407715, "learning_rate": 4.261722152584504e-06, "loss": 0.0977, "step": 182450 }, { "epoch": 2.6168043366029945, "grad_norm": 0.2699895203113556, "learning_rate": 4.257738639019312e-06, "loss": 0.1045, "step": 182475 }, { "epoch": 2.617162852062181, "grad_norm": 0.3037721514701843, "learning_rate": 4.253755125454121e-06, "loss": 0.0436, "step": 182500 }, { "epoch": 2.6175213675213675, "grad_norm": 5.514354705810547, "learning_rate": 4.24977161188893e-06, "loss": 0.0704, "step": 182525 }, { "epoch": 2.617879882980554, "grad_norm": 2.404463291168213, "learning_rate": 4.2457880983237376e-06, "loss": 0.0537, "step": 182550 }, { "epoch": 2.6182383984397406, "grad_norm": 4.88896369934082, "learning_rate": 4.241804584758545e-06, "loss": 0.1064, "step": 182575 }, { "epoch": 2.6185969138989273, "grad_norm": 4.443453311920166, "learning_rate": 4.237821071193354e-06, "loss": 0.0703, "step": 182600 }, { "epoch": 2.618955429358114, "grad_norm": 0.4160761833190918, "learning_rate": 4.233837557628163e-06, "loss": 0.0396, "step": 182625 }, { "epoch": 2.6193139448173004, "grad_norm": 16.603124618530273, "learning_rate": 4.2298540440629715e-06, "loss": 0.1089, "step": 182650 }, { "epoch": 2.619672460276487, "grad_norm": 1.3894028663635254, "learning_rate": 4.22587053049778e-06, "loss": 0.1027, "step": 182675 }, { "epoch": 2.6200309757356735, "grad_norm": 3.6083762645721436, "learning_rate": 4.221887016932589e-06, "loss": 0.08, "step": 182700 }, { "epoch": 2.62038949119486, "grad_norm": 0.09882713109254837, "learning_rate": 4.217903503367397e-06, "loss": 0.0661, "step": 182725 }, { "epoch": 2.620748006654047, "grad_norm": 0.17106105387210846, "learning_rate": 4.213919989802205e-06, "loss": 0.0521, "step": 182750 }, { "epoch": 2.6211065221132337, "grad_norm": 1.8170366287231445, "learning_rate": 4.209936476237014e-06, "loss": 0.07, "step": 182775 }, { "epoch": 2.62146503757242, "grad_norm": 2.6060636043548584, "learning_rate": 4.205952962671823e-06, "loss": 0.035, "step": 182800 }, { "epoch": 2.6218235530316067, "grad_norm": 4.681768894195557, "learning_rate": 4.201969449106631e-06, "loss": 0.1781, "step": 182825 }, { "epoch": 2.622182068490793, "grad_norm": 0.8441840410232544, "learning_rate": 4.197985935541439e-06, "loss": 0.0659, "step": 182850 }, { "epoch": 2.62254058394998, "grad_norm": 5.107343673706055, "learning_rate": 4.194002421976247e-06, "loss": 0.0382, "step": 182875 }, { "epoch": 2.6228990994091665, "grad_norm": 0.12491516023874283, "learning_rate": 4.190018908411056e-06, "loss": 0.0462, "step": 182900 }, { "epoch": 2.6232576148683533, "grad_norm": 0.24435020983219147, "learning_rate": 4.1860353948458646e-06, "loss": 0.0589, "step": 182925 }, { "epoch": 2.6236161303275396, "grad_norm": 0.1268116682767868, "learning_rate": 4.182051881280673e-06, "loss": 0.0757, "step": 182950 }, { "epoch": 2.6239746457867263, "grad_norm": 7.243825912475586, "learning_rate": 4.178068367715482e-06, "loss": 0.0698, "step": 182975 }, { "epoch": 2.6243331612459126, "grad_norm": 6.7357869148254395, "learning_rate": 4.174084854150291e-06, "loss": 0.047, "step": 183000 }, { "epoch": 2.6246916767050994, "grad_norm": 6.549193859100342, "learning_rate": 4.1701013405850985e-06, "loss": 0.0809, "step": 183025 }, { "epoch": 2.625050192164286, "grad_norm": 7.882639408111572, "learning_rate": 4.166117827019907e-06, "loss": 0.0852, "step": 183050 }, { "epoch": 2.625408707623473, "grad_norm": 0.5614416003227234, "learning_rate": 4.162134313454716e-06, "loss": 0.0625, "step": 183075 }, { "epoch": 2.625767223082659, "grad_norm": 5.7993974685668945, "learning_rate": 4.158150799889524e-06, "loss": 0.0877, "step": 183100 }, { "epoch": 2.626125738541846, "grad_norm": 0.16318336129188538, "learning_rate": 4.154167286324332e-06, "loss": 0.0927, "step": 183125 }, { "epoch": 2.6264842540010322, "grad_norm": 0.9010622501373291, "learning_rate": 4.150183772759141e-06, "loss": 0.0431, "step": 183150 }, { "epoch": 2.626842769460219, "grad_norm": 0.8621580600738525, "learning_rate": 4.146200259193949e-06, "loss": 0.0703, "step": 183175 }, { "epoch": 2.6272012849194057, "grad_norm": 0.2250141203403473, "learning_rate": 4.142216745628758e-06, "loss": 0.095, "step": 183200 }, { "epoch": 2.6275598003785925, "grad_norm": 1.109422206878662, "learning_rate": 4.138233232063566e-06, "loss": 0.1308, "step": 183225 }, { "epoch": 2.627918315837779, "grad_norm": 0.12471145391464233, "learning_rate": 4.134249718498375e-06, "loss": 0.0324, "step": 183250 }, { "epoch": 2.6282768312969655, "grad_norm": 0.07677695900201797, "learning_rate": 4.130266204933184e-06, "loss": 0.0649, "step": 183275 }, { "epoch": 2.6286353467561523, "grad_norm": 0.18203279376029968, "learning_rate": 4.126282691367992e-06, "loss": 0.1115, "step": 183300 }, { "epoch": 2.6289938622153386, "grad_norm": 0.3301706910133362, "learning_rate": 4.1222991778028e-06, "loss": 0.0457, "step": 183325 }, { "epoch": 2.6293523776745253, "grad_norm": 0.577293872833252, "learning_rate": 4.118315664237609e-06, "loss": 0.0409, "step": 183350 }, { "epoch": 2.629710893133712, "grad_norm": 0.9178422689437866, "learning_rate": 4.114332150672417e-06, "loss": 0.0634, "step": 183375 }, { "epoch": 2.6300694085928984, "grad_norm": 0.44884127378463745, "learning_rate": 4.1103486371072255e-06, "loss": 0.1163, "step": 183400 }, { "epoch": 2.630427924052085, "grad_norm": 2.942573308944702, "learning_rate": 4.106365123542034e-06, "loss": 0.087, "step": 183425 }, { "epoch": 2.630786439511272, "grad_norm": 2.4823291301727295, "learning_rate": 4.102381609976843e-06, "loss": 0.148, "step": 183450 }, { "epoch": 2.631144954970458, "grad_norm": 0.37320464849472046, "learning_rate": 4.098398096411651e-06, "loss": 0.0743, "step": 183475 }, { "epoch": 2.631503470429645, "grad_norm": 0.36124709248542786, "learning_rate": 4.094414582846459e-06, "loss": 0.0864, "step": 183500 }, { "epoch": 2.6318619858888317, "grad_norm": 0.8259526491165161, "learning_rate": 4.090431069281268e-06, "loss": 0.0561, "step": 183525 }, { "epoch": 2.632220501348018, "grad_norm": 0.7260618805885315, "learning_rate": 4.086447555716077e-06, "loss": 0.0891, "step": 183550 }, { "epoch": 2.6325790168072047, "grad_norm": 2.4020490646362305, "learning_rate": 4.0824640421508855e-06, "loss": 0.073, "step": 183575 }, { "epoch": 2.6329375322663915, "grad_norm": 19.919496536254883, "learning_rate": 4.078480528585694e-06, "loss": 0.0724, "step": 183600 }, { "epoch": 2.633296047725578, "grad_norm": 2.1482715606689453, "learning_rate": 4.074497015020502e-06, "loss": 0.0335, "step": 183625 }, { "epoch": 2.6336545631847645, "grad_norm": 16.227554321289062, "learning_rate": 4.07051350145531e-06, "loss": 0.0752, "step": 183650 }, { "epoch": 2.6340130786439513, "grad_norm": 0.7334045171737671, "learning_rate": 4.0665299878901185e-06, "loss": 0.0494, "step": 183675 }, { "epoch": 2.6343715941031376, "grad_norm": 0.13216227293014526, "learning_rate": 4.062546474324927e-06, "loss": 0.1357, "step": 183700 }, { "epoch": 2.6347301095623243, "grad_norm": 1.387617826461792, "learning_rate": 4.058562960759736e-06, "loss": 0.0696, "step": 183725 }, { "epoch": 2.635088625021511, "grad_norm": 2.02410888671875, "learning_rate": 4.054579447194545e-06, "loss": 0.0959, "step": 183750 }, { "epoch": 2.6354471404806974, "grad_norm": 8.492382049560547, "learning_rate": 4.0505959336293525e-06, "loss": 0.1039, "step": 183775 }, { "epoch": 2.635805655939884, "grad_norm": 5.355251789093018, "learning_rate": 4.046612420064161e-06, "loss": 0.1293, "step": 183800 }, { "epoch": 2.636164171399071, "grad_norm": 0.5292341709136963, "learning_rate": 4.04262890649897e-06, "loss": 0.0735, "step": 183825 }, { "epoch": 2.636522686858257, "grad_norm": 1.72669517993927, "learning_rate": 4.0386453929337785e-06, "loss": 0.0938, "step": 183850 }, { "epoch": 2.636881202317444, "grad_norm": 0.4265744686126709, "learning_rate": 4.034661879368587e-06, "loss": 0.0639, "step": 183875 }, { "epoch": 2.6372397177766307, "grad_norm": 2.2718799114227295, "learning_rate": 4.030678365803396e-06, "loss": 0.0804, "step": 183900 }, { "epoch": 2.637598233235817, "grad_norm": 0.054430872201919556, "learning_rate": 4.026694852238203e-06, "loss": 0.0798, "step": 183925 }, { "epoch": 2.6379567486950037, "grad_norm": 0.16866733133792877, "learning_rate": 4.022711338673012e-06, "loss": 0.0891, "step": 183950 }, { "epoch": 2.6383152641541905, "grad_norm": 15.147481918334961, "learning_rate": 4.01872782510782e-06, "loss": 0.0755, "step": 183975 }, { "epoch": 2.638673779613377, "grad_norm": 2.0423080921173096, "learning_rate": 4.014744311542629e-06, "loss": 0.1033, "step": 184000 }, { "epoch": 2.6390322950725635, "grad_norm": 1.3617522716522217, "learning_rate": 4.010760797977438e-06, "loss": 0.1098, "step": 184025 }, { "epoch": 2.6393908105317503, "grad_norm": 4.965594291687012, "learning_rate": 4.006777284412246e-06, "loss": 0.084, "step": 184050 }, { "epoch": 2.6397493259909366, "grad_norm": 9.619161605834961, "learning_rate": 4.002793770847054e-06, "loss": 0.0744, "step": 184075 }, { "epoch": 2.6401078414501233, "grad_norm": 0.46756038069725037, "learning_rate": 3.998810257281863e-06, "loss": 0.0511, "step": 184100 }, { "epoch": 2.64046635690931, "grad_norm": 16.94524574279785, "learning_rate": 3.994826743716672e-06, "loss": 0.2321, "step": 184125 }, { "epoch": 2.6408248723684964, "grad_norm": 3.4357399940490723, "learning_rate": 3.99084323015148e-06, "loss": 0.1124, "step": 184150 }, { "epoch": 2.641183387827683, "grad_norm": 0.3479553461074829, "learning_rate": 3.986859716586289e-06, "loss": 0.048, "step": 184175 }, { "epoch": 2.64154190328687, "grad_norm": 0.08716659247875214, "learning_rate": 3.982876203021097e-06, "loss": 0.067, "step": 184200 }, { "epoch": 2.641900418746056, "grad_norm": 21.35233497619629, "learning_rate": 3.978892689455905e-06, "loss": 0.0938, "step": 184225 }, { "epoch": 2.642258934205243, "grad_norm": 2.2540676593780518, "learning_rate": 3.974909175890713e-06, "loss": 0.1239, "step": 184250 }, { "epoch": 2.6426174496644297, "grad_norm": 0.12637338042259216, "learning_rate": 3.970925662325522e-06, "loss": 0.1056, "step": 184275 }, { "epoch": 2.642975965123616, "grad_norm": 0.14055100083351135, "learning_rate": 3.966942148760331e-06, "loss": 0.0894, "step": 184300 }, { "epoch": 2.6433344805828027, "grad_norm": 1.3121821880340576, "learning_rate": 3.9629586351951394e-06, "loss": 0.0511, "step": 184325 }, { "epoch": 2.6436929960419895, "grad_norm": 1.6470733880996704, "learning_rate": 3.958975121629948e-06, "loss": 0.0956, "step": 184350 }, { "epoch": 2.644051511501176, "grad_norm": 0.3864101469516754, "learning_rate": 3.954991608064756e-06, "loss": 0.1118, "step": 184375 }, { "epoch": 2.6444100269603625, "grad_norm": 1.4419846534729004, "learning_rate": 3.951008094499565e-06, "loss": 0.0294, "step": 184400 }, { "epoch": 2.6447685424195493, "grad_norm": 2.5707948207855225, "learning_rate": 3.947024580934373e-06, "loss": 0.0956, "step": 184425 }, { "epoch": 2.6451270578787356, "grad_norm": 0.09353511035442352, "learning_rate": 3.943041067369182e-06, "loss": 0.0441, "step": 184450 }, { "epoch": 2.6454855733379223, "grad_norm": 2.2119367122650146, "learning_rate": 3.93905755380399e-06, "loss": 0.0784, "step": 184475 }, { "epoch": 2.645844088797109, "grad_norm": 2.8407840728759766, "learning_rate": 3.935074040238799e-06, "loss": 0.1075, "step": 184500 }, { "epoch": 2.6462026042562954, "grad_norm": 2.910557746887207, "learning_rate": 3.9310905266736064e-06, "loss": 0.0592, "step": 184525 }, { "epoch": 2.646561119715482, "grad_norm": 3.3061728477478027, "learning_rate": 3.927107013108415e-06, "loss": 0.0962, "step": 184550 }, { "epoch": 2.646919635174669, "grad_norm": 2.0060534477233887, "learning_rate": 3.923123499543224e-06, "loss": 0.0894, "step": 184575 }, { "epoch": 2.647278150633855, "grad_norm": 2.667236566543579, "learning_rate": 3.9191399859780325e-06, "loss": 0.0972, "step": 184600 }, { "epoch": 2.647636666093042, "grad_norm": 1.0044054985046387, "learning_rate": 3.915156472412841e-06, "loss": 0.0893, "step": 184625 }, { "epoch": 2.6479951815522287, "grad_norm": 0.056624967604875565, "learning_rate": 3.91117295884765e-06, "loss": 0.1375, "step": 184650 }, { "epoch": 2.648353697011415, "grad_norm": 0.02874182164669037, "learning_rate": 3.907189445282458e-06, "loss": 0.065, "step": 184675 }, { "epoch": 2.6487122124706017, "grad_norm": 3.424591064453125, "learning_rate": 3.903205931717266e-06, "loss": 0.1004, "step": 184700 }, { "epoch": 2.6490707279297885, "grad_norm": 5.0784735679626465, "learning_rate": 3.899222418152075e-06, "loss": 0.0719, "step": 184725 }, { "epoch": 2.649429243388975, "grad_norm": 0.45440590381622314, "learning_rate": 3.895238904586883e-06, "loss": 0.0643, "step": 184750 }, { "epoch": 2.6497877588481615, "grad_norm": 7.042709827423096, "learning_rate": 3.891255391021692e-06, "loss": 0.0939, "step": 184775 }, { "epoch": 2.6501462743073483, "grad_norm": 1.740131139755249, "learning_rate": 3.8872718774565e-06, "loss": 0.1027, "step": 184800 }, { "epoch": 2.6505047897665346, "grad_norm": 10.830151557922363, "learning_rate": 3.883288363891308e-06, "loss": 0.0729, "step": 184825 }, { "epoch": 2.6508633052257213, "grad_norm": 0.3219427764415741, "learning_rate": 3.879304850326117e-06, "loss": 0.1153, "step": 184850 }, { "epoch": 2.651221820684908, "grad_norm": 2.203350305557251, "learning_rate": 3.8753213367609256e-06, "loss": 0.0544, "step": 184875 }, { "epoch": 2.6515803361440944, "grad_norm": 1.471508502960205, "learning_rate": 3.871337823195734e-06, "loss": 0.0893, "step": 184900 }, { "epoch": 2.651938851603281, "grad_norm": 22.997047424316406, "learning_rate": 3.867354309630543e-06, "loss": 0.1271, "step": 184925 }, { "epoch": 2.652297367062468, "grad_norm": 8.80277156829834, "learning_rate": 3.863370796065352e-06, "loss": 0.0931, "step": 184950 }, { "epoch": 2.652655882521654, "grad_norm": 1.1311613321304321, "learning_rate": 3.8593872825001595e-06, "loss": 0.1142, "step": 184975 }, { "epoch": 2.653014397980841, "grad_norm": 0.4178033173084259, "learning_rate": 3.855403768934968e-06, "loss": 0.0755, "step": 185000 }, { "epoch": 2.6533729134400277, "grad_norm": 0.4914550185203552, "learning_rate": 3.851420255369776e-06, "loss": 0.0406, "step": 185025 }, { "epoch": 2.653731428899214, "grad_norm": 21.455577850341797, "learning_rate": 3.847436741804585e-06, "loss": 0.0882, "step": 185050 }, { "epoch": 2.6540899443584007, "grad_norm": 1.6959971189498901, "learning_rate": 3.843453228239393e-06, "loss": 0.0705, "step": 185075 }, { "epoch": 2.6544484598175875, "grad_norm": 0.7397421002388, "learning_rate": 3.839469714674202e-06, "loss": 0.1084, "step": 185100 }, { "epoch": 2.654806975276774, "grad_norm": 0.06294701248407364, "learning_rate": 3.83548620110901e-06, "loss": 0.1125, "step": 185125 }, { "epoch": 2.6551654907359605, "grad_norm": 0.16049940884113312, "learning_rate": 3.831502687543819e-06, "loss": 0.068, "step": 185150 }, { "epoch": 2.6555240061951473, "grad_norm": 7.913085460662842, "learning_rate": 3.827519173978627e-06, "loss": 0.0678, "step": 185175 }, { "epoch": 2.6558825216543336, "grad_norm": 0.6790060997009277, "learning_rate": 3.823535660413436e-06, "loss": 0.0684, "step": 185200 }, { "epoch": 2.6562410371135203, "grad_norm": 0.7814542055130005, "learning_rate": 3.819552146848245e-06, "loss": 0.1219, "step": 185225 }, { "epoch": 2.656599552572707, "grad_norm": 7.9079909324646, "learning_rate": 3.8155686332830526e-06, "loss": 0.0864, "step": 185250 }, { "epoch": 2.6569580680318934, "grad_norm": 3.9109396934509277, "learning_rate": 3.811585119717861e-06, "loss": 0.1342, "step": 185275 }, { "epoch": 2.65731658349108, "grad_norm": 1.7090939283370972, "learning_rate": 3.8076016061526695e-06, "loss": 0.0642, "step": 185300 }, { "epoch": 2.657675098950267, "grad_norm": 12.434456825256348, "learning_rate": 3.803618092587478e-06, "loss": 0.1252, "step": 185325 }, { "epoch": 2.658033614409453, "grad_norm": 0.5801421403884888, "learning_rate": 3.7996345790222865e-06, "loss": 0.176, "step": 185350 }, { "epoch": 2.65839212986864, "grad_norm": 0.8630154728889465, "learning_rate": 3.795651065457095e-06, "loss": 0.0748, "step": 185375 }, { "epoch": 2.6587506453278267, "grad_norm": 1.8609058856964111, "learning_rate": 3.791667551891903e-06, "loss": 0.0722, "step": 185400 }, { "epoch": 2.659109160787013, "grad_norm": 12.628447532653809, "learning_rate": 3.7876840383267117e-06, "loss": 0.1059, "step": 185425 }, { "epoch": 2.6594676762461997, "grad_norm": 2.1327731609344482, "learning_rate": 3.7837005247615204e-06, "loss": 0.0721, "step": 185450 }, { "epoch": 2.6598261917053865, "grad_norm": 0.031075740233063698, "learning_rate": 3.779717011196329e-06, "loss": 0.0581, "step": 185475 }, { "epoch": 2.660184707164573, "grad_norm": 2.2196271419525146, "learning_rate": 3.7757334976311374e-06, "loss": 0.1276, "step": 185500 }, { "epoch": 2.6605432226237595, "grad_norm": 1.07736337184906, "learning_rate": 3.771749984065946e-06, "loss": 0.1586, "step": 185525 }, { "epoch": 2.6609017380829463, "grad_norm": 2.2883236408233643, "learning_rate": 3.767766470500754e-06, "loss": 0.1005, "step": 185550 }, { "epoch": 2.6612602535421326, "grad_norm": 9.543708801269531, "learning_rate": 3.7637829569355626e-06, "loss": 0.0955, "step": 185575 }, { "epoch": 2.6616187690013193, "grad_norm": 0.11735499650239944, "learning_rate": 3.7597994433703713e-06, "loss": 0.0972, "step": 185600 }, { "epoch": 2.661977284460506, "grad_norm": 3.0197947025299072, "learning_rate": 3.7558159298051795e-06, "loss": 0.1017, "step": 185625 }, { "epoch": 2.6623357999196924, "grad_norm": 9.055041313171387, "learning_rate": 3.7518324162399882e-06, "loss": 0.0703, "step": 185650 }, { "epoch": 2.662694315378879, "grad_norm": 0.3835581839084625, "learning_rate": 3.7478489026747965e-06, "loss": 0.0393, "step": 185675 }, { "epoch": 2.663052830838066, "grad_norm": 0.09148130565881729, "learning_rate": 3.743865389109605e-06, "loss": 0.0713, "step": 185700 }, { "epoch": 2.663411346297252, "grad_norm": 15.244622230529785, "learning_rate": 3.739881875544414e-06, "loss": 0.0899, "step": 185725 }, { "epoch": 2.663769861756439, "grad_norm": 0.06009970232844353, "learning_rate": 3.735898361979222e-06, "loss": 0.0585, "step": 185750 }, { "epoch": 2.6641283772156257, "grad_norm": 5.076977252960205, "learning_rate": 3.7319148484140304e-06, "loss": 0.0662, "step": 185775 }, { "epoch": 2.664486892674812, "grad_norm": 0.3157016336917877, "learning_rate": 3.727931334848839e-06, "loss": 0.0413, "step": 185800 }, { "epoch": 2.6648454081339987, "grad_norm": 0.5511326789855957, "learning_rate": 3.7239478212836474e-06, "loss": 0.0484, "step": 185825 }, { "epoch": 2.6652039235931855, "grad_norm": 4.715271472930908, "learning_rate": 3.719964307718456e-06, "loss": 0.0938, "step": 185850 }, { "epoch": 2.665562439052372, "grad_norm": 0.6611867547035217, "learning_rate": 3.7159807941532648e-06, "loss": 0.1047, "step": 185875 }, { "epoch": 2.6659209545115585, "grad_norm": 1.1793036460876465, "learning_rate": 3.7119972805880726e-06, "loss": 0.1049, "step": 185900 }, { "epoch": 2.6662794699707453, "grad_norm": 9.333756446838379, "learning_rate": 3.7080137670228813e-06, "loss": 0.07, "step": 185925 }, { "epoch": 2.6666379854299316, "grad_norm": 0.7521401047706604, "learning_rate": 3.70403025345769e-06, "loss": 0.1055, "step": 185950 }, { "epoch": 2.6669965008891183, "grad_norm": 0.9817125797271729, "learning_rate": 3.7000467398924983e-06, "loss": 0.0751, "step": 185975 }, { "epoch": 2.667355016348305, "grad_norm": 0.4850780665874481, "learning_rate": 3.696063226327307e-06, "loss": 0.0687, "step": 186000 }, { "epoch": 2.6677135318074914, "grad_norm": 0.9458164572715759, "learning_rate": 3.6920797127621156e-06, "loss": 0.0555, "step": 186025 }, { "epoch": 2.668072047266678, "grad_norm": 0.1887611597776413, "learning_rate": 3.6880961991969235e-06, "loss": 0.1108, "step": 186050 }, { "epoch": 2.668430562725865, "grad_norm": 4.141504764556885, "learning_rate": 3.684112685631732e-06, "loss": 0.0367, "step": 186075 }, { "epoch": 2.668789078185051, "grad_norm": 2.8177223205566406, "learning_rate": 3.680129172066541e-06, "loss": 0.0649, "step": 186100 }, { "epoch": 2.669147593644238, "grad_norm": 1.0564383268356323, "learning_rate": 3.676145658501349e-06, "loss": 0.1091, "step": 186125 }, { "epoch": 2.6695061091034247, "grad_norm": 0.31433531641960144, "learning_rate": 3.672162144936158e-06, "loss": 0.0314, "step": 186150 }, { "epoch": 2.669864624562611, "grad_norm": 1.6899363994598389, "learning_rate": 3.668178631370966e-06, "loss": 0.0586, "step": 186175 }, { "epoch": 2.6702231400217977, "grad_norm": 0.0908801406621933, "learning_rate": 3.6641951178057744e-06, "loss": 0.0848, "step": 186200 }, { "epoch": 2.6705816554809845, "grad_norm": 1.4410585165023804, "learning_rate": 3.660211604240583e-06, "loss": 0.1246, "step": 186225 }, { "epoch": 2.6709401709401708, "grad_norm": 9.676078796386719, "learning_rate": 3.6562280906753917e-06, "loss": 0.0481, "step": 186250 }, { "epoch": 2.6712986863993575, "grad_norm": 2.968996524810791, "learning_rate": 3.6522445771102e-06, "loss": 0.1583, "step": 186275 }, { "epoch": 2.6716572018585443, "grad_norm": 1.6949658393859863, "learning_rate": 3.6482610635450087e-06, "loss": 0.11, "step": 186300 }, { "epoch": 2.6720157173177306, "grad_norm": 13.4341459274292, "learning_rate": 3.644277549979817e-06, "loss": 0.0737, "step": 186325 }, { "epoch": 2.6723742327769173, "grad_norm": 0.941861093044281, "learning_rate": 3.6402940364146252e-06, "loss": 0.0398, "step": 186350 }, { "epoch": 2.672732748236104, "grad_norm": 0.8048263192176819, "learning_rate": 3.636310522849434e-06, "loss": 0.0515, "step": 186375 }, { "epoch": 2.6730912636952904, "grad_norm": 5.540660858154297, "learning_rate": 3.6323270092842426e-06, "loss": 0.0915, "step": 186400 }, { "epoch": 2.673449779154477, "grad_norm": 0.20186296105384827, "learning_rate": 3.628343495719051e-06, "loss": 0.2066, "step": 186425 }, { "epoch": 2.673808294613664, "grad_norm": 0.6627215147018433, "learning_rate": 3.624359982153859e-06, "loss": 0.1354, "step": 186450 }, { "epoch": 2.67416681007285, "grad_norm": 1.4661870002746582, "learning_rate": 3.620376468588668e-06, "loss": 0.0835, "step": 186475 }, { "epoch": 2.674525325532037, "grad_norm": 0.38950881361961365, "learning_rate": 3.616392955023476e-06, "loss": 0.1076, "step": 186500 }, { "epoch": 2.6748838409912237, "grad_norm": 2.6494791507720947, "learning_rate": 3.612409441458285e-06, "loss": 0.0598, "step": 186525 }, { "epoch": 2.6752423564504104, "grad_norm": 0.49330538511276245, "learning_rate": 3.6084259278930935e-06, "loss": 0.0765, "step": 186550 }, { "epoch": 2.6756008719095967, "grad_norm": 22.520950317382812, "learning_rate": 3.6044424143279018e-06, "loss": 0.1351, "step": 186575 }, { "epoch": 2.6759593873687835, "grad_norm": 11.031904220581055, "learning_rate": 3.60045890076271e-06, "loss": 0.121, "step": 186600 }, { "epoch": 2.6763179028279698, "grad_norm": 0.06616607308387756, "learning_rate": 3.5964753871975187e-06, "loss": 0.0917, "step": 186625 }, { "epoch": 2.6766764182871565, "grad_norm": 3.5579984188079834, "learning_rate": 3.592491873632327e-06, "loss": 0.0807, "step": 186650 }, { "epoch": 2.6770349337463433, "grad_norm": 1.153234601020813, "learning_rate": 3.5885083600671357e-06, "loss": 0.0933, "step": 186675 }, { "epoch": 2.67739344920553, "grad_norm": 4.669939994812012, "learning_rate": 3.5845248465019444e-06, "loss": 0.1099, "step": 186700 }, { "epoch": 2.6777519646647163, "grad_norm": 2.763279676437378, "learning_rate": 3.5805413329367522e-06, "loss": 0.1608, "step": 186725 }, { "epoch": 2.678110480123903, "grad_norm": 2.724475383758545, "learning_rate": 3.576557819371561e-06, "loss": 0.1212, "step": 186750 }, { "epoch": 2.6784689955830894, "grad_norm": 0.22144334018230438, "learning_rate": 3.5725743058063696e-06, "loss": 0.0568, "step": 186775 }, { "epoch": 2.678827511042276, "grad_norm": 18.29250717163086, "learning_rate": 3.568590792241178e-06, "loss": 0.1756, "step": 186800 }, { "epoch": 2.679186026501463, "grad_norm": 1.347664475440979, "learning_rate": 3.5646072786759866e-06, "loss": 0.0344, "step": 186825 }, { "epoch": 2.6795445419606496, "grad_norm": 0.5391956567764282, "learning_rate": 3.5606237651107953e-06, "loss": 0.052, "step": 186850 }, { "epoch": 2.679903057419836, "grad_norm": 6.9876909255981445, "learning_rate": 3.556640251545603e-06, "loss": 0.132, "step": 186875 }, { "epoch": 2.6802615728790227, "grad_norm": 16.171876907348633, "learning_rate": 3.552656737980412e-06, "loss": 0.0848, "step": 186900 }, { "epoch": 2.680620088338209, "grad_norm": 0.4772701561450958, "learning_rate": 3.5486732244152205e-06, "loss": 0.0329, "step": 186925 }, { "epoch": 2.6809786037973957, "grad_norm": 10.624375343322754, "learning_rate": 3.5446897108500288e-06, "loss": 0.0822, "step": 186950 }, { "epoch": 2.6813371192565825, "grad_norm": 9.604177474975586, "learning_rate": 3.5407061972848375e-06, "loss": 0.1144, "step": 186975 }, { "epoch": 2.681695634715769, "grad_norm": 3.840701103210449, "learning_rate": 3.5367226837196457e-06, "loss": 0.0831, "step": 187000 }, { "epoch": 2.6820541501749555, "grad_norm": 0.043627526611089706, "learning_rate": 3.532739170154454e-06, "loss": 0.0719, "step": 187025 }, { "epoch": 2.6824126656341423, "grad_norm": 1.8940320014953613, "learning_rate": 3.5287556565892627e-06, "loss": 0.0609, "step": 187050 }, { "epoch": 2.6827711810933286, "grad_norm": 0.2233426719903946, "learning_rate": 3.5247721430240714e-06, "loss": 0.1042, "step": 187075 }, { "epoch": 2.6831296965525153, "grad_norm": 11.132741928100586, "learning_rate": 3.5207886294588796e-06, "loss": 0.0791, "step": 187100 }, { "epoch": 2.683488212011702, "grad_norm": 0.1406407207250595, "learning_rate": 3.5168051158936883e-06, "loss": 0.0326, "step": 187125 }, { "epoch": 2.683846727470889, "grad_norm": 0.6715279817581177, "learning_rate": 3.5128216023284966e-06, "loss": 0.1119, "step": 187150 }, { "epoch": 2.684205242930075, "grad_norm": 3.4022626876831055, "learning_rate": 3.508838088763305e-06, "loss": 0.0828, "step": 187175 }, { "epoch": 2.684563758389262, "grad_norm": 0.06244932860136032, "learning_rate": 3.5048545751981136e-06, "loss": 0.0842, "step": 187200 }, { "epoch": 2.684922273848448, "grad_norm": 0.7897803783416748, "learning_rate": 3.5008710616329223e-06, "loss": 0.1548, "step": 187225 }, { "epoch": 2.685280789307635, "grad_norm": 1.2692039012908936, "learning_rate": 3.4968875480677305e-06, "loss": 0.055, "step": 187250 }, { "epoch": 2.6856393047668217, "grad_norm": 0.9262430667877197, "learning_rate": 3.492904034502539e-06, "loss": 0.0754, "step": 187275 }, { "epoch": 2.6859978202260084, "grad_norm": 14.266533851623535, "learning_rate": 3.4889205209373475e-06, "loss": 0.0786, "step": 187300 }, { "epoch": 2.6863563356851947, "grad_norm": 24.8304500579834, "learning_rate": 3.4849370073721557e-06, "loss": 0.124, "step": 187325 }, { "epoch": 2.6867148511443815, "grad_norm": 5.179800033569336, "learning_rate": 3.4809534938069644e-06, "loss": 0.0722, "step": 187350 }, { "epoch": 2.6870733666035678, "grad_norm": 0.34031549096107483, "learning_rate": 3.476969980241773e-06, "loss": 0.0742, "step": 187375 }, { "epoch": 2.6874318820627545, "grad_norm": 18.981231689453125, "learning_rate": 3.4729864666765814e-06, "loss": 0.0948, "step": 187400 }, { "epoch": 2.6877903975219413, "grad_norm": 0.5552811622619629, "learning_rate": 3.4690029531113897e-06, "loss": 0.1174, "step": 187425 }, { "epoch": 2.688148912981128, "grad_norm": 2.291246175765991, "learning_rate": 3.465019439546198e-06, "loss": 0.0992, "step": 187450 }, { "epoch": 2.6885074284403143, "grad_norm": 0.9032562375068665, "learning_rate": 3.4610359259810066e-06, "loss": 0.0593, "step": 187475 }, { "epoch": 2.688865943899501, "grad_norm": 1.517305612564087, "learning_rate": 3.4570524124158153e-06, "loss": 0.069, "step": 187500 }, { "epoch": 2.6892244593586874, "grad_norm": 0.6961808800697327, "learning_rate": 3.4530688988506236e-06, "loss": 0.0933, "step": 187525 }, { "epoch": 2.689582974817874, "grad_norm": 0.16448849439620972, "learning_rate": 3.449085385285432e-06, "loss": 0.072, "step": 187550 }, { "epoch": 2.689941490277061, "grad_norm": 0.20989376306533813, "learning_rate": 3.4451018717202405e-06, "loss": 0.0967, "step": 187575 }, { "epoch": 2.6903000057362476, "grad_norm": 7.564021110534668, "learning_rate": 3.441118358155049e-06, "loss": 0.0879, "step": 187600 }, { "epoch": 2.690658521195434, "grad_norm": 0.544752299785614, "learning_rate": 3.4371348445898575e-06, "loss": 0.1284, "step": 187625 }, { "epoch": 2.6910170366546207, "grad_norm": 0.035177405923604965, "learning_rate": 3.433151331024666e-06, "loss": 0.1167, "step": 187650 }, { "epoch": 2.691375552113807, "grad_norm": 0.021429575979709625, "learning_rate": 3.4291678174594745e-06, "loss": 0.0987, "step": 187675 }, { "epoch": 2.6917340675729937, "grad_norm": 19.759017944335938, "learning_rate": 3.4251843038942827e-06, "loss": 0.0838, "step": 187700 }, { "epoch": 2.6920925830321805, "grad_norm": 1.3914345502853394, "learning_rate": 3.4212007903290914e-06, "loss": 0.1089, "step": 187725 }, { "epoch": 2.692451098491367, "grad_norm": 0.3665803372859955, "learning_rate": 3.4172172767638997e-06, "loss": 0.0827, "step": 187750 }, { "epoch": 2.6928096139505535, "grad_norm": 2.561828374862671, "learning_rate": 3.4132337631987084e-06, "loss": 0.1202, "step": 187775 }, { "epoch": 2.6931681294097403, "grad_norm": 0.18027308583259583, "learning_rate": 3.409250249633517e-06, "loss": 0.1814, "step": 187800 }, { "epoch": 2.6935266448689266, "grad_norm": 3.7388644218444824, "learning_rate": 3.405266736068325e-06, "loss": 0.0625, "step": 187825 }, { "epoch": 2.6938851603281133, "grad_norm": 8.194830894470215, "learning_rate": 3.4012832225031336e-06, "loss": 0.0756, "step": 187850 }, { "epoch": 2.6942436757873, "grad_norm": 4.9335222244262695, "learning_rate": 3.3972997089379423e-06, "loss": 0.0815, "step": 187875 }, { "epoch": 2.694602191246487, "grad_norm": 4.0264105796813965, "learning_rate": 3.3933161953727506e-06, "loss": 0.0879, "step": 187900 }, { "epoch": 2.694960706705673, "grad_norm": 0.3118244707584381, "learning_rate": 3.3893326818075593e-06, "loss": 0.0444, "step": 187925 }, { "epoch": 2.69531922216486, "grad_norm": 1.4715609550476074, "learning_rate": 3.385349168242368e-06, "loss": 0.0908, "step": 187950 }, { "epoch": 2.695677737624046, "grad_norm": 7.079183101654053, "learning_rate": 3.381365654677176e-06, "loss": 0.0763, "step": 187975 }, { "epoch": 2.696036253083233, "grad_norm": 19.525836944580078, "learning_rate": 3.3773821411119845e-06, "loss": 0.0904, "step": 188000 }, { "epoch": 2.6963947685424197, "grad_norm": 12.688528060913086, "learning_rate": 3.373398627546793e-06, "loss": 0.0535, "step": 188025 }, { "epoch": 2.6967532840016064, "grad_norm": 1.760196566581726, "learning_rate": 3.3694151139816015e-06, "loss": 0.0595, "step": 188050 }, { "epoch": 2.6971117994607927, "grad_norm": 4.634271144866943, "learning_rate": 3.36543160041641e-06, "loss": 0.0521, "step": 188075 }, { "epoch": 2.6974703149199795, "grad_norm": 4.548500061035156, "learning_rate": 3.3614480868512184e-06, "loss": 0.0569, "step": 188100 }, { "epoch": 2.6978288303791658, "grad_norm": 8.466541290283203, "learning_rate": 3.3574645732860267e-06, "loss": 0.1997, "step": 188125 }, { "epoch": 2.6981873458383525, "grad_norm": 2.8815672397613525, "learning_rate": 3.3534810597208354e-06, "loss": 0.0851, "step": 188150 }, { "epoch": 2.6985458612975393, "grad_norm": 0.10748057067394257, "learning_rate": 3.349497546155644e-06, "loss": 0.1259, "step": 188175 }, { "epoch": 2.698904376756726, "grad_norm": 3.447726011276245, "learning_rate": 3.3455140325904523e-06, "loss": 0.0517, "step": 188200 }, { "epoch": 2.6992628922159123, "grad_norm": 0.08860090374946594, "learning_rate": 3.341530519025261e-06, "loss": 0.0899, "step": 188225 }, { "epoch": 2.699621407675099, "grad_norm": 1.1345337629318237, "learning_rate": 3.3375470054600693e-06, "loss": 0.0962, "step": 188250 }, { "epoch": 2.6999799231342854, "grad_norm": 0.25946861505508423, "learning_rate": 3.3335634918948776e-06, "loss": 0.0995, "step": 188275 }, { "epoch": 2.700338438593472, "grad_norm": 17.214529037475586, "learning_rate": 3.3295799783296863e-06, "loss": 0.0845, "step": 188300 }, { "epoch": 2.700696954052659, "grad_norm": 7.400856971740723, "learning_rate": 3.325596464764495e-06, "loss": 0.0409, "step": 188325 }, { "epoch": 2.7010554695118456, "grad_norm": 1.683427095413208, "learning_rate": 3.3216129511993032e-06, "loss": 0.0423, "step": 188350 }, { "epoch": 2.701413984971032, "grad_norm": 1.4563970565795898, "learning_rate": 3.3176294376341115e-06, "loss": 0.0552, "step": 188375 }, { "epoch": 2.7017725004302187, "grad_norm": 7.160495281219482, "learning_rate": 3.31364592406892e-06, "loss": 0.0721, "step": 188400 }, { "epoch": 2.702131015889405, "grad_norm": 0.10642694681882858, "learning_rate": 3.3096624105037284e-06, "loss": 0.1111, "step": 188425 }, { "epoch": 2.7024895313485917, "grad_norm": 9.881473541259766, "learning_rate": 3.305678896938537e-06, "loss": 0.1047, "step": 188450 }, { "epoch": 2.7028480468077785, "grad_norm": 0.3576287627220154, "learning_rate": 3.301695383373346e-06, "loss": 0.1103, "step": 188475 }, { "epoch": 2.703206562266965, "grad_norm": 0.4929782450199127, "learning_rate": 3.297711869808154e-06, "loss": 0.0431, "step": 188500 }, { "epoch": 2.7035650777261515, "grad_norm": 0.620117723941803, "learning_rate": 3.2937283562429624e-06, "loss": 0.0875, "step": 188525 }, { "epoch": 2.7039235931853383, "grad_norm": 4.445730686187744, "learning_rate": 3.289744842677771e-06, "loss": 0.0815, "step": 188550 }, { "epoch": 2.7042821086445246, "grad_norm": 7.623504638671875, "learning_rate": 3.2857613291125793e-06, "loss": 0.0453, "step": 188575 }, { "epoch": 2.7046406241037113, "grad_norm": 5.843973159790039, "learning_rate": 3.281777815547388e-06, "loss": 0.0845, "step": 188600 }, { "epoch": 2.704999139562898, "grad_norm": 11.945686340332031, "learning_rate": 3.2777943019821967e-06, "loss": 0.1496, "step": 188625 }, { "epoch": 2.705357655022085, "grad_norm": 2.6190853118896484, "learning_rate": 3.2738107884170045e-06, "loss": 0.0797, "step": 188650 }, { "epoch": 2.705716170481271, "grad_norm": 7.3205885887146, "learning_rate": 3.2698272748518132e-06, "loss": 0.1218, "step": 188675 }, { "epoch": 2.706074685940458, "grad_norm": 0.12781547009944916, "learning_rate": 3.265843761286622e-06, "loss": 0.0513, "step": 188700 }, { "epoch": 2.706433201399644, "grad_norm": 0.26618367433547974, "learning_rate": 3.26186024772143e-06, "loss": 0.0547, "step": 188725 }, { "epoch": 2.706791716858831, "grad_norm": 14.860013961791992, "learning_rate": 3.257876734156239e-06, "loss": 0.0695, "step": 188750 }, { "epoch": 2.7071502323180177, "grad_norm": 0.31812402606010437, "learning_rate": 3.2538932205910476e-06, "loss": 0.1006, "step": 188775 }, { "epoch": 2.7075087477772044, "grad_norm": 5.830883979797363, "learning_rate": 3.2499097070258554e-06, "loss": 0.0564, "step": 188800 }, { "epoch": 2.7078672632363907, "grad_norm": 0.6058433055877686, "learning_rate": 3.245926193460664e-06, "loss": 0.0942, "step": 188825 }, { "epoch": 2.7082257786955775, "grad_norm": 0.2616790235042572, "learning_rate": 3.241942679895473e-06, "loss": 0.0709, "step": 188850 }, { "epoch": 2.7085842941547638, "grad_norm": 30.20506477355957, "learning_rate": 3.237959166330281e-06, "loss": 0.0633, "step": 188875 }, { "epoch": 2.7089428096139505, "grad_norm": 1.743723750114441, "learning_rate": 3.2339756527650898e-06, "loss": 0.0721, "step": 188900 }, { "epoch": 2.7093013250731373, "grad_norm": 3.6827890872955322, "learning_rate": 3.229992139199898e-06, "loss": 0.1032, "step": 188925 }, { "epoch": 2.709659840532324, "grad_norm": 0.5121139287948608, "learning_rate": 3.2260086256347063e-06, "loss": 0.0915, "step": 188950 }, { "epoch": 2.7100183559915103, "grad_norm": 4.025233268737793, "learning_rate": 3.222025112069515e-06, "loss": 0.1295, "step": 188975 }, { "epoch": 2.710376871450697, "grad_norm": 7.580421447753906, "learning_rate": 3.2180415985043237e-06, "loss": 0.0942, "step": 189000 }, { "epoch": 2.7107353869098834, "grad_norm": 1.009102463722229, "learning_rate": 3.214058084939132e-06, "loss": 0.0834, "step": 189025 }, { "epoch": 2.71109390236907, "grad_norm": 0.24236413836479187, "learning_rate": 3.2100745713739406e-06, "loss": 0.0846, "step": 189050 }, { "epoch": 2.711452417828257, "grad_norm": 7.794516086578369, "learning_rate": 3.206091057808749e-06, "loss": 0.1307, "step": 189075 }, { "epoch": 2.7118109332874436, "grad_norm": 3.4858124256134033, "learning_rate": 3.202107544243557e-06, "loss": 0.0714, "step": 189100 }, { "epoch": 2.71216944874663, "grad_norm": 0.026575928553938866, "learning_rate": 3.198124030678366e-06, "loss": 0.0953, "step": 189125 }, { "epoch": 2.7125279642058167, "grad_norm": 0.1830701231956482, "learning_rate": 3.1941405171131746e-06, "loss": 0.0943, "step": 189150 }, { "epoch": 2.712886479665003, "grad_norm": 0.04467761516571045, "learning_rate": 3.190157003547983e-06, "loss": 0.0446, "step": 189175 }, { "epoch": 2.7132449951241897, "grad_norm": 1.099063754081726, "learning_rate": 3.186173489982791e-06, "loss": 0.0417, "step": 189200 }, { "epoch": 2.7136035105833765, "grad_norm": 10.476304054260254, "learning_rate": 3.1821899764176e-06, "loss": 0.0571, "step": 189225 }, { "epoch": 2.713962026042563, "grad_norm": 5.012480735778809, "learning_rate": 3.178206462852408e-06, "loss": 0.0715, "step": 189250 }, { "epoch": 2.7143205415017495, "grad_norm": 0.8989059329032898, "learning_rate": 3.1742229492872168e-06, "loss": 0.1044, "step": 189275 }, { "epoch": 2.7146790569609363, "grad_norm": 1.7201220989227295, "learning_rate": 3.1702394357220254e-06, "loss": 0.0569, "step": 189300 }, { "epoch": 2.7150375724201226, "grad_norm": 0.49072661995887756, "learning_rate": 3.1662559221568337e-06, "loss": 0.152, "step": 189325 }, { "epoch": 2.7153960878793093, "grad_norm": 7.320302486419678, "learning_rate": 3.162272408591642e-06, "loss": 0.0486, "step": 189350 }, { "epoch": 2.715754603338496, "grad_norm": 8.7935791015625, "learning_rate": 3.1582888950264507e-06, "loss": 0.0499, "step": 189375 }, { "epoch": 2.716113118797683, "grad_norm": 1.0749168395996094, "learning_rate": 3.154305381461259e-06, "loss": 0.0878, "step": 189400 }, { "epoch": 2.716471634256869, "grad_norm": 7.584621906280518, "learning_rate": 3.1503218678960676e-06, "loss": 0.0444, "step": 189425 }, { "epoch": 2.716830149716056, "grad_norm": 3.6461586952209473, "learning_rate": 3.1463383543308763e-06, "loss": 0.1007, "step": 189450 }, { "epoch": 2.717188665175242, "grad_norm": 21.630626678466797, "learning_rate": 3.142354840765684e-06, "loss": 0.123, "step": 189475 }, { "epoch": 2.717547180634429, "grad_norm": 4.561363697052002, "learning_rate": 3.138371327200493e-06, "loss": 0.07, "step": 189500 }, { "epoch": 2.7179056960936157, "grad_norm": 0.8929556012153625, "learning_rate": 3.1343878136353016e-06, "loss": 0.0929, "step": 189525 }, { "epoch": 2.7182642115528024, "grad_norm": 0.3128781318664551, "learning_rate": 3.13040430007011e-06, "loss": 0.1037, "step": 189550 }, { "epoch": 2.7186227270119887, "grad_norm": 1.0339802503585815, "learning_rate": 3.1264207865049185e-06, "loss": 0.1074, "step": 189575 }, { "epoch": 2.7189812424711755, "grad_norm": 0.3149138391017914, "learning_rate": 3.122437272939727e-06, "loss": 0.1239, "step": 189600 }, { "epoch": 2.7193397579303618, "grad_norm": 0.10532474517822266, "learning_rate": 3.118453759374535e-06, "loss": 0.0666, "step": 189625 }, { "epoch": 2.7196982733895485, "grad_norm": 2.033691167831421, "learning_rate": 3.1144702458093437e-06, "loss": 0.0762, "step": 189650 }, { "epoch": 2.7200567888487353, "grad_norm": 0.5130020380020142, "learning_rate": 3.1104867322441524e-06, "loss": 0.1274, "step": 189675 }, { "epoch": 2.720415304307922, "grad_norm": 0.04009446129202843, "learning_rate": 3.1065032186789607e-06, "loss": 0.129, "step": 189700 }, { "epoch": 2.7207738197671083, "grad_norm": 0.1747894436120987, "learning_rate": 3.1025197051137694e-06, "loss": 0.0603, "step": 189725 }, { "epoch": 2.721132335226295, "grad_norm": 0.3324916362762451, "learning_rate": 3.0985361915485777e-06, "loss": 0.0935, "step": 189750 }, { "epoch": 2.7214908506854814, "grad_norm": 21.08355140686035, "learning_rate": 3.094552677983386e-06, "loss": 0.1285, "step": 189775 }, { "epoch": 2.721849366144668, "grad_norm": 6.727635860443115, "learning_rate": 3.0905691644181946e-06, "loss": 0.059, "step": 189800 }, { "epoch": 2.722207881603855, "grad_norm": 0.4704878628253937, "learning_rate": 3.0865856508530033e-06, "loss": 0.0637, "step": 189825 }, { "epoch": 2.7225663970630416, "grad_norm": 0.05686343461275101, "learning_rate": 3.0826021372878116e-06, "loss": 0.1143, "step": 189850 }, { "epoch": 2.722924912522228, "grad_norm": 2.2635908126831055, "learning_rate": 3.0786186237226203e-06, "loss": 0.0739, "step": 189875 }, { "epoch": 2.7232834279814147, "grad_norm": 0.5129467844963074, "learning_rate": 3.0746351101574285e-06, "loss": 0.0732, "step": 189900 }, { "epoch": 2.723641943440601, "grad_norm": 5.410777568817139, "learning_rate": 3.070651596592237e-06, "loss": 0.0983, "step": 189925 }, { "epoch": 2.7240004588997877, "grad_norm": 0.09660880267620087, "learning_rate": 3.0666680830270455e-06, "loss": 0.0635, "step": 189950 }, { "epoch": 2.7243589743589745, "grad_norm": 1.4584776163101196, "learning_rate": 3.062684569461854e-06, "loss": 0.113, "step": 189975 }, { "epoch": 2.724717489818161, "grad_norm": 6.1833648681640625, "learning_rate": 3.0587010558966625e-06, "loss": 0.1304, "step": 190000 }, { "epoch": 2.7250760052773475, "grad_norm": 0.21563918888568878, "learning_rate": 3.0547175423314707e-06, "loss": 0.1427, "step": 190025 }, { "epoch": 2.7254345207365342, "grad_norm": 1.2607938051223755, "learning_rate": 3.0507340287662794e-06, "loss": 0.0598, "step": 190050 }, { "epoch": 2.7257930361957206, "grad_norm": 11.235557556152344, "learning_rate": 3.0467505152010877e-06, "loss": 0.14, "step": 190075 }, { "epoch": 2.7261515516549073, "grad_norm": 1.3213770389556885, "learning_rate": 3.0427670016358964e-06, "loss": 0.095, "step": 190100 }, { "epoch": 2.726510067114094, "grad_norm": 0.047139257192611694, "learning_rate": 3.038783488070705e-06, "loss": 0.0895, "step": 190125 }, { "epoch": 2.726868582573281, "grad_norm": 0.7464101910591125, "learning_rate": 3.0347999745055133e-06, "loss": 0.0671, "step": 190150 }, { "epoch": 2.727227098032467, "grad_norm": 1.7702107429504395, "learning_rate": 3.0308164609403216e-06, "loss": 0.0971, "step": 190175 }, { "epoch": 2.727585613491654, "grad_norm": 1.1266601085662842, "learning_rate": 3.0268329473751303e-06, "loss": 0.0431, "step": 190200 }, { "epoch": 2.72794412895084, "grad_norm": 0.4294981062412262, "learning_rate": 3.0228494338099386e-06, "loss": 0.0542, "step": 190225 }, { "epoch": 2.728302644410027, "grad_norm": 8.129582405090332, "learning_rate": 3.0188659202447473e-06, "loss": 0.114, "step": 190250 }, { "epoch": 2.7286611598692136, "grad_norm": 0.7173899412155151, "learning_rate": 3.014882406679556e-06, "loss": 0.1075, "step": 190275 }, { "epoch": 2.7290196753284004, "grad_norm": 6.884195804595947, "learning_rate": 3.010898893114364e-06, "loss": 0.1697, "step": 190300 }, { "epoch": 2.7293781907875867, "grad_norm": 0.43678194284439087, "learning_rate": 3.0069153795491725e-06, "loss": 0.0501, "step": 190325 }, { "epoch": 2.7297367062467734, "grad_norm": 0.35124871134757996, "learning_rate": 3.002931865983981e-06, "loss": 0.0989, "step": 190350 }, { "epoch": 2.7300952217059598, "grad_norm": 0.2541212737560272, "learning_rate": 2.9989483524187894e-06, "loss": 0.1066, "step": 190375 }, { "epoch": 2.7304537371651465, "grad_norm": 0.16135385632514954, "learning_rate": 2.994964838853598e-06, "loss": 0.0473, "step": 190400 }, { "epoch": 2.7308122526243332, "grad_norm": 0.04304853081703186, "learning_rate": 2.990981325288407e-06, "loss": 0.0447, "step": 190425 }, { "epoch": 2.73117076808352, "grad_norm": 0.38132035732269287, "learning_rate": 2.9869978117232147e-06, "loss": 0.0529, "step": 190450 }, { "epoch": 2.7315292835427063, "grad_norm": 5.9491400718688965, "learning_rate": 2.9830142981580234e-06, "loss": 0.0822, "step": 190475 }, { "epoch": 2.731887799001893, "grad_norm": 0.010409179143607616, "learning_rate": 2.979030784592832e-06, "loss": 0.0531, "step": 190500 }, { "epoch": 2.7322463144610794, "grad_norm": 1.320574402809143, "learning_rate": 2.9750472710276403e-06, "loss": 0.0454, "step": 190525 }, { "epoch": 2.732604829920266, "grad_norm": 8.107450485229492, "learning_rate": 2.971063757462449e-06, "loss": 0.0574, "step": 190550 }, { "epoch": 2.732963345379453, "grad_norm": 0.6367844343185425, "learning_rate": 2.9670802438972573e-06, "loss": 0.1311, "step": 190575 }, { "epoch": 2.7333218608386396, "grad_norm": 8.098773002624512, "learning_rate": 2.9630967303320656e-06, "loss": 0.1116, "step": 190600 }, { "epoch": 2.733680376297826, "grad_norm": 1.8926169872283936, "learning_rate": 2.9591132167668742e-06, "loss": 0.0962, "step": 190625 }, { "epoch": 2.7340388917570126, "grad_norm": 8.380864143371582, "learning_rate": 2.955129703201683e-06, "loss": 0.0424, "step": 190650 }, { "epoch": 2.734397407216199, "grad_norm": 0.6799733638763428, "learning_rate": 2.951146189636491e-06, "loss": 0.0469, "step": 190675 }, { "epoch": 2.7347559226753857, "grad_norm": 1.5529356002807617, "learning_rate": 2.9471626760713e-06, "loss": 0.0701, "step": 190700 }, { "epoch": 2.7351144381345724, "grad_norm": 29.63248634338379, "learning_rate": 2.943179162506108e-06, "loss": 0.0992, "step": 190725 }, { "epoch": 2.735472953593759, "grad_norm": 0.06447996944189072, "learning_rate": 2.9391956489409164e-06, "loss": 0.0842, "step": 190750 }, { "epoch": 2.7358314690529455, "grad_norm": 12.411834716796875, "learning_rate": 2.935212135375725e-06, "loss": 0.0838, "step": 190775 }, { "epoch": 2.7361899845121322, "grad_norm": 13.866804122924805, "learning_rate": 2.931228621810534e-06, "loss": 0.1829, "step": 190800 }, { "epoch": 2.7365484999713185, "grad_norm": 0.5734347701072693, "learning_rate": 2.927245108245342e-06, "loss": 0.0667, "step": 190825 }, { "epoch": 2.7369070154305053, "grad_norm": 8.561326026916504, "learning_rate": 2.9232615946801504e-06, "loss": 0.038, "step": 190850 }, { "epoch": 2.737265530889692, "grad_norm": 9.743025779724121, "learning_rate": 2.919278081114959e-06, "loss": 0.1147, "step": 190875 }, { "epoch": 2.737624046348879, "grad_norm": 0.011416064575314522, "learning_rate": 2.9152945675497673e-06, "loss": 0.0948, "step": 190900 }, { "epoch": 2.737982561808065, "grad_norm": 12.884407997131348, "learning_rate": 2.911311053984576e-06, "loss": 0.1224, "step": 190925 }, { "epoch": 2.738341077267252, "grad_norm": 0.8603694438934326, "learning_rate": 2.9073275404193847e-06, "loss": 0.0776, "step": 190950 }, { "epoch": 2.738699592726438, "grad_norm": 0.5904979705810547, "learning_rate": 2.903344026854193e-06, "loss": 0.182, "step": 190975 }, { "epoch": 2.739058108185625, "grad_norm": 2.691927433013916, "learning_rate": 2.8993605132890012e-06, "loss": 0.0536, "step": 191000 }, { "epoch": 2.7394166236448116, "grad_norm": 0.22274498641490936, "learning_rate": 2.89537699972381e-06, "loss": 0.0901, "step": 191025 }, { "epoch": 2.7397751391039984, "grad_norm": 6.111625671386719, "learning_rate": 2.891393486158618e-06, "loss": 0.0585, "step": 191050 }, { "epoch": 2.7401336545631847, "grad_norm": 5.51430606842041, "learning_rate": 2.887409972593427e-06, "loss": 0.0706, "step": 191075 }, { "epoch": 2.7404921700223714, "grad_norm": 0.43966493010520935, "learning_rate": 2.8834264590282356e-06, "loss": 0.0583, "step": 191100 }, { "epoch": 2.7408506854815577, "grad_norm": 4.241650581359863, "learning_rate": 2.8794429454630434e-06, "loss": 0.1181, "step": 191125 }, { "epoch": 2.7412092009407445, "grad_norm": 3.93121075630188, "learning_rate": 2.875459431897852e-06, "loss": 0.0908, "step": 191150 }, { "epoch": 2.7415677163999312, "grad_norm": 6.166042804718018, "learning_rate": 2.871475918332661e-06, "loss": 0.0995, "step": 191175 }, { "epoch": 2.741926231859118, "grad_norm": 5.335409641265869, "learning_rate": 2.867492404767469e-06, "loss": 0.0957, "step": 191200 }, { "epoch": 2.7422847473183043, "grad_norm": 12.75605583190918, "learning_rate": 2.8635088912022778e-06, "loss": 0.0719, "step": 191225 }, { "epoch": 2.742643262777491, "grad_norm": 1.5051686763763428, "learning_rate": 2.859525377637086e-06, "loss": 0.1005, "step": 191250 }, { "epoch": 2.7430017782366773, "grad_norm": 9.576895713806152, "learning_rate": 2.8555418640718943e-06, "loss": 0.0758, "step": 191275 }, { "epoch": 2.743360293695864, "grad_norm": 2.7004401683807373, "learning_rate": 2.851558350506703e-06, "loss": 0.0566, "step": 191300 }, { "epoch": 2.743718809155051, "grad_norm": 19.26133155822754, "learning_rate": 2.8475748369415117e-06, "loss": 0.1589, "step": 191325 }, { "epoch": 2.7440773246142376, "grad_norm": 0.048133838921785355, "learning_rate": 2.84359132337632e-06, "loss": 0.0836, "step": 191350 }, { "epoch": 2.744435840073424, "grad_norm": 7.468542098999023, "learning_rate": 2.8396078098111286e-06, "loss": 0.082, "step": 191375 }, { "epoch": 2.7447943555326106, "grad_norm": 3.4207425117492676, "learning_rate": 2.835624296245937e-06, "loss": 0.1844, "step": 191400 }, { "epoch": 2.745152870991797, "grad_norm": 0.2205982506275177, "learning_rate": 2.831640782680745e-06, "loss": 0.1345, "step": 191425 }, { "epoch": 2.7455113864509837, "grad_norm": 1.1143856048583984, "learning_rate": 2.827657269115554e-06, "loss": 0.0901, "step": 191450 }, { "epoch": 2.7458699019101704, "grad_norm": 5.638420104980469, "learning_rate": 2.8236737555503626e-06, "loss": 0.0765, "step": 191475 }, { "epoch": 2.746228417369357, "grad_norm": 20.530776977539062, "learning_rate": 2.819690241985171e-06, "loss": 0.1232, "step": 191500 }, { "epoch": 2.7465869328285435, "grad_norm": 6.85328483581543, "learning_rate": 2.815706728419979e-06, "loss": 0.0571, "step": 191525 }, { "epoch": 2.7469454482877302, "grad_norm": 1.9977291822433472, "learning_rate": 2.811723214854788e-06, "loss": 0.0246, "step": 191550 }, { "epoch": 2.7473039637469165, "grad_norm": 2.580073118209839, "learning_rate": 2.807739701289596e-06, "loss": 0.0361, "step": 191575 }, { "epoch": 2.7476624792061033, "grad_norm": 4.479025363922119, "learning_rate": 2.8037561877244048e-06, "loss": 0.0435, "step": 191600 }, { "epoch": 2.74802099466529, "grad_norm": 0.531394898891449, "learning_rate": 2.7997726741592134e-06, "loss": 0.0871, "step": 191625 }, { "epoch": 2.748379510124477, "grad_norm": 0.9694546461105347, "learning_rate": 2.7957891605940217e-06, "loss": 0.0808, "step": 191650 }, { "epoch": 2.748738025583663, "grad_norm": 0.1542690247297287, "learning_rate": 2.79180564702883e-06, "loss": 0.046, "step": 191675 }, { "epoch": 2.74909654104285, "grad_norm": 0.42647621035575867, "learning_rate": 2.7878221334636387e-06, "loss": 0.0525, "step": 191700 }, { "epoch": 2.749455056502036, "grad_norm": 12.502997398376465, "learning_rate": 2.783838619898447e-06, "loss": 0.0992, "step": 191725 }, { "epoch": 2.749813571961223, "grad_norm": 3.736910343170166, "learning_rate": 2.7798551063332556e-06, "loss": 0.058, "step": 191750 }, { "epoch": 2.7501720874204096, "grad_norm": 2.4471137523651123, "learning_rate": 2.7758715927680643e-06, "loss": 0.1118, "step": 191775 }, { "epoch": 2.7505306028795964, "grad_norm": 10.494178771972656, "learning_rate": 2.771888079202872e-06, "loss": 0.0902, "step": 191800 }, { "epoch": 2.7508891183387827, "grad_norm": 15.249906539916992, "learning_rate": 2.767904565637681e-06, "loss": 0.0922, "step": 191825 }, { "epoch": 2.7512476337979694, "grad_norm": 1.0048906803131104, "learning_rate": 2.7639210520724896e-06, "loss": 0.0818, "step": 191850 }, { "epoch": 2.7516061492571557, "grad_norm": 0.725792646408081, "learning_rate": 2.759937538507298e-06, "loss": 0.0555, "step": 191875 }, { "epoch": 2.7519646647163425, "grad_norm": 15.269486427307129, "learning_rate": 2.7559540249421065e-06, "loss": 0.1007, "step": 191900 }, { "epoch": 2.7523231801755292, "grad_norm": 0.042893584817647934, "learning_rate": 2.7519705113769148e-06, "loss": 0.0708, "step": 191925 }, { "epoch": 2.752681695634716, "grad_norm": 0.5382365584373474, "learning_rate": 2.747986997811723e-06, "loss": 0.1026, "step": 191950 }, { "epoch": 2.7530402110939023, "grad_norm": 4.133412837982178, "learning_rate": 2.7440034842465317e-06, "loss": 0.0382, "step": 191975 }, { "epoch": 2.753398726553089, "grad_norm": 0.9195826649665833, "learning_rate": 2.74001997068134e-06, "loss": 0.1342, "step": 192000 }, { "epoch": 2.7537572420122753, "grad_norm": 0.7397291660308838, "learning_rate": 2.7360364571161487e-06, "loss": 0.0619, "step": 192025 }, { "epoch": 2.754115757471462, "grad_norm": 16.642786026000977, "learning_rate": 2.7320529435509574e-06, "loss": 0.1596, "step": 192050 }, { "epoch": 2.754474272930649, "grad_norm": 0.026099104434251785, "learning_rate": 2.7280694299857652e-06, "loss": 0.1037, "step": 192075 }, { "epoch": 2.7548327883898356, "grad_norm": 8.281732559204102, "learning_rate": 2.724085916420574e-06, "loss": 0.086, "step": 192100 }, { "epoch": 2.755191303849022, "grad_norm": 0.5247018933296204, "learning_rate": 2.7201024028553826e-06, "loss": 0.1239, "step": 192125 }, { "epoch": 2.7555498193082086, "grad_norm": 0.07991501688957214, "learning_rate": 2.716118889290191e-06, "loss": 0.049, "step": 192150 }, { "epoch": 2.755908334767395, "grad_norm": 0.13344301283359528, "learning_rate": 2.7121353757249996e-06, "loss": 0.0621, "step": 192175 }, { "epoch": 2.7562668502265817, "grad_norm": 2.6525216102600098, "learning_rate": 2.7081518621598083e-06, "loss": 0.075, "step": 192200 }, { "epoch": 2.7566253656857684, "grad_norm": 1.4762067794799805, "learning_rate": 2.704168348594616e-06, "loss": 0.0894, "step": 192225 }, { "epoch": 2.756983881144955, "grad_norm": 0.4596113860607147, "learning_rate": 2.700184835029425e-06, "loss": 0.0845, "step": 192250 }, { "epoch": 2.7573423966041415, "grad_norm": 0.48553523421287537, "learning_rate": 2.6962013214642335e-06, "loss": 0.1182, "step": 192275 }, { "epoch": 2.7577009120633282, "grad_norm": 0.7748169302940369, "learning_rate": 2.6922178078990418e-06, "loss": 0.0659, "step": 192300 }, { "epoch": 2.7580594275225145, "grad_norm": 0.00908211711794138, "learning_rate": 2.6882342943338505e-06, "loss": 0.1088, "step": 192325 }, { "epoch": 2.7584179429817013, "grad_norm": 1.0261166095733643, "learning_rate": 2.6842507807686587e-06, "loss": 0.0818, "step": 192350 }, { "epoch": 2.758776458440888, "grad_norm": 24.93418312072754, "learning_rate": 2.680267267203467e-06, "loss": 0.0726, "step": 192375 }, { "epoch": 2.759134973900075, "grad_norm": 2.515131711959839, "learning_rate": 2.6762837536382757e-06, "loss": 0.0831, "step": 192400 }, { "epoch": 2.759493489359261, "grad_norm": 11.158872604370117, "learning_rate": 2.6723002400730844e-06, "loss": 0.0876, "step": 192425 }, { "epoch": 2.759852004818448, "grad_norm": 10.229462623596191, "learning_rate": 2.6683167265078926e-06, "loss": 0.1201, "step": 192450 }, { "epoch": 2.760210520277634, "grad_norm": 1.5488977432250977, "learning_rate": 2.6643332129427013e-06, "loss": 0.0724, "step": 192475 }, { "epoch": 2.760569035736821, "grad_norm": 1.3086957931518555, "learning_rate": 2.6603496993775096e-06, "loss": 0.0618, "step": 192500 }, { "epoch": 2.7609275511960076, "grad_norm": 0.23000817000865936, "learning_rate": 2.656366185812318e-06, "loss": 0.0487, "step": 192525 }, { "epoch": 2.7612860666551944, "grad_norm": 17.658910751342773, "learning_rate": 2.6523826722471266e-06, "loss": 0.1609, "step": 192550 }, { "epoch": 2.7616445821143807, "grad_norm": 1.597326636314392, "learning_rate": 2.6483991586819353e-06, "loss": 0.0384, "step": 192575 }, { "epoch": 2.7620030975735674, "grad_norm": 22.21410369873047, "learning_rate": 2.6444156451167435e-06, "loss": 0.0927, "step": 192600 }, { "epoch": 2.7623616130327537, "grad_norm": 0.8665553331375122, "learning_rate": 2.640432131551552e-06, "loss": 0.0691, "step": 192625 }, { "epoch": 2.7627201284919405, "grad_norm": 3.623789072036743, "learning_rate": 2.6364486179863605e-06, "loss": 0.1121, "step": 192650 }, { "epoch": 2.7630786439511272, "grad_norm": 8.238901138305664, "learning_rate": 2.6324651044211688e-06, "loss": 0.0738, "step": 192675 }, { "epoch": 2.763437159410314, "grad_norm": 2.8928728103637695, "learning_rate": 2.6284815908559774e-06, "loss": 0.0559, "step": 192700 }, { "epoch": 2.7637956748695003, "grad_norm": 6.045582294464111, "learning_rate": 2.624498077290786e-06, "loss": 0.0586, "step": 192725 }, { "epoch": 2.764154190328687, "grad_norm": 1.933014154434204, "learning_rate": 2.6205145637255944e-06, "loss": 0.0608, "step": 192750 }, { "epoch": 2.7645127057878733, "grad_norm": 12.28243637084961, "learning_rate": 2.6165310501604027e-06, "loss": 0.0717, "step": 192775 }, { "epoch": 2.76487122124706, "grad_norm": 0.15189428627490997, "learning_rate": 2.6125475365952114e-06, "loss": 0.0641, "step": 192800 }, { "epoch": 2.765229736706247, "grad_norm": 8.710440635681152, "learning_rate": 2.6085640230300196e-06, "loss": 0.1314, "step": 192825 }, { "epoch": 2.7655882521654336, "grad_norm": 0.6860362887382507, "learning_rate": 2.6045805094648283e-06, "loss": 0.0826, "step": 192850 }, { "epoch": 2.76594676762462, "grad_norm": 0.2190437614917755, "learning_rate": 2.600596995899637e-06, "loss": 0.061, "step": 192875 }, { "epoch": 2.7663052830838066, "grad_norm": 14.686956405639648, "learning_rate": 2.596613482334445e-06, "loss": 0.1119, "step": 192900 }, { "epoch": 2.766663798542993, "grad_norm": 5.916624069213867, "learning_rate": 2.5926299687692536e-06, "loss": 0.0649, "step": 192925 }, { "epoch": 2.7670223140021797, "grad_norm": 7.345480918884277, "learning_rate": 2.5886464552040622e-06, "loss": 0.0689, "step": 192950 }, { "epoch": 2.7673808294613664, "grad_norm": 11.157353401184082, "learning_rate": 2.5846629416388705e-06, "loss": 0.069, "step": 192975 }, { "epoch": 2.767739344920553, "grad_norm": 7.264477252960205, "learning_rate": 2.580679428073679e-06, "loss": 0.1064, "step": 193000 }, { "epoch": 2.7680978603797395, "grad_norm": 4.5190300941467285, "learning_rate": 2.576695914508488e-06, "loss": 0.0735, "step": 193025 }, { "epoch": 2.7684563758389262, "grad_norm": 23.56686782836914, "learning_rate": 2.5727124009432957e-06, "loss": 0.1518, "step": 193050 }, { "epoch": 2.7688148912981125, "grad_norm": 20.940784454345703, "learning_rate": 2.5687288873781044e-06, "loss": 0.1137, "step": 193075 }, { "epoch": 2.7691734067572993, "grad_norm": 1.2564910650253296, "learning_rate": 2.564745373812913e-06, "loss": 0.0309, "step": 193100 }, { "epoch": 2.769531922216486, "grad_norm": 0.6956694722175598, "learning_rate": 2.5607618602477214e-06, "loss": 0.0894, "step": 193125 }, { "epoch": 2.769890437675673, "grad_norm": 3.21496319770813, "learning_rate": 2.55677834668253e-06, "loss": 0.0556, "step": 193150 }, { "epoch": 2.770248953134859, "grad_norm": 0.10032190382480621, "learning_rate": 2.5527948331173383e-06, "loss": 0.073, "step": 193175 }, { "epoch": 2.770607468594046, "grad_norm": 0.6442245244979858, "learning_rate": 2.5488113195521466e-06, "loss": 0.0826, "step": 193200 }, { "epoch": 2.770965984053232, "grad_norm": 0.3856396973133087, "learning_rate": 2.5448278059869553e-06, "loss": 0.0782, "step": 193225 }, { "epoch": 2.771324499512419, "grad_norm": 0.117954321205616, "learning_rate": 2.540844292421764e-06, "loss": 0.0268, "step": 193250 }, { "epoch": 2.7716830149716056, "grad_norm": 1.1441221237182617, "learning_rate": 2.5368607788565723e-06, "loss": 0.0698, "step": 193275 }, { "epoch": 2.7720415304307924, "grad_norm": 1.4471948146820068, "learning_rate": 2.532877265291381e-06, "loss": 0.0616, "step": 193300 }, { "epoch": 2.7724000458899787, "grad_norm": 19.005449295043945, "learning_rate": 2.5288937517261892e-06, "loss": 0.065, "step": 193325 }, { "epoch": 2.7727585613491654, "grad_norm": 1.7309640645980835, "learning_rate": 2.5249102381609975e-06, "loss": 0.076, "step": 193350 }, { "epoch": 2.7731170768083517, "grad_norm": 1.6248606443405151, "learning_rate": 2.520926724595806e-06, "loss": 0.0394, "step": 193375 }, { "epoch": 2.7734755922675385, "grad_norm": 1.2371457815170288, "learning_rate": 2.516943211030615e-06, "loss": 0.1512, "step": 193400 }, { "epoch": 2.7738341077267252, "grad_norm": 6.38738489151001, "learning_rate": 2.512959697465423e-06, "loss": 0.1022, "step": 193425 }, { "epoch": 2.774192623185912, "grad_norm": 0.433059960603714, "learning_rate": 2.5089761839002314e-06, "loss": 0.0553, "step": 193450 }, { "epoch": 2.7745511386450983, "grad_norm": 0.4839921295642853, "learning_rate": 2.50499267033504e-06, "loss": 0.0404, "step": 193475 }, { "epoch": 2.774909654104285, "grad_norm": 1.1828454732894897, "learning_rate": 2.5010091567698484e-06, "loss": 0.1177, "step": 193500 }, { "epoch": 2.7752681695634713, "grad_norm": 5.166435718536377, "learning_rate": 2.497025643204657e-06, "loss": 0.0888, "step": 193525 }, { "epoch": 2.775626685022658, "grad_norm": 1.0173969268798828, "learning_rate": 2.4930421296394658e-06, "loss": 0.0868, "step": 193550 }, { "epoch": 2.775985200481845, "grad_norm": 1.404179573059082, "learning_rate": 2.489058616074274e-06, "loss": 0.102, "step": 193575 }, { "epoch": 2.7763437159410316, "grad_norm": 0.8533747792243958, "learning_rate": 2.4850751025090823e-06, "loss": 0.0315, "step": 193600 }, { "epoch": 2.776702231400218, "grad_norm": 21.758750915527344, "learning_rate": 2.481091588943891e-06, "loss": 0.0707, "step": 193625 }, { "epoch": 2.7770607468594046, "grad_norm": 0.4751928150653839, "learning_rate": 2.4771080753786993e-06, "loss": 0.1275, "step": 193650 }, { "epoch": 2.777419262318591, "grad_norm": 3.9770731925964355, "learning_rate": 2.473124561813508e-06, "loss": 0.1069, "step": 193675 }, { "epoch": 2.7777777777777777, "grad_norm": 15.572744369506836, "learning_rate": 2.4691410482483166e-06, "loss": 0.1032, "step": 193700 }, { "epoch": 2.7781362932369644, "grad_norm": 0.25679725408554077, "learning_rate": 2.4651575346831245e-06, "loss": 0.0726, "step": 193725 }, { "epoch": 2.778494808696151, "grad_norm": 10.520703315734863, "learning_rate": 2.461174021117933e-06, "loss": 0.061, "step": 193750 }, { "epoch": 2.7788533241553375, "grad_norm": 0.38976216316223145, "learning_rate": 2.457190507552742e-06, "loss": 0.1488, "step": 193775 }, { "epoch": 2.7792118396145242, "grad_norm": 6.260612964630127, "learning_rate": 2.45320699398755e-06, "loss": 0.0575, "step": 193800 }, { "epoch": 2.7795703550737105, "grad_norm": 14.633556365966797, "learning_rate": 2.449223480422359e-06, "loss": 0.0676, "step": 193825 }, { "epoch": 2.7799288705328973, "grad_norm": 4.46860933303833, "learning_rate": 2.4452399668571675e-06, "loss": 0.0348, "step": 193850 }, { "epoch": 2.780287385992084, "grad_norm": 1.146621584892273, "learning_rate": 2.4412564532919754e-06, "loss": 0.0991, "step": 193875 }, { "epoch": 2.7806459014512708, "grad_norm": 3.5826666355133057, "learning_rate": 2.437272939726784e-06, "loss": 0.0543, "step": 193900 }, { "epoch": 2.781004416910457, "grad_norm": 5.841976165771484, "learning_rate": 2.4332894261615927e-06, "loss": 0.0954, "step": 193925 }, { "epoch": 2.781362932369644, "grad_norm": 10.113981246948242, "learning_rate": 2.429305912596401e-06, "loss": 0.0522, "step": 193950 }, { "epoch": 2.78172144782883, "grad_norm": 7.22070837020874, "learning_rate": 2.4253223990312097e-06, "loss": 0.0291, "step": 193975 }, { "epoch": 2.782079963288017, "grad_norm": 1.1549798250198364, "learning_rate": 2.421338885466018e-06, "loss": 0.0702, "step": 194000 }, { "epoch": 2.7824384787472036, "grad_norm": 2.5217413902282715, "learning_rate": 2.4173553719008262e-06, "loss": 0.1424, "step": 194025 }, { "epoch": 2.7827969942063904, "grad_norm": 0.8689970374107361, "learning_rate": 2.413371858335635e-06, "loss": 0.0667, "step": 194050 }, { "epoch": 2.7831555096655767, "grad_norm": 3.9103832244873047, "learning_rate": 2.4093883447704436e-06, "loss": 0.076, "step": 194075 }, { "epoch": 2.7835140251247634, "grad_norm": 8.385714530944824, "learning_rate": 2.405404831205252e-06, "loss": 0.1342, "step": 194100 }, { "epoch": 2.7838725405839497, "grad_norm": 28.155141830444336, "learning_rate": 2.4014213176400606e-06, "loss": 0.0926, "step": 194125 }, { "epoch": 2.7842310560431365, "grad_norm": 0.21970342099666595, "learning_rate": 2.397437804074869e-06, "loss": 0.1167, "step": 194150 }, { "epoch": 2.7845895715023232, "grad_norm": 1.8143937587738037, "learning_rate": 2.393454290509677e-06, "loss": 0.1451, "step": 194175 }, { "epoch": 2.78494808696151, "grad_norm": 0.5164679884910583, "learning_rate": 2.389470776944486e-06, "loss": 0.0659, "step": 194200 }, { "epoch": 2.7853066024206963, "grad_norm": 0.7903498411178589, "learning_rate": 2.3854872633792945e-06, "loss": 0.0555, "step": 194225 }, { "epoch": 2.785665117879883, "grad_norm": 4.854432582855225, "learning_rate": 2.3815037498141028e-06, "loss": 0.1257, "step": 194250 }, { "epoch": 2.7860236333390693, "grad_norm": 0.09207174181938171, "learning_rate": 2.377520236248911e-06, "loss": 0.0596, "step": 194275 }, { "epoch": 2.786382148798256, "grad_norm": 2.683924674987793, "learning_rate": 2.3735367226837197e-06, "loss": 0.123, "step": 194300 }, { "epoch": 2.786740664257443, "grad_norm": 11.205132484436035, "learning_rate": 2.369553209118528e-06, "loss": 0.0431, "step": 194325 }, { "epoch": 2.7870991797166296, "grad_norm": 5.057281970977783, "learning_rate": 2.3655696955533367e-06, "loss": 0.0792, "step": 194350 }, { "epoch": 2.787457695175816, "grad_norm": 0.03641526401042938, "learning_rate": 2.3615861819881454e-06, "loss": 0.0842, "step": 194375 }, { "epoch": 2.7878162106350026, "grad_norm": 0.37116435170173645, "learning_rate": 2.3576026684229537e-06, "loss": 0.1198, "step": 194400 }, { "epoch": 2.788174726094189, "grad_norm": 1.2468732595443726, "learning_rate": 2.353619154857762e-06, "loss": 0.0569, "step": 194425 }, { "epoch": 2.7885332415533757, "grad_norm": 0.6199888586997986, "learning_rate": 2.3496356412925706e-06, "loss": 0.0812, "step": 194450 }, { "epoch": 2.7888917570125624, "grad_norm": 21.633440017700195, "learning_rate": 2.345652127727379e-06, "loss": 0.1663, "step": 194475 }, { "epoch": 2.789250272471749, "grad_norm": 1.4640741348266602, "learning_rate": 2.3416686141621876e-06, "loss": 0.0917, "step": 194500 }, { "epoch": 2.7896087879309355, "grad_norm": 8.315271377563477, "learning_rate": 2.3376851005969963e-06, "loss": 0.0745, "step": 194525 }, { "epoch": 2.789967303390122, "grad_norm": 16.20746421813965, "learning_rate": 2.333701587031804e-06, "loss": 0.0926, "step": 194550 }, { "epoch": 2.7903258188493085, "grad_norm": 7.880843162536621, "learning_rate": 2.329718073466613e-06, "loss": 0.0986, "step": 194575 }, { "epoch": 2.7906843343084953, "grad_norm": 0.1662873476743698, "learning_rate": 2.3257345599014215e-06, "loss": 0.1433, "step": 194600 }, { "epoch": 2.791042849767682, "grad_norm": 2.444643974304199, "learning_rate": 2.3217510463362298e-06, "loss": 0.0878, "step": 194625 }, { "epoch": 2.7914013652268688, "grad_norm": 0.47176027297973633, "learning_rate": 2.3177675327710385e-06, "loss": 0.0856, "step": 194650 }, { "epoch": 2.791759880686055, "grad_norm": 0.787897527217865, "learning_rate": 2.313784019205847e-06, "loss": 0.1399, "step": 194675 }, { "epoch": 2.792118396145242, "grad_norm": 0.3480183780193329, "learning_rate": 2.309800505640655e-06, "loss": 0.0356, "step": 194700 }, { "epoch": 2.792476911604428, "grad_norm": 0.10065653175115585, "learning_rate": 2.3058169920754637e-06, "loss": 0.0969, "step": 194725 }, { "epoch": 2.792835427063615, "grad_norm": 1.4493727684020996, "learning_rate": 2.3018334785102724e-06, "loss": 0.1034, "step": 194750 }, { "epoch": 2.7931939425228016, "grad_norm": 7.628594398498535, "learning_rate": 2.2978499649450806e-06, "loss": 0.035, "step": 194775 }, { "epoch": 2.7935524579819884, "grad_norm": 0.012962725013494492, "learning_rate": 2.2938664513798893e-06, "loss": 0.0538, "step": 194800 }, { "epoch": 2.7939109734411747, "grad_norm": 0.10491228848695755, "learning_rate": 2.2898829378146976e-06, "loss": 0.1045, "step": 194825 }, { "epoch": 2.7942694889003614, "grad_norm": 5.680478096008301, "learning_rate": 2.285899424249506e-06, "loss": 0.0592, "step": 194850 }, { "epoch": 2.7946280043595477, "grad_norm": 2.039092540740967, "learning_rate": 2.2819159106843146e-06, "loss": 0.0688, "step": 194875 }, { "epoch": 2.7949865198187345, "grad_norm": 1.019811749458313, "learning_rate": 2.2779323971191232e-06, "loss": 0.0373, "step": 194900 }, { "epoch": 2.795345035277921, "grad_norm": 17.641260147094727, "learning_rate": 2.2739488835539315e-06, "loss": 0.1202, "step": 194925 }, { "epoch": 2.795703550737108, "grad_norm": 9.267711639404297, "learning_rate": 2.26996536998874e-06, "loss": 0.0557, "step": 194950 }, { "epoch": 2.7960620661962943, "grad_norm": 21.951078414916992, "learning_rate": 2.2659818564235485e-06, "loss": 0.0548, "step": 194975 }, { "epoch": 2.796420581655481, "grad_norm": 0.8484728336334229, "learning_rate": 2.2619983428583567e-06, "loss": 0.0584, "step": 195000 }, { "epoch": 2.7967790971146673, "grad_norm": 4.126589775085449, "learning_rate": 2.2580148292931654e-06, "loss": 0.1135, "step": 195025 }, { "epoch": 2.797137612573854, "grad_norm": 3.8225512504577637, "learning_rate": 2.254031315727974e-06, "loss": 0.0496, "step": 195050 }, { "epoch": 2.797496128033041, "grad_norm": 12.534608840942383, "learning_rate": 2.2500478021627824e-06, "loss": 0.0594, "step": 195075 }, { "epoch": 2.7978546434922276, "grad_norm": 13.46942138671875, "learning_rate": 2.2460642885975907e-06, "loss": 0.0696, "step": 195100 }, { "epoch": 2.798213158951414, "grad_norm": 0.08350681513547897, "learning_rate": 2.2420807750323994e-06, "loss": 0.0893, "step": 195125 }, { "epoch": 2.7985716744106006, "grad_norm": 6.188421249389648, "learning_rate": 2.2380972614672076e-06, "loss": 0.1217, "step": 195150 }, { "epoch": 2.798930189869787, "grad_norm": 0.19646702706813812, "learning_rate": 2.2341137479020163e-06, "loss": 0.1195, "step": 195175 }, { "epoch": 2.7992887053289737, "grad_norm": 13.992904663085938, "learning_rate": 2.230130234336825e-06, "loss": 0.064, "step": 195200 }, { "epoch": 2.7996472207881604, "grad_norm": 2.159763813018799, "learning_rate": 2.2261467207716333e-06, "loss": 0.0666, "step": 195225 }, { "epoch": 2.800005736247347, "grad_norm": 8.168936729431152, "learning_rate": 2.2221632072064415e-06, "loss": 0.0659, "step": 195250 }, { "epoch": 2.8003642517065335, "grad_norm": 3.0140798091888428, "learning_rate": 2.2181796936412502e-06, "loss": 0.088, "step": 195275 }, { "epoch": 2.80072276716572, "grad_norm": 14.538262367248535, "learning_rate": 2.2141961800760585e-06, "loss": 0.1099, "step": 195300 }, { "epoch": 2.8010812826249065, "grad_norm": 9.5266695022583, "learning_rate": 2.210212666510867e-06, "loss": 0.1087, "step": 195325 }, { "epoch": 2.8014397980840933, "grad_norm": 8.2529878616333, "learning_rate": 2.206229152945676e-06, "loss": 0.0888, "step": 195350 }, { "epoch": 2.80179831354328, "grad_norm": 5.69735050201416, "learning_rate": 2.2022456393804837e-06, "loss": 0.1206, "step": 195375 }, { "epoch": 2.8021568290024668, "grad_norm": 5.946120738983154, "learning_rate": 2.1982621258152924e-06, "loss": 0.1092, "step": 195400 }, { "epoch": 2.802515344461653, "grad_norm": 0.2307732254266739, "learning_rate": 2.194278612250101e-06, "loss": 0.0843, "step": 195425 }, { "epoch": 2.80287385992084, "grad_norm": 1.4794188737869263, "learning_rate": 2.1902950986849094e-06, "loss": 0.0596, "step": 195450 }, { "epoch": 2.803232375380026, "grad_norm": 14.677102088928223, "learning_rate": 2.186311585119718e-06, "loss": 0.0459, "step": 195475 }, { "epoch": 2.803590890839213, "grad_norm": 14.599203109741211, "learning_rate": 2.1823280715545268e-06, "loss": 0.0538, "step": 195500 }, { "epoch": 2.8039494062983996, "grad_norm": 16.704463958740234, "learning_rate": 2.1783445579893346e-06, "loss": 0.0764, "step": 195525 }, { "epoch": 2.8043079217575864, "grad_norm": 6.282632827758789, "learning_rate": 2.1743610444241433e-06, "loss": 0.0472, "step": 195550 }, { "epoch": 2.8046664372167727, "grad_norm": 1.0102108716964722, "learning_rate": 2.170377530858952e-06, "loss": 0.101, "step": 195575 }, { "epoch": 2.8050249526759594, "grad_norm": 3.61702036857605, "learning_rate": 2.1663940172937603e-06, "loss": 0.0327, "step": 195600 }, { "epoch": 2.8053834681351457, "grad_norm": 4.979215145111084, "learning_rate": 2.162410503728569e-06, "loss": 0.1122, "step": 195625 }, { "epoch": 2.8057419835943325, "grad_norm": 0.4117927849292755, "learning_rate": 2.1584269901633772e-06, "loss": 0.0463, "step": 195650 }, { "epoch": 2.806100499053519, "grad_norm": 0.6441653370857239, "learning_rate": 2.1544434765981855e-06, "loss": 0.0488, "step": 195675 }, { "epoch": 2.806459014512706, "grad_norm": 11.692255020141602, "learning_rate": 2.150459963032994e-06, "loss": 0.0867, "step": 195700 }, { "epoch": 2.8068175299718923, "grad_norm": 0.16368266940116882, "learning_rate": 2.146476449467803e-06, "loss": 0.1161, "step": 195725 }, { "epoch": 2.807176045431079, "grad_norm": 0.15613095462322235, "learning_rate": 2.142492935902611e-06, "loss": 0.0466, "step": 195750 }, { "epoch": 2.8075345608902653, "grad_norm": 0.17626331746578217, "learning_rate": 2.13850942233742e-06, "loss": 0.174, "step": 195775 }, { "epoch": 2.807893076349452, "grad_norm": 8.11132526397705, "learning_rate": 2.134525908772228e-06, "loss": 0.0678, "step": 195800 }, { "epoch": 2.808251591808639, "grad_norm": 4.472731113433838, "learning_rate": 2.1305423952070364e-06, "loss": 0.0837, "step": 195825 }, { "epoch": 2.8086101072678256, "grad_norm": 0.04531437158584595, "learning_rate": 2.126558881641845e-06, "loss": 0.0321, "step": 195850 }, { "epoch": 2.808968622727012, "grad_norm": 0.8241557478904724, "learning_rate": 2.1225753680766538e-06, "loss": 0.0626, "step": 195875 }, { "epoch": 2.8093271381861986, "grad_norm": 10.097281455993652, "learning_rate": 2.118591854511462e-06, "loss": 0.1711, "step": 195900 }, { "epoch": 2.809685653645385, "grad_norm": 7.523350715637207, "learning_rate": 2.1146083409462703e-06, "loss": 0.0846, "step": 195925 }, { "epoch": 2.8100441691045717, "grad_norm": 7.453779220581055, "learning_rate": 2.110624827381079e-06, "loss": 0.0375, "step": 195950 }, { "epoch": 2.8104026845637584, "grad_norm": 0.09068239480257034, "learning_rate": 2.1066413138158872e-06, "loss": 0.0937, "step": 195975 }, { "epoch": 2.810761200022945, "grad_norm": 2.8361237049102783, "learning_rate": 2.102657800250696e-06, "loss": 0.0787, "step": 196000 }, { "epoch": 2.8111197154821315, "grad_norm": 0.9380650520324707, "learning_rate": 2.0986742866855046e-06, "loss": 0.0962, "step": 196025 }, { "epoch": 2.811478230941318, "grad_norm": 18.435546875, "learning_rate": 2.094690773120313e-06, "loss": 0.0446, "step": 196050 }, { "epoch": 2.8118367464005045, "grad_norm": 3.8449277877807617, "learning_rate": 2.090707259555121e-06, "loss": 0.0326, "step": 196075 }, { "epoch": 2.8121952618596913, "grad_norm": 0.30659785866737366, "learning_rate": 2.08672374598993e-06, "loss": 0.2142, "step": 196100 }, { "epoch": 2.812553777318878, "grad_norm": 2.93762469291687, "learning_rate": 2.082740232424738e-06, "loss": 0.0882, "step": 196125 }, { "epoch": 2.8129122927780648, "grad_norm": 8.698197364807129, "learning_rate": 2.078756718859547e-06, "loss": 0.1024, "step": 196150 }, { "epoch": 2.813270808237251, "grad_norm": 0.028761090710759163, "learning_rate": 2.0747732052943555e-06, "loss": 0.0665, "step": 196175 }, { "epoch": 2.813629323696438, "grad_norm": 9.726408004760742, "learning_rate": 2.0707896917291634e-06, "loss": 0.0463, "step": 196200 }, { "epoch": 2.813987839155624, "grad_norm": 19.733673095703125, "learning_rate": 2.066806178163972e-06, "loss": 0.1157, "step": 196225 }, { "epoch": 2.814346354614811, "grad_norm": 4.503650188446045, "learning_rate": 2.0628226645987807e-06, "loss": 0.1181, "step": 196250 }, { "epoch": 2.8147048700739976, "grad_norm": 2.8688769340515137, "learning_rate": 2.058839151033589e-06, "loss": 0.1184, "step": 196275 }, { "epoch": 2.8150633855331844, "grad_norm": 4.9878621101379395, "learning_rate": 2.0548556374683977e-06, "loss": 0.0972, "step": 196300 }, { "epoch": 2.8154219009923707, "grad_norm": 0.11187092214822769, "learning_rate": 2.050872123903206e-06, "loss": 0.0559, "step": 196325 }, { "epoch": 2.8157804164515574, "grad_norm": 7.174285888671875, "learning_rate": 2.0468886103380142e-06, "loss": 0.0449, "step": 196350 }, { "epoch": 2.8161389319107437, "grad_norm": 6.79325008392334, "learning_rate": 2.042905096772823e-06, "loss": 0.0548, "step": 196375 }, { "epoch": 2.8164974473699305, "grad_norm": 11.209386825561523, "learning_rate": 2.038921583207631e-06, "loss": 0.042, "step": 196400 }, { "epoch": 2.816855962829117, "grad_norm": 1.7270654439926147, "learning_rate": 2.03493806964244e-06, "loss": 0.0622, "step": 196425 }, { "epoch": 2.817214478288304, "grad_norm": 17.675796508789062, "learning_rate": 2.0309545560772486e-06, "loss": 0.0818, "step": 196450 }, { "epoch": 2.8175729937474903, "grad_norm": 5.724522590637207, "learning_rate": 2.0269710425120564e-06, "loss": 0.0437, "step": 196475 }, { "epoch": 2.817931509206677, "grad_norm": 2.334113121032715, "learning_rate": 2.022987528946865e-06, "loss": 0.0748, "step": 196500 }, { "epoch": 2.8182900246658638, "grad_norm": 2.1305885314941406, "learning_rate": 2.019004015381674e-06, "loss": 0.0537, "step": 196525 }, { "epoch": 2.81864854012505, "grad_norm": 0.17775298655033112, "learning_rate": 2.015020501816482e-06, "loss": 0.1733, "step": 196550 }, { "epoch": 2.819007055584237, "grad_norm": 16.324661254882812, "learning_rate": 2.0110369882512908e-06, "loss": 0.0909, "step": 196575 }, { "epoch": 2.8193655710434236, "grad_norm": 0.12577898800373077, "learning_rate": 2.0070534746860995e-06, "loss": 0.0873, "step": 196600 }, { "epoch": 2.81972408650261, "grad_norm": 18.335996627807617, "learning_rate": 2.0030699611209073e-06, "loss": 0.0688, "step": 196625 }, { "epoch": 2.8200826019617966, "grad_norm": 0.5258516073226929, "learning_rate": 1.999086447555716e-06, "loss": 0.0486, "step": 196650 }, { "epoch": 2.8204411174209834, "grad_norm": 12.096571922302246, "learning_rate": 1.9951029339905247e-06, "loss": 0.072, "step": 196675 }, { "epoch": 2.8207996328801697, "grad_norm": 2.3015286922454834, "learning_rate": 1.991119420425333e-06, "loss": 0.0959, "step": 196700 }, { "epoch": 2.8211581483393564, "grad_norm": 0.4986613392829895, "learning_rate": 1.9871359068601416e-06, "loss": 0.0392, "step": 196725 }, { "epoch": 2.821516663798543, "grad_norm": 1.0641406774520874, "learning_rate": 1.98315239329495e-06, "loss": 0.0555, "step": 196750 }, { "epoch": 2.8218751792577295, "grad_norm": 0.15271490812301636, "learning_rate": 1.979168879729758e-06, "loss": 0.0445, "step": 196775 }, { "epoch": 2.822233694716916, "grad_norm": 3.310441493988037, "learning_rate": 1.975185366164567e-06, "loss": 0.1154, "step": 196800 }, { "epoch": 2.822592210176103, "grad_norm": 18.595102310180664, "learning_rate": 1.9712018525993756e-06, "loss": 0.1646, "step": 196825 }, { "epoch": 2.8229507256352893, "grad_norm": 5.9162211418151855, "learning_rate": 1.967218339034184e-06, "loss": 0.0639, "step": 196850 }, { "epoch": 2.823309241094476, "grad_norm": 7.2749481201171875, "learning_rate": 1.9632348254689925e-06, "loss": 0.1114, "step": 196875 }, { "epoch": 2.8236677565536628, "grad_norm": 19.521717071533203, "learning_rate": 1.959251311903801e-06, "loss": 0.0749, "step": 196900 }, { "epoch": 2.824026272012849, "grad_norm": 0.1293191760778427, "learning_rate": 1.955267798338609e-06, "loss": 0.0462, "step": 196925 }, { "epoch": 2.824384787472036, "grad_norm": 0.38532698154449463, "learning_rate": 1.9512842847734178e-06, "loss": 0.1094, "step": 196950 }, { "epoch": 2.8247433029312226, "grad_norm": 0.05360395833849907, "learning_rate": 1.9473007712082264e-06, "loss": 0.0585, "step": 196975 }, { "epoch": 2.825101818390409, "grad_norm": 1.205998182296753, "learning_rate": 1.9433172576430347e-06, "loss": 0.0635, "step": 197000 }, { "epoch": 2.8254603338495956, "grad_norm": 6.4408979415893555, "learning_rate": 1.939333744077843e-06, "loss": 0.0632, "step": 197025 }, { "epoch": 2.8258188493087824, "grad_norm": 8.865792274475098, "learning_rate": 1.9353502305126517e-06, "loss": 0.0815, "step": 197050 }, { "epoch": 2.8261773647679687, "grad_norm": 2.7547366619110107, "learning_rate": 1.93136671694746e-06, "loss": 0.0736, "step": 197075 }, { "epoch": 2.8265358802271554, "grad_norm": 4.501839637756348, "learning_rate": 1.9273832033822686e-06, "loss": 0.1332, "step": 197100 }, { "epoch": 2.826894395686342, "grad_norm": 8.392189025878906, "learning_rate": 1.9233996898170773e-06, "loss": 0.0471, "step": 197125 }, { "epoch": 2.8272529111455285, "grad_norm": 17.316654205322266, "learning_rate": 1.9194161762518856e-06, "loss": 0.0453, "step": 197150 }, { "epoch": 2.827611426604715, "grad_norm": 2.2599730491638184, "learning_rate": 1.915432662686694e-06, "loss": 0.0611, "step": 197175 }, { "epoch": 2.827969942063902, "grad_norm": 0.03217492625117302, "learning_rate": 1.9114491491215026e-06, "loss": 0.031, "step": 197200 }, { "epoch": 2.8283284575230883, "grad_norm": 18.504405975341797, "learning_rate": 1.907465635556311e-06, "loss": 0.1092, "step": 197225 }, { "epoch": 2.828686972982275, "grad_norm": 0.415400892496109, "learning_rate": 1.9034821219911195e-06, "loss": 0.0499, "step": 197250 }, { "epoch": 2.8290454884414618, "grad_norm": 2.8800783157348633, "learning_rate": 1.899498608425928e-06, "loss": 0.0753, "step": 197275 }, { "epoch": 2.829404003900648, "grad_norm": 12.572528839111328, "learning_rate": 1.8955150948607363e-06, "loss": 0.1406, "step": 197300 }, { "epoch": 2.829762519359835, "grad_norm": 0.472307950258255, "learning_rate": 1.891531581295545e-06, "loss": 0.0861, "step": 197325 }, { "epoch": 2.8301210348190216, "grad_norm": 0.7546710968017578, "learning_rate": 1.8875480677303534e-06, "loss": 0.0871, "step": 197350 }, { "epoch": 2.830479550278208, "grad_norm": 0.7107766270637512, "learning_rate": 1.8835645541651617e-06, "loss": 0.067, "step": 197375 }, { "epoch": 2.8308380657373946, "grad_norm": 18.816837310791016, "learning_rate": 1.8795810405999702e-06, "loss": 0.0832, "step": 197400 }, { "epoch": 2.8311965811965814, "grad_norm": 0.08990861475467682, "learning_rate": 1.8755975270347789e-06, "loss": 0.1146, "step": 197425 }, { "epoch": 2.8315550966557677, "grad_norm": 4.479008197784424, "learning_rate": 1.8716140134695874e-06, "loss": 0.0395, "step": 197450 }, { "epoch": 2.8319136121149544, "grad_norm": 9.718741416931152, "learning_rate": 1.8676304999043956e-06, "loss": 0.0603, "step": 197475 }, { "epoch": 2.832272127574141, "grad_norm": 13.001216888427734, "learning_rate": 1.863646986339204e-06, "loss": 0.0827, "step": 197500 }, { "epoch": 2.8326306430333275, "grad_norm": 0.9567607641220093, "learning_rate": 1.8596634727740128e-06, "loss": 0.0833, "step": 197525 }, { "epoch": 2.832989158492514, "grad_norm": 0.026683339849114418, "learning_rate": 1.855679959208821e-06, "loss": 0.0667, "step": 197550 }, { "epoch": 2.833347673951701, "grad_norm": 18.262441635131836, "learning_rate": 1.8516964456436295e-06, "loss": 0.0737, "step": 197575 }, { "epoch": 2.8337061894108873, "grad_norm": 0.2531045079231262, "learning_rate": 1.8477129320784382e-06, "loss": 0.1015, "step": 197600 }, { "epoch": 2.834064704870074, "grad_norm": 2.8758544921875, "learning_rate": 1.8437294185132465e-06, "loss": 0.0658, "step": 197625 }, { "epoch": 2.8344232203292608, "grad_norm": 4.3474626541137695, "learning_rate": 1.839745904948055e-06, "loss": 0.0798, "step": 197650 }, { "epoch": 2.834781735788447, "grad_norm": 0.34157654643058777, "learning_rate": 1.8357623913828635e-06, "loss": 0.142, "step": 197675 }, { "epoch": 2.835140251247634, "grad_norm": 1.1583408117294312, "learning_rate": 1.831778877817672e-06, "loss": 0.109, "step": 197700 }, { "epoch": 2.8354987667068206, "grad_norm": 0.24687455594539642, "learning_rate": 1.8277953642524804e-06, "loss": 0.1, "step": 197725 }, { "epoch": 2.835857282166007, "grad_norm": 0.033861711621284485, "learning_rate": 1.823811850687289e-06, "loss": 0.078, "step": 197750 }, { "epoch": 2.8362157976251936, "grad_norm": 9.98733139038086, "learning_rate": 1.8198283371220974e-06, "loss": 0.0987, "step": 197775 }, { "epoch": 2.8365743130843803, "grad_norm": 0.42993560433387756, "learning_rate": 1.8158448235569059e-06, "loss": 0.0896, "step": 197800 }, { "epoch": 2.8369328285435667, "grad_norm": 0.09616346657276154, "learning_rate": 1.8118613099917143e-06, "loss": 0.0588, "step": 197825 }, { "epoch": 2.8372913440027534, "grad_norm": 0.03133273869752884, "learning_rate": 1.8078777964265228e-06, "loss": 0.0882, "step": 197850 }, { "epoch": 2.83764985946194, "grad_norm": 0.326509565114975, "learning_rate": 1.8038942828613313e-06, "loss": 0.0624, "step": 197875 }, { "epoch": 2.8380083749211265, "grad_norm": 0.6154252290725708, "learning_rate": 1.7999107692961398e-06, "loss": 0.0869, "step": 197900 }, { "epoch": 2.838366890380313, "grad_norm": 1.208981990814209, "learning_rate": 1.7959272557309483e-06, "loss": 0.041, "step": 197925 }, { "epoch": 2.8387254058395, "grad_norm": 0.20537571609020233, "learning_rate": 1.7919437421657565e-06, "loss": 0.0707, "step": 197950 }, { "epoch": 2.8390839212986863, "grad_norm": 2.56666898727417, "learning_rate": 1.7879602286005652e-06, "loss": 0.1079, "step": 197975 }, { "epoch": 2.839442436757873, "grad_norm": 0.10788139700889587, "learning_rate": 1.7839767150353737e-06, "loss": 0.0916, "step": 198000 }, { "epoch": 2.8398009522170597, "grad_norm": 0.4124250113964081, "learning_rate": 1.779993201470182e-06, "loss": 0.0631, "step": 198025 }, { "epoch": 2.840159467676246, "grad_norm": 0.3119702637195587, "learning_rate": 1.7760096879049907e-06, "loss": 0.0929, "step": 198050 }, { "epoch": 2.840517983135433, "grad_norm": 1.1512131690979004, "learning_rate": 1.7720261743397991e-06, "loss": 0.0862, "step": 198075 }, { "epoch": 2.8408764985946195, "grad_norm": 12.490194320678711, "learning_rate": 1.7680426607746074e-06, "loss": 0.0559, "step": 198100 }, { "epoch": 2.841235014053806, "grad_norm": 1.5213780403137207, "learning_rate": 1.764059147209416e-06, "loss": 0.0835, "step": 198125 }, { "epoch": 2.8415935295129926, "grad_norm": 10.663723945617676, "learning_rate": 1.7600756336442244e-06, "loss": 0.1355, "step": 198150 }, { "epoch": 2.8419520449721793, "grad_norm": 3.203433036804199, "learning_rate": 1.7560921200790328e-06, "loss": 0.0375, "step": 198175 }, { "epoch": 2.8423105604313657, "grad_norm": 0.23963065445423126, "learning_rate": 1.7521086065138415e-06, "loss": 0.0649, "step": 198200 }, { "epoch": 2.8426690758905524, "grad_norm": 1.61431884765625, "learning_rate": 1.7481250929486498e-06, "loss": 0.0999, "step": 198225 }, { "epoch": 2.843027591349739, "grad_norm": 11.17373275756836, "learning_rate": 1.7441415793834583e-06, "loss": 0.1384, "step": 198250 }, { "epoch": 2.8433861068089255, "grad_norm": 3.3667590618133545, "learning_rate": 1.740158065818267e-06, "loss": 0.0417, "step": 198275 }, { "epoch": 2.843744622268112, "grad_norm": 0.2137415111064911, "learning_rate": 1.7361745522530752e-06, "loss": 0.0743, "step": 198300 }, { "epoch": 2.844103137727299, "grad_norm": 5.585659503936768, "learning_rate": 1.7321910386878837e-06, "loss": 0.0622, "step": 198325 }, { "epoch": 2.8444616531864853, "grad_norm": 0.8378514051437378, "learning_rate": 1.7282075251226924e-06, "loss": 0.063, "step": 198350 }, { "epoch": 2.844820168645672, "grad_norm": 3.34755802154541, "learning_rate": 1.7242240115575007e-06, "loss": 0.1352, "step": 198375 }, { "epoch": 2.8451786841048587, "grad_norm": 6.765665054321289, "learning_rate": 1.7202404979923092e-06, "loss": 0.0755, "step": 198400 }, { "epoch": 2.845537199564045, "grad_norm": 0.06253384053707123, "learning_rate": 1.7162569844271176e-06, "loss": 0.041, "step": 198425 }, { "epoch": 2.845895715023232, "grad_norm": 2.7503111362457275, "learning_rate": 1.7122734708619261e-06, "loss": 0.0437, "step": 198450 }, { "epoch": 2.8462542304824185, "grad_norm": 0.8189980387687683, "learning_rate": 1.7082899572967346e-06, "loss": 0.0876, "step": 198475 }, { "epoch": 2.846612745941605, "grad_norm": 10.979226112365723, "learning_rate": 1.704306443731543e-06, "loss": 0.0854, "step": 198500 }, { "epoch": 2.8469712614007916, "grad_norm": 4.449490547180176, "learning_rate": 1.7003229301663516e-06, "loss": 0.101, "step": 198525 }, { "epoch": 2.8473297768599783, "grad_norm": 2.8712050914764404, "learning_rate": 1.69633941660116e-06, "loss": 0.0881, "step": 198550 }, { "epoch": 2.8476882923191646, "grad_norm": 5.5230584144592285, "learning_rate": 1.6923559030359685e-06, "loss": 0.1093, "step": 198575 }, { "epoch": 2.8480468077783514, "grad_norm": 5.1271772384643555, "learning_rate": 1.688372389470777e-06, "loss": 0.1122, "step": 198600 }, { "epoch": 2.848405323237538, "grad_norm": 2.3058934211730957, "learning_rate": 1.6843888759055855e-06, "loss": 0.0601, "step": 198625 }, { "epoch": 2.8487638386967244, "grad_norm": 0.20913521945476532, "learning_rate": 1.680405362340394e-06, "loss": 0.1648, "step": 198650 }, { "epoch": 2.849122354155911, "grad_norm": 0.36028385162353516, "learning_rate": 1.6764218487752024e-06, "loss": 0.0332, "step": 198675 }, { "epoch": 2.849480869615098, "grad_norm": 13.981510162353516, "learning_rate": 1.6724383352100107e-06, "loss": 0.1088, "step": 198700 }, { "epoch": 2.8498393850742842, "grad_norm": 0.6426104307174683, "learning_rate": 1.6684548216448194e-06, "loss": 0.0907, "step": 198725 }, { "epoch": 2.850197900533471, "grad_norm": 0.4983828663825989, "learning_rate": 1.6644713080796279e-06, "loss": 0.1052, "step": 198750 }, { "epoch": 2.8505564159926577, "grad_norm": 5.031223773956299, "learning_rate": 1.6604877945144361e-06, "loss": 0.052, "step": 198775 }, { "epoch": 2.850914931451844, "grad_norm": 0.0957300066947937, "learning_rate": 1.6565042809492448e-06, "loss": 0.0669, "step": 198800 }, { "epoch": 2.851273446911031, "grad_norm": 6.701606273651123, "learning_rate": 1.6525207673840533e-06, "loss": 0.0591, "step": 198825 }, { "epoch": 2.8516319623702175, "grad_norm": 4.408018112182617, "learning_rate": 1.6485372538188616e-06, "loss": 0.1252, "step": 198850 }, { "epoch": 2.851990477829404, "grad_norm": 1.4837762117385864, "learning_rate": 1.6445537402536703e-06, "loss": 0.1119, "step": 198875 }, { "epoch": 2.8523489932885906, "grad_norm": 11.045553207397461, "learning_rate": 1.6405702266884788e-06, "loss": 0.2088, "step": 198900 }, { "epoch": 2.8527075087477773, "grad_norm": 20.846128463745117, "learning_rate": 1.636586713123287e-06, "loss": 0.1422, "step": 198925 }, { "epoch": 2.8530660242069636, "grad_norm": 8.833349227905273, "learning_rate": 1.6326031995580957e-06, "loss": 0.1093, "step": 198950 }, { "epoch": 2.8534245396661504, "grad_norm": 11.800711631774902, "learning_rate": 1.628619685992904e-06, "loss": 0.0827, "step": 198975 }, { "epoch": 2.853783055125337, "grad_norm": 1.1339483261108398, "learning_rate": 1.6246361724277125e-06, "loss": 0.1026, "step": 199000 }, { "epoch": 2.8541415705845234, "grad_norm": 16.71269989013672, "learning_rate": 1.6206526588625212e-06, "loss": 0.0688, "step": 199025 }, { "epoch": 2.85450008604371, "grad_norm": 3.0100417137145996, "learning_rate": 1.6166691452973294e-06, "loss": 0.0922, "step": 199050 }, { "epoch": 2.854858601502897, "grad_norm": 2.918523073196411, "learning_rate": 1.612685631732138e-06, "loss": 0.1003, "step": 199075 }, { "epoch": 2.8552171169620832, "grad_norm": 18.012575149536133, "learning_rate": 1.6087021181669466e-06, "loss": 0.155, "step": 199100 }, { "epoch": 2.85557563242127, "grad_norm": 17.067481994628906, "learning_rate": 1.6047186046017549e-06, "loss": 0.1301, "step": 199125 }, { "epoch": 2.8559341478804567, "grad_norm": 3.9840879440307617, "learning_rate": 1.6007350910365633e-06, "loss": 0.0934, "step": 199150 }, { "epoch": 2.856292663339643, "grad_norm": 2.1266849040985107, "learning_rate": 1.596751577471372e-06, "loss": 0.1666, "step": 199175 }, { "epoch": 2.85665117879883, "grad_norm": 0.3586488664150238, "learning_rate": 1.5927680639061803e-06, "loss": 0.0437, "step": 199200 }, { "epoch": 2.8570096942580165, "grad_norm": 1.343562126159668, "learning_rate": 1.5887845503409888e-06, "loss": 0.0758, "step": 199225 }, { "epoch": 2.857368209717203, "grad_norm": 2.130579948425293, "learning_rate": 1.5848010367757973e-06, "loss": 0.0487, "step": 199250 }, { "epoch": 2.8577267251763896, "grad_norm": 14.011371612548828, "learning_rate": 1.5808175232106057e-06, "loss": 0.1262, "step": 199275 }, { "epoch": 2.8580852406355763, "grad_norm": 10.515968322753906, "learning_rate": 1.5768340096454142e-06, "loss": 0.057, "step": 199300 }, { "epoch": 2.8584437560947626, "grad_norm": 20.103124618530273, "learning_rate": 1.5728504960802227e-06, "loss": 0.0644, "step": 199325 }, { "epoch": 2.8588022715539494, "grad_norm": 0.09510152786970139, "learning_rate": 1.5688669825150312e-06, "loss": 0.0239, "step": 199350 }, { "epoch": 2.859160787013136, "grad_norm": 8.125730514526367, "learning_rate": 1.5648834689498397e-06, "loss": 0.0456, "step": 199375 }, { "epoch": 2.8595193024723224, "grad_norm": 3.405048370361328, "learning_rate": 1.5608999553846481e-06, "loss": 0.0522, "step": 199400 }, { "epoch": 2.859877817931509, "grad_norm": 1.2148830890655518, "learning_rate": 1.5569164418194566e-06, "loss": 0.0938, "step": 199425 }, { "epoch": 2.860236333390696, "grad_norm": 9.159390449523926, "learning_rate": 1.5529329282542651e-06, "loss": 0.0425, "step": 199450 }, { "epoch": 2.8605948488498822, "grad_norm": 0.8991718888282776, "learning_rate": 1.5489494146890736e-06, "loss": 0.1148, "step": 199475 }, { "epoch": 2.860953364309069, "grad_norm": 0.2360038310289383, "learning_rate": 1.544965901123882e-06, "loss": 0.0442, "step": 199500 }, { "epoch": 2.8613118797682557, "grad_norm": 3.0776495933532715, "learning_rate": 1.5409823875586903e-06, "loss": 0.0399, "step": 199525 }, { "epoch": 2.861670395227442, "grad_norm": 5.308818340301514, "learning_rate": 1.536998873993499e-06, "loss": 0.0621, "step": 199550 }, { "epoch": 2.862028910686629, "grad_norm": 4.8648295402526855, "learning_rate": 1.5330153604283075e-06, "loss": 0.0849, "step": 199575 }, { "epoch": 2.8623874261458155, "grad_norm": 9.132207870483398, "learning_rate": 1.5290318468631158e-06, "loss": 0.0816, "step": 199600 }, { "epoch": 2.862745941605002, "grad_norm": 6.803464889526367, "learning_rate": 1.5250483332979245e-06, "loss": 0.1563, "step": 199625 }, { "epoch": 2.8631044570641886, "grad_norm": 4.068709850311279, "learning_rate": 1.521064819732733e-06, "loss": 0.0831, "step": 199650 }, { "epoch": 2.8634629725233753, "grad_norm": 0.5127690434455872, "learning_rate": 1.5170813061675412e-06, "loss": 0.0502, "step": 199675 }, { "epoch": 2.8638214879825616, "grad_norm": 18.32592010498047, "learning_rate": 1.5130977926023497e-06, "loss": 0.0792, "step": 199700 }, { "epoch": 2.8641800034417484, "grad_norm": 2.53006649017334, "learning_rate": 1.5091142790371584e-06, "loss": 0.0444, "step": 199725 }, { "epoch": 2.864538518900935, "grad_norm": 0.37249118089675903, "learning_rate": 1.5051307654719667e-06, "loss": 0.0475, "step": 199750 }, { "epoch": 2.864897034360122, "grad_norm": 6.436287879943848, "learning_rate": 1.5011472519067751e-06, "loss": 0.0491, "step": 199775 }, { "epoch": 2.865255549819308, "grad_norm": 0.06576058268547058, "learning_rate": 1.4971637383415836e-06, "loss": 0.1082, "step": 199800 }, { "epoch": 2.865614065278495, "grad_norm": 8.787028312683105, "learning_rate": 1.493180224776392e-06, "loss": 0.0339, "step": 199825 }, { "epoch": 2.8659725807376812, "grad_norm": 0.4856167435646057, "learning_rate": 1.4891967112112006e-06, "loss": 0.0734, "step": 199850 }, { "epoch": 2.866331096196868, "grad_norm": 7.076977729797363, "learning_rate": 1.485213197646009e-06, "loss": 0.0668, "step": 199875 }, { "epoch": 2.8666896116560547, "grad_norm": 0.4088360071182251, "learning_rate": 1.4812296840808175e-06, "loss": 0.0508, "step": 199900 }, { "epoch": 2.8670481271152415, "grad_norm": 1.1945254802703857, "learning_rate": 1.477246170515626e-06, "loss": 0.0404, "step": 199925 }, { "epoch": 2.867406642574428, "grad_norm": 0.12630005180835724, "learning_rate": 1.4732626569504345e-06, "loss": 0.0663, "step": 199950 }, { "epoch": 2.8677651580336145, "grad_norm": 2.313392162322998, "learning_rate": 1.469279143385243e-06, "loss": 0.0698, "step": 199975 }, { "epoch": 2.868123673492801, "grad_norm": 7.632847785949707, "learning_rate": 1.4652956298200515e-06, "loss": 0.0364, "step": 200000 }, { "epoch": 2.8684821889519876, "grad_norm": 1.3937361240386963, "learning_rate": 1.46131211625486e-06, "loss": 0.0902, "step": 200025 }, { "epoch": 2.8688407044111743, "grad_norm": 7.967607498168945, "learning_rate": 1.4573286026896684e-06, "loss": 0.0655, "step": 200050 }, { "epoch": 2.869199219870361, "grad_norm": 7.310816287994385, "learning_rate": 1.4533450891244767e-06, "loss": 0.1337, "step": 200075 }, { "epoch": 2.8695577353295474, "grad_norm": 0.15970447659492493, "learning_rate": 1.4493615755592854e-06, "loss": 0.0667, "step": 200100 }, { "epoch": 2.869916250788734, "grad_norm": 16.773611068725586, "learning_rate": 1.4453780619940939e-06, "loss": 0.0484, "step": 200125 }, { "epoch": 2.8702747662479204, "grad_norm": 14.894769668579102, "learning_rate": 1.4413945484289021e-06, "loss": 0.0955, "step": 200150 }, { "epoch": 2.870633281707107, "grad_norm": 0.41921931505203247, "learning_rate": 1.4374110348637108e-06, "loss": 0.0331, "step": 200175 }, { "epoch": 2.870991797166294, "grad_norm": 7.264854907989502, "learning_rate": 1.4334275212985193e-06, "loss": 0.0814, "step": 200200 }, { "epoch": 2.8713503126254807, "grad_norm": 0.7915374040603638, "learning_rate": 1.4294440077333276e-06, "loss": 0.1371, "step": 200225 }, { "epoch": 2.871708828084667, "grad_norm": 0.29436901211738586, "learning_rate": 1.4254604941681363e-06, "loss": 0.0602, "step": 200250 }, { "epoch": 2.8720673435438537, "grad_norm": 3.7227439880371094, "learning_rate": 1.4214769806029447e-06, "loss": 0.1018, "step": 200275 }, { "epoch": 2.87242585900304, "grad_norm": 17.84215545654297, "learning_rate": 1.417493467037753e-06, "loss": 0.1584, "step": 200300 }, { "epoch": 2.872784374462227, "grad_norm": 1.1455891132354736, "learning_rate": 1.4135099534725617e-06, "loss": 0.0358, "step": 200325 }, { "epoch": 2.8731428899214135, "grad_norm": 1.2902789115905762, "learning_rate": 1.40952643990737e-06, "loss": 0.0536, "step": 200350 }, { "epoch": 2.8735014053806003, "grad_norm": 11.794281005859375, "learning_rate": 1.4055429263421784e-06, "loss": 0.0964, "step": 200375 }, { "epoch": 2.8738599208397866, "grad_norm": 0.1841806173324585, "learning_rate": 1.4015594127769871e-06, "loss": 0.1235, "step": 200400 }, { "epoch": 2.8742184362989733, "grad_norm": 0.33643850684165955, "learning_rate": 1.3975758992117954e-06, "loss": 0.076, "step": 200425 }, { "epoch": 2.8745769517581596, "grad_norm": 0.34908604621887207, "learning_rate": 1.3935923856466039e-06, "loss": 0.082, "step": 200450 }, { "epoch": 2.8749354672173464, "grad_norm": 0.1121368482708931, "learning_rate": 1.3896088720814126e-06, "loss": 0.0678, "step": 200475 }, { "epoch": 2.875293982676533, "grad_norm": 1.3862409591674805, "learning_rate": 1.3856253585162208e-06, "loss": 0.0568, "step": 200500 }, { "epoch": 2.87565249813572, "grad_norm": 11.707307815551758, "learning_rate": 1.3816418449510293e-06, "loss": 0.0939, "step": 200525 }, { "epoch": 2.876011013594906, "grad_norm": 0.2757328450679779, "learning_rate": 1.377658331385838e-06, "loss": 0.151, "step": 200550 }, { "epoch": 2.876369529054093, "grad_norm": 0.1818682700395584, "learning_rate": 1.3736748178206463e-06, "loss": 0.0555, "step": 200575 }, { "epoch": 2.8767280445132792, "grad_norm": 0.7264255285263062, "learning_rate": 1.3696913042554548e-06, "loss": 0.0476, "step": 200600 }, { "epoch": 2.877086559972466, "grad_norm": 1.3971965312957764, "learning_rate": 1.3657077906902632e-06, "loss": 0.0893, "step": 200625 }, { "epoch": 2.8774450754316527, "grad_norm": 0.8629310131072998, "learning_rate": 1.3617242771250717e-06, "loss": 0.0726, "step": 200650 }, { "epoch": 2.8778035908908395, "grad_norm": 2.2632851600646973, "learning_rate": 1.3577407635598802e-06, "loss": 0.073, "step": 200675 }, { "epoch": 2.878162106350026, "grad_norm": 0.5177713632583618, "learning_rate": 1.3537572499946887e-06, "loss": 0.0663, "step": 200700 }, { "epoch": 2.8785206218092125, "grad_norm": 0.4191531538963318, "learning_rate": 1.3497737364294972e-06, "loss": 0.0945, "step": 200725 }, { "epoch": 2.878879137268399, "grad_norm": 0.1912693828344345, "learning_rate": 1.3457902228643056e-06, "loss": 0.1056, "step": 200750 }, { "epoch": 2.8792376527275856, "grad_norm": 2.317269802093506, "learning_rate": 1.3418067092991141e-06, "loss": 0.0736, "step": 200775 }, { "epoch": 2.8795961681867723, "grad_norm": 1.8799601793289185, "learning_rate": 1.3378231957339226e-06, "loss": 0.1014, "step": 200800 }, { "epoch": 2.879954683645959, "grad_norm": 0.6050699949264526, "learning_rate": 1.333839682168731e-06, "loss": 0.0746, "step": 200825 }, { "epoch": 2.8803131991051454, "grad_norm": 5.192264080047607, "learning_rate": 1.3298561686035396e-06, "loss": 0.1141, "step": 200850 }, { "epoch": 2.880671714564332, "grad_norm": 3.354613780975342, "learning_rate": 1.325872655038348e-06, "loss": 0.0965, "step": 200875 }, { "epoch": 2.8810302300235184, "grad_norm": 16.038528442382812, "learning_rate": 1.3218891414731563e-06, "loss": 0.1311, "step": 200900 }, { "epoch": 2.881388745482705, "grad_norm": 3.6621007919311523, "learning_rate": 1.317905627907965e-06, "loss": 0.0628, "step": 200925 }, { "epoch": 2.881747260941892, "grad_norm": 3.9471750259399414, "learning_rate": 1.3139221143427735e-06, "loss": 0.1016, "step": 200950 }, { "epoch": 2.8821057764010787, "grad_norm": 5.372446537017822, "learning_rate": 1.3099386007775817e-06, "loss": 0.0726, "step": 200975 }, { "epoch": 2.882464291860265, "grad_norm": 0.3164326548576355, "learning_rate": 1.3059550872123904e-06, "loss": 0.2244, "step": 201000 }, { "epoch": 2.8828228073194517, "grad_norm": 1.3321356773376465, "learning_rate": 1.301971573647199e-06, "loss": 0.0447, "step": 201025 }, { "epoch": 2.883181322778638, "grad_norm": 0.26130473613739014, "learning_rate": 1.2979880600820072e-06, "loss": 0.1041, "step": 201050 }, { "epoch": 2.883539838237825, "grad_norm": 1.237762689590454, "learning_rate": 1.2940045465168159e-06, "loss": 0.0219, "step": 201075 }, { "epoch": 2.8838983536970115, "grad_norm": 0.20445308089256287, "learning_rate": 1.2900210329516244e-06, "loss": 0.1032, "step": 201100 }, { "epoch": 2.8842568691561983, "grad_norm": 22.749122619628906, "learning_rate": 1.2860375193864326e-06, "loss": 0.1391, "step": 201125 }, { "epoch": 2.8846153846153846, "grad_norm": 7.451152324676514, "learning_rate": 1.2820540058212413e-06, "loss": 0.0767, "step": 201150 }, { "epoch": 2.8849739000745713, "grad_norm": 0.16095176339149475, "learning_rate": 1.2780704922560496e-06, "loss": 0.0713, "step": 201175 }, { "epoch": 2.8853324155337576, "grad_norm": 2.6899383068084717, "learning_rate": 1.274086978690858e-06, "loss": 0.1024, "step": 201200 }, { "epoch": 2.8856909309929444, "grad_norm": 11.249686241149902, "learning_rate": 1.2701034651256668e-06, "loss": 0.0556, "step": 201225 }, { "epoch": 2.886049446452131, "grad_norm": 0.5793784260749817, "learning_rate": 1.266119951560475e-06, "loss": 0.0954, "step": 201250 }, { "epoch": 2.886407961911318, "grad_norm": 2.916926383972168, "learning_rate": 1.2621364379952835e-06, "loss": 0.0806, "step": 201275 }, { "epoch": 2.886766477370504, "grad_norm": 1.7968835830688477, "learning_rate": 1.2581529244300922e-06, "loss": 0.0965, "step": 201300 }, { "epoch": 2.887124992829691, "grad_norm": 0.461455762386322, "learning_rate": 1.2541694108649005e-06, "loss": 0.1062, "step": 201325 }, { "epoch": 2.8874835082888772, "grad_norm": 1.9414143562316895, "learning_rate": 1.250185897299709e-06, "loss": 0.0834, "step": 201350 }, { "epoch": 2.887842023748064, "grad_norm": 8.942764282226562, "learning_rate": 1.2462023837345176e-06, "loss": 0.1197, "step": 201375 }, { "epoch": 2.8882005392072507, "grad_norm": 1.7345764636993408, "learning_rate": 1.242218870169326e-06, "loss": 0.0776, "step": 201400 }, { "epoch": 2.8885590546664375, "grad_norm": 3.3850433826446533, "learning_rate": 1.2382353566041344e-06, "loss": 0.0737, "step": 201425 }, { "epoch": 2.888917570125624, "grad_norm": 0.49664443731307983, "learning_rate": 1.2342518430389429e-06, "loss": 0.0717, "step": 201450 }, { "epoch": 2.8892760855848105, "grad_norm": 0.2066420167684555, "learning_rate": 1.2302683294737513e-06, "loss": 0.1145, "step": 201475 }, { "epoch": 2.889634601043997, "grad_norm": 25.562227249145508, "learning_rate": 1.2262848159085598e-06, "loss": 0.1596, "step": 201500 }, { "epoch": 2.8899931165031836, "grad_norm": 4.641901969909668, "learning_rate": 1.2223013023433683e-06, "loss": 0.1035, "step": 201525 }, { "epoch": 2.8903516319623703, "grad_norm": 0.19557683169841766, "learning_rate": 1.2183177887781768e-06, "loss": 0.0699, "step": 201550 }, { "epoch": 2.890710147421557, "grad_norm": 9.299565315246582, "learning_rate": 1.2143342752129853e-06, "loss": 0.0577, "step": 201575 }, { "epoch": 2.8910686628807434, "grad_norm": 1.4348701238632202, "learning_rate": 1.2103507616477937e-06, "loss": 0.0565, "step": 201600 }, { "epoch": 2.89142717833993, "grad_norm": 7.742739677429199, "learning_rate": 1.2063672480826022e-06, "loss": 0.1445, "step": 201625 }, { "epoch": 2.8917856937991164, "grad_norm": 2.5279011726379395, "learning_rate": 1.2023837345174107e-06, "loss": 0.0336, "step": 201650 }, { "epoch": 2.892144209258303, "grad_norm": 0.42892125248908997, "learning_rate": 1.1984002209522192e-06, "loss": 0.0503, "step": 201675 }, { "epoch": 2.89250272471749, "grad_norm": 13.811539649963379, "learning_rate": 1.1944167073870277e-06, "loss": 0.0564, "step": 201700 }, { "epoch": 2.8928612401766767, "grad_norm": 0.9363518953323364, "learning_rate": 1.190433193821836e-06, "loss": 0.0997, "step": 201725 }, { "epoch": 2.893219755635863, "grad_norm": 2.8106417655944824, "learning_rate": 1.1864496802566446e-06, "loss": 0.0711, "step": 201750 }, { "epoch": 2.8935782710950497, "grad_norm": 1.9249889850616455, "learning_rate": 1.182466166691453e-06, "loss": 0.0741, "step": 201775 }, { "epoch": 2.893936786554236, "grad_norm": 1.2907952070236206, "learning_rate": 1.1784826531262614e-06, "loss": 0.0536, "step": 201800 }, { "epoch": 2.8942953020134228, "grad_norm": 4.874639987945557, "learning_rate": 1.17449913956107e-06, "loss": 0.0908, "step": 201825 }, { "epoch": 2.8946538174726095, "grad_norm": 0.8482661247253418, "learning_rate": 1.1705156259958785e-06, "loss": 0.0545, "step": 201850 }, { "epoch": 2.8950123329317963, "grad_norm": 4.85333251953125, "learning_rate": 1.1665321124306868e-06, "loss": 0.0729, "step": 201875 }, { "epoch": 2.8953708483909826, "grad_norm": 9.901185989379883, "learning_rate": 1.1625485988654953e-06, "loss": 0.0784, "step": 201900 }, { "epoch": 2.8957293638501693, "grad_norm": 4.644209861755371, "learning_rate": 1.158565085300304e-06, "loss": 0.0797, "step": 201925 }, { "epoch": 2.8960878793093556, "grad_norm": 4.353074550628662, "learning_rate": 1.1545815717351122e-06, "loss": 0.0924, "step": 201950 }, { "epoch": 2.8964463947685424, "grad_norm": 2.0490968227386475, "learning_rate": 1.1505980581699207e-06, "loss": 0.0679, "step": 201975 }, { "epoch": 2.896804910227729, "grad_norm": 0.2505565881729126, "learning_rate": 1.1466145446047292e-06, "loss": 0.0982, "step": 202000 }, { "epoch": 2.897163425686916, "grad_norm": 7.215460300445557, "learning_rate": 1.1426310310395377e-06, "loss": 0.0496, "step": 202025 }, { "epoch": 2.897521941146102, "grad_norm": 0.20415444672107697, "learning_rate": 1.1386475174743462e-06, "loss": 0.0504, "step": 202050 }, { "epoch": 2.897880456605289, "grad_norm": 6.692841529846191, "learning_rate": 1.1346640039091546e-06, "loss": 0.0913, "step": 202075 }, { "epoch": 2.8982389720644752, "grad_norm": 0.4523155391216278, "learning_rate": 1.1306804903439631e-06, "loss": 0.0839, "step": 202100 }, { "epoch": 2.898597487523662, "grad_norm": 0.9195963740348816, "learning_rate": 1.1266969767787716e-06, "loss": 0.0822, "step": 202125 }, { "epoch": 2.8989560029828487, "grad_norm": 0.06516128033399582, "learning_rate": 1.12271346321358e-06, "loss": 0.0758, "step": 202150 }, { "epoch": 2.8993145184420355, "grad_norm": 0.16698966920375824, "learning_rate": 1.1187299496483886e-06, "loss": 0.0856, "step": 202175 }, { "epoch": 2.8996730339012218, "grad_norm": 0.34505289793014526, "learning_rate": 1.114746436083197e-06, "loss": 0.0937, "step": 202200 }, { "epoch": 2.9000315493604085, "grad_norm": 3.2519514560699463, "learning_rate": 1.1107629225180055e-06, "loss": 0.0927, "step": 202225 }, { "epoch": 2.900390064819595, "grad_norm": 2.831080198287964, "learning_rate": 1.106779408952814e-06, "loss": 0.0408, "step": 202250 }, { "epoch": 2.9007485802787816, "grad_norm": 1.8740824460983276, "learning_rate": 1.1027958953876223e-06, "loss": 0.0923, "step": 202275 }, { "epoch": 2.9011070957379683, "grad_norm": 0.12368595600128174, "learning_rate": 1.098812381822431e-06, "loss": 0.0364, "step": 202300 }, { "epoch": 2.901465611197155, "grad_norm": 0.37987199425697327, "learning_rate": 1.0948288682572394e-06, "loss": 0.0867, "step": 202325 }, { "epoch": 2.9018241266563414, "grad_norm": 0.1739003211259842, "learning_rate": 1.0908453546920477e-06, "loss": 0.0779, "step": 202350 }, { "epoch": 2.902182642115528, "grad_norm": 4.011524677276611, "learning_rate": 1.0868618411268564e-06, "loss": 0.1322, "step": 202375 }, { "epoch": 2.9025411575747144, "grad_norm": 2.1469650268554688, "learning_rate": 1.0828783275616649e-06, "loss": 0.0809, "step": 202400 }, { "epoch": 2.902899673033901, "grad_norm": 2.6882236003875732, "learning_rate": 1.0788948139964732e-06, "loss": 0.1163, "step": 202425 }, { "epoch": 2.903258188493088, "grad_norm": 0.2552194595336914, "learning_rate": 1.0749113004312818e-06, "loss": 0.055, "step": 202450 }, { "epoch": 2.9036167039522747, "grad_norm": 10.33008861541748, "learning_rate": 1.0709277868660901e-06, "loss": 0.0911, "step": 202475 }, { "epoch": 2.903975219411461, "grad_norm": 1.5023266077041626, "learning_rate": 1.0669442733008986e-06, "loss": 0.1626, "step": 202500 }, { "epoch": 2.9043337348706477, "grad_norm": 5.125148296356201, "learning_rate": 1.0629607597357073e-06, "loss": 0.0966, "step": 202525 }, { "epoch": 2.904692250329834, "grad_norm": 2.152557611465454, "learning_rate": 1.0589772461705156e-06, "loss": 0.0865, "step": 202550 }, { "epoch": 2.9050507657890208, "grad_norm": 3.2404770851135254, "learning_rate": 1.054993732605324e-06, "loss": 0.0512, "step": 202575 }, { "epoch": 2.9054092812482075, "grad_norm": 4.383091926574707, "learning_rate": 1.0510102190401327e-06, "loss": 0.1291, "step": 202600 }, { "epoch": 2.9057677967073943, "grad_norm": 6.8117876052856445, "learning_rate": 1.047026705474941e-06, "loss": 0.0809, "step": 202625 }, { "epoch": 2.9061263121665806, "grad_norm": 8.741128921508789, "learning_rate": 1.0430431919097495e-06, "loss": 0.082, "step": 202650 }, { "epoch": 2.9064848276257673, "grad_norm": 0.09786960482597351, "learning_rate": 1.0390596783445582e-06, "loss": 0.0994, "step": 202675 }, { "epoch": 2.9068433430849536, "grad_norm": 0.5112512707710266, "learning_rate": 1.0350761647793664e-06, "loss": 0.0636, "step": 202700 }, { "epoch": 2.9072018585441404, "grad_norm": 7.594112873077393, "learning_rate": 1.031092651214175e-06, "loss": 0.052, "step": 202725 }, { "epoch": 2.907560374003327, "grad_norm": 17.686548233032227, "learning_rate": 1.0271091376489834e-06, "loss": 0.0665, "step": 202750 }, { "epoch": 2.907918889462514, "grad_norm": 0.7200456261634827, "learning_rate": 1.0231256240837919e-06, "loss": 0.0871, "step": 202775 }, { "epoch": 2.9082774049217, "grad_norm": 0.018608154729008675, "learning_rate": 1.0191421105186004e-06, "loss": 0.1542, "step": 202800 }, { "epoch": 2.908635920380887, "grad_norm": 13.646003723144531, "learning_rate": 1.0151585969534088e-06, "loss": 0.1318, "step": 202825 }, { "epoch": 2.9089944358400732, "grad_norm": 16.350725173950195, "learning_rate": 1.0111750833882173e-06, "loss": 0.101, "step": 202850 }, { "epoch": 2.90935295129926, "grad_norm": 1.7530548572540283, "learning_rate": 1.0071915698230258e-06, "loss": 0.0809, "step": 202875 }, { "epoch": 2.9097114667584467, "grad_norm": 6.363104343414307, "learning_rate": 1.0032080562578343e-06, "loss": 0.1652, "step": 202900 }, { "epoch": 2.9100699822176335, "grad_norm": 0.8242420554161072, "learning_rate": 9.992245426926428e-07, "loss": 0.0265, "step": 202925 }, { "epoch": 2.9104284976768198, "grad_norm": 11.065443992614746, "learning_rate": 9.952410291274512e-07, "loss": 0.0498, "step": 202950 }, { "epoch": 2.9107870131360065, "grad_norm": 5.798506259918213, "learning_rate": 9.912575155622597e-07, "loss": 0.0781, "step": 202975 }, { "epoch": 2.911145528595193, "grad_norm": 0.267412394285202, "learning_rate": 9.872740019970682e-07, "loss": 0.1302, "step": 203000 }, { "epoch": 2.9115040440543796, "grad_norm": 0.5412136912345886, "learning_rate": 9.832904884318765e-07, "loss": 0.0601, "step": 203025 }, { "epoch": 2.9118625595135663, "grad_norm": 2.290687322616577, "learning_rate": 9.793069748666852e-07, "loss": 0.1133, "step": 203050 }, { "epoch": 2.912221074972753, "grad_norm": 0.5021854639053345, "learning_rate": 9.753234613014936e-07, "loss": 0.0599, "step": 203075 }, { "epoch": 2.9125795904319394, "grad_norm": 0.016241082921624184, "learning_rate": 9.71339947736302e-07, "loss": 0.0757, "step": 203100 }, { "epoch": 2.912938105891126, "grad_norm": 19.116817474365234, "learning_rate": 9.673564341711106e-07, "loss": 0.0794, "step": 203125 }, { "epoch": 2.9132966213503124, "grad_norm": 0.31803056597709656, "learning_rate": 9.63372920605919e-07, "loss": 0.04, "step": 203150 }, { "epoch": 2.913655136809499, "grad_norm": 6.553877830505371, "learning_rate": 9.593894070407273e-07, "loss": 0.0396, "step": 203175 }, { "epoch": 2.914013652268686, "grad_norm": 0.43542152643203735, "learning_rate": 9.55405893475536e-07, "loss": 0.0469, "step": 203200 }, { "epoch": 2.9143721677278727, "grad_norm": 0.04652760922908783, "learning_rate": 9.514223799103444e-07, "loss": 0.0746, "step": 203225 }, { "epoch": 2.914730683187059, "grad_norm": 0.2622077167034149, "learning_rate": 9.474388663451528e-07, "loss": 0.0609, "step": 203250 }, { "epoch": 2.9150891986462457, "grad_norm": 0.5638665556907654, "learning_rate": 9.434553527799615e-07, "loss": 0.079, "step": 203275 }, { "epoch": 2.915447714105432, "grad_norm": 0.2378416210412979, "learning_rate": 9.394718392147698e-07, "loss": 0.141, "step": 203300 }, { "epoch": 2.9158062295646188, "grad_norm": 0.27443623542785645, "learning_rate": 9.354883256495783e-07, "loss": 0.0686, "step": 203325 }, { "epoch": 2.9161647450238055, "grad_norm": 0.7574479579925537, "learning_rate": 9.315048120843868e-07, "loss": 0.0667, "step": 203350 }, { "epoch": 2.9165232604829923, "grad_norm": 3.2685680389404297, "learning_rate": 9.275212985191953e-07, "loss": 0.0883, "step": 203375 }, { "epoch": 2.9168817759421786, "grad_norm": 25.646207809448242, "learning_rate": 9.235377849540038e-07, "loss": 0.0923, "step": 203400 }, { "epoch": 2.9172402914013653, "grad_norm": 16.71525001525879, "learning_rate": 9.195542713888121e-07, "loss": 0.0829, "step": 203425 }, { "epoch": 2.9175988068605516, "grad_norm": 21.122398376464844, "learning_rate": 9.155707578236207e-07, "loss": 0.1387, "step": 203450 }, { "epoch": 2.9179573223197384, "grad_norm": 0.21616438031196594, "learning_rate": 9.115872442584292e-07, "loss": 0.1082, "step": 203475 }, { "epoch": 2.918315837778925, "grad_norm": 0.2337251901626587, "learning_rate": 9.076037306932376e-07, "loss": 0.1013, "step": 203500 }, { "epoch": 2.918674353238112, "grad_norm": 5.494620323181152, "learning_rate": 9.036202171280461e-07, "loss": 0.0936, "step": 203525 }, { "epoch": 2.919032868697298, "grad_norm": 13.828368186950684, "learning_rate": 8.996367035628545e-07, "loss": 0.1484, "step": 203550 }, { "epoch": 2.919391384156485, "grad_norm": 7.341379165649414, "learning_rate": 8.95653189997663e-07, "loss": 0.0614, "step": 203575 }, { "epoch": 2.919749899615671, "grad_norm": 14.326749801635742, "learning_rate": 8.916696764324715e-07, "loss": 0.0549, "step": 203600 }, { "epoch": 2.920108415074858, "grad_norm": 0.8001575469970703, "learning_rate": 8.8768616286728e-07, "loss": 0.0756, "step": 203625 }, { "epoch": 2.9204669305340447, "grad_norm": 0.8021258115768433, "learning_rate": 8.837026493020885e-07, "loss": 0.1014, "step": 203650 }, { "epoch": 2.9208254459932315, "grad_norm": 0.14883330464363098, "learning_rate": 8.797191357368969e-07, "loss": 0.064, "step": 203675 }, { "epoch": 2.9211839614524178, "grad_norm": 0.3253938555717468, "learning_rate": 8.757356221717053e-07, "loss": 0.0338, "step": 203700 }, { "epoch": 2.9215424769116045, "grad_norm": 2.5789055824279785, "learning_rate": 8.717521086065139e-07, "loss": 0.1039, "step": 203725 }, { "epoch": 2.921900992370791, "grad_norm": 4.459774494171143, "learning_rate": 8.677685950413224e-07, "loss": 0.0741, "step": 203750 }, { "epoch": 2.9222595078299776, "grad_norm": 1.6378268003463745, "learning_rate": 8.637850814761308e-07, "loss": 0.0767, "step": 203775 }, { "epoch": 2.9226180232891643, "grad_norm": 4.593770980834961, "learning_rate": 8.598015679109392e-07, "loss": 0.0483, "step": 203800 }, { "epoch": 2.922976538748351, "grad_norm": 28.3426456451416, "learning_rate": 8.558180543457478e-07, "loss": 0.1441, "step": 203825 }, { "epoch": 2.9233350542075374, "grad_norm": 0.49432671070098877, "learning_rate": 8.518345407805562e-07, "loss": 0.08, "step": 203850 }, { "epoch": 2.923693569666724, "grad_norm": 2.6437668800354004, "learning_rate": 8.478510272153647e-07, "loss": 0.1483, "step": 203875 }, { "epoch": 2.9240520851259104, "grad_norm": 0.1706484705209732, "learning_rate": 8.438675136501731e-07, "loss": 0.0615, "step": 203900 }, { "epoch": 2.924410600585097, "grad_norm": 4.945234298706055, "learning_rate": 8.398840000849816e-07, "loss": 0.104, "step": 203925 }, { "epoch": 2.924769116044284, "grad_norm": 18.362510681152344, "learning_rate": 8.359004865197901e-07, "loss": 0.1022, "step": 203950 }, { "epoch": 2.9251276315034707, "grad_norm": 0.0440586693584919, "learning_rate": 8.319169729545986e-07, "loss": 0.0813, "step": 203975 }, { "epoch": 2.925486146962657, "grad_norm": 3.164523124694824, "learning_rate": 8.279334593894071e-07, "loss": 0.1155, "step": 204000 }, { "epoch": 2.9258446624218437, "grad_norm": 0.13970935344696045, "learning_rate": 8.239499458242155e-07, "loss": 0.0803, "step": 204025 }, { "epoch": 2.92620317788103, "grad_norm": 0.10581483691930771, "learning_rate": 8.19966432259024e-07, "loss": 0.1163, "step": 204050 }, { "epoch": 2.9265616933402168, "grad_norm": 5.733567714691162, "learning_rate": 8.159829186938324e-07, "loss": 0.0429, "step": 204075 }, { "epoch": 2.9269202087994035, "grad_norm": 9.527094841003418, "learning_rate": 8.11999405128641e-07, "loss": 0.0966, "step": 204100 }, { "epoch": 2.9272787242585903, "grad_norm": 2.312443733215332, "learning_rate": 8.080158915634495e-07, "loss": 0.0609, "step": 204125 }, { "epoch": 2.9276372397177766, "grad_norm": 9.621001243591309, "learning_rate": 8.040323779982578e-07, "loss": 0.0913, "step": 204150 }, { "epoch": 2.9279957551769633, "grad_norm": 0.21777720749378204, "learning_rate": 8.000488644330663e-07, "loss": 0.0357, "step": 204175 }, { "epoch": 2.9283542706361496, "grad_norm": 0.07937420159578323, "learning_rate": 7.960653508678749e-07, "loss": 0.1061, "step": 204200 }, { "epoch": 2.9287127860953364, "grad_norm": 0.5340431332588196, "learning_rate": 7.920818373026833e-07, "loss": 0.074, "step": 204225 }, { "epoch": 2.929071301554523, "grad_norm": 1.2128432989120483, "learning_rate": 7.880983237374918e-07, "loss": 0.0806, "step": 204250 }, { "epoch": 2.92942981701371, "grad_norm": 4.75811767578125, "learning_rate": 7.841148101723003e-07, "loss": 0.1127, "step": 204275 }, { "epoch": 2.929788332472896, "grad_norm": 25.050832748413086, "learning_rate": 7.801312966071087e-07, "loss": 0.0411, "step": 204300 }, { "epoch": 2.930146847932083, "grad_norm": 0.6143820285797119, "learning_rate": 7.761477830419172e-07, "loss": 0.0584, "step": 204325 }, { "epoch": 2.930505363391269, "grad_norm": 3.6572883129119873, "learning_rate": 7.721642694767257e-07, "loss": 0.1718, "step": 204350 }, { "epoch": 2.930863878850456, "grad_norm": 7.680206775665283, "learning_rate": 7.681807559115342e-07, "loss": 0.0434, "step": 204375 }, { "epoch": 2.9312223943096427, "grad_norm": 0.056704141199588776, "learning_rate": 7.641972423463426e-07, "loss": 0.0402, "step": 204400 }, { "epoch": 2.9315809097688295, "grad_norm": 0.31164291501045227, "learning_rate": 7.602137287811511e-07, "loss": 0.0596, "step": 204425 }, { "epoch": 2.9319394252280158, "grad_norm": 0.3032820224761963, "learning_rate": 7.562302152159595e-07, "loss": 0.0654, "step": 204450 }, { "epoch": 2.9322979406872025, "grad_norm": 3.552031993865967, "learning_rate": 7.522467016507681e-07, "loss": 0.117, "step": 204475 }, { "epoch": 2.932656456146389, "grad_norm": 1.2969709634780884, "learning_rate": 7.482631880855766e-07, "loss": 0.1044, "step": 204500 }, { "epoch": 2.9330149716055756, "grad_norm": 1.8167507648468018, "learning_rate": 7.442796745203849e-07, "loss": 0.0519, "step": 204525 }, { "epoch": 2.9333734870647623, "grad_norm": 1.3143285512924194, "learning_rate": 7.402961609551935e-07, "loss": 0.0739, "step": 204550 }, { "epoch": 2.933732002523949, "grad_norm": 5.8464035987854, "learning_rate": 7.36312647390002e-07, "loss": 0.0704, "step": 204575 }, { "epoch": 2.9340905179831354, "grad_norm": 1.2992162704467773, "learning_rate": 7.323291338248104e-07, "loss": 0.0842, "step": 204600 }, { "epoch": 2.934449033442322, "grad_norm": 0.3389677107334137, "learning_rate": 7.283456202596189e-07, "loss": 0.096, "step": 204625 }, { "epoch": 2.9348075489015084, "grad_norm": 7.4070353507995605, "learning_rate": 7.243621066944273e-07, "loss": 0.048, "step": 204650 }, { "epoch": 2.935166064360695, "grad_norm": 1.7286304235458374, "learning_rate": 7.203785931292358e-07, "loss": 0.0733, "step": 204675 }, { "epoch": 2.935524579819882, "grad_norm": 0.7398419976234436, "learning_rate": 7.163950795640443e-07, "loss": 0.09, "step": 204700 }, { "epoch": 2.9358830952790687, "grad_norm": 0.3512996733188629, "learning_rate": 7.124115659988527e-07, "loss": 0.107, "step": 204725 }, { "epoch": 2.936241610738255, "grad_norm": 1.3373829126358032, "learning_rate": 7.084280524336613e-07, "loss": 0.0307, "step": 204750 }, { "epoch": 2.9366001261974417, "grad_norm": 14.25399398803711, "learning_rate": 7.044445388684697e-07, "loss": 0.1211, "step": 204775 }, { "epoch": 2.936958641656628, "grad_norm": 3.615168571472168, "learning_rate": 7.004610253032781e-07, "loss": 0.0476, "step": 204800 }, { "epoch": 2.9373171571158148, "grad_norm": 0.37558865547180176, "learning_rate": 6.964775117380867e-07, "loss": 0.0957, "step": 204825 }, { "epoch": 2.9376756725750015, "grad_norm": 5.576295375823975, "learning_rate": 6.924939981728952e-07, "loss": 0.1474, "step": 204850 }, { "epoch": 2.9380341880341883, "grad_norm": 19.046045303344727, "learning_rate": 6.885104846077035e-07, "loss": 0.1093, "step": 204875 }, { "epoch": 2.9383927034933746, "grad_norm": 1.6607745885849, "learning_rate": 6.84526971042512e-07, "loss": 0.0409, "step": 204900 }, { "epoch": 2.9387512189525613, "grad_norm": 0.06526482850313187, "learning_rate": 6.805434574773206e-07, "loss": 0.0666, "step": 204925 }, { "epoch": 2.9391097344117476, "grad_norm": 10.175646781921387, "learning_rate": 6.76559943912129e-07, "loss": 0.0647, "step": 204950 }, { "epoch": 2.9394682498709344, "grad_norm": 5.537781238555908, "learning_rate": 6.725764303469375e-07, "loss": 0.1002, "step": 204975 }, { "epoch": 2.939826765330121, "grad_norm": 0.5248852372169495, "learning_rate": 6.68592916781746e-07, "loss": 0.0906, "step": 205000 }, { "epoch": 2.940185280789308, "grad_norm": 15.130291938781738, "learning_rate": 6.646094032165544e-07, "loss": 0.0692, "step": 205025 }, { "epoch": 2.940543796248494, "grad_norm": 0.45888790488243103, "learning_rate": 6.606258896513629e-07, "loss": 0.0698, "step": 205050 }, { "epoch": 2.940902311707681, "grad_norm": 18.99873924255371, "learning_rate": 6.566423760861714e-07, "loss": 0.0438, "step": 205075 }, { "epoch": 2.941260827166867, "grad_norm": 3.470210552215576, "learning_rate": 6.526588625209799e-07, "loss": 0.1356, "step": 205100 }, { "epoch": 2.941619342626054, "grad_norm": 5.621897220611572, "learning_rate": 6.486753489557883e-07, "loss": 0.0582, "step": 205125 }, { "epoch": 2.9419778580852407, "grad_norm": 0.5561015605926514, "learning_rate": 6.446918353905968e-07, "loss": 0.0681, "step": 205150 }, { "epoch": 2.9423363735444275, "grad_norm": 12.901633262634277, "learning_rate": 6.407083218254052e-07, "loss": 0.0919, "step": 205175 }, { "epoch": 2.9426948890036138, "grad_norm": 2.60176682472229, "learning_rate": 6.367248082602138e-07, "loss": 0.0655, "step": 205200 }, { "epoch": 2.9430534044628005, "grad_norm": 4.241306781768799, "learning_rate": 6.327412946950223e-07, "loss": 0.1088, "step": 205225 }, { "epoch": 2.943411919921987, "grad_norm": 16.244121551513672, "learning_rate": 6.287577811298306e-07, "loss": 0.0564, "step": 205250 }, { "epoch": 2.9437704353811736, "grad_norm": 9.793615341186523, "learning_rate": 6.247742675646391e-07, "loss": 0.0544, "step": 205275 }, { "epoch": 2.9441289508403603, "grad_norm": 0.02309674397110939, "learning_rate": 6.207907539994477e-07, "loss": 0.0694, "step": 205300 }, { "epoch": 2.944487466299547, "grad_norm": 1.0247938632965088, "learning_rate": 6.168072404342561e-07, "loss": 0.1532, "step": 205325 }, { "epoch": 2.9448459817587334, "grad_norm": 7.357635974884033, "learning_rate": 6.128237268690646e-07, "loss": 0.0686, "step": 205350 }, { "epoch": 2.94520449721792, "grad_norm": 3.606300115585327, "learning_rate": 6.088402133038731e-07, "loss": 0.11, "step": 205375 }, { "epoch": 2.9455630126771064, "grad_norm": 2.807851791381836, "learning_rate": 6.048566997386815e-07, "loss": 0.0823, "step": 205400 }, { "epoch": 2.945921528136293, "grad_norm": 2.445861339569092, "learning_rate": 6.0087318617349e-07, "loss": 0.082, "step": 205425 }, { "epoch": 2.94628004359548, "grad_norm": 2.637619733810425, "learning_rate": 5.968896726082985e-07, "loss": 0.0606, "step": 205450 }, { "epoch": 2.9466385590546667, "grad_norm": 0.10430954396724701, "learning_rate": 5.92906159043107e-07, "loss": 0.109, "step": 205475 }, { "epoch": 2.946997074513853, "grad_norm": 0.4511021375656128, "learning_rate": 5.889226454779154e-07, "loss": 0.1154, "step": 205500 }, { "epoch": 2.9473555899730397, "grad_norm": 8.138222694396973, "learning_rate": 5.849391319127239e-07, "loss": 0.0553, "step": 205525 }, { "epoch": 2.947714105432226, "grad_norm": 0.3534753918647766, "learning_rate": 5.809556183475323e-07, "loss": 0.0874, "step": 205550 }, { "epoch": 2.9480726208914128, "grad_norm": 0.9535727500915527, "learning_rate": 5.769721047823409e-07, "loss": 0.1072, "step": 205575 }, { "epoch": 2.9484311363505995, "grad_norm": 5.029990196228027, "learning_rate": 5.729885912171494e-07, "loss": 0.0953, "step": 205600 }, { "epoch": 2.9487896518097862, "grad_norm": 2.320683002471924, "learning_rate": 5.690050776519577e-07, "loss": 0.0976, "step": 205625 }, { "epoch": 2.9491481672689726, "grad_norm": 0.25156086683273315, "learning_rate": 5.650215640867662e-07, "loss": 0.0973, "step": 205650 }, { "epoch": 2.9495066827281593, "grad_norm": 5.732408046722412, "learning_rate": 5.610380505215748e-07, "loss": 0.1249, "step": 205675 }, { "epoch": 2.9498651981873456, "grad_norm": 1.0657570362091064, "learning_rate": 5.570545369563832e-07, "loss": 0.0485, "step": 205700 }, { "epoch": 2.9502237136465324, "grad_norm": 7.534172534942627, "learning_rate": 5.530710233911917e-07, "loss": 0.1166, "step": 205725 }, { "epoch": 2.950582229105719, "grad_norm": 6.010597229003906, "learning_rate": 5.490875098260002e-07, "loss": 0.0932, "step": 205750 }, { "epoch": 2.950940744564906, "grad_norm": 1.2400457859039307, "learning_rate": 5.451039962608086e-07, "loss": 0.129, "step": 205775 }, { "epoch": 2.951299260024092, "grad_norm": 11.15288257598877, "learning_rate": 5.411204826956171e-07, "loss": 0.0874, "step": 205800 }, { "epoch": 2.951657775483279, "grad_norm": 0.5299756526947021, "learning_rate": 5.371369691304255e-07, "loss": 0.0876, "step": 205825 }, { "epoch": 2.952016290942465, "grad_norm": 6.9370903968811035, "learning_rate": 5.33153455565234e-07, "loss": 0.1408, "step": 205850 }, { "epoch": 2.952374806401652, "grad_norm": 6.072323799133301, "learning_rate": 5.291699420000425e-07, "loss": 0.0796, "step": 205875 }, { "epoch": 2.9527333218608387, "grad_norm": 0.4311206638813019, "learning_rate": 5.251864284348509e-07, "loss": 0.07, "step": 205900 }, { "epoch": 2.9530918373200254, "grad_norm": 0.4304375648498535, "learning_rate": 5.212029148696594e-07, "loss": 0.0495, "step": 205925 }, { "epoch": 2.9534503527792118, "grad_norm": 2.270953893661499, "learning_rate": 5.17219401304468e-07, "loss": 0.074, "step": 205950 }, { "epoch": 2.9538088682383985, "grad_norm": 1.6462745666503906, "learning_rate": 5.132358877392763e-07, "loss": 0.0372, "step": 205975 }, { "epoch": 2.954167383697585, "grad_norm": 0.16900666058063507, "learning_rate": 5.092523741740848e-07, "loss": 0.0616, "step": 206000 }, { "epoch": 2.9545258991567716, "grad_norm": 0.6038618683815002, "learning_rate": 5.052688606088934e-07, "loss": 0.0563, "step": 206025 }, { "epoch": 2.9548844146159583, "grad_norm": 21.775588989257812, "learning_rate": 5.012853470437018e-07, "loss": 0.0957, "step": 206050 }, { "epoch": 2.955242930075145, "grad_norm": 0.6194428205490112, "learning_rate": 4.973018334785103e-07, "loss": 0.0669, "step": 206075 }, { "epoch": 2.9556014455343314, "grad_norm": 0.73482745885849, "learning_rate": 4.933183199133187e-07, "loss": 0.0849, "step": 206100 }, { "epoch": 2.955959960993518, "grad_norm": 2.724799871444702, "learning_rate": 4.893348063481272e-07, "loss": 0.1226, "step": 206125 }, { "epoch": 2.9563184764527044, "grad_norm": 0.36235493421554565, "learning_rate": 4.853512927829357e-07, "loss": 0.0832, "step": 206150 }, { "epoch": 2.956676991911891, "grad_norm": 1.0853239297866821, "learning_rate": 4.813677792177442e-07, "loss": 0.0365, "step": 206175 }, { "epoch": 2.957035507371078, "grad_norm": 0.2988751530647278, "learning_rate": 4.773842656525526e-07, "loss": 0.0525, "step": 206200 }, { "epoch": 2.9573940228302646, "grad_norm": 0.19244548678398132, "learning_rate": 4.7340075208736114e-07, "loss": 0.0561, "step": 206225 }, { "epoch": 2.957752538289451, "grad_norm": 0.1685238629579544, "learning_rate": 4.694172385221696e-07, "loss": 0.0884, "step": 206250 }, { "epoch": 2.9581110537486377, "grad_norm": 11.527033805847168, "learning_rate": 4.654337249569781e-07, "loss": 0.033, "step": 206275 }, { "epoch": 2.958469569207824, "grad_norm": 3.8348491191864014, "learning_rate": 4.6145021139178653e-07, "loss": 0.0605, "step": 206300 }, { "epoch": 2.9588280846670107, "grad_norm": 8.794478416442871, "learning_rate": 4.57466697826595e-07, "loss": 0.0549, "step": 206325 }, { "epoch": 2.9591866001261975, "grad_norm": 3.133808135986328, "learning_rate": 4.534831842614035e-07, "loss": 0.0635, "step": 206350 }, { "epoch": 2.9595451155853842, "grad_norm": 21.772811889648438, "learning_rate": 4.4949967069621197e-07, "loss": 0.0779, "step": 206375 }, { "epoch": 2.9599036310445705, "grad_norm": 0.7049189805984497, "learning_rate": 4.455161571310204e-07, "loss": 0.0732, "step": 206400 }, { "epoch": 2.9602621465037573, "grad_norm": 0.38424184918403625, "learning_rate": 4.4153264356582893e-07, "loss": 0.0771, "step": 206425 }, { "epoch": 2.9606206619629436, "grad_norm": 6.946709156036377, "learning_rate": 4.3754913000063736e-07, "loss": 0.0627, "step": 206450 }, { "epoch": 2.9609791774221303, "grad_norm": 1.5764868259429932, "learning_rate": 4.3356561643544584e-07, "loss": 0.1196, "step": 206475 }, { "epoch": 2.961337692881317, "grad_norm": 1.0721585750579834, "learning_rate": 4.295821028702543e-07, "loss": 0.1356, "step": 206500 }, { "epoch": 2.961696208340504, "grad_norm": 9.229329109191895, "learning_rate": 4.255985893050628e-07, "loss": 0.041, "step": 206525 }, { "epoch": 2.96205472379969, "grad_norm": 0.019280755892395973, "learning_rate": 4.216150757398713e-07, "loss": 0.0633, "step": 206550 }, { "epoch": 2.962413239258877, "grad_norm": 1.8850892782211304, "learning_rate": 4.1763156217467976e-07, "loss": 0.0646, "step": 206575 }, { "epoch": 2.962771754718063, "grad_norm": 0.40623486042022705, "learning_rate": 4.1364804860948824e-07, "loss": 0.172, "step": 206600 }, { "epoch": 2.96313027017725, "grad_norm": 3.9384872913360596, "learning_rate": 4.0966453504429666e-07, "loss": 0.068, "step": 206625 }, { "epoch": 2.9634887856364367, "grad_norm": 3.3635761737823486, "learning_rate": 4.0568102147910514e-07, "loss": 0.139, "step": 206650 }, { "epoch": 2.9638473010956234, "grad_norm": 0.637531578540802, "learning_rate": 4.016975079139136e-07, "loss": 0.0512, "step": 206675 }, { "epoch": 2.9642058165548097, "grad_norm": 0.43770018219947815, "learning_rate": 3.977139943487221e-07, "loss": 0.099, "step": 206700 }, { "epoch": 2.9645643320139965, "grad_norm": 0.3626309037208557, "learning_rate": 3.9373048078353053e-07, "loss": 0.0653, "step": 206725 }, { "epoch": 2.964922847473183, "grad_norm": 1.1204755306243896, "learning_rate": 3.8974696721833906e-07, "loss": 0.0637, "step": 206750 }, { "epoch": 2.9652813629323695, "grad_norm": 9.26832389831543, "learning_rate": 3.857634536531475e-07, "loss": 0.1075, "step": 206775 }, { "epoch": 2.9656398783915563, "grad_norm": 0.8820796608924866, "learning_rate": 3.8177994008795597e-07, "loss": 0.035, "step": 206800 }, { "epoch": 2.965998393850743, "grad_norm": 0.3144121468067169, "learning_rate": 3.777964265227645e-07, "loss": 0.1244, "step": 206825 }, { "epoch": 2.9663569093099293, "grad_norm": 0.10953253507614136, "learning_rate": 3.7381291295757293e-07, "loss": 0.0753, "step": 206850 }, { "epoch": 2.966715424769116, "grad_norm": 1.1803072690963745, "learning_rate": 3.698293993923814e-07, "loss": 0.0698, "step": 206875 }, { "epoch": 2.9670739402283024, "grad_norm": 2.869842767715454, "learning_rate": 3.658458858271899e-07, "loss": 0.0703, "step": 206900 }, { "epoch": 2.967432455687489, "grad_norm": 3.3082845211029053, "learning_rate": 3.6186237226199837e-07, "loss": 0.0561, "step": 206925 }, { "epoch": 2.967790971146676, "grad_norm": 1.79576575756073, "learning_rate": 3.578788586968068e-07, "loss": 0.0942, "step": 206950 }, { "epoch": 2.9681494866058626, "grad_norm": 8.177495956420898, "learning_rate": 3.5389534513161533e-07, "loss": 0.0741, "step": 206975 }, { "epoch": 2.968508002065049, "grad_norm": 0.28549015522003174, "learning_rate": 3.4991183156642376e-07, "loss": 0.0878, "step": 207000 }, { "epoch": 2.9688665175242357, "grad_norm": 14.144893646240234, "learning_rate": 3.4592831800123224e-07, "loss": 0.1279, "step": 207025 }, { "epoch": 2.969225032983422, "grad_norm": 0.1835658848285675, "learning_rate": 3.419448044360407e-07, "loss": 0.067, "step": 207050 }, { "epoch": 2.9695835484426087, "grad_norm": 1.8747442960739136, "learning_rate": 3.379612908708492e-07, "loss": 0.0322, "step": 207075 }, { "epoch": 2.9699420639017955, "grad_norm": 16.58586311340332, "learning_rate": 3.339777773056577e-07, "loss": 0.0596, "step": 207100 }, { "epoch": 2.9703005793609822, "grad_norm": 14.86831283569336, "learning_rate": 3.2999426374046616e-07, "loss": 0.0618, "step": 207125 }, { "epoch": 2.9706590948201685, "grad_norm": 2.6311607360839844, "learning_rate": 3.2601075017527464e-07, "loss": 0.0615, "step": 207150 }, { "epoch": 2.9710176102793553, "grad_norm": 9.420514106750488, "learning_rate": 3.2202723661008306e-07, "loss": 0.1509, "step": 207175 }, { "epoch": 2.9713761257385416, "grad_norm": 21.01744842529297, "learning_rate": 3.1804372304489154e-07, "loss": 0.0765, "step": 207200 }, { "epoch": 2.9717346411977283, "grad_norm": 2.655179500579834, "learning_rate": 3.140602094797e-07, "loss": 0.0636, "step": 207225 }, { "epoch": 2.972093156656915, "grad_norm": 1.5435760021209717, "learning_rate": 3.100766959145085e-07, "loss": 0.0362, "step": 207250 }, { "epoch": 2.972451672116102, "grad_norm": 0.7712477445602417, "learning_rate": 3.0609318234931693e-07, "loss": 0.078, "step": 207275 }, { "epoch": 2.972810187575288, "grad_norm": 15.514156341552734, "learning_rate": 3.0210966878412546e-07, "loss": 0.0828, "step": 207300 }, { "epoch": 2.973168703034475, "grad_norm": 1.7985066175460815, "learning_rate": 2.981261552189339e-07, "loss": 0.0541, "step": 207325 }, { "epoch": 2.973527218493661, "grad_norm": 0.35727760195732117, "learning_rate": 2.9414264165374237e-07, "loss": 0.0585, "step": 207350 }, { "epoch": 2.973885733952848, "grad_norm": 0.695503294467926, "learning_rate": 2.9015912808855085e-07, "loss": 0.0647, "step": 207375 }, { "epoch": 2.9742442494120347, "grad_norm": 10.734185218811035, "learning_rate": 2.8617561452335933e-07, "loss": 0.0871, "step": 207400 }, { "epoch": 2.9746027648712214, "grad_norm": 4.863291263580322, "learning_rate": 2.821921009581678e-07, "loss": 0.0882, "step": 207425 }, { "epoch": 2.9749612803304077, "grad_norm": 3.944349527359009, "learning_rate": 2.782085873929763e-07, "loss": 0.1265, "step": 207450 }, { "epoch": 2.9753197957895945, "grad_norm": 1.9465038776397705, "learning_rate": 2.7422507382778477e-07, "loss": 0.1007, "step": 207475 }, { "epoch": 2.975678311248781, "grad_norm": 0.2755599915981293, "learning_rate": 2.702415602625932e-07, "loss": 0.0569, "step": 207500 }, { "epoch": 2.9760368267079675, "grad_norm": 0.3037880063056946, "learning_rate": 2.6625804669740173e-07, "loss": 0.1149, "step": 207525 }, { "epoch": 2.9763953421671543, "grad_norm": 0.16527371108531952, "learning_rate": 2.6227453313221015e-07, "loss": 0.08, "step": 207550 }, { "epoch": 2.976753857626341, "grad_norm": 0.31051895022392273, "learning_rate": 2.5829101956701863e-07, "loss": 0.1038, "step": 207575 }, { "epoch": 2.9771123730855273, "grad_norm": 0.23071855306625366, "learning_rate": 2.543075060018271e-07, "loss": 0.1205, "step": 207600 }, { "epoch": 2.977470888544714, "grad_norm": 0.45868629217147827, "learning_rate": 2.503239924366356e-07, "loss": 0.0811, "step": 207625 }, { "epoch": 2.9778294040039004, "grad_norm": 0.0871330052614212, "learning_rate": 2.46340478871444e-07, "loss": 0.0485, "step": 207650 }, { "epoch": 2.978187919463087, "grad_norm": 2.2569456100463867, "learning_rate": 2.4235696530625255e-07, "loss": 0.0745, "step": 207675 }, { "epoch": 2.978546434922274, "grad_norm": 3.2263967990875244, "learning_rate": 2.38373451741061e-07, "loss": 0.0733, "step": 207700 }, { "epoch": 2.9789049503814606, "grad_norm": 0.07793991267681122, "learning_rate": 2.3438993817586946e-07, "loss": 0.0976, "step": 207725 }, { "epoch": 2.979263465840647, "grad_norm": 2.4181501865386963, "learning_rate": 2.3040642461067794e-07, "loss": 0.0857, "step": 207750 }, { "epoch": 2.9796219812998337, "grad_norm": 0.6517219543457031, "learning_rate": 2.2642291104548642e-07, "loss": 0.1033, "step": 207775 }, { "epoch": 2.97998049675902, "grad_norm": 2.681511163711548, "learning_rate": 2.2243939748029487e-07, "loss": 0.0646, "step": 207800 }, { "epoch": 2.9803390122182067, "grad_norm": 1.0396250486373901, "learning_rate": 2.1845588391510338e-07, "loss": 0.0933, "step": 207825 }, { "epoch": 2.9806975276773935, "grad_norm": 16.994354248046875, "learning_rate": 2.1447237034991183e-07, "loss": 0.0721, "step": 207850 }, { "epoch": 2.9810560431365802, "grad_norm": 19.253253936767578, "learning_rate": 2.1048885678472031e-07, "loss": 0.124, "step": 207875 }, { "epoch": 2.9814145585957665, "grad_norm": 0.008240513503551483, "learning_rate": 2.065053432195288e-07, "loss": 0.1014, "step": 207900 }, { "epoch": 2.9817730740549533, "grad_norm": 3.0799551010131836, "learning_rate": 2.0252182965433725e-07, "loss": 0.1134, "step": 207925 }, { "epoch": 2.9821315895141396, "grad_norm": 0.22011218965053558, "learning_rate": 1.9853831608914573e-07, "loss": 0.0622, "step": 207950 }, { "epoch": 2.9824901049733263, "grad_norm": 2.4745521545410156, "learning_rate": 1.945548025239542e-07, "loss": 0.0734, "step": 207975 }, { "epoch": 2.982848620432513, "grad_norm": 1.7012921571731567, "learning_rate": 1.9057128895876266e-07, "loss": 0.0979, "step": 208000 }, { "epoch": 2.9832071358917, "grad_norm": 2.072309970855713, "learning_rate": 1.8658777539357114e-07, "loss": 0.0654, "step": 208025 }, { "epoch": 2.983565651350886, "grad_norm": 0.24343881011009216, "learning_rate": 1.8260426182837962e-07, "loss": 0.0967, "step": 208050 }, { "epoch": 2.983924166810073, "grad_norm": 3.0550918579101562, "learning_rate": 1.7862074826318807e-07, "loss": 0.1094, "step": 208075 }, { "epoch": 2.984282682269259, "grad_norm": 4.231513977050781, "learning_rate": 1.7463723469799658e-07, "loss": 0.0629, "step": 208100 }, { "epoch": 2.984641197728446, "grad_norm": 0.2699524760246277, "learning_rate": 1.7065372113280503e-07, "loss": 0.029, "step": 208125 }, { "epoch": 2.9849997131876327, "grad_norm": 0.9386650323867798, "learning_rate": 1.666702075676135e-07, "loss": 0.1481, "step": 208150 }, { "epoch": 2.9853582286468194, "grad_norm": 0.16538187861442566, "learning_rate": 1.62686694002422e-07, "loss": 0.1332, "step": 208175 }, { "epoch": 2.9857167441060057, "grad_norm": 3.144259452819824, "learning_rate": 1.5870318043723045e-07, "loss": 0.1092, "step": 208200 }, { "epoch": 2.9860752595651925, "grad_norm": 12.549966812133789, "learning_rate": 1.5471966687203893e-07, "loss": 0.0771, "step": 208225 }, { "epoch": 2.986433775024379, "grad_norm": 7.304028034210205, "learning_rate": 1.507361533068474e-07, "loss": 0.0502, "step": 208250 }, { "epoch": 2.9867922904835655, "grad_norm": 1.638700008392334, "learning_rate": 1.4675263974165586e-07, "loss": 0.0565, "step": 208275 }, { "epoch": 2.9871508059427523, "grad_norm": 4.026493072509766, "learning_rate": 1.4276912617646434e-07, "loss": 0.0607, "step": 208300 }, { "epoch": 2.987509321401939, "grad_norm": 10.72415542602539, "learning_rate": 1.3878561261127282e-07, "loss": 0.0827, "step": 208325 }, { "epoch": 2.9878678368611253, "grad_norm": 0.032012294977903366, "learning_rate": 1.3480209904608127e-07, "loss": 0.0895, "step": 208350 }, { "epoch": 2.988226352320312, "grad_norm": 1.5659688711166382, "learning_rate": 1.3081858548088975e-07, "loss": 0.0732, "step": 208375 }, { "epoch": 2.9885848677794984, "grad_norm": 0.26062536239624023, "learning_rate": 1.2683507191569823e-07, "loss": 0.0731, "step": 208400 }, { "epoch": 2.988943383238685, "grad_norm": 0.2648525536060333, "learning_rate": 1.228515583505067e-07, "loss": 0.0882, "step": 208425 }, { "epoch": 2.989301898697872, "grad_norm": 0.23668479919433594, "learning_rate": 1.1886804478531518e-07, "loss": 0.0678, "step": 208450 }, { "epoch": 2.9896604141570586, "grad_norm": 5.549989700317383, "learning_rate": 1.1488453122012366e-07, "loss": 0.0711, "step": 208475 }, { "epoch": 2.990018929616245, "grad_norm": 2.4187347888946533, "learning_rate": 1.1090101765493213e-07, "loss": 0.0748, "step": 208500 }, { "epoch": 2.9903774450754317, "grad_norm": 0.09225217252969742, "learning_rate": 1.0691750408974059e-07, "loss": 0.0617, "step": 208525 }, { "epoch": 2.990735960534618, "grad_norm": 1.1576809883117676, "learning_rate": 1.0293399052454907e-07, "loss": 0.1365, "step": 208550 }, { "epoch": 2.9910944759938047, "grad_norm": 0.29273560643196106, "learning_rate": 9.895047695935754e-08, "loss": 0.0805, "step": 208575 }, { "epoch": 2.9914529914529915, "grad_norm": 0.42837631702423096, "learning_rate": 9.496696339416602e-08, "loss": 0.0515, "step": 208600 }, { "epoch": 2.9918115069121782, "grad_norm": 0.022002125158905983, "learning_rate": 9.098344982897449e-08, "loss": 0.0493, "step": 208625 }, { "epoch": 2.9921700223713645, "grad_norm": 10.927491188049316, "learning_rate": 8.699993626378296e-08, "loss": 0.1235, "step": 208650 }, { "epoch": 2.9925285378305513, "grad_norm": 6.809746742248535, "learning_rate": 8.301642269859143e-08, "loss": 0.0262, "step": 208675 }, { "epoch": 2.9928870532897376, "grad_norm": 0.23406508564949036, "learning_rate": 7.90329091333999e-08, "loss": 0.0604, "step": 208700 }, { "epoch": 2.9932455687489243, "grad_norm": 2.529329776763916, "learning_rate": 7.504939556820838e-08, "loss": 0.1332, "step": 208725 }, { "epoch": 2.993604084208111, "grad_norm": 0.6248497366905212, "learning_rate": 7.106588200301686e-08, "loss": 0.0843, "step": 208750 }, { "epoch": 2.993962599667298, "grad_norm": 0.12870603799819946, "learning_rate": 6.708236843782532e-08, "loss": 0.0471, "step": 208775 }, { "epoch": 2.994321115126484, "grad_norm": 13.371145248413086, "learning_rate": 6.309885487263379e-08, "loss": 0.1138, "step": 208800 }, { "epoch": 2.994679630585671, "grad_norm": 0.5307474732398987, "learning_rate": 5.911534130744227e-08, "loss": 0.0727, "step": 208825 }, { "epoch": 2.995038146044857, "grad_norm": 0.9232656359672546, "learning_rate": 5.513182774225074e-08, "loss": 0.1825, "step": 208850 }, { "epoch": 2.995396661504044, "grad_norm": 1.194833517074585, "learning_rate": 5.114831417705921e-08, "loss": 0.0646, "step": 208875 }, { "epoch": 2.9957551769632307, "grad_norm": 6.368671894073486, "learning_rate": 4.7164800611867684e-08, "loss": 0.0592, "step": 208900 }, { "epoch": 2.9961136924224174, "grad_norm": 0.32219234108924866, "learning_rate": 4.318128704667616e-08, "loss": 0.0875, "step": 208925 }, { "epoch": 2.9964722078816037, "grad_norm": 15.352099418640137, "learning_rate": 3.919777348148463e-08, "loss": 0.058, "step": 208950 }, { "epoch": 2.9968307233407905, "grad_norm": 3.433809280395508, "learning_rate": 3.5214259916293104e-08, "loss": 0.0605, "step": 208975 }, { "epoch": 2.997189238799977, "grad_norm": 14.743083000183105, "learning_rate": 3.123074635110158e-08, "loss": 0.0947, "step": 209000 }, { "epoch": 2.9975477542591635, "grad_norm": 3.961277961730957, "learning_rate": 2.7247232785910047e-08, "loss": 0.1296, "step": 209025 }, { "epoch": 2.9979062697183503, "grad_norm": 0.39265793561935425, "learning_rate": 2.326371922071852e-08, "loss": 0.1953, "step": 209050 }, { "epoch": 2.998264785177537, "grad_norm": 32.515743255615234, "learning_rate": 1.928020565552699e-08, "loss": 0.0723, "step": 209075 }, { "epoch": 2.9986233006367233, "grad_norm": 0.7974006533622742, "learning_rate": 1.5296692090335467e-08, "loss": 0.1464, "step": 209100 }, { "epoch": 2.99898181609591, "grad_norm": 8.061833381652832, "learning_rate": 1.1313178525143937e-08, "loss": 0.0794, "step": 209125 }, { "epoch": 2.9993403315550964, "grad_norm": 0.1935645043849945, "learning_rate": 7.32966495995241e-09, "loss": 0.0672, "step": 209150 }, { "epoch": 2.999698847014283, "grad_norm": 1.3035776615142822, "learning_rate": 3.346151394760883e-09, "loss": 0.0631, "step": 209175 }, { "epoch": 3.0, "eval_cosine_accuracy": 0.9600546780072904, "eval_loss": 0.20218822360038757, "eval_runtime": 248.9175, "eval_samples_per_second": 26.451, "eval_steps_per_second": 1.655, "step": 209196 } ], "logging_steps": 25, "max_steps": 209196, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }