{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1626, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024622960911049553, "grad_norm": 1.5390625, "learning_rate": 2.040816326530612e-10, "loss": 1.3865270614624023, "step": 2 }, { "epoch": 0.0049245921822099106, "grad_norm": 4.375, "learning_rate": 6.122448979591837e-10, "loss": 1.8760377168655396, "step": 4 }, { "epoch": 0.007386888273314866, "grad_norm": 2.359375, "learning_rate": 1.020408163265306e-09, "loss": 1.1314038038253784, "step": 6 }, { "epoch": 0.009849184364419821, "grad_norm": 5.71875, "learning_rate": 1.4285714285714286e-09, "loss": 1.8253700733184814, "step": 8 }, { "epoch": 0.012311480455524777, "grad_norm": 12.625, "learning_rate": 1.8367346938775511e-09, "loss": 2.2051210403442383, "step": 10 }, { "epoch": 0.014773776546629732, "grad_norm": 20.375, "learning_rate": 2.2448979591836736e-09, "loss": 2.4439101219177246, "step": 12 }, { "epoch": 0.017236072637734686, "grad_norm": 3.578125, "learning_rate": 2.653061224489796e-09, "loss": 1.3878843784332275, "step": 14 }, { "epoch": 0.019698368728839642, "grad_norm": 1.765625, "learning_rate": 3.0612244897959187e-09, "loss": 1.1822748184204102, "step": 16 }, { "epoch": 0.0221606648199446, "grad_norm": 2.53125, "learning_rate": 3.4693877551020408e-09, "loss": 1.1794735193252563, "step": 18 }, { "epoch": 0.024622960911049555, "grad_norm": 14.625, "learning_rate": 3.877551020408163e-09, "loss": 2.3212547302246094, "step": 20 }, { "epoch": 0.02708525700215451, "grad_norm": 5.625, "learning_rate": 4.285714285714286e-09, "loss": 1.7700073719024658, "step": 22 }, { "epoch": 0.029547553093259463, "grad_norm": 14.25, "learning_rate": 4.693877551020409e-09, "loss": 2.191647529602051, "step": 24 }, { "epoch": 0.03200984918436442, "grad_norm": 4.15625, "learning_rate": 5.102040816326531e-09, "loss": 1.7301385402679443, "step": 26 }, { "epoch": 0.03447214527546937, "grad_norm": 14.1875, "learning_rate": 5.510204081632653e-09, "loss": 2.343463659286499, "step": 28 }, { "epoch": 0.03693444136657433, "grad_norm": 5.90625, "learning_rate": 5.918367346938776e-09, "loss": 1.2581849098205566, "step": 30 }, { "epoch": 0.039396737457679284, "grad_norm": 5.1875, "learning_rate": 6.326530612244899e-09, "loss": 1.9037660360336304, "step": 32 }, { "epoch": 0.041859033548784244, "grad_norm": 6.25, "learning_rate": 6.73469387755102e-09, "loss": 1.8926417827606201, "step": 34 }, { "epoch": 0.0443213296398892, "grad_norm": 4.15625, "learning_rate": 7.142857142857143e-09, "loss": 1.494161605834961, "step": 36 }, { "epoch": 0.04678362573099415, "grad_norm": 72.5, "learning_rate": 7.551020408163264e-09, "loss": 2.4310765266418457, "step": 38 }, { "epoch": 0.04924592182209911, "grad_norm": 13.1875, "learning_rate": 7.959183673469387e-09, "loss": 2.401200294494629, "step": 40 }, { "epoch": 0.05170821791320406, "grad_norm": 17.875, "learning_rate": 8.36734693877551e-09, "loss": 2.269543170928955, "step": 42 }, { "epoch": 0.05417051400430902, "grad_norm": 6.375, "learning_rate": 8.775510204081633e-09, "loss": 1.880392074584961, "step": 44 }, { "epoch": 0.056632810095413974, "grad_norm": 10.9375, "learning_rate": 9.183673469387756e-09, "loss": 2.2891359329223633, "step": 46 }, { "epoch": 0.05909510618651893, "grad_norm": 2.953125, "learning_rate": 9.591836734693877e-09, "loss": 1.245388150215149, "step": 48 }, { "epoch": 0.061557402277623886, "grad_norm": 14.0625, "learning_rate": 1e-08, "loss": 1.8519728183746338, "step": 50 }, { "epoch": 0.06401969836872884, "grad_norm": 12.3125, "learning_rate": 9.99996825131286e-09, "loss": 2.678940773010254, "step": 52 }, { "epoch": 0.0664819944598338, "grad_norm": 13.4375, "learning_rate": 9.999873005755431e-09, "loss": 2.3168435096740723, "step": 54 }, { "epoch": 0.06894429055093874, "grad_norm": 23.125, "learning_rate": 9.999714264839672e-09, "loss": 2.218395233154297, "step": 56 }, { "epoch": 0.0714065866420437, "grad_norm": 3.265625, "learning_rate": 9.999492031085492e-09, "loss": 1.2967658042907715, "step": 58 }, { "epoch": 0.07386888273314866, "grad_norm": 8.4375, "learning_rate": 9.999206308020707e-09, "loss": 2.0597116947174072, "step": 60 }, { "epoch": 0.07633117882425362, "grad_norm": 3.984375, "learning_rate": 9.99885710018098e-09, "loss": 1.6437733173370361, "step": 62 }, { "epoch": 0.07879347491535857, "grad_norm": 6.9375, "learning_rate": 9.99844441310976e-09, "loss": 1.878865122795105, "step": 64 }, { "epoch": 0.08125577100646353, "grad_norm": 5.34375, "learning_rate": 9.997968253358178e-09, "loss": 1.8909335136413574, "step": 66 }, { "epoch": 0.08371806709756849, "grad_norm": 15.9375, "learning_rate": 9.997428628484963e-09, "loss": 2.290242910385132, "step": 68 }, { "epoch": 0.08618036318867343, "grad_norm": 8.9375, "learning_rate": 9.996825547056302e-09, "loss": 2.0678482055664062, "step": 70 }, { "epoch": 0.0886426592797784, "grad_norm": 5.75, "learning_rate": 9.996159018645721e-09, "loss": 1.8928303718566895, "step": 72 }, { "epoch": 0.09110495537088335, "grad_norm": 7.53125, "learning_rate": 9.995429053833917e-09, "loss": 1.9023447036743164, "step": 74 }, { "epoch": 0.0935672514619883, "grad_norm": 7.59375, "learning_rate": 9.994635664208602e-09, "loss": 1.914489507675171, "step": 76 }, { "epoch": 0.09602954755309326, "grad_norm": 11.125, "learning_rate": 9.99377886236432e-09, "loss": 2.057431221008301, "step": 78 }, { "epoch": 0.09849184364419822, "grad_norm": 7.21875, "learning_rate": 9.992858661902233e-09, "loss": 1.9636759757995605, "step": 80 }, { "epoch": 0.10095413973530316, "grad_norm": 4.15625, "learning_rate": 9.99187507742992e-09, "loss": 1.298654317855835, "step": 82 }, { "epoch": 0.10341643582640812, "grad_norm": 2.953125, "learning_rate": 9.990828124561143e-09, "loss": 1.1845377683639526, "step": 84 }, { "epoch": 0.10587873191751308, "grad_norm": 12.0625, "learning_rate": 9.989717819915584e-09, "loss": 2.3120527267456055, "step": 86 }, { "epoch": 0.10834102800861804, "grad_norm": 6.75, "learning_rate": 9.988544181118608e-09, "loss": 1.792182445526123, "step": 88 }, { "epoch": 0.11080332409972299, "grad_norm": 4.03125, "learning_rate": 9.987307226800957e-09, "loss": 1.4169440269470215, "step": 90 }, { "epoch": 0.11326562019082795, "grad_norm": 19.375, "learning_rate": 9.98600697659847e-09, "loss": 2.2629003524780273, "step": 92 }, { "epoch": 0.11572791628193291, "grad_norm": 4.65625, "learning_rate": 9.984643451151764e-09, "loss": 1.8561232089996338, "step": 94 }, { "epoch": 0.11819021237303785, "grad_norm": 5.03125, "learning_rate": 9.98321667210591e-09, "loss": 1.8327598571777344, "step": 96 }, { "epoch": 0.12065250846414281, "grad_norm": 3.34375, "learning_rate": 9.98172666211009e-09, "loss": 1.2463821172714233, "step": 98 }, { "epoch": 0.12311480455524777, "grad_norm": 6.375, "learning_rate": 9.980173444817238e-09, "loss": 1.351346731185913, "step": 100 }, { "epoch": 0.12557710064635272, "grad_norm": 4.0625, "learning_rate": 9.978557044883651e-09, "loss": 1.2666093111038208, "step": 102 }, { "epoch": 0.12803939673745768, "grad_norm": 4.78125, "learning_rate": 9.976877487968623e-09, "loss": 1.905246615409851, "step": 104 }, { "epoch": 0.13050169282856264, "grad_norm": 3.0, "learning_rate": 9.975134800734015e-09, "loss": 1.1379789113998413, "step": 106 }, { "epoch": 0.1329639889196676, "grad_norm": 4.65625, "learning_rate": 9.973329010843847e-09, "loss": 1.8731987476348877, "step": 108 }, { "epoch": 0.13542628501077256, "grad_norm": 10.3125, "learning_rate": 9.97146014696384e-09, "loss": 1.897504448890686, "step": 110 }, { "epoch": 0.1378885811018775, "grad_norm": 2.375, "learning_rate": 9.96952823876099e-09, "loss": 1.1055809259414673, "step": 112 }, { "epoch": 0.14035087719298245, "grad_norm": 14.5625, "learning_rate": 9.967533316903066e-09, "loss": 2.4285759925842285, "step": 114 }, { "epoch": 0.1428131732840874, "grad_norm": 6.0625, "learning_rate": 9.965475413058142e-09, "loss": 1.8401623964309692, "step": 116 }, { "epoch": 0.14527546937519237, "grad_norm": 3.625, "learning_rate": 9.963354559894099e-09, "loss": 1.2698298692703247, "step": 118 }, { "epoch": 0.14773776546629733, "grad_norm": 1.6875, "learning_rate": 9.961170791078078e-09, "loss": 1.1040065288543701, "step": 120 }, { "epoch": 0.1502000615574023, "grad_norm": 16.375, "learning_rate": 9.958924141275982e-09, "loss": 1.8983745574951172, "step": 122 }, { "epoch": 0.15266235764850725, "grad_norm": 5.125, "learning_rate": 9.956614646151903e-09, "loss": 1.9957232475280762, "step": 124 }, { "epoch": 0.15512465373961218, "grad_norm": 12.75, "learning_rate": 9.954242342367555e-09, "loss": 2.3904964923858643, "step": 126 }, { "epoch": 0.15758694983071714, "grad_norm": 5.0625, "learning_rate": 9.951807267581707e-09, "loss": 1.8866188526153564, "step": 128 }, { "epoch": 0.1600492459218221, "grad_norm": 1.984375, "learning_rate": 9.94930946044957e-09, "loss": 1.2808419466018677, "step": 130 }, { "epoch": 0.16251154201292706, "grad_norm": 2.484375, "learning_rate": 9.946748960622197e-09, "loss": 1.3167526721954346, "step": 132 }, { "epoch": 0.16497383810403202, "grad_norm": 2.4375, "learning_rate": 9.944125808745837e-09, "loss": 1.2127764225006104, "step": 134 }, { "epoch": 0.16743613419513698, "grad_norm": 4.5, "learning_rate": 9.941440046461305e-09, "loss": 1.9335191249847412, "step": 136 }, { "epoch": 0.1698984302862419, "grad_norm": 9.0, "learning_rate": 9.938691716403316e-09, "loss": 1.9803462028503418, "step": 138 }, { "epoch": 0.17236072637734687, "grad_norm": 4.65625, "learning_rate": 9.935880862199809e-09, "loss": 1.820433259010315, "step": 140 }, { "epoch": 0.17482302246845183, "grad_norm": 5.78125, "learning_rate": 9.93300752847124e-09, "loss": 1.9337809085845947, "step": 142 }, { "epoch": 0.1772853185595568, "grad_norm": 5.28125, "learning_rate": 9.930071760829904e-09, "loss": 1.8973931074142456, "step": 144 }, { "epoch": 0.17974761465066175, "grad_norm": 5.40625, "learning_rate": 9.927073605879185e-09, "loss": 1.9531124830245972, "step": 146 }, { "epoch": 0.1822099107417667, "grad_norm": 5.75, "learning_rate": 9.924013111212818e-09, "loss": 1.9310762882232666, "step": 148 }, { "epoch": 0.18467220683287167, "grad_norm": 9.375, "learning_rate": 9.920890325414153e-09, "loss": 2.008820056915283, "step": 150 }, { "epoch": 0.1871345029239766, "grad_norm": 82.5, "learning_rate": 9.917705298055361e-09, "loss": 3.0185141563415527, "step": 152 }, { "epoch": 0.18959679901508156, "grad_norm": 8.625, "learning_rate": 9.914458079696664e-09, "loss": 2.008962631225586, "step": 154 }, { "epoch": 0.19205909510618652, "grad_norm": 9.25, "learning_rate": 9.91114872188552e-09, "loss": 1.6197317838668823, "step": 156 }, { "epoch": 0.19452139119729148, "grad_norm": 4.53125, "learning_rate": 9.907777277155811e-09, "loss": 1.8305246829986572, "step": 158 }, { "epoch": 0.19698368728839644, "grad_norm": 9.75, "learning_rate": 9.904343799027012e-09, "loss": 1.9033877849578857, "step": 160 }, { "epoch": 0.1994459833795014, "grad_norm": 8.5, "learning_rate": 9.90084834200333e-09, "loss": 1.9224884510040283, "step": 162 }, { "epoch": 0.20190827947060633, "grad_norm": 5.5, "learning_rate": 9.897290961572854e-09, "loss": 1.5109963417053223, "step": 164 }, { "epoch": 0.2043705755617113, "grad_norm": 6.0625, "learning_rate": 9.893671714206662e-09, "loss": 1.9377520084381104, "step": 166 }, { "epoch": 0.20683287165281625, "grad_norm": 5.03125, "learning_rate": 9.889990657357933e-09, "loss": 1.6958491802215576, "step": 168 }, { "epoch": 0.2092951677439212, "grad_norm": 5.1875, "learning_rate": 9.886247849461023e-09, "loss": 1.320851445198059, "step": 170 }, { "epoch": 0.21175746383502617, "grad_norm": 17.375, "learning_rate": 9.882443349930552e-09, "loss": 2.529175281524658, "step": 172 }, { "epoch": 0.21421975992613113, "grad_norm": 5.53125, "learning_rate": 9.878577219160456e-09, "loss": 1.9636085033416748, "step": 174 }, { "epoch": 0.21668205601723609, "grad_norm": 5.84375, "learning_rate": 9.87464951852302e-09, "loss": 1.9693580865859985, "step": 176 }, { "epoch": 0.21914435210834102, "grad_norm": 8.5, "learning_rate": 9.870660310367915e-09, "loss": 1.955024242401123, "step": 178 }, { "epoch": 0.22160664819944598, "grad_norm": 11.5, "learning_rate": 9.866609658021202e-09, "loss": 2.3577377796173096, "step": 180 }, { "epoch": 0.22406894429055094, "grad_norm": 14.1875, "learning_rate": 9.862497625784324e-09, "loss": 2.3302321434020996, "step": 182 }, { "epoch": 0.2265312403816559, "grad_norm": 5.40625, "learning_rate": 9.8583242789331e-09, "loss": 1.872032642364502, "step": 184 }, { "epoch": 0.22899353647276086, "grad_norm": 8.1875, "learning_rate": 9.854089683716666e-09, "loss": 1.9843339920043945, "step": 186 }, { "epoch": 0.23145583256386582, "grad_norm": 6.375, "learning_rate": 9.849793907356444e-09, "loss": 1.8600096702575684, "step": 188 }, { "epoch": 0.23391812865497075, "grad_norm": 11.0, "learning_rate": 9.845437018045063e-09, "loss": 2.281198024749756, "step": 190 }, { "epoch": 0.2363804247460757, "grad_norm": 4.34375, "learning_rate": 9.841019084945281e-09, "loss": 1.8489793539047241, "step": 192 }, { "epoch": 0.23884272083718067, "grad_norm": 4.40625, "learning_rate": 9.836540178188888e-09, "loss": 1.8184915781021118, "step": 194 }, { "epoch": 0.24130501692828563, "grad_norm": 39.5, "learning_rate": 9.832000368875586e-09, "loss": 2.5119130611419678, "step": 196 }, { "epoch": 0.24376731301939059, "grad_norm": 42.25, "learning_rate": 9.82739972907187e-09, "loss": 1.7983183860778809, "step": 198 }, { "epoch": 0.24622960911049555, "grad_norm": 7.9375, "learning_rate": 9.822738331809873e-09, "loss": 1.8701186180114746, "step": 200 }, { "epoch": 0.2486919052016005, "grad_norm": 10.8125, "learning_rate": 9.818016251086222e-09, "loss": 2.0227789878845215, "step": 202 }, { "epoch": 0.25115420129270544, "grad_norm": 9.625, "learning_rate": 9.813233561860844e-09, "loss": 2.185953140258789, "step": 204 }, { "epoch": 0.2536164973838104, "grad_norm": 5.0, "learning_rate": 9.808390340055792e-09, "loss": 1.850534439086914, "step": 206 }, { "epoch": 0.25607879347491536, "grad_norm": 5.125, "learning_rate": 9.803486662554038e-09, "loss": 1.9469786882400513, "step": 208 }, { "epoch": 0.2585410895660203, "grad_norm": 4.03125, "learning_rate": 9.798522607198235e-09, "loss": 1.7527638673782349, "step": 210 }, { "epoch": 0.2610033856571253, "grad_norm": 4.75, "learning_rate": 9.79349825278951e-09, "loss": 1.9203780889511108, "step": 212 }, { "epoch": 0.2634656817482302, "grad_norm": 4.53125, "learning_rate": 9.788413679086188e-09, "loss": 1.8700388669967651, "step": 214 }, { "epoch": 0.2659279778393352, "grad_norm": 5.78125, "learning_rate": 9.783268966802539e-09, "loss": 2.030698299407959, "step": 216 }, { "epoch": 0.2683902739304401, "grad_norm": 7.375, "learning_rate": 9.778064197607495e-09, "loss": 1.936469316482544, "step": 218 }, { "epoch": 0.2708525700215451, "grad_norm": 18.875, "learning_rate": 9.772799454123349e-09, "loss": 2.471208095550537, "step": 220 }, { "epoch": 0.27331486611265005, "grad_norm": 13.25, "learning_rate": 9.767474819924447e-09, "loss": 2.437526226043701, "step": 222 }, { "epoch": 0.275777162203755, "grad_norm": 6.5625, "learning_rate": 9.762090379535862e-09, "loss": 2.013521909713745, "step": 224 }, { "epoch": 0.27823945829485996, "grad_norm": 14.0625, "learning_rate": 9.756646218432053e-09, "loss": 2.0168678760528564, "step": 226 }, { "epoch": 0.2807017543859649, "grad_norm": 5.8125, "learning_rate": 9.751142423035501e-09, "loss": 1.995202660560608, "step": 228 }, { "epoch": 0.2831640504770699, "grad_norm": 42.5, "learning_rate": 9.74557908071535e-09, "loss": 1.953993320465088, "step": 230 }, { "epoch": 0.2856263465681748, "grad_norm": 2.46875, "learning_rate": 9.739956279786e-09, "loss": 1.149980068206787, "step": 232 }, { "epoch": 0.2880886426592798, "grad_norm": 4.21875, "learning_rate": 9.734274109505729e-09, "loss": 1.7589616775512695, "step": 234 }, { "epoch": 0.29055093875038474, "grad_norm": 5.0625, "learning_rate": 9.72853266007526e-09, "loss": 1.9171326160430908, "step": 236 }, { "epoch": 0.29301323484148967, "grad_norm": 11.4375, "learning_rate": 9.722732022636333e-09, "loss": 1.6742775440216064, "step": 238 }, { "epoch": 0.29547553093259465, "grad_norm": 4.78125, "learning_rate": 9.716872289270262e-09, "loss": 1.7873895168304443, "step": 240 }, { "epoch": 0.2979378270236996, "grad_norm": 4.40625, "learning_rate": 9.710953552996464e-09, "loss": 1.9001209735870361, "step": 242 }, { "epoch": 0.3004001231148046, "grad_norm": 4.78125, "learning_rate": 9.704975907770995e-09, "loss": 1.869600534439087, "step": 244 }, { "epoch": 0.3028624192059095, "grad_norm": 3.46875, "learning_rate": 9.69893944848505e-09, "loss": 1.5148907899856567, "step": 246 }, { "epoch": 0.3053247152970145, "grad_norm": 14.6875, "learning_rate": 9.69284427096345e-09, "loss": 1.914973497390747, "step": 248 }, { "epoch": 0.3077870113881194, "grad_norm": 13.125, "learning_rate": 9.686690471963147e-09, "loss": 2.230684757232666, "step": 250 }, { "epoch": 0.31024930747922436, "grad_norm": 7.34375, "learning_rate": 9.680478149171657e-09, "loss": 2.0974578857421875, "step": 252 }, { "epoch": 0.31271160357032934, "grad_norm": 13.5625, "learning_rate": 9.674207401205524e-09, "loss": 2.2117700576782227, "step": 254 }, { "epoch": 0.3151738996614343, "grad_norm": 5.25, "learning_rate": 9.667878327608756e-09, "loss": 1.8505613803863525, "step": 256 }, { "epoch": 0.31763619575253926, "grad_norm": 14.25, "learning_rate": 9.661491028851246e-09, "loss": 1.7967166900634766, "step": 258 }, { "epoch": 0.3200984918436442, "grad_norm": 4.0625, "learning_rate": 9.655045606327165e-09, "loss": 1.869051456451416, "step": 260 }, { "epoch": 0.3225607879347491, "grad_norm": 9.0625, "learning_rate": 9.648542162353366e-09, "loss": 1.876924753189087, "step": 262 }, { "epoch": 0.3250230840258541, "grad_norm": 5.21875, "learning_rate": 9.64198080016775e-09, "loss": 2.0315141677856445, "step": 264 }, { "epoch": 0.32748538011695905, "grad_norm": 8.5625, "learning_rate": 9.635361623927643e-09, "loss": 2.1542179584503174, "step": 266 }, { "epoch": 0.32994767620806403, "grad_norm": 3.140625, "learning_rate": 9.62868473870811e-09, "loss": 1.1597316265106201, "step": 268 }, { "epoch": 0.33240997229916897, "grad_norm": 10.5, "learning_rate": 9.621950250500333e-09, "loss": 2.637326717376709, "step": 270 }, { "epoch": 0.33487226839027395, "grad_norm": 2.859375, "learning_rate": 9.615158266209887e-09, "loss": 1.283077597618103, "step": 272 }, { "epoch": 0.3373345644813789, "grad_norm": 7.125, "learning_rate": 9.608308893655061e-09, "loss": 2.046065092086792, "step": 274 }, { "epoch": 0.3397968605724838, "grad_norm": 2.953125, "learning_rate": 9.601402241565154e-09, "loss": 1.1603574752807617, "step": 276 }, { "epoch": 0.3422591566635888, "grad_norm": 5.34375, "learning_rate": 9.59443841957873e-09, "loss": 1.7637038230895996, "step": 278 }, { "epoch": 0.34472145275469374, "grad_norm": 5.21875, "learning_rate": 9.587417538241892e-09, "loss": 1.938485860824585, "step": 280 }, { "epoch": 0.3471837488457987, "grad_norm": 29.0, "learning_rate": 9.580339709006524e-09, "loss": 2.3233187198638916, "step": 282 }, { "epoch": 0.34964604493690365, "grad_norm": 6.0, "learning_rate": 9.573205044228518e-09, "loss": 1.4073760509490967, "step": 284 }, { "epoch": 0.35210834102800864, "grad_norm": 6.375, "learning_rate": 9.566013657165994e-09, "loss": 1.3963334560394287, "step": 286 }, { "epoch": 0.3545706371191136, "grad_norm": 6.8125, "learning_rate": 9.558765661977503e-09, "loss": 1.9514954090118408, "step": 288 }, { "epoch": 0.3570329332102185, "grad_norm": 5.75, "learning_rate": 9.551461173720208e-09, "loss": 2.0840539932250977, "step": 290 }, { "epoch": 0.3594952293013235, "grad_norm": 11.9375, "learning_rate": 9.544100308348067e-09, "loss": 2.2709197998046875, "step": 292 }, { "epoch": 0.3619575253924284, "grad_norm": 12.3125, "learning_rate": 9.536683182709986e-09, "loss": 2.443535327911377, "step": 294 }, { "epoch": 0.3644198214835334, "grad_norm": 18.875, "learning_rate": 9.529209914547962e-09, "loss": 2.240347385406494, "step": 296 }, { "epoch": 0.36688211757463834, "grad_norm": 12.375, "learning_rate": 9.521680622495228e-09, "loss": 2.1307570934295654, "step": 298 }, { "epoch": 0.36934441366574333, "grad_norm": 11.8125, "learning_rate": 9.514095426074347e-09, "loss": 2.510369062423706, "step": 300 }, { "epoch": 0.37180670975684826, "grad_norm": 5.03125, "learning_rate": 9.506454445695337e-09, "loss": 1.9031611680984497, "step": 302 }, { "epoch": 0.3742690058479532, "grad_norm": 2.484375, "learning_rate": 9.498757802653741e-09, "loss": 1.2329223155975342, "step": 304 }, { "epoch": 0.3767313019390582, "grad_norm": 5.28125, "learning_rate": 9.491005619128721e-09, "loss": 1.8155068159103394, "step": 306 }, { "epoch": 0.3791935980301631, "grad_norm": 7.625, "learning_rate": 9.483198018181099e-09, "loss": 1.736093282699585, "step": 308 }, { "epoch": 0.3816558941212681, "grad_norm": 13.5625, "learning_rate": 9.475335123751412e-09, "loss": 1.9234977960586548, "step": 310 }, { "epoch": 0.38411819021237303, "grad_norm": 8.5, "learning_rate": 9.467417060657952e-09, "loss": 1.9270076751708984, "step": 312 }, { "epoch": 0.38658048630347797, "grad_norm": 4.0625, "learning_rate": 9.459443954594769e-09, "loss": 1.350337028503418, "step": 314 }, { "epoch": 0.38904278239458295, "grad_norm": 2.609375, "learning_rate": 9.451415932129692e-09, "loss": 1.1429853439331055, "step": 316 }, { "epoch": 0.3915050784856879, "grad_norm": 4.90625, "learning_rate": 9.443333120702307e-09, "loss": 1.8531888723373413, "step": 318 }, { "epoch": 0.3939673745767929, "grad_norm": 3.0625, "learning_rate": 9.435195648621935e-09, "loss": 1.3913381099700928, "step": 320 }, { "epoch": 0.3964296706678978, "grad_norm": 5.15625, "learning_rate": 9.42700364506561e-09, "loss": 1.8761987686157227, "step": 322 }, { "epoch": 0.3988919667590028, "grad_norm": 4.4375, "learning_rate": 9.418757240076008e-09, "loss": 1.9191958904266357, "step": 324 }, { "epoch": 0.4013542628501077, "grad_norm": 2.75, "learning_rate": 9.410456564559393e-09, "loss": 1.175315260887146, "step": 326 }, { "epoch": 0.40381655894121266, "grad_norm": 12.375, "learning_rate": 9.402101750283545e-09, "loss": 2.3216049671173096, "step": 328 }, { "epoch": 0.40627885503231764, "grad_norm": 2.265625, "learning_rate": 9.39369292987565e-09, "loss": 1.1453694105148315, "step": 330 }, { "epoch": 0.4087411511234226, "grad_norm": 5.71875, "learning_rate": 9.38523023682022e-09, "loss": 1.9262512922286987, "step": 332 }, { "epoch": 0.41120344721452756, "grad_norm": 10.375, "learning_rate": 9.376713805456945e-09, "loss": 2.126582622528076, "step": 334 }, { "epoch": 0.4136657433056325, "grad_norm": 2.609375, "learning_rate": 9.368143770978586e-09, "loss": 1.1786751747131348, "step": 336 }, { "epoch": 0.4161280393967375, "grad_norm": 8.125, "learning_rate": 9.359520269428812e-09, "loss": 2.126143217086792, "step": 338 }, { "epoch": 0.4185903354878424, "grad_norm": 2.390625, "learning_rate": 9.350843437700052e-09, "loss": 1.245577335357666, "step": 340 }, { "epoch": 0.42105263157894735, "grad_norm": 29.625, "learning_rate": 9.342113413531315e-09, "loss": 2.009819507598877, "step": 342 }, { "epoch": 0.42351492767005233, "grad_norm": 3.875, "learning_rate": 9.333330335506001e-09, "loss": 1.1387863159179688, "step": 344 }, { "epoch": 0.42597722376115726, "grad_norm": 12.5625, "learning_rate": 9.324494343049707e-09, "loss": 2.2192680835723877, "step": 346 }, { "epoch": 0.42843951985226225, "grad_norm": 11.0, "learning_rate": 9.315605576428018e-09, "loss": 1.939860463142395, "step": 348 }, { "epoch": 0.4309018159433672, "grad_norm": 13.8125, "learning_rate": 9.306664176744266e-09, "loss": 2.318619728088379, "step": 350 }, { "epoch": 0.43336411203447217, "grad_norm": 3.15625, "learning_rate": 9.297670285937303e-09, "loss": 1.0619254112243652, "step": 352 }, { "epoch": 0.4358264081255771, "grad_norm": 5.4375, "learning_rate": 9.288624046779241e-09, "loss": 1.834202766418457, "step": 354 }, { "epoch": 0.43828870421668203, "grad_norm": 9.8125, "learning_rate": 9.279525602873189e-09, "loss": 1.9926815032958984, "step": 356 }, { "epoch": 0.440751000307787, "grad_norm": 4.8125, "learning_rate": 9.27037509865097e-09, "loss": 1.9792507886886597, "step": 358 }, { "epoch": 0.44321329639889195, "grad_norm": 9.125, "learning_rate": 9.26117267937083e-09, "loss": 1.5881253480911255, "step": 360 }, { "epoch": 0.44567559248999694, "grad_norm": 15.5, "learning_rate": 9.251918491115142e-09, "loss": 2.488168239593506, "step": 362 }, { "epoch": 0.4481378885811019, "grad_norm": 4.8125, "learning_rate": 9.242612680788061e-09, "loss": 1.9684348106384277, "step": 364 }, { "epoch": 0.45060018467220686, "grad_norm": 9.375, "learning_rate": 9.233255396113223e-09, "loss": 2.305130958557129, "step": 366 }, { "epoch": 0.4530624807633118, "grad_norm": 16.75, "learning_rate": 9.223846785631378e-09, "loss": 2.335341215133667, "step": 368 }, { "epoch": 0.4555247768544167, "grad_norm": 5.25, "learning_rate": 9.214386998698039e-09, "loss": 1.7638440132141113, "step": 370 }, { "epoch": 0.4579870729455217, "grad_norm": 5.53125, "learning_rate": 9.20487618548112e-09, "loss": 1.7996431589126587, "step": 372 }, { "epoch": 0.46044936903662664, "grad_norm": 4.6875, "learning_rate": 9.195314496958531e-09, "loss": 1.7842280864715576, "step": 374 }, { "epoch": 0.46291166512773163, "grad_norm": 38.0, "learning_rate": 9.185702084915805e-09, "loss": 2.152765989303589, "step": 376 }, { "epoch": 0.46537396121883656, "grad_norm": 5.125, "learning_rate": 9.176039101943672e-09, "loss": 1.7519220113754272, "step": 378 }, { "epoch": 0.4678362573099415, "grad_norm": 45.0, "learning_rate": 9.166325701435644e-09, "loss": 2.9101526737213135, "step": 380 }, { "epoch": 0.4702985534010465, "grad_norm": 12.9375, "learning_rate": 9.156562037585576e-09, "loss": 2.2048463821411133, "step": 382 }, { "epoch": 0.4727608494921514, "grad_norm": 5.4375, "learning_rate": 9.146748265385223e-09, "loss": 1.8226771354675293, "step": 384 }, { "epoch": 0.4752231455832564, "grad_norm": 16.0, "learning_rate": 9.13688454062178e-09, "loss": 2.297773838043213, "step": 386 }, { "epoch": 0.47768544167436133, "grad_norm": 16.375, "learning_rate": 9.126971019875397e-09, "loss": 2.2794573307037354, "step": 388 }, { "epoch": 0.4801477377654663, "grad_norm": 8.1875, "learning_rate": 9.117007860516713e-09, "loss": 1.2689777612686157, "step": 390 }, { "epoch": 0.48261003385657125, "grad_norm": 10.4375, "learning_rate": 9.106995220704344e-09, "loss": 2.273574113845825, "step": 392 }, { "epoch": 0.4850723299476762, "grad_norm": 4.3125, "learning_rate": 9.09693325938237e-09, "loss": 1.7581639289855957, "step": 394 }, { "epoch": 0.48753462603878117, "grad_norm": 4.25, "learning_rate": 9.08682213627782e-09, "loss": 1.8824234008789062, "step": 396 }, { "epoch": 0.4899969221298861, "grad_norm": 40.0, "learning_rate": 9.076662011898145e-09, "loss": 2.692976951599121, "step": 398 }, { "epoch": 0.4924592182209911, "grad_norm": 5.0625, "learning_rate": 9.066453047528642e-09, "loss": 1.951959490776062, "step": 400 }, { "epoch": 0.494921514312096, "grad_norm": 19.125, "learning_rate": 9.056195405229922e-09, "loss": 2.419041156768799, "step": 402 }, { "epoch": 0.497383810403201, "grad_norm": 4.3125, "learning_rate": 9.045889247835322e-09, "loss": 1.7131880521774292, "step": 404 }, { "epoch": 0.49984610649430594, "grad_norm": 2.875, "learning_rate": 9.035534738948328e-09, "loss": 1.2638614177703857, "step": 406 }, { "epoch": 0.5023084025854109, "grad_norm": 6.90625, "learning_rate": 9.02513204293997e-09, "loss": 1.8727983236312866, "step": 408 }, { "epoch": 0.5047706986765158, "grad_norm": 2.203125, "learning_rate": 9.014681324946216e-09, "loss": 1.1091878414154053, "step": 410 }, { "epoch": 0.5072329947676208, "grad_norm": 5.5625, "learning_rate": 9.004182750865357e-09, "loss": 2.032684326171875, "step": 412 }, { "epoch": 0.5096952908587258, "grad_norm": 3.25, "learning_rate": 8.993636487355366e-09, "loss": 1.4393967390060425, "step": 414 }, { "epoch": 0.5121575869498307, "grad_norm": 15.4375, "learning_rate": 8.98304270183125e-09, "loss": 2.364288806915283, "step": 416 }, { "epoch": 0.5146198830409356, "grad_norm": 6.84375, "learning_rate": 8.9724015624624e-09, "loss": 1.4677906036376953, "step": 418 }, { "epoch": 0.5170821791320406, "grad_norm": 4.53125, "learning_rate": 8.961713238169922e-09, "loss": 1.9610824584960938, "step": 420 }, { "epoch": 0.5195444752231456, "grad_norm": 6.1875, "learning_rate": 8.950977898623947e-09, "loss": 1.8107311725616455, "step": 422 }, { "epoch": 0.5220067713142506, "grad_norm": 2.859375, "learning_rate": 8.940195714240937e-09, "loss": 1.2439892292022705, "step": 424 }, { "epoch": 0.5244690674053555, "grad_norm": 9.75, "learning_rate": 8.929366856181003e-09, "loss": 1.985514521598816, "step": 426 }, { "epoch": 0.5269313634964604, "grad_norm": 3.703125, "learning_rate": 8.918491496345149e-09, "loss": 1.8395881652832031, "step": 428 }, { "epoch": 0.5293936595875655, "grad_norm": 3.421875, "learning_rate": 8.907569807372576e-09, "loss": 1.2282559871673584, "step": 430 }, { "epoch": 0.5318559556786704, "grad_norm": 4.75, "learning_rate": 8.896601962637927e-09, "loss": 1.9522662162780762, "step": 432 }, { "epoch": 0.5343182517697753, "grad_norm": 6.4375, "learning_rate": 8.885588136248539e-09, "loss": 1.831364631652832, "step": 434 }, { "epoch": 0.5367805478608803, "grad_norm": 3.21875, "learning_rate": 8.874528503041674e-09, "loss": 1.3392367362976074, "step": 436 }, { "epoch": 0.5392428439519852, "grad_norm": 2.03125, "learning_rate": 8.86342323858175e-09, "loss": 1.154931664466858, "step": 438 }, { "epoch": 0.5417051400430902, "grad_norm": 2.84375, "learning_rate": 8.852272519157554e-09, "loss": 1.1106712818145752, "step": 440 }, { "epoch": 0.5441674361341952, "grad_norm": 12.6875, "learning_rate": 8.841076521779431e-09, "loss": 2.266367197036743, "step": 442 }, { "epoch": 0.5466297322253001, "grad_norm": 6.78125, "learning_rate": 8.829835424176495e-09, "loss": 1.9257324934005737, "step": 444 }, { "epoch": 0.549092028316405, "grad_norm": 7.6875, "learning_rate": 8.81854940479379e-09, "loss": 1.2584561109542847, "step": 446 }, { "epoch": 0.55155432440751, "grad_norm": 8.3125, "learning_rate": 8.807218642789463e-09, "loss": 2.150424003601074, "step": 448 }, { "epoch": 0.554016620498615, "grad_norm": 3.6875, "learning_rate": 8.795843318031926e-09, "loss": 1.100125789642334, "step": 450 }, { "epoch": 0.5564789165897199, "grad_norm": 4.71875, "learning_rate": 8.78442361109699e-09, "loss": 1.8502240180969238, "step": 452 }, { "epoch": 0.5589412126808249, "grad_norm": 4.625, "learning_rate": 8.772959703265008e-09, "loss": 1.7188208103179932, "step": 454 }, { "epoch": 0.5614035087719298, "grad_norm": 2.25, "learning_rate": 8.76145177651799e-09, "loss": 1.1569561958312988, "step": 456 }, { "epoch": 0.5638658048630347, "grad_norm": 13.5, "learning_rate": 8.74990001353672e-09, "loss": 2.2237837314605713, "step": 458 }, { "epoch": 0.5663281009541398, "grad_norm": 2.625, "learning_rate": 8.738304597697855e-09, "loss": 1.2278821468353271, "step": 460 }, { "epoch": 0.5687903970452447, "grad_norm": 2.984375, "learning_rate": 8.726665713071004e-09, "loss": 1.4073512554168701, "step": 462 }, { "epoch": 0.5712526931363496, "grad_norm": 12.375, "learning_rate": 8.714983544415824e-09, "loss": 2.3128976821899414, "step": 464 }, { "epoch": 0.5737149892274546, "grad_norm": 13.3125, "learning_rate": 8.703258277179076e-09, "loss": 2.249760627746582, "step": 466 }, { "epoch": 0.5761772853185596, "grad_norm": 5.75, "learning_rate": 8.691490097491676e-09, "loss": 1.949746012687683, "step": 468 }, { "epoch": 0.5786395814096645, "grad_norm": 8.0625, "learning_rate": 8.679679192165755e-09, "loss": 2.0255026817321777, "step": 470 }, { "epoch": 0.5811018775007695, "grad_norm": 2.953125, "learning_rate": 8.667825748691678e-09, "loss": 1.172034502029419, "step": 472 }, { "epoch": 0.5835641735918744, "grad_norm": 13.4375, "learning_rate": 8.655929955235084e-09, "loss": 1.7464905977249146, "step": 474 }, { "epoch": 0.5860264696829793, "grad_norm": 4.875, "learning_rate": 8.643992000633882e-09, "loss": 1.7516231536865234, "step": 476 }, { "epoch": 0.5884887657740844, "grad_norm": 13.6875, "learning_rate": 8.632012074395267e-09, "loss": 1.9086973667144775, "step": 478 }, { "epoch": 0.5909510618651893, "grad_norm": 20.375, "learning_rate": 8.619990366692703e-09, "loss": 1.120478630065918, "step": 480 }, { "epoch": 0.5934133579562942, "grad_norm": 5.40625, "learning_rate": 8.607927068362909e-09, "loss": 1.8365321159362793, "step": 482 }, { "epoch": 0.5958756540473992, "grad_norm": 4.21875, "learning_rate": 8.595822370902824e-09, "loss": 1.8781213760375977, "step": 484 }, { "epoch": 0.5983379501385041, "grad_norm": 5.09375, "learning_rate": 8.583676466466578e-09, "loss": 1.8990083932876587, "step": 486 }, { "epoch": 0.6008002462296091, "grad_norm": 9.25, "learning_rate": 8.571489547862432e-09, "loss": 2.005687713623047, "step": 488 }, { "epoch": 0.6032625423207141, "grad_norm": 11.75, "learning_rate": 8.559261808549717e-09, "loss": 2.288544178009033, "step": 490 }, { "epoch": 0.605724838411819, "grad_norm": 12.0625, "learning_rate": 8.546993442635767e-09, "loss": 1.9239308834075928, "step": 492 }, { "epoch": 0.6081871345029239, "grad_norm": 3.203125, "learning_rate": 8.534684644872836e-09, "loss": 1.2520358562469482, "step": 494 }, { "epoch": 0.610649430594029, "grad_norm": 7.65625, "learning_rate": 8.522335610655014e-09, "loss": 2.1090569496154785, "step": 496 }, { "epoch": 0.6131117266851339, "grad_norm": 10.3125, "learning_rate": 8.509946536015109e-09, "loss": 2.2030882835388184, "step": 498 }, { "epoch": 0.6155740227762388, "grad_norm": 18.75, "learning_rate": 8.497517617621549e-09, "loss": 2.205538034439087, "step": 500 }, { "epoch": 0.6180363188673438, "grad_norm": 3.484375, "learning_rate": 8.485049052775255e-09, "loss": 1.5225834846496582, "step": 502 }, { "epoch": 0.6204986149584487, "grad_norm": 4.8125, "learning_rate": 8.472541039406509e-09, "loss": 1.8662419319152832, "step": 504 }, { "epoch": 0.6229609110495538, "grad_norm": 3.3125, "learning_rate": 8.459993776071815e-09, "loss": 1.5459778308868408, "step": 506 }, { "epoch": 0.6254232071406587, "grad_norm": 2.359375, "learning_rate": 8.44740746195074e-09, "loss": 1.2113550901412964, "step": 508 }, { "epoch": 0.6278855032317636, "grad_norm": 3.078125, "learning_rate": 8.434782296842755e-09, "loss": 1.2501018047332764, "step": 510 }, { "epoch": 0.6303477993228686, "grad_norm": 5.46875, "learning_rate": 8.422118481164076e-09, "loss": 1.3121228218078613, "step": 512 }, { "epoch": 0.6328100954139735, "grad_norm": 8.875, "learning_rate": 8.409416215944459e-09, "loss": 2.0257339477539062, "step": 514 }, { "epoch": 0.6352723915050785, "grad_norm": 2.828125, "learning_rate": 8.396675702824026e-09, "loss": 1.249032974243164, "step": 516 }, { "epoch": 0.6377346875961835, "grad_norm": 2.40625, "learning_rate": 8.38389714405006e-09, "loss": 1.089784026145935, "step": 518 }, { "epoch": 0.6401969836872884, "grad_norm": 3.078125, "learning_rate": 8.371080742473797e-09, "loss": 1.107433795928955, "step": 520 }, { "epoch": 0.6426592797783933, "grad_norm": 24.25, "learning_rate": 8.358226701547196e-09, "loss": 2.397225856781006, "step": 522 }, { "epoch": 0.6451215758694983, "grad_norm": 30.625, "learning_rate": 8.345335225319716e-09, "loss": 2.917544364929199, "step": 524 }, { "epoch": 0.6475838719606033, "grad_norm": 5.3125, "learning_rate": 8.332406518435087e-09, "loss": 1.9733543395996094, "step": 526 }, { "epoch": 0.6500461680517082, "grad_norm": 11.5625, "learning_rate": 8.319440786128039e-09, "loss": 2.30487060546875, "step": 528 }, { "epoch": 0.6525084641428132, "grad_norm": 14.125, "learning_rate": 8.306438234221058e-09, "loss": 2.489694118499756, "step": 530 }, { "epoch": 0.6549707602339181, "grad_norm": 4.90625, "learning_rate": 8.293399069121128e-09, "loss": 1.7912418842315674, "step": 532 }, { "epoch": 0.6574330563250231, "grad_norm": 9.4375, "learning_rate": 8.280323497816431e-09, "loss": 1.935392141342163, "step": 534 }, { "epoch": 0.6598953524161281, "grad_norm": 6.09375, "learning_rate": 8.267211727873078e-09, "loss": 1.9411722421646118, "step": 536 }, { "epoch": 0.662357648507233, "grad_norm": 3.765625, "learning_rate": 8.254063967431816e-09, "loss": 1.7723370790481567, "step": 538 }, { "epoch": 0.6648199445983379, "grad_norm": 10.75, "learning_rate": 8.240880425204702e-09, "loss": 2.3154473304748535, "step": 540 }, { "epoch": 0.6672822406894429, "grad_norm": 6.25, "learning_rate": 8.22766131047182e-09, "loss": 1.941293716430664, "step": 542 }, { "epoch": 0.6697445367805479, "grad_norm": 5.15625, "learning_rate": 8.21440683307794e-09, "loss": 1.8273173570632935, "step": 544 }, { "epoch": 0.6722068328716528, "grad_norm": 5.75, "learning_rate": 8.201117203429187e-09, "loss": 1.917323112487793, "step": 546 }, { "epoch": 0.6746691289627578, "grad_norm": 4.53125, "learning_rate": 8.18779263248971e-09, "loss": 1.5516306161880493, "step": 548 }, { "epoch": 0.6771314250538627, "grad_norm": 5.71875, "learning_rate": 8.174433331778322e-09, "loss": 2.0121002197265625, "step": 550 }, { "epoch": 0.6795937211449676, "grad_norm": 4.34375, "learning_rate": 8.161039513365158e-09, "loss": 1.2636222839355469, "step": 552 }, { "epoch": 0.6820560172360727, "grad_norm": 3.8125, "learning_rate": 8.147611389868293e-09, "loss": 1.3448388576507568, "step": 554 }, { "epoch": 0.6845183133271776, "grad_norm": 8.0625, "learning_rate": 8.13414917445037e-09, "loss": 2.0951576232910156, "step": 556 }, { "epoch": 0.6869806094182825, "grad_norm": 10.875, "learning_rate": 8.120653080815219e-09, "loss": 2.3154006004333496, "step": 558 }, { "epoch": 0.6894429055093875, "grad_norm": 2.96875, "learning_rate": 8.107123323204473e-09, "loss": 1.1850239038467407, "step": 560 }, { "epoch": 0.6919052016004925, "grad_norm": 4.4375, "learning_rate": 8.093560116394149e-09, "loss": 1.9023423194885254, "step": 562 }, { "epoch": 0.6943674976915974, "grad_norm": 5.5, "learning_rate": 8.079963675691255e-09, "loss": 1.9364053010940552, "step": 564 }, { "epoch": 0.6968297937827024, "grad_norm": 8.1875, "learning_rate": 8.06633421693036e-09, "loss": 1.8559212684631348, "step": 566 }, { "epoch": 0.6992920898738073, "grad_norm": 12.0, "learning_rate": 8.052671956470177e-09, "loss": 1.9172155857086182, "step": 568 }, { "epoch": 0.7017543859649122, "grad_norm": 5.1875, "learning_rate": 8.038977111190119e-09, "loss": 1.7878023386001587, "step": 570 }, { "epoch": 0.7042166820560173, "grad_norm": 5.28125, "learning_rate": 8.025249898486866e-09, "loss": 1.9518636465072632, "step": 572 }, { "epoch": 0.7066789781471222, "grad_norm": 4.875, "learning_rate": 8.011490536270911e-09, "loss": 1.7933154106140137, "step": 574 }, { "epoch": 0.7091412742382271, "grad_norm": 4.75, "learning_rate": 7.997699242963094e-09, "loss": 1.7392499446868896, "step": 576 }, { "epoch": 0.7116035703293321, "grad_norm": 3.734375, "learning_rate": 7.983876237491148e-09, "loss": 1.403039813041687, "step": 578 }, { "epoch": 0.714065866420437, "grad_norm": 2.921875, "learning_rate": 7.970021739286207e-09, "loss": 1.1680914163589478, "step": 580 }, { "epoch": 0.716528162511542, "grad_norm": 2.234375, "learning_rate": 7.956135968279335e-09, "loss": 1.1165484189987183, "step": 582 }, { "epoch": 0.718990458602647, "grad_norm": 10.9375, "learning_rate": 7.942219144898033e-09, "loss": 2.342836856842041, "step": 584 }, { "epoch": 0.7214527546937519, "grad_norm": 3.953125, "learning_rate": 7.928271490062737e-09, "loss": 1.8495182991027832, "step": 586 }, { "epoch": 0.7239150507848569, "grad_norm": 5.875, "learning_rate": 7.914293225183313e-09, "loss": 1.9028046131134033, "step": 588 }, { "epoch": 0.7263773468759618, "grad_norm": 10.0625, "learning_rate": 7.900284572155538e-09, "loss": 1.9208589792251587, "step": 590 }, { "epoch": 0.7288396429670668, "grad_norm": 4.59375, "learning_rate": 7.886245753357586e-09, "loss": 1.8670642375946045, "step": 592 }, { "epoch": 0.7313019390581718, "grad_norm": 65.0, "learning_rate": 7.872176991646488e-09, "loss": 1.555503487586975, "step": 594 }, { "epoch": 0.7337642351492767, "grad_norm": 5.46875, "learning_rate": 7.858078510354597e-09, "loss": 1.9539310932159424, "step": 596 }, { "epoch": 0.7362265312403816, "grad_norm": 2.703125, "learning_rate": 7.843950533286057e-09, "loss": 1.2128690481185913, "step": 598 }, { "epoch": 0.7386888273314867, "grad_norm": 4.46875, "learning_rate": 7.829793284713224e-09, "loss": 1.873086929321289, "step": 600 }, { "epoch": 0.7411511234225916, "grad_norm": 2.578125, "learning_rate": 7.81560698937313e-09, "loss": 1.1673393249511719, "step": 602 }, { "epoch": 0.7436134195136965, "grad_norm": 12.8125, "learning_rate": 7.801391872463896e-09, "loss": 2.315310001373291, "step": 604 }, { "epoch": 0.7460757156048015, "grad_norm": 11.3125, "learning_rate": 7.787148159641176e-09, "loss": 2.4388017654418945, "step": 606 }, { "epoch": 0.7485380116959064, "grad_norm": 8.75, "learning_rate": 7.77287607701456e-09, "loss": 2.1161627769470215, "step": 608 }, { "epoch": 0.7510003077870114, "grad_norm": 3.921875, "learning_rate": 7.758575851143987e-09, "loss": 1.1796162128448486, "step": 610 }, { "epoch": 0.7534626038781164, "grad_norm": 4.90625, "learning_rate": 7.744247709036165e-09, "loss": 1.3470849990844727, "step": 612 }, { "epoch": 0.7559248999692213, "grad_norm": 12.3125, "learning_rate": 7.729891878140936e-09, "loss": 2.33459734916687, "step": 614 }, { "epoch": 0.7583871960603262, "grad_norm": 6.96875, "learning_rate": 7.715508586347695e-09, "loss": 1.9637078046798706, "step": 616 }, { "epoch": 0.7608494921514312, "grad_norm": 4.34375, "learning_rate": 7.701098061981757e-09, "loss": 1.9413955211639404, "step": 618 }, { "epoch": 0.7633117882425362, "grad_norm": 9.6875, "learning_rate": 7.686660533800736e-09, "loss": 1.9719551801681519, "step": 620 }, { "epoch": 0.7657740843336411, "grad_norm": 3.71875, "learning_rate": 7.672196230990918e-09, "loss": 1.3401029109954834, "step": 622 }, { "epoch": 0.7682363804247461, "grad_norm": 6.59375, "learning_rate": 7.65770538316361e-09, "loss": 1.7963333129882812, "step": 624 }, { "epoch": 0.770698676515851, "grad_norm": 7.96875, "learning_rate": 7.643188220351516e-09, "loss": 2.0712432861328125, "step": 626 }, { "epoch": 0.7731609726069559, "grad_norm": 11.125, "learning_rate": 7.628644973005061e-09, "loss": 2.3805270195007324, "step": 628 }, { "epoch": 0.775623268698061, "grad_norm": 7.34375, "learning_rate": 7.61407587198875e-09, "loss": 1.2845838069915771, "step": 630 }, { "epoch": 0.7780855647891659, "grad_norm": 20.875, "learning_rate": 7.5994811485775e-09, "loss": 2.2516846656799316, "step": 632 }, { "epoch": 0.7805478608802708, "grad_norm": 5.0, "learning_rate": 7.584861034452963e-09, "loss": 1.964002251625061, "step": 634 }, { "epoch": 0.7830101569713758, "grad_norm": 3.046875, "learning_rate": 7.570215761699855e-09, "loss": 1.3124688863754272, "step": 636 }, { "epoch": 0.7854724530624808, "grad_norm": 11.6875, "learning_rate": 7.55554556280227e-09, "loss": 2.2107834815979004, "step": 638 }, { "epoch": 0.7879347491535857, "grad_norm": 5.6875, "learning_rate": 7.540850670639978e-09, "loss": 1.9630699157714844, "step": 640 }, { "epoch": 0.7903970452446907, "grad_norm": 5.75, "learning_rate": 7.526131318484753e-09, "loss": 1.9335198402404785, "step": 642 }, { "epoch": 0.7928593413357956, "grad_norm": 3.765625, "learning_rate": 7.511387739996644e-09, "loss": 1.2916162014007568, "step": 644 }, { "epoch": 0.7953216374269005, "grad_norm": 14.5625, "learning_rate": 7.496620169220286e-09, "loss": 2.1263046264648438, "step": 646 }, { "epoch": 0.7977839335180056, "grad_norm": 5.78125, "learning_rate": 7.481828840581164e-09, "loss": 1.8862347602844238, "step": 648 }, { "epoch": 0.8002462296091105, "grad_norm": 11.75, "learning_rate": 7.46701398888192e-09, "loss": 2.1435751914978027, "step": 650 }, { "epoch": 0.8027085257002154, "grad_norm": 36.25, "learning_rate": 7.45217584929859e-09, "loss": 1.8985021114349365, "step": 652 }, { "epoch": 0.8051708217913204, "grad_norm": 3.96875, "learning_rate": 7.437314657376906e-09, "loss": 1.255218267440796, "step": 654 }, { "epoch": 0.8076331178824253, "grad_norm": 6.71875, "learning_rate": 7.422430649028533e-09, "loss": 1.8039145469665527, "step": 656 }, { "epoch": 0.8100954139735304, "grad_norm": 2.828125, "learning_rate": 7.407524060527333e-09, "loss": 1.2014645338058472, "step": 658 }, { "epoch": 0.8125577100646353, "grad_norm": 9.5625, "learning_rate": 7.3925951285056146e-09, "loss": 2.114205837249756, "step": 660 }, { "epoch": 0.8150200061557402, "grad_norm": 18.0, "learning_rate": 7.377644089950371e-09, "loss": 2.3271141052246094, "step": 662 }, { "epoch": 0.8174823022468451, "grad_norm": 4.59375, "learning_rate": 7.362671182199527e-09, "loss": 1.9512523412704468, "step": 664 }, { "epoch": 0.8199445983379502, "grad_norm": 4.875, "learning_rate": 7.347676642938163e-09, "loss": 1.875675082206726, "step": 666 }, { "epoch": 0.8224068944290551, "grad_norm": 7.28125, "learning_rate": 7.332660710194749e-09, "loss": 2.120806932449341, "step": 668 }, { "epoch": 0.8248691905201601, "grad_norm": 12.1875, "learning_rate": 7.3176236223373595e-09, "loss": 2.482332229614258, "step": 670 }, { "epoch": 0.827331486611265, "grad_norm": 5.34375, "learning_rate": 7.302565618069894e-09, "loss": 1.932433843612671, "step": 672 }, { "epoch": 0.8297937827023699, "grad_norm": 2.296875, "learning_rate": 7.287486936428282e-09, "loss": 1.1869601011276245, "step": 674 }, { "epoch": 0.832256078793475, "grad_norm": 2.40625, "learning_rate": 7.272387816776704e-09, "loss": 1.2416247129440308, "step": 676 }, { "epoch": 0.8347183748845799, "grad_norm": 6.34375, "learning_rate": 7.257268498803767e-09, "loss": 1.4887652397155762, "step": 678 }, { "epoch": 0.8371806709756848, "grad_norm": 5.34375, "learning_rate": 7.2421292225187186e-09, "loss": 1.833484411239624, "step": 680 }, { "epoch": 0.8396429670667898, "grad_norm": 13.8125, "learning_rate": 7.2269702282476335e-09, "loss": 2.041853904724121, "step": 682 }, { "epoch": 0.8421052631578947, "grad_norm": 14.625, "learning_rate": 7.211791756629598e-09, "loss": 2.366133689880371, "step": 684 }, { "epoch": 0.8445675592489997, "grad_norm": 10.875, "learning_rate": 7.196594048612881e-09, "loss": 1.9250491857528687, "step": 686 }, { "epoch": 0.8470298553401047, "grad_norm": 10.3125, "learning_rate": 7.1813773454511215e-09, "loss": 2.2896928787231445, "step": 688 }, { "epoch": 0.8494921514312096, "grad_norm": 5.40625, "learning_rate": 7.166141888699495e-09, "loss": 1.9879870414733887, "step": 690 }, { "epoch": 0.8519544475223145, "grad_norm": 11.625, "learning_rate": 7.150887920210878e-09, "loss": 2.2236876487731934, "step": 692 }, { "epoch": 0.8544167436134195, "grad_norm": 10.0, "learning_rate": 7.135615682132004e-09, "loss": 1.4050698280334473, "step": 694 }, { "epoch": 0.8568790397045245, "grad_norm": 22.25, "learning_rate": 7.120325416899629e-09, "loss": 2.2749319076538086, "step": 696 }, { "epoch": 0.8593413357956294, "grad_norm": 15.75, "learning_rate": 7.105017367236675e-09, "loss": 2.3958988189697266, "step": 698 }, { "epoch": 0.8618036318867344, "grad_norm": 11.0, "learning_rate": 7.089691776148384e-09, "loss": 2.313142776489258, "step": 700 }, { "epoch": 0.8642659279778393, "grad_norm": 11.625, "learning_rate": 7.0743488869184535e-09, "loss": 2.3592798709869385, "step": 702 }, { "epoch": 0.8667282240689443, "grad_norm": 8.5625, "learning_rate": 7.058988943105175e-09, "loss": 2.11894154548645, "step": 704 }, { "epoch": 0.8691905201600493, "grad_norm": 2.34375, "learning_rate": 7.04361218853758e-09, "loss": 1.3712561130523682, "step": 706 }, { "epoch": 0.8716528162511542, "grad_norm": 13.4375, "learning_rate": 7.0282188673115514e-09, "loss": 2.092770576477051, "step": 708 }, { "epoch": 0.8741151123422591, "grad_norm": 15.0625, "learning_rate": 7.012809223785957e-09, "loss": 1.9357192516326904, "step": 710 }, { "epoch": 0.8765774084333641, "grad_norm": 2.953125, "learning_rate": 6.9973835025787715e-09, "loss": 1.2680325508117676, "step": 712 }, { "epoch": 0.8790397045244691, "grad_norm": 7.125, "learning_rate": 6.981941948563198e-09, "loss": 1.7719722986221313, "step": 714 }, { "epoch": 0.881502000615574, "grad_norm": 5.0625, "learning_rate": 6.966484806863764e-09, "loss": 1.8633275032043457, "step": 716 }, { "epoch": 0.883964296706679, "grad_norm": 3.296875, "learning_rate": 6.9510123228524545e-09, "loss": 1.4539438486099243, "step": 718 }, { "epoch": 0.8864265927977839, "grad_norm": 13.25, "learning_rate": 6.935524742144792e-09, "loss": 2.2359728813171387, "step": 720 }, { "epoch": 0.8888888888888888, "grad_norm": 6.78125, "learning_rate": 6.920022310595953e-09, "loss": 1.8414530754089355, "step": 722 }, { "epoch": 0.8913511849799939, "grad_norm": 3.84375, "learning_rate": 6.904505274296864e-09, "loss": 1.2079766988754272, "step": 724 }, { "epoch": 0.8938134810710988, "grad_norm": 8.625, "learning_rate": 6.88897387957029e-09, "loss": 1.9165315628051758, "step": 726 }, { "epoch": 0.8962757771622037, "grad_norm": 3.34375, "learning_rate": 6.87342837296693e-09, "loss": 1.2759442329406738, "step": 728 }, { "epoch": 0.8987380732533087, "grad_norm": 5.34375, "learning_rate": 6.857869001261491e-09, "loss": 1.2644639015197754, "step": 730 }, { "epoch": 0.9012003693444137, "grad_norm": 12.75, "learning_rate": 6.842296011448788e-09, "loss": 2.2167718410491943, "step": 732 }, { "epoch": 0.9036626654355187, "grad_norm": 7.1875, "learning_rate": 6.826709650739812e-09, "loss": 1.402853012084961, "step": 734 }, { "epoch": 0.9061249615266236, "grad_norm": 9.25, "learning_rate": 6.811110166557809e-09, "loss": 2.0942487716674805, "step": 736 }, { "epoch": 0.9085872576177285, "grad_norm": 4.40625, "learning_rate": 6.795497806534348e-09, "loss": 1.8234786987304688, "step": 738 }, { "epoch": 0.9110495537088334, "grad_norm": 16.5, "learning_rate": 6.779872818505397e-09, "loss": 1.8784126043319702, "step": 740 }, { "epoch": 0.9135118497999385, "grad_norm": 9.5, "learning_rate": 6.7642354505073835e-09, "loss": 2.2190794944763184, "step": 742 }, { "epoch": 0.9159741458910434, "grad_norm": 4.8125, "learning_rate": 6.748585950773263e-09, "loss": 1.9413115978240967, "step": 744 }, { "epoch": 0.9184364419821484, "grad_norm": 3.109375, "learning_rate": 6.732924567728566e-09, "loss": 1.3823771476745605, "step": 746 }, { "epoch": 0.9208987380732533, "grad_norm": 5.03125, "learning_rate": 6.7172515499874705e-09, "loss": 1.9463045597076416, "step": 748 }, { "epoch": 0.9233610341643582, "grad_norm": 6.71875, "learning_rate": 6.701567146348843e-09, "loss": 2.0039689540863037, "step": 750 }, { "epoch": 0.9258233302554633, "grad_norm": 3.828125, "learning_rate": 6.685871605792301e-09, "loss": 1.438122272491455, "step": 752 }, { "epoch": 0.9282856263465682, "grad_norm": 34.25, "learning_rate": 6.670165177474241e-09, "loss": 1.7374298572540283, "step": 754 }, { "epoch": 0.9307479224376731, "grad_norm": 2.796875, "learning_rate": 6.6544481107239054e-09, "loss": 1.4571634531021118, "step": 756 }, { "epoch": 0.9332102185287781, "grad_norm": 4.78125, "learning_rate": 6.638720655039412e-09, "loss": 1.7221906185150146, "step": 758 }, { "epoch": 0.935672514619883, "grad_norm": 22.25, "learning_rate": 6.622983060083796e-09, "loss": 1.344387173652649, "step": 760 }, { "epoch": 0.938134810710988, "grad_norm": 2.4375, "learning_rate": 6.607235575681045e-09, "loss": 1.2809216976165771, "step": 762 }, { "epoch": 0.940597106802093, "grad_norm": 2.609375, "learning_rate": 6.591478451812138e-09, "loss": 1.1766109466552734, "step": 764 }, { "epoch": 0.9430594028931979, "grad_norm": 3.765625, "learning_rate": 6.575711938611073e-09, "loss": 1.3128526210784912, "step": 766 }, { "epoch": 0.9455216989843028, "grad_norm": 5.625, "learning_rate": 6.559936286360897e-09, "loss": 1.8674499988555908, "step": 768 }, { "epoch": 0.9479839950754079, "grad_norm": 5.28125, "learning_rate": 6.544151745489735e-09, "loss": 1.934564471244812, "step": 770 }, { "epoch": 0.9504462911665128, "grad_norm": 7.625, "learning_rate": 6.52835856656681e-09, "loss": 2.1300408840179443, "step": 772 }, { "epoch": 0.9529085872576177, "grad_norm": 10.3125, "learning_rate": 6.512557000298471e-09, "loss": 2.284024715423584, "step": 774 }, { "epoch": 0.9553708833487227, "grad_norm": 5.15625, "learning_rate": 6.49674729752421e-09, "loss": 1.9190423488616943, "step": 776 }, { "epoch": 0.9578331794398276, "grad_norm": 9.0, "learning_rate": 6.480929709212682e-09, "loss": 2.2223734855651855, "step": 778 }, { "epoch": 0.9602954755309326, "grad_norm": 5.5, "learning_rate": 6.465104486457718e-09, "loss": 1.9598147869110107, "step": 780 }, { "epoch": 0.9627577716220376, "grad_norm": 6.59375, "learning_rate": 6.4492718804743365e-09, "loss": 2.041882276535034, "step": 782 }, { "epoch": 0.9652200677131425, "grad_norm": 2.125, "learning_rate": 6.433432142594771e-09, "loss": 1.2188262939453125, "step": 784 }, { "epoch": 0.9676823638042474, "grad_norm": 11.375, "learning_rate": 6.4175855242644575e-09, "loss": 2.208829879760742, "step": 786 }, { "epoch": 0.9701446598953524, "grad_norm": 5.0, "learning_rate": 6.401732277038063e-09, "loss": 2.0125837326049805, "step": 788 }, { "epoch": 0.9726069559864574, "grad_norm": 8.75, "learning_rate": 6.3858726525754814e-09, "loss": 2.2643885612487793, "step": 790 }, { "epoch": 0.9750692520775623, "grad_norm": 7.0625, "learning_rate": 6.370006902637836e-09, "loss": 1.9207779169082642, "step": 792 }, { "epoch": 0.9775315481686673, "grad_norm": 2.59375, "learning_rate": 6.354135279083497e-09, "loss": 1.2121376991271973, "step": 794 }, { "epoch": 0.9799938442597722, "grad_norm": 10.9375, "learning_rate": 6.338258033864067e-09, "loss": 2.1134583950042725, "step": 796 }, { "epoch": 0.9824561403508771, "grad_norm": 18.125, "learning_rate": 6.3223754190203895e-09, "loss": 2.3652374744415283, "step": 798 }, { "epoch": 0.9849184364419822, "grad_norm": 11.6875, "learning_rate": 6.306487686678556e-09, "loss": 1.956110954284668, "step": 800 }, { "epoch": 0.9873807325330871, "grad_norm": 5.21875, "learning_rate": 6.290595089045882e-09, "loss": 1.993713140487671, "step": 802 }, { "epoch": 0.989843028624192, "grad_norm": 19.25, "learning_rate": 6.274697878406925e-09, "loss": 1.3555768728256226, "step": 804 }, { "epoch": 0.992305324715297, "grad_norm": 14.9375, "learning_rate": 6.2587963071194695e-09, "loss": 1.7694034576416016, "step": 806 }, { "epoch": 0.994767620806402, "grad_norm": 14.0, "learning_rate": 6.242890627610518e-09, "loss": 2.2126145362854004, "step": 808 }, { "epoch": 0.997229916897507, "grad_norm": 5.46875, "learning_rate": 6.226981092372297e-09, "loss": 1.7438420057296753, "step": 810 }, { "epoch": 0.9996922129886119, "grad_norm": 3.671875, "learning_rate": 6.211067953958229e-09, "loss": 1.237831711769104, "step": 812 }, { "epoch": 1.0012311480455525, "grad_norm": 2.15625, "learning_rate": 6.195151464978945e-09, "loss": 1.2776278257369995, "step": 814 }, { "epoch": 1.0036934441366574, "grad_norm": 5.625, "learning_rate": 6.179231878098257e-09, "loss": 1.6098976135253906, "step": 816 }, { "epoch": 1.0061557402277623, "grad_norm": 1.875, "learning_rate": 6.163309446029157e-09, "loss": 1.5421602725982666, "step": 818 }, { "epoch": 1.0086180363188673, "grad_norm": 5.3125, "learning_rate": 6.1473844215298045e-09, "loss": 1.4228730201721191, "step": 820 }, { "epoch": 1.0110803324099722, "grad_norm": 14.75, "learning_rate": 6.131457057399506e-09, "loss": 2.0147526264190674, "step": 822 }, { "epoch": 1.0135426285010773, "grad_norm": 13.9375, "learning_rate": 6.115527606474713e-09, "loss": 2.301534652709961, "step": 824 }, { "epoch": 1.0160049245921823, "grad_norm": 6.75, "learning_rate": 6.099596321625005e-09, "loss": 1.9000599384307861, "step": 826 }, { "epoch": 1.0184672206832872, "grad_norm": 1.4140625, "learning_rate": 6.083663455749068e-09, "loss": 1.2694454193115234, "step": 828 }, { "epoch": 1.0209295167743921, "grad_norm": 2.453125, "learning_rate": 6.0677292617706915e-09, "loss": 1.1476200819015503, "step": 830 }, { "epoch": 1.023391812865497, "grad_norm": 15.125, "learning_rate": 6.051793992634741e-09, "loss": 1.685870885848999, "step": 832 }, { "epoch": 1.025854108956602, "grad_norm": 5.15625, "learning_rate": 6.035857901303159e-09, "loss": 2.1021130084991455, "step": 834 }, { "epoch": 1.028316405047707, "grad_norm": 9.25, "learning_rate": 6.019921240750932e-09, "loss": 1.9393489360809326, "step": 836 }, { "epoch": 1.0307787011388119, "grad_norm": 3.640625, "learning_rate": 6.0039842639620844e-09, "loss": 1.9408633708953857, "step": 838 }, { "epoch": 1.0332409972299168, "grad_norm": 16.875, "learning_rate": 5.988047223925661e-09, "loss": 2.042579174041748, "step": 840 }, { "epoch": 1.035703293321022, "grad_norm": 2.328125, "learning_rate": 5.9721103736317114e-09, "loss": 1.7358704805374146, "step": 842 }, { "epoch": 1.0381655894121269, "grad_norm": 7.53125, "learning_rate": 5.956173966067275e-09, "loss": 1.5867335796356201, "step": 844 }, { "epoch": 1.0406278855032318, "grad_norm": 4.34375, "learning_rate": 5.940238254212358e-09, "loss": 1.8849399089813232, "step": 846 }, { "epoch": 1.0430901815943368, "grad_norm": 4.84375, "learning_rate": 5.924303491035925e-09, "loss": 1.643231987953186, "step": 848 }, { "epoch": 1.0455524776854417, "grad_norm": 14.0625, "learning_rate": 5.9083699294918835e-09, "loss": 2.0420408248901367, "step": 850 }, { "epoch": 1.0480147737765466, "grad_norm": 10.5, "learning_rate": 5.89243782251506e-09, "loss": 2.353334903717041, "step": 852 }, { "epoch": 1.0504770698676515, "grad_norm": 12.625, "learning_rate": 5.876507423017199e-09, "loss": 2.2866880893707275, "step": 854 }, { "epoch": 1.0529393659587565, "grad_norm": 5.09375, "learning_rate": 5.8605789838829335e-09, "loss": 2.091262102127075, "step": 856 }, { "epoch": 1.0554016620498614, "grad_norm": 15.1875, "learning_rate": 5.844652757965778e-09, "loss": 2.1091365814208984, "step": 858 }, { "epoch": 1.0578639581409663, "grad_norm": 2.4375, "learning_rate": 5.828728998084117e-09, "loss": 1.6677895784378052, "step": 860 }, { "epoch": 1.0603262542320715, "grad_norm": 4.4375, "learning_rate": 5.812807957017181e-09, "loss": 1.5235992670059204, "step": 862 }, { "epoch": 1.0627885503231764, "grad_norm": 12.1875, "learning_rate": 5.796889887501051e-09, "loss": 2.279834270477295, "step": 864 }, { "epoch": 1.0652508464142814, "grad_norm": 9.125, "learning_rate": 5.780975042224629e-09, "loss": 2.450547456741333, "step": 866 }, { "epoch": 1.0677131425053863, "grad_norm": 61.25, "learning_rate": 5.765063673825634e-09, "loss": 2.2601470947265625, "step": 868 }, { "epoch": 1.0701754385964912, "grad_norm": 3.140625, "learning_rate": 5.749156034886602e-09, "loss": 1.6974682807922363, "step": 870 }, { "epoch": 1.0726377346875962, "grad_norm": 5.75, "learning_rate": 5.733252377930853e-09, "loss": 1.7504122257232666, "step": 872 }, { "epoch": 1.075100030778701, "grad_norm": 3.640625, "learning_rate": 5.7173529554185045e-09, "loss": 1.7744596004486084, "step": 874 }, { "epoch": 1.077562326869806, "grad_norm": 5.0625, "learning_rate": 5.701458019742448e-09, "loss": 1.8063809871673584, "step": 876 }, { "epoch": 1.080024622960911, "grad_norm": 4.75, "learning_rate": 5.685567823224358e-09, "loss": 1.8798420429229736, "step": 878 }, { "epoch": 1.082486919052016, "grad_norm": 12.75, "learning_rate": 5.669682618110672e-09, "loss": 2.0758848190307617, "step": 880 }, { "epoch": 1.084949215143121, "grad_norm": 12.5, "learning_rate": 5.653802656568592e-09, "loss": 2.1326591968536377, "step": 882 }, { "epoch": 1.087411511234226, "grad_norm": 4.8125, "learning_rate": 5.637928190682084e-09, "loss": 1.9486507177352905, "step": 884 }, { "epoch": 1.089873807325331, "grad_norm": 6.75, "learning_rate": 5.622059472447876e-09, "loss": 1.9365224838256836, "step": 886 }, { "epoch": 1.0923361034164358, "grad_norm": 6.4375, "learning_rate": 5.606196753771449e-09, "loss": 1.8881072998046875, "step": 888 }, { "epoch": 1.0947983995075408, "grad_norm": 7.21875, "learning_rate": 5.590340286463054e-09, "loss": 1.9489333629608154, "step": 890 }, { "epoch": 1.0972606955986457, "grad_norm": 8.5, "learning_rate": 5.574490322233697e-09, "loss": 1.9946143627166748, "step": 892 }, { "epoch": 1.0997229916897506, "grad_norm": 3.484375, "learning_rate": 5.558647112691158e-09, "loss": 1.6062787771224976, "step": 894 }, { "epoch": 1.1021852877808556, "grad_norm": 2.859375, "learning_rate": 5.542810909335987e-09, "loss": 1.2802103757858276, "step": 896 }, { "epoch": 1.1046475838719605, "grad_norm": 17.0, "learning_rate": 5.526981963557518e-09, "loss": 1.7315878868103027, "step": 898 }, { "epoch": 1.1071098799630656, "grad_norm": 9.0, "learning_rate": 5.511160526629875e-09, "loss": 1.9750934839248657, "step": 900 }, { "epoch": 1.1095721760541706, "grad_norm": 3.515625, "learning_rate": 5.495346849707981e-09, "loss": 1.6797375679016113, "step": 902 }, { "epoch": 1.1120344721452755, "grad_norm": 10.75, "learning_rate": 5.479541183823578e-09, "loss": 1.8305199146270752, "step": 904 }, { "epoch": 1.1144967682363804, "grad_norm": 4.84375, "learning_rate": 5.463743779881238e-09, "loss": 1.9975595474243164, "step": 906 }, { "epoch": 1.1169590643274854, "grad_norm": 4.65625, "learning_rate": 5.447954888654378e-09, "loss": 1.7815577983856201, "step": 908 }, { "epoch": 1.1194213604185903, "grad_norm": 3.109375, "learning_rate": 5.432174760781281e-09, "loss": 1.5837122201919556, "step": 910 }, { "epoch": 1.1218836565096952, "grad_norm": 2.25, "learning_rate": 5.416403646761119e-09, "loss": 1.2701913118362427, "step": 912 }, { "epoch": 1.1243459526008002, "grad_norm": 2.890625, "learning_rate": 5.400641796949976e-09, "loss": 1.3599649667739868, "step": 914 }, { "epoch": 1.1268082486919053, "grad_norm": 6.34375, "learning_rate": 5.384889461556868e-09, "loss": 1.5575028657913208, "step": 916 }, { "epoch": 1.1292705447830103, "grad_norm": 3.34375, "learning_rate": 5.36914689063978e-09, "loss": 1.4743753671646118, "step": 918 }, { "epoch": 1.1317328408741152, "grad_norm": 5.25, "learning_rate": 5.353414334101692e-09, "loss": 1.5236045122146606, "step": 920 }, { "epoch": 1.1341951369652201, "grad_norm": 4.4375, "learning_rate": 5.337692041686615e-09, "loss": 1.891930341720581, "step": 922 }, { "epoch": 1.136657433056325, "grad_norm": 2.046875, "learning_rate": 5.321980262975614e-09, "loss": 1.522653341293335, "step": 924 }, { "epoch": 1.13911972914743, "grad_norm": 15.625, "learning_rate": 5.306279247382867e-09, "loss": 1.66744065284729, "step": 926 }, { "epoch": 1.141582025238535, "grad_norm": 16.875, "learning_rate": 5.290589244151689e-09, "loss": 2.157740592956543, "step": 928 }, { "epoch": 1.1440443213296398, "grad_norm": 2.390625, "learning_rate": 5.274910502350581e-09, "loss": 1.5675222873687744, "step": 930 }, { "epoch": 1.1465066174207448, "grad_norm": 4.84375, "learning_rate": 5.259243270869276e-09, "loss": 1.1499652862548828, "step": 932 }, { "epoch": 1.1489689135118497, "grad_norm": 12.75, "learning_rate": 5.243587798414792e-09, "loss": 1.5367200374603271, "step": 934 }, { "epoch": 1.1514312096029546, "grad_norm": 5.34375, "learning_rate": 5.227944333507477e-09, "loss": 1.9310216903686523, "step": 936 }, { "epoch": 1.1538935056940598, "grad_norm": 11.5, "learning_rate": 5.212313124477067e-09, "loss": 2.123908519744873, "step": 938 }, { "epoch": 1.1563558017851647, "grad_norm": 7.28125, "learning_rate": 5.196694419458744e-09, "loss": 2.1816015243530273, "step": 940 }, { "epoch": 1.1588180978762697, "grad_norm": 1.84375, "learning_rate": 5.1810884663891986e-09, "loss": 1.5526807308197021, "step": 942 }, { "epoch": 1.1612803939673746, "grad_norm": 1.8671875, "learning_rate": 5.165495513002691e-09, "loss": 1.3024842739105225, "step": 944 }, { "epoch": 1.1637426900584795, "grad_norm": 2.796875, "learning_rate": 5.149915806827121e-09, "loss": 1.2783153057098389, "step": 946 }, { "epoch": 1.1662049861495845, "grad_norm": 5.125, "learning_rate": 5.134349595180094e-09, "loss": 1.5641247034072876, "step": 948 }, { "epoch": 1.1686672822406894, "grad_norm": 7.0, "learning_rate": 5.1187971251650065e-09, "loss": 1.9546620845794678, "step": 950 }, { "epoch": 1.1711295783317943, "grad_norm": 4.4375, "learning_rate": 5.10325864366711e-09, "loss": 1.87162446975708, "step": 952 }, { "epoch": 1.1735918744228995, "grad_norm": 11.5, "learning_rate": 5.087734397349596e-09, "loss": 1.8723485469818115, "step": 954 }, { "epoch": 1.1760541705140044, "grad_norm": 5.21875, "learning_rate": 5.072224632649684e-09, "loss": 1.91074538230896, "step": 956 }, { "epoch": 1.1785164666051093, "grad_norm": 5.25, "learning_rate": 5.056729595774712e-09, "loss": 1.9009315967559814, "step": 958 }, { "epoch": 1.1809787626962143, "grad_norm": 7.3125, "learning_rate": 5.041249532698214e-09, "loss": 1.9836119413375854, "step": 960 }, { "epoch": 1.1834410587873192, "grad_norm": 9.375, "learning_rate": 5.025784689156032e-09, "loss": 1.9037981033325195, "step": 962 }, { "epoch": 1.1859033548784241, "grad_norm": 27.875, "learning_rate": 5.0103353106424065e-09, "loss": 2.551020622253418, "step": 964 }, { "epoch": 1.188365650969529, "grad_norm": 12.75, "learning_rate": 4.994901642406078e-09, "loss": 2.474264144897461, "step": 966 }, { "epoch": 1.190827947060634, "grad_norm": 11.5625, "learning_rate": 4.979483929446398e-09, "loss": 1.7837506532669067, "step": 968 }, { "epoch": 1.193290243151739, "grad_norm": 3.65625, "learning_rate": 4.964082416509442e-09, "loss": 1.760176181793213, "step": 970 }, { "epoch": 1.1957525392428439, "grad_norm": 17.75, "learning_rate": 4.948697348084115e-09, "loss": 1.9721624851226807, "step": 972 }, { "epoch": 1.1982148353339488, "grad_norm": 6.6875, "learning_rate": 4.933328968398283e-09, "loss": 1.8035709857940674, "step": 974 }, { "epoch": 1.200677131425054, "grad_norm": 5.21875, "learning_rate": 4.9179775214148806e-09, "loss": 1.6362351179122925, "step": 976 }, { "epoch": 1.2031394275161589, "grad_norm": 5.90625, "learning_rate": 4.902643250828055e-09, "loss": 1.7732539176940918, "step": 978 }, { "epoch": 1.2056017236072638, "grad_norm": 4.875, "learning_rate": 4.887326400059283e-09, "loss": 1.7590731382369995, "step": 980 }, { "epoch": 1.2080640196983687, "grad_norm": 2.421875, "learning_rate": 4.8720272122535195e-09, "loss": 1.590978980064392, "step": 982 }, { "epoch": 1.2105263157894737, "grad_norm": 22.875, "learning_rate": 4.8567459302753234e-09, "loss": 1.8453547954559326, "step": 984 }, { "epoch": 1.2129886118805786, "grad_norm": 6.71875, "learning_rate": 4.841482796705019e-09, "loss": 2.2472167015075684, "step": 986 }, { "epoch": 1.2154509079716835, "grad_norm": 5.0625, "learning_rate": 4.826238053834831e-09, "loss": 1.9840574264526367, "step": 988 }, { "epoch": 1.2179132040627885, "grad_norm": 9.3125, "learning_rate": 4.811011943665047e-09, "loss": 1.930182695388794, "step": 990 }, { "epoch": 1.2203755001538936, "grad_norm": 15.875, "learning_rate": 4.795804707900169e-09, "loss": 2.222364664077759, "step": 992 }, { "epoch": 1.2228377962449986, "grad_norm": 10.9375, "learning_rate": 4.780616587945083e-09, "loss": 2.241105079650879, "step": 994 }, { "epoch": 1.2253000923361035, "grad_norm": 6.09375, "learning_rate": 4.765447824901222e-09, "loss": 2.1059789657592773, "step": 996 }, { "epoch": 1.2277623884272084, "grad_norm": 5.0625, "learning_rate": 4.750298659562745e-09, "loss": 1.9286503791809082, "step": 998 }, { "epoch": 1.2302246845183133, "grad_norm": 4.84375, "learning_rate": 4.735169332412704e-09, "loss": 1.8667454719543457, "step": 1000 }, { "epoch": 1.2326869806094183, "grad_norm": 9.4375, "learning_rate": 4.720060083619239e-09, "loss": 2.0463290214538574, "step": 1002 }, { "epoch": 1.2351492767005232, "grad_norm": 6.28125, "learning_rate": 4.7049711530317564e-09, "loss": 2.106719970703125, "step": 1004 }, { "epoch": 1.2376115727916281, "grad_norm": 3.8125, "learning_rate": 4.6899027801771234e-09, "loss": 1.829174518585205, "step": 1006 }, { "epoch": 1.240073868882733, "grad_norm": 47.5, "learning_rate": 4.6748552042558664e-09, "loss": 2.110135555267334, "step": 1008 }, { "epoch": 1.242536164973838, "grad_norm": 15.9375, "learning_rate": 4.659828664138378e-09, "loss": 2.152853012084961, "step": 1010 }, { "epoch": 1.244998461064943, "grad_norm": 10.75, "learning_rate": 4.6448233983611165e-09, "loss": 1.862748622894287, "step": 1012 }, { "epoch": 1.247460757156048, "grad_norm": 20.375, "learning_rate": 4.629839645122828e-09, "loss": 2.054180860519409, "step": 1014 }, { "epoch": 1.249923053247153, "grad_norm": 10.5, "learning_rate": 4.614877642280759e-09, "loss": 2.0183398723602295, "step": 1016 }, { "epoch": 1.252385349338258, "grad_norm": 4.3125, "learning_rate": 4.59993762734688e-09, "loss": 1.9448716640472412, "step": 1018 }, { "epoch": 1.254847645429363, "grad_norm": 5.3125, "learning_rate": 4.585019837484127e-09, "loss": 1.909618854522705, "step": 1020 }, { "epoch": 1.2573099415204678, "grad_norm": 4.4375, "learning_rate": 4.5701245095026175e-09, "loss": 1.8093581199645996, "step": 1022 }, { "epoch": 1.2597722376115728, "grad_norm": 4.375, "learning_rate": 4.555251879855905e-09, "loss": 1.8561820983886719, "step": 1024 }, { "epoch": 1.2622345337026777, "grad_norm": 5.71875, "learning_rate": 4.540402184637225e-09, "loss": 1.9136399030685425, "step": 1026 }, { "epoch": 1.2646968297937828, "grad_norm": 6.1875, "learning_rate": 4.525575659575739e-09, "loss": 1.922465443611145, "step": 1028 }, { "epoch": 1.2671591258848878, "grad_norm": 6.125, "learning_rate": 4.510772540032801e-09, "loss": 1.945884346961975, "step": 1030 }, { "epoch": 1.2696214219759927, "grad_norm": 11.6875, "learning_rate": 4.495993060998216e-09, "loss": 2.1394665241241455, "step": 1032 }, { "epoch": 1.2720837180670976, "grad_norm": 12.875, "learning_rate": 4.481237457086511e-09, "loss": 2.548738479614258, "step": 1034 }, { "epoch": 1.2745460141582026, "grad_norm": 6.65625, "learning_rate": 4.466505962533216e-09, "loss": 2.148568868637085, "step": 1036 }, { "epoch": 1.2770083102493075, "grad_norm": 143.0, "learning_rate": 4.451798811191132e-09, "loss": 2.0206987857818604, "step": 1038 }, { "epoch": 1.2794706063404124, "grad_norm": 4.78125, "learning_rate": 4.437116236526635e-09, "loss": 2.025409698486328, "step": 1040 }, { "epoch": 1.2819329024315174, "grad_norm": 14.875, "learning_rate": 4.42245847161596e-09, "loss": 1.8983882665634155, "step": 1042 }, { "epoch": 1.2843951985226223, "grad_norm": 1.8515625, "learning_rate": 4.4078257491415e-09, "loss": 1.594254732131958, "step": 1044 }, { "epoch": 1.2868574946137272, "grad_norm": 3.75, "learning_rate": 4.393218301388123e-09, "loss": 1.4578649997711182, "step": 1046 }, { "epoch": 1.2893197907048322, "grad_norm": 6.0625, "learning_rate": 4.378636360239471e-09, "loss": 1.8163200616836548, "step": 1048 }, { "epoch": 1.291782086795937, "grad_norm": 21.625, "learning_rate": 4.364080157174287e-09, "loss": 1.811424732208252, "step": 1050 }, { "epoch": 1.2942443828870422, "grad_norm": 6.46875, "learning_rate": 4.349549923262743e-09, "loss": 1.6952979564666748, "step": 1052 }, { "epoch": 1.2967066789781472, "grad_norm": 8.9375, "learning_rate": 4.33504588916276e-09, "loss": 1.85584557056427, "step": 1054 }, { "epoch": 1.299168975069252, "grad_norm": 6.25, "learning_rate": 4.320568285116362e-09, "loss": 1.8780372142791748, "step": 1056 }, { "epoch": 1.301631271160357, "grad_norm": 3.265625, "learning_rate": 4.306117340946008e-09, "loss": 1.694900393486023, "step": 1058 }, { "epoch": 1.304093567251462, "grad_norm": 5.40625, "learning_rate": 4.291693286050951e-09, "loss": 1.7237621545791626, "step": 1060 }, { "epoch": 1.306555863342567, "grad_norm": 7.8125, "learning_rate": 4.277296349403592e-09, "loss": 1.9782402515411377, "step": 1062 }, { "epoch": 1.3090181594336718, "grad_norm": 11.625, "learning_rate": 4.262926759545853e-09, "loss": 2.2806496620178223, "step": 1064 }, { "epoch": 1.311480455524777, "grad_norm": 14.9375, "learning_rate": 4.2485847445855384e-09, "loss": 2.0329091548919678, "step": 1066 }, { "epoch": 1.313942751615882, "grad_norm": 7.8125, "learning_rate": 4.234270532192722e-09, "loss": 1.996172308921814, "step": 1068 }, { "epoch": 1.3164050477069869, "grad_norm": 5.4375, "learning_rate": 4.219984349596131e-09, "loss": 1.7426702976226807, "step": 1070 }, { "epoch": 1.3188673437980918, "grad_norm": 4.09375, "learning_rate": 4.205726423579531e-09, "loss": 1.9689075946807861, "step": 1072 }, { "epoch": 1.3213296398891967, "grad_norm": 4.375, "learning_rate": 4.1914969804781435e-09, "loss": 1.851407766342163, "step": 1074 }, { "epoch": 1.3237919359803016, "grad_norm": 4.5625, "learning_rate": 4.177296246175035e-09, "loss": 1.9321177005767822, "step": 1076 }, { "epoch": 1.3262542320714066, "grad_norm": 10.75, "learning_rate": 4.1631244460975395e-09, "loss": 2.1217970848083496, "step": 1078 }, { "epoch": 1.3287165281625115, "grad_norm": 2.34375, "learning_rate": 4.148981805213683e-09, "loss": 1.6175642013549805, "step": 1080 }, { "epoch": 1.3311788242536164, "grad_norm": 9.9375, "learning_rate": 4.134868548028603e-09, "loss": 1.8694862127304077, "step": 1082 }, { "epoch": 1.3336411203447214, "grad_norm": 3.9375, "learning_rate": 4.120784898580994e-09, "loss": 1.9671717882156372, "step": 1084 }, { "epoch": 1.3361034164358263, "grad_norm": 5.9375, "learning_rate": 4.106731080439549e-09, "loss": 1.6825287342071533, "step": 1086 }, { "epoch": 1.3385657125269312, "grad_norm": 3.03125, "learning_rate": 4.092707316699403e-09, "loss": 1.5507920980453491, "step": 1088 }, { "epoch": 1.3410280086180364, "grad_norm": 6.03125, "learning_rate": 4.078713829978599e-09, "loss": 1.4552762508392334, "step": 1090 }, { "epoch": 1.3434903047091413, "grad_norm": 7.09375, "learning_rate": 4.064750842414555e-09, "loss": 1.8754684925079346, "step": 1092 }, { "epoch": 1.3459526008002463, "grad_norm": 94.5, "learning_rate": 4.050818575660528e-09, "loss": 2.175379753112793, "step": 1094 }, { "epoch": 1.3484148968913512, "grad_norm": 2.921875, "learning_rate": 4.0369172508821154e-09, "loss": 1.8554493188858032, "step": 1096 }, { "epoch": 1.3508771929824561, "grad_norm": 4.5625, "learning_rate": 4.023047088753718e-09, "loss": 1.2790199518203735, "step": 1098 }, { "epoch": 1.353339489073561, "grad_norm": 4.75, "learning_rate": 4.009208309455052e-09, "loss": 1.7523287534713745, "step": 1100 }, { "epoch": 1.355801785164666, "grad_norm": 7.9375, "learning_rate": 3.9954011326676595e-09, "loss": 2.061239242553711, "step": 1102 }, { "epoch": 1.3582640812557711, "grad_norm": 8.875, "learning_rate": 3.981625777571407e-09, "loss": 2.029423713684082, "step": 1104 }, { "epoch": 1.360726377346876, "grad_norm": 22.125, "learning_rate": 3.967882462841013e-09, "loss": 2.4487719535827637, "step": 1106 }, { "epoch": 1.363188673437981, "grad_norm": 13.4375, "learning_rate": 3.954171406642579e-09, "loss": 2.2747087478637695, "step": 1108 }, { "epoch": 1.365650969529086, "grad_norm": 8.875, "learning_rate": 3.940492826630122e-09, "loss": 2.142123222351074, "step": 1110 }, { "epoch": 1.3681132656201909, "grad_norm": 13.3125, "learning_rate": 3.926846939942119e-09, "loss": 2.411155939102173, "step": 1112 }, { "epoch": 1.3705755617112958, "grad_norm": 6.96875, "learning_rate": 3.913233963198062e-09, "loss": 2.1852264404296875, "step": 1114 }, { "epoch": 1.3730378578024007, "grad_norm": 2.71875, "learning_rate": 3.899654112495024e-09, "loss": 1.5160444974899292, "step": 1116 }, { "epoch": 1.3755001538935057, "grad_norm": 4.59375, "learning_rate": 3.886107603404221e-09, "loss": 1.5113252401351929, "step": 1118 }, { "epoch": 1.3779624499846106, "grad_norm": 4.71875, "learning_rate": 3.872594650967591e-09, "loss": 1.700373649597168, "step": 1120 }, { "epoch": 1.3804247460757155, "grad_norm": 9.5625, "learning_rate": 3.859115469694385e-09, "loss": 1.9584300518035889, "step": 1122 }, { "epoch": 1.3828870421668205, "grad_norm": 5.5, "learning_rate": 3.845670273557754e-09, "loss": 1.8532516956329346, "step": 1124 }, { "epoch": 1.3853493382579254, "grad_norm": 4.21875, "learning_rate": 3.832259275991365e-09, "loss": 1.640071988105774, "step": 1126 }, { "epoch": 1.3878116343490305, "grad_norm": 3.390625, "learning_rate": 3.818882689885998e-09, "loss": 1.2326576709747314, "step": 1128 }, { "epoch": 1.3902739304401355, "grad_norm": 4.375, "learning_rate": 3.80554072758618e-09, "loss": 1.5156090259552002, "step": 1130 }, { "epoch": 1.3927362265312404, "grad_norm": 2.625, "learning_rate": 3.7922336008868e-09, "loss": 1.5685241222381592, "step": 1132 }, { "epoch": 1.3951985226223453, "grad_norm": 5.09375, "learning_rate": 3.778961521029762e-09, "loss": 1.6617923974990845, "step": 1134 }, { "epoch": 1.3976608187134503, "grad_norm": 6.46875, "learning_rate": 3.765724698700621e-09, "loss": 1.8906147480010986, "step": 1136 }, { "epoch": 1.4001231148045552, "grad_norm": 2.875, "learning_rate": 3.752523344025243e-09, "loss": 1.545287847518921, "step": 1138 }, { "epoch": 1.4025854108956601, "grad_norm": 7.78125, "learning_rate": 3.7393576665664675e-09, "loss": 1.732557773590088, "step": 1140 }, { "epoch": 1.4050477069867653, "grad_norm": 2.25, "learning_rate": 3.7262278753207815e-09, "loss": 1.72062087059021, "step": 1142 }, { "epoch": 1.4075100030778702, "grad_norm": 8.75, "learning_rate": 3.7131341787150018e-09, "loss": 1.5638048648834229, "step": 1144 }, { "epoch": 1.4099722991689752, "grad_norm": 25.0, "learning_rate": 3.7000767846029665e-09, "loss": 2.013415575027466, "step": 1146 }, { "epoch": 1.41243459526008, "grad_norm": 2.46875, "learning_rate": 3.687055900262238e-09, "loss": 1.5985221862792969, "step": 1148 }, { "epoch": 1.414896891351185, "grad_norm": 12.1875, "learning_rate": 3.6740717323908046e-09, "loss": 1.7952547073364258, "step": 1150 }, { "epoch": 1.41735918744229, "grad_norm": 2.9375, "learning_rate": 3.6611244871038118e-09, "loss": 1.5459375381469727, "step": 1152 }, { "epoch": 1.4198214835333949, "grad_norm": 6.84375, "learning_rate": 3.648214369930278e-09, "loss": 1.641556739807129, "step": 1154 }, { "epoch": 1.4222837796244998, "grad_norm": 2.109375, "learning_rate": 3.635341585809837e-09, "loss": 1.5961995124816895, "step": 1156 }, { "epoch": 1.4247460757156047, "grad_norm": 9.125, "learning_rate": 3.6225063390894896e-09, "loss": 1.6079602241516113, "step": 1158 }, { "epoch": 1.4272083718067097, "grad_norm": 4.84375, "learning_rate": 3.609708833520351e-09, "loss": 2.1076085567474365, "step": 1160 }, { "epoch": 1.4296706678978146, "grad_norm": 19.125, "learning_rate": 3.5969492722544207e-09, "loss": 2.1435282230377197, "step": 1162 }, { "epoch": 1.4321329639889195, "grad_norm": 1.796875, "learning_rate": 3.5842278578413577e-09, "loss": 1.6422967910766602, "step": 1164 }, { "epoch": 1.4345952600800247, "grad_norm": 4.1875, "learning_rate": 3.5715447922252655e-09, "loss": 1.4160196781158447, "step": 1166 }, { "epoch": 1.4370575561711296, "grad_norm": 7.78125, "learning_rate": 3.558900276741485e-09, "loss": 1.9306385517120361, "step": 1168 }, { "epoch": 1.4395198522622346, "grad_norm": 6.625, "learning_rate": 3.5462945121134016e-09, "loss": 2.028043508529663, "step": 1170 }, { "epoch": 1.4419821483533395, "grad_norm": 18.125, "learning_rate": 3.533727698449252e-09, "loss": 1.7561140060424805, "step": 1172 }, { "epoch": 1.4444444444444444, "grad_norm": 11.6875, "learning_rate": 3.521200035238954e-09, "loss": 1.9722295999526978, "step": 1174 }, { "epoch": 1.4469067405355494, "grad_norm": 5.40625, "learning_rate": 3.5087117213509367e-09, "loss": 2.2334213256835938, "step": 1176 }, { "epoch": 1.4493690366266543, "grad_norm": 10.1875, "learning_rate": 3.4962629550289858e-09, "loss": 2.2049357891082764, "step": 1178 }, { "epoch": 1.4518313327177594, "grad_norm": 11.0625, "learning_rate": 3.4838539338890964e-09, "loss": 2.2469396591186523, "step": 1180 }, { "epoch": 1.4542936288088644, "grad_norm": 5.59375, "learning_rate": 3.4714848549163314e-09, "loss": 2.023268938064575, "step": 1182 }, { "epoch": 1.4567559248999693, "grad_norm": 3.671875, "learning_rate": 3.4591559144617014e-09, "loss": 1.8120558261871338, "step": 1184 }, { "epoch": 1.4592182209910742, "grad_norm": 5.65625, "learning_rate": 3.4468673082390432e-09, "loss": 1.7612297534942627, "step": 1186 }, { "epoch": 1.4616805170821792, "grad_norm": 23.5, "learning_rate": 3.434619231321912e-09, "loss": 1.9972333908081055, "step": 1188 }, { "epoch": 1.464142813173284, "grad_norm": 4.3125, "learning_rate": 3.4224118781404923e-09, "loss": 1.8834655284881592, "step": 1190 }, { "epoch": 1.466605109264389, "grad_norm": 35.25, "learning_rate": 3.4102454424784997e-09, "loss": 2.4007821083068848, "step": 1192 }, { "epoch": 1.469067405355494, "grad_norm": 9.0, "learning_rate": 3.398120117470115e-09, "loss": 2.477167844772339, "step": 1194 }, { "epoch": 1.471529701446599, "grad_norm": 8.625, "learning_rate": 3.3860360955969127e-09, "loss": 2.0541319847106934, "step": 1196 }, { "epoch": 1.4739919975377038, "grad_norm": 11.3125, "learning_rate": 3.373993568684808e-09, "loss": 2.007800579071045, "step": 1198 }, { "epoch": 1.4764542936288088, "grad_norm": 13.125, "learning_rate": 3.36199272790101e-09, "loss": 2.2932679653167725, "step": 1200 }, { "epoch": 1.4789165897199137, "grad_norm": 2.8125, "learning_rate": 3.350033763750989e-09, "loss": 1.7902061939239502, "step": 1202 }, { "epoch": 1.4813788858110188, "grad_norm": 15.0625, "learning_rate": 3.3381168660754523e-09, "loss": 1.8084830045700073, "step": 1204 }, { "epoch": 1.4838411819021238, "grad_norm": 5.46875, "learning_rate": 3.3262422240473268e-09, "loss": 1.930219054222107, "step": 1206 }, { "epoch": 1.4863034779932287, "grad_norm": 4.65625, "learning_rate": 3.314410026168757e-09, "loss": 1.8515759706497192, "step": 1208 }, { "epoch": 1.4887657740843336, "grad_norm": 20.875, "learning_rate": 3.30262046026812e-09, "loss": 2.1966378688812256, "step": 1210 }, { "epoch": 1.4912280701754386, "grad_norm": 5.0, "learning_rate": 3.2908737134970367e-09, "loss": 2.388540744781494, "step": 1212 }, { "epoch": 1.4936903662665435, "grad_norm": 10.375, "learning_rate": 3.2791699723273984e-09, "loss": 2.1200718879699707, "step": 1214 }, { "epoch": 1.4961526623576484, "grad_norm": 3.515625, "learning_rate": 3.2675094225484135e-09, "loss": 2.037621021270752, "step": 1216 }, { "epoch": 1.4986149584487536, "grad_norm": 3.234375, "learning_rate": 3.2558922492636578e-09, "loss": 1.5640082359313965, "step": 1218 }, { "epoch": 1.5010772545398585, "grad_norm": 6.59375, "learning_rate": 3.2443186368881287e-09, "loss": 1.5967392921447754, "step": 1220 }, { "epoch": 1.5035395506309635, "grad_norm": 1.1875, "learning_rate": 3.2327887691453277e-09, "loss": 1.4248828887939453, "step": 1222 }, { "epoch": 1.5060018467220684, "grad_norm": 5.84375, "learning_rate": 3.2213028290643363e-09, "loss": 1.5917315483093262, "step": 1224 }, { "epoch": 1.5084641428131733, "grad_norm": 5.59375, "learning_rate": 3.2098609989769122e-09, "loss": 1.761174201965332, "step": 1226 }, { "epoch": 1.5109264389042782, "grad_norm": 13.8125, "learning_rate": 3.198463460514598e-09, "loss": 1.7805390357971191, "step": 1228 }, { "epoch": 1.5133887349953832, "grad_norm": 3.125, "learning_rate": 3.1871103946058343e-09, "loss": 2.06949782371521, "step": 1230 }, { "epoch": 1.515851031086488, "grad_norm": 8.0625, "learning_rate": 3.1758019814730902e-09, "loss": 1.6458537578582764, "step": 1232 }, { "epoch": 1.518313327177593, "grad_norm": 5.90625, "learning_rate": 3.1645384006300033e-09, "loss": 1.8969038724899292, "step": 1234 }, { "epoch": 1.520775623268698, "grad_norm": 2.53125, "learning_rate": 3.153319830878523e-09, "loss": 1.5056371688842773, "step": 1236 }, { "epoch": 1.523237919359803, "grad_norm": 25.5, "learning_rate": 3.142146450306082e-09, "loss": 1.7204036712646484, "step": 1238 }, { "epoch": 1.5257002154509078, "grad_norm": 5.5625, "learning_rate": 3.1310184362827594e-09, "loss": 1.7970688343048096, "step": 1240 }, { "epoch": 1.5281625115420128, "grad_norm": 2.75, "learning_rate": 3.1199359654584756e-09, "loss": 1.5522937774658203, "step": 1242 }, { "epoch": 1.530624807633118, "grad_norm": 5.46875, "learning_rate": 3.1088992137601797e-09, "loss": 1.5566771030426025, "step": 1244 }, { "epoch": 1.5330871037242229, "grad_norm": 4.875, "learning_rate": 3.097908356389059e-09, "loss": 1.8924975395202637, "step": 1246 }, { "epoch": 1.5355493998153278, "grad_norm": 2.234375, "learning_rate": 3.08696356781776e-09, "loss": 1.5438798666000366, "step": 1248 }, { "epoch": 1.5380116959064327, "grad_norm": 1.8515625, "learning_rate": 3.0760650217876174e-09, "loss": 1.286960482597351, "step": 1250 }, { "epoch": 1.5404739919975377, "grad_norm": 3.140625, "learning_rate": 3.0652128913058935e-09, "loss": 1.1232177019119263, "step": 1252 }, { "epoch": 1.5429362880886428, "grad_norm": 10.0625, "learning_rate": 3.0544073486430396e-09, "loss": 1.7119476795196533, "step": 1254 }, { "epoch": 1.5453985841797477, "grad_norm": 4.84375, "learning_rate": 3.0436485653299487e-09, "loss": 2.0494632720947266, "step": 1256 }, { "epoch": 1.5478608802708527, "grad_norm": 3.1875, "learning_rate": 3.032936712155246e-09, "loss": 1.5645394325256348, "step": 1258 }, { "epoch": 1.5503231763619576, "grad_norm": 11.1875, "learning_rate": 3.022271959162567e-09, "loss": 1.7430448532104492, "step": 1260 }, { "epoch": 1.5527854724530625, "grad_norm": 3.25, "learning_rate": 3.0116544756478663e-09, "loss": 1.6215105056762695, "step": 1262 }, { "epoch": 1.5552477685441675, "grad_norm": 5.40625, "learning_rate": 3.001084430156724e-09, "loss": 1.4022070169448853, "step": 1264 }, { "epoch": 1.5577100646352724, "grad_norm": 4.3125, "learning_rate": 2.990561990481675e-09, "loss": 1.7849698066711426, "step": 1266 }, { "epoch": 1.5601723607263773, "grad_norm": 2.90625, "learning_rate": 2.9800873236595416e-09, "loss": 1.514677882194519, "step": 1268 }, { "epoch": 1.5626346568174823, "grad_norm": 10.0, "learning_rate": 2.9696605959687833e-09, "loss": 1.529390573501587, "step": 1270 }, { "epoch": 1.5650969529085872, "grad_norm": 2.5625, "learning_rate": 2.9592819729268566e-09, "loss": 1.8093581199645996, "step": 1272 }, { "epoch": 1.5675592489996921, "grad_norm": 10.0625, "learning_rate": 2.948951619287592e-09, "loss": 1.3842357397079468, "step": 1274 }, { "epoch": 1.570021545090797, "grad_norm": 14.5, "learning_rate": 2.938669699038571e-09, "loss": 1.85842764377594, "step": 1276 }, { "epoch": 1.572483841181902, "grad_norm": 29.0, "learning_rate": 2.928436375398528e-09, "loss": 2.2186334133148193, "step": 1278 }, { "epoch": 1.574946137273007, "grad_norm": 7.625, "learning_rate": 2.9182518108147588e-09, "loss": 2.11116361618042, "step": 1280 }, { "epoch": 1.577408433364112, "grad_norm": 10.5625, "learning_rate": 2.9081161669605395e-09, "loss": 2.039137363433838, "step": 1282 }, { "epoch": 1.579870729455217, "grad_norm": 1.7578125, "learning_rate": 2.8980296047325638e-09, "loss": 1.548026204109192, "step": 1284 }, { "epoch": 1.582333025546322, "grad_norm": 6.34375, "learning_rate": 2.8879922842483867e-09, "loss": 1.4916882514953613, "step": 1286 }, { "epoch": 1.5847953216374269, "grad_norm": 4.5, "learning_rate": 2.8780043648438818e-09, "loss": 1.6858062744140625, "step": 1288 }, { "epoch": 1.587257617728532, "grad_norm": 6.84375, "learning_rate": 2.868066005070713e-09, "loss": 1.8366402387619019, "step": 1290 }, { "epoch": 1.589719913819637, "grad_norm": 3.15625, "learning_rate": 2.8581773626938166e-09, "loss": 1.4952478408813477, "step": 1292 }, { "epoch": 1.5921822099107419, "grad_norm": 4.3125, "learning_rate": 2.8483385946889017e-09, "loss": 1.4701340198516846, "step": 1294 }, { "epoch": 1.5946445060018468, "grad_norm": 5.25, "learning_rate": 2.8385498572399503e-09, "loss": 1.8555335998535156, "step": 1296 }, { "epoch": 1.5971068020929517, "grad_norm": 5.0, "learning_rate": 2.828811305736743e-09, "loss": 1.8610620498657227, "step": 1298 }, { "epoch": 1.5995690981840567, "grad_norm": 7.09375, "learning_rate": 2.8191230947723945e-09, "loss": 1.883762240409851, "step": 1300 }, { "epoch": 1.6020313942751616, "grad_norm": 14.5625, "learning_rate": 2.809485378140893e-09, "loss": 2.238772392272949, "step": 1302 }, { "epoch": 1.6044936903662665, "grad_norm": 6.25, "learning_rate": 2.7998983088346625e-09, "loss": 2.1114282608032227, "step": 1304 }, { "epoch": 1.6069559864573715, "grad_norm": 1.9140625, "learning_rate": 2.7903620390421363e-09, "loss": 1.6002395153045654, "step": 1306 }, { "epoch": 1.6094182825484764, "grad_norm": 9.4375, "learning_rate": 2.7808767201453376e-09, "loss": 1.6772760152816772, "step": 1308 }, { "epoch": 1.6118805786395813, "grad_norm": 10.4375, "learning_rate": 2.771442502717478e-09, "loss": 2.111185073852539, "step": 1310 }, { "epoch": 1.6143428747306863, "grad_norm": 14.125, "learning_rate": 2.7620595365205627e-09, "loss": 2.0705718994140625, "step": 1312 }, { "epoch": 1.6168051708217912, "grad_norm": 4.46875, "learning_rate": 2.752727970503024e-09, "loss": 1.95082426071167, "step": 1314 }, { "epoch": 1.6192674669128961, "grad_norm": 5.03125, "learning_rate": 2.7434479527973477e-09, "loss": 1.7210240364074707, "step": 1316 }, { "epoch": 1.621729763004001, "grad_norm": 3.515625, "learning_rate": 2.7342196307177214e-09, "loss": 1.6697207689285278, "step": 1318 }, { "epoch": 1.6241920590951062, "grad_norm": 2.65625, "learning_rate": 2.7250431507577004e-09, "loss": 1.4422950744628906, "step": 1320 }, { "epoch": 1.6266543551862112, "grad_norm": 2.84375, "learning_rate": 2.7159186585878816e-09, "loss": 1.1386830806732178, "step": 1322 }, { "epoch": 1.629116651277316, "grad_norm": 3.015625, "learning_rate": 2.7068462990535863e-09, "loss": 1.2971214056015015, "step": 1324 }, { "epoch": 1.631578947368421, "grad_norm": 19.875, "learning_rate": 2.697826216172569e-09, "loss": 1.638606309890747, "step": 1326 }, { "epoch": 1.6340412434595262, "grad_norm": 3.109375, "learning_rate": 2.688858553132723e-09, "loss": 1.6914677619934082, "step": 1328 }, { "epoch": 1.636503539550631, "grad_norm": 2.28125, "learning_rate": 2.6799434522898126e-09, "loss": 1.1819281578063965, "step": 1330 }, { "epoch": 1.638965835641736, "grad_norm": 2.140625, "learning_rate": 2.6710810551652133e-09, "loss": 1.1034936904907227, "step": 1332 }, { "epoch": 1.641428131732841, "grad_norm": 34.5, "learning_rate": 2.66227150244366e-09, "loss": 1.6707381010055542, "step": 1334 }, { "epoch": 1.643890427823946, "grad_norm": 25.5, "learning_rate": 2.6535149339710184e-09, "loss": 2.70631742477417, "step": 1336 }, { "epoch": 1.6463527239150508, "grad_norm": 30.75, "learning_rate": 2.644811488752068e-09, "loss": 2.4394781589508057, "step": 1338 }, { "epoch": 1.6488150200061558, "grad_norm": 13.625, "learning_rate": 2.636161304948286e-09, "loss": 2.2337255477905273, "step": 1340 }, { "epoch": 1.6512773160972607, "grad_norm": 13.0, "learning_rate": 2.627564519875663e-09, "loss": 2.295048236846924, "step": 1342 }, { "epoch": 1.6537396121883656, "grad_norm": 20.0, "learning_rate": 2.6190212700025183e-09, "loss": 2.110807418823242, "step": 1344 }, { "epoch": 1.6562019082794706, "grad_norm": 4.84375, "learning_rate": 2.6105316909473364e-09, "loss": 1.8732104301452637, "step": 1346 }, { "epoch": 1.6586642043705755, "grad_norm": 8.125, "learning_rate": 2.6020959174766106e-09, "loss": 1.9254186153411865, "step": 1348 }, { "epoch": 1.6611265004616804, "grad_norm": 6.15625, "learning_rate": 2.5937140835027097e-09, "loss": 1.8715019226074219, "step": 1350 }, { "epoch": 1.6635887965527854, "grad_norm": 9.8125, "learning_rate": 2.5853863220817436e-09, "loss": 1.9434764385223389, "step": 1352 }, { "epoch": 1.6660510926438903, "grad_norm": 5.25, "learning_rate": 2.577112765411459e-09, "loss": 2.207705497741699, "step": 1354 }, { "epoch": 1.6685133887349952, "grad_norm": 12.625, "learning_rate": 2.568893544829136e-09, "loss": 1.880719780921936, "step": 1356 }, { "epoch": 1.6709756848261004, "grad_norm": 9.5625, "learning_rate": 2.560728790809509e-09, "loss": 1.8875178098678589, "step": 1358 }, { "epoch": 1.6734379809172053, "grad_norm": 5.4375, "learning_rate": 2.5526186329626865e-09, "loss": 1.6963284015655518, "step": 1360 }, { "epoch": 1.6759002770083102, "grad_norm": 5.90625, "learning_rate": 2.5445632000320995e-09, "loss": 1.791224718093872, "step": 1362 }, { "epoch": 1.6783625730994152, "grad_norm": 3.890625, "learning_rate": 2.5365626198924598e-09, "loss": 1.6278963088989258, "step": 1364 }, { "epoch": 1.6808248691905203, "grad_norm": 3.375, "learning_rate": 2.528617019547723e-09, "loss": 1.3288359642028809, "step": 1366 }, { "epoch": 1.6832871652816253, "grad_norm": 9.0625, "learning_rate": 2.5207265251290823e-09, "loss": 1.6888291835784912, "step": 1368 }, { "epoch": 1.6857494613727302, "grad_norm": 13.375, "learning_rate": 2.512891261892955e-09, "loss": 2.285770893096924, "step": 1370 }, { "epoch": 1.6882117574638351, "grad_norm": 3.1875, "learning_rate": 2.505111354219002e-09, "loss": 1.671492099761963, "step": 1372 }, { "epoch": 1.69067405355494, "grad_norm": 6.25, "learning_rate": 2.49738692560815e-09, "loss": 1.5187859535217285, "step": 1374 }, { "epoch": 1.693136349646045, "grad_norm": 7.0625, "learning_rate": 2.4897180986806322e-09, "loss": 1.9461727142333984, "step": 1376 }, { "epoch": 1.69559864573715, "grad_norm": 7.53125, "learning_rate": 2.482104995174044e-09, "loss": 1.8825700283050537, "step": 1378 }, { "epoch": 1.6980609418282548, "grad_norm": 5.28125, "learning_rate": 2.474547735941405e-09, "loss": 1.8659740686416626, "step": 1380 }, { "epoch": 1.7005232379193598, "grad_norm": 5.59375, "learning_rate": 2.4670464409492447e-09, "loss": 1.7924315929412842, "step": 1382 }, { "epoch": 1.7029855340104647, "grad_norm": 13.4375, "learning_rate": 2.459601229275697e-09, "loss": 1.9610867500305176, "step": 1384 }, { "epoch": 1.7054478301015696, "grad_norm": 8.5, "learning_rate": 2.4522122191086104e-09, "loss": 1.836552381515503, "step": 1386 }, { "epoch": 1.7079101261926746, "grad_norm": 8.8125, "learning_rate": 2.4448795277436698e-09, "loss": 1.7403874397277832, "step": 1388 }, { "epoch": 1.7103724222837795, "grad_norm": 4.625, "learning_rate": 2.4376032715825386e-09, "loss": 1.5626749992370605, "step": 1390 }, { "epoch": 1.7128347183748844, "grad_norm": 3.625, "learning_rate": 2.4303835661310066e-09, "loss": 1.3395249843597412, "step": 1392 }, { "epoch": 1.7152970144659896, "grad_norm": 13.125, "learning_rate": 2.4232205259971584e-09, "loss": 1.0826705694198608, "step": 1394 }, { "epoch": 1.7177593105570945, "grad_norm": 12.875, "learning_rate": 2.4161142648895533e-09, "loss": 1.810969352722168, "step": 1396 }, { "epoch": 1.7202216066481995, "grad_norm": 9.0, "learning_rate": 2.4090648956154223e-09, "loss": 2.039994239807129, "step": 1398 }, { "epoch": 1.7226839027393044, "grad_norm": 7.625, "learning_rate": 2.402072530078876e-09, "loss": 1.8878741264343262, "step": 1400 }, { "epoch": 1.7251461988304093, "grad_norm": 4.5625, "learning_rate": 2.395137279279127e-09, "loss": 1.8724961280822754, "step": 1402 }, { "epoch": 1.7276084949215145, "grad_norm": 4.0, "learning_rate": 2.3882592533087286e-09, "loss": 1.9301607608795166, "step": 1404 }, { "epoch": 1.7300707910126194, "grad_norm": 24.125, "learning_rate": 2.3814385613518284e-09, "loss": 1.6868252754211426, "step": 1406 }, { "epoch": 1.7325330871037243, "grad_norm": 6.78125, "learning_rate": 2.374675311682433e-09, "loss": 1.7913291454315186, "step": 1408 }, { "epoch": 1.7349953831948293, "grad_norm": 2.59375, "learning_rate": 2.3679696116626936e-09, "loss": 1.5577332973480225, "step": 1410 }, { "epoch": 1.7374576792859342, "grad_norm": 4.875, "learning_rate": 2.3613215677411944e-09, "loss": 1.5362656116485596, "step": 1412 }, { "epoch": 1.7399199753770391, "grad_norm": 1.75, "learning_rate": 2.354731285451268e-09, "loss": 1.5279173851013184, "step": 1414 }, { "epoch": 1.742382271468144, "grad_norm": 10.6875, "learning_rate": 2.348198869409322e-09, "loss": 1.696439504623413, "step": 1416 }, { "epoch": 1.744844567559249, "grad_norm": 18.5, "learning_rate": 2.341724423313171e-09, "loss": 2.554849147796631, "step": 1418 }, { "epoch": 1.747306863650354, "grad_norm": 13.0625, "learning_rate": 2.335308049940398e-09, "loss": 2.1925854682922363, "step": 1420 }, { "epoch": 1.7497691597414589, "grad_norm": 3.46875, "learning_rate": 2.328949851146718e-09, "loss": 1.593017816543579, "step": 1422 }, { "epoch": 1.7522314558325638, "grad_norm": 4.0, "learning_rate": 2.322649927864363e-09, "loss": 1.229564905166626, "step": 1424 }, { "epoch": 1.7546937519236687, "grad_norm": 15.6875, "learning_rate": 2.3164083801004798e-09, "loss": 1.9423973560333252, "step": 1426 }, { "epoch": 1.7571560480147737, "grad_norm": 5.75, "learning_rate": 2.3102253069355413e-09, "loss": 2.0594370365142822, "step": 1428 }, { "epoch": 1.7596183441058786, "grad_norm": 6.53125, "learning_rate": 2.3041008065217754e-09, "loss": 1.9393881559371948, "step": 1430 }, { "epoch": 1.7620806401969837, "grad_norm": 7.90625, "learning_rate": 2.298034976081607e-09, "loss": 1.8895037174224854, "step": 1432 }, { "epoch": 1.7645429362880887, "grad_norm": 8.125, "learning_rate": 2.292027911906112e-09, "loss": 1.7276127338409424, "step": 1434 }, { "epoch": 1.7670052323791936, "grad_norm": 6.125, "learning_rate": 2.286079709353491e-09, "loss": 1.5182913541793823, "step": 1436 }, { "epoch": 1.7694675284702985, "grad_norm": 8.6875, "learning_rate": 2.2801904628475545e-09, "loss": 1.845018982887268, "step": 1438 }, { "epoch": 1.7719298245614035, "grad_norm": 13.0625, "learning_rate": 2.274360265876225e-09, "loss": 2.4570071697235107, "step": 1440 }, { "epoch": 1.7743921206525086, "grad_norm": 6.53125, "learning_rate": 2.268589210990052e-09, "loss": 1.779624342918396, "step": 1442 }, { "epoch": 1.7768544167436136, "grad_norm": 11.0, "learning_rate": 2.262877389800745e-09, "loss": 1.5919256210327148, "step": 1444 }, { "epoch": 1.7793167128347185, "grad_norm": 5.96875, "learning_rate": 2.257224892979714e-09, "loss": 2.230924129486084, "step": 1446 }, { "epoch": 1.7817790089258234, "grad_norm": 2.296875, "learning_rate": 2.2516318102566373e-09, "loss": 1.6709070205688477, "step": 1448 }, { "epoch": 1.7842413050169283, "grad_norm": 7.1875, "learning_rate": 2.24609823041803e-09, "loss": 1.5729997158050537, "step": 1450 }, { "epoch": 1.7867036011080333, "grad_norm": 23.125, "learning_rate": 2.240624241305841e-09, "loss": 2.22371768951416, "step": 1452 }, { "epoch": 1.7891658971991382, "grad_norm": 9.0, "learning_rate": 2.2352099298160545e-09, "loss": 1.9387813806533813, "step": 1454 }, { "epoch": 1.7916281932902431, "grad_norm": 6.96875, "learning_rate": 2.2298553818973096e-09, "loss": 1.6565120220184326, "step": 1456 }, { "epoch": 1.794090489381348, "grad_norm": 24.0, "learning_rate": 2.2245606825495408e-09, "loss": 1.6322071552276611, "step": 1458 }, { "epoch": 1.796552785472453, "grad_norm": 6.0625, "learning_rate": 2.219325915822624e-09, "loss": 2.004333257675171, "step": 1460 }, { "epoch": 1.799015081563558, "grad_norm": 11.625, "learning_rate": 2.214151164815044e-09, "loss": 2.2140424251556396, "step": 1462 }, { "epoch": 1.8014773776546629, "grad_norm": 5.90625, "learning_rate": 2.2090365116725787e-09, "loss": 1.876783847808838, "step": 1464 }, { "epoch": 1.8039396737457678, "grad_norm": 2.921875, "learning_rate": 2.203982037586988e-09, "loss": 1.5903770923614502, "step": 1466 }, { "epoch": 1.8064019698368727, "grad_norm": 5.78125, "learning_rate": 2.1989878227947297e-09, "loss": 1.4093436002731323, "step": 1468 }, { "epoch": 1.8088642659279779, "grad_norm": 5.4375, "learning_rate": 2.1940539465756848e-09, "loss": 1.5252522230148315, "step": 1470 }, { "epoch": 1.8113265620190828, "grad_norm": 11.1875, "learning_rate": 2.1891804872519013e-09, "loss": 1.6333411931991577, "step": 1472 }, { "epoch": 1.8137888581101878, "grad_norm": 12.125, "learning_rate": 2.1843675221863456e-09, "loss": 2.395686626434326, "step": 1474 }, { "epoch": 1.8162511542012927, "grad_norm": 6.40625, "learning_rate": 2.179615127781678e-09, "loss": 2.011446475982666, "step": 1476 }, { "epoch": 1.8187134502923976, "grad_norm": 27.75, "learning_rate": 2.1749233794790424e-09, "loss": 1.9201209545135498, "step": 1478 }, { "epoch": 1.8211757463835028, "grad_norm": 8.75, "learning_rate": 2.1702923517568608e-09, "loss": 1.9654639959335327, "step": 1480 }, { "epoch": 1.8236380424746077, "grad_norm": 14.4375, "learning_rate": 2.1657221181296596e-09, "loss": 2.4255740642547607, "step": 1482 }, { "epoch": 1.8261003385657126, "grad_norm": 5.46875, "learning_rate": 2.161212751146898e-09, "loss": 2.1441259384155273, "step": 1484 }, { "epoch": 1.8285626346568176, "grad_norm": 3.03125, "learning_rate": 2.1567643223918164e-09, "loss": 1.5081210136413574, "step": 1486 }, { "epoch": 1.8310249307479225, "grad_norm": 3.15625, "learning_rate": 2.1523769024803013e-09, "loss": 1.219706416130066, "step": 1488 }, { "epoch": 1.8334872268390274, "grad_norm": 3.296875, "learning_rate": 2.148050561059763e-09, "loss": 1.3154406547546387, "step": 1490 }, { "epoch": 1.8359495229301324, "grad_norm": 4.84375, "learning_rate": 2.1437853668080316e-09, "loss": 1.663912057876587, "step": 1492 }, { "epoch": 1.8384118190212373, "grad_norm": 5.5, "learning_rate": 2.139581387432267e-09, "loss": 1.9996685981750488, "step": 1494 }, { "epoch": 1.8408741151123422, "grad_norm": 9.125, "learning_rate": 2.135438689667882e-09, "loss": 2.1527910232543945, "step": 1496 }, { "epoch": 1.8433364112034472, "grad_norm": 5.4375, "learning_rate": 2.1313573392774835e-09, "loss": 2.181238889694214, "step": 1498 }, { "epoch": 1.845798707294552, "grad_norm": 26.625, "learning_rate": 2.1273374010498306e-09, "loss": 2.07470965385437, "step": 1500 }, { "epoch": 1.848261003385657, "grad_norm": 7.375, "learning_rate": 2.123378938798803e-09, "loss": 2.180095672607422, "step": 1502 }, { "epoch": 1.850723299476762, "grad_norm": 10.25, "learning_rate": 2.119482015362392e-09, "loss": 2.023428440093994, "step": 1504 }, { "epoch": 1.8531855955678669, "grad_norm": 6.03125, "learning_rate": 2.1156466926016974e-09, "loss": 1.9310382604599, "step": 1506 }, { "epoch": 1.855647891658972, "grad_norm": 10.9375, "learning_rate": 2.1118730313999516e-09, "loss": 1.7410407066345215, "step": 1508 }, { "epoch": 1.858110187750077, "grad_norm": 14.9375, "learning_rate": 2.108161091661548e-09, "loss": 2.463320732116699, "step": 1510 }, { "epoch": 1.860572483841182, "grad_norm": 10.1875, "learning_rate": 2.1045109323110943e-09, "loss": 2.164478302001953, "step": 1512 }, { "epoch": 1.8630347799322868, "grad_norm": 11.0, "learning_rate": 2.1009226112924727e-09, "loss": 2.304097890853882, "step": 1514 }, { "epoch": 1.8654970760233918, "grad_norm": 11.4375, "learning_rate": 2.097396185567926e-09, "loss": 2.384671688079834, "step": 1516 }, { "epoch": 1.867959372114497, "grad_norm": 11.875, "learning_rate": 2.0939317111171467e-09, "loss": 1.752406358718872, "step": 1518 }, { "epoch": 1.8704216682056019, "grad_norm": 19.875, "learning_rate": 2.090529242936392e-09, "loss": 1.5490081310272217, "step": 1520 }, { "epoch": 1.8728839642967068, "grad_norm": 5.90625, "learning_rate": 2.087188835037611e-09, "loss": 2.0984854698181152, "step": 1522 }, { "epoch": 1.8753462603878117, "grad_norm": 2.890625, "learning_rate": 2.0839105404475866e-09, "loss": 1.6633992195129395, "step": 1524 }, { "epoch": 1.8778085564789166, "grad_norm": 3.6875, "learning_rate": 2.080694411207094e-09, "loss": 1.4255918264389038, "step": 1526 }, { "epoch": 1.8802708525700216, "grad_norm": 4.84375, "learning_rate": 2.0775404983700724e-09, "loss": 1.845369577407837, "step": 1528 }, { "epoch": 1.8827331486611265, "grad_norm": 4.40625, "learning_rate": 2.074448852002819e-09, "loss": 1.7371915578842163, "step": 1530 }, { "epoch": 1.8851954447522314, "grad_norm": 13.3125, "learning_rate": 2.07141952118319e-09, "loss": 1.805029034614563, "step": 1532 }, { "epoch": 1.8876577408433364, "grad_norm": 6.65625, "learning_rate": 2.068452553999822e-09, "loss": 2.060267448425293, "step": 1534 }, { "epoch": 1.8901200369344413, "grad_norm": 3.625, "learning_rate": 2.065547997551375e-09, "loss": 1.525952935218811, "step": 1536 }, { "epoch": 1.8925823330255462, "grad_norm": 7.46875, "learning_rate": 2.062705897945773e-09, "loss": 1.4751570224761963, "step": 1538 }, { "epoch": 1.8950446291166512, "grad_norm": 5.0625, "learning_rate": 2.059926300299483e-09, "loss": 1.6626102924346924, "step": 1540 }, { "epoch": 1.897506925207756, "grad_norm": 5.65625, "learning_rate": 2.057209248736792e-09, "loss": 1.2773092985153198, "step": 1542 }, { "epoch": 1.899969221298861, "grad_norm": 13.0625, "learning_rate": 2.054554786389111e-09, "loss": 1.6589457988739014, "step": 1544 }, { "epoch": 1.9024315173899662, "grad_norm": 6.25, "learning_rate": 2.051962955394286e-09, "loss": 1.9413405656814575, "step": 1546 }, { "epoch": 1.9048938134810711, "grad_norm": 10.25, "learning_rate": 2.0494337968959344e-09, "loss": 1.6395326852798462, "step": 1548 }, { "epoch": 1.907356109572176, "grad_norm": 5.21875, "learning_rate": 2.0469673510427865e-09, "loss": 1.9667985439300537, "step": 1550 }, { "epoch": 1.909818405663281, "grad_norm": 4.90625, "learning_rate": 2.0445636569880505e-09, "loss": 1.8468351364135742, "step": 1552 }, { "epoch": 1.912280701754386, "grad_norm": 11.25, "learning_rate": 2.0422227528887923e-09, "loss": 2.118504524230957, "step": 1554 }, { "epoch": 1.914742997845491, "grad_norm": 10.375, "learning_rate": 2.0399446759053274e-09, "loss": 2.0504517555236816, "step": 1556 }, { "epoch": 1.917205293936596, "grad_norm": 5.25, "learning_rate": 2.037729462200633e-09, "loss": 1.661136507987976, "step": 1558 }, { "epoch": 1.919667590027701, "grad_norm": 6.03125, "learning_rate": 2.0355771469397726e-09, "loss": 1.5671418905258179, "step": 1560 }, { "epoch": 1.9221298861188059, "grad_norm": 5.34375, "learning_rate": 2.0334877642893373e-09, "loss": 2.0463449954986572, "step": 1562 }, { "epoch": 1.9245921822099108, "grad_norm": 3.96875, "learning_rate": 2.0314613474169064e-09, "loss": 1.7543866634368896, "step": 1564 }, { "epoch": 1.9270544783010157, "grad_norm": 23.375, "learning_rate": 2.029497928490516e-09, "loss": 1.5825181007385254, "step": 1566 }, { "epoch": 1.9295167743921207, "grad_norm": 8.6875, "learning_rate": 2.027597538678154e-09, "loss": 1.5585989952087402, "step": 1568 }, { "epoch": 1.9319790704832256, "grad_norm": 10.0625, "learning_rate": 2.0257602081472603e-09, "loss": 1.5373648405075073, "step": 1570 }, { "epoch": 1.9344413665743305, "grad_norm": 3.296875, "learning_rate": 2.023985966064252e-09, "loss": 1.638904333114624, "step": 1572 }, { "epoch": 1.9369036626654355, "grad_norm": 2.71875, "learning_rate": 2.0222748405940567e-09, "loss": 1.3301455974578857, "step": 1574 }, { "epoch": 1.9393659587565404, "grad_norm": 2.734375, "learning_rate": 2.0206268588996686e-09, "loss": 1.1727893352508545, "step": 1576 }, { "epoch": 1.9418282548476453, "grad_norm": 4.46875, "learning_rate": 2.019042047141714e-09, "loss": 1.2285372018814087, "step": 1578 }, { "epoch": 1.9442905509387503, "grad_norm": 5.0625, "learning_rate": 2.0175204304780413e-09, "loss": 1.5906985998153687, "step": 1580 }, { "epoch": 1.9467528470298552, "grad_norm": 18.875, "learning_rate": 2.016062033063314e-09, "loss": 1.8927161693572998, "step": 1582 }, { "epoch": 1.9492151431209603, "grad_norm": 11.4375, "learning_rate": 2.0146668780486356e-09, "loss": 2.0817370414733887, "step": 1584 }, { "epoch": 1.9516774392120653, "grad_norm": 8.4375, "learning_rate": 2.0133349875811752e-09, "loss": 2.1541638374328613, "step": 1586 }, { "epoch": 1.9541397353031702, "grad_norm": 6.03125, "learning_rate": 2.0120663828038197e-09, "loss": 2.136171340942383, "step": 1588 }, { "epoch": 1.9566020313942751, "grad_norm": 8.8125, "learning_rate": 2.010861083854838e-09, "loss": 2.047274112701416, "step": 1590 }, { "epoch": 1.95906432748538, "grad_norm": 5.4375, "learning_rate": 2.009719109867558e-09, "loss": 2.093939781188965, "step": 1592 }, { "epoch": 1.9615266235764852, "grad_norm": 8.0625, "learning_rate": 2.0086404789700686e-09, "loss": 1.9545447826385498, "step": 1594 }, { "epoch": 1.9639889196675901, "grad_norm": 4.03125, "learning_rate": 2.0076252082849266e-09, "loss": 1.710350751876831, "step": 1596 }, { "epoch": 1.966451215758695, "grad_norm": 9.8125, "learning_rate": 2.006673313928888e-09, "loss": 1.6602602005004883, "step": 1598 }, { "epoch": 1.9689135118498, "grad_norm": 6.96875, "learning_rate": 2.0057848110126513e-09, "loss": 2.073413848876953, "step": 1600 }, { "epoch": 1.971375807940905, "grad_norm": 18.75, "learning_rate": 2.0049597136406157e-09, "loss": 2.155198574066162, "step": 1602 }, { "epoch": 1.9738381040320099, "grad_norm": 7.4375, "learning_rate": 2.004198034910662e-09, "loss": 2.1142520904541016, "step": 1604 }, { "epoch": 1.9763004001231148, "grad_norm": 2.6875, "learning_rate": 2.003499786913938e-09, "loss": 1.6299633979797363, "step": 1606 }, { "epoch": 1.9787626962142197, "grad_norm": 11.3125, "learning_rate": 2.0028649807346742e-09, "loss": 1.5626764297485352, "step": 1608 }, { "epoch": 1.9812249923053247, "grad_norm": 16.875, "learning_rate": 2.0022936264500017e-09, "loss": 2.2909412384033203, "step": 1610 }, { "epoch": 1.9836872883964296, "grad_norm": 11.25, "learning_rate": 2.0017857331297935e-09, "loss": 2.1796622276306152, "step": 1612 }, { "epoch": 1.9861495844875345, "grad_norm": 5.375, "learning_rate": 2.001341308836524e-09, "loss": 1.9472308158874512, "step": 1614 }, { "epoch": 1.9886118805786395, "grad_norm": 8.5625, "learning_rate": 2.000960360625136e-09, "loss": 1.743130087852478, "step": 1616 }, { "epoch": 1.9910741766697444, "grad_norm": 10.1875, "learning_rate": 2.0006428945429335e-09, "loss": 1.43598210811615, "step": 1618 }, { "epoch": 1.9935364727608493, "grad_norm": 12.8125, "learning_rate": 2.0003889156294813e-09, "loss": 1.9119551181793213, "step": 1620 }, { "epoch": 1.9959987688519545, "grad_norm": 5.71875, "learning_rate": 2.0001984279165285e-09, "loss": 2.036318302154541, "step": 1622 }, { "epoch": 1.9984610649430594, "grad_norm": 5.28125, "learning_rate": 2.0000714344279417e-09, "loss": 1.577465295791626, "step": 1624 }, { "epoch": 2.0, "grad_norm": 3.578125, "learning_rate": 2.00000793717966e-09, "loss": 1.1681241989135742, "step": 1626 }, { "epoch": 2.0, "step": 1626, "total_flos": 2.5753569883429274e+18, "train_loss": 1.8335715001506265, "train_runtime": 15477.0683, "train_samples_per_second": 1.679, "train_steps_per_second": 0.105 } ], "logging_steps": 2, "max_steps": 1626, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5753569883429274e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }