diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": 0.6779661016949152, "best_model_checkpoint": "DF_Image_VIT_V1/checkpoint-13812", - "epoch": 8.0, + "epoch": 12.0, "eval_steps": 500, - "global_step": 36832, + "global_step": 55248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -25862,19 +25862,12942 @@ "step": 36832 }, { - "epoch": 8.0, - "step": 36832, - "total_flos": 4.566447964008677e+19, - "train_loss": 0.0017740661595287623, - "train_runtime": 6442.5194, - "train_samples_per_second": 91.467, - "train_steps_per_second": 5.717 + "epoch": 8.001737619461338, + "grad_norm": 0.0004186915175523609, + "learning_rate": 1.6668476686938895e-05, + "loss": 0.0, + "step": 36840 + }, + { + "epoch": 8.00390964378801, + "grad_norm": 0.0004101640952285379, + "learning_rate": 1.665942658557776e-05, + "loss": 0.0, + "step": 36850 + }, + { + "epoch": 8.006081668114684, + "grad_norm": 0.00040353488293476403, + "learning_rate": 1.6650376484216623e-05, + "loss": 0.004, + "step": 36860 + }, + { + "epoch": 8.008253692441356, + "grad_norm": 0.0004050274728797376, + "learning_rate": 1.6641326382855488e-05, + "loss": 0.0, + "step": 36870 + }, + { + "epoch": 8.010425716768028, + "grad_norm": 0.000409485975978896, + "learning_rate": 1.6632276281494352e-05, + "loss": 0.0, + "step": 36880 + }, + { + "epoch": 8.0125977410947, + "grad_norm": 0.0004029431729577482, + "learning_rate": 1.662322618013322e-05, + "loss": 0.0, + "step": 36890 + }, + { + "epoch": 8.014769765421372, + "grad_norm": 0.0004054335586261004, + "learning_rate": 1.6614176078772084e-05, + "loss": 0.0001, + "step": 36900 + }, + { + "epoch": 8.016941789748046, + "grad_norm": 0.001848601852543652, + "learning_rate": 1.660512597741095e-05, + "loss": 0.0, + "step": 36910 + }, + { + "epoch": 8.019113814074718, + "grad_norm": 0.00040781169082038105, + "learning_rate": 1.6596075876049813e-05, + "loss": 0.0001, + "step": 36920 + }, + { + "epoch": 8.02128583840139, + "grad_norm": 0.0006863618618808687, + "learning_rate": 1.6587025774688677e-05, + "loss": 0.0, + "step": 36930 + }, + { + "epoch": 8.023457862728062, + "grad_norm": 0.0003956135187763721, + "learning_rate": 1.657797567332754e-05, + "loss": 0.0001, + "step": 36940 + }, + { + "epoch": 8.025629887054736, + "grad_norm": 0.0007481368957087398, + "learning_rate": 1.6568925571966406e-05, + "loss": 0.0, + "step": 36950 + }, + { + "epoch": 8.027801911381408, + "grad_norm": 0.0006480406154878438, + "learning_rate": 1.655987547060527e-05, + "loss": 0.0001, + "step": 36960 + }, + { + "epoch": 8.02997393570808, + "grad_norm": 0.00040580949280411005, + "learning_rate": 1.6550825369244134e-05, + "loss": 0.0, + "step": 36970 + }, + { + "epoch": 8.032145960034752, + "grad_norm": 0.0004226068267598748, + "learning_rate": 1.6541775267883e-05, + "loss": 0.0, + "step": 36980 + }, + { + "epoch": 8.034317984361424, + "grad_norm": 0.0007375205168500543, + "learning_rate": 1.6532725166521863e-05, + "loss": 0.0034, + "step": 36990 + }, + { + "epoch": 8.036490008688098, + "grad_norm": 0.0003932391991838813, + "learning_rate": 1.652367506516073e-05, + "loss": 0.0, + "step": 37000 + }, + { + "epoch": 8.03866203301477, + "grad_norm": 0.00039350654697045684, + "learning_rate": 1.6514624963799595e-05, + "loss": 0.0, + "step": 37010 + }, + { + "epoch": 8.040834057341442, + "grad_norm": 0.0003966047952417284, + "learning_rate": 1.650557486243846e-05, + "loss": 0.0, + "step": 37020 + }, + { + "epoch": 8.043006081668114, + "grad_norm": 0.00040773957152850926, + "learning_rate": 1.6496524761077324e-05, + "loss": 0.0001, + "step": 37030 + }, + { + "epoch": 8.045178105994788, + "grad_norm": 0.0004274372477084398, + "learning_rate": 1.648747465971619e-05, + "loss": 0.0, + "step": 37040 + }, + { + "epoch": 8.04735013032146, + "grad_norm": 0.0003937635920010507, + "learning_rate": 1.6478424558355056e-05, + "loss": 0.0001, + "step": 37050 + }, + { + "epoch": 8.049522154648132, + "grad_norm": 0.0006448153289966285, + "learning_rate": 1.646937445699392e-05, + "loss": 0.0001, + "step": 37060 + }, + { + "epoch": 8.051694178974804, + "grad_norm": 0.3185959756374359, + "learning_rate": 1.6460324355632785e-05, + "loss": 0.005, + "step": 37070 + }, + { + "epoch": 8.053866203301476, + "grad_norm": 0.00040226714918389916, + "learning_rate": 1.645127425427165e-05, + "loss": 0.0064, + "step": 37080 + }, + { + "epoch": 8.05603822762815, + "grad_norm": 0.00040559517219662666, + "learning_rate": 1.6442224152910513e-05, + "loss": 0.0028, + "step": 37090 + }, + { + "epoch": 8.058210251954822, + "grad_norm": 0.0003909075167030096, + "learning_rate": 1.6433174051549378e-05, + "loss": 0.0, + "step": 37100 + }, + { + "epoch": 8.060382276281494, + "grad_norm": 0.0004615616344381124, + "learning_rate": 1.6424123950188242e-05, + "loss": 0.0001, + "step": 37110 + }, + { + "epoch": 8.062554300608166, + "grad_norm": 0.0003968609671574086, + "learning_rate": 1.641507384882711e-05, + "loss": 0.0049, + "step": 37120 + }, + { + "epoch": 8.064726324934838, + "grad_norm": 0.00038003415102139115, + "learning_rate": 1.6406023747465974e-05, + "loss": 0.0, + "step": 37130 + }, + { + "epoch": 8.066898349261512, + "grad_norm": 0.00037840873119421303, + "learning_rate": 1.639697364610484e-05, + "loss": 0.0, + "step": 37140 + }, + { + "epoch": 8.069070373588184, + "grad_norm": 0.0003778162645176053, + "learning_rate": 1.6387923544743703e-05, + "loss": 0.0025, + "step": 37150 + }, + { + "epoch": 8.071242397914856, + "grad_norm": 0.00038293536636047065, + "learning_rate": 1.6378873443382567e-05, + "loss": 0.0, + "step": 37160 + }, + { + "epoch": 8.073414422241528, + "grad_norm": 0.0005740802153013647, + "learning_rate": 1.636982334202143e-05, + "loss": 0.0002, + "step": 37170 + }, + { + "epoch": 8.075586446568202, + "grad_norm": 0.00036886456655338407, + "learning_rate": 1.6360773240660296e-05, + "loss": 0.0, + "step": 37180 + }, + { + "epoch": 8.077758470894874, + "grad_norm": 0.0003802602586802095, + "learning_rate": 1.635172313929916e-05, + "loss": 0.0, + "step": 37190 + }, + { + "epoch": 8.079930495221546, + "grad_norm": 0.0006068776128813624, + "learning_rate": 1.6342673037938024e-05, + "loss": 0.0001, + "step": 37200 + }, + { + "epoch": 8.082102519548219, + "grad_norm": 0.0003752399352379143, + "learning_rate": 1.633362293657689e-05, + "loss": 0.0001, + "step": 37210 + }, + { + "epoch": 8.08427454387489, + "grad_norm": 0.00037461266038008034, + "learning_rate": 1.6324572835215753e-05, + "loss": 0.0107, + "step": 37220 + }, + { + "epoch": 8.086446568201564, + "grad_norm": 0.000390580331441015, + "learning_rate": 1.631552273385462e-05, + "loss": 0.0, + "step": 37230 + }, + { + "epoch": 8.088618592528237, + "grad_norm": 0.0003792881325352937, + "learning_rate": 1.6306472632493485e-05, + "loss": 0.0, + "step": 37240 + }, + { + "epoch": 8.090790616854909, + "grad_norm": 0.00038958745426498353, + "learning_rate": 1.629742253113235e-05, + "loss": 0.0065, + "step": 37250 + }, + { + "epoch": 8.09296264118158, + "grad_norm": 0.0004001133784186095, + "learning_rate": 1.6288372429771214e-05, + "loss": 0.0, + "step": 37260 + }, + { + "epoch": 8.095134665508255, + "grad_norm": 0.00038350385148078203, + "learning_rate": 1.6279322328410078e-05, + "loss": 0.0065, + "step": 37270 + }, + { + "epoch": 8.097306689834927, + "grad_norm": 0.0003897896967828274, + "learning_rate": 1.6270272227048942e-05, + "loss": 0.0, + "step": 37280 + }, + { + "epoch": 8.099478714161599, + "grad_norm": 0.0007525041582994163, + "learning_rate": 1.6261222125687807e-05, + "loss": 0.0, + "step": 37290 + }, + { + "epoch": 8.10165073848827, + "grad_norm": 0.00039500088314525783, + "learning_rate": 1.6252172024326675e-05, + "loss": 0.0, + "step": 37300 + }, + { + "epoch": 8.103822762814943, + "grad_norm": 0.000395832525100559, + "learning_rate": 1.624312192296554e-05, + "loss": 0.0, + "step": 37310 + }, + { + "epoch": 8.105994787141617, + "grad_norm": 0.0004039072955492884, + "learning_rate": 1.6234071821604403e-05, + "loss": 0.0, + "step": 37320 + }, + { + "epoch": 8.108166811468289, + "grad_norm": 0.00039355579065158963, + "learning_rate": 1.6225021720243268e-05, + "loss": 0.0044, + "step": 37330 + }, + { + "epoch": 8.11033883579496, + "grad_norm": 0.0004178201488684863, + "learning_rate": 1.6215971618882132e-05, + "loss": 0.0016, + "step": 37340 + }, + { + "epoch": 8.112510860121633, + "grad_norm": 0.00037369929486885667, + "learning_rate": 1.6206921517521e-05, + "loss": 0.0, + "step": 37350 + }, + { + "epoch": 8.114682884448305, + "grad_norm": 0.0003739323001354933, + "learning_rate": 1.6197871416159864e-05, + "loss": 0.0001, + "step": 37360 + }, + { + "epoch": 8.116854908774979, + "grad_norm": 0.00037694178172387183, + "learning_rate": 1.6188821314798728e-05, + "loss": 0.0, + "step": 37370 + }, + { + "epoch": 8.119026933101651, + "grad_norm": 0.0003731514443643391, + "learning_rate": 1.6179771213437593e-05, + "loss": 0.0, + "step": 37380 + }, + { + "epoch": 8.121198957428323, + "grad_norm": 0.00037904002238065004, + "learning_rate": 1.6170721112076457e-05, + "loss": 0.0, + "step": 37390 + }, + { + "epoch": 8.123370981754995, + "grad_norm": 0.00039728908450342715, + "learning_rate": 1.616167101071532e-05, + "loss": 0.0086, + "step": 37400 + }, + { + "epoch": 8.125543006081669, + "grad_norm": 0.00038232721271924675, + "learning_rate": 1.6152620909354186e-05, + "loss": 0.0033, + "step": 37410 + }, + { + "epoch": 8.127715030408341, + "grad_norm": 1.541982650756836, + "learning_rate": 1.614357080799305e-05, + "loss": 0.0008, + "step": 37420 + }, + { + "epoch": 8.129887054735013, + "grad_norm": 0.00039564137114211917, + "learning_rate": 1.6134520706631914e-05, + "loss": 0.035, + "step": 37430 + }, + { + "epoch": 8.132059079061685, + "grad_norm": 0.0007069563725963235, + "learning_rate": 1.612547060527078e-05, + "loss": 0.1284, + "step": 37440 + }, + { + "epoch": 8.134231103388357, + "grad_norm": 1.2225791215896606, + "learning_rate": 1.6116420503909643e-05, + "loss": 0.0416, + "step": 37450 + }, + { + "epoch": 8.136403127715031, + "grad_norm": 0.23964758217334747, + "learning_rate": 1.610737040254851e-05, + "loss": 0.0642, + "step": 37460 + }, + { + "epoch": 8.138575152041703, + "grad_norm": 0.3129146099090576, + "learning_rate": 1.6098320301187375e-05, + "loss": 0.0416, + "step": 37470 + }, + { + "epoch": 8.140747176368375, + "grad_norm": 0.00330977700650692, + "learning_rate": 1.608927019982624e-05, + "loss": 0.0241, + "step": 37480 + }, + { + "epoch": 8.142919200695047, + "grad_norm": 0.0025991355068981647, + "learning_rate": 1.6080220098465104e-05, + "loss": 0.0003, + "step": 37490 + }, + { + "epoch": 8.145091225021721, + "grad_norm": 0.00236403476446867, + "learning_rate": 1.6071169997103968e-05, + "loss": 0.0165, + "step": 37500 + }, + { + "epoch": 8.147263249348393, + "grad_norm": 0.6998372077941895, + "learning_rate": 1.6062119895742832e-05, + "loss": 0.0019, + "step": 37510 + }, + { + "epoch": 8.149435273675065, + "grad_norm": 2.8501572608947754, + "learning_rate": 1.6053069794381697e-05, + "loss": 0.0155, + "step": 37520 + }, + { + "epoch": 8.151607298001737, + "grad_norm": 0.19623400270938873, + "learning_rate": 1.604401969302056e-05, + "loss": 0.0515, + "step": 37530 + }, + { + "epoch": 8.15377932232841, + "grad_norm": 0.010287540033459663, + "learning_rate": 1.6034969591659425e-05, + "loss": 0.0021, + "step": 37540 + }, + { + "epoch": 8.155951346655083, + "grad_norm": 0.07258111238479614, + "learning_rate": 1.602591949029829e-05, + "loss": 0.0462, + "step": 37550 + }, + { + "epoch": 8.158123370981755, + "grad_norm": 0.013868864625692368, + "learning_rate": 1.6016869388937154e-05, + "loss": 0.0072, + "step": 37560 + }, + { + "epoch": 8.160295395308427, + "grad_norm": 0.027811523526906967, + "learning_rate": 1.6007819287576022e-05, + "loss": 0.0009, + "step": 37570 + }, + { + "epoch": 8.1624674196351, + "grad_norm": 0.003218551864847541, + "learning_rate": 1.5998769186214886e-05, + "loss": 0.0004, + "step": 37580 + }, + { + "epoch": 8.164639443961772, + "grad_norm": 0.0022526709362864494, + "learning_rate": 1.598971908485375e-05, + "loss": 0.0002, + "step": 37590 + }, + { + "epoch": 8.166811468288445, + "grad_norm": 0.002147641032934189, + "learning_rate": 1.5980668983492615e-05, + "loss": 0.0003, + "step": 37600 + }, + { + "epoch": 8.168983492615117, + "grad_norm": 0.0033935250248759985, + "learning_rate": 1.5971618882131483e-05, + "loss": 0.0004, + "step": 37610 + }, + { + "epoch": 8.17115551694179, + "grad_norm": 0.001787520362995565, + "learning_rate": 1.5962568780770347e-05, + "loss": 0.0002, + "step": 37620 + }, + { + "epoch": 8.173327541268462, + "grad_norm": 0.0018216808093711734, + "learning_rate": 1.595351867940921e-05, + "loss": 0.0012, + "step": 37630 + }, + { + "epoch": 8.175499565595135, + "grad_norm": 0.0020013265311717987, + "learning_rate": 1.5944468578048076e-05, + "loss": 0.0003, + "step": 37640 + }, + { + "epoch": 8.177671589921808, + "grad_norm": 0.0020018210634589195, + "learning_rate": 1.593541847668694e-05, + "loss": 0.0002, + "step": 37650 + }, + { + "epoch": 8.17984361424848, + "grad_norm": 0.0015087234787642956, + "learning_rate": 1.5926368375325804e-05, + "loss": 0.0006, + "step": 37660 + }, + { + "epoch": 8.182015638575152, + "grad_norm": 0.005730865523219109, + "learning_rate": 1.591731827396467e-05, + "loss": 0.0013, + "step": 37670 + }, + { + "epoch": 8.184187662901824, + "grad_norm": 0.0019866772927343845, + "learning_rate": 1.5908268172603533e-05, + "loss": 0.0031, + "step": 37680 + }, + { + "epoch": 8.186359687228498, + "grad_norm": 0.001469621085561812, + "learning_rate": 1.58992180712424e-05, + "loss": 0.0216, + "step": 37690 + }, + { + "epoch": 8.18853171155517, + "grad_norm": 0.0011489508906379342, + "learning_rate": 1.5890167969881265e-05, + "loss": 0.0001, + "step": 37700 + }, + { + "epoch": 8.190703735881842, + "grad_norm": 0.0011148941703140736, + "learning_rate": 1.588111786852013e-05, + "loss": 0.0045, + "step": 37710 + }, + { + "epoch": 8.192875760208514, + "grad_norm": 0.001104211900383234, + "learning_rate": 1.5872067767158994e-05, + "loss": 0.0006, + "step": 37720 + }, + { + "epoch": 8.195047784535188, + "grad_norm": 0.001130530028603971, + "learning_rate": 1.5863017665797858e-05, + "loss": 0.0002, + "step": 37730 + }, + { + "epoch": 8.19721980886186, + "grad_norm": 0.0012663186062127352, + "learning_rate": 1.5853967564436722e-05, + "loss": 0.0001, + "step": 37740 + }, + { + "epoch": 8.199391833188532, + "grad_norm": 0.0013724026503041387, + "learning_rate": 1.5844917463075587e-05, + "loss": 0.0001, + "step": 37750 + }, + { + "epoch": 8.201563857515204, + "grad_norm": 0.0024533451069146395, + "learning_rate": 1.583586736171445e-05, + "loss": 0.0001, + "step": 37760 + }, + { + "epoch": 8.203735881841876, + "grad_norm": 0.001503446139395237, + "learning_rate": 1.5826817260353315e-05, + "loss": 0.0211, + "step": 37770 + }, + { + "epoch": 8.20590790616855, + "grad_norm": 0.000942540296819061, + "learning_rate": 1.581776715899218e-05, + "loss": 0.0001, + "step": 37780 + }, + { + "epoch": 8.208079930495222, + "grad_norm": 0.001057905494235456, + "learning_rate": 1.5808717057631044e-05, + "loss": 0.039, + "step": 37790 + }, + { + "epoch": 8.210251954821894, + "grad_norm": 0.009021256119012833, + "learning_rate": 1.5799666956269912e-05, + "loss": 0.0001, + "step": 37800 + }, + { + "epoch": 8.212423979148566, + "grad_norm": 0.0017717586597427726, + "learning_rate": 1.5790616854908776e-05, + "loss": 0.0002, + "step": 37810 + }, + { + "epoch": 8.214596003475238, + "grad_norm": 1.0658934116363525, + "learning_rate": 1.578156675354764e-05, + "loss": 0.0011, + "step": 37820 + }, + { + "epoch": 8.216768027801912, + "grad_norm": 0.0011891911271959543, + "learning_rate": 1.5772516652186505e-05, + "loss": 0.0001, + "step": 37830 + }, + { + "epoch": 8.218940052128584, + "grad_norm": 0.001057581976056099, + "learning_rate": 1.576346655082537e-05, + "loss": 0.0001, + "step": 37840 + }, + { + "epoch": 8.221112076455256, + "grad_norm": 0.0015104110352694988, + "learning_rate": 1.5754416449464234e-05, + "loss": 0.0001, + "step": 37850 + }, + { + "epoch": 8.223284100781928, + "grad_norm": 0.001172617427073419, + "learning_rate": 1.5745366348103098e-05, + "loss": 0.0001, + "step": 37860 + }, + { + "epoch": 8.225456125108602, + "grad_norm": 0.0010320774745196104, + "learning_rate": 1.5736316246741966e-05, + "loss": 0.0002, + "step": 37870 + }, + { + "epoch": 8.227628149435274, + "grad_norm": 0.0011193450773134828, + "learning_rate": 1.572726614538083e-05, + "loss": 0.0001, + "step": 37880 + }, + { + "epoch": 8.229800173761946, + "grad_norm": 0.0009711516322568059, + "learning_rate": 1.5718216044019694e-05, + "loss": 0.0001, + "step": 37890 + }, + { + "epoch": 8.231972198088618, + "grad_norm": 0.0009323352132923901, + "learning_rate": 1.570916594265856e-05, + "loss": 0.0001, + "step": 37900 + }, + { + "epoch": 8.23414422241529, + "grad_norm": 0.0009416076354682446, + "learning_rate": 1.5700115841297423e-05, + "loss": 0.0037, + "step": 37910 + }, + { + "epoch": 8.236316246741964, + "grad_norm": 0.0008424947736784816, + "learning_rate": 1.569106573993629e-05, + "loss": 0.0001, + "step": 37920 + }, + { + "epoch": 8.238488271068636, + "grad_norm": 0.36671534180641174, + "learning_rate": 1.5682015638575155e-05, + "loss": 0.0035, + "step": 37930 + }, + { + "epoch": 8.240660295395308, + "grad_norm": 0.0008022770052775741, + "learning_rate": 1.567296553721402e-05, + "loss": 0.0001, + "step": 37940 + }, + { + "epoch": 8.24283231972198, + "grad_norm": 0.0008299489854834974, + "learning_rate": 1.5663915435852884e-05, + "loss": 0.0001, + "step": 37950 + }, + { + "epoch": 8.245004344048652, + "grad_norm": 0.0008080134284682572, + "learning_rate": 1.5654865334491748e-05, + "loss": 0.0001, + "step": 37960 + }, + { + "epoch": 8.247176368375326, + "grad_norm": 0.0008203451288864017, + "learning_rate": 1.5645815233130612e-05, + "loss": 0.0001, + "step": 37970 + }, + { + "epoch": 8.249348392701998, + "grad_norm": 0.0008341401698999107, + "learning_rate": 1.5636765131769477e-05, + "loss": 0.0001, + "step": 37980 + }, + { + "epoch": 8.25152041702867, + "grad_norm": 0.0007914734305813909, + "learning_rate": 1.562771503040834e-05, + "loss": 0.0001, + "step": 37990 + }, + { + "epoch": 8.253692441355343, + "grad_norm": 0.0007827221415936947, + "learning_rate": 1.5618664929047205e-05, + "loss": 0.0001, + "step": 38000 + }, + { + "epoch": 8.255864465682016, + "grad_norm": 0.000880706706084311, + "learning_rate": 1.560961482768607e-05, + "loss": 0.0001, + "step": 38010 + }, + { + "epoch": 8.258036490008688, + "grad_norm": 0.0007812479743734002, + "learning_rate": 1.5600564726324934e-05, + "loss": 0.0001, + "step": 38020 + }, + { + "epoch": 8.26020851433536, + "grad_norm": 0.0007665276643820107, + "learning_rate": 1.5591514624963802e-05, + "loss": 0.0001, + "step": 38030 + }, + { + "epoch": 8.262380538662033, + "grad_norm": 0.0007834586431272328, + "learning_rate": 1.5582464523602666e-05, + "loss": 0.0001, + "step": 38040 + }, + { + "epoch": 8.264552562988705, + "grad_norm": 0.000701862561982125, + "learning_rate": 1.557341442224153e-05, + "loss": 0.0052, + "step": 38050 + }, + { + "epoch": 8.266724587315379, + "grad_norm": 0.0008326490060426295, + "learning_rate": 1.5564364320880395e-05, + "loss": 0.0001, + "step": 38060 + }, + { + "epoch": 8.26889661164205, + "grad_norm": 0.0007477857870981097, + "learning_rate": 1.555531421951926e-05, + "loss": 0.0001, + "step": 38070 + }, + { + "epoch": 8.271068635968723, + "grad_norm": 0.0017798726912587881, + "learning_rate": 1.5546264118158123e-05, + "loss": 0.001, + "step": 38080 + }, + { + "epoch": 8.273240660295395, + "grad_norm": 0.0012174885487183928, + "learning_rate": 1.5537214016796988e-05, + "loss": 0.0001, + "step": 38090 + }, + { + "epoch": 8.275412684622069, + "grad_norm": 0.0007682919967919588, + "learning_rate": 1.5528163915435852e-05, + "loss": 0.001, + "step": 38100 + }, + { + "epoch": 8.27758470894874, + "grad_norm": 0.0007978286594152451, + "learning_rate": 1.5519113814074716e-05, + "loss": 0.0001, + "step": 38110 + }, + { + "epoch": 8.279756733275413, + "grad_norm": 0.0008347800467163324, + "learning_rate": 1.551006371271358e-05, + "loss": 0.0001, + "step": 38120 + }, + { + "epoch": 8.281928757602085, + "grad_norm": 0.0007353154360316694, + "learning_rate": 1.5501013611352445e-05, + "loss": 0.0001, + "step": 38130 + }, + { + "epoch": 8.284100781928757, + "grad_norm": 0.0007409664103761315, + "learning_rate": 1.5491963509991313e-05, + "loss": 0.0002, + "step": 38140 + }, + { + "epoch": 8.28627280625543, + "grad_norm": 0.000824456918053329, + "learning_rate": 1.5482913408630177e-05, + "loss": 0.0001, + "step": 38150 + }, + { + "epoch": 8.288444830582103, + "grad_norm": 0.0006615397287532687, + "learning_rate": 1.547386330726904e-05, + "loss": 0.0001, + "step": 38160 + }, + { + "epoch": 8.290616854908775, + "grad_norm": 0.000863892724737525, + "learning_rate": 1.546481320590791e-05, + "loss": 0.0025, + "step": 38170 + }, + { + "epoch": 8.292788879235447, + "grad_norm": 0.002789223100990057, + "learning_rate": 1.5455763104546774e-05, + "loss": 0.0001, + "step": 38180 + }, + { + "epoch": 8.29496090356212, + "grad_norm": 0.0010462104110047221, + "learning_rate": 1.5446713003185638e-05, + "loss": 0.0001, + "step": 38190 + }, + { + "epoch": 8.297132927888793, + "grad_norm": 0.0007007081294432282, + "learning_rate": 1.5437662901824502e-05, + "loss": 0.0001, + "step": 38200 + }, + { + "epoch": 8.299304952215465, + "grad_norm": 0.0006954580312594771, + "learning_rate": 1.5428612800463367e-05, + "loss": 0.0001, + "step": 38210 + }, + { + "epoch": 8.301476976542137, + "grad_norm": 0.0007824657950550318, + "learning_rate": 1.541956269910223e-05, + "loss": 0.0001, + "step": 38220 + }, + { + "epoch": 8.303649000868809, + "grad_norm": 0.0009597129537723958, + "learning_rate": 1.5410512597741095e-05, + "loss": 0.0001, + "step": 38230 + }, + { + "epoch": 8.305821025195483, + "grad_norm": 0.0038668003398925066, + "learning_rate": 1.540146249637996e-05, + "loss": 0.0001, + "step": 38240 + }, + { + "epoch": 8.307993049522155, + "grad_norm": 0.0006443361635319889, + "learning_rate": 1.5392412395018824e-05, + "loss": 0.0001, + "step": 38250 + }, + { + "epoch": 8.310165073848827, + "grad_norm": 0.0010253982618451118, + "learning_rate": 1.5383362293657692e-05, + "loss": 0.0008, + "step": 38260 + }, + { + "epoch": 8.3123370981755, + "grad_norm": 0.0017492685001343489, + "learning_rate": 1.5374312192296556e-05, + "loss": 0.0001, + "step": 38270 + }, + { + "epoch": 8.314509122502171, + "grad_norm": 0.1790028214454651, + "learning_rate": 1.536526209093542e-05, + "loss": 0.0002, + "step": 38280 + }, + { + "epoch": 8.316681146828845, + "grad_norm": 0.002354201627895236, + "learning_rate": 1.5356211989574285e-05, + "loss": 0.0002, + "step": 38290 + }, + { + "epoch": 8.318853171155517, + "grad_norm": 0.004823198076337576, + "learning_rate": 1.534716188821315e-05, + "loss": 0.0001, + "step": 38300 + }, + { + "epoch": 8.32102519548219, + "grad_norm": 0.0006782126147300005, + "learning_rate": 1.5338111786852013e-05, + "loss": 0.0073, + "step": 38310 + }, + { + "epoch": 8.323197219808861, + "grad_norm": 0.0016151006566360593, + "learning_rate": 1.5329061685490878e-05, + "loss": 0.0001, + "step": 38320 + }, + { + "epoch": 8.325369244135535, + "grad_norm": 0.0009404784650541842, + "learning_rate": 1.5320011584129742e-05, + "loss": 0.0001, + "step": 38330 + }, + { + "epoch": 8.327541268462207, + "grad_norm": 0.002452719956636429, + "learning_rate": 1.5310961482768606e-05, + "loss": 0.0121, + "step": 38340 + }, + { + "epoch": 8.32971329278888, + "grad_norm": 0.0007282031583599746, + "learning_rate": 1.530191138140747e-05, + "loss": 0.0259, + "step": 38350 + }, + { + "epoch": 8.331885317115551, + "grad_norm": 0.0006568600074388087, + "learning_rate": 1.5292861280046335e-05, + "loss": 0.0081, + "step": 38360 + }, + { + "epoch": 8.334057341442223, + "grad_norm": 0.0006093371193856001, + "learning_rate": 1.52838111786852e-05, + "loss": 0.0391, + "step": 38370 + }, + { + "epoch": 8.336229365768897, + "grad_norm": 0.0006100427708588541, + "learning_rate": 1.5274761077324067e-05, + "loss": 0.0001, + "step": 38380 + }, + { + "epoch": 8.33840139009557, + "grad_norm": 0.0006650561117567122, + "learning_rate": 1.526571097596293e-05, + "loss": 0.0001, + "step": 38390 + }, + { + "epoch": 8.340573414422241, + "grad_norm": 0.0009484770707786083, + "learning_rate": 1.5256660874601798e-05, + "loss": 0.0001, + "step": 38400 + }, + { + "epoch": 8.342745438748914, + "grad_norm": 0.0006599361076951027, + "learning_rate": 1.5247610773240662e-05, + "loss": 0.0001, + "step": 38410 + }, + { + "epoch": 8.344917463075586, + "grad_norm": 0.000644295010715723, + "learning_rate": 1.5238560671879526e-05, + "loss": 0.0021, + "step": 38420 + }, + { + "epoch": 8.34708948740226, + "grad_norm": 0.0008500678814016283, + "learning_rate": 1.522951057051839e-05, + "loss": 0.0001, + "step": 38430 + }, + { + "epoch": 8.349261511728931, + "grad_norm": 0.0006609293050132692, + "learning_rate": 1.5220460469157255e-05, + "loss": 0.0001, + "step": 38440 + }, + { + "epoch": 8.351433536055604, + "grad_norm": 0.0006641658837907016, + "learning_rate": 1.521141036779612e-05, + "loss": 0.0001, + "step": 38450 + }, + { + "epoch": 8.353605560382276, + "grad_norm": 0.000987962819635868, + "learning_rate": 1.5202360266434984e-05, + "loss": 0.0001, + "step": 38460 + }, + { + "epoch": 8.35577758470895, + "grad_norm": 0.003739980747923255, + "learning_rate": 1.5193310165073848e-05, + "loss": 0.0001, + "step": 38470 + }, + { + "epoch": 8.357949609035622, + "grad_norm": 0.0006407461478374898, + "learning_rate": 1.5184260063712712e-05, + "loss": 0.0001, + "step": 38480 + }, + { + "epoch": 8.360121633362294, + "grad_norm": 0.0005970309721305966, + "learning_rate": 1.517520996235158e-05, + "loss": 0.0054, + "step": 38490 + }, + { + "epoch": 8.362293657688966, + "grad_norm": 0.0006966181681491435, + "learning_rate": 1.5166159860990444e-05, + "loss": 0.0001, + "step": 38500 + }, + { + "epoch": 8.364465682015638, + "grad_norm": 0.0006113156559877098, + "learning_rate": 1.5157109759629309e-05, + "loss": 0.0001, + "step": 38510 + }, + { + "epoch": 8.366637706342312, + "grad_norm": 0.0006402576109394431, + "learning_rate": 1.5148059658268173e-05, + "loss": 0.0001, + "step": 38520 + }, + { + "epoch": 8.368809730668984, + "grad_norm": 0.0006629484705626965, + "learning_rate": 1.5139009556907039e-05, + "loss": 0.0001, + "step": 38530 + }, + { + "epoch": 8.370981754995656, + "grad_norm": 0.0009653762681409717, + "learning_rate": 1.5129959455545903e-05, + "loss": 0.0001, + "step": 38540 + }, + { + "epoch": 8.373153779322328, + "grad_norm": 0.000561116321478039, + "learning_rate": 1.5120909354184768e-05, + "loss": 0.0001, + "step": 38550 + }, + { + "epoch": 8.375325803649002, + "grad_norm": 0.0005761196371167898, + "learning_rate": 1.5111859252823632e-05, + "loss": 0.0197, + "step": 38560 + }, + { + "epoch": 8.377497827975674, + "grad_norm": 0.0005579580320045352, + "learning_rate": 1.5102809151462496e-05, + "loss": 0.0001, + "step": 38570 + }, + { + "epoch": 8.379669852302346, + "grad_norm": 0.0008516975794918835, + "learning_rate": 1.509375905010136e-05, + "loss": 0.0315, + "step": 38580 + }, + { + "epoch": 8.381841876629018, + "grad_norm": 0.002303760964423418, + "learning_rate": 1.5084708948740225e-05, + "loss": 0.0001, + "step": 38590 + }, + { + "epoch": 8.38401390095569, + "grad_norm": 0.000604830333031714, + "learning_rate": 1.5075658847379093e-05, + "loss": 0.0001, + "step": 38600 + }, + { + "epoch": 8.386185925282364, + "grad_norm": 0.0026536278892308474, + "learning_rate": 1.5066608746017957e-05, + "loss": 0.0068, + "step": 38610 + }, + { + "epoch": 8.388357949609036, + "grad_norm": 0.0028301740530878305, + "learning_rate": 1.5057558644656821e-05, + "loss": 0.0001, + "step": 38620 + }, + { + "epoch": 8.390529973935708, + "grad_norm": 0.0008189657819457352, + "learning_rate": 1.5048508543295686e-05, + "loss": 0.0002, + "step": 38630 + }, + { + "epoch": 8.39270199826238, + "grad_norm": 0.0007581166573800147, + "learning_rate": 1.503945844193455e-05, + "loss": 0.0088, + "step": 38640 + }, + { + "epoch": 8.394874022589054, + "grad_norm": 0.000769811449572444, + "learning_rate": 1.5030408340573415e-05, + "loss": 0.0062, + "step": 38650 + }, + { + "epoch": 8.397046046915726, + "grad_norm": 0.0007086714031174779, + "learning_rate": 1.502135823921228e-05, + "loss": 0.0001, + "step": 38660 + }, + { + "epoch": 8.399218071242398, + "grad_norm": 0.0005801831721328199, + "learning_rate": 1.5012308137851145e-05, + "loss": 0.0001, + "step": 38670 + }, + { + "epoch": 8.40139009556907, + "grad_norm": 0.0008001170353963971, + "learning_rate": 1.500325803649001e-05, + "loss": 0.0001, + "step": 38680 + }, + { + "epoch": 8.403562119895742, + "grad_norm": 0.0005618112627416849, + "learning_rate": 1.4994207935128874e-05, + "loss": 0.0461, + "step": 38690 + }, + { + "epoch": 8.405734144222416, + "grad_norm": 0.0006344440625980496, + "learning_rate": 1.4985157833767738e-05, + "loss": 0.0001, + "step": 38700 + }, + { + "epoch": 8.407906168549088, + "grad_norm": 0.0007707632030360401, + "learning_rate": 1.4976107732406602e-05, + "loss": 0.0001, + "step": 38710 + }, + { + "epoch": 8.41007819287576, + "grad_norm": 0.001076050684787333, + "learning_rate": 1.496705763104547e-05, + "loss": 0.0003, + "step": 38720 + }, + { + "epoch": 8.412250217202432, + "grad_norm": 0.0006171344430185854, + "learning_rate": 1.4958007529684334e-05, + "loss": 0.0092, + "step": 38730 + }, + { + "epoch": 8.414422241529104, + "grad_norm": 0.0005999759305268526, + "learning_rate": 1.4948957428323199e-05, + "loss": 0.0009, + "step": 38740 + }, + { + "epoch": 8.416594265855778, + "grad_norm": 0.005478974897414446, + "learning_rate": 1.4939907326962063e-05, + "loss": 0.0002, + "step": 38750 + }, + { + "epoch": 8.41876629018245, + "grad_norm": 0.0006091848481446505, + "learning_rate": 1.4930857225600927e-05, + "loss": 0.0001, + "step": 38760 + }, + { + "epoch": 8.420938314509122, + "grad_norm": 0.0005783461383543909, + "learning_rate": 1.4921807124239792e-05, + "loss": 0.0001, + "step": 38770 + }, + { + "epoch": 8.423110338835794, + "grad_norm": 0.0005965415039099753, + "learning_rate": 1.4912757022878656e-05, + "loss": 0.0002, + "step": 38780 + }, + { + "epoch": 8.425282363162468, + "grad_norm": 0.0006767417071387172, + "learning_rate": 1.4903706921517522e-05, + "loss": 0.0075, + "step": 38790 + }, + { + "epoch": 8.42745438748914, + "grad_norm": 0.0005308814579620957, + "learning_rate": 1.4894656820156386e-05, + "loss": 0.0001, + "step": 38800 + }, + { + "epoch": 8.429626411815812, + "grad_norm": 0.0010998549405485392, + "learning_rate": 1.488560671879525e-05, + "loss": 0.0001, + "step": 38810 + }, + { + "epoch": 8.431798436142484, + "grad_norm": 0.000694752496201545, + "learning_rate": 1.4876556617434115e-05, + "loss": 0.0002, + "step": 38820 + }, + { + "epoch": 8.433970460469157, + "grad_norm": 0.0007519248174503446, + "learning_rate": 1.4867506516072983e-05, + "loss": 0.0001, + "step": 38830 + }, + { + "epoch": 8.43614248479583, + "grad_norm": 0.0005111406790092587, + "learning_rate": 1.4858456414711847e-05, + "loss": 0.0001, + "step": 38840 + }, + { + "epoch": 8.438314509122502, + "grad_norm": 0.0005285285878926516, + "learning_rate": 1.4849406313350711e-05, + "loss": 0.0001, + "step": 38850 + }, + { + "epoch": 8.440486533449175, + "grad_norm": 0.0014295554719865322, + "learning_rate": 1.4840356211989576e-05, + "loss": 0.0325, + "step": 38860 + }, + { + "epoch": 8.442658557775847, + "grad_norm": 0.04170700162649155, + "learning_rate": 1.483130611062844e-05, + "loss": 0.0001, + "step": 38870 + }, + { + "epoch": 8.444830582102519, + "grad_norm": 0.0023596552200615406, + "learning_rate": 1.4822256009267304e-05, + "loss": 0.0001, + "step": 38880 + }, + { + "epoch": 8.447002606429193, + "grad_norm": 0.0032503660768270493, + "learning_rate": 1.4813205907906169e-05, + "loss": 0.0001, + "step": 38890 + }, + { + "epoch": 8.449174630755865, + "grad_norm": 0.0007728934870101511, + "learning_rate": 1.4804155806545033e-05, + "loss": 0.0001, + "step": 38900 + }, + { + "epoch": 8.451346655082537, + "grad_norm": 0.3552645146846771, + "learning_rate": 1.4795105705183897e-05, + "loss": 0.0047, + "step": 38910 + }, + { + "epoch": 8.453518679409209, + "grad_norm": 0.0006022463203407824, + "learning_rate": 1.4786055603822762e-05, + "loss": 0.0001, + "step": 38920 + }, + { + "epoch": 8.455690703735883, + "grad_norm": 0.000813568476587534, + "learning_rate": 1.4777005502461628e-05, + "loss": 0.0001, + "step": 38930 + }, + { + "epoch": 8.457862728062555, + "grad_norm": 0.0005289530963636935, + "learning_rate": 1.4767955401100492e-05, + "loss": 0.0001, + "step": 38940 + }, + { + "epoch": 8.460034752389227, + "grad_norm": 0.0014709793031215668, + "learning_rate": 1.4758905299739358e-05, + "loss": 0.0054, + "step": 38950 + }, + { + "epoch": 8.462206776715899, + "grad_norm": 0.0005715846200473607, + "learning_rate": 1.4749855198378224e-05, + "loss": 0.0045, + "step": 38960 + }, + { + "epoch": 8.464378801042571, + "grad_norm": 0.0005603663739748299, + "learning_rate": 1.4740805097017089e-05, + "loss": 0.0001, + "step": 38970 + }, + { + "epoch": 8.466550825369245, + "grad_norm": 0.0005445944261737168, + "learning_rate": 1.4731754995655953e-05, + "loss": 0.0001, + "step": 38980 + }, + { + "epoch": 8.468722849695917, + "grad_norm": 0.1283123642206192, + "learning_rate": 1.4722704894294817e-05, + "loss": 0.0002, + "step": 38990 + }, + { + "epoch": 8.470894874022589, + "grad_norm": 0.0005378610803745687, + "learning_rate": 1.4713654792933682e-05, + "loss": 0.0001, + "step": 39000 + }, + { + "epoch": 8.473066898349261, + "grad_norm": 0.0007877520401962101, + "learning_rate": 1.4704604691572546e-05, + "loss": 0.0001, + "step": 39010 + }, + { + "epoch": 8.475238922675935, + "grad_norm": 0.0006150746485218406, + "learning_rate": 1.469555459021141e-05, + "loss": 0.0001, + "step": 39020 + }, + { + "epoch": 8.477410947002607, + "grad_norm": 0.0017679219599813223, + "learning_rate": 1.4686504488850275e-05, + "loss": 0.0059, + "step": 39030 + }, + { + "epoch": 8.479582971329279, + "grad_norm": 0.018884725868701935, + "learning_rate": 1.4677454387489139e-05, + "loss": 0.0001, + "step": 39040 + }, + { + "epoch": 8.481754995655951, + "grad_norm": 0.0010196593357250094, + "learning_rate": 1.4668404286128003e-05, + "loss": 0.0017, + "step": 39050 + }, + { + "epoch": 8.483927019982623, + "grad_norm": 0.0005284012877382338, + "learning_rate": 1.4659354184766871e-05, + "loss": 0.0001, + "step": 39060 + }, + { + "epoch": 8.486099044309297, + "grad_norm": 0.008003398776054382, + "learning_rate": 1.4650304083405735e-05, + "loss": 0.0001, + "step": 39070 + }, + { + "epoch": 8.488271068635969, + "grad_norm": 0.00324247102253139, + "learning_rate": 1.46412539820446e-05, + "loss": 0.0001, + "step": 39080 + }, + { + "epoch": 8.490443092962641, + "grad_norm": 0.0009854629170149565, + "learning_rate": 1.4633108890819578e-05, + "loss": 0.0232, + "step": 39090 + }, + { + "epoch": 8.492615117289313, + "grad_norm": 0.0005126107134856284, + "learning_rate": 1.4624058789458442e-05, + "loss": 0.0001, + "step": 39100 + }, + { + "epoch": 8.494787141615985, + "grad_norm": 0.0005185718182474375, + "learning_rate": 1.4615008688097306e-05, + "loss": 0.0001, + "step": 39110 + }, + { + "epoch": 8.496959165942659, + "grad_norm": 0.0007868999964557588, + "learning_rate": 1.460595858673617e-05, + "loss": 0.0001, + "step": 39120 + }, + { + "epoch": 8.499131190269331, + "grad_norm": 0.0005237551522441208, + "learning_rate": 1.4596908485375038e-05, + "loss": 0.0001, + "step": 39130 + }, + { + "epoch": 8.501303214596003, + "grad_norm": 0.001091836835257709, + "learning_rate": 1.4587858384013903e-05, + "loss": 0.0001, + "step": 39140 + }, + { + "epoch": 8.503475238922675, + "grad_norm": 0.000753571861423552, + "learning_rate": 1.4578808282652767e-05, + "loss": 0.0001, + "step": 39150 + }, + { + "epoch": 8.50564726324935, + "grad_norm": 0.0010601403191685677, + "learning_rate": 1.4569758181291631e-05, + "loss": 0.0001, + "step": 39160 + }, + { + "epoch": 8.507819287576021, + "grad_norm": 0.0004902264918200672, + "learning_rate": 1.4560708079930496e-05, + "loss": 0.0001, + "step": 39170 + }, + { + "epoch": 8.509991311902693, + "grad_norm": 0.0004899317282252014, + "learning_rate": 1.455165797856936e-05, + "loss": 0.0001, + "step": 39180 + }, + { + "epoch": 8.512163336229365, + "grad_norm": 0.00047692423686385155, + "learning_rate": 1.4542607877208226e-05, + "loss": 0.0053, + "step": 39190 + }, + { + "epoch": 8.514335360556037, + "grad_norm": 0.006296246778219938, + "learning_rate": 1.453355777584709e-05, + "loss": 0.0072, + "step": 39200 + }, + { + "epoch": 8.516507384882711, + "grad_norm": 0.0026722548063844442, + "learning_rate": 1.4524507674485955e-05, + "loss": 0.0001, + "step": 39210 + }, + { + "epoch": 8.518679409209383, + "grad_norm": 0.0005234789568930864, + "learning_rate": 1.4515457573124819e-05, + "loss": 0.0001, + "step": 39220 + }, + { + "epoch": 8.520851433536055, + "grad_norm": 0.0007127950084395707, + "learning_rate": 1.4506407471763683e-05, + "loss": 0.0018, + "step": 39230 + }, + { + "epoch": 8.523023457862728, + "grad_norm": 0.0004754096153192222, + "learning_rate": 1.4497357370402548e-05, + "loss": 0.0001, + "step": 39240 + }, + { + "epoch": 8.5251954821894, + "grad_norm": 0.0004704821913037449, + "learning_rate": 1.4488307269041416e-05, + "loss": 0.0001, + "step": 39250 + }, + { + "epoch": 8.527367506516073, + "grad_norm": 0.0004992211470380425, + "learning_rate": 1.447925716768028e-05, + "loss": 0.0062, + "step": 39260 + }, + { + "epoch": 8.529539530842746, + "grad_norm": 0.0004700243007391691, + "learning_rate": 1.4470207066319144e-05, + "loss": 0.006, + "step": 39270 + }, + { + "epoch": 8.531711555169418, + "grad_norm": 0.0009023443562909961, + "learning_rate": 1.4461156964958009e-05, + "loss": 0.0001, + "step": 39280 + }, + { + "epoch": 8.53388357949609, + "grad_norm": 0.00045638513984158635, + "learning_rate": 1.4452106863596873e-05, + "loss": 0.0088, + "step": 39290 + }, + { + "epoch": 8.536055603822764, + "grad_norm": 0.0004613750206772238, + "learning_rate": 1.4443056762235737e-05, + "loss": 0.0, + "step": 39300 + }, + { + "epoch": 8.538227628149436, + "grad_norm": 0.0005079872789792717, + "learning_rate": 1.4434006660874602e-05, + "loss": 0.0001, + "step": 39310 + }, + { + "epoch": 8.540399652476108, + "grad_norm": 0.00045794787001796067, + "learning_rate": 1.4424956559513466e-05, + "loss": 0.0, + "step": 39320 + }, + { + "epoch": 8.54257167680278, + "grad_norm": 0.0005387124256230891, + "learning_rate": 1.4415906458152332e-05, + "loss": 0.0001, + "step": 39330 + }, + { + "epoch": 8.544743701129452, + "grad_norm": 0.0005204430781304836, + "learning_rate": 1.4406856356791196e-05, + "loss": 0.0001, + "step": 39340 + }, + { + "epoch": 8.546915725456126, + "grad_norm": 0.0004614073259290308, + "learning_rate": 1.439780625543006e-05, + "loss": 0.0, + "step": 39350 + }, + { + "epoch": 8.549087749782798, + "grad_norm": 0.00045747487456537783, + "learning_rate": 1.4388756154068928e-05, + "loss": 0.0001, + "step": 39360 + }, + { + "epoch": 8.55125977410947, + "grad_norm": 0.00136624276638031, + "learning_rate": 1.4379706052707793e-05, + "loss": 0.0, + "step": 39370 + }, + { + "epoch": 8.553431798436142, + "grad_norm": 0.00045062758726999164, + "learning_rate": 1.4370655951346657e-05, + "loss": 0.004, + "step": 39380 + }, + { + "epoch": 8.555603822762816, + "grad_norm": 0.00045702551142312586, + "learning_rate": 1.4361605849985521e-05, + "loss": 0.0001, + "step": 39390 + }, + { + "epoch": 8.557775847089488, + "grad_norm": 0.0004522506205830723, + "learning_rate": 1.4352555748624386e-05, + "loss": 0.0, + "step": 39400 + }, + { + "epoch": 8.55994787141616, + "grad_norm": 0.0004662805295083672, + "learning_rate": 1.434350564726325e-05, + "loss": 0.0001, + "step": 39410 + }, + { + "epoch": 8.562119895742832, + "grad_norm": 0.0004481975338421762, + "learning_rate": 1.4334455545902114e-05, + "loss": 0.0001, + "step": 39420 + }, + { + "epoch": 8.564291920069504, + "grad_norm": 0.0004583086702041328, + "learning_rate": 1.4325405444540979e-05, + "loss": 0.0, + "step": 39430 + }, + { + "epoch": 8.566463944396178, + "grad_norm": 0.0004621725529432297, + "learning_rate": 1.4316355343179843e-05, + "loss": 0.0, + "step": 39440 + }, + { + "epoch": 8.56863596872285, + "grad_norm": 0.00045265990775078535, + "learning_rate": 1.4307305241818707e-05, + "loss": 0.0, + "step": 39450 + }, + { + "epoch": 8.570807993049522, + "grad_norm": 0.0004928920534439385, + "learning_rate": 1.4298255140457573e-05, + "loss": 0.0001, + "step": 39460 + }, + { + "epoch": 8.572980017376194, + "grad_norm": 0.15685199201107025, + "learning_rate": 1.428920503909644e-05, + "loss": 0.0036, + "step": 39470 + }, + { + "epoch": 8.575152041702868, + "grad_norm": 0.0004423794453032315, + "learning_rate": 1.4280154937735304e-05, + "loss": 0.0001, + "step": 39480 + }, + { + "epoch": 8.57732406602954, + "grad_norm": 0.00043715888750739396, + "learning_rate": 1.4271104836374168e-05, + "loss": 0.0, + "step": 39490 + }, + { + "epoch": 8.579496090356212, + "grad_norm": 0.00045279006008058786, + "learning_rate": 1.4262054735013034e-05, + "loss": 0.0, + "step": 39500 + }, + { + "epoch": 8.581668114682884, + "grad_norm": 0.00045085386955179274, + "learning_rate": 1.4253004633651899e-05, + "loss": 0.0001, + "step": 39510 + }, + { + "epoch": 8.583840139009556, + "grad_norm": 0.0004693444352596998, + "learning_rate": 1.4243954532290763e-05, + "loss": 0.0001, + "step": 39520 + }, + { + "epoch": 8.58601216333623, + "grad_norm": 0.0004297494888305664, + "learning_rate": 1.4234904430929627e-05, + "loss": 0.0, + "step": 39530 + }, + { + "epoch": 8.588184187662902, + "grad_norm": 0.0005316234892234206, + "learning_rate": 1.4225854329568492e-05, + "loss": 0.0, + "step": 39540 + }, + { + "epoch": 8.590356211989574, + "grad_norm": 0.0004413987626321614, + "learning_rate": 1.4216804228207356e-05, + "loss": 0.0001, + "step": 39550 + }, + { + "epoch": 8.592528236316246, + "grad_norm": 0.0004459246410988271, + "learning_rate": 1.420775412684622e-05, + "loss": 0.0058, + "step": 39560 + }, + { + "epoch": 8.59470026064292, + "grad_norm": 0.0004368575755506754, + "learning_rate": 1.4198704025485085e-05, + "loss": 0.0001, + "step": 39570 + }, + { + "epoch": 8.596872284969592, + "grad_norm": 0.0004350426606833935, + "learning_rate": 1.4189653924123949e-05, + "loss": 0.0001, + "step": 39580 + }, + { + "epoch": 8.599044309296264, + "grad_norm": 0.0005037335213273764, + "learning_rate": 1.4180603822762817e-05, + "loss": 0.0001, + "step": 39590 + }, + { + "epoch": 8.601216333622936, + "grad_norm": 0.04642053693532944, + "learning_rate": 1.4171553721401681e-05, + "loss": 0.0001, + "step": 39600 + }, + { + "epoch": 8.603388357949608, + "grad_norm": 0.00044416176388040185, + "learning_rate": 1.4162503620040545e-05, + "loss": 0.0, + "step": 39610 + }, + { + "epoch": 8.605560382276282, + "grad_norm": 0.0013276516692712903, + "learning_rate": 1.415345351867941e-05, + "loss": 0.0001, + "step": 39620 + }, + { + "epoch": 8.607732406602954, + "grad_norm": 0.0006277307402342558, + "learning_rate": 1.4144403417318276e-05, + "loss": 0.0, + "step": 39630 + }, + { + "epoch": 8.609904430929626, + "grad_norm": 0.0004260788264218718, + "learning_rate": 1.413535331595714e-05, + "loss": 0.0, + "step": 39640 + }, + { + "epoch": 8.612076455256299, + "grad_norm": 0.0004270559875294566, + "learning_rate": 1.4126303214596004e-05, + "loss": 0.0, + "step": 39650 + }, + { + "epoch": 8.61424847958297, + "grad_norm": 0.0004411809495650232, + "learning_rate": 1.4117253113234869e-05, + "loss": 0.0, + "step": 39660 + }, + { + "epoch": 8.616420503909644, + "grad_norm": 0.00043373851804062724, + "learning_rate": 1.4108203011873733e-05, + "loss": 0.0, + "step": 39670 + }, + { + "epoch": 8.618592528236316, + "grad_norm": 0.0004543520917650312, + "learning_rate": 1.4099152910512597e-05, + "loss": 0.0001, + "step": 39680 + }, + { + "epoch": 8.620764552562989, + "grad_norm": 0.00047740931040607393, + "learning_rate": 1.4090102809151462e-05, + "loss": 0.0035, + "step": 39690 + }, + { + "epoch": 8.62293657688966, + "grad_norm": 0.0004208452010061592, + "learning_rate": 1.408105270779033e-05, + "loss": 0.0, + "step": 39700 + }, + { + "epoch": 8.625108601216333, + "grad_norm": 0.0012495043920353055, + "learning_rate": 1.4072002606429194e-05, + "loss": 0.0001, + "step": 39710 + }, + { + "epoch": 8.627280625543007, + "grad_norm": 0.00042732080328278244, + "learning_rate": 1.4062952505068058e-05, + "loss": 0.0, + "step": 39720 + }, + { + "epoch": 8.629452649869679, + "grad_norm": 0.00045733177103102207, + "learning_rate": 1.4053902403706922e-05, + "loss": 0.0001, + "step": 39730 + }, + { + "epoch": 8.63162467419635, + "grad_norm": 0.00040695929783396423, + "learning_rate": 1.4044852302345787e-05, + "loss": 0.001, + "step": 39740 + }, + { + "epoch": 8.633796698523023, + "grad_norm": 0.0004219406400807202, + "learning_rate": 1.4035802200984651e-05, + "loss": 0.0002, + "step": 39750 + }, + { + "epoch": 8.635968722849697, + "grad_norm": 0.0004689696943387389, + "learning_rate": 1.4026752099623517e-05, + "loss": 0.0001, + "step": 39760 + }, + { + "epoch": 8.638140747176369, + "grad_norm": 0.0004632935451809317, + "learning_rate": 1.4017701998262381e-05, + "loss": 0.0001, + "step": 39770 + }, + { + "epoch": 8.64031277150304, + "grad_norm": 0.01873939484357834, + "learning_rate": 1.4008651896901246e-05, + "loss": 0.0001, + "step": 39780 + }, + { + "epoch": 8.642484795829713, + "grad_norm": 0.00048569723730906844, + "learning_rate": 1.399960179554011e-05, + "loss": 0.0057, + "step": 39790 + }, + { + "epoch": 8.644656820156385, + "grad_norm": 0.0009552659466862679, + "learning_rate": 1.3990551694178974e-05, + "loss": 0.0011, + "step": 39800 + }, + { + "epoch": 8.646828844483059, + "grad_norm": 34.08624267578125, + "learning_rate": 1.3981501592817839e-05, + "loss": 0.0244, + "step": 39810 + }, + { + "epoch": 8.64900086880973, + "grad_norm": 0.000415660731960088, + "learning_rate": 1.3972451491456707e-05, + "loss": 0.0, + "step": 39820 + }, + { + "epoch": 8.651172893136403, + "grad_norm": 0.0006440936122089624, + "learning_rate": 1.3963401390095571e-05, + "loss": 0.0001, + "step": 39830 + }, + { + "epoch": 8.653344917463075, + "grad_norm": 0.0013137052301317453, + "learning_rate": 1.3954351288734435e-05, + "loss": 0.0064, + "step": 39840 + }, + { + "epoch": 8.655516941789749, + "grad_norm": 0.0004171818436589092, + "learning_rate": 1.39453011873733e-05, + "loss": 0.0001, + "step": 39850 + }, + { + "epoch": 8.657688966116421, + "grad_norm": 0.00041515124030411243, + "learning_rate": 1.3936251086012164e-05, + "loss": 0.0, + "step": 39860 + }, + { + "epoch": 8.659860990443093, + "grad_norm": 0.00483085447922349, + "learning_rate": 1.3927200984651028e-05, + "loss": 0.0001, + "step": 39870 + }, + { + "epoch": 8.662033014769765, + "grad_norm": 0.00041427038377150893, + "learning_rate": 1.3918150883289893e-05, + "loss": 0.0, + "step": 39880 + }, + { + "epoch": 8.664205039096437, + "grad_norm": 0.0004123119288124144, + "learning_rate": 1.3909100781928757e-05, + "loss": 0.0, + "step": 39890 + }, + { + "epoch": 8.666377063423111, + "grad_norm": 0.0004082611412741244, + "learning_rate": 1.3900050680567623e-05, + "loss": 0.0042, + "step": 39900 + }, + { + "epoch": 8.668549087749783, + "grad_norm": 0.00040656846249476075, + "learning_rate": 1.3891000579206487e-05, + "loss": 0.0, + "step": 39910 + }, + { + "epoch": 8.670721112076455, + "grad_norm": 0.00041427111136727035, + "learning_rate": 1.3881950477845352e-05, + "loss": 0.0, + "step": 39920 + }, + { + "epoch": 8.672893136403127, + "grad_norm": 0.0007840920588932931, + "learning_rate": 1.387290037648422e-05, + "loss": 0.0611, + "step": 39930 + }, + { + "epoch": 8.675065160729801, + "grad_norm": 0.0017191257793456316, + "learning_rate": 1.3863850275123084e-05, + "loss": 0.0057, + "step": 39940 + }, + { + "epoch": 8.677237185056473, + "grad_norm": 0.0021400810219347477, + "learning_rate": 1.3854800173761948e-05, + "loss": 0.0064, + "step": 39950 + }, + { + "epoch": 8.679409209383145, + "grad_norm": 0.002642604988068342, + "learning_rate": 1.3845750072400812e-05, + "loss": 0.0002, + "step": 39960 + }, + { + "epoch": 8.681581233709817, + "grad_norm": 0.0015774896601215005, + "learning_rate": 1.3836699971039677e-05, + "loss": 0.0002, + "step": 39970 + }, + { + "epoch": 8.68375325803649, + "grad_norm": 0.0011439747177064419, + "learning_rate": 1.3827649869678541e-05, + "loss": 0.0001, + "step": 39980 + }, + { + "epoch": 8.685925282363163, + "grad_norm": 0.0010642379056662321, + "learning_rate": 1.3818599768317405e-05, + "loss": 0.0053, + "step": 39990 + }, + { + "epoch": 8.688097306689835, + "grad_norm": 0.0014020922826603055, + "learning_rate": 1.380954966695627e-05, + "loss": 0.0043, + "step": 40000 + }, + { + "epoch": 8.690269331016507, + "grad_norm": 0.001087503507733345, + "learning_rate": 1.3800499565595134e-05, + "loss": 0.0001, + "step": 40010 + }, + { + "epoch": 8.69244135534318, + "grad_norm": 0.0009494571713730693, + "learning_rate": 1.3791449464233998e-05, + "loss": 0.0001, + "step": 40020 + }, + { + "epoch": 8.694613379669851, + "grad_norm": 0.0008690246613696218, + "learning_rate": 1.3782399362872864e-05, + "loss": 0.0001, + "step": 40030 + }, + { + "epoch": 8.696785403996525, + "grad_norm": 0.0007888951804488897, + "learning_rate": 1.3773349261511729e-05, + "loss": 0.0001, + "step": 40040 + }, + { + "epoch": 8.698957428323197, + "grad_norm": 0.0010448385728523135, + "learning_rate": 1.3764299160150595e-05, + "loss": 0.0001, + "step": 40050 + }, + { + "epoch": 8.70112945264987, + "grad_norm": 0.0006909267976880074, + "learning_rate": 1.375524905878946e-05, + "loss": 0.0001, + "step": 40060 + }, + { + "epoch": 8.703301476976542, + "grad_norm": 0.000706880702637136, + "learning_rate": 1.3746198957428325e-05, + "loss": 0.0052, + "step": 40070 + }, + { + "epoch": 8.705473501303215, + "grad_norm": 0.0007796403951942921, + "learning_rate": 1.373714885606719e-05, + "loss": 0.0001, + "step": 40080 + }, + { + "epoch": 8.707645525629887, + "grad_norm": 0.0005805023247376084, + "learning_rate": 1.3728098754706054e-05, + "loss": 0.0054, + "step": 40090 + }, + { + "epoch": 8.70981754995656, + "grad_norm": 0.0006314498605206609, + "learning_rate": 1.3719048653344918e-05, + "loss": 0.0001, + "step": 40100 + }, + { + "epoch": 8.711989574283232, + "grad_norm": 0.0006966400542296469, + "learning_rate": 1.3709998551983783e-05, + "loss": 0.0001, + "step": 40110 + }, + { + "epoch": 8.714161598609904, + "grad_norm": 0.0006841762224212289, + "learning_rate": 1.3700948450622647e-05, + "loss": 0.0054, + "step": 40120 + }, + { + "epoch": 8.716333622936578, + "grad_norm": 0.0008951672934927046, + "learning_rate": 1.3691898349261511e-05, + "loss": 0.0001, + "step": 40130 + }, + { + "epoch": 8.71850564726325, + "grad_norm": 0.0005166791379451752, + "learning_rate": 1.3682848247900376e-05, + "loss": 0.0001, + "step": 40140 + }, + { + "epoch": 8.720677671589922, + "grad_norm": 0.0013270946219563484, + "learning_rate": 1.367379814653924e-05, + "loss": 0.0001, + "step": 40150 + }, + { + "epoch": 8.722849695916594, + "grad_norm": 0.0004784400516655296, + "learning_rate": 1.3664748045178108e-05, + "loss": 0.0001, + "step": 40160 + }, + { + "epoch": 8.725021720243266, + "grad_norm": 0.0004924449021928012, + "learning_rate": 1.3655697943816972e-05, + "loss": 0.0001, + "step": 40170 + }, + { + "epoch": 8.72719374456994, + "grad_norm": 0.0005478410166688263, + "learning_rate": 1.3646647842455836e-05, + "loss": 0.0001, + "step": 40180 + }, + { + "epoch": 8.729365768896612, + "grad_norm": 0.00045741075882688165, + "learning_rate": 1.3638502751230814e-05, + "loss": 0.008, + "step": 40190 + }, + { + "epoch": 8.731537793223284, + "grad_norm": 0.00053097412455827, + "learning_rate": 1.3629452649869679e-05, + "loss": 0.0001, + "step": 40200 + }, + { + "epoch": 8.733709817549956, + "grad_norm": 0.00047605347936041653, + "learning_rate": 1.3620402548508543e-05, + "loss": 0.0049, + "step": 40210 + }, + { + "epoch": 8.73588184187663, + "grad_norm": 0.00046191230649128556, + "learning_rate": 1.3611352447147407e-05, + "loss": 0.0001, + "step": 40220 + }, + { + "epoch": 8.738053866203302, + "grad_norm": 0.0006621324573643506, + "learning_rate": 1.3602302345786275e-05, + "loss": 0.0042, + "step": 40230 + }, + { + "epoch": 8.740225890529974, + "grad_norm": 0.0006459571304731071, + "learning_rate": 1.359325224442514e-05, + "loss": 0.0001, + "step": 40240 + }, + { + "epoch": 8.742397914856646, + "grad_norm": 0.0004514079191721976, + "learning_rate": 1.3584202143064004e-05, + "loss": 0.0001, + "step": 40250 + }, + { + "epoch": 8.744569939183318, + "grad_norm": 0.0008339316700585186, + "learning_rate": 1.3575152041702868e-05, + "loss": 0.0001, + "step": 40260 + }, + { + "epoch": 8.746741963509992, + "grad_norm": 0.0004699197015725076, + "learning_rate": 1.3567006950477846e-05, + "loss": 0.0176, + "step": 40270 + }, + { + "epoch": 8.748913987836664, + "grad_norm": 0.0005045742727816105, + "learning_rate": 1.355795684911671e-05, + "loss": 0.0, + "step": 40280 + }, + { + "epoch": 8.751086012163336, + "grad_norm": 0.0014669718220829964, + "learning_rate": 1.3548906747755575e-05, + "loss": 0.0001, + "step": 40290 + }, + { + "epoch": 8.753258036490008, + "grad_norm": 0.0008972100913524628, + "learning_rate": 1.3539856646394442e-05, + "loss": 0.0346, + "step": 40300 + }, + { + "epoch": 8.755430060816682, + "grad_norm": 0.1194867491722107, + "learning_rate": 1.3530806545033307e-05, + "loss": 0.0004, + "step": 40310 + }, + { + "epoch": 8.757602085143354, + "grad_norm": 0.00045046405284665525, + "learning_rate": 1.3521756443672171e-05, + "loss": 0.0023, + "step": 40320 + }, + { + "epoch": 8.759774109470026, + "grad_norm": 0.005810149013996124, + "learning_rate": 1.3512706342311035e-05, + "loss": 0.0001, + "step": 40330 + }, + { + "epoch": 8.761946133796698, + "grad_norm": 0.0006038735737092793, + "learning_rate": 1.35036562409499e-05, + "loss": 0.0, + "step": 40340 + }, + { + "epoch": 8.76411815812337, + "grad_norm": 0.001365828444249928, + "learning_rate": 1.3494606139588764e-05, + "loss": 0.0001, + "step": 40350 + }, + { + "epoch": 8.766290182450044, + "grad_norm": 0.0006109884125180542, + "learning_rate": 1.3485556038227628e-05, + "loss": 0.0001, + "step": 40360 + }, + { + "epoch": 8.768462206776716, + "grad_norm": 0.00043821826693601906, + "learning_rate": 1.3476505936866493e-05, + "loss": 0.0002, + "step": 40370 + }, + { + "epoch": 8.770634231103388, + "grad_norm": 0.038904059678316116, + "learning_rate": 1.3467455835505357e-05, + "loss": 0.0002, + "step": 40380 + }, + { + "epoch": 8.77280625543006, + "grad_norm": 0.00041763324406929314, + "learning_rate": 1.3458405734144221e-05, + "loss": 0.0049, + "step": 40390 + }, + { + "epoch": 8.774978279756734, + "grad_norm": 0.0004849474353250116, + "learning_rate": 1.3449355632783087e-05, + "loss": 0.0001, + "step": 40400 + }, + { + "epoch": 8.777150304083406, + "grad_norm": 0.0006203539669513702, + "learning_rate": 1.3440305531421953e-05, + "loss": 0.0001, + "step": 40410 + }, + { + "epoch": 8.779322328410078, + "grad_norm": 0.0029206108301877975, + "learning_rate": 1.3431255430060818e-05, + "loss": 0.0001, + "step": 40420 + }, + { + "epoch": 8.78149435273675, + "grad_norm": 0.00042043565190397203, + "learning_rate": 1.3422205328699684e-05, + "loss": 0.0042, + "step": 40430 + }, + { + "epoch": 8.783666377063422, + "grad_norm": 0.00043287023436278105, + "learning_rate": 1.3413155227338548e-05, + "loss": 0.0001, + "step": 40440 + }, + { + "epoch": 8.785838401390096, + "grad_norm": 0.00042107931221835315, + "learning_rate": 1.3404105125977412e-05, + "loss": 0.0001, + "step": 40450 + }, + { + "epoch": 8.788010425716768, + "grad_norm": 0.00041855682502500713, + "learning_rate": 1.3395055024616277e-05, + "loss": 0.0001, + "step": 40460 + }, + { + "epoch": 8.79018245004344, + "grad_norm": 0.0005362842348404229, + "learning_rate": 1.3386004923255141e-05, + "loss": 0.0, + "step": 40470 + }, + { + "epoch": 8.792354474370113, + "grad_norm": 0.00041211023926734924, + "learning_rate": 1.3376954821894005e-05, + "loss": 0.0001, + "step": 40480 + }, + { + "epoch": 8.794526498696785, + "grad_norm": 0.00043726610601879656, + "learning_rate": 1.336790472053287e-05, + "loss": 0.0, + "step": 40490 + }, + { + "epoch": 8.796698523023458, + "grad_norm": 0.0005625045741908252, + "learning_rate": 1.3358854619171734e-05, + "loss": 0.0001, + "step": 40500 + }, + { + "epoch": 8.79887054735013, + "grad_norm": 0.0004177498340141028, + "learning_rate": 1.3349804517810599e-05, + "loss": 0.0001, + "step": 40510 + }, + { + "epoch": 8.801042571676803, + "grad_norm": 0.0007373658008873463, + "learning_rate": 1.3340754416449463e-05, + "loss": 0.0036, + "step": 40520 + }, + { + "epoch": 8.803214596003475, + "grad_norm": 0.00042282009962946177, + "learning_rate": 1.333170431508833e-05, + "loss": 0.0001, + "step": 40530 + }, + { + "epoch": 8.805386620330149, + "grad_norm": 0.00047931907465681434, + "learning_rate": 1.3322654213727195e-05, + "loss": 0.0, + "step": 40540 + }, + { + "epoch": 8.80755864465682, + "grad_norm": 0.0020538109820336103, + "learning_rate": 1.331360411236606e-05, + "loss": 0.0001, + "step": 40550 + }, + { + "epoch": 8.809730668983493, + "grad_norm": 0.00045984104508534074, + "learning_rate": 1.3304554011004924e-05, + "loss": 0.0001, + "step": 40560 + }, + { + "epoch": 8.811902693310165, + "grad_norm": 0.00040731808985583484, + "learning_rate": 1.329550390964379e-05, + "loss": 0.0303, + "step": 40570 + }, + { + "epoch": 8.814074717636837, + "grad_norm": 0.0009267780114896595, + "learning_rate": 1.3286453808282654e-05, + "loss": 0.0001, + "step": 40580 + }, + { + "epoch": 8.81624674196351, + "grad_norm": 0.0004015139420516789, + "learning_rate": 1.3277403706921518e-05, + "loss": 0.0049, + "step": 40590 + }, + { + "epoch": 8.818418766290183, + "grad_norm": 0.044160980731248856, + "learning_rate": 1.3268353605560383e-05, + "loss": 0.006, + "step": 40600 + }, + { + "epoch": 8.820590790616855, + "grad_norm": 0.00041005350067280233, + "learning_rate": 1.3259303504199247e-05, + "loss": 0.0036, + "step": 40610 + }, + { + "epoch": 8.822762814943527, + "grad_norm": 0.00040421911398880184, + "learning_rate": 1.3250253402838111e-05, + "loss": 0.0, + "step": 40620 + }, + { + "epoch": 8.824934839270199, + "grad_norm": 0.00039571928209625185, + "learning_rate": 1.3241203301476976e-05, + "loss": 0.0, + "step": 40630 + }, + { + "epoch": 8.827106863596873, + "grad_norm": 0.00040001931483857334, + "learning_rate": 1.3232153200115843e-05, + "loss": 0.0046, + "step": 40640 + }, + { + "epoch": 8.829278887923545, + "grad_norm": 0.0003898143768310547, + "learning_rate": 1.3223103098754708e-05, + "loss": 0.0, + "step": 40650 + }, + { + "epoch": 8.831450912250217, + "grad_norm": 0.00039375320193357766, + "learning_rate": 1.3214052997393572e-05, + "loss": 0.0001, + "step": 40660 + }, + { + "epoch": 8.833622936576889, + "grad_norm": 0.0003916964342352003, + "learning_rate": 1.3205002896032436e-05, + "loss": 0.0, + "step": 40670 + }, + { + "epoch": 8.835794960903563, + "grad_norm": 0.00039076615939848125, + "learning_rate": 1.31959527946713e-05, + "loss": 0.0, + "step": 40680 + }, + { + "epoch": 8.837966985230235, + "grad_norm": 0.00039495810051448643, + "learning_rate": 1.3186902693310165e-05, + "loss": 0.0002, + "step": 40690 + }, + { + "epoch": 8.840139009556907, + "grad_norm": 0.0003964413481298834, + "learning_rate": 1.3177852591949031e-05, + "loss": 0.0, + "step": 40700 + }, + { + "epoch": 8.842311033883579, + "grad_norm": 0.00039823653060011566, + "learning_rate": 1.3168802490587895e-05, + "loss": 0.0, + "step": 40710 + }, + { + "epoch": 8.844483058210251, + "grad_norm": 0.00038708062493242323, + "learning_rate": 1.315975238922676e-05, + "loss": 0.0062, + "step": 40720 + }, + { + "epoch": 8.846655082536925, + "grad_norm": 0.00040895427810028195, + "learning_rate": 1.3150702287865624e-05, + "loss": 0.0, + "step": 40730 + }, + { + "epoch": 8.848827106863597, + "grad_norm": 0.00038919615326449275, + "learning_rate": 1.3141652186504488e-05, + "loss": 0.0, + "step": 40740 + }, + { + "epoch": 8.85099913119027, + "grad_norm": 0.0006516836583614349, + "learning_rate": 1.3132602085143353e-05, + "loss": 0.0, + "step": 40750 + }, + { + "epoch": 8.853171155516941, + "grad_norm": 0.00040941167389974, + "learning_rate": 1.312355198378222e-05, + "loss": 0.0036, + "step": 40760 + }, + { + "epoch": 8.855343179843615, + "grad_norm": 0.0003876253613270819, + "learning_rate": 1.3114501882421085e-05, + "loss": 0.0, + "step": 40770 + }, + { + "epoch": 8.857515204170287, + "grad_norm": 0.0003910251543857157, + "learning_rate": 1.310545178105995e-05, + "loss": 0.0, + "step": 40780 + }, + { + "epoch": 8.85968722849696, + "grad_norm": 0.0004716122057288885, + "learning_rate": 1.3096401679698814e-05, + "loss": 0.0, + "step": 40790 + }, + { + "epoch": 8.861859252823631, + "grad_norm": 0.00038500205846503377, + "learning_rate": 1.3087351578337678e-05, + "loss": 0.0001, + "step": 40800 + }, + { + "epoch": 8.864031277150303, + "grad_norm": 0.0003877005656249821, + "learning_rate": 1.3078301476976542e-05, + "loss": 0.0052, + "step": 40810 + }, + { + "epoch": 8.866203301476977, + "grad_norm": 0.00038981385296210647, + "learning_rate": 1.3069251375615407e-05, + "loss": 0.0053, + "step": 40820 + }, + { + "epoch": 8.86837532580365, + "grad_norm": 0.00039689007098786533, + "learning_rate": 1.3060201274254273e-05, + "loss": 0.0001, + "step": 40830 + }, + { + "epoch": 8.870547350130321, + "grad_norm": 0.00038903128006495535, + "learning_rate": 1.3051151172893137e-05, + "loss": 0.0004, + "step": 40840 + }, + { + "epoch": 8.872719374456993, + "grad_norm": 0.0003967289230786264, + "learning_rate": 1.3042101071532001e-05, + "loss": 0.0, + "step": 40850 + }, + { + "epoch": 8.874891398783667, + "grad_norm": 0.00039026138256303966, + "learning_rate": 1.3033050970170866e-05, + "loss": 0.0, + "step": 40860 + }, + { + "epoch": 8.87706342311034, + "grad_norm": 0.00043888072832487524, + "learning_rate": 1.3024000868809733e-05, + "loss": 0.0018, + "step": 40870 + }, + { + "epoch": 8.879235447437011, + "grad_norm": 0.0013480924535542727, + "learning_rate": 1.3014950767448598e-05, + "loss": 0.0089, + "step": 40880 + }, + { + "epoch": 8.881407471763684, + "grad_norm": 0.00042966246837750077, + "learning_rate": 1.3005900666087462e-05, + "loss": 0.0001, + "step": 40890 + }, + { + "epoch": 8.883579496090356, + "grad_norm": 0.0003938440349884331, + "learning_rate": 1.2996850564726326e-05, + "loss": 0.0001, + "step": 40900 + }, + { + "epoch": 8.88575152041703, + "grad_norm": 0.0004265749885234982, + "learning_rate": 1.298780046336519e-05, + "loss": 0.0024, + "step": 40910 + }, + { + "epoch": 8.887923544743701, + "grad_norm": 0.00041098997462540865, + "learning_rate": 1.2978750362004055e-05, + "loss": 0.0001, + "step": 40920 + }, + { + "epoch": 8.890095569070374, + "grad_norm": 0.0007312253001146019, + "learning_rate": 1.296970026064292e-05, + "loss": 0.0, + "step": 40930 + }, + { + "epoch": 8.892267593397046, + "grad_norm": 0.0003898316062986851, + "learning_rate": 1.2960650159281784e-05, + "loss": 0.0, + "step": 40940 + }, + { + "epoch": 8.894439617723718, + "grad_norm": 0.0003812254872173071, + "learning_rate": 1.2951600057920648e-05, + "loss": 0.0001, + "step": 40950 + }, + { + "epoch": 8.896611642050392, + "grad_norm": 0.0003883030731230974, + "learning_rate": 1.2942549956559512e-05, + "loss": 0.0, + "step": 40960 + }, + { + "epoch": 8.898783666377064, + "grad_norm": 0.000394894159398973, + "learning_rate": 1.2933499855198378e-05, + "loss": 0.0, + "step": 40970 + }, + { + "epoch": 8.900955690703736, + "grad_norm": 0.0003820984566118568, + "learning_rate": 1.2924449753837243e-05, + "loss": 0.0355, + "step": 40980 + }, + { + "epoch": 8.903127715030408, + "grad_norm": 0.0004881360218860209, + "learning_rate": 1.2915399652476109e-05, + "loss": 0.0001, + "step": 40990 + }, + { + "epoch": 8.90529973935708, + "grad_norm": 0.00045840549864806235, + "learning_rate": 1.2906349551114975e-05, + "loss": 0.0002, + "step": 41000 + }, + { + "epoch": 8.907471763683754, + "grad_norm": 0.0004596387152560055, + "learning_rate": 1.289729944975384e-05, + "loss": 0.0, + "step": 41010 + }, + { + "epoch": 8.909643788010426, + "grad_norm": 0.00038116611540317535, + "learning_rate": 1.2888249348392704e-05, + "loss": 0.0047, + "step": 41020 + }, + { + "epoch": 8.911815812337098, + "grad_norm": 0.0007415604195557535, + "learning_rate": 1.2879199247031568e-05, + "loss": 0.0057, + "step": 41030 + }, + { + "epoch": 8.91398783666377, + "grad_norm": 0.0012384551810100675, + "learning_rate": 1.2870149145670432e-05, + "loss": 0.0, + "step": 41040 + }, + { + "epoch": 8.916159860990444, + "grad_norm": 0.00042603438487276435, + "learning_rate": 1.2861099044309297e-05, + "loss": 0.0, + "step": 41050 + }, + { + "epoch": 8.918331885317116, + "grad_norm": 0.00038078860961832106, + "learning_rate": 1.2852048942948161e-05, + "loss": 0.0001, + "step": 41060 + }, + { + "epoch": 8.920503909643788, + "grad_norm": 0.0003853098605759442, + "learning_rate": 1.2842998841587025e-05, + "loss": 0.0, + "step": 41070 + }, + { + "epoch": 8.92267593397046, + "grad_norm": 0.000514326267875731, + "learning_rate": 1.283394874022589e-05, + "loss": 0.0, + "step": 41080 + }, + { + "epoch": 8.924847958297132, + "grad_norm": 0.00038332334952428937, + "learning_rate": 1.2824898638864754e-05, + "loss": 0.0046, + "step": 41090 + }, + { + "epoch": 8.927019982623806, + "grad_norm": 0.0004047720576636493, + "learning_rate": 1.2815848537503622e-05, + "loss": 0.0, + "step": 41100 + }, + { + "epoch": 8.929192006950478, + "grad_norm": 0.0003732458280865103, + "learning_rate": 1.2806798436142486e-05, + "loss": 0.0048, + "step": 41110 + }, + { + "epoch": 8.93136403127715, + "grad_norm": 0.0007458085892722011, + "learning_rate": 1.279774833478135e-05, + "loss": 0.0, + "step": 41120 + }, + { + "epoch": 8.933536055603822, + "grad_norm": 0.0003851301735267043, + "learning_rate": 1.2788698233420215e-05, + "loss": 0.0, + "step": 41130 + }, + { + "epoch": 8.935708079930496, + "grad_norm": 0.0003756080404855311, + "learning_rate": 1.277964813205908e-05, + "loss": 0.0, + "step": 41140 + }, + { + "epoch": 8.937880104257168, + "grad_norm": 0.0003725398564711213, + "learning_rate": 1.2770598030697945e-05, + "loss": 0.0, + "step": 41150 + }, + { + "epoch": 8.94005212858384, + "grad_norm": 0.00038527336437255144, + "learning_rate": 1.276154792933681e-05, + "loss": 0.0001, + "step": 41160 + }, + { + "epoch": 8.942224152910512, + "grad_norm": 0.006219358649104834, + "learning_rate": 1.2752497827975674e-05, + "loss": 0.0001, + "step": 41170 + }, + { + "epoch": 8.944396177237184, + "grad_norm": 0.00037608726415783167, + "learning_rate": 1.2743447726614538e-05, + "loss": 0.0001, + "step": 41180 + }, + { + "epoch": 8.946568201563858, + "grad_norm": 0.0003803640138357878, + "learning_rate": 1.2734397625253402e-05, + "loss": 0.0, + "step": 41190 + }, + { + "epoch": 8.94874022589053, + "grad_norm": 0.0003750100440811366, + "learning_rate": 1.2725347523892267e-05, + "loss": 0.0, + "step": 41200 + }, + { + "epoch": 8.950912250217202, + "grad_norm": 0.0003808810724876821, + "learning_rate": 1.2716297422531134e-05, + "loss": 0.0, + "step": 41210 + }, + { + "epoch": 8.953084274543874, + "grad_norm": 0.0003753194469027221, + "learning_rate": 1.2707247321169999e-05, + "loss": 0.0, + "step": 41220 + }, + { + "epoch": 8.955256298870548, + "grad_norm": 0.0005497315432876348, + "learning_rate": 1.2698197219808863e-05, + "loss": 0.0, + "step": 41230 + }, + { + "epoch": 8.95742832319722, + "grad_norm": 0.0004073216987308115, + "learning_rate": 1.2689147118447727e-05, + "loss": 0.0042, + "step": 41240 + }, + { + "epoch": 8.959600347523892, + "grad_norm": 0.005487049464136362, + "learning_rate": 1.2680097017086592e-05, + "loss": 0.0001, + "step": 41250 + }, + { + "epoch": 8.961772371850564, + "grad_norm": 0.0003656859917100519, + "learning_rate": 1.2671046915725456e-05, + "loss": 0.0, + "step": 41260 + }, + { + "epoch": 8.963944396177236, + "grad_norm": 0.00038666161708533764, + "learning_rate": 1.2661996814364322e-05, + "loss": 0.0, + "step": 41270 + }, + { + "epoch": 8.96611642050391, + "grad_norm": 0.00037817287375219166, + "learning_rate": 1.2652946713003186e-05, + "loss": 0.027, + "step": 41280 + }, + { + "epoch": 8.968288444830582, + "grad_norm": 0.0003763465501833707, + "learning_rate": 1.264389661164205e-05, + "loss": 0.0001, + "step": 41290 + }, + { + "epoch": 8.970460469157254, + "grad_norm": 0.0003695207415148616, + "learning_rate": 1.2634846510280915e-05, + "loss": 0.0001, + "step": 41300 + }, + { + "epoch": 8.972632493483927, + "grad_norm": 0.0009629792766645551, + "learning_rate": 1.262579640891978e-05, + "loss": 0.0, + "step": 41310 + }, + { + "epoch": 8.9748045178106, + "grad_norm": 0.0003619254275690764, + "learning_rate": 1.2616746307558644e-05, + "loss": 0.0, + "step": 41320 + }, + { + "epoch": 8.976976542137272, + "grad_norm": 0.00037999844062142074, + "learning_rate": 1.2607696206197512e-05, + "loss": 0.0365, + "step": 41330 + }, + { + "epoch": 8.979148566463945, + "grad_norm": 0.0004289183998480439, + "learning_rate": 1.2598646104836376e-05, + "loss": 0.0001, + "step": 41340 + }, + { + "epoch": 8.981320590790617, + "grad_norm": 0.004200364463031292, + "learning_rate": 1.258959600347524e-05, + "loss": 0.0001, + "step": 41350 + }, + { + "epoch": 8.983492615117289, + "grad_norm": 0.0006380841950885952, + "learning_rate": 1.2580545902114105e-05, + "loss": 0.0002, + "step": 41360 + }, + { + "epoch": 8.985664639443963, + "grad_norm": 3.8622334003448486, + "learning_rate": 1.2571495800752969e-05, + "loss": 0.0493, + "step": 41370 + }, + { + "epoch": 8.987836663770635, + "grad_norm": 0.0026256830897182226, + "learning_rate": 1.2562445699391833e-05, + "loss": 0.0002, + "step": 41380 + }, + { + "epoch": 8.990008688097307, + "grad_norm": 0.00362397450953722, + "learning_rate": 1.2553395598030698e-05, + "loss": 0.0006, + "step": 41390 + }, + { + "epoch": 8.992180712423979, + "grad_norm": 0.0019957926124334335, + "learning_rate": 1.2544345496669564e-05, + "loss": 0.0002, + "step": 41400 + }, + { + "epoch": 8.99435273675065, + "grad_norm": 0.007589931599795818, + "learning_rate": 1.2535295395308428e-05, + "loss": 0.0002, + "step": 41410 + }, + { + "epoch": 8.996524761077325, + "grad_norm": 0.002436482347548008, + "learning_rate": 1.2526245293947292e-05, + "loss": 0.0002, + "step": 41420 + }, + { + "epoch": 8.998696785403997, + "grad_norm": 0.0034616016782820225, + "learning_rate": 1.2517195192586157e-05, + "loss": 0.0006, + "step": 41430 + }, + { + "epoch": 9.0, + "eval_f1": 0.5344129554655871, + "eval_loss": 0.07725337892770767, + "eval_runtime": 83.1311, + "eval_samples_per_second": 119.991, + "eval_steps_per_second": 7.506, + "step": 41436 + }, + { + "epoch": 9.000868809730669, + "grad_norm": 0.0007821933249942958, + "learning_rate": 1.2508145091225024e-05, + "loss": 0.0001, + "step": 41440 + }, + { + "epoch": 9.003040834057341, + "grad_norm": 0.0006520500173792243, + "learning_rate": 1.2499094989863887e-05, + "loss": 0.0009, + "step": 41450 + }, + { + "epoch": 9.005212858384015, + "grad_norm": 0.17783880233764648, + "learning_rate": 1.2490044888502751e-05, + "loss": 0.0102, + "step": 41460 + }, + { + "epoch": 9.007384882710687, + "grad_norm": 0.0009389424230903387, + "learning_rate": 1.2480994787141617e-05, + "loss": 0.0001, + "step": 41470 + }, + { + "epoch": 9.009556907037359, + "grad_norm": 0.0015950956149026752, + "learning_rate": 1.2471944685780482e-05, + "loss": 0.0089, + "step": 41480 + }, + { + "epoch": 9.011728931364031, + "grad_norm": 0.00043575678137131035, + "learning_rate": 1.2462894584419346e-05, + "loss": 0.0001, + "step": 41490 + }, + { + "epoch": 9.013900955690703, + "grad_norm": 0.00048693272401578724, + "learning_rate": 1.245384448305821e-05, + "loss": 0.0001, + "step": 41500 + }, + { + "epoch": 9.016072980017377, + "grad_norm": 0.0023798400070518255, + "learning_rate": 1.2444794381697075e-05, + "loss": 0.0214, + "step": 41510 + }, + { + "epoch": 9.018245004344049, + "grad_norm": 0.0006145633524283767, + "learning_rate": 1.2435744280335939e-05, + "loss": 0.0029, + "step": 41520 + }, + { + "epoch": 9.020417028670721, + "grad_norm": 0.00047548647853545845, + "learning_rate": 1.2426694178974805e-05, + "loss": 0.0003, + "step": 41530 + }, + { + "epoch": 9.022589052997393, + "grad_norm": 0.0006101019098423421, + "learning_rate": 1.241764407761367e-05, + "loss": 0.0001, + "step": 41540 + }, + { + "epoch": 9.024761077324065, + "grad_norm": 0.000736563524696976, + "learning_rate": 1.2408593976252536e-05, + "loss": 0.0001, + "step": 41550 + }, + { + "epoch": 9.026933101650739, + "grad_norm": 0.0004573471669573337, + "learning_rate": 1.23995438748914e-05, + "loss": 0.0, + "step": 41560 + }, + { + "epoch": 9.029105125977411, + "grad_norm": 0.00043995672604069114, + "learning_rate": 1.2390493773530264e-05, + "loss": 0.0007, + "step": 41570 + }, + { + "epoch": 9.031277150304083, + "grad_norm": 0.0004224168078508228, + "learning_rate": 1.2381443672169129e-05, + "loss": 0.0, + "step": 41580 + }, + { + "epoch": 9.033449174630755, + "grad_norm": 0.00042783026583492756, + "learning_rate": 1.2372393570807995e-05, + "loss": 0.0001, + "step": 41590 + }, + { + "epoch": 9.035621198957429, + "grad_norm": 0.003369641024619341, + "learning_rate": 1.2363343469446859e-05, + "loss": 0.0003, + "step": 41600 + }, + { + "epoch": 9.037793223284101, + "grad_norm": 0.0005451114848256111, + "learning_rate": 1.2354293368085723e-05, + "loss": 0.0001, + "step": 41610 + }, + { + "epoch": 9.039965247610773, + "grad_norm": 0.0005068237660452724, + "learning_rate": 1.2345243266724588e-05, + "loss": 0.0001, + "step": 41620 + }, + { + "epoch": 9.042137271937445, + "grad_norm": 0.0003632347797974944, + "learning_rate": 1.2336193165363452e-05, + "loss": 0.0001, + "step": 41630 + }, + { + "epoch": 9.044309296264117, + "grad_norm": 0.0005446246359497309, + "learning_rate": 1.2327143064002318e-05, + "loss": 0.0001, + "step": 41640 + }, + { + "epoch": 9.046481320590791, + "grad_norm": 0.0005631268722936511, + "learning_rate": 1.2318092962641182e-05, + "loss": 0.0, + "step": 41650 + }, + { + "epoch": 9.048653344917463, + "grad_norm": 0.0007002344354987144, + "learning_rate": 1.2309042861280047e-05, + "loss": 0.0001, + "step": 41660 + }, + { + "epoch": 9.050825369244135, + "grad_norm": 0.0005598576390184462, + "learning_rate": 1.2299992759918911e-05, + "loss": 0.0042, + "step": 41670 + }, + { + "epoch": 9.052997393570807, + "grad_norm": 0.0006812529754824936, + "learning_rate": 1.2290942658557775e-05, + "loss": 0.0038, + "step": 41680 + }, + { + "epoch": 9.055169417897481, + "grad_norm": 0.0003943377232644707, + "learning_rate": 1.2281892557196641e-05, + "loss": 0.0153, + "step": 41690 + }, + { + "epoch": 9.057341442224153, + "grad_norm": 0.00037149019772186875, + "learning_rate": 1.2272842455835506e-05, + "loss": 0.0, + "step": 41700 + }, + { + "epoch": 9.059513466550825, + "grad_norm": 0.0005129647324793041, + "learning_rate": 1.2263792354474372e-05, + "loss": 0.0, + "step": 41710 + }, + { + "epoch": 9.061685490877498, + "grad_norm": 0.00036699813790619373, + "learning_rate": 1.2254742253113236e-05, + "loss": 0.0, + "step": 41720 + }, + { + "epoch": 9.06385751520417, + "grad_norm": 0.00037562238867394626, + "learning_rate": 1.22456921517521e-05, + "loss": 0.0047, + "step": 41730 + }, + { + "epoch": 9.066029539530843, + "grad_norm": 0.00037983700167387724, + "learning_rate": 1.2236642050390965e-05, + "loss": 0.0184, + "step": 41740 + }, + { + "epoch": 9.068201563857516, + "grad_norm": 0.0007805681088939309, + "learning_rate": 1.2227591949029829e-05, + "loss": 0.0046, + "step": 41750 + }, + { + "epoch": 9.070373588184188, + "grad_norm": 0.00037100425106473267, + "learning_rate": 1.2218541847668695e-05, + "loss": 0.0001, + "step": 41760 + }, + { + "epoch": 9.07254561251086, + "grad_norm": 0.0004620937106665224, + "learning_rate": 1.220949174630756e-05, + "loss": 0.0001, + "step": 41770 + }, + { + "epoch": 9.074717636837532, + "grad_norm": 0.00037084464565850794, + "learning_rate": 1.2200441644946424e-05, + "loss": 0.0, + "step": 41780 + }, + { + "epoch": 9.076889661164206, + "grad_norm": 0.00036705503589473665, + "learning_rate": 1.2191391543585288e-05, + "loss": 0.0, + "step": 41790 + }, + { + "epoch": 9.079061685490878, + "grad_norm": 0.022258194163441658, + "learning_rate": 1.2182341442224152e-05, + "loss": 0.0001, + "step": 41800 + }, + { + "epoch": 9.08123370981755, + "grad_norm": 0.0010294326348230243, + "learning_rate": 1.2173291340863018e-05, + "loss": 0.0, + "step": 41810 + }, + { + "epoch": 9.083405734144222, + "grad_norm": 0.00035624494194053113, + "learning_rate": 1.2164241239501883e-05, + "loss": 0.0101, + "step": 41820 + }, + { + "epoch": 9.085577758470896, + "grad_norm": 0.00036939565325155854, + "learning_rate": 1.2155191138140747e-05, + "loss": 0.0, + "step": 41830 + }, + { + "epoch": 9.087749782797568, + "grad_norm": 0.0003831423236988485, + "learning_rate": 1.2146141036779613e-05, + "loss": 0.0, + "step": 41840 + }, + { + "epoch": 9.08992180712424, + "grad_norm": 0.0003595768066588789, + "learning_rate": 1.2137090935418478e-05, + "loss": 0.0, + "step": 41850 + }, + { + "epoch": 9.092093831450912, + "grad_norm": 0.00042781129013746977, + "learning_rate": 1.2128040834057342e-05, + "loss": 0.0, + "step": 41860 + }, + { + "epoch": 9.094265855777584, + "grad_norm": 0.00036690133856609464, + "learning_rate": 1.2118990732696208e-05, + "loss": 0.0, + "step": 41870 + }, + { + "epoch": 9.096437880104258, + "grad_norm": 0.00036495571839623153, + "learning_rate": 1.2109940631335072e-05, + "loss": 0.0, + "step": 41880 + }, + { + "epoch": 9.09860990443093, + "grad_norm": 0.00035877502523362637, + "learning_rate": 1.2100890529973937e-05, + "loss": 0.0, + "step": 41890 + }, + { + "epoch": 9.100781928757602, + "grad_norm": 0.0006629744893871248, + "learning_rate": 1.2091840428612801e-05, + "loss": 0.0, + "step": 41900 + }, + { + "epoch": 9.102953953084274, + "grad_norm": 0.0003537225420586765, + "learning_rate": 1.2082790327251665e-05, + "loss": 0.0034, + "step": 41910 + }, + { + "epoch": 9.105125977410948, + "grad_norm": 0.0003657103516161442, + "learning_rate": 1.207374022589053e-05, + "loss": 0.0, + "step": 41920 + }, + { + "epoch": 9.10729800173762, + "grad_norm": 0.0004622082051355392, + "learning_rate": 1.2064690124529396e-05, + "loss": 0.0, + "step": 41930 + }, + { + "epoch": 9.109470026064292, + "grad_norm": 0.0004466444079298526, + "learning_rate": 1.205564002316826e-05, + "loss": 0.0011, + "step": 41940 + }, + { + "epoch": 9.111642050390964, + "grad_norm": 0.0007034140289761126, + "learning_rate": 1.2046589921807124e-05, + "loss": 0.0489, + "step": 41950 + }, + { + "epoch": 9.113814074717636, + "grad_norm": 0.0019044640939682722, + "learning_rate": 1.2037539820445989e-05, + "loss": 0.0001, + "step": 41960 + }, + { + "epoch": 9.11598609904431, + "grad_norm": 0.0009093311382457614, + "learning_rate": 1.2028489719084855e-05, + "loss": 0.0292, + "step": 41970 + }, + { + "epoch": 9.118158123370982, + "grad_norm": 0.0011715837754309177, + "learning_rate": 1.2019439617723719e-05, + "loss": 0.0001, + "step": 41980 + }, + { + "epoch": 9.120330147697654, + "grad_norm": 0.0005745973321609199, + "learning_rate": 1.2010389516362585e-05, + "loss": 0.0001, + "step": 41990 + }, + { + "epoch": 9.122502172024326, + "grad_norm": 0.000761601550038904, + "learning_rate": 1.200133941500145e-05, + "loss": 0.0001, + "step": 42000 + }, + { + "epoch": 9.124674196350998, + "grad_norm": 0.000572199176531285, + "learning_rate": 1.1992289313640314e-05, + "loss": 0.0001, + "step": 42010 + }, + { + "epoch": 9.126846220677672, + "grad_norm": 0.00113860541023314, + "learning_rate": 1.1983239212279178e-05, + "loss": 0.0001, + "step": 42020 + }, + { + "epoch": 9.129018245004344, + "grad_norm": 0.000481676310300827, + "learning_rate": 1.1974189110918042e-05, + "loss": 0.0001, + "step": 42030 + }, + { + "epoch": 9.131190269331016, + "grad_norm": 0.0013643910642713308, + "learning_rate": 1.1965139009556908e-05, + "loss": 0.0001, + "step": 42040 + }, + { + "epoch": 9.133362293657688, + "grad_norm": 0.026149902492761612, + "learning_rate": 1.1956088908195773e-05, + "loss": 0.0086, + "step": 42050 + }, + { + "epoch": 9.135534317984362, + "grad_norm": 0.0005695584695786238, + "learning_rate": 1.1947038806834637e-05, + "loss": 0.0061, + "step": 42060 + }, + { + "epoch": 9.137706342311034, + "grad_norm": 0.0012668960262089968, + "learning_rate": 1.1937988705473501e-05, + "loss": 0.0001, + "step": 42070 + }, + { + "epoch": 9.139878366637706, + "grad_norm": 0.0003972501726821065, + "learning_rate": 1.1928938604112366e-05, + "loss": 0.0001, + "step": 42080 + }, + { + "epoch": 9.142050390964378, + "grad_norm": 0.0004158159426879138, + "learning_rate": 1.191988850275123e-05, + "loss": 0.0, + "step": 42090 + }, + { + "epoch": 9.14422241529105, + "grad_norm": 0.0015036650002002716, + "learning_rate": 1.1910838401390096e-05, + "loss": 0.0001, + "step": 42100 + }, + { + "epoch": 9.146394439617724, + "grad_norm": 0.011430252343416214, + "learning_rate": 1.190178830002896e-05, + "loss": 0.0001, + "step": 42110 + }, + { + "epoch": 9.148566463944396, + "grad_norm": 0.0003950317041017115, + "learning_rate": 1.1892738198667827e-05, + "loss": 0.0001, + "step": 42120 + }, + { + "epoch": 9.150738488271069, + "grad_norm": 0.0005114611121825874, + "learning_rate": 1.1883688097306691e-05, + "loss": 0.0033, + "step": 42130 + }, + { + "epoch": 9.15291051259774, + "grad_norm": 0.00044346958748064935, + "learning_rate": 1.1874637995945555e-05, + "loss": 0.0001, + "step": 42140 + }, + { + "epoch": 9.155082536924414, + "grad_norm": 0.00038853855221532285, + "learning_rate": 1.186558789458442e-05, + "loss": 0.0, + "step": 42150 + }, + { + "epoch": 9.157254561251086, + "grad_norm": 0.0003854171955026686, + "learning_rate": 1.1856537793223286e-05, + "loss": 0.0, + "step": 42160 + }, + { + "epoch": 9.159426585577759, + "grad_norm": 0.0006933953263796866, + "learning_rate": 1.184748769186215e-05, + "loss": 0.0, + "step": 42170 + }, + { + "epoch": 9.16159860990443, + "grad_norm": 0.00041202042484655976, + "learning_rate": 1.1838437590501014e-05, + "loss": 0.0001, + "step": 42180 + }, + { + "epoch": 9.163770634231103, + "grad_norm": 0.0004293081583455205, + "learning_rate": 1.1829387489139879e-05, + "loss": 0.0, + "step": 42190 + }, + { + "epoch": 9.165942658557777, + "grad_norm": 0.0003802312712650746, + "learning_rate": 1.1820337387778743e-05, + "loss": 0.0, + "step": 42200 + }, + { + "epoch": 9.168114682884449, + "grad_norm": 0.15740595757961273, + "learning_rate": 1.1811287286417609e-05, + "loss": 0.0001, + "step": 42210 + }, + { + "epoch": 9.17028670721112, + "grad_norm": 0.00046438779099844396, + "learning_rate": 1.1802237185056473e-05, + "loss": 0.0034, + "step": 42220 + }, + { + "epoch": 9.172458731537793, + "grad_norm": 0.06625476479530334, + "learning_rate": 1.1793187083695338e-05, + "loss": 0.0001, + "step": 42230 + }, + { + "epoch": 9.174630755864465, + "grad_norm": 0.0003445383335929364, + "learning_rate": 1.1784136982334202e-05, + "loss": 0.0, + "step": 42240 + }, + { + "epoch": 9.176802780191139, + "grad_norm": 0.00036485432065092027, + "learning_rate": 1.1775086880973066e-05, + "loss": 0.0016, + "step": 42250 + }, + { + "epoch": 9.17897480451781, + "grad_norm": 0.0023723444901406765, + "learning_rate": 1.1766036779611932e-05, + "loss": 0.0001, + "step": 42260 + }, + { + "epoch": 9.181146828844483, + "grad_norm": 0.0036938569974154234, + "learning_rate": 1.1756986678250798e-05, + "loss": 0.0001, + "step": 42270 + }, + { + "epoch": 9.183318853171155, + "grad_norm": 0.0008719200850464404, + "learning_rate": 1.1747936576889663e-05, + "loss": 0.0001, + "step": 42280 + }, + { + "epoch": 9.185490877497829, + "grad_norm": 0.0011817128397524357, + "learning_rate": 1.1738886475528527e-05, + "loss": 0.0047, + "step": 42290 + }, + { + "epoch": 9.1876629018245, + "grad_norm": 0.0005411884048953652, + "learning_rate": 1.1729836374167391e-05, + "loss": 0.0056, + "step": 42300 + }, + { + "epoch": 9.189834926151173, + "grad_norm": 0.0003679801884572953, + "learning_rate": 1.1720786272806256e-05, + "loss": 0.0, + "step": 42310 + }, + { + "epoch": 9.192006950477845, + "grad_norm": 0.0003470522933639586, + "learning_rate": 1.171173617144512e-05, + "loss": 0.0, + "step": 42320 + }, + { + "epoch": 9.194178974804517, + "grad_norm": 0.0004540376248769462, + "learning_rate": 1.1702686070083986e-05, + "loss": 0.0, + "step": 42330 + }, + { + "epoch": 9.196350999131191, + "grad_norm": 0.0004474143497645855, + "learning_rate": 1.169363596872285e-05, + "loss": 0.0239, + "step": 42340 + }, + { + "epoch": 9.198523023457863, + "grad_norm": 0.0003767700691241771, + "learning_rate": 1.1684585867361715e-05, + "loss": 0.0001, + "step": 42350 + }, + { + "epoch": 9.200695047784535, + "grad_norm": 0.006673621945083141, + "learning_rate": 1.167553576600058e-05, + "loss": 0.0001, + "step": 42360 + }, + { + "epoch": 9.202867072111207, + "grad_norm": 0.00039552341331727803, + "learning_rate": 1.1666485664639443e-05, + "loss": 0.0, + "step": 42370 + }, + { + "epoch": 9.20503909643788, + "grad_norm": 0.004886478651314974, + "learning_rate": 1.1657435563278308e-05, + "loss": 0.0001, + "step": 42380 + }, + { + "epoch": 9.207211120764553, + "grad_norm": 0.0011676917783915997, + "learning_rate": 1.1648385461917174e-05, + "loss": 0.0001, + "step": 42390 + }, + { + "epoch": 9.209383145091225, + "grad_norm": 0.0006531733088195324, + "learning_rate": 1.1639335360556038e-05, + "loss": 0.0001, + "step": 42400 + }, + { + "epoch": 9.211555169417897, + "grad_norm": 0.00045200990280136466, + "learning_rate": 1.1630285259194904e-05, + "loss": 0.0062, + "step": 42410 + }, + { + "epoch": 9.21372719374457, + "grad_norm": 0.0005120881251059473, + "learning_rate": 1.1621235157833769e-05, + "loss": 0.0, + "step": 42420 + }, + { + "epoch": 9.215899218071243, + "grad_norm": 0.001150411320850253, + "learning_rate": 1.1612185056472633e-05, + "loss": 0.0001, + "step": 42430 + }, + { + "epoch": 9.218071242397915, + "grad_norm": 0.0007097712368704379, + "learning_rate": 1.1603134955111499e-05, + "loss": 0.0001, + "step": 42440 + }, + { + "epoch": 9.220243266724587, + "grad_norm": 0.00039890228072181344, + "learning_rate": 1.1594084853750363e-05, + "loss": 0.0061, + "step": 42450 + }, + { + "epoch": 9.22241529105126, + "grad_norm": 0.0003842746955342591, + "learning_rate": 1.1585034752389228e-05, + "loss": 0.0001, + "step": 42460 + }, + { + "epoch": 9.224587315377931, + "grad_norm": 0.0003777845704462379, + "learning_rate": 1.1575984651028092e-05, + "loss": 0.0, + "step": 42470 + }, + { + "epoch": 9.226759339704605, + "grad_norm": 0.0031357433181256056, + "learning_rate": 1.1566934549666956e-05, + "loss": 0.0036, + "step": 42480 + }, + { + "epoch": 9.228931364031277, + "grad_norm": 0.0004119759250897914, + "learning_rate": 1.155788444830582e-05, + "loss": 0.0001, + "step": 42490 + }, + { + "epoch": 9.23110338835795, + "grad_norm": 0.0005053351633250713, + "learning_rate": 1.1548834346944687e-05, + "loss": 0.0014, + "step": 42500 + }, + { + "epoch": 9.233275412684621, + "grad_norm": 0.00035216548712924123, + "learning_rate": 1.1539784245583551e-05, + "loss": 0.0094, + "step": 42510 + }, + { + "epoch": 9.235447437011295, + "grad_norm": 0.00039408242446370423, + "learning_rate": 1.1530734144222415e-05, + "loss": 0.0, + "step": 42520 + }, + { + "epoch": 9.237619461337967, + "grad_norm": 0.25304147601127625, + "learning_rate": 1.152168404286128e-05, + "loss": 0.0001, + "step": 42530 + }, + { + "epoch": 9.23979148566464, + "grad_norm": 0.0003541614569257945, + "learning_rate": 1.1512633941500146e-05, + "loss": 0.0044, + "step": 42540 + }, + { + "epoch": 9.241963509991312, + "grad_norm": 0.0003510701353661716, + "learning_rate": 1.150358384013901e-05, + "loss": 0.0002, + "step": 42550 + }, + { + "epoch": 9.244135534317984, + "grad_norm": 0.00037902756594121456, + "learning_rate": 1.1494533738777876e-05, + "loss": 0.0001, + "step": 42560 + }, + { + "epoch": 9.246307558644657, + "grad_norm": 0.0019823191687464714, + "learning_rate": 1.148548363741674e-05, + "loss": 0.0001, + "step": 42570 + }, + { + "epoch": 9.24847958297133, + "grad_norm": 0.0004513237508945167, + "learning_rate": 1.1476433536055605e-05, + "loss": 0.0, + "step": 42580 + }, + { + "epoch": 9.250651607298002, + "grad_norm": 0.0013202824629843235, + "learning_rate": 1.1467383434694469e-05, + "loss": 0.0038, + "step": 42590 + }, + { + "epoch": 9.252823631624674, + "grad_norm": 0.00035054978798143566, + "learning_rate": 1.1458333333333333e-05, + "loss": 0.0182, + "step": 42600 + }, + { + "epoch": 9.254995655951348, + "grad_norm": 0.0003493396216072142, + "learning_rate": 1.14492832319722e-05, + "loss": 0.0001, + "step": 42610 + }, + { + "epoch": 9.25716768027802, + "grad_norm": 0.0005488857277669013, + "learning_rate": 1.1440233130611064e-05, + "loss": 0.0, + "step": 42620 + }, + { + "epoch": 9.259339704604692, + "grad_norm": 0.0003464900655671954, + "learning_rate": 1.1431183029249928e-05, + "loss": 0.0068, + "step": 42630 + }, + { + "epoch": 9.261511728931364, + "grad_norm": 0.00038118616794236004, + "learning_rate": 1.1422132927888793e-05, + "loss": 0.0, + "step": 42640 + }, + { + "epoch": 9.263683753258036, + "grad_norm": 0.0003550401597749442, + "learning_rate": 1.1413082826527657e-05, + "loss": 0.0005, + "step": 42650 + }, + { + "epoch": 9.26585577758471, + "grad_norm": 0.0003692580503411591, + "learning_rate": 1.1404032725166521e-05, + "loss": 0.0001, + "step": 42660 + }, + { + "epoch": 9.268027801911382, + "grad_norm": 0.00035148989991284907, + "learning_rate": 1.1394982623805387e-05, + "loss": 0.0, + "step": 42670 + }, + { + "epoch": 9.270199826238054, + "grad_norm": 0.000443141907453537, + "learning_rate": 1.1385932522444252e-05, + "loss": 0.0, + "step": 42680 + }, + { + "epoch": 9.272371850564726, + "grad_norm": 0.0008299656328745186, + "learning_rate": 1.1376882421083118e-05, + "loss": 0.0312, + "step": 42690 + }, + { + "epoch": 9.274543874891398, + "grad_norm": 0.0032954856287688017, + "learning_rate": 1.1367832319721982e-05, + "loss": 0.0002, + "step": 42700 + }, + { + "epoch": 9.276715899218072, + "grad_norm": 0.0026082557160407305, + "learning_rate": 1.1358782218360846e-05, + "loss": 0.0057, + "step": 42710 + }, + { + "epoch": 9.278887923544744, + "grad_norm": 0.0017727608792483807, + "learning_rate": 1.134973211699971e-05, + "loss": 0.0001, + "step": 42720 + }, + { + "epoch": 9.281059947871416, + "grad_norm": 0.2710913121700287, + "learning_rate": 1.1340682015638577e-05, + "loss": 0.0049, + "step": 42730 + }, + { + "epoch": 9.283231972198088, + "grad_norm": 0.03456174209713936, + "learning_rate": 1.1331631914277441e-05, + "loss": 0.0001, + "step": 42740 + }, + { + "epoch": 9.285403996524762, + "grad_norm": 0.0008368910639546812, + "learning_rate": 1.1322581812916305e-05, + "loss": 0.0001, + "step": 42750 + }, + { + "epoch": 9.287576020851434, + "grad_norm": 0.00035930657759308815, + "learning_rate": 1.131353171155517e-05, + "loss": 0.0001, + "step": 42760 + }, + { + "epoch": 9.289748045178106, + "grad_norm": 0.0006045507034286857, + "learning_rate": 1.1304481610194034e-05, + "loss": 0.0, + "step": 42770 + }, + { + "epoch": 9.291920069504778, + "grad_norm": 0.0003401483118068427, + "learning_rate": 1.1295431508832898e-05, + "loss": 0.0001, + "step": 42780 + }, + { + "epoch": 9.29409209383145, + "grad_norm": 0.0005668572848662734, + "learning_rate": 1.1286381407471764e-05, + "loss": 0.0, + "step": 42790 + }, + { + "epoch": 9.296264118158124, + "grad_norm": 0.00042387290159240365, + "learning_rate": 1.1277331306110629e-05, + "loss": 0.0046, + "step": 42800 + }, + { + "epoch": 9.298436142484796, + "grad_norm": 0.0019156066700816154, + "learning_rate": 1.1268281204749493e-05, + "loss": 0.0, + "step": 42810 + }, + { + "epoch": 9.300608166811468, + "grad_norm": 0.0004135738417971879, + "learning_rate": 1.1259231103388359e-05, + "loss": 0.0006, + "step": 42820 + }, + { + "epoch": 9.30278019113814, + "grad_norm": 0.0003466247289907187, + "learning_rate": 1.1250181002027223e-05, + "loss": 0.0, + "step": 42830 + }, + { + "epoch": 9.304952215464812, + "grad_norm": 0.0003484385379124433, + "learning_rate": 1.124113090066609e-05, + "loss": 0.0001, + "step": 42840 + }, + { + "epoch": 9.307124239791486, + "grad_norm": 0.0003456638951320201, + "learning_rate": 1.1232080799304954e-05, + "loss": 0.0039, + "step": 42850 + }, + { + "epoch": 9.309296264118158, + "grad_norm": 0.00045184456394053996, + "learning_rate": 1.1223030697943818e-05, + "loss": 0.0, + "step": 42860 + }, + { + "epoch": 9.31146828844483, + "grad_norm": 0.0007712736260145903, + "learning_rate": 1.1213980596582682e-05, + "loss": 0.0, + "step": 42870 + }, + { + "epoch": 9.313640312771502, + "grad_norm": 0.00035233920789323747, + "learning_rate": 1.1204930495221547e-05, + "loss": 0.0, + "step": 42880 + }, + { + "epoch": 9.315812337098176, + "grad_norm": 0.0008341504144482315, + "learning_rate": 1.1195880393860411e-05, + "loss": 0.0, + "step": 42890 + }, + { + "epoch": 9.317984361424848, + "grad_norm": 0.0003776673402171582, + "learning_rate": 1.1186830292499277e-05, + "loss": 0.0, + "step": 42900 + }, + { + "epoch": 9.32015638575152, + "grad_norm": 0.00034413248067721725, + "learning_rate": 1.1177780191138142e-05, + "loss": 0.0, + "step": 42910 + }, + { + "epoch": 9.322328410078192, + "grad_norm": 0.0004014780279248953, + "learning_rate": 1.1168730089777006e-05, + "loss": 0.0, + "step": 42920 + }, + { + "epoch": 9.324500434404865, + "grad_norm": 0.00033698673360049725, + "learning_rate": 1.115967998841587e-05, + "loss": 0.0001, + "step": 42930 + }, + { + "epoch": 9.326672458731538, + "grad_norm": 0.0003441080334596336, + "learning_rate": 1.1150629887054735e-05, + "loss": 0.0, + "step": 42940 + }, + { + "epoch": 9.32884448305821, + "grad_norm": 0.0003518610610626638, + "learning_rate": 1.1141579785693599e-05, + "loss": 0.0, + "step": 42950 + }, + { + "epoch": 9.331016507384883, + "grad_norm": 0.0003449947398621589, + "learning_rate": 1.1132529684332465e-05, + "loss": 0.0, + "step": 42960 + }, + { + "epoch": 9.333188531711555, + "grad_norm": 0.0003442805027589202, + "learning_rate": 1.112347958297133e-05, + "loss": 0.0034, + "step": 42970 + }, + { + "epoch": 9.335360556038228, + "grad_norm": 0.0005408208235166967, + "learning_rate": 1.1114429481610195e-05, + "loss": 0.0, + "step": 42980 + }, + { + "epoch": 9.3375325803649, + "grad_norm": 0.0003692790924105793, + "learning_rate": 1.110537938024906e-05, + "loss": 0.0, + "step": 42990 + }, + { + "epoch": 9.339704604691573, + "grad_norm": 0.00034044485073536634, + "learning_rate": 1.1096329278887924e-05, + "loss": 0.0, + "step": 43000 + }, + { + "epoch": 9.341876629018245, + "grad_norm": 0.0026759139727801085, + "learning_rate": 1.108727917752679e-05, + "loss": 0.0, + "step": 43010 + }, + { + "epoch": 9.344048653344917, + "grad_norm": 0.0003308370942249894, + "learning_rate": 1.1078229076165654e-05, + "loss": 0.0056, + "step": 43020 + }, + { + "epoch": 9.34622067767159, + "grad_norm": 0.00035013555316254497, + "learning_rate": 1.1069178974804519e-05, + "loss": 0.0, + "step": 43030 + }, + { + "epoch": 9.348392701998263, + "grad_norm": 0.00034313698415644467, + "learning_rate": 1.1060128873443383e-05, + "loss": 0.0, + "step": 43040 + }, + { + "epoch": 9.350564726324935, + "grad_norm": 0.00045629485975950956, + "learning_rate": 1.1051078772082247e-05, + "loss": 0.0001, + "step": 43050 + }, + { + "epoch": 9.352736750651607, + "grad_norm": 0.00033453330979682505, + "learning_rate": 1.1042028670721112e-05, + "loss": 0.0, + "step": 43060 + }, + { + "epoch": 9.35490877497828, + "grad_norm": 0.0017623642925173044, + "learning_rate": 1.1032978569359978e-05, + "loss": 0.0006, + "step": 43070 + }, + { + "epoch": 9.357080799304953, + "grad_norm": 0.0007430663681589067, + "learning_rate": 1.1023928467998842e-05, + "loss": 0.0001, + "step": 43080 + }, + { + "epoch": 9.359252823631625, + "grad_norm": 0.005379736889153719, + "learning_rate": 1.1014878366637706e-05, + "loss": 0.0002, + "step": 43090 + }, + { + "epoch": 9.361424847958297, + "grad_norm": 1.7879000902175903, + "learning_rate": 1.100582826527657e-05, + "loss": 0.0005, + "step": 43100 + }, + { + "epoch": 9.363596872284969, + "grad_norm": 0.000462701718788594, + "learning_rate": 1.0996778163915437e-05, + "loss": 0.0, + "step": 43110 + }, + { + "epoch": 9.365768896611643, + "grad_norm": 0.00035364291397854686, + "learning_rate": 1.0987728062554301e-05, + "loss": 0.0047, + "step": 43120 + }, + { + "epoch": 9.367940920938315, + "grad_norm": 0.00048209200031124055, + "learning_rate": 1.0978677961193167e-05, + "loss": 0.0, + "step": 43130 + }, + { + "epoch": 9.370112945264987, + "grad_norm": 0.0003437796258367598, + "learning_rate": 1.0969627859832031e-05, + "loss": 0.0, + "step": 43140 + }, + { + "epoch": 9.372284969591659, + "grad_norm": 0.00034089843393303454, + "learning_rate": 1.0960577758470896e-05, + "loss": 0.0, + "step": 43150 + }, + { + "epoch": 9.374456993918331, + "grad_norm": 0.00034152084845118225, + "learning_rate": 1.095152765710976e-05, + "loss": 0.0, + "step": 43160 + }, + { + "epoch": 9.376629018245005, + "grad_norm": 0.00033195436117239296, + "learning_rate": 1.0942477555748625e-05, + "loss": 0.0, + "step": 43170 + }, + { + "epoch": 9.378801042571677, + "grad_norm": 0.00033621469628997147, + "learning_rate": 1.0933427454387489e-05, + "loss": 0.0, + "step": 43180 + }, + { + "epoch": 9.380973066898349, + "grad_norm": 0.0003306234139017761, + "learning_rate": 1.0924377353026355e-05, + "loss": 0.0, + "step": 43190 + }, + { + "epoch": 9.383145091225021, + "grad_norm": 0.0003234837204217911, + "learning_rate": 1.091532725166522e-05, + "loss": 0.0, + "step": 43200 + }, + { + "epoch": 9.385317115551695, + "grad_norm": 0.0003395713574718684, + "learning_rate": 1.0906277150304084e-05, + "loss": 0.0, + "step": 43210 + }, + { + "epoch": 9.387489139878367, + "grad_norm": 0.00036190488026477396, + "learning_rate": 1.0897227048942948e-05, + "loss": 0.0, + "step": 43220 + }, + { + "epoch": 9.38966116420504, + "grad_norm": 0.00032648214255459607, + "learning_rate": 1.0888176947581812e-05, + "loss": 0.0, + "step": 43230 + }, + { + "epoch": 9.391833188531711, + "grad_norm": 0.0004206506710033864, + "learning_rate": 1.0879126846220678e-05, + "loss": 0.0, + "step": 43240 + }, + { + "epoch": 9.394005212858383, + "grad_norm": 0.0004962489474564791, + "learning_rate": 1.0870076744859543e-05, + "loss": 0.0, + "step": 43250 + }, + { + "epoch": 9.396177237185057, + "grad_norm": 0.0003266745188739151, + "learning_rate": 1.0861026643498409e-05, + "loss": 0.0, + "step": 43260 + }, + { + "epoch": 9.39834926151173, + "grad_norm": 0.0003320554969832301, + "learning_rate": 1.0851976542137273e-05, + "loss": 0.0, + "step": 43270 + }, + { + "epoch": 9.400521285838401, + "grad_norm": 0.00033323868410661817, + "learning_rate": 1.0842926440776137e-05, + "loss": 0.0042, + "step": 43280 + }, + { + "epoch": 9.402693310165073, + "grad_norm": 0.0003688148863147944, + "learning_rate": 1.0833876339415002e-05, + "loss": 0.0, + "step": 43290 + }, + { + "epoch": 9.404865334491745, + "grad_norm": 0.0004509967693593353, + "learning_rate": 1.0824826238053868e-05, + "loss": 0.0001, + "step": 43300 + }, + { + "epoch": 9.40703735881842, + "grad_norm": 0.0004598804807756096, + "learning_rate": 1.0815776136692732e-05, + "loss": 0.0, + "step": 43310 + }, + { + "epoch": 9.409209383145091, + "grad_norm": 0.0003177253529429436, + "learning_rate": 1.0806726035331596e-05, + "loss": 0.0, + "step": 43320 + }, + { + "epoch": 9.411381407471763, + "grad_norm": 0.00031996675534173846, + "learning_rate": 1.079767593397046e-05, + "loss": 0.0, + "step": 43330 + }, + { + "epoch": 9.413553431798436, + "grad_norm": 0.0003233585739508271, + "learning_rate": 1.0788625832609325e-05, + "loss": 0.0, + "step": 43340 + }, + { + "epoch": 9.41572545612511, + "grad_norm": 0.00032162151183001697, + "learning_rate": 1.077957573124819e-05, + "loss": 0.0, + "step": 43350 + }, + { + "epoch": 9.417897480451781, + "grad_norm": 0.00042038553510792553, + "learning_rate": 1.0770525629887055e-05, + "loss": 0.0038, + "step": 43360 + }, + { + "epoch": 9.420069504778454, + "grad_norm": 0.00036600345629267395, + "learning_rate": 1.076147552852592e-05, + "loss": 0.0, + "step": 43370 + }, + { + "epoch": 9.422241529105126, + "grad_norm": 0.00032596764503978193, + "learning_rate": 1.0752425427164784e-05, + "loss": 0.0001, + "step": 43380 + }, + { + "epoch": 9.424413553431798, + "grad_norm": 0.00031555883469991386, + "learning_rate": 1.074337532580365e-05, + "loss": 0.0, + "step": 43390 + }, + { + "epoch": 9.426585577758472, + "grad_norm": 0.0003250258741900325, + "learning_rate": 1.0734325224442514e-05, + "loss": 0.0, + "step": 43400 + }, + { + "epoch": 9.428757602085144, + "grad_norm": 0.0003132763667963445, + "learning_rate": 1.072527512308138e-05, + "loss": 0.0, + "step": 43410 + }, + { + "epoch": 9.430929626411816, + "grad_norm": 0.0003153624420519918, + "learning_rate": 1.0716225021720245e-05, + "loss": 0.0039, + "step": 43420 + }, + { + "epoch": 9.433101650738488, + "grad_norm": 0.00031512047280557454, + "learning_rate": 1.070717492035911e-05, + "loss": 0.0, + "step": 43430 + }, + { + "epoch": 9.435273675065162, + "grad_norm": 0.00031897996086627245, + "learning_rate": 1.0698124818997974e-05, + "loss": 0.0036, + "step": 43440 + }, + { + "epoch": 9.437445699391834, + "grad_norm": 0.0005444668349809945, + "learning_rate": 1.0689074717636838e-05, + "loss": 0.0, + "step": 43450 + }, + { + "epoch": 9.439617723718506, + "grad_norm": 0.00032076716888695955, + "learning_rate": 1.0680024616275702e-05, + "loss": 0.0, + "step": 43460 + }, + { + "epoch": 9.441789748045178, + "grad_norm": 0.0004752585955429822, + "learning_rate": 1.0670974514914568e-05, + "loss": 0.0, + "step": 43470 + }, + { + "epoch": 9.44396177237185, + "grad_norm": 0.00048087709001265466, + "learning_rate": 1.0661924413553433e-05, + "loss": 0.0, + "step": 43480 + }, + { + "epoch": 9.446133796698524, + "grad_norm": 0.00044686091132462025, + "learning_rate": 1.0652874312192297e-05, + "loss": 0.0, + "step": 43490 + }, + { + "epoch": 9.448305821025196, + "grad_norm": 0.00031458461307920516, + "learning_rate": 1.0643824210831161e-05, + "loss": 0.0, + "step": 43500 + }, + { + "epoch": 9.450477845351868, + "grad_norm": 0.0003263599646743387, + "learning_rate": 1.0634774109470026e-05, + "loss": 0.0001, + "step": 43510 + }, + { + "epoch": 9.45264986967854, + "grad_norm": 0.00030845761648379266, + "learning_rate": 1.062572400810889e-05, + "loss": 0.0067, + "step": 43520 + }, + { + "epoch": 9.454821894005212, + "grad_norm": 0.00030990008963271976, + "learning_rate": 1.0616673906747756e-05, + "loss": 0.0, + "step": 43530 + }, + { + "epoch": 9.456993918331886, + "grad_norm": 0.000313706899760291, + "learning_rate": 1.0607623805386622e-05, + "loss": 0.0, + "step": 43540 + }, + { + "epoch": 9.459165942658558, + "grad_norm": 0.0003094021521974355, + "learning_rate": 1.0598573704025486e-05, + "loss": 0.0, + "step": 43550 + }, + { + "epoch": 9.46133796698523, + "grad_norm": 0.0004979989607818425, + "learning_rate": 1.058952360266435e-05, + "loss": 0.0058, + "step": 43560 + }, + { + "epoch": 9.463509991311902, + "grad_norm": 0.0003103738999925554, + "learning_rate": 1.0580473501303215e-05, + "loss": 0.0, + "step": 43570 + }, + { + "epoch": 9.465682015638576, + "grad_norm": 0.00030864804284647107, + "learning_rate": 1.057142339994208e-05, + "loss": 0.0, + "step": 43580 + }, + { + "epoch": 9.467854039965248, + "grad_norm": 0.00030936236726120114, + "learning_rate": 1.0562373298580945e-05, + "loss": 0.0, + "step": 43590 + }, + { + "epoch": 9.47002606429192, + "grad_norm": 0.0003084845084231347, + "learning_rate": 1.055332319721981e-05, + "loss": 0.0, + "step": 43600 + }, + { + "epoch": 9.472198088618592, + "grad_norm": 0.00030784294358454645, + "learning_rate": 1.0544273095858674e-05, + "loss": 0.0, + "step": 43610 + }, + { + "epoch": 9.474370112945264, + "grad_norm": 0.0003053463879041374, + "learning_rate": 1.0535222994497538e-05, + "loss": 0.0, + "step": 43620 + }, + { + "epoch": 9.476542137271938, + "grad_norm": 0.00030371634056791663, + "learning_rate": 1.0526172893136403e-05, + "loss": 0.0, + "step": 43630 + }, + { + "epoch": 9.47871416159861, + "grad_norm": 0.0034877255093306303, + "learning_rate": 1.0517122791775269e-05, + "loss": 0.0049, + "step": 43640 + }, + { + "epoch": 9.480886185925282, + "grad_norm": 0.00030639933538623154, + "learning_rate": 1.0508072690414133e-05, + "loss": 0.0043, + "step": 43650 + }, + { + "epoch": 9.483058210251954, + "grad_norm": 0.0003288072475697845, + "learning_rate": 1.0499022589052997e-05, + "loss": 0.0, + "step": 43660 + }, + { + "epoch": 9.485230234578628, + "grad_norm": 0.0003033705288544297, + "learning_rate": 1.0489972487691862e-05, + "loss": 0.0, + "step": 43670 + }, + { + "epoch": 9.4874022589053, + "grad_norm": 0.00031130280694924295, + "learning_rate": 1.0480922386330728e-05, + "loss": 0.0, + "step": 43680 + }, + { + "epoch": 9.489574283231972, + "grad_norm": 0.003796252654865384, + "learning_rate": 1.0471872284969592e-05, + "loss": 0.0, + "step": 43690 + }, + { + "epoch": 9.491746307558644, + "grad_norm": 0.0003003615129273385, + "learning_rate": 1.0462822183608458e-05, + "loss": 0.0058, + "step": 43700 + }, + { + "epoch": 9.493918331885316, + "grad_norm": 0.00043470592936500907, + "learning_rate": 1.0453772082247323e-05, + "loss": 0.0, + "step": 43710 + }, + { + "epoch": 9.49609035621199, + "grad_norm": 0.0003017736307810992, + "learning_rate": 1.0444721980886187e-05, + "loss": 0.0157, + "step": 43720 + }, + { + "epoch": 9.498262380538662, + "grad_norm": 0.00031047649099491537, + "learning_rate": 1.0435671879525051e-05, + "loss": 0.0, + "step": 43730 + }, + { + "epoch": 9.500434404865334, + "grad_norm": 0.27765092253685, + "learning_rate": 1.0426621778163916e-05, + "loss": 0.0052, + "step": 43740 + }, + { + "epoch": 9.502606429192006, + "grad_norm": 0.00031226209830492735, + "learning_rate": 1.041757167680278e-05, + "loss": 0.0, + "step": 43750 + }, + { + "epoch": 9.504778453518679, + "grad_norm": 0.00032370752887800336, + "learning_rate": 1.0408521575441646e-05, + "loss": 0.0001, + "step": 43760 + }, + { + "epoch": 9.506950477845352, + "grad_norm": 0.0005092357750982046, + "learning_rate": 1.039947147408051e-05, + "loss": 0.0, + "step": 43770 + }, + { + "epoch": 9.509122502172024, + "grad_norm": 0.0003147079551126808, + "learning_rate": 1.0390421372719375e-05, + "loss": 0.0047, + "step": 43780 + }, + { + "epoch": 9.511294526498697, + "grad_norm": 0.00042028294410556555, + "learning_rate": 1.0381371271358239e-05, + "loss": 0.0, + "step": 43790 + }, + { + "epoch": 9.513466550825369, + "grad_norm": 0.0003288674633949995, + "learning_rate": 1.0372321169997103e-05, + "loss": 0.0001, + "step": 43800 + }, + { + "epoch": 9.515638575152042, + "grad_norm": 0.0003080323222093284, + "learning_rate": 1.036327106863597e-05, + "loss": 0.0, + "step": 43810 + }, + { + "epoch": 9.517810599478715, + "grad_norm": 0.00030454035731963813, + "learning_rate": 1.0354220967274834e-05, + "loss": 0.0, + "step": 43820 + }, + { + "epoch": 9.519982623805387, + "grad_norm": 0.00030711121507920325, + "learning_rate": 1.03451708659137e-05, + "loss": 0.0, + "step": 43830 + }, + { + "epoch": 9.522154648132059, + "grad_norm": 0.00031087957904674113, + "learning_rate": 1.0336120764552564e-05, + "loss": 0.0068, + "step": 43840 + }, + { + "epoch": 9.52432667245873, + "grad_norm": 0.0006094526033848524, + "learning_rate": 1.0327070663191428e-05, + "loss": 0.0, + "step": 43850 + }, + { + "epoch": 9.526498696785405, + "grad_norm": 0.00036623451160266995, + "learning_rate": 1.0318020561830293e-05, + "loss": 0.0, + "step": 43860 + }, + { + "epoch": 9.528670721112077, + "grad_norm": 0.0005215948331169784, + "learning_rate": 1.0308970460469159e-05, + "loss": 0.0, + "step": 43870 + }, + { + "epoch": 9.530842745438749, + "grad_norm": 0.0004186656151432544, + "learning_rate": 1.0299920359108023e-05, + "loss": 0.0004, + "step": 43880 + }, + { + "epoch": 9.53301476976542, + "grad_norm": 0.0003144047223031521, + "learning_rate": 1.0290870257746887e-05, + "loss": 0.0, + "step": 43890 + }, + { + "epoch": 9.535186794092095, + "grad_norm": 0.00030454093939624727, + "learning_rate": 1.0281820156385752e-05, + "loss": 0.0, + "step": 43900 + }, + { + "epoch": 9.537358818418767, + "grad_norm": 0.0003108079545199871, + "learning_rate": 1.0272770055024616e-05, + "loss": 0.0, + "step": 43910 + }, + { + "epoch": 9.539530842745439, + "grad_norm": 0.0003084295312874019, + "learning_rate": 1.026371995366348e-05, + "loss": 0.0, + "step": 43920 + }, + { + "epoch": 9.541702867072111, + "grad_norm": 0.00048682757187634706, + "learning_rate": 1.0254669852302346e-05, + "loss": 0.0, + "step": 43930 + }, + { + "epoch": 9.543874891398783, + "grad_norm": 0.000300971616525203, + "learning_rate": 1.024561975094121e-05, + "loss": 0.0432, + "step": 43940 + }, + { + "epoch": 9.546046915725457, + "grad_norm": 0.0003139590844511986, + "learning_rate": 1.0236569649580075e-05, + "loss": 0.0001, + "step": 43950 + }, + { + "epoch": 9.548218940052129, + "grad_norm": 8.009101867675781, + "learning_rate": 1.0227519548218941e-05, + "loss": 0.0572, + "step": 43960 + }, + { + "epoch": 9.550390964378801, + "grad_norm": 0.0003988199750892818, + "learning_rate": 1.0218469446857806e-05, + "loss": 0.0, + "step": 43970 + }, + { + "epoch": 9.552562988705473, + "grad_norm": 0.0009871821384876966, + "learning_rate": 1.020941934549667e-05, + "loss": 0.0, + "step": 43980 + }, + { + "epoch": 9.554735013032147, + "grad_norm": 0.0007358947768807411, + "learning_rate": 1.0200369244135536e-05, + "loss": 0.005, + "step": 43990 + }, + { + "epoch": 9.556907037358819, + "grad_norm": 0.0010422732448205352, + "learning_rate": 1.01913191427744e-05, + "loss": 0.0001, + "step": 44000 + }, + { + "epoch": 9.559079061685491, + "grad_norm": 0.0018725309055298567, + "learning_rate": 1.0182269041413265e-05, + "loss": 0.0002, + "step": 44010 + }, + { + "epoch": 9.561251086012163, + "grad_norm": 0.0004772630927618593, + "learning_rate": 1.0173218940052129e-05, + "loss": 0.0041, + "step": 44020 + }, + { + "epoch": 9.563423110338835, + "grad_norm": 0.0005594859248958528, + "learning_rate": 1.0164168838690993e-05, + "loss": 0.0086, + "step": 44030 + }, + { + "epoch": 9.565595134665509, + "grad_norm": 0.00434377184137702, + "learning_rate": 1.015511873732986e-05, + "loss": 0.0001, + "step": 44040 + }, + { + "epoch": 9.567767158992181, + "grad_norm": 0.0006473969551734626, + "learning_rate": 1.0146068635968724e-05, + "loss": 0.0036, + "step": 44050 + }, + { + "epoch": 9.569939183318853, + "grad_norm": 0.0010716840624809265, + "learning_rate": 1.0137018534607588e-05, + "loss": 0.0, + "step": 44060 + }, + { + "epoch": 9.572111207645525, + "grad_norm": 0.0012535667046904564, + "learning_rate": 1.0127968433246452e-05, + "loss": 0.0, + "step": 44070 + }, + { + "epoch": 9.574283231972197, + "grad_norm": 0.0003759227111004293, + "learning_rate": 1.0118918331885317e-05, + "loss": 0.0057, + "step": 44080 + }, + { + "epoch": 9.576455256298871, + "grad_norm": 0.000605809735134244, + "learning_rate": 1.0109868230524181e-05, + "loss": 0.0, + "step": 44090 + }, + { + "epoch": 9.578627280625543, + "grad_norm": 0.0003496602294035256, + "learning_rate": 1.0100818129163047e-05, + "loss": 0.0, + "step": 44100 + }, + { + "epoch": 9.580799304952215, + "grad_norm": 0.00036835906212218106, + "learning_rate": 1.0091768027801913e-05, + "loss": 0.0, + "step": 44110 + }, + { + "epoch": 9.582971329278887, + "grad_norm": 0.0003797638928517699, + "learning_rate": 1.0082717926440777e-05, + "loss": 0.0, + "step": 44120 + }, + { + "epoch": 9.58514335360556, + "grad_norm": 0.0003261095262132585, + "learning_rate": 1.0073667825079642e-05, + "loss": 0.0, + "step": 44130 + }, + { + "epoch": 9.587315377932233, + "grad_norm": 0.00031275799847207963, + "learning_rate": 1.0064617723718506e-05, + "loss": 0.0, + "step": 44140 + }, + { + "epoch": 9.589487402258905, + "grad_norm": 0.0005705951480194926, + "learning_rate": 1.005556762235737e-05, + "loss": 0.0001, + "step": 44150 + }, + { + "epoch": 9.591659426585577, + "grad_norm": 0.0004992802278138697, + "learning_rate": 1.0046517520996236e-05, + "loss": 0.0, + "step": 44160 + }, + { + "epoch": 9.59383145091225, + "grad_norm": 0.000599740247707814, + "learning_rate": 1.00374674196351e-05, + "loss": 0.0001, + "step": 44170 + }, + { + "epoch": 9.596003475238923, + "grad_norm": 0.1466888040304184, + "learning_rate": 1.0028417318273965e-05, + "loss": 0.0036, + "step": 44180 + }, + { + "epoch": 9.598175499565595, + "grad_norm": 0.0003275081980973482, + "learning_rate": 1.001936721691283e-05, + "loss": 0.0, + "step": 44190 + }, + { + "epoch": 9.600347523892268, + "grad_norm": 0.00044672199874185026, + "learning_rate": 1.0010317115551694e-05, + "loss": 0.0, + "step": 44200 + }, + { + "epoch": 9.60251954821894, + "grad_norm": 0.00030891658389009535, + "learning_rate": 1.0001267014190558e-05, + "loss": 0.0, + "step": 44210 + }, + { + "epoch": 9.604691572545612, + "grad_norm": 0.0003040438750758767, + "learning_rate": 9.992216912829424e-06, + "loss": 0.0001, + "step": 44220 + }, + { + "epoch": 9.606863596872286, + "grad_norm": 0.007986625656485558, + "learning_rate": 9.983166811468288e-06, + "loss": 0.0, + "step": 44230 + }, + { + "epoch": 9.609035621198958, + "grad_norm": 0.0003891444648616016, + "learning_rate": 9.974116710107153e-06, + "loss": 0.0, + "step": 44240 + }, + { + "epoch": 9.61120764552563, + "grad_norm": 0.0005247259396128356, + "learning_rate": 9.965066608746019e-06, + "loss": 0.0057, + "step": 44250 + }, + { + "epoch": 9.613379669852302, + "grad_norm": 0.0003096143191214651, + "learning_rate": 9.956016507384883e-06, + "loss": 0.0, + "step": 44260 + }, + { + "epoch": 9.615551694178976, + "grad_norm": 0.0003360881528351456, + "learning_rate": 9.94696640602375e-06, + "loss": 0.0, + "step": 44270 + }, + { + "epoch": 9.617723718505648, + "grad_norm": 0.00029326791991479695, + "learning_rate": 9.937916304662614e-06, + "loss": 0.0, + "step": 44280 + }, + { + "epoch": 9.61989574283232, + "grad_norm": 0.0003966445801779628, + "learning_rate": 9.928866203301478e-06, + "loss": 0.0, + "step": 44290 + }, + { + "epoch": 9.622067767158992, + "grad_norm": 0.0006641658837907016, + "learning_rate": 9.919816101940342e-06, + "loss": 0.0001, + "step": 44300 + }, + { + "epoch": 9.624239791485664, + "grad_norm": 0.0003245220868848264, + "learning_rate": 9.910766000579207e-06, + "loss": 0.0045, + "step": 44310 + }, + { + "epoch": 9.626411815812338, + "grad_norm": 0.0003133430436719209, + "learning_rate": 9.901715899218071e-06, + "loss": 0.0, + "step": 44320 + }, + { + "epoch": 9.62858384013901, + "grad_norm": 0.0003544339269865304, + "learning_rate": 9.892665797856937e-06, + "loss": 0.0409, + "step": 44330 + }, + { + "epoch": 9.630755864465682, + "grad_norm": 0.0011449995217844844, + "learning_rate": 9.883615696495801e-06, + "loss": 0.0001, + "step": 44340 + }, + { + "epoch": 9.632927888792354, + "grad_norm": 0.0006344979046843946, + "learning_rate": 9.874565595134666e-06, + "loss": 0.0001, + "step": 44350 + }, + { + "epoch": 9.635099913119028, + "grad_norm": 0.01246078684926033, + "learning_rate": 9.86551549377353e-06, + "loss": 0.0054, + "step": 44360 + }, + { + "epoch": 9.6372719374457, + "grad_norm": 0.0012406118912622333, + "learning_rate": 9.856465392412394e-06, + "loss": 0.0001, + "step": 44370 + }, + { + "epoch": 9.639443961772372, + "grad_norm": 0.00043785336310975254, + "learning_rate": 9.84741529105126e-06, + "loss": 0.0, + "step": 44380 + }, + { + "epoch": 9.641615986099044, + "grad_norm": 0.00039675500011071563, + "learning_rate": 9.838365189690125e-06, + "loss": 0.0001, + "step": 44390 + }, + { + "epoch": 9.643788010425716, + "grad_norm": 0.0011241411557421088, + "learning_rate": 9.82931508832899e-06, + "loss": 0.0001, + "step": 44400 + }, + { + "epoch": 9.64596003475239, + "grad_norm": 0.00045028634485788643, + "learning_rate": 9.820264986967855e-06, + "loss": 0.0001, + "step": 44410 + }, + { + "epoch": 9.648132059079062, + "grad_norm": 0.0003788200847338885, + "learning_rate": 9.81121488560672e-06, + "loss": 0.0, + "step": 44420 + }, + { + "epoch": 9.650304083405734, + "grad_norm": 0.00032590579940006137, + "learning_rate": 9.802164784245584e-06, + "loss": 0.0001, + "step": 44430 + }, + { + "epoch": 9.652476107732406, + "grad_norm": 0.0004667961038649082, + "learning_rate": 9.79311468288445e-06, + "loss": 0.0001, + "step": 44440 + }, + { + "epoch": 9.654648132059078, + "grad_norm": 0.0004204972938168794, + "learning_rate": 9.784064581523314e-06, + "loss": 0.0, + "step": 44450 + }, + { + "epoch": 9.656820156385752, + "grad_norm": 0.22058351337909698, + "learning_rate": 9.775014480162178e-06, + "loss": 0.0052, + "step": 44460 + }, + { + "epoch": 9.658992180712424, + "grad_norm": 0.0003753335040528327, + "learning_rate": 9.765964378801043e-06, + "loss": 0.0001, + "step": 44470 + }, + { + "epoch": 9.661164205039096, + "grad_norm": 0.0003022757591679692, + "learning_rate": 9.756914277439907e-06, + "loss": 0.0, + "step": 44480 + }, + { + "epoch": 9.663336229365768, + "grad_norm": 0.0008022760157473385, + "learning_rate": 9.747864176078771e-06, + "loss": 0.0, + "step": 44490 + }, + { + "epoch": 9.665508253692442, + "grad_norm": 0.0003668908029794693, + "learning_rate": 9.738814074717638e-06, + "loss": 0.0001, + "step": 44500 + }, + { + "epoch": 9.667680278019114, + "grad_norm": 0.00030293557210825384, + "learning_rate": 9.729763973356502e-06, + "loss": 0.0, + "step": 44510 + }, + { + "epoch": 9.669852302345786, + "grad_norm": 0.00043993687722831964, + "learning_rate": 9.720713871995366e-06, + "loss": 0.0, + "step": 44520 + }, + { + "epoch": 9.672024326672458, + "grad_norm": 0.0004386714135762304, + "learning_rate": 9.711663770634232e-06, + "loss": 0.0059, + "step": 44530 + }, + { + "epoch": 9.67419635099913, + "grad_norm": 0.001005807425826788, + "learning_rate": 9.702613669273097e-06, + "loss": 0.0, + "step": 44540 + }, + { + "epoch": 9.676368375325804, + "grad_norm": 0.0004584961279761046, + "learning_rate": 9.693563567911961e-06, + "loss": 0.0045, + "step": 44550 + }, + { + "epoch": 9.678540399652476, + "grad_norm": 0.0004154407943133265, + "learning_rate": 9.684513466550827e-06, + "loss": 0.0, + "step": 44560 + }, + { + "epoch": 9.680712423979148, + "grad_norm": 0.15402913093566895, + "learning_rate": 9.675463365189691e-06, + "loss": 0.0038, + "step": 44570 + }, + { + "epoch": 9.68288444830582, + "grad_norm": 0.0003599036717787385, + "learning_rate": 9.666413263828556e-06, + "loss": 0.0045, + "step": 44580 + }, + { + "epoch": 9.685056472632493, + "grad_norm": 0.0002962095313705504, + "learning_rate": 9.65736316246742e-06, + "loss": 0.0, + "step": 44590 + }, + { + "epoch": 9.687228496959166, + "grad_norm": 0.0003011043299920857, + "learning_rate": 9.648313061106284e-06, + "loss": 0.0004, + "step": 44600 + }, + { + "epoch": 9.689400521285839, + "grad_norm": 0.0003057145804632455, + "learning_rate": 9.639262959745149e-06, + "loss": 0.0, + "step": 44610 + }, + { + "epoch": 9.69157254561251, + "grad_norm": 0.004641965497285128, + "learning_rate": 9.630212858384015e-06, + "loss": 0.0131, + "step": 44620 + }, + { + "epoch": 9.693744569939183, + "grad_norm": 0.001000256510451436, + "learning_rate": 9.621162757022879e-06, + "loss": 0.0001, + "step": 44630 + }, + { + "epoch": 9.695916594265857, + "grad_norm": 0.0003455234400462359, + "learning_rate": 9.612112655661743e-06, + "loss": 0.0038, + "step": 44640 + }, + { + "epoch": 9.698088618592529, + "grad_norm": 0.0002961833088193089, + "learning_rate": 9.603062554300608e-06, + "loss": 0.0011, + "step": 44650 + }, + { + "epoch": 9.7002606429192, + "grad_norm": 0.0002967560722026974, + "learning_rate": 9.594012452939474e-06, + "loss": 0.0, + "step": 44660 + }, + { + "epoch": 9.702432667245873, + "grad_norm": 0.0003102968621533364, + "learning_rate": 9.584962351578338e-06, + "loss": 0.0, + "step": 44670 + }, + { + "epoch": 9.704604691572545, + "grad_norm": 0.00029592696228064597, + "learning_rate": 9.575912250217204e-06, + "loss": 0.0, + "step": 44680 + }, + { + "epoch": 9.706776715899219, + "grad_norm": 0.00031772974762134254, + "learning_rate": 9.566862148856068e-06, + "loss": 0.0, + "step": 44690 + }, + { + "epoch": 9.70894874022589, + "grad_norm": 0.00029936572536826134, + "learning_rate": 9.557812047494933e-06, + "loss": 0.0, + "step": 44700 + }, + { + "epoch": 9.711120764552563, + "grad_norm": 0.00072198745328933, + "learning_rate": 9.548761946133797e-06, + "loss": 0.0001, + "step": 44710 + }, + { + "epoch": 9.713292788879235, + "grad_norm": 0.00028860653401352465, + "learning_rate": 9.539711844772661e-06, + "loss": 0.005, + "step": 44720 + }, + { + "epoch": 9.715464813205909, + "grad_norm": 15.439363479614258, + "learning_rate": 9.530661743411527e-06, + "loss": 0.0426, + "step": 44730 + }, + { + "epoch": 9.71763683753258, + "grad_norm": 0.0002893265336751938, + "learning_rate": 9.521611642050392e-06, + "loss": 0.0035, + "step": 44740 + }, + { + "epoch": 9.719808861859253, + "grad_norm": 0.0003055331180803478, + "learning_rate": 9.512561540689256e-06, + "loss": 0.0, + "step": 44750 + }, + { + "epoch": 9.721980886185925, + "grad_norm": 0.00029819607152603567, + "learning_rate": 9.50351143932812e-06, + "loss": 0.0, + "step": 44760 + }, + { + "epoch": 9.724152910512597, + "grad_norm": 0.00038984580896794796, + "learning_rate": 9.494461337966985e-06, + "loss": 0.0, + "step": 44770 + }, + { + "epoch": 9.72632493483927, + "grad_norm": 0.0006327672745101154, + "learning_rate": 9.48541123660585e-06, + "loss": 0.0, + "step": 44780 + }, + { + "epoch": 9.728496959165943, + "grad_norm": 0.0003006533079314977, + "learning_rate": 9.476361135244715e-06, + "loss": 0.0, + "step": 44790 + }, + { + "epoch": 9.730668983492615, + "grad_norm": 0.006074481178075075, + "learning_rate": 9.46731103388358e-06, + "loss": 0.0001, + "step": 44800 + }, + { + "epoch": 9.732841007819287, + "grad_norm": 0.00029355884180404246, + "learning_rate": 9.458260932522444e-06, + "loss": 0.0052, + "step": 44810 + }, + { + "epoch": 9.735013032145961, + "grad_norm": 0.00029236814589239657, + "learning_rate": 9.44921083116131e-06, + "loss": 0.0, + "step": 44820 + }, + { + "epoch": 9.737185056472633, + "grad_norm": 0.00391758419573307, + "learning_rate": 9.440160729800174e-06, + "loss": 0.004, + "step": 44830 + }, + { + "epoch": 9.739357080799305, + "grad_norm": 0.0002860849490389228, + "learning_rate": 9.43111062843904e-06, + "loss": 0.0, + "step": 44840 + }, + { + "epoch": 9.741529105125977, + "grad_norm": 0.00028976661269553006, + "learning_rate": 9.422060527077905e-06, + "loss": 0.0, + "step": 44850 + }, + { + "epoch": 9.74370112945265, + "grad_norm": 0.0003024868783541024, + "learning_rate": 9.413010425716769e-06, + "loss": 0.0, + "step": 44860 + }, + { + "epoch": 9.745873153779323, + "grad_norm": 0.0003289765154477209, + "learning_rate": 9.403960324355633e-06, + "loss": 0.0, + "step": 44870 + }, + { + "epoch": 9.748045178105995, + "grad_norm": 0.00030651132692582905, + "learning_rate": 9.394910222994498e-06, + "loss": 0.0001, + "step": 44880 + }, + { + "epoch": 9.750217202432667, + "grad_norm": 0.0009024749742820859, + "learning_rate": 9.385860121633362e-06, + "loss": 0.0, + "step": 44890 + }, + { + "epoch": 9.75238922675934, + "grad_norm": 0.000285855756374076, + "learning_rate": 9.376810020272228e-06, + "loss": 0.0, + "step": 44900 + }, + { + "epoch": 9.754561251086011, + "grad_norm": 0.0003339897666592151, + "learning_rate": 9.367759918911092e-06, + "loss": 0.0, + "step": 44910 + }, + { + "epoch": 9.756733275412685, + "grad_norm": 0.00028623559046536684, + "learning_rate": 9.358709817549957e-06, + "loss": 0.0068, + "step": 44920 + }, + { + "epoch": 9.758905299739357, + "grad_norm": 0.00029339815955609083, + "learning_rate": 9.349659716188821e-06, + "loss": 0.0, + "step": 44930 + }, + { + "epoch": 9.76107732406603, + "grad_norm": 0.00038780056638643146, + "learning_rate": 9.340609614827685e-06, + "loss": 0.0, + "step": 44940 + }, + { + "epoch": 9.763249348392701, + "grad_norm": 0.0005002043908461928, + "learning_rate": 9.331559513466551e-06, + "loss": 0.0055, + "step": 44950 + }, + { + "epoch": 9.765421372719375, + "grad_norm": 0.00046502629993483424, + "learning_rate": 9.322509412105416e-06, + "loss": 0.0, + "step": 44960 + }, + { + "epoch": 9.767593397046047, + "grad_norm": 0.0007907067192718387, + "learning_rate": 9.313459310744282e-06, + "loss": 0.0, + "step": 44970 + }, + { + "epoch": 9.76976542137272, + "grad_norm": 0.00028519314946606755, + "learning_rate": 9.304409209383146e-06, + "loss": 0.0, + "step": 44980 + }, + { + "epoch": 9.771937445699391, + "grad_norm": 0.001314941211603582, + "learning_rate": 9.29535910802201e-06, + "loss": 0.0, + "step": 44990 + }, + { + "epoch": 9.774109470026064, + "grad_norm": 0.0015476990956813097, + "learning_rate": 9.286309006660875e-06, + "loss": 0.0, + "step": 45000 + }, + { + "epoch": 9.776281494352737, + "grad_norm": 0.00029333692509680986, + "learning_rate": 9.277258905299739e-06, + "loss": 0.0, + "step": 45010 + }, + { + "epoch": 9.77845351867941, + "grad_norm": 0.0005370720755308867, + "learning_rate": 9.268208803938605e-06, + "loss": 0.0, + "step": 45020 + }, + { + "epoch": 9.780625543006082, + "grad_norm": 0.0005170804797671735, + "learning_rate": 9.25915870257747e-06, + "loss": 0.0, + "step": 45030 + }, + { + "epoch": 9.782797567332754, + "grad_norm": 0.00028901023324579, + "learning_rate": 9.250108601216334e-06, + "loss": 0.0, + "step": 45040 + }, + { + "epoch": 9.784969591659426, + "grad_norm": 0.0002907784946728498, + "learning_rate": 9.241058499855198e-06, + "loss": 0.0001, + "step": 45050 + }, + { + "epoch": 9.7871416159861, + "grad_norm": 0.0002850813325494528, + "learning_rate": 9.232008398494063e-06, + "loss": 0.0, + "step": 45060 + }, + { + "epoch": 9.789313640312772, + "grad_norm": 0.0004792496911250055, + "learning_rate": 9.222958297132929e-06, + "loss": 0.0052, + "step": 45070 + }, + { + "epoch": 9.791485664639444, + "grad_norm": 0.0003039147413801402, + "learning_rate": 9.213908195771793e-06, + "loss": 0.0055, + "step": 45080 + }, + { + "epoch": 9.793657688966116, + "grad_norm": 0.0002853998448699713, + "learning_rate": 9.204858094410657e-06, + "loss": 0.0, + "step": 45090 + }, + { + "epoch": 9.79582971329279, + "grad_norm": 0.00031133025186136365, + "learning_rate": 9.195807993049523e-06, + "loss": 0.0043, + "step": 45100 + }, + { + "epoch": 9.798001737619462, + "grad_norm": 0.0002923411375377327, + "learning_rate": 9.186757891688388e-06, + "loss": 0.0, + "step": 45110 + }, + { + "epoch": 9.800173761946134, + "grad_norm": 0.0005165540496818721, + "learning_rate": 9.177707790327252e-06, + "loss": 0.0, + "step": 45120 + }, + { + "epoch": 9.802345786272806, + "grad_norm": 0.000464085751445964, + "learning_rate": 9.168657688966118e-06, + "loss": 0.0, + "step": 45130 + }, + { + "epoch": 9.804517810599478, + "grad_norm": 0.0002842270187102258, + "learning_rate": 9.159607587604982e-06, + "loss": 0.0, + "step": 45140 + }, + { + "epoch": 9.806689834926152, + "grad_norm": 0.00037417063140310347, + "learning_rate": 9.150557486243847e-06, + "loss": 0.0, + "step": 45150 + }, + { + "epoch": 9.808861859252824, + "grad_norm": 0.0002841727982740849, + "learning_rate": 9.141507384882711e-06, + "loss": 0.0001, + "step": 45160 + }, + { + "epoch": 9.811033883579496, + "grad_norm": 0.00028294374351389706, + "learning_rate": 9.132457283521575e-06, + "loss": 0.0001, + "step": 45170 + }, + { + "epoch": 9.813205907906168, + "grad_norm": 0.0004307189374230802, + "learning_rate": 9.12340718216044e-06, + "loss": 0.0048, + "step": 45180 + }, + { + "epoch": 9.815377932232842, + "grad_norm": 0.0002827845746651292, + "learning_rate": 9.114357080799306e-06, + "loss": 0.0, + "step": 45190 + }, + { + "epoch": 9.817549956559514, + "grad_norm": 0.00047424028161913157, + "learning_rate": 9.10530697943817e-06, + "loss": 0.0, + "step": 45200 + }, + { + "epoch": 9.819721980886186, + "grad_norm": 0.00027991452952846885, + "learning_rate": 9.096256878077034e-06, + "loss": 0.0085, + "step": 45210 + }, + { + "epoch": 9.821894005212858, + "grad_norm": 0.00029596214881166816, + "learning_rate": 9.087206776715899e-06, + "loss": 0.0, + "step": 45220 + }, + { + "epoch": 9.82406602953953, + "grad_norm": 0.00028860315796919167, + "learning_rate": 9.078156675354765e-06, + "loss": 0.0, + "step": 45230 + }, + { + "epoch": 9.826238053866204, + "grad_norm": 0.000579252140596509, + "learning_rate": 9.069106573993629e-06, + "loss": 0.0, + "step": 45240 + }, + { + "epoch": 9.828410078192876, + "grad_norm": 0.0004747594066429883, + "learning_rate": 9.060056472632495e-06, + "loss": 0.0, + "step": 45250 + }, + { + "epoch": 9.830582102519548, + "grad_norm": 0.0003230497823096812, + "learning_rate": 9.05100637127136e-06, + "loss": 0.0, + "step": 45260 + }, + { + "epoch": 9.83275412684622, + "grad_norm": 0.0002852856705430895, + "learning_rate": 9.041956269910224e-06, + "loss": 0.0, + "step": 45270 + }, + { + "epoch": 9.834926151172894, + "grad_norm": 0.0002863154513761401, + "learning_rate": 9.032906168549088e-06, + "loss": 0.0045, + "step": 45280 + }, + { + "epoch": 9.837098175499566, + "grad_norm": 0.0002846615097951144, + "learning_rate": 9.023856067187952e-06, + "loss": 0.0, + "step": 45290 + }, + { + "epoch": 9.839270199826238, + "grad_norm": 0.00028884748462587595, + "learning_rate": 9.014805965826819e-06, + "loss": 0.0, + "step": 45300 + }, + { + "epoch": 9.84144222415291, + "grad_norm": 0.0002860878885257989, + "learning_rate": 9.005755864465683e-06, + "loss": 0.0, + "step": 45310 + }, + { + "epoch": 9.843614248479582, + "grad_norm": 0.0004514564643613994, + "learning_rate": 8.996705763104547e-06, + "loss": 0.0, + "step": 45320 + }, + { + "epoch": 9.845786272806256, + "grad_norm": 0.0002968948392663151, + "learning_rate": 8.987655661743412e-06, + "loss": 0.0, + "step": 45330 + }, + { + "epoch": 9.847958297132928, + "grad_norm": 0.0002735159359872341, + "learning_rate": 8.978605560382276e-06, + "loss": 0.0, + "step": 45340 + }, + { + "epoch": 9.8501303214596, + "grad_norm": 0.0004131652240175754, + "learning_rate": 8.96955545902114e-06, + "loss": 0.0, + "step": 45350 + }, + { + "epoch": 9.852302345786272, + "grad_norm": 0.0002853725745808333, + "learning_rate": 8.960505357660006e-06, + "loss": 0.0, + "step": 45360 + }, + { + "epoch": 9.854474370112944, + "grad_norm": 0.001196343800984323, + "learning_rate": 8.95145525629887e-06, + "loss": 0.0, + "step": 45370 + }, + { + "epoch": 9.856646394439618, + "grad_norm": 0.000468127807835117, + "learning_rate": 8.942405154937737e-06, + "loss": 0.0, + "step": 45380 + }, + { + "epoch": 9.85881841876629, + "grad_norm": 0.0002808228600770235, + "learning_rate": 8.933355053576601e-06, + "loss": 0.0, + "step": 45390 + }, + { + "epoch": 9.860990443092962, + "grad_norm": 0.00028429063968360424, + "learning_rate": 8.924304952215465e-06, + "loss": 0.0, + "step": 45400 + }, + { + "epoch": 9.863162467419635, + "grad_norm": 0.0003117309242952615, + "learning_rate": 8.91525485085433e-06, + "loss": 0.0, + "step": 45410 + }, + { + "epoch": 9.865334491746308, + "grad_norm": 0.00027799929375760257, + "learning_rate": 8.906204749493196e-06, + "loss": 0.0, + "step": 45420 + }, + { + "epoch": 9.86750651607298, + "grad_norm": 0.0002811710874084383, + "learning_rate": 8.89715464813206e-06, + "loss": 0.0, + "step": 45430 + }, + { + "epoch": 9.869678540399653, + "grad_norm": 0.0002788385027088225, + "learning_rate": 8.888104546770924e-06, + "loss": 0.0, + "step": 45440 + }, + { + "epoch": 9.871850564726325, + "grad_norm": 0.0002747518883552402, + "learning_rate": 8.879054445409789e-06, + "loss": 0.0, + "step": 45450 + }, + { + "epoch": 9.874022589052997, + "grad_norm": 0.0004052985750604421, + "learning_rate": 8.870004344048653e-06, + "loss": 0.0, + "step": 45460 + }, + { + "epoch": 9.87619461337967, + "grad_norm": 0.00027624680660665035, + "learning_rate": 8.860954242687519e-06, + "loss": 0.0, + "step": 45470 + }, + { + "epoch": 9.878366637706343, + "grad_norm": 0.00027512721135281026, + "learning_rate": 8.851904141326383e-06, + "loss": 0.0054, + "step": 45480 + }, + { + "epoch": 9.880538662033015, + "grad_norm": 0.000608505099080503, + "learning_rate": 8.842854039965248e-06, + "loss": 0.0, + "step": 45490 + }, + { + "epoch": 9.882710686359687, + "grad_norm": 0.00027142054750584066, + "learning_rate": 8.833803938604112e-06, + "loss": 0.0048, + "step": 45500 + }, + { + "epoch": 9.884882710686359, + "grad_norm": 0.00027802452677860856, + "learning_rate": 8.824753837242976e-06, + "loss": 0.0, + "step": 45510 + }, + { + "epoch": 9.887054735013033, + "grad_norm": 0.00029872122104279697, + "learning_rate": 8.815703735881842e-06, + "loss": 0.0, + "step": 45520 + }, + { + "epoch": 9.889226759339705, + "grad_norm": 0.000274182966677472, + "learning_rate": 8.806653634520708e-06, + "loss": 0.0, + "step": 45530 + }, + { + "epoch": 9.891398783666377, + "grad_norm": 0.00028276850935071707, + "learning_rate": 8.797603533159573e-06, + "loss": 0.0, + "step": 45540 + }, + { + "epoch": 9.893570807993049, + "grad_norm": 0.00027952558593824506, + "learning_rate": 8.788553431798437e-06, + "loss": 0.0, + "step": 45550 + }, + { + "epoch": 9.895742832319723, + "grad_norm": 0.0002717770403251052, + "learning_rate": 8.779503330437301e-06, + "loss": 0.0, + "step": 45560 + }, + { + "epoch": 9.897914856646395, + "grad_norm": 0.00030153506668284535, + "learning_rate": 8.770453229076166e-06, + "loss": 0.0039, + "step": 45570 + }, + { + "epoch": 9.900086880973067, + "grad_norm": 0.00028142359224148095, + "learning_rate": 8.76140312771503e-06, + "loss": 0.0, + "step": 45580 + }, + { + "epoch": 9.902258905299739, + "grad_norm": 0.0002846510033123195, + "learning_rate": 8.752353026353896e-06, + "loss": 0.0, + "step": 45590 + }, + { + "epoch": 9.904430929626411, + "grad_norm": 0.00043699092930182815, + "learning_rate": 8.74330292499276e-06, + "loss": 0.0, + "step": 45600 + }, + { + "epoch": 9.906602953953085, + "grad_norm": 0.00026862454251386225, + "learning_rate": 8.734252823631625e-06, + "loss": 0.0, + "step": 45610 + }, + { + "epoch": 9.908774978279757, + "grad_norm": 0.0002761590003501624, + "learning_rate": 8.72520272227049e-06, + "loss": 0.0, + "step": 45620 + }, + { + "epoch": 9.910947002606429, + "grad_norm": 0.0004086109984200448, + "learning_rate": 8.716152620909354e-06, + "loss": 0.0, + "step": 45630 + }, + { + "epoch": 9.913119026933101, + "grad_norm": 0.00026836665347218513, + "learning_rate": 8.70710251954822e-06, + "loss": 0.0, + "step": 45640 + }, + { + "epoch": 9.915291051259775, + "grad_norm": 0.00026761431945487857, + "learning_rate": 8.698052418187084e-06, + "loss": 0.0, + "step": 45650 + }, + { + "epoch": 9.917463075586447, + "grad_norm": 0.00026968028396368027, + "learning_rate": 8.689002316825948e-06, + "loss": 0.0, + "step": 45660 + }, + { + "epoch": 9.919635099913119, + "grad_norm": 0.0003493023104965687, + "learning_rate": 8.679952215464814e-06, + "loss": 0.0041, + "step": 45670 + }, + { + "epoch": 9.921807124239791, + "grad_norm": 0.0002837933134287596, + "learning_rate": 8.670902114103679e-06, + "loss": 0.0, + "step": 45680 + }, + { + "epoch": 9.923979148566463, + "grad_norm": 0.00027246540412306786, + "learning_rate": 8.661852012742543e-06, + "loss": 0.0, + "step": 45690 + }, + { + "epoch": 9.926151172893137, + "grad_norm": 0.00026831854484044015, + "learning_rate": 8.652801911381409e-06, + "loss": 0.0, + "step": 45700 + }, + { + "epoch": 9.92832319721981, + "grad_norm": 0.00026500673266127706, + "learning_rate": 8.643751810020273e-06, + "loss": 0.0, + "step": 45710 + }, + { + "epoch": 9.930495221546481, + "grad_norm": 0.0002658453886397183, + "learning_rate": 8.634701708659138e-06, + "loss": 0.0039, + "step": 45720 + }, + { + "epoch": 9.932667245873153, + "grad_norm": 0.000474416243378073, + "learning_rate": 8.625651607298002e-06, + "loss": 0.0056, + "step": 45730 + }, + { + "epoch": 9.934839270199827, + "grad_norm": 0.0003779761027544737, + "learning_rate": 8.616601505936866e-06, + "loss": 0.0, + "step": 45740 + }, + { + "epoch": 9.9370112945265, + "grad_norm": 0.0002920727420132607, + "learning_rate": 8.60755140457573e-06, + "loss": 0.0, + "step": 45750 + }, + { + "epoch": 9.939183318853171, + "grad_norm": 0.00026948191225528717, + "learning_rate": 8.598501303214597e-06, + "loss": 0.0, + "step": 45760 + }, + { + "epoch": 9.941355343179843, + "grad_norm": 0.0003025501500815153, + "learning_rate": 8.589451201853461e-06, + "loss": 0.0, + "step": 45770 + }, + { + "epoch": 9.943527367506515, + "grad_norm": 0.00027173449052497745, + "learning_rate": 8.580401100492325e-06, + "loss": 0.0, + "step": 45780 + }, + { + "epoch": 9.94569939183319, + "grad_norm": 0.00026526564033702016, + "learning_rate": 8.57135099913119e-06, + "loss": 0.0, + "step": 45790 + }, + { + "epoch": 9.947871416159861, + "grad_norm": 0.00026329734828323126, + "learning_rate": 8.562300897770056e-06, + "loss": 0.0, + "step": 45800 + }, + { + "epoch": 9.950043440486533, + "grad_norm": 0.0003687392745632678, + "learning_rate": 8.55325079640892e-06, + "loss": 0.0, + "step": 45810 + }, + { + "epoch": 9.952215464813206, + "grad_norm": 0.0006631419528275728, + "learning_rate": 8.544200695047786e-06, + "loss": 0.0, + "step": 45820 + }, + { + "epoch": 9.954387489139878, + "grad_norm": 0.00026899162912741303, + "learning_rate": 8.53515059368665e-06, + "loss": 0.0, + "step": 45830 + }, + { + "epoch": 9.956559513466551, + "grad_norm": 0.0002637408615555614, + "learning_rate": 8.526100492325515e-06, + "loss": 0.0, + "step": 45840 + }, + { + "epoch": 9.958731537793224, + "grad_norm": 0.000278302701190114, + "learning_rate": 8.51705039096438e-06, + "loss": 0.0, + "step": 45850 + }, + { + "epoch": 9.960903562119896, + "grad_norm": 0.00026116601657122374, + "learning_rate": 8.508000289603244e-06, + "loss": 0.0, + "step": 45860 + }, + { + "epoch": 9.963075586446568, + "grad_norm": 0.0002752375148702413, + "learning_rate": 8.49895018824211e-06, + "loss": 0.0, + "step": 45870 + }, + { + "epoch": 9.96524761077324, + "grad_norm": 0.00026467558927834034, + "learning_rate": 8.489900086880974e-06, + "loss": 0.0, + "step": 45880 + }, + { + "epoch": 9.967419635099914, + "grad_norm": 0.0004597961960826069, + "learning_rate": 8.480849985519838e-06, + "loss": 0.0, + "step": 45890 + }, + { + "epoch": 9.969591659426586, + "grad_norm": 0.00026340316981077194, + "learning_rate": 8.471799884158703e-06, + "loss": 0.0, + "step": 45900 + }, + { + "epoch": 9.971763683753258, + "grad_norm": 0.0002597050915937871, + "learning_rate": 8.462749782797567e-06, + "loss": 0.0, + "step": 45910 + }, + { + "epoch": 9.97393570807993, + "grad_norm": 0.00026734822313301265, + "learning_rate": 8.453699681436431e-06, + "loss": 0.0, + "step": 45920 + }, + { + "epoch": 9.976107732406604, + "grad_norm": 0.0002649006200954318, + "learning_rate": 8.444649580075297e-06, + "loss": 0.0, + "step": 45930 + }, + { + "epoch": 9.978279756733276, + "grad_norm": 0.00026825052918866277, + "learning_rate": 8.435599478714162e-06, + "loss": 0.0, + "step": 45940 + }, + { + "epoch": 9.980451781059948, + "grad_norm": 0.0002630847448017448, + "learning_rate": 8.426549377353028e-06, + "loss": 0.0056, + "step": 45950 + }, + { + "epoch": 9.98262380538662, + "grad_norm": 0.0011709785321727395, + "learning_rate": 8.417499275991892e-06, + "loss": 0.0, + "step": 45960 + }, + { + "epoch": 9.984795829713292, + "grad_norm": 0.00026459337095730007, + "learning_rate": 8.408449174630756e-06, + "loss": 0.0, + "step": 45970 + }, + { + "epoch": 9.986967854039966, + "grad_norm": 0.0002779015921987593, + "learning_rate": 8.39939907326962e-06, + "loss": 0.0, + "step": 45980 + }, + { + "epoch": 9.989139878366638, + "grad_norm": 0.00027847522869706154, + "learning_rate": 8.390348971908487e-06, + "loss": 0.0, + "step": 45990 + }, + { + "epoch": 9.99131190269331, + "grad_norm": 0.0005873033660463989, + "learning_rate": 8.381298870547351e-06, + "loss": 0.0039, + "step": 46000 + }, + { + "epoch": 9.993483927019982, + "grad_norm": 0.00026374112349003553, + "learning_rate": 8.372248769186215e-06, + "loss": 0.0, + "step": 46010 + }, + { + "epoch": 9.995655951346656, + "grad_norm": 0.00026576846721582115, + "learning_rate": 8.36319866782508e-06, + "loss": 0.0, + "step": 46020 + }, + { + "epoch": 9.997827975673328, + "grad_norm": 0.0003927880898118019, + "learning_rate": 8.354148566463944e-06, + "loss": 0.0, + "step": 46030 + }, + { + "epoch": 10.0, + "grad_norm": 0.00025913305580616, + "learning_rate": 8.34509846510281e-06, + "loss": 0.0, + "step": 46040 + }, + { + "epoch": 10.0, + "eval_f1": 0.5868725868725868, + "eval_loss": 0.08650576323270798, + "eval_runtime": 83.6602, + "eval_samples_per_second": 119.232, + "eval_steps_per_second": 7.459, + "step": 46040 + }, + { + "epoch": 10.002172024326672, + "grad_norm": 0.0003737666120287031, + "learning_rate": 8.336048363741674e-06, + "loss": 0.0, + "step": 46050 + }, + { + "epoch": 10.004344048653344, + "grad_norm": 0.000261695240624249, + "learning_rate": 8.326998262380539e-06, + "loss": 0.0038, + "step": 46060 + }, + { + "epoch": 10.006516072980018, + "grad_norm": 0.0002736767055466771, + "learning_rate": 8.317948161019403e-06, + "loss": 0.0, + "step": 46070 + }, + { + "epoch": 10.00868809730669, + "grad_norm": 0.00025755647220648825, + "learning_rate": 8.308898059658267e-06, + "loss": 0.0, + "step": 46080 + }, + { + "epoch": 10.010860121633362, + "grad_norm": 0.0005032480112276971, + "learning_rate": 8.299847958297133e-06, + "loss": 0.0, + "step": 46090 + }, + { + "epoch": 10.013032145960034, + "grad_norm": 0.0003156476595904678, + "learning_rate": 8.290797856936e-06, + "loss": 0.0, + "step": 46100 + }, + { + "epoch": 10.015204170286708, + "grad_norm": 0.0002580749278422445, + "learning_rate": 8.281747755574864e-06, + "loss": 0.0056, + "step": 46110 + }, + { + "epoch": 10.01737619461338, + "grad_norm": 0.0004497791233006865, + "learning_rate": 8.272697654213728e-06, + "loss": 0.0, + "step": 46120 + }, + { + "epoch": 10.019548218940052, + "grad_norm": 0.0002614956465549767, + "learning_rate": 8.263647552852593e-06, + "loss": 0.0, + "step": 46130 + }, + { + "epoch": 10.021720243266724, + "grad_norm": 0.00025961664505302906, + "learning_rate": 8.254597451491457e-06, + "loss": 0.0, + "step": 46140 + }, + { + "epoch": 10.023892267593396, + "grad_norm": 0.0003735600912477821, + "learning_rate": 8.245547350130321e-06, + "loss": 0.0, + "step": 46150 + }, + { + "epoch": 10.02606429192007, + "grad_norm": 0.0002574862737674266, + "learning_rate": 8.236497248769187e-06, + "loss": 0.0, + "step": 46160 + }, + { + "epoch": 10.028236316246742, + "grad_norm": 0.0002677075390238315, + "learning_rate": 8.227447147408052e-06, + "loss": 0.0, + "step": 46170 + }, + { + "epoch": 10.030408340573414, + "grad_norm": 0.00025923867360688746, + "learning_rate": 8.218397046046916e-06, + "loss": 0.0, + "step": 46180 + }, + { + "epoch": 10.032580364900086, + "grad_norm": 0.0003291449975222349, + "learning_rate": 8.20934694468578e-06, + "loss": 0.0, + "step": 46190 + }, + { + "epoch": 10.034752389226758, + "grad_norm": 0.0002590891672298312, + "learning_rate": 8.200296843324645e-06, + "loss": 0.0, + "step": 46200 + }, + { + "epoch": 10.036924413553432, + "grad_norm": 0.000344390602549538, + "learning_rate": 8.191246741963509e-06, + "loss": 0.0, + "step": 46210 + }, + { + "epoch": 10.039096437880104, + "grad_norm": 0.0002652654657140374, + "learning_rate": 8.182196640602375e-06, + "loss": 0.0, + "step": 46220 + }, + { + "epoch": 10.041268462206776, + "grad_norm": 0.0002611863019410521, + "learning_rate": 8.17314653924124e-06, + "loss": 0.0, + "step": 46230 + }, + { + "epoch": 10.043440486533449, + "grad_norm": 0.00025722087593749166, + "learning_rate": 8.164096437880105e-06, + "loss": 0.0, + "step": 46240 + }, + { + "epoch": 10.045612510860122, + "grad_norm": 0.00026090044411830604, + "learning_rate": 8.15504633651897e-06, + "loss": 0.0, + "step": 46250 + }, + { + "epoch": 10.047784535186794, + "grad_norm": 0.0002696911687962711, + "learning_rate": 8.145996235157834e-06, + "loss": 0.0, + "step": 46260 + }, + { + "epoch": 10.049956559513467, + "grad_norm": 0.0002560535504017025, + "learning_rate": 8.1369461337967e-06, + "loss": 0.0, + "step": 46270 + }, + { + "epoch": 10.052128583840139, + "grad_norm": 0.0002532984653953463, + "learning_rate": 8.127896032435564e-06, + "loss": 0.0, + "step": 46280 + }, + { + "epoch": 10.05430060816681, + "grad_norm": 0.0002560800057835877, + "learning_rate": 8.118845931074429e-06, + "loss": 0.0, + "step": 46290 + }, + { + "epoch": 10.056472632493485, + "grad_norm": 0.00025648128939792514, + "learning_rate": 8.109795829713293e-06, + "loss": 0.0, + "step": 46300 + }, + { + "epoch": 10.058644656820157, + "grad_norm": 0.00025430944515392184, + "learning_rate": 8.100745728352157e-06, + "loss": 0.0037, + "step": 46310 + }, + { + "epoch": 10.060816681146829, + "grad_norm": 0.00025855813873931766, + "learning_rate": 8.091695626991022e-06, + "loss": 0.0, + "step": 46320 + }, + { + "epoch": 10.0629887054735, + "grad_norm": 0.00025462303892709315, + "learning_rate": 8.082645525629888e-06, + "loss": 0.0039, + "step": 46330 + }, + { + "epoch": 10.065160729800175, + "grad_norm": 0.000251170014962554, + "learning_rate": 8.073595424268752e-06, + "loss": 0.0, + "step": 46340 + }, + { + "epoch": 10.067332754126847, + "grad_norm": 0.00026278331642970443, + "learning_rate": 8.064545322907616e-06, + "loss": 0.0034, + "step": 46350 + }, + { + "epoch": 10.069504778453519, + "grad_norm": 0.000444703153334558, + "learning_rate": 8.05549522154648e-06, + "loss": 0.0062, + "step": 46360 + }, + { + "epoch": 10.07167680278019, + "grad_norm": 0.0002562769514042884, + "learning_rate": 8.046445120185347e-06, + "loss": 0.0, + "step": 46370 + }, + { + "epoch": 10.073848827106863, + "grad_norm": 0.0030441167764365673, + "learning_rate": 8.037395018824211e-06, + "loss": 0.0, + "step": 46380 + }, + { + "epoch": 10.076020851433537, + "grad_norm": 0.00029628712218254805, + "learning_rate": 8.028344917463077e-06, + "loss": 0.0, + "step": 46390 + }, + { + "epoch": 10.078192875760209, + "grad_norm": 0.000754713371861726, + "learning_rate": 8.019294816101942e-06, + "loss": 0.0, + "step": 46400 + }, + { + "epoch": 10.080364900086881, + "grad_norm": 0.0002676001749932766, + "learning_rate": 8.010244714740806e-06, + "loss": 0.0001, + "step": 46410 + }, + { + "epoch": 10.082536924413553, + "grad_norm": 0.000641426129732281, + "learning_rate": 8.00119461337967e-06, + "loss": 0.0, + "step": 46420 + }, + { + "epoch": 10.084708948740225, + "grad_norm": 0.00025804596953094006, + "learning_rate": 7.992144512018535e-06, + "loss": 0.0, + "step": 46430 + }, + { + "epoch": 10.086880973066899, + "grad_norm": 0.00027530855732038617, + "learning_rate": 7.9830944106574e-06, + "loss": 0.0, + "step": 46440 + }, + { + "epoch": 10.089052997393571, + "grad_norm": 0.0004725789185613394, + "learning_rate": 7.974044309296265e-06, + "loss": 0.0001, + "step": 46450 + }, + { + "epoch": 10.091225021720243, + "grad_norm": 0.000585050496738404, + "learning_rate": 7.96499420793513e-06, + "loss": 0.0, + "step": 46460 + }, + { + "epoch": 10.093397046046915, + "grad_norm": 0.0002689628745429218, + "learning_rate": 7.955944106573994e-06, + "loss": 0.0, + "step": 46470 + }, + { + "epoch": 10.095569070373589, + "grad_norm": 0.006118168588727713, + "learning_rate": 7.946894005212858e-06, + "loss": 0.0035, + "step": 46480 + }, + { + "epoch": 10.097741094700261, + "grad_norm": 0.00026504171546548605, + "learning_rate": 7.937843903851722e-06, + "loss": 0.0, + "step": 46490 + }, + { + "epoch": 10.099913119026933, + "grad_norm": 0.00029210373759269714, + "learning_rate": 7.928793802490588e-06, + "loss": 0.0, + "step": 46500 + }, + { + "epoch": 10.102085143353605, + "grad_norm": 0.00025521591305732727, + "learning_rate": 7.919743701129453e-06, + "loss": 0.0, + "step": 46510 + }, + { + "epoch": 10.104257167680277, + "grad_norm": 0.0004499217902775854, + "learning_rate": 7.910693599768319e-06, + "loss": 0.0, + "step": 46520 + }, + { + "epoch": 10.106429192006951, + "grad_norm": 0.00027370412135496736, + "learning_rate": 7.901643498407183e-06, + "loss": 0.0, + "step": 46530 + }, + { + "epoch": 10.108601216333623, + "grad_norm": 0.0002593390236143023, + "learning_rate": 7.892593397046047e-06, + "loss": 0.0, + "step": 46540 + }, + { + "epoch": 10.110773240660295, + "grad_norm": 0.00035731252864934504, + "learning_rate": 7.883543295684912e-06, + "loss": 0.0, + "step": 46550 + }, + { + "epoch": 10.112945264986967, + "grad_norm": 0.0002545152383390814, + "learning_rate": 7.874493194323778e-06, + "loss": 0.003, + "step": 46560 + }, + { + "epoch": 10.115117289313641, + "grad_norm": 0.0002480170805938542, + "learning_rate": 7.865443092962642e-06, + "loss": 0.0, + "step": 46570 + }, + { + "epoch": 10.117289313640313, + "grad_norm": 0.0002511973725631833, + "learning_rate": 7.856392991601506e-06, + "loss": 0.0, + "step": 46580 + }, + { + "epoch": 10.119461337966985, + "grad_norm": 0.0002490723563823849, + "learning_rate": 7.84734289024037e-06, + "loss": 0.0006, + "step": 46590 + }, + { + "epoch": 10.121633362293657, + "grad_norm": 0.000278045772574842, + "learning_rate": 7.838292788879235e-06, + "loss": 0.0, + "step": 46600 + }, + { + "epoch": 10.12380538662033, + "grad_norm": 0.00024990536621771753, + "learning_rate": 7.8292426875181e-06, + "loss": 0.0, + "step": 46610 + }, + { + "epoch": 10.125977410947003, + "grad_norm": 0.0017814398743212223, + "learning_rate": 7.820192586156965e-06, + "loss": 0.0, + "step": 46620 + }, + { + "epoch": 10.128149435273675, + "grad_norm": 0.00024815337383188307, + "learning_rate": 7.81114248479583e-06, + "loss": 0.0, + "step": 46630 + }, + { + "epoch": 10.130321459600347, + "grad_norm": 0.0002462256234139204, + "learning_rate": 7.802092383434694e-06, + "loss": 0.0, + "step": 46640 + }, + { + "epoch": 10.13249348392702, + "grad_norm": 0.0002477782254572958, + "learning_rate": 7.79304228207356e-06, + "loss": 0.0, + "step": 46650 + }, + { + "epoch": 10.134665508253692, + "grad_norm": 0.00024287530686706305, + "learning_rate": 7.783992180712425e-06, + "loss": 0.0, + "step": 46660 + }, + { + "epoch": 10.136837532580365, + "grad_norm": 0.00024437796673737466, + "learning_rate": 7.77494207935129e-06, + "loss": 0.0, + "step": 46670 + }, + { + "epoch": 10.139009556907038, + "grad_norm": 0.0002460273972246796, + "learning_rate": 7.765891977990155e-06, + "loss": 0.0035, + "step": 46680 + }, + { + "epoch": 10.14118158123371, + "grad_norm": 0.00024691823637112975, + "learning_rate": 7.75684187662902e-06, + "loss": 0.0061, + "step": 46690 + }, + { + "epoch": 10.143353605560382, + "grad_norm": 0.016955753788352013, + "learning_rate": 7.747791775267884e-06, + "loss": 0.0038, + "step": 46700 + }, + { + "epoch": 10.145525629887056, + "grad_norm": 0.00024686206597834826, + "learning_rate": 7.738741673906748e-06, + "loss": 0.0, + "step": 46710 + }, + { + "epoch": 10.147697654213728, + "grad_norm": 0.00024295347975566983, + "learning_rate": 7.729691572545612e-06, + "loss": 0.0, + "step": 46720 + }, + { + "epoch": 10.1498696785404, + "grad_norm": 0.00026942809927277267, + "learning_rate": 7.720641471184478e-06, + "loss": 0.0, + "step": 46730 + }, + { + "epoch": 10.152041702867072, + "grad_norm": 0.00024324421247001737, + "learning_rate": 7.711591369823343e-06, + "loss": 0.0, + "step": 46740 + }, + { + "epoch": 10.154213727193744, + "grad_norm": 0.00023985575535334647, + "learning_rate": 7.702541268462207e-06, + "loss": 0.0033, + "step": 46750 + }, + { + "epoch": 10.156385751520418, + "grad_norm": 0.0002428061852697283, + "learning_rate": 7.693491167101071e-06, + "loss": 0.0, + "step": 46760 + }, + { + "epoch": 10.15855777584709, + "grad_norm": 0.00024216584279201925, + "learning_rate": 7.684441065739936e-06, + "loss": 0.0, + "step": 46770 + }, + { + "epoch": 10.160729800173762, + "grad_norm": 0.00024820497492328286, + "learning_rate": 7.6753909643788e-06, + "loss": 0.0071, + "step": 46780 + }, + { + "epoch": 10.162901824500434, + "grad_norm": 0.0002512444625608623, + "learning_rate": 7.666340863017666e-06, + "loss": 0.0, + "step": 46790 + }, + { + "epoch": 10.165073848827106, + "grad_norm": 0.0002537766413297504, + "learning_rate": 7.65729076165653e-06, + "loss": 0.0, + "step": 46800 + }, + { + "epoch": 10.16724587315378, + "grad_norm": 0.00023964142019394785, + "learning_rate": 7.648240660295396e-06, + "loss": 0.0, + "step": 46810 + }, + { + "epoch": 10.169417897480452, + "grad_norm": 0.00023952442279551178, + "learning_rate": 7.63919055893426e-06, + "loss": 0.0031, + "step": 46820 + }, + { + "epoch": 10.171589921807124, + "grad_norm": 0.00023879566288087517, + "learning_rate": 7.630140457573125e-06, + "loss": 0.0, + "step": 46830 + }, + { + "epoch": 10.173761946133796, + "grad_norm": 0.000258896267041564, + "learning_rate": 7.621090356211989e-06, + "loss": 0.0, + "step": 46840 + }, + { + "epoch": 10.17593397046047, + "grad_norm": 0.00023973002680577338, + "learning_rate": 7.6120402548508554e-06, + "loss": 0.0, + "step": 46850 + }, + { + "epoch": 10.178105994787142, + "grad_norm": 0.0002393622271483764, + "learning_rate": 7.60299015348972e-06, + "loss": 0.0, + "step": 46860 + }, + { + "epoch": 10.180278019113814, + "grad_norm": 0.0002395124320173636, + "learning_rate": 7.593940052128584e-06, + "loss": 0.0, + "step": 46870 + }, + { + "epoch": 10.182450043440486, + "grad_norm": 0.00024033243244048208, + "learning_rate": 7.5848899507674485e-06, + "loss": 0.0, + "step": 46880 + }, + { + "epoch": 10.184622067767158, + "grad_norm": 0.001191705116070807, + "learning_rate": 7.575839849406313e-06, + "loss": 0.0062, + "step": 46890 + }, + { + "epoch": 10.186794092093832, + "grad_norm": 0.0002385633415542543, + "learning_rate": 7.566789748045179e-06, + "loss": 0.0, + "step": 46900 + }, + { + "epoch": 10.188966116420504, + "grad_norm": 0.0002466611040290445, + "learning_rate": 7.557739646684044e-06, + "loss": 0.0, + "step": 46910 + }, + { + "epoch": 10.191138140747176, + "grad_norm": 0.00024101352028083056, + "learning_rate": 7.548689545322908e-06, + "loss": 0.0, + "step": 46920 + }, + { + "epoch": 10.193310165073848, + "grad_norm": 0.00024026913160923868, + "learning_rate": 7.539639443961773e-06, + "loss": 0.0, + "step": 46930 + }, + { + "epoch": 10.195482189400522, + "grad_norm": 0.00024203627253882587, + "learning_rate": 7.530589342600637e-06, + "loss": 0.0, + "step": 46940 + }, + { + "epoch": 10.197654213727194, + "grad_norm": 0.00024288229178637266, + "learning_rate": 7.521539241239501e-06, + "loss": 0.0, + "step": 46950 + }, + { + "epoch": 10.199826238053866, + "grad_norm": 0.00024093518732115626, + "learning_rate": 7.512489139878367e-06, + "loss": 0.0, + "step": 46960 + }, + { + "epoch": 10.201998262380538, + "grad_norm": 0.00023962014529388398, + "learning_rate": 7.503439038517232e-06, + "loss": 0.0035, + "step": 46970 + }, + { + "epoch": 10.20417028670721, + "grad_norm": 0.0004495520843192935, + "learning_rate": 7.494388937156097e-06, + "loss": 0.006, + "step": 46980 + }, + { + "epoch": 10.206342311033884, + "grad_norm": 0.00027092520031146705, + "learning_rate": 7.485338835794961e-06, + "loss": 0.0, + "step": 46990 + }, + { + "epoch": 10.208514335360556, + "grad_norm": 0.00037834502290934324, + "learning_rate": 7.476288734433826e-06, + "loss": 0.0, + "step": 47000 + }, + { + "epoch": 10.210686359687228, + "grad_norm": 0.00024022634897846729, + "learning_rate": 7.46723863307269e-06, + "loss": 0.0, + "step": 47010 + }, + { + "epoch": 10.2128583840139, + "grad_norm": 0.00024379647220484912, + "learning_rate": 7.458188531711556e-06, + "loss": 0.0, + "step": 47020 + }, + { + "epoch": 10.215030408340574, + "grad_norm": 0.00023640983272343874, + "learning_rate": 7.44913843035042e-06, + "loss": 0.0, + "step": 47030 + }, + { + "epoch": 10.217202432667246, + "grad_norm": 0.00023816687462385744, + "learning_rate": 7.440088328989285e-06, + "loss": 0.0059, + "step": 47040 + }, + { + "epoch": 10.219374456993918, + "grad_norm": 0.00024333465262316167, + "learning_rate": 7.43103822762815e-06, + "loss": 0.0, + "step": 47050 + }, + { + "epoch": 10.22154648132059, + "grad_norm": 0.00023967133893165737, + "learning_rate": 7.421988126267014e-06, + "loss": 0.0, + "step": 47060 + }, + { + "epoch": 10.223718505647263, + "grad_norm": 0.0003435276448726654, + "learning_rate": 7.41293802490588e-06, + "loss": 0.0, + "step": 47070 + }, + { + "epoch": 10.225890529973936, + "grad_norm": 0.0002411666646366939, + "learning_rate": 7.4038879235447446e-06, + "loss": 0.0, + "step": 47080 + }, + { + "epoch": 10.228062554300609, + "grad_norm": 0.0003091098624281585, + "learning_rate": 7.394837822183609e-06, + "loss": 0.0, + "step": 47090 + }, + { + "epoch": 10.23023457862728, + "grad_norm": 0.00024104368640109897, + "learning_rate": 7.385787720822473e-06, + "loss": 0.0053, + "step": 47100 + }, + { + "epoch": 10.232406602953953, + "grad_norm": 0.00024548693909309804, + "learning_rate": 7.376737619461338e-06, + "loss": 0.0, + "step": 47110 + }, + { + "epoch": 10.234578627280625, + "grad_norm": 0.00024295470211654902, + "learning_rate": 7.367687518100203e-06, + "loss": 0.0, + "step": 47120 + }, + { + "epoch": 10.236750651607299, + "grad_norm": 0.0002608944196254015, + "learning_rate": 7.358637416739069e-06, + "loss": 0.0, + "step": 47130 + }, + { + "epoch": 10.23892267593397, + "grad_norm": 0.00024352218315470964, + "learning_rate": 7.349587315377933e-06, + "loss": 0.0, + "step": 47140 + }, + { + "epoch": 10.241094700260643, + "grad_norm": 0.0002514914667699486, + "learning_rate": 7.3405372140167975e-06, + "loss": 0.0, + "step": 47150 + }, + { + "epoch": 10.243266724587315, + "grad_norm": 0.00023742808843962848, + "learning_rate": 7.331487112655662e-06, + "loss": 0.0, + "step": 47160 + }, + { + "epoch": 10.245438748913989, + "grad_norm": 0.0003619439958129078, + "learning_rate": 7.322437011294526e-06, + "loss": 0.0, + "step": 47170 + }, + { + "epoch": 10.24761077324066, + "grad_norm": 0.00024090451188385487, + "learning_rate": 7.313386909933391e-06, + "loss": 0.0, + "step": 47180 + }, + { + "epoch": 10.249782797567333, + "grad_norm": 0.0002406891289865598, + "learning_rate": 7.3043368085722565e-06, + "loss": 0.0, + "step": 47190 + }, + { + "epoch": 10.251954821894005, + "grad_norm": 0.00024321397359017283, + "learning_rate": 7.295286707211122e-06, + "loss": 0.0, + "step": 47200 + }, + { + "epoch": 10.254126846220677, + "grad_norm": 0.00024024557205848396, + "learning_rate": 7.286236605849986e-06, + "loss": 0.0, + "step": 47210 + }, + { + "epoch": 10.25629887054735, + "grad_norm": 0.00023844727547839284, + "learning_rate": 7.27718650448885e-06, + "loss": 0.0, + "step": 47220 + }, + { + "epoch": 10.258470894874023, + "grad_norm": 0.0002457252121530473, + "learning_rate": 7.268136403127715e-06, + "loss": 0.0, + "step": 47230 + }, + { + "epoch": 10.260642919200695, + "grad_norm": 0.0002406234125373885, + "learning_rate": 7.259086301766579e-06, + "loss": 0.0037, + "step": 47240 + }, + { + "epoch": 10.262814943527367, + "grad_norm": 0.0005037950468249619, + "learning_rate": 7.250036200405445e-06, + "loss": 0.0, + "step": 47250 + }, + { + "epoch": 10.264986967854039, + "grad_norm": 0.00024237303296104074, + "learning_rate": 7.24098609904431e-06, + "loss": 0.0, + "step": 47260 + }, + { + "epoch": 10.267158992180713, + "grad_norm": 0.00032777455635368824, + "learning_rate": 7.231935997683175e-06, + "loss": 0.0, + "step": 47270 + }, + { + "epoch": 10.269331016507385, + "grad_norm": 0.00023873074678704143, + "learning_rate": 7.222885896322039e-06, + "loss": 0.0, + "step": 47280 + }, + { + "epoch": 10.271503040834057, + "grad_norm": 0.00024848480825312436, + "learning_rate": 7.213835794960903e-06, + "loss": 0.0, + "step": 47290 + }, + { + "epoch": 10.27367506516073, + "grad_norm": 0.0002387873100815341, + "learning_rate": 7.204785693599769e-06, + "loss": 0.0, + "step": 47300 + }, + { + "epoch": 10.275847089487403, + "grad_norm": 0.0002637762518133968, + "learning_rate": 7.195735592238634e-06, + "loss": 0.0, + "step": 47310 + }, + { + "epoch": 10.278019113814075, + "grad_norm": 0.0002451884211041033, + "learning_rate": 7.186685490877498e-06, + "loss": 0.0, + "step": 47320 + }, + { + "epoch": 10.280191138140747, + "grad_norm": 0.00024033308727666736, + "learning_rate": 7.177635389516363e-06, + "loss": 0.0, + "step": 47330 + }, + { + "epoch": 10.28236316246742, + "grad_norm": 0.0003520794562064111, + "learning_rate": 7.1685852881552275e-06, + "loss": 0.0, + "step": 47340 + }, + { + "epoch": 10.284535186794091, + "grad_norm": 0.0002365164109505713, + "learning_rate": 7.159535186794092e-06, + "loss": 0.0, + "step": 47350 + }, + { + "epoch": 10.286707211120765, + "grad_norm": 0.0002398495125817135, + "learning_rate": 7.150485085432958e-06, + "loss": 0.0052, + "step": 47360 + }, + { + "epoch": 10.288879235447437, + "grad_norm": 0.0003276202769484371, + "learning_rate": 7.141434984071822e-06, + "loss": 0.0, + "step": 47370 + }, + { + "epoch": 10.29105125977411, + "grad_norm": 0.0002395589544903487, + "learning_rate": 7.132384882710687e-06, + "loss": 0.0, + "step": 47380 + }, + { + "epoch": 10.293223284100781, + "grad_norm": 0.00023600317945238203, + "learning_rate": 7.123334781349551e-06, + "loss": 0.0, + "step": 47390 + }, + { + "epoch": 10.295395308427455, + "grad_norm": 0.00023694564879406244, + "learning_rate": 7.114284679988416e-06, + "loss": 0.0038, + "step": 47400 + }, + { + "epoch": 10.297567332754127, + "grad_norm": 0.00024834126816131175, + "learning_rate": 7.1052345786272804e-06, + "loss": 0.005, + "step": 47410 + }, + { + "epoch": 10.2997393570808, + "grad_norm": 0.00023720713215880096, + "learning_rate": 7.0961844772661465e-06, + "loss": 0.0, + "step": 47420 + }, + { + "epoch": 10.301911381407471, + "grad_norm": 0.00023603474255651236, + "learning_rate": 7.087134375905011e-06, + "loss": 0.0, + "step": 47430 + }, + { + "epoch": 10.304083405734143, + "grad_norm": 0.00023258681176230311, + "learning_rate": 7.078084274543875e-06, + "loss": 0.0, + "step": 47440 + }, + { + "epoch": 10.306255430060817, + "grad_norm": 0.00023691673413850367, + "learning_rate": 7.0690341731827395e-06, + "loss": 0.0, + "step": 47450 + }, + { + "epoch": 10.30842745438749, + "grad_norm": 0.0002366963162785396, + "learning_rate": 7.059984071821605e-06, + "loss": 0.0, + "step": 47460 + }, + { + "epoch": 10.310599478714161, + "grad_norm": 0.0003267844149377197, + "learning_rate": 7.05093397046047e-06, + "loss": 0.0, + "step": 47470 + }, + { + "epoch": 10.312771503040834, + "grad_norm": 0.00031106435926631093, + "learning_rate": 7.041883869099335e-06, + "loss": 0.0044, + "step": 47480 + }, + { + "epoch": 10.314943527367507, + "grad_norm": 0.00023934834462124854, + "learning_rate": 7.032833767738199e-06, + "loss": 0.0054, + "step": 47490 + }, + { + "epoch": 10.31711555169418, + "grad_norm": 0.0011841370724141598, + "learning_rate": 7.023783666377064e-06, + "loss": 0.0, + "step": 47500 + }, + { + "epoch": 10.319287576020852, + "grad_norm": 0.00024405766453128308, + "learning_rate": 7.014733565015928e-06, + "loss": 0.0039, + "step": 47510 + }, + { + "epoch": 10.321459600347524, + "grad_norm": 0.0005109702469781041, + "learning_rate": 7.005683463654792e-06, + "loss": 0.0, + "step": 47520 + }, + { + "epoch": 10.323631624674196, + "grad_norm": 0.00024197706079576164, + "learning_rate": 6.9966333622936584e-06, + "loss": 0.0, + "step": 47530 + }, + { + "epoch": 10.32580364900087, + "grad_norm": 0.00023690721718594432, + "learning_rate": 6.987583260932523e-06, + "loss": 0.0, + "step": 47540 + }, + { + "epoch": 10.327975673327542, + "grad_norm": 0.0002361913357162848, + "learning_rate": 6.978533159571388e-06, + "loss": 0.0, + "step": 47550 + }, + { + "epoch": 10.330147697654214, + "grad_norm": 0.00023627316113561392, + "learning_rate": 6.969483058210252e-06, + "loss": 0.0, + "step": 47560 + }, + { + "epoch": 10.332319721980886, + "grad_norm": 0.00023409361892845482, + "learning_rate": 6.960432956849117e-06, + "loss": 0.0048, + "step": 47570 + }, + { + "epoch": 10.334491746307558, + "grad_norm": 0.00023509345191996545, + "learning_rate": 6.951382855487981e-06, + "loss": 0.0043, + "step": 47580 + }, + { + "epoch": 10.336663770634232, + "grad_norm": 0.0002373265306232497, + "learning_rate": 6.942332754126847e-06, + "loss": 0.0, + "step": 47590 + }, + { + "epoch": 10.338835794960904, + "grad_norm": 0.00023308381787501276, + "learning_rate": 6.933282652765711e-06, + "loss": 0.0, + "step": 47600 + }, + { + "epoch": 10.341007819287576, + "grad_norm": 0.00023303077614400536, + "learning_rate": 6.924232551404576e-06, + "loss": 0.0, + "step": 47610 + }, + { + "epoch": 10.343179843614248, + "grad_norm": 0.0002339918282814324, + "learning_rate": 6.915182450043441e-06, + "loss": 0.0, + "step": 47620 + }, + { + "epoch": 10.345351867940922, + "grad_norm": 0.00023397189215756953, + "learning_rate": 6.906132348682305e-06, + "loss": 0.0, + "step": 47630 + }, + { + "epoch": 10.347523892267594, + "grad_norm": 0.00023484285338781774, + "learning_rate": 6.8970822473211696e-06, + "loss": 0.0, + "step": 47640 + }, + { + "epoch": 10.349695916594266, + "grad_norm": 0.00023741343466099352, + "learning_rate": 6.888032145960036e-06, + "loss": 0.0, + "step": 47650 + }, + { + "epoch": 10.351867940920938, + "grad_norm": 0.00023638234415557235, + "learning_rate": 6.8789820445989e-06, + "loss": 0.0, + "step": 47660 + }, + { + "epoch": 10.35403996524761, + "grad_norm": 0.0002334350865567103, + "learning_rate": 6.869931943237764e-06, + "loss": 0.0, + "step": 47670 + }, + { + "epoch": 10.356211989574284, + "grad_norm": 0.00023239596339408308, + "learning_rate": 6.8608818418766295e-06, + "loss": 0.0, + "step": 47680 + }, + { + "epoch": 10.358384013900956, + "grad_norm": 0.0002384950203122571, + "learning_rate": 6.851831740515494e-06, + "loss": 0.0052, + "step": 47690 + }, + { + "epoch": 10.360556038227628, + "grad_norm": 0.00023201614385470748, + "learning_rate": 6.84278163915436e-06, + "loss": 0.0, + "step": 47700 + }, + { + "epoch": 10.3627280625543, + "grad_norm": 0.0002313854784006253, + "learning_rate": 6.833731537793224e-06, + "loss": 0.0, + "step": 47710 + }, + { + "epoch": 10.364900086880972, + "grad_norm": 0.0002525383315514773, + "learning_rate": 6.8246814364320885e-06, + "loss": 0.0, + "step": 47720 + }, + { + "epoch": 10.367072111207646, + "grad_norm": 0.00023291408433578908, + "learning_rate": 6.815631335070953e-06, + "loss": 0.0, + "step": 47730 + }, + { + "epoch": 10.369244135534318, + "grad_norm": 0.0003160819469485432, + "learning_rate": 6.806581233709817e-06, + "loss": 0.0, + "step": 47740 + }, + { + "epoch": 10.37141615986099, + "grad_norm": 0.00023806083481758833, + "learning_rate": 6.797531132348682e-06, + "loss": 0.0, + "step": 47750 + }, + { + "epoch": 10.373588184187662, + "grad_norm": 0.00023504970886278898, + "learning_rate": 6.7884810309875476e-06, + "loss": 0.0, + "step": 47760 + }, + { + "epoch": 10.375760208514336, + "grad_norm": 0.00023199041606858373, + "learning_rate": 6.779430929626413e-06, + "loss": 0.004, + "step": 47770 + }, + { + "epoch": 10.377932232841008, + "grad_norm": 0.0002341802028240636, + "learning_rate": 6.770380828265277e-06, + "loss": 0.0, + "step": 47780 + }, + { + "epoch": 10.38010425716768, + "grad_norm": 0.00031527108512818813, + "learning_rate": 6.7613307269041414e-06, + "loss": 0.0, + "step": 47790 + }, + { + "epoch": 10.382276281494352, + "grad_norm": 0.00023442119709216058, + "learning_rate": 6.752280625543006e-06, + "loss": 0.0091, + "step": 47800 + }, + { + "epoch": 10.384448305821024, + "grad_norm": 0.00039361134986393154, + "learning_rate": 6.74323052418187e-06, + "loss": 0.0051, + "step": 47810 + }, + { + "epoch": 10.386620330147698, + "grad_norm": 0.00023753584537189454, + "learning_rate": 6.734180422820736e-06, + "loss": 0.0052, + "step": 47820 + }, + { + "epoch": 10.38879235447437, + "grad_norm": 0.00023664938635192811, + "learning_rate": 6.725130321459601e-06, + "loss": 0.0001, + "step": 47830 + }, + { + "epoch": 10.390964378801042, + "grad_norm": 0.00023270200472325087, + "learning_rate": 6.716080220098466e-06, + "loss": 0.0, + "step": 47840 + }, + { + "epoch": 10.393136403127714, + "grad_norm": 0.00024005438899621367, + "learning_rate": 6.70703011873733e-06, + "loss": 0.0, + "step": 47850 + }, + { + "epoch": 10.395308427454388, + "grad_norm": 0.00023178478295449167, + "learning_rate": 6.697980017376194e-06, + "loss": 0.0, + "step": 47860 + }, + { + "epoch": 10.39748045178106, + "grad_norm": 0.00023401351063512266, + "learning_rate": 6.68892991601506e-06, + "loss": 0.0, + "step": 47870 + }, + { + "epoch": 10.399652476107732, + "grad_norm": 0.0003052498504985124, + "learning_rate": 6.679879814653925e-06, + "loss": 0.0039, + "step": 47880 + }, + { + "epoch": 10.401824500434405, + "grad_norm": 0.00023244529438670725, + "learning_rate": 6.670829713292789e-06, + "loss": 0.0044, + "step": 47890 + }, + { + "epoch": 10.403996524761077, + "grad_norm": 0.0028232275508344173, + "learning_rate": 6.661779611931654e-06, + "loss": 0.0, + "step": 47900 + }, + { + "epoch": 10.40616854908775, + "grad_norm": 0.00023363882792182267, + "learning_rate": 6.652729510570519e-06, + "loss": 0.0, + "step": 47910 + }, + { + "epoch": 10.408340573414423, + "grad_norm": 0.00023200880968943238, + "learning_rate": 6.643679409209383e-06, + "loss": 0.0047, + "step": 47920 + }, + { + "epoch": 10.410512597741095, + "grad_norm": 0.00023291223624255508, + "learning_rate": 6.634629307848249e-06, + "loss": 0.0, + "step": 47930 + }, + { + "epoch": 10.412684622067767, + "grad_norm": 0.00023574443184770644, + "learning_rate": 6.625579206487113e-06, + "loss": 0.0047, + "step": 47940 + }, + { + "epoch": 10.41485664639444, + "grad_norm": 0.00023099414829630405, + "learning_rate": 6.616529105125978e-06, + "loss": 0.0, + "step": 47950 + }, + { + "epoch": 10.417028670721113, + "grad_norm": 0.0002286070812260732, + "learning_rate": 6.607479003764842e-06, + "loss": 0.0, + "step": 47960 + }, + { + "epoch": 10.419200695047785, + "grad_norm": 0.0002335784665774554, + "learning_rate": 6.598428902403707e-06, + "loss": 0.0, + "step": 47970 + }, + { + "epoch": 10.421372719374457, + "grad_norm": 0.00023560720728710294, + "learning_rate": 6.5893788010425715e-06, + "loss": 0.0, + "step": 47980 + }, + { + "epoch": 10.423544743701129, + "grad_norm": 0.00025418924633413553, + "learning_rate": 6.5803286996814375e-06, + "loss": 0.0, + "step": 47990 + }, + { + "epoch": 10.425716768027803, + "grad_norm": 0.00023169341147877276, + "learning_rate": 6.571278598320302e-06, + "loss": 0.0, + "step": 48000 + }, + { + "epoch": 10.427888792354475, + "grad_norm": 0.0002545344177633524, + "learning_rate": 6.562228496959166e-06, + "loss": 0.0397, + "step": 48010 + }, + { + "epoch": 10.430060816681147, + "grad_norm": 0.00028514183941297233, + "learning_rate": 6.5531783955980305e-06, + "loss": 0.0051, + "step": 48020 + }, + { + "epoch": 10.432232841007819, + "grad_norm": 0.00029667047783732414, + "learning_rate": 6.544128294236896e-06, + "loss": 0.0003, + "step": 48030 + }, + { + "epoch": 10.434404865334491, + "grad_norm": 0.00027842583949677646, + "learning_rate": 6.53507819287576e-06, + "loss": 0.0001, + "step": 48040 + }, + { + "epoch": 10.436576889661165, + "grad_norm": 0.007493993733078241, + "learning_rate": 6.526028091514626e-06, + "loss": 0.0, + "step": 48050 + }, + { + "epoch": 10.438748913987837, + "grad_norm": 0.0002761534124147147, + "learning_rate": 6.5169779901534904e-06, + "loss": 0.0042, + "step": 48060 + }, + { + "epoch": 10.440920938314509, + "grad_norm": 0.00026150167104788125, + "learning_rate": 6.507927888792355e-06, + "loss": 0.0, + "step": 48070 + }, + { + "epoch": 10.443092962641181, + "grad_norm": 0.0003275485069025308, + "learning_rate": 6.498877787431219e-06, + "loss": 0.0042, + "step": 48080 + }, + { + "epoch": 10.445264986967855, + "grad_norm": 0.0002545382303651422, + "learning_rate": 6.4898276860700835e-06, + "loss": 0.0001, + "step": 48090 + }, + { + "epoch": 10.447437011294527, + "grad_norm": 0.0002404392434982583, + "learning_rate": 6.4807775847089495e-06, + "loss": 0.0051, + "step": 48100 + }, + { + "epoch": 10.449609035621199, + "grad_norm": 0.0002498602552805096, + "learning_rate": 6.471727483347814e-06, + "loss": 0.0, + "step": 48110 + }, + { + "epoch": 10.451781059947871, + "grad_norm": 0.00023550092009827495, + "learning_rate": 6.462677381986679e-06, + "loss": 0.0, + "step": 48120 + }, + { + "epoch": 10.453953084274543, + "grad_norm": 0.000236342559219338, + "learning_rate": 6.453627280625543e-06, + "loss": 0.0, + "step": 48130 + }, + { + "epoch": 10.456125108601217, + "grad_norm": 0.00025661668041720986, + "learning_rate": 6.444577179264408e-06, + "loss": 0.0043, + "step": 48140 + }, + { + "epoch": 10.458297132927889, + "grad_norm": 0.00042461883276700974, + "learning_rate": 6.435527077903272e-06, + "loss": 0.0, + "step": 48150 + }, + { + "epoch": 10.460469157254561, + "grad_norm": 0.0003444579488132149, + "learning_rate": 6.426476976542138e-06, + "loss": 0.0, + "step": 48160 + }, + { + "epoch": 10.462641181581233, + "grad_norm": 0.0003125490911770612, + "learning_rate": 6.417426875181002e-06, + "loss": 0.0, + "step": 48170 + }, + { + "epoch": 10.464813205907905, + "grad_norm": 0.0002570390061009675, + "learning_rate": 6.408376773819868e-06, + "loss": 0.0058, + "step": 48180 + }, + { + "epoch": 10.46698523023458, + "grad_norm": 0.0004991987370885909, + "learning_rate": 6.399326672458732e-06, + "loss": 0.0042, + "step": 48190 + }, + { + "epoch": 10.469157254561251, + "grad_norm": 0.0003679233486764133, + "learning_rate": 6.390276571097596e-06, + "loss": 0.0, + "step": 48200 + }, + { + "epoch": 10.471329278887923, + "grad_norm": 0.0002311110874870792, + "learning_rate": 6.381226469736461e-06, + "loss": 0.0, + "step": 48210 + }, + { + "epoch": 10.473501303214595, + "grad_norm": 0.00023058304213918746, + "learning_rate": 6.372176368375327e-06, + "loss": 0.0, + "step": 48220 + }, + { + "epoch": 10.47567332754127, + "grad_norm": 0.000238971013459377, + "learning_rate": 6.363126267014191e-06, + "loss": 0.0, + "step": 48230 + }, + { + "epoch": 10.477845351867941, + "grad_norm": 0.00023313738347496837, + "learning_rate": 6.354076165653055e-06, + "loss": 0.0035, + "step": 48240 + }, + { + "epoch": 10.480017376194613, + "grad_norm": 0.0002325878303963691, + "learning_rate": 6.3450260642919205e-06, + "loss": 0.0, + "step": 48250 + }, + { + "epoch": 10.482189400521285, + "grad_norm": 0.00031619417131878436, + "learning_rate": 6.335975962930785e-06, + "loss": 0.0, + "step": 48260 + }, + { + "epoch": 10.484361424847958, + "grad_norm": 0.0002313339791726321, + "learning_rate": 6.326925861569651e-06, + "loss": 0.0, + "step": 48270 + }, + { + "epoch": 10.486533449174631, + "grad_norm": 0.00022938975598663092, + "learning_rate": 6.317875760208515e-06, + "loss": 0.0, + "step": 48280 + }, + { + "epoch": 10.488705473501303, + "grad_norm": 0.00022925181838218123, + "learning_rate": 6.3088256588473796e-06, + "loss": 0.0, + "step": 48290 + }, + { + "epoch": 10.490877497827976, + "grad_norm": 0.00023099995451048017, + "learning_rate": 6.299775557486244e-06, + "loss": 0.0, + "step": 48300 + }, + { + "epoch": 10.493049522154648, + "grad_norm": 0.0003380636335350573, + "learning_rate": 6.290725456125108e-06, + "loss": 0.0, + "step": 48310 + }, + { + "epoch": 10.495221546481321, + "grad_norm": 0.0002366377302678302, + "learning_rate": 6.281675354763973e-06, + "loss": 0.0, + "step": 48320 + }, + { + "epoch": 10.497393570807994, + "grad_norm": 0.0002278648898936808, + "learning_rate": 6.272625253402839e-06, + "loss": 0.0035, + "step": 48330 + }, + { + "epoch": 10.499565595134666, + "grad_norm": 0.0002272507263114676, + "learning_rate": 6.263575152041704e-06, + "loss": 0.0085, + "step": 48340 + }, + { + "epoch": 10.501737619461338, + "grad_norm": 0.00023705446801614016, + "learning_rate": 6.254525050680568e-06, + "loss": 0.0, + "step": 48350 + }, + { + "epoch": 10.50390964378801, + "grad_norm": 0.00023882483947090805, + "learning_rate": 6.2454749493194325e-06, + "loss": 0.0, + "step": 48360 + }, + { + "epoch": 10.506081668114684, + "grad_norm": 0.00023301866895053536, + "learning_rate": 6.236424847958298e-06, + "loss": 0.0, + "step": 48370 + }, + { + "epoch": 10.508253692441356, + "grad_norm": 0.0002331801224499941, + "learning_rate": 6.227374746597162e-06, + "loss": 0.0, + "step": 48380 + }, + { + "epoch": 10.510425716768028, + "grad_norm": 0.00022870057728141546, + "learning_rate": 6.218324645236027e-06, + "loss": 0.0, + "step": 48390 + }, + { + "epoch": 10.5125977410947, + "grad_norm": 0.00023882264213170856, + "learning_rate": 6.209274543874892e-06, + "loss": 0.0, + "step": 48400 + }, + { + "epoch": 10.514769765421374, + "grad_norm": 0.0002304526569787413, + "learning_rate": 6.200224442513757e-06, + "loss": 0.0, + "step": 48410 + }, + { + "epoch": 10.516941789748046, + "grad_norm": 0.0003574812435545027, + "learning_rate": 6.191174341152621e-06, + "loss": 0.0, + "step": 48420 + }, + { + "epoch": 10.519113814074718, + "grad_norm": 0.00023324844369199127, + "learning_rate": 6.182124239791486e-06, + "loss": 0.0, + "step": 48430 + }, + { + "epoch": 10.52128583840139, + "grad_norm": 0.00022773882665205747, + "learning_rate": 6.1730741384303506e-06, + "loss": 0.0, + "step": 48440 + }, + { + "epoch": 10.523457862728062, + "grad_norm": 0.00023141004203353077, + "learning_rate": 6.164024037069215e-06, + "loss": 0.0, + "step": 48450 + }, + { + "epoch": 10.525629887054736, + "grad_norm": 0.00022916783927939832, + "learning_rate": 6.15497393570808e-06, + "loss": 0.0, + "step": 48460 + }, + { + "epoch": 10.527801911381408, + "grad_norm": 0.0003077143628615886, + "learning_rate": 6.145923834346945e-06, + "loss": 0.0, + "step": 48470 + }, + { + "epoch": 10.52997393570808, + "grad_norm": 0.0004104797844775021, + "learning_rate": 6.13687373298581e-06, + "loss": 0.0, + "step": 48480 + }, + { + "epoch": 10.532145960034752, + "grad_norm": 0.0003059869341086596, + "learning_rate": 6.127823631624675e-06, + "loss": 0.0, + "step": 48490 + }, + { + "epoch": 10.534317984361424, + "grad_norm": 0.00022940864437259734, + "learning_rate": 6.118773530263539e-06, + "loss": 0.0, + "step": 48500 + }, + { + "epoch": 10.536490008688098, + "grad_norm": 0.000472991174319759, + "learning_rate": 6.1097234289024035e-06, + "loss": 0.0, + "step": 48510 + }, + { + "epoch": 10.53866203301477, + "grad_norm": 0.0002505776647012681, + "learning_rate": 6.100673327541269e-06, + "loss": 0.0, + "step": 48520 + }, + { + "epoch": 10.540834057341442, + "grad_norm": 0.00022529246052727103, + "learning_rate": 6.091623226180133e-06, + "loss": 0.0, + "step": 48530 + }, + { + "epoch": 10.543006081668114, + "grad_norm": 0.000230813238886185, + "learning_rate": 6.082573124818998e-06, + "loss": 0.0037, + "step": 48540 + }, + { + "epoch": 10.545178105994786, + "grad_norm": 0.00022568507120013237, + "learning_rate": 6.073523023457863e-06, + "loss": 0.0, + "step": 48550 + }, + { + "epoch": 10.54735013032146, + "grad_norm": 0.0002367023262195289, + "learning_rate": 6.064472922096728e-06, + "loss": 0.0, + "step": 48560 + }, + { + "epoch": 10.549522154648132, + "grad_norm": 0.00022548387642018497, + "learning_rate": 6.055422820735593e-06, + "loss": 0.0, + "step": 48570 + }, + { + "epoch": 10.551694178974804, + "grad_norm": 0.00022634779452346265, + "learning_rate": 6.046372719374457e-06, + "loss": 0.0053, + "step": 48580 + }, + { + "epoch": 10.553866203301476, + "grad_norm": 0.00023146615421865135, + "learning_rate": 6.037322618013322e-06, + "loss": 0.0057, + "step": 48590 + }, + { + "epoch": 10.55603822762815, + "grad_norm": 0.0002269007236463949, + "learning_rate": 6.028272516652187e-06, + "loss": 0.0, + "step": 48600 + }, + { + "epoch": 10.558210251954822, + "grad_norm": 0.00022439331223722547, + "learning_rate": 6.019222415291052e-06, + "loss": 0.0, + "step": 48610 + }, + { + "epoch": 10.560382276281494, + "grad_norm": 0.00022950119455344975, + "learning_rate": 6.010172313929916e-06, + "loss": 0.0, + "step": 48620 + }, + { + "epoch": 10.562554300608166, + "grad_norm": 0.300792396068573, + "learning_rate": 6.0011222125687815e-06, + "loss": 0.0001, + "step": 48630 + }, + { + "epoch": 10.564726324934838, + "grad_norm": 0.00022572164016310126, + "learning_rate": 5.992072111207646e-06, + "loss": 0.0, + "step": 48640 + }, + { + "epoch": 10.566898349261512, + "grad_norm": 0.000312326563289389, + "learning_rate": 5.98302200984651e-06, + "loss": 0.0, + "step": 48650 + }, + { + "epoch": 10.569070373588184, + "grad_norm": 0.00022468189126811922, + "learning_rate": 5.973971908485375e-06, + "loss": 0.0, + "step": 48660 + }, + { + "epoch": 10.571242397914856, + "grad_norm": 0.00023983907885849476, + "learning_rate": 5.96492180712424e-06, + "loss": 0.0, + "step": 48670 + }, + { + "epoch": 10.573414422241528, + "grad_norm": 0.0002266259107273072, + "learning_rate": 5.955871705763105e-06, + "loss": 0.0, + "step": 48680 + }, + { + "epoch": 10.575586446568202, + "grad_norm": 0.00022839626763015985, + "learning_rate": 5.94682160440197e-06, + "loss": 0.0, + "step": 48690 + }, + { + "epoch": 10.577758470894874, + "grad_norm": 0.000231469253776595, + "learning_rate": 5.937771503040834e-06, + "loss": 0.0, + "step": 48700 + }, + { + "epoch": 10.579930495221546, + "grad_norm": 0.00023143812722992152, + "learning_rate": 5.928721401679699e-06, + "loss": 0.0, + "step": 48710 + }, + { + "epoch": 10.582102519548219, + "grad_norm": 0.00031239417148754, + "learning_rate": 5.919671300318564e-06, + "loss": 0.0, + "step": 48720 + }, + { + "epoch": 10.58427454387489, + "grad_norm": 0.00022409581288229674, + "learning_rate": 5.910621198957428e-06, + "loss": 0.0, + "step": 48730 + }, + { + "epoch": 10.586446568201564, + "grad_norm": 0.0002723014331422746, + "learning_rate": 5.9015710975962934e-06, + "loss": 0.0043, + "step": 48740 + }, + { + "epoch": 10.588618592528237, + "grad_norm": 0.00022293497750069946, + "learning_rate": 5.892520996235159e-06, + "loss": 0.0, + "step": 48750 + }, + { + "epoch": 10.590790616854909, + "grad_norm": 0.0003747916198335588, + "learning_rate": 5.883470894874023e-06, + "loss": 0.0, + "step": 48760 + }, + { + "epoch": 10.59296264118158, + "grad_norm": 0.00022305836318992078, + "learning_rate": 5.874420793512888e-06, + "loss": 0.0, + "step": 48770 + }, + { + "epoch": 10.595134665508255, + "grad_norm": 0.00023418181808665395, + "learning_rate": 5.8653706921517525e-06, + "loss": 0.0, + "step": 48780 + }, + { + "epoch": 10.597306689834927, + "grad_norm": 0.00022406678181141615, + "learning_rate": 5.856320590790617e-06, + "loss": 0.0, + "step": 48790 + }, + { + "epoch": 10.599478714161599, + "grad_norm": 0.00023150903871282935, + "learning_rate": 5.847270489429482e-06, + "loss": 0.0, + "step": 48800 + }, + { + "epoch": 10.60165073848827, + "grad_norm": 0.0003837795229628682, + "learning_rate": 5.838220388068346e-06, + "loss": 0.0056, + "step": 48810 + }, + { + "epoch": 10.603822762814943, + "grad_norm": 0.00023224468168336898, + "learning_rate": 5.8291702867072115e-06, + "loss": 0.005, + "step": 48820 + }, + { + "epoch": 10.605994787141617, + "grad_norm": 0.17190490663051605, + "learning_rate": 5.820120185346077e-06, + "loss": 0.0047, + "step": 48830 + }, + { + "epoch": 10.608166811468289, + "grad_norm": 0.00022320100106298923, + "learning_rate": 5.811070083984941e-06, + "loss": 0.0, + "step": 48840 + }, + { + "epoch": 10.61033883579496, + "grad_norm": 0.00032960029784590006, + "learning_rate": 5.802019982623805e-06, + "loss": 0.0, + "step": 48850 + }, + { + "epoch": 10.612510860121633, + "grad_norm": 0.00023157663235906512, + "learning_rate": 5.792969881262671e-06, + "loss": 0.0, + "step": 48860 + }, + { + "epoch": 10.614682884448305, + "grad_norm": 0.00022281825658865273, + "learning_rate": 5.783919779901535e-06, + "loss": 0.0, + "step": 48870 + }, + { + "epoch": 10.616854908774979, + "grad_norm": 0.0002246215008199215, + "learning_rate": 5.774869678540399e-06, + "loss": 0.0, + "step": 48880 + }, + { + "epoch": 10.619026933101651, + "grad_norm": 0.00022870773682370782, + "learning_rate": 5.7658195771792645e-06, + "loss": 0.0, + "step": 48890 + }, + { + "epoch": 10.621198957428323, + "grad_norm": 0.0002241594047518447, + "learning_rate": 5.75676947581813e-06, + "loss": 0.0, + "step": 48900 + }, + { + "epoch": 10.623370981754995, + "grad_norm": 0.0002240870235254988, + "learning_rate": 5.747719374456994e-06, + "loss": 0.0, + "step": 48910 + }, + { + "epoch": 10.625543006081669, + "grad_norm": 0.00022091949358582497, + "learning_rate": 5.738669273095859e-06, + "loss": 0.0, + "step": 48920 + }, + { + "epoch": 10.627715030408341, + "grad_norm": 0.00022567623818758875, + "learning_rate": 5.7296191717347235e-06, + "loss": 0.0, + "step": 48930 + }, + { + "epoch": 10.629887054735013, + "grad_norm": 0.00022049974359106272, + "learning_rate": 5.720569070373588e-06, + "loss": 0.0085, + "step": 48940 + }, + { + "epoch": 10.632059079061685, + "grad_norm": 0.0002641979663167149, + "learning_rate": 5.711518969012453e-06, + "loss": 0.0, + "step": 48950 + }, + { + "epoch": 10.634231103388357, + "grad_norm": 0.0002207313955295831, + "learning_rate": 5.702468867651318e-06, + "loss": 0.0, + "step": 48960 + }, + { + "epoch": 10.636403127715031, + "grad_norm": 0.00021989879314787686, + "learning_rate": 5.693418766290183e-06, + "loss": 0.0, + "step": 48970 + }, + { + "epoch": 10.638575152041703, + "grad_norm": 0.00022552079462911934, + "learning_rate": 5.684368664929048e-06, + "loss": 0.0, + "step": 48980 + }, + { + "epoch": 10.640747176368375, + "grad_norm": 0.00022579723736271262, + "learning_rate": 5.675318563567912e-06, + "loss": 0.0049, + "step": 48990 + }, + { + "epoch": 10.642919200695047, + "grad_norm": 0.0003467233618721366, + "learning_rate": 5.666268462206777e-06, + "loss": 0.0, + "step": 49000 + }, + { + "epoch": 10.64509122502172, + "grad_norm": 0.00022149008873384446, + "learning_rate": 5.657218360845642e-06, + "loss": 0.0, + "step": 49010 + }, + { + "epoch": 10.647263249348393, + "grad_norm": 0.0002353919408051297, + "learning_rate": 5.648168259484506e-06, + "loss": 0.0, + "step": 49020 + }, + { + "epoch": 10.649435273675065, + "grad_norm": 0.00022022766643203795, + "learning_rate": 5.639118158123371e-06, + "loss": 0.0, + "step": 49030 + }, + { + "epoch": 10.651607298001737, + "grad_norm": 0.00022060348419472575, + "learning_rate": 5.630068056762236e-06, + "loss": 0.0, + "step": 49040 + }, + { + "epoch": 10.65377932232841, + "grad_norm": 0.00022515558521263301, + "learning_rate": 5.621017955401101e-06, + "loss": 0.0, + "step": 49050 + }, + { + "epoch": 10.655951346655083, + "grad_norm": 0.00022750017524231225, + "learning_rate": 5.611967854039966e-06, + "loss": 0.0, + "step": 49060 + }, + { + "epoch": 10.658123370981755, + "grad_norm": 0.00023040255473461002, + "learning_rate": 5.60291775267883e-06, + "loss": 0.0, + "step": 49070 + }, + { + "epoch": 10.660295395308427, + "grad_norm": 0.0002275063015986234, + "learning_rate": 5.5938676513176945e-06, + "loss": 0.0, + "step": 49080 + }, + { + "epoch": 10.6624674196351, + "grad_norm": 0.00023775036970619112, + "learning_rate": 5.58481754995656e-06, + "loss": 0.0, + "step": 49090 + }, + { + "epoch": 10.664639443961772, + "grad_norm": 0.0002341943036299199, + "learning_rate": 5.575767448595425e-06, + "loss": 0.0, + "step": 49100 + }, + { + "epoch": 10.666811468288445, + "grad_norm": 0.00022713349608238786, + "learning_rate": 5.566717347234289e-06, + "loss": 0.0, + "step": 49110 + }, + { + "epoch": 10.668983492615117, + "grad_norm": 0.00022505798551719636, + "learning_rate": 5.5576672458731544e-06, + "loss": 0.0047, + "step": 49120 + }, + { + "epoch": 10.67115551694179, + "grad_norm": 0.0002189160732086748, + "learning_rate": 5.548617144512019e-06, + "loss": 0.0, + "step": 49130 + }, + { + "epoch": 10.673327541268462, + "grad_norm": 0.00022511309362016618, + "learning_rate": 5.539567043150883e-06, + "loss": 0.0, + "step": 49140 + }, + { + "epoch": 10.675499565595135, + "grad_norm": 0.0002203310577897355, + "learning_rate": 5.530516941789748e-06, + "loss": 0.0, + "step": 49150 + }, + { + "epoch": 10.677671589921808, + "grad_norm": 0.0002233179402537644, + "learning_rate": 5.521466840428613e-06, + "loss": 0.0049, + "step": 49160 + }, + { + "epoch": 10.67984361424848, + "grad_norm": 0.0003005561593454331, + "learning_rate": 5.512416739067478e-06, + "loss": 0.0, + "step": 49170 + }, + { + "epoch": 10.682015638575152, + "grad_norm": 0.00022120712674222887, + "learning_rate": 5.503366637706343e-06, + "loss": 0.0, + "step": 49180 + }, + { + "epoch": 10.684187662901824, + "grad_norm": 0.00033629994140937924, + "learning_rate": 5.494316536345207e-06, + "loss": 0.0043, + "step": 49190 + }, + { + "epoch": 10.686359687228498, + "grad_norm": 0.00021915044635534286, + "learning_rate": 5.4852664349840725e-06, + "loss": 0.0049, + "step": 49200 + }, + { + "epoch": 10.68853171155517, + "grad_norm": 0.0002180044393753633, + "learning_rate": 5.476216333622937e-06, + "loss": 0.0, + "step": 49210 + }, + { + "epoch": 10.690703735881842, + "grad_norm": 0.00023001058434601873, + "learning_rate": 5.467166232261801e-06, + "loss": 0.0, + "step": 49220 + }, + { + "epoch": 10.692875760208514, + "grad_norm": 0.00022796729172114283, + "learning_rate": 5.458116130900666e-06, + "loss": 0.0, + "step": 49230 + }, + { + "epoch": 10.695047784535188, + "grad_norm": 0.00021778048539999872, + "learning_rate": 5.449066029539531e-06, + "loss": 0.0045, + "step": 49240 + }, + { + "epoch": 10.69721980886186, + "grad_norm": 0.00022593970061279833, + "learning_rate": 5.440015928178396e-06, + "loss": 0.0, + "step": 49250 + }, + { + "epoch": 10.699391833188532, + "grad_norm": 0.00022018144954927266, + "learning_rate": 5.430965826817261e-06, + "loss": 0.0, + "step": 49260 + }, + { + "epoch": 10.701563857515204, + "grad_norm": 0.00022087100660428405, + "learning_rate": 5.4219157254561254e-06, + "loss": 0.0, + "step": 49270 + }, + { + "epoch": 10.703735881841876, + "grad_norm": 0.00022406043717637658, + "learning_rate": 5.41286562409499e-06, + "loss": 0.0, + "step": 49280 + }, + { + "epoch": 10.70590790616855, + "grad_norm": 0.00021810720500070602, + "learning_rate": 5.403815522733855e-06, + "loss": 0.0045, + "step": 49290 + }, + { + "epoch": 10.708079930495222, + "grad_norm": 0.00022153431200422347, + "learning_rate": 5.394765421372719e-06, + "loss": 0.0041, + "step": 49300 + }, + { + "epoch": 10.710251954821894, + "grad_norm": 0.00021901496802456677, + "learning_rate": 5.3857153200115845e-06, + "loss": 0.0, + "step": 49310 + }, + { + "epoch": 10.712423979148566, + "grad_norm": 0.00022114437888376415, + "learning_rate": 5.37666521865045e-06, + "loss": 0.0101, + "step": 49320 + }, + { + "epoch": 10.714596003475238, + "grad_norm": 0.00022218165395315737, + "learning_rate": 5.367615117289314e-06, + "loss": 0.0045, + "step": 49330 + }, + { + "epoch": 10.716768027801912, + "grad_norm": 0.00023637167760170996, + "learning_rate": 5.358565015928178e-06, + "loss": 0.0, + "step": 49340 + }, + { + "epoch": 10.718940052128584, + "grad_norm": 0.00023195294488687068, + "learning_rate": 5.3495149145670435e-06, + "loss": 0.0, + "step": 49350 + }, + { + "epoch": 10.721112076455256, + "grad_norm": 0.0002186178317060694, + "learning_rate": 5.340464813205908e-06, + "loss": 0.0, + "step": 49360 + }, + { + "epoch": 10.723284100781928, + "grad_norm": 0.0003002184384968132, + "learning_rate": 5.331414711844773e-06, + "loss": 0.0, + "step": 49370 + }, + { + "epoch": 10.725456125108602, + "grad_norm": 0.00022116424224805087, + "learning_rate": 5.322364610483637e-06, + "loss": 0.0, + "step": 49380 + }, + { + "epoch": 10.727628149435274, + "grad_norm": 0.0002234865096397698, + "learning_rate": 5.313314509122503e-06, + "loss": 0.0, + "step": 49390 + }, + { + "epoch": 10.729800173761946, + "grad_norm": 0.00022370461374521255, + "learning_rate": 5.304264407761368e-06, + "loss": 0.0044, + "step": 49400 + }, + { + "epoch": 10.731972198088618, + "grad_norm": 0.00022042910859454423, + "learning_rate": 5.295214306400232e-06, + "loss": 0.0, + "step": 49410 + }, + { + "epoch": 10.73414422241529, + "grad_norm": 0.00028280200785957277, + "learning_rate": 5.2861642050390965e-06, + "loss": 0.0, + "step": 49420 + }, + { + "epoch": 10.736316246741964, + "grad_norm": 0.00021926072076894343, + "learning_rate": 5.277114103677962e-06, + "loss": 0.0, + "step": 49430 + }, + { + "epoch": 10.738488271068636, + "grad_norm": 0.00023302929184865206, + "learning_rate": 5.268064002316826e-06, + "loss": 0.0, + "step": 49440 + }, + { + "epoch": 10.740660295395308, + "grad_norm": 0.0002172905660700053, + "learning_rate": 5.25901390095569e-06, + "loss": 0.0, + "step": 49450 + }, + { + "epoch": 10.74283231972198, + "grad_norm": 0.00022836528660263866, + "learning_rate": 5.249963799594556e-06, + "loss": 0.0, + "step": 49460 + }, + { + "epoch": 10.745004344048652, + "grad_norm": 0.00032331692636944354, + "learning_rate": 5.240913698233421e-06, + "loss": 0.004, + "step": 49470 + }, + { + "epoch": 10.747176368375326, + "grad_norm": 0.0002181615273002535, + "learning_rate": 5.231863596872285e-06, + "loss": 0.0048, + "step": 49480 + }, + { + "epoch": 10.749348392701998, + "grad_norm": 0.00029964291024953127, + "learning_rate": 5.22281349551115e-06, + "loss": 0.0, + "step": 49490 + }, + { + "epoch": 10.75152041702867, + "grad_norm": 0.00022976213949732482, + "learning_rate": 5.2137633941500146e-06, + "loss": 0.0, + "step": 49500 + }, + { + "epoch": 10.753692441355343, + "grad_norm": 0.00021645518427249044, + "learning_rate": 5.204713292788879e-06, + "loss": 0.0, + "step": 49510 + }, + { + "epoch": 10.755864465682016, + "grad_norm": 0.00021738260693382472, + "learning_rate": 5.195663191427744e-06, + "loss": 0.0, + "step": 49520 + }, + { + "epoch": 10.758036490008688, + "grad_norm": 0.00022163652465678751, + "learning_rate": 5.186613090066609e-06, + "loss": 0.0, + "step": 49530 + }, + { + "epoch": 10.76020851433536, + "grad_norm": 0.00026377089670859277, + "learning_rate": 5.177562988705474e-06, + "loss": 0.0, + "step": 49540 + }, + { + "epoch": 10.762380538662033, + "grad_norm": 0.00021624031069222838, + "learning_rate": 5.168512887344339e-06, + "loss": 0.0, + "step": 49550 + }, + { + "epoch": 10.764552562988705, + "grad_norm": 0.00021584972273558378, + "learning_rate": 5.159462785983203e-06, + "loss": 0.0, + "step": 49560 + }, + { + "epoch": 10.766724587315379, + "grad_norm": 0.0002195909182773903, + "learning_rate": 5.150412684622068e-06, + "loss": 0.0, + "step": 49570 + }, + { + "epoch": 10.76889661164205, + "grad_norm": 0.00021642334468197078, + "learning_rate": 5.141362583260933e-06, + "loss": 0.0, + "step": 49580 + }, + { + "epoch": 10.771068635968723, + "grad_norm": 0.00022419106971938163, + "learning_rate": 5.132312481899797e-06, + "loss": 0.0, + "step": 49590 + }, + { + "epoch": 10.773240660295395, + "grad_norm": 0.0002185798075515777, + "learning_rate": 5.123262380538662e-06, + "loss": 0.0, + "step": 49600 + }, + { + "epoch": 10.775412684622069, + "grad_norm": 0.00021758888033218682, + "learning_rate": 5.114212279177527e-06, + "loss": 0.0, + "step": 49610 + }, + { + "epoch": 10.77758470894874, + "grad_norm": 0.00030351977329701185, + "learning_rate": 5.105162177816392e-06, + "loss": 0.0, + "step": 49620 + }, + { + "epoch": 10.779756733275413, + "grad_norm": 0.0002139311982318759, + "learning_rate": 5.096112076455257e-06, + "loss": 0.0, + "step": 49630 + }, + { + "epoch": 10.781928757602085, + "grad_norm": 0.00025567892589606345, + "learning_rate": 5.087061975094121e-06, + "loss": 0.0, + "step": 49640 + }, + { + "epoch": 10.784100781928757, + "grad_norm": 0.0002477488887961954, + "learning_rate": 5.0780118737329856e-06, + "loss": 0.0, + "step": 49650 + }, + { + "epoch": 10.78627280625543, + "grad_norm": 0.0002187481295550242, + "learning_rate": 5.068961772371851e-06, + "loss": 0.0042, + "step": 49660 + }, + { + "epoch": 10.788444830582103, + "grad_norm": 0.0002477424859534949, + "learning_rate": 5.059911671010716e-06, + "loss": 0.0051, + "step": 49670 + }, + { + "epoch": 10.790616854908775, + "grad_norm": 0.0002899961546063423, + "learning_rate": 5.05086156964958e-06, + "loss": 0.0, + "step": 49680 + }, + { + "epoch": 10.792788879235447, + "grad_norm": 0.00028191893943585455, + "learning_rate": 5.0418114682884455e-06, + "loss": 0.0, + "step": 49690 + }, + { + "epoch": 10.79496090356212, + "grad_norm": 0.0002191022940678522, + "learning_rate": 5.03276136692731e-06, + "loss": 0.0, + "step": 49700 + }, + { + "epoch": 10.797132927888793, + "grad_norm": 0.0002154961839551106, + "learning_rate": 5.023711265566174e-06, + "loss": 0.0, + "step": 49710 + }, + { + "epoch": 10.799304952215465, + "grad_norm": 0.0002220691676484421, + "learning_rate": 5.014661164205039e-06, + "loss": 0.0, + "step": 49720 + }, + { + "epoch": 10.801476976542137, + "grad_norm": 0.00021827526506967843, + "learning_rate": 5.005611062843904e-06, + "loss": 0.0, + "step": 49730 + }, + { + "epoch": 10.803649000868809, + "grad_norm": 0.002871220000088215, + "learning_rate": 4.996560961482769e-06, + "loss": 0.0, + "step": 49740 + }, + { + "epoch": 10.805821025195483, + "grad_norm": 0.00022947440447751433, + "learning_rate": 4.987510860121634e-06, + "loss": 0.0, + "step": 49750 + }, + { + "epoch": 10.807993049522155, + "grad_norm": 0.00021768937585875392, + "learning_rate": 4.978460758760498e-06, + "loss": 0.0, + "step": 49760 + }, + { + "epoch": 10.810165073848827, + "grad_norm": 0.00021608640963677317, + "learning_rate": 4.9694106573993636e-06, + "loss": 0.0, + "step": 49770 + }, + { + "epoch": 10.8123370981755, + "grad_norm": 0.0002164438192266971, + "learning_rate": 4.960360556038228e-06, + "loss": 0.0, + "step": 49780 + }, + { + "epoch": 10.814509122502171, + "grad_norm": 0.00021347503934521228, + "learning_rate": 4.951310454677092e-06, + "loss": 0.0, + "step": 49790 + }, + { + "epoch": 10.816681146828845, + "grad_norm": 0.00025063398061320186, + "learning_rate": 4.9422603533159574e-06, + "loss": 0.0, + "step": 49800 + }, + { + "epoch": 10.818853171155517, + "grad_norm": 0.00022577929485123605, + "learning_rate": 4.933210251954823e-06, + "loss": 0.0, + "step": 49810 + }, + { + "epoch": 10.82102519548219, + "grad_norm": 0.0003572820278350264, + "learning_rate": 4.924160150593687e-06, + "loss": 0.0, + "step": 49820 + }, + { + "epoch": 10.823197219808861, + "grad_norm": 0.00021640605700667948, + "learning_rate": 4.915110049232552e-06, + "loss": 0.0041, + "step": 49830 + }, + { + "epoch": 10.825369244135535, + "grad_norm": 0.00021510386432055384, + "learning_rate": 4.9060599478714165e-06, + "loss": 0.0, + "step": 49840 + }, + { + "epoch": 10.827541268462207, + "grad_norm": 0.0002934297954197973, + "learning_rate": 4.897009846510281e-06, + "loss": 0.0, + "step": 49850 + }, + { + "epoch": 10.82971329278888, + "grad_norm": 0.16740760207176208, + "learning_rate": 4.887959745149146e-06, + "loss": 0.005, + "step": 49860 + }, + { + "epoch": 10.831885317115551, + "grad_norm": 0.0002762663352768868, + "learning_rate": 4.87890964378801e-06, + "loss": 0.0, + "step": 49870 + }, + { + "epoch": 10.834057341442223, + "grad_norm": 0.00021649140398949385, + "learning_rate": 4.8698595424268755e-06, + "loss": 0.0, + "step": 49880 + }, + { + "epoch": 10.836229365768897, + "grad_norm": 0.00021669291891157627, + "learning_rate": 4.860809441065741e-06, + "loss": 0.0, + "step": 49890 + }, + { + "epoch": 10.83840139009557, + "grad_norm": 0.0002275826846016571, + "learning_rate": 4.851759339704605e-06, + "loss": 0.0, + "step": 49900 + }, + { + "epoch": 10.840573414422241, + "grad_norm": 0.00021657983597833663, + "learning_rate": 4.842709238343469e-06, + "loss": 0.0, + "step": 49910 + }, + { + "epoch": 10.842745438748914, + "grad_norm": 0.00029274701955728233, + "learning_rate": 4.833659136982335e-06, + "loss": 0.0, + "step": 49920 + }, + { + "epoch": 10.844917463075586, + "grad_norm": 0.00021872835350222886, + "learning_rate": 4.824609035621199e-06, + "loss": 0.0, + "step": 49930 + }, + { + "epoch": 10.84708948740226, + "grad_norm": 0.00033134184195660055, + "learning_rate": 4.815558934260063e-06, + "loss": 0.0, + "step": 49940 + }, + { + "epoch": 10.849261511728931, + "grad_norm": 0.0002122445439454168, + "learning_rate": 4.8065088328989284e-06, + "loss": 0.0, + "step": 49950 + }, + { + "epoch": 10.851433536055604, + "grad_norm": 0.00027742431848309934, + "learning_rate": 4.797458731537794e-06, + "loss": 0.0, + "step": 49960 + }, + { + "epoch": 10.853605560382276, + "grad_norm": 0.0002131950604962185, + "learning_rate": 4.788408630176658e-06, + "loss": 0.0, + "step": 49970 + }, + { + "epoch": 10.85577758470895, + "grad_norm": 0.00021493675012607127, + "learning_rate": 4.779358528815523e-06, + "loss": 0.0, + "step": 49980 + }, + { + "epoch": 10.857949609035622, + "grad_norm": 0.00021079306316096336, + "learning_rate": 4.7703084274543875e-06, + "loss": 0.0041, + "step": 49990 + }, + { + "epoch": 10.860121633362294, + "grad_norm": 0.00021532582468353212, + "learning_rate": 4.761258326093253e-06, + "loss": 0.0, + "step": 50000 + }, + { + "epoch": 10.862293657688966, + "grad_norm": 0.00021149902022443712, + "learning_rate": 4.752208224732117e-06, + "loss": 0.0, + "step": 50010 + }, + { + "epoch": 10.864465682015638, + "grad_norm": 0.00021205766825005412, + "learning_rate": 4.743158123370982e-06, + "loss": 0.0, + "step": 50020 + }, + { + "epoch": 10.866637706342312, + "grad_norm": 0.00030917723779566586, + "learning_rate": 4.734108022009847e-06, + "loss": 0.0043, + "step": 50030 + }, + { + "epoch": 10.868809730668984, + "grad_norm": 0.0002103663864545524, + "learning_rate": 4.725057920648712e-06, + "loss": 0.0, + "step": 50040 + }, + { + "epoch": 10.870981754995656, + "grad_norm": 0.00021224647935014218, + "learning_rate": 4.716007819287576e-06, + "loss": 0.0051, + "step": 50050 + }, + { + "epoch": 10.873153779322328, + "grad_norm": 0.00035064792609773576, + "learning_rate": 4.706957717926441e-06, + "loss": 0.005, + "step": 50060 + }, + { + "epoch": 10.875325803649002, + "grad_norm": 0.00020956051594112068, + "learning_rate": 4.697907616565306e-06, + "loss": 0.0, + "step": 50070 + }, + { + "epoch": 10.877497827975674, + "grad_norm": 0.00021312307217158377, + "learning_rate": 4.68885751520417e-06, + "loss": 0.0, + "step": 50080 + }, + { + "epoch": 10.879669852302346, + "grad_norm": 0.0002734953013714403, + "learning_rate": 4.679807413843035e-06, + "loss": 0.0, + "step": 50090 + }, + { + "epoch": 10.881841876629018, + "grad_norm": 0.00021291685698088259, + "learning_rate": 4.6707573124819e-06, + "loss": 0.0, + "step": 50100 + }, + { + "epoch": 10.88401390095569, + "grad_norm": 0.0002116796822519973, + "learning_rate": 4.661707211120765e-06, + "loss": 0.0, + "step": 50110 + }, + { + "epoch": 10.886185925282364, + "grad_norm": 0.00021584934438578784, + "learning_rate": 4.65265710975963e-06, + "loss": 0.0, + "step": 50120 + }, + { + "epoch": 10.888357949609036, + "grad_norm": 0.00021120175370015204, + "learning_rate": 4.643607008398494e-06, + "loss": 0.0, + "step": 50130 + }, + { + "epoch": 10.890529973935708, + "grad_norm": 0.00021197435853537172, + "learning_rate": 4.6345569070373585e-06, + "loss": 0.0, + "step": 50140 + }, + { + "epoch": 10.89270199826238, + "grad_norm": 0.00021428019681479782, + "learning_rate": 4.625506805676224e-06, + "loss": 0.0, + "step": 50150 + }, + { + "epoch": 10.894874022589054, + "grad_norm": 0.00020921516988892108, + "learning_rate": 4.616456704315088e-06, + "loss": 0.0047, + "step": 50160 + }, + { + "epoch": 10.897046046915726, + "grad_norm": 0.00027560058515518904, + "learning_rate": 4.607406602953953e-06, + "loss": 0.0, + "step": 50170 + }, + { + "epoch": 10.899218071242398, + "grad_norm": 0.00023641523148398846, + "learning_rate": 4.598356501592818e-06, + "loss": 0.0, + "step": 50180 + }, + { + "epoch": 10.90139009556907, + "grad_norm": 0.000214401152334176, + "learning_rate": 4.589306400231683e-06, + "loss": 0.0, + "step": 50190 + }, + { + "epoch": 10.903562119895742, + "grad_norm": 0.0002733489091042429, + "learning_rate": 4.580256298870548e-06, + "loss": 0.0044, + "step": 50200 + }, + { + "epoch": 10.905734144222416, + "grad_norm": 0.00022136476763989776, + "learning_rate": 4.571206197509412e-06, + "loss": 0.0, + "step": 50210 + }, + { + "epoch": 10.907906168549088, + "grad_norm": 0.0002093520452035591, + "learning_rate": 4.562156096148277e-06, + "loss": 0.0, + "step": 50220 + }, + { + "epoch": 10.91007819287576, + "grad_norm": 0.0002700627373997122, + "learning_rate": 4.553105994787142e-06, + "loss": 0.0, + "step": 50230 + }, + { + "epoch": 10.912250217202432, + "grad_norm": 0.00028141363873146474, + "learning_rate": 4.544055893426007e-06, + "loss": 0.0, + "step": 50240 + }, + { + "epoch": 10.914422241529104, + "grad_norm": 0.00021085295884404331, + "learning_rate": 4.535005792064871e-06, + "loss": 0.0043, + "step": 50250 + }, + { + "epoch": 10.916594265855778, + "grad_norm": 0.00021086714696139097, + "learning_rate": 4.5259556907037365e-06, + "loss": 0.0, + "step": 50260 + }, + { + "epoch": 10.91876629018245, + "grad_norm": 0.00020958646200597286, + "learning_rate": 4.516905589342601e-06, + "loss": 0.0, + "step": 50270 + }, + { + "epoch": 10.920938314509122, + "grad_norm": 0.00021219199697952718, + "learning_rate": 4.507855487981465e-06, + "loss": 0.0141, + "step": 50280 + }, + { + "epoch": 10.923110338835794, + "grad_norm": 0.0002100965939462185, + "learning_rate": 4.49880538662033e-06, + "loss": 0.0, + "step": 50290 + }, + { + "epoch": 10.925282363162466, + "grad_norm": 0.00022129624267108738, + "learning_rate": 4.489755285259195e-06, + "loss": 0.0, + "step": 50300 + }, + { + "epoch": 10.92745438748914, + "grad_norm": 0.000218289002077654, + "learning_rate": 4.48070518389806e-06, + "loss": 0.0, + "step": 50310 + }, + { + "epoch": 10.929626411815812, + "grad_norm": 0.0002210328821092844, + "learning_rate": 4.471655082536925e-06, + "loss": 0.0, + "step": 50320 + }, + { + "epoch": 10.931798436142484, + "grad_norm": 0.00021514331456273794, + "learning_rate": 4.4626049811757894e-06, + "loss": 0.0, + "step": 50330 + }, + { + "epoch": 10.933970460469157, + "grad_norm": 0.00020960548135917634, + "learning_rate": 4.453554879814654e-06, + "loss": 0.0, + "step": 50340 + }, + { + "epoch": 10.93614248479583, + "grad_norm": 0.0002283220092067495, + "learning_rate": 4.444504778453519e-06, + "loss": 0.0, + "step": 50350 + }, + { + "epoch": 10.938314509122502, + "grad_norm": 0.0002141373261110857, + "learning_rate": 4.435454677092383e-06, + "loss": 0.0, + "step": 50360 + }, + { + "epoch": 10.940486533449175, + "grad_norm": 0.00021023667068220675, + "learning_rate": 4.4264045757312485e-06, + "loss": 0.0, + "step": 50370 + }, + { + "epoch": 10.942658557775847, + "grad_norm": 0.00021123423357494175, + "learning_rate": 4.417354474370114e-06, + "loss": 0.0, + "step": 50380 + }, + { + "epoch": 10.944830582102519, + "grad_norm": 0.00023207410413306206, + "learning_rate": 4.408304373008978e-06, + "loss": 0.0042, + "step": 50390 + }, + { + "epoch": 10.947002606429193, + "grad_norm": 0.00020956066146027297, + "learning_rate": 4.399254271647843e-06, + "loss": 0.0, + "step": 50400 + }, + { + "epoch": 10.949174630755865, + "grad_norm": 0.0002111865032929927, + "learning_rate": 4.3902041702867075e-06, + "loss": 0.0, + "step": 50410 + }, + { + "epoch": 10.951346655082537, + "grad_norm": 0.0002907202870119363, + "learning_rate": 4.381154068925572e-06, + "loss": 0.0, + "step": 50420 + }, + { + "epoch": 10.953518679409209, + "grad_norm": 0.00020739728643093258, + "learning_rate": 4.372103967564437e-06, + "loss": 0.0, + "step": 50430 + }, + { + "epoch": 10.955690703735883, + "grad_norm": 0.00033158701262436807, + "learning_rate": 4.363053866203301e-06, + "loss": 0.0, + "step": 50440 + }, + { + "epoch": 10.957862728062555, + "grad_norm": 0.00020972038328181952, + "learning_rate": 4.354003764842167e-06, + "loss": 0.0, + "step": 50450 + }, + { + "epoch": 10.960034752389227, + "grad_norm": 0.00020969909382984042, + "learning_rate": 4.344953663481032e-06, + "loss": 0.0, + "step": 50460 + }, + { + "epoch": 10.962206776715899, + "grad_norm": 0.00020956064690835774, + "learning_rate": 4.335903562119896e-06, + "loss": 0.0041, + "step": 50470 + }, + { + "epoch": 10.964378801042571, + "grad_norm": 0.0002089909539790824, + "learning_rate": 4.3268534607587604e-06, + "loss": 0.0, + "step": 50480 + }, + { + "epoch": 10.966550825369245, + "grad_norm": 0.00020743778441101313, + "learning_rate": 4.317803359397626e-06, + "loss": 0.0, + "step": 50490 + }, + { + "epoch": 10.968722849695917, + "grad_norm": 0.00020597422553692013, + "learning_rate": 4.30875325803649e-06, + "loss": 0.0, + "step": 50500 + }, + { + "epoch": 10.970894874022589, + "grad_norm": 0.0002715985174290836, + "learning_rate": 4.299703156675354e-06, + "loss": 0.0, + "step": 50510 + }, + { + "epoch": 10.973066898349261, + "grad_norm": 0.0002085827582050115, + "learning_rate": 4.2906530553142195e-06, + "loss": 0.0, + "step": 50520 + }, + { + "epoch": 10.975238922675935, + "grad_norm": 0.00021042587468400598, + "learning_rate": 4.281602953953085e-06, + "loss": 0.0, + "step": 50530 + }, + { + "epoch": 10.977410947002607, + "grad_norm": 0.0002095496020046994, + "learning_rate": 4.272552852591949e-06, + "loss": 0.0, + "step": 50540 + }, + { + "epoch": 10.979582971329279, + "grad_norm": 0.00026932769105769694, + "learning_rate": 4.263502751230814e-06, + "loss": 0.0, + "step": 50550 + }, + { + "epoch": 10.981754995655951, + "grad_norm": 0.00020780177146662027, + "learning_rate": 4.2544526498696785e-06, + "loss": 0.0, + "step": 50560 + }, + { + "epoch": 10.983927019982623, + "grad_norm": 0.00031749060144647956, + "learning_rate": 4.245402548508543e-06, + "loss": 0.0, + "step": 50570 + }, + { + "epoch": 10.986099044309297, + "grad_norm": 0.0002206074568675831, + "learning_rate": 4.236352447147408e-06, + "loss": 0.0, + "step": 50580 + }, + { + "epoch": 10.988271068635969, + "grad_norm": 0.00020820109057240188, + "learning_rate": 4.227302345786273e-06, + "loss": 0.0048, + "step": 50590 + }, + { + "epoch": 10.990443092962641, + "grad_norm": 0.00021198611648287624, + "learning_rate": 4.2182522444251384e-06, + "loss": 0.0, + "step": 50600 + }, + { + "epoch": 10.992615117289313, + "grad_norm": 0.00020806727115996182, + "learning_rate": 4.209202143064003e-06, + "loss": 0.0, + "step": 50610 + }, + { + "epoch": 10.994787141615987, + "grad_norm": 0.00027271060389466584, + "learning_rate": 4.200152041702867e-06, + "loss": 0.0, + "step": 50620 + }, + { + "epoch": 10.996959165942659, + "grad_norm": 0.00020815835159737617, + "learning_rate": 4.191101940341732e-06, + "loss": 0.0, + "step": 50630 + }, + { + "epoch": 10.999131190269331, + "grad_norm": 0.00021419930271804333, + "learning_rate": 4.182051838980597e-06, + "loss": 0.0, + "step": 50640 + }, + { + "epoch": 11.0, + "eval_f1": 0.6240601503759399, + "eval_loss": 0.09284297376871109, + "eval_runtime": 84.1266, + "eval_samples_per_second": 118.571, + "eval_steps_per_second": 7.417, + "step": 50644 + }, + { + "epoch": 11.001303214596003, + "grad_norm": 0.09123075008392334, + "learning_rate": 4.173001737619461e-06, + "loss": 0.0084, + "step": 50650 + }, + { + "epoch": 11.003475238922675, + "grad_norm": 0.00022338244889397174, + "learning_rate": 4.163951636258326e-06, + "loss": 0.0042, + "step": 50660 + }, + { + "epoch": 11.00564726324935, + "grad_norm": 0.0002102691651089117, + "learning_rate": 4.154901534897191e-06, + "loss": 0.0, + "step": 50670 + }, + { + "epoch": 11.007819287576021, + "grad_norm": 0.00021717610070481896, + "learning_rate": 4.145851433536056e-06, + "loss": 0.0049, + "step": 50680 + }, + { + "epoch": 11.009991311902693, + "grad_norm": 0.0002068415778921917, + "learning_rate": 4.136801332174921e-06, + "loss": 0.0, + "step": 50690 + }, + { + "epoch": 11.012163336229365, + "grad_norm": 0.00020632539235521108, + "learning_rate": 4.127751230813785e-06, + "loss": 0.0, + "step": 50700 + }, + { + "epoch": 11.014335360556037, + "grad_norm": 0.00022067526879254729, + "learning_rate": 4.1187011294526496e-06, + "loss": 0.0, + "step": 50710 + }, + { + "epoch": 11.016507384882711, + "grad_norm": 0.00027272888110019267, + "learning_rate": 4.109651028091515e-06, + "loss": 0.0, + "step": 50720 + }, + { + "epoch": 11.018679409209383, + "grad_norm": 0.0002099119737977162, + "learning_rate": 4.10060092673038e-06, + "loss": 0.0, + "step": 50730 + }, + { + "epoch": 11.020851433536055, + "grad_norm": 0.00020631964434869587, + "learning_rate": 4.091550825369244e-06, + "loss": 0.0041, + "step": 50740 + }, + { + "epoch": 11.023023457862728, + "grad_norm": 0.00020822268561460078, + "learning_rate": 4.0825007240081095e-06, + "loss": 0.0, + "step": 50750 + }, + { + "epoch": 11.025195482189401, + "grad_norm": 0.00020695666898973286, + "learning_rate": 4.073450622646974e-06, + "loss": 0.0, + "step": 50760 + }, + { + "epoch": 11.027367506516073, + "grad_norm": 0.00021030766947660595, + "learning_rate": 4.064400521285838e-06, + "loss": 0.0041, + "step": 50770 + }, + { + "epoch": 11.029539530842746, + "grad_norm": 0.000298218394163996, + "learning_rate": 4.055350419924703e-06, + "loss": 0.0, + "step": 50780 + }, + { + "epoch": 11.031711555169418, + "grad_norm": 0.00020447710994631052, + "learning_rate": 4.046300318563568e-06, + "loss": 0.0, + "step": 50790 + }, + { + "epoch": 11.03388357949609, + "grad_norm": 0.00021559254673775285, + "learning_rate": 4.037250217202433e-06, + "loss": 0.0, + "step": 50800 + }, + { + "epoch": 11.036055603822764, + "grad_norm": 0.0002991893270518631, + "learning_rate": 4.028200115841298e-06, + "loss": 0.0, + "step": 50810 + }, + { + "epoch": 11.038227628149436, + "grad_norm": 0.00020672754908446223, + "learning_rate": 4.019150014480162e-06, + "loss": 0.0, + "step": 50820 + }, + { + "epoch": 11.040399652476108, + "grad_norm": 0.00020618090638890862, + "learning_rate": 4.0100999131190276e-06, + "loss": 0.0047, + "step": 50830 + }, + { + "epoch": 11.04257167680278, + "grad_norm": 0.0002046554145636037, + "learning_rate": 4.001049811757892e-06, + "loss": 0.0, + "step": 50840 + }, + { + "epoch": 11.044743701129452, + "grad_norm": 0.00021028323681093752, + "learning_rate": 3.991999710396756e-06, + "loss": 0.0, + "step": 50850 + }, + { + "epoch": 11.046915725456126, + "grad_norm": 0.0002057504461845383, + "learning_rate": 3.982949609035621e-06, + "loss": 0.0, + "step": 50860 + }, + { + "epoch": 11.049087749782798, + "grad_norm": 0.00020733078417833894, + "learning_rate": 3.973899507674486e-06, + "loss": 0.0, + "step": 50870 + }, + { + "epoch": 11.05125977410947, + "grad_norm": 0.00020432537712622434, + "learning_rate": 3.964849406313351e-06, + "loss": 0.0, + "step": 50880 + }, + { + "epoch": 11.053431798436142, + "grad_norm": 0.0002050166658591479, + "learning_rate": 3.955799304952216e-06, + "loss": 0.0, + "step": 50890 + }, + { + "epoch": 11.055603822762816, + "grad_norm": 0.00023185595637187362, + "learning_rate": 3.9467492035910805e-06, + "loss": 0.0, + "step": 50900 + }, + { + "epoch": 11.057775847089488, + "grad_norm": 0.0003302676195744425, + "learning_rate": 3.937699102229945e-06, + "loss": 0.0, + "step": 50910 + }, + { + "epoch": 11.05994787141616, + "grad_norm": 0.0002059488178929314, + "learning_rate": 3.92864900086881e-06, + "loss": 0.0044, + "step": 50920 + }, + { + "epoch": 11.062119895742832, + "grad_norm": 0.0002076542004942894, + "learning_rate": 3.919598899507674e-06, + "loss": 0.0, + "step": 50930 + }, + { + "epoch": 11.064291920069504, + "grad_norm": 0.00020699271408375353, + "learning_rate": 3.9105487981465395e-06, + "loss": 0.0, + "step": 50940 + }, + { + "epoch": 11.066463944396178, + "grad_norm": 0.0002058657701127231, + "learning_rate": 3.901498696785405e-06, + "loss": 0.0, + "step": 50950 + }, + { + "epoch": 11.06863596872285, + "grad_norm": 0.0002156527916667983, + "learning_rate": 3.892448595424269e-06, + "loss": 0.0, + "step": 50960 + }, + { + "epoch": 11.070807993049522, + "grad_norm": 0.000269353884505108, + "learning_rate": 3.883398494063133e-06, + "loss": 0.0, + "step": 50970 + }, + { + "epoch": 11.072980017376194, + "grad_norm": 0.0002053501084446907, + "learning_rate": 3.8743483927019986e-06, + "loss": 0.0, + "step": 50980 + }, + { + "epoch": 11.075152041702868, + "grad_norm": 0.0002061450359178707, + "learning_rate": 3.865298291340863e-06, + "loss": 0.0, + "step": 50990 + }, + { + "epoch": 11.07732406602954, + "grad_norm": 0.00020651152590289712, + "learning_rate": 3.856248189979728e-06, + "loss": 0.0, + "step": 51000 + }, + { + "epoch": 11.079496090356212, + "grad_norm": 0.00026513266493566334, + "learning_rate": 3.8471980886185924e-06, + "loss": 0.0, + "step": 51010 + }, + { + "epoch": 11.081668114682884, + "grad_norm": 0.00023419792705681175, + "learning_rate": 3.838147987257458e-06, + "loss": 0.0, + "step": 51020 + }, + { + "epoch": 11.083840139009556, + "grad_norm": 0.00022187073773238808, + "learning_rate": 3.829097885896323e-06, + "loss": 0.0, + "step": 51030 + }, + { + "epoch": 11.08601216333623, + "grad_norm": 0.00020510748436208814, + "learning_rate": 3.820047784535187e-06, + "loss": 0.0, + "step": 51040 + }, + { + "epoch": 11.088184187662902, + "grad_norm": 0.0002099367993650958, + "learning_rate": 3.8109976831740515e-06, + "loss": 0.0, + "step": 51050 + }, + { + "epoch": 11.090356211989574, + "grad_norm": 0.0002047920279437676, + "learning_rate": 3.8019475818129167e-06, + "loss": 0.0, + "step": 51060 + }, + { + "epoch": 11.092528236316246, + "grad_norm": 0.00020407889678608626, + "learning_rate": 3.7928974804517814e-06, + "loss": 0.0, + "step": 51070 + }, + { + "epoch": 11.094700260642918, + "grad_norm": 0.00020646867051254958, + "learning_rate": 3.7838473790906458e-06, + "loss": 0.0, + "step": 51080 + }, + { + "epoch": 11.096872284969592, + "grad_norm": 0.0002043266867985949, + "learning_rate": 3.774797277729511e-06, + "loss": 0.0, + "step": 51090 + }, + { + "epoch": 11.099044309296264, + "grad_norm": 0.0002051582414424047, + "learning_rate": 3.7657471763683753e-06, + "loss": 0.0, + "step": 51100 + }, + { + "epoch": 11.101216333622936, + "grad_norm": 0.00020401063375175, + "learning_rate": 3.75669707500724e-06, + "loss": 0.0, + "step": 51110 + }, + { + "epoch": 11.103388357949608, + "grad_norm": 0.00020295396097935736, + "learning_rate": 3.7476469736461052e-06, + "loss": 0.0, + "step": 51120 + }, + { + "epoch": 11.105560382276282, + "grad_norm": 0.00020517785742413253, + "learning_rate": 3.7385968722849696e-06, + "loss": 0.0, + "step": 51130 + }, + { + "epoch": 11.107732406602954, + "grad_norm": 0.0002051791234407574, + "learning_rate": 3.7295467709238343e-06, + "loss": 0.0, + "step": 51140 + }, + { + "epoch": 11.109904430929626, + "grad_norm": 0.00026767002418637276, + "learning_rate": 3.7204966695626995e-06, + "loss": 0.0, + "step": 51150 + }, + { + "epoch": 11.112076455256299, + "grad_norm": 0.00020361509814392775, + "learning_rate": 3.711446568201564e-06, + "loss": 0.0, + "step": 51160 + }, + { + "epoch": 11.11424847958297, + "grad_norm": 0.0002157597045879811, + "learning_rate": 3.7023964668404286e-06, + "loss": 0.0, + "step": 51170 + }, + { + "epoch": 11.116420503909644, + "grad_norm": 0.00020699432934634387, + "learning_rate": 3.693346365479294e-06, + "loss": 0.0, + "step": 51180 + }, + { + "epoch": 11.118592528236316, + "grad_norm": 0.0002198000584030524, + "learning_rate": 3.684296264118158e-06, + "loss": 0.0, + "step": 51190 + }, + { + "epoch": 11.120764552562989, + "grad_norm": 0.14969320595264435, + "learning_rate": 3.6752461627570233e-06, + "loss": 0.0081, + "step": 51200 + }, + { + "epoch": 11.12293657688966, + "grad_norm": 0.00020602031145244837, + "learning_rate": 3.6661960613958877e-06, + "loss": 0.0, + "step": 51210 + }, + { + "epoch": 11.125108601216334, + "grad_norm": 0.00020917251822538674, + "learning_rate": 3.6571459600347524e-06, + "loss": 0.0, + "step": 51220 + }, + { + "epoch": 11.127280625543007, + "grad_norm": 0.0002067875029752031, + "learning_rate": 3.6480958586736176e-06, + "loss": 0.0, + "step": 51230 + }, + { + "epoch": 11.129452649869679, + "grad_norm": 0.00020278933516237885, + "learning_rate": 3.639045757312482e-06, + "loss": 0.0, + "step": 51240 + }, + { + "epoch": 11.13162467419635, + "grad_norm": 0.00020642305025830865, + "learning_rate": 3.6299956559513467e-06, + "loss": 0.0, + "step": 51250 + }, + { + "epoch": 11.133796698523023, + "grad_norm": 0.00020503332780208439, + "learning_rate": 3.620945554590212e-06, + "loss": 0.0072, + "step": 51260 + }, + { + "epoch": 11.135968722849697, + "grad_norm": 0.00020080467220395803, + "learning_rate": 3.6118954532290763e-06, + "loss": 0.0, + "step": 51270 + }, + { + "epoch": 11.138140747176369, + "grad_norm": 0.00020930106984451413, + "learning_rate": 3.602845351867941e-06, + "loss": 0.0, + "step": 51280 + }, + { + "epoch": 11.14031277150304, + "grad_norm": 0.0001999106170842424, + "learning_rate": 3.593795250506806e-06, + "loss": 0.0, + "step": 51290 + }, + { + "epoch": 11.142484795829713, + "grad_norm": 0.00020284131460357457, + "learning_rate": 3.5847451491456705e-06, + "loss": 0.0, + "step": 51300 + }, + { + "epoch": 11.144656820156385, + "grad_norm": 0.0002229460224043578, + "learning_rate": 3.575695047784535e-06, + "loss": 0.0, + "step": 51310 + }, + { + "epoch": 11.146828844483059, + "grad_norm": 0.00020142899302300066, + "learning_rate": 3.5666449464234005e-06, + "loss": 0.0, + "step": 51320 + }, + { + "epoch": 11.14900086880973, + "grad_norm": 0.00020130908524151891, + "learning_rate": 3.557594845062265e-06, + "loss": 0.0, + "step": 51330 + }, + { + "epoch": 11.151172893136403, + "grad_norm": 0.0002031168551184237, + "learning_rate": 3.548544743701129e-06, + "loss": 0.0, + "step": 51340 + }, + { + "epoch": 11.153344917463075, + "grad_norm": 0.00019986261031590402, + "learning_rate": 3.5394946423399944e-06, + "loss": 0.0047, + "step": 51350 + }, + { + "epoch": 11.155516941789749, + "grad_norm": 0.00020868683350272477, + "learning_rate": 3.530444540978859e-06, + "loss": 0.0, + "step": 51360 + }, + { + "epoch": 11.157688966116421, + "grad_norm": 0.00025968122645281255, + "learning_rate": 3.5213944396177235e-06, + "loss": 0.0, + "step": 51370 + }, + { + "epoch": 11.159860990443093, + "grad_norm": 0.00020654463151004165, + "learning_rate": 3.5123443382565887e-06, + "loss": 0.0, + "step": 51380 + }, + { + "epoch": 11.162033014769765, + "grad_norm": 0.00020101090194657445, + "learning_rate": 3.5032942368954534e-06, + "loss": 0.0094, + "step": 51390 + }, + { + "epoch": 11.164205039096437, + "grad_norm": 0.00020278354350011796, + "learning_rate": 3.4942441355343186e-06, + "loss": 0.0051, + "step": 51400 + }, + { + "epoch": 11.166377063423111, + "grad_norm": 0.00029431344592012465, + "learning_rate": 3.485194034173183e-06, + "loss": 0.0, + "step": 51410 + }, + { + "epoch": 11.168549087749783, + "grad_norm": 0.00020167112234048545, + "learning_rate": 3.4761439328120477e-06, + "loss": 0.0, + "step": 51420 + }, + { + "epoch": 11.170721112076455, + "grad_norm": 0.0002188723155995831, + "learning_rate": 3.467093831450913e-06, + "loss": 0.0, + "step": 51430 + }, + { + "epoch": 11.172893136403127, + "grad_norm": 0.00021310002193786204, + "learning_rate": 3.4580437300897772e-06, + "loss": 0.0043, + "step": 51440 + }, + { + "epoch": 11.175065160729801, + "grad_norm": 0.0002156758273486048, + "learning_rate": 3.4489936287286416e-06, + "loss": 0.0, + "step": 51450 + }, + { + "epoch": 11.177237185056473, + "grad_norm": 0.0002034068020293489, + "learning_rate": 3.4399435273675068e-06, + "loss": 0.0, + "step": 51460 + }, + { + "epoch": 11.179409209383145, + "grad_norm": 0.0002786774421110749, + "learning_rate": 3.4308934260063715e-06, + "loss": 0.0, + "step": 51470 + }, + { + "epoch": 11.181581233709817, + "grad_norm": 0.0002041921834461391, + "learning_rate": 3.421843324645236e-06, + "loss": 0.0, + "step": 51480 + }, + { + "epoch": 11.18375325803649, + "grad_norm": 0.000210064637940377, + "learning_rate": 3.412793223284101e-06, + "loss": 0.0, + "step": 51490 + }, + { + "epoch": 11.185925282363163, + "grad_norm": 0.00020386015239637345, + "learning_rate": 3.403743121922966e-06, + "loss": 0.0, + "step": 51500 + }, + { + "epoch": 11.188097306689835, + "grad_norm": 0.00020263722399249673, + "learning_rate": 3.39469302056183e-06, + "loss": 0.0, + "step": 51510 + }, + { + "epoch": 11.190269331016507, + "grad_norm": 0.00019954588788095862, + "learning_rate": 3.3856429192006953e-06, + "loss": 0.0087, + "step": 51520 + }, + { + "epoch": 11.19244135534318, + "grad_norm": 0.1446802169084549, + "learning_rate": 3.37659281783956e-06, + "loss": 0.0079, + "step": 51530 + }, + { + "epoch": 11.194613379669851, + "grad_norm": 0.0002020450192503631, + "learning_rate": 3.3675427164784244e-06, + "loss": 0.0, + "step": 51540 + }, + { + "epoch": 11.196785403996525, + "grad_norm": 0.00020075002976227552, + "learning_rate": 3.3584926151172896e-06, + "loss": 0.0, + "step": 51550 + }, + { + "epoch": 11.198957428323197, + "grad_norm": 0.00020184363529551774, + "learning_rate": 3.349442513756154e-06, + "loss": 0.0, + "step": 51560 + }, + { + "epoch": 11.20112945264987, + "grad_norm": 0.00020253408001735806, + "learning_rate": 3.3403924123950187e-06, + "loss": 0.0, + "step": 51570 + }, + { + "epoch": 11.203301476976542, + "grad_norm": 0.00020108975877519697, + "learning_rate": 3.331342311033884e-06, + "loss": 0.0, + "step": 51580 + }, + { + "epoch": 11.205473501303215, + "grad_norm": 0.00026143176364712417, + "learning_rate": 3.3222922096727482e-06, + "loss": 0.0, + "step": 51590 + }, + { + "epoch": 11.207645525629887, + "grad_norm": 0.00020975733059458435, + "learning_rate": 3.3132421083116134e-06, + "loss": 0.0, + "step": 51600 + }, + { + "epoch": 11.20981754995656, + "grad_norm": 0.000205664211534895, + "learning_rate": 3.304192006950478e-06, + "loss": 0.0, + "step": 51610 + }, + { + "epoch": 11.211989574283232, + "grad_norm": 0.00042388608562760055, + "learning_rate": 3.2951419055893425e-06, + "loss": 0.0, + "step": 51620 + }, + { + "epoch": 11.214161598609904, + "grad_norm": 0.00020332858548499644, + "learning_rate": 3.2860918042282077e-06, + "loss": 0.0, + "step": 51630 + }, + { + "epoch": 11.216333622936578, + "grad_norm": 0.00020449819567147642, + "learning_rate": 3.2770417028670725e-06, + "loss": 0.0, + "step": 51640 + }, + { + "epoch": 11.21850564726325, + "grad_norm": 0.000200850103283301, + "learning_rate": 3.267991601505937e-06, + "loss": 0.0, + "step": 51650 + }, + { + "epoch": 11.220677671589922, + "grad_norm": 0.0002570762299001217, + "learning_rate": 3.258941500144802e-06, + "loss": 0.0, + "step": 51660 + }, + { + "epoch": 11.222849695916594, + "grad_norm": 0.00020373582083266228, + "learning_rate": 3.2498913987836663e-06, + "loss": 0.0, + "step": 51670 + }, + { + "epoch": 11.225021720243266, + "grad_norm": 0.00020122008572798222, + "learning_rate": 3.240841297422531e-06, + "loss": 0.0046, + "step": 51680 + }, + { + "epoch": 11.22719374456994, + "grad_norm": 0.00019910384435206652, + "learning_rate": 3.2317911960613963e-06, + "loss": 0.0, + "step": 51690 + }, + { + "epoch": 11.229365768896612, + "grad_norm": 0.00020562649297062308, + "learning_rate": 3.2227410947002606e-06, + "loss": 0.0, + "step": 51700 + }, + { + "epoch": 11.231537793223284, + "grad_norm": 0.0002331691503059119, + "learning_rate": 3.2136909933391254e-06, + "loss": 0.0, + "step": 51710 + }, + { + "epoch": 11.233709817549956, + "grad_norm": 0.00020238434080965817, + "learning_rate": 3.2046408919779906e-06, + "loss": 0.005, + "step": 51720 + }, + { + "epoch": 11.23588184187663, + "grad_norm": 0.0001993612531805411, + "learning_rate": 3.195590790616855e-06, + "loss": 0.0046, + "step": 51730 + }, + { + "epoch": 11.238053866203302, + "grad_norm": 0.0001995191996684298, + "learning_rate": 3.1865406892557197e-06, + "loss": 0.0, + "step": 51740 + }, + { + "epoch": 11.240225890529974, + "grad_norm": 0.00020470732124522328, + "learning_rate": 3.177490587894585e-06, + "loss": 0.0, + "step": 51750 + }, + { + "epoch": 11.242397914856646, + "grad_norm": 0.00020886657875962555, + "learning_rate": 3.168440486533449e-06, + "loss": 0.0, + "step": 51760 + }, + { + "epoch": 11.244569939183318, + "grad_norm": 0.00020291624241508543, + "learning_rate": 3.1593903851723135e-06, + "loss": 0.0049, + "step": 51770 + }, + { + "epoch": 11.246741963509992, + "grad_norm": 0.0002016778162214905, + "learning_rate": 3.150340283811179e-06, + "loss": 0.0038, + "step": 51780 + }, + { + "epoch": 11.248913987836664, + "grad_norm": 0.0002016224607359618, + "learning_rate": 3.1412901824500435e-06, + "loss": 0.0, + "step": 51790 + }, + { + "epoch": 11.251086012163336, + "grad_norm": 0.00019841018365696073, + "learning_rate": 3.1322400810889087e-06, + "loss": 0.0, + "step": 51800 + }, + { + "epoch": 11.253258036490008, + "grad_norm": 0.00019952683942392468, + "learning_rate": 3.123189979727773e-06, + "loss": 0.0085, + "step": 51810 + }, + { + "epoch": 11.255430060816682, + "grad_norm": 0.00032277125865221024, + "learning_rate": 3.1141398783666378e-06, + "loss": 0.0079, + "step": 51820 + }, + { + "epoch": 11.257602085143354, + "grad_norm": 0.000201634771656245, + "learning_rate": 3.1050897770055025e-06, + "loss": 0.0035, + "step": 51830 + }, + { + "epoch": 11.259774109470026, + "grad_norm": 0.00019903185602743179, + "learning_rate": 3.0960396756443673e-06, + "loss": 0.0, + "step": 51840 + }, + { + "epoch": 11.261946133796698, + "grad_norm": 0.00020083566778339446, + "learning_rate": 3.0869895742832325e-06, + "loss": 0.0048, + "step": 51850 + }, + { + "epoch": 11.26411815812337, + "grad_norm": 0.00025526623358018696, + "learning_rate": 3.077939472922097e-06, + "loss": 0.0, + "step": 51860 + }, + { + "epoch": 11.266290182450044, + "grad_norm": 0.00020170937932562083, + "learning_rate": 3.0688893715609616e-06, + "loss": 0.0, + "step": 51870 + }, + { + "epoch": 11.268462206776716, + "grad_norm": 0.00020052251056768, + "learning_rate": 3.0598392701998264e-06, + "loss": 0.0041, + "step": 51880 + }, + { + "epoch": 11.270634231103388, + "grad_norm": 0.00019856641301885247, + "learning_rate": 3.050789168838691e-06, + "loss": 0.0, + "step": 51890 + }, + { + "epoch": 11.27280625543006, + "grad_norm": 0.00025934906443580985, + "learning_rate": 3.041739067477556e-06, + "loss": 0.0, + "step": 51900 + }, + { + "epoch": 11.274978279756734, + "grad_norm": 0.00019815000996459275, + "learning_rate": 3.0326889661164206e-06, + "loss": 0.0, + "step": 51910 + }, + { + "epoch": 11.277150304083406, + "grad_norm": 0.0002467527228873223, + "learning_rate": 3.0236388647552854e-06, + "loss": 0.0, + "step": 51920 + }, + { + "epoch": 11.279322328410078, + "grad_norm": 0.00019958475604653358, + "learning_rate": 3.01458876339415e-06, + "loss": 0.0, + "step": 51930 + }, + { + "epoch": 11.28149435273675, + "grad_norm": 0.00019774853717535734, + "learning_rate": 3.005538662033015e-06, + "loss": 0.0, + "step": 51940 + }, + { + "epoch": 11.283666377063422, + "grad_norm": 0.00019953559967689216, + "learning_rate": 2.9964885606718797e-06, + "loss": 0.0, + "step": 51950 + }, + { + "epoch": 11.285838401390096, + "grad_norm": 0.0002085790765704587, + "learning_rate": 2.9874384593107445e-06, + "loss": 0.0, + "step": 51960 + }, + { + "epoch": 11.288010425716768, + "grad_norm": 0.0002594150719232857, + "learning_rate": 2.9783883579496092e-06, + "loss": 0.0, + "step": 51970 + }, + { + "epoch": 11.29018245004344, + "grad_norm": 0.00026963651180267334, + "learning_rate": 2.969338256588474e-06, + "loss": 0.0, + "step": 51980 + }, + { + "epoch": 11.292354474370113, + "grad_norm": 0.00020867727289441973, + "learning_rate": 2.9602881552273387e-06, + "loss": 0.0, + "step": 51990 + }, + { + "epoch": 11.294526498696785, + "grad_norm": 0.00019796937704086304, + "learning_rate": 2.9512380538662035e-06, + "loss": 0.0, + "step": 52000 + }, + { + "epoch": 11.296698523023458, + "grad_norm": 0.000202857336262241, + "learning_rate": 2.9421879525050683e-06, + "loss": 0.0036, + "step": 52010 + }, + { + "epoch": 11.29887054735013, + "grad_norm": 0.0002166828780900687, + "learning_rate": 2.9331378511439326e-06, + "loss": 0.0, + "step": 52020 + }, + { + "epoch": 11.301042571676803, + "grad_norm": 0.00021046474284958094, + "learning_rate": 2.924087749782798e-06, + "loss": 0.0048, + "step": 52030 + }, + { + "epoch": 11.303214596003475, + "grad_norm": 0.00020024993864353746, + "learning_rate": 2.9150376484216626e-06, + "loss": 0.0, + "step": 52040 + }, + { + "epoch": 11.305386620330149, + "grad_norm": 0.00020179520652163774, + "learning_rate": 2.9059875470605273e-06, + "loss": 0.0047, + "step": 52050 + }, + { + "epoch": 11.30755864465682, + "grad_norm": 0.00020518811652436852, + "learning_rate": 2.896937445699392e-06, + "loss": 0.0, + "step": 52060 + }, + { + "epoch": 11.309730668983493, + "grad_norm": 0.0002528747427277267, + "learning_rate": 2.887887344338257e-06, + "loss": 0.0, + "step": 52070 + }, + { + "epoch": 11.311902693310165, + "grad_norm": 0.00019628154404927045, + "learning_rate": 2.8788372429771216e-06, + "loss": 0.0, + "step": 52080 + }, + { + "epoch": 11.314074717636837, + "grad_norm": 0.0002019262028625235, + "learning_rate": 2.869787141615986e-06, + "loss": 0.0, + "step": 52090 + }, + { + "epoch": 11.31624674196351, + "grad_norm": 0.00019746019097510725, + "learning_rate": 2.860737040254851e-06, + "loss": 0.0, + "step": 52100 + }, + { + "epoch": 11.318418766290183, + "grad_norm": 0.00020607651094906032, + "learning_rate": 2.851686938893716e-06, + "loss": 0.0, + "step": 52110 + }, + { + "epoch": 11.320590790616855, + "grad_norm": 0.0002728735562413931, + "learning_rate": 2.8426368375325802e-06, + "loss": 0.0, + "step": 52120 + }, + { + "epoch": 11.322762814943527, + "grad_norm": 0.00019636953948065639, + "learning_rate": 2.8335867361714454e-06, + "loss": 0.0, + "step": 52130 + }, + { + "epoch": 11.324934839270199, + "grad_norm": 0.00019914501172024757, + "learning_rate": 2.82453663481031e-06, + "loss": 0.0, + "step": 52140 + }, + { + "epoch": 11.327106863596873, + "grad_norm": 0.00020094211504328996, + "learning_rate": 2.815486533449175e-06, + "loss": 0.0, + "step": 52150 + }, + { + "epoch": 11.329278887923545, + "grad_norm": 0.00033041482674889266, + "learning_rate": 2.8064364320880393e-06, + "loss": 0.0, + "step": 52160 + }, + { + "epoch": 11.331450912250217, + "grad_norm": 0.0002014395868172869, + "learning_rate": 2.7973863307269045e-06, + "loss": 0.0, + "step": 52170 + }, + { + "epoch": 11.333622936576889, + "grad_norm": 0.00019805788178928196, + "learning_rate": 2.7883362293657692e-06, + "loss": 0.0046, + "step": 52180 + }, + { + "epoch": 11.335794960903563, + "grad_norm": 0.00020002457313239574, + "learning_rate": 2.7792861280046336e-06, + "loss": 0.0, + "step": 52190 + }, + { + "epoch": 11.337966985230235, + "grad_norm": 0.00019858147425111383, + "learning_rate": 2.7702360266434983e-06, + "loss": 0.0, + "step": 52200 + }, + { + "epoch": 11.340139009556907, + "grad_norm": 0.00019874947611242533, + "learning_rate": 2.7611859252823635e-06, + "loss": 0.0, + "step": 52210 + }, + { + "epoch": 11.342311033883579, + "grad_norm": 0.00019803144095931202, + "learning_rate": 2.752135823921228e-06, + "loss": 0.0, + "step": 52220 + }, + { + "epoch": 11.344483058210251, + "grad_norm": 0.00022093136794865131, + "learning_rate": 2.7430857225600926e-06, + "loss": 0.0, + "step": 52230 + }, + { + "epoch": 11.346655082536925, + "grad_norm": 0.00019645935390144587, + "learning_rate": 2.734035621198958e-06, + "loss": 0.0, + "step": 52240 + }, + { + "epoch": 11.348827106863597, + "grad_norm": 0.00020049404702149332, + "learning_rate": 2.7249855198378226e-06, + "loss": 0.0035, + "step": 52250 + }, + { + "epoch": 11.35099913119027, + "grad_norm": 0.00020869314903393388, + "learning_rate": 2.715935418476687e-06, + "loss": 0.0, + "step": 52260 + }, + { + "epoch": 11.353171155516941, + "grad_norm": 0.00019707180035766214, + "learning_rate": 2.7077903272516652e-06, + "loss": 0.0077, + "step": 52270 + }, + { + "epoch": 11.355343179843615, + "grad_norm": 0.0002042713458649814, + "learning_rate": 2.69874022589053e-06, + "loss": 0.0, + "step": 52280 + }, + { + "epoch": 11.357515204170287, + "grad_norm": 0.00019734865054488182, + "learning_rate": 2.6896901245293948e-06, + "loss": 0.0, + "step": 52290 + }, + { + "epoch": 11.35968722849696, + "grad_norm": 0.000252844620263204, + "learning_rate": 2.6806400231682595e-06, + "loss": 0.0, + "step": 52300 + }, + { + "epoch": 11.361859252823631, + "grad_norm": 0.00019945701933465898, + "learning_rate": 2.6715899218071243e-06, + "loss": 0.0, + "step": 52310 + }, + { + "epoch": 11.364031277150303, + "grad_norm": 0.00019766220066230744, + "learning_rate": 2.6625398204459895e-06, + "loss": 0.0, + "step": 52320 + }, + { + "epoch": 11.366203301476977, + "grad_norm": 0.0001986150018638, + "learning_rate": 2.653489719084854e-06, + "loss": 0.0, + "step": 52330 + }, + { + "epoch": 11.36837532580365, + "grad_norm": 0.00027733895694836974, + "learning_rate": 2.6444396177237186e-06, + "loss": 0.0, + "step": 52340 + }, + { + "epoch": 11.370547350130321, + "grad_norm": 0.00020426575792953372, + "learning_rate": 2.6353895163625833e-06, + "loss": 0.0, + "step": 52350 + }, + { + "epoch": 11.372719374456993, + "grad_norm": 0.0002020970277953893, + "learning_rate": 2.626339415001448e-06, + "loss": 0.0, + "step": 52360 + }, + { + "epoch": 11.374891398783667, + "grad_norm": 0.00019799098663497716, + "learning_rate": 2.617289313640313e-06, + "loss": 0.0, + "step": 52370 + }, + { + "epoch": 11.37706342311034, + "grad_norm": 0.00022170203737914562, + "learning_rate": 2.6082392122791776e-06, + "loss": 0.0, + "step": 52380 + }, + { + "epoch": 11.379235447437011, + "grad_norm": 0.0001970038574654609, + "learning_rate": 2.5991891109180424e-06, + "loss": 0.0, + "step": 52390 + }, + { + "epoch": 11.381407471763684, + "grad_norm": 0.0003018612042069435, + "learning_rate": 2.590139009556907e-06, + "loss": 0.0, + "step": 52400 + }, + { + "epoch": 11.383579496090356, + "grad_norm": 0.00019775984401348978, + "learning_rate": 2.581088908195772e-06, + "loss": 0.0049, + "step": 52410 + }, + { + "epoch": 11.38575152041703, + "grad_norm": 0.0001969892909983173, + "learning_rate": 2.5720388068346367e-06, + "loss": 0.0, + "step": 52420 + }, + { + "epoch": 11.387923544743701, + "grad_norm": 0.0002039795508608222, + "learning_rate": 2.5629887054735014e-06, + "loss": 0.0, + "step": 52430 + }, + { + "epoch": 11.390095569070374, + "grad_norm": 0.0001986090501304716, + "learning_rate": 2.553938604112366e-06, + "loss": 0.0, + "step": 52440 + }, + { + "epoch": 11.392267593397046, + "grad_norm": 0.00019856690778397024, + "learning_rate": 2.544888502751231e-06, + "loss": 0.0, + "step": 52450 + }, + { + "epoch": 11.394439617723718, + "grad_norm": 0.0001971104647964239, + "learning_rate": 2.5358384013900957e-06, + "loss": 0.0, + "step": 52460 + }, + { + "epoch": 11.396611642050392, + "grad_norm": 0.00019835439161397517, + "learning_rate": 2.5267883000289605e-06, + "loss": 0.0, + "step": 52470 + }, + { + "epoch": 11.398783666377064, + "grad_norm": 0.0002031942130997777, + "learning_rate": 2.5177381986678253e-06, + "loss": 0.0, + "step": 52480 + }, + { + "epoch": 11.400955690703736, + "grad_norm": 0.00019665226864162832, + "learning_rate": 2.5086880973066896e-06, + "loss": 0.0, + "step": 52490 + }, + { + "epoch": 11.403127715030408, + "grad_norm": 0.00019563220848795027, + "learning_rate": 2.4996379959455548e-06, + "loss": 0.0055, + "step": 52500 + }, + { + "epoch": 11.405299739357082, + "grad_norm": 0.00020207982743158937, + "learning_rate": 2.4905878945844195e-06, + "loss": 0.0046, + "step": 52510 + }, + { + "epoch": 11.407471763683754, + "grad_norm": 0.0002643620246089995, + "learning_rate": 2.4815377932232843e-06, + "loss": 0.0, + "step": 52520 + }, + { + "epoch": 11.409643788010426, + "grad_norm": 0.0002021700784098357, + "learning_rate": 2.472487691862149e-06, + "loss": 0.0, + "step": 52530 + }, + { + "epoch": 11.411815812337098, + "grad_norm": 0.00019614986376836896, + "learning_rate": 2.463437590501014e-06, + "loss": 0.0, + "step": 52540 + }, + { + "epoch": 11.41398783666377, + "grad_norm": 0.00019699233234860003, + "learning_rate": 2.4543874891398786e-06, + "loss": 0.0, + "step": 52550 + }, + { + "epoch": 11.416159860990444, + "grad_norm": 0.00019868268282152712, + "learning_rate": 2.445337387778743e-06, + "loss": 0.0, + "step": 52560 + }, + { + "epoch": 11.418331885317116, + "grad_norm": 0.0001990678283618763, + "learning_rate": 2.436287286417608e-06, + "loss": 0.0, + "step": 52570 + }, + { + "epoch": 11.420503909643788, + "grad_norm": 0.00019700905249919742, + "learning_rate": 2.427237185056473e-06, + "loss": 0.0, + "step": 52580 + }, + { + "epoch": 11.42267593397046, + "grad_norm": 0.00019700094708241522, + "learning_rate": 2.4181870836953372e-06, + "loss": 0.0045, + "step": 52590 + }, + { + "epoch": 11.424847958297132, + "grad_norm": 0.00019845672068186104, + "learning_rate": 2.4091369823342024e-06, + "loss": 0.0, + "step": 52600 + }, + { + "epoch": 11.427019982623806, + "grad_norm": 0.00019562583474908024, + "learning_rate": 2.400086880973067e-06, + "loss": 0.0, + "step": 52610 + }, + { + "epoch": 11.429192006950478, + "grad_norm": 0.0001981602981686592, + "learning_rate": 2.391036779611932e-06, + "loss": 0.0, + "step": 52620 + }, + { + "epoch": 11.43136403127715, + "grad_norm": 0.000247137708356604, + "learning_rate": 2.3819866782507963e-06, + "loss": 0.0048, + "step": 52630 + }, + { + "epoch": 11.433536055603822, + "grad_norm": 0.00019939042977057397, + "learning_rate": 2.3729365768896615e-06, + "loss": 0.0, + "step": 52640 + }, + { + "epoch": 11.435708079930496, + "grad_norm": 0.00020174359087832272, + "learning_rate": 2.3638864755285262e-06, + "loss": 0.0, + "step": 52650 + }, + { + "epoch": 11.437880104257168, + "grad_norm": 0.00020131657947786152, + "learning_rate": 2.3548363741673906e-06, + "loss": 0.0, + "step": 52660 + }, + { + "epoch": 11.44005212858384, + "grad_norm": 0.0001996621285798028, + "learning_rate": 2.3457862728062553e-06, + "loss": 0.0, + "step": 52670 + }, + { + "epoch": 11.442224152910512, + "grad_norm": 0.00019746148609556258, + "learning_rate": 2.3367361714451205e-06, + "loss": 0.0, + "step": 52680 + }, + { + "epoch": 11.444396177237184, + "grad_norm": 0.0001999850501306355, + "learning_rate": 2.327686070083985e-06, + "loss": 0.0, + "step": 52690 + }, + { + "epoch": 11.446568201563858, + "grad_norm": 0.0001960912486538291, + "learning_rate": 2.3186359687228496e-06, + "loss": 0.0, + "step": 52700 + }, + { + "epoch": 11.44874022589053, + "grad_norm": 0.00019612940377555788, + "learning_rate": 2.309585867361715e-06, + "loss": 0.0, + "step": 52710 + }, + { + "epoch": 11.450912250217202, + "grad_norm": 0.00020600967400241643, + "learning_rate": 2.3005357660005796e-06, + "loss": 0.0, + "step": 52720 + }, + { + "epoch": 11.453084274543874, + "grad_norm": 0.0001977920619538054, + "learning_rate": 2.291485664639444e-06, + "loss": 0.0, + "step": 52730 + }, + { + "epoch": 11.455256298870548, + "grad_norm": 0.0001973673061002046, + "learning_rate": 2.2824355632783087e-06, + "loss": 0.0, + "step": 52740 + }, + { + "epoch": 11.45742832319722, + "grad_norm": 0.00019590802548918873, + "learning_rate": 2.273385461917174e-06, + "loss": 0.0036, + "step": 52750 + }, + { + "epoch": 11.459600347523892, + "grad_norm": 0.0001969041331904009, + "learning_rate": 2.264335360556038e-06, + "loss": 0.0, + "step": 52760 + }, + { + "epoch": 11.461772371850564, + "grad_norm": 0.00019703819998539984, + "learning_rate": 2.255285259194903e-06, + "loss": 0.0, + "step": 52770 + }, + { + "epoch": 11.463944396177236, + "grad_norm": 0.0001995855272980407, + "learning_rate": 2.246235157833768e-06, + "loss": 0.0, + "step": 52780 + }, + { + "epoch": 11.46611642050391, + "grad_norm": 0.00020003956160508096, + "learning_rate": 2.2371850564726325e-06, + "loss": 0.0, + "step": 52790 + }, + { + "epoch": 11.468288444830582, + "grad_norm": 0.00020116106315981597, + "learning_rate": 2.2281349551114972e-06, + "loss": 0.0, + "step": 52800 + }, + { + "epoch": 11.470460469157254, + "grad_norm": 0.00020231454982422292, + "learning_rate": 2.219084853750362e-06, + "loss": 0.0, + "step": 52810 + }, + { + "epoch": 11.472632493483927, + "grad_norm": 0.0001936189946718514, + "learning_rate": 2.210034752389227e-06, + "loss": 0.0, + "step": 52820 + }, + { + "epoch": 11.474804517810599, + "grad_norm": 0.0001966677518794313, + "learning_rate": 2.2009846510280915e-06, + "loss": 0.0, + "step": 52830 + }, + { + "epoch": 11.476976542137272, + "grad_norm": 0.00019911628623958677, + "learning_rate": 2.1919345496669563e-06, + "loss": 0.0, + "step": 52840 + }, + { + "epoch": 11.479148566463945, + "grad_norm": 0.00019401231838855892, + "learning_rate": 2.182884448305821e-06, + "loss": 0.0, + "step": 52850 + }, + { + "epoch": 11.481320590790617, + "grad_norm": 0.00019698469259310514, + "learning_rate": 2.173834346944686e-06, + "loss": 0.0, + "step": 52860 + }, + { + "epoch": 11.483492615117289, + "grad_norm": 0.0001935689797392115, + "learning_rate": 2.1647842455835506e-06, + "loss": 0.0, + "step": 52870 + }, + { + "epoch": 11.485664639443963, + "grad_norm": 0.0008656844729557633, + "learning_rate": 2.1557341442224153e-06, + "loss": 0.0, + "step": 52880 + }, + { + "epoch": 11.487836663770635, + "grad_norm": 0.00019588657596614212, + "learning_rate": 2.14668404286128e-06, + "loss": 0.0, + "step": 52890 + }, + { + "epoch": 11.490008688097307, + "grad_norm": 0.00019638192316051573, + "learning_rate": 2.137633941500145e-06, + "loss": 0.0049, + "step": 52900 + }, + { + "epoch": 11.492180712423979, + "grad_norm": 0.00020129536278545856, + "learning_rate": 2.1285838401390096e-06, + "loss": 0.0, + "step": 52910 + }, + { + "epoch": 11.49435273675065, + "grad_norm": 0.00019692025671247393, + "learning_rate": 2.1195337387778744e-06, + "loss": 0.0, + "step": 52920 + }, + { + "epoch": 11.496524761077325, + "grad_norm": 0.00019469275139272213, + "learning_rate": 2.110483637416739e-06, + "loss": 0.0, + "step": 52930 + }, + { + "epoch": 11.498696785403997, + "grad_norm": 0.0002038736711256206, + "learning_rate": 2.101433536055604e-06, + "loss": 0.0, + "step": 52940 + }, + { + "epoch": 11.500868809730669, + "grad_norm": 0.00019812029495369643, + "learning_rate": 2.0923834346944687e-06, + "loss": 0.0054, + "step": 52950 + }, + { + "epoch": 11.503040834057341, + "grad_norm": 0.000194476917386055, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.0088, + "step": 52960 + }, + { + "epoch": 11.505212858384013, + "grad_norm": 0.0001952751917997375, + "learning_rate": 2.074283231972198e-06, + "loss": 0.0, + "step": 52970 + }, + { + "epoch": 11.507384882710687, + "grad_norm": 0.0001947148412000388, + "learning_rate": 2.065233130611063e-06, + "loss": 0.0, + "step": 52980 + }, + { + "epoch": 11.509556907037359, + "grad_norm": 0.00020383935770951211, + "learning_rate": 2.0561830292499277e-06, + "loss": 0.0043, + "step": 52990 + }, + { + "epoch": 11.511728931364031, + "grad_norm": 0.0002019301027758047, + "learning_rate": 2.0471329278887925e-06, + "loss": 0.0, + "step": 53000 + }, + { + "epoch": 11.513900955690703, + "grad_norm": 0.000210135942324996, + "learning_rate": 2.0380828265276572e-06, + "loss": 0.0, + "step": 53010 + }, + { + "epoch": 11.516072980017377, + "grad_norm": 0.00020146237511653453, + "learning_rate": 2.029032725166522e-06, + "loss": 0.0, + "step": 53020 + }, + { + "epoch": 11.518245004344049, + "grad_norm": 0.0004484684322960675, + "learning_rate": 2.0199826238053868e-06, + "loss": 0.0, + "step": 53030 + }, + { + "epoch": 11.520417028670721, + "grad_norm": 0.0001989894371945411, + "learning_rate": 2.0109325224442515e-06, + "loss": 0.0, + "step": 53040 + }, + { + "epoch": 11.522589052997393, + "grad_norm": 0.00019518462067935616, + "learning_rate": 2.0018824210831163e-06, + "loss": 0.0, + "step": 53050 + }, + { + "epoch": 11.524761077324065, + "grad_norm": 0.00031708358437754214, + "learning_rate": 1.992832319721981e-06, + "loss": 0.0, + "step": 53060 + }, + { + "epoch": 11.526933101650739, + "grad_norm": 0.00019694925867952406, + "learning_rate": 1.983782218360846e-06, + "loss": 0.0, + "step": 53070 + }, + { + "epoch": 11.529105125977411, + "grad_norm": 0.00019491143757477403, + "learning_rate": 1.9747321169997106e-06, + "loss": 0.0, + "step": 53080 + }, + { + "epoch": 11.531277150304083, + "grad_norm": 0.00019578862702473998, + "learning_rate": 1.965682015638575e-06, + "loss": 0.0, + "step": 53090 + }, + { + "epoch": 11.533449174630755, + "grad_norm": 0.0002560818975325674, + "learning_rate": 1.95663191427744e-06, + "loss": 0.0, + "step": 53100 + }, + { + "epoch": 11.535621198957429, + "grad_norm": 0.00019754750246647745, + "learning_rate": 1.947581812916305e-06, + "loss": 0.0, + "step": 53110 + }, + { + "epoch": 11.537793223284101, + "grad_norm": 0.00020018761279061437, + "learning_rate": 1.9385317115551696e-06, + "loss": 0.0, + "step": 53120 + }, + { + "epoch": 11.539965247610773, + "grad_norm": 0.00019547737610992044, + "learning_rate": 1.9294816101940344e-06, + "loss": 0.0, + "step": 53130 + }, + { + "epoch": 11.542137271937445, + "grad_norm": 0.00019425964273978025, + "learning_rate": 1.920431508832899e-06, + "loss": 0.0, + "step": 53140 + }, + { + "epoch": 11.544309296264117, + "grad_norm": 0.00020175384997855872, + "learning_rate": 1.911381407471764e-06, + "loss": 0.0, + "step": 53150 + }, + { + "epoch": 11.546481320590791, + "grad_norm": 0.00025675594224594533, + "learning_rate": 1.9023313061106285e-06, + "loss": 0.0, + "step": 53160 + }, + { + "epoch": 11.548653344917463, + "grad_norm": 0.0001961501402547583, + "learning_rate": 1.8932812047494932e-06, + "loss": 0.0, + "step": 53170 + }, + { + "epoch": 11.550825369244135, + "grad_norm": 0.0002024386340053752, + "learning_rate": 1.8842311033883582e-06, + "loss": 0.0, + "step": 53180 + }, + { + "epoch": 11.552997393570807, + "grad_norm": 0.0002116846153512597, + "learning_rate": 1.8751810020272225e-06, + "loss": 0.0, + "step": 53190 + }, + { + "epoch": 11.555169417897481, + "grad_norm": 0.00024865844170562923, + "learning_rate": 1.8661309006660875e-06, + "loss": 0.0, + "step": 53200 + }, + { + "epoch": 11.557341442224153, + "grad_norm": 0.00020497996592894197, + "learning_rate": 1.8570807993049523e-06, + "loss": 0.0, + "step": 53210 + }, + { + "epoch": 11.559513466550825, + "grad_norm": 0.0003205514221917838, + "learning_rate": 1.8480306979438173e-06, + "loss": 0.0043, + "step": 53220 + }, + { + "epoch": 11.561685490877498, + "grad_norm": 0.00035617026151157916, + "learning_rate": 1.8389805965826818e-06, + "loss": 0.0, + "step": 53230 + }, + { + "epoch": 11.56385751520417, + "grad_norm": 0.00019723277364391834, + "learning_rate": 1.8299304952215466e-06, + "loss": 0.0, + "step": 53240 + }, + { + "epoch": 11.566029539530843, + "grad_norm": 0.00019615769269876182, + "learning_rate": 1.8208803938604115e-06, + "loss": 0.0, + "step": 53250 + }, + { + "epoch": 11.568201563857516, + "grad_norm": 0.00026189288473688066, + "learning_rate": 1.8118302924992759e-06, + "loss": 0.0052, + "step": 53260 + }, + { + "epoch": 11.570373588184188, + "grad_norm": 0.0003638887428678572, + "learning_rate": 1.8027801911381409e-06, + "loss": 0.0, + "step": 53270 + }, + { + "epoch": 11.57254561251086, + "grad_norm": 0.0002062423445750028, + "learning_rate": 1.7937300897770056e-06, + "loss": 0.0, + "step": 53280 + }, + { + "epoch": 11.574717636837534, + "grad_norm": 0.0001934427273226902, + "learning_rate": 1.7846799884158702e-06, + "loss": 0.0, + "step": 53290 + }, + { + "epoch": 11.576889661164206, + "grad_norm": 0.00019680126570165157, + "learning_rate": 1.7756298870547351e-06, + "loss": 0.0, + "step": 53300 + }, + { + "epoch": 11.579061685490878, + "grad_norm": 0.0001950986625161022, + "learning_rate": 1.7665797856936e-06, + "loss": 0.0, + "step": 53310 + }, + { + "epoch": 11.58123370981755, + "grad_norm": 0.14624013006687164, + "learning_rate": 1.7575296843324647e-06, + "loss": 0.009, + "step": 53320 + }, + { + "epoch": 11.583405734144222, + "grad_norm": 0.0001986539427889511, + "learning_rate": 1.7484795829713292e-06, + "loss": 0.0, + "step": 53330 + }, + { + "epoch": 11.585577758470896, + "grad_norm": 0.00019793520914390683, + "learning_rate": 1.7394294816101942e-06, + "loss": 0.0, + "step": 53340 + }, + { + "epoch": 11.587749782797568, + "grad_norm": 0.00019655383948702365, + "learning_rate": 1.730379380249059e-06, + "loss": 0.0, + "step": 53350 + }, + { + "epoch": 11.58992180712424, + "grad_norm": 0.00019772254745475948, + "learning_rate": 1.7213292788879235e-06, + "loss": 0.0039, + "step": 53360 + }, + { + "epoch": 11.592093831450912, + "grad_norm": 0.00019759469432756305, + "learning_rate": 1.7122791775267883e-06, + "loss": 0.0, + "step": 53370 + }, + { + "epoch": 11.594265855777584, + "grad_norm": 0.0001963729882845655, + "learning_rate": 1.7032290761656532e-06, + "loss": 0.0, + "step": 53380 + }, + { + "epoch": 11.596437880104258, + "grad_norm": 0.00021433483925648034, + "learning_rate": 1.6941789748045178e-06, + "loss": 0.0, + "step": 53390 + }, + { + "epoch": 11.59860990443093, + "grad_norm": 0.0002027168811764568, + "learning_rate": 1.6851288734433826e-06, + "loss": 0.0047, + "step": 53400 + }, + { + "epoch": 11.600781928757602, + "grad_norm": 0.00019676884403452277, + "learning_rate": 1.6760787720822475e-06, + "loss": 0.0, + "step": 53410 + }, + { + "epoch": 11.602953953084274, + "grad_norm": 0.0002005890419241041, + "learning_rate": 1.6670286707211123e-06, + "loss": 0.0, + "step": 53420 + }, + { + "epoch": 11.605125977410946, + "grad_norm": 0.00019666865409817547, + "learning_rate": 1.6579785693599769e-06, + "loss": 0.0, + "step": 53430 + }, + { + "epoch": 11.60729800173762, + "grad_norm": 0.00019894151773769408, + "learning_rate": 1.6489284679988416e-06, + "loss": 0.0, + "step": 53440 + }, + { + "epoch": 11.609470026064292, + "grad_norm": 0.0002711409470066428, + "learning_rate": 1.6398783666377066e-06, + "loss": 0.0, + "step": 53450 + }, + { + "epoch": 11.611642050390964, + "grad_norm": 0.00020063733973074704, + "learning_rate": 1.6308282652765711e-06, + "loss": 0.0, + "step": 53460 + }, + { + "epoch": 11.613814074717636, + "grad_norm": 0.00024653473519720137, + "learning_rate": 1.621778163915436e-06, + "loss": 0.0, + "step": 53470 + }, + { + "epoch": 11.61598609904431, + "grad_norm": 0.0002731581625994295, + "learning_rate": 1.6127280625543009e-06, + "loss": 0.0, + "step": 53480 + }, + { + "epoch": 11.618158123370982, + "grad_norm": 0.00021144855418242514, + "learning_rate": 1.6036779611931652e-06, + "loss": 0.0, + "step": 53490 + }, + { + "epoch": 11.620330147697654, + "grad_norm": 0.0002458023955114186, + "learning_rate": 1.5946278598320302e-06, + "loss": 0.0, + "step": 53500 + }, + { + "epoch": 11.622502172024326, + "grad_norm": 0.00019741586584132165, + "learning_rate": 1.585577758470895e-06, + "loss": 0.0, + "step": 53510 + }, + { + "epoch": 11.624674196350998, + "grad_norm": 0.00020069196762051433, + "learning_rate": 1.57652765710976e-06, + "loss": 0.0, + "step": 53520 + }, + { + "epoch": 11.626846220677672, + "grad_norm": 0.00023551438061986119, + "learning_rate": 1.5674775557486245e-06, + "loss": 0.0, + "step": 53530 + }, + { + "epoch": 11.629018245004344, + "grad_norm": 0.00019561080262064934, + "learning_rate": 1.5584274543874892e-06, + "loss": 0.0, + "step": 53540 + }, + { + "epoch": 11.631190269331016, + "grad_norm": 0.0001954359613591805, + "learning_rate": 1.549377353026354e-06, + "loss": 0.0, + "step": 53550 + }, + { + "epoch": 11.633362293657688, + "grad_norm": 0.00019509869161993265, + "learning_rate": 1.5403272516652188e-06, + "loss": 0.0, + "step": 53560 + }, + { + "epoch": 11.635534317984362, + "grad_norm": 0.0003004560712724924, + "learning_rate": 1.5312771503040835e-06, + "loss": 0.0, + "step": 53570 + }, + { + "epoch": 11.637706342311034, + "grad_norm": 0.0001922779920278117, + "learning_rate": 1.522227048942948e-06, + "loss": 0.0, + "step": 53580 + }, + { + "epoch": 11.639878366637706, + "grad_norm": 0.0001934824831550941, + "learning_rate": 1.513176947581813e-06, + "loss": 0.0045, + "step": 53590 + }, + { + "epoch": 11.642050390964378, + "grad_norm": 0.00019457947928458452, + "learning_rate": 1.5041268462206776e-06, + "loss": 0.0, + "step": 53600 + }, + { + "epoch": 11.64422241529105, + "grad_norm": 0.0001975044870050624, + "learning_rate": 1.4950767448595426e-06, + "loss": 0.0, + "step": 53610 + }, + { + "epoch": 11.646394439617724, + "grad_norm": 0.00019748661725316197, + "learning_rate": 1.4860266434984073e-06, + "loss": 0.0, + "step": 53620 + }, + { + "epoch": 11.648566463944396, + "grad_norm": 0.0003096135624218732, + "learning_rate": 1.4769765421372719e-06, + "loss": 0.0, + "step": 53630 + }, + { + "epoch": 11.650738488271069, + "grad_norm": 0.00019721903663594276, + "learning_rate": 1.4679264407761369e-06, + "loss": 0.0, + "step": 53640 + }, + { + "epoch": 11.65291051259774, + "grad_norm": 0.00019730576605070382, + "learning_rate": 1.4588763394150014e-06, + "loss": 0.0, + "step": 53650 + }, + { + "epoch": 11.655082536924414, + "grad_norm": 0.00019422029436100274, + "learning_rate": 1.4498262380538664e-06, + "loss": 0.0, + "step": 53660 + }, + { + "epoch": 11.657254561251086, + "grad_norm": 0.00019854224228765815, + "learning_rate": 1.440776136692731e-06, + "loss": 0.0, + "step": 53670 + }, + { + "epoch": 11.659426585577759, + "grad_norm": 0.000195562926819548, + "learning_rate": 1.4317260353315957e-06, + "loss": 0.0, + "step": 53680 + }, + { + "epoch": 11.66159860990443, + "grad_norm": 0.00019524309027474374, + "learning_rate": 1.4226759339704605e-06, + "loss": 0.0051, + "step": 53690 + }, + { + "epoch": 11.663770634231103, + "grad_norm": 0.0001945122639881447, + "learning_rate": 1.4136258326093252e-06, + "loss": 0.0, + "step": 53700 + }, + { + "epoch": 11.665942658557777, + "grad_norm": 0.00041479626088403165, + "learning_rate": 1.4045757312481902e-06, + "loss": 0.0, + "step": 53710 + }, + { + "epoch": 11.668114682884449, + "grad_norm": 0.00019896173034794629, + "learning_rate": 1.3955256298870548e-06, + "loss": 0.0, + "step": 53720 + }, + { + "epoch": 11.67028670721112, + "grad_norm": 0.00019946540123783052, + "learning_rate": 1.3864755285259195e-06, + "loss": 0.0, + "step": 53730 + }, + { + "epoch": 11.672458731537793, + "grad_norm": 0.00019381535821594298, + "learning_rate": 1.3774254271647843e-06, + "loss": 0.0048, + "step": 53740 + }, + { + "epoch": 11.674630755864465, + "grad_norm": 0.0001966750860447064, + "learning_rate": 1.368375325803649e-06, + "loss": 0.0, + "step": 53750 + }, + { + "epoch": 11.676802780191139, + "grad_norm": 0.0001956068881554529, + "learning_rate": 1.3593252244425138e-06, + "loss": 0.0, + "step": 53760 + }, + { + "epoch": 11.67897480451781, + "grad_norm": 0.0001950589648913592, + "learning_rate": 1.3502751230813786e-06, + "loss": 0.0, + "step": 53770 + }, + { + "epoch": 11.681146828844483, + "grad_norm": 0.00019512952712830156, + "learning_rate": 1.3412250217202433e-06, + "loss": 0.0093, + "step": 53780 + }, + { + "epoch": 11.683318853171155, + "grad_norm": 0.00019810015510302037, + "learning_rate": 1.332174920359108e-06, + "loss": 0.0, + "step": 53790 + }, + { + "epoch": 11.685490877497829, + "grad_norm": 0.00019662882550619543, + "learning_rate": 1.3231248189979729e-06, + "loss": 0.0045, + "step": 53800 + }, + { + "epoch": 11.6876629018245, + "grad_norm": 0.0001998993247980252, + "learning_rate": 1.3140747176368376e-06, + "loss": 0.0, + "step": 53810 + }, + { + "epoch": 11.689834926151173, + "grad_norm": 0.0001943770475918427, + "learning_rate": 1.3050246162757024e-06, + "loss": 0.0, + "step": 53820 + }, + { + "epoch": 11.692006950477845, + "grad_norm": 0.0002038269303739071, + "learning_rate": 1.295974514914567e-06, + "loss": 0.0, + "step": 53830 + }, + { + "epoch": 11.694178974804517, + "grad_norm": 0.00019428586529102176, + "learning_rate": 1.286924413553432e-06, + "loss": 0.0, + "step": 53840 + }, + { + "epoch": 11.696350999131191, + "grad_norm": 0.00023232153034768999, + "learning_rate": 1.2778743121922967e-06, + "loss": 0.0, + "step": 53850 + }, + { + "epoch": 11.698523023457863, + "grad_norm": 0.00020070855680387467, + "learning_rate": 1.2688242108311614e-06, + "loss": 0.0, + "step": 53860 + }, + { + "epoch": 11.700695047784535, + "grad_norm": 0.0002596253762021661, + "learning_rate": 1.2597741094700262e-06, + "loss": 0.0, + "step": 53870 + }, + { + "epoch": 11.702867072111207, + "grad_norm": 0.00019496695313137025, + "learning_rate": 1.2507240081088907e-06, + "loss": 0.0, + "step": 53880 + }, + { + "epoch": 11.70503909643788, + "grad_norm": 0.00021064665634185076, + "learning_rate": 1.2416739067477557e-06, + "loss": 0.0, + "step": 53890 + }, + { + "epoch": 11.707211120764553, + "grad_norm": 0.00019595645426306874, + "learning_rate": 1.2326238053866203e-06, + "loss": 0.0, + "step": 53900 + }, + { + "epoch": 11.709383145091225, + "grad_norm": 0.0001996116479858756, + "learning_rate": 1.2235737040254852e-06, + "loss": 0.0, + "step": 53910 + }, + { + "epoch": 11.711555169417897, + "grad_norm": 0.0001989005832001567, + "learning_rate": 1.2145236026643498e-06, + "loss": 0.0, + "step": 53920 + }, + { + "epoch": 11.71372719374457, + "grad_norm": 0.0001964517723536119, + "learning_rate": 1.2054735013032146e-06, + "loss": 0.004, + "step": 53930 + }, + { + "epoch": 11.715899218071243, + "grad_norm": 0.00024799612583592534, + "learning_rate": 1.1964233999420795e-06, + "loss": 0.0036, + "step": 53940 + }, + { + "epoch": 11.718071242397915, + "grad_norm": 0.00019944515952374786, + "learning_rate": 1.187373298580944e-06, + "loss": 0.0, + "step": 53950 + }, + { + "epoch": 11.720243266724587, + "grad_norm": 0.00020389580458868295, + "learning_rate": 1.178323197219809e-06, + "loss": 0.0042, + "step": 53960 + }, + { + "epoch": 11.72241529105126, + "grad_norm": 0.00019486738892737776, + "learning_rate": 1.1692730958586736e-06, + "loss": 0.0, + "step": 53970 + }, + { + "epoch": 11.724587315377931, + "grad_norm": 0.00019491862622089684, + "learning_rate": 1.1602229944975384e-06, + "loss": 0.0046, + "step": 53980 + }, + { + "epoch": 11.726759339704605, + "grad_norm": 0.00019389142107684165, + "learning_rate": 1.1511728931364031e-06, + "loss": 0.0, + "step": 53990 + }, + { + "epoch": 11.728931364031277, + "grad_norm": 0.00019837978470604867, + "learning_rate": 1.142122791775268e-06, + "loss": 0.0, + "step": 54000 + }, + { + "epoch": 11.73110338835795, + "grad_norm": 0.00021431567438412458, + "learning_rate": 1.1330726904141327e-06, + "loss": 0.0, + "step": 54010 + }, + { + "epoch": 11.733275412684621, + "grad_norm": 0.00024961764574982226, + "learning_rate": 1.1240225890529974e-06, + "loss": 0.0, + "step": 54020 + }, + { + "epoch": 11.735447437011295, + "grad_norm": 0.00020189674978610128, + "learning_rate": 1.1149724876918622e-06, + "loss": 0.0, + "step": 54030 + }, + { + "epoch": 11.737619461337967, + "grad_norm": 0.000198492401978001, + "learning_rate": 1.105922386330727e-06, + "loss": 0.0, + "step": 54040 + }, + { + "epoch": 11.73979148566464, + "grad_norm": 0.00026981427799910307, + "learning_rate": 1.0968722849695917e-06, + "loss": 0.0, + "step": 54050 + }, + { + "epoch": 11.741963509991312, + "grad_norm": 0.002462733769789338, + "learning_rate": 1.0878221836084565e-06, + "loss": 0.0, + "step": 54060 + }, + { + "epoch": 11.744135534317984, + "grad_norm": 0.0001970611629076302, + "learning_rate": 1.0787720822473212e-06, + "loss": 0.0, + "step": 54070 + }, + { + "epoch": 11.746307558644657, + "grad_norm": 0.0001956707565113902, + "learning_rate": 1.069721980886186e-06, + "loss": 0.0, + "step": 54080 + }, + { + "epoch": 11.74847958297133, + "grad_norm": 0.00031365029281005263, + "learning_rate": 1.0606718795250508e-06, + "loss": 0.0, + "step": 54090 + }, + { + "epoch": 11.750651607298002, + "grad_norm": 0.00019289724878035486, + "learning_rate": 1.0516217781639155e-06, + "loss": 0.0, + "step": 54100 + }, + { + "epoch": 11.752823631624674, + "grad_norm": 0.00020483179832808673, + "learning_rate": 1.0425716768027803e-06, + "loss": 0.0055, + "step": 54110 + }, + { + "epoch": 11.754995655951348, + "grad_norm": 0.00019535243336576968, + "learning_rate": 1.033521575441645e-06, + "loss": 0.0051, + "step": 54120 + }, + { + "epoch": 11.75716768027802, + "grad_norm": 0.00019420160970184952, + "learning_rate": 1.0244714740805096e-06, + "loss": 0.0, + "step": 54130 + }, + { + "epoch": 11.759339704604692, + "grad_norm": 0.0002710081171244383, + "learning_rate": 1.0154213727193746e-06, + "loss": 0.0, + "step": 54140 + }, + { + "epoch": 11.761511728931364, + "grad_norm": 0.00019317958503961563, + "learning_rate": 1.0063712713582393e-06, + "loss": 0.0049, + "step": 54150 + }, + { + "epoch": 11.763683753258036, + "grad_norm": 0.0001933265448315069, + "learning_rate": 9.97321169997104e-07, + "loss": 0.0, + "step": 54160 + }, + { + "epoch": 11.76585577758471, + "grad_norm": 0.00019155530026182532, + "learning_rate": 9.882710686359689e-07, + "loss": 0.0, + "step": 54170 + }, + { + "epoch": 11.768027801911382, + "grad_norm": 0.0002440862444927916, + "learning_rate": 9.792209672748334e-07, + "loss": 0.0, + "step": 54180 + }, + { + "epoch": 11.770199826238054, + "grad_norm": 0.00020259429584257305, + "learning_rate": 9.701708659136984e-07, + "loss": 0.0, + "step": 54190 + }, + { + "epoch": 11.772371850564726, + "grad_norm": 0.00019361911108717322, + "learning_rate": 9.61120764552563e-07, + "loss": 0.0, + "step": 54200 + }, + { + "epoch": 11.774543874891398, + "grad_norm": 0.00019974037422798574, + "learning_rate": 9.520706631914279e-07, + "loss": 0.0053, + "step": 54210 + }, + { + "epoch": 11.776715899218072, + "grad_norm": 0.0001930526486830786, + "learning_rate": 9.430205618302926e-07, + "loss": 0.0, + "step": 54220 + }, + { + "epoch": 11.778887923544744, + "grad_norm": 0.00019227658049203455, + "learning_rate": 9.339704604691572e-07, + "loss": 0.0048, + "step": 54230 + }, + { + "epoch": 11.781059947871416, + "grad_norm": 0.00019346507906448096, + "learning_rate": 9.249203591080221e-07, + "loss": 0.0, + "step": 54240 + }, + { + "epoch": 11.783231972198088, + "grad_norm": 0.0002069149340968579, + "learning_rate": 9.158702577468867e-07, + "loss": 0.0046, + "step": 54250 + }, + { + "epoch": 11.785403996524762, + "grad_norm": 0.00019784543837886304, + "learning_rate": 9.068201563857516e-07, + "loss": 0.0, + "step": 54260 + }, + { + "epoch": 11.787576020851434, + "grad_norm": 0.0001948605931829661, + "learning_rate": 8.986750651607298e-07, + "loss": 0.0, + "step": 54270 + }, + { + "epoch": 11.789748045178106, + "grad_norm": 0.00019112876907456666, + "learning_rate": 8.896249637995947e-07, + "loss": 0.0, + "step": 54280 + }, + { + "epoch": 11.791920069504778, + "grad_norm": 0.00019701290875673294, + "learning_rate": 8.805748624384594e-07, + "loss": 0.0, + "step": 54290 + }, + { + "epoch": 11.79409209383145, + "grad_norm": 0.00027486091130413115, + "learning_rate": 8.71524761077324e-07, + "loss": 0.0, + "step": 54300 + }, + { + "epoch": 11.796264118158124, + "grad_norm": 0.0002852969046216458, + "learning_rate": 8.624746597161889e-07, + "loss": 0.0, + "step": 54310 + }, + { + "epoch": 11.798436142484796, + "grad_norm": 0.00019162050739396363, + "learning_rate": 8.534245583550535e-07, + "loss": 0.0, + "step": 54320 + }, + { + "epoch": 11.800608166811468, + "grad_norm": 0.0002552253135945648, + "learning_rate": 8.443744569939184e-07, + "loss": 0.0043, + "step": 54330 + }, + { + "epoch": 11.80278019113814, + "grad_norm": 0.00023266920470632613, + "learning_rate": 8.353243556327831e-07, + "loss": 0.0, + "step": 54340 + }, + { + "epoch": 11.804952215464812, + "grad_norm": 0.00019286252791061997, + "learning_rate": 8.262742542716478e-07, + "loss": 0.0, + "step": 54350 + }, + { + "epoch": 11.807124239791486, + "grad_norm": 0.00029586063465103507, + "learning_rate": 8.172241529105127e-07, + "loss": 0.0, + "step": 54360 + }, + { + "epoch": 11.809296264118158, + "grad_norm": 0.00019190393504686654, + "learning_rate": 8.081740515493774e-07, + "loss": 0.0, + "step": 54370 + }, + { + "epoch": 11.81146828844483, + "grad_norm": 0.00025092356372624636, + "learning_rate": 7.991239501882422e-07, + "loss": 0.0, + "step": 54380 + }, + { + "epoch": 11.813640312771502, + "grad_norm": 0.00019765045726671815, + "learning_rate": 7.900738488271069e-07, + "loss": 0.0, + "step": 54390 + }, + { + "epoch": 11.815812337098176, + "grad_norm": 0.00019178666116204113, + "learning_rate": 7.810237474659716e-07, + "loss": 0.0087, + "step": 54400 + }, + { + "epoch": 11.817984361424848, + "grad_norm": 0.0002000819513341412, + "learning_rate": 7.719736461048364e-07, + "loss": 0.0, + "step": 54410 + }, + { + "epoch": 11.82015638575152, + "grad_norm": 0.00020233175018802285, + "learning_rate": 7.629235447437012e-07, + "loss": 0.0, + "step": 54420 + }, + { + "epoch": 11.822328410078192, + "grad_norm": 0.00019419100135564804, + "learning_rate": 7.538734433825659e-07, + "loss": 0.0, + "step": 54430 + }, + { + "epoch": 11.824500434404865, + "grad_norm": 0.00020138765103183687, + "learning_rate": 7.448233420214307e-07, + "loss": 0.0, + "step": 54440 + }, + { + "epoch": 11.826672458731538, + "grad_norm": 0.00019825338677037507, + "learning_rate": 7.357732406602955e-07, + "loss": 0.0, + "step": 54450 + }, + { + "epoch": 11.82884448305821, + "grad_norm": 0.0001928244309965521, + "learning_rate": 7.267231392991602e-07, + "loss": 0.0, + "step": 54460 + }, + { + "epoch": 11.831016507384883, + "grad_norm": 0.00019557155610527843, + "learning_rate": 7.176730379380249e-07, + "loss": 0.0, + "step": 54470 + }, + { + "epoch": 11.833188531711555, + "grad_norm": 0.00019326162873767316, + "learning_rate": 7.086229365768896e-07, + "loss": 0.0, + "step": 54480 + }, + { + "epoch": 11.835360556038228, + "grad_norm": 0.00019357928249519318, + "learning_rate": 6.995728352157544e-07, + "loss": 0.0, + "step": 54490 + }, + { + "epoch": 11.8375325803649, + "grad_norm": 0.00025056168669834733, + "learning_rate": 6.905227338546192e-07, + "loss": 0.0, + "step": 54500 + }, + { + "epoch": 11.839704604691573, + "grad_norm": 0.1530565619468689, + "learning_rate": 6.81472632493484e-07, + "loss": 0.004, + "step": 54510 + }, + { + "epoch": 11.841876629018245, + "grad_norm": 0.0001981578243430704, + "learning_rate": 6.724225311323487e-07, + "loss": 0.0, + "step": 54520 + }, + { + "epoch": 11.844048653344917, + "grad_norm": 0.0001922385417856276, + "learning_rate": 6.633724297712135e-07, + "loss": 0.0, + "step": 54530 + }, + { + "epoch": 11.84622067767159, + "grad_norm": 0.00022527927649207413, + "learning_rate": 6.543223284100782e-07, + "loss": 0.0, + "step": 54540 + }, + { + "epoch": 11.848392701998263, + "grad_norm": 0.00019796429842244834, + "learning_rate": 6.45272227048943e-07, + "loss": 0.0, + "step": 54550 + }, + { + "epoch": 11.850564726324935, + "grad_norm": 0.00025319092674180865, + "learning_rate": 6.362221256878077e-07, + "loss": 0.0, + "step": 54560 + }, + { + "epoch": 11.852736750651607, + "grad_norm": 0.0001936436165124178, + "learning_rate": 6.271720243266724e-07, + "loss": 0.0044, + "step": 54570 + }, + { + "epoch": 11.85490877497828, + "grad_norm": 0.0001932820159709081, + "learning_rate": 6.181219229655373e-07, + "loss": 0.0, + "step": 54580 + }, + { + "epoch": 11.857080799304953, + "grad_norm": 0.00019361200975254178, + "learning_rate": 6.09071821604402e-07, + "loss": 0.0, + "step": 54590 + }, + { + "epoch": 11.859252823631625, + "grad_norm": 0.00020959909306839108, + "learning_rate": 6.000217202432668e-07, + "loss": 0.0, + "step": 54600 + }, + { + "epoch": 11.861424847958297, + "grad_norm": 0.00019550872093532234, + "learning_rate": 5.909716188821316e-07, + "loss": 0.0, + "step": 54610 + }, + { + "epoch": 11.863596872284969, + "grad_norm": 0.0002506279561202973, + "learning_rate": 5.819215175209962e-07, + "loss": 0.0045, + "step": 54620 + }, + { + "epoch": 11.865768896611643, + "grad_norm": 0.0002453475899528712, + "learning_rate": 5.72871416159861e-07, + "loss": 0.0, + "step": 54630 + }, + { + "epoch": 11.867940920938315, + "grad_norm": 0.00019125892140436918, + "learning_rate": 5.638213147987257e-07, + "loss": 0.0, + "step": 54640 + }, + { + "epoch": 11.870112945264987, + "grad_norm": 0.00019863221677951515, + "learning_rate": 5.547712134375905e-07, + "loss": 0.0, + "step": 54650 + }, + { + "epoch": 11.872284969591659, + "grad_norm": 0.00019293044169899076, + "learning_rate": 5.457211120764553e-07, + "loss": 0.0, + "step": 54660 + }, + { + "epoch": 11.874456993918331, + "grad_norm": 0.00019518301996868104, + "learning_rate": 5.3667101071532e-07, + "loss": 0.0, + "step": 54670 + }, + { + "epoch": 11.876629018245005, + "grad_norm": 0.00019262121350038797, + "learning_rate": 5.276209093541848e-07, + "loss": 0.0048, + "step": 54680 + }, + { + "epoch": 11.878801042571677, + "grad_norm": 0.00019273081852588803, + "learning_rate": 5.185708079930495e-07, + "loss": 0.0, + "step": 54690 + }, + { + "epoch": 11.880973066898349, + "grad_norm": 0.0001927847770275548, + "learning_rate": 5.095207066319143e-07, + "loss": 0.005, + "step": 54700 + }, + { + "epoch": 11.883145091225021, + "grad_norm": 0.0001937558117788285, + "learning_rate": 5.004706052707791e-07, + "loss": 0.0, + "step": 54710 + }, + { + "epoch": 11.885317115551695, + "grad_norm": 0.00019255632651038468, + "learning_rate": 4.914205039096437e-07, + "loss": 0.0, + "step": 54720 + }, + { + "epoch": 11.887489139878367, + "grad_norm": 0.00024727650452405214, + "learning_rate": 4.823704025485086e-07, + "loss": 0.0, + "step": 54730 + }, + { + "epoch": 11.88966116420504, + "grad_norm": 0.00019835654529742897, + "learning_rate": 4.733203011873733e-07, + "loss": 0.0, + "step": 54740 + }, + { + "epoch": 11.891833188531711, + "grad_norm": 0.00019501452334225178, + "learning_rate": 4.6427019982623807e-07, + "loss": 0.0, + "step": 54750 + }, + { + "epoch": 11.894005212858383, + "grad_norm": 0.00020195850811433047, + "learning_rate": 4.552200984651029e-07, + "loss": 0.0, + "step": 54760 + }, + { + "epoch": 11.896177237185057, + "grad_norm": 0.17142532765865326, + "learning_rate": 4.4616999710396754e-07, + "loss": 0.0047, + "step": 54770 + }, + { + "epoch": 11.89834926151173, + "grad_norm": 0.0001938289642566815, + "learning_rate": 4.371198957428323e-07, + "loss": 0.0, + "step": 54780 + }, + { + "epoch": 11.900521285838401, + "grad_norm": 0.00020454356854315847, + "learning_rate": 4.2806979438169707e-07, + "loss": 0.0, + "step": 54790 + }, + { + "epoch": 11.902693310165073, + "grad_norm": 0.043572086840867996, + "learning_rate": 4.190196930205619e-07, + "loss": 0.0, + "step": 54800 + }, + { + "epoch": 11.904865334491745, + "grad_norm": 0.00019681689445860684, + "learning_rate": 4.0996959165942665e-07, + "loss": 0.0, + "step": 54810 + }, + { + "epoch": 11.90703735881842, + "grad_norm": 0.0002530421188566834, + "learning_rate": 4.009194902982913e-07, + "loss": 0.0094, + "step": 54820 + }, + { + "epoch": 11.909209383145091, + "grad_norm": 0.00019192055333405733, + "learning_rate": 3.918693889371561e-07, + "loss": 0.0, + "step": 54830 + }, + { + "epoch": 11.911381407471763, + "grad_norm": 0.00019355231779627502, + "learning_rate": 3.828192875760209e-07, + "loss": 0.0, + "step": 54840 + }, + { + "epoch": 11.913553431798436, + "grad_norm": 0.00023093956406228244, + "learning_rate": 3.7376918621488564e-07, + "loss": 0.0, + "step": 54850 + }, + { + "epoch": 11.91572545612511, + "grad_norm": 0.00019186771532986313, + "learning_rate": 3.6471908485375035e-07, + "loss": 0.0, + "step": 54860 + }, + { + "epoch": 11.917897480451781, + "grad_norm": 0.00019328697817400098, + "learning_rate": 3.556689834926151e-07, + "loss": 0.0, + "step": 54870 + }, + { + "epoch": 11.920069504778454, + "grad_norm": 0.000195430257008411, + "learning_rate": 3.466188821314799e-07, + "loss": 0.0, + "step": 54880 + }, + { + "epoch": 11.922241529105126, + "grad_norm": 0.00019267095194663852, + "learning_rate": 3.3756878077034464e-07, + "loss": 0.0, + "step": 54890 + }, + { + "epoch": 11.924413553431798, + "grad_norm": 0.00019316418911330402, + "learning_rate": 3.285186794092094e-07, + "loss": 0.0, + "step": 54900 + }, + { + "epoch": 11.926585577758472, + "grad_norm": 0.00019968993728980422, + "learning_rate": 3.1946857804807417e-07, + "loss": 0.0044, + "step": 54910 + }, + { + "epoch": 11.928757602085144, + "grad_norm": 0.00019761281146202236, + "learning_rate": 3.1041847668693893e-07, + "loss": 0.0, + "step": 54920 + }, + { + "epoch": 11.930929626411816, + "grad_norm": 0.16718927025794983, + "learning_rate": 3.0136837532580364e-07, + "loss": 0.0048, + "step": 54930 + }, + { + "epoch": 11.933101650738488, + "grad_norm": 0.00022303135483525693, + "learning_rate": 2.923182739646684e-07, + "loss": 0.0, + "step": 54940 + }, + { + "epoch": 11.935273675065162, + "grad_norm": 0.00019292996148578823, + "learning_rate": 2.8326817260353316e-07, + "loss": 0.0, + "step": 54950 + }, + { + "epoch": 11.937445699391834, + "grad_norm": 0.0001956681371666491, + "learning_rate": 2.7421807124239793e-07, + "loss": 0.0, + "step": 54960 + }, + { + "epoch": 11.939617723718506, + "grad_norm": 0.00019297373364679515, + "learning_rate": 2.651679698812627e-07, + "loss": 0.0, + "step": 54970 + }, + { + "epoch": 11.941789748045178, + "grad_norm": 0.00019810539379250258, + "learning_rate": 2.561178685201274e-07, + "loss": 0.0, + "step": 54980 + }, + { + "epoch": 11.94396177237185, + "grad_norm": 0.0001998866646317765, + "learning_rate": 2.470677671589922e-07, + "loss": 0.0, + "step": 54990 + }, + { + "epoch": 11.946133796698524, + "grad_norm": 0.00024236796889454126, + "learning_rate": 2.3801766579785698e-07, + "loss": 0.0, + "step": 55000 + }, + { + "epoch": 11.948305821025196, + "grad_norm": 0.00019588737632147968, + "learning_rate": 2.2896756443672169e-07, + "loss": 0.0048, + "step": 55010 + }, + { + "epoch": 11.950477845351868, + "grad_norm": 0.00029732659459114075, + "learning_rate": 2.1991746307558648e-07, + "loss": 0.0, + "step": 55020 + }, + { + "epoch": 11.95264986967854, + "grad_norm": 0.00019339253776706755, + "learning_rate": 2.1086736171445119e-07, + "loss": 0.0, + "step": 55030 + }, + { + "epoch": 11.954821894005214, + "grad_norm": 0.0001921090151881799, + "learning_rate": 2.0181726035331597e-07, + "loss": 0.005, + "step": 55040 + }, + { + "epoch": 11.956993918331886, + "grad_norm": 0.00019395742856431752, + "learning_rate": 1.927671589921807e-07, + "loss": 0.0, + "step": 55050 + }, + { + "epoch": 11.959165942658558, + "grad_norm": 0.00019353099924046546, + "learning_rate": 1.8371705763104547e-07, + "loss": 0.0, + "step": 55060 + }, + { + "epoch": 11.96133796698523, + "grad_norm": 0.00025086343521252275, + "learning_rate": 1.7466695626991024e-07, + "loss": 0.0, + "step": 55070 + }, + { + "epoch": 11.963509991311902, + "grad_norm": 0.0002000442473217845, + "learning_rate": 1.65616854908775e-07, + "loss": 0.0, + "step": 55080 + }, + { + "epoch": 11.965682015638576, + "grad_norm": 0.00019300452549941838, + "learning_rate": 1.5656675354763973e-07, + "loss": 0.0, + "step": 55090 + }, + { + "epoch": 11.967854039965248, + "grad_norm": 0.000195394764887169, + "learning_rate": 1.475166521865045e-07, + "loss": 0.0048, + "step": 55100 + }, + { + "epoch": 11.97002606429192, + "grad_norm": 0.00020142064022365957, + "learning_rate": 1.3846655082536923e-07, + "loss": 0.0044, + "step": 55110 + }, + { + "epoch": 11.972198088618592, + "grad_norm": 0.00020469767332542688, + "learning_rate": 1.2941644946423402e-07, + "loss": 0.0, + "step": 55120 + }, + { + "epoch": 11.974370112945264, + "grad_norm": 0.00019746186444535851, + "learning_rate": 1.2036634810309876e-07, + "loss": 0.0, + "step": 55130 + }, + { + "epoch": 11.976542137271938, + "grad_norm": 0.00019035911827813834, + "learning_rate": 1.1131624674196352e-07, + "loss": 0.0, + "step": 55140 + }, + { + "epoch": 11.97871416159861, + "grad_norm": 0.00019300123676657677, + "learning_rate": 1.0226614538082827e-07, + "loss": 0.0, + "step": 55150 + }, + { + "epoch": 11.980886185925282, + "grad_norm": 0.00021308429131750017, + "learning_rate": 9.321604401969303e-08, + "loss": 0.0, + "step": 55160 + }, + { + "epoch": 11.983058210251954, + "grad_norm": 0.00019624890410341322, + "learning_rate": 8.416594265855778e-08, + "loss": 0.0, + "step": 55170 + }, + { + "epoch": 11.985230234578626, + "grad_norm": 0.0002495882799848914, + "learning_rate": 7.511584129742253e-08, + "loss": 0.0, + "step": 55180 + }, + { + "epoch": 11.9874022589053, + "grad_norm": 0.0002517025568522513, + "learning_rate": 6.60657399362873e-08, + "loss": 0.0, + "step": 55190 + }, + { + "epoch": 11.989574283231972, + "grad_norm": 0.0002455560024827719, + "learning_rate": 5.7015638575152043e-08, + "loss": 0.0, + "step": 55200 + }, + { + "epoch": 11.991746307558644, + "grad_norm": 0.0002454043715260923, + "learning_rate": 4.79655372140168e-08, + "loss": 0.0, + "step": 55210 + }, + { + "epoch": 11.993918331885316, + "grad_norm": 0.00019305164460092783, + "learning_rate": 3.8915435852881555e-08, + "loss": 0.0, + "step": 55220 + }, + { + "epoch": 11.99609035621199, + "grad_norm": 0.0004981443635188043, + "learning_rate": 2.986533449174631e-08, + "loss": 0.0, + "step": 55230 + }, + { + "epoch": 11.998262380538662, + "grad_norm": 0.00019264254660811275, + "learning_rate": 2.0815233130611064e-08, + "loss": 0.0, + "step": 55240 + }, + { + "epoch": 12.0, + "eval_f1": 0.6394052044609665, + "eval_loss": 0.09074818342924118, + "eval_runtime": 83.8935, + "eval_samples_per_second": 118.901, + "eval_steps_per_second": 7.438, + "step": 55248 + }, + { + "epoch": 12.0, + "step": 55248, + "total_flos": 6.849671946013016e+19, + "train_loss": 0.0005868991143025554, + "train_runtime": 6465.7027, + "train_samples_per_second": 136.709, + "train_steps_per_second": 8.545 } ], "logging_steps": 10, - "max_steps": 36832, + "max_steps": 55248, "num_input_tokens_seen": 0, - "num_train_epochs": 8, + "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -25888,7 +38811,7 @@ "attributes": {} } }, - "total_flos": 4.566447964008677e+19, + "total_flos": 6.849671946013016e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null