diff --git "a/checkpoint-31253/trainer_state.json" "b/checkpoint-31253/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-31253/trainer_state.json" @@ -0,0 +1,21909 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 31253, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003199692829488369, + "grad_norm": 24.0, + "learning_rate": 1.1516314779270635e-07, + "loss": 0.5918, + "step": 10 + }, + { + "epoch": 0.0006399385658976738, + "grad_norm": 18.875, + "learning_rate": 2.431222008957134e-07, + "loss": 0.5934, + "step": 20 + }, + { + "epoch": 0.0009599078488465107, + "grad_norm": 15.4375, + "learning_rate": 3.7108125399872046e-07, + "loss": 0.6051, + "step": 30 + }, + { + "epoch": 0.0012798771317953477, + "grad_norm": 19.5, + "learning_rate": 4.990403071017275e-07, + "loss": 0.5723, + "step": 40 + }, + { + "epoch": 0.0015998464147441846, + "grad_norm": 11.75, + "learning_rate": 6.269993602047346e-07, + "loss": 0.6254, + "step": 50 + }, + { + "epoch": 0.0019198156976930215, + "grad_norm": 11.0625, + "learning_rate": 7.549584133077416e-07, + "loss": 0.5764, + "step": 60 + }, + { + "epoch": 0.0022397849806418584, + "grad_norm": 7.5, + "learning_rate": 8.829174664107486e-07, + "loss": 0.584, + "step": 70 + }, + { + "epoch": 0.0025597542635906953, + "grad_norm": 15.25, + "learning_rate": 1.0108765195137557e-06, + "loss": 0.5797, + "step": 80 + }, + { + "epoch": 0.002879723546539532, + "grad_norm": 14.0625, + "learning_rate": 1.1388355726167627e-06, + "loss": 0.6012, + "step": 90 + }, + { + "epoch": 0.003199692829488369, + "grad_norm": 7.90625, + "learning_rate": 1.2667946257197696e-06, + "loss": 0.5617, + "step": 100 + }, + { + "epoch": 0.003519662112437206, + "grad_norm": 6.90625, + "learning_rate": 1.394753678822777e-06, + "loss": 0.584, + "step": 110 + }, + { + "epoch": 0.003839631395386043, + "grad_norm": 18.0, + "learning_rate": 1.5227127319257839e-06, + "loss": 0.6004, + "step": 120 + }, + { + "epoch": 0.00415960067833488, + "grad_norm": 18.25, + "learning_rate": 1.650671785028791e-06, + "loss": 0.5977, + "step": 130 + }, + { + "epoch": 0.004479569961283717, + "grad_norm": 10.9375, + "learning_rate": 1.778630838131798e-06, + "loss": 0.5578, + "step": 140 + }, + { + "epoch": 0.004799539244232554, + "grad_norm": 6.9375, + "learning_rate": 1.906589891234805e-06, + "loss": 0.5926, + "step": 150 + }, + { + "epoch": 0.005119508527181391, + "grad_norm": 11.0, + "learning_rate": 2.0345489443378122e-06, + "loss": 0.6055, + "step": 160 + }, + { + "epoch": 0.0054394778101302275, + "grad_norm": 11.1875, + "learning_rate": 2.1625079974408194e-06, + "loss": 0.5736, + "step": 170 + }, + { + "epoch": 0.005759447093079064, + "grad_norm": 7.15625, + "learning_rate": 2.290467050543826e-06, + "loss": 0.5375, + "step": 180 + }, + { + "epoch": 0.006079416376027901, + "grad_norm": 15.375, + "learning_rate": 2.4184261036468333e-06, + "loss": 0.6051, + "step": 190 + }, + { + "epoch": 0.006399385658976738, + "grad_norm": 10.375, + "learning_rate": 2.5463851567498404e-06, + "loss": 0.5988, + "step": 200 + }, + { + "epoch": 0.006719354941925575, + "grad_norm": 6.90625, + "learning_rate": 2.674344209852847e-06, + "loss": 0.5555, + "step": 210 + }, + { + "epoch": 0.007039324224874412, + "grad_norm": 12.1875, + "learning_rate": 2.8023032629558543e-06, + "loss": 0.5713, + "step": 220 + }, + { + "epoch": 0.007359293507823249, + "grad_norm": 17.25, + "learning_rate": 2.930262316058861e-06, + "loss": 0.5605, + "step": 230 + }, + { + "epoch": 0.007679262790772086, + "grad_norm": 10.375, + "learning_rate": 3.0582213691618685e-06, + "loss": 0.565, + "step": 240 + }, + { + "epoch": 0.007999232073720922, + "grad_norm": 9.25, + "learning_rate": 3.1861804222648757e-06, + "loss": 0.5705, + "step": 250 + }, + { + "epoch": 0.00831920135666976, + "grad_norm": 25.125, + "learning_rate": 3.3141394753678824e-06, + "loss": 0.5398, + "step": 260 + }, + { + "epoch": 0.008639170639618596, + "grad_norm": 14.25, + "learning_rate": 3.4420985284708895e-06, + "loss": 0.5424, + "step": 270 + }, + { + "epoch": 0.008959139922567434, + "grad_norm": 24.0, + "learning_rate": 3.5700575815738963e-06, + "loss": 0.5867, + "step": 280 + }, + { + "epoch": 0.00927910920551627, + "grad_norm": 6.34375, + "learning_rate": 3.698016634676904e-06, + "loss": 0.5211, + "step": 290 + }, + { + "epoch": 0.009599078488465107, + "grad_norm": 8.375, + "learning_rate": 3.8259756877799105e-06, + "loss": 0.5086, + "step": 300 + }, + { + "epoch": 0.009919047771413943, + "grad_norm": 13.125, + "learning_rate": 3.953934740882917e-06, + "loss": 0.5596, + "step": 310 + }, + { + "epoch": 0.010239017054362781, + "grad_norm": 6.875, + "learning_rate": 4.081893793985925e-06, + "loss": 0.565, + "step": 320 + }, + { + "epoch": 0.010558986337311617, + "grad_norm": 7.25, + "learning_rate": 4.209852847088932e-06, + "loss": 0.5412, + "step": 330 + }, + { + "epoch": 0.010878955620260455, + "grad_norm": 13.125, + "learning_rate": 4.337811900191939e-06, + "loss": 0.5969, + "step": 340 + }, + { + "epoch": 0.011198924903209291, + "grad_norm": 10.375, + "learning_rate": 4.465770953294946e-06, + "loss": 0.5518, + "step": 350 + }, + { + "epoch": 0.011518894186158129, + "grad_norm": 11.625, + "learning_rate": 4.5937300063979526e-06, + "loss": 0.5232, + "step": 360 + }, + { + "epoch": 0.011838863469106965, + "grad_norm": 7.53125, + "learning_rate": 4.72168905950096e-06, + "loss": 0.5139, + "step": 370 + }, + { + "epoch": 0.012158832752055803, + "grad_norm": 11.625, + "learning_rate": 4.849648112603968e-06, + "loss": 0.5734, + "step": 380 + }, + { + "epoch": 0.012478802035004639, + "grad_norm": 9.25, + "learning_rate": 4.977607165706974e-06, + "loss": 0.4797, + "step": 390 + }, + { + "epoch": 0.012798771317953477, + "grad_norm": 8.75, + "learning_rate": 5.105566218809981e-06, + "loss": 0.4947, + "step": 400 + }, + { + "epoch": 0.013118740600902313, + "grad_norm": 17.25, + "learning_rate": 5.233525271912988e-06, + "loss": 0.5643, + "step": 410 + }, + { + "epoch": 0.01343870988385115, + "grad_norm": 15.625, + "learning_rate": 5.361484325015995e-06, + "loss": 0.5088, + "step": 420 + }, + { + "epoch": 0.013758679166799986, + "grad_norm": 10.5625, + "learning_rate": 5.489443378119003e-06, + "loss": 0.5064, + "step": 430 + }, + { + "epoch": 0.014078648449748824, + "grad_norm": 8.0625, + "learning_rate": 5.61740243122201e-06, + "loss": 0.5547, + "step": 440 + }, + { + "epoch": 0.01439861773269766, + "grad_norm": 6.75, + "learning_rate": 5.745361484325016e-06, + "loss": 0.4967, + "step": 450 + }, + { + "epoch": 0.014718587015646498, + "grad_norm": 18.125, + "learning_rate": 5.873320537428023e-06, + "loss": 0.5059, + "step": 460 + }, + { + "epoch": 0.015038556298595334, + "grad_norm": 16.0, + "learning_rate": 6.001279590531031e-06, + "loss": 0.5262, + "step": 470 + }, + { + "epoch": 0.015358525581544172, + "grad_norm": 10.25, + "learning_rate": 6.129238643634038e-06, + "loss": 0.5311, + "step": 480 + }, + { + "epoch": 0.01567849486449301, + "grad_norm": 14.3125, + "learning_rate": 6.257197696737045e-06, + "loss": 0.5551, + "step": 490 + }, + { + "epoch": 0.015998464147441844, + "grad_norm": 12.6875, + "learning_rate": 6.385156749840052e-06, + "loss": 0.5479, + "step": 500 + }, + { + "epoch": 0.01631843343039068, + "grad_norm": 20.25, + "learning_rate": 6.513115802943058e-06, + "loss": 0.5518, + "step": 510 + }, + { + "epoch": 0.01663840271333952, + "grad_norm": 8.0, + "learning_rate": 6.641074856046066e-06, + "loss": 0.565, + "step": 520 + }, + { + "epoch": 0.016958371996288357, + "grad_norm": 19.125, + "learning_rate": 6.769033909149073e-06, + "loss": 0.5084, + "step": 530 + }, + { + "epoch": 0.01727834127923719, + "grad_norm": 9.5, + "learning_rate": 6.89699296225208e-06, + "loss": 0.5621, + "step": 540 + }, + { + "epoch": 0.01759831056218603, + "grad_norm": 12.5625, + "learning_rate": 7.024952015355086e-06, + "loss": 0.527, + "step": 550 + }, + { + "epoch": 0.017918279845134867, + "grad_norm": 11.875, + "learning_rate": 7.152911068458094e-06, + "loss": 0.5236, + "step": 560 + }, + { + "epoch": 0.018238249128083705, + "grad_norm": 9.8125, + "learning_rate": 7.280870121561101e-06, + "loss": 0.5033, + "step": 570 + }, + { + "epoch": 0.01855821841103254, + "grad_norm": 19.125, + "learning_rate": 7.408829174664108e-06, + "loss": 0.5205, + "step": 580 + }, + { + "epoch": 0.018878187693981377, + "grad_norm": 14.875, + "learning_rate": 7.5367882277671156e-06, + "loss": 0.583, + "step": 590 + }, + { + "epoch": 0.019198156976930215, + "grad_norm": 11.3125, + "learning_rate": 7.664747280870121e-06, + "loss": 0.4984, + "step": 600 + }, + { + "epoch": 0.019518126259879053, + "grad_norm": 17.875, + "learning_rate": 7.79270633397313e-06, + "loss": 0.4883, + "step": 610 + }, + { + "epoch": 0.019838095542827887, + "grad_norm": 26.75, + "learning_rate": 7.920665387076137e-06, + "loss": 0.534, + "step": 620 + }, + { + "epoch": 0.020158064825776725, + "grad_norm": 18.125, + "learning_rate": 8.048624440179143e-06, + "loss": 0.4973, + "step": 630 + }, + { + "epoch": 0.020478034108725562, + "grad_norm": 17.5, + "learning_rate": 8.176583493282152e-06, + "loss": 0.4695, + "step": 640 + }, + { + "epoch": 0.0207980033916744, + "grad_norm": 13.5625, + "learning_rate": 8.304542546385157e-06, + "loss": 0.5316, + "step": 650 + }, + { + "epoch": 0.021117972674623234, + "grad_norm": 44.0, + "learning_rate": 8.432501599488163e-06, + "loss": 0.4807, + "step": 660 + }, + { + "epoch": 0.021437941957572072, + "grad_norm": 41.5, + "learning_rate": 8.560460652591172e-06, + "loss": 0.4648, + "step": 670 + }, + { + "epoch": 0.02175791124052091, + "grad_norm": 20.375, + "learning_rate": 8.688419705694179e-06, + "loss": 0.4713, + "step": 680 + }, + { + "epoch": 0.022077880523469748, + "grad_norm": 46.75, + "learning_rate": 8.816378758797185e-06, + "loss": 0.3724, + "step": 690 + }, + { + "epoch": 0.022397849806418582, + "grad_norm": 26.875, + "learning_rate": 8.944337811900192e-06, + "loss": 0.449, + "step": 700 + }, + { + "epoch": 0.02271781908936742, + "grad_norm": 47.75, + "learning_rate": 9.072296865003199e-06, + "loss": 0.4127, + "step": 710 + }, + { + "epoch": 0.023037788372316258, + "grad_norm": 27.375, + "learning_rate": 9.200255918106207e-06, + "loss": 0.4557, + "step": 720 + }, + { + "epoch": 0.023357757655265095, + "grad_norm": 15.1875, + "learning_rate": 9.328214971209214e-06, + "loss": 0.4498, + "step": 730 + }, + { + "epoch": 0.02367772693821393, + "grad_norm": 39.0, + "learning_rate": 9.45617402431222e-06, + "loss": 0.4418, + "step": 740 + }, + { + "epoch": 0.023997696221162768, + "grad_norm": 33.75, + "learning_rate": 9.584133077415227e-06, + "loss": 0.4421, + "step": 750 + }, + { + "epoch": 0.024317665504111605, + "grad_norm": 28.125, + "learning_rate": 9.712092130518234e-06, + "loss": 0.4023, + "step": 760 + }, + { + "epoch": 0.024637634787060443, + "grad_norm": 14.5625, + "learning_rate": 9.840051183621242e-06, + "loss": 0.3801, + "step": 770 + }, + { + "epoch": 0.024957604070009277, + "grad_norm": 35.0, + "learning_rate": 9.968010236724249e-06, + "loss": 0.4164, + "step": 780 + }, + { + "epoch": 0.025277573352958115, + "grad_norm": 24.125, + "learning_rate": 1.0095969289827256e-05, + "loss": 0.4189, + "step": 790 + }, + { + "epoch": 0.025597542635906953, + "grad_norm": 35.0, + "learning_rate": 1.0223928342930263e-05, + "loss": 0.3312, + "step": 800 + }, + { + "epoch": 0.02591751191885579, + "grad_norm": 87.5, + "learning_rate": 1.0351887396033271e-05, + "loss": 0.3484, + "step": 810 + }, + { + "epoch": 0.026237481201804625, + "grad_norm": 31.75, + "learning_rate": 1.0479846449136278e-05, + "loss": 0.2955, + "step": 820 + }, + { + "epoch": 0.026557450484753463, + "grad_norm": 34.0, + "learning_rate": 1.0607805502239283e-05, + "loss": 0.238, + "step": 830 + }, + { + "epoch": 0.0268774197677023, + "grad_norm": 61.75, + "learning_rate": 1.0735764555342291e-05, + "loss": 0.2479, + "step": 840 + }, + { + "epoch": 0.02719738905065114, + "grad_norm": 37.25, + "learning_rate": 1.0863723608445298e-05, + "loss": 0.2883, + "step": 850 + }, + { + "epoch": 0.027517358333599973, + "grad_norm": 68.0, + "learning_rate": 1.0991682661548306e-05, + "loss": 0.3091, + "step": 860 + }, + { + "epoch": 0.02783732761654881, + "grad_norm": 54.25, + "learning_rate": 1.1119641714651313e-05, + "loss": 0.185, + "step": 870 + }, + { + "epoch": 0.02815729689949765, + "grad_norm": 24.125, + "learning_rate": 1.1247600767754318e-05, + "loss": 0.2128, + "step": 880 + }, + { + "epoch": 0.028477266182446486, + "grad_norm": 17.25, + "learning_rate": 1.1375559820857326e-05, + "loss": 0.2112, + "step": 890 + }, + { + "epoch": 0.02879723546539532, + "grad_norm": 67.5, + "learning_rate": 1.1503518873960333e-05, + "loss": 0.1948, + "step": 900 + }, + { + "epoch": 0.029117204748344158, + "grad_norm": 39.75, + "learning_rate": 1.1631477927063342e-05, + "loss": 0.1911, + "step": 910 + }, + { + "epoch": 0.029437174031292996, + "grad_norm": 35.25, + "learning_rate": 1.1759436980166348e-05, + "loss": 0.2865, + "step": 920 + }, + { + "epoch": 0.029757143314241834, + "grad_norm": 41.75, + "learning_rate": 1.1887396033269353e-05, + "loss": 0.188, + "step": 930 + }, + { + "epoch": 0.030077112597190668, + "grad_norm": 43.25, + "learning_rate": 1.2015355086372362e-05, + "loss": 0.1514, + "step": 940 + }, + { + "epoch": 0.030397081880139506, + "grad_norm": 62.75, + "learning_rate": 1.2143314139475368e-05, + "loss": 0.2148, + "step": 950 + }, + { + "epoch": 0.030717051163088344, + "grad_norm": 15.125, + "learning_rate": 1.2271273192578377e-05, + "loss": 0.2088, + "step": 960 + }, + { + "epoch": 0.03103702044603718, + "grad_norm": 23.125, + "learning_rate": 1.2399232245681384e-05, + "loss": 0.1531, + "step": 970 + }, + { + "epoch": 0.03135698972898602, + "grad_norm": 10.875, + "learning_rate": 1.2527191298784389e-05, + "loss": 0.2149, + "step": 980 + }, + { + "epoch": 0.031676959011934853, + "grad_norm": 26.0, + "learning_rate": 1.2655150351887397e-05, + "loss": 0.1947, + "step": 990 + }, + { + "epoch": 0.03199692829488369, + "grad_norm": 23.875, + "learning_rate": 1.2783109404990404e-05, + "loss": 0.2217, + "step": 1000 + }, + { + "epoch": 0.03231689757783253, + "grad_norm": 23.875, + "learning_rate": 1.2911068458093412e-05, + "loss": 0.1784, + "step": 1010 + }, + { + "epoch": 0.03263686686078136, + "grad_norm": 13.3125, + "learning_rate": 1.3039027511196419e-05, + "loss": 0.1642, + "step": 1020 + }, + { + "epoch": 0.032956836143730205, + "grad_norm": 177.0, + "learning_rate": 1.3166986564299424e-05, + "loss": 0.222, + "step": 1030 + }, + { + "epoch": 0.03327680542667904, + "grad_norm": 31.25, + "learning_rate": 1.3294945617402432e-05, + "loss": 0.1771, + "step": 1040 + }, + { + "epoch": 0.03359677470962787, + "grad_norm": 23.875, + "learning_rate": 1.3422904670505439e-05, + "loss": 0.1787, + "step": 1050 + }, + { + "epoch": 0.033916743992576714, + "grad_norm": 53.25, + "learning_rate": 1.3550863723608447e-05, + "loss": 0.2089, + "step": 1060 + }, + { + "epoch": 0.03423671327552555, + "grad_norm": 23.5, + "learning_rate": 1.3678822776711454e-05, + "loss": 0.2751, + "step": 1070 + }, + { + "epoch": 0.03455668255847438, + "grad_norm": 8.875, + "learning_rate": 1.380678182981446e-05, + "loss": 0.2051, + "step": 1080 + }, + { + "epoch": 0.034876651841423224, + "grad_norm": 32.75, + "learning_rate": 1.3934740882917468e-05, + "loss": 0.1436, + "step": 1090 + }, + { + "epoch": 0.03519662112437206, + "grad_norm": 41.25, + "learning_rate": 1.4062699936020474e-05, + "loss": 0.2113, + "step": 1100 + }, + { + "epoch": 0.0355165904073209, + "grad_norm": 11.0, + "learning_rate": 1.4190658989123483e-05, + "loss": 0.2181, + "step": 1110 + }, + { + "epoch": 0.035836559690269734, + "grad_norm": 15.25, + "learning_rate": 1.431861804222649e-05, + "loss": 0.1552, + "step": 1120 + }, + { + "epoch": 0.03615652897321857, + "grad_norm": 9.125, + "learning_rate": 1.4446577095329494e-05, + "loss": 0.1851, + "step": 1130 + }, + { + "epoch": 0.03647649825616741, + "grad_norm": 16.375, + "learning_rate": 1.4574536148432503e-05, + "loss": 0.1531, + "step": 1140 + }, + { + "epoch": 0.036796467539116244, + "grad_norm": 10.875, + "learning_rate": 1.470249520153551e-05, + "loss": 0.1532, + "step": 1150 + }, + { + "epoch": 0.03711643682206508, + "grad_norm": 4.6875, + "learning_rate": 1.4830454254638518e-05, + "loss": 0.2477, + "step": 1160 + }, + { + "epoch": 0.03743640610501392, + "grad_norm": 27.0, + "learning_rate": 1.4958413307741525e-05, + "loss": 0.1875, + "step": 1170 + }, + { + "epoch": 0.037756375387962754, + "grad_norm": 14.75, + "learning_rate": 1.508637236084453e-05, + "loss": 0.1873, + "step": 1180 + }, + { + "epoch": 0.038076344670911595, + "grad_norm": 26.625, + "learning_rate": 1.5214331413947538e-05, + "loss": 0.2008, + "step": 1190 + }, + { + "epoch": 0.03839631395386043, + "grad_norm": 43.5, + "learning_rate": 1.5342290467050545e-05, + "loss": 0.1827, + "step": 1200 + }, + { + "epoch": 0.038716283236809264, + "grad_norm": 56.25, + "learning_rate": 1.547024952015355e-05, + "loss": 0.2334, + "step": 1210 + }, + { + "epoch": 0.039036252519758105, + "grad_norm": 14.375, + "learning_rate": 1.559820857325656e-05, + "loss": 0.1578, + "step": 1220 + }, + { + "epoch": 0.03935622180270694, + "grad_norm": 1.9765625, + "learning_rate": 1.5726167626359565e-05, + "loss": 0.2213, + "step": 1230 + }, + { + "epoch": 0.039676191085655774, + "grad_norm": 23.25, + "learning_rate": 1.5854126679462575e-05, + "loss": 0.1362, + "step": 1240 + }, + { + "epoch": 0.039996160368604615, + "grad_norm": 17.625, + "learning_rate": 1.598208573256558e-05, + "loss": 0.1613, + "step": 1250 + }, + { + "epoch": 0.04031612965155345, + "grad_norm": 10.9375, + "learning_rate": 1.611004478566859e-05, + "loss": 0.1239, + "step": 1260 + }, + { + "epoch": 0.04063609893450229, + "grad_norm": 15.625, + "learning_rate": 1.6238003838771595e-05, + "loss": 0.1669, + "step": 1270 + }, + { + "epoch": 0.040956068217451125, + "grad_norm": 27.75, + "learning_rate": 1.6365962891874602e-05, + "loss": 0.1449, + "step": 1280 + }, + { + "epoch": 0.04127603750039996, + "grad_norm": 35.0, + "learning_rate": 1.649392194497761e-05, + "loss": 0.1824, + "step": 1290 + }, + { + "epoch": 0.0415960067833488, + "grad_norm": 15.5625, + "learning_rate": 1.6621880998080615e-05, + "loss": 0.2166, + "step": 1300 + }, + { + "epoch": 0.041915976066297635, + "grad_norm": 14.3125, + "learning_rate": 1.6749840051183622e-05, + "loss": 0.1918, + "step": 1310 + }, + { + "epoch": 0.04223594534924647, + "grad_norm": 14.0625, + "learning_rate": 1.687779910428663e-05, + "loss": 0.2087, + "step": 1320 + }, + { + "epoch": 0.04255591463219531, + "grad_norm": 17.625, + "learning_rate": 1.7005758157389636e-05, + "loss": 0.1757, + "step": 1330 + }, + { + "epoch": 0.042875883915144145, + "grad_norm": 12.75, + "learning_rate": 1.7133717210492646e-05, + "loss": 0.2006, + "step": 1340 + }, + { + "epoch": 0.043195853198092986, + "grad_norm": 75.0, + "learning_rate": 1.726167626359565e-05, + "loss": 0.1555, + "step": 1350 + }, + { + "epoch": 0.04351582248104182, + "grad_norm": 25.875, + "learning_rate": 1.738963531669866e-05, + "loss": 0.2205, + "step": 1360 + }, + { + "epoch": 0.043835791763990654, + "grad_norm": 8.5625, + "learning_rate": 1.7517594369801666e-05, + "loss": 0.1482, + "step": 1370 + }, + { + "epoch": 0.044155761046939496, + "grad_norm": 4.875, + "learning_rate": 1.7645553422904673e-05, + "loss": 0.1595, + "step": 1380 + }, + { + "epoch": 0.04447573032988833, + "grad_norm": 20.125, + "learning_rate": 1.777351247600768e-05, + "loss": 0.2334, + "step": 1390 + }, + { + "epoch": 0.044795699612837164, + "grad_norm": 10.8125, + "learning_rate": 1.7901471529110686e-05, + "loss": 0.1588, + "step": 1400 + }, + { + "epoch": 0.045115668895786006, + "grad_norm": 12.8125, + "learning_rate": 1.8029430582213693e-05, + "loss": 0.174, + "step": 1410 + }, + { + "epoch": 0.04543563817873484, + "grad_norm": 35.25, + "learning_rate": 1.81573896353167e-05, + "loss": 0.1197, + "step": 1420 + }, + { + "epoch": 0.04575560746168368, + "grad_norm": 0.5703125, + "learning_rate": 1.8285348688419706e-05, + "loss": 0.0996, + "step": 1430 + }, + { + "epoch": 0.046075576744632515, + "grad_norm": 12.875, + "learning_rate": 1.8413307741522716e-05, + "loss": 0.2596, + "step": 1440 + }, + { + "epoch": 0.04639554602758135, + "grad_norm": 53.5, + "learning_rate": 1.854126679462572e-05, + "loss": 0.1828, + "step": 1450 + }, + { + "epoch": 0.04671551531053019, + "grad_norm": 9.625, + "learning_rate": 1.866922584772873e-05, + "loss": 0.2171, + "step": 1460 + }, + { + "epoch": 0.047035484593479025, + "grad_norm": 7.375, + "learning_rate": 1.8797184900831736e-05, + "loss": 0.1726, + "step": 1470 + }, + { + "epoch": 0.04735545387642786, + "grad_norm": 10.375, + "learning_rate": 1.892514395393474e-05, + "loss": 0.1681, + "step": 1480 + }, + { + "epoch": 0.0476754231593767, + "grad_norm": 12.3125, + "learning_rate": 1.905310300703775e-05, + "loss": 0.1735, + "step": 1490 + }, + { + "epoch": 0.047995392442325535, + "grad_norm": 16.25, + "learning_rate": 1.9181062060140757e-05, + "loss": 0.136, + "step": 1500 + }, + { + "epoch": 0.048315361725274376, + "grad_norm": 5.84375, + "learning_rate": 1.9309021113243763e-05, + "loss": 0.1463, + "step": 1510 + }, + { + "epoch": 0.04863533100822321, + "grad_norm": 12.5, + "learning_rate": 1.943698016634677e-05, + "loss": 0.1917, + "step": 1520 + }, + { + "epoch": 0.048955300291172045, + "grad_norm": 7.53125, + "learning_rate": 1.9564939219449777e-05, + "loss": 0.1536, + "step": 1530 + }, + { + "epoch": 0.049275269574120886, + "grad_norm": 19.25, + "learning_rate": 1.9692898272552783e-05, + "loss": 0.1839, + "step": 1540 + }, + { + "epoch": 0.04959523885706972, + "grad_norm": 11.25, + "learning_rate": 1.982085732565579e-05, + "loss": 0.1862, + "step": 1550 + }, + { + "epoch": 0.049915208140018555, + "grad_norm": 10.8125, + "learning_rate": 1.99488163787588e-05, + "loss": 0.1564, + "step": 1560 + }, + { + "epoch": 0.050235177422967396, + "grad_norm": 11.75, + "learning_rate": 1.9995958235095995e-05, + "loss": 0.1404, + "step": 1570 + }, + { + "epoch": 0.05055514670591623, + "grad_norm": 22.0, + "learning_rate": 1.998922196025598e-05, + "loss": 0.127, + "step": 1580 + }, + { + "epoch": 0.05087511598886507, + "grad_norm": 26.875, + "learning_rate": 1.9982485685415967e-05, + "loss": 0.1784, + "step": 1590 + }, + { + "epoch": 0.051195085271813906, + "grad_norm": 28.375, + "learning_rate": 1.997574941057595e-05, + "loss": 0.1716, + "step": 1600 + }, + { + "epoch": 0.05151505455476274, + "grad_norm": 26.0, + "learning_rate": 1.996901313573594e-05, + "loss": 0.2349, + "step": 1610 + }, + { + "epoch": 0.05183502383771158, + "grad_norm": 15.375, + "learning_rate": 1.9962276860895927e-05, + "loss": 0.1424, + "step": 1620 + }, + { + "epoch": 0.052154993120660416, + "grad_norm": 16.625, + "learning_rate": 1.995554058605591e-05, + "loss": 0.1324, + "step": 1630 + }, + { + "epoch": 0.05247496240360925, + "grad_norm": 5.5, + "learning_rate": 1.99488043112159e-05, + "loss": 0.131, + "step": 1640 + }, + { + "epoch": 0.05279493168655809, + "grad_norm": 6.25, + "learning_rate": 1.9942068036375887e-05, + "loss": 0.1536, + "step": 1650 + }, + { + "epoch": 0.053114900969506926, + "grad_norm": 27.875, + "learning_rate": 1.993533176153587e-05, + "loss": 0.193, + "step": 1660 + }, + { + "epoch": 0.05343487025245577, + "grad_norm": 8.4375, + "learning_rate": 1.992859548669586e-05, + "loss": 0.162, + "step": 1670 + }, + { + "epoch": 0.0537548395354046, + "grad_norm": 9.0, + "learning_rate": 1.9921859211855846e-05, + "loss": 0.1653, + "step": 1680 + }, + { + "epoch": 0.054074808818353436, + "grad_norm": 20.875, + "learning_rate": 1.991512293701583e-05, + "loss": 0.1217, + "step": 1690 + }, + { + "epoch": 0.05439477810130228, + "grad_norm": 26.375, + "learning_rate": 1.990838666217582e-05, + "loss": 0.1305, + "step": 1700 + }, + { + "epoch": 0.05471474738425111, + "grad_norm": 21.125, + "learning_rate": 1.9901650387335806e-05, + "loss": 0.1377, + "step": 1710 + }, + { + "epoch": 0.055034716667199945, + "grad_norm": 58.0, + "learning_rate": 1.989491411249579e-05, + "loss": 0.1531, + "step": 1720 + }, + { + "epoch": 0.05535468595014879, + "grad_norm": 14.25, + "learning_rate": 1.988817783765578e-05, + "loss": 0.1842, + "step": 1730 + }, + { + "epoch": 0.05567465523309762, + "grad_norm": 30.0, + "learning_rate": 1.9881441562815763e-05, + "loss": 0.1714, + "step": 1740 + }, + { + "epoch": 0.05599462451604646, + "grad_norm": 39.75, + "learning_rate": 1.987470528797575e-05, + "loss": 0.1354, + "step": 1750 + }, + { + "epoch": 0.0563145937989953, + "grad_norm": 11.6875, + "learning_rate": 1.9867969013135738e-05, + "loss": 0.1808, + "step": 1760 + }, + { + "epoch": 0.05663456308194413, + "grad_norm": 11.3125, + "learning_rate": 1.9861232738295723e-05, + "loss": 0.1694, + "step": 1770 + }, + { + "epoch": 0.05695453236489297, + "grad_norm": 9.5625, + "learning_rate": 1.985449646345571e-05, + "loss": 0.098, + "step": 1780 + }, + { + "epoch": 0.057274501647841806, + "grad_norm": 12.9375, + "learning_rate": 1.9847760188615698e-05, + "loss": 0.0779, + "step": 1790 + }, + { + "epoch": 0.05759447093079064, + "grad_norm": 10.5625, + "learning_rate": 1.9841023913775682e-05, + "loss": 0.1768, + "step": 1800 + }, + { + "epoch": 0.05791444021373948, + "grad_norm": 18.875, + "learning_rate": 1.983428763893567e-05, + "loss": 0.157, + "step": 1810 + }, + { + "epoch": 0.058234409496688316, + "grad_norm": 18.0, + "learning_rate": 1.9827551364095658e-05, + "loss": 0.1839, + "step": 1820 + }, + { + "epoch": 0.05855437877963716, + "grad_norm": 23.875, + "learning_rate": 1.9820815089255642e-05, + "loss": 0.1748, + "step": 1830 + }, + { + "epoch": 0.05887434806258599, + "grad_norm": 15.0, + "learning_rate": 1.981407881441563e-05, + "loss": 0.1252, + "step": 1840 + }, + { + "epoch": 0.059194317345534826, + "grad_norm": 5.8125, + "learning_rate": 1.9807342539575614e-05, + "loss": 0.173, + "step": 1850 + }, + { + "epoch": 0.05951428662848367, + "grad_norm": 32.25, + "learning_rate": 1.9800606264735602e-05, + "loss": 0.1857, + "step": 1860 + }, + { + "epoch": 0.0598342559114325, + "grad_norm": 9.9375, + "learning_rate": 1.979386998989559e-05, + "loss": 0.1365, + "step": 1870 + }, + { + "epoch": 0.060154225194381336, + "grad_norm": 7.4375, + "learning_rate": 1.9787133715055574e-05, + "loss": 0.176, + "step": 1880 + }, + { + "epoch": 0.06047419447733018, + "grad_norm": 6.125, + "learning_rate": 1.9780397440215562e-05, + "loss": 0.1885, + "step": 1890 + }, + { + "epoch": 0.06079416376027901, + "grad_norm": 18.25, + "learning_rate": 1.977366116537555e-05, + "loss": 0.2063, + "step": 1900 + }, + { + "epoch": 0.06111413304322785, + "grad_norm": 10.1875, + "learning_rate": 1.9766924890535534e-05, + "loss": 0.1415, + "step": 1910 + }, + { + "epoch": 0.06143410232617669, + "grad_norm": 0.47265625, + "learning_rate": 1.976018861569552e-05, + "loss": 0.1379, + "step": 1920 + }, + { + "epoch": 0.06175407160912552, + "grad_norm": 3.84375, + "learning_rate": 1.975345234085551e-05, + "loss": 0.167, + "step": 1930 + }, + { + "epoch": 0.06207404089207436, + "grad_norm": 20.875, + "learning_rate": 1.9746716066015494e-05, + "loss": 0.1686, + "step": 1940 + }, + { + "epoch": 0.0623940101750232, + "grad_norm": 38.25, + "learning_rate": 1.973997979117548e-05, + "loss": 0.1934, + "step": 1950 + }, + { + "epoch": 0.06271397945797204, + "grad_norm": 14.0, + "learning_rate": 1.9733243516335466e-05, + "loss": 0.1013, + "step": 1960 + }, + { + "epoch": 0.06303394874092087, + "grad_norm": 15.25, + "learning_rate": 1.9726507241495454e-05, + "loss": 0.1828, + "step": 1970 + }, + { + "epoch": 0.06335391802386971, + "grad_norm": 23.5, + "learning_rate": 1.971977096665544e-05, + "loss": 0.1755, + "step": 1980 + }, + { + "epoch": 0.06367388730681854, + "grad_norm": 6.71875, + "learning_rate": 1.9713034691815426e-05, + "loss": 0.1267, + "step": 1990 + }, + { + "epoch": 0.06399385658976738, + "grad_norm": 0.46875, + "learning_rate": 1.9706298416975413e-05, + "loss": 0.1271, + "step": 2000 + }, + { + "epoch": 0.06431382587271622, + "grad_norm": 18.25, + "learning_rate": 1.96995621421354e-05, + "loss": 0.1519, + "step": 2010 + }, + { + "epoch": 0.06463379515566506, + "grad_norm": 12.8125, + "learning_rate": 1.9692825867295385e-05, + "loss": 0.1959, + "step": 2020 + }, + { + "epoch": 0.06495376443861389, + "grad_norm": 7.375, + "learning_rate": 1.9686089592455373e-05, + "loss": 0.1422, + "step": 2030 + }, + { + "epoch": 0.06527373372156273, + "grad_norm": 26.75, + "learning_rate": 1.967935331761536e-05, + "loss": 0.1793, + "step": 2040 + }, + { + "epoch": 0.06559370300451156, + "grad_norm": 2.75, + "learning_rate": 1.9672617042775345e-05, + "loss": 0.1432, + "step": 2050 + }, + { + "epoch": 0.06591367228746041, + "grad_norm": 8.0625, + "learning_rate": 1.9665880767935333e-05, + "loss": 0.1691, + "step": 2060 + }, + { + "epoch": 0.06623364157040924, + "grad_norm": 29.625, + "learning_rate": 1.9659144493095317e-05, + "loss": 0.1804, + "step": 2070 + }, + { + "epoch": 0.06655361085335808, + "grad_norm": 5.90625, + "learning_rate": 1.9652408218255305e-05, + "loss": 0.1773, + "step": 2080 + }, + { + "epoch": 0.06687358013630691, + "grad_norm": 18.375, + "learning_rate": 1.9645671943415293e-05, + "loss": 0.1771, + "step": 2090 + }, + { + "epoch": 0.06719354941925575, + "grad_norm": 7.5, + "learning_rate": 1.9638935668575277e-05, + "loss": 0.1267, + "step": 2100 + }, + { + "epoch": 0.0675135187022046, + "grad_norm": 11.125, + "learning_rate": 1.9632199393735265e-05, + "loss": 0.1476, + "step": 2110 + }, + { + "epoch": 0.06783348798515343, + "grad_norm": 32.0, + "learning_rate": 1.9625463118895253e-05, + "loss": 0.1533, + "step": 2120 + }, + { + "epoch": 0.06815345726810226, + "grad_norm": 7.15625, + "learning_rate": 1.9618726844055237e-05, + "loss": 0.1269, + "step": 2130 + }, + { + "epoch": 0.0684734265510511, + "grad_norm": 25.375, + "learning_rate": 1.9611990569215225e-05, + "loss": 0.1749, + "step": 2140 + }, + { + "epoch": 0.06879339583399993, + "grad_norm": 4.5625, + "learning_rate": 1.9605254294375212e-05, + "loss": 0.1499, + "step": 2150 + }, + { + "epoch": 0.06911336511694877, + "grad_norm": 24.125, + "learning_rate": 1.9598518019535197e-05, + "loss": 0.1549, + "step": 2160 + }, + { + "epoch": 0.06943333439989761, + "grad_norm": 32.25, + "learning_rate": 1.9591781744695185e-05, + "loss": 0.2125, + "step": 2170 + }, + { + "epoch": 0.06975330368284645, + "grad_norm": 7.1875, + "learning_rate": 1.9585045469855172e-05, + "loss": 0.1628, + "step": 2180 + }, + { + "epoch": 0.07007327296579528, + "grad_norm": 6.1875, + "learning_rate": 1.9578309195015157e-05, + "loss": 0.2004, + "step": 2190 + }, + { + "epoch": 0.07039324224874412, + "grad_norm": 11.3125, + "learning_rate": 1.9571572920175144e-05, + "loss": 0.1494, + "step": 2200 + }, + { + "epoch": 0.07071321153169295, + "grad_norm": 5.6875, + "learning_rate": 1.9564836645335132e-05, + "loss": 0.1356, + "step": 2210 + }, + { + "epoch": 0.0710331808146418, + "grad_norm": 45.75, + "learning_rate": 1.955810037049512e-05, + "loss": 0.1338, + "step": 2220 + }, + { + "epoch": 0.07135315009759063, + "grad_norm": 10.5625, + "learning_rate": 1.9551364095655104e-05, + "loss": 0.1166, + "step": 2230 + }, + { + "epoch": 0.07167311938053947, + "grad_norm": 3.34375, + "learning_rate": 1.9544627820815092e-05, + "loss": 0.1911, + "step": 2240 + }, + { + "epoch": 0.0719930886634883, + "grad_norm": 2.03125, + "learning_rate": 1.953789154597508e-05, + "loss": 0.148, + "step": 2250 + }, + { + "epoch": 0.07231305794643714, + "grad_norm": 6.875, + "learning_rate": 1.9531155271135064e-05, + "loss": 0.1107, + "step": 2260 + }, + { + "epoch": 0.07263302722938599, + "grad_norm": 9.375, + "learning_rate": 1.9524418996295052e-05, + "loss": 0.2081, + "step": 2270 + }, + { + "epoch": 0.07295299651233482, + "grad_norm": 4.8125, + "learning_rate": 1.951768272145504e-05, + "loss": 0.1822, + "step": 2280 + }, + { + "epoch": 0.07327296579528365, + "grad_norm": 7.625, + "learning_rate": 1.9510946446615024e-05, + "loss": 0.149, + "step": 2290 + }, + { + "epoch": 0.07359293507823249, + "grad_norm": 16.375, + "learning_rate": 1.950421017177501e-05, + "loss": 0.152, + "step": 2300 + }, + { + "epoch": 0.07391290436118132, + "grad_norm": 20.625, + "learning_rate": 1.9497473896934996e-05, + "loss": 0.1349, + "step": 2310 + }, + { + "epoch": 0.07423287364413016, + "grad_norm": 9.1875, + "learning_rate": 1.9490737622094984e-05, + "loss": 0.1564, + "step": 2320 + }, + { + "epoch": 0.074552842927079, + "grad_norm": 8.25, + "learning_rate": 1.948400134725497e-05, + "loss": 0.1706, + "step": 2330 + }, + { + "epoch": 0.07487281221002784, + "grad_norm": 5.84375, + "learning_rate": 1.9477265072414956e-05, + "loss": 0.1003, + "step": 2340 + }, + { + "epoch": 0.07519278149297667, + "grad_norm": 11.8125, + "learning_rate": 1.9470528797574943e-05, + "loss": 0.1849, + "step": 2350 + }, + { + "epoch": 0.07551275077592551, + "grad_norm": 8.25, + "learning_rate": 1.946379252273493e-05, + "loss": 0.1823, + "step": 2360 + }, + { + "epoch": 0.07583272005887434, + "grad_norm": 9.625, + "learning_rate": 1.9457056247894916e-05, + "loss": 0.1679, + "step": 2370 + }, + { + "epoch": 0.07615268934182319, + "grad_norm": 4.75, + "learning_rate": 1.9450319973054903e-05, + "loss": 0.1187, + "step": 2380 + }, + { + "epoch": 0.07647265862477202, + "grad_norm": 3.484375, + "learning_rate": 1.944358369821489e-05, + "loss": 0.1311, + "step": 2390 + }, + { + "epoch": 0.07679262790772086, + "grad_norm": 11.5, + "learning_rate": 1.9436847423374875e-05, + "loss": 0.145, + "step": 2400 + }, + { + "epoch": 0.0771125971906697, + "grad_norm": 4.75, + "learning_rate": 1.9430111148534863e-05, + "loss": 0.1396, + "step": 2410 + }, + { + "epoch": 0.07743256647361853, + "grad_norm": 11.375, + "learning_rate": 1.9423374873694847e-05, + "loss": 0.155, + "step": 2420 + }, + { + "epoch": 0.07775253575656738, + "grad_norm": 12.5, + "learning_rate": 1.9416638598854835e-05, + "loss": 0.174, + "step": 2430 + }, + { + "epoch": 0.07807250503951621, + "grad_norm": 4.4375, + "learning_rate": 1.9409902324014823e-05, + "loss": 0.1328, + "step": 2440 + }, + { + "epoch": 0.07839247432246504, + "grad_norm": 12.0625, + "learning_rate": 1.9403166049174807e-05, + "loss": 0.1469, + "step": 2450 + }, + { + "epoch": 0.07871244360541388, + "grad_norm": 27.375, + "learning_rate": 1.9396429774334795e-05, + "loss": 0.1726, + "step": 2460 + }, + { + "epoch": 0.07903241288836271, + "grad_norm": 9.0, + "learning_rate": 1.9389693499494783e-05, + "loss": 0.205, + "step": 2470 + }, + { + "epoch": 0.07935238217131155, + "grad_norm": 18.375, + "learning_rate": 1.9382957224654767e-05, + "loss": 0.1394, + "step": 2480 + }, + { + "epoch": 0.0796723514542604, + "grad_norm": 9.4375, + "learning_rate": 1.9376220949814755e-05, + "loss": 0.1348, + "step": 2490 + }, + { + "epoch": 0.07999232073720923, + "grad_norm": 6.71875, + "learning_rate": 1.9369484674974743e-05, + "loss": 0.1325, + "step": 2500 + }, + { + "epoch": 0.08031229002015806, + "grad_norm": 10.0, + "learning_rate": 1.9362748400134727e-05, + "loss": 0.1706, + "step": 2510 + }, + { + "epoch": 0.0806322593031069, + "grad_norm": 7.0, + "learning_rate": 1.9356012125294715e-05, + "loss": 0.1367, + "step": 2520 + }, + { + "epoch": 0.08095222858605573, + "grad_norm": 4.78125, + "learning_rate": 1.9349275850454702e-05, + "loss": 0.1703, + "step": 2530 + }, + { + "epoch": 0.08127219786900458, + "grad_norm": 20.0, + "learning_rate": 1.9342539575614687e-05, + "loss": 0.17, + "step": 2540 + }, + { + "epoch": 0.08159216715195342, + "grad_norm": 7.09375, + "learning_rate": 1.9335803300774674e-05, + "loss": 0.118, + "step": 2550 + }, + { + "epoch": 0.08191213643490225, + "grad_norm": 3.046875, + "learning_rate": 1.932906702593466e-05, + "loss": 0.1729, + "step": 2560 + }, + { + "epoch": 0.08223210571785108, + "grad_norm": 17.0, + "learning_rate": 1.9322330751094647e-05, + "loss": 0.1268, + "step": 2570 + }, + { + "epoch": 0.08255207500079992, + "grad_norm": 5.9375, + "learning_rate": 1.9315594476254634e-05, + "loss": 0.1394, + "step": 2580 + }, + { + "epoch": 0.08287204428374877, + "grad_norm": 29.75, + "learning_rate": 1.930885820141462e-05, + "loss": 0.1999, + "step": 2590 + }, + { + "epoch": 0.0831920135666976, + "grad_norm": 6.6875, + "learning_rate": 1.9302121926574606e-05, + "loss": 0.1494, + "step": 2600 + }, + { + "epoch": 0.08351198284964644, + "grad_norm": 4.15625, + "learning_rate": 1.9295385651734594e-05, + "loss": 0.1139, + "step": 2610 + }, + { + "epoch": 0.08383195213259527, + "grad_norm": 21.375, + "learning_rate": 1.928864937689458e-05, + "loss": 0.1486, + "step": 2620 + }, + { + "epoch": 0.0841519214155441, + "grad_norm": 25.75, + "learning_rate": 1.9281913102054566e-05, + "loss": 0.1235, + "step": 2630 + }, + { + "epoch": 0.08447189069849294, + "grad_norm": 37.75, + "learning_rate": 1.9275176827214554e-05, + "loss": 0.1411, + "step": 2640 + }, + { + "epoch": 0.08479185998144179, + "grad_norm": 7.40625, + "learning_rate": 1.9268440552374538e-05, + "loss": 0.1276, + "step": 2650 + }, + { + "epoch": 0.08511182926439062, + "grad_norm": 8.0625, + "learning_rate": 1.9261704277534526e-05, + "loss": 0.1168, + "step": 2660 + }, + { + "epoch": 0.08543179854733945, + "grad_norm": 26.5, + "learning_rate": 1.925496800269451e-05, + "loss": 0.2217, + "step": 2670 + }, + { + "epoch": 0.08575176783028829, + "grad_norm": 14.5, + "learning_rate": 1.9248231727854498e-05, + "loss": 0.1944, + "step": 2680 + }, + { + "epoch": 0.08607173711323712, + "grad_norm": 15.0, + "learning_rate": 1.9241495453014486e-05, + "loss": 0.1108, + "step": 2690 + }, + { + "epoch": 0.08639170639618597, + "grad_norm": 35.25, + "learning_rate": 1.923475917817447e-05, + "loss": 0.116, + "step": 2700 + }, + { + "epoch": 0.0867116756791348, + "grad_norm": 10.3125, + "learning_rate": 1.9228022903334458e-05, + "loss": 0.0653, + "step": 2710 + }, + { + "epoch": 0.08703164496208364, + "grad_norm": 24.875, + "learning_rate": 1.9221286628494446e-05, + "loss": 0.1254, + "step": 2720 + }, + { + "epoch": 0.08735161424503247, + "grad_norm": 8.0625, + "learning_rate": 1.921455035365443e-05, + "loss": 0.1092, + "step": 2730 + }, + { + "epoch": 0.08767158352798131, + "grad_norm": 3.953125, + "learning_rate": 1.9207814078814418e-05, + "loss": 0.145, + "step": 2740 + }, + { + "epoch": 0.08799155281093016, + "grad_norm": 13.625, + "learning_rate": 1.9201077803974405e-05, + "loss": 0.2331, + "step": 2750 + }, + { + "epoch": 0.08831152209387899, + "grad_norm": 9.9375, + "learning_rate": 1.919434152913439e-05, + "loss": 0.1263, + "step": 2760 + }, + { + "epoch": 0.08863149137682783, + "grad_norm": 7.625, + "learning_rate": 1.9187605254294378e-05, + "loss": 0.0827, + "step": 2770 + }, + { + "epoch": 0.08895146065977666, + "grad_norm": 10.625, + "learning_rate": 1.9180868979454362e-05, + "loss": 0.1144, + "step": 2780 + }, + { + "epoch": 0.0892714299427255, + "grad_norm": 9.1875, + "learning_rate": 1.917413270461435e-05, + "loss": 0.1831, + "step": 2790 + }, + { + "epoch": 0.08959139922567433, + "grad_norm": 8.625, + "learning_rate": 1.9167396429774337e-05, + "loss": 0.1774, + "step": 2800 + }, + { + "epoch": 0.08991136850862318, + "grad_norm": 3.578125, + "learning_rate": 1.916066015493432e-05, + "loss": 0.1444, + "step": 2810 + }, + { + "epoch": 0.09023133779157201, + "grad_norm": 14.6875, + "learning_rate": 1.915392388009431e-05, + "loss": 0.1691, + "step": 2820 + }, + { + "epoch": 0.09055130707452085, + "grad_norm": 11.5, + "learning_rate": 1.9147187605254297e-05, + "loss": 0.119, + "step": 2830 + }, + { + "epoch": 0.09087127635746968, + "grad_norm": 2.296875, + "learning_rate": 1.914045133041428e-05, + "loss": 0.1408, + "step": 2840 + }, + { + "epoch": 0.09119124564041851, + "grad_norm": 11.125, + "learning_rate": 1.913371505557427e-05, + "loss": 0.1178, + "step": 2850 + }, + { + "epoch": 0.09151121492336736, + "grad_norm": 17.5, + "learning_rate": 1.9126978780734257e-05, + "loss": 0.0589, + "step": 2860 + }, + { + "epoch": 0.0918311842063162, + "grad_norm": 21.375, + "learning_rate": 1.912024250589424e-05, + "loss": 0.2153, + "step": 2870 + }, + { + "epoch": 0.09215115348926503, + "grad_norm": 1.3046875, + "learning_rate": 1.911350623105423e-05, + "loss": 0.1499, + "step": 2880 + }, + { + "epoch": 0.09247112277221387, + "grad_norm": 2.71875, + "learning_rate": 1.9106769956214213e-05, + "loss": 0.1632, + "step": 2890 + }, + { + "epoch": 0.0927910920551627, + "grad_norm": 4.625, + "learning_rate": 1.91000336813742e-05, + "loss": 0.1638, + "step": 2900 + }, + { + "epoch": 0.09311106133811155, + "grad_norm": 1.9296875, + "learning_rate": 1.909329740653419e-05, + "loss": 0.0883, + "step": 2910 + }, + { + "epoch": 0.09343103062106038, + "grad_norm": 3.453125, + "learning_rate": 1.9086561131694173e-05, + "loss": 0.1125, + "step": 2920 + }, + { + "epoch": 0.09375099990400922, + "grad_norm": 19.625, + "learning_rate": 1.907982485685416e-05, + "loss": 0.1405, + "step": 2930 + }, + { + "epoch": 0.09407096918695805, + "grad_norm": 27.0, + "learning_rate": 1.907308858201415e-05, + "loss": 0.176, + "step": 2940 + }, + { + "epoch": 0.09439093846990688, + "grad_norm": 12.1875, + "learning_rate": 1.9066352307174133e-05, + "loss": 0.1551, + "step": 2950 + }, + { + "epoch": 0.09471090775285572, + "grad_norm": 6.5625, + "learning_rate": 1.905961603233412e-05, + "loss": 0.1922, + "step": 2960 + }, + { + "epoch": 0.09503087703580457, + "grad_norm": 9.75, + "learning_rate": 1.905287975749411e-05, + "loss": 0.1658, + "step": 2970 + }, + { + "epoch": 0.0953508463187534, + "grad_norm": 2.59375, + "learning_rate": 1.9046143482654093e-05, + "loss": 0.1546, + "step": 2980 + }, + { + "epoch": 0.09567081560170224, + "grad_norm": 12.4375, + "learning_rate": 1.903940720781408e-05, + "loss": 0.1501, + "step": 2990 + }, + { + "epoch": 0.09599078488465107, + "grad_norm": 13.0, + "learning_rate": 1.903267093297407e-05, + "loss": 0.086, + "step": 3000 + }, + { + "epoch": 0.0963107541675999, + "grad_norm": 16.375, + "learning_rate": 1.9025934658134053e-05, + "loss": 0.135, + "step": 3010 + }, + { + "epoch": 0.09663072345054875, + "grad_norm": 7.40625, + "learning_rate": 1.901919838329404e-05, + "loss": 0.1558, + "step": 3020 + }, + { + "epoch": 0.09695069273349759, + "grad_norm": 14.5625, + "learning_rate": 1.9012462108454025e-05, + "loss": 0.1903, + "step": 3030 + }, + { + "epoch": 0.09727066201644642, + "grad_norm": 9.25, + "learning_rate": 1.9005725833614012e-05, + "loss": 0.1578, + "step": 3040 + }, + { + "epoch": 0.09759063129939526, + "grad_norm": 13.1875, + "learning_rate": 1.8998989558774e-05, + "loss": 0.164, + "step": 3050 + }, + { + "epoch": 0.09791060058234409, + "grad_norm": 16.5, + "learning_rate": 1.8992253283933985e-05, + "loss": 0.1594, + "step": 3060 + }, + { + "epoch": 0.09823056986529294, + "grad_norm": 28.125, + "learning_rate": 1.8985517009093972e-05, + "loss": 0.1359, + "step": 3070 + }, + { + "epoch": 0.09855053914824177, + "grad_norm": 10.375, + "learning_rate": 1.897878073425396e-05, + "loss": 0.1359, + "step": 3080 + }, + { + "epoch": 0.0988705084311906, + "grad_norm": 9.8125, + "learning_rate": 1.8972044459413944e-05, + "loss": 0.1366, + "step": 3090 + }, + { + "epoch": 0.09919047771413944, + "grad_norm": 5.21875, + "learning_rate": 1.8965308184573932e-05, + "loss": 0.1435, + "step": 3100 + }, + { + "epoch": 0.09951044699708828, + "grad_norm": 16.125, + "learning_rate": 1.895857190973392e-05, + "loss": 0.1572, + "step": 3110 + }, + { + "epoch": 0.09983041628003711, + "grad_norm": 3.828125, + "learning_rate": 1.8951835634893904e-05, + "loss": 0.0966, + "step": 3120 + }, + { + "epoch": 0.10015038556298596, + "grad_norm": 3.546875, + "learning_rate": 1.8945099360053892e-05, + "loss": 0.1203, + "step": 3130 + }, + { + "epoch": 0.10047035484593479, + "grad_norm": 24.625, + "learning_rate": 1.8938363085213876e-05, + "loss": 0.1025, + "step": 3140 + }, + { + "epoch": 0.10079032412888363, + "grad_norm": 13.0625, + "learning_rate": 1.8931626810373864e-05, + "loss": 0.1449, + "step": 3150 + }, + { + "epoch": 0.10111029341183246, + "grad_norm": 15.75, + "learning_rate": 1.8924890535533852e-05, + "loss": 0.1695, + "step": 3160 + }, + { + "epoch": 0.1014302626947813, + "grad_norm": 37.5, + "learning_rate": 1.8918154260693836e-05, + "loss": 0.1122, + "step": 3170 + }, + { + "epoch": 0.10175023197773014, + "grad_norm": 10.1875, + "learning_rate": 1.8911417985853824e-05, + "loss": 0.1035, + "step": 3180 + }, + { + "epoch": 0.10207020126067898, + "grad_norm": 20.5, + "learning_rate": 1.890468171101381e-05, + "loss": 0.1842, + "step": 3190 + }, + { + "epoch": 0.10239017054362781, + "grad_norm": 9.6875, + "learning_rate": 1.8897945436173796e-05, + "loss": 0.1156, + "step": 3200 + }, + { + "epoch": 0.10271013982657665, + "grad_norm": 6.03125, + "learning_rate": 1.8891209161333784e-05, + "loss": 0.1611, + "step": 3210 + }, + { + "epoch": 0.10303010910952548, + "grad_norm": 8.5, + "learning_rate": 1.888447288649377e-05, + "loss": 0.1497, + "step": 3220 + }, + { + "epoch": 0.10335007839247433, + "grad_norm": 13.125, + "learning_rate": 1.8877736611653756e-05, + "loss": 0.1232, + "step": 3230 + }, + { + "epoch": 0.10367004767542316, + "grad_norm": 21.25, + "learning_rate": 1.8871000336813743e-05, + "loss": 0.1139, + "step": 3240 + }, + { + "epoch": 0.103990016958372, + "grad_norm": 25.75, + "learning_rate": 1.8864264061973728e-05, + "loss": 0.1872, + "step": 3250 + }, + { + "epoch": 0.10430998624132083, + "grad_norm": 6.28125, + "learning_rate": 1.8857527787133716e-05, + "loss": 0.0994, + "step": 3260 + }, + { + "epoch": 0.10462995552426967, + "grad_norm": 28.0, + "learning_rate": 1.8850791512293703e-05, + "loss": 0.0856, + "step": 3270 + }, + { + "epoch": 0.1049499248072185, + "grad_norm": 13.8125, + "learning_rate": 1.8844055237453688e-05, + "loss": 0.1072, + "step": 3280 + }, + { + "epoch": 0.10526989409016735, + "grad_norm": 12.6875, + "learning_rate": 1.8837318962613675e-05, + "loss": 0.1836, + "step": 3290 + }, + { + "epoch": 0.10558986337311618, + "grad_norm": 16.75, + "learning_rate": 1.8830582687773663e-05, + "loss": 0.171, + "step": 3300 + }, + { + "epoch": 0.10590983265606502, + "grad_norm": 2.296875, + "learning_rate": 1.8823846412933647e-05, + "loss": 0.1256, + "step": 3310 + }, + { + "epoch": 0.10622980193901385, + "grad_norm": 9.6875, + "learning_rate": 1.8817110138093635e-05, + "loss": 0.071, + "step": 3320 + }, + { + "epoch": 0.10654977122196269, + "grad_norm": 22.0, + "learning_rate": 1.8810373863253623e-05, + "loss": 0.2003, + "step": 3330 + }, + { + "epoch": 0.10686974050491153, + "grad_norm": 10.0, + "learning_rate": 1.8803637588413607e-05, + "loss": 0.0824, + "step": 3340 + }, + { + "epoch": 0.10718970978786037, + "grad_norm": 9.5625, + "learning_rate": 1.8796901313573595e-05, + "loss": 0.1043, + "step": 3350 + }, + { + "epoch": 0.1075096790708092, + "grad_norm": 7.25, + "learning_rate": 1.879016503873358e-05, + "loss": 0.1802, + "step": 3360 + }, + { + "epoch": 0.10782964835375804, + "grad_norm": 4.8125, + "learning_rate": 1.8783428763893567e-05, + "loss": 0.1372, + "step": 3370 + }, + { + "epoch": 0.10814961763670687, + "grad_norm": 11.4375, + "learning_rate": 1.8776692489053555e-05, + "loss": 0.1097, + "step": 3380 + }, + { + "epoch": 0.10846958691965572, + "grad_norm": 15.3125, + "learning_rate": 1.876995621421354e-05, + "loss": 0.1801, + "step": 3390 + }, + { + "epoch": 0.10878955620260455, + "grad_norm": 12.375, + "learning_rate": 1.8763219939373527e-05, + "loss": 0.1607, + "step": 3400 + }, + { + "epoch": 0.10910952548555339, + "grad_norm": 11.8125, + "learning_rate": 1.8756483664533515e-05, + "loss": 0.1361, + "step": 3410 + }, + { + "epoch": 0.10942949476850222, + "grad_norm": 12.75, + "learning_rate": 1.87497473896935e-05, + "loss": 0.1432, + "step": 3420 + }, + { + "epoch": 0.10974946405145106, + "grad_norm": 18.75, + "learning_rate": 1.8743011114853487e-05, + "loss": 0.1197, + "step": 3430 + }, + { + "epoch": 0.11006943333439989, + "grad_norm": 12.0, + "learning_rate": 1.8736274840013474e-05, + "loss": 0.1405, + "step": 3440 + }, + { + "epoch": 0.11038940261734874, + "grad_norm": 6.59375, + "learning_rate": 1.872953856517346e-05, + "loss": 0.1271, + "step": 3450 + }, + { + "epoch": 0.11070937190029757, + "grad_norm": 27.125, + "learning_rate": 1.8722802290333447e-05, + "loss": 0.1116, + "step": 3460 + }, + { + "epoch": 0.11102934118324641, + "grad_norm": 7.46875, + "learning_rate": 1.8716066015493434e-05, + "loss": 0.1374, + "step": 3470 + }, + { + "epoch": 0.11134931046619524, + "grad_norm": 2.5, + "learning_rate": 1.870932974065342e-05, + "loss": 0.1483, + "step": 3480 + }, + { + "epoch": 0.11166927974914408, + "grad_norm": 5.53125, + "learning_rate": 1.8702593465813406e-05, + "loss": 0.1863, + "step": 3490 + }, + { + "epoch": 0.11198924903209292, + "grad_norm": 6.34375, + "learning_rate": 1.869585719097339e-05, + "loss": 0.1741, + "step": 3500 + }, + { + "epoch": 0.11230921831504176, + "grad_norm": 2.328125, + "learning_rate": 1.8689120916133382e-05, + "loss": 0.0774, + "step": 3510 + }, + { + "epoch": 0.1126291875979906, + "grad_norm": 3.859375, + "learning_rate": 1.8682384641293366e-05, + "loss": 0.1872, + "step": 3520 + }, + { + "epoch": 0.11294915688093943, + "grad_norm": 13.4375, + "learning_rate": 1.8675648366453354e-05, + "loss": 0.0897, + "step": 3530 + }, + { + "epoch": 0.11326912616388826, + "grad_norm": 12.25, + "learning_rate": 1.866891209161334e-05, + "loss": 0.1631, + "step": 3540 + }, + { + "epoch": 0.11358909544683711, + "grad_norm": 9.6875, + "learning_rate": 1.8662175816773326e-05, + "loss": 0.1896, + "step": 3550 + }, + { + "epoch": 0.11390906472978594, + "grad_norm": 7.90625, + "learning_rate": 1.8655439541933314e-05, + "loss": 0.1373, + "step": 3560 + }, + { + "epoch": 0.11422903401273478, + "grad_norm": 9.625, + "learning_rate": 1.86487032670933e-05, + "loss": 0.1341, + "step": 3570 + }, + { + "epoch": 0.11454900329568361, + "grad_norm": 12.25, + "learning_rate": 1.8641966992253286e-05, + "loss": 0.1644, + "step": 3580 + }, + { + "epoch": 0.11486897257863245, + "grad_norm": 1.1171875, + "learning_rate": 1.8635230717413274e-05, + "loss": 0.1105, + "step": 3590 + }, + { + "epoch": 0.11518894186158128, + "grad_norm": 5.71875, + "learning_rate": 1.8628494442573258e-05, + "loss": 0.0822, + "step": 3600 + }, + { + "epoch": 0.11550891114453013, + "grad_norm": 18.25, + "learning_rate": 1.8621758167733246e-05, + "loss": 0.1805, + "step": 3610 + }, + { + "epoch": 0.11582888042747896, + "grad_norm": 12.0, + "learning_rate": 1.8615021892893233e-05, + "loss": 0.1365, + "step": 3620 + }, + { + "epoch": 0.1161488497104278, + "grad_norm": 10.375, + "learning_rate": 1.8608285618053218e-05, + "loss": 0.1521, + "step": 3630 + }, + { + "epoch": 0.11646881899337663, + "grad_norm": 9.125, + "learning_rate": 1.8601549343213205e-05, + "loss": 0.175, + "step": 3640 + }, + { + "epoch": 0.11678878827632547, + "grad_norm": 9.25, + "learning_rate": 1.8594813068373193e-05, + "loss": 0.0963, + "step": 3650 + }, + { + "epoch": 0.11710875755927432, + "grad_norm": 16.75, + "learning_rate": 1.8588076793533178e-05, + "loss": 0.1028, + "step": 3660 + }, + { + "epoch": 0.11742872684222315, + "grad_norm": 4.25, + "learning_rate": 1.8581340518693165e-05, + "loss": 0.1375, + "step": 3670 + }, + { + "epoch": 0.11774869612517198, + "grad_norm": 6.6875, + "learning_rate": 1.8574604243853153e-05, + "loss": 0.1694, + "step": 3680 + }, + { + "epoch": 0.11806866540812082, + "grad_norm": 12.6875, + "learning_rate": 1.8567867969013137e-05, + "loss": 0.1823, + "step": 3690 + }, + { + "epoch": 0.11838863469106965, + "grad_norm": 11.0, + "learning_rate": 1.8561131694173125e-05, + "loss": 0.1314, + "step": 3700 + }, + { + "epoch": 0.1187086039740185, + "grad_norm": 12.375, + "learning_rate": 1.855439541933311e-05, + "loss": 0.105, + "step": 3710 + }, + { + "epoch": 0.11902857325696733, + "grad_norm": 0.9375, + "learning_rate": 1.8547659144493097e-05, + "loss": 0.1525, + "step": 3720 + }, + { + "epoch": 0.11934854253991617, + "grad_norm": 6.84375, + "learning_rate": 1.8540922869653085e-05, + "loss": 0.0756, + "step": 3730 + }, + { + "epoch": 0.119668511822865, + "grad_norm": 9.9375, + "learning_rate": 1.853418659481307e-05, + "loss": 0.1043, + "step": 3740 + }, + { + "epoch": 0.11998848110581384, + "grad_norm": 13.6875, + "learning_rate": 1.8527450319973057e-05, + "loss": 0.1945, + "step": 3750 + }, + { + "epoch": 0.12030845038876267, + "grad_norm": 16.125, + "learning_rate": 1.8520714045133045e-05, + "loss": 0.1316, + "step": 3760 + }, + { + "epoch": 0.12062841967171152, + "grad_norm": 13.875, + "learning_rate": 1.851397777029303e-05, + "loss": 0.1479, + "step": 3770 + }, + { + "epoch": 0.12094838895466035, + "grad_norm": 15.125, + "learning_rate": 1.8507241495453017e-05, + "loss": 0.1301, + "step": 3780 + }, + { + "epoch": 0.12126835823760919, + "grad_norm": 8.8125, + "learning_rate": 1.8500505220613005e-05, + "loss": 0.159, + "step": 3790 + }, + { + "epoch": 0.12158832752055802, + "grad_norm": 9.8125, + "learning_rate": 1.849376894577299e-05, + "loss": 0.1204, + "step": 3800 + }, + { + "epoch": 0.12190829680350686, + "grad_norm": 8.25, + "learning_rate": 1.8487032670932977e-05, + "loss": 0.1535, + "step": 3810 + }, + { + "epoch": 0.1222282660864557, + "grad_norm": 6.15625, + "learning_rate": 1.848029639609296e-05, + "loss": 0.1375, + "step": 3820 + }, + { + "epoch": 0.12254823536940454, + "grad_norm": 7.9375, + "learning_rate": 1.847356012125295e-05, + "loss": 0.147, + "step": 3830 + }, + { + "epoch": 0.12286820465235337, + "grad_norm": 29.375, + "learning_rate": 1.8466823846412936e-05, + "loss": 0.0999, + "step": 3840 + }, + { + "epoch": 0.12318817393530221, + "grad_norm": 5.96875, + "learning_rate": 1.846008757157292e-05, + "loss": 0.131, + "step": 3850 + }, + { + "epoch": 0.12350814321825104, + "grad_norm": 14.125, + "learning_rate": 1.845335129673291e-05, + "loss": 0.1024, + "step": 3860 + }, + { + "epoch": 0.12382811250119989, + "grad_norm": 6.09375, + "learning_rate": 1.8446615021892896e-05, + "loss": 0.1799, + "step": 3870 + }, + { + "epoch": 0.12414808178414873, + "grad_norm": 9.1875, + "learning_rate": 1.843987874705288e-05, + "loss": 0.145, + "step": 3880 + }, + { + "epoch": 0.12446805106709756, + "grad_norm": 23.875, + "learning_rate": 1.843314247221287e-05, + "loss": 0.1232, + "step": 3890 + }, + { + "epoch": 0.1247880203500464, + "grad_norm": 5.5, + "learning_rate": 1.8426406197372856e-05, + "loss": 0.1572, + "step": 3900 + }, + { + "epoch": 0.12510798963299524, + "grad_norm": 13.625, + "learning_rate": 1.841966992253284e-05, + "loss": 0.0954, + "step": 3910 + }, + { + "epoch": 0.12542795891594408, + "grad_norm": 18.375, + "learning_rate": 1.8412933647692828e-05, + "loss": 0.1774, + "step": 3920 + }, + { + "epoch": 0.1257479281988929, + "grad_norm": 7.03125, + "learning_rate": 1.8406197372852816e-05, + "loss": 0.1103, + "step": 3930 + }, + { + "epoch": 0.12606789748184175, + "grad_norm": 5.78125, + "learning_rate": 1.83994610980128e-05, + "loss": 0.1105, + "step": 3940 + }, + { + "epoch": 0.12638786676479058, + "grad_norm": 10.8125, + "learning_rate": 1.8392724823172788e-05, + "loss": 0.0947, + "step": 3950 + }, + { + "epoch": 0.12670783604773941, + "grad_norm": 24.75, + "learning_rate": 1.8385988548332772e-05, + "loss": 0.1479, + "step": 3960 + }, + { + "epoch": 0.12702780533068825, + "grad_norm": 9.4375, + "learning_rate": 1.837925227349276e-05, + "loss": 0.1065, + "step": 3970 + }, + { + "epoch": 0.12734777461363708, + "grad_norm": 9.4375, + "learning_rate": 1.8372515998652748e-05, + "loss": 0.1766, + "step": 3980 + }, + { + "epoch": 0.12766774389658592, + "grad_norm": 18.375, + "learning_rate": 1.8365779723812732e-05, + "loss": 0.0801, + "step": 3990 + }, + { + "epoch": 0.12798771317953475, + "grad_norm": 20.375, + "learning_rate": 1.835904344897272e-05, + "loss": 0.1089, + "step": 4000 + }, + { + "epoch": 0.1283076824624836, + "grad_norm": 26.5, + "learning_rate": 1.8352307174132708e-05, + "loss": 0.0943, + "step": 4010 + }, + { + "epoch": 0.12862765174543245, + "grad_norm": 33.75, + "learning_rate": 1.8345570899292692e-05, + "loss": 0.1652, + "step": 4020 + }, + { + "epoch": 0.12894762102838128, + "grad_norm": 17.0, + "learning_rate": 1.833883462445268e-05, + "loss": 0.1705, + "step": 4030 + }, + { + "epoch": 0.12926759031133012, + "grad_norm": 14.25, + "learning_rate": 1.8332098349612667e-05, + "loss": 0.1203, + "step": 4040 + }, + { + "epoch": 0.12958755959427895, + "grad_norm": 15.5, + "learning_rate": 1.8325362074772652e-05, + "loss": 0.0968, + "step": 4050 + }, + { + "epoch": 0.12990752887722778, + "grad_norm": 15.25, + "learning_rate": 1.831862579993264e-05, + "loss": 0.0765, + "step": 4060 + }, + { + "epoch": 0.13022749816017662, + "grad_norm": 20.625, + "learning_rate": 1.8311889525092624e-05, + "loss": 0.1665, + "step": 4070 + }, + { + "epoch": 0.13054746744312545, + "grad_norm": 12.375, + "learning_rate": 1.830515325025261e-05, + "loss": 0.1342, + "step": 4080 + }, + { + "epoch": 0.1308674367260743, + "grad_norm": 28.25, + "learning_rate": 1.82984169754126e-05, + "loss": 0.1102, + "step": 4090 + }, + { + "epoch": 0.13118740600902312, + "grad_norm": 7.78125, + "learning_rate": 1.8291680700572584e-05, + "loss": 0.1577, + "step": 4100 + }, + { + "epoch": 0.13150737529197198, + "grad_norm": 17.75, + "learning_rate": 1.828494442573257e-05, + "loss": 0.1468, + "step": 4110 + }, + { + "epoch": 0.13182734457492082, + "grad_norm": 15.9375, + "learning_rate": 1.827820815089256e-05, + "loss": 0.1475, + "step": 4120 + }, + { + "epoch": 0.13214731385786965, + "grad_norm": 10.875, + "learning_rate": 1.8271471876052544e-05, + "loss": 0.124, + "step": 4130 + }, + { + "epoch": 0.1324672831408185, + "grad_norm": 17.375, + "learning_rate": 1.826473560121253e-05, + "loss": 0.1515, + "step": 4140 + }, + { + "epoch": 0.13278725242376732, + "grad_norm": 6.9375, + "learning_rate": 1.825799932637252e-05, + "loss": 0.1272, + "step": 4150 + }, + { + "epoch": 0.13310722170671616, + "grad_norm": 12.6875, + "learning_rate": 1.8251263051532503e-05, + "loss": 0.109, + "step": 4160 + }, + { + "epoch": 0.133427190989665, + "grad_norm": 2.828125, + "learning_rate": 1.824452677669249e-05, + "loss": 0.1038, + "step": 4170 + }, + { + "epoch": 0.13374716027261382, + "grad_norm": 15.375, + "learning_rate": 1.8237790501852475e-05, + "loss": 0.135, + "step": 4180 + }, + { + "epoch": 0.13406712955556266, + "grad_norm": 15.5, + "learning_rate": 1.8231054227012463e-05, + "loss": 0.0941, + "step": 4190 + }, + { + "epoch": 0.1343870988385115, + "grad_norm": 1.4765625, + "learning_rate": 1.822431795217245e-05, + "loss": 0.1058, + "step": 4200 + }, + { + "epoch": 0.13470706812146033, + "grad_norm": 29.375, + "learning_rate": 1.8217581677332435e-05, + "loss": 0.133, + "step": 4210 + }, + { + "epoch": 0.1350270374044092, + "grad_norm": 18.0, + "learning_rate": 1.8210845402492423e-05, + "loss": 0.1107, + "step": 4220 + }, + { + "epoch": 0.13534700668735802, + "grad_norm": 6.78125, + "learning_rate": 1.820410912765241e-05, + "loss": 0.1734, + "step": 4230 + }, + { + "epoch": 0.13566697597030686, + "grad_norm": 3.9375, + "learning_rate": 1.8197372852812395e-05, + "loss": 0.1262, + "step": 4240 + }, + { + "epoch": 0.1359869452532557, + "grad_norm": 3.765625, + "learning_rate": 1.8190636577972383e-05, + "loss": 0.0814, + "step": 4250 + }, + { + "epoch": 0.13630691453620453, + "grad_norm": 14.5625, + "learning_rate": 1.818390030313237e-05, + "loss": 0.1544, + "step": 4260 + }, + { + "epoch": 0.13662688381915336, + "grad_norm": 4.90625, + "learning_rate": 1.8177164028292355e-05, + "loss": 0.0671, + "step": 4270 + }, + { + "epoch": 0.1369468531021022, + "grad_norm": 1.390625, + "learning_rate": 1.8170427753452343e-05, + "loss": 0.0982, + "step": 4280 + }, + { + "epoch": 0.13726682238505103, + "grad_norm": 10.8125, + "learning_rate": 1.8163691478612327e-05, + "loss": 0.1116, + "step": 4290 + }, + { + "epoch": 0.13758679166799986, + "grad_norm": 16.625, + "learning_rate": 1.8156955203772315e-05, + "loss": 0.0687, + "step": 4300 + }, + { + "epoch": 0.1379067609509487, + "grad_norm": 21.375, + "learning_rate": 1.8150218928932302e-05, + "loss": 0.1176, + "step": 4310 + }, + { + "epoch": 0.13822673023389753, + "grad_norm": 10.5, + "learning_rate": 1.8143482654092287e-05, + "loss": 0.149, + "step": 4320 + }, + { + "epoch": 0.1385466995168464, + "grad_norm": 1.1328125, + "learning_rate": 1.8136746379252274e-05, + "loss": 0.1887, + "step": 4330 + }, + { + "epoch": 0.13886666879979523, + "grad_norm": 11.1875, + "learning_rate": 1.8130010104412262e-05, + "loss": 0.1482, + "step": 4340 + }, + { + "epoch": 0.13918663808274406, + "grad_norm": 23.625, + "learning_rate": 1.8123273829572247e-05, + "loss": 0.0944, + "step": 4350 + }, + { + "epoch": 0.1395066073656929, + "grad_norm": 36.75, + "learning_rate": 1.8116537554732234e-05, + "loss": 0.1426, + "step": 4360 + }, + { + "epoch": 0.13982657664864173, + "grad_norm": 13.1875, + "learning_rate": 1.8109801279892222e-05, + "loss": 0.1638, + "step": 4370 + }, + { + "epoch": 0.14014654593159057, + "grad_norm": 9.375, + "learning_rate": 1.8103065005052206e-05, + "loss": 0.169, + "step": 4380 + }, + { + "epoch": 0.1404665152145394, + "grad_norm": 9.4375, + "learning_rate": 1.8096328730212194e-05, + "loss": 0.129, + "step": 4390 + }, + { + "epoch": 0.14078648449748823, + "grad_norm": 4.75, + "learning_rate": 1.8089592455372182e-05, + "loss": 0.1339, + "step": 4400 + }, + { + "epoch": 0.14110645378043707, + "grad_norm": 3.609375, + "learning_rate": 1.8082856180532166e-05, + "loss": 0.15, + "step": 4410 + }, + { + "epoch": 0.1414264230633859, + "grad_norm": 14.3125, + "learning_rate": 1.8076119905692154e-05, + "loss": 0.1241, + "step": 4420 + }, + { + "epoch": 0.14174639234633477, + "grad_norm": 10.9375, + "learning_rate": 1.8069383630852138e-05, + "loss": 0.1227, + "step": 4430 + }, + { + "epoch": 0.1420663616292836, + "grad_norm": 6.46875, + "learning_rate": 1.8062647356012126e-05, + "loss": 0.1585, + "step": 4440 + }, + { + "epoch": 0.14238633091223243, + "grad_norm": 11.8125, + "learning_rate": 1.8055911081172114e-05, + "loss": 0.1177, + "step": 4450 + }, + { + "epoch": 0.14270630019518127, + "grad_norm": 20.625, + "learning_rate": 1.8049174806332098e-05, + "loss": 0.1297, + "step": 4460 + }, + { + "epoch": 0.1430262694781301, + "grad_norm": 10.875, + "learning_rate": 1.8042438531492086e-05, + "loss": 0.169, + "step": 4470 + }, + { + "epoch": 0.14334623876107894, + "grad_norm": 0.90625, + "learning_rate": 1.8035702256652074e-05, + "loss": 0.179, + "step": 4480 + }, + { + "epoch": 0.14366620804402777, + "grad_norm": 10.125, + "learning_rate": 1.8028965981812058e-05, + "loss": 0.1445, + "step": 4490 + }, + { + "epoch": 0.1439861773269766, + "grad_norm": 12.4375, + "learning_rate": 1.8022229706972046e-05, + "loss": 0.0873, + "step": 4500 + }, + { + "epoch": 0.14430614660992544, + "grad_norm": 22.625, + "learning_rate": 1.8015493432132033e-05, + "loss": 0.1543, + "step": 4510 + }, + { + "epoch": 0.14462611589287427, + "grad_norm": 19.25, + "learning_rate": 1.8008757157292018e-05, + "loss": 0.1299, + "step": 4520 + }, + { + "epoch": 0.1449460851758231, + "grad_norm": 27.5, + "learning_rate": 1.8002020882452005e-05, + "loss": 0.1449, + "step": 4530 + }, + { + "epoch": 0.14526605445877197, + "grad_norm": 2.78125, + "learning_rate": 1.799528460761199e-05, + "loss": 0.0934, + "step": 4540 + }, + { + "epoch": 0.1455860237417208, + "grad_norm": 7.90625, + "learning_rate": 1.7988548332771978e-05, + "loss": 0.1668, + "step": 4550 + }, + { + "epoch": 0.14590599302466964, + "grad_norm": 5.125, + "learning_rate": 1.7981812057931965e-05, + "loss": 0.1796, + "step": 4560 + }, + { + "epoch": 0.14622596230761847, + "grad_norm": 31.125, + "learning_rate": 1.797507578309195e-05, + "loss": 0.1353, + "step": 4570 + }, + { + "epoch": 0.1465459315905673, + "grad_norm": 5.6875, + "learning_rate": 1.7968339508251937e-05, + "loss": 0.0967, + "step": 4580 + }, + { + "epoch": 0.14686590087351614, + "grad_norm": 4.65625, + "learning_rate": 1.7961603233411925e-05, + "loss": 0.1131, + "step": 4590 + }, + { + "epoch": 0.14718587015646498, + "grad_norm": 11.0, + "learning_rate": 1.795486695857191e-05, + "loss": 0.1127, + "step": 4600 + }, + { + "epoch": 0.1475058394394138, + "grad_norm": 22.875, + "learning_rate": 1.7948130683731897e-05, + "loss": 0.1579, + "step": 4610 + }, + { + "epoch": 0.14782580872236264, + "grad_norm": 0.42578125, + "learning_rate": 1.7941394408891885e-05, + "loss": 0.1329, + "step": 4620 + }, + { + "epoch": 0.14814577800531148, + "grad_norm": 10.625, + "learning_rate": 1.793465813405187e-05, + "loss": 0.1194, + "step": 4630 + }, + { + "epoch": 0.1484657472882603, + "grad_norm": 11.5625, + "learning_rate": 1.7927921859211857e-05, + "loss": 0.1486, + "step": 4640 + }, + { + "epoch": 0.14878571657120918, + "grad_norm": 11.25, + "learning_rate": 1.792118558437184e-05, + "loss": 0.1315, + "step": 4650 + }, + { + "epoch": 0.149105685854158, + "grad_norm": 23.625, + "learning_rate": 1.791444930953183e-05, + "loss": 0.2333, + "step": 4660 + }, + { + "epoch": 0.14942565513710684, + "grad_norm": 10.0625, + "learning_rate": 1.7907713034691817e-05, + "loss": 0.1039, + "step": 4670 + }, + { + "epoch": 0.14974562442005568, + "grad_norm": 9.1875, + "learning_rate": 1.79009767598518e-05, + "loss": 0.1562, + "step": 4680 + }, + { + "epoch": 0.1500655937030045, + "grad_norm": 6.4375, + "learning_rate": 1.789424048501179e-05, + "loss": 0.1116, + "step": 4690 + }, + { + "epoch": 0.15038556298595335, + "grad_norm": 5.0625, + "learning_rate": 1.7887504210171777e-05, + "loss": 0.1069, + "step": 4700 + }, + { + "epoch": 0.15070553226890218, + "grad_norm": 1.6953125, + "learning_rate": 1.788076793533176e-05, + "loss": 0.0816, + "step": 4710 + }, + { + "epoch": 0.15102550155185102, + "grad_norm": 13.5625, + "learning_rate": 1.787403166049175e-05, + "loss": 0.1824, + "step": 4720 + }, + { + "epoch": 0.15134547083479985, + "grad_norm": 10.3125, + "learning_rate": 1.7867295385651736e-05, + "loss": 0.1502, + "step": 4730 + }, + { + "epoch": 0.15166544011774868, + "grad_norm": 1.2109375, + "learning_rate": 1.786055911081172e-05, + "loss": 0.1234, + "step": 4740 + }, + { + "epoch": 0.15198540940069755, + "grad_norm": 11.75, + "learning_rate": 1.785382283597171e-05, + "loss": 0.1784, + "step": 4750 + }, + { + "epoch": 0.15230537868364638, + "grad_norm": 0.74609375, + "learning_rate": 1.7847086561131693e-05, + "loss": 0.0886, + "step": 4760 + }, + { + "epoch": 0.15262534796659522, + "grad_norm": 10.6875, + "learning_rate": 1.784035028629168e-05, + "loss": 0.1783, + "step": 4770 + }, + { + "epoch": 0.15294531724954405, + "grad_norm": 10.4375, + "learning_rate": 1.783361401145167e-05, + "loss": 0.1132, + "step": 4780 + }, + { + "epoch": 0.15326528653249288, + "grad_norm": 24.0, + "learning_rate": 1.7826877736611653e-05, + "loss": 0.1719, + "step": 4790 + }, + { + "epoch": 0.15358525581544172, + "grad_norm": 9.0625, + "learning_rate": 1.7820141461771644e-05, + "loss": 0.1314, + "step": 4800 + }, + { + "epoch": 0.15390522509839055, + "grad_norm": 1.6015625, + "learning_rate": 1.7813405186931628e-05, + "loss": 0.0683, + "step": 4810 + }, + { + "epoch": 0.1542251943813394, + "grad_norm": 7.5, + "learning_rate": 1.7806668912091616e-05, + "loss": 0.1135, + "step": 4820 + }, + { + "epoch": 0.15454516366428822, + "grad_norm": 18.125, + "learning_rate": 1.7799932637251604e-05, + "loss": 0.226, + "step": 4830 + }, + { + "epoch": 0.15486513294723706, + "grad_norm": 0.57421875, + "learning_rate": 1.7793196362411588e-05, + "loss": 0.096, + "step": 4840 + }, + { + "epoch": 0.1551851022301859, + "grad_norm": 34.5, + "learning_rate": 1.7786460087571576e-05, + "loss": 0.1353, + "step": 4850 + }, + { + "epoch": 0.15550507151313475, + "grad_norm": 9.5625, + "learning_rate": 1.7779723812731563e-05, + "loss": 0.1998, + "step": 4860 + }, + { + "epoch": 0.15582504079608359, + "grad_norm": 7.53125, + "learning_rate": 1.7772987537891548e-05, + "loss": 0.1153, + "step": 4870 + }, + { + "epoch": 0.15614501007903242, + "grad_norm": 3.328125, + "learning_rate": 1.7766251263051536e-05, + "loss": 0.1819, + "step": 4880 + }, + { + "epoch": 0.15646497936198125, + "grad_norm": 4.5625, + "learning_rate": 1.775951498821152e-05, + "loss": 0.1191, + "step": 4890 + }, + { + "epoch": 0.1567849486449301, + "grad_norm": 40.5, + "learning_rate": 1.7752778713371508e-05, + "loss": 0.1025, + "step": 4900 + }, + { + "epoch": 0.15710491792787892, + "grad_norm": 1.328125, + "learning_rate": 1.7746042438531495e-05, + "loss": 0.0941, + "step": 4910 + }, + { + "epoch": 0.15742488721082776, + "grad_norm": 11.3125, + "learning_rate": 1.773930616369148e-05, + "loss": 0.096, + "step": 4920 + }, + { + "epoch": 0.1577448564937766, + "grad_norm": 7.40625, + "learning_rate": 1.7732569888851467e-05, + "loss": 0.2236, + "step": 4930 + }, + { + "epoch": 0.15806482577672543, + "grad_norm": 5.90625, + "learning_rate": 1.7725833614011455e-05, + "loss": 0.1338, + "step": 4940 + }, + { + "epoch": 0.15838479505967426, + "grad_norm": 8.875, + "learning_rate": 1.771909733917144e-05, + "loss": 0.1438, + "step": 4950 + }, + { + "epoch": 0.1587047643426231, + "grad_norm": 8.75, + "learning_rate": 1.7712361064331427e-05, + "loss": 0.0856, + "step": 4960 + }, + { + "epoch": 0.15902473362557196, + "grad_norm": 2.609375, + "learning_rate": 1.7705624789491415e-05, + "loss": 0.1023, + "step": 4970 + }, + { + "epoch": 0.1593447029085208, + "grad_norm": 7.96875, + "learning_rate": 1.76988885146514e-05, + "loss": 0.1126, + "step": 4980 + }, + { + "epoch": 0.15966467219146963, + "grad_norm": 9.75, + "learning_rate": 1.7692152239811387e-05, + "loss": 0.1572, + "step": 4990 + }, + { + "epoch": 0.15998464147441846, + "grad_norm": 9.0, + "learning_rate": 1.768541596497137e-05, + "loss": 0.0723, + "step": 5000 + }, + { + "epoch": 0.1603046107573673, + "grad_norm": 10.375, + "learning_rate": 1.767867969013136e-05, + "loss": 0.1359, + "step": 5010 + }, + { + "epoch": 0.16062458004031613, + "grad_norm": 0.546875, + "learning_rate": 1.7671943415291347e-05, + "loss": 0.1012, + "step": 5020 + }, + { + "epoch": 0.16094454932326496, + "grad_norm": 10.1875, + "learning_rate": 1.766520714045133e-05, + "loss": 0.1514, + "step": 5030 + }, + { + "epoch": 0.1612645186062138, + "grad_norm": 7.78125, + "learning_rate": 1.765847086561132e-05, + "loss": 0.0964, + "step": 5040 + }, + { + "epoch": 0.16158448788916263, + "grad_norm": 1.703125, + "learning_rate": 1.7651734590771307e-05, + "loss": 0.1257, + "step": 5050 + }, + { + "epoch": 0.16190445717211147, + "grad_norm": 0.734375, + "learning_rate": 1.764499831593129e-05, + "loss": 0.0897, + "step": 5060 + }, + { + "epoch": 0.16222442645506033, + "grad_norm": 7.375, + "learning_rate": 1.763826204109128e-05, + "loss": 0.1221, + "step": 5070 + }, + { + "epoch": 0.16254439573800916, + "grad_norm": 3.640625, + "learning_rate": 1.7631525766251267e-05, + "loss": 0.1566, + "step": 5080 + }, + { + "epoch": 0.162864365020958, + "grad_norm": 5.125, + "learning_rate": 1.762478949141125e-05, + "loss": 0.1231, + "step": 5090 + }, + { + "epoch": 0.16318433430390683, + "grad_norm": 3.421875, + "learning_rate": 1.761805321657124e-05, + "loss": 0.1446, + "step": 5100 + }, + { + "epoch": 0.16350430358685566, + "grad_norm": 6.34375, + "learning_rate": 1.7611316941731223e-05, + "loss": 0.132, + "step": 5110 + }, + { + "epoch": 0.1638242728698045, + "grad_norm": 5.53125, + "learning_rate": 1.760458066689121e-05, + "loss": 0.1764, + "step": 5120 + }, + { + "epoch": 0.16414424215275333, + "grad_norm": 11.6875, + "learning_rate": 1.75978443920512e-05, + "loss": 0.1182, + "step": 5130 + }, + { + "epoch": 0.16446421143570217, + "grad_norm": 2.875, + "learning_rate": 1.7591108117211183e-05, + "loss": 0.1019, + "step": 5140 + }, + { + "epoch": 0.164784180718651, + "grad_norm": 9.3125, + "learning_rate": 1.758437184237117e-05, + "loss": 0.0536, + "step": 5150 + }, + { + "epoch": 0.16510415000159984, + "grad_norm": 11.9375, + "learning_rate": 1.7577635567531158e-05, + "loss": 0.1366, + "step": 5160 + }, + { + "epoch": 0.16542411928454867, + "grad_norm": 11.125, + "learning_rate": 1.7570899292691143e-05, + "loss": 0.0998, + "step": 5170 + }, + { + "epoch": 0.16574408856749753, + "grad_norm": 4.03125, + "learning_rate": 1.756416301785113e-05, + "loss": 0.1519, + "step": 5180 + }, + { + "epoch": 0.16606405785044637, + "grad_norm": 13.8125, + "learning_rate": 1.7557426743011118e-05, + "loss": 0.1814, + "step": 5190 + }, + { + "epoch": 0.1663840271333952, + "grad_norm": 10.8125, + "learning_rate": 1.7550690468171102e-05, + "loss": 0.1636, + "step": 5200 + }, + { + "epoch": 0.16670399641634404, + "grad_norm": 2.078125, + "learning_rate": 1.754395419333109e-05, + "loss": 0.0829, + "step": 5210 + }, + { + "epoch": 0.16702396569929287, + "grad_norm": 4.96875, + "learning_rate": 1.7537217918491078e-05, + "loss": 0.0843, + "step": 5220 + }, + { + "epoch": 0.1673439349822417, + "grad_norm": 0.51171875, + "learning_rate": 1.7530481643651062e-05, + "loss": 0.1332, + "step": 5230 + }, + { + "epoch": 0.16766390426519054, + "grad_norm": 10.875, + "learning_rate": 1.752374536881105e-05, + "loss": 0.1808, + "step": 5240 + }, + { + "epoch": 0.16798387354813937, + "grad_norm": 13.6875, + "learning_rate": 1.7517009093971034e-05, + "loss": 0.0635, + "step": 5250 + }, + { + "epoch": 0.1683038428310882, + "grad_norm": 2.0, + "learning_rate": 1.7510272819131022e-05, + "loss": 0.1045, + "step": 5260 + }, + { + "epoch": 0.16862381211403704, + "grad_norm": 15.25, + "learning_rate": 1.750353654429101e-05, + "loss": 0.1292, + "step": 5270 + }, + { + "epoch": 0.16894378139698588, + "grad_norm": 4.25, + "learning_rate": 1.7496800269450994e-05, + "loss": 0.1462, + "step": 5280 + }, + { + "epoch": 0.16926375067993474, + "grad_norm": 4.78125, + "learning_rate": 1.7490063994610982e-05, + "loss": 0.0763, + "step": 5290 + }, + { + "epoch": 0.16958371996288357, + "grad_norm": 18.625, + "learning_rate": 1.748332771977097e-05, + "loss": 0.1251, + "step": 5300 + }, + { + "epoch": 0.1699036892458324, + "grad_norm": 22.25, + "learning_rate": 1.7476591444930954e-05, + "loss": 0.084, + "step": 5310 + }, + { + "epoch": 0.17022365852878124, + "grad_norm": 9.9375, + "learning_rate": 1.7469855170090942e-05, + "loss": 0.1101, + "step": 5320 + }, + { + "epoch": 0.17054362781173008, + "grad_norm": 16.625, + "learning_rate": 1.746311889525093e-05, + "loss": 0.1222, + "step": 5330 + }, + { + "epoch": 0.1708635970946789, + "grad_norm": 14.4375, + "learning_rate": 1.7456382620410914e-05, + "loss": 0.1125, + "step": 5340 + }, + { + "epoch": 0.17118356637762774, + "grad_norm": 5.8125, + "learning_rate": 1.74496463455709e-05, + "loss": 0.1216, + "step": 5350 + }, + { + "epoch": 0.17150353566057658, + "grad_norm": 25.5, + "learning_rate": 1.7442910070730886e-05, + "loss": 0.0724, + "step": 5360 + }, + { + "epoch": 0.1718235049435254, + "grad_norm": 23.875, + "learning_rate": 1.7436173795890874e-05, + "loss": 0.0847, + "step": 5370 + }, + { + "epoch": 0.17214347422647425, + "grad_norm": 0.3125, + "learning_rate": 1.742943752105086e-05, + "loss": 0.1212, + "step": 5380 + }, + { + "epoch": 0.1724634435094231, + "grad_norm": 21.125, + "learning_rate": 1.7422701246210846e-05, + "loss": 0.1834, + "step": 5390 + }, + { + "epoch": 0.17278341279237194, + "grad_norm": 8.25, + "learning_rate": 1.7415964971370833e-05, + "loss": 0.1676, + "step": 5400 + }, + { + "epoch": 0.17310338207532078, + "grad_norm": 1.7265625, + "learning_rate": 1.740922869653082e-05, + "loss": 0.107, + "step": 5410 + }, + { + "epoch": 0.1734233513582696, + "grad_norm": 7.5625, + "learning_rate": 1.7402492421690806e-05, + "loss": 0.118, + "step": 5420 + }, + { + "epoch": 0.17374332064121845, + "grad_norm": 7.15625, + "learning_rate": 1.7395756146850793e-05, + "loss": 0.1029, + "step": 5430 + }, + { + "epoch": 0.17406328992416728, + "grad_norm": 25.25, + "learning_rate": 1.738901987201078e-05, + "loss": 0.1236, + "step": 5440 + }, + { + "epoch": 0.17438325920711611, + "grad_norm": 1.21875, + "learning_rate": 1.7382283597170765e-05, + "loss": 0.1597, + "step": 5450 + }, + { + "epoch": 0.17470322849006495, + "grad_norm": 6.71875, + "learning_rate": 1.7375547322330753e-05, + "loss": 0.1246, + "step": 5460 + }, + { + "epoch": 0.17502319777301378, + "grad_norm": 1.4453125, + "learning_rate": 1.7368811047490737e-05, + "loss": 0.0773, + "step": 5470 + }, + { + "epoch": 0.17534316705596262, + "grad_norm": 5.21875, + "learning_rate": 1.7362074772650725e-05, + "loss": 0.1136, + "step": 5480 + }, + { + "epoch": 0.17566313633891145, + "grad_norm": 12.375, + "learning_rate": 1.7355338497810713e-05, + "loss": 0.0981, + "step": 5490 + }, + { + "epoch": 0.17598310562186031, + "grad_norm": 12.125, + "learning_rate": 1.7348602222970697e-05, + "loss": 0.1362, + "step": 5500 + }, + { + "epoch": 0.17630307490480915, + "grad_norm": 3.34375, + "learning_rate": 1.7341865948130685e-05, + "loss": 0.1623, + "step": 5510 + }, + { + "epoch": 0.17662304418775798, + "grad_norm": 4.15625, + "learning_rate": 1.7335129673290673e-05, + "loss": 0.1106, + "step": 5520 + }, + { + "epoch": 0.17694301347070682, + "grad_norm": 7.34375, + "learning_rate": 1.7328393398450657e-05, + "loss": 0.1518, + "step": 5530 + }, + { + "epoch": 0.17726298275365565, + "grad_norm": 1.46875, + "learning_rate": 1.7321657123610645e-05, + "loss": 0.1014, + "step": 5540 + }, + { + "epoch": 0.17758295203660449, + "grad_norm": 3.859375, + "learning_rate": 1.7314920848770633e-05, + "loss": 0.0938, + "step": 5550 + }, + { + "epoch": 0.17790292131955332, + "grad_norm": 26.375, + "learning_rate": 1.7308184573930617e-05, + "loss": 0.0974, + "step": 5560 + }, + { + "epoch": 0.17822289060250215, + "grad_norm": 7.0625, + "learning_rate": 1.7301448299090605e-05, + "loss": 0.105, + "step": 5570 + }, + { + "epoch": 0.178542859885451, + "grad_norm": 8.75, + "learning_rate": 1.729471202425059e-05, + "loss": 0.1591, + "step": 5580 + }, + { + "epoch": 0.17886282916839982, + "grad_norm": 9.125, + "learning_rate": 1.7287975749410577e-05, + "loss": 0.204, + "step": 5590 + }, + { + "epoch": 0.17918279845134866, + "grad_norm": 18.375, + "learning_rate": 1.7281239474570564e-05, + "loss": 0.1568, + "step": 5600 + }, + { + "epoch": 0.17950276773429752, + "grad_norm": 4.21875, + "learning_rate": 1.727450319973055e-05, + "loss": 0.172, + "step": 5610 + }, + { + "epoch": 0.17982273701724635, + "grad_norm": 12.125, + "learning_rate": 1.7267766924890537e-05, + "loss": 0.095, + "step": 5620 + }, + { + "epoch": 0.1801427063001952, + "grad_norm": 7.21875, + "learning_rate": 1.7261030650050524e-05, + "loss": 0.1384, + "step": 5630 + }, + { + "epoch": 0.18046267558314402, + "grad_norm": 3.109375, + "learning_rate": 1.725429437521051e-05, + "loss": 0.0906, + "step": 5640 + }, + { + "epoch": 0.18078264486609286, + "grad_norm": 2.75, + "learning_rate": 1.7247558100370496e-05, + "loss": 0.17, + "step": 5650 + }, + { + "epoch": 0.1811026141490417, + "grad_norm": 14.5, + "learning_rate": 1.7240821825530484e-05, + "loss": 0.1198, + "step": 5660 + }, + { + "epoch": 0.18142258343199053, + "grad_norm": 1.90625, + "learning_rate": 1.723408555069047e-05, + "loss": 0.1267, + "step": 5670 + }, + { + "epoch": 0.18174255271493936, + "grad_norm": 2.3125, + "learning_rate": 1.7227349275850456e-05, + "loss": 0.0867, + "step": 5680 + }, + { + "epoch": 0.1820625219978882, + "grad_norm": 4.75, + "learning_rate": 1.7220613001010444e-05, + "loss": 0.1107, + "step": 5690 + }, + { + "epoch": 0.18238249128083703, + "grad_norm": 71.5, + "learning_rate": 1.7213876726170428e-05, + "loss": 0.1062, + "step": 5700 + }, + { + "epoch": 0.1827024605637859, + "grad_norm": 12.5625, + "learning_rate": 1.7207140451330416e-05, + "loss": 0.1304, + "step": 5710 + }, + { + "epoch": 0.18302242984673472, + "grad_norm": 27.75, + "learning_rate": 1.72004041764904e-05, + "loss": 0.1224, + "step": 5720 + }, + { + "epoch": 0.18334239912968356, + "grad_norm": 20.375, + "learning_rate": 1.7193667901650388e-05, + "loss": 0.1224, + "step": 5730 + }, + { + "epoch": 0.1836623684126324, + "grad_norm": 6.53125, + "learning_rate": 1.7186931626810376e-05, + "loss": 0.1302, + "step": 5740 + }, + { + "epoch": 0.18398233769558123, + "grad_norm": 1.6640625, + "learning_rate": 1.718019535197036e-05, + "loss": 0.104, + "step": 5750 + }, + { + "epoch": 0.18430230697853006, + "grad_norm": 5.78125, + "learning_rate": 1.7173459077130348e-05, + "loss": 0.1589, + "step": 5760 + }, + { + "epoch": 0.1846222762614789, + "grad_norm": 13.625, + "learning_rate": 1.7166722802290336e-05, + "loss": 0.1316, + "step": 5770 + }, + { + "epoch": 0.18494224554442773, + "grad_norm": 7.28125, + "learning_rate": 1.715998652745032e-05, + "loss": 0.1645, + "step": 5780 + }, + { + "epoch": 0.18526221482737656, + "grad_norm": 9.0625, + "learning_rate": 1.7153250252610308e-05, + "loss": 0.1323, + "step": 5790 + }, + { + "epoch": 0.1855821841103254, + "grad_norm": 6.875, + "learning_rate": 1.7146513977770295e-05, + "loss": 0.0834, + "step": 5800 + }, + { + "epoch": 0.18590215339327423, + "grad_norm": 0.49609375, + "learning_rate": 1.713977770293028e-05, + "loss": 0.075, + "step": 5810 + }, + { + "epoch": 0.1862221226762231, + "grad_norm": 1.3515625, + "learning_rate": 1.7133041428090268e-05, + "loss": 0.122, + "step": 5820 + }, + { + "epoch": 0.18654209195917193, + "grad_norm": 2.046875, + "learning_rate": 1.7126305153250252e-05, + "loss": 0.0694, + "step": 5830 + }, + { + "epoch": 0.18686206124212076, + "grad_norm": 11.9375, + "learning_rate": 1.711956887841024e-05, + "loss": 0.1691, + "step": 5840 + }, + { + "epoch": 0.1871820305250696, + "grad_norm": 6.625, + "learning_rate": 1.7112832603570227e-05, + "loss": 0.1055, + "step": 5850 + }, + { + "epoch": 0.18750199980801843, + "grad_norm": 5.1875, + "learning_rate": 1.710609632873021e-05, + "loss": 0.0935, + "step": 5860 + }, + { + "epoch": 0.18782196909096727, + "grad_norm": 23.75, + "learning_rate": 1.70993600538902e-05, + "loss": 0.1484, + "step": 5870 + }, + { + "epoch": 0.1881419383739161, + "grad_norm": 9.8125, + "learning_rate": 1.7092623779050187e-05, + "loss": 0.1102, + "step": 5880 + }, + { + "epoch": 0.18846190765686494, + "grad_norm": 9.75, + "learning_rate": 1.708588750421017e-05, + "loss": 0.1075, + "step": 5890 + }, + { + "epoch": 0.18878187693981377, + "grad_norm": 9.6875, + "learning_rate": 1.707915122937016e-05, + "loss": 0.1185, + "step": 5900 + }, + { + "epoch": 0.1891018462227626, + "grad_norm": 1.6015625, + "learning_rate": 1.7072414954530147e-05, + "loss": 0.1335, + "step": 5910 + }, + { + "epoch": 0.18942181550571144, + "grad_norm": 5.375, + "learning_rate": 1.706567867969013e-05, + "loss": 0.0948, + "step": 5920 + }, + { + "epoch": 0.1897417847886603, + "grad_norm": 13.75, + "learning_rate": 1.705894240485012e-05, + "loss": 0.1593, + "step": 5930 + }, + { + "epoch": 0.19006175407160913, + "grad_norm": 17.625, + "learning_rate": 1.7052206130010103e-05, + "loss": 0.1549, + "step": 5940 + }, + { + "epoch": 0.19038172335455797, + "grad_norm": 5.0, + "learning_rate": 1.704546985517009e-05, + "loss": 0.1825, + "step": 5950 + }, + { + "epoch": 0.1907016926375068, + "grad_norm": 4.28125, + "learning_rate": 1.703873358033008e-05, + "loss": 0.1993, + "step": 5960 + }, + { + "epoch": 0.19102166192045564, + "grad_norm": 10.625, + "learning_rate": 1.7031997305490063e-05, + "loss": 0.1825, + "step": 5970 + }, + { + "epoch": 0.19134163120340447, + "grad_norm": 7.375, + "learning_rate": 1.702526103065005e-05, + "loss": 0.1268, + "step": 5980 + }, + { + "epoch": 0.1916616004863533, + "grad_norm": 5.59375, + "learning_rate": 1.701852475581004e-05, + "loss": 0.0864, + "step": 5990 + }, + { + "epoch": 0.19198156976930214, + "grad_norm": 6.5625, + "learning_rate": 1.7011788480970023e-05, + "loss": 0.1339, + "step": 6000 + }, + { + "epoch": 0.19230153905225097, + "grad_norm": 24.625, + "learning_rate": 1.700505220613001e-05, + "loss": 0.1293, + "step": 6010 + }, + { + "epoch": 0.1926215083351998, + "grad_norm": 8.8125, + "learning_rate": 1.699831593129e-05, + "loss": 0.1562, + "step": 6020 + }, + { + "epoch": 0.19294147761814867, + "grad_norm": 4.40625, + "learning_rate": 1.6991579656449983e-05, + "loss": 0.1033, + "step": 6030 + }, + { + "epoch": 0.1932614469010975, + "grad_norm": 9.5, + "learning_rate": 1.698484338160997e-05, + "loss": 0.2078, + "step": 6040 + }, + { + "epoch": 0.19358141618404634, + "grad_norm": 11.375, + "learning_rate": 1.6978107106769955e-05, + "loss": 0.1351, + "step": 6050 + }, + { + "epoch": 0.19390138546699517, + "grad_norm": 13.6875, + "learning_rate": 1.6971370831929943e-05, + "loss": 0.119, + "step": 6060 + }, + { + "epoch": 0.194221354749944, + "grad_norm": 20.75, + "learning_rate": 1.696463455708993e-05, + "loss": 0.1564, + "step": 6070 + }, + { + "epoch": 0.19454132403289284, + "grad_norm": 11.4375, + "learning_rate": 1.6957898282249915e-05, + "loss": 0.1463, + "step": 6080 + }, + { + "epoch": 0.19486129331584168, + "grad_norm": 4.0625, + "learning_rate": 1.6951162007409906e-05, + "loss": 0.0718, + "step": 6090 + }, + { + "epoch": 0.1951812625987905, + "grad_norm": 1.125, + "learning_rate": 1.694442573256989e-05, + "loss": 0.0924, + "step": 6100 + }, + { + "epoch": 0.19550123188173935, + "grad_norm": 8.4375, + "learning_rate": 1.6937689457729878e-05, + "loss": 0.1603, + "step": 6110 + }, + { + "epoch": 0.19582120116468818, + "grad_norm": 23.25, + "learning_rate": 1.6930953182889866e-05, + "loss": 0.0743, + "step": 6120 + }, + { + "epoch": 0.19614117044763701, + "grad_norm": 10.4375, + "learning_rate": 1.692421690804985e-05, + "loss": 0.0738, + "step": 6130 + }, + { + "epoch": 0.19646113973058588, + "grad_norm": 39.5, + "learning_rate": 1.6917480633209838e-05, + "loss": 0.1504, + "step": 6140 + }, + { + "epoch": 0.1967811090135347, + "grad_norm": 5.71875, + "learning_rate": 1.6910744358369826e-05, + "loss": 0.1192, + "step": 6150 + }, + { + "epoch": 0.19710107829648355, + "grad_norm": 8.375, + "learning_rate": 1.690400808352981e-05, + "loss": 0.1748, + "step": 6160 + }, + { + "epoch": 0.19742104757943238, + "grad_norm": 12.625, + "learning_rate": 1.6897271808689798e-05, + "loss": 0.0997, + "step": 6170 + }, + { + "epoch": 0.1977410168623812, + "grad_norm": 12.6875, + "learning_rate": 1.6890535533849782e-05, + "loss": 0.1513, + "step": 6180 + }, + { + "epoch": 0.19806098614533005, + "grad_norm": 5.1875, + "learning_rate": 1.688379925900977e-05, + "loss": 0.1493, + "step": 6190 + }, + { + "epoch": 0.19838095542827888, + "grad_norm": 2.5, + "learning_rate": 1.6877062984169757e-05, + "loss": 0.0716, + "step": 6200 + }, + { + "epoch": 0.19870092471122772, + "grad_norm": 9.0625, + "learning_rate": 1.6870326709329742e-05, + "loss": 0.1744, + "step": 6210 + }, + { + "epoch": 0.19902089399417655, + "grad_norm": 10.0625, + "learning_rate": 1.686359043448973e-05, + "loss": 0.0735, + "step": 6220 + }, + { + "epoch": 0.19934086327712539, + "grad_norm": 4.5, + "learning_rate": 1.6856854159649717e-05, + "loss": 0.1173, + "step": 6230 + }, + { + "epoch": 0.19966083256007422, + "grad_norm": 1.859375, + "learning_rate": 1.68501178848097e-05, + "loss": 0.0549, + "step": 6240 + }, + { + "epoch": 0.19998080184302308, + "grad_norm": 4.375, + "learning_rate": 1.684338160996969e-05, + "loss": 0.0844, + "step": 6250 + }, + { + "epoch": 0.20030077112597192, + "grad_norm": 13.625, + "learning_rate": 1.6836645335129677e-05, + "loss": 0.1093, + "step": 6260 + }, + { + "epoch": 0.20062074040892075, + "grad_norm": 6.40625, + "learning_rate": 1.682990906028966e-05, + "loss": 0.1535, + "step": 6270 + }, + { + "epoch": 0.20094070969186958, + "grad_norm": 10.0625, + "learning_rate": 1.682317278544965e-05, + "loss": 0.1913, + "step": 6280 + }, + { + "epoch": 0.20126067897481842, + "grad_norm": 8.375, + "learning_rate": 1.6816436510609633e-05, + "loss": 0.1162, + "step": 6290 + }, + { + "epoch": 0.20158064825776725, + "grad_norm": 8.8125, + "learning_rate": 1.680970023576962e-05, + "loss": 0.1074, + "step": 6300 + }, + { + "epoch": 0.2019006175407161, + "grad_norm": 15.625, + "learning_rate": 1.680296396092961e-05, + "loss": 0.0973, + "step": 6310 + }, + { + "epoch": 0.20222058682366492, + "grad_norm": 23.125, + "learning_rate": 1.6796227686089593e-05, + "loss": 0.1102, + "step": 6320 + }, + { + "epoch": 0.20254055610661376, + "grad_norm": 5.3125, + "learning_rate": 1.678949141124958e-05, + "loss": 0.1171, + "step": 6330 + }, + { + "epoch": 0.2028605253895626, + "grad_norm": 6.34375, + "learning_rate": 1.678275513640957e-05, + "loss": 0.1504, + "step": 6340 + }, + { + "epoch": 0.20318049467251145, + "grad_norm": 4.0625, + "learning_rate": 1.6776018861569553e-05, + "loss": 0.0638, + "step": 6350 + }, + { + "epoch": 0.2035004639554603, + "grad_norm": 7.625, + "learning_rate": 1.676928258672954e-05, + "loss": 0.1289, + "step": 6360 + }, + { + "epoch": 0.20382043323840912, + "grad_norm": 9.4375, + "learning_rate": 1.676254631188953e-05, + "loss": 0.1183, + "step": 6370 + }, + { + "epoch": 0.20414040252135796, + "grad_norm": 9.5625, + "learning_rate": 1.6755810037049513e-05, + "loss": 0.124, + "step": 6380 + }, + { + "epoch": 0.2044603718043068, + "grad_norm": 16.25, + "learning_rate": 1.67490737622095e-05, + "loss": 0.1014, + "step": 6390 + }, + { + "epoch": 0.20478034108725562, + "grad_norm": 2.65625, + "learning_rate": 1.6742337487369485e-05, + "loss": 0.1125, + "step": 6400 + }, + { + "epoch": 0.20510031037020446, + "grad_norm": 8.8125, + "learning_rate": 1.6735601212529473e-05, + "loss": 0.1733, + "step": 6410 + }, + { + "epoch": 0.2054202796531533, + "grad_norm": 11.9375, + "learning_rate": 1.672886493768946e-05, + "loss": 0.1378, + "step": 6420 + }, + { + "epoch": 0.20574024893610213, + "grad_norm": 6.03125, + "learning_rate": 1.6722128662849445e-05, + "loss": 0.1147, + "step": 6430 + }, + { + "epoch": 0.20606021821905096, + "grad_norm": 19.5, + "learning_rate": 1.6715392388009433e-05, + "loss": 0.0789, + "step": 6440 + }, + { + "epoch": 0.2063801875019998, + "grad_norm": 6.0625, + "learning_rate": 1.670865611316942e-05, + "loss": 0.1286, + "step": 6450 + }, + { + "epoch": 0.20670015678494866, + "grad_norm": 21.125, + "learning_rate": 1.6701919838329405e-05, + "loss": 0.0632, + "step": 6460 + }, + { + "epoch": 0.2070201260678975, + "grad_norm": 16.125, + "learning_rate": 1.6695183563489392e-05, + "loss": 0.1671, + "step": 6470 + }, + { + "epoch": 0.20734009535084633, + "grad_norm": 2.453125, + "learning_rate": 1.668844728864938e-05, + "loss": 0.1409, + "step": 6480 + }, + { + "epoch": 0.20766006463379516, + "grad_norm": 7.1875, + "learning_rate": 1.6681711013809364e-05, + "loss": 0.1769, + "step": 6490 + }, + { + "epoch": 0.207980033916744, + "grad_norm": 1.6640625, + "learning_rate": 1.6674974738969352e-05, + "loss": 0.1058, + "step": 6500 + }, + { + "epoch": 0.20830000319969283, + "grad_norm": 13.75, + "learning_rate": 1.6668238464129337e-05, + "loss": 0.1337, + "step": 6510 + }, + { + "epoch": 0.20861997248264166, + "grad_norm": 17.125, + "learning_rate": 1.6661502189289324e-05, + "loss": 0.131, + "step": 6520 + }, + { + "epoch": 0.2089399417655905, + "grad_norm": 14.3125, + "learning_rate": 1.6654765914449312e-05, + "loss": 0.0857, + "step": 6530 + }, + { + "epoch": 0.20925991104853933, + "grad_norm": 16.0, + "learning_rate": 1.6648029639609296e-05, + "loss": 0.1194, + "step": 6540 + }, + { + "epoch": 0.20957988033148817, + "grad_norm": 7.0, + "learning_rate": 1.6641293364769284e-05, + "loss": 0.1127, + "step": 6550 + }, + { + "epoch": 0.209899849614437, + "grad_norm": 10.125, + "learning_rate": 1.6634557089929272e-05, + "loss": 0.1289, + "step": 6560 + }, + { + "epoch": 0.21021981889738586, + "grad_norm": 8.8125, + "learning_rate": 1.6627820815089256e-05, + "loss": 0.09, + "step": 6570 + }, + { + "epoch": 0.2105397881803347, + "grad_norm": 9.9375, + "learning_rate": 1.6621084540249244e-05, + "loss": 0.0909, + "step": 6580 + }, + { + "epoch": 0.21085975746328353, + "grad_norm": 23.75, + "learning_rate": 1.661434826540923e-05, + "loss": 0.1099, + "step": 6590 + }, + { + "epoch": 0.21117972674623237, + "grad_norm": 4.84375, + "learning_rate": 1.6607611990569216e-05, + "loss": 0.1152, + "step": 6600 + }, + { + "epoch": 0.2114996960291812, + "grad_norm": 12.5625, + "learning_rate": 1.6600875715729204e-05, + "loss": 0.164, + "step": 6610 + }, + { + "epoch": 0.21181966531213003, + "grad_norm": 7.5625, + "learning_rate": 1.659413944088919e-05, + "loss": 0.14, + "step": 6620 + }, + { + "epoch": 0.21213963459507887, + "grad_norm": 13.75, + "learning_rate": 1.6587403166049176e-05, + "loss": 0.1046, + "step": 6630 + }, + { + "epoch": 0.2124596038780277, + "grad_norm": 13.75, + "learning_rate": 1.6580666891209164e-05, + "loss": 0.0868, + "step": 6640 + }, + { + "epoch": 0.21277957316097654, + "grad_norm": 5.5625, + "learning_rate": 1.6573930616369148e-05, + "loss": 0.2311, + "step": 6650 + }, + { + "epoch": 0.21309954244392537, + "grad_norm": 15.375, + "learning_rate": 1.6567194341529136e-05, + "loss": 0.1174, + "step": 6660 + }, + { + "epoch": 0.21341951172687423, + "grad_norm": 1.1875, + "learning_rate": 1.6560458066689123e-05, + "loss": 0.0757, + "step": 6670 + }, + { + "epoch": 0.21373948100982307, + "grad_norm": 5.5625, + "learning_rate": 1.6553721791849108e-05, + "loss": 0.1379, + "step": 6680 + }, + { + "epoch": 0.2140594502927719, + "grad_norm": 7.25, + "learning_rate": 1.6546985517009095e-05, + "loss": 0.1872, + "step": 6690 + }, + { + "epoch": 0.21437941957572074, + "grad_norm": 3.78125, + "learning_rate": 1.6540249242169083e-05, + "loss": 0.115, + "step": 6700 + }, + { + "epoch": 0.21469938885866957, + "grad_norm": 12.375, + "learning_rate": 1.6533512967329068e-05, + "loss": 0.1369, + "step": 6710 + }, + { + "epoch": 0.2150193581416184, + "grad_norm": 7.09375, + "learning_rate": 1.6526776692489055e-05, + "loss": 0.1265, + "step": 6720 + }, + { + "epoch": 0.21533932742456724, + "grad_norm": 10.3125, + "learning_rate": 1.6520040417649043e-05, + "loss": 0.0941, + "step": 6730 + }, + { + "epoch": 0.21565929670751607, + "grad_norm": 3.6875, + "learning_rate": 1.6513304142809027e-05, + "loss": 0.153, + "step": 6740 + }, + { + "epoch": 0.2159792659904649, + "grad_norm": 9.9375, + "learning_rate": 1.6506567867969015e-05, + "loss": 0.0875, + "step": 6750 + }, + { + "epoch": 0.21629923527341374, + "grad_norm": 0.443359375, + "learning_rate": 1.6499831593129e-05, + "loss": 0.0803, + "step": 6760 + }, + { + "epoch": 0.21661920455636258, + "grad_norm": 3.65625, + "learning_rate": 1.6493095318288987e-05, + "loss": 0.0814, + "step": 6770 + }, + { + "epoch": 0.21693917383931144, + "grad_norm": 27.0, + "learning_rate": 1.6486359043448975e-05, + "loss": 0.1629, + "step": 6780 + }, + { + "epoch": 0.21725914312226027, + "grad_norm": 6.25, + "learning_rate": 1.647962276860896e-05, + "loss": 0.1529, + "step": 6790 + }, + { + "epoch": 0.2175791124052091, + "grad_norm": 8.6875, + "learning_rate": 1.6472886493768947e-05, + "loss": 0.1408, + "step": 6800 + }, + { + "epoch": 0.21789908168815794, + "grad_norm": 1.1328125, + "learning_rate": 1.6466150218928935e-05, + "loss": 0.1601, + "step": 6810 + }, + { + "epoch": 0.21821905097110678, + "grad_norm": 9.1875, + "learning_rate": 1.645941394408892e-05, + "loss": 0.1226, + "step": 6820 + }, + { + "epoch": 0.2185390202540556, + "grad_norm": 5.28125, + "learning_rate": 1.6452677669248907e-05, + "loss": 0.0844, + "step": 6830 + }, + { + "epoch": 0.21885898953700444, + "grad_norm": 6.96875, + "learning_rate": 1.6445941394408895e-05, + "loss": 0.248, + "step": 6840 + }, + { + "epoch": 0.21917895881995328, + "grad_norm": 12.1875, + "learning_rate": 1.643920511956888e-05, + "loss": 0.0982, + "step": 6850 + }, + { + "epoch": 0.2194989281029021, + "grad_norm": 13.0, + "learning_rate": 1.6432468844728867e-05, + "loss": 0.117, + "step": 6860 + }, + { + "epoch": 0.21981889738585095, + "grad_norm": 7.90625, + "learning_rate": 1.642573256988885e-05, + "loss": 0.1546, + "step": 6870 + }, + { + "epoch": 0.22013886666879978, + "grad_norm": 15.3125, + "learning_rate": 1.641899629504884e-05, + "loss": 0.1001, + "step": 6880 + }, + { + "epoch": 0.22045883595174864, + "grad_norm": 1.5390625, + "learning_rate": 1.6412260020208826e-05, + "loss": 0.1154, + "step": 6890 + }, + { + "epoch": 0.22077880523469748, + "grad_norm": 17.0, + "learning_rate": 1.640552374536881e-05, + "loss": 0.2123, + "step": 6900 + }, + { + "epoch": 0.2210987745176463, + "grad_norm": 8.875, + "learning_rate": 1.63987874705288e-05, + "loss": 0.1526, + "step": 6910 + }, + { + "epoch": 0.22141874380059515, + "grad_norm": 3.421875, + "learning_rate": 1.6392051195688786e-05, + "loss": 0.1026, + "step": 6920 + }, + { + "epoch": 0.22173871308354398, + "grad_norm": 9.8125, + "learning_rate": 1.638531492084877e-05, + "loss": 0.11, + "step": 6930 + }, + { + "epoch": 0.22205868236649282, + "grad_norm": 18.0, + "learning_rate": 1.637857864600876e-05, + "loss": 0.1496, + "step": 6940 + }, + { + "epoch": 0.22237865164944165, + "grad_norm": 9.25, + "learning_rate": 1.6371842371168746e-05, + "loss": 0.1105, + "step": 6950 + }, + { + "epoch": 0.22269862093239048, + "grad_norm": 15.875, + "learning_rate": 1.636510609632873e-05, + "loss": 0.1363, + "step": 6960 + }, + { + "epoch": 0.22301859021533932, + "grad_norm": 9.6875, + "learning_rate": 1.6358369821488718e-05, + "loss": 0.0606, + "step": 6970 + }, + { + "epoch": 0.22333855949828815, + "grad_norm": 23.75, + "learning_rate": 1.6351633546648703e-05, + "loss": 0.0749, + "step": 6980 + }, + { + "epoch": 0.22365852878123701, + "grad_norm": 22.375, + "learning_rate": 1.634489727180869e-05, + "loss": 0.1165, + "step": 6990 + }, + { + "epoch": 0.22397849806418585, + "grad_norm": 11.8125, + "learning_rate": 1.6338160996968678e-05, + "loss": 0.1138, + "step": 7000 + }, + { + "epoch": 0.22429846734713468, + "grad_norm": 9.6875, + "learning_rate": 1.6331424722128662e-05, + "loss": 0.1439, + "step": 7010 + }, + { + "epoch": 0.22461843663008352, + "grad_norm": 11.9375, + "learning_rate": 1.632468844728865e-05, + "loss": 0.0958, + "step": 7020 + }, + { + "epoch": 0.22493840591303235, + "grad_norm": 6.375, + "learning_rate": 1.6317952172448638e-05, + "loss": 0.2063, + "step": 7030 + }, + { + "epoch": 0.2252583751959812, + "grad_norm": 5.28125, + "learning_rate": 1.6311215897608622e-05, + "loss": 0.1988, + "step": 7040 + }, + { + "epoch": 0.22557834447893002, + "grad_norm": 2.09375, + "learning_rate": 1.630447962276861e-05, + "loss": 0.1764, + "step": 7050 + }, + { + "epoch": 0.22589831376187886, + "grad_norm": 10.1875, + "learning_rate": 1.6297743347928598e-05, + "loss": 0.133, + "step": 7060 + }, + { + "epoch": 0.2262182830448277, + "grad_norm": 4.71875, + "learning_rate": 1.6291007073088582e-05, + "loss": 0.1223, + "step": 7070 + }, + { + "epoch": 0.22653825232777652, + "grad_norm": 4.5625, + "learning_rate": 1.628427079824857e-05, + "loss": 0.1131, + "step": 7080 + }, + { + "epoch": 0.22685822161072536, + "grad_norm": 8.625, + "learning_rate": 1.6277534523408557e-05, + "loss": 0.1424, + "step": 7090 + }, + { + "epoch": 0.22717819089367422, + "grad_norm": 11.375, + "learning_rate": 1.6270798248568542e-05, + "loss": 0.0843, + "step": 7100 + }, + { + "epoch": 0.22749816017662305, + "grad_norm": 2.0625, + "learning_rate": 1.626406197372853e-05, + "loss": 0.1195, + "step": 7110 + }, + { + "epoch": 0.2278181294595719, + "grad_norm": 7.96875, + "learning_rate": 1.6257325698888514e-05, + "loss": 0.1178, + "step": 7120 + }, + { + "epoch": 0.22813809874252072, + "grad_norm": 9.3125, + "learning_rate": 1.62505894240485e-05, + "loss": 0.0507, + "step": 7130 + }, + { + "epoch": 0.22845806802546956, + "grad_norm": 10.25, + "learning_rate": 1.624385314920849e-05, + "loss": 0.1338, + "step": 7140 + }, + { + "epoch": 0.2287780373084184, + "grad_norm": 8.6875, + "learning_rate": 1.6237116874368474e-05, + "loss": 0.1333, + "step": 7150 + }, + { + "epoch": 0.22909800659136723, + "grad_norm": 6.875, + "learning_rate": 1.623038059952846e-05, + "loss": 0.1183, + "step": 7160 + }, + { + "epoch": 0.22941797587431606, + "grad_norm": 9.375, + "learning_rate": 1.622364432468845e-05, + "loss": 0.1329, + "step": 7170 + }, + { + "epoch": 0.2297379451572649, + "grad_norm": 12.375, + "learning_rate": 1.6216908049848433e-05, + "loss": 0.1709, + "step": 7180 + }, + { + "epoch": 0.23005791444021373, + "grad_norm": 9.1875, + "learning_rate": 1.621017177500842e-05, + "loss": 0.1209, + "step": 7190 + }, + { + "epoch": 0.23037788372316256, + "grad_norm": 15.8125, + "learning_rate": 1.620343550016841e-05, + "loss": 0.1131, + "step": 7200 + }, + { + "epoch": 0.23069785300611143, + "grad_norm": 13.125, + "learning_rate": 1.6196699225328393e-05, + "loss": 0.1538, + "step": 7210 + }, + { + "epoch": 0.23101782228906026, + "grad_norm": 4.46875, + "learning_rate": 1.618996295048838e-05, + "loss": 0.1317, + "step": 7220 + }, + { + "epoch": 0.2313377915720091, + "grad_norm": 4.78125, + "learning_rate": 1.6183226675648365e-05, + "loss": 0.1416, + "step": 7230 + }, + { + "epoch": 0.23165776085495793, + "grad_norm": 16.375, + "learning_rate": 1.6176490400808353e-05, + "loss": 0.0833, + "step": 7240 + }, + { + "epoch": 0.23197773013790676, + "grad_norm": 10.0625, + "learning_rate": 1.616975412596834e-05, + "loss": 0.0988, + "step": 7250 + }, + { + "epoch": 0.2322976994208556, + "grad_norm": 10.8125, + "learning_rate": 1.6163017851128325e-05, + "loss": 0.137, + "step": 7260 + }, + { + "epoch": 0.23261766870380443, + "grad_norm": 15.5, + "learning_rate": 1.6156281576288313e-05, + "loss": 0.1676, + "step": 7270 + }, + { + "epoch": 0.23293763798675327, + "grad_norm": 20.875, + "learning_rate": 1.61495453014483e-05, + "loss": 0.1127, + "step": 7280 + }, + { + "epoch": 0.2332576072697021, + "grad_norm": 4.4375, + "learning_rate": 1.6142809026608285e-05, + "loss": 0.111, + "step": 7290 + }, + { + "epoch": 0.23357757655265093, + "grad_norm": 14.6875, + "learning_rate": 1.6136072751768273e-05, + "loss": 0.1173, + "step": 7300 + }, + { + "epoch": 0.2338975458355998, + "grad_norm": 0.61328125, + "learning_rate": 1.612933647692826e-05, + "loss": 0.0615, + "step": 7310 + }, + { + "epoch": 0.23421751511854863, + "grad_norm": 5.09375, + "learning_rate": 1.6122600202088245e-05, + "loss": 0.1078, + "step": 7320 + }, + { + "epoch": 0.23453748440149746, + "grad_norm": 4.46875, + "learning_rate": 1.6115863927248233e-05, + "loss": 0.0745, + "step": 7330 + }, + { + "epoch": 0.2348574536844463, + "grad_norm": 4.40625, + "learning_rate": 1.6109127652408217e-05, + "loss": 0.1212, + "step": 7340 + }, + { + "epoch": 0.23517742296739513, + "grad_norm": 2.0625, + "learning_rate": 1.6102391377568205e-05, + "loss": 0.0647, + "step": 7350 + }, + { + "epoch": 0.23549739225034397, + "grad_norm": 17.0, + "learning_rate": 1.6095655102728192e-05, + "loss": 0.2942, + "step": 7360 + }, + { + "epoch": 0.2358173615332928, + "grad_norm": 6.125, + "learning_rate": 1.6088918827888177e-05, + "loss": 0.1488, + "step": 7370 + }, + { + "epoch": 0.23613733081624164, + "grad_norm": 12.0, + "learning_rate": 1.6082182553048168e-05, + "loss": 0.0842, + "step": 7380 + }, + { + "epoch": 0.23645730009919047, + "grad_norm": 11.25, + "learning_rate": 1.6075446278208152e-05, + "loss": 0.1328, + "step": 7390 + }, + { + "epoch": 0.2367772693821393, + "grad_norm": 14.3125, + "learning_rate": 1.606871000336814e-05, + "loss": 0.1691, + "step": 7400 + }, + { + "epoch": 0.23709723866508814, + "grad_norm": 1.390625, + "learning_rate": 1.6061973728528128e-05, + "loss": 0.1522, + "step": 7410 + }, + { + "epoch": 0.237417207948037, + "grad_norm": 10.625, + "learning_rate": 1.6055237453688112e-05, + "loss": 0.075, + "step": 7420 + }, + { + "epoch": 0.23773717723098584, + "grad_norm": 0.625, + "learning_rate": 1.60485011788481e-05, + "loss": 0.0629, + "step": 7430 + }, + { + "epoch": 0.23805714651393467, + "grad_norm": 27.375, + "learning_rate": 1.6041764904008088e-05, + "loss": 0.0742, + "step": 7440 + }, + { + "epoch": 0.2383771157968835, + "grad_norm": 17.75, + "learning_rate": 1.6035028629168072e-05, + "loss": 0.1081, + "step": 7450 + }, + { + "epoch": 0.23869708507983234, + "grad_norm": 15.6875, + "learning_rate": 1.602829235432806e-05, + "loss": 0.2006, + "step": 7460 + }, + { + "epoch": 0.23901705436278117, + "grad_norm": 0.423828125, + "learning_rate": 1.6021556079488044e-05, + "loss": 0.0896, + "step": 7470 + }, + { + "epoch": 0.23933702364573, + "grad_norm": 2.59375, + "learning_rate": 1.601481980464803e-05, + "loss": 0.0886, + "step": 7480 + }, + { + "epoch": 0.23965699292867884, + "grad_norm": 41.5, + "learning_rate": 1.600808352980802e-05, + "loss": 0.1366, + "step": 7490 + }, + { + "epoch": 0.23997696221162768, + "grad_norm": 7.0, + "learning_rate": 1.6001347254968004e-05, + "loss": 0.0731, + "step": 7500 + }, + { + "epoch": 0.2402969314945765, + "grad_norm": 0.349609375, + "learning_rate": 1.599461098012799e-05, + "loss": 0.1803, + "step": 7510 + }, + { + "epoch": 0.24061690077752534, + "grad_norm": 17.125, + "learning_rate": 1.598787470528798e-05, + "loss": 0.1282, + "step": 7520 + }, + { + "epoch": 0.2409368700604742, + "grad_norm": 12.0625, + "learning_rate": 1.5981138430447964e-05, + "loss": 0.1225, + "step": 7530 + }, + { + "epoch": 0.24125683934342304, + "grad_norm": 11.75, + "learning_rate": 1.597440215560795e-05, + "loss": 0.1015, + "step": 7540 + }, + { + "epoch": 0.24157680862637188, + "grad_norm": 9.375, + "learning_rate": 1.596766588076794e-05, + "loss": 0.0981, + "step": 7550 + }, + { + "epoch": 0.2418967779093207, + "grad_norm": 11.6875, + "learning_rate": 1.5960929605927923e-05, + "loss": 0.1125, + "step": 7560 + }, + { + "epoch": 0.24221674719226954, + "grad_norm": 11.25, + "learning_rate": 1.595419333108791e-05, + "loss": 0.171, + "step": 7570 + }, + { + "epoch": 0.24253671647521838, + "grad_norm": 14.1875, + "learning_rate": 1.5947457056247895e-05, + "loss": 0.0962, + "step": 7580 + }, + { + "epoch": 0.2428566857581672, + "grad_norm": 1.78125, + "learning_rate": 1.5940720781407883e-05, + "loss": 0.0968, + "step": 7590 + }, + { + "epoch": 0.24317665504111605, + "grad_norm": 17.625, + "learning_rate": 1.593398450656787e-05, + "loss": 0.0923, + "step": 7600 + }, + { + "epoch": 0.24349662432406488, + "grad_norm": 5.0625, + "learning_rate": 1.5927248231727855e-05, + "loss": 0.1091, + "step": 7610 + }, + { + "epoch": 0.24381659360701372, + "grad_norm": 20.875, + "learning_rate": 1.5920511956887843e-05, + "loss": 0.149, + "step": 7620 + }, + { + "epoch": 0.24413656288996258, + "grad_norm": 11.3125, + "learning_rate": 1.591377568204783e-05, + "loss": 0.1362, + "step": 7630 + }, + { + "epoch": 0.2444565321729114, + "grad_norm": 16.875, + "learning_rate": 1.5907039407207815e-05, + "loss": 0.172, + "step": 7640 + }, + { + "epoch": 0.24477650145586025, + "grad_norm": 11.25, + "learning_rate": 1.5900303132367803e-05, + "loss": 0.1634, + "step": 7650 + }, + { + "epoch": 0.24509647073880908, + "grad_norm": 10.4375, + "learning_rate": 1.589356685752779e-05, + "loss": 0.0898, + "step": 7660 + }, + { + "epoch": 0.24541644002175791, + "grad_norm": 15.0625, + "learning_rate": 1.5886830582687775e-05, + "loss": 0.2014, + "step": 7670 + }, + { + "epoch": 0.24573640930470675, + "grad_norm": 9.4375, + "learning_rate": 1.5880094307847763e-05, + "loss": 0.1195, + "step": 7680 + }, + { + "epoch": 0.24605637858765558, + "grad_norm": 3.09375, + "learning_rate": 1.5873358033007747e-05, + "loss": 0.0837, + "step": 7690 + }, + { + "epoch": 0.24637634787060442, + "grad_norm": 10.0625, + "learning_rate": 1.5866621758167735e-05, + "loss": 0.1372, + "step": 7700 + }, + { + "epoch": 0.24669631715355325, + "grad_norm": 25.375, + "learning_rate": 1.5859885483327722e-05, + "loss": 0.15, + "step": 7710 + }, + { + "epoch": 0.24701628643650209, + "grad_norm": 8.3125, + "learning_rate": 1.5853149208487707e-05, + "loss": 0.1025, + "step": 7720 + }, + { + "epoch": 0.24733625571945092, + "grad_norm": 4.71875, + "learning_rate": 1.5846412933647695e-05, + "loss": 0.132, + "step": 7730 + }, + { + "epoch": 0.24765622500239978, + "grad_norm": 5.65625, + "learning_rate": 1.5839676658807682e-05, + "loss": 0.1472, + "step": 7740 + }, + { + "epoch": 0.24797619428534862, + "grad_norm": 10.6875, + "learning_rate": 1.5832940383967667e-05, + "loss": 0.1148, + "step": 7750 + }, + { + "epoch": 0.24829616356829745, + "grad_norm": 9.75, + "learning_rate": 1.5826204109127654e-05, + "loss": 0.1518, + "step": 7760 + }, + { + "epoch": 0.24861613285124629, + "grad_norm": 4.4375, + "learning_rate": 1.5819467834287642e-05, + "loss": 0.1138, + "step": 7770 + }, + { + "epoch": 0.24893610213419512, + "grad_norm": 18.125, + "learning_rate": 1.5812731559447626e-05, + "loss": 0.1483, + "step": 7780 + }, + { + "epoch": 0.24925607141714395, + "grad_norm": 10.0625, + "learning_rate": 1.5805995284607614e-05, + "loss": 0.0812, + "step": 7790 + }, + { + "epoch": 0.2495760407000928, + "grad_norm": 2.484375, + "learning_rate": 1.57992590097676e-05, + "loss": 0.0918, + "step": 7800 + }, + { + "epoch": 0.24989600998304162, + "grad_norm": 12.5625, + "learning_rate": 1.5792522734927586e-05, + "loss": 0.1659, + "step": 7810 + }, + { + "epoch": 0.2502159792659905, + "grad_norm": 1.5546875, + "learning_rate": 1.5785786460087574e-05, + "loss": 0.1116, + "step": 7820 + }, + { + "epoch": 0.2505359485489393, + "grad_norm": 20.375, + "learning_rate": 1.577905018524756e-05, + "loss": 0.1521, + "step": 7830 + }, + { + "epoch": 0.25085591783188815, + "grad_norm": 20.5, + "learning_rate": 1.5772313910407546e-05, + "loss": 0.1637, + "step": 7840 + }, + { + "epoch": 0.25117588711483696, + "grad_norm": 3.796875, + "learning_rate": 1.5765577635567534e-05, + "loss": 0.111, + "step": 7850 + }, + { + "epoch": 0.2514958563977858, + "grad_norm": 19.25, + "learning_rate": 1.5758841360727518e-05, + "loss": 0.1933, + "step": 7860 + }, + { + "epoch": 0.25181582568073463, + "grad_norm": 10.5, + "learning_rate": 1.5752105085887506e-05, + "loss": 0.1028, + "step": 7870 + }, + { + "epoch": 0.2521357949636835, + "grad_norm": 6.8125, + "learning_rate": 1.5745368811047494e-05, + "loss": 0.1171, + "step": 7880 + }, + { + "epoch": 0.2524557642466323, + "grad_norm": 9.8125, + "learning_rate": 1.5738632536207478e-05, + "loss": 0.1498, + "step": 7890 + }, + { + "epoch": 0.25277573352958116, + "grad_norm": 9.125, + "learning_rate": 1.5731896261367466e-05, + "loss": 0.1366, + "step": 7900 + }, + { + "epoch": 0.25309570281253, + "grad_norm": 8.0625, + "learning_rate": 1.5725159986527453e-05, + "loss": 0.1279, + "step": 7910 + }, + { + "epoch": 0.25341567209547883, + "grad_norm": 9.125, + "learning_rate": 1.5718423711687438e-05, + "loss": 0.0726, + "step": 7920 + }, + { + "epoch": 0.2537356413784277, + "grad_norm": 13.1875, + "learning_rate": 1.5711687436847426e-05, + "loss": 0.1327, + "step": 7930 + }, + { + "epoch": 0.2540556106613765, + "grad_norm": 2.296875, + "learning_rate": 1.570495116200741e-05, + "loss": 0.1418, + "step": 7940 + }, + { + "epoch": 0.25437557994432536, + "grad_norm": 20.125, + "learning_rate": 1.5698214887167398e-05, + "loss": 0.1674, + "step": 7950 + }, + { + "epoch": 0.25469554922727417, + "grad_norm": 22.0, + "learning_rate": 1.5691478612327385e-05, + "loss": 0.0886, + "step": 7960 + }, + { + "epoch": 0.255015518510223, + "grad_norm": 8.625, + "learning_rate": 1.568474233748737e-05, + "loss": 0.1446, + "step": 7970 + }, + { + "epoch": 0.25533548779317183, + "grad_norm": 7.875, + "learning_rate": 1.5678006062647357e-05, + "loss": 0.1127, + "step": 7980 + }, + { + "epoch": 0.2556554570761207, + "grad_norm": 5.875, + "learning_rate": 1.5671269787807345e-05, + "loss": 0.1043, + "step": 7990 + }, + { + "epoch": 0.2559754263590695, + "grad_norm": 30.5, + "learning_rate": 1.566453351296733e-05, + "loss": 0.1512, + "step": 8000 + }, + { + "epoch": 0.25629539564201836, + "grad_norm": 17.125, + "learning_rate": 1.5657797238127317e-05, + "loss": 0.1103, + "step": 8010 + }, + { + "epoch": 0.2566153649249672, + "grad_norm": 2.609375, + "learning_rate": 1.5651060963287305e-05, + "loss": 0.1067, + "step": 8020 + }, + { + "epoch": 0.25693533420791603, + "grad_norm": 1.0859375, + "learning_rate": 1.564432468844729e-05, + "loss": 0.075, + "step": 8030 + }, + { + "epoch": 0.2572553034908649, + "grad_norm": 6.46875, + "learning_rate": 1.5637588413607277e-05, + "loss": 0.0878, + "step": 8040 + }, + { + "epoch": 0.2575752727738137, + "grad_norm": 0.66796875, + "learning_rate": 1.563085213876726e-05, + "loss": 0.1468, + "step": 8050 + }, + { + "epoch": 0.25789524205676256, + "grad_norm": 0.482421875, + "learning_rate": 1.562411586392725e-05, + "loss": 0.1102, + "step": 8060 + }, + { + "epoch": 0.25821521133971137, + "grad_norm": 12.4375, + "learning_rate": 1.5617379589087237e-05, + "loss": 0.118, + "step": 8070 + }, + { + "epoch": 0.25853518062266023, + "grad_norm": 16.125, + "learning_rate": 1.561064331424722e-05, + "loss": 0.1772, + "step": 8080 + }, + { + "epoch": 0.25885514990560904, + "grad_norm": 7.65625, + "learning_rate": 1.560390703940721e-05, + "loss": 0.0989, + "step": 8090 + }, + { + "epoch": 0.2591751191885579, + "grad_norm": 21.375, + "learning_rate": 1.5597170764567197e-05, + "loss": 0.1573, + "step": 8100 + }, + { + "epoch": 0.25949508847150676, + "grad_norm": 4.6875, + "learning_rate": 1.559043448972718e-05, + "loss": 0.1529, + "step": 8110 + }, + { + "epoch": 0.25981505775445557, + "grad_norm": 7.125, + "learning_rate": 1.558369821488717e-05, + "loss": 0.1163, + "step": 8120 + }, + { + "epoch": 0.26013502703740443, + "grad_norm": 15.125, + "learning_rate": 1.5576961940047157e-05, + "loss": 0.1446, + "step": 8130 + }, + { + "epoch": 0.26045499632035324, + "grad_norm": 9.5, + "learning_rate": 1.557022566520714e-05, + "loss": 0.1785, + "step": 8140 + }, + { + "epoch": 0.2607749656033021, + "grad_norm": 11.125, + "learning_rate": 1.556348939036713e-05, + "loss": 0.1158, + "step": 8150 + }, + { + "epoch": 0.2610949348862509, + "grad_norm": 12.125, + "learning_rate": 1.5556753115527113e-05, + "loss": 0.1184, + "step": 8160 + }, + { + "epoch": 0.26141490416919977, + "grad_norm": 4.21875, + "learning_rate": 1.55500168406871e-05, + "loss": 0.0967, + "step": 8170 + }, + { + "epoch": 0.2617348734521486, + "grad_norm": 13.6875, + "learning_rate": 1.554328056584709e-05, + "loss": 0.055, + "step": 8180 + }, + { + "epoch": 0.26205484273509744, + "grad_norm": 13.5, + "learning_rate": 1.5536544291007073e-05, + "loss": 0.082, + "step": 8190 + }, + { + "epoch": 0.26237481201804624, + "grad_norm": 12.5, + "learning_rate": 1.552980801616706e-05, + "loss": 0.1091, + "step": 8200 + }, + { + "epoch": 0.2626947813009951, + "grad_norm": 12.3125, + "learning_rate": 1.5523071741327048e-05, + "loss": 0.112, + "step": 8210 + }, + { + "epoch": 0.26301475058394397, + "grad_norm": 8.125, + "learning_rate": 1.5516335466487033e-05, + "loss": 0.1267, + "step": 8220 + }, + { + "epoch": 0.2633347198668928, + "grad_norm": 4.5625, + "learning_rate": 1.550959919164702e-05, + "loss": 0.1499, + "step": 8230 + }, + { + "epoch": 0.26365468914984164, + "grad_norm": 16.0, + "learning_rate": 1.5502862916807008e-05, + "loss": 0.1749, + "step": 8240 + }, + { + "epoch": 0.26397465843279044, + "grad_norm": 15.875, + "learning_rate": 1.5496126641966992e-05, + "loss": 0.1058, + "step": 8250 + }, + { + "epoch": 0.2642946277157393, + "grad_norm": 25.75, + "learning_rate": 1.548939036712698e-05, + "loss": 0.1219, + "step": 8260 + }, + { + "epoch": 0.2646145969986881, + "grad_norm": 17.875, + "learning_rate": 1.5482654092286965e-05, + "loss": 0.0995, + "step": 8270 + }, + { + "epoch": 0.264934566281637, + "grad_norm": 34.25, + "learning_rate": 1.5475917817446952e-05, + "loss": 0.1514, + "step": 8280 + }, + { + "epoch": 0.2652545355645858, + "grad_norm": 9.8125, + "learning_rate": 1.546918154260694e-05, + "loss": 0.2644, + "step": 8290 + }, + { + "epoch": 0.26557450484753464, + "grad_norm": 15.0, + "learning_rate": 1.5462445267766924e-05, + "loss": 0.1118, + "step": 8300 + }, + { + "epoch": 0.26589447413048345, + "grad_norm": 2.5, + "learning_rate": 1.5455708992926912e-05, + "loss": 0.0594, + "step": 8310 + }, + { + "epoch": 0.2662144434134323, + "grad_norm": 5.75, + "learning_rate": 1.54489727180869e-05, + "loss": 0.0984, + "step": 8320 + }, + { + "epoch": 0.2665344126963812, + "grad_norm": 14.8125, + "learning_rate": 1.5442236443246884e-05, + "loss": 0.1146, + "step": 8330 + }, + { + "epoch": 0.26685438197933, + "grad_norm": 0.71484375, + "learning_rate": 1.5435500168406872e-05, + "loss": 0.078, + "step": 8340 + }, + { + "epoch": 0.26717435126227884, + "grad_norm": 1.6328125, + "learning_rate": 1.542876389356686e-05, + "loss": 0.1246, + "step": 8350 + }, + { + "epoch": 0.26749432054522765, + "grad_norm": 11.9375, + "learning_rate": 1.5422027618726844e-05, + "loss": 0.1302, + "step": 8360 + }, + { + "epoch": 0.2678142898281765, + "grad_norm": 8.8125, + "learning_rate": 1.5415291343886832e-05, + "loss": 0.0946, + "step": 8370 + }, + { + "epoch": 0.2681342591111253, + "grad_norm": 1.9140625, + "learning_rate": 1.5408555069046816e-05, + "loss": 0.0854, + "step": 8380 + }, + { + "epoch": 0.2684542283940742, + "grad_norm": 7.5625, + "learning_rate": 1.5401818794206804e-05, + "loss": 0.0904, + "step": 8390 + }, + { + "epoch": 0.268774197677023, + "grad_norm": 0.88671875, + "learning_rate": 1.539508251936679e-05, + "loss": 0.1549, + "step": 8400 + }, + { + "epoch": 0.26909416695997185, + "grad_norm": 1.953125, + "learning_rate": 1.5388346244526776e-05, + "loss": 0.1385, + "step": 8410 + }, + { + "epoch": 0.26941413624292065, + "grad_norm": 6.0625, + "learning_rate": 1.5381609969686764e-05, + "loss": 0.135, + "step": 8420 + }, + { + "epoch": 0.2697341055258695, + "grad_norm": 2.15625, + "learning_rate": 1.537487369484675e-05, + "loss": 0.0887, + "step": 8430 + }, + { + "epoch": 0.2700540748088184, + "grad_norm": 7.65625, + "learning_rate": 1.5368137420006736e-05, + "loss": 0.1285, + "step": 8440 + }, + { + "epoch": 0.2703740440917672, + "grad_norm": 7.15625, + "learning_rate": 1.5361401145166723e-05, + "loss": 0.128, + "step": 8450 + }, + { + "epoch": 0.27069401337471605, + "grad_norm": 10.875, + "learning_rate": 1.535466487032671e-05, + "loss": 0.1024, + "step": 8460 + }, + { + "epoch": 0.27101398265766485, + "grad_norm": 11.0625, + "learning_rate": 1.5347928595486696e-05, + "loss": 0.1384, + "step": 8470 + }, + { + "epoch": 0.2713339519406137, + "grad_norm": 9.875, + "learning_rate": 1.5341192320646683e-05, + "loss": 0.1007, + "step": 8480 + }, + { + "epoch": 0.2716539212235625, + "grad_norm": 7.3125, + "learning_rate": 1.533445604580667e-05, + "loss": 0.0678, + "step": 8490 + }, + { + "epoch": 0.2719738905065114, + "grad_norm": 1.859375, + "learning_rate": 1.5327719770966655e-05, + "loss": 0.1092, + "step": 8500 + }, + { + "epoch": 0.2722938597894602, + "grad_norm": 0.392578125, + "learning_rate": 1.5320983496126643e-05, + "loss": 0.0776, + "step": 8510 + }, + { + "epoch": 0.27261382907240905, + "grad_norm": 16.0, + "learning_rate": 1.5314247221286627e-05, + "loss": 0.0872, + "step": 8520 + }, + { + "epoch": 0.27293379835535786, + "grad_norm": 7.0625, + "learning_rate": 1.5307510946446615e-05, + "loss": 0.1059, + "step": 8530 + }, + { + "epoch": 0.2732537676383067, + "grad_norm": 9.125, + "learning_rate": 1.5300774671606603e-05, + "loss": 0.099, + "step": 8540 + }, + { + "epoch": 0.2735737369212556, + "grad_norm": 0.6171875, + "learning_rate": 1.5294038396766587e-05, + "loss": 0.1089, + "step": 8550 + }, + { + "epoch": 0.2738937062042044, + "grad_norm": 1.0859375, + "learning_rate": 1.5287302121926575e-05, + "loss": 0.1797, + "step": 8560 + }, + { + "epoch": 0.27421367548715325, + "grad_norm": 12.8125, + "learning_rate": 1.5280565847086563e-05, + "loss": 0.0968, + "step": 8570 + }, + { + "epoch": 0.27453364477010206, + "grad_norm": 15.0, + "learning_rate": 1.5273829572246547e-05, + "loss": 0.1877, + "step": 8580 + }, + { + "epoch": 0.2748536140530509, + "grad_norm": 17.625, + "learning_rate": 1.5267093297406535e-05, + "loss": 0.1551, + "step": 8590 + }, + { + "epoch": 0.2751735833359997, + "grad_norm": 20.25, + "learning_rate": 1.5260357022566523e-05, + "loss": 0.1506, + "step": 8600 + }, + { + "epoch": 0.2754935526189486, + "grad_norm": 10.3125, + "learning_rate": 1.5253620747726507e-05, + "loss": 0.1837, + "step": 8610 + }, + { + "epoch": 0.2758135219018974, + "grad_norm": 7.25, + "learning_rate": 1.5246884472886495e-05, + "loss": 0.1349, + "step": 8620 + }, + { + "epoch": 0.27613349118484626, + "grad_norm": 6.28125, + "learning_rate": 1.524014819804648e-05, + "loss": 0.0888, + "step": 8630 + }, + { + "epoch": 0.27645346046779506, + "grad_norm": 1.546875, + "learning_rate": 1.5233411923206467e-05, + "loss": 0.1094, + "step": 8640 + }, + { + "epoch": 0.2767734297507439, + "grad_norm": 11.1875, + "learning_rate": 1.5226675648366453e-05, + "loss": 0.1154, + "step": 8650 + }, + { + "epoch": 0.2770933990336928, + "grad_norm": 7.84375, + "learning_rate": 1.521993937352644e-05, + "loss": 0.1251, + "step": 8660 + }, + { + "epoch": 0.2774133683166416, + "grad_norm": 10.375, + "learning_rate": 1.5213203098686428e-05, + "loss": 0.1021, + "step": 8670 + }, + { + "epoch": 0.27773333759959046, + "grad_norm": 10.1875, + "learning_rate": 1.5206466823846416e-05, + "loss": 0.0825, + "step": 8680 + }, + { + "epoch": 0.27805330688253926, + "grad_norm": 6.0, + "learning_rate": 1.5199730549006402e-05, + "loss": 0.1144, + "step": 8690 + }, + { + "epoch": 0.2783732761654881, + "grad_norm": 0.953125, + "learning_rate": 1.5192994274166388e-05, + "loss": 0.0845, + "step": 8700 + }, + { + "epoch": 0.27869324544843693, + "grad_norm": 7.375, + "learning_rate": 1.5186257999326374e-05, + "loss": 0.137, + "step": 8710 + }, + { + "epoch": 0.2790132147313858, + "grad_norm": 10.5625, + "learning_rate": 1.5179521724486362e-05, + "loss": 0.1011, + "step": 8720 + }, + { + "epoch": 0.2793331840143346, + "grad_norm": 17.75, + "learning_rate": 1.5172785449646348e-05, + "loss": 0.0991, + "step": 8730 + }, + { + "epoch": 0.27965315329728346, + "grad_norm": 41.5, + "learning_rate": 1.5166049174806334e-05, + "loss": 0.1776, + "step": 8740 + }, + { + "epoch": 0.2799731225802323, + "grad_norm": 4.21875, + "learning_rate": 1.5159312899966322e-05, + "loss": 0.0632, + "step": 8750 + }, + { + "epoch": 0.28029309186318113, + "grad_norm": 1.1640625, + "learning_rate": 1.5152576625126308e-05, + "loss": 0.0542, + "step": 8760 + }, + { + "epoch": 0.28061306114613, + "grad_norm": 4.96875, + "learning_rate": 1.5145840350286294e-05, + "loss": 0.0905, + "step": 8770 + }, + { + "epoch": 0.2809330304290788, + "grad_norm": 11.5625, + "learning_rate": 1.513910407544628e-05, + "loss": 0.1196, + "step": 8780 + }, + { + "epoch": 0.28125299971202766, + "grad_norm": 5.28125, + "learning_rate": 1.5132367800606267e-05, + "loss": 0.1511, + "step": 8790 + }, + { + "epoch": 0.28157296899497647, + "grad_norm": 11.9375, + "learning_rate": 1.5125631525766254e-05, + "loss": 0.1364, + "step": 8800 + }, + { + "epoch": 0.28189293827792533, + "grad_norm": 20.0, + "learning_rate": 1.511889525092624e-05, + "loss": 0.1594, + "step": 8810 + }, + { + "epoch": 0.28221290756087414, + "grad_norm": 8.9375, + "learning_rate": 1.5112158976086226e-05, + "loss": 0.0953, + "step": 8820 + }, + { + "epoch": 0.282532876843823, + "grad_norm": 4.8125, + "learning_rate": 1.5105422701246213e-05, + "loss": 0.1271, + "step": 8830 + }, + { + "epoch": 0.2828528461267718, + "grad_norm": 5.71875, + "learning_rate": 1.50986864264062e-05, + "loss": 0.0866, + "step": 8840 + }, + { + "epoch": 0.28317281540972067, + "grad_norm": 8.5625, + "learning_rate": 1.5091950151566185e-05, + "loss": 0.1211, + "step": 8850 + }, + { + "epoch": 0.28349278469266953, + "grad_norm": 1.75, + "learning_rate": 1.5085213876726173e-05, + "loss": 0.1184, + "step": 8860 + }, + { + "epoch": 0.28381275397561834, + "grad_norm": 19.875, + "learning_rate": 1.507847760188616e-05, + "loss": 0.0913, + "step": 8870 + }, + { + "epoch": 0.2841327232585672, + "grad_norm": 20.0, + "learning_rate": 1.5071741327046145e-05, + "loss": 0.0933, + "step": 8880 + }, + { + "epoch": 0.284452692541516, + "grad_norm": 1.34375, + "learning_rate": 1.5065005052206131e-05, + "loss": 0.1482, + "step": 8890 + }, + { + "epoch": 0.28477266182446487, + "grad_norm": 1.734375, + "learning_rate": 1.5058268777366119e-05, + "loss": 0.0946, + "step": 8900 + }, + { + "epoch": 0.2850926311074137, + "grad_norm": 14.25, + "learning_rate": 1.5051532502526105e-05, + "loss": 0.145, + "step": 8910 + }, + { + "epoch": 0.28541260039036254, + "grad_norm": 10.5, + "learning_rate": 1.5044796227686091e-05, + "loss": 0.106, + "step": 8920 + }, + { + "epoch": 0.28573256967331134, + "grad_norm": 3.828125, + "learning_rate": 1.5038059952846079e-05, + "loss": 0.0602, + "step": 8930 + }, + { + "epoch": 0.2860525389562602, + "grad_norm": 7.375, + "learning_rate": 1.5031323678006065e-05, + "loss": 0.1149, + "step": 8940 + }, + { + "epoch": 0.286372508239209, + "grad_norm": 17.0, + "learning_rate": 1.5024587403166051e-05, + "loss": 0.1095, + "step": 8950 + }, + { + "epoch": 0.2866924775221579, + "grad_norm": 9.75, + "learning_rate": 1.5017851128326037e-05, + "loss": 0.0986, + "step": 8960 + }, + { + "epoch": 0.28701244680510674, + "grad_norm": 17.25, + "learning_rate": 1.5011114853486025e-05, + "loss": 0.1214, + "step": 8970 + }, + { + "epoch": 0.28733241608805554, + "grad_norm": 11.0625, + "learning_rate": 1.500437857864601e-05, + "loss": 0.1167, + "step": 8980 + }, + { + "epoch": 0.2876523853710044, + "grad_norm": 0.9921875, + "learning_rate": 1.4997642303805997e-05, + "loss": 0.1413, + "step": 8990 + }, + { + "epoch": 0.2879723546539532, + "grad_norm": 28.875, + "learning_rate": 1.4990906028965983e-05, + "loss": 0.0858, + "step": 9000 + }, + { + "epoch": 0.2882923239369021, + "grad_norm": 9.9375, + "learning_rate": 1.498416975412597e-05, + "loss": 0.0825, + "step": 9010 + }, + { + "epoch": 0.2886122932198509, + "grad_norm": 19.0, + "learning_rate": 1.4977433479285957e-05, + "loss": 0.1569, + "step": 9020 + }, + { + "epoch": 0.28893226250279974, + "grad_norm": 7.71875, + "learning_rate": 1.4970697204445943e-05, + "loss": 0.1458, + "step": 9030 + }, + { + "epoch": 0.28925223178574855, + "grad_norm": 5.28125, + "learning_rate": 1.496396092960593e-05, + "loss": 0.1042, + "step": 9040 + }, + { + "epoch": 0.2895722010686974, + "grad_norm": 11.125, + "learning_rate": 1.4957224654765916e-05, + "loss": 0.1046, + "step": 9050 + }, + { + "epoch": 0.2898921703516462, + "grad_norm": 11.9375, + "learning_rate": 1.4950488379925902e-05, + "loss": 0.1126, + "step": 9060 + }, + { + "epoch": 0.2902121396345951, + "grad_norm": 12.875, + "learning_rate": 1.4943752105085888e-05, + "loss": 0.0573, + "step": 9070 + }, + { + "epoch": 0.29053210891754394, + "grad_norm": 14.3125, + "learning_rate": 1.4937015830245876e-05, + "loss": 0.1601, + "step": 9080 + }, + { + "epoch": 0.29085207820049275, + "grad_norm": 12.625, + "learning_rate": 1.4930279555405862e-05, + "loss": 0.119, + "step": 9090 + }, + { + "epoch": 0.2911720474834416, + "grad_norm": 6.84375, + "learning_rate": 1.4923543280565848e-05, + "loss": 0.1089, + "step": 9100 + }, + { + "epoch": 0.2914920167663904, + "grad_norm": 2.46875, + "learning_rate": 1.4916807005725834e-05, + "loss": 0.0874, + "step": 9110 + }, + { + "epoch": 0.2918119860493393, + "grad_norm": 0.9765625, + "learning_rate": 1.4910070730885822e-05, + "loss": 0.1135, + "step": 9120 + }, + { + "epoch": 0.2921319553322881, + "grad_norm": 9.0, + "learning_rate": 1.4903334456045808e-05, + "loss": 0.1416, + "step": 9130 + }, + { + "epoch": 0.29245192461523695, + "grad_norm": 17.25, + "learning_rate": 1.4896598181205794e-05, + "loss": 0.1287, + "step": 9140 + }, + { + "epoch": 0.29277189389818575, + "grad_norm": 9.875, + "learning_rate": 1.4889861906365782e-05, + "loss": 0.141, + "step": 9150 + }, + { + "epoch": 0.2930918631811346, + "grad_norm": 9.4375, + "learning_rate": 1.4883125631525768e-05, + "loss": 0.1045, + "step": 9160 + }, + { + "epoch": 0.2934118324640834, + "grad_norm": 24.25, + "learning_rate": 1.4876389356685754e-05, + "loss": 0.1603, + "step": 9170 + }, + { + "epoch": 0.2937318017470323, + "grad_norm": 13.625, + "learning_rate": 1.486965308184574e-05, + "loss": 0.1114, + "step": 9180 + }, + { + "epoch": 0.29405177102998115, + "grad_norm": 5.15625, + "learning_rate": 1.4862916807005728e-05, + "loss": 0.1317, + "step": 9190 + }, + { + "epoch": 0.29437174031292995, + "grad_norm": 5.5, + "learning_rate": 1.4856180532165714e-05, + "loss": 0.0948, + "step": 9200 + }, + { + "epoch": 0.2946917095958788, + "grad_norm": 8.75, + "learning_rate": 1.48494442573257e-05, + "loss": 0.1485, + "step": 9210 + }, + { + "epoch": 0.2950116788788276, + "grad_norm": 12.9375, + "learning_rate": 1.4842707982485688e-05, + "loss": 0.1273, + "step": 9220 + }, + { + "epoch": 0.2953316481617765, + "grad_norm": 9.375, + "learning_rate": 1.4835971707645674e-05, + "loss": 0.0799, + "step": 9230 + }, + { + "epoch": 0.2956516174447253, + "grad_norm": 4.9375, + "learning_rate": 1.482923543280566e-05, + "loss": 0.1229, + "step": 9240 + }, + { + "epoch": 0.29597158672767415, + "grad_norm": 18.75, + "learning_rate": 1.4822499157965646e-05, + "loss": 0.1457, + "step": 9250 + }, + { + "epoch": 0.29629155601062296, + "grad_norm": 2.328125, + "learning_rate": 1.4815762883125633e-05, + "loss": 0.095, + "step": 9260 + }, + { + "epoch": 0.2966115252935718, + "grad_norm": 17.5, + "learning_rate": 1.480902660828562e-05, + "loss": 0.097, + "step": 9270 + }, + { + "epoch": 0.2969314945765206, + "grad_norm": 0.53125, + "learning_rate": 1.4802290333445606e-05, + "loss": 0.1556, + "step": 9280 + }, + { + "epoch": 0.2972514638594695, + "grad_norm": 26.25, + "learning_rate": 1.4795554058605592e-05, + "loss": 0.1373, + "step": 9290 + }, + { + "epoch": 0.29757143314241835, + "grad_norm": 25.125, + "learning_rate": 1.478881778376558e-05, + "loss": 0.1193, + "step": 9300 + }, + { + "epoch": 0.29789140242536716, + "grad_norm": 14.125, + "learning_rate": 1.4782081508925565e-05, + "loss": 0.1143, + "step": 9310 + }, + { + "epoch": 0.298211371708316, + "grad_norm": 9.5, + "learning_rate": 1.4775345234085551e-05, + "loss": 0.0895, + "step": 9320 + }, + { + "epoch": 0.2985313409912648, + "grad_norm": 7.5, + "learning_rate": 1.4768608959245539e-05, + "loss": 0.0846, + "step": 9330 + }, + { + "epoch": 0.2988513102742137, + "grad_norm": 20.0, + "learning_rate": 1.4761872684405525e-05, + "loss": 0.1106, + "step": 9340 + }, + { + "epoch": 0.2991712795571625, + "grad_norm": 9.8125, + "learning_rate": 1.4755136409565511e-05, + "loss": 0.1722, + "step": 9350 + }, + { + "epoch": 0.29949124884011136, + "grad_norm": 3.4375, + "learning_rate": 1.4748400134725497e-05, + "loss": 0.0708, + "step": 9360 + }, + { + "epoch": 0.29981121812306016, + "grad_norm": 11.6875, + "learning_rate": 1.4741663859885485e-05, + "loss": 0.1534, + "step": 9370 + }, + { + "epoch": 0.300131187406009, + "grad_norm": 15.6875, + "learning_rate": 1.4734927585045471e-05, + "loss": 0.1129, + "step": 9380 + }, + { + "epoch": 0.3004511566889579, + "grad_norm": 9.25, + "learning_rate": 1.4728191310205457e-05, + "loss": 0.1174, + "step": 9390 + }, + { + "epoch": 0.3007711259719067, + "grad_norm": 1.15625, + "learning_rate": 1.4721455035365445e-05, + "loss": 0.0873, + "step": 9400 + }, + { + "epoch": 0.30109109525485556, + "grad_norm": 11.375, + "learning_rate": 1.471471876052543e-05, + "loss": 0.0908, + "step": 9410 + }, + { + "epoch": 0.30141106453780436, + "grad_norm": 3.703125, + "learning_rate": 1.4707982485685417e-05, + "loss": 0.154, + "step": 9420 + }, + { + "epoch": 0.3017310338207532, + "grad_norm": 16.875, + "learning_rate": 1.4701246210845403e-05, + "loss": 0.1004, + "step": 9430 + }, + { + "epoch": 0.30205100310370203, + "grad_norm": 0.57421875, + "learning_rate": 1.469450993600539e-05, + "loss": 0.0703, + "step": 9440 + }, + { + "epoch": 0.3023709723866509, + "grad_norm": 9.4375, + "learning_rate": 1.4687773661165377e-05, + "loss": 0.1026, + "step": 9450 + }, + { + "epoch": 0.3026909416695997, + "grad_norm": 0.83984375, + "learning_rate": 1.4681037386325363e-05, + "loss": 0.1203, + "step": 9460 + }, + { + "epoch": 0.30301091095254856, + "grad_norm": 6.78125, + "learning_rate": 1.4674301111485349e-05, + "loss": 0.1584, + "step": 9470 + }, + { + "epoch": 0.30333088023549737, + "grad_norm": 1.7734375, + "learning_rate": 1.4667564836645337e-05, + "loss": 0.1524, + "step": 9480 + }, + { + "epoch": 0.30365084951844623, + "grad_norm": 6.40625, + "learning_rate": 1.4660828561805323e-05, + "loss": 0.1914, + "step": 9490 + }, + { + "epoch": 0.3039708188013951, + "grad_norm": 11.125, + "learning_rate": 1.4654092286965309e-05, + "loss": 0.1154, + "step": 9500 + }, + { + "epoch": 0.3042907880843439, + "grad_norm": 9.5, + "learning_rate": 1.4647356012125296e-05, + "loss": 0.1174, + "step": 9510 + }, + { + "epoch": 0.30461075736729276, + "grad_norm": 5.3125, + "learning_rate": 1.4640619737285282e-05, + "loss": 0.1947, + "step": 9520 + }, + { + "epoch": 0.30493072665024157, + "grad_norm": 5.78125, + "learning_rate": 1.4633883462445268e-05, + "loss": 0.0843, + "step": 9530 + }, + { + "epoch": 0.30525069593319043, + "grad_norm": 12.875, + "learning_rate": 1.4627147187605254e-05, + "loss": 0.1404, + "step": 9540 + }, + { + "epoch": 0.30557066521613924, + "grad_norm": 5.75, + "learning_rate": 1.4620410912765242e-05, + "loss": 0.1464, + "step": 9550 + }, + { + "epoch": 0.3058906344990881, + "grad_norm": 10.9375, + "learning_rate": 1.4613674637925228e-05, + "loss": 0.1197, + "step": 9560 + }, + { + "epoch": 0.3062106037820369, + "grad_norm": 9.125, + "learning_rate": 1.4606938363085214e-05, + "loss": 0.1624, + "step": 9570 + }, + { + "epoch": 0.30653057306498577, + "grad_norm": 8.375, + "learning_rate": 1.46002020882452e-05, + "loss": 0.1093, + "step": 9580 + }, + { + "epoch": 0.3068505423479346, + "grad_norm": 11.0625, + "learning_rate": 1.4593465813405188e-05, + "loss": 0.1241, + "step": 9590 + }, + { + "epoch": 0.30717051163088344, + "grad_norm": 9.5625, + "learning_rate": 1.4586729538565174e-05, + "loss": 0.114, + "step": 9600 + }, + { + "epoch": 0.3074904809138323, + "grad_norm": 0.640625, + "learning_rate": 1.457999326372516e-05, + "loss": 0.1178, + "step": 9610 + }, + { + "epoch": 0.3078104501967811, + "grad_norm": 14.125, + "learning_rate": 1.4573256988885148e-05, + "loss": 0.1257, + "step": 9620 + }, + { + "epoch": 0.30813041947972997, + "grad_norm": 15.0, + "learning_rate": 1.4566520714045134e-05, + "loss": 0.1045, + "step": 9630 + }, + { + "epoch": 0.3084503887626788, + "grad_norm": 7.65625, + "learning_rate": 1.455978443920512e-05, + "loss": 0.1169, + "step": 9640 + }, + { + "epoch": 0.30877035804562764, + "grad_norm": 3.296875, + "learning_rate": 1.4553048164365106e-05, + "loss": 0.1398, + "step": 9650 + }, + { + "epoch": 0.30909032732857644, + "grad_norm": 5.46875, + "learning_rate": 1.4546311889525094e-05, + "loss": 0.1164, + "step": 9660 + }, + { + "epoch": 0.3094102966115253, + "grad_norm": 6.53125, + "learning_rate": 1.453957561468508e-05, + "loss": 0.0888, + "step": 9670 + }, + { + "epoch": 0.3097302658944741, + "grad_norm": 0.60546875, + "learning_rate": 1.4532839339845066e-05, + "loss": 0.0635, + "step": 9680 + }, + { + "epoch": 0.310050235177423, + "grad_norm": 1.3046875, + "learning_rate": 1.4526103065005054e-05, + "loss": 0.0842, + "step": 9690 + }, + { + "epoch": 0.3103702044603718, + "grad_norm": 1.828125, + "learning_rate": 1.451936679016504e-05, + "loss": 0.148, + "step": 9700 + }, + { + "epoch": 0.31069017374332064, + "grad_norm": 13.3125, + "learning_rate": 1.4512630515325026e-05, + "loss": 0.1911, + "step": 9710 + }, + { + "epoch": 0.3110101430262695, + "grad_norm": 10.4375, + "learning_rate": 1.4505894240485012e-05, + "loss": 0.0915, + "step": 9720 + }, + { + "epoch": 0.3113301123092183, + "grad_norm": 3.234375, + "learning_rate": 1.4499157965645e-05, + "loss": 0.0696, + "step": 9730 + }, + { + "epoch": 0.31165008159216717, + "grad_norm": 18.5, + "learning_rate": 1.4492421690804985e-05, + "loss": 0.1597, + "step": 9740 + }, + { + "epoch": 0.311970050875116, + "grad_norm": 8.6875, + "learning_rate": 1.4485685415964971e-05, + "loss": 0.1108, + "step": 9750 + }, + { + "epoch": 0.31229002015806484, + "grad_norm": 9.75, + "learning_rate": 1.4478949141124958e-05, + "loss": 0.1168, + "step": 9760 + }, + { + "epoch": 0.31260998944101365, + "grad_norm": 6.28125, + "learning_rate": 1.4472212866284945e-05, + "loss": 0.0774, + "step": 9770 + }, + { + "epoch": 0.3129299587239625, + "grad_norm": 11.6875, + "learning_rate": 1.4465476591444931e-05, + "loss": 0.1198, + "step": 9780 + }, + { + "epoch": 0.3132499280069113, + "grad_norm": 33.0, + "learning_rate": 1.4458740316604917e-05, + "loss": 0.1354, + "step": 9790 + }, + { + "epoch": 0.3135698972898602, + "grad_norm": 5.78125, + "learning_rate": 1.4452004041764905e-05, + "loss": 0.1465, + "step": 9800 + }, + { + "epoch": 0.313889866572809, + "grad_norm": 14.375, + "learning_rate": 1.4445267766924891e-05, + "loss": 0.1162, + "step": 9810 + }, + { + "epoch": 0.31420983585575785, + "grad_norm": 11.0625, + "learning_rate": 1.4438531492084877e-05, + "loss": 0.0746, + "step": 9820 + }, + { + "epoch": 0.3145298051387067, + "grad_norm": 4.375, + "learning_rate": 1.4431795217244863e-05, + "loss": 0.1316, + "step": 9830 + }, + { + "epoch": 0.3148497744216555, + "grad_norm": 5.78125, + "learning_rate": 1.4425058942404851e-05, + "loss": 0.1076, + "step": 9840 + }, + { + "epoch": 0.3151697437046044, + "grad_norm": 0.5390625, + "learning_rate": 1.4418322667564837e-05, + "loss": 0.1284, + "step": 9850 + }, + { + "epoch": 0.3154897129875532, + "grad_norm": 11.0, + "learning_rate": 1.4411586392724823e-05, + "loss": 0.0869, + "step": 9860 + }, + { + "epoch": 0.31580968227050205, + "grad_norm": 8.8125, + "learning_rate": 1.440485011788481e-05, + "loss": 0.1006, + "step": 9870 + }, + { + "epoch": 0.31612965155345085, + "grad_norm": 4.59375, + "learning_rate": 1.4398113843044797e-05, + "loss": 0.0717, + "step": 9880 + }, + { + "epoch": 0.3164496208363997, + "grad_norm": 4.03125, + "learning_rate": 1.4391377568204783e-05, + "loss": 0.1192, + "step": 9890 + }, + { + "epoch": 0.3167695901193485, + "grad_norm": 9.25, + "learning_rate": 1.4384641293364769e-05, + "loss": 0.1052, + "step": 9900 + }, + { + "epoch": 0.3170895594022974, + "grad_norm": 2.4375, + "learning_rate": 1.4377905018524757e-05, + "loss": 0.1174, + "step": 9910 + }, + { + "epoch": 0.3174095286852462, + "grad_norm": 8.0625, + "learning_rate": 1.4371168743684743e-05, + "loss": 0.1466, + "step": 9920 + }, + { + "epoch": 0.31772949796819505, + "grad_norm": 4.34375, + "learning_rate": 1.4364432468844729e-05, + "loss": 0.108, + "step": 9930 + }, + { + "epoch": 0.3180494672511439, + "grad_norm": 7.34375, + "learning_rate": 1.4357696194004715e-05, + "loss": 0.1376, + "step": 9940 + }, + { + "epoch": 0.3183694365340927, + "grad_norm": 8.25, + "learning_rate": 1.4350959919164702e-05, + "loss": 0.0933, + "step": 9950 + }, + { + "epoch": 0.3186894058170416, + "grad_norm": 4.90625, + "learning_rate": 1.434422364432469e-05, + "loss": 0.1111, + "step": 9960 + }, + { + "epoch": 0.3190093750999904, + "grad_norm": 20.75, + "learning_rate": 1.4337487369484678e-05, + "loss": 0.1331, + "step": 9970 + }, + { + "epoch": 0.31932934438293925, + "grad_norm": 5.1875, + "learning_rate": 1.4330751094644664e-05, + "loss": 0.148, + "step": 9980 + }, + { + "epoch": 0.31964931366588806, + "grad_norm": 14.5625, + "learning_rate": 1.432401481980465e-05, + "loss": 0.0841, + "step": 9990 + }, + { + "epoch": 0.3199692829488369, + "grad_norm": 16.125, + "learning_rate": 1.4317278544964636e-05, + "loss": 0.1145, + "step": 10000 + }, + { + "epoch": 0.3202892522317857, + "grad_norm": 5.21875, + "learning_rate": 1.4310542270124624e-05, + "loss": 0.1404, + "step": 10010 + }, + { + "epoch": 0.3206092215147346, + "grad_norm": 1.1484375, + "learning_rate": 1.430380599528461e-05, + "loss": 0.1227, + "step": 10020 + }, + { + "epoch": 0.32092919079768345, + "grad_norm": 15.5625, + "learning_rate": 1.4297069720444596e-05, + "loss": 0.1495, + "step": 10030 + }, + { + "epoch": 0.32124916008063226, + "grad_norm": 5.125, + "learning_rate": 1.4290333445604584e-05, + "loss": 0.1115, + "step": 10040 + }, + { + "epoch": 0.3215691293635811, + "grad_norm": 8.375, + "learning_rate": 1.428359717076457e-05, + "loss": 0.0982, + "step": 10050 + }, + { + "epoch": 0.3218890986465299, + "grad_norm": 9.1875, + "learning_rate": 1.4276860895924556e-05, + "loss": 0.1136, + "step": 10060 + }, + { + "epoch": 0.3222090679294788, + "grad_norm": 12.875, + "learning_rate": 1.4270124621084542e-05, + "loss": 0.166, + "step": 10070 + }, + { + "epoch": 0.3225290372124276, + "grad_norm": 8.8125, + "learning_rate": 1.426338834624453e-05, + "loss": 0.0961, + "step": 10080 + }, + { + "epoch": 0.32284900649537646, + "grad_norm": 5.5625, + "learning_rate": 1.4256652071404516e-05, + "loss": 0.1153, + "step": 10090 + }, + { + "epoch": 0.32316897577832526, + "grad_norm": 13.75, + "learning_rate": 1.4249915796564502e-05, + "loss": 0.1336, + "step": 10100 + }, + { + "epoch": 0.3234889450612741, + "grad_norm": 15.125, + "learning_rate": 1.4243179521724488e-05, + "loss": 0.1708, + "step": 10110 + }, + { + "epoch": 0.32380891434422293, + "grad_norm": 1.03125, + "learning_rate": 1.4236443246884475e-05, + "loss": 0.1428, + "step": 10120 + }, + { + "epoch": 0.3241288836271718, + "grad_norm": 1.140625, + "learning_rate": 1.4229706972044461e-05, + "loss": 0.1125, + "step": 10130 + }, + { + "epoch": 0.32444885291012066, + "grad_norm": 4.4375, + "learning_rate": 1.4222970697204447e-05, + "loss": 0.1269, + "step": 10140 + }, + { + "epoch": 0.32476882219306946, + "grad_norm": 17.5, + "learning_rate": 1.4216234422364435e-05, + "loss": 0.157, + "step": 10150 + }, + { + "epoch": 0.3250887914760183, + "grad_norm": 9.5, + "learning_rate": 1.4209498147524421e-05, + "loss": 0.0895, + "step": 10160 + }, + { + "epoch": 0.32540876075896713, + "grad_norm": 7.21875, + "learning_rate": 1.4202761872684407e-05, + "loss": 0.0963, + "step": 10170 + }, + { + "epoch": 0.325728730041916, + "grad_norm": 7.90625, + "learning_rate": 1.4196025597844393e-05, + "loss": 0.108, + "step": 10180 + }, + { + "epoch": 0.3260486993248648, + "grad_norm": 9.5625, + "learning_rate": 1.4189289323004381e-05, + "loss": 0.1689, + "step": 10190 + }, + { + "epoch": 0.32636866860781366, + "grad_norm": 8.25, + "learning_rate": 1.4182553048164367e-05, + "loss": 0.1628, + "step": 10200 + }, + { + "epoch": 0.32668863789076247, + "grad_norm": 5.625, + "learning_rate": 1.4175816773324353e-05, + "loss": 0.0784, + "step": 10210 + }, + { + "epoch": 0.32700860717371133, + "grad_norm": 17.75, + "learning_rate": 1.4169080498484339e-05, + "loss": 0.1304, + "step": 10220 + }, + { + "epoch": 0.32732857645666014, + "grad_norm": 5.90625, + "learning_rate": 1.4162344223644327e-05, + "loss": 0.1488, + "step": 10230 + }, + { + "epoch": 0.327648545739609, + "grad_norm": 2.609375, + "learning_rate": 1.4155607948804313e-05, + "loss": 0.0688, + "step": 10240 + }, + { + "epoch": 0.32796851502255786, + "grad_norm": 5.84375, + "learning_rate": 1.4148871673964299e-05, + "loss": 0.0739, + "step": 10250 + }, + { + "epoch": 0.32828848430550667, + "grad_norm": 23.75, + "learning_rate": 1.4142135399124287e-05, + "loss": 0.1272, + "step": 10260 + }, + { + "epoch": 0.32860845358845553, + "grad_norm": 8.875, + "learning_rate": 1.4135399124284273e-05, + "loss": 0.1684, + "step": 10270 + }, + { + "epoch": 0.32892842287140434, + "grad_norm": 26.5, + "learning_rate": 1.4128662849444259e-05, + "loss": 0.1469, + "step": 10280 + }, + { + "epoch": 0.3292483921543532, + "grad_norm": 11.75, + "learning_rate": 1.4121926574604245e-05, + "loss": 0.1232, + "step": 10290 + }, + { + "epoch": 0.329568361437302, + "grad_norm": 17.0, + "learning_rate": 1.4115190299764233e-05, + "loss": 0.1062, + "step": 10300 + }, + { + "epoch": 0.32988833072025087, + "grad_norm": 14.6875, + "learning_rate": 1.4108454024924219e-05, + "loss": 0.0957, + "step": 10310 + }, + { + "epoch": 0.3302083000031997, + "grad_norm": 13.5625, + "learning_rate": 1.4101717750084205e-05, + "loss": 0.1465, + "step": 10320 + }, + { + "epoch": 0.33052826928614853, + "grad_norm": 12.6875, + "learning_rate": 1.4094981475244192e-05, + "loss": 0.1535, + "step": 10330 + }, + { + "epoch": 0.33084823856909734, + "grad_norm": 9.75, + "learning_rate": 1.4088245200404178e-05, + "loss": 0.1099, + "step": 10340 + }, + { + "epoch": 0.3311682078520462, + "grad_norm": 3.046875, + "learning_rate": 1.4081508925564164e-05, + "loss": 0.137, + "step": 10350 + }, + { + "epoch": 0.33148817713499507, + "grad_norm": 2.234375, + "learning_rate": 1.407477265072415e-05, + "loss": 0.0814, + "step": 10360 + }, + { + "epoch": 0.33180814641794387, + "grad_norm": 10.25, + "learning_rate": 1.4068036375884138e-05, + "loss": 0.0976, + "step": 10370 + }, + { + "epoch": 0.33212811570089273, + "grad_norm": 1.4921875, + "learning_rate": 1.4061300101044124e-05, + "loss": 0.0583, + "step": 10380 + }, + { + "epoch": 0.33244808498384154, + "grad_norm": 0.6640625, + "learning_rate": 1.405456382620411e-05, + "loss": 0.1555, + "step": 10390 + }, + { + "epoch": 0.3327680542667904, + "grad_norm": 9.0, + "learning_rate": 1.4047827551364096e-05, + "loss": 0.2356, + "step": 10400 + }, + { + "epoch": 0.3330880235497392, + "grad_norm": 0.890625, + "learning_rate": 1.4041091276524084e-05, + "loss": 0.0423, + "step": 10410 + }, + { + "epoch": 0.33340799283268807, + "grad_norm": 3.28125, + "learning_rate": 1.403435500168407e-05, + "loss": 0.1153, + "step": 10420 + }, + { + "epoch": 0.3337279621156369, + "grad_norm": 18.5, + "learning_rate": 1.4027618726844056e-05, + "loss": 0.132, + "step": 10430 + }, + { + "epoch": 0.33404793139858574, + "grad_norm": 7.78125, + "learning_rate": 1.4020882452004044e-05, + "loss": 0.0954, + "step": 10440 + }, + { + "epoch": 0.33436790068153455, + "grad_norm": 20.375, + "learning_rate": 1.401414617716403e-05, + "loss": 0.1082, + "step": 10450 + }, + { + "epoch": 0.3346878699644834, + "grad_norm": 0.796875, + "learning_rate": 1.4007409902324016e-05, + "loss": 0.1149, + "step": 10460 + }, + { + "epoch": 0.33500783924743227, + "grad_norm": 8.875, + "learning_rate": 1.4000673627484002e-05, + "loss": 0.1498, + "step": 10470 + }, + { + "epoch": 0.3353278085303811, + "grad_norm": 8.25, + "learning_rate": 1.399393735264399e-05, + "loss": 0.1202, + "step": 10480 + }, + { + "epoch": 0.33564777781332994, + "grad_norm": 34.5, + "learning_rate": 1.3987201077803976e-05, + "loss": 0.1411, + "step": 10490 + }, + { + "epoch": 0.33596774709627875, + "grad_norm": 9.375, + "learning_rate": 1.3980464802963962e-05, + "loss": 0.0587, + "step": 10500 + }, + { + "epoch": 0.3362877163792276, + "grad_norm": 0.8828125, + "learning_rate": 1.397372852812395e-05, + "loss": 0.0687, + "step": 10510 + }, + { + "epoch": 0.3366076856621764, + "grad_norm": 7.5625, + "learning_rate": 1.3966992253283936e-05, + "loss": 0.1296, + "step": 10520 + }, + { + "epoch": 0.3369276549451253, + "grad_norm": 7.28125, + "learning_rate": 1.3960255978443922e-05, + "loss": 0.1319, + "step": 10530 + }, + { + "epoch": 0.3372476242280741, + "grad_norm": 9.6875, + "learning_rate": 1.3953519703603908e-05, + "loss": 0.1231, + "step": 10540 + }, + { + "epoch": 0.33756759351102295, + "grad_norm": 8.375, + "learning_rate": 1.3946783428763895e-05, + "loss": 0.1108, + "step": 10550 + }, + { + "epoch": 0.33788756279397175, + "grad_norm": 10.375, + "learning_rate": 1.3940047153923881e-05, + "loss": 0.1038, + "step": 10560 + }, + { + "epoch": 0.3382075320769206, + "grad_norm": 11.125, + "learning_rate": 1.3933310879083868e-05, + "loss": 0.1357, + "step": 10570 + }, + { + "epoch": 0.3385275013598695, + "grad_norm": 12.875, + "learning_rate": 1.3926574604243854e-05, + "loss": 0.1081, + "step": 10580 + }, + { + "epoch": 0.3388474706428183, + "grad_norm": 4.15625, + "learning_rate": 1.3919838329403841e-05, + "loss": 0.1027, + "step": 10590 + }, + { + "epoch": 0.33916743992576714, + "grad_norm": 1.6640625, + "learning_rate": 1.3913102054563827e-05, + "loss": 0.1465, + "step": 10600 + }, + { + "epoch": 0.33948740920871595, + "grad_norm": 8.9375, + "learning_rate": 1.3906365779723813e-05, + "loss": 0.1377, + "step": 10610 + }, + { + "epoch": 0.3398073784916648, + "grad_norm": 6.5, + "learning_rate": 1.3899629504883801e-05, + "loss": 0.1005, + "step": 10620 + }, + { + "epoch": 0.3401273477746136, + "grad_norm": 13.0, + "learning_rate": 1.3892893230043787e-05, + "loss": 0.146, + "step": 10630 + }, + { + "epoch": 0.3404473170575625, + "grad_norm": 10.9375, + "learning_rate": 1.3886156955203773e-05, + "loss": 0.117, + "step": 10640 + }, + { + "epoch": 0.3407672863405113, + "grad_norm": 2.6875, + "learning_rate": 1.387942068036376e-05, + "loss": 0.0816, + "step": 10650 + }, + { + "epoch": 0.34108725562346015, + "grad_norm": 18.25, + "learning_rate": 1.3872684405523747e-05, + "loss": 0.1183, + "step": 10660 + }, + { + "epoch": 0.34140722490640896, + "grad_norm": 11.5625, + "learning_rate": 1.3865948130683733e-05, + "loss": 0.1237, + "step": 10670 + }, + { + "epoch": 0.3417271941893578, + "grad_norm": 4.59375, + "learning_rate": 1.3859211855843719e-05, + "loss": 0.1043, + "step": 10680 + }, + { + "epoch": 0.3420471634723067, + "grad_norm": 1.453125, + "learning_rate": 1.3852475581003705e-05, + "loss": 0.0968, + "step": 10690 + }, + { + "epoch": 0.3423671327552555, + "grad_norm": 8.5, + "learning_rate": 1.3845739306163693e-05, + "loss": 0.1448, + "step": 10700 + }, + { + "epoch": 0.34268710203820435, + "grad_norm": 6.96875, + "learning_rate": 1.3839003031323679e-05, + "loss": 0.1658, + "step": 10710 + }, + { + "epoch": 0.34300707132115316, + "grad_norm": 2.1875, + "learning_rate": 1.3832266756483665e-05, + "loss": 0.1195, + "step": 10720 + }, + { + "epoch": 0.343327040604102, + "grad_norm": 14.5, + "learning_rate": 1.3825530481643653e-05, + "loss": 0.1541, + "step": 10730 + }, + { + "epoch": 0.3436470098870508, + "grad_norm": 10.1875, + "learning_rate": 1.3818794206803639e-05, + "loss": 0.1049, + "step": 10740 + }, + { + "epoch": 0.3439669791699997, + "grad_norm": 6.75, + "learning_rate": 1.3812057931963625e-05, + "loss": 0.1166, + "step": 10750 + }, + { + "epoch": 0.3442869484529485, + "grad_norm": 6.25, + "learning_rate": 1.380532165712361e-05, + "loss": 0.1394, + "step": 10760 + }, + { + "epoch": 0.34460691773589736, + "grad_norm": 1.8046875, + "learning_rate": 1.3798585382283599e-05, + "loss": 0.1192, + "step": 10770 + }, + { + "epoch": 0.3449268870188462, + "grad_norm": 10.9375, + "learning_rate": 1.3791849107443585e-05, + "loss": 0.1256, + "step": 10780 + }, + { + "epoch": 0.345246856301795, + "grad_norm": 13.3125, + "learning_rate": 1.378511283260357e-05, + "loss": 0.1219, + "step": 10790 + }, + { + "epoch": 0.3455668255847439, + "grad_norm": 8.0, + "learning_rate": 1.3778376557763558e-05, + "loss": 0.091, + "step": 10800 + }, + { + "epoch": 0.3458867948676927, + "grad_norm": 8.9375, + "learning_rate": 1.3771640282923544e-05, + "loss": 0.0598, + "step": 10810 + }, + { + "epoch": 0.34620676415064155, + "grad_norm": 19.375, + "learning_rate": 1.376490400808353e-05, + "loss": 0.1141, + "step": 10820 + }, + { + "epoch": 0.34652673343359036, + "grad_norm": 17.125, + "learning_rate": 1.3758167733243516e-05, + "loss": 0.112, + "step": 10830 + }, + { + "epoch": 0.3468467027165392, + "grad_norm": 4.3125, + "learning_rate": 1.3751431458403504e-05, + "loss": 0.1354, + "step": 10840 + }, + { + "epoch": 0.34716667199948803, + "grad_norm": 6.875, + "learning_rate": 1.374469518356349e-05, + "loss": 0.0969, + "step": 10850 + }, + { + "epoch": 0.3474866412824369, + "grad_norm": 2.125, + "learning_rate": 1.3737958908723476e-05, + "loss": 0.0671, + "step": 10860 + }, + { + "epoch": 0.3478066105653857, + "grad_norm": 4.46875, + "learning_rate": 1.3731222633883462e-05, + "loss": 0.1177, + "step": 10870 + }, + { + "epoch": 0.34812657984833456, + "grad_norm": 4.1875, + "learning_rate": 1.372448635904345e-05, + "loss": 0.1349, + "step": 10880 + }, + { + "epoch": 0.3484465491312834, + "grad_norm": 7.71875, + "learning_rate": 1.3717750084203436e-05, + "loss": 0.0932, + "step": 10890 + }, + { + "epoch": 0.34876651841423223, + "grad_norm": 7.46875, + "learning_rate": 1.3711013809363422e-05, + "loss": 0.1284, + "step": 10900 + }, + { + "epoch": 0.3490864876971811, + "grad_norm": 10.25, + "learning_rate": 1.370427753452341e-05, + "loss": 0.097, + "step": 10910 + }, + { + "epoch": 0.3494064569801299, + "grad_norm": 6.5625, + "learning_rate": 1.3697541259683396e-05, + "loss": 0.1912, + "step": 10920 + }, + { + "epoch": 0.34972642626307876, + "grad_norm": 1.0, + "learning_rate": 1.3690804984843382e-05, + "loss": 0.096, + "step": 10930 + }, + { + "epoch": 0.35004639554602757, + "grad_norm": 4.5625, + "learning_rate": 1.3684068710003368e-05, + "loss": 0.099, + "step": 10940 + }, + { + "epoch": 0.35036636482897643, + "grad_norm": 11.0625, + "learning_rate": 1.3677332435163356e-05, + "loss": 0.2057, + "step": 10950 + }, + { + "epoch": 0.35068633411192524, + "grad_norm": 2.65625, + "learning_rate": 1.3670596160323342e-05, + "loss": 0.1023, + "step": 10960 + }, + { + "epoch": 0.3510063033948741, + "grad_norm": 10.0, + "learning_rate": 1.3663859885483328e-05, + "loss": 0.1038, + "step": 10970 + }, + { + "epoch": 0.3513262726778229, + "grad_norm": 17.125, + "learning_rate": 1.3657123610643316e-05, + "loss": 0.125, + "step": 10980 + }, + { + "epoch": 0.35164624196077177, + "grad_norm": 7.6875, + "learning_rate": 1.3650387335803302e-05, + "loss": 0.0943, + "step": 10990 + }, + { + "epoch": 0.35196621124372063, + "grad_norm": 8.8125, + "learning_rate": 1.3643651060963288e-05, + "loss": 0.1054, + "step": 11000 + }, + { + "epoch": 0.35228618052666943, + "grad_norm": 9.3125, + "learning_rate": 1.3636914786123274e-05, + "loss": 0.0683, + "step": 11010 + }, + { + "epoch": 0.3526061498096183, + "grad_norm": 8.0, + "learning_rate": 1.3630178511283261e-05, + "loss": 0.1136, + "step": 11020 + }, + { + "epoch": 0.3529261190925671, + "grad_norm": 3.03125, + "learning_rate": 1.3623442236443247e-05, + "loss": 0.2113, + "step": 11030 + }, + { + "epoch": 0.35324608837551597, + "grad_norm": 7.625, + "learning_rate": 1.3616705961603233e-05, + "loss": 0.1611, + "step": 11040 + }, + { + "epoch": 0.35356605765846477, + "grad_norm": 8.0, + "learning_rate": 1.360996968676322e-05, + "loss": 0.0868, + "step": 11050 + }, + { + "epoch": 0.35388602694141363, + "grad_norm": 12.5, + "learning_rate": 1.3603233411923207e-05, + "loss": 0.09, + "step": 11060 + }, + { + "epoch": 0.35420599622436244, + "grad_norm": 19.375, + "learning_rate": 1.3596497137083193e-05, + "loss": 0.0934, + "step": 11070 + }, + { + "epoch": 0.3545259655073113, + "grad_norm": 15.9375, + "learning_rate": 1.358976086224318e-05, + "loss": 0.0803, + "step": 11080 + }, + { + "epoch": 0.3548459347902601, + "grad_norm": 6.15625, + "learning_rate": 1.3583024587403167e-05, + "loss": 0.1367, + "step": 11090 + }, + { + "epoch": 0.35516590407320897, + "grad_norm": 6.15625, + "learning_rate": 1.3576288312563153e-05, + "loss": 0.0836, + "step": 11100 + }, + { + "epoch": 0.35548587335615783, + "grad_norm": 1.3125, + "learning_rate": 1.356955203772314e-05, + "loss": 0.0831, + "step": 11110 + }, + { + "epoch": 0.35580584263910664, + "grad_norm": 1.9765625, + "learning_rate": 1.3562815762883125e-05, + "loss": 0.1429, + "step": 11120 + }, + { + "epoch": 0.3561258119220555, + "grad_norm": 4.71875, + "learning_rate": 1.3556079488043113e-05, + "loss": 0.1425, + "step": 11130 + }, + { + "epoch": 0.3564457812050043, + "grad_norm": 12.375, + "learning_rate": 1.3549343213203099e-05, + "loss": 0.1023, + "step": 11140 + }, + { + "epoch": 0.35676575048795317, + "grad_norm": 4.5, + "learning_rate": 1.3542606938363085e-05, + "loss": 0.0813, + "step": 11150 + }, + { + "epoch": 0.357085719770902, + "grad_norm": 3.78125, + "learning_rate": 1.3535870663523071e-05, + "loss": 0.0891, + "step": 11160 + }, + { + "epoch": 0.35740568905385084, + "grad_norm": 12.6875, + "learning_rate": 1.3529134388683059e-05, + "loss": 0.0813, + "step": 11170 + }, + { + "epoch": 0.35772565833679965, + "grad_norm": 2.78125, + "learning_rate": 1.3522398113843045e-05, + "loss": 0.144, + "step": 11180 + }, + { + "epoch": 0.3580456276197485, + "grad_norm": 2.28125, + "learning_rate": 1.3515661839003031e-05, + "loss": 0.0854, + "step": 11190 + }, + { + "epoch": 0.3583655969026973, + "grad_norm": 8.0625, + "learning_rate": 1.3508925564163019e-05, + "loss": 0.0996, + "step": 11200 + }, + { + "epoch": 0.3586855661856462, + "grad_norm": 3.265625, + "learning_rate": 1.3502189289323005e-05, + "loss": 0.098, + "step": 11210 + }, + { + "epoch": 0.35900553546859504, + "grad_norm": 3.296875, + "learning_rate": 1.349545301448299e-05, + "loss": 0.0949, + "step": 11220 + }, + { + "epoch": 0.35932550475154384, + "grad_norm": 1.640625, + "learning_rate": 1.3488716739642977e-05, + "loss": 0.1067, + "step": 11230 + }, + { + "epoch": 0.3596454740344927, + "grad_norm": 1.390625, + "learning_rate": 1.3481980464802964e-05, + "loss": 0.0982, + "step": 11240 + }, + { + "epoch": 0.3599654433174415, + "grad_norm": 0.90625, + "learning_rate": 1.3475244189962952e-05, + "loss": 0.1197, + "step": 11250 + }, + { + "epoch": 0.3602854126003904, + "grad_norm": 3.25, + "learning_rate": 1.346850791512294e-05, + "loss": 0.0754, + "step": 11260 + }, + { + "epoch": 0.3606053818833392, + "grad_norm": 1.15625, + "learning_rate": 1.3461771640282926e-05, + "loss": 0.1188, + "step": 11270 + }, + { + "epoch": 0.36092535116628804, + "grad_norm": 8.3125, + "learning_rate": 1.3455035365442912e-05, + "loss": 0.1126, + "step": 11280 + }, + { + "epoch": 0.36124532044923685, + "grad_norm": 6.03125, + "learning_rate": 1.3448299090602898e-05, + "loss": 0.1263, + "step": 11290 + }, + { + "epoch": 0.3615652897321857, + "grad_norm": 5.90625, + "learning_rate": 1.3441562815762886e-05, + "loss": 0.1119, + "step": 11300 + }, + { + "epoch": 0.3618852590151345, + "grad_norm": 25.0, + "learning_rate": 1.3434826540922872e-05, + "loss": 0.107, + "step": 11310 + }, + { + "epoch": 0.3622052282980834, + "grad_norm": 12.0625, + "learning_rate": 1.3428090266082858e-05, + "loss": 0.1341, + "step": 11320 + }, + { + "epoch": 0.36252519758103224, + "grad_norm": 9.25, + "learning_rate": 1.3421353991242844e-05, + "loss": 0.1262, + "step": 11330 + }, + { + "epoch": 0.36284516686398105, + "grad_norm": 2.890625, + "learning_rate": 1.3414617716402832e-05, + "loss": 0.0878, + "step": 11340 + }, + { + "epoch": 0.3631651361469299, + "grad_norm": 2.25, + "learning_rate": 1.3407881441562818e-05, + "loss": 0.0722, + "step": 11350 + }, + { + "epoch": 0.3634851054298787, + "grad_norm": 29.625, + "learning_rate": 1.3401145166722804e-05, + "loss": 0.1658, + "step": 11360 + }, + { + "epoch": 0.3638050747128276, + "grad_norm": 4.875, + "learning_rate": 1.3394408891882791e-05, + "loss": 0.0945, + "step": 11370 + }, + { + "epoch": 0.3641250439957764, + "grad_norm": 6.25, + "learning_rate": 1.3387672617042778e-05, + "loss": 0.1189, + "step": 11380 + }, + { + "epoch": 0.36444501327872525, + "grad_norm": 9.3125, + "learning_rate": 1.3380936342202764e-05, + "loss": 0.1067, + "step": 11390 + }, + { + "epoch": 0.36476498256167406, + "grad_norm": 2.296875, + "learning_rate": 1.337420006736275e-05, + "loss": 0.1142, + "step": 11400 + }, + { + "epoch": 0.3650849518446229, + "grad_norm": 8.0625, + "learning_rate": 1.3367463792522737e-05, + "loss": 0.1694, + "step": 11410 + }, + { + "epoch": 0.3654049211275718, + "grad_norm": 11.0625, + "learning_rate": 1.3360727517682723e-05, + "loss": 0.0988, + "step": 11420 + }, + { + "epoch": 0.3657248904105206, + "grad_norm": 5.4375, + "learning_rate": 1.335399124284271e-05, + "loss": 0.0628, + "step": 11430 + }, + { + "epoch": 0.36604485969346945, + "grad_norm": 0.388671875, + "learning_rate": 1.3347254968002697e-05, + "loss": 0.1021, + "step": 11440 + }, + { + "epoch": 0.36636482897641826, + "grad_norm": 23.5, + "learning_rate": 1.3340518693162683e-05, + "loss": 0.1237, + "step": 11450 + }, + { + "epoch": 0.3666847982593671, + "grad_norm": 17.0, + "learning_rate": 1.333378241832267e-05, + "loss": 0.143, + "step": 11460 + }, + { + "epoch": 0.3670047675423159, + "grad_norm": 13.3125, + "learning_rate": 1.3327046143482655e-05, + "loss": 0.2044, + "step": 11470 + }, + { + "epoch": 0.3673247368252648, + "grad_norm": 6.28125, + "learning_rate": 1.3320309868642643e-05, + "loss": 0.0776, + "step": 11480 + }, + { + "epoch": 0.3676447061082136, + "grad_norm": 5.15625, + "learning_rate": 1.3313573593802629e-05, + "loss": 0.0917, + "step": 11490 + }, + { + "epoch": 0.36796467539116245, + "grad_norm": 7.78125, + "learning_rate": 1.3306837318962615e-05, + "loss": 0.1108, + "step": 11500 + }, + { + "epoch": 0.36828464467411126, + "grad_norm": 6.875, + "learning_rate": 1.3300101044122601e-05, + "loss": 0.0931, + "step": 11510 + }, + { + "epoch": 0.3686046139570601, + "grad_norm": 16.25, + "learning_rate": 1.3293364769282589e-05, + "loss": 0.1893, + "step": 11520 + }, + { + "epoch": 0.368924583240009, + "grad_norm": 5.40625, + "learning_rate": 1.3286628494442575e-05, + "loss": 0.0839, + "step": 11530 + }, + { + "epoch": 0.3692445525229578, + "grad_norm": 3.046875, + "learning_rate": 1.3279892219602561e-05, + "loss": 0.0858, + "step": 11540 + }, + { + "epoch": 0.36956452180590665, + "grad_norm": 1.65625, + "learning_rate": 1.3273155944762549e-05, + "loss": 0.0876, + "step": 11550 + }, + { + "epoch": 0.36988449108885546, + "grad_norm": 17.375, + "learning_rate": 1.3266419669922535e-05, + "loss": 0.181, + "step": 11560 + }, + { + "epoch": 0.3702044603718043, + "grad_norm": 10.1875, + "learning_rate": 1.325968339508252e-05, + "loss": 0.1222, + "step": 11570 + }, + { + "epoch": 0.37052442965475313, + "grad_norm": 9.8125, + "learning_rate": 1.3252947120242507e-05, + "loss": 0.1103, + "step": 11580 + }, + { + "epoch": 0.370844398937702, + "grad_norm": 0.5078125, + "learning_rate": 1.3246210845402495e-05, + "loss": 0.0888, + "step": 11590 + }, + { + "epoch": 0.3711643682206508, + "grad_norm": 0.53125, + "learning_rate": 1.323947457056248e-05, + "loss": 0.1216, + "step": 11600 + }, + { + "epoch": 0.37148433750359966, + "grad_norm": 20.875, + "learning_rate": 1.3232738295722467e-05, + "loss": 0.1632, + "step": 11610 + }, + { + "epoch": 0.37180430678654847, + "grad_norm": 5.40625, + "learning_rate": 1.3226002020882454e-05, + "loss": 0.1715, + "step": 11620 + }, + { + "epoch": 0.37212427606949733, + "grad_norm": 1.59375, + "learning_rate": 1.321926574604244e-05, + "loss": 0.1272, + "step": 11630 + }, + { + "epoch": 0.3724442453524462, + "grad_norm": 60.25, + "learning_rate": 1.3212529471202426e-05, + "loss": 0.1158, + "step": 11640 + }, + { + "epoch": 0.372764214635395, + "grad_norm": 9.5, + "learning_rate": 1.3205793196362413e-05, + "loss": 0.1397, + "step": 11650 + }, + { + "epoch": 0.37308418391834386, + "grad_norm": 22.5, + "learning_rate": 1.31990569215224e-05, + "loss": 0.1447, + "step": 11660 + }, + { + "epoch": 0.37340415320129267, + "grad_norm": 14.3125, + "learning_rate": 1.3192320646682386e-05, + "loss": 0.11, + "step": 11670 + }, + { + "epoch": 0.3737241224842415, + "grad_norm": 8.125, + "learning_rate": 1.3185584371842372e-05, + "loss": 0.1564, + "step": 11680 + }, + { + "epoch": 0.37404409176719033, + "grad_norm": 18.625, + "learning_rate": 1.3178848097002358e-05, + "loss": 0.1502, + "step": 11690 + }, + { + "epoch": 0.3743640610501392, + "grad_norm": 21.625, + "learning_rate": 1.3172111822162346e-05, + "loss": 0.1343, + "step": 11700 + }, + { + "epoch": 0.374684030333088, + "grad_norm": 8.1875, + "learning_rate": 1.3165375547322332e-05, + "loss": 0.0997, + "step": 11710 + }, + { + "epoch": 0.37500399961603686, + "grad_norm": 9.6875, + "learning_rate": 1.3158639272482318e-05, + "loss": 0.087, + "step": 11720 + }, + { + "epoch": 0.37532396889898567, + "grad_norm": 12.125, + "learning_rate": 1.3151902997642306e-05, + "loss": 0.1206, + "step": 11730 + }, + { + "epoch": 0.37564393818193453, + "grad_norm": 21.125, + "learning_rate": 1.3145166722802292e-05, + "loss": 0.1031, + "step": 11740 + }, + { + "epoch": 0.3759639074648834, + "grad_norm": 9.75, + "learning_rate": 1.3138430447962278e-05, + "loss": 0.0723, + "step": 11750 + }, + { + "epoch": 0.3762838767478322, + "grad_norm": 5.90625, + "learning_rate": 1.3131694173122264e-05, + "loss": 0.1148, + "step": 11760 + }, + { + "epoch": 0.37660384603078106, + "grad_norm": 66.0, + "learning_rate": 1.3124957898282252e-05, + "loss": 0.1133, + "step": 11770 + }, + { + "epoch": 0.37692381531372987, + "grad_norm": 2.21875, + "learning_rate": 1.3118221623442238e-05, + "loss": 0.0676, + "step": 11780 + }, + { + "epoch": 0.37724378459667873, + "grad_norm": 31.0, + "learning_rate": 1.3111485348602224e-05, + "loss": 0.1, + "step": 11790 + }, + { + "epoch": 0.37756375387962754, + "grad_norm": 1.375, + "learning_rate": 1.310474907376221e-05, + "loss": 0.0846, + "step": 11800 + }, + { + "epoch": 0.3778837231625764, + "grad_norm": 5.6875, + "learning_rate": 1.3098012798922198e-05, + "loss": 0.0763, + "step": 11810 + }, + { + "epoch": 0.3782036924455252, + "grad_norm": 6.53125, + "learning_rate": 1.3091276524082184e-05, + "loss": 0.1412, + "step": 11820 + }, + { + "epoch": 0.37852366172847407, + "grad_norm": 18.5, + "learning_rate": 1.308454024924217e-05, + "loss": 0.16, + "step": 11830 + }, + { + "epoch": 0.3788436310114229, + "grad_norm": 13.3125, + "learning_rate": 1.3077803974402157e-05, + "loss": 0.1328, + "step": 11840 + }, + { + "epoch": 0.37916360029437174, + "grad_norm": 20.0, + "learning_rate": 1.3071067699562143e-05, + "loss": 0.1119, + "step": 11850 + }, + { + "epoch": 0.3794835695773206, + "grad_norm": 8.1875, + "learning_rate": 1.306433142472213e-05, + "loss": 0.1059, + "step": 11860 + }, + { + "epoch": 0.3798035388602694, + "grad_norm": 9.9375, + "learning_rate": 1.3057595149882116e-05, + "loss": 0.1181, + "step": 11870 + }, + { + "epoch": 0.38012350814321827, + "grad_norm": 0.6875, + "learning_rate": 1.3050858875042103e-05, + "loss": 0.1044, + "step": 11880 + }, + { + "epoch": 0.3804434774261671, + "grad_norm": 9.375, + "learning_rate": 1.304412260020209e-05, + "loss": 0.1582, + "step": 11890 + }, + { + "epoch": 0.38076344670911594, + "grad_norm": 12.5625, + "learning_rate": 1.3037386325362075e-05, + "loss": 0.0781, + "step": 11900 + }, + { + "epoch": 0.38108341599206474, + "grad_norm": 9.0, + "learning_rate": 1.3030650050522063e-05, + "loss": 0.1283, + "step": 11910 + }, + { + "epoch": 0.3814033852750136, + "grad_norm": 28.5, + "learning_rate": 1.302391377568205e-05, + "loss": 0.152, + "step": 11920 + }, + { + "epoch": 0.3817233545579624, + "grad_norm": 1.9140625, + "learning_rate": 1.3017177500842035e-05, + "loss": 0.0666, + "step": 11930 + }, + { + "epoch": 0.3820433238409113, + "grad_norm": 24.5, + "learning_rate": 1.3010441226002021e-05, + "loss": 0.1481, + "step": 11940 + }, + { + "epoch": 0.3823632931238601, + "grad_norm": 5.40625, + "learning_rate": 1.3003704951162009e-05, + "loss": 0.1329, + "step": 11950 + }, + { + "epoch": 0.38268326240680894, + "grad_norm": 31.75, + "learning_rate": 1.2996968676321995e-05, + "loss": 0.1511, + "step": 11960 + }, + { + "epoch": 0.3830032316897578, + "grad_norm": 5.78125, + "learning_rate": 1.2990232401481981e-05, + "loss": 0.0712, + "step": 11970 + }, + { + "epoch": 0.3833232009727066, + "grad_norm": 5.75, + "learning_rate": 1.2983496126641967e-05, + "loss": 0.1224, + "step": 11980 + }, + { + "epoch": 0.3836431702556555, + "grad_norm": 11.9375, + "learning_rate": 1.2976759851801955e-05, + "loss": 0.0876, + "step": 11990 + }, + { + "epoch": 0.3839631395386043, + "grad_norm": 15.5625, + "learning_rate": 1.2970023576961941e-05, + "loss": 0.1448, + "step": 12000 + }, + { + "epoch": 0.38428310882155314, + "grad_norm": 10.0, + "learning_rate": 1.2963287302121927e-05, + "loss": 0.1001, + "step": 12010 + }, + { + "epoch": 0.38460307810450195, + "grad_norm": 12.8125, + "learning_rate": 1.2956551027281915e-05, + "loss": 0.1316, + "step": 12020 + }, + { + "epoch": 0.3849230473874508, + "grad_norm": 1.3203125, + "learning_rate": 1.29498147524419e-05, + "loss": 0.1228, + "step": 12030 + }, + { + "epoch": 0.3852430166703996, + "grad_norm": 3.125, + "learning_rate": 1.2943078477601887e-05, + "loss": 0.0713, + "step": 12040 + }, + { + "epoch": 0.3855629859533485, + "grad_norm": 21.875, + "learning_rate": 1.2936342202761873e-05, + "loss": 0.1133, + "step": 12050 + }, + { + "epoch": 0.38588295523629734, + "grad_norm": 16.625, + "learning_rate": 1.292960592792186e-05, + "loss": 0.1818, + "step": 12060 + }, + { + "epoch": 0.38620292451924615, + "grad_norm": 17.25, + "learning_rate": 1.2922869653081847e-05, + "loss": 0.0688, + "step": 12070 + }, + { + "epoch": 0.386522893802195, + "grad_norm": 15.375, + "learning_rate": 1.2916133378241833e-05, + "loss": 0.0991, + "step": 12080 + }, + { + "epoch": 0.3868428630851438, + "grad_norm": 13.625, + "learning_rate": 1.290939710340182e-05, + "loss": 0.1509, + "step": 12090 + }, + { + "epoch": 0.3871628323680927, + "grad_norm": 11.3125, + "learning_rate": 1.2902660828561806e-05, + "loss": 0.1587, + "step": 12100 + }, + { + "epoch": 0.3874828016510415, + "grad_norm": 11.3125, + "learning_rate": 1.2895924553721792e-05, + "loss": 0.1229, + "step": 12110 + }, + { + "epoch": 0.38780277093399035, + "grad_norm": 5.5, + "learning_rate": 1.2889188278881778e-05, + "loss": 0.1131, + "step": 12120 + }, + { + "epoch": 0.38812274021693915, + "grad_norm": 12.375, + "learning_rate": 1.2882452004041766e-05, + "loss": 0.1133, + "step": 12130 + }, + { + "epoch": 0.388442709499888, + "grad_norm": 6.6875, + "learning_rate": 1.2875715729201752e-05, + "loss": 0.0825, + "step": 12140 + }, + { + "epoch": 0.3887626787828368, + "grad_norm": 6.625, + "learning_rate": 1.2868979454361738e-05, + "loss": 0.1165, + "step": 12150 + }, + { + "epoch": 0.3890826480657857, + "grad_norm": 10.25, + "learning_rate": 1.2862243179521724e-05, + "loss": 0.1629, + "step": 12160 + }, + { + "epoch": 0.38940261734873455, + "grad_norm": 10.0, + "learning_rate": 1.2855506904681712e-05, + "loss": 0.1254, + "step": 12170 + }, + { + "epoch": 0.38972258663168335, + "grad_norm": 16.375, + "learning_rate": 1.2848770629841698e-05, + "loss": 0.1573, + "step": 12180 + }, + { + "epoch": 0.3900425559146322, + "grad_norm": 7.75, + "learning_rate": 1.2842034355001684e-05, + "loss": 0.1404, + "step": 12190 + }, + { + "epoch": 0.390362525197581, + "grad_norm": 9.0, + "learning_rate": 1.2835298080161672e-05, + "loss": 0.0921, + "step": 12200 + }, + { + "epoch": 0.3906824944805299, + "grad_norm": 14.0625, + "learning_rate": 1.2828561805321658e-05, + "loss": 0.1462, + "step": 12210 + }, + { + "epoch": 0.3910024637634787, + "grad_norm": 11.9375, + "learning_rate": 1.2821825530481644e-05, + "loss": 0.0841, + "step": 12220 + }, + { + "epoch": 0.39132243304642755, + "grad_norm": 7.1875, + "learning_rate": 1.281508925564163e-05, + "loss": 0.1122, + "step": 12230 + }, + { + "epoch": 0.39164240232937636, + "grad_norm": 8.6875, + "learning_rate": 1.2808352980801618e-05, + "loss": 0.046, + "step": 12240 + }, + { + "epoch": 0.3919623716123252, + "grad_norm": 4.65625, + "learning_rate": 1.2801616705961604e-05, + "loss": 0.1462, + "step": 12250 + }, + { + "epoch": 0.39228234089527403, + "grad_norm": 5.09375, + "learning_rate": 1.279488043112159e-05, + "loss": 0.0608, + "step": 12260 + }, + { + "epoch": 0.3926023101782229, + "grad_norm": 0.81640625, + "learning_rate": 1.2788144156281576e-05, + "loss": 0.1465, + "step": 12270 + }, + { + "epoch": 0.39292227946117175, + "grad_norm": 31.5, + "learning_rate": 1.2781407881441564e-05, + "loss": 0.1733, + "step": 12280 + }, + { + "epoch": 0.39324224874412056, + "grad_norm": 6.21875, + "learning_rate": 1.277467160660155e-05, + "loss": 0.1205, + "step": 12290 + }, + { + "epoch": 0.3935622180270694, + "grad_norm": 3.015625, + "learning_rate": 1.2767935331761536e-05, + "loss": 0.1399, + "step": 12300 + }, + { + "epoch": 0.39388218731001823, + "grad_norm": 11.5, + "learning_rate": 1.2761199056921523e-05, + "loss": 0.2015, + "step": 12310 + }, + { + "epoch": 0.3942021565929671, + "grad_norm": 13.1875, + "learning_rate": 1.275446278208151e-05, + "loss": 0.1061, + "step": 12320 + }, + { + "epoch": 0.3945221258759159, + "grad_norm": 14.6875, + "learning_rate": 1.2747726507241496e-05, + "loss": 0.1066, + "step": 12330 + }, + { + "epoch": 0.39484209515886476, + "grad_norm": 11.6875, + "learning_rate": 1.2740990232401482e-05, + "loss": 0.1269, + "step": 12340 + }, + { + "epoch": 0.39516206444181357, + "grad_norm": 8.5625, + "learning_rate": 1.273425395756147e-05, + "loss": 0.1048, + "step": 12350 + }, + { + "epoch": 0.3954820337247624, + "grad_norm": 5.78125, + "learning_rate": 1.2727517682721455e-05, + "loss": 0.1347, + "step": 12360 + }, + { + "epoch": 0.39580200300771123, + "grad_norm": 21.25, + "learning_rate": 1.2720781407881441e-05, + "loss": 0.1056, + "step": 12370 + }, + { + "epoch": 0.3961219722906601, + "grad_norm": 7.90625, + "learning_rate": 1.2714045133041429e-05, + "loss": 0.0858, + "step": 12380 + }, + { + "epoch": 0.39644194157360896, + "grad_norm": 1.171875, + "learning_rate": 1.2707308858201415e-05, + "loss": 0.0857, + "step": 12390 + }, + { + "epoch": 0.39676191085655776, + "grad_norm": 9.3125, + "learning_rate": 1.2700572583361401e-05, + "loss": 0.1692, + "step": 12400 + }, + { + "epoch": 0.3970818801395066, + "grad_norm": 2.984375, + "learning_rate": 1.2693836308521387e-05, + "loss": 0.1272, + "step": 12410 + }, + { + "epoch": 0.39740184942245543, + "grad_norm": 13.375, + "learning_rate": 1.2687100033681375e-05, + "loss": 0.1234, + "step": 12420 + }, + { + "epoch": 0.3977218187054043, + "grad_norm": 6.15625, + "learning_rate": 1.2680363758841361e-05, + "loss": 0.107, + "step": 12430 + }, + { + "epoch": 0.3980417879883531, + "grad_norm": 11.1875, + "learning_rate": 1.2673627484001347e-05, + "loss": 0.1873, + "step": 12440 + }, + { + "epoch": 0.39836175727130196, + "grad_norm": 3.3125, + "learning_rate": 1.2666891209161333e-05, + "loss": 0.085, + "step": 12450 + }, + { + "epoch": 0.39868172655425077, + "grad_norm": 9.1875, + "learning_rate": 1.266015493432132e-05, + "loss": 0.1116, + "step": 12460 + }, + { + "epoch": 0.39900169583719963, + "grad_norm": 4.21875, + "learning_rate": 1.2653418659481307e-05, + "loss": 0.111, + "step": 12470 + }, + { + "epoch": 0.39932166512014844, + "grad_norm": 11.0625, + "learning_rate": 1.2646682384641293e-05, + "loss": 0.0514, + "step": 12480 + }, + { + "epoch": 0.3996416344030973, + "grad_norm": 1.890625, + "learning_rate": 1.263994610980128e-05, + "loss": 0.0951, + "step": 12490 + }, + { + "epoch": 0.39996160368604616, + "grad_norm": 6.59375, + "learning_rate": 1.2633209834961267e-05, + "loss": 0.1671, + "step": 12500 + }, + { + "epoch": 0.40028157296899497, + "grad_norm": 30.125, + "learning_rate": 1.2626473560121253e-05, + "loss": 0.1168, + "step": 12510 + }, + { + "epoch": 0.40060154225194383, + "grad_norm": 22.625, + "learning_rate": 1.2619737285281239e-05, + "loss": 0.0853, + "step": 12520 + }, + { + "epoch": 0.40092151153489264, + "grad_norm": 20.25, + "learning_rate": 1.2613001010441226e-05, + "loss": 0.138, + "step": 12530 + }, + { + "epoch": 0.4012414808178415, + "grad_norm": 23.125, + "learning_rate": 1.2606264735601214e-05, + "loss": 0.0791, + "step": 12540 + }, + { + "epoch": 0.4015614501007903, + "grad_norm": 11.8125, + "learning_rate": 1.2599528460761202e-05, + "loss": 0.1199, + "step": 12550 + }, + { + "epoch": 0.40188141938373917, + "grad_norm": 19.75, + "learning_rate": 1.2592792185921188e-05, + "loss": 0.1116, + "step": 12560 + }, + { + "epoch": 0.402201388666688, + "grad_norm": 1.1015625, + "learning_rate": 1.2586055911081174e-05, + "loss": 0.1268, + "step": 12570 + }, + { + "epoch": 0.40252135794963684, + "grad_norm": 10.3125, + "learning_rate": 1.257931963624116e-05, + "loss": 0.1817, + "step": 12580 + }, + { + "epoch": 0.40284132723258564, + "grad_norm": 22.25, + "learning_rate": 1.2572583361401148e-05, + "loss": 0.0754, + "step": 12590 + }, + { + "epoch": 0.4031612965155345, + "grad_norm": 13.625, + "learning_rate": 1.2565847086561134e-05, + "loss": 0.1348, + "step": 12600 + }, + { + "epoch": 0.40348126579848337, + "grad_norm": 5.96875, + "learning_rate": 1.255911081172112e-05, + "loss": 0.0907, + "step": 12610 + }, + { + "epoch": 0.4038012350814322, + "grad_norm": 11.0, + "learning_rate": 1.2552374536881106e-05, + "loss": 0.1318, + "step": 12620 + }, + { + "epoch": 0.40412120436438104, + "grad_norm": 10.25, + "learning_rate": 1.2545638262041094e-05, + "loss": 0.1327, + "step": 12630 + }, + { + "epoch": 0.40444117364732984, + "grad_norm": 12.3125, + "learning_rate": 1.253890198720108e-05, + "loss": 0.1387, + "step": 12640 + }, + { + "epoch": 0.4047611429302787, + "grad_norm": 15.25, + "learning_rate": 1.2532165712361066e-05, + "loss": 0.1375, + "step": 12650 + }, + { + "epoch": 0.4050811122132275, + "grad_norm": 3.296875, + "learning_rate": 1.2525429437521054e-05, + "loss": 0.0695, + "step": 12660 + }, + { + "epoch": 0.4054010814961764, + "grad_norm": 0.94140625, + "learning_rate": 1.251869316268104e-05, + "loss": 0.1343, + "step": 12670 + }, + { + "epoch": 0.4057210507791252, + "grad_norm": 1.0234375, + "learning_rate": 1.2511956887841026e-05, + "loss": 0.0992, + "step": 12680 + }, + { + "epoch": 0.40604102006207404, + "grad_norm": 11.625, + "learning_rate": 1.2505220613001012e-05, + "loss": 0.1291, + "step": 12690 + }, + { + "epoch": 0.4063609893450229, + "grad_norm": 1.40625, + "learning_rate": 1.2498484338161e-05, + "loss": 0.0713, + "step": 12700 + }, + { + "epoch": 0.4066809586279717, + "grad_norm": 13.0625, + "learning_rate": 1.2491748063320985e-05, + "loss": 0.1136, + "step": 12710 + }, + { + "epoch": 0.4070009279109206, + "grad_norm": 10.25, + "learning_rate": 1.2485011788480971e-05, + "loss": 0.104, + "step": 12720 + }, + { + "epoch": 0.4073208971938694, + "grad_norm": 10.625, + "learning_rate": 1.247827551364096e-05, + "loss": 0.1258, + "step": 12730 + }, + { + "epoch": 0.40764086647681824, + "grad_norm": 11.6875, + "learning_rate": 1.2471539238800945e-05, + "loss": 0.1554, + "step": 12740 + }, + { + "epoch": 0.40796083575976705, + "grad_norm": 2.046875, + "learning_rate": 1.2464802963960931e-05, + "loss": 0.1023, + "step": 12750 + }, + { + "epoch": 0.4082808050427159, + "grad_norm": 1.8125, + "learning_rate": 1.2458066689120917e-05, + "loss": 0.1532, + "step": 12760 + }, + { + "epoch": 0.4086007743256647, + "grad_norm": 6.96875, + "learning_rate": 1.2451330414280905e-05, + "loss": 0.1538, + "step": 12770 + }, + { + "epoch": 0.4089207436086136, + "grad_norm": 0.640625, + "learning_rate": 1.2444594139440891e-05, + "loss": 0.0604, + "step": 12780 + }, + { + "epoch": 0.4092407128915624, + "grad_norm": 1.0, + "learning_rate": 1.2437857864600877e-05, + "loss": 0.1201, + "step": 12790 + }, + { + "epoch": 0.40956068217451125, + "grad_norm": 10.5, + "learning_rate": 1.2431121589760863e-05, + "loss": 0.1449, + "step": 12800 + }, + { + "epoch": 0.4098806514574601, + "grad_norm": 6.46875, + "learning_rate": 1.2424385314920851e-05, + "loss": 0.1129, + "step": 12810 + }, + { + "epoch": 0.4102006207404089, + "grad_norm": 28.875, + "learning_rate": 1.2417649040080837e-05, + "loss": 0.0656, + "step": 12820 + }, + { + "epoch": 0.4105205900233578, + "grad_norm": 14.75, + "learning_rate": 1.2410912765240823e-05, + "loss": 0.072, + "step": 12830 + }, + { + "epoch": 0.4108405593063066, + "grad_norm": 17.625, + "learning_rate": 1.240417649040081e-05, + "loss": 0.0477, + "step": 12840 + }, + { + "epoch": 0.41116052858925545, + "grad_norm": 18.75, + "learning_rate": 1.2397440215560797e-05, + "loss": 0.0945, + "step": 12850 + }, + { + "epoch": 0.41148049787220425, + "grad_norm": 3.796875, + "learning_rate": 1.2390703940720783e-05, + "loss": 0.1411, + "step": 12860 + }, + { + "epoch": 0.4118004671551531, + "grad_norm": 12.8125, + "learning_rate": 1.2383967665880769e-05, + "loss": 0.1358, + "step": 12870 + }, + { + "epoch": 0.4121204364381019, + "grad_norm": 18.125, + "learning_rate": 1.2377231391040757e-05, + "loss": 0.108, + "step": 12880 + }, + { + "epoch": 0.4124404057210508, + "grad_norm": 13.3125, + "learning_rate": 1.2370495116200743e-05, + "loss": 0.141, + "step": 12890 + }, + { + "epoch": 0.4127603750039996, + "grad_norm": 8.6875, + "learning_rate": 1.2363758841360729e-05, + "loss": 0.0725, + "step": 12900 + }, + { + "epoch": 0.41308034428694845, + "grad_norm": 16.125, + "learning_rate": 1.2357022566520715e-05, + "loss": 0.1443, + "step": 12910 + }, + { + "epoch": 0.4134003135698973, + "grad_norm": 6.6875, + "learning_rate": 1.2350286291680702e-05, + "loss": 0.0807, + "step": 12920 + }, + { + "epoch": 0.4137202828528461, + "grad_norm": 15.0, + "learning_rate": 1.2343550016840688e-05, + "loss": 0.0965, + "step": 12930 + }, + { + "epoch": 0.414040252135795, + "grad_norm": 13.125, + "learning_rate": 1.2336813742000675e-05, + "loss": 0.1492, + "step": 12940 + }, + { + "epoch": 0.4143602214187438, + "grad_norm": 17.125, + "learning_rate": 1.2330077467160662e-05, + "loss": 0.217, + "step": 12950 + }, + { + "epoch": 0.41468019070169265, + "grad_norm": 4.03125, + "learning_rate": 1.2323341192320648e-05, + "loss": 0.087, + "step": 12960 + }, + { + "epoch": 0.41500015998464146, + "grad_norm": 7.90625, + "learning_rate": 1.2316604917480634e-05, + "loss": 0.1075, + "step": 12970 + }, + { + "epoch": 0.4153201292675903, + "grad_norm": 9.5625, + "learning_rate": 1.230986864264062e-05, + "loss": 0.0746, + "step": 12980 + }, + { + "epoch": 0.4156400985505391, + "grad_norm": 0.92578125, + "learning_rate": 1.2303132367800608e-05, + "loss": 0.0914, + "step": 12990 + }, + { + "epoch": 0.415960067833488, + "grad_norm": 22.5, + "learning_rate": 1.2296396092960594e-05, + "loss": 0.1077, + "step": 13000 + }, + { + "epoch": 0.4162800371164368, + "grad_norm": 9.5625, + "learning_rate": 1.228965981812058e-05, + "loss": 0.1814, + "step": 13010 + }, + { + "epoch": 0.41660000639938566, + "grad_norm": 0.6640625, + "learning_rate": 1.2282923543280568e-05, + "loss": 0.135, + "step": 13020 + }, + { + "epoch": 0.4169199756823345, + "grad_norm": 13.0625, + "learning_rate": 1.2276187268440554e-05, + "loss": 0.1697, + "step": 13030 + }, + { + "epoch": 0.4172399449652833, + "grad_norm": 26.5, + "learning_rate": 1.226945099360054e-05, + "loss": 0.0988, + "step": 13040 + }, + { + "epoch": 0.4175599142482322, + "grad_norm": 8.875, + "learning_rate": 1.2262714718760526e-05, + "loss": 0.1446, + "step": 13050 + }, + { + "epoch": 0.417879883531181, + "grad_norm": 17.5, + "learning_rate": 1.2255978443920514e-05, + "loss": 0.1545, + "step": 13060 + }, + { + "epoch": 0.41819985281412986, + "grad_norm": 5.1875, + "learning_rate": 1.22492421690805e-05, + "loss": 0.1427, + "step": 13070 + }, + { + "epoch": 0.41851982209707866, + "grad_norm": 9.25, + "learning_rate": 1.2242505894240486e-05, + "loss": 0.1168, + "step": 13080 + }, + { + "epoch": 0.4188397913800275, + "grad_norm": 6.15625, + "learning_rate": 1.2235769619400472e-05, + "loss": 0.0839, + "step": 13090 + }, + { + "epoch": 0.41915976066297633, + "grad_norm": 4.84375, + "learning_rate": 1.222903334456046e-05, + "loss": 0.1375, + "step": 13100 + }, + { + "epoch": 0.4194797299459252, + "grad_norm": 15.1875, + "learning_rate": 1.2222297069720446e-05, + "loss": 0.1034, + "step": 13110 + }, + { + "epoch": 0.419799699228874, + "grad_norm": 14.625, + "learning_rate": 1.2215560794880432e-05, + "loss": 0.1141, + "step": 13120 + }, + { + "epoch": 0.42011966851182286, + "grad_norm": 6.875, + "learning_rate": 1.220882452004042e-05, + "loss": 0.1263, + "step": 13130 + }, + { + "epoch": 0.4204396377947717, + "grad_norm": 1.9375, + "learning_rate": 1.2202088245200406e-05, + "loss": 0.1666, + "step": 13140 + }, + { + "epoch": 0.42075960707772053, + "grad_norm": 12.6875, + "learning_rate": 1.2195351970360392e-05, + "loss": 0.1097, + "step": 13150 + }, + { + "epoch": 0.4210795763606694, + "grad_norm": 8.125, + "learning_rate": 1.2188615695520378e-05, + "loss": 0.0806, + "step": 13160 + }, + { + "epoch": 0.4213995456436182, + "grad_norm": 9.1875, + "learning_rate": 1.2181879420680365e-05, + "loss": 0.0913, + "step": 13170 + }, + { + "epoch": 0.42171951492656706, + "grad_norm": 24.875, + "learning_rate": 1.2175143145840351e-05, + "loss": 0.1109, + "step": 13180 + }, + { + "epoch": 0.42203948420951587, + "grad_norm": 0.62890625, + "learning_rate": 1.2168406871000337e-05, + "loss": 0.0954, + "step": 13190 + }, + { + "epoch": 0.42235945349246473, + "grad_norm": 6.375, + "learning_rate": 1.2161670596160325e-05, + "loss": 0.097, + "step": 13200 + }, + { + "epoch": 0.42267942277541354, + "grad_norm": 0.9140625, + "learning_rate": 1.2154934321320311e-05, + "loss": 0.092, + "step": 13210 + }, + { + "epoch": 0.4229993920583624, + "grad_norm": 7.53125, + "learning_rate": 1.2148198046480297e-05, + "loss": 0.1513, + "step": 13220 + }, + { + "epoch": 0.4233193613413112, + "grad_norm": 2.734375, + "learning_rate": 1.2141461771640283e-05, + "loss": 0.0842, + "step": 13230 + }, + { + "epoch": 0.42363933062426007, + "grad_norm": 44.75, + "learning_rate": 1.2134725496800271e-05, + "loss": 0.1017, + "step": 13240 + }, + { + "epoch": 0.42395929990720893, + "grad_norm": 16.75, + "learning_rate": 1.2127989221960257e-05, + "loss": 0.0391, + "step": 13250 + }, + { + "epoch": 0.42427926919015774, + "grad_norm": 4.625, + "learning_rate": 1.2121252947120243e-05, + "loss": 0.0945, + "step": 13260 + }, + { + "epoch": 0.4245992384731066, + "grad_norm": 22.125, + "learning_rate": 1.2114516672280229e-05, + "loss": 0.0986, + "step": 13270 + }, + { + "epoch": 0.4249192077560554, + "grad_norm": 5.875, + "learning_rate": 1.2107780397440217e-05, + "loss": 0.1013, + "step": 13280 + }, + { + "epoch": 0.42523917703900427, + "grad_norm": 28.75, + "learning_rate": 1.2101044122600203e-05, + "loss": 0.1588, + "step": 13290 + }, + { + "epoch": 0.4255591463219531, + "grad_norm": 0.88671875, + "learning_rate": 1.2094307847760189e-05, + "loss": 0.1424, + "step": 13300 + }, + { + "epoch": 0.42587911560490194, + "grad_norm": 12.8125, + "learning_rate": 1.2087571572920177e-05, + "loss": 0.1216, + "step": 13310 + }, + { + "epoch": 0.42619908488785074, + "grad_norm": 22.0, + "learning_rate": 1.2080835298080163e-05, + "loss": 0.1575, + "step": 13320 + }, + { + "epoch": 0.4265190541707996, + "grad_norm": 25.625, + "learning_rate": 1.2074099023240149e-05, + "loss": 0.0764, + "step": 13330 + }, + { + "epoch": 0.42683902345374847, + "grad_norm": 4.5, + "learning_rate": 1.2067362748400135e-05, + "loss": 0.1406, + "step": 13340 + }, + { + "epoch": 0.4271589927366973, + "grad_norm": 9.0, + "learning_rate": 1.2060626473560123e-05, + "loss": 0.103, + "step": 13350 + }, + { + "epoch": 0.42747896201964614, + "grad_norm": 7.1875, + "learning_rate": 1.2053890198720109e-05, + "loss": 0.1186, + "step": 13360 + }, + { + "epoch": 0.42779893130259494, + "grad_norm": 13.25, + "learning_rate": 1.2047153923880095e-05, + "loss": 0.148, + "step": 13370 + }, + { + "epoch": 0.4281189005855438, + "grad_norm": 14.1875, + "learning_rate": 1.204041764904008e-05, + "loss": 0.08, + "step": 13380 + }, + { + "epoch": 0.4284388698684926, + "grad_norm": 8.75, + "learning_rate": 1.2033681374200068e-05, + "loss": 0.0665, + "step": 13390 + }, + { + "epoch": 0.4287588391514415, + "grad_norm": 6.53125, + "learning_rate": 1.2026945099360054e-05, + "loss": 0.0673, + "step": 13400 + }, + { + "epoch": 0.4290788084343903, + "grad_norm": 24.875, + "learning_rate": 1.202020882452004e-05, + "loss": 0.0881, + "step": 13410 + }, + { + "epoch": 0.42939877771733914, + "grad_norm": 11.6875, + "learning_rate": 1.2013472549680028e-05, + "loss": 0.087, + "step": 13420 + }, + { + "epoch": 0.42971874700028795, + "grad_norm": 17.0, + "learning_rate": 1.2006736274840014e-05, + "loss": 0.0621, + "step": 13430 + }, + { + "epoch": 0.4300387162832368, + "grad_norm": 16.125, + "learning_rate": 1.2e-05, + "loss": 0.0973, + "step": 13440 + }, + { + "epoch": 0.4303586855661857, + "grad_norm": 0.875, + "learning_rate": 1.1993263725159986e-05, + "loss": 0.1739, + "step": 13450 + }, + { + "epoch": 0.4306786548491345, + "grad_norm": 12.9375, + "learning_rate": 1.1986527450319974e-05, + "loss": 0.0507, + "step": 13460 + }, + { + "epoch": 0.43099862413208334, + "grad_norm": 6.75, + "learning_rate": 1.197979117547996e-05, + "loss": 0.1018, + "step": 13470 + }, + { + "epoch": 0.43131859341503215, + "grad_norm": 15.6875, + "learning_rate": 1.1973054900639946e-05, + "loss": 0.1714, + "step": 13480 + }, + { + "epoch": 0.431638562697981, + "grad_norm": 52.5, + "learning_rate": 1.1966318625799934e-05, + "loss": 0.1345, + "step": 13490 + }, + { + "epoch": 0.4319585319809298, + "grad_norm": 9.0625, + "learning_rate": 1.195958235095992e-05, + "loss": 0.1365, + "step": 13500 + }, + { + "epoch": 0.4322785012638787, + "grad_norm": 9.3125, + "learning_rate": 1.1952846076119906e-05, + "loss": 0.0933, + "step": 13510 + }, + { + "epoch": 0.4325984705468275, + "grad_norm": 62.25, + "learning_rate": 1.1946109801279892e-05, + "loss": 0.1471, + "step": 13520 + }, + { + "epoch": 0.43291843982977635, + "grad_norm": 11.125, + "learning_rate": 1.193937352643988e-05, + "loss": 0.1052, + "step": 13530 + }, + { + "epoch": 0.43323840911272515, + "grad_norm": 8.9375, + "learning_rate": 1.1932637251599866e-05, + "loss": 0.1031, + "step": 13540 + }, + { + "epoch": 0.433558378395674, + "grad_norm": 3.046875, + "learning_rate": 1.1925900976759852e-05, + "loss": 0.1373, + "step": 13550 + }, + { + "epoch": 0.4338783476786229, + "grad_norm": 8.75, + "learning_rate": 1.1919164701919838e-05, + "loss": 0.071, + "step": 13560 + }, + { + "epoch": 0.4341983169615717, + "grad_norm": 4.84375, + "learning_rate": 1.1912428427079826e-05, + "loss": 0.1588, + "step": 13570 + }, + { + "epoch": 0.43451828624452055, + "grad_norm": 32.5, + "learning_rate": 1.1905692152239812e-05, + "loss": 0.1497, + "step": 13580 + }, + { + "epoch": 0.43483825552746935, + "grad_norm": 11.375, + "learning_rate": 1.1898955877399798e-05, + "loss": 0.0853, + "step": 13590 + }, + { + "epoch": 0.4351582248104182, + "grad_norm": 2.5625, + "learning_rate": 1.1892219602559785e-05, + "loss": 0.0652, + "step": 13600 + }, + { + "epoch": 0.435478194093367, + "grad_norm": 15.3125, + "learning_rate": 1.1885483327719771e-05, + "loss": 0.115, + "step": 13610 + }, + { + "epoch": 0.4357981633763159, + "grad_norm": 28.0, + "learning_rate": 1.1878747052879758e-05, + "loss": 0.0928, + "step": 13620 + }, + { + "epoch": 0.4361181326592647, + "grad_norm": 13.5, + "learning_rate": 1.1872010778039744e-05, + "loss": 0.1408, + "step": 13630 + }, + { + "epoch": 0.43643810194221355, + "grad_norm": 4.25, + "learning_rate": 1.1865274503199731e-05, + "loss": 0.1242, + "step": 13640 + }, + { + "epoch": 0.43675807122516236, + "grad_norm": 5.0625, + "learning_rate": 1.1858538228359717e-05, + "loss": 0.0633, + "step": 13650 + }, + { + "epoch": 0.4370780405081112, + "grad_norm": 20.375, + "learning_rate": 1.1851801953519703e-05, + "loss": 0.1442, + "step": 13660 + }, + { + "epoch": 0.4373980097910601, + "grad_norm": 13.0, + "learning_rate": 1.184506567867969e-05, + "loss": 0.1632, + "step": 13670 + }, + { + "epoch": 0.4377179790740089, + "grad_norm": 7.71875, + "learning_rate": 1.1838329403839677e-05, + "loss": 0.1413, + "step": 13680 + }, + { + "epoch": 0.43803794835695775, + "grad_norm": 16.5, + "learning_rate": 1.1831593128999663e-05, + "loss": 0.0996, + "step": 13690 + }, + { + "epoch": 0.43835791763990656, + "grad_norm": 4.6875, + "learning_rate": 1.182485685415965e-05, + "loss": 0.1778, + "step": 13700 + }, + { + "epoch": 0.4386778869228554, + "grad_norm": 14.625, + "learning_rate": 1.1818120579319637e-05, + "loss": 0.103, + "step": 13710 + }, + { + "epoch": 0.4389978562058042, + "grad_norm": 16.875, + "learning_rate": 1.1811384304479623e-05, + "loss": 0.127, + "step": 13720 + }, + { + "epoch": 0.4393178254887531, + "grad_norm": 8.625, + "learning_rate": 1.1804648029639609e-05, + "loss": 0.1439, + "step": 13730 + }, + { + "epoch": 0.4396377947717019, + "grad_norm": 11.5, + "learning_rate": 1.1797911754799595e-05, + "loss": 0.0937, + "step": 13740 + }, + { + "epoch": 0.43995776405465076, + "grad_norm": 12.6875, + "learning_rate": 1.1791175479959583e-05, + "loss": 0.0906, + "step": 13750 + }, + { + "epoch": 0.44027773333759956, + "grad_norm": 13.9375, + "learning_rate": 1.1784439205119569e-05, + "loss": 0.1649, + "step": 13760 + }, + { + "epoch": 0.4405977026205484, + "grad_norm": 4.53125, + "learning_rate": 1.1777702930279555e-05, + "loss": 0.1203, + "step": 13770 + }, + { + "epoch": 0.4409176719034973, + "grad_norm": 1.09375, + "learning_rate": 1.1770966655439543e-05, + "loss": 0.0982, + "step": 13780 + }, + { + "epoch": 0.4412376411864461, + "grad_norm": 6.78125, + "learning_rate": 1.1764230380599529e-05, + "loss": 0.1056, + "step": 13790 + }, + { + "epoch": 0.44155761046939496, + "grad_norm": 17.625, + "learning_rate": 1.1757494105759515e-05, + "loss": 0.1607, + "step": 13800 + }, + { + "epoch": 0.44187757975234376, + "grad_norm": 19.0, + "learning_rate": 1.17507578309195e-05, + "loss": 0.1308, + "step": 13810 + }, + { + "epoch": 0.4421975490352926, + "grad_norm": 3.234375, + "learning_rate": 1.1744021556079489e-05, + "loss": 0.0923, + "step": 13820 + }, + { + "epoch": 0.44251751831824143, + "grad_norm": 14.6875, + "learning_rate": 1.1737285281239476e-05, + "loss": 0.126, + "step": 13830 + }, + { + "epoch": 0.4428374876011903, + "grad_norm": 1.1875, + "learning_rate": 1.1730549006399464e-05, + "loss": 0.0808, + "step": 13840 + }, + { + "epoch": 0.4431574568841391, + "grad_norm": 26.125, + "learning_rate": 1.172381273155945e-05, + "loss": 0.1405, + "step": 13850 + }, + { + "epoch": 0.44347742616708796, + "grad_norm": 14.625, + "learning_rate": 1.1717076456719436e-05, + "loss": 0.1164, + "step": 13860 + }, + { + "epoch": 0.44379739545003677, + "grad_norm": 12.25, + "learning_rate": 1.1710340181879422e-05, + "loss": 0.1179, + "step": 13870 + }, + { + "epoch": 0.44411736473298563, + "grad_norm": 6.09375, + "learning_rate": 1.170360390703941e-05, + "loss": 0.074, + "step": 13880 + }, + { + "epoch": 0.4444373340159345, + "grad_norm": 25.625, + "learning_rate": 1.1696867632199396e-05, + "loss": 0.097, + "step": 13890 + }, + { + "epoch": 0.4447573032988833, + "grad_norm": 6.0625, + "learning_rate": 1.1690131357359382e-05, + "loss": 0.1389, + "step": 13900 + }, + { + "epoch": 0.44507727258183216, + "grad_norm": 6.25, + "learning_rate": 1.1683395082519368e-05, + "loss": 0.1156, + "step": 13910 + }, + { + "epoch": 0.44539724186478097, + "grad_norm": 2.46875, + "learning_rate": 1.1676658807679356e-05, + "loss": 0.0628, + "step": 13920 + }, + { + "epoch": 0.44571721114772983, + "grad_norm": 17.25, + "learning_rate": 1.1669922532839342e-05, + "loss": 0.2375, + "step": 13930 + }, + { + "epoch": 0.44603718043067864, + "grad_norm": 9.875, + "learning_rate": 1.1663186257999328e-05, + "loss": 0.086, + "step": 13940 + }, + { + "epoch": 0.4463571497136275, + "grad_norm": 17.875, + "learning_rate": 1.1656449983159316e-05, + "loss": 0.1142, + "step": 13950 + }, + { + "epoch": 0.4466771189965763, + "grad_norm": 16.875, + "learning_rate": 1.1649713708319302e-05, + "loss": 0.1143, + "step": 13960 + }, + { + "epoch": 0.44699708827952517, + "grad_norm": 15.625, + "learning_rate": 1.1642977433479288e-05, + "loss": 0.1356, + "step": 13970 + }, + { + "epoch": 0.44731705756247403, + "grad_norm": 13.0, + "learning_rate": 1.1636241158639274e-05, + "loss": 0.1781, + "step": 13980 + }, + { + "epoch": 0.44763702684542284, + "grad_norm": 7.625, + "learning_rate": 1.1629504883799261e-05, + "loss": 0.1361, + "step": 13990 + }, + { + "epoch": 0.4479569961283717, + "grad_norm": 11.75, + "learning_rate": 1.1622768608959247e-05, + "loss": 0.068, + "step": 14000 + }, + { + "epoch": 0.4482769654113205, + "grad_norm": 5.375, + "learning_rate": 1.1616032334119233e-05, + "loss": 0.1016, + "step": 14010 + }, + { + "epoch": 0.44859693469426937, + "grad_norm": 9.5, + "learning_rate": 1.160929605927922e-05, + "loss": 0.1316, + "step": 14020 + }, + { + "epoch": 0.4489169039772182, + "grad_norm": 2.90625, + "learning_rate": 1.1602559784439207e-05, + "loss": 0.0897, + "step": 14030 + }, + { + "epoch": 0.44923687326016704, + "grad_norm": 12.4375, + "learning_rate": 1.1595823509599193e-05, + "loss": 0.0831, + "step": 14040 + }, + { + "epoch": 0.44955684254311584, + "grad_norm": 0.734375, + "learning_rate": 1.158908723475918e-05, + "loss": 0.1336, + "step": 14050 + }, + { + "epoch": 0.4498768118260647, + "grad_norm": 15.0625, + "learning_rate": 1.1582350959919167e-05, + "loss": 0.0792, + "step": 14060 + }, + { + "epoch": 0.4501967811090135, + "grad_norm": 1.0078125, + "learning_rate": 1.1575614685079153e-05, + "loss": 0.1652, + "step": 14070 + }, + { + "epoch": 0.4505167503919624, + "grad_norm": 10.0, + "learning_rate": 1.1568878410239139e-05, + "loss": 0.0984, + "step": 14080 + }, + { + "epoch": 0.45083671967491123, + "grad_norm": 9.875, + "learning_rate": 1.1562142135399125e-05, + "loss": 0.1038, + "step": 14090 + }, + { + "epoch": 0.45115668895786004, + "grad_norm": 0.69140625, + "learning_rate": 1.1555405860559113e-05, + "loss": 0.0797, + "step": 14100 + }, + { + "epoch": 0.4514766582408089, + "grad_norm": 14.0625, + "learning_rate": 1.1548669585719099e-05, + "loss": 0.1299, + "step": 14110 + }, + { + "epoch": 0.4517966275237577, + "grad_norm": 12.9375, + "learning_rate": 1.1541933310879085e-05, + "loss": 0.1071, + "step": 14120 + }, + { + "epoch": 0.45211659680670657, + "grad_norm": 18.25, + "learning_rate": 1.1535197036039073e-05, + "loss": 0.1636, + "step": 14130 + }, + { + "epoch": 0.4524365660896554, + "grad_norm": 6.125, + "learning_rate": 1.1528460761199059e-05, + "loss": 0.1152, + "step": 14140 + }, + { + "epoch": 0.45275653537260424, + "grad_norm": 3.5, + "learning_rate": 1.1521724486359045e-05, + "loss": 0.1073, + "step": 14150 + }, + { + "epoch": 0.45307650465555305, + "grad_norm": 11.3125, + "learning_rate": 1.1514988211519031e-05, + "loss": 0.1152, + "step": 14160 + }, + { + "epoch": 0.4533964739385019, + "grad_norm": 6.78125, + "learning_rate": 1.1508251936679019e-05, + "loss": 0.1045, + "step": 14170 + }, + { + "epoch": 0.4537164432214507, + "grad_norm": 3.78125, + "learning_rate": 1.1501515661839005e-05, + "loss": 0.1372, + "step": 14180 + }, + { + "epoch": 0.4540364125043996, + "grad_norm": 14.6875, + "learning_rate": 1.149477938699899e-05, + "loss": 0.133, + "step": 14190 + }, + { + "epoch": 0.45435638178734844, + "grad_norm": 5.78125, + "learning_rate": 1.1488043112158977e-05, + "loss": 0.1355, + "step": 14200 + }, + { + "epoch": 0.45467635107029725, + "grad_norm": 5.125, + "learning_rate": 1.1481306837318964e-05, + "loss": 0.1443, + "step": 14210 + }, + { + "epoch": 0.4549963203532461, + "grad_norm": 20.875, + "learning_rate": 1.147457056247895e-05, + "loss": 0.1197, + "step": 14220 + }, + { + "epoch": 0.4553162896361949, + "grad_norm": 10.9375, + "learning_rate": 1.1467834287638937e-05, + "loss": 0.0711, + "step": 14230 + }, + { + "epoch": 0.4556362589191438, + "grad_norm": 4.21875, + "learning_rate": 1.1461098012798924e-05, + "loss": 0.0803, + "step": 14240 + }, + { + "epoch": 0.4559562282020926, + "grad_norm": 14.0625, + "learning_rate": 1.145436173795891e-05, + "loss": 0.1262, + "step": 14250 + }, + { + "epoch": 0.45627619748504145, + "grad_norm": 21.5, + "learning_rate": 1.1447625463118896e-05, + "loss": 0.0693, + "step": 14260 + }, + { + "epoch": 0.45659616676799025, + "grad_norm": 15.125, + "learning_rate": 1.1440889188278882e-05, + "loss": 0.1039, + "step": 14270 + }, + { + "epoch": 0.4569161360509391, + "grad_norm": 8.8125, + "learning_rate": 1.143415291343887e-05, + "loss": 0.1364, + "step": 14280 + }, + { + "epoch": 0.4572361053338879, + "grad_norm": 15.875, + "learning_rate": 1.1427416638598856e-05, + "loss": 0.0857, + "step": 14290 + }, + { + "epoch": 0.4575560746168368, + "grad_norm": 6.3125, + "learning_rate": 1.1420680363758842e-05, + "loss": 0.11, + "step": 14300 + }, + { + "epoch": 0.45787604389978565, + "grad_norm": 2.078125, + "learning_rate": 1.141394408891883e-05, + "loss": 0.1195, + "step": 14310 + }, + { + "epoch": 0.45819601318273445, + "grad_norm": 18.25, + "learning_rate": 1.1407207814078816e-05, + "loss": 0.1222, + "step": 14320 + }, + { + "epoch": 0.4585159824656833, + "grad_norm": 11.9375, + "learning_rate": 1.1400471539238802e-05, + "loss": 0.135, + "step": 14330 + }, + { + "epoch": 0.4588359517486321, + "grad_norm": 0.671875, + "learning_rate": 1.1393735264398788e-05, + "loss": 0.1464, + "step": 14340 + }, + { + "epoch": 0.459155921031581, + "grad_norm": 5.1875, + "learning_rate": 1.1386998989558776e-05, + "loss": 0.0776, + "step": 14350 + }, + { + "epoch": 0.4594758903145298, + "grad_norm": 7.375, + "learning_rate": 1.1380262714718762e-05, + "loss": 0.1042, + "step": 14360 + }, + { + "epoch": 0.45979585959747865, + "grad_norm": 11.0, + "learning_rate": 1.1373526439878748e-05, + "loss": 0.1421, + "step": 14370 + }, + { + "epoch": 0.46011582888042746, + "grad_norm": 5.46875, + "learning_rate": 1.1366790165038734e-05, + "loss": 0.1339, + "step": 14380 + }, + { + "epoch": 0.4604357981633763, + "grad_norm": 15.75, + "learning_rate": 1.1360053890198722e-05, + "loss": 0.0889, + "step": 14390 + }, + { + "epoch": 0.4607557674463251, + "grad_norm": 11.125, + "learning_rate": 1.1353317615358708e-05, + "loss": 0.0998, + "step": 14400 + }, + { + "epoch": 0.461075736729274, + "grad_norm": 9.125, + "learning_rate": 1.1346581340518694e-05, + "loss": 0.1288, + "step": 14410 + }, + { + "epoch": 0.46139570601222285, + "grad_norm": 0.375, + "learning_rate": 1.1339845065678681e-05, + "loss": 0.0567, + "step": 14420 + }, + { + "epoch": 0.46171567529517166, + "grad_norm": 14.125, + "learning_rate": 1.1333108790838668e-05, + "loss": 0.1746, + "step": 14430 + }, + { + "epoch": 0.4620356445781205, + "grad_norm": 35.25, + "learning_rate": 1.1326372515998654e-05, + "loss": 0.1098, + "step": 14440 + }, + { + "epoch": 0.4623556138610693, + "grad_norm": 25.75, + "learning_rate": 1.131963624115864e-05, + "loss": 0.1504, + "step": 14450 + }, + { + "epoch": 0.4626755831440182, + "grad_norm": 7.78125, + "learning_rate": 1.1312899966318627e-05, + "loss": 0.0965, + "step": 14460 + }, + { + "epoch": 0.462995552426967, + "grad_norm": 3.34375, + "learning_rate": 1.1306163691478613e-05, + "loss": 0.1065, + "step": 14470 + }, + { + "epoch": 0.46331552170991586, + "grad_norm": 7.625, + "learning_rate": 1.12994274166386e-05, + "loss": 0.0949, + "step": 14480 + }, + { + "epoch": 0.46363549099286466, + "grad_norm": 1.3359375, + "learning_rate": 1.1292691141798585e-05, + "loss": 0.086, + "step": 14490 + }, + { + "epoch": 0.4639554602758135, + "grad_norm": 11.4375, + "learning_rate": 1.1285954866958573e-05, + "loss": 0.1052, + "step": 14500 + }, + { + "epoch": 0.46427542955876233, + "grad_norm": 1.6953125, + "learning_rate": 1.127921859211856e-05, + "loss": 0.1108, + "step": 14510 + }, + { + "epoch": 0.4645953988417112, + "grad_norm": 10.8125, + "learning_rate": 1.1272482317278545e-05, + "loss": 0.1441, + "step": 14520 + }, + { + "epoch": 0.46491536812466006, + "grad_norm": 0.78125, + "learning_rate": 1.1265746042438533e-05, + "loss": 0.1438, + "step": 14530 + }, + { + "epoch": 0.46523533740760886, + "grad_norm": 80.0, + "learning_rate": 1.1259009767598519e-05, + "loss": 0.1715, + "step": 14540 + }, + { + "epoch": 0.4655553066905577, + "grad_norm": 20.0, + "learning_rate": 1.1252273492758505e-05, + "loss": 0.1282, + "step": 14550 + }, + { + "epoch": 0.46587527597350653, + "grad_norm": 0.62109375, + "learning_rate": 1.1245537217918491e-05, + "loss": 0.1233, + "step": 14560 + }, + { + "epoch": 0.4661952452564554, + "grad_norm": 10.5625, + "learning_rate": 1.1238800943078479e-05, + "loss": 0.1397, + "step": 14570 + }, + { + "epoch": 0.4665152145394042, + "grad_norm": 22.25, + "learning_rate": 1.1232064668238465e-05, + "loss": 0.0896, + "step": 14580 + }, + { + "epoch": 0.46683518382235306, + "grad_norm": 14.4375, + "learning_rate": 1.1225328393398451e-05, + "loss": 0.1315, + "step": 14590 + }, + { + "epoch": 0.46715515310530187, + "grad_norm": 9.125, + "learning_rate": 1.1218592118558439e-05, + "loss": 0.1294, + "step": 14600 + }, + { + "epoch": 0.46747512238825073, + "grad_norm": 10.0625, + "learning_rate": 1.1211855843718425e-05, + "loss": 0.1686, + "step": 14610 + }, + { + "epoch": 0.4677950916711996, + "grad_norm": 3.0, + "learning_rate": 1.120511956887841e-05, + "loss": 0.0702, + "step": 14620 + }, + { + "epoch": 0.4681150609541484, + "grad_norm": 9.375, + "learning_rate": 1.1198383294038397e-05, + "loss": 0.145, + "step": 14630 + }, + { + "epoch": 0.46843503023709726, + "grad_norm": 4.15625, + "learning_rate": 1.1191647019198385e-05, + "loss": 0.097, + "step": 14640 + }, + { + "epoch": 0.46875499952004607, + "grad_norm": 5.8125, + "learning_rate": 1.118491074435837e-05, + "loss": 0.0526, + "step": 14650 + }, + { + "epoch": 0.46907496880299493, + "grad_norm": 1.3203125, + "learning_rate": 1.1178174469518357e-05, + "loss": 0.1244, + "step": 14660 + }, + { + "epoch": 0.46939493808594374, + "grad_norm": 12.875, + "learning_rate": 1.1171438194678343e-05, + "loss": 0.0983, + "step": 14670 + }, + { + "epoch": 0.4697149073688926, + "grad_norm": 9.25, + "learning_rate": 1.116470191983833e-05, + "loss": 0.0571, + "step": 14680 + }, + { + "epoch": 0.4700348766518414, + "grad_norm": 16.25, + "learning_rate": 1.1157965644998316e-05, + "loss": 0.0954, + "step": 14690 + }, + { + "epoch": 0.47035484593479027, + "grad_norm": 26.625, + "learning_rate": 1.1151229370158302e-05, + "loss": 0.0811, + "step": 14700 + }, + { + "epoch": 0.4706748152177391, + "grad_norm": 19.25, + "learning_rate": 1.114449309531829e-05, + "loss": 0.1657, + "step": 14710 + }, + { + "epoch": 0.47099478450068794, + "grad_norm": 10.1875, + "learning_rate": 1.1137756820478276e-05, + "loss": 0.0761, + "step": 14720 + }, + { + "epoch": 0.4713147537836368, + "grad_norm": 8.9375, + "learning_rate": 1.1131020545638262e-05, + "loss": 0.084, + "step": 14730 + }, + { + "epoch": 0.4716347230665856, + "grad_norm": 6.46875, + "learning_rate": 1.1124284270798248e-05, + "loss": 0.1656, + "step": 14740 + }, + { + "epoch": 0.47195469234953447, + "grad_norm": 1.6171875, + "learning_rate": 1.1117547995958236e-05, + "loss": 0.0599, + "step": 14750 + }, + { + "epoch": 0.4722746616324833, + "grad_norm": 23.375, + "learning_rate": 1.1110811721118222e-05, + "loss": 0.116, + "step": 14760 + }, + { + "epoch": 0.47259463091543213, + "grad_norm": 7.3125, + "learning_rate": 1.1104075446278208e-05, + "loss": 0.1249, + "step": 14770 + }, + { + "epoch": 0.47291460019838094, + "grad_norm": 3.5625, + "learning_rate": 1.1097339171438194e-05, + "loss": 0.0941, + "step": 14780 + }, + { + "epoch": 0.4732345694813298, + "grad_norm": 9.375, + "learning_rate": 1.1090602896598182e-05, + "loss": 0.0901, + "step": 14790 + }, + { + "epoch": 0.4735545387642786, + "grad_norm": 7.34375, + "learning_rate": 1.1083866621758168e-05, + "loss": 0.0754, + "step": 14800 + }, + { + "epoch": 0.47387450804722747, + "grad_norm": 6.71875, + "learning_rate": 1.1077130346918154e-05, + "loss": 0.0792, + "step": 14810 + }, + { + "epoch": 0.4741944773301763, + "grad_norm": 18.875, + "learning_rate": 1.1070394072078142e-05, + "loss": 0.1138, + "step": 14820 + }, + { + "epoch": 0.47451444661312514, + "grad_norm": 10.75, + "learning_rate": 1.1063657797238128e-05, + "loss": 0.1074, + "step": 14830 + }, + { + "epoch": 0.474834415896074, + "grad_norm": 0.74609375, + "learning_rate": 1.1056921522398114e-05, + "loss": 0.1801, + "step": 14840 + }, + { + "epoch": 0.4751543851790228, + "grad_norm": 9.0, + "learning_rate": 1.10501852475581e-05, + "loss": 0.0982, + "step": 14850 + }, + { + "epoch": 0.47547435446197167, + "grad_norm": 6.09375, + "learning_rate": 1.1043448972718088e-05, + "loss": 0.0937, + "step": 14860 + }, + { + "epoch": 0.4757943237449205, + "grad_norm": 2.53125, + "learning_rate": 1.1036712697878074e-05, + "loss": 0.162, + "step": 14870 + }, + { + "epoch": 0.47611429302786934, + "grad_norm": 14.4375, + "learning_rate": 1.102997642303806e-05, + "loss": 0.1362, + "step": 14880 + }, + { + "epoch": 0.47643426231081815, + "grad_norm": 2.09375, + "learning_rate": 1.1023240148198047e-05, + "loss": 0.1303, + "step": 14890 + }, + { + "epoch": 0.476754231593767, + "grad_norm": 1.4453125, + "learning_rate": 1.1016503873358033e-05, + "loss": 0.0811, + "step": 14900 + }, + { + "epoch": 0.4770742008767158, + "grad_norm": 6.71875, + "learning_rate": 1.100976759851802e-05, + "loss": 0.1527, + "step": 14910 + }, + { + "epoch": 0.4773941701596647, + "grad_norm": 11.1875, + "learning_rate": 1.1003031323678006e-05, + "loss": 0.1181, + "step": 14920 + }, + { + "epoch": 0.4777141394426135, + "grad_norm": 4.59375, + "learning_rate": 1.0996295048837993e-05, + "loss": 0.1016, + "step": 14930 + }, + { + "epoch": 0.47803410872556235, + "grad_norm": 11.3125, + "learning_rate": 1.098955877399798e-05, + "loss": 0.1299, + "step": 14940 + }, + { + "epoch": 0.4783540780085112, + "grad_norm": 18.375, + "learning_rate": 1.0982822499157965e-05, + "loss": 0.0982, + "step": 14950 + }, + { + "epoch": 0.47867404729146, + "grad_norm": 15.1875, + "learning_rate": 1.0976086224317951e-05, + "loss": 0.1061, + "step": 14960 + }, + { + "epoch": 0.4789940165744089, + "grad_norm": 11.875, + "learning_rate": 1.096934994947794e-05, + "loss": 0.0838, + "step": 14970 + }, + { + "epoch": 0.4793139858573577, + "grad_norm": 7.46875, + "learning_rate": 1.0962613674637925e-05, + "loss": 0.1507, + "step": 14980 + }, + { + "epoch": 0.47963395514030654, + "grad_norm": 6.1875, + "learning_rate": 1.0955877399797911e-05, + "loss": 0.2222, + "step": 14990 + }, + { + "epoch": 0.47995392442325535, + "grad_norm": 7.125, + "learning_rate": 1.0949141124957899e-05, + "loss": 0.1062, + "step": 15000 + }, + { + "epoch": 0.4802738937062042, + "grad_norm": 8.3125, + "learning_rate": 1.0942404850117885e-05, + "loss": 0.1796, + "step": 15010 + }, + { + "epoch": 0.480593862989153, + "grad_norm": 8.9375, + "learning_rate": 1.0935668575277871e-05, + "loss": 0.1774, + "step": 15020 + }, + { + "epoch": 0.4809138322721019, + "grad_norm": 7.46875, + "learning_rate": 1.0928932300437857e-05, + "loss": 0.1015, + "step": 15030 + }, + { + "epoch": 0.4812338015550507, + "grad_norm": 12.0625, + "learning_rate": 1.0922196025597845e-05, + "loss": 0.1353, + "step": 15040 + }, + { + "epoch": 0.48155377083799955, + "grad_norm": 9.5625, + "learning_rate": 1.0915459750757831e-05, + "loss": 0.0966, + "step": 15050 + }, + { + "epoch": 0.4818737401209484, + "grad_norm": 6.65625, + "learning_rate": 1.0908723475917817e-05, + "loss": 0.0704, + "step": 15060 + }, + { + "epoch": 0.4821937094038972, + "grad_norm": 3.734375, + "learning_rate": 1.0901987201077805e-05, + "loss": 0.0683, + "step": 15070 + }, + { + "epoch": 0.4825136786868461, + "grad_norm": 8.5, + "learning_rate": 1.089525092623779e-05, + "loss": 0.1077, + "step": 15080 + }, + { + "epoch": 0.4828336479697949, + "grad_norm": 17.25, + "learning_rate": 1.0888514651397777e-05, + "loss": 0.1185, + "step": 15090 + }, + { + "epoch": 0.48315361725274375, + "grad_norm": 7.4375, + "learning_rate": 1.0881778376557763e-05, + "loss": 0.1238, + "step": 15100 + }, + { + "epoch": 0.48347358653569256, + "grad_norm": 8.1875, + "learning_rate": 1.087504210171775e-05, + "loss": 0.0841, + "step": 15110 + }, + { + "epoch": 0.4837935558186414, + "grad_norm": 3.671875, + "learning_rate": 1.0868305826877738e-05, + "loss": 0.0856, + "step": 15120 + }, + { + "epoch": 0.4841135251015902, + "grad_norm": 1.96875, + "learning_rate": 1.0861569552037724e-05, + "loss": 0.0966, + "step": 15130 + }, + { + "epoch": 0.4844334943845391, + "grad_norm": 1.515625, + "learning_rate": 1.0854833277197712e-05, + "loss": 0.1029, + "step": 15140 + }, + { + "epoch": 0.4847534636674879, + "grad_norm": 14.4375, + "learning_rate": 1.0848097002357698e-05, + "loss": 0.1597, + "step": 15150 + }, + { + "epoch": 0.48507343295043676, + "grad_norm": 11.0, + "learning_rate": 1.0841360727517684e-05, + "loss": 0.1657, + "step": 15160 + }, + { + "epoch": 0.4853934022333856, + "grad_norm": 13.9375, + "learning_rate": 1.0834624452677672e-05, + "loss": 0.1039, + "step": 15170 + }, + { + "epoch": 0.4857133715163344, + "grad_norm": 6.75, + "learning_rate": 1.0827888177837658e-05, + "loss": 0.0897, + "step": 15180 + }, + { + "epoch": 0.4860333407992833, + "grad_norm": 7.90625, + "learning_rate": 1.0821151902997644e-05, + "loss": 0.16, + "step": 15190 + }, + { + "epoch": 0.4863533100822321, + "grad_norm": 0.85546875, + "learning_rate": 1.081441562815763e-05, + "loss": 0.1087, + "step": 15200 + }, + { + "epoch": 0.48667327936518096, + "grad_norm": 6.65625, + "learning_rate": 1.0807679353317618e-05, + "loss": 0.0997, + "step": 15210 + }, + { + "epoch": 0.48699324864812976, + "grad_norm": 10.25, + "learning_rate": 1.0800943078477604e-05, + "loss": 0.1594, + "step": 15220 + }, + { + "epoch": 0.4873132179310786, + "grad_norm": 14.25, + "learning_rate": 1.079420680363759e-05, + "loss": 0.1314, + "step": 15230 + }, + { + "epoch": 0.48763318721402743, + "grad_norm": 2.40625, + "learning_rate": 1.0787470528797578e-05, + "loss": 0.0725, + "step": 15240 + }, + { + "epoch": 0.4879531564969763, + "grad_norm": 8.1875, + "learning_rate": 1.0780734253957564e-05, + "loss": 0.1273, + "step": 15250 + }, + { + "epoch": 0.48827312577992515, + "grad_norm": 18.125, + "learning_rate": 1.077399797911755e-05, + "loss": 0.1245, + "step": 15260 + }, + { + "epoch": 0.48859309506287396, + "grad_norm": 11.1875, + "learning_rate": 1.0767261704277536e-05, + "loss": 0.1129, + "step": 15270 + }, + { + "epoch": 0.4889130643458228, + "grad_norm": 24.25, + "learning_rate": 1.0760525429437523e-05, + "loss": 0.106, + "step": 15280 + }, + { + "epoch": 0.48923303362877163, + "grad_norm": 21.75, + "learning_rate": 1.075378915459751e-05, + "loss": 0.1526, + "step": 15290 + }, + { + "epoch": 0.4895530029117205, + "grad_norm": 38.75, + "learning_rate": 1.0747052879757495e-05, + "loss": 0.137, + "step": 15300 + }, + { + "epoch": 0.4898729721946693, + "grad_norm": 12.5625, + "learning_rate": 1.0740316604917482e-05, + "loss": 0.1571, + "step": 15310 + }, + { + "epoch": 0.49019294147761816, + "grad_norm": 6.0, + "learning_rate": 1.073358033007747e-05, + "loss": 0.1228, + "step": 15320 + }, + { + "epoch": 0.49051291076056697, + "grad_norm": 0.78515625, + "learning_rate": 1.0726844055237455e-05, + "loss": 0.0661, + "step": 15330 + }, + { + "epoch": 0.49083288004351583, + "grad_norm": 6.3125, + "learning_rate": 1.0720107780397441e-05, + "loss": 0.1023, + "step": 15340 + }, + { + "epoch": 0.49115284932646464, + "grad_norm": 1.5546875, + "learning_rate": 1.0713371505557429e-05, + "loss": 0.1107, + "step": 15350 + }, + { + "epoch": 0.4914728186094135, + "grad_norm": 5.4375, + "learning_rate": 1.0706635230717415e-05, + "loss": 0.0765, + "step": 15360 + }, + { + "epoch": 0.49179278789236236, + "grad_norm": 4.5, + "learning_rate": 1.0699898955877401e-05, + "loss": 0.1254, + "step": 15370 + }, + { + "epoch": 0.49211275717531117, + "grad_norm": 1.125, + "learning_rate": 1.0693162681037387e-05, + "loss": 0.112, + "step": 15380 + }, + { + "epoch": 0.49243272645826003, + "grad_norm": 24.0, + "learning_rate": 1.0686426406197375e-05, + "loss": 0.0961, + "step": 15390 + }, + { + "epoch": 0.49275269574120883, + "grad_norm": 8.4375, + "learning_rate": 1.0679690131357361e-05, + "loss": 0.146, + "step": 15400 + }, + { + "epoch": 0.4930726650241577, + "grad_norm": 13.875, + "learning_rate": 1.0672953856517347e-05, + "loss": 0.0843, + "step": 15410 + }, + { + "epoch": 0.4933926343071065, + "grad_norm": 19.75, + "learning_rate": 1.0666217581677335e-05, + "loss": 0.1355, + "step": 15420 + }, + { + "epoch": 0.49371260359005537, + "grad_norm": 16.75, + "learning_rate": 1.065948130683732e-05, + "loss": 0.1467, + "step": 15430 + }, + { + "epoch": 0.49403257287300417, + "grad_norm": 3.09375, + "learning_rate": 1.0652745031997307e-05, + "loss": 0.0314, + "step": 15440 + }, + { + "epoch": 0.49435254215595303, + "grad_norm": 7.125, + "learning_rate": 1.0646008757157293e-05, + "loss": 0.1102, + "step": 15450 + }, + { + "epoch": 0.49467251143890184, + "grad_norm": 18.875, + "learning_rate": 1.063927248231728e-05, + "loss": 0.1862, + "step": 15460 + }, + { + "epoch": 0.4949924807218507, + "grad_norm": 30.125, + "learning_rate": 1.0632536207477267e-05, + "loss": 0.0807, + "step": 15470 + }, + { + "epoch": 0.49531245000479956, + "grad_norm": 4.46875, + "learning_rate": 1.0625799932637253e-05, + "loss": 0.1243, + "step": 15480 + }, + { + "epoch": 0.49563241928774837, + "grad_norm": 10.0625, + "learning_rate": 1.0619063657797239e-05, + "loss": 0.1785, + "step": 15490 + }, + { + "epoch": 0.49595238857069723, + "grad_norm": 29.0, + "learning_rate": 1.0612327382957226e-05, + "loss": 0.131, + "step": 15500 + }, + { + "epoch": 0.49627235785364604, + "grad_norm": 32.0, + "learning_rate": 1.0605591108117213e-05, + "loss": 0.1062, + "step": 15510 + }, + { + "epoch": 0.4965923271365949, + "grad_norm": 8.4375, + "learning_rate": 1.0598854833277199e-05, + "loss": 0.0994, + "step": 15520 + }, + { + "epoch": 0.4969122964195437, + "grad_norm": 4.1875, + "learning_rate": 1.0592118558437186e-05, + "loss": 0.0746, + "step": 15530 + }, + { + "epoch": 0.49723226570249257, + "grad_norm": 6.5, + "learning_rate": 1.0585382283597172e-05, + "loss": 0.1101, + "step": 15540 + }, + { + "epoch": 0.4975522349854414, + "grad_norm": 20.125, + "learning_rate": 1.0578646008757158e-05, + "loss": 0.1653, + "step": 15550 + }, + { + "epoch": 0.49787220426839024, + "grad_norm": 4.21875, + "learning_rate": 1.0571909733917144e-05, + "loss": 0.0993, + "step": 15560 + }, + { + "epoch": 0.49819217355133905, + "grad_norm": 9.1875, + "learning_rate": 1.0565173459077132e-05, + "loss": 0.1378, + "step": 15570 + }, + { + "epoch": 0.4985121428342879, + "grad_norm": 2.78125, + "learning_rate": 1.0558437184237118e-05, + "loss": 0.0934, + "step": 15580 + }, + { + "epoch": 0.49883211211723677, + "grad_norm": 12.1875, + "learning_rate": 1.0551700909397104e-05, + "loss": 0.1055, + "step": 15590 + }, + { + "epoch": 0.4991520814001856, + "grad_norm": 16.5, + "learning_rate": 1.054496463455709e-05, + "loss": 0.138, + "step": 15600 + }, + { + "epoch": 0.49947205068313444, + "grad_norm": 13.4375, + "learning_rate": 1.0538228359717078e-05, + "loss": 0.1034, + "step": 15610 + }, + { + "epoch": 0.49979201996608325, + "grad_norm": 3.984375, + "learning_rate": 1.0531492084877064e-05, + "loss": 0.1236, + "step": 15620 + }, + { + "epoch": 0.5001119892490321, + "grad_norm": 20.0, + "learning_rate": 1.052475581003705e-05, + "loss": 0.0922, + "step": 15630 + }, + { + "epoch": 0.500431958531981, + "grad_norm": 26.25, + "learning_rate": 1.0518019535197038e-05, + "loss": 0.1105, + "step": 15640 + }, + { + "epoch": 0.5007519278149297, + "grad_norm": 15.3125, + "learning_rate": 1.0511283260357024e-05, + "loss": 0.148, + "step": 15650 + }, + { + "epoch": 0.5010718970978786, + "grad_norm": 0.58984375, + "learning_rate": 1.050454698551701e-05, + "loss": 0.1202, + "step": 15660 + }, + { + "epoch": 0.5013918663808274, + "grad_norm": 15.25, + "learning_rate": 1.0497810710676996e-05, + "loss": 0.0687, + "step": 15670 + }, + { + "epoch": 0.5017118356637763, + "grad_norm": 4.0, + "learning_rate": 1.0491074435836984e-05, + "loss": 0.1277, + "step": 15680 + }, + { + "epoch": 0.5020318049467252, + "grad_norm": 4.96875, + "learning_rate": 1.048433816099697e-05, + "loss": 0.1179, + "step": 15690 + }, + { + "epoch": 0.5023517742296739, + "grad_norm": 0.88671875, + "learning_rate": 1.0477601886156956e-05, + "loss": 0.1262, + "step": 15700 + }, + { + "epoch": 0.5026717435126228, + "grad_norm": 14.0625, + "learning_rate": 1.0470865611316943e-05, + "loss": 0.1401, + "step": 15710 + }, + { + "epoch": 0.5029917127955716, + "grad_norm": 4.6875, + "learning_rate": 1.046412933647693e-05, + "loss": 0.1198, + "step": 15720 + }, + { + "epoch": 0.5033116820785205, + "grad_norm": 10.3125, + "learning_rate": 1.0457393061636916e-05, + "loss": 0.1152, + "step": 15730 + }, + { + "epoch": 0.5036316513614693, + "grad_norm": 2.515625, + "learning_rate": 1.0450656786796902e-05, + "loss": 0.0833, + "step": 15740 + }, + { + "epoch": 0.5039516206444181, + "grad_norm": 14.5625, + "learning_rate": 1.044392051195689e-05, + "loss": 0.1354, + "step": 15750 + }, + { + "epoch": 0.504271589927367, + "grad_norm": 4.65625, + "learning_rate": 1.0437184237116875e-05, + "loss": 0.0848, + "step": 15760 + }, + { + "epoch": 0.5045915592103158, + "grad_norm": 20.25, + "learning_rate": 1.0430447962276861e-05, + "loss": 0.1016, + "step": 15770 + }, + { + "epoch": 0.5049115284932646, + "grad_norm": 14.25, + "learning_rate": 1.0423711687436847e-05, + "loss": 0.1103, + "step": 15780 + }, + { + "epoch": 0.5052314977762135, + "grad_norm": 3.0625, + "learning_rate": 1.0416975412596835e-05, + "loss": 0.0355, + "step": 15790 + }, + { + "epoch": 0.5055514670591623, + "grad_norm": 30.75, + "learning_rate": 1.0410239137756821e-05, + "loss": 0.1866, + "step": 15800 + }, + { + "epoch": 0.5058714363421112, + "grad_norm": 0.65625, + "learning_rate": 1.0403502862916807e-05, + "loss": 0.0898, + "step": 15810 + }, + { + "epoch": 0.50619140562506, + "grad_norm": 15.875, + "learning_rate": 1.0396766588076795e-05, + "loss": 0.1066, + "step": 15820 + }, + { + "epoch": 0.5065113749080088, + "grad_norm": 12.9375, + "learning_rate": 1.0390030313236781e-05, + "loss": 0.1045, + "step": 15830 + }, + { + "epoch": 0.5068313441909577, + "grad_norm": 7.34375, + "learning_rate": 1.0383294038396767e-05, + "loss": 0.1217, + "step": 15840 + }, + { + "epoch": 0.5071513134739065, + "grad_norm": 2.671875, + "learning_rate": 1.0376557763556753e-05, + "loss": 0.0884, + "step": 15850 + }, + { + "epoch": 0.5074712827568554, + "grad_norm": 0.51953125, + "learning_rate": 1.0369821488716741e-05, + "loss": 0.0915, + "step": 15860 + }, + { + "epoch": 0.5077912520398041, + "grad_norm": 6.15625, + "learning_rate": 1.0363085213876727e-05, + "loss": 0.1297, + "step": 15870 + }, + { + "epoch": 0.508111221322753, + "grad_norm": 1.09375, + "learning_rate": 1.0356348939036713e-05, + "loss": 0.0718, + "step": 15880 + }, + { + "epoch": 0.5084311906057019, + "grad_norm": 13.4375, + "learning_rate": 1.0349612664196699e-05, + "loss": 0.0644, + "step": 15890 + }, + { + "epoch": 0.5087511598886507, + "grad_norm": 14.3125, + "learning_rate": 1.0342876389356687e-05, + "loss": 0.108, + "step": 15900 + }, + { + "epoch": 0.5090711291715996, + "grad_norm": 12.75, + "learning_rate": 1.0336140114516673e-05, + "loss": 0.1508, + "step": 15910 + }, + { + "epoch": 0.5093910984545483, + "grad_norm": 11.9375, + "learning_rate": 1.0329403839676659e-05, + "loss": 0.1289, + "step": 15920 + }, + { + "epoch": 0.5097110677374972, + "grad_norm": 7.4375, + "learning_rate": 1.0322667564836647e-05, + "loss": 0.1498, + "step": 15930 + }, + { + "epoch": 0.510031037020446, + "grad_norm": 6.78125, + "learning_rate": 1.0315931289996633e-05, + "loss": 0.1483, + "step": 15940 + }, + { + "epoch": 0.5103510063033949, + "grad_norm": 1.7421875, + "learning_rate": 1.0309195015156619e-05, + "loss": 0.1524, + "step": 15950 + }, + { + "epoch": 0.5106709755863437, + "grad_norm": 11.75, + "learning_rate": 1.0302458740316605e-05, + "loss": 0.1435, + "step": 15960 + }, + { + "epoch": 0.5109909448692925, + "grad_norm": 26.0, + "learning_rate": 1.0295722465476592e-05, + "loss": 0.1148, + "step": 15970 + }, + { + "epoch": 0.5113109141522414, + "grad_norm": 5.6875, + "learning_rate": 1.0288986190636578e-05, + "loss": 0.1448, + "step": 15980 + }, + { + "epoch": 0.5116308834351903, + "grad_norm": 0.64453125, + "learning_rate": 1.0282249915796565e-05, + "loss": 0.08, + "step": 15990 + }, + { + "epoch": 0.511950852718139, + "grad_norm": 5.21875, + "learning_rate": 1.0275513640956552e-05, + "loss": 0.079, + "step": 16000 + }, + { + "epoch": 0.5122708220010879, + "grad_norm": 12.0625, + "learning_rate": 1.0268777366116538e-05, + "loss": 0.1187, + "step": 16010 + }, + { + "epoch": 0.5125907912840367, + "grad_norm": 3.734375, + "learning_rate": 1.0262041091276524e-05, + "loss": 0.0895, + "step": 16020 + }, + { + "epoch": 0.5129107605669856, + "grad_norm": 2.546875, + "learning_rate": 1.025530481643651e-05, + "loss": 0.0903, + "step": 16030 + }, + { + "epoch": 0.5132307298499345, + "grad_norm": 3.390625, + "learning_rate": 1.0248568541596498e-05, + "loss": 0.0901, + "step": 16040 + }, + { + "epoch": 0.5135506991328832, + "grad_norm": 4.59375, + "learning_rate": 1.0241832266756484e-05, + "loss": 0.1249, + "step": 16050 + }, + { + "epoch": 0.5138706684158321, + "grad_norm": 9.4375, + "learning_rate": 1.023509599191647e-05, + "loss": 0.1342, + "step": 16060 + }, + { + "epoch": 0.5141906376987809, + "grad_norm": 0.796875, + "learning_rate": 1.0228359717076456e-05, + "loss": 0.1455, + "step": 16070 + }, + { + "epoch": 0.5145106069817298, + "grad_norm": 15.25, + "learning_rate": 1.0221623442236444e-05, + "loss": 0.0825, + "step": 16080 + }, + { + "epoch": 0.5148305762646785, + "grad_norm": 14.5, + "learning_rate": 1.021488716739643e-05, + "loss": 0.1039, + "step": 16090 + }, + { + "epoch": 0.5151505455476274, + "grad_norm": 9.0625, + "learning_rate": 1.0208150892556416e-05, + "loss": 0.1111, + "step": 16100 + }, + { + "epoch": 0.5154705148305763, + "grad_norm": 15.875, + "learning_rate": 1.0201414617716404e-05, + "loss": 0.1266, + "step": 16110 + }, + { + "epoch": 0.5157904841135251, + "grad_norm": 7.71875, + "learning_rate": 1.019467834287639e-05, + "loss": 0.0752, + "step": 16120 + }, + { + "epoch": 0.516110453396474, + "grad_norm": 2.328125, + "learning_rate": 1.0187942068036376e-05, + "loss": 0.0807, + "step": 16130 + }, + { + "epoch": 0.5164304226794227, + "grad_norm": 1.046875, + "learning_rate": 1.0181205793196362e-05, + "loss": 0.1098, + "step": 16140 + }, + { + "epoch": 0.5167503919623716, + "grad_norm": 9.25, + "learning_rate": 1.017446951835635e-05, + "loss": 0.1115, + "step": 16150 + }, + { + "epoch": 0.5170703612453205, + "grad_norm": 7.8125, + "learning_rate": 1.0167733243516336e-05, + "loss": 0.106, + "step": 16160 + }, + { + "epoch": 0.5173903305282693, + "grad_norm": 12.25, + "learning_rate": 1.0160996968676322e-05, + "loss": 0.1005, + "step": 16170 + }, + { + "epoch": 0.5177102998112181, + "grad_norm": 21.75, + "learning_rate": 1.015426069383631e-05, + "loss": 0.1228, + "step": 16180 + }, + { + "epoch": 0.5180302690941669, + "grad_norm": 3.96875, + "learning_rate": 1.0147524418996296e-05, + "loss": 0.1322, + "step": 16190 + }, + { + "epoch": 0.5183502383771158, + "grad_norm": 5.09375, + "learning_rate": 1.0140788144156282e-05, + "loss": 0.1218, + "step": 16200 + }, + { + "epoch": 0.5186702076600647, + "grad_norm": 12.625, + "learning_rate": 1.0134051869316268e-05, + "loss": 0.1476, + "step": 16210 + }, + { + "epoch": 0.5189901769430135, + "grad_norm": 3.796875, + "learning_rate": 1.0127315594476255e-05, + "loss": 0.1353, + "step": 16220 + }, + { + "epoch": 0.5193101462259623, + "grad_norm": 7.9375, + "learning_rate": 1.0120579319636241e-05, + "loss": 0.0939, + "step": 16230 + }, + { + "epoch": 0.5196301155089111, + "grad_norm": 16.5, + "learning_rate": 1.0113843044796227e-05, + "loss": 0.1778, + "step": 16240 + }, + { + "epoch": 0.51995008479186, + "grad_norm": 12.9375, + "learning_rate": 1.0107106769956213e-05, + "loss": 0.0831, + "step": 16250 + }, + { + "epoch": 0.5202700540748089, + "grad_norm": 25.5, + "learning_rate": 1.0100370495116201e-05, + "loss": 0.1169, + "step": 16260 + }, + { + "epoch": 0.5205900233577576, + "grad_norm": 0.890625, + "learning_rate": 1.0093634220276187e-05, + "loss": 0.1802, + "step": 16270 + }, + { + "epoch": 0.5209099926407065, + "grad_norm": 318.0, + "learning_rate": 1.0086897945436173e-05, + "loss": 0.1238, + "step": 16280 + }, + { + "epoch": 0.5212299619236553, + "grad_norm": 12.25, + "learning_rate": 1.0080161670596161e-05, + "loss": 0.0891, + "step": 16290 + }, + { + "epoch": 0.5215499312066042, + "grad_norm": 0.9296875, + "learning_rate": 1.0073425395756147e-05, + "loss": 0.1026, + "step": 16300 + }, + { + "epoch": 0.521869900489553, + "grad_norm": 9.375, + "learning_rate": 1.0066689120916133e-05, + "loss": 0.1511, + "step": 16310 + }, + { + "epoch": 0.5221898697725018, + "grad_norm": 6.9375, + "learning_rate": 1.0059952846076119e-05, + "loss": 0.1108, + "step": 16320 + }, + { + "epoch": 0.5225098390554507, + "grad_norm": 0.7890625, + "learning_rate": 1.0053216571236107e-05, + "loss": 0.1441, + "step": 16330 + }, + { + "epoch": 0.5228298083383995, + "grad_norm": 36.5, + "learning_rate": 1.0046480296396093e-05, + "loss": 0.0913, + "step": 16340 + }, + { + "epoch": 0.5231497776213484, + "grad_norm": 6.59375, + "learning_rate": 1.0039744021556079e-05, + "loss": 0.1113, + "step": 16350 + }, + { + "epoch": 0.5234697469042972, + "grad_norm": 24.625, + "learning_rate": 1.0033007746716065e-05, + "loss": 0.1369, + "step": 16360 + }, + { + "epoch": 0.523789716187246, + "grad_norm": 8.0625, + "learning_rate": 1.0026271471876053e-05, + "loss": 0.1328, + "step": 16370 + }, + { + "epoch": 0.5241096854701949, + "grad_norm": 15.875, + "learning_rate": 1.0019535197036039e-05, + "loss": 0.0887, + "step": 16380 + }, + { + "epoch": 0.5244296547531437, + "grad_norm": 1.3203125, + "learning_rate": 1.0012798922196025e-05, + "loss": 0.1961, + "step": 16390 + }, + { + "epoch": 0.5247496240360925, + "grad_norm": 5.6875, + "learning_rate": 1.0006062647356013e-05, + "loss": 0.109, + "step": 16400 + }, + { + "epoch": 0.5250695933190414, + "grad_norm": 25.25, + "learning_rate": 9.999326372516e-06, + "loss": 0.1243, + "step": 16410 + }, + { + "epoch": 0.5253895626019902, + "grad_norm": 15.5, + "learning_rate": 9.992590097675986e-06, + "loss": 0.0806, + "step": 16420 + }, + { + "epoch": 0.5257095318849391, + "grad_norm": 16.5, + "learning_rate": 9.985853822835972e-06, + "loss": 0.1409, + "step": 16430 + }, + { + "epoch": 0.5260295011678879, + "grad_norm": 0.50390625, + "learning_rate": 9.979117547995958e-06, + "loss": 0.0668, + "step": 16440 + }, + { + "epoch": 0.5263494704508367, + "grad_norm": 1.203125, + "learning_rate": 9.972381273155946e-06, + "loss": 0.0781, + "step": 16450 + }, + { + "epoch": 0.5266694397337855, + "grad_norm": 10.0, + "learning_rate": 9.965644998315932e-06, + "loss": 0.1415, + "step": 16460 + }, + { + "epoch": 0.5269894090167344, + "grad_norm": 11.75, + "learning_rate": 9.958908723475918e-06, + "loss": 0.0876, + "step": 16470 + }, + { + "epoch": 0.5273093782996833, + "grad_norm": 14.125, + "learning_rate": 9.952172448635904e-06, + "loss": 0.1419, + "step": 16480 + }, + { + "epoch": 0.527629347582632, + "grad_norm": 12.5625, + "learning_rate": 9.945436173795892e-06, + "loss": 0.1513, + "step": 16490 + }, + { + "epoch": 0.5279493168655809, + "grad_norm": 13.1875, + "learning_rate": 9.938699898955878e-06, + "loss": 0.1461, + "step": 16500 + }, + { + "epoch": 0.5282692861485297, + "grad_norm": 29.125, + "learning_rate": 9.931963624115864e-06, + "loss": 0.1361, + "step": 16510 + }, + { + "epoch": 0.5285892554314786, + "grad_norm": 12.4375, + "learning_rate": 9.925227349275852e-06, + "loss": 0.1173, + "step": 16520 + }, + { + "epoch": 0.5289092247144274, + "grad_norm": 11.4375, + "learning_rate": 9.918491074435838e-06, + "loss": 0.1368, + "step": 16530 + }, + { + "epoch": 0.5292291939973762, + "grad_norm": 3.46875, + "learning_rate": 9.911754799595824e-06, + "loss": 0.1205, + "step": 16540 + }, + { + "epoch": 0.5295491632803251, + "grad_norm": 13.125, + "learning_rate": 9.90501852475581e-06, + "loss": 0.0839, + "step": 16550 + }, + { + "epoch": 0.529869132563274, + "grad_norm": 7.34375, + "learning_rate": 9.898282249915798e-06, + "loss": 0.0928, + "step": 16560 + }, + { + "epoch": 0.5301891018462228, + "grad_norm": 1.4609375, + "learning_rate": 9.891545975075784e-06, + "loss": 0.1458, + "step": 16570 + }, + { + "epoch": 0.5305090711291716, + "grad_norm": 13.1875, + "learning_rate": 9.88480970023577e-06, + "loss": 0.0947, + "step": 16580 + }, + { + "epoch": 0.5308290404121204, + "grad_norm": 16.25, + "learning_rate": 9.878073425395757e-06, + "loss": 0.1135, + "step": 16590 + }, + { + "epoch": 0.5311490096950693, + "grad_norm": 2.921875, + "learning_rate": 9.871337150555744e-06, + "loss": 0.1173, + "step": 16600 + }, + { + "epoch": 0.5314689789780181, + "grad_norm": 4.40625, + "learning_rate": 9.86460087571573e-06, + "loss": 0.1075, + "step": 16610 + }, + { + "epoch": 0.5317889482609669, + "grad_norm": 2.28125, + "learning_rate": 9.857864600875716e-06, + "loss": 0.1312, + "step": 16620 + }, + { + "epoch": 0.5321089175439158, + "grad_norm": 7.75, + "learning_rate": 9.851128326035703e-06, + "loss": 0.1079, + "step": 16630 + }, + { + "epoch": 0.5324288868268646, + "grad_norm": 17.25, + "learning_rate": 9.84439205119569e-06, + "loss": 0.1423, + "step": 16640 + }, + { + "epoch": 0.5327488561098135, + "grad_norm": 3.0625, + "learning_rate": 9.837655776355675e-06, + "loss": 0.1032, + "step": 16650 + }, + { + "epoch": 0.5330688253927623, + "grad_norm": 7.96875, + "learning_rate": 9.830919501515661e-06, + "loss": 0.0872, + "step": 16660 + }, + { + "epoch": 0.5333887946757111, + "grad_norm": 15.5, + "learning_rate": 9.82418322667565e-06, + "loss": 0.1218, + "step": 16670 + }, + { + "epoch": 0.53370876395866, + "grad_norm": 9.625, + "learning_rate": 9.817446951835635e-06, + "loss": 0.1358, + "step": 16680 + }, + { + "epoch": 0.5340287332416088, + "grad_norm": 2.328125, + "learning_rate": 9.810710676995621e-06, + "loss": 0.0717, + "step": 16690 + }, + { + "epoch": 0.5343487025245577, + "grad_norm": 48.25, + "learning_rate": 9.803974402155609e-06, + "loss": 0.1161, + "step": 16700 + }, + { + "epoch": 0.5346686718075064, + "grad_norm": 14.5625, + "learning_rate": 9.797238127315595e-06, + "loss": 0.0845, + "step": 16710 + }, + { + "epoch": 0.5349886410904553, + "grad_norm": 0.6875, + "learning_rate": 9.790501852475581e-06, + "loss": 0.0844, + "step": 16720 + }, + { + "epoch": 0.5353086103734042, + "grad_norm": 18.5, + "learning_rate": 9.783765577635567e-06, + "loss": 0.1187, + "step": 16730 + }, + { + "epoch": 0.535628579656353, + "grad_norm": 9.3125, + "learning_rate": 9.777029302795555e-06, + "loss": 0.0853, + "step": 16740 + }, + { + "epoch": 0.5359485489393018, + "grad_norm": 2.15625, + "learning_rate": 9.770293027955543e-06, + "loss": 0.0776, + "step": 16750 + }, + { + "epoch": 0.5362685182222506, + "grad_norm": 13.5625, + "learning_rate": 9.763556753115529e-06, + "loss": 0.0546, + "step": 16760 + }, + { + "epoch": 0.5365884875051995, + "grad_norm": 1.8203125, + "learning_rate": 9.756820478275515e-06, + "loss": 0.1381, + "step": 16770 + }, + { + "epoch": 0.5369084567881484, + "grad_norm": 8.4375, + "learning_rate": 9.7500842034355e-06, + "loss": 0.1048, + "step": 16780 + }, + { + "epoch": 0.5372284260710972, + "grad_norm": 9.5625, + "learning_rate": 9.743347928595488e-06, + "loss": 0.1545, + "step": 16790 + }, + { + "epoch": 0.537548395354046, + "grad_norm": 8.0, + "learning_rate": 9.736611653755475e-06, + "loss": 0.0584, + "step": 16800 + }, + { + "epoch": 0.5378683646369948, + "grad_norm": 11.0, + "learning_rate": 9.72987537891546e-06, + "loss": 0.127, + "step": 16810 + }, + { + "epoch": 0.5381883339199437, + "grad_norm": 16.5, + "learning_rate": 9.723139104075448e-06, + "loss": 0.1087, + "step": 16820 + }, + { + "epoch": 0.5385083032028926, + "grad_norm": 3.84375, + "learning_rate": 9.716402829235434e-06, + "loss": 0.1302, + "step": 16830 + }, + { + "epoch": 0.5388282724858413, + "grad_norm": 24.375, + "learning_rate": 9.70966655439542e-06, + "loss": 0.0874, + "step": 16840 + }, + { + "epoch": 0.5391482417687902, + "grad_norm": 17.75, + "learning_rate": 9.702930279555406e-06, + "loss": 0.0768, + "step": 16850 + }, + { + "epoch": 0.539468211051739, + "grad_norm": 10.625, + "learning_rate": 9.696194004715394e-06, + "loss": 0.1197, + "step": 16860 + }, + { + "epoch": 0.5397881803346879, + "grad_norm": 23.25, + "learning_rate": 9.68945772987538e-06, + "loss": 0.1135, + "step": 16870 + }, + { + "epoch": 0.5401081496176368, + "grad_norm": 5.375, + "learning_rate": 9.682721455035366e-06, + "loss": 0.1187, + "step": 16880 + }, + { + "epoch": 0.5404281189005855, + "grad_norm": 14.9375, + "learning_rate": 9.675985180195352e-06, + "loss": 0.1447, + "step": 16890 + }, + { + "epoch": 0.5407480881835344, + "grad_norm": 6.5, + "learning_rate": 9.66924890535534e-06, + "loss": 0.073, + "step": 16900 + }, + { + "epoch": 0.5410680574664832, + "grad_norm": 1.8203125, + "learning_rate": 9.662512630515326e-06, + "loss": 0.1211, + "step": 16910 + }, + { + "epoch": 0.5413880267494321, + "grad_norm": 10.625, + "learning_rate": 9.655776355675312e-06, + "loss": 0.103, + "step": 16920 + }, + { + "epoch": 0.5417079960323808, + "grad_norm": 2.25, + "learning_rate": 9.6490400808353e-06, + "loss": 0.1791, + "step": 16930 + }, + { + "epoch": 0.5420279653153297, + "grad_norm": 8.9375, + "learning_rate": 9.642303805995286e-06, + "loss": 0.1161, + "step": 16940 + }, + { + "epoch": 0.5423479345982786, + "grad_norm": 0.921875, + "learning_rate": 9.635567531155272e-06, + "loss": 0.1118, + "step": 16950 + }, + { + "epoch": 0.5426679038812274, + "grad_norm": 10.8125, + "learning_rate": 9.628831256315258e-06, + "loss": 0.1129, + "step": 16960 + }, + { + "epoch": 0.5429878731641763, + "grad_norm": 16.25, + "learning_rate": 9.622094981475246e-06, + "loss": 0.1488, + "step": 16970 + }, + { + "epoch": 0.543307842447125, + "grad_norm": 7.34375, + "learning_rate": 9.615358706635232e-06, + "loss": 0.1121, + "step": 16980 + }, + { + "epoch": 0.5436278117300739, + "grad_norm": 5.6875, + "learning_rate": 9.608622431795218e-06, + "loss": 0.1185, + "step": 16990 + }, + { + "epoch": 0.5439477810130228, + "grad_norm": 16.375, + "learning_rate": 9.601886156955204e-06, + "loss": 0.0769, + "step": 17000 + }, + { + "epoch": 0.5442677502959716, + "grad_norm": 12.25, + "learning_rate": 9.595149882115192e-06, + "loss": 0.1605, + "step": 17010 + }, + { + "epoch": 0.5445877195789204, + "grad_norm": 1.140625, + "learning_rate": 9.588413607275178e-06, + "loss": 0.1395, + "step": 17020 + }, + { + "epoch": 0.5449076888618692, + "grad_norm": 21.5, + "learning_rate": 9.581677332435164e-06, + "loss": 0.1563, + "step": 17030 + }, + { + "epoch": 0.5452276581448181, + "grad_norm": 22.875, + "learning_rate": 9.574941057595151e-06, + "loss": 0.1045, + "step": 17040 + }, + { + "epoch": 0.545547627427767, + "grad_norm": 20.875, + "learning_rate": 9.568204782755137e-06, + "loss": 0.175, + "step": 17050 + }, + { + "epoch": 0.5458675967107157, + "grad_norm": 23.625, + "learning_rate": 9.561468507915123e-06, + "loss": 0.1244, + "step": 17060 + }, + { + "epoch": 0.5461875659936646, + "grad_norm": 16.5, + "learning_rate": 9.55473223307511e-06, + "loss": 0.0841, + "step": 17070 + }, + { + "epoch": 0.5465075352766134, + "grad_norm": 10.625, + "learning_rate": 9.547995958235097e-06, + "loss": 0.1042, + "step": 17080 + }, + { + "epoch": 0.5468275045595623, + "grad_norm": 9.75, + "learning_rate": 9.541259683395083e-06, + "loss": 0.0513, + "step": 17090 + }, + { + "epoch": 0.5471474738425112, + "grad_norm": 10.125, + "learning_rate": 9.53452340855507e-06, + "loss": 0.1127, + "step": 17100 + }, + { + "epoch": 0.5474674431254599, + "grad_norm": 16.125, + "learning_rate": 9.527787133715057e-06, + "loss": 0.1141, + "step": 17110 + }, + { + "epoch": 0.5477874124084088, + "grad_norm": 3.6875, + "learning_rate": 9.521050858875043e-06, + "loss": 0.1045, + "step": 17120 + }, + { + "epoch": 0.5481073816913576, + "grad_norm": 4.875, + "learning_rate": 9.514314584035029e-06, + "loss": 0.0447, + "step": 17130 + }, + { + "epoch": 0.5484273509743065, + "grad_norm": 0.71875, + "learning_rate": 9.507578309195015e-06, + "loss": 0.1104, + "step": 17140 + }, + { + "epoch": 0.5487473202572553, + "grad_norm": 19.25, + "learning_rate": 9.500842034355003e-06, + "loss": 0.1749, + "step": 17150 + }, + { + "epoch": 0.5490672895402041, + "grad_norm": 15.5625, + "learning_rate": 9.494105759514989e-06, + "loss": 0.0682, + "step": 17160 + }, + { + "epoch": 0.549387258823153, + "grad_norm": 10.125, + "learning_rate": 9.487369484674975e-06, + "loss": 0.0973, + "step": 17170 + }, + { + "epoch": 0.5497072281061018, + "grad_norm": 11.9375, + "learning_rate": 9.480633209834961e-06, + "loss": 0.1071, + "step": 17180 + }, + { + "epoch": 0.5500271973890507, + "grad_norm": 12.5625, + "learning_rate": 9.473896934994949e-06, + "loss": 0.0739, + "step": 17190 + }, + { + "epoch": 0.5503471666719995, + "grad_norm": 4.59375, + "learning_rate": 9.467160660154935e-06, + "loss": 0.1195, + "step": 17200 + }, + { + "epoch": 0.5506671359549483, + "grad_norm": 13.125, + "learning_rate": 9.46042438531492e-06, + "loss": 0.1357, + "step": 17210 + }, + { + "epoch": 0.5509871052378972, + "grad_norm": 6.75, + "learning_rate": 9.453688110474909e-06, + "loss": 0.0672, + "step": 17220 + }, + { + "epoch": 0.551307074520846, + "grad_norm": 7.90625, + "learning_rate": 9.446951835634895e-06, + "loss": 0.1207, + "step": 17230 + }, + { + "epoch": 0.5516270438037948, + "grad_norm": 2.34375, + "learning_rate": 9.44021556079488e-06, + "loss": 0.135, + "step": 17240 + }, + { + "epoch": 0.5519470130867437, + "grad_norm": 10.1875, + "learning_rate": 9.433479285954867e-06, + "loss": 0.1543, + "step": 17250 + }, + { + "epoch": 0.5522669823696925, + "grad_norm": 15.125, + "learning_rate": 9.426743011114854e-06, + "loss": 0.1163, + "step": 17260 + }, + { + "epoch": 0.5525869516526414, + "grad_norm": 0.890625, + "learning_rate": 9.42000673627484e-06, + "loss": 0.084, + "step": 17270 + }, + { + "epoch": 0.5529069209355901, + "grad_norm": 32.0, + "learning_rate": 9.413270461434827e-06, + "loss": 0.0906, + "step": 17280 + }, + { + "epoch": 0.553226890218539, + "grad_norm": 28.75, + "learning_rate": 9.406534186594814e-06, + "loss": 0.092, + "step": 17290 + }, + { + "epoch": 0.5535468595014879, + "grad_norm": 2.671875, + "learning_rate": 9.3997979117548e-06, + "loss": 0.074, + "step": 17300 + }, + { + "epoch": 0.5538668287844367, + "grad_norm": 9.4375, + "learning_rate": 9.393061636914786e-06, + "loss": 0.0844, + "step": 17310 + }, + { + "epoch": 0.5541867980673856, + "grad_norm": 22.0, + "learning_rate": 9.386325362074772e-06, + "loss": 0.1496, + "step": 17320 + }, + { + "epoch": 0.5545067673503343, + "grad_norm": 1.6171875, + "learning_rate": 9.37958908723476e-06, + "loss": 0.092, + "step": 17330 + }, + { + "epoch": 0.5548267366332832, + "grad_norm": 5.875, + "learning_rate": 9.372852812394746e-06, + "loss": 0.0977, + "step": 17340 + }, + { + "epoch": 0.555146705916232, + "grad_norm": 9.875, + "learning_rate": 9.366116537554732e-06, + "loss": 0.1711, + "step": 17350 + }, + { + "epoch": 0.5554666751991809, + "grad_norm": 3.21875, + "learning_rate": 9.359380262714718e-06, + "loss": 0.1138, + "step": 17360 + }, + { + "epoch": 0.5557866444821297, + "grad_norm": 0.181640625, + "learning_rate": 9.352643987874706e-06, + "loss": 0.1155, + "step": 17370 + }, + { + "epoch": 0.5561066137650785, + "grad_norm": 13.875, + "learning_rate": 9.345907713034694e-06, + "loss": 0.1035, + "step": 17380 + }, + { + "epoch": 0.5564265830480274, + "grad_norm": 6.40625, + "learning_rate": 9.33917143819468e-06, + "loss": 0.1296, + "step": 17390 + }, + { + "epoch": 0.5567465523309763, + "grad_norm": 7.84375, + "learning_rate": 9.332435163354666e-06, + "loss": 0.0792, + "step": 17400 + }, + { + "epoch": 0.5570665216139251, + "grad_norm": 7.9375, + "learning_rate": 9.325698888514652e-06, + "loss": 0.1096, + "step": 17410 + }, + { + "epoch": 0.5573864908968739, + "grad_norm": 11.3125, + "learning_rate": 9.31896261367464e-06, + "loss": 0.1383, + "step": 17420 + }, + { + "epoch": 0.5577064601798227, + "grad_norm": 12.375, + "learning_rate": 9.312226338834626e-06, + "loss": 0.0889, + "step": 17430 + }, + { + "epoch": 0.5580264294627716, + "grad_norm": 2.421875, + "learning_rate": 9.305490063994612e-06, + "loss": 0.1013, + "step": 17440 + }, + { + "epoch": 0.5583463987457205, + "grad_norm": 20.0, + "learning_rate": 9.2987537891546e-06, + "loss": 0.0904, + "step": 17450 + }, + { + "epoch": 0.5586663680286692, + "grad_norm": 14.4375, + "learning_rate": 9.292017514314585e-06, + "loss": 0.092, + "step": 17460 + }, + { + "epoch": 0.5589863373116181, + "grad_norm": 9.875, + "learning_rate": 9.285281239474571e-06, + "loss": 0.1124, + "step": 17470 + }, + { + "epoch": 0.5593063065945669, + "grad_norm": 6.96875, + "learning_rate": 9.278544964634558e-06, + "loss": 0.0693, + "step": 17480 + }, + { + "epoch": 0.5596262758775158, + "grad_norm": 9.8125, + "learning_rate": 9.271808689794545e-06, + "loss": 0.0827, + "step": 17490 + }, + { + "epoch": 0.5599462451604647, + "grad_norm": 2.96875, + "learning_rate": 9.265072414954531e-06, + "loss": 0.0921, + "step": 17500 + }, + { + "epoch": 0.5602662144434134, + "grad_norm": 11.25, + "learning_rate": 9.258336140114517e-06, + "loss": 0.1227, + "step": 17510 + }, + { + "epoch": 0.5605861837263623, + "grad_norm": 20.0, + "learning_rate": 9.251599865274505e-06, + "loss": 0.2023, + "step": 17520 + }, + { + "epoch": 0.5609061530093111, + "grad_norm": 2.796875, + "learning_rate": 9.244863590434491e-06, + "loss": 0.0955, + "step": 17530 + }, + { + "epoch": 0.56122612229226, + "grad_norm": 12.125, + "learning_rate": 9.238127315594477e-06, + "loss": 0.0827, + "step": 17540 + }, + { + "epoch": 0.5615460915752087, + "grad_norm": 9.1875, + "learning_rate": 9.231391040754463e-06, + "loss": 0.1101, + "step": 17550 + }, + { + "epoch": 0.5618660608581576, + "grad_norm": 16.75, + "learning_rate": 9.224654765914451e-06, + "loss": 0.1245, + "step": 17560 + }, + { + "epoch": 0.5621860301411065, + "grad_norm": 6.15625, + "learning_rate": 9.217918491074437e-06, + "loss": 0.1227, + "step": 17570 + }, + { + "epoch": 0.5625059994240553, + "grad_norm": 37.25, + "learning_rate": 9.211182216234423e-06, + "loss": 0.1178, + "step": 17580 + }, + { + "epoch": 0.5628259687070041, + "grad_norm": 13.375, + "learning_rate": 9.204445941394409e-06, + "loss": 0.1369, + "step": 17590 + }, + { + "epoch": 0.5631459379899529, + "grad_norm": 2.609375, + "learning_rate": 9.197709666554397e-06, + "loss": 0.1473, + "step": 17600 + }, + { + "epoch": 0.5634659072729018, + "grad_norm": 23.75, + "learning_rate": 9.190973391714383e-06, + "loss": 0.1329, + "step": 17610 + }, + { + "epoch": 0.5637858765558507, + "grad_norm": 18.125, + "learning_rate": 9.184237116874369e-06, + "loss": 0.0889, + "step": 17620 + }, + { + "epoch": 0.5641058458387995, + "grad_norm": 3.0625, + "learning_rate": 9.177500842034357e-06, + "loss": 0.0962, + "step": 17630 + }, + { + "epoch": 0.5644258151217483, + "grad_norm": 27.0, + "learning_rate": 9.170764567194343e-06, + "loss": 0.1296, + "step": 17640 + }, + { + "epoch": 0.5647457844046971, + "grad_norm": 13.0625, + "learning_rate": 9.164028292354329e-06, + "loss": 0.1092, + "step": 17650 + }, + { + "epoch": 0.565065753687646, + "grad_norm": 9.25, + "learning_rate": 9.157292017514315e-06, + "loss": 0.1075, + "step": 17660 + }, + { + "epoch": 0.5653857229705949, + "grad_norm": 16.75, + "learning_rate": 9.150555742674302e-06, + "loss": 0.1134, + "step": 17670 + }, + { + "epoch": 0.5657056922535436, + "grad_norm": 7.6875, + "learning_rate": 9.143819467834289e-06, + "loss": 0.1229, + "step": 17680 + }, + { + "epoch": 0.5660256615364925, + "grad_norm": 12.75, + "learning_rate": 9.137083192994275e-06, + "loss": 0.1269, + "step": 17690 + }, + { + "epoch": 0.5663456308194413, + "grad_norm": 4.125, + "learning_rate": 9.130346918154262e-06, + "loss": 0.1302, + "step": 17700 + }, + { + "epoch": 0.5666656001023902, + "grad_norm": 8.75, + "learning_rate": 9.123610643314248e-06, + "loss": 0.2007, + "step": 17710 + }, + { + "epoch": 0.5669855693853391, + "grad_norm": 16.625, + "learning_rate": 9.116874368474234e-06, + "loss": 0.127, + "step": 17720 + }, + { + "epoch": 0.5673055386682878, + "grad_norm": 17.5, + "learning_rate": 9.11013809363422e-06, + "loss": 0.1337, + "step": 17730 + }, + { + "epoch": 0.5676255079512367, + "grad_norm": 0.94140625, + "learning_rate": 9.103401818794208e-06, + "loss": 0.0603, + "step": 17740 + }, + { + "epoch": 0.5679454772341855, + "grad_norm": 7.90625, + "learning_rate": 9.096665543954194e-06, + "loss": 0.07, + "step": 17750 + }, + { + "epoch": 0.5682654465171344, + "grad_norm": 0.83984375, + "learning_rate": 9.08992926911418e-06, + "loss": 0.0582, + "step": 17760 + }, + { + "epoch": 0.5685854158000831, + "grad_norm": 18.0, + "learning_rate": 9.083192994274166e-06, + "loss": 0.1592, + "step": 17770 + }, + { + "epoch": 0.568905385083032, + "grad_norm": 11.625, + "learning_rate": 9.076456719434154e-06, + "loss": 0.1247, + "step": 17780 + }, + { + "epoch": 0.5692253543659809, + "grad_norm": 9.6875, + "learning_rate": 9.06972044459414e-06, + "loss": 0.0788, + "step": 17790 + }, + { + "epoch": 0.5695453236489297, + "grad_norm": 15.75, + "learning_rate": 9.062984169754126e-06, + "loss": 0.0767, + "step": 17800 + }, + { + "epoch": 0.5698652929318785, + "grad_norm": 3.125, + "learning_rate": 9.056247894914114e-06, + "loss": 0.0748, + "step": 17810 + }, + { + "epoch": 0.5701852622148273, + "grad_norm": 0.75390625, + "learning_rate": 9.0495116200741e-06, + "loss": 0.1468, + "step": 17820 + }, + { + "epoch": 0.5705052314977762, + "grad_norm": 3.90625, + "learning_rate": 9.042775345234086e-06, + "loss": 0.0623, + "step": 17830 + }, + { + "epoch": 0.5708252007807251, + "grad_norm": 15.375, + "learning_rate": 9.036039070394072e-06, + "loss": 0.0719, + "step": 17840 + }, + { + "epoch": 0.5711451700636739, + "grad_norm": 3.484375, + "learning_rate": 9.02930279555406e-06, + "loss": 0.1176, + "step": 17850 + }, + { + "epoch": 0.5714651393466227, + "grad_norm": 30.125, + "learning_rate": 9.022566520714046e-06, + "loss": 0.075, + "step": 17860 + }, + { + "epoch": 0.5717851086295715, + "grad_norm": 26.75, + "learning_rate": 9.015830245874032e-06, + "loss": 0.093, + "step": 17870 + }, + { + "epoch": 0.5721050779125204, + "grad_norm": 1.1015625, + "learning_rate": 9.009093971034018e-06, + "loss": 0.1431, + "step": 17880 + }, + { + "epoch": 0.5724250471954693, + "grad_norm": 6.9375, + "learning_rate": 9.002357696194006e-06, + "loss": 0.1061, + "step": 17890 + }, + { + "epoch": 0.572745016478418, + "grad_norm": 4.71875, + "learning_rate": 8.995621421353992e-06, + "loss": 0.1591, + "step": 17900 + }, + { + "epoch": 0.5730649857613669, + "grad_norm": 17.25, + "learning_rate": 8.988885146513978e-06, + "loss": 0.1671, + "step": 17910 + }, + { + "epoch": 0.5733849550443157, + "grad_norm": 1.109375, + "learning_rate": 8.982148871673965e-06, + "loss": 0.087, + "step": 17920 + }, + { + "epoch": 0.5737049243272646, + "grad_norm": 10.0, + "learning_rate": 8.975412596833951e-06, + "loss": 0.0833, + "step": 17930 + }, + { + "epoch": 0.5740248936102135, + "grad_norm": 9.1875, + "learning_rate": 8.968676321993937e-06, + "loss": 0.0906, + "step": 17940 + }, + { + "epoch": 0.5743448628931622, + "grad_norm": 2.03125, + "learning_rate": 8.961940047153923e-06, + "loss": 0.1227, + "step": 17950 + }, + { + "epoch": 0.5746648321761111, + "grad_norm": 10.4375, + "learning_rate": 8.955203772313911e-06, + "loss": 0.106, + "step": 17960 + }, + { + "epoch": 0.57498480145906, + "grad_norm": 8.625, + "learning_rate": 8.948467497473897e-06, + "loss": 0.0896, + "step": 17970 + }, + { + "epoch": 0.5753047707420088, + "grad_norm": 29.125, + "learning_rate": 8.941731222633883e-06, + "loss": 0.1356, + "step": 17980 + }, + { + "epoch": 0.5756247400249576, + "grad_norm": 8.875, + "learning_rate": 8.934994947793871e-06, + "loss": 0.1414, + "step": 17990 + }, + { + "epoch": 0.5759447093079064, + "grad_norm": 10.0625, + "learning_rate": 8.928258672953857e-06, + "loss": 0.1731, + "step": 18000 + }, + { + "epoch": 0.5762646785908553, + "grad_norm": 2.015625, + "learning_rate": 8.921522398113843e-06, + "loss": 0.1153, + "step": 18010 + }, + { + "epoch": 0.5765846478738041, + "grad_norm": 10.25, + "learning_rate": 8.91478612327383e-06, + "loss": 0.1249, + "step": 18020 + }, + { + "epoch": 0.5769046171567529, + "grad_norm": 9.9375, + "learning_rate": 8.908049848433817e-06, + "loss": 0.0728, + "step": 18030 + }, + { + "epoch": 0.5772245864397018, + "grad_norm": 12.375, + "learning_rate": 8.901313573593805e-06, + "loss": 0.119, + "step": 18040 + }, + { + "epoch": 0.5775445557226506, + "grad_norm": 6.03125, + "learning_rate": 8.89457729875379e-06, + "loss": 0.1275, + "step": 18050 + }, + { + "epoch": 0.5778645250055995, + "grad_norm": 6.28125, + "learning_rate": 8.887841023913777e-06, + "loss": 0.0936, + "step": 18060 + }, + { + "epoch": 0.5781844942885483, + "grad_norm": 6.84375, + "learning_rate": 8.881104749073763e-06, + "loss": 0.1294, + "step": 18070 + }, + { + "epoch": 0.5785044635714971, + "grad_norm": 17.25, + "learning_rate": 8.87436847423375e-06, + "loss": 0.1323, + "step": 18080 + }, + { + "epoch": 0.578824432854446, + "grad_norm": 2.53125, + "learning_rate": 8.867632199393737e-06, + "loss": 0.0497, + "step": 18090 + }, + { + "epoch": 0.5791444021373948, + "grad_norm": 8.8125, + "learning_rate": 8.860895924553723e-06, + "loss": 0.0782, + "step": 18100 + }, + { + "epoch": 0.5794643714203437, + "grad_norm": 0.55859375, + "learning_rate": 8.854159649713709e-06, + "loss": 0.0648, + "step": 18110 + }, + { + "epoch": 0.5797843407032924, + "grad_norm": 11.8125, + "learning_rate": 8.847423374873696e-06, + "loss": 0.0994, + "step": 18120 + }, + { + "epoch": 0.5801043099862413, + "grad_norm": 0.66796875, + "learning_rate": 8.840687100033682e-06, + "loss": 0.1044, + "step": 18130 + }, + { + "epoch": 0.5804242792691902, + "grad_norm": 6.71875, + "learning_rate": 8.833950825193668e-06, + "loss": 0.1147, + "step": 18140 + }, + { + "epoch": 0.580744248552139, + "grad_norm": 4.5, + "learning_rate": 8.827214550353656e-06, + "loss": 0.1071, + "step": 18150 + }, + { + "epoch": 0.5810642178350879, + "grad_norm": 5.53125, + "learning_rate": 8.820478275513642e-06, + "loss": 0.0716, + "step": 18160 + }, + { + "epoch": 0.5813841871180366, + "grad_norm": 13.8125, + "learning_rate": 8.813742000673628e-06, + "loss": 0.1637, + "step": 18170 + }, + { + "epoch": 0.5817041564009855, + "grad_norm": 0.7578125, + "learning_rate": 8.807005725833614e-06, + "loss": 0.0763, + "step": 18180 + }, + { + "epoch": 0.5820241256839344, + "grad_norm": 35.25, + "learning_rate": 8.800269450993602e-06, + "loss": 0.1926, + "step": 18190 + }, + { + "epoch": 0.5823440949668832, + "grad_norm": 3.96875, + "learning_rate": 8.793533176153588e-06, + "loss": 0.1365, + "step": 18200 + }, + { + "epoch": 0.582664064249832, + "grad_norm": 17.625, + "learning_rate": 8.786796901313574e-06, + "loss": 0.1747, + "step": 18210 + }, + { + "epoch": 0.5829840335327808, + "grad_norm": 4.28125, + "learning_rate": 8.780060626473562e-06, + "loss": 0.0777, + "step": 18220 + }, + { + "epoch": 0.5833040028157297, + "grad_norm": 7.28125, + "learning_rate": 8.773324351633548e-06, + "loss": 0.172, + "step": 18230 + }, + { + "epoch": 0.5836239720986786, + "grad_norm": 1.2734375, + "learning_rate": 8.766588076793534e-06, + "loss": 0.0922, + "step": 18240 + }, + { + "epoch": 0.5839439413816274, + "grad_norm": 10.9375, + "learning_rate": 8.75985180195352e-06, + "loss": 0.1585, + "step": 18250 + }, + { + "epoch": 0.5842639106645762, + "grad_norm": 18.625, + "learning_rate": 8.753115527113508e-06, + "loss": 0.195, + "step": 18260 + }, + { + "epoch": 0.584583879947525, + "grad_norm": 39.75, + "learning_rate": 8.746379252273494e-06, + "loss": 0.1074, + "step": 18270 + }, + { + "epoch": 0.5849038492304739, + "grad_norm": 15.6875, + "learning_rate": 8.73964297743348e-06, + "loss": 0.1355, + "step": 18280 + }, + { + "epoch": 0.5852238185134228, + "grad_norm": 8.25, + "learning_rate": 8.732906702593466e-06, + "loss": 0.1012, + "step": 18290 + }, + { + "epoch": 0.5855437877963715, + "grad_norm": 12.75, + "learning_rate": 8.726170427753454e-06, + "loss": 0.1346, + "step": 18300 + }, + { + "epoch": 0.5858637570793204, + "grad_norm": 12.125, + "learning_rate": 8.71943415291344e-06, + "loss": 0.0888, + "step": 18310 + }, + { + "epoch": 0.5861837263622692, + "grad_norm": 4.6875, + "learning_rate": 8.712697878073426e-06, + "loss": 0.0843, + "step": 18320 + }, + { + "epoch": 0.5865036956452181, + "grad_norm": 11.1875, + "learning_rate": 8.705961603233413e-06, + "loss": 0.1001, + "step": 18330 + }, + { + "epoch": 0.5868236649281668, + "grad_norm": 12.0625, + "learning_rate": 8.6992253283934e-06, + "loss": 0.0906, + "step": 18340 + }, + { + "epoch": 0.5871436342111157, + "grad_norm": 13.5, + "learning_rate": 8.692489053553385e-06, + "loss": 0.0789, + "step": 18350 + }, + { + "epoch": 0.5874636034940646, + "grad_norm": 43.5, + "learning_rate": 8.685752778713372e-06, + "loss": 0.1321, + "step": 18360 + }, + { + "epoch": 0.5877835727770134, + "grad_norm": 12.1875, + "learning_rate": 8.67901650387336e-06, + "loss": 0.1096, + "step": 18370 + }, + { + "epoch": 0.5881035420599623, + "grad_norm": 3.875, + "learning_rate": 8.672280229033345e-06, + "loss": 0.1238, + "step": 18380 + }, + { + "epoch": 0.588423511342911, + "grad_norm": 11.0625, + "learning_rate": 8.665543954193331e-06, + "loss": 0.1241, + "step": 18390 + }, + { + "epoch": 0.5887434806258599, + "grad_norm": 7.65625, + "learning_rate": 8.658807679353319e-06, + "loss": 0.0787, + "step": 18400 + }, + { + "epoch": 0.5890634499088088, + "grad_norm": 6.75, + "learning_rate": 8.652071404513305e-06, + "loss": 0.1444, + "step": 18410 + }, + { + "epoch": 0.5893834191917576, + "grad_norm": 7.84375, + "learning_rate": 8.645335129673291e-06, + "loss": 0.0956, + "step": 18420 + }, + { + "epoch": 0.5897033884747064, + "grad_norm": 11.875, + "learning_rate": 8.638598854833277e-06, + "loss": 0.0662, + "step": 18430 + }, + { + "epoch": 0.5900233577576552, + "grad_norm": 0.546875, + "learning_rate": 8.631862579993265e-06, + "loss": 0.0972, + "step": 18440 + }, + { + "epoch": 0.5903433270406041, + "grad_norm": 0.515625, + "learning_rate": 8.625126305153251e-06, + "loss": 0.1032, + "step": 18450 + }, + { + "epoch": 0.590663296323553, + "grad_norm": 48.75, + "learning_rate": 8.618390030313237e-06, + "loss": 0.1035, + "step": 18460 + }, + { + "epoch": 0.5909832656065018, + "grad_norm": 12.25, + "learning_rate": 8.611653755473223e-06, + "loss": 0.1245, + "step": 18470 + }, + { + "epoch": 0.5913032348894506, + "grad_norm": 4.0625, + "learning_rate": 8.60491748063321e-06, + "loss": 0.1371, + "step": 18480 + }, + { + "epoch": 0.5916232041723994, + "grad_norm": 13.125, + "learning_rate": 8.598181205793197e-06, + "loss": 0.1607, + "step": 18490 + }, + { + "epoch": 0.5919431734553483, + "grad_norm": 12.375, + "learning_rate": 8.591444930953183e-06, + "loss": 0.14, + "step": 18500 + }, + { + "epoch": 0.5922631427382972, + "grad_norm": 4.4375, + "learning_rate": 8.58470865611317e-06, + "loss": 0.1604, + "step": 18510 + }, + { + "epoch": 0.5925831120212459, + "grad_norm": 12.6875, + "learning_rate": 8.577972381273157e-06, + "loss": 0.1063, + "step": 18520 + }, + { + "epoch": 0.5929030813041948, + "grad_norm": 7.6875, + "learning_rate": 8.571236106433143e-06, + "loss": 0.14, + "step": 18530 + }, + { + "epoch": 0.5932230505871436, + "grad_norm": 10.875, + "learning_rate": 8.564499831593129e-06, + "loss": 0.1498, + "step": 18540 + }, + { + "epoch": 0.5935430198700925, + "grad_norm": 10.625, + "learning_rate": 8.557763556753116e-06, + "loss": 0.0913, + "step": 18550 + }, + { + "epoch": 0.5938629891530413, + "grad_norm": 4.0, + "learning_rate": 8.551027281913102e-06, + "loss": 0.103, + "step": 18560 + }, + { + "epoch": 0.5941829584359901, + "grad_norm": 3.875, + "learning_rate": 8.544291007073089e-06, + "loss": 0.0771, + "step": 18570 + }, + { + "epoch": 0.594502927718939, + "grad_norm": 15.125, + "learning_rate": 8.537554732233075e-06, + "loss": 0.0844, + "step": 18580 + }, + { + "epoch": 0.5948228970018878, + "grad_norm": 20.0, + "learning_rate": 8.530818457393062e-06, + "loss": 0.118, + "step": 18590 + }, + { + "epoch": 0.5951428662848367, + "grad_norm": 19.875, + "learning_rate": 8.524082182553048e-06, + "loss": 0.1205, + "step": 18600 + }, + { + "epoch": 0.5954628355677855, + "grad_norm": 7.84375, + "learning_rate": 8.517345907713034e-06, + "loss": 0.1637, + "step": 18610 + }, + { + "epoch": 0.5957828048507343, + "grad_norm": 13.8125, + "learning_rate": 8.510609632873022e-06, + "loss": 0.1474, + "step": 18620 + }, + { + "epoch": 0.5961027741336832, + "grad_norm": 12.875, + "learning_rate": 8.503873358033008e-06, + "loss": 0.0772, + "step": 18630 + }, + { + "epoch": 0.596422743416632, + "grad_norm": 7.0, + "learning_rate": 8.497137083192994e-06, + "loss": 0.1206, + "step": 18640 + }, + { + "epoch": 0.5967427126995808, + "grad_norm": 11.8125, + "learning_rate": 8.49040080835298e-06, + "loss": 0.0877, + "step": 18650 + }, + { + "epoch": 0.5970626819825297, + "grad_norm": 0.5390625, + "learning_rate": 8.483664533512968e-06, + "loss": 0.1119, + "step": 18660 + }, + { + "epoch": 0.5973826512654785, + "grad_norm": 30.625, + "learning_rate": 8.476928258672956e-06, + "loss": 0.088, + "step": 18670 + }, + { + "epoch": 0.5977026205484274, + "grad_norm": 3.03125, + "learning_rate": 8.470191983832942e-06, + "loss": 0.0467, + "step": 18680 + }, + { + "epoch": 0.5980225898313762, + "grad_norm": 5.28125, + "learning_rate": 8.463455708992928e-06, + "loss": 0.0885, + "step": 18690 + }, + { + "epoch": 0.598342559114325, + "grad_norm": 1.421875, + "learning_rate": 8.456719434152914e-06, + "loss": 0.0802, + "step": 18700 + }, + { + "epoch": 0.5986625283972739, + "grad_norm": 1.6328125, + "learning_rate": 8.449983159312902e-06, + "loss": 0.0619, + "step": 18710 + }, + { + "epoch": 0.5989824976802227, + "grad_norm": 5.9375, + "learning_rate": 8.443246884472888e-06, + "loss": 0.0996, + "step": 18720 + }, + { + "epoch": 0.5993024669631716, + "grad_norm": 13.875, + "learning_rate": 8.436510609632874e-06, + "loss": 0.2121, + "step": 18730 + }, + { + "epoch": 0.5996224362461203, + "grad_norm": 10.5625, + "learning_rate": 8.429774334792861e-06, + "loss": 0.1205, + "step": 18740 + }, + { + "epoch": 0.5999424055290692, + "grad_norm": 5.03125, + "learning_rate": 8.423038059952847e-06, + "loss": 0.1047, + "step": 18750 + }, + { + "epoch": 0.600262374812018, + "grad_norm": 13.1875, + "learning_rate": 8.416301785112833e-06, + "loss": 0.1272, + "step": 18760 + }, + { + "epoch": 0.6005823440949669, + "grad_norm": 1.2734375, + "learning_rate": 8.40956551027282e-06, + "loss": 0.084, + "step": 18770 + }, + { + "epoch": 0.6009023133779158, + "grad_norm": 19.875, + "learning_rate": 8.402829235432807e-06, + "loss": 0.1013, + "step": 18780 + }, + { + "epoch": 0.6012222826608645, + "grad_norm": 10.0625, + "learning_rate": 8.396092960592793e-06, + "loss": 0.0841, + "step": 18790 + }, + { + "epoch": 0.6015422519438134, + "grad_norm": 14.0, + "learning_rate": 8.38935668575278e-06, + "loss": 0.1079, + "step": 18800 + }, + { + "epoch": 0.6018622212267623, + "grad_norm": 8.875, + "learning_rate": 8.382620410912767e-06, + "loss": 0.0743, + "step": 18810 + }, + { + "epoch": 0.6021821905097111, + "grad_norm": 22.875, + "learning_rate": 8.375884136072753e-06, + "loss": 0.1192, + "step": 18820 + }, + { + "epoch": 0.6025021597926599, + "grad_norm": 23.75, + "learning_rate": 8.36914786123274e-06, + "loss": 0.0997, + "step": 18830 + }, + { + "epoch": 0.6028221290756087, + "grad_norm": 10.6875, + "learning_rate": 8.362411586392725e-06, + "loss": 0.0791, + "step": 18840 + }, + { + "epoch": 0.6031420983585576, + "grad_norm": 9.125, + "learning_rate": 8.355675311552713e-06, + "loss": 0.1361, + "step": 18850 + }, + { + "epoch": 0.6034620676415064, + "grad_norm": 11.9375, + "learning_rate": 8.348939036712699e-06, + "loss": 0.0868, + "step": 18860 + }, + { + "epoch": 0.6037820369244552, + "grad_norm": 9.0, + "learning_rate": 8.342202761872685e-06, + "loss": 0.1216, + "step": 18870 + }, + { + "epoch": 0.6041020062074041, + "grad_norm": 14.875, + "learning_rate": 8.335466487032671e-06, + "loss": 0.1298, + "step": 18880 + }, + { + "epoch": 0.6044219754903529, + "grad_norm": 1.15625, + "learning_rate": 8.328730212192659e-06, + "loss": 0.1657, + "step": 18890 + }, + { + "epoch": 0.6047419447733018, + "grad_norm": 1.1640625, + "learning_rate": 8.321993937352645e-06, + "loss": 0.113, + "step": 18900 + }, + { + "epoch": 0.6050619140562506, + "grad_norm": 9.625, + "learning_rate": 8.315257662512631e-06, + "loss": 0.0905, + "step": 18910 + }, + { + "epoch": 0.6053818833391994, + "grad_norm": 18.375, + "learning_rate": 8.308521387672619e-06, + "loss": 0.1088, + "step": 18920 + }, + { + "epoch": 0.6057018526221483, + "grad_norm": 1.625, + "learning_rate": 8.301785112832605e-06, + "loss": 0.0995, + "step": 18930 + }, + { + "epoch": 0.6060218219050971, + "grad_norm": 9.875, + "learning_rate": 8.29504883799259e-06, + "loss": 0.1592, + "step": 18940 + }, + { + "epoch": 0.606341791188046, + "grad_norm": 13.25, + "learning_rate": 8.288312563152577e-06, + "loss": 0.1075, + "step": 18950 + }, + { + "epoch": 0.6066617604709947, + "grad_norm": 12.4375, + "learning_rate": 8.281576288312564e-06, + "loss": 0.1155, + "step": 18960 + }, + { + "epoch": 0.6069817297539436, + "grad_norm": 4.625, + "learning_rate": 8.27484001347255e-06, + "loss": 0.1474, + "step": 18970 + }, + { + "epoch": 0.6073016990368925, + "grad_norm": 9.5, + "learning_rate": 8.268103738632537e-06, + "loss": 0.0962, + "step": 18980 + }, + { + "epoch": 0.6076216683198413, + "grad_norm": 21.625, + "learning_rate": 8.261367463792523e-06, + "loss": 0.1287, + "step": 18990 + }, + { + "epoch": 0.6079416376027902, + "grad_norm": 8.625, + "learning_rate": 8.25463118895251e-06, + "loss": 0.0784, + "step": 19000 + }, + { + "epoch": 0.6082616068857389, + "grad_norm": 7.875, + "learning_rate": 8.247894914112496e-06, + "loss": 0.1569, + "step": 19010 + }, + { + "epoch": 0.6085815761686878, + "grad_norm": 7.96875, + "learning_rate": 8.241158639272482e-06, + "loss": 0.0737, + "step": 19020 + }, + { + "epoch": 0.6089015454516367, + "grad_norm": 16.375, + "learning_rate": 8.23442236443247e-06, + "loss": 0.116, + "step": 19030 + }, + { + "epoch": 0.6092215147345855, + "grad_norm": 1.234375, + "learning_rate": 8.227686089592456e-06, + "loss": 0.098, + "step": 19040 + }, + { + "epoch": 0.6095414840175343, + "grad_norm": 22.375, + "learning_rate": 8.220949814752442e-06, + "loss": 0.1514, + "step": 19050 + }, + { + "epoch": 0.6098614533004831, + "grad_norm": 24.5, + "learning_rate": 8.214213539912428e-06, + "loss": 0.1491, + "step": 19060 + }, + { + "epoch": 0.610181422583432, + "grad_norm": 0.58984375, + "learning_rate": 8.207477265072416e-06, + "loss": 0.1118, + "step": 19070 + }, + { + "epoch": 0.6105013918663809, + "grad_norm": 9.5, + "learning_rate": 8.200740990232402e-06, + "loss": 0.1323, + "step": 19080 + }, + { + "epoch": 0.6108213611493296, + "grad_norm": 19.5, + "learning_rate": 8.194004715392388e-06, + "loss": 0.1056, + "step": 19090 + }, + { + "epoch": 0.6111413304322785, + "grad_norm": 1.2421875, + "learning_rate": 8.187268440552376e-06, + "loss": 0.0918, + "step": 19100 + }, + { + "epoch": 0.6114612997152273, + "grad_norm": 22.25, + "learning_rate": 8.180532165712362e-06, + "loss": 0.1669, + "step": 19110 + }, + { + "epoch": 0.6117812689981762, + "grad_norm": 9.9375, + "learning_rate": 8.173795890872348e-06, + "loss": 0.0863, + "step": 19120 + }, + { + "epoch": 0.6121012382811251, + "grad_norm": 15.375, + "learning_rate": 8.167059616032334e-06, + "loss": 0.1289, + "step": 19130 + }, + { + "epoch": 0.6124212075640738, + "grad_norm": 6.84375, + "learning_rate": 8.160323341192322e-06, + "loss": 0.0885, + "step": 19140 + }, + { + "epoch": 0.6127411768470227, + "grad_norm": 5.46875, + "learning_rate": 8.153587066352308e-06, + "loss": 0.0948, + "step": 19150 + }, + { + "epoch": 0.6130611461299715, + "grad_norm": 9.125, + "learning_rate": 8.146850791512294e-06, + "loss": 0.1265, + "step": 19160 + }, + { + "epoch": 0.6133811154129204, + "grad_norm": 6.53125, + "learning_rate": 8.14011451667228e-06, + "loss": 0.1113, + "step": 19170 + }, + { + "epoch": 0.6137010846958691, + "grad_norm": 15.9375, + "learning_rate": 8.133378241832268e-06, + "loss": 0.1489, + "step": 19180 + }, + { + "epoch": 0.614021053978818, + "grad_norm": 4.5, + "learning_rate": 8.126641966992254e-06, + "loss": 0.0789, + "step": 19190 + }, + { + "epoch": 0.6143410232617669, + "grad_norm": 16.25, + "learning_rate": 8.11990569215224e-06, + "loss": 0.0689, + "step": 19200 + }, + { + "epoch": 0.6146609925447157, + "grad_norm": 10.625, + "learning_rate": 8.113169417312227e-06, + "loss": 0.067, + "step": 19210 + }, + { + "epoch": 0.6149809618276646, + "grad_norm": 14.25, + "learning_rate": 8.106433142472213e-06, + "loss": 0.1397, + "step": 19220 + }, + { + "epoch": 0.6153009311106133, + "grad_norm": 8.3125, + "learning_rate": 8.0996968676322e-06, + "loss": 0.0818, + "step": 19230 + }, + { + "epoch": 0.6156209003935622, + "grad_norm": 8.875, + "learning_rate": 8.092960592792185e-06, + "loss": 0.1415, + "step": 19240 + }, + { + "epoch": 0.6159408696765111, + "grad_norm": 14.625, + "learning_rate": 8.086224317952173e-06, + "loss": 0.0961, + "step": 19250 + }, + { + "epoch": 0.6162608389594599, + "grad_norm": 6.71875, + "learning_rate": 8.07948804311216e-06, + "loss": 0.1307, + "step": 19260 + }, + { + "epoch": 0.6165808082424087, + "grad_norm": 4.3125, + "learning_rate": 8.072751768272145e-06, + "loss": 0.1551, + "step": 19270 + }, + { + "epoch": 0.6169007775253575, + "grad_norm": 5.53125, + "learning_rate": 8.066015493432131e-06, + "loss": 0.0824, + "step": 19280 + }, + { + "epoch": 0.6172207468083064, + "grad_norm": 16.875, + "learning_rate": 8.059279218592119e-06, + "loss": 0.1849, + "step": 19290 + }, + { + "epoch": 0.6175407160912553, + "grad_norm": 13.5, + "learning_rate": 8.052542943752105e-06, + "loss": 0.0824, + "step": 19300 + }, + { + "epoch": 0.617860685374204, + "grad_norm": 9.9375, + "learning_rate": 8.045806668912091e-06, + "loss": 0.1108, + "step": 19310 + }, + { + "epoch": 0.6181806546571529, + "grad_norm": 7.96875, + "learning_rate": 8.039070394072079e-06, + "loss": 0.0684, + "step": 19320 + }, + { + "epoch": 0.6185006239401017, + "grad_norm": 31.0, + "learning_rate": 8.032334119232067e-06, + "loss": 0.0614, + "step": 19330 + }, + { + "epoch": 0.6188205932230506, + "grad_norm": 3.171875, + "learning_rate": 8.025597844392053e-06, + "loss": 0.0985, + "step": 19340 + }, + { + "epoch": 0.6191405625059995, + "grad_norm": 17.0, + "learning_rate": 8.018861569552039e-06, + "loss": 0.0931, + "step": 19350 + }, + { + "epoch": 0.6194605317889482, + "grad_norm": 10.625, + "learning_rate": 8.012125294712025e-06, + "loss": 0.1456, + "step": 19360 + }, + { + "epoch": 0.6197805010718971, + "grad_norm": 2.5625, + "learning_rate": 8.005389019872012e-06, + "loss": 0.1399, + "step": 19370 + }, + { + "epoch": 0.620100470354846, + "grad_norm": 15.6875, + "learning_rate": 7.998652745031999e-06, + "loss": 0.1441, + "step": 19380 + }, + { + "epoch": 0.6204204396377948, + "grad_norm": 5.09375, + "learning_rate": 7.991916470191985e-06, + "loss": 0.0776, + "step": 19390 + }, + { + "epoch": 0.6207404089207436, + "grad_norm": 1.6171875, + "learning_rate": 7.98518019535197e-06, + "loss": 0.0576, + "step": 19400 + }, + { + "epoch": 0.6210603782036924, + "grad_norm": 5.46875, + "learning_rate": 7.978443920511958e-06, + "loss": 0.1035, + "step": 19410 + }, + { + "epoch": 0.6213803474866413, + "grad_norm": 12.375, + "learning_rate": 7.971707645671944e-06, + "loss": 0.1826, + "step": 19420 + }, + { + "epoch": 0.6217003167695901, + "grad_norm": 8.0, + "learning_rate": 7.96497137083193e-06, + "loss": 0.1464, + "step": 19430 + }, + { + "epoch": 0.622020286052539, + "grad_norm": 0.765625, + "learning_rate": 7.958235095991918e-06, + "loss": 0.1282, + "step": 19440 + }, + { + "epoch": 0.6223402553354878, + "grad_norm": 15.6875, + "learning_rate": 7.951498821151904e-06, + "loss": 0.0828, + "step": 19450 + }, + { + "epoch": 0.6226602246184366, + "grad_norm": 26.25, + "learning_rate": 7.94476254631189e-06, + "loss": 0.171, + "step": 19460 + }, + { + "epoch": 0.6229801939013855, + "grad_norm": 20.75, + "learning_rate": 7.938026271471876e-06, + "loss": 0.0819, + "step": 19470 + }, + { + "epoch": 0.6233001631843343, + "grad_norm": 1.140625, + "learning_rate": 7.931289996631864e-06, + "loss": 0.1237, + "step": 19480 + }, + { + "epoch": 0.6236201324672831, + "grad_norm": 14.8125, + "learning_rate": 7.92455372179185e-06, + "loss": 0.1599, + "step": 19490 + }, + { + "epoch": 0.623940101750232, + "grad_norm": 4.78125, + "learning_rate": 7.917817446951836e-06, + "loss": 0.0942, + "step": 19500 + }, + { + "epoch": 0.6242600710331808, + "grad_norm": 4.25, + "learning_rate": 7.911081172111824e-06, + "loss": 0.1062, + "step": 19510 + }, + { + "epoch": 0.6245800403161297, + "grad_norm": 10.3125, + "learning_rate": 7.90434489727181e-06, + "loss": 0.1284, + "step": 19520 + }, + { + "epoch": 0.6249000095990785, + "grad_norm": 3.390625, + "learning_rate": 7.897608622431796e-06, + "loss": 0.0548, + "step": 19530 + }, + { + "epoch": 0.6252199788820273, + "grad_norm": 10.125, + "learning_rate": 7.890872347591782e-06, + "loss": 0.1392, + "step": 19540 + }, + { + "epoch": 0.6255399481649762, + "grad_norm": 1.9296875, + "learning_rate": 7.88413607275177e-06, + "loss": 0.0849, + "step": 19550 + }, + { + "epoch": 0.625859917447925, + "grad_norm": 0.62890625, + "learning_rate": 7.877399797911756e-06, + "loss": 0.0724, + "step": 19560 + }, + { + "epoch": 0.6261798867308739, + "grad_norm": 15.1875, + "learning_rate": 7.870663523071742e-06, + "loss": 0.1187, + "step": 19570 + }, + { + "epoch": 0.6264998560138226, + "grad_norm": 15.0, + "learning_rate": 7.863927248231728e-06, + "loss": 0.1365, + "step": 19580 + }, + { + "epoch": 0.6268198252967715, + "grad_norm": 12.6875, + "learning_rate": 7.857190973391716e-06, + "loss": 0.1425, + "step": 19590 + }, + { + "epoch": 0.6271397945797204, + "grad_norm": 17.875, + "learning_rate": 7.850454698551702e-06, + "loss": 0.1039, + "step": 19600 + }, + { + "epoch": 0.6274597638626692, + "grad_norm": 4.4375, + "learning_rate": 7.843718423711688e-06, + "loss": 0.0568, + "step": 19610 + }, + { + "epoch": 0.627779733145618, + "grad_norm": 19.625, + "learning_rate": 7.836982148871675e-06, + "loss": 0.0746, + "step": 19620 + }, + { + "epoch": 0.6280997024285668, + "grad_norm": 14.1875, + "learning_rate": 7.830245874031661e-06, + "loss": 0.1533, + "step": 19630 + }, + { + "epoch": 0.6284196717115157, + "grad_norm": 0.478515625, + "learning_rate": 7.823509599191647e-06, + "loss": 0.1254, + "step": 19640 + }, + { + "epoch": 0.6287396409944646, + "grad_norm": 16.125, + "learning_rate": 7.816773324351634e-06, + "loss": 0.0725, + "step": 19650 + }, + { + "epoch": 0.6290596102774134, + "grad_norm": 19.375, + "learning_rate": 7.810037049511621e-06, + "loss": 0.1433, + "step": 19660 + }, + { + "epoch": 0.6293795795603622, + "grad_norm": 31.125, + "learning_rate": 7.803300774671607e-06, + "loss": 0.1035, + "step": 19670 + }, + { + "epoch": 0.629699548843311, + "grad_norm": 3.09375, + "learning_rate": 7.796564499831593e-06, + "loss": 0.1315, + "step": 19680 + }, + { + "epoch": 0.6300195181262599, + "grad_norm": 10.625, + "learning_rate": 7.78982822499158e-06, + "loss": 0.1518, + "step": 19690 + }, + { + "epoch": 0.6303394874092088, + "grad_norm": 4.65625, + "learning_rate": 7.783091950151567e-06, + "loss": 0.0929, + "step": 19700 + }, + { + "epoch": 0.6306594566921575, + "grad_norm": 7.84375, + "learning_rate": 7.776355675311553e-06, + "loss": 0.1516, + "step": 19710 + }, + { + "epoch": 0.6309794259751064, + "grad_norm": 6.3125, + "learning_rate": 7.76961940047154e-06, + "loss": 0.092, + "step": 19720 + }, + { + "epoch": 0.6312993952580552, + "grad_norm": 7.34375, + "learning_rate": 7.762883125631527e-06, + "loss": 0.059, + "step": 19730 + }, + { + "epoch": 0.6316193645410041, + "grad_norm": 2.578125, + "learning_rate": 7.756146850791513e-06, + "loss": 0.1217, + "step": 19740 + }, + { + "epoch": 0.631939333823953, + "grad_norm": 0.88671875, + "learning_rate": 7.749410575951499e-06, + "loss": 0.0466, + "step": 19750 + }, + { + "epoch": 0.6322593031069017, + "grad_norm": 2.703125, + "learning_rate": 7.742674301111485e-06, + "loss": 0.1331, + "step": 19760 + }, + { + "epoch": 0.6325792723898506, + "grad_norm": 27.0, + "learning_rate": 7.735938026271473e-06, + "loss": 0.1202, + "step": 19770 + }, + { + "epoch": 0.6328992416727994, + "grad_norm": 3.9375, + "learning_rate": 7.729201751431459e-06, + "loss": 0.0823, + "step": 19780 + }, + { + "epoch": 0.6332192109557483, + "grad_norm": 17.625, + "learning_rate": 7.722465476591445e-06, + "loss": 0.1079, + "step": 19790 + }, + { + "epoch": 0.633539180238697, + "grad_norm": 8.9375, + "learning_rate": 7.715729201751433e-06, + "loss": 0.1229, + "step": 19800 + }, + { + "epoch": 0.6338591495216459, + "grad_norm": 14.0, + "learning_rate": 7.708992926911419e-06, + "loss": 0.1233, + "step": 19810 + }, + { + "epoch": 0.6341791188045948, + "grad_norm": 7.09375, + "learning_rate": 7.702256652071405e-06, + "loss": 0.082, + "step": 19820 + }, + { + "epoch": 0.6344990880875436, + "grad_norm": 1.2421875, + "learning_rate": 7.69552037723139e-06, + "loss": 0.0586, + "step": 19830 + }, + { + "epoch": 0.6348190573704924, + "grad_norm": 4.1875, + "learning_rate": 7.688784102391378e-06, + "loss": 0.0743, + "step": 19840 + }, + { + "epoch": 0.6351390266534412, + "grad_norm": 7.15625, + "learning_rate": 7.682047827551365e-06, + "loss": 0.1212, + "step": 19850 + }, + { + "epoch": 0.6354589959363901, + "grad_norm": 17.875, + "learning_rate": 7.67531155271135e-06, + "loss": 0.0919, + "step": 19860 + }, + { + "epoch": 0.635778965219339, + "grad_norm": 1.03125, + "learning_rate": 7.668575277871337e-06, + "loss": 0.1626, + "step": 19870 + }, + { + "epoch": 0.6360989345022878, + "grad_norm": 8.1875, + "learning_rate": 7.661839003031324e-06, + "loss": 0.1455, + "step": 19880 + }, + { + "epoch": 0.6364189037852366, + "grad_norm": 20.125, + "learning_rate": 7.65510272819131e-06, + "loss": 0.1018, + "step": 19890 + }, + { + "epoch": 0.6367388730681854, + "grad_norm": 22.0, + "learning_rate": 7.648366453351296e-06, + "loss": 0.1173, + "step": 19900 + }, + { + "epoch": 0.6370588423511343, + "grad_norm": 1.2734375, + "learning_rate": 7.641630178511284e-06, + "loss": 0.1276, + "step": 19910 + }, + { + "epoch": 0.6373788116340832, + "grad_norm": 8.375, + "learning_rate": 7.63489390367127e-06, + "loss": 0.1154, + "step": 19920 + }, + { + "epoch": 0.6376987809170319, + "grad_norm": 10.25, + "learning_rate": 7.628157628831256e-06, + "loss": 0.0659, + "step": 19930 + }, + { + "epoch": 0.6380187501999808, + "grad_norm": 9.75, + "learning_rate": 7.621421353991243e-06, + "loss": 0.1202, + "step": 19940 + }, + { + "epoch": 0.6383387194829296, + "grad_norm": 14.9375, + "learning_rate": 7.614685079151229e-06, + "loss": 0.1326, + "step": 19950 + }, + { + "epoch": 0.6386586887658785, + "grad_norm": 16.125, + "learning_rate": 7.607948804311217e-06, + "loss": 0.102, + "step": 19960 + }, + { + "epoch": 0.6389786580488274, + "grad_norm": 20.75, + "learning_rate": 7.601212529471204e-06, + "loss": 0.1642, + "step": 19970 + }, + { + "epoch": 0.6392986273317761, + "grad_norm": 10.0, + "learning_rate": 7.59447625463119e-06, + "loss": 0.086, + "step": 19980 + }, + { + "epoch": 0.639618596614725, + "grad_norm": 10.0625, + "learning_rate": 7.587739979791177e-06, + "loss": 0.178, + "step": 19990 + }, + { + "epoch": 0.6399385658976738, + "grad_norm": 15.3125, + "learning_rate": 7.581003704951163e-06, + "loss": 0.136, + "step": 20000 + }, + { + "epoch": 0.6402585351806227, + "grad_norm": 0.4296875, + "learning_rate": 7.57426743011115e-06, + "loss": 0.0693, + "step": 20010 + }, + { + "epoch": 0.6405785044635715, + "grad_norm": 0.96875, + "learning_rate": 7.5675311552711365e-06, + "loss": 0.0695, + "step": 20020 + }, + { + "epoch": 0.6408984737465203, + "grad_norm": 20.375, + "learning_rate": 7.5607948804311226e-06, + "loss": 0.0644, + "step": 20030 + }, + { + "epoch": 0.6412184430294692, + "grad_norm": 4.75, + "learning_rate": 7.5540586055911095e-06, + "loss": 0.1316, + "step": 20040 + }, + { + "epoch": 0.641538412312418, + "grad_norm": 6.1875, + "learning_rate": 7.5473223307510955e-06, + "loss": 0.1294, + "step": 20050 + }, + { + "epoch": 0.6418583815953669, + "grad_norm": 6.5625, + "learning_rate": 7.540586055911082e-06, + "loss": 0.049, + "step": 20060 + }, + { + "epoch": 0.6421783508783157, + "grad_norm": 9.0625, + "learning_rate": 7.5338497810710684e-06, + "loss": 0.0606, + "step": 20070 + }, + { + "epoch": 0.6424983201612645, + "grad_norm": 0.56640625, + "learning_rate": 7.527113506231055e-06, + "loss": 0.0997, + "step": 20080 + }, + { + "epoch": 0.6428182894442134, + "grad_norm": 31.375, + "learning_rate": 7.520377231391041e-06, + "loss": 0.1517, + "step": 20090 + }, + { + "epoch": 0.6431382587271622, + "grad_norm": 6.1875, + "learning_rate": 7.513640956551028e-06, + "loss": 0.1015, + "step": 20100 + }, + { + "epoch": 0.643458228010111, + "grad_norm": 17.75, + "learning_rate": 7.506904681711015e-06, + "loss": 0.1589, + "step": 20110 + }, + { + "epoch": 0.6437781972930599, + "grad_norm": 1.5859375, + "learning_rate": 7.500168406871001e-06, + "loss": 0.1131, + "step": 20120 + }, + { + "epoch": 0.6440981665760087, + "grad_norm": 0.76171875, + "learning_rate": 7.493432132030988e-06, + "loss": 0.1569, + "step": 20130 + }, + { + "epoch": 0.6444181358589576, + "grad_norm": 9.1875, + "learning_rate": 7.486695857190974e-06, + "loss": 0.0943, + "step": 20140 + }, + { + "epoch": 0.6447381051419063, + "grad_norm": 6.75, + "learning_rate": 7.479959582350961e-06, + "loss": 0.1397, + "step": 20150 + }, + { + "epoch": 0.6450580744248552, + "grad_norm": 12.875, + "learning_rate": 7.473223307510947e-06, + "loss": 0.1906, + "step": 20160 + }, + { + "epoch": 0.645378043707804, + "grad_norm": 8.375, + "learning_rate": 7.466487032670934e-06, + "loss": 0.1165, + "step": 20170 + }, + { + "epoch": 0.6456980129907529, + "grad_norm": 12.125, + "learning_rate": 7.45975075783092e-06, + "loss": 0.033, + "step": 20180 + }, + { + "epoch": 0.6460179822737018, + "grad_norm": 2.078125, + "learning_rate": 7.453014482990907e-06, + "loss": 0.1061, + "step": 20190 + }, + { + "epoch": 0.6463379515566505, + "grad_norm": 11.0625, + "learning_rate": 7.446278208150893e-06, + "loss": 0.1324, + "step": 20200 + }, + { + "epoch": 0.6466579208395994, + "grad_norm": 5.75, + "learning_rate": 7.43954193331088e-06, + "loss": 0.1213, + "step": 20210 + }, + { + "epoch": 0.6469778901225482, + "grad_norm": 28.125, + "learning_rate": 7.432805658470867e-06, + "loss": 0.0962, + "step": 20220 + }, + { + "epoch": 0.6472978594054971, + "grad_norm": 1.515625, + "learning_rate": 7.426069383630853e-06, + "loss": 0.1376, + "step": 20230 + }, + { + "epoch": 0.6476178286884459, + "grad_norm": 11.4375, + "learning_rate": 7.41933310879084e-06, + "loss": 0.1109, + "step": 20240 + }, + { + "epoch": 0.6479377979713947, + "grad_norm": 4.25, + "learning_rate": 7.412596833950826e-06, + "loss": 0.0387, + "step": 20250 + }, + { + "epoch": 0.6482577672543436, + "grad_norm": 10.125, + "learning_rate": 7.4058605591108125e-06, + "loss": 0.0841, + "step": 20260 + }, + { + "epoch": 0.6485777365372924, + "grad_norm": 9.4375, + "learning_rate": 7.3991242842707986e-06, + "loss": 0.0833, + "step": 20270 + }, + { + "epoch": 0.6488977058202413, + "grad_norm": 3.796875, + "learning_rate": 7.3923880094307855e-06, + "loss": 0.0976, + "step": 20280 + }, + { + "epoch": 0.6492176751031901, + "grad_norm": 13.5625, + "learning_rate": 7.3856517345907715e-06, + "loss": 0.1339, + "step": 20290 + }, + { + "epoch": 0.6495376443861389, + "grad_norm": 1.546875, + "learning_rate": 7.378915459750758e-06, + "loss": 0.0921, + "step": 20300 + }, + { + "epoch": 0.6498576136690878, + "grad_norm": 32.5, + "learning_rate": 7.372179184910745e-06, + "loss": 0.1333, + "step": 20310 + }, + { + "epoch": 0.6501775829520366, + "grad_norm": 21.625, + "learning_rate": 7.365442910070731e-06, + "loss": 0.1174, + "step": 20320 + }, + { + "epoch": 0.6504975522349854, + "grad_norm": 26.0, + "learning_rate": 7.358706635230718e-06, + "loss": 0.1552, + "step": 20330 + }, + { + "epoch": 0.6508175215179343, + "grad_norm": 6.84375, + "learning_rate": 7.351970360390704e-06, + "loss": 0.1214, + "step": 20340 + }, + { + "epoch": 0.6511374908008831, + "grad_norm": 1.4453125, + "learning_rate": 7.345234085550691e-06, + "loss": 0.1544, + "step": 20350 + }, + { + "epoch": 0.651457460083832, + "grad_norm": 12.6875, + "learning_rate": 7.338497810710677e-06, + "loss": 0.1477, + "step": 20360 + }, + { + "epoch": 0.6517774293667807, + "grad_norm": 3.75, + "learning_rate": 7.331761535870664e-06, + "loss": 0.046, + "step": 20370 + }, + { + "epoch": 0.6520973986497296, + "grad_norm": 16.875, + "learning_rate": 7.32502526103065e-06, + "loss": 0.1528, + "step": 20380 + }, + { + "epoch": 0.6524173679326785, + "grad_norm": 12.375, + "learning_rate": 7.318288986190637e-06, + "loss": 0.0999, + "step": 20390 + }, + { + "epoch": 0.6527373372156273, + "grad_norm": 8.75, + "learning_rate": 7.311552711350624e-06, + "loss": 0.1011, + "step": 20400 + }, + { + "epoch": 0.6530573064985762, + "grad_norm": 1.5546875, + "learning_rate": 7.30481643651061e-06, + "loss": 0.166, + "step": 20410 + }, + { + "epoch": 0.6533772757815249, + "grad_norm": 11.5, + "learning_rate": 7.298080161670597e-06, + "loss": 0.1344, + "step": 20420 + }, + { + "epoch": 0.6536972450644738, + "grad_norm": 13.5625, + "learning_rate": 7.291343886830583e-06, + "loss": 0.1874, + "step": 20430 + }, + { + "epoch": 0.6540172143474227, + "grad_norm": 9.8125, + "learning_rate": 7.28460761199057e-06, + "loss": 0.0561, + "step": 20440 + }, + { + "epoch": 0.6543371836303715, + "grad_norm": 0.7890625, + "learning_rate": 7.277871337150556e-06, + "loss": 0.1654, + "step": 20450 + }, + { + "epoch": 0.6546571529133203, + "grad_norm": 7.34375, + "learning_rate": 7.271135062310543e-06, + "loss": 0.0674, + "step": 20460 + }, + { + "epoch": 0.6549771221962691, + "grad_norm": 10.0625, + "learning_rate": 7.264398787470529e-06, + "loss": 0.1458, + "step": 20470 + }, + { + "epoch": 0.655297091479218, + "grad_norm": 9.5625, + "learning_rate": 7.257662512630516e-06, + "loss": 0.1359, + "step": 20480 + }, + { + "epoch": 0.6556170607621669, + "grad_norm": 12.5, + "learning_rate": 7.2509262377905025e-06, + "loss": 0.1537, + "step": 20490 + }, + { + "epoch": 0.6559370300451157, + "grad_norm": 21.75, + "learning_rate": 7.2441899629504885e-06, + "loss": 0.1127, + "step": 20500 + }, + { + "epoch": 0.6562569993280645, + "grad_norm": 13.875, + "learning_rate": 7.237453688110475e-06, + "loss": 0.1263, + "step": 20510 + }, + { + "epoch": 0.6565769686110133, + "grad_norm": 20.375, + "learning_rate": 7.2307174132704615e-06, + "loss": 0.0912, + "step": 20520 + }, + { + "epoch": 0.6568969378939622, + "grad_norm": 10.25, + "learning_rate": 7.223981138430448e-06, + "loss": 0.0811, + "step": 20530 + }, + { + "epoch": 0.6572169071769111, + "grad_norm": 8.3125, + "learning_rate": 7.217244863590434e-06, + "loss": 0.209, + "step": 20540 + }, + { + "epoch": 0.6575368764598598, + "grad_norm": 7.96875, + "learning_rate": 7.210508588750421e-06, + "loss": 0.1284, + "step": 20550 + }, + { + "epoch": 0.6578568457428087, + "grad_norm": 0.921875, + "learning_rate": 7.203772313910407e-06, + "loss": 0.0822, + "step": 20560 + }, + { + "epoch": 0.6581768150257575, + "grad_norm": 11.3125, + "learning_rate": 7.197036039070394e-06, + "loss": 0.1542, + "step": 20570 + }, + { + "epoch": 0.6584967843087064, + "grad_norm": 5.46875, + "learning_rate": 7.19029976423038e-06, + "loss": 0.1542, + "step": 20580 + }, + { + "epoch": 0.6588167535916551, + "grad_norm": 11.4375, + "learning_rate": 7.183563489390367e-06, + "loss": 0.0781, + "step": 20590 + }, + { + "epoch": 0.659136722874604, + "grad_norm": 5.1875, + "learning_rate": 7.176827214550354e-06, + "loss": 0.0887, + "step": 20600 + }, + { + "epoch": 0.6594566921575529, + "grad_norm": 1.6171875, + "learning_rate": 7.170090939710341e-06, + "loss": 0.0882, + "step": 20610 + }, + { + "epoch": 0.6597766614405017, + "grad_norm": 10.5, + "learning_rate": 7.163354664870328e-06, + "loss": 0.0617, + "step": 20620 + }, + { + "epoch": 0.6600966307234506, + "grad_norm": 10.0625, + "learning_rate": 7.156618390030315e-06, + "loss": 0.1362, + "step": 20630 + }, + { + "epoch": 0.6604166000063993, + "grad_norm": 8.1875, + "learning_rate": 7.149882115190301e-06, + "loss": 0.0496, + "step": 20640 + }, + { + "epoch": 0.6607365692893482, + "grad_norm": 12.4375, + "learning_rate": 7.143145840350288e-06, + "loss": 0.0986, + "step": 20650 + }, + { + "epoch": 0.6610565385722971, + "grad_norm": 1.3046875, + "learning_rate": 7.136409565510274e-06, + "loss": 0.0777, + "step": 20660 + }, + { + "epoch": 0.6613765078552459, + "grad_norm": 1.734375, + "learning_rate": 7.1296732906702606e-06, + "loss": 0.062, + "step": 20670 + }, + { + "epoch": 0.6616964771381947, + "grad_norm": 10.75, + "learning_rate": 7.122937015830247e-06, + "loss": 0.1215, + "step": 20680 + }, + { + "epoch": 0.6620164464211435, + "grad_norm": 16.75, + "learning_rate": 7.1162007409902335e-06, + "loss": 0.1362, + "step": 20690 + }, + { + "epoch": 0.6623364157040924, + "grad_norm": 15.875, + "learning_rate": 7.1094644661502195e-06, + "loss": 0.1404, + "step": 20700 + }, + { + "epoch": 0.6626563849870413, + "grad_norm": 6.65625, + "learning_rate": 7.102728191310206e-06, + "loss": 0.1633, + "step": 20710 + }, + { + "epoch": 0.6629763542699901, + "grad_norm": 15.0, + "learning_rate": 7.095991916470193e-06, + "loss": 0.0841, + "step": 20720 + }, + { + "epoch": 0.6632963235529389, + "grad_norm": 13.6875, + "learning_rate": 7.089255641630179e-06, + "loss": 0.1324, + "step": 20730 + }, + { + "epoch": 0.6636162928358877, + "grad_norm": 7.5625, + "learning_rate": 7.082519366790166e-06, + "loss": 0.1023, + "step": 20740 + }, + { + "epoch": 0.6639362621188366, + "grad_norm": 11.875, + "learning_rate": 7.075783091950152e-06, + "loss": 0.0865, + "step": 20750 + }, + { + "epoch": 0.6642562314017855, + "grad_norm": 19.125, + "learning_rate": 7.069046817110139e-06, + "loss": 0.1169, + "step": 20760 + }, + { + "epoch": 0.6645762006847342, + "grad_norm": 11.5, + "learning_rate": 7.062310542270125e-06, + "loss": 0.1276, + "step": 20770 + }, + { + "epoch": 0.6648961699676831, + "grad_norm": 23.5, + "learning_rate": 7.055574267430112e-06, + "loss": 0.0881, + "step": 20780 + }, + { + "epoch": 0.6652161392506319, + "grad_norm": 0.9140625, + "learning_rate": 7.048837992590098e-06, + "loss": 0.1109, + "step": 20790 + }, + { + "epoch": 0.6655361085335808, + "grad_norm": 26.5, + "learning_rate": 7.042101717750085e-06, + "loss": 0.1002, + "step": 20800 + }, + { + "epoch": 0.6658560778165297, + "grad_norm": 12.75, + "learning_rate": 7.035365442910072e-06, + "loss": 0.1059, + "step": 20810 + }, + { + "epoch": 0.6661760470994784, + "grad_norm": 4.625, + "learning_rate": 7.028629168070058e-06, + "loss": 0.0575, + "step": 20820 + }, + { + "epoch": 0.6664960163824273, + "grad_norm": 13.6875, + "learning_rate": 7.021892893230045e-06, + "loss": 0.0989, + "step": 20830 + }, + { + "epoch": 0.6668159856653761, + "grad_norm": 9.0625, + "learning_rate": 7.015156618390031e-06, + "loss": 0.0852, + "step": 20840 + }, + { + "epoch": 0.667135954948325, + "grad_norm": 8.4375, + "learning_rate": 7.008420343550018e-06, + "loss": 0.117, + "step": 20850 + }, + { + "epoch": 0.6674559242312738, + "grad_norm": 5.28125, + "learning_rate": 7.001684068710004e-06, + "loss": 0.0628, + "step": 20860 + }, + { + "epoch": 0.6677758935142226, + "grad_norm": 3.625, + "learning_rate": 6.994947793869991e-06, + "loss": 0.1088, + "step": 20870 + }, + { + "epoch": 0.6680958627971715, + "grad_norm": 5.03125, + "learning_rate": 6.988211519029977e-06, + "loss": 0.0809, + "step": 20880 + }, + { + "epoch": 0.6684158320801203, + "grad_norm": 0.3203125, + "learning_rate": 6.981475244189964e-06, + "loss": 0.1406, + "step": 20890 + }, + { + "epoch": 0.6687358013630691, + "grad_norm": 4.53125, + "learning_rate": 6.9747389693499505e-06, + "loss": 0.095, + "step": 20900 + }, + { + "epoch": 0.669055770646018, + "grad_norm": 11.5, + "learning_rate": 6.9680026945099366e-06, + "loss": 0.162, + "step": 20910 + }, + { + "epoch": 0.6693757399289668, + "grad_norm": 11.75, + "learning_rate": 6.9612664196699234e-06, + "loss": 0.1381, + "step": 20920 + }, + { + "epoch": 0.6696957092119157, + "grad_norm": 6.46875, + "learning_rate": 6.9545301448299095e-06, + "loss": 0.1148, + "step": 20930 + }, + { + "epoch": 0.6700156784948645, + "grad_norm": 6.3125, + "learning_rate": 6.947793869989896e-06, + "loss": 0.1229, + "step": 20940 + }, + { + "epoch": 0.6703356477778133, + "grad_norm": 9.9375, + "learning_rate": 6.941057595149882e-06, + "loss": 0.0953, + "step": 20950 + }, + { + "epoch": 0.6706556170607622, + "grad_norm": 6.75, + "learning_rate": 6.934321320309869e-06, + "loss": 0.1972, + "step": 20960 + }, + { + "epoch": 0.670975586343711, + "grad_norm": 24.5, + "learning_rate": 6.927585045469855e-06, + "loss": 0.109, + "step": 20970 + }, + { + "epoch": 0.6712955556266599, + "grad_norm": 2.953125, + "learning_rate": 6.920848770629842e-06, + "loss": 0.0661, + "step": 20980 + }, + { + "epoch": 0.6716155249096086, + "grad_norm": 10.0, + "learning_rate": 6.914112495789828e-06, + "loss": 0.106, + "step": 20990 + }, + { + "epoch": 0.6719354941925575, + "grad_norm": 12.25, + "learning_rate": 6.907376220949815e-06, + "loss": 0.1246, + "step": 21000 + }, + { + "epoch": 0.6722554634755064, + "grad_norm": 15.4375, + "learning_rate": 6.900639946109802e-06, + "loss": 0.0964, + "step": 21010 + }, + { + "epoch": 0.6725754327584552, + "grad_norm": 2.546875, + "learning_rate": 6.893903671269788e-06, + "loss": 0.0711, + "step": 21020 + }, + { + "epoch": 0.6728954020414041, + "grad_norm": 16.375, + "learning_rate": 6.887167396429775e-06, + "loss": 0.1346, + "step": 21030 + }, + { + "epoch": 0.6732153713243528, + "grad_norm": 1.359375, + "learning_rate": 6.880431121589761e-06, + "loss": 0.1528, + "step": 21040 + }, + { + "epoch": 0.6735353406073017, + "grad_norm": 10.0625, + "learning_rate": 6.873694846749748e-06, + "loss": 0.1028, + "step": 21050 + }, + { + "epoch": 0.6738553098902506, + "grad_norm": 6.75, + "learning_rate": 6.866958571909734e-06, + "loss": 0.1368, + "step": 21060 + }, + { + "epoch": 0.6741752791731994, + "grad_norm": 2.5, + "learning_rate": 6.860222297069721e-06, + "loss": 0.1235, + "step": 21070 + }, + { + "epoch": 0.6744952484561482, + "grad_norm": 8.875, + "learning_rate": 6.853486022229707e-06, + "loss": 0.0645, + "step": 21080 + }, + { + "epoch": 0.674815217739097, + "grad_norm": 19.0, + "learning_rate": 6.846749747389694e-06, + "loss": 0.1734, + "step": 21090 + }, + { + "epoch": 0.6751351870220459, + "grad_norm": 1.1953125, + "learning_rate": 6.840013472549681e-06, + "loss": 0.1741, + "step": 21100 + }, + { + "epoch": 0.6754551563049948, + "grad_norm": 1.578125, + "learning_rate": 6.833277197709667e-06, + "loss": 0.1117, + "step": 21110 + }, + { + "epoch": 0.6757751255879435, + "grad_norm": 9.0625, + "learning_rate": 6.826540922869654e-06, + "loss": 0.0723, + "step": 21120 + }, + { + "epoch": 0.6760950948708924, + "grad_norm": 9.25, + "learning_rate": 6.81980464802964e-06, + "loss": 0.1361, + "step": 21130 + }, + { + "epoch": 0.6764150641538412, + "grad_norm": 1.9921875, + "learning_rate": 6.8130683731896265e-06, + "loss": 0.1702, + "step": 21140 + }, + { + "epoch": 0.6767350334367901, + "grad_norm": 11.0, + "learning_rate": 6.8063320983496126e-06, + "loss": 0.1383, + "step": 21150 + }, + { + "epoch": 0.677055002719739, + "grad_norm": 9.1875, + "learning_rate": 6.7995958235095994e-06, + "loss": 0.164, + "step": 21160 + }, + { + "epoch": 0.6773749720026877, + "grad_norm": 1.921875, + "learning_rate": 6.7928595486695855e-06, + "loss": 0.0882, + "step": 21170 + }, + { + "epoch": 0.6776949412856366, + "grad_norm": 0.6484375, + "learning_rate": 6.786123273829572e-06, + "loss": 0.0731, + "step": 21180 + }, + { + "epoch": 0.6780149105685854, + "grad_norm": 5.40625, + "learning_rate": 6.779386998989559e-06, + "loss": 0.1255, + "step": 21190 + }, + { + "epoch": 0.6783348798515343, + "grad_norm": 12.875, + "learning_rate": 6.772650724149545e-06, + "loss": 0.1471, + "step": 21200 + }, + { + "epoch": 0.678654849134483, + "grad_norm": 7.25, + "learning_rate": 6.765914449309532e-06, + "loss": 0.109, + "step": 21210 + }, + { + "epoch": 0.6789748184174319, + "grad_norm": 46.75, + "learning_rate": 6.759178174469518e-06, + "loss": 0.0797, + "step": 21220 + }, + { + "epoch": 0.6792947877003808, + "grad_norm": 20.875, + "learning_rate": 6.752441899629505e-06, + "loss": 0.1738, + "step": 21230 + }, + { + "epoch": 0.6796147569833296, + "grad_norm": 9.0625, + "learning_rate": 6.745705624789491e-06, + "loss": 0.0804, + "step": 21240 + }, + { + "epoch": 0.6799347262662785, + "grad_norm": 8.0625, + "learning_rate": 6.738969349949479e-06, + "loss": 0.1467, + "step": 21250 + }, + { + "epoch": 0.6802546955492272, + "grad_norm": 15.4375, + "learning_rate": 6.732233075109466e-06, + "loss": 0.1002, + "step": 21260 + }, + { + "epoch": 0.6805746648321761, + "grad_norm": 11.375, + "learning_rate": 6.725496800269452e-06, + "loss": 0.0615, + "step": 21270 + }, + { + "epoch": 0.680894634115125, + "grad_norm": 14.25, + "learning_rate": 6.718760525429439e-06, + "loss": 0.0938, + "step": 21280 + }, + { + "epoch": 0.6812146033980738, + "grad_norm": 4.65625, + "learning_rate": 6.712024250589425e-06, + "loss": 0.1235, + "step": 21290 + }, + { + "epoch": 0.6815345726810226, + "grad_norm": 5.6875, + "learning_rate": 6.705287975749412e-06, + "loss": 0.1374, + "step": 21300 + }, + { + "epoch": 0.6818545419639714, + "grad_norm": 8.0625, + "learning_rate": 6.698551700909398e-06, + "loss": 0.1177, + "step": 21310 + }, + { + "epoch": 0.6821745112469203, + "grad_norm": 15.4375, + "learning_rate": 6.691815426069385e-06, + "loss": 0.1472, + "step": 21320 + }, + { + "epoch": 0.6824944805298692, + "grad_norm": 27.625, + "learning_rate": 6.6850791512293715e-06, + "loss": 0.1317, + "step": 21330 + }, + { + "epoch": 0.6828144498128179, + "grad_norm": 10.5625, + "learning_rate": 6.6783428763893575e-06, + "loss": 0.0762, + "step": 21340 + }, + { + "epoch": 0.6831344190957668, + "grad_norm": 12.4375, + "learning_rate": 6.671606601549344e-06, + "loss": 0.0873, + "step": 21350 + }, + { + "epoch": 0.6834543883787156, + "grad_norm": 11.8125, + "learning_rate": 6.6648703267093304e-06, + "loss": 0.1275, + "step": 21360 + }, + { + "epoch": 0.6837743576616645, + "grad_norm": 11.75, + "learning_rate": 6.658134051869317e-06, + "loss": 0.1848, + "step": 21370 + }, + { + "epoch": 0.6840943269446134, + "grad_norm": 21.375, + "learning_rate": 6.651397777029303e-06, + "loss": 0.1494, + "step": 21380 + }, + { + "epoch": 0.6844142962275621, + "grad_norm": 8.625, + "learning_rate": 6.64466150218929e-06, + "loss": 0.1111, + "step": 21390 + }, + { + "epoch": 0.684734265510511, + "grad_norm": 1.34375, + "learning_rate": 6.637925227349276e-06, + "loss": 0.1096, + "step": 21400 + }, + { + "epoch": 0.6850542347934598, + "grad_norm": 0.51171875, + "learning_rate": 6.631188952509263e-06, + "loss": 0.1192, + "step": 21410 + }, + { + "epoch": 0.6853742040764087, + "grad_norm": 8.0625, + "learning_rate": 6.62445267766925e-06, + "loss": 0.0696, + "step": 21420 + }, + { + "epoch": 0.6856941733593575, + "grad_norm": 19.0, + "learning_rate": 6.617716402829236e-06, + "loss": 0.1002, + "step": 21430 + }, + { + "epoch": 0.6860141426423063, + "grad_norm": 1.34375, + "learning_rate": 6.610980127989223e-06, + "loss": 0.0872, + "step": 21440 + }, + { + "epoch": 0.6863341119252552, + "grad_norm": 11.1875, + "learning_rate": 6.604243853149209e-06, + "loss": 0.0812, + "step": 21450 + }, + { + "epoch": 0.686654081208204, + "grad_norm": 1.6015625, + "learning_rate": 6.597507578309196e-06, + "loss": 0.094, + "step": 21460 + }, + { + "epoch": 0.6869740504911529, + "grad_norm": 15.625, + "learning_rate": 6.590771303469182e-06, + "loss": 0.1171, + "step": 21470 + }, + { + "epoch": 0.6872940197741016, + "grad_norm": 6.28125, + "learning_rate": 6.584035028629169e-06, + "loss": 0.0997, + "step": 21480 + }, + { + "epoch": 0.6876139890570505, + "grad_norm": 12.9375, + "learning_rate": 6.577298753789155e-06, + "loss": 0.0879, + "step": 21490 + }, + { + "epoch": 0.6879339583399994, + "grad_norm": 2.5, + "learning_rate": 6.570562478949142e-06, + "loss": 0.1138, + "step": 21500 + }, + { + "epoch": 0.6882539276229482, + "grad_norm": 7.6875, + "learning_rate": 6.563826204109129e-06, + "loss": 0.0852, + "step": 21510 + }, + { + "epoch": 0.688573896905897, + "grad_norm": 19.0, + "learning_rate": 6.557089929269115e-06, + "loss": 0.1887, + "step": 21520 + }, + { + "epoch": 0.6888938661888458, + "grad_norm": 14.0625, + "learning_rate": 6.550353654429102e-06, + "loss": 0.092, + "step": 21530 + }, + { + "epoch": 0.6892138354717947, + "grad_norm": 19.625, + "learning_rate": 6.543617379589088e-06, + "loss": 0.1611, + "step": 21540 + }, + { + "epoch": 0.6895338047547436, + "grad_norm": 26.25, + "learning_rate": 6.5368811047490745e-06, + "loss": 0.131, + "step": 21550 + }, + { + "epoch": 0.6898537740376924, + "grad_norm": 3.203125, + "learning_rate": 6.530144829909061e-06, + "loss": 0.0562, + "step": 21560 + }, + { + "epoch": 0.6901737433206412, + "grad_norm": 3.234375, + "learning_rate": 6.5234085550690475e-06, + "loss": 0.1075, + "step": 21570 + }, + { + "epoch": 0.69049371260359, + "grad_norm": 7.09375, + "learning_rate": 6.5166722802290335e-06, + "loss": 0.1253, + "step": 21580 + }, + { + "epoch": 0.6908136818865389, + "grad_norm": 22.875, + "learning_rate": 6.50993600538902e-06, + "loss": 0.107, + "step": 21590 + }, + { + "epoch": 0.6911336511694878, + "grad_norm": 15.625, + "learning_rate": 6.503199730549007e-06, + "loss": 0.1003, + "step": 21600 + }, + { + "epoch": 0.6914536204524365, + "grad_norm": 8.0625, + "learning_rate": 6.496463455708993e-06, + "loss": 0.1314, + "step": 21610 + }, + { + "epoch": 0.6917735897353854, + "grad_norm": 5.625, + "learning_rate": 6.48972718086898e-06, + "loss": 0.1194, + "step": 21620 + }, + { + "epoch": 0.6920935590183342, + "grad_norm": 6.375, + "learning_rate": 6.482990906028966e-06, + "loss": 0.1135, + "step": 21630 + }, + { + "epoch": 0.6924135283012831, + "grad_norm": 0.482421875, + "learning_rate": 6.476254631188953e-06, + "loss": 0.0806, + "step": 21640 + }, + { + "epoch": 0.6927334975842319, + "grad_norm": 22.375, + "learning_rate": 6.469518356348939e-06, + "loss": 0.1119, + "step": 21650 + }, + { + "epoch": 0.6930534668671807, + "grad_norm": 17.0, + "learning_rate": 6.462782081508926e-06, + "loss": 0.0831, + "step": 21660 + }, + { + "epoch": 0.6933734361501296, + "grad_norm": 14.5625, + "learning_rate": 6.456045806668912e-06, + "loss": 0.1121, + "step": 21670 + }, + { + "epoch": 0.6936934054330784, + "grad_norm": 12.375, + "learning_rate": 6.449309531828899e-06, + "loss": 0.0832, + "step": 21680 + }, + { + "epoch": 0.6940133747160273, + "grad_norm": 18.5, + "learning_rate": 6.442573256988885e-06, + "loss": 0.1105, + "step": 21690 + }, + { + "epoch": 0.6943333439989761, + "grad_norm": 11.8125, + "learning_rate": 6.435836982148872e-06, + "loss": 0.1359, + "step": 21700 + }, + { + "epoch": 0.6946533132819249, + "grad_norm": 13.5, + "learning_rate": 6.429100707308859e-06, + "loss": 0.1036, + "step": 21710 + }, + { + "epoch": 0.6949732825648738, + "grad_norm": 22.625, + "learning_rate": 6.422364432468845e-06, + "loss": 0.1059, + "step": 21720 + }, + { + "epoch": 0.6952932518478226, + "grad_norm": 4.9375, + "learning_rate": 6.415628157628832e-06, + "loss": 0.0756, + "step": 21730 + }, + { + "epoch": 0.6956132211307714, + "grad_norm": 23.625, + "learning_rate": 6.408891882788818e-06, + "loss": 0.0888, + "step": 21740 + }, + { + "epoch": 0.6959331904137203, + "grad_norm": 2.65625, + "learning_rate": 6.402155607948805e-06, + "loss": 0.0691, + "step": 21750 + }, + { + "epoch": 0.6962531596966691, + "grad_norm": 19.25, + "learning_rate": 6.395419333108791e-06, + "loss": 0.1483, + "step": 21760 + }, + { + "epoch": 0.696573128979618, + "grad_norm": 3.625, + "learning_rate": 6.388683058268778e-06, + "loss": 0.1069, + "step": 21770 + }, + { + "epoch": 0.6968930982625668, + "grad_norm": 12.4375, + "learning_rate": 6.381946783428764e-06, + "loss": 0.0811, + "step": 21780 + }, + { + "epoch": 0.6972130675455156, + "grad_norm": 0.73828125, + "learning_rate": 6.3752105085887505e-06, + "loss": 0.1532, + "step": 21790 + }, + { + "epoch": 0.6975330368284645, + "grad_norm": 9.625, + "learning_rate": 6.3684742337487374e-06, + "loss": 0.1015, + "step": 21800 + }, + { + "epoch": 0.6978530061114133, + "grad_norm": 9.125, + "learning_rate": 6.3617379589087235e-06, + "loss": 0.1437, + "step": 21810 + }, + { + "epoch": 0.6981729753943622, + "grad_norm": 6.3125, + "learning_rate": 6.35500168406871e-06, + "loss": 0.0685, + "step": 21820 + }, + { + "epoch": 0.6984929446773109, + "grad_norm": 10.875, + "learning_rate": 6.348265409228696e-06, + "loss": 0.1084, + "step": 21830 + }, + { + "epoch": 0.6988129139602598, + "grad_norm": 0.7578125, + "learning_rate": 6.341529134388683e-06, + "loss": 0.1048, + "step": 21840 + }, + { + "epoch": 0.6991328832432087, + "grad_norm": 5.84375, + "learning_rate": 6.334792859548669e-06, + "loss": 0.1149, + "step": 21850 + }, + { + "epoch": 0.6994528525261575, + "grad_norm": 1.875, + "learning_rate": 6.328056584708656e-06, + "loss": 0.1618, + "step": 21860 + }, + { + "epoch": 0.6997728218091063, + "grad_norm": 13.125, + "learning_rate": 6.321320309868642e-06, + "loss": 0.1054, + "step": 21870 + }, + { + "epoch": 0.7000927910920551, + "grad_norm": 5.78125, + "learning_rate": 6.314584035028629e-06, + "loss": 0.1341, + "step": 21880 + }, + { + "epoch": 0.700412760375004, + "grad_norm": 8.8125, + "learning_rate": 6.307847760188616e-06, + "loss": 0.0701, + "step": 21890 + }, + { + "epoch": 0.7007327296579529, + "grad_norm": 20.625, + "learning_rate": 6.301111485348603e-06, + "loss": 0.1398, + "step": 21900 + }, + { + "epoch": 0.7010526989409017, + "grad_norm": 17.875, + "learning_rate": 6.29437521050859e-06, + "loss": 0.1051, + "step": 21910 + }, + { + "epoch": 0.7013726682238505, + "grad_norm": 11.5, + "learning_rate": 6.287638935668577e-06, + "loss": 0.1059, + "step": 21920 + }, + { + "epoch": 0.7016926375067993, + "grad_norm": 42.75, + "learning_rate": 6.280902660828563e-06, + "loss": 0.0976, + "step": 21930 + }, + { + "epoch": 0.7020126067897482, + "grad_norm": 2.6875, + "learning_rate": 6.27416638598855e-06, + "loss": 0.1205, + "step": 21940 + }, + { + "epoch": 0.7023325760726971, + "grad_norm": 0.455078125, + "learning_rate": 6.267430111148536e-06, + "loss": 0.082, + "step": 21950 + }, + { + "epoch": 0.7026525453556458, + "grad_norm": 12.6875, + "learning_rate": 6.2606938363085226e-06, + "loss": 0.1083, + "step": 21960 + }, + { + "epoch": 0.7029725146385947, + "grad_norm": 11.1875, + "learning_rate": 6.253957561468509e-06, + "loss": 0.0909, + "step": 21970 + }, + { + "epoch": 0.7032924839215435, + "grad_norm": 5.46875, + "learning_rate": 6.2472212866284955e-06, + "loss": 0.0966, + "step": 21980 + }, + { + "epoch": 0.7036124532044924, + "grad_norm": 11.75, + "learning_rate": 6.2404850117884815e-06, + "loss": 0.1157, + "step": 21990 + }, + { + "epoch": 0.7039324224874413, + "grad_norm": 21.625, + "learning_rate": 6.2337487369484684e-06, + "loss": 0.1797, + "step": 22000 + }, + { + "epoch": 0.70425239177039, + "grad_norm": 11.1875, + "learning_rate": 6.227012462108455e-06, + "loss": 0.1077, + "step": 22010 + }, + { + "epoch": 0.7045723610533389, + "grad_norm": 12.4375, + "learning_rate": 6.220276187268441e-06, + "loss": 0.1404, + "step": 22020 + }, + { + "epoch": 0.7048923303362877, + "grad_norm": 1.4375, + "learning_rate": 6.213539912428428e-06, + "loss": 0.0437, + "step": 22030 + }, + { + "epoch": 0.7052122996192366, + "grad_norm": 8.5625, + "learning_rate": 6.206803637588414e-06, + "loss": 0.1201, + "step": 22040 + }, + { + "epoch": 0.7055322689021853, + "grad_norm": 9.4375, + "learning_rate": 6.200067362748401e-06, + "loss": 0.131, + "step": 22050 + }, + { + "epoch": 0.7058522381851342, + "grad_norm": 6.96875, + "learning_rate": 6.193331087908387e-06, + "loss": 0.1306, + "step": 22060 + }, + { + "epoch": 0.7061722074680831, + "grad_norm": 16.75, + "learning_rate": 6.186594813068374e-06, + "loss": 0.2234, + "step": 22070 + }, + { + "epoch": 0.7064921767510319, + "grad_norm": 25.75, + "learning_rate": 6.17985853822836e-06, + "loss": 0.1111, + "step": 22080 + }, + { + "epoch": 0.7068121460339808, + "grad_norm": 14.9375, + "learning_rate": 6.173122263388347e-06, + "loss": 0.0629, + "step": 22090 + }, + { + "epoch": 0.7071321153169295, + "grad_norm": 13.1875, + "learning_rate": 6.166385988548333e-06, + "loss": 0.0609, + "step": 22100 + }, + { + "epoch": 0.7074520845998784, + "grad_norm": 11.875, + "learning_rate": 6.15964971370832e-06, + "loss": 0.217, + "step": 22110 + }, + { + "epoch": 0.7077720538828273, + "grad_norm": 14.0625, + "learning_rate": 6.152913438868307e-06, + "loss": 0.0941, + "step": 22120 + }, + { + "epoch": 0.7080920231657761, + "grad_norm": 3.53125, + "learning_rate": 6.146177164028293e-06, + "loss": 0.1563, + "step": 22130 + }, + { + "epoch": 0.7084119924487249, + "grad_norm": 5.28125, + "learning_rate": 6.13944088918828e-06, + "loss": 0.1234, + "step": 22140 + }, + { + "epoch": 0.7087319617316737, + "grad_norm": 15.3125, + "learning_rate": 6.132704614348266e-06, + "loss": 0.1205, + "step": 22150 + }, + { + "epoch": 0.7090519310146226, + "grad_norm": 1.4921875, + "learning_rate": 6.125968339508253e-06, + "loss": 0.068, + "step": 22160 + }, + { + "epoch": 0.7093719002975715, + "grad_norm": 25.875, + "learning_rate": 6.119232064668239e-06, + "loss": 0.1535, + "step": 22170 + }, + { + "epoch": 0.7096918695805202, + "grad_norm": 14.0625, + "learning_rate": 6.112495789828226e-06, + "loss": 0.1171, + "step": 22180 + }, + { + "epoch": 0.7100118388634691, + "grad_norm": 13.9375, + "learning_rate": 6.105759514988212e-06, + "loss": 0.0756, + "step": 22190 + }, + { + "epoch": 0.7103318081464179, + "grad_norm": 7.96875, + "learning_rate": 6.0990232401481986e-06, + "loss": 0.0698, + "step": 22200 + }, + { + "epoch": 0.7106517774293668, + "grad_norm": 26.75, + "learning_rate": 6.0922869653081855e-06, + "loss": 0.1055, + "step": 22210 + }, + { + "epoch": 0.7109717467123157, + "grad_norm": 14.3125, + "learning_rate": 6.0855506904681715e-06, + "loss": 0.0668, + "step": 22220 + }, + { + "epoch": 0.7112917159952644, + "grad_norm": 25.625, + "learning_rate": 6.078814415628158e-06, + "loss": 0.1934, + "step": 22230 + }, + { + "epoch": 0.7116116852782133, + "grad_norm": 12.1875, + "learning_rate": 6.0720781407881444e-06, + "loss": 0.1518, + "step": 22240 + }, + { + "epoch": 0.7119316545611621, + "grad_norm": 0.84765625, + "learning_rate": 6.065341865948131e-06, + "loss": 0.0687, + "step": 22250 + }, + { + "epoch": 0.712251623844111, + "grad_norm": 21.125, + "learning_rate": 6.058605591108117e-06, + "loss": 0.1328, + "step": 22260 + }, + { + "epoch": 0.7125715931270598, + "grad_norm": 0.90625, + "learning_rate": 6.051869316268104e-06, + "loss": 0.1137, + "step": 22270 + }, + { + "epoch": 0.7128915624100086, + "grad_norm": 5.34375, + "learning_rate": 6.04513304142809e-06, + "loss": 0.0989, + "step": 22280 + }, + { + "epoch": 0.7132115316929575, + "grad_norm": 4.25, + "learning_rate": 6.038396766588077e-06, + "loss": 0.0996, + "step": 22290 + }, + { + "epoch": 0.7135315009759063, + "grad_norm": 2.15625, + "learning_rate": 6.031660491748064e-06, + "loss": 0.1374, + "step": 22300 + }, + { + "epoch": 0.7138514702588552, + "grad_norm": 0.703125, + "learning_rate": 6.02492421690805e-06, + "loss": 0.054, + "step": 22310 + }, + { + "epoch": 0.714171439541804, + "grad_norm": 8.1875, + "learning_rate": 6.018187942068037e-06, + "loss": 0.0791, + "step": 22320 + }, + { + "epoch": 0.7144914088247528, + "grad_norm": 12.75, + "learning_rate": 6.011451667228023e-06, + "loss": 0.1519, + "step": 22330 + }, + { + "epoch": 0.7148113781077017, + "grad_norm": 8.0625, + "learning_rate": 6.00471539238801e-06, + "loss": 0.1205, + "step": 22340 + }, + { + "epoch": 0.7151313473906505, + "grad_norm": 11.0, + "learning_rate": 5.997979117547996e-06, + "loss": 0.1396, + "step": 22350 + }, + { + "epoch": 0.7154513166735993, + "grad_norm": 10.375, + "learning_rate": 5.991242842707983e-06, + "loss": 0.1234, + "step": 22360 + }, + { + "epoch": 0.7157712859565482, + "grad_norm": 3.921875, + "learning_rate": 5.984506567867969e-06, + "loss": 0.0351, + "step": 22370 + }, + { + "epoch": 0.716091255239497, + "grad_norm": 6.375, + "learning_rate": 5.977770293027956e-06, + "loss": 0.0943, + "step": 22380 + }, + { + "epoch": 0.7164112245224459, + "grad_norm": 11.6875, + "learning_rate": 5.971034018187943e-06, + "loss": 0.1087, + "step": 22390 + }, + { + "epoch": 0.7167311938053946, + "grad_norm": 9.8125, + "learning_rate": 5.964297743347929e-06, + "loss": 0.079, + "step": 22400 + }, + { + "epoch": 0.7170511630883435, + "grad_norm": 3.6875, + "learning_rate": 5.957561468507916e-06, + "loss": 0.0985, + "step": 22410 + }, + { + "epoch": 0.7173711323712924, + "grad_norm": 13.8125, + "learning_rate": 5.950825193667902e-06, + "loss": 0.0748, + "step": 22420 + }, + { + "epoch": 0.7176911016542412, + "grad_norm": 27.875, + "learning_rate": 5.9440889188278885e-06, + "loss": 0.1493, + "step": 22430 + }, + { + "epoch": 0.7180110709371901, + "grad_norm": 4.09375, + "learning_rate": 5.9373526439878746e-06, + "loss": 0.0855, + "step": 22440 + }, + { + "epoch": 0.7183310402201388, + "grad_norm": 12.9375, + "learning_rate": 5.9306163691478615e-06, + "loss": 0.1036, + "step": 22450 + }, + { + "epoch": 0.7186510095030877, + "grad_norm": 21.375, + "learning_rate": 5.9238800943078475e-06, + "loss": 0.1174, + "step": 22460 + }, + { + "epoch": 0.7189709787860366, + "grad_norm": 9.6875, + "learning_rate": 5.917143819467834e-06, + "loss": 0.0934, + "step": 22470 + }, + { + "epoch": 0.7192909480689854, + "grad_norm": 6.4375, + "learning_rate": 5.9104075446278204e-06, + "loss": 0.0783, + "step": 22480 + }, + { + "epoch": 0.7196109173519342, + "grad_norm": 2.0, + "learning_rate": 5.903671269787807e-06, + "loss": 0.1512, + "step": 22490 + }, + { + "epoch": 0.719930886634883, + "grad_norm": 21.125, + "learning_rate": 5.896934994947794e-06, + "loss": 0.0826, + "step": 22500 + }, + { + "epoch": 0.7202508559178319, + "grad_norm": 16.375, + "learning_rate": 5.89019872010778e-06, + "loss": 0.1183, + "step": 22510 + }, + { + "epoch": 0.7205708252007808, + "grad_norm": 9.8125, + "learning_rate": 5.883462445267767e-06, + "loss": 0.0709, + "step": 22520 + }, + { + "epoch": 0.7208907944837296, + "grad_norm": 12.875, + "learning_rate": 5.876726170427753e-06, + "loss": 0.1073, + "step": 22530 + }, + { + "epoch": 0.7212107637666784, + "grad_norm": 0.9140625, + "learning_rate": 5.86998989558774e-06, + "loss": 0.1094, + "step": 22540 + }, + { + "epoch": 0.7215307330496272, + "grad_norm": 13.625, + "learning_rate": 5.863253620747728e-06, + "loss": 0.0847, + "step": 22550 + }, + { + "epoch": 0.7218507023325761, + "grad_norm": 2.640625, + "learning_rate": 5.856517345907714e-06, + "loss": 0.1061, + "step": 22560 + }, + { + "epoch": 0.722170671615525, + "grad_norm": 7.46875, + "learning_rate": 5.849781071067701e-06, + "loss": 0.1372, + "step": 22570 + }, + { + "epoch": 0.7224906408984737, + "grad_norm": 12.5, + "learning_rate": 5.843044796227687e-06, + "loss": 0.1453, + "step": 22580 + }, + { + "epoch": 0.7228106101814226, + "grad_norm": 1.109375, + "learning_rate": 5.836308521387674e-06, + "loss": 0.07, + "step": 22590 + }, + { + "epoch": 0.7231305794643714, + "grad_norm": 16.625, + "learning_rate": 5.82957224654766e-06, + "loss": 0.1414, + "step": 22600 + }, + { + "epoch": 0.7234505487473203, + "grad_norm": 0.90234375, + "learning_rate": 5.822835971707647e-06, + "loss": 0.1263, + "step": 22610 + }, + { + "epoch": 0.723770518030269, + "grad_norm": 15.125, + "learning_rate": 5.8160996968676335e-06, + "loss": 0.191, + "step": 22620 + }, + { + "epoch": 0.7240904873132179, + "grad_norm": 0.7265625, + "learning_rate": 5.8093634220276195e-06, + "loss": 0.1746, + "step": 22630 + }, + { + "epoch": 0.7244104565961668, + "grad_norm": 5.40625, + "learning_rate": 5.802627147187606e-06, + "loss": 0.0994, + "step": 22640 + }, + { + "epoch": 0.7247304258791156, + "grad_norm": 8.6875, + "learning_rate": 5.7958908723475925e-06, + "loss": 0.1345, + "step": 22650 + }, + { + "epoch": 0.7250503951620645, + "grad_norm": 11.375, + "learning_rate": 5.789154597507579e-06, + "loss": 0.1087, + "step": 22660 + }, + { + "epoch": 0.7253703644450132, + "grad_norm": 9.0625, + "learning_rate": 5.782418322667565e-06, + "loss": 0.1405, + "step": 22670 + }, + { + "epoch": 0.7256903337279621, + "grad_norm": 0.578125, + "learning_rate": 5.775682047827552e-06, + "loss": 0.1551, + "step": 22680 + }, + { + "epoch": 0.726010303010911, + "grad_norm": 36.5, + "learning_rate": 5.768945772987538e-06, + "loss": 0.1213, + "step": 22690 + }, + { + "epoch": 0.7263302722938598, + "grad_norm": 10.8125, + "learning_rate": 5.762209498147525e-06, + "loss": 0.1649, + "step": 22700 + }, + { + "epoch": 0.7266502415768086, + "grad_norm": 14.875, + "learning_rate": 5.755473223307512e-06, + "loss": 0.0815, + "step": 22710 + }, + { + "epoch": 0.7269702108597574, + "grad_norm": 2.28125, + "learning_rate": 5.748736948467498e-06, + "loss": 0.0946, + "step": 22720 + }, + { + "epoch": 0.7272901801427063, + "grad_norm": 8.375, + "learning_rate": 5.742000673627485e-06, + "loss": 0.0957, + "step": 22730 + }, + { + "epoch": 0.7276101494256552, + "grad_norm": 23.875, + "learning_rate": 5.735264398787471e-06, + "loss": 0.0901, + "step": 22740 + }, + { + "epoch": 0.727930118708604, + "grad_norm": 5.78125, + "learning_rate": 5.728528123947458e-06, + "loss": 0.1153, + "step": 22750 + }, + { + "epoch": 0.7282500879915528, + "grad_norm": 18.375, + "learning_rate": 5.721791849107444e-06, + "loss": 0.153, + "step": 22760 + }, + { + "epoch": 0.7285700572745016, + "grad_norm": 6.0, + "learning_rate": 5.715055574267431e-06, + "loss": 0.1297, + "step": 22770 + }, + { + "epoch": 0.7288900265574505, + "grad_norm": 29.375, + "learning_rate": 5.708319299427417e-06, + "loss": 0.083, + "step": 22780 + }, + { + "epoch": 0.7292099958403994, + "grad_norm": 2.171875, + "learning_rate": 5.701583024587404e-06, + "loss": 0.0606, + "step": 22790 + }, + { + "epoch": 0.7295299651233481, + "grad_norm": 10.875, + "learning_rate": 5.69484674974739e-06, + "loss": 0.1635, + "step": 22800 + }, + { + "epoch": 0.729849934406297, + "grad_norm": 3.328125, + "learning_rate": 5.688110474907377e-06, + "loss": 0.0733, + "step": 22810 + }, + { + "epoch": 0.7301699036892458, + "grad_norm": 1.1875, + "learning_rate": 5.681374200067364e-06, + "loss": 0.0844, + "step": 22820 + }, + { + "epoch": 0.7304898729721947, + "grad_norm": 10.125, + "learning_rate": 5.67463792522735e-06, + "loss": 0.1162, + "step": 22830 + }, + { + "epoch": 0.7308098422551436, + "grad_norm": 5.59375, + "learning_rate": 5.6679016503873366e-06, + "loss": 0.0697, + "step": 22840 + }, + { + "epoch": 0.7311298115380923, + "grad_norm": 10.4375, + "learning_rate": 5.661165375547323e-06, + "loss": 0.1829, + "step": 22850 + }, + { + "epoch": 0.7314497808210412, + "grad_norm": 8.1875, + "learning_rate": 5.6544291007073095e-06, + "loss": 0.0806, + "step": 22860 + }, + { + "epoch": 0.73176975010399, + "grad_norm": 10.625, + "learning_rate": 5.6476928258672955e-06, + "loss": 0.1137, + "step": 22870 + }, + { + "epoch": 0.7320897193869389, + "grad_norm": 6.03125, + "learning_rate": 5.640956551027282e-06, + "loss": 0.1092, + "step": 22880 + }, + { + "epoch": 0.7324096886698876, + "grad_norm": 11.9375, + "learning_rate": 5.6342202761872685e-06, + "loss": 0.143, + "step": 22890 + }, + { + "epoch": 0.7327296579528365, + "grad_norm": 0.93359375, + "learning_rate": 5.627484001347255e-06, + "loss": 0.1039, + "step": 22900 + }, + { + "epoch": 0.7330496272357854, + "grad_norm": 14.6875, + "learning_rate": 5.620747726507242e-06, + "loss": 0.1587, + "step": 22910 + }, + { + "epoch": 0.7333695965187342, + "grad_norm": 1.546875, + "learning_rate": 5.614011451667228e-06, + "loss": 0.0811, + "step": 22920 + }, + { + "epoch": 0.733689565801683, + "grad_norm": 14.625, + "learning_rate": 5.607275176827215e-06, + "loss": 0.0906, + "step": 22930 + }, + { + "epoch": 0.7340095350846318, + "grad_norm": 6.1875, + "learning_rate": 5.600538901987201e-06, + "loss": 0.088, + "step": 22940 + }, + { + "epoch": 0.7343295043675807, + "grad_norm": 18.25, + "learning_rate": 5.593802627147188e-06, + "loss": 0.1123, + "step": 22950 + }, + { + "epoch": 0.7346494736505296, + "grad_norm": 12.1875, + "learning_rate": 5.587066352307174e-06, + "loss": 0.123, + "step": 22960 + }, + { + "epoch": 0.7349694429334784, + "grad_norm": 2.171875, + "learning_rate": 5.580330077467161e-06, + "loss": 0.125, + "step": 22970 + }, + { + "epoch": 0.7352894122164272, + "grad_norm": 5.5625, + "learning_rate": 5.573593802627147e-06, + "loss": 0.0638, + "step": 22980 + }, + { + "epoch": 0.735609381499376, + "grad_norm": 5.34375, + "learning_rate": 5.566857527787134e-06, + "loss": 0.1272, + "step": 22990 + }, + { + "epoch": 0.7359293507823249, + "grad_norm": 11.9375, + "learning_rate": 5.560121252947121e-06, + "loss": 0.0605, + "step": 23000 + }, + { + "epoch": 0.7362493200652738, + "grad_norm": 1.78125, + "learning_rate": 5.553384978107107e-06, + "loss": 0.0956, + "step": 23010 + }, + { + "epoch": 0.7365692893482225, + "grad_norm": 9.625, + "learning_rate": 5.546648703267094e-06, + "loss": 0.1145, + "step": 23020 + }, + { + "epoch": 0.7368892586311714, + "grad_norm": 1.4453125, + "learning_rate": 5.53991242842708e-06, + "loss": 0.0846, + "step": 23030 + }, + { + "epoch": 0.7372092279141202, + "grad_norm": 3.46875, + "learning_rate": 5.533176153587067e-06, + "loss": 0.0686, + "step": 23040 + }, + { + "epoch": 0.7375291971970691, + "grad_norm": 3.09375, + "learning_rate": 5.526439878747053e-06, + "loss": 0.1311, + "step": 23050 + }, + { + "epoch": 0.737849166480018, + "grad_norm": 2.390625, + "learning_rate": 5.51970360390704e-06, + "loss": 0.0558, + "step": 23060 + }, + { + "epoch": 0.7381691357629667, + "grad_norm": 4.84375, + "learning_rate": 5.512967329067026e-06, + "loss": 0.1392, + "step": 23070 + }, + { + "epoch": 0.7384891050459156, + "grad_norm": 0.69921875, + "learning_rate": 5.5062310542270126e-06, + "loss": 0.0896, + "step": 23080 + }, + { + "epoch": 0.7388090743288644, + "grad_norm": 8.0, + "learning_rate": 5.4994947793869994e-06, + "loss": 0.0966, + "step": 23090 + }, + { + "epoch": 0.7391290436118133, + "grad_norm": 2.390625, + "learning_rate": 5.4927585045469855e-06, + "loss": 0.082, + "step": 23100 + }, + { + "epoch": 0.7394490128947621, + "grad_norm": 7.125, + "learning_rate": 5.486022229706972e-06, + "loss": 0.0849, + "step": 23110 + }, + { + "epoch": 0.7397689821777109, + "grad_norm": 25.375, + "learning_rate": 5.479285954866958e-06, + "loss": 0.1424, + "step": 23120 + }, + { + "epoch": 0.7400889514606598, + "grad_norm": 0.8046875, + "learning_rate": 5.472549680026945e-06, + "loss": 0.0932, + "step": 23130 + }, + { + "epoch": 0.7404089207436086, + "grad_norm": 14.0625, + "learning_rate": 5.465813405186931e-06, + "loss": 0.0951, + "step": 23140 + }, + { + "epoch": 0.7407288900265574, + "grad_norm": 8.9375, + "learning_rate": 5.459077130346918e-06, + "loss": 0.0788, + "step": 23150 + }, + { + "epoch": 0.7410488593095063, + "grad_norm": 17.875, + "learning_rate": 5.452340855506904e-06, + "loss": 0.0854, + "step": 23160 + }, + { + "epoch": 0.7413688285924551, + "grad_norm": 1.78125, + "learning_rate": 5.445604580666891e-06, + "loss": 0.0994, + "step": 23170 + }, + { + "epoch": 0.741688797875404, + "grad_norm": 0.85546875, + "learning_rate": 5.438868305826878e-06, + "loss": 0.0653, + "step": 23180 + }, + { + "epoch": 0.7420087671583528, + "grad_norm": 25.125, + "learning_rate": 5.432132030986865e-06, + "loss": 0.093, + "step": 23190 + }, + { + "epoch": 0.7423287364413016, + "grad_norm": 9.375, + "learning_rate": 5.425395756146852e-06, + "loss": 0.1216, + "step": 23200 + }, + { + "epoch": 0.7426487057242505, + "grad_norm": 4.8125, + "learning_rate": 5.418659481306838e-06, + "loss": 0.1029, + "step": 23210 + }, + { + "epoch": 0.7429686750071993, + "grad_norm": 24.875, + "learning_rate": 5.411923206466825e-06, + "loss": 0.1356, + "step": 23220 + }, + { + "epoch": 0.7432886442901482, + "grad_norm": 8.75, + "learning_rate": 5.405186931626812e-06, + "loss": 0.1165, + "step": 23230 + }, + { + "epoch": 0.7436086135730969, + "grad_norm": 13.0, + "learning_rate": 5.398450656786798e-06, + "loss": 0.1202, + "step": 23240 + }, + { + "epoch": 0.7439285828560458, + "grad_norm": 2.28125, + "learning_rate": 5.391714381946785e-06, + "loss": 0.0822, + "step": 23250 + }, + { + "epoch": 0.7442485521389947, + "grad_norm": 1.734375, + "learning_rate": 5.384978107106771e-06, + "loss": 0.0837, + "step": 23260 + }, + { + "epoch": 0.7445685214219435, + "grad_norm": 0.294921875, + "learning_rate": 5.3782418322667575e-06, + "loss": 0.134, + "step": 23270 + }, + { + "epoch": 0.7448884907048924, + "grad_norm": 11.9375, + "learning_rate": 5.3715055574267435e-06, + "loss": 0.1752, + "step": 23280 + }, + { + "epoch": 0.7452084599878411, + "grad_norm": 0.73046875, + "learning_rate": 5.3647692825867304e-06, + "loss": 0.1131, + "step": 23290 + }, + { + "epoch": 0.74552842927079, + "grad_norm": 13.375, + "learning_rate": 5.3580330077467165e-06, + "loss": 0.0687, + "step": 23300 + }, + { + "epoch": 0.7458483985537389, + "grad_norm": 0.5390625, + "learning_rate": 5.351296732906703e-06, + "loss": 0.1376, + "step": 23310 + }, + { + "epoch": 0.7461683678366877, + "grad_norm": 19.75, + "learning_rate": 5.34456045806669e-06, + "loss": 0.167, + "step": 23320 + }, + { + "epoch": 0.7464883371196365, + "grad_norm": 17.125, + "learning_rate": 5.337824183226676e-06, + "loss": 0.1862, + "step": 23330 + }, + { + "epoch": 0.7468083064025853, + "grad_norm": 19.0, + "learning_rate": 5.331087908386663e-06, + "loss": 0.1813, + "step": 23340 + }, + { + "epoch": 0.7471282756855342, + "grad_norm": 5.625, + "learning_rate": 5.324351633546649e-06, + "loss": 0.1141, + "step": 23350 + }, + { + "epoch": 0.747448244968483, + "grad_norm": 15.875, + "learning_rate": 5.317615358706636e-06, + "loss": 0.0961, + "step": 23360 + }, + { + "epoch": 0.7477682142514319, + "grad_norm": 1.984375, + "learning_rate": 5.310879083866622e-06, + "loss": 0.125, + "step": 23370 + }, + { + "epoch": 0.7480881835343807, + "grad_norm": 1.6640625, + "learning_rate": 5.304142809026609e-06, + "loss": 0.1249, + "step": 23380 + }, + { + "epoch": 0.7484081528173295, + "grad_norm": 4.34375, + "learning_rate": 5.297406534186595e-06, + "loss": 0.0731, + "step": 23390 + }, + { + "epoch": 0.7487281221002784, + "grad_norm": 2.078125, + "learning_rate": 5.290670259346582e-06, + "loss": 0.094, + "step": 23400 + }, + { + "epoch": 0.7490480913832273, + "grad_norm": 26.125, + "learning_rate": 5.283933984506569e-06, + "loss": 0.1594, + "step": 23410 + }, + { + "epoch": 0.749368060666176, + "grad_norm": 0.93359375, + "learning_rate": 5.277197709666555e-06, + "loss": 0.1418, + "step": 23420 + }, + { + "epoch": 0.7496880299491249, + "grad_norm": 14.375, + "learning_rate": 5.270461434826542e-06, + "loss": 0.1032, + "step": 23430 + }, + { + "epoch": 0.7500079992320737, + "grad_norm": 7.5625, + "learning_rate": 5.263725159986528e-06, + "loss": 0.0911, + "step": 23440 + }, + { + "epoch": 0.7503279685150226, + "grad_norm": 5.5625, + "learning_rate": 5.256988885146515e-06, + "loss": 0.1399, + "step": 23450 + }, + { + "epoch": 0.7506479377979713, + "grad_norm": 14.1875, + "learning_rate": 5.250252610306501e-06, + "loss": 0.1017, + "step": 23460 + }, + { + "epoch": 0.7509679070809202, + "grad_norm": 9.6875, + "learning_rate": 5.243516335466488e-06, + "loss": 0.0914, + "step": 23470 + }, + { + "epoch": 0.7512878763638691, + "grad_norm": 16.5, + "learning_rate": 5.236780060626474e-06, + "loss": 0.1092, + "step": 23480 + }, + { + "epoch": 0.7516078456468179, + "grad_norm": 8.375, + "learning_rate": 5.230043785786461e-06, + "loss": 0.1256, + "step": 23490 + }, + { + "epoch": 0.7519278149297668, + "grad_norm": 15.25, + "learning_rate": 5.2233075109464475e-06, + "loss": 0.1031, + "step": 23500 + }, + { + "epoch": 0.7522477842127155, + "grad_norm": 19.875, + "learning_rate": 5.2165712361064335e-06, + "loss": 0.121, + "step": 23510 + }, + { + "epoch": 0.7525677534956644, + "grad_norm": 7.5, + "learning_rate": 5.20983496126642e-06, + "loss": 0.0723, + "step": 23520 + }, + { + "epoch": 0.7528877227786133, + "grad_norm": 0.47265625, + "learning_rate": 5.2030986864264064e-06, + "loss": 0.0531, + "step": 23530 + }, + { + "epoch": 0.7532076920615621, + "grad_norm": 20.625, + "learning_rate": 5.196362411586393e-06, + "loss": 0.113, + "step": 23540 + }, + { + "epoch": 0.7535276613445109, + "grad_norm": 18.5, + "learning_rate": 5.189626136746379e-06, + "loss": 0.0848, + "step": 23550 + }, + { + "epoch": 0.7538476306274597, + "grad_norm": 1.0859375, + "learning_rate": 5.182889861906366e-06, + "loss": 0.1387, + "step": 23560 + }, + { + "epoch": 0.7541675999104086, + "grad_norm": 13.8125, + "learning_rate": 5.176153587066352e-06, + "loss": 0.1026, + "step": 23570 + }, + { + "epoch": 0.7544875691933575, + "grad_norm": 7.8125, + "learning_rate": 5.169417312226339e-06, + "loss": 0.0791, + "step": 23580 + }, + { + "epoch": 0.7548075384763063, + "grad_norm": 17.875, + "learning_rate": 5.162681037386325e-06, + "loss": 0.1348, + "step": 23590 + }, + { + "epoch": 0.7551275077592551, + "grad_norm": 11.875, + "learning_rate": 5.155944762546312e-06, + "loss": 0.084, + "step": 23600 + }, + { + "epoch": 0.7554474770422039, + "grad_norm": 18.25, + "learning_rate": 5.149208487706299e-06, + "loss": 0.0931, + "step": 23610 + }, + { + "epoch": 0.7557674463251528, + "grad_norm": 31.625, + "learning_rate": 5.142472212866285e-06, + "loss": 0.1076, + "step": 23620 + }, + { + "epoch": 0.7560874156081017, + "grad_norm": 5.6875, + "learning_rate": 5.135735938026272e-06, + "loss": 0.0966, + "step": 23630 + }, + { + "epoch": 0.7564073848910504, + "grad_norm": 0.412109375, + "learning_rate": 5.128999663186258e-06, + "loss": 0.0546, + "step": 23640 + }, + { + "epoch": 0.7567273541739993, + "grad_norm": 14.25, + "learning_rate": 5.122263388346245e-06, + "loss": 0.1344, + "step": 23650 + }, + { + "epoch": 0.7570473234569481, + "grad_norm": 20.0, + "learning_rate": 5.115527113506231e-06, + "loss": 0.1221, + "step": 23660 + }, + { + "epoch": 0.757367292739897, + "grad_norm": 9.9375, + "learning_rate": 5.108790838666218e-06, + "loss": 0.1176, + "step": 23670 + }, + { + "epoch": 0.7576872620228458, + "grad_norm": 0.796875, + "learning_rate": 5.102054563826204e-06, + "loss": 0.0572, + "step": 23680 + }, + { + "epoch": 0.7580072313057946, + "grad_norm": 2.8125, + "learning_rate": 5.095318288986191e-06, + "loss": 0.0788, + "step": 23690 + }, + { + "epoch": 0.7583272005887435, + "grad_norm": 14.75, + "learning_rate": 5.088582014146178e-06, + "loss": 0.1099, + "step": 23700 + }, + { + "epoch": 0.7586471698716923, + "grad_norm": 9.75, + "learning_rate": 5.081845739306164e-06, + "loss": 0.0943, + "step": 23710 + }, + { + "epoch": 0.7589671391546412, + "grad_norm": 6.0625, + "learning_rate": 5.0751094644661505e-06, + "loss": 0.1269, + "step": 23720 + }, + { + "epoch": 0.75928710843759, + "grad_norm": 15.1875, + "learning_rate": 5.068373189626137e-06, + "loss": 0.1426, + "step": 23730 + }, + { + "epoch": 0.7596070777205388, + "grad_norm": 4.71875, + "learning_rate": 5.0616369147861235e-06, + "loss": 0.0816, + "step": 23740 + }, + { + "epoch": 0.7599270470034877, + "grad_norm": 15.25, + "learning_rate": 5.0549006399461095e-06, + "loss": 0.1168, + "step": 23750 + }, + { + "epoch": 0.7602470162864365, + "grad_norm": 20.125, + "learning_rate": 5.048164365106096e-06, + "loss": 0.1414, + "step": 23760 + }, + { + "epoch": 0.7605669855693853, + "grad_norm": 12.25, + "learning_rate": 5.0414280902660824e-06, + "loss": 0.159, + "step": 23770 + }, + { + "epoch": 0.7608869548523342, + "grad_norm": 7.0625, + "learning_rate": 5.034691815426069e-06, + "loss": 0.1347, + "step": 23780 + }, + { + "epoch": 0.761206924135283, + "grad_norm": 16.875, + "learning_rate": 5.027955540586056e-06, + "loss": 0.1062, + "step": 23790 + }, + { + "epoch": 0.7615268934182319, + "grad_norm": 0.9765625, + "learning_rate": 5.021219265746042e-06, + "loss": 0.1231, + "step": 23800 + }, + { + "epoch": 0.7618468627011807, + "grad_norm": 0.94921875, + "learning_rate": 5.014482990906029e-06, + "loss": 0.1353, + "step": 23810 + }, + { + "epoch": 0.7621668319841295, + "grad_norm": 1.1640625, + "learning_rate": 5.007746716066015e-06, + "loss": 0.0671, + "step": 23820 + }, + { + "epoch": 0.7624868012670784, + "grad_norm": 5.71875, + "learning_rate": 5.001010441226002e-06, + "loss": 0.0989, + "step": 23830 + }, + { + "epoch": 0.7628067705500272, + "grad_norm": 0.91015625, + "learning_rate": 4.994274166385989e-06, + "loss": 0.0582, + "step": 23840 + }, + { + "epoch": 0.7631267398329761, + "grad_norm": 25.5, + "learning_rate": 4.987537891545976e-06, + "loss": 0.1007, + "step": 23850 + }, + { + "epoch": 0.7634467091159248, + "grad_norm": 25.75, + "learning_rate": 4.980801616705962e-06, + "loss": 0.1645, + "step": 23860 + }, + { + "epoch": 0.7637666783988737, + "grad_norm": 3.453125, + "learning_rate": 4.974065341865949e-06, + "loss": 0.1078, + "step": 23870 + }, + { + "epoch": 0.7640866476818226, + "grad_norm": 13.5, + "learning_rate": 4.967329067025935e-06, + "loss": 0.1069, + "step": 23880 + }, + { + "epoch": 0.7644066169647714, + "grad_norm": 6.375, + "learning_rate": 4.960592792185922e-06, + "loss": 0.2056, + "step": 23890 + }, + { + "epoch": 0.7647265862477202, + "grad_norm": 20.875, + "learning_rate": 4.953856517345908e-06, + "loss": 0.1696, + "step": 23900 + }, + { + "epoch": 0.765046555530669, + "grad_norm": 14.8125, + "learning_rate": 4.947120242505895e-06, + "loss": 0.0932, + "step": 23910 + }, + { + "epoch": 0.7653665248136179, + "grad_norm": 1.65625, + "learning_rate": 4.940383967665881e-06, + "loss": 0.1406, + "step": 23920 + }, + { + "epoch": 0.7656864940965667, + "grad_norm": 16.875, + "learning_rate": 4.9336476928258676e-06, + "loss": 0.0738, + "step": 23930 + }, + { + "epoch": 0.7660064633795156, + "grad_norm": 33.5, + "learning_rate": 4.926911417985854e-06, + "loss": 0.1093, + "step": 23940 + }, + { + "epoch": 0.7663264326624644, + "grad_norm": 0.40234375, + "learning_rate": 4.9201751431458405e-06, + "loss": 0.1582, + "step": 23950 + }, + { + "epoch": 0.7666464019454132, + "grad_norm": 4.71875, + "learning_rate": 4.913438868305827e-06, + "loss": 0.1022, + "step": 23960 + }, + { + "epoch": 0.7669663712283621, + "grad_norm": 9.0625, + "learning_rate": 4.9067025934658134e-06, + "loss": 0.1207, + "step": 23970 + }, + { + "epoch": 0.767286340511311, + "grad_norm": 20.0, + "learning_rate": 4.8999663186258e-06, + "loss": 0.044, + "step": 23980 + }, + { + "epoch": 0.7676063097942597, + "grad_norm": 14.5, + "learning_rate": 4.893230043785786e-06, + "loss": 0.0908, + "step": 23990 + }, + { + "epoch": 0.7679262790772086, + "grad_norm": 11.0, + "learning_rate": 4.886493768945773e-06, + "loss": 0.1493, + "step": 24000 + }, + { + "epoch": 0.7682462483601574, + "grad_norm": 6.03125, + "learning_rate": 4.87975749410576e-06, + "loss": 0.0609, + "step": 24010 + }, + { + "epoch": 0.7685662176431063, + "grad_norm": 6.875, + "learning_rate": 4.873021219265747e-06, + "loss": 0.1127, + "step": 24020 + }, + { + "epoch": 0.7688861869260551, + "grad_norm": 3.46875, + "learning_rate": 4.866284944425733e-06, + "loss": 0.0408, + "step": 24030 + }, + { + "epoch": 0.7692061562090039, + "grad_norm": 10.5, + "learning_rate": 4.85954866958572e-06, + "loss": 0.1193, + "step": 24040 + }, + { + "epoch": 0.7695261254919528, + "grad_norm": 7.6875, + "learning_rate": 4.852812394745706e-06, + "loss": 0.1095, + "step": 24050 + }, + { + "epoch": 0.7698460947749016, + "grad_norm": 19.25, + "learning_rate": 4.846076119905693e-06, + "loss": 0.1393, + "step": 24060 + }, + { + "epoch": 0.7701660640578505, + "grad_norm": 20.875, + "learning_rate": 4.839339845065679e-06, + "loss": 0.113, + "step": 24070 + }, + { + "epoch": 0.7704860333407992, + "grad_norm": 5.5625, + "learning_rate": 4.832603570225666e-06, + "loss": 0.0762, + "step": 24080 + }, + { + "epoch": 0.7708060026237481, + "grad_norm": 2.890625, + "learning_rate": 4.825867295385652e-06, + "loss": 0.0885, + "step": 24090 + }, + { + "epoch": 0.771125971906697, + "grad_norm": 17.625, + "learning_rate": 4.819131020545639e-06, + "loss": 0.0782, + "step": 24100 + }, + { + "epoch": 0.7714459411896458, + "grad_norm": 16.0, + "learning_rate": 4.812394745705626e-06, + "loss": 0.1233, + "step": 24110 + }, + { + "epoch": 0.7717659104725947, + "grad_norm": 0.890625, + "learning_rate": 4.805658470865612e-06, + "loss": 0.0857, + "step": 24120 + }, + { + "epoch": 0.7720858797555434, + "grad_norm": 1.15625, + "learning_rate": 4.7989221960255986e-06, + "loss": 0.1749, + "step": 24130 + }, + { + "epoch": 0.7724058490384923, + "grad_norm": 0.2138671875, + "learning_rate": 4.792185921185585e-06, + "loss": 0.1039, + "step": 24140 + }, + { + "epoch": 0.7727258183214412, + "grad_norm": 12.25, + "learning_rate": 4.7854496463455715e-06, + "loss": 0.0921, + "step": 24150 + }, + { + "epoch": 0.77304578760439, + "grad_norm": 3.65625, + "learning_rate": 4.7787133715055575e-06, + "loss": 0.1247, + "step": 24160 + }, + { + "epoch": 0.7733657568873388, + "grad_norm": 1.9609375, + "learning_rate": 4.7719770966655444e-06, + "loss": 0.1465, + "step": 24170 + }, + { + "epoch": 0.7736857261702876, + "grad_norm": 3.265625, + "learning_rate": 4.7652408218255305e-06, + "loss": 0.0654, + "step": 24180 + }, + { + "epoch": 0.7740056954532365, + "grad_norm": 21.25, + "learning_rate": 4.758504546985517e-06, + "loss": 0.0878, + "step": 24190 + }, + { + "epoch": 0.7743256647361854, + "grad_norm": 10.25, + "learning_rate": 4.751768272145504e-06, + "loss": 0.1133, + "step": 24200 + }, + { + "epoch": 0.7746456340191341, + "grad_norm": 6.5625, + "learning_rate": 4.74503199730549e-06, + "loss": 0.1045, + "step": 24210 + }, + { + "epoch": 0.774965603302083, + "grad_norm": 21.25, + "learning_rate": 4.738295722465477e-06, + "loss": 0.1489, + "step": 24220 + }, + { + "epoch": 0.7752855725850318, + "grad_norm": 10.75, + "learning_rate": 4.731559447625463e-06, + "loss": 0.0739, + "step": 24230 + }, + { + "epoch": 0.7756055418679807, + "grad_norm": 12.875, + "learning_rate": 4.72482317278545e-06, + "loss": 0.0788, + "step": 24240 + }, + { + "epoch": 0.7759255111509296, + "grad_norm": 28.375, + "learning_rate": 4.718086897945436e-06, + "loss": 0.1271, + "step": 24250 + }, + { + "epoch": 0.7762454804338783, + "grad_norm": 1.1640625, + "learning_rate": 4.711350623105423e-06, + "loss": 0.0603, + "step": 24260 + }, + { + "epoch": 0.7765654497168272, + "grad_norm": 2.53125, + "learning_rate": 4.704614348265409e-06, + "loss": 0.0914, + "step": 24270 + }, + { + "epoch": 0.776885418999776, + "grad_norm": 23.75, + "learning_rate": 4.697878073425396e-06, + "loss": 0.112, + "step": 24280 + }, + { + "epoch": 0.7772053882827249, + "grad_norm": 23.125, + "learning_rate": 4.691141798585383e-06, + "loss": 0.1202, + "step": 24290 + }, + { + "epoch": 0.7775253575656736, + "grad_norm": 12.25, + "learning_rate": 4.684405523745369e-06, + "loss": 0.1734, + "step": 24300 + }, + { + "epoch": 0.7778453268486225, + "grad_norm": 15.75, + "learning_rate": 4.677669248905356e-06, + "loss": 0.0789, + "step": 24310 + }, + { + "epoch": 0.7781652961315714, + "grad_norm": 35.25, + "learning_rate": 4.670932974065343e-06, + "loss": 0.1652, + "step": 24320 + }, + { + "epoch": 0.7784852654145202, + "grad_norm": 10.375, + "learning_rate": 4.664196699225329e-06, + "loss": 0.0908, + "step": 24330 + }, + { + "epoch": 0.7788052346974691, + "grad_norm": 17.625, + "learning_rate": 4.657460424385316e-06, + "loss": 0.1501, + "step": 24340 + }, + { + "epoch": 0.7791252039804178, + "grad_norm": 10.125, + "learning_rate": 4.650724149545302e-06, + "loss": 0.1478, + "step": 24350 + }, + { + "epoch": 0.7794451732633667, + "grad_norm": 4.5625, + "learning_rate": 4.6439878747052885e-06, + "loss": 0.1584, + "step": 24360 + }, + { + "epoch": 0.7797651425463156, + "grad_norm": 14.1875, + "learning_rate": 4.637251599865275e-06, + "loss": 0.1025, + "step": 24370 + }, + { + "epoch": 0.7800851118292644, + "grad_norm": 25.875, + "learning_rate": 4.6305153250252615e-06, + "loss": 0.1103, + "step": 24380 + }, + { + "epoch": 0.7804050811122132, + "grad_norm": 5.3125, + "learning_rate": 4.623779050185248e-06, + "loss": 0.0872, + "step": 24390 + }, + { + "epoch": 0.780725050395162, + "grad_norm": 12.4375, + "learning_rate": 4.617042775345234e-06, + "loss": 0.0856, + "step": 24400 + }, + { + "epoch": 0.7810450196781109, + "grad_norm": 1.4296875, + "learning_rate": 4.610306500505221e-06, + "loss": 0.1048, + "step": 24410 + }, + { + "epoch": 0.7813649889610598, + "grad_norm": 54.5, + "learning_rate": 4.603570225665207e-06, + "loss": 0.094, + "step": 24420 + }, + { + "epoch": 0.7816849582440085, + "grad_norm": 15.9375, + "learning_rate": 4.596833950825194e-06, + "loss": 0.1369, + "step": 24430 + }, + { + "epoch": 0.7820049275269574, + "grad_norm": 2.90625, + "learning_rate": 4.59009767598518e-06, + "loss": 0.0601, + "step": 24440 + }, + { + "epoch": 0.7823248968099062, + "grad_norm": 11.75, + "learning_rate": 4.583361401145167e-06, + "loss": 0.1012, + "step": 24450 + }, + { + "epoch": 0.7826448660928551, + "grad_norm": 11.1875, + "learning_rate": 4.576625126305154e-06, + "loss": 0.0969, + "step": 24460 + }, + { + "epoch": 0.782964835375804, + "grad_norm": 25.375, + "learning_rate": 4.56988885146514e-06, + "loss": 0.1122, + "step": 24470 + }, + { + "epoch": 0.7832848046587527, + "grad_norm": 15.9375, + "learning_rate": 4.563152576625127e-06, + "loss": 0.053, + "step": 24480 + }, + { + "epoch": 0.7836047739417016, + "grad_norm": 2.1875, + "learning_rate": 4.556416301785113e-06, + "loss": 0.0673, + "step": 24490 + }, + { + "epoch": 0.7839247432246504, + "grad_norm": 16.25, + "learning_rate": 4.5496800269451e-06, + "loss": 0.1106, + "step": 24500 + }, + { + "epoch": 0.7842447125075993, + "grad_norm": 2.703125, + "learning_rate": 4.542943752105086e-06, + "loss": 0.0674, + "step": 24510 + }, + { + "epoch": 0.7845646817905481, + "grad_norm": 4.9375, + "learning_rate": 4.536207477265073e-06, + "loss": 0.1191, + "step": 24520 + }, + { + "epoch": 0.7848846510734969, + "grad_norm": 8.3125, + "learning_rate": 4.529471202425059e-06, + "loss": 0.0918, + "step": 24530 + }, + { + "epoch": 0.7852046203564458, + "grad_norm": 49.75, + "learning_rate": 4.522734927585046e-06, + "loss": 0.1386, + "step": 24540 + }, + { + "epoch": 0.7855245896393946, + "grad_norm": 24.25, + "learning_rate": 4.515998652745033e-06, + "loss": 0.1061, + "step": 24550 + }, + { + "epoch": 0.7858445589223435, + "grad_norm": 7.5625, + "learning_rate": 4.509262377905019e-06, + "loss": 0.1245, + "step": 24560 + }, + { + "epoch": 0.7861645282052923, + "grad_norm": 13.8125, + "learning_rate": 4.5025261030650056e-06, + "loss": 0.1331, + "step": 24570 + }, + { + "epoch": 0.7864844974882411, + "grad_norm": 0.79296875, + "learning_rate": 4.495789828224992e-06, + "loss": 0.0716, + "step": 24580 + }, + { + "epoch": 0.78680446677119, + "grad_norm": 11.6875, + "learning_rate": 4.4890535533849785e-06, + "loss": 0.0899, + "step": 24590 + }, + { + "epoch": 0.7871244360541388, + "grad_norm": 17.875, + "learning_rate": 4.4823172785449645e-06, + "loss": 0.1161, + "step": 24600 + }, + { + "epoch": 0.7874444053370876, + "grad_norm": 6.28125, + "learning_rate": 4.475581003704951e-06, + "loss": 0.1215, + "step": 24610 + }, + { + "epoch": 0.7877643746200365, + "grad_norm": 10.5, + "learning_rate": 4.4688447288649375e-06, + "loss": 0.0983, + "step": 24620 + }, + { + "epoch": 0.7880843439029853, + "grad_norm": 14.1875, + "learning_rate": 4.462108454024924e-06, + "loss": 0.112, + "step": 24630 + }, + { + "epoch": 0.7884043131859342, + "grad_norm": 5.78125, + "learning_rate": 4.455372179184911e-06, + "loss": 0.0698, + "step": 24640 + }, + { + "epoch": 0.788724282468883, + "grad_norm": 5.28125, + "learning_rate": 4.448635904344898e-06, + "loss": 0.101, + "step": 24650 + }, + { + "epoch": 0.7890442517518318, + "grad_norm": 13.1875, + "learning_rate": 4.441899629504884e-06, + "loss": 0.1169, + "step": 24660 + }, + { + "epoch": 0.7893642210347807, + "grad_norm": 5.28125, + "learning_rate": 4.435163354664871e-06, + "loss": 0.076, + "step": 24670 + }, + { + "epoch": 0.7896841903177295, + "grad_norm": 13.5625, + "learning_rate": 4.428427079824857e-06, + "loss": 0.107, + "step": 24680 + }, + { + "epoch": 0.7900041596006784, + "grad_norm": 8.75, + "learning_rate": 4.421690804984844e-06, + "loss": 0.0896, + "step": 24690 + }, + { + "epoch": 0.7903241288836271, + "grad_norm": 9.0, + "learning_rate": 4.41495453014483e-06, + "loss": 0.0668, + "step": 24700 + }, + { + "epoch": 0.790644098166576, + "grad_norm": 25.75, + "learning_rate": 4.408218255304817e-06, + "loss": 0.1462, + "step": 24710 + }, + { + "epoch": 0.7909640674495249, + "grad_norm": 7.46875, + "learning_rate": 4.401481980464804e-06, + "loss": 0.0954, + "step": 24720 + }, + { + "epoch": 0.7912840367324737, + "grad_norm": 3.890625, + "learning_rate": 4.39474570562479e-06, + "loss": 0.1271, + "step": 24730 + }, + { + "epoch": 0.7916040060154225, + "grad_norm": 24.625, + "learning_rate": 4.388009430784777e-06, + "loss": 0.0901, + "step": 24740 + }, + { + "epoch": 0.7919239752983713, + "grad_norm": 7.09375, + "learning_rate": 4.381273155944763e-06, + "loss": 0.101, + "step": 24750 + }, + { + "epoch": 0.7922439445813202, + "grad_norm": 1.21875, + "learning_rate": 4.37453688110475e-06, + "loss": 0.1219, + "step": 24760 + }, + { + "epoch": 0.792563913864269, + "grad_norm": 4.15625, + "learning_rate": 4.367800606264736e-06, + "loss": 0.119, + "step": 24770 + }, + { + "epoch": 0.7928838831472179, + "grad_norm": 15.5625, + "learning_rate": 4.361064331424723e-06, + "loss": 0.1503, + "step": 24780 + }, + { + "epoch": 0.7932038524301667, + "grad_norm": 17.375, + "learning_rate": 4.354328056584709e-06, + "loss": 0.1509, + "step": 24790 + }, + { + "epoch": 0.7935238217131155, + "grad_norm": 20.5, + "learning_rate": 4.3475917817446955e-06, + "loss": 0.1321, + "step": 24800 + }, + { + "epoch": 0.7938437909960644, + "grad_norm": 18.625, + "learning_rate": 4.340855506904682e-06, + "loss": 0.0568, + "step": 24810 + }, + { + "epoch": 0.7941637602790133, + "grad_norm": 1.015625, + "learning_rate": 4.3341192320646685e-06, + "loss": 0.1381, + "step": 24820 + }, + { + "epoch": 0.794483729561962, + "grad_norm": 9.1875, + "learning_rate": 4.327382957224655e-06, + "loss": 0.13, + "step": 24830 + }, + { + "epoch": 0.7948036988449109, + "grad_norm": 14.5, + "learning_rate": 4.320646682384641e-06, + "loss": 0.1341, + "step": 24840 + }, + { + "epoch": 0.7951236681278597, + "grad_norm": 8.5, + "learning_rate": 4.313910407544628e-06, + "loss": 0.0861, + "step": 24850 + }, + { + "epoch": 0.7954436374108086, + "grad_norm": 5.25, + "learning_rate": 4.307174132704614e-06, + "loss": 0.1732, + "step": 24860 + }, + { + "epoch": 0.7957636066937575, + "grad_norm": 7.0625, + "learning_rate": 4.300437857864601e-06, + "loss": 0.073, + "step": 24870 + }, + { + "epoch": 0.7960835759767062, + "grad_norm": 15.4375, + "learning_rate": 4.293701583024587e-06, + "loss": 0.1574, + "step": 24880 + }, + { + "epoch": 0.7964035452596551, + "grad_norm": 4.15625, + "learning_rate": 4.286965308184574e-06, + "loss": 0.1038, + "step": 24890 + }, + { + "epoch": 0.7967235145426039, + "grad_norm": 0.494140625, + "learning_rate": 4.280229033344561e-06, + "loss": 0.0847, + "step": 24900 + }, + { + "epoch": 0.7970434838255528, + "grad_norm": 1.1640625, + "learning_rate": 4.273492758504547e-06, + "loss": 0.1113, + "step": 24910 + }, + { + "epoch": 0.7973634531085015, + "grad_norm": 12.375, + "learning_rate": 4.266756483664534e-06, + "loss": 0.1403, + "step": 24920 + }, + { + "epoch": 0.7976834223914504, + "grad_norm": 10.1875, + "learning_rate": 4.26002020882452e-06, + "loss": 0.07, + "step": 24930 + }, + { + "epoch": 0.7980033916743993, + "grad_norm": 3.140625, + "learning_rate": 4.253283933984507e-06, + "loss": 0.1548, + "step": 24940 + }, + { + "epoch": 0.7983233609573481, + "grad_norm": 13.3125, + "learning_rate": 4.246547659144493e-06, + "loss": 0.1071, + "step": 24950 + }, + { + "epoch": 0.7986433302402969, + "grad_norm": 15.875, + "learning_rate": 4.23981138430448e-06, + "loss": 0.1146, + "step": 24960 + }, + { + "epoch": 0.7989632995232457, + "grad_norm": 4.8125, + "learning_rate": 4.233075109464467e-06, + "loss": 0.1451, + "step": 24970 + }, + { + "epoch": 0.7992832688061946, + "grad_norm": 7.84375, + "learning_rate": 4.226338834624454e-06, + "loss": 0.1184, + "step": 24980 + }, + { + "epoch": 0.7996032380891435, + "grad_norm": 7.34375, + "learning_rate": 4.21960255978444e-06, + "loss": 0.1455, + "step": 24990 + }, + { + "epoch": 0.7999232073720923, + "grad_norm": 1.5546875, + "learning_rate": 4.2128662849444265e-06, + "loss": 0.1296, + "step": 25000 + }, + { + "epoch": 0.8002431766550411, + "grad_norm": 8.875, + "learning_rate": 4.2061300101044126e-06, + "loss": 0.1127, + "step": 25010 + }, + { + "epoch": 0.8005631459379899, + "grad_norm": 12.25, + "learning_rate": 4.1993937352643994e-06, + "loss": 0.0406, + "step": 25020 + }, + { + "epoch": 0.8008831152209388, + "grad_norm": 9.0625, + "learning_rate": 4.1926574604243855e-06, + "loss": 0.1461, + "step": 25030 + }, + { + "epoch": 0.8012030845038877, + "grad_norm": 25.375, + "learning_rate": 4.185921185584372e-06, + "loss": 0.0801, + "step": 25040 + }, + { + "epoch": 0.8015230537868364, + "grad_norm": 17.625, + "learning_rate": 4.179184910744358e-06, + "loss": 0.0857, + "step": 25050 + }, + { + "epoch": 0.8018430230697853, + "grad_norm": 14.8125, + "learning_rate": 4.172448635904345e-06, + "loss": 0.118, + "step": 25060 + }, + { + "epoch": 0.8021629923527341, + "grad_norm": 1.2265625, + "learning_rate": 4.165712361064332e-06, + "loss": 0.1443, + "step": 25070 + }, + { + "epoch": 0.802482961635683, + "grad_norm": 29.5, + "learning_rate": 4.158976086224318e-06, + "loss": 0.0692, + "step": 25080 + }, + { + "epoch": 0.8028029309186319, + "grad_norm": 3.9375, + "learning_rate": 4.152239811384305e-06, + "loss": 0.1291, + "step": 25090 + }, + { + "epoch": 0.8031229002015806, + "grad_norm": 20.5, + "learning_rate": 4.145503536544291e-06, + "loss": 0.0995, + "step": 25100 + }, + { + "epoch": 0.8034428694845295, + "grad_norm": 0.84765625, + "learning_rate": 4.138767261704278e-06, + "loss": 0.1184, + "step": 25110 + }, + { + "epoch": 0.8037628387674783, + "grad_norm": 5.3125, + "learning_rate": 4.132030986864264e-06, + "loss": 0.0971, + "step": 25120 + }, + { + "epoch": 0.8040828080504272, + "grad_norm": 8.0625, + "learning_rate": 4.125294712024251e-06, + "loss": 0.0952, + "step": 25130 + }, + { + "epoch": 0.804402777333376, + "grad_norm": 1.4453125, + "learning_rate": 4.118558437184237e-06, + "loss": 0.108, + "step": 25140 + }, + { + "epoch": 0.8047227466163248, + "grad_norm": 11.6875, + "learning_rate": 4.111822162344224e-06, + "loss": 0.1038, + "step": 25150 + }, + { + "epoch": 0.8050427158992737, + "grad_norm": 12.0625, + "learning_rate": 4.105085887504211e-06, + "loss": 0.1048, + "step": 25160 + }, + { + "epoch": 0.8053626851822225, + "grad_norm": 12.3125, + "learning_rate": 4.098349612664197e-06, + "loss": 0.075, + "step": 25170 + }, + { + "epoch": 0.8056826544651713, + "grad_norm": 23.875, + "learning_rate": 4.091613337824184e-06, + "loss": 0.0974, + "step": 25180 + }, + { + "epoch": 0.8060026237481202, + "grad_norm": 5.5625, + "learning_rate": 4.08487706298417e-06, + "loss": 0.104, + "step": 25190 + }, + { + "epoch": 0.806322593031069, + "grad_norm": 16.5, + "learning_rate": 4.078140788144157e-06, + "loss": 0.0788, + "step": 25200 + }, + { + "epoch": 0.8066425623140179, + "grad_norm": 26.125, + "learning_rate": 4.071404513304143e-06, + "loss": 0.1206, + "step": 25210 + }, + { + "epoch": 0.8069625315969667, + "grad_norm": 0.412109375, + "learning_rate": 4.06466823846413e-06, + "loss": 0.0719, + "step": 25220 + }, + { + "epoch": 0.8072825008799155, + "grad_norm": 1.0078125, + "learning_rate": 4.057931963624116e-06, + "loss": 0.0406, + "step": 25230 + }, + { + "epoch": 0.8076024701628643, + "grad_norm": 5.5625, + "learning_rate": 4.0511956887841025e-06, + "loss": 0.1575, + "step": 25240 + }, + { + "epoch": 0.8079224394458132, + "grad_norm": 3.265625, + "learning_rate": 4.044459413944089e-06, + "loss": 0.0573, + "step": 25250 + }, + { + "epoch": 0.8082424087287621, + "grad_norm": 21.5, + "learning_rate": 4.0377231391040754e-06, + "loss": 0.1253, + "step": 25260 + }, + { + "epoch": 0.8085623780117108, + "grad_norm": 0.337890625, + "learning_rate": 4.030986864264062e-06, + "loss": 0.0968, + "step": 25270 + }, + { + "epoch": 0.8088823472946597, + "grad_norm": 0.67578125, + "learning_rate": 4.024250589424048e-06, + "loss": 0.1109, + "step": 25280 + }, + { + "epoch": 0.8092023165776085, + "grad_norm": 0.94921875, + "learning_rate": 4.017514314584035e-06, + "loss": 0.0831, + "step": 25290 + }, + { + "epoch": 0.8095222858605574, + "grad_norm": 32.5, + "learning_rate": 4.010778039744022e-06, + "loss": 0.1295, + "step": 25300 + }, + { + "epoch": 0.8098422551435063, + "grad_norm": 7.40625, + "learning_rate": 4.004041764904009e-06, + "loss": 0.0988, + "step": 25310 + }, + { + "epoch": 0.810162224426455, + "grad_norm": 9.1875, + "learning_rate": 3.997305490063995e-06, + "loss": 0.1058, + "step": 25320 + }, + { + "epoch": 0.8104821937094039, + "grad_norm": 3.09375, + "learning_rate": 3.990569215223982e-06, + "loss": 0.1499, + "step": 25330 + }, + { + "epoch": 0.8108021629923527, + "grad_norm": 1.0234375, + "learning_rate": 3.983832940383968e-06, + "loss": 0.0452, + "step": 25340 + }, + { + "epoch": 0.8111221322753016, + "grad_norm": 13.5, + "learning_rate": 3.977096665543955e-06, + "loss": 0.0572, + "step": 25350 + }, + { + "epoch": 0.8114421015582504, + "grad_norm": 12.875, + "learning_rate": 3.970360390703941e-06, + "loss": 0.1075, + "step": 25360 + }, + { + "epoch": 0.8117620708411992, + "grad_norm": 8.0, + "learning_rate": 3.963624115863928e-06, + "loss": 0.0984, + "step": 25370 + }, + { + "epoch": 0.8120820401241481, + "grad_norm": 19.125, + "learning_rate": 3.956887841023914e-06, + "loss": 0.1493, + "step": 25380 + }, + { + "epoch": 0.812402009407097, + "grad_norm": 12.125, + "learning_rate": 3.950151566183901e-06, + "loss": 0.1212, + "step": 25390 + }, + { + "epoch": 0.8127219786900458, + "grad_norm": 8.625, + "learning_rate": 3.943415291343888e-06, + "loss": 0.1503, + "step": 25400 + }, + { + "epoch": 0.8130419479729946, + "grad_norm": 11.6875, + "learning_rate": 3.936679016503874e-06, + "loss": 0.1024, + "step": 25410 + }, + { + "epoch": 0.8133619172559434, + "grad_norm": 1.2265625, + "learning_rate": 3.929942741663861e-06, + "loss": 0.0917, + "step": 25420 + }, + { + "epoch": 0.8136818865388923, + "grad_norm": 11.0625, + "learning_rate": 3.923206466823847e-06, + "loss": 0.1651, + "step": 25430 + }, + { + "epoch": 0.8140018558218411, + "grad_norm": 4.125, + "learning_rate": 3.9164701919838335e-06, + "loss": 0.0405, + "step": 25440 + }, + { + "epoch": 0.8143218251047899, + "grad_norm": 13.625, + "learning_rate": 3.9097339171438195e-06, + "loss": 0.0846, + "step": 25450 + }, + { + "epoch": 0.8146417943877388, + "grad_norm": 17.875, + "learning_rate": 3.9029976423038064e-06, + "loss": 0.107, + "step": 25460 + }, + { + "epoch": 0.8149617636706876, + "grad_norm": 19.25, + "learning_rate": 3.8962613674637925e-06, + "loss": 0.1355, + "step": 25470 + }, + { + "epoch": 0.8152817329536365, + "grad_norm": 0.95703125, + "learning_rate": 3.889525092623779e-06, + "loss": 0.0642, + "step": 25480 + }, + { + "epoch": 0.8156017022365852, + "grad_norm": 8.5, + "learning_rate": 3.882788817783765e-06, + "loss": 0.0953, + "step": 25490 + }, + { + "epoch": 0.8159216715195341, + "grad_norm": 1.765625, + "learning_rate": 3.876052542943752e-06, + "loss": 0.1238, + "step": 25500 + }, + { + "epoch": 0.816241640802483, + "grad_norm": 19.5, + "learning_rate": 3.869316268103739e-06, + "loss": 0.1087, + "step": 25510 + }, + { + "epoch": 0.8165616100854318, + "grad_norm": 14.6875, + "learning_rate": 3.862579993263725e-06, + "loss": 0.1071, + "step": 25520 + }, + { + "epoch": 0.8168815793683807, + "grad_norm": 2.390625, + "learning_rate": 3.855843718423712e-06, + "loss": 0.1191, + "step": 25530 + }, + { + "epoch": 0.8172015486513294, + "grad_norm": 27.0, + "learning_rate": 3.849107443583698e-06, + "loss": 0.1081, + "step": 25540 + }, + { + "epoch": 0.8175215179342783, + "grad_norm": 21.25, + "learning_rate": 3.842371168743685e-06, + "loss": 0.1368, + "step": 25550 + }, + { + "epoch": 0.8178414872172272, + "grad_norm": 10.6875, + "learning_rate": 3.835634893903671e-06, + "loss": 0.1391, + "step": 25560 + }, + { + "epoch": 0.818161456500176, + "grad_norm": 0.37109375, + "learning_rate": 3.828898619063658e-06, + "loss": 0.0959, + "step": 25570 + }, + { + "epoch": 0.8184814257831248, + "grad_norm": 11.75, + "learning_rate": 3.822162344223644e-06, + "loss": 0.1031, + "step": 25580 + }, + { + "epoch": 0.8188013950660736, + "grad_norm": 2.046875, + "learning_rate": 3.815426069383631e-06, + "loss": 0.1009, + "step": 25590 + }, + { + "epoch": 0.8191213643490225, + "grad_norm": 41.0, + "learning_rate": 3.8086897945436174e-06, + "loss": 0.1584, + "step": 25600 + }, + { + "epoch": 0.8194413336319714, + "grad_norm": 1.0078125, + "learning_rate": 3.8019535197036047e-06, + "loss": 0.084, + "step": 25610 + }, + { + "epoch": 0.8197613029149202, + "grad_norm": 25.25, + "learning_rate": 3.795217244863591e-06, + "loss": 0.0917, + "step": 25620 + }, + { + "epoch": 0.820081272197869, + "grad_norm": 4.6875, + "learning_rate": 3.7884809700235776e-06, + "loss": 0.076, + "step": 25630 + }, + { + "epoch": 0.8204012414808178, + "grad_norm": 18.0, + "learning_rate": 3.781744695183564e-06, + "loss": 0.1561, + "step": 25640 + }, + { + "epoch": 0.8207212107637667, + "grad_norm": 10.375, + "learning_rate": 3.7750084203435505e-06, + "loss": 0.0642, + "step": 25650 + }, + { + "epoch": 0.8210411800467156, + "grad_norm": 21.0, + "learning_rate": 3.768272145503537e-06, + "loss": 0.0947, + "step": 25660 + }, + { + "epoch": 0.8213611493296643, + "grad_norm": 8.625, + "learning_rate": 3.7615358706635235e-06, + "loss": 0.1877, + "step": 25670 + }, + { + "epoch": 0.8216811186126132, + "grad_norm": 35.5, + "learning_rate": 3.75479959582351e-06, + "loss": 0.1457, + "step": 25680 + }, + { + "epoch": 0.822001087895562, + "grad_norm": 2.0, + "learning_rate": 3.7480633209834964e-06, + "loss": 0.062, + "step": 25690 + }, + { + "epoch": 0.8223210571785109, + "grad_norm": 10.9375, + "learning_rate": 3.7413270461434833e-06, + "loss": 0.1069, + "step": 25700 + }, + { + "epoch": 0.8226410264614596, + "grad_norm": 13.5, + "learning_rate": 3.7345907713034698e-06, + "loss": 0.0682, + "step": 25710 + }, + { + "epoch": 0.8229609957444085, + "grad_norm": 17.25, + "learning_rate": 3.7278544964634562e-06, + "loss": 0.0749, + "step": 25720 + }, + { + "epoch": 0.8232809650273574, + "grad_norm": 16.0, + "learning_rate": 3.7211182216234427e-06, + "loss": 0.0674, + "step": 25730 + }, + { + "epoch": 0.8236009343103062, + "grad_norm": 1.6875, + "learning_rate": 3.714381946783429e-06, + "loss": 0.1215, + "step": 25740 + }, + { + "epoch": 0.8239209035932551, + "grad_norm": 2.328125, + "learning_rate": 3.7076456719434156e-06, + "loss": 0.13, + "step": 25750 + }, + { + "epoch": 0.8242408728762038, + "grad_norm": 1.3046875, + "learning_rate": 3.700909397103402e-06, + "loss": 0.0849, + "step": 25760 + }, + { + "epoch": 0.8245608421591527, + "grad_norm": 16.25, + "learning_rate": 3.6941731222633885e-06, + "loss": 0.0829, + "step": 25770 + }, + { + "epoch": 0.8248808114421016, + "grad_norm": 10.375, + "learning_rate": 3.687436847423375e-06, + "loss": 0.1161, + "step": 25780 + }, + { + "epoch": 0.8252007807250504, + "grad_norm": 1.4609375, + "learning_rate": 3.6807005725833615e-06, + "loss": 0.1281, + "step": 25790 + }, + { + "epoch": 0.8255207500079992, + "grad_norm": 6.90625, + "learning_rate": 3.6739642977433484e-06, + "loss": 0.1703, + "step": 25800 + }, + { + "epoch": 0.825840719290948, + "grad_norm": 19.375, + "learning_rate": 3.667228022903335e-06, + "loss": 0.1104, + "step": 25810 + }, + { + "epoch": 0.8261606885738969, + "grad_norm": 1.6484375, + "learning_rate": 3.6604917480633213e-06, + "loss": 0.1361, + "step": 25820 + }, + { + "epoch": 0.8264806578568458, + "grad_norm": 13.0, + "learning_rate": 3.6537554732233078e-06, + "loss": 0.0433, + "step": 25830 + }, + { + "epoch": 0.8268006271397946, + "grad_norm": 24.75, + "learning_rate": 3.6470191983832942e-06, + "loss": 0.087, + "step": 25840 + }, + { + "epoch": 0.8271205964227434, + "grad_norm": 3.8125, + "learning_rate": 3.6402829235432807e-06, + "loss": 0.1182, + "step": 25850 + }, + { + "epoch": 0.8274405657056922, + "grad_norm": 5.0, + "learning_rate": 3.633546648703267e-06, + "loss": 0.1062, + "step": 25860 + }, + { + "epoch": 0.8277605349886411, + "grad_norm": 12.5625, + "learning_rate": 3.6268103738632536e-06, + "loss": 0.1426, + "step": 25870 + }, + { + "epoch": 0.82808050427159, + "grad_norm": 3.625, + "learning_rate": 3.62007409902324e-06, + "loss": 0.1421, + "step": 25880 + }, + { + "epoch": 0.8284004735545387, + "grad_norm": 2.0625, + "learning_rate": 3.613337824183227e-06, + "loss": 0.0257, + "step": 25890 + }, + { + "epoch": 0.8287204428374876, + "grad_norm": 19.125, + "learning_rate": 3.6066015493432134e-06, + "loss": 0.1668, + "step": 25900 + }, + { + "epoch": 0.8290404121204364, + "grad_norm": 8.25, + "learning_rate": 3.5998652745032e-06, + "loss": 0.1159, + "step": 25910 + }, + { + "epoch": 0.8293603814033853, + "grad_norm": 2.234375, + "learning_rate": 3.5931289996631864e-06, + "loss": 0.0604, + "step": 25920 + }, + { + "epoch": 0.8296803506863342, + "grad_norm": 6.59375, + "learning_rate": 3.5863927248231732e-06, + "loss": 0.1068, + "step": 25930 + }, + { + "epoch": 0.8300003199692829, + "grad_norm": 6.9375, + "learning_rate": 3.5796564499831597e-06, + "loss": 0.1053, + "step": 25940 + }, + { + "epoch": 0.8303202892522318, + "grad_norm": 2.03125, + "learning_rate": 3.572920175143146e-06, + "loss": 0.1022, + "step": 25950 + }, + { + "epoch": 0.8306402585351806, + "grad_norm": 14.5625, + "learning_rate": 3.566183900303133e-06, + "loss": 0.04, + "step": 25960 + }, + { + "epoch": 0.8309602278181295, + "grad_norm": 20.5, + "learning_rate": 3.5594476254631195e-06, + "loss": 0.1607, + "step": 25970 + }, + { + "epoch": 0.8312801971010783, + "grad_norm": 13.1875, + "learning_rate": 3.552711350623106e-06, + "loss": 0.1777, + "step": 25980 + }, + { + "epoch": 0.8316001663840271, + "grad_norm": 26.75, + "learning_rate": 3.5459750757830925e-06, + "loss": 0.0784, + "step": 25990 + }, + { + "epoch": 0.831920135666976, + "grad_norm": 7.46875, + "learning_rate": 3.539238800943079e-06, + "loss": 0.0927, + "step": 26000 + }, + { + "epoch": 0.8322401049499248, + "grad_norm": 3.390625, + "learning_rate": 3.5325025261030654e-06, + "loss": 0.1095, + "step": 26010 + }, + { + "epoch": 0.8325600742328736, + "grad_norm": 14.8125, + "learning_rate": 3.525766251263052e-06, + "loss": 0.0787, + "step": 26020 + }, + { + "epoch": 0.8328800435158225, + "grad_norm": 20.375, + "learning_rate": 3.5190299764230383e-06, + "loss": 0.0561, + "step": 26030 + }, + { + "epoch": 0.8332000127987713, + "grad_norm": 7.625, + "learning_rate": 3.5122937015830248e-06, + "loss": 0.0864, + "step": 26040 + }, + { + "epoch": 0.8335199820817202, + "grad_norm": 27.5, + "learning_rate": 3.5055574267430117e-06, + "loss": 0.1491, + "step": 26050 + }, + { + "epoch": 0.833839951364669, + "grad_norm": 10.625, + "learning_rate": 3.498821151902998e-06, + "loss": 0.0612, + "step": 26060 + }, + { + "epoch": 0.8341599206476178, + "grad_norm": 7.40625, + "learning_rate": 3.4920848770629846e-06, + "loss": 0.1652, + "step": 26070 + }, + { + "epoch": 0.8344798899305667, + "grad_norm": 0.4375, + "learning_rate": 3.485348602222971e-06, + "loss": 0.0912, + "step": 26080 + }, + { + "epoch": 0.8347998592135155, + "grad_norm": 38.0, + "learning_rate": 3.4786123273829575e-06, + "loss": 0.0419, + "step": 26090 + }, + { + "epoch": 0.8351198284964644, + "grad_norm": 11.125, + "learning_rate": 3.471876052542944e-06, + "loss": 0.0766, + "step": 26100 + }, + { + "epoch": 0.8354397977794131, + "grad_norm": 31.25, + "learning_rate": 3.4651397777029305e-06, + "loss": 0.121, + "step": 26110 + }, + { + "epoch": 0.835759767062362, + "grad_norm": 6.5, + "learning_rate": 3.458403502862917e-06, + "loss": 0.0931, + "step": 26120 + }, + { + "epoch": 0.8360797363453109, + "grad_norm": 14.8125, + "learning_rate": 3.4516672280229034e-06, + "loss": 0.0664, + "step": 26130 + }, + { + "epoch": 0.8363997056282597, + "grad_norm": 22.25, + "learning_rate": 3.44493095318289e-06, + "loss": 0.1157, + "step": 26140 + }, + { + "epoch": 0.8367196749112086, + "grad_norm": 4.8125, + "learning_rate": 3.4381946783428767e-06, + "loss": 0.1538, + "step": 26150 + }, + { + "epoch": 0.8370396441941573, + "grad_norm": 7.15625, + "learning_rate": 3.4314584035028632e-06, + "loss": 0.1346, + "step": 26160 + }, + { + "epoch": 0.8373596134771062, + "grad_norm": 21.875, + "learning_rate": 3.4247221286628497e-06, + "loss": 0.1148, + "step": 26170 + }, + { + "epoch": 0.837679582760055, + "grad_norm": 17.25, + "learning_rate": 3.417985853822836e-06, + "loss": 0.1236, + "step": 26180 + }, + { + "epoch": 0.8379995520430039, + "grad_norm": 35.0, + "learning_rate": 3.4112495789828226e-06, + "loss": 0.1587, + "step": 26190 + }, + { + "epoch": 0.8383195213259527, + "grad_norm": 16.0, + "learning_rate": 3.404513304142809e-06, + "loss": 0.1405, + "step": 26200 + }, + { + "epoch": 0.8386394906089015, + "grad_norm": 13.5625, + "learning_rate": 3.3977770293027955e-06, + "loss": 0.1576, + "step": 26210 + }, + { + "epoch": 0.8389594598918504, + "grad_norm": 40.0, + "learning_rate": 3.391040754462782e-06, + "loss": 0.1091, + "step": 26220 + }, + { + "epoch": 0.8392794291747993, + "grad_norm": 1.2734375, + "learning_rate": 3.3843044796227685e-06, + "loss": 0.1204, + "step": 26230 + }, + { + "epoch": 0.839599398457748, + "grad_norm": 7.8125, + "learning_rate": 3.3775682047827554e-06, + "loss": 0.0883, + "step": 26240 + }, + { + "epoch": 0.8399193677406969, + "grad_norm": 16.875, + "learning_rate": 3.370831929942742e-06, + "loss": 0.0634, + "step": 26250 + }, + { + "epoch": 0.8402393370236457, + "grad_norm": 12.4375, + "learning_rate": 3.3640956551027287e-06, + "loss": 0.1545, + "step": 26260 + }, + { + "epoch": 0.8405593063065946, + "grad_norm": 5.78125, + "learning_rate": 3.357359380262715e-06, + "loss": 0.082, + "step": 26270 + }, + { + "epoch": 0.8408792755895435, + "grad_norm": 18.375, + "learning_rate": 3.3506231054227016e-06, + "loss": 0.1701, + "step": 26280 + }, + { + "epoch": 0.8411992448724922, + "grad_norm": 0.74609375, + "learning_rate": 3.343886830582688e-06, + "loss": 0.0834, + "step": 26290 + }, + { + "epoch": 0.8415192141554411, + "grad_norm": 9.25, + "learning_rate": 3.3371505557426746e-06, + "loss": 0.1042, + "step": 26300 + }, + { + "epoch": 0.8418391834383899, + "grad_norm": 10.6875, + "learning_rate": 3.3304142809026615e-06, + "loss": 0.1273, + "step": 26310 + }, + { + "epoch": 0.8421591527213388, + "grad_norm": 11.1875, + "learning_rate": 3.323678006062648e-06, + "loss": 0.0917, + "step": 26320 + }, + { + "epoch": 0.8424791220042875, + "grad_norm": 1.3125, + "learning_rate": 3.3169417312226344e-06, + "loss": 0.1099, + "step": 26330 + }, + { + "epoch": 0.8427990912872364, + "grad_norm": 7.0, + "learning_rate": 3.310205456382621e-06, + "loss": 0.0941, + "step": 26340 + }, + { + "epoch": 0.8431190605701853, + "grad_norm": 10.75, + "learning_rate": 3.3034691815426073e-06, + "loss": 0.108, + "step": 26350 + }, + { + "epoch": 0.8434390298531341, + "grad_norm": 2.359375, + "learning_rate": 3.2967329067025938e-06, + "loss": 0.0746, + "step": 26360 + }, + { + "epoch": 0.843758999136083, + "grad_norm": 20.75, + "learning_rate": 3.2899966318625802e-06, + "loss": 0.188, + "step": 26370 + }, + { + "epoch": 0.8440789684190317, + "grad_norm": 15.0, + "learning_rate": 3.2832603570225667e-06, + "loss": 0.1475, + "step": 26380 + }, + { + "epoch": 0.8443989377019806, + "grad_norm": 4.09375, + "learning_rate": 3.276524082182553e-06, + "loss": 0.1436, + "step": 26390 + }, + { + "epoch": 0.8447189069849295, + "grad_norm": 0.5078125, + "learning_rate": 3.26978780734254e-06, + "loss": 0.1557, + "step": 26400 + }, + { + "epoch": 0.8450388762678783, + "grad_norm": 8.1875, + "learning_rate": 3.2630515325025265e-06, + "loss": 0.0868, + "step": 26410 + }, + { + "epoch": 0.8453588455508271, + "grad_norm": 4.75, + "learning_rate": 3.256315257662513e-06, + "loss": 0.0824, + "step": 26420 + }, + { + "epoch": 0.8456788148337759, + "grad_norm": 3.859375, + "learning_rate": 3.2495789828224995e-06, + "loss": 0.0943, + "step": 26430 + }, + { + "epoch": 0.8459987841167248, + "grad_norm": 2.046875, + "learning_rate": 3.242842707982486e-06, + "loss": 0.0855, + "step": 26440 + }, + { + "epoch": 0.8463187533996737, + "grad_norm": 13.8125, + "learning_rate": 3.2361064331424724e-06, + "loss": 0.1283, + "step": 26450 + }, + { + "epoch": 0.8466387226826224, + "grad_norm": 1.40625, + "learning_rate": 3.229370158302459e-06, + "loss": 0.0688, + "step": 26460 + }, + { + "epoch": 0.8469586919655713, + "grad_norm": 5.625, + "learning_rate": 3.2226338834624453e-06, + "loss": 0.0882, + "step": 26470 + }, + { + "epoch": 0.8472786612485201, + "grad_norm": 12.375, + "learning_rate": 3.2158976086224318e-06, + "loss": 0.109, + "step": 26480 + }, + { + "epoch": 0.847598630531469, + "grad_norm": 12.125, + "learning_rate": 3.2091613337824182e-06, + "loss": 0.1799, + "step": 26490 + }, + { + "epoch": 0.8479185998144179, + "grad_norm": 14.9375, + "learning_rate": 3.202425058942405e-06, + "loss": 0.0885, + "step": 26500 + }, + { + "epoch": 0.8482385690973666, + "grad_norm": 6.625, + "learning_rate": 3.1956887841023916e-06, + "loss": 0.094, + "step": 26510 + }, + { + "epoch": 0.8485585383803155, + "grad_norm": 1.0859375, + "learning_rate": 3.188952509262378e-06, + "loss": 0.0746, + "step": 26520 + }, + { + "epoch": 0.8488785076632643, + "grad_norm": 0.64453125, + "learning_rate": 3.1822162344223645e-06, + "loss": 0.0866, + "step": 26530 + }, + { + "epoch": 0.8491984769462132, + "grad_norm": 8.5, + "learning_rate": 3.175479959582351e-06, + "loss": 0.1575, + "step": 26540 + }, + { + "epoch": 0.849518446229162, + "grad_norm": 5.71875, + "learning_rate": 3.1687436847423375e-06, + "loss": 0.0354, + "step": 26550 + }, + { + "epoch": 0.8498384155121108, + "grad_norm": 10.0625, + "learning_rate": 3.162007409902324e-06, + "loss": 0.1632, + "step": 26560 + }, + { + "epoch": 0.8501583847950597, + "grad_norm": 3.546875, + "learning_rate": 3.1552711350623104e-06, + "loss": 0.075, + "step": 26570 + }, + { + "epoch": 0.8504783540780085, + "grad_norm": 0.578125, + "learning_rate": 3.1485348602222977e-06, + "loss": 0.1137, + "step": 26580 + }, + { + "epoch": 0.8507983233609574, + "grad_norm": 1.2265625, + "learning_rate": 3.141798585382284e-06, + "loss": 0.0632, + "step": 26590 + }, + { + "epoch": 0.8511182926439061, + "grad_norm": 18.5, + "learning_rate": 3.1350623105422706e-06, + "loss": 0.1179, + "step": 26600 + }, + { + "epoch": 0.851438261926855, + "grad_norm": 0.765625, + "learning_rate": 3.128326035702257e-06, + "loss": 0.1367, + "step": 26610 + }, + { + "epoch": 0.8517582312098039, + "grad_norm": 23.75, + "learning_rate": 3.1215897608622436e-06, + "loss": 0.1312, + "step": 26620 + }, + { + "epoch": 0.8520782004927527, + "grad_norm": 0.380859375, + "learning_rate": 3.11485348602223e-06, + "loss": 0.1576, + "step": 26630 + }, + { + "epoch": 0.8523981697757015, + "grad_norm": 16.5, + "learning_rate": 3.1081172111822165e-06, + "loss": 0.1373, + "step": 26640 + }, + { + "epoch": 0.8527181390586503, + "grad_norm": 22.125, + "learning_rate": 3.101380936342203e-06, + "loss": 0.1213, + "step": 26650 + }, + { + "epoch": 0.8530381083415992, + "grad_norm": 0.2353515625, + "learning_rate": 3.09464466150219e-06, + "loss": 0.0798, + "step": 26660 + }, + { + "epoch": 0.8533580776245481, + "grad_norm": 29.125, + "learning_rate": 3.0879083866621763e-06, + "loss": 0.1044, + "step": 26670 + }, + { + "epoch": 0.8536780469074969, + "grad_norm": 10.1875, + "learning_rate": 3.0811721118221628e-06, + "loss": 0.1142, + "step": 26680 + }, + { + "epoch": 0.8539980161904457, + "grad_norm": 12.125, + "learning_rate": 3.0744358369821492e-06, + "loss": 0.0464, + "step": 26690 + }, + { + "epoch": 0.8543179854733945, + "grad_norm": 9.0, + "learning_rate": 3.0676995621421357e-06, + "loss": 0.0788, + "step": 26700 + }, + { + "epoch": 0.8546379547563434, + "grad_norm": 0.427734375, + "learning_rate": 3.060963287302122e-06, + "loss": 0.1067, + "step": 26710 + }, + { + "epoch": 0.8549579240392923, + "grad_norm": 1.0859375, + "learning_rate": 3.0542270124621086e-06, + "loss": 0.161, + "step": 26720 + }, + { + "epoch": 0.855277893322241, + "grad_norm": 14.6875, + "learning_rate": 3.047490737622095e-06, + "loss": 0.1488, + "step": 26730 + }, + { + "epoch": 0.8555978626051899, + "grad_norm": 16.5, + "learning_rate": 3.0407544627820816e-06, + "loss": 0.0899, + "step": 26740 + }, + { + "epoch": 0.8559178318881387, + "grad_norm": 12.0, + "learning_rate": 3.0340181879420684e-06, + "loss": 0.1283, + "step": 26750 + }, + { + "epoch": 0.8562378011710876, + "grad_norm": 13.8125, + "learning_rate": 3.027281913102055e-06, + "loss": 0.1142, + "step": 26760 + }, + { + "epoch": 0.8565577704540364, + "grad_norm": 10.75, + "learning_rate": 3.0205456382620414e-06, + "loss": 0.1071, + "step": 26770 + }, + { + "epoch": 0.8568777397369852, + "grad_norm": 15.1875, + "learning_rate": 3.013809363422028e-06, + "loss": 0.1298, + "step": 26780 + }, + { + "epoch": 0.8571977090199341, + "grad_norm": 0.408203125, + "learning_rate": 3.0070730885820143e-06, + "loss": 0.081, + "step": 26790 + }, + { + "epoch": 0.857517678302883, + "grad_norm": 4.1875, + "learning_rate": 3.0003368137420008e-06, + "loss": 0.1733, + "step": 26800 + }, + { + "epoch": 0.8578376475858318, + "grad_norm": 4.59375, + "learning_rate": 2.9936005389019872e-06, + "loss": 0.0857, + "step": 26810 + }, + { + "epoch": 0.8581576168687806, + "grad_norm": 18.125, + "learning_rate": 2.9868642640619737e-06, + "loss": 0.087, + "step": 26820 + }, + { + "epoch": 0.8584775861517294, + "grad_norm": 10.0625, + "learning_rate": 2.98012798922196e-06, + "loss": 0.143, + "step": 26830 + }, + { + "epoch": 0.8587975554346783, + "grad_norm": 12.0, + "learning_rate": 2.973391714381947e-06, + "loss": 0.1119, + "step": 26840 + }, + { + "epoch": 0.8591175247176271, + "grad_norm": 1.328125, + "learning_rate": 2.9666554395419335e-06, + "loss": 0.1347, + "step": 26850 + }, + { + "epoch": 0.8594374940005759, + "grad_norm": 15.3125, + "learning_rate": 2.95991916470192e-06, + "loss": 0.1421, + "step": 26860 + }, + { + "epoch": 0.8597574632835248, + "grad_norm": 3.984375, + "learning_rate": 2.9531828898619065e-06, + "loss": 0.0878, + "step": 26870 + }, + { + "epoch": 0.8600774325664736, + "grad_norm": 11.5625, + "learning_rate": 2.946446615021893e-06, + "loss": 0.1361, + "step": 26880 + }, + { + "epoch": 0.8603974018494225, + "grad_norm": 2.0, + "learning_rate": 2.9397103401818794e-06, + "loss": 0.0791, + "step": 26890 + }, + { + "epoch": 0.8607173711323713, + "grad_norm": 18.375, + "learning_rate": 2.9329740653418663e-06, + "loss": 0.1408, + "step": 26900 + }, + { + "epoch": 0.8610373404153201, + "grad_norm": 32.75, + "learning_rate": 2.926237790501853e-06, + "loss": 0.2028, + "step": 26910 + }, + { + "epoch": 0.861357309698269, + "grad_norm": 13.6875, + "learning_rate": 2.9195015156618396e-06, + "loss": 0.1018, + "step": 26920 + }, + { + "epoch": 0.8616772789812178, + "grad_norm": 1.3671875, + "learning_rate": 2.912765240821826e-06, + "loss": 0.1583, + "step": 26930 + }, + { + "epoch": 0.8619972482641667, + "grad_norm": 5.3125, + "learning_rate": 2.9060289659818126e-06, + "loss": 0.1422, + "step": 26940 + }, + { + "epoch": 0.8623172175471154, + "grad_norm": 10.375, + "learning_rate": 2.899292691141799e-06, + "loss": 0.1404, + "step": 26950 + }, + { + "epoch": 0.8626371868300643, + "grad_norm": 5.78125, + "learning_rate": 2.8925564163017855e-06, + "loss": 0.1187, + "step": 26960 + }, + { + "epoch": 0.8629571561130132, + "grad_norm": 10.4375, + "learning_rate": 2.885820141461772e-06, + "loss": 0.0961, + "step": 26970 + }, + { + "epoch": 0.863277125395962, + "grad_norm": 14.125, + "learning_rate": 2.8790838666217584e-06, + "loss": 0.0927, + "step": 26980 + }, + { + "epoch": 0.8635970946789108, + "grad_norm": 20.25, + "learning_rate": 2.872347591781745e-06, + "loss": 0.1489, + "step": 26990 + }, + { + "epoch": 0.8639170639618596, + "grad_norm": 11.6875, + "learning_rate": 2.8656113169417318e-06, + "loss": 0.1408, + "step": 27000 + }, + { + "epoch": 0.8642370332448085, + "grad_norm": 14.8125, + "learning_rate": 2.8588750421017182e-06, + "loss": 0.081, + "step": 27010 + }, + { + "epoch": 0.8645570025277574, + "grad_norm": 1.65625, + "learning_rate": 2.8521387672617047e-06, + "loss": 0.1263, + "step": 27020 + }, + { + "epoch": 0.8648769718107062, + "grad_norm": 16.25, + "learning_rate": 2.845402492421691e-06, + "loss": 0.1127, + "step": 27030 + }, + { + "epoch": 0.865196941093655, + "grad_norm": 16.25, + "learning_rate": 2.8386662175816776e-06, + "loss": 0.2059, + "step": 27040 + }, + { + "epoch": 0.8655169103766038, + "grad_norm": 18.875, + "learning_rate": 2.831929942741664e-06, + "loss": 0.1622, + "step": 27050 + }, + { + "epoch": 0.8658368796595527, + "grad_norm": 5.4375, + "learning_rate": 2.8251936679016506e-06, + "loss": 0.1294, + "step": 27060 + }, + { + "epoch": 0.8661568489425016, + "grad_norm": 14.0625, + "learning_rate": 2.818457393061637e-06, + "loss": 0.1177, + "step": 27070 + }, + { + "epoch": 0.8664768182254503, + "grad_norm": 9.875, + "learning_rate": 2.8117211182216235e-06, + "loss": 0.1083, + "step": 27080 + }, + { + "epoch": 0.8667967875083992, + "grad_norm": 2.015625, + "learning_rate": 2.80498484338161e-06, + "loss": 0.097, + "step": 27090 + }, + { + "epoch": 0.867116756791348, + "grad_norm": 1.015625, + "learning_rate": 2.798248568541597e-06, + "loss": 0.1104, + "step": 27100 + }, + { + "epoch": 0.8674367260742969, + "grad_norm": 0.890625, + "learning_rate": 2.7915122937015833e-06, + "loss": 0.0396, + "step": 27110 + }, + { + "epoch": 0.8677566953572458, + "grad_norm": 6.6875, + "learning_rate": 2.7847760188615698e-06, + "loss": 0.1441, + "step": 27120 + }, + { + "epoch": 0.8680766646401945, + "grad_norm": 65.0, + "learning_rate": 2.7780397440215562e-06, + "loss": 0.1234, + "step": 27130 + }, + { + "epoch": 0.8683966339231434, + "grad_norm": 5.75, + "learning_rate": 2.7713034691815427e-06, + "loss": 0.0783, + "step": 27140 + }, + { + "epoch": 0.8687166032060922, + "grad_norm": 5.78125, + "learning_rate": 2.764567194341529e-06, + "loss": 0.1072, + "step": 27150 + }, + { + "epoch": 0.8690365724890411, + "grad_norm": 10.375, + "learning_rate": 2.7578309195015156e-06, + "loss": 0.0951, + "step": 27160 + }, + { + "epoch": 0.8693565417719898, + "grad_norm": 15.6875, + "learning_rate": 2.751094644661502e-06, + "loss": 0.0661, + "step": 27170 + }, + { + "epoch": 0.8696765110549387, + "grad_norm": 1.9140625, + "learning_rate": 2.7443583698214886e-06, + "loss": 0.0462, + "step": 27180 + }, + { + "epoch": 0.8699964803378876, + "grad_norm": 8.3125, + "learning_rate": 2.7376220949814754e-06, + "loss": 0.1362, + "step": 27190 + }, + { + "epoch": 0.8703164496208364, + "grad_norm": 10.8125, + "learning_rate": 2.730885820141462e-06, + "loss": 0.1367, + "step": 27200 + }, + { + "epoch": 0.8706364189037853, + "grad_norm": 11.3125, + "learning_rate": 2.7241495453014484e-06, + "loss": 0.099, + "step": 27210 + }, + { + "epoch": 0.870956388186734, + "grad_norm": 1.34375, + "learning_rate": 2.717413270461435e-06, + "loss": 0.0747, + "step": 27220 + }, + { + "epoch": 0.8712763574696829, + "grad_norm": 6.65625, + "learning_rate": 2.7106769956214217e-06, + "loss": 0.1146, + "step": 27230 + }, + { + "epoch": 0.8715963267526318, + "grad_norm": 11.3125, + "learning_rate": 2.703940720781408e-06, + "loss": 0.1006, + "step": 27240 + }, + { + "epoch": 0.8719162960355806, + "grad_norm": 5.0625, + "learning_rate": 2.6972044459413947e-06, + "loss": 0.0773, + "step": 27250 + }, + { + "epoch": 0.8722362653185294, + "grad_norm": 10.375, + "learning_rate": 2.6904681711013815e-06, + "loss": 0.1592, + "step": 27260 + }, + { + "epoch": 0.8725562346014782, + "grad_norm": 19.375, + "learning_rate": 2.683731896261368e-06, + "loss": 0.1435, + "step": 27270 + }, + { + "epoch": 0.8728762038844271, + "grad_norm": 14.5625, + "learning_rate": 2.6769956214213545e-06, + "loss": 0.1256, + "step": 27280 + }, + { + "epoch": 0.873196173167376, + "grad_norm": 20.375, + "learning_rate": 2.670259346581341e-06, + "loss": 0.1737, + "step": 27290 + }, + { + "epoch": 0.8735161424503247, + "grad_norm": 14.5625, + "learning_rate": 2.6635230717413274e-06, + "loss": 0.1213, + "step": 27300 + }, + { + "epoch": 0.8738361117332736, + "grad_norm": 10.5, + "learning_rate": 2.656786796901314e-06, + "loss": 0.1409, + "step": 27310 + }, + { + "epoch": 0.8741560810162224, + "grad_norm": 16.75, + "learning_rate": 2.6500505220613003e-06, + "loss": 0.0825, + "step": 27320 + }, + { + "epoch": 0.8744760502991713, + "grad_norm": 3.53125, + "learning_rate": 2.643314247221287e-06, + "loss": 0.1038, + "step": 27330 + }, + { + "epoch": 0.8747960195821202, + "grad_norm": 1.9375, + "learning_rate": 2.6365779723812733e-06, + "loss": 0.0745, + "step": 27340 + }, + { + "epoch": 0.8751159888650689, + "grad_norm": 1.4921875, + "learning_rate": 2.62984169754126e-06, + "loss": 0.0833, + "step": 27350 + }, + { + "epoch": 0.8754359581480178, + "grad_norm": 12.6875, + "learning_rate": 2.6231054227012466e-06, + "loss": 0.0746, + "step": 27360 + }, + { + "epoch": 0.8757559274309666, + "grad_norm": 18.25, + "learning_rate": 2.616369147861233e-06, + "loss": 0.1527, + "step": 27370 + }, + { + "epoch": 0.8760758967139155, + "grad_norm": 1.921875, + "learning_rate": 2.6096328730212195e-06, + "loss": 0.1121, + "step": 27380 + }, + { + "epoch": 0.8763958659968643, + "grad_norm": 1.0078125, + "learning_rate": 2.602896598181206e-06, + "loss": 0.1009, + "step": 27390 + }, + { + "epoch": 0.8767158352798131, + "grad_norm": 4.9375, + "learning_rate": 2.5961603233411925e-06, + "loss": 0.0946, + "step": 27400 + }, + { + "epoch": 0.877035804562762, + "grad_norm": 1.3203125, + "learning_rate": 2.589424048501179e-06, + "loss": 0.0971, + "step": 27410 + }, + { + "epoch": 0.8773557738457108, + "grad_norm": 0.79296875, + "learning_rate": 2.5826877736611654e-06, + "loss": 0.0833, + "step": 27420 + }, + { + "epoch": 0.8776757431286597, + "grad_norm": 10.875, + "learning_rate": 2.575951498821152e-06, + "loss": 0.1689, + "step": 27430 + }, + { + "epoch": 0.8779957124116085, + "grad_norm": 14.75, + "learning_rate": 2.5692152239811383e-06, + "loss": 0.1001, + "step": 27440 + }, + { + "epoch": 0.8783156816945573, + "grad_norm": 6.125, + "learning_rate": 2.5624789491411252e-06, + "loss": 0.0838, + "step": 27450 + }, + { + "epoch": 0.8786356509775062, + "grad_norm": 22.375, + "learning_rate": 2.5557426743011117e-06, + "loss": 0.1145, + "step": 27460 + }, + { + "epoch": 0.878955620260455, + "grad_norm": 3.03125, + "learning_rate": 2.549006399461098e-06, + "loss": 0.0925, + "step": 27470 + }, + { + "epoch": 0.8792755895434038, + "grad_norm": 5.25, + "learning_rate": 2.5422701246210846e-06, + "loss": 0.1017, + "step": 27480 + }, + { + "epoch": 0.8795955588263527, + "grad_norm": 0.6171875, + "learning_rate": 2.535533849781071e-06, + "loss": 0.0581, + "step": 27490 + }, + { + "epoch": 0.8799155281093015, + "grad_norm": 26.125, + "learning_rate": 2.5287975749410575e-06, + "loss": 0.162, + "step": 27500 + }, + { + "epoch": 0.8802354973922504, + "grad_norm": 5.71875, + "learning_rate": 2.522061300101044e-06, + "loss": 0.1068, + "step": 27510 + }, + { + "epoch": 0.8805554666751991, + "grad_norm": 14.125, + "learning_rate": 2.5153250252610305e-06, + "loss": 0.145, + "step": 27520 + }, + { + "epoch": 0.880875435958148, + "grad_norm": 34.25, + "learning_rate": 2.508588750421017e-06, + "loss": 0.1379, + "step": 27530 + }, + { + "epoch": 0.8811954052410969, + "grad_norm": 6.25, + "learning_rate": 2.501852475581004e-06, + "loss": 0.0572, + "step": 27540 + }, + { + "epoch": 0.8815153745240457, + "grad_norm": 7.90625, + "learning_rate": 2.4951162007409903e-06, + "loss": 0.1623, + "step": 27550 + }, + { + "epoch": 0.8818353438069946, + "grad_norm": 9.5625, + "learning_rate": 2.4883799259009768e-06, + "loss": 0.1115, + "step": 27560 + }, + { + "epoch": 0.8821553130899433, + "grad_norm": 13.375, + "learning_rate": 2.4816436510609636e-06, + "loss": 0.1245, + "step": 27570 + }, + { + "epoch": 0.8824752823728922, + "grad_norm": 20.625, + "learning_rate": 2.47490737622095e-06, + "loss": 0.141, + "step": 27580 + }, + { + "epoch": 0.882795251655841, + "grad_norm": 10.6875, + "learning_rate": 2.4681711013809366e-06, + "loss": 0.077, + "step": 27590 + }, + { + "epoch": 0.8831152209387899, + "grad_norm": 13.375, + "learning_rate": 2.461434826540923e-06, + "loss": 0.0968, + "step": 27600 + }, + { + "epoch": 0.8834351902217387, + "grad_norm": 23.25, + "learning_rate": 2.4546985517009095e-06, + "loss": 0.0824, + "step": 27610 + }, + { + "epoch": 0.8837551595046875, + "grad_norm": 11.5, + "learning_rate": 2.447962276860896e-06, + "loss": 0.146, + "step": 27620 + }, + { + "epoch": 0.8840751287876364, + "grad_norm": 9.0625, + "learning_rate": 2.441226002020883e-06, + "loss": 0.0869, + "step": 27630 + }, + { + "epoch": 0.8843950980705853, + "grad_norm": 5.4375, + "learning_rate": 2.4344897271808693e-06, + "loss": 0.1064, + "step": 27640 + }, + { + "epoch": 0.8847150673535341, + "grad_norm": 16.5, + "learning_rate": 2.427753452340856e-06, + "loss": 0.1236, + "step": 27650 + }, + { + "epoch": 0.8850350366364829, + "grad_norm": 15.1875, + "learning_rate": 2.4210171775008423e-06, + "loss": 0.0924, + "step": 27660 + }, + { + "epoch": 0.8853550059194317, + "grad_norm": 1.5, + "learning_rate": 2.4142809026608287e-06, + "loss": 0.137, + "step": 27670 + }, + { + "epoch": 0.8856749752023806, + "grad_norm": 11.25, + "learning_rate": 2.407544627820815e-06, + "loss": 0.1282, + "step": 27680 + }, + { + "epoch": 0.8859949444853294, + "grad_norm": 7.375, + "learning_rate": 2.4008083529808017e-06, + "loss": 0.1409, + "step": 27690 + }, + { + "epoch": 0.8863149137682782, + "grad_norm": 0.9609375, + "learning_rate": 2.3940720781407885e-06, + "loss": 0.1107, + "step": 27700 + }, + { + "epoch": 0.8866348830512271, + "grad_norm": 4.625, + "learning_rate": 2.387335803300775e-06, + "loss": 0.1221, + "step": 27710 + }, + { + "epoch": 0.8869548523341759, + "grad_norm": 5.125, + "learning_rate": 2.3805995284607615e-06, + "loss": 0.0751, + "step": 27720 + }, + { + "epoch": 0.8872748216171248, + "grad_norm": 0.5859375, + "learning_rate": 2.373863253620748e-06, + "loss": 0.1011, + "step": 27730 + }, + { + "epoch": 0.8875947909000735, + "grad_norm": 16.125, + "learning_rate": 2.3671269787807344e-06, + "loss": 0.137, + "step": 27740 + }, + { + "epoch": 0.8879147601830224, + "grad_norm": 22.125, + "learning_rate": 2.360390703940721e-06, + "loss": 0.1568, + "step": 27750 + }, + { + "epoch": 0.8882347294659713, + "grad_norm": 4.53125, + "learning_rate": 2.3536544291007073e-06, + "loss": 0.0755, + "step": 27760 + }, + { + "epoch": 0.8885546987489201, + "grad_norm": 10.375, + "learning_rate": 2.346918154260694e-06, + "loss": 0.1248, + "step": 27770 + }, + { + "epoch": 0.888874668031869, + "grad_norm": 5.875, + "learning_rate": 2.3401818794206803e-06, + "loss": 0.0736, + "step": 27780 + }, + { + "epoch": 0.8891946373148177, + "grad_norm": 16.125, + "learning_rate": 2.333445604580667e-06, + "loss": 0.121, + "step": 27790 + }, + { + "epoch": 0.8895146065977666, + "grad_norm": 16.125, + "learning_rate": 2.3267093297406536e-06, + "loss": 0.208, + "step": 27800 + }, + { + "epoch": 0.8898345758807155, + "grad_norm": 0.859375, + "learning_rate": 2.31997305490064e-06, + "loss": 0.0572, + "step": 27810 + }, + { + "epoch": 0.8901545451636643, + "grad_norm": 5.53125, + "learning_rate": 2.3132367800606265e-06, + "loss": 0.1395, + "step": 27820 + }, + { + "epoch": 0.8904745144466131, + "grad_norm": 15.6875, + "learning_rate": 2.3065005052206134e-06, + "loss": 0.1638, + "step": 27830 + }, + { + "epoch": 0.8907944837295619, + "grad_norm": 5.71875, + "learning_rate": 2.2997642303806e-06, + "loss": 0.1019, + "step": 27840 + }, + { + "epoch": 0.8911144530125108, + "grad_norm": 9.75, + "learning_rate": 2.2930279555405864e-06, + "loss": 0.1173, + "step": 27850 + }, + { + "epoch": 0.8914344222954597, + "grad_norm": 11.75, + "learning_rate": 2.286291680700573e-06, + "loss": 0.0744, + "step": 27860 + }, + { + "epoch": 0.8917543915784085, + "grad_norm": 9.1875, + "learning_rate": 2.2795554058605593e-06, + "loss": 0.1343, + "step": 27870 + }, + { + "epoch": 0.8920743608613573, + "grad_norm": 5.28125, + "learning_rate": 2.2728191310205458e-06, + "loss": 0.1461, + "step": 27880 + }, + { + "epoch": 0.8923943301443061, + "grad_norm": 7.53125, + "learning_rate": 2.2660828561805322e-06, + "loss": 0.0907, + "step": 27890 + }, + { + "epoch": 0.892714299427255, + "grad_norm": 2.359375, + "learning_rate": 2.2593465813405187e-06, + "loss": 0.1232, + "step": 27900 + }, + { + "epoch": 0.8930342687102039, + "grad_norm": 20.25, + "learning_rate": 2.252610306500505e-06, + "loss": 0.1363, + "step": 27910 + }, + { + "epoch": 0.8933542379931526, + "grad_norm": 2.40625, + "learning_rate": 2.245874031660492e-06, + "loss": 0.154, + "step": 27920 + }, + { + "epoch": 0.8936742072761015, + "grad_norm": 4.8125, + "learning_rate": 2.2391377568204785e-06, + "loss": 0.1451, + "step": 27930 + }, + { + "epoch": 0.8939941765590503, + "grad_norm": 40.25, + "learning_rate": 2.232401481980465e-06, + "loss": 0.2074, + "step": 27940 + }, + { + "epoch": 0.8943141458419992, + "grad_norm": 9.6875, + "learning_rate": 2.225665207140452e-06, + "loss": 0.1017, + "step": 27950 + }, + { + "epoch": 0.8946341151249481, + "grad_norm": 18.0, + "learning_rate": 2.2189289323004383e-06, + "loss": 0.1106, + "step": 27960 + }, + { + "epoch": 0.8949540844078968, + "grad_norm": 0.6015625, + "learning_rate": 2.2121926574604248e-06, + "loss": 0.0851, + "step": 27970 + }, + { + "epoch": 0.8952740536908457, + "grad_norm": 1.3125, + "learning_rate": 2.2054563826204112e-06, + "loss": 0.112, + "step": 27980 + }, + { + "epoch": 0.8955940229737945, + "grad_norm": 6.78125, + "learning_rate": 2.1987201077803977e-06, + "loss": 0.0813, + "step": 27990 + }, + { + "epoch": 0.8959139922567434, + "grad_norm": 23.875, + "learning_rate": 2.191983832940384e-06, + "loss": 0.1408, + "step": 28000 + }, + { + "epoch": 0.8962339615396921, + "grad_norm": 14.625, + "learning_rate": 2.1852475581003706e-06, + "loss": 0.1109, + "step": 28010 + }, + { + "epoch": 0.896553930822641, + "grad_norm": 1.53125, + "learning_rate": 2.178511283260357e-06, + "loss": 0.0799, + "step": 28020 + }, + { + "epoch": 0.8968739001055899, + "grad_norm": 6.21875, + "learning_rate": 2.1717750084203436e-06, + "loss": 0.1101, + "step": 28030 + }, + { + "epoch": 0.8971938693885387, + "grad_norm": 13.4375, + "learning_rate": 2.16503873358033e-06, + "loss": 0.1114, + "step": 28040 + }, + { + "epoch": 0.8975138386714875, + "grad_norm": 19.5, + "learning_rate": 2.158302458740317e-06, + "loss": 0.0906, + "step": 28050 + }, + { + "epoch": 0.8978338079544363, + "grad_norm": 4.53125, + "learning_rate": 2.1515661839003034e-06, + "loss": 0.1327, + "step": 28060 + }, + { + "epoch": 0.8981537772373852, + "grad_norm": 23.625, + "learning_rate": 2.14482990906029e-06, + "loss": 0.1127, + "step": 28070 + }, + { + "epoch": 0.8984737465203341, + "grad_norm": 14.3125, + "learning_rate": 2.1380936342202763e-06, + "loss": 0.0846, + "step": 28080 + }, + { + "epoch": 0.8987937158032829, + "grad_norm": 13.9375, + "learning_rate": 2.1313573593802628e-06, + "loss": 0.0603, + "step": 28090 + }, + { + "epoch": 0.8991136850862317, + "grad_norm": 6.65625, + "learning_rate": 2.1246210845402493e-06, + "loss": 0.1015, + "step": 28100 + }, + { + "epoch": 0.8994336543691805, + "grad_norm": 32.25, + "learning_rate": 2.117884809700236e-06, + "loss": 0.0892, + "step": 28110 + }, + { + "epoch": 0.8997536236521294, + "grad_norm": 6.5625, + "learning_rate": 2.1111485348602226e-06, + "loss": 0.0981, + "step": 28120 + }, + { + "epoch": 0.9000735929350783, + "grad_norm": 2.609375, + "learning_rate": 2.104412260020209e-06, + "loss": 0.1675, + "step": 28130 + }, + { + "epoch": 0.900393562218027, + "grad_norm": 10.0, + "learning_rate": 2.0976759851801955e-06, + "loss": 0.1065, + "step": 28140 + }, + { + "epoch": 0.9007135315009759, + "grad_norm": 12.625, + "learning_rate": 2.090939710340182e-06, + "loss": 0.1469, + "step": 28150 + }, + { + "epoch": 0.9010335007839247, + "grad_norm": 15.6875, + "learning_rate": 2.0842034355001685e-06, + "loss": 0.0984, + "step": 28160 + }, + { + "epoch": 0.9013534700668736, + "grad_norm": 10.1875, + "learning_rate": 2.077467160660155e-06, + "loss": 0.0784, + "step": 28170 + }, + { + "epoch": 0.9016734393498225, + "grad_norm": 3.90625, + "learning_rate": 2.070730885820142e-06, + "loss": 0.1035, + "step": 28180 + }, + { + "epoch": 0.9019934086327712, + "grad_norm": 20.125, + "learning_rate": 2.0639946109801283e-06, + "loss": 0.1184, + "step": 28190 + }, + { + "epoch": 0.9023133779157201, + "grad_norm": 2.515625, + "learning_rate": 2.0572583361401147e-06, + "loss": 0.1011, + "step": 28200 + }, + { + "epoch": 0.902633347198669, + "grad_norm": 9.125, + "learning_rate": 2.050522061300101e-06, + "loss": 0.0984, + "step": 28210 + }, + { + "epoch": 0.9029533164816178, + "grad_norm": 1.859375, + "learning_rate": 2.0437857864600877e-06, + "loss": 0.1193, + "step": 28220 + }, + { + "epoch": 0.9032732857645666, + "grad_norm": 9.8125, + "learning_rate": 2.037049511620074e-06, + "loss": 0.1004, + "step": 28230 + }, + { + "epoch": 0.9035932550475154, + "grad_norm": 19.0, + "learning_rate": 2.0303132367800606e-06, + "loss": 0.1256, + "step": 28240 + }, + { + "epoch": 0.9039132243304643, + "grad_norm": 0.546875, + "learning_rate": 2.023576961940047e-06, + "loss": 0.1225, + "step": 28250 + }, + { + "epoch": 0.9042331936134131, + "grad_norm": 8.875, + "learning_rate": 2.0168406871000335e-06, + "loss": 0.0965, + "step": 28260 + }, + { + "epoch": 0.9045531628963619, + "grad_norm": 5.59375, + "learning_rate": 2.0101044122600204e-06, + "loss": 0.0995, + "step": 28270 + }, + { + "epoch": 0.9048731321793108, + "grad_norm": 6.40625, + "learning_rate": 2.003368137420007e-06, + "loss": 0.0994, + "step": 28280 + }, + { + "epoch": 0.9051931014622596, + "grad_norm": 8.6875, + "learning_rate": 1.9966318625799934e-06, + "loss": 0.1405, + "step": 28290 + }, + { + "epoch": 0.9055130707452085, + "grad_norm": 1.390625, + "learning_rate": 1.9898955877399802e-06, + "loss": 0.1157, + "step": 28300 + }, + { + "epoch": 0.9058330400281573, + "grad_norm": 123.0, + "learning_rate": 1.9831593128999667e-06, + "loss": 0.1767, + "step": 28310 + }, + { + "epoch": 0.9061530093111061, + "grad_norm": 8.75, + "learning_rate": 1.976423038059953e-06, + "loss": 0.1001, + "step": 28320 + }, + { + "epoch": 0.906472978594055, + "grad_norm": 9.8125, + "learning_rate": 1.9696867632199396e-06, + "loss": 0.1721, + "step": 28330 + }, + { + "epoch": 0.9067929478770038, + "grad_norm": 3.625, + "learning_rate": 1.962950488379926e-06, + "loss": 0.0725, + "step": 28340 + }, + { + "epoch": 0.9071129171599527, + "grad_norm": 1.2578125, + "learning_rate": 1.9562142135399126e-06, + "loss": 0.1292, + "step": 28350 + }, + { + "epoch": 0.9074328864429014, + "grad_norm": 6.71875, + "learning_rate": 1.949477938699899e-06, + "loss": 0.0912, + "step": 28360 + }, + { + "epoch": 0.9077528557258503, + "grad_norm": 8.0625, + "learning_rate": 1.9427416638598855e-06, + "loss": 0.1139, + "step": 28370 + }, + { + "epoch": 0.9080728250087992, + "grad_norm": 11.0625, + "learning_rate": 1.936005389019872e-06, + "loss": 0.1651, + "step": 28380 + }, + { + "epoch": 0.908392794291748, + "grad_norm": 12.4375, + "learning_rate": 1.9292691141798584e-06, + "loss": 0.1372, + "step": 28390 + }, + { + "epoch": 0.9087127635746969, + "grad_norm": 27.875, + "learning_rate": 1.9225328393398453e-06, + "loss": 0.1306, + "step": 28400 + }, + { + "epoch": 0.9090327328576456, + "grad_norm": 0.99609375, + "learning_rate": 1.9157965644998318e-06, + "loss": 0.0986, + "step": 28410 + }, + { + "epoch": 0.9093527021405945, + "grad_norm": 23.25, + "learning_rate": 1.9090602896598182e-06, + "loss": 0.1372, + "step": 28420 + }, + { + "epoch": 0.9096726714235434, + "grad_norm": 8.4375, + "learning_rate": 1.9023240148198047e-06, + "loss": 0.1003, + "step": 28430 + }, + { + "epoch": 0.9099926407064922, + "grad_norm": 4.9375, + "learning_rate": 1.8955877399797914e-06, + "loss": 0.0856, + "step": 28440 + }, + { + "epoch": 0.910312609989441, + "grad_norm": 2.15625, + "learning_rate": 1.888851465139778e-06, + "loss": 0.1232, + "step": 28450 + }, + { + "epoch": 0.9106325792723898, + "grad_norm": 4.90625, + "learning_rate": 1.8821151902997645e-06, + "loss": 0.1138, + "step": 28460 + }, + { + "epoch": 0.9109525485553387, + "grad_norm": 0.7421875, + "learning_rate": 1.875378915459751e-06, + "loss": 0.11, + "step": 28470 + }, + { + "epoch": 0.9112725178382876, + "grad_norm": 9.75, + "learning_rate": 1.8686426406197375e-06, + "loss": 0.0547, + "step": 28480 + }, + { + "epoch": 0.9115924871212364, + "grad_norm": 7.78125, + "learning_rate": 1.861906365779724e-06, + "loss": 0.1488, + "step": 28490 + }, + { + "epoch": 0.9119124564041852, + "grad_norm": 12.625, + "learning_rate": 1.8551700909397106e-06, + "loss": 0.1034, + "step": 28500 + }, + { + "epoch": 0.912232425687134, + "grad_norm": 24.625, + "learning_rate": 1.848433816099697e-06, + "loss": 0.0859, + "step": 28510 + }, + { + "epoch": 0.9125523949700829, + "grad_norm": 2.703125, + "learning_rate": 1.8416975412596835e-06, + "loss": 0.1189, + "step": 28520 + }, + { + "epoch": 0.9128723642530318, + "grad_norm": 1.609375, + "learning_rate": 1.83496126641967e-06, + "loss": 0.0758, + "step": 28530 + }, + { + "epoch": 0.9131923335359805, + "grad_norm": 13.125, + "learning_rate": 1.8282249915796565e-06, + "loss": 0.0856, + "step": 28540 + }, + { + "epoch": 0.9135123028189294, + "grad_norm": 6.3125, + "learning_rate": 1.8214887167396431e-06, + "loss": 0.1443, + "step": 28550 + }, + { + "epoch": 0.9138322721018782, + "grad_norm": 11.6875, + "learning_rate": 1.8147524418996296e-06, + "loss": 0.1287, + "step": 28560 + }, + { + "epoch": 0.9141522413848271, + "grad_norm": 4.4375, + "learning_rate": 1.808016167059616e-06, + "loss": 0.1398, + "step": 28570 + }, + { + "epoch": 0.9144722106677758, + "grad_norm": 20.625, + "learning_rate": 1.8012798922196025e-06, + "loss": 0.1299, + "step": 28580 + }, + { + "epoch": 0.9147921799507247, + "grad_norm": 22.125, + "learning_rate": 1.794543617379589e-06, + "loss": 0.1137, + "step": 28590 + }, + { + "epoch": 0.9151121492336736, + "grad_norm": 6.75, + "learning_rate": 1.7878073425395759e-06, + "loss": 0.1472, + "step": 28600 + }, + { + "epoch": 0.9154321185166224, + "grad_norm": 9.25, + "learning_rate": 1.7810710676995623e-06, + "loss": 0.1124, + "step": 28610 + }, + { + "epoch": 0.9157520877995713, + "grad_norm": 17.75, + "learning_rate": 1.7743347928595488e-06, + "loss": 0.1162, + "step": 28620 + }, + { + "epoch": 0.91607205708252, + "grad_norm": 0.32421875, + "learning_rate": 1.7675985180195355e-06, + "loss": 0.0355, + "step": 28630 + }, + { + "epoch": 0.9163920263654689, + "grad_norm": 40.0, + "learning_rate": 1.760862243179522e-06, + "loss": 0.1198, + "step": 28640 + }, + { + "epoch": 0.9167119956484178, + "grad_norm": 0.54296875, + "learning_rate": 1.7541259683395084e-06, + "loss": 0.0996, + "step": 28650 + }, + { + "epoch": 0.9170319649313666, + "grad_norm": 6.71875, + "learning_rate": 1.7473896934994949e-06, + "loss": 0.0999, + "step": 28660 + }, + { + "epoch": 0.9173519342143154, + "grad_norm": 4.625, + "learning_rate": 1.7406534186594813e-06, + "loss": 0.1017, + "step": 28670 + }, + { + "epoch": 0.9176719034972642, + "grad_norm": 3.1875, + "learning_rate": 1.733917143819468e-06, + "loss": 0.0565, + "step": 28680 + }, + { + "epoch": 0.9179918727802131, + "grad_norm": 1.53125, + "learning_rate": 1.7271808689794545e-06, + "loss": 0.0843, + "step": 28690 + }, + { + "epoch": 0.918311842063162, + "grad_norm": 13.4375, + "learning_rate": 1.720444594139441e-06, + "loss": 0.138, + "step": 28700 + }, + { + "epoch": 0.9186318113461108, + "grad_norm": 4.75, + "learning_rate": 1.7137083192994274e-06, + "loss": 0.0942, + "step": 28710 + }, + { + "epoch": 0.9189517806290596, + "grad_norm": 6.25, + "learning_rate": 1.706972044459414e-06, + "loss": 0.1002, + "step": 28720 + }, + { + "epoch": 0.9192717499120084, + "grad_norm": 1.2109375, + "learning_rate": 1.7002357696194006e-06, + "loss": 0.0911, + "step": 28730 + }, + { + "epoch": 0.9195917191949573, + "grad_norm": 24.5, + "learning_rate": 1.693499494779387e-06, + "loss": 0.1303, + "step": 28740 + }, + { + "epoch": 0.9199116884779062, + "grad_norm": 9.1875, + "learning_rate": 1.6867632199393735e-06, + "loss": 0.1017, + "step": 28750 + }, + { + "epoch": 0.9202316577608549, + "grad_norm": 2.5625, + "learning_rate": 1.6800269450993604e-06, + "loss": 0.0624, + "step": 28760 + }, + { + "epoch": 0.9205516270438038, + "grad_norm": 8.8125, + "learning_rate": 1.6732906702593468e-06, + "loss": 0.1414, + "step": 28770 + }, + { + "epoch": 0.9208715963267526, + "grad_norm": 31.5, + "learning_rate": 1.6665543954193333e-06, + "loss": 0.087, + "step": 28780 + }, + { + "epoch": 0.9211915656097015, + "grad_norm": 13.25, + "learning_rate": 1.6598181205793198e-06, + "loss": 0.125, + "step": 28790 + }, + { + "epoch": 0.9215115348926503, + "grad_norm": 23.625, + "learning_rate": 1.6530818457393064e-06, + "loss": 0.1184, + "step": 28800 + }, + { + "epoch": 0.9218315041755991, + "grad_norm": 8.75, + "learning_rate": 1.646345570899293e-06, + "loss": 0.0559, + "step": 28810 + }, + { + "epoch": 0.922151473458548, + "grad_norm": 6.78125, + "learning_rate": 1.6396092960592794e-06, + "loss": 0.0866, + "step": 28820 + }, + { + "epoch": 0.9224714427414968, + "grad_norm": 31.875, + "learning_rate": 1.6328730212192658e-06, + "loss": 0.1563, + "step": 28830 + }, + { + "epoch": 0.9227914120244457, + "grad_norm": 8.9375, + "learning_rate": 1.6261367463792523e-06, + "loss": 0.0692, + "step": 28840 + }, + { + "epoch": 0.9231113813073945, + "grad_norm": 20.625, + "learning_rate": 1.619400471539239e-06, + "loss": 0.1056, + "step": 28850 + }, + { + "epoch": 0.9234313505903433, + "grad_norm": 2.46875, + "learning_rate": 1.6126641966992254e-06, + "loss": 0.0678, + "step": 28860 + }, + { + "epoch": 0.9237513198732922, + "grad_norm": 14.4375, + "learning_rate": 1.605927921859212e-06, + "loss": 0.1029, + "step": 28870 + }, + { + "epoch": 0.924071289156241, + "grad_norm": 12.875, + "learning_rate": 1.5991916470191984e-06, + "loss": 0.0639, + "step": 28880 + }, + { + "epoch": 0.9243912584391898, + "grad_norm": 10.0625, + "learning_rate": 1.5924553721791848e-06, + "loss": 0.0996, + "step": 28890 + }, + { + "epoch": 0.9247112277221387, + "grad_norm": 21.75, + "learning_rate": 1.5857190973391715e-06, + "loss": 0.1423, + "step": 28900 + }, + { + "epoch": 0.9250311970050875, + "grad_norm": 0.5703125, + "learning_rate": 1.578982822499158e-06, + "loss": 0.0792, + "step": 28910 + }, + { + "epoch": 0.9253511662880364, + "grad_norm": 6.75, + "learning_rate": 1.5722465476591447e-06, + "loss": 0.0905, + "step": 28920 + }, + { + "epoch": 0.9256711355709852, + "grad_norm": 12.1875, + "learning_rate": 1.5655102728191313e-06, + "loss": 0.0832, + "step": 28930 + }, + { + "epoch": 0.925991104853934, + "grad_norm": 11.875, + "learning_rate": 1.5587739979791178e-06, + "loss": 0.0703, + "step": 28940 + }, + { + "epoch": 0.9263110741368829, + "grad_norm": 59.25, + "learning_rate": 1.5520377231391043e-06, + "loss": 0.1043, + "step": 28950 + }, + { + "epoch": 0.9266310434198317, + "grad_norm": 4.46875, + "learning_rate": 1.5453014482990907e-06, + "loss": 0.0946, + "step": 28960 + }, + { + "epoch": 0.9269510127027806, + "grad_norm": 0.68359375, + "learning_rate": 1.5385651734590772e-06, + "loss": 0.0877, + "step": 28970 + }, + { + "epoch": 0.9272709819857293, + "grad_norm": 5.78125, + "learning_rate": 1.5318288986190639e-06, + "loss": 0.159, + "step": 28980 + }, + { + "epoch": 0.9275909512686782, + "grad_norm": 9.9375, + "learning_rate": 1.5250926237790503e-06, + "loss": 0.0843, + "step": 28990 + }, + { + "epoch": 0.927910920551627, + "grad_norm": 9.6875, + "learning_rate": 1.5183563489390368e-06, + "loss": 0.1604, + "step": 29000 + }, + { + "epoch": 0.9282308898345759, + "grad_norm": 13.3125, + "learning_rate": 1.5116200740990233e-06, + "loss": 0.0657, + "step": 29010 + }, + { + "epoch": 0.9285508591175247, + "grad_norm": 8.0, + "learning_rate": 1.50488379925901e-06, + "loss": 0.108, + "step": 29020 + }, + { + "epoch": 0.9288708284004735, + "grad_norm": 16.875, + "learning_rate": 1.4981475244189964e-06, + "loss": 0.1829, + "step": 29030 + }, + { + "epoch": 0.9291907976834224, + "grad_norm": 6.4375, + "learning_rate": 1.4914112495789829e-06, + "loss": 0.0916, + "step": 29040 + }, + { + "epoch": 0.9295107669663712, + "grad_norm": 14.625, + "learning_rate": 1.4846749747389693e-06, + "loss": 0.115, + "step": 29050 + }, + { + "epoch": 0.9298307362493201, + "grad_norm": 1.171875, + "learning_rate": 1.4779386998989558e-06, + "loss": 0.1315, + "step": 29060 + }, + { + "epoch": 0.9301507055322689, + "grad_norm": 20.625, + "learning_rate": 1.4712024250589425e-06, + "loss": 0.1696, + "step": 29070 + }, + { + "epoch": 0.9304706748152177, + "grad_norm": 63.5, + "learning_rate": 1.4644661502189292e-06, + "loss": 0.172, + "step": 29080 + }, + { + "epoch": 0.9307906440981666, + "grad_norm": 26.125, + "learning_rate": 1.4577298753789156e-06, + "loss": 0.1579, + "step": 29090 + }, + { + "epoch": 0.9311106133811154, + "grad_norm": 25.375, + "learning_rate": 1.4509936005389023e-06, + "loss": 0.1369, + "step": 29100 + }, + { + "epoch": 0.9314305826640642, + "grad_norm": 3.21875, + "learning_rate": 1.4442573256988888e-06, + "loss": 0.1342, + "step": 29110 + }, + { + "epoch": 0.9317505519470131, + "grad_norm": 11.125, + "learning_rate": 1.4375210508588752e-06, + "loss": 0.0699, + "step": 29120 + }, + { + "epoch": 0.9320705212299619, + "grad_norm": 5.59375, + "learning_rate": 1.4307847760188617e-06, + "loss": 0.1644, + "step": 29130 + }, + { + "epoch": 0.9323904905129108, + "grad_norm": 17.375, + "learning_rate": 1.4240485011788482e-06, + "loss": 0.1777, + "step": 29140 + }, + { + "epoch": 0.9327104597958596, + "grad_norm": 25.875, + "learning_rate": 1.4173122263388348e-06, + "loss": 0.1518, + "step": 29150 + }, + { + "epoch": 0.9330304290788084, + "grad_norm": 12.1875, + "learning_rate": 1.4105759514988213e-06, + "loss": 0.1291, + "step": 29160 + }, + { + "epoch": 0.9333503983617573, + "grad_norm": 1.9375, + "learning_rate": 1.4038396766588078e-06, + "loss": 0.0807, + "step": 29170 + }, + { + "epoch": 0.9336703676447061, + "grad_norm": 8.4375, + "learning_rate": 1.3971034018187942e-06, + "loss": 0.0914, + "step": 29180 + }, + { + "epoch": 0.933990336927655, + "grad_norm": 7.5625, + "learning_rate": 1.3903671269787807e-06, + "loss": 0.1007, + "step": 29190 + }, + { + "epoch": 0.9343103062106037, + "grad_norm": 7.78125, + "learning_rate": 1.3836308521387674e-06, + "loss": 0.1125, + "step": 29200 + }, + { + "epoch": 0.9346302754935526, + "grad_norm": 22.875, + "learning_rate": 1.3768945772987538e-06, + "loss": 0.0648, + "step": 29210 + }, + { + "epoch": 0.9349502447765015, + "grad_norm": 5.46875, + "learning_rate": 1.3701583024587403e-06, + "loss": 0.0528, + "step": 29220 + }, + { + "epoch": 0.9352702140594503, + "grad_norm": 10.875, + "learning_rate": 1.3634220276187268e-06, + "loss": 0.1367, + "step": 29230 + }, + { + "epoch": 0.9355901833423992, + "grad_norm": 4.96875, + "learning_rate": 1.3566857527787137e-06, + "loss": 0.0562, + "step": 29240 + }, + { + "epoch": 0.9359101526253479, + "grad_norm": 4.28125, + "learning_rate": 1.3499494779387001e-06, + "loss": 0.0813, + "step": 29250 + }, + { + "epoch": 0.9362301219082968, + "grad_norm": 1.015625, + "learning_rate": 1.3432132030986866e-06, + "loss": 0.1542, + "step": 29260 + }, + { + "epoch": 0.9365500911912457, + "grad_norm": 7.6875, + "learning_rate": 1.336476928258673e-06, + "loss": 0.1144, + "step": 29270 + }, + { + "epoch": 0.9368700604741945, + "grad_norm": 25.0, + "learning_rate": 1.3297406534186597e-06, + "loss": 0.1216, + "step": 29280 + }, + { + "epoch": 0.9371900297571433, + "grad_norm": 12.0625, + "learning_rate": 1.3230043785786462e-06, + "loss": 0.0869, + "step": 29290 + }, + { + "epoch": 0.9375099990400921, + "grad_norm": 12.625, + "learning_rate": 1.3162681037386327e-06, + "loss": 0.1606, + "step": 29300 + }, + { + "epoch": 0.937829968323041, + "grad_norm": 6.09375, + "learning_rate": 1.3095318288986191e-06, + "loss": 0.1795, + "step": 29310 + }, + { + "epoch": 0.9381499376059899, + "grad_norm": 2.171875, + "learning_rate": 1.3027955540586058e-06, + "loss": 0.0819, + "step": 29320 + }, + { + "epoch": 0.9384699068889386, + "grad_norm": 22.375, + "learning_rate": 1.2960592792185923e-06, + "loss": 0.1286, + "step": 29330 + }, + { + "epoch": 0.9387898761718875, + "grad_norm": 17.875, + "learning_rate": 1.2893230043785787e-06, + "loss": 0.1253, + "step": 29340 + }, + { + "epoch": 0.9391098454548363, + "grad_norm": 0.8125, + "learning_rate": 1.2825867295385652e-06, + "loss": 0.047, + "step": 29350 + }, + { + "epoch": 0.9394298147377852, + "grad_norm": 19.375, + "learning_rate": 1.2758504546985517e-06, + "loss": 0.0671, + "step": 29360 + }, + { + "epoch": 0.9397497840207341, + "grad_norm": 18.125, + "learning_rate": 1.2691141798585383e-06, + "loss": 0.1547, + "step": 29370 + }, + { + "epoch": 0.9400697533036828, + "grad_norm": 12.0, + "learning_rate": 1.2623779050185248e-06, + "loss": 0.0987, + "step": 29380 + }, + { + "epoch": 0.9403897225866317, + "grad_norm": 7.03125, + "learning_rate": 1.2556416301785113e-06, + "loss": 0.1345, + "step": 29390 + }, + { + "epoch": 0.9407096918695805, + "grad_norm": 8.5, + "learning_rate": 1.248905355338498e-06, + "loss": 0.0626, + "step": 29400 + }, + { + "epoch": 0.9410296611525294, + "grad_norm": 14.3125, + "learning_rate": 1.2421690804984844e-06, + "loss": 0.0972, + "step": 29410 + }, + { + "epoch": 0.9413496304354781, + "grad_norm": 16.875, + "learning_rate": 1.2354328056584709e-06, + "loss": 0.1044, + "step": 29420 + }, + { + "epoch": 0.941669599718427, + "grad_norm": 9.3125, + "learning_rate": 1.2286965308184573e-06, + "loss": 0.1222, + "step": 29430 + }, + { + "epoch": 0.9419895690013759, + "grad_norm": 5.28125, + "learning_rate": 1.221960255978444e-06, + "loss": 0.1749, + "step": 29440 + }, + { + "epoch": 0.9423095382843247, + "grad_norm": 18.875, + "learning_rate": 1.2152239811384307e-06, + "loss": 0.1283, + "step": 29450 + }, + { + "epoch": 0.9426295075672736, + "grad_norm": 10.1875, + "learning_rate": 1.2084877062984172e-06, + "loss": 0.0836, + "step": 29460 + }, + { + "epoch": 0.9429494768502223, + "grad_norm": 5.03125, + "learning_rate": 1.2017514314584036e-06, + "loss": 0.1439, + "step": 29470 + }, + { + "epoch": 0.9432694461331712, + "grad_norm": 2.65625, + "learning_rate": 1.19501515661839e-06, + "loss": 0.0865, + "step": 29480 + }, + { + "epoch": 0.9435894154161201, + "grad_norm": 15.1875, + "learning_rate": 1.1882788817783765e-06, + "loss": 0.1332, + "step": 29490 + }, + { + "epoch": 0.9439093846990689, + "grad_norm": 14.5625, + "learning_rate": 1.1815426069383632e-06, + "loss": 0.1292, + "step": 29500 + }, + { + "epoch": 0.9442293539820177, + "grad_norm": 1.8359375, + "learning_rate": 1.1748063320983497e-06, + "loss": 0.1111, + "step": 29510 + }, + { + "epoch": 0.9445493232649665, + "grad_norm": 0.53515625, + "learning_rate": 1.1680700572583364e-06, + "loss": 0.0876, + "step": 29520 + }, + { + "epoch": 0.9448692925479154, + "grad_norm": 2.984375, + "learning_rate": 1.1613337824183228e-06, + "loss": 0.0458, + "step": 29530 + }, + { + "epoch": 0.9451892618308643, + "grad_norm": 39.75, + "learning_rate": 1.1545975075783093e-06, + "loss": 0.1522, + "step": 29540 + }, + { + "epoch": 0.945509231113813, + "grad_norm": 5.875, + "learning_rate": 1.1478612327382958e-06, + "loss": 0.1341, + "step": 29550 + }, + { + "epoch": 0.9458292003967619, + "grad_norm": 31.875, + "learning_rate": 1.1411249578982824e-06, + "loss": 0.1406, + "step": 29560 + }, + { + "epoch": 0.9461491696797107, + "grad_norm": 2.75, + "learning_rate": 1.134388683058269e-06, + "loss": 0.1329, + "step": 29570 + }, + { + "epoch": 0.9464691389626596, + "grad_norm": 10.6875, + "learning_rate": 1.1276524082182554e-06, + "loss": 0.0694, + "step": 29580 + }, + { + "epoch": 0.9467891082456085, + "grad_norm": 11.625, + "learning_rate": 1.1209161333782418e-06, + "loss": 0.1565, + "step": 29590 + }, + { + "epoch": 0.9471090775285572, + "grad_norm": 14.8125, + "learning_rate": 1.1141798585382283e-06, + "loss": 0.0868, + "step": 29600 + }, + { + "epoch": 0.9474290468115061, + "grad_norm": 9.5, + "learning_rate": 1.107443583698215e-06, + "loss": 0.1018, + "step": 29610 + }, + { + "epoch": 0.9477490160944549, + "grad_norm": 13.3125, + "learning_rate": 1.1007073088582014e-06, + "loss": 0.0625, + "step": 29620 + }, + { + "epoch": 0.9480689853774038, + "grad_norm": 14.4375, + "learning_rate": 1.0939710340181881e-06, + "loss": 0.1285, + "step": 29630 + }, + { + "epoch": 0.9483889546603526, + "grad_norm": 3.578125, + "learning_rate": 1.0872347591781746e-06, + "loss": 0.0586, + "step": 29640 + }, + { + "epoch": 0.9487089239433014, + "grad_norm": 0.95703125, + "learning_rate": 1.080498484338161e-06, + "loss": 0.049, + "step": 29650 + }, + { + "epoch": 0.9490288932262503, + "grad_norm": 24.125, + "learning_rate": 1.0737622094981475e-06, + "loss": 0.0841, + "step": 29660 + }, + { + "epoch": 0.9493488625091991, + "grad_norm": 16.875, + "learning_rate": 1.0670259346581342e-06, + "loss": 0.0813, + "step": 29670 + }, + { + "epoch": 0.949668831792148, + "grad_norm": 8.0625, + "learning_rate": 1.0602896598181206e-06, + "loss": 0.171, + "step": 29680 + }, + { + "epoch": 0.9499888010750968, + "grad_norm": 15.875, + "learning_rate": 1.0535533849781073e-06, + "loss": 0.1438, + "step": 29690 + }, + { + "epoch": 0.9503087703580456, + "grad_norm": 5.375, + "learning_rate": 1.0468171101380938e-06, + "loss": 0.1315, + "step": 29700 + }, + { + "epoch": 0.9506287396409945, + "grad_norm": 8.5625, + "learning_rate": 1.0400808352980803e-06, + "loss": 0.0772, + "step": 29710 + }, + { + "epoch": 0.9509487089239433, + "grad_norm": 10.375, + "learning_rate": 1.0333445604580667e-06, + "loss": 0.0884, + "step": 29720 + }, + { + "epoch": 0.9512686782068921, + "grad_norm": 8.5, + "learning_rate": 1.0266082856180532e-06, + "loss": 0.0834, + "step": 29730 + }, + { + "epoch": 0.951588647489841, + "grad_norm": 3.640625, + "learning_rate": 1.0198720107780399e-06, + "loss": 0.1144, + "step": 29740 + }, + { + "epoch": 0.9519086167727898, + "grad_norm": 1.4609375, + "learning_rate": 1.0131357359380263e-06, + "loss": 0.1354, + "step": 29750 + }, + { + "epoch": 0.9522285860557387, + "grad_norm": 1.6171875, + "learning_rate": 1.0063994610980128e-06, + "loss": 0.1206, + "step": 29760 + }, + { + "epoch": 0.9525485553386875, + "grad_norm": 0.8125, + "learning_rate": 9.996631862579995e-07, + "loss": 0.1125, + "step": 29770 + }, + { + "epoch": 0.9528685246216363, + "grad_norm": 7.625, + "learning_rate": 9.92926911417986e-07, + "loss": 0.1352, + "step": 29780 + }, + { + "epoch": 0.9531884939045852, + "grad_norm": 31.5, + "learning_rate": 9.861906365779724e-07, + "loss": 0.0936, + "step": 29790 + }, + { + "epoch": 0.953508463187534, + "grad_norm": 8.75, + "learning_rate": 9.79454361737959e-07, + "loss": 0.0617, + "step": 29800 + }, + { + "epoch": 0.9538284324704829, + "grad_norm": 17.125, + "learning_rate": 9.727180868979455e-07, + "loss": 0.078, + "step": 29810 + }, + { + "epoch": 0.9541484017534316, + "grad_norm": 4.09375, + "learning_rate": 9.65981812057932e-07, + "loss": 0.0936, + "step": 29820 + }, + { + "epoch": 0.9544683710363805, + "grad_norm": 7.25, + "learning_rate": 9.592455372179185e-07, + "loss": 0.1529, + "step": 29830 + }, + { + "epoch": 0.9547883403193294, + "grad_norm": 9.375, + "learning_rate": 9.52509262377905e-07, + "loss": 0.0909, + "step": 29840 + }, + { + "epoch": 0.9551083096022782, + "grad_norm": 0.625, + "learning_rate": 9.457729875378917e-07, + "loss": 0.1272, + "step": 29850 + }, + { + "epoch": 0.955428278885227, + "grad_norm": 7.9375, + "learning_rate": 9.390367126978782e-07, + "loss": 0.0567, + "step": 29860 + }, + { + "epoch": 0.9557482481681758, + "grad_norm": 4.40625, + "learning_rate": 9.323004378578646e-07, + "loss": 0.1161, + "step": 29870 + }, + { + "epoch": 0.9560682174511247, + "grad_norm": 16.25, + "learning_rate": 9.255641630178512e-07, + "loss": 0.1593, + "step": 29880 + }, + { + "epoch": 0.9563881867340736, + "grad_norm": 16.875, + "learning_rate": 9.188278881778377e-07, + "loss": 0.142, + "step": 29890 + }, + { + "epoch": 0.9567081560170224, + "grad_norm": 18.875, + "learning_rate": 9.120916133378243e-07, + "loss": 0.1522, + "step": 29900 + }, + { + "epoch": 0.9570281252999712, + "grad_norm": 1.9140625, + "learning_rate": 9.053553384978107e-07, + "loss": 0.1324, + "step": 29910 + }, + { + "epoch": 0.95734809458292, + "grad_norm": 6.53125, + "learning_rate": 8.986190636577973e-07, + "loss": 0.0888, + "step": 29920 + }, + { + "epoch": 0.9576680638658689, + "grad_norm": 36.0, + "learning_rate": 8.918827888177839e-07, + "loss": 0.2007, + "step": 29930 + }, + { + "epoch": 0.9579880331488178, + "grad_norm": 12.3125, + "learning_rate": 8.851465139777704e-07, + "loss": 0.0833, + "step": 29940 + }, + { + "epoch": 0.9583080024317665, + "grad_norm": 7.40625, + "learning_rate": 8.784102391377569e-07, + "loss": 0.1167, + "step": 29950 + }, + { + "epoch": 0.9586279717147154, + "grad_norm": 1.1015625, + "learning_rate": 8.716739642977435e-07, + "loss": 0.1334, + "step": 29960 + }, + { + "epoch": 0.9589479409976642, + "grad_norm": 15.0, + "learning_rate": 8.649376894577299e-07, + "loss": 0.0692, + "step": 29970 + }, + { + "epoch": 0.9592679102806131, + "grad_norm": 10.3125, + "learning_rate": 8.582014146177164e-07, + "loss": 0.1986, + "step": 29980 + }, + { + "epoch": 0.959587879563562, + "grad_norm": 0.55078125, + "learning_rate": 8.51465139777703e-07, + "loss": 0.0679, + "step": 29990 + }, + { + "epoch": 0.9599078488465107, + "grad_norm": 9.4375, + "learning_rate": 8.447288649376894e-07, + "loss": 0.1193, + "step": 30000 + }, + { + "epoch": 0.9602278181294596, + "grad_norm": 11.375, + "learning_rate": 8.379925900976761e-07, + "loss": 0.0968, + "step": 30010 + }, + { + "epoch": 0.9605477874124084, + "grad_norm": 10.9375, + "learning_rate": 8.312563152576626e-07, + "loss": 0.1227, + "step": 30020 + }, + { + "epoch": 0.9608677566953573, + "grad_norm": 18.125, + "learning_rate": 8.245200404176491e-07, + "loss": 0.1113, + "step": 30030 + }, + { + "epoch": 0.961187725978306, + "grad_norm": 18.125, + "learning_rate": 8.177837655776356e-07, + "loss": 0.1165, + "step": 30040 + }, + { + "epoch": 0.9615076952612549, + "grad_norm": 1.9375, + "learning_rate": 8.110474907376222e-07, + "loss": 0.0833, + "step": 30050 + }, + { + "epoch": 0.9618276645442038, + "grad_norm": 6.84375, + "learning_rate": 8.043112158976086e-07, + "loss": 0.0866, + "step": 30060 + }, + { + "epoch": 0.9621476338271526, + "grad_norm": 6.71875, + "learning_rate": 7.975749410575952e-07, + "loss": 0.1212, + "step": 30070 + }, + { + "epoch": 0.9624676031101014, + "grad_norm": 10.75, + "learning_rate": 7.908386662175817e-07, + "loss": 0.1781, + "step": 30080 + }, + { + "epoch": 0.9627875723930502, + "grad_norm": 3.859375, + "learning_rate": 7.841023913775684e-07, + "loss": 0.1305, + "step": 30090 + }, + { + "epoch": 0.9631075416759991, + "grad_norm": 18.125, + "learning_rate": 7.773661165375548e-07, + "loss": 0.1295, + "step": 30100 + }, + { + "epoch": 0.963427510958948, + "grad_norm": 16.375, + "learning_rate": 7.706298416975414e-07, + "loss": 0.1152, + "step": 30110 + }, + { + "epoch": 0.9637474802418968, + "grad_norm": 0.78125, + "learning_rate": 7.638935668575279e-07, + "loss": 0.137, + "step": 30120 + }, + { + "epoch": 0.9640674495248456, + "grad_norm": 7.28125, + "learning_rate": 7.571572920175143e-07, + "loss": 0.1233, + "step": 30130 + }, + { + "epoch": 0.9643874188077944, + "grad_norm": 4.03125, + "learning_rate": 7.504210171775009e-07, + "loss": 0.129, + "step": 30140 + }, + { + "epoch": 0.9647073880907433, + "grad_norm": 9.8125, + "learning_rate": 7.436847423374874e-07, + "loss": 0.1409, + "step": 30150 + }, + { + "epoch": 0.9650273573736922, + "grad_norm": 13.75, + "learning_rate": 7.369484674974739e-07, + "loss": 0.082, + "step": 30160 + }, + { + "epoch": 0.9653473266566409, + "grad_norm": 0.490234375, + "learning_rate": 7.302121926574605e-07, + "loss": 0.0944, + "step": 30170 + }, + { + "epoch": 0.9656672959395898, + "grad_norm": 6.9375, + "learning_rate": 7.234759178174471e-07, + "loss": 0.0378, + "step": 30180 + }, + { + "epoch": 0.9659872652225386, + "grad_norm": 7.1875, + "learning_rate": 7.167396429774335e-07, + "loss": 0.1357, + "step": 30190 + }, + { + "epoch": 0.9663072345054875, + "grad_norm": 0.9609375, + "learning_rate": 7.100033681374201e-07, + "loss": 0.1785, + "step": 30200 + }, + { + "epoch": 0.9666272037884364, + "grad_norm": 10.25, + "learning_rate": 7.032670932974066e-07, + "loss": 0.0899, + "step": 30210 + }, + { + "epoch": 0.9669471730713851, + "grad_norm": 7.5625, + "learning_rate": 6.965308184573931e-07, + "loss": 0.1482, + "step": 30220 + }, + { + "epoch": 0.967267142354334, + "grad_norm": 8.25, + "learning_rate": 6.897945436173796e-07, + "loss": 0.107, + "step": 30230 + }, + { + "epoch": 0.9675871116372828, + "grad_norm": 5.6875, + "learning_rate": 6.830582687773661e-07, + "loss": 0.0703, + "step": 30240 + }, + { + "epoch": 0.9679070809202317, + "grad_norm": 18.0, + "learning_rate": 6.763219939373527e-07, + "loss": 0.1616, + "step": 30250 + }, + { + "epoch": 0.9682270502031805, + "grad_norm": 3.90625, + "learning_rate": 6.695857190973393e-07, + "loss": 0.1073, + "step": 30260 + }, + { + "epoch": 0.9685470194861293, + "grad_norm": 6.6875, + "learning_rate": 6.628494442573258e-07, + "loss": 0.1452, + "step": 30270 + }, + { + "epoch": 0.9688669887690782, + "grad_norm": 14.4375, + "learning_rate": 6.561131694173122e-07, + "loss": 0.2076, + "step": 30280 + }, + { + "epoch": 0.969186958052027, + "grad_norm": 14.5, + "learning_rate": 6.493768945772988e-07, + "loss": 0.1903, + "step": 30290 + }, + { + "epoch": 0.9695069273349758, + "grad_norm": 16.5, + "learning_rate": 6.426406197372853e-07, + "loss": 0.1945, + "step": 30300 + }, + { + "epoch": 0.9698268966179246, + "grad_norm": 8.875, + "learning_rate": 6.359043448972719e-07, + "loss": 0.1507, + "step": 30310 + }, + { + "epoch": 0.9701468659008735, + "grad_norm": 11.6875, + "learning_rate": 6.291680700572583e-07, + "loss": 0.1027, + "step": 30320 + }, + { + "epoch": 0.9704668351838224, + "grad_norm": 14.125, + "learning_rate": 6.224317952172449e-07, + "loss": 0.1336, + "step": 30330 + }, + { + "epoch": 0.9707868044667712, + "grad_norm": 16.875, + "learning_rate": 6.156955203772315e-07, + "loss": 0.0553, + "step": 30340 + }, + { + "epoch": 0.97110677374972, + "grad_norm": 20.625, + "learning_rate": 6.08959245537218e-07, + "loss": 0.0632, + "step": 30350 + }, + { + "epoch": 0.9714267430326688, + "grad_norm": 13.5, + "learning_rate": 6.022229706972045e-07, + "loss": 0.1034, + "step": 30360 + }, + { + "epoch": 0.9717467123156177, + "grad_norm": 15.0625, + "learning_rate": 5.954866958571911e-07, + "loss": 0.0737, + "step": 30370 + }, + { + "epoch": 0.9720666815985666, + "grad_norm": 24.375, + "learning_rate": 5.887504210171775e-07, + "loss": 0.1024, + "step": 30380 + }, + { + "epoch": 0.9723866508815153, + "grad_norm": 4.96875, + "learning_rate": 5.820141461771641e-07, + "loss": 0.1472, + "step": 30390 + }, + { + "epoch": 0.9727066201644642, + "grad_norm": 15.25, + "learning_rate": 5.752778713371506e-07, + "loss": 0.091, + "step": 30400 + }, + { + "epoch": 0.973026589447413, + "grad_norm": 30.875, + "learning_rate": 5.685415964971371e-07, + "loss": 0.105, + "step": 30410 + }, + { + "epoch": 0.9733465587303619, + "grad_norm": 0.5, + "learning_rate": 5.618053216571236e-07, + "loss": 0.1065, + "step": 30420 + }, + { + "epoch": 0.9736665280133108, + "grad_norm": 1.2421875, + "learning_rate": 5.550690468171102e-07, + "loss": 0.1332, + "step": 30430 + }, + { + "epoch": 0.9739864972962595, + "grad_norm": 40.5, + "learning_rate": 5.483327719770967e-07, + "loss": 0.1038, + "step": 30440 + }, + { + "epoch": 0.9743064665792084, + "grad_norm": 10.4375, + "learning_rate": 5.415964971370832e-07, + "loss": 0.1251, + "step": 30450 + }, + { + "epoch": 0.9746264358621572, + "grad_norm": 19.875, + "learning_rate": 5.348602222970698e-07, + "loss": 0.1756, + "step": 30460 + }, + { + "epoch": 0.9749464051451061, + "grad_norm": 15.9375, + "learning_rate": 5.281239474570563e-07, + "loss": 0.0858, + "step": 30470 + }, + { + "epoch": 0.9752663744280549, + "grad_norm": 1.734375, + "learning_rate": 5.213876726170428e-07, + "loss": 0.0836, + "step": 30480 + }, + { + "epoch": 0.9755863437110037, + "grad_norm": 8.5625, + "learning_rate": 5.146513977770294e-07, + "loss": 0.0787, + "step": 30490 + }, + { + "epoch": 0.9759063129939526, + "grad_norm": 2.3125, + "learning_rate": 5.079151229370158e-07, + "loss": 0.1419, + "step": 30500 + }, + { + "epoch": 0.9762262822769014, + "grad_norm": 1.8203125, + "learning_rate": 5.011788480970024e-07, + "loss": 0.1126, + "step": 30510 + }, + { + "epoch": 0.9765462515598503, + "grad_norm": 6.0, + "learning_rate": 4.94442573256989e-07, + "loss": 0.0274, + "step": 30520 + }, + { + "epoch": 0.9768662208427991, + "grad_norm": 29.625, + "learning_rate": 4.877062984169755e-07, + "loss": 0.1164, + "step": 30530 + }, + { + "epoch": 0.9771861901257479, + "grad_norm": 54.75, + "learning_rate": 4.809700235769619e-07, + "loss": 0.0588, + "step": 30540 + }, + { + "epoch": 0.9775061594086968, + "grad_norm": 1.21875, + "learning_rate": 4.7423374873694854e-07, + "loss": 0.1148, + "step": 30550 + }, + { + "epoch": 0.9778261286916456, + "grad_norm": 0.7265625, + "learning_rate": 4.6749747389693506e-07, + "loss": 0.1387, + "step": 30560 + }, + { + "epoch": 0.9781460979745944, + "grad_norm": 17.0, + "learning_rate": 4.607611990569216e-07, + "loss": 0.1025, + "step": 30570 + }, + { + "epoch": 0.9784660672575433, + "grad_norm": 8.8125, + "learning_rate": 4.5402492421690804e-07, + "loss": 0.1132, + "step": 30580 + }, + { + "epoch": 0.9787860365404921, + "grad_norm": 7.9375, + "learning_rate": 4.4728864937689467e-07, + "loss": 0.0996, + "step": 30590 + }, + { + "epoch": 0.979106005823441, + "grad_norm": 21.625, + "learning_rate": 4.4055237453688113e-07, + "loss": 0.1536, + "step": 30600 + }, + { + "epoch": 0.9794259751063897, + "grad_norm": 6.8125, + "learning_rate": 4.3381609969686765e-07, + "loss": 0.1974, + "step": 30610 + }, + { + "epoch": 0.9797459443893386, + "grad_norm": 8.4375, + "learning_rate": 4.2707982485685417e-07, + "loss": 0.1202, + "step": 30620 + }, + { + "epoch": 0.9800659136722875, + "grad_norm": 22.375, + "learning_rate": 4.2034355001684074e-07, + "loss": 0.1091, + "step": 30630 + }, + { + "epoch": 0.9803858829552363, + "grad_norm": 9.625, + "learning_rate": 4.1360727517682726e-07, + "loss": 0.0747, + "step": 30640 + }, + { + "epoch": 0.9807058522381852, + "grad_norm": 25.5, + "learning_rate": 4.068710003368138e-07, + "loss": 0.102, + "step": 30650 + }, + { + "epoch": 0.9810258215211339, + "grad_norm": 5.625, + "learning_rate": 4.001347254968003e-07, + "loss": 0.1312, + "step": 30660 + }, + { + "epoch": 0.9813457908040828, + "grad_norm": 10.4375, + "learning_rate": 3.9339845065678686e-07, + "loss": 0.1142, + "step": 30670 + }, + { + "epoch": 0.9816657600870317, + "grad_norm": 4.46875, + "learning_rate": 3.866621758167734e-07, + "loss": 0.0949, + "step": 30680 + }, + { + "epoch": 0.9819857293699805, + "grad_norm": 25.125, + "learning_rate": 3.799259009767599e-07, + "loss": 0.0841, + "step": 30690 + }, + { + "epoch": 0.9823056986529293, + "grad_norm": 0.9453125, + "learning_rate": 3.731896261367464e-07, + "loss": 0.1005, + "step": 30700 + }, + { + "epoch": 0.9826256679358781, + "grad_norm": 19.25, + "learning_rate": 3.66453351296733e-07, + "loss": 0.1297, + "step": 30710 + }, + { + "epoch": 0.982945637218827, + "grad_norm": 4.5625, + "learning_rate": 3.597170764567195e-07, + "loss": 0.114, + "step": 30720 + }, + { + "epoch": 0.9832656065017759, + "grad_norm": 19.625, + "learning_rate": 3.5298080161670597e-07, + "loss": 0.1473, + "step": 30730 + }, + { + "epoch": 0.9835855757847247, + "grad_norm": 18.25, + "learning_rate": 3.462445267766925e-07, + "loss": 0.1091, + "step": 30740 + }, + { + "epoch": 0.9839055450676735, + "grad_norm": 0.72265625, + "learning_rate": 3.3950825193667906e-07, + "loss": 0.0664, + "step": 30750 + }, + { + "epoch": 0.9842255143506223, + "grad_norm": 14.125, + "learning_rate": 3.327719770966656e-07, + "loss": 0.0747, + "step": 30760 + }, + { + "epoch": 0.9845454836335712, + "grad_norm": 16.25, + "learning_rate": 3.260357022566521e-07, + "loss": 0.0941, + "step": 30770 + }, + { + "epoch": 0.9848654529165201, + "grad_norm": 6.6875, + "learning_rate": 3.192994274166386e-07, + "loss": 0.1096, + "step": 30780 + }, + { + "epoch": 0.9851854221994688, + "grad_norm": 19.5, + "learning_rate": 3.1256315257662513e-07, + "loss": 0.0965, + "step": 30790 + }, + { + "epoch": 0.9855053914824177, + "grad_norm": 11.5625, + "learning_rate": 3.0582687773661165e-07, + "loss": 0.1353, + "step": 30800 + }, + { + "epoch": 0.9858253607653665, + "grad_norm": 26.25, + "learning_rate": 2.990906028965982e-07, + "loss": 0.0604, + "step": 30810 + }, + { + "epoch": 0.9861453300483154, + "grad_norm": 9.375, + "learning_rate": 2.9235432805658474e-07, + "loss": 0.1302, + "step": 30820 + }, + { + "epoch": 0.9864652993312641, + "grad_norm": 14.8125, + "learning_rate": 2.8561805321657125e-07, + "loss": 0.1513, + "step": 30830 + }, + { + "epoch": 0.986785268614213, + "grad_norm": 16.0, + "learning_rate": 2.7888177837655777e-07, + "loss": 0.0773, + "step": 30840 + }, + { + "epoch": 0.9871052378971619, + "grad_norm": 0.279296875, + "learning_rate": 2.721455035365443e-07, + "loss": 0.0452, + "step": 30850 + }, + { + "epoch": 0.9874252071801107, + "grad_norm": 22.125, + "learning_rate": 2.654092286965308e-07, + "loss": 0.1346, + "step": 30860 + }, + { + "epoch": 0.9877451764630596, + "grad_norm": 1.6328125, + "learning_rate": 2.586729538565174e-07, + "loss": 0.1433, + "step": 30870 + }, + { + "epoch": 0.9880651457460083, + "grad_norm": 7.71875, + "learning_rate": 2.519366790165039e-07, + "loss": 0.1037, + "step": 30880 + }, + { + "epoch": 0.9883851150289572, + "grad_norm": 8.75, + "learning_rate": 2.452004041764904e-07, + "loss": 0.154, + "step": 30890 + }, + { + "epoch": 0.9887050843119061, + "grad_norm": 24.25, + "learning_rate": 2.3846412933647693e-07, + "loss": 0.1128, + "step": 30900 + }, + { + "epoch": 0.9890250535948549, + "grad_norm": 17.375, + "learning_rate": 2.3172785449646348e-07, + "loss": 0.0822, + "step": 30910 + }, + { + "epoch": 0.9893450228778037, + "grad_norm": 34.0, + "learning_rate": 2.2499157965645e-07, + "loss": 0.1088, + "step": 30920 + }, + { + "epoch": 0.9896649921607525, + "grad_norm": 1.296875, + "learning_rate": 2.1825530481643654e-07, + "loss": 0.0829, + "step": 30930 + }, + { + "epoch": 0.9899849614437014, + "grad_norm": 2.96875, + "learning_rate": 2.1151902997642303e-07, + "loss": 0.082, + "step": 30940 + }, + { + "epoch": 0.9903049307266503, + "grad_norm": 20.25, + "learning_rate": 2.0478275513640957e-07, + "loss": 0.1768, + "step": 30950 + }, + { + "epoch": 0.9906249000095991, + "grad_norm": 7.1875, + "learning_rate": 1.980464802963961e-07, + "loss": 0.0725, + "step": 30960 + }, + { + "epoch": 0.9909448692925479, + "grad_norm": 1.1171875, + "learning_rate": 1.9131020545638264e-07, + "loss": 0.1199, + "step": 30970 + }, + { + "epoch": 0.9912648385754967, + "grad_norm": 13.625, + "learning_rate": 1.8457393061636915e-07, + "loss": 0.0619, + "step": 30980 + }, + { + "epoch": 0.9915848078584456, + "grad_norm": 1.765625, + "learning_rate": 1.778376557763557e-07, + "loss": 0.1439, + "step": 30990 + }, + { + "epoch": 0.9919047771413945, + "grad_norm": 1.484375, + "learning_rate": 1.7110138093634222e-07, + "loss": 0.1021, + "step": 31000 + }, + { + "epoch": 0.9922247464243432, + "grad_norm": 21.0, + "learning_rate": 1.6436510609632876e-07, + "loss": 0.1386, + "step": 31010 + }, + { + "epoch": 0.9925447157072921, + "grad_norm": 16.5, + "learning_rate": 1.5762883125631525e-07, + "loss": 0.1167, + "step": 31020 + }, + { + "epoch": 0.9928646849902409, + "grad_norm": 1.875, + "learning_rate": 1.508925564163018e-07, + "loss": 0.0717, + "step": 31030 + }, + { + "epoch": 0.9931846542731898, + "grad_norm": 27.5, + "learning_rate": 1.4415628157628834e-07, + "loss": 0.1387, + "step": 31040 + }, + { + "epoch": 0.9935046235561386, + "grad_norm": 1.90625, + "learning_rate": 1.3742000673627486e-07, + "loss": 0.0809, + "step": 31050 + }, + { + "epoch": 0.9938245928390874, + "grad_norm": 0.41015625, + "learning_rate": 1.3068373189626137e-07, + "loss": 0.0664, + "step": 31060 + }, + { + "epoch": 0.9941445621220363, + "grad_norm": 10.0625, + "learning_rate": 1.2394745705624792e-07, + "loss": 0.1082, + "step": 31070 + }, + { + "epoch": 0.9944645314049851, + "grad_norm": 19.25, + "learning_rate": 1.1721118221623444e-07, + "loss": 0.1239, + "step": 31080 + }, + { + "epoch": 0.994784500687934, + "grad_norm": 10.0, + "learning_rate": 1.1047490737622097e-07, + "loss": 0.0653, + "step": 31090 + }, + { + "epoch": 0.9951044699708828, + "grad_norm": 13.5, + "learning_rate": 1.0373863253620749e-07, + "loss": 0.0619, + "step": 31100 + }, + { + "epoch": 0.9954244392538316, + "grad_norm": 22.875, + "learning_rate": 9.700235769619402e-08, + "loss": 0.1128, + "step": 31110 + }, + { + "epoch": 0.9957444085367805, + "grad_norm": 0.82421875, + "learning_rate": 9.026608285618055e-08, + "loss": 0.0826, + "step": 31120 + }, + { + "epoch": 0.9960643778197293, + "grad_norm": 7.8125, + "learning_rate": 8.352980801616707e-08, + "loss": 0.1081, + "step": 31130 + }, + { + "epoch": 0.9963843471026781, + "grad_norm": 12.0625, + "learning_rate": 7.679353317615358e-08, + "loss": 0.1246, + "step": 31140 + }, + { + "epoch": 0.996704316385627, + "grad_norm": 9.0, + "learning_rate": 7.005725833614011e-08, + "loss": 0.0712, + "step": 31150 + }, + { + "epoch": 0.9970242856685758, + "grad_norm": 26.75, + "learning_rate": 6.332098349612665e-08, + "loss": 0.0889, + "step": 31160 + }, + { + "epoch": 0.9973442549515247, + "grad_norm": 12.0, + "learning_rate": 5.658470865611317e-08, + "loss": 0.0913, + "step": 31170 + }, + { + "epoch": 0.9976642242344735, + "grad_norm": 15.0, + "learning_rate": 4.98484338160997e-08, + "loss": 0.1362, + "step": 31180 + }, + { + "epoch": 0.9979841935174223, + "grad_norm": 4.875, + "learning_rate": 4.3112158976086225e-08, + "loss": 0.117, + "step": 31190 + }, + { + "epoch": 0.9983041628003712, + "grad_norm": 29.375, + "learning_rate": 3.6375884136072756e-08, + "loss": 0.1383, + "step": 31200 + }, + { + "epoch": 0.99862413208332, + "grad_norm": 7.09375, + "learning_rate": 2.9639609296059284e-08, + "loss": 0.1093, + "step": 31210 + }, + { + "epoch": 0.9989441013662689, + "grad_norm": 13.5625, + "learning_rate": 2.2903334456045812e-08, + "loss": 0.1618, + "step": 31220 + }, + { + "epoch": 0.9992640706492176, + "grad_norm": 22.25, + "learning_rate": 1.6167059616032336e-08, + "loss": 0.0873, + "step": 31230 + }, + { + "epoch": 0.9995840399321665, + "grad_norm": 9.0, + "learning_rate": 9.430784776018862e-09, + "loss": 0.1089, + "step": 31240 + }, + { + "epoch": 0.9999040092151154, + "grad_norm": 33.25, + "learning_rate": 2.694509936005389e-09, + "loss": 0.0919, + "step": 31250 + } + ], + "logging_steps": 10, + "max_steps": 31253, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.105650020954463e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}