{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6885, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014524328249818446, "grad_norm": 4.328955480739728, "learning_rate": 1.3062409288824383e-07, "loss": 0.9607, "step": 10 }, { "epoch": 0.002904865649963689, "grad_norm": 4.469323164876104, "learning_rate": 2.757619738751814e-07, "loss": 0.9859, "step": 20 }, { "epoch": 0.004357298474945534, "grad_norm": 4.000416594025176, "learning_rate": 4.2089985486211904e-07, "loss": 0.9872, "step": 30 }, { "epoch": 0.005809731299927378, "grad_norm": 3.1566001029759914, "learning_rate": 5.660377358490567e-07, "loss": 0.9191, "step": 40 }, { "epoch": 0.007262164124909223, "grad_norm": 2.000776925354802, "learning_rate": 7.111756168359943e-07, "loss": 0.866, "step": 50 }, { "epoch": 0.008714596949891068, "grad_norm": 2.03383269865318, "learning_rate": 8.563134978229319e-07, "loss": 0.8475, "step": 60 }, { "epoch": 0.010167029774872912, "grad_norm": 1.981671850063017, "learning_rate": 1.0014513788098695e-06, "loss": 0.8145, "step": 70 }, { "epoch": 0.011619462599854757, "grad_norm": 1.9935447101504142, "learning_rate": 1.146589259796807e-06, "loss": 0.7874, "step": 80 }, { "epoch": 0.013071895424836602, "grad_norm": 1.696794144473072, "learning_rate": 1.2917271407837448e-06, "loss": 0.7606, "step": 90 }, { "epoch": 0.014524328249818447, "grad_norm": 1.8441704167155635, "learning_rate": 1.4368650217706823e-06, "loss": 0.7505, "step": 100 }, { "epoch": 0.01597676107480029, "grad_norm": 1.6167640330505846, "learning_rate": 1.5820029027576197e-06, "loss": 0.7432, "step": 110 }, { "epoch": 0.017429193899782137, "grad_norm": 1.7310300613256226, "learning_rate": 1.7271407837445576e-06, "loss": 0.7502, "step": 120 }, { "epoch": 0.01888162672476398, "grad_norm": 1.5504171157690307, "learning_rate": 1.872278664731495e-06, "loss": 0.7075, "step": 130 }, { "epoch": 0.020334059549745823, "grad_norm": 1.5001595551333269, "learning_rate": 2.0174165457184327e-06, "loss": 0.7242, "step": 140 }, { "epoch": 0.02178649237472767, "grad_norm": 1.7680255328873922, "learning_rate": 2.1625544267053704e-06, "loss": 0.7299, "step": 150 }, { "epoch": 0.023238925199709513, "grad_norm": 1.9776874021989124, "learning_rate": 2.307692307692308e-06, "loss": 0.7074, "step": 160 }, { "epoch": 0.024691358024691357, "grad_norm": 1.645294675336186, "learning_rate": 2.4528301886792453e-06, "loss": 0.7003, "step": 170 }, { "epoch": 0.026143790849673203, "grad_norm": 1.903626800669526, "learning_rate": 2.597968069666183e-06, "loss": 0.6935, "step": 180 }, { "epoch": 0.027596223674655047, "grad_norm": 1.6296522016767983, "learning_rate": 2.7431059506531207e-06, "loss": 0.7099, "step": 190 }, { "epoch": 0.029048656499636893, "grad_norm": 1.5624745122869332, "learning_rate": 2.8882438316400583e-06, "loss": 0.7082, "step": 200 }, { "epoch": 0.030501089324618737, "grad_norm": 1.5327148829437787, "learning_rate": 3.033381712626996e-06, "loss": 0.6847, "step": 210 }, { "epoch": 0.03195352214960058, "grad_norm": 1.4217156007581908, "learning_rate": 3.1785195936139337e-06, "loss": 0.6997, "step": 220 }, { "epoch": 0.03340595497458242, "grad_norm": 1.678714535521671, "learning_rate": 3.323657474600871e-06, "loss": 0.6922, "step": 230 }, { "epoch": 0.034858387799564274, "grad_norm": 1.6893028132334575, "learning_rate": 3.4687953555878086e-06, "loss": 0.6764, "step": 240 }, { "epoch": 0.03631082062454612, "grad_norm": 1.6842923668045748, "learning_rate": 3.6139332365747467e-06, "loss": 0.6838, "step": 250 }, { "epoch": 0.03776325344952796, "grad_norm": 2.0758637079489306, "learning_rate": 3.759071117561684e-06, "loss": 0.6961, "step": 260 }, { "epoch": 0.0392156862745098, "grad_norm": 1.651886885559497, "learning_rate": 3.904208998548621e-06, "loss": 0.6619, "step": 270 }, { "epoch": 0.04066811909949165, "grad_norm": 1.6813735734416895, "learning_rate": 4.049346879535559e-06, "loss": 0.691, "step": 280 }, { "epoch": 0.04212055192447349, "grad_norm": 1.8001370749006687, "learning_rate": 4.194484760522497e-06, "loss": 0.6646, "step": 290 }, { "epoch": 0.04357298474945534, "grad_norm": 1.8255351447030483, "learning_rate": 4.339622641509435e-06, "loss": 0.6595, "step": 300 }, { "epoch": 0.04502541757443718, "grad_norm": 1.7918481140936697, "learning_rate": 4.484760522496372e-06, "loss": 0.6555, "step": 310 }, { "epoch": 0.04647785039941903, "grad_norm": 1.6697318257583398, "learning_rate": 4.629898403483309e-06, "loss": 0.6734, "step": 320 }, { "epoch": 0.04793028322440087, "grad_norm": 1.5656777878920214, "learning_rate": 4.775036284470247e-06, "loss": 0.6511, "step": 330 }, { "epoch": 0.04938271604938271, "grad_norm": 1.6515736055504289, "learning_rate": 4.920174165457185e-06, "loss": 0.6651, "step": 340 }, { "epoch": 0.050835148874364564, "grad_norm": 1.6517233906536315, "learning_rate": 5.065312046444122e-06, "loss": 0.665, "step": 350 }, { "epoch": 0.05228758169934641, "grad_norm": 1.6987223199576384, "learning_rate": 5.210449927431061e-06, "loss": 0.6632, "step": 360 }, { "epoch": 0.05374001452432825, "grad_norm": 1.578744968443496, "learning_rate": 5.355587808417998e-06, "loss": 0.665, "step": 370 }, { "epoch": 0.05519244734931009, "grad_norm": 1.4975426293081397, "learning_rate": 5.500725689404935e-06, "loss": 0.6511, "step": 380 }, { "epoch": 0.05664488017429194, "grad_norm": 1.7386717568110297, "learning_rate": 5.645863570391873e-06, "loss": 0.6676, "step": 390 }, { "epoch": 0.05809731299927379, "grad_norm": 1.5916583497500596, "learning_rate": 5.7910014513788105e-06, "loss": 0.6635, "step": 400 }, { "epoch": 0.05954974582425563, "grad_norm": 1.6931617934865184, "learning_rate": 5.936139332365748e-06, "loss": 0.6668, "step": 410 }, { "epoch": 0.06100217864923747, "grad_norm": 1.5616372247201953, "learning_rate": 6.081277213352685e-06, "loss": 0.6685, "step": 420 }, { "epoch": 0.06245461147421932, "grad_norm": 1.5424914283941253, "learning_rate": 6.226415094339623e-06, "loss": 0.659, "step": 430 }, { "epoch": 0.06390704429920116, "grad_norm": 1.6468311050594455, "learning_rate": 6.37155297532656e-06, "loss": 0.6453, "step": 440 }, { "epoch": 0.06535947712418301, "grad_norm": 1.5765402125957226, "learning_rate": 6.5166908563134976e-06, "loss": 0.6598, "step": 450 }, { "epoch": 0.06681190994916485, "grad_norm": 1.7349394887283642, "learning_rate": 6.6618287373004365e-06, "loss": 0.6619, "step": 460 }, { "epoch": 0.0682643427741467, "grad_norm": 1.6385635232751372, "learning_rate": 6.806966618287374e-06, "loss": 0.6692, "step": 470 }, { "epoch": 0.06971677559912855, "grad_norm": 1.4945507177883908, "learning_rate": 6.952104499274311e-06, "loss": 0.6484, "step": 480 }, { "epoch": 0.07116920842411038, "grad_norm": 1.583857774726375, "learning_rate": 7.097242380261249e-06, "loss": 0.657, "step": 490 }, { "epoch": 0.07262164124909223, "grad_norm": 1.8780189334850588, "learning_rate": 7.242380261248186e-06, "loss": 0.6601, "step": 500 }, { "epoch": 0.07407407407407407, "grad_norm": 1.5153409007972507, "learning_rate": 7.387518142235124e-06, "loss": 0.6542, "step": 510 }, { "epoch": 0.07552650689905592, "grad_norm": 1.5243833834622142, "learning_rate": 7.532656023222062e-06, "loss": 0.6476, "step": 520 }, { "epoch": 0.07697893972403776, "grad_norm": 1.6429693792028686, "learning_rate": 7.677793904208998e-06, "loss": 0.6451, "step": 530 }, { "epoch": 0.0784313725490196, "grad_norm": 1.802860360098263, "learning_rate": 7.822931785195936e-06, "loss": 0.6527, "step": 540 }, { "epoch": 0.07988380537400146, "grad_norm": 1.6594363957156038, "learning_rate": 7.968069666182874e-06, "loss": 0.661, "step": 550 }, { "epoch": 0.0813362381989833, "grad_norm": 1.5938255936259151, "learning_rate": 8.113207547169812e-06, "loss": 0.6547, "step": 560 }, { "epoch": 0.08278867102396514, "grad_norm": 1.3939924292770436, "learning_rate": 8.25834542815675e-06, "loss": 0.6609, "step": 570 }, { "epoch": 0.08424110384894698, "grad_norm": 1.5321796462771227, "learning_rate": 8.403483309143687e-06, "loss": 0.6419, "step": 580 }, { "epoch": 0.08569353667392883, "grad_norm": 1.5907007682060863, "learning_rate": 8.548621190130625e-06, "loss": 0.625, "step": 590 }, { "epoch": 0.08714596949891068, "grad_norm": 1.6048966671231157, "learning_rate": 8.693759071117563e-06, "loss": 0.658, "step": 600 }, { "epoch": 0.08859840232389252, "grad_norm": 1.457751877262412, "learning_rate": 8.8388969521045e-06, "loss": 0.6456, "step": 610 }, { "epoch": 0.09005083514887437, "grad_norm": 1.3925725985786772, "learning_rate": 8.984034833091438e-06, "loss": 0.6494, "step": 620 }, { "epoch": 0.0915032679738562, "grad_norm": 1.6476815627809678, "learning_rate": 9.129172714078376e-06, "loss": 0.6604, "step": 630 }, { "epoch": 0.09295570079883805, "grad_norm": 1.4844043302240553, "learning_rate": 9.274310595065312e-06, "loss": 0.6462, "step": 640 }, { "epoch": 0.0944081336238199, "grad_norm": 1.5541257847812342, "learning_rate": 9.41944847605225e-06, "loss": 0.6464, "step": 650 }, { "epoch": 0.09586056644880174, "grad_norm": 1.5339956751582804, "learning_rate": 9.564586357039188e-06, "loss": 0.6471, "step": 660 }, { "epoch": 0.09731299927378359, "grad_norm": 1.550006983868159, "learning_rate": 9.709724238026126e-06, "loss": 0.6519, "step": 670 }, { "epoch": 0.09876543209876543, "grad_norm": 1.298622779401985, "learning_rate": 9.854862119013063e-06, "loss": 0.6508, "step": 680 }, { "epoch": 0.10021786492374728, "grad_norm": 1.4545201677417376, "learning_rate": 1e-05, "loss": 0.6483, "step": 690 }, { "epoch": 0.10167029774872913, "grad_norm": 1.7514454450540817, "learning_rate": 9.999935728859667e-06, "loss": 0.6517, "step": 700 }, { "epoch": 0.10312273057371096, "grad_norm": 1.3010290416328456, "learning_rate": 9.999742917090981e-06, "loss": 0.6435, "step": 710 }, { "epoch": 0.10457516339869281, "grad_norm": 1.5222737445349914, "learning_rate": 9.999421569650833e-06, "loss": 0.6355, "step": 720 }, { "epoch": 0.10602759622367465, "grad_norm": 1.5758824439402839, "learning_rate": 9.99897169480057e-06, "loss": 0.6414, "step": 730 }, { "epoch": 0.1074800290486565, "grad_norm": 1.3245458819453462, "learning_rate": 9.99839330410578e-06, "loss": 0.6416, "step": 740 }, { "epoch": 0.10893246187363835, "grad_norm": 1.4753577499137038, "learning_rate": 9.997686412435996e-06, "loss": 0.6381, "step": 750 }, { "epoch": 0.11038489469862019, "grad_norm": 1.4578988593383, "learning_rate": 9.99685103796431e-06, "loss": 0.6369, "step": 760 }, { "epoch": 0.11183732752360204, "grad_norm": 1.389881220599468, "learning_rate": 9.99588720216691e-06, "loss": 0.6622, "step": 770 }, { "epoch": 0.11328976034858387, "grad_norm": 1.2318560606230133, "learning_rate": 9.994794929822527e-06, "loss": 0.6279, "step": 780 }, { "epoch": 0.11474219317356572, "grad_norm": 1.355472620629438, "learning_rate": 9.993574249011797e-06, "loss": 0.641, "step": 790 }, { "epoch": 0.11619462599854757, "grad_norm": 1.4379602146139996, "learning_rate": 9.992225191116538e-06, "loss": 0.6439, "step": 800 }, { "epoch": 0.11764705882352941, "grad_norm": 1.4777958226910466, "learning_rate": 9.990747790818946e-06, "loss": 0.6457, "step": 810 }, { "epoch": 0.11909949164851126, "grad_norm": 1.2895229336241503, "learning_rate": 9.989142086100703e-06, "loss": 0.6483, "step": 820 }, { "epoch": 0.1205519244734931, "grad_norm": 1.4811460587250382, "learning_rate": 9.987408118241995e-06, "loss": 0.6509, "step": 830 }, { "epoch": 0.12200435729847495, "grad_norm": 1.3189208191268318, "learning_rate": 9.985545931820463e-06, "loss": 0.6181, "step": 840 }, { "epoch": 0.12345679012345678, "grad_norm": 1.3731300368595278, "learning_rate": 9.983555574710043e-06, "loss": 0.6274, "step": 850 }, { "epoch": 0.12490922294843863, "grad_norm": 1.4055775942483093, "learning_rate": 9.981437098079743e-06, "loss": 0.6398, "step": 860 }, { "epoch": 0.12636165577342048, "grad_norm": 1.3307192435974602, "learning_rate": 9.979190556392326e-06, "loss": 0.6393, "step": 870 }, { "epoch": 0.12781408859840232, "grad_norm": 1.5622917958142868, "learning_rate": 9.976816007402912e-06, "loss": 0.6456, "step": 880 }, { "epoch": 0.12926652142338416, "grad_norm": 1.390636406480548, "learning_rate": 9.974313512157488e-06, "loss": 0.6288, "step": 890 }, { "epoch": 0.13071895424836602, "grad_norm": 1.4427250843896926, "learning_rate": 9.971683134991344e-06, "loss": 0.6266, "step": 900 }, { "epoch": 0.13217138707334786, "grad_norm": 1.4098179198178282, "learning_rate": 9.968924943527418e-06, "loss": 0.6411, "step": 910 }, { "epoch": 0.1336238198983297, "grad_norm": 1.4962238363929918, "learning_rate": 9.96603900867455e-06, "loss": 0.6315, "step": 920 }, { "epoch": 0.13507625272331156, "grad_norm": 1.3209044251278015, "learning_rate": 9.963025404625673e-06, "loss": 0.6423, "step": 930 }, { "epoch": 0.1365286855482934, "grad_norm": 1.39955503516968, "learning_rate": 9.959884208855893e-06, "loss": 0.6361, "step": 940 }, { "epoch": 0.13798111837327523, "grad_norm": 1.5348970475105241, "learning_rate": 9.956615502120504e-06, "loss": 0.6241, "step": 950 }, { "epoch": 0.1394335511982571, "grad_norm": 1.48874630945738, "learning_rate": 9.953219368452908e-06, "loss": 0.631, "step": 960 }, { "epoch": 0.14088598402323893, "grad_norm": 1.310857282598366, "learning_rate": 9.949695895162464e-06, "loss": 0.627, "step": 970 }, { "epoch": 0.14233841684822077, "grad_norm": 1.3619342578169393, "learning_rate": 9.946045172832224e-06, "loss": 0.6387, "step": 980 }, { "epoch": 0.1437908496732026, "grad_norm": 1.4936986486504984, "learning_rate": 9.942267295316625e-06, "loss": 0.6331, "step": 990 }, { "epoch": 0.14524328249818447, "grad_norm": 1.32511584393411, "learning_rate": 9.938362359739068e-06, "loss": 0.626, "step": 1000 }, { "epoch": 0.1466957153231663, "grad_norm": 1.3291454266011833, "learning_rate": 9.934330466489414e-06, "loss": 0.6451, "step": 1010 }, { "epoch": 0.14814814814814814, "grad_norm": 1.3289648153139675, "learning_rate": 9.930171719221418e-06, "loss": 0.6333, "step": 1020 }, { "epoch": 0.14960058097313, "grad_norm": 1.3388955314518605, "learning_rate": 9.925886224850047e-06, "loss": 0.6329, "step": 1030 }, { "epoch": 0.15105301379811184, "grad_norm": 1.3788458990043229, "learning_rate": 9.921474093548748e-06, "loss": 0.6308, "step": 1040 }, { "epoch": 0.15250544662309368, "grad_norm": 1.2630947233952987, "learning_rate": 9.916935438746604e-06, "loss": 0.6366, "step": 1050 }, { "epoch": 0.1539578794480755, "grad_norm": 1.2586848110727198, "learning_rate": 9.912270377125424e-06, "loss": 0.6224, "step": 1060 }, { "epoch": 0.15541031227305738, "grad_norm": 1.5648142512317709, "learning_rate": 9.90747902861674e-06, "loss": 0.6261, "step": 1070 }, { "epoch": 0.1568627450980392, "grad_norm": 1.477705850244199, "learning_rate": 9.902561516398723e-06, "loss": 0.6207, "step": 1080 }, { "epoch": 0.15831517792302105, "grad_norm": 1.2950681154644361, "learning_rate": 9.897517966893023e-06, "loss": 0.6218, "step": 1090 }, { "epoch": 0.15976761074800291, "grad_norm": 1.4613516139089748, "learning_rate": 9.892348509761509e-06, "loss": 0.6237, "step": 1100 }, { "epoch": 0.16122004357298475, "grad_norm": 1.2641419484176866, "learning_rate": 9.887053277902943e-06, "loss": 0.6425, "step": 1110 }, { "epoch": 0.1626724763979666, "grad_norm": 1.2419109246681843, "learning_rate": 9.881632407449561e-06, "loss": 0.6423, "step": 1120 }, { "epoch": 0.16412490922294845, "grad_norm": 1.4096648257937974, "learning_rate": 9.876086037763575e-06, "loss": 0.6383, "step": 1130 }, { "epoch": 0.1655773420479303, "grad_norm": 1.2574892255736747, "learning_rate": 9.870414311433585e-06, "loss": 0.6059, "step": 1140 }, { "epoch": 0.16702977487291212, "grad_norm": 1.2716145459010044, "learning_rate": 9.86461737427092e-06, "loss": 0.6098, "step": 1150 }, { "epoch": 0.16848220769789396, "grad_norm": 1.1998298755084313, "learning_rate": 9.858695375305885e-06, "loss": 0.6214, "step": 1160 }, { "epoch": 0.16993464052287582, "grad_norm": 1.4281449888166444, "learning_rate": 9.852648466783927e-06, "loss": 0.6241, "step": 1170 }, { "epoch": 0.17138707334785766, "grad_norm": 1.4071764477667867, "learning_rate": 9.84647680416173e-06, "loss": 0.6474, "step": 1180 }, { "epoch": 0.1728395061728395, "grad_norm": 1.2174453861834778, "learning_rate": 9.840180546103215e-06, "loss": 0.6326, "step": 1190 }, { "epoch": 0.17429193899782136, "grad_norm": 1.3029300772595094, "learning_rate": 9.833759854475453e-06, "loss": 0.6185, "step": 1200 }, { "epoch": 0.1757443718228032, "grad_norm": 1.271112016193465, "learning_rate": 9.827214894344514e-06, "loss": 0.6301, "step": 1210 }, { "epoch": 0.17719680464778503, "grad_norm": 1.2997276991719462, "learning_rate": 9.82054583397122e-06, "loss": 0.6317, "step": 1220 }, { "epoch": 0.1786492374727669, "grad_norm": 1.2096030387104992, "learning_rate": 9.813752844806814e-06, "loss": 0.6159, "step": 1230 }, { "epoch": 0.18010167029774873, "grad_norm": 1.2973416257944899, "learning_rate": 9.806836101488561e-06, "loss": 0.6289, "step": 1240 }, { "epoch": 0.18155410312273057, "grad_norm": 1.3197440048632956, "learning_rate": 9.799795781835253e-06, "loss": 0.6088, "step": 1250 }, { "epoch": 0.1830065359477124, "grad_norm": 1.2535036782710556, "learning_rate": 9.79263206684264e-06, "loss": 0.6206, "step": 1260 }, { "epoch": 0.18445896877269427, "grad_norm": 1.3190252094745194, "learning_rate": 9.785345140678775e-06, "loss": 0.6149, "step": 1270 }, { "epoch": 0.1859114015976761, "grad_norm": 1.3148617882447478, "learning_rate": 9.777935190679277e-06, "loss": 0.6134, "step": 1280 }, { "epoch": 0.18736383442265794, "grad_norm": 1.3368521794263946, "learning_rate": 9.770402407342524e-06, "loss": 0.6258, "step": 1290 }, { "epoch": 0.1888162672476398, "grad_norm": 1.3941700458180073, "learning_rate": 9.762746984324743e-06, "loss": 0.6191, "step": 1300 }, { "epoch": 0.19026870007262164, "grad_norm": 1.3152403546822757, "learning_rate": 9.754969118435043e-06, "loss": 0.6446, "step": 1310 }, { "epoch": 0.19172113289760348, "grad_norm": 1.3013626770341264, "learning_rate": 9.747069009630347e-06, "loss": 0.6312, "step": 1320 }, { "epoch": 0.19317356572258534, "grad_norm": 1.3966383885583535, "learning_rate": 9.739046861010255e-06, "loss": 0.6207, "step": 1330 }, { "epoch": 0.19462599854756718, "grad_norm": 1.1439991746974036, "learning_rate": 9.730902878811825e-06, "loss": 0.6144, "step": 1340 }, { "epoch": 0.19607843137254902, "grad_norm": 1.3540894709055364, "learning_rate": 9.722637272404263e-06, "loss": 0.6044, "step": 1350 }, { "epoch": 0.19753086419753085, "grad_norm": 1.100639588271217, "learning_rate": 9.71425025428355e-06, "loss": 0.6036, "step": 1360 }, { "epoch": 0.19898329702251272, "grad_norm": 1.1874319432290736, "learning_rate": 9.705742040066977e-06, "loss": 0.6039, "step": 1370 }, { "epoch": 0.20043572984749455, "grad_norm": 1.1767671647303808, "learning_rate": 9.697112848487591e-06, "loss": 0.6376, "step": 1380 }, { "epoch": 0.2018881626724764, "grad_norm": 1.135879944041461, "learning_rate": 9.688362901388586e-06, "loss": 0.6035, "step": 1390 }, { "epoch": 0.20334059549745825, "grad_norm": 1.2315910796359388, "learning_rate": 9.679492423717596e-06, "loss": 0.6098, "step": 1400 }, { "epoch": 0.2047930283224401, "grad_norm": 1.4949408462288012, "learning_rate": 9.670501643520904e-06, "loss": 0.6203, "step": 1410 }, { "epoch": 0.20624546114742193, "grad_norm": 1.3180181445795711, "learning_rate": 9.66139079193759e-06, "loss": 0.6286, "step": 1420 }, { "epoch": 0.20769789397240376, "grad_norm": 1.2616556885045909, "learning_rate": 9.652160103193583e-06, "loss": 0.6274, "step": 1430 }, { "epoch": 0.20915032679738563, "grad_norm": 1.3174449455574337, "learning_rate": 9.642809814595637e-06, "loss": 0.6136, "step": 1440 }, { "epoch": 0.21060275962236746, "grad_norm": 1.296735377133819, "learning_rate": 9.633340166525238e-06, "loss": 0.6145, "step": 1450 }, { "epoch": 0.2120551924473493, "grad_norm": 1.2502497833244608, "learning_rate": 9.62375140243242e-06, "loss": 0.6031, "step": 1460 }, { "epoch": 0.21350762527233116, "grad_norm": 1.2288830705505374, "learning_rate": 9.6140437688295e-06, "loss": 0.6128, "step": 1470 }, { "epoch": 0.214960058097313, "grad_norm": 1.1119473380240397, "learning_rate": 9.604217515284753e-06, "loss": 0.6171, "step": 1480 }, { "epoch": 0.21641249092229484, "grad_norm": 1.2070397164389806, "learning_rate": 9.594272894415986e-06, "loss": 0.6238, "step": 1490 }, { "epoch": 0.2178649237472767, "grad_norm": 1.3345637205372078, "learning_rate": 9.584210161884049e-06, "loss": 0.6163, "step": 1500 }, { "epoch": 0.21931735657225854, "grad_norm": 1.1385043759036517, "learning_rate": 9.57402957638626e-06, "loss": 0.6083, "step": 1510 }, { "epoch": 0.22076978939724037, "grad_norm": 1.1936988121465326, "learning_rate": 9.563731399649756e-06, "loss": 0.5992, "step": 1520 }, { "epoch": 0.2222222222222222, "grad_norm": 1.4103572503621762, "learning_rate": 9.553315896424758e-06, "loss": 0.6054, "step": 1530 }, { "epoch": 0.22367465504720407, "grad_norm": 1.3209719950503893, "learning_rate": 9.54278333447778e-06, "loss": 0.596, "step": 1540 }, { "epoch": 0.2251270878721859, "grad_norm": 1.1693016501696898, "learning_rate": 9.532133984584721e-06, "loss": 0.6323, "step": 1550 }, { "epoch": 0.22657952069716775, "grad_norm": 1.1691510921859125, "learning_rate": 9.521368120523931e-06, "loss": 0.6027, "step": 1560 }, { "epoch": 0.2280319535221496, "grad_norm": 1.2114364957172101, "learning_rate": 9.510486019069154e-06, "loss": 0.6245, "step": 1570 }, { "epoch": 0.22948438634713145, "grad_norm": 1.265123327235345, "learning_rate": 9.499487959982415e-06, "loss": 0.6189, "step": 1580 }, { "epoch": 0.23093681917211328, "grad_norm": 1.3773059483594046, "learning_rate": 9.488374226006836e-06, "loss": 0.6106, "step": 1590 }, { "epoch": 0.23238925199709515, "grad_norm": 1.2737618179619303, "learning_rate": 9.477145102859357e-06, "loss": 0.6115, "step": 1600 }, { "epoch": 0.23384168482207698, "grad_norm": 1.3066121502077, "learning_rate": 9.4658008792234e-06, "loss": 0.609, "step": 1610 }, { "epoch": 0.23529411764705882, "grad_norm": 1.242518893517758, "learning_rate": 9.45434184674144e-06, "loss": 0.6, "step": 1620 }, { "epoch": 0.23674655047204066, "grad_norm": 1.2493334973003818, "learning_rate": 9.442768300007511e-06, "loss": 0.6144, "step": 1630 }, { "epoch": 0.23819898329702252, "grad_norm": 1.2775874117960886, "learning_rate": 9.431080536559631e-06, "loss": 0.6245, "step": 1640 }, { "epoch": 0.23965141612200436, "grad_norm": 1.247039996382283, "learning_rate": 9.419278856872154e-06, "loss": 0.6279, "step": 1650 }, { "epoch": 0.2411038489469862, "grad_norm": 1.302601682600637, "learning_rate": 9.407363564348047e-06, "loss": 0.5933, "step": 1660 }, { "epoch": 0.24255628177196806, "grad_norm": 1.431347455463815, "learning_rate": 9.39533496531108e-06, "loss": 0.6171, "step": 1670 }, { "epoch": 0.2440087145969499, "grad_norm": 1.2527655662771335, "learning_rate": 9.38319336899797e-06, "loss": 0.6099, "step": 1680 }, { "epoch": 0.24546114742193173, "grad_norm": 1.205551788839019, "learning_rate": 9.370939087550407e-06, "loss": 0.6077, "step": 1690 }, { "epoch": 0.24691358024691357, "grad_norm": 1.332981320431861, "learning_rate": 9.358572436007052e-06, "loss": 0.6126, "step": 1700 }, { "epoch": 0.24836601307189543, "grad_norm": 1.2112905977700383, "learning_rate": 9.346093732295422e-06, "loss": 0.6141, "step": 1710 }, { "epoch": 0.24981844589687727, "grad_norm": 1.1741115783770129, "learning_rate": 9.333503297223725e-06, "loss": 0.5977, "step": 1720 }, { "epoch": 0.2512708787218591, "grad_norm": 1.2308239868942004, "learning_rate": 9.320801454472607e-06, "loss": 0.6213, "step": 1730 }, { "epoch": 0.25272331154684097, "grad_norm": 1.3933258283474292, "learning_rate": 9.30798853058684e-06, "loss": 0.6217, "step": 1740 }, { "epoch": 0.2541757443718228, "grad_norm": 1.2467959691205432, "learning_rate": 9.29506485496691e-06, "loss": 0.6089, "step": 1750 }, { "epoch": 0.25562817719680464, "grad_norm": 1.106847677662664, "learning_rate": 9.282030759860566e-06, "loss": 0.6113, "step": 1760 }, { "epoch": 0.2570806100217865, "grad_norm": 1.225606521070107, "learning_rate": 9.268886580354272e-06, "loss": 0.6041, "step": 1770 }, { "epoch": 0.2585330428467683, "grad_norm": 1.1249241718792773, "learning_rate": 9.255632654364591e-06, "loss": 0.6112, "step": 1780 }, { "epoch": 0.2599854756717502, "grad_norm": 1.2347205288363368, "learning_rate": 9.242269322629494e-06, "loss": 0.6003, "step": 1790 }, { "epoch": 0.26143790849673204, "grad_norm": 1.3040805105750026, "learning_rate": 9.228796928699613e-06, "loss": 0.6187, "step": 1800 }, { "epoch": 0.26289034132171385, "grad_norm": 1.4585670240799034, "learning_rate": 9.215215818929392e-06, "loss": 0.612, "step": 1810 }, { "epoch": 0.2643427741466957, "grad_norm": 1.0974130075617774, "learning_rate": 9.201526342468202e-06, "loss": 0.6124, "step": 1820 }, { "epoch": 0.2657952069716776, "grad_norm": 1.2918051377461068, "learning_rate": 9.18772885125134e-06, "loss": 0.6055, "step": 1830 }, { "epoch": 0.2672476397966594, "grad_norm": 1.199609927095931, "learning_rate": 9.17382369999101e-06, "loss": 0.6086, "step": 1840 }, { "epoch": 0.26870007262164125, "grad_norm": 1.2736244478450063, "learning_rate": 9.159811246167182e-06, "loss": 0.6111, "step": 1850 }, { "epoch": 0.2701525054466231, "grad_norm": 1.2484696326393374, "learning_rate": 9.14569185001841e-06, "loss": 0.5951, "step": 1860 }, { "epoch": 0.2716049382716049, "grad_norm": 1.3221301583704237, "learning_rate": 9.131465874532568e-06, "loss": 0.5861, "step": 1870 }, { "epoch": 0.2730573710965868, "grad_norm": 1.2578322361866867, "learning_rate": 9.117133685437524e-06, "loss": 0.6073, "step": 1880 }, { "epoch": 0.27450980392156865, "grad_norm": 1.3260698149158467, "learning_rate": 9.102695651191737e-06, "loss": 0.5838, "step": 1890 }, { "epoch": 0.27596223674655046, "grad_norm": 1.2373193794097532, "learning_rate": 9.088152142974771e-06, "loss": 0.6013, "step": 1900 }, { "epoch": 0.2774146695715323, "grad_norm": 1.1997047870357698, "learning_rate": 9.073503534677773e-06, "loss": 0.6219, "step": 1910 }, { "epoch": 0.2788671023965142, "grad_norm": 1.2769112952981858, "learning_rate": 9.058750202893844e-06, "loss": 0.6052, "step": 1920 }, { "epoch": 0.280319535221496, "grad_norm": 1.2302296498321919, "learning_rate": 9.04389252690837e-06, "loss": 0.6124, "step": 1930 }, { "epoch": 0.28177196804647786, "grad_norm": 1.2009594091858158, "learning_rate": 9.02893088868926e-06, "loss": 0.604, "step": 1940 }, { "epoch": 0.28322440087145967, "grad_norm": 1.0539872600155336, "learning_rate": 9.013865672877133e-06, "loss": 0.6052, "step": 1950 }, { "epoch": 0.28467683369644153, "grad_norm": 1.2561895098497668, "learning_rate": 8.998697266775433e-06, "loss": 0.6077, "step": 1960 }, { "epoch": 0.2861292665214234, "grad_norm": 1.2763583417414128, "learning_rate": 8.98342606034046e-06, "loss": 0.6059, "step": 1970 }, { "epoch": 0.2875816993464052, "grad_norm": 1.1463184995763767, "learning_rate": 8.96805244617135e-06, "loss": 0.6183, "step": 1980 }, { "epoch": 0.28903413217138707, "grad_norm": 1.1421597790792624, "learning_rate": 8.952576819499998e-06, "loss": 0.602, "step": 1990 }, { "epoch": 0.29048656499636893, "grad_norm": 1.3046866547593934, "learning_rate": 8.93699957818087e-06, "loss": 0.5925, "step": 2000 }, { "epoch": 0.29193899782135074, "grad_norm": 1.27239619384718, "learning_rate": 8.921321122680789e-06, "loss": 0.6037, "step": 2010 }, { "epoch": 0.2933914306463326, "grad_norm": 1.3073284462474046, "learning_rate": 8.905541856068641e-06, "loss": 0.6077, "step": 2020 }, { "epoch": 0.29484386347131447, "grad_norm": 1.2694028140938955, "learning_rate": 8.889662184005007e-06, "loss": 0.6076, "step": 2030 }, { "epoch": 0.2962962962962963, "grad_norm": 1.1075058528848678, "learning_rate": 8.873682514731746e-06, "loss": 0.5986, "step": 2040 }, { "epoch": 0.29774872912127814, "grad_norm": 1.25011183641691, "learning_rate": 8.85760325906148e-06, "loss": 0.5911, "step": 2050 }, { "epoch": 0.29920116194626, "grad_norm": 1.230690665069067, "learning_rate": 8.841424830367051e-06, "loss": 0.5918, "step": 2060 }, { "epoch": 0.3006535947712418, "grad_norm": 1.2143851276582127, "learning_rate": 8.82514764457088e-06, "loss": 0.6026, "step": 2070 }, { "epoch": 0.3021060275962237, "grad_norm": 1.1711415813258073, "learning_rate": 8.808772120134286e-06, "loss": 0.6208, "step": 2080 }, { "epoch": 0.30355846042120554, "grad_norm": 1.2105658122447378, "learning_rate": 8.79229867804672e-06, "loss": 0.6178, "step": 2090 }, { "epoch": 0.30501089324618735, "grad_norm": 1.260614604486508, "learning_rate": 8.775727741814945e-06, "loss": 0.6033, "step": 2100 }, { "epoch": 0.3064633260711692, "grad_norm": 1.1949196588242055, "learning_rate": 8.75905973745215e-06, "loss": 0.5954, "step": 2110 }, { "epoch": 0.307915758896151, "grad_norm": 1.2358431757504627, "learning_rate": 8.742295093466993e-06, "loss": 0.5929, "step": 2120 }, { "epoch": 0.3093681917211329, "grad_norm": 1.1788915626896657, "learning_rate": 8.725434240852586e-06, "loss": 0.6014, "step": 2130 }, { "epoch": 0.31082062454611475, "grad_norm": 1.2899429468502281, "learning_rate": 8.708477613075422e-06, "loss": 0.588, "step": 2140 }, { "epoch": 0.31227305737109656, "grad_norm": 1.0436767601630443, "learning_rate": 8.691425646064222e-06, "loss": 0.6128, "step": 2150 }, { "epoch": 0.3137254901960784, "grad_norm": 1.1823668694466984, "learning_rate": 8.674278778198731e-06, "loss": 0.5939, "step": 2160 }, { "epoch": 0.3151779230210603, "grad_norm": 1.2287777612088193, "learning_rate": 8.657037450298449e-06, "loss": 0.5942, "step": 2170 }, { "epoch": 0.3166303558460421, "grad_norm": 1.1210160142803036, "learning_rate": 8.6397021056113e-06, "loss": 0.6068, "step": 2180 }, { "epoch": 0.31808278867102396, "grad_norm": 1.176574092958882, "learning_rate": 8.622273189802231e-06, "loss": 0.6099, "step": 2190 }, { "epoch": 0.31953522149600583, "grad_norm": 1.2276623152067967, "learning_rate": 8.604751150941758e-06, "loss": 0.598, "step": 2200 }, { "epoch": 0.32098765432098764, "grad_norm": 1.2049029589388036, "learning_rate": 8.58713643949445e-06, "loss": 0.5934, "step": 2210 }, { "epoch": 0.3224400871459695, "grad_norm": 1.2650704032924422, "learning_rate": 8.569429508307345e-06, "loss": 0.6039, "step": 2220 }, { "epoch": 0.32389251997095136, "grad_norm": 1.088534753663297, "learning_rate": 8.551630812598303e-06, "loss": 0.6038, "step": 2230 }, { "epoch": 0.3253449527959332, "grad_norm": 1.1678210415173849, "learning_rate": 8.533740809944317e-06, "loss": 0.6084, "step": 2240 }, { "epoch": 0.32679738562091504, "grad_norm": 1.251355519441971, "learning_rate": 8.515759960269731e-06, "loss": 0.5975, "step": 2250 }, { "epoch": 0.3282498184458969, "grad_norm": 1.1662322522769242, "learning_rate": 8.497688725834432e-06, "loss": 0.6106, "step": 2260 }, { "epoch": 0.3297022512708787, "grad_norm": 1.336372713961502, "learning_rate": 8.479527571221957e-06, "loss": 0.6224, "step": 2270 }, { "epoch": 0.3311546840958606, "grad_norm": 1.148371532122775, "learning_rate": 8.461276963327555e-06, "loss": 0.607, "step": 2280 }, { "epoch": 0.33260711692084244, "grad_norm": 1.3691981401078914, "learning_rate": 8.442937371346174e-06, "loss": 0.6001, "step": 2290 }, { "epoch": 0.33405954974582425, "grad_norm": 1.3343569533197541, "learning_rate": 8.424509266760413e-06, "loss": 0.6009, "step": 2300 }, { "epoch": 0.3355119825708061, "grad_norm": 1.0903008241967769, "learning_rate": 8.405993123328388e-06, "loss": 0.5852, "step": 2310 }, { "epoch": 0.3369644153957879, "grad_norm": 1.2770798153391716, "learning_rate": 8.387389417071565e-06, "loss": 0.5967, "step": 2320 }, { "epoch": 0.3384168482207698, "grad_norm": 1.1893611624135727, "learning_rate": 8.368698626262506e-06, "loss": 0.5906, "step": 2330 }, { "epoch": 0.33986928104575165, "grad_norm": 1.1182656055274527, "learning_rate": 8.349921231412588e-06, "loss": 0.6144, "step": 2340 }, { "epoch": 0.34132171387073346, "grad_norm": 1.1569225334439495, "learning_rate": 8.331057715259643e-06, "loss": 0.5945, "step": 2350 }, { "epoch": 0.3427741466957153, "grad_norm": 1.0553585361032343, "learning_rate": 8.312108562755547e-06, "loss": 0.6012, "step": 2360 }, { "epoch": 0.3442265795206972, "grad_norm": 1.0429439932782214, "learning_rate": 8.29307426105376e-06, "loss": 0.602, "step": 2370 }, { "epoch": 0.345679012345679, "grad_norm": 1.0397368512389722, "learning_rate": 8.273955299496787e-06, "loss": 0.5932, "step": 2380 }, { "epoch": 0.34713144517066086, "grad_norm": 1.0989788243486265, "learning_rate": 8.254752169603614e-06, "loss": 0.5987, "step": 2390 }, { "epoch": 0.3485838779956427, "grad_norm": 1.2513128657031618, "learning_rate": 8.235465365057067e-06, "loss": 0.597, "step": 2400 }, { "epoch": 0.35003631082062453, "grad_norm": 1.2696804086094644, "learning_rate": 8.21609538169111e-06, "loss": 0.5962, "step": 2410 }, { "epoch": 0.3514887436456064, "grad_norm": 1.3765675743894579, "learning_rate": 8.196642717478113e-06, "loss": 0.6083, "step": 2420 }, { "epoch": 0.35294117647058826, "grad_norm": 1.1525716644685924, "learning_rate": 8.177107872516041e-06, "loss": 0.5912, "step": 2430 }, { "epoch": 0.35439360929557007, "grad_norm": 1.1930516036081553, "learning_rate": 8.157491349015599e-06, "loss": 0.601, "step": 2440 }, { "epoch": 0.35584604212055193, "grad_norm": 1.3453249916774477, "learning_rate": 8.137793651287317e-06, "loss": 0.62, "step": 2450 }, { "epoch": 0.3572984749455338, "grad_norm": 1.216543063547056, "learning_rate": 8.118015285728598e-06, "loss": 0.6037, "step": 2460 }, { "epoch": 0.3587509077705156, "grad_norm": 1.129394528084983, "learning_rate": 8.098156760810683e-06, "loss": 0.598, "step": 2470 }, { "epoch": 0.36020334059549747, "grad_norm": 1.124156367954234, "learning_rate": 8.078218587065589e-06, "loss": 0.5813, "step": 2480 }, { "epoch": 0.3616557734204793, "grad_norm": 1.2039082584679666, "learning_rate": 8.058201277072981e-06, "loss": 0.5876, "step": 2490 }, { "epoch": 0.36310820624546114, "grad_norm": 1.1919842026488203, "learning_rate": 8.038105345446994e-06, "loss": 0.6115, "step": 2500 }, { "epoch": 0.364560639070443, "grad_norm": 1.2851968482663827, "learning_rate": 8.017931308823006e-06, "loss": 0.592, "step": 2510 }, { "epoch": 0.3660130718954248, "grad_norm": 1.1538243634302991, "learning_rate": 7.997679685844353e-06, "loss": 0.5867, "step": 2520 }, { "epoch": 0.3674655047204067, "grad_norm": 1.0704432112589999, "learning_rate": 7.977350997148994e-06, "loss": 0.6007, "step": 2530 }, { "epoch": 0.36891793754538854, "grad_norm": 1.2707334756597408, "learning_rate": 7.956945765356133e-06, "loss": 0.5746, "step": 2540 }, { "epoch": 0.37037037037037035, "grad_norm": 1.2061421625898763, "learning_rate": 7.936464515052776e-06, "loss": 0.601, "step": 2550 }, { "epoch": 0.3718228031953522, "grad_norm": 1.318015728266432, "learning_rate": 7.915907772780244e-06, "loss": 0.6081, "step": 2560 }, { "epoch": 0.3732752360203341, "grad_norm": 1.253197445356757, "learning_rate": 7.89527606702065e-06, "loss": 0.6046, "step": 2570 }, { "epoch": 0.3747276688453159, "grad_norm": 1.190199765539676, "learning_rate": 7.87456992818329e-06, "loss": 0.5986, "step": 2580 }, { "epoch": 0.37618010167029775, "grad_norm": 1.193398450040499, "learning_rate": 7.853789888591032e-06, "loss": 0.5889, "step": 2590 }, { "epoch": 0.3776325344952796, "grad_norm": 1.035053671117003, "learning_rate": 7.832936482466612e-06, "loss": 0.5934, "step": 2600 }, { "epoch": 0.3790849673202614, "grad_norm": 1.1386993400574172, "learning_rate": 7.812010245918903e-06, "loss": 0.586, "step": 2610 }, { "epoch": 0.3805374001452433, "grad_norm": 1.1022458257608025, "learning_rate": 7.79101171692914e-06, "loss": 0.5806, "step": 2620 }, { "epoch": 0.38198983297022515, "grad_norm": 1.1758543851880188, "learning_rate": 7.769941435337083e-06, "loss": 0.5618, "step": 2630 }, { "epoch": 0.38344226579520696, "grad_norm": 1.2426818455480244, "learning_rate": 7.748799942827147e-06, "loss": 0.6012, "step": 2640 }, { "epoch": 0.3848946986201888, "grad_norm": 1.0718204571931684, "learning_rate": 7.72758778291446e-06, "loss": 0.5887, "step": 2650 }, { "epoch": 0.3863471314451707, "grad_norm": 1.0289005823465374, "learning_rate": 7.706305500930909e-06, "loss": 0.6037, "step": 2660 }, { "epoch": 0.3877995642701525, "grad_norm": 1.2478985029233107, "learning_rate": 7.684953644011103e-06, "loss": 0.584, "step": 2670 }, { "epoch": 0.38925199709513436, "grad_norm": 1.1066991243562059, "learning_rate": 7.66353276107832e-06, "loss": 0.6007, "step": 2680 }, { "epoch": 0.39070442992011617, "grad_norm": 1.2345614999374477, "learning_rate": 7.64204340283039e-06, "loss": 0.6033, "step": 2690 }, { "epoch": 0.39215686274509803, "grad_norm": 1.0798799696274017, "learning_rate": 7.620486121725536e-06, "loss": 0.59, "step": 2700 }, { "epoch": 0.3936092955700799, "grad_norm": 1.1600968806836478, "learning_rate": 7.598861471968174e-06, "loss": 0.5948, "step": 2710 }, { "epoch": 0.3950617283950617, "grad_norm": 1.1860847221048887, "learning_rate": 7.577170009494665e-06, "loss": 0.5981, "step": 2720 }, { "epoch": 0.39651416122004357, "grad_norm": 1.0670434364146835, "learning_rate": 7.555412291959018e-06, "loss": 0.5772, "step": 2730 }, { "epoch": 0.39796659404502543, "grad_norm": 1.1865817610815497, "learning_rate": 7.533588878718561e-06, "loss": 0.584, "step": 2740 }, { "epoch": 0.39941902687000724, "grad_norm": 1.2092053148497965, "learning_rate": 7.511700330819556e-06, "loss": 0.5832, "step": 2750 }, { "epoch": 0.4008714596949891, "grad_norm": 1.1770338237370501, "learning_rate": 7.489747210982777e-06, "loss": 0.5984, "step": 2760 }, { "epoch": 0.40232389251997097, "grad_norm": 1.1434774901575833, "learning_rate": 7.4677300835890424e-06, "loss": 0.5755, "step": 2770 }, { "epoch": 0.4037763253449528, "grad_norm": 1.0366368031771818, "learning_rate": 7.445649514664703e-06, "loss": 0.5886, "step": 2780 }, { "epoch": 0.40522875816993464, "grad_norm": 1.2729396302065998, "learning_rate": 7.423506071867101e-06, "loss": 0.6134, "step": 2790 }, { "epoch": 0.4066811909949165, "grad_norm": 1.0518352889412923, "learning_rate": 7.401300324469961e-06, "loss": 0.5737, "step": 2800 }, { "epoch": 0.4081336238198983, "grad_norm": 1.2001944481237583, "learning_rate": 7.3790328433487665e-06, "loss": 0.5874, "step": 2810 }, { "epoch": 0.4095860566448802, "grad_norm": 1.250231920993964, "learning_rate": 7.3567042009660786e-06, "loss": 0.5862, "step": 2820 }, { "epoch": 0.41103848946986205, "grad_norm": 1.1512872210708966, "learning_rate": 7.3343149713568215e-06, "loss": 0.593, "step": 2830 }, { "epoch": 0.41249092229484385, "grad_norm": 1.1605256860138091, "learning_rate": 7.311865730113525e-06, "loss": 0.5939, "step": 2840 }, { "epoch": 0.4139433551198257, "grad_norm": 1.3940208410225592, "learning_rate": 7.2893570543715174e-06, "loss": 0.6028, "step": 2850 }, { "epoch": 0.4153957879448075, "grad_norm": 1.1976078557092422, "learning_rate": 7.266789522794104e-06, "loss": 0.6065, "step": 2860 }, { "epoch": 0.4168482207697894, "grad_norm": 1.035110243445679, "learning_rate": 7.244163715557683e-06, "loss": 0.5915, "step": 2870 }, { "epoch": 0.41830065359477125, "grad_norm": 1.1865073190747897, "learning_rate": 7.2214802143368225e-06, "loss": 0.5961, "step": 2880 }, { "epoch": 0.41975308641975306, "grad_norm": 1.0991372561424138, "learning_rate": 7.1987396022893216e-06, "loss": 0.5857, "step": 2890 }, { "epoch": 0.4212055192447349, "grad_norm": 1.0801243737112538, "learning_rate": 7.175942464041209e-06, "loss": 0.5829, "step": 2900 }, { "epoch": 0.4226579520697168, "grad_norm": 1.3295568712189132, "learning_rate": 7.15308938567171e-06, "loss": 0.5869, "step": 2910 }, { "epoch": 0.4241103848946986, "grad_norm": 1.0402363831702612, "learning_rate": 7.130180954698187e-06, "loss": 0.5842, "step": 2920 }, { "epoch": 0.42556281771968046, "grad_norm": 1.1031276144488775, "learning_rate": 7.107217760061036e-06, "loss": 0.5923, "step": 2930 }, { "epoch": 0.42701525054466233, "grad_norm": 1.183086396688286, "learning_rate": 7.0842003921085376e-06, "loss": 0.6053, "step": 2940 }, { "epoch": 0.42846768336964414, "grad_norm": 1.244303339507363, "learning_rate": 7.061129442581685e-06, "loss": 0.5924, "step": 2950 }, { "epoch": 0.429920116194626, "grad_norm": 1.2478572360385807, "learning_rate": 7.038005504598975e-06, "loss": 0.5922, "step": 2960 }, { "epoch": 0.43137254901960786, "grad_norm": 1.0447681879549313, "learning_rate": 7.0148291726411486e-06, "loss": 0.5825, "step": 2970 }, { "epoch": 0.4328249818445897, "grad_norm": 1.1025428022026995, "learning_rate": 6.9916010425359214e-06, "loss": 0.5956, "step": 2980 }, { "epoch": 0.43427741466957154, "grad_norm": 1.329010163267056, "learning_rate": 6.968321711442658e-06, "loss": 0.5772, "step": 2990 }, { "epoch": 0.4357298474945534, "grad_norm": 1.2330587975332181, "learning_rate": 6.9449917778370216e-06, "loss": 0.5933, "step": 3000 }, { "epoch": 0.4371822803195352, "grad_norm": 1.1656344009683823, "learning_rate": 6.921611841495584e-06, "loss": 0.5922, "step": 3010 }, { "epoch": 0.4386347131445171, "grad_norm": 1.2709734185927093, "learning_rate": 6.898182503480414e-06, "loss": 0.5911, "step": 3020 }, { "epoch": 0.4400871459694989, "grad_norm": 1.269770194129687, "learning_rate": 6.8747043661236215e-06, "loss": 0.6103, "step": 3030 }, { "epoch": 0.44153957879448075, "grad_norm": 1.106713465551905, "learning_rate": 6.851178033011869e-06, "loss": 0.5997, "step": 3040 }, { "epoch": 0.4429920116194626, "grad_norm": 1.1985970638971495, "learning_rate": 6.82760410897086e-06, "loss": 0.5727, "step": 3050 }, { "epoch": 0.4444444444444444, "grad_norm": 1.1259472634689607, "learning_rate": 6.8039832000497865e-06, "loss": 0.5983, "step": 3060 }, { "epoch": 0.4458968772694263, "grad_norm": 1.212189906596056, "learning_rate": 6.78031591350575e-06, "loss": 0.5958, "step": 3070 }, { "epoch": 0.44734931009440815, "grad_norm": 1.0999728539824523, "learning_rate": 6.756602857788148e-06, "loss": 0.5717, "step": 3080 }, { "epoch": 0.44880174291938996, "grad_norm": 1.1130187014726358, "learning_rate": 6.732844642523032e-06, "loss": 0.5793, "step": 3090 }, { "epoch": 0.4502541757443718, "grad_norm": 1.075132513625087, "learning_rate": 6.70904187849744e-06, "loss": 0.562, "step": 3100 }, { "epoch": 0.4517066085693537, "grad_norm": 1.2147850552839328, "learning_rate": 6.685195177643684e-06, "loss": 0.5978, "step": 3110 }, { "epoch": 0.4531590413943355, "grad_norm": 1.2836246837826484, "learning_rate": 6.661305153023628e-06, "loss": 0.5912, "step": 3120 }, { "epoch": 0.45461147421931736, "grad_norm": 1.1766776836268427, "learning_rate": 6.637372418812921e-06, "loss": 0.586, "step": 3130 }, { "epoch": 0.4560639070442992, "grad_norm": 1.3613669267848012, "learning_rate": 6.613397590285211e-06, "loss": 0.5998, "step": 3140 }, { "epoch": 0.45751633986928103, "grad_norm": 1.2051701552338834, "learning_rate": 6.589381283796325e-06, "loss": 0.5812, "step": 3150 }, { "epoch": 0.4589687726942629, "grad_norm": 1.1519365736041338, "learning_rate": 6.565324116768428e-06, "loss": 0.583, "step": 3160 }, { "epoch": 0.46042120551924476, "grad_norm": 1.1475917123110242, "learning_rate": 6.54122670767414e-06, "loss": 0.5765, "step": 3170 }, { "epoch": 0.46187363834422657, "grad_norm": 1.088676956077236, "learning_rate": 6.517089676020648e-06, "loss": 0.5997, "step": 3180 }, { "epoch": 0.46332607116920843, "grad_norm": 1.1195203213303881, "learning_rate": 6.492913642333768e-06, "loss": 0.565, "step": 3190 }, { "epoch": 0.4647785039941903, "grad_norm": 1.0927178103796473, "learning_rate": 6.468699228142004e-06, "loss": 0.5988, "step": 3200 }, { "epoch": 0.4662309368191721, "grad_norm": 1.1180323598233408, "learning_rate": 6.444447055960559e-06, "loss": 0.6034, "step": 3210 }, { "epoch": 0.46768336964415397, "grad_norm": 1.1581218721076667, "learning_rate": 6.420157749275341e-06, "loss": 0.5792, "step": 3220 }, { "epoch": 0.4691358024691358, "grad_norm": 1.2355006071990586, "learning_rate": 6.395831932526924e-06, "loss": 0.5914, "step": 3230 }, { "epoch": 0.47058823529411764, "grad_norm": 1.2628642644632941, "learning_rate": 6.371470231094498e-06, "loss": 0.5972, "step": 3240 }, { "epoch": 0.4720406681190995, "grad_norm": 1.30372441555249, "learning_rate": 6.3470732712798e-06, "loss": 0.5943, "step": 3250 }, { "epoch": 0.4734931009440813, "grad_norm": 1.2732465621842586, "learning_rate": 6.322641680290997e-06, "loss": 0.59, "step": 3260 }, { "epoch": 0.4749455337690632, "grad_norm": 1.1957460012906904, "learning_rate": 6.298176086226577e-06, "loss": 0.5908, "step": 3270 }, { "epoch": 0.47639796659404504, "grad_norm": 1.2666436895215651, "learning_rate": 6.273677118059192e-06, "loss": 0.579, "step": 3280 }, { "epoch": 0.47785039941902685, "grad_norm": 1.1740612442844354, "learning_rate": 6.24914540561949e-06, "loss": 0.5849, "step": 3290 }, { "epoch": 0.4793028322440087, "grad_norm": 1.170368029656733, "learning_rate": 6.2245815795799235e-06, "loss": 0.5914, "step": 3300 }, { "epoch": 0.4807552650689906, "grad_norm": 1.060432274782722, "learning_rate": 6.199986271438536e-06, "loss": 0.5692, "step": 3310 }, { "epoch": 0.4822076978939724, "grad_norm": 1.133481629336483, "learning_rate": 6.17536011350273e-06, "loss": 0.5789, "step": 3320 }, { "epoch": 0.48366013071895425, "grad_norm": 1.0779584839433474, "learning_rate": 6.150703738873004e-06, "loss": 0.5815, "step": 3330 }, { "epoch": 0.4851125635439361, "grad_norm": 1.138478981177591, "learning_rate": 6.1260177814266855e-06, "loss": 0.5754, "step": 3340 }, { "epoch": 0.4865649963689179, "grad_norm": 1.1290987276585867, "learning_rate": 6.101302875801628e-06, "loss": 0.5778, "step": 3350 }, { "epoch": 0.4880174291938998, "grad_norm": 1.1468009205478524, "learning_rate": 6.0765596573798994e-06, "loss": 0.5689, "step": 3360 }, { "epoch": 0.48946986201888165, "grad_norm": 1.0683998313181482, "learning_rate": 6.051788762271442e-06, "loss": 0.5692, "step": 3370 }, { "epoch": 0.49092229484386346, "grad_norm": 1.1889646870467425, "learning_rate": 6.0269908272977295e-06, "loss": 0.5808, "step": 3380 }, { "epoch": 0.4923747276688453, "grad_norm": 1.2529890364621932, "learning_rate": 6.002166489975385e-06, "loss": 0.5772, "step": 3390 }, { "epoch": 0.49382716049382713, "grad_norm": 1.1925487080641164, "learning_rate": 5.977316388499794e-06, "loss": 0.5862, "step": 3400 }, { "epoch": 0.495279593318809, "grad_norm": 1.1372201366075154, "learning_rate": 5.952441161728701e-06, "loss": 0.5662, "step": 3410 }, { "epoch": 0.49673202614379086, "grad_norm": 1.2981299245914195, "learning_rate": 5.927541449165783e-06, "loss": 0.5682, "step": 3420 }, { "epoch": 0.49818445896877267, "grad_norm": 1.1198285033650917, "learning_rate": 5.902617890944207e-06, "loss": 0.5894, "step": 3430 }, { "epoch": 0.49963689179375453, "grad_norm": 1.1442459802118357, "learning_rate": 5.8776711278101765e-06, "loss": 0.5735, "step": 3440 }, { "epoch": 0.5010893246187363, "grad_norm": 1.10045421098352, "learning_rate": 5.852701801106458e-06, "loss": 0.5838, "step": 3450 }, { "epoch": 0.5025417574437182, "grad_norm": 1.1675311387395517, "learning_rate": 5.82771055275589e-06, "loss": 0.5847, "step": 3460 }, { "epoch": 0.5039941902687001, "grad_norm": 1.0028532762834719, "learning_rate": 5.802698025244886e-06, "loss": 0.5656, "step": 3470 }, { "epoch": 0.5054466230936819, "grad_norm": 1.028656973511835, "learning_rate": 5.777664861606912e-06, "loss": 0.5871, "step": 3480 }, { "epoch": 0.5068990559186638, "grad_norm": 1.2007383871296113, "learning_rate": 5.752611705405957e-06, "loss": 0.5895, "step": 3490 }, { "epoch": 0.5083514887436456, "grad_norm": 1.1281898149999334, "learning_rate": 5.7275392007199896e-06, "loss": 0.573, "step": 3500 }, { "epoch": 0.5098039215686274, "grad_norm": 1.282146433020574, "learning_rate": 5.702447992124394e-06, "loss": 0.57, "step": 3510 }, { "epoch": 0.5112563543936093, "grad_norm": 1.05801689608913, "learning_rate": 5.677338724675406e-06, "loss": 0.5751, "step": 3520 }, { "epoch": 0.5127087872185911, "grad_norm": 1.2511793245069922, "learning_rate": 5.652212043893528e-06, "loss": 0.5805, "step": 3530 }, { "epoch": 0.514161220043573, "grad_norm": 1.2496537928999953, "learning_rate": 5.627068595746931e-06, "loss": 0.5734, "step": 3540 }, { "epoch": 0.5156136528685549, "grad_norm": 1.0586939290192166, "learning_rate": 5.601909026634846e-06, "loss": 0.573, "step": 3550 }, { "epoch": 0.5170660856935366, "grad_norm": 1.2135072197108623, "learning_rate": 5.576733983370955e-06, "loss": 0.5696, "step": 3560 }, { "epoch": 0.5185185185185185, "grad_norm": 1.096951604322022, "learning_rate": 5.551544113166752e-06, "loss": 0.5764, "step": 3570 }, { "epoch": 0.5199709513435004, "grad_norm": 1.067656908278471, "learning_rate": 5.5263400636149104e-06, "loss": 0.5945, "step": 3580 }, { "epoch": 0.5214233841684822, "grad_norm": 1.2528345132805765, "learning_rate": 5.50112248267263e-06, "loss": 0.5698, "step": 3590 }, { "epoch": 0.5228758169934641, "grad_norm": 1.153586426579592, "learning_rate": 5.475892018644989e-06, "loss": 0.5939, "step": 3600 }, { "epoch": 0.524328249818446, "grad_norm": 1.321281822598792, "learning_rate": 5.450649320168263e-06, "loss": 0.5764, "step": 3610 }, { "epoch": 0.5257806826434277, "grad_norm": 1.1546247883125684, "learning_rate": 5.4253950361932565e-06, "loss": 0.5698, "step": 3620 }, { "epoch": 0.5272331154684096, "grad_norm": 1.3090075714265825, "learning_rate": 5.400129815968623e-06, "loss": 0.58, "step": 3630 }, { "epoch": 0.5286855482933914, "grad_norm": 1.3546772950978652, "learning_rate": 5.374854309024167e-06, "loss": 0.5906, "step": 3640 }, { "epoch": 0.5301379811183733, "grad_norm": 1.0728126839197956, "learning_rate": 5.349569165154153e-06, "loss": 0.5617, "step": 3650 }, { "epoch": 0.5315904139433552, "grad_norm": 1.0481388119854531, "learning_rate": 5.32427503440059e-06, "loss": 0.5752, "step": 3660 }, { "epoch": 0.533042846768337, "grad_norm": 1.251734474368655, "learning_rate": 5.29897256703653e-06, "loss": 0.577, "step": 3670 }, { "epoch": 0.5344952795933188, "grad_norm": 1.1273771235496188, "learning_rate": 5.2736624135493465e-06, "loss": 0.5604, "step": 3680 }, { "epoch": 0.5359477124183006, "grad_norm": 1.1728285082039356, "learning_rate": 5.248345224624007e-06, "loss": 0.5799, "step": 3690 }, { "epoch": 0.5374001452432825, "grad_norm": 1.1207082347004158, "learning_rate": 5.223021651126356e-06, "loss": 0.5792, "step": 3700 }, { "epoch": 0.5388525780682644, "grad_norm": 1.096111126610637, "learning_rate": 5.197692344086369e-06, "loss": 0.582, "step": 3710 }, { "epoch": 0.5403050108932462, "grad_norm": 1.1432895144261512, "learning_rate": 5.172357954681427e-06, "loss": 0.5669, "step": 3720 }, { "epoch": 0.541757443718228, "grad_norm": 1.2795186578480655, "learning_rate": 5.147019134219569e-06, "loss": 0.5727, "step": 3730 }, { "epoch": 0.5432098765432098, "grad_norm": 1.1497619263404009, "learning_rate": 5.121676534122746e-06, "loss": 0.5665, "step": 3740 }, { "epoch": 0.5446623093681917, "grad_norm": 1.053760679670929, "learning_rate": 5.096330805910085e-06, "loss": 0.5758, "step": 3750 }, { "epoch": 0.5461147421931736, "grad_norm": 1.2455461930319618, "learning_rate": 5.0709826011811246e-06, "loss": 0.5715, "step": 3760 }, { "epoch": 0.5475671750181554, "grad_norm": 1.2714142743729588, "learning_rate": 5.045632571599076e-06, "loss": 0.5764, "step": 3770 }, { "epoch": 0.5490196078431373, "grad_norm": 1.2596602396359573, "learning_rate": 5.020281368874063e-06, "loss": 0.5777, "step": 3780 }, { "epoch": 0.5504720406681191, "grad_norm": 1.096076072807335, "learning_rate": 4.994929644746366e-06, "loss": 0.5752, "step": 3790 }, { "epoch": 0.5519244734931009, "grad_norm": 1.1180419407959938, "learning_rate": 4.969578050969675e-06, "loss": 0.5783, "step": 3800 }, { "epoch": 0.5533769063180828, "grad_norm": 1.1457632992717688, "learning_rate": 4.944227239294327e-06, "loss": 0.5706, "step": 3810 }, { "epoch": 0.5548293391430646, "grad_norm": 1.0431686309314605, "learning_rate": 4.918877861450553e-06, "loss": 0.5629, "step": 3820 }, { "epoch": 0.5562817719680465, "grad_norm": 1.1033442319502207, "learning_rate": 4.893530569131716e-06, "loss": 0.5611, "step": 3830 }, { "epoch": 0.5577342047930284, "grad_norm": 1.1929600913303742, "learning_rate": 4.8681860139775745e-06, "loss": 0.568, "step": 3840 }, { "epoch": 0.5591866376180101, "grad_norm": 1.281488846532093, "learning_rate": 4.842844847557508e-06, "loss": 0.5882, "step": 3850 }, { "epoch": 0.560639070442992, "grad_norm": 1.1195048036816224, "learning_rate": 4.817507721353785e-06, "loss": 0.596, "step": 3860 }, { "epoch": 0.5620915032679739, "grad_norm": 1.1077419816516767, "learning_rate": 4.792175286744802e-06, "loss": 0.5747, "step": 3870 }, { "epoch": 0.5635439360929557, "grad_norm": 1.3502747193694702, "learning_rate": 4.766848194988344e-06, "loss": 0.5915, "step": 3880 }, { "epoch": 0.5649963689179376, "grad_norm": 1.001203957804234, "learning_rate": 4.741527097204837e-06, "loss": 0.5732, "step": 3890 }, { "epoch": 0.5664488017429193, "grad_norm": 1.1428305709772093, "learning_rate": 4.7162126443606145e-06, "loss": 0.5682, "step": 3900 }, { "epoch": 0.5679012345679012, "grad_norm": 1.220191866232699, "learning_rate": 4.690905487251174e-06, "loss": 0.5695, "step": 3910 }, { "epoch": 0.5693536673928831, "grad_norm": 1.0555952997249456, "learning_rate": 4.665606276484455e-06, "loss": 0.5684, "step": 3920 }, { "epoch": 0.5708061002178649, "grad_norm": 1.1675138439049109, "learning_rate": 4.6403156624641085e-06, "loss": 0.5876, "step": 3930 }, { "epoch": 0.5722585330428468, "grad_norm": 1.2418849374572543, "learning_rate": 4.615034295372777e-06, "loss": 0.5838, "step": 3940 }, { "epoch": 0.5737109658678287, "grad_norm": 1.0616817293128535, "learning_rate": 4.589762825155374e-06, "loss": 0.57, "step": 3950 }, { "epoch": 0.5751633986928104, "grad_norm": 1.2414737852232787, "learning_rate": 4.564501901502386e-06, "loss": 0.5521, "step": 3960 }, { "epoch": 0.5766158315177923, "grad_norm": 1.0962764476368352, "learning_rate": 4.5392521738331585e-06, "loss": 0.5761, "step": 3970 }, { "epoch": 0.5780682643427741, "grad_norm": 1.2445755051746221, "learning_rate": 4.514014291279208e-06, "loss": 0.5612, "step": 3980 }, { "epoch": 0.579520697167756, "grad_norm": 1.1248791169953434, "learning_rate": 4.488788902667534e-06, "loss": 0.5651, "step": 3990 }, { "epoch": 0.5809731299927379, "grad_norm": 1.1052395709597995, "learning_rate": 4.463576656503927e-06, "loss": 0.5624, "step": 4000 }, { "epoch": 0.5824255628177197, "grad_norm": 1.0979993545936089, "learning_rate": 4.438378200956318e-06, "loss": 0.5747, "step": 4010 }, { "epoch": 0.5838779956427015, "grad_norm": 1.1585156096079503, "learning_rate": 4.413194183838091e-06, "loss": 0.5757, "step": 4020 }, { "epoch": 0.5853304284676834, "grad_norm": 1.0657343307419072, "learning_rate": 4.388025252591448e-06, "loss": 0.5826, "step": 4030 }, { "epoch": 0.5867828612926652, "grad_norm": 1.1584399941372348, "learning_rate": 4.362872054270753e-06, "loss": 0.561, "step": 4040 }, { "epoch": 0.5882352941176471, "grad_norm": 1.1136815017444102, "learning_rate": 4.337735235525904e-06, "loss": 0.5801, "step": 4050 }, { "epoch": 0.5896877269426289, "grad_norm": 1.2048049573288624, "learning_rate": 4.312615442585699e-06, "loss": 0.5748, "step": 4060 }, { "epoch": 0.5911401597676107, "grad_norm": 1.106968794623351, "learning_rate": 4.287513321241237e-06, "loss": 0.5665, "step": 4070 }, { "epoch": 0.5925925925925926, "grad_norm": 1.0773536810915454, "learning_rate": 4.262429516829299e-06, "loss": 0.5739, "step": 4080 }, { "epoch": 0.5940450254175744, "grad_norm": 1.2780512286596586, "learning_rate": 4.237364674215774e-06, "loss": 0.573, "step": 4090 }, { "epoch": 0.5954974582425563, "grad_norm": 1.015175880325257, "learning_rate": 4.212319437779066e-06, "loss": 0.5637, "step": 4100 }, { "epoch": 0.5969498910675382, "grad_norm": 1.1403330329394572, "learning_rate": 4.187294451393541e-06, "loss": 0.5807, "step": 4110 }, { "epoch": 0.59840232389252, "grad_norm": 1.1083139371642667, "learning_rate": 4.162290358412962e-06, "loss": 0.5704, "step": 4120 }, { "epoch": 0.5998547567175018, "grad_norm": 1.1372343052927192, "learning_rate": 4.1373078016539535e-06, "loss": 0.5559, "step": 4130 }, { "epoch": 0.6013071895424836, "grad_norm": 1.2137905963682751, "learning_rate": 4.1123474233794845e-06, "loss": 0.5588, "step": 4140 }, { "epoch": 0.6027596223674655, "grad_norm": 1.2130103389722957, "learning_rate": 4.087409865282341e-06, "loss": 0.5776, "step": 4150 }, { "epoch": 0.6042120551924474, "grad_norm": 1.21914550825707, "learning_rate": 4.062495768468646e-06, "loss": 0.5618, "step": 4160 }, { "epoch": 0.6056644880174292, "grad_norm": 1.1540562248868875, "learning_rate": 4.03760577344136e-06, "loss": 0.5784, "step": 4170 }, { "epoch": 0.6071169208424111, "grad_norm": 1.214796762228358, "learning_rate": 4.012740520083832e-06, "loss": 0.5814, "step": 4180 }, { "epoch": 0.6085693536673928, "grad_norm": 1.157806370832285, "learning_rate": 3.987900647643334e-06, "loss": 0.5791, "step": 4190 }, { "epoch": 0.6100217864923747, "grad_norm": 1.1517956672556253, "learning_rate": 3.963086794714639e-06, "loss": 0.5652, "step": 4200 }, { "epoch": 0.6114742193173566, "grad_norm": 1.1605789001720612, "learning_rate": 3.9382995992235955e-06, "loss": 0.5728, "step": 4210 }, { "epoch": 0.6129266521423384, "grad_norm": 1.0630436480054268, "learning_rate": 3.913539698410734e-06, "loss": 0.5684, "step": 4220 }, { "epoch": 0.6143790849673203, "grad_norm": 1.175513347812724, "learning_rate": 3.888807728814874e-06, "loss": 0.5664, "step": 4230 }, { "epoch": 0.615831517792302, "grad_norm": 1.1583525329647688, "learning_rate": 3.864104326256775e-06, "loss": 0.5805, "step": 4240 }, { "epoch": 0.6172839506172839, "grad_norm": 1.1058170223844426, "learning_rate": 3.8394301258227756e-06, "loss": 0.5622, "step": 4250 }, { "epoch": 0.6187363834422658, "grad_norm": 1.2295319541574912, "learning_rate": 3.814785761848475e-06, "loss": 0.5583, "step": 4260 }, { "epoch": 0.6201888162672476, "grad_norm": 1.092280135001415, "learning_rate": 3.790171867902426e-06, "loss": 0.5755, "step": 4270 }, { "epoch": 0.6216412490922295, "grad_norm": 1.274653674496685, "learning_rate": 3.7655890767698384e-06, "loss": 0.5729, "step": 4280 }, { "epoch": 0.6230936819172114, "grad_norm": 1.2166924621577075, "learning_rate": 3.741038020436323e-06, "loss": 0.5572, "step": 4290 }, { "epoch": 0.6245461147421931, "grad_norm": 1.0296689666125658, "learning_rate": 3.7165193300716297e-06, "loss": 0.5664, "step": 4300 }, { "epoch": 0.625998547567175, "grad_norm": 1.0530929308425294, "learning_rate": 3.6920336360134378e-06, "loss": 0.5679, "step": 4310 }, { "epoch": 0.6274509803921569, "grad_norm": 1.1137539642969592, "learning_rate": 3.6675815677511382e-06, "loss": 0.5607, "step": 4320 }, { "epoch": 0.6289034132171387, "grad_norm": 1.0875536687719785, "learning_rate": 3.6431637539096565e-06, "loss": 0.5691, "step": 4330 }, { "epoch": 0.6303558460421206, "grad_norm": 1.1268225507247402, "learning_rate": 3.6187808222332852e-06, "loss": 0.5668, "step": 4340 }, { "epoch": 0.6318082788671024, "grad_norm": 1.1757316218974525, "learning_rate": 3.594433399569559e-06, "loss": 0.5551, "step": 4350 }, { "epoch": 0.6332607116920842, "grad_norm": 1.1554119314408926, "learning_rate": 3.5701221118531195e-06, "loss": 0.5785, "step": 4360 }, { "epoch": 0.6347131445170661, "grad_norm": 1.0947128171930913, "learning_rate": 3.5458475840896434e-06, "loss": 0.5677, "step": 4370 }, { "epoch": 0.6361655773420479, "grad_norm": 1.2477952532418557, "learning_rate": 3.5216104403397623e-06, "loss": 0.5504, "step": 4380 }, { "epoch": 0.6376180101670298, "grad_norm": 1.1149755483280817, "learning_rate": 3.4974113037030257e-06, "loss": 0.5753, "step": 4390 }, { "epoch": 0.6390704429920117, "grad_norm": 1.214526641921585, "learning_rate": 3.473250796301874e-06, "loss": 0.5669, "step": 4400 }, { "epoch": 0.6405228758169934, "grad_norm": 1.1149175312128623, "learning_rate": 3.4491295392656497e-06, "loss": 0.5604, "step": 4410 }, { "epoch": 0.6419753086419753, "grad_norm": 1.1763746140746527, "learning_rate": 3.425048152714635e-06, "loss": 0.5651, "step": 4420 }, { "epoch": 0.6434277414669571, "grad_norm": 1.169802661186734, "learning_rate": 3.4010072557440967e-06, "loss": 0.5685, "step": 4430 }, { "epoch": 0.644880174291939, "grad_norm": 1.1404701148865375, "learning_rate": 3.3770074664083827e-06, "loss": 0.577, "step": 4440 }, { "epoch": 0.6463326071169209, "grad_norm": 1.2951511455390947, "learning_rate": 3.353049401705022e-06, "loss": 0.5546, "step": 4450 }, { "epoch": 0.6477850399419027, "grad_norm": 1.2188858191779428, "learning_rate": 3.329133677558873e-06, "loss": 0.5697, "step": 4460 }, { "epoch": 0.6492374727668845, "grad_norm": 1.1239635889524127, "learning_rate": 3.3052609088062767e-06, "loss": 0.5901, "step": 4470 }, { "epoch": 0.6506899055918663, "grad_norm": 1.0931476283773633, "learning_rate": 3.281431709179264e-06, "loss": 0.566, "step": 4480 }, { "epoch": 0.6521423384168482, "grad_norm": 1.4718901865939953, "learning_rate": 3.2576466912897674e-06, "loss": 0.5761, "step": 4490 }, { "epoch": 0.6535947712418301, "grad_norm": 1.2062192465520678, "learning_rate": 3.2339064666138783e-06, "loss": 0.5757, "step": 4500 }, { "epoch": 0.6550472040668119, "grad_norm": 1.2732571104572175, "learning_rate": 3.2102116454761168e-06, "loss": 0.5615, "step": 4510 }, { "epoch": 0.6564996368917938, "grad_norm": 1.198522063919598, "learning_rate": 3.1865628370337575e-06, "loss": 0.5632, "step": 4520 }, { "epoch": 0.6579520697167756, "grad_norm": 1.208764455797361, "learning_rate": 3.162960649261152e-06, "loss": 0.5472, "step": 4530 }, { "epoch": 0.6594045025417574, "grad_norm": 1.2300085896818644, "learning_rate": 3.1394056889341086e-06, "loss": 0.5737, "step": 4540 }, { "epoch": 0.6608569353667393, "grad_norm": 1.2362227883984134, "learning_rate": 3.1158985616142944e-06, "loss": 0.5467, "step": 4550 }, { "epoch": 0.6623093681917211, "grad_norm": 1.2577141886691818, "learning_rate": 3.092439871633658e-06, "loss": 0.5652, "step": 4560 }, { "epoch": 0.663761801016703, "grad_norm": 1.2246719550977323, "learning_rate": 3.0690302220789036e-06, "loss": 0.564, "step": 4570 }, { "epoch": 0.6652142338416849, "grad_norm": 0.952770111510269, "learning_rate": 3.0456702147759797e-06, "loss": 0.5538, "step": 4580 }, { "epoch": 0.6666666666666666, "grad_norm": 1.2114290005968387, "learning_rate": 3.0223604502746097e-06, "loss": 0.5624, "step": 4590 }, { "epoch": 0.6681190994916485, "grad_norm": 1.2379634249474247, "learning_rate": 2.999101527832849e-06, "loss": 0.5581, "step": 4600 }, { "epoch": 0.6695715323166304, "grad_norm": 1.2432970361649818, "learning_rate": 2.9758940454016893e-06, "loss": 0.5519, "step": 4610 }, { "epoch": 0.6710239651416122, "grad_norm": 1.1827840525798392, "learning_rate": 2.9527385996096702e-06, "loss": 0.5512, "step": 4620 }, { "epoch": 0.6724763979665941, "grad_norm": 1.1313263342846276, "learning_rate": 2.929635785747558e-06, "loss": 0.5615, "step": 4630 }, { "epoch": 0.6739288307915758, "grad_norm": 1.0718626125088186, "learning_rate": 2.9065861977530263e-06, "loss": 0.5577, "step": 4640 }, { "epoch": 0.6753812636165577, "grad_norm": 1.2058366328226908, "learning_rate": 2.8835904281953984e-06, "loss": 0.5543, "step": 4650 }, { "epoch": 0.6768336964415396, "grad_norm": 1.2044090066060698, "learning_rate": 2.8606490682604083e-06, "loss": 0.563, "step": 4660 }, { "epoch": 0.6782861292665214, "grad_norm": 1.2440783490748353, "learning_rate": 2.837762707734999e-06, "loss": 0.5678, "step": 4670 }, { "epoch": 0.6797385620915033, "grad_norm": 1.1447619754452882, "learning_rate": 2.8149319349921678e-06, "loss": 0.5443, "step": 4680 }, { "epoch": 0.6811909949164852, "grad_norm": 1.0682059420594845, "learning_rate": 2.7921573369758344e-06, "loss": 0.5548, "step": 4690 }, { "epoch": 0.6826434277414669, "grad_norm": 1.0786981942796325, "learning_rate": 2.769439499185752e-06, "loss": 0.557, "step": 4700 }, { "epoch": 0.6840958605664488, "grad_norm": 1.1021974391300458, "learning_rate": 2.7467790056624565e-06, "loss": 0.5641, "step": 4710 }, { "epoch": 0.6855482933914306, "grad_norm": 1.172642324603278, "learning_rate": 2.7241764389722536e-06, "loss": 0.5579, "step": 4720 }, { "epoch": 0.6870007262164125, "grad_norm": 1.1739344769196898, "learning_rate": 2.7016323801922327e-06, "loss": 0.5426, "step": 4730 }, { "epoch": 0.6884531590413944, "grad_norm": 1.0908808031509236, "learning_rate": 2.679147408895349e-06, "loss": 0.5667, "step": 4740 }, { "epoch": 0.6899055918663762, "grad_norm": 1.1345661062696517, "learning_rate": 2.6567221031354907e-06, "loss": 0.5639, "step": 4750 }, { "epoch": 0.691358024691358, "grad_norm": 1.0249096917283105, "learning_rate": 2.634357039432656e-06, "loss": 0.5648, "step": 4760 }, { "epoch": 0.6928104575163399, "grad_norm": 1.1583880032183098, "learning_rate": 2.612052792758095e-06, "loss": 0.5651, "step": 4770 }, { "epoch": 0.6942628903413217, "grad_norm": 1.069684864764473, "learning_rate": 2.5898099365195626e-06, "loss": 0.5722, "step": 4780 }, { "epoch": 0.6957153231663036, "grad_norm": 1.0867414593247826, "learning_rate": 2.5676290425465496e-06, "loss": 0.5664, "step": 4790 }, { "epoch": 0.6971677559912854, "grad_norm": 1.1375716473128172, "learning_rate": 2.5455106810755957e-06, "loss": 0.5585, "step": 4800 }, { "epoch": 0.6986201888162672, "grad_norm": 1.034623153574018, "learning_rate": 2.5234554207356266e-06, "loss": 0.5722, "step": 4810 }, { "epoch": 0.7000726216412491, "grad_norm": 1.0654655922639538, "learning_rate": 2.5014638285333357e-06, "loss": 0.5643, "step": 4820 }, { "epoch": 0.7015250544662309, "grad_norm": 1.0988829596394427, "learning_rate": 2.479536469838606e-06, "loss": 0.5635, "step": 4830 }, { "epoch": 0.7029774872912128, "grad_norm": 1.050301540250255, "learning_rate": 2.4576739083699764e-06, "loss": 0.55, "step": 4840 }, { "epoch": 0.7044299201161947, "grad_norm": 1.3185971209726384, "learning_rate": 2.43587670618015e-06, "loss": 0.5686, "step": 4850 }, { "epoch": 0.7058823529411765, "grad_norm": 1.1036440984293434, "learning_rate": 2.4141454236415428e-06, "loss": 0.5617, "step": 4860 }, { "epoch": 0.7073347857661583, "grad_norm": 1.0669150287420783, "learning_rate": 2.392480619431879e-06, "loss": 0.5416, "step": 4870 }, { "epoch": 0.7087872185911401, "grad_norm": 1.0472161733755885, "learning_rate": 2.3708828505198265e-06, "loss": 0.5777, "step": 4880 }, { "epoch": 0.710239651416122, "grad_norm": 1.1252884484776227, "learning_rate": 2.349352672150681e-06, "loss": 0.5535, "step": 4890 }, { "epoch": 0.7116920842411039, "grad_norm": 1.1423409076437527, "learning_rate": 2.3278906378320854e-06, "loss": 0.5598, "step": 4900 }, { "epoch": 0.7131445170660857, "grad_norm": 0.9801237939355479, "learning_rate": 2.306497299319814e-06, "loss": 0.5551, "step": 4910 }, { "epoch": 0.7145969498910676, "grad_norm": 1.0526887175825372, "learning_rate": 2.285173206603564e-06, "loss": 0.5683, "step": 4920 }, { "epoch": 0.7160493827160493, "grad_norm": 1.1758853714133906, "learning_rate": 2.2639189078928453e-06, "loss": 0.5581, "step": 4930 }, { "epoch": 0.7175018155410312, "grad_norm": 1.107044757903735, "learning_rate": 2.242734949602856e-06, "loss": 0.5448, "step": 4940 }, { "epoch": 0.7189542483660131, "grad_norm": 1.2037164103649114, "learning_rate": 2.2216218763404647e-06, "loss": 0.5531, "step": 4950 }, { "epoch": 0.7204066811909949, "grad_norm": 1.0588992084011324, "learning_rate": 2.200580230890188e-06, "loss": 0.5501, "step": 4960 }, { "epoch": 0.7218591140159768, "grad_norm": 1.2543824405997601, "learning_rate": 2.17961055420024e-06, "loss": 0.5769, "step": 4970 }, { "epoch": 0.7233115468409586, "grad_norm": 1.1899069770329052, "learning_rate": 2.1587133853686422e-06, "loss": 0.5683, "step": 4980 }, { "epoch": 0.7247639796659404, "grad_norm": 1.144536370052011, "learning_rate": 2.137889261629334e-06, "loss": 0.5648, "step": 4990 }, { "epoch": 0.7262164124909223, "grad_norm": 1.1936078152653293, "learning_rate": 2.1171387183383936e-06, "loss": 0.5646, "step": 5000 }, { "epoch": 0.7276688453159041, "grad_norm": 1.26324013915445, "learning_rate": 2.096462288960251e-06, "loss": 0.5682, "step": 5010 }, { "epoch": 0.729121278140886, "grad_norm": 1.1381437228179463, "learning_rate": 2.0758605050539836e-06, "loss": 0.5571, "step": 5020 }, { "epoch": 0.7305737109658679, "grad_norm": 1.3500933515295954, "learning_rate": 2.0553338962596492e-06, "loss": 0.5716, "step": 5030 }, { "epoch": 0.7320261437908496, "grad_norm": 1.0940717331908218, "learning_rate": 2.03488299028467e-06, "loss": 0.5626, "step": 5040 }, { "epoch": 0.7334785766158315, "grad_norm": 1.1116999445105729, "learning_rate": 2.0145083128902647e-06, "loss": 0.5625, "step": 5050 }, { "epoch": 0.7349310094408134, "grad_norm": 1.144025480175903, "learning_rate": 1.9942103878779335e-06, "loss": 0.5601, "step": 5060 }, { "epoch": 0.7363834422657952, "grad_norm": 1.0557283567612936, "learning_rate": 1.9739897370759886e-06, "loss": 0.5523, "step": 5070 }, { "epoch": 0.7378358750907771, "grad_norm": 1.243995372081041, "learning_rate": 1.9538468803261514e-06, "loss": 0.5521, "step": 5080 }, { "epoch": 0.739288307915759, "grad_norm": 1.1122614530495916, "learning_rate": 1.9337823354701617e-06, "loss": 0.5615, "step": 5090 }, { "epoch": 0.7407407407407407, "grad_norm": 1.012804702506735, "learning_rate": 1.913796618336499e-06, "loss": 0.5514, "step": 5100 }, { "epoch": 0.7421931735657226, "grad_norm": 1.1487569184157758, "learning_rate": 1.8938902427270905e-06, "loss": 0.5595, "step": 5110 }, { "epoch": 0.7436456063907044, "grad_norm": 1.222308594990331, "learning_rate": 1.8740637204041195e-06, "loss": 0.5645, "step": 5120 }, { "epoch": 0.7450980392156863, "grad_norm": 1.1354476091482255, "learning_rate": 1.8543175610768715e-06, "loss": 0.5607, "step": 5130 }, { "epoch": 0.7465504720406682, "grad_norm": 1.2205544178436005, "learning_rate": 1.83465227238861e-06, "loss": 0.542, "step": 5140 }, { "epoch": 0.7480029048656499, "grad_norm": 1.2462160753237452, "learning_rate": 1.8150683599035517e-06, "loss": 0.5606, "step": 5150 }, { "epoch": 0.7494553376906318, "grad_norm": 1.1396860492016365, "learning_rate": 1.7955663270938501e-06, "loss": 0.5689, "step": 5160 }, { "epoch": 0.7509077705156136, "grad_norm": 1.1228524828818305, "learning_rate": 1.7761466753266598e-06, "loss": 0.5625, "step": 5170 }, { "epoch": 0.7523602033405955, "grad_norm": 1.1360291736903685, "learning_rate": 1.7568099038512466e-06, "loss": 0.5724, "step": 5180 }, { "epoch": 0.7538126361655774, "grad_norm": 1.226701284666325, "learning_rate": 1.7375565097861518e-06, "loss": 0.5653, "step": 5190 }, { "epoch": 0.7552650689905592, "grad_norm": 1.1971595467490777, "learning_rate": 1.7183869881064125e-06, "loss": 0.5681, "step": 5200 }, { "epoch": 0.756717501815541, "grad_norm": 1.003433379963408, "learning_rate": 1.6993018316308351e-06, "loss": 0.5497, "step": 5210 }, { "epoch": 0.7581699346405228, "grad_norm": 1.0677706687056256, "learning_rate": 1.6803015310093286e-06, "loss": 0.5663, "step": 5220 }, { "epoch": 0.7596223674655047, "grad_norm": 1.1960572257973088, "learning_rate": 1.6613865747102876e-06, "loss": 0.5566, "step": 5230 }, { "epoch": 0.7610748002904866, "grad_norm": 1.1110041512712467, "learning_rate": 1.6425574490080355e-06, "loss": 0.5474, "step": 5240 }, { "epoch": 0.7625272331154684, "grad_norm": 1.1953866183465143, "learning_rate": 1.6238146379703257e-06, "loss": 0.5602, "step": 5250 }, { "epoch": 0.7639796659404503, "grad_norm": 1.184221410195916, "learning_rate": 1.6051586234458932e-06, "loss": 0.558, "step": 5260 }, { "epoch": 0.7654320987654321, "grad_norm": 1.1917994670950118, "learning_rate": 1.5865898850520671e-06, "loss": 0.573, "step": 5270 }, { "epoch": 0.7668845315904139, "grad_norm": 1.205079091727242, "learning_rate": 1.5681089001624488e-06, "loss": 0.5565, "step": 5280 }, { "epoch": 0.7683369644153958, "grad_norm": 1.0590014592765518, "learning_rate": 1.5497161438946218e-06, "loss": 0.5537, "step": 5290 }, { "epoch": 0.7697893972403776, "grad_norm": 1.3045355829406655, "learning_rate": 1.5314120890979596e-06, "loss": 0.5608, "step": 5300 }, { "epoch": 0.7712418300653595, "grad_norm": 1.227226173650366, "learning_rate": 1.5131972063414451e-06, "loss": 0.563, "step": 5310 }, { "epoch": 0.7726942628903414, "grad_norm": 1.1505400844326525, "learning_rate": 1.4950719639015987e-06, "loss": 0.5618, "step": 5320 }, { "epoch": 0.7741466957153231, "grad_norm": 1.1971910791582392, "learning_rate": 1.4770368277504183e-06, "loss": 0.5559, "step": 5330 }, { "epoch": 0.775599128540305, "grad_norm": 1.1465426761189066, "learning_rate": 1.45909226154341e-06, "loss": 0.5757, "step": 5340 }, { "epoch": 0.7770515613652869, "grad_norm": 1.0530342043982832, "learning_rate": 1.4412387266076677e-06, "loss": 0.5699, "step": 5350 }, { "epoch": 0.7785039941902687, "grad_norm": 1.1921772808125664, "learning_rate": 1.4234766819300106e-06, "loss": 0.5592, "step": 5360 }, { "epoch": 0.7799564270152506, "grad_norm": 1.1969217401024441, "learning_rate": 1.4058065841451856e-06, "loss": 0.5658, "step": 5370 }, { "epoch": 0.7814088598402323, "grad_norm": 1.1371738180522346, "learning_rate": 1.3882288875241262e-06, "loss": 0.5523, "step": 5380 }, { "epoch": 0.7828612926652142, "grad_norm": 1.119312116230787, "learning_rate": 1.3707440439622754e-06, "loss": 0.5501, "step": 5390 }, { "epoch": 0.7843137254901961, "grad_norm": 1.200972988458609, "learning_rate": 1.353352502967966e-06, "loss": 0.5393, "step": 5400 }, { "epoch": 0.7857661583151779, "grad_norm": 1.005244568846047, "learning_rate": 1.336054711650867e-06, "loss": 0.5552, "step": 5410 }, { "epoch": 0.7872185911401598, "grad_norm": 0.9811514201367332, "learning_rate": 1.3188511147104882e-06, "loss": 0.5615, "step": 5420 }, { "epoch": 0.7886710239651417, "grad_norm": 1.2124333619418073, "learning_rate": 1.3017421544247466e-06, "loss": 0.5731, "step": 5430 }, { "epoch": 0.7901234567901234, "grad_norm": 1.0164638888045425, "learning_rate": 1.2847282706385962e-06, "loss": 0.5449, "step": 5440 }, { "epoch": 0.7915758896151053, "grad_norm": 1.0692055130184748, "learning_rate": 1.267809900752725e-06, "loss": 0.5581, "step": 5450 }, { "epoch": 0.7930283224400871, "grad_norm": 1.2243966381535343, "learning_rate": 1.2509874797122983e-06, "loss": 0.5694, "step": 5460 }, { "epoch": 0.794480755265069, "grad_norm": 1.1192058071022615, "learning_rate": 1.2342614399957952e-06, "loss": 0.5601, "step": 5470 }, { "epoch": 0.7959331880900509, "grad_norm": 1.210664779695526, "learning_rate": 1.217632211603868e-06, "loss": 0.5383, "step": 5480 }, { "epoch": 0.7973856209150327, "grad_norm": 1.2306429782422048, "learning_rate": 1.2011002220483099e-06, "loss": 0.5503, "step": 5490 }, { "epoch": 0.7988380537400145, "grad_norm": 1.1449496150562748, "learning_rate": 1.1846658963410472e-06, "loss": 0.561, "step": 5500 }, { "epoch": 0.8002904865649964, "grad_norm": 1.1809146975647171, "learning_rate": 1.168329656983222e-06, "loss": 0.5489, "step": 5510 }, { "epoch": 0.8017429193899782, "grad_norm": 1.1865786985653701, "learning_rate": 1.1520919239543272e-06, "loss": 0.5443, "step": 5520 }, { "epoch": 0.8031953522149601, "grad_norm": 1.2819514449232758, "learning_rate": 1.1359531147014102e-06, "loss": 0.5784, "step": 5530 }, { "epoch": 0.8046477850399419, "grad_norm": 1.140249494732679, "learning_rate": 1.11991364412834e-06, "loss": 0.5472, "step": 5540 }, { "epoch": 0.8061002178649237, "grad_norm": 1.0963574239357976, "learning_rate": 1.1039739245851426e-06, "loss": 0.5614, "step": 5550 }, { "epoch": 0.8075526506899056, "grad_norm": 1.1963836912036798, "learning_rate": 1.088134365857399e-06, "loss": 0.5516, "step": 5560 }, { "epoch": 0.8090050835148874, "grad_norm": 1.320400739555157, "learning_rate": 1.0723953751557098e-06, "loss": 0.5643, "step": 5570 }, { "epoch": 0.8104575163398693, "grad_norm": 1.2261172403861758, "learning_rate": 1.0567573571052265e-06, "loss": 0.545, "step": 5580 }, { "epoch": 0.8119099491648512, "grad_norm": 1.1363072652624087, "learning_rate": 1.0412207137352504e-06, "loss": 0.5562, "step": 5590 }, { "epoch": 0.813362381989833, "grad_norm": 1.0696753091917897, "learning_rate": 1.0257858444688968e-06, "loss": 0.5584, "step": 5600 }, { "epoch": 0.8148148148148148, "grad_norm": 1.092336652561905, "learning_rate": 1.0104531461128224e-06, "loss": 0.5509, "step": 5610 }, { "epoch": 0.8162672476397966, "grad_norm": 1.2190453226296554, "learning_rate": 9.952230128470358e-07, "loss": 0.5552, "step": 5620 }, { "epoch": 0.8177196804647785, "grad_norm": 1.1756174285580154, "learning_rate": 9.800958362147433e-07, "loss": 0.5611, "step": 5630 }, { "epoch": 0.8191721132897604, "grad_norm": 1.050298389841538, "learning_rate": 9.65072005112308e-07, "loss": 0.5536, "step": 5640 }, { "epoch": 0.8206245461147422, "grad_norm": 1.2990174959407426, "learning_rate": 9.501519057792275e-07, "loss": 0.5495, "step": 5650 }, { "epoch": 0.8220769789397241, "grad_norm": 1.1318695700100998, "learning_rate": 9.353359217882241e-07, "loss": 0.5557, "step": 5660 }, { "epoch": 0.8235294117647058, "grad_norm": 1.1818056539247317, "learning_rate": 9.206244340353732e-07, "loss": 0.5703, "step": 5670 }, { "epoch": 0.8249818445896877, "grad_norm": 1.191491253002993, "learning_rate": 9.060178207303077e-07, "loss": 0.5543, "step": 5680 }, { "epoch": 0.8264342774146696, "grad_norm": 1.2775803771232788, "learning_rate": 8.915164573865109e-07, "loss": 0.5673, "step": 5690 }, { "epoch": 0.8278867102396514, "grad_norm": 1.0993365384271814, "learning_rate": 8.771207168116407e-07, "loss": 0.5526, "step": 5700 }, { "epoch": 0.8293391430646333, "grad_norm": 1.2010857578242673, "learning_rate": 8.628309690979658e-07, "loss": 0.5465, "step": 5710 }, { "epoch": 0.830791575889615, "grad_norm": 1.1363204888828164, "learning_rate": 8.486475816128376e-07, "loss": 0.5522, "step": 5720 }, { "epoch": 0.8322440087145969, "grad_norm": 1.237168492535083, "learning_rate": 8.345709189892504e-07, "loss": 0.5377, "step": 5730 }, { "epoch": 0.8336964415395788, "grad_norm": 1.1890926723132464, "learning_rate": 8.206013431164683e-07, "loss": 0.5613, "step": 5740 }, { "epoch": 0.8351488743645606, "grad_norm": 1.2611972496063513, "learning_rate": 8.0673921313072e-07, "loss": 0.5562, "step": 5750 }, { "epoch": 0.8366013071895425, "grad_norm": 1.1453681982727373, "learning_rate": 7.929848854059663e-07, "loss": 0.5469, "step": 5760 }, { "epoch": 0.8380537400145244, "grad_norm": 1.1161546893459802, "learning_rate": 7.793387135447372e-07, "loss": 0.5688, "step": 5770 }, { "epoch": 0.8395061728395061, "grad_norm": 1.242951008236561, "learning_rate": 7.658010483690431e-07, "loss": 0.5516, "step": 5780 }, { "epoch": 0.840958605664488, "grad_norm": 1.1291848404892897, "learning_rate": 7.52372237911358e-07, "loss": 0.5558, "step": 5790 }, { "epoch": 0.8424110384894699, "grad_norm": 1.1344340429459099, "learning_rate": 7.390526274056625e-07, "loss": 0.5368, "step": 5800 }, { "epoch": 0.8438634713144517, "grad_norm": 1.2369341276497008, "learning_rate": 7.25842559278584e-07, "loss": 0.5438, "step": 5810 }, { "epoch": 0.8453159041394336, "grad_norm": 1.161564478717058, "learning_rate": 7.127423731405747e-07, "loss": 0.5524, "step": 5820 }, { "epoch": 0.8467683369644154, "grad_norm": 1.3389378618000198, "learning_rate": 6.997524057771964e-07, "loss": 0.5411, "step": 5830 }, { "epoch": 0.8482207697893972, "grad_norm": 1.2324708082947882, "learning_rate": 6.868729911404582e-07, "loss": 0.5594, "step": 5840 }, { "epoch": 0.8496732026143791, "grad_norm": 1.0931906751127958, "learning_rate": 6.741044603402214e-07, "loss": 0.5394, "step": 5850 }, { "epoch": 0.8511256354393609, "grad_norm": 1.1045798920330345, "learning_rate": 6.614471416357055e-07, "loss": 0.5517, "step": 5860 }, { "epoch": 0.8525780682643428, "grad_norm": 1.1003308882789462, "learning_rate": 6.489013604270277e-07, "loss": 0.5432, "step": 5870 }, { "epoch": 0.8540305010893247, "grad_norm": 1.1511825195957979, "learning_rate": 6.364674392468578e-07, "loss": 0.5543, "step": 5880 }, { "epoch": 0.8554829339143064, "grad_norm": 1.1016772920186344, "learning_rate": 6.241456977521115e-07, "loss": 0.5511, "step": 5890 }, { "epoch": 0.8569353667392883, "grad_norm": 1.2345711604547172, "learning_rate": 6.119364527157401e-07, "loss": 0.5546, "step": 5900 }, { "epoch": 0.8583877995642701, "grad_norm": 1.1026866190660687, "learning_rate": 5.998400180185838e-07, "loss": 0.5534, "step": 5910 }, { "epoch": 0.859840232389252, "grad_norm": 1.0696348901565953, "learning_rate": 5.878567046413025e-07, "loss": 0.5431, "step": 5920 }, { "epoch": 0.8612926652142339, "grad_norm": 1.074925388402079, "learning_rate": 5.759868206563834e-07, "loss": 0.5564, "step": 5930 }, { "epoch": 0.8627450980392157, "grad_norm": 1.1892355845709555, "learning_rate": 5.642306712202183e-07, "loss": 0.56, "step": 5940 }, { "epoch": 0.8641975308641975, "grad_norm": 1.1714018297678883, "learning_rate": 5.525885585652591e-07, "loss": 0.5477, "step": 5950 }, { "epoch": 0.8656499636891793, "grad_norm": 1.2243789216177572, "learning_rate": 5.410607819922481e-07, "loss": 0.5561, "step": 5960 }, { "epoch": 0.8671023965141612, "grad_norm": 1.158429282768604, "learning_rate": 5.296476378625237e-07, "loss": 0.5246, "step": 5970 }, { "epoch": 0.8685548293391431, "grad_norm": 1.2064879125921322, "learning_rate": 5.183494195904015e-07, "loss": 0.5434, "step": 5980 }, { "epoch": 0.8700072621641249, "grad_norm": 1.0370084252960212, "learning_rate": 5.071664176356294e-07, "loss": 0.556, "step": 5990 }, { "epoch": 0.8714596949891068, "grad_norm": 1.1529022886105922, "learning_rate": 4.960989194959225e-07, "loss": 0.5349, "step": 6000 }, { "epoch": 0.8729121278140886, "grad_norm": 1.0702466803229502, "learning_rate": 4.851472096995741e-07, "loss": 0.5641, "step": 6010 }, { "epoch": 0.8743645606390704, "grad_norm": 1.195504112892932, "learning_rate": 4.7431156979813097e-07, "loss": 0.5627, "step": 6020 }, { "epoch": 0.8758169934640523, "grad_norm": 1.0424744381436926, "learning_rate": 4.6359227835916954e-07, "loss": 0.5457, "step": 6030 }, { "epoch": 0.8772694262890341, "grad_norm": 1.136106426677912, "learning_rate": 4.529896109591203e-07, "loss": 0.5536, "step": 6040 }, { "epoch": 0.878721859114016, "grad_norm": 1.1941194023099557, "learning_rate": 4.425038401761961e-07, "loss": 0.5512, "step": 6050 }, { "epoch": 0.8801742919389978, "grad_norm": 1.1005592964409183, "learning_rate": 4.3213523558337354e-07, "loss": 0.5522, "step": 6060 }, { "epoch": 0.8816267247639796, "grad_norm": 1.3046172497671011, "learning_rate": 4.218840637414695e-07, "loss": 0.5389, "step": 6070 }, { "epoch": 0.8830791575889615, "grad_norm": 1.2050786337197097, "learning_rate": 4.117505881922856e-07, "loss": 0.5637, "step": 6080 }, { "epoch": 0.8845315904139434, "grad_norm": 1.1086711189663023, "learning_rate": 4.0173506945183295e-07, "loss": 0.5637, "step": 6090 }, { "epoch": 0.8859840232389252, "grad_norm": 1.142760086036647, "learning_rate": 3.9183776500363593e-07, "loss": 0.5639, "step": 6100 }, { "epoch": 0.8874364560639071, "grad_norm": 1.211597985547058, "learning_rate": 3.8205892929211175e-07, "loss": 0.5534, "step": 6110 }, { "epoch": 0.8888888888888888, "grad_norm": 1.125094111731544, "learning_rate": 3.7239881371603005e-07, "loss": 0.5514, "step": 6120 }, { "epoch": 0.8903413217138707, "grad_norm": 1.1253410539349802, "learning_rate": 3.6285766662204735e-07, "loss": 0.5593, "step": 6130 }, { "epoch": 0.8917937545388526, "grad_norm": 1.076054931723469, "learning_rate": 3.534357332983257e-07, "loss": 0.5494, "step": 6140 }, { "epoch": 0.8932461873638344, "grad_norm": 1.2433138382241562, "learning_rate": 3.441332559682242e-07, "loss": 0.5507, "step": 6150 }, { "epoch": 0.8946986201888163, "grad_norm": 1.172111145318429, "learning_rate": 3.349504737840742e-07, "loss": 0.5632, "step": 6160 }, { "epoch": 0.8961510530137982, "grad_norm": 1.2018077073853302, "learning_rate": 3.258876228210267e-07, "loss": 0.5381, "step": 6170 }, { "epoch": 0.8976034858387799, "grad_norm": 1.1218901853415595, "learning_rate": 3.169449360709914e-07, "loss": 0.5651, "step": 6180 }, { "epoch": 0.8990559186637618, "grad_norm": 1.075452696669577, "learning_rate": 3.0812264343663467e-07, "loss": 0.5518, "step": 6190 }, { "epoch": 0.9005083514887436, "grad_norm": 1.2898875627777047, "learning_rate": 2.99420971725482e-07, "loss": 0.5535, "step": 6200 }, { "epoch": 0.9019607843137255, "grad_norm": 1.064409341720963, "learning_rate": 2.9084014464407837e-07, "loss": 0.551, "step": 6210 }, { "epoch": 0.9034132171387074, "grad_norm": 1.1430289990560287, "learning_rate": 2.8238038279224e-07, "loss": 0.5351, "step": 6220 }, { "epoch": 0.9048656499636892, "grad_norm": 1.0942084433621513, "learning_rate": 2.740419036573844e-07, "loss": 0.5628, "step": 6230 }, { "epoch": 0.906318082788671, "grad_norm": 1.1827726416299507, "learning_rate": 2.6582492160893536e-07, "loss": 0.5698, "step": 6240 }, { "epoch": 0.9077705156136529, "grad_norm": 1.0512203056975564, "learning_rate": 2.5772964789281593e-07, "loss": 0.539, "step": 6250 }, { "epoch": 0.9092229484386347, "grad_norm": 1.177449766279641, "learning_rate": 2.4975629062601534e-07, "loss": 0.5475, "step": 6260 }, { "epoch": 0.9106753812636166, "grad_norm": 1.2124754199233574, "learning_rate": 2.419050547912388e-07, "loss": 0.541, "step": 6270 }, { "epoch": 0.9121278140885984, "grad_norm": 1.3580937630552576, "learning_rate": 2.3417614223163908e-07, "loss": 0.5588, "step": 6280 }, { "epoch": 0.9135802469135802, "grad_norm": 1.1170472146222037, "learning_rate": 2.26569751645625e-07, "loss": 0.5436, "step": 6290 }, { "epoch": 0.9150326797385621, "grad_norm": 1.1184802548299553, "learning_rate": 2.1908607858175612e-07, "loss": 0.5377, "step": 6300 }, { "epoch": 0.9164851125635439, "grad_norm": 1.1396702009546613, "learning_rate": 2.117253154337118e-07, "loss": 0.5683, "step": 6310 }, { "epoch": 0.9179375453885258, "grad_norm": 1.2119088736658123, "learning_rate": 2.0448765143534942e-07, "loss": 0.5668, "step": 6320 }, { "epoch": 0.9193899782135077, "grad_norm": 1.0448734314632342, "learning_rate": 1.973732726558364e-07, "loss": 0.5437, "step": 6330 }, { "epoch": 0.9208424110384895, "grad_norm": 1.2851112602098311, "learning_rate": 1.9038236199486693e-07, "loss": 0.5622, "step": 6340 }, { "epoch": 0.9222948438634713, "grad_norm": 1.1700640178574329, "learning_rate": 1.8351509917796218e-07, "loss": 0.542, "step": 6350 }, { "epoch": 0.9237472766884531, "grad_norm": 1.1416778336018678, "learning_rate": 1.7677166075184548e-07, "loss": 0.5529, "step": 6360 }, { "epoch": 0.925199709513435, "grad_norm": 1.1230308913216087, "learning_rate": 1.7015222007990883e-07, "loss": 0.5559, "step": 6370 }, { "epoch": 0.9266521423384169, "grad_norm": 1.1568250466964043, "learning_rate": 1.6365694733775305e-07, "loss": 0.5507, "step": 6380 }, { "epoch": 0.9281045751633987, "grad_norm": 1.1602815569402067, "learning_rate": 1.572860095088108e-07, "loss": 0.552, "step": 6390 }, { "epoch": 0.9295570079883806, "grad_norm": 1.0423401424679095, "learning_rate": 1.5103957038005935e-07, "loss": 0.5446, "step": 6400 }, { "epoch": 0.9310094408133623, "grad_norm": 1.1374874233890928, "learning_rate": 1.4491779053780298e-07, "loss": 0.5473, "step": 6410 }, { "epoch": 0.9324618736383442, "grad_norm": 1.1755709384042587, "learning_rate": 1.3892082736355283e-07, "loss": 0.5486, "step": 6420 }, { "epoch": 0.9339143064633261, "grad_norm": 1.1744643775241368, "learning_rate": 1.3304883502997133e-07, "loss": 0.5518, "step": 6430 }, { "epoch": 0.9353667392883079, "grad_norm": 1.1216236591765696, "learning_rate": 1.2730196449691756e-07, "loss": 0.5492, "step": 6440 }, { "epoch": 0.9368191721132898, "grad_norm": 1.1470393369010776, "learning_rate": 1.2168036350755975e-07, "loss": 0.5322, "step": 6450 }, { "epoch": 0.9382716049382716, "grad_norm": 1.1985354195876317, "learning_rate": 1.1618417658458003e-07, "loss": 0.5616, "step": 6460 }, { "epoch": 0.9397240377632534, "grad_norm": 1.1475497479759824, "learning_rate": 1.1081354502645913e-07, "loss": 0.5531, "step": 6470 }, { "epoch": 0.9411764705882353, "grad_norm": 1.1396353932104606, "learning_rate": 1.0556860690384252e-07, "loss": 0.5472, "step": 6480 }, { "epoch": 0.9426289034132171, "grad_norm": 1.1215848254083782, "learning_rate": 1.0044949705599216e-07, "loss": 0.5429, "step": 6490 }, { "epoch": 0.944081336238199, "grad_norm": 1.005591582016032, "learning_rate": 9.545634708731988e-08, "loss": 0.5418, "step": 6500 }, { "epoch": 0.9455337690631809, "grad_norm": 1.215225242394237, "learning_rate": 9.058928536400058e-08, "loss": 0.5578, "step": 6510 }, { "epoch": 0.9469862018881626, "grad_norm": 1.152537711229488, "learning_rate": 8.584843701067935e-08, "loss": 0.5404, "step": 6520 }, { "epoch": 0.9484386347131445, "grad_norm": 1.175848365037797, "learning_rate": 8.123392390724682e-08, "loss": 0.5522, "step": 6530 }, { "epoch": 0.9498910675381264, "grad_norm": 1.0183498527962453, "learning_rate": 7.674586468570999e-08, "loss": 0.5564, "step": 6540 }, { "epoch": 0.9513435003631082, "grad_norm": 1.2151729065782833, "learning_rate": 7.238437472714466e-08, "loss": 0.5561, "step": 6550 }, { "epoch": 0.9527959331880901, "grad_norm": 1.1402236462651618, "learning_rate": 6.81495661587217e-08, "loss": 0.5411, "step": 6560 }, { "epoch": 0.954248366013072, "grad_norm": 1.1521868862152016, "learning_rate": 6.404154785083383e-08, "loss": 0.5539, "step": 6570 }, { "epoch": 0.9557007988380537, "grad_norm": 1.1258302178296054, "learning_rate": 6.006042541428669e-08, "loss": 0.5532, "step": 6580 }, { "epoch": 0.9571532316630356, "grad_norm": 1.173412519187008, "learning_rate": 5.6206301197594404e-08, "loss": 0.5505, "step": 6590 }, { "epoch": 0.9586056644880174, "grad_norm": 1.136513704911577, "learning_rate": 5.247927428433885e-08, "loss": 0.5435, "step": 6600 }, { "epoch": 0.9600580973129993, "grad_norm": 1.1972723133655234, "learning_rate": 4.887944049062843e-08, "loss": 0.548, "step": 6610 }, { "epoch": 0.9615105301379812, "grad_norm": 1.240930781464282, "learning_rate": 4.5406892362632185e-08, "loss": 0.5538, "step": 6620 }, { "epoch": 0.9629629629629629, "grad_norm": 1.2645184421648727, "learning_rate": 4.206171917420121e-08, "loss": 0.5616, "step": 6630 }, { "epoch": 0.9644153957879448, "grad_norm": 1.1619344530688336, "learning_rate": 3.884400692457435e-08, "loss": 0.5578, "step": 6640 }, { "epoch": 0.9658678286129266, "grad_norm": 1.0415045949293107, "learning_rate": 3.575383833616497e-08, "loss": 0.536, "step": 6650 }, { "epoch": 0.9673202614379085, "grad_norm": 1.1707683296063809, "learning_rate": 3.2791292852437096e-08, "loss": 0.5444, "step": 6660 }, { "epoch": 0.9687726942628904, "grad_norm": 0.9579807050337852, "learning_rate": 2.99564466358615e-08, "loss": 0.5604, "step": 6670 }, { "epoch": 0.9702251270878722, "grad_norm": 1.155540906901066, "learning_rate": 2.7249372565957277e-08, "loss": 0.5495, "step": 6680 }, { "epoch": 0.971677559912854, "grad_norm": 1.0959456715901421, "learning_rate": 2.4670140237419428e-08, "loss": 0.5483, "step": 6690 }, { "epoch": 0.9731299927378358, "grad_norm": 1.0366185075689953, "learning_rate": 2.2218815958329754e-08, "loss": 0.5497, "step": 6700 }, { "epoch": 0.9745824255628177, "grad_norm": 1.0759294981597065, "learning_rate": 1.9895462748450444e-08, "loss": 0.5634, "step": 6710 }, { "epoch": 0.9760348583877996, "grad_norm": 1.1209995693338786, "learning_rate": 1.770014033760592e-08, "loss": 0.5508, "step": 6720 }, { "epoch": 0.9774872912127814, "grad_norm": 1.210238366549934, "learning_rate": 1.5632905164145173e-08, "loss": 0.5813, "step": 6730 }, { "epoch": 0.9789397240377633, "grad_norm": 1.15542524575641, "learning_rate": 1.3693810373494598e-08, "loss": 0.5421, "step": 6740 }, { "epoch": 0.9803921568627451, "grad_norm": 1.194050906215969, "learning_rate": 1.188290581678575e-08, "loss": 0.5586, "step": 6750 }, { "epoch": 0.9818445896877269, "grad_norm": 1.1566645017111077, "learning_rate": 1.0200238049580258e-08, "loss": 0.5632, "step": 6760 }, { "epoch": 0.9832970225127088, "grad_norm": 1.0710546930410338, "learning_rate": 8.645850330668559e-09, "loss": 0.5368, "step": 6770 }, { "epoch": 0.9847494553376906, "grad_norm": 1.175731861197897, "learning_rate": 7.219782620958571e-09, "loss": 0.5388, "step": 6780 }, { "epoch": 0.9862018881626725, "grad_norm": 1.0791848418311811, "learning_rate": 5.922071582449285e-09, "loss": 0.5585, "step": 6790 }, { "epoch": 0.9876543209876543, "grad_norm": 1.21651622954666, "learning_rate": 4.752750577288745e-09, "loss": 0.5603, "step": 6800 }, { "epoch": 0.9891067538126361, "grad_norm": 1.294701087862953, "learning_rate": 3.711849666914735e-09, "loss": 0.5713, "step": 6810 }, { "epoch": 0.990559186637618, "grad_norm": 1.100757408335571, "learning_rate": 2.799395611281508e-09, "loss": 0.5587, "step": 6820 }, { "epoch": 0.9920116194625999, "grad_norm": 1.282263624241459, "learning_rate": 2.0154118681753322e-09, "loss": 0.5588, "step": 6830 }, { "epoch": 0.9934640522875817, "grad_norm": 1.0975199346392859, "learning_rate": 1.3599185926072012e-09, "loss": 0.5724, "step": 6840 }, { "epoch": 0.9949164851125636, "grad_norm": 1.1620574281790235, "learning_rate": 8.329326362976897e-10, "loss": 0.5621, "step": 6850 }, { "epoch": 0.9963689179375453, "grad_norm": 1.1717561623715795, "learning_rate": 4.34467547242301e-10, "loss": 0.5506, "step": 6860 }, { "epoch": 0.9978213507625272, "grad_norm": 1.155270191238308, "learning_rate": 1.645335693623018e-10, "loss": 0.5533, "step": 6870 }, { "epoch": 0.9992737835875091, "grad_norm": 1.240301119345841, "learning_rate": 2.3137642244375202e-11, "loss": 0.5538, "step": 6880 }, { "epoch": 1.0, "step": 6885, "total_flos": 1942329112002560.0, "train_loss": 0.5927019230420812, "train_runtime": 56356.5973, "train_samples_per_second": 1.955, "train_steps_per_second": 0.122 } ], "logging_steps": 10, "max_steps": 6885, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1942329112002560.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }