{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.8119891008174385, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006811989100817439, "grad_norm": 11.914588928222656, "learning_rate": 3.6e-06, "loss": 0.9281, "step": 10 }, { "epoch": 0.013623978201634877, "grad_norm": 2.2081298828125, "learning_rate": 7.6e-06, "loss": 0.4782, "step": 20 }, { "epoch": 0.020435967302452316, "grad_norm": 1.648837924003601, "learning_rate": 1.16e-05, "loss": 0.3026, "step": 30 }, { "epoch": 0.027247956403269755, "grad_norm": 1.3150748014450073, "learning_rate": 1.56e-05, "loss": 0.2283, "step": 40 }, { "epoch": 0.0340599455040872, "grad_norm": 1.7804750204086304, "learning_rate": 1.9600000000000002e-05, "loss": 0.2138, "step": 50 }, { "epoch": 0.04087193460490463, "grad_norm": 1.8128317594528198, "learning_rate": 2.36e-05, "loss": 0.2035, "step": 60 }, { "epoch": 0.047683923705722074, "grad_norm": 1.4426037073135376, "learning_rate": 2.7600000000000003e-05, "loss": 0.1941, "step": 70 }, { "epoch": 0.05449591280653951, "grad_norm": 2.272278070449829, "learning_rate": 3.16e-05, "loss": 0.179, "step": 80 }, { "epoch": 0.06130790190735695, "grad_norm": 1.347985863685608, "learning_rate": 3.56e-05, "loss": 0.148, "step": 90 }, { "epoch": 0.0681198910081744, "grad_norm": 1.5960944890975952, "learning_rate": 3.960000000000001e-05, "loss": 0.1421, "step": 100 }, { "epoch": 0.07493188010899182, "grad_norm": 0.8870837688446045, "learning_rate": 4.36e-05, "loss": 0.1422, "step": 110 }, { "epoch": 0.08174386920980926, "grad_norm": 1.381057858467102, "learning_rate": 4.76e-05, "loss": 0.1392, "step": 120 }, { "epoch": 0.0885558583106267, "grad_norm": 1.140463948249817, "learning_rate": 5.16e-05, "loss": 0.13, "step": 130 }, { "epoch": 0.09536784741144415, "grad_norm": 1.0376285314559937, "learning_rate": 5.560000000000001e-05, "loss": 0.1254, "step": 140 }, { "epoch": 0.10217983651226158, "grad_norm": 0.6157049536705017, "learning_rate": 5.96e-05, "loss": 0.1216, "step": 150 }, { "epoch": 0.10899182561307902, "grad_norm": 1.6728339195251465, "learning_rate": 6.36e-05, "loss": 0.1169, "step": 160 }, { "epoch": 0.11580381471389646, "grad_norm": 1.3726643323898315, "learning_rate": 6.76e-05, "loss": 0.1201, "step": 170 }, { "epoch": 0.1226158038147139, "grad_norm": 1.2212395668029785, "learning_rate": 7.16e-05, "loss": 0.12, "step": 180 }, { "epoch": 0.12942779291553133, "grad_norm": 1.034472107887268, "learning_rate": 7.560000000000001e-05, "loss": 0.098, "step": 190 }, { "epoch": 0.1362397820163488, "grad_norm": 0.7338757514953613, "learning_rate": 7.960000000000001e-05, "loss": 0.0965, "step": 200 }, { "epoch": 0.14305177111716622, "grad_norm": 0.62514328956604, "learning_rate": 8.36e-05, "loss": 0.1108, "step": 210 }, { "epoch": 0.14986376021798364, "grad_norm": 0.8058353662490845, "learning_rate": 8.76e-05, "loss": 0.0948, "step": 220 }, { "epoch": 0.1566757493188011, "grad_norm": 0.837098240852356, "learning_rate": 9.16e-05, "loss": 0.0948, "step": 230 }, { "epoch": 0.16348773841961853, "grad_norm": 1.094609260559082, "learning_rate": 9.56e-05, "loss": 0.0935, "step": 240 }, { "epoch": 0.17029972752043596, "grad_norm": 1.555716872215271, "learning_rate": 9.960000000000001e-05, "loss": 0.0949, "step": 250 }, { "epoch": 0.1771117166212534, "grad_norm": 0.8324354290962219, "learning_rate": 9.999911419878559e-05, "loss": 0.0894, "step": 260 }, { "epoch": 0.18392370572207084, "grad_norm": 1.3196247816085815, "learning_rate": 9.999605221019081e-05, "loss": 0.1098, "step": 270 }, { "epoch": 0.1907356948228883, "grad_norm": 0.6212723255157471, "learning_rate": 9.999080323230761e-05, "loss": 0.0848, "step": 280 }, { "epoch": 0.19754768392370572, "grad_norm": 0.9073509573936462, "learning_rate": 9.998336749474329e-05, "loss": 0.099, "step": 290 }, { "epoch": 0.20435967302452315, "grad_norm": 0.8732384443283081, "learning_rate": 9.997374532276107e-05, "loss": 0.0838, "step": 300 }, { "epoch": 0.2111716621253406, "grad_norm": 1.0056062936782837, "learning_rate": 9.996193713726596e-05, "loss": 0.0818, "step": 310 }, { "epoch": 0.21798365122615804, "grad_norm": 0.7375788688659668, "learning_rate": 9.994794345478624e-05, "loss": 0.0768, "step": 320 }, { "epoch": 0.22479564032697547, "grad_norm": 1.0606452226638794, "learning_rate": 9.99317648874509e-05, "loss": 0.0845, "step": 330 }, { "epoch": 0.23160762942779292, "grad_norm": 0.699203610420227, "learning_rate": 9.991340214296292e-05, "loss": 0.0767, "step": 340 }, { "epoch": 0.23841961852861035, "grad_norm": 1.3575586080551147, "learning_rate": 9.989285602456819e-05, "loss": 0.075, "step": 350 }, { "epoch": 0.2452316076294278, "grad_norm": 0.7841135263442993, "learning_rate": 9.98701274310205e-05, "loss": 0.0728, "step": 360 }, { "epoch": 0.25204359673024523, "grad_norm": 0.7767183184623718, "learning_rate": 9.984521735654218e-05, "loss": 0.0769, "step": 370 }, { "epoch": 0.25885558583106266, "grad_norm": 0.483733594417572, "learning_rate": 9.981812689078057e-05, "loss": 0.0669, "step": 380 }, { "epoch": 0.2656675749318801, "grad_norm": 0.4933801591396332, "learning_rate": 9.978885721876041e-05, "loss": 0.0696, "step": 390 }, { "epoch": 0.2724795640326976, "grad_norm": 0.6362014412879944, "learning_rate": 9.975740962083198e-05, "loss": 0.0678, "step": 400 }, { "epoch": 0.279291553133515, "grad_norm": 0.7683391571044922, "learning_rate": 9.972378547261504e-05, "loss": 0.0735, "step": 410 }, { "epoch": 0.28610354223433243, "grad_norm": 0.8926170468330383, "learning_rate": 9.968798624493885e-05, "loss": 0.0656, "step": 420 }, { "epoch": 0.29291553133514986, "grad_norm": 0.6009325385093689, "learning_rate": 9.965001350377753e-05, "loss": 0.074, "step": 430 }, { "epoch": 0.2997275204359673, "grad_norm": 0.45287570357322693, "learning_rate": 9.960986891018183e-05, "loss": 0.0602, "step": 440 }, { "epoch": 0.3065395095367847, "grad_norm": 0.47310397028923035, "learning_rate": 9.95675542202063e-05, "loss": 0.0619, "step": 450 }, { "epoch": 0.3133514986376022, "grad_norm": 1.0169494152069092, "learning_rate": 9.952307128483256e-05, "loss": 0.0709, "step": 460 }, { "epoch": 0.3201634877384196, "grad_norm": 0.7056031227111816, "learning_rate": 9.947642204988835e-05, "loss": 0.0666, "step": 470 }, { "epoch": 0.32697547683923706, "grad_norm": 0.817714512348175, "learning_rate": 9.942760855596226e-05, "loss": 0.0734, "step": 480 }, { "epoch": 0.3337874659400545, "grad_norm": 0.5847324728965759, "learning_rate": 9.937663293831471e-05, "loss": 0.0643, "step": 490 }, { "epoch": 0.3405994550408719, "grad_norm": 0.8371860384941101, "learning_rate": 9.932349742678433e-05, "loss": 0.0617, "step": 500 }, { "epoch": 0.3474114441416894, "grad_norm": 0.47964030504226685, "learning_rate": 9.926820434569051e-05, "loss": 0.0649, "step": 510 }, { "epoch": 0.3542234332425068, "grad_norm": 0.6083551645278931, "learning_rate": 9.921075611373179e-05, "loss": 0.0634, "step": 520 }, { "epoch": 0.36103542234332425, "grad_norm": 0.6717512011528015, "learning_rate": 9.915115524387988e-05, "loss": 0.0637, "step": 530 }, { "epoch": 0.3678474114441417, "grad_norm": 0.7015753984451294, "learning_rate": 9.908940434326997e-05, "loss": 0.0633, "step": 540 }, { "epoch": 0.3746594005449591, "grad_norm": 0.6212232708930969, "learning_rate": 9.902550611308645e-05, "loss": 0.0593, "step": 550 }, { "epoch": 0.3814713896457766, "grad_norm": 0.6970530152320862, "learning_rate": 9.895946334844494e-05, "loss": 0.0579, "step": 560 }, { "epoch": 0.388283378746594, "grad_norm": 0.5176441669464111, "learning_rate": 9.889127893826989e-05, "loss": 0.0559, "step": 570 }, { "epoch": 0.39509536784741145, "grad_norm": 0.44634121656417847, "learning_rate": 9.882095586516831e-05, "loss": 0.0576, "step": 580 }, { "epoch": 0.4019073569482289, "grad_norm": 0.6069617867469788, "learning_rate": 9.874849720529921e-05, "loss": 0.0608, "step": 590 }, { "epoch": 0.4087193460490463, "grad_norm": 0.610893726348877, "learning_rate": 9.867390612823914e-05, "loss": 0.0592, "step": 600 }, { "epoch": 0.41553133514986373, "grad_norm": 0.3479655683040619, "learning_rate": 9.859718589684344e-05, "loss": 0.0658, "step": 610 }, { "epoch": 0.4223433242506812, "grad_norm": 0.7216345071792603, "learning_rate": 9.851833986710353e-05, "loss": 0.056, "step": 620 }, { "epoch": 0.42915531335149865, "grad_norm": 0.5811245441436768, "learning_rate": 9.843737148800023e-05, "loss": 0.0489, "step": 630 }, { "epoch": 0.4359673024523161, "grad_norm": 0.6808714866638184, "learning_rate": 9.835428430135271e-05, "loss": 0.0489, "step": 640 }, { "epoch": 0.4427792915531335, "grad_norm": 0.53304123878479, "learning_rate": 9.82690819416637e-05, "loss": 0.0551, "step": 650 }, { "epoch": 0.44959128065395093, "grad_norm": 0.5133969783782959, "learning_rate": 9.818176813596041e-05, "loss": 0.0523, "step": 660 }, { "epoch": 0.4564032697547684, "grad_norm": 0.42300981283187866, "learning_rate": 9.809234670363159e-05, "loss": 0.0635, "step": 670 }, { "epoch": 0.46321525885558584, "grad_norm": 0.5513554811477661, "learning_rate": 9.800082155626034e-05, "loss": 0.0571, "step": 680 }, { "epoch": 0.47002724795640327, "grad_norm": 0.4101255238056183, "learning_rate": 9.790719669745312e-05, "loss": 0.0571, "step": 690 }, { "epoch": 0.4768392370572207, "grad_norm": 0.7928181886672974, "learning_rate": 9.781147622266455e-05, "loss": 0.0576, "step": 700 }, { "epoch": 0.48365122615803813, "grad_norm": 0.6665974855422974, "learning_rate": 9.771366431901831e-05, "loss": 0.0626, "step": 710 }, { "epoch": 0.4904632152588556, "grad_norm": 0.43793386220932007, "learning_rate": 9.761376526512394e-05, "loss": 0.0567, "step": 720 }, { "epoch": 0.49727520435967304, "grad_norm": 0.4338440001010895, "learning_rate": 9.751178343088963e-05, "loss": 0.0519, "step": 730 }, { "epoch": 0.5040871934604905, "grad_norm": 0.47942614555358887, "learning_rate": 9.740772327733123e-05, "loss": 0.0446, "step": 740 }, { "epoch": 0.510899182561308, "grad_norm": 0.3280750811100006, "learning_rate": 9.730158935637697e-05, "loss": 0.052, "step": 750 }, { "epoch": 0.5177111716621253, "grad_norm": 0.6672011017799377, "learning_rate": 9.719338631066834e-05, "loss": 0.0417, "step": 760 }, { "epoch": 0.5245231607629428, "grad_norm": 0.386070191860199, "learning_rate": 9.708311887335713e-05, "loss": 0.0436, "step": 770 }, { "epoch": 0.5313351498637602, "grad_norm": 0.428362637758255, "learning_rate": 9.697079186789823e-05, "loss": 0.0539, "step": 780 }, { "epoch": 0.5381471389645777, "grad_norm": 0.4888722896575928, "learning_rate": 9.685641020783876e-05, "loss": 0.0517, "step": 790 }, { "epoch": 0.5449591280653951, "grad_norm": 0.4673832952976227, "learning_rate": 9.67399788966031e-05, "loss": 0.0523, "step": 800 }, { "epoch": 0.5517711171662125, "grad_norm": 0.29115697741508484, "learning_rate": 9.662150302727395e-05, "loss": 0.0521, "step": 810 }, { "epoch": 0.55858310626703, "grad_norm": 0.827384352684021, "learning_rate": 9.650098778236968e-05, "loss": 0.0477, "step": 820 }, { "epoch": 0.5653950953678474, "grad_norm": 0.33872804045677185, "learning_rate": 9.637843843361749e-05, "loss": 0.0471, "step": 830 }, { "epoch": 0.5722070844686649, "grad_norm": 0.3877025842666626, "learning_rate": 9.62538603417229e-05, "loss": 0.0426, "step": 840 }, { "epoch": 0.5790190735694822, "grad_norm": 0.42489877343177795, "learning_rate": 9.612725895613526e-05, "loss": 0.0551, "step": 850 }, { "epoch": 0.5858310626702997, "grad_norm": 0.3988380432128906, "learning_rate": 9.599863981480926e-05, "loss": 0.0487, "step": 860 }, { "epoch": 0.5926430517711172, "grad_norm": 0.5466487407684326, "learning_rate": 9.586800854396283e-05, "loss": 0.0467, "step": 870 }, { "epoch": 0.5994550408719346, "grad_norm": 0.37913820147514343, "learning_rate": 9.573537085783095e-05, "loss": 0.0399, "step": 880 }, { "epoch": 0.6062670299727521, "grad_norm": 0.47171854972839355, "learning_rate": 9.560073255841571e-05, "loss": 0.0402, "step": 890 }, { "epoch": 0.6130790190735694, "grad_norm": 0.5175816416740417, "learning_rate": 9.546409953523247e-05, "loss": 0.0514, "step": 900 }, { "epoch": 0.6198910081743869, "grad_norm": 0.3573410212993622, "learning_rate": 9.532547776505229e-05, "loss": 0.0384, "step": 910 }, { "epoch": 0.6267029972752044, "grad_norm": 0.4385380148887634, "learning_rate": 9.518487331164048e-05, "loss": 0.0478, "step": 920 }, { "epoch": 0.6335149863760218, "grad_norm": 0.49251607060432434, "learning_rate": 9.504229232549134e-05, "loss": 0.0421, "step": 930 }, { "epoch": 0.6403269754768393, "grad_norm": 0.33070334792137146, "learning_rate": 9.489774104355909e-05, "loss": 0.0402, "step": 940 }, { "epoch": 0.6471389645776566, "grad_norm": 0.616314709186554, "learning_rate": 9.475122578898507e-05, "loss": 0.0561, "step": 950 }, { "epoch": 0.6539509536784741, "grad_norm": 0.7195887565612793, "learning_rate": 9.460275297082119e-05, "loss": 0.0392, "step": 960 }, { "epoch": 0.6607629427792916, "grad_norm": 0.6277886033058167, "learning_rate": 9.445232908374948e-05, "loss": 0.0433, "step": 970 }, { "epoch": 0.667574931880109, "grad_norm": 0.7239585518836975, "learning_rate": 9.429996070779808e-05, "loss": 0.0513, "step": 980 }, { "epoch": 0.6743869209809265, "grad_norm": 0.8502191305160522, "learning_rate": 9.414565450805333e-05, "loss": 0.052, "step": 990 }, { "epoch": 0.6811989100817438, "grad_norm": 0.26333141326904297, "learning_rate": 9.398941723436831e-05, "loss": 0.0467, "step": 1000 }, { "epoch": 0.6880108991825613, "grad_norm": 0.6643750071525574, "learning_rate": 9.383125572106752e-05, "loss": 0.0416, "step": 1010 }, { "epoch": 0.6948228882833788, "grad_norm": 0.4409726858139038, "learning_rate": 9.367117688664791e-05, "loss": 0.0484, "step": 1020 }, { "epoch": 0.7016348773841962, "grad_norm": 0.575430154800415, "learning_rate": 9.35091877334763e-05, "loss": 0.0495, "step": 1030 }, { "epoch": 0.7084468664850136, "grad_norm": 0.4284694492816925, "learning_rate": 9.334529534748297e-05, "loss": 0.0442, "step": 1040 }, { "epoch": 0.715258855585831, "grad_norm": 0.4911642372608185, "learning_rate": 9.317950689785188e-05, "loss": 0.0405, "step": 1050 }, { "epoch": 0.7220708446866485, "grad_norm": 0.7064844965934753, "learning_rate": 9.301182963670688e-05, "loss": 0.054, "step": 1060 }, { "epoch": 0.728882833787466, "grad_norm": 0.7730808258056641, "learning_rate": 9.284227089879456e-05, "loss": 0.048, "step": 1070 }, { "epoch": 0.7356948228882834, "grad_norm": 0.3709051012992859, "learning_rate": 9.26708381011634e-05, "loss": 0.0505, "step": 1080 }, { "epoch": 0.7425068119891008, "grad_norm": 0.5168929100036621, "learning_rate": 9.249753874283937e-05, "loss": 0.0367, "step": 1090 }, { "epoch": 0.7493188010899182, "grad_norm": 0.4022100269794464, "learning_rate": 9.232238040449779e-05, "loss": 0.0421, "step": 1100 }, { "epoch": 0.7561307901907357, "grad_norm": 0.4457673728466034, "learning_rate": 9.214537074813181e-05, "loss": 0.0417, "step": 1110 }, { "epoch": 0.7629427792915532, "grad_norm": 0.5297082662582397, "learning_rate": 9.196651751671724e-05, "loss": 0.0429, "step": 1120 }, { "epoch": 0.7697547683923706, "grad_norm": 0.2837148904800415, "learning_rate": 9.178582853387384e-05, "loss": 0.0435, "step": 1130 }, { "epoch": 0.776566757493188, "grad_norm": 0.49884119629859924, "learning_rate": 9.160331170352304e-05, "loss": 0.0428, "step": 1140 }, { "epoch": 0.7833787465940054, "grad_norm": 0.3587488532066345, "learning_rate": 9.141897500954229e-05, "loss": 0.0403, "step": 1150 }, { "epoch": 0.7901907356948229, "grad_norm": 0.4518432319164276, "learning_rate": 9.123282651541576e-05, "loss": 0.0376, "step": 1160 }, { "epoch": 0.7970027247956403, "grad_norm": 0.48306331038475037, "learning_rate": 9.104487436388161e-05, "loss": 0.0414, "step": 1170 }, { "epoch": 0.8038147138964578, "grad_norm": 0.44562119245529175, "learning_rate": 9.085512677657582e-05, "loss": 0.039, "step": 1180 }, { "epoch": 0.8106267029972752, "grad_norm": 0.5168417692184448, "learning_rate": 9.066359205367258e-05, "loss": 0.0433, "step": 1190 }, { "epoch": 0.8174386920980926, "grad_norm": 0.3128747344017029, "learning_rate": 9.047027857352112e-05, "loss": 0.0459, "step": 1200 }, { "epoch": 0.8242506811989101, "grad_norm": 0.43080934882164, "learning_rate": 9.027519479227935e-05, "loss": 0.0418, "step": 1210 }, { "epoch": 0.8310626702997275, "grad_norm": 0.39959844946861267, "learning_rate": 9.007834924354383e-05, "loss": 0.0382, "step": 1220 }, { "epoch": 0.837874659400545, "grad_norm": 0.46520474553108215, "learning_rate": 8.987975053797655e-05, "loss": 0.0435, "step": 1230 }, { "epoch": 0.8446866485013624, "grad_norm": 0.500769317150116, "learning_rate": 8.967940736292825e-05, "loss": 0.0376, "step": 1240 }, { "epoch": 0.8514986376021798, "grad_norm": 0.7235398888587952, "learning_rate": 8.947732848205846e-05, "loss": 0.0495, "step": 1250 }, { "epoch": 0.8583106267029973, "grad_norm": 0.3423875570297241, "learning_rate": 8.927352273495204e-05, "loss": 0.0382, "step": 1260 }, { "epoch": 0.8651226158038147, "grad_norm": 0.5364619493484497, "learning_rate": 8.906799903673265e-05, "loss": 0.0417, "step": 1270 }, { "epoch": 0.8719346049046321, "grad_norm": 0.6217823028564453, "learning_rate": 8.88607663776726e-05, "loss": 0.0436, "step": 1280 }, { "epoch": 0.8787465940054496, "grad_norm": 0.3814774751663208, "learning_rate": 8.865183382279978e-05, "loss": 0.0456, "step": 1290 }, { "epoch": 0.885558583106267, "grad_norm": 0.5894232988357544, "learning_rate": 8.844121051150096e-05, "loss": 0.0413, "step": 1300 }, { "epoch": 0.8923705722070845, "grad_norm": 0.4752817153930664, "learning_rate": 8.822890565712211e-05, "loss": 0.0365, "step": 1310 }, { "epoch": 0.8991825613079019, "grad_norm": 0.3989897072315216, "learning_rate": 8.801492854656536e-05, "loss": 0.0397, "step": 1320 }, { "epoch": 0.9059945504087193, "grad_norm": 0.3819690942764282, "learning_rate": 8.779928853988268e-05, "loss": 0.0351, "step": 1330 }, { "epoch": 0.9128065395095368, "grad_norm": 0.487627774477005, "learning_rate": 8.758199506986655e-05, "loss": 0.038, "step": 1340 }, { "epoch": 0.9196185286103542, "grad_norm": 0.40310102701187134, "learning_rate": 8.73630576416373e-05, "loss": 0.0365, "step": 1350 }, { "epoch": 0.9264305177111717, "grad_norm": 0.30654123425483704, "learning_rate": 8.714248583222726e-05, "loss": 0.04, "step": 1360 }, { "epoch": 0.9332425068119891, "grad_norm": 0.5350182056427002, "learning_rate": 8.692028929016196e-05, "loss": 0.0377, "step": 1370 }, { "epoch": 0.9400544959128065, "grad_norm": 0.39081400632858276, "learning_rate": 8.669647773503797e-05, "loss": 0.0363, "step": 1380 }, { "epoch": 0.946866485013624, "grad_norm": 0.4992840886116028, "learning_rate": 8.647106095709773e-05, "loss": 0.0355, "step": 1390 }, { "epoch": 0.9536784741144414, "grad_norm": 0.3442818224430084, "learning_rate": 8.624404881680139e-05, "loss": 0.0452, "step": 1400 }, { "epoch": 0.9604904632152589, "grad_norm": 0.5338506102561951, "learning_rate": 8.601545124439535e-05, "loss": 0.0358, "step": 1410 }, { "epoch": 0.9673024523160763, "grad_norm": 0.3899770975112915, "learning_rate": 8.5785278239478e-05, "loss": 0.0422, "step": 1420 }, { "epoch": 0.9741144414168937, "grad_norm": 0.5235274434089661, "learning_rate": 8.555353987056224e-05, "loss": 0.0411, "step": 1430 }, { "epoch": 0.9809264305177112, "grad_norm": 0.4164868891239166, "learning_rate": 8.532024627463505e-05, "loss": 0.0351, "step": 1440 }, { "epoch": 0.9877384196185286, "grad_norm": 0.3429436683654785, "learning_rate": 8.508540765671407e-05, "loss": 0.0396, "step": 1450 }, { "epoch": 0.9945504087193461, "grad_norm": 0.45483162999153137, "learning_rate": 8.484903428940121e-05, "loss": 0.0388, "step": 1460 }, { "epoch": 1.0013623978201636, "grad_norm": 0.4117540419101715, "learning_rate": 8.461113651243334e-05, "loss": 0.0396, "step": 1470 }, { "epoch": 1.008174386920981, "grad_norm": 0.44719594717025757, "learning_rate": 8.437172473222987e-05, "loss": 0.0411, "step": 1480 }, { "epoch": 1.0149863760217983, "grad_norm": 0.5068361759185791, "learning_rate": 8.413080942143767e-05, "loss": 0.0343, "step": 1490 }, { "epoch": 1.021798365122616, "grad_norm": 0.43941476941108704, "learning_rate": 8.388840111847288e-05, "loss": 0.045, "step": 1500 }, { "epoch": 1.0286103542234333, "grad_norm": 0.4756196141242981, "learning_rate": 8.364451042705998e-05, "loss": 0.0337, "step": 1510 }, { "epoch": 1.0354223433242506, "grad_norm": 0.3626450002193451, "learning_rate": 8.33991480157679e-05, "loss": 0.0379, "step": 1520 }, { "epoch": 1.042234332425068, "grad_norm": 0.5754261016845703, "learning_rate": 8.315232461754338e-05, "loss": 0.0374, "step": 1530 }, { "epoch": 1.0490463215258856, "grad_norm": 0.45411282777786255, "learning_rate": 8.290405102924144e-05, "loss": 0.0404, "step": 1540 }, { "epoch": 1.055858310626703, "grad_norm": 0.5540292263031006, "learning_rate": 8.265433811115316e-05, "loss": 0.0406, "step": 1550 }, { "epoch": 1.0626702997275204, "grad_norm": 0.4548736810684204, "learning_rate": 8.240319678653049e-05, "loss": 0.0353, "step": 1560 }, { "epoch": 1.069482288828338, "grad_norm": 0.3220965564250946, "learning_rate": 8.215063804110857e-05, "loss": 0.0395, "step": 1570 }, { "epoch": 1.0762942779291553, "grad_norm": 0.33744776248931885, "learning_rate": 8.189667292262512e-05, "loss": 0.0327, "step": 1580 }, { "epoch": 1.0831062670299727, "grad_norm": 0.34971827268600464, "learning_rate": 8.164131254033716e-05, "loss": 0.0382, "step": 1590 }, { "epoch": 1.0899182561307903, "grad_norm": 0.3128986060619354, "learning_rate": 8.138456806453503e-05, "loss": 0.0322, "step": 1600 }, { "epoch": 1.0967302452316077, "grad_norm": 0.2257993221282959, "learning_rate": 8.112645072605386e-05, "loss": 0.0271, "step": 1610 }, { "epoch": 1.103542234332425, "grad_norm": 0.30597376823425293, "learning_rate": 8.086697181578222e-05, "loss": 0.0278, "step": 1620 }, { "epoch": 1.1103542234332424, "grad_norm": 0.31509286165237427, "learning_rate": 8.060614268416823e-05, "loss": 0.0301, "step": 1630 }, { "epoch": 1.11716621253406, "grad_norm": 0.4431317150592804, "learning_rate": 8.034397474072309e-05, "loss": 0.0309, "step": 1640 }, { "epoch": 1.1239782016348774, "grad_norm": 0.4654938578605652, "learning_rate": 8.008047945352193e-05, "loss": 0.0406, "step": 1650 }, { "epoch": 1.1307901907356948, "grad_norm": 0.42640626430511475, "learning_rate": 7.981566834870225e-05, "loss": 0.0299, "step": 1660 }, { "epoch": 1.1376021798365124, "grad_norm": 0.41219788789749146, "learning_rate": 7.954955300995961e-05, "loss": 0.0318, "step": 1670 }, { "epoch": 1.1444141689373297, "grad_norm": 0.3845755159854889, "learning_rate": 7.928214507804104e-05, "loss": 0.0338, "step": 1680 }, { "epoch": 1.151226158038147, "grad_norm": 0.31636008620262146, "learning_rate": 7.901345625023576e-05, "loss": 0.0352, "step": 1690 }, { "epoch": 1.1580381471389645, "grad_norm": 0.34709426760673523, "learning_rate": 7.874349827986354e-05, "loss": 0.0331, "step": 1700 }, { "epoch": 1.164850136239782, "grad_norm": 0.4313192665576935, "learning_rate": 7.847228297576053e-05, "loss": 0.0326, "step": 1710 }, { "epoch": 1.1716621253405994, "grad_norm": 0.4032236933708191, "learning_rate": 7.819982220176276e-05, "loss": 0.0355, "step": 1720 }, { "epoch": 1.1784741144414168, "grad_norm": 0.3324613571166992, "learning_rate": 7.792612787618714e-05, "loss": 0.0355, "step": 1730 }, { "epoch": 1.1852861035422344, "grad_norm": 0.44290757179260254, "learning_rate": 7.765121197131009e-05, "loss": 0.0327, "step": 1740 }, { "epoch": 1.1920980926430518, "grad_norm": 0.28540492057800293, "learning_rate": 7.737508651284391e-05, "loss": 0.0367, "step": 1750 }, { "epoch": 1.1989100817438691, "grad_norm": 0.38834914565086365, "learning_rate": 7.709776357941069e-05, "loss": 0.0373, "step": 1760 }, { "epoch": 1.2057220708446867, "grad_norm": 0.34177857637405396, "learning_rate": 7.681925530201392e-05, "loss": 0.0368, "step": 1770 }, { "epoch": 1.2125340599455041, "grad_norm": 0.45681893825531006, "learning_rate": 7.65395738635079e-05, "loss": 0.0318, "step": 1780 }, { "epoch": 1.2193460490463215, "grad_norm": 0.32232654094696045, "learning_rate": 7.62587314980648e-05, "loss": 0.0365, "step": 1790 }, { "epoch": 1.226158038147139, "grad_norm": 0.2634826898574829, "learning_rate": 7.597674049063947e-05, "loss": 0.0327, "step": 1800 }, { "epoch": 1.2329700272479565, "grad_norm": 0.4753483235836029, "learning_rate": 7.569361317643211e-05, "loss": 0.0337, "step": 1810 }, { "epoch": 1.2397820163487738, "grad_norm": 0.3038065433502197, "learning_rate": 7.540936194034865e-05, "loss": 0.0309, "step": 1820 }, { "epoch": 1.2465940054495912, "grad_norm": 0.32555919885635376, "learning_rate": 7.512399921645901e-05, "loss": 0.0313, "step": 1830 }, { "epoch": 1.2534059945504088, "grad_norm": 0.3383468985557556, "learning_rate": 7.483753748745317e-05, "loss": 0.032, "step": 1840 }, { "epoch": 1.2602179836512262, "grad_norm": 0.26944777369499207, "learning_rate": 7.454998928409516e-05, "loss": 0.0308, "step": 1850 }, { "epoch": 1.2670299727520435, "grad_norm": 0.2938184142112732, "learning_rate": 7.426136718467493e-05, "loss": 0.0324, "step": 1860 }, { "epoch": 1.273841961852861, "grad_norm": 0.276143878698349, "learning_rate": 7.397168381445812e-05, "loss": 0.0325, "step": 1870 }, { "epoch": 1.2806539509536785, "grad_norm": 0.3054909408092499, "learning_rate": 7.368095184513377e-05, "loss": 0.03, "step": 1880 }, { "epoch": 1.2874659400544959, "grad_norm": 0.24084536731243134, "learning_rate": 7.338918399426005e-05, "loss": 0.0274, "step": 1890 }, { "epoch": 1.2942779291553133, "grad_norm": 0.41324862837791443, "learning_rate": 7.309639302470801e-05, "loss": 0.0348, "step": 1900 }, { "epoch": 1.3010899182561309, "grad_norm": 0.29731935262680054, "learning_rate": 7.280259174410312e-05, "loss": 0.0312, "step": 1910 }, { "epoch": 1.3079019073569482, "grad_norm": 0.22514300048351288, "learning_rate": 7.250779300426517e-05, "loss": 0.0312, "step": 1920 }, { "epoch": 1.3147138964577656, "grad_norm": 0.5704501271247864, "learning_rate": 7.22120097006461e-05, "loss": 0.0325, "step": 1930 }, { "epoch": 1.3215258855585832, "grad_norm": 0.27702492475509644, "learning_rate": 7.191525477176577e-05, "loss": 0.0321, "step": 1940 }, { "epoch": 1.3283378746594006, "grad_norm": 0.34598076343536377, "learning_rate": 7.161754119864616e-05, "loss": 0.0298, "step": 1950 }, { "epoch": 1.335149863760218, "grad_norm": 0.24778622388839722, "learning_rate": 7.131888200424339e-05, "loss": 0.0277, "step": 1960 }, { "epoch": 1.3419618528610355, "grad_norm": 0.2454395443201065, "learning_rate": 7.101929025287816e-05, "loss": 0.0357, "step": 1970 }, { "epoch": 1.348773841961853, "grad_norm": 0.47679805755615234, "learning_rate": 7.071877904966423e-05, "loss": 0.0378, "step": 1980 }, { "epoch": 1.3555858310626703, "grad_norm": 0.2696547210216522, "learning_rate": 7.04173615399351e-05, "loss": 0.0299, "step": 1990 }, { "epoch": 1.3623978201634879, "grad_norm": 0.3305070698261261, "learning_rate": 7.011505090866913e-05, "loss": 0.0298, "step": 2000 }, { "epoch": 1.3692098092643052, "grad_norm": 0.35810503363609314, "learning_rate": 6.981186037991271e-05, "loss": 0.0304, "step": 2010 }, { "epoch": 1.3760217983651226, "grad_norm": 0.314117968082428, "learning_rate": 6.950780321620174e-05, "loss": 0.0352, "step": 2020 }, { "epoch": 1.38283378746594, "grad_norm": 0.33775216341018677, "learning_rate": 6.920289271798157e-05, "loss": 0.0378, "step": 2030 }, { "epoch": 1.3896457765667574, "grad_norm": 0.33370664715766907, "learning_rate": 6.889714222302517e-05, "loss": 0.0336, "step": 2040 }, { "epoch": 1.396457765667575, "grad_norm": 0.48640260100364685, "learning_rate": 6.85905651058497e-05, "loss": 0.0323, "step": 2050 }, { "epoch": 1.4032697547683923, "grad_norm": 0.3220215141773224, "learning_rate": 6.82831747771314e-05, "loss": 0.0276, "step": 2060 }, { "epoch": 1.4100817438692097, "grad_norm": 0.32791373133659363, "learning_rate": 6.797498468311907e-05, "loss": 0.0287, "step": 2070 }, { "epoch": 1.4168937329700273, "grad_norm": 0.36337828636169434, "learning_rate": 6.766600830504585e-05, "loss": 0.0291, "step": 2080 }, { "epoch": 1.4237057220708447, "grad_norm": 0.3391413390636444, "learning_rate": 6.735625915853942e-05, "loss": 0.0284, "step": 2090 }, { "epoch": 1.430517711171662, "grad_norm": 0.35755249857902527, "learning_rate": 6.70457507930309e-05, "loss": 0.0274, "step": 2100 }, { "epoch": 1.4373297002724796, "grad_norm": 0.2682415843009949, "learning_rate": 6.673449679116215e-05, "loss": 0.0274, "step": 2110 }, { "epoch": 1.444141689373297, "grad_norm": 0.475309818983078, "learning_rate": 6.642251076819148e-05, "loss": 0.0262, "step": 2120 }, { "epoch": 1.4509536784741144, "grad_norm": 0.3676445186138153, "learning_rate": 6.610980637139827e-05, "loss": 0.0318, "step": 2130 }, { "epoch": 1.457765667574932, "grad_norm": 0.45259350538253784, "learning_rate": 6.579639727948583e-05, "loss": 0.0296, "step": 2140 }, { "epoch": 1.4645776566757494, "grad_norm": 0.38819339871406555, "learning_rate": 6.548229720198315e-05, "loss": 0.0334, "step": 2150 }, { "epoch": 1.4713896457765667, "grad_norm": 0.4020323157310486, "learning_rate": 6.516751987864517e-05, "loss": 0.0273, "step": 2160 }, { "epoch": 1.4782016348773843, "grad_norm": 0.1928047388792038, "learning_rate": 6.485207907885175e-05, "loss": 0.0266, "step": 2170 }, { "epoch": 1.4850136239782017, "grad_norm": 0.442618727684021, "learning_rate": 6.453598860100536e-05, "loss": 0.0299, "step": 2180 }, { "epoch": 1.491825613079019, "grad_norm": 0.36381062865257263, "learning_rate": 6.421926227192749e-05, "loss": 0.0252, "step": 2190 }, { "epoch": 1.4986376021798364, "grad_norm": 0.4495033621788025, "learning_rate": 6.390191394625381e-05, "loss": 0.0265, "step": 2200 }, { "epoch": 1.5054495912806538, "grad_norm": 0.3564695715904236, "learning_rate": 6.358395750582817e-05, "loss": 0.026, "step": 2210 }, { "epoch": 1.5122615803814714, "grad_norm": 0.28276216983795166, "learning_rate": 6.326540685909532e-05, "loss": 0.0245, "step": 2220 }, { "epoch": 1.5190735694822888, "grad_norm": 0.44450217485427856, "learning_rate": 6.294627594049249e-05, "loss": 0.0253, "step": 2230 }, { "epoch": 1.5258855585831061, "grad_norm": 0.2726491391658783, "learning_rate": 6.262657870983989e-05, "loss": 0.0258, "step": 2240 }, { "epoch": 1.5326975476839237, "grad_norm": 0.35235723853111267, "learning_rate": 6.230632915173009e-05, "loss": 0.0303, "step": 2250 }, { "epoch": 1.5395095367847411, "grad_norm": 0.2119748741388321, "learning_rate": 6.198554127491622e-05, "loss": 0.029, "step": 2260 }, { "epoch": 1.5463215258855585, "grad_norm": 0.34444141387939453, "learning_rate": 6.166422911169923e-05, "loss": 0.0269, "step": 2270 }, { "epoch": 1.553133514986376, "grad_norm": 0.2883770763874054, "learning_rate": 6.1342406717314e-05, "loss": 0.0303, "step": 2280 }, { "epoch": 1.5599455040871935, "grad_norm": 0.2837648093700409, "learning_rate": 6.102008816931466e-05, "loss": 0.0272, "step": 2290 }, { "epoch": 1.5667574931880108, "grad_norm": 0.2236020863056183, "learning_rate": 6.069728756695866e-05, "loss": 0.0234, "step": 2300 }, { "epoch": 1.5735694822888284, "grad_norm": 0.4470672607421875, "learning_rate": 6.037401903059008e-05, "loss": 0.032, "step": 2310 }, { "epoch": 1.5803814713896458, "grad_norm": 0.3020336627960205, "learning_rate": 6.005029670102195e-05, "loss": 0.0227, "step": 2320 }, { "epoch": 1.5871934604904632, "grad_norm": 0.27960023283958435, "learning_rate": 5.972613473891766e-05, "loss": 0.0335, "step": 2330 }, { "epoch": 1.5940054495912808, "grad_norm": 0.308479368686676, "learning_rate": 5.940154732417158e-05, "loss": 0.0297, "step": 2340 }, { "epoch": 1.6008174386920981, "grad_norm": 0.3311978876590729, "learning_rate": 5.907654865528876e-05, "loss": 0.0312, "step": 2350 }, { "epoch": 1.6076294277929155, "grad_norm": 0.26757732033729553, "learning_rate": 5.875115294876381e-05, "loss": 0.0234, "step": 2360 }, { "epoch": 1.614441416893733, "grad_norm": 0.40103888511657715, "learning_rate": 5.842537443845908e-05, "loss": 0.0274, "step": 2370 }, { "epoch": 1.6212534059945503, "grad_norm": 0.17837531864643097, "learning_rate": 5.809922737498198e-05, "loss": 0.0225, "step": 2380 }, { "epoch": 1.6280653950953679, "grad_norm": 0.42968425154685974, "learning_rate": 5.777272602506165e-05, "loss": 0.027, "step": 2390 }, { "epoch": 1.6348773841961854, "grad_norm": 0.24213114380836487, "learning_rate": 5.744588467092483e-05, "loss": 0.0265, "step": 2400 }, { "epoch": 1.6416893732970026, "grad_norm": 0.3060871660709381, "learning_rate": 5.7118717609671194e-05, "loss": 0.0235, "step": 2410 }, { "epoch": 1.6485013623978202, "grad_norm": 0.20384085178375244, "learning_rate": 5.679123915264786e-05, "loss": 0.0261, "step": 2420 }, { "epoch": 1.6553133514986376, "grad_norm": 0.3139786720275879, "learning_rate": 5.646346362482342e-05, "loss": 0.0225, "step": 2430 }, { "epoch": 1.662125340599455, "grad_norm": 0.2353772073984146, "learning_rate": 5.613540536416132e-05, "loss": 0.0273, "step": 2440 }, { "epoch": 1.6689373297002725, "grad_norm": 0.3663155436515808, "learning_rate": 5.5807078720992645e-05, "loss": 0.0237, "step": 2450 }, { "epoch": 1.67574931880109, "grad_norm": 0.4667767882347107, "learning_rate": 5.547849805738836e-05, "loss": 0.0308, "step": 2460 }, { "epoch": 1.6825613079019073, "grad_norm": 0.2913496792316437, "learning_rate": 5.514967774653118e-05, "loss": 0.0222, "step": 2470 }, { "epoch": 1.6893732970027249, "grad_norm": 0.22617073357105255, "learning_rate": 5.482063217208674e-05, "loss": 0.0251, "step": 2480 }, { "epoch": 1.6961852861035422, "grad_norm": 0.3499128222465515, "learning_rate": 5.449137572757439e-05, "loss": 0.0216, "step": 2490 }, { "epoch": 1.7029972752043596, "grad_norm": 0.24365057051181793, "learning_rate": 5.4161922815737696e-05, "loss": 0.0268, "step": 2500 }, { "epoch": 1.7098092643051772, "grad_norm": 0.21294479072093964, "learning_rate": 5.3832287847914276e-05, "loss": 0.0273, "step": 2510 }, { "epoch": 1.7166212534059946, "grad_norm": 0.31520646810531616, "learning_rate": 5.35024852434055e-05, "loss": 0.0258, "step": 2520 }, { "epoch": 1.723433242506812, "grad_norm": 0.4261656403541565, "learning_rate": 5.317252942884567e-05, "loss": 0.0231, "step": 2530 }, { "epoch": 1.7302452316076296, "grad_norm": 0.29408591985702515, "learning_rate": 5.284243483757109e-05, "loss": 0.0304, "step": 2540 }, { "epoch": 1.7370572207084467, "grad_norm": 0.333383172750473, "learning_rate": 5.2512215908988484e-05, "loss": 0.0295, "step": 2550 }, { "epoch": 1.7438692098092643, "grad_norm": 0.2510589361190796, "learning_rate": 5.218188708794357e-05, "loss": 0.0254, "step": 2560 }, { "epoch": 1.750681198910082, "grad_norm": 0.3071255385875702, "learning_rate": 5.18514628240891e-05, "loss": 0.0233, "step": 2570 }, { "epoch": 1.757493188010899, "grad_norm": 0.3328297436237335, "learning_rate": 5.1520957571252795e-05, "loss": 0.0237, "step": 2580 }, { "epoch": 1.7643051771117166, "grad_norm": 0.2048969864845276, "learning_rate": 5.1190385786805106e-05, "loss": 0.0278, "step": 2590 }, { "epoch": 1.771117166212534, "grad_norm": 0.4445406496524811, "learning_rate": 5.085976193102677e-05, "loss": 0.0247, "step": 2600 }, { "epoch": 1.7779291553133514, "grad_norm": 0.2530488967895508, "learning_rate": 5.052910046647634e-05, "loss": 0.0218, "step": 2610 }, { "epoch": 1.784741144414169, "grad_norm": 0.31554245948791504, "learning_rate": 5.0198415857357464e-05, "loss": 0.0237, "step": 2620 }, { "epoch": 1.7915531335149864, "grad_norm": 0.2431655079126358, "learning_rate": 4.9867722568886223e-05, "loss": 0.0214, "step": 2630 }, { "epoch": 1.7983651226158037, "grad_norm": 0.28798162937164307, "learning_rate": 4.9537035066658314e-05, "loss": 0.0213, "step": 2640 }, { "epoch": 1.8051771117166213, "grad_norm": 0.25857627391815186, "learning_rate": 4.920636781601638e-05, "loss": 0.0272, "step": 2650 }, { "epoch": 1.8119891008174387, "grad_norm": 0.2804415225982666, "learning_rate": 4.88757352814172e-05, "loss": 0.0288, "step": 2660 }, { "epoch": 1.818801089918256, "grad_norm": 0.23555926978588104, "learning_rate": 4.8545151925798924e-05, "loss": 0.0247, "step": 2670 }, { "epoch": 1.8256130790190737, "grad_norm": 0.3501521050930023, "learning_rate": 4.821463220994848e-05, "loss": 0.026, "step": 2680 }, { "epoch": 1.832425068119891, "grad_norm": 0.3100302517414093, "learning_rate": 4.788419059186895e-05, "loss": 0.021, "step": 2690 }, { "epoch": 1.8392370572207084, "grad_norm": 0.28045013546943665, "learning_rate": 4.7553841526147205e-05, "loss": 0.0257, "step": 2700 }, { "epoch": 1.846049046321526, "grad_norm": 0.17547450959682465, "learning_rate": 4.722359946332156e-05, "loss": 0.023, "step": 2710 }, { "epoch": 1.8528610354223434, "grad_norm": 0.2572614550590515, "learning_rate": 4.6893478849249654e-05, "loss": 0.0226, "step": 2720 }, { "epoch": 1.8596730245231607, "grad_norm": 0.42476364970207214, "learning_rate": 4.656349412447664e-05, "loss": 0.023, "step": 2730 }, { "epoch": 1.8664850136239783, "grad_norm": 0.37075158953666687, "learning_rate": 4.623365972360337e-05, "loss": 0.0239, "step": 2740 }, { "epoch": 1.8732970027247955, "grad_norm": 0.27569836378097534, "learning_rate": 4.590399007465503e-05, "loss": 0.0216, "step": 2750 }, { "epoch": 1.880108991825613, "grad_norm": 0.25869858264923096, "learning_rate": 4.557449959845005e-05, "loss": 0.024, "step": 2760 }, { "epoch": 1.8869209809264307, "grad_norm": 0.2198791801929474, "learning_rate": 4.524520270796927e-05, "loss": 0.0213, "step": 2770 }, { "epoch": 1.8937329700272478, "grad_norm": 0.3058468997478485, "learning_rate": 4.491611380772545e-05, "loss": 0.0218, "step": 2780 }, { "epoch": 1.9005449591280654, "grad_norm": 0.2228512316942215, "learning_rate": 4.458724729313318e-05, "loss": 0.0218, "step": 2790 }, { "epoch": 1.9073569482288828, "grad_norm": 0.2506347894668579, "learning_rate": 4.42586175498792e-05, "loss": 0.023, "step": 2800 }, { "epoch": 1.9141689373297002, "grad_norm": 0.28511497378349304, "learning_rate": 4.3930238953293094e-05, "loss": 0.0211, "step": 2810 }, { "epoch": 1.9209809264305178, "grad_norm": 0.2836903929710388, "learning_rate": 4.360212586771847e-05, "loss": 0.0174, "step": 2820 }, { "epoch": 1.9277929155313351, "grad_norm": 0.2694113254547119, "learning_rate": 4.327429264588463e-05, "loss": 0.024, "step": 2830 }, { "epoch": 1.9346049046321525, "grad_norm": 0.25238320231437683, "learning_rate": 4.2946753628278725e-05, "loss": 0.022, "step": 2840 }, { "epoch": 1.94141689373297, "grad_norm": 0.22233974933624268, "learning_rate": 4.2619523142518474e-05, "loss": 0.0218, "step": 2850 }, { "epoch": 1.9482288828337875, "grad_norm": 0.22567766904830933, "learning_rate": 4.229261550272539e-05, "loss": 0.0211, "step": 2860 }, { "epoch": 1.9550408719346049, "grad_norm": 0.21269120275974274, "learning_rate": 4.196604500889868e-05, "loss": 0.0207, "step": 2870 }, { "epoch": 1.9618528610354224, "grad_norm": 0.25701943039894104, "learning_rate": 4.163982594628969e-05, "loss": 0.0218, "step": 2880 }, { "epoch": 1.9686648501362398, "grad_norm": 0.2941311299800873, "learning_rate": 4.131397258477702e-05, "loss": 0.0222, "step": 2890 }, { "epoch": 1.9754768392370572, "grad_norm": 0.20397907495498657, "learning_rate": 4.0988499178242315e-05, "loss": 0.0205, "step": 2900 }, { "epoch": 1.9822888283378748, "grad_norm": 0.21562394499778748, "learning_rate": 4.066341996394678e-05, "loss": 0.0288, "step": 2910 }, { "epoch": 1.989100817438692, "grad_norm": 0.25813037157058716, "learning_rate": 4.033874916190833e-05, "loss": 0.0215, "step": 2920 }, { "epoch": 1.9959128065395095, "grad_norm": 0.1991417109966278, "learning_rate": 4.001450097427966e-05, "loss": 0.019, "step": 2930 }, { "epoch": 2.002724795640327, "grad_norm": 0.21835818886756897, "learning_rate": 3.9690689584726894e-05, "loss": 0.0249, "step": 2940 }, { "epoch": 2.0095367847411443, "grad_norm": 0.24195794761180878, "learning_rate": 3.936732915780923e-05, "loss": 0.0177, "step": 2950 }, { "epoch": 2.016348773841962, "grad_norm": 0.3374285101890564, "learning_rate": 3.904443383835929e-05, "loss": 0.0247, "step": 2960 }, { "epoch": 2.0231607629427795, "grad_norm": 0.2824082374572754, "learning_rate": 3.872201775086437e-05, "loss": 0.0216, "step": 2970 }, { "epoch": 2.0299727520435966, "grad_norm": 0.29006993770599365, "learning_rate": 3.8400094998848616e-05, "loss": 0.0206, "step": 2980 }, { "epoch": 2.036784741144414, "grad_norm": 0.3308681547641754, "learning_rate": 3.807867966425611e-05, "loss": 0.0178, "step": 2990 }, { "epoch": 2.043596730245232, "grad_norm": 0.24560880661010742, "learning_rate": 3.775778580683481e-05, "loss": 0.0226, "step": 3000 }, { "epoch": 2.050408719346049, "grad_norm": 0.2389586716890335, "learning_rate": 3.743742746352156e-05, "loss": 0.021, "step": 3010 }, { "epoch": 2.0572207084468666, "grad_norm": 0.35238826274871826, "learning_rate": 3.711761864782817e-05, "loss": 0.0251, "step": 3020 }, { "epoch": 2.0640326975476837, "grad_norm": 0.2502613365650177, "learning_rate": 3.679837334922825e-05, "loss": 0.0201, "step": 3030 }, { "epoch": 2.0708446866485013, "grad_norm": 0.2527748942375183, "learning_rate": 3.647970553254538e-05, "loss": 0.0211, "step": 3040 }, { "epoch": 2.077656675749319, "grad_norm": 0.3349742293357849, "learning_rate": 3.61616291373422e-05, "loss": 0.0243, "step": 3050 }, { "epoch": 2.084468664850136, "grad_norm": 0.2768033444881439, "learning_rate": 3.584415807731065e-05, "loss": 0.0229, "step": 3060 }, { "epoch": 2.0912806539509536, "grad_norm": 0.21673381328582764, "learning_rate": 3.552730623966337e-05, "loss": 0.0223, "step": 3070 }, { "epoch": 2.0980926430517712, "grad_norm": 0.20745591819286346, "learning_rate": 3.521108748452617e-05, "loss": 0.0196, "step": 3080 }, { "epoch": 2.1049046321525884, "grad_norm": 0.27668702602386475, "learning_rate": 3.489551564433186e-05, "loss": 0.024, "step": 3090 }, { "epoch": 2.111716621253406, "grad_norm": 0.2564879357814789, "learning_rate": 3.4580604523215006e-05, "loss": 0.0194, "step": 3100 }, { "epoch": 2.1185286103542236, "grad_norm": 0.21311357617378235, "learning_rate": 3.4266367896408216e-05, "loss": 0.0291, "step": 3110 }, { "epoch": 2.1253405994550407, "grad_norm": 0.21265241503715515, "learning_rate": 3.3952819509639534e-05, "loss": 0.019, "step": 3120 }, { "epoch": 2.1321525885558583, "grad_norm": 0.25450852513313293, "learning_rate": 3.3639973078531165e-05, "loss": 0.0207, "step": 3130 }, { "epoch": 2.138964577656676, "grad_norm": 0.24124109745025635, "learning_rate": 3.332784228799947e-05, "loss": 0.0195, "step": 3140 }, { "epoch": 2.145776566757493, "grad_norm": 0.3012523055076599, "learning_rate": 3.301644079165638e-05, "loss": 0.0206, "step": 3150 }, { "epoch": 2.1525885558583107, "grad_norm": 0.2553965151309967, "learning_rate": 3.27057822112122e-05, "loss": 0.0169, "step": 3160 }, { "epoch": 2.1594005449591283, "grad_norm": 0.28278952836990356, "learning_rate": 3.239588013587958e-05, "loss": 0.0222, "step": 3170 }, { "epoch": 2.1662125340599454, "grad_norm": 0.2095153033733368, "learning_rate": 3.208674812177926e-05, "loss": 0.0189, "step": 3180 }, { "epoch": 2.173024523160763, "grad_norm": 0.30485105514526367, "learning_rate": 3.177839969134698e-05, "loss": 0.0219, "step": 3190 }, { "epoch": 2.1798365122615806, "grad_norm": 0.35161760449409485, "learning_rate": 3.1470848332742e-05, "loss": 0.0217, "step": 3200 }, { "epoch": 2.1866485013623977, "grad_norm": 0.24349473416805267, "learning_rate": 3.116410749925708e-05, "loss": 0.0222, "step": 3210 }, { "epoch": 2.1934604904632153, "grad_norm": 0.15715332329273224, "learning_rate": 3.085819060872995e-05, "loss": 0.0179, "step": 3220 }, { "epoch": 2.2002724795640325, "grad_norm": 0.22666095197200775, "learning_rate": 3.055311104295648e-05, "loss": 0.0198, "step": 3230 }, { "epoch": 2.20708446866485, "grad_norm": 0.22959241271018982, "learning_rate": 3.024888214710517e-05, "loss": 0.0162, "step": 3240 }, { "epoch": 2.2138964577656677, "grad_norm": 0.22255851328372955, "learning_rate": 2.994551722913349e-05, "loss": 0.0159, "step": 3250 }, { "epoch": 2.220708446866485, "grad_norm": 0.2214617133140564, "learning_rate": 2.9643029559205727e-05, "loss": 0.0225, "step": 3260 }, { "epoch": 2.2275204359673024, "grad_norm": 0.1882133036851883, "learning_rate": 2.934143236911248e-05, "loss": 0.0179, "step": 3270 }, { "epoch": 2.23433242506812, "grad_norm": 0.4131694436073303, "learning_rate": 2.90407388516919e-05, "loss": 0.0194, "step": 3280 }, { "epoch": 2.241144414168937, "grad_norm": 0.3278559148311615, "learning_rate": 2.8740962160252495e-05, "loss": 0.02, "step": 3290 }, { "epoch": 2.2479564032697548, "grad_norm": 0.21860350668430328, "learning_rate": 2.844211540799797e-05, "loss": 0.0177, "step": 3300 }, { "epoch": 2.2547683923705724, "grad_norm": 0.2650901675224304, "learning_rate": 2.8144211667453368e-05, "loss": 0.0183, "step": 3310 }, { "epoch": 2.2615803814713895, "grad_norm": 0.2598157823085785, "learning_rate": 2.7847263969893344e-05, "loss": 0.016, "step": 3320 }, { "epoch": 2.268392370572207, "grad_norm": 0.21535956859588623, "learning_rate": 2.7551285304772206e-05, "loss": 0.0173, "step": 3330 }, { "epoch": 2.2752043596730247, "grad_norm": 0.19479890167713165, "learning_rate": 2.7256288619155567e-05, "loss": 0.0181, "step": 3340 }, { "epoch": 2.282016348773842, "grad_norm": 0.21761104464530945, "learning_rate": 2.6962286817154158e-05, "loss": 0.0208, "step": 3350 }, { "epoch": 2.2888283378746594, "grad_norm": 0.18495774269104004, "learning_rate": 2.6669292759359166e-05, "loss": 0.0173, "step": 3360 }, { "epoch": 2.2956403269754766, "grad_norm": 0.2476925402879715, "learning_rate": 2.637731926227993e-05, "loss": 0.0231, "step": 3370 }, { "epoch": 2.302452316076294, "grad_norm": 0.3167796730995178, "learning_rate": 2.6086379097783033e-05, "loss": 0.0219, "step": 3380 }, { "epoch": 2.309264305177112, "grad_norm": 0.3013063371181488, "learning_rate": 2.579648499253377e-05, "loss": 0.0183, "step": 3390 }, { "epoch": 2.316076294277929, "grad_norm": 0.2609173357486725, "learning_rate": 2.5507649627439466e-05, "loss": 0.0214, "step": 3400 }, { "epoch": 2.3228882833787465, "grad_norm": 0.1826580911874771, "learning_rate": 2.5219885637094653e-05, "loss": 0.0191, "step": 3410 }, { "epoch": 2.329700272479564, "grad_norm": 0.21605326235294342, "learning_rate": 2.4933205609228533e-05, "loss": 0.0209, "step": 3420 }, { "epoch": 2.3365122615803813, "grad_norm": 0.23476341366767883, "learning_rate": 2.464762208415419e-05, "loss": 0.018, "step": 3430 }, { "epoch": 2.343324250681199, "grad_norm": 0.1948312371969223, "learning_rate": 2.4363147554220213e-05, "loss": 0.0145, "step": 3440 }, { "epoch": 2.3501362397820165, "grad_norm": 0.20815841853618622, "learning_rate": 2.407979446326411e-05, "loss": 0.0196, "step": 3450 }, { "epoch": 2.3569482288828336, "grad_norm": 0.23515887558460236, "learning_rate": 2.379757520606799e-05, "loss": 0.0203, "step": 3460 }, { "epoch": 2.363760217983651, "grad_norm": 0.2154649794101715, "learning_rate": 2.3516502127816455e-05, "loss": 0.0175, "step": 3470 }, { "epoch": 2.370572207084469, "grad_norm": 0.23456346988677979, "learning_rate": 2.323658752355647e-05, "loss": 0.0173, "step": 3480 }, { "epoch": 2.377384196185286, "grad_norm": 0.21330733597278595, "learning_rate": 2.2957843637659654e-05, "loss": 0.0178, "step": 3490 }, { "epoch": 2.3841961852861036, "grad_norm": 0.19244815409183502, "learning_rate": 2.2680282663286552e-05, "loss": 0.0229, "step": 3500 }, { "epoch": 2.391008174386921, "grad_norm": 0.20745113492012024, "learning_rate": 2.2403916741853364e-05, "loss": 0.0173, "step": 3510 }, { "epoch": 2.3978201634877383, "grad_norm": 0.19936102628707886, "learning_rate": 2.2128757962500817e-05, "loss": 0.0172, "step": 3520 }, { "epoch": 2.404632152588556, "grad_norm": 0.2921135127544403, "learning_rate": 2.1854818361565275e-05, "loss": 0.0171, "step": 3530 }, { "epoch": 2.4114441416893735, "grad_norm": 0.2126695066690445, "learning_rate": 2.1582109922052364e-05, "loss": 0.0199, "step": 3540 }, { "epoch": 2.4182561307901906, "grad_norm": 0.161210298538208, "learning_rate": 2.1310644573112635e-05, "loss": 0.0202, "step": 3550 }, { "epoch": 2.4250681198910082, "grad_norm": 0.1921418011188507, "learning_rate": 2.1040434189519924e-05, "loss": 0.0168, "step": 3560 }, { "epoch": 2.431880108991826, "grad_norm": 0.17595872282981873, "learning_rate": 2.0771490591151733e-05, "loss": 0.0208, "step": 3570 }, { "epoch": 2.438692098092643, "grad_norm": 0.18638396263122559, "learning_rate": 2.0503825542472317e-05, "loss": 0.0214, "step": 3580 }, { "epoch": 2.4455040871934606, "grad_norm": 0.24000069499015808, "learning_rate": 2.023745075201805e-05, "loss": 0.0155, "step": 3590 }, { "epoch": 2.452316076294278, "grad_norm": 0.13929104804992676, "learning_rate": 1.9972377871885157e-05, "loss": 0.0201, "step": 3600 }, { "epoch": 2.4591280653950953, "grad_norm": 0.23332083225250244, "learning_rate": 1.970861849722017e-05, "loss": 0.0159, "step": 3610 }, { "epoch": 2.465940054495913, "grad_norm": 0.2451397329568863, "learning_rate": 1.9446184165712587e-05, "loss": 0.0172, "step": 3620 }, { "epoch": 2.47275204359673, "grad_norm": 0.1490626186132431, "learning_rate": 1.9185086357090214e-05, "loss": 0.018, "step": 3630 }, { "epoch": 2.4795640326975477, "grad_norm": 0.16023452579975128, "learning_rate": 1.8925336492617057e-05, "loss": 0.0167, "step": 3640 }, { "epoch": 2.4863760217983653, "grad_norm": 0.2159489542245865, "learning_rate": 1.8666945934593666e-05, "loss": 0.0185, "step": 3650 }, { "epoch": 2.4931880108991824, "grad_norm": 0.18671192228794098, "learning_rate": 1.8409925985860126e-05, "loss": 0.0129, "step": 3660 }, { "epoch": 2.5, "grad_norm": 0.20349836349487305, "learning_rate": 1.8154287889301603e-05, "loss": 0.0177, "step": 3670 }, { "epoch": 2.5068119891008176, "grad_norm": 0.18601705133914948, "learning_rate": 1.7900042827356612e-05, "loss": 0.0205, "step": 3680 }, { "epoch": 2.5136239782016347, "grad_norm": 0.22594991326332092, "learning_rate": 1.76472019215278e-05, "loss": 0.0205, "step": 3690 }, { "epoch": 2.5204359673024523, "grad_norm": 0.18238820135593414, "learning_rate": 1.739577623189545e-05, "loss": 0.0142, "step": 3700 }, { "epoch": 2.5272479564032695, "grad_norm": 0.1694435328245163, "learning_rate": 1.7145776756633768e-05, "loss": 0.022, "step": 3710 }, { "epoch": 2.534059945504087, "grad_norm": 0.2308904379606247, "learning_rate": 1.6897214431529646e-05, "loss": 0.0166, "step": 3720 }, { "epoch": 2.5408719346049047, "grad_norm": 0.18409192562103271, "learning_rate": 1.6650100129504475e-05, "loss": 0.0132, "step": 3730 }, { "epoch": 2.547683923705722, "grad_norm": 0.17650723457336426, "learning_rate": 1.6404444660138335e-05, "loss": 0.0197, "step": 3740 }, { "epoch": 2.5544959128065394, "grad_norm": 0.24465468525886536, "learning_rate": 1.616025876919725e-05, "loss": 0.0163, "step": 3750 }, { "epoch": 2.561307901907357, "grad_norm": 0.19395938515663147, "learning_rate": 1.5917553138163172e-05, "loss": 0.0176, "step": 3760 }, { "epoch": 2.568119891008174, "grad_norm": 0.19339482486248016, "learning_rate": 1.5676338383766632e-05, "loss": 0.0196, "step": 3770 }, { "epoch": 2.5749318801089918, "grad_norm": 0.18326933681964874, "learning_rate": 1.5436625057522447e-05, "loss": 0.0154, "step": 3780 }, { "epoch": 2.5817438692098094, "grad_norm": 0.17008966207504272, "learning_rate": 1.519842364526804e-05, "loss": 0.0137, "step": 3790 }, { "epoch": 2.5885558583106265, "grad_norm": 0.1793888807296753, "learning_rate": 1.4961744566704855e-05, "loss": 0.0165, "step": 3800 }, { "epoch": 2.595367847411444, "grad_norm": 0.1575794667005539, "learning_rate": 1.4726598174942551e-05, "loss": 0.0147, "step": 3810 }, { "epoch": 2.6021798365122617, "grad_norm": 0.24643422663211823, "learning_rate": 1.4492994756046035e-05, "loss": 0.0207, "step": 3820 }, { "epoch": 2.608991825613079, "grad_norm": 0.1690363883972168, "learning_rate": 1.4260944528585645e-05, "loss": 0.0179, "step": 3830 }, { "epoch": 2.6158038147138964, "grad_norm": 0.229860320687294, "learning_rate": 1.4030457643190048e-05, "loss": 0.0138, "step": 3840 }, { "epoch": 2.622615803814714, "grad_norm": 0.1885327398777008, "learning_rate": 1.3801544182102311e-05, "loss": 0.016, "step": 3850 }, { "epoch": 2.629427792915531, "grad_norm": 0.1853918582201004, "learning_rate": 1.3574214158738763e-05, "loss": 0.0178, "step": 3860 }, { "epoch": 2.636239782016349, "grad_norm": 0.17312754690647125, "learning_rate": 1.3348477517251101e-05, "loss": 0.0159, "step": 3870 }, { "epoch": 2.6430517711171664, "grad_norm": 0.14870062470436096, "learning_rate": 1.312434413209131e-05, "loss": 0.0179, "step": 3880 }, { "epoch": 2.6498637602179835, "grad_norm": 0.35962745547294617, "learning_rate": 1.2901823807579727e-05, "loss": 0.0148, "step": 3890 }, { "epoch": 2.656675749318801, "grad_norm": 0.14894793927669525, "learning_rate": 1.2680926277476245e-05, "loss": 0.017, "step": 3900 }, { "epoch": 2.6634877384196187, "grad_norm": 0.20324669778347015, "learning_rate": 1.2461661204554397e-05, "loss": 0.0166, "step": 3910 }, { "epoch": 2.670299727520436, "grad_norm": 0.2097160369157791, "learning_rate": 1.2244038180178835e-05, "loss": 0.0161, "step": 3920 }, { "epoch": 2.6771117166212535, "grad_norm": 0.17441681027412415, "learning_rate": 1.2028066723885612e-05, "loss": 0.0163, "step": 3930 }, { "epoch": 2.683923705722071, "grad_norm": 0.18608888983726501, "learning_rate": 1.1813756282965888e-05, "loss": 0.0176, "step": 3940 }, { "epoch": 2.690735694822888, "grad_norm": 0.18648923933506012, "learning_rate": 1.1601116232052638e-05, "loss": 0.0168, "step": 3950 }, { "epoch": 2.697547683923706, "grad_norm": 0.15261727571487427, "learning_rate": 1.1390155872710517e-05, "loss": 0.0149, "step": 3960 }, { "epoch": 2.7043596730245234, "grad_norm": 0.2162063866853714, "learning_rate": 1.1180884433029087e-05, "loss": 0.0168, "step": 3970 }, { "epoch": 2.7111716621253406, "grad_norm": 0.24533921480178833, "learning_rate": 1.097331106721904e-05, "loss": 0.0147, "step": 3980 }, { "epoch": 2.717983651226158, "grad_norm": 0.20895080268383026, "learning_rate": 1.0767444855211862e-05, "loss": 0.015, "step": 3990 }, { "epoch": 2.7247956403269757, "grad_norm": 0.2006479650735855, "learning_rate": 1.0563294802262558e-05, "loss": 0.0173, "step": 4000 }, { "epoch": 2.731607629427793, "grad_norm": 0.16398422420024872, "learning_rate": 1.0360869838555809e-05, "loss": 0.0169, "step": 4010 }, { "epoch": 2.7384196185286105, "grad_norm": 0.22024202346801758, "learning_rate": 1.0160178818815313e-05, "loss": 0.015, "step": 4020 }, { "epoch": 2.7452316076294276, "grad_norm": 0.1872708946466446, "learning_rate": 9.961230521916387e-06, "loss": 0.0168, "step": 4030 }, { "epoch": 2.7520435967302452, "grad_norm": 0.2346954643726349, "learning_rate": 9.764033650502074e-06, "loss": 0.0176, "step": 4040 }, { "epoch": 2.758855585831063, "grad_norm": 0.15068836510181427, "learning_rate": 9.568596830602344e-06, "loss": 0.0137, "step": 4050 }, { "epoch": 2.76566757493188, "grad_norm": 0.20182640850543976, "learning_rate": 9.37492861125681e-06, "loss": 0.0181, "step": 4060 }, { "epoch": 2.7724795640326976, "grad_norm": 0.1375190019607544, "learning_rate": 9.183037464140804e-06, "loss": 0.0158, "step": 4070 }, { "epoch": 2.7792915531335147, "grad_norm": 0.25182825326919556, "learning_rate": 8.992931783194735e-06, "loss": 0.0134, "step": 4080 }, { "epoch": 2.7861035422343323, "grad_norm": 0.18647728860378265, "learning_rate": 8.80461988425696e-06, "loss": 0.0136, "step": 4090 }, { "epoch": 2.79291553133515, "grad_norm": 0.16191458702087402, "learning_rate": 8.618110004699974e-06, "loss": 0.0164, "step": 4100 }, { "epoch": 2.799727520435967, "grad_norm": 0.18361864984035492, "learning_rate": 8.4334103030701e-06, "loss": 0.0155, "step": 4110 }, { "epoch": 2.8065395095367847, "grad_norm": 0.21431824564933777, "learning_rate": 8.25052885873066e-06, "loss": 0.0154, "step": 4120 }, { "epoch": 2.8133514986376023, "grad_norm": 0.18994954228401184, "learning_rate": 8.06947367150846e-06, "loss": 0.016, "step": 4130 }, { "epoch": 2.8201634877384194, "grad_norm": 0.21481618285179138, "learning_rate": 7.890252661343938e-06, "loss": 0.0166, "step": 4140 }, { "epoch": 2.826975476839237, "grad_norm": 0.11670587211847305, "learning_rate": 7.712873667944681e-06, "loss": 0.0142, "step": 4150 }, { "epoch": 2.8337874659400546, "grad_norm": 0.19601042568683624, "learning_rate": 7.537344450442469e-06, "loss": 0.0104, "step": 4160 }, { "epoch": 2.8405994550408717, "grad_norm": 0.15036450326442719, "learning_rate": 7.36367268705393e-06, "loss": 0.0174, "step": 4170 }, { "epoch": 2.8474114441416893, "grad_norm": 0.23941321671009064, "learning_rate": 7.1918659747446e-06, "loss": 0.0191, "step": 4180 }, { "epoch": 2.854223433242507, "grad_norm": 0.1950898915529251, "learning_rate": 7.021931828896666e-06, "loss": 0.018, "step": 4190 }, { "epoch": 2.861035422343324, "grad_norm": 0.23307918012142181, "learning_rate": 6.8538776829801584e-06, "loss": 0.0127, "step": 4200 }, { "epoch": 2.8678474114441417, "grad_norm": 0.23717965185642242, "learning_rate": 6.687710888227849e-06, "loss": 0.0125, "step": 4210 }, { "epoch": 2.8746594005449593, "grad_norm": 0.18568864464759827, "learning_rate": 6.5234387133136565e-06, "loss": 0.0132, "step": 4220 }, { "epoch": 2.8814713896457764, "grad_norm": 0.18601331114768982, "learning_rate": 6.361068344034665e-06, "loss": 0.0156, "step": 4230 }, { "epoch": 2.888283378746594, "grad_norm": 0.1895252913236618, "learning_rate": 6.200606882996846e-06, "loss": 0.0144, "step": 4240 }, { "epoch": 2.8950953678474116, "grad_norm": 0.13856425881385803, "learning_rate": 6.042061349304312e-06, "loss": 0.0164, "step": 4250 }, { "epoch": 2.9019073569482288, "grad_norm": 0.14244164526462555, "learning_rate": 5.885438678252342e-06, "loss": 0.0178, "step": 4260 }, { "epoch": 2.9087193460490464, "grad_norm": 0.10831771790981293, "learning_rate": 5.730745721023939e-06, "loss": 0.0135, "step": 4270 }, { "epoch": 2.915531335149864, "grad_norm": 0.2154112458229065, "learning_rate": 5.577989244390192e-06, "loss": 0.014, "step": 4280 }, { "epoch": 2.922343324250681, "grad_norm": 0.1787579506635666, "learning_rate": 5.4271759304142635e-06, "loss": 0.0122, "step": 4290 }, { "epoch": 2.9291553133514987, "grad_norm": 0.20089909434318542, "learning_rate": 5.278312376159051e-06, "loss": 0.0147, "step": 4300 }, { "epoch": 2.9359673024523163, "grad_norm": 0.20588675141334534, "learning_rate": 5.1314050933986944e-06, "loss": 0.0134, "step": 4310 }, { "epoch": 2.9427792915531334, "grad_norm": 0.15475359559059143, "learning_rate": 4.986460508333634e-06, "loss": 0.0171, "step": 4320 }, { "epoch": 2.949591280653951, "grad_norm": 0.12290208041667938, "learning_rate": 4.843484961309597e-06, "loss": 0.0108, "step": 4330 }, { "epoch": 2.9564032697547686, "grad_norm": 0.23685646057128906, "learning_rate": 4.702484706540161e-06, "loss": 0.015, "step": 4340 }, { "epoch": 2.963215258855586, "grad_norm": 0.17012353241443634, "learning_rate": 4.563465911833259e-06, "loss": 0.0144, "step": 4350 }, { "epoch": 2.9700272479564034, "grad_norm": 0.15839093923568726, "learning_rate": 4.426434658321344e-06, "loss": 0.0118, "step": 4360 }, { "epoch": 2.976839237057221, "grad_norm": 0.14717762172222137, "learning_rate": 4.2913969401953466e-06, "loss": 0.0135, "step": 4370 }, { "epoch": 2.983651226158038, "grad_norm": 0.16831554472446442, "learning_rate": 4.15835866444253e-06, "loss": 0.013, "step": 4380 }, { "epoch": 2.9904632152588557, "grad_norm": 0.13316653668880463, "learning_rate": 4.027325650588043e-06, "loss": 0.0167, "step": 4390 }, { "epoch": 2.997275204359673, "grad_norm": 0.2631996273994446, "learning_rate": 3.898303630440419e-06, "loss": 0.0178, "step": 4400 }, { "epoch": 3.0040871934604905, "grad_norm": 0.16159358620643616, "learning_rate": 3.7712982478407877e-06, "loss": 0.0169, "step": 4410 }, { "epoch": 3.010899182561308, "grad_norm": 0.16235774755477905, "learning_rate": 3.6463150584160053e-06, "loss": 0.0171, "step": 4420 }, { "epoch": 3.017711171662125, "grad_norm": 0.11211927980184555, "learning_rate": 3.5233595293356957e-06, "loss": 0.0117, "step": 4430 }, { "epoch": 3.024523160762943, "grad_norm": 0.18224704265594482, "learning_rate": 3.4024370390730033e-06, "loss": 0.017, "step": 4440 }, { "epoch": 3.0313351498637604, "grad_norm": 0.18648995459079742, "learning_rate": 3.2835528771693992e-06, "loss": 0.0144, "step": 4450 }, { "epoch": 3.0381471389645776, "grad_norm": 0.1381874680519104, "learning_rate": 3.1667122440032505e-06, "loss": 0.0131, "step": 4460 }, { "epoch": 3.044959128065395, "grad_norm": 0.13673441112041473, "learning_rate": 3.051920250562351e-06, "loss": 0.0126, "step": 4470 }, { "epoch": 3.0517711171662127, "grad_norm": 0.17434169352054596, "learning_rate": 2.939181918220385e-06, "loss": 0.0136, "step": 4480 }, { "epoch": 3.05858310626703, "grad_norm": 0.17766402661800385, "learning_rate": 2.8285021785172226e-06, "loss": 0.0137, "step": 4490 }, { "epoch": 3.0653950953678475, "grad_norm": 0.22053247690200806, "learning_rate": 2.7198858729432288e-06, "loss": 0.0145, "step": 4500 }, { "epoch": 3.0722070844686646, "grad_norm": 0.19397376477718353, "learning_rate": 2.6133377527274905e-06, "loss": 0.0149, "step": 4510 }, { "epoch": 3.0790190735694822, "grad_norm": 0.10889869183301926, "learning_rate": 2.5088624786299366e-06, "loss": 0.0123, "step": 4520 }, { "epoch": 3.0858310626703, "grad_norm": 0.1636773943901062, "learning_rate": 2.406464620737531e-06, "loss": 0.0127, "step": 4530 }, { "epoch": 3.092643051771117, "grad_norm": 0.13466012477874756, "learning_rate": 2.3061486582642734e-06, "loss": 0.0135, "step": 4540 }, { "epoch": 3.0994550408719346, "grad_norm": 0.13705144822597504, "learning_rate": 2.2079189793553667e-06, "loss": 0.0126, "step": 4550 }, { "epoch": 3.106267029972752, "grad_norm": 0.204204723238945, "learning_rate": 2.111779880895165e-06, "loss": 0.011, "step": 4560 }, { "epoch": 3.1130790190735693, "grad_norm": 0.17932602763175964, "learning_rate": 2.01773556831929e-06, "loss": 0.0118, "step": 4570 }, { "epoch": 3.119891008174387, "grad_norm": 0.18473263084888458, "learning_rate": 1.9257901554306513e-06, "loss": 0.0118, "step": 4580 }, { "epoch": 3.1267029972752045, "grad_norm": 0.1656373143196106, "learning_rate": 1.835947664219445e-06, "loss": 0.0135, "step": 4590 }, { "epoch": 3.1335149863760217, "grad_norm": 0.18078264594078064, "learning_rate": 1.748212024687307e-06, "loss": 0.0118, "step": 4600 }, { "epoch": 3.1403269754768393, "grad_norm": 0.23627698421478271, "learning_rate": 1.6625870746753147e-06, "loss": 0.0151, "step": 4610 }, { "epoch": 3.147138964577657, "grad_norm": 0.15048933029174805, "learning_rate": 1.5790765596961853e-06, "loss": 0.015, "step": 4620 }, { "epoch": 3.153950953678474, "grad_norm": 0.2178574502468109, "learning_rate": 1.4976841327703717e-06, "loss": 0.0135, "step": 4630 }, { "epoch": 3.1607629427792916, "grad_norm": 0.1818486899137497, "learning_rate": 1.4184133542663014e-06, "loss": 0.0122, "step": 4640 }, { "epoch": 3.167574931880109, "grad_norm": 0.1654607504606247, "learning_rate": 1.341267691744641e-06, "loss": 0.0128, "step": 4650 }, { "epoch": 3.1743869209809263, "grad_norm": 0.207754448056221, "learning_rate": 1.2662505198065666e-06, "loss": 0.0224, "step": 4660 }, { "epoch": 3.181198910081744, "grad_norm": 0.16341248154640198, "learning_rate": 1.193365119946216e-06, "loss": 0.0153, "step": 4670 }, { "epoch": 3.1880108991825615, "grad_norm": 0.1576090306043625, "learning_rate": 1.1226146804070859e-06, "loss": 0.0102, "step": 4680 }, { "epoch": 3.1948228882833787, "grad_norm": 0.1799905151128769, "learning_rate": 1.0540022960426111e-06, "loss": 0.0134, "step": 4690 }, { "epoch": 3.2016348773841963, "grad_norm": 0.23539473116397858, "learning_rate": 9.875309681807443e-07, "loss": 0.0171, "step": 4700 }, { "epoch": 3.2084468664850134, "grad_norm": 0.1891935020685196, "learning_rate": 9.232036044927061e-07, "loss": 0.0136, "step": 4710 }, { "epoch": 3.215258855585831, "grad_norm": 0.1502537727355957, "learning_rate": 8.610230188657919e-07, "loss": 0.0135, "step": 4720 }, { "epoch": 3.2220708446866486, "grad_norm": 0.14308865368366241, "learning_rate": 8.009919312802372e-07, "loss": 0.0125, "step": 4730 }, { "epoch": 3.2288828337874658, "grad_norm": 0.17500245571136475, "learning_rate": 7.431129676902904e-07, "loss": 0.01, "step": 4740 }, { "epoch": 3.2356948228882834, "grad_norm": 0.12005341053009033, "learning_rate": 6.873886599093215e-07, "loss": 0.013, "step": 4750 }, { "epoch": 3.242506811989101, "grad_norm": 0.22890767455101013, "learning_rate": 6.338214454990776e-07, "loss": 0.0165, "step": 4760 }, { "epoch": 3.249318801089918, "grad_norm": 0.12232371419668198, "learning_rate": 5.82413667663051e-07, "loss": 0.0153, "step": 4770 }, { "epoch": 3.2561307901907357, "grad_norm": 0.16289682686328888, "learning_rate": 5.331675751439725e-07, "loss": 0.0144, "step": 4780 }, { "epoch": 3.2629427792915533, "grad_norm": 0.19280773401260376, "learning_rate": 4.86085322125479e-07, "loss": 0.012, "step": 4790 }, { "epoch": 3.2697547683923704, "grad_norm": 0.12008260190486908, "learning_rate": 4.411689681378284e-07, "loss": 0.0148, "step": 4800 }, { "epoch": 3.276566757493188, "grad_norm": 0.16363725066184998, "learning_rate": 3.9842047796786466e-07, "loss": 0.0125, "step": 4810 }, { "epoch": 3.2833787465940056, "grad_norm": 0.16861975193023682, "learning_rate": 3.578417215730323e-07, "loss": 0.0114, "step": 4820 }, { "epoch": 3.290190735694823, "grad_norm": 0.17795579135417938, "learning_rate": 3.1943447399958027e-07, "loss": 0.0136, "step": 4830 }, { "epoch": 3.2970027247956404, "grad_norm": 0.07885803282260895, "learning_rate": 2.8320041530495724e-07, "loss": 0.0203, "step": 4840 }, { "epoch": 3.3038147138964575, "grad_norm": 0.12531672418117523, "learning_rate": 2.491411304842539e-07, "loss": 0.0129, "step": 4850 }, { "epoch": 3.310626702997275, "grad_norm": 0.17444051802158356, "learning_rate": 2.1725810940094183e-07, "loss": 0.012, "step": 4860 }, { "epoch": 3.3174386920980927, "grad_norm": 0.14167378842830658, "learning_rate": 1.8755274672164202e-07, "loss": 0.0129, "step": 4870 }, { "epoch": 3.32425068119891, "grad_norm": 0.12788553535938263, "learning_rate": 1.600263418551573e-07, "loss": 0.0148, "step": 4880 }, { "epoch": 3.3310626702997275, "grad_norm": 0.2057434469461441, "learning_rate": 1.346800988955954e-07, "loss": 0.0154, "step": 4890 }, { "epoch": 3.337874659400545, "grad_norm": 0.17330636084079742, "learning_rate": 1.1151512656975005e-07, "loss": 0.0116, "step": 4900 }, { "epoch": 3.344686648501362, "grad_norm": 0.09420597553253174, "learning_rate": 9.053243818853973e-08, "loss": 0.0124, "step": 4910 }, { "epoch": 3.35149863760218, "grad_norm": 0.15236696600914001, "learning_rate": 7.173295160273763e-08, "loss": 0.0124, "step": 4920 }, { "epoch": 3.3583106267029974, "grad_norm": 0.15374703705310822, "learning_rate": 5.511748916279258e-08, "loss": 0.0132, "step": 4930 }, { "epoch": 3.3651226158038146, "grad_norm": 0.273964524269104, "learning_rate": 4.068677768285234e-08, "loss": 0.0115, "step": 4940 }, { "epoch": 3.371934604904632, "grad_norm": 0.11771193891763687, "learning_rate": 2.844144840898344e-08, "loss": 0.015, "step": 4950 }, { "epoch": 3.3787465940054497, "grad_norm": 0.12935270369052887, "learning_rate": 1.8382036991559936e-08, "loss": 0.0112, "step": 4960 }, { "epoch": 3.385558583106267, "grad_norm": 0.1538880318403244, "learning_rate": 1.0508983461832156e-08, "loss": 0.0166, "step": 4970 }, { "epoch": 3.3923705722070845, "grad_norm": 0.10488380491733551, "learning_rate": 4.822632212653222e-09, "loss": 0.018, "step": 4980 }, { "epoch": 3.399182561307902, "grad_norm": 0.18381240963935852, "learning_rate": 1.3232319834632912e-09, "loss": 0.0135, "step": 4990 }, { "epoch": 3.4059945504087192, "grad_norm": 0.14481449127197266, "learning_rate": 1.0935849353854721e-11, "loss": 0.0137, "step": 5000 }, { "epoch": 3.412806539509537, "grad_norm": 0.44966641068458557, "learning_rate": 5.398064519110622e-05, "loss": 0.0203, "step": 5010 }, { "epoch": 3.4196185286103544, "grad_norm": 0.3312857747077942, "learning_rate": 5.3815801579167394e-05, "loss": 0.0175, "step": 5020 }, { "epoch": 3.4264305177111716, "grad_norm": 0.5842679738998413, "learning_rate": 5.365091623823382e-05, "loss": 0.0213, "step": 5030 }, { "epoch": 3.433242506811989, "grad_norm": 0.5707949995994568, "learning_rate": 5.348599097146521e-05, "loss": 0.0228, "step": 5040 }, { "epoch": 3.4400544959128068, "grad_norm": 0.2389402836561203, "learning_rate": 5.3321027582457836e-05, "loss": 0.0237, "step": 5050 }, { "epoch": 3.446866485013624, "grad_norm": 0.3142755329608917, "learning_rate": 5.315602787522491e-05, "loss": 0.0248, "step": 5060 }, { "epoch": 3.4536784741144415, "grad_norm": 0.35478901863098145, "learning_rate": 5.299099365417678e-05, "loss": 0.0201, "step": 5070 }, { "epoch": 3.460490463215259, "grad_norm": 0.4287269115447998, "learning_rate": 5.2825926724101236e-05, "loss": 0.026, "step": 5080 }, { "epoch": 3.4673024523160763, "grad_norm": 0.5050956606864929, "learning_rate": 5.26608288901438e-05, "loss": 0.0295, "step": 5090 }, { "epoch": 3.474114441416894, "grad_norm": 0.36942875385284424, "learning_rate": 5.24957019577879e-05, "loss": 0.0259, "step": 5100 }, { "epoch": 3.480926430517711, "grad_norm": 0.40414538979530334, "learning_rate": 5.2330547732835266e-05, "loss": 0.0242, "step": 5110 }, { "epoch": 3.4877384196185286, "grad_norm": 0.35221511125564575, "learning_rate": 5.2165368021385996e-05, "loss": 0.0304, "step": 5120 }, { "epoch": 3.494550408719346, "grad_norm": 0.4094237685203552, "learning_rate": 5.200016462981897e-05, "loss": 0.0249, "step": 5130 }, { "epoch": 3.5013623978201633, "grad_norm": 0.24707941710948944, "learning_rate": 5.1834939364772015e-05, "loss": 0.0219, "step": 5140 }, { "epoch": 3.508174386920981, "grad_norm": 0.38713163137435913, "learning_rate": 5.166969403312214e-05, "loss": 0.0288, "step": 5150 }, { "epoch": 3.5149863760217985, "grad_norm": 0.3290533721446991, "learning_rate": 5.1504430441965844e-05, "loss": 0.0262, "step": 5160 }, { "epoch": 3.5217983651226157, "grad_norm": 0.3959462642669678, "learning_rate": 5.133915039859923e-05, "loss": 0.02, "step": 5170 }, { "epoch": 3.5286103542234333, "grad_norm": 0.3446705937385559, "learning_rate": 5.1173855710498444e-05, "loss": 0.023, "step": 5180 }, { "epoch": 3.5354223433242504, "grad_norm": 0.27180591225624084, "learning_rate": 5.100854818529967e-05, "loss": 0.0283, "step": 5190 }, { "epoch": 3.542234332425068, "grad_norm": 0.39243829250335693, "learning_rate": 5.084322963077951e-05, "loss": 0.029, "step": 5200 }, { "epoch": 3.5490463215258856, "grad_norm": 0.2588927149772644, "learning_rate": 5.067790185483522e-05, "loss": 0.0282, "step": 5210 }, { "epoch": 3.5558583106267028, "grad_norm": 0.18376407027244568, "learning_rate": 5.0512566665464844e-05, "loss": 0.0272, "step": 5220 }, { "epoch": 3.5626702997275204, "grad_norm": 0.29992175102233887, "learning_rate": 5.034722587074755e-05, "loss": 0.0257, "step": 5230 }, { "epoch": 3.569482288828338, "grad_norm": 0.243015319108963, "learning_rate": 5.018188127882375e-05, "loss": 0.0229, "step": 5240 }, { "epoch": 3.576294277929155, "grad_norm": 0.32886067032814026, "learning_rate": 5.0016534697875417e-05, "loss": 0.0211, "step": 5250 }, { "epoch": 3.5831062670299727, "grad_norm": 0.4220637083053589, "learning_rate": 4.9851187936106294e-05, "loss": 0.0246, "step": 5260 }, { "epoch": 3.5899182561307903, "grad_norm": 0.2974489629268646, "learning_rate": 4.968584280172206e-05, "loss": 0.0233, "step": 5270 }, { "epoch": 3.5967302452316074, "grad_norm": 0.502668023109436, "learning_rate": 4.95205011029106e-05, "loss": 0.0285, "step": 5280 }, { "epoch": 3.603542234332425, "grad_norm": 0.3639957904815674, "learning_rate": 4.935516464782227e-05, "loss": 0.0268, "step": 5290 }, { "epoch": 3.6103542234332426, "grad_norm": 0.36707913875579834, "learning_rate": 4.918983524455003e-05, "loss": 0.0246, "step": 5300 }, { "epoch": 3.61716621253406, "grad_norm": 0.22181017696857452, "learning_rate": 4.9024514701109766e-05, "loss": 0.0252, "step": 5310 }, { "epoch": 3.6239782016348774, "grad_norm": 0.42766740918159485, "learning_rate": 4.885920482542043e-05, "loss": 0.0225, "step": 5320 }, { "epoch": 3.630790190735695, "grad_norm": 0.26574602723121643, "learning_rate": 4.869390742528438e-05, "loss": 0.0208, "step": 5330 }, { "epoch": 3.637602179836512, "grad_norm": 0.18494778871536255, "learning_rate": 4.852862430836744e-05, "loss": 0.0248, "step": 5340 }, { "epoch": 3.6444141689373297, "grad_norm": 0.3686949610710144, "learning_rate": 4.836335728217933e-05, "loss": 0.0226, "step": 5350 }, { "epoch": 3.6512261580381473, "grad_norm": 0.29411113262176514, "learning_rate": 4.819810815405379e-05, "loss": 0.0255, "step": 5360 }, { "epoch": 3.6580381471389645, "grad_norm": 0.2379477620124817, "learning_rate": 4.803287873112877e-05, "loss": 0.0229, "step": 5370 }, { "epoch": 3.664850136239782, "grad_norm": 0.3780541718006134, "learning_rate": 4.786767082032681e-05, "loss": 0.0234, "step": 5380 }, { "epoch": 3.6716621253405997, "grad_norm": 0.24052190780639648, "learning_rate": 4.77024862283351e-05, "loss": 0.0229, "step": 5390 }, { "epoch": 3.678474114441417, "grad_norm": 0.2713554799556732, "learning_rate": 4.753732676158593e-05, "loss": 0.0242, "step": 5400 }, { "epoch": 3.6852861035422344, "grad_norm": 0.3661803603172302, "learning_rate": 4.737219422623672e-05, "loss": 0.0239, "step": 5410 }, { "epoch": 3.692098092643052, "grad_norm": 0.4185531735420227, "learning_rate": 4.720709042815044e-05, "loss": 0.0204, "step": 5420 }, { "epoch": 3.698910081743869, "grad_norm": 0.2620242238044739, "learning_rate": 4.704201717287578e-05, "loss": 0.0211, "step": 5430 }, { "epoch": 3.7057220708446867, "grad_norm": 0.26090627908706665, "learning_rate": 4.6876976265627404e-05, "loss": 0.0224, "step": 5440 }, { "epoch": 3.7125340599455043, "grad_norm": 0.2731458842754364, "learning_rate": 4.671196951126626e-05, "loss": 0.0269, "step": 5450 }, { "epoch": 3.7193460490463215, "grad_norm": 0.31026485562324524, "learning_rate": 4.654699871427971e-05, "loss": 0.0218, "step": 5460 }, { "epoch": 3.726158038147139, "grad_norm": 0.233415424823761, "learning_rate": 4.6382065678762034e-05, "loss": 0.0204, "step": 5470 }, { "epoch": 3.7329700272479567, "grad_norm": 0.3344708979129791, "learning_rate": 4.6217172208394424e-05, "loss": 0.0197, "step": 5480 }, { "epoch": 3.739782016348774, "grad_norm": 0.25975632667541504, "learning_rate": 4.605232010642549e-05, "loss": 0.0194, "step": 5490 }, { "epoch": 3.7465940054495914, "grad_norm": 0.2950715720653534, "learning_rate": 4.588751117565142e-05, "loss": 0.0193, "step": 5500 }, { "epoch": 3.7534059945504086, "grad_norm": 0.2784842252731323, "learning_rate": 4.5722747218396214e-05, "loss": 0.0251, "step": 5510 }, { "epoch": 3.760217983651226, "grad_norm": 0.2176719456911087, "learning_rate": 4.5558030036492194e-05, "loss": 0.0204, "step": 5520 }, { "epoch": 3.7670299727520433, "grad_norm": 0.28440573811531067, "learning_rate": 4.539336143125999e-05, "loss": 0.0265, "step": 5530 }, { "epoch": 3.773841961852861, "grad_norm": 0.25604936480522156, "learning_rate": 4.522874320348916e-05, "loss": 0.0225, "step": 5540 }, { "epoch": 3.7806539509536785, "grad_norm": 0.2565711438655853, "learning_rate": 4.506417715341821e-05, "loss": 0.019, "step": 5550 }, { "epoch": 3.7874659400544957, "grad_norm": 0.2216968685388565, "learning_rate": 4.489966508071511e-05, "loss": 0.022, "step": 5560 }, { "epoch": 3.7942779291553133, "grad_norm": 0.22490093111991882, "learning_rate": 4.4735208784457575e-05, "loss": 0.0197, "step": 5570 }, { "epoch": 3.801089918256131, "grad_norm": 0.3565233647823334, "learning_rate": 4.457081006311325e-05, "loss": 0.0242, "step": 5580 }, { "epoch": 3.807901907356948, "grad_norm": 0.270898699760437, "learning_rate": 4.440647071452027e-05, "loss": 0.0226, "step": 5590 }, { "epoch": 3.8147138964577656, "grad_norm": 0.26380616426467896, "learning_rate": 4.424219253586737e-05, "loss": 0.0221, "step": 5600 }, { "epoch": 3.821525885558583, "grad_norm": 0.3055083751678467, "learning_rate": 4.407797732367443e-05, "loss": 0.0251, "step": 5610 }, { "epoch": 3.8283378746594003, "grad_norm": 0.2543126046657562, "learning_rate": 4.391382687377268e-05, "loss": 0.0248, "step": 5620 }, { "epoch": 3.835149863760218, "grad_norm": 0.43203112483024597, "learning_rate": 4.374974298128512e-05, "loss": 0.0202, "step": 5630 }, { "epoch": 3.8419618528610355, "grad_norm": 0.20501923561096191, "learning_rate": 4.358572744060699e-05, "loss": 0.0243, "step": 5640 }, { "epoch": 3.8487738419618527, "grad_norm": 0.2543809413909912, "learning_rate": 4.342178204538588e-05, "loss": 0.0202, "step": 5650 }, { "epoch": 3.8555858310626703, "grad_norm": 0.37627357244491577, "learning_rate": 4.325790858850241e-05, "loss": 0.0208, "step": 5660 }, { "epoch": 3.862397820163488, "grad_norm": 0.19202812016010284, "learning_rate": 4.309410886205043e-05, "loss": 0.0228, "step": 5670 }, { "epoch": 3.869209809264305, "grad_norm": 0.19026115536689758, "learning_rate": 4.293038465731752e-05, "loss": 0.0222, "step": 5680 }, { "epoch": 3.8760217983651226, "grad_norm": 0.27928781509399414, "learning_rate": 4.276673776476533e-05, "loss": 0.0222, "step": 5690 }, { "epoch": 3.88283378746594, "grad_norm": 0.30648189783096313, "learning_rate": 4.260316997401007e-05, "loss": 0.0186, "step": 5700 }, { "epoch": 3.8896457765667574, "grad_norm": 0.2663455903530121, "learning_rate": 4.243968307380293e-05, "loss": 0.0237, "step": 5710 }, { "epoch": 3.896457765667575, "grad_norm": 0.22592630982398987, "learning_rate": 4.22762788520104e-05, "loss": 0.0234, "step": 5720 }, { "epoch": 3.9032697547683926, "grad_norm": 0.21950973570346832, "learning_rate": 4.211295909559491e-05, "loss": 0.0265, "step": 5730 }, { "epoch": 3.9100817438692097, "grad_norm": 0.21050743758678436, "learning_rate": 4.194972559059511e-05, "loss": 0.0197, "step": 5740 }, { "epoch": 3.9168937329700273, "grad_norm": 0.22975432872772217, "learning_rate": 4.178658012210651e-05, "loss": 0.0228, "step": 5750 }, { "epoch": 3.923705722070845, "grad_norm": 0.349044531583786, "learning_rate": 4.162352447426177e-05, "loss": 0.0207, "step": 5760 }, { "epoch": 3.930517711171662, "grad_norm": 0.22395232319831848, "learning_rate": 4.146056043021135e-05, "loss": 0.0203, "step": 5770 }, { "epoch": 3.9373297002724796, "grad_norm": 0.24076318740844727, "learning_rate": 4.1297689772103944e-05, "loss": 0.0218, "step": 5780 }, { "epoch": 3.9441416893732972, "grad_norm": 0.311708003282547, "learning_rate": 4.113491428106694e-05, "loss": 0.0247, "step": 5790 }, { "epoch": 3.9509536784741144, "grad_norm": 0.265595406293869, "learning_rate": 4.0972235737187055e-05, "loss": 0.0181, "step": 5800 }, { "epoch": 3.957765667574932, "grad_norm": 0.3528865873813629, "learning_rate": 4.080965591949076e-05, "loss": 0.0194, "step": 5810 }, { "epoch": 3.9645776566757496, "grad_norm": 0.3113243877887726, "learning_rate": 4.0647176605924924e-05, "loss": 0.0225, "step": 5820 }, { "epoch": 3.9713896457765667, "grad_norm": 0.3198659420013428, "learning_rate": 4.0484799573337255e-05, "loss": 0.0256, "step": 5830 }, { "epoch": 3.9782016348773843, "grad_norm": 0.22167012095451355, "learning_rate": 4.032252659745699e-05, "loss": 0.0226, "step": 5840 }, { "epoch": 3.9850136239782015, "grad_norm": 0.22256286442279816, "learning_rate": 4.016035945287539e-05, "loss": 0.0278, "step": 5850 }, { "epoch": 3.991825613079019, "grad_norm": 0.2504684329032898, "learning_rate": 3.999829991302635e-05, "loss": 0.0245, "step": 5860 }, { "epoch": 3.9986376021798367, "grad_norm": 0.2460675686597824, "learning_rate": 3.983634975016707e-05, "loss": 0.0214, "step": 5870 }, { "epoch": 4.005449591280654, "grad_norm": 0.28262001276016235, "learning_rate": 3.967451073535854e-05, "loss": 0.0256, "step": 5880 }, { "epoch": 4.012261580381471, "grad_norm": 0.3468887507915497, "learning_rate": 3.951278463844633e-05, "loss": 0.0251, "step": 5890 }, { "epoch": 4.0190735694822886, "grad_norm": 0.3931543231010437, "learning_rate": 3.935117322804111e-05, "loss": 0.0234, "step": 5900 }, { "epoch": 4.025885558583107, "grad_norm": 0.35787367820739746, "learning_rate": 3.918967827149938e-05, "loss": 0.0175, "step": 5910 }, { "epoch": 4.032697547683924, "grad_norm": 0.24113652110099792, "learning_rate": 3.9028301534904094e-05, "loss": 0.0222, "step": 5920 }, { "epoch": 4.039509536784741, "grad_norm": 0.265298068523407, "learning_rate": 3.88670447830454e-05, "loss": 0.0218, "step": 5930 }, { "epoch": 4.046321525885559, "grad_norm": 0.3670673072338104, "learning_rate": 3.870590977940132e-05, "loss": 0.0195, "step": 5940 }, { "epoch": 4.053133514986376, "grad_norm": 0.20872969925403595, "learning_rate": 3.8544898286118404e-05, "loss": 0.0181, "step": 5950 }, { "epoch": 4.059945504087193, "grad_norm": 0.2174217849969864, "learning_rate": 3.838401206399257e-05, "loss": 0.0189, "step": 5960 }, { "epoch": 4.066757493188011, "grad_norm": 0.25039082765579224, "learning_rate": 3.822325287244975e-05, "loss": 0.0203, "step": 5970 }, { "epoch": 4.073569482288828, "grad_norm": 0.2662447690963745, "learning_rate": 3.8062622469526725e-05, "loss": 0.0195, "step": 5980 }, { "epoch": 4.080381471389646, "grad_norm": 0.2717086374759674, "learning_rate": 3.790212261185183e-05, "loss": 0.0186, "step": 5990 }, { "epoch": 4.087193460490464, "grad_norm": 0.2525738477706909, "learning_rate": 3.7741755054625794e-05, "loss": 0.0229, "step": 6000 }, { "epoch": 4.094005449591281, "grad_norm": 0.20453284680843353, "learning_rate": 3.758152155160255e-05, "loss": 0.0178, "step": 6010 }, { "epoch": 4.100817438692098, "grad_norm": 0.28254011273384094, "learning_rate": 3.742142385506999e-05, "loss": 0.0171, "step": 6020 }, { "epoch": 4.107629427792916, "grad_norm": 0.4284875690937042, "learning_rate": 3.72614637158309e-05, "loss": 0.0196, "step": 6030 }, { "epoch": 4.114441416893733, "grad_norm": 0.2086813747882843, "learning_rate": 3.710164288318371e-05, "loss": 0.0194, "step": 6040 }, { "epoch": 4.12125340599455, "grad_norm": 0.20289340615272522, "learning_rate": 3.694196310490345e-05, "loss": 0.0152, "step": 6050 }, { "epoch": 4.128065395095367, "grad_norm": 0.34958550333976746, "learning_rate": 3.678242612722259e-05, "loss": 0.0209, "step": 6060 }, { "epoch": 4.1348773841961854, "grad_norm": 0.2462022453546524, "learning_rate": 3.6623033694811953e-05, "loss": 0.0186, "step": 6070 }, { "epoch": 4.141689373297003, "grad_norm": 0.15042909979820251, "learning_rate": 3.6463787550761665e-05, "loss": 0.0198, "step": 6080 }, { "epoch": 4.14850136239782, "grad_norm": 0.2676561176776886, "learning_rate": 3.630468943656202e-05, "loss": 0.0181, "step": 6090 }, { "epoch": 4.155313351498638, "grad_norm": 0.2557305097579956, "learning_rate": 3.6145741092084523e-05, "loss": 0.0168, "step": 6100 }, { "epoch": 4.162125340599455, "grad_norm": 0.28285613656044006, "learning_rate": 3.598694425556278e-05, "loss": 0.0174, "step": 6110 }, { "epoch": 4.168937329700272, "grad_norm": 0.19794082641601562, "learning_rate": 3.58283006635736e-05, "loss": 0.0187, "step": 6120 }, { "epoch": 4.17574931880109, "grad_norm": 0.3199867308139801, "learning_rate": 3.566981205101781e-05, "loss": 0.0188, "step": 6130 }, { "epoch": 4.182561307901907, "grad_norm": 0.21557827293872833, "learning_rate": 3.5511480151101556e-05, "loss": 0.0179, "step": 6140 }, { "epoch": 4.189373297002724, "grad_norm": 0.22134508192539215, "learning_rate": 3.5353306695317104e-05, "loss": 0.0148, "step": 6150 }, { "epoch": 4.1961852861035425, "grad_norm": 0.2104470133781433, "learning_rate": 3.519529341342402e-05, "loss": 0.0193, "step": 6160 }, { "epoch": 4.20299727520436, "grad_norm": 0.2785221040248871, "learning_rate": 3.503744203343026e-05, "loss": 0.0182, "step": 6170 }, { "epoch": 4.209809264305177, "grad_norm": 0.27562573552131653, "learning_rate": 3.487975428157318e-05, "loss": 0.0197, "step": 6180 }, { "epoch": 4.216621253405995, "grad_norm": 0.35668033361434937, "learning_rate": 3.472223188230083e-05, "loss": 0.0196, "step": 6190 }, { "epoch": 4.223433242506812, "grad_norm": 0.2609441578388214, "learning_rate": 3.4564876558252866e-05, "loss": 0.0218, "step": 6200 }, { "epoch": 4.230245231607629, "grad_norm": 0.28609734773635864, "learning_rate": 3.440769003024195e-05, "loss": 0.0169, "step": 6210 }, { "epoch": 4.237057220708447, "grad_norm": 0.18339040875434875, "learning_rate": 3.425067401723477e-05, "loss": 0.0186, "step": 6220 }, { "epoch": 4.243869209809264, "grad_norm": 0.138119637966156, "learning_rate": 3.409383023633325e-05, "loss": 0.0177, "step": 6230 }, { "epoch": 4.2506811989100814, "grad_norm": 0.22962254285812378, "learning_rate": 3.3937160402755894e-05, "loss": 0.0172, "step": 6240 }, { "epoch": 4.2574931880108995, "grad_norm": 0.2682797908782959, "learning_rate": 3.378066622981885e-05, "loss": 0.0189, "step": 6250 }, { "epoch": 4.264305177111717, "grad_norm": 0.20227015018463135, "learning_rate": 3.362434942891738e-05, "loss": 0.022, "step": 6260 }, { "epoch": 4.271117166212534, "grad_norm": 0.2518126368522644, "learning_rate": 3.346821170950693e-05, "loss": 0.0195, "step": 6270 }, { "epoch": 4.277929155313352, "grad_norm": 0.2634272575378418, "learning_rate": 3.3312254779084585e-05, "loss": 0.0188, "step": 6280 }, { "epoch": 4.284741144414169, "grad_norm": 0.25564995408058167, "learning_rate": 3.315648034317039e-05, "loss": 0.0215, "step": 6290 }, { "epoch": 4.291553133514986, "grad_norm": 0.31350597739219666, "learning_rate": 3.3000890105288564e-05, "loss": 0.0181, "step": 6300 }, { "epoch": 4.298365122615804, "grad_norm": 0.23671625554561615, "learning_rate": 3.284548576694908e-05, "loss": 0.0176, "step": 6310 }, { "epoch": 4.305177111716621, "grad_norm": 0.2342391163110733, "learning_rate": 3.2690269027628815e-05, "loss": 0.0156, "step": 6320 }, { "epoch": 4.3119891008174385, "grad_norm": 0.24075733125209808, "learning_rate": 3.253524158475324e-05, "loss": 0.0187, "step": 6330 }, { "epoch": 4.3188010899182565, "grad_norm": 0.16117766499519348, "learning_rate": 3.238040513367757e-05, "loss": 0.019, "step": 6340 }, { "epoch": 4.325613079019074, "grad_norm": 0.16016744077205658, "learning_rate": 3.222576136766843e-05, "loss": 0.0146, "step": 6350 }, { "epoch": 4.332425068119891, "grad_norm": 0.236736461520195, "learning_rate": 3.2071311977885324e-05, "loss": 0.018, "step": 6360 }, { "epoch": 4.339237057220709, "grad_norm": 0.182217076420784, "learning_rate": 3.191705865336197e-05, "loss": 0.0171, "step": 6370 }, { "epoch": 4.346049046321526, "grad_norm": 0.19513262808322906, "learning_rate": 3.1763003080988075e-05, "loss": 0.0155, "step": 6380 }, { "epoch": 4.352861035422343, "grad_norm": 0.19296902418136597, "learning_rate": 3.160914694549063e-05, "loss": 0.0191, "step": 6390 }, { "epoch": 4.359673024523161, "grad_norm": 0.2901662588119507, "learning_rate": 3.145549192941573e-05, "loss": 0.0174, "step": 6400 }, { "epoch": 4.366485013623978, "grad_norm": 0.21007436513900757, "learning_rate": 3.130203971310999e-05, "loss": 0.0194, "step": 6410 }, { "epoch": 4.3732970027247955, "grad_norm": 0.19525951147079468, "learning_rate": 3.114879197470225e-05, "loss": 0.0181, "step": 6420 }, { "epoch": 4.3801089918256135, "grad_norm": 0.22157281637191772, "learning_rate": 3.0995750390085285e-05, "loss": 0.0165, "step": 6430 }, { "epoch": 4.386920980926431, "grad_norm": 0.2652972340583801, "learning_rate": 3.084291663289728e-05, "loss": 0.0171, "step": 6440 }, { "epoch": 4.393732970027248, "grad_norm": 0.24139529466629028, "learning_rate": 3.069029237450375e-05, "loss": 0.0143, "step": 6450 }, { "epoch": 4.400544959128065, "grad_norm": 0.17755670845508575, "learning_rate": 3.053787928397911e-05, "loss": 0.0145, "step": 6460 }, { "epoch": 4.407356948228883, "grad_norm": 0.33576318621635437, "learning_rate": 3.0385679028088526e-05, "loss": 0.0179, "step": 6470 }, { "epoch": 4.4141689373297, "grad_norm": 0.12500669062137604, "learning_rate": 3.023369327126959e-05, "loss": 0.0147, "step": 6480 }, { "epoch": 4.420980926430517, "grad_norm": 0.17751501500606537, "learning_rate": 3.0081923675614198e-05, "loss": 0.016, "step": 6490 }, { "epoch": 4.427792915531335, "grad_norm": 0.22551394999027252, "learning_rate": 2.993037190085034e-05, "loss": 0.0157, "step": 6500 }, { "epoch": 4.4346049046321525, "grad_norm": 0.22585496306419373, "learning_rate": 2.977903960432392e-05, "loss": 0.0168, "step": 6510 }, { "epoch": 4.44141689373297, "grad_norm": 0.21578261256217957, "learning_rate": 2.9627928440980722e-05, "loss": 0.0187, "step": 6520 }, { "epoch": 4.448228882833788, "grad_norm": 0.22021692991256714, "learning_rate": 2.9477040063348183e-05, "loss": 0.0188, "step": 6530 }, { "epoch": 4.455040871934605, "grad_norm": 0.28897762298583984, "learning_rate": 2.9326376121517456e-05, "loss": 0.0165, "step": 6540 }, { "epoch": 4.461852861035422, "grad_norm": 0.25159355998039246, "learning_rate": 2.9175938263125236e-05, "loss": 0.0152, "step": 6550 }, { "epoch": 4.46866485013624, "grad_norm": 0.23536550998687744, "learning_rate": 2.9025728133335873e-05, "loss": 0.0203, "step": 6560 }, { "epoch": 4.475476839237057, "grad_norm": 0.2687535583972931, "learning_rate": 2.8875747374823288e-05, "loss": 0.0217, "step": 6570 }, { "epoch": 4.482288828337874, "grad_norm": 0.28790390491485596, "learning_rate": 2.872599762775298e-05, "loss": 0.0228, "step": 6580 }, { "epoch": 4.489100817438692, "grad_norm": 0.20599423348903656, "learning_rate": 2.857648052976425e-05, "loss": 0.0154, "step": 6590 }, { "epoch": 4.4959128065395095, "grad_norm": 0.25862014293670654, "learning_rate": 2.8427197715952047e-05, "loss": 0.0169, "step": 6600 }, { "epoch": 4.502724795640327, "grad_norm": 0.29005661606788635, "learning_rate": 2.8278150818849393e-05, "loss": 0.0176, "step": 6610 }, { "epoch": 4.509536784741145, "grad_norm": 0.20444929599761963, "learning_rate": 2.812934146840922e-05, "loss": 0.0168, "step": 6620 }, { "epoch": 4.516348773841962, "grad_norm": 0.16426856815814972, "learning_rate": 2.7980771291986764e-05, "loss": 0.0183, "step": 6630 }, { "epoch": 4.523160762942779, "grad_norm": 0.2749025225639343, "learning_rate": 2.783244191432167e-05, "loss": 0.0175, "step": 6640 }, { "epoch": 4.529972752043597, "grad_norm": 0.28311431407928467, "learning_rate": 2.768435495752022e-05, "loss": 0.0151, "step": 6650 }, { "epoch": 4.536784741144414, "grad_norm": 0.24218498170375824, "learning_rate": 2.753651204103771e-05, "loss": 0.0175, "step": 6660 }, { "epoch": 4.543596730245231, "grad_norm": 0.214820995926857, "learning_rate": 2.7388914781660523e-05, "loss": 0.0138, "step": 6670 }, { "epoch": 4.550408719346049, "grad_norm": 0.2261001467704773, "learning_rate": 2.7241564793488693e-05, "loss": 0.0183, "step": 6680 }, { "epoch": 4.5572207084468666, "grad_norm": 0.21669824421405792, "learning_rate": 2.7094463687918037e-05, "loss": 0.0161, "step": 6690 }, { "epoch": 4.564032697547684, "grad_norm": 0.25731489062309265, "learning_rate": 2.694761307362268e-05, "loss": 0.0149, "step": 6700 }, { "epoch": 4.570844686648502, "grad_norm": 0.29376113414764404, "learning_rate": 2.6801014556537467e-05, "loss": 0.0179, "step": 6710 }, { "epoch": 4.577656675749319, "grad_norm": 0.2186402678489685, "learning_rate": 2.6654669739840243e-05, "loss": 0.0191, "step": 6720 }, { "epoch": 4.584468664850136, "grad_norm": 0.21597842872142792, "learning_rate": 2.650858022393451e-05, "loss": 0.019, "step": 6730 }, { "epoch": 4.591280653950953, "grad_norm": 0.20672723650932312, "learning_rate": 2.6362747606431747e-05, "loss": 0.0173, "step": 6740 }, { "epoch": 4.598092643051771, "grad_norm": 0.16333183646202087, "learning_rate": 2.6217173482134172e-05, "loss": 0.0149, "step": 6750 }, { "epoch": 4.604904632152588, "grad_norm": 0.3173683285713196, "learning_rate": 2.6071859443017044e-05, "loss": 0.0136, "step": 6760 }, { "epoch": 4.6117166212534055, "grad_norm": 0.331967294216156, "learning_rate": 2.5926807078211414e-05, "loss": 0.0147, "step": 6770 }, { "epoch": 4.618528610354224, "grad_norm": 0.26017463207244873, "learning_rate": 2.5782017973986728e-05, "loss": 0.0151, "step": 6780 }, { "epoch": 4.625340599455041, "grad_norm": 0.17480212450027466, "learning_rate": 2.5637493713733374e-05, "loss": 0.0171, "step": 6790 }, { "epoch": 4.632152588555858, "grad_norm": 0.20509187877178192, "learning_rate": 2.549323587794559e-05, "loss": 0.0203, "step": 6800 }, { "epoch": 4.638964577656676, "grad_norm": 0.203098326921463, "learning_rate": 2.5349246044203895e-05, "loss": 0.0144, "step": 6810 }, { "epoch": 4.645776566757493, "grad_norm": 0.25146251916885376, "learning_rate": 2.520552578715808e-05, "loss": 0.0159, "step": 6820 }, { "epoch": 4.65258855585831, "grad_norm": 0.2880435585975647, "learning_rate": 2.506207667850981e-05, "loss": 0.0154, "step": 6830 }, { "epoch": 4.659400544959128, "grad_norm": 0.1960860937833786, "learning_rate": 2.4918900286995555e-05, "loss": 0.0155, "step": 6840 }, { "epoch": 4.666212534059945, "grad_norm": 0.1823454052209854, "learning_rate": 2.4775998178369458e-05, "loss": 0.0145, "step": 6850 }, { "epoch": 4.6730245231607626, "grad_norm": 0.2692583203315735, "learning_rate": 2.4633371915386017e-05, "loss": 0.0161, "step": 6860 }, { "epoch": 4.679836512261581, "grad_norm": 0.30107152462005615, "learning_rate": 2.4491023057783235e-05, "loss": 0.0184, "step": 6870 }, { "epoch": 4.686648501362398, "grad_norm": 0.19429023563861847, "learning_rate": 2.4348953162265375e-05, "loss": 0.0179, "step": 6880 }, { "epoch": 4.693460490463215, "grad_norm": 0.18987010419368744, "learning_rate": 2.420716378248607e-05, "loss": 0.0191, "step": 6890 }, { "epoch": 4.700272479564033, "grad_norm": 0.21912752091884613, "learning_rate": 2.4065656469031266e-05, "loss": 0.0136, "step": 6900 }, { "epoch": 4.70708446866485, "grad_norm": 0.17700830101966858, "learning_rate": 2.3924432769402268e-05, "loss": 0.0167, "step": 6910 }, { "epoch": 4.713896457765667, "grad_norm": 0.14365394413471222, "learning_rate": 2.3783494227998844e-05, "loss": 0.0203, "step": 6920 }, { "epoch": 4.720708446866485, "grad_norm": 0.2490224689245224, "learning_rate": 2.3642842386102264e-05, "loss": 0.0163, "step": 6930 }, { "epoch": 4.727520435967302, "grad_norm": 0.3222252428531647, "learning_rate": 2.3502478781858567e-05, "loss": 0.0133, "step": 6940 }, { "epoch": 4.73433242506812, "grad_norm": 0.2206520438194275, "learning_rate": 2.3362404950261628e-05, "loss": 0.0164, "step": 6950 }, { "epoch": 4.741144414168938, "grad_norm": 0.21772713959217072, "learning_rate": 2.3222622423136458e-05, "loss": 0.0148, "step": 6960 }, { "epoch": 4.747956403269755, "grad_norm": 0.18722061812877655, "learning_rate": 2.3083132729122332e-05, "loss": 0.014, "step": 6970 }, { "epoch": 4.754768392370572, "grad_norm": 0.3535923659801483, "learning_rate": 2.294393739365621e-05, "loss": 0.0211, "step": 6980 }, { "epoch": 4.76158038147139, "grad_norm": 0.1893048882484436, "learning_rate": 2.2805037938956e-05, "loss": 0.0167, "step": 6990 }, { "epoch": 4.768392370572207, "grad_norm": 0.23466837406158447, "learning_rate": 2.266643588400386e-05, "loss": 0.0169, "step": 7000 }, { "epoch": 4.775204359673024, "grad_norm": 0.1818532645702362, "learning_rate": 2.252813274452969e-05, "loss": 0.0174, "step": 7010 }, { "epoch": 4.782016348773842, "grad_norm": 0.24044625461101532, "learning_rate": 2.2390130032994427e-05, "loss": 0.0146, "step": 7020 }, { "epoch": 4.7888283378746594, "grad_norm": 0.19146227836608887, "learning_rate": 2.2252429258573633e-05, "loss": 0.0163, "step": 7030 }, { "epoch": 4.795640326975477, "grad_norm": 0.2928459644317627, "learning_rate": 2.2115031927140904e-05, "loss": 0.0159, "step": 7040 }, { "epoch": 4.802452316076295, "grad_norm": 0.26016002893447876, "learning_rate": 2.1977939541251463e-05, "loss": 0.0182, "step": 7050 }, { "epoch": 4.809264305177112, "grad_norm": 0.2691255509853363, "learning_rate": 2.1841153600125684e-05, "loss": 0.0158, "step": 7060 }, { "epoch": 4.816076294277929, "grad_norm": 0.21671241521835327, "learning_rate": 2.170467559963267e-05, "loss": 0.0167, "step": 7070 }, { "epoch": 4.822888283378747, "grad_norm": 0.2578423023223877, "learning_rate": 2.1568507032273982e-05, "loss": 0.0131, "step": 7080 }, { "epoch": 4.829700272479564, "grad_norm": 0.22187665104866028, "learning_rate": 2.1432649387167264e-05, "loss": 0.0147, "step": 7090 }, { "epoch": 4.836512261580381, "grad_norm": 0.16120664775371552, "learning_rate": 2.1297104150029973e-05, "loss": 0.0146, "step": 7100 }, { "epoch": 4.843324250681199, "grad_norm": 0.20281171798706055, "learning_rate": 2.116187280316307e-05, "loss": 0.0163, "step": 7110 }, { "epoch": 4.8501362397820165, "grad_norm": 0.30870872735977173, "learning_rate": 2.1026956825434908e-05, "loss": 0.0137, "step": 7120 }, { "epoch": 4.856948228882834, "grad_norm": 0.16448527574539185, "learning_rate": 2.0892357692265017e-05, "loss": 0.0135, "step": 7130 }, { "epoch": 4.863760217983652, "grad_norm": 0.229940727353096, "learning_rate": 2.0758076875607947e-05, "loss": 0.0159, "step": 7140 }, { "epoch": 4.870572207084469, "grad_norm": 0.1661119908094406, "learning_rate": 2.0624115843937207e-05, "loss": 0.0171, "step": 7150 }, { "epoch": 4.877384196185286, "grad_norm": 0.17345386743545532, "learning_rate": 2.0490476062229157e-05, "loss": 0.0156, "step": 7160 }, { "epoch": 4.884196185286104, "grad_norm": 0.30998191237449646, "learning_rate": 2.035715899194704e-05, "loss": 0.0151, "step": 7170 }, { "epoch": 4.891008174386921, "grad_norm": 0.16312265396118164, "learning_rate": 2.022416609102499e-05, "loss": 0.0146, "step": 7180 }, { "epoch": 4.897820163487738, "grad_norm": 0.19796396791934967, "learning_rate": 2.009149881385205e-05, "loss": 0.0197, "step": 7190 }, { "epoch": 4.904632152588556, "grad_norm": 0.27385029196739197, "learning_rate": 1.995915861125634e-05, "loss": 0.0143, "step": 7200 }, { "epoch": 4.9114441416893735, "grad_norm": 0.13566231727600098, "learning_rate": 1.9827146930489065e-05, "loss": 0.0131, "step": 7210 }, { "epoch": 4.918256130790191, "grad_norm": 0.26954782009124756, "learning_rate": 1.9695465215208848e-05, "loss": 0.018, "step": 7220 }, { "epoch": 4.925068119891008, "grad_norm": 0.20488935708999634, "learning_rate": 1.9564114905465813e-05, "loss": 0.0139, "step": 7230 }, { "epoch": 4.931880108991826, "grad_norm": 0.25250253081321716, "learning_rate": 1.9433097437685936e-05, "loss": 0.014, "step": 7240 }, { "epoch": 4.938692098092643, "grad_norm": 0.22722044587135315, "learning_rate": 1.930241424465521e-05, "loss": 0.0129, "step": 7250 }, { "epoch": 4.94550408719346, "grad_norm": 0.12395540624856949, "learning_rate": 1.9172066755504115e-05, "loss": 0.0117, "step": 7260 }, { "epoch": 4.952316076294278, "grad_norm": 0.1848660111427307, "learning_rate": 1.9042056395691914e-05, "loss": 0.0153, "step": 7270 }, { "epoch": 4.959128065395095, "grad_norm": 0.1646895408630371, "learning_rate": 1.8912384586991066e-05, "loss": 0.0127, "step": 7280 }, { "epoch": 4.9659400544959125, "grad_norm": 0.2536143958568573, "learning_rate": 1.8783052747471717e-05, "loss": 0.0145, "step": 7290 }, { "epoch": 4.9727520435967305, "grad_norm": 0.17167410254478455, "learning_rate": 1.865406229148611e-05, "loss": 0.0138, "step": 7300 }, { "epoch": 4.979564032697548, "grad_norm": 0.24148517847061157, "learning_rate": 1.8525414629653233e-05, "loss": 0.016, "step": 7310 }, { "epoch": 4.986376021798365, "grad_norm": 0.2849847674369812, "learning_rate": 1.8397111168843255e-05, "loss": 0.0142, "step": 7320 }, { "epoch": 4.993188010899183, "grad_norm": 0.19562356173992157, "learning_rate": 1.8269153312162323e-05, "loss": 0.0153, "step": 7330 }, { "epoch": 5.0, "grad_norm": 0.25182366371154785, "learning_rate": 1.8141542458937054e-05, "loss": 0.0128, "step": 7340 }, { "epoch": 5.006811989100817, "grad_norm": 0.22833839058876038, "learning_rate": 1.8014280004699268e-05, "loss": 0.0127, "step": 7350 }, { "epoch": 5.013623978201635, "grad_norm": 0.17050805687904358, "learning_rate": 1.788736734117078e-05, "loss": 0.0113, "step": 7360 }, { "epoch": 5.020435967302452, "grad_norm": 0.2042902410030365, "learning_rate": 1.7760805856248152e-05, "loss": 0.0131, "step": 7370 }, { "epoch": 5.0272479564032695, "grad_norm": 0.24889463186264038, "learning_rate": 1.7634596933987518e-05, "loss": 0.0164, "step": 7380 }, { "epoch": 5.0340599455040875, "grad_norm": 0.2117907702922821, "learning_rate": 1.7508741954589404e-05, "loss": 0.0177, "step": 7390 }, { "epoch": 5.040871934604905, "grad_norm": 0.17846384644508362, "learning_rate": 1.7383242294383717e-05, "loss": 0.0135, "step": 7400 }, { "epoch": 5.047683923705722, "grad_norm": 0.17487211525440216, "learning_rate": 1.7258099325814632e-05, "loss": 0.0115, "step": 7410 }, { "epoch": 5.05449591280654, "grad_norm": 0.18704567849636078, "learning_rate": 1.7133314417425594e-05, "loss": 0.0146, "step": 7420 }, { "epoch": 5.061307901907357, "grad_norm": 0.24722889065742493, "learning_rate": 1.7008888933844408e-05, "loss": 0.0148, "step": 7430 }, { "epoch": 5.068119891008174, "grad_norm": 0.19086501002311707, "learning_rate": 1.6884824235768172e-05, "loss": 0.0148, "step": 7440 }, { "epoch": 5.074931880108992, "grad_norm": 0.18787351250648499, "learning_rate": 1.6761121679948592e-05, "loss": 0.0125, "step": 7450 }, { "epoch": 5.081743869209809, "grad_norm": 0.28582966327667236, "learning_rate": 1.663778261917695e-05, "loss": 0.0148, "step": 7460 }, { "epoch": 5.0885558583106265, "grad_norm": 0.1483089029788971, "learning_rate": 1.651480840226952e-05, "loss": 0.0123, "step": 7470 }, { "epoch": 5.0953678474114446, "grad_norm": 0.24858340620994568, "learning_rate": 1.639220037405258e-05, "loss": 0.0148, "step": 7480 }, { "epoch": 5.102179836512262, "grad_norm": 0.1595468819141388, "learning_rate": 1.6269959875347906e-05, "loss": 0.0137, "step": 7490 }, { "epoch": 5.108991825613079, "grad_norm": 0.27670302987098694, "learning_rate": 1.614808824295802e-05, "loss": 0.0143, "step": 7500 }, { "epoch": 5.115803814713897, "grad_norm": 0.13641585409641266, "learning_rate": 1.602658680965152e-05, "loss": 0.0103, "step": 7510 }, { "epoch": 5.122615803814714, "grad_norm": 0.23924027383327484, "learning_rate": 1.5905456904148686e-05, "loss": 0.0146, "step": 7520 }, { "epoch": 5.129427792915531, "grad_norm": 0.21490581333637238, "learning_rate": 1.57846998511067e-05, "loss": 0.0146, "step": 7530 }, { "epoch": 5.136239782016348, "grad_norm": 0.28059524297714233, "learning_rate": 1.566431697110538e-05, "loss": 0.0117, "step": 7540 }, { "epoch": 5.143051771117166, "grad_norm": 0.20346660912036896, "learning_rate": 1.554430958063259e-05, "loss": 0.0113, "step": 7550 }, { "epoch": 5.1498637602179835, "grad_norm": 0.17303743958473206, "learning_rate": 1.5424678992069912e-05, "loss": 0.0123, "step": 7560 }, { "epoch": 5.156675749318801, "grad_norm": 0.21222537755966187, "learning_rate": 1.5305426513678362e-05, "loss": 0.012, "step": 7570 }, { "epoch": 5.163487738419619, "grad_norm": 0.22923482954502106, "learning_rate": 1.518655344958388e-05, "loss": 0.0136, "step": 7580 }, { "epoch": 5.170299727520436, "grad_norm": 0.28508874773979187, "learning_rate": 1.5068061099763275e-05, "loss": 0.0124, "step": 7590 }, { "epoch": 5.177111716621253, "grad_norm": 0.31998851895332336, "learning_rate": 1.494995076002988e-05, "loss": 0.0155, "step": 7600 }, { "epoch": 5.183923705722071, "grad_norm": 0.20339614152908325, "learning_rate": 1.4832223722019456e-05, "loss": 0.0125, "step": 7610 }, { "epoch": 5.190735694822888, "grad_norm": 0.19148500263690948, "learning_rate": 1.4714881273176035e-05, "loss": 0.0139, "step": 7620 }, { "epoch": 5.197547683923705, "grad_norm": 0.17169404029846191, "learning_rate": 1.4597924696737835e-05, "loss": 0.0123, "step": 7630 }, { "epoch": 5.204359673024523, "grad_norm": 0.17913302779197693, "learning_rate": 1.4481355271723252e-05, "loss": 0.0123, "step": 7640 }, { "epoch": 5.2111716621253406, "grad_norm": 0.34465935826301575, "learning_rate": 1.4365174272916809e-05, "loss": 0.0154, "step": 7650 }, { "epoch": 5.217983651226158, "grad_norm": 0.17733906209468842, "learning_rate": 1.4249382970855319e-05, "loss": 0.012, "step": 7660 }, { "epoch": 5.224795640326976, "grad_norm": 0.12495987117290497, "learning_rate": 1.4133982631813903e-05, "loss": 0.014, "step": 7670 }, { "epoch": 5.231607629427793, "grad_norm": 0.14085407555103302, "learning_rate": 1.4018974517792194e-05, "loss": 0.0108, "step": 7680 }, { "epoch": 5.23841961852861, "grad_norm": 0.1950143575668335, "learning_rate": 1.390435988650048e-05, "loss": 0.0115, "step": 7690 }, { "epoch": 5.245231607629428, "grad_norm": 0.2449447363615036, "learning_rate": 1.3790139991346006e-05, "loss": 0.0126, "step": 7700 }, { "epoch": 5.252043596730245, "grad_norm": 0.148986354470253, "learning_rate": 1.367631608141926e-05, "loss": 0.0117, "step": 7710 }, { "epoch": 5.258855585831062, "grad_norm": 0.2584574520587921, "learning_rate": 1.3562889401480278e-05, "loss": 0.0122, "step": 7720 }, { "epoch": 5.26566757493188, "grad_norm": 0.19506706297397614, "learning_rate": 1.3449861191945074e-05, "loss": 0.0145, "step": 7730 }, { "epoch": 5.272479564032698, "grad_norm": 0.15596213936805725, "learning_rate": 1.3337232688872009e-05, "loss": 0.0131, "step": 7740 }, { "epoch": 5.279291553133515, "grad_norm": 0.227974072098732, "learning_rate": 1.3225005123948364e-05, "loss": 0.0128, "step": 7750 }, { "epoch": 5.286103542234333, "grad_norm": 0.15332451462745667, "learning_rate": 1.311317972447681e-05, "loss": 0.0103, "step": 7760 }, { "epoch": 5.29291553133515, "grad_norm": 0.14478209614753723, "learning_rate": 1.3001757713361996e-05, "loss": 0.0123, "step": 7770 }, { "epoch": 5.299727520435967, "grad_norm": 0.2231355756521225, "learning_rate": 1.2890740309097204e-05, "loss": 0.0122, "step": 7780 }, { "epoch": 5.306539509536785, "grad_norm": 0.15700560808181763, "learning_rate": 1.2780128725750944e-05, "loss": 0.0122, "step": 7790 }, { "epoch": 5.313351498637602, "grad_norm": 0.1040923222899437, "learning_rate": 1.266992417295379e-05, "loss": 0.0122, "step": 7800 }, { "epoch": 5.320163487738419, "grad_norm": 0.23007836937904358, "learning_rate": 1.2560127855885073e-05, "loss": 0.0125, "step": 7810 }, { "epoch": 5.3269754768392374, "grad_norm": 0.19076195359230042, "learning_rate": 1.2450740975259745e-05, "loss": 0.0146, "step": 7820 }, { "epoch": 5.333787465940055, "grad_norm": 0.1625741422176361, "learning_rate": 1.234176472731517e-05, "loss": 0.0206, "step": 7830 }, { "epoch": 5.340599455040872, "grad_norm": 0.19957181811332703, "learning_rate": 1.2233200303798158e-05, "loss": 0.0111, "step": 7840 }, { "epoch": 5.34741144414169, "grad_norm": 0.26022225618362427, "learning_rate": 1.2125048891951846e-05, "loss": 0.015, "step": 7850 }, { "epoch": 5.354223433242507, "grad_norm": 0.19087891280651093, "learning_rate": 1.2017311674502745e-05, "loss": 0.0099, "step": 7860 }, { "epoch": 5.361035422343324, "grad_norm": 0.20264586806297302, "learning_rate": 1.1909989829647822e-05, "loss": 0.0111, "step": 7870 }, { "epoch": 5.367847411444142, "grad_norm": 0.15192349255084991, "learning_rate": 1.1803084531041553e-05, "loss": 0.0133, "step": 7880 }, { "epoch": 5.374659400544959, "grad_norm": 0.1685389131307602, "learning_rate": 1.1696596947783162e-05, "loss": 0.0115, "step": 7890 }, { "epoch": 5.381471389645776, "grad_norm": 0.16295170783996582, "learning_rate": 1.1590528244403803e-05, "loss": 0.012, "step": 7900 }, { "epoch": 5.3882833787465945, "grad_norm": 0.21759124100208282, "learning_rate": 1.148487958085382e-05, "loss": 0.013, "step": 7910 }, { "epoch": 5.395095367847412, "grad_norm": 0.14942030608654022, "learning_rate": 1.1379652112490086e-05, "loss": 0.0139, "step": 7920 }, { "epoch": 5.401907356948229, "grad_norm": 0.22778572142124176, "learning_rate": 1.1274846990063315e-05, "loss": 0.0121, "step": 7930 }, { "epoch": 5.408719346049046, "grad_norm": 0.18669773638248444, "learning_rate": 1.117046535970554e-05, "loss": 0.0147, "step": 7940 }, { "epoch": 5.415531335149864, "grad_norm": 0.15506009757518768, "learning_rate": 1.106650836291755e-05, "loss": 0.0177, "step": 7950 }, { "epoch": 5.422343324250681, "grad_norm": 0.2541573643684387, "learning_rate": 1.0962977136556418e-05, "loss": 0.0153, "step": 7960 }, { "epoch": 5.429155313351498, "grad_norm": 0.1981164813041687, "learning_rate": 1.0859872812823024e-05, "loss": 0.0132, "step": 7970 }, { "epoch": 5.435967302452316, "grad_norm": 0.2570594549179077, "learning_rate": 1.0757196519249747e-05, "loss": 0.016, "step": 7980 }, { "epoch": 5.4427792915531334, "grad_norm": 0.11997192353010178, "learning_rate": 1.0654949378688077e-05, "loss": 0.014, "step": 7990 }, { "epoch": 5.449591280653951, "grad_norm": 0.194411501288414, "learning_rate": 1.0553132509296376e-05, "loss": 0.013, "step": 8000 }, { "epoch": 5.456403269754769, "grad_norm": 0.16398310661315918, "learning_rate": 1.0451747024527613e-05, "loss": 0.013, "step": 8010 }, { "epoch": 5.463215258855586, "grad_norm": 0.2351941168308258, "learning_rate": 1.0350794033117189e-05, "loss": 0.0141, "step": 8020 }, { "epoch": 5.470027247956403, "grad_norm": 0.254794180393219, "learning_rate": 1.0250274639070856e-05, "loss": 0.0115, "step": 8030 }, { "epoch": 5.476839237057221, "grad_norm": 0.12862807512283325, "learning_rate": 1.0150189941652599e-05, "loss": 0.0104, "step": 8040 }, { "epoch": 5.483651226158038, "grad_norm": 0.1112130880355835, "learning_rate": 1.0050541035372635e-05, "loss": 0.0105, "step": 8050 }, { "epoch": 5.490463215258855, "grad_norm": 0.15486888587474823, "learning_rate": 9.951329009975458e-06, "loss": 0.0108, "step": 8060 }, { "epoch": 5.497275204359673, "grad_norm": 0.13335685431957245, "learning_rate": 9.852554950427845e-06, "loss": 0.0128, "step": 8070 }, { "epoch": 5.5040871934604905, "grad_norm": 0.16484335064888, "learning_rate": 9.754219936907105e-06, "loss": 0.0123, "step": 8080 }, { "epoch": 5.510899182561308, "grad_norm": 0.13687945902347565, "learning_rate": 9.656325044789194e-06, "loss": 0.0096, "step": 8090 }, { "epoch": 5.517711171662126, "grad_norm": 0.1303662657737732, "learning_rate": 9.55887134463697e-06, "loss": 0.0086, "step": 8100 }, { "epoch": 5.524523160762943, "grad_norm": 0.1451333612203598, "learning_rate": 9.461859902188475e-06, "loss": 0.0119, "step": 8110 }, { "epoch": 5.53133514986376, "grad_norm": 0.2170574814081192, "learning_rate": 9.365291778345303e-06, "loss": 0.0121, "step": 8120 }, { "epoch": 5.538147138964578, "grad_norm": 0.16463404893875122, "learning_rate": 9.269168029160991e-06, "loss": 0.0089, "step": 8130 }, { "epoch": 5.544959128065395, "grad_norm": 0.2275201976299286, "learning_rate": 9.173489705829447e-06, "loss": 0.0129, "step": 8140 }, { "epoch": 5.551771117166212, "grad_norm": 0.11590515077114105, "learning_rate": 9.078257854673516e-06, "loss": 0.0124, "step": 8150 }, { "epoch": 5.55858310626703, "grad_norm": 0.1156085953116417, "learning_rate": 8.983473517133429e-06, "loss": 0.0135, "step": 8160 }, { "epoch": 5.5653950953678475, "grad_norm": 0.12850528955459595, "learning_rate": 8.889137729755537e-06, "loss": 0.0105, "step": 8170 }, { "epoch": 5.572207084468665, "grad_norm": 0.2325507402420044, "learning_rate": 8.79525152418087e-06, "loss": 0.0136, "step": 8180 }, { "epoch": 5.579019073569482, "grad_norm": 0.17301425337791443, "learning_rate": 8.701815927133961e-06, "loss": 0.0124, "step": 8190 }, { "epoch": 5.5858310626703, "grad_norm": 0.13355191051959991, "learning_rate": 8.608831960411534e-06, "loss": 0.0114, "step": 8200 }, { "epoch": 5.592643051771117, "grad_norm": 0.2157035917043686, "learning_rate": 8.516300640871321e-06, "loss": 0.0126, "step": 8210 }, { "epoch": 5.599455040871934, "grad_norm": 0.16316112875938416, "learning_rate": 8.424222980421038e-06, "loss": 0.0133, "step": 8220 }, { "epoch": 5.606267029972752, "grad_norm": 0.10164311528205872, "learning_rate": 8.332599986007184e-06, "loss": 0.0111, "step": 8230 }, { "epoch": 5.613079019073569, "grad_norm": 0.13754205405712128, "learning_rate": 8.241432659604203e-06, "loss": 0.0098, "step": 8240 }, { "epoch": 5.6198910081743865, "grad_norm": 0.17243002355098724, "learning_rate": 8.150721998203331e-06, "loss": 0.0092, "step": 8250 }, { "epoch": 5.6267029972752045, "grad_norm": 0.14749637246131897, "learning_rate": 8.06046899380184e-06, "loss": 0.0098, "step": 8260 }, { "epoch": 5.633514986376022, "grad_norm": 0.12213444709777832, "learning_rate": 7.970674633392133e-06, "loss": 0.0139, "step": 8270 }, { "epoch": 5.640326975476839, "grad_norm": 0.1787102073431015, "learning_rate": 7.881339898950924e-06, "loss": 0.0142, "step": 8280 }, { "epoch": 5.647138964577657, "grad_norm": 0.10263296216726303, "learning_rate": 7.792465767428597e-06, "loss": 0.0116, "step": 8290 }, { "epoch": 5.653950953678474, "grad_norm": 0.11837161332368851, "learning_rate": 7.704053210738376e-06, "loss": 0.0095, "step": 8300 }, { "epoch": 5.660762942779291, "grad_norm": 0.13488389551639557, "learning_rate": 7.6161031957458494e-06, "loss": 0.0138, "step": 8310 }, { "epoch": 5.667574931880109, "grad_norm": 0.19569364190101624, "learning_rate": 7.5286166842582605e-06, "loss": 0.0121, "step": 8320 }, { "epoch": 5.674386920980926, "grad_norm": 0.19341343641281128, "learning_rate": 7.4415946330140814e-06, "loss": 0.0115, "step": 8330 }, { "epoch": 5.6811989100817435, "grad_norm": 0.15243728458881378, "learning_rate": 7.3550379936725644e-06, "loss": 0.0114, "step": 8340 }, { "epoch": 5.6880108991825615, "grad_norm": 0.13914422690868378, "learning_rate": 7.2689477128032035e-06, "loss": 0.0125, "step": 8350 }, { "epoch": 5.694822888283379, "grad_norm": 0.15893633663654327, "learning_rate": 7.183324731875551e-06, "loss": 0.0098, "step": 8360 }, { "epoch": 5.701634877384196, "grad_norm": 0.16882383823394775, "learning_rate": 7.098169987248782e-06, "loss": 0.0089, "step": 8370 }, { "epoch": 5.708446866485014, "grad_norm": 0.11707707494497299, "learning_rate": 7.013484410161553e-06, "loss": 0.0111, "step": 8380 }, { "epoch": 5.715258855585831, "grad_norm": 0.15138401091098785, "learning_rate": 6.92926892672176e-06, "loss": 0.011, "step": 8390 }, { "epoch": 5.722070844686648, "grad_norm": 0.15782202780246735, "learning_rate": 6.845524457896446e-06, "loss": 0.0087, "step": 8400 }, { "epoch": 5.728882833787466, "grad_norm": 0.14364789426326752, "learning_rate": 6.7622519195017165e-06, "loss": 0.0099, "step": 8410 }, { "epoch": 5.735694822888283, "grad_norm": 0.1990385502576828, "learning_rate": 6.679452222192684e-06, "loss": 0.0099, "step": 8420 }, { "epoch": 5.7425068119891005, "grad_norm": 0.11444421857595444, "learning_rate": 6.597126271453579e-06, "loss": 0.0088, "step": 8430 }, { "epoch": 5.7493188010899186, "grad_norm": 0.09519212692975998, "learning_rate": 6.51527496758782e-06, "loss": 0.0089, "step": 8440 }, { "epoch": 5.756130790190736, "grad_norm": 0.1508159637451172, "learning_rate": 6.433899205708155e-06, "loss": 0.0097, "step": 8450 }, { "epoch": 5.762942779291553, "grad_norm": 0.12732820212841034, "learning_rate": 6.352999875726856e-06, "loss": 0.0091, "step": 8460 }, { "epoch": 5.769754768392371, "grad_norm": 0.09891568869352341, "learning_rate": 6.272577862346052e-06, "loss": 0.0113, "step": 8470 }, { "epoch": 5.776566757493188, "grad_norm": 0.2046702355146408, "learning_rate": 6.192634045047996e-06, "loss": 0.0112, "step": 8480 }, { "epoch": 5.783378746594005, "grad_norm": 0.2202032059431076, "learning_rate": 6.113169298085458e-06, "loss": 0.0166, "step": 8490 }, { "epoch": 5.790190735694823, "grad_norm": 0.2339613288640976, "learning_rate": 6.034184490472195e-06, "loss": 0.0079, "step": 8500 }, { "epoch": 5.79700272479564, "grad_norm": 0.20225585997104645, "learning_rate": 5.955680485973386e-06, "loss": 0.0131, "step": 8510 }, { "epoch": 5.8038147138964575, "grad_norm": 0.2018497884273529, "learning_rate": 5.877658143096265e-06, "loss": 0.011, "step": 8520 }, { "epoch": 5.810626702997276, "grad_norm": 0.13856525719165802, "learning_rate": 5.800118315080661e-06, "loss": 0.01, "step": 8530 }, { "epoch": 5.817438692098093, "grad_norm": 0.1401432752609253, "learning_rate": 5.723061849889716e-06, "loss": 0.0084, "step": 8540 }, { "epoch": 5.82425068119891, "grad_norm": 0.1731623411178589, "learning_rate": 5.646489590200604e-06, "loss": 0.0126, "step": 8550 }, { "epoch": 5.831062670299728, "grad_norm": 0.12786374986171722, "learning_rate": 5.570402373395256e-06, "loss": 0.01, "step": 8560 }, { "epoch": 5.837874659400545, "grad_norm": 0.2641719877719879, "learning_rate": 5.494801031551305e-06, "loss": 0.0111, "step": 8570 }, { "epoch": 5.844686648501362, "grad_norm": 0.16163118183612823, "learning_rate": 5.41968639143291e-06, "loss": 0.0106, "step": 8580 }, { "epoch": 5.85149863760218, "grad_norm": 0.1381234973669052, "learning_rate": 5.345059274481751e-06, "loss": 0.0093, "step": 8590 }, { "epoch": 5.858310626702997, "grad_norm": 0.1420307159423828, "learning_rate": 5.270920496808002e-06, "loss": 0.0134, "step": 8600 }, { "epoch": 5.8651226158038146, "grad_norm": 0.1673470139503479, "learning_rate": 5.1972708691814695e-06, "loss": 0.0109, "step": 8610 }, { "epoch": 5.871934604904633, "grad_norm": 0.2173473834991455, "learning_rate": 5.124111197022674e-06, "loss": 0.0119, "step": 8620 }, { "epoch": 5.87874659400545, "grad_norm": 0.11630476266145706, "learning_rate": 5.051442280394081e-06, "loss": 0.009, "step": 8630 }, { "epoch": 5.885558583106267, "grad_norm": 0.0949091911315918, "learning_rate": 4.979264913991322e-06, "loss": 0.0119, "step": 8640 }, { "epoch": 5.892370572207085, "grad_norm": 0.16577839851379395, "learning_rate": 4.907579887134489e-06, "loss": 0.0126, "step": 8650 }, { "epoch": 5.899182561307902, "grad_norm": 0.17283402383327484, "learning_rate": 4.836387983759572e-06, "loss": 0.011, "step": 8660 }, { "epoch": 5.905994550408719, "grad_norm": 0.1463468372821808, "learning_rate": 4.765689982409816e-06, "loss": 0.0102, "step": 8670 }, { "epoch": 5.912806539509537, "grad_norm": 0.21168796718120575, "learning_rate": 4.695486656227233e-06, "loss": 0.0132, "step": 8680 }, { "epoch": 5.919618528610354, "grad_norm": 0.24207310378551483, "learning_rate": 4.625778772944156e-06, "loss": 0.0119, "step": 8690 }, { "epoch": 5.926430517711172, "grad_norm": 0.20093917846679688, "learning_rate": 4.556567094874825e-06, "loss": 0.0123, "step": 8700 }, { "epoch": 5.933242506811989, "grad_norm": 0.11250998079776764, "learning_rate": 4.487852378907059e-06, "loss": 0.0076, "step": 8710 }, { "epoch": 5.940054495912807, "grad_norm": 0.11169631779193878, "learning_rate": 4.419635376493986e-06, "loss": 0.0113, "step": 8720 }, { "epoch": 5.946866485013624, "grad_norm": 0.17939099669456482, "learning_rate": 4.351916833645825e-06, "loss": 0.0105, "step": 8730 }, { "epoch": 5.953678474114441, "grad_norm": 0.19434191286563873, "learning_rate": 4.284697490921691e-06, "loss": 0.0106, "step": 8740 }, { "epoch": 5.960490463215259, "grad_norm": 0.16198799014091492, "learning_rate": 4.2179780834215585e-06, "loss": 0.0127, "step": 8750 }, { "epoch": 5.967302452316076, "grad_norm": 0.22619812190532684, "learning_rate": 4.151759340778178e-06, "loss": 0.0117, "step": 8760 }, { "epoch": 5.974114441416893, "grad_norm": 0.2598056495189667, "learning_rate": 4.086041987149109e-06, "loss": 0.01, "step": 8770 }, { "epoch": 5.9809264305177114, "grad_norm": 0.18251881003379822, "learning_rate": 4.020826741208811e-06, "loss": 0.0102, "step": 8780 }, { "epoch": 5.987738419618529, "grad_norm": 0.18505583703517914, "learning_rate": 3.956114316140746e-06, "loss": 0.0121, "step": 8790 }, { "epoch": 5.994550408719346, "grad_norm": 0.14361293613910675, "learning_rate": 3.891905419629643e-06, "loss": 0.0099, "step": 8800 }, { "epoch": 6.001362397820164, "grad_norm": 0.10514985024929047, "learning_rate": 3.8282007538536946e-06, "loss": 0.0127, "step": 8810 }, { "epoch": 6.008174386920981, "grad_norm": 0.16004830598831177, "learning_rate": 3.7650010154769265e-06, "loss": 0.0089, "step": 8820 }, { "epoch": 6.014986376021798, "grad_norm": 0.18699565529823303, "learning_rate": 3.7023068956415608e-06, "loss": 0.0123, "step": 8830 }, { "epoch": 6.021798365122616, "grad_norm": 0.17017434537410736, "learning_rate": 3.6401190799604303e-06, "loss": 0.0084, "step": 8840 }, { "epoch": 6.028610354223433, "grad_norm": 0.18797238171100616, "learning_rate": 3.578438248509536e-06, "loss": 0.012, "step": 8850 }, { "epoch": 6.03542234332425, "grad_norm": 0.16716784238815308, "learning_rate": 3.5172650758205583e-06, "loss": 0.012, "step": 8860 }, { "epoch": 6.0422343324250685, "grad_norm": 0.10475629568099976, "learning_rate": 3.45660023087353e-06, "loss": 0.008, "step": 8870 }, { "epoch": 6.049046321525886, "grad_norm": 0.12020768970251083, "learning_rate": 3.3964443770894528e-06, "loss": 0.0087, "step": 8880 }, { "epoch": 6.055858310626703, "grad_norm": 0.10397229343652725, "learning_rate": 3.3367981723231245e-06, "loss": 0.0091, "step": 8890 }, { "epoch": 6.062670299727521, "grad_norm": 0.20012831687927246, "learning_rate": 3.2776622688558746e-06, "loss": 0.0099, "step": 8900 }, { "epoch": 6.069482288828338, "grad_norm": 0.19983907043933868, "learning_rate": 3.2190373133884677e-06, "loss": 0.0102, "step": 8910 }, { "epoch": 6.076294277929155, "grad_norm": 0.17271621525287628, "learning_rate": 3.1609239470340446e-06, "loss": 0.0104, "step": 8920 }, { "epoch": 6.083106267029973, "grad_norm": 0.16302776336669922, "learning_rate": 3.1033228053110373e-06, "loss": 0.0078, "step": 8930 }, { "epoch": 6.08991825613079, "grad_norm": 0.12263508886098862, "learning_rate": 3.0462345181363314e-06, "loss": 0.009, "step": 8940 }, { "epoch": 6.0967302452316074, "grad_norm": 0.11456681787967682, "learning_rate": 2.9896597098182654e-06, "loss": 0.0109, "step": 8950 }, { "epoch": 6.1035422343324255, "grad_norm": 0.08905057609081268, "learning_rate": 2.933598999049891e-06, "loss": 0.0112, "step": 8960 }, { "epoch": 6.110354223433243, "grad_norm": 0.15491244196891785, "learning_rate": 2.8780529989021697e-06, "loss": 0.0095, "step": 8970 }, { "epoch": 6.11716621253406, "grad_norm": 0.15372540056705475, "learning_rate": 2.823022316817242e-06, "loss": 0.0124, "step": 8980 }, { "epoch": 6.123978201634877, "grad_norm": 0.20342043042182922, "learning_rate": 2.7685075546018456e-06, "loss": 0.0123, "step": 8990 }, { "epoch": 6.130790190735695, "grad_norm": 0.12789203226566315, "learning_rate": 2.7145093084206598e-06, "loss": 0.0108, "step": 9000 }, { "epoch": 6.137602179836512, "grad_norm": 0.19718892872333527, "learning_rate": 2.661028168789892e-06, "loss": 0.0094, "step": 9010 }, { "epoch": 6.144414168937329, "grad_norm": 0.2571142911911011, "learning_rate": 2.6080647205706855e-06, "loss": 0.0091, "step": 9020 }, { "epoch": 6.151226158038147, "grad_norm": 0.08045794069766998, "learning_rate": 2.555619542962834e-06, "loss": 0.0101, "step": 9030 }, { "epoch": 6.1580381471389645, "grad_norm": 0.10921412706375122, "learning_rate": 2.503693209498409e-06, "loss": 0.0064, "step": 9040 }, { "epoch": 6.164850136239782, "grad_norm": 0.14346344769001007, "learning_rate": 2.452286288035449e-06, "loss": 0.0091, "step": 9050 }, { "epoch": 6.1716621253406, "grad_norm": 0.12146768718957901, "learning_rate": 2.4013993407518363e-06, "loss": 0.0127, "step": 9060 }, { "epoch": 6.178474114441417, "grad_norm": 0.11175204068422318, "learning_rate": 2.351032924139063e-06, "loss": 0.0076, "step": 9070 }, { "epoch": 6.185286103542234, "grad_norm": 0.1668560802936554, "learning_rate": 2.30118758899619e-06, "loss": 0.0112, "step": 9080 }, { "epoch": 6.192098092643052, "grad_norm": 0.15498773753643036, "learning_rate": 2.2518638804238157e-06, "loss": 0.0084, "step": 9090 }, { "epoch": 6.198910081743869, "grad_norm": 0.2582722306251526, "learning_rate": 2.203062337818118e-06, "loss": 0.0107, "step": 9100 }, { "epoch": 6.205722070844686, "grad_norm": 0.14667384326457977, "learning_rate": 2.1547834948649483e-06, "loss": 0.0106, "step": 9110 }, { "epoch": 6.212534059945504, "grad_norm": 0.08730646222829819, "learning_rate": 2.1070278795340017e-06, "loss": 0.011, "step": 9120 }, { "epoch": 6.2193460490463215, "grad_norm": 0.19518472254276276, "learning_rate": 2.059796014073029e-06, "loss": 0.0078, "step": 9130 }, { "epoch": 6.226158038147139, "grad_norm": 0.09343539923429489, "learning_rate": 2.01308841500214e-06, "loss": 0.0079, "step": 9140 }, { "epoch": 6.232970027247957, "grad_norm": 0.2299136221408844, "learning_rate": 1.9669055931081704e-06, "loss": 0.0122, "step": 9150 }, { "epoch": 6.239782016348774, "grad_norm": 0.16625770926475525, "learning_rate": 1.9212480534390507e-06, "loss": 0.0097, "step": 9160 }, { "epoch": 6.246594005449591, "grad_norm": 0.18141430616378784, "learning_rate": 1.8761162952983246e-06, "loss": 0.011, "step": 9170 }, { "epoch": 6.253405994550409, "grad_norm": 0.22686415910720825, "learning_rate": 1.8315108122396618e-06, "loss": 0.0114, "step": 9180 }, { "epoch": 6.260217983651226, "grad_norm": 0.14493921399116516, "learning_rate": 1.787432092061475e-06, "loss": 0.0085, "step": 9190 }, { "epoch": 6.267029972752043, "grad_norm": 0.12274694442749023, "learning_rate": 1.743880616801602e-06, "loss": 0.0113, "step": 9200 }, { "epoch": 6.273841961852861, "grad_norm": 0.10201839357614517, "learning_rate": 1.7008568627319865e-06, "loss": 0.0096, "step": 9210 }, { "epoch": 6.2806539509536785, "grad_norm": 0.20207750797271729, "learning_rate": 1.6583613003535226e-06, "loss": 0.0121, "step": 9220 }, { "epoch": 6.287465940054496, "grad_norm": 0.13486947119235992, "learning_rate": 1.6163943943908522e-06, "loss": 0.0123, "step": 9230 }, { "epoch": 6.294277929155314, "grad_norm": 0.14914485812187195, "learning_rate": 1.5749566037873476e-06, "loss": 0.01, "step": 9240 }, { "epoch": 6.301089918256131, "grad_norm": 0.1396232694387436, "learning_rate": 1.5340483817000428e-06, "loss": 0.0113, "step": 9250 }, { "epoch": 6.307901907356948, "grad_norm": 0.11976684629917145, "learning_rate": 1.4936701754947101e-06, "loss": 0.0096, "step": 9260 }, { "epoch": 6.314713896457766, "grad_norm": 0.14177760481834412, "learning_rate": 1.4538224267409361e-06, "loss": 0.0116, "step": 9270 }, { "epoch": 6.321525885558583, "grad_norm": 0.15875473618507385, "learning_rate": 1.414505571207314e-06, "loss": 0.0076, "step": 9280 }, { "epoch": 6.3283378746594, "grad_norm": 0.10427635163068771, "learning_rate": 1.3757200388566816e-06, "loss": 0.0077, "step": 9290 }, { "epoch": 6.335149863760218, "grad_norm": 0.11724657565355301, "learning_rate": 1.3374662538414074e-06, "loss": 0.0123, "step": 9300 }, { "epoch": 6.3419618528610355, "grad_norm": 0.08624394983053207, "learning_rate": 1.2997446344987617e-06, "loss": 0.0099, "step": 9310 }, { "epoch": 6.348773841961853, "grad_norm": 0.11943169683218002, "learning_rate": 1.262555593346315e-06, "loss": 0.0099, "step": 9320 }, { "epoch": 6.355585831062671, "grad_norm": 0.22859704494476318, "learning_rate": 1.2258995370774685e-06, "loss": 0.0116, "step": 9330 }, { "epoch": 6.362397820163488, "grad_norm": 0.20983096957206726, "learning_rate": 1.1897768665569798e-06, "loss": 0.0117, "step": 9340 }, { "epoch": 6.369209809264305, "grad_norm": 0.13772162795066833, "learning_rate": 1.1541879768165954e-06, "loss": 0.0092, "step": 9350 }, { "epoch": 6.376021798365123, "grad_norm": 0.15202628076076508, "learning_rate": 1.1191332570507085e-06, "loss": 0.0098, "step": 9360 }, { "epoch": 6.38283378746594, "grad_norm": 0.18177203834056854, "learning_rate": 1.0846130906121132e-06, "loss": 0.0164, "step": 9370 }, { "epoch": 6.389645776566757, "grad_norm": 0.17858490347862244, "learning_rate": 1.0506278550078131e-06, "loss": 0.0103, "step": 9380 }, { "epoch": 6.396457765667575, "grad_norm": 0.18811877071857452, "learning_rate": 1.0171779218949185e-06, "loss": 0.0125, "step": 9390 }, { "epoch": 6.4032697547683926, "grad_norm": 0.1804962009191513, "learning_rate": 9.842636570765174e-07, "loss": 0.0097, "step": 9400 }, { "epoch": 6.41008174386921, "grad_norm": 0.20443765819072723, "learning_rate": 9.518854204977612e-07, "loss": 0.01, "step": 9410 }, { "epoch": 6.416893732970027, "grad_norm": 0.11135527491569519, "learning_rate": 9.200435662418349e-07, "loss": 0.0101, "step": 9420 }, { "epoch": 6.423705722070845, "grad_norm": 0.10986144840717316, "learning_rate": 8.887384425261658e-07, "loss": 0.008, "step": 9430 }, { "epoch": 6.430517711171662, "grad_norm": 0.15490956604480743, "learning_rate": 8.579703916985648e-07, "loss": 0.0094, "step": 9440 }, { "epoch": 6.437329700272479, "grad_norm": 0.12304934859275818, "learning_rate": 8.277397502335194e-07, "loss": 0.0134, "step": 9450 }, { "epoch": 6.444141689373297, "grad_norm": 0.15748490393161774, "learning_rate": 7.980468487284675e-07, "loss": 0.0104, "step": 9460 }, { "epoch": 6.450953678474114, "grad_norm": 0.15610432624816895, "learning_rate": 7.688920119002297e-07, "loss": 0.0089, "step": 9470 }, { "epoch": 6.4577656675749315, "grad_norm": 0.1030815839767456, "learning_rate": 7.402755585814269e-07, "loss": 0.0099, "step": 9480 }, { "epoch": 6.46457765667575, "grad_norm": 0.20818915963172913, "learning_rate": 7.121978017170073e-07, "loss": 0.0115, "step": 9490 }, { "epoch": 6.471389645776567, "grad_norm": 0.1520918905735016, "learning_rate": 6.846590483608306e-07, "loss": 0.0084, "step": 9500 }, { "epoch": 6.478201634877384, "grad_norm": 0.13606111705303192, "learning_rate": 6.576595996722834e-07, "loss": 0.0159, "step": 9510 }, { "epoch": 6.485013623978202, "grad_norm": 0.1213141530752182, "learning_rate": 6.311997509130141e-07, "loss": 0.0093, "step": 9520 }, { "epoch": 6.491825613079019, "grad_norm": 0.18930743634700775, "learning_rate": 6.052797914436803e-07, "loss": 0.0114, "step": 9530 }, { "epoch": 6.498637602179836, "grad_norm": 0.2151637226343155, "learning_rate": 5.799000047208181e-07, "loss": 0.0133, "step": 9540 }, { "epoch": 6.505449591280654, "grad_norm": 0.16114141047000885, "learning_rate": 5.550606682937054e-07, "loss": 0.0115, "step": 9550 }, { "epoch": 6.512261580381471, "grad_norm": 0.1699608713388443, "learning_rate": 5.307620538013481e-07, "loss": 0.0114, "step": 9560 }, { "epoch": 6.5190735694822886, "grad_norm": 0.15840139985084534, "learning_rate": 5.070044269694874e-07, "loss": 0.0101, "step": 9570 }, { "epoch": 6.525885558583107, "grad_norm": 0.22831596434116364, "learning_rate": 4.837880476077417e-07, "loss": 0.0106, "step": 9580 }, { "epoch": 6.532697547683924, "grad_norm": 0.118828684091568, "learning_rate": 4.6111316960670835e-07, "loss": 0.0098, "step": 9590 }, { "epoch": 6.539509536784741, "grad_norm": 0.1655462384223938, "learning_rate": 4.389800409352218e-07, "loss": 0.0082, "step": 9600 }, { "epoch": 6.546321525885559, "grad_norm": 0.1253342479467392, "learning_rate": 4.173889036376277e-07, "loss": 0.0111, "step": 9610 }, { "epoch": 6.553133514986376, "grad_norm": 0.15380145609378815, "learning_rate": 3.963399938311463e-07, "loss": 0.0115, "step": 9620 }, { "epoch": 6.559945504087193, "grad_norm": 0.13774822652339935, "learning_rate": 3.7583354170328545e-07, "loss": 0.012, "step": 9630 }, { "epoch": 6.566757493188011, "grad_norm": 0.08887213468551636, "learning_rate": 3.558697715093207e-07, "loss": 0.0084, "step": 9640 }, { "epoch": 6.573569482288828, "grad_norm": 0.2804868817329407, "learning_rate": 3.3644890156983576e-07, "loss": 0.0109, "step": 9650 }, { "epoch": 6.580381471389646, "grad_norm": 0.12525686621665955, "learning_rate": 3.175711442683638e-07, "loss": 0.0084, "step": 9660 }, { "epoch": 6.587193460490463, "grad_norm": 0.12775982916355133, "learning_rate": 2.9923670604902197e-07, "loss": 0.0097, "step": 9670 }, { "epoch": 6.594005449591281, "grad_norm": 0.22419363260269165, "learning_rate": 2.814457874143028e-07, "loss": 0.0122, "step": 9680 }, { "epoch": 6.600817438692098, "grad_norm": 0.16230632364749908, "learning_rate": 2.641985829228366e-07, "loss": 0.0102, "step": 9690 }, { "epoch": 6.607629427792915, "grad_norm": 0.15815846621990204, "learning_rate": 2.474952811872877e-07, "loss": 0.0092, "step": 9700 }, { "epoch": 6.614441416893733, "grad_norm": 0.13755181431770325, "learning_rate": 2.3133606487228397e-07, "loss": 0.0116, "step": 9710 }, { "epoch": 6.62125340599455, "grad_norm": 0.09371072053909302, "learning_rate": 2.157211106924295e-07, "loss": 0.0087, "step": 9720 }, { "epoch": 6.628065395095367, "grad_norm": 0.1671672761440277, "learning_rate": 2.006505894103672e-07, "loss": 0.0107, "step": 9730 }, { "epoch": 6.6348773841961854, "grad_norm": 0.1295129358768463, "learning_rate": 1.8612466583489696e-07, "loss": 0.0098, "step": 9740 }, { "epoch": 6.641689373297003, "grad_norm": 0.2207920253276825, "learning_rate": 1.7214349881918834e-07, "loss": 0.0097, "step": 9750 }, { "epoch": 6.64850136239782, "grad_norm": 0.130056232213974, "learning_rate": 1.5870724125904845e-07, "loss": 0.0081, "step": 9760 }, { "epoch": 6.655313351498638, "grad_norm": 0.12633217871189117, "learning_rate": 1.4581604009124006e-07, "loss": 0.0096, "step": 9770 }, { "epoch": 6.662125340599455, "grad_norm": 0.16835469007492065, "learning_rate": 1.334700362918717e-07, "loss": 0.0091, "step": 9780 }, { "epoch": 6.668937329700272, "grad_norm": 0.26601773500442505, "learning_rate": 1.2166936487486015e-07, "loss": 0.0104, "step": 9790 }, { "epoch": 6.67574931880109, "grad_norm": 0.15718552470207214, "learning_rate": 1.1041415489045914e-07, "loss": 0.0089, "step": 9800 }, { "epoch": 6.682561307901907, "grad_norm": 0.14041031897068024, "learning_rate": 9.970452942384412e-08, "loss": 0.0104, "step": 9810 }, { "epoch": 6.689373297002724, "grad_norm": 0.10807531327009201, "learning_rate": 8.954060559375754e-08, "loss": 0.0087, "step": 9820 }, { "epoch": 6.6961852861035425, "grad_norm": 0.13568098843097687, "learning_rate": 7.99224945512489e-08, "loss": 0.0159, "step": 9830 }, { "epoch": 6.70299727520436, "grad_norm": 0.306471049785614, "learning_rate": 7.085030147843675e-08, "loss": 0.0124, "step": 9840 }, { "epoch": 6.709809264305177, "grad_norm": 0.14044924080371857, "learning_rate": 6.232412558736523e-08, "loss": 0.0117, "step": 9850 }, { "epoch": 6.716621253405995, "grad_norm": 0.14973674714565277, "learning_rate": 5.434406011893822e-08, "loss": 0.0139, "step": 9860 }, { "epoch": 6.723433242506812, "grad_norm": 0.10210314393043518, "learning_rate": 4.6910192341864664e-08, "loss": 0.0078, "step": 9870 }, { "epoch": 6.730245231607629, "grad_norm": 0.15292491018772125, "learning_rate": 4.0022603551737035e-08, "loss": 0.0099, "step": 9880 }, { "epoch": 6.737057220708447, "grad_norm": 0.17868728935718536, "learning_rate": 3.3681369070120985e-08, "loss": 0.012, "step": 9890 }, { "epoch": 6.743869209809264, "grad_norm": 0.17693090438842773, "learning_rate": 2.7886558243744866e-08, "loss": 0.0112, "step": 9900 }, { "epoch": 6.7506811989100814, "grad_norm": 0.1320875883102417, "learning_rate": 2.2638234443722596e-08, "loss": 0.0096, "step": 9910 }, { "epoch": 6.7574931880108995, "grad_norm": 0.1211492195725441, "learning_rate": 1.7936455064887504e-08, "loss": 0.013, "step": 9920 }, { "epoch": 6.764305177111717, "grad_norm": 0.1284903734922409, "learning_rate": 1.378127152514841e-08, "loss": 0.0066, "step": 9930 }, { "epoch": 6.771117166212534, "grad_norm": 0.12337515503168106, "learning_rate": 1.0172729264917857e-08, "loss": 0.0118, "step": 9940 }, { "epoch": 6.777929155313352, "grad_norm": 0.15872040390968323, "learning_rate": 7.1108677466458215e-09, "loss": 0.0107, "step": 9950 }, { "epoch": 6.784741144414169, "grad_norm": 0.13814593851566315, "learning_rate": 4.595720454353414e-09, "loss": 0.0119, "step": 9960 }, { "epoch": 6.791553133514986, "grad_norm": 0.16548724472522736, "learning_rate": 2.627314893294264e-09, "loss": 0.008, "step": 9970 }, { "epoch": 6.798365122615804, "grad_norm": 0.16446246206760406, "learning_rate": 1.2056725896270048e-09, "loss": 0.0097, "step": 9980 }, { "epoch": 6.805177111716621, "grad_norm": 0.1332317590713501, "learning_rate": 3.308090902098826e-10, "loss": 0.0099, "step": 9990 }, { "epoch": 6.8119891008174385, "grad_norm": 0.15192656219005585, "learning_rate": 2.7339624120159555e-12, "loss": 0.01, "step": 10000 }, { "epoch": 6.8119891008174385, "step": 10000, "total_flos": 0.0, "train_loss": 0.0075618208244442936, "train_runtime": 3810.2859, "train_samples_per_second": 83.983, "train_steps_per_second": 2.624 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }