{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 18.21668264621285, "eval_steps": 500, "global_step": 19000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009587727708533078, "grad_norm": 27.445323944091797, "learning_rate": 9.473684210526317e-07, "loss": 2.1709, "step": 10 }, { "epoch": 0.019175455417066157, "grad_norm": 19.005075454711914, "learning_rate": 2.0000000000000003e-06, "loss": 1.8704, "step": 20 }, { "epoch": 0.028763183125599234, "grad_norm": 14.785849571228027, "learning_rate": 3.0526315789473684e-06, "loss": 1.6318, "step": 30 }, { "epoch": 0.038350910834132314, "grad_norm": 4.634030342102051, "learning_rate": 4.105263157894737e-06, "loss": 0.8641, "step": 40 }, { "epoch": 0.04793863854266539, "grad_norm": 2.2945172786712646, "learning_rate": 5.1578947368421055e-06, "loss": 0.5394, "step": 50 }, { "epoch": 0.05752636625119847, "grad_norm": 1.7087739706039429, "learning_rate": 6.2105263157894745e-06, "loss": 0.4525, "step": 60 }, { "epoch": 0.06711409395973154, "grad_norm": 1.1094379425048828, "learning_rate": 7.2631578947368426e-06, "loss": 0.3063, "step": 70 }, { "epoch": 0.07670182166826463, "grad_norm": 1.5301676988601685, "learning_rate": 8.315789473684212e-06, "loss": 0.3153, "step": 80 }, { "epoch": 0.0862895493767977, "grad_norm": 1.1719224452972412, "learning_rate": 9.368421052631579e-06, "loss": 0.2466, "step": 90 }, { "epoch": 0.09587727708533078, "grad_norm": 1.751291275024414, "learning_rate": 1.0421052631578948e-05, "loss": 0.27, "step": 100 }, { "epoch": 0.10546500479386385, "grad_norm": 1.0524818897247314, "learning_rate": 1.1473684210526315e-05, "loss": 0.2333, "step": 110 }, { "epoch": 0.11505273250239693, "grad_norm": 1.508988857269287, "learning_rate": 1.2526315789473686e-05, "loss": 0.2399, "step": 120 }, { "epoch": 0.12464046021093, "grad_norm": 1.3286081552505493, "learning_rate": 1.3578947368421053e-05, "loss": 0.1962, "step": 130 }, { "epoch": 0.1342281879194631, "grad_norm": 1.7412567138671875, "learning_rate": 1.4631578947368422e-05, "loss": 0.2004, "step": 140 }, { "epoch": 0.14381591562799617, "grad_norm": 1.8567883968353271, "learning_rate": 1.568421052631579e-05, "loss": 0.174, "step": 150 }, { "epoch": 0.15340364333652926, "grad_norm": 1.5139102935791016, "learning_rate": 1.673684210526316e-05, "loss": 0.1765, "step": 160 }, { "epoch": 0.1629913710450623, "grad_norm": 1.6859902143478394, "learning_rate": 1.7789473684210527e-05, "loss": 0.168, "step": 170 }, { "epoch": 0.1725790987535954, "grad_norm": 1.8252370357513428, "learning_rate": 1.8842105263157894e-05, "loss": 0.1645, "step": 180 }, { "epoch": 0.18216682646212848, "grad_norm": 1.2732850313186646, "learning_rate": 1.9894736842105265e-05, "loss": 0.1554, "step": 190 }, { "epoch": 0.19175455417066156, "grad_norm": 1.0456390380859375, "learning_rate": 2.0947368421052632e-05, "loss": 0.1575, "step": 200 }, { "epoch": 0.20134228187919462, "grad_norm": 0.7651330828666687, "learning_rate": 2.2000000000000003e-05, "loss": 0.163, "step": 210 }, { "epoch": 0.2109300095877277, "grad_norm": 0.9984806776046753, "learning_rate": 2.305263157894737e-05, "loss": 0.1508, "step": 220 }, { "epoch": 0.22051773729626079, "grad_norm": 1.0750813484191895, "learning_rate": 2.410526315789474e-05, "loss": 0.1349, "step": 230 }, { "epoch": 0.23010546500479387, "grad_norm": 1.7777466773986816, "learning_rate": 2.5157894736842108e-05, "loss": 0.1448, "step": 240 }, { "epoch": 0.23969319271332695, "grad_norm": 1.3516716957092285, "learning_rate": 2.6210526315789475e-05, "loss": 0.1427, "step": 250 }, { "epoch": 0.24928092042186, "grad_norm": 1.1810095310211182, "learning_rate": 2.7263157894736846e-05, "loss": 0.1385, "step": 260 }, { "epoch": 0.2588686481303931, "grad_norm": 1.6512832641601562, "learning_rate": 2.8315789473684213e-05, "loss": 0.155, "step": 270 }, { "epoch": 0.2684563758389262, "grad_norm": 1.2209525108337402, "learning_rate": 2.9368421052631577e-05, "loss": 0.1378, "step": 280 }, { "epoch": 0.27804410354745923, "grad_norm": 1.0236748456954956, "learning_rate": 3.042105263157895e-05, "loss": 0.1409, "step": 290 }, { "epoch": 0.28763183125599234, "grad_norm": 1.065836787223816, "learning_rate": 3.147368421052632e-05, "loss": 0.1409, "step": 300 }, { "epoch": 0.2972195589645254, "grad_norm": 1.0454283952713013, "learning_rate": 3.2526315789473686e-05, "loss": 0.1333, "step": 310 }, { "epoch": 0.3068072866730585, "grad_norm": 0.5515532493591309, "learning_rate": 3.357894736842105e-05, "loss": 0.1137, "step": 320 }, { "epoch": 0.31639501438159157, "grad_norm": 1.323104977607727, "learning_rate": 3.463157894736842e-05, "loss": 0.1317, "step": 330 }, { "epoch": 0.3259827420901246, "grad_norm": 1.5426658391952515, "learning_rate": 3.5684210526315794e-05, "loss": 0.1174, "step": 340 }, { "epoch": 0.33557046979865773, "grad_norm": 0.9131991863250732, "learning_rate": 3.673684210526316e-05, "loss": 0.1171, "step": 350 }, { "epoch": 0.3451581975071908, "grad_norm": 1.0024508237838745, "learning_rate": 3.778947368421053e-05, "loss": 0.1162, "step": 360 }, { "epoch": 0.3547459252157239, "grad_norm": 1.1091963052749634, "learning_rate": 3.8842105263157896e-05, "loss": 0.1272, "step": 370 }, { "epoch": 0.36433365292425696, "grad_norm": 0.9772627949714661, "learning_rate": 3.989473684210526e-05, "loss": 0.1059, "step": 380 }, { "epoch": 0.37392138063279, "grad_norm": 0.92393958568573, "learning_rate": 4.094736842105264e-05, "loss": 0.113, "step": 390 }, { "epoch": 0.3835091083413231, "grad_norm": 0.9960997700691223, "learning_rate": 4.2e-05, "loss": 0.1077, "step": 400 }, { "epoch": 0.3930968360498562, "grad_norm": 1.0618188381195068, "learning_rate": 4.305263157894737e-05, "loss": 0.1084, "step": 410 }, { "epoch": 0.40268456375838924, "grad_norm": 0.7491030693054199, "learning_rate": 4.410526315789474e-05, "loss": 0.1021, "step": 420 }, { "epoch": 0.41227229146692235, "grad_norm": 0.9327500462532043, "learning_rate": 4.515789473684211e-05, "loss": 0.0984, "step": 430 }, { "epoch": 0.4218600191754554, "grad_norm": 0.7720574140548706, "learning_rate": 4.6210526315789473e-05, "loss": 0.0971, "step": 440 }, { "epoch": 0.4314477468839885, "grad_norm": 1.2057392597198486, "learning_rate": 4.726315789473684e-05, "loss": 0.1088, "step": 450 }, { "epoch": 0.44103547459252157, "grad_norm": 1.1223393678665161, "learning_rate": 4.8315789473684215e-05, "loss": 0.0992, "step": 460 }, { "epoch": 0.4506232023010546, "grad_norm": 0.6742480397224426, "learning_rate": 4.936842105263158e-05, "loss": 0.0963, "step": 470 }, { "epoch": 0.46021093000958774, "grad_norm": 1.0714161396026611, "learning_rate": 5.042105263157895e-05, "loss": 0.0974, "step": 480 }, { "epoch": 0.4697986577181208, "grad_norm": 0.7936097383499146, "learning_rate": 5.1473684210526317e-05, "loss": 0.1022, "step": 490 }, { "epoch": 0.4793863854266539, "grad_norm": 1.4822968244552612, "learning_rate": 5.252631578947369e-05, "loss": 0.0996, "step": 500 }, { "epoch": 0.48897411313518696, "grad_norm": 1.0476019382476807, "learning_rate": 5.357894736842105e-05, "loss": 0.1018, "step": 510 }, { "epoch": 0.49856184084372, "grad_norm": 0.9343310594558716, "learning_rate": 5.4631578947368425e-05, "loss": 0.102, "step": 520 }, { "epoch": 0.5081495685522531, "grad_norm": 0.8918314576148987, "learning_rate": 5.568421052631579e-05, "loss": 0.0986, "step": 530 }, { "epoch": 0.5177372962607862, "grad_norm": 1.155029296875, "learning_rate": 5.6736842105263166e-05, "loss": 0.1031, "step": 540 }, { "epoch": 0.5273250239693192, "grad_norm": 0.625169038772583, "learning_rate": 5.778947368421053e-05, "loss": 0.0907, "step": 550 }, { "epoch": 0.5369127516778524, "grad_norm": 1.0989243984222412, "learning_rate": 5.88421052631579e-05, "loss": 0.0843, "step": 560 }, { "epoch": 0.5465004793863855, "grad_norm": 0.8834158778190613, "learning_rate": 5.989473684210527e-05, "loss": 0.0777, "step": 570 }, { "epoch": 0.5560882070949185, "grad_norm": 0.7638639211654663, "learning_rate": 6.094736842105263e-05, "loss": 0.0781, "step": 580 }, { "epoch": 0.5656759348034516, "grad_norm": 1.2423137426376343, "learning_rate": 6.2e-05, "loss": 0.0886, "step": 590 }, { "epoch": 0.5752636625119847, "grad_norm": 1.082046627998352, "learning_rate": 6.305263157894738e-05, "loss": 0.0921, "step": 600 }, { "epoch": 0.5848513902205177, "grad_norm": 0.8878996968269348, "learning_rate": 6.410526315789474e-05, "loss": 0.0926, "step": 610 }, { "epoch": 0.5944391179290508, "grad_norm": 0.80406653881073, "learning_rate": 6.515789473684211e-05, "loss": 0.0983, "step": 620 }, { "epoch": 0.6040268456375839, "grad_norm": 0.8726837038993835, "learning_rate": 6.621052631578947e-05, "loss": 0.0833, "step": 630 }, { "epoch": 0.613614573346117, "grad_norm": 0.9104009866714478, "learning_rate": 6.726315789473685e-05, "loss": 0.0884, "step": 640 }, { "epoch": 0.62320230105465, "grad_norm": 0.6089403629302979, "learning_rate": 6.83157894736842e-05, "loss": 0.0835, "step": 650 }, { "epoch": 0.6327900287631831, "grad_norm": 0.8488327860832214, "learning_rate": 6.936842105263158e-05, "loss": 0.0812, "step": 660 }, { "epoch": 0.6423777564717162, "grad_norm": 1.121718168258667, "learning_rate": 7.042105263157895e-05, "loss": 0.0979, "step": 670 }, { "epoch": 0.6519654841802492, "grad_norm": 0.554762065410614, "learning_rate": 7.147368421052631e-05, "loss": 0.0941, "step": 680 }, { "epoch": 0.6615532118887824, "grad_norm": 0.8173949718475342, "learning_rate": 7.252631578947369e-05, "loss": 0.09, "step": 690 }, { "epoch": 0.6711409395973155, "grad_norm": 0.9960802793502808, "learning_rate": 7.357894736842106e-05, "loss": 0.0969, "step": 700 }, { "epoch": 0.6807286673058485, "grad_norm": 0.9952852725982666, "learning_rate": 7.463157894736844e-05, "loss": 0.0927, "step": 710 }, { "epoch": 0.6903163950143816, "grad_norm": 1.1024588346481323, "learning_rate": 7.56842105263158e-05, "loss": 0.0874, "step": 720 }, { "epoch": 0.6999041227229147, "grad_norm": 0.7529568672180176, "learning_rate": 7.673684210526316e-05, "loss": 0.0853, "step": 730 }, { "epoch": 0.7094918504314478, "grad_norm": 0.8373092412948608, "learning_rate": 7.778947368421053e-05, "loss": 0.0783, "step": 740 }, { "epoch": 0.7190795781399808, "grad_norm": 0.6158662438392639, "learning_rate": 7.884210526315789e-05, "loss": 0.0872, "step": 750 }, { "epoch": 0.7286673058485139, "grad_norm": 0.7315576076507568, "learning_rate": 7.989473684210527e-05, "loss": 0.0841, "step": 760 }, { "epoch": 0.738255033557047, "grad_norm": 0.5791612267494202, "learning_rate": 8.094736842105264e-05, "loss": 0.0706, "step": 770 }, { "epoch": 0.74784276126558, "grad_norm": 0.8657413721084595, "learning_rate": 8.2e-05, "loss": 0.0689, "step": 780 }, { "epoch": 0.7574304889741131, "grad_norm": 0.9742875695228577, "learning_rate": 8.305263157894737e-05, "loss": 0.0869, "step": 790 }, { "epoch": 0.7670182166826462, "grad_norm": 0.7406681776046753, "learning_rate": 8.410526315789475e-05, "loss": 0.0869, "step": 800 }, { "epoch": 0.7766059443911792, "grad_norm": 1.168278455734253, "learning_rate": 8.515789473684211e-05, "loss": 0.0803, "step": 810 }, { "epoch": 0.7861936720997124, "grad_norm": 1.1049866676330566, "learning_rate": 8.621052631578947e-05, "loss": 0.0851, "step": 820 }, { "epoch": 0.7957813998082455, "grad_norm": 0.9790105223655701, "learning_rate": 8.726315789473684e-05, "loss": 0.0788, "step": 830 }, { "epoch": 0.8053691275167785, "grad_norm": 0.762137770652771, "learning_rate": 8.831578947368422e-05, "loss": 0.0715, "step": 840 }, { "epoch": 0.8149568552253116, "grad_norm": 0.8730412125587463, "learning_rate": 8.936842105263158e-05, "loss": 0.0898, "step": 850 }, { "epoch": 0.8245445829338447, "grad_norm": 1.1794781684875488, "learning_rate": 9.042105263157895e-05, "loss": 0.0798, "step": 860 }, { "epoch": 0.8341323106423778, "grad_norm": 0.7828540205955505, "learning_rate": 9.147368421052633e-05, "loss": 0.0848, "step": 870 }, { "epoch": 0.8437200383509108, "grad_norm": 0.7496788501739502, "learning_rate": 9.252631578947369e-05, "loss": 0.0836, "step": 880 }, { "epoch": 0.8533077660594439, "grad_norm": 0.7298113703727722, "learning_rate": 9.357894736842106e-05, "loss": 0.0804, "step": 890 }, { "epoch": 0.862895493767977, "grad_norm": 0.7915740609169006, "learning_rate": 9.463157894736842e-05, "loss": 0.0978, "step": 900 }, { "epoch": 0.87248322147651, "grad_norm": 0.6587068438529968, "learning_rate": 9.56842105263158e-05, "loss": 0.0823, "step": 910 }, { "epoch": 0.8820709491850431, "grad_norm": 0.6733153462409973, "learning_rate": 9.673684210526316e-05, "loss": 0.0903, "step": 920 }, { "epoch": 0.8916586768935763, "grad_norm": 0.8253368139266968, "learning_rate": 9.778947368421053e-05, "loss": 0.0817, "step": 930 }, { "epoch": 0.9012464046021093, "grad_norm": 0.631831169128418, "learning_rate": 9.88421052631579e-05, "loss": 0.0692, "step": 940 }, { "epoch": 0.9108341323106424, "grad_norm": 0.4998478293418884, "learning_rate": 9.989473684210526e-05, "loss": 0.08, "step": 950 }, { "epoch": 0.9204218600191755, "grad_norm": 0.5345643162727356, "learning_rate": 9.999993865625701e-05, "loss": 0.0707, "step": 960 }, { "epoch": 0.9300095877277086, "grad_norm": 0.496713250875473, "learning_rate": 9.999972660400536e-05, "loss": 0.0759, "step": 970 }, { "epoch": 0.9395973154362416, "grad_norm": 0.4693014621734619, "learning_rate": 9.999936308655709e-05, "loss": 0.0781, "step": 980 }, { "epoch": 0.9491850431447747, "grad_norm": 0.5775050520896912, "learning_rate": 9.999884810501344e-05, "loss": 0.0748, "step": 990 }, { "epoch": 0.9587727708533078, "grad_norm": 0.7837674021720886, "learning_rate": 9.999818166093444e-05, "loss": 0.0783, "step": 1000 }, { "epoch": 0.9683604985618408, "grad_norm": 0.6740615367889404, "learning_rate": 9.999736375633896e-05, "loss": 0.0799, "step": 1010 }, { "epoch": 0.9779482262703739, "grad_norm": 0.644281268119812, "learning_rate": 9.999639439370469e-05, "loss": 0.0875, "step": 1020 }, { "epoch": 0.987535953978907, "grad_norm": 0.6877675652503967, "learning_rate": 9.999527357596816e-05, "loss": 0.0702, "step": 1030 }, { "epoch": 0.99712368168744, "grad_norm": 0.8206673860549927, "learning_rate": 9.999400130652465e-05, "loss": 0.0705, "step": 1040 }, { "epoch": 1.0067114093959733, "grad_norm": 0.5425058007240295, "learning_rate": 9.999257758922833e-05, "loss": 0.0773, "step": 1050 }, { "epoch": 1.0162991371045063, "grad_norm": 0.7658944725990295, "learning_rate": 9.999100242839203e-05, "loss": 0.0777, "step": 1060 }, { "epoch": 1.0258868648130393, "grad_norm": 0.73934006690979, "learning_rate": 9.998927582878747e-05, "loss": 0.0685, "step": 1070 }, { "epoch": 1.0354745925215725, "grad_norm": 0.38501349091529846, "learning_rate": 9.998739779564506e-05, "loss": 0.069, "step": 1080 }, { "epoch": 1.0450623202301055, "grad_norm": 0.45449578762054443, "learning_rate": 9.998536833465394e-05, "loss": 0.0559, "step": 1090 }, { "epoch": 1.0546500479386385, "grad_norm": 0.8127736449241638, "learning_rate": 9.998318745196203e-05, "loss": 0.068, "step": 1100 }, { "epoch": 1.0642377756471717, "grad_norm": 0.6800121068954468, "learning_rate": 9.998085515417588e-05, "loss": 0.0683, "step": 1110 }, { "epoch": 1.0738255033557047, "grad_norm": 0.688755214214325, "learning_rate": 9.997837144836082e-05, "loss": 0.0619, "step": 1120 }, { "epoch": 1.0834132310642377, "grad_norm": 0.6529737710952759, "learning_rate": 9.997573634204074e-05, "loss": 0.0716, "step": 1130 }, { "epoch": 1.093000958772771, "grad_norm": 0.773915708065033, "learning_rate": 9.997294984319827e-05, "loss": 0.0667, "step": 1140 }, { "epoch": 1.102588686481304, "grad_norm": 0.611422061920166, "learning_rate": 9.997001196027457e-05, "loss": 0.0695, "step": 1150 }, { "epoch": 1.112176414189837, "grad_norm": 0.6238502264022827, "learning_rate": 9.996692270216947e-05, "loss": 0.0632, "step": 1160 }, { "epoch": 1.1217641418983701, "grad_norm": 0.6252961158752441, "learning_rate": 9.996368207824128e-05, "loss": 0.0708, "step": 1170 }, { "epoch": 1.1313518696069031, "grad_norm": 0.3486538529396057, "learning_rate": 9.996029009830689e-05, "loss": 0.0662, "step": 1180 }, { "epoch": 1.1409395973154361, "grad_norm": 0.40418991446495056, "learning_rate": 9.995674677264173e-05, "loss": 0.0591, "step": 1190 }, { "epoch": 1.1505273250239694, "grad_norm": 0.4740557074546814, "learning_rate": 9.995305211197965e-05, "loss": 0.0701, "step": 1200 }, { "epoch": 1.1601150527325024, "grad_norm": 0.713366687297821, "learning_rate": 9.994920612751295e-05, "loss": 0.073, "step": 1210 }, { "epoch": 1.1697027804410354, "grad_norm": 0.6612546443939209, "learning_rate": 9.994520883089238e-05, "loss": 0.0681, "step": 1220 }, { "epoch": 1.1792905081495686, "grad_norm": 0.6933987736701965, "learning_rate": 9.994106023422699e-05, "loss": 0.0655, "step": 1230 }, { "epoch": 1.1888782358581016, "grad_norm": 0.4890410602092743, "learning_rate": 9.993676035008423e-05, "loss": 0.0633, "step": 1240 }, { "epoch": 1.1984659635666346, "grad_norm": 0.5587823987007141, "learning_rate": 9.993230919148985e-05, "loss": 0.0656, "step": 1250 }, { "epoch": 1.2080536912751678, "grad_norm": 0.6635778546333313, "learning_rate": 9.99277067719278e-05, "loss": 0.0603, "step": 1260 }, { "epoch": 1.2176414189837008, "grad_norm": 0.6514385342597961, "learning_rate": 9.99229531053403e-05, "loss": 0.0652, "step": 1270 }, { "epoch": 1.2272291466922338, "grad_norm": 0.5782362818717957, "learning_rate": 9.991804820612773e-05, "loss": 0.0644, "step": 1280 }, { "epoch": 1.236816874400767, "grad_norm": 0.39845097064971924, "learning_rate": 9.99129920891486e-05, "loss": 0.0617, "step": 1290 }, { "epoch": 1.2464046021093, "grad_norm": 0.5628125667572021, "learning_rate": 9.990778476971951e-05, "loss": 0.0613, "step": 1300 }, { "epoch": 1.255992329817833, "grad_norm": 0.4811013340950012, "learning_rate": 9.99024262636151e-05, "loss": 0.0644, "step": 1310 }, { "epoch": 1.2655800575263663, "grad_norm": 0.540348470211029, "learning_rate": 9.989691658706798e-05, "loss": 0.063, "step": 1320 }, { "epoch": 1.2751677852348993, "grad_norm": 0.593609631061554, "learning_rate": 9.989125575676876e-05, "loss": 0.0537, "step": 1330 }, { "epoch": 1.2847555129434325, "grad_norm": 0.4400087296962738, "learning_rate": 9.988544378986591e-05, "loss": 0.0634, "step": 1340 }, { "epoch": 1.2943432406519655, "grad_norm": 0.7038517594337463, "learning_rate": 9.987948070396571e-05, "loss": 0.0564, "step": 1350 }, { "epoch": 1.3039309683604985, "grad_norm": 0.4805976450443268, "learning_rate": 9.987336651713229e-05, "loss": 0.0604, "step": 1360 }, { "epoch": 1.3135186960690317, "grad_norm": 0.5478856563568115, "learning_rate": 9.986710124788745e-05, "loss": 0.0573, "step": 1370 }, { "epoch": 1.3231064237775647, "grad_norm": 0.6592814922332764, "learning_rate": 9.986068491521072e-05, "loss": 0.0604, "step": 1380 }, { "epoch": 1.332694151486098, "grad_norm": 0.7848181128501892, "learning_rate": 9.985411753853921e-05, "loss": 0.055, "step": 1390 }, { "epoch": 1.342281879194631, "grad_norm": 0.40262654423713684, "learning_rate": 9.984739913776765e-05, "loss": 0.0629, "step": 1400 }, { "epoch": 1.351869606903164, "grad_norm": 0.6241422295570374, "learning_rate": 9.984052973324817e-05, "loss": 0.0609, "step": 1410 }, { "epoch": 1.3614573346116972, "grad_norm": 0.7500850558280945, "learning_rate": 9.983350934579046e-05, "loss": 0.0742, "step": 1420 }, { "epoch": 1.3710450623202302, "grad_norm": 0.6990365386009216, "learning_rate": 9.982633799666146e-05, "loss": 0.0605, "step": 1430 }, { "epoch": 1.3806327900287632, "grad_norm": 0.5741100311279297, "learning_rate": 9.981901570758554e-05, "loss": 0.0639, "step": 1440 }, { "epoch": 1.3902205177372964, "grad_norm": 0.6131389141082764, "learning_rate": 9.981154250074422e-05, "loss": 0.0695, "step": 1450 }, { "epoch": 1.3998082454458294, "grad_norm": 0.6654881834983826, "learning_rate": 9.980391839877628e-05, "loss": 0.0755, "step": 1460 }, { "epoch": 1.4093959731543624, "grad_norm": 0.5249256491661072, "learning_rate": 9.979614342477753e-05, "loss": 0.0613, "step": 1470 }, { "epoch": 1.4189837008628956, "grad_norm": 0.5373178124427795, "learning_rate": 9.978821760230086e-05, "loss": 0.072, "step": 1480 }, { "epoch": 1.4285714285714286, "grad_norm": 0.4792821407318115, "learning_rate": 9.978014095535615e-05, "loss": 0.0549, "step": 1490 }, { "epoch": 1.4381591562799616, "grad_norm": 0.5644699931144714, "learning_rate": 9.977191350841016e-05, "loss": 0.065, "step": 1500 }, { "epoch": 1.4477468839884948, "grad_norm": 0.374956339597702, "learning_rate": 9.976353528638642e-05, "loss": 0.0545, "step": 1510 }, { "epoch": 1.4573346116970278, "grad_norm": 0.4185064733028412, "learning_rate": 9.975500631466527e-05, "loss": 0.0619, "step": 1520 }, { "epoch": 1.4669223394055608, "grad_norm": 0.3903638422489166, "learning_rate": 9.974632661908372e-05, "loss": 0.0526, "step": 1530 }, { "epoch": 1.476510067114094, "grad_norm": 0.45104435086250305, "learning_rate": 9.973749622593534e-05, "loss": 0.061, "step": 1540 }, { "epoch": 1.486097794822627, "grad_norm": 0.4152944087982178, "learning_rate": 9.972851516197019e-05, "loss": 0.0635, "step": 1550 }, { "epoch": 1.49568552253116, "grad_norm": 0.5824716091156006, "learning_rate": 9.971938345439484e-05, "loss": 0.0598, "step": 1560 }, { "epoch": 1.5052732502396933, "grad_norm": 0.5598675608634949, "learning_rate": 9.971010113087212e-05, "loss": 0.0529, "step": 1570 }, { "epoch": 1.5148609779482263, "grad_norm": 0.6759763956069946, "learning_rate": 9.970066821952118e-05, "loss": 0.0687, "step": 1580 }, { "epoch": 1.5244487056567593, "grad_norm": 0.4682703912258148, "learning_rate": 9.969108474891732e-05, "loss": 0.0557, "step": 1590 }, { "epoch": 1.5340364333652925, "grad_norm": 0.6091550588607788, "learning_rate": 9.968135074809194e-05, "loss": 0.0628, "step": 1600 }, { "epoch": 1.5436241610738255, "grad_norm": 0.5167152881622314, "learning_rate": 9.96714662465325e-05, "loss": 0.056, "step": 1610 }, { "epoch": 1.5532118887823585, "grad_norm": 0.5612486004829407, "learning_rate": 9.966143127418225e-05, "loss": 0.0565, "step": 1620 }, { "epoch": 1.5627996164908917, "grad_norm": 0.3620167672634125, "learning_rate": 9.965124586144039e-05, "loss": 0.0533, "step": 1630 }, { "epoch": 1.5723873441994247, "grad_norm": 0.6704486012458801, "learning_rate": 9.964091003916179e-05, "loss": 0.0633, "step": 1640 }, { "epoch": 1.5819750719079577, "grad_norm": 0.6531718969345093, "learning_rate": 9.963042383865694e-05, "loss": 0.0665, "step": 1650 }, { "epoch": 1.591562799616491, "grad_norm": 0.5249754786491394, "learning_rate": 9.961978729169192e-05, "loss": 0.0471, "step": 1660 }, { "epoch": 1.601150527325024, "grad_norm": 0.4377578794956207, "learning_rate": 9.960900043048826e-05, "loss": 0.0561, "step": 1670 }, { "epoch": 1.610738255033557, "grad_norm": 0.34821832180023193, "learning_rate": 9.959806328772279e-05, "loss": 0.0575, "step": 1680 }, { "epoch": 1.6203259827420902, "grad_norm": 0.41964197158813477, "learning_rate": 9.958697589652763e-05, "loss": 0.0552, "step": 1690 }, { "epoch": 1.6299137104506232, "grad_norm": 0.5038737058639526, "learning_rate": 9.957573829049004e-05, "loss": 0.0571, "step": 1700 }, { "epoch": 1.6395014381591562, "grad_norm": 0.5568312406539917, "learning_rate": 9.956435050365233e-05, "loss": 0.0535, "step": 1710 }, { "epoch": 1.6490891658676894, "grad_norm": 0.3089469075202942, "learning_rate": 9.955281257051178e-05, "loss": 0.0567, "step": 1720 }, { "epoch": 1.6586768935762224, "grad_norm": 0.5025231838226318, "learning_rate": 9.954112452602045e-05, "loss": 0.0595, "step": 1730 }, { "epoch": 1.6682646212847554, "grad_norm": 0.6473100185394287, "learning_rate": 9.952928640558519e-05, "loss": 0.0583, "step": 1740 }, { "epoch": 1.6778523489932886, "grad_norm": 0.38910412788391113, "learning_rate": 9.951729824506745e-05, "loss": 0.0606, "step": 1750 }, { "epoch": 1.6874400767018218, "grad_norm": 0.5367538332939148, "learning_rate": 9.950516008078325e-05, "loss": 0.0658, "step": 1760 }, { "epoch": 1.6970278044103546, "grad_norm": 0.5526398420333862, "learning_rate": 9.949287194950293e-05, "loss": 0.0554, "step": 1770 }, { "epoch": 1.7066155321188878, "grad_norm": 0.5616441369056702, "learning_rate": 9.948043388845121e-05, "loss": 0.0579, "step": 1780 }, { "epoch": 1.716203259827421, "grad_norm": 0.41163280606269836, "learning_rate": 9.946784593530694e-05, "loss": 0.0612, "step": 1790 }, { "epoch": 1.7257909875359538, "grad_norm": 0.45861759781837463, "learning_rate": 9.945510812820308e-05, "loss": 0.0524, "step": 1800 }, { "epoch": 1.735378715244487, "grad_norm": 0.4847518503665924, "learning_rate": 9.944222050572653e-05, "loss": 0.0545, "step": 1810 }, { "epoch": 1.7449664429530203, "grad_norm": 0.36065423488616943, "learning_rate": 9.942918310691803e-05, "loss": 0.0503, "step": 1820 }, { "epoch": 1.754554170661553, "grad_norm": 0.5361629128456116, "learning_rate": 9.941599597127202e-05, "loss": 0.0582, "step": 1830 }, { "epoch": 1.7641418983700863, "grad_norm": 0.290815532207489, "learning_rate": 9.940265913873657e-05, "loss": 0.0626, "step": 1840 }, { "epoch": 1.7737296260786195, "grad_norm": 0.3743116855621338, "learning_rate": 9.938917264971324e-05, "loss": 0.0577, "step": 1850 }, { "epoch": 1.7833173537871523, "grad_norm": 0.7040207982063293, "learning_rate": 9.937553654505691e-05, "loss": 0.0625, "step": 1860 }, { "epoch": 1.7929050814956855, "grad_norm": 0.4356692135334015, "learning_rate": 9.936175086607572e-05, "loss": 0.0616, "step": 1870 }, { "epoch": 1.8024928092042187, "grad_norm": 0.3443772494792938, "learning_rate": 9.934781565453089e-05, "loss": 0.0573, "step": 1880 }, { "epoch": 1.8120805369127517, "grad_norm": 0.4956841766834259, "learning_rate": 9.933373095263667e-05, "loss": 0.0528, "step": 1890 }, { "epoch": 1.8216682646212847, "grad_norm": 0.5193634629249573, "learning_rate": 9.931949680306012e-05, "loss": 0.0548, "step": 1900 }, { "epoch": 1.831255992329818, "grad_norm": 0.3799174129962921, "learning_rate": 9.930511324892104e-05, "loss": 0.0563, "step": 1910 }, { "epoch": 1.840843720038351, "grad_norm": 0.3923283815383911, "learning_rate": 9.929058033379181e-05, "loss": 0.0595, "step": 1920 }, { "epoch": 1.850431447746884, "grad_norm": 0.47552716732025146, "learning_rate": 9.927589810169733e-05, "loss": 0.0546, "step": 1930 }, { "epoch": 1.8600191754554172, "grad_norm": 0.4305611848831177, "learning_rate": 9.926106659711476e-05, "loss": 0.0523, "step": 1940 }, { "epoch": 1.8696069031639502, "grad_norm": 0.5576485395431519, "learning_rate": 9.924608586497348e-05, "loss": 0.0574, "step": 1950 }, { "epoch": 1.8791946308724832, "grad_norm": 0.31708958745002747, "learning_rate": 9.923095595065494e-05, "loss": 0.0482, "step": 1960 }, { "epoch": 1.8887823585810164, "grad_norm": 0.41617056727409363, "learning_rate": 9.921567689999247e-05, "loss": 0.0584, "step": 1970 }, { "epoch": 1.8983700862895494, "grad_norm": 0.5047758221626282, "learning_rate": 9.920024875927125e-05, "loss": 0.0642, "step": 1980 }, { "epoch": 1.9079578139980824, "grad_norm": 0.4173164367675781, "learning_rate": 9.918467157522805e-05, "loss": 0.0548, "step": 1990 }, { "epoch": 1.9175455417066156, "grad_norm": 0.4640159010887146, "learning_rate": 9.916894539505115e-05, "loss": 0.0499, "step": 2000 }, { "epoch": 1.9271332694151486, "grad_norm": 0.41713109612464905, "learning_rate": 9.915307026638018e-05, "loss": 0.0491, "step": 2010 }, { "epoch": 1.9367209971236816, "grad_norm": 0.392994225025177, "learning_rate": 9.9137046237306e-05, "loss": 0.0522, "step": 2020 }, { "epoch": 1.9463087248322148, "grad_norm": 0.32308030128479004, "learning_rate": 9.912087335637054e-05, "loss": 0.0557, "step": 2030 }, { "epoch": 1.9558964525407478, "grad_norm": 0.406943678855896, "learning_rate": 9.910455167256663e-05, "loss": 0.0523, "step": 2040 }, { "epoch": 1.9654841802492808, "grad_norm": 0.3809382915496826, "learning_rate": 9.908808123533787e-05, "loss": 0.0567, "step": 2050 }, { "epoch": 1.975071907957814, "grad_norm": 0.3431997299194336, "learning_rate": 9.907146209457852e-05, "loss": 0.0456, "step": 2060 }, { "epoch": 1.984659635666347, "grad_norm": 0.37939101457595825, "learning_rate": 9.905469430063325e-05, "loss": 0.0479, "step": 2070 }, { "epoch": 1.99424736337488, "grad_norm": 0.492702841758728, "learning_rate": 9.903777790429714e-05, "loss": 0.048, "step": 2080 }, { "epoch": 2.0038350910834133, "grad_norm": 0.41130146384239197, "learning_rate": 9.90207129568153e-05, "loss": 0.0545, "step": 2090 }, { "epoch": 2.0134228187919465, "grad_norm": 0.5280721187591553, "learning_rate": 9.900349950988297e-05, "loss": 0.0516, "step": 2100 }, { "epoch": 2.0230105465004793, "grad_norm": 0.3090174198150635, "learning_rate": 9.89861376156452e-05, "loss": 0.043, "step": 2110 }, { "epoch": 2.0325982742090125, "grad_norm": 0.35579144954681396, "learning_rate": 9.896862732669671e-05, "loss": 0.0584, "step": 2120 }, { "epoch": 2.0421860019175457, "grad_norm": 0.44842928647994995, "learning_rate": 9.89509686960818e-05, "loss": 0.0523, "step": 2130 }, { "epoch": 2.0517737296260785, "grad_norm": 0.4050745666027069, "learning_rate": 9.893316177729411e-05, "loss": 0.0529, "step": 2140 }, { "epoch": 2.0613614573346117, "grad_norm": 0.2710857093334198, "learning_rate": 9.891520662427651e-05, "loss": 0.0582, "step": 2150 }, { "epoch": 2.070949185043145, "grad_norm": 0.327932745218277, "learning_rate": 9.88971032914209e-05, "loss": 0.056, "step": 2160 }, { "epoch": 2.0805369127516777, "grad_norm": 0.41889169812202454, "learning_rate": 9.887885183356809e-05, "loss": 0.0449, "step": 2170 }, { "epoch": 2.090124640460211, "grad_norm": 0.37824153900146484, "learning_rate": 9.886045230600757e-05, "loss": 0.0478, "step": 2180 }, { "epoch": 2.099712368168744, "grad_norm": 0.4298747479915619, "learning_rate": 9.884190476447746e-05, "loss": 0.0479, "step": 2190 }, { "epoch": 2.109300095877277, "grad_norm": 0.5047415494918823, "learning_rate": 9.882320926516416e-05, "loss": 0.0509, "step": 2200 }, { "epoch": 2.11888782358581, "grad_norm": 0.3802444338798523, "learning_rate": 9.880436586470234e-05, "loss": 0.0469, "step": 2210 }, { "epoch": 2.1284755512943434, "grad_norm": 0.3608779311180115, "learning_rate": 9.87853746201747e-05, "loss": 0.0499, "step": 2220 }, { "epoch": 2.138063279002876, "grad_norm": 0.49108660221099854, "learning_rate": 9.876623558911181e-05, "loss": 0.0494, "step": 2230 }, { "epoch": 2.1476510067114094, "grad_norm": 0.35984379053115845, "learning_rate": 9.874694882949194e-05, "loss": 0.0513, "step": 2240 }, { "epoch": 2.1572387344199426, "grad_norm": 0.6457746624946594, "learning_rate": 9.872751439974084e-05, "loss": 0.0497, "step": 2250 }, { "epoch": 2.1668264621284754, "grad_norm": 0.4572752118110657, "learning_rate": 9.870793235873164e-05, "loss": 0.0497, "step": 2260 }, { "epoch": 2.1764141898370086, "grad_norm": 0.5329883098602295, "learning_rate": 9.868820276578463e-05, "loss": 0.0597, "step": 2270 }, { "epoch": 2.186001917545542, "grad_norm": 0.4147273302078247, "learning_rate": 9.866832568066706e-05, "loss": 0.0537, "step": 2280 }, { "epoch": 2.1955896452540746, "grad_norm": 0.3269449770450592, "learning_rate": 9.864830116359299e-05, "loss": 0.0541, "step": 2290 }, { "epoch": 2.205177372962608, "grad_norm": 0.38033929467201233, "learning_rate": 9.862812927522309e-05, "loss": 0.0493, "step": 2300 }, { "epoch": 2.214765100671141, "grad_norm": 0.39863190054893494, "learning_rate": 9.86078100766645e-05, "loss": 0.0582, "step": 2310 }, { "epoch": 2.224352828379674, "grad_norm": 0.3785865604877472, "learning_rate": 9.858734362947056e-05, "loss": 0.0451, "step": 2320 }, { "epoch": 2.233940556088207, "grad_norm": 0.3535449802875519, "learning_rate": 9.856672999564072e-05, "loss": 0.0569, "step": 2330 }, { "epoch": 2.2435282837967403, "grad_norm": 0.43401646614074707, "learning_rate": 9.854596923762026e-05, "loss": 0.0451, "step": 2340 }, { "epoch": 2.253116011505273, "grad_norm": 0.3438590466976166, "learning_rate": 9.852506141830018e-05, "loss": 0.0527, "step": 2350 }, { "epoch": 2.2627037392138063, "grad_norm": 0.524154543876648, "learning_rate": 9.850400660101698e-05, "loss": 0.0536, "step": 2360 }, { "epoch": 2.2722914669223395, "grad_norm": 0.6278344392776489, "learning_rate": 9.848280484955243e-05, "loss": 0.0566, "step": 2370 }, { "epoch": 2.2818791946308723, "grad_norm": 0.45389410853385925, "learning_rate": 9.846145622813343e-05, "loss": 0.0538, "step": 2380 }, { "epoch": 2.2914669223394055, "grad_norm": 0.3653407692909241, "learning_rate": 9.843996080143181e-05, "loss": 0.0496, "step": 2390 }, { "epoch": 2.3010546500479387, "grad_norm": 0.39420798420906067, "learning_rate": 9.84183186345641e-05, "loss": 0.0507, "step": 2400 }, { "epoch": 2.310642377756472, "grad_norm": 0.36511731147766113, "learning_rate": 9.839652979309135e-05, "loss": 0.0415, "step": 2410 }, { "epoch": 2.3202301054650047, "grad_norm": 0.6739844679832458, "learning_rate": 9.837459434301896e-05, "loss": 0.0497, "step": 2420 }, { "epoch": 2.329817833173538, "grad_norm": 0.3520050346851349, "learning_rate": 9.835251235079643e-05, "loss": 0.0476, "step": 2430 }, { "epoch": 2.3394055608820707, "grad_norm": 0.3880830705165863, "learning_rate": 9.833028388331719e-05, "loss": 0.0477, "step": 2440 }, { "epoch": 2.348993288590604, "grad_norm": 0.5605785250663757, "learning_rate": 9.830790900791842e-05, "loss": 0.0565, "step": 2450 }, { "epoch": 2.358581016299137, "grad_norm": 0.43835964798927307, "learning_rate": 9.828538779238074e-05, "loss": 0.0481, "step": 2460 }, { "epoch": 2.3681687440076704, "grad_norm": 0.46309876441955566, "learning_rate": 9.826272030492817e-05, "loss": 0.0459, "step": 2470 }, { "epoch": 2.377756471716203, "grad_norm": 0.315773606300354, "learning_rate": 9.823990661422778e-05, "loss": 0.0446, "step": 2480 }, { "epoch": 2.3873441994247364, "grad_norm": 0.37291958928108215, "learning_rate": 9.821694678938953e-05, "loss": 0.0394, "step": 2490 }, { "epoch": 2.396931927133269, "grad_norm": 0.5233327150344849, "learning_rate": 9.819384089996613e-05, "loss": 0.0494, "step": 2500 }, { "epoch": 2.4065196548418024, "grad_norm": 0.33032602071762085, "learning_rate": 9.817058901595269e-05, "loss": 0.0586, "step": 2510 }, { "epoch": 2.4161073825503356, "grad_norm": 0.39209842681884766, "learning_rate": 9.814719120778663e-05, "loss": 0.0528, "step": 2520 }, { "epoch": 2.425695110258869, "grad_norm": 0.3824262320995331, "learning_rate": 9.81236475463474e-05, "loss": 0.0502, "step": 2530 }, { "epoch": 2.4352828379674016, "grad_norm": 0.4724734127521515, "learning_rate": 9.809995810295633e-05, "loss": 0.0538, "step": 2540 }, { "epoch": 2.444870565675935, "grad_norm": 0.4816121459007263, "learning_rate": 9.80761229493763e-05, "loss": 0.0599, "step": 2550 }, { "epoch": 2.4544582933844676, "grad_norm": 0.4902478754520416, "learning_rate": 9.805214215781165e-05, "loss": 0.0579, "step": 2560 }, { "epoch": 2.464046021093001, "grad_norm": 0.4263833463191986, "learning_rate": 9.802801580090785e-05, "loss": 0.0496, "step": 2570 }, { "epoch": 2.473633748801534, "grad_norm": 0.4122842848300934, "learning_rate": 9.800374395175143e-05, "loss": 0.0601, "step": 2580 }, { "epoch": 2.4832214765100673, "grad_norm": 0.3193143308162689, "learning_rate": 9.797932668386955e-05, "loss": 0.0453, "step": 2590 }, { "epoch": 2.4928092042186, "grad_norm": 0.302079439163208, "learning_rate": 9.795476407122994e-05, "loss": 0.0526, "step": 2600 }, { "epoch": 2.5023969319271333, "grad_norm": 0.3169849216938019, "learning_rate": 9.793005618824066e-05, "loss": 0.0475, "step": 2610 }, { "epoch": 2.511984659635666, "grad_norm": 0.35016322135925293, "learning_rate": 9.790520310974978e-05, "loss": 0.0523, "step": 2620 }, { "epoch": 2.5215723873441993, "grad_norm": 0.5532832741737366, "learning_rate": 9.788020491104524e-05, "loss": 0.0516, "step": 2630 }, { "epoch": 2.5311601150527325, "grad_norm": 0.48316141963005066, "learning_rate": 9.785506166785461e-05, "loss": 0.0455, "step": 2640 }, { "epoch": 2.5407478427612658, "grad_norm": 0.53989177942276, "learning_rate": 9.78297734563448e-05, "loss": 0.05, "step": 2650 }, { "epoch": 2.5503355704697985, "grad_norm": 0.44286760687828064, "learning_rate": 9.780434035312196e-05, "loss": 0.0552, "step": 2660 }, { "epoch": 2.5599232981783318, "grad_norm": 0.5638286471366882, "learning_rate": 9.777876243523108e-05, "loss": 0.062, "step": 2670 }, { "epoch": 2.569511025886865, "grad_norm": 0.45765963196754456, "learning_rate": 9.775303978015585e-05, "loss": 0.0535, "step": 2680 }, { "epoch": 2.5790987535953978, "grad_norm": 0.3893742859363556, "learning_rate": 9.772717246581848e-05, "loss": 0.055, "step": 2690 }, { "epoch": 2.588686481303931, "grad_norm": 0.4707334637641907, "learning_rate": 9.770116057057933e-05, "loss": 0.055, "step": 2700 }, { "epoch": 2.598274209012464, "grad_norm": 0.4900120198726654, "learning_rate": 9.767500417323676e-05, "loss": 0.056, "step": 2710 }, { "epoch": 2.607861936720997, "grad_norm": 0.3331255316734314, "learning_rate": 9.764870335302689e-05, "loss": 0.0502, "step": 2720 }, { "epoch": 2.61744966442953, "grad_norm": 0.47928670048713684, "learning_rate": 9.762225818962336e-05, "loss": 0.0514, "step": 2730 }, { "epoch": 2.6270373921380634, "grad_norm": 0.3848089873790741, "learning_rate": 9.759566876313701e-05, "loss": 0.044, "step": 2740 }, { "epoch": 2.636625119846596, "grad_norm": 0.4957471787929535, "learning_rate": 9.756893515411574e-05, "loss": 0.0434, "step": 2750 }, { "epoch": 2.6462128475551294, "grad_norm": 0.5820662975311279, "learning_rate": 9.754205744354423e-05, "loss": 0.0484, "step": 2760 }, { "epoch": 2.6558005752636626, "grad_norm": 0.3916762173175812, "learning_rate": 9.751503571284368e-05, "loss": 0.0488, "step": 2770 }, { "epoch": 2.665388302972196, "grad_norm": 0.30791330337524414, "learning_rate": 9.748787004387157e-05, "loss": 0.0513, "step": 2780 }, { "epoch": 2.6749760306807286, "grad_norm": 0.5171549320220947, "learning_rate": 9.74605605189214e-05, "loss": 0.0516, "step": 2790 }, { "epoch": 2.684563758389262, "grad_norm": 0.47496703267097473, "learning_rate": 9.743310722072251e-05, "loss": 0.0493, "step": 2800 }, { "epoch": 2.6941514860977946, "grad_norm": 0.5075270533561707, "learning_rate": 9.74055102324397e-05, "loss": 0.0489, "step": 2810 }, { "epoch": 2.703739213806328, "grad_norm": 0.4490506052970886, "learning_rate": 9.737776963767313e-05, "loss": 0.0576, "step": 2820 }, { "epoch": 2.713326941514861, "grad_norm": 0.3923519551753998, "learning_rate": 9.734988552045792e-05, "loss": 0.0513, "step": 2830 }, { "epoch": 2.7229146692233943, "grad_norm": 0.2816771864891052, "learning_rate": 9.7321857965264e-05, "loss": 0.0578, "step": 2840 }, { "epoch": 2.732502396931927, "grad_norm": 0.6326708793640137, "learning_rate": 9.729368705699587e-05, "loss": 0.0452, "step": 2850 }, { "epoch": 2.7420901246404603, "grad_norm": 0.3657870292663574, "learning_rate": 9.726537288099215e-05, "loss": 0.0524, "step": 2860 }, { "epoch": 2.751677852348993, "grad_norm": 0.3347817063331604, "learning_rate": 9.723691552302562e-05, "loss": 0.0451, "step": 2870 }, { "epoch": 2.7612655800575263, "grad_norm": 0.4541146457195282, "learning_rate": 9.720831506930274e-05, "loss": 0.0487, "step": 2880 }, { "epoch": 2.7708533077660595, "grad_norm": 0.4089963734149933, "learning_rate": 9.71795716064634e-05, "loss": 0.0479, "step": 2890 }, { "epoch": 2.7804410354745928, "grad_norm": 0.3474633991718292, "learning_rate": 9.715068522158081e-05, "loss": 0.0467, "step": 2900 }, { "epoch": 2.7900287631831255, "grad_norm": 0.49998903274536133, "learning_rate": 9.712165600216107e-05, "loss": 0.0579, "step": 2910 }, { "epoch": 2.7996164908916588, "grad_norm": 0.41667240858078003, "learning_rate": 9.709248403614298e-05, "loss": 0.0456, "step": 2920 }, { "epoch": 2.8092042186001915, "grad_norm": 0.3876051604747772, "learning_rate": 9.706316941189779e-05, "loss": 0.0411, "step": 2930 }, { "epoch": 2.8187919463087248, "grad_norm": 0.34348323941230774, "learning_rate": 9.703371221822888e-05, "loss": 0.0463, "step": 2940 }, { "epoch": 2.828379674017258, "grad_norm": 0.5338907241821289, "learning_rate": 9.700411254437154e-05, "loss": 0.0476, "step": 2950 }, { "epoch": 2.837967401725791, "grad_norm": 0.5973591804504395, "learning_rate": 9.697437047999266e-05, "loss": 0.0531, "step": 2960 }, { "epoch": 2.847555129434324, "grad_norm": 0.31144216656684875, "learning_rate": 9.694448611519049e-05, "loss": 0.0494, "step": 2970 }, { "epoch": 2.857142857142857, "grad_norm": 0.4310339391231537, "learning_rate": 9.691445954049434e-05, "loss": 0.0448, "step": 2980 }, { "epoch": 2.86673058485139, "grad_norm": 0.36877721548080444, "learning_rate": 9.688429084686435e-05, "loss": 0.043, "step": 2990 }, { "epoch": 2.876318312559923, "grad_norm": 0.35387906432151794, "learning_rate": 9.685398012569115e-05, "loss": 0.055, "step": 3000 }, { "epoch": 2.8859060402684564, "grad_norm": 0.3781449496746063, "learning_rate": 9.682352746879562e-05, "loss": 0.0513, "step": 3010 }, { "epoch": 2.8954937679769897, "grad_norm": 0.3556309938430786, "learning_rate": 9.679293296842863e-05, "loss": 0.0556, "step": 3020 }, { "epoch": 2.9050814956855224, "grad_norm": 0.4965471923351288, "learning_rate": 9.676219671727072e-05, "loss": 0.0502, "step": 3030 }, { "epoch": 2.9146692233940557, "grad_norm": 0.40289080142974854, "learning_rate": 9.673131880843185e-05, "loss": 0.0474, "step": 3040 }, { "epoch": 2.9242569511025884, "grad_norm": 0.3517281115055084, "learning_rate": 9.67002993354511e-05, "loss": 0.0557, "step": 3050 }, { "epoch": 2.9338446788111217, "grad_norm": 0.5005010366439819, "learning_rate": 9.66691383922964e-05, "loss": 0.059, "step": 3060 }, { "epoch": 2.943432406519655, "grad_norm": 0.36781349778175354, "learning_rate": 9.66378360733642e-05, "loss": 0.055, "step": 3070 }, { "epoch": 2.953020134228188, "grad_norm": 0.310249388217926, "learning_rate": 9.660639247347931e-05, "loss": 0.0523, "step": 3080 }, { "epoch": 2.962607861936721, "grad_norm": 0.27061378955841064, "learning_rate": 9.657480768789446e-05, "loss": 0.0505, "step": 3090 }, { "epoch": 2.972195589645254, "grad_norm": 0.34516626596450806, "learning_rate": 9.654308181229006e-05, "loss": 0.0489, "step": 3100 }, { "epoch": 2.981783317353787, "grad_norm": 0.3140753209590912, "learning_rate": 9.651121494277396e-05, "loss": 0.0531, "step": 3110 }, { "epoch": 2.99137104506232, "grad_norm": 0.4165388345718384, "learning_rate": 9.647920717588114e-05, "loss": 0.0571, "step": 3120 }, { "epoch": 3.0009587727708533, "grad_norm": 0.36014652252197266, "learning_rate": 9.644705860857339e-05, "loss": 0.0515, "step": 3130 }, { "epoch": 3.0105465004793865, "grad_norm": 0.4353986382484436, "learning_rate": 9.641476933823899e-05, "loss": 0.0488, "step": 3140 }, { "epoch": 3.0201342281879193, "grad_norm": 0.4083373546600342, "learning_rate": 9.638233946269253e-05, "loss": 0.052, "step": 3150 }, { "epoch": 3.0297219558964525, "grad_norm": 0.3805656135082245, "learning_rate": 9.634976908017446e-05, "loss": 0.0461, "step": 3160 }, { "epoch": 3.0393096836049858, "grad_norm": 0.36862942576408386, "learning_rate": 9.631705828935092e-05, "loss": 0.0526, "step": 3170 }, { "epoch": 3.0488974113135185, "grad_norm": 0.4625187814235687, "learning_rate": 9.628420718931338e-05, "loss": 0.0536, "step": 3180 }, { "epoch": 3.0584851390220518, "grad_norm": 0.2972494959831238, "learning_rate": 9.625121587957834e-05, "loss": 0.0468, "step": 3190 }, { "epoch": 3.068072866730585, "grad_norm": 0.5064423084259033, "learning_rate": 9.621808446008708e-05, "loss": 0.0516, "step": 3200 }, { "epoch": 3.0776605944391178, "grad_norm": 0.28751927614212036, "learning_rate": 9.618481303120528e-05, "loss": 0.0463, "step": 3210 }, { "epoch": 3.087248322147651, "grad_norm": 0.4198159873485565, "learning_rate": 9.615140169372274e-05, "loss": 0.0395, "step": 3220 }, { "epoch": 3.096836049856184, "grad_norm": 0.41463902592658997, "learning_rate": 9.611785054885312e-05, "loss": 0.0501, "step": 3230 }, { "epoch": 3.106423777564717, "grad_norm": 0.37878739833831787, "learning_rate": 9.608415969823361e-05, "loss": 0.0484, "step": 3240 }, { "epoch": 3.11601150527325, "grad_norm": 0.4990726113319397, "learning_rate": 9.605032924392457e-05, "loss": 0.049, "step": 3250 }, { "epoch": 3.1255992329817834, "grad_norm": 0.39530688524246216, "learning_rate": 9.601635928840927e-05, "loss": 0.0658, "step": 3260 }, { "epoch": 3.135186960690316, "grad_norm": 0.5206883549690247, "learning_rate": 9.598224993459364e-05, "loss": 0.0538, "step": 3270 }, { "epoch": 3.1447746883988494, "grad_norm": 0.5972046256065369, "learning_rate": 9.594800128580582e-05, "loss": 0.054, "step": 3280 }, { "epoch": 3.1543624161073827, "grad_norm": 0.33001407980918884, "learning_rate": 9.591361344579595e-05, "loss": 0.0544, "step": 3290 }, { "epoch": 3.1639501438159154, "grad_norm": 0.38547295331954956, "learning_rate": 9.58790865187358e-05, "loss": 0.0422, "step": 3300 }, { "epoch": 3.1735378715244487, "grad_norm": 0.3369503915309906, "learning_rate": 9.584442060921851e-05, "loss": 0.0472, "step": 3310 }, { "epoch": 3.183125599232982, "grad_norm": 0.2815903127193451, "learning_rate": 9.580961582225826e-05, "loss": 0.0463, "step": 3320 }, { "epoch": 3.1927133269415147, "grad_norm": 0.42745402455329895, "learning_rate": 9.577467226328987e-05, "loss": 0.0517, "step": 3330 }, { "epoch": 3.202301054650048, "grad_norm": 0.46006882190704346, "learning_rate": 9.573959003816856e-05, "loss": 0.0494, "step": 3340 }, { "epoch": 3.211888782358581, "grad_norm": 0.47103896737098694, "learning_rate": 9.57043692531697e-05, "loss": 0.0511, "step": 3350 }, { "epoch": 3.221476510067114, "grad_norm": 0.41211676597595215, "learning_rate": 9.566901001498826e-05, "loss": 0.0512, "step": 3360 }, { "epoch": 3.231064237775647, "grad_norm": 0.5582764148712158, "learning_rate": 9.563351243073878e-05, "loss": 0.0584, "step": 3370 }, { "epoch": 3.2406519654841803, "grad_norm": 0.3129172921180725, "learning_rate": 9.559787660795474e-05, "loss": 0.0596, "step": 3380 }, { "epoch": 3.2502396931927136, "grad_norm": 0.4259207844734192, "learning_rate": 9.556210265458854e-05, "loss": 0.0507, "step": 3390 }, { "epoch": 3.2598274209012463, "grad_norm": 0.29509371519088745, "learning_rate": 9.552619067901089e-05, "loss": 0.0519, "step": 3400 }, { "epoch": 3.2694151486097796, "grad_norm": 0.33097851276397705, "learning_rate": 9.549014079001074e-05, "loss": 0.0503, "step": 3410 }, { "epoch": 3.2790028763183123, "grad_norm": 0.6283732056617737, "learning_rate": 9.545395309679469e-05, "loss": 0.052, "step": 3420 }, { "epoch": 3.2885906040268456, "grad_norm": 0.29192429780960083, "learning_rate": 9.54176277089869e-05, "loss": 0.0452, "step": 3430 }, { "epoch": 3.2981783317353788, "grad_norm": 0.3860151767730713, "learning_rate": 9.538116473662861e-05, "loss": 0.0536, "step": 3440 }, { "epoch": 3.307766059443912, "grad_norm": 0.5127553343772888, "learning_rate": 9.534456429017784e-05, "loss": 0.0521, "step": 3450 }, { "epoch": 3.3173537871524448, "grad_norm": 0.4540964961051941, "learning_rate": 9.530782648050907e-05, "loss": 0.0552, "step": 3460 }, { "epoch": 3.326941514860978, "grad_norm": 0.34647271037101746, "learning_rate": 9.52709514189129e-05, "loss": 0.0457, "step": 3470 }, { "epoch": 3.336529242569511, "grad_norm": 0.4515313506126404, "learning_rate": 9.523393921709574e-05, "loss": 0.0467, "step": 3480 }, { "epoch": 3.346116970278044, "grad_norm": 0.3084343373775482, "learning_rate": 9.519678998717935e-05, "loss": 0.0462, "step": 3490 }, { "epoch": 3.3557046979865772, "grad_norm": 0.5871327519416809, "learning_rate": 9.515950384170073e-05, "loss": 0.0566, "step": 3500 }, { "epoch": 3.3652924256951104, "grad_norm": 0.4407544732093811, "learning_rate": 9.51220808936115e-05, "loss": 0.0436, "step": 3510 }, { "epoch": 3.3748801534036432, "grad_norm": 0.3434475362300873, "learning_rate": 9.508452125627779e-05, "loss": 0.0483, "step": 3520 }, { "epoch": 3.3844678811121764, "grad_norm": 0.5896394848823547, "learning_rate": 9.504682504347978e-05, "loss": 0.0435, "step": 3530 }, { "epoch": 3.3940556088207097, "grad_norm": 0.380214661359787, "learning_rate": 9.500899236941139e-05, "loss": 0.053, "step": 3540 }, { "epoch": 3.4036433365292424, "grad_norm": 0.2878900170326233, "learning_rate": 9.497102334867989e-05, "loss": 0.0488, "step": 3550 }, { "epoch": 3.4132310642377757, "grad_norm": 0.6185137629508972, "learning_rate": 9.493291809630562e-05, "loss": 0.0512, "step": 3560 }, { "epoch": 3.422818791946309, "grad_norm": 0.5001134872436523, "learning_rate": 9.489467672772162e-05, "loss": 0.055, "step": 3570 }, { "epoch": 3.4324065196548417, "grad_norm": 0.46808385848999023, "learning_rate": 9.485629935877323e-05, "loss": 0.0524, "step": 3580 }, { "epoch": 3.441994247363375, "grad_norm": 0.4512917399406433, "learning_rate": 9.481778610571782e-05, "loss": 0.0487, "step": 3590 }, { "epoch": 3.451581975071908, "grad_norm": 0.39726588129997253, "learning_rate": 9.477913708522435e-05, "loss": 0.0578, "step": 3600 }, { "epoch": 3.461169702780441, "grad_norm": 0.32351112365722656, "learning_rate": 9.474035241437312e-05, "loss": 0.0488, "step": 3610 }, { "epoch": 3.470757430488974, "grad_norm": 0.47034138441085815, "learning_rate": 9.470143221065531e-05, "loss": 0.0618, "step": 3620 }, { "epoch": 3.4803451581975073, "grad_norm": 0.23497724533081055, "learning_rate": 9.46623765919727e-05, "loss": 0.0499, "step": 3630 }, { "epoch": 3.48993288590604, "grad_norm": 0.25630268454551697, "learning_rate": 9.462318567663728e-05, "loss": 0.0508, "step": 3640 }, { "epoch": 3.4995206136145733, "grad_norm": 0.3957800269126892, "learning_rate": 9.458385958337087e-05, "loss": 0.0554, "step": 3650 }, { "epoch": 3.5091083413231066, "grad_norm": 0.25262129306793213, "learning_rate": 9.454439843130483e-05, "loss": 0.0473, "step": 3660 }, { "epoch": 3.5186960690316393, "grad_norm": 0.3933389186859131, "learning_rate": 9.450480233997963e-05, "loss": 0.0471, "step": 3670 }, { "epoch": 3.5282837967401726, "grad_norm": 0.26438847184181213, "learning_rate": 9.446507142934452e-05, "loss": 0.0557, "step": 3680 }, { "epoch": 3.537871524448706, "grad_norm": 0.2720869183540344, "learning_rate": 9.442520581975718e-05, "loss": 0.0492, "step": 3690 }, { "epoch": 3.547459252157239, "grad_norm": 0.3165934383869171, "learning_rate": 9.438520563198328e-05, "loss": 0.0512, "step": 3700 }, { "epoch": 3.557046979865772, "grad_norm": 0.6523368954658508, "learning_rate": 9.434507098719624e-05, "loss": 0.0574, "step": 3710 }, { "epoch": 3.566634707574305, "grad_norm": 0.41401076316833496, "learning_rate": 9.430480200697676e-05, "loss": 0.0509, "step": 3720 }, { "epoch": 3.576222435282838, "grad_norm": 0.29742154479026794, "learning_rate": 9.426439881331248e-05, "loss": 0.0489, "step": 3730 }, { "epoch": 3.585810162991371, "grad_norm": 0.40217605233192444, "learning_rate": 9.422386152859763e-05, "loss": 0.0466, "step": 3740 }, { "epoch": 3.5953978906999042, "grad_norm": 0.3434045612812042, "learning_rate": 9.418319027563263e-05, "loss": 0.0575, "step": 3750 }, { "epoch": 3.6049856184084375, "grad_norm": 0.6345980763435364, "learning_rate": 9.414238517762373e-05, "loss": 0.0453, "step": 3760 }, { "epoch": 3.6145733461169702, "grad_norm": 0.43346667289733887, "learning_rate": 9.410144635818266e-05, "loss": 0.055, "step": 3770 }, { "epoch": 3.6241610738255035, "grad_norm": 0.36115562915802, "learning_rate": 9.406037394132623e-05, "loss": 0.0535, "step": 3780 }, { "epoch": 3.6337488015340362, "grad_norm": 0.2766103744506836, "learning_rate": 9.401916805147596e-05, "loss": 0.0463, "step": 3790 }, { "epoch": 3.6433365292425695, "grad_norm": 0.39829254150390625, "learning_rate": 9.397782881345767e-05, "loss": 0.0463, "step": 3800 }, { "epoch": 3.6529242569511027, "grad_norm": 0.3240996301174164, "learning_rate": 9.39363563525012e-05, "loss": 0.0516, "step": 3810 }, { "epoch": 3.662511984659636, "grad_norm": 0.416238009929657, "learning_rate": 9.389475079423988e-05, "loss": 0.0483, "step": 3820 }, { "epoch": 3.6720997123681687, "grad_norm": 0.24697421491146088, "learning_rate": 9.385301226471032e-05, "loss": 0.0451, "step": 3830 }, { "epoch": 3.681687440076702, "grad_norm": 0.3078657388687134, "learning_rate": 9.381114089035188e-05, "loss": 0.0454, "step": 3840 }, { "epoch": 3.6912751677852347, "grad_norm": 0.26055672764778137, "learning_rate": 9.376913679800638e-05, "loss": 0.0426, "step": 3850 }, { "epoch": 3.700862895493768, "grad_norm": 0.36363962292671204, "learning_rate": 9.372700011491768e-05, "loss": 0.0535, "step": 3860 }, { "epoch": 3.710450623202301, "grad_norm": 0.23066310584545135, "learning_rate": 9.36847309687313e-05, "loss": 0.0391, "step": 3870 }, { "epoch": 3.7200383509108343, "grad_norm": 0.35935813188552856, "learning_rate": 9.364232948749402e-05, "loss": 0.0404, "step": 3880 }, { "epoch": 3.729626078619367, "grad_norm": 0.42284151911735535, "learning_rate": 9.359979579965352e-05, "loss": 0.0456, "step": 3890 }, { "epoch": 3.7392138063279003, "grad_norm": 0.29598748683929443, "learning_rate": 9.355713003405797e-05, "loss": 0.0486, "step": 3900 }, { "epoch": 3.748801534036433, "grad_norm": 0.30880895256996155, "learning_rate": 9.351433231995568e-05, "loss": 0.0524, "step": 3910 }, { "epoch": 3.7583892617449663, "grad_norm": 0.2683268189430237, "learning_rate": 9.34714027869946e-05, "loss": 0.0458, "step": 3920 }, { "epoch": 3.7679769894534996, "grad_norm": 0.3789876401424408, "learning_rate": 9.342834156522204e-05, "loss": 0.0529, "step": 3930 }, { "epoch": 3.777564717162033, "grad_norm": 0.2747150957584381, "learning_rate": 9.338514878508428e-05, "loss": 0.0474, "step": 3940 }, { "epoch": 3.7871524448705656, "grad_norm": 0.3292723000049591, "learning_rate": 9.334182457742607e-05, "loss": 0.0544, "step": 3950 }, { "epoch": 3.796740172579099, "grad_norm": 0.28527846932411194, "learning_rate": 9.329836907349033e-05, "loss": 0.0419, "step": 3960 }, { "epoch": 3.8063279002876316, "grad_norm": 0.37766164541244507, "learning_rate": 9.325478240491771e-05, "loss": 0.0503, "step": 3970 }, { "epoch": 3.815915627996165, "grad_norm": 0.4285350739955902, "learning_rate": 9.321106470374618e-05, "loss": 0.0493, "step": 3980 }, { "epoch": 3.825503355704698, "grad_norm": 0.432804137468338, "learning_rate": 9.316721610241068e-05, "loss": 0.0452, "step": 3990 }, { "epoch": 3.8350910834132312, "grad_norm": 0.32709524035453796, "learning_rate": 9.312323673374269e-05, "loss": 0.049, "step": 4000 }, { "epoch": 3.844678811121764, "grad_norm": 0.2850819230079651, "learning_rate": 9.30791267309698e-05, "loss": 0.0379, "step": 4010 }, { "epoch": 3.8542665388302972, "grad_norm": 0.3472555875778198, "learning_rate": 9.303488622771535e-05, "loss": 0.0412, "step": 4020 }, { "epoch": 3.8638542665388305, "grad_norm": 0.545179545879364, "learning_rate": 9.299051535799799e-05, "loss": 0.0535, "step": 4030 }, { "epoch": 3.8734419942473632, "grad_norm": 0.43045416474342346, "learning_rate": 9.29460142562313e-05, "loss": 0.0564, "step": 4040 }, { "epoch": 3.8830297219558965, "grad_norm": 0.30958643555641174, "learning_rate": 9.290138305722343e-05, "loss": 0.0423, "step": 4050 }, { "epoch": 3.8926174496644297, "grad_norm": 0.3504599630832672, "learning_rate": 9.285662189617652e-05, "loss": 0.0525, "step": 4060 }, { "epoch": 3.9022051773729625, "grad_norm": 0.5074465870857239, "learning_rate": 9.281173090868651e-05, "loss": 0.0505, "step": 4070 }, { "epoch": 3.9117929050814957, "grad_norm": 0.30970317125320435, "learning_rate": 9.27667102307426e-05, "loss": 0.0404, "step": 4080 }, { "epoch": 3.921380632790029, "grad_norm": 0.35298407077789307, "learning_rate": 9.27215599987268e-05, "loss": 0.0461, "step": 4090 }, { "epoch": 3.9309683604985617, "grad_norm": 0.32086381316185, "learning_rate": 9.267628034941369e-05, "loss": 0.0476, "step": 4100 }, { "epoch": 3.940556088207095, "grad_norm": 0.33907032012939453, "learning_rate": 9.26308714199698e-05, "loss": 0.0446, "step": 4110 }, { "epoch": 3.950143815915628, "grad_norm": 0.23291510343551636, "learning_rate": 9.258533334795336e-05, "loss": 0.0542, "step": 4120 }, { "epoch": 3.959731543624161, "grad_norm": 0.3786979913711548, "learning_rate": 9.253966627131379e-05, "loss": 0.049, "step": 4130 }, { "epoch": 3.969319271332694, "grad_norm": 0.4073876142501831, "learning_rate": 9.249387032839125e-05, "loss": 0.046, "step": 4140 }, { "epoch": 3.9789069990412274, "grad_norm": 0.3822251856327057, "learning_rate": 9.244794565791639e-05, "loss": 0.0472, "step": 4150 }, { "epoch": 3.98849472674976, "grad_norm": 0.43598631024360657, "learning_rate": 9.240189239900972e-05, "loss": 0.0388, "step": 4160 }, { "epoch": 3.9980824544582934, "grad_norm": 0.2129432111978531, "learning_rate": 9.235571069118131e-05, "loss": 0.0492, "step": 4170 }, { "epoch": 4.007670182166827, "grad_norm": 0.3745039999485016, "learning_rate": 9.23094006743304e-05, "loss": 0.0447, "step": 4180 }, { "epoch": 4.01725790987536, "grad_norm": 0.3619850277900696, "learning_rate": 9.226296248874482e-05, "loss": 0.0523, "step": 4190 }, { "epoch": 4.026845637583893, "grad_norm": 0.3835139274597168, "learning_rate": 9.221639627510076e-05, "loss": 0.048, "step": 4200 }, { "epoch": 4.036433365292425, "grad_norm": 0.28674259781837463, "learning_rate": 9.216970217446219e-05, "loss": 0.0387, "step": 4210 }, { "epoch": 4.046021093000959, "grad_norm": 0.25763458013534546, "learning_rate": 9.21228803282805e-05, "loss": 0.0506, "step": 4220 }, { "epoch": 4.055608820709492, "grad_norm": 0.36224737763404846, "learning_rate": 9.207593087839406e-05, "loss": 0.0453, "step": 4230 }, { "epoch": 4.065196548418025, "grad_norm": 0.38200250267982483, "learning_rate": 9.202885396702782e-05, "loss": 0.0431, "step": 4240 }, { "epoch": 4.074784276126558, "grad_norm": 0.336946964263916, "learning_rate": 9.198164973679285e-05, "loss": 0.0443, "step": 4250 }, { "epoch": 4.0843720038350915, "grad_norm": 0.3541509807109833, "learning_rate": 9.193431833068586e-05, "loss": 0.0499, "step": 4260 }, { "epoch": 4.093959731543624, "grad_norm": 0.3337682783603668, "learning_rate": 9.188685989208886e-05, "loss": 0.0474, "step": 4270 }, { "epoch": 4.103547459252157, "grad_norm": 0.4774644076824188, "learning_rate": 9.183927456476864e-05, "loss": 0.0413, "step": 4280 }, { "epoch": 4.11313518696069, "grad_norm": 0.3974810540676117, "learning_rate": 9.179156249287646e-05, "loss": 0.0495, "step": 4290 }, { "epoch": 4.1227229146692235, "grad_norm": 0.35930874943733215, "learning_rate": 9.174372382094745e-05, "loss": 0.0481, "step": 4300 }, { "epoch": 4.132310642377757, "grad_norm": 0.39746561646461487, "learning_rate": 9.169575869390028e-05, "loss": 0.0401, "step": 4310 }, { "epoch": 4.14189837008629, "grad_norm": 0.3344055414199829, "learning_rate": 9.164766725703669e-05, "loss": 0.0471, "step": 4320 }, { "epoch": 4.151486097794822, "grad_norm": 0.23866185545921326, "learning_rate": 9.159944965604105e-05, "loss": 0.0424, "step": 4330 }, { "epoch": 4.1610738255033555, "grad_norm": 0.3230268657207489, "learning_rate": 9.155110603697996e-05, "loss": 0.0475, "step": 4340 }, { "epoch": 4.170661553211889, "grad_norm": 0.3797110915184021, "learning_rate": 9.150263654630172e-05, "loss": 0.0458, "step": 4350 }, { "epoch": 4.180249280920422, "grad_norm": 0.41824665665626526, "learning_rate": 9.145404133083591e-05, "loss": 0.0401, "step": 4360 }, { "epoch": 4.189837008628955, "grad_norm": 0.45811742544174194, "learning_rate": 9.140532053779307e-05, "loss": 0.0533, "step": 4370 }, { "epoch": 4.199424736337488, "grad_norm": 0.3115192651748657, "learning_rate": 9.135647431476407e-05, "loss": 0.0475, "step": 4380 }, { "epoch": 4.209012464046021, "grad_norm": 0.27874428033828735, "learning_rate": 9.130750280971978e-05, "loss": 0.0444, "step": 4390 }, { "epoch": 4.218600191754554, "grad_norm": 0.5270777940750122, "learning_rate": 9.125840617101058e-05, "loss": 0.0514, "step": 4400 }, { "epoch": 4.228187919463087, "grad_norm": 0.40683162212371826, "learning_rate": 9.120918454736593e-05, "loss": 0.0472, "step": 4410 }, { "epoch": 4.23777564717162, "grad_norm": 0.30064043402671814, "learning_rate": 9.11598380878939e-05, "loss": 0.0492, "step": 4420 }, { "epoch": 4.247363374880154, "grad_norm": 0.4496791362762451, "learning_rate": 9.111036694208072e-05, "loss": 0.0471, "step": 4430 }, { "epoch": 4.256951102588687, "grad_norm": 0.39262011647224426, "learning_rate": 9.106077125979037e-05, "loss": 0.0487, "step": 4440 }, { "epoch": 4.26653883029722, "grad_norm": 0.34774985909461975, "learning_rate": 9.101105119126405e-05, "loss": 0.0452, "step": 4450 }, { "epoch": 4.276126558005752, "grad_norm": 0.4597591459751129, "learning_rate": 9.096120688711978e-05, "loss": 0.0521, "step": 4460 }, { "epoch": 4.285714285714286, "grad_norm": 0.594453752040863, "learning_rate": 9.091123849835195e-05, "loss": 0.0555, "step": 4470 }, { "epoch": 4.295302013422819, "grad_norm": 0.45329248905181885, "learning_rate": 9.086114617633079e-05, "loss": 0.0408, "step": 4480 }, { "epoch": 4.304889741131352, "grad_norm": 0.34534817934036255, "learning_rate": 9.081093007280205e-05, "loss": 0.0554, "step": 4490 }, { "epoch": 4.314477468839885, "grad_norm": 0.36244168877601624, "learning_rate": 9.076059033988636e-05, "loss": 0.0487, "step": 4500 }, { "epoch": 4.324065196548418, "grad_norm": 0.32668572664260864, "learning_rate": 9.071012713007892e-05, "loss": 0.0483, "step": 4510 }, { "epoch": 4.333652924256951, "grad_norm": 0.31663575768470764, "learning_rate": 9.065954059624895e-05, "loss": 0.0484, "step": 4520 }, { "epoch": 4.343240651965484, "grad_norm": 0.2809025049209595, "learning_rate": 9.06088308916393e-05, "loss": 0.042, "step": 4530 }, { "epoch": 4.352828379674017, "grad_norm": 0.2432290017604828, "learning_rate": 9.05579981698659e-05, "loss": 0.0463, "step": 4540 }, { "epoch": 4.3624161073825505, "grad_norm": 0.2573339343070984, "learning_rate": 9.050704258491736e-05, "loss": 0.0462, "step": 4550 }, { "epoch": 4.372003835091084, "grad_norm": 0.42221635580062866, "learning_rate": 9.045596429115447e-05, "loss": 0.0472, "step": 4560 }, { "epoch": 4.381591562799617, "grad_norm": 0.35964876413345337, "learning_rate": 9.040476344330977e-05, "loss": 0.0448, "step": 4570 }, { "epoch": 4.391179290508149, "grad_norm": 0.27407506108283997, "learning_rate": 9.035344019648702e-05, "loss": 0.0431, "step": 4580 }, { "epoch": 4.4007670182166825, "grad_norm": 0.31676268577575684, "learning_rate": 9.03019947061608e-05, "loss": 0.0441, "step": 4590 }, { "epoch": 4.410354745925216, "grad_norm": 0.2982436716556549, "learning_rate": 9.025042712817598e-05, "loss": 0.043, "step": 4600 }, { "epoch": 4.419942473633749, "grad_norm": 0.3181396424770355, "learning_rate": 9.019873761874727e-05, "loss": 0.0484, "step": 4610 }, { "epoch": 4.429530201342282, "grad_norm": 0.3732481002807617, "learning_rate": 9.014692633445878e-05, "loss": 0.055, "step": 4620 }, { "epoch": 4.439117929050815, "grad_norm": 0.42074957489967346, "learning_rate": 9.009499343226348e-05, "loss": 0.047, "step": 4630 }, { "epoch": 4.448705656759348, "grad_norm": 0.35802584886550903, "learning_rate": 9.004293906948278e-05, "loss": 0.0489, "step": 4640 }, { "epoch": 4.458293384467881, "grad_norm": 0.33133867383003235, "learning_rate": 8.999076340380603e-05, "loss": 0.049, "step": 4650 }, { "epoch": 4.467881112176414, "grad_norm": 0.28263920545578003, "learning_rate": 8.993846659329005e-05, "loss": 0.056, "step": 4660 }, { "epoch": 4.477468839884947, "grad_norm": 0.5171105861663818, "learning_rate": 8.988604879635862e-05, "loss": 0.047, "step": 4670 }, { "epoch": 4.487056567593481, "grad_norm": 0.264189749956131, "learning_rate": 8.983351017180208e-05, "loss": 0.0432, "step": 4680 }, { "epoch": 4.496644295302014, "grad_norm": 0.2710209786891937, "learning_rate": 8.978085087877672e-05, "loss": 0.048, "step": 4690 }, { "epoch": 4.506232023010546, "grad_norm": 0.20794712007045746, "learning_rate": 8.972807107680445e-05, "loss": 0.0524, "step": 4700 }, { "epoch": 4.515819750719079, "grad_norm": 0.2759157419204712, "learning_rate": 8.96751709257722e-05, "loss": 0.0463, "step": 4710 }, { "epoch": 4.525407478427613, "grad_norm": 0.45379728078842163, "learning_rate": 8.962215058593146e-05, "loss": 0.0483, "step": 4720 }, { "epoch": 4.534995206136146, "grad_norm": 0.35511714220046997, "learning_rate": 8.956901021789785e-05, "loss": 0.0473, "step": 4730 }, { "epoch": 4.544582933844679, "grad_norm": 0.49189603328704834, "learning_rate": 8.951574998265058e-05, "loss": 0.0448, "step": 4740 }, { "epoch": 4.554170661553212, "grad_norm": 0.7247273921966553, "learning_rate": 8.946237004153197e-05, "loss": 0.0514, "step": 4750 }, { "epoch": 4.563758389261745, "grad_norm": 0.5640259385108948, "learning_rate": 8.940887055624696e-05, "loss": 0.0495, "step": 4760 }, { "epoch": 4.573346116970278, "grad_norm": 0.9589868187904358, "learning_rate": 8.935525168886262e-05, "loss": 0.0497, "step": 4770 }, { "epoch": 4.582933844678811, "grad_norm": 0.24826788902282715, "learning_rate": 8.930151360180773e-05, "loss": 0.0526, "step": 4780 }, { "epoch": 4.592521572387344, "grad_norm": 0.4066452980041504, "learning_rate": 8.924765645787216e-05, "loss": 0.0482, "step": 4790 }, { "epoch": 4.6021093000958775, "grad_norm": 0.41626861691474915, "learning_rate": 8.919368042020645e-05, "loss": 0.0469, "step": 4800 }, { "epoch": 4.611697027804411, "grad_norm": 0.35766589641571045, "learning_rate": 8.913958565232132e-05, "loss": 0.0489, "step": 4810 }, { "epoch": 4.621284755512944, "grad_norm": 0.24869422614574432, "learning_rate": 8.908537231808716e-05, "loss": 0.043, "step": 4820 }, { "epoch": 4.630872483221476, "grad_norm": 0.3498132526874542, "learning_rate": 8.903104058173354e-05, "loss": 0.044, "step": 4830 }, { "epoch": 4.6404602109300095, "grad_norm": 0.5257985591888428, "learning_rate": 8.897659060784869e-05, "loss": 0.0487, "step": 4840 }, { "epoch": 4.650047938638543, "grad_norm": 0.3492990732192993, "learning_rate": 8.892202256137905e-05, "loss": 0.0516, "step": 4850 }, { "epoch": 4.659635666347076, "grad_norm": 0.5162085294723511, "learning_rate": 8.886733660762871e-05, "loss": 0.0526, "step": 4860 }, { "epoch": 4.669223394055609, "grad_norm": 0.3405402600765228, "learning_rate": 8.881253291225895e-05, "loss": 0.0449, "step": 4870 }, { "epoch": 4.6788111217641415, "grad_norm": 0.4526231586933136, "learning_rate": 8.875761164128772e-05, "loss": 0.053, "step": 4880 }, { "epoch": 4.688398849472675, "grad_norm": 0.3826616108417511, "learning_rate": 8.870257296108918e-05, "loss": 0.0467, "step": 4890 }, { "epoch": 4.697986577181208, "grad_norm": 0.3477012813091278, "learning_rate": 8.86474170383931e-05, "loss": 0.0486, "step": 4900 }, { "epoch": 4.707574304889741, "grad_norm": 0.2914051115512848, "learning_rate": 8.859214404028447e-05, "loss": 0.042, "step": 4910 }, { "epoch": 4.717162032598274, "grad_norm": 0.40637078881263733, "learning_rate": 8.85367541342029e-05, "loss": 0.0432, "step": 4920 }, { "epoch": 4.726749760306808, "grad_norm": 0.36229225993156433, "learning_rate": 8.848124748794218e-05, "loss": 0.0498, "step": 4930 }, { "epoch": 4.736337488015341, "grad_norm": 0.33015790581703186, "learning_rate": 8.842562426964974e-05, "loss": 0.0441, "step": 4940 }, { "epoch": 4.745925215723873, "grad_norm": 0.35154151916503906, "learning_rate": 8.83698846478261e-05, "loss": 0.0463, "step": 4950 }, { "epoch": 4.755512943432406, "grad_norm": 0.2888050377368927, "learning_rate": 8.831402879132446e-05, "loss": 0.0455, "step": 4960 }, { "epoch": 4.76510067114094, "grad_norm": 0.3235926628112793, "learning_rate": 8.825805686935011e-05, "loss": 0.0551, "step": 4970 }, { "epoch": 4.774688398849473, "grad_norm": 0.44466277956962585, "learning_rate": 8.820196905145997e-05, "loss": 0.0476, "step": 4980 }, { "epoch": 4.784276126558006, "grad_norm": 0.39051833748817444, "learning_rate": 8.814576550756197e-05, "loss": 0.04, "step": 4990 }, { "epoch": 4.793863854266538, "grad_norm": 0.3532402813434601, "learning_rate": 8.808944640791467e-05, "loss": 0.0489, "step": 5000 }, { "epoch": 4.803451581975072, "grad_norm": 0.34791117906570435, "learning_rate": 8.803301192312667e-05, "loss": 0.0466, "step": 5010 }, { "epoch": 4.813039309683605, "grad_norm": 0.31138908863067627, "learning_rate": 8.797646222415614e-05, "loss": 0.0407, "step": 5020 }, { "epoch": 4.822627037392138, "grad_norm": 0.2896534502506256, "learning_rate": 8.79197974823102e-05, "loss": 0.0479, "step": 5030 }, { "epoch": 4.832214765100671, "grad_norm": 0.26334378123283386, "learning_rate": 8.786301786924456e-05, "loss": 0.0469, "step": 5040 }, { "epoch": 4.8418024928092045, "grad_norm": 0.2446843832731247, "learning_rate": 8.780612355696283e-05, "loss": 0.0461, "step": 5050 }, { "epoch": 4.851390220517738, "grad_norm": 0.2954402267932892, "learning_rate": 8.774911471781613e-05, "loss": 0.0472, "step": 5060 }, { "epoch": 4.86097794822627, "grad_norm": 0.22677741944789886, "learning_rate": 8.769199152450249e-05, "loss": 0.04, "step": 5070 }, { "epoch": 4.870565675934803, "grad_norm": 0.32872337102890015, "learning_rate": 8.76347541500664e-05, "loss": 0.0479, "step": 5080 }, { "epoch": 4.8801534036433365, "grad_norm": 0.4457066059112549, "learning_rate": 8.757740276789818e-05, "loss": 0.0439, "step": 5090 }, { "epoch": 4.88974113135187, "grad_norm": 0.24604512751102448, "learning_rate": 8.751993755173358e-05, "loss": 0.0468, "step": 5100 }, { "epoch": 4.899328859060403, "grad_norm": 0.3143763840198517, "learning_rate": 8.746235867565313e-05, "loss": 0.0458, "step": 5110 }, { "epoch": 4.908916586768935, "grad_norm": 0.3161276876926422, "learning_rate": 8.74046663140817e-05, "loss": 0.0502, "step": 5120 }, { "epoch": 4.9185043144774685, "grad_norm": 0.2833130657672882, "learning_rate": 8.734686064178797e-05, "loss": 0.0419, "step": 5130 }, { "epoch": 4.928092042186002, "grad_norm": 0.4420258104801178, "learning_rate": 8.728894183388381e-05, "loss": 0.0465, "step": 5140 }, { "epoch": 4.937679769894535, "grad_norm": 0.353081077337265, "learning_rate": 8.723091006582389e-05, "loss": 0.0451, "step": 5150 }, { "epoch": 4.947267497603068, "grad_norm": 0.4228033125400543, "learning_rate": 8.717276551340501e-05, "loss": 0.0495, "step": 5160 }, { "epoch": 4.956855225311601, "grad_norm": 0.3678063452243805, "learning_rate": 8.711450835276565e-05, "loss": 0.0395, "step": 5170 }, { "epoch": 4.966442953020135, "grad_norm": 0.4963276982307434, "learning_rate": 8.705613876038543e-05, "loss": 0.042, "step": 5180 }, { "epoch": 4.976030680728667, "grad_norm": 0.3559805452823639, "learning_rate": 8.699765691308456e-05, "loss": 0.0448, "step": 5190 }, { "epoch": 4.9856184084372, "grad_norm": 0.253312885761261, "learning_rate": 8.69390629880233e-05, "loss": 0.0539, "step": 5200 }, { "epoch": 4.995206136145733, "grad_norm": 0.29010236263275146, "learning_rate": 8.688035716270141e-05, "loss": 0.0447, "step": 5210 }, { "epoch": 5.004793863854267, "grad_norm": 0.35962191224098206, "learning_rate": 8.682153961495767e-05, "loss": 0.0484, "step": 5220 }, { "epoch": 5.0143815915628, "grad_norm": 0.2923009395599365, "learning_rate": 8.676261052296928e-05, "loss": 0.0488, "step": 5230 }, { "epoch": 5.023969319271333, "grad_norm": 0.33261337876319885, "learning_rate": 8.670357006525131e-05, "loss": 0.053, "step": 5240 }, { "epoch": 5.033557046979865, "grad_norm": 0.3641784191131592, "learning_rate": 8.66444184206563e-05, "loss": 0.0429, "step": 5250 }, { "epoch": 5.043144774688399, "grad_norm": 0.4545520544052124, "learning_rate": 8.658515576837347e-05, "loss": 0.0487, "step": 5260 }, { "epoch": 5.052732502396932, "grad_norm": 0.3597351312637329, "learning_rate": 8.652578228792841e-05, "loss": 0.0571, "step": 5270 }, { "epoch": 5.062320230105465, "grad_norm": 0.26271480321884155, "learning_rate": 8.646629815918244e-05, "loss": 0.046, "step": 5280 }, { "epoch": 5.071907957813998, "grad_norm": 0.2976760268211365, "learning_rate": 8.640670356233202e-05, "loss": 0.049, "step": 5290 }, { "epoch": 5.0814956855225315, "grad_norm": 0.3539637327194214, "learning_rate": 8.634699867790832e-05, "loss": 0.046, "step": 5300 }, { "epoch": 5.091083413231064, "grad_norm": 0.314113587141037, "learning_rate": 8.628718368677655e-05, "loss": 0.0474, "step": 5310 }, { "epoch": 5.100671140939597, "grad_norm": 0.3386295735836029, "learning_rate": 8.622725877013549e-05, "loss": 0.0438, "step": 5320 }, { "epoch": 5.11025886864813, "grad_norm": 0.4622576832771301, "learning_rate": 8.616722410951689e-05, "loss": 0.0447, "step": 5330 }, { "epoch": 5.1198465963566635, "grad_norm": 0.23671875894069672, "learning_rate": 8.610707988678503e-05, "loss": 0.0457, "step": 5340 }, { "epoch": 5.129434324065197, "grad_norm": 0.38376542925834656, "learning_rate": 8.604682628413601e-05, "loss": 0.0521, "step": 5350 }, { "epoch": 5.13902205177373, "grad_norm": 0.2503417432308197, "learning_rate": 8.598646348409729e-05, "loss": 0.0466, "step": 5360 }, { "epoch": 5.148609779482262, "grad_norm": 0.33504578471183777, "learning_rate": 8.592599166952718e-05, "loss": 0.0499, "step": 5370 }, { "epoch": 5.1581975071907955, "grad_norm": 0.2641712725162506, "learning_rate": 8.586541102361414e-05, "loss": 0.0471, "step": 5380 }, { "epoch": 5.167785234899329, "grad_norm": 0.363615483045578, "learning_rate": 8.580472172987638e-05, "loss": 0.0451, "step": 5390 }, { "epoch": 5.177372962607862, "grad_norm": 0.29901939630508423, "learning_rate": 8.574392397216123e-05, "loss": 0.0472, "step": 5400 }, { "epoch": 5.186960690316395, "grad_norm": 0.299882173538208, "learning_rate": 8.568301793464457e-05, "loss": 0.0492, "step": 5410 }, { "epoch": 5.196548418024928, "grad_norm": 0.25945836305618286, "learning_rate": 8.562200380183033e-05, "loss": 0.0354, "step": 5420 }, { "epoch": 5.206136145733462, "grad_norm": 0.39987847208976746, "learning_rate": 8.556088175854984e-05, "loss": 0.0367, "step": 5430 }, { "epoch": 5.215723873441994, "grad_norm": 0.31205254793167114, "learning_rate": 8.54996519899614e-05, "loss": 0.0412, "step": 5440 }, { "epoch": 5.225311601150527, "grad_norm": 0.3277497887611389, "learning_rate": 8.543831468154955e-05, "loss": 0.0502, "step": 5450 }, { "epoch": 5.23489932885906, "grad_norm": 0.3311022222042084, "learning_rate": 8.537687001912471e-05, "loss": 0.0477, "step": 5460 }, { "epoch": 5.244487056567594, "grad_norm": 0.42579907178878784, "learning_rate": 8.531531818882241e-05, "loss": 0.0509, "step": 5470 }, { "epoch": 5.254074784276127, "grad_norm": 0.30724838376045227, "learning_rate": 8.52536593771029e-05, "loss": 0.0418, "step": 5480 }, { "epoch": 5.263662511984659, "grad_norm": 0.3175548017024994, "learning_rate": 8.519189377075049e-05, "loss": 0.0507, "step": 5490 }, { "epoch": 5.273250239693192, "grad_norm": 0.3461003601551056, "learning_rate": 8.513002155687297e-05, "loss": 0.0495, "step": 5500 }, { "epoch": 5.282837967401726, "grad_norm": 0.27968931198120117, "learning_rate": 8.50680429229011e-05, "loss": 0.0424, "step": 5510 }, { "epoch": 5.292425695110259, "grad_norm": 0.2532777190208435, "learning_rate": 8.500595805658806e-05, "loss": 0.0429, "step": 5520 }, { "epoch": 5.302013422818792, "grad_norm": 0.2897396981716156, "learning_rate": 8.494376714600878e-05, "loss": 0.0479, "step": 5530 }, { "epoch": 5.311601150527325, "grad_norm": 0.32838040590286255, "learning_rate": 8.48814703795595e-05, "loss": 0.0462, "step": 5540 }, { "epoch": 5.3211888782358585, "grad_norm": 0.23218947649002075, "learning_rate": 8.481906794595702e-05, "loss": 0.038, "step": 5550 }, { "epoch": 5.330776605944391, "grad_norm": 0.4271414577960968, "learning_rate": 8.475656003423837e-05, "loss": 0.0424, "step": 5560 }, { "epoch": 5.340364333652924, "grad_norm": 0.3327130079269409, "learning_rate": 8.469394683376003e-05, "loss": 0.0461, "step": 5570 }, { "epoch": 5.349952061361457, "grad_norm": 0.34635308384895325, "learning_rate": 8.463122853419748e-05, "loss": 0.0462, "step": 5580 }, { "epoch": 5.3595397890699905, "grad_norm": 0.35077422857284546, "learning_rate": 8.456840532554448e-05, "loss": 0.0477, "step": 5590 }, { "epoch": 5.369127516778524, "grad_norm": 0.44980722665786743, "learning_rate": 8.450547739811275e-05, "loss": 0.0423, "step": 5600 }, { "epoch": 5.378715244487057, "grad_norm": 0.28166648745536804, "learning_rate": 8.444244494253106e-05, "loss": 0.0431, "step": 5610 }, { "epoch": 5.388302972195589, "grad_norm": 0.33736804127693176, "learning_rate": 8.437930814974499e-05, "loss": 0.0479, "step": 5620 }, { "epoch": 5.3978906999041225, "grad_norm": 0.25710147619247437, "learning_rate": 8.43160672110161e-05, "loss": 0.042, "step": 5630 }, { "epoch": 5.407478427612656, "grad_norm": 0.29803675413131714, "learning_rate": 8.425272231792148e-05, "loss": 0.0488, "step": 5640 }, { "epoch": 5.417066155321189, "grad_norm": 0.35298973321914673, "learning_rate": 8.418927366235305e-05, "loss": 0.042, "step": 5650 }, { "epoch": 5.426653883029722, "grad_norm": 0.32311904430389404, "learning_rate": 8.41257214365172e-05, "loss": 0.0452, "step": 5660 }, { "epoch": 5.436241610738255, "grad_norm": 0.38360047340393066, "learning_rate": 8.406206583293394e-05, "loss": 0.0572, "step": 5670 }, { "epoch": 5.445829338446788, "grad_norm": 0.4456116855144501, "learning_rate": 8.399830704443653e-05, "loss": 0.0464, "step": 5680 }, { "epoch": 5.455417066155321, "grad_norm": 0.3833318054676056, "learning_rate": 8.393444526417071e-05, "loss": 0.0461, "step": 5690 }, { "epoch": 5.465004793863854, "grad_norm": 0.27611926198005676, "learning_rate": 8.387048068559435e-05, "loss": 0.0437, "step": 5700 }, { "epoch": 5.474592521572387, "grad_norm": 0.3786008954048157, "learning_rate": 8.380641350247665e-05, "loss": 0.0477, "step": 5710 }, { "epoch": 5.484180249280921, "grad_norm": 0.471384197473526, "learning_rate": 8.37422439088976e-05, "loss": 0.0449, "step": 5720 }, { "epoch": 5.493767976989454, "grad_norm": 0.2924197018146515, "learning_rate": 8.36779720992475e-05, "loss": 0.0476, "step": 5730 }, { "epoch": 5.503355704697986, "grad_norm": 0.24068906903266907, "learning_rate": 8.361359826822625e-05, "loss": 0.0477, "step": 5740 }, { "epoch": 5.512943432406519, "grad_norm": 0.24523060023784637, "learning_rate": 8.354912261084281e-05, "loss": 0.0489, "step": 5750 }, { "epoch": 5.522531160115053, "grad_norm": 0.3498481810092926, "learning_rate": 8.348454532241461e-05, "loss": 0.0387, "step": 5760 }, { "epoch": 5.532118887823586, "grad_norm": 0.3108651340007782, "learning_rate": 8.341986659856698e-05, "loss": 0.0377, "step": 5770 }, { "epoch": 5.541706615532119, "grad_norm": 0.3618451654911041, "learning_rate": 8.335508663523248e-05, "loss": 0.048, "step": 5780 }, { "epoch": 5.551294343240652, "grad_norm": 0.769836962223053, "learning_rate": 8.329020562865038e-05, "loss": 0.0422, "step": 5790 }, { "epoch": 5.5608820709491855, "grad_norm": 0.24395880103111267, "learning_rate": 8.322522377536604e-05, "loss": 0.0395, "step": 5800 }, { "epoch": 5.570469798657718, "grad_norm": 0.5865891575813293, "learning_rate": 8.316014127223033e-05, "loss": 0.0565, "step": 5810 }, { "epoch": 5.580057526366251, "grad_norm": 0.318808376789093, "learning_rate": 8.3094958316399e-05, "loss": 0.0453, "step": 5820 }, { "epoch": 5.589645254074784, "grad_norm": 0.44590169191360474, "learning_rate": 8.302967510533213e-05, "loss": 0.0524, "step": 5830 }, { "epoch": 5.5992329817833175, "grad_norm": 0.3664915859699249, "learning_rate": 8.296429183679349e-05, "loss": 0.0434, "step": 5840 }, { "epoch": 5.608820709491851, "grad_norm": 0.34023183584213257, "learning_rate": 8.289880870884995e-05, "loss": 0.0595, "step": 5850 }, { "epoch": 5.618408437200383, "grad_norm": 0.33271753787994385, "learning_rate": 8.283322591987086e-05, "loss": 0.0476, "step": 5860 }, { "epoch": 5.627996164908916, "grad_norm": 0.30905163288116455, "learning_rate": 8.276754366852754e-05, "loss": 0.0486, "step": 5870 }, { "epoch": 5.6375838926174495, "grad_norm": 0.3950500786304474, "learning_rate": 8.27017621537926e-05, "loss": 0.0524, "step": 5880 }, { "epoch": 5.647171620325983, "grad_norm": 0.3802347481250763, "learning_rate": 8.26358815749393e-05, "loss": 0.0453, "step": 5890 }, { "epoch": 5.656759348034516, "grad_norm": 0.27361515164375305, "learning_rate": 8.256990213154102e-05, "loss": 0.0426, "step": 5900 }, { "epoch": 5.666347075743049, "grad_norm": 0.28120309114456177, "learning_rate": 8.250382402347065e-05, "loss": 0.0406, "step": 5910 }, { "epoch": 5.675934803451582, "grad_norm": 0.44831210374832153, "learning_rate": 8.243764745089999e-05, "loss": 0.0433, "step": 5920 }, { "epoch": 5.685522531160115, "grad_norm": 0.2854187488555908, "learning_rate": 8.237137261429904e-05, "loss": 0.0438, "step": 5930 }, { "epoch": 5.695110258868648, "grad_norm": 0.3696000874042511, "learning_rate": 8.230499971443555e-05, "loss": 0.0399, "step": 5940 }, { "epoch": 5.704697986577181, "grad_norm": 0.794933021068573, "learning_rate": 8.223852895237427e-05, "loss": 0.0452, "step": 5950 }, { "epoch": 5.714285714285714, "grad_norm": 0.3321564793586731, "learning_rate": 8.21719605294765e-05, "loss": 0.0484, "step": 5960 }, { "epoch": 5.723873441994248, "grad_norm": 0.29202380776405334, "learning_rate": 8.210529464739928e-05, "loss": 0.0432, "step": 5970 }, { "epoch": 5.73346116970278, "grad_norm": 0.32877346873283386, "learning_rate": 8.203853150809494e-05, "loss": 0.046, "step": 5980 }, { "epoch": 5.743048897411313, "grad_norm": 0.45695215463638306, "learning_rate": 8.197167131381045e-05, "loss": 0.0464, "step": 5990 }, { "epoch": 5.752636625119846, "grad_norm": 0.20887207984924316, "learning_rate": 8.190471426708675e-05, "loss": 0.0428, "step": 6000 }, { "epoch": 5.76222435282838, "grad_norm": 0.31597304344177246, "learning_rate": 8.183766057075819e-05, "loss": 0.0409, "step": 6010 }, { "epoch": 5.771812080536913, "grad_norm": 0.3338216245174408, "learning_rate": 8.177051042795192e-05, "loss": 0.0461, "step": 6020 }, { "epoch": 5.781399808245446, "grad_norm": 0.32134512066841125, "learning_rate": 8.170326404208724e-05, "loss": 0.0411, "step": 6030 }, { "epoch": 5.790987535953979, "grad_norm": 0.2781100571155548, "learning_rate": 8.163592161687499e-05, "loss": 0.0425, "step": 6040 }, { "epoch": 5.800575263662512, "grad_norm": 0.34772852063179016, "learning_rate": 8.156848335631697e-05, "loss": 0.0368, "step": 6050 }, { "epoch": 5.810162991371045, "grad_norm": 0.3309897184371948, "learning_rate": 8.15009494647053e-05, "loss": 0.04, "step": 6060 }, { "epoch": 5.819750719079578, "grad_norm": 0.252763032913208, "learning_rate": 8.143332014662176e-05, "loss": 0.0398, "step": 6070 }, { "epoch": 5.829338446788111, "grad_norm": 0.3265877664089203, "learning_rate": 8.136559560693722e-05, "loss": 0.045, "step": 6080 }, { "epoch": 5.8389261744966445, "grad_norm": 0.4045432209968567, "learning_rate": 8.129777605081105e-05, "loss": 0.0428, "step": 6090 }, { "epoch": 5.848513902205178, "grad_norm": 0.2679883539676666, "learning_rate": 8.12298616836904e-05, "loss": 0.0433, "step": 6100 }, { "epoch": 5.85810162991371, "grad_norm": 0.4409831166267395, "learning_rate": 8.116185271130965e-05, "loss": 0.0457, "step": 6110 }, { "epoch": 5.867689357622243, "grad_norm": 0.4434974491596222, "learning_rate": 8.10937493396898e-05, "loss": 0.0476, "step": 6120 }, { "epoch": 5.8772770853307765, "grad_norm": 0.363570898771286, "learning_rate": 8.102555177513776e-05, "loss": 0.0405, "step": 6130 }, { "epoch": 5.88686481303931, "grad_norm": 0.31658318638801575, "learning_rate": 8.095726022424583e-05, "loss": 0.0434, "step": 6140 }, { "epoch": 5.896452540747843, "grad_norm": 0.3343175947666168, "learning_rate": 8.088887489389099e-05, "loss": 0.0421, "step": 6150 }, { "epoch": 5.906040268456376, "grad_norm": 0.2580268681049347, "learning_rate": 8.082039599123434e-05, "loss": 0.0415, "step": 6160 }, { "epoch": 5.9156279961649085, "grad_norm": 0.36179137229919434, "learning_rate": 8.07518237237204e-05, "loss": 0.0425, "step": 6170 }, { "epoch": 5.925215723873442, "grad_norm": 0.3440069556236267, "learning_rate": 8.068315829907658e-05, "loss": 0.0404, "step": 6180 }, { "epoch": 5.934803451581975, "grad_norm": 0.39785268902778625, "learning_rate": 8.061439992531241e-05, "loss": 0.0425, "step": 6190 }, { "epoch": 5.944391179290508, "grad_norm": 0.29912492632865906, "learning_rate": 8.054554881071909e-05, "loss": 0.0465, "step": 6200 }, { "epoch": 5.953978906999041, "grad_norm": 0.3317604660987854, "learning_rate": 8.047660516386868e-05, "loss": 0.0432, "step": 6210 }, { "epoch": 5.963566634707575, "grad_norm": 0.3451102077960968, "learning_rate": 8.040756919361358e-05, "loss": 0.0452, "step": 6220 }, { "epoch": 5.973154362416107, "grad_norm": 0.3293020725250244, "learning_rate": 8.03384411090859e-05, "loss": 0.0367, "step": 6230 }, { "epoch": 5.98274209012464, "grad_norm": 0.30293816328048706, "learning_rate": 8.026922111969674e-05, "loss": 0.0442, "step": 6240 }, { "epoch": 5.992329817833173, "grad_norm": 0.2671773433685303, "learning_rate": 8.019990943513565e-05, "loss": 0.0482, "step": 6250 }, { "epoch": 6.001917545541707, "grad_norm": 0.30587103962898254, "learning_rate": 8.013050626536992e-05, "loss": 0.054, "step": 6260 }, { "epoch": 6.01150527325024, "grad_norm": 0.3319852948188782, "learning_rate": 8.0061011820644e-05, "loss": 0.0454, "step": 6270 }, { "epoch": 6.021093000958773, "grad_norm": 0.5606246590614319, "learning_rate": 7.999142631147884e-05, "loss": 0.0491, "step": 6280 }, { "epoch": 6.030680728667305, "grad_norm": 0.3884483873844147, "learning_rate": 7.992174994867123e-05, "loss": 0.0488, "step": 6290 }, { "epoch": 6.040268456375839, "grad_norm": 0.30733785033226013, "learning_rate": 7.985198294329324e-05, "loss": 0.0434, "step": 6300 }, { "epoch": 6.049856184084372, "grad_norm": 0.9947719573974609, "learning_rate": 7.978212550669144e-05, "loss": 0.0452, "step": 6310 }, { "epoch": 6.059443911792905, "grad_norm": 0.3336857259273529, "learning_rate": 7.971217785048644e-05, "loss": 0.0445, "step": 6320 }, { "epoch": 6.069031639501438, "grad_norm": 0.3001098930835724, "learning_rate": 7.964214018657208e-05, "loss": 0.042, "step": 6330 }, { "epoch": 6.0786193672099715, "grad_norm": 0.32423412799835205, "learning_rate": 7.957201272711492e-05, "loss": 0.041, "step": 6340 }, { "epoch": 6.088207094918504, "grad_norm": 0.2871480882167816, "learning_rate": 7.950179568455347e-05, "loss": 0.0436, "step": 6350 }, { "epoch": 6.097794822627037, "grad_norm": 0.4804290533065796, "learning_rate": 7.94314892715977e-05, "loss": 0.0393, "step": 6360 }, { "epoch": 6.10738255033557, "grad_norm": 0.459533154964447, "learning_rate": 7.936109370122824e-05, "loss": 0.0468, "step": 6370 }, { "epoch": 6.1169702780441035, "grad_norm": 0.25455859303474426, "learning_rate": 7.929060918669585e-05, "loss": 0.0409, "step": 6380 }, { "epoch": 6.126558005752637, "grad_norm": 0.34990832209587097, "learning_rate": 7.922003594152068e-05, "loss": 0.0389, "step": 6390 }, { "epoch": 6.13614573346117, "grad_norm": 0.2321031242609024, "learning_rate": 7.914937417949175e-05, "loss": 0.0428, "step": 6400 }, { "epoch": 6.145733461169703, "grad_norm": 0.3366633951663971, "learning_rate": 7.907862411466616e-05, "loss": 0.0417, "step": 6410 }, { "epoch": 6.1553211888782355, "grad_norm": 0.3831850588321686, "learning_rate": 7.900778596136855e-05, "loss": 0.0409, "step": 6420 }, { "epoch": 6.164908916586769, "grad_norm": 0.3772655129432678, "learning_rate": 7.893685993419036e-05, "loss": 0.0412, "step": 6430 }, { "epoch": 6.174496644295302, "grad_norm": 0.4264662563800812, "learning_rate": 7.88658462479893e-05, "loss": 0.0437, "step": 6440 }, { "epoch": 6.184084372003835, "grad_norm": 0.3162544369697571, "learning_rate": 7.879474511788854e-05, "loss": 0.0388, "step": 6450 }, { "epoch": 6.193672099712368, "grad_norm": 0.34539514780044556, "learning_rate": 7.872355675927623e-05, "loss": 0.0416, "step": 6460 }, { "epoch": 6.203259827420902, "grad_norm": 0.3206475079059601, "learning_rate": 7.865228138780469e-05, "loss": 0.0468, "step": 6470 }, { "epoch": 6.212847555129434, "grad_norm": 0.3619016110897064, "learning_rate": 7.858091921938988e-05, "loss": 0.0448, "step": 6480 }, { "epoch": 6.222435282837967, "grad_norm": 0.3190850615501404, "learning_rate": 7.850947047021069e-05, "loss": 0.0388, "step": 6490 }, { "epoch": 6.2320230105465, "grad_norm": 0.3191368579864502, "learning_rate": 7.843793535670827e-05, "loss": 0.0449, "step": 6500 }, { "epoch": 6.241610738255034, "grad_norm": 0.24938683211803436, "learning_rate": 7.836631409558538e-05, "loss": 0.0379, "step": 6510 }, { "epoch": 6.251198465963567, "grad_norm": 0.27279171347618103, "learning_rate": 7.829460690380584e-05, "loss": 0.0398, "step": 6520 }, { "epoch": 6.2607861936721, "grad_norm": 0.4261578917503357, "learning_rate": 7.822281399859365e-05, "loss": 0.0441, "step": 6530 }, { "epoch": 6.270373921380632, "grad_norm": 0.3505672216415405, "learning_rate": 7.815093559743256e-05, "loss": 0.0464, "step": 6540 }, { "epoch": 6.279961649089166, "grad_norm": 0.8695809841156006, "learning_rate": 7.807897191806527e-05, "loss": 0.0459, "step": 6550 }, { "epoch": 6.289549376797699, "grad_norm": 0.3453594446182251, "learning_rate": 7.800692317849285e-05, "loss": 0.0437, "step": 6560 }, { "epoch": 6.299137104506232, "grad_norm": 0.4360389709472656, "learning_rate": 7.7934789596974e-05, "loss": 0.0495, "step": 6570 }, { "epoch": 6.308724832214765, "grad_norm": 0.4259977340698242, "learning_rate": 7.786257139202447e-05, "loss": 0.0486, "step": 6580 }, { "epoch": 6.3183125599232985, "grad_norm": 0.4518745541572571, "learning_rate": 7.779026878241635e-05, "loss": 0.0455, "step": 6590 }, { "epoch": 6.327900287631831, "grad_norm": 0.38590195775032043, "learning_rate": 7.771788198717741e-05, "loss": 0.043, "step": 6600 }, { "epoch": 6.337488015340364, "grad_norm": 0.2825833559036255, "learning_rate": 7.764541122559046e-05, "loss": 0.0439, "step": 6610 }, { "epoch": 6.347075743048897, "grad_norm": 0.364486962556839, "learning_rate": 7.757285671719264e-05, "loss": 0.0429, "step": 6620 }, { "epoch": 6.3566634707574305, "grad_norm": 0.32037052512168884, "learning_rate": 7.750021868177485e-05, "loss": 0.0433, "step": 6630 }, { "epoch": 6.366251198465964, "grad_norm": 0.2986597716808319, "learning_rate": 7.742749733938094e-05, "loss": 0.0407, "step": 6640 }, { "epoch": 6.375838926174497, "grad_norm": 0.20917120575904846, "learning_rate": 7.73546929103072e-05, "loss": 0.0361, "step": 6650 }, { "epoch": 6.385426653883029, "grad_norm": 0.3319404125213623, "learning_rate": 7.728180561510155e-05, "loss": 0.04, "step": 6660 }, { "epoch": 6.3950143815915625, "grad_norm": 0.4171611964702606, "learning_rate": 7.720883567456298e-05, "loss": 0.0348, "step": 6670 }, { "epoch": 6.404602109300096, "grad_norm": 0.44948673248291016, "learning_rate": 7.713578330974081e-05, "loss": 0.0489, "step": 6680 }, { "epoch": 6.414189837008629, "grad_norm": 0.3433539569377899, "learning_rate": 7.706264874193409e-05, "loss": 0.038, "step": 6690 }, { "epoch": 6.423777564717162, "grad_norm": 0.44886866211891174, "learning_rate": 7.698943219269086e-05, "loss": 0.0437, "step": 6700 }, { "epoch": 6.433365292425695, "grad_norm": 0.30656543374061584, "learning_rate": 7.691613388380752e-05, "loss": 0.0409, "step": 6710 }, { "epoch": 6.442953020134228, "grad_norm": 0.3929513692855835, "learning_rate": 7.684275403732811e-05, "loss": 0.0441, "step": 6720 }, { "epoch": 6.452540747842761, "grad_norm": 0.44606807827949524, "learning_rate": 7.676929287554372e-05, "loss": 0.0457, "step": 6730 }, { "epoch": 6.462128475551294, "grad_norm": 0.3216160535812378, "learning_rate": 7.669575062099175e-05, "loss": 0.0469, "step": 6740 }, { "epoch": 6.471716203259827, "grad_norm": 0.24256640672683716, "learning_rate": 7.662212749645527e-05, "loss": 0.0384, "step": 6750 }, { "epoch": 6.481303930968361, "grad_norm": 0.37510934472084045, "learning_rate": 7.654842372496232e-05, "loss": 0.0389, "step": 6760 }, { "epoch": 6.490891658676894, "grad_norm": 0.3382836878299713, "learning_rate": 7.647463952978524e-05, "loss": 0.0448, "step": 6770 }, { "epoch": 6.500479386385427, "grad_norm": 0.4976375102996826, "learning_rate": 7.640077513443999e-05, "loss": 0.0413, "step": 6780 }, { "epoch": 6.510067114093959, "grad_norm": 0.273062527179718, "learning_rate": 7.632683076268552e-05, "loss": 0.0432, "step": 6790 }, { "epoch": 6.519654841802493, "grad_norm": 0.34846237301826477, "learning_rate": 7.625280663852301e-05, "loss": 0.0501, "step": 6800 }, { "epoch": 6.529242569511026, "grad_norm": 0.26076826453208923, "learning_rate": 7.617870298619527e-05, "loss": 0.0428, "step": 6810 }, { "epoch": 6.538830297219559, "grad_norm": 0.8371449708938599, "learning_rate": 7.610452003018602e-05, "loss": 0.0437, "step": 6820 }, { "epoch": 6.548418024928092, "grad_norm": 0.28489676117897034, "learning_rate": 7.603025799521918e-05, "loss": 0.0446, "step": 6830 }, { "epoch": 6.558005752636625, "grad_norm": 0.3971545994281769, "learning_rate": 7.595591710625829e-05, "loss": 0.045, "step": 6840 }, { "epoch": 6.567593480345158, "grad_norm": 0.24828213453292847, "learning_rate": 7.588149758850572e-05, "loss": 0.0431, "step": 6850 }, { "epoch": 6.577181208053691, "grad_norm": 0.23631419241428375, "learning_rate": 7.580699966740201e-05, "loss": 0.0384, "step": 6860 }, { "epoch": 6.586768935762224, "grad_norm": 0.3739171326160431, "learning_rate": 7.57324235686253e-05, "loss": 0.0513, "step": 6870 }, { "epoch": 6.5963566634707576, "grad_norm": 0.29776638746261597, "learning_rate": 7.565776951809043e-05, "loss": 0.0437, "step": 6880 }, { "epoch": 6.605944391179291, "grad_norm": 0.24786557257175446, "learning_rate": 7.558303774194848e-05, "loss": 0.045, "step": 6890 }, { "epoch": 6.615532118887824, "grad_norm": 0.2621402442455292, "learning_rate": 7.550822846658592e-05, "loss": 0.036, "step": 6900 }, { "epoch": 6.625119846596356, "grad_norm": 0.4778667092323303, "learning_rate": 7.543334191862408e-05, "loss": 0.0403, "step": 6910 }, { "epoch": 6.6347075743048896, "grad_norm": 0.37852802872657776, "learning_rate": 7.535837832491826e-05, "loss": 0.0433, "step": 6920 }, { "epoch": 6.644295302013423, "grad_norm": 0.5725548267364502, "learning_rate": 7.528333791255723e-05, "loss": 0.0434, "step": 6930 }, { "epoch": 6.653883029721956, "grad_norm": 0.39372578263282776, "learning_rate": 7.520822090886245e-05, "loss": 0.0403, "step": 6940 }, { "epoch": 6.663470757430489, "grad_norm": 0.2831190526485443, "learning_rate": 7.513302754138741e-05, "loss": 0.0424, "step": 6950 }, { "epoch": 6.673058485139022, "grad_norm": 0.27865827083587646, "learning_rate": 7.50577580379169e-05, "loss": 0.0397, "step": 6960 }, { "epoch": 6.682646212847555, "grad_norm": 0.42975571751594543, "learning_rate": 7.49824126264664e-05, "loss": 0.0426, "step": 6970 }, { "epoch": 6.692233940556088, "grad_norm": 0.3423265218734741, "learning_rate": 7.490699153528124e-05, "loss": 0.045, "step": 6980 }, { "epoch": 6.701821668264621, "grad_norm": 0.25411704182624817, "learning_rate": 7.483149499283616e-05, "loss": 0.0396, "step": 6990 }, { "epoch": 6.7114093959731544, "grad_norm": 0.35409414768218994, "learning_rate": 7.475592322783434e-05, "loss": 0.0382, "step": 7000 }, { "epoch": 6.720997123681688, "grad_norm": 0.28262168169021606, "learning_rate": 7.468027646920687e-05, "loss": 0.045, "step": 7010 }, { "epoch": 6.730584851390221, "grad_norm": 0.4541366398334503, "learning_rate": 7.460455494611206e-05, "loss": 0.0389, "step": 7020 }, { "epoch": 6.740172579098753, "grad_norm": 0.27586543560028076, "learning_rate": 7.452875888793465e-05, "loss": 0.0352, "step": 7030 }, { "epoch": 6.7497603068072864, "grad_norm": 0.2681753933429718, "learning_rate": 7.445288852428518e-05, "loss": 0.0492, "step": 7040 }, { "epoch": 6.75934803451582, "grad_norm": 0.32088425755500793, "learning_rate": 7.437694408499933e-05, "loss": 0.0524, "step": 7050 }, { "epoch": 6.768935762224353, "grad_norm": 0.3608848452568054, "learning_rate": 7.430092580013712e-05, "loss": 0.0444, "step": 7060 }, { "epoch": 6.778523489932886, "grad_norm": 0.2983666658401489, "learning_rate": 7.42248338999823e-05, "loss": 0.0484, "step": 7070 }, { "epoch": 6.788111217641419, "grad_norm": 0.48037657141685486, "learning_rate": 7.414866861504164e-05, "loss": 0.0441, "step": 7080 }, { "epoch": 6.797698945349952, "grad_norm": 0.3220434784889221, "learning_rate": 7.407243017604418e-05, "loss": 0.0407, "step": 7090 }, { "epoch": 6.807286673058485, "grad_norm": 0.21454603970050812, "learning_rate": 7.399611881394061e-05, "loss": 0.0484, "step": 7100 }, { "epoch": 6.816874400767018, "grad_norm": 0.3658502995967865, "learning_rate": 7.391973475990247e-05, "loss": 0.0471, "step": 7110 }, { "epoch": 6.826462128475551, "grad_norm": 0.6076493859291077, "learning_rate": 7.384327824532158e-05, "loss": 0.0512, "step": 7120 }, { "epoch": 6.836049856184085, "grad_norm": 0.27629798650741577, "learning_rate": 7.376674950180918e-05, "loss": 0.0432, "step": 7130 }, { "epoch": 6.845637583892618, "grad_norm": 0.4255768954753876, "learning_rate": 7.36901487611954e-05, "loss": 0.042, "step": 7140 }, { "epoch": 6.855225311601151, "grad_norm": 0.34027740359306335, "learning_rate": 7.361347625552842e-05, "loss": 0.0417, "step": 7150 }, { "epoch": 6.864813039309683, "grad_norm": 0.29743191599845886, "learning_rate": 7.353673221707382e-05, "loss": 0.0506, "step": 7160 }, { "epoch": 6.874400767018217, "grad_norm": 0.2994328439235687, "learning_rate": 7.345991687831393e-05, "loss": 0.042, "step": 7170 }, { "epoch": 6.88398849472675, "grad_norm": 0.2891611158847809, "learning_rate": 7.338303047194697e-05, "loss": 0.0396, "step": 7180 }, { "epoch": 6.893576222435283, "grad_norm": 0.2870160937309265, "learning_rate": 7.330607323088657e-05, "loss": 0.0477, "step": 7190 }, { "epoch": 6.903163950143816, "grad_norm": 0.4798467457294464, "learning_rate": 7.322904538826083e-05, "loss": 0.0409, "step": 7200 }, { "epoch": 6.912751677852349, "grad_norm": 0.30976602435112, "learning_rate": 7.31519471774118e-05, "loss": 0.0431, "step": 7210 }, { "epoch": 6.922339405560882, "grad_norm": 0.32751721143722534, "learning_rate": 7.307477883189463e-05, "loss": 0.0415, "step": 7220 }, { "epoch": 6.931927133269415, "grad_norm": 0.3902662992477417, "learning_rate": 7.299754058547704e-05, "loss": 0.0359, "step": 7230 }, { "epoch": 6.941514860977948, "grad_norm": 0.21194472908973694, "learning_rate": 7.292023267213835e-05, "loss": 0.0409, "step": 7240 }, { "epoch": 6.9511025886864815, "grad_norm": 0.28738507628440857, "learning_rate": 7.284285532606906e-05, "loss": 0.0433, "step": 7250 }, { "epoch": 6.960690316395015, "grad_norm": 0.27712157368659973, "learning_rate": 7.276540878166996e-05, "loss": 0.0445, "step": 7260 }, { "epoch": 6.970278044103548, "grad_norm": 0.36444854736328125, "learning_rate": 7.268789327355143e-05, "loss": 0.0424, "step": 7270 }, { "epoch": 6.97986577181208, "grad_norm": 0.26638609170913696, "learning_rate": 7.261030903653278e-05, "loss": 0.0415, "step": 7280 }, { "epoch": 6.9894534995206135, "grad_norm": 0.29326483607292175, "learning_rate": 7.253265630564155e-05, "loss": 0.0404, "step": 7290 }, { "epoch": 6.999041227229147, "grad_norm": 0.563951849937439, "learning_rate": 7.245493531611274e-05, "loss": 0.0462, "step": 7300 }, { "epoch": 7.00862895493768, "grad_norm": 0.2669621407985687, "learning_rate": 7.237714630338812e-05, "loss": 0.0489, "step": 7310 }, { "epoch": 7.018216682646213, "grad_norm": 0.29936525225639343, "learning_rate": 7.229928950311558e-05, "loss": 0.042, "step": 7320 }, { "epoch": 7.027804410354746, "grad_norm": 0.29611873626708984, "learning_rate": 7.222136515114828e-05, "loss": 0.0451, "step": 7330 }, { "epoch": 7.037392138063279, "grad_norm": 0.2841253876686096, "learning_rate": 7.214337348354408e-05, "loss": 0.0401, "step": 7340 }, { "epoch": 7.046979865771812, "grad_norm": 0.39095616340637207, "learning_rate": 7.206531473656473e-05, "loss": 0.0443, "step": 7350 }, { "epoch": 7.056567593480345, "grad_norm": 0.3568895757198334, "learning_rate": 7.19871891466752e-05, "loss": 0.04, "step": 7360 }, { "epoch": 7.066155321188878, "grad_norm": 0.4422648549079895, "learning_rate": 7.190899695054293e-05, "loss": 0.0357, "step": 7370 }, { "epoch": 7.075743048897412, "grad_norm": 0.3040291965007782, "learning_rate": 7.183073838503715e-05, "loss": 0.0375, "step": 7380 }, { "epoch": 7.085330776605945, "grad_norm": 0.3379688560962677, "learning_rate": 7.175241368722812e-05, "loss": 0.0441, "step": 7390 }, { "epoch": 7.094918504314477, "grad_norm": 0.23404334485530853, "learning_rate": 7.167402309438649e-05, "loss": 0.0438, "step": 7400 }, { "epoch": 7.10450623202301, "grad_norm": 0.19392350316047668, "learning_rate": 7.159556684398246e-05, "loss": 0.0429, "step": 7410 }, { "epoch": 7.114093959731544, "grad_norm": 0.3650771975517273, "learning_rate": 7.151704517368513e-05, "loss": 0.0417, "step": 7420 }, { "epoch": 7.123681687440077, "grad_norm": 0.3727266788482666, "learning_rate": 7.143845832136188e-05, "loss": 0.0381, "step": 7430 }, { "epoch": 7.13326941514861, "grad_norm": 0.2589777410030365, "learning_rate": 7.13598065250774e-05, "loss": 0.046, "step": 7440 }, { "epoch": 7.142857142857143, "grad_norm": 0.3064965009689331, "learning_rate": 7.128109002309324e-05, "loss": 0.0419, "step": 7450 }, { "epoch": 7.152444870565676, "grad_norm": 0.3681334853172302, "learning_rate": 7.120230905386688e-05, "loss": 0.0456, "step": 7460 }, { "epoch": 7.162032598274209, "grad_norm": 0.23908288776874542, "learning_rate": 7.112346385605115e-05, "loss": 0.0395, "step": 7470 }, { "epoch": 7.171620325982742, "grad_norm": 0.26035764813423157, "learning_rate": 7.104455466849339e-05, "loss": 0.0411, "step": 7480 }, { "epoch": 7.181208053691275, "grad_norm": 0.25808098912239075, "learning_rate": 7.096558173023486e-05, "loss": 0.0405, "step": 7490 }, { "epoch": 7.1907957813998085, "grad_norm": 0.21516771614551544, "learning_rate": 7.088654528050986e-05, "loss": 0.0411, "step": 7500 }, { "epoch": 7.200383509108342, "grad_norm": 0.27496856451034546, "learning_rate": 7.080744555874517e-05, "loss": 0.0332, "step": 7510 }, { "epoch": 7.209971236816874, "grad_norm": 0.43999767303466797, "learning_rate": 7.072828280455917e-05, "loss": 0.0384, "step": 7520 }, { "epoch": 7.219558964525407, "grad_norm": 0.3292781710624695, "learning_rate": 7.06490572577612e-05, "loss": 0.042, "step": 7530 }, { "epoch": 7.2291466922339405, "grad_norm": 0.3117612600326538, "learning_rate": 7.056976915835087e-05, "loss": 0.0387, "step": 7540 }, { "epoch": 7.238734419942474, "grad_norm": 0.2206171602010727, "learning_rate": 7.049041874651722e-05, "loss": 0.0362, "step": 7550 }, { "epoch": 7.248322147651007, "grad_norm": 0.2644396722316742, "learning_rate": 7.04110062626381e-05, "loss": 0.0373, "step": 7560 }, { "epoch": 7.25790987535954, "grad_norm": 0.2682825028896332, "learning_rate": 7.033153194727934e-05, "loss": 0.039, "step": 7570 }, { "epoch": 7.2674976030680725, "grad_norm": 0.3411322832107544, "learning_rate": 7.025199604119416e-05, "loss": 0.0454, "step": 7580 }, { "epoch": 7.277085330776606, "grad_norm": 0.3761787712574005, "learning_rate": 7.017239878532227e-05, "loss": 0.0379, "step": 7590 }, { "epoch": 7.286673058485139, "grad_norm": 0.24610835313796997, "learning_rate": 7.009274042078927e-05, "loss": 0.0465, "step": 7600 }, { "epoch": 7.296260786193672, "grad_norm": 0.3763638138771057, "learning_rate": 7.00130211889059e-05, "loss": 0.0351, "step": 7610 }, { "epoch": 7.305848513902205, "grad_norm": 0.2616029679775238, "learning_rate": 6.993324133116726e-05, "loss": 0.039, "step": 7620 }, { "epoch": 7.315436241610739, "grad_norm": 0.40914463996887207, "learning_rate": 6.985340108925209e-05, "loss": 0.0417, "step": 7630 }, { "epoch": 7.325023969319272, "grad_norm": 0.3503078520298004, "learning_rate": 6.977350070502208e-05, "loss": 0.0456, "step": 7640 }, { "epoch": 7.334611697027804, "grad_norm": 0.40051010251045227, "learning_rate": 6.96935404205211e-05, "loss": 0.047, "step": 7650 }, { "epoch": 7.344199424736337, "grad_norm": 0.3985821306705475, "learning_rate": 6.96135204779745e-05, "loss": 0.0409, "step": 7660 }, { "epoch": 7.353787152444871, "grad_norm": 0.5366324782371521, "learning_rate": 6.95334411197883e-05, "loss": 0.0445, "step": 7670 }, { "epoch": 7.363374880153404, "grad_norm": 0.2314271628856659, "learning_rate": 6.945330258854854e-05, "loss": 0.0345, "step": 7680 }, { "epoch": 7.372962607861937, "grad_norm": 0.24734103679656982, "learning_rate": 6.937310512702056e-05, "loss": 0.0354, "step": 7690 }, { "epoch": 7.382550335570469, "grad_norm": 0.7746879458427429, "learning_rate": 6.929284897814812e-05, "loss": 0.0398, "step": 7700 }, { "epoch": 7.392138063279003, "grad_norm": 0.3436695635318756, "learning_rate": 6.921253438505285e-05, "loss": 0.0426, "step": 7710 }, { "epoch": 7.401725790987536, "grad_norm": 0.3027035593986511, "learning_rate": 6.913216159103339e-05, "loss": 0.0365, "step": 7720 }, { "epoch": 7.411313518696069, "grad_norm": 0.23207184672355652, "learning_rate": 6.905173083956468e-05, "loss": 0.0397, "step": 7730 }, { "epoch": 7.420901246404602, "grad_norm": 0.2601774036884308, "learning_rate": 6.897124237429726e-05, "loss": 0.0377, "step": 7740 }, { "epoch": 7.4304889741131355, "grad_norm": 0.37864232063293457, "learning_rate": 6.889069643905646e-05, "loss": 0.0426, "step": 7750 }, { "epoch": 7.440076701821669, "grad_norm": 0.29199257493019104, "learning_rate": 6.881009327784176e-05, "loss": 0.0414, "step": 7760 }, { "epoch": 7.449664429530201, "grad_norm": 0.39418113231658936, "learning_rate": 6.872943313482596e-05, "loss": 0.04, "step": 7770 }, { "epoch": 7.459252157238734, "grad_norm": 0.2868475615978241, "learning_rate": 6.864871625435448e-05, "loss": 0.0373, "step": 7780 }, { "epoch": 7.4688398849472675, "grad_norm": 0.27719494700431824, "learning_rate": 6.856794288094461e-05, "loss": 0.0401, "step": 7790 }, { "epoch": 7.478427612655801, "grad_norm": 0.33910930156707764, "learning_rate": 6.848711325928481e-05, "loss": 0.0375, "step": 7800 }, { "epoch": 7.488015340364334, "grad_norm": 0.4122414290904999, "learning_rate": 6.840622763423391e-05, "loss": 0.0437, "step": 7810 }, { "epoch": 7.497603068072867, "grad_norm": 0.2600208818912506, "learning_rate": 6.832528625082036e-05, "loss": 0.0418, "step": 7820 }, { "epoch": 7.5071907957813995, "grad_norm": 0.27382367849349976, "learning_rate": 6.824428935424158e-05, "loss": 0.0512, "step": 7830 }, { "epoch": 7.516778523489933, "grad_norm": 0.27426889538764954, "learning_rate": 6.816323718986313e-05, "loss": 0.0339, "step": 7840 }, { "epoch": 7.526366251198466, "grad_norm": 0.32315194606781006, "learning_rate": 6.808213000321796e-05, "loss": 0.0387, "step": 7850 }, { "epoch": 7.535953978906999, "grad_norm": 0.2910844683647156, "learning_rate": 6.80009680400058e-05, "loss": 0.0351, "step": 7860 }, { "epoch": 7.545541706615532, "grad_norm": 0.3915770649909973, "learning_rate": 6.791975154609216e-05, "loss": 0.0439, "step": 7870 }, { "epoch": 7.555129434324066, "grad_norm": 0.2871047258377075, "learning_rate": 6.78384807675079e-05, "loss": 0.039, "step": 7880 }, { "epoch": 7.564717162032598, "grad_norm": 0.3511698544025421, "learning_rate": 6.775715595044822e-05, "loss": 0.039, "step": 7890 }, { "epoch": 7.574304889741131, "grad_norm": 0.23974575102329254, "learning_rate": 6.767577734127209e-05, "loss": 0.0438, "step": 7900 }, { "epoch": 7.583892617449664, "grad_norm": 0.21983303129673004, "learning_rate": 6.759434518650133e-05, "loss": 0.043, "step": 7910 }, { "epoch": 7.593480345158198, "grad_norm": 0.2729918658733368, "learning_rate": 6.75128597328201e-05, "loss": 0.0423, "step": 7920 }, { "epoch": 7.603068072866731, "grad_norm": 0.34236469864845276, "learning_rate": 6.743132122707394e-05, "loss": 0.0443, "step": 7930 }, { "epoch": 7.612655800575264, "grad_norm": 0.24948126077651978, "learning_rate": 6.73497299162691e-05, "loss": 0.037, "step": 7940 }, { "epoch": 7.622243528283796, "grad_norm": 0.3250608444213867, "learning_rate": 6.726808604757184e-05, "loss": 0.0476, "step": 7950 }, { "epoch": 7.63183125599233, "grad_norm": 0.2713163495063782, "learning_rate": 6.718638986830758e-05, "loss": 0.0391, "step": 7960 }, { "epoch": 7.641418983700863, "grad_norm": 0.3012318015098572, "learning_rate": 6.710464162596023e-05, "loss": 0.0445, "step": 7970 }, { "epoch": 7.651006711409396, "grad_norm": 0.4039930999279022, "learning_rate": 6.702284156817143e-05, "loss": 0.045, "step": 7980 }, { "epoch": 7.660594439117929, "grad_norm": 0.22321514785289764, "learning_rate": 6.694098994273977e-05, "loss": 0.0395, "step": 7990 }, { "epoch": 7.6701821668264625, "grad_norm": 0.3009647727012634, "learning_rate": 6.685908699762002e-05, "loss": 0.0425, "step": 8000 }, { "epoch": 7.679769894534996, "grad_norm": 0.23675967752933502, "learning_rate": 6.677713298092251e-05, "loss": 0.043, "step": 8010 }, { "epoch": 7.689357622243528, "grad_norm": 0.3453296422958374, "learning_rate": 6.669512814091219e-05, "loss": 0.0402, "step": 8020 }, { "epoch": 7.698945349952061, "grad_norm": 0.35849177837371826, "learning_rate": 6.6613072726008e-05, "loss": 0.0412, "step": 8030 }, { "epoch": 7.7085330776605945, "grad_norm": 0.2602018117904663, "learning_rate": 6.65309669847821e-05, "loss": 0.0456, "step": 8040 }, { "epoch": 7.718120805369128, "grad_norm": 0.296563059091568, "learning_rate": 6.64488111659591e-05, "loss": 0.0354, "step": 8050 }, { "epoch": 7.727708533077661, "grad_norm": 0.2529861629009247, "learning_rate": 6.636660551841527e-05, "loss": 0.046, "step": 8060 }, { "epoch": 7.737296260786193, "grad_norm": 0.3589211404323578, "learning_rate": 6.62843502911779e-05, "loss": 0.0486, "step": 8070 }, { "epoch": 7.7468839884947265, "grad_norm": 0.28562942147254944, "learning_rate": 6.620204573342444e-05, "loss": 0.04, "step": 8080 }, { "epoch": 7.75647171620326, "grad_norm": 0.42662665247917175, "learning_rate": 6.611969209448175e-05, "loss": 0.0417, "step": 8090 }, { "epoch": 7.766059443911793, "grad_norm": 0.3339911997318268, "learning_rate": 6.603728962382542e-05, "loss": 0.0344, "step": 8100 }, { "epoch": 7.775647171620326, "grad_norm": 0.5838896632194519, "learning_rate": 6.595483857107891e-05, "loss": 0.0371, "step": 8110 }, { "epoch": 7.785234899328859, "grad_norm": 0.30259743332862854, "learning_rate": 6.587233918601292e-05, "loss": 0.0392, "step": 8120 }, { "epoch": 7.794822627037393, "grad_norm": 0.4095616340637207, "learning_rate": 6.578979171854449e-05, "loss": 0.034, "step": 8130 }, { "epoch": 7.804410354745925, "grad_norm": 0.4089941084384918, "learning_rate": 6.570719641873639e-05, "loss": 0.0432, "step": 8140 }, { "epoch": 7.813998082454458, "grad_norm": 0.22477275133132935, "learning_rate": 6.562455353679624e-05, "loss": 0.0482, "step": 8150 }, { "epoch": 7.823585810162991, "grad_norm": 0.24884644150733948, "learning_rate": 6.554186332307583e-05, "loss": 0.0357, "step": 8160 }, { "epoch": 7.833173537871525, "grad_norm": 0.40433716773986816, "learning_rate": 6.545912602807029e-05, "loss": 0.0393, "step": 8170 }, { "epoch": 7.842761265580058, "grad_norm": 0.1963358074426651, "learning_rate": 6.537634190241742e-05, "loss": 0.0369, "step": 8180 }, { "epoch": 7.85234899328859, "grad_norm": 0.30618107318878174, "learning_rate": 6.529351119689688e-05, "loss": 0.0365, "step": 8190 }, { "epoch": 7.861936720997123, "grad_norm": 0.9213468432426453, "learning_rate": 6.52106341624294e-05, "loss": 0.0415, "step": 8200 }, { "epoch": 7.871524448705657, "grad_norm": 0.41490432620048523, "learning_rate": 6.512771105007609e-05, "loss": 0.0432, "step": 8210 }, { "epoch": 7.88111217641419, "grad_norm": 0.3433400094509125, "learning_rate": 6.504474211103766e-05, "loss": 0.0383, "step": 8220 }, { "epoch": 7.890699904122723, "grad_norm": 0.2565036714076996, "learning_rate": 6.496172759665357e-05, "loss": 0.039, "step": 8230 }, { "epoch": 7.900287631831256, "grad_norm": 0.36820822954177856, "learning_rate": 6.487866775840141e-05, "loss": 0.0373, "step": 8240 }, { "epoch": 7.9098753595397895, "grad_norm": 0.26671302318573, "learning_rate": 6.479556284789608e-05, "loss": 0.0339, "step": 8250 }, { "epoch": 7.919463087248322, "grad_norm": 0.3026654124259949, "learning_rate": 6.471241311688894e-05, "loss": 0.0363, "step": 8260 }, { "epoch": 7.929050814956855, "grad_norm": 0.24896202981472015, "learning_rate": 6.46292188172672e-05, "loss": 0.0394, "step": 8270 }, { "epoch": 7.938638542665388, "grad_norm": 0.3126719892024994, "learning_rate": 6.454598020105306e-05, "loss": 0.0439, "step": 8280 }, { "epoch": 7.9482262703739215, "grad_norm": 0.33165302872657776, "learning_rate": 6.446269752040295e-05, "loss": 0.0393, "step": 8290 }, { "epoch": 7.957813998082455, "grad_norm": 0.6648756265640259, "learning_rate": 6.437937102760682e-05, "loss": 0.0356, "step": 8300 }, { "epoch": 7.967401725790987, "grad_norm": 0.24022682011127472, "learning_rate": 6.429600097508732e-05, "loss": 0.0406, "step": 8310 }, { "epoch": 7.97698945349952, "grad_norm": 1.2279690504074097, "learning_rate": 6.421258761539904e-05, "loss": 0.0434, "step": 8320 }, { "epoch": 7.9865771812080535, "grad_norm": 0.2868311107158661, "learning_rate": 6.412913120122779e-05, "loss": 0.0372, "step": 8330 }, { "epoch": 7.996164908916587, "grad_norm": 0.25136950612068176, "learning_rate": 6.40456319853898e-05, "loss": 0.0405, "step": 8340 }, { "epoch": 8.00575263662512, "grad_norm": 0.3662584722042084, "learning_rate": 6.396209022083098e-05, "loss": 0.041, "step": 8350 }, { "epoch": 8.015340364333653, "grad_norm": 0.3134470283985138, "learning_rate": 6.387850616062605e-05, "loss": 0.0357, "step": 8360 }, { "epoch": 8.024928092042186, "grad_norm": 0.3947703540325165, "learning_rate": 6.379488005797797e-05, "loss": 0.0384, "step": 8370 }, { "epoch": 8.03451581975072, "grad_norm": 0.3272991478443146, "learning_rate": 6.371121216621698e-05, "loss": 0.0392, "step": 8380 }, { "epoch": 8.044103547459253, "grad_norm": 1.1089465618133545, "learning_rate": 6.362750273879996e-05, "loss": 0.047, "step": 8390 }, { "epoch": 8.053691275167786, "grad_norm": 0.2133249044418335, "learning_rate": 6.354375202930958e-05, "loss": 0.0333, "step": 8400 }, { "epoch": 8.063279002876317, "grad_norm": 0.3814240097999573, "learning_rate": 6.345996029145356e-05, "loss": 0.0419, "step": 8410 }, { "epoch": 8.07286673058485, "grad_norm": 0.38257062435150146, "learning_rate": 6.337612777906398e-05, "loss": 0.0412, "step": 8420 }, { "epoch": 8.082454458293384, "grad_norm": 0.20826545357704163, "learning_rate": 6.329225474609633e-05, "loss": 0.0402, "step": 8430 }, { "epoch": 8.092042186001917, "grad_norm": 0.2289332151412964, "learning_rate": 6.320834144662897e-05, "loss": 0.0392, "step": 8440 }, { "epoch": 8.10162991371045, "grad_norm": 0.29565075039863586, "learning_rate": 6.312438813486211e-05, "loss": 0.0347, "step": 8450 }, { "epoch": 8.111217641418984, "grad_norm": 0.21872690320014954, "learning_rate": 6.30403950651173e-05, "loss": 0.0357, "step": 8460 }, { "epoch": 8.120805369127517, "grad_norm": 0.24760524928569794, "learning_rate": 6.295636249183643e-05, "loss": 0.0331, "step": 8470 }, { "epoch": 8.13039309683605, "grad_norm": 0.2806303799152374, "learning_rate": 6.287229066958113e-05, "loss": 0.0393, "step": 8480 }, { "epoch": 8.139980824544583, "grad_norm": 0.45841529965400696, "learning_rate": 6.278817985303184e-05, "loss": 0.0434, "step": 8490 }, { "epoch": 8.149568552253116, "grad_norm": 0.21284928917884827, "learning_rate": 6.270403029698722e-05, "loss": 0.0311, "step": 8500 }, { "epoch": 8.15915627996165, "grad_norm": 0.312191367149353, "learning_rate": 6.261984225636324e-05, "loss": 0.0409, "step": 8510 }, { "epoch": 8.168744007670183, "grad_norm": 0.38339605927467346, "learning_rate": 6.253561598619247e-05, "loss": 0.0367, "step": 8520 }, { "epoch": 8.178331735378714, "grad_norm": 0.24168361723423004, "learning_rate": 6.245135174162323e-05, "loss": 0.0419, "step": 8530 }, { "epoch": 8.187919463087248, "grad_norm": 0.3038835823535919, "learning_rate": 6.236704977791898e-05, "loss": 0.0349, "step": 8540 }, { "epoch": 8.19750719079578, "grad_norm": 0.32537156343460083, "learning_rate": 6.228271035045735e-05, "loss": 0.0347, "step": 8550 }, { "epoch": 8.207094918504314, "grad_norm": 0.2789401412010193, "learning_rate": 6.21983337147295e-05, "loss": 0.0339, "step": 8560 }, { "epoch": 8.216682646212847, "grad_norm": 0.4282236397266388, "learning_rate": 6.211392012633932e-05, "loss": 0.0352, "step": 8570 }, { "epoch": 8.22627037392138, "grad_norm": 0.3608817458152771, "learning_rate": 6.202946984100261e-05, "loss": 0.0373, "step": 8580 }, { "epoch": 8.235858101629914, "grad_norm": 0.29480835795402527, "learning_rate": 6.194498311454636e-05, "loss": 0.0321, "step": 8590 }, { "epoch": 8.245445829338447, "grad_norm": 0.27964943647384644, "learning_rate": 6.186046020290792e-05, "loss": 0.0428, "step": 8600 }, { "epoch": 8.25503355704698, "grad_norm": 0.2138575315475464, "learning_rate": 6.177590136213429e-05, "loss": 0.0344, "step": 8610 }, { "epoch": 8.264621284755513, "grad_norm": 0.3693723678588867, "learning_rate": 6.169130684838132e-05, "loss": 0.0449, "step": 8620 }, { "epoch": 8.274209012464047, "grad_norm": 0.24271826446056366, "learning_rate": 6.160667691791287e-05, "loss": 0.0414, "step": 8630 }, { "epoch": 8.28379674017258, "grad_norm": 0.27349698543548584, "learning_rate": 6.152201182710016e-05, "loss": 0.0437, "step": 8640 }, { "epoch": 8.293384467881111, "grad_norm": 0.265661358833313, "learning_rate": 6.143731183242085e-05, "loss": 0.0402, "step": 8650 }, { "epoch": 8.302972195589644, "grad_norm": 0.3084318935871124, "learning_rate": 6.13525771904584e-05, "loss": 0.0424, "step": 8660 }, { "epoch": 8.312559923298178, "grad_norm": 0.42005741596221924, "learning_rate": 6.126780815790116e-05, "loss": 0.0386, "step": 8670 }, { "epoch": 8.322147651006711, "grad_norm": 0.349277526140213, "learning_rate": 6.118300499154174e-05, "loss": 0.0355, "step": 8680 }, { "epoch": 8.331735378715244, "grad_norm": 0.3930281102657318, "learning_rate": 6.109816794827607e-05, "loss": 0.0386, "step": 8690 }, { "epoch": 8.341323106423777, "grad_norm": 0.2631587088108063, "learning_rate": 6.101329728510278e-05, "loss": 0.0376, "step": 8700 }, { "epoch": 8.35091083413231, "grad_norm": 0.3070177137851715, "learning_rate": 6.0928393259122285e-05, "loss": 0.039, "step": 8710 }, { "epoch": 8.360498561840844, "grad_norm": 0.3494318425655365, "learning_rate": 6.084345612753611e-05, "loss": 0.0405, "step": 8720 }, { "epoch": 8.370086289549377, "grad_norm": 0.2996184825897217, "learning_rate": 6.0758486147646035e-05, "loss": 0.0386, "step": 8730 }, { "epoch": 8.37967401725791, "grad_norm": 0.39091756939888, "learning_rate": 6.0673483576853365e-05, "loss": 0.038, "step": 8740 }, { "epoch": 8.389261744966444, "grad_norm": 0.28855571150779724, "learning_rate": 6.0588448672658125e-05, "loss": 0.0403, "step": 8750 }, { "epoch": 8.398849472674977, "grad_norm": 0.25725746154785156, "learning_rate": 6.05033816926583e-05, "loss": 0.0338, "step": 8760 }, { "epoch": 8.40843720038351, "grad_norm": 0.2737105190753937, "learning_rate": 6.041828289454903e-05, "loss": 0.0417, "step": 8770 }, { "epoch": 8.418024928092041, "grad_norm": 0.3197145462036133, "learning_rate": 6.033315253612186e-05, "loss": 0.0428, "step": 8780 }, { "epoch": 8.427612655800575, "grad_norm": 0.35713446140289307, "learning_rate": 6.0247990875263914e-05, "loss": 0.0376, "step": 8790 }, { "epoch": 8.437200383509108, "grad_norm": 0.354390949010849, "learning_rate": 6.016279816995718e-05, "loss": 0.0384, "step": 8800 }, { "epoch": 8.446788111217641, "grad_norm": 0.31738895177841187, "learning_rate": 6.0077574678277636e-05, "loss": 0.048, "step": 8810 }, { "epoch": 8.456375838926174, "grad_norm": 0.28505873680114746, "learning_rate": 5.999232065839456e-05, "loss": 0.0353, "step": 8820 }, { "epoch": 8.465963566634708, "grad_norm": 0.3551139831542969, "learning_rate": 5.990703636856974e-05, "loss": 0.0422, "step": 8830 }, { "epoch": 8.47555129434324, "grad_norm": 0.23753251135349274, "learning_rate": 5.982172206715656e-05, "loss": 0.0356, "step": 8840 }, { "epoch": 8.485139022051774, "grad_norm": 0.3025340735912323, "learning_rate": 5.973637801259944e-05, "loss": 0.0416, "step": 8850 }, { "epoch": 8.494726749760307, "grad_norm": 0.3358081579208374, "learning_rate": 5.9651004463432826e-05, "loss": 0.0406, "step": 8860 }, { "epoch": 8.50431447746884, "grad_norm": 0.2748364508152008, "learning_rate": 5.95656016782806e-05, "loss": 0.0355, "step": 8870 }, { "epoch": 8.513902205177374, "grad_norm": 0.27150842547416687, "learning_rate": 5.948016991585514e-05, "loss": 0.0356, "step": 8880 }, { "epoch": 8.523489932885907, "grad_norm": 0.2812124490737915, "learning_rate": 5.9394709434956664e-05, "loss": 0.0419, "step": 8890 }, { "epoch": 8.53307766059444, "grad_norm": 0.29283568263053894, "learning_rate": 5.9309220494472314e-05, "loss": 0.0408, "step": 8900 }, { "epoch": 8.542665388302972, "grad_norm": 0.4069705605506897, "learning_rate": 5.9223703353375534e-05, "loss": 0.0425, "step": 8910 }, { "epoch": 8.552253116011505, "grad_norm": 0.2776540219783783, "learning_rate": 5.913815827072513e-05, "loss": 0.0365, "step": 8920 }, { "epoch": 8.561840843720038, "grad_norm": 0.2777857482433319, "learning_rate": 5.905258550566458e-05, "loss": 0.0368, "step": 8930 }, { "epoch": 8.571428571428571, "grad_norm": 0.3018902838230133, "learning_rate": 5.896698531742122e-05, "loss": 0.0377, "step": 8940 }, { "epoch": 8.581016299137104, "grad_norm": 0.622887134552002, "learning_rate": 5.888135796530544e-05, "loss": 0.0448, "step": 8950 }, { "epoch": 8.590604026845638, "grad_norm": 0.28407829999923706, "learning_rate": 5.879570370870995e-05, "loss": 0.0373, "step": 8960 }, { "epoch": 8.60019175455417, "grad_norm": 0.2791987955570221, "learning_rate": 5.871002280710892e-05, "loss": 0.0402, "step": 8970 }, { "epoch": 8.609779482262704, "grad_norm": 0.27533990144729614, "learning_rate": 5.862431552005729e-05, "loss": 0.0434, "step": 8980 }, { "epoch": 8.619367209971237, "grad_norm": 0.27701878547668457, "learning_rate": 5.85385821071899e-05, "loss": 0.0383, "step": 8990 }, { "epoch": 8.62895493767977, "grad_norm": 0.269197016954422, "learning_rate": 5.845282282822071e-05, "loss": 0.0389, "step": 9000 }, { "epoch": 8.638542665388304, "grad_norm": 0.3775997757911682, "learning_rate": 5.836703794294208e-05, "loss": 0.0401, "step": 9010 }, { "epoch": 8.648130393096835, "grad_norm": 0.21519199013710022, "learning_rate": 5.828122771122392e-05, "loss": 0.0326, "step": 9020 }, { "epoch": 8.657718120805368, "grad_norm": 0.4001868963241577, "learning_rate": 5.819539239301291e-05, "loss": 0.04, "step": 9030 }, { "epoch": 8.667305848513902, "grad_norm": 0.19594238698482513, "learning_rate": 5.810953224833177e-05, "loss": 0.0301, "step": 9040 }, { "epoch": 8.676893576222435, "grad_norm": 0.19823068380355835, "learning_rate": 5.802364753727836e-05, "loss": 0.0344, "step": 9050 }, { "epoch": 8.686481303930968, "grad_norm": 0.26146700978279114, "learning_rate": 5.793773852002502e-05, "loss": 0.0444, "step": 9060 }, { "epoch": 8.696069031639501, "grad_norm": 0.36863768100738525, "learning_rate": 5.7851805456817677e-05, "loss": 0.0364, "step": 9070 }, { "epoch": 8.705656759348035, "grad_norm": 0.2518344521522522, "learning_rate": 5.7765848607975136e-05, "loss": 0.0394, "step": 9080 }, { "epoch": 8.715244487056568, "grad_norm": 0.2473488301038742, "learning_rate": 5.767986823388825e-05, "loss": 0.0326, "step": 9090 }, { "epoch": 8.724832214765101, "grad_norm": 0.20669348537921906, "learning_rate": 5.7593864595019096e-05, "loss": 0.0408, "step": 9100 }, { "epoch": 8.734419942473634, "grad_norm": 0.32804393768310547, "learning_rate": 5.750783795190029e-05, "loss": 0.0388, "step": 9110 }, { "epoch": 8.744007670182167, "grad_norm": 0.18472160398960114, "learning_rate": 5.7421788565134074e-05, "loss": 0.0395, "step": 9120 }, { "epoch": 8.7535953978907, "grad_norm": 0.3553003668785095, "learning_rate": 5.733571669539167e-05, "loss": 0.0432, "step": 9130 }, { "epoch": 8.763183125599234, "grad_norm": 0.2398902177810669, "learning_rate": 5.72496226034123e-05, "loss": 0.0354, "step": 9140 }, { "epoch": 8.772770853307765, "grad_norm": 0.2900802195072174, "learning_rate": 5.716350655000261e-05, "loss": 0.0449, "step": 9150 }, { "epoch": 8.782358581016299, "grad_norm": 0.17919373512268066, "learning_rate": 5.707736879603568e-05, "loss": 0.0413, "step": 9160 }, { "epoch": 8.791946308724832, "grad_norm": 0.2598424255847931, "learning_rate": 5.6991209602450424e-05, "loss": 0.0432, "step": 9170 }, { "epoch": 8.801534036433365, "grad_norm": 0.4794408082962036, "learning_rate": 5.69050292302506e-05, "loss": 0.0392, "step": 9180 }, { "epoch": 8.811121764141898, "grad_norm": 0.3420094847679138, "learning_rate": 5.6818827940504225e-05, "loss": 0.0335, "step": 9190 }, { "epoch": 8.820709491850431, "grad_norm": 1.9920908212661743, "learning_rate": 5.673260599434259e-05, "loss": 0.0427, "step": 9200 }, { "epoch": 8.830297219558965, "grad_norm": 0.28250133991241455, "learning_rate": 5.664636365295965e-05, "loss": 0.0349, "step": 9210 }, { "epoch": 8.839884947267498, "grad_norm": 0.22743001580238342, "learning_rate": 5.656010117761105e-05, "loss": 0.0401, "step": 9220 }, { "epoch": 8.849472674976031, "grad_norm": 0.2771368622779846, "learning_rate": 5.647381882961349e-05, "loss": 0.0424, "step": 9230 }, { "epoch": 8.859060402684564, "grad_norm": 0.38394448161125183, "learning_rate": 5.638751687034387e-05, "loss": 0.0357, "step": 9240 }, { "epoch": 8.868648130393098, "grad_norm": 0.22416839003562927, "learning_rate": 5.630119556123848e-05, "loss": 0.0347, "step": 9250 }, { "epoch": 8.87823585810163, "grad_norm": 0.1746525913476944, "learning_rate": 5.6214855163792224e-05, "loss": 0.0366, "step": 9260 }, { "epoch": 8.887823585810162, "grad_norm": 0.26215359568595886, "learning_rate": 5.6128495939557835e-05, "loss": 0.0411, "step": 9270 }, { "epoch": 8.897411313518695, "grad_norm": 0.3498288691043854, "learning_rate": 5.604211815014509e-05, "loss": 0.0404, "step": 9280 }, { "epoch": 8.906999041227229, "grad_norm": 0.19935335218906403, "learning_rate": 5.595572205721999e-05, "loss": 0.0356, "step": 9290 }, { "epoch": 8.916586768935762, "grad_norm": 0.3347182869911194, "learning_rate": 5.5869307922504e-05, "loss": 0.0393, "step": 9300 }, { "epoch": 8.926174496644295, "grad_norm": 0.3638782203197479, "learning_rate": 5.578287600777321e-05, "loss": 0.0324, "step": 9310 }, { "epoch": 8.935762224352828, "grad_norm": 0.2433633953332901, "learning_rate": 5.569642657485761e-05, "loss": 0.0351, "step": 9320 }, { "epoch": 8.945349952061362, "grad_norm": 0.2311711609363556, "learning_rate": 5.560995988564023e-05, "loss": 0.0386, "step": 9330 }, { "epoch": 8.954937679769895, "grad_norm": 0.2803432047367096, "learning_rate": 5.552347620205638e-05, "loss": 0.0461, "step": 9340 }, { "epoch": 8.964525407478428, "grad_norm": 0.25586047768592834, "learning_rate": 5.5436975786092873e-05, "loss": 0.0384, "step": 9350 }, { "epoch": 8.974113135186961, "grad_norm": 0.3626959025859833, "learning_rate": 5.535045889978717e-05, "loss": 0.0374, "step": 9360 }, { "epoch": 8.983700862895494, "grad_norm": 0.3548148572444916, "learning_rate": 5.526392580522666e-05, "loss": 0.0416, "step": 9370 }, { "epoch": 8.993288590604028, "grad_norm": 2.09843111038208, "learning_rate": 5.5177376764547814e-05, "loss": 0.0434, "step": 9380 }, { "epoch": 9.002876318312559, "grad_norm": 0.4216479957103729, "learning_rate": 5.5090812039935426e-05, "loss": 0.0404, "step": 9390 }, { "epoch": 9.012464046021092, "grad_norm": 0.292222261428833, "learning_rate": 5.5004231893621774e-05, "loss": 0.0362, "step": 9400 }, { "epoch": 9.022051773729626, "grad_norm": 0.37306836247444153, "learning_rate": 5.491763658788589e-05, "loss": 0.0367, "step": 9410 }, { "epoch": 9.031639501438159, "grad_norm": 0.2755350172519684, "learning_rate": 5.483102638505269e-05, "loss": 0.0401, "step": 9420 }, { "epoch": 9.041227229146692, "grad_norm": 0.2616848349571228, "learning_rate": 5.4744401547492254e-05, "loss": 0.0337, "step": 9430 }, { "epoch": 9.050814956855225, "grad_norm": 0.28111451864242554, "learning_rate": 5.465776233761896e-05, "loss": 0.0384, "step": 9440 }, { "epoch": 9.060402684563758, "grad_norm": 0.23586216568946838, "learning_rate": 5.4571109017890753e-05, "loss": 0.0405, "step": 9450 }, { "epoch": 9.069990412272292, "grad_norm": 0.3019304871559143, "learning_rate": 5.44844418508083e-05, "loss": 0.0389, "step": 9460 }, { "epoch": 9.079578139980825, "grad_norm": 0.3531333804130554, "learning_rate": 5.4397761098914254e-05, "loss": 0.0334, "step": 9470 }, { "epoch": 9.089165867689358, "grad_norm": 0.40830254554748535, "learning_rate": 5.431106702479235e-05, "loss": 0.0357, "step": 9480 }, { "epoch": 9.098753595397891, "grad_norm": 0.44957104325294495, "learning_rate": 5.4224359891066765e-05, "loss": 0.039, "step": 9490 }, { "epoch": 9.108341323106425, "grad_norm": 0.6519899964332581, "learning_rate": 5.413763996040117e-05, "loss": 0.0402, "step": 9500 }, { "epoch": 9.117929050814958, "grad_norm": 0.4034676253795624, "learning_rate": 5.405090749549804e-05, "loss": 0.0459, "step": 9510 }, { "epoch": 9.12751677852349, "grad_norm": 0.3996933698654175, "learning_rate": 5.396416275909779e-05, "loss": 0.0398, "step": 9520 }, { "epoch": 9.137104506232022, "grad_norm": 0.16408595442771912, "learning_rate": 5.387740601397806e-05, "loss": 0.0358, "step": 9530 }, { "epoch": 9.146692233940556, "grad_norm": 0.3471783995628357, "learning_rate": 5.379063752295282e-05, "loss": 0.0391, "step": 9540 }, { "epoch": 9.156279961649089, "grad_norm": 0.4107268452644348, "learning_rate": 5.370385754887164e-05, "loss": 0.0424, "step": 9550 }, { "epoch": 9.165867689357622, "grad_norm": 0.32927405834198, "learning_rate": 5.3617066354618874e-05, "loss": 0.0453, "step": 9560 }, { "epoch": 9.175455417066155, "grad_norm": 0.41520607471466064, "learning_rate": 5.3530264203112856e-05, "loss": 0.0392, "step": 9570 }, { "epoch": 9.185043144774689, "grad_norm": 0.3985765278339386, "learning_rate": 5.344345135730513e-05, "loss": 0.0364, "step": 9580 }, { "epoch": 9.194630872483222, "grad_norm": 0.344056099653244, "learning_rate": 5.335662808017964e-05, "loss": 0.0444, "step": 9590 }, { "epoch": 9.204218600191755, "grad_norm": 0.3382169008255005, "learning_rate": 5.32697946347519e-05, "loss": 0.0375, "step": 9600 }, { "epoch": 9.213806327900288, "grad_norm": 0.3668196499347687, "learning_rate": 5.318295128406825e-05, "loss": 0.0427, "step": 9610 }, { "epoch": 9.223394055608821, "grad_norm": 0.22777938842773438, "learning_rate": 5.3096098291205044e-05, "loss": 0.0362, "step": 9620 }, { "epoch": 9.232981783317355, "grad_norm": 0.2992532551288605, "learning_rate": 5.300923591926783e-05, "loss": 0.0344, "step": 9630 }, { "epoch": 9.242569511025886, "grad_norm": 0.2733289301395416, "learning_rate": 5.292236443139056e-05, "loss": 0.0318, "step": 9640 }, { "epoch": 9.25215723873442, "grad_norm": 0.2972942292690277, "learning_rate": 5.283548409073482e-05, "loss": 0.0357, "step": 9650 }, { "epoch": 9.261744966442953, "grad_norm": 0.3721420466899872, "learning_rate": 5.274859516048901e-05, "loss": 0.0356, "step": 9660 }, { "epoch": 9.271332694151486, "grad_norm": 0.13791558146476746, "learning_rate": 5.266169790386756e-05, "loss": 0.0345, "step": 9670 }, { "epoch": 9.280920421860019, "grad_norm": 0.2645628750324249, "learning_rate": 5.257479258411008e-05, "loss": 0.0426, "step": 9680 }, { "epoch": 9.290508149568552, "grad_norm": 0.3136797845363617, "learning_rate": 5.248787946448065e-05, "loss": 0.0354, "step": 9690 }, { "epoch": 9.300095877277085, "grad_norm": 0.25481873750686646, "learning_rate": 5.240095880826695e-05, "loss": 0.0401, "step": 9700 }, { "epoch": 9.309683604985619, "grad_norm": 0.24243059754371643, "learning_rate": 5.231403087877955e-05, "loss": 0.0422, "step": 9710 }, { "epoch": 9.319271332694152, "grad_norm": 0.22734355926513672, "learning_rate": 5.2227095939350966e-05, "loss": 0.0409, "step": 9720 }, { "epoch": 9.328859060402685, "grad_norm": 0.35372641682624817, "learning_rate": 5.214015425333502e-05, "loss": 0.0413, "step": 9730 }, { "epoch": 9.338446788111218, "grad_norm": 0.2218106985092163, "learning_rate": 5.205320608410591e-05, "loss": 0.0385, "step": 9740 }, { "epoch": 9.348034515819752, "grad_norm": 0.8550918698310852, "learning_rate": 5.196625169505755e-05, "loss": 0.0383, "step": 9750 }, { "epoch": 9.357622243528283, "grad_norm": 0.325469434261322, "learning_rate": 5.18792913496026e-05, "loss": 0.0377, "step": 9760 }, { "epoch": 9.367209971236816, "grad_norm": 0.2887977063655853, "learning_rate": 5.1792325311171875e-05, "loss": 0.039, "step": 9770 }, { "epoch": 9.37679769894535, "grad_norm": 0.267398476600647, "learning_rate": 5.1705353843213336e-05, "loss": 0.0351, "step": 9780 }, { "epoch": 9.386385426653883, "grad_norm": 0.3469073176383972, "learning_rate": 5.1618377209191447e-05, "loss": 0.0373, "step": 9790 }, { "epoch": 9.395973154362416, "grad_norm": 0.399781733751297, "learning_rate": 5.1531395672586314e-05, "loss": 0.0345, "step": 9800 }, { "epoch": 9.405560882070949, "grad_norm": 0.3050326704978943, "learning_rate": 5.144440949689287e-05, "loss": 0.0436, "step": 9810 }, { "epoch": 9.415148609779482, "grad_norm": 0.22124247252941132, "learning_rate": 5.135741894562014e-05, "loss": 0.0384, "step": 9820 }, { "epoch": 9.424736337488016, "grad_norm": 0.32914167642593384, "learning_rate": 5.127042428229036e-05, "loss": 0.0395, "step": 9830 }, { "epoch": 9.434324065196549, "grad_norm": 0.302157998085022, "learning_rate": 5.118342577043829e-05, "loss": 0.0446, "step": 9840 }, { "epoch": 9.443911792905082, "grad_norm": 0.29756733775138855, "learning_rate": 5.1096423673610246e-05, "loss": 0.035, "step": 9850 }, { "epoch": 9.453499520613615, "grad_norm": 0.21626603603363037, "learning_rate": 5.100941825536353e-05, "loss": 0.0487, "step": 9860 }, { "epoch": 9.463087248322148, "grad_norm": 0.31502407789230347, "learning_rate": 5.092240977926538e-05, "loss": 0.0384, "step": 9870 }, { "epoch": 9.47267497603068, "grad_norm": 0.3153168261051178, "learning_rate": 5.083539850889239e-05, "loss": 0.0377, "step": 9880 }, { "epoch": 9.482262703739213, "grad_norm": 0.3235209584236145, "learning_rate": 5.074838470782957e-05, "loss": 0.0402, "step": 9890 }, { "epoch": 9.491850431447746, "grad_norm": 0.4194275438785553, "learning_rate": 5.066136863966963e-05, "loss": 0.0349, "step": 9900 }, { "epoch": 9.50143815915628, "grad_norm": 0.26690346002578735, "learning_rate": 5.0574350568012086e-05, "loss": 0.037, "step": 9910 }, { "epoch": 9.511025886864813, "grad_norm": 0.3191596567630768, "learning_rate": 5.0487330756462624e-05, "loss": 0.0427, "step": 9920 }, { "epoch": 9.520613614573346, "grad_norm": 0.21837887167930603, "learning_rate": 5.040030946863209e-05, "loss": 0.031, "step": 9930 }, { "epoch": 9.53020134228188, "grad_norm": 0.28201964497566223, "learning_rate": 5.0313286968135884e-05, "loss": 0.0348, "step": 9940 }, { "epoch": 9.539789069990412, "grad_norm": 0.6378640532493591, "learning_rate": 5.022626351859305e-05, "loss": 0.0392, "step": 9950 }, { "epoch": 9.549376797698946, "grad_norm": 0.27877506613731384, "learning_rate": 5.01392393836255e-05, "loss": 0.0435, "step": 9960 }, { "epoch": 9.558964525407479, "grad_norm": 0.21583925187587738, "learning_rate": 5.0052214826857225e-05, "loss": 0.036, "step": 9970 }, { "epoch": 9.568552253116012, "grad_norm": 0.3575581908226013, "learning_rate": 4.996519011191351e-05, "loss": 0.0344, "step": 9980 }, { "epoch": 9.578139980824545, "grad_norm": 0.2446652501821518, "learning_rate": 4.9878165502420104e-05, "loss": 0.0382, "step": 9990 }, { "epoch": 9.587727708533077, "grad_norm": 0.1690993756055832, "learning_rate": 4.979114126200244e-05, "loss": 0.0392, "step": 10000 }, { "epoch": 9.59731543624161, "grad_norm": 0.3892661929130554, "learning_rate": 4.970411765428484e-05, "loss": 0.0366, "step": 10010 }, { "epoch": 9.606903163950143, "grad_norm": 0.26752811670303345, "learning_rate": 4.961709494288966e-05, "loss": 0.0377, "step": 10020 }, { "epoch": 9.616490891658676, "grad_norm": 0.3104531466960907, "learning_rate": 4.9530073391436654e-05, "loss": 0.0371, "step": 10030 }, { "epoch": 9.62607861936721, "grad_norm": 0.3081854283809662, "learning_rate": 4.944305326354194e-05, "loss": 0.0377, "step": 10040 }, { "epoch": 9.635666347075743, "grad_norm": 0.32180699706077576, "learning_rate": 4.935603482281739e-05, "loss": 0.0364, "step": 10050 }, { "epoch": 9.645254074784276, "grad_norm": 0.30046379566192627, "learning_rate": 4.926901833286974e-05, "loss": 0.0341, "step": 10060 }, { "epoch": 9.65484180249281, "grad_norm": 0.24152809381484985, "learning_rate": 4.918200405729986e-05, "loss": 0.0453, "step": 10070 }, { "epoch": 9.664429530201343, "grad_norm": 0.8806717395782471, "learning_rate": 4.909499225970184e-05, "loss": 0.0352, "step": 10080 }, { "epoch": 9.674017257909876, "grad_norm": 0.3561595380306244, "learning_rate": 4.9007983203662326e-05, "loss": 0.0337, "step": 10090 }, { "epoch": 9.683604985618409, "grad_norm": 0.3623135983943939, "learning_rate": 4.892097715275961e-05, "loss": 0.0361, "step": 10100 }, { "epoch": 9.693192713326942, "grad_norm": 0.3282937705516815, "learning_rate": 4.883397437056293e-05, "loss": 0.0357, "step": 10110 }, { "epoch": 9.702780441035475, "grad_norm": 0.28583481907844543, "learning_rate": 4.87469751206316e-05, "loss": 0.032, "step": 10120 }, { "epoch": 9.712368168744007, "grad_norm": 0.20011906325817108, "learning_rate": 4.865997966651421e-05, "loss": 0.0335, "step": 10130 }, { "epoch": 9.72195589645254, "grad_norm": 0.23072586953639984, "learning_rate": 4.857298827174787e-05, "loss": 0.0326, "step": 10140 }, { "epoch": 9.731543624161073, "grad_norm": 0.21280129253864288, "learning_rate": 4.8486001199857416e-05, "loss": 0.0354, "step": 10150 }, { "epoch": 9.741131351869607, "grad_norm": 0.4237668812274933, "learning_rate": 4.839901871435457e-05, "loss": 0.0351, "step": 10160 }, { "epoch": 9.75071907957814, "grad_norm": 0.2798875868320465, "learning_rate": 4.831204107873713e-05, "loss": 0.0353, "step": 10170 }, { "epoch": 9.760306807286673, "grad_norm": 0.20780718326568604, "learning_rate": 4.822506855648825e-05, "loss": 0.0326, "step": 10180 }, { "epoch": 9.769894534995206, "grad_norm": 0.2649904489517212, "learning_rate": 4.8138101411075574e-05, "loss": 0.035, "step": 10190 }, { "epoch": 9.77948226270374, "grad_norm": 0.26445141434669495, "learning_rate": 4.805113990595046e-05, "loss": 0.0468, "step": 10200 }, { "epoch": 9.789069990412273, "grad_norm": 0.3209472894668579, "learning_rate": 4.796418430454718e-05, "loss": 0.0375, "step": 10210 }, { "epoch": 9.798657718120806, "grad_norm": 0.19877949357032776, "learning_rate": 4.787723487028209e-05, "loss": 0.0381, "step": 10220 }, { "epoch": 9.808245445829339, "grad_norm": 0.3071509301662445, "learning_rate": 4.779029186655292e-05, "loss": 0.0432, "step": 10230 }, { "epoch": 9.817833173537872, "grad_norm": 0.4730135500431061, "learning_rate": 4.77033555567379e-05, "loss": 0.0374, "step": 10240 }, { "epoch": 9.827420901246404, "grad_norm": 0.29888778924942017, "learning_rate": 4.761642620419497e-05, "loss": 0.0357, "step": 10250 }, { "epoch": 9.837008628954937, "grad_norm": 0.2550467550754547, "learning_rate": 4.7529504072260974e-05, "loss": 0.0309, "step": 10260 }, { "epoch": 9.84659635666347, "grad_norm": 0.25972646474838257, "learning_rate": 4.744258942425094e-05, "loss": 0.0421, "step": 10270 }, { "epoch": 9.856184084372003, "grad_norm": 0.4071574807167053, "learning_rate": 4.735568252345718e-05, "loss": 0.0351, "step": 10280 }, { "epoch": 9.865771812080537, "grad_norm": 0.4687805771827698, "learning_rate": 4.726878363314855e-05, "loss": 0.0369, "step": 10290 }, { "epoch": 9.87535953978907, "grad_norm": 0.41865023970603943, "learning_rate": 4.718189301656962e-05, "loss": 0.0345, "step": 10300 }, { "epoch": 9.884947267497603, "grad_norm": 0.30435627698898315, "learning_rate": 4.709501093693997e-05, "loss": 0.0321, "step": 10310 }, { "epoch": 9.894534995206136, "grad_norm": 0.3561161458492279, "learning_rate": 4.7008137657453214e-05, "loss": 0.0409, "step": 10320 }, { "epoch": 9.90412272291467, "grad_norm": 0.36440134048461914, "learning_rate": 4.692127344127637e-05, "loss": 0.033, "step": 10330 }, { "epoch": 9.913710450623203, "grad_norm": 0.26994454860687256, "learning_rate": 4.683441855154899e-05, "loss": 0.0346, "step": 10340 }, { "epoch": 9.923298178331736, "grad_norm": 0.2506847381591797, "learning_rate": 4.674757325138239e-05, "loss": 0.0314, "step": 10350 }, { "epoch": 9.93288590604027, "grad_norm": 0.20864498615264893, "learning_rate": 4.666073780385879e-05, "loss": 0.0366, "step": 10360 }, { "epoch": 9.9424736337488, "grad_norm": 0.18419000506401062, "learning_rate": 4.65739124720306e-05, "loss": 0.0329, "step": 10370 }, { "epoch": 9.952061361457334, "grad_norm": 0.3387259244918823, "learning_rate": 4.648709751891957e-05, "loss": 0.0381, "step": 10380 }, { "epoch": 9.961649089165867, "grad_norm": 0.2119244635105133, "learning_rate": 4.640029320751606e-05, "loss": 0.0351, "step": 10390 }, { "epoch": 9.9712368168744, "grad_norm": 0.4716765880584717, "learning_rate": 4.63134998007781e-05, "loss": 0.0378, "step": 10400 }, { "epoch": 9.980824544582934, "grad_norm": 0.47296905517578125, "learning_rate": 4.622671756163075e-05, "loss": 0.0397, "step": 10410 }, { "epoch": 9.990412272291467, "grad_norm": 0.3720930218696594, "learning_rate": 4.6139946752965216e-05, "loss": 0.0387, "step": 10420 }, { "epoch": 10.0, "grad_norm": 0.2873878479003906, "learning_rate": 4.6053187637638115e-05, "loss": 0.0336, "step": 10430 }, { "epoch": 10.009587727708533, "grad_norm": 0.27077776193618774, "learning_rate": 4.596644047847061e-05, "loss": 0.0335, "step": 10440 }, { "epoch": 10.019175455417066, "grad_norm": 0.29882556200027466, "learning_rate": 4.587970553824762e-05, "loss": 0.0329, "step": 10450 }, { "epoch": 10.0287631831256, "grad_norm": 0.23539794981479645, "learning_rate": 4.579298307971709e-05, "loss": 0.0319, "step": 10460 }, { "epoch": 10.038350910834133, "grad_norm": 0.47081291675567627, "learning_rate": 4.570627336558915e-05, "loss": 0.0448, "step": 10470 }, { "epoch": 10.047938638542666, "grad_norm": 0.21392913162708282, "learning_rate": 4.561957665853532e-05, "loss": 0.0406, "step": 10480 }, { "epoch": 10.0575263662512, "grad_norm": 0.31942254304885864, "learning_rate": 4.553289322118769e-05, "loss": 0.0347, "step": 10490 }, { "epoch": 10.06711409395973, "grad_norm": 0.22749362885951996, "learning_rate": 4.544622331613817e-05, "loss": 0.0414, "step": 10500 }, { "epoch": 10.076701821668264, "grad_norm": 0.24884119629859924, "learning_rate": 4.5359567205937706e-05, "loss": 0.0314, "step": 10510 }, { "epoch": 10.086289549376797, "grad_norm": 0.26897284388542175, "learning_rate": 4.527292515309541e-05, "loss": 0.0394, "step": 10520 }, { "epoch": 10.09587727708533, "grad_norm": 0.3579690158367157, "learning_rate": 4.518629742007786e-05, "loss": 0.0365, "step": 10530 }, { "epoch": 10.105465004793864, "grad_norm": 0.19811834394931793, "learning_rate": 4.509968426930817e-05, "loss": 0.0358, "step": 10540 }, { "epoch": 10.115052732502397, "grad_norm": 0.2834417223930359, "learning_rate": 4.501308596316537e-05, "loss": 0.0329, "step": 10550 }, { "epoch": 10.12464046021093, "grad_norm": 0.1813543736934662, "learning_rate": 4.492650276398347e-05, "loss": 0.0345, "step": 10560 }, { "epoch": 10.134228187919463, "grad_norm": 0.23895332217216492, "learning_rate": 4.483993493405075e-05, "loss": 0.0328, "step": 10570 }, { "epoch": 10.143815915627997, "grad_norm": 0.2329237461090088, "learning_rate": 4.475338273560886e-05, "loss": 0.0334, "step": 10580 }, { "epoch": 10.15340364333653, "grad_norm": 0.32786402106285095, "learning_rate": 4.466684643085223e-05, "loss": 0.0362, "step": 10590 }, { "epoch": 10.162991371045063, "grad_norm": 0.2858993709087372, "learning_rate": 4.458032628192699e-05, "loss": 0.0349, "step": 10600 }, { "epoch": 10.172579098753596, "grad_norm": 0.38395509123802185, "learning_rate": 4.449382255093044e-05, "loss": 0.0384, "step": 10610 }, { "epoch": 10.182166826462128, "grad_norm": 0.35513293743133545, "learning_rate": 4.440733549991006e-05, "loss": 0.0317, "step": 10620 }, { "epoch": 10.191754554170661, "grad_norm": 0.21551890671253204, "learning_rate": 4.432086539086292e-05, "loss": 0.0373, "step": 10630 }, { "epoch": 10.201342281879194, "grad_norm": 0.22998203337192535, "learning_rate": 4.423441248573463e-05, "loss": 0.0376, "step": 10640 }, { "epoch": 10.210930009587727, "grad_norm": 0.4294188618659973, "learning_rate": 4.4147977046418776e-05, "loss": 0.0356, "step": 10650 }, { "epoch": 10.22051773729626, "grad_norm": 0.2688153386116028, "learning_rate": 4.406155933475599e-05, "loss": 0.0364, "step": 10660 }, { "epoch": 10.230105465004794, "grad_norm": 0.39193832874298096, "learning_rate": 4.3975159612533244e-05, "loss": 0.0337, "step": 10670 }, { "epoch": 10.239693192713327, "grad_norm": 0.4422641694545746, "learning_rate": 4.388877814148296e-05, "loss": 0.0328, "step": 10680 }, { "epoch": 10.24928092042186, "grad_norm": 0.25854796171188354, "learning_rate": 4.380241518328231e-05, "loss": 0.0338, "step": 10690 }, { "epoch": 10.258868648130393, "grad_norm": 0.282626748085022, "learning_rate": 4.371607099955236e-05, "loss": 0.0398, "step": 10700 }, { "epoch": 10.268456375838927, "grad_norm": 0.2568127512931824, "learning_rate": 4.362974585185734e-05, "loss": 0.0354, "step": 10710 }, { "epoch": 10.27804410354746, "grad_norm": 0.28798142075538635, "learning_rate": 4.3543440001703786e-05, "loss": 0.0354, "step": 10720 }, { "epoch": 10.287631831255993, "grad_norm": 0.28471261262893677, "learning_rate": 4.345715371053976e-05, "loss": 0.0365, "step": 10730 }, { "epoch": 10.297219558964525, "grad_norm": 0.27555039525032043, "learning_rate": 4.3370887239754085e-05, "loss": 0.0324, "step": 10740 }, { "epoch": 10.306807286673058, "grad_norm": 0.34258362650871277, "learning_rate": 4.328464085067559e-05, "loss": 0.0313, "step": 10750 }, { "epoch": 10.316395014381591, "grad_norm": 0.2875727117061615, "learning_rate": 4.319841480457221e-05, "loss": 0.034, "step": 10760 }, { "epoch": 10.325982742090124, "grad_norm": 0.37291842699050903, "learning_rate": 4.311220936265025e-05, "loss": 0.0358, "step": 10770 }, { "epoch": 10.335570469798657, "grad_norm": 0.28330934047698975, "learning_rate": 4.302602478605364e-05, "loss": 0.0371, "step": 10780 }, { "epoch": 10.34515819750719, "grad_norm": 0.2582619786262512, "learning_rate": 4.29398613358631e-05, "loss": 0.0373, "step": 10790 }, { "epoch": 10.354745925215724, "grad_norm": 0.4369192123413086, "learning_rate": 4.2853719273095306e-05, "loss": 0.035, "step": 10800 }, { "epoch": 10.364333652924257, "grad_norm": 0.7189898490905762, "learning_rate": 4.276759885870221e-05, "loss": 0.0306, "step": 10810 }, { "epoch": 10.37392138063279, "grad_norm": 0.25174766778945923, "learning_rate": 4.26815003535701e-05, "loss": 0.0409, "step": 10820 }, { "epoch": 10.383509108341324, "grad_norm": 0.251800537109375, "learning_rate": 4.2595424018518994e-05, "loss": 0.0338, "step": 10830 }, { "epoch": 10.393096836049857, "grad_norm": 0.2858979105949402, "learning_rate": 4.250937011430167e-05, "loss": 0.041, "step": 10840 }, { "epoch": 10.40268456375839, "grad_norm": 0.1836014688014984, "learning_rate": 4.2423338901602985e-05, "loss": 0.0356, "step": 10850 }, { "epoch": 10.412272291466923, "grad_norm": 0.279307097196579, "learning_rate": 4.233733064103906e-05, "loss": 0.0359, "step": 10860 }, { "epoch": 10.421860019175455, "grad_norm": 0.32045918703079224, "learning_rate": 4.225134559315647e-05, "loss": 0.0377, "step": 10870 }, { "epoch": 10.431447746883988, "grad_norm": 0.2521663010120392, "learning_rate": 4.2165384018431495e-05, "loss": 0.0301, "step": 10880 }, { "epoch": 10.441035474592521, "grad_norm": 0.7854000329971313, "learning_rate": 4.207944617726931e-05, "loss": 0.0337, "step": 10890 }, { "epoch": 10.450623202301054, "grad_norm": 0.2677070200443268, "learning_rate": 4.1993532330003146e-05, "loss": 0.0392, "step": 10900 }, { "epoch": 10.460210930009588, "grad_norm": 0.4461430609226227, "learning_rate": 4.190764273689359e-05, "loss": 0.0306, "step": 10910 }, { "epoch": 10.46979865771812, "grad_norm": 0.30843472480773926, "learning_rate": 4.1821777658127765e-05, "loss": 0.0259, "step": 10920 }, { "epoch": 10.479386385426654, "grad_norm": 0.5075517296791077, "learning_rate": 4.17359373538185e-05, "loss": 0.0376, "step": 10930 }, { "epoch": 10.488974113135187, "grad_norm": 0.3522166609764099, "learning_rate": 4.16501220840036e-05, "loss": 0.0272, "step": 10940 }, { "epoch": 10.49856184084372, "grad_norm": 0.3115832805633545, "learning_rate": 4.156433210864499e-05, "loss": 0.0421, "step": 10950 }, { "epoch": 10.508149568552254, "grad_norm": 0.29928937554359436, "learning_rate": 4.147856768762804e-05, "loss": 0.0329, "step": 10960 }, { "epoch": 10.517737296260787, "grad_norm": 0.2621513903141022, "learning_rate": 4.139282908076064e-05, "loss": 0.0313, "step": 10970 }, { "epoch": 10.527325023969318, "grad_norm": 0.31416305899620056, "learning_rate": 4.130711654777254e-05, "loss": 0.0311, "step": 10980 }, { "epoch": 10.536912751677852, "grad_norm": 0.23825299739837646, "learning_rate": 4.1221430348314415e-05, "loss": 0.0386, "step": 10990 }, { "epoch": 10.546500479386385, "grad_norm": 0.2471434473991394, "learning_rate": 4.11357707419573e-05, "loss": 0.038, "step": 11000 }, { "epoch": 10.556088207094918, "grad_norm": 0.2707345187664032, "learning_rate": 4.105013798819155e-05, "loss": 0.0356, "step": 11010 }, { "epoch": 10.565675934803451, "grad_norm": 0.3994966149330139, "learning_rate": 4.0964532346426235e-05, "loss": 0.0326, "step": 11020 }, { "epoch": 10.575263662511984, "grad_norm": 0.5146787762641907, "learning_rate": 4.087895407598824e-05, "loss": 0.0361, "step": 11030 }, { "epoch": 10.584851390220518, "grad_norm": 0.2920519709587097, "learning_rate": 4.079340343612165e-05, "loss": 0.0326, "step": 11040 }, { "epoch": 10.594439117929051, "grad_norm": 0.27901026606559753, "learning_rate": 4.070788068598672e-05, "loss": 0.037, "step": 11050 }, { "epoch": 10.604026845637584, "grad_norm": 0.26402774453163147, "learning_rate": 4.062238608465927e-05, "loss": 0.0337, "step": 11060 }, { "epoch": 10.613614573346117, "grad_norm": 0.24872805178165436, "learning_rate": 4.053691989112986e-05, "loss": 0.0343, "step": 11070 }, { "epoch": 10.62320230105465, "grad_norm": 0.21889743208885193, "learning_rate": 4.0451482364303e-05, "loss": 0.0329, "step": 11080 }, { "epoch": 10.632790028763184, "grad_norm": 0.31977149844169617, "learning_rate": 4.03660737629963e-05, "loss": 0.0395, "step": 11090 }, { "epoch": 10.642377756471717, "grad_norm": 0.3449043929576874, "learning_rate": 4.028069434593982e-05, "loss": 0.0362, "step": 11100 }, { "epoch": 10.651965484180248, "grad_norm": 0.356534481048584, "learning_rate": 4.019534437177516e-05, "loss": 0.0453, "step": 11110 }, { "epoch": 10.661553211888782, "grad_norm": 0.3510785400867462, "learning_rate": 4.0110024099054756e-05, "loss": 0.03, "step": 11120 }, { "epoch": 10.671140939597315, "grad_norm": 0.4049818813800812, "learning_rate": 4.002473378624107e-05, "loss": 0.0337, "step": 11130 }, { "epoch": 10.680728667305848, "grad_norm": 0.2889692485332489, "learning_rate": 3.9939473691705765e-05, "loss": 0.0369, "step": 11140 }, { "epoch": 10.690316395014381, "grad_norm": 0.25454413890838623, "learning_rate": 3.9854244073728996e-05, "loss": 0.0373, "step": 11150 }, { "epoch": 10.699904122722915, "grad_norm": 0.28601503372192383, "learning_rate": 3.976904519049862e-05, "loss": 0.0384, "step": 11160 }, { "epoch": 10.709491850431448, "grad_norm": 0.22738857567310333, "learning_rate": 3.968387730010935e-05, "loss": 0.0352, "step": 11170 }, { "epoch": 10.719079578139981, "grad_norm": 0.2723415493965149, "learning_rate": 3.9598740660562005e-05, "loss": 0.0372, "step": 11180 }, { "epoch": 10.728667305848514, "grad_norm": 0.35877975821495056, "learning_rate": 3.951363552976275e-05, "loss": 0.0321, "step": 11190 }, { "epoch": 10.738255033557047, "grad_norm": 0.2732999324798584, "learning_rate": 3.942856216552234e-05, "loss": 0.0423, "step": 11200 }, { "epoch": 10.74784276126558, "grad_norm": 0.1939064860343933, "learning_rate": 3.934352082555522e-05, "loss": 0.0383, "step": 11210 }, { "epoch": 10.757430488974114, "grad_norm": 0.34008413553237915, "learning_rate": 3.92585117674789e-05, "loss": 0.0374, "step": 11220 }, { "epoch": 10.767018216682647, "grad_norm": 0.32701992988586426, "learning_rate": 3.917353524881302e-05, "loss": 0.0336, "step": 11230 }, { "epoch": 10.776605944391179, "grad_norm": 0.29676583409309387, "learning_rate": 3.908859152697872e-05, "loss": 0.0358, "step": 11240 }, { "epoch": 10.786193672099712, "grad_norm": 0.21634122729301453, "learning_rate": 3.900368085929775e-05, "loss": 0.0357, "step": 11250 }, { "epoch": 10.795781399808245, "grad_norm": 0.29007887840270996, "learning_rate": 3.8918803502991744e-05, "loss": 0.0396, "step": 11260 }, { "epoch": 10.805369127516778, "grad_norm": 0.2906304895877838, "learning_rate": 3.883395971518138e-05, "loss": 0.0293, "step": 11270 }, { "epoch": 10.814956855225311, "grad_norm": 0.19408248364925385, "learning_rate": 3.874914975288575e-05, "loss": 0.0338, "step": 11280 }, { "epoch": 10.824544582933845, "grad_norm": 0.9713996052742004, "learning_rate": 3.8664373873021356e-05, "loss": 0.0367, "step": 11290 }, { "epoch": 10.834132310642378, "grad_norm": 0.43305110931396484, "learning_rate": 3.857963233240153e-05, "loss": 0.0409, "step": 11300 }, { "epoch": 10.843720038350911, "grad_norm": 0.4623974859714508, "learning_rate": 3.849492538773552e-05, "loss": 0.0322, "step": 11310 }, { "epoch": 10.853307766059444, "grad_norm": 0.13911698758602142, "learning_rate": 3.841025329562789e-05, "loss": 0.0371, "step": 11320 }, { "epoch": 10.862895493767978, "grad_norm": 0.40783533453941345, "learning_rate": 3.832561631257748e-05, "loss": 0.0334, "step": 11330 }, { "epoch": 10.87248322147651, "grad_norm": 0.2820438742637634, "learning_rate": 3.824101469497685e-05, "loss": 0.0357, "step": 11340 }, { "epoch": 10.882070949185042, "grad_norm": 0.2518521547317505, "learning_rate": 3.8156448699111414e-05, "loss": 0.0398, "step": 11350 }, { "epoch": 10.891658676893575, "grad_norm": 0.22868366539478302, "learning_rate": 3.80719185811587e-05, "loss": 0.0329, "step": 11360 }, { "epoch": 10.901246404602109, "grad_norm": 0.28649628162384033, "learning_rate": 3.79874245971875e-05, "loss": 0.0362, "step": 11370 }, { "epoch": 10.910834132310642, "grad_norm": 0.2933325171470642, "learning_rate": 3.790296700315717e-05, "loss": 0.0322, "step": 11380 }, { "epoch": 10.920421860019175, "grad_norm": 0.34184950590133667, "learning_rate": 3.781854605491684e-05, "loss": 0.034, "step": 11390 }, { "epoch": 10.930009587727708, "grad_norm": 0.26722094416618347, "learning_rate": 3.773416200820463e-05, "loss": 0.0369, "step": 11400 }, { "epoch": 10.939597315436242, "grad_norm": 0.22674645483493805, "learning_rate": 3.764981511864686e-05, "loss": 0.0349, "step": 11410 }, { "epoch": 10.949185043144775, "grad_norm": 0.6623883843421936, "learning_rate": 3.756550564175727e-05, "loss": 0.0331, "step": 11420 }, { "epoch": 10.958772770853308, "grad_norm": 0.3025140166282654, "learning_rate": 3.748123383293629e-05, "loss": 0.0364, "step": 11430 }, { "epoch": 10.968360498561841, "grad_norm": 0.2423921674489975, "learning_rate": 3.739699994747026e-05, "loss": 0.0305, "step": 11440 }, { "epoch": 10.977948226270374, "grad_norm": 0.2216835469007492, "learning_rate": 3.731280424053061e-05, "loss": 0.0338, "step": 11450 }, { "epoch": 10.987535953978908, "grad_norm": 0.4063700735569, "learning_rate": 3.7228646967173096e-05, "loss": 0.0437, "step": 11460 }, { "epoch": 10.997123681687441, "grad_norm": 0.21180011332035065, "learning_rate": 3.7144528382337086e-05, "loss": 0.0362, "step": 11470 }, { "epoch": 11.006711409395972, "grad_norm": 0.22706526517868042, "learning_rate": 3.706044874084474e-05, "loss": 0.0343, "step": 11480 }, { "epoch": 11.016299137104506, "grad_norm": 0.3348940908908844, "learning_rate": 3.6976408297400257e-05, "loss": 0.0344, "step": 11490 }, { "epoch": 11.025886864813039, "grad_norm": 0.21291491389274597, "learning_rate": 3.6892407306589035e-05, "loss": 0.0329, "step": 11500 }, { "epoch": 11.035474592521572, "grad_norm": 0.3505829870700836, "learning_rate": 3.6808446022877e-05, "loss": 0.0339, "step": 11510 }, { "epoch": 11.045062320230105, "grad_norm": 0.36319780349731445, "learning_rate": 3.672452470060982e-05, "loss": 0.0338, "step": 11520 }, { "epoch": 11.054650047938638, "grad_norm": 0.3714457154273987, "learning_rate": 3.6640643594012057e-05, "loss": 0.0419, "step": 11530 }, { "epoch": 11.064237775647172, "grad_norm": 0.27974534034729004, "learning_rate": 3.6556802957186486e-05, "loss": 0.0359, "step": 11540 }, { "epoch": 11.073825503355705, "grad_norm": 0.34719452261924744, "learning_rate": 3.647300304411323e-05, "loss": 0.0367, "step": 11550 }, { "epoch": 11.083413231064238, "grad_norm": 0.24294276535511017, "learning_rate": 3.6389244108649114e-05, "loss": 0.0316, "step": 11560 }, { "epoch": 11.093000958772771, "grad_norm": 0.3280002474784851, "learning_rate": 3.6305526404526785e-05, "loss": 0.0315, "step": 11570 }, { "epoch": 11.102588686481305, "grad_norm": 0.25797387957572937, "learning_rate": 3.6221850185354014e-05, "loss": 0.0306, "step": 11580 }, { "epoch": 11.112176414189838, "grad_norm": 0.2705564498901367, "learning_rate": 3.613821570461284e-05, "loss": 0.0333, "step": 11590 }, { "epoch": 11.12176414189837, "grad_norm": 0.2857078015804291, "learning_rate": 3.605462321565899e-05, "loss": 0.0329, "step": 11600 }, { "epoch": 11.131351869606902, "grad_norm": 0.23920407891273499, "learning_rate": 3.597107297172084e-05, "loss": 0.0366, "step": 11610 }, { "epoch": 11.140939597315436, "grad_norm": 0.31336209177970886, "learning_rate": 3.588756522589888e-05, "loss": 0.03, "step": 11620 }, { "epoch": 11.150527325023969, "grad_norm": 0.2026471495628357, "learning_rate": 3.5804100231164824e-05, "loss": 0.0328, "step": 11630 }, { "epoch": 11.160115052732502, "grad_norm": 0.166408970952034, "learning_rate": 3.572067824036092e-05, "loss": 0.0357, "step": 11640 }, { "epoch": 11.169702780441035, "grad_norm": 0.24978677928447723, "learning_rate": 3.5637299506199075e-05, "loss": 0.0289, "step": 11650 }, { "epoch": 11.179290508149569, "grad_norm": 0.36853691935539246, "learning_rate": 3.5553964281260225e-05, "loss": 0.036, "step": 11660 }, { "epoch": 11.188878235858102, "grad_norm": 0.31218189001083374, "learning_rate": 3.547067281799345e-05, "loss": 0.0327, "step": 11670 }, { "epoch": 11.198465963566635, "grad_norm": 0.2616768777370453, "learning_rate": 3.538742536871531e-05, "loss": 0.0378, "step": 11680 }, { "epoch": 11.208053691275168, "grad_norm": 0.3586946725845337, "learning_rate": 3.530422218560903e-05, "loss": 0.0378, "step": 11690 }, { "epoch": 11.217641418983701, "grad_norm": 0.1958369016647339, "learning_rate": 3.522106352072366e-05, "loss": 0.0368, "step": 11700 }, { "epoch": 11.227229146692235, "grad_norm": 0.30349719524383545, "learning_rate": 3.5137949625973484e-05, "loss": 0.0396, "step": 11710 }, { "epoch": 11.236816874400766, "grad_norm": 0.22439143061637878, "learning_rate": 3.505488075313712e-05, "loss": 0.0275, "step": 11720 }, { "epoch": 11.2464046021093, "grad_norm": 0.3639642596244812, "learning_rate": 3.4971857153856825e-05, "loss": 0.03, "step": 11730 }, { "epoch": 11.255992329817833, "grad_norm": 0.19874945282936096, "learning_rate": 3.488887907963766e-05, "loss": 0.0341, "step": 11740 }, { "epoch": 11.265580057526366, "grad_norm": 0.6180244088172913, "learning_rate": 3.480594678184681e-05, "loss": 0.0346, "step": 11750 }, { "epoch": 11.275167785234899, "grad_norm": 0.27457571029663086, "learning_rate": 3.472306051171281e-05, "loss": 0.0359, "step": 11760 }, { "epoch": 11.284755512943432, "grad_norm": 0.18931525945663452, "learning_rate": 3.464022052032473e-05, "loss": 0.0311, "step": 11770 }, { "epoch": 11.294343240651965, "grad_norm": 0.2550256848335266, "learning_rate": 3.455742705863143e-05, "loss": 0.0346, "step": 11780 }, { "epoch": 11.303930968360499, "grad_norm": 0.21088473498821259, "learning_rate": 3.447468037744084e-05, "loss": 0.0295, "step": 11790 }, { "epoch": 11.313518696069032, "grad_norm": 0.25027552247047424, "learning_rate": 3.439198072741921e-05, "loss": 0.0375, "step": 11800 }, { "epoch": 11.323106423777565, "grad_norm": 0.5064207315444946, "learning_rate": 3.4309328359090264e-05, "loss": 0.0332, "step": 11810 }, { "epoch": 11.332694151486098, "grad_norm": 0.2110755443572998, "learning_rate": 3.422672352283453e-05, "loss": 0.0351, "step": 11820 }, { "epoch": 11.342281879194632, "grad_norm": 0.27771392464637756, "learning_rate": 3.41441664688885e-05, "loss": 0.0383, "step": 11830 }, { "epoch": 11.351869606903165, "grad_norm": 0.34242868423461914, "learning_rate": 3.406165744734397e-05, "loss": 0.0298, "step": 11840 }, { "epoch": 11.361457334611696, "grad_norm": 0.3390040099620819, "learning_rate": 3.397919670814723e-05, "loss": 0.0377, "step": 11850 }, { "epoch": 11.37104506232023, "grad_norm": 0.15492115914821625, "learning_rate": 3.389678450109827e-05, "loss": 0.0403, "step": 11860 }, { "epoch": 11.380632790028763, "grad_norm": 0.3101263642311096, "learning_rate": 3.3814421075850035e-05, "loss": 0.0362, "step": 11870 }, { "epoch": 11.390220517737296, "grad_norm": 0.2800522446632385, "learning_rate": 3.3732106681907816e-05, "loss": 0.032, "step": 11880 }, { "epoch": 11.39980824544583, "grad_norm": 0.26244333386421204, "learning_rate": 3.364984156862825e-05, "loss": 0.0307, "step": 11890 }, { "epoch": 11.409395973154362, "grad_norm": 0.48606979846954346, "learning_rate": 3.356762598521874e-05, "loss": 0.0335, "step": 11900 }, { "epoch": 11.418983700862896, "grad_norm": 0.5852661728858948, "learning_rate": 3.348546018073662e-05, "loss": 0.0433, "step": 11910 }, { "epoch": 11.428571428571429, "grad_norm": 0.252837598323822, "learning_rate": 3.340334440408846e-05, "loss": 0.0257, "step": 11920 }, { "epoch": 11.438159156279962, "grad_norm": 0.2573808431625366, "learning_rate": 3.332127890402926e-05, "loss": 0.0331, "step": 11930 }, { "epoch": 11.447746883988495, "grad_norm": 0.25154879689216614, "learning_rate": 3.3239263929161734e-05, "loss": 0.0389, "step": 11940 }, { "epoch": 11.457334611697028, "grad_norm": 0.2564004957675934, "learning_rate": 3.315729972793553e-05, "loss": 0.0386, "step": 11950 }, { "epoch": 11.466922339405562, "grad_norm": 0.45886269211769104, "learning_rate": 3.307538654864645e-05, "loss": 0.0365, "step": 11960 }, { "epoch": 11.476510067114093, "grad_norm": 0.157767191529274, "learning_rate": 3.29935246394358e-05, "loss": 0.0356, "step": 11970 }, { "epoch": 11.486097794822626, "grad_norm": 0.3403734564781189, "learning_rate": 3.2911714248289525e-05, "loss": 0.0335, "step": 11980 }, { "epoch": 11.49568552253116, "grad_norm": 0.207637757062912, "learning_rate": 3.282995562303754e-05, "loss": 0.0291, "step": 11990 }, { "epoch": 11.505273250239693, "grad_norm": 0.2571353614330292, "learning_rate": 3.2748249011352864e-05, "loss": 0.031, "step": 12000 }, { "epoch": 11.514860977948226, "grad_norm": 0.29838424921035767, "learning_rate": 3.266659466075108e-05, "loss": 0.0312, "step": 12010 }, { "epoch": 11.52444870565676, "grad_norm": 0.35853999853134155, "learning_rate": 3.258499281858936e-05, "loss": 0.0349, "step": 12020 }, { "epoch": 11.534036433365292, "grad_norm": 0.22435715794563293, "learning_rate": 3.250344373206584e-05, "loss": 0.0321, "step": 12030 }, { "epoch": 11.543624161073826, "grad_norm": 0.364653617143631, "learning_rate": 3.242194764821881e-05, "loss": 0.0291, "step": 12040 }, { "epoch": 11.553211888782359, "grad_norm": 0.20518454909324646, "learning_rate": 3.2340504813926086e-05, "loss": 0.0335, "step": 12050 }, { "epoch": 11.562799616490892, "grad_norm": 0.3099921941757202, "learning_rate": 3.2259115475904064e-05, "loss": 0.036, "step": 12060 }, { "epoch": 11.572387344199425, "grad_norm": 0.40152508020401, "learning_rate": 3.217777988070715e-05, "loss": 0.0377, "step": 12070 }, { "epoch": 11.581975071907959, "grad_norm": 0.2941493093967438, "learning_rate": 3.2096498274726925e-05, "loss": 0.0304, "step": 12080 }, { "epoch": 11.59156279961649, "grad_norm": 0.1939501017332077, "learning_rate": 3.201527090419144e-05, "loss": 0.0309, "step": 12090 }, { "epoch": 11.601150527325023, "grad_norm": 0.28782132267951965, "learning_rate": 3.193409801516443e-05, "loss": 0.0368, "step": 12100 }, { "epoch": 11.610738255033556, "grad_norm": 0.22255367040634155, "learning_rate": 3.1852979853544575e-05, "loss": 0.034, "step": 12110 }, { "epoch": 11.62032598274209, "grad_norm": 0.24580125510692596, "learning_rate": 3.177191666506479e-05, "loss": 0.0316, "step": 12120 }, { "epoch": 11.629913710450623, "grad_norm": 0.16919176280498505, "learning_rate": 3.169090869529146e-05, "loss": 0.032, "step": 12130 }, { "epoch": 11.639501438159156, "grad_norm": 0.16586647927761078, "learning_rate": 3.1609956189623704e-05, "loss": 0.0318, "step": 12140 }, { "epoch": 11.64908916586769, "grad_norm": 0.25521326065063477, "learning_rate": 3.1529059393292573e-05, "loss": 0.0339, "step": 12150 }, { "epoch": 11.658676893576223, "grad_norm": 0.40948987007141113, "learning_rate": 3.1448218551360394e-05, "loss": 0.0417, "step": 12160 }, { "epoch": 11.668264621284756, "grad_norm": 0.2603534460067749, "learning_rate": 3.136743390872001e-05, "loss": 0.0332, "step": 12170 }, { "epoch": 11.677852348993289, "grad_norm": 0.24372372031211853, "learning_rate": 3.128670571009399e-05, "loss": 0.0325, "step": 12180 }, { "epoch": 11.687440076701822, "grad_norm": 0.18494637310504913, "learning_rate": 3.1206034200033904e-05, "loss": 0.0324, "step": 12190 }, { "epoch": 11.697027804410356, "grad_norm": 0.3946174681186676, "learning_rate": 3.1125419622919614e-05, "loss": 0.0327, "step": 12200 }, { "epoch": 11.706615532118889, "grad_norm": 0.5735461115837097, "learning_rate": 3.104486222295853e-05, "loss": 0.0294, "step": 12210 }, { "epoch": 11.71620325982742, "grad_norm": 0.25579607486724854, "learning_rate": 3.096436224418482e-05, "loss": 0.0347, "step": 12220 }, { "epoch": 11.725790987535953, "grad_norm": 0.40547341108322144, "learning_rate": 3.088391993045873e-05, "loss": 0.037, "step": 12230 }, { "epoch": 11.735378715244487, "grad_norm": 0.3765973746776581, "learning_rate": 3.080353552546578e-05, "loss": 0.0307, "step": 12240 }, { "epoch": 11.74496644295302, "grad_norm": 0.40163904428482056, "learning_rate": 3.0723209272716124e-05, "loss": 0.0295, "step": 12250 }, { "epoch": 11.754554170661553, "grad_norm": 0.3667445182800293, "learning_rate": 3.064294141554372e-05, "loss": 0.0328, "step": 12260 }, { "epoch": 11.764141898370086, "grad_norm": 0.22410856187343597, "learning_rate": 3.056273219710565e-05, "loss": 0.0355, "step": 12270 }, { "epoch": 11.77372962607862, "grad_norm": 0.278154581785202, "learning_rate": 3.048258186038129e-05, "loss": 0.038, "step": 12280 }, { "epoch": 11.783317353787153, "grad_norm": 0.4203621745109558, "learning_rate": 3.040249064817176e-05, "loss": 0.0338, "step": 12290 }, { "epoch": 11.792905081495686, "grad_norm": 0.29441940784454346, "learning_rate": 3.0322458803098973e-05, "loss": 0.027, "step": 12300 }, { "epoch": 11.80249280920422, "grad_norm": 0.2775827646255493, "learning_rate": 3.0242486567605068e-05, "loss": 0.031, "step": 12310 }, { "epoch": 11.812080536912752, "grad_norm": 0.38520553708076477, "learning_rate": 3.016257418395152e-05, "loss": 0.0333, "step": 12320 }, { "epoch": 11.821668264621284, "grad_norm": 0.26599544286727905, "learning_rate": 3.008272189421861e-05, "loss": 0.0301, "step": 12330 }, { "epoch": 11.831255992329817, "grad_norm": 0.22733962535858154, "learning_rate": 3.0002929940304498e-05, "loss": 0.0298, "step": 12340 }, { "epoch": 11.84084372003835, "grad_norm": 0.27661770582199097, "learning_rate": 2.992319856392457e-05, "loss": 0.0342, "step": 12350 }, { "epoch": 11.850431447746884, "grad_norm": 0.26731380820274353, "learning_rate": 2.9843528006610733e-05, "loss": 0.0295, "step": 12360 }, { "epoch": 11.860019175455417, "grad_norm": 0.3973303437232971, "learning_rate": 2.976391850971065e-05, "loss": 0.0301, "step": 12370 }, { "epoch": 11.86960690316395, "grad_norm": 0.3120301067829132, "learning_rate": 2.968437031438698e-05, "loss": 0.0348, "step": 12380 }, { "epoch": 11.879194630872483, "grad_norm": 0.2932593524456024, "learning_rate": 2.9604883661616702e-05, "loss": 0.0308, "step": 12390 }, { "epoch": 11.888782358581016, "grad_norm": 0.2067721039056778, "learning_rate": 2.9525458792190365e-05, "loss": 0.0323, "step": 12400 }, { "epoch": 11.89837008628955, "grad_norm": 0.30877119302749634, "learning_rate": 2.9446095946711367e-05, "loss": 0.0336, "step": 12410 }, { "epoch": 11.907957813998083, "grad_norm": 0.1372332125902176, "learning_rate": 2.93667953655952e-05, "loss": 0.0341, "step": 12420 }, { "epoch": 11.917545541706616, "grad_norm": 0.2722005844116211, "learning_rate": 2.9287557289068736e-05, "loss": 0.0347, "step": 12430 }, { "epoch": 11.92713326941515, "grad_norm": 0.35675281286239624, "learning_rate": 2.9208381957169485e-05, "loss": 0.0354, "step": 12440 }, { "epoch": 11.936720997123683, "grad_norm": 0.4129658639431, "learning_rate": 2.9129269609744935e-05, "loss": 0.0235, "step": 12450 }, { "epoch": 11.946308724832214, "grad_norm": 0.23059901595115662, "learning_rate": 2.905022048645172e-05, "loss": 0.0361, "step": 12460 }, { "epoch": 11.955896452540747, "grad_norm": 0.20640157163143158, "learning_rate": 2.8971234826754983e-05, "loss": 0.0306, "step": 12470 }, { "epoch": 11.96548418024928, "grad_norm": 0.27325066924095154, "learning_rate": 2.8892312869927578e-05, "loss": 0.033, "step": 12480 }, { "epoch": 11.975071907957814, "grad_norm": 0.2237732708454132, "learning_rate": 2.881345485504945e-05, "loss": 0.0309, "step": 12490 }, { "epoch": 11.984659635666347, "grad_norm": 0.2271834760904312, "learning_rate": 2.8734661021006747e-05, "loss": 0.0267, "step": 12500 }, { "epoch": 11.99424736337488, "grad_norm": 0.27549734711647034, "learning_rate": 2.8655931606491294e-05, "loss": 0.0338, "step": 12510 }, { "epoch": 12.003835091083413, "grad_norm": 0.19603657722473145, "learning_rate": 2.8577266849999672e-05, "loss": 0.0303, "step": 12520 }, { "epoch": 12.013422818791947, "grad_norm": 0.1858394742012024, "learning_rate": 2.849866698983267e-05, "loss": 0.0255, "step": 12530 }, { "epoch": 12.02301054650048, "grad_norm": 0.17287525534629822, "learning_rate": 2.8420132264094468e-05, "loss": 0.0297, "step": 12540 }, { "epoch": 12.032598274209013, "grad_norm": 0.32775846123695374, "learning_rate": 2.83416629106919e-05, "loss": 0.0345, "step": 12550 }, { "epoch": 12.042186001917546, "grad_norm": 0.17536644637584686, "learning_rate": 2.8263259167333777e-05, "loss": 0.0286, "step": 12560 }, { "epoch": 12.05177372962608, "grad_norm": 0.18874387443065643, "learning_rate": 2.818492127153018e-05, "loss": 0.0293, "step": 12570 }, { "epoch": 12.06136145733461, "grad_norm": 0.1686885803937912, "learning_rate": 2.8106649460591716e-05, "loss": 0.0302, "step": 12580 }, { "epoch": 12.070949185043144, "grad_norm": 0.14021116495132446, "learning_rate": 2.802844397162877e-05, "loss": 0.0321, "step": 12590 }, { "epoch": 12.080536912751677, "grad_norm": 0.32412388920783997, "learning_rate": 2.7950305041550818e-05, "loss": 0.0337, "step": 12600 }, { "epoch": 12.09012464046021, "grad_norm": 0.2775496244430542, "learning_rate": 2.7872232907065738e-05, "loss": 0.0348, "step": 12610 }, { "epoch": 12.099712368168744, "grad_norm": 0.20718041062355042, "learning_rate": 2.7794227804679063e-05, "loss": 0.0318, "step": 12620 }, { "epoch": 12.109300095877277, "grad_norm": 0.14198093116283417, "learning_rate": 2.7716289970693236e-05, "loss": 0.0285, "step": 12630 }, { "epoch": 12.11888782358581, "grad_norm": 0.23473426699638367, "learning_rate": 2.7638419641206914e-05, "loss": 0.0311, "step": 12640 }, { "epoch": 12.128475551294343, "grad_norm": 0.22687584161758423, "learning_rate": 2.7560617052114297e-05, "loss": 0.0265, "step": 12650 }, { "epoch": 12.138063279002877, "grad_norm": 0.22875012457370758, "learning_rate": 2.7482882439104385e-05, "loss": 0.0324, "step": 12660 }, { "epoch": 12.14765100671141, "grad_norm": 0.2869175970554352, "learning_rate": 2.740521603766022e-05, "loss": 0.0343, "step": 12670 }, { "epoch": 12.157238734419943, "grad_norm": 0.24454490840435028, "learning_rate": 2.7327618083058192e-05, "loss": 0.0354, "step": 12680 }, { "epoch": 12.166826462128476, "grad_norm": 0.26888319849967957, "learning_rate": 2.7250088810367404e-05, "loss": 0.0317, "step": 12690 }, { "epoch": 12.176414189837008, "grad_norm": 0.2190038412809372, "learning_rate": 2.7172628454448888e-05, "loss": 0.0394, "step": 12700 }, { "epoch": 12.186001917545541, "grad_norm": 0.1673816591501236, "learning_rate": 2.7095237249954875e-05, "loss": 0.0272, "step": 12710 }, { "epoch": 12.195589645254074, "grad_norm": 0.32721394300460815, "learning_rate": 2.7017915431328078e-05, "loss": 0.0341, "step": 12720 }, { "epoch": 12.205177372962607, "grad_norm": 0.2936406135559082, "learning_rate": 2.6940663232801144e-05, "loss": 0.0294, "step": 12730 }, { "epoch": 12.21476510067114, "grad_norm": 3.8611295223236084, "learning_rate": 2.6863480888395714e-05, "loss": 0.0293, "step": 12740 }, { "epoch": 12.224352828379674, "grad_norm": 0.16587217152118683, "learning_rate": 2.6786368631921836e-05, "loss": 0.03, "step": 12750 }, { "epoch": 12.233940556088207, "grad_norm": 0.5451092720031738, "learning_rate": 2.6709326696977215e-05, "loss": 0.0325, "step": 12760 }, { "epoch": 12.24352828379674, "grad_norm": 0.20002365112304688, "learning_rate": 2.6632355316946643e-05, "loss": 0.0255, "step": 12770 }, { "epoch": 12.253116011505274, "grad_norm": 0.8898112773895264, "learning_rate": 2.655545472500105e-05, "loss": 0.0348, "step": 12780 }, { "epoch": 12.262703739213807, "grad_norm": 0.3279706835746765, "learning_rate": 2.647862515409697e-05, "loss": 0.0259, "step": 12790 }, { "epoch": 12.27229146692234, "grad_norm": 0.2899661958217621, "learning_rate": 2.6401866836975795e-05, "loss": 0.0375, "step": 12800 }, { "epoch": 12.281879194630873, "grad_norm": 0.2332329899072647, "learning_rate": 2.632518000616312e-05, "loss": 0.0319, "step": 12810 }, { "epoch": 12.291466922339406, "grad_norm": 0.23844292759895325, "learning_rate": 2.6248564893967886e-05, "loss": 0.0344, "step": 12820 }, { "epoch": 12.301054650047938, "grad_norm": 0.20757047832012177, "learning_rate": 2.617202173248181e-05, "loss": 0.0365, "step": 12830 }, { "epoch": 12.310642377756471, "grad_norm": 0.23326794803142548, "learning_rate": 2.609555075357869e-05, "loss": 0.0385, "step": 12840 }, { "epoch": 12.320230105465004, "grad_norm": 0.20900526642799377, "learning_rate": 2.6019152188913638e-05, "loss": 0.0333, "step": 12850 }, { "epoch": 12.329817833173538, "grad_norm": 0.2453479766845703, "learning_rate": 2.5942826269922376e-05, "loss": 0.0317, "step": 12860 }, { "epoch": 12.33940556088207, "grad_norm": 0.45544683933258057, "learning_rate": 2.5866573227820557e-05, "loss": 0.0299, "step": 12870 }, { "epoch": 12.348993288590604, "grad_norm": 0.31227871775627136, "learning_rate": 2.5790393293603097e-05, "loss": 0.029, "step": 12880 }, { "epoch": 12.358581016299137, "grad_norm": 0.32639333605766296, "learning_rate": 2.571428669804346e-05, "loss": 0.0323, "step": 12890 }, { "epoch": 12.36816874400767, "grad_norm": 0.3351771831512451, "learning_rate": 2.563825367169289e-05, "loss": 0.0304, "step": 12900 }, { "epoch": 12.377756471716204, "grad_norm": 0.47458702325820923, "learning_rate": 2.5562294444879787e-05, "loss": 0.03, "step": 12910 }, { "epoch": 12.387344199424737, "grad_norm": 0.2465980499982834, "learning_rate": 2.5486409247708987e-05, "loss": 0.0378, "step": 12920 }, { "epoch": 12.39693192713327, "grad_norm": 0.42310255765914917, "learning_rate": 2.5410598310061118e-05, "loss": 0.0323, "step": 12930 }, { "epoch": 12.406519654841803, "grad_norm": 1.066576361656189, "learning_rate": 2.5334861861591753e-05, "loss": 0.0347, "step": 12940 }, { "epoch": 12.416107382550335, "grad_norm": 0.24553652107715607, "learning_rate": 2.525920013173091e-05, "loss": 0.0288, "step": 12950 }, { "epoch": 12.425695110258868, "grad_norm": 0.17061471939086914, "learning_rate": 2.51836133496822e-05, "loss": 0.0293, "step": 12960 }, { "epoch": 12.435282837967401, "grad_norm": 0.2702957093715668, "learning_rate": 2.5108101744422197e-05, "loss": 0.0337, "step": 12970 }, { "epoch": 12.444870565675934, "grad_norm": 0.2967221736907959, "learning_rate": 2.5032665544699762e-05, "loss": 0.0388, "step": 12980 }, { "epoch": 12.454458293384468, "grad_norm": 0.18429528176784515, "learning_rate": 2.495730497903535e-05, "loss": 0.0339, "step": 12990 }, { "epoch": 12.464046021093, "grad_norm": 0.4446472227573395, "learning_rate": 2.4882020275720247e-05, "loss": 0.0297, "step": 13000 }, { "epoch": 12.473633748801534, "grad_norm": 0.2481614649295807, "learning_rate": 2.480681166281592e-05, "loss": 0.0332, "step": 13010 }, { "epoch": 12.483221476510067, "grad_norm": 0.4030400216579437, "learning_rate": 2.4731679368153392e-05, "loss": 0.0386, "step": 13020 }, { "epoch": 12.4928092042186, "grad_norm": 0.20716169476509094, "learning_rate": 2.4656623619332476e-05, "loss": 0.0289, "step": 13030 }, { "epoch": 12.502396931927134, "grad_norm": 0.18714624643325806, "learning_rate": 2.4581644643721075e-05, "loss": 0.0257, "step": 13040 }, { "epoch": 12.511984659635667, "grad_norm": 0.2566820979118347, "learning_rate": 2.4506742668454514e-05, "loss": 0.0267, "step": 13050 }, { "epoch": 12.5215723873442, "grad_norm": 0.237356036901474, "learning_rate": 2.44319179204349e-05, "loss": 0.0317, "step": 13060 }, { "epoch": 12.531160115052732, "grad_norm": 0.29655054211616516, "learning_rate": 2.4357170626330394e-05, "loss": 0.0328, "step": 13070 }, { "epoch": 12.540747842761265, "grad_norm": 0.29281550645828247, "learning_rate": 2.4282501012574495e-05, "loss": 0.0295, "step": 13080 }, { "epoch": 12.550335570469798, "grad_norm": 0.477317750453949, "learning_rate": 2.4207909305365363e-05, "loss": 0.0353, "step": 13090 }, { "epoch": 12.559923298178331, "grad_norm": 0.2606201767921448, "learning_rate": 2.4133395730665214e-05, "loss": 0.0288, "step": 13100 }, { "epoch": 12.569511025886865, "grad_norm": 0.18180538713932037, "learning_rate": 2.405896051419957e-05, "loss": 0.0349, "step": 13110 }, { "epoch": 12.579098753595398, "grad_norm": 0.3665505349636078, "learning_rate": 2.398460388145653e-05, "loss": 0.0321, "step": 13120 }, { "epoch": 12.588686481303931, "grad_norm": 0.28408095240592957, "learning_rate": 2.3910326057686127e-05, "loss": 0.0359, "step": 13130 }, { "epoch": 12.598274209012464, "grad_norm": 0.19122740626335144, "learning_rate": 2.3836127267899778e-05, "loss": 0.0299, "step": 13140 }, { "epoch": 12.607861936720997, "grad_norm": 0.18212218582630157, "learning_rate": 2.3762007736869353e-05, "loss": 0.0328, "step": 13150 }, { "epoch": 12.61744966442953, "grad_norm": 0.33118176460266113, "learning_rate": 2.3687967689126667e-05, "loss": 0.0291, "step": 13160 }, { "epoch": 12.627037392138064, "grad_norm": 0.43079885840415955, "learning_rate": 2.3614007348962724e-05, "loss": 0.0303, "step": 13170 }, { "epoch": 12.636625119846597, "grad_norm": 0.21110649406909943, "learning_rate": 2.3540126940427166e-05, "loss": 0.0334, "step": 13180 }, { "epoch": 12.64621284755513, "grad_norm": 0.18830737471580505, "learning_rate": 2.3466326687327396e-05, "loss": 0.0316, "step": 13190 }, { "epoch": 12.655800575263662, "grad_norm": 0.33135518431663513, "learning_rate": 2.3392606813228008e-05, "loss": 0.0375, "step": 13200 }, { "epoch": 12.665388302972195, "grad_norm": 0.2647267282009125, "learning_rate": 2.3318967541450153e-05, "loss": 0.0294, "step": 13210 }, { "epoch": 12.674976030680728, "grad_norm": 0.2796458303928375, "learning_rate": 2.3245409095070803e-05, "loss": 0.0282, "step": 13220 }, { "epoch": 12.684563758389261, "grad_norm": 0.31999823451042175, "learning_rate": 2.317193169692205e-05, "loss": 0.0363, "step": 13230 }, { "epoch": 12.694151486097795, "grad_norm": 0.21032322943210602, "learning_rate": 2.3098535569590458e-05, "loss": 0.0341, "step": 13240 }, { "epoch": 12.703739213806328, "grad_norm": 0.31383687257766724, "learning_rate": 2.3025220935416447e-05, "loss": 0.0301, "step": 13250 }, { "epoch": 12.713326941514861, "grad_norm": 0.4095149040222168, "learning_rate": 2.2951988016493548e-05, "loss": 0.036, "step": 13260 }, { "epoch": 12.722914669223394, "grad_norm": 0.21426613628864288, "learning_rate": 2.2878837034667737e-05, "loss": 0.0346, "step": 13270 }, { "epoch": 12.732502396931928, "grad_norm": 0.312098890542984, "learning_rate": 2.2805768211536758e-05, "loss": 0.0342, "step": 13280 }, { "epoch": 12.74209012464046, "grad_norm": 0.2564839720726013, "learning_rate": 2.273278176844951e-05, "loss": 0.0323, "step": 13290 }, { "epoch": 12.751677852348994, "grad_norm": 0.314685583114624, "learning_rate": 2.2659877926505353e-05, "loss": 0.0382, "step": 13300 }, { "epoch": 12.761265580057525, "grad_norm": 0.1301986277103424, "learning_rate": 2.2587056906553348e-05, "loss": 0.034, "step": 13310 }, { "epoch": 12.770853307766059, "grad_norm": 0.23595231771469116, "learning_rate": 2.251431892919171e-05, "loss": 0.0293, "step": 13320 }, { "epoch": 12.780441035474592, "grad_norm": 0.23706960678100586, "learning_rate": 2.2441664214767085e-05, "loss": 0.0355, "step": 13330 }, { "epoch": 12.790028763183125, "grad_norm": 0.20160214602947235, "learning_rate": 2.2369092983373912e-05, "loss": 0.0315, "step": 13340 }, { "epoch": 12.799616490891658, "grad_norm": 0.1787547618150711, "learning_rate": 2.2296605454853673e-05, "loss": 0.0314, "step": 13350 }, { "epoch": 12.809204218600192, "grad_norm": 0.36770564317703247, "learning_rate": 2.222420184879437e-05, "loss": 0.0372, "step": 13360 }, { "epoch": 12.818791946308725, "grad_norm": 0.3025970160961151, "learning_rate": 2.2151882384529683e-05, "loss": 0.0255, "step": 13370 }, { "epoch": 12.828379674017258, "grad_norm": 0.25169727206230164, "learning_rate": 2.207964728113848e-05, "loss": 0.0269, "step": 13380 }, { "epoch": 12.837967401725791, "grad_norm": 0.37031155824661255, "learning_rate": 2.200749675744402e-05, "loss": 0.0293, "step": 13390 }, { "epoch": 12.847555129434324, "grad_norm": 0.21579872071743011, "learning_rate": 2.1935431032013388e-05, "loss": 0.0302, "step": 13400 }, { "epoch": 12.857142857142858, "grad_norm": 0.20838379859924316, "learning_rate": 2.1863450323156725e-05, "loss": 0.034, "step": 13410 }, { "epoch": 12.86673058485139, "grad_norm": 0.2365337610244751, "learning_rate": 2.179155484892671e-05, "loss": 0.0321, "step": 13420 }, { "epoch": 12.876318312559924, "grad_norm": 0.24535539746284485, "learning_rate": 2.1719744827117737e-05, "loss": 0.0318, "step": 13430 }, { "epoch": 12.885906040268456, "grad_norm": 0.32186776399612427, "learning_rate": 2.1648020475265418e-05, "loss": 0.0353, "step": 13440 }, { "epoch": 12.895493767976989, "grad_norm": 0.2927076518535614, "learning_rate": 2.1576382010645764e-05, "loss": 0.0318, "step": 13450 }, { "epoch": 12.905081495685522, "grad_norm": 0.2444140613079071, "learning_rate": 2.1504829650274672e-05, "loss": 0.034, "step": 13460 }, { "epoch": 12.914669223394055, "grad_norm": 0.17273946106433868, "learning_rate": 2.1433363610907147e-05, "loss": 0.0339, "step": 13470 }, { "epoch": 12.924256951102588, "grad_norm": 0.3511595129966736, "learning_rate": 2.1361984109036765e-05, "loss": 0.0284, "step": 13480 }, { "epoch": 12.933844678811122, "grad_norm": 0.21930259466171265, "learning_rate": 2.1290691360894872e-05, "loss": 0.0337, "step": 13490 }, { "epoch": 12.943432406519655, "grad_norm": 0.13534465432167053, "learning_rate": 2.121948558245008e-05, "loss": 0.0325, "step": 13500 }, { "epoch": 12.953020134228188, "grad_norm": 0.25757452845573425, "learning_rate": 2.1148366989407496e-05, "loss": 0.0344, "step": 13510 }, { "epoch": 12.962607861936721, "grad_norm": 0.3126337230205536, "learning_rate": 2.1077335797208153e-05, "loss": 0.0266, "step": 13520 }, { "epoch": 12.972195589645255, "grad_norm": 0.2144749015569687, "learning_rate": 2.100639222102827e-05, "loss": 0.0296, "step": 13530 }, { "epoch": 12.981783317353788, "grad_norm": 0.33655446767807007, "learning_rate": 2.0935536475778682e-05, "loss": 0.0319, "step": 13540 }, { "epoch": 12.991371045062321, "grad_norm": 0.16992558538913727, "learning_rate": 2.0864768776104183e-05, "loss": 0.0335, "step": 13550 }, { "epoch": 13.000958772770852, "grad_norm": 0.2082756608724594, "learning_rate": 2.079408933638279e-05, "loss": 0.0338, "step": 13560 }, { "epoch": 13.010546500479386, "grad_norm": 0.2862843871116638, "learning_rate": 2.0723498370725162e-05, "loss": 0.0289, "step": 13570 }, { "epoch": 13.020134228187919, "grad_norm": 0.29127344489097595, "learning_rate": 2.0652996092973974e-05, "loss": 0.0379, "step": 13580 }, { "epoch": 13.029721955896452, "grad_norm": 0.1825907677412033, "learning_rate": 2.0582582716703243e-05, "loss": 0.0267, "step": 13590 }, { "epoch": 13.039309683604985, "grad_norm": 0.20657765865325928, "learning_rate": 2.0512258455217636e-05, "loss": 0.0337, "step": 13600 }, { "epoch": 13.048897411313519, "grad_norm": 0.20046214759349823, "learning_rate": 2.044202352155185e-05, "loss": 0.0256, "step": 13610 }, { "epoch": 13.058485139022052, "grad_norm": 0.23749665915966034, "learning_rate": 2.0371878128470047e-05, "loss": 0.033, "step": 13620 }, { "epoch": 13.068072866730585, "grad_norm": 0.1981140673160553, "learning_rate": 2.0301822488465106e-05, "loss": 0.0323, "step": 13630 }, { "epoch": 13.077660594439118, "grad_norm": 0.3064008951187134, "learning_rate": 2.0231856813757995e-05, "loss": 0.029, "step": 13640 }, { "epoch": 13.087248322147651, "grad_norm": 0.3160218596458435, "learning_rate": 2.016198131629716e-05, "loss": 0.0317, "step": 13650 }, { "epoch": 13.096836049856185, "grad_norm": 0.1925330013036728, "learning_rate": 2.0092196207757886e-05, "loss": 0.0308, "step": 13660 }, { "epoch": 13.106423777564718, "grad_norm": 0.2060590237379074, "learning_rate": 2.002250169954165e-05, "loss": 0.0352, "step": 13670 }, { "epoch": 13.116011505273251, "grad_norm": 0.21879933774471283, "learning_rate": 1.9952898002775444e-05, "loss": 0.0262, "step": 13680 }, { "epoch": 13.125599232981783, "grad_norm": 0.22108188271522522, "learning_rate": 1.9883385328311155e-05, "loss": 0.0333, "step": 13690 }, { "epoch": 13.135186960690316, "grad_norm": 0.26251569390296936, "learning_rate": 1.981396388672496e-05, "loss": 0.0314, "step": 13700 }, { "epoch": 13.144774688398849, "grad_norm": 0.29389551281929016, "learning_rate": 1.9744633888316684e-05, "loss": 0.0333, "step": 13710 }, { "epoch": 13.154362416107382, "grad_norm": 0.1754542887210846, "learning_rate": 1.9675395543109087e-05, "loss": 0.0306, "step": 13720 }, { "epoch": 13.163950143815915, "grad_norm": 0.2529279589653015, "learning_rate": 1.9606249060847275e-05, "loss": 0.029, "step": 13730 }, { "epoch": 13.173537871524449, "grad_norm": 0.25833970308303833, "learning_rate": 1.9537194650998176e-05, "loss": 0.0257, "step": 13740 }, { "epoch": 13.183125599232982, "grad_norm": 0.2809722423553467, "learning_rate": 1.9468232522749685e-05, "loss": 0.03, "step": 13750 }, { "epoch": 13.192713326941515, "grad_norm": 0.2745196521282196, "learning_rate": 1.9399362885010186e-05, "loss": 0.0259, "step": 13760 }, { "epoch": 13.202301054650048, "grad_norm": 0.26047447323799133, "learning_rate": 1.9330585946407896e-05, "loss": 0.0293, "step": 13770 }, { "epoch": 13.211888782358582, "grad_norm": 0.2309299260377884, "learning_rate": 1.9261901915290222e-05, "loss": 0.0263, "step": 13780 }, { "epoch": 13.221476510067115, "grad_norm": 0.19574059545993805, "learning_rate": 1.9193310999723086e-05, "loss": 0.0256, "step": 13790 }, { "epoch": 13.231064237775648, "grad_norm": 0.24411630630493164, "learning_rate": 1.9124813407490345e-05, "loss": 0.0266, "step": 13800 }, { "epoch": 13.24065196548418, "grad_norm": 0.2317860871553421, "learning_rate": 1.9056409346093167e-05, "loss": 0.0362, "step": 13810 }, { "epoch": 13.250239693192713, "grad_norm": 0.34288397431373596, "learning_rate": 1.89880990227494e-05, "loss": 0.031, "step": 13820 }, { "epoch": 13.259827420901246, "grad_norm": 0.22115236520767212, "learning_rate": 1.8919882644392894e-05, "loss": 0.0303, "step": 13830 }, { "epoch": 13.269415148609779, "grad_norm": 0.1675620973110199, "learning_rate": 1.8851760417672897e-05, "loss": 0.0267, "step": 13840 }, { "epoch": 13.279002876318312, "grad_norm": 0.22504985332489014, "learning_rate": 1.8783732548953487e-05, "loss": 0.03, "step": 13850 }, { "epoch": 13.288590604026846, "grad_norm": 0.2568277418613434, "learning_rate": 1.87157992443129e-05, "loss": 0.0347, "step": 13860 }, { "epoch": 13.298178331735379, "grad_norm": 0.24830462038516998, "learning_rate": 1.8647960709542866e-05, "loss": 0.0313, "step": 13870 }, { "epoch": 13.307766059443912, "grad_norm": 0.1982988864183426, "learning_rate": 1.8580217150148034e-05, "loss": 0.0286, "step": 13880 }, { "epoch": 13.317353787152445, "grad_norm": 0.17509537935256958, "learning_rate": 1.851256877134538e-05, "loss": 0.0283, "step": 13890 }, { "epoch": 13.326941514860978, "grad_norm": 0.27267399430274963, "learning_rate": 1.8445015778063528e-05, "loss": 0.0308, "step": 13900 }, { "epoch": 13.336529242569512, "grad_norm": 0.2444014698266983, "learning_rate": 1.8377558374942143e-05, "loss": 0.0335, "step": 13910 }, { "epoch": 13.346116970278045, "grad_norm": 0.4355910122394562, "learning_rate": 1.831019676633129e-05, "loss": 0.0326, "step": 13920 }, { "epoch": 13.355704697986576, "grad_norm": 0.6526142954826355, "learning_rate": 1.8242931156290893e-05, "loss": 0.0299, "step": 13930 }, { "epoch": 13.36529242569511, "grad_norm": 0.20145297050476074, "learning_rate": 1.8175761748590063e-05, "loss": 0.0315, "step": 13940 }, { "epoch": 13.374880153403643, "grad_norm": 0.22952324151992798, "learning_rate": 1.8108688746706427e-05, "loss": 0.031, "step": 13950 }, { "epoch": 13.384467881112176, "grad_norm": 0.38137954473495483, "learning_rate": 1.8041712353825635e-05, "loss": 0.0387, "step": 13960 }, { "epoch": 13.39405560882071, "grad_norm": 0.2673424482345581, "learning_rate": 1.7974832772840617e-05, "loss": 0.0272, "step": 13970 }, { "epoch": 13.403643336529242, "grad_norm": 0.2189689427614212, "learning_rate": 1.790805020635109e-05, "loss": 0.0317, "step": 13980 }, { "epoch": 13.413231064237776, "grad_norm": 1.2192716598510742, "learning_rate": 1.7841364856662824e-05, "loss": 0.0258, "step": 13990 }, { "epoch": 13.422818791946309, "grad_norm": 0.13329686224460602, "learning_rate": 1.7774776925787136e-05, "loss": 0.0257, "step": 14000 }, { "epoch": 13.432406519654842, "grad_norm": 0.2741002142429352, "learning_rate": 1.7708286615440183e-05, "loss": 0.0271, "step": 14010 }, { "epoch": 13.441994247363375, "grad_norm": 0.7737520337104797, "learning_rate": 1.764189412704247e-05, "loss": 0.0283, "step": 14020 }, { "epoch": 13.451581975071909, "grad_norm": 0.24316097795963287, "learning_rate": 1.7575599661718068e-05, "loss": 0.0302, "step": 14030 }, { "epoch": 13.461169702780442, "grad_norm": 0.23543784022331238, "learning_rate": 1.7509403420294208e-05, "loss": 0.0311, "step": 14040 }, { "epoch": 13.470757430488973, "grad_norm": 0.19010919332504272, "learning_rate": 1.7443305603300497e-05, "loss": 0.0276, "step": 14050 }, { "epoch": 13.480345158197506, "grad_norm": 0.1994113028049469, "learning_rate": 1.7377306410968396e-05, "loss": 0.0298, "step": 14060 }, { "epoch": 13.48993288590604, "grad_norm": 0.30696478486061096, "learning_rate": 1.731140604323063e-05, "loss": 0.0275, "step": 14070 }, { "epoch": 13.499520613614573, "grad_norm": 0.3128091096878052, "learning_rate": 1.7245604699720535e-05, "loss": 0.0272, "step": 14080 }, { "epoch": 13.509108341323106, "grad_norm": 2.206577777862549, "learning_rate": 1.7179902579771474e-05, "loss": 0.0326, "step": 14090 }, { "epoch": 13.51869606903164, "grad_norm": 0.18835577368736267, "learning_rate": 1.711429988241619e-05, "loss": 0.0276, "step": 14100 }, { "epoch": 13.528283796740173, "grad_norm": 0.2255256026983261, "learning_rate": 1.7048796806386304e-05, "loss": 0.0301, "step": 14110 }, { "epoch": 13.537871524448706, "grad_norm": 0.3144644796848297, "learning_rate": 1.6983393550111648e-05, "loss": 0.0324, "step": 14120 }, { "epoch": 13.547459252157239, "grad_norm": 0.20487931370735168, "learning_rate": 1.691809031171962e-05, "loss": 0.0352, "step": 14130 }, { "epoch": 13.557046979865772, "grad_norm": 0.22863590717315674, "learning_rate": 1.6852887289034632e-05, "loss": 0.0343, "step": 14140 }, { "epoch": 13.566634707574305, "grad_norm": 0.30829718708992004, "learning_rate": 1.67877846795776e-05, "loss": 0.0342, "step": 14150 }, { "epoch": 13.576222435282839, "grad_norm": 0.2026831954717636, "learning_rate": 1.672278268056516e-05, "loss": 0.0266, "step": 14160 }, { "epoch": 13.585810162991372, "grad_norm": 0.18998700380325317, "learning_rate": 1.6657881488909192e-05, "loss": 0.0316, "step": 14170 }, { "epoch": 13.595397890699903, "grad_norm": 0.2338184267282486, "learning_rate": 1.659308130121622e-05, "loss": 0.0315, "step": 14180 }, { "epoch": 13.604985618408437, "grad_norm": 0.421129047870636, "learning_rate": 1.6528382313786784e-05, "loss": 0.0322, "step": 14190 }, { "epoch": 13.61457334611697, "grad_norm": 0.28092893958091736, "learning_rate": 1.6463784722614845e-05, "loss": 0.0269, "step": 14200 }, { "epoch": 13.624161073825503, "grad_norm": 0.19112944602966309, "learning_rate": 1.6399288723387195e-05, "loss": 0.0258, "step": 14210 }, { "epoch": 13.633748801534036, "grad_norm": 0.286045640707016, "learning_rate": 1.63348945114829e-05, "loss": 0.0324, "step": 14220 }, { "epoch": 13.64333652924257, "grad_norm": 0.280977338552475, "learning_rate": 1.6270602281972686e-05, "loss": 0.0265, "step": 14230 }, { "epoch": 13.652924256951103, "grad_norm": 0.28009748458862305, "learning_rate": 1.6206412229618307e-05, "loss": 0.034, "step": 14240 }, { "epoch": 13.662511984659636, "grad_norm": 0.2950078845024109, "learning_rate": 1.6142324548871978e-05, "loss": 0.0332, "step": 14250 }, { "epoch": 13.67209971236817, "grad_norm": 0.19593513011932373, "learning_rate": 1.607833943387585e-05, "loss": 0.0322, "step": 14260 }, { "epoch": 13.681687440076702, "grad_norm": 0.3256717026233673, "learning_rate": 1.6014457078461353e-05, "loss": 0.0311, "step": 14270 }, { "epoch": 13.691275167785236, "grad_norm": 0.48480740189552307, "learning_rate": 1.59506776761486e-05, "loss": 0.0265, "step": 14280 }, { "epoch": 13.700862895493769, "grad_norm": 0.17794422805309296, "learning_rate": 1.588700142014583e-05, "loss": 0.0302, "step": 14290 }, { "epoch": 13.7104506232023, "grad_norm": 0.21641989052295685, "learning_rate": 1.5823428503348846e-05, "loss": 0.0269, "step": 14300 }, { "epoch": 13.720038350910833, "grad_norm": 0.21487939357757568, "learning_rate": 1.57599591183404e-05, "loss": 0.0333, "step": 14310 }, { "epoch": 13.729626078619367, "grad_norm": 0.20198583602905273, "learning_rate": 1.569659345738959e-05, "loss": 0.0316, "step": 14320 }, { "epoch": 13.7392138063279, "grad_norm": 0.24818021059036255, "learning_rate": 1.5633331712451287e-05, "loss": 0.0322, "step": 14330 }, { "epoch": 13.748801534036433, "grad_norm": 0.3211008906364441, "learning_rate": 1.5570174075165617e-05, "loss": 0.0286, "step": 14340 }, { "epoch": 13.758389261744966, "grad_norm": 0.27913060784339905, "learning_rate": 1.5507120736857316e-05, "loss": 0.0309, "step": 14350 }, { "epoch": 13.7679769894535, "grad_norm": 0.3094828724861145, "learning_rate": 1.5444171888535127e-05, "loss": 0.0262, "step": 14360 }, { "epoch": 13.777564717162033, "grad_norm": 0.26376375555992126, "learning_rate": 1.538132772089131e-05, "loss": 0.0312, "step": 14370 }, { "epoch": 13.787152444870566, "grad_norm": 0.27103152871131897, "learning_rate": 1.531858842430096e-05, "loss": 0.029, "step": 14380 }, { "epoch": 13.7967401725791, "grad_norm": 0.2528936564922333, "learning_rate": 1.5255954188821554e-05, "loss": 0.0302, "step": 14390 }, { "epoch": 13.806327900287632, "grad_norm": 0.2022869884967804, "learning_rate": 1.519342520419223e-05, "loss": 0.028, "step": 14400 }, { "epoch": 13.815915627996166, "grad_norm": 0.2736548185348511, "learning_rate": 1.5131001659833349e-05, "loss": 0.0391, "step": 14410 }, { "epoch": 13.825503355704697, "grad_norm": 0.20340123772621155, "learning_rate": 1.5068683744845802e-05, "loss": 0.0259, "step": 14420 }, { "epoch": 13.83509108341323, "grad_norm": 0.30253875255584717, "learning_rate": 1.5006471648010567e-05, "loss": 0.0318, "step": 14430 }, { "epoch": 13.844678811121764, "grad_norm": 0.18290819227695465, "learning_rate": 1.4944365557787982e-05, "loss": 0.0266, "step": 14440 }, { "epoch": 13.854266538830297, "grad_norm": 0.17378397285938263, "learning_rate": 1.4882365662317338e-05, "loss": 0.0307, "step": 14450 }, { "epoch": 13.86385426653883, "grad_norm": 0.17450757324695587, "learning_rate": 1.4820472149416154e-05, "loss": 0.0375, "step": 14460 }, { "epoch": 13.873441994247363, "grad_norm": 0.17673359811306, "learning_rate": 1.4758685206579754e-05, "loss": 0.0336, "step": 14470 }, { "epoch": 13.883029721955896, "grad_norm": 0.17782671749591827, "learning_rate": 1.4697005020980547e-05, "loss": 0.0264, "step": 14480 }, { "epoch": 13.89261744966443, "grad_norm": 0.22997714579105377, "learning_rate": 1.4635431779467628e-05, "loss": 0.0364, "step": 14490 }, { "epoch": 13.902205177372963, "grad_norm": 0.23629331588745117, "learning_rate": 1.4573965668566037e-05, "loss": 0.0293, "step": 14500 }, { "epoch": 13.911792905081496, "grad_norm": 0.2348259836435318, "learning_rate": 1.4512606874476348e-05, "loss": 0.0296, "step": 14510 }, { "epoch": 13.92138063279003, "grad_norm": 0.2225087732076645, "learning_rate": 1.4451355583074027e-05, "loss": 0.0286, "step": 14520 }, { "epoch": 13.930968360498563, "grad_norm": 0.23287685215473175, "learning_rate": 1.4390211979908847e-05, "loss": 0.0279, "step": 14530 }, { "epoch": 13.940556088207096, "grad_norm": 0.19362808763980865, "learning_rate": 1.4329176250204369e-05, "loss": 0.0334, "step": 14540 }, { "epoch": 13.950143815915627, "grad_norm": 0.25659292936325073, "learning_rate": 1.4268248578857384e-05, "loss": 0.0286, "step": 14550 }, { "epoch": 13.95973154362416, "grad_norm": 0.19965949654579163, "learning_rate": 1.4207429150437368e-05, "loss": 0.0336, "step": 14560 }, { "epoch": 13.969319271332694, "grad_norm": 0.21127323806285858, "learning_rate": 1.4146718149185833e-05, "loss": 0.0311, "step": 14570 }, { "epoch": 13.978906999041227, "grad_norm": 0.2175043374300003, "learning_rate": 1.408611575901585e-05, "loss": 0.0232, "step": 14580 }, { "epoch": 13.98849472674976, "grad_norm": 0.2855774462223053, "learning_rate": 1.4025622163511498e-05, "loss": 0.03, "step": 14590 }, { "epoch": 13.998082454458293, "grad_norm": 0.27606961131095886, "learning_rate": 1.3965237545927274e-05, "loss": 0.0285, "step": 14600 }, { "epoch": 14.007670182166827, "grad_norm": 0.20237654447555542, "learning_rate": 1.3904962089187529e-05, "loss": 0.0263, "step": 14610 }, { "epoch": 14.01725790987536, "grad_norm": 0.17577792704105377, "learning_rate": 1.3844795975885921e-05, "loss": 0.028, "step": 14620 }, { "epoch": 14.026845637583893, "grad_norm": 0.24930806457996368, "learning_rate": 1.3784739388284911e-05, "loss": 0.0308, "step": 14630 }, { "epoch": 14.036433365292426, "grad_norm": 0.16480274498462677, "learning_rate": 1.372479250831516e-05, "loss": 0.0301, "step": 14640 }, { "epoch": 14.04602109300096, "grad_norm": 0.20912165939807892, "learning_rate": 1.3664955517574968e-05, "loss": 0.0278, "step": 14650 }, { "epoch": 14.055608820709493, "grad_norm": 0.3317655622959137, "learning_rate": 1.3605228597329738e-05, "loss": 0.0317, "step": 14660 }, { "epoch": 14.065196548418024, "grad_norm": 0.240800142288208, "learning_rate": 1.3545611928511475e-05, "loss": 0.0352, "step": 14670 }, { "epoch": 14.074784276126557, "grad_norm": 0.2574955224990845, "learning_rate": 1.3486105691718187e-05, "loss": 0.0272, "step": 14680 }, { "epoch": 14.08437200383509, "grad_norm": 0.26954057812690735, "learning_rate": 1.3426710067213322e-05, "loss": 0.0309, "step": 14690 }, { "epoch": 14.093959731543624, "grad_norm": 0.23546206951141357, "learning_rate": 1.336742523492523e-05, "loss": 0.0332, "step": 14700 }, { "epoch": 14.103547459252157, "grad_norm": 0.2285180389881134, "learning_rate": 1.3308251374446734e-05, "loss": 0.0436, "step": 14710 }, { "epoch": 14.11313518696069, "grad_norm": 0.22198130190372467, "learning_rate": 1.324918866503439e-05, "loss": 0.0283, "step": 14720 }, { "epoch": 14.122722914669223, "grad_norm": 0.37202128767967224, "learning_rate": 1.3190237285608076e-05, "loss": 0.0296, "step": 14730 }, { "epoch": 14.132310642377757, "grad_norm": 0.2728140652179718, "learning_rate": 1.3131397414750385e-05, "loss": 0.0313, "step": 14740 }, { "epoch": 14.14189837008629, "grad_norm": 0.19201789796352386, "learning_rate": 1.3072669230706197e-05, "loss": 0.0315, "step": 14750 }, { "epoch": 14.151486097794823, "grad_norm": 0.2704322040081024, "learning_rate": 1.3014052911381974e-05, "loss": 0.0279, "step": 14760 }, { "epoch": 14.161073825503356, "grad_norm": 0.23162490129470825, "learning_rate": 1.2955548634345327e-05, "loss": 0.0288, "step": 14770 }, { "epoch": 14.17066155321189, "grad_norm": 0.1527073085308075, "learning_rate": 1.289715657682447e-05, "loss": 0.0287, "step": 14780 }, { "epoch": 14.180249280920421, "grad_norm": 0.48836442828178406, "learning_rate": 1.2838876915707681e-05, "loss": 0.0334, "step": 14790 }, { "epoch": 14.189837008628954, "grad_norm": 0.22852776944637299, "learning_rate": 1.2780709827542708e-05, "loss": 0.0301, "step": 14800 }, { "epoch": 14.199424736337487, "grad_norm": 1.632561445236206, "learning_rate": 1.2722655488536294e-05, "loss": 0.0296, "step": 14810 }, { "epoch": 14.20901246404602, "grad_norm": 0.20910300314426422, "learning_rate": 1.2664714074553652e-05, "loss": 0.0277, "step": 14820 }, { "epoch": 14.218600191754554, "grad_norm": 0.284138023853302, "learning_rate": 1.260688576111791e-05, "loss": 0.0275, "step": 14830 }, { "epoch": 14.228187919463087, "grad_norm": 0.24799588322639465, "learning_rate": 1.2549170723409549e-05, "loss": 0.0291, "step": 14840 }, { "epoch": 14.23777564717162, "grad_norm": 0.18639959394931793, "learning_rate": 1.2491569136265896e-05, "loss": 0.0284, "step": 14850 }, { "epoch": 14.247363374880154, "grad_norm": 0.19724729657173157, "learning_rate": 1.243408117418064e-05, "loss": 0.0266, "step": 14860 }, { "epoch": 14.256951102588687, "grad_norm": 0.1451575756072998, "learning_rate": 1.2376707011303257e-05, "loss": 0.0313, "step": 14870 }, { "epoch": 14.26653883029722, "grad_norm": 0.13136418163776398, "learning_rate": 1.2319446821438458e-05, "loss": 0.0257, "step": 14880 }, { "epoch": 14.276126558005753, "grad_norm": 0.212480828166008, "learning_rate": 1.2262300778045693e-05, "loss": 0.0309, "step": 14890 }, { "epoch": 14.285714285714286, "grad_norm": 0.179280087351799, "learning_rate": 1.220526905423866e-05, "loss": 0.0334, "step": 14900 }, { "epoch": 14.29530201342282, "grad_norm": 0.19260522723197937, "learning_rate": 1.2148351822784748e-05, "loss": 0.0321, "step": 14910 }, { "epoch": 14.304889741131351, "grad_norm": 0.2079414278268814, "learning_rate": 1.2091549256104457e-05, "loss": 0.0314, "step": 14920 }, { "epoch": 14.314477468839884, "grad_norm": 0.1942739635705948, "learning_rate": 1.2034861526270996e-05, "loss": 0.0307, "step": 14930 }, { "epoch": 14.324065196548418, "grad_norm": 0.28928378224372864, "learning_rate": 1.1978288805009641e-05, "loss": 0.0267, "step": 14940 }, { "epoch": 14.33365292425695, "grad_norm": 0.3712955415248871, "learning_rate": 1.192183126369732e-05, "loss": 0.0329, "step": 14950 }, { "epoch": 14.343240651965484, "grad_norm": 0.22929075360298157, "learning_rate": 1.1865489073361996e-05, "loss": 0.0264, "step": 14960 }, { "epoch": 14.352828379674017, "grad_norm": 0.31317007541656494, "learning_rate": 1.1809262404682247e-05, "loss": 0.0242, "step": 14970 }, { "epoch": 14.36241610738255, "grad_norm": 0.5237254500389099, "learning_rate": 1.1753151427986646e-05, "loss": 0.0292, "step": 14980 }, { "epoch": 14.372003835091084, "grad_norm": 0.21789228916168213, "learning_rate": 1.169715631325336e-05, "loss": 0.0314, "step": 14990 }, { "epoch": 14.381591562799617, "grad_norm": 0.29379501938819885, "learning_rate": 1.1641277230109492e-05, "loss": 0.0332, "step": 15000 }, { "epoch": 14.39117929050815, "grad_norm": 0.17771072685718536, "learning_rate": 1.1585514347830738e-05, "loss": 0.0267, "step": 15010 }, { "epoch": 14.400767018216683, "grad_norm": 0.24794255197048187, "learning_rate": 1.1529867835340707e-05, "loss": 0.0267, "step": 15020 }, { "epoch": 14.410354745925215, "grad_norm": 0.21468493342399597, "learning_rate": 1.1474337861210543e-05, "loss": 0.0267, "step": 15030 }, { "epoch": 14.419942473633748, "grad_norm": 0.17512547969818115, "learning_rate": 1.1418924593658314e-05, "loss": 0.0239, "step": 15040 }, { "epoch": 14.429530201342281, "grad_norm": 0.2626974284648895, "learning_rate": 1.1363628200548593e-05, "loss": 0.0328, "step": 15050 }, { "epoch": 14.439117929050814, "grad_norm": 0.21883651614189148, "learning_rate": 1.1308448849391846e-05, "loss": 0.0283, "step": 15060 }, { "epoch": 14.448705656759348, "grad_norm": 0.2517321705818176, "learning_rate": 1.1253386707344044e-05, "loss": 0.0319, "step": 15070 }, { "epoch": 14.458293384467881, "grad_norm": 0.23790787160396576, "learning_rate": 1.1198441941206033e-05, "loss": 0.0254, "step": 15080 }, { "epoch": 14.467881112176414, "grad_norm": 0.2755306363105774, "learning_rate": 1.1143614717423145e-05, "loss": 0.0297, "step": 15090 }, { "epoch": 14.477468839884947, "grad_norm": 0.17343682050704956, "learning_rate": 1.1088905202084604e-05, "loss": 0.0271, "step": 15100 }, { "epoch": 14.48705656759348, "grad_norm": 0.4037168323993683, "learning_rate": 1.1034313560923032e-05, "loss": 0.0318, "step": 15110 }, { "epoch": 14.496644295302014, "grad_norm": 0.25027063488960266, "learning_rate": 1.097983995931407e-05, "loss": 0.0344, "step": 15120 }, { "epoch": 14.506232023010547, "grad_norm": 0.2531662583351135, "learning_rate": 1.0925484562275678e-05, "loss": 0.0336, "step": 15130 }, { "epoch": 14.51581975071908, "grad_norm": 0.27917400002479553, "learning_rate": 1.0871247534467788e-05, "loss": 0.0316, "step": 15140 }, { "epoch": 14.525407478427613, "grad_norm": 0.26147523522377014, "learning_rate": 1.0817129040191698e-05, "loss": 0.0278, "step": 15150 }, { "epoch": 14.534995206136145, "grad_norm": 0.24168430268764496, "learning_rate": 1.076312924338973e-05, "loss": 0.03, "step": 15160 }, { "epoch": 14.544582933844678, "grad_norm": 0.17934760451316833, "learning_rate": 1.0709248307644559e-05, "loss": 0.0275, "step": 15170 }, { "epoch": 14.554170661553211, "grad_norm": 0.38495177030563354, "learning_rate": 1.0655486396178782e-05, "loss": 0.0317, "step": 15180 }, { "epoch": 14.563758389261745, "grad_norm": 0.22225984930992126, "learning_rate": 1.0601843671854477e-05, "loss": 0.0312, "step": 15190 }, { "epoch": 14.573346116970278, "grad_norm": 0.29296278953552246, "learning_rate": 1.0548320297172665e-05, "loss": 0.0315, "step": 15200 }, { "epoch": 14.582933844678811, "grad_norm": 0.3371207118034363, "learning_rate": 1.0494916434272783e-05, "loss": 0.0299, "step": 15210 }, { "epoch": 14.592521572387344, "grad_norm": 0.220375657081604, "learning_rate": 1.0441632244932237e-05, "loss": 0.0265, "step": 15220 }, { "epoch": 14.602109300095877, "grad_norm": 0.1987174153327942, "learning_rate": 1.0388467890565928e-05, "loss": 0.0261, "step": 15230 }, { "epoch": 14.61169702780441, "grad_norm": 0.25363320112228394, "learning_rate": 1.0335423532225735e-05, "loss": 0.0301, "step": 15240 }, { "epoch": 14.621284755512944, "grad_norm": 0.22231195867061615, "learning_rate": 1.028249933060001e-05, "loss": 0.0353, "step": 15250 }, { "epoch": 14.630872483221477, "grad_norm": 0.20641197264194489, "learning_rate": 1.022969544601311e-05, "loss": 0.0254, "step": 15260 }, { "epoch": 14.64046021093001, "grad_norm": 0.25588056445121765, "learning_rate": 1.0177012038424927e-05, "loss": 0.0327, "step": 15270 }, { "epoch": 14.650047938638544, "grad_norm": 0.3196217715740204, "learning_rate": 1.0124449267430414e-05, "loss": 0.0306, "step": 15280 }, { "epoch": 14.659635666347075, "grad_norm": 0.37711241841316223, "learning_rate": 1.0072007292259029e-05, "loss": 0.0314, "step": 15290 }, { "epoch": 14.669223394055608, "grad_norm": 0.299496591091156, "learning_rate": 1.0019686271774314e-05, "loss": 0.0273, "step": 15300 }, { "epoch": 14.678811121764141, "grad_norm": 0.20070233941078186, "learning_rate": 9.967486364473416e-06, "loss": 0.0348, "step": 15310 }, { "epoch": 14.688398849472675, "grad_norm": 0.1786354035139084, "learning_rate": 9.915407728486603e-06, "loss": 0.0315, "step": 15320 }, { "epoch": 14.697986577181208, "grad_norm": 0.19913482666015625, "learning_rate": 9.863450521576729e-06, "loss": 0.0332, "step": 15330 }, { "epoch": 14.707574304889741, "grad_norm": 0.26217663288116455, "learning_rate": 9.81161490113885e-06, "loss": 0.0299, "step": 15340 }, { "epoch": 14.717162032598274, "grad_norm": 0.17626221477985382, "learning_rate": 9.759901024199642e-06, "loss": 0.0258, "step": 15350 }, { "epoch": 14.726749760306808, "grad_norm": 0.5230224132537842, "learning_rate": 9.708309047417041e-06, "loss": 0.0286, "step": 15360 }, { "epoch": 14.73633748801534, "grad_norm": 0.19318176805973053, "learning_rate": 9.656839127079659e-06, "loss": 0.0254, "step": 15370 }, { "epoch": 14.745925215723874, "grad_norm": 0.30321067571640015, "learning_rate": 9.6054914191064e-06, "loss": 0.0304, "step": 15380 }, { "epoch": 14.755512943432407, "grad_norm": 0.2519323229789734, "learning_rate": 9.554266079045909e-06, "loss": 0.0325, "step": 15390 }, { "epoch": 14.765100671140939, "grad_norm": 0.24592278897762299, "learning_rate": 9.503163262076181e-06, "loss": 0.0336, "step": 15400 }, { "epoch": 14.774688398849472, "grad_norm": 0.19091877341270447, "learning_rate": 9.452183123004e-06, "loss": 0.0247, "step": 15410 }, { "epoch": 14.784276126558005, "grad_norm": 0.26081383228302, "learning_rate": 9.401325816264573e-06, "loss": 0.0333, "step": 15420 }, { "epoch": 14.793863854266538, "grad_norm": 0.27854666113853455, "learning_rate": 9.350591495920952e-06, "loss": 0.024, "step": 15430 }, { "epoch": 14.803451581975072, "grad_norm": 0.36169877648353577, "learning_rate": 9.299980315663686e-06, "loss": 0.031, "step": 15440 }, { "epoch": 14.813039309683605, "grad_norm": 0.18000735342502594, "learning_rate": 9.24949242881023e-06, "loss": 0.0289, "step": 15450 }, { "epoch": 14.822627037392138, "grad_norm": 0.25608521699905396, "learning_rate": 9.199127988304607e-06, "loss": 0.0284, "step": 15460 }, { "epoch": 14.832214765100671, "grad_norm": 0.2771013379096985, "learning_rate": 9.148887146716812e-06, "loss": 0.0283, "step": 15470 }, { "epoch": 14.841802492809204, "grad_norm": 0.17078572511672974, "learning_rate": 9.09877005624249e-06, "loss": 0.0294, "step": 15480 }, { "epoch": 14.851390220517738, "grad_norm": 0.17408467829227448, "learning_rate": 9.048776868702347e-06, "loss": 0.0255, "step": 15490 }, { "epoch": 14.860977948226271, "grad_norm": 0.20527216792106628, "learning_rate": 8.998907735541789e-06, "loss": 0.0329, "step": 15500 }, { "epoch": 14.870565675934804, "grad_norm": 0.23558159172534943, "learning_rate": 8.94916280783038e-06, "loss": 0.0294, "step": 15510 }, { "epoch": 14.880153403643337, "grad_norm": 0.16163650155067444, "learning_rate": 8.89954223626146e-06, "loss": 0.0264, "step": 15520 }, { "epoch": 14.889741131351869, "grad_norm": 0.2564382255077362, "learning_rate": 8.850046171151666e-06, "loss": 0.0332, "step": 15530 }, { "epoch": 14.899328859060402, "grad_norm": 0.2050989419221878, "learning_rate": 8.80067476244042e-06, "loss": 0.0307, "step": 15540 }, { "epoch": 14.908916586768935, "grad_norm": 0.18448740243911743, "learning_rate": 8.751428159689528e-06, "loss": 0.0306, "step": 15550 }, { "epoch": 14.918504314477468, "grad_norm": 0.29133155941963196, "learning_rate": 8.702306512082753e-06, "loss": 0.0243, "step": 15560 }, { "epoch": 14.928092042186002, "grad_norm": 0.141392782330513, "learning_rate": 8.653309968425322e-06, "loss": 0.0242, "step": 15570 }, { "epoch": 14.937679769894535, "grad_norm": 0.21134333312511444, "learning_rate": 8.60443867714345e-06, "loss": 0.0318, "step": 15580 }, { "epoch": 14.947267497603068, "grad_norm": 0.2590806484222412, "learning_rate": 8.55569278628393e-06, "loss": 0.0253, "step": 15590 }, { "epoch": 14.956855225311601, "grad_norm": 0.21871857345104218, "learning_rate": 8.507072443513702e-06, "loss": 0.0258, "step": 15600 }, { "epoch": 14.966442953020135, "grad_norm": 0.25187286734580994, "learning_rate": 8.458577796119382e-06, "loss": 0.03, "step": 15610 }, { "epoch": 14.976030680728668, "grad_norm": 0.17888393998146057, "learning_rate": 8.410208991006784e-06, "loss": 0.0274, "step": 15620 }, { "epoch": 14.985618408437201, "grad_norm": 0.1486871838569641, "learning_rate": 8.361966174700514e-06, "loss": 0.0269, "step": 15630 }, { "epoch": 14.995206136145734, "grad_norm": 0.6585232019424438, "learning_rate": 8.31384949334353e-06, "loss": 0.0294, "step": 15640 }, { "epoch": 15.004793863854266, "grad_norm": 0.36748427152633667, "learning_rate": 8.265859092696686e-06, "loss": 0.0318, "step": 15650 }, { "epoch": 15.014381591562799, "grad_norm": 0.22082515060901642, "learning_rate": 8.217995118138294e-06, "loss": 0.0294, "step": 15660 }, { "epoch": 15.023969319271332, "grad_norm": 0.1767498254776001, "learning_rate": 8.170257714663642e-06, "loss": 0.0275, "step": 15670 }, { "epoch": 15.033557046979865, "grad_norm": 0.24185898900032043, "learning_rate": 8.12264702688465e-06, "loss": 0.0279, "step": 15680 }, { "epoch": 15.043144774688399, "grad_norm": 0.22703923285007477, "learning_rate": 8.075163199029357e-06, "loss": 0.0268, "step": 15690 }, { "epoch": 15.052732502396932, "grad_norm": 0.2051907479763031, "learning_rate": 8.027806374941481e-06, "loss": 0.0272, "step": 15700 }, { "epoch": 15.062320230105465, "grad_norm": 0.24761435389518738, "learning_rate": 7.980576698080005e-06, "loss": 0.0301, "step": 15710 }, { "epoch": 15.071907957813998, "grad_norm": 0.17438143491744995, "learning_rate": 7.933474311518796e-06, "loss": 0.0351, "step": 15720 }, { "epoch": 15.081495685522532, "grad_norm": 0.20341135561466217, "learning_rate": 7.88649935794606e-06, "loss": 0.0264, "step": 15730 }, { "epoch": 15.091083413231065, "grad_norm": 0.24047966301441193, "learning_rate": 7.83965197966397e-06, "loss": 0.0268, "step": 15740 }, { "epoch": 15.100671140939598, "grad_norm": 0.19311171770095825, "learning_rate": 7.792932318588264e-06, "loss": 0.033, "step": 15750 }, { "epoch": 15.110258868648131, "grad_norm": 0.18407687544822693, "learning_rate": 7.746340516247779e-06, "loss": 0.0243, "step": 15760 }, { "epoch": 15.119846596356663, "grad_norm": 0.21947818994522095, "learning_rate": 7.69987671378401e-06, "loss": 0.0255, "step": 15770 }, { "epoch": 15.129434324065196, "grad_norm": 0.4175131916999817, "learning_rate": 7.653541051950692e-06, "loss": 0.0245, "step": 15780 }, { "epoch": 15.139022051773729, "grad_norm": 0.29046544432640076, "learning_rate": 7.607333671113409e-06, "loss": 0.0365, "step": 15790 }, { "epoch": 15.148609779482262, "grad_norm": 0.25391921401023865, "learning_rate": 7.561254711249127e-06, "loss": 0.0266, "step": 15800 }, { "epoch": 15.158197507190796, "grad_norm": 0.19595490396022797, "learning_rate": 7.515304311945787e-06, "loss": 0.0306, "step": 15810 }, { "epoch": 15.167785234899329, "grad_norm": 0.1492607444524765, "learning_rate": 7.469482612401857e-06, "loss": 0.0306, "step": 15820 }, { "epoch": 15.177372962607862, "grad_norm": 0.2468632310628891, "learning_rate": 7.423789751425958e-06, "loss": 0.0275, "step": 15830 }, { "epoch": 15.186960690316395, "grad_norm": 0.20901519060134888, "learning_rate": 7.378225867436428e-06, "loss": 0.0252, "step": 15840 }, { "epoch": 15.196548418024928, "grad_norm": 0.28785982728004456, "learning_rate": 7.332791098460867e-06, "loss": 0.0326, "step": 15850 }, { "epoch": 15.206136145733462, "grad_norm": 0.2834322154521942, "learning_rate": 7.287485582135728e-06, "loss": 0.0302, "step": 15860 }, { "epoch": 15.215723873441995, "grad_norm": 0.24561063945293427, "learning_rate": 7.242309455705959e-06, "loss": 0.0292, "step": 15870 }, { "epoch": 15.225311601150528, "grad_norm": 0.23040306568145752, "learning_rate": 7.197262856024539e-06, "loss": 0.0246, "step": 15880 }, { "epoch": 15.234899328859061, "grad_norm": 0.22045479714870453, "learning_rate": 7.152345919552045e-06, "loss": 0.0314, "step": 15890 }, { "epoch": 15.244487056567593, "grad_norm": 0.2748197913169861, "learning_rate": 7.107558782356255e-06, "loss": 0.0292, "step": 15900 }, { "epoch": 15.254074784276126, "grad_norm": 0.2709030210971832, "learning_rate": 7.0629015801117744e-06, "loss": 0.0299, "step": 15910 }, { "epoch": 15.26366251198466, "grad_norm": 0.2666435241699219, "learning_rate": 7.018374448099596e-06, "loss": 0.0324, "step": 15920 }, { "epoch": 15.273250239693192, "grad_norm": 0.32848596572875977, "learning_rate": 6.973977521206654e-06, "loss": 0.0344, "step": 15930 }, { "epoch": 15.282837967401726, "grad_norm": 0.23068153858184814, "learning_rate": 6.929710933925487e-06, "loss": 0.0262, "step": 15940 }, { "epoch": 15.292425695110259, "grad_norm": 0.24479450285434723, "learning_rate": 6.885574820353752e-06, "loss": 0.0269, "step": 15950 }, { "epoch": 15.302013422818792, "grad_norm": 0.21294337511062622, "learning_rate": 6.841569314193902e-06, "loss": 0.0265, "step": 15960 }, { "epoch": 15.311601150527325, "grad_norm": 0.28778862953186035, "learning_rate": 6.797694548752703e-06, "loss": 0.0273, "step": 15970 }, { "epoch": 15.321188878235859, "grad_norm": 0.189237579703331, "learning_rate": 6.753950656940905e-06, "loss": 0.0267, "step": 15980 }, { "epoch": 15.330776605944392, "grad_norm": 0.28015297651290894, "learning_rate": 6.710337771272745e-06, "loss": 0.034, "step": 15990 }, { "epoch": 15.340364333652925, "grad_norm": 0.1625533103942871, "learning_rate": 6.666856023865658e-06, "loss": 0.0233, "step": 16000 }, { "epoch": 15.349952061361458, "grad_norm": 0.21412205696105957, "learning_rate": 6.623505546439773e-06, "loss": 0.0253, "step": 16010 }, { "epoch": 15.35953978906999, "grad_norm": 0.26244086027145386, "learning_rate": 6.580286470317598e-06, "loss": 0.0256, "step": 16020 }, { "epoch": 15.369127516778523, "grad_norm": 0.28637972474098206, "learning_rate": 6.537198926423549e-06, "loss": 0.0283, "step": 16030 }, { "epoch": 15.378715244487056, "grad_norm": 0.2678770124912262, "learning_rate": 6.494243045283621e-06, "loss": 0.0271, "step": 16040 }, { "epoch": 15.38830297219559, "grad_norm": 0.1962299942970276, "learning_rate": 6.45141895702493e-06, "loss": 0.0258, "step": 16050 }, { "epoch": 15.397890699904123, "grad_norm": 0.26651138067245483, "learning_rate": 6.40872679137538e-06, "loss": 0.0276, "step": 16060 }, { "epoch": 15.407478427612656, "grad_norm": 0.23737022280693054, "learning_rate": 6.366166677663204e-06, "loss": 0.0309, "step": 16070 }, { "epoch": 15.417066155321189, "grad_norm": 0.2531161606311798, "learning_rate": 6.323738744816654e-06, "loss": 0.0329, "step": 16080 }, { "epoch": 15.426653883029722, "grad_norm": 0.26035356521606445, "learning_rate": 6.2814431213635065e-06, "loss": 0.0286, "step": 16090 }, { "epoch": 15.436241610738255, "grad_norm": 0.2163701057434082, "learning_rate": 6.239279935430786e-06, "loss": 0.027, "step": 16100 }, { "epoch": 15.445829338446789, "grad_norm": 0.18169005215168, "learning_rate": 6.197249314744275e-06, "loss": 0.024, "step": 16110 }, { "epoch": 15.455417066155322, "grad_norm": 0.24503251910209656, "learning_rate": 6.155351386628205e-06, "loss": 0.0298, "step": 16120 }, { "epoch": 15.465004793863855, "grad_norm": 0.19895343482494354, "learning_rate": 6.113586278004835e-06, "loss": 0.0233, "step": 16130 }, { "epoch": 15.474592521572387, "grad_norm": 0.2949654459953308, "learning_rate": 6.071954115394063e-06, "loss": 0.0256, "step": 16140 }, { "epoch": 15.48418024928092, "grad_norm": 0.13835924863815308, "learning_rate": 6.030455024913029e-06, "loss": 0.029, "step": 16150 }, { "epoch": 15.493767976989453, "grad_norm": 0.36957499384880066, "learning_rate": 5.989089132275799e-06, "loss": 0.0369, "step": 16160 }, { "epoch": 15.503355704697986, "grad_norm": 0.22811642289161682, "learning_rate": 5.947856562792925e-06, "loss": 0.0306, "step": 16170 }, { "epoch": 15.51294343240652, "grad_norm": 0.3362506330013275, "learning_rate": 5.906757441371069e-06, "loss": 0.0346, "step": 16180 }, { "epoch": 15.522531160115053, "grad_norm": 0.20575332641601562, "learning_rate": 5.865791892512623e-06, "loss": 0.0305, "step": 16190 }, { "epoch": 15.532118887823586, "grad_norm": 0.1870652139186859, "learning_rate": 5.824960040315386e-06, "loss": 0.0253, "step": 16200 }, { "epoch": 15.541706615532119, "grad_norm": 0.4694177508354187, "learning_rate": 5.784262008472124e-06, "loss": 0.0287, "step": 16210 }, { "epoch": 15.551294343240652, "grad_norm": 0.2506779134273529, "learning_rate": 5.7436979202702194e-06, "loss": 0.0331, "step": 16220 }, { "epoch": 15.560882070949186, "grad_norm": 0.18632706999778748, "learning_rate": 5.703267898591275e-06, "loss": 0.0234, "step": 16230 }, { "epoch": 15.570469798657719, "grad_norm": 0.14531591534614563, "learning_rate": 5.662972065910799e-06, "loss": 0.0245, "step": 16240 }, { "epoch": 15.580057526366252, "grad_norm": 0.19370119273662567, "learning_rate": 5.622810544297796e-06, "loss": 0.0262, "step": 16250 }, { "epoch": 15.589645254074785, "grad_norm": 0.2350122630596161, "learning_rate": 5.582783455414375e-06, "loss": 0.0262, "step": 16260 }, { "epoch": 15.599232981783317, "grad_norm": 0.2912338078022003, "learning_rate": 5.5428909205154035e-06, "loss": 0.0284, "step": 16270 }, { "epoch": 15.60882070949185, "grad_norm": 0.28382018208503723, "learning_rate": 5.503133060448168e-06, "loss": 0.0257, "step": 16280 }, { "epoch": 15.618408437200383, "grad_norm": 0.1536964774131775, "learning_rate": 5.463509995651978e-06, "loss": 0.0274, "step": 16290 }, { "epoch": 15.627996164908916, "grad_norm": 0.5844811201095581, "learning_rate": 5.4240218461577894e-06, "loss": 0.0294, "step": 16300 }, { "epoch": 15.63758389261745, "grad_norm": 0.2484215646982193, "learning_rate": 5.384668731587844e-06, "loss": 0.0278, "step": 16310 }, { "epoch": 15.647171620325983, "grad_norm": 0.2738986015319824, "learning_rate": 5.345450771155358e-06, "loss": 0.0271, "step": 16320 }, { "epoch": 15.656759348034516, "grad_norm": 0.23017966747283936, "learning_rate": 5.3063680836641095e-06, "loss": 0.0261, "step": 16330 }, { "epoch": 15.66634707574305, "grad_norm": 0.1773134022951126, "learning_rate": 5.2674207875080595e-06, "loss": 0.03, "step": 16340 }, { "epoch": 15.675934803451582, "grad_norm": 0.1907745748758316, "learning_rate": 5.228609000671081e-06, "loss": 0.0224, "step": 16350 }, { "epoch": 15.685522531160116, "grad_norm": 0.2307148277759552, "learning_rate": 5.1899328407264855e-06, "loss": 0.0294, "step": 16360 }, { "epoch": 15.695110258868649, "grad_norm": 0.3302120566368103, "learning_rate": 5.151392424836782e-06, "loss": 0.0292, "step": 16370 }, { "epoch": 15.70469798657718, "grad_norm": 0.2139192521572113, "learning_rate": 5.112987869753216e-06, "loss": 0.0296, "step": 16380 }, { "epoch": 15.714285714285714, "grad_norm": 0.16015082597732544, "learning_rate": 5.074719291815522e-06, "loss": 0.029, "step": 16390 }, { "epoch": 15.723873441994247, "grad_norm": 0.19606702029705048, "learning_rate": 5.036586806951465e-06, "loss": 0.029, "step": 16400 }, { "epoch": 15.73346116970278, "grad_norm": 0.30746451020240784, "learning_rate": 4.998590530676584e-06, "loss": 0.0285, "step": 16410 }, { "epoch": 15.743048897411313, "grad_norm": 0.16113652288913727, "learning_rate": 4.960730578093753e-06, "loss": 0.028, "step": 16420 }, { "epoch": 15.752636625119846, "grad_norm": 0.23624086380004883, "learning_rate": 4.923007063892926e-06, "loss": 0.0251, "step": 16430 }, { "epoch": 15.76222435282838, "grad_norm": 0.19934307038784027, "learning_rate": 4.885420102350696e-06, "loss": 0.0238, "step": 16440 }, { "epoch": 15.771812080536913, "grad_norm": 0.2440912276506424, "learning_rate": 4.847969807330038e-06, "loss": 0.0231, "step": 16450 }, { "epoch": 15.781399808245446, "grad_norm": 0.2768200933933258, "learning_rate": 4.810656292279875e-06, "loss": 0.0268, "step": 16460 }, { "epoch": 15.79098753595398, "grad_norm": 0.29489603638648987, "learning_rate": 4.773479670234821e-06, "loss": 0.0358, "step": 16470 }, { "epoch": 15.800575263662513, "grad_norm": 0.26058635115623474, "learning_rate": 4.7364400538147665e-06, "loss": 0.0272, "step": 16480 }, { "epoch": 15.810162991371046, "grad_norm": 0.19268332421779633, "learning_rate": 4.699537555224598e-06, "loss": 0.028, "step": 16490 }, { "epoch": 15.819750719079579, "grad_norm": 0.27744096517562866, "learning_rate": 4.6627722862537915e-06, "loss": 0.0278, "step": 16500 }, { "epoch": 15.82933844678811, "grad_norm": 0.3575479984283447, "learning_rate": 4.626144358276147e-06, "loss": 0.0275, "step": 16510 }, { "epoch": 15.838926174496644, "grad_norm": 0.20007503032684326, "learning_rate": 4.589653882249378e-06, "loss": 0.0309, "step": 16520 }, { "epoch": 15.848513902205177, "grad_norm": 0.20804741978645325, "learning_rate": 4.553300968714841e-06, "loss": 0.0249, "step": 16530 }, { "epoch": 15.85810162991371, "grad_norm": 0.2726737856864929, "learning_rate": 4.5170857277971765e-06, "loss": 0.0259, "step": 16540 }, { "epoch": 15.867689357622243, "grad_norm": 0.21122261881828308, "learning_rate": 4.48100826920394e-06, "loss": 0.029, "step": 16550 }, { "epoch": 15.877277085330777, "grad_norm": 0.28613051772117615, "learning_rate": 4.4450687022253135e-06, "loss": 0.0255, "step": 16560 }, { "epoch": 15.88686481303931, "grad_norm": 0.2184969037771225, "learning_rate": 4.409267135733764e-06, "loss": 0.0233, "step": 16570 }, { "epoch": 15.896452540747843, "grad_norm": 0.19320517778396606, "learning_rate": 4.37360367818373e-06, "loss": 0.0271, "step": 16580 }, { "epoch": 15.906040268456376, "grad_norm": 0.18892447650432587, "learning_rate": 4.338078437611237e-06, "loss": 0.0265, "step": 16590 }, { "epoch": 15.91562799616491, "grad_norm": 0.23824314773082733, "learning_rate": 4.3026915216336225e-06, "loss": 0.0269, "step": 16600 }, { "epoch": 15.925215723873443, "grad_norm": 0.1431523561477661, "learning_rate": 4.267443037449198e-06, "loss": 0.0269, "step": 16610 }, { "epoch": 15.934803451581976, "grad_norm": 0.22107666730880737, "learning_rate": 4.232333091836932e-06, "loss": 0.0293, "step": 16620 }, { "epoch": 15.944391179290509, "grad_norm": 0.27542436122894287, "learning_rate": 4.197361791156096e-06, "loss": 0.03, "step": 16630 }, { "epoch": 15.95397890699904, "grad_norm": 0.234486922621727, "learning_rate": 4.162529241345958e-06, "loss": 0.0325, "step": 16640 }, { "epoch": 15.963566634707574, "grad_norm": 0.24536362290382385, "learning_rate": 4.127835547925479e-06, "loss": 0.0211, "step": 16650 }, { "epoch": 15.973154362416107, "grad_norm": 0.2566201686859131, "learning_rate": 4.093280815992989e-06, "loss": 0.0244, "step": 16660 }, { "epoch": 15.98274209012464, "grad_norm": 0.3387947380542755, "learning_rate": 4.058865150225833e-06, "loss": 0.0279, "step": 16670 }, { "epoch": 15.992329817833173, "grad_norm": 0.5632581114768982, "learning_rate": 4.024588654880079e-06, "loss": 0.0298, "step": 16680 }, { "epoch": 16.001917545541705, "grad_norm": 0.2585551142692566, "learning_rate": 3.990451433790254e-06, "loss": 0.0313, "step": 16690 }, { "epoch": 16.01150527325024, "grad_norm": 0.2654295563697815, "learning_rate": 3.956453590368914e-06, "loss": 0.0258, "step": 16700 }, { "epoch": 16.02109300095877, "grad_norm": 0.243434339761734, "learning_rate": 3.922595227606435e-06, "loss": 0.0263, "step": 16710 }, { "epoch": 16.030680728667306, "grad_norm": 0.23672133684158325, "learning_rate": 3.8888764480706276e-06, "loss": 0.029, "step": 16720 }, { "epoch": 16.040268456375838, "grad_norm": 0.28110471367836, "learning_rate": 3.855297353906512e-06, "loss": 0.0313, "step": 16730 }, { "epoch": 16.049856184084373, "grad_norm": 0.17387288808822632, "learning_rate": 3.821858046835913e-06, "loss": 0.0263, "step": 16740 }, { "epoch": 16.059443911792904, "grad_norm": 0.16623635590076447, "learning_rate": 3.7885586281572016e-06, "loss": 0.0234, "step": 16750 }, { "epoch": 16.06903163950144, "grad_norm": 0.20889221131801605, "learning_rate": 3.7553991987449912e-06, "loss": 0.0198, "step": 16760 }, { "epoch": 16.07861936720997, "grad_norm": 0.2764891982078552, "learning_rate": 3.7223798590498403e-06, "loss": 0.0306, "step": 16770 }, { "epoch": 16.088207094918506, "grad_norm": 0.17139260470867157, "learning_rate": 3.689500709097893e-06, "loss": 0.0204, "step": 16780 }, { "epoch": 16.097794822627037, "grad_norm": 0.25818943977355957, "learning_rate": 3.6567618484906307e-06, "loss": 0.0243, "step": 16790 }, { "epoch": 16.107382550335572, "grad_norm": 0.33521944284439087, "learning_rate": 3.6241633764045545e-06, "loss": 0.0289, "step": 16800 }, { "epoch": 16.116970278044104, "grad_norm": 0.23774349689483643, "learning_rate": 3.591705391590905e-06, "loss": 0.0284, "step": 16810 }, { "epoch": 16.126558005752635, "grad_norm": 0.17396867275238037, "learning_rate": 3.5593879923753015e-06, "loss": 0.0292, "step": 16820 }, { "epoch": 16.13614573346117, "grad_norm": 0.32836684584617615, "learning_rate": 3.5272112766574993e-06, "loss": 0.0261, "step": 16830 }, { "epoch": 16.1457334611697, "grad_norm": 0.2727390229701996, "learning_rate": 3.4951753419110943e-06, "loss": 0.0294, "step": 16840 }, { "epoch": 16.155321188878236, "grad_norm": 0.36386972665786743, "learning_rate": 3.4632802851832013e-06, "loss": 0.0256, "step": 16850 }, { "epoch": 16.164908916586768, "grad_norm": 0.20322419703006744, "learning_rate": 3.431526203094171e-06, "loss": 0.0242, "step": 16860 }, { "epoch": 16.174496644295303, "grad_norm": 0.23579928278923035, "learning_rate": 3.3999131918372785e-06, "loss": 0.03, "step": 16870 }, { "epoch": 16.184084372003834, "grad_norm": 0.20980890095233917, "learning_rate": 3.3684413471784804e-06, "loss": 0.0281, "step": 16880 }, { "epoch": 16.19367209971237, "grad_norm": 0.17388616502285004, "learning_rate": 3.3371107644560805e-06, "loss": 0.0312, "step": 16890 }, { "epoch": 16.2032598274209, "grad_norm": 0.43162086606025696, "learning_rate": 3.3059215385804585e-06, "loss": 0.0281, "step": 16900 }, { "epoch": 16.212847555129436, "grad_norm": 0.21873044967651367, "learning_rate": 3.274873764033759e-06, "loss": 0.0255, "step": 16910 }, { "epoch": 16.222435282837967, "grad_norm": 0.2102050930261612, "learning_rate": 3.243967534869652e-06, "loss": 0.0272, "step": 16920 }, { "epoch": 16.232023010546502, "grad_norm": 0.21298690140247345, "learning_rate": 3.213202944713023e-06, "loss": 0.0261, "step": 16930 }, { "epoch": 16.241610738255034, "grad_norm": 0.30388498306274414, "learning_rate": 3.1825800867596566e-06, "loss": 0.0338, "step": 16940 }, { "epoch": 16.251198465963565, "grad_norm": 0.2536049485206604, "learning_rate": 3.152099053776014e-06, "loss": 0.0292, "step": 16950 }, { "epoch": 16.2607861936721, "grad_norm": 0.2809562385082245, "learning_rate": 3.121759938098906e-06, "loss": 0.0262, "step": 16960 }, { "epoch": 16.27037392138063, "grad_norm": 0.2241629660129547, "learning_rate": 3.091562831635253e-06, "loss": 0.0288, "step": 16970 }, { "epoch": 16.279961649089167, "grad_norm": 0.1237056627869606, "learning_rate": 3.061507825861748e-06, "loss": 0.0209, "step": 16980 }, { "epoch": 16.289549376797698, "grad_norm": 0.13440051674842834, "learning_rate": 3.031595011824656e-06, "loss": 0.0273, "step": 16990 }, { "epoch": 16.299137104506233, "grad_norm": 0.28445371985435486, "learning_rate": 3.0018244801394535e-06, "loss": 0.034, "step": 17000 }, { "epoch": 16.308724832214764, "grad_norm": 0.3177470862865448, "learning_rate": 2.9721963209906502e-06, "loss": 0.0301, "step": 17010 }, { "epoch": 16.3183125599233, "grad_norm": 0.1341092437505722, "learning_rate": 2.942710624131412e-06, "loss": 0.0266, "step": 17020 }, { "epoch": 16.32790028763183, "grad_norm": 0.19116052985191345, "learning_rate": 2.9133674788833833e-06, "loss": 0.0311, "step": 17030 }, { "epoch": 16.337488015340366, "grad_norm": 0.1874174177646637, "learning_rate": 2.884166974136343e-06, "loss": 0.0236, "step": 17040 }, { "epoch": 16.347075743048897, "grad_norm": 0.36720889806747437, "learning_rate": 2.855109198347983e-06, "loss": 0.0278, "step": 17050 }, { "epoch": 16.35666347075743, "grad_norm": 0.38599368929862976, "learning_rate": 2.826194239543617e-06, "loss": 0.0323, "step": 17060 }, { "epoch": 16.366251198465964, "grad_norm": 0.19532305002212524, "learning_rate": 2.797422185315929e-06, "loss": 0.0222, "step": 17070 }, { "epoch": 16.375838926174495, "grad_norm": 0.2218206375837326, "learning_rate": 2.768793122824681e-06, "loss": 0.0255, "step": 17080 }, { "epoch": 16.38542665388303, "grad_norm": 0.3124590516090393, "learning_rate": 2.740307138796483e-06, "loss": 0.0249, "step": 17090 }, { "epoch": 16.39501438159156, "grad_norm": 0.21726781129837036, "learning_rate": 2.7119643195245238e-06, "loss": 0.0218, "step": 17100 }, { "epoch": 16.404602109300097, "grad_norm": 0.5927583575248718, "learning_rate": 2.683764750868273e-06, "loss": 0.0263, "step": 17110 }, { "epoch": 16.414189837008628, "grad_norm": 0.28960007429122925, "learning_rate": 2.6557085182532582e-06, "loss": 0.0291, "step": 17120 }, { "epoch": 16.423777564717163, "grad_norm": 0.35697048902511597, "learning_rate": 2.6277957066708047e-06, "loss": 0.0273, "step": 17130 }, { "epoch": 16.433365292425695, "grad_norm": 0.2136591225862503, "learning_rate": 2.6000264006777743e-06, "loss": 0.0325, "step": 17140 }, { "epoch": 16.44295302013423, "grad_norm": 0.3051040768623352, "learning_rate": 2.5724006843962866e-06, "loss": 0.0298, "step": 17150 }, { "epoch": 16.45254074784276, "grad_norm": 0.1534937173128128, "learning_rate": 2.5449186415134885e-06, "loss": 0.0263, "step": 17160 }, { "epoch": 16.462128475551296, "grad_norm": 0.17988426983356476, "learning_rate": 2.5175803552812906e-06, "loss": 0.0278, "step": 17170 }, { "epoch": 16.471716203259827, "grad_norm": 0.48748767375946045, "learning_rate": 2.490385908516141e-06, "loss": 0.0308, "step": 17180 }, { "epoch": 16.48130393096836, "grad_norm": 0.191914901137352, "learning_rate": 2.463335383598725e-06, "loss": 0.0303, "step": 17190 }, { "epoch": 16.490891658676894, "grad_norm": 0.21671634912490845, "learning_rate": 2.4364288624737442e-06, "loss": 0.0276, "step": 17200 }, { "epoch": 16.500479386385425, "grad_norm": 0.13923166692256927, "learning_rate": 2.4096664266496814e-06, "loss": 0.0331, "step": 17210 }, { "epoch": 16.51006711409396, "grad_norm": 0.20780488848686218, "learning_rate": 2.3830481571985365e-06, "loss": 0.0243, "step": 17220 }, { "epoch": 16.51965484180249, "grad_norm": 0.39643654227256775, "learning_rate": 2.3565741347555792e-06, "loss": 0.0289, "step": 17230 }, { "epoch": 16.529242569511027, "grad_norm": 0.18083330988883972, "learning_rate": 2.3302444395190915e-06, "loss": 0.0216, "step": 17240 }, { "epoch": 16.538830297219558, "grad_norm": 0.1432444006204605, "learning_rate": 2.3040591512501765e-06, "loss": 0.0318, "step": 17250 }, { "epoch": 16.548418024928093, "grad_norm": 0.2874661386013031, "learning_rate": 2.278018349272465e-06, "loss": 0.0279, "step": 17260 }, { "epoch": 16.558005752636625, "grad_norm": 0.2093266099691391, "learning_rate": 2.2521221124718826e-06, "loss": 0.0226, "step": 17270 }, { "epoch": 16.56759348034516, "grad_norm": 0.3234308063983917, "learning_rate": 2.2263705192964334e-06, "loss": 0.0295, "step": 17280 }, { "epoch": 16.57718120805369, "grad_norm": 0.6225463151931763, "learning_rate": 2.2007636477559436e-06, "loss": 0.031, "step": 17290 }, { "epoch": 16.586768935762223, "grad_norm": 0.31777986884117126, "learning_rate": 2.1753015754218453e-06, "loss": 0.0311, "step": 17300 }, { "epoch": 16.596356663470758, "grad_norm": 0.2332683950662613, "learning_rate": 2.149984379426906e-06, "loss": 0.0263, "step": 17310 }, { "epoch": 16.60594439117929, "grad_norm": 0.23592767119407654, "learning_rate": 2.1248121364650265e-06, "loss": 0.0229, "step": 17320 }, { "epoch": 16.615532118887824, "grad_norm": 0.4014437198638916, "learning_rate": 2.0997849227909983e-06, "loss": 0.026, "step": 17330 }, { "epoch": 16.625119846596355, "grad_norm": 0.18571177124977112, "learning_rate": 2.0749028142202807e-06, "loss": 0.0281, "step": 17340 }, { "epoch": 16.63470757430489, "grad_norm": 0.2480279952287674, "learning_rate": 2.050165886128741e-06, "loss": 0.0283, "step": 17350 }, { "epoch": 16.644295302013422, "grad_norm": 0.20139874517917633, "learning_rate": 2.0255742134524804e-06, "loss": 0.0263, "step": 17360 }, { "epoch": 16.653883029721957, "grad_norm": 0.18241684138774872, "learning_rate": 2.001127870687541e-06, "loss": 0.0206, "step": 17370 }, { "epoch": 16.66347075743049, "grad_norm": 0.26072490215301514, "learning_rate": 1.9768269318897414e-06, "loss": 0.0251, "step": 17380 }, { "epoch": 16.673058485139023, "grad_norm": 0.33512383699417114, "learning_rate": 1.9526714706744055e-06, "loss": 0.0282, "step": 17390 }, { "epoch": 16.682646212847555, "grad_norm": 0.279745876789093, "learning_rate": 1.928661560216172e-06, "loss": 0.0233, "step": 17400 }, { "epoch": 16.69223394055609, "grad_norm": 0.2306470274925232, "learning_rate": 1.904797273248754e-06, "loss": 0.0272, "step": 17410 }, { "epoch": 16.70182166826462, "grad_norm": 0.14322997629642487, "learning_rate": 1.8810786820647242e-06, "loss": 0.0272, "step": 17420 }, { "epoch": 16.711409395973153, "grad_norm": 0.25938233733177185, "learning_rate": 1.8575058585152905e-06, "loss": 0.0308, "step": 17430 }, { "epoch": 16.720997123681688, "grad_norm": 0.23380053043365479, "learning_rate": 1.8340788740101034e-06, "loss": 0.028, "step": 17440 }, { "epoch": 16.73058485139022, "grad_norm": 0.27241095900535583, "learning_rate": 1.810797799517e-06, "loss": 0.0293, "step": 17450 }, { "epoch": 16.740172579098754, "grad_norm": 0.24621997773647308, "learning_rate": 1.7876627055618155e-06, "loss": 0.0258, "step": 17460 }, { "epoch": 16.749760306807286, "grad_norm": 0.15812641382217407, "learning_rate": 1.7646736622281667e-06, "loss": 0.0259, "step": 17470 }, { "epoch": 16.75934803451582, "grad_norm": 0.18936626613140106, "learning_rate": 1.7418307391572354e-06, "loss": 0.026, "step": 17480 }, { "epoch": 16.768935762224352, "grad_norm": 0.16878223419189453, "learning_rate": 1.7191340055475513e-06, "loss": 0.0281, "step": 17490 }, { "epoch": 16.778523489932887, "grad_norm": 0.18892349302768707, "learning_rate": 1.696583530154794e-06, "loss": 0.0259, "step": 17500 }, { "epoch": 16.78811121764142, "grad_norm": 0.243266299366951, "learning_rate": 1.6741793812915907e-06, "loss": 0.0248, "step": 17510 }, { "epoch": 16.797698945349953, "grad_norm": 0.20740211009979248, "learning_rate": 1.6519216268272796e-06, "loss": 0.0264, "step": 17520 }, { "epoch": 16.807286673058485, "grad_norm": 0.16220887005329132, "learning_rate": 1.6298103341877369e-06, "loss": 0.0226, "step": 17530 }, { "epoch": 16.81687440076702, "grad_norm": 0.3126187026500702, "learning_rate": 1.6078455703551486e-06, "loss": 0.0326, "step": 17540 }, { "epoch": 16.82646212847555, "grad_norm": 0.1612725555896759, "learning_rate": 1.5860274018678345e-06, "loss": 0.0327, "step": 17550 }, { "epoch": 16.836049856184083, "grad_norm": 0.20316867530345917, "learning_rate": 1.5643558948200131e-06, "loss": 0.0252, "step": 17560 }, { "epoch": 16.845637583892618, "grad_norm": 0.20207004249095917, "learning_rate": 1.5428311148616204e-06, "loss": 0.0298, "step": 17570 }, { "epoch": 16.85522531160115, "grad_norm": 0.2780834436416626, "learning_rate": 1.5214531271981192e-06, "loss": 0.026, "step": 17580 }, { "epoch": 16.864813039309684, "grad_norm": 0.3551330268383026, "learning_rate": 1.5002219965902896e-06, "loss": 0.0255, "step": 17590 }, { "epoch": 16.874400767018216, "grad_norm": 0.23651057481765747, "learning_rate": 1.4791377873540235e-06, "loss": 0.0274, "step": 17600 }, { "epoch": 16.88398849472675, "grad_norm": 0.19430945813655853, "learning_rate": 1.4582005633601515e-06, "loss": 0.0232, "step": 17610 }, { "epoch": 16.893576222435282, "grad_norm": 0.21821914613246918, "learning_rate": 1.437410388034227e-06, "loss": 0.0278, "step": 17620 }, { "epoch": 16.903163950143817, "grad_norm": 0.23415020108222961, "learning_rate": 1.4167673243563717e-06, "loss": 0.0331, "step": 17630 }, { "epoch": 16.91275167785235, "grad_norm": 0.207551971077919, "learning_rate": 1.3962714348610295e-06, "loss": 0.0305, "step": 17640 }, { "epoch": 16.922339405560884, "grad_norm": 0.28280988335609436, "learning_rate": 1.3759227816368182e-06, "loss": 0.0297, "step": 17650 }, { "epoch": 16.931927133269415, "grad_norm": 0.24366876482963562, "learning_rate": 1.3557214263263286e-06, "loss": 0.0247, "step": 17660 }, { "epoch": 16.941514860977946, "grad_norm": 0.20423495769500732, "learning_rate": 1.3356674301259532e-06, "loss": 0.0263, "step": 17670 }, { "epoch": 16.95110258868648, "grad_norm": 0.19706788659095764, "learning_rate": 1.3157608537856582e-06, "loss": 0.0297, "step": 17680 }, { "epoch": 16.960690316395013, "grad_norm": 0.2174736112356186, "learning_rate": 1.2960017576088446e-06, "loss": 0.0278, "step": 17690 }, { "epoch": 16.970278044103548, "grad_norm": 0.2222086638212204, "learning_rate": 1.2763902014521656e-06, "loss": 0.0276, "step": 17700 }, { "epoch": 16.97986577181208, "grad_norm": 0.20257794857025146, "learning_rate": 1.2569262447252928e-06, "loss": 0.034, "step": 17710 }, { "epoch": 16.989453499520614, "grad_norm": 0.2699783146381378, "learning_rate": 1.2376099463907887e-06, "loss": 0.0226, "step": 17720 }, { "epoch": 16.999041227229146, "grad_norm": 0.19566196203231812, "learning_rate": 1.2184413649639182e-06, "loss": 0.028, "step": 17730 }, { "epoch": 17.00862895493768, "grad_norm": 0.23381511867046356, "learning_rate": 1.1994205585124652e-06, "loss": 0.029, "step": 17740 }, { "epoch": 17.018216682646212, "grad_norm": 0.19119040668010712, "learning_rate": 1.180547584656533e-06, "loss": 0.0239, "step": 17750 }, { "epoch": 17.027804410354747, "grad_norm": 0.23085108399391174, "learning_rate": 1.1618225005684158e-06, "loss": 0.0275, "step": 17760 }, { "epoch": 17.03739213806328, "grad_norm": 0.21077860891819, "learning_rate": 1.1432453629723893e-06, "loss": 0.0309, "step": 17770 }, { "epoch": 17.046979865771814, "grad_norm": 0.18925194442272186, "learning_rate": 1.124816228144565e-06, "loss": 0.0271, "step": 17780 }, { "epoch": 17.056567593480345, "grad_norm": 0.22407986223697662, "learning_rate": 1.106535151912702e-06, "loss": 0.0273, "step": 17790 }, { "epoch": 17.066155321188877, "grad_norm": 0.21448639035224915, "learning_rate": 1.0884021896560237e-06, "loss": 0.0258, "step": 17800 }, { "epoch": 17.07574304889741, "grad_norm": 0.24161478877067566, "learning_rate": 1.0704173963050957e-06, "loss": 0.0289, "step": 17810 }, { "epoch": 17.085330776605943, "grad_norm": 0.1643606573343277, "learning_rate": 1.0525808263416205e-06, "loss": 0.0258, "step": 17820 }, { "epoch": 17.094918504314478, "grad_norm": 0.2575829327106476, "learning_rate": 1.0348925337982817e-06, "loss": 0.0274, "step": 17830 }, { "epoch": 17.10450623202301, "grad_norm": 0.1602732241153717, "learning_rate": 1.0173525722585897e-06, "loss": 0.0358, "step": 17840 }, { "epoch": 17.114093959731544, "grad_norm": 0.23271816968917847, "learning_rate": 9.999609948567024e-07, "loss": 0.0373, "step": 17850 }, { "epoch": 17.123681687440076, "grad_norm": 0.18822619318962097, "learning_rate": 9.82717854277293e-07, "loss": 0.0278, "step": 17860 }, { "epoch": 17.13326941514861, "grad_norm": 0.37295079231262207, "learning_rate": 9.656232027553558e-07, "loss": 0.0245, "step": 17870 }, { "epoch": 17.142857142857142, "grad_norm": 0.207114115357399, "learning_rate": 9.486770920760668e-07, "loss": 0.0237, "step": 17880 }, { "epoch": 17.152444870565677, "grad_norm": 0.2382437288761139, "learning_rate": 9.318795735746233e-07, "loss": 0.0262, "step": 17890 }, { "epoch": 17.16203259827421, "grad_norm": 0.3437121510505676, "learning_rate": 9.152306981360992e-07, "loss": 0.0274, "step": 17900 }, { "epoch": 17.171620325982744, "grad_norm": 0.1845656931400299, "learning_rate": 8.987305161952731e-07, "loss": 0.0251, "step": 17910 }, { "epoch": 17.181208053691275, "grad_norm": 0.2611910402774811, "learning_rate": 8.823790777364837e-07, "loss": 0.0263, "step": 17920 }, { "epoch": 17.190795781399807, "grad_norm": 0.3325332701206207, "learning_rate": 8.661764322934695e-07, "loss": 0.0314, "step": 17930 }, { "epoch": 17.20038350910834, "grad_norm": 0.38311854004859924, "learning_rate": 8.50122628949257e-07, "loss": 0.0279, "step": 17940 }, { "epoch": 17.209971236816873, "grad_norm": 0.1343742161989212, "learning_rate": 8.342177163359389e-07, "loss": 0.028, "step": 17950 }, { "epoch": 17.219558964525408, "grad_norm": 0.19379399716854095, "learning_rate": 8.184617426346131e-07, "loss": 0.0301, "step": 17960 }, { "epoch": 17.22914669223394, "grad_norm": 0.16689153015613556, "learning_rate": 8.028547555751553e-07, "loss": 0.029, "step": 17970 }, { "epoch": 17.238734419942475, "grad_norm": 0.45647260546684265, "learning_rate": 7.873968024361467e-07, "loss": 0.0307, "step": 17980 }, { "epoch": 17.248322147651006, "grad_norm": 0.19029688835144043, "learning_rate": 7.720879300446682e-07, "loss": 0.0269, "step": 17990 }, { "epoch": 17.25790987535954, "grad_norm": 0.26700901985168457, "learning_rate": 7.569281847762122e-07, "loss": 0.026, "step": 18000 }, { "epoch": 17.267497603068072, "grad_norm": 0.20858362317085266, "learning_rate": 7.419176125544991e-07, "loss": 0.0304, "step": 18010 }, { "epoch": 17.277085330776607, "grad_norm": 0.23115743696689606, "learning_rate": 7.270562588513663e-07, "loss": 0.0389, "step": 18020 }, { "epoch": 17.28667305848514, "grad_norm": 0.17492881417274475, "learning_rate": 7.123441686866183e-07, "loss": 0.0293, "step": 18030 }, { "epoch": 17.29626078619367, "grad_norm": 0.12759244441986084, "learning_rate": 6.977813866278826e-07, "loss": 0.0239, "step": 18040 }, { "epoch": 17.305848513902205, "grad_norm": 0.18989066779613495, "learning_rate": 6.833679567905038e-07, "loss": 0.0292, "step": 18050 }, { "epoch": 17.315436241610737, "grad_norm": 0.5339308977127075, "learning_rate": 6.691039228373774e-07, "loss": 0.0337, "step": 18060 }, { "epoch": 17.325023969319272, "grad_norm": 0.18861901760101318, "learning_rate": 6.549893279788277e-07, "loss": 0.0288, "step": 18070 }, { "epoch": 17.334611697027803, "grad_norm": 0.18615840375423431, "learning_rate": 6.410242149724966e-07, "loss": 0.0246, "step": 18080 }, { "epoch": 17.34419942473634, "grad_norm": 0.1773938536643982, "learning_rate": 6.272086261231769e-07, "loss": 0.0272, "step": 18090 }, { "epoch": 17.35378715244487, "grad_norm": 0.2144092619419098, "learning_rate": 6.135426032827185e-07, "loss": 0.0299, "step": 18100 }, { "epoch": 17.363374880153405, "grad_norm": 0.18490025401115417, "learning_rate": 6.000261878498947e-07, "loss": 0.0297, "step": 18110 }, { "epoch": 17.372962607861936, "grad_norm": 0.18837903439998627, "learning_rate": 5.86659420770247e-07, "loss": 0.0272, "step": 18120 }, { "epoch": 17.38255033557047, "grad_norm": 0.2982289791107178, "learning_rate": 5.734423425359958e-07, "loss": 0.0314, "step": 18130 }, { "epoch": 17.392138063279003, "grad_norm": 0.2356351912021637, "learning_rate": 5.603749931859137e-07, "loss": 0.0258, "step": 18140 }, { "epoch": 17.401725790987538, "grad_norm": 0.13853472471237183, "learning_rate": 5.474574123051912e-07, "loss": 0.0289, "step": 18150 }, { "epoch": 17.41131351869607, "grad_norm": 0.2044096440076828, "learning_rate": 5.346896390253153e-07, "loss": 0.0244, "step": 18160 }, { "epoch": 17.4209012464046, "grad_norm": 0.33529403805732727, "learning_rate": 5.220717120239693e-07, "loss": 0.0282, "step": 18170 }, { "epoch": 17.430488974113135, "grad_norm": 0.2302224040031433, "learning_rate": 5.096036695248885e-07, "loss": 0.0299, "step": 18180 }, { "epoch": 17.440076701821667, "grad_norm": 0.22276417911052704, "learning_rate": 4.972855492977823e-07, "loss": 0.0294, "step": 18190 }, { "epoch": 17.449664429530202, "grad_norm": 0.5279762744903564, "learning_rate": 4.851173886581794e-07, "loss": 0.0286, "step": 18200 }, { "epoch": 17.459252157238733, "grad_norm": 0.22499582171440125, "learning_rate": 4.7309922446732715e-07, "loss": 0.0239, "step": 18210 }, { "epoch": 17.46883988494727, "grad_norm": 0.2594180703163147, "learning_rate": 4.61231093132114e-07, "loss": 0.0275, "step": 18220 }, { "epoch": 17.4784276126558, "grad_norm": 0.1713213175535202, "learning_rate": 4.495130306049034e-07, "loss": 0.0243, "step": 18230 }, { "epoch": 17.488015340364335, "grad_norm": 0.3286925256252289, "learning_rate": 4.3794507238347214e-07, "loss": 0.0316, "step": 18240 }, { "epoch": 17.497603068072866, "grad_norm": 0.23200523853302002, "learning_rate": 4.2652725351085556e-07, "loss": 0.0265, "step": 18250 }, { "epoch": 17.5071907957814, "grad_norm": 0.22095492482185364, "learning_rate": 4.1525960857530243e-07, "loss": 0.024, "step": 18260 }, { "epoch": 17.516778523489933, "grad_norm": 0.17762340605258942, "learning_rate": 4.041421717101146e-07, "loss": 0.0268, "step": 18270 }, { "epoch": 17.526366251198468, "grad_norm": 0.2298087775707245, "learning_rate": 3.931749765935744e-07, "loss": 0.0257, "step": 18280 }, { "epoch": 17.535953978907, "grad_norm": 0.21401867270469666, "learning_rate": 3.8235805644882273e-07, "loss": 0.0245, "step": 18290 }, { "epoch": 17.54554170661553, "grad_norm": 0.5458080172538757, "learning_rate": 3.716914440437813e-07, "loss": 0.033, "step": 18300 }, { "epoch": 17.555129434324066, "grad_norm": 0.17889949679374695, "learning_rate": 3.611751716910472e-07, "loss": 0.0303, "step": 18310 }, { "epoch": 17.564717162032597, "grad_norm": 0.0861106589436531, "learning_rate": 3.508092712477651e-07, "loss": 0.025, "step": 18320 }, { "epoch": 17.574304889741132, "grad_norm": 0.396636962890625, "learning_rate": 3.405937741155829e-07, "loss": 0.03, "step": 18330 }, { "epoch": 17.583892617449663, "grad_norm": 0.3980105221271515, "learning_rate": 3.30528711240502e-07, "loss": 0.0217, "step": 18340 }, { "epoch": 17.5934803451582, "grad_norm": 0.2600933313369751, "learning_rate": 3.206141131128326e-07, "loss": 0.0278, "step": 18350 }, { "epoch": 17.60306807286673, "grad_norm": 0.20506466925144196, "learning_rate": 3.108500097670719e-07, "loss": 0.0216, "step": 18360 }, { "epoch": 17.612655800575265, "grad_norm": 0.31107306480407715, "learning_rate": 3.0123643078180943e-07, "loss": 0.0296, "step": 18370 }, { "epoch": 17.622243528283796, "grad_norm": 0.2587839663028717, "learning_rate": 2.9177340527966613e-07, "loss": 0.0265, "step": 18380 }, { "epoch": 17.63183125599233, "grad_norm": 0.293157160282135, "learning_rate": 2.824609619271723e-07, "loss": 0.0239, "step": 18390 }, { "epoch": 17.641418983700863, "grad_norm": 0.22268742322921753, "learning_rate": 2.732991289347064e-07, "loss": 0.0283, "step": 18400 }, { "epoch": 17.651006711409394, "grad_norm": 0.21071119606494904, "learning_rate": 2.6428793405640087e-07, "loss": 0.0241, "step": 18410 }, { "epoch": 17.66059443911793, "grad_norm": 0.25878384709358215, "learning_rate": 2.554274045900418e-07, "loss": 0.0224, "step": 18420 }, { "epoch": 17.67018216682646, "grad_norm": 0.2513992488384247, "learning_rate": 2.46717567377025e-07, "loss": 0.0271, "step": 18430 }, { "epoch": 17.679769894534996, "grad_norm": 0.1096489429473877, "learning_rate": 2.381584488022337e-07, "loss": 0.0233, "step": 18440 }, { "epoch": 17.689357622243527, "grad_norm": 0.24723054468631744, "learning_rate": 2.2975007479397738e-07, "loss": 0.0254, "step": 18450 }, { "epoch": 17.698945349952062, "grad_norm": 0.22072063386440277, "learning_rate": 2.2149247082392522e-07, "loss": 0.0273, "step": 18460 }, { "epoch": 17.708533077660594, "grad_norm": 0.2557280957698822, "learning_rate": 2.1338566190699517e-07, "loss": 0.0322, "step": 18470 }, { "epoch": 17.71812080536913, "grad_norm": 0.3068563938140869, "learning_rate": 2.0542967260131497e-07, "loss": 0.0211, "step": 18480 }, { "epoch": 17.72770853307766, "grad_norm": 0.18864025175571442, "learning_rate": 1.976245270081334e-07, "loss": 0.028, "step": 18490 }, { "epoch": 17.737296260786195, "grad_norm": 0.20000196993350983, "learning_rate": 1.899702487717203e-07, "loss": 0.0239, "step": 18500 }, { "epoch": 17.746883988494726, "grad_norm": 0.5573348999023438, "learning_rate": 1.8246686107935562e-07, "loss": 0.03, "step": 18510 }, { "epoch": 17.75647171620326, "grad_norm": 0.09101556986570358, "learning_rate": 1.7511438666119594e-07, "loss": 0.0336, "step": 18520 }, { "epoch": 17.766059443911793, "grad_norm": 0.2559066712856293, "learning_rate": 1.6791284779024696e-07, "loss": 0.0285, "step": 18530 }, { "epoch": 17.775647171620324, "grad_norm": 0.23298071324825287, "learning_rate": 1.6086226628226898e-07, "loss": 0.0319, "step": 18540 }, { "epoch": 17.78523489932886, "grad_norm": 0.1978902518749237, "learning_rate": 1.5396266349574362e-07, "loss": 0.0269, "step": 18550 }, { "epoch": 17.79482262703739, "grad_norm": 0.5722432732582092, "learning_rate": 1.4721406033177954e-07, "loss": 0.0291, "step": 18560 }, { "epoch": 17.804410354745926, "grad_norm": 0.29033163189888, "learning_rate": 1.4061647723405125e-07, "loss": 0.0288, "step": 18570 }, { "epoch": 17.813998082454457, "grad_norm": 0.19131603837013245, "learning_rate": 1.3416993418874924e-07, "loss": 0.0247, "step": 18580 }, { "epoch": 17.823585810162992, "grad_norm": 0.25687092542648315, "learning_rate": 1.2787445072452998e-07, "loss": 0.0267, "step": 18590 }, { "epoch": 17.833173537871524, "grad_norm": 0.16243956983089447, "learning_rate": 1.217300459124271e-07, "loss": 0.0273, "step": 18600 }, { "epoch": 17.84276126558006, "grad_norm": 0.17303957045078278, "learning_rate": 1.1573673836580701e-07, "loss": 0.0353, "step": 18610 }, { "epoch": 17.85234899328859, "grad_norm": 0.4954906702041626, "learning_rate": 1.0989454624032448e-07, "loss": 0.0239, "step": 18620 }, { "epoch": 17.861936720997125, "grad_norm": 0.500385582447052, "learning_rate": 1.0420348723385043e-07, "loss": 0.0279, "step": 18630 }, { "epoch": 17.871524448705657, "grad_norm": 0.28065744042396545, "learning_rate": 9.866357858642205e-08, "loss": 0.024, "step": 18640 }, { "epoch": 17.88111217641419, "grad_norm": 0.22515705227851868, "learning_rate": 9.32748370802039e-08, "loss": 0.0273, "step": 18650 }, { "epoch": 17.890699904122723, "grad_norm": 0.4083874225616455, "learning_rate": 8.803727903942127e-08, "loss": 0.0269, "step": 18660 }, { "epoch": 17.900287631831254, "grad_norm": 0.3455846309661865, "learning_rate": 8.295092033031027e-08, "loss": 0.0277, "step": 18670 }, { "epoch": 17.90987535953979, "grad_norm": 0.15052051842212677, "learning_rate": 7.801577636108448e-08, "loss": 0.0358, "step": 18680 }, { "epoch": 17.91946308724832, "grad_norm": 0.21173402667045593, "learning_rate": 7.323186208188504e-08, "loss": 0.0256, "step": 18690 }, { "epoch": 17.929050814956856, "grad_norm": 0.3735136389732361, "learning_rate": 6.859919198470288e-08, "loss": 0.031, "step": 18700 }, { "epoch": 17.938638542665387, "grad_norm": 0.2103312462568283, "learning_rate": 6.411778010340097e-08, "loss": 0.0322, "step": 18710 }, { "epoch": 17.948226270373922, "grad_norm": 0.19569391012191772, "learning_rate": 5.978764001359771e-08, "loss": 0.0291, "step": 18720 }, { "epoch": 17.957813998082454, "grad_norm": 0.25286465883255005, "learning_rate": 5.5608784832683616e-08, "loss": 0.0277, "step": 18730 }, { "epoch": 17.96740172579099, "grad_norm": 0.2856442332267761, "learning_rate": 5.158122721974357e-08, "loss": 0.0254, "step": 18740 }, { "epoch": 17.97698945349952, "grad_norm": 0.15211383998394012, "learning_rate": 4.770497937554574e-08, "loss": 0.024, "step": 18750 }, { "epoch": 17.986577181208055, "grad_norm": 0.28586897253990173, "learning_rate": 4.398005304248609e-08, "loss": 0.0239, "step": 18760 }, { "epoch": 17.996164908916587, "grad_norm": 0.18181052803993225, "learning_rate": 4.0406459504555016e-08, "loss": 0.0236, "step": 18770 }, { "epoch": 18.005752636625118, "grad_norm": 0.19704671204090118, "learning_rate": 3.698420958732074e-08, "loss": 0.0251, "step": 18780 }, { "epoch": 18.015340364333653, "grad_norm": 0.19747470319271088, "learning_rate": 3.371331365786823e-08, "loss": 0.0313, "step": 18790 }, { "epoch": 18.024928092042185, "grad_norm": 0.23974737524986267, "learning_rate": 3.05937816247992e-08, "loss": 0.0334, "step": 18800 }, { "epoch": 18.03451581975072, "grad_norm": 0.31815865635871887, "learning_rate": 2.7625622938165507e-08, "loss": 0.025, "step": 18810 }, { "epoch": 18.04410354745925, "grad_norm": 0.14651015400886536, "learning_rate": 2.4808846589474687e-08, "loss": 0.0252, "step": 18820 }, { "epoch": 18.053691275167786, "grad_norm": 0.31359338760375977, "learning_rate": 2.214346111164556e-08, "loss": 0.0255, "step": 18830 }, { "epoch": 18.063279002876317, "grad_norm": 0.3521699607372284, "learning_rate": 1.9629474578986008e-08, "loss": 0.0229, "step": 18840 }, { "epoch": 18.072866730584852, "grad_norm": 0.2816530168056488, "learning_rate": 1.726689460716524e-08, "loss": 0.0262, "step": 18850 }, { "epoch": 18.082454458293384, "grad_norm": 0.27596089243888855, "learning_rate": 1.5055728353191578e-08, "loss": 0.0266, "step": 18860 }, { "epoch": 18.09204218600192, "grad_norm": 0.25768667459487915, "learning_rate": 1.2995982515406901e-08, "loss": 0.0273, "step": 18870 }, { "epoch": 18.10162991371045, "grad_norm": 0.13152585923671722, "learning_rate": 1.1087663333431141e-08, "loss": 0.0268, "step": 18880 }, { "epoch": 18.111217641418985, "grad_norm": 0.1559949666261673, "learning_rate": 9.330776588184487e-09, "loss": 0.0307, "step": 18890 }, { "epoch": 18.120805369127517, "grad_norm": 0.25546255707740784, "learning_rate": 7.725327601826315e-09, "loss": 0.0254, "step": 18900 }, { "epoch": 18.13039309683605, "grad_norm": 0.17455005645751953, "learning_rate": 6.271321237788508e-09, "loss": 0.0331, "step": 18910 }, { "epoch": 18.139980824544583, "grad_norm": 0.25416553020477295, "learning_rate": 4.9687619007199316e-09, "loss": 0.0332, "step": 18920 }, { "epoch": 18.149568552253115, "grad_norm": 0.19471152126789093, "learning_rate": 3.817653536480892e-09, "loss": 0.0248, "step": 18930 }, { "epoch": 18.15915627996165, "grad_norm": 0.26644882559776306, "learning_rate": 2.8179996321597845e-09, "loss": 0.0248, "step": 18940 }, { "epoch": 18.16874400767018, "grad_norm": 0.18680621683597565, "learning_rate": 1.9698032160231363e-09, "loss": 0.0252, "step": 18950 }, { "epoch": 18.178331735378716, "grad_norm": 0.22466066479682922, "learning_rate": 1.2730668575322569e-09, "loss": 0.0221, "step": 18960 }, { "epoch": 18.187919463087248, "grad_norm": 0.27246662974357605, "learning_rate": 7.277926673210367e-10, "loss": 0.0258, "step": 18970 }, { "epoch": 18.197507190795783, "grad_norm": 0.17329837381839752, "learning_rate": 3.3398229720149607e-10, "loss": 0.0266, "step": 18980 }, { "epoch": 18.207094918504314, "grad_norm": 0.3577910363674164, "learning_rate": 9.163694015268398e-11, "loss": 0.0296, "step": 18990 }, { "epoch": 18.21668264621285, "grad_norm": 0.24373145401477814, "learning_rate": 7.57330315126481e-13, "loss": 0.029, "step": 19000 }, { "epoch": 18.21668264621285, "step": 19000, "total_flos": 0.0, "train_loss": 0.04628433942951654, "train_runtime": 5633.6681, "train_samples_per_second": 107.923, "train_steps_per_second": 3.373 } ], "logging_steps": 10, "max_steps": 19000, "num_input_tokens_seen": 0, "num_train_epochs": 19, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }