diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13343 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.21668264621285, + "eval_steps": 500, + "global_step": 19000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009587727708533078, + "grad_norm": 27.445323944091797, + "learning_rate": 9.473684210526317e-07, + "loss": 2.1709, + "step": 10 + }, + { + "epoch": 0.019175455417066157, + "grad_norm": 19.005075454711914, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8704, + "step": 20 + }, + { + "epoch": 0.028763183125599234, + "grad_norm": 14.785849571228027, + "learning_rate": 3.0526315789473684e-06, + "loss": 1.6318, + "step": 30 + }, + { + "epoch": 0.038350910834132314, + "grad_norm": 4.634030342102051, + "learning_rate": 4.105263157894737e-06, + "loss": 0.8641, + "step": 40 + }, + { + "epoch": 0.04793863854266539, + "grad_norm": 2.2945172786712646, + "learning_rate": 5.1578947368421055e-06, + "loss": 0.5394, + "step": 50 + }, + { + "epoch": 0.05752636625119847, + "grad_norm": 1.7087739706039429, + "learning_rate": 6.2105263157894745e-06, + "loss": 0.4525, + "step": 60 + }, + { + "epoch": 0.06711409395973154, + "grad_norm": 1.1094379425048828, + "learning_rate": 7.2631578947368426e-06, + "loss": 0.3063, + "step": 70 + }, + { + "epoch": 0.07670182166826463, + "grad_norm": 1.5301676988601685, + "learning_rate": 8.315789473684212e-06, + "loss": 0.3153, + "step": 80 + }, + { + "epoch": 0.0862895493767977, + "grad_norm": 1.1719224452972412, + "learning_rate": 9.368421052631579e-06, + "loss": 0.2466, + "step": 90 + }, + { + "epoch": 0.09587727708533078, + "grad_norm": 1.751291275024414, + "learning_rate": 1.0421052631578948e-05, + "loss": 0.27, + "step": 100 + }, + { + "epoch": 0.10546500479386385, + "grad_norm": 1.0524818897247314, + "learning_rate": 1.1473684210526315e-05, + "loss": 0.2333, + "step": 110 + }, + { + "epoch": 0.11505273250239693, + "grad_norm": 1.508988857269287, + "learning_rate": 1.2526315789473686e-05, + "loss": 0.2399, + "step": 120 + }, + { + "epoch": 0.12464046021093, + "grad_norm": 1.3286081552505493, + "learning_rate": 1.3578947368421053e-05, + "loss": 0.1962, + "step": 130 + }, + { + "epoch": 0.1342281879194631, + "grad_norm": 1.7412567138671875, + "learning_rate": 1.4631578947368422e-05, + "loss": 0.2004, + "step": 140 + }, + { + "epoch": 0.14381591562799617, + "grad_norm": 1.8567883968353271, + "learning_rate": 1.568421052631579e-05, + "loss": 0.174, + "step": 150 + }, + { + "epoch": 0.15340364333652926, + "grad_norm": 1.5139102935791016, + "learning_rate": 1.673684210526316e-05, + "loss": 0.1765, + "step": 160 + }, + { + "epoch": 0.1629913710450623, + "grad_norm": 1.6859902143478394, + "learning_rate": 1.7789473684210527e-05, + "loss": 0.168, + "step": 170 + }, + { + "epoch": 0.1725790987535954, + "grad_norm": 1.8252370357513428, + "learning_rate": 1.8842105263157894e-05, + "loss": 0.1645, + "step": 180 + }, + { + "epoch": 0.18216682646212848, + "grad_norm": 1.2732850313186646, + "learning_rate": 1.9894736842105265e-05, + "loss": 0.1554, + "step": 190 + }, + { + "epoch": 0.19175455417066156, + "grad_norm": 1.0456390380859375, + "learning_rate": 2.0947368421052632e-05, + "loss": 0.1575, + "step": 200 + }, + { + "epoch": 0.20134228187919462, + "grad_norm": 0.7651330828666687, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.163, + "step": 210 + }, + { + "epoch": 0.2109300095877277, + "grad_norm": 0.9984806776046753, + "learning_rate": 2.305263157894737e-05, + "loss": 0.1508, + "step": 220 + }, + { + "epoch": 0.22051773729626079, + "grad_norm": 1.0750813484191895, + "learning_rate": 2.410526315789474e-05, + "loss": 0.1349, + "step": 230 + }, + { + "epoch": 0.23010546500479387, + "grad_norm": 1.7777466773986816, + "learning_rate": 2.5157894736842108e-05, + "loss": 0.1448, + "step": 240 + }, + { + "epoch": 0.23969319271332695, + "grad_norm": 1.3516716957092285, + "learning_rate": 2.6210526315789475e-05, + "loss": 0.1427, + "step": 250 + }, + { + "epoch": 0.24928092042186, + "grad_norm": 1.1810095310211182, + "learning_rate": 2.7263157894736846e-05, + "loss": 0.1385, + "step": 260 + }, + { + "epoch": 0.2588686481303931, + "grad_norm": 1.6512832641601562, + "learning_rate": 2.8315789473684213e-05, + "loss": 0.155, + "step": 270 + }, + { + "epoch": 0.2684563758389262, + "grad_norm": 1.2209525108337402, + "learning_rate": 2.9368421052631577e-05, + "loss": 0.1378, + "step": 280 + }, + { + "epoch": 0.27804410354745923, + "grad_norm": 1.0236748456954956, + "learning_rate": 3.042105263157895e-05, + "loss": 0.1409, + "step": 290 + }, + { + "epoch": 0.28763183125599234, + "grad_norm": 1.065836787223816, + "learning_rate": 3.147368421052632e-05, + "loss": 0.1409, + "step": 300 + }, + { + "epoch": 0.2972195589645254, + "grad_norm": 1.0454283952713013, + "learning_rate": 3.2526315789473686e-05, + "loss": 0.1333, + "step": 310 + }, + { + "epoch": 0.3068072866730585, + "grad_norm": 0.5515532493591309, + "learning_rate": 3.357894736842105e-05, + "loss": 0.1137, + "step": 320 + }, + { + "epoch": 0.31639501438159157, + "grad_norm": 1.323104977607727, + "learning_rate": 3.463157894736842e-05, + "loss": 0.1317, + "step": 330 + }, + { + "epoch": 0.3259827420901246, + "grad_norm": 1.5426658391952515, + "learning_rate": 3.5684210526315794e-05, + "loss": 0.1174, + "step": 340 + }, + { + "epoch": 0.33557046979865773, + "grad_norm": 0.9131991863250732, + "learning_rate": 3.673684210526316e-05, + "loss": 0.1171, + "step": 350 + }, + { + "epoch": 0.3451581975071908, + "grad_norm": 1.0024508237838745, + "learning_rate": 3.778947368421053e-05, + "loss": 0.1162, + "step": 360 + }, + { + "epoch": 0.3547459252157239, + "grad_norm": 1.1091963052749634, + "learning_rate": 3.8842105263157896e-05, + "loss": 0.1272, + "step": 370 + }, + { + "epoch": 0.36433365292425696, + "grad_norm": 0.9772627949714661, + "learning_rate": 3.989473684210526e-05, + "loss": 0.1059, + "step": 380 + }, + { + "epoch": 0.37392138063279, + "grad_norm": 0.92393958568573, + "learning_rate": 4.094736842105264e-05, + "loss": 0.113, + "step": 390 + }, + { + "epoch": 0.3835091083413231, + "grad_norm": 0.9960997700691223, + "learning_rate": 4.2e-05, + "loss": 0.1077, + "step": 400 + }, + { + "epoch": 0.3930968360498562, + "grad_norm": 1.0618188381195068, + "learning_rate": 4.305263157894737e-05, + "loss": 0.1084, + "step": 410 + }, + { + "epoch": 0.40268456375838924, + "grad_norm": 0.7491030693054199, + "learning_rate": 4.410526315789474e-05, + "loss": 0.1021, + "step": 420 + }, + { + "epoch": 0.41227229146692235, + "grad_norm": 0.9327500462532043, + "learning_rate": 4.515789473684211e-05, + "loss": 0.0984, + "step": 430 + }, + { + "epoch": 0.4218600191754554, + "grad_norm": 0.7720574140548706, + "learning_rate": 4.6210526315789473e-05, + "loss": 0.0971, + "step": 440 + }, + { + "epoch": 0.4314477468839885, + "grad_norm": 1.2057392597198486, + "learning_rate": 4.726315789473684e-05, + "loss": 0.1088, + "step": 450 + }, + { + "epoch": 0.44103547459252157, + "grad_norm": 1.1223393678665161, + "learning_rate": 4.8315789473684215e-05, + "loss": 0.0992, + "step": 460 + }, + { + "epoch": 0.4506232023010546, + "grad_norm": 0.6742480397224426, + "learning_rate": 4.936842105263158e-05, + "loss": 0.0963, + "step": 470 + }, + { + "epoch": 0.46021093000958774, + "grad_norm": 1.0714161396026611, + "learning_rate": 5.042105263157895e-05, + "loss": 0.0974, + "step": 480 + }, + { + "epoch": 0.4697986577181208, + "grad_norm": 0.7936097383499146, + "learning_rate": 5.1473684210526317e-05, + "loss": 0.1022, + "step": 490 + }, + { + "epoch": 0.4793863854266539, + "grad_norm": 1.4822968244552612, + "learning_rate": 5.252631578947369e-05, + "loss": 0.0996, + "step": 500 + }, + { + "epoch": 0.48897411313518696, + "grad_norm": 1.0476019382476807, + "learning_rate": 5.357894736842105e-05, + "loss": 0.1018, + "step": 510 + }, + { + "epoch": 0.49856184084372, + "grad_norm": 0.9343310594558716, + "learning_rate": 5.4631578947368425e-05, + "loss": 0.102, + "step": 520 + }, + { + "epoch": 0.5081495685522531, + "grad_norm": 0.8918314576148987, + "learning_rate": 5.568421052631579e-05, + "loss": 0.0986, + "step": 530 + }, + { + "epoch": 0.5177372962607862, + "grad_norm": 1.155029296875, + "learning_rate": 5.6736842105263166e-05, + "loss": 0.1031, + "step": 540 + }, + { + "epoch": 0.5273250239693192, + "grad_norm": 0.625169038772583, + "learning_rate": 5.778947368421053e-05, + "loss": 0.0907, + "step": 550 + }, + { + "epoch": 0.5369127516778524, + "grad_norm": 1.0989243984222412, + "learning_rate": 5.88421052631579e-05, + "loss": 0.0843, + "step": 560 + }, + { + "epoch": 0.5465004793863855, + "grad_norm": 0.8834158778190613, + "learning_rate": 5.989473684210527e-05, + "loss": 0.0777, + "step": 570 + }, + { + "epoch": 0.5560882070949185, + "grad_norm": 0.7638639211654663, + "learning_rate": 6.094736842105263e-05, + "loss": 0.0781, + "step": 580 + }, + { + "epoch": 0.5656759348034516, + "grad_norm": 1.2423137426376343, + "learning_rate": 6.2e-05, + "loss": 0.0886, + "step": 590 + }, + { + "epoch": 0.5752636625119847, + "grad_norm": 1.082046627998352, + "learning_rate": 6.305263157894738e-05, + "loss": 0.0921, + "step": 600 + }, + { + "epoch": 0.5848513902205177, + "grad_norm": 0.8878996968269348, + "learning_rate": 6.410526315789474e-05, + "loss": 0.0926, + "step": 610 + }, + { + "epoch": 0.5944391179290508, + "grad_norm": 0.80406653881073, + "learning_rate": 6.515789473684211e-05, + "loss": 0.0983, + "step": 620 + }, + { + "epoch": 0.6040268456375839, + "grad_norm": 0.8726837038993835, + "learning_rate": 6.621052631578947e-05, + "loss": 0.0833, + "step": 630 + }, + { + "epoch": 0.613614573346117, + "grad_norm": 0.9104009866714478, + "learning_rate": 6.726315789473685e-05, + "loss": 0.0884, + "step": 640 + }, + { + "epoch": 0.62320230105465, + "grad_norm": 0.6089403629302979, + "learning_rate": 6.83157894736842e-05, + "loss": 0.0835, + "step": 650 + }, + { + "epoch": 0.6327900287631831, + "grad_norm": 0.8488327860832214, + "learning_rate": 6.936842105263158e-05, + "loss": 0.0812, + "step": 660 + }, + { + "epoch": 0.6423777564717162, + "grad_norm": 1.121718168258667, + "learning_rate": 7.042105263157895e-05, + "loss": 0.0979, + "step": 670 + }, + { + "epoch": 0.6519654841802492, + "grad_norm": 0.554762065410614, + "learning_rate": 7.147368421052631e-05, + "loss": 0.0941, + "step": 680 + }, + { + "epoch": 0.6615532118887824, + "grad_norm": 0.8173949718475342, + "learning_rate": 7.252631578947369e-05, + "loss": 0.09, + "step": 690 + }, + { + "epoch": 0.6711409395973155, + "grad_norm": 0.9960802793502808, + "learning_rate": 7.357894736842106e-05, + "loss": 0.0969, + "step": 700 + }, + { + "epoch": 0.6807286673058485, + "grad_norm": 0.9952852725982666, + "learning_rate": 7.463157894736844e-05, + "loss": 0.0927, + "step": 710 + }, + { + "epoch": 0.6903163950143816, + "grad_norm": 1.1024588346481323, + "learning_rate": 7.56842105263158e-05, + "loss": 0.0874, + "step": 720 + }, + { + "epoch": 0.6999041227229147, + "grad_norm": 0.7529568672180176, + "learning_rate": 7.673684210526316e-05, + "loss": 0.0853, + "step": 730 + }, + { + "epoch": 0.7094918504314478, + "grad_norm": 0.8373092412948608, + "learning_rate": 7.778947368421053e-05, + "loss": 0.0783, + "step": 740 + }, + { + "epoch": 0.7190795781399808, + "grad_norm": 0.6158662438392639, + "learning_rate": 7.884210526315789e-05, + "loss": 0.0872, + "step": 750 + }, + { + "epoch": 0.7286673058485139, + "grad_norm": 0.7315576076507568, + "learning_rate": 7.989473684210527e-05, + "loss": 0.0841, + "step": 760 + }, + { + "epoch": 0.738255033557047, + "grad_norm": 0.5791612267494202, + "learning_rate": 8.094736842105264e-05, + "loss": 0.0706, + "step": 770 + }, + { + "epoch": 0.74784276126558, + "grad_norm": 0.8657413721084595, + "learning_rate": 8.2e-05, + "loss": 0.0689, + "step": 780 + }, + { + "epoch": 0.7574304889741131, + "grad_norm": 0.9742875695228577, + "learning_rate": 8.305263157894737e-05, + "loss": 0.0869, + "step": 790 + }, + { + "epoch": 0.7670182166826462, + "grad_norm": 0.7406681776046753, + "learning_rate": 8.410526315789475e-05, + "loss": 0.0869, + "step": 800 + }, + { + "epoch": 0.7766059443911792, + "grad_norm": 1.168278455734253, + "learning_rate": 8.515789473684211e-05, + "loss": 0.0803, + "step": 810 + }, + { + "epoch": 0.7861936720997124, + "grad_norm": 1.1049866676330566, + "learning_rate": 8.621052631578947e-05, + "loss": 0.0851, + "step": 820 + }, + { + "epoch": 0.7957813998082455, + "grad_norm": 0.9790105223655701, + "learning_rate": 8.726315789473684e-05, + "loss": 0.0788, + "step": 830 + }, + { + "epoch": 0.8053691275167785, + "grad_norm": 0.762137770652771, + "learning_rate": 8.831578947368422e-05, + "loss": 0.0715, + "step": 840 + }, + { + "epoch": 0.8149568552253116, + "grad_norm": 0.8730412125587463, + "learning_rate": 8.936842105263158e-05, + "loss": 0.0898, + "step": 850 + }, + { + "epoch": 0.8245445829338447, + "grad_norm": 1.1794781684875488, + "learning_rate": 9.042105263157895e-05, + "loss": 0.0798, + "step": 860 + }, + { + "epoch": 0.8341323106423778, + "grad_norm": 0.7828540205955505, + "learning_rate": 9.147368421052633e-05, + "loss": 0.0848, + "step": 870 + }, + { + "epoch": 0.8437200383509108, + "grad_norm": 0.7496788501739502, + "learning_rate": 9.252631578947369e-05, + "loss": 0.0836, + "step": 880 + }, + { + "epoch": 0.8533077660594439, + "grad_norm": 0.7298113703727722, + "learning_rate": 9.357894736842106e-05, + "loss": 0.0804, + "step": 890 + }, + { + "epoch": 0.862895493767977, + "grad_norm": 0.7915740609169006, + "learning_rate": 9.463157894736842e-05, + "loss": 0.0978, + "step": 900 + }, + { + "epoch": 0.87248322147651, + "grad_norm": 0.6587068438529968, + "learning_rate": 9.56842105263158e-05, + "loss": 0.0823, + "step": 910 + }, + { + "epoch": 0.8820709491850431, + "grad_norm": 0.6733153462409973, + "learning_rate": 9.673684210526316e-05, + "loss": 0.0903, + "step": 920 + }, + { + "epoch": 0.8916586768935763, + "grad_norm": 0.8253368139266968, + "learning_rate": 9.778947368421053e-05, + "loss": 0.0817, + "step": 930 + }, + { + "epoch": 0.9012464046021093, + "grad_norm": 0.631831169128418, + "learning_rate": 9.88421052631579e-05, + "loss": 0.0692, + "step": 940 + }, + { + "epoch": 0.9108341323106424, + "grad_norm": 0.4998478293418884, + "learning_rate": 9.989473684210526e-05, + "loss": 0.08, + "step": 950 + }, + { + "epoch": 0.9204218600191755, + "grad_norm": 0.5345643162727356, + "learning_rate": 9.999993865625701e-05, + "loss": 0.0707, + "step": 960 + }, + { + "epoch": 0.9300095877277086, + "grad_norm": 0.496713250875473, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0759, + "step": 970 + }, + { + "epoch": 0.9395973154362416, + "grad_norm": 0.4693014621734619, + "learning_rate": 9.999936308655709e-05, + "loss": 0.0781, + "step": 980 + }, + { + "epoch": 0.9491850431447747, + "grad_norm": 0.5775050520896912, + "learning_rate": 9.999884810501344e-05, + "loss": 0.0748, + "step": 990 + }, + { + "epoch": 0.9587727708533078, + "grad_norm": 0.7837674021720886, + "learning_rate": 9.999818166093444e-05, + "loss": 0.0783, + "step": 1000 + }, + { + "epoch": 0.9683604985618408, + "grad_norm": 0.6740615367889404, + "learning_rate": 9.999736375633896e-05, + "loss": 0.0799, + "step": 1010 + }, + { + "epoch": 0.9779482262703739, + "grad_norm": 0.644281268119812, + "learning_rate": 9.999639439370469e-05, + "loss": 0.0875, + "step": 1020 + }, + { + "epoch": 0.987535953978907, + "grad_norm": 0.6877675652503967, + "learning_rate": 9.999527357596816e-05, + "loss": 0.0702, + "step": 1030 + }, + { + "epoch": 0.99712368168744, + "grad_norm": 0.8206673860549927, + "learning_rate": 9.999400130652465e-05, + "loss": 0.0705, + "step": 1040 + }, + { + "epoch": 1.0067114093959733, + "grad_norm": 0.5425058007240295, + "learning_rate": 9.999257758922833e-05, + "loss": 0.0773, + "step": 1050 + }, + { + "epoch": 1.0162991371045063, + "grad_norm": 0.7658944725990295, + "learning_rate": 9.999100242839203e-05, + "loss": 0.0777, + "step": 1060 + }, + { + "epoch": 1.0258868648130393, + "grad_norm": 0.73934006690979, + "learning_rate": 9.998927582878747e-05, + "loss": 0.0685, + "step": 1070 + }, + { + "epoch": 1.0354745925215725, + "grad_norm": 0.38501349091529846, + "learning_rate": 9.998739779564506e-05, + "loss": 0.069, + "step": 1080 + }, + { + "epoch": 1.0450623202301055, + "grad_norm": 0.45449578762054443, + "learning_rate": 9.998536833465394e-05, + "loss": 0.0559, + "step": 1090 + }, + { + "epoch": 1.0546500479386385, + "grad_norm": 0.8127736449241638, + "learning_rate": 9.998318745196203e-05, + "loss": 0.068, + "step": 1100 + }, + { + "epoch": 1.0642377756471717, + "grad_norm": 0.6800121068954468, + "learning_rate": 9.998085515417588e-05, + "loss": 0.0683, + "step": 1110 + }, + { + "epoch": 1.0738255033557047, + "grad_norm": 0.688755214214325, + "learning_rate": 9.997837144836082e-05, + "loss": 0.0619, + "step": 1120 + }, + { + "epoch": 1.0834132310642377, + "grad_norm": 0.6529737710952759, + "learning_rate": 9.997573634204074e-05, + "loss": 0.0716, + "step": 1130 + }, + { + "epoch": 1.093000958772771, + "grad_norm": 0.773915708065033, + "learning_rate": 9.997294984319827e-05, + "loss": 0.0667, + "step": 1140 + }, + { + "epoch": 1.102588686481304, + "grad_norm": 0.611422061920166, + "learning_rate": 9.997001196027457e-05, + "loss": 0.0695, + "step": 1150 + }, + { + "epoch": 1.112176414189837, + "grad_norm": 0.6238502264022827, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0632, + "step": 1160 + }, + { + "epoch": 1.1217641418983701, + "grad_norm": 0.6252961158752441, + "learning_rate": 9.996368207824128e-05, + "loss": 0.0708, + "step": 1170 + }, + { + "epoch": 1.1313518696069031, + "grad_norm": 0.3486538529396057, + "learning_rate": 9.996029009830689e-05, + "loss": 0.0662, + "step": 1180 + }, + { + "epoch": 1.1409395973154361, + "grad_norm": 0.40418991446495056, + "learning_rate": 9.995674677264173e-05, + "loss": 0.0591, + "step": 1190 + }, + { + "epoch": 1.1505273250239694, + "grad_norm": 0.4740557074546814, + "learning_rate": 9.995305211197965e-05, + "loss": 0.0701, + "step": 1200 + }, + { + "epoch": 1.1601150527325024, + "grad_norm": 0.713366687297821, + "learning_rate": 9.994920612751295e-05, + "loss": 0.073, + "step": 1210 + }, + { + "epoch": 1.1697027804410354, + "grad_norm": 0.6612546443939209, + "learning_rate": 9.994520883089238e-05, + "loss": 0.0681, + "step": 1220 + }, + { + "epoch": 1.1792905081495686, + "grad_norm": 0.6933987736701965, + "learning_rate": 9.994106023422699e-05, + "loss": 0.0655, + "step": 1230 + }, + { + "epoch": 1.1888782358581016, + "grad_norm": 0.4890410602092743, + "learning_rate": 9.993676035008423e-05, + "loss": 0.0633, + "step": 1240 + }, + { + "epoch": 1.1984659635666346, + "grad_norm": 0.5587823987007141, + "learning_rate": 9.993230919148985e-05, + "loss": 0.0656, + "step": 1250 + }, + { + "epoch": 1.2080536912751678, + "grad_norm": 0.6635778546333313, + "learning_rate": 9.99277067719278e-05, + "loss": 0.0603, + "step": 1260 + }, + { + "epoch": 1.2176414189837008, + "grad_norm": 0.6514385342597961, + "learning_rate": 9.99229531053403e-05, + "loss": 0.0652, + "step": 1270 + }, + { + "epoch": 1.2272291466922338, + "grad_norm": 0.5782362818717957, + "learning_rate": 9.991804820612773e-05, + "loss": 0.0644, + "step": 1280 + }, + { + "epoch": 1.236816874400767, + "grad_norm": 0.39845097064971924, + "learning_rate": 9.99129920891486e-05, + "loss": 0.0617, + "step": 1290 + }, + { + "epoch": 1.2464046021093, + "grad_norm": 0.5628125667572021, + "learning_rate": 9.990778476971951e-05, + "loss": 0.0613, + "step": 1300 + }, + { + "epoch": 1.255992329817833, + "grad_norm": 0.4811013340950012, + "learning_rate": 9.99024262636151e-05, + "loss": 0.0644, + "step": 1310 + }, + { + "epoch": 1.2655800575263663, + "grad_norm": 0.540348470211029, + "learning_rate": 9.989691658706798e-05, + "loss": 0.063, + "step": 1320 + }, + { + "epoch": 1.2751677852348993, + "grad_norm": 0.593609631061554, + "learning_rate": 9.989125575676876e-05, + "loss": 0.0537, + "step": 1330 + }, + { + "epoch": 1.2847555129434325, + "grad_norm": 0.4400087296962738, + "learning_rate": 9.988544378986591e-05, + "loss": 0.0634, + "step": 1340 + }, + { + "epoch": 1.2943432406519655, + "grad_norm": 0.7038517594337463, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0564, + "step": 1350 + }, + { + "epoch": 1.3039309683604985, + "grad_norm": 0.4805976450443268, + "learning_rate": 9.987336651713229e-05, + "loss": 0.0604, + "step": 1360 + }, + { + "epoch": 1.3135186960690317, + "grad_norm": 0.5478856563568115, + "learning_rate": 9.986710124788745e-05, + "loss": 0.0573, + "step": 1370 + }, + { + "epoch": 1.3231064237775647, + "grad_norm": 0.6592814922332764, + "learning_rate": 9.986068491521072e-05, + "loss": 0.0604, + "step": 1380 + }, + { + "epoch": 1.332694151486098, + "grad_norm": 0.7848181128501892, + "learning_rate": 9.985411753853921e-05, + "loss": 0.055, + "step": 1390 + }, + { + "epoch": 1.342281879194631, + "grad_norm": 0.40262654423713684, + "learning_rate": 9.984739913776765e-05, + "loss": 0.0629, + "step": 1400 + }, + { + "epoch": 1.351869606903164, + "grad_norm": 0.6241422295570374, + "learning_rate": 9.984052973324817e-05, + "loss": 0.0609, + "step": 1410 + }, + { + "epoch": 1.3614573346116972, + "grad_norm": 0.7500850558280945, + "learning_rate": 9.983350934579046e-05, + "loss": 0.0742, + "step": 1420 + }, + { + "epoch": 1.3710450623202302, + "grad_norm": 0.6990365386009216, + "learning_rate": 9.982633799666146e-05, + "loss": 0.0605, + "step": 1430 + }, + { + "epoch": 1.3806327900287632, + "grad_norm": 0.5741100311279297, + "learning_rate": 9.981901570758554e-05, + "loss": 0.0639, + "step": 1440 + }, + { + "epoch": 1.3902205177372964, + "grad_norm": 0.6131389141082764, + "learning_rate": 9.981154250074422e-05, + "loss": 0.0695, + "step": 1450 + }, + { + "epoch": 1.3998082454458294, + "grad_norm": 0.6654881834983826, + "learning_rate": 9.980391839877628e-05, + "loss": 0.0755, + "step": 1460 + }, + { + "epoch": 1.4093959731543624, + "grad_norm": 0.5249256491661072, + "learning_rate": 9.979614342477753e-05, + "loss": 0.0613, + "step": 1470 + }, + { + "epoch": 1.4189837008628956, + "grad_norm": 0.5373178124427795, + "learning_rate": 9.978821760230086e-05, + "loss": 0.072, + "step": 1480 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.4792821407318115, + "learning_rate": 9.978014095535615e-05, + "loss": 0.0549, + "step": 1490 + }, + { + "epoch": 1.4381591562799616, + "grad_norm": 0.5644699931144714, + "learning_rate": 9.977191350841016e-05, + "loss": 0.065, + "step": 1500 + }, + { + "epoch": 1.4477468839884948, + "grad_norm": 0.374956339597702, + "learning_rate": 9.976353528638642e-05, + "loss": 0.0545, + "step": 1510 + }, + { + "epoch": 1.4573346116970278, + "grad_norm": 0.4185064733028412, + "learning_rate": 9.975500631466527e-05, + "loss": 0.0619, + "step": 1520 + }, + { + "epoch": 1.4669223394055608, + "grad_norm": 0.3903638422489166, + "learning_rate": 9.974632661908372e-05, + "loss": 0.0526, + "step": 1530 + }, + { + "epoch": 1.476510067114094, + "grad_norm": 0.45104435086250305, + "learning_rate": 9.973749622593534e-05, + "loss": 0.061, + "step": 1540 + }, + { + "epoch": 1.486097794822627, + "grad_norm": 0.4152944087982178, + "learning_rate": 9.972851516197019e-05, + "loss": 0.0635, + "step": 1550 + }, + { + "epoch": 1.49568552253116, + "grad_norm": 0.5824716091156006, + "learning_rate": 9.971938345439484e-05, + "loss": 0.0598, + "step": 1560 + }, + { + "epoch": 1.5052732502396933, + "grad_norm": 0.5598675608634949, + "learning_rate": 9.971010113087212e-05, + "loss": 0.0529, + "step": 1570 + }, + { + "epoch": 1.5148609779482263, + "grad_norm": 0.6759763956069946, + "learning_rate": 9.970066821952118e-05, + "loss": 0.0687, + "step": 1580 + }, + { + "epoch": 1.5244487056567593, + "grad_norm": 0.4682703912258148, + "learning_rate": 9.969108474891732e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 1.5340364333652925, + "grad_norm": 0.6091550588607788, + "learning_rate": 9.968135074809194e-05, + "loss": 0.0628, + "step": 1600 + }, + { + "epoch": 1.5436241610738255, + "grad_norm": 0.5167152881622314, + "learning_rate": 9.96714662465325e-05, + "loss": 0.056, + "step": 1610 + }, + { + "epoch": 1.5532118887823585, + "grad_norm": 0.5612486004829407, + "learning_rate": 9.966143127418225e-05, + "loss": 0.0565, + "step": 1620 + }, + { + "epoch": 1.5627996164908917, + "grad_norm": 0.3620167672634125, + "learning_rate": 9.965124586144039e-05, + "loss": 0.0533, + "step": 1630 + }, + { + "epoch": 1.5723873441994247, + "grad_norm": 0.6704486012458801, + "learning_rate": 9.964091003916179e-05, + "loss": 0.0633, + "step": 1640 + }, + { + "epoch": 1.5819750719079577, + "grad_norm": 0.6531718969345093, + "learning_rate": 9.963042383865694e-05, + "loss": 0.0665, + "step": 1650 + }, + { + "epoch": 1.591562799616491, + "grad_norm": 0.5249754786491394, + "learning_rate": 9.961978729169192e-05, + "loss": 0.0471, + "step": 1660 + }, + { + "epoch": 1.601150527325024, + "grad_norm": 0.4377578794956207, + "learning_rate": 9.960900043048826e-05, + "loss": 0.0561, + "step": 1670 + }, + { + "epoch": 1.610738255033557, + "grad_norm": 0.34821832180023193, + "learning_rate": 9.959806328772279e-05, + "loss": 0.0575, + "step": 1680 + }, + { + "epoch": 1.6203259827420902, + "grad_norm": 0.41964197158813477, + "learning_rate": 9.958697589652763e-05, + "loss": 0.0552, + "step": 1690 + }, + { + "epoch": 1.6299137104506232, + "grad_norm": 0.5038737058639526, + "learning_rate": 9.957573829049004e-05, + "loss": 0.0571, + "step": 1700 + }, + { + "epoch": 1.6395014381591562, + "grad_norm": 0.5568312406539917, + "learning_rate": 9.956435050365233e-05, + "loss": 0.0535, + "step": 1710 + }, + { + "epoch": 1.6490891658676894, + "grad_norm": 0.3089469075202942, + "learning_rate": 9.955281257051178e-05, + "loss": 0.0567, + "step": 1720 + }, + { + "epoch": 1.6586768935762224, + "grad_norm": 0.5025231838226318, + "learning_rate": 9.954112452602045e-05, + "loss": 0.0595, + "step": 1730 + }, + { + "epoch": 1.6682646212847554, + "grad_norm": 0.6473100185394287, + "learning_rate": 9.952928640558519e-05, + "loss": 0.0583, + "step": 1740 + }, + { + "epoch": 1.6778523489932886, + "grad_norm": 0.38910412788391113, + "learning_rate": 9.951729824506745e-05, + "loss": 0.0606, + "step": 1750 + }, + { + "epoch": 1.6874400767018218, + "grad_norm": 0.5367538332939148, + "learning_rate": 9.950516008078325e-05, + "loss": 0.0658, + "step": 1760 + }, + { + "epoch": 1.6970278044103546, + "grad_norm": 0.5526398420333862, + "learning_rate": 9.949287194950293e-05, + "loss": 0.0554, + "step": 1770 + }, + { + "epoch": 1.7066155321188878, + "grad_norm": 0.5616441369056702, + "learning_rate": 9.948043388845121e-05, + "loss": 0.0579, + "step": 1780 + }, + { + "epoch": 1.716203259827421, + "grad_norm": 0.41163280606269836, + "learning_rate": 9.946784593530694e-05, + "loss": 0.0612, + "step": 1790 + }, + { + "epoch": 1.7257909875359538, + "grad_norm": 0.45861759781837463, + "learning_rate": 9.945510812820308e-05, + "loss": 0.0524, + "step": 1800 + }, + { + "epoch": 1.735378715244487, + "grad_norm": 0.4847518503665924, + "learning_rate": 9.944222050572653e-05, + "loss": 0.0545, + "step": 1810 + }, + { + "epoch": 1.7449664429530203, + "grad_norm": 0.36065423488616943, + "learning_rate": 9.942918310691803e-05, + "loss": 0.0503, + "step": 1820 + }, + { + "epoch": 1.754554170661553, + "grad_norm": 0.5361629128456116, + "learning_rate": 9.941599597127202e-05, + "loss": 0.0582, + "step": 1830 + }, + { + "epoch": 1.7641418983700863, + "grad_norm": 0.290815532207489, + "learning_rate": 9.940265913873657e-05, + "loss": 0.0626, + "step": 1840 + }, + { + "epoch": 1.7737296260786195, + "grad_norm": 0.3743116855621338, + "learning_rate": 9.938917264971324e-05, + "loss": 0.0577, + "step": 1850 + }, + { + "epoch": 1.7833173537871523, + "grad_norm": 0.7040207982063293, + "learning_rate": 9.937553654505691e-05, + "loss": 0.0625, + "step": 1860 + }, + { + "epoch": 1.7929050814956855, + "grad_norm": 0.4356692135334015, + "learning_rate": 9.936175086607572e-05, + "loss": 0.0616, + "step": 1870 + }, + { + "epoch": 1.8024928092042187, + "grad_norm": 0.3443772494792938, + "learning_rate": 9.934781565453089e-05, + "loss": 0.0573, + "step": 1880 + }, + { + "epoch": 1.8120805369127517, + "grad_norm": 0.4956841766834259, + "learning_rate": 9.933373095263667e-05, + "loss": 0.0528, + "step": 1890 + }, + { + "epoch": 1.8216682646212847, + "grad_norm": 0.5193634629249573, + "learning_rate": 9.931949680306012e-05, + "loss": 0.0548, + "step": 1900 + }, + { + "epoch": 1.831255992329818, + "grad_norm": 0.3799174129962921, + "learning_rate": 9.930511324892104e-05, + "loss": 0.0563, + "step": 1910 + }, + { + "epoch": 1.840843720038351, + "grad_norm": 0.3923283815383911, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0595, + "step": 1920 + }, + { + "epoch": 1.850431447746884, + "grad_norm": 0.47552716732025146, + "learning_rate": 9.927589810169733e-05, + "loss": 0.0546, + "step": 1930 + }, + { + "epoch": 1.8600191754554172, + "grad_norm": 0.4305611848831177, + "learning_rate": 9.926106659711476e-05, + "loss": 0.0523, + "step": 1940 + }, + { + "epoch": 1.8696069031639502, + "grad_norm": 0.5576485395431519, + "learning_rate": 9.924608586497348e-05, + "loss": 0.0574, + "step": 1950 + }, + { + "epoch": 1.8791946308724832, + "grad_norm": 0.31708958745002747, + "learning_rate": 9.923095595065494e-05, + "loss": 0.0482, + "step": 1960 + }, + { + "epoch": 1.8887823585810164, + "grad_norm": 0.41617056727409363, + "learning_rate": 9.921567689999247e-05, + "loss": 0.0584, + "step": 1970 + }, + { + "epoch": 1.8983700862895494, + "grad_norm": 0.5047758221626282, + "learning_rate": 9.920024875927125e-05, + "loss": 0.0642, + "step": 1980 + }, + { + "epoch": 1.9079578139980824, + "grad_norm": 0.4173164367675781, + "learning_rate": 9.918467157522805e-05, + "loss": 0.0548, + "step": 1990 + }, + { + "epoch": 1.9175455417066156, + "grad_norm": 0.4640159010887146, + "learning_rate": 9.916894539505115e-05, + "loss": 0.0499, + "step": 2000 + }, + { + "epoch": 1.9271332694151486, + "grad_norm": 0.41713109612464905, + "learning_rate": 9.915307026638018e-05, + "loss": 0.0491, + "step": 2010 + }, + { + "epoch": 1.9367209971236816, + "grad_norm": 0.392994225025177, + "learning_rate": 9.9137046237306e-05, + "loss": 0.0522, + "step": 2020 + }, + { + "epoch": 1.9463087248322148, + "grad_norm": 0.32308030128479004, + "learning_rate": 9.912087335637054e-05, + "loss": 0.0557, + "step": 2030 + }, + { + "epoch": 1.9558964525407478, + "grad_norm": 0.406943678855896, + "learning_rate": 9.910455167256663e-05, + "loss": 0.0523, + "step": 2040 + }, + { + "epoch": 1.9654841802492808, + "grad_norm": 0.3809382915496826, + "learning_rate": 9.908808123533787e-05, + "loss": 0.0567, + "step": 2050 + }, + { + "epoch": 1.975071907957814, + "grad_norm": 0.3431997299194336, + "learning_rate": 9.907146209457852e-05, + "loss": 0.0456, + "step": 2060 + }, + { + "epoch": 1.984659635666347, + "grad_norm": 0.37939101457595825, + "learning_rate": 9.905469430063325e-05, + "loss": 0.0479, + "step": 2070 + }, + { + "epoch": 1.99424736337488, + "grad_norm": 0.492702841758728, + "learning_rate": 9.903777790429714e-05, + "loss": 0.048, + "step": 2080 + }, + { + "epoch": 2.0038350910834133, + "grad_norm": 0.41130146384239197, + "learning_rate": 9.90207129568153e-05, + "loss": 0.0545, + "step": 2090 + }, + { + "epoch": 2.0134228187919465, + "grad_norm": 0.5280721187591553, + "learning_rate": 9.900349950988297e-05, + "loss": 0.0516, + "step": 2100 + }, + { + "epoch": 2.0230105465004793, + "grad_norm": 0.3090174198150635, + "learning_rate": 9.89861376156452e-05, + "loss": 0.043, + "step": 2110 + }, + { + "epoch": 2.0325982742090125, + "grad_norm": 0.35579144954681396, + "learning_rate": 9.896862732669671e-05, + "loss": 0.0584, + "step": 2120 + }, + { + "epoch": 2.0421860019175457, + "grad_norm": 0.44842928647994995, + "learning_rate": 9.89509686960818e-05, + "loss": 0.0523, + "step": 2130 + }, + { + "epoch": 2.0517737296260785, + "grad_norm": 0.4050745666027069, + "learning_rate": 9.893316177729411e-05, + "loss": 0.0529, + "step": 2140 + }, + { + "epoch": 2.0613614573346117, + "grad_norm": 0.2710857093334198, + "learning_rate": 9.891520662427651e-05, + "loss": 0.0582, + "step": 2150 + }, + { + "epoch": 2.070949185043145, + "grad_norm": 0.327932745218277, + "learning_rate": 9.88971032914209e-05, + "loss": 0.056, + "step": 2160 + }, + { + "epoch": 2.0805369127516777, + "grad_norm": 0.41889169812202454, + "learning_rate": 9.887885183356809e-05, + "loss": 0.0449, + "step": 2170 + }, + { + "epoch": 2.090124640460211, + "grad_norm": 0.37824153900146484, + "learning_rate": 9.886045230600757e-05, + "loss": 0.0478, + "step": 2180 + }, + { + "epoch": 2.099712368168744, + "grad_norm": 0.4298747479915619, + "learning_rate": 9.884190476447746e-05, + "loss": 0.0479, + "step": 2190 + }, + { + "epoch": 2.109300095877277, + "grad_norm": 0.5047415494918823, + "learning_rate": 9.882320926516416e-05, + "loss": 0.0509, + "step": 2200 + }, + { + "epoch": 2.11888782358581, + "grad_norm": 0.3802444338798523, + "learning_rate": 9.880436586470234e-05, + "loss": 0.0469, + "step": 2210 + }, + { + "epoch": 2.1284755512943434, + "grad_norm": 0.3608779311180115, + "learning_rate": 9.87853746201747e-05, + "loss": 0.0499, + "step": 2220 + }, + { + "epoch": 2.138063279002876, + "grad_norm": 0.49108660221099854, + "learning_rate": 9.876623558911181e-05, + "loss": 0.0494, + "step": 2230 + }, + { + "epoch": 2.1476510067114094, + "grad_norm": 0.35984379053115845, + "learning_rate": 9.874694882949194e-05, + "loss": 0.0513, + "step": 2240 + }, + { + "epoch": 2.1572387344199426, + "grad_norm": 0.6457746624946594, + "learning_rate": 9.872751439974084e-05, + "loss": 0.0497, + "step": 2250 + }, + { + "epoch": 2.1668264621284754, + "grad_norm": 0.4572752118110657, + "learning_rate": 9.870793235873164e-05, + "loss": 0.0497, + "step": 2260 + }, + { + "epoch": 2.1764141898370086, + "grad_norm": 0.5329883098602295, + "learning_rate": 9.868820276578463e-05, + "loss": 0.0597, + "step": 2270 + }, + { + "epoch": 2.186001917545542, + "grad_norm": 0.4147273302078247, + "learning_rate": 9.866832568066706e-05, + "loss": 0.0537, + "step": 2280 + }, + { + "epoch": 2.1955896452540746, + "grad_norm": 0.3269449770450592, + "learning_rate": 9.864830116359299e-05, + "loss": 0.0541, + "step": 2290 + }, + { + "epoch": 2.205177372962608, + "grad_norm": 0.38033929467201233, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0493, + "step": 2300 + }, + { + "epoch": 2.214765100671141, + "grad_norm": 0.39863190054893494, + "learning_rate": 9.86078100766645e-05, + "loss": 0.0582, + "step": 2310 + }, + { + "epoch": 2.224352828379674, + "grad_norm": 0.3785865604877472, + "learning_rate": 9.858734362947056e-05, + "loss": 0.0451, + "step": 2320 + }, + { + "epoch": 2.233940556088207, + "grad_norm": 0.3535449802875519, + "learning_rate": 9.856672999564072e-05, + "loss": 0.0569, + "step": 2330 + }, + { + "epoch": 2.2435282837967403, + "grad_norm": 0.43401646614074707, + "learning_rate": 9.854596923762026e-05, + "loss": 0.0451, + "step": 2340 + }, + { + "epoch": 2.253116011505273, + "grad_norm": 0.3438590466976166, + "learning_rate": 9.852506141830018e-05, + "loss": 0.0527, + "step": 2350 + }, + { + "epoch": 2.2627037392138063, + "grad_norm": 0.524154543876648, + "learning_rate": 9.850400660101698e-05, + "loss": 0.0536, + "step": 2360 + }, + { + "epoch": 2.2722914669223395, + "grad_norm": 0.6278344392776489, + "learning_rate": 9.848280484955243e-05, + "loss": 0.0566, + "step": 2370 + }, + { + "epoch": 2.2818791946308723, + "grad_norm": 0.45389410853385925, + "learning_rate": 9.846145622813343e-05, + "loss": 0.0538, + "step": 2380 + }, + { + "epoch": 2.2914669223394055, + "grad_norm": 0.3653407692909241, + "learning_rate": 9.843996080143181e-05, + "loss": 0.0496, + "step": 2390 + }, + { + "epoch": 2.3010546500479387, + "grad_norm": 0.39420798420906067, + "learning_rate": 9.84183186345641e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 2.310642377756472, + "grad_norm": 0.36511731147766113, + "learning_rate": 9.839652979309135e-05, + "loss": 0.0415, + "step": 2410 + }, + { + "epoch": 2.3202301054650047, + "grad_norm": 0.6739844679832458, + "learning_rate": 9.837459434301896e-05, + "loss": 0.0497, + "step": 2420 + }, + { + "epoch": 2.329817833173538, + "grad_norm": 0.3520050346851349, + "learning_rate": 9.835251235079643e-05, + "loss": 0.0476, + "step": 2430 + }, + { + "epoch": 2.3394055608820707, + "grad_norm": 0.3880830705165863, + "learning_rate": 9.833028388331719e-05, + "loss": 0.0477, + "step": 2440 + }, + { + "epoch": 2.348993288590604, + "grad_norm": 0.5605785250663757, + "learning_rate": 9.830790900791842e-05, + "loss": 0.0565, + "step": 2450 + }, + { + "epoch": 2.358581016299137, + "grad_norm": 0.43835964798927307, + "learning_rate": 9.828538779238074e-05, + "loss": 0.0481, + "step": 2460 + }, + { + "epoch": 2.3681687440076704, + "grad_norm": 0.46309876441955566, + "learning_rate": 9.826272030492817e-05, + "loss": 0.0459, + "step": 2470 + }, + { + "epoch": 2.377756471716203, + "grad_norm": 0.315773606300354, + "learning_rate": 9.823990661422778e-05, + "loss": 0.0446, + "step": 2480 + }, + { + "epoch": 2.3873441994247364, + "grad_norm": 0.37291958928108215, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 2.396931927133269, + "grad_norm": 0.5233327150344849, + "learning_rate": 9.819384089996613e-05, + "loss": 0.0494, + "step": 2500 + }, + { + "epoch": 2.4065196548418024, + "grad_norm": 0.33032602071762085, + "learning_rate": 9.817058901595269e-05, + "loss": 0.0586, + "step": 2510 + }, + { + "epoch": 2.4161073825503356, + "grad_norm": 0.39209842681884766, + "learning_rate": 9.814719120778663e-05, + "loss": 0.0528, + "step": 2520 + }, + { + "epoch": 2.425695110258869, + "grad_norm": 0.3824262320995331, + "learning_rate": 9.81236475463474e-05, + "loss": 0.0502, + "step": 2530 + }, + { + "epoch": 2.4352828379674016, + "grad_norm": 0.4724734127521515, + "learning_rate": 9.809995810295633e-05, + "loss": 0.0538, + "step": 2540 + }, + { + "epoch": 2.444870565675935, + "grad_norm": 0.4816121459007263, + "learning_rate": 9.80761229493763e-05, + "loss": 0.0599, + "step": 2550 + }, + { + "epoch": 2.4544582933844676, + "grad_norm": 0.4902478754520416, + "learning_rate": 9.805214215781165e-05, + "loss": 0.0579, + "step": 2560 + }, + { + "epoch": 2.464046021093001, + "grad_norm": 0.4263833463191986, + "learning_rate": 9.802801580090785e-05, + "loss": 0.0496, + "step": 2570 + }, + { + "epoch": 2.473633748801534, + "grad_norm": 0.4122842848300934, + "learning_rate": 9.800374395175143e-05, + "loss": 0.0601, + "step": 2580 + }, + { + "epoch": 2.4832214765100673, + "grad_norm": 0.3193143308162689, + "learning_rate": 9.797932668386955e-05, + "loss": 0.0453, + "step": 2590 + }, + { + "epoch": 2.4928092042186, + "grad_norm": 0.302079439163208, + "learning_rate": 9.795476407122994e-05, + "loss": 0.0526, + "step": 2600 + }, + { + "epoch": 2.5023969319271333, + "grad_norm": 0.3169849216938019, + "learning_rate": 9.793005618824066e-05, + "loss": 0.0475, + "step": 2610 + }, + { + "epoch": 2.511984659635666, + "grad_norm": 0.35016322135925293, + "learning_rate": 9.790520310974978e-05, + "loss": 0.0523, + "step": 2620 + }, + { + "epoch": 2.5215723873441993, + "grad_norm": 0.5532832741737366, + "learning_rate": 9.788020491104524e-05, + "loss": 0.0516, + "step": 2630 + }, + { + "epoch": 2.5311601150527325, + "grad_norm": 0.48316141963005066, + "learning_rate": 9.785506166785461e-05, + "loss": 0.0455, + "step": 2640 + }, + { + "epoch": 2.5407478427612658, + "grad_norm": 0.53989177942276, + "learning_rate": 9.78297734563448e-05, + "loss": 0.05, + "step": 2650 + }, + { + "epoch": 2.5503355704697985, + "grad_norm": 0.44286760687828064, + "learning_rate": 9.780434035312196e-05, + "loss": 0.0552, + "step": 2660 + }, + { + "epoch": 2.5599232981783318, + "grad_norm": 0.5638286471366882, + "learning_rate": 9.777876243523108e-05, + "loss": 0.062, + "step": 2670 + }, + { + "epoch": 2.569511025886865, + "grad_norm": 0.45765963196754456, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0535, + "step": 2680 + }, + { + "epoch": 2.5790987535953978, + "grad_norm": 0.3893742859363556, + "learning_rate": 9.772717246581848e-05, + "loss": 0.055, + "step": 2690 + }, + { + "epoch": 2.588686481303931, + "grad_norm": 0.4707334637641907, + "learning_rate": 9.770116057057933e-05, + "loss": 0.055, + "step": 2700 + }, + { + "epoch": 2.598274209012464, + "grad_norm": 0.4900120198726654, + "learning_rate": 9.767500417323676e-05, + "loss": 0.056, + "step": 2710 + }, + { + "epoch": 2.607861936720997, + "grad_norm": 0.3331255316734314, + "learning_rate": 9.764870335302689e-05, + "loss": 0.0502, + "step": 2720 + }, + { + "epoch": 2.61744966442953, + "grad_norm": 0.47928670048713684, + "learning_rate": 9.762225818962336e-05, + "loss": 0.0514, + "step": 2730 + }, + { + "epoch": 2.6270373921380634, + "grad_norm": 0.3848089873790741, + "learning_rate": 9.759566876313701e-05, + "loss": 0.044, + "step": 2740 + }, + { + "epoch": 2.636625119846596, + "grad_norm": 0.4957471787929535, + "learning_rate": 9.756893515411574e-05, + "loss": 0.0434, + "step": 2750 + }, + { + "epoch": 2.6462128475551294, + "grad_norm": 0.5820662975311279, + "learning_rate": 9.754205744354423e-05, + "loss": 0.0484, + "step": 2760 + }, + { + "epoch": 2.6558005752636626, + "grad_norm": 0.3916762173175812, + "learning_rate": 9.751503571284368e-05, + "loss": 0.0488, + "step": 2770 + }, + { + "epoch": 2.665388302972196, + "grad_norm": 0.30791330337524414, + "learning_rate": 9.748787004387157e-05, + "loss": 0.0513, + "step": 2780 + }, + { + "epoch": 2.6749760306807286, + "grad_norm": 0.5171549320220947, + "learning_rate": 9.74605605189214e-05, + "loss": 0.0516, + "step": 2790 + }, + { + "epoch": 2.684563758389262, + "grad_norm": 0.47496703267097473, + "learning_rate": 9.743310722072251e-05, + "loss": 0.0493, + "step": 2800 + }, + { + "epoch": 2.6941514860977946, + "grad_norm": 0.5075270533561707, + "learning_rate": 9.74055102324397e-05, + "loss": 0.0489, + "step": 2810 + }, + { + "epoch": 2.703739213806328, + "grad_norm": 0.4490506052970886, + "learning_rate": 9.737776963767313e-05, + "loss": 0.0576, + "step": 2820 + }, + { + "epoch": 2.713326941514861, + "grad_norm": 0.3923519551753998, + "learning_rate": 9.734988552045792e-05, + "loss": 0.0513, + "step": 2830 + }, + { + "epoch": 2.7229146692233943, + "grad_norm": 0.2816771864891052, + "learning_rate": 9.7321857965264e-05, + "loss": 0.0578, + "step": 2840 + }, + { + "epoch": 2.732502396931927, + "grad_norm": 0.6326708793640137, + "learning_rate": 9.729368705699587e-05, + "loss": 0.0452, + "step": 2850 + }, + { + "epoch": 2.7420901246404603, + "grad_norm": 0.3657870292663574, + "learning_rate": 9.726537288099215e-05, + "loss": 0.0524, + "step": 2860 + }, + { + "epoch": 2.751677852348993, + "grad_norm": 0.3347817063331604, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0451, + "step": 2870 + }, + { + "epoch": 2.7612655800575263, + "grad_norm": 0.4541146457195282, + "learning_rate": 9.720831506930274e-05, + "loss": 0.0487, + "step": 2880 + }, + { + "epoch": 2.7708533077660595, + "grad_norm": 0.4089963734149933, + "learning_rate": 9.71795716064634e-05, + "loss": 0.0479, + "step": 2890 + }, + { + "epoch": 2.7804410354745928, + "grad_norm": 0.3474633991718292, + "learning_rate": 9.715068522158081e-05, + "loss": 0.0467, + "step": 2900 + }, + { + "epoch": 2.7900287631831255, + "grad_norm": 0.49998903274536133, + "learning_rate": 9.712165600216107e-05, + "loss": 0.0579, + "step": 2910 + }, + { + "epoch": 2.7996164908916588, + "grad_norm": 0.41667240858078003, + "learning_rate": 9.709248403614298e-05, + "loss": 0.0456, + "step": 2920 + }, + { + "epoch": 2.8092042186001915, + "grad_norm": 0.3876051604747772, + "learning_rate": 9.706316941189779e-05, + "loss": 0.0411, + "step": 2930 + }, + { + "epoch": 2.8187919463087248, + "grad_norm": 0.34348323941230774, + "learning_rate": 9.703371221822888e-05, + "loss": 0.0463, + "step": 2940 + }, + { + "epoch": 2.828379674017258, + "grad_norm": 0.5338907241821289, + "learning_rate": 9.700411254437154e-05, + "loss": 0.0476, + "step": 2950 + }, + { + "epoch": 2.837967401725791, + "grad_norm": 0.5973591804504395, + "learning_rate": 9.697437047999266e-05, + "loss": 0.0531, + "step": 2960 + }, + { + "epoch": 2.847555129434324, + "grad_norm": 0.31144216656684875, + "learning_rate": 9.694448611519049e-05, + "loss": 0.0494, + "step": 2970 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.4310339391231537, + "learning_rate": 9.691445954049434e-05, + "loss": 0.0448, + "step": 2980 + }, + { + "epoch": 2.86673058485139, + "grad_norm": 0.36877721548080444, + "learning_rate": 9.688429084686435e-05, + "loss": 0.043, + "step": 2990 + }, + { + "epoch": 2.876318312559923, + "grad_norm": 0.35387906432151794, + "learning_rate": 9.685398012569115e-05, + "loss": 0.055, + "step": 3000 + }, + { + "epoch": 2.8859060402684564, + "grad_norm": 0.3781449496746063, + "learning_rate": 9.682352746879562e-05, + "loss": 0.0513, + "step": 3010 + }, + { + "epoch": 2.8954937679769897, + "grad_norm": 0.3556309938430786, + "learning_rate": 9.679293296842863e-05, + "loss": 0.0556, + "step": 3020 + }, + { + "epoch": 2.9050814956855224, + "grad_norm": 0.4965471923351288, + "learning_rate": 9.676219671727072e-05, + "loss": 0.0502, + "step": 3030 + }, + { + "epoch": 2.9146692233940557, + "grad_norm": 0.40289080142974854, + "learning_rate": 9.673131880843185e-05, + "loss": 0.0474, + "step": 3040 + }, + { + "epoch": 2.9242569511025884, + "grad_norm": 0.3517281115055084, + "learning_rate": 9.67002993354511e-05, + "loss": 0.0557, + "step": 3050 + }, + { + "epoch": 2.9338446788111217, + "grad_norm": 0.5005010366439819, + "learning_rate": 9.66691383922964e-05, + "loss": 0.059, + "step": 3060 + }, + { + "epoch": 2.943432406519655, + "grad_norm": 0.36781349778175354, + "learning_rate": 9.66378360733642e-05, + "loss": 0.055, + "step": 3070 + }, + { + "epoch": 2.953020134228188, + "grad_norm": 0.310249388217926, + "learning_rate": 9.660639247347931e-05, + "loss": 0.0523, + "step": 3080 + }, + { + "epoch": 2.962607861936721, + "grad_norm": 0.27061378955841064, + "learning_rate": 9.657480768789446e-05, + "loss": 0.0505, + "step": 3090 + }, + { + "epoch": 2.972195589645254, + "grad_norm": 0.34516626596450806, + "learning_rate": 9.654308181229006e-05, + "loss": 0.0489, + "step": 3100 + }, + { + "epoch": 2.981783317353787, + "grad_norm": 0.3140753209590912, + "learning_rate": 9.651121494277396e-05, + "loss": 0.0531, + "step": 3110 + }, + { + "epoch": 2.99137104506232, + "grad_norm": 0.4165388345718384, + "learning_rate": 9.647920717588114e-05, + "loss": 0.0571, + "step": 3120 + }, + { + "epoch": 3.0009587727708533, + "grad_norm": 0.36014652252197266, + "learning_rate": 9.644705860857339e-05, + "loss": 0.0515, + "step": 3130 + }, + { + "epoch": 3.0105465004793865, + "grad_norm": 0.4353986382484436, + "learning_rate": 9.641476933823899e-05, + "loss": 0.0488, + "step": 3140 + }, + { + "epoch": 3.0201342281879193, + "grad_norm": 0.4083373546600342, + "learning_rate": 9.638233946269253e-05, + "loss": 0.052, + "step": 3150 + }, + { + "epoch": 3.0297219558964525, + "grad_norm": 0.3805656135082245, + "learning_rate": 9.634976908017446e-05, + "loss": 0.0461, + "step": 3160 + }, + { + "epoch": 3.0393096836049858, + "grad_norm": 0.36862942576408386, + "learning_rate": 9.631705828935092e-05, + "loss": 0.0526, + "step": 3170 + }, + { + "epoch": 3.0488974113135185, + "grad_norm": 0.4625187814235687, + "learning_rate": 9.628420718931338e-05, + "loss": 0.0536, + "step": 3180 + }, + { + "epoch": 3.0584851390220518, + "grad_norm": 0.2972494959831238, + "learning_rate": 9.625121587957834e-05, + "loss": 0.0468, + "step": 3190 + }, + { + "epoch": 3.068072866730585, + "grad_norm": 0.5064423084259033, + "learning_rate": 9.621808446008708e-05, + "loss": 0.0516, + "step": 3200 + }, + { + "epoch": 3.0776605944391178, + "grad_norm": 0.28751927614212036, + "learning_rate": 9.618481303120528e-05, + "loss": 0.0463, + "step": 3210 + }, + { + "epoch": 3.087248322147651, + "grad_norm": 0.4198159873485565, + "learning_rate": 9.615140169372274e-05, + "loss": 0.0395, + "step": 3220 + }, + { + "epoch": 3.096836049856184, + "grad_norm": 0.41463902592658997, + "learning_rate": 9.611785054885312e-05, + "loss": 0.0501, + "step": 3230 + }, + { + "epoch": 3.106423777564717, + "grad_norm": 0.37878739833831787, + "learning_rate": 9.608415969823361e-05, + "loss": 0.0484, + "step": 3240 + }, + { + "epoch": 3.11601150527325, + "grad_norm": 0.4990726113319397, + "learning_rate": 9.605032924392457e-05, + "loss": 0.049, + "step": 3250 + }, + { + "epoch": 3.1255992329817834, + "grad_norm": 0.39530688524246216, + "learning_rate": 9.601635928840927e-05, + "loss": 0.0658, + "step": 3260 + }, + { + "epoch": 3.135186960690316, + "grad_norm": 0.5206883549690247, + "learning_rate": 9.598224993459364e-05, + "loss": 0.0538, + "step": 3270 + }, + { + "epoch": 3.1447746883988494, + "grad_norm": 0.5972046256065369, + "learning_rate": 9.594800128580582e-05, + "loss": 0.054, + "step": 3280 + }, + { + "epoch": 3.1543624161073827, + "grad_norm": 0.33001407980918884, + "learning_rate": 9.591361344579595e-05, + "loss": 0.0544, + "step": 3290 + }, + { + "epoch": 3.1639501438159154, + "grad_norm": 0.38547295331954956, + "learning_rate": 9.58790865187358e-05, + "loss": 0.0422, + "step": 3300 + }, + { + "epoch": 3.1735378715244487, + "grad_norm": 0.3369503915309906, + "learning_rate": 9.584442060921851e-05, + "loss": 0.0472, + "step": 3310 + }, + { + "epoch": 3.183125599232982, + "grad_norm": 0.2815903127193451, + "learning_rate": 9.580961582225826e-05, + "loss": 0.0463, + "step": 3320 + }, + { + "epoch": 3.1927133269415147, + "grad_norm": 0.42745402455329895, + "learning_rate": 9.577467226328987e-05, + "loss": 0.0517, + "step": 3330 + }, + { + "epoch": 3.202301054650048, + "grad_norm": 0.46006882190704346, + "learning_rate": 9.573959003816856e-05, + "loss": 0.0494, + "step": 3340 + }, + { + "epoch": 3.211888782358581, + "grad_norm": 0.47103896737098694, + "learning_rate": 9.57043692531697e-05, + "loss": 0.0511, + "step": 3350 + }, + { + "epoch": 3.221476510067114, + "grad_norm": 0.41211676597595215, + "learning_rate": 9.566901001498826e-05, + "loss": 0.0512, + "step": 3360 + }, + { + "epoch": 3.231064237775647, + "grad_norm": 0.5582764148712158, + "learning_rate": 9.563351243073878e-05, + "loss": 0.0584, + "step": 3370 + }, + { + "epoch": 3.2406519654841803, + "grad_norm": 0.3129172921180725, + "learning_rate": 9.559787660795474e-05, + "loss": 0.0596, + "step": 3380 + }, + { + "epoch": 3.2502396931927136, + "grad_norm": 0.4259207844734192, + "learning_rate": 9.556210265458854e-05, + "loss": 0.0507, + "step": 3390 + }, + { + "epoch": 3.2598274209012463, + "grad_norm": 0.29509371519088745, + "learning_rate": 9.552619067901089e-05, + "loss": 0.0519, + "step": 3400 + }, + { + "epoch": 3.2694151486097796, + "grad_norm": 0.33097851276397705, + "learning_rate": 9.549014079001074e-05, + "loss": 0.0503, + "step": 3410 + }, + { + "epoch": 3.2790028763183123, + "grad_norm": 0.6283732056617737, + "learning_rate": 9.545395309679469e-05, + "loss": 0.052, + "step": 3420 + }, + { + "epoch": 3.2885906040268456, + "grad_norm": 0.29192429780960083, + "learning_rate": 9.54176277089869e-05, + "loss": 0.0452, + "step": 3430 + }, + { + "epoch": 3.2981783317353788, + "grad_norm": 0.3860151767730713, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0536, + "step": 3440 + }, + { + "epoch": 3.307766059443912, + "grad_norm": 0.5127553343772888, + "learning_rate": 9.534456429017784e-05, + "loss": 0.0521, + "step": 3450 + }, + { + "epoch": 3.3173537871524448, + "grad_norm": 0.4540964961051941, + "learning_rate": 9.530782648050907e-05, + "loss": 0.0552, + "step": 3460 + }, + { + "epoch": 3.326941514860978, + "grad_norm": 0.34647271037101746, + "learning_rate": 9.52709514189129e-05, + "loss": 0.0457, + "step": 3470 + }, + { + "epoch": 3.336529242569511, + "grad_norm": 0.4515313506126404, + "learning_rate": 9.523393921709574e-05, + "loss": 0.0467, + "step": 3480 + }, + { + "epoch": 3.346116970278044, + "grad_norm": 0.3084343373775482, + "learning_rate": 9.519678998717935e-05, + "loss": 0.0462, + "step": 3490 + }, + { + "epoch": 3.3557046979865772, + "grad_norm": 0.5871327519416809, + "learning_rate": 9.515950384170073e-05, + "loss": 0.0566, + "step": 3500 + }, + { + "epoch": 3.3652924256951104, + "grad_norm": 0.4407544732093811, + "learning_rate": 9.51220808936115e-05, + "loss": 0.0436, + "step": 3510 + }, + { + "epoch": 3.3748801534036432, + "grad_norm": 0.3434475362300873, + "learning_rate": 9.508452125627779e-05, + "loss": 0.0483, + "step": 3520 + }, + { + "epoch": 3.3844678811121764, + "grad_norm": 0.5896394848823547, + "learning_rate": 9.504682504347978e-05, + "loss": 0.0435, + "step": 3530 + }, + { + "epoch": 3.3940556088207097, + "grad_norm": 0.380214661359787, + "learning_rate": 9.500899236941139e-05, + "loss": 0.053, + "step": 3540 + }, + { + "epoch": 3.4036433365292424, + "grad_norm": 0.2878900170326233, + "learning_rate": 9.497102334867989e-05, + "loss": 0.0488, + "step": 3550 + }, + { + "epoch": 3.4132310642377757, + "grad_norm": 0.6185137629508972, + "learning_rate": 9.493291809630562e-05, + "loss": 0.0512, + "step": 3560 + }, + { + "epoch": 3.422818791946309, + "grad_norm": 0.5001134872436523, + "learning_rate": 9.489467672772162e-05, + "loss": 0.055, + "step": 3570 + }, + { + "epoch": 3.4324065196548417, + "grad_norm": 0.46808385848999023, + "learning_rate": 9.485629935877323e-05, + "loss": 0.0524, + "step": 3580 + }, + { + "epoch": 3.441994247363375, + "grad_norm": 0.4512917399406433, + "learning_rate": 9.481778610571782e-05, + "loss": 0.0487, + "step": 3590 + }, + { + "epoch": 3.451581975071908, + "grad_norm": 0.39726588129997253, + "learning_rate": 9.477913708522435e-05, + "loss": 0.0578, + "step": 3600 + }, + { + "epoch": 3.461169702780441, + "grad_norm": 0.32351112365722656, + "learning_rate": 9.474035241437312e-05, + "loss": 0.0488, + "step": 3610 + }, + { + "epoch": 3.470757430488974, + "grad_norm": 0.47034138441085815, + "learning_rate": 9.470143221065531e-05, + "loss": 0.0618, + "step": 3620 + }, + { + "epoch": 3.4803451581975073, + "grad_norm": 0.23497724533081055, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0499, + "step": 3630 + }, + { + "epoch": 3.48993288590604, + "grad_norm": 0.25630268454551697, + "learning_rate": 9.462318567663728e-05, + "loss": 0.0508, + "step": 3640 + }, + { + "epoch": 3.4995206136145733, + "grad_norm": 0.3957800269126892, + "learning_rate": 9.458385958337087e-05, + "loss": 0.0554, + "step": 3650 + }, + { + "epoch": 3.5091083413231066, + "grad_norm": 0.25262129306793213, + "learning_rate": 9.454439843130483e-05, + "loss": 0.0473, + "step": 3660 + }, + { + "epoch": 3.5186960690316393, + "grad_norm": 0.3933389186859131, + "learning_rate": 9.450480233997963e-05, + "loss": 0.0471, + "step": 3670 + }, + { + "epoch": 3.5282837967401726, + "grad_norm": 0.26438847184181213, + "learning_rate": 9.446507142934452e-05, + "loss": 0.0557, + "step": 3680 + }, + { + "epoch": 3.537871524448706, + "grad_norm": 0.2720869183540344, + "learning_rate": 9.442520581975718e-05, + "loss": 0.0492, + "step": 3690 + }, + { + "epoch": 3.547459252157239, + "grad_norm": 0.3165934383869171, + "learning_rate": 9.438520563198328e-05, + "loss": 0.0512, + "step": 3700 + }, + { + "epoch": 3.557046979865772, + "grad_norm": 0.6523368954658508, + "learning_rate": 9.434507098719624e-05, + "loss": 0.0574, + "step": 3710 + }, + { + "epoch": 3.566634707574305, + "grad_norm": 0.41401076316833496, + "learning_rate": 9.430480200697676e-05, + "loss": 0.0509, + "step": 3720 + }, + { + "epoch": 3.576222435282838, + "grad_norm": 0.29742154479026794, + "learning_rate": 9.426439881331248e-05, + "loss": 0.0489, + "step": 3730 + }, + { + "epoch": 3.585810162991371, + "grad_norm": 0.40217605233192444, + "learning_rate": 9.422386152859763e-05, + "loss": 0.0466, + "step": 3740 + }, + { + "epoch": 3.5953978906999042, + "grad_norm": 0.3434045612812042, + "learning_rate": 9.418319027563263e-05, + "loss": 0.0575, + "step": 3750 + }, + { + "epoch": 3.6049856184084375, + "grad_norm": 0.6345980763435364, + "learning_rate": 9.414238517762373e-05, + "loss": 0.0453, + "step": 3760 + }, + { + "epoch": 3.6145733461169702, + "grad_norm": 0.43346667289733887, + "learning_rate": 9.410144635818266e-05, + "loss": 0.055, + "step": 3770 + }, + { + "epoch": 3.6241610738255035, + "grad_norm": 0.36115562915802, + "learning_rate": 9.406037394132623e-05, + "loss": 0.0535, + "step": 3780 + }, + { + "epoch": 3.6337488015340362, + "grad_norm": 0.2766103744506836, + "learning_rate": 9.401916805147596e-05, + "loss": 0.0463, + "step": 3790 + }, + { + "epoch": 3.6433365292425695, + "grad_norm": 0.39829254150390625, + "learning_rate": 9.397782881345767e-05, + "loss": 0.0463, + "step": 3800 + }, + { + "epoch": 3.6529242569511027, + "grad_norm": 0.3240996301174164, + "learning_rate": 9.39363563525012e-05, + "loss": 0.0516, + "step": 3810 + }, + { + "epoch": 3.662511984659636, + "grad_norm": 0.416238009929657, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0483, + "step": 3820 + }, + { + "epoch": 3.6720997123681687, + "grad_norm": 0.24697421491146088, + "learning_rate": 9.385301226471032e-05, + "loss": 0.0451, + "step": 3830 + }, + { + "epoch": 3.681687440076702, + "grad_norm": 0.3078657388687134, + "learning_rate": 9.381114089035188e-05, + "loss": 0.0454, + "step": 3840 + }, + { + "epoch": 3.6912751677852347, + "grad_norm": 0.26055672764778137, + "learning_rate": 9.376913679800638e-05, + "loss": 0.0426, + "step": 3850 + }, + { + "epoch": 3.700862895493768, + "grad_norm": 0.36363962292671204, + "learning_rate": 9.372700011491768e-05, + "loss": 0.0535, + "step": 3860 + }, + { + "epoch": 3.710450623202301, + "grad_norm": 0.23066310584545135, + "learning_rate": 9.36847309687313e-05, + "loss": 0.0391, + "step": 3870 + }, + { + "epoch": 3.7200383509108343, + "grad_norm": 0.35935813188552856, + "learning_rate": 9.364232948749402e-05, + "loss": 0.0404, + "step": 3880 + }, + { + "epoch": 3.729626078619367, + "grad_norm": 0.42284151911735535, + "learning_rate": 9.359979579965352e-05, + "loss": 0.0456, + "step": 3890 + }, + { + "epoch": 3.7392138063279003, + "grad_norm": 0.29598748683929443, + "learning_rate": 9.355713003405797e-05, + "loss": 0.0486, + "step": 3900 + }, + { + "epoch": 3.748801534036433, + "grad_norm": 0.30880895256996155, + "learning_rate": 9.351433231995568e-05, + "loss": 0.0524, + "step": 3910 + }, + { + "epoch": 3.7583892617449663, + "grad_norm": 0.2683268189430237, + "learning_rate": 9.34714027869946e-05, + "loss": 0.0458, + "step": 3920 + }, + { + "epoch": 3.7679769894534996, + "grad_norm": 0.3789876401424408, + "learning_rate": 9.342834156522204e-05, + "loss": 0.0529, + "step": 3930 + }, + { + "epoch": 3.777564717162033, + "grad_norm": 0.2747150957584381, + "learning_rate": 9.338514878508428e-05, + "loss": 0.0474, + "step": 3940 + }, + { + "epoch": 3.7871524448705656, + "grad_norm": 0.3292723000049591, + "learning_rate": 9.334182457742607e-05, + "loss": 0.0544, + "step": 3950 + }, + { + "epoch": 3.796740172579099, + "grad_norm": 0.28527846932411194, + "learning_rate": 9.329836907349033e-05, + "loss": 0.0419, + "step": 3960 + }, + { + "epoch": 3.8063279002876316, + "grad_norm": 0.37766164541244507, + "learning_rate": 9.325478240491771e-05, + "loss": 0.0503, + "step": 3970 + }, + { + "epoch": 3.815915627996165, + "grad_norm": 0.4285350739955902, + "learning_rate": 9.321106470374618e-05, + "loss": 0.0493, + "step": 3980 + }, + { + "epoch": 3.825503355704698, + "grad_norm": 0.432804137468338, + "learning_rate": 9.316721610241068e-05, + "loss": 0.0452, + "step": 3990 + }, + { + "epoch": 3.8350910834132312, + "grad_norm": 0.32709524035453796, + "learning_rate": 9.312323673374269e-05, + "loss": 0.049, + "step": 4000 + }, + { + "epoch": 3.844678811121764, + "grad_norm": 0.2850819230079651, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0379, + "step": 4010 + }, + { + "epoch": 3.8542665388302972, + "grad_norm": 0.3472555875778198, + "learning_rate": 9.303488622771535e-05, + "loss": 0.0412, + "step": 4020 + }, + { + "epoch": 3.8638542665388305, + "grad_norm": 0.545179545879364, + "learning_rate": 9.299051535799799e-05, + "loss": 0.0535, + "step": 4030 + }, + { + "epoch": 3.8734419942473632, + "grad_norm": 0.43045416474342346, + "learning_rate": 9.29460142562313e-05, + "loss": 0.0564, + "step": 4040 + }, + { + "epoch": 3.8830297219558965, + "grad_norm": 0.30958643555641174, + "learning_rate": 9.290138305722343e-05, + "loss": 0.0423, + "step": 4050 + }, + { + "epoch": 3.8926174496644297, + "grad_norm": 0.3504599630832672, + "learning_rate": 9.285662189617652e-05, + "loss": 0.0525, + "step": 4060 + }, + { + "epoch": 3.9022051773729625, + "grad_norm": 0.5074465870857239, + "learning_rate": 9.281173090868651e-05, + "loss": 0.0505, + "step": 4070 + }, + { + "epoch": 3.9117929050814957, + "grad_norm": 0.30970317125320435, + "learning_rate": 9.27667102307426e-05, + "loss": 0.0404, + "step": 4080 + }, + { + "epoch": 3.921380632790029, + "grad_norm": 0.35298407077789307, + "learning_rate": 9.27215599987268e-05, + "loss": 0.0461, + "step": 4090 + }, + { + "epoch": 3.9309683604985617, + "grad_norm": 0.32086381316185, + "learning_rate": 9.267628034941369e-05, + "loss": 0.0476, + "step": 4100 + }, + { + "epoch": 3.940556088207095, + "grad_norm": 0.33907032012939453, + "learning_rate": 9.26308714199698e-05, + "loss": 0.0446, + "step": 4110 + }, + { + "epoch": 3.950143815915628, + "grad_norm": 0.23291510343551636, + "learning_rate": 9.258533334795336e-05, + "loss": 0.0542, + "step": 4120 + }, + { + "epoch": 3.959731543624161, + "grad_norm": 0.3786979913711548, + "learning_rate": 9.253966627131379e-05, + "loss": 0.049, + "step": 4130 + }, + { + "epoch": 3.969319271332694, + "grad_norm": 0.4073876142501831, + "learning_rate": 9.249387032839125e-05, + "loss": 0.046, + "step": 4140 + }, + { + "epoch": 3.9789069990412274, + "grad_norm": 0.3822251856327057, + "learning_rate": 9.244794565791639e-05, + "loss": 0.0472, + "step": 4150 + }, + { + "epoch": 3.98849472674976, + "grad_norm": 0.43598631024360657, + "learning_rate": 9.240189239900972e-05, + "loss": 0.0388, + "step": 4160 + }, + { + "epoch": 3.9980824544582934, + "grad_norm": 0.2129432111978531, + "learning_rate": 9.235571069118131e-05, + "loss": 0.0492, + "step": 4170 + }, + { + "epoch": 4.007670182166827, + "grad_norm": 0.3745039999485016, + "learning_rate": 9.23094006743304e-05, + "loss": 0.0447, + "step": 4180 + }, + { + "epoch": 4.01725790987536, + "grad_norm": 0.3619850277900696, + "learning_rate": 9.226296248874482e-05, + "loss": 0.0523, + "step": 4190 + }, + { + "epoch": 4.026845637583893, + "grad_norm": 0.3835139274597168, + "learning_rate": 9.221639627510076e-05, + "loss": 0.048, + "step": 4200 + }, + { + "epoch": 4.036433365292425, + "grad_norm": 0.28674259781837463, + "learning_rate": 9.216970217446219e-05, + "loss": 0.0387, + "step": 4210 + }, + { + "epoch": 4.046021093000959, + "grad_norm": 0.25763458013534546, + "learning_rate": 9.21228803282805e-05, + "loss": 0.0506, + "step": 4220 + }, + { + "epoch": 4.055608820709492, + "grad_norm": 0.36224737763404846, + "learning_rate": 9.207593087839406e-05, + "loss": 0.0453, + "step": 4230 + }, + { + "epoch": 4.065196548418025, + "grad_norm": 0.38200250267982483, + "learning_rate": 9.202885396702782e-05, + "loss": 0.0431, + "step": 4240 + }, + { + "epoch": 4.074784276126558, + "grad_norm": 0.336946964263916, + "learning_rate": 9.198164973679285e-05, + "loss": 0.0443, + "step": 4250 + }, + { + "epoch": 4.0843720038350915, + "grad_norm": 0.3541509807109833, + "learning_rate": 9.193431833068586e-05, + "loss": 0.0499, + "step": 4260 + }, + { + "epoch": 4.093959731543624, + "grad_norm": 0.3337682783603668, + "learning_rate": 9.188685989208886e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 4.103547459252157, + "grad_norm": 0.4774644076824188, + "learning_rate": 9.183927456476864e-05, + "loss": 0.0413, + "step": 4280 + }, + { + "epoch": 4.11313518696069, + "grad_norm": 0.3974810540676117, + "learning_rate": 9.179156249287646e-05, + "loss": 0.0495, + "step": 4290 + }, + { + "epoch": 4.1227229146692235, + "grad_norm": 0.35930874943733215, + "learning_rate": 9.174372382094745e-05, + "loss": 0.0481, + "step": 4300 + }, + { + "epoch": 4.132310642377757, + "grad_norm": 0.39746561646461487, + "learning_rate": 9.169575869390028e-05, + "loss": 0.0401, + "step": 4310 + }, + { + "epoch": 4.14189837008629, + "grad_norm": 0.3344055414199829, + "learning_rate": 9.164766725703669e-05, + "loss": 0.0471, + "step": 4320 + }, + { + "epoch": 4.151486097794822, + "grad_norm": 0.23866185545921326, + "learning_rate": 9.159944965604105e-05, + "loss": 0.0424, + "step": 4330 + }, + { + "epoch": 4.1610738255033555, + "grad_norm": 0.3230268657207489, + "learning_rate": 9.155110603697996e-05, + "loss": 0.0475, + "step": 4340 + }, + { + "epoch": 4.170661553211889, + "grad_norm": 0.3797110915184021, + "learning_rate": 9.150263654630172e-05, + "loss": 0.0458, + "step": 4350 + }, + { + "epoch": 4.180249280920422, + "grad_norm": 0.41824665665626526, + "learning_rate": 9.145404133083591e-05, + "loss": 0.0401, + "step": 4360 + }, + { + "epoch": 4.189837008628955, + "grad_norm": 0.45811742544174194, + "learning_rate": 9.140532053779307e-05, + "loss": 0.0533, + "step": 4370 + }, + { + "epoch": 4.199424736337488, + "grad_norm": 0.3115192651748657, + "learning_rate": 9.135647431476407e-05, + "loss": 0.0475, + "step": 4380 + }, + { + "epoch": 4.209012464046021, + "grad_norm": 0.27874428033828735, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0444, + "step": 4390 + }, + { + "epoch": 4.218600191754554, + "grad_norm": 0.5270777940750122, + "learning_rate": 9.125840617101058e-05, + "loss": 0.0514, + "step": 4400 + }, + { + "epoch": 4.228187919463087, + "grad_norm": 0.40683162212371826, + "learning_rate": 9.120918454736593e-05, + "loss": 0.0472, + "step": 4410 + }, + { + "epoch": 4.23777564717162, + "grad_norm": 0.30064043402671814, + "learning_rate": 9.11598380878939e-05, + "loss": 0.0492, + "step": 4420 + }, + { + "epoch": 4.247363374880154, + "grad_norm": 0.4496791362762451, + "learning_rate": 9.111036694208072e-05, + "loss": 0.0471, + "step": 4430 + }, + { + "epoch": 4.256951102588687, + "grad_norm": 0.39262011647224426, + "learning_rate": 9.106077125979037e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 4.26653883029722, + "grad_norm": 0.34774985909461975, + "learning_rate": 9.101105119126405e-05, + "loss": 0.0452, + "step": 4450 + }, + { + "epoch": 4.276126558005752, + "grad_norm": 0.4597591459751129, + "learning_rate": 9.096120688711978e-05, + "loss": 0.0521, + "step": 4460 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.594453752040863, + "learning_rate": 9.091123849835195e-05, + "loss": 0.0555, + "step": 4470 + }, + { + "epoch": 4.295302013422819, + "grad_norm": 0.45329248905181885, + "learning_rate": 9.086114617633079e-05, + "loss": 0.0408, + "step": 4480 + }, + { + "epoch": 4.304889741131352, + "grad_norm": 0.34534817934036255, + "learning_rate": 9.081093007280205e-05, + "loss": 0.0554, + "step": 4490 + }, + { + "epoch": 4.314477468839885, + "grad_norm": 0.36244168877601624, + "learning_rate": 9.076059033988636e-05, + "loss": 0.0487, + "step": 4500 + }, + { + "epoch": 4.324065196548418, + "grad_norm": 0.32668572664260864, + "learning_rate": 9.071012713007892e-05, + "loss": 0.0483, + "step": 4510 + }, + { + "epoch": 4.333652924256951, + "grad_norm": 0.31663575768470764, + "learning_rate": 9.065954059624895e-05, + "loss": 0.0484, + "step": 4520 + }, + { + "epoch": 4.343240651965484, + "grad_norm": 0.2809025049209595, + "learning_rate": 9.06088308916393e-05, + "loss": 0.042, + "step": 4530 + }, + { + "epoch": 4.352828379674017, + "grad_norm": 0.2432290017604828, + "learning_rate": 9.05579981698659e-05, + "loss": 0.0463, + "step": 4540 + }, + { + "epoch": 4.3624161073825505, + "grad_norm": 0.2573339343070984, + "learning_rate": 9.050704258491736e-05, + "loss": 0.0462, + "step": 4550 + }, + { + "epoch": 4.372003835091084, + "grad_norm": 0.42221635580062866, + "learning_rate": 9.045596429115447e-05, + "loss": 0.0472, + "step": 4560 + }, + { + "epoch": 4.381591562799617, + "grad_norm": 0.35964876413345337, + "learning_rate": 9.040476344330977e-05, + "loss": 0.0448, + "step": 4570 + }, + { + "epoch": 4.391179290508149, + "grad_norm": 0.27407506108283997, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0431, + "step": 4580 + }, + { + "epoch": 4.4007670182166825, + "grad_norm": 0.31676268577575684, + "learning_rate": 9.03019947061608e-05, + "loss": 0.0441, + "step": 4590 + }, + { + "epoch": 4.410354745925216, + "grad_norm": 0.2982436716556549, + "learning_rate": 9.025042712817598e-05, + "loss": 0.043, + "step": 4600 + }, + { + "epoch": 4.419942473633749, + "grad_norm": 0.3181396424770355, + "learning_rate": 9.019873761874727e-05, + "loss": 0.0484, + "step": 4610 + }, + { + "epoch": 4.429530201342282, + "grad_norm": 0.3732481002807617, + "learning_rate": 9.014692633445878e-05, + "loss": 0.055, + "step": 4620 + }, + { + "epoch": 4.439117929050815, + "grad_norm": 0.42074957489967346, + "learning_rate": 9.009499343226348e-05, + "loss": 0.047, + "step": 4630 + }, + { + "epoch": 4.448705656759348, + "grad_norm": 0.35802584886550903, + "learning_rate": 9.004293906948278e-05, + "loss": 0.0489, + "step": 4640 + }, + { + "epoch": 4.458293384467881, + "grad_norm": 0.33133867383003235, + "learning_rate": 8.999076340380603e-05, + "loss": 0.049, + "step": 4650 + }, + { + "epoch": 4.467881112176414, + "grad_norm": 0.28263920545578003, + "learning_rate": 8.993846659329005e-05, + "loss": 0.056, + "step": 4660 + }, + { + "epoch": 4.477468839884947, + "grad_norm": 0.5171105861663818, + "learning_rate": 8.988604879635862e-05, + "loss": 0.047, + "step": 4670 + }, + { + "epoch": 4.487056567593481, + "grad_norm": 0.264189749956131, + "learning_rate": 8.983351017180208e-05, + "loss": 0.0432, + "step": 4680 + }, + { + "epoch": 4.496644295302014, + "grad_norm": 0.2710209786891937, + "learning_rate": 8.978085087877672e-05, + "loss": 0.048, + "step": 4690 + }, + { + "epoch": 4.506232023010546, + "grad_norm": 0.20794712007045746, + "learning_rate": 8.972807107680445e-05, + "loss": 0.0524, + "step": 4700 + }, + { + "epoch": 4.515819750719079, + "grad_norm": 0.2759157419204712, + "learning_rate": 8.96751709257722e-05, + "loss": 0.0463, + "step": 4710 + }, + { + "epoch": 4.525407478427613, + "grad_norm": 0.45379728078842163, + "learning_rate": 8.962215058593146e-05, + "loss": 0.0483, + "step": 4720 + }, + { + "epoch": 4.534995206136146, + "grad_norm": 0.35511714220046997, + "learning_rate": 8.956901021789785e-05, + "loss": 0.0473, + "step": 4730 + }, + { + "epoch": 4.544582933844679, + "grad_norm": 0.49189603328704834, + "learning_rate": 8.951574998265058e-05, + "loss": 0.0448, + "step": 4740 + }, + { + "epoch": 4.554170661553212, + "grad_norm": 0.7247273921966553, + "learning_rate": 8.946237004153197e-05, + "loss": 0.0514, + "step": 4750 + }, + { + "epoch": 4.563758389261745, + "grad_norm": 0.5640259385108948, + "learning_rate": 8.940887055624696e-05, + "loss": 0.0495, + "step": 4760 + }, + { + "epoch": 4.573346116970278, + "grad_norm": 0.9589868187904358, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0497, + "step": 4770 + }, + { + "epoch": 4.582933844678811, + "grad_norm": 0.24826788902282715, + "learning_rate": 8.930151360180773e-05, + "loss": 0.0526, + "step": 4780 + }, + { + "epoch": 4.592521572387344, + "grad_norm": 0.4066452980041504, + "learning_rate": 8.924765645787216e-05, + "loss": 0.0482, + "step": 4790 + }, + { + "epoch": 4.6021093000958775, + "grad_norm": 0.41626861691474915, + "learning_rate": 8.919368042020645e-05, + "loss": 0.0469, + "step": 4800 + }, + { + "epoch": 4.611697027804411, + "grad_norm": 0.35766589641571045, + "learning_rate": 8.913958565232132e-05, + "loss": 0.0489, + "step": 4810 + }, + { + "epoch": 4.621284755512944, + "grad_norm": 0.24869422614574432, + "learning_rate": 8.908537231808716e-05, + "loss": 0.043, + "step": 4820 + }, + { + "epoch": 4.630872483221476, + "grad_norm": 0.3498132526874542, + "learning_rate": 8.903104058173354e-05, + "loss": 0.044, + "step": 4830 + }, + { + "epoch": 4.6404602109300095, + "grad_norm": 0.5257985591888428, + "learning_rate": 8.897659060784869e-05, + "loss": 0.0487, + "step": 4840 + }, + { + "epoch": 4.650047938638543, + "grad_norm": 0.3492990732192993, + "learning_rate": 8.892202256137905e-05, + "loss": 0.0516, + "step": 4850 + }, + { + "epoch": 4.659635666347076, + "grad_norm": 0.5162085294723511, + "learning_rate": 8.886733660762871e-05, + "loss": 0.0526, + "step": 4860 + }, + { + "epoch": 4.669223394055609, + "grad_norm": 0.3405402600765228, + "learning_rate": 8.881253291225895e-05, + "loss": 0.0449, + "step": 4870 + }, + { + "epoch": 4.6788111217641415, + "grad_norm": 0.4526231586933136, + "learning_rate": 8.875761164128772e-05, + "loss": 0.053, + "step": 4880 + }, + { + "epoch": 4.688398849472675, + "grad_norm": 0.3826616108417511, + "learning_rate": 8.870257296108918e-05, + "loss": 0.0467, + "step": 4890 + }, + { + "epoch": 4.697986577181208, + "grad_norm": 0.3477012813091278, + "learning_rate": 8.86474170383931e-05, + "loss": 0.0486, + "step": 4900 + }, + { + "epoch": 4.707574304889741, + "grad_norm": 0.2914051115512848, + "learning_rate": 8.859214404028447e-05, + "loss": 0.042, + "step": 4910 + }, + { + "epoch": 4.717162032598274, + "grad_norm": 0.40637078881263733, + "learning_rate": 8.85367541342029e-05, + "loss": 0.0432, + "step": 4920 + }, + { + "epoch": 4.726749760306808, + "grad_norm": 0.36229225993156433, + "learning_rate": 8.848124748794218e-05, + "loss": 0.0498, + "step": 4930 + }, + { + "epoch": 4.736337488015341, + "grad_norm": 0.33015790581703186, + "learning_rate": 8.842562426964974e-05, + "loss": 0.0441, + "step": 4940 + }, + { + "epoch": 4.745925215723873, + "grad_norm": 0.35154151916503906, + "learning_rate": 8.83698846478261e-05, + "loss": 0.0463, + "step": 4950 + }, + { + "epoch": 4.755512943432406, + "grad_norm": 0.2888050377368927, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0455, + "step": 4960 + }, + { + "epoch": 4.76510067114094, + "grad_norm": 0.3235926628112793, + "learning_rate": 8.825805686935011e-05, + "loss": 0.0551, + "step": 4970 + }, + { + "epoch": 4.774688398849473, + "grad_norm": 0.44466277956962585, + "learning_rate": 8.820196905145997e-05, + "loss": 0.0476, + "step": 4980 + }, + { + "epoch": 4.784276126558006, + "grad_norm": 0.39051833748817444, + "learning_rate": 8.814576550756197e-05, + "loss": 0.04, + "step": 4990 + }, + { + "epoch": 4.793863854266538, + "grad_norm": 0.3532402813434601, + "learning_rate": 8.808944640791467e-05, + "loss": 0.0489, + "step": 5000 + }, + { + "epoch": 4.803451581975072, + "grad_norm": 0.34791117906570435, + "learning_rate": 8.803301192312667e-05, + "loss": 0.0466, + "step": 5010 + }, + { + "epoch": 4.813039309683605, + "grad_norm": 0.31138908863067627, + "learning_rate": 8.797646222415614e-05, + "loss": 0.0407, + "step": 5020 + }, + { + "epoch": 4.822627037392138, + "grad_norm": 0.2896534502506256, + "learning_rate": 8.79197974823102e-05, + "loss": 0.0479, + "step": 5030 + }, + { + "epoch": 4.832214765100671, + "grad_norm": 0.26334378123283386, + "learning_rate": 8.786301786924456e-05, + "loss": 0.0469, + "step": 5040 + }, + { + "epoch": 4.8418024928092045, + "grad_norm": 0.2446843832731247, + "learning_rate": 8.780612355696283e-05, + "loss": 0.0461, + "step": 5050 + }, + { + "epoch": 4.851390220517738, + "grad_norm": 0.2954402267932892, + "learning_rate": 8.774911471781613e-05, + "loss": 0.0472, + "step": 5060 + }, + { + "epoch": 4.86097794822627, + "grad_norm": 0.22677741944789886, + "learning_rate": 8.769199152450249e-05, + "loss": 0.04, + "step": 5070 + }, + { + "epoch": 4.870565675934803, + "grad_norm": 0.32872337102890015, + "learning_rate": 8.76347541500664e-05, + "loss": 0.0479, + "step": 5080 + }, + { + "epoch": 4.8801534036433365, + "grad_norm": 0.4457066059112549, + "learning_rate": 8.757740276789818e-05, + "loss": 0.0439, + "step": 5090 + }, + { + "epoch": 4.88974113135187, + "grad_norm": 0.24604512751102448, + "learning_rate": 8.751993755173358e-05, + "loss": 0.0468, + "step": 5100 + }, + { + "epoch": 4.899328859060403, + "grad_norm": 0.3143763840198517, + "learning_rate": 8.746235867565313e-05, + "loss": 0.0458, + "step": 5110 + }, + { + "epoch": 4.908916586768935, + "grad_norm": 0.3161276876926422, + "learning_rate": 8.74046663140817e-05, + "loss": 0.0502, + "step": 5120 + }, + { + "epoch": 4.9185043144774685, + "grad_norm": 0.2833130657672882, + "learning_rate": 8.734686064178797e-05, + "loss": 0.0419, + "step": 5130 + }, + { + "epoch": 4.928092042186002, + "grad_norm": 0.4420258104801178, + "learning_rate": 8.728894183388381e-05, + "loss": 0.0465, + "step": 5140 + }, + { + "epoch": 4.937679769894535, + "grad_norm": 0.353081077337265, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0451, + "step": 5150 + }, + { + "epoch": 4.947267497603068, + "grad_norm": 0.4228033125400543, + "learning_rate": 8.717276551340501e-05, + "loss": 0.0495, + "step": 5160 + }, + { + "epoch": 4.956855225311601, + "grad_norm": 0.3678063452243805, + "learning_rate": 8.711450835276565e-05, + "loss": 0.0395, + "step": 5170 + }, + { + "epoch": 4.966442953020135, + "grad_norm": 0.4963276982307434, + "learning_rate": 8.705613876038543e-05, + "loss": 0.042, + "step": 5180 + }, + { + "epoch": 4.976030680728667, + "grad_norm": 0.3559805452823639, + "learning_rate": 8.699765691308456e-05, + "loss": 0.0448, + "step": 5190 + }, + { + "epoch": 4.9856184084372, + "grad_norm": 0.253312885761261, + "learning_rate": 8.69390629880233e-05, + "loss": 0.0539, + "step": 5200 + }, + { + "epoch": 4.995206136145733, + "grad_norm": 0.29010236263275146, + "learning_rate": 8.688035716270141e-05, + "loss": 0.0447, + "step": 5210 + }, + { + "epoch": 5.004793863854267, + "grad_norm": 0.35962191224098206, + "learning_rate": 8.682153961495767e-05, + "loss": 0.0484, + "step": 5220 + }, + { + "epoch": 5.0143815915628, + "grad_norm": 0.2923009395599365, + "learning_rate": 8.676261052296928e-05, + "loss": 0.0488, + "step": 5230 + }, + { + "epoch": 5.023969319271333, + "grad_norm": 0.33261337876319885, + "learning_rate": 8.670357006525131e-05, + "loss": 0.053, + "step": 5240 + }, + { + "epoch": 5.033557046979865, + "grad_norm": 0.3641784191131592, + "learning_rate": 8.66444184206563e-05, + "loss": 0.0429, + "step": 5250 + }, + { + "epoch": 5.043144774688399, + "grad_norm": 0.4545520544052124, + "learning_rate": 8.658515576837347e-05, + "loss": 0.0487, + "step": 5260 + }, + { + "epoch": 5.052732502396932, + "grad_norm": 0.3597351312637329, + "learning_rate": 8.652578228792841e-05, + "loss": 0.0571, + "step": 5270 + }, + { + "epoch": 5.062320230105465, + "grad_norm": 0.26271480321884155, + "learning_rate": 8.646629815918244e-05, + "loss": 0.046, + "step": 5280 + }, + { + "epoch": 5.071907957813998, + "grad_norm": 0.2976760268211365, + "learning_rate": 8.640670356233202e-05, + "loss": 0.049, + "step": 5290 + }, + { + "epoch": 5.0814956855225315, + "grad_norm": 0.3539637327194214, + "learning_rate": 8.634699867790832e-05, + "loss": 0.046, + "step": 5300 + }, + { + "epoch": 5.091083413231064, + "grad_norm": 0.314113587141037, + "learning_rate": 8.628718368677655e-05, + "loss": 0.0474, + "step": 5310 + }, + { + "epoch": 5.100671140939597, + "grad_norm": 0.3386295735836029, + "learning_rate": 8.622725877013549e-05, + "loss": 0.0438, + "step": 5320 + }, + { + "epoch": 5.11025886864813, + "grad_norm": 0.4622576832771301, + "learning_rate": 8.616722410951689e-05, + "loss": 0.0447, + "step": 5330 + }, + { + "epoch": 5.1198465963566635, + "grad_norm": 0.23671875894069672, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0457, + "step": 5340 + }, + { + "epoch": 5.129434324065197, + "grad_norm": 0.38376542925834656, + "learning_rate": 8.604682628413601e-05, + "loss": 0.0521, + "step": 5350 + }, + { + "epoch": 5.13902205177373, + "grad_norm": 0.2503417432308197, + "learning_rate": 8.598646348409729e-05, + "loss": 0.0466, + "step": 5360 + }, + { + "epoch": 5.148609779482262, + "grad_norm": 0.33504578471183777, + "learning_rate": 8.592599166952718e-05, + "loss": 0.0499, + "step": 5370 + }, + { + "epoch": 5.1581975071907955, + "grad_norm": 0.2641712725162506, + "learning_rate": 8.586541102361414e-05, + "loss": 0.0471, + "step": 5380 + }, + { + "epoch": 5.167785234899329, + "grad_norm": 0.363615483045578, + "learning_rate": 8.580472172987638e-05, + "loss": 0.0451, + "step": 5390 + }, + { + "epoch": 5.177372962607862, + "grad_norm": 0.29901939630508423, + "learning_rate": 8.574392397216123e-05, + "loss": 0.0472, + "step": 5400 + }, + { + "epoch": 5.186960690316395, + "grad_norm": 0.299882173538208, + "learning_rate": 8.568301793464457e-05, + "loss": 0.0492, + "step": 5410 + }, + { + "epoch": 5.196548418024928, + "grad_norm": 0.25945836305618286, + "learning_rate": 8.562200380183033e-05, + "loss": 0.0354, + "step": 5420 + }, + { + "epoch": 5.206136145733462, + "grad_norm": 0.39987847208976746, + "learning_rate": 8.556088175854984e-05, + "loss": 0.0367, + "step": 5430 + }, + { + "epoch": 5.215723873441994, + "grad_norm": 0.31205254793167114, + "learning_rate": 8.54996519899614e-05, + "loss": 0.0412, + "step": 5440 + }, + { + "epoch": 5.225311601150527, + "grad_norm": 0.3277497887611389, + "learning_rate": 8.543831468154955e-05, + "loss": 0.0502, + "step": 5450 + }, + { + "epoch": 5.23489932885906, + "grad_norm": 0.3311022222042084, + "learning_rate": 8.537687001912471e-05, + "loss": 0.0477, + "step": 5460 + }, + { + "epoch": 5.244487056567594, + "grad_norm": 0.42579907178878784, + "learning_rate": 8.531531818882241e-05, + "loss": 0.0509, + "step": 5470 + }, + { + "epoch": 5.254074784276127, + "grad_norm": 0.30724838376045227, + "learning_rate": 8.52536593771029e-05, + "loss": 0.0418, + "step": 5480 + }, + { + "epoch": 5.263662511984659, + "grad_norm": 0.3175548017024994, + "learning_rate": 8.519189377075049e-05, + "loss": 0.0507, + "step": 5490 + }, + { + "epoch": 5.273250239693192, + "grad_norm": 0.3461003601551056, + "learning_rate": 8.513002155687297e-05, + "loss": 0.0495, + "step": 5500 + }, + { + "epoch": 5.282837967401726, + "grad_norm": 0.27968931198120117, + "learning_rate": 8.50680429229011e-05, + "loss": 0.0424, + "step": 5510 + }, + { + "epoch": 5.292425695110259, + "grad_norm": 0.2532777190208435, + "learning_rate": 8.500595805658806e-05, + "loss": 0.0429, + "step": 5520 + }, + { + "epoch": 5.302013422818792, + "grad_norm": 0.2897396981716156, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0479, + "step": 5530 + }, + { + "epoch": 5.311601150527325, + "grad_norm": 0.32838040590286255, + "learning_rate": 8.48814703795595e-05, + "loss": 0.0462, + "step": 5540 + }, + { + "epoch": 5.3211888782358585, + "grad_norm": 0.23218947649002075, + "learning_rate": 8.481906794595702e-05, + "loss": 0.038, + "step": 5550 + }, + { + "epoch": 5.330776605944391, + "grad_norm": 0.4271414577960968, + "learning_rate": 8.475656003423837e-05, + "loss": 0.0424, + "step": 5560 + }, + { + "epoch": 5.340364333652924, + "grad_norm": 0.3327130079269409, + "learning_rate": 8.469394683376003e-05, + "loss": 0.0461, + "step": 5570 + }, + { + "epoch": 5.349952061361457, + "grad_norm": 0.34635308384895325, + "learning_rate": 8.463122853419748e-05, + "loss": 0.0462, + "step": 5580 + }, + { + "epoch": 5.3595397890699905, + "grad_norm": 0.35077422857284546, + "learning_rate": 8.456840532554448e-05, + "loss": 0.0477, + "step": 5590 + }, + { + "epoch": 5.369127516778524, + "grad_norm": 0.44980722665786743, + "learning_rate": 8.450547739811275e-05, + "loss": 0.0423, + "step": 5600 + }, + { + "epoch": 5.378715244487057, + "grad_norm": 0.28166648745536804, + "learning_rate": 8.444244494253106e-05, + "loss": 0.0431, + "step": 5610 + }, + { + "epoch": 5.388302972195589, + "grad_norm": 0.33736804127693176, + "learning_rate": 8.437930814974499e-05, + "loss": 0.0479, + "step": 5620 + }, + { + "epoch": 5.3978906999041225, + "grad_norm": 0.25710147619247437, + "learning_rate": 8.43160672110161e-05, + "loss": 0.042, + "step": 5630 + }, + { + "epoch": 5.407478427612656, + "grad_norm": 0.29803675413131714, + "learning_rate": 8.425272231792148e-05, + "loss": 0.0488, + "step": 5640 + }, + { + "epoch": 5.417066155321189, + "grad_norm": 0.35298973321914673, + "learning_rate": 8.418927366235305e-05, + "loss": 0.042, + "step": 5650 + }, + { + "epoch": 5.426653883029722, + "grad_norm": 0.32311904430389404, + "learning_rate": 8.41257214365172e-05, + "loss": 0.0452, + "step": 5660 + }, + { + "epoch": 5.436241610738255, + "grad_norm": 0.38360047340393066, + "learning_rate": 8.406206583293394e-05, + "loss": 0.0572, + "step": 5670 + }, + { + "epoch": 5.445829338446788, + "grad_norm": 0.4456116855144501, + "learning_rate": 8.399830704443653e-05, + "loss": 0.0464, + "step": 5680 + }, + { + "epoch": 5.455417066155321, + "grad_norm": 0.3833318054676056, + "learning_rate": 8.393444526417071e-05, + "loss": 0.0461, + "step": 5690 + }, + { + "epoch": 5.465004793863854, + "grad_norm": 0.27611926198005676, + "learning_rate": 8.387048068559435e-05, + "loss": 0.0437, + "step": 5700 + }, + { + "epoch": 5.474592521572387, + "grad_norm": 0.3786008954048157, + "learning_rate": 8.380641350247665e-05, + "loss": 0.0477, + "step": 5710 + }, + { + "epoch": 5.484180249280921, + "grad_norm": 0.471384197473526, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0449, + "step": 5720 + }, + { + "epoch": 5.493767976989454, + "grad_norm": 0.2924197018146515, + "learning_rate": 8.36779720992475e-05, + "loss": 0.0476, + "step": 5730 + }, + { + "epoch": 5.503355704697986, + "grad_norm": 0.24068906903266907, + "learning_rate": 8.361359826822625e-05, + "loss": 0.0477, + "step": 5740 + }, + { + "epoch": 5.512943432406519, + "grad_norm": 0.24523060023784637, + "learning_rate": 8.354912261084281e-05, + "loss": 0.0489, + "step": 5750 + }, + { + "epoch": 5.522531160115053, + "grad_norm": 0.3498481810092926, + "learning_rate": 8.348454532241461e-05, + "loss": 0.0387, + "step": 5760 + }, + { + "epoch": 5.532118887823586, + "grad_norm": 0.3108651340007782, + "learning_rate": 8.341986659856698e-05, + "loss": 0.0377, + "step": 5770 + }, + { + "epoch": 5.541706615532119, + "grad_norm": 0.3618451654911041, + "learning_rate": 8.335508663523248e-05, + "loss": 0.048, + "step": 5780 + }, + { + "epoch": 5.551294343240652, + "grad_norm": 0.769836962223053, + "learning_rate": 8.329020562865038e-05, + "loss": 0.0422, + "step": 5790 + }, + { + "epoch": 5.5608820709491855, + "grad_norm": 0.24395880103111267, + "learning_rate": 8.322522377536604e-05, + "loss": 0.0395, + "step": 5800 + }, + { + "epoch": 5.570469798657718, + "grad_norm": 0.5865891575813293, + "learning_rate": 8.316014127223033e-05, + "loss": 0.0565, + "step": 5810 + }, + { + "epoch": 5.580057526366251, + "grad_norm": 0.318808376789093, + "learning_rate": 8.3094958316399e-05, + "loss": 0.0453, + "step": 5820 + }, + { + "epoch": 5.589645254074784, + "grad_norm": 0.44590169191360474, + "learning_rate": 8.302967510533213e-05, + "loss": 0.0524, + "step": 5830 + }, + { + "epoch": 5.5992329817833175, + "grad_norm": 0.3664915859699249, + "learning_rate": 8.296429183679349e-05, + "loss": 0.0434, + "step": 5840 + }, + { + "epoch": 5.608820709491851, + "grad_norm": 0.34023183584213257, + "learning_rate": 8.289880870884995e-05, + "loss": 0.0595, + "step": 5850 + }, + { + "epoch": 5.618408437200383, + "grad_norm": 0.33271753787994385, + "learning_rate": 8.283322591987086e-05, + "loss": 0.0476, + "step": 5860 + }, + { + "epoch": 5.627996164908916, + "grad_norm": 0.30905163288116455, + "learning_rate": 8.276754366852754e-05, + "loss": 0.0486, + "step": 5870 + }, + { + "epoch": 5.6375838926174495, + "grad_norm": 0.3950500786304474, + "learning_rate": 8.27017621537926e-05, + "loss": 0.0524, + "step": 5880 + }, + { + "epoch": 5.647171620325983, + "grad_norm": 0.3802347481250763, + "learning_rate": 8.26358815749393e-05, + "loss": 0.0453, + "step": 5890 + }, + { + "epoch": 5.656759348034516, + "grad_norm": 0.27361515164375305, + "learning_rate": 8.256990213154102e-05, + "loss": 0.0426, + "step": 5900 + }, + { + "epoch": 5.666347075743049, + "grad_norm": 0.28120309114456177, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0406, + "step": 5910 + }, + { + "epoch": 5.675934803451582, + "grad_norm": 0.44831210374832153, + "learning_rate": 8.243764745089999e-05, + "loss": 0.0433, + "step": 5920 + }, + { + "epoch": 5.685522531160115, + "grad_norm": 0.2854187488555908, + "learning_rate": 8.237137261429904e-05, + "loss": 0.0438, + "step": 5930 + }, + { + "epoch": 5.695110258868648, + "grad_norm": 0.3696000874042511, + "learning_rate": 8.230499971443555e-05, + "loss": 0.0399, + "step": 5940 + }, + { + "epoch": 5.704697986577181, + "grad_norm": 0.794933021068573, + "learning_rate": 8.223852895237427e-05, + "loss": 0.0452, + "step": 5950 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.3321564793586731, + "learning_rate": 8.21719605294765e-05, + "loss": 0.0484, + "step": 5960 + }, + { + "epoch": 5.723873441994248, + "grad_norm": 0.29202380776405334, + "learning_rate": 8.210529464739928e-05, + "loss": 0.0432, + "step": 5970 + }, + { + "epoch": 5.73346116970278, + "grad_norm": 0.32877346873283386, + "learning_rate": 8.203853150809494e-05, + "loss": 0.046, + "step": 5980 + }, + { + "epoch": 5.743048897411313, + "grad_norm": 0.45695215463638306, + "learning_rate": 8.197167131381045e-05, + "loss": 0.0464, + "step": 5990 + }, + { + "epoch": 5.752636625119846, + "grad_norm": 0.20887207984924316, + "learning_rate": 8.190471426708675e-05, + "loss": 0.0428, + "step": 6000 + }, + { + "epoch": 5.76222435282838, + "grad_norm": 0.31597304344177246, + "learning_rate": 8.183766057075819e-05, + "loss": 0.0409, + "step": 6010 + }, + { + "epoch": 5.771812080536913, + "grad_norm": 0.3338216245174408, + "learning_rate": 8.177051042795192e-05, + "loss": 0.0461, + "step": 6020 + }, + { + "epoch": 5.781399808245446, + "grad_norm": 0.32134512066841125, + "learning_rate": 8.170326404208724e-05, + "loss": 0.0411, + "step": 6030 + }, + { + "epoch": 5.790987535953979, + "grad_norm": 0.2781100571155548, + "learning_rate": 8.163592161687499e-05, + "loss": 0.0425, + "step": 6040 + }, + { + "epoch": 5.800575263662512, + "grad_norm": 0.34772852063179016, + "learning_rate": 8.156848335631697e-05, + "loss": 0.0368, + "step": 6050 + }, + { + "epoch": 5.810162991371045, + "grad_norm": 0.3309897184371948, + "learning_rate": 8.15009494647053e-05, + "loss": 0.04, + "step": 6060 + }, + { + "epoch": 5.819750719079578, + "grad_norm": 0.252763032913208, + "learning_rate": 8.143332014662176e-05, + "loss": 0.0398, + "step": 6070 + }, + { + "epoch": 5.829338446788111, + "grad_norm": 0.3265877664089203, + "learning_rate": 8.136559560693722e-05, + "loss": 0.045, + "step": 6080 + }, + { + "epoch": 5.8389261744966445, + "grad_norm": 0.4045432209968567, + "learning_rate": 8.129777605081105e-05, + "loss": 0.0428, + "step": 6090 + }, + { + "epoch": 5.848513902205178, + "grad_norm": 0.2679883539676666, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0433, + "step": 6100 + }, + { + "epoch": 5.85810162991371, + "grad_norm": 0.4409831166267395, + "learning_rate": 8.116185271130965e-05, + "loss": 0.0457, + "step": 6110 + }, + { + "epoch": 5.867689357622243, + "grad_norm": 0.4434974491596222, + "learning_rate": 8.10937493396898e-05, + "loss": 0.0476, + "step": 6120 + }, + { + "epoch": 5.8772770853307765, + "grad_norm": 0.363570898771286, + "learning_rate": 8.102555177513776e-05, + "loss": 0.0405, + "step": 6130 + }, + { + "epoch": 5.88686481303931, + "grad_norm": 0.31658318638801575, + "learning_rate": 8.095726022424583e-05, + "loss": 0.0434, + "step": 6140 + }, + { + "epoch": 5.896452540747843, + "grad_norm": 0.3343175947666168, + "learning_rate": 8.088887489389099e-05, + "loss": 0.0421, + "step": 6150 + }, + { + "epoch": 5.906040268456376, + "grad_norm": 0.2580268681049347, + "learning_rate": 8.082039599123434e-05, + "loss": 0.0415, + "step": 6160 + }, + { + "epoch": 5.9156279961649085, + "grad_norm": 0.36179137229919434, + "learning_rate": 8.07518237237204e-05, + "loss": 0.0425, + "step": 6170 + }, + { + "epoch": 5.925215723873442, + "grad_norm": 0.3440069556236267, + "learning_rate": 8.068315829907658e-05, + "loss": 0.0404, + "step": 6180 + }, + { + "epoch": 5.934803451581975, + "grad_norm": 0.39785268902778625, + "learning_rate": 8.061439992531241e-05, + "loss": 0.0425, + "step": 6190 + }, + { + "epoch": 5.944391179290508, + "grad_norm": 0.29912492632865906, + "learning_rate": 8.054554881071909e-05, + "loss": 0.0465, + "step": 6200 + }, + { + "epoch": 5.953978906999041, + "grad_norm": 0.3317604660987854, + "learning_rate": 8.047660516386868e-05, + "loss": 0.0432, + "step": 6210 + }, + { + "epoch": 5.963566634707575, + "grad_norm": 0.3451102077960968, + "learning_rate": 8.040756919361358e-05, + "loss": 0.0452, + "step": 6220 + }, + { + "epoch": 5.973154362416107, + "grad_norm": 0.3293020725250244, + "learning_rate": 8.03384411090859e-05, + "loss": 0.0367, + "step": 6230 + }, + { + "epoch": 5.98274209012464, + "grad_norm": 0.30293816328048706, + "learning_rate": 8.026922111969674e-05, + "loss": 0.0442, + "step": 6240 + }, + { + "epoch": 5.992329817833173, + "grad_norm": 0.2671773433685303, + "learning_rate": 8.019990943513565e-05, + "loss": 0.0482, + "step": 6250 + }, + { + "epoch": 6.001917545541707, + "grad_norm": 0.30587103962898254, + "learning_rate": 8.013050626536992e-05, + "loss": 0.054, + "step": 6260 + }, + { + "epoch": 6.01150527325024, + "grad_norm": 0.3319852948188782, + "learning_rate": 8.0061011820644e-05, + "loss": 0.0454, + "step": 6270 + }, + { + "epoch": 6.021093000958773, + "grad_norm": 0.5606246590614319, + "learning_rate": 7.999142631147884e-05, + "loss": 0.0491, + "step": 6280 + }, + { + "epoch": 6.030680728667305, + "grad_norm": 0.3884483873844147, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0488, + "step": 6290 + }, + { + "epoch": 6.040268456375839, + "grad_norm": 0.30733785033226013, + "learning_rate": 7.985198294329324e-05, + "loss": 0.0434, + "step": 6300 + }, + { + "epoch": 6.049856184084372, + "grad_norm": 0.9947719573974609, + "learning_rate": 7.978212550669144e-05, + "loss": 0.0452, + "step": 6310 + }, + { + "epoch": 6.059443911792905, + "grad_norm": 0.3336857259273529, + "learning_rate": 7.971217785048644e-05, + "loss": 0.0445, + "step": 6320 + }, + { + "epoch": 6.069031639501438, + "grad_norm": 0.3001098930835724, + "learning_rate": 7.964214018657208e-05, + "loss": 0.042, + "step": 6330 + }, + { + "epoch": 6.0786193672099715, + "grad_norm": 0.32423412799835205, + "learning_rate": 7.957201272711492e-05, + "loss": 0.041, + "step": 6340 + }, + { + "epoch": 6.088207094918504, + "grad_norm": 0.2871480882167816, + "learning_rate": 7.950179568455347e-05, + "loss": 0.0436, + "step": 6350 + }, + { + "epoch": 6.097794822627037, + "grad_norm": 0.4804290533065796, + "learning_rate": 7.94314892715977e-05, + "loss": 0.0393, + "step": 6360 + }, + { + "epoch": 6.10738255033557, + "grad_norm": 0.459533154964447, + "learning_rate": 7.936109370122824e-05, + "loss": 0.0468, + "step": 6370 + }, + { + "epoch": 6.1169702780441035, + "grad_norm": 0.25455859303474426, + "learning_rate": 7.929060918669585e-05, + "loss": 0.0409, + "step": 6380 + }, + { + "epoch": 6.126558005752637, + "grad_norm": 0.34990832209587097, + "learning_rate": 7.922003594152068e-05, + "loss": 0.0389, + "step": 6390 + }, + { + "epoch": 6.13614573346117, + "grad_norm": 0.2321031242609024, + "learning_rate": 7.914937417949175e-05, + "loss": 0.0428, + "step": 6400 + }, + { + "epoch": 6.145733461169703, + "grad_norm": 0.3366633951663971, + "learning_rate": 7.907862411466616e-05, + "loss": 0.0417, + "step": 6410 + }, + { + "epoch": 6.1553211888782355, + "grad_norm": 0.3831850588321686, + "learning_rate": 7.900778596136855e-05, + "loss": 0.0409, + "step": 6420 + }, + { + "epoch": 6.164908916586769, + "grad_norm": 0.3772655129432678, + "learning_rate": 7.893685993419036e-05, + "loss": 0.0412, + "step": 6430 + }, + { + "epoch": 6.174496644295302, + "grad_norm": 0.4264662563800812, + "learning_rate": 7.88658462479893e-05, + "loss": 0.0437, + "step": 6440 + }, + { + "epoch": 6.184084372003835, + "grad_norm": 0.3162544369697571, + "learning_rate": 7.879474511788854e-05, + "loss": 0.0388, + "step": 6450 + }, + { + "epoch": 6.193672099712368, + "grad_norm": 0.34539514780044556, + "learning_rate": 7.872355675927623e-05, + "loss": 0.0416, + "step": 6460 + }, + { + "epoch": 6.203259827420902, + "grad_norm": 0.3206475079059601, + "learning_rate": 7.865228138780469e-05, + "loss": 0.0468, + "step": 6470 + }, + { + "epoch": 6.212847555129434, + "grad_norm": 0.3619016110897064, + "learning_rate": 7.858091921938988e-05, + "loss": 0.0448, + "step": 6480 + }, + { + "epoch": 6.222435282837967, + "grad_norm": 0.3190850615501404, + "learning_rate": 7.850947047021069e-05, + "loss": 0.0388, + "step": 6490 + }, + { + "epoch": 6.2320230105465, + "grad_norm": 0.3191368579864502, + "learning_rate": 7.843793535670827e-05, + "loss": 0.0449, + "step": 6500 + }, + { + "epoch": 6.241610738255034, + "grad_norm": 0.24938683211803436, + "learning_rate": 7.836631409558538e-05, + "loss": 0.0379, + "step": 6510 + }, + { + "epoch": 6.251198465963567, + "grad_norm": 0.27279171347618103, + "learning_rate": 7.829460690380584e-05, + "loss": 0.0398, + "step": 6520 + }, + { + "epoch": 6.2607861936721, + "grad_norm": 0.4261578917503357, + "learning_rate": 7.822281399859365e-05, + "loss": 0.0441, + "step": 6530 + }, + { + "epoch": 6.270373921380632, + "grad_norm": 0.3505672216415405, + "learning_rate": 7.815093559743256e-05, + "loss": 0.0464, + "step": 6540 + }, + { + "epoch": 6.279961649089166, + "grad_norm": 0.8695809841156006, + "learning_rate": 7.807897191806527e-05, + "loss": 0.0459, + "step": 6550 + }, + { + "epoch": 6.289549376797699, + "grad_norm": 0.3453594446182251, + "learning_rate": 7.800692317849285e-05, + "loss": 0.0437, + "step": 6560 + }, + { + "epoch": 6.299137104506232, + "grad_norm": 0.4360389709472656, + "learning_rate": 7.7934789596974e-05, + "loss": 0.0495, + "step": 6570 + }, + { + "epoch": 6.308724832214765, + "grad_norm": 0.4259977340698242, + "learning_rate": 7.786257139202447e-05, + "loss": 0.0486, + "step": 6580 + }, + { + "epoch": 6.3183125599232985, + "grad_norm": 0.4518745541572571, + "learning_rate": 7.779026878241635e-05, + "loss": 0.0455, + "step": 6590 + }, + { + "epoch": 6.327900287631831, + "grad_norm": 0.38590195775032043, + "learning_rate": 7.771788198717741e-05, + "loss": 0.043, + "step": 6600 + }, + { + "epoch": 6.337488015340364, + "grad_norm": 0.2825833559036255, + "learning_rate": 7.764541122559046e-05, + "loss": 0.0439, + "step": 6610 + }, + { + "epoch": 6.347075743048897, + "grad_norm": 0.364486962556839, + "learning_rate": 7.757285671719264e-05, + "loss": 0.0429, + "step": 6620 + }, + { + "epoch": 6.3566634707574305, + "grad_norm": 0.32037052512168884, + "learning_rate": 7.750021868177485e-05, + "loss": 0.0433, + "step": 6630 + }, + { + "epoch": 6.366251198465964, + "grad_norm": 0.2986597716808319, + "learning_rate": 7.742749733938094e-05, + "loss": 0.0407, + "step": 6640 + }, + { + "epoch": 6.375838926174497, + "grad_norm": 0.20917120575904846, + "learning_rate": 7.73546929103072e-05, + "loss": 0.0361, + "step": 6650 + }, + { + "epoch": 6.385426653883029, + "grad_norm": 0.3319404125213623, + "learning_rate": 7.728180561510155e-05, + "loss": 0.04, + "step": 6660 + }, + { + "epoch": 6.3950143815915625, + "grad_norm": 0.4171611964702606, + "learning_rate": 7.720883567456298e-05, + "loss": 0.0348, + "step": 6670 + }, + { + "epoch": 6.404602109300096, + "grad_norm": 0.44948673248291016, + "learning_rate": 7.713578330974081e-05, + "loss": 0.0489, + "step": 6680 + }, + { + "epoch": 6.414189837008629, + "grad_norm": 0.3433539569377899, + "learning_rate": 7.706264874193409e-05, + "loss": 0.038, + "step": 6690 + }, + { + "epoch": 6.423777564717162, + "grad_norm": 0.44886866211891174, + "learning_rate": 7.698943219269086e-05, + "loss": 0.0437, + "step": 6700 + }, + { + "epoch": 6.433365292425695, + "grad_norm": 0.30656543374061584, + "learning_rate": 7.691613388380752e-05, + "loss": 0.0409, + "step": 6710 + }, + { + "epoch": 6.442953020134228, + "grad_norm": 0.3929513692855835, + "learning_rate": 7.684275403732811e-05, + "loss": 0.0441, + "step": 6720 + }, + { + "epoch": 6.452540747842761, + "grad_norm": 0.44606807827949524, + "learning_rate": 7.676929287554372e-05, + "loss": 0.0457, + "step": 6730 + }, + { + "epoch": 6.462128475551294, + "grad_norm": 0.3216160535812378, + "learning_rate": 7.669575062099175e-05, + "loss": 0.0469, + "step": 6740 + }, + { + "epoch": 6.471716203259827, + "grad_norm": 0.24256640672683716, + "learning_rate": 7.662212749645527e-05, + "loss": 0.0384, + "step": 6750 + }, + { + "epoch": 6.481303930968361, + "grad_norm": 0.37510934472084045, + "learning_rate": 7.654842372496232e-05, + "loss": 0.0389, + "step": 6760 + }, + { + "epoch": 6.490891658676894, + "grad_norm": 0.3382836878299713, + "learning_rate": 7.647463952978524e-05, + "loss": 0.0448, + "step": 6770 + }, + { + "epoch": 6.500479386385427, + "grad_norm": 0.4976375102996826, + "learning_rate": 7.640077513443999e-05, + "loss": 0.0413, + "step": 6780 + }, + { + "epoch": 6.510067114093959, + "grad_norm": 0.273062527179718, + "learning_rate": 7.632683076268552e-05, + "loss": 0.0432, + "step": 6790 + }, + { + "epoch": 6.519654841802493, + "grad_norm": 0.34846237301826477, + "learning_rate": 7.625280663852301e-05, + "loss": 0.0501, + "step": 6800 + }, + { + "epoch": 6.529242569511026, + "grad_norm": 0.26076826453208923, + "learning_rate": 7.617870298619527e-05, + "loss": 0.0428, + "step": 6810 + }, + { + "epoch": 6.538830297219559, + "grad_norm": 0.8371449708938599, + "learning_rate": 7.610452003018602e-05, + "loss": 0.0437, + "step": 6820 + }, + { + "epoch": 6.548418024928092, + "grad_norm": 0.28489676117897034, + "learning_rate": 7.603025799521918e-05, + "loss": 0.0446, + "step": 6830 + }, + { + "epoch": 6.558005752636625, + "grad_norm": 0.3971545994281769, + "learning_rate": 7.595591710625829e-05, + "loss": 0.045, + "step": 6840 + }, + { + "epoch": 6.567593480345158, + "grad_norm": 0.24828213453292847, + "learning_rate": 7.588149758850572e-05, + "loss": 0.0431, + "step": 6850 + }, + { + "epoch": 6.577181208053691, + "grad_norm": 0.23631419241428375, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0384, + "step": 6860 + }, + { + "epoch": 6.586768935762224, + "grad_norm": 0.3739171326160431, + "learning_rate": 7.57324235686253e-05, + "loss": 0.0513, + "step": 6870 + }, + { + "epoch": 6.5963566634707576, + "grad_norm": 0.29776638746261597, + "learning_rate": 7.565776951809043e-05, + "loss": 0.0437, + "step": 6880 + }, + { + "epoch": 6.605944391179291, + "grad_norm": 0.24786557257175446, + "learning_rate": 7.558303774194848e-05, + "loss": 0.045, + "step": 6890 + }, + { + "epoch": 6.615532118887824, + "grad_norm": 0.2621402442455292, + "learning_rate": 7.550822846658592e-05, + "loss": 0.036, + "step": 6900 + }, + { + "epoch": 6.625119846596356, + "grad_norm": 0.4778667092323303, + "learning_rate": 7.543334191862408e-05, + "loss": 0.0403, + "step": 6910 + }, + { + "epoch": 6.6347075743048896, + "grad_norm": 0.37852802872657776, + "learning_rate": 7.535837832491826e-05, + "loss": 0.0433, + "step": 6920 + }, + { + "epoch": 6.644295302013423, + "grad_norm": 0.5725548267364502, + "learning_rate": 7.528333791255723e-05, + "loss": 0.0434, + "step": 6930 + }, + { + "epoch": 6.653883029721956, + "grad_norm": 0.39372578263282776, + "learning_rate": 7.520822090886245e-05, + "loss": 0.0403, + "step": 6940 + }, + { + "epoch": 6.663470757430489, + "grad_norm": 0.2831190526485443, + "learning_rate": 7.513302754138741e-05, + "loss": 0.0424, + "step": 6950 + }, + { + "epoch": 6.673058485139022, + "grad_norm": 0.27865827083587646, + "learning_rate": 7.50577580379169e-05, + "loss": 0.0397, + "step": 6960 + }, + { + "epoch": 6.682646212847555, + "grad_norm": 0.42975571751594543, + "learning_rate": 7.49824126264664e-05, + "loss": 0.0426, + "step": 6970 + }, + { + "epoch": 6.692233940556088, + "grad_norm": 0.3423265218734741, + "learning_rate": 7.490699153528124e-05, + "loss": 0.045, + "step": 6980 + }, + { + "epoch": 6.701821668264621, + "grad_norm": 0.25411704182624817, + "learning_rate": 7.483149499283616e-05, + "loss": 0.0396, + "step": 6990 + }, + { + "epoch": 6.7114093959731544, + "grad_norm": 0.35409414768218994, + "learning_rate": 7.475592322783434e-05, + "loss": 0.0382, + "step": 7000 + }, + { + "epoch": 6.720997123681688, + "grad_norm": 0.28262168169021606, + "learning_rate": 7.468027646920687e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 6.730584851390221, + "grad_norm": 0.4541366398334503, + "learning_rate": 7.460455494611206e-05, + "loss": 0.0389, + "step": 7020 + }, + { + "epoch": 6.740172579098753, + "grad_norm": 0.27586543560028076, + "learning_rate": 7.452875888793465e-05, + "loss": 0.0352, + "step": 7030 + }, + { + "epoch": 6.7497603068072864, + "grad_norm": 0.2681753933429718, + "learning_rate": 7.445288852428518e-05, + "loss": 0.0492, + "step": 7040 + }, + { + "epoch": 6.75934803451582, + "grad_norm": 0.32088425755500793, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0524, + "step": 7050 + }, + { + "epoch": 6.768935762224353, + "grad_norm": 0.3608848452568054, + "learning_rate": 7.430092580013712e-05, + "loss": 0.0444, + "step": 7060 + }, + { + "epoch": 6.778523489932886, + "grad_norm": 0.2983666658401489, + "learning_rate": 7.42248338999823e-05, + "loss": 0.0484, + "step": 7070 + }, + { + "epoch": 6.788111217641419, + "grad_norm": 0.48037657141685486, + "learning_rate": 7.414866861504164e-05, + "loss": 0.0441, + "step": 7080 + }, + { + "epoch": 6.797698945349952, + "grad_norm": 0.3220434784889221, + "learning_rate": 7.407243017604418e-05, + "loss": 0.0407, + "step": 7090 + }, + { + "epoch": 6.807286673058485, + "grad_norm": 0.21454603970050812, + "learning_rate": 7.399611881394061e-05, + "loss": 0.0484, + "step": 7100 + }, + { + "epoch": 6.816874400767018, + "grad_norm": 0.3658502995967865, + "learning_rate": 7.391973475990247e-05, + "loss": 0.0471, + "step": 7110 + }, + { + "epoch": 6.826462128475551, + "grad_norm": 0.6076493859291077, + "learning_rate": 7.384327824532158e-05, + "loss": 0.0512, + "step": 7120 + }, + { + "epoch": 6.836049856184085, + "grad_norm": 0.27629798650741577, + "learning_rate": 7.376674950180918e-05, + "loss": 0.0432, + "step": 7130 + }, + { + "epoch": 6.845637583892618, + "grad_norm": 0.4255768954753876, + "learning_rate": 7.36901487611954e-05, + "loss": 0.042, + "step": 7140 + }, + { + "epoch": 6.855225311601151, + "grad_norm": 0.34027740359306335, + "learning_rate": 7.361347625552842e-05, + "loss": 0.0417, + "step": 7150 + }, + { + "epoch": 6.864813039309683, + "grad_norm": 0.29743191599845886, + "learning_rate": 7.353673221707382e-05, + "loss": 0.0506, + "step": 7160 + }, + { + "epoch": 6.874400767018217, + "grad_norm": 0.2994328439235687, + "learning_rate": 7.345991687831393e-05, + "loss": 0.042, + "step": 7170 + }, + { + "epoch": 6.88398849472675, + "grad_norm": 0.2891611158847809, + "learning_rate": 7.338303047194697e-05, + "loss": 0.0396, + "step": 7180 + }, + { + "epoch": 6.893576222435283, + "grad_norm": 0.2870160937309265, + "learning_rate": 7.330607323088657e-05, + "loss": 0.0477, + "step": 7190 + }, + { + "epoch": 6.903163950143816, + "grad_norm": 0.4798467457294464, + "learning_rate": 7.322904538826083e-05, + "loss": 0.0409, + "step": 7200 + }, + { + "epoch": 6.912751677852349, + "grad_norm": 0.30976602435112, + "learning_rate": 7.31519471774118e-05, + "loss": 0.0431, + "step": 7210 + }, + { + "epoch": 6.922339405560882, + "grad_norm": 0.32751721143722534, + "learning_rate": 7.307477883189463e-05, + "loss": 0.0415, + "step": 7220 + }, + { + "epoch": 6.931927133269415, + "grad_norm": 0.3902662992477417, + "learning_rate": 7.299754058547704e-05, + "loss": 0.0359, + "step": 7230 + }, + { + "epoch": 6.941514860977948, + "grad_norm": 0.21194472908973694, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0409, + "step": 7240 + }, + { + "epoch": 6.9511025886864815, + "grad_norm": 0.28738507628440857, + "learning_rate": 7.284285532606906e-05, + "loss": 0.0433, + "step": 7250 + }, + { + "epoch": 6.960690316395015, + "grad_norm": 0.27712157368659973, + "learning_rate": 7.276540878166996e-05, + "loss": 0.0445, + "step": 7260 + }, + { + "epoch": 6.970278044103548, + "grad_norm": 0.36444854736328125, + "learning_rate": 7.268789327355143e-05, + "loss": 0.0424, + "step": 7270 + }, + { + "epoch": 6.97986577181208, + "grad_norm": 0.26638609170913696, + "learning_rate": 7.261030903653278e-05, + "loss": 0.0415, + "step": 7280 + }, + { + "epoch": 6.9894534995206135, + "grad_norm": 0.29326483607292175, + "learning_rate": 7.253265630564155e-05, + "loss": 0.0404, + "step": 7290 + }, + { + "epoch": 6.999041227229147, + "grad_norm": 0.563951849937439, + "learning_rate": 7.245493531611274e-05, + "loss": 0.0462, + "step": 7300 + }, + { + "epoch": 7.00862895493768, + "grad_norm": 0.2669621407985687, + "learning_rate": 7.237714630338812e-05, + "loss": 0.0489, + "step": 7310 + }, + { + "epoch": 7.018216682646213, + "grad_norm": 0.29936525225639343, + "learning_rate": 7.229928950311558e-05, + "loss": 0.042, + "step": 7320 + }, + { + "epoch": 7.027804410354746, + "grad_norm": 0.29611873626708984, + "learning_rate": 7.222136515114828e-05, + "loss": 0.0451, + "step": 7330 + }, + { + "epoch": 7.037392138063279, + "grad_norm": 0.2841253876686096, + "learning_rate": 7.214337348354408e-05, + "loss": 0.0401, + "step": 7340 + }, + { + "epoch": 7.046979865771812, + "grad_norm": 0.39095616340637207, + "learning_rate": 7.206531473656473e-05, + "loss": 0.0443, + "step": 7350 + }, + { + "epoch": 7.056567593480345, + "grad_norm": 0.3568895757198334, + "learning_rate": 7.19871891466752e-05, + "loss": 0.04, + "step": 7360 + }, + { + "epoch": 7.066155321188878, + "grad_norm": 0.4422648549079895, + "learning_rate": 7.190899695054293e-05, + "loss": 0.0357, + "step": 7370 + }, + { + "epoch": 7.075743048897412, + "grad_norm": 0.3040291965007782, + "learning_rate": 7.183073838503715e-05, + "loss": 0.0375, + "step": 7380 + }, + { + "epoch": 7.085330776605945, + "grad_norm": 0.3379688560962677, + "learning_rate": 7.175241368722812e-05, + "loss": 0.0441, + "step": 7390 + }, + { + "epoch": 7.094918504314477, + "grad_norm": 0.23404334485530853, + "learning_rate": 7.167402309438649e-05, + "loss": 0.0438, + "step": 7400 + }, + { + "epoch": 7.10450623202301, + "grad_norm": 0.19392350316047668, + "learning_rate": 7.159556684398246e-05, + "loss": 0.0429, + "step": 7410 + }, + { + "epoch": 7.114093959731544, + "grad_norm": 0.3650771975517273, + "learning_rate": 7.151704517368513e-05, + "loss": 0.0417, + "step": 7420 + }, + { + "epoch": 7.123681687440077, + "grad_norm": 0.3727266788482666, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0381, + "step": 7430 + }, + { + "epoch": 7.13326941514861, + "grad_norm": 0.2589777410030365, + "learning_rate": 7.13598065250774e-05, + "loss": 0.046, + "step": 7440 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.3064965009689331, + "learning_rate": 7.128109002309324e-05, + "loss": 0.0419, + "step": 7450 + }, + { + "epoch": 7.152444870565676, + "grad_norm": 0.3681334853172302, + "learning_rate": 7.120230905386688e-05, + "loss": 0.0456, + "step": 7460 + }, + { + "epoch": 7.162032598274209, + "grad_norm": 0.23908288776874542, + "learning_rate": 7.112346385605115e-05, + "loss": 0.0395, + "step": 7470 + }, + { + "epoch": 7.171620325982742, + "grad_norm": 0.26035764813423157, + "learning_rate": 7.104455466849339e-05, + "loss": 0.0411, + "step": 7480 + }, + { + "epoch": 7.181208053691275, + "grad_norm": 0.25808098912239075, + "learning_rate": 7.096558173023486e-05, + "loss": 0.0405, + "step": 7490 + }, + { + "epoch": 7.1907957813998085, + "grad_norm": 0.21516771614551544, + "learning_rate": 7.088654528050986e-05, + "loss": 0.0411, + "step": 7500 + }, + { + "epoch": 7.200383509108342, + "grad_norm": 0.27496856451034546, + "learning_rate": 7.080744555874517e-05, + "loss": 0.0332, + "step": 7510 + }, + { + "epoch": 7.209971236816874, + "grad_norm": 0.43999767303466797, + "learning_rate": 7.072828280455917e-05, + "loss": 0.0384, + "step": 7520 + }, + { + "epoch": 7.219558964525407, + "grad_norm": 0.3292781710624695, + "learning_rate": 7.06490572577612e-05, + "loss": 0.042, + "step": 7530 + }, + { + "epoch": 7.2291466922339405, + "grad_norm": 0.3117612600326538, + "learning_rate": 7.056976915835087e-05, + "loss": 0.0387, + "step": 7540 + }, + { + "epoch": 7.238734419942474, + "grad_norm": 0.2206171602010727, + "learning_rate": 7.049041874651722e-05, + "loss": 0.0362, + "step": 7550 + }, + { + "epoch": 7.248322147651007, + "grad_norm": 0.2644396722316742, + "learning_rate": 7.04110062626381e-05, + "loss": 0.0373, + "step": 7560 + }, + { + "epoch": 7.25790987535954, + "grad_norm": 0.2682825028896332, + "learning_rate": 7.033153194727934e-05, + "loss": 0.039, + "step": 7570 + }, + { + "epoch": 7.2674976030680725, + "grad_norm": 0.3411322832107544, + "learning_rate": 7.025199604119416e-05, + "loss": 0.0454, + "step": 7580 + }, + { + "epoch": 7.277085330776606, + "grad_norm": 0.3761787712574005, + "learning_rate": 7.017239878532227e-05, + "loss": 0.0379, + "step": 7590 + }, + { + "epoch": 7.286673058485139, + "grad_norm": 0.24610835313796997, + "learning_rate": 7.009274042078927e-05, + "loss": 0.0465, + "step": 7600 + }, + { + "epoch": 7.296260786193672, + "grad_norm": 0.3763638138771057, + "learning_rate": 7.00130211889059e-05, + "loss": 0.0351, + "step": 7610 + }, + { + "epoch": 7.305848513902205, + "grad_norm": 0.2616029679775238, + "learning_rate": 6.993324133116726e-05, + "loss": 0.039, + "step": 7620 + }, + { + "epoch": 7.315436241610739, + "grad_norm": 0.40914463996887207, + "learning_rate": 6.985340108925209e-05, + "loss": 0.0417, + "step": 7630 + }, + { + "epoch": 7.325023969319272, + "grad_norm": 0.3503078520298004, + "learning_rate": 6.977350070502208e-05, + "loss": 0.0456, + "step": 7640 + }, + { + "epoch": 7.334611697027804, + "grad_norm": 0.40051010251045227, + "learning_rate": 6.96935404205211e-05, + "loss": 0.047, + "step": 7650 + }, + { + "epoch": 7.344199424736337, + "grad_norm": 0.3985821306705475, + "learning_rate": 6.96135204779745e-05, + "loss": 0.0409, + "step": 7660 + }, + { + "epoch": 7.353787152444871, + "grad_norm": 0.5366324782371521, + "learning_rate": 6.95334411197883e-05, + "loss": 0.0445, + "step": 7670 + }, + { + "epoch": 7.363374880153404, + "grad_norm": 0.2314271628856659, + "learning_rate": 6.945330258854854e-05, + "loss": 0.0345, + "step": 7680 + }, + { + "epoch": 7.372962607861937, + "grad_norm": 0.24734103679656982, + "learning_rate": 6.937310512702056e-05, + "loss": 0.0354, + "step": 7690 + }, + { + "epoch": 7.382550335570469, + "grad_norm": 0.7746879458427429, + "learning_rate": 6.929284897814812e-05, + "loss": 0.0398, + "step": 7700 + }, + { + "epoch": 7.392138063279003, + "grad_norm": 0.3436695635318756, + "learning_rate": 6.921253438505285e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 7.401725790987536, + "grad_norm": 0.3027035593986511, + "learning_rate": 6.913216159103339e-05, + "loss": 0.0365, + "step": 7720 + }, + { + "epoch": 7.411313518696069, + "grad_norm": 0.23207184672355652, + "learning_rate": 6.905173083956468e-05, + "loss": 0.0397, + "step": 7730 + }, + { + "epoch": 7.420901246404602, + "grad_norm": 0.2601774036884308, + "learning_rate": 6.897124237429726e-05, + "loss": 0.0377, + "step": 7740 + }, + { + "epoch": 7.4304889741131355, + "grad_norm": 0.37864232063293457, + "learning_rate": 6.889069643905646e-05, + "loss": 0.0426, + "step": 7750 + }, + { + "epoch": 7.440076701821669, + "grad_norm": 0.29199257493019104, + "learning_rate": 6.881009327784176e-05, + "loss": 0.0414, + "step": 7760 + }, + { + "epoch": 7.449664429530201, + "grad_norm": 0.39418113231658936, + "learning_rate": 6.872943313482596e-05, + "loss": 0.04, + "step": 7770 + }, + { + "epoch": 7.459252157238734, + "grad_norm": 0.2868475615978241, + "learning_rate": 6.864871625435448e-05, + "loss": 0.0373, + "step": 7780 + }, + { + "epoch": 7.4688398849472675, + "grad_norm": 0.27719494700431824, + "learning_rate": 6.856794288094461e-05, + "loss": 0.0401, + "step": 7790 + }, + { + "epoch": 7.478427612655801, + "grad_norm": 0.33910930156707764, + "learning_rate": 6.848711325928481e-05, + "loss": 0.0375, + "step": 7800 + }, + { + "epoch": 7.488015340364334, + "grad_norm": 0.4122414290904999, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0437, + "step": 7810 + }, + { + "epoch": 7.497603068072867, + "grad_norm": 0.2600208818912506, + "learning_rate": 6.832528625082036e-05, + "loss": 0.0418, + "step": 7820 + }, + { + "epoch": 7.5071907957813995, + "grad_norm": 0.27382367849349976, + "learning_rate": 6.824428935424158e-05, + "loss": 0.0512, + "step": 7830 + }, + { + "epoch": 7.516778523489933, + "grad_norm": 0.27426889538764954, + "learning_rate": 6.816323718986313e-05, + "loss": 0.0339, + "step": 7840 + }, + { + "epoch": 7.526366251198466, + "grad_norm": 0.32315194606781006, + "learning_rate": 6.808213000321796e-05, + "loss": 0.0387, + "step": 7850 + }, + { + "epoch": 7.535953978906999, + "grad_norm": 0.2910844683647156, + "learning_rate": 6.80009680400058e-05, + "loss": 0.0351, + "step": 7860 + }, + { + "epoch": 7.545541706615532, + "grad_norm": 0.3915770649909973, + "learning_rate": 6.791975154609216e-05, + "loss": 0.0439, + "step": 7870 + }, + { + "epoch": 7.555129434324066, + "grad_norm": 0.2871047258377075, + "learning_rate": 6.78384807675079e-05, + "loss": 0.039, + "step": 7880 + }, + { + "epoch": 7.564717162032598, + "grad_norm": 0.3511698544025421, + "learning_rate": 6.775715595044822e-05, + "loss": 0.039, + "step": 7890 + }, + { + "epoch": 7.574304889741131, + "grad_norm": 0.23974575102329254, + "learning_rate": 6.767577734127209e-05, + "loss": 0.0438, + "step": 7900 + }, + { + "epoch": 7.583892617449664, + "grad_norm": 0.21983303129673004, + "learning_rate": 6.759434518650133e-05, + "loss": 0.043, + "step": 7910 + }, + { + "epoch": 7.593480345158198, + "grad_norm": 0.2729918658733368, + "learning_rate": 6.75128597328201e-05, + "loss": 0.0423, + "step": 7920 + }, + { + "epoch": 7.603068072866731, + "grad_norm": 0.34236469864845276, + "learning_rate": 6.743132122707394e-05, + "loss": 0.0443, + "step": 7930 + }, + { + "epoch": 7.612655800575264, + "grad_norm": 0.24948126077651978, + "learning_rate": 6.73497299162691e-05, + "loss": 0.037, + "step": 7940 + }, + { + "epoch": 7.622243528283796, + "grad_norm": 0.3250608444213867, + "learning_rate": 6.726808604757184e-05, + "loss": 0.0476, + "step": 7950 + }, + { + "epoch": 7.63183125599233, + "grad_norm": 0.2713163495063782, + "learning_rate": 6.718638986830758e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 7.641418983700863, + "grad_norm": 0.3012318015098572, + "learning_rate": 6.710464162596023e-05, + "loss": 0.0445, + "step": 7970 + }, + { + "epoch": 7.651006711409396, + "grad_norm": 0.4039930999279022, + "learning_rate": 6.702284156817143e-05, + "loss": 0.045, + "step": 7980 + }, + { + "epoch": 7.660594439117929, + "grad_norm": 0.22321514785289764, + "learning_rate": 6.694098994273977e-05, + "loss": 0.0395, + "step": 7990 + }, + { + "epoch": 7.6701821668264625, + "grad_norm": 0.3009647727012634, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0425, + "step": 8000 + }, + { + "epoch": 7.679769894534996, + "grad_norm": 0.23675967752933502, + "learning_rate": 6.677713298092251e-05, + "loss": 0.043, + "step": 8010 + }, + { + "epoch": 7.689357622243528, + "grad_norm": 0.3453296422958374, + "learning_rate": 6.669512814091219e-05, + "loss": 0.0402, + "step": 8020 + }, + { + "epoch": 7.698945349952061, + "grad_norm": 0.35849177837371826, + "learning_rate": 6.6613072726008e-05, + "loss": 0.0412, + "step": 8030 + }, + { + "epoch": 7.7085330776605945, + "grad_norm": 0.2602018117904663, + "learning_rate": 6.65309669847821e-05, + "loss": 0.0456, + "step": 8040 + }, + { + "epoch": 7.718120805369128, + "grad_norm": 0.296563059091568, + "learning_rate": 6.64488111659591e-05, + "loss": 0.0354, + "step": 8050 + }, + { + "epoch": 7.727708533077661, + "grad_norm": 0.2529861629009247, + "learning_rate": 6.636660551841527e-05, + "loss": 0.046, + "step": 8060 + }, + { + "epoch": 7.737296260786193, + "grad_norm": 0.3589211404323578, + "learning_rate": 6.62843502911779e-05, + "loss": 0.0486, + "step": 8070 + }, + { + "epoch": 7.7468839884947265, + "grad_norm": 0.28562942147254944, + "learning_rate": 6.620204573342444e-05, + "loss": 0.04, + "step": 8080 + }, + { + "epoch": 7.75647171620326, + "grad_norm": 0.42662665247917175, + "learning_rate": 6.611969209448175e-05, + "loss": 0.0417, + "step": 8090 + }, + { + "epoch": 7.766059443911793, + "grad_norm": 0.3339911997318268, + "learning_rate": 6.603728962382542e-05, + "loss": 0.0344, + "step": 8100 + }, + { + "epoch": 7.775647171620326, + "grad_norm": 0.5838896632194519, + "learning_rate": 6.595483857107891e-05, + "loss": 0.0371, + "step": 8110 + }, + { + "epoch": 7.785234899328859, + "grad_norm": 0.30259743332862854, + "learning_rate": 6.587233918601292e-05, + "loss": 0.0392, + "step": 8120 + }, + { + "epoch": 7.794822627037393, + "grad_norm": 0.4095616340637207, + "learning_rate": 6.578979171854449e-05, + "loss": 0.034, + "step": 8130 + }, + { + "epoch": 7.804410354745925, + "grad_norm": 0.4089941084384918, + "learning_rate": 6.570719641873639e-05, + "loss": 0.0432, + "step": 8140 + }, + { + "epoch": 7.813998082454458, + "grad_norm": 0.22477275133132935, + "learning_rate": 6.562455353679624e-05, + "loss": 0.0482, + "step": 8150 + }, + { + "epoch": 7.823585810162991, + "grad_norm": 0.24884644150733948, + "learning_rate": 6.554186332307583e-05, + "loss": 0.0357, + "step": 8160 + }, + { + "epoch": 7.833173537871525, + "grad_norm": 0.40433716773986816, + "learning_rate": 6.545912602807029e-05, + "loss": 0.0393, + "step": 8170 + }, + { + "epoch": 7.842761265580058, + "grad_norm": 0.1963358074426651, + "learning_rate": 6.537634190241742e-05, + "loss": 0.0369, + "step": 8180 + }, + { + "epoch": 7.85234899328859, + "grad_norm": 0.30618107318878174, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0365, + "step": 8190 + }, + { + "epoch": 7.861936720997123, + "grad_norm": 0.9213468432426453, + "learning_rate": 6.52106341624294e-05, + "loss": 0.0415, + "step": 8200 + }, + { + "epoch": 7.871524448705657, + "grad_norm": 0.41490432620048523, + "learning_rate": 6.512771105007609e-05, + "loss": 0.0432, + "step": 8210 + }, + { + "epoch": 7.88111217641419, + "grad_norm": 0.3433400094509125, + "learning_rate": 6.504474211103766e-05, + "loss": 0.0383, + "step": 8220 + }, + { + "epoch": 7.890699904122723, + "grad_norm": 0.2565036714076996, + "learning_rate": 6.496172759665357e-05, + "loss": 0.039, + "step": 8230 + }, + { + "epoch": 7.900287631831256, + "grad_norm": 0.36820822954177856, + "learning_rate": 6.487866775840141e-05, + "loss": 0.0373, + "step": 8240 + }, + { + "epoch": 7.9098753595397895, + "grad_norm": 0.26671302318573, + "learning_rate": 6.479556284789608e-05, + "loss": 0.0339, + "step": 8250 + }, + { + "epoch": 7.919463087248322, + "grad_norm": 0.3026654124259949, + "learning_rate": 6.471241311688894e-05, + "loss": 0.0363, + "step": 8260 + }, + { + "epoch": 7.929050814956855, + "grad_norm": 0.24896202981472015, + "learning_rate": 6.46292188172672e-05, + "loss": 0.0394, + "step": 8270 + }, + { + "epoch": 7.938638542665388, + "grad_norm": 0.3126719892024994, + "learning_rate": 6.454598020105306e-05, + "loss": 0.0439, + "step": 8280 + }, + { + "epoch": 7.9482262703739215, + "grad_norm": 0.33165302872657776, + "learning_rate": 6.446269752040295e-05, + "loss": 0.0393, + "step": 8290 + }, + { + "epoch": 7.957813998082455, + "grad_norm": 0.6648756265640259, + "learning_rate": 6.437937102760682e-05, + "loss": 0.0356, + "step": 8300 + }, + { + "epoch": 7.967401725790987, + "grad_norm": 0.24022682011127472, + "learning_rate": 6.429600097508732e-05, + "loss": 0.0406, + "step": 8310 + }, + { + "epoch": 7.97698945349952, + "grad_norm": 1.2279690504074097, + "learning_rate": 6.421258761539904e-05, + "loss": 0.0434, + "step": 8320 + }, + { + "epoch": 7.9865771812080535, + "grad_norm": 0.2868311107158661, + "learning_rate": 6.412913120122779e-05, + "loss": 0.0372, + "step": 8330 + }, + { + "epoch": 7.996164908916587, + "grad_norm": 0.25136950612068176, + "learning_rate": 6.40456319853898e-05, + "loss": 0.0405, + "step": 8340 + }, + { + "epoch": 8.00575263662512, + "grad_norm": 0.3662584722042084, + "learning_rate": 6.396209022083098e-05, + "loss": 0.041, + "step": 8350 + }, + { + "epoch": 8.015340364333653, + "grad_norm": 0.3134470283985138, + "learning_rate": 6.387850616062605e-05, + "loss": 0.0357, + "step": 8360 + }, + { + "epoch": 8.024928092042186, + "grad_norm": 0.3947703540325165, + "learning_rate": 6.379488005797797e-05, + "loss": 0.0384, + "step": 8370 + }, + { + "epoch": 8.03451581975072, + "grad_norm": 0.3272991478443146, + "learning_rate": 6.371121216621698e-05, + "loss": 0.0392, + "step": 8380 + }, + { + "epoch": 8.044103547459253, + "grad_norm": 1.1089465618133545, + "learning_rate": 6.362750273879996e-05, + "loss": 0.047, + "step": 8390 + }, + { + "epoch": 8.053691275167786, + "grad_norm": 0.2133249044418335, + "learning_rate": 6.354375202930958e-05, + "loss": 0.0333, + "step": 8400 + }, + { + "epoch": 8.063279002876317, + "grad_norm": 0.3814240097999573, + "learning_rate": 6.345996029145356e-05, + "loss": 0.0419, + "step": 8410 + }, + { + "epoch": 8.07286673058485, + "grad_norm": 0.38257062435150146, + "learning_rate": 6.337612777906398e-05, + "loss": 0.0412, + "step": 8420 + }, + { + "epoch": 8.082454458293384, + "grad_norm": 0.20826545357704163, + "learning_rate": 6.329225474609633e-05, + "loss": 0.0402, + "step": 8430 + }, + { + "epoch": 8.092042186001917, + "grad_norm": 0.2289332151412964, + "learning_rate": 6.320834144662897e-05, + "loss": 0.0392, + "step": 8440 + }, + { + "epoch": 8.10162991371045, + "grad_norm": 0.29565075039863586, + "learning_rate": 6.312438813486211e-05, + "loss": 0.0347, + "step": 8450 + }, + { + "epoch": 8.111217641418984, + "grad_norm": 0.21872690320014954, + "learning_rate": 6.30403950651173e-05, + "loss": 0.0357, + "step": 8460 + }, + { + "epoch": 8.120805369127517, + "grad_norm": 0.24760524928569794, + "learning_rate": 6.295636249183643e-05, + "loss": 0.0331, + "step": 8470 + }, + { + "epoch": 8.13039309683605, + "grad_norm": 0.2806303799152374, + "learning_rate": 6.287229066958113e-05, + "loss": 0.0393, + "step": 8480 + }, + { + "epoch": 8.139980824544583, + "grad_norm": 0.45841529965400696, + "learning_rate": 6.278817985303184e-05, + "loss": 0.0434, + "step": 8490 + }, + { + "epoch": 8.149568552253116, + "grad_norm": 0.21284928917884827, + "learning_rate": 6.270403029698722e-05, + "loss": 0.0311, + "step": 8500 + }, + { + "epoch": 8.15915627996165, + "grad_norm": 0.312191367149353, + "learning_rate": 6.261984225636324e-05, + "loss": 0.0409, + "step": 8510 + }, + { + "epoch": 8.168744007670183, + "grad_norm": 0.38339605927467346, + "learning_rate": 6.253561598619247e-05, + "loss": 0.0367, + "step": 8520 + }, + { + "epoch": 8.178331735378714, + "grad_norm": 0.24168361723423004, + "learning_rate": 6.245135174162323e-05, + "loss": 0.0419, + "step": 8530 + }, + { + "epoch": 8.187919463087248, + "grad_norm": 0.3038835823535919, + "learning_rate": 6.236704977791898e-05, + "loss": 0.0349, + "step": 8540 + }, + { + "epoch": 8.19750719079578, + "grad_norm": 0.32537156343460083, + "learning_rate": 6.228271035045735e-05, + "loss": 0.0347, + "step": 8550 + }, + { + "epoch": 8.207094918504314, + "grad_norm": 0.2789401412010193, + "learning_rate": 6.21983337147295e-05, + "loss": 0.0339, + "step": 8560 + }, + { + "epoch": 8.216682646212847, + "grad_norm": 0.4282236397266388, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0352, + "step": 8570 + }, + { + "epoch": 8.22627037392138, + "grad_norm": 0.3608817458152771, + "learning_rate": 6.202946984100261e-05, + "loss": 0.0373, + "step": 8580 + }, + { + "epoch": 8.235858101629914, + "grad_norm": 0.29480835795402527, + "learning_rate": 6.194498311454636e-05, + "loss": 0.0321, + "step": 8590 + }, + { + "epoch": 8.245445829338447, + "grad_norm": 0.27964943647384644, + "learning_rate": 6.186046020290792e-05, + "loss": 0.0428, + "step": 8600 + }, + { + "epoch": 8.25503355704698, + "grad_norm": 0.2138575315475464, + "learning_rate": 6.177590136213429e-05, + "loss": 0.0344, + "step": 8610 + }, + { + "epoch": 8.264621284755513, + "grad_norm": 0.3693723678588867, + "learning_rate": 6.169130684838132e-05, + "loss": 0.0449, + "step": 8620 + }, + { + "epoch": 8.274209012464047, + "grad_norm": 0.24271826446056366, + "learning_rate": 6.160667691791287e-05, + "loss": 0.0414, + "step": 8630 + }, + { + "epoch": 8.28379674017258, + "grad_norm": 0.27349698543548584, + "learning_rate": 6.152201182710016e-05, + "loss": 0.0437, + "step": 8640 + }, + { + "epoch": 8.293384467881111, + "grad_norm": 0.265661358833313, + "learning_rate": 6.143731183242085e-05, + "loss": 0.0402, + "step": 8650 + }, + { + "epoch": 8.302972195589644, + "grad_norm": 0.3084318935871124, + "learning_rate": 6.13525771904584e-05, + "loss": 0.0424, + "step": 8660 + }, + { + "epoch": 8.312559923298178, + "grad_norm": 0.42005741596221924, + "learning_rate": 6.126780815790116e-05, + "loss": 0.0386, + "step": 8670 + }, + { + "epoch": 8.322147651006711, + "grad_norm": 0.349277526140213, + "learning_rate": 6.118300499154174e-05, + "loss": 0.0355, + "step": 8680 + }, + { + "epoch": 8.331735378715244, + "grad_norm": 0.3930281102657318, + "learning_rate": 6.109816794827607e-05, + "loss": 0.0386, + "step": 8690 + }, + { + "epoch": 8.341323106423777, + "grad_norm": 0.2631587088108063, + "learning_rate": 6.101329728510278e-05, + "loss": 0.0376, + "step": 8700 + }, + { + "epoch": 8.35091083413231, + "grad_norm": 0.3070177137851715, + "learning_rate": 6.0928393259122285e-05, + "loss": 0.039, + "step": 8710 + }, + { + "epoch": 8.360498561840844, + "grad_norm": 0.3494318425655365, + "learning_rate": 6.084345612753611e-05, + "loss": 0.0405, + "step": 8720 + }, + { + "epoch": 8.370086289549377, + "grad_norm": 0.2996184825897217, + "learning_rate": 6.0758486147646035e-05, + "loss": 0.0386, + "step": 8730 + }, + { + "epoch": 8.37967401725791, + "grad_norm": 0.39091756939888, + "learning_rate": 6.0673483576853365e-05, + "loss": 0.038, + "step": 8740 + }, + { + "epoch": 8.389261744966444, + "grad_norm": 0.28855571150779724, + "learning_rate": 6.0588448672658125e-05, + "loss": 0.0403, + "step": 8750 + }, + { + "epoch": 8.398849472674977, + "grad_norm": 0.25725746154785156, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0338, + "step": 8760 + }, + { + "epoch": 8.40843720038351, + "grad_norm": 0.2737105190753937, + "learning_rate": 6.041828289454903e-05, + "loss": 0.0417, + "step": 8770 + }, + { + "epoch": 8.418024928092041, + "grad_norm": 0.3197145462036133, + "learning_rate": 6.033315253612186e-05, + "loss": 0.0428, + "step": 8780 + }, + { + "epoch": 8.427612655800575, + "grad_norm": 0.35713446140289307, + "learning_rate": 6.0247990875263914e-05, + "loss": 0.0376, + "step": 8790 + }, + { + "epoch": 8.437200383509108, + "grad_norm": 0.354390949010849, + "learning_rate": 6.016279816995718e-05, + "loss": 0.0384, + "step": 8800 + }, + { + "epoch": 8.446788111217641, + "grad_norm": 0.31738895177841187, + "learning_rate": 6.0077574678277636e-05, + "loss": 0.048, + "step": 8810 + }, + { + "epoch": 8.456375838926174, + "grad_norm": 0.28505873680114746, + "learning_rate": 5.999232065839456e-05, + "loss": 0.0353, + "step": 8820 + }, + { + "epoch": 8.465963566634708, + "grad_norm": 0.3551139831542969, + "learning_rate": 5.990703636856974e-05, + "loss": 0.0422, + "step": 8830 + }, + { + "epoch": 8.47555129434324, + "grad_norm": 0.23753251135349274, + "learning_rate": 5.982172206715656e-05, + "loss": 0.0356, + "step": 8840 + }, + { + "epoch": 8.485139022051774, + "grad_norm": 0.3025340735912323, + "learning_rate": 5.973637801259944e-05, + "loss": 0.0416, + "step": 8850 + }, + { + "epoch": 8.494726749760307, + "grad_norm": 0.3358081579208374, + "learning_rate": 5.9651004463432826e-05, + "loss": 0.0406, + "step": 8860 + }, + { + "epoch": 8.50431447746884, + "grad_norm": 0.2748364508152008, + "learning_rate": 5.95656016782806e-05, + "loss": 0.0355, + "step": 8870 + }, + { + "epoch": 8.513902205177374, + "grad_norm": 0.27150842547416687, + "learning_rate": 5.948016991585514e-05, + "loss": 0.0356, + "step": 8880 + }, + { + "epoch": 8.523489932885907, + "grad_norm": 0.2812124490737915, + "learning_rate": 5.9394709434956664e-05, + "loss": 0.0419, + "step": 8890 + }, + { + "epoch": 8.53307766059444, + "grad_norm": 0.29283568263053894, + "learning_rate": 5.9309220494472314e-05, + "loss": 0.0408, + "step": 8900 + }, + { + "epoch": 8.542665388302972, + "grad_norm": 0.4069705605506897, + "learning_rate": 5.9223703353375534e-05, + "loss": 0.0425, + "step": 8910 + }, + { + "epoch": 8.552253116011505, + "grad_norm": 0.2776540219783783, + "learning_rate": 5.913815827072513e-05, + "loss": 0.0365, + "step": 8920 + }, + { + "epoch": 8.561840843720038, + "grad_norm": 0.2777857482433319, + "learning_rate": 5.905258550566458e-05, + "loss": 0.0368, + "step": 8930 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.3018902838230133, + "learning_rate": 5.896698531742122e-05, + "loss": 0.0377, + "step": 8940 + }, + { + "epoch": 8.581016299137104, + "grad_norm": 0.622887134552002, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0448, + "step": 8950 + }, + { + "epoch": 8.590604026845638, + "grad_norm": 0.28407829999923706, + "learning_rate": 5.879570370870995e-05, + "loss": 0.0373, + "step": 8960 + }, + { + "epoch": 8.60019175455417, + "grad_norm": 0.2791987955570221, + "learning_rate": 5.871002280710892e-05, + "loss": 0.0402, + "step": 8970 + }, + { + "epoch": 8.609779482262704, + "grad_norm": 0.27533990144729614, + "learning_rate": 5.862431552005729e-05, + "loss": 0.0434, + "step": 8980 + }, + { + "epoch": 8.619367209971237, + "grad_norm": 0.27701878547668457, + "learning_rate": 5.85385821071899e-05, + "loss": 0.0383, + "step": 8990 + }, + { + "epoch": 8.62895493767977, + "grad_norm": 0.269197016954422, + "learning_rate": 5.845282282822071e-05, + "loss": 0.0389, + "step": 9000 + }, + { + "epoch": 8.638542665388304, + "grad_norm": 0.3775997757911682, + "learning_rate": 5.836703794294208e-05, + "loss": 0.0401, + "step": 9010 + }, + { + "epoch": 8.648130393096835, + "grad_norm": 0.21519199013710022, + "learning_rate": 5.828122771122392e-05, + "loss": 0.0326, + "step": 9020 + }, + { + "epoch": 8.657718120805368, + "grad_norm": 0.4001868963241577, + "learning_rate": 5.819539239301291e-05, + "loss": 0.04, + "step": 9030 + }, + { + "epoch": 8.667305848513902, + "grad_norm": 0.19594238698482513, + "learning_rate": 5.810953224833177e-05, + "loss": 0.0301, + "step": 9040 + }, + { + "epoch": 8.676893576222435, + "grad_norm": 0.19823068380355835, + "learning_rate": 5.802364753727836e-05, + "loss": 0.0344, + "step": 9050 + }, + { + "epoch": 8.686481303930968, + "grad_norm": 0.26146700978279114, + "learning_rate": 5.793773852002502e-05, + "loss": 0.0444, + "step": 9060 + }, + { + "epoch": 8.696069031639501, + "grad_norm": 0.36863768100738525, + "learning_rate": 5.7851805456817677e-05, + "loss": 0.0364, + "step": 9070 + }, + { + "epoch": 8.705656759348035, + "grad_norm": 0.2518344521522522, + "learning_rate": 5.7765848607975136e-05, + "loss": 0.0394, + "step": 9080 + }, + { + "epoch": 8.715244487056568, + "grad_norm": 0.2473488301038742, + "learning_rate": 5.767986823388825e-05, + "loss": 0.0326, + "step": 9090 + }, + { + "epoch": 8.724832214765101, + "grad_norm": 0.20669348537921906, + "learning_rate": 5.7593864595019096e-05, + "loss": 0.0408, + "step": 9100 + }, + { + "epoch": 8.734419942473634, + "grad_norm": 0.32804393768310547, + "learning_rate": 5.750783795190029e-05, + "loss": 0.0388, + "step": 9110 + }, + { + "epoch": 8.744007670182167, + "grad_norm": 0.18472160398960114, + "learning_rate": 5.7421788565134074e-05, + "loss": 0.0395, + "step": 9120 + }, + { + "epoch": 8.7535953978907, + "grad_norm": 0.3553003668785095, + "learning_rate": 5.733571669539167e-05, + "loss": 0.0432, + "step": 9130 + }, + { + "epoch": 8.763183125599234, + "grad_norm": 0.2398902177810669, + "learning_rate": 5.72496226034123e-05, + "loss": 0.0354, + "step": 9140 + }, + { + "epoch": 8.772770853307765, + "grad_norm": 0.2900802195072174, + "learning_rate": 5.716350655000261e-05, + "loss": 0.0449, + "step": 9150 + }, + { + "epoch": 8.782358581016299, + "grad_norm": 0.17919373512268066, + "learning_rate": 5.707736879603568e-05, + "loss": 0.0413, + "step": 9160 + }, + { + "epoch": 8.791946308724832, + "grad_norm": 0.2598424255847931, + "learning_rate": 5.6991209602450424e-05, + "loss": 0.0432, + "step": 9170 + }, + { + "epoch": 8.801534036433365, + "grad_norm": 0.4794408082962036, + "learning_rate": 5.69050292302506e-05, + "loss": 0.0392, + "step": 9180 + }, + { + "epoch": 8.811121764141898, + "grad_norm": 0.3420094847679138, + "learning_rate": 5.6818827940504225e-05, + "loss": 0.0335, + "step": 9190 + }, + { + "epoch": 8.820709491850431, + "grad_norm": 1.9920908212661743, + "learning_rate": 5.673260599434259e-05, + "loss": 0.0427, + "step": 9200 + }, + { + "epoch": 8.830297219558965, + "grad_norm": 0.28250133991241455, + "learning_rate": 5.664636365295965e-05, + "loss": 0.0349, + "step": 9210 + }, + { + "epoch": 8.839884947267498, + "grad_norm": 0.22743001580238342, + "learning_rate": 5.656010117761105e-05, + "loss": 0.0401, + "step": 9220 + }, + { + "epoch": 8.849472674976031, + "grad_norm": 0.2771368622779846, + "learning_rate": 5.647381882961349e-05, + "loss": 0.0424, + "step": 9230 + }, + { + "epoch": 8.859060402684564, + "grad_norm": 0.38394448161125183, + "learning_rate": 5.638751687034387e-05, + "loss": 0.0357, + "step": 9240 + }, + { + "epoch": 8.868648130393098, + "grad_norm": 0.22416839003562927, + "learning_rate": 5.630119556123848e-05, + "loss": 0.0347, + "step": 9250 + }, + { + "epoch": 8.87823585810163, + "grad_norm": 0.1746525913476944, + "learning_rate": 5.6214855163792224e-05, + "loss": 0.0366, + "step": 9260 + }, + { + "epoch": 8.887823585810162, + "grad_norm": 0.26215359568595886, + "learning_rate": 5.6128495939557835e-05, + "loss": 0.0411, + "step": 9270 + }, + { + "epoch": 8.897411313518695, + "grad_norm": 0.3498288691043854, + "learning_rate": 5.604211815014509e-05, + "loss": 0.0404, + "step": 9280 + }, + { + "epoch": 8.906999041227229, + "grad_norm": 0.19935335218906403, + "learning_rate": 5.595572205721999e-05, + "loss": 0.0356, + "step": 9290 + }, + { + "epoch": 8.916586768935762, + "grad_norm": 0.3347182869911194, + "learning_rate": 5.5869307922504e-05, + "loss": 0.0393, + "step": 9300 + }, + { + "epoch": 8.926174496644295, + "grad_norm": 0.3638782203197479, + "learning_rate": 5.578287600777321e-05, + "loss": 0.0324, + "step": 9310 + }, + { + "epoch": 8.935762224352828, + "grad_norm": 0.2433633953332901, + "learning_rate": 5.569642657485761e-05, + "loss": 0.0351, + "step": 9320 + }, + { + "epoch": 8.945349952061362, + "grad_norm": 0.2311711609363556, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0386, + "step": 9330 + }, + { + "epoch": 8.954937679769895, + "grad_norm": 0.2803432047367096, + "learning_rate": 5.552347620205638e-05, + "loss": 0.0461, + "step": 9340 + }, + { + "epoch": 8.964525407478428, + "grad_norm": 0.25586047768592834, + "learning_rate": 5.5436975786092873e-05, + "loss": 0.0384, + "step": 9350 + }, + { + "epoch": 8.974113135186961, + "grad_norm": 0.3626959025859833, + "learning_rate": 5.535045889978717e-05, + "loss": 0.0374, + "step": 9360 + }, + { + "epoch": 8.983700862895494, + "grad_norm": 0.3548148572444916, + "learning_rate": 5.526392580522666e-05, + "loss": 0.0416, + "step": 9370 + }, + { + "epoch": 8.993288590604028, + "grad_norm": 2.09843111038208, + "learning_rate": 5.5177376764547814e-05, + "loss": 0.0434, + "step": 9380 + }, + { + "epoch": 9.002876318312559, + "grad_norm": 0.4216479957103729, + "learning_rate": 5.5090812039935426e-05, + "loss": 0.0404, + "step": 9390 + }, + { + "epoch": 9.012464046021092, + "grad_norm": 0.292222261428833, + "learning_rate": 5.5004231893621774e-05, + "loss": 0.0362, + "step": 9400 + }, + { + "epoch": 9.022051773729626, + "grad_norm": 0.37306836247444153, + "learning_rate": 5.491763658788589e-05, + "loss": 0.0367, + "step": 9410 + }, + { + "epoch": 9.031639501438159, + "grad_norm": 0.2755350172519684, + "learning_rate": 5.483102638505269e-05, + "loss": 0.0401, + "step": 9420 + }, + { + "epoch": 9.041227229146692, + "grad_norm": 0.2616848349571228, + "learning_rate": 5.4744401547492254e-05, + "loss": 0.0337, + "step": 9430 + }, + { + "epoch": 9.050814956855225, + "grad_norm": 0.28111451864242554, + "learning_rate": 5.465776233761896e-05, + "loss": 0.0384, + "step": 9440 + }, + { + "epoch": 9.060402684563758, + "grad_norm": 0.23586216568946838, + "learning_rate": 5.4571109017890753e-05, + "loss": 0.0405, + "step": 9450 + }, + { + "epoch": 9.069990412272292, + "grad_norm": 0.3019304871559143, + "learning_rate": 5.44844418508083e-05, + "loss": 0.0389, + "step": 9460 + }, + { + "epoch": 9.079578139980825, + "grad_norm": 0.3531333804130554, + "learning_rate": 5.4397761098914254e-05, + "loss": 0.0334, + "step": 9470 + }, + { + "epoch": 9.089165867689358, + "grad_norm": 0.40830254554748535, + "learning_rate": 5.431106702479235e-05, + "loss": 0.0357, + "step": 9480 + }, + { + "epoch": 9.098753595397891, + "grad_norm": 0.44957104325294495, + "learning_rate": 5.4224359891066765e-05, + "loss": 0.039, + "step": 9490 + }, + { + "epoch": 9.108341323106425, + "grad_norm": 0.6519899964332581, + "learning_rate": 5.413763996040117e-05, + "loss": 0.0402, + "step": 9500 + }, + { + "epoch": 9.117929050814958, + "grad_norm": 0.4034676253795624, + "learning_rate": 5.405090749549804e-05, + "loss": 0.0459, + "step": 9510 + }, + { + "epoch": 9.12751677852349, + "grad_norm": 0.3996933698654175, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0398, + "step": 9520 + }, + { + "epoch": 9.137104506232022, + "grad_norm": 0.16408595442771912, + "learning_rate": 5.387740601397806e-05, + "loss": 0.0358, + "step": 9530 + }, + { + "epoch": 9.146692233940556, + "grad_norm": 0.3471783995628357, + "learning_rate": 5.379063752295282e-05, + "loss": 0.0391, + "step": 9540 + }, + { + "epoch": 9.156279961649089, + "grad_norm": 0.4107268452644348, + "learning_rate": 5.370385754887164e-05, + "loss": 0.0424, + "step": 9550 + }, + { + "epoch": 9.165867689357622, + "grad_norm": 0.32927405834198, + "learning_rate": 5.3617066354618874e-05, + "loss": 0.0453, + "step": 9560 + }, + { + "epoch": 9.175455417066155, + "grad_norm": 0.41520607471466064, + "learning_rate": 5.3530264203112856e-05, + "loss": 0.0392, + "step": 9570 + }, + { + "epoch": 9.185043144774689, + "grad_norm": 0.3985765278339386, + "learning_rate": 5.344345135730513e-05, + "loss": 0.0364, + "step": 9580 + }, + { + "epoch": 9.194630872483222, + "grad_norm": 0.344056099653244, + "learning_rate": 5.335662808017964e-05, + "loss": 0.0444, + "step": 9590 + }, + { + "epoch": 9.204218600191755, + "grad_norm": 0.3382169008255005, + "learning_rate": 5.32697946347519e-05, + "loss": 0.0375, + "step": 9600 + }, + { + "epoch": 9.213806327900288, + "grad_norm": 0.3668196499347687, + "learning_rate": 5.318295128406825e-05, + "loss": 0.0427, + "step": 9610 + }, + { + "epoch": 9.223394055608821, + "grad_norm": 0.22777938842773438, + "learning_rate": 5.3096098291205044e-05, + "loss": 0.0362, + "step": 9620 + }, + { + "epoch": 9.232981783317355, + "grad_norm": 0.2992532551288605, + "learning_rate": 5.300923591926783e-05, + "loss": 0.0344, + "step": 9630 + }, + { + "epoch": 9.242569511025886, + "grad_norm": 0.2733289301395416, + "learning_rate": 5.292236443139056e-05, + "loss": 0.0318, + "step": 9640 + }, + { + "epoch": 9.25215723873442, + "grad_norm": 0.2972942292690277, + "learning_rate": 5.283548409073482e-05, + "loss": 0.0357, + "step": 9650 + }, + { + "epoch": 9.261744966442953, + "grad_norm": 0.3721420466899872, + "learning_rate": 5.274859516048901e-05, + "loss": 0.0356, + "step": 9660 + }, + { + "epoch": 9.271332694151486, + "grad_norm": 0.13791558146476746, + "learning_rate": 5.266169790386756e-05, + "loss": 0.0345, + "step": 9670 + }, + { + "epoch": 9.280920421860019, + "grad_norm": 0.2645628750324249, + "learning_rate": 5.257479258411008e-05, + "loss": 0.0426, + "step": 9680 + }, + { + "epoch": 9.290508149568552, + "grad_norm": 0.3136797845363617, + "learning_rate": 5.248787946448065e-05, + "loss": 0.0354, + "step": 9690 + }, + { + "epoch": 9.300095877277085, + "grad_norm": 0.25481873750686646, + "learning_rate": 5.240095880826695e-05, + "loss": 0.0401, + "step": 9700 + }, + { + "epoch": 9.309683604985619, + "grad_norm": 0.24243059754371643, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0422, + "step": 9710 + }, + { + "epoch": 9.319271332694152, + "grad_norm": 0.22734355926513672, + "learning_rate": 5.2227095939350966e-05, + "loss": 0.0409, + "step": 9720 + }, + { + "epoch": 9.328859060402685, + "grad_norm": 0.35372641682624817, + "learning_rate": 5.214015425333502e-05, + "loss": 0.0413, + "step": 9730 + }, + { + "epoch": 9.338446788111218, + "grad_norm": 0.2218106985092163, + "learning_rate": 5.205320608410591e-05, + "loss": 0.0385, + "step": 9740 + }, + { + "epoch": 9.348034515819752, + "grad_norm": 0.8550918698310852, + "learning_rate": 5.196625169505755e-05, + "loss": 0.0383, + "step": 9750 + }, + { + "epoch": 9.357622243528283, + "grad_norm": 0.325469434261322, + "learning_rate": 5.18792913496026e-05, + "loss": 0.0377, + "step": 9760 + }, + { + "epoch": 9.367209971236816, + "grad_norm": 0.2887977063655853, + "learning_rate": 5.1792325311171875e-05, + "loss": 0.039, + "step": 9770 + }, + { + "epoch": 9.37679769894535, + "grad_norm": 0.267398476600647, + "learning_rate": 5.1705353843213336e-05, + "loss": 0.0351, + "step": 9780 + }, + { + "epoch": 9.386385426653883, + "grad_norm": 0.3469073176383972, + "learning_rate": 5.1618377209191447e-05, + "loss": 0.0373, + "step": 9790 + }, + { + "epoch": 9.395973154362416, + "grad_norm": 0.399781733751297, + "learning_rate": 5.1531395672586314e-05, + "loss": 0.0345, + "step": 9800 + }, + { + "epoch": 9.405560882070949, + "grad_norm": 0.3050326704978943, + "learning_rate": 5.144440949689287e-05, + "loss": 0.0436, + "step": 9810 + }, + { + "epoch": 9.415148609779482, + "grad_norm": 0.22124247252941132, + "learning_rate": 5.135741894562014e-05, + "loss": 0.0384, + "step": 9820 + }, + { + "epoch": 9.424736337488016, + "grad_norm": 0.32914167642593384, + "learning_rate": 5.127042428229036e-05, + "loss": 0.0395, + "step": 9830 + }, + { + "epoch": 9.434324065196549, + "grad_norm": 0.302157998085022, + "learning_rate": 5.118342577043829e-05, + "loss": 0.0446, + "step": 9840 + }, + { + "epoch": 9.443911792905082, + "grad_norm": 0.29756733775138855, + "learning_rate": 5.1096423673610246e-05, + "loss": 0.035, + "step": 9850 + }, + { + "epoch": 9.453499520613615, + "grad_norm": 0.21626603603363037, + "learning_rate": 5.100941825536353e-05, + "loss": 0.0487, + "step": 9860 + }, + { + "epoch": 9.463087248322148, + "grad_norm": 0.31502407789230347, + "learning_rate": 5.092240977926538e-05, + "loss": 0.0384, + "step": 9870 + }, + { + "epoch": 9.47267497603068, + "grad_norm": 0.3153168261051178, + "learning_rate": 5.083539850889239e-05, + "loss": 0.0377, + "step": 9880 + }, + { + "epoch": 9.482262703739213, + "grad_norm": 0.3235209584236145, + "learning_rate": 5.074838470782957e-05, + "loss": 0.0402, + "step": 9890 + }, + { + "epoch": 9.491850431447746, + "grad_norm": 0.4194275438785553, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0349, + "step": 9900 + }, + { + "epoch": 9.50143815915628, + "grad_norm": 0.26690346002578735, + "learning_rate": 5.0574350568012086e-05, + "loss": 0.037, + "step": 9910 + }, + { + "epoch": 9.511025886864813, + "grad_norm": 0.3191596567630768, + "learning_rate": 5.0487330756462624e-05, + "loss": 0.0427, + "step": 9920 + }, + { + "epoch": 9.520613614573346, + "grad_norm": 0.21837887167930603, + "learning_rate": 5.040030946863209e-05, + "loss": 0.031, + "step": 9930 + }, + { + "epoch": 9.53020134228188, + "grad_norm": 0.28201964497566223, + "learning_rate": 5.0313286968135884e-05, + "loss": 0.0348, + "step": 9940 + }, + { + "epoch": 9.539789069990412, + "grad_norm": 0.6378640532493591, + "learning_rate": 5.022626351859305e-05, + "loss": 0.0392, + "step": 9950 + }, + { + "epoch": 9.549376797698946, + "grad_norm": 0.27877506613731384, + "learning_rate": 5.01392393836255e-05, + "loss": 0.0435, + "step": 9960 + }, + { + "epoch": 9.558964525407479, + "grad_norm": 0.21583925187587738, + "learning_rate": 5.0052214826857225e-05, + "loss": 0.036, + "step": 9970 + }, + { + "epoch": 9.568552253116012, + "grad_norm": 0.3575581908226013, + "learning_rate": 4.996519011191351e-05, + "loss": 0.0344, + "step": 9980 + }, + { + "epoch": 9.578139980824545, + "grad_norm": 0.2446652501821518, + "learning_rate": 4.9878165502420104e-05, + "loss": 0.0382, + "step": 9990 + }, + { + "epoch": 9.587727708533077, + "grad_norm": 0.1690993756055832, + "learning_rate": 4.979114126200244e-05, + "loss": 0.0392, + "step": 10000 + }, + { + "epoch": 9.59731543624161, + "grad_norm": 0.3892661929130554, + "learning_rate": 4.970411765428484e-05, + "loss": 0.0366, + "step": 10010 + }, + { + "epoch": 9.606903163950143, + "grad_norm": 0.26752811670303345, + "learning_rate": 4.961709494288966e-05, + "loss": 0.0377, + "step": 10020 + }, + { + "epoch": 9.616490891658676, + "grad_norm": 0.3104531466960907, + "learning_rate": 4.9530073391436654e-05, + "loss": 0.0371, + "step": 10030 + }, + { + "epoch": 9.62607861936721, + "grad_norm": 0.3081854283809662, + "learning_rate": 4.944305326354194e-05, + "loss": 0.0377, + "step": 10040 + }, + { + "epoch": 9.635666347075743, + "grad_norm": 0.32180699706077576, + "learning_rate": 4.935603482281739e-05, + "loss": 0.0364, + "step": 10050 + }, + { + "epoch": 9.645254074784276, + "grad_norm": 0.30046379566192627, + "learning_rate": 4.926901833286974e-05, + "loss": 0.0341, + "step": 10060 + }, + { + "epoch": 9.65484180249281, + "grad_norm": 0.24152809381484985, + "learning_rate": 4.918200405729986e-05, + "loss": 0.0453, + "step": 10070 + }, + { + "epoch": 9.664429530201343, + "grad_norm": 0.8806717395782471, + "learning_rate": 4.909499225970184e-05, + "loss": 0.0352, + "step": 10080 + }, + { + "epoch": 9.674017257909876, + "grad_norm": 0.3561595380306244, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.0337, + "step": 10090 + }, + { + "epoch": 9.683604985618409, + "grad_norm": 0.3623135983943939, + "learning_rate": 4.892097715275961e-05, + "loss": 0.0361, + "step": 10100 + }, + { + "epoch": 9.693192713326942, + "grad_norm": 0.3282937705516815, + "learning_rate": 4.883397437056293e-05, + "loss": 0.0357, + "step": 10110 + }, + { + "epoch": 9.702780441035475, + "grad_norm": 0.28583481907844543, + "learning_rate": 4.87469751206316e-05, + "loss": 0.032, + "step": 10120 + }, + { + "epoch": 9.712368168744007, + "grad_norm": 0.20011906325817108, + "learning_rate": 4.865997966651421e-05, + "loss": 0.0335, + "step": 10130 + }, + { + "epoch": 9.72195589645254, + "grad_norm": 0.23072586953639984, + "learning_rate": 4.857298827174787e-05, + "loss": 0.0326, + "step": 10140 + }, + { + "epoch": 9.731543624161073, + "grad_norm": 0.21280129253864288, + "learning_rate": 4.8486001199857416e-05, + "loss": 0.0354, + "step": 10150 + }, + { + "epoch": 9.741131351869607, + "grad_norm": 0.4237668812274933, + "learning_rate": 4.839901871435457e-05, + "loss": 0.0351, + "step": 10160 + }, + { + "epoch": 9.75071907957814, + "grad_norm": 0.2798875868320465, + "learning_rate": 4.831204107873713e-05, + "loss": 0.0353, + "step": 10170 + }, + { + "epoch": 9.760306807286673, + "grad_norm": 0.20780718326568604, + "learning_rate": 4.822506855648825e-05, + "loss": 0.0326, + "step": 10180 + }, + { + "epoch": 9.769894534995206, + "grad_norm": 0.2649904489517212, + "learning_rate": 4.8138101411075574e-05, + "loss": 0.035, + "step": 10190 + }, + { + "epoch": 9.77948226270374, + "grad_norm": 0.26445141434669495, + "learning_rate": 4.805113990595046e-05, + "loss": 0.0468, + "step": 10200 + }, + { + "epoch": 9.789069990412273, + "grad_norm": 0.3209472894668579, + "learning_rate": 4.796418430454718e-05, + "loss": 0.0375, + "step": 10210 + }, + { + "epoch": 9.798657718120806, + "grad_norm": 0.19877949357032776, + "learning_rate": 4.787723487028209e-05, + "loss": 0.0381, + "step": 10220 + }, + { + "epoch": 9.808245445829339, + "grad_norm": 0.3071509301662445, + "learning_rate": 4.779029186655292e-05, + "loss": 0.0432, + "step": 10230 + }, + { + "epoch": 9.817833173537872, + "grad_norm": 0.4730135500431061, + "learning_rate": 4.77033555567379e-05, + "loss": 0.0374, + "step": 10240 + }, + { + "epoch": 9.827420901246404, + "grad_norm": 0.29888778924942017, + "learning_rate": 4.761642620419497e-05, + "loss": 0.0357, + "step": 10250 + }, + { + "epoch": 9.837008628954937, + "grad_norm": 0.2550467550754547, + "learning_rate": 4.7529504072260974e-05, + "loss": 0.0309, + "step": 10260 + }, + { + "epoch": 9.84659635666347, + "grad_norm": 0.25972646474838257, + "learning_rate": 4.744258942425094e-05, + "loss": 0.0421, + "step": 10270 + }, + { + "epoch": 9.856184084372003, + "grad_norm": 0.4071574807167053, + "learning_rate": 4.735568252345718e-05, + "loss": 0.0351, + "step": 10280 + }, + { + "epoch": 9.865771812080537, + "grad_norm": 0.4687805771827698, + "learning_rate": 4.726878363314855e-05, + "loss": 0.0369, + "step": 10290 + }, + { + "epoch": 9.87535953978907, + "grad_norm": 0.41865023970603943, + "learning_rate": 4.718189301656962e-05, + "loss": 0.0345, + "step": 10300 + }, + { + "epoch": 9.884947267497603, + "grad_norm": 0.30435627698898315, + "learning_rate": 4.709501093693997e-05, + "loss": 0.0321, + "step": 10310 + }, + { + "epoch": 9.894534995206136, + "grad_norm": 0.3561161458492279, + "learning_rate": 4.7008137657453214e-05, + "loss": 0.0409, + "step": 10320 + }, + { + "epoch": 9.90412272291467, + "grad_norm": 0.36440134048461914, + "learning_rate": 4.692127344127637e-05, + "loss": 0.033, + "step": 10330 + }, + { + "epoch": 9.913710450623203, + "grad_norm": 0.26994454860687256, + "learning_rate": 4.683441855154899e-05, + "loss": 0.0346, + "step": 10340 + }, + { + "epoch": 9.923298178331736, + "grad_norm": 0.2506847381591797, + "learning_rate": 4.674757325138239e-05, + "loss": 0.0314, + "step": 10350 + }, + { + "epoch": 9.93288590604027, + "grad_norm": 0.20864498615264893, + "learning_rate": 4.666073780385879e-05, + "loss": 0.0366, + "step": 10360 + }, + { + "epoch": 9.9424736337488, + "grad_norm": 0.18419000506401062, + "learning_rate": 4.65739124720306e-05, + "loss": 0.0329, + "step": 10370 + }, + { + "epoch": 9.952061361457334, + "grad_norm": 0.3387259244918823, + "learning_rate": 4.648709751891957e-05, + "loss": 0.0381, + "step": 10380 + }, + { + "epoch": 9.961649089165867, + "grad_norm": 0.2119244635105133, + "learning_rate": 4.640029320751606e-05, + "loss": 0.0351, + "step": 10390 + }, + { + "epoch": 9.9712368168744, + "grad_norm": 0.4716765880584717, + "learning_rate": 4.63134998007781e-05, + "loss": 0.0378, + "step": 10400 + }, + { + "epoch": 9.980824544582934, + "grad_norm": 0.47296905517578125, + "learning_rate": 4.622671756163075e-05, + "loss": 0.0397, + "step": 10410 + }, + { + "epoch": 9.990412272291467, + "grad_norm": 0.3720930218696594, + "learning_rate": 4.6139946752965216e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 10.0, + "grad_norm": 0.2873878479003906, + "learning_rate": 4.6053187637638115e-05, + "loss": 0.0336, + "step": 10430 + }, + { + "epoch": 10.009587727708533, + "grad_norm": 0.27077776193618774, + "learning_rate": 4.596644047847061e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 10.019175455417066, + "grad_norm": 0.29882556200027466, + "learning_rate": 4.587970553824762e-05, + "loss": 0.0329, + "step": 10450 + }, + { + "epoch": 10.0287631831256, + "grad_norm": 0.23539794981479645, + "learning_rate": 4.579298307971709e-05, + "loss": 0.0319, + "step": 10460 + }, + { + "epoch": 10.038350910834133, + "grad_norm": 0.47081291675567627, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0448, + "step": 10470 + }, + { + "epoch": 10.047938638542666, + "grad_norm": 0.21392913162708282, + "learning_rate": 4.561957665853532e-05, + "loss": 0.0406, + "step": 10480 + }, + { + "epoch": 10.0575263662512, + "grad_norm": 0.31942254304885864, + "learning_rate": 4.553289322118769e-05, + "loss": 0.0347, + "step": 10490 + }, + { + "epoch": 10.06711409395973, + "grad_norm": 0.22749362885951996, + "learning_rate": 4.544622331613817e-05, + "loss": 0.0414, + "step": 10500 + }, + { + "epoch": 10.076701821668264, + "grad_norm": 0.24884119629859924, + "learning_rate": 4.5359567205937706e-05, + "loss": 0.0314, + "step": 10510 + }, + { + "epoch": 10.086289549376797, + "grad_norm": 0.26897284388542175, + "learning_rate": 4.527292515309541e-05, + "loss": 0.0394, + "step": 10520 + }, + { + "epoch": 10.09587727708533, + "grad_norm": 0.3579690158367157, + "learning_rate": 4.518629742007786e-05, + "loss": 0.0365, + "step": 10530 + }, + { + "epoch": 10.105465004793864, + "grad_norm": 0.19811834394931793, + "learning_rate": 4.509968426930817e-05, + "loss": 0.0358, + "step": 10540 + }, + { + "epoch": 10.115052732502397, + "grad_norm": 0.2834417223930359, + "learning_rate": 4.501308596316537e-05, + "loss": 0.0329, + "step": 10550 + }, + { + "epoch": 10.12464046021093, + "grad_norm": 0.1813543736934662, + "learning_rate": 4.492650276398347e-05, + "loss": 0.0345, + "step": 10560 + }, + { + "epoch": 10.134228187919463, + "grad_norm": 0.23895332217216492, + "learning_rate": 4.483993493405075e-05, + "loss": 0.0328, + "step": 10570 + }, + { + "epoch": 10.143815915627997, + "grad_norm": 0.2329237461090088, + "learning_rate": 4.475338273560886e-05, + "loss": 0.0334, + "step": 10580 + }, + { + "epoch": 10.15340364333653, + "grad_norm": 0.32786402106285095, + "learning_rate": 4.466684643085223e-05, + "loss": 0.0362, + "step": 10590 + }, + { + "epoch": 10.162991371045063, + "grad_norm": 0.2858993709087372, + "learning_rate": 4.458032628192699e-05, + "loss": 0.0349, + "step": 10600 + }, + { + "epoch": 10.172579098753596, + "grad_norm": 0.38395509123802185, + "learning_rate": 4.449382255093044e-05, + "loss": 0.0384, + "step": 10610 + }, + { + "epoch": 10.182166826462128, + "grad_norm": 0.35513293743133545, + "learning_rate": 4.440733549991006e-05, + "loss": 0.0317, + "step": 10620 + }, + { + "epoch": 10.191754554170661, + "grad_norm": 0.21551890671253204, + "learning_rate": 4.432086539086292e-05, + "loss": 0.0373, + "step": 10630 + }, + { + "epoch": 10.201342281879194, + "grad_norm": 0.22998203337192535, + "learning_rate": 4.423441248573463e-05, + "loss": 0.0376, + "step": 10640 + }, + { + "epoch": 10.210930009587727, + "grad_norm": 0.4294188618659973, + "learning_rate": 4.4147977046418776e-05, + "loss": 0.0356, + "step": 10650 + }, + { + "epoch": 10.22051773729626, + "grad_norm": 0.2688153386116028, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0364, + "step": 10660 + }, + { + "epoch": 10.230105465004794, + "grad_norm": 0.39193832874298096, + "learning_rate": 4.3975159612533244e-05, + "loss": 0.0337, + "step": 10670 + }, + { + "epoch": 10.239693192713327, + "grad_norm": 0.4422641694545746, + "learning_rate": 4.388877814148296e-05, + "loss": 0.0328, + "step": 10680 + }, + { + "epoch": 10.24928092042186, + "grad_norm": 0.25854796171188354, + "learning_rate": 4.380241518328231e-05, + "loss": 0.0338, + "step": 10690 + }, + { + "epoch": 10.258868648130393, + "grad_norm": 0.282626748085022, + "learning_rate": 4.371607099955236e-05, + "loss": 0.0398, + "step": 10700 + }, + { + "epoch": 10.268456375838927, + "grad_norm": 0.2568127512931824, + "learning_rate": 4.362974585185734e-05, + "loss": 0.0354, + "step": 10710 + }, + { + "epoch": 10.27804410354746, + "grad_norm": 0.28798142075538635, + "learning_rate": 4.3543440001703786e-05, + "loss": 0.0354, + "step": 10720 + }, + { + "epoch": 10.287631831255993, + "grad_norm": 0.28471261262893677, + "learning_rate": 4.345715371053976e-05, + "loss": 0.0365, + "step": 10730 + }, + { + "epoch": 10.297219558964525, + "grad_norm": 0.27555039525032043, + "learning_rate": 4.3370887239754085e-05, + "loss": 0.0324, + "step": 10740 + }, + { + "epoch": 10.306807286673058, + "grad_norm": 0.34258362650871277, + "learning_rate": 4.328464085067559e-05, + "loss": 0.0313, + "step": 10750 + }, + { + "epoch": 10.316395014381591, + "grad_norm": 0.2875727117061615, + "learning_rate": 4.319841480457221e-05, + "loss": 0.034, + "step": 10760 + }, + { + "epoch": 10.325982742090124, + "grad_norm": 0.37291842699050903, + "learning_rate": 4.311220936265025e-05, + "loss": 0.0358, + "step": 10770 + }, + { + "epoch": 10.335570469798657, + "grad_norm": 0.28330934047698975, + "learning_rate": 4.302602478605364e-05, + "loss": 0.0371, + "step": 10780 + }, + { + "epoch": 10.34515819750719, + "grad_norm": 0.2582619786262512, + "learning_rate": 4.29398613358631e-05, + "loss": 0.0373, + "step": 10790 + }, + { + "epoch": 10.354745925215724, + "grad_norm": 0.4369192123413086, + "learning_rate": 4.2853719273095306e-05, + "loss": 0.035, + "step": 10800 + }, + { + "epoch": 10.364333652924257, + "grad_norm": 0.7189898490905762, + "learning_rate": 4.276759885870221e-05, + "loss": 0.0306, + "step": 10810 + }, + { + "epoch": 10.37392138063279, + "grad_norm": 0.25174766778945923, + "learning_rate": 4.26815003535701e-05, + "loss": 0.0409, + "step": 10820 + }, + { + "epoch": 10.383509108341324, + "grad_norm": 0.251800537109375, + "learning_rate": 4.2595424018518994e-05, + "loss": 0.0338, + "step": 10830 + }, + { + "epoch": 10.393096836049857, + "grad_norm": 0.2858979105949402, + "learning_rate": 4.250937011430167e-05, + "loss": 0.041, + "step": 10840 + }, + { + "epoch": 10.40268456375839, + "grad_norm": 0.1836014688014984, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0356, + "step": 10850 + }, + { + "epoch": 10.412272291466923, + "grad_norm": 0.279307097196579, + "learning_rate": 4.233733064103906e-05, + "loss": 0.0359, + "step": 10860 + }, + { + "epoch": 10.421860019175455, + "grad_norm": 0.32045918703079224, + "learning_rate": 4.225134559315647e-05, + "loss": 0.0377, + "step": 10870 + }, + { + "epoch": 10.431447746883988, + "grad_norm": 0.2521663010120392, + "learning_rate": 4.2165384018431495e-05, + "loss": 0.0301, + "step": 10880 + }, + { + "epoch": 10.441035474592521, + "grad_norm": 0.7854000329971313, + "learning_rate": 4.207944617726931e-05, + "loss": 0.0337, + "step": 10890 + }, + { + "epoch": 10.450623202301054, + "grad_norm": 0.2677070200443268, + "learning_rate": 4.1993532330003146e-05, + "loss": 0.0392, + "step": 10900 + }, + { + "epoch": 10.460210930009588, + "grad_norm": 0.4461430609226227, + "learning_rate": 4.190764273689359e-05, + "loss": 0.0306, + "step": 10910 + }, + { + "epoch": 10.46979865771812, + "grad_norm": 0.30843472480773926, + "learning_rate": 4.1821777658127765e-05, + "loss": 0.0259, + "step": 10920 + }, + { + "epoch": 10.479386385426654, + "grad_norm": 0.5075517296791077, + "learning_rate": 4.17359373538185e-05, + "loss": 0.0376, + "step": 10930 + }, + { + "epoch": 10.488974113135187, + "grad_norm": 0.3522166609764099, + "learning_rate": 4.16501220840036e-05, + "loss": 0.0272, + "step": 10940 + }, + { + "epoch": 10.49856184084372, + "grad_norm": 0.3115832805633545, + "learning_rate": 4.156433210864499e-05, + "loss": 0.0421, + "step": 10950 + }, + { + "epoch": 10.508149568552254, + "grad_norm": 0.29928937554359436, + "learning_rate": 4.147856768762804e-05, + "loss": 0.0329, + "step": 10960 + }, + { + "epoch": 10.517737296260787, + "grad_norm": 0.2621513903141022, + "learning_rate": 4.139282908076064e-05, + "loss": 0.0313, + "step": 10970 + }, + { + "epoch": 10.527325023969318, + "grad_norm": 0.31416305899620056, + "learning_rate": 4.130711654777254e-05, + "loss": 0.0311, + "step": 10980 + }, + { + "epoch": 10.536912751677852, + "grad_norm": 0.23825299739837646, + "learning_rate": 4.1221430348314415e-05, + "loss": 0.0386, + "step": 10990 + }, + { + "epoch": 10.546500479386385, + "grad_norm": 0.2471434473991394, + "learning_rate": 4.11357707419573e-05, + "loss": 0.038, + "step": 11000 + }, + { + "epoch": 10.556088207094918, + "grad_norm": 0.2707345187664032, + "learning_rate": 4.105013798819155e-05, + "loss": 0.0356, + "step": 11010 + }, + { + "epoch": 10.565675934803451, + "grad_norm": 0.3994966149330139, + "learning_rate": 4.0964532346426235e-05, + "loss": 0.0326, + "step": 11020 + }, + { + "epoch": 10.575263662511984, + "grad_norm": 0.5146787762641907, + "learning_rate": 4.087895407598824e-05, + "loss": 0.0361, + "step": 11030 + }, + { + "epoch": 10.584851390220518, + "grad_norm": 0.2920519709587097, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0326, + "step": 11040 + }, + { + "epoch": 10.594439117929051, + "grad_norm": 0.27901026606559753, + "learning_rate": 4.070788068598672e-05, + "loss": 0.037, + "step": 11050 + }, + { + "epoch": 10.604026845637584, + "grad_norm": 0.26402774453163147, + "learning_rate": 4.062238608465927e-05, + "loss": 0.0337, + "step": 11060 + }, + { + "epoch": 10.613614573346117, + "grad_norm": 0.24872805178165436, + "learning_rate": 4.053691989112986e-05, + "loss": 0.0343, + "step": 11070 + }, + { + "epoch": 10.62320230105465, + "grad_norm": 0.21889743208885193, + "learning_rate": 4.0451482364303e-05, + "loss": 0.0329, + "step": 11080 + }, + { + "epoch": 10.632790028763184, + "grad_norm": 0.31977149844169617, + "learning_rate": 4.03660737629963e-05, + "loss": 0.0395, + "step": 11090 + }, + { + "epoch": 10.642377756471717, + "grad_norm": 0.3449043929576874, + "learning_rate": 4.028069434593982e-05, + "loss": 0.0362, + "step": 11100 + }, + { + "epoch": 10.651965484180248, + "grad_norm": 0.356534481048584, + "learning_rate": 4.019534437177516e-05, + "loss": 0.0453, + "step": 11110 + }, + { + "epoch": 10.661553211888782, + "grad_norm": 0.3510785400867462, + "learning_rate": 4.0110024099054756e-05, + "loss": 0.03, + "step": 11120 + }, + { + "epoch": 10.671140939597315, + "grad_norm": 0.4049818813800812, + "learning_rate": 4.002473378624107e-05, + "loss": 0.0337, + "step": 11130 + }, + { + "epoch": 10.680728667305848, + "grad_norm": 0.2889692485332489, + "learning_rate": 3.9939473691705765e-05, + "loss": 0.0369, + "step": 11140 + }, + { + "epoch": 10.690316395014381, + "grad_norm": 0.25454413890838623, + "learning_rate": 3.9854244073728996e-05, + "loss": 0.0373, + "step": 11150 + }, + { + "epoch": 10.699904122722915, + "grad_norm": 0.28601503372192383, + "learning_rate": 3.976904519049862e-05, + "loss": 0.0384, + "step": 11160 + }, + { + "epoch": 10.709491850431448, + "grad_norm": 0.22738857567310333, + "learning_rate": 3.968387730010935e-05, + "loss": 0.0352, + "step": 11170 + }, + { + "epoch": 10.719079578139981, + "grad_norm": 0.2723415493965149, + "learning_rate": 3.9598740660562005e-05, + "loss": 0.0372, + "step": 11180 + }, + { + "epoch": 10.728667305848514, + "grad_norm": 0.35877975821495056, + "learning_rate": 3.951363552976275e-05, + "loss": 0.0321, + "step": 11190 + }, + { + "epoch": 10.738255033557047, + "grad_norm": 0.2732999324798584, + "learning_rate": 3.942856216552234e-05, + "loss": 0.0423, + "step": 11200 + }, + { + "epoch": 10.74784276126558, + "grad_norm": 0.1939064860343933, + "learning_rate": 3.934352082555522e-05, + "loss": 0.0383, + "step": 11210 + }, + { + "epoch": 10.757430488974114, + "grad_norm": 0.34008413553237915, + "learning_rate": 3.92585117674789e-05, + "loss": 0.0374, + "step": 11220 + }, + { + "epoch": 10.767018216682647, + "grad_norm": 0.32701992988586426, + "learning_rate": 3.917353524881302e-05, + "loss": 0.0336, + "step": 11230 + }, + { + "epoch": 10.776605944391179, + "grad_norm": 0.29676583409309387, + "learning_rate": 3.908859152697872e-05, + "loss": 0.0358, + "step": 11240 + }, + { + "epoch": 10.786193672099712, + "grad_norm": 0.21634122729301453, + "learning_rate": 3.900368085929775e-05, + "loss": 0.0357, + "step": 11250 + }, + { + "epoch": 10.795781399808245, + "grad_norm": 0.29007887840270996, + "learning_rate": 3.8918803502991744e-05, + "loss": 0.0396, + "step": 11260 + }, + { + "epoch": 10.805369127516778, + "grad_norm": 0.2906304895877838, + "learning_rate": 3.883395971518138e-05, + "loss": 0.0293, + "step": 11270 + }, + { + "epoch": 10.814956855225311, + "grad_norm": 0.19408248364925385, + "learning_rate": 3.874914975288575e-05, + "loss": 0.0338, + "step": 11280 + }, + { + "epoch": 10.824544582933845, + "grad_norm": 0.9713996052742004, + "learning_rate": 3.8664373873021356e-05, + "loss": 0.0367, + "step": 11290 + }, + { + "epoch": 10.834132310642378, + "grad_norm": 0.43305110931396484, + "learning_rate": 3.857963233240153e-05, + "loss": 0.0409, + "step": 11300 + }, + { + "epoch": 10.843720038350911, + "grad_norm": 0.4623974859714508, + "learning_rate": 3.849492538773552e-05, + "loss": 0.0322, + "step": 11310 + }, + { + "epoch": 10.853307766059444, + "grad_norm": 0.13911698758602142, + "learning_rate": 3.841025329562789e-05, + "loss": 0.0371, + "step": 11320 + }, + { + "epoch": 10.862895493767978, + "grad_norm": 0.40783533453941345, + "learning_rate": 3.832561631257748e-05, + "loss": 0.0334, + "step": 11330 + }, + { + "epoch": 10.87248322147651, + "grad_norm": 0.2820438742637634, + "learning_rate": 3.824101469497685e-05, + "loss": 0.0357, + "step": 11340 + }, + { + "epoch": 10.882070949185042, + "grad_norm": 0.2518521547317505, + "learning_rate": 3.8156448699111414e-05, + "loss": 0.0398, + "step": 11350 + }, + { + "epoch": 10.891658676893575, + "grad_norm": 0.22868366539478302, + "learning_rate": 3.80719185811587e-05, + "loss": 0.0329, + "step": 11360 + }, + { + "epoch": 10.901246404602109, + "grad_norm": 0.28649628162384033, + "learning_rate": 3.79874245971875e-05, + "loss": 0.0362, + "step": 11370 + }, + { + "epoch": 10.910834132310642, + "grad_norm": 0.2933325171470642, + "learning_rate": 3.790296700315717e-05, + "loss": 0.0322, + "step": 11380 + }, + { + "epoch": 10.920421860019175, + "grad_norm": 0.34184950590133667, + "learning_rate": 3.781854605491684e-05, + "loss": 0.034, + "step": 11390 + }, + { + "epoch": 10.930009587727708, + "grad_norm": 0.26722094416618347, + "learning_rate": 3.773416200820463e-05, + "loss": 0.0369, + "step": 11400 + }, + { + "epoch": 10.939597315436242, + "grad_norm": 0.22674645483493805, + "learning_rate": 3.764981511864686e-05, + "loss": 0.0349, + "step": 11410 + }, + { + "epoch": 10.949185043144775, + "grad_norm": 0.6623883843421936, + "learning_rate": 3.756550564175727e-05, + "loss": 0.0331, + "step": 11420 + }, + { + "epoch": 10.958772770853308, + "grad_norm": 0.3025140166282654, + "learning_rate": 3.748123383293629e-05, + "loss": 0.0364, + "step": 11430 + }, + { + "epoch": 10.968360498561841, + "grad_norm": 0.2423921674489975, + "learning_rate": 3.739699994747026e-05, + "loss": 0.0305, + "step": 11440 + }, + { + "epoch": 10.977948226270374, + "grad_norm": 0.2216835469007492, + "learning_rate": 3.731280424053061e-05, + "loss": 0.0338, + "step": 11450 + }, + { + "epoch": 10.987535953978908, + "grad_norm": 0.4063700735569, + "learning_rate": 3.7228646967173096e-05, + "loss": 0.0437, + "step": 11460 + }, + { + "epoch": 10.997123681687441, + "grad_norm": 0.21180011332035065, + "learning_rate": 3.7144528382337086e-05, + "loss": 0.0362, + "step": 11470 + }, + { + "epoch": 11.006711409395972, + "grad_norm": 0.22706526517868042, + "learning_rate": 3.706044874084474e-05, + "loss": 0.0343, + "step": 11480 + }, + { + "epoch": 11.016299137104506, + "grad_norm": 0.3348940908908844, + "learning_rate": 3.6976408297400257e-05, + "loss": 0.0344, + "step": 11490 + }, + { + "epoch": 11.025886864813039, + "grad_norm": 0.21291491389274597, + "learning_rate": 3.6892407306589035e-05, + "loss": 0.0329, + "step": 11500 + }, + { + "epoch": 11.035474592521572, + "grad_norm": 0.3505829870700836, + "learning_rate": 3.6808446022877e-05, + "loss": 0.0339, + "step": 11510 + }, + { + "epoch": 11.045062320230105, + "grad_norm": 0.36319780349731445, + "learning_rate": 3.672452470060982e-05, + "loss": 0.0338, + "step": 11520 + }, + { + "epoch": 11.054650047938638, + "grad_norm": 0.3714457154273987, + "learning_rate": 3.6640643594012057e-05, + "loss": 0.0419, + "step": 11530 + }, + { + "epoch": 11.064237775647172, + "grad_norm": 0.27974534034729004, + "learning_rate": 3.6556802957186486e-05, + "loss": 0.0359, + "step": 11540 + }, + { + "epoch": 11.073825503355705, + "grad_norm": 0.34719452261924744, + "learning_rate": 3.647300304411323e-05, + "loss": 0.0367, + "step": 11550 + }, + { + "epoch": 11.083413231064238, + "grad_norm": 0.24294276535511017, + "learning_rate": 3.6389244108649114e-05, + "loss": 0.0316, + "step": 11560 + }, + { + "epoch": 11.093000958772771, + "grad_norm": 0.3280002474784851, + "learning_rate": 3.6305526404526785e-05, + "loss": 0.0315, + "step": 11570 + }, + { + "epoch": 11.102588686481305, + "grad_norm": 0.25797387957572937, + "learning_rate": 3.6221850185354014e-05, + "loss": 0.0306, + "step": 11580 + }, + { + "epoch": 11.112176414189838, + "grad_norm": 0.2705564498901367, + "learning_rate": 3.613821570461284e-05, + "loss": 0.0333, + "step": 11590 + }, + { + "epoch": 11.12176414189837, + "grad_norm": 0.2857078015804291, + "learning_rate": 3.605462321565899e-05, + "loss": 0.0329, + "step": 11600 + }, + { + "epoch": 11.131351869606902, + "grad_norm": 0.23920407891273499, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0366, + "step": 11610 + }, + { + "epoch": 11.140939597315436, + "grad_norm": 0.31336209177970886, + "learning_rate": 3.588756522589888e-05, + "loss": 0.03, + "step": 11620 + }, + { + "epoch": 11.150527325023969, + "grad_norm": 0.2026471495628357, + "learning_rate": 3.5804100231164824e-05, + "loss": 0.0328, + "step": 11630 + }, + { + "epoch": 11.160115052732502, + "grad_norm": 0.166408970952034, + "learning_rate": 3.572067824036092e-05, + "loss": 0.0357, + "step": 11640 + }, + { + "epoch": 11.169702780441035, + "grad_norm": 0.24978677928447723, + "learning_rate": 3.5637299506199075e-05, + "loss": 0.0289, + "step": 11650 + }, + { + "epoch": 11.179290508149569, + "grad_norm": 0.36853691935539246, + "learning_rate": 3.5553964281260225e-05, + "loss": 0.036, + "step": 11660 + }, + { + "epoch": 11.188878235858102, + "grad_norm": 0.31218189001083374, + "learning_rate": 3.547067281799345e-05, + "loss": 0.0327, + "step": 11670 + }, + { + "epoch": 11.198465963566635, + "grad_norm": 0.2616768777370453, + "learning_rate": 3.538742536871531e-05, + "loss": 0.0378, + "step": 11680 + }, + { + "epoch": 11.208053691275168, + "grad_norm": 0.3586946725845337, + "learning_rate": 3.530422218560903e-05, + "loss": 0.0378, + "step": 11690 + }, + { + "epoch": 11.217641418983701, + "grad_norm": 0.1958369016647339, + "learning_rate": 3.522106352072366e-05, + "loss": 0.0368, + "step": 11700 + }, + { + "epoch": 11.227229146692235, + "grad_norm": 0.30349719524383545, + "learning_rate": 3.5137949625973484e-05, + "loss": 0.0396, + "step": 11710 + }, + { + "epoch": 11.236816874400766, + "grad_norm": 0.22439143061637878, + "learning_rate": 3.505488075313712e-05, + "loss": 0.0275, + "step": 11720 + }, + { + "epoch": 11.2464046021093, + "grad_norm": 0.3639642596244812, + "learning_rate": 3.4971857153856825e-05, + "loss": 0.03, + "step": 11730 + }, + { + "epoch": 11.255992329817833, + "grad_norm": 0.19874945282936096, + "learning_rate": 3.488887907963766e-05, + "loss": 0.0341, + "step": 11740 + }, + { + "epoch": 11.265580057526366, + "grad_norm": 0.6180244088172913, + "learning_rate": 3.480594678184681e-05, + "loss": 0.0346, + "step": 11750 + }, + { + "epoch": 11.275167785234899, + "grad_norm": 0.27457571029663086, + "learning_rate": 3.472306051171281e-05, + "loss": 0.0359, + "step": 11760 + }, + { + "epoch": 11.284755512943432, + "grad_norm": 0.18931525945663452, + "learning_rate": 3.464022052032473e-05, + "loss": 0.0311, + "step": 11770 + }, + { + "epoch": 11.294343240651965, + "grad_norm": 0.2550256848335266, + "learning_rate": 3.455742705863143e-05, + "loss": 0.0346, + "step": 11780 + }, + { + "epoch": 11.303930968360499, + "grad_norm": 0.21088473498821259, + "learning_rate": 3.447468037744084e-05, + "loss": 0.0295, + "step": 11790 + }, + { + "epoch": 11.313518696069032, + "grad_norm": 0.25027552247047424, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0375, + "step": 11800 + }, + { + "epoch": 11.323106423777565, + "grad_norm": 0.5064207315444946, + "learning_rate": 3.4309328359090264e-05, + "loss": 0.0332, + "step": 11810 + }, + { + "epoch": 11.332694151486098, + "grad_norm": 0.2110755443572998, + "learning_rate": 3.422672352283453e-05, + "loss": 0.0351, + "step": 11820 + }, + { + "epoch": 11.342281879194632, + "grad_norm": 0.27771392464637756, + "learning_rate": 3.41441664688885e-05, + "loss": 0.0383, + "step": 11830 + }, + { + "epoch": 11.351869606903165, + "grad_norm": 0.34242868423461914, + "learning_rate": 3.406165744734397e-05, + "loss": 0.0298, + "step": 11840 + }, + { + "epoch": 11.361457334611696, + "grad_norm": 0.3390040099620819, + "learning_rate": 3.397919670814723e-05, + "loss": 0.0377, + "step": 11850 + }, + { + "epoch": 11.37104506232023, + "grad_norm": 0.15492115914821625, + "learning_rate": 3.389678450109827e-05, + "loss": 0.0403, + "step": 11860 + }, + { + "epoch": 11.380632790028763, + "grad_norm": 0.3101263642311096, + "learning_rate": 3.3814421075850035e-05, + "loss": 0.0362, + "step": 11870 + }, + { + "epoch": 11.390220517737296, + "grad_norm": 0.2800522446632385, + "learning_rate": 3.3732106681907816e-05, + "loss": 0.032, + "step": 11880 + }, + { + "epoch": 11.39980824544583, + "grad_norm": 0.26244333386421204, + "learning_rate": 3.364984156862825e-05, + "loss": 0.0307, + "step": 11890 + }, + { + "epoch": 11.409395973154362, + "grad_norm": 0.48606979846954346, + "learning_rate": 3.356762598521874e-05, + "loss": 0.0335, + "step": 11900 + }, + { + "epoch": 11.418983700862896, + "grad_norm": 0.5852661728858948, + "learning_rate": 3.348546018073662e-05, + "loss": 0.0433, + "step": 11910 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.252837598323822, + "learning_rate": 3.340334440408846e-05, + "loss": 0.0257, + "step": 11920 + }, + { + "epoch": 11.438159156279962, + "grad_norm": 0.2573808431625366, + "learning_rate": 3.332127890402926e-05, + "loss": 0.0331, + "step": 11930 + }, + { + "epoch": 11.447746883988495, + "grad_norm": 0.25154879689216614, + "learning_rate": 3.3239263929161734e-05, + "loss": 0.0389, + "step": 11940 + }, + { + "epoch": 11.457334611697028, + "grad_norm": 0.2564004957675934, + "learning_rate": 3.315729972793553e-05, + "loss": 0.0386, + "step": 11950 + }, + { + "epoch": 11.466922339405562, + "grad_norm": 0.45886269211769104, + "learning_rate": 3.307538654864645e-05, + "loss": 0.0365, + "step": 11960 + }, + { + "epoch": 11.476510067114093, + "grad_norm": 0.157767191529274, + "learning_rate": 3.29935246394358e-05, + "loss": 0.0356, + "step": 11970 + }, + { + "epoch": 11.486097794822626, + "grad_norm": 0.3403734564781189, + "learning_rate": 3.2911714248289525e-05, + "loss": 0.0335, + "step": 11980 + }, + { + "epoch": 11.49568552253116, + "grad_norm": 0.207637757062912, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0291, + "step": 11990 + }, + { + "epoch": 11.505273250239693, + "grad_norm": 0.2571353614330292, + "learning_rate": 3.2748249011352864e-05, + "loss": 0.031, + "step": 12000 + }, + { + "epoch": 11.514860977948226, + "grad_norm": 0.29838424921035767, + "learning_rate": 3.266659466075108e-05, + "loss": 0.0312, + "step": 12010 + }, + { + "epoch": 11.52444870565676, + "grad_norm": 0.35853999853134155, + "learning_rate": 3.258499281858936e-05, + "loss": 0.0349, + "step": 12020 + }, + { + "epoch": 11.534036433365292, + "grad_norm": 0.22435715794563293, + "learning_rate": 3.250344373206584e-05, + "loss": 0.0321, + "step": 12030 + }, + { + "epoch": 11.543624161073826, + "grad_norm": 0.364653617143631, + "learning_rate": 3.242194764821881e-05, + "loss": 0.0291, + "step": 12040 + }, + { + "epoch": 11.553211888782359, + "grad_norm": 0.20518454909324646, + "learning_rate": 3.2340504813926086e-05, + "loss": 0.0335, + "step": 12050 + }, + { + "epoch": 11.562799616490892, + "grad_norm": 0.3099921941757202, + "learning_rate": 3.2259115475904064e-05, + "loss": 0.036, + "step": 12060 + }, + { + "epoch": 11.572387344199425, + "grad_norm": 0.40152508020401, + "learning_rate": 3.217777988070715e-05, + "loss": 0.0377, + "step": 12070 + }, + { + "epoch": 11.581975071907959, + "grad_norm": 0.2941493093967438, + "learning_rate": 3.2096498274726925e-05, + "loss": 0.0304, + "step": 12080 + }, + { + "epoch": 11.59156279961649, + "grad_norm": 0.1939501017332077, + "learning_rate": 3.201527090419144e-05, + "loss": 0.0309, + "step": 12090 + }, + { + "epoch": 11.601150527325023, + "grad_norm": 0.28782132267951965, + "learning_rate": 3.193409801516443e-05, + "loss": 0.0368, + "step": 12100 + }, + { + "epoch": 11.610738255033556, + "grad_norm": 0.22255367040634155, + "learning_rate": 3.1852979853544575e-05, + "loss": 0.034, + "step": 12110 + }, + { + "epoch": 11.62032598274209, + "grad_norm": 0.24580125510692596, + "learning_rate": 3.177191666506479e-05, + "loss": 0.0316, + "step": 12120 + }, + { + "epoch": 11.629913710450623, + "grad_norm": 0.16919176280498505, + "learning_rate": 3.169090869529146e-05, + "loss": 0.032, + "step": 12130 + }, + { + "epoch": 11.639501438159156, + "grad_norm": 0.16586647927761078, + "learning_rate": 3.1609956189623704e-05, + "loss": 0.0318, + "step": 12140 + }, + { + "epoch": 11.64908916586769, + "grad_norm": 0.25521326065063477, + "learning_rate": 3.1529059393292573e-05, + "loss": 0.0339, + "step": 12150 + }, + { + "epoch": 11.658676893576223, + "grad_norm": 0.40948987007141113, + "learning_rate": 3.1448218551360394e-05, + "loss": 0.0417, + "step": 12160 + }, + { + "epoch": 11.668264621284756, + "grad_norm": 0.2603534460067749, + "learning_rate": 3.136743390872001e-05, + "loss": 0.0332, + "step": 12170 + }, + { + "epoch": 11.677852348993289, + "grad_norm": 0.24372372031211853, + "learning_rate": 3.128670571009399e-05, + "loss": 0.0325, + "step": 12180 + }, + { + "epoch": 11.687440076701822, + "grad_norm": 0.18494637310504913, + "learning_rate": 3.1206034200033904e-05, + "loss": 0.0324, + "step": 12190 + }, + { + "epoch": 11.697027804410356, + "grad_norm": 0.3946174681186676, + "learning_rate": 3.1125419622919614e-05, + "loss": 0.0327, + "step": 12200 + }, + { + "epoch": 11.706615532118889, + "grad_norm": 0.5735461115837097, + "learning_rate": 3.104486222295853e-05, + "loss": 0.0294, + "step": 12210 + }, + { + "epoch": 11.71620325982742, + "grad_norm": 0.25579607486724854, + "learning_rate": 3.096436224418482e-05, + "loss": 0.0347, + "step": 12220 + }, + { + "epoch": 11.725790987535953, + "grad_norm": 0.40547341108322144, + "learning_rate": 3.088391993045873e-05, + "loss": 0.037, + "step": 12230 + }, + { + "epoch": 11.735378715244487, + "grad_norm": 0.3765973746776581, + "learning_rate": 3.080353552546578e-05, + "loss": 0.0307, + "step": 12240 + }, + { + "epoch": 11.74496644295302, + "grad_norm": 0.40163904428482056, + "learning_rate": 3.0723209272716124e-05, + "loss": 0.0295, + "step": 12250 + }, + { + "epoch": 11.754554170661553, + "grad_norm": 0.3667445182800293, + "learning_rate": 3.064294141554372e-05, + "loss": 0.0328, + "step": 12260 + }, + { + "epoch": 11.764141898370086, + "grad_norm": 0.22410856187343597, + "learning_rate": 3.056273219710565e-05, + "loss": 0.0355, + "step": 12270 + }, + { + "epoch": 11.77372962607862, + "grad_norm": 0.278154581785202, + "learning_rate": 3.048258186038129e-05, + "loss": 0.038, + "step": 12280 + }, + { + "epoch": 11.783317353787153, + "grad_norm": 0.4203621745109558, + "learning_rate": 3.040249064817176e-05, + "loss": 0.0338, + "step": 12290 + }, + { + "epoch": 11.792905081495686, + "grad_norm": 0.29441940784454346, + "learning_rate": 3.0322458803098973e-05, + "loss": 0.027, + "step": 12300 + }, + { + "epoch": 11.80249280920422, + "grad_norm": 0.2775827646255493, + "learning_rate": 3.0242486567605068e-05, + "loss": 0.031, + "step": 12310 + }, + { + "epoch": 11.812080536912752, + "grad_norm": 0.38520553708076477, + "learning_rate": 3.016257418395152e-05, + "loss": 0.0333, + "step": 12320 + }, + { + "epoch": 11.821668264621284, + "grad_norm": 0.26599544286727905, + "learning_rate": 3.008272189421861e-05, + "loss": 0.0301, + "step": 12330 + }, + { + "epoch": 11.831255992329817, + "grad_norm": 0.22733962535858154, + "learning_rate": 3.0002929940304498e-05, + "loss": 0.0298, + "step": 12340 + }, + { + "epoch": 11.84084372003835, + "grad_norm": 0.27661770582199097, + "learning_rate": 2.992319856392457e-05, + "loss": 0.0342, + "step": 12350 + }, + { + "epoch": 11.850431447746884, + "grad_norm": 0.26731380820274353, + "learning_rate": 2.9843528006610733e-05, + "loss": 0.0295, + "step": 12360 + }, + { + "epoch": 11.860019175455417, + "grad_norm": 0.3973303437232971, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0301, + "step": 12370 + }, + { + "epoch": 11.86960690316395, + "grad_norm": 0.3120301067829132, + "learning_rate": 2.968437031438698e-05, + "loss": 0.0348, + "step": 12380 + }, + { + "epoch": 11.879194630872483, + "grad_norm": 0.2932593524456024, + "learning_rate": 2.9604883661616702e-05, + "loss": 0.0308, + "step": 12390 + }, + { + "epoch": 11.888782358581016, + "grad_norm": 0.2067721039056778, + "learning_rate": 2.9525458792190365e-05, + "loss": 0.0323, + "step": 12400 + }, + { + "epoch": 11.89837008628955, + "grad_norm": 0.30877119302749634, + "learning_rate": 2.9446095946711367e-05, + "loss": 0.0336, + "step": 12410 + }, + { + "epoch": 11.907957813998083, + "grad_norm": 0.1372332125902176, + "learning_rate": 2.93667953655952e-05, + "loss": 0.0341, + "step": 12420 + }, + { + "epoch": 11.917545541706616, + "grad_norm": 0.2722005844116211, + "learning_rate": 2.9287557289068736e-05, + "loss": 0.0347, + "step": 12430 + }, + { + "epoch": 11.92713326941515, + "grad_norm": 0.35675281286239624, + "learning_rate": 2.9208381957169485e-05, + "loss": 0.0354, + "step": 12440 + }, + { + "epoch": 11.936720997123683, + "grad_norm": 0.4129658639431, + "learning_rate": 2.9129269609744935e-05, + "loss": 0.0235, + "step": 12450 + }, + { + "epoch": 11.946308724832214, + "grad_norm": 0.23059901595115662, + "learning_rate": 2.905022048645172e-05, + "loss": 0.0361, + "step": 12460 + }, + { + "epoch": 11.955896452540747, + "grad_norm": 0.20640157163143158, + "learning_rate": 2.8971234826754983e-05, + "loss": 0.0306, + "step": 12470 + }, + { + "epoch": 11.96548418024928, + "grad_norm": 0.27325066924095154, + "learning_rate": 2.8892312869927578e-05, + "loss": 0.033, + "step": 12480 + }, + { + "epoch": 11.975071907957814, + "grad_norm": 0.2237732708454132, + "learning_rate": 2.881345485504945e-05, + "loss": 0.0309, + "step": 12490 + }, + { + "epoch": 11.984659635666347, + "grad_norm": 0.2271834760904312, + "learning_rate": 2.8734661021006747e-05, + "loss": 0.0267, + "step": 12500 + }, + { + "epoch": 11.99424736337488, + "grad_norm": 0.27549734711647034, + "learning_rate": 2.8655931606491294e-05, + "loss": 0.0338, + "step": 12510 + }, + { + "epoch": 12.003835091083413, + "grad_norm": 0.19603657722473145, + "learning_rate": 2.8577266849999672e-05, + "loss": 0.0303, + "step": 12520 + }, + { + "epoch": 12.013422818791947, + "grad_norm": 0.1858394742012024, + "learning_rate": 2.849866698983267e-05, + "loss": 0.0255, + "step": 12530 + }, + { + "epoch": 12.02301054650048, + "grad_norm": 0.17287525534629822, + "learning_rate": 2.8420132264094468e-05, + "loss": 0.0297, + "step": 12540 + }, + { + "epoch": 12.032598274209013, + "grad_norm": 0.32775846123695374, + "learning_rate": 2.83416629106919e-05, + "loss": 0.0345, + "step": 12550 + }, + { + "epoch": 12.042186001917546, + "grad_norm": 0.17536644637584686, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0286, + "step": 12560 + }, + { + "epoch": 12.05177372962608, + "grad_norm": 0.18874387443065643, + "learning_rate": 2.818492127153018e-05, + "loss": 0.0293, + "step": 12570 + }, + { + "epoch": 12.06136145733461, + "grad_norm": 0.1686885803937912, + "learning_rate": 2.8106649460591716e-05, + "loss": 0.0302, + "step": 12580 + }, + { + "epoch": 12.070949185043144, + "grad_norm": 0.14021116495132446, + "learning_rate": 2.802844397162877e-05, + "loss": 0.0321, + "step": 12590 + }, + { + "epoch": 12.080536912751677, + "grad_norm": 0.32412388920783997, + "learning_rate": 2.7950305041550818e-05, + "loss": 0.0337, + "step": 12600 + }, + { + "epoch": 12.09012464046021, + "grad_norm": 0.2775496244430542, + "learning_rate": 2.7872232907065738e-05, + "loss": 0.0348, + "step": 12610 + }, + { + "epoch": 12.099712368168744, + "grad_norm": 0.20718041062355042, + "learning_rate": 2.7794227804679063e-05, + "loss": 0.0318, + "step": 12620 + }, + { + "epoch": 12.109300095877277, + "grad_norm": 0.14198093116283417, + "learning_rate": 2.7716289970693236e-05, + "loss": 0.0285, + "step": 12630 + }, + { + "epoch": 12.11888782358581, + "grad_norm": 0.23473426699638367, + "learning_rate": 2.7638419641206914e-05, + "loss": 0.0311, + "step": 12640 + }, + { + "epoch": 12.128475551294343, + "grad_norm": 0.22687584161758423, + "learning_rate": 2.7560617052114297e-05, + "loss": 0.0265, + "step": 12650 + }, + { + "epoch": 12.138063279002877, + "grad_norm": 0.22875012457370758, + "learning_rate": 2.7482882439104385e-05, + "loss": 0.0324, + "step": 12660 + }, + { + "epoch": 12.14765100671141, + "grad_norm": 0.2869175970554352, + "learning_rate": 2.740521603766022e-05, + "loss": 0.0343, + "step": 12670 + }, + { + "epoch": 12.157238734419943, + "grad_norm": 0.24454490840435028, + "learning_rate": 2.7327618083058192e-05, + "loss": 0.0354, + "step": 12680 + }, + { + "epoch": 12.166826462128476, + "grad_norm": 0.26888319849967957, + "learning_rate": 2.7250088810367404e-05, + "loss": 0.0317, + "step": 12690 + }, + { + "epoch": 12.176414189837008, + "grad_norm": 0.2190038412809372, + "learning_rate": 2.7172628454448888e-05, + "loss": 0.0394, + "step": 12700 + }, + { + "epoch": 12.186001917545541, + "grad_norm": 0.1673816591501236, + "learning_rate": 2.7095237249954875e-05, + "loss": 0.0272, + "step": 12710 + }, + { + "epoch": 12.195589645254074, + "grad_norm": 0.32721394300460815, + "learning_rate": 2.7017915431328078e-05, + "loss": 0.0341, + "step": 12720 + }, + { + "epoch": 12.205177372962607, + "grad_norm": 0.2936406135559082, + "learning_rate": 2.6940663232801144e-05, + "loss": 0.0294, + "step": 12730 + }, + { + "epoch": 12.21476510067114, + "grad_norm": 3.8611295223236084, + "learning_rate": 2.6863480888395714e-05, + "loss": 0.0293, + "step": 12740 + }, + { + "epoch": 12.224352828379674, + "grad_norm": 0.16587217152118683, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.03, + "step": 12750 + }, + { + "epoch": 12.233940556088207, + "grad_norm": 0.5451092720031738, + "learning_rate": 2.6709326696977215e-05, + "loss": 0.0325, + "step": 12760 + }, + { + "epoch": 12.24352828379674, + "grad_norm": 0.20002365112304688, + "learning_rate": 2.6632355316946643e-05, + "loss": 0.0255, + "step": 12770 + }, + { + "epoch": 12.253116011505274, + "grad_norm": 0.8898112773895264, + "learning_rate": 2.655545472500105e-05, + "loss": 0.0348, + "step": 12780 + }, + { + "epoch": 12.262703739213807, + "grad_norm": 0.3279706835746765, + "learning_rate": 2.647862515409697e-05, + "loss": 0.0259, + "step": 12790 + }, + { + "epoch": 12.27229146692234, + "grad_norm": 0.2899661958217621, + "learning_rate": 2.6401866836975795e-05, + "loss": 0.0375, + "step": 12800 + }, + { + "epoch": 12.281879194630873, + "grad_norm": 0.2332329899072647, + "learning_rate": 2.632518000616312e-05, + "loss": 0.0319, + "step": 12810 + }, + { + "epoch": 12.291466922339406, + "grad_norm": 0.23844292759895325, + "learning_rate": 2.6248564893967886e-05, + "loss": 0.0344, + "step": 12820 + }, + { + "epoch": 12.301054650047938, + "grad_norm": 0.20757047832012177, + "learning_rate": 2.617202173248181e-05, + "loss": 0.0365, + "step": 12830 + }, + { + "epoch": 12.310642377756471, + "grad_norm": 0.23326794803142548, + "learning_rate": 2.609555075357869e-05, + "loss": 0.0385, + "step": 12840 + }, + { + "epoch": 12.320230105465004, + "grad_norm": 0.20900526642799377, + "learning_rate": 2.6019152188913638e-05, + "loss": 0.0333, + "step": 12850 + }, + { + "epoch": 12.329817833173538, + "grad_norm": 0.2453479766845703, + "learning_rate": 2.5942826269922376e-05, + "loss": 0.0317, + "step": 12860 + }, + { + "epoch": 12.33940556088207, + "grad_norm": 0.45544683933258057, + "learning_rate": 2.5866573227820557e-05, + "loss": 0.0299, + "step": 12870 + }, + { + "epoch": 12.348993288590604, + "grad_norm": 0.31227871775627136, + "learning_rate": 2.5790393293603097e-05, + "loss": 0.029, + "step": 12880 + }, + { + "epoch": 12.358581016299137, + "grad_norm": 0.32639333605766296, + "learning_rate": 2.571428669804346e-05, + "loss": 0.0323, + "step": 12890 + }, + { + "epoch": 12.36816874400767, + "grad_norm": 0.3351771831512451, + "learning_rate": 2.563825367169289e-05, + "loss": 0.0304, + "step": 12900 + }, + { + "epoch": 12.377756471716204, + "grad_norm": 0.47458702325820923, + "learning_rate": 2.5562294444879787e-05, + "loss": 0.03, + "step": 12910 + }, + { + "epoch": 12.387344199424737, + "grad_norm": 0.2465980499982834, + "learning_rate": 2.5486409247708987e-05, + "loss": 0.0378, + "step": 12920 + }, + { + "epoch": 12.39693192713327, + "grad_norm": 0.42310255765914917, + "learning_rate": 2.5410598310061118e-05, + "loss": 0.0323, + "step": 12930 + }, + { + "epoch": 12.406519654841803, + "grad_norm": 1.066576361656189, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0347, + "step": 12940 + }, + { + "epoch": 12.416107382550335, + "grad_norm": 0.24553652107715607, + "learning_rate": 2.525920013173091e-05, + "loss": 0.0288, + "step": 12950 + }, + { + "epoch": 12.425695110258868, + "grad_norm": 0.17061471939086914, + "learning_rate": 2.51836133496822e-05, + "loss": 0.0293, + "step": 12960 + }, + { + "epoch": 12.435282837967401, + "grad_norm": 0.2702957093715668, + "learning_rate": 2.5108101744422197e-05, + "loss": 0.0337, + "step": 12970 + }, + { + "epoch": 12.444870565675934, + "grad_norm": 0.2967221736907959, + "learning_rate": 2.5032665544699762e-05, + "loss": 0.0388, + "step": 12980 + }, + { + "epoch": 12.454458293384468, + "grad_norm": 0.18429528176784515, + "learning_rate": 2.495730497903535e-05, + "loss": 0.0339, + "step": 12990 + }, + { + "epoch": 12.464046021093, + "grad_norm": 0.4446472227573395, + "learning_rate": 2.4882020275720247e-05, + "loss": 0.0297, + "step": 13000 + }, + { + "epoch": 12.473633748801534, + "grad_norm": 0.2481614649295807, + "learning_rate": 2.480681166281592e-05, + "loss": 0.0332, + "step": 13010 + }, + { + "epoch": 12.483221476510067, + "grad_norm": 0.4030400216579437, + "learning_rate": 2.4731679368153392e-05, + "loss": 0.0386, + "step": 13020 + }, + { + "epoch": 12.4928092042186, + "grad_norm": 0.20716169476509094, + "learning_rate": 2.4656623619332476e-05, + "loss": 0.0289, + "step": 13030 + }, + { + "epoch": 12.502396931927134, + "grad_norm": 0.18714624643325806, + "learning_rate": 2.4581644643721075e-05, + "loss": 0.0257, + "step": 13040 + }, + { + "epoch": 12.511984659635667, + "grad_norm": 0.2566820979118347, + "learning_rate": 2.4506742668454514e-05, + "loss": 0.0267, + "step": 13050 + }, + { + "epoch": 12.5215723873442, + "grad_norm": 0.237356036901474, + "learning_rate": 2.44319179204349e-05, + "loss": 0.0317, + "step": 13060 + }, + { + "epoch": 12.531160115052732, + "grad_norm": 0.29655054211616516, + "learning_rate": 2.4357170626330394e-05, + "loss": 0.0328, + "step": 13070 + }, + { + "epoch": 12.540747842761265, + "grad_norm": 0.29281550645828247, + "learning_rate": 2.4282501012574495e-05, + "loss": 0.0295, + "step": 13080 + }, + { + "epoch": 12.550335570469798, + "grad_norm": 0.477317750453949, + "learning_rate": 2.4207909305365363e-05, + "loss": 0.0353, + "step": 13090 + }, + { + "epoch": 12.559923298178331, + "grad_norm": 0.2606201767921448, + "learning_rate": 2.4133395730665214e-05, + "loss": 0.0288, + "step": 13100 + }, + { + "epoch": 12.569511025886865, + "grad_norm": 0.18180538713932037, + "learning_rate": 2.405896051419957e-05, + "loss": 0.0349, + "step": 13110 + }, + { + "epoch": 12.579098753595398, + "grad_norm": 0.3665505349636078, + "learning_rate": 2.398460388145653e-05, + "loss": 0.0321, + "step": 13120 + }, + { + "epoch": 12.588686481303931, + "grad_norm": 0.28408095240592957, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 12.598274209012464, + "grad_norm": 0.19122740626335144, + "learning_rate": 2.3836127267899778e-05, + "loss": 0.0299, + "step": 13140 + }, + { + "epoch": 12.607861936720997, + "grad_norm": 0.18212218582630157, + "learning_rate": 2.3762007736869353e-05, + "loss": 0.0328, + "step": 13150 + }, + { + "epoch": 12.61744966442953, + "grad_norm": 0.33118176460266113, + "learning_rate": 2.3687967689126667e-05, + "loss": 0.0291, + "step": 13160 + }, + { + "epoch": 12.627037392138064, + "grad_norm": 0.43079885840415955, + "learning_rate": 2.3614007348962724e-05, + "loss": 0.0303, + "step": 13170 + }, + { + "epoch": 12.636625119846597, + "grad_norm": 0.21110649406909943, + "learning_rate": 2.3540126940427166e-05, + "loss": 0.0334, + "step": 13180 + }, + { + "epoch": 12.64621284755513, + "grad_norm": 0.18830737471580505, + "learning_rate": 2.3466326687327396e-05, + "loss": 0.0316, + "step": 13190 + }, + { + "epoch": 12.655800575263662, + "grad_norm": 0.33135518431663513, + "learning_rate": 2.3392606813228008e-05, + "loss": 0.0375, + "step": 13200 + }, + { + "epoch": 12.665388302972195, + "grad_norm": 0.2647267282009125, + "learning_rate": 2.3318967541450153e-05, + "loss": 0.0294, + "step": 13210 + }, + { + "epoch": 12.674976030680728, + "grad_norm": 0.2796458303928375, + "learning_rate": 2.3245409095070803e-05, + "loss": 0.0282, + "step": 13220 + }, + { + "epoch": 12.684563758389261, + "grad_norm": 0.31999823451042175, + "learning_rate": 2.317193169692205e-05, + "loss": 0.0363, + "step": 13230 + }, + { + "epoch": 12.694151486097795, + "grad_norm": 0.21032322943210602, + "learning_rate": 2.3098535569590458e-05, + "loss": 0.0341, + "step": 13240 + }, + { + "epoch": 12.703739213806328, + "grad_norm": 0.31383687257766724, + "learning_rate": 2.3025220935416447e-05, + "loss": 0.0301, + "step": 13250 + }, + { + "epoch": 12.713326941514861, + "grad_norm": 0.4095149040222168, + "learning_rate": 2.2951988016493548e-05, + "loss": 0.036, + "step": 13260 + }, + { + "epoch": 12.722914669223394, + "grad_norm": 0.21426613628864288, + "learning_rate": 2.2878837034667737e-05, + "loss": 0.0346, + "step": 13270 + }, + { + "epoch": 12.732502396931928, + "grad_norm": 0.312098890542984, + "learning_rate": 2.2805768211536758e-05, + "loss": 0.0342, + "step": 13280 + }, + { + "epoch": 12.74209012464046, + "grad_norm": 0.2564839720726013, + "learning_rate": 2.273278176844951e-05, + "loss": 0.0323, + "step": 13290 + }, + { + "epoch": 12.751677852348994, + "grad_norm": 0.314685583114624, + "learning_rate": 2.2659877926505353e-05, + "loss": 0.0382, + "step": 13300 + }, + { + "epoch": 12.761265580057525, + "grad_norm": 0.1301986277103424, + "learning_rate": 2.2587056906553348e-05, + "loss": 0.034, + "step": 13310 + }, + { + "epoch": 12.770853307766059, + "grad_norm": 0.23595231771469116, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0293, + "step": 13320 + }, + { + "epoch": 12.780441035474592, + "grad_norm": 0.23706960678100586, + "learning_rate": 2.2441664214767085e-05, + "loss": 0.0355, + "step": 13330 + }, + { + "epoch": 12.790028763183125, + "grad_norm": 0.20160214602947235, + "learning_rate": 2.2369092983373912e-05, + "loss": 0.0315, + "step": 13340 + }, + { + "epoch": 12.799616490891658, + "grad_norm": 0.1787547618150711, + "learning_rate": 2.2296605454853673e-05, + "loss": 0.0314, + "step": 13350 + }, + { + "epoch": 12.809204218600192, + "grad_norm": 0.36770564317703247, + "learning_rate": 2.222420184879437e-05, + "loss": 0.0372, + "step": 13360 + }, + { + "epoch": 12.818791946308725, + "grad_norm": 0.3025970160961151, + "learning_rate": 2.2151882384529683e-05, + "loss": 0.0255, + "step": 13370 + }, + { + "epoch": 12.828379674017258, + "grad_norm": 0.25169727206230164, + "learning_rate": 2.207964728113848e-05, + "loss": 0.0269, + "step": 13380 + }, + { + "epoch": 12.837967401725791, + "grad_norm": 0.37031155824661255, + "learning_rate": 2.200749675744402e-05, + "loss": 0.0293, + "step": 13390 + }, + { + "epoch": 12.847555129434324, + "grad_norm": 0.21579872071743011, + "learning_rate": 2.1935431032013388e-05, + "loss": 0.0302, + "step": 13400 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 0.20838379859924316, + "learning_rate": 2.1863450323156725e-05, + "loss": 0.034, + "step": 13410 + }, + { + "epoch": 12.86673058485139, + "grad_norm": 0.2365337610244751, + "learning_rate": 2.179155484892671e-05, + "loss": 0.0321, + "step": 13420 + }, + { + "epoch": 12.876318312559924, + "grad_norm": 0.24535539746284485, + "learning_rate": 2.1719744827117737e-05, + "loss": 0.0318, + "step": 13430 + }, + { + "epoch": 12.885906040268456, + "grad_norm": 0.32186776399612427, + "learning_rate": 2.1648020475265418e-05, + "loss": 0.0353, + "step": 13440 + }, + { + "epoch": 12.895493767976989, + "grad_norm": 0.2927076518535614, + "learning_rate": 2.1576382010645764e-05, + "loss": 0.0318, + "step": 13450 + }, + { + "epoch": 12.905081495685522, + "grad_norm": 0.2444140613079071, + "learning_rate": 2.1504829650274672e-05, + "loss": 0.034, + "step": 13460 + }, + { + "epoch": 12.914669223394055, + "grad_norm": 0.17273946106433868, + "learning_rate": 2.1433363610907147e-05, + "loss": 0.0339, + "step": 13470 + }, + { + "epoch": 12.924256951102588, + "grad_norm": 0.3511595129966736, + "learning_rate": 2.1361984109036765e-05, + "loss": 0.0284, + "step": 13480 + }, + { + "epoch": 12.933844678811122, + "grad_norm": 0.21930259466171265, + "learning_rate": 2.1290691360894872e-05, + "loss": 0.0337, + "step": 13490 + }, + { + "epoch": 12.943432406519655, + "grad_norm": 0.13534465432167053, + "learning_rate": 2.121948558245008e-05, + "loss": 0.0325, + "step": 13500 + }, + { + "epoch": 12.953020134228188, + "grad_norm": 0.25757452845573425, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0344, + "step": 13510 + }, + { + "epoch": 12.962607861936721, + "grad_norm": 0.3126337230205536, + "learning_rate": 2.1077335797208153e-05, + "loss": 0.0266, + "step": 13520 + }, + { + "epoch": 12.972195589645255, + "grad_norm": 0.2144749015569687, + "learning_rate": 2.100639222102827e-05, + "loss": 0.0296, + "step": 13530 + }, + { + "epoch": 12.981783317353788, + "grad_norm": 0.33655446767807007, + "learning_rate": 2.0935536475778682e-05, + "loss": 0.0319, + "step": 13540 + }, + { + "epoch": 12.991371045062321, + "grad_norm": 0.16992558538913727, + "learning_rate": 2.0864768776104183e-05, + "loss": 0.0335, + "step": 13550 + }, + { + "epoch": 13.000958772770852, + "grad_norm": 0.2082756608724594, + "learning_rate": 2.079408933638279e-05, + "loss": 0.0338, + "step": 13560 + }, + { + "epoch": 13.010546500479386, + "grad_norm": 0.2862843871116638, + "learning_rate": 2.0723498370725162e-05, + "loss": 0.0289, + "step": 13570 + }, + { + "epoch": 13.020134228187919, + "grad_norm": 0.29127344489097595, + "learning_rate": 2.0652996092973974e-05, + "loss": 0.0379, + "step": 13580 + }, + { + "epoch": 13.029721955896452, + "grad_norm": 0.1825907677412033, + "learning_rate": 2.0582582716703243e-05, + "loss": 0.0267, + "step": 13590 + }, + { + "epoch": 13.039309683604985, + "grad_norm": 0.20657765865325928, + "learning_rate": 2.0512258455217636e-05, + "loss": 0.0337, + "step": 13600 + }, + { + "epoch": 13.048897411313519, + "grad_norm": 0.20046214759349823, + "learning_rate": 2.044202352155185e-05, + "loss": 0.0256, + "step": 13610 + }, + { + "epoch": 13.058485139022052, + "grad_norm": 0.23749665915966034, + "learning_rate": 2.0371878128470047e-05, + "loss": 0.033, + "step": 13620 + }, + { + "epoch": 13.068072866730585, + "grad_norm": 0.1981140673160553, + "learning_rate": 2.0301822488465106e-05, + "loss": 0.0323, + "step": 13630 + }, + { + "epoch": 13.077660594439118, + "grad_norm": 0.3064008951187134, + "learning_rate": 2.0231856813757995e-05, + "loss": 0.029, + "step": 13640 + }, + { + "epoch": 13.087248322147651, + "grad_norm": 0.3160218596458435, + "learning_rate": 2.016198131629716e-05, + "loss": 0.0317, + "step": 13650 + }, + { + "epoch": 13.096836049856185, + "grad_norm": 0.1925330013036728, + "learning_rate": 2.0092196207757886e-05, + "loss": 0.0308, + "step": 13660 + }, + { + "epoch": 13.106423777564718, + "grad_norm": 0.2060590237379074, + "learning_rate": 2.002250169954165e-05, + "loss": 0.0352, + "step": 13670 + }, + { + "epoch": 13.116011505273251, + "grad_norm": 0.21879933774471283, + "learning_rate": 1.9952898002775444e-05, + "loss": 0.0262, + "step": 13680 + }, + { + "epoch": 13.125599232981783, + "grad_norm": 0.22108188271522522, + "learning_rate": 1.9883385328311155e-05, + "loss": 0.0333, + "step": 13690 + }, + { + "epoch": 13.135186960690316, + "grad_norm": 0.26251569390296936, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0314, + "step": 13700 + }, + { + "epoch": 13.144774688398849, + "grad_norm": 0.29389551281929016, + "learning_rate": 1.9744633888316684e-05, + "loss": 0.0333, + "step": 13710 + }, + { + "epoch": 13.154362416107382, + "grad_norm": 0.1754542887210846, + "learning_rate": 1.9675395543109087e-05, + "loss": 0.0306, + "step": 13720 + }, + { + "epoch": 13.163950143815915, + "grad_norm": 0.2529279589653015, + "learning_rate": 1.9606249060847275e-05, + "loss": 0.029, + "step": 13730 + }, + { + "epoch": 13.173537871524449, + "grad_norm": 0.25833970308303833, + "learning_rate": 1.9537194650998176e-05, + "loss": 0.0257, + "step": 13740 + }, + { + "epoch": 13.183125599232982, + "grad_norm": 0.2809722423553467, + "learning_rate": 1.9468232522749685e-05, + "loss": 0.03, + "step": 13750 + }, + { + "epoch": 13.192713326941515, + "grad_norm": 0.2745196521282196, + "learning_rate": 1.9399362885010186e-05, + "loss": 0.0259, + "step": 13760 + }, + { + "epoch": 13.202301054650048, + "grad_norm": 0.26047447323799133, + "learning_rate": 1.9330585946407896e-05, + "loss": 0.0293, + "step": 13770 + }, + { + "epoch": 13.211888782358582, + "grad_norm": 0.2309299260377884, + "learning_rate": 1.9261901915290222e-05, + "loss": 0.0263, + "step": 13780 + }, + { + "epoch": 13.221476510067115, + "grad_norm": 0.19574059545993805, + "learning_rate": 1.9193310999723086e-05, + "loss": 0.0256, + "step": 13790 + }, + { + "epoch": 13.231064237775648, + "grad_norm": 0.24411630630493164, + "learning_rate": 1.9124813407490345e-05, + "loss": 0.0266, + "step": 13800 + }, + { + "epoch": 13.24065196548418, + "grad_norm": 0.2317860871553421, + "learning_rate": 1.9056409346093167e-05, + "loss": 0.0362, + "step": 13810 + }, + { + "epoch": 13.250239693192713, + "grad_norm": 0.34288397431373596, + "learning_rate": 1.89880990227494e-05, + "loss": 0.031, + "step": 13820 + }, + { + "epoch": 13.259827420901246, + "grad_norm": 0.22115236520767212, + "learning_rate": 1.8919882644392894e-05, + "loss": 0.0303, + "step": 13830 + }, + { + "epoch": 13.269415148609779, + "grad_norm": 0.1675620973110199, + "learning_rate": 1.8851760417672897e-05, + "loss": 0.0267, + "step": 13840 + }, + { + "epoch": 13.279002876318312, + "grad_norm": 0.22504985332489014, + "learning_rate": 1.8783732548953487e-05, + "loss": 0.03, + "step": 13850 + }, + { + "epoch": 13.288590604026846, + "grad_norm": 0.2568277418613434, + "learning_rate": 1.87157992443129e-05, + "loss": 0.0347, + "step": 13860 + }, + { + "epoch": 13.298178331735379, + "grad_norm": 0.24830462038516998, + "learning_rate": 1.8647960709542866e-05, + "loss": 0.0313, + "step": 13870 + }, + { + "epoch": 13.307766059443912, + "grad_norm": 0.1982988864183426, + "learning_rate": 1.8580217150148034e-05, + "loss": 0.0286, + "step": 13880 + }, + { + "epoch": 13.317353787152445, + "grad_norm": 0.17509537935256958, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0283, + "step": 13890 + }, + { + "epoch": 13.326941514860978, + "grad_norm": 0.27267399430274963, + "learning_rate": 1.8445015778063528e-05, + "loss": 0.0308, + "step": 13900 + }, + { + "epoch": 13.336529242569512, + "grad_norm": 0.2444014698266983, + "learning_rate": 1.8377558374942143e-05, + "loss": 0.0335, + "step": 13910 + }, + { + "epoch": 13.346116970278045, + "grad_norm": 0.4355910122394562, + "learning_rate": 1.831019676633129e-05, + "loss": 0.0326, + "step": 13920 + }, + { + "epoch": 13.355704697986576, + "grad_norm": 0.6526142954826355, + "learning_rate": 1.8242931156290893e-05, + "loss": 0.0299, + "step": 13930 + }, + { + "epoch": 13.36529242569511, + "grad_norm": 0.20145297050476074, + "learning_rate": 1.8175761748590063e-05, + "loss": 0.0315, + "step": 13940 + }, + { + "epoch": 13.374880153403643, + "grad_norm": 0.22952324151992798, + "learning_rate": 1.8108688746706427e-05, + "loss": 0.031, + "step": 13950 + }, + { + "epoch": 13.384467881112176, + "grad_norm": 0.38137954473495483, + "learning_rate": 1.8041712353825635e-05, + "loss": 0.0387, + "step": 13960 + }, + { + "epoch": 13.39405560882071, + "grad_norm": 0.2673424482345581, + "learning_rate": 1.7974832772840617e-05, + "loss": 0.0272, + "step": 13970 + }, + { + "epoch": 13.403643336529242, + "grad_norm": 0.2189689427614212, + "learning_rate": 1.790805020635109e-05, + "loss": 0.0317, + "step": 13980 + }, + { + "epoch": 13.413231064237776, + "grad_norm": 1.2192716598510742, + "learning_rate": 1.7841364856662824e-05, + "loss": 0.0258, + "step": 13990 + }, + { + "epoch": 13.422818791946309, + "grad_norm": 0.13329686224460602, + "learning_rate": 1.7774776925787136e-05, + "loss": 0.0257, + "step": 14000 + }, + { + "epoch": 13.432406519654842, + "grad_norm": 0.2741002142429352, + "learning_rate": 1.7708286615440183e-05, + "loss": 0.0271, + "step": 14010 + }, + { + "epoch": 13.441994247363375, + "grad_norm": 0.7737520337104797, + "learning_rate": 1.764189412704247e-05, + "loss": 0.0283, + "step": 14020 + }, + { + "epoch": 13.451581975071909, + "grad_norm": 0.24316097795963287, + "learning_rate": 1.7575599661718068e-05, + "loss": 0.0302, + "step": 14030 + }, + { + "epoch": 13.461169702780442, + "grad_norm": 0.23543784022331238, + "learning_rate": 1.7509403420294208e-05, + "loss": 0.0311, + "step": 14040 + }, + { + "epoch": 13.470757430488973, + "grad_norm": 0.19010919332504272, + "learning_rate": 1.7443305603300497e-05, + "loss": 0.0276, + "step": 14050 + }, + { + "epoch": 13.480345158197506, + "grad_norm": 0.1994113028049469, + "learning_rate": 1.7377306410968396e-05, + "loss": 0.0298, + "step": 14060 + }, + { + "epoch": 13.48993288590604, + "grad_norm": 0.30696478486061096, + "learning_rate": 1.731140604323063e-05, + "loss": 0.0275, + "step": 14070 + }, + { + "epoch": 13.499520613614573, + "grad_norm": 0.3128091096878052, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.0272, + "step": 14080 + }, + { + "epoch": 13.509108341323106, + "grad_norm": 2.206577777862549, + "learning_rate": 1.7179902579771474e-05, + "loss": 0.0326, + "step": 14090 + }, + { + "epoch": 13.51869606903164, + "grad_norm": 0.18835577368736267, + "learning_rate": 1.711429988241619e-05, + "loss": 0.0276, + "step": 14100 + }, + { + "epoch": 13.528283796740173, + "grad_norm": 0.2255256026983261, + "learning_rate": 1.7048796806386304e-05, + "loss": 0.0301, + "step": 14110 + }, + { + "epoch": 13.537871524448706, + "grad_norm": 0.3144644796848297, + "learning_rate": 1.6983393550111648e-05, + "loss": 0.0324, + "step": 14120 + }, + { + "epoch": 13.547459252157239, + "grad_norm": 0.20487931370735168, + "learning_rate": 1.691809031171962e-05, + "loss": 0.0352, + "step": 14130 + }, + { + "epoch": 13.557046979865772, + "grad_norm": 0.22863590717315674, + "learning_rate": 1.6852887289034632e-05, + "loss": 0.0343, + "step": 14140 + }, + { + "epoch": 13.566634707574305, + "grad_norm": 0.30829718708992004, + "learning_rate": 1.67877846795776e-05, + "loss": 0.0342, + "step": 14150 + }, + { + "epoch": 13.576222435282839, + "grad_norm": 0.2026831954717636, + "learning_rate": 1.672278268056516e-05, + "loss": 0.0266, + "step": 14160 + }, + { + "epoch": 13.585810162991372, + "grad_norm": 0.18998700380325317, + "learning_rate": 1.6657881488909192e-05, + "loss": 0.0316, + "step": 14170 + }, + { + "epoch": 13.595397890699903, + "grad_norm": 0.2338184267282486, + "learning_rate": 1.659308130121622e-05, + "loss": 0.0315, + "step": 14180 + }, + { + "epoch": 13.604985618408437, + "grad_norm": 0.421129047870636, + "learning_rate": 1.6528382313786784e-05, + "loss": 0.0322, + "step": 14190 + }, + { + "epoch": 13.61457334611697, + "grad_norm": 0.28092893958091736, + "learning_rate": 1.6463784722614845e-05, + "loss": 0.0269, + "step": 14200 + }, + { + "epoch": 13.624161073825503, + "grad_norm": 0.19112944602966309, + "learning_rate": 1.6399288723387195e-05, + "loss": 0.0258, + "step": 14210 + }, + { + "epoch": 13.633748801534036, + "grad_norm": 0.286045640707016, + "learning_rate": 1.63348945114829e-05, + "loss": 0.0324, + "step": 14220 + }, + { + "epoch": 13.64333652924257, + "grad_norm": 0.280977338552475, + "learning_rate": 1.6270602281972686e-05, + "loss": 0.0265, + "step": 14230 + }, + { + "epoch": 13.652924256951103, + "grad_norm": 0.28009748458862305, + "learning_rate": 1.6206412229618307e-05, + "loss": 0.034, + "step": 14240 + }, + { + "epoch": 13.662511984659636, + "grad_norm": 0.2950078845024109, + "learning_rate": 1.6142324548871978e-05, + "loss": 0.0332, + "step": 14250 + }, + { + "epoch": 13.67209971236817, + "grad_norm": 0.19593513011932373, + "learning_rate": 1.607833943387585e-05, + "loss": 0.0322, + "step": 14260 + }, + { + "epoch": 13.681687440076702, + "grad_norm": 0.3256717026233673, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0311, + "step": 14270 + }, + { + "epoch": 13.691275167785236, + "grad_norm": 0.48480740189552307, + "learning_rate": 1.59506776761486e-05, + "loss": 0.0265, + "step": 14280 + }, + { + "epoch": 13.700862895493769, + "grad_norm": 0.17794422805309296, + "learning_rate": 1.588700142014583e-05, + "loss": 0.0302, + "step": 14290 + }, + { + "epoch": 13.7104506232023, + "grad_norm": 0.21641989052295685, + "learning_rate": 1.5823428503348846e-05, + "loss": 0.0269, + "step": 14300 + }, + { + "epoch": 13.720038350910833, + "grad_norm": 0.21487939357757568, + "learning_rate": 1.57599591183404e-05, + "loss": 0.0333, + "step": 14310 + }, + { + "epoch": 13.729626078619367, + "grad_norm": 0.20198583602905273, + "learning_rate": 1.569659345738959e-05, + "loss": 0.0316, + "step": 14320 + }, + { + "epoch": 13.7392138063279, + "grad_norm": 0.24818021059036255, + "learning_rate": 1.5633331712451287e-05, + "loss": 0.0322, + "step": 14330 + }, + { + "epoch": 13.748801534036433, + "grad_norm": 0.3211008906364441, + "learning_rate": 1.5570174075165617e-05, + "loss": 0.0286, + "step": 14340 + }, + { + "epoch": 13.758389261744966, + "grad_norm": 0.27913060784339905, + "learning_rate": 1.5507120736857316e-05, + "loss": 0.0309, + "step": 14350 + }, + { + "epoch": 13.7679769894535, + "grad_norm": 0.3094828724861145, + "learning_rate": 1.5444171888535127e-05, + "loss": 0.0262, + "step": 14360 + }, + { + "epoch": 13.777564717162033, + "grad_norm": 0.26376375555992126, + "learning_rate": 1.538132772089131e-05, + "loss": 0.0312, + "step": 14370 + }, + { + "epoch": 13.787152444870566, + "grad_norm": 0.27103152871131897, + "learning_rate": 1.531858842430096e-05, + "loss": 0.029, + "step": 14380 + }, + { + "epoch": 13.7967401725791, + "grad_norm": 0.2528936564922333, + "learning_rate": 1.5255954188821554e-05, + "loss": 0.0302, + "step": 14390 + }, + { + "epoch": 13.806327900287632, + "grad_norm": 0.2022869884967804, + "learning_rate": 1.519342520419223e-05, + "loss": 0.028, + "step": 14400 + }, + { + "epoch": 13.815915627996166, + "grad_norm": 0.2736548185348511, + "learning_rate": 1.5131001659833349e-05, + "loss": 0.0391, + "step": 14410 + }, + { + "epoch": 13.825503355704697, + "grad_norm": 0.20340123772621155, + "learning_rate": 1.5068683744845802e-05, + "loss": 0.0259, + "step": 14420 + }, + { + "epoch": 13.83509108341323, + "grad_norm": 0.30253875255584717, + "learning_rate": 1.5006471648010567e-05, + "loss": 0.0318, + "step": 14430 + }, + { + "epoch": 13.844678811121764, + "grad_norm": 0.18290819227695465, + "learning_rate": 1.4944365557787982e-05, + "loss": 0.0266, + "step": 14440 + }, + { + "epoch": 13.854266538830297, + "grad_norm": 0.17378397285938263, + "learning_rate": 1.4882365662317338e-05, + "loss": 0.0307, + "step": 14450 + }, + { + "epoch": 13.86385426653883, + "grad_norm": 0.17450757324695587, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.0375, + "step": 14460 + }, + { + "epoch": 13.873441994247363, + "grad_norm": 0.17673359811306, + "learning_rate": 1.4758685206579754e-05, + "loss": 0.0336, + "step": 14470 + }, + { + "epoch": 13.883029721955896, + "grad_norm": 0.17782671749591827, + "learning_rate": 1.4697005020980547e-05, + "loss": 0.0264, + "step": 14480 + }, + { + "epoch": 13.89261744966443, + "grad_norm": 0.22997714579105377, + "learning_rate": 1.4635431779467628e-05, + "loss": 0.0364, + "step": 14490 + }, + { + "epoch": 13.902205177372963, + "grad_norm": 0.23629331588745117, + "learning_rate": 1.4573965668566037e-05, + "loss": 0.0293, + "step": 14500 + }, + { + "epoch": 13.911792905081496, + "grad_norm": 0.2348259836435318, + "learning_rate": 1.4512606874476348e-05, + "loss": 0.0296, + "step": 14510 + }, + { + "epoch": 13.92138063279003, + "grad_norm": 0.2225087732076645, + "learning_rate": 1.4451355583074027e-05, + "loss": 0.0286, + "step": 14520 + }, + { + "epoch": 13.930968360498563, + "grad_norm": 0.23287685215473175, + "learning_rate": 1.4390211979908847e-05, + "loss": 0.0279, + "step": 14530 + }, + { + "epoch": 13.940556088207096, + "grad_norm": 0.19362808763980865, + "learning_rate": 1.4329176250204369e-05, + "loss": 0.0334, + "step": 14540 + }, + { + "epoch": 13.950143815915627, + "grad_norm": 0.25659292936325073, + "learning_rate": 1.4268248578857384e-05, + "loss": 0.0286, + "step": 14550 + }, + { + "epoch": 13.95973154362416, + "grad_norm": 0.19965949654579163, + "learning_rate": 1.4207429150437368e-05, + "loss": 0.0336, + "step": 14560 + }, + { + "epoch": 13.969319271332694, + "grad_norm": 0.21127323806285858, + "learning_rate": 1.4146718149185833e-05, + "loss": 0.0311, + "step": 14570 + }, + { + "epoch": 13.978906999041227, + "grad_norm": 0.2175043374300003, + "learning_rate": 1.408611575901585e-05, + "loss": 0.0232, + "step": 14580 + }, + { + "epoch": 13.98849472674976, + "grad_norm": 0.2855774462223053, + "learning_rate": 1.4025622163511498e-05, + "loss": 0.03, + "step": 14590 + }, + { + "epoch": 13.998082454458293, + "grad_norm": 0.27606961131095886, + "learning_rate": 1.3965237545927274e-05, + "loss": 0.0285, + "step": 14600 + }, + { + "epoch": 14.007670182166827, + "grad_norm": 0.20237654447555542, + "learning_rate": 1.3904962089187529e-05, + "loss": 0.0263, + "step": 14610 + }, + { + "epoch": 14.01725790987536, + "grad_norm": 0.17577792704105377, + "learning_rate": 1.3844795975885921e-05, + "loss": 0.028, + "step": 14620 + }, + { + "epoch": 14.026845637583893, + "grad_norm": 0.24930806457996368, + "learning_rate": 1.3784739388284911e-05, + "loss": 0.0308, + "step": 14630 + }, + { + "epoch": 14.036433365292426, + "grad_norm": 0.16480274498462677, + "learning_rate": 1.372479250831516e-05, + "loss": 0.0301, + "step": 14640 + }, + { + "epoch": 14.04602109300096, + "grad_norm": 0.20912165939807892, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0278, + "step": 14650 + }, + { + "epoch": 14.055608820709493, + "grad_norm": 0.3317655622959137, + "learning_rate": 1.3605228597329738e-05, + "loss": 0.0317, + "step": 14660 + }, + { + "epoch": 14.065196548418024, + "grad_norm": 0.240800142288208, + "learning_rate": 1.3545611928511475e-05, + "loss": 0.0352, + "step": 14670 + }, + { + "epoch": 14.074784276126557, + "grad_norm": 0.2574955224990845, + "learning_rate": 1.3486105691718187e-05, + "loss": 0.0272, + "step": 14680 + }, + { + "epoch": 14.08437200383509, + "grad_norm": 0.26954057812690735, + "learning_rate": 1.3426710067213322e-05, + "loss": 0.0309, + "step": 14690 + }, + { + "epoch": 14.093959731543624, + "grad_norm": 0.23546206951141357, + "learning_rate": 1.336742523492523e-05, + "loss": 0.0332, + "step": 14700 + }, + { + "epoch": 14.103547459252157, + "grad_norm": 0.2285180389881134, + "learning_rate": 1.3308251374446734e-05, + "loss": 0.0436, + "step": 14710 + }, + { + "epoch": 14.11313518696069, + "grad_norm": 0.22198130190372467, + "learning_rate": 1.324918866503439e-05, + "loss": 0.0283, + "step": 14720 + }, + { + "epoch": 14.122722914669223, + "grad_norm": 0.37202128767967224, + "learning_rate": 1.3190237285608076e-05, + "loss": 0.0296, + "step": 14730 + }, + { + "epoch": 14.132310642377757, + "grad_norm": 0.2728140652179718, + "learning_rate": 1.3131397414750385e-05, + "loss": 0.0313, + "step": 14740 + }, + { + "epoch": 14.14189837008629, + "grad_norm": 0.19201789796352386, + "learning_rate": 1.3072669230706197e-05, + "loss": 0.0315, + "step": 14750 + }, + { + "epoch": 14.151486097794823, + "grad_norm": 0.2704322040081024, + "learning_rate": 1.3014052911381974e-05, + "loss": 0.0279, + "step": 14760 + }, + { + "epoch": 14.161073825503356, + "grad_norm": 0.23162490129470825, + "learning_rate": 1.2955548634345327e-05, + "loss": 0.0288, + "step": 14770 + }, + { + "epoch": 14.17066155321189, + "grad_norm": 0.1527073085308075, + "learning_rate": 1.289715657682447e-05, + "loss": 0.0287, + "step": 14780 + }, + { + "epoch": 14.180249280920421, + "grad_norm": 0.48836442828178406, + "learning_rate": 1.2838876915707681e-05, + "loss": 0.0334, + "step": 14790 + }, + { + "epoch": 14.189837008628954, + "grad_norm": 0.22852776944637299, + "learning_rate": 1.2780709827542708e-05, + "loss": 0.0301, + "step": 14800 + }, + { + "epoch": 14.199424736337487, + "grad_norm": 1.632561445236206, + "learning_rate": 1.2722655488536294e-05, + "loss": 0.0296, + "step": 14810 + }, + { + "epoch": 14.20901246404602, + "grad_norm": 0.20910300314426422, + "learning_rate": 1.2664714074553652e-05, + "loss": 0.0277, + "step": 14820 + }, + { + "epoch": 14.218600191754554, + "grad_norm": 0.284138023853302, + "learning_rate": 1.260688576111791e-05, + "loss": 0.0275, + "step": 14830 + }, + { + "epoch": 14.228187919463087, + "grad_norm": 0.24799588322639465, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0291, + "step": 14840 + }, + { + "epoch": 14.23777564717162, + "grad_norm": 0.18639959394931793, + "learning_rate": 1.2491569136265896e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 14.247363374880154, + "grad_norm": 0.19724729657173157, + "learning_rate": 1.243408117418064e-05, + "loss": 0.0266, + "step": 14860 + }, + { + "epoch": 14.256951102588687, + "grad_norm": 0.1451575756072998, + "learning_rate": 1.2376707011303257e-05, + "loss": 0.0313, + "step": 14870 + }, + { + "epoch": 14.26653883029722, + "grad_norm": 0.13136418163776398, + "learning_rate": 1.2319446821438458e-05, + "loss": 0.0257, + "step": 14880 + }, + { + "epoch": 14.276126558005753, + "grad_norm": 0.212480828166008, + "learning_rate": 1.2262300778045693e-05, + "loss": 0.0309, + "step": 14890 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 0.179280087351799, + "learning_rate": 1.220526905423866e-05, + "loss": 0.0334, + "step": 14900 + }, + { + "epoch": 14.29530201342282, + "grad_norm": 0.19260522723197937, + "learning_rate": 1.2148351822784748e-05, + "loss": 0.0321, + "step": 14910 + }, + { + "epoch": 14.304889741131351, + "grad_norm": 0.2079414278268814, + "learning_rate": 1.2091549256104457e-05, + "loss": 0.0314, + "step": 14920 + }, + { + "epoch": 14.314477468839884, + "grad_norm": 0.1942739635705948, + "learning_rate": 1.2034861526270996e-05, + "loss": 0.0307, + "step": 14930 + }, + { + "epoch": 14.324065196548418, + "grad_norm": 0.28928378224372864, + "learning_rate": 1.1978288805009641e-05, + "loss": 0.0267, + "step": 14940 + }, + { + "epoch": 14.33365292425695, + "grad_norm": 0.3712955415248871, + "learning_rate": 1.192183126369732e-05, + "loss": 0.0329, + "step": 14950 + }, + { + "epoch": 14.343240651965484, + "grad_norm": 0.22929075360298157, + "learning_rate": 1.1865489073361996e-05, + "loss": 0.0264, + "step": 14960 + }, + { + "epoch": 14.352828379674017, + "grad_norm": 0.31317007541656494, + "learning_rate": 1.1809262404682247e-05, + "loss": 0.0242, + "step": 14970 + }, + { + "epoch": 14.36241610738255, + "grad_norm": 0.5237254500389099, + "learning_rate": 1.1753151427986646e-05, + "loss": 0.0292, + "step": 14980 + }, + { + "epoch": 14.372003835091084, + "grad_norm": 0.21789228916168213, + "learning_rate": 1.169715631325336e-05, + "loss": 0.0314, + "step": 14990 + }, + { + "epoch": 14.381591562799617, + "grad_norm": 0.29379501938819885, + "learning_rate": 1.1641277230109492e-05, + "loss": 0.0332, + "step": 15000 + }, + { + "epoch": 14.39117929050815, + "grad_norm": 0.17771072685718536, + "learning_rate": 1.1585514347830738e-05, + "loss": 0.0267, + "step": 15010 + }, + { + "epoch": 14.400767018216683, + "grad_norm": 0.24794255197048187, + "learning_rate": 1.1529867835340707e-05, + "loss": 0.0267, + "step": 15020 + }, + { + "epoch": 14.410354745925215, + "grad_norm": 0.21468493342399597, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0267, + "step": 15030 + }, + { + "epoch": 14.419942473633748, + "grad_norm": 0.17512547969818115, + "learning_rate": 1.1418924593658314e-05, + "loss": 0.0239, + "step": 15040 + }, + { + "epoch": 14.429530201342281, + "grad_norm": 0.2626974284648895, + "learning_rate": 1.1363628200548593e-05, + "loss": 0.0328, + "step": 15050 + }, + { + "epoch": 14.439117929050814, + "grad_norm": 0.21883651614189148, + "learning_rate": 1.1308448849391846e-05, + "loss": 0.0283, + "step": 15060 + }, + { + "epoch": 14.448705656759348, + "grad_norm": 0.2517321705818176, + "learning_rate": 1.1253386707344044e-05, + "loss": 0.0319, + "step": 15070 + }, + { + "epoch": 14.458293384467881, + "grad_norm": 0.23790787160396576, + "learning_rate": 1.1198441941206033e-05, + "loss": 0.0254, + "step": 15080 + }, + { + "epoch": 14.467881112176414, + "grad_norm": 0.2755306363105774, + "learning_rate": 1.1143614717423145e-05, + "loss": 0.0297, + "step": 15090 + }, + { + "epoch": 14.477468839884947, + "grad_norm": 0.17343682050704956, + "learning_rate": 1.1088905202084604e-05, + "loss": 0.0271, + "step": 15100 + }, + { + "epoch": 14.48705656759348, + "grad_norm": 0.4037168323993683, + "learning_rate": 1.1034313560923032e-05, + "loss": 0.0318, + "step": 15110 + }, + { + "epoch": 14.496644295302014, + "grad_norm": 0.25027063488960266, + "learning_rate": 1.097983995931407e-05, + "loss": 0.0344, + "step": 15120 + }, + { + "epoch": 14.506232023010547, + "grad_norm": 0.2531662583351135, + "learning_rate": 1.0925484562275678e-05, + "loss": 0.0336, + "step": 15130 + }, + { + "epoch": 14.51581975071908, + "grad_norm": 0.27917400002479553, + "learning_rate": 1.0871247534467788e-05, + "loss": 0.0316, + "step": 15140 + }, + { + "epoch": 14.525407478427613, + "grad_norm": 0.26147523522377014, + "learning_rate": 1.0817129040191698e-05, + "loss": 0.0278, + "step": 15150 + }, + { + "epoch": 14.534995206136145, + "grad_norm": 0.24168430268764496, + "learning_rate": 1.076312924338973e-05, + "loss": 0.03, + "step": 15160 + }, + { + "epoch": 14.544582933844678, + "grad_norm": 0.17934760451316833, + "learning_rate": 1.0709248307644559e-05, + "loss": 0.0275, + "step": 15170 + }, + { + "epoch": 14.554170661553211, + "grad_norm": 0.38495177030563354, + "learning_rate": 1.0655486396178782e-05, + "loss": 0.0317, + "step": 15180 + }, + { + "epoch": 14.563758389261745, + "grad_norm": 0.22225984930992126, + "learning_rate": 1.0601843671854477e-05, + "loss": 0.0312, + "step": 15190 + }, + { + "epoch": 14.573346116970278, + "grad_norm": 0.29296278953552246, + "learning_rate": 1.0548320297172665e-05, + "loss": 0.0315, + "step": 15200 + }, + { + "epoch": 14.582933844678811, + "grad_norm": 0.3371207118034363, + "learning_rate": 1.0494916434272783e-05, + "loss": 0.0299, + "step": 15210 + }, + { + "epoch": 14.592521572387344, + "grad_norm": 0.220375657081604, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0265, + "step": 15220 + }, + { + "epoch": 14.602109300095877, + "grad_norm": 0.1987174153327942, + "learning_rate": 1.0388467890565928e-05, + "loss": 0.0261, + "step": 15230 + }, + { + "epoch": 14.61169702780441, + "grad_norm": 0.25363320112228394, + "learning_rate": 1.0335423532225735e-05, + "loss": 0.0301, + "step": 15240 + }, + { + "epoch": 14.621284755512944, + "grad_norm": 0.22231195867061615, + "learning_rate": 1.028249933060001e-05, + "loss": 0.0353, + "step": 15250 + }, + { + "epoch": 14.630872483221477, + "grad_norm": 0.20641197264194489, + "learning_rate": 1.022969544601311e-05, + "loss": 0.0254, + "step": 15260 + }, + { + "epoch": 14.64046021093001, + "grad_norm": 0.25588056445121765, + "learning_rate": 1.0177012038424927e-05, + "loss": 0.0327, + "step": 15270 + }, + { + "epoch": 14.650047938638544, + "grad_norm": 0.3196217715740204, + "learning_rate": 1.0124449267430414e-05, + "loss": 0.0306, + "step": 15280 + }, + { + "epoch": 14.659635666347075, + "grad_norm": 0.37711241841316223, + "learning_rate": 1.0072007292259029e-05, + "loss": 0.0314, + "step": 15290 + }, + { + "epoch": 14.669223394055608, + "grad_norm": 0.299496591091156, + "learning_rate": 1.0019686271774314e-05, + "loss": 0.0273, + "step": 15300 + }, + { + "epoch": 14.678811121764141, + "grad_norm": 0.20070233941078186, + "learning_rate": 9.967486364473416e-06, + "loss": 0.0348, + "step": 15310 + }, + { + "epoch": 14.688398849472675, + "grad_norm": 0.1786354035139084, + "learning_rate": 9.915407728486603e-06, + "loss": 0.0315, + "step": 15320 + }, + { + "epoch": 14.697986577181208, + "grad_norm": 0.19913482666015625, + "learning_rate": 9.863450521576729e-06, + "loss": 0.0332, + "step": 15330 + }, + { + "epoch": 14.707574304889741, + "grad_norm": 0.26217663288116455, + "learning_rate": 9.81161490113885e-06, + "loss": 0.0299, + "step": 15340 + }, + { + "epoch": 14.717162032598274, + "grad_norm": 0.17626221477985382, + "learning_rate": 9.759901024199642e-06, + "loss": 0.0258, + "step": 15350 + }, + { + "epoch": 14.726749760306808, + "grad_norm": 0.5230224132537842, + "learning_rate": 9.708309047417041e-06, + "loss": 0.0286, + "step": 15360 + }, + { + "epoch": 14.73633748801534, + "grad_norm": 0.19318176805973053, + "learning_rate": 9.656839127079659e-06, + "loss": 0.0254, + "step": 15370 + }, + { + "epoch": 14.745925215723874, + "grad_norm": 0.30321067571640015, + "learning_rate": 9.6054914191064e-06, + "loss": 0.0304, + "step": 15380 + }, + { + "epoch": 14.755512943432407, + "grad_norm": 0.2519323229789734, + "learning_rate": 9.554266079045909e-06, + "loss": 0.0325, + "step": 15390 + }, + { + "epoch": 14.765100671140939, + "grad_norm": 0.24592278897762299, + "learning_rate": 9.503163262076181e-06, + "loss": 0.0336, + "step": 15400 + }, + { + "epoch": 14.774688398849472, + "grad_norm": 0.19091877341270447, + "learning_rate": 9.452183123004e-06, + "loss": 0.0247, + "step": 15410 + }, + { + "epoch": 14.784276126558005, + "grad_norm": 0.26081383228302, + "learning_rate": 9.401325816264573e-06, + "loss": 0.0333, + "step": 15420 + }, + { + "epoch": 14.793863854266538, + "grad_norm": 0.27854666113853455, + "learning_rate": 9.350591495920952e-06, + "loss": 0.024, + "step": 15430 + }, + { + "epoch": 14.803451581975072, + "grad_norm": 0.36169877648353577, + "learning_rate": 9.299980315663686e-06, + "loss": 0.031, + "step": 15440 + }, + { + "epoch": 14.813039309683605, + "grad_norm": 0.18000735342502594, + "learning_rate": 9.24949242881023e-06, + "loss": 0.0289, + "step": 15450 + }, + { + "epoch": 14.822627037392138, + "grad_norm": 0.25608521699905396, + "learning_rate": 9.199127988304607e-06, + "loss": 0.0284, + "step": 15460 + }, + { + "epoch": 14.832214765100671, + "grad_norm": 0.2771013379096985, + "learning_rate": 9.148887146716812e-06, + "loss": 0.0283, + "step": 15470 + }, + { + "epoch": 14.841802492809204, + "grad_norm": 0.17078572511672974, + "learning_rate": 9.09877005624249e-06, + "loss": 0.0294, + "step": 15480 + }, + { + "epoch": 14.851390220517738, + "grad_norm": 0.17408467829227448, + "learning_rate": 9.048776868702347e-06, + "loss": 0.0255, + "step": 15490 + }, + { + "epoch": 14.860977948226271, + "grad_norm": 0.20527216792106628, + "learning_rate": 8.998907735541789e-06, + "loss": 0.0329, + "step": 15500 + }, + { + "epoch": 14.870565675934804, + "grad_norm": 0.23558159172534943, + "learning_rate": 8.94916280783038e-06, + "loss": 0.0294, + "step": 15510 + }, + { + "epoch": 14.880153403643337, + "grad_norm": 0.16163650155067444, + "learning_rate": 8.89954223626146e-06, + "loss": 0.0264, + "step": 15520 + }, + { + "epoch": 14.889741131351869, + "grad_norm": 0.2564382255077362, + "learning_rate": 8.850046171151666e-06, + "loss": 0.0332, + "step": 15530 + }, + { + "epoch": 14.899328859060402, + "grad_norm": 0.2050989419221878, + "learning_rate": 8.80067476244042e-06, + "loss": 0.0307, + "step": 15540 + }, + { + "epoch": 14.908916586768935, + "grad_norm": 0.18448740243911743, + "learning_rate": 8.751428159689528e-06, + "loss": 0.0306, + "step": 15550 + }, + { + "epoch": 14.918504314477468, + "grad_norm": 0.29133155941963196, + "learning_rate": 8.702306512082753e-06, + "loss": 0.0243, + "step": 15560 + }, + { + "epoch": 14.928092042186002, + "grad_norm": 0.141392782330513, + "learning_rate": 8.653309968425322e-06, + "loss": 0.0242, + "step": 15570 + }, + { + "epoch": 14.937679769894535, + "grad_norm": 0.21134333312511444, + "learning_rate": 8.60443867714345e-06, + "loss": 0.0318, + "step": 15580 + }, + { + "epoch": 14.947267497603068, + "grad_norm": 0.2590806484222412, + "learning_rate": 8.55569278628393e-06, + "loss": 0.0253, + "step": 15590 + }, + { + "epoch": 14.956855225311601, + "grad_norm": 0.21871857345104218, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0258, + "step": 15600 + }, + { + "epoch": 14.966442953020135, + "grad_norm": 0.25187286734580994, + "learning_rate": 8.458577796119382e-06, + "loss": 0.03, + "step": 15610 + }, + { + "epoch": 14.976030680728668, + "grad_norm": 0.17888393998146057, + "learning_rate": 8.410208991006784e-06, + "loss": 0.0274, + "step": 15620 + }, + { + "epoch": 14.985618408437201, + "grad_norm": 0.1486871838569641, + "learning_rate": 8.361966174700514e-06, + "loss": 0.0269, + "step": 15630 + }, + { + "epoch": 14.995206136145734, + "grad_norm": 0.6585232019424438, + "learning_rate": 8.31384949334353e-06, + "loss": 0.0294, + "step": 15640 + }, + { + "epoch": 15.004793863854266, + "grad_norm": 0.36748427152633667, + "learning_rate": 8.265859092696686e-06, + "loss": 0.0318, + "step": 15650 + }, + { + "epoch": 15.014381591562799, + "grad_norm": 0.22082515060901642, + "learning_rate": 8.217995118138294e-06, + "loss": 0.0294, + "step": 15660 + }, + { + "epoch": 15.023969319271332, + "grad_norm": 0.1767498254776001, + "learning_rate": 8.170257714663642e-06, + "loss": 0.0275, + "step": 15670 + }, + { + "epoch": 15.033557046979865, + "grad_norm": 0.24185898900032043, + "learning_rate": 8.12264702688465e-06, + "loss": 0.0279, + "step": 15680 + }, + { + "epoch": 15.043144774688399, + "grad_norm": 0.22703923285007477, + "learning_rate": 8.075163199029357e-06, + "loss": 0.0268, + "step": 15690 + }, + { + "epoch": 15.052732502396932, + "grad_norm": 0.2051907479763031, + "learning_rate": 8.027806374941481e-06, + "loss": 0.0272, + "step": 15700 + }, + { + "epoch": 15.062320230105465, + "grad_norm": 0.24761435389518738, + "learning_rate": 7.980576698080005e-06, + "loss": 0.0301, + "step": 15710 + }, + { + "epoch": 15.071907957813998, + "grad_norm": 0.17438143491744995, + "learning_rate": 7.933474311518796e-06, + "loss": 0.0351, + "step": 15720 + }, + { + "epoch": 15.081495685522532, + "grad_norm": 0.20341135561466217, + "learning_rate": 7.88649935794606e-06, + "loss": 0.0264, + "step": 15730 + }, + { + "epoch": 15.091083413231065, + "grad_norm": 0.24047966301441193, + "learning_rate": 7.83965197966397e-06, + "loss": 0.0268, + "step": 15740 + }, + { + "epoch": 15.100671140939598, + "grad_norm": 0.19311171770095825, + "learning_rate": 7.792932318588264e-06, + "loss": 0.033, + "step": 15750 + }, + { + "epoch": 15.110258868648131, + "grad_norm": 0.18407687544822693, + "learning_rate": 7.746340516247779e-06, + "loss": 0.0243, + "step": 15760 + }, + { + "epoch": 15.119846596356663, + "grad_norm": 0.21947818994522095, + "learning_rate": 7.69987671378401e-06, + "loss": 0.0255, + "step": 15770 + }, + { + "epoch": 15.129434324065196, + "grad_norm": 0.4175131916999817, + "learning_rate": 7.653541051950692e-06, + "loss": 0.0245, + "step": 15780 + }, + { + "epoch": 15.139022051773729, + "grad_norm": 0.29046544432640076, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0365, + "step": 15790 + }, + { + "epoch": 15.148609779482262, + "grad_norm": 0.25391921401023865, + "learning_rate": 7.561254711249127e-06, + "loss": 0.0266, + "step": 15800 + }, + { + "epoch": 15.158197507190796, + "grad_norm": 0.19595490396022797, + "learning_rate": 7.515304311945787e-06, + "loss": 0.0306, + "step": 15810 + }, + { + "epoch": 15.167785234899329, + "grad_norm": 0.1492607444524765, + "learning_rate": 7.469482612401857e-06, + "loss": 0.0306, + "step": 15820 + }, + { + "epoch": 15.177372962607862, + "grad_norm": 0.2468632310628891, + "learning_rate": 7.423789751425958e-06, + "loss": 0.0275, + "step": 15830 + }, + { + "epoch": 15.186960690316395, + "grad_norm": 0.20901519060134888, + "learning_rate": 7.378225867436428e-06, + "loss": 0.0252, + "step": 15840 + }, + { + "epoch": 15.196548418024928, + "grad_norm": 0.28785982728004456, + "learning_rate": 7.332791098460867e-06, + "loss": 0.0326, + "step": 15850 + }, + { + "epoch": 15.206136145733462, + "grad_norm": 0.2834322154521942, + "learning_rate": 7.287485582135728e-06, + "loss": 0.0302, + "step": 15860 + }, + { + "epoch": 15.215723873441995, + "grad_norm": 0.24561063945293427, + "learning_rate": 7.242309455705959e-06, + "loss": 0.0292, + "step": 15870 + }, + { + "epoch": 15.225311601150528, + "grad_norm": 0.23040306568145752, + "learning_rate": 7.197262856024539e-06, + "loss": 0.0246, + "step": 15880 + }, + { + "epoch": 15.234899328859061, + "grad_norm": 0.22045479714870453, + "learning_rate": 7.152345919552045e-06, + "loss": 0.0314, + "step": 15890 + }, + { + "epoch": 15.244487056567593, + "grad_norm": 0.2748197913169861, + "learning_rate": 7.107558782356255e-06, + "loss": 0.0292, + "step": 15900 + }, + { + "epoch": 15.254074784276126, + "grad_norm": 0.2709030210971832, + "learning_rate": 7.0629015801117744e-06, + "loss": 0.0299, + "step": 15910 + }, + { + "epoch": 15.26366251198466, + "grad_norm": 0.2666435241699219, + "learning_rate": 7.018374448099596e-06, + "loss": 0.0324, + "step": 15920 + }, + { + "epoch": 15.273250239693192, + "grad_norm": 0.32848596572875977, + "learning_rate": 6.973977521206654e-06, + "loss": 0.0344, + "step": 15930 + }, + { + "epoch": 15.282837967401726, + "grad_norm": 0.23068153858184814, + "learning_rate": 6.929710933925487e-06, + "loss": 0.0262, + "step": 15940 + }, + { + "epoch": 15.292425695110259, + "grad_norm": 0.24479450285434723, + "learning_rate": 6.885574820353752e-06, + "loss": 0.0269, + "step": 15950 + }, + { + "epoch": 15.302013422818792, + "grad_norm": 0.21294337511062622, + "learning_rate": 6.841569314193902e-06, + "loss": 0.0265, + "step": 15960 + }, + { + "epoch": 15.311601150527325, + "grad_norm": 0.28778862953186035, + "learning_rate": 6.797694548752703e-06, + "loss": 0.0273, + "step": 15970 + }, + { + "epoch": 15.321188878235859, + "grad_norm": 0.189237579703331, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0267, + "step": 15980 + }, + { + "epoch": 15.330776605944392, + "grad_norm": 0.28015297651290894, + "learning_rate": 6.710337771272745e-06, + "loss": 0.034, + "step": 15990 + }, + { + "epoch": 15.340364333652925, + "grad_norm": 0.1625533103942871, + "learning_rate": 6.666856023865658e-06, + "loss": 0.0233, + "step": 16000 + }, + { + "epoch": 15.349952061361458, + "grad_norm": 0.21412205696105957, + "learning_rate": 6.623505546439773e-06, + "loss": 0.0253, + "step": 16010 + }, + { + "epoch": 15.35953978906999, + "grad_norm": 0.26244086027145386, + "learning_rate": 6.580286470317598e-06, + "loss": 0.0256, + "step": 16020 + }, + { + "epoch": 15.369127516778523, + "grad_norm": 0.28637972474098206, + "learning_rate": 6.537198926423549e-06, + "loss": 0.0283, + "step": 16030 + }, + { + "epoch": 15.378715244487056, + "grad_norm": 0.2678770124912262, + "learning_rate": 6.494243045283621e-06, + "loss": 0.0271, + "step": 16040 + }, + { + "epoch": 15.38830297219559, + "grad_norm": 0.1962299942970276, + "learning_rate": 6.45141895702493e-06, + "loss": 0.0258, + "step": 16050 + }, + { + "epoch": 15.397890699904123, + "grad_norm": 0.26651138067245483, + "learning_rate": 6.40872679137538e-06, + "loss": 0.0276, + "step": 16060 + }, + { + "epoch": 15.407478427612656, + "grad_norm": 0.23737022280693054, + "learning_rate": 6.366166677663204e-06, + "loss": 0.0309, + "step": 16070 + }, + { + "epoch": 15.417066155321189, + "grad_norm": 0.2531161606311798, + "learning_rate": 6.323738744816654e-06, + "loss": 0.0329, + "step": 16080 + }, + { + "epoch": 15.426653883029722, + "grad_norm": 0.26035356521606445, + "learning_rate": 6.2814431213635065e-06, + "loss": 0.0286, + "step": 16090 + }, + { + "epoch": 15.436241610738255, + "grad_norm": 0.2163701057434082, + "learning_rate": 6.239279935430786e-06, + "loss": 0.027, + "step": 16100 + }, + { + "epoch": 15.445829338446789, + "grad_norm": 0.18169005215168, + "learning_rate": 6.197249314744275e-06, + "loss": 0.024, + "step": 16110 + }, + { + "epoch": 15.455417066155322, + "grad_norm": 0.24503251910209656, + "learning_rate": 6.155351386628205e-06, + "loss": 0.0298, + "step": 16120 + }, + { + "epoch": 15.465004793863855, + "grad_norm": 0.19895343482494354, + "learning_rate": 6.113586278004835e-06, + "loss": 0.0233, + "step": 16130 + }, + { + "epoch": 15.474592521572387, + "grad_norm": 0.2949654459953308, + "learning_rate": 6.071954115394063e-06, + "loss": 0.0256, + "step": 16140 + }, + { + "epoch": 15.48418024928092, + "grad_norm": 0.13835924863815308, + "learning_rate": 6.030455024913029e-06, + "loss": 0.029, + "step": 16150 + }, + { + "epoch": 15.493767976989453, + "grad_norm": 0.36957499384880066, + "learning_rate": 5.989089132275799e-06, + "loss": 0.0369, + "step": 16160 + }, + { + "epoch": 15.503355704697986, + "grad_norm": 0.22811642289161682, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0306, + "step": 16170 + }, + { + "epoch": 15.51294343240652, + "grad_norm": 0.3362506330013275, + "learning_rate": 5.906757441371069e-06, + "loss": 0.0346, + "step": 16180 + }, + { + "epoch": 15.522531160115053, + "grad_norm": 0.20575332641601562, + "learning_rate": 5.865791892512623e-06, + "loss": 0.0305, + "step": 16190 + }, + { + "epoch": 15.532118887823586, + "grad_norm": 0.1870652139186859, + "learning_rate": 5.824960040315386e-06, + "loss": 0.0253, + "step": 16200 + }, + { + "epoch": 15.541706615532119, + "grad_norm": 0.4694177508354187, + "learning_rate": 5.784262008472124e-06, + "loss": 0.0287, + "step": 16210 + }, + { + "epoch": 15.551294343240652, + "grad_norm": 0.2506779134273529, + "learning_rate": 5.7436979202702194e-06, + "loss": 0.0331, + "step": 16220 + }, + { + "epoch": 15.560882070949186, + "grad_norm": 0.18632706999778748, + "learning_rate": 5.703267898591275e-06, + "loss": 0.0234, + "step": 16230 + }, + { + "epoch": 15.570469798657719, + "grad_norm": 0.14531591534614563, + "learning_rate": 5.662972065910799e-06, + "loss": 0.0245, + "step": 16240 + }, + { + "epoch": 15.580057526366252, + "grad_norm": 0.19370119273662567, + "learning_rate": 5.622810544297796e-06, + "loss": 0.0262, + "step": 16250 + }, + { + "epoch": 15.589645254074785, + "grad_norm": 0.2350122630596161, + "learning_rate": 5.582783455414375e-06, + "loss": 0.0262, + "step": 16260 + }, + { + "epoch": 15.599232981783317, + "grad_norm": 0.2912338078022003, + "learning_rate": 5.5428909205154035e-06, + "loss": 0.0284, + "step": 16270 + }, + { + "epoch": 15.60882070949185, + "grad_norm": 0.28382018208503723, + "learning_rate": 5.503133060448168e-06, + "loss": 0.0257, + "step": 16280 + }, + { + "epoch": 15.618408437200383, + "grad_norm": 0.1536964774131775, + "learning_rate": 5.463509995651978e-06, + "loss": 0.0274, + "step": 16290 + }, + { + "epoch": 15.627996164908916, + "grad_norm": 0.5844811201095581, + "learning_rate": 5.4240218461577894e-06, + "loss": 0.0294, + "step": 16300 + }, + { + "epoch": 15.63758389261745, + "grad_norm": 0.2484215646982193, + "learning_rate": 5.384668731587844e-06, + "loss": 0.0278, + "step": 16310 + }, + { + "epoch": 15.647171620325983, + "grad_norm": 0.2738986015319824, + "learning_rate": 5.345450771155358e-06, + "loss": 0.0271, + "step": 16320 + }, + { + "epoch": 15.656759348034516, + "grad_norm": 0.23017966747283936, + "learning_rate": 5.3063680836641095e-06, + "loss": 0.0261, + "step": 16330 + }, + { + "epoch": 15.66634707574305, + "grad_norm": 0.1773134022951126, + "learning_rate": 5.2674207875080595e-06, + "loss": 0.03, + "step": 16340 + }, + { + "epoch": 15.675934803451582, + "grad_norm": 0.1907745748758316, + "learning_rate": 5.228609000671081e-06, + "loss": 0.0224, + "step": 16350 + }, + { + "epoch": 15.685522531160116, + "grad_norm": 0.2307148277759552, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0294, + "step": 16360 + }, + { + "epoch": 15.695110258868649, + "grad_norm": 0.3302120566368103, + "learning_rate": 5.151392424836782e-06, + "loss": 0.0292, + "step": 16370 + }, + { + "epoch": 15.70469798657718, + "grad_norm": 0.2139192521572113, + "learning_rate": 5.112987869753216e-06, + "loss": 0.0296, + "step": 16380 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 0.16015082597732544, + "learning_rate": 5.074719291815522e-06, + "loss": 0.029, + "step": 16390 + }, + { + "epoch": 15.723873441994247, + "grad_norm": 0.19606702029705048, + "learning_rate": 5.036586806951465e-06, + "loss": 0.029, + "step": 16400 + }, + { + "epoch": 15.73346116970278, + "grad_norm": 0.30746451020240784, + "learning_rate": 4.998590530676584e-06, + "loss": 0.0285, + "step": 16410 + }, + { + "epoch": 15.743048897411313, + "grad_norm": 0.16113652288913727, + "learning_rate": 4.960730578093753e-06, + "loss": 0.028, + "step": 16420 + }, + { + "epoch": 15.752636625119846, + "grad_norm": 0.23624086380004883, + "learning_rate": 4.923007063892926e-06, + "loss": 0.0251, + "step": 16430 + }, + { + "epoch": 15.76222435282838, + "grad_norm": 0.19934307038784027, + "learning_rate": 4.885420102350696e-06, + "loss": 0.0238, + "step": 16440 + }, + { + "epoch": 15.771812080536913, + "grad_norm": 0.2440912276506424, + "learning_rate": 4.847969807330038e-06, + "loss": 0.0231, + "step": 16450 + }, + { + "epoch": 15.781399808245446, + "grad_norm": 0.2768200933933258, + "learning_rate": 4.810656292279875e-06, + "loss": 0.0268, + "step": 16460 + }, + { + "epoch": 15.79098753595398, + "grad_norm": 0.29489603638648987, + "learning_rate": 4.773479670234821e-06, + "loss": 0.0358, + "step": 16470 + }, + { + "epoch": 15.800575263662513, + "grad_norm": 0.26058635115623474, + "learning_rate": 4.7364400538147665e-06, + "loss": 0.0272, + "step": 16480 + }, + { + "epoch": 15.810162991371046, + "grad_norm": 0.19268332421779633, + "learning_rate": 4.699537555224598e-06, + "loss": 0.028, + "step": 16490 + }, + { + "epoch": 15.819750719079579, + "grad_norm": 0.27744096517562866, + "learning_rate": 4.6627722862537915e-06, + "loss": 0.0278, + "step": 16500 + }, + { + "epoch": 15.82933844678811, + "grad_norm": 0.3575479984283447, + "learning_rate": 4.626144358276147e-06, + "loss": 0.0275, + "step": 16510 + }, + { + "epoch": 15.838926174496644, + "grad_norm": 0.20007503032684326, + "learning_rate": 4.589653882249378e-06, + "loss": 0.0309, + "step": 16520 + }, + { + "epoch": 15.848513902205177, + "grad_norm": 0.20804741978645325, + "learning_rate": 4.553300968714841e-06, + "loss": 0.0249, + "step": 16530 + }, + { + "epoch": 15.85810162991371, + "grad_norm": 0.2726737856864929, + "learning_rate": 4.5170857277971765e-06, + "loss": 0.0259, + "step": 16540 + }, + { + "epoch": 15.867689357622243, + "grad_norm": 0.21122261881828308, + "learning_rate": 4.48100826920394e-06, + "loss": 0.029, + "step": 16550 + }, + { + "epoch": 15.877277085330777, + "grad_norm": 0.28613051772117615, + "learning_rate": 4.4450687022253135e-06, + "loss": 0.0255, + "step": 16560 + }, + { + "epoch": 15.88686481303931, + "grad_norm": 0.2184969037771225, + "learning_rate": 4.409267135733764e-06, + "loss": 0.0233, + "step": 16570 + }, + { + "epoch": 15.896452540747843, + "grad_norm": 0.19320517778396606, + "learning_rate": 4.37360367818373e-06, + "loss": 0.0271, + "step": 16580 + }, + { + "epoch": 15.906040268456376, + "grad_norm": 0.18892447650432587, + "learning_rate": 4.338078437611237e-06, + "loss": 0.0265, + "step": 16590 + }, + { + "epoch": 15.91562799616491, + "grad_norm": 0.23824314773082733, + "learning_rate": 4.3026915216336225e-06, + "loss": 0.0269, + "step": 16600 + }, + { + "epoch": 15.925215723873443, + "grad_norm": 0.1431523561477661, + "learning_rate": 4.267443037449198e-06, + "loss": 0.0269, + "step": 16610 + }, + { + "epoch": 15.934803451581976, + "grad_norm": 0.22107666730880737, + "learning_rate": 4.232333091836932e-06, + "loss": 0.0293, + "step": 16620 + }, + { + "epoch": 15.944391179290509, + "grad_norm": 0.27542436122894287, + "learning_rate": 4.197361791156096e-06, + "loss": 0.03, + "step": 16630 + }, + { + "epoch": 15.95397890699904, + "grad_norm": 0.234486922621727, + "learning_rate": 4.162529241345958e-06, + "loss": 0.0325, + "step": 16640 + }, + { + "epoch": 15.963566634707574, + "grad_norm": 0.24536362290382385, + "learning_rate": 4.127835547925479e-06, + "loss": 0.0211, + "step": 16650 + }, + { + "epoch": 15.973154362416107, + "grad_norm": 0.2566201686859131, + "learning_rate": 4.093280815992989e-06, + "loss": 0.0244, + "step": 16660 + }, + { + "epoch": 15.98274209012464, + "grad_norm": 0.3387947380542755, + "learning_rate": 4.058865150225833e-06, + "loss": 0.0279, + "step": 16670 + }, + { + "epoch": 15.992329817833173, + "grad_norm": 0.5632581114768982, + "learning_rate": 4.024588654880079e-06, + "loss": 0.0298, + "step": 16680 + }, + { + "epoch": 16.001917545541705, + "grad_norm": 0.2585551142692566, + "learning_rate": 3.990451433790254e-06, + "loss": 0.0313, + "step": 16690 + }, + { + "epoch": 16.01150527325024, + "grad_norm": 0.2654295563697815, + "learning_rate": 3.956453590368914e-06, + "loss": 0.0258, + "step": 16700 + }, + { + "epoch": 16.02109300095877, + "grad_norm": 0.243434339761734, + "learning_rate": 3.922595227606435e-06, + "loss": 0.0263, + "step": 16710 + }, + { + "epoch": 16.030680728667306, + "grad_norm": 0.23672133684158325, + "learning_rate": 3.8888764480706276e-06, + "loss": 0.029, + "step": 16720 + }, + { + "epoch": 16.040268456375838, + "grad_norm": 0.28110471367836, + "learning_rate": 3.855297353906512e-06, + "loss": 0.0313, + "step": 16730 + }, + { + "epoch": 16.049856184084373, + "grad_norm": 0.17387288808822632, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0263, + "step": 16740 + }, + { + "epoch": 16.059443911792904, + "grad_norm": 0.16623635590076447, + "learning_rate": 3.7885586281572016e-06, + "loss": 0.0234, + "step": 16750 + }, + { + "epoch": 16.06903163950144, + "grad_norm": 0.20889221131801605, + "learning_rate": 3.7553991987449912e-06, + "loss": 0.0198, + "step": 16760 + }, + { + "epoch": 16.07861936720997, + "grad_norm": 0.2764891982078552, + "learning_rate": 3.7223798590498403e-06, + "loss": 0.0306, + "step": 16770 + }, + { + "epoch": 16.088207094918506, + "grad_norm": 0.17139260470867157, + "learning_rate": 3.689500709097893e-06, + "loss": 0.0204, + "step": 16780 + }, + { + "epoch": 16.097794822627037, + "grad_norm": 0.25818943977355957, + "learning_rate": 3.6567618484906307e-06, + "loss": 0.0243, + "step": 16790 + }, + { + "epoch": 16.107382550335572, + "grad_norm": 0.33521944284439087, + "learning_rate": 3.6241633764045545e-06, + "loss": 0.0289, + "step": 16800 + }, + { + "epoch": 16.116970278044104, + "grad_norm": 0.23774349689483643, + "learning_rate": 3.591705391590905e-06, + "loss": 0.0284, + "step": 16810 + }, + { + "epoch": 16.126558005752635, + "grad_norm": 0.17396867275238037, + "learning_rate": 3.5593879923753015e-06, + "loss": 0.0292, + "step": 16820 + }, + { + "epoch": 16.13614573346117, + "grad_norm": 0.32836684584617615, + "learning_rate": 3.5272112766574993e-06, + "loss": 0.0261, + "step": 16830 + }, + { + "epoch": 16.1457334611697, + "grad_norm": 0.2727390229701996, + "learning_rate": 3.4951753419110943e-06, + "loss": 0.0294, + "step": 16840 + }, + { + "epoch": 16.155321188878236, + "grad_norm": 0.36386972665786743, + "learning_rate": 3.4632802851832013e-06, + "loss": 0.0256, + "step": 16850 + }, + { + "epoch": 16.164908916586768, + "grad_norm": 0.20322419703006744, + "learning_rate": 3.431526203094171e-06, + "loss": 0.0242, + "step": 16860 + }, + { + "epoch": 16.174496644295303, + "grad_norm": 0.23579928278923035, + "learning_rate": 3.3999131918372785e-06, + "loss": 0.03, + "step": 16870 + }, + { + "epoch": 16.184084372003834, + "grad_norm": 0.20980890095233917, + "learning_rate": 3.3684413471784804e-06, + "loss": 0.0281, + "step": 16880 + }, + { + "epoch": 16.19367209971237, + "grad_norm": 0.17388616502285004, + "learning_rate": 3.3371107644560805e-06, + "loss": 0.0312, + "step": 16890 + }, + { + "epoch": 16.2032598274209, + "grad_norm": 0.43162086606025696, + "learning_rate": 3.3059215385804585e-06, + "loss": 0.0281, + "step": 16900 + }, + { + "epoch": 16.212847555129436, + "grad_norm": 0.21873044967651367, + "learning_rate": 3.274873764033759e-06, + "loss": 0.0255, + "step": 16910 + }, + { + "epoch": 16.222435282837967, + "grad_norm": 0.2102050930261612, + "learning_rate": 3.243967534869652e-06, + "loss": 0.0272, + "step": 16920 + }, + { + "epoch": 16.232023010546502, + "grad_norm": 0.21298690140247345, + "learning_rate": 3.213202944713023e-06, + "loss": 0.0261, + "step": 16930 + }, + { + "epoch": 16.241610738255034, + "grad_norm": 0.30388498306274414, + "learning_rate": 3.1825800867596566e-06, + "loss": 0.0338, + "step": 16940 + }, + { + "epoch": 16.251198465963565, + "grad_norm": 0.2536049485206604, + "learning_rate": 3.152099053776014e-06, + "loss": 0.0292, + "step": 16950 + }, + { + "epoch": 16.2607861936721, + "grad_norm": 0.2809562385082245, + "learning_rate": 3.121759938098906e-06, + "loss": 0.0262, + "step": 16960 + }, + { + "epoch": 16.27037392138063, + "grad_norm": 0.2241629660129547, + "learning_rate": 3.091562831635253e-06, + "loss": 0.0288, + "step": 16970 + }, + { + "epoch": 16.279961649089167, + "grad_norm": 0.1237056627869606, + "learning_rate": 3.061507825861748e-06, + "loss": 0.0209, + "step": 16980 + }, + { + "epoch": 16.289549376797698, + "grad_norm": 0.13440051674842834, + "learning_rate": 3.031595011824656e-06, + "loss": 0.0273, + "step": 16990 + }, + { + "epoch": 16.299137104506233, + "grad_norm": 0.28445371985435486, + "learning_rate": 3.0018244801394535e-06, + "loss": 0.034, + "step": 17000 + }, + { + "epoch": 16.308724832214764, + "grad_norm": 0.3177470862865448, + "learning_rate": 2.9721963209906502e-06, + "loss": 0.0301, + "step": 17010 + }, + { + "epoch": 16.3183125599233, + "grad_norm": 0.1341092437505722, + "learning_rate": 2.942710624131412e-06, + "loss": 0.0266, + "step": 17020 + }, + { + "epoch": 16.32790028763183, + "grad_norm": 0.19116052985191345, + "learning_rate": 2.9133674788833833e-06, + "loss": 0.0311, + "step": 17030 + }, + { + "epoch": 16.337488015340366, + "grad_norm": 0.1874174177646637, + "learning_rate": 2.884166974136343e-06, + "loss": 0.0236, + "step": 17040 + }, + { + "epoch": 16.347075743048897, + "grad_norm": 0.36720889806747437, + "learning_rate": 2.855109198347983e-06, + "loss": 0.0278, + "step": 17050 + }, + { + "epoch": 16.35666347075743, + "grad_norm": 0.38599368929862976, + "learning_rate": 2.826194239543617e-06, + "loss": 0.0323, + "step": 17060 + }, + { + "epoch": 16.366251198465964, + "grad_norm": 0.19532305002212524, + "learning_rate": 2.797422185315929e-06, + "loss": 0.0222, + "step": 17070 + }, + { + "epoch": 16.375838926174495, + "grad_norm": 0.2218206375837326, + "learning_rate": 2.768793122824681e-06, + "loss": 0.0255, + "step": 17080 + }, + { + "epoch": 16.38542665388303, + "grad_norm": 0.3124590516090393, + "learning_rate": 2.740307138796483e-06, + "loss": 0.0249, + "step": 17090 + }, + { + "epoch": 16.39501438159156, + "grad_norm": 0.21726781129837036, + "learning_rate": 2.7119643195245238e-06, + "loss": 0.0218, + "step": 17100 + }, + { + "epoch": 16.404602109300097, + "grad_norm": 0.5927583575248718, + "learning_rate": 2.683764750868273e-06, + "loss": 0.0263, + "step": 17110 + }, + { + "epoch": 16.414189837008628, + "grad_norm": 0.28960007429122925, + "learning_rate": 2.6557085182532582e-06, + "loss": 0.0291, + "step": 17120 + }, + { + "epoch": 16.423777564717163, + "grad_norm": 0.35697048902511597, + "learning_rate": 2.6277957066708047e-06, + "loss": 0.0273, + "step": 17130 + }, + { + "epoch": 16.433365292425695, + "grad_norm": 0.2136591225862503, + "learning_rate": 2.6000264006777743e-06, + "loss": 0.0325, + "step": 17140 + }, + { + "epoch": 16.44295302013423, + "grad_norm": 0.3051040768623352, + "learning_rate": 2.5724006843962866e-06, + "loss": 0.0298, + "step": 17150 + }, + { + "epoch": 16.45254074784276, + "grad_norm": 0.1534937173128128, + "learning_rate": 2.5449186415134885e-06, + "loss": 0.0263, + "step": 17160 + }, + { + "epoch": 16.462128475551296, + "grad_norm": 0.17988426983356476, + "learning_rate": 2.5175803552812906e-06, + "loss": 0.0278, + "step": 17170 + }, + { + "epoch": 16.471716203259827, + "grad_norm": 0.48748767375946045, + "learning_rate": 2.490385908516141e-06, + "loss": 0.0308, + "step": 17180 + }, + { + "epoch": 16.48130393096836, + "grad_norm": 0.191914901137352, + "learning_rate": 2.463335383598725e-06, + "loss": 0.0303, + "step": 17190 + }, + { + "epoch": 16.490891658676894, + "grad_norm": 0.21671634912490845, + "learning_rate": 2.4364288624737442e-06, + "loss": 0.0276, + "step": 17200 + }, + { + "epoch": 16.500479386385425, + "grad_norm": 0.13923166692256927, + "learning_rate": 2.4096664266496814e-06, + "loss": 0.0331, + "step": 17210 + }, + { + "epoch": 16.51006711409396, + "grad_norm": 0.20780488848686218, + "learning_rate": 2.3830481571985365e-06, + "loss": 0.0243, + "step": 17220 + }, + { + "epoch": 16.51965484180249, + "grad_norm": 0.39643654227256775, + "learning_rate": 2.3565741347555792e-06, + "loss": 0.0289, + "step": 17230 + }, + { + "epoch": 16.529242569511027, + "grad_norm": 0.18083330988883972, + "learning_rate": 2.3302444395190915e-06, + "loss": 0.0216, + "step": 17240 + }, + { + "epoch": 16.538830297219558, + "grad_norm": 0.1432444006204605, + "learning_rate": 2.3040591512501765e-06, + "loss": 0.0318, + "step": 17250 + }, + { + "epoch": 16.548418024928093, + "grad_norm": 0.2874661386013031, + "learning_rate": 2.278018349272465e-06, + "loss": 0.0279, + "step": 17260 + }, + { + "epoch": 16.558005752636625, + "grad_norm": 0.2093266099691391, + "learning_rate": 2.2521221124718826e-06, + "loss": 0.0226, + "step": 17270 + }, + { + "epoch": 16.56759348034516, + "grad_norm": 0.3234308063983917, + "learning_rate": 2.2263705192964334e-06, + "loss": 0.0295, + "step": 17280 + }, + { + "epoch": 16.57718120805369, + "grad_norm": 0.6225463151931763, + "learning_rate": 2.2007636477559436e-06, + "loss": 0.031, + "step": 17290 + }, + { + "epoch": 16.586768935762223, + "grad_norm": 0.31777986884117126, + "learning_rate": 2.1753015754218453e-06, + "loss": 0.0311, + "step": 17300 + }, + { + "epoch": 16.596356663470758, + "grad_norm": 0.2332683950662613, + "learning_rate": 2.149984379426906e-06, + "loss": 0.0263, + "step": 17310 + }, + { + "epoch": 16.60594439117929, + "grad_norm": 0.23592767119407654, + "learning_rate": 2.1248121364650265e-06, + "loss": 0.0229, + "step": 17320 + }, + { + "epoch": 16.615532118887824, + "grad_norm": 0.4014437198638916, + "learning_rate": 2.0997849227909983e-06, + "loss": 0.026, + "step": 17330 + }, + { + "epoch": 16.625119846596355, + "grad_norm": 0.18571177124977112, + "learning_rate": 2.0749028142202807e-06, + "loss": 0.0281, + "step": 17340 + }, + { + "epoch": 16.63470757430489, + "grad_norm": 0.2480279952287674, + "learning_rate": 2.050165886128741e-06, + "loss": 0.0283, + "step": 17350 + }, + { + "epoch": 16.644295302013422, + "grad_norm": 0.20139874517917633, + "learning_rate": 2.0255742134524804e-06, + "loss": 0.0263, + "step": 17360 + }, + { + "epoch": 16.653883029721957, + "grad_norm": 0.18241684138774872, + "learning_rate": 2.001127870687541e-06, + "loss": 0.0206, + "step": 17370 + }, + { + "epoch": 16.66347075743049, + "grad_norm": 0.26072490215301514, + "learning_rate": 1.9768269318897414e-06, + "loss": 0.0251, + "step": 17380 + }, + { + "epoch": 16.673058485139023, + "grad_norm": 0.33512383699417114, + "learning_rate": 1.9526714706744055e-06, + "loss": 0.0282, + "step": 17390 + }, + { + "epoch": 16.682646212847555, + "grad_norm": 0.279745876789093, + "learning_rate": 1.928661560216172e-06, + "loss": 0.0233, + "step": 17400 + }, + { + "epoch": 16.69223394055609, + "grad_norm": 0.2306470274925232, + "learning_rate": 1.904797273248754e-06, + "loss": 0.0272, + "step": 17410 + }, + { + "epoch": 16.70182166826462, + "grad_norm": 0.14322997629642487, + "learning_rate": 1.8810786820647242e-06, + "loss": 0.0272, + "step": 17420 + }, + { + "epoch": 16.711409395973153, + "grad_norm": 0.25938233733177185, + "learning_rate": 1.8575058585152905e-06, + "loss": 0.0308, + "step": 17430 + }, + { + "epoch": 16.720997123681688, + "grad_norm": 0.23380053043365479, + "learning_rate": 1.8340788740101034e-06, + "loss": 0.028, + "step": 17440 + }, + { + "epoch": 16.73058485139022, + "grad_norm": 0.27241095900535583, + "learning_rate": 1.810797799517e-06, + "loss": 0.0293, + "step": 17450 + }, + { + "epoch": 16.740172579098754, + "grad_norm": 0.24621997773647308, + "learning_rate": 1.7876627055618155e-06, + "loss": 0.0258, + "step": 17460 + }, + { + "epoch": 16.749760306807286, + "grad_norm": 0.15812641382217407, + "learning_rate": 1.7646736622281667e-06, + "loss": 0.0259, + "step": 17470 + }, + { + "epoch": 16.75934803451582, + "grad_norm": 0.18936626613140106, + "learning_rate": 1.7418307391572354e-06, + "loss": 0.026, + "step": 17480 + }, + { + "epoch": 16.768935762224352, + "grad_norm": 0.16878223419189453, + "learning_rate": 1.7191340055475513e-06, + "loss": 0.0281, + "step": 17490 + }, + { + "epoch": 16.778523489932887, + "grad_norm": 0.18892349302768707, + "learning_rate": 1.696583530154794e-06, + "loss": 0.0259, + "step": 17500 + }, + { + "epoch": 16.78811121764142, + "grad_norm": 0.243266299366951, + "learning_rate": 1.6741793812915907e-06, + "loss": 0.0248, + "step": 17510 + }, + { + "epoch": 16.797698945349953, + "grad_norm": 0.20740211009979248, + "learning_rate": 1.6519216268272796e-06, + "loss": 0.0264, + "step": 17520 + }, + { + "epoch": 16.807286673058485, + "grad_norm": 0.16220887005329132, + "learning_rate": 1.6298103341877369e-06, + "loss": 0.0226, + "step": 17530 + }, + { + "epoch": 16.81687440076702, + "grad_norm": 0.3126187026500702, + "learning_rate": 1.6078455703551486e-06, + "loss": 0.0326, + "step": 17540 + }, + { + "epoch": 16.82646212847555, + "grad_norm": 0.1612725555896759, + "learning_rate": 1.5860274018678345e-06, + "loss": 0.0327, + "step": 17550 + }, + { + "epoch": 16.836049856184083, + "grad_norm": 0.20316867530345917, + "learning_rate": 1.5643558948200131e-06, + "loss": 0.0252, + "step": 17560 + }, + { + "epoch": 16.845637583892618, + "grad_norm": 0.20207004249095917, + "learning_rate": 1.5428311148616204e-06, + "loss": 0.0298, + "step": 17570 + }, + { + "epoch": 16.85522531160115, + "grad_norm": 0.2780834436416626, + "learning_rate": 1.5214531271981192e-06, + "loss": 0.026, + "step": 17580 + }, + { + "epoch": 16.864813039309684, + "grad_norm": 0.3551330268383026, + "learning_rate": 1.5002219965902896e-06, + "loss": 0.0255, + "step": 17590 + }, + { + "epoch": 16.874400767018216, + "grad_norm": 0.23651057481765747, + "learning_rate": 1.4791377873540235e-06, + "loss": 0.0274, + "step": 17600 + }, + { + "epoch": 16.88398849472675, + "grad_norm": 0.19430945813655853, + "learning_rate": 1.4582005633601515e-06, + "loss": 0.0232, + "step": 17610 + }, + { + "epoch": 16.893576222435282, + "grad_norm": 0.21821914613246918, + "learning_rate": 1.437410388034227e-06, + "loss": 0.0278, + "step": 17620 + }, + { + "epoch": 16.903163950143817, + "grad_norm": 0.23415020108222961, + "learning_rate": 1.4167673243563717e-06, + "loss": 0.0331, + "step": 17630 + }, + { + "epoch": 16.91275167785235, + "grad_norm": 0.207551971077919, + "learning_rate": 1.3962714348610295e-06, + "loss": 0.0305, + "step": 17640 + }, + { + "epoch": 16.922339405560884, + "grad_norm": 0.28280988335609436, + "learning_rate": 1.3759227816368182e-06, + "loss": 0.0297, + "step": 17650 + }, + { + "epoch": 16.931927133269415, + "grad_norm": 0.24366876482963562, + "learning_rate": 1.3557214263263286e-06, + "loss": 0.0247, + "step": 17660 + }, + { + "epoch": 16.941514860977946, + "grad_norm": 0.20423495769500732, + "learning_rate": 1.3356674301259532e-06, + "loss": 0.0263, + "step": 17670 + }, + { + "epoch": 16.95110258868648, + "grad_norm": 0.19706788659095764, + "learning_rate": 1.3157608537856582e-06, + "loss": 0.0297, + "step": 17680 + }, + { + "epoch": 16.960690316395013, + "grad_norm": 0.2174736112356186, + "learning_rate": 1.2960017576088446e-06, + "loss": 0.0278, + "step": 17690 + }, + { + "epoch": 16.970278044103548, + "grad_norm": 0.2222086638212204, + "learning_rate": 1.2763902014521656e-06, + "loss": 0.0276, + "step": 17700 + }, + { + "epoch": 16.97986577181208, + "grad_norm": 0.20257794857025146, + "learning_rate": 1.2569262447252928e-06, + "loss": 0.034, + "step": 17710 + }, + { + "epoch": 16.989453499520614, + "grad_norm": 0.2699783146381378, + "learning_rate": 1.2376099463907887e-06, + "loss": 0.0226, + "step": 17720 + }, + { + "epoch": 16.999041227229146, + "grad_norm": 0.19566196203231812, + "learning_rate": 1.2184413649639182e-06, + "loss": 0.028, + "step": 17730 + }, + { + "epoch": 17.00862895493768, + "grad_norm": 0.23381511867046356, + "learning_rate": 1.1994205585124652e-06, + "loss": 0.029, + "step": 17740 + }, + { + "epoch": 17.018216682646212, + "grad_norm": 0.19119040668010712, + "learning_rate": 1.180547584656533e-06, + "loss": 0.0239, + "step": 17750 + }, + { + "epoch": 17.027804410354747, + "grad_norm": 0.23085108399391174, + "learning_rate": 1.1618225005684158e-06, + "loss": 0.0275, + "step": 17760 + }, + { + "epoch": 17.03739213806328, + "grad_norm": 0.21077860891819, + "learning_rate": 1.1432453629723893e-06, + "loss": 0.0309, + "step": 17770 + }, + { + "epoch": 17.046979865771814, + "grad_norm": 0.18925194442272186, + "learning_rate": 1.124816228144565e-06, + "loss": 0.0271, + "step": 17780 + }, + { + "epoch": 17.056567593480345, + "grad_norm": 0.22407986223697662, + "learning_rate": 1.106535151912702e-06, + "loss": 0.0273, + "step": 17790 + }, + { + "epoch": 17.066155321188877, + "grad_norm": 0.21448639035224915, + "learning_rate": 1.0884021896560237e-06, + "loss": 0.0258, + "step": 17800 + }, + { + "epoch": 17.07574304889741, + "grad_norm": 0.24161478877067566, + "learning_rate": 1.0704173963050957e-06, + "loss": 0.0289, + "step": 17810 + }, + { + "epoch": 17.085330776605943, + "grad_norm": 0.1643606573343277, + "learning_rate": 1.0525808263416205e-06, + "loss": 0.0258, + "step": 17820 + }, + { + "epoch": 17.094918504314478, + "grad_norm": 0.2575829327106476, + "learning_rate": 1.0348925337982817e-06, + "loss": 0.0274, + "step": 17830 + }, + { + "epoch": 17.10450623202301, + "grad_norm": 0.1602732241153717, + "learning_rate": 1.0173525722585897e-06, + "loss": 0.0358, + "step": 17840 + }, + { + "epoch": 17.114093959731544, + "grad_norm": 0.23271816968917847, + "learning_rate": 9.999609948567024e-07, + "loss": 0.0373, + "step": 17850 + }, + { + "epoch": 17.123681687440076, + "grad_norm": 0.18822619318962097, + "learning_rate": 9.82717854277293e-07, + "loss": 0.0278, + "step": 17860 + }, + { + "epoch": 17.13326941514861, + "grad_norm": 0.37295079231262207, + "learning_rate": 9.656232027553558e-07, + "loss": 0.0245, + "step": 17870 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 0.207114115357399, + "learning_rate": 9.486770920760668e-07, + "loss": 0.0237, + "step": 17880 + }, + { + "epoch": 17.152444870565677, + "grad_norm": 0.2382437288761139, + "learning_rate": 9.318795735746233e-07, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 17.16203259827421, + "grad_norm": 0.3437121510505676, + "learning_rate": 9.152306981360992e-07, + "loss": 0.0274, + "step": 17900 + }, + { + "epoch": 17.171620325982744, + "grad_norm": 0.1845656931400299, + "learning_rate": 8.987305161952731e-07, + "loss": 0.0251, + "step": 17910 + }, + { + "epoch": 17.181208053691275, + "grad_norm": 0.2611910402774811, + "learning_rate": 8.823790777364837e-07, + "loss": 0.0263, + "step": 17920 + }, + { + "epoch": 17.190795781399807, + "grad_norm": 0.3325332701206207, + "learning_rate": 8.661764322934695e-07, + "loss": 0.0314, + "step": 17930 + }, + { + "epoch": 17.20038350910834, + "grad_norm": 0.38311854004859924, + "learning_rate": 8.50122628949257e-07, + "loss": 0.0279, + "step": 17940 + }, + { + "epoch": 17.209971236816873, + "grad_norm": 0.1343742161989212, + "learning_rate": 8.342177163359389e-07, + "loss": 0.028, + "step": 17950 + }, + { + "epoch": 17.219558964525408, + "grad_norm": 0.19379399716854095, + "learning_rate": 8.184617426346131e-07, + "loss": 0.0301, + "step": 17960 + }, + { + "epoch": 17.22914669223394, + "grad_norm": 0.16689153015613556, + "learning_rate": 8.028547555751553e-07, + "loss": 0.029, + "step": 17970 + }, + { + "epoch": 17.238734419942475, + "grad_norm": 0.45647260546684265, + "learning_rate": 7.873968024361467e-07, + "loss": 0.0307, + "step": 17980 + }, + { + "epoch": 17.248322147651006, + "grad_norm": 0.19029688835144043, + "learning_rate": 7.720879300446682e-07, + "loss": 0.0269, + "step": 17990 + }, + { + "epoch": 17.25790987535954, + "grad_norm": 0.26700901985168457, + "learning_rate": 7.569281847762122e-07, + "loss": 0.026, + "step": 18000 + }, + { + "epoch": 17.267497603068072, + "grad_norm": 0.20858362317085266, + "learning_rate": 7.419176125544991e-07, + "loss": 0.0304, + "step": 18010 + }, + { + "epoch": 17.277085330776607, + "grad_norm": 0.23115743696689606, + "learning_rate": 7.270562588513663e-07, + "loss": 0.0389, + "step": 18020 + }, + { + "epoch": 17.28667305848514, + "grad_norm": 0.17492881417274475, + "learning_rate": 7.123441686866183e-07, + "loss": 0.0293, + "step": 18030 + }, + { + "epoch": 17.29626078619367, + "grad_norm": 0.12759244441986084, + "learning_rate": 6.977813866278826e-07, + "loss": 0.0239, + "step": 18040 + }, + { + "epoch": 17.305848513902205, + "grad_norm": 0.18989066779613495, + "learning_rate": 6.833679567905038e-07, + "loss": 0.0292, + "step": 18050 + }, + { + "epoch": 17.315436241610737, + "grad_norm": 0.5339308977127075, + "learning_rate": 6.691039228373774e-07, + "loss": 0.0337, + "step": 18060 + }, + { + "epoch": 17.325023969319272, + "grad_norm": 0.18861901760101318, + "learning_rate": 6.549893279788277e-07, + "loss": 0.0288, + "step": 18070 + }, + { + "epoch": 17.334611697027803, + "grad_norm": 0.18615840375423431, + "learning_rate": 6.410242149724966e-07, + "loss": 0.0246, + "step": 18080 + }, + { + "epoch": 17.34419942473634, + "grad_norm": 0.1773938536643982, + "learning_rate": 6.272086261231769e-07, + "loss": 0.0272, + "step": 18090 + }, + { + "epoch": 17.35378715244487, + "grad_norm": 0.2144092619419098, + "learning_rate": 6.135426032827185e-07, + "loss": 0.0299, + "step": 18100 + }, + { + "epoch": 17.363374880153405, + "grad_norm": 0.18490025401115417, + "learning_rate": 6.000261878498947e-07, + "loss": 0.0297, + "step": 18110 + }, + { + "epoch": 17.372962607861936, + "grad_norm": 0.18837903439998627, + "learning_rate": 5.86659420770247e-07, + "loss": 0.0272, + "step": 18120 + }, + { + "epoch": 17.38255033557047, + "grad_norm": 0.2982289791107178, + "learning_rate": 5.734423425359958e-07, + "loss": 0.0314, + "step": 18130 + }, + { + "epoch": 17.392138063279003, + "grad_norm": 0.2356351912021637, + "learning_rate": 5.603749931859137e-07, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 17.401725790987538, + "grad_norm": 0.13853472471237183, + "learning_rate": 5.474574123051912e-07, + "loss": 0.0289, + "step": 18150 + }, + { + "epoch": 17.41131351869607, + "grad_norm": 0.2044096440076828, + "learning_rate": 5.346896390253153e-07, + "loss": 0.0244, + "step": 18160 + }, + { + "epoch": 17.4209012464046, + "grad_norm": 0.33529403805732727, + "learning_rate": 5.220717120239693e-07, + "loss": 0.0282, + "step": 18170 + }, + { + "epoch": 17.430488974113135, + "grad_norm": 0.2302224040031433, + "learning_rate": 5.096036695248885e-07, + "loss": 0.0299, + "step": 18180 + }, + { + "epoch": 17.440076701821667, + "grad_norm": 0.22276417911052704, + "learning_rate": 4.972855492977823e-07, + "loss": 0.0294, + "step": 18190 + }, + { + "epoch": 17.449664429530202, + "grad_norm": 0.5279762744903564, + "learning_rate": 4.851173886581794e-07, + "loss": 0.0286, + "step": 18200 + }, + { + "epoch": 17.459252157238733, + "grad_norm": 0.22499582171440125, + "learning_rate": 4.7309922446732715e-07, + "loss": 0.0239, + "step": 18210 + }, + { + "epoch": 17.46883988494727, + "grad_norm": 0.2594180703163147, + "learning_rate": 4.61231093132114e-07, + "loss": 0.0275, + "step": 18220 + }, + { + "epoch": 17.4784276126558, + "grad_norm": 0.1713213175535202, + "learning_rate": 4.495130306049034e-07, + "loss": 0.0243, + "step": 18230 + }, + { + "epoch": 17.488015340364335, + "grad_norm": 0.3286925256252289, + "learning_rate": 4.3794507238347214e-07, + "loss": 0.0316, + "step": 18240 + }, + { + "epoch": 17.497603068072866, + "grad_norm": 0.23200523853302002, + "learning_rate": 4.2652725351085556e-07, + "loss": 0.0265, + "step": 18250 + }, + { + "epoch": 17.5071907957814, + "grad_norm": 0.22095492482185364, + "learning_rate": 4.1525960857530243e-07, + "loss": 0.024, + "step": 18260 + }, + { + "epoch": 17.516778523489933, + "grad_norm": 0.17762340605258942, + "learning_rate": 4.041421717101146e-07, + "loss": 0.0268, + "step": 18270 + }, + { + "epoch": 17.526366251198468, + "grad_norm": 0.2298087775707245, + "learning_rate": 3.931749765935744e-07, + "loss": 0.0257, + "step": 18280 + }, + { + "epoch": 17.535953978907, + "grad_norm": 0.21401867270469666, + "learning_rate": 3.8235805644882273e-07, + "loss": 0.0245, + "step": 18290 + }, + { + "epoch": 17.54554170661553, + "grad_norm": 0.5458080172538757, + "learning_rate": 3.716914440437813e-07, + "loss": 0.033, + "step": 18300 + }, + { + "epoch": 17.555129434324066, + "grad_norm": 0.17889949679374695, + "learning_rate": 3.611751716910472e-07, + "loss": 0.0303, + "step": 18310 + }, + { + "epoch": 17.564717162032597, + "grad_norm": 0.0861106589436531, + "learning_rate": 3.508092712477651e-07, + "loss": 0.025, + "step": 18320 + }, + { + "epoch": 17.574304889741132, + "grad_norm": 0.396636962890625, + "learning_rate": 3.405937741155829e-07, + "loss": 0.03, + "step": 18330 + }, + { + "epoch": 17.583892617449663, + "grad_norm": 0.3980105221271515, + "learning_rate": 3.30528711240502e-07, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 17.5934803451582, + "grad_norm": 0.2600933313369751, + "learning_rate": 3.206141131128326e-07, + "loss": 0.0278, + "step": 18350 + }, + { + "epoch": 17.60306807286673, + "grad_norm": 0.20506466925144196, + "learning_rate": 3.108500097670719e-07, + "loss": 0.0216, + "step": 18360 + }, + { + "epoch": 17.612655800575265, + "grad_norm": 0.31107306480407715, + "learning_rate": 3.0123643078180943e-07, + "loss": 0.0296, + "step": 18370 + }, + { + "epoch": 17.622243528283796, + "grad_norm": 0.2587839663028717, + "learning_rate": 2.9177340527966613e-07, + "loss": 0.0265, + "step": 18380 + }, + { + "epoch": 17.63183125599233, + "grad_norm": 0.293157160282135, + "learning_rate": 2.824609619271723e-07, + "loss": 0.0239, + "step": 18390 + }, + { + "epoch": 17.641418983700863, + "grad_norm": 0.22268742322921753, + "learning_rate": 2.732991289347064e-07, + "loss": 0.0283, + "step": 18400 + }, + { + "epoch": 17.651006711409394, + "grad_norm": 0.21071119606494904, + "learning_rate": 2.6428793405640087e-07, + "loss": 0.0241, + "step": 18410 + }, + { + "epoch": 17.66059443911793, + "grad_norm": 0.25878384709358215, + "learning_rate": 2.554274045900418e-07, + "loss": 0.0224, + "step": 18420 + }, + { + "epoch": 17.67018216682646, + "grad_norm": 0.2513992488384247, + "learning_rate": 2.46717567377025e-07, + "loss": 0.0271, + "step": 18430 + }, + { + "epoch": 17.679769894534996, + "grad_norm": 0.1096489429473877, + "learning_rate": 2.381584488022337e-07, + "loss": 0.0233, + "step": 18440 + }, + { + "epoch": 17.689357622243527, + "grad_norm": 0.24723054468631744, + "learning_rate": 2.2975007479397738e-07, + "loss": 0.0254, + "step": 18450 + }, + { + "epoch": 17.698945349952062, + "grad_norm": 0.22072063386440277, + "learning_rate": 2.2149247082392522e-07, + "loss": 0.0273, + "step": 18460 + }, + { + "epoch": 17.708533077660594, + "grad_norm": 0.2557280957698822, + "learning_rate": 2.1338566190699517e-07, + "loss": 0.0322, + "step": 18470 + }, + { + "epoch": 17.71812080536913, + "grad_norm": 0.3068563938140869, + "learning_rate": 2.0542967260131497e-07, + "loss": 0.0211, + "step": 18480 + }, + { + "epoch": 17.72770853307766, + "grad_norm": 0.18864025175571442, + "learning_rate": 1.976245270081334e-07, + "loss": 0.028, + "step": 18490 + }, + { + "epoch": 17.737296260786195, + "grad_norm": 0.20000196993350983, + "learning_rate": 1.899702487717203e-07, + "loss": 0.0239, + "step": 18500 + }, + { + "epoch": 17.746883988494726, + "grad_norm": 0.5573348999023438, + "learning_rate": 1.8246686107935562e-07, + "loss": 0.03, + "step": 18510 + }, + { + "epoch": 17.75647171620326, + "grad_norm": 0.09101556986570358, + "learning_rate": 1.7511438666119594e-07, + "loss": 0.0336, + "step": 18520 + }, + { + "epoch": 17.766059443911793, + "grad_norm": 0.2559066712856293, + "learning_rate": 1.6791284779024696e-07, + "loss": 0.0285, + "step": 18530 + }, + { + "epoch": 17.775647171620324, + "grad_norm": 0.23298071324825287, + "learning_rate": 1.6086226628226898e-07, + "loss": 0.0319, + "step": 18540 + }, + { + "epoch": 17.78523489932886, + "grad_norm": 0.1978902518749237, + "learning_rate": 1.5396266349574362e-07, + "loss": 0.0269, + "step": 18550 + }, + { + "epoch": 17.79482262703739, + "grad_norm": 0.5722432732582092, + "learning_rate": 1.4721406033177954e-07, + "loss": 0.0291, + "step": 18560 + }, + { + "epoch": 17.804410354745926, + "grad_norm": 0.29033163189888, + "learning_rate": 1.4061647723405125e-07, + "loss": 0.0288, + "step": 18570 + }, + { + "epoch": 17.813998082454457, + "grad_norm": 0.19131603837013245, + "learning_rate": 1.3416993418874924e-07, + "loss": 0.0247, + "step": 18580 + }, + { + "epoch": 17.823585810162992, + "grad_norm": 0.25687092542648315, + "learning_rate": 1.2787445072452998e-07, + "loss": 0.0267, + "step": 18590 + }, + { + "epoch": 17.833173537871524, + "grad_norm": 0.16243956983089447, + "learning_rate": 1.217300459124271e-07, + "loss": 0.0273, + "step": 18600 + }, + { + "epoch": 17.84276126558006, + "grad_norm": 0.17303957045078278, + "learning_rate": 1.1573673836580701e-07, + "loss": 0.0353, + "step": 18610 + }, + { + "epoch": 17.85234899328859, + "grad_norm": 0.4954906702041626, + "learning_rate": 1.0989454624032448e-07, + "loss": 0.0239, + "step": 18620 + }, + { + "epoch": 17.861936720997125, + "grad_norm": 0.500385582447052, + "learning_rate": 1.0420348723385043e-07, + "loss": 0.0279, + "step": 18630 + }, + { + "epoch": 17.871524448705657, + "grad_norm": 0.28065744042396545, + "learning_rate": 9.866357858642205e-08, + "loss": 0.024, + "step": 18640 + }, + { + "epoch": 17.88111217641419, + "grad_norm": 0.22515705227851868, + "learning_rate": 9.32748370802039e-08, + "loss": 0.0273, + "step": 18650 + }, + { + "epoch": 17.890699904122723, + "grad_norm": 0.4083874225616455, + "learning_rate": 8.803727903942127e-08, + "loss": 0.0269, + "step": 18660 + }, + { + "epoch": 17.900287631831254, + "grad_norm": 0.3455846309661865, + "learning_rate": 8.295092033031027e-08, + "loss": 0.0277, + "step": 18670 + }, + { + "epoch": 17.90987535953979, + "grad_norm": 0.15052051842212677, + "learning_rate": 7.801577636108448e-08, + "loss": 0.0358, + "step": 18680 + }, + { + "epoch": 17.91946308724832, + "grad_norm": 0.21173402667045593, + "learning_rate": 7.323186208188504e-08, + "loss": 0.0256, + "step": 18690 + }, + { + "epoch": 17.929050814956856, + "grad_norm": 0.3735136389732361, + "learning_rate": 6.859919198470288e-08, + "loss": 0.031, + "step": 18700 + }, + { + "epoch": 17.938638542665387, + "grad_norm": 0.2103312462568283, + "learning_rate": 6.411778010340097e-08, + "loss": 0.0322, + "step": 18710 + }, + { + "epoch": 17.948226270373922, + "grad_norm": 0.19569391012191772, + "learning_rate": 5.978764001359771e-08, + "loss": 0.0291, + "step": 18720 + }, + { + "epoch": 17.957813998082454, + "grad_norm": 0.25286465883255005, + "learning_rate": 5.5608784832683616e-08, + "loss": 0.0277, + "step": 18730 + }, + { + "epoch": 17.96740172579099, + "grad_norm": 0.2856442332267761, + "learning_rate": 5.158122721974357e-08, + "loss": 0.0254, + "step": 18740 + }, + { + "epoch": 17.97698945349952, + "grad_norm": 0.15211383998394012, + "learning_rate": 4.770497937554574e-08, + "loss": 0.024, + "step": 18750 + }, + { + "epoch": 17.986577181208055, + "grad_norm": 0.28586897253990173, + "learning_rate": 4.398005304248609e-08, + "loss": 0.0239, + "step": 18760 + }, + { + "epoch": 17.996164908916587, + "grad_norm": 0.18181052803993225, + "learning_rate": 4.0406459504555016e-08, + "loss": 0.0236, + "step": 18770 + }, + { + "epoch": 18.005752636625118, + "grad_norm": 0.19704671204090118, + "learning_rate": 3.698420958732074e-08, + "loss": 0.0251, + "step": 18780 + }, + { + "epoch": 18.015340364333653, + "grad_norm": 0.19747470319271088, + "learning_rate": 3.371331365786823e-08, + "loss": 0.0313, + "step": 18790 + }, + { + "epoch": 18.024928092042185, + "grad_norm": 0.23974737524986267, + "learning_rate": 3.05937816247992e-08, + "loss": 0.0334, + "step": 18800 + }, + { + "epoch": 18.03451581975072, + "grad_norm": 0.31815865635871887, + "learning_rate": 2.7625622938165507e-08, + "loss": 0.025, + "step": 18810 + }, + { + "epoch": 18.04410354745925, + "grad_norm": 0.14651015400886536, + "learning_rate": 2.4808846589474687e-08, + "loss": 0.0252, + "step": 18820 + }, + { + "epoch": 18.053691275167786, + "grad_norm": 0.31359338760375977, + "learning_rate": 2.214346111164556e-08, + "loss": 0.0255, + "step": 18830 + }, + { + "epoch": 18.063279002876317, + "grad_norm": 0.3521699607372284, + "learning_rate": 1.9629474578986008e-08, + "loss": 0.0229, + "step": 18840 + }, + { + "epoch": 18.072866730584852, + "grad_norm": 0.2816530168056488, + "learning_rate": 1.726689460716524e-08, + "loss": 0.0262, + "step": 18850 + }, + { + "epoch": 18.082454458293384, + "grad_norm": 0.27596089243888855, + "learning_rate": 1.5055728353191578e-08, + "loss": 0.0266, + "step": 18860 + }, + { + "epoch": 18.09204218600192, + "grad_norm": 0.25768667459487915, + "learning_rate": 1.2995982515406901e-08, + "loss": 0.0273, + "step": 18870 + }, + { + "epoch": 18.10162991371045, + "grad_norm": 0.13152585923671722, + "learning_rate": 1.1087663333431141e-08, + "loss": 0.0268, + "step": 18880 + }, + { + "epoch": 18.111217641418985, + "grad_norm": 0.1559949666261673, + "learning_rate": 9.330776588184487e-09, + "loss": 0.0307, + "step": 18890 + }, + { + "epoch": 18.120805369127517, + "grad_norm": 0.25546255707740784, + "learning_rate": 7.725327601826315e-09, + "loss": 0.0254, + "step": 18900 + }, + { + "epoch": 18.13039309683605, + "grad_norm": 0.17455005645751953, + "learning_rate": 6.271321237788508e-09, + "loss": 0.0331, + "step": 18910 + }, + { + "epoch": 18.139980824544583, + "grad_norm": 0.25416553020477295, + "learning_rate": 4.9687619007199316e-09, + "loss": 0.0332, + "step": 18920 + }, + { + "epoch": 18.149568552253115, + "grad_norm": 0.19471152126789093, + "learning_rate": 3.817653536480892e-09, + "loss": 0.0248, + "step": 18930 + }, + { + "epoch": 18.15915627996165, + "grad_norm": 0.26644882559776306, + "learning_rate": 2.8179996321597845e-09, + "loss": 0.0248, + "step": 18940 + }, + { + "epoch": 18.16874400767018, + "grad_norm": 0.18680621683597565, + "learning_rate": 1.9698032160231363e-09, + "loss": 0.0252, + "step": 18950 + }, + { + "epoch": 18.178331735378716, + "grad_norm": 0.22466066479682922, + "learning_rate": 1.2730668575322569e-09, + "loss": 0.0221, + "step": 18960 + }, + { + "epoch": 18.187919463087248, + "grad_norm": 0.27246662974357605, + "learning_rate": 7.277926673210367e-10, + "loss": 0.0258, + "step": 18970 + }, + { + "epoch": 18.197507190795783, + "grad_norm": 0.17329837381839752, + "learning_rate": 3.3398229720149607e-10, + "loss": 0.0266, + "step": 18980 + }, + { + "epoch": 18.207094918504314, + "grad_norm": 0.3577910363674164, + "learning_rate": 9.163694015268398e-11, + "loss": 0.0296, + "step": 18990 + }, + { + "epoch": 18.21668264621285, + "grad_norm": 0.24373145401477814, + "learning_rate": 7.57330315126481e-13, + "loss": 0.029, + "step": 19000 + }, + { + "epoch": 18.21668264621285, + "step": 19000, + "total_flos": 0.0, + "train_loss": 0.04628433942951654, + "train_runtime": 5633.6681, + "train_samples_per_second": 107.923, + "train_steps_per_second": 3.373 + } + ], + "logging_steps": 10, + "max_steps": 19000, + "num_input_tokens_seen": 0, + "num_train_epochs": 19, + "save_steps": 20000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}